Skip to content

Commit 5618086

Browse files
authored
Merge pull request #219 from Foohy/scraper_hotfix
Fix addon scraper not handling Steam errors on very large (>50,000) result sets
2 parents 9783ad1 + 73411cf commit 5618086

File tree

1 file changed

+32
-16
lines changed

1 file changed

+32
-16
lines changed

other/scraper/scrape.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
#!/usr/bin/env python
2+
13
import sys
24
import json
35
import time
46
import urllib.request
7+
import urllib.parse
58
import re
69

710
HOST = "http://api.steampowered.com"
@@ -11,16 +14,24 @@
1114
DELAY = 0.1 # How long to delay between requests
1215
FILENAME = "addons.txt"
1316

14-
# Substring (not whole-word) blocklist for addon titles, so e.g. "nav"
# also catches "navmesh".
ignore_words = [
    "content",
    "server",
    "nav",
    "node",
    "icon"
]

# Pattern template: match the word only when it is NOT immediately
# surrounded by underscores, so an ignore word may still appear as part
# of a map name (e.g. "gm_nav_test" is not rejected for "nav").
ignore_reg = "(?<!_){0}(?!_)"

def containsIgnoreWord(str, word):
    """Return True if `str` contains `word` (case-insensitive) at a
    position not directly surrounded by underscores.

    NOTE(review): the parameter name `str` shadows the builtin; it is
    kept unchanged for interface compatibility with existing callers.
    """
    # re.escape guards against regex metacharacters in the ignore word;
    # the current list is plain letters, so behavior is unchanged today,
    # but a future entry like "c++" would otherwise corrupt the pattern.
    return re.search(ignore_reg.format(re.escape(word)), str, flags=re.IGNORECASE) is not None

def containsIgnoreWords(str):
    """Return True if any entry of `ignore_words` occurs in `str`
    (per the containsIgnoreWord matching rules)."""
    return any(containsIgnoreWord(str, word) for word in ignore_words)
2536

2637
if __name__ == "__main__":
@@ -38,14 +49,20 @@ def containsIgnoreWords(str):
3849

3950
f = open(FILENAME, "w")
4051

41-
while True:
42-
req = "{0}/{1}?key={2}&appid={3}&requiredtags[0]=map&numperpage={4}&page={5}&return_metadata=1&query_type=1".format(HOST, ENDPOINT, key, APPID, NUMPERPAGE, page)
43-
response = urllib.request.urlopen(req).read()
44-
resobj = json.loads(response.decode("utf-8", "ignore"))
45-
total = resobj["response"]["total"]
46-
47-
for addon in resobj["response"]["publishedfiledetails"]:
48-
if "title" in addon and containsIgnoreWords(addon["title"]):
52+
cursor = "*"
53+
last_cursor = None
54+
while cursor != None and cursor != last_cursor:
55+
req = "{0}/{1}?key={2}&appid={3}&requiredtags[0]=map&numperpage={4}&cursor={5}&return_metadata=1&query_type=1".format(HOST, ENDPOINT, key, APPID, NUMPERPAGE, urllib.parse.quote_plus(cursor))
56+
response_data = urllib.request.urlopen(req).read()
57+
response = json.loads(response_data.decode("utf-8", "ignore"))["response"]
58+
total = response["total"]
59+
last_cursor = cursor
60+
cursor = response["next_cursor"]
61+
62+
for addon in response["publishedfiledetails"]:
63+
hasignorewords = "title" in addon and containsIgnoreWords(addon["title"])
64+
sexyfuntimes = "maybe_inappropriate_sex" in addon and addon["maybe_inappropriate_sex"] == True
65+
if hasignorewords or sexyfuntimes:
4966
ign_str = u"Ignoring: " + addon["title"]
5067
print(ign_str.encode('utf-8'))
5168
continue
@@ -56,18 +73,18 @@ def containsIgnoreWords(str):
5673
workshopids.append(wsid)
5774

5875
# Informative output
59-
finished = page * NUMPERPAGE + len(resobj["response"]["publishedfiledetails"])
76+
finished = page * NUMPERPAGE + len(response["publishedfiledetails"])
6077
print("Finished {0} addons. ({1:.2f}% of {2})".format(finished, finished * 100.0 / total, total))
6178

6279
# Move on to the next page
6380
page += 1
6481

65-
if page * NUMPERPAGE > resobj["response"]["total"]:
82+
if page * NUMPERPAGE > response["total"]:
6683
break
67-
else:
84+
else:
6885
# so valve doesn't get angry at us
6986
time.sleep(DELAY)
70-
87+
7188
# Results come back sorted, but reverse it so
7289
# newer entries are added at the end instead of shifting everything at the beginning
7390
workshopids.reverse()
@@ -78,4 +95,3 @@ def containsIgnoreWords(str):
7895

7996
print("Finished!!")
8097
f.close()
81-

0 commit comments

Comments
 (0)