Skip to content

Commit 5618086

Browse files
authored
Merge pull request #219 from Foohy/scraper_hotfix
Fix addon scraper not handling Steam errors on very large (>50,000) result sets
2 parents 9783ad1 + 73411cf commit 5618086

File tree

1 file changed

+32
-16
lines changed

1 file changed

+32
-16
lines changed

other/scraper/scrape.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
#!/usr/bin/env python
2+
13
import sys
24
import json
35
import time
46
import urllib.request
7+
import urllib.parse
58
import re
69

710
HOST = "http://api.steampowered.com"
@@ -11,16 +14,24 @@
1114
DELAY = 0.1 # How long to delay between requests
1215
FILENAME = "addons.txt"
1316

14-
# Substring (not whole-word) blocklist for addon titles, so e.g. "nav"
# also catches "navmesh".
ignore_words = [
    "content",
    "server",
    "nav",
    "node",
    "icon"
]

# Pattern template: match the word only when it is NOT immediately
# surrounded by underscores, so an ignore word may still appear as part
# of a map name (e.g. "gm_nav_test" is not rejected for "nav").
ignore_reg = "(?<!_){0}(?!_)"

def containsIgnoreWord(str, word):
    """Return True if `str` contains `word` (case-insensitive) at a
    position not directly surrounded by underscores.

    NOTE(review): the parameter name `str` shadows the builtin; it is
    kept unchanged for interface compatibility with existing callers.
    """
    # re.escape guards against regex metacharacters in the ignore word;
    # the current list is plain letters, so behavior is unchanged today,
    # but a future entry like "c++" would otherwise corrupt the pattern.
    return re.search(ignore_reg.format(re.escape(word)), str, flags=re.IGNORECASE) is not None

def containsIgnoreWords(str):
    """Return True if any entry of `ignore_words` occurs in `str`
    (per the containsIgnoreWord matching rules)."""
    return any(containsIgnoreWord(str, word) for word in ignore_words)
2536

2637
if __name__ == "__main__":
@@ -38,14 +49,20 @@ def containsIgnoreWords(str):
3849

3950
f = open(FILENAME, "w")
4051

41-
while True:
42-
req = "{0}/{1}?key={2}&appid={3}&requiredtags[0]=map&numperpage={4}&page={5}&return_metadata=1&query_type=1".format(HOST, ENDPOINT, key, APPID, NUMPERPAGE, page)
43-
response = urllib.request.urlopen(req).read()
44-
resobj = json.loads(response.decode("utf-8", "ignore"))
45-
total = resobj["response"]["total"]
46-
47-
for addon in resobj["response"]["publishedfiledetails"]:
48-
if "title" in addon and containsIgnoreWords(addon["title"]):
52+
cursor = "*"
53+
last_cursor = None
54+
while cursor != None and cursor != last_cursor:
55+
req = "{0}/{1}?key={2}&appid={3}&requiredtags[0]=map&numperpage={4}&cursor={5}&return_metadata=1&query_type=1".format(HOST, ENDPOINT, key, APPID, NUMPERPAGE, urllib.parse.quote_plus(cursor))
56+
response_data = urllib.request.urlopen(req).read()
57+
response = json.loads(response_data.decode("utf-8", "ignore"))["response"]
58+
total = response["total"]
59+
last_cursor = cursor
60+
cursor = response["next_cursor"]
61+
62+
for addon in response["publishedfiledetails"]:
63+
hasignorewords = "title" in addon and containsIgnoreWords(addon["title"])
64+
sexyfuntimes = "maybe_inappropriate_sex" in addon and addon["maybe_inappropriate_sex"] == True
65+
if hasignorewords or sexyfuntimes:
4966
ign_str = u"Ignoring: " + addon["title"]
5067
print(ign_str.encode('utf-8'))
5168
continue
@@ -56,18 +73,18 @@ def containsIgnoreWords(str):
5673
workshopids.append(wsid)
5774

5875
# Informative output
59-
finished = page * NUMPERPAGE + len(resobj["response"]["publishedfiledetails"])
76+
finished = page * NUMPERPAGE + len(response["publishedfiledetails"])
6077
print("Finished {0} addons. ({1:.2f}% of {2})".format(finished, finished * 100.0 / total, total))
6178

6279
# Move on to the next page
6380
page += 1
6481

65-
if page * NUMPERPAGE > resobj["response"]["total"]:
82+
if page * NUMPERPAGE > response["total"]:
6683
break
67-
else:
84+
else:
6885
# so valve doesn't get angry at us
6986
time.sleep(DELAY)
70-
87+
7188
# Results come back sorted, but reverse it so
7289
# newer entries are added at the end instead of shifting everything at the beginning
7390
workshopids.reverse()
@@ -78,4 +95,3 @@ def containsIgnoreWords(str):
7895

7996
print("Finished!!")
8097
f.close()
81-

0 commit comments

Comments
 (0)