karpathy · Randl · May 18, 2021
diff --git a/analyze.py b/analyze.py
@@ -24,6 +24,7 @@
 for pid,j in db.items():
   n += 1
   idvv = '%sv%d' % (j['_rawid'], j['_version'])
+  idvv = idvv.split('/')[-1]  # older papers
   txt_path = os.path.join('data', 'txt', idvv) + '.pdf.txt'
   if os.path.isfile(txt_path): # some pdfs dont translate to txt
     with open(txt_path, 'r') as f:
@@ -39,13 +40,13 @@
 print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))
 
 # compute tfidf vectors with scikits
-v = TfidfVectorizer(input='content', 
-        encoding='utf-8', decode_error='replace', strip_accents='unicode', 
-        lowercase=True, analyzer='word', stop_words='english', 
+v = TfidfVectorizer(input='content',
+        encoding='utf-8', decode_error='replace', strip_accents='unicode',
+        lowercase=True, analyzer='word', stop_words='english',
         token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
-        ngram_range=(1, 2), max_features = max_features, 
+        ngram_range=(1, 2), max_features = max_features,
         norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
-        max_df=1.0, min_df=1)
+        max_df=1.0, min_df=1, dtype=np.float32)
 
 # create an iterator object to conserve memory
 def make_corpus(paths):

diff --git a/download_pdfs.py b/download_pdfs.py
@@ -9,13 +9,20 @@
 
 timeout_secs = 10 # after this many seconds we give up on a paper
 if not os.path.exists(Config.pdf_dir): os.makedirs(Config.pdf_dir)
-have = set(os.listdir(Config.pdf_dir)) # get list of all pdfs we already have
+
+print('Reading pdf list')
+files = list()
+for (dirpath, dirnames, filenames) in os.walk(Config.pdf_dir):
+    files += [os.path.join(dirpath, file) for file in filenames]
+
+have = set([os.path.split(pdf_path)[-1] for pdf_path in files])  # get list of all pdfs we already have
+print('Read pdf list')
 
 numok = 0
 numtot = 0
 db = pickle.load(open(Config.db_path, 'rb'))
 for pid,j in db.items():
-  
+
   pdfs = [x['href'] for x in j['links'] if x['type'] == 'application/pdf']
   assert len(pdfs) == 1
   pdf_url = pdfs[0] + '.pdf'
@@ -37,8 +44,8 @@
   except Exception as e:
     print('error downloading: ', pdf_url)
     print(e)
-  
+
   print('%d/%d of %d downloaded ok.' % (numok, numtot, len(db)))
-  
+
 print('final number of papers downloaded okay: %d/%d' % (numok, len(db)))
 
diff --git a/fetch_papers.py b/fetch_papers.py
@@ -15,8 +15,8 @@
 from utils import Config, safe_pickle_dump
 
 def encode_feedparser_dict(d):
-  """ 
-  helper function to get rid of feedparser bs with a deep copy. 
+  """
+  helper function to get rid of feedparser bs with a deep copy.
   I hate when libs wrap simple things in their own classes.
   """
   if isinstance(d, feedparser.FeedParserDict) or isinstance(d, dict):
@@ -33,7 +33,7 @@ def encode_feedparser_dict(d):
     return d
 
 def parse_arxiv_url(url):
-  """ 
+  """
   examples is http://arxiv.org/abs/1512.08756v2
   we want to extract the raw id and the version
   """
@@ -43,82 +43,142 @@ def parse_arxiv_url(url):
   assert len(parts) == 2, 'error parsing url ' + url
   return parts[0], int(parts[1])
 
-if __name__ == "__main__":
 
-  # parse input arguments
-  parser = argparse.ArgumentParser()
-  parser.add_argument('--search-query', type=str,
-                      default='cat:cs.CV+OR+cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.NE+OR+cat:stat.ML',
-                      help='query used for arxiv API. See http://arxiv.org/help/api/user-manual#detailed_examples')
-  parser.add_argument('--start-index', type=int, default=0, help='0 = most recent API result')
-  parser.add_argument('--max-index', type=int, default=10000, help='upper bound on paper index we will fetch')
-  parser.add_argument('--results-per-iteration', type=int, default=100, help='passed to arxiv API')
-  parser.add_argument('--wait-time', type=float, default=5.0, help='lets be gentle to arxiv API (in number of seconds)')
-  parser.add_argument('--break-on-no-added', type=int, default=1, help='break out early if all returned query papers are already in db? 1=yes, 0=no')
-  args = parser.parse_args()
-
-  # misc hardcoded variables
-  base_url = 'http://export.arxiv.org/api/query?' # base api query url
-  print('Searching arXiv for %s' % (args.search_query, ))
-
-  # lets load the existing database to memory
-  try:
-    db = pickle.load(open(Config.db_path, 'rb'))
-  except Exception as e:
-    print('error loading existing database:')
-    print(e)
-    print('starting from an empty database')
-    db = {}
-
-  # -----------------------------------------------------------------------------
-  # main loop where we fetch the new results
-  print('database has %d entries at start' % (len(db), ))
-  num_added_total = 0
-  for i in range(args.start_index, args.max_index, args.results_per_iteration):
-
-    print("Results %i - %i" % (i,i+args.results_per_iteration))
-    query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (args.search_query,
-                                                         i, args.results_per_iteration)
-    with urllib.request.urlopen(base_url+query) as url:
-      response = url.read()
-    parse = feedparser.parse(response)
-    num_added = 0
-    num_skipped = 0
-    for e in parse.entries:
-
-      j = encode_feedparser_dict(e)
-
-      # extract just the raw arxiv id and version for this paper
-      rawid, version = parse_arxiv_url(j['id'])
-      j['_rawid'] = rawid
-      j['_version'] = version
-
-      # add to our database if we didn't have it before, or if this is a new version
-      if not rawid in db or j['_version'] > db[rawid]['_version']:
-        db[rawid] = j
-        print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
-        num_added += 1
-        num_added_total += 1
-      else:
-        num_skipped += 1
-
-    # print some information
-    print('Added %d papers, already had %d.' % (num_added, num_skipped))
-
-    if len(parse.entries) == 0:
-      print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
-      print(response)
-      break
-
-    if num_added == 0 and args.break_on_no_added == 1:
-      print('No new papers were added. Assuming no new papers exist. Exiting.')
-      break
+def fetch_api(args, db):
+    """Page through the arXiv API and merge new or newer-version papers into db; returns (db, num_added_total)."""
+    base_url = 'http://export.arxiv.org/api/query?'  # base api query url
+    print('Searching arXiv for %s' % (args.search_query,))
+
+    num_added_total = 0
+    for i in range(args.start_index, args.max_index, args.results_per_iteration):
+
+        print("Results %i - %i" % (i, i + args.results_per_iteration))
+        query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (args.search_query,
+                                                                                    i, args.results_per_iteration)
+        with urllib.request.urlopen(base_url + query) as url:
+            response = url.read()
+        parse = feedparser.parse(response)
+        num_added = 0
+        num_skipped = 0
+        for e in parse.entries:
+
+            j = encode_feedparser_dict(e)
+
+            # extract just the raw arxiv id and version for this paper
+            rawid, version = parse_arxiv_url(j['id'])
+            j['_rawid'] = rawid
+            j['_version'] = version
+
+            # add to our database if we didn't have it before, or if this is a new version
+            if rawid not in db or j['_version'] > db[rawid]['_version']:
+                db[rawid] = j
+                print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
+                num_added += 1
+                num_added_total += 1
+            else:
+                num_skipped += 1
+
+        # print some information
+        print('Added %d papers, already had %d.' % (num_added, num_skipped))
+
+        if len(parse.entries) == 0:
+            print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
+            print(response)
+            break
+
+        if num_added == 0 and args.break_on_no_added == 1:
+            print('No new papers were added. Assuming no new papers exist. Exiting.')
+            break
 
-    print('Sleeping for %i seconds' % (args.wait_time , ))
-    time.sleep(args.wait_time + random.uniform(0, 3))
+        print('Sleeping for %i seconds' % (args.wait_time , ))
+        time.sleep(args.wait_time + random.uniform(0, 3))  # must stay inside the paging loop so every request is throttled
 
-  # save the database before we quit, if we found anything new
-  if num_added_total > 0:
-    print('Saving database with %d papers to %s' % (len(db), Config.db_path))
-    safe_pickle_dump(db, Config.db_path)
+    return db, num_added_total
+
+
+def fetch_kaggle(args, db):
+    """Merge papers from the Kaggle arXiv metadata snapshot into db; returns (db, num_added_total)."""
+    import kaggle
+    import jsonlines
+    wanted = [args.categories] if isinstance(args.categories, str) else args.categories  # a bare string would be split into characters by set()
+    cat_set = None if wanted is None else set(wanted)  # None means: do not filter by category
+    print('Authenticating at kaggle')
+    kaggle.api.authenticate()
+    print('Downloading kaggle data')
+    kaggle.api.dataset_download_files('Cornell-University/arxiv', path='./kaggle', unzip=True)
+    print('Downloaded kaggle data')
+    num_added_total = 0
+    num_skipped_total = 0
+    with jsonlines.open('kaggle/arxiv-metadata-oai-snapshot.json') as reader:
+        for paper in reader:
+            categories = paper['categories'].split()  # keep the listed order so tags[0] is the first-listed category, not an arbitrary set element
+            if cat_set is None or not cat_set.isdisjoint(categories):
+                paper['_version'] = len(paper['versions'])
+                paper['updated'] = paper['versions'][-1]['created']
+                paper['published'] = paper['versions'][0]['created']
+                paper['_authors'] = paper['authors']
+                paper['authors'] = [{'name': " ".join([x[1], x[0]]).strip()} for x in paper['authors_parsed']]
+                paper['links'] = [{'title': 'pdf',
+                                   'href': 'http://arxiv.org/pdf/{}{}'.format(paper['id'],
+                                                                              paper['versions'][-1]['version']),
+                                   'rel': 'related', 'type': 'application/pdf'}]
+                paper['link'] = 'http://arxiv.org/abs/{}{}'.format(paper['id'], paper['versions'][-1]['version'])
+                rawid = paper['_rawid'] = paper['id']
+                paper['tags'] = [{'term': x} for x in categories]
+                paper['arxiv_primary_category'] = paper['tags'][0]
+                paper['summary'] = paper['abstract']
+
+                # add to our database if we didn't have it before, or if this is a new version
+                if rawid not in db or paper['_version'] > db[rawid]['_version']:
+                    db[rawid] = paper
+                    print('Updated %s added %s' % (paper['updated'].encode('utf-8'), paper['title'].encode('utf-8')))
+                    num_added_total += 1
+                else:
+                    num_skipped_total += 1
+
+    print('Added %d papers, already had %d.' % (num_added_total, num_skipped_total))
+    return db, num_added_total
+
+
+if __name__ == "__main__":
 
+    # parse input arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--categories', type=str, nargs='+',  # nargs keeps a CLI value a list of names, same shape as the default
+                        default=['cs.CV', 'cs.AI', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML', 'cond-mat.dis-nn'],
+                        help='categories to search for')
+    parser.add_argument('--search-query', type=str,
+                        default='cat:cs.CV+OR+cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.NE+OR+cat:stat.ML+OR+cat:cond-mat.dis-nn',
+                        help='query used for arxiv API. See http://arxiv.org/help/api/user-manual#detailed_examples')
+    parser.add_argument('--start-index', type=int, default=0, help='0 = most recent API result')
+    parser.add_argument('--max-index', type=int, default=10000, help='upper bound on paper index we will fetch')
+    parser.add_argument('--kaggle', dest='kaggle', action='store_true', help='use kaggle data')
+    parser.add_argument('--results-per-iteration', type=int, default=100, help='passed to arxiv API')
+    parser.add_argument('--wait-time', type=float, default=5.0,
+                        help='lets be gentle to arxiv API (in number of seconds)')
+    parser.add_argument('--break-on-no-added', type=int, default=1,
+                        help='break out early if all returned query papers are already in db? 1=yes, 0=no')
+    args = parser.parse_args()
+
+    # lets load the existing database to memory
+    try:
+        with open(Config.db_path, 'rb') as f:  # close the handle as soon as the pickle is read
+            db = pickle.load(f)
+    except Exception as e:
+        print('error loading existing database:')
+        print(e)
+        print('starting from an empty database')
+        db = {}
+
+    # -----------------------------------------------------------------------------
+    # fetch new results, either from the kaggle snapshot or from the arxiv API
+    print('database has %d entries at start' % (len(db),))
+
+    if args.kaggle:
+        db, num_added_total = fetch_kaggle(args, db)
+    else:
+        db, num_added_total = fetch_api(args, db)
+
+    if num_added_total > 0:
+        print('Saving database with %d papers to %s' % (len(db), Config.db_path))
+        safe_pickle_dump(db, Config.db_path)
diff --git a/parse_pdf_to_text.py b/parse_pdf_to_text.py
@@ -23,15 +23,21 @@
   os.makedirs(Config.txt_dir)
 
 have = set(os.listdir(Config.txt_dir))
-files = os.listdir(Config.pdf_dir)
-for i,f in enumerate(files): # there was a ,start=1 here that I removed, can't remember why it would be there. shouldn't be, i think.
+
+db = pickle.load(open(Config.db_path, 'rb'))
+db_filenames = set([([x['href'] for x in db[j]['links'] if x['type'] == 'application/pdf'][0] + '.pdf').split('/')[-1] for j in db])
+files = list()
+for (dirpath, dirnames, filenames) in os.walk(Config.pdf_dir):
+  files += [os.path.join(dirpath, file) for file in filenames if file in db_filenames]
+
+for i, pdf_path in enumerate(files):  # there was a ,start=1 here that I removed, can't remember why it would be there. shouldn't be, i think.
+  f = os.path.split(pdf_path)[-1]
 
   txt_basename = f + '.txt'
   if txt_basename in have:
     print('%d/%d skipping %s, already exists.' % (i, len(files), txt_basename, ))
     continue
 
-  pdf_path = os.path.join(Config.pdf_dir, f)
   txt_path = os.path.join(Config.txt_dir, txt_basename)
   cmd = "pdftotext %s %s" % (pdf_path, txt_path)
   os.system(cmd)

diff --git a/requirements.txt b/requirements.txt
@@ -16,3 +16,9 @@ flask
 flask_limiter
 tornado
 pymongo
+
+tqdm
+
+kaggle
+jsonlines
+google-cloud-storage
diff --git a/thumb_pdf.py b/thumb_pdf.py
@@ -4,6 +4,8 @@
 """
 
 import os
+import pickle
+import sys
 import time
 import shutil
 from subprocess import Popen
@@ -21,15 +23,19 @@
 if not os.path.exists(Config.tmp_dir): os.makedirs(Config.tmp_dir)
 
 # fetch all pdf filenames in the pdf directory
-files_in_pdf_dir = os.listdir(pdf_dir)
-pdf_files = [x for x in files_in_pdf_dir if x.endswith('.pdf')] # filter to just pdfs, just in case
+db = pickle.load(open(Config.db_path, 'rb'))
+db_filenames = set([([x['href'] for x in db[j]['links'] if x['type'] == 'application/pdf'][0] + '.pdf').split('/')[-1] for j in db])
+files_in_pdf_dir = list()
+for (dirpath, dirnames, filenames) in os.walk(Config.pdf_dir):
+  files_in_pdf_dir += [os.path.join(dirpath, file) for file in filenames if file in db_filenames]
+pdf_files = [x for x in files_in_pdf_dir if x.endswith('.pdf')]  # filter to just pdfs, just in case
 
 # iterate over all pdf files and create the thumbnails
-for i,p in enumerate(pdf_files):
-  pdf_path = os.path.join(pdf_dir, p)
+for i, pdf_path in enumerate(pdf_files):
+  p = os.path.split(pdf_path)[-1]
   thumb_path = os.path.join(Config.thumbs_dir, p + '.jpg')
 
-  if os.path.isfile(thumb_path): 
+  if os.path.isfile(thumb_path):
     print("skipping %s, thumbnail already exists." % (pdf_path, ))
     continue
 
@@ -39,7 +45,7 @@
   # tile them horizontally, use JPEG compression 80, trim the borders for each image
   #cmd = "montage %s[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (pdf_path, "thumbs/" + f + ".jpg")
   #print "EXEC: " + cmd
-  
+
   # nvm, below using a roundabout alternative that is worse and requires temporary files, yuck!
   # but i found that it succeeds more often. I can't remember wha thappened anymore but I remember
   # that the version above, while more elegant, had some problem with it on some pdfs. I think.