Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
for pid,j in db.items():
n += 1
idvv = '%sv%d' % (j['_rawid'], j['_version'])
idvv = idvv.split('/')[-1] # older papers
txt_path = os.path.join('data', 'txt', idvv) + '.pdf.txt'
if os.path.isfile(txt_path): # some pdfs dont translate to txt
with open(txt_path, 'r') as f:
Expand All @@ -39,13 +40,13 @@
print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))

# compute tfidf vectors with scikits
v = TfidfVectorizer(input='content',
encoding='utf-8', decode_error='replace', strip_accents='unicode',
lowercase=True, analyzer='word', stop_words='english',
v = TfidfVectorizer(input='content',
encoding='utf-8', decode_error='replace', strip_accents='unicode',
lowercase=True, analyzer='word', stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
ngram_range=(1, 2), max_features = max_features,
ngram_range=(1, 2), max_features = max_features,
norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
max_df=1.0, min_df=1)
max_df=1.0, min_df=1, dtype=np.float32)

# create an iterator object to conserve memory
def make_corpus(paths):
Expand Down
15 changes: 11 additions & 4 deletions download_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,20 @@

timeout_secs = 10 # after this many seconds we give up on a paper
if not os.path.exists(Config.pdf_dir): os.makedirs(Config.pdf_dir)
have = set(os.listdir(Config.pdf_dir)) # get list of all pdfs we already have

print('Reading pdf list')
files = list()
for (dirpath, dirnames, filenames) in os.walk(Config.pdf_dir):
files += [os.path.join(dirpath, file) for file in filenames]

have = set([os.path.split(pdf_path)[-1] for pdf_path in files]) # get list of all pdfs we already have
print('Read pdf list')

numok = 0
numtot = 0
db = pickle.load(open(Config.db_path, 'rb'))
for pid,j in db.items():

pdfs = [x['href'] for x in j['links'] if x['type'] == 'application/pdf']
assert len(pdfs) == 1
pdf_url = pdfs[0] + '.pdf'
Expand All @@ -37,8 +44,8 @@
except Exception as e:
print('error downloading: ', pdf_url)
print(e)

print('%d/%d of %d downloaded ok.' % (numok, numtot, len(db)))

print('final number of papers downloaded okay: %d/%d' % (numok, len(db)))

212 changes: 136 additions & 76 deletions fetch_papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from utils import Config, safe_pickle_dump

def encode_feedparser_dict(d):
"""
helper function to get rid of feedparser bs with a deep copy.
"""
helper function to get rid of feedparser bs with a deep copy.
I hate when libs wrap simple things in their own classes.
"""
if isinstance(d, feedparser.FeedParserDict) or isinstance(d, dict):
Expand All @@ -33,7 +33,7 @@ def encode_feedparser_dict(d):
return d

def parse_arxiv_url(url):
"""
"""
examples is http://arxiv.org/abs/1512.08756v2
we want to extract the raw id and the version
"""
Expand All @@ -43,82 +43,142 @@ def parse_arxiv_url(url):
assert len(parts) == 2, 'error parsing url ' + url
return parts[0], int(parts[1])

if __name__ == "__main__":

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('--search-query', type=str,
default='cat:cs.CV+OR+cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.NE+OR+cat:stat.ML',
help='query used for arxiv API. See http://arxiv.org/help/api/user-manual#detailed_examples')
parser.add_argument('--start-index', type=int, default=0, help='0 = most recent API result')
parser.add_argument('--max-index', type=int, default=10000, help='upper bound on paper index we will fetch')
parser.add_argument('--results-per-iteration', type=int, default=100, help='passed to arxiv API')
parser.add_argument('--wait-time', type=float, default=5.0, help='lets be gentle to arxiv API (in number of seconds)')
parser.add_argument('--break-on-no-added', type=int, default=1, help='break out early if all returned query papers are already in db? 1=yes, 0=no')
args = parser.parse_args()

# misc hardcoded variables
base_url = 'http://export.arxiv.org/api/query?' # base api query url
print('Searching arXiv for %s' % (args.search_query, ))

# lets load the existing database to memory
try:
db = pickle.load(open(Config.db_path, 'rb'))
except Exception as e:
print('error loading existing database:')
print(e)
print('starting from an empty database')
db = {}

# -----------------------------------------------------------------------------
# main loop where we fetch the new results
print('database has %d entries at start' % (len(db), ))
num_added_total = 0
for i in range(args.start_index, args.max_index, args.results_per_iteration):

print("Results %i - %i" % (i,i+args.results_per_iteration))
query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (args.search_query,
i, args.results_per_iteration)
with urllib.request.urlopen(base_url+query) as url:
response = url.read()
parse = feedparser.parse(response)
num_added = 0
num_skipped = 0
for e in parse.entries:

j = encode_feedparser_dict(e)

# extract just the raw arxiv id and version for this paper
rawid, version = parse_arxiv_url(j['id'])
j['_rawid'] = rawid
j['_version'] = version

# add to our database if we didn't have it before, or if this is a new version
if not rawid in db or j['_version'] > db[rawid]['_version']:
db[rawid] = j
print('Updated %s added %s' % (j['updated'].encode('utf-8'), j['title'].encode('utf-8')))
num_added += 1
num_added_total += 1
else:
num_skipped += 1

# print some information
print('Added %d papers, already had %d.' % (num_added, num_skipped))

if len(parse.entries) == 0:
print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
print(response)
break

if num_added == 0 and args.break_on_no_added == 1:
print('No new papers were added. Assuming no new papers exist. Exiting.')
break
def fetch_api(args, db):
    """
    Page through the arXiv API and merge the returned papers into db.

    args supplies search_query, start_index, max_index,
    results_per_iteration, wait_time and break_on_no_added. db maps a raw
    arxiv id to its stored entry and is updated in place: an entry is
    written when the id is unknown or when the fetched version is newer
    than the stored one.

    Returns the tuple (db, number of entries added or updated).
    """
    # misc hardcoded variables
    base_url = 'http://export.arxiv.org/api/query?'  # base api query url
    print('Searching arXiv for %s' % (args.search_query,))

    page_size = args.results_per_iteration
    num_added_total = 0
    for offset in range(args.start_index, args.max_index, page_size):

        print("Results %i - %i" % (offset, offset + page_size))
        query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (
            args.search_query, offset, page_size)
        with urllib.request.urlopen(base_url + query) as conn:
            response = conn.read()
        feed = feedparser.parse(response)

        num_added = 0
        num_skipped = 0
        for entry in feed.entries:
            record = encode_feedparser_dict(entry)

            # extract just the raw arxiv id and version for this paper
            rawid, version = parse_arxiv_url(record['id'])
            record['_rawid'] = rawid
            record['_version'] = version

            # nothing to do when we already hold this version or a newer one
            if rawid in db and record['_version'] <= db[rawid]['_version']:
                num_skipped += 1
                continue

            # first sighting of this paper, or a newer version of it
            db[rawid] = record
            print('Updated %s added %s' % (record['updated'].encode('utf-8'), record['title'].encode('utf-8')))
            num_added += 1
            num_added_total += 1

        # print some information
        print('Added %d papers, already had %d.' % (num_added, num_skipped))

        if len(feed.entries) == 0:
            # dump the raw response so the reason for the empty page is visible
            print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
            print(response)
            break

        if num_added == 0 and args.break_on_no_added == 1:
            print('No new papers were added. Assuming no new papers exist. Exiting.')
            break

        # pause between requests, with up to 3 extra seconds of random jitter
        print('Sleeping for %i seconds' % (args.wait_time , ))
        time.sleep(args.wait_time + random.uniform(0, 3))

    # save the database before we quit, if we found anything new
    if num_added_total > 0:
        print('Saving database with %d papers to %s' % (len(db), Config.db_path))
        safe_pickle_dump(db, Config.db_path)
    return db, num_added_total


def _normalize_categories(categories):
    """Return the category filter as a set, or None when no filter is requested."""
    if categories is None:
        return None
    if isinstance(categories, str):
        # a bare string would otherwise be split into single characters by
        # set(); accept "cs.CV,cs.LG" as well as "cs.CV cs.LG"
        return set(categories.replace(',', ' ').split())
    return set(categories)


def _kaggle_paper_to_entry(paper, categories):
    """
    Add to a kaggle metadata record, in place, the keys that API entries carry.

    categories is the ordered list of category terms of this paper. Returns
    the same (mutated) paper dict.
    """
    versions = paper['versions']
    latest_version = versions[-1]['version']

    paper['_version'] = len(versions)
    paper['updated'] = versions[-1]['created']
    paper['published'] = versions[0]['created']
    # keep the original author string around before replacing it with a list
    paper['_authors'] = paper['authors']
    paper['authors'] = [{'name': " ".join([x[1], x[0]]).strip()} for x in paper['authors_parsed']]
    paper['links'] = [{'title': 'pdf',
                       'href': 'http://arxiv.org/pdf/{}{}'.format(paper['id'], latest_version),
                       'rel': 'related', 'type': 'application/pdf'}]
    paper['link'] = 'http://arxiv.org/abs/{}{}'.format(paper['id'], latest_version)
    paper['_rawid'] = paper['id']
    paper['tags'] = [{'term': x} for x in categories]
    # first listed category; assumes the snapshot lists the primary category
    # first -- TODO confirm against the dataset documentation
    paper['arxiv_primary_category'] = paper['tags'][0]
    paper['summary'] = paper['abstract']
    return paper


def fetch_kaggle(args, db):
    """
    Download the kaggle arXiv metadata snapshot and merge it into db.

    args.categories selects which papers to keep: None keeps everything,
    otherwise a paper is kept when it shares at least one category with the
    filter (a list/set of terms, or a single comma/space separated string).
    db maps a raw arxiv id to its stored entry and is updated in place: an
    entry is written when the id is unknown or the snapshot holds more
    versions than the stored `_version`.

    Returns the tuple (db, number of entries added or updated).
    """
    # imported here so the API-only code path does not need these packages
    import kaggle
    import jsonlines

    cat_set = _normalize_categories(args.categories)

    print('Authenticating at kaggle')
    kaggle.api.authenticate()
    print('Downloading kaggle data')
    kaggle.api.dataset_download_files('Cornell-University/arxiv', path='./kaggle', unzip=True)
    print('Downloaded kaggle data')

    num_added_total = 0
    num_skipped_total = 0
    with jsonlines.open('kaggle/arxiv-metadata-oai-snapshot.json') as reader:
        for paper in reader:
            # ordered and de-duplicated, so tags[0] is the first listed category
            categories = list(dict.fromkeys(paper['categories'].split()))
            if cat_set is not None and cat_set.isdisjoint(categories):
                continue

            paper = _kaggle_paper_to_entry(paper, categories)
            rawid = paper['_rawid']

            # add to our database if we didn't have it before, or if this is a new version
            if rawid not in db or paper['_version'] > db[rawid]['_version']:
                db[rawid] = paper
                print('Updated %s added %s' % (paper['updated'].encode('utf-8'), paper['title'].encode('utf-8')))
                num_added_total += 1
            else:
                num_skipped_total += 1

    print('Added %d papers, already had %d.' % (num_added_total, num_skipped_total))
    return db, num_added_total


if __name__ == "__main__":

    # parse input arguments
    parser = argparse.ArgumentParser()

    # nargs='+' keeps the parsed value a list of terms, matching the default;
    # with a plain type=str a user-supplied value would be one single string
    parser.add_argument('--categories', type=str, nargs='+',
                        default=['cs.CV', 'cs.AI', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML', 'cond-mat.dis-nn'],
                        help='categories to search for (space separated, used with --kaggle)')
    parser.add_argument('--search-query', type=str,
                        default='cat:cs.CV+OR+cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.NE+OR+cat:stat.ML+OR+cat:cond-mat.dis-nn',
                        help='query used for arxiv API. See http://arxiv.org/help/api/user-manual#detailed_examples')
    parser.add_argument('--start-index', type=int, default=0, help='0 = most recent API result')
    parser.add_argument('--max-index', type=int, default=10000, help='upper bound on paper index we will fetch')
    parser.add_argument('--kaggle', dest='kaggle', action='store_true', help='use kaggle data')
    parser.add_argument('--results-per-iteration', type=int, default=100, help='passed to arxiv API')
    parser.add_argument('--wait-time', type=float, default=5.0,
                        help='lets be gentle to arxiv API (in number of seconds)')
    parser.add_argument('--break-on-no-added', type=int, default=1,
                        help='break out early if all returned query papers are already in db? 1=yes, 0=no')
    args = parser.parse_args()

    # lets load the existing database to memory
    # NOTE(review): pickle must only ever be loaded from a trusted, locally written file
    try:
        with open(Config.db_path, 'rb') as db_file:
            db = pickle.load(db_file)
    except Exception as e:
        # deliberate best effort: a missing or unreadable database means a fresh start
        print('error loading existing database:')
        print(e)
        print('starting from an empty database')
        db = {}

    # -----------------------------------------------------------------------------
    # main loop where we fetch the new results
    print('database has %d entries at start' % (len(db),))

    # pick the data source: kaggle bulk snapshot or the paged arxiv API
    if args.kaggle:
        db, num_added_total = fetch_kaggle(args, db)
    else:
        db, num_added_total = fetch_api(args, db)

    # save the database before we quit, if we found anything new
    if num_added_total > 0:
        print('Saving database with %d papers to %s' % (len(db), Config.db_path))
        safe_pickle_dump(db, Config.db_path)
12 changes: 9 additions & 3 deletions parse_pdf_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,21 @@
os.makedirs(Config.txt_dir)

have = set(os.listdir(Config.txt_dir))
files = os.listdir(Config.pdf_dir)
for i,f in enumerate(files): # there was a ,start=1 here that I removed, can't remember why it would be there. shouldn't be, i think.

db = pickle.load(open(Config.db_path, 'rb'))
db_filenames = set([([x['href'] for x in db[j]['links'] if x['type'] == 'application/pdf'][0] + '.pdf').split('/')[-1] for j in db])
files = list()
for (dirpath, dirnames, filenames) in os.walk(Config.pdf_dir):
files += [os.path.join(dirpath, file) for file in filenames if file in db_filenames]

for i, pdf_path in enumerate(files): # there was a ,start=1 here that I removed, can't remember why it would be there. shouldn't be, i think.
f = os.path.split(pdf_path)[-1]

txt_basename = f + '.txt'
if txt_basename in have:
print('%d/%d skipping %s, already exists.' % (i, len(files), txt_basename, ))
continue

pdf_path = os.path.join(Config.pdf_dir, f)
txt_path = os.path.join(Config.txt_dir, txt_basename)
cmd = "pdftotext %s %s" % (pdf_path, txt_path)
os.system(cmd)
Expand Down
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ flask
flask_limiter
tornado
pymongo

tqdm

kaggle
jsonlines
google-cloud-storage
18 changes: 12 additions & 6 deletions thumb_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"""

import os
import pickle
import sys
import time
import shutil
from subprocess import Popen
Expand All @@ -21,15 +23,19 @@
if not os.path.exists(Config.tmp_dir): os.makedirs(Config.tmp_dir)

# fetch all pdf filenames in the pdf directory
files_in_pdf_dir = os.listdir(pdf_dir)
pdf_files = [x for x in files_in_pdf_dir if x.endswith('.pdf')] # filter to just pdfs, just in case
db = pickle.load(open(Config.db_path, 'rb'))
db_filenames = set([([x['href'] for x in db[j]['links'] if x['type'] == 'application/pdf'][0] + '.pdf').split('/')[-1] for j in db])
files_in_pdf_dir = list()
for (dirpath, dirnames, filenames) in os.walk(Config.pdf_dir):
files_in_pdf_dir += [os.path.join(dirpath, file) for file in filenames if file in db_filenames]
pdf_files = [x for x in files_in_pdf_dir if x.endswith('.pdf')] # filter to just pdfs, just in case

# iterate over all pdf files and create the thumbnails
for i,p in enumerate(pdf_files):
pdf_path = os.path.join(pdf_dir, p)
for i, pdf_path in enumerate(pdf_files):
p = os.path.split(pdf_path)[-1]
thumb_path = os.path.join(Config.thumbs_dir, p + '.jpg')

if os.path.isfile(thumb_path):
if os.path.isfile(thumb_path):
print("skipping %s, thumbnail already exists." % (pdf_path, ))
continue

Expand All @@ -39,7 +45,7 @@
# tile them horizontally, use JPEG compression 80, trim the borders for each image
#cmd = "montage %s[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (pdf_path, "thumbs/" + f + ".jpg")
#print "EXEC: " + cmd

# nvm, below using a roundabout alternative that is worse and requires temporary files, yuck!
# but i found that it succeeds more often. I can't remember what happened anymore but I remember
# that the version above, while more elegant, had some problem with it on some pdfs. I think.
Expand Down
Loading