Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
9234e88
Lots of changes including tests of the import regexs and print statem…
craigmaclachlan Mar 23, 2016
99331a8
Fixed the depsy module call. It's not a proper module so can't be imp…
craigmaclachlan Mar 23, 2016
23f9a97
Testing something.
craigmaclachlan Mar 23, 2016
37a08e4
Removed redundant selfs.
craigmaclachlan Mar 23, 2016
eeddbe8
Fixed regex searching.
craigmaclachlan Mar 23, 2016
e286e0e
Fixed test results. They were wrong, not the code.
craigmaclachlan Mar 23, 2016
ff2d31c
Fixed regex code.
craigmaclachlan Mar 23, 2016
92e6285
Another regex fix.
craigmaclachlan Mar 23, 2016
c456275
Fix expected test results.
craigmaclachlan Mar 23, 2016
6479f4d
More fixing
craigmaclachlan Mar 23, 2016
1a6efbd
Fixy fixy
craigmaclachlan Mar 23, 2016
2026ceb
Removed testing bits.
craigmaclachlan Mar 23, 2016
979645c
Fixed module referencing.
craigmaclachlan Mar 23, 2016
ce261be
Brought Depsy functions into package.
craigmaclachlan Mar 23, 2016
5534d6c
Fix indent.
craigmaclachlan Mar 23, 2016
7ca2f84
Removed missing import.
craigmaclachlan Mar 23, 2016
c1ca333
Removed extraneous code.
craigmaclachlan Mar 23, 2016
cbb5adf
Scraping requirements doesn't work on our repo.
craigmaclachlan Mar 23, 2016
9a2ce78
Setup.py doesn't work either.
craigmaclachlan Mar 23, 2016
7df78d8
testing
craigmaclachlan Mar 23, 2016
a9e7e32
get the filelist manually.
craigmaclachlan Mar 23, 2016
9ea94c1
corrected import.
craigmaclachlan Mar 23, 2016
24a13a6
fixed basename command.
craigmaclachlan Mar 23, 2016
11e6f6a
Need to allow leading whitespace.
craigmaclachlan Mar 23, 2016
ef3072c
Need the right symbol for whitespace.
craigmaclachlan Mar 23, 2016
5e484f9
Removed start of line symbol from regex.
craigmaclachlan Mar 23, 2016
88e8bdb
remove whitespace.
craigmaclachlan Mar 23, 2016
d10513c
Fixed regex again!
craigmaclachlan Mar 23, 2016
e45e42b
protect lines that don't import.
craigmaclachlan Mar 23, 2016
eec6881
add print
craigmaclachlan Mar 23, 2016
0030bd8
more diag
craigmaclachlan Mar 23, 2016
f7b522e
change splitting
craigmaclachlan Mar 23, 2016
53f83c0
remove split?
craigmaclachlan Mar 23, 2016
be74c86
print the text
craigmaclachlan Mar 23, 2016
572cf12
change to cat file func
craigmaclachlan Mar 23, 2016
2e683c0
arrrgggh
craigmaclachlan Mar 23, 2016
6a7c41a
commit
craigmaclachlan Mar 23, 2016
c2714ad
simplify the regex
craigmaclachlan Mar 23, 2016
c6016e9
some exception handling.
craigmaclachlan Mar 23, 2016
d66a429
Remove excessive prints
craigmaclachlan Mar 23, 2016
11c48c4
arrrgh2
craigmaclachlan Mar 23, 2016
823fc36
stupid regex i can think of
craigmaclachlan Mar 23, 2016
b8abebb
changed match to search.
craigmaclachlan Mar 23, 2016
b91c215
OK working now. IN the most horrible way.
craigmaclachlan Mar 23, 2016
8a26d4f
Fixed the regular expression searching and added the filtering of the…
craigmaclachlan Mar 30, 2016
651c0aa
Uncommented logic block that allows the use of requirements.txt and se…
craigmaclachlan Mar 30, 2016
8f402ad
tests now run in python 3: python3 prints and fixed tabs vs spaces
Apr 11, 2016
1e02d78
integrated `get_dependencies` MAY NOT WORK YET
Apr 12, 2016
988ebe3
fixed file mode py3k
Apr 14, 2016
3749d28
debug statements
Apr 14, 2016
5ab8547
can also have a setup.py with no dependencies in it
Apr 14, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Software Sustainability Institute Hackday 2016 Team
* Laurence Billingham <[email protected]>
* Martin Hammitzsch
* Steve Harris
* Craig MacLachlan
* Craig MacLachlan <[email protected]>

Contributors
------------
Expand Down
36 changes: 25 additions & 11 deletions commit_opener/commit_opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,43 @@
import pandas as pd
from shutil import rmtree

from . tree_scrape import author_minded
from . query_pmc import pmc_data

from commit_opener.grab_dependencies import get_dependencies
from commit_opener.tree_scrape import author_minded
from commit_opener.query_pmc import pmc_data as pubmed_data

OUT_SUBFOLDER = 'contrib_data'
AUTHOR_DATA = 'author_data.json'


def verify_local_repo_location(repo):
    """Raise ``IOError`` unless ``repo`` is an existing directory.

    :param repo: filesystem path expected to hold the repository.
    :raises IOError: when ``os.path.isdir(repo)`` is false.
    """
    if os.path.isdir(repo):
        return
    message = 'could not locate repository {}'.format(repo)
    raise IOError(message)


def build_out_path(repo_name, parent_path=None):
    """Return ``<parent_path>/<repo_name>/<OUT_SUBFOLDER>``.

    :param repo_name: name of the repository; used as a path component.
    :param parent_path: directory the path is built under. When ``None``,
        the absolute form of the current directory is used instead.
    :returns: the joined path as a string (nothing is created on disk).
    """
    base_dir = os.path.abspath(os.curdir) if parent_path is None else parent_path
    return os.path.join(base_dir, repo_name, OUT_SUBFOLDER)


def make_output_folder(path_, overwrite):
    """Make sure the output directory ``path_`` exists.

    :param path_: directory to create; missing parent directories are
        created as well.
    :param overwrite: when true, anything already at ``path_`` is removed
        with ``rmtree`` and an empty directory is created in its place.
        When false, an existing ``path_`` is left exactly as it is.
    """
    if os.path.exists(path_):
        if not overwrite:
            # The caller asked to keep previously written data: do nothing.
            return
        rmtree(path_)
    os.makedirs(path_)


@click.command()
@click.option('--repo', prompt='git repository location', help='path to folder containing .git repository or url')
@click.option('--repo', prompt='git repository location',
help='path to folder containing .git repository or url')
@click.option('--out_dir', default=None,
help='parent dir for output data, default same as .git folder scraped')
help='parent dir for output data, default same as .git folder scraped')
@click.option('--clobber_output', default=True,
help='should we overwrite existing data?, default True')
@click.option('--verbose/--no-verbose', default=False)

help='should we overwrite existing data?, default True')
@click.option('--verbose/--no-verbose', default=True)
def main(repo, out_dir, clobber_output, verbose):
""" """
import logging
Expand All @@ -55,10 +61,18 @@ def main(repo, out_dir, clobber_output, verbose):
repo_name = os.path.basename(repo)
make_output_folder(out_dir, overwrite=clobber_output)
contributor_data = author_minded(repo)
citation_data = pmc_data('SPSS')
logging.info("output path: %s" % os.path.join(out_dir,'contributor_data.json'))
contributor_data.to_json(os.path.join(out_dir,'contributor_data.json'), date_format='iso')
citation_data['citations'].to_json(os.path.join(out_dir,'citation_data.json'))
citation_data = pubmed_data('SPSS')
depends_data = get_dependencies(repo_name, repo)
logging.info('got dependency data of type {}'.format(type(depends_data)))
logging.info('got dependency data:\n {}'.format(depends_data))
logging.info("output path: %s" % os.path.join(out_dir,
'contributor_data.json'))
contributor_data.to_json(os.path.join(out_dir,
'contributor_data.json'),
date_format='iso')
citation_data['citations'].to_json(os.path.join(out_dir,
'citation_data.json'))
depends_data.to_json(os.path.join(out_dir, 'dependencies_data.json'))

if __name__ == '__main__':
main()
133 changes: 133 additions & 0 deletions commit_opener/depsy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re
import pickle
import ast
import os.path
import errno
import requests

# Functions from depsy.


def parse_requirements_txt(contents):
    """Extract package names from the text of a pip requirements file.

    A name is the leading run of word characters, dots and dashes on a
    line, after optional leading spaces or tabs. Lines that begin with
    ``file:``, ``-`` or ``.`` are skipped, as are lines that do not begin
    with a name character at all (``#`` comments, blank lines). Version
    constraints and anything else after the name are dropped.

    How typical lines are treated::

        # my comment                      -> skipped
        file://blahblah                   -> skipped
        foo==10.2                         -> foo
        baz>=3.6                          -> baz
        foo.bar>=3.33                     -> foo.bar
        foo-bar==2.2                      -> foo-bar
        foo == 5.5                        -> foo
        .for some reason a leading dot    -> skipped
        --index-url blahblah              -> skipped
        -e http://blah                    -> skipped
          foo_with_space_in_front = 1.1   -> foo_with_space_in_front

    :param contents: full text of the requirements file.
    :returns: sorted list of the names found (duplicates are kept).
    """
    # Spec used for parsing:
    # https://pip.readthedocs.org/en/1.1/requirements.html#the-requirements-file-format
    # It doesn't mention the '#' comment but it is common in real files.
    #
    # Leading whitespace is matched with [ \t]* rather than \s*: \s also
    # matches newlines, which let a blank line "absorb" the start of the next
    # line and slip "-e", ".x" or "file" past the exclusion lookahead. The
    # lookahead sits after the indentation so indented option lines are
    # excluded as well.
    reqs = re.findall(
        r'^[ \t]*(?!file:|-|\.)([\w.-]+)',
        contents,
        re.MULTILINE | re.IGNORECASE
    )
    return sorted(reqs)


def _clean_setup_req(req):
    """Reduce a requirement specifier such as ``"foo>=1.0"`` to ``"foo"``."""
    # The name is the leading run of word characters, dots and dashes;
    # version constraints, extras and markers come after it.
    match = re.match(r"\s*([\w.-]+)", req)
    return match.group(1) if match else req.strip()


def _setup_req_strings(node):
    """Return the string literals held by a list/tuple AST node.

    A node without ``elts`` is treated as a single candidate element, and
    elements that are not string literals are ignored.
    """
    found = []
    for elt in getattr(node, "elts", [node]):
        # ``value`` carries the literal on ast.Constant; ``s`` is the
        # spelling used by the older ast.Str node.
        text = getattr(elt, "value", None)
        if not isinstance(text, str):
            text = getattr(elt, "s", None)
        if isinstance(text, str):
            found.append(text)
    return found


def parse_setup_py(contents):
    """Extract requirement names from the source text of a ``setup.py``.

    The source is parsed with ``ast`` (it is never executed). Every call
    written as a plain ``setup(...)`` is inspected for the keyword
    arguments ``install_requires``, ``requires`` and ``extras_require``;
    string literals found there are reduced to bare package names.
    Values that are not literal lists/strings (for example a variable
    name) contribute nothing.

    :param contents: full text of the ``setup.py`` file.
    :returns: sorted list of requirement names (duplicates are kept).
    :raises SyntaxError: if ``contents`` cannot be parsed by ``ast.parse``.
    """
    parsed = ast.parse(contents)
    ret = []
    # see ast docs: https://greentreesnakes.readthedocs.org/en/latest/index.html
    for node in ast.walk(parsed):
        # Only nodes whose ``func`` is a bare name ``setup`` qualify; every
        # other node kind simply lacks these attributes.
        func = getattr(node, "func", None)
        if getattr(func, "id", None) != "setup":
            continue

        for keyword in node.keywords:
            if keyword.arg in ("install_requires", "requires"):
                sources = [keyword.value]
            elif keyword.arg == "extras_require":
                # A dict literal: each value is a list of requirements.
                sources = getattr(keyword.value, "values", [])
            else:
                continue

            print("found requirements in setup.py '{}' arg".format(keyword.arg))
            for source in sources:
                for text in _setup_req_strings(source):
                    ret.append(_clean_setup_req(text))

    return sorted(ret)


class PythonStandardLibs(object):
    """List of Python standard-library module names with an on-disk cache.

    The names are scraped from the module index at ``self.url`` and stored
    as a pickle at ``self.pickle_path``; ``get()`` loads the pickle when it
    can and falls back to the web otherwise. The result is kept in
    ``self.libs``.
    """

    def __init__(self):
        # Page scraped by retrieve_from_web().
        self.url = "https://docs.python.org/2.7/py-modindex.html"
        # Cache directory, resolved relative to this source file.
        self.data_dir = os.path.join(os.path.dirname(__file__),
                                     "../../data")

        self.pickle_path = os.path.join(self.data_dir,
                                        "python_standard_libs.pickle")
        # None until get() / retrieve_from_web() has filled it in.
        self.libs = None

    def _mkdir(self):
        """Create ``self.data_dir`` if needed and re-derive ``self.pickle_path``."""
        try:
            os.makedirs(self.data_dir)
        except OSError as exp:
            # An already existing directory is fine; anything else is not.
            if exp.errno != errno.EEXIST:
                raise
        self.pickle_path = os.path.join(self.data_dir,
                                        "python_standard_libs.pickle")

    def retrieve_from_web(self):
        """Download the module index and store top-level names in ``self.libs``."""
        # only needs to be used once ever, here for tidiness
        # checked the result into source control as python_standard_libs.pickle
        html = requests.get(self.url).text
        exp = r'class="xref">([^<]+)'
        matches = re.findall(exp, html)
        # Dotted entries are submodules; only top-level names are kept.
        self.libs = [m for m in matches if '.' not in m]

    def pickle_libs(self):
        """Write ``self.libs`` to the pickle file, fetching it first if unset."""
        if self.libs is None:
            self.retrieve_from_web()

        self._mkdir()
        with open(self.pickle_path, "wb") as f:
            pickle.dump(self.libs, f)

        print("saved these to file: {}".format(self.libs))

    def get(self):
        """Populate ``self.libs`` (from the pickle if possible) and return it.

        :returns: the list stored in ``self.libs``.
        """
        if self.libs is None:
            try:
                # NOTE(review): pickle.load executes whatever the file
                # encodes; the cache file must come from a trusted source.
                with open(self.pickle_path, "rb") as f:
                    print("Loading list of Stdandard Python Libraries from pickle file")
                    self.libs = pickle.load(f)
            except Exception:
                # Best-effort cache: a missing or unreadable pickle means
                # the list is rebuilt from the web. ``Exception`` (not a
                # bare except) so Ctrl-C and SystemExit still propagate.
                self.retrieve_from_web()
                self.pickle_libs()
        return self.libs

    def clean(self):
        """Delete the pickle file; a file that cannot be removed is ignored."""
        try:
            os.remove(self.pickle_path)
        except OSError:
            # Nothing to clean (or not removable): deliberately best-effort.
            pass


def save_python_standard_libs(clean=False):
    """Populate the standard-library list, then load it again and print it.

    :param clean: when true, ``PythonStandardLibs.clean()`` is called on the
        first instance before its ``get()``.
    """
    writer = PythonStandardLibs()
    if clean:
        writer.clean()
    writer.get()

    # to show the thing works: a second, fresh instance runs its own get()
    reader = PythonStandardLibs()
    reader.get()
    report = "got these from pickled file: {}".format(reader.libs)
    print(report)
100 changes: 98 additions & 2 deletions commit_opener/grab_dependencies.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,104 @@
"""
Extract the dependencies from the repository
Extract the dependencies from the repository

Issue:
work out dependencies #3
https://github.com/lbillingham/commit_opener/issues/3

"""
The key function is get_dependencies().

"""
import re
import os
import pandas

from commit_opener import depsy
from commit_opener import repo

def catfile(filename):
    """Return the whole text of ``filename``, announcing the read on stdout."""
    with open(filename, 'r') as source:
        # Printed only once the file has been opened successfully.
        print("Opening file {} and reading contents".format(filename))
        return source.read()


def get_dependencies(name, url):
    """
    Work out the non-standard-library dependencies of a repository.

    A ``repo.Repo`` is created from ``name`` and ``url`` and a local copy is
    extracted. Requirements are read from ``requirements.txt`` when the
    repository has one, otherwise from ``setup.py``. When neither file is
    present, or ``setup.py`` yields nothing, the repository's files are
    searched for import statements instead. Names listed by
    ``depsy.PythonStandardLibs`` are then removed.

    Returns a ``pandas.Series`` of the remaining names, sorted by value.

    """
    # The repo object is what lets us look through the repository's files.
    local_repo = repo.Repo(name, url)
    print("Created a repository instance for {}".format(url))

    # Extract a local copy and build its file list.
    local_repo.extract_local_copy()
    print("Local copy now available here: {}".format(local_repo.tmpdir))
    local_repo._get_filelist()

    # depsy's parsers take file *contents*, so each file is read first; the
    # value returned by has() is what gets passed to catfile() as the path.
    if local_repo.has("requirements.txt"):
        print("Repository has a requirements.txt file")
        contents = catfile(local_repo.has("requirements.txt"))
        found = depsy.parse_requirements_txt(contents)
    elif local_repo.has("setup.py"):
        print("Repository has a setup.py file")
        contents = catfile(local_repo.has("setup.py"))
        found = depsy.parse_setup_py(contents)
        if not found:
            print("No reqs in setup file,so determining dependencies ourselves.")
            found = search_files_for_imports(local_repo)
    else:
        # Neither standard description exists: fall back to scanning imports.
        print("No req or setup file, so determining dependencies ourselves.")
        found = search_files_for_imports(local_repo)

    # Duplicates are dropped by going through a set.
    unique = set(found)
    print("Found the following imports: {}".format("\n".join(unique)))

    # Standard-library names are subtracted from the result.
    std_catalogue = depsy.PythonStandardLibs()
    std_catalogue.get()
    third_party = unique - set(std_catalogue.libs)

    return pandas.Series(list(third_party)).sort_values()



def search_files_for_imports(repo_instance):
    """
    Collect the imported module names from every ``.py`` file of a repository.

    :param repo_instance: object whose ``file_list`` attribute is an iterable
        of file paths.
    :returns: list of module names found by ``find_imports`` across all the
        Python files, in file order (duplicates are kept).

    """
    dep_list = []
    for f in repo_instance.file_list:
        # Match on the extension: a substring test (".py" in f) would also
        # pick up compiled ".pyc" files or names such as "notes.py.txt".
        if f.endswith(".py"):
            print("Looking in {} for imports".format(os.path.basename(f)))
            filetext = catfile(f)
            dep_list.extend(find_imports(filetext))

    return dep_list


# "import a, b.c as d" -> captures everything up to a comment, ';' or newline.
_IMPORT_CLAUSE_RE = re.compile(r'^[ \t]*import[ \t]+([^#;\n]+)', re.MULTILINE)
# "from pkg.sub import x" -> captures "pkg"; relative imports ("from . import
# x", "from .mod import x") do not match because the name must start at once.
_FROM_IMPORT_RE = re.compile(r'^[ \t]*from[ \t]+(\w+)[\w.]*[ \t]+import\b',
                             re.MULTILINE)
# First identifier of one comma-separated item of an import clause.
_LEADING_NAME_RE = re.compile(r'\s*(\w+)')


def find_imports(text):
    """Return the top-level module names imported in Python source ``text``.

    The search is purely textual (regular expressions on each line), so it
    also works on source that the running interpreter could not parse.
    Handles ``import a``, ``import a.b as c``, ``import a, b`` and
    ``from a.b import c``; relative ``from .`` imports are ignored.

    :param text: source code as one string.
    :returns: list of names: those from ``import`` statements first, then
        those from ``from ... import`` statements. Duplicates are kept.
    """
    import_list = []
    for clause in _IMPORT_CLAUSE_RE.findall(text):
        # One statement may name several modules separated by commas.
        for item in clause.split(','):
            name = _LEADING_NAME_RE.match(item)
            if name:
                import_list.append(name.group(1))
    import_list.extend(_FROM_IMPORT_RE.findall(text))
    return import_list
Loading