lbillingham · lbillingham · Mar 23, 2016 · Mar 23, 2016 · Mar 23, 2016 · Mar 23, 2016
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -9,7 +9,7 @@ Software Sustainability Institute Hackday 2016 Team
 * Laurence Billingham <[email protected]>
 * Martin Hammitzsch
 * Steve Harris
-* Craig MacLachlan
+* Craig MacLachlan <[email protected]>
 
 Contributors
 ------------

diff --git a/commit_opener/commit_opener.py b/commit_opener/commit_opener.py
@@ -4,37 +4,43 @@
 import pandas as pd
 from shutil import rmtree
 
-from . tree_scrape import author_minded
-from . query_pmc import pmc_data
+
+from commit_opener.grab_dependencies import get_dependencies
+from commit_opener.tree_scrape import author_minded
+from commit_opener.query_pmc import pmc_data as pubmed_data
 
 OUT_SUBFOLDER = 'contrib_data'
 AUTHOR_DATA = 'author_data.json'
 
+
 def verify_local_repo_location(repo):
     if not os.path.isdir(repo):
         raise IOError('could not locate repository {}'.format(repo))
 
+
 def build_out_path(repo_name, parent_path=None):
     if parent_path is None:
         parent_path = os.path.abspath(os.curdir)
     out_path = os.path.join(parent_path, repo_name, OUT_SUBFOLDER)
     return out_path
 
+
 def make_output_folder(path_, overwrite):
     if not os.path.exists(path_):
         os.mkdir(path_)
     else:
         rmtree(path_)
         os.mkdir(path_)
 
+
 @click.command()
-@click.option('--repo', prompt='git repository location', help='path to folder containing .git repository or url')
+@click.option('--repo', prompt='git repository location',
+              help='path to folder containing .git repository or url')
 @click.option('--out_dir', default=None,
-    help='parent dir for output data, default same as .git folder scraped')
+              help='parent dir for output data, default same as .git folder scraped')
 @click.option('--clobber_output', default=True,
-        help='should we overwrite existing data?, default True')
-@click.option('--verbose/--no-verbose', default=False)
-
+              help='should we overwrite existing data?, default True')
+@click.option('--verbose/--no-verbose', default=True)
 def main(repo, out_dir, clobber_output, verbose):
     """  """
     import logging
@@ -55,10 +61,18 @@ def main(repo, out_dir, clobber_output, verbose):
     repo_name = os.path.basename(repo)
     make_output_folder(out_dir, overwrite=clobber_output)
     contributor_data = author_minded(repo)
-    citation_data = pmc_data('SPSS')
-    logging.info("output path: %s" % os.path.join(out_dir,'contributor_data.json'))
-    contributor_data.to_json(os.path.join(out_dir,'contributor_data.json'), date_format='iso')
-    citation_data['citations'].to_json(os.path.join(out_dir,'citation_data.json'))
+    citation_data = pubmed_data('SPSS')
+    depends_data = get_dependencies(repo_name, repo)
+    logging.info('got dependency data of type {}'.format(type(depends_data)))
+    logging.info('got dependency data:\n {}'.format(depends_data))
+    logging.info("output path: %s" % os.path.join(out_dir,
+                                                  'contributor_data.json'))
+    contributor_data.to_json(os.path.join(out_dir,
+                                          'contributor_data.json'),
+                             date_format='iso')
+    citation_data['citations'].to_json(os.path.join(out_dir,
+                                                    'citation_data.json'))
+    depends_data.to_json(os.path.join(out_dir, 'dependencies_data.json'))
 
 if __name__ == '__main__':
     main()
diff --git a/commit_opener/depsy.py b/commit_opener/depsy.py
@@ -0,0 +1,133 @@
+import re
+import pickle
+import ast
+import os.path
+import errno
+import requests
+
+# """Functions from depsy"""
+
+
+def parse_requirements_txt(contents):
+    # see here for spec used in parsing the file:
+    # https://pip.readthedocs.org/en/1.1/requirements.html#the-requirements-file-format
+    # the spec doesn't mention '#' comments, but they appear often in examples.
+    # test_str is not used by the function; this is just a handy place to keep it.
+    test_str = """# my comment
+file://blahblah
+foo==10.2
+baz>=3.6
+# other comment
+foo.bar>=3.33
+foo-bar==2.2
+foo_bar==1.1
+foo == 5.5
+.for some reason there is a dot sometimes
+--index-url blahblah
+-e http://blah
+  foo_with_space_in_front = 1.1"""
+
+    reqs = re.findall(
+        '^(?!file:|-|\.)\s*([\w\.-]+)',
+        contents,
+        re.MULTILINE | re.IGNORECASE
+    )
+    return sorted(reqs)
+
+
+def parse_setup_py(contents):
+    parsed = ast.parse(contents)
+    ret = []
+    # see ast docs: https://greentreesnakes.readthedocs.org/en/latest/index.html
+    for node in ast.walk(parsed):
+        try:
+            if node.func.id == "setup":
+                for keyword in node.keywords:
+                    if keyword.arg == "install_requires":
+                        print("found requirements in setup.py 'install_requires' arg")
+                        for elt in keyword.value.elts:
+                            ret.append(_clean_setup_req(elt.s))
+
+                    if keyword.arg == "requires":
+                        print("found requirements in setup.py 'requires' arg")
+                        for elt in keyword.value.elts:
+                            ret.append(_clean_setup_req(elt.s))
+
+                    if keyword.arg == "extras_require":
+                        print("found requirements in setup.py 'extras_require' arg")
+                        for my_list in keyword.value.values:
+                            for elt in my_list.elts:
+                                ret.append(_clean_setup_req(elt.s))
+
+        except AttributeError:
+            continue
+
+    return sorted(ret)
+
+
+class PythonStandardLibs():
+
+    def __init__(self):
+        self.url = "https://docs.python.org/2.7/py-modindex.html"
+        self.data_dir = os.path.join(os.path.dirname(__file__),
+                                     "../../data")
+
+        self.pickle_path = os.path.join(self.data_dir,
+                                        "python_standard_libs.pickle")
+        self.libs = None
+
+    def _mkdir(self):
+        try:
+            os.makedirs(self.data_dir)
+        except OSError as exp:
+            if exp.errno != errno.EEXIST:
+                raise
+        self.pickle_path = os.path.join(self.data_dir,
+                                        "python_standard_libs.pickle")
+
+    def retrieve_from_web(self):
+        # only needs to be used once ever, here for tidiness
+        # checked the result into source control as python_standard_libs.pickle
+        html = requests.get(self.url).text
+        exp = r'class="xref">([^<]+)'
+        matches = re.findall(exp, html)
+        self.libs = [m for m in matches if '.' not in m]
+
+    def pickle_libs(self):
+
+        if self.libs is None:
+            self.retrieve_from_web()
+
+        self._mkdir()
+        with open(self.pickle_path, "wb") as f:
+            pickle.dump(self.libs, f)
+
+        print("saved these to file: {}".format(self.libs))
+
+    def get(self):
+        if self.libs is None:
+            try:
+                with open(self.pickle_path, "rb") as f:
+                    print("Loading list of Stdandard Python Libraries from pickle file")
+                    self.libs = pickle.load(f)
+            except:
+                self.retrieve_from_web()
+                self.pickle_libs()
+
+    def clean(self):
+        try:
+            os.remove(self.pickle_path)
+        except:
+            pass
+
+
+def save_python_standard_libs(clean=False):
+    pystdlibs = PythonStandardLibs()
+    if clean:
+        pystdlibs.clean()
+    pystdlibs.get()
+
+    # to show the thing works
+    new_libs_obj = PythonStandardLibs()
+    new_libs_obj.get()
+    print("got these from pickled file: {}".format(new_libs_obj.libs))
diff --git a/commit_opener/grab_dependencies.py b/commit_opener/grab_dependencies.py
@@ -1,8 +1,104 @@
 """
-Extract the dependencies from the repository 
+Extract the dependencies from the repository
 
 Issue:
 work out dependencies #3
 https://github.com/lbillingham/commit_opener/issues/3
 
-"""
+The key function is get_dependencies().
+
+"""
+import re
+import os
+import pandas
+
+from commit_opener import depsy
+from commit_opener import repo
+
+def catfile(filename):
+    """Get text contents of a file."""
+
+    with open(filename, 'r') as fhandle:
+        print("Opening file {} and reading contents".format(filename))
+        text = fhandle.read()
+    return text
+
+
+def get_dependencies(name, url):
+    """
+    Get the dependencies for a git repository or any local python package.
+
+    """
+    # Let's instantiate the repo object, so we can parse through it.
+    myrepo = repo.Repo(name, url)
+    print("Created a repository instance for {}".format(url))
+
+    # Extract a local copy
+    myrepo.extract_local_copy()
+    print("Local copy now available here: {}".format(myrepo.tmpdir))
+    myrepo._get_filelist()
+
+    # Note: the file has to be opened and read before passing to depsy
+    # functions.
+    if myrepo.has("requirements.txt"):
+        print("Repository has a requirements.txt file")
+        filetext = catfile(myrepo.has("requirements.txt"))
+        reqs = depsy.parse_requirements_txt(filetext)
+    elif myrepo.has("setup.py"):
+        print("Repository has a setup.py file")
+        filetext = catfile(myrepo.has("setup.py"))
+        reqs = depsy.parse_setup_py(filetext)
+        if len(reqs) < 1:
+            print("No reqs in setup file,"
+                  "so determining dependencies ourselves.")
+            reqs = search_files_for_imports(myrepo)
+    else:
+        # No standard descriptions of the dependencies so let's try to work
+        # them out for ourselves.
+        print("No req or setup file, so determining dependencies ourselves.")
+        reqs = search_files_for_imports(myrepo)
+
+    # Convert the list of requirements to a set.
+    reqs = set(reqs)
+    print("Found the following imports: {}".format("\n".join(reqs)))
+
+    # Get the list of standard packages so that these can be removed.
+    stdlibs = depsy.PythonStandardLibs()
+    stdlibs.get()
+    set_std_libs = set(stdlibs.libs)
+
+
+    data = pandas.Series(list(reqs-set_std_libs))
+    data.sort_values(inplace=True)
+    return data
+
+
+
+def search_files_for_imports(repo_instance):
+    """
+    Walk all the python files in the repository and extract the import info.
+
+    """
+    dep_list = []
+    for f in repo_instance.file_list:
+        if ".py" in f:
+            print("Looking in {} for imports".format(os.path.basename(f)))
+            filetext = catfile(f)
+            dep_list.extend(find_imports(filetext))
+
+    return dep_list
+
+
+def find_imports(text):
+    """Find imported module names in a file's text using regular expressions."""
+    # list of regexes.
+    reexps = [re.compile('^[\si]+mport\s+(\w+)[\s\.]', re.MULTILINE),
+              re.compile('^[\sf]+rom\s+(\w+)[\s\.]+', re.MULTILINE)
+              ]
+    import_list = []
+    for myregex in reexps:
+        try:
+            import_list.extend(re.findall(myregex, text))
+        except AttributeError:
+            pass
+    return import_list