Skip to content

Commit 970d225

Browse files
Merge pull request #451 from scholarly-python-package/develop
Release candidate for v1.7.3
2 parents ca4623a + 4cabc51 commit 970d225

File tree

6 files changed

+206
-107
lines changed

6 files changed

+206
-107
lines changed

.github/workflows/lint.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@ name: lint
22

33
on:
44
workflow_call:
5-
push:
6-
branches: [main, develop]
7-
pull_request:
8-
branches: [main, develop]
95

106
jobs:
117
lint:

.github/workflows/proxytests.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,9 @@
44
name: Python package (with methods that need proxy)
55

66
on:
7-
workflow_run:
8-
workflows: ["lint"]
9-
branches: [main]
10-
types:
11-
- completed
127
pull_request:
138
branches: [main]
149

15-
1610
jobs:
1711
build:
1812
runs-on: ubuntu-latest
@@ -37,7 +31,7 @@ jobs:
3731
# uses: typilus/typilus-action@v0.9
3832
- name: Run unittests
3933
id: unittests
40-
if: ${{ github.event.pull_request.draft == false }}
34+
if: ${{ github.event.pull_request.ready_for_review == true }}
4135
continue-on-error: true
4236
env:
4337
CONNECTION_METHOD: ${{ secrets.CONNECTION_METHOD }}
@@ -48,8 +42,6 @@ jobs:
4842
run: |
4943
coverage run -m unittest -v test_module.TestScholarlyWithProxy
5044
- name: Generate coverage report
51-
if:
52-
"matrix.os == 'macos-latest'"
5345
run: |
5446
curl --data-binary @.github/.codecov.yml https://codecov.io/validate | head -n 1
5547
coverage xml

.github/workflows/pythonpackage.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,6 @@ on:
1010
branches: [main, develop]
1111
pull_request:
1212
branches: [main, develop]
13-
workflow_run:
14-
workflows: ["lint"]
15-
# branches: [main]
16-
types:
17-
- completed
1813

1914
jobs:
2015
lint:

scholarly/_scholarly.py

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,24 @@
11
"""scholarly.py"""
22
import requests
3+
import re
34
import os
45
import copy
56
import csv
67
import pprint
7-
from typing import Dict, List
8+
import datetime
9+
import re
10+
from typing import Dict, List, Union
811
from ._navigator import Navigator
912
from ._proxy_generator import ProxyGenerator
1013
from dotenv import find_dotenv, load_dotenv
1114
from .author_parser import AuthorParser
1215
from .publication_parser import PublicationParser, _SearchScholarIterator
13-
from .data_types import Author, AuthorSource, Journal, Publication, PublicationSource
16+
from .data_types import Author, AuthorSource, CitesPerYear, Journal, Publication, PublicationSource
1417

1518
_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
1619
_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
1720
_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}'
21+
_KEYWORDSEARCH_PATTERN = "[-: #(),;]+" # Unallowed characters in the keywords.
1822
_PUBSEARCH = '/scholar?hl=en&q={0}'
1923
_CITEDBYSEARCH = '/scholar?hl=en&cites={0}'
2024
_ORGSEARCH = "/citations?view_op=view_org&hl=en&org={0}"
@@ -155,11 +159,11 @@ def search_pubs(self,
155159
sort_by=sort_by, include_last_year=include_last_year, start_index=start_index)
156160
return self.__nav.search_publications(url)
157161

158-
def search_citedby(self, publication_id: int, **kwargs):
162+
def search_citedby(self, publication_id: Union[int, str], **kwargs):
159163
"""Searches by Google Scholar publication id and returns a generator of Publication objects.
160164
161165
:param publication_id: Google Scholar publication id
162-
:type publication_id: int
166+
:type publication_id: int or str
163167
164168
For the remaining parameters, see documentation of `search_pubs`.
165169
"""
@@ -248,20 +252,70 @@ def bibtex(self, object: Publication)->str:
248252
self.logger.warning("Object not supported for bibtex exportation")
249253
return
250254

255+
@staticmethod
def _bin_citations_by_year(cites_per_year: "CitesPerYear", year_end):
    """Group years into contiguous ranges holding at most 1000 citations each.

    The years are walked from the most recent to the oldest and accumulated
    into a bin until adding another year would push the bin above 1000
    citations; at that point the bin is closed and a new one is started at
    that year. A single year with more than 1000 citations gets a bin of its
    own, since a year cannot be split any further.

    :param cites_per_year: mapping of year to number of citations in that year
    :type cites_per_year: CitesPerYear
    :param year_end: upper bound (inclusive) of the most recent bin
    :type year_end: int
    :returns: list of ``(year_high, year_low)`` tuples, most recent first,
              with ``year_high >= year_low`` in every tuple. The list is
              empty when there are no citations at all.
    :rtype: list
    """
    years = []
    y_hi, y_lo = year_end, year_end
    running_count = 0
    for y in sorted(cites_per_year, reverse=True):
        count = cites_per_year[y]
        # Close the current bin only if it already holds something;
        # otherwise an empty (and possibly duplicate) range would be emitted.
        if running_count and running_count + count > 1000:
            years.append((y_hi, y_lo))
            y_hi = y
            running_count = 0
        running_count += count
        # The year always becomes the lower edge of whichever bin it is in.
        y_lo = y

    if running_count > 0:
        years.append((y_hi, y_lo))

    return years
273+
251274
def citedby(self, object: "Publication") -> "_SearchScholarIterator":
    """Searches Google Scholar for other articles that cite this Publication
    and returns a Publication generator.

    A single Google Scholar query returns at most 1000 results. Publications
    with more than 1000 citations are therefore queried one year range at a
    time and the partial results are chained together.

    :param object: The Publication object whose citing articles are requested
    :type object: Publication
    :returns: an iterator over the citing publications, or ``None`` when
              ``object`` is not a Publication
    """
    # NOTE: this method must stay a plain function (no ``yield`` in its body).
    # Were it a generator function, the ``return <iterator>`` statements below
    # would be swallowed and callers would receive an empty generator.
    if object['container_type'] != "Publication":
        self.logger.warning("Object not supported for citedby search")
        return

    if object["num_citations"] <= 1000:
        return PublicationParser(self.__nav).citedby(object)

    self.logger.debug("Since the paper titled %s has %d citations (>1000), "
                      "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"])

    year_end = int(datetime.date.today().year)

    if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
        self.fill(object)
        years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end)
        if not years:
            # Without per-year data there is nothing to iterate over;
            # a plain query is better than returning no results at all.
            self.logger.warning("No per-year citation data for paper %s, may result in incorrect number "
                                "of citedby papers.", object["bib"]["title"])
            return PublicationParser(self.__nav).citedby(object)
    else:
        try:
            year_low = int(object["bib"]["pub_year"])
        except (KeyError, ValueError):
            self.logger.warning("Unknown publication year for paper %s, may result in incorrect number "
                                "of citedby papers.", object["bib"]["title"])
            return PublicationParser(self.__nav).citedby(object)

        # Go one year at a time in decreasing order
        years = [(year, year) for year in range(year_end, year_low - 1, -1)]

    # Extract cites_id. Note: There could be multiple ones, separated by commas.
    m = re.search(r"cites=([\d+,]*)", object["citedby_url"])
    if m is None:
        self.logger.warning("Could not extract the cites id for paper %s, may result in incorrect number "
                            "of citedby papers.", object["bib"]["title"])
        return PublicationParser(self.__nav).citedby(object)

    return self._citedby_long(object, m.group(1), years)

def _citedby_long(self, object: "Publication", pub_id, years):
    """Yield the publications citing ``object``, one year range at a time.

    :param object: The Publication being cited (used for log messages only)
    :type object: Publication
    :param pub_id: cites id(s) taken from the publication's ``citedby_url``
    :type pub_id: str
    :param years: iterable of ``(year_high, year_low)`` ranges to query
    """
    for y_hi, y_lo in years:
        sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi)
        if sub_citations.total_results and (sub_citations.total_results > 1000):
            self.logger.warning("The paper titled %s has %d citations in the year %d. "
                                "Due to the limitation in Google Scholar, fetching only 1000 results "
                                "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo)
        yield from sub_citations
318+
265319
def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author:
266320
"""Search by author id and return a single Author object
267321
:param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
@@ -321,7 +375,9 @@ def search_keyword(self, keyword: str):
321375
'source': 'SEARCH_AUTHOR_SNIPPETS',
322376
'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=lHrs3Y4AAAAJ'}
323377
"""
324-
url = _KEYWORDSEARCH.format(requests.utils.quote(keyword))
378+
379+
reg_keyword = re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword)
380+
url = _KEYWORDSEARCH.format(requests.utils.quote(reg_keyword))
325381
return self.__nav.search_authors(url)
326382

327383
def search_keywords(self, keywords: List[str]):
@@ -355,8 +411,8 @@ def search_keywords(self, keywords: List[str]):
355411
'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'}
356412
357413
"""
358-
359-
formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in keywords]
414+
reg_keywords = (re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) for keyword in keywords)
415+
formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in reg_keywords]
360416
formated_keywords = '+'.join(formated_keywords)
361417
url = _KEYWORDSEARCHBASE.format(formated_keywords)
362418
return self.__nav.search_authors(url)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.7.2',
8+
version='1.7.3',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
1010
1111
description='Simple access to Google Scholar authors and citations',

0 commit comments

Comments
 (0)