|
1 | 1 | """scholarly.py"""
|
2 | 2 | import requests
|
| 3 | +import re |
3 | 4 | import os
|
4 | 5 | import copy
|
5 | 6 | import csv
|
6 | 7 | import pprint
|
7 |
| -from typing import Dict, List |
| 8 | +import datetime |
| 9 | +import re |
| 10 | +from typing import Dict, List, Union |
8 | 11 | from ._navigator import Navigator
|
9 | 12 | from ._proxy_generator import ProxyGenerator
|
10 | 13 | from dotenv import find_dotenv, load_dotenv
|
11 | 14 | from .author_parser import AuthorParser
|
12 | 15 | from .publication_parser import PublicationParser, _SearchScholarIterator
|
13 |
| -from .data_types import Author, AuthorSource, Journal, Publication, PublicationSource |
| 16 | +from .data_types import Author, AuthorSource, CitesPerYear, Journal, Publication, PublicationSource |
14 | 17 |
|
15 | 18 | _AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
|
16 | 19 | _KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
|
17 | 20 | _KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}'
|
| 21 | +_KEYWORDSEARCH_PATTERN = "[-: #(),;]+" # Characters not allowed in keywords; each run of them is replaced with "_". |
18 | 22 | _PUBSEARCH = '/scholar?hl=en&q={0}'
|
19 | 23 | _CITEDBYSEARCH = '/scholar?hl=en&cites={0}'
|
20 | 24 | _ORGSEARCH = "/citations?view_op=view_org&hl=en&org={0}"
|
@@ -155,11 +159,11 @@ def search_pubs(self,
|
155 | 159 | sort_by=sort_by, include_last_year=include_last_year, start_index=start_index)
|
156 | 160 | return self.__nav.search_publications(url)
|
157 | 161 |
|
158 |
| - def search_citedby(self, publication_id: int, **kwargs): |
| 162 | + def search_citedby(self, publication_id: Union[int, str], **kwargs): |
159 | 163 | """Searches by Google Scholar publication id and returns a generator of Publication objects.
|
160 | 164 |
|
161 | 165 | :param publication_id: Google Scholar publication id
|
162 |
| - :type publication_id: int |
| 166 | + :type publication_id: int or str |
163 | 167 |
|
164 | 168 | For the remaining parameters, see documentation of `search_pubs`.
|
165 | 169 | """
|
@@ -248,20 +252,70 @@ def bibtex(self, object: Publication)->str:
|
248 | 252 | self.logger.warning("Object not supported for bibtex exportation")
|
249 | 253 | return
|
250 | 254 |
|
@staticmethod
def _bin_citations_by_year(cites_per_year: CitesPerYear, year_end):
    """Group years into contiguous ranges of at most 1000 citations each.

    Years are walked from the most recent to the oldest and accumulated
    into a bin until adding the next year would push the bin above 1000
    citations; that year then starts a new bin.

    :param cites_per_year: mapping of year -> number of citations in that year
    :type cites_per_year: CitesPerYear
    :param year_end: upper bound (most recent year) of the first bin
    :type year_end: int
    :returns: list of ``(year_high, year_low)`` tuples, newest bin first,
              always with ``year_high >= year_low`` for years up to
              ``year_end``. A single year holding more than 1000 citations
              gets a bin of its own.
    :rtype: list
    """
    years = []
    y_hi, y_lo = year_end, year_end
    running_count = 0
    for y in sorted(cites_per_year, reverse=True):
        count = cites_per_year[y]
        if running_count + count <= 1000:
            running_count += count
            y_lo = y
            continue

        # The current bin is full. Only emit it if it actually holds
        # citations; otherwise an empty (or duplicate) range would be queried.
        if running_count > 0:
            years.append((y_hi, y_lo))

        # Start a new bin at this year. Both bounds must be reset, otherwise
        # the lower bound of the previous bin leaks in and the range can end
        # up inverted (year_low > year_high).
        running_count = count
        y_hi = y_lo = y

    if running_count > 0:
        years.append((y_hi, y_lo))

    return years
| 273 | + |
def citedby(self, object: Publication)->_SearchScholarIterator:
    """Searches Google Scholar for other articles that cite this Publication
    and returns a Publication generator.

    Publications with at most 1000 citations are fetched with a single
    query. For larger ones the query is split into year ranges (see
    ``_citedby_long``), because a single query is capped at 1000 results.

    :param object: The Publication object whose citing articles are requested
    :type object: Publication
    :returns: an iterator over the citing publications, or ``None`` when
              ``object`` is not a Publication
    """
    if object['container_type'] != "Publication":
        self.logger.warning("Object not supported for citedby")
        return

    # This function must NOT contain ``yield``: that would turn it into a
    # generator and silently discard the iterator returned below.
    if object["num_citations"] <= 1000:
        return PublicationParser(self.__nav).citedby(object)

    self.logger.debug("Since the paper titled %s has %d citations (>1000), "
                      "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"])
    return self._citedby_long(object)

def _citedby_long(self, object: Publication):
    """Yield the papers citing a heavily cited publication, one year range at a time.

    :param object: a Publication with more than 1000 citations
    :type object: Publication
    """
    year_end = int(datetime.date.today().year)

    if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
        # Filling the publication is what provides "cites_per_year".
        self.fill(object)
        years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end)
    else:
        try:
            year_low = int(object["bib"]["pub_year"])
        except (KeyError, ValueError):
            self.logger.warning("Unknown publication year for paper %s, may result in incorrect number "
                                "of citedby papers.", object["bib"]["title"])
            yield from PublicationParser(self.__nav).citedby(object)
            return

        # Go one year at a time in decreasing order
        years = [(year, year) for year in range(year_end, year_low-1, -1)]

    # Extract cites_id. Note: There could be multiple ones, separated by commas.
    m = re.search(r"cites=[\d+,]*", object["citedby_url"])
    if m is None:
        self.logger.warning("Could not extract the cites id for paper %s, may result in incorrect number "
                            "of citedby papers.", object["bib"]["title"])
        yield from PublicationParser(self.__nav).citedby(object)
        return

    pub_id = m.group()[len("cites="):]
    for y_hi, y_lo in years:
        sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi)
        if sub_citations.total_results and (sub_citations.total_results > 1000):
            self.logger.warning("The paper titled %s has %d citations in the year %d. "
                                "Due to the limitation in Google Scholar, fetching only 1000 results "
                                "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo)
        yield from sub_citations
| 318 | + |
265 | 319 | def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author:
|
266 | 320 | """Search by author id and return a single Author object
|
267 | 321 | :param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
|
@@ -321,7 +375,9 @@ def search_keyword(self, keyword: str):
|
321 | 375 | 'source': 'SEARCH_AUTHOR_SNIPPETS',
|
322 | 376 | 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=lHrs3Y4AAAAJ'}
|
323 | 377 | """
|
324 |
| - url = _KEYWORDSEARCH.format(requests.utils.quote(keyword)) |
| 378 | + |
| 379 | + reg_keyword = re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) |
| 380 | + url = _KEYWORDSEARCH.format(requests.utils.quote(reg_keyword)) |
325 | 381 | return self.__nav.search_authors(url)
|
326 | 382 |
|
327 | 383 | def search_keywords(self, keywords: List[str]):
|
@@ -355,8 +411,8 @@ def search_keywords(self, keywords: List[str]):
|
355 | 411 | 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'}
|
356 | 412 |
|
357 | 413 | """
|
358 |
| - |
359 |
| - formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in keywords] |
| 414 | + reg_keywords = (re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) for keyword in keywords) |
| 415 | + formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in reg_keywords] |
360 | 416 | formated_keywords = '+'.join(formated_keywords)
|
361 | 417 | url = _KEYWORDSEARCHBASE.format(formated_keywords)
|
362 | 418 | return self.__nav.search_authors(url)
|
|
0 commit comments