From fd371cff44f31f26e9c15bdf3cb9c166a3f8127b Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Wed, 7 May 2025 18:19:02 +0200 Subject: [PATCH 1/9] Implement ScienceDirectSearch using the PUT method --- .../sciencedirect/ScienceDirectSearch.rst | 163 ++++++++++------- .../sciencedirect/sciencedirect_search.py | 165 +++++++---------- .../tests/test_ScienceDirectSearch.py | 100 ++++++----- pybliometrics/superclasses/base.py | 115 +++++++----- pybliometrics/superclasses/search.py | 28 ++- pybliometrics/utils/constants.py | 1 + pybliometrics/utils/get_content.py | 167 +++++++++--------- pybliometrics/utils/parse_content.py | 14 +- 8 files changed, 405 insertions(+), 348 deletions(-) diff --git a/docs/reference/sciencedirect/ScienceDirectSearch.rst b/docs/reference/sciencedirect/ScienceDirectSearch.rst index 955daaf8..bc2d77ce 100644 --- a/docs/reference/sciencedirect/ScienceDirectSearch.rst +++ b/docs/reference/sciencedirect/ScienceDirectSearch.rst @@ -1,9 +1,33 @@ pybliometrics.sciencedirect.ScienceDirectSearch -================================================= - -`ScopusSearch()` implements the `ScienceDirect Search API `_. It executes a query to search for documents and retrieves the resulting records. -Any query that works in the `Advanced Document Search on sciencedirect.com `_ will work. -For a complete guide on how to query check the `documentation `_. +=============================================== + +`ScienceDirectSearch()` implements the `ScienceDirect Search API `_ using the `PUT method`. It executes a query to search for documents and retrieves the resulting records. +The class takes a `query` dictionary as input which has to follow this schema: + +.. 
code-block:: text + + { + authors: string, + date: string, + display: { + highlights: boolean, + offset: integer, + show: integer, + sortBy: string + }, + filters: { + openAccess: boolean + }, + issue: string, + loadedAfter: string, + page: string, + pub: string, + qs: string, + title: string, + volume: string + } + +For a more detailed description of the parameters, please refer to the `ScienceDirect Search API migration documentation `_. .. currentmodule:: pybliometrics.sciencedirect .. contents:: Table of Contents @@ -19,16 +43,29 @@ Documentation Examples -------- -The class is initialized with a search query. To see the download progress, set `verbose=True`. +The class is initialized with a search query. +We can pass the field `qs` to search for specific keywords. +Using `verbose=True` will print the progress of the download. .. code-block:: python >>> from pybliometrics.sciencedirect import ScienceDirectSearch, init >>> init() >>> # Retrieve documents based on the search query - >>> sds = ScienceDirectSearch('"neural radiance fields" AND "3D" AND YEAR(2024)', verbose=True) - Downloading results for query ""neural radiance fields" AND "3D" AND YEAR(2024)": - 100%|██████████| 8/8 [00:05<00:00, 1.39it/s] + >>> query = {'qs': '"neural radiance fields" AND "3D rendering"', 'date': '2024'} + >>> sds = ScienceDirectSearch(query, verbose=True) + Downloading results for query "{'qs': '"neural radiance fields" AND "3D rendering"', 'date': '2024', 'display': {'offset': 0, 'show': 100, 'sortBy': 'date'}, 'cursor': '*'}": + 100%|██████████| 1/1 [00:00<00:00, 3.23it/s] + + +To check the number of results, use the method `get_results_size()`. + +.. code-block:: python + + >>> # Check the number of results + >>> sds.get_results_size() + 10 + To access the results, use the attribute `results` which contains a list of `Document` namedtuples. 
@@ -36,9 +73,9 @@ To access the results, use the attribute `results` which contains a list of `Doc >>> # Access the results >>> results = sds.results - [Document(authors='Dong He;Wenhua Qian;Jinde Cao', first_author='Dong He', doi='10.1016/j.cag.2025.104181', title='GEAST-RF: Geometry Enhanced 3D Arbitrary Style Transfer Via Neural Radiance Fields', link='https://www.sciencedirect.com/science/article/pii/S0097849325000202?dgcid=api_sd_search-api-endpoint', load_date='2025-02-16T00:00:00.000Z', openaccess_status=False, pii='S0097849325000202', coverDate='2025-02-16', endingPage=None, publicationName='Computers & Graphics', startingPage='104181', api_link='https://api.elsevier.com/content/article/pii/S0097849325000202', volume=None), - Document(authors='Qicheng Xu;Min Hu;Xitao Zhang', first_author='Qicheng Xu', doi='10.1016/j.asr.2025.01.065', title='A neural radiance fields method for 3D reconstruction of space target', link='https://www.sciencedirect.com/science/article/pii/S0273117725000973?dgcid=api_sd_search-api-endpoint', load_date='2025-02-01T00:00:00.000Z', openaccess_status=False, pii='S0273117725000973', coverDate='2025-02-01', endingPage=None, publicationName='Advances in Space Research', startingPage=None, api_link='https://api.elsevier.com/content/article/pii/S0273117725000973', volume=None), - Document(authors='Jian Liu;Zhen Yu', first_author='Jian Liu', doi='10.1016/j.neucom.2025.129420', title='SA3D-L: A lightweight model for 3D object segmentation using neural radiance fields', link='https://www.sciencedirect.com/science/article/pii/S092523122500092X?dgcid=api_sd_search-api-endpoint', load_date='2025-01-14T00:00:00.000Z', openaccess_status=False, pii='S092523122500092X', coverDate='2025-03-28', endingPage=None, publicationName='Neurocomputing', startingPage='129420', api_link='https://api.elsevier.com/content/article/pii/S092523122500092X', volume='623'), + [Document(authors='Geontae Kim; Youngjin Cha', doi='10.1016/j.autcon.2024.105878', 
loadDate='2024-11-19T00:00:00.000Z', openAccess=True, first_page=105878, last_page=None, pii='S0926580524006149', publicationDate='2024-12-15', sourceTitle='Automation in Construction', title='3D Pixelwise damage mapping using a deep attention based modified Nerfacto', uri='https://www.sciencedirect.com/science/article/pii/S0926580524006149?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 168, Part B'), + Document(authors='Akram Akbar; Chun Liu; Zeran Xu', doi='10.1016/j.aei.2024.102913', loadDate='2024-11-16T00:00:00.000Z', openAccess=False, first_page=102913, last_page=None, pii='S1474034624005640', publicationDate='2024-10-31', sourceTitle='Advanced Engineering Informatics', title='Scene information guided aerial photogrammetric mission recomposition towards detailed level building reconstruction', uri='https://www.sciencedirect.com/science/article/pii/S1474034624005640?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 62, Part D'), + Document(authors='Ruxandra Stoean; Nebojsa Bacanin; Leonard Ionescu', doi='10.1016/j.culher.2024.07.008', loadDate='2024-08-09T00:00:00.000Z', openAccess=False, first_page=18, last_page=26, pii='S1296207424001468', publicationDate='2024-10-31', sourceTitle='Journal of Cultural Heritage', title='Bridging the past and present: AI-driven 3D restoration of degraded artefacts for museum digital display', uri='https://www.sciencedirect.com/science/article/pii/S1296207424001468?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 69'), ...] The list of results can be cast into a Pandas DataFrame. @@ -50,11 +87,12 @@ The list of results can be cast into a Pandas DataFrame. 
>>> df = pd.DataFrame(sds.results) >>> # Display available fields >>> df.columns - Index(['eid', 'filename', 'height', 'mimetype', 'ref', 'size', 'type', 'url', - 'width'], + Index(['authors', 'doi', 'loadDate', 'openAccess', 'first_page', 'last_page', + 'pii', 'publicationDate', 'sourceTitle', 'title', 'uri', 'volumeIssue'], dtype='object') >>> # Get shape of the DataFrame (rows x columns) - (200, 14) + >>> df.shape + (10, 12) >>> # Display the first 3 rows >>> df.head(3) @@ -82,74 +120,65 @@ The list of results can be cast into a Pandas DataFrame. authors - first_author doi - title - link - load_date - openaccess_status + loadDate + openAccess + first_page + last_page pii - coverDate - endingPage - publicationName - startingPage - api_link - volume + publicationDate + sourceTitle + title + uri + volumeIssue 0 - Dong He;Wenhua Qian;Jinde Cao - Dong He - 10.1016/j.cag.2025.104181 - GEAST-RF: Geometry Enhanced 3D Arbitrary Style... + Geontae Kim; Youngjin Cha + 10.1016/j.autcon.2024.105878 + 2024-11-19T00:00:00.000Z + True + 105878 + NaN + S0926580524006149 + 2024-12-15 + Automation in Construction + 3D Pixelwise damage mapping using a deep atten... https://www.sciencedirect.com/science/article/... - 2025-02-16T00:00:00.000Z - False - S0097849325000202 - 2025-02-16 - None - Computers & Graphics - 104181 - https://api.elsevier.com/content/article/pii/S... - None + Volume 168, Part B 1 - Qicheng Xu;Min Hu;Xitao Zhang - Qicheng Xu - 10.1016/j.asr.2025.01.065 - A neural radiance fields method for 3D reconst... - https://www.sciencedirect.com/science/article/... - 2025-02-01T00:00:00.000Z + Akram Akbar; Chun Liu; Zeran Xu + 10.1016/j.aei.2024.102913 + 2024-11-16T00:00:00.000Z False - S0273117725000973 - 2025-02-01 - None - Advances in Space Research - None - https://api.elsevier.com/content/article/pii/S... - None + 102913 + NaN + S1474034624005640 + 2024-10-31 + Advanced Engineering Informatics + Scene information guided aerial photogrammetri... 
+ https://www.sciencedirect.com/science/article/... + Volume 62, Part D 2 - Jian Liu;Zhen Yu - Jian Liu - 10.1016/j.neucom.2025.129420 - SA3D-L: A lightweight model for 3D object segm... - https://www.sciencedirect.com/science/article/... - 2025-01-14T00:00:00.000Z + Ruxandra Stoean; Nebojsa Bacanin; Leonard Ionescu + 10.1016/j.culher.2024.07.008 + 2024-08-09T00:00:00.000Z False - S092523122500092X - 2025-03-28 - None - Neurocomputing - 129420 - https://api.elsevier.com/content/article/pii/S... - 623 + 18 + 26.0 + S1296207424001468 + 2024-10-31 + Journal of Cultural Heritage + Bridging the past and present: AI-driven 3D re... + https://www.sciencedirect.com/science/article/... + Volume 69 - - + \ No newline at end of file diff --git a/pybliometrics/sciencedirect/sciencedirect_search.py b/pybliometrics/sciencedirect/sciencedirect_search.py index 75873e9b..bbeb18cd 100644 --- a/pybliometrics/sciencedirect/sciencedirect_search.py +++ b/pybliometrics/sciencedirect/sciencedirect_search.py @@ -3,20 +3,21 @@ from pybliometrics.superclasses import Search from pybliometrics.utils import check_field_consistency, chained_get, \ - check_integrity, check_parameter_value, deduplicate, \ + check_integrity, check_parameter_value, deduplicate, make_int_if_possible, \ make_search_summary, VIEWS class ScienceDirectSearch(Search): @property - def results(self) -> Optional[list[namedtuple]]: - """A list of namedtuples in the form `(authors first_author doi title link - load_date openaccess_status pii coverDate endingPage publicationName startingPage - api_link volume)`. + def results(self) -> Optional[list]: + """ + A list of namedtuples in the form `(authors doi loadDate openAccess first_page last_page + pii publicationDate sourceTitle title uri volumeIssue)`. - Field definitions correspond to the `ScienceDirect Search Views - `__ and return the - values as-is, except for `authors` which are joined on `";"`. 
+ Field definitions correspond to the `ScienceDirect Search API Migration Documentation + `__ and return the + values as-is, except for `authors` which are joined on `"; "` and pages which are + parsed into `first_page` and `last_page`. Raises ------ @@ -29,49 +30,36 @@ def results(self) -> Optional[list[namedtuple]]: The list of authors and the list of affiliations per author are deduplicated. """ - fields = 'authors first_author doi title link load_date openaccess_status pii '\ - 'coverDate endingPage publicationName startingPage api_link volume' + fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ + 'sourceTitle title uri volumeIssue' doc = namedtuple('Document', fields) check_field_consistency(self._integrity, fields) # Parse elements one-by-one out = [] for item in self._json: # Get authors and create ";" separated string - authors_list = self._get_authors(item) - authors_list = deduplicate(authors_list) - authors = ';'.join(authors_list) - # Get links - links_found = item.get('link') - links = {'api_link': None, 'scidir': None} - for link in links_found: - if link.get('@ref') == 'self': - links['api_link'] = link.get('@href') - elif link.get('@ref') == 'scidir': - links['scidir'] = link.get('@href') - # Get doi - doi = item.get("prism:doi") or item.get("dc:identifier")[4:] if item.get("dc:identifier") else None + authors_list = deduplicate([a.get('name') for a in item.get('authors', {})]) + authors = "; ".join(authors_list) new = doc( authors=authors, - first_author=item.get('dc:creator'), - doi=doi, - title=item.get("dc:title"), - link=links["scidir"], - load_date=item.get("load-date"), - openaccess_status=item.get("openaccess"), - pii=item.get("pii"), - coverDate=item.get("prism:coverDate"), - endingPage=item.get("prism:endingPage"), - publicationName=item.get("prism:publicationName"), - startingPage=item.get("prism:startingPage"), - api_link=links["api_link"] or item.get("prism:url"), - volume=item.get("prism:volume") + 
doi=item.get('doi'), + loadDate=item.get('loadDate'), + openAccess=item.get('openAccess'), + first_page=make_int_if_possible(chained_get(item, ('pages', 'first'))), + last_page=make_int_if_possible(chained_get(item, ('pages', 'last'))), + pii=item.get('pii'), + publicationDate=item.get('publicationDate'), + sourceTitle=item.get('sourceTitle'), + title=item.get('title'), + uri=item.get('uri'), + volumeIssue=item.get('volumeIssue') ) out.append(new) check_integrity(out, self._integrity, self._action) return out or None def __init__(self, - query: str, + query: dict, refresh: Union[bool, int] = False, view: Optional[str] = None, verbose: bool = False, @@ -79,40 +67,44 @@ def __init__(self, integrity_fields: Optional[Union[list[str], tuple[str, ...]]] = None, integrity_action: str = "raise", subscriber: bool = True, - **kwds: str ) -> None: - """Interaction with the ScienceDirect Search API. This represents a search against the - ScienceDirect cluster, which contains serial/nonserial full-text articles. Note that this API - replicates the search experience on `ScienceDirect `__. - - :param query: A string of the query as used in the `ScienceDirect Search `__. - :param refresh: Whether to refresh the cached file if it exists or not. - If int is passed, cached file will be refreshed if the - number of days since last modification exceeds that value. - :param view: Which view to use for the query, see `the documentation `__. - Allowed values: `STANDARD`. - :param verbose: Whether to print a download progress bar. - :param download: Whether to download results (if they have not been - cached). - :param integrity_fields: A list or tuple with the names of fields whose completeness should - be checked. `ArticleMetadata` will perform the - action specified in `integrity_action` if - elements in these fields are missing. This - helps to avoid idiosynchratically missing - elements that should always be present - (e.g., doi or authors). 
- :param integrity_action: What to do in case integrity of provided fields - cannot be verified. Possible actions: - - `"raise"`: Raise an `AttributeError` - - `"warn"`: Raise a `UserWarning` - :param subscriber: Whether you access ScienceDirect with a subscription or not. - For subscribers, ScienceDirect's cursor navigation will be - used. Sets the number of entries in each query - iteration to the maximum number allowed by the - corresponding view. - :param kwds: Keywords passed on as query parameters. Must contain - fields and values mentioned in the `API specification `__. - + """ + Interaction with the ScienceDirect Search API using the `PUT` method. + See the official `documentation `__ + for more details. + + Parameters + ---------- + query : dict + The query to be sent to the API, e.g., + {'qs': '"Neural Networks" AND "Shapley"', 'date': '2019-2020'} + + refresh : bool or int, optional + Whether to refresh the cached file. If an int is passed, the cache + will refresh if older than that many days. + + view : str, optional + The API view to use. Default is "STANDARD". + + verbose : bool, optional + Whether to print a download progress bar. + + download : bool, optional + Whether to download results (if they haven't been cached). + + integrity_fields : list of str or tuple of str, optional + Fields whose completeness should be checked. If any field is missing, + the `integrity_action` will be triggered. + + integrity_action : {'raise', 'warn'}, optional + What to do if required fields are missing: + + - 'raise' : Raise an AttributeError + - 'warn' : Emit a UserWarning + + subscriber : bool, optional + If True, cursor navigation is enabled, allowing more than 5,000 results. + Raises ------ ScopusQueryError @@ -122,16 +114,7 @@ def __init__(self, If any of the parameters `integrity_action`, `refresh` or `view` is not one of the allowed values. 
- Notes - ----- - The directory for cached results is `{path}/{view}/{fname}`, - where `path` is specified in your configuration file and `fname` is - the md5-hashed version of `query`. - - The ScienceDirect Search API V2 has two available interfaces: `PUT` and `GET`. This library uses the - `GET` interface. """ - # Check view or set to default if view: check_parameter_value(view, VIEWS['ScienceDirectSearch'], "view") else: @@ -140,29 +123,17 @@ def __init__(self, allowed = ("warn", "raise") check_parameter_value(integrity_action, allowed, "integrity_action") - # Query self._action = integrity_action self._integrity = integrity_fields or [] self._refresh = refresh self._query = query self._view = view - Search.__init__(self, query=query, download=download, verbose=verbose, **kwds) + + Search.__init__(self, query=query, + cursor=subscriber, download=download, + verbose=verbose) def __str__(self): """Print a summary string.""" - return make_search_summary(self, "document", self.get_dois()) - - def get_dois(self): - """DOIs of retrieved documents.""" - return [d.get("prism:doi") or d.get("dc:identifier")[4:] if d.get("dc:identifier") else None for d in self._json] - - def _get_authors(self, item: dict) -> list: - """Auxiliary function to get the authors.""" - authors_data = chained_get(item, ['authors', 'author'], []) - if isinstance(authors_data, list): - authors_list = [a.get('$') for a in authors_data] - elif isinstance(authors_data, str): - authors_list = [authors_data] - else: - authors_list = [] - return authors_list + dois = [d.doi for d in self.results] if self.results else [] + return make_search_summary(self, "document", dois) diff --git a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py index b4589c59..acfd823d 100644 --- a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py +++ b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py @@ -1,13 +1,24 @@ """Tests for 
sciencedirect.ScienceDirectSearch""" from collections import namedtuple -from pybliometrics.exception import Scopus400Error +import pytest + from pybliometrics.sciencedirect import ScienceDirectSearch, init init() -sds_standard = ScienceDirectSearch('TITLE("Assessing LLMs in malicious code deobfuscation of real-world malware campaigns") AND DATE(2012)', view="STANDARD", refresh=30) -sds_empty = ScienceDirectSearch('TITLE("Not a very realistic title")', view="STANDARD", refresh=30) +one_article_query = {'title': 'Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', + 'date': '2024'} +sds_standard = ScienceDirectSearch(one_article_query, refresh=30) + +empty_query = {'title': 'Not a realistic title', 'date': '2012'} +sds_empty = ScienceDirectSearch(empty_query, view="STANDARD", refresh=30) + +huge_query = {'qs': 'Neural Networks', 'date': '2015-2020'} +sds_huge = ScienceDirectSearch(huge_query, view="STANDARD", download=False, refresh=30) + +pagination_query = {'qs': '"Neural Networks" AND "Shapley"', 'date': '2020'} +sds_pagination = ScienceDirectSearch(pagination_query, view="STANDARD", refresh=30) def test_empty_results(): @@ -16,64 +27,61 @@ def test_empty_results(): def test_all_fields(): - fields = 'authors first_author doi title link load_date openaccess_status pii '\ - 'coverDate endingPage publicationName startingPage api_link volume' - doc = namedtuple("Document", fields) + fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ + 'sourceTitle title uri volumeIssue' + doc = namedtuple('Document', fields) expected_standard_doc = doc( - authors="Constantinos Patsakis;Fran Casino;Nikolaos Lykousas", - first_author="Constantinos Patsakis", - doi="10.1016/j.eswa.2024.124912", - title="Assessing LLMs in malicious code deobfuscation of real-world malware campaigns", - link="https://www.sciencedirect.com/science/article/pii/S0957417424017792?dgcid=api_sd_search-api-endpoint", - 
load_date="2024-07-31T00:00:00.000Z", - openaccess_status=True, - pii="S0957417424017792", - coverDate="2024-12-05", - endingPage=None, - publicationName="Expert Systems with Applications", - startingPage="124912", - api_link="https://api.elsevier.com/content/article/pii/S0957417424017792", - volume="256", + authors='Constantinos Patsakis; Fran Casino; Nikolaos Lykousas', + doi='10.1016/j.eswa.2024.124912', + loadDate="2024-07-31T00:00:00.000Z", + openAccess=True, + first_page=124912, + last_page=None, + pii='S0957417424017792', + publicationDate='2024-12-05', + sourceTitle='Expert Systems with Applications', + title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', + uri='https://www.sciencedirect.com/science/article/pii/S0957417424017792?dgcid=api_sd_search-api-endpoint', + volumeIssue='Volume 256' ) + assert sds_standard.results[0] == expected_standard_doc + expected_last_document = doc( + authors='Elhadji Amadou Oury Diallo; Ayumi Sugiyama; Toshiharu Sugawara', + doi='10.1016/j.neucom.2018.08.094', + loadDate='2019-04-25T00:00:00.000Z', + openAccess=False, + first_page=230, + last_page=240, + pii='S0925231219304424', + publicationDate='2020-07-05', + sourceTitle='Neurocomputing', + title='Coordinated behavior of cooperative agents using deep reinforcement learning', + uri='https://www.sciencedirect.com/science/article/pii/S0925231219304424?dgcid=api_sd_search-api-endpoint', + volumeIssue='Volume 396' + ) + assert sds_pagination.results[-1] == expected_last_document + + def test_field_consistency(): - am_wrong_field = ScienceDirectSearch('TITLE("Assessing LLMs in malicious code deobfuscation of real-world malware campaigns") AND DATE(2012)', + am_wrong_field = ScienceDirectSearch(one_article_query, integrity_fields=["notExistingField"], integrity_action="warn", view="STANDARD", refresh=30) - try: - am_wrong_field.results - except ValueError: - pass - except Exception as e: - raise AssertionError(f"Unexpected exception type: 
{type(e).__name__}") - else: - raise AssertionError("Expected ValueError but no exception was raised") + with pytest.raises(ValueError): + _ = am_wrong_field.results def test_length(): assert len(sds_standard.results) == sds_standard._n assert len(sds_standard.results) == sds_standard._n - + assert sds_huge.get_results_size() > 156_000 + assert len(sds_pagination.results) == 127 def test_string(): - str_start = ('Search \'TITLE("Assessing LLMs in malicious code deobfuscation of ' - 'real-world malware campaigns") AND DATE(2012)\' yielded 1 document as of') - assert sds_standard.__str__().startswith(str_start) - - -def test_wrong_query(): - try: - ScienceDirectSearch( - 'Th(s querY - has M&ny ( Errors', view="STANDARD", refresh=30 - ) - except Scopus400Error: - pass - except Exception as e: - raise AssertionError(f"Unexpected exception type: {type(e).__name__}") - else: - raise AssertionError("Expected Scopus400Error but no exception was raised") + expected_str = "Search '{'title': 'Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', 'date': '2024'}' yielded 1 document as of 2025-05-07:\n 10.1016/j.eswa.2024.124912" + assert str(sds_standard) == expected_str diff --git a/pybliometrics/superclasses/base.py b/pybliometrics/superclasses/base.py index b79c1b2f..eb543d09 100644 --- a/pybliometrics/superclasses/base.py +++ b/pybliometrics/superclasses/base.py @@ -54,10 +54,12 @@ def __init__(self, ab_ref_retrieval = (api == 'AbstractRetrieval') and (params['view'] == 'REF') # Check if object retrieval obj_retrieval = (api == 'ObjectRetrieval') + # Check if ScienceDirect Search API + sciencedirect_search = (api == 'ScienceDirectSearch') if fname.exists() and not self._refresh: self._mdate = mod_ts - if search_request: + if search_request or sciencedirect_search: self._json = [loads(line) for line in fname.read_text().split("\n") if line] self._n = len(self._json) @@ -66,61 +68,84 @@ def __init__(self, else: self._json = loads(fname.read_text()) 
else: - resp = get_content(url, api, params, **kwds) - header = resp.headers - - if ab_ref_retrieval: - kwds['startref'] = '1' - data = _get_all_refs(url, params, verbose, resp, **kwds) - self._json = data - data = [data] - elif search_request: - # Get number of results + if sciencedirect_search: + resp = get_content(url, api, params, 'PUT' ,**kwds) + header = resp.headers res = resp.json() - n = int(res['search-results'].get('opensearch:totalResults', 0) or 0) + # Get the number of results + n = int(res.get('resultsFound', 0)) self._n = n - # Results size check - cursor_exists = "cursor" in params - if not cursor_exists and n > SEARCH_MAX_ENTRIES: - # Stop if there are too many results - text = f'Found {n:,} matches. The query fails to return '\ - f'more than {SEARCH_MAX_ENTRIES} entries. Change '\ - 'your query such that it returns fewer entries.' - raise ScopusQueryError(text) self._json = [] - # Download results page-wise if download: - data = res.get('search-results', {}).get('entry', []) - if not n: - data = "" - if not cursor_exists: - start = params["start"] - # Download the remaining information in chunks + data = res.get('results', []) + n_chunks = ceil(n/params["display"]["show"]) if verbose: - print(f'Downloading results for query "{params["query"]}":') - n_chunks = ceil(n/params['count']) - for i in tqdm(range(1, n_chunks), disable=not verbose, - initial=1, total=n_chunks): - if cursor_exists: - cursor = res['search-results']['cursor']['@next'] - params.update({'cursor': cursor}) - else: - start += params["count"] - params.update({'start': start}) - resp = get_content(url, api, params, **kwds) + print(f'Downloading results for query "{params}":') + for i in tqdm(range(1, n_chunks), disable=not verbose): + params['display']['offset'] += params["display"]["show"] + resp = get_content(url, api, params, 'PUT' ,**kwds) res = resp.json() - data.extend(res.get('search-results', {}).get('entry', [])) + data.extend(res.get('results', [])) header = resp.headers 
# Use header of final call self._json = data else: data = None - elif obj_retrieval: - self._object = resp.content - data = [] else: - data = loads(resp.text) - self._json = data - data = [data] + resp = get_content(url, api, params, **kwds) + header = resp.headers + + if ab_ref_retrieval: + kwds['startref'] = '1' + data = _get_all_refs(url, params, verbose, resp, **kwds) + self._json = data + data = [data] + elif search_request: + # Get number of results + res = resp.json() + n = int(res['search-results'].get('opensearch:totalResults', 0) or 0) + self._n = n + # Results size check + cursor_exists = "cursor" in params + if not cursor_exists and n > SEARCH_MAX_ENTRIES: + # Stop if there are too many results + text = f'Found {n:,} matches. The query fails to return '\ + f'more than {SEARCH_MAX_ENTRIES} entries. Change '\ + 'your query such that it returns fewer entries.' + raise ScopusQueryError(text) + self._json = [] + # Download results page-wise + if download: + data = res.get('search-results', {}).get('entry', []) + if not n: + data = "" + if not cursor_exists: + start = params["start"] + # Download the remaining information in chunks + if verbose: + print(f'Downloading results for query "{params["query"]}":') + n_chunks = ceil(n/params['count']) + for i in tqdm(range(1, n_chunks), disable=not verbose, + initial=1, total=n_chunks): + if cursor_exists: + cursor = res['search-results']['cursor']['@next'] + params.update({'cursor': cursor}) + else: + start += params["count"] + params.update({'start': start}) + resp = get_content(url, api, params, **kwds) + res = resp.json() + data.extend(res.get('search-results', {}).get('entry', [])) + header = resp.headers # Use header of final call + self._json = data + else: + data = None + elif obj_retrieval: + self._object = resp.content + data = [] + else: + data = loads(resp.text) + self._json = data + data = [data] # Set private variables self._mdate = time() self._header = header diff --git 
a/pybliometrics/superclasses/search.py b/pybliometrics/superclasses/search.py index 9723048b..be5966c6 100644 --- a/pybliometrics/superclasses/search.py +++ b/pybliometrics/superclasses/search.py @@ -5,7 +5,7 @@ from typing import Union from pybliometrics.superclasses import Base -from pybliometrics.utils import get_config, COUNTS, URLS +from pybliometrics.utils import flatten_dict, get_config, COUNTS, URLS class Search(Base): @@ -37,13 +37,27 @@ def __init__(self, api = self.__class__.__name__ # Construct query parameters count = COUNTS[api][self._view] - params = {'count': count, 'view': self._view, **kwds} - if isinstance(query, dict): - params.update(query) - name = "&".join(["=".join(t) for t in zip(query.keys(), query.values())]) + + if api == 'ScienceDirectSearch': + # Add default parameters + params = {**query} + params.setdefault('display', {}) + defaults = {'offset': 0, 'show': count, 'sortBy': 'date'} + for key, default in defaults.items(): + params['display'].setdefault(key, default) + # Flatten query and create name + flat_query = flatten_dict(query) + name = "&".join(["=".join(map(str, t)) for t in zip(flat_query.keys(), flat_query.values())]) else: - params['query'] = query - name = query + params = {'count': count, 'view': self._view, **kwds} + + if isinstance(query, dict): + params.update(query) + name = "&".join(["=".join(t) for t in zip(query.keys(), query.values())]) + else: + params['query'] = query + name = query + if cursor: params.update({'cursor': '*'}) else: diff --git a/pybliometrics/utils/constants.py b/pybliometrics/utils/constants.py index fb0fb85c..b0caeb49 100644 --- a/pybliometrics/utils/constants.py +++ b/pybliometrics/utils/constants.py @@ -133,3 +133,4 @@ # Other API restrictions SEARCH_MAX_ENTRIES = 5_000 +SCIENCE_DIRECT_MAX_ENTRIES = 6_000 diff --git a/pybliometrics/utils/get_content.py b/pybliometrics/utils/get_content.py index 801e746c..9e0c1006 100644 --- a/pybliometrics/utils/get_content.py +++ 
b/pybliometrics/utils/get_content.py @@ -1,7 +1,9 @@ -from typing import Type +from typing import Literal, Optional, Type +from random import shuffle from requests import Session from requests.adapters import HTTPAdapter from requests.exceptions import JSONDecodeError +from time import sleep, time from urllib3.util import Retry from pybliometrics import __version__ @@ -31,62 +33,12 @@ def get_session() -> Type[Session]: return session -def get_content(url, api, params=None, **kwds): - """Helper function to download a file and return its content. - - Parameters - ---------- - url : str - The URL to be parsed. - - api : str - The Scopus API to be accessed. - - params : dict (optional) - Dictionary containing query parameters. For required keys - and accepted values see e.g. - https://api.elsevier.com/documentation/AuthorRetrievalAPI.wadl - - **kwds : key-value parings, optional - Keywords passed on to as query parameters. Must contain fields - and values specified in the respective API specification. - - Raises - ------ - ScopusHtmlError or HTTPError - If the status of the response is not ok. - - ValueError - If the accept parameter is not one of the accepted values. - - Returns - ------- - resp : byte-like object - The content of the file, which needs to be serialized. 
- """ - from random import shuffle - from time import sleep, time - - # Get needed ressources for query - config = get_config() - +def prepare_headers_and_tokens(params): + """Prepare headers and tokens for the request.""" keys = get_keys() - - # Get tokens and zip with keys - insttokens = get_insttokens() - insttokens = list(zip(keys, insttokens)) - - # Keep keys that are not insttokens + insttokens = list(zip(keys, get_insttokens())) keys = keys[len(insttokens):] - session = get_session() - - params = params or {} - params.update(**kwds) - proxies = dict(config._sections.get("Proxy", {})) - timeout = config.getint("Requests", "Timeout", fallback=20) - - # Get keys/tokens and create header token_key, insttoken = None, None if "insttoken" in params: token_key = params.pop("apikey") @@ -98,64 +50,109 @@ def get_content(url, api, params=None, **kwds): else: key = keys.pop(0) - header = {'Accept': 'application/json', - 'User-Agent': user_agent, - 'X-ELS-APIKey': token_key or key} + header = { + 'Accept': 'application/json', + 'User-Agent': user_agent, + 'X-ELS-APIKey': token_key or key + } + + if insttoken: + header['X-ELS-Insttoken'] = insttoken + + return header, insttokens, keys + - # Eventually wait bc of throttling +def handle_throttling(api): + """Handle throttling based on API limits.""" if len(_throttling_params[api]) == _throttling_params[api].maxlen: try: sleep(1 - (time() - _throttling_params[api][0])) except (IndexError, ValueError): pass + +def handle_response(resp): + """Handle the response and raise appropriate errors.""" + try: + error_type = errors[resp.status_code] + try: + reason = resp.json()['service-error']['status']['statusText'] + except KeyError: + try: + reason = resp.json()['message'] + except KeyError: + try: + reason = resp.json()['error-response']['error-message'] + except KeyError: + reason = "" + raise error_type(reason) + except (JSONDecodeError, KeyError): + resp.raise_for_status() + + +def get_content(url: str, + api: str, + params: 
Optional[dict], + method: Literal['GET', 'PUT'] = 'GET', + **kwds): + """Helper function to download a file and return its content.""" + config = get_config() + + session = get_session() + + params = params or {} + params.update(**kwds) + proxies = dict(config._sections.get("Proxy", {})) + timeout = config.getint("Requests", "Timeout", fallback=20) + + header, insttokens, keys = prepare_headers_and_tokens(params) + handle_throttling(api) + # Use insttoken if available - if insttoken: - header['X-ELS-Insttoken'] = insttoken - resp = session.get(url, headers=header, params=params, timeout=timeout) + if 'X-ELS-Insttoken' in header: + if method == 'GET': + resp = session.get(url, headers=header, params=params, timeout=timeout) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout) else: - resp = session.get(url, headers=header, params=params, timeout=timeout, proxies=proxies) + if method == 'GET': + resp = session.get(url, headers=header, params=params, timeout=timeout, proxies=proxies) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout, proxies=proxies) + - # If 429 try other tokens - while (resp.status_code == 429) or (resp.status_code == 401): + # Retry logic for 429 or 401 + while resp.status_code in (429, 401): try: token_key, token = insttokens.pop(0) # Get and remove current key header['X-ELS-APIKey'] = token_key header['X-ELS-Insttoken'] = token shuffle(insttokens) - resp = session.get(url, headers=header, params=params, timeout=timeout) + if method == 'GET': + resp = session.get(url, headers=header, params=params, timeout=timeout) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout) except IndexError: # All tokens depleted break - # Remove Insttoken from header (if present) - if 'X-ELS-Insttoken' in header: - del header['X-ELS-Insttoken'] - - # If 429 try other keys - while (resp.status_code == 429) or (resp.status_code == 401): + while resp.status_code in (429, 401): try: key = 
keys.pop(0) # Remove current key header['X-ELS-APIKey'] = key shuffle(keys) - resp = session.get(url, headers=header, proxies=proxies, params=params, timeout=timeout) + if method == 'GET': + resp = session.get(url, headers=header, proxies=proxies, params=params, timeout=timeout) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout, proxies=proxies) except IndexError: # All keys depleted break + if 'X-ELS-Insttoken' in header: + del header['X-ELS-Insttoken'] + _throttling_params[api].append(time()) - # Eventually raise error, if possible with supplied error message - try: - error_type = errors[resp.status_code] - try: - reason = resp.json()['service-error']['status']['statusText'] - except KeyError: - try: - reason = resp.json()['message'] - except: - reason = "" - raise error_type(reason) - except (JSONDecodeError, KeyError): - resp.raise_for_status() + handle_response(resp) return resp diff --git a/pybliometrics/utils/parse_content.py b/pybliometrics/utils/parse_content.py index 277a0e14..e41e05b7 100644 --- a/pybliometrics/utils/parse_content.py +++ b/pybliometrics/utils/parse_content.py @@ -65,6 +65,18 @@ def deduplicate(lst): return new +def flatten_dict(d, parent_key='', sep='.'): + """Recursively flatten a nested dictionary.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + def get_id(s, integer=True): """Helper function to return the Scopus ID at a fixed position.""" path = ['coredata', 'dc:identifier'] @@ -126,7 +138,7 @@ def make_int_if_possible(val): """Attempt a conversion to int type.""" try: return int(val) - except TypeError: + except (TypeError, ValueError): return val From 22882f12679b756c7e2d02cb641e5677ad06a641 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Wed, 7 May 2025 18:42:30 +0200 Subject: [PATCH 2/9] Raise error if 
results exceed limit --- .../sciencedirect/ScienceDirectSearch.rst | 8 ++++---- .../sciencedirect/sciencedirect_search.py | 14 ++++++++------ .../tests/test_ScienceDirectSearch.py | 17 +++++++++++------ pybliometrics/superclasses/base.py | 7 ++++++- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/docs/reference/sciencedirect/ScienceDirectSearch.rst b/docs/reference/sciencedirect/ScienceDirectSearch.rst index bc2d77ce..688828cb 100644 --- a/docs/reference/sciencedirect/ScienceDirectSearch.rst +++ b/docs/reference/sciencedirect/ScienceDirectSearch.rst @@ -1,8 +1,8 @@ pybliometrics.sciencedirect.ScienceDirectSearch =============================================== -`ScienceDirectSearch()` implements the `ScienceDirect Search API `_ using the `PUT method`. It executes a query to search for documents and retrieves the resulting records. -The class takes a `query`` dictionary as input which has to follow this schema: +`ScienceDirectSearch()` implements the `ScienceDirect Search API `_ using the `PUT` method. It executes a query to search for documents and retrieves the resulting records. +The class takes a `query` dictionary as input which has to follow this schema: .. code-block:: text @@ -44,7 +44,7 @@ Examples -------- The class is initialized with a search query. -We can pass the field `qs` to search for a specific keywords. +We can pass the field `qs`` to search for specific keywords. Using `verbose=True` will print the progress of the download. .. 
code-block:: python @@ -78,7 +78,7 @@ To access the results, use the attribute `results` which contains a list of `Doc Document(authors='Ruxandra Stoean; Nebojsa Bacanin; Leonard Ionescu', doi='10.1016/j.culher.2024.07.008', loadDate='2024-08-09T00:00:00.000Z', openAccess=False, first_page=18, last_page=26, pii='S1296207424001468', publicationDate='2024-10-31', sourceTitle='Journal of Cultural Heritage', title='Bridging the past and present: AI-driven 3D restoration of degraded artefacts for museum digital display', uri='https://www.sciencedirect.com/science/article/pii/S1296207424001468?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 69'), ...] -The list of results can be cast into a Pandas DataFrame. +The list of results can be converted into a Pandas DataFrame. .. code-block:: python diff --git a/pybliometrics/sciencedirect/sciencedirect_search.py b/pybliometrics/sciencedirect/sciencedirect_search.py index bbeb18cd..c80b07f2 100644 --- a/pybliometrics/sciencedirect/sciencedirect_search.py +++ b/pybliometrics/sciencedirect/sciencedirect_search.py @@ -1,3 +1,4 @@ +"""ScienceDirectSearch class for searching documents in ScienceDirect.""" from collections import namedtuple from typing import Optional, Union @@ -8,11 +9,16 @@ class ScienceDirectSearch(Search): + """ + Interaction with the ScienceDirect Search API using the `PUT` method. + See the official `documentation `__ + for more details. + """ @property def results(self) -> Optional[list]: """ - A list of namedtuples in the form `(authors doi loadDate openAccess first_page last_page - pii publicationDate sourceTitle title uri volumeIssue)`. + A list of namedtuples in the form `(authors, doi, loadDate, openAccess, first_page, last_page, + pii, publicationDate, sourceTitle, title, uri, volumeIssue)`.
Field definitions correspond to the `ScienceDirect Search API Migration Documentation `__ and return the @@ -69,10 +75,6 @@ def __init__(self, subscriber: bool = True, ) -> None: """ - Interaction with the ScienceDirect Search API using the `PUT` method. - See the official `documentation `__ - for more details. - Parameters ---------- query : dict diff --git a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py index acfd823d..79fc5e87 100644 --- a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py +++ b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py @@ -3,6 +3,7 @@ import pytest +from pybliometrics.exception import ScopusQueryError from pybliometrics.sciencedirect import ScienceDirectSearch, init init() @@ -20,12 +21,6 @@ pagination_query = {'qs': '"Neural Networks" AND "Shapley"', 'date': '2020'} sds_pagination = ScienceDirectSearch(pagination_query, view="STANDARD", refresh=30) - -def test_empty_results(): - assert sds_empty.results is None - assert sds_empty._n == 0 - - def test_all_fields(): fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ 'sourceTitle title uri volumeIssue' @@ -65,6 +60,10 @@ def test_all_fields(): assert sds_pagination.results[-1] == expected_last_document +def test_empty_results(): + assert sds_empty.results is None + assert sds_empty._n == 0 + def test_field_consistency(): am_wrong_field = ScienceDirectSearch(one_article_query, @@ -76,12 +75,18 @@ def test_field_consistency(): _ = am_wrong_field.results +def test_large_results(): + with pytest.raises(ScopusQueryError): + _ = ScienceDirectSearch(huge_query, view="STANDARD", download=True, refresh=30) + + def test_length(): assert len(sds_standard.results) == sds_standard._n assert len(sds_standard.results) == sds_standard._n assert sds_huge.get_results_size() > 156_000 assert len(sds_pagination.results) == 127 + def test_string(): expected_str = "Search 
'{'title': 'Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', 'date': '2024'}' yielded 1 document as of 2025-05-07:\n 10.1016/j.eswa.2024.124912" assert str(sds_standard) == expected_str diff --git a/pybliometrics/superclasses/base.py b/pybliometrics/superclasses/base.py index eb543d09..d5cb4285 100644 --- a/pybliometrics/superclasses/base.py +++ b/pybliometrics/superclasses/base.py @@ -8,7 +8,7 @@ from tqdm import tqdm from pybliometrics.exception import ScopusQueryError -from pybliometrics.utils import get_content, parse_content, SEARCH_MAX_ENTRIES +from pybliometrics.utils import get_content, parse_content, SCIENCE_DIRECT_MAX_ENTRIES, SEARCH_MAX_ENTRIES from pybliometrics.utils import listify @@ -77,6 +77,11 @@ def __init__(self, self._n = n self._json = [] if download: + if n > SCIENCE_DIRECT_MAX_ENTRIES: + text = f'Found {n:,} matches. The query fails to return '\ + f'more than {SCIENCE_DIRECT_MAX_ENTRIES} entries. Please '\ + 'refine your query.' + raise ScopusQueryError(text) data = res.get('results', []) n_chunks = ceil(n/params["display"]["show"]) if verbose: From 0803af92b19fdaf8ef38b6cefaf3f724c7841373 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Sun, 18 May 2025 19:49:56 +0200 Subject: [PATCH 3/9] Requested changes --- .../sciencedirect/ScienceDirectSearch.rst | 9 ++--- .../sciencedirect/sciencedirect_search.py | 23 ++++++++--- .../tests/test_ScienceDirectSearch.py | 38 ++++++++++--------- pybliometrics/superclasses/search.py | 11 +++--- 4 files changed, 48 insertions(+), 33 deletions(-) diff --git a/docs/reference/sciencedirect/ScienceDirectSearch.rst b/docs/reference/sciencedirect/ScienceDirectSearch.rst index 688828cb..6d001680 100644 --- a/docs/reference/sciencedirect/ScienceDirectSearch.rst +++ b/docs/reference/sciencedirect/ScienceDirectSearch.rst @@ -2,7 +2,7 @@ pybliometrics.sciencedirect.ScienceDirectSearch =============================================== `ScienceDirectSearch()` implements the `ScienceDirect 
Search API `_ using the `PUT` method. It executes a query to search for documents and retrieves the resulting records. -The class takes a `query` dictionary as input which has to follow this schema: +The class takes a `query` string that searches through all the article's or chapter's content. You can also pass any of the following parameters as keyword arguments: .. code-block:: text @@ -44,16 +44,15 @@ Examples -------- The class is initialized with a search query. -We can pass the field `qs`` to search for specific keywords. +We can pass `date` as keyword argument to search for documents published in a specific date. Using `verbose=True` will print the progress of the download. .. code-block:: python >>> from pybliometrics.sciencedirect import ScienceDirectSearch, init >>> init() - >>> # Retrieve documents based on the search query - >>> query = query = {'qs': '"neural radiance fields" AND "3D rendering"', 'date': '2024'} - >>> sds = ScienceDirectSearch(query, verbose=True) + >>> # Retrieve documents based on the search query and date + >>> sds = ScienceDirectSearch('"neural radiance fields" AND "3D rendering"', date='2024', verbose=True) Downloading results for query "{'qs': '"neural radiance fields" AND "3D rendering"', 'date': '2024', 'display': {'offset': 0, 'show': 100, 'sortBy': 'date'}, 'cursor': '*'}": 100%|██████████| 1/1 [00:00<00:00, 3.23it/s] diff --git a/pybliometrics/sciencedirect/sciencedirect_search.py b/pybliometrics/sciencedirect/sciencedirect_search.py index c80b07f2..7a02f739 100644 --- a/pybliometrics/sciencedirect/sciencedirect_search.py +++ b/pybliometrics/sciencedirect/sciencedirect_search.py @@ -65,7 +65,7 @@ def results(self) -> Optional[list]: return out or None def __init__(self, - query: dict, + query: str = '', refresh: Union[bool, int] = False, view: Optional[str] = None, verbose: bool = False, @@ -73,13 +73,13 @@ def __init__(self, integrity_fields: Optional[Union[list[str], tuple[str, ...]]] = None, integrity_action: str = 
"raise", subscriber: bool = True, + **kwds: str ) -> None: """ Parameters ---------- - query : dict - The query to be sent to the API, e.g., - {'qs': '"Neural Networks" AND "Shapley"', 'date': '2019-2020'} + query : str + The query to be sent to the API, e.g. '"Neural Networks" AND "Shapley"' refresh : bool or int, optional Whether to refresh the cached file. If an int is passed, the cache @@ -107,6 +107,12 @@ def __init__(self, subscriber : bool, optional If True, cursor navigation is enabled, allowing more than 5,000 results. + **kwds: str + Additional keyword arguments to be passed to the API. These can be any available + search fields, such as `authors`, `pub-date` and `title`. For a full list of + available fields, see the `ScienceDirect Search API Migration Documentation + `__. + Raises ------ ScopusQueryError @@ -117,8 +123,13 @@ def __init__(self, is not one of the allowed values. """ + # Check if the query and keyword arguments are empty + if not (query or kwds): + msg = "The query is empty. Please provide either a query string or keyword arguments." 
+ raise ValueError(msg) + if view: - check_parameter_value(view, VIEWS['ScienceDirectSearch'], "view") + check_parameter_value(view, VIEWS["ScienceDirectSearch"], "view") else: view = "STANDARD" @@ -133,7 +144,7 @@ def __init__(self, Search.__init__(self, query=query, cursor=subscriber, download=download, - verbose=verbose) + verbose=verbose, **kwds) def __str__(self): """Print a summary string.""" diff --git a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py index 79fc5e87..492fd6fb 100644 --- a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py +++ b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py @@ -8,18 +8,21 @@ init() -one_article_query = {'title': 'Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', - 'date': '2024'} -sds_standard = ScienceDirectSearch(one_article_query, refresh=30) +sds_standard = ScienceDirectSearch(title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', + date='2024', + refresh=30) -empty_query = {'title': 'Not a realistic title', 'date': '2012'} -sds_empty = ScienceDirectSearch(empty_query, view="STANDARD", refresh=30) +sds_empty = ScienceDirectSearch(title='Not a realistic title', + date='2012', + view="STANDARD", refresh=30) -huge_query = {'qs': 'Neural Networks', 'date': '2015-2020'} -sds_huge = ScienceDirectSearch(huge_query, view="STANDARD", download=False, refresh=30) +sds_huge = ScienceDirectSearch('Neural Networks', + date='2015-2020', + view="STANDARD", download=False, refresh=30) -pagination_query = {'qs': '"Neural Networks" AND "Shapley"', 'date': '2020'} -sds_pagination = ScienceDirectSearch(pagination_query, view="STANDARD", refresh=30) +sds_pagination = ScienceDirectSearch('"Neural Networks" AND "Shapley"', + date='2020', + view="STANDARD", refresh=30) def test_all_fields(): fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ @@ 
-66,18 +69,19 @@ def test_empty_results(): def test_field_consistency(): - am_wrong_field = ScienceDirectSearch(one_article_query, - integrity_fields=["notExistingField"], - integrity_action="warn", - view="STANDARD", - refresh=30) + am_wrong_field = ScienceDirectSearch(query='', + title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', + date='2024', + integrity_fields=["notExistingField"], + integrity_action="warn", + view="STANDARD", refresh=30) with pytest.raises(ValueError): _ = am_wrong_field.results def test_large_results(): with pytest.raises(ScopusQueryError): - _ = ScienceDirectSearch(huge_query, view="STANDARD", download=True, refresh=30) + _ = ScienceDirectSearch('Neural Networks', view="STANDARD", download=True, refresh=30) def test_length(): @@ -88,5 +92,5 @@ def test_length(): def test_string(): - expected_str = "Search '{'title': 'Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', 'date': '2024'}' yielded 1 document as of 2025-05-07:\n 10.1016/j.eswa.2024.124912" - assert str(sds_standard) == expected_str + expected_str = "Search '' yielded 1 document as of" + assert str(sds_standard).startswith(expected_str) diff --git a/pybliometrics/superclasses/search.py b/pybliometrics/superclasses/search.py index be5966c6..8234225a 100644 --- a/pybliometrics/superclasses/search.py +++ b/pybliometrics/superclasses/search.py @@ -39,15 +39,16 @@ def __init__(self, count = COUNTS[api][self._view] if api == 'ScienceDirectSearch': - # Add default parameters - params = {**query} + # Set qs, keyword arguments and add default parameters + params = {'qs': query, **kwds} + # Flatten query and create name + flat_query = flatten_dict(params) + name = "&".join(["=".join(map(str, t)) for t in zip(flat_query.keys(), flat_query.values())]) + # Add default parameters for pagination params.setdefault('display', {}) defaults = {'offset': 0, 'show': count, 'sortBy': 'date'} for key, default in defaults.items(): 
params['display'].setdefault(key, default) - # Flatten query and create name - flat_query = flatten_dict(query) - name = "&".join(["=".join(map(str, t)) for t in zip(flat_query.keys(), flat_query.values())]) else: params = {'count': count, 'view': self._view, **kwds} From 98b050459378f7091b045c3662b3aa169a540b08 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Sun, 18 May 2025 20:00:47 +0200 Subject: [PATCH 4/9] Let query be optional and set '' as default --- pybliometrics/sciencedirect/sciencedirect_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pybliometrics/sciencedirect/sciencedirect_search.py b/pybliometrics/sciencedirect/sciencedirect_search.py index 7a02f739..fa36bb25 100644 --- a/pybliometrics/sciencedirect/sciencedirect_search.py +++ b/pybliometrics/sciencedirect/sciencedirect_search.py @@ -65,7 +65,7 @@ def results(self) -> Optional[list]: return out or None def __init__(self, - query: str = '', + query: Optional[str] = None, refresh: Union[bool, int] = False, view: Optional[str] = None, verbose: bool = False, @@ -127,6 +127,7 @@ def __init__(self, if not (query or kwds): msg = "The query is empty. Please provide either a query string or keyword arguments." 
raise ValueError(msg) + query = query or '' if view: check_parameter_value(view, VIEWS["ScienceDirectSearch"], "view") From a5e69f8260b060bbe86faae685420afcddf426aa Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Sun, 18 May 2025 20:17:45 +0200 Subject: [PATCH 5/9] ScienceDirectSearch: Test empty query --- .../sciencedirect/tests/test_ScienceDirectSearch.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py index 492fd6fb..3edb007f 100644 --- a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py +++ b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py @@ -68,6 +68,11 @@ def test_empty_results(): assert sds_empty._n == 0 +def test_empty_query(): + with pytest.raises(ValueError): + _ = ScienceDirectSearch(view="STANDARD", refresh=30) + + def test_field_consistency(): am_wrong_field = ScienceDirectSearch(query='', title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', From 4e069054f2d85652d5e6242dcda20055cc819968 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Fri, 13 Jun 2025 16:00:16 +0200 Subject: [PATCH 6/9] Make minimal changes --- pybliometrics/utils/get_content.py | 152 +++++++++++++++++------------ 1 file changed, 87 insertions(+), 65 deletions(-) diff --git a/pybliometrics/utils/get_content.py b/pybliometrics/utils/get_content.py index 9e0c1006..50040f35 100644 --- a/pybliometrics/utils/get_content.py +++ b/pybliometrics/utils/get_content.py @@ -1,9 +1,7 @@ from typing import Literal, Optional, Type -from random import shuffle from requests import Session from requests.adapters import HTTPAdapter from requests.exceptions import JSONDecodeError -from time import sleep, time from urllib3.util import Retry from pybliometrics import __version__ @@ -33,12 +31,66 @@ def get_session() -> Type[Session]: return session -def prepare_headers_and_tokens(params): - """Prepare headers 
and tokens for the request.""" +def get_content(url: str, + api: str, + params: Optional[dict], + method: Literal['GET', 'PUT'] = 'GET', + **kwds): + """Helper function to download a file and return its content. + + Parameters + ---------- + url : str + The URL to be parsed. + + api : str + The Scopus API to be accessed. + + params : dict (optional) + Dictionary containing query parameters. For required keys + and accepted values see e.g. + https://api.elsevier.com/documentation/AuthorRetrievalAPI.wadl + + **kwds : key-value parings, optional + Keywords passed on to as query parameters. Must contain fields + and values specified in the respective API specification. + + Raises + ------ + ScopusHtmlError or HTTPError + If the status of the response is not ok. + + ValueError + If the accept parameter is not one of the accepted values. + + Returns + ------- + resp : byte-like object + The content of the file, which needs to be serialized. + """ + from random import shuffle + from time import sleep, time + + # Get needed ressources for query + config = get_config() + keys = get_keys() - insttokens = list(zip(keys, get_insttokens())) + + # Get tokens and zip with keys + insttokens = get_insttokens() + insttokens = list(zip(keys, insttokens)) + + # Keep keys that are not insttokens keys = keys[len(insttokens):] + session = get_session() + + params = params or {} + params.update(**kwds) + proxies = dict(config._sections.get("Proxy", {})) + timeout = config.getint("Requests", "Timeout", fallback=20) + + # Get keys/tokens and create header token_key, insttoken = None, None if "insttoken" in params: token_key = params.pop("apikey") @@ -50,66 +102,20 @@ def prepare_headers_and_tokens(params): else: key = keys.pop(0) - header = { - 'Accept': 'application/json', - 'User-Agent': user_agent, - 'X-ELS-APIKey': token_key or key - } + header = {'Accept': 'application/json', + 'User-Agent': user_agent, + 'X-ELS-APIKey': token_key or key} - if insttoken: - header['X-ELS-Insttoken'] = 
insttoken - - return header, insttokens, keys - - -def handle_throttling(api): - """Handle throttling based on API limits.""" + # Eventually wait bc of throttling if len(_throttling_params[api]) == _throttling_params[api].maxlen: try: sleep(1 - (time() - _throttling_params[api][0])) except (IndexError, ValueError): pass - -def handle_response(resp): - """Handle the response and raise appropriate errors.""" - try: - error_type = errors[resp.status_code] - try: - reason = resp.json()['service-error']['status']['statusText'] - except KeyError: - try: - reason = resp.json()['message'] - except KeyError: - try: - reason = resp.json()['error-response']['error-message'] - except KeyError: - reason = "" - raise error_type(reason) - except (JSONDecodeError, KeyError): - resp.raise_for_status() - - -def get_content(url: str, - api: str, - params: Optional[dict], - method: Literal['GET', 'PUT'] = 'GET', - **kwds): - """Helper function to download a file and return its content.""" - config = get_config() - - session = get_session() - - params = params or {} - params.update(**kwds) - proxies = dict(config._sections.get("Proxy", {})) - timeout = config.getint("Requests", "Timeout", fallback=20) - - header, insttokens, keys = prepare_headers_and_tokens(params) - handle_throttling(api) - # Use insttoken if available - if 'X-ELS-Insttoken' in header: + if insttoken: + header['X-ELS-Insttoken'] = insttoken if method == 'GET': resp = session.get(url, headers=header, params=params, timeout=timeout) else: @@ -120,9 +126,8 @@ def get_content(url: str, else: resp = session.put(url, headers=header, json=params, timeout=timeout, proxies=proxies) - - # Retry logic for 429 or 401 - while resp.status_code in (429, 401): + # If 429 try other tokens + while (resp.status_code == 429) or (resp.status_code == 401): try: token_key, token = insttokens.pop(0) # Get and remove current key header['X-ELS-APIKey'] = token_key @@ -135,7 +140,12 @@ def get_content(url: str, except IndexError: # All tokens 
depleted break - while resp.status_code in (429, 401): + # Remove Insttoken from header (if present) + if 'X-ELS-Insttoken' in header: + del header['X-ELS-Insttoken'] + + # If 429 try other keys + while (resp.status_code == 429) or (resp.status_code == 401): try: key = keys.pop(0) # Remove current key header['X-ELS-APIKey'] = key @@ -147,12 +157,24 @@ def get_content(url: str, except IndexError: # All keys depleted break - if 'X-ELS-Insttoken' in header: - del header['X-ELS-Insttoken'] - _throttling_params[api].append(time()) - handle_response(resp) + # Eventually raise error, if possible with supplied error message + try: + error_type = errors[resp.status_code] + try: + reason = resp.json()['service-error']['status']['statusText'] + except KeyError: + try: + reason = resp.json()['message'] + except KeyError: + try: + reason = resp.json()['error-response']['error-message'] + except KeyError: + reason = "" + raise error_type(reason) + except (JSONDecodeError, KeyError): + resp.raise_for_status() return resp From 4c5e2133709d4464c2ef5522a8f2309aad22c3e0 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Fri, 13 Jun 2025 16:16:42 +0200 Subject: [PATCH 7/9] Requested changes: ScienceDirectSearch --- .../sciencedirect/sciencedirect_search.py | 79 +++++++++---------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/pybliometrics/sciencedirect/sciencedirect_search.py b/pybliometrics/sciencedirect/sciencedirect_search.py index fa36bb25..11b72b22 100644 --- a/pybliometrics/sciencedirect/sciencedirect_search.py +++ b/pybliometrics/sciencedirect/sciencedirect_search.py @@ -9,11 +9,6 @@ class ScienceDirectSearch(Search): - """ - Interaction with the ScienceDirect Search API using the `PUT` method. - See the official `documentation `__ - for more details. - """ @property def results(self) -> Optional[list]: """ @@ -76,42 +71,38 @@ def __init__(self, **kwds: str ) -> None: """ - Parameters - ---------- - query : str - The query to be sent to the API, e.g. 
'"Neural Networks" AND "Shapley"' - - refresh : bool or int, optional - Whether to refresh the cached file. If an int is passed, the cache - will refresh if older than that many days. - - view : str, optional - The API view to use. Default is "STANDARD". - - verbose : bool, optional - Whether to print a download progress bar. - - download : bool, optional - Whether to download results (if they haven't been cached). - - integrity_fields : list of str or tuple of str, optional - Fields whose completeness should be checked. If any field is missing, - the `integrity_action` will be triggered. - - integrity_action : {'raise', 'warn'}, optional - What to do if required fields are missing: - - - 'raise' : Raise an AttributeError - - 'warn' : Emit a UserWarning - - subscriber : bool, optional - If True, cursor navigation is enabled, allowing more than 5,000 results. - - **kwds: str - Additional keyword arguments to be passed to the API. These can be any available - search fields, such as `authors`, `pub-date` and `title`. For a full list of - available fields, see the `ScienceDirect Search API Migration Documentation - `__. + Interaction with the ScienceDirect Search API using the `PUT` method. + See the official `documentation `__ + for more details. + + :param query: Free text query string as the `qs`field in the `documentation + `__. + :param refresh: Whether to refresh the cached file if it exists or not. + If int is passed, cached file will be refreshed if the + number of days since last modification exceeds that value. + :param view: Which view to use for the query, see `the documentation `__. + Allowed values: `STANDARD`. + :param verbose: Whether to print a download progress bar. + :param download: Whether to download results (if they have not been + cached). + :param integrity_fields: A list or tuple with the names of fields whose completeness should + be checked. 
`ScienceDirectSearch` will perform the + action specified in `integrity_action` if + elements in these fields are missing. This + helps to avoid idiosyncratically missing + elements that should always be present + (e.g., doi or authors). + :param integrity_action: What to do in case integrity of provided fields + cannot be verified. Possible actions: + - `"raise"`: Raise an `AttributeError` + - `"warn"`: Raise a `UserWarning` + :param subscriber: Whether you access ScienceDirect with a subscription or not. + For subscribers, ScienceDirect's cursor navigation will be + used. Sets the number of entries in each query + iteration to the maximum number allowed by the + corresponding view. + :param kwds: Keywords passed on as query parameters. Must contain + fields and values mentioned in the `API specification `__. Raises ------ @@ -121,6 +112,12 @@ def __init__(self, ValueError If any of the parameters `integrity_action`, `refresh` or `view` is not one of the allowed values. + + Notes + ----- + The directory for cached results is `{path}/{view}/{fname}`, + where `path` is specified in your configuration file and `fname` is + the md5-hashed version of the flattened `query`.
""" # Check if the query and keyword arguments are empty From fc6f15ea486f32dd305f67def4c01b71e4db270c Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Fri, 13 Jun 2025 17:20:58 +0200 Subject: [PATCH 8/9] Base: Put Science Direct search conditional second --- .../tests/test_ScienceDirectSearch.py | 8 +-- pybliometrics/superclasses/base.py | 56 +++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py index 3edb007f..7ec0673e 100644 --- a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py +++ b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py @@ -10,19 +10,19 @@ sds_standard = ScienceDirectSearch(title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', date='2024', - refresh=30) + refresh=True) sds_empty = ScienceDirectSearch(title='Not a realistic title', date='2012', - view="STANDARD", refresh=30) + view="STANDARD", refresh=True) sds_huge = ScienceDirectSearch('Neural Networks', date='2015-2020', - view="STANDARD", download=False, refresh=30) + view="STANDARD", download=False, refresh=True) sds_pagination = ScienceDirectSearch('"Neural Networks" AND "Shapley"', date='2020', - view="STANDARD", refresh=30) + view="STANDARD", refresh=True) def test_all_fields(): fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ diff --git a/pybliometrics/superclasses/base.py b/pybliometrics/superclasses/base.py index d5cb4285..58046788 100644 --- a/pybliometrics/superclasses/base.py +++ b/pybliometrics/superclasses/base.py @@ -68,34 +68,7 @@ def __init__(self, else: self._json = loads(fname.read_text()) else: - if sciencedirect_search: - resp = get_content(url, api, params, 'PUT' ,**kwds) - header = resp.headers - res = resp.json() - # Get the number of results - n = int(res.get('resultsFound', 0)) - self._n = n - self._json = [] - if 
download: - if n > SCIENCE_DIRECT_MAX_ENTRIES: - text = f'Found {n:,} matches. The query fails to return '\ - f'more than {SCIENCE_DIRECT_MAX_ENTRIES} entries. Please '\ - 'refine your query.' - raise ScopusQueryError(text) - data = res.get('results', []) - n_chunks = ceil(n/params["display"]["show"]) - if verbose: - print(f'Downloading results for query "{params}":') - for i in tqdm(range(1, n_chunks), disable=not verbose): - params['display']['offset'] += params["display"]["show"] - resp = get_content(url, api, params, 'PUT' ,**kwds) - res = resp.json() - data.extend(res.get('results', [])) - header = resp.headers # Use header of final call - self._json = data - else: - data = None - else: + if not sciencedirect_search: resp = get_content(url, api, params, **kwds) header = resp.headers @@ -151,6 +124,33 @@ def __init__(self, data = loads(resp.text) self._json = data data = [data] + else: # ScienceDirect Search API + resp = get_content(url, api, params, 'PUT' ,**kwds) + header = resp.headers + res = resp.json() + # Get the number of results + n = int(res.get('resultsFound', 0)) + self._n = n + self._json = [] + if download: + if n > SCIENCE_DIRECT_MAX_ENTRIES: + text = f'Found {n:,} matches. The query fails to return '\ + f'more than {SCIENCE_DIRECT_MAX_ENTRIES} entries. Please '\ + 'refine your query.' 
+ raise ScopusQueryError(text) + data = res.get('results', []) + n_chunks = ceil(n/params["display"]["show"]) + if verbose: + print(f'Downloading results for query "{params}":') + for i in tqdm(range(1, n_chunks), disable=not verbose): + params['display']['offset'] += params["display"]["show"] + resp = get_content(url, api, params, 'PUT' ,**kwds) + res = resp.json() + data.extend(res.get('results', [])) + header = resp.headers # Use header of final call + self._json = data + else: + data = None # Set private variables self._mdate = time() self._header = header From 921fa26ca1039e8a6ad026511bf2f852c463bcbd Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Fri, 13 Jun 2025 17:26:06 +0200 Subject: [PATCH 9/9] Undo last commit --- pybliometrics/superclasses/base.py | 56 +++++++++++++++--------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pybliometrics/superclasses/base.py b/pybliometrics/superclasses/base.py index 58046788..d5cb4285 100644 --- a/pybliometrics/superclasses/base.py +++ b/pybliometrics/superclasses/base.py @@ -68,7 +68,34 @@ def __init__(self, else: self._json = loads(fname.read_text()) else: - if not sciencedirect_search: + if sciencedirect_search: + resp = get_content(url, api, params, 'PUT' ,**kwds) + header = resp.headers + res = resp.json() + # Get the number of results + n = int(res.get('resultsFound', 0)) + self._n = n + self._json = [] + if download: + if n > SCIENCE_DIRECT_MAX_ENTRIES: + text = f'Found {n:,} matches. The query fails to return '\ + f'more than {SCIENCE_DIRECT_MAX_ENTRIES} entries. Please '\ + 'refine your query.' 
+ raise ScopusQueryError(text) + data = res.get('results', []) + n_chunks = ceil(n/params["display"]["show"]) + if verbose: + print(f'Downloading results for query "{params}":') + for i in tqdm(range(1, n_chunks), disable=not verbose): + params['display']['offset'] += params["display"]["show"] + resp = get_content(url, api, params, 'PUT' ,**kwds) + res = resp.json() + data.extend(res.get('results', [])) + header = resp.headers # Use header of final call + self._json = data + else: + data = None + else: resp = get_content(url, api, params, **kwds) header = resp.headers @@ -124,33 +151,6 @@ def __init__(self, data = loads(resp.text) self._json = data data = [data] - else: # ScienceDirect Search API - resp = get_content(url, api, params, 'PUT' ,**kwds) - header = resp.headers - res = resp.json() - # Get the number of results - n = int(res.get('resultsFound', 0)) - self._n = n - self._json = [] - if download: - if n > SCIENCE_DIRECT_MAX_ENTRIES: - text = f'Found {n:,} matches. The query fails to return '\ - f'more than {SCIENCE_DIRECT_MAX_ENTRIES} entries. Please '\ - 'refine your query.' - raise ScopusQueryError(text) - data = res.get('results', []) - n_chunks = ceil(n/params["display"]["show"]) - if verbose: - print(f'Downloading results for query "{params}":') - for i in tqdm(range(1, n_chunks), disable=not verbose): - params['display']['offset'] += params["display"]["show"] - resp = get_content(url, api, params, 'PUT' ,**kwds) - res = resp.json() - data.extend(res.get('results', [])) - header = resp.headers # Use header of final call - self._json = data - else: - data = None # Set private variables self._mdate = time() self._header = header