diff --git a/docs/reference/sciencedirect/ScienceDirectSearch.rst b/docs/reference/sciencedirect/ScienceDirectSearch.rst index 955daaf..6d00168 100644 --- a/docs/reference/sciencedirect/ScienceDirectSearch.rst +++ b/docs/reference/sciencedirect/ScienceDirectSearch.rst @@ -1,9 +1,33 @@ pybliometrics.sciencedirect.ScienceDirectSearch -================================================= - -`ScopusSearch()` implements the `ScienceDirect Search API `_. It executes a query to search for documents and retrieves the resulting records. -Any query that works in the `Advanced Document Search on sciencedirect.com `_ will work. -For a complete guide on how to query check the `documentation `_. +=============================================== + +`ScienceDirectSearch()` implements the `ScienceDirect Search API `_ using the `PUT` method. It executes a query to search for documents and retrieves the resulting records. +The class takes a `query` string that searches through all the article's or chapter's content. You can also pass any of the following parameters as keyword arguments: + +.. code-block:: text + + { + authors: string, + date: string, + display: { + highlights: boolean, + offset: integer, + show: integer, + sortBy: string + }, + filters: { + openAccess: boolean + }, + issue: string, + loadedAfter: string, + page: string, + pub: string, + qs: string, + title: string, + volume: string + } + +For a more detailed description of the parameters, please refer to the `ScienceDirect Search API migration documentation `_. .. currentmodule:: pybliometrics.sciencedirect .. contents:: Table of Contents @@ -19,16 +43,28 @@ Documentation Examples -------- -The class is initialized with a search query. To see the download progress, set `verbose=True`. +The class is initialized with a search query. +We can pass `date` as keyword argument to search for documents published in a specific date. +Using `verbose=True` will print the progress of the download. .. 
code-block:: python >>> from pybliometrics.sciencedirect import ScienceDirectSearch, init >>> init() - >>> # Retrieve documents based on the search query - >>> sds = ScienceDirectSearch('"neural radiance fields" AND "3D" AND YEAR(2024)', verbose=True) - Downloading results for query ""neural radiance fields" AND "3D" AND YEAR(2024)": - 100%|██████████| 8/8 [00:05<00:00, 1.39it/s] + >>> # Retrieve documents based on the search query and date + >>> sds = ScienceDirectSearch('"neural radiance fields" AND "3D rendering"', date='2024', verbose=True) + Downloading results for query "{'qs': '"neural radiance fields" AND "3D rendering"', 'date': '2024', 'display': {'offset': 0, 'show': 100, 'sortBy': 'date'}, 'cursor': '*'}": + 100%|██████████| 1/1 [00:00<00:00, 3.23it/s] + + +To check the number of results, use the method `get_results_size()`. + +.. code-block:: python + + >>> # Check the number of results + >>> sds.get_results_size() + 10 + To access the results, use the attribute `results` which contains a list of `Document` namedtuples. 
@@ -36,12 +72,12 @@ To access the results, use the attribute `results` which contains a list of `Doc >>> # Access the results >>> results = sds.results - [Document(authors='Dong He;Wenhua Qian;Jinde Cao', first_author='Dong He', doi='10.1016/j.cag.2025.104181', title='GEAST-RF: Geometry Enhanced 3D Arbitrary Style Transfer Via Neural Radiance Fields', link='https://www.sciencedirect.com/science/article/pii/S0097849325000202?dgcid=api_sd_search-api-endpoint', load_date='2025-02-16T00:00:00.000Z', openaccess_status=False, pii='S0097849325000202', coverDate='2025-02-16', endingPage=None, publicationName='Computers & Graphics', startingPage='104181', api_link='https://api.elsevier.com/content/article/pii/S0097849325000202', volume=None), - Document(authors='Qicheng Xu;Min Hu;Xitao Zhang', first_author='Qicheng Xu', doi='10.1016/j.asr.2025.01.065', title='A neural radiance fields method for 3D reconstruction of space target', link='https://www.sciencedirect.com/science/article/pii/S0273117725000973?dgcid=api_sd_search-api-endpoint', load_date='2025-02-01T00:00:00.000Z', openaccess_status=False, pii='S0273117725000973', coverDate='2025-02-01', endingPage=None, publicationName='Advances in Space Research', startingPage=None, api_link='https://api.elsevier.com/content/article/pii/S0273117725000973', volume=None), - Document(authors='Jian Liu;Zhen Yu', first_author='Jian Liu', doi='10.1016/j.neucom.2025.129420', title='SA3D-L: A lightweight model for 3D object segmentation using neural radiance fields', link='https://www.sciencedirect.com/science/article/pii/S092523122500092X?dgcid=api_sd_search-api-endpoint', load_date='2025-01-14T00:00:00.000Z', openaccess_status=False, pii='S092523122500092X', coverDate='2025-03-28', endingPage=None, publicationName='Neurocomputing', startingPage='129420', api_link='https://api.elsevier.com/content/article/pii/S092523122500092X', volume='623'), + [Document(authors='Geontae Kim; Youngjin Cha', doi='10.1016/j.autcon.2024.105878', 
loadDate='2024-11-19T00:00:00.000Z', openAccess=True, first_page=105878, last_page=None, pii='S0926580524006149', publicationDate='2024-12-15', sourceTitle='Automation in Construction', title='3D Pixelwise damage mapping using a deep attention based modified Nerfacto', uri='https://www.sciencedirect.com/science/article/pii/S0926580524006149?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 168, Part B'), + Document(authors='Akram Akbar; Chun Liu; Zeran Xu', doi='10.1016/j.aei.2024.102913', loadDate='2024-11-16T00:00:00.000Z', openAccess=False, first_page=102913, last_page=None, pii='S1474034624005640', publicationDate='2024-10-31', sourceTitle='Advanced Engineering Informatics', title='Scene information guided aerial photogrammetric mission recomposition towards detailed level building reconstruction', uri='https://www.sciencedirect.com/science/article/pii/S1474034624005640?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 62, Part D'), + Document(authors='Ruxandra Stoean; Nebojsa Bacanin; Leonard Ionescu', doi='10.1016/j.culher.2024.07.008', loadDate='2024-08-09T00:00:00.000Z', openAccess=False, first_page=18, last_page=26, pii='S1296207424001468', publicationDate='2024-10-31', sourceTitle='Journal of Cultural Heritage', title='Bridging the past and present: AI-driven 3D restoration of degraded artefacts for museum digital display', uri='https://www.sciencedirect.com/science/article/pii/S1296207424001468?dgcid=api_sd_search-api-endpoint', volumeIssue='Volume 69'), ...] -The list of results can be cast into a Pandas DataFrame. +The list of results can be converted into a Pandas DataFrame. .. code-block:: python @@ -50,11 +86,12 @@ The list of results can be cast into a Pandas DataFrame. 
>>> df = pd.DataFrame(sds.results) >>> # Display available fields >>> df.columns - Index(['eid', 'filename', 'height', 'mimetype', 'ref', 'size', 'type', 'url', - 'width'], + Index(['authors', 'doi', 'loadDate', 'openAccess', 'first_page', 'last_page', + 'pii', 'publicationDate', 'sourceTitle', 'title', 'uri', 'volumeIssue'], dtype='object') >>> # Get shape of the DataFrame (rows x columns) - (200, 14) + >>> df.shape + (10, 12) >>> # Display the first 3 rows >>> df.head(3) @@ -82,74 +119,65 @@ The list of results can be cast into a Pandas DataFrame. authors - first_author doi - title - link - load_date - openaccess_status + loadDate + openAccess + first_page + last_page pii - coverDate - endingPage - publicationName - startingPage - api_link - volume + publicationDate + sourceTitle + title + uri + volumeIssue 0 - Dong He;Wenhua Qian;Jinde Cao - Dong He - 10.1016/j.cag.2025.104181 - GEAST-RF: Geometry Enhanced 3D Arbitrary Style... + Geontae Kim; Youngjin Cha + 10.1016/j.autcon.2024.105878 + 2024-11-19T00:00:00.000Z + True + 105878 + NaN + S0926580524006149 + 2024-12-15 + Automation in Construction + 3D Pixelwise damage mapping using a deep atten... https://www.sciencedirect.com/science/article/... - 2025-02-16T00:00:00.000Z - False - S0097849325000202 - 2025-02-16 - None - Computers & Graphics - 104181 - https://api.elsevier.com/content/article/pii/S... - None + Volume 168, Part B 1 - Qicheng Xu;Min Hu;Xitao Zhang - Qicheng Xu - 10.1016/j.asr.2025.01.065 - A neural radiance fields method for 3D reconst... - https://www.sciencedirect.com/science/article/... - 2025-02-01T00:00:00.000Z + Akram Akbar; Chun Liu; Zeran Xu + 10.1016/j.aei.2024.102913 + 2024-11-16T00:00:00.000Z False - S0273117725000973 - 2025-02-01 - None - Advances in Space Research - None - https://api.elsevier.com/content/article/pii/S... - None + 102913 + NaN + S1474034624005640 + 2024-10-31 + Advanced Engineering Informatics + Scene information guided aerial photogrammetri... 
+ https://www.sciencedirect.com/science/article/... + Volume 62, Part D 2 - Jian Liu;Zhen Yu - Jian Liu - 10.1016/j.neucom.2025.129420 - SA3D-L: A lightweight model for 3D object segm... - https://www.sciencedirect.com/science/article/... - 2025-01-14T00:00:00.000Z + Ruxandra Stoean; Nebojsa Bacanin; Leonard Ionescu + 10.1016/j.culher.2024.07.008 + 2024-08-09T00:00:00.000Z False - S092523122500092X - 2025-03-28 - None - Neurocomputing - 129420 - https://api.elsevier.com/content/article/pii/S... - 623 + 18 + 26.0 + S1296207424001468 + 2024-10-31 + Journal of Cultural Heritage + Bridging the past and present: AI-driven 3D re... + https://www.sciencedirect.com/science/article/... + Volume 69 - - + \ No newline at end of file diff --git a/pybliometrics/sciencedirect/sciencedirect_search.py b/pybliometrics/sciencedirect/sciencedirect_search.py index 75873e9..11b72b2 100644 --- a/pybliometrics/sciencedirect/sciencedirect_search.py +++ b/pybliometrics/sciencedirect/sciencedirect_search.py @@ -1,22 +1,24 @@ +"""ScienceDirectSearch class for searching documents in ScienceDirect.""" from collections import namedtuple from typing import Optional, Union from pybliometrics.superclasses import Search from pybliometrics.utils import check_field_consistency, chained_get, \ - check_integrity, check_parameter_value, deduplicate, \ + check_integrity, check_parameter_value, deduplicate, make_int_if_possible, \ make_search_summary, VIEWS class ScienceDirectSearch(Search): @property - def results(self) -> Optional[list[namedtuple]]: - """A list of namedtuples in the form `(authors first_author doi title link - load_date openaccess_status pii coverDate endingPage publicationName startingPage - api_link volume)`. + def results(self) -> Optional[list]: + """ + A list of namedtuples in the form `(authors, doi, loadDate, openAccess, first_page, last_page + pii, publicationDate, sourceTitle, title, uri, volumeIssue)`. 
- Field definitions correspond to the `ScienceDirect Search Views - `__ and return the - values as-is, except for `authors` which are joined on `";"`. + Field definitions correspond to the `ScienceDirect Search API Migration Documentation + `__ and return the + values as-is, except for `authors` which are joined on `"; "` and pages which are + parsed into `first_page` and `last_page`. Raises ------ @@ -29,49 +31,36 @@ def results(self) -> Optional[list[namedtuple]]: The list of authors and the list of affiliations per author are deduplicated. """ - fields = 'authors first_author doi title link load_date openaccess_status pii '\ - 'coverDate endingPage publicationName startingPage api_link volume' + fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ + 'sourceTitle title uri volumeIssue' doc = namedtuple('Document', fields) check_field_consistency(self._integrity, fields) # Parse elements one-by-one out = [] for item in self._json: # Get authors and create ";" separated string - authors_list = self._get_authors(item) - authors_list = deduplicate(authors_list) - authors = ';'.join(authors_list) - # Get links - links_found = item.get('link') - links = {'api_link': None, 'scidir': None} - for link in links_found: - if link.get('@ref') == 'self': - links['api_link'] = link.get('@href') - elif link.get('@ref') == 'scidir': - links['scidir'] = link.get('@href') - # Get doi - doi = item.get("prism:doi") or item.get("dc:identifier")[4:] if item.get("dc:identifier") else None + authors_list = deduplicate([a.get('name') for a in item.get('authors', {})]) + authors = "; ".join(authors_list) new = doc( authors=authors, - first_author=item.get('dc:creator'), - doi=doi, - title=item.get("dc:title"), - link=links["scidir"], - load_date=item.get("load-date"), - openaccess_status=item.get("openaccess"), - pii=item.get("pii"), - coverDate=item.get("prism:coverDate"), - endingPage=item.get("prism:endingPage"), - 
publicationName=item.get("prism:publicationName"), - startingPage=item.get("prism:startingPage"), - api_link=links["api_link"] or item.get("prism:url"), - volume=item.get("prism:volume") + doi=item.get('doi'), + loadDate=item.get('loadDate'), + openAccess=item.get('openAccess'), + first_page=make_int_if_possible(chained_get(item, ('pages', 'first'))), + last_page=make_int_if_possible(chained_get(item, ('pages', 'last'))), + pii=item.get('pii'), + publicationDate=item.get('publicationDate'), + sourceTitle=item.get('sourceTitle'), + title=item.get('title'), + uri=item.get('uri'), + volumeIssue=item.get('volumeIssue') ) out.append(new) check_integrity(out, self._integrity, self._action) return out or None def __init__(self, - query: str, + query: Optional[str] = None, refresh: Union[bool, int] = False, view: Optional[str] = None, verbose: bool = False, @@ -81,11 +70,13 @@ def __init__(self, subscriber: bool = True, **kwds: str ) -> None: - """Interaction with the ScienceDirect Search API. This represents a search against the - ScienceDirect cluster, which contains serial/nonserial full-text articles. Note that this API - replicates the search experience on `ScienceDirect `__. + """ + Interaction with the ScienceDirect Search API using the `PUT` method. + See the official `documentation `__ + for more details. - :param query: A string of the query as used in the `ScienceDirect Search `__. + :param query: Free text query string as the `qs` field in the `documentation + `__. :param refresh: Whether to refresh the cached file if it exists or not. If int is passed, cached file will be refreshed if the number of days since last modification exceeds that value. @@ -95,7 +86,7 @@ def __init__(self, :param download: Whether to download results (if they have not been cached). :param integrity_fields: A list or tuple with the names of fields whose completeness should - be checked. `ArticleMetadata` will perform the + be checked. 
`ScienceDirectSearch` will perform the action specified in `integrity_action` if elements in these fields are missing. This helps to avoid idiosynchratically missing @@ -111,8 +102,8 @@ def __init__(self, iteration to the maximum number allowed by the corresponding view. :param kwds: Keywords passed on as query parameters. Must contain - fields and values mentioned in the `API specification `__. - + fields and values mentioned in the `API specification `__. + Raises ------ ScopusQueryError @@ -121,48 +112,39 @@ def __init__(self, ValueError If any of the parameters `integrity_action`, `refresh` or `view` is not one of the allowed values. - + Notes ----- The directory for cached results is `{path}/{view}/{fname}`, where `path` is specified in your configuration file and `fname` is - the md5-hashed version of `query`. + the md5-hashed version of the flattened `query`. - The ScienceDirect Search API V2 has two available interfaces: `PUT` and `GET`. This library uses the - `GET` interface. """ - # Check view or set to default + # Check if the query and keyword arguments are empty + if not (query or kwds): + msg = "The query is empty. Please provide either a query string or keyword arguments." 
+ raise ValueError(msg) + query = query or '' + if view: - check_parameter_value(view, VIEWS['ScienceDirectSearch'], "view") + check_parameter_value(view, VIEWS["ScienceDirectSearch"], "view") else: view = "STANDARD" allowed = ("warn", "raise") check_parameter_value(integrity_action, allowed, "integrity_action") - # Query self._action = integrity_action self._integrity = integrity_fields or [] self._refresh = refresh self._query = query self._view = view - Search.__init__(self, query=query, download=download, verbose=verbose, **kwds) + + Search.__init__(self, query=query, + cursor=subscriber, download=download, + verbose=verbose, **kwds) def __str__(self): """Print a summary string.""" - return make_search_summary(self, "document", self.get_dois()) - - def get_dois(self): - """DOIs of retrieved documents.""" - return [d.get("prism:doi") or d.get("dc:identifier")[4:] if d.get("dc:identifier") else None for d in self._json] - - def _get_authors(self, item: dict) -> list: - """Auxiliary function to get the authors.""" - authors_data = chained_get(item, ['authors', 'author'], []) - if isinstance(authors_data, list): - authors_list = [a.get('$') for a in authors_data] - elif isinstance(authors_data, str): - authors_list = [authors_data] - else: - authors_list = [] - return authors_list + dois = [d.doi for d in self.results] if self.results else [] + return make_search_summary(self, "document", dois) diff --git a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py index b4589c5..7ec0673 100644 --- a/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py +++ b/pybliometrics/sciencedirect/tests/test_ScienceDirectSearch.py @@ -1,79 +1,101 @@ """Tests for sciencedirect.ScienceDirectSearch""" from collections import namedtuple -from pybliometrics.exception import Scopus400Error +import pytest + +from pybliometrics.exception import ScopusQueryError from pybliometrics.sciencedirect import 
ScienceDirectSearch, init init() -sds_standard = ScienceDirectSearch('TITLE("Assessing LLMs in malicious code deobfuscation of real-world malware campaigns") AND DATE(2012)', view="STANDARD", refresh=30) -sds_empty = ScienceDirectSearch('TITLE("Not a very realistic title")', view="STANDARD", refresh=30) +sds_standard = ScienceDirectSearch(title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', + date='2024', + refresh=True) +sds_empty = ScienceDirectSearch(title='Not a realistic title', + date='2012', + view="STANDARD", refresh=True) -def test_empty_results(): - assert sds_empty.results is None - assert sds_empty._n == 0 +sds_huge = ScienceDirectSearch('Neural Networks', + date='2015-2020', + view="STANDARD", download=False, refresh=True) +sds_pagination = ScienceDirectSearch('"Neural Networks" AND "Shapley"', + date='2020', + view="STANDARD", refresh=True) def test_all_fields(): - fields = 'authors first_author doi title link load_date openaccess_status pii '\ - 'coverDate endingPage publicationName startingPage api_link volume' - doc = namedtuple("Document", fields) + fields = 'authors doi loadDate openAccess first_page last_page pii publicationDate ' \ + 'sourceTitle title uri volumeIssue' + doc = namedtuple('Document', fields) expected_standard_doc = doc( - authors="Constantinos Patsakis;Fran Casino;Nikolaos Lykousas", - first_author="Constantinos Patsakis", - doi="10.1016/j.eswa.2024.124912", - title="Assessing LLMs in malicious code deobfuscation of real-world malware campaigns", - link="https://www.sciencedirect.com/science/article/pii/S0957417424017792?dgcid=api_sd_search-api-endpoint", - load_date="2024-07-31T00:00:00.000Z", - openaccess_status=True, - pii="S0957417424017792", - coverDate="2024-12-05", - endingPage=None, - publicationName="Expert Systems with Applications", - startingPage="124912", - api_link="https://api.elsevier.com/content/article/pii/S0957417424017792", - volume="256", + authors='Constantinos Patsakis; 
Fran Casino; Nikolaos Lykousas', + doi='10.1016/j.eswa.2024.124912', + loadDate="2024-07-31T00:00:00.000Z", + openAccess=True, + first_page=124912, + last_page=None, + pii='S0957417424017792', + publicationDate='2024-12-05', + sourceTitle='Expert Systems with Applications', + title='Assessing LLMs in malicious code deobfuscation of real-world malware campaigns', + uri='https://www.sciencedirect.com/science/article/pii/S0957417424017792?dgcid=api_sd_search-api-endpoint', + volumeIssue='Volume 256' ) + assert sds_standard.results[0] == expected_standard_doc + expected_last_document = doc( + authors='Elhadji Amadou Oury Diallo; Ayumi Sugiyama; Toshiharu Sugawara', + doi='10.1016/j.neucom.2018.08.094', + loadDate='2019-04-25T00:00:00.000Z', + openAccess=False, + first_page=230, + last_page=240, + pii='S0925231219304424', + publicationDate='2020-07-05', + sourceTitle='Neurocomputing', + title='Coordinated behavior of cooperative agents using deep reinforcement learning', + uri='https://www.sciencedirect.com/science/article/pii/S0925231219304424?dgcid=api_sd_search-api-endpoint', + volumeIssue='Volume 396' + ) + assert sds_pagination.results[-1] == expected_last_document + + +def test_empty_results(): + assert sds_empty.results is None + assert sds_empty._n == 0 + + +def test_empty_query(): + with pytest.raises(ValueError): + _ = ScienceDirectSearch(view="STANDARD", refresh=30) + def test_field_consistency(): - am_wrong_field = ScienceDirectSearch('TITLE("Assessing LLMs in malicious code deobfuscation of real-world malware campaigns") AND DATE(2012)', - integrity_fields=["notExistingField"], - integrity_action="warn", - view="STANDARD", - refresh=30) - try: - am_wrong_field.results - except ValueError: - pass - except Exception as e: - raise AssertionError(f"Unexpected exception type: {type(e).__name__}") - else: - raise AssertionError("Expected ValueError but no exception was raised") + am_wrong_field = ScienceDirectSearch(query='', + title='Assessing LLMs in malicious 
code deobfuscation of real-world malware campaigns', + date='2024', + integrity_fields=["notExistingField"], + integrity_action="warn", + view="STANDARD", refresh=30) + with pytest.raises(ValueError): + _ = am_wrong_field.results + + +def test_large_results(): + with pytest.raises(ScopusQueryError): + _ = ScienceDirectSearch('Neural Networks', view="STANDARD", download=True, refresh=30) def test_length(): assert len(sds_standard.results) == sds_standard._n assert len(sds_standard.results) == sds_standard._n + assert sds_huge.get_results_size() > 156_000 + assert len(sds_pagination.results) == 127 def test_string(): - str_start = ('Search \'TITLE("Assessing LLMs in malicious code deobfuscation of ' - 'real-world malware campaigns") AND DATE(2012)\' yielded 1 document as of') - assert sds_standard.__str__().startswith(str_start) - - -def test_wrong_query(): - try: - ScienceDirectSearch( - 'Th(s querY - has M&ny ( Errors', view="STANDARD", refresh=30 - ) - except Scopus400Error: - pass - except Exception as e: - raise AssertionError(f"Unexpected exception type: {type(e).__name__}") - else: - raise AssertionError("Expected Scopus400Error but no exception was raised") + expected_str = "Search '' yielded 1 document as of" + assert str(sds_standard).startswith(expected_str) diff --git a/pybliometrics/superclasses/base.py b/pybliometrics/superclasses/base.py index b79c1b2..d5cb428 100644 --- a/pybliometrics/superclasses/base.py +++ b/pybliometrics/superclasses/base.py @@ -8,7 +8,7 @@ from tqdm import tqdm from pybliometrics.exception import ScopusQueryError -from pybliometrics.utils import get_content, parse_content, SEARCH_MAX_ENTRIES +from pybliometrics.utils import get_content, parse_content, SCIENCE_DIRECT_MAX_ENTRIES, SEARCH_MAX_ENTRIES from pybliometrics.utils import listify @@ -54,10 +54,12 @@ def __init__(self, ab_ref_retrieval = (api == 'AbstractRetrieval') and (params['view'] == 'REF') # Check if object retrieval obj_retrieval = (api == 'ObjectRetrieval') + # 
Check if ScienceDirect Search API + sciencedirect_search = (api == 'ScienceDirectSearch') if fname.exists() and not self._refresh: self._mdate = mod_ts - if search_request: + if search_request or sciencedirect_search: self._json = [loads(line) for line in fname.read_text().split("\n") if line] self._n = len(self._json) @@ -66,61 +68,89 @@ def __init__(self, else: self._json = loads(fname.read_text()) else: - resp = get_content(url, api, params, **kwds) - header = resp.headers - - if ab_ref_retrieval: - kwds['startref'] = '1' - data = _get_all_refs(url, params, verbose, resp, **kwds) - self._json = data - data = [data] - elif search_request: - # Get number of results + if sciencedirect_search: + resp = get_content(url, api, params, 'PUT' ,**kwds) + header = resp.headers res = resp.json() - n = int(res['search-results'].get('opensearch:totalResults', 0) or 0) + # Get the number of results + n = int(res.get('resultsFound', 0)) self._n = n - # Results size check - cursor_exists = "cursor" in params - if not cursor_exists and n > SEARCH_MAX_ENTRIES: - # Stop if there are too many results - text = f'Found {n:,} matches. The query fails to return '\ - f'more than {SEARCH_MAX_ENTRIES} entries. Change '\ - 'your query such that it returns fewer entries.' - raise ScopusQueryError(text) self._json = [] - # Download results page-wise if download: - data = res.get('search-results', {}).get('entry', []) - if not n: - data = "" - if not cursor_exists: - start = params["start"] - # Download the remaining information in chunks + if n > SCIENCE_DIRECT_MAX_ENTRIES: + text = f'Found {n:,} matches. The query fails to return '\ + f'more than {SCIENCE_DIRECT_MAX_ENTRIES} entries. Please '\ + 'refine your query.' 
+ raise ScopusQueryError(text) + data = res.get('results', []) + n_chunks = ceil(n/params["display"]["show"]) if verbose: - print(f'Downloading results for query "{params["query"]}":') - n_chunks = ceil(n/params['count']) - for i in tqdm(range(1, n_chunks), disable=not verbose, - initial=1, total=n_chunks): - if cursor_exists: - cursor = res['search-results']['cursor']['@next'] - params.update({'cursor': cursor}) - else: - start += params["count"] - params.update({'start': start}) - resp = get_content(url, api, params, **kwds) + print(f'Downloading results for query "{params}":') + for i in tqdm(range(1, n_chunks), disable=not verbose): + params['display']['offset'] += params["display"]["show"] + resp = get_content(url, api, params, 'PUT' ,**kwds) res = resp.json() - data.extend(res.get('search-results', {}).get('entry', [])) + data.extend(res.get('results', [])) header = resp.headers # Use header of final call self._json = data else: data = None - elif obj_retrieval: - self._object = resp.content - data = [] else: - data = loads(resp.text) - self._json = data - data = [data] + resp = get_content(url, api, params, **kwds) + header = resp.headers + + if ab_ref_retrieval: + kwds['startref'] = '1' + data = _get_all_refs(url, params, verbose, resp, **kwds) + self._json = data + data = [data] + elif search_request: + # Get number of results + res = resp.json() + n = int(res['search-results'].get('opensearch:totalResults', 0) or 0) + self._n = n + # Results size check + cursor_exists = "cursor" in params + if not cursor_exists and n > SEARCH_MAX_ENTRIES: + # Stop if there are too many results + text = f'Found {n:,} matches. The query fails to return '\ + f'more than {SEARCH_MAX_ENTRIES} entries. Change '\ + 'your query such that it returns fewer entries.' 
+ raise ScopusQueryError(text) + self._json = [] + # Download results page-wise + if download: + data = res.get('search-results', {}).get('entry', []) + if not n: + data = "" + if not cursor_exists: + start = params["start"] + # Download the remaining information in chunks + if verbose: + print(f'Downloading results for query "{params["query"]}":') + n_chunks = ceil(n/params['count']) + for i in tqdm(range(1, n_chunks), disable=not verbose, + initial=1, total=n_chunks): + if cursor_exists: + cursor = res['search-results']['cursor']['@next'] + params.update({'cursor': cursor}) + else: + start += params["count"] + params.update({'start': start}) + resp = get_content(url, api, params, **kwds) + res = resp.json() + data.extend(res.get('search-results', {}).get('entry', [])) + header = resp.headers # Use header of final call + self._json = data + else: + data = None + elif obj_retrieval: + self._object = resp.content + data = [] + else: + data = loads(resp.text) + self._json = data + data = [data] # Set private variables self._mdate = time() self._header = header diff --git a/pybliometrics/superclasses/search.py b/pybliometrics/superclasses/search.py index 9723048..8234225 100644 --- a/pybliometrics/superclasses/search.py +++ b/pybliometrics/superclasses/search.py @@ -5,7 +5,7 @@ from typing import Union from pybliometrics.superclasses import Base -from pybliometrics.utils import get_config, COUNTS, URLS +from pybliometrics.utils import flatten_dict, get_config, COUNTS, URLS class Search(Base): @@ -37,13 +37,28 @@ def __init__(self, api = self.__class__.__name__ # Construct query parameters count = COUNTS[api][self._view] - params = {'count': count, 'view': self._view, **kwds} - if isinstance(query, dict): - params.update(query) - name = "&".join(["=".join(t) for t in zip(query.keys(), query.values())]) + + if api == 'ScienceDirectSearch': + # Set qs, keyword arguments and add default parameters + params = {'qs': query, **kwds} + # Flatten query and create name + 
flat_query = flatten_dict(params) + name = "&".join(["=".join(map(str, t)) for t in zip(flat_query.keys(), flat_query.values())]) + # Add default parameters for pagination + params.setdefault('display', {}) + defaults = {'offset': 0, 'show': count, 'sortBy': 'date'} + for key, default in defaults.items(): + params['display'].setdefault(key, default) else: - params['query'] = query - name = query + params = {'count': count, 'view': self._view, **kwds} + + if isinstance(query, dict): + params.update(query) + name = "&".join(["=".join(t) for t in zip(query.keys(), query.values())]) + else: + params['query'] = query + name = query + if cursor: params.update({'cursor': '*'}) else: diff --git a/pybliometrics/utils/constants.py b/pybliometrics/utils/constants.py index fb0fb85..b0caeb4 100644 --- a/pybliometrics/utils/constants.py +++ b/pybliometrics/utils/constants.py @@ -133,3 +133,4 @@ # Other API restrictions SEARCH_MAX_ENTRIES = 5_000 +SCIENCE_DIRECT_MAX_ENTRIES = 6_000 diff --git a/pybliometrics/utils/get_content.py b/pybliometrics/utils/get_content.py index 801e746..50040f3 100644 --- a/pybliometrics/utils/get_content.py +++ b/pybliometrics/utils/get_content.py @@ -1,4 +1,4 @@ -from typing import Type +from typing import Literal, Optional, Type from requests import Session from requests.adapters import HTTPAdapter from requests.exceptions import JSONDecodeError @@ -31,7 +31,11 @@ def get_session() -> Type[Session]: return session -def get_content(url, api, params=None, **kwds): +def get_content(url: str, + api: str, + params: Optional[dict], + method: Literal['GET', 'PUT'] = 'GET', + **kwds): """Helper function to download a file and return its content. 
Parameters @@ -112,9 +116,15 @@ def get_content(url, api, params=None, **kwds): # Use insttoken if available if insttoken: header['X-ELS-Insttoken'] = insttoken - resp = session.get(url, headers=header, params=params, timeout=timeout) + if method == 'GET': + resp = session.get(url, headers=header, params=params, timeout=timeout) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout) else: - resp = session.get(url, headers=header, params=params, timeout=timeout, proxies=proxies) + if method == 'GET': + resp = session.get(url, headers=header, params=params, timeout=timeout, proxies=proxies) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout, proxies=proxies) # If 429 try other tokens while (resp.status_code == 429) or (resp.status_code == 401): @@ -123,7 +133,10 @@ def get_content(url, api, params=None, **kwds): header['X-ELS-APIKey'] = token_key header['X-ELS-Insttoken'] = token shuffle(insttokens) - resp = session.get(url, headers=header, params=params, timeout=timeout) + if method == 'GET': + resp = session.get(url, headers=header, params=params, timeout=timeout) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout) except IndexError: # All tokens depleted break @@ -137,7 +150,10 @@ def get_content(url, api, params=None, **kwds): key = keys.pop(0) # Remove current key header['X-ELS-APIKey'] = key shuffle(keys) - resp = session.get(url, headers=header, proxies=proxies, params=params, timeout=timeout) + if method == 'GET': + resp = session.get(url, headers=header, proxies=proxies, params=params, timeout=timeout) + else: + resp = session.put(url, headers=header, json=params, timeout=timeout, proxies=proxies) except IndexError: # All keys depleted break @@ -151,8 +167,11 @@ def get_content(url, api, params=None, **kwds): except KeyError: try: reason = resp.json()['message'] - except: - reason = "" + except KeyError: + try: + reason = resp.json()['error-response']['error-message'] + except 
KeyError: + reason = "" raise error_type(reason) except (JSONDecodeError, KeyError): resp.raise_for_status() diff --git a/pybliometrics/utils/parse_content.py b/pybliometrics/utils/parse_content.py index 277a0e1..e41e05b 100644 --- a/pybliometrics/utils/parse_content.py +++ b/pybliometrics/utils/parse_content.py @@ -65,6 +65,18 @@ def deduplicate(lst): return new +def flatten_dict(d, parent_key='', sep='.'): + """Recursively flatten a nested dictionary.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + def get_id(s, integer=True): """Helper function to return the Scopus ID at a fixed position.""" path = ['coredata', 'dc:identifier'] @@ -126,7 +138,7 @@ def make_int_if_possible(val): """Attempt a conversion to int type.""" try: return int(val) - except TypeError: + except (TypeError, ValueError): return val