From 17dc400c07d518271658163f3b9e44d4603820bd Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 1 Apr 2022 15:07:31 +0200 Subject: [PATCH 01/15] add get_subarticles to Article --- allofplos/article.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/allofplos/article.py b/allofplos/article.py index 49ab4571..472fb371 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1354,6 +1354,19 @@ def word_count(self): body_word_count = 0 return body_word_count + # region: review_crawling2022 + def get_subarticles(self): + """Get sub-articles embedded in the XML tree of this article. + + :rtype: list + :return: list of lxml elements that are roots of each sub-article + """ + sub_articles = self.root.findall('sub-article') + return sub_articles # TODO: return list of Articles instead? + + # endregion + + @filename.setter def filename(self, value): """Sets an article object using a local filename. From 553ec35ac6ecb3811a426bdc25c5c1050a32c3d1 Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 15 Apr 2022 14:21:45 +0200 Subject: [PATCH 02/15] from_xml constructor for Articles takes an XML string as input --- allofplos/article.py | 47 +++++++++++++++++++++++++++++------------ allofplos/plos_regex.py | 5 +++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 472fb371..3674ad24 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1214,7 +1214,7 @@ def body(self): :rtype: {str} """ - xml_tree = et.parse(self.filename) + xml_tree = et.parse(self.filepath) root = xml_tree.getroot() # limit the text to the body section @@ -1354,19 +1354,6 @@ def word_count(self): body_word_count = 0 return body_word_count - # region: review_crawling2022 - def get_subarticles(self): - """Get sub-articles embedded in the XML tree of this article. - - :rtype: list - :return: list of lxml elements that are roots of each sub-article - """ - sub_articles = self.root.findall('sub-article') - return sub_articles # TODO: return list of Articles instead? - - # endregion - - @filename.setter def filename(self, value): """Sets an article object using a local filename. @@ -1391,3 +1378,35 @@ def from_filename(cls, filename): else: directory = None return cls(filename_to_doi(filename), directory=directory) + + # region: review_crawling2022 + @classmethod + def from_xml(cls, source): + """Initiate an article object using an XML-encoded string. + Parses the XML to obtain the article's doi. + Does not change the default directory parameter, so the resulting Article has no filename associated. + """ + root = et.fromstring(source) + doi = root.find('front//article-id').text + a = Article(doi) + a.tree = root.getroottree() + return a + + @tree.setter + def tree(self, value): + """ + Set tree to the given object. + """ + assert isinstance(value, et._Element) + self._tree = value + + def get_subarticles(self): + """Get sub-articles embedded in the XML tree of this article. + + :rtype: list + :return: list of lxml elements that are roots of each sub-article + """ + sub_articles = self.root.findall('sub-article') + return sub_articles # TODO: return list of Articles instead? 
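+        # A usage sketch (editorial aside, not a line of this patch): the
+        # returned roots are plain lxml elements, e.g.
+        #     art = Article.from_xml(xml_string)
+        #     for sub in art.get_subarticles():
+        #         print(sub.get('article-type'))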
+ + # endregion \ No newline at end of file diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index a25528e9..a5ddc69a 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -9,8 +9,9 @@ newarticledir_regex = re.escape(newarticledir) regex_match_prefix = r"^10\.1371/" -regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7}$)" +regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") +regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)" @@ -19,7 +20,7 @@ r"|([a-zA-Z0-9]{32}$))") regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") -full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) +full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match+regex_suffix_match) full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) From cb4445f47dd1f78524d9313ecaebc109573fbff8 Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 15 Apr 2022 14:45:10 +0200 Subject: [PATCH 03/15] refactor body property --- allofplos/article.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 3674ad24..6d3386ec 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1214,11 +1214,8 @@ def body(self): :rtype: {str} """ - xml_tree = et.parse(self.filepath) - root = xml_tree.getroot() - # limit the text to the body section - body = root.find('./body') + body = self.root.find('./body') # remove supplementary material section for sec in body.findall('.//sec'): @@ -1384,10 +1381,11 @@ def from_filename(cls, filename): def from_xml(cls, source): """Initiate an article object using an XML-encoded string. Parses the XML to obtain the article's doi. - Does not change the default directory parameter, so the resulting Article has no filename associated. + + Does not set `self.directory` parameter, so the resulting Article may have no file associated. """ root = et.fromstring(source) - doi = root.find('front//article-id').text + doi = root.find("front//article-id[@pub-id-type='doi']").text a = Article(doi) a.tree = root.getroottree() return a @@ -1397,7 +1395,7 @@ def tree(self, value): """ Set tree to the given object. """ - assert isinstance(value, et._Element) + assert isinstance(value, et._ElementTree) # TODO better validation? 
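+        # (editorial note: `from_xml` assigns the result of `root.getroottree()`,
+        # an lxml _ElementTree, which is what the tightened check above expects)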
self._tree = value def get_subarticles(self): From 35a6df626c32cb16e9fc95fd2a0e56b520a23820 Mon Sep 17 00:00:00 2001 From: x-j Date: Sat, 16 Apr 2022 04:37:01 +0200 Subject: [PATCH 04/15] extend validating urls to include peer reviews modify and rename validate_url to accept supplementary materials --- allofplos/plos_regex.py | 37 +++++++++++++++++++--------- allofplos/samples/corpus_analysis.py | 4 +-- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index a5ddc69a..e0af5e13 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -11,7 +11,7 @@ regex_match_prefix = r"^10\.1371/" regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") -regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles +regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles (reviews and supplementary materials) regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)" @@ -25,12 +25,15 @@ "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) file_regex_match = re.compile(regex_file_search+r"\.xml") -BASE_URL = 'https://journals.plos.org/plosone/article/file?id=' -URL_SUFFIX = '&type=manuscript' -external_url_regex_match = re.compile(re.escape(BASE_URL) + - re.escape("10.1371/") + - regex_body_search + - re.escape(URL_SUFFIX)) +regex_type_match = r"(article)|(peerReview)" +regex_file_suffix = r"&type=((manuscript)|(supplementary))" + +BASE_URL = 'https://journals.plos.org/plosone/' +external_url_regex_match = re.compile(re.escape(BASE_URL) + re.escape("article/file?id=10.1371/") + + regex_body_search + regex_suffix_match + regex_file_suffix) +plos_url_regex_match = re.compile(re.escape("https://journals.plos.org/") + r"[a-z]+/" + + regex_type_match + re.escape("?id=10.1371/") + + regex_body_search + regex_suffix_match) def validate_doi(doi): @@ -58,14 +61,26 @@ def validate_filename(filename): return False -def validate_url(url): +def validate_file_url(url): """ - For an individual string, tests whether the full string is in a valid article url format or not + For an individual string, tests whether the full string is in a valid article (manuscript) url format or not Example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147&type=manuscript' is True, but 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147' is False - :return: True if string is in a valid PLOS article url; False if not + + Urls leading to files containing supplementary material are valid. + example: '' + :return: True if string is in a valid PLOS file url; False if not + """ + return bool(external_url_regex_match.match(url)) + + +def validate_plos_url(url): + """ + Tests whether the given `url` string is a valid PLOS website format. 
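+    Example: 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809'
+    should be True, as should the matching peerReview page (illustrative DOI; editorial example).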
+
+    :return: True if string is in a valid PLOS url; False otherwise
+    """
-    return bool(external_url_regex_match.search(url))
+    return bool(plos_url_regex_match.search(url))


 def find_valid_dois(doi):
diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py
index c28f9b9e..0a3855f0 100644
--- a/allofplos/samples/corpus_analysis.py
+++ b/allofplos/samples/corpus_analysis.py
@@ -17,7 +17,7 @@

 from .. import get_corpus_dir, newarticledir
-from ..plos_regex import (validate_doi, full_doi_regex_match, validate_url, validate_filename)
+from ..plos_regex import (validate_doi, full_doi_regex_match, validate_file_url, validate_filename)
 from ..transformations import (filename_to_doi, doi_to_url)
 from ..corpus.plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
                                   download_updated_xml, get_all_solr_dois,
@@ -49,7 +49,7 @@ def validate_corpus(directory=None):
     # check urls
     plos_urls = [doi_to_url(doi) for doi in plos_valid_dois]
-    plos_valid_urls = [url for url in plos_urls if validate_url(url)]
+    plos_valid_urls = [url for url in plos_urls if validate_file_url(url)]
     if set(plos_urls) == set(plos_valid_urls) and len(plos_valid_urls) == len(plos_valid_dois):
         pass
     else:

From 115fba2c515d6556ad84346437a8090d710b6104 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bartlomiej=20Zaj=C4=85c?=
Date: Wed, 20 Apr 2022 00:56:19 +0200
Subject: [PATCH 05/15] add option to not unzip, keeping zip file instead

---
 allofplos/corpus/gdrive.py      | 14 +++++++++-----
 allofplos/corpus/plos_corpus.py | 10 ++++++----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/allofplos/corpus/gdrive.py b/allofplos/corpus/gdrive.py
index f5dbfc96..1b2d9091 100644
--- a/allofplos/corpus/gdrive.py
+++ b/allofplos/corpus/gdrive.py
@@ -1,5 +1,6 @@
 import datetime
 import os
+import re
 import tarfile

 from zipfile import ZipFile, BadZipFile
@@ -57,7 +58,8 @@ def download_file_from_google_drive(id, filename, key=None, directory=None,

     if not os.path.isfile(file_path):
         session = requests.Session()
-        response = session.get(GDRIVE_URL, params={'id': id, 'resourcekey': key, 'authuser': '0', 'export': 'download'}, stream=True)
+        params = {'id': id, 'resourcekey': key, 'authuser': '0', 'export': 'download'}
+        response = session.get(GDRIVE_URL, params=params, stream=True)
         token = get_confirm_token(response)
         if token:
             params = {'id': id, 'confirm': token, 'resourcekey': key, 'authuser': '0', 'export': 'download'}
@@ -69,13 +71,15 @@ def get_confirm_token(response):
     """
     Part of keep-alive method for downloading large files from Google Drive
-    Discards packets of data that aren't the actual file
+    Discards packets of data that aren't the actual file  # the behavior of this function does not match its description
     :param response: session-based google query
-    :return: either datapacket or discard unneeded data
+    :return: either datapacket or discard unneeded data
     """
     for key, value in response.cookies.items():
         if key.startswith('download_warning'):
             return value
+    # the code above will likely not work in 2022
+    return 't'
     return None
@@ -107,7 +111,7 @@ def save_response_content(response, download_path, file_size=None):
                 f.write(chunk)


-def get_zip_metadata(method='initial'):
+def get_zip_metadata(method='initial', directory=get_corpus_dir()):
     """
     Gets metadata txt file from Google Drive, that has info about zip file
     Used to get the file name, as well as byte size for progress bar
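+    :param directory: directory the metadata txt file is saved to; defaults to `get_corpus_dir()` (editorial note documenting the new parameter)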
:return: tuple of data about zip file: date zip created, zip size, and location of metadata txt file """ if method == 'initial': - metadata_path = download_file_from_google_drive(METADATA_ID, ZIP_METADATA, key=METADATA_KEY) + metadata_path = download_file_from_google_drive(METADATA_ID, ZIP_METADATA, key=METADATA_KEY, directory=directory) with open(metadata_path) as f: zip_stats = f.read().splitlines() zip_datestring = zip_stats[0] diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index f05a35a7..e98ef400 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -575,7 +575,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination): move_articles(tempdir, destination) -def create_local_plos_corpus(directory=None, rm_metadata=True): +def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): """ Downloads a fresh copy of the PLOS corpus by: 1) creating directory if it doesn't exist @@ -583,6 +583,7 @@ def create_local_plos_corpus(directory=None, rm_metadata=True): 2) downloading the zip file (defaults to corpus directory) 3) extracting the individual XML files into the corpus directory :param directory: directory where the corpus is to be downloaded and extracted + :param unzip: whether to extract article files, or keep the zip file instead. Defaults to extracting and removing the zip file afterwards. :param rm_metadata: COMPLETE HERE :return: None """ @@ -591,9 +592,10 @@ def create_local_plos_corpus(directory=None, rm_metadata=True): if not os.path.isdir(directory): print('Creating folder for article xml') os.makedirs(directory, exist_ok=True) - zip_date, zip_size, metadata_path = get_zip_metadata() - zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size) - unzip_articles(file_path=zip_path) + zip_date, zip_size, metadata_path = get_zip_metadata(directory=directory) + zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size, directory=directory) + if unzip: + unzip_articles(file_path=zip_path) if rm_metadata: os.remove(metadata_path) From c10be09b9cea4495c5424f35a401c1571778ab4c Mon Sep 17 00:00:00 2001 From: x-j Date: Sat, 23 Apr 2022 00:26:16 +0200 Subject: [PATCH 06/15] add parameters to create_local_plos_corpus --- allofplos/corpus/plos_corpus.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index e98ef400..cbf2bfe2 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -575,7 +575,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination): move_articles(tempdir, destination) -def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): +def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delete_file=True): """ Downloads a fresh copy of the PLOS corpus by: 1) creating directory if it doesn't exist @@ -583,8 +583,9 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): 2) downloading the zip file (defaults to corpus directory) 3) extracting the individual XML files into the corpus directory :param directory: directory where the corpus is to be downloaded and extracted + :param rm_metadata: whether to remove the txt file containing metadata for the zip archive :param unzip: whether to extract article files, or keep the zip file instead. Defaults to extracting and removing the zip file afterwards. 
- :param rm_metadata: COMPLETE HERE + :param delete_file: whether to delete the compressed archive after extracting articles :return: None """ if directory is None: @@ -595,7 +596,7 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): zip_date, zip_size, metadata_path = get_zip_metadata(directory=directory) zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size, directory=directory) if unzip: - unzip_articles(file_path=zip_path) + unzip_articles(file_path=zip_path, delete_file=delete_file) if rm_metadata: os.remove(metadata_path) From 8d33d63ccaabcc3fa1169d5fa344570a59fad5ab Mon Sep 17 00:00:00 2001 From: x-j Date: Sat, 23 Apr 2022 00:26:43 +0200 Subject: [PATCH 07/15] fix from_xml article constructor --- allofplos/article.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 6d3386ec..639c6142 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1378,15 +1378,16 @@ def from_filename(cls, filename): # region: review_crawling2022 @classmethod - def from_xml(cls, source): + def from_xml(cls, source, directory = None): """Initiate an article object using an XML-encoded string. Parses the XML to obtain the article's doi. - Does not set `self.directory` parameter, so the resulting Article may have no file associated. + :param source: string containing XML describing an article + :param directory: path to directory containing the XML for this article. Defaults to `get_corpus_dir()` via `Article().__init__`. """ root = et.fromstring(source) - doi = root.find("front//article-id[@pub-id-type='doi']").text - a = Article(doi) + doi = root.find("front//article-id[@pub-id-type='doi']").text.strip() + a = Article(doi, directory) a.tree = root.getroottree() return a @@ -1405,6 +1406,6 @@ def get_subarticles(self): :return: list of lxml elements that are roots of each sub-article """ sub_articles = self.root.findall('sub-article') - return sub_articles # TODO: return list of Articles instead? + return sub_articles # maybe return list of Articles instead? # endregion \ No newline at end of file From 982d358437d226c46081851f9c5446312c318e7c Mon Sep 17 00:00:00 2001 From: x-j Date: Mon, 25 Apr 2022 14:56:01 +0200 Subject: [PATCH 08/15] add functions to article class self.categories and self.get_author_names() --- allofplos/article.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/allofplos/article.py b/allofplos/article.py index 639c6142..9a54ac29 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1408,4 +1408,36 @@ def get_subarticles(self): sub_articles = self.root.findall('sub-article') return sub_articles # maybe return list of Articles instead? + def get_author_names(self): + """ + Compresses the list of dicts stored in `self.authors` into a simpler list of author names. + + :rtype: list + """ + parsed_authors = [] + for author in self.authors: + if author['given_names'] is None and author['surname'] is None: + parsed_authors.append(author['group_name']) + else: + parsed_authors.append(author['given_names']+ ' ' +author['surname']) + return parsed_authors + + @property + def categories(self): + """ + Get the categories (or keywords) defined for this article. 
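+        Keywords are collected from the `article-categories` element of the
+        article's front matter, with duplicates removed.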
+ + :rtype: list + """ + keywords_set = set() # using a set because they tend to be duplicated + categories = self.root.find('.//front').find('.//article-categories') + if categories is None: + return None + + for el in categories[1:]: # skipping the first one because it's a "heading" + for subj in el.iterdescendants(): + if len(subj) == 1: keywords_set.add(subj[0].text.strip()) + return list(keywords_set) + + # endregion \ No newline at end of file From ee96a67eddcbd3357ce402a9d45ab6a2865cd56f Mon Sep 17 00:00:00 2001 From: x-j Date: Mon, 25 Apr 2022 14:56:25 +0200 Subject: [PATCH 09/15] add comments --- allofplos/corpus/plos_corpus.py | 10 +++++----- allofplos/plos_regex.py | 5 +++-- allofplos/transformations.py | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index cbf2bfe2..4606a4b7 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -582,10 +582,10 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delet 2) downloading metadata about the .zip of all PLOS XML 2) downloading the zip file (defaults to corpus directory) 3) extracting the individual XML files into the corpus directory - :param directory: directory where the corpus is to be downloaded and extracted - :param rm_metadata: whether to remove the txt file containing metadata for the zip archive - :param unzip: whether to extract article files, or keep the zip file instead. Defaults to extracting and removing the zip file afterwards. - :param delete_file: whether to delete the compressed archive after extracting articles + :param directory: directory where the corpus is to be downloaded + :param rm_metadata: whether to remove the txt file containing metadata for the zip archive. Defaults to `True` + :param unzip: whether to extract article files to corpus dir, or just keep the zip file instead. Defaults to `True` + :param delete_file: whether to delete the compressed archive after extracting articles. Defaults to `True` :return: None """ if directory is None: @@ -596,7 +596,7 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delet zip_date, zip_size, metadata_path = get_zip_metadata(directory=directory) zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size, directory=directory) if unzip: - unzip_articles(file_path=zip_path, delete_file=delete_file) + unzip_articles(file_path=zip_path, extract_directory=get_corpus_dir(), delete_file=delete_file) if rm_metadata: os.remove(metadata_path) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index e0af5e13..d05cf344 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -11,7 +11,7 @@ regex_match_prefix = r"^10\.1371/" regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") -regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles (reviews and supplementary materials) +regex_suffix_match = r"(\.[rs][0-9]{3})?" 
# matches reviews and supplementary materials
 regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
                      r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
 regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)"
@@ -68,7 +68,8 @@ def validate_file_url(url):
     but 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147' is False

     Urls leading to files containing supplementary material are valid.
-    example: ''
+    example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0222522.s002&type=supplementary' is True
+
     :return: True if string is in a valid PLOS file url; False if not
     """
     return bool(external_url_regex_match.match(url))
diff --git a/allofplos/transformations.py b/allofplos/transformations.py
index aa144aa5..1975824e 100644
--- a/allofplos/transformations.py
+++ b/allofplos/transformations.py
@@ -37,6 +37,7 @@
     'assetXMLFile': 'article/file',
     'articleMetrics': 'article/metrics',
     'articleRelated': 'article/related'}
+    # 'peerReviews': 'article/peerReview


 def _get_base_page(journal):
@@ -144,8 +145,8 @@ def url_to_doi(url):
     Example: url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \
              '10.1371/journal.pone.1000001'

-    :param url: online location of a PLOS article's XML
-    :return: full unique identifier for a PLOS article
+    :param url: online location of a PLOS article's XML (not necessarily; the base article link works too)
+    :return: full unique identifier for a PLOS article (or a peer review, supplementary material, etc.)
     """
     return url[url.index(PREFIX):].rstrip(URL_SUFFIX).rstrip(INT_URL_SUFFIX)

From cadff269f66fd462842a3e6fbcabe2eaa8c7213b Mon Sep 17 00:00:00 2001
From: x-j
Date: Tue, 21 Jun 2022 00:28:55 +0200
Subject: [PATCH 10/15] get_page handles reviews

---
 allofplos/article.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/allofplos/article.py b/allofplos/article.py
index 9a54ac29..bcc25f05 100644
--- a/allofplos/article.py
+++ b/allofplos/article.py
@@ -842,9 +842,11 @@ def get_page(self, page_type='article'):
         """Get any of the PLOS URLs associated with a particular DOI.

         Based on `get_page_base()`, which customizes the beginning URL by journal.
-        :param page_type: one of the keys in `plos_page_dict`, defaults to article
+        :param page_type: one of the keys in `plos_page_dict` or the string "reviews".
defaults to article """ BASE_LANDING_PAGE = _get_base_page(self.journal) + if page_type == "reviews": + page = BASE_LANDING_PAGE + "article/peerReview/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) try: page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) From e2d3c314f28d0001c2f4b6ac8da0b3c57adeb609 Mon Sep 17 00:00:00 2001 From: x-j Date: Thu, 23 Jun 2022 01:57:21 +0200 Subject: [PATCH 11/15] get_page now handles reviews --- allofplos/article.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index bcc25f05..459b9c74 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -847,13 +847,13 @@ def get_page(self, page_type='article'): BASE_LANDING_PAGE = _get_base_page(self.journal) if page_type == "reviews": page = BASE_LANDING_PAGE + "article/peerReview/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) - try: - page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], - self.doi) - if page_type == 'assetXMLFile': - page += URL_SUFFIX - except KeyError: - raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) + else: + try: + page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) + if page_type == 'assetXMLFile': + page += URL_SUFFIX + except KeyError: + raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) return page @property From 7d2fa18d2dba8c5f7ebf6e81710ec67b3e99a82c Mon Sep 17 00:00:00 2001 From: x-j Date: Thu, 23 Jun 2022 02:10:39 +0200 Subject: [PATCH 12/15] add peerReviews to plos_page_dict --- allofplos/article.py | 19 +++++++++---------- allofplos/transformations.py | 4 ++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 459b9c74..12f8cf41 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -842,18 +842,17 @@ def get_page(self, page_type='article'): """Get any of the PLOS URLs associated with a particular DOI. Based on `get_page_base()`, which customizes the beginning URL by journal. - :param page_type: one of the keys in `plos_page_dict` or the string "reviews". defaults to article + :param page_type: one of the keys in `plos_page_dict`. 
defaults to article """ BASE_LANDING_PAGE = _get_base_page(self.journal) - if page_type == "reviews": - page = BASE_LANDING_PAGE + "article/peerReview/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) - else: - try: - page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) - if page_type == 'assetXMLFile': - page += URL_SUFFIX - except KeyError: - raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) + if page_type == "peerReview": + return BASE_LANDING_PAGE + "article/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) + try: + page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) + if page_type == 'assetXMLFile': + page += URL_SUFFIX + except KeyError: + raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) return page @property diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 1975824e..83f423ab 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -36,8 +36,8 @@ 'assetFile': 'article/file', 'assetXMLFile': 'article/file', 'articleMetrics': 'article/metrics', - 'articleRelated': 'article/related'} - # 'peerReviews': 'article/peerReview + 'articleRelated': 'article/related', + 'peerReviews': 'peerReview'} # get_page function handles peerReviews differently def _get_base_page(journal): From f01530ba8798185e30788b857c457c38d2bc0bc9 Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 24 Jun 2022 00:27:04 +0200 Subject: [PATCH 13/15] peerReviews -> peerReview --- allofplos/transformations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 83f423ab..71627176 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -37,7 +37,7 @@ 'assetXMLFile': 'article/file', 'articleMetrics': 'article/metrics', 'articleRelated': 'article/related', - 'peerReviews': 'peerReview'} # get_page function handles peerReviews differently + 'peerReview': 'peerReview'} # get_page function handles peerReview differently def _get_base_page(journal): From 4e847da0712508fb53f7b7a06dc5b5c0d731e3cc Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 24 Jun 2022 00:45:13 +0200 Subject: [PATCH 14/15] add peerReview to plos_page_dict --- allofplos/transformations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 1975824e..b84d34db 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -36,8 +36,8 @@ 'assetFile': 'article/file', 'assetXMLFile': 'article/file', 'articleMetrics': 'article/metrics', - 'articleRelated': 'article/related'} - # 'peerReviews': 'article/peerReview + 'articleRelated': 'article/related', + 'peerReview': 'article/peerReview'} def _get_base_page(journal): From 63e1a7fb54b737fe98d83df2c6496e15f3ed76aa Mon Sep 17 00:00:00 2001 From: x-j Date: Mon, 18 Jul 2022 01:38:39 +0200 Subject: [PATCH 15/15] change to os-universal paths --- tests/test_unittests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_unittests.py b/tests/test_unittests.py index 9ecff9ad..ac601fa3 100644 --- a/tests/test_unittests.py +++ b/tests/test_unittests.py @@ -92,7 +92,7 @@ def test_class_doi1(self): self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi)) 
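        # Editorial note: os.path.join builds the expected relative paths with
        # the host OS separator, so the path assertions below pass on Windows too.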
self.assertEqual(article.editor, [{'contrib_initials': 'EGL', 'given_names': 'Eric Gordon', 'surname': 'Lamb', 'group_name': None, 'ids': [], 'rid_dict': {'aff': ['edit1']}, 'contrib_type': 'editor', 'author_type': None, 'editor_type': None, 'email': None, 'affiliations': ['University of Saskatchewan, CANADA'], 'author_roles': {None: ['Editor']}, 'footnotes': []}], 'editor does not transform correctly for {}'.format(article.doi)) article_relpath = os.path.relpath(article.filepath, TESTDIR) - self.assertEqual(article_relpath, "testdata/journal.pone.0185809.xml", 'filename does not transform correctly for {}'.format(article.doi)) + self.assertEqual(article_relpath, os.path.join("testdata","journal.pone.0185809.xml"), 'filename does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809", 'page does not transform correctly for {}'.format(article.doi)) @@ -133,7 +133,7 @@ def test_example_doi(self): self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi)) article_relpath = os.path.relpath(article.filepath, TESTDIR) - self.assertEqual(article_relpath, "testdata/journal.pbio.2001413.xml", 'filename does not transform correctly for {}'.format(article.doi)) + self.assertEqual(article_relpath, os.path.join("testdata","journal.pbio.2001413.xml"), 'filename does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.journal, "PLOS Biology", 'journal does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.page, "https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001413", 'page does not transform correctly for {}'.format(article.doi)) @@ -172,7 +172,7 @@ def test_example_doi2(self): self.assertEqual(article.dtd, "NLM 3.0", 'dtd does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi)) article_relpath = os.path.relpath(article.filepath, TESTDIR) - self.assertEqual(article_relpath, "testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml", 'filename does not transform correctly for {}'.format(article.doi)) + self.assertEqual(article_relpath, os.path.join("testdata","plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml"), 'filename does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6", 'page does not transform correctly for {}'.format(article.doi))
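
Taken together, the series gives Article an XML-first construction path plus
review-aware URL helpers. A minimal end-to-end sketch (editorial addition, not
part of the patches; it assumes all fifteen patches are applied, that
`Article.root` resolves against the tree set by `from_xml`, and that journal
lookup succeeds for a "pone" DOI; the inline XML and DOI are illustrative):

    from allofplos.article import Article
    from allofplos.plos_regex import validate_plos_url

    xml = (
        '<article>'
        '<front><article-meta>'
        '<article-id pub-id-type="doi">10.1371/journal.pone.0185809</article-id>'
        '</article-meta></front>'
        '<body/>'
        '<sub-article article-type="reviewer-report"/>'
        '</article>'
    )

    art = Article.from_xml(xml)                 # patches 02/03/07
    print(len(art.get_subarticles()))           # patch 01: one sub-article root
    url = art.get_page(page_type='peerReview')  # patches 10-14
    print(validate_plos_url(url))               # patch 04: expected True

    # Corpus-download knobs from patches 05/06 (large download, shown only):
    # create_local_plos_corpus(directory='corpus', unzip=False, rm_metadata=False)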