From 17dc400c07d518271658163f3b9e44d4603820bd Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 1 Apr 2022 15:07:31 +0200 Subject: [PATCH 01/15] add get_subarticles to Article --- allofplos/article.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/allofplos/article.py b/allofplos/article.py index 49ab4571..472fb371 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1354,6 +1354,19 @@ def word_count(self): body_word_count = 0 return body_word_count + # region: review_crawling2022 + def get_subarticles(self): + """Get sub-articles embedded in the XML tree of this article. + + :rtype: list + :return: list of lxml elements that are roots of each sub-article + """ + sub_articles = self.root.findall('sub-article') + return sub_articles # TODO: return list of Articles instead? + + # endregion + + @filename.setter def filename(self, value): """Sets an article object using a local filename. From 553ec35ac6ecb3811a426bdc25c5c1050a32c3d1 Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 15 Apr 2022 14:21:45 +0200 Subject: [PATCH 02/15] from_xml constructor for Articles takes an XML string as input --- allofplos/article.py | 47 +++++++++++++++++++++++++++++------------ allofplos/plos_regex.py | 5 +++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 472fb371..3674ad24 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1214,7 +1214,7 @@ def body(self): :rtype: {str} """ - xml_tree = et.parse(self.filename) + xml_tree = et.parse(self.filepath) root = xml_tree.getroot() # limit the text to the body section @@ -1354,19 +1354,6 @@ def word_count(self): body_word_count = 0 return body_word_count - # region: review_crawling2022 - def get_subarticles(self): - """Get sub-articles embedded in the XML tree of this article. - - :rtype: list - :return: list of lxml elements that are roots of each sub-article - """ - sub_articles = self.root.findall('sub-article') - return sub_articles # TODO: return list of Articles instead? - - # endregion - - @filename.setter def filename(self, value): """Sets an article object using a local filename. @@ -1391,3 +1378,35 @@ def from_filename(cls, filename): else: directory = None return cls(filename_to_doi(filename), directory=directory) + + # region: review_crawling2022 + @classmethod + def from_xml(cls, source): + """Initiate an article object using an XML-encoded string. + Parses the XML to obtain the article's doi. + Does not change the default directory parameter, so the resulting Article has no filename associated. + """ + root = et.fromstring(source) + doi = root.find('front//article-id').text + a = Article(doi) + a.tree = root.getroottree() + return a + + @tree.setter + def tree(self, value): + """ + Set tree to the given object. + """ + assert isinstance(value, et._Element) + self._tree = value + + def get_subarticles(self): + """Get sub-articles embedded in the XML tree of this article. + + :rtype: list + :return: list of lxml elements that are roots of each sub-article + """ + sub_articles = self.root.findall('sub-article') + return sub_articles # TODO: return list of Articles instead? 
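+        # A usage sketch (editorial aside, not a line of this patch): the
+        # returned roots are plain lxml elements, e.g.
+        #     art = Article.from_xml(xml_string)
+        #     for sub in art.get_subarticles():
+        #         print(sub.get('article-type'))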
+ + # endregion \ No newline at end of file diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index a25528e9..a5ddc69a 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -9,8 +9,9 @@ newarticledir_regex = re.escape(newarticledir) regex_match_prefix = r"^10\.1371/" -regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7}$)" +regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") +regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)" @@ -19,7 +20,7 @@ r"|([a-zA-Z0-9]{32}$))") regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") -full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) +full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match+regex_suffix_match) full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) From cb4445f47dd1f78524d9313ecaebc109573fbff8 Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 15 Apr 2022 14:45:10 +0200 Subject: [PATCH 03/15] refactor body property --- allofplos/article.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 3674ad24..6d3386ec 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1214,11 +1214,8 @@ def body(self): :rtype: {str} """ - xml_tree = et.parse(self.filepath) - root = xml_tree.getroot() - # limit the text to the body section - body = root.find('./body') + body = self.root.find('./body') # remove supplementary material section for sec in body.findall('.//sec'): @@ -1384,10 +1381,11 @@ def from_filename(cls, filename): def from_xml(cls, source): """Initiate an article object using an XML-encoded string. Parses the XML to obtain the article's doi. - Does not change the default directory parameter, so the resulting Article has no filename associated. + + Does not set `self.directory` parameter, so the resulting Article may have no file associated. """ root = et.fromstring(source) - doi = root.find('front//article-id').text + doi = root.find("front//article-id[@pub-id-type='doi']").text a = Article(doi) a.tree = root.getroottree() return a @@ -1397,7 +1395,7 @@ def tree(self, value): """ Set tree to the given object. """ - assert isinstance(value, et._Element) + assert isinstance(value, et._ElementTree) # TODO better validation? 
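+        # (editorial note: `from_xml` assigns the result of `root.getroottree()`,
+        # an lxml _ElementTree, which is what the tightened check above expects)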
self._tree = value def get_subarticles(self): From 35a6df626c32cb16e9fc95fd2a0e56b520a23820 Mon Sep 17 00:00:00 2001 From: x-j Date: Sat, 16 Apr 2022 04:37:01 +0200 Subject: [PATCH 04/15] extend validating urls to include peer reviews modify and rename validate_url to accept supplementary materials --- allofplos/plos_regex.py | 37 +++++++++++++++++++--------- allofplos/samples/corpus_analysis.py | 4 +-- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index a5ddc69a..e0af5e13 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -11,7 +11,7 @@ regex_match_prefix = r"^10\.1371/" regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") -regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles +regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles (reviews and supplementary materials) regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)" @@ -25,12 +25,15 @@ "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) file_regex_match = re.compile(regex_file_search+r"\.xml") -BASE_URL = 'https://journals.plos.org/plosone/article/file?id=' -URL_SUFFIX = '&type=manuscript' -external_url_regex_match = re.compile(re.escape(BASE_URL) + - re.escape("10.1371/") + - regex_body_search + - re.escape(URL_SUFFIX)) +regex_type_match = r"(article)|(peerReview)" +regex_file_suffix = r"&type=((manuscript)|(supplementary))" + +BASE_URL = 'https://journals.plos.org/plosone/' +external_url_regex_match = re.compile(re.escape(BASE_URL) + re.escape("article/file?id=10.1371/") + + regex_body_search + regex_suffix_match + regex_file_suffix) +plos_url_regex_match = re.compile(re.escape("https://journals.plos.org/") + r"[a-z]+/" + + regex_type_match + re.escape("?id=10.1371/") + + regex_body_search + regex_suffix_match) def validate_doi(doi): @@ -58,14 +61,26 @@ def validate_filename(filename): return False -def validate_url(url): +def validate_file_url(url): """ - For an individual string, tests whether the full string is in a valid article url format or not + For an individual string, tests whether the full string is in a valid article (manuscript) url format or not Example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147&type=manuscript' is True, but 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147' is False - :return: True if string is in a valid PLOS article url; False if not + + Urls leading to files containing supplementary material are valid. + example: '' + :return: True if string is in a valid PLOS file url; False if not + """ + return bool(external_url_regex_match.match(url)) + + +def validate_plos_url(url): + """ + Tests whether the given `url` string is a valid PLOS website format. 
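+    Example: 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809'
+    should be True, as should the matching peerReview page (illustrative DOI; editorial example).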
+
+    :return: True if string is in a valid PLOS url; False otherwise
+    """
-    return bool(external_url_regex_match.search(url))
+    return bool(plos_url_regex_match.search(url))


 def find_valid_dois(doi):
diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py
index c28f9b9e..0a3855f0 100644
--- a/allofplos/samples/corpus_analysis.py
+++ b/allofplos/samples/corpus_analysis.py
@@ -17,7 +17,7 @@

 from .. import get_corpus_dir, newarticledir
-from ..plos_regex import (validate_doi, full_doi_regex_match, validate_url, validate_filename)
+from ..plos_regex import (validate_doi, full_doi_regex_match, validate_file_url, validate_filename)
 from ..transformations import (filename_to_doi, doi_to_url)
 from ..corpus.plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
                                   download_updated_xml, get_all_solr_dois,
@@ -49,7 +49,7 @@ def validate_corpus(directory=None):
     # check urls
     plos_urls = [doi_to_url(doi) for doi in plos_valid_dois]
-    plos_valid_urls = [url for url in plos_urls if validate_url(url)]
+    plos_valid_urls = [url for url in plos_urls if validate_file_url(url)]
     if set(plos_urls) == set(plos_valid_urls) and len(plos_valid_urls) == len(plos_valid_dois):
         pass
     else:

From 115fba2c515d6556ad84346437a8090d710b6104 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bartlomiej=20Zaj=C4=85c?=
Date: Wed, 20 Apr 2022 00:56:19 +0200
Subject: [PATCH 05/15] add option to not unzip, keeping zip file instead

---
 allofplos/corpus/gdrive.py      | 14 +++++++++-----
 allofplos/corpus/plos_corpus.py | 10 ++++++----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/allofplos/corpus/gdrive.py b/allofplos/corpus/gdrive.py
index f5dbfc96..1b2d9091 100644
--- a/allofplos/corpus/gdrive.py
+++ b/allofplos/corpus/gdrive.py
@@ -1,5 +1,6 @@
 import datetime
 import os
+import re
 import tarfile

 from zipfile import ZipFile, BadZipFile
@@ -57,7 +58,8 @@ def download_file_from_google_drive(id, filename, key=None, directory=None,

     if not os.path.isfile(file_path):
         session = requests.Session()
-        response = session.get(GDRIVE_URL, params={'id': id, 'resourcekey': key, 'authuser': '0', 'export': 'download'}, stream=True)
+        params = {'id': id, 'resourcekey': key, 'authuser': '0', 'export': 'download'}
+        response = session.get(GDRIVE_URL, params=params, stream=True)
         token = get_confirm_token(response)
         if token:
             params = {'id': id, 'confirm': token, 'resourcekey': key, 'authuser': '0', 'export': 'download'}
@@ -69,13 +71,15 @@ def get_confirm_token(response):
     """
     Part of keep-alive method for downloading large files from Google Drive
-    Discards packets of data that aren't the actual file
+    Discards packets of data that aren't the actual file  # the behavior of this function does not match its description
     :param response: session-based google query
-    :return: either datapacket or discard unneeded data
+    :return: either datapacket or discard unneeded data
     """
     for key, value in response.cookies.items():
         if key.startswith('download_warning'):
             return value
+    # the code above will likely not work in 2022
+    return 't'
     return None
@@ -107,7 +111,7 @@ def save_response_content(response, download_path, file_size=None):
                 f.write(chunk)


-def get_zip_metadata(method='initial'):
+def get_zip_metadata(method='initial', directory=get_corpus_dir()):
     """
     Gets metadata txt file from Google Drive, that has info about zip file
     Used to get the file name, as well as byte size for progress bar
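+    :param directory: directory the metadata txt file is saved to; defaults to `get_corpus_dir()` (editorial note documenting the new parameter)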
:return: tuple of data about zip file: date zip created, zip size, and location of metadata txt file """ if method == 'initial': - metadata_path = download_file_from_google_drive(METADATA_ID, ZIP_METADATA, key=METADATA_KEY) + metadata_path = download_file_from_google_drive(METADATA_ID, ZIP_METADATA, key=METADATA_KEY, directory=directory) with open(metadata_path) as f: zip_stats = f.read().splitlines() zip_datestring = zip_stats[0] diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index f05a35a7..e98ef400 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -575,7 +575,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination): move_articles(tempdir, destination) -def create_local_plos_corpus(directory=None, rm_metadata=True): +def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): """ Downloads a fresh copy of the PLOS corpus by: 1) creating directory if it doesn't exist @@ -583,6 +583,7 @@ def create_local_plos_corpus(directory=None, rm_metadata=True): 2) downloading the zip file (defaults to corpus directory) 3) extracting the individual XML files into the corpus directory :param directory: directory where the corpus is to be downloaded and extracted + :param unzip: whether to extract article files, or keep the zip file instead. Defaults to extracting and removing the zip file afterwards. :param rm_metadata: COMPLETE HERE :return: None """ @@ -591,9 +592,10 @@ def create_local_plos_corpus(directory=None, rm_metadata=True): if not os.path.isdir(directory): print('Creating folder for article xml') os.makedirs(directory, exist_ok=True) - zip_date, zip_size, metadata_path = get_zip_metadata() - zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size) - unzip_articles(file_path=zip_path) + zip_date, zip_size, metadata_path = get_zip_metadata(directory=directory) + zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size, directory=directory) + if unzip: + unzip_articles(file_path=zip_path) if rm_metadata: os.remove(metadata_path) From c10be09b9cea4495c5424f35a401c1571778ab4c Mon Sep 17 00:00:00 2001 From: x-j Date: Sat, 23 Apr 2022 00:26:16 +0200 Subject: [PATCH 06/15] add parameters to create_local_plos_corpus --- allofplos/corpus/plos_corpus.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index e98ef400..cbf2bfe2 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -575,7 +575,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination): move_articles(tempdir, destination) -def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): +def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delete_file=True): """ Downloads a fresh copy of the PLOS corpus by: 1) creating directory if it doesn't exist @@ -583,8 +583,9 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): 2) downloading the zip file (defaults to corpus directory) 3) extracting the individual XML files into the corpus directory :param directory: directory where the corpus is to be downloaded and extracted + :param rm_metadata: whether to remove the txt file containing metadata for the zip archive :param unzip: whether to extract article files, or keep the zip file instead. Defaults to extracting and removing the zip file afterwards. 
- :param rm_metadata: COMPLETE HERE + :param delete_file: whether to delete the compressed archive after extracting articles :return: None """ if directory is None: @@ -595,7 +596,7 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True): zip_date, zip_size, metadata_path = get_zip_metadata(directory=directory) zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size, directory=directory) if unzip: - unzip_articles(file_path=zip_path) + unzip_articles(file_path=zip_path, delete_file=delete_file) if rm_metadata: os.remove(metadata_path) From 8d33d63ccaabcc3fa1169d5fa344570a59fad5ab Mon Sep 17 00:00:00 2001 From: x-j Date: Sat, 23 Apr 2022 00:26:43 +0200 Subject: [PATCH 07/15] fix from_xml article constructor --- allofplos/article.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 6d3386ec..639c6142 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1378,15 +1378,16 @@ def from_filename(cls, filename): # region: review_crawling2022 @classmethod - def from_xml(cls, source): + def from_xml(cls, source, directory = None): """Initiate an article object using an XML-encoded string. Parses the XML to obtain the article's doi. - Does not set `self.directory` parameter, so the resulting Article may have no file associated. + :param source: string containing XML describing an article + :param directory: path to directory containing the XML for this article. Defaults to `get_corpus_dir()` via `Article().__init__`. """ root = et.fromstring(source) - doi = root.find("front//article-id[@pub-id-type='doi']").text - a = Article(doi) + doi = root.find("front//article-id[@pub-id-type='doi']").text.strip() + a = Article(doi, directory) a.tree = root.getroottree() return a @@ -1405,6 +1406,6 @@ def get_subarticles(self): :return: list of lxml elements that are roots of each sub-article """ sub_articles = self.root.findall('sub-article') - return sub_articles # TODO: return list of Articles instead? + return sub_articles # maybe return list of Articles instead? # endregion \ No newline at end of file From 982d358437d226c46081851f9c5446312c318e7c Mon Sep 17 00:00:00 2001 From: x-j Date: Mon, 25 Apr 2022 14:56:01 +0200 Subject: [PATCH 08/15] add functions to article class self.categories and self.get_author_names() --- allofplos/article.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/allofplos/article.py b/allofplos/article.py index 639c6142..9a54ac29 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -1408,4 +1408,36 @@ def get_subarticles(self): sub_articles = self.root.findall('sub-article') return sub_articles # maybe return list of Articles instead? + def get_author_names(self): + """ + Compresses the list of dicts stored in `self.authors` into a simpler list of author names. + + :rtype: list + """ + parsed_authors = [] + for author in self.authors: + if author['given_names'] is None and author['surname'] is None: + parsed_authors.append(author['group_name']) + else: + parsed_authors.append(author['given_names']+ ' ' +author['surname']) + return parsed_authors + + @property + def categories(self): + """ + Get the categories (or keywords) defined for this article. 
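+        Keywords are collected from the `article-categories` element of the
+        article's front matter, with duplicates removed.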
+ + :rtype: list + """ + keywords_set = set() # using a set because they tend to be duplicated + categories = self.root.find('.//front').find('.//article-categories') + if categories is None: + return None + + for el in categories[1:]: # skipping the first one because it's a "heading" + for subj in el.iterdescendants(): + if len(subj) == 1: keywords_set.add(subj[0].text.strip()) + return list(keywords_set) + + # endregion \ No newline at end of file From ee96a67eddcbd3357ce402a9d45ab6a2865cd56f Mon Sep 17 00:00:00 2001 From: x-j Date: Mon, 25 Apr 2022 14:56:25 +0200 Subject: [PATCH 09/15] add comments --- allofplos/corpus/plos_corpus.py | 10 +++++----- allofplos/plos_regex.py | 5 +++-- allofplos/transformations.py | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index cbf2bfe2..4606a4b7 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -582,10 +582,10 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delet 2) downloading metadata about the .zip of all PLOS XML 2) downloading the zip file (defaults to corpus directory) 3) extracting the individual XML files into the corpus directory - :param directory: directory where the corpus is to be downloaded and extracted - :param rm_metadata: whether to remove the txt file containing metadata for the zip archive - :param unzip: whether to extract article files, or keep the zip file instead. Defaults to extracting and removing the zip file afterwards. - :param delete_file: whether to delete the compressed archive after extracting articles + :param directory: directory where the corpus is to be downloaded + :param rm_metadata: whether to remove the txt file containing metadata for the zip archive. Defaults to `True` + :param unzip: whether to extract article files to corpus dir, or just keep the zip file instead. Defaults to `True` + :param delete_file: whether to delete the compressed archive after extracting articles. Defaults to `True` :return: None """ if directory is None: @@ -596,7 +596,7 @@ def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delet zip_date, zip_size, metadata_path = get_zip_metadata(directory=directory) zip_path = download_file_from_google_drive(ZIP_ID, LOCAL_ZIP, key=ZIP_KEY, file_size=zip_size, directory=directory) if unzip: - unzip_articles(file_path=zip_path, delete_file=delete_file) + unzip_articles(file_path=zip_path, extract_directory=get_corpus_dir(), delete_file=delete_file) if rm_metadata: os.remove(metadata_path) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index e0af5e13..d05cf344 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -11,7 +11,7 @@ regex_match_prefix = r"^10\.1371/" regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") -regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches sub-articles (reviews and supplementary materials) +regex_suffix_match = r"(\.[rs][0-9]{3})?" 
# matches reviews and supplementary materials
 regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
                      r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
 regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)"
@@ -68,7 +68,8 @@ def validate_file_url(url):
     but 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147' is False

     Urls leading to files containing supplementary material are valid.
-    example: ''
+    example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0222522.s002&type=supplementary' is True
+
     :return: True if string is in a valid PLOS file url; False if not
     """
     return bool(external_url_regex_match.match(url))
diff --git a/allofplos/transformations.py b/allofplos/transformations.py
index aa144aa5..1975824e 100644
--- a/allofplos/transformations.py
+++ b/allofplos/transformations.py
@@ -37,6 +37,7 @@
     'assetXMLFile': 'article/file',
     'articleMetrics': 'article/metrics',
     'articleRelated': 'article/related'}
+    # 'peerReviews': 'article/peerReview


 def _get_base_page(journal):
@@ -144,8 +145,8 @@ def url_to_doi(url):
     Example: url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \
              '10.1371/journal.pone.1000001'

-    :param url: online location of a PLOS article's XML
-    :return: full unique identifier for a PLOS article
+    :param url: online location of a PLOS article's XML (not necessarily; the base article link works too)
+    :return: full unique identifier for a PLOS article (or a peer review, supplementary material, etc.)
     """
     return url[url.index(PREFIX):].rstrip(URL_SUFFIX).rstrip(INT_URL_SUFFIX)

From cadff269f66fd462842a3e6fbcabe2eaa8c7213b Mon Sep 17 00:00:00 2001
From: x-j
Date: Tue, 21 Jun 2022 00:28:55 +0200
Subject: [PATCH 10/15] get_page handles reviews

---
 allofplos/article.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/allofplos/article.py b/allofplos/article.py
index 9a54ac29..bcc25f05 100644
--- a/allofplos/article.py
+++ b/allofplos/article.py
@@ -842,9 +842,11 @@ def get_page(self, page_type='article'):
         """Get any of the PLOS URLs associated with a particular DOI.

         Based on `get_page_base()`, which customizes the beginning URL by journal.
-        :param page_type: one of the keys in `plos_page_dict`, defaults to article
+        :param page_type: one of the keys in `plos_page_dict` or the string "reviews".
defaults to article """ BASE_LANDING_PAGE = _get_base_page(self.journal) + if page_type == "reviews": + page = BASE_LANDING_PAGE + "article/peerReview/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) try: page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) From e2d3c314f28d0001c2f4b6ac8da0b3c57adeb609 Mon Sep 17 00:00:00 2001 From: x-j Date: Thu, 23 Jun 2022 01:57:21 +0200 Subject: [PATCH 11/15] get_page now handles reviews --- allofplos/article.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index bcc25f05..459b9c74 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -847,13 +847,13 @@ def get_page(self, page_type='article'): BASE_LANDING_PAGE = _get_base_page(self.journal) if page_type == "reviews": page = BASE_LANDING_PAGE + "article/peerReview/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) - try: - page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], - self.doi) - if page_type == 'assetXMLFile': - page += URL_SUFFIX - except KeyError: - raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) + else: + try: + page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) + if page_type == 'assetXMLFile': + page += URL_SUFFIX + except KeyError: + raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) return page @property From 7d2fa18d2dba8c5f7ebf6e81710ec67b3e99a82c Mon Sep 17 00:00:00 2001 From: x-j Date: Thu, 23 Jun 2022 02:10:39 +0200 Subject: [PATCH 12/15] add peerReviews to plos_page_dict --- allofplos/article.py | 19 +++++++++---------- allofplos/transformations.py | 4 ++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index 459b9c74..12f8cf41 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -842,18 +842,17 @@ def get_page(self, page_type='article'): """Get any of the PLOS URLs associated with a particular DOI. Based on `get_page_base()`, which customizes the beginning URL by journal. - :param page_type: one of the keys in `plos_page_dict` or the string "reviews". defaults to article + :param page_type: one of the keys in `plos_page_dict`. 
defaults to article """ BASE_LANDING_PAGE = _get_base_page(self.journal) - if page_type == "reviews": - page = BASE_LANDING_PAGE + "article/peerReview/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) - else: - try: - page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) - if page_type == 'assetXMLFile': - page += URL_SUFFIX - except KeyError: - raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) + if page_type == "peerReview": + return BASE_LANDING_PAGE + "article/" + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) + try: + page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) + if page_type == 'assetXMLFile': + page += URL_SUFFIX + except KeyError: + raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) return page @property diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 1975824e..83f423ab 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -36,8 +36,8 @@ 'assetFile': 'article/file', 'assetXMLFile': 'article/file', 'articleMetrics': 'article/metrics', - 'articleRelated': 'article/related'} - # 'peerReviews': 'article/peerReview + 'articleRelated': 'article/related', + 'peerReviews': 'peerReview'} # get_page function handles peerReviews differently def _get_base_page(journal): From f01530ba8798185e30788b857c457c38d2bc0bc9 Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 24 Jun 2022 00:27:04 +0200 Subject: [PATCH 13/15] peerReviews -> peerReview --- allofplos/transformations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 83f423ab..71627176 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -37,7 +37,7 @@ 'assetXMLFile': 'article/file', 'articleMetrics': 'article/metrics', 'articleRelated': 'article/related', - 'peerReviews': 'peerReview'} # get_page function handles peerReviews differently + 'peerReview': 'peerReview'} # get_page function handles peerReview differently def _get_base_page(journal): From 4e847da0712508fb53f7b7a06dc5b5c0d731e3cc Mon Sep 17 00:00:00 2001 From: x-j Date: Fri, 24 Jun 2022 00:45:13 +0200 Subject: [PATCH 14/15] add peerReview to plos_page_dict --- allofplos/transformations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 1975824e..b84d34db 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -36,8 +36,8 @@ 'assetFile': 'article/file', 'assetXMLFile': 'article/file', 'articleMetrics': 'article/metrics', - 'articleRelated': 'article/related'} - # 'peerReviews': 'article/peerReview + 'articleRelated': 'article/related', + 'peerReview': 'article/peerReview'} def _get_base_page(journal): From 63e1a7fb54b737fe98d83df2c6496e15f3ed76aa Mon Sep 17 00:00:00 2001 From: x-j Date: Mon, 18 Jul 2022 01:38:39 +0200 Subject: [PATCH 15/15] change to os-universal paths --- tests/test_unittests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_unittests.py b/tests/test_unittests.py index 9ecff9ad..ac601fa3 100644 --- a/tests/test_unittests.py +++ b/tests/test_unittests.py @@ -92,7 +92,7 @@ def test_class_doi1(self): self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi)) 
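        # Editorial note: os.path.join builds the expected relative paths with
        # the host OS separator, so the path assertions below pass on Windows too.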
self.assertEqual(article.editor, [{'contrib_initials': 'EGL', 'given_names': 'Eric Gordon', 'surname': 'Lamb', 'group_name': None, 'ids': [], 'rid_dict': {'aff': ['edit1']}, 'contrib_type': 'editor', 'author_type': None, 'editor_type': None, 'email': None, 'affiliations': ['University of Saskatchewan, CANADA'], 'author_roles': {None: ['Editor']}, 'footnotes': []}], 'editor does not transform correctly for {}'.format(article.doi)) article_relpath = os.path.relpath(article.filepath, TESTDIR) - self.assertEqual(article_relpath, "testdata/journal.pone.0185809.xml", 'filename does not transform correctly for {}'.format(article.doi)) + self.assertEqual(article_relpath, os.path.join("testdata","journal.pone.0185809.xml"), 'filename does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809", 'page does not transform correctly for {}'.format(article.doi)) @@ -133,7 +133,7 @@ def test_example_doi(self): self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi)) article_relpath = os.path.relpath(article.filepath, TESTDIR) - self.assertEqual(article_relpath, "testdata/journal.pbio.2001413.xml", 'filename does not transform correctly for {}'.format(article.doi)) + self.assertEqual(article_relpath, os.path.join("testdata","journal.pbio.2001413.xml"), 'filename does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.journal, "PLOS Biology", 'journal does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.page, "https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001413", 'page does not transform correctly for {}'.format(article.doi)) @@ -172,7 +172,7 @@ def test_example_doi2(self): self.assertEqual(article.dtd, "NLM 3.0", 'dtd does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi)) article_relpath = os.path.relpath(article.filepath, TESTDIR) - self.assertEqual(article_relpath, "testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml", 'filename does not transform correctly for {}'.format(article.doi)) + self.assertEqual(article_relpath, os.path.join("testdata","plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml"), 'filename does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi)) self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6", 'page does not transform correctly for {}'.format(article.doi))
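
Taken together, the series gives Article an XML-first construction path plus
review-aware URL helpers. A minimal end-to-end sketch (editorial addition, not
part of the patches; it assumes all fifteen patches are applied, that
`Article.root` resolves against the tree set by `from_xml`, and that journal
lookup succeeds for a "pone" DOI; the inline XML and DOI are illustrative):

    from allofplos.article import Article
    from allofplos.plos_regex import validate_plos_url

    xml = (
        '<article>'
        '<front><article-meta>'
        '<article-id pub-id-type="doi">10.1371/journal.pone.0185809</article-id>'
        '</article-meta></front>'
        '<body/>'
        '<sub-article article-type="reviewer-report"/>'
        '</article>'
    )

    art = Article.from_xml(xml)                 # patches 02/03/07
    print(len(art.get_subarticles()))           # patch 01: one sub-article root
    url = art.get_page(page_type='peerReview')  # patches 10-14
    print(validate_plos_url(url))               # patch 04: expected True

    # Corpus-download knobs from patches 05/06 (large download, shown only):
    # create_local_plos_corpus(directory='corpus', unzip=False, rm_metadata=False)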