12 changes: 6 additions & 6 deletions README.rst
@@ -48,7 +48,7 @@ How to run the program
 
 Execute the following command.
 
-``(allofplos)$ python -m allofplos.plos_corpus``
+``(allofplos)$ python -m allofplos.update``
 
 The first time it runs it will download a >4.4 Gb zip file
 (**allofplos_xml.zip**) with all the XML files inside.
@@ -59,7 +59,7 @@ allofplos\_xml directory inside your installation of `allofplos`.
 
 If you want to see the directory on your file system where this is installed run
 
-``python -c "from allofplos.plos_regex import corpusdir; print(corpusdir)"``
+``python -c "from allofplos import get_corpus_dir; print(get_corpus_dir())"``
 
 If you ever downloaded the corpus before, it will make an incremental
 update to the existing corpus, the script checks for and then downloads
@@ -80,8 +80,8 @@ Here’s what the print statements might look like on a typical run:
 
     147 new articles to download.
     147 new articles downloaded.
-    3 corrected articles found.
-    0 corrected articles downloaded with new xml.
+    3 amended articles found.
+    0 amended articles downloaded with new xml.
     Creating new text list of uncorrected proofs from scratch.
     No new VOR articles indexed in Solr.
     17 VOR articles directly downloaded.
@@ -106,9 +106,9 @@ Should return something like this:
 
 ::
 
-    ......
+    ........
     ----------------------------------------------------------------------
-    Ran 6 tests in 3.327s
+    Ran 8 tests in 0.257s
Collaborator: This is no longer accurate after #89, could you update it?

Reply: OK
2 changes: 1 addition & 1 deletion allofplos/article_class.py
@@ -107,7 +107,7 @@ def doi(self, d):
         instantiating the article object.
         """
         if validate_doi(d) is False:
-            raise Exception("Invalid format for PLOS DOI")
+            raise Exception("Invalid format for PLOS DOI: {}".format(d))
         self.reset_memoized_attrs()
         self._doi = d

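As an aside, the richer message now echoes the offending value; a minimal sketch (the import path is assumed from the file name, and the bad DOI is hypothetical):

    from allofplos.article_class import Article

    try:
        Article('not-a-doi')  # hypothetical value, assumed to fail validate_doi()
    except Exception as e:
        print(e)  # Invalid format for PLOS DOI: not-a-doi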
82 changes: 15 additions & 67 deletions allofplos/corpus/plos_corpus.py
@@ -278,9 +278,16 @@ def compare_article_pubdate(doi, days=22, directory=None):
         print("Pubdate error in {}".format(doi))
 
 
+def download_xml(doi, tempdir=newarticledir):
mpacer (Collaborator, Feb 28, 2018): That is really clever! I hadn't thought of using an article object as a downloader… why don't we just bake this into the Article class itself?

Edit: I actually had thought about it in the context of the async-await stuff (#46) but I think I chose not to implement it because I thought I'd get pushback and that it wasn't how we were planning on using the Article class. No matter what, this is super clever in a good, straightforward code way!

"""For a given DOI, download its remote XML file to tempdir."""
art = Article(doi, directory=tempdir)
with open(art.filename, 'w', encoding='utf8') as f:
f.write(art.get_remote_xml())
return art


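A minimal usage sketch of the new helper (the DOI below is hypothetical; the default target directory is newarticledir):

    from allofplos.corpus.plos_corpus import download_xml

    art = download_xml('10.1371/journal.pone.0012345')  # hypothetical DOI
    print(art.filename)  # path of the freshly written XML under newarticledir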
 def download_updated_xml(article_file,
-                         tempdir=newarticledir,
-                         vor_check=False):
+                         tempdir=newarticledir):
     """
     For an article file, compare local XML to remote XML
     If they're different, download new version of article
@@ -305,21 +312,9 @@ def download_updated_xml(article_file,

     if articleXML_remote == articleXML_local:
         updated = False
-        get_new = False
     else:
-        get_new = True
-        if vor_check:
-            # make sure that update is to a VOR for uncorrected proof
-            get_new = False
-            if article.remote_proof == 'vor_update':
-                get_new = True
-        # else:
-        #     updated = False
-        if get_new:
-            article_new = Article(article.doi, directory=tempdir)
-            with open(article_new.filename, 'w', encoding='utf8') as f:
-                f.write(articleXML_remote)
-            updated = True
+        article_new = download_xml(article.doi, tempdir=tempdir)
+        updated = True
     return updated


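With vor_check gone, the function reduces to "re-download whenever remote and local XML differ"; a hedged usage sketch (the file path is illustrative):

    from allofplos.corpus.plos_corpus import download_updated_xml

    # Returns True only when the remote XML differed and a fresh copy was written.
    if download_updated_xml('allofplos_xml/journal.pone.0012345.xml'):
        print('Remote version differed; new XML downloaded.')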
@@ -496,10 +491,10 @@ def download_vor_updates(directory=None, tempdir=newarticledir,
     if vor_updates_available is None:
         vor_updates_available = check_for_vor_updates()
     vor_updated_article_list = []
-    for article in tqdm(vor_updates_available, disable=None):
-        updated = download_updated_xml(article, vor_check=True)
+    for doi in tqdm(vor_updates_available, disable=None):
Collaborator: Does vor_updates_available return a list of dois?

Collaborator Author: indeed it does

+        updated = download_updated_xml(doi_to_path(doi), tempdir=tempdir)
         if updated:
-            vor_updated_article_list.append(article)
+            vor_updated_article_list.append(doi)
 
     old_uncorrected_proofs = get_uncorrected_proofs()
     new_uncorrected_proofs_list = list(old_uncorrected_proofs - set(vor_updated_article_list))
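Per the exchange above, vor_updates_available holds DOIs, so the loop converts each to a local path before comparing; a sketch of the call under default settings:

    from allofplos.corpus.plos_corpus import download_vor_updates

    # Queries Solr for VOR updates when no list is supplied, then replaces any
    # local uncorrected proofs whose remote XML has changed.
    download_vor_updates()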
@@ -547,7 +542,7 @@ def remote_proofs_direct_check(tempdir=newarticledir, article_list=None):
     print("Checking directly for additional VOR updates...")
     for doi in tqdm(article_list, disable=None):
         f = doi_to_path(doi)
-        updated = download_updated_xml(f, vor_check=True)
+        updated = download_updated_xml(f)
         if updated:
             proofs_download_list.append(doi)
     if proofs_download_list:
@@ -646,50 +641,3 @@ def download_corpus_metadata_files(csv_abstracts=True, csv_no_abstracts=True, sq
     inF.close()
     outF.close()
     print("Extraction complete.")
-
-
-def main():
-    """
-    Entry point for the program. This is used when the program is used as a
-    standalone script
-    :return: None
-    """
-    directory = get_corpus_dir()
-
-    # Step 0: Initialize first copy of repository
-    try:
-        corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
-            os.path.join(directory, name))]
-    except FileNotFoundError:
-        corpus_files = []
-    if len(corpus_files) < min_files_for_valid_corpus:
-        print('Not enough articles in {}, re-downloading zip file'.format(directory))
-        # TODO: check if zip file is in top-level directory before downloading
-        create_local_plos_corpus()
-
-    # Step 1: Query solr via URL and construct DOI list
-    # Filtered by article type & scheduled for the last 14 days.
-    # Returns specific URL query & the number of search results.
-    # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
-    # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
-    print("Checking for new articles...")
-    dois_needed_list = get_dois_needed_list()
-
-    # Step 2: Download new articles
-    # For every doi in dois_needed_list, grab the accompanying XML from journal pages
-    # If no new articles, don't run any other cells
-    # Check if articles are uncorrected proofs
-    # Check if amended articles linked to new amendment articles are updated
-    # Merge new XML into folder
-    # If need to bulk download, please start here:
-    # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
-    download_check_and_move(dois_needed_list,
-                            uncorrected_proofs_text_list,
-                            tempdir=newarticledir,
-                            destination=get_corpus_dir()
-                            )
-    return None
-
-
-if __name__ == "__main__":
-    main()
8 changes: 7 additions & 1 deletion allofplos/plos_corpus.py
@@ -1,4 +1,10 @@
-from .corpus.plos_corpus import main
+import warnings
+
+from .update import main
 
 if __name__ == "__main__":
+    warnings.simplefilter('always', DeprecationWarning)
+    warnings.warn("This update method is deprecated. use 'python -m allofplos.update'",
+                  DeprecationWarning,
+                  stacklevel=2)
     main()
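Invoking the old entry point now warns before delegating; roughly (exact formatting depends on Python's warnings machinery):

    (allofplos)$ python -m allofplos.plos_corpus
    DeprecationWarning: This update method is deprecated. use 'python -m allofplos.update'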
20 changes: 10 additions & 10 deletions allofplos/transformations.py
@@ -95,14 +95,13 @@ def filename_to_doi(filename):
     :return: full unique identifier for a PLOS article
     """
     filename = os.path.basename(filename)
-    if correction in filename and validate_filename(filename):
+    if not validate_filename(filename):
+        raise Exception("Invalid format for PLOS filename: {}".format(filename))
+    elif correction in filename:
         article = 'annotation/' + filename.split('.', 4)[2]
         doi = PREFIX + article
-    elif validate_filename(filename):
+    else:
         doi = PREFIX + os.path.splitext(filename)[0]
-    # NOTE: A filename should never validate as a DOI, so the next elif is wrong.
-    elif validate_doi(filename):
-        doi = filename
     return doi


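Both surviving branches, sketched with hypothetical filenames (assuming they pass validate_filename, and that correction == 'correction' and PREFIX == '10.1371/'):

    from allofplos.transformations import filename_to_doi

    filename_to_doi('journal.pone.0012345.xml')
    # -> '10.1371/journal.pone.0012345'   (regular-article branch)
    filename_to_doi('plos.correction.3155a3e9.xml')
    # -> '10.1371/annotation/3155a3e9'    (correction/annotation branch)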
@@ -155,6 +154,8 @@ def doi_to_url(doi):
     :param doi: full unique identifier for a PLOS article
     :return: online location of a PLOS article's XML
     """
+    if validate_doi(doi) is False:
+        raise Exception("Invalid format for PLOS DOI: {}".format(doi))
     journal = Journal.doi_to_journal(doi)
     base_page = _get_base_page(journal)
     return ''.join([base_page, 'article/file?id=', doi, URL_SUFFIX])
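The new guard makes malformed DOIs fail fast instead of yielding a broken URL; for example (hypothetical input, assumed to fail validate_doi):

    from allofplos.transformations import doi_to_url

    doi_to_url('journal.pone.0012345')
    # raises Exception: Invalid format for PLOS DOI: journal.pone.0012345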
@@ -174,13 +175,12 @@ def doi_to_path(doi, directory=None):
     """
     if directory is None:
         directory = get_corpus_dir()
-    if doi.startswith(ANNOTATION_DOI) and validate_doi(doi):
+    if not validate_doi(doi):
+        raise Exception("Invalid format for PLOS DOI: {}".format(doi))
+    elif doi.startswith(ANNOTATION_DOI):
         article_file = os.path.join(directory, "plos.correction." + doi.split('/')[-1] + SUFFIX_LOWER)
-    elif validate_doi(doi):
+    else:
         article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER)
-    # NOTE: The following check is weird, a DOI should never validate as a file name.
-    elif validate_filename(doi):
-        article_file = doi
     return article_file


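The two remaining branches map DOI shapes to distinct filename patterns; a sketch with hypothetical identifiers (assuming ANNOTATION_DOI == '10.1371/annotation' and SUFFIX_LOWER == '.xml'):

    from allofplos.transformations import doi_to_path

    doi_to_path('10.1371/journal.pone.0012345')
    # -> <corpus_dir>/journal.pone.0012345.xml
    doi_to_path('10.1371/annotation/3155a3e9')
    # -> <corpus_dir>/plos.correction.3155a3e9.xml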
52 changes: 52 additions & 0 deletions allofplos/update.py
@@ -0,0 +1,52 @@
+import os
+
+from . import get_corpus_dir, newarticledir, uncorrected_proofs_text_list
mpacer (Collaborator, Mar 6, 2018): Why is uncorrected_proofs_text_list in __init__.py? Isn't it specific to the update function? Does anything else use it?

Collaborator Author: Because the .txt file is stored in the top-level directory (which made sense when plos_corpus.py was there).

Collaborator: It will be wiped out when you reinstall allofplos.

Collaborator Author: Yes, just like the corpus directory for people who have installed it in the default location. Let's make this obsolete in a future PR.
+from .corpus.plos_corpus import (create_local_plos_corpus, get_dois_needed_list, download_check_and_move,
+                                 min_files_for_valid_corpus)
+
+
+def main():
+    """
+    Entry point for the program. This is used when the program is used as a
+    standalone script
+    :return: None
+    """
+    directory = get_corpus_dir()
+
+    # Step 0: Initialize first copy of repository
+    try:
+        corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
+            os.path.join(directory, name))]
+    except FileNotFoundError:
+        corpus_files = []
+    if len(corpus_files) < min_files_for_valid_corpus:
+        print('Not enough articles in {}, re-downloading zip file'.format(directory))
+        # TODO: check if zip file is in top-level directory before downloading
+        create_local_plos_corpus()
+
+    # Step 1: Query solr via URL and construct DOI list
+    # Filtered by article type & scheduled for the last 14 days.
+    # Returns specific URL query & the number of search results.
+    # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
+    # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
+    print("Checking for new articles...")
+    dois_needed_list = get_dois_needed_list()
+
+    # Step 2: Download new articles
+    # For every doi in dois_needed_list, grab the accompanying XML from journal pages
+    # If no new articles, don't run any other cells
+    # Check if articles are uncorrected proofs
+    # Check if amended articles linked to new amendment articles are updated
+    # Merge new XML into folder
+    # If need to bulk download, please start here:
+    # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
+    download_check_and_move(dois_needed_list,
+                            uncorrected_proofs_text_list,
+                            tempdir=newarticledir,
+                            destination=get_corpus_dir()
+                            )
+    return None
+
+
+if __name__ == "__main__":
+    main()
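The new module is both importable and runnable; a short usage sketch under the package defaults:

    # Command line, as documented in the README:
    #   (allofplos)$ python -m allofplos.update
    from allofplos.update import main

    main()  # checks for new and amended articles, refreshing the corpus in get_corpus_dir()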