From bac0a72cf7816275884f1753fc8fa7f99b74744a Mon Sep 17 00:00:00 2001 From: M Pacer Date: Tue, 28 Nov 2017 20:40:53 -0800 Subject: [PATCH 01/11] use makedirs rather than try/except; handles all mkdir -p functionality --- allofplos/plos_corpus.py | 15 +++------------ allofplos/samples/corpus_analysis.py | 5 +---- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index a1da70d9..45baed0f 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -218,10 +218,7 @@ def repo_download(dois, tempdir, ignore_existing=True, plos_network=False): :param ignore_existing: Don't re-download to tempdir if already downloaded """ # make temporary directory, if needed - try: - os.mkdir(tempdir) - except FileExistsError: - pass + os.makedirs(tempdir, exist_ok=True) if ignore_existing: existing_articles = [filename_to_doi(file) for file in listdir_nohidden(tempdir)] @@ -423,10 +420,7 @@ def download_updated_xml(article_file, :return: boolean for whether update was available & downloaded """ doi = filename_to_doi(article_file) - try: - os.mkdir(tempdir) - except FileExistsError: - pass + os.makedirs(tempdir, exist_ok=True) url = URL_TMP.format(doi) articletree_remote = et.parse(url) articleXML_remote = et.tostring(articletree_remote, method='xml', encoding='unicode') @@ -693,10 +687,7 @@ def remote_proofs_direct_check(tempdir=newarticledir, article_list=None, plos_ne :param article-list: list of uncorrected proofs to check for updates. :return: list of all articles with updated vor """ - try: - os.mkdir(tempdir) - except FileExistsError: - pass + os.makedirs(tempdir, exist_ok=True) proofs_download_list = [] if article_list is None: article_list = get_uncorrected_proofs_list() diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py index 0b51541d..b1309731 100644 --- a/allofplos/samples/corpus_analysis.py +++ b/allofplos/samples/corpus_analysis.py @@ -290,10 +290,7 @@ def revisiondate_sanity_check(article_list=None, tempdir=newarticledir, director article_list = sorted(pubdates, key=pubdates.__getitem__, reverse=True) article_list = article_list[:30000] - try: - os.mkdir(tempdir) - except FileExistsError: - pass + os.makedirs(tempdir, exist_ok=True) articles_different_list = [] max_value = len(article_list) bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value) From fcb8d9b51f45158b53efe52a858c7ddb45336c77 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 00:25:30 -0800 Subject: [PATCH 02/11] Changing other pattern of os.mkdir with a print statement if not present to os.makedirs --- allofplos/plos_corpus.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 45baed0f..c5717197 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -857,9 +857,7 @@ def create_local_plos_corpus(corpusdir=corpusdir, rm_metadata=True): :param rm_metadata: COMPLETE HERE :return: None """ - if os.path.isdir(corpusdir) is False: - os.mkdir(corpusdir) - print('Creating folder for article xml') + os.makedirs(tempdir, exist_ok=True) zip_date, zip_size, metadata_path = get_zip_metadata() zip_path = download_file_from_google_drive(zip_id, local_zip, file_size=zip_size) unzip_articles(file_path=zip_path) @@ -876,9 +874,7 @@ def create_test_plos_corpus(corpusdir=corpusdir): :param corpusdir: directory where the corpus is to be downloaded and extracted :return: None """ - if os.path.isdir(corpusdir) is 
False: - os.mkdir(corpusdir) - print('Creating folder for article xml') + os.makedirs(tempdir, exist_ok=True) zip_path = download_file_from_google_drive(test_zip_id, local_test_zip) unzip_articles(file_path=zip_path, extract_directory=corpusdir) From 17ee2e3ae0340bb376043bf6b8f8a5fd627a08f4 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 12:48:30 -0800 Subject: [PATCH 03/11] add class method from_bytes for creating articles from byte strings --- allofplos/article_class.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/allofplos/article_class.py b/allofplos/article_class.py index 864baf6d..d78aa038 100644 --- a/allofplos/article_class.py +++ b/allofplos/article_class.py @@ -2,9 +2,12 @@ import re import subprocess +from io import BytesIO + import lxml.etree as et import requests + from allofplos.transformations import (filename_to_doi, EXT_URL_TMP, INT_URL_TMP, BASE_URL_ARTICLE_LANDING_PAGE) from allofplos.plos_regex import (validate_doi, corpusdir) @@ -1097,3 +1100,27 @@ def from_filename(cls, filename): """Initiate an article object using a local XML file. """ return cls(filename_to_doi(filename)) + + @classmethod + def from_bytes(cls, resp, directory=corpusdir, write=False, overwrite=True): + tree = et.parse(BytesIO(resp)) + root = tree.getroot() + tag_path = ["/", + "article", + "front", + "article-meta", + "article-id"] + tag_location = '/'.join(tag_path) + article_ids = root.xpath(tag_location) + for art_id in article_ids: + if art_id.get('pub-id-type')=='doi': + temp = cls(art_id.text, directory=directory) + temp._tree = tree + if write and (not os.path.isfile(temp.filename) or overwrite): + with open(temp.filename, 'w') as file: + file.write(et.tostring(tree, method='xml', encoding='unicode')) + break + return temp + + + From 38be2b1bdb8ded52be419d9004f201e9f245bfef Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 12:49:36 -0800 Subject: [PATCH 04/11] basic async file getting utilities with timing comparisons --- fetch_test.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 fetch_test.py diff --git a/fetch_test.py b/fetch_test.py new file mode 100644 index 00000000..673da9c5 --- /dev/null +++ b/fetch_test.py @@ -0,0 +1,118 @@ +import asyncio +import aiohttp +import requests +import time +import os +import shutil + + +import lxml.etree as et +from timeit import default_timer + +from allofplos.plos_corpus import listdir_nohidden +from allofplos.plos_regex import ALLOFPLOS_DIR_PATH, corpusdir +from allofplos.transformations import URL_TMP, url_to_doi +from allofplos.samples.corpus_analysis import get_all_local_dois +from allofplos import Article + +begin_time = default_timer() + +ASYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "async_test") +MIN_FILES = 9990 +NUM_FILES = 10 + +async def fetch(url, session): + """Fetch a url, using specified ClientSession.""" + fetch.start_time[url] = default_timer() + async with session.get(url) as response: + resp = await response.read() + article = Article.from_bytes(resp, + directory=ASYNC_DIRECTORY, + write=True, + overwrite=True) + now = default_timer() + elapsed = now - fetch.start_time[url] + # print('{0:5.2f} {1:30}{2:5.2} '.format(now, url, elapsed)) + return article + +async def fetch_all(dois, max_rate=1.0, limit_per_host=3.0): + """Launch requests for all web pages.""" + tasks = [] + fetch.start_time = dict() # dictionary of start times for each url + conn = aiohttp.TCPConnector(limit_per_host=limit_per_host) + async 
with aiohttp.ClientSession(connector=conn) as session: + for doi in dois: + await asyncio.sleep(max_rate) # ensures no more requests than max_rate per second + task = asyncio.ensure_future( + fetch(URL_TMP.format(doi), session)) + tasks.append(task) # create list of tasks + + first_batch = await asyncio.gather(*tasks) # gather task responses + corrected_dois = [article.related_doi + for article in first_batch + if article.type_=="correction"] + for doi in corrected_dois: + await asyncio.sleep(max_rate) # ensures no more requests than max_rate per second + task = asyncio.ensure_future( + fetch(URL_TMP.format(doi), session)) + tasks.append(task) # create list of tasks + + second_batch = await asyncio.gather(*tasks) # gather task responses + + + # -------------- TOTAL SECONDS: 178.59 + +def sequential_fetch(doi): + "Fetch individual web pages as part of a sequence" + url = URL_TMP.format(doi) + response = requests.get(url) + time.sleep(1) + article = Article.from_bytes(response.text.encode('utf-8'), + directory=ASYNC_DIRECTORY, + write=True) + return article + +def demo_sequential(dois): + """Fetch list of web pages sequentially.""" + handle_dir() + start_time = default_timer() + for doi in dois: + start_time_url = default_timer() + article = sequential_fetch(doi) + now = default_timer() + elapsed = now - start_time_url + if article.type_ == "correction": + new_article = sequential_fetch(article.related_doi) + + # print('{0:5.2f} {1:30}{2:5.2f} '.format(now, url, elapsed)) + + tot_elapsed = default_timer() - start_time + print(' TOTAL SECONDS: '.rjust(30, '-') + '{0:5.2f} '. \ + format(tot_elapsed, '\n')) + + +def demo_async(dois): + handle_dir() + start_time = default_timer() + loop = asyncio.get_event_loop() # event loop + future = asyncio.ensure_future(fetch_all(dois)) # tasks to do + loop.run_until_complete(future) # loop until done + loop.run_until_complete(asyncio.sleep(0)) + loop.close() + tot_elapsed = default_timer() - start_time + print(' TOTAL SECONDS: '.rjust(30, '-') + '{0:5.2f} '. 
\ + format(tot_elapsed, '\n')) + +def main(): + + dois = get_all_local_dois(corpusdir)[MIN_FILES:MIN_FILES+NUM_FILES] + + demo_sequential(dois) + demo_async(dois) + +def handle_dir(): + if os.path.isdir(ASYNC_DIRECTORY): + shutil.rmtree(ASYNC_DIRECTORY) + os.makedirs(ASYNC_DIRECTORY, exist_ok=True) +if __name__ == '__main__': + main() From 0eca201bf43a7a66e2570ad57301ddfd5e5159a9 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 13:26:53 -0800 Subject: [PATCH 05/11] require python 3.5 or greater --- setup.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 00c25d96..8152ab80 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,9 @@ import sys if sys.version_info.major < 3: - sys.exit('Sorry, Python < 3.4 is not supported') -elif sys.version_info.minor < 4: - sys.exit('Sorry, Python < 3.4 is not supported') + sys.exit('Sorry, Python < 3.5 is not supported') +elif sys.version_info.minor < 5: + sys.exit('Sorry, Python < 3.5 is not supported') here = path.abspath(path.dirname(__file__)) @@ -27,7 +27,6 @@ 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ], @@ -50,7 +49,7 @@ 'tqdm==4.17.1', 'urllib3==1.22', ], - python_requires='>=3.4', + python_requires='>=3.5', # If there are data files included in your packages that need to be # installed, specify them here. If using Python 2.6 or less, then these # have to be included in MANIFEST.in as well. From 5cdd28f015836365df98780657b15c9c71227282 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 13:29:22 -0800 Subject: [PATCH 06/11] add MIN_DELAY as configurable for testing on plos network --- fetch_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fetch_test.py b/fetch_test.py index 673da9c5..652ba521 100644 --- a/fetch_test.py +++ b/fetch_test.py @@ -17,9 +17,10 @@ begin_time = default_timer() +MIN_DELAY = 1.0 ASYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "async_test") MIN_FILES = 9990 -NUM_FILES = 10 +NUM_FILES = 10 async def fetch(url, session): """Fetch a url, using specified ClientSession.""" @@ -35,7 +36,7 @@ async def fetch(url, session): # print('{0:5.2f} {1:30}{2:5.2} '.format(now, url, elapsed)) return article -async def fetch_all(dois, max_rate=1.0, limit_per_host=3.0): +async def fetch_all(dois, max_rate=MIN_DELAY, limit_per_host=3.0): """Launch requests for all web pages.""" tasks = [] fetch.start_time = dict() # dictionary of start times for each url @@ -66,7 +67,7 @@ def sequential_fetch(doi): "Fetch individual web pages as part of a sequence" url = URL_TMP.format(doi) response = requests.get(url) - time.sleep(1) + time.sleep(MIN_DELAY) article = Article.from_bytes(response.text.encode('utf-8'), directory=ASYNC_DIRECTORY, write=True) From f85990dcf2de29c0c0b521e711411189e479507a Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 13:33:33 -0800 Subject: [PATCH 07/11] move fetch test inside async_utils package --- allofplos/async_utils/__init__.py | 0 fetch_test.py => allofplos/async_utils/fetch_test.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 allofplos/async_utils/__init__.py rename fetch_test.py => allofplos/async_utils/fetch_test.py (100%) diff --git a/allofplos/async_utils/__init__.py b/allofplos/async_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fetch_test.py 
b/allofplos/async_utils/fetch_test.py similarity index 100% rename from fetch_test.py rename to allofplos/async_utils/fetch_test.py From 07ffabcd44d03315fc91a4983d3e73029b5c0f09 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Wed, 29 Nov 2017 13:39:39 -0800 Subject: [PATCH 08/11] open correct directories with makedirs --- allofplos/plos_corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index c5717197..1093da6b 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -857,7 +857,7 @@ def create_local_plos_corpus(corpusdir=corpusdir, rm_metadata=True): :param rm_metadata: COMPLETE HERE :return: None """ - os.makedirs(tempdir, exist_ok=True) + os.makedirs(corpusdir, exist_ok=True) zip_date, zip_size, metadata_path = get_zip_metadata() zip_path = download_file_from_google_drive(zip_id, local_zip, file_size=zip_size) unzip_articles(file_path=zip_path) @@ -874,7 +874,7 @@ def create_test_plos_corpus(corpusdir=corpusdir): :param corpusdir: directory where the corpus is to be downloaded and extracted :return: None """ - os.makedirs(tempdir, exist_ok=True) + os.makedirs(corpusdir, exist_ok=True) zip_path = download_file_from_google_drive(test_zip_id, local_test_zip) unzip_articles(file_path=zip_path, extract_directory=corpusdir) From 7e38b3ec85c4d558203ae6586c27e3555ea9fddb Mon Sep 17 00:00:00 2001 From: M Pacer Date: Thu, 30 Nov 2017 18:15:25 -0800 Subject: [PATCH 09/11] build two directories so that they can be compared --- allofplos/async_utils/fetch_test.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/allofplos/async_utils/fetch_test.py b/allofplos/async_utils/fetch_test.py index 652ba521..8ffca305 100644 --- a/allofplos/async_utils/fetch_test.py +++ b/allofplos/async_utils/fetch_test.py @@ -18,7 +18,8 @@ begin_time = default_timer() MIN_DELAY = 1.0 -ASYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "async_test") +ASYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "async_test_dir") +SYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "sync_test_dir") MIN_FILES = 9990 NUM_FILES = 10 @@ -75,7 +76,7 @@ def sequential_fetch(doi): def demo_sequential(dois): """Fetch list of web pages sequentially.""" - handle_dir() + recreate_dir(SYNC_DIRECTORY) start_time = default_timer() for doi in dois: start_time_url = default_timer() @@ -93,7 +94,7 @@ def demo_sequential(dois): def demo_async(dois): - handle_dir() + recreate_dir(ASYNC_DIRECTORY) start_time = default_timer() loop = asyncio.get_event_loop() # event loop future = asyncio.ensure_future(fetch_all(dois)) # tasks to do @@ -103,17 +104,19 @@ def demo_async(dois): tot_elapsed = default_timer() - start_time print(' TOTAL SECONDS: '.rjust(30, '-') + '{0:5.2f} '. \ format(tot_elapsed, '\n')) + +def recreate_dir(directory): + """Removes and recreates the directory. 
+ """ + if os.path.isdir(directory): + shutil.rmtree(directory) + os.makedirs(directory, exist_ok=True) def main(): dois = get_all_local_dois(corpusdir)[MIN_FILES:MIN_FILES+NUM_FILES] - demo_sequential(dois) demo_async(dois) -def handle_dir(): - if os.path.isdir(ASYNC_DIRECTORY): - shutil.rmtree(ASYNC_DIRECTORY) - os.makedirs(ASYNC_DIRECTORY, exist_ok=True) if __name__ == '__main__': main() From 4b18b03db2c20e3bd0fe0d6a571d0e8deaa12648 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Thu, 30 Nov 2017 18:18:04 -0800 Subject: [PATCH 10/11] make the inner functions accept dois (not urls) --- allofplos/async_utils/fetch_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/allofplos/async_utils/fetch_test.py b/allofplos/async_utils/fetch_test.py index 8ffca305..3a712e71 100644 --- a/allofplos/async_utils/fetch_test.py +++ b/allofplos/async_utils/fetch_test.py @@ -23,9 +23,10 @@ MIN_FILES = 9990 NUM_FILES = 10 -async def fetch(url, session): """Fetch a url, using specified ClientSession.""" fetch.start_time[url] = default_timer() +async def fetch(doi, session): + url = URL_TMP.format(doi) async with session.get(url) as response: resp = await response.read() article = Article.from_bytes(resp, @@ -45,8 +46,7 @@ async def fetch_all(dois, max_rate=MIN_DELAY, limit_per_host=3.0): async with aiohttp.ClientSession(connector=conn) as session: for doi in dois: await asyncio.sleep(max_rate) # ensures no more requests than max_rate per second - task = asyncio.ensure_future( - fetch(URL_TMP.format(doi), session)) + task = asyncio.ensure_future(fetch(doi, session)) tasks.append(task) # create list of tasks first_batch = await asyncio.gather(*tasks) # gather task responses @@ -55,8 +55,7 @@ async def fetch_all(dois, max_rate=MIN_DELAY, limit_per_host=3.0): if article.type_=="correction"] for doi in corrected_dois: await asyncio.sleep(max_rate) # ensures no more requests than max_rate per second - task = asyncio.ensure_future( - fetch(URL_TMP.format(doi), session)) + task = asyncio.ensure_future(fetch(doi, session)) tasks.append(task) # create list of tasks second_batch = await asyncio.gather(*tasks) # gather task responses From a4a946c4c54e8e3731dc0ff712c52f14a1725fd5 Mon Sep 17 00:00:00 2001 From: M Pacer Date: Thu, 30 Nov 2017 18:26:15 -0800 Subject: [PATCH 11/11] doc improvements and a bunch of cleanup --- allofplos/async_utils/fetch_test.py | 62 +++++++++++++++++++---------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/allofplos/async_utils/fetch_test.py b/allofplos/async_utils/fetch_test.py index 3a712e71..c21a200d 100644 --- a/allofplos/async_utils/fetch_test.py +++ b/allofplos/async_utils/fetch_test.py @@ -15,17 +15,22 @@ from allofplos.samples.corpus_analysis import get_all_local_dois from allofplos import Article -begin_time = default_timer() +MIN_DELAY = 1.0 # minimum for wait before beginning the next http-request (in s) +MIN_FILES = 9990 # index of the files to start with +NUM_FILES = 10 # how many files do you process -MIN_DELAY = 1.0 ASYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "async_test_dir") SYNC_DIRECTORY = os.path.join(ALLOFPLOS_DIR_PATH, "sync_test_dir") -MIN_FILES = 9990 -NUM_FILES = 10 - """Fetch a url, using specified ClientSession.""" - fetch.start_time[url] = default_timer() async def fetch(doi, session): + """Given a doi, fetch the associated url, using the given asynchronous + session (a ClientSession) as a context manager. + + Returns the article created by transforming the content of the response. 
+
+    NB: This needs to do better error handling if the url fails or points to an
+    invalid xml file.
+    """
     url = URL_TMP.format(doi)
     async with session.get(url) as response:
         resp = await response.read()
@@ -33,15 +38,18 @@ async def fetch(doi, session):
                                      directory=ASYNC_DIRECTORY,
                                      write=True,
                                      overwrite=True)
-    now = default_timer()
-    elapsed = now - fetch.start_time[url]
-    # print('{0:5.2f} {1:30}{2:5.2} '.format(now, url, elapsed))
     return article
 
 async def fetch_all(dois, max_rate=MIN_DELAY, limit_per_host=3.0):
-    """Launch requests for all web pages."""
+    """Launch requests for each doi.
+
+    This first fetches all of the dois passed in.
+
+    Then it checks for the existence of dois that are corrected articles that
+    should also be downloaded.
+    """
+
     tasks = []
-    fetch.start_time = dict() # dictionary of start times for each url
     conn = aiohttp.TCPConnector(limit_per_host=limit_per_host)
     async with aiohttp.ClientSession(connector=conn) as session:
         for doi in dois:
@@ -61,10 +69,17 @@ async def fetch_all(dois, max_rate=MIN_DELAY, limit_per_host=3.0):
 
     second_batch = await asyncio.gather(*tasks) # gather task responses
 
-
 # -------------- TOTAL SECONDS: 178.59
 
 def sequential_fetch(doi):
-    "Fetch individual web pages as part of a sequence"
+    """
+    Fetch urls on the basis of the doi being passed in as part of a sequential
+    process.
+
+    Returns the article created by transforming the content of the response.
+
+    NB: This needs to do better error handling if the url fails or points to an
+    invalid xml file.
+    """
     url = URL_TMP.format(doi)
     response = requests.get(url)
     time.sleep(MIN_DELAY)
@@ -74,31 +89,36 @@ def sequential_fetch(doi):
     return article
 
 def demo_sequential(dois):
-    """Fetch list of web pages sequentially."""
+    """Organises the process of downloading articles associated with dois
+    to SYNC_DIRECTORY sequentially.
+
+    Side-effect: prints a timer to indicate how long it took.
+    """
     recreate_dir(SYNC_DIRECTORY)
     start_time = default_timer()
     for doi in dois:
         start_time_url = default_timer()
        article = sequential_fetch(doi)
-        now = default_timer()
-        elapsed = now - start_time_url
         if article.type_ == "correction":
             new_article = sequential_fetch(article.related_doi)
-
-        # print('{0:5.2f} {1:30}{2:5.2f} '.format(now, url, elapsed))
-
+
     tot_elapsed = default_timer() - start_time
     print(' TOTAL SECONDS: '.rjust(30, '-') + '{0:5.2f} '. \
           format(tot_elapsed, '\n'))
 
 
 def demo_async(dois):
+    """Organises the process of downloading articles associated with the dois
+    to ASYNC_DIRECTORY using asynchronous functionality.
+
+    Side-effect: prints a timer to indicate how long it took.
+    """
     recreate_dir(ASYNC_DIRECTORY)
     start_time = default_timer()
     loop = asyncio.get_event_loop() # event loop
     future = asyncio.ensure_future(fetch_all(dois)) # tasks to do
     loop.run_until_complete(future) # loop until done
-    loop.run_until_complete(asyncio.sleep(0))
+    loop.run_until_complete(asyncio.sleep(0))
     loop.close()
     tot_elapsed = default_timer() - start_time
     print(' TOTAL SECONDS: '.rjust(30, '-') + '{0:5.2f} '. \
           format(tot_elapsed, '\n'))
-
+
 def recreate_dir(directory):
     """Removes and recreates the directory.
     """
     if os.path.isdir(directory):
         shutil.rmtree(directory)
     os.makedirs(directory, exist_ok=True)
 
 def main():
+    """Main loop for running and comparing the different approaches.
+    """
 
     dois = get_all_local_dois(corpusdir)[MIN_FILES:MIN_FILES+NUM_FILES]
     demo_sequential(dois)