From ba62e988fbfc23f43d02261f1436471fbb165096 Mon Sep 17 00:00:00 2001 From: Sourcery AI Date: Sun, 10 Dec 2023 16:34:39 +0000 Subject: [PATCH] 'Refactored by Sourcery' --- app.py | 54 +-- arxiv_public_data/authors.py | 43 +-- arxiv_public_data/config.py | 19 +- arxiv_public_data/embeddings/tf_hub.py | 6 +- arxiv_public_data/embeddings/util.py | 2 +- arxiv_public_data/fulltext.py | 29 +- arxiv_public_data/internal_citations.py | 13 +- arxiv_public_data/oai_metadata.py | 45 +-- arxiv_public_data/pdfstamp.py | 27 +- arxiv_public_data/regex_arxiv.py | 99 ++--- arxiv_public_data/s3_bulk_download.py | 49 ++- arxiv_public_data/slice_pdfs.py | 16 +- src/Surveyor.py | 489 ++++++++++++------------ 13 files changed, 424 insertions(+), 467 deletions(-) diff --git a/app.py b/app.py index 9c318fe..9fd492a 100644 --- a/app.py +++ b/app.py @@ -73,30 +73,32 @@ class ArxivIDsModel(BaseModel): ) if __name__ == '__main__': - st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always') - st.title('Auto-Research') - st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections' - '(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).') - st.write('##### Data Provider: arXiv Open Archive Initiative OAI') - st.write('##### GitHub: https://github.com/sidphbot/Auto-Research') - download_placeholder = st.container() - - with st.sidebar.form(key="survey_keywords_form"): - session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel) - st.write('or') - session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel)) - submit = st.form_submit_button(label="Submit") - st.sidebar.write('#### execution log:') - - run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write), - 'download_placeholder':download_placeholder} - if submit: - if session_data['research_keywords'] != '': - run_kwargs.update({'research_keywords':session_data['research_keywords'], - 'max_search':session_data['max_search'], - 'num_papers':session_data['num_papers']}) - elif session_data['arxiv_ids'] != '': - run_kwargs.update({'arxiv_ids':[id.strip() for id in session_data['arxiv_ids'].split(',')]}) - - run_survey(**run_kwargs) + st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always') + st.title('Auto-Research') + st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections' + '(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).') + st.write('##### Data Provider: arXiv Open Archive Initiative OAI') + st.write('##### GitHub: https://github.com/sidphbot/Auto-Research') + download_placeholder = st.container() + + with st.sidebar.form(key="survey_keywords_form"): + session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel) + st.write('or') + session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel)) + submit = st.form_submit_button(label="Submit") + st.sidebar.write('#### execution log:') + + run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write), + 'download_placeholder':download_placeholder} + if submit: + if session_data['research_keywords'] != '': + run_kwargs.update({'research_keywords':session_data['research_keywords'], + 'max_search':session_data['max_search'], + 'num_papers':session_data['num_papers']}) + 
elif session_data['arxiv_ids'] != '': + run_kwargs['arxiv_ids'] = [ + id.strip() for id in session_data['arxiv_ids'].split(',') + ] + + run_survey(**run_kwargs) diff --git a/arxiv_public_data/authors.py b/arxiv_public_data/authors.py index 955f044..a8b7a23 100644 --- a/arxiv_public_data/authors.py +++ b/arxiv_public_data/authors.py @@ -160,7 +160,7 @@ def _parse_author_affil_split(author_line: str) -> Dict: 2), match.group(3), match.group(4)) author_entry = [s, match.group(1), ''] elif mtype == 'name-prefix-name': - s = '{} {}'.format(match.group(2), match.group(3)) + s = f'{match.group(2)} {match.group(3)}' author_entry = [s, match.group(1), ''] elif mtype == 'name-name-prefix': author_entry = [match.group(2), match.group(1), match.group(3)] @@ -197,9 +197,8 @@ def _remove_double_commas(items: List[str]) -> List[str]: for pt in items: if pt == ',' and last == ',': continue - else: - parts.append(pt) - last = pt + parts.append(pt) + last = pt return parts @@ -210,13 +209,12 @@ def _tidy_name(name: str) -> str: return name -def _collaboration_at_start(names: List[str]) \ - -> Tuple[List[str], List[List[str]], int]: +def _collaboration_at_start(names: List[str]) -> Tuple[List[str], List[List[str]], int]: """Perform special handling of collaboration at start.""" author_list = [] back_propagate_affiliations_to = 0 - while len(names) > 0: + while names: m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))', names[0], flags=re.IGNORECASE) if not m: @@ -228,13 +226,13 @@ def _collaboration_at_start(names: List[str]) \ # Remove from names names.pop(0) # Also swallow and following comma or colon - if names and (names[0] == ',' or names[0] == ':'): + if names and names[0] in [',', ':']: names.pop(0) return names, author_list, back_propagate_affiliations_to -def _enum_collaboration_at_end(author_line: str)->Dict: +def _enum_collaboration_at_end(author_line: str) -> Dict: """Get separate set of enumerated affiliations from end of author_line.""" # Now see if we have a separate set of enumerated affiliations # This is indicated by finding '(\s*(' @@ -247,9 +245,7 @@ def _enum_collaboration_at_end(author_line: str)->Dict: # Now expect to have '1) affil1 (2) affil2 (3) affil3' for affil in affils.split('('): - # Now expect `1) affil1 ', discard if no match - m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil) - if m: + if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil): enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2)) return enumaffils @@ -266,7 +262,7 @@ def _add_affiliation(author_line: str, Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2) """ en = re.escape(name) - namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*')) + namerex = f"{en.replace(' ', 's*')}\s*\(([^\(\)]+)" m = re.search(namerex, author_line, flags=re.IGNORECASE) if not m: return author_entry @@ -341,21 +337,19 @@ def split_authors(authors: str) -> List: for bit in aus: if bit == '': continue - if bit == '(': # track open parentheses + if bit == '(': depth += 1 if depth == 1: blocks.append(c) c = '(' else: c = c + bit - elif bit == ')': # track close parentheses + elif bit == ')': depth -= 1 c = c + bit if depth == 0: blocks.append(c) c = '' - else: # haven't closed, so keep accumulating - continue else: c = c + bit if c: @@ -373,8 +367,7 @@ def split_authors(authors: str) -> List: for name in names: if not name: continue - name = name.rstrip().lstrip() - if name: + if name := name.rstrip().lstrip(): listx.append(name) # Recombine suffixes that were separated with a comma @@ -386,7 +379,7 @@ 
def split_authors(authors: str) -> List: and not re.match(r'\)$', parts[-2]): separator = parts.pop() last = parts.pop() - recomb = "{}{} {}".format(last, separator, p) + recomb = f"{last}{separator} {p}" parts.append(recomb) else: parts.append(p) @@ -429,7 +422,7 @@ def _parse_article_authors(article_author): try: return [article_author[0], parse_author_affil_utf(article_author[1])] except Exception as e: - msg = "Author split failed for article {}".format(article_author[0]) + msg = f"Author split failed for article {article_author[0]}" logger.error(msg) logger.exception(e) return [article_author[0], ''] @@ -455,15 +448,13 @@ def parse_authorline_parallel(article_authors, n_processes=None): [ author3_keyname, author3_firstnames, author1_suffix ] ] """ - logger.info( - 'Parsing author lines for {} articles...'.format(len(article_authors)) - ) + logger.info(f'Parsing author lines for {len(article_authors)} articles...') pool = Pool(n_processes) parsed = pool.map(_parse_article_authors, article_authors) - outdict = {aid: auth for aid, auth in parsed} + outdict = dict(parsed) filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz') - logger.info('Saving to {}'.format(filename)) + logger.info(f'Saving to {filename}') with gzip.open(filename, 'wb') as fout: fout.write(json.dumps(outdict).encode('utf-8')) diff --git a/arxiv_public_data/config.py b/arxiv_public_data/config.py index 7cfbd41..ff7ba04 100644 --- a/arxiv_public_data/config.py +++ b/arxiv_public_data/config.py @@ -22,18 +22,17 @@ def get_outdir(): """ if os.environ.get(KEY): out = os.environ.get(KEY) - else: - if os.path.exists(JSONFILE): - js = json.load(open(JSONFILE)) - if not KEY in js: - logger.warn('Configuration in "{}" invalid, using default'.format(JSONFILE)) - logger.warn("default output directory is {}".format(DEFAULT_PATH)) - out = DEFAULT_PATH - else: - out = js[KEY] + elif os.path.exists(JSONFILE): + js = json.load(open(JSONFILE)) + if KEY in js: + out = js[KEY] else: - logger.warn("default output directory is {}".format(DEFAULT_PATH)) + logger.warn(f'Configuration in "{JSONFILE}" invalid, using default') + logger.warn(f"default output directory is {DEFAULT_PATH}") out = DEFAULT_PATH + else: + logger.warn(f"default output directory is {DEFAULT_PATH}") + out = DEFAULT_PATH return out try: diff --git a/arxiv_public_data/embeddings/tf_hub.py b/arxiv_public_data/embeddings/tf_hub.py index bf06d94..4d8e4ea 100644 --- a/arxiv_public_data/embeddings/tf_hub.py +++ b/arxiv_public_data/embeddings/tf_hub.py @@ -61,7 +61,7 @@ def elmo_strings(batches, filename, batchsize=32): for i, batch in enumerate(batches): # grab mean-pooling of contextualized word reps - logger.info("Computing/saving batch {}".format(i)) + logger.info(f"Computing/saving batch {i}") with open(filename, 'ab') as fout: pickle.dump(sess.run( embeddings, feed_dict={text_input: batch} @@ -125,7 +125,7 @@ def universal_sentence_encoder_lite(batches, filename, spm_path, batchsize=32): sess.run(init_op) for i, batch in enumerate(batches): values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, batch) - logger.info("Computing/saving batch {}".format(i)) + logger.info(f"Computing/saving batch {i}") emb = sess.run( embeddings, feed_dict={ @@ -180,6 +180,6 @@ def create_save_embeddings(batches, filename, encoder, headers=[], encoder_args= for h in headers: pickle.dump(h, fout) - logger.info("Saving embeddings to {}".format(savename)) + logger.info(f"Saving embeddings to {savename}") encoder(batches, savename, *encoder_args, **encoder_kwargs) diff --git 
a/arxiv_public_data/embeddings/util.py b/arxiv_public_data/embeddings/util.py index 9b56ffa..5bc9c65 100644 --- a/arxiv_public_data/embeddings/util.py +++ b/arxiv_public_data/embeddings/util.py @@ -39,7 +39,7 @@ def id_to_pathname(aid): """ if '.' in aid: # new style ArXiv ID yymm = aid.split('.')[0] - return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt') + return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, f'{aid}.txt') # old style ArXiv ID cat, arxiv_id = re.split(r'(\d+)', aid)[:2] diff --git a/arxiv_public_data/fulltext.py b/arxiv_public_data/fulltext.py index a147d8a..d5bfadc 100644 --- a/arxiv_public_data/fulltext.py +++ b/arxiv_public_data/fulltext.py @@ -24,7 +24,7 @@ def reextension(filename: str, extension: str) -> str: """ Give a filename a new extension """ name, _ = os.path.splitext(filename) - return '{}.{}'.format(name, extension) + return f'{name}.{extension}' def average_word_length(txt): @@ -43,8 +43,7 @@ def average_word_length(txt): #txt = re.subn(RE_REPEATS, '', txt)[0] nw = len(txt.split()) nc = len(txt) - avgw = nc / (nw + 1) - return avgw + return nc / (nw + 1) def process_timeout(cmd, timeout): @@ -71,7 +70,7 @@ def run_pdf2txt(pdffile: str, timelimit: int=TIMELIMIT, options: str=''): output : str Full plain text output """ - log.debug('Running {} on {}'.format(PDF2TXT, pdffile)) + log.debug(f'Running {PDF2TXT} on {pdffile}') tmpfile = reextension(pdffile, 'pdf2txt') cmd = '{cmd} {options} -o "{output}" "{pdf}"'.format( @@ -101,7 +100,7 @@ def run_pdftotext(pdffile: str, timelimit: int = TIMELIMIT) -> str: output : str Full plain text output """ - log.debug('Running {} on {}'.format(PDFTOTEXT, pdffile)) + log.debug(f'Running {PDFTOTEXT} on {pdffile}') tmpfile = reextension(pdffile, 'pdftotxt') cmd = '{cmd} "{pdf}" "{output}"'.format( @@ -161,7 +160,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT): raise FileNotFoundError(pdffile) if os.stat(pdffile).st_size == 0: # file is empty - raise RuntimeError('"{}" is an empty file'.format(pdffile)) + raise RuntimeError(f'"{pdffile}" is an empty file') try: output = run_pdftotext(pdffile, timelimit=timelimit) @@ -188,9 +187,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT): wordlength = average_word_length(output) if wordlength > 45: - raise RuntimeError( - 'No accurate text could be extracted from "{}"'.format(pdffile) - ) + raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"') try: os.remove(reextension(pdffile, 'pdftotxt')) # remove the tempfile @@ -255,8 +252,8 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT): globber = os.path.join(path, '*.pdf') pdffiles = sorted_files(globber) - log.info('Searching "{}"...'.format(globber)) - log.info('Found: {} pdfs'.format(len(pdffiles))) + log.info(f'Searching "{globber}"...') + log.info(f'Found: {len(pdffiles)} pdfs') for pdffile in pdffiles: txtfile = reextension(pdffile, 'txt') @@ -271,7 +268,7 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT): with open(txtfile, 'w') as f: f.write(text) except Exception as e: - log.error("Conversion failed for '{}'".format(pdffile)) + log.error(f"Conversion failed for '{pdffile}'") log.exception(e) continue @@ -297,8 +294,8 @@ def convert_directory_parallel(path: str, processes: int, timelimit: int = TIMEL globber = os.path.join(path, '**/*.pdf') # search expression for glob.glob pdffiles = sorted_files(globber) # a list of path - log.info('Searching "{}"...'.format(globber)) - log.info('Found: {} pdfs'.format(len(pdffiles))) + log.info(f'Searching "{globber}"...') + 
log.info(f'Found: {len(pdffiles)} pdfs') pool = Pool(processes=processes) result = pool.map(partial(convert_safe, timelimit=timelimit), pdffiles) @@ -311,7 +308,7 @@ def convert_safe(pdffile: str, timelimit: int = TIMELIMIT): try: convert(pdffile, timelimit=timelimit) except Exception as e: - log.error('File conversion failed for {}: {}'.format(pdffile, e)) + log.error(f'File conversion failed for {pdffile}: {e}') def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str: @@ -332,7 +329,7 @@ def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str: Location of text file. """ if not os.path.exists(path): - raise RuntimeError('No such path: %s' % path) + raise RuntimeError(f'No such path: {path}') outpath = reextension(path, 'txt') if os.path.exists(outpath): diff --git a/arxiv_public_data/internal_citations.py b/arxiv_public_data/internal_citations.py index 3ab715a..5bade7b 100644 --- a/arxiv_public_data/internal_citations.py +++ b/arxiv_public_data/internal_citations.py @@ -33,10 +33,7 @@ def all_articles(directory=DIR_FULLTEXT): directory = os.path.abspath(os.path.expanduser(directory)) for root, dirs, files in os.walk(directory): - for f in files: - if 'txt' in f: - out.append(os.path.join(root, f)) - + out.extend(os.path.join(root, f) for f in files if 'txt' in f) return out def extract_references(filename, pattern=RE_FLEX): @@ -75,12 +72,12 @@ def citation_list_inner(articles): cites = {} for i, article in enumerate(articles): if i > 0 and i % 1000 == 0: - log.info('Completed {} articles'.format(i)) + log.info(f'Completed {i} articles') try: refs = extract_references(article) cites[path_to_id(article)] = refs except: - log.error("Error in {}".format(article)) + log.error(f"Error in {article}") continue return cites @@ -100,7 +97,7 @@ def citation_list_parallel(N=cpu_count(), directory=DIR_FULLTEXT): all arXiv citations in all articles """ articles = all_articles(directory) - log.info('Calculating citation network for {} articles'.format(len(articles))) + log.info(f'Calculating citation network for {len(articles)} articles') pool = Pool(N) @@ -123,6 +120,6 @@ def default_filename(): def save_to_default_location(citations): filename = default_filename() - log.info('Saving to "{}"'.format(filename)) + log.info(f'Saving to "{filename}"') with gzip.open(filename, 'wb') as fn: fn.write(json.dumps(citations).encode('utf-8')) diff --git a/arxiv_public_data/oai_metadata.py b/arxiv_public_data/oai_metadata.py index 3f98716..66baf9b 100644 --- a/arxiv_public_data/oai_metadata.py +++ b/arxiv_public_data/oai_metadata.py @@ -72,27 +72,24 @@ def get_list_record_chunk(resumptionToken=None, harvest_url=URL_ARXIV_OAI, if response.status_code == 200: return response.text - if response.status_code == 503: - secs = int(response.headers.get('Retry-After', 20)) * 1.5 - log.info('Requested to wait, waiting {} seconds until retry...'.format(secs)) - - time.sleep(secs) - return get_list_record_chunk(resumptionToken=resumptionToken) - else: + if response.status_code != 503: raise Exception( - 'Unknown error in HTTP request {}, status code: {}'.format( - response.url, response.status_code - ) + f'Unknown error in HTTP request {response.url}, status code: {response.status_code}' ) + secs = int(response.headers.get('Retry-After', 20)) * 1.5 + log.info(f'Requested to wait, waiting {secs} seconds until retry...') + + time.sleep(secs) + return get_list_record_chunk(resumptionToken=resumptionToken) def _record_element_text(elm, name): """ XML helper function for extracting text 
from leaf (single-node) elements """ - item = elm.find('arXiv:{}'.format(name), OAI_XML_NAMESPACES) + item = elm.find(f'arXiv:{name}', OAI_XML_NAMESPACES) return item.text if item is not None else None def _record_element_all(elm, name): """ XML helper function for extracting text from queries with multiple nodes """ - return elm.findall('arXiv:{}'.format(name), OAI_XML_NAMESPACES) + return elm.findall(f'arXiv:{name}', OAI_XML_NAMESPACES) def parse_record(elm): """ @@ -160,9 +157,7 @@ def check_xml_errors(root): error = root.find('OAI:error', OAI_XML_NAMESPACES) if error is not None: - raise RuntimeError( - 'OAI service returned error: {}'.format(error.text) - ) + raise RuntimeError(f'OAI service returned error: {error.text}') def find_default_locations(): outfile = os.path.join(DIR_BASE, 'arxiv-metadata-oai-*.json.gz') @@ -172,9 +167,7 @@ def find_default_locations(): fn_outfile = sorted(glob.glob(outfile)) fn_resume = sorted(glob.glob(resume)) - if len(fn_outfile) > 0: - return fn_outfile[-1] - return None + return fn_outfile[-1] if len(fn_outfile) > 0 else None def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True): """ @@ -195,28 +188,26 @@ def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True): date = str(datetime.datetime.now()).split(' ')[0] outfile = ( - outfile or # user-supplied - find_default_locations() or # already in progress - os.path.join( - DIR_BASE, 'arxiv-metadata-oai-{}.json.gz'.format(date) - ) # new file + outfile + or find_default_locations() + or os.path.join(DIR_BASE, f'arxiv-metadata-oai-{date}.json.gz') ) directory = os.path.split(outfile)[0] if directory and not os.path.exists(directory): os.makedirs(directory) - tokenfile = '{}-resumptionToken.txt'.format(outfile) + tokenfile = f'{outfile}-resumptionToken.txt' chunk_index = 0 total_records = 0 - log.info('Saving metadata to "{}"'.format(outfile)) + log.info(f'Saving metadata to "{outfile}"') resumptionToken = None if autoresume: try: resumptionToken = open(tokenfile, 'r').read() except Exception as e: - log.warn("No tokenfile found '{}'".format(tokenfile)) + log.warn(f"No tokenfile found '{tokenfile}'") log.info("Starting download from scratch...") while True: @@ -277,6 +268,6 @@ def validate_abstract_hashes(metadata, metadata_no_abstract): """ Validate that abstracts match the hashes """ for m, n in zip(metadata, metadata_no_abstract): md5 = hashlib.md5(m['abstract'].encode()).hexdigest() - if not md5 == n['abstract_md5']: + if md5 != n['abstract_md5']: return False return True diff --git a/arxiv_public_data/pdfstamp.py b/arxiv_public_data/pdfstamp.py index d8ea220..9cab987 100644 --- a/arxiv_public_data/pdfstamp.py +++ b/arxiv_public_data/pdfstamp.py @@ -1,19 +1,14 @@ import re SPACE_DIGIT = r'\s*\d\s*' -SPACE_NUMBER = r'(?:{})+'.format(SPACE_DIGIT) SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*' -SPACE_WORD = r'(?:{})+'.format(SPACE_CHAR) +SPACE_WORD = f'(?:{SPACE_CHAR})+' # old style ID, 7 digits in a row RE_NUM_OLD = SPACE_DIGIT*7 # new style ID, 4 digits, ., 4,5 digits -RE_NUM_NEW = ( - SPACE_DIGIT*4 + - r'\.' + - SPACE_DIGIT*4 + r'(?:{})?'.format(SPACE_DIGIT) -) +RE_NUM_NEW = SPACE_DIGIT * 4 + r'\.' + SPACE_DIGIT * 4 + f'(?:{SPACE_DIGIT})?' # the version part v1 V2 v 1, etc RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?' 
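As a quick illustration of why every fragment above interleaves `\s*` with single characters: the arXiv stamp text that pdftotext recovers often comes back with stray whitespace between characters, and these pieces are evidently written to absorb that. A minimal, self-contained sketch (the `candidate` string below is a hypothetical extraction, not taken from this module):

import re

SPACE_DIGIT = r'\s*\d\s*'
RE_NUM_NEW = SPACE_DIGIT * 4 + r'\.' + SPACE_DIGIT * 4 + f'(?:{SPACE_DIGIT})?'

# A hypothetical pdftotext extraction of a new-style ID with scattered spacing.
candidate = ' 1 6 1 2 . 0 0 0 1 '
print(bool(re.fullmatch(RE_NUM_NEW, candidate)))     # True -- whitespace between digits is tolerated
print(bool(re.fullmatch(RE_NUM_NEW, '1612.00001')))  # True -- a compact ID still matches

The full RE_ARXIV_ID assembled in the next hunk strings these pieces together with the "a r X i v :" prefix, the category brackets, and the date part.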
@@ -22,20 +17,20 @@ RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*' # any words within square brackets [cs.A I] -RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD) +RE_CATEGORIES = f'\[{SPACE_WORD}\]' +SPACE_NUMBER = f'(?:{SPACE_DIGIT})+' # two digit date, month, year "29 Jan 2012" RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){}'.format(SPACE_DIGIT, '{2,4}') # the full identifier for the banner RE_ARXIV_ID = ( - RE_ARXIV + - r'(?:' + - r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) + - r')' + - RE_VERSION + - RE_CATEGORIES + - RE_DATE + f'{RE_ARXIV}(?:' + + f'(?:{RE_NUM_NEW})|(?:{RE_NUM_OLD})' + + r')' + + RE_VERSION + + RE_CATEGORIES + + RE_DATE ) REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID) @@ -51,7 +46,7 @@ def _extract_arxiv_stamp(txt): return txt, '' s, e = match.span() - return '{} {}'.format(txt[:s].strip(), txt[e:].strip()), txt[s:e].strip() + return f'{txt[:s].strip()} {txt[e:].strip()}', txt[s:e].strip() def remove_stamp(txt, split=1000): diff --git a/arxiv_public_data/regex_arxiv.py b/arxiv_public_data/regex_arxiv.py index 2e620fe..76a7a63 100644 --- a/arxiv_public_data/regex_arxiv.py +++ b/arxiv_public_data/regex_arxiv.py @@ -45,19 +45,18 @@ def strip_version(name): def format_cat(name): """ Strip subcategory, add hyphen to category name if missing """ - if '/' in name: # OLD ID, names contains subcategory - catsubcat, aid = name.split('/') - cat = catsubcat.split('.')[0] - return dashdict.get(cat, cat) + "/" + aid - else: + if '/' not in name: return name + catsubcat, aid = name.split('/') + cat = catsubcat.split('.')[0] + return f"{dashdict.get(cat, cat)}/{aid}" def zeropad_1501(name): """ Arxiv IDs after yymm=1501 are padded to 5 zeros """ - if not '/' in name: # new ID + if '/' not in name: # new ID yymm, num = name.split('.') if int(yymm) > 1500 and len(num) < 5: - return yymm + ".0" + num + return f"{yymm}.0{num}" return name def clean(name): @@ -89,10 +88,10 @@ def clean(name): RE_NUM_OLD = RE_DATE + r'(?:\d{3})' + RE_VERSION # matches: 1612.00001 1203.0023v2 -RE_ID_NEW = r'(?:{})'.format(RE_NUM_NEW) +RE_ID_NEW = f'(?:{RE_NUM_NEW})' # matches: hep-th/11030234 cs/0112345v2 cs.AI/0112345v2 -RE_ID_OLD = r'(?:{}/{})'.format(RE_CATEGORIES, RE_NUM_OLD) +RE_ID_OLD = f'(?:{RE_CATEGORIES}/{RE_NUM_OLD})' # ============================================================================= # matches: https://arxiv.org/abs/ abs/ arxiv.org/abs/ @@ -109,60 +108,64 @@ def clean(name): RE_PREFIX_ARXIV = r'(?i:arxiv\s*[:/\s,.]*\s*)' # matches: cs.AI/ cs.AI nucl-th -RE_PREFIX_CATEGORIES = r'(?i:{})'.format(RE_CATEGORIES) +RE_PREFIX_CATEGORIES = f'(?i:{RE_CATEGORIES})' # matches: e-prints: e-print eprints: RE_PREFIX_EPRINT = r'(?i:e[-]?print[s]?.{1,3})' # ============================================================================= # matches simple old or new identifiers, no fancy business -REGEX_ARXIV_SIMPLE = r'(?:{}|{})'.format(RE_ID_OLD, RE_ID_NEW) +REGEX_ARXIV_SIMPLE = f'(?:{RE_ID_OLD}|{RE_ID_NEW})' # this one follows the guide set forth by: # https://arxiv.org/help/arxiv_identifier REGEX_ARXIV_STRICT = ( - r'(?:{})'.format(RE_PREFIX_ARXIV) + - r'(?:' - r'({})'.format(RE_ID_OLD) + - r'|' - r'({})'.format(RE_ID_NEW) + - r')' -) + (f'(?:{RE_PREFIX_ARXIV})' + f'(?:({RE_ID_OLD})') + f'|({RE_ID_NEW})' +) + r')' # this regex essentially accepts anything that looks like an arxiv id and has # the slightest smell of being one as well. that is, if it is an id and # mentions anything about the arxiv before hand, then it is an id. 
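Before the flexible pattern is assembled below, a short usage sketch of the two patterns already defined above. The import path assumes this repository's layout (`arxiv_public_data/regex_arxiv.py`), and the sample strings are illustrative only:

import re
from arxiv_public_data.regex_arxiv import REGEX_ARXIV_SIMPLE, REGEX_ARXIV_STRICT

text = 'see 1612.00001 for details'
print(bool(re.search(REGEX_ARXIV_SIMPLE, text)))                # True  -- a bare new-style ID is enough
print(bool(re.search(REGEX_ARXIV_STRICT, 'arXiv:1612.00001')))  # True  -- the strict form wants the arXiv prefix
print(bool(re.search(REGEX_ARXIV_STRICT, text)))                # False -- no prefix, so the strict pattern rejects it

The flexible pattern defined next accepts the same IDs plus URL, e-print, and bracketed forms, which is why internal_citations.py appears to rely on it (as RE_FLEX) for reference mining.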
REGEX_ARXIV_FLEXIBLE = ( - r'(?:' - r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture - r')|(?:' - r'(?:' - r'(?:{})?'.format(RE_PREFIX_URL) + - r'(?:{})?'.format(RE_PREFIX_EPRINT) + - r'(?:' - r'(?:{})?'.format(RE_PREFIX_ARXIV) + - r'({})'.format(RE_ID_OLD) + # capture - r'|' - r'(?:{})'.format(RE_PREFIX_ARXIV) + - r'(?:{}/)?'.format(RE_CATEGORIES) + - r'({})'.format(RE_ID_NEW) + # capture - r')' - r')' - r'|' - r'(?:' - r'(?:{})|'.format(RE_PREFIX_URL) + - r'(?:{})|'.format(RE_PREFIX_EPRINT) + - r'(?:{})|'.format(RE_PREFIX_CATEGORIES) + - r'(?:{})'.format(RE_PREFIX_ARXIV) + - r')' - r'.*?' - r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture - r')|(?:' - r'(?:[\[\(]\s*)' - r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture - r'(?:\s*[\]\)])' - r')' -) + ( + ( + ( + ( + ( + ( + ( + ( + ( + ( + ( + ( + f'(?:({REGEX_ARXIV_SIMPLE})' + + f')|(?:(?:(?:{RE_PREFIX_URL})?' + ) + + f'(?:{RE_PREFIX_EPRINT})?' + ) + + f'(?:(?:{RE_PREFIX_ARXIV})?' + ) + + f'({RE_ID_OLD})' + ) + + f'|(?:{RE_PREFIX_ARXIV})' + ) + + f'(?:{RE_CATEGORIES}/)?' + ) + + f'({RE_ID_NEW})' + ) + + f'))|(?:(?:{RE_PREFIX_URL})|' + ) + + f'(?:{RE_PREFIX_EPRINT})|' + ) + + f'(?:{RE_PREFIX_CATEGORIES})|' + ) + + f'(?:{RE_PREFIX_ARXIV})' + ) + + f').*?({REGEX_ARXIV_SIMPLE})' + ) + + f')|(?:(?:[\[\(]\s*)({REGEX_ARXIV_SIMPLE})' +) + r'(?:\s*[\]\)])' r')' TEST_POSITIVE = [ 'arXiv:quant-ph 1503.01017v3', diff --git a/arxiv_public_data/s3_bulk_download.py b/arxiv_public_data/s3_bulk_download.py index 12a0524..14293e5 100644 --- a/arxiv_public_data/s3_bulk_download.py +++ b/arxiv_public_data/s3_bulk_download.py @@ -102,17 +102,17 @@ def download_file(filename, outfile, chunk_size=CHUNK_SIZE, redownload=False, } ) if not dryrun: - logger.info('Requesting "{}" (costs money!)'.format(filename)) + logger.info(f'Requesting "{filename}" (costs money!)') request = requests.get(url, stream=True) response_iter = request.iter_content(chunk_size=chunk_size) - logger.info("\t Writing {}".format(outfile)) + logger.info(f"\t Writing {outfile}") with gzip.open(outfile, 'wb') as fout: - for i, chunk in enumerate(response_iter): + for chunk in response_iter: fout.write(chunk) md5.update(chunk) else: - logger.info('Requesting "{}" (free!)'.format(filename)) - logger.info("\t Writing {}".format(outfile)) + logger.info(f'Requesting "{filename}" (free!)') + logger.info(f"\t Writing {outfile}") return md5.hexdigest() def default_manifest_filename(): @@ -159,7 +159,7 @@ def parse_manifest(manifest): ] def _tar_to_filename(filename): - return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz' + return f'{os.path.join(DIR_PDFTARS, os.path.basename(filename))}.gz' def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=False): """ Download filename, check its md5sum, and form the output path """ @@ -170,9 +170,7 @@ def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=Fals if not dryrun: if md5_expected != md5_downloaded: - msg = "MD5 '{}' does not match expected '{}' for file '{}'".format( - md5_downloaded, md5_expected, filename - ) + msg = f"MD5 '{md5_downloaded}' does not match expected '{md5_expected}' for file '{filename}'" raise AssertionError(msg) return outname @@ -195,13 +193,12 @@ def download_check_tarfiles(list_of_fileinfo, dryrun=False): def call(cmd, dryrun=False, debug=False): """ Spawn a subprocess and execute the string in cmd """ - if dryrun: - logger.info(cmd) - return 0 - else: + if not dryrun: return subprocess.check_call( shlex.split(cmd), stderr=None if debug else open(os.devnull, 'w') ) + 
logger.info(cmd) + return 0 def _make_pathname(filename): """ @@ -235,7 +232,7 @@ def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False, outname = _tar_to_filename(filename) if not os.path.exists(outname): - msg = 'Tarfile from manifest not found {}, skipping...'.format(outname) + msg = f'Tarfile from manifest not found {outname}, skipping...' logger.error(msg) return @@ -245,7 +242,7 @@ def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False, cmd = 'tar --one-top-level -C {} -xf {} {}' cmd = cmd.format(DIR_PDFTARS, outname, namelist) else: - cmd = 'tar --one-top-level -C {} -xf {}'.format(DIR_PDFTARS, outname) + cmd = f'tar --one-top-level -C {DIR_PDFTARS} -xf {outname}' _call(cmd, dryrun) basename = os.path.splitext(os.path.basename(filename))[0] @@ -257,18 +254,18 @@ def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False, ) # move txt into final file structure - txtfiles = glob.glob('{}/*.txt'.format(pdfdir)) + txtfiles = glob.glob(f'{pdfdir}/*.txt') for tf in txtfiles: mvfn = _make_pathname(tf) dirname = os.path.dirname(mvfn) if not os.path.exists(dirname): - _call('mkdir -p {}'.format(dirname), dryrun) + _call(f'mkdir -p {dirname}', dryrun) if not dryrun: shutil.move(tf, mvfn) # clean up pdfs - _call('rm -rf {}'.format(os.path.join(DIR_PDFTARS, basename)), dryrun) + _call(f'rm -rf {os.path.join(DIR_PDFTARS, basename)}', dryrun) def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processes=1): """ @@ -295,10 +292,10 @@ def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processe md5sum = fileinfo['md5sum'] if check_if_any_processed(fileinfo): - logger.info('Tar file appears processed, skipping {}...'.format(filename)) + logger.info(f'Tar file appears processed, skipping {filename}...') return - logger.info('Processing tar "{}" ...'.format(filename)) + logger.info(f'Processing tar "{filename}" ...') process_tarfile_inner(filename, pdfnames=None, processes=processes, dryrun=dryrun) def process_manifest_files(list_of_fileinfo, processes=1, dryrun=False): @@ -341,9 +338,9 @@ def generate_tarfile_indices(manifest): for fileinfo in manifest: name = fileinfo['filename'] - logger.info("Indexing {}...".format(name)) + logger.info(f"Indexing {name}...") - tarname = os.path.join(DIR_PDFTARS, os.path.basename(name))+'.gz' + tarname = f'{os.path.join(DIR_PDFTARS, os.path.basename(name))}.gz' files = [i for i in tarfile.open(tarname).getnames() if i.endswith('.pdf')] index[name] = files @@ -356,7 +353,7 @@ def check_missing_txt_files(index): """ missing = defaultdict(list) for tar, pdflist in index.items(): - logger.info("Checking {}...".format(tar)) + logger.info(f"Checking {tar}...") for pdf in pdflist: txt = _make_pathname(pdf).replace('.pdf', '.txt') @@ -371,12 +368,10 @@ def rerun_missing(missing, processes=1): files which are missing from the conversion. There are various reasons that they can fail. 
""" - sort = list(reversed( - sorted([(k, v) for k, v in missing.items()], key=lambda x: len(x[1])) - )) + sort = list(reversed(sorted(list(missing.items()), key=lambda x: len(x[1])))) for tar, names in sort: - logger.info("Running {} ({} to do)...".format(tar, len(names))) + logger.info(f"Running {tar} ({len(names)} to do)...") process_tarfile_inner( tar, pdfnames=names, processes=processes, timelimit=5 * fulltext.TIMELIMIT diff --git a/arxiv_public_data/slice_pdfs.py b/arxiv_public_data/slice_pdfs.py index 1b99f80..199685a 100644 --- a/arxiv_public_data/slice_pdfs.py +++ b/arxiv_public_data/slice_pdfs.py @@ -8,10 +8,10 @@ def id_to_tarpdf(n): if '.' in n: ym = n.split('.')[0] - return '{}/{}.pdf'.format(ym, n) + return f'{ym}/{n}.pdf' else: ym = n.split('/')[1][:4] - return '{}/{}.pdf'.format(ym, n.replace('/', '')) + return f"{ym}/{n.replace('/', '')}.pdf" def _call(cmd, dryrun=False, debug=False): """ Spawn a subprocess and execute the string in cmd """ @@ -20,7 +20,7 @@ def _call(cmd, dryrun=False, debug=False): ) def _tar_to_filename(filename): - return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz' + return f'{os.path.join(DIR_PDFTARS, os.path.basename(filename))}.gz' def extract_files(tarfile, pdfs, outdir): """ @@ -34,9 +34,9 @@ def extract_files(tarfile, pdfs, outdir): tdir = os.path.join(DIR_PDFTARS, basename) outpdfs = ' '.join([os.path.join(tdir, id_to_tarpdf(i)) for i in pdfs]) - cmd0 = 'tar --one-top-level -C {} -xf {} {}'.format(DIR_PDFTARS, outname, namelist) - cmd1 = 'cp -a {} {}'.format(outpdfs, outdir) - cmd2 = 'rm -rf {}'.format(tdir) + cmd0 = f'tar --one-top-level -C {DIR_PDFTARS} -xf {outname} {namelist}' + cmd1 = f'cp -a {outpdfs} {outdir}' + cmd2 = f'rm -rf {tdir}' _call(cmd0) _call(cmd1) @@ -56,9 +56,9 @@ def call_list(ai, manifest): num = 0 for i in ai: aid = i.get('id') - + tar = id_to_tarpdf(aid) - if not tar in inv: + if tar not in inv: continue tars[inv[id_to_tarpdf(aid)]].append(aid) diff --git a/src/Surveyor.py b/src/Surveyor.py index 3d0657e..b3dbcab 100644 --- a/src/Surveyor.py +++ b/src/Surveyor.py @@ -116,7 +116,7 @@ def __init__( self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name, trust_remote_code=True).to(self.torch_device) self.title_model.eval() if not no_save_models: - self.title_model.save_pretrained(models_dir + "/title_model") + self.title_model.save_pretrained(f"{models_dir}/title_model") #self.title_tokenizer.save_pretrained(models_dir + "/title_tokenizer") # summary model @@ -127,10 +127,10 @@ def __init__( self.torch_device) self.summ_model.eval() if not no_save_models: - self.summ_model.save_pretrained(models_dir + "/summ_model") + self.summ_model.save_pretrained(f"{models_dir}/summ_model") #self.summ_tokenizer.save_pretrained(models_dir + "/summ_tokenizer") self.model = Summarizer(custom_model=self.summ_model, custom_tokenizer=self.summ_tokenizer) - + if 'led' in ledmodel_name: self.ledtokenizer = LEDTokenizer.from_pretrained(ledmodel_name) self.ledmodel = LEDForConditionalGeneration.from_pretrained(ledmodel_name).to(self.torch_device) @@ -142,40 +142,49 @@ def __init__( self.ledmodel = BartForConditionalGeneration.from_pretrained(ledmodel_name).to(self.torch_device) self.ledmodel.eval() if not no_save_models: - self.ledmodel.save_pretrained(models_dir + "/ledmodel") + self.ledmodel.save_pretrained(f"{models_dir}/ledmodel") #self.ledtokenizer.save_pretrained(models_dir + "/ledtokenizer") self.embedder = SentenceTransformer(embedder_name) self.embedder.eval() if not no_save_models: - 
self.embedder.save(models_dir + "/embedder") + self.embedder.save(f"{models_dir}/embedder") else: self.print_fn("\n- Initializing from previously saved models at" + models_dir) self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name) - self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device) + self.title_model = AutoModelForSeq2SeqLM.from_pretrained( + f"{models_dir}/title_model" + ).to(self.torch_device) self.title_model.eval() # summary model #self.summ_config = AutoConfig.from_pretrained(ex_summ_model_name) #self.summ_config.output_hidden_states = True self.summ_tokenizer = AutoTokenizer.from_pretrained(ex_summ_model_name) - self.summ_model = AutoModel.from_pretrained(models_dir + "/summ_model").to( - self.torch_device) + self.summ_model = AutoModel.from_pretrained( + f"{models_dir}/summ_model" + ).to(self.torch_device) self.summ_model.eval() self.model = Summarizer(custom_model=self.summ_model, custom_tokenizer=self.summ_tokenizer) if 'led' in ledmodel_name: self.ledtokenizer = LEDTokenizer.from_pretrained(ledmodel_name) - self.ledmodel = LEDForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device) + self.ledmodel = LEDForConditionalGeneration.from_pretrained( + f"{models_dir}/ledmodel" + ).to(self.torch_device) elif 't5' in ledmodel_name: self.ledtokenizer = AutoTokenizer.from_pretrained(ledmodel_name) - self.ledmodel = T5ForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device) + self.ledmodel = T5ForConditionalGeneration.from_pretrained( + f"{models_dir}/ledmodel" + ).to(self.torch_device) elif 'bart' in ledmodel_name: self.ledtokenizer = AutoTokenizer.from_pretrained(ledmodel_name) - self.ledmodel = BartForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device) + self.ledmodel = BartForConditionalGeneration.from_pretrained( + f"{models_dir}/ledmodel" + ).to(self.torch_device) self.ledmodel.eval() - self.embedder = SentenceTransformer(models_dir + "/embedder") + self.embedder = SentenceTransformer(f"{models_dir}/embedder") self.embedder.eval() self.nlp = spacy.load(nlp_name) @@ -185,33 +194,13 @@ def __init__( def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None): - if pdf_dir: - survey_pdf_dir = pdf_dir - else: - survey_pdf_dir = self.DEFAULTS["pdf_dir"] - - if txt_dir: - survey_txt_dir = txt_dir - else: - survey_txt_dir = self.DEFAULTS["txt_dir"] - - if img_dir: - survey_img_dir = img_dir - else: - survey_img_dir = self.DEFAULTS["img_dir"] - - if tab_dir: - survey_tab_dir = tab_dir - else: - survey_tab_dir = self.DEFAULTS["tab_dir"] - - if dump_dir: - survey_dump_dir = dump_dir - else: - survey_dump_dir = self.DEFAULTS["dump_dir"] - + survey_pdf_dir = pdf_dir if pdf_dir else self.DEFAULTS["pdf_dir"] + survey_txt_dir = txt_dir if txt_dir else self.DEFAULTS["txt_dir"] + survey_img_dir = img_dir if img_dir else self.DEFAULTS["img_dir"] + survey_tab_dir = tab_dir if tab_dir else self.DEFAULTS["tab_dir"] + survey_dump_dir = dump_dir if dump_dir else self.DEFAULTS["dump_dir"] dirs = [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir] - if sum([True for dir in dirs if 'arxiv_data/' in dir]): + if sum(True for dir in dirs if 'arxiv_data/' in dir): base = os.path.dirname("arxiv_data/") if not os.path.exists(base): os.mkdir(base) @@ -251,7 +240,7 @@ def pdf_route(self, pdf_dir, txt_dir, img_dir, tab_dir, dump_dir, papers_meta): self.print_fn("\n- Second stage 
paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers])) papers.extend(new_papers) - joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp') + joblib.dump(papers, f'{dump_dir}papers_extracted_pdf_route.dmp') copy_tree(img_dir, dump_dir + os.path.basename(img_dir)) copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir)) @@ -296,7 +285,7 @@ def fetch_papers(self, dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir, rep # plugging citations to our papers object self.print_fn("\n- plugging in citation network.. ") papers, cites = self.cocitation_network(papers, txt_dir) - joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp') + joblib.dump(papers, f'{dump_dir}papers_selected_pdf_route.dmp') from distutils.dir_util import copy_tree copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir)) copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir)) @@ -335,90 +324,89 @@ def build_doc(self, research_sections, papers, query=None, filename='survey.txt' bibentries = [r.bibtex() for r in bibentries] self.print_fn("\n- building final survey file .. at "+ filename) - file = open(filename, 'w+') - if query is None: - query = 'Internal(existing) research' - self.survey_print_fn("#### Generated_survey:") - file.write("----------------------------------------------------------------------") - file.write("Title: A survey on " + query) - self.survey_print_fn("") - self.survey_print_fn("----------------------------------------------------------------------") - self.survey_print_fn("Title: A survey on " + query) - file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)") - self.survey_print_fn("Author: Auto-Research (github.com/sidphbot/Auto-Research)") - file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") - self.survey_print_fn("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") - file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+ - "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ - "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ - "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ - "\nentries(only to avoid LaTex overhead). ") - self.survey_print_fn("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+ - "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ - "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ - "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ - "\nentries(only to avoid LaTex overhead). 
") - file.write("----------------------------------------------------------------------") - self.survey_print_fn("----------------------------------------------------------------------") - file.write("") - self.survey_print_fn("") - file.write('ABSTRACT') - self.survey_print_fn('ABSTRACT') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(research_sections['abstract']) - self.survey_print_fn(research_sections['abstract']) - file.write("") - self.survey_print_fn("") - file.write('INTRODUCTION') - self.survey_print_fn('INTRODUCTION') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(research_sections['introduction']) - self.survey_print_fn(research_sections['introduction']) - file.write("") - self.survey_print_fn("") - for k, v in research_sections.items(): - if k not in ['abstract', 'introduction', 'conclusion']: - file.write(k.upper()) - self.survey_print_fn(k.upper()) - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(v) - self.survey_print_fn(v) - file.write("") - self.survey_print_fn("") - file.write('CONCLUSION') - self.survey_print_fn('CONCLUSION') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(research_sections['conclusion']) - self.survey_print_fn(research_sections['conclusion']) - file.write("") - self.survey_print_fn("") - - file.write('REFERENCES') - self.survey_print_fn('REFERENCES') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - for entry in bibentries: - file.write(entry) - self.survey_print_fn(entry) + with open(filename, 'w+') as file: + if query is None: + query = 'Internal(existing) research' + self.survey_print_fn("#### Generated_survey:") + file.write("----------------------------------------------------------------------") + file.write(f"Title: A survey on {query}") + self.survey_print_fn("") + self.survey_print_fn("----------------------------------------------------------------------") + self.survey_print_fn(f"Title: A survey on {query}") + file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)") + self.survey_print_fn("Author: Auto-Research (github.com/sidphbot/Auto-Research)") + file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") + self.survey_print_fn("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") + file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+ + "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ + "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ + "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ + "\nentries(only to avoid LaTex overhead). ") + self.survey_print_fn("Disclaimer: This survey is intended to be a research starter. 
This Survey is Machine-Summarized, "+ + "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ + "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ + "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ + "\nentries(only to avoid LaTex overhead). ") + file.write("----------------------------------------------------------------------") + self.survey_print_fn("----------------------------------------------------------------------") + file.write("") + self.survey_print_fn("") + file.write('ABSTRACT') + self.survey_print_fn('ABSTRACT') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(research_sections['abstract']) + self.survey_print_fn(research_sections['abstract']) + file.write("") + self.survey_print_fn("") + file.write('INTRODUCTION') + self.survey_print_fn('INTRODUCTION') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(research_sections['introduction']) + self.survey_print_fn(research_sections['introduction']) + file.write("") + self.survey_print_fn("") + for k, v in research_sections.items(): + if k not in ['abstract', 'introduction', 'conclusion']: + file.write(k.upper()) + self.survey_print_fn(k.upper()) + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(v) + self.survey_print_fn(v) + file.write("") + self.survey_print_fn("") + file.write('CONCLUSION') + self.survey_print_fn('CONCLUSION') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(research_sections['conclusion']) + self.survey_print_fn(research_sections['conclusion']) file.write("") self.survey_print_fn("") - self.survey_print_fn("========================XXX=========================") - file.write("========================XXX=========================") - file.close() + + file.write('REFERENCES') + self.survey_print_fn('REFERENCES') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + for entry in bibentries: + file.write(entry) + self.survey_print_fn(entry) + file.write("") + self.survey_print_fn("") + self.survey_print_fn("========================XXX=========================") + file.write("========================XXX=========================") def build_basic_blocks(self, corpus_known_sections, corpus): @@ -430,8 +418,8 @@ def build_basic_blocks(self, corpus_known_sections, corpus): with torch.no_grad(): summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5) res = self.nlp(summtext) - res = set([str(sent) for sent in list(res.sents)]) - summtext = ''.join([line for line in res]) + res = {str(sent) for sent in list(res.sents)} + summtext = ''.join(list(res)) # pself.print_fn(summtext) research_blocks[head] = summtext @@ -457,10 +445,8 @@ def abstractive_summary(self, longtext): summary = self.ledtokenizer.batch_decode(summary_ids, skip_special_tokens=True, 
clean_up_tokenization_spaces=True) res = self.nlp(summary[0]) - res = set([str(sent) for sent in list(res.sents)]) - summtext = ''.join([line for line in res]) - #self.print_fn("abstractive summary type:" + str(type(summary))) - return summtext + res = {str(sent) for sent in list(res.sents)} + return ''.join(list(res)) def get_abstract(self, abs_lines, corpus_known_sections, research_blocks): @@ -482,7 +468,7 @@ def get_corpus_lines(self, corpus): for k, v in corpus.items(): # self.print_fn(v) types.add(type(v)) - abstext = k + '. ' + v.replace('\n', ' ') + abstext = f'{k}. ' + v.replace('\n', ' ') abstext = self.nlp(abstext) abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)]) #self.print_fn("unique corpus value types:" + str(types)) @@ -498,10 +484,11 @@ def get_sectioned_docs(self, papers, papers_meta): content = self.extractive_summary(''.join(section['highlights'])) docs.append(content) selected_pids = [p['id'] for p in papers] - meta_abs = [] - for p in papers_meta: - if p['id'] not in selected_pids: - meta_abs.append(self.generate_title(p['abstract'])) + meta_abs = [ + self.generate_title(p['abstract']) + for p in papers_meta + if p['id'] not in selected_pids + ] docs.extend(meta_abs) #self.print_fn("meta_abs num"+str(len(meta_abs))) #self.print_fn("selected_pids num"+str(len(selected_pids))) @@ -609,18 +596,23 @@ def get_clustered_sections(self, clustered_lines): for i, cluster in clustered_lines.items(): # self.print_fn(cluster) try: - clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary( - str(" ".join(cluster)).lower()) + clusters_dict[ + self.generate_title(" ".join(cluster)) + ] = self.abstractive_summary(" ".join(cluster).lower()) except: - clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary( - self.extractive_summary(str(" ".join(cluster)).lower())) + clusters_dict[ + self.generate_title(" ".join(cluster)) + ] = self.abstractive_summary( + self.extractive_summary(" ".join(cluster).lower()) + ) return clusters_dict def get_intro(self, corpus_known_sections, research_blocks): intro_lines = "" - intro_lines += str(" ".join([l.lower() for l in corpus_known_sections['introduction']])) + str( - " ".join([l.lower() for l in corpus_known_sections['conclusion']])) + intro_lines += " ".join( + [l.lower() for l in corpus_known_sections['introduction']] + ) + " ".join([l.lower() for l in corpus_known_sections['conclusion']]) intro_lines += research_blocks['introduction'] + research_blocks['conclusion'] try: return self.abstractive_summary(intro_lines) @@ -628,10 +620,7 @@ def get_intro(self, corpus_known_sections, research_blocks): return self.abstractive_summary(self.extractive_summary(intro_lines)) def get_conclusion(self, research_sections): - paper_body = "" - for k, v in research_sections.items(): - paper_body += v - + paper_body = "".join(v for k, v in research_sections.items()) try: return self.abstractive_summary(paper_body) except: @@ -670,11 +659,11 @@ def build_corpus(self, papers, papers_meta): corpus = self.build_meta_corpus(papers_meta) for p in papers: ph = [] - for sid, section in enumerate(p['sections']): + for section in p['sections']: ph.extend(section['highlights']) for pid, ls in corpus.items(): if pid == p['id']: - corpus[pid] = p['abstract'] + str(' '.join(ph)) + corpus[pid] = p['abstract'] + ' '.join(ph) ''' self.print_fn("================== final corpus ====================") self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers_meta)['title']+" \nhighlight count: " + 
str(len(phs))) for pid, phs in corpus.items()])) @@ -700,7 +689,7 @@ def build_meta_corpus(self, papers): ptext = p['title'] + ". " + p['abstract'] doc = self.nlp(ptext) phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)]) - meta_corpus[pid] = str(' '.join(phs)) + meta_corpus[pid] = ' '.join(phs) ''' self.print_fn("================== meta corpus ====================") self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in meta_corpus.items()])) @@ -732,7 +721,7 @@ def select_papers(self, papers, query, num_papers=20): # self.print_fn("argsort pids("+str(num_papers)+" papers): "+ str(idx)) papers_selected = [p for p in papers if p['id'] in idx] # assert(len(papers_selected)==num_papers) - self.print_fn("num papers selected: " + str(len(papers_selected))) + self.print_fn(f"num papers selected: {len(papers_selected)}") for p in papers_selected: self.print_fn("Selected Paper: " + p['title']) @@ -751,7 +740,7 @@ def extractive_summary(self, text): with torch.no_grad(): res = self.model(text, ratio=0.5) res_doc = self.nlp(res) - return " ".join(set([str(sent) for sent in list(res_doc.sents)])) + return " ".join({str(sent) for sent in list(res_doc.sents)}) def extractive_highlights(self, lines): # text = " ".join(lines) @@ -762,20 +751,25 @@ def extractive_highlights(self, lines): with torch.no_grad(): res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, ) res_doc = self.nlp(res) - res_lines = set([str(sent) for sent in list(res_doc.sents)]) + res_lines = {str(sent) for sent in list(res_doc.sents)} # self.print_fn("\n- ".join(res_sents)) with torch.no_grad(): - keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english') - keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), - keyphrase_ngram_range=(4, 4), - stop_words='english', use_mmr=True, diversity=0.7) + keywords = self.kw_model.extract_keywords( + " ".join([l.lower() for l in lines]), stop_words='english' + ) + keyphrases = self.kw_model.extract_keywords( + " ".join([l.lower() for l in lines]), + keyphrase_ngram_range=(4, 4), + stop_words='english', + use_mmr=True, + diversity=0.7, + ) return res_lines, keywords, keyphrases def extract_highlights(self, papers): for p in papers: - sid = 0 p['sections'] = [] - for heading, lines in p['body_text'].items(): + for sid, (heading, lines) in enumerate(p['body_text'].items()): hs, kws, kps = self.extractive_highlights(lines) p['sections'].append({ 'sid': sid, @@ -785,7 +779,6 @@ def extract_highlights(self, papers): 'keywords': kws, 'keyphrases': kps, }) - sid += 1 return papers def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False): @@ -808,7 +801,7 @@ def extract_parts(self, papers, txt_dir, dump_dir): # model = build_summarizer() #for file in glob.glob(txt_dir + '/*.txt'): for p in papers: - file = txt_dir + '/'+ p['id'] +'.txt' + file = f'{txt_dir}/' + p['id'] + '.txt' refined, headings_extracted = self.extract_headings(file) sections = self.extract_sections(headings_extracted, refined) # highlights = {k: extract_highlights(model,v) for k, v in sections.items()} @@ -831,15 +824,13 @@ def extract_parts(self, papers, txt_dir, dump_dir): # pself.print_fn({f: len(h) for f,h in headings_all.items()}) papers_none = [p for p in papers if p['id'] in ids_none] for p in papers_none: - os.remove(txt_dir + '/'+ p['id'] + '.txt') + os.remove(f'{txt_dir}/' + p['id'] + '.txt') 
papers.remove(p) return papers, ids_none def check_para(self, df): - size = 0 - for col in df.columns: - size += df[col].apply(lambda x: len(str(x))).median() + size = sum(df[col].apply(lambda x: len(str(x))).median() for col in df.columns) return size / len(df.columns) > 25 def scan_blocks(self, lines): @@ -868,7 +859,7 @@ def extract_sections(self, headings, lines, min_part_length=2): sections[start] = section ''' sections[start] = section - return {k: v for k, v in sections.items()} + return dict(sections) def is_rubbish(self, s, rubbish_tolerance=0.2, min_char_len=4): # numbers = sum(c.isdigit() for c in s) @@ -877,10 +868,9 @@ def is_rubbish(self, s, rubbish_tolerance=0.2, min_char_len=4): # others = len(s) - numbers - letters - spaces if len(s) == 0: return False - if ((len(s) - (letters + spaces)) / len(s) >= rubbish_tolerance) or self.alpha_length(s) < min_char_len: - return True - else: - return False + return (len(s) - (letters + spaces)) / len( + s + ) >= rubbish_tolerance or self.alpha_length(s) < min_char_len def get_section(self, first, last, lines): try: @@ -890,21 +880,17 @@ def get_section(self, first, last, lines): # end = lines.index( last, start ) start = [i for i in range(len(lines)) if first is lines[i]][0] end = [i for i in range(len(lines)) if last is lines[i]][0] - section_lines = lines[start + 1:end] - # self.print_fn("heading: " + str(first)) - # self.print_fn("section_lines: "+ str(section_lines)) - # self.print_fn(section_lines) - return section_lines + return lines[start + 1:end] except ValueError: self.print_fn("value error :") - self.print_fn("first heading :" + str(first) + ", second heading :" + str(last)) - self.print_fn("first index :" + str(start) + ", second index :" + str(end)) + self.print_fn(f"first heading :{str(first)}, second heading :{str(last)}") + self.print_fn(f"first index :{str(start)}, second index :{str(end)}") return "" def check_list_elems_in_list(self, headings, lines): import numpy as np # [self.print_fn(head) for head in headings if head not in lines ] - return np.all([True if head in lines else False for head in headings]) + return np.all([head in lines for head in headings]) def check_first_char_upper(self, text): for c in text: @@ -929,17 +915,15 @@ def extract_headings(self, txt_file): # scan_failed - rescan with first match for abstract hook if len(headings) == 0: - # self.print_fn('===================') - # self.print_fn("run 1 failed") - abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())] - if len(abs_cans) != 0: + if abs_cans := [ + line + for line in lines + if 'abstract' in re.sub("\s+", "", line.strip().lower()) + ]: abs_head = abs_cans[0] refined, headings = self.scan_text(lines, abs_head=abs_head) self.check_list_elems_in_list(headings, refined) headings = self.check_duplicates(headings) - # self.print_fn('===================') - # self.print_fn(txt_file +": second scan: \n"+str(len(headings))+" headings") - # if len(headings) == 0: # self.print_fn("heading scan failed completely") @@ -947,8 +931,7 @@ def extract_headings(self, txt_file): def check_duplicates(self, my_list): my_finallist = [] - dups = [s for s in my_list if my_list.count(s) > 1] - if len(dups) > 0: + if dups := [s for s in my_list if my_list.count(s) > 1]: [my_finallist.append(n) for n in my_list if n not in my_finallist] # self.print_fn("original: "+str(len(my_list))+" new: "+str(len(my_finallist))) @@ -961,18 +944,22 @@ def clean_lines(self, text): # lines = [str(sent) for sent in doc.sents] lines = 
text.replace('\r', '').split('\n') lines = [line for line in lines if not self.is_rubbish(line)] - lines = [line for line in lines if - re.match("^[a-zA-Z1-9\.\[\]\(\):\-,\"\"\s]*$", line) and not 'Figure' in line and not 'Table' in line] + lines = [ + line + for line in lines + if re.match("^[a-zA-Z1-9\.\[\]\(\):\-,\"\"\s]*$", line) + and 'Figure' not in line + and 'Table' not in line + ] lengths_cleaned = [self.alpha_length(line) for line in lines] mean_length_cleaned = np.median(lengths_cleaned) lines_standardized = [] for line in lines: if len(line) >= (1.8 * mean_length_cleaned): - first_half = line[0:len(line) // 2] + first_half = line[:len(line) // 2] second_half = line[len(line) // 2 if len(line) % 2 == 0 else ((len(line) // 2) + 1):] - lines_standardized.append(first_half) - lines_standardized.append(second_half) + lines_standardized.extend((first_half, second_half)) else: lines_standardized.append(line) @@ -1009,10 +996,8 @@ def scanline(self, record, headings, refined, id, lines): import re line = lines[id] - if not len(line) == 0: - # self.print_fn("in scanline") - # self.print_fn(line) - if record: + if record: + if len(line) != 0: refined.append(line) if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match( "^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0, 3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$", @@ -1029,21 +1014,18 @@ def scanline(self, record, headings, refined, id, lines): else: known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography'] missing = [h for h in known_headings if not np.any([True for head in headings if h in head])] - # for h in missing: - head = [line for h in missing if h in re.sub("\s+", "", line.strip().lower())] - # head = [line for known] - if len(head) > 0: + if head := [ + line + for h in missing + if h in re.sub("\s+", "", line.strip().lower()) + ]: headings.append(head[0]) assert (head[0] in refined) return refined, headings def char_length(self, s): - # numbers = sum(c.isdigit() for c in s) - letters = sum(c.isalpha() for c in s) - # spaces = sum(c.isspace() for c in s) - # others = len(s) - numbers - letters - spaces - return letters + return sum(c.isalpha() for c in s) def get_by_file(self, file, papers): import os @@ -1063,10 +1045,7 @@ def alpha_length(self, s): return letters + spaces def check_append(self, baselist, addstr): - check = False - for e in baselist: - if addstr in e: - check = True + check = any(addstr in e for e in baselist) if not check: baselist.append(addstr) return baselist @@ -1098,9 +1077,16 @@ def extract_images_from_file(self, pdf_file_name, img_dir): for page_index in range(len(pdf_file)): page = pdf_file[page_index] images.extend(page.getImageList()) - images_files = [self.save_image(pdf_file.extractImage(img[0]), i, pdf_file_name.replace('.pdf', ''), img_dir) for i, img in - enumerate(set(images)) if img[0]] - return images_files + return [ + self.save_image( + pdf_file.extractImage(img[0]), + i, + pdf_file_name.replace('.pdf', ''), + img_dir, + ) + for i, img in enumerate(set(images)) + if img[0] + ] def save_image(self, base_image, img_index, pid, img_dir): from PIL import Image @@ -1111,7 +1097,7 @@ def save_image(self, base_image, img_index, pid, img_dir): # load it to PIL image = Image.open(io.BytesIO(image_bytes)) # save it to local disk - fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." 
+ image_ext + fname = f"{img_dir}/{str(pid)}_{str(img_index + 1)}.{image_ext}" image.save(open(f"{fname}", "wb")) # self.print_fn(fname) return fname @@ -1121,7 +1107,7 @@ def save_tables(self, dfs, pid, tab_dir): dfs = [df for df in dfs if not self.check_para(df)] files = [] for df in dfs: - filename = tab_dir + "/" + str(pid) + ".csv" + filename = f"{tab_dir}/{str(pid)}.csv" files.append(filename) df.to_csv(filename, index=False) return files @@ -1160,7 +1146,7 @@ def search(self, query_text=None, id_list=None, max_search=100): id_list=id_list ) - results = [result for result in search.get()] + results = list(search.get()) searched_papers = [] discarded_ids = [] @@ -1199,7 +1185,7 @@ def download_pdfs(self, papers, pdf_dir): papers_filtered = arxiv.Search(id_list=ids).get() for p in papers_filtered: p_id = str(urlparse(p.entry_id).path.split('/')[-1]).split('v')[0] - download_file = pdf_dir + "/" + p_id + ".pdf" + download_file = f"{pdf_dir}/{p_id}.pdf" p.download_pdf(filename=download_file) @@ -1211,7 +1197,7 @@ def download_sources(self, papers, src_dir): papers_filtered = arxiv.Search(id_list=ids).get() for p in papers_filtered: p_id = str(urlparse(p.entry_id).path.split('/')[-1]).split('v')[0] - download_file = src_dir + "/" + p_id + ".tar.gz" + download_file = f"{src_dir}/{p_id}.tar.gz" p.download_source(filename=download_file) def convert_pdfs(self, pdf_dir, txt_dir): @@ -1221,13 +1207,12 @@ def convert_pdfs(self, pdf_dir, txt_dir): # import arxiv_public_data convert_directory_parallel(pdf_dir, multiprocessing.cpu_count()) - for file in glob.glob(pdf_dir + '/*.txt'): + for file in glob.glob(f'{pdf_dir}/*.txt'): shutil.move(file, txt_dir) def read_paper(self, path): - f = open(path, 'r', encoding="utf-8") - text = str(f.read()) - f.close() + with open(path, 'r', encoding="utf-8") as f: + text = str(f.read()) return text def cocitation_network(self, papers, txt_dir): @@ -1244,28 +1229,35 @@ def lookup_author(self, author_query): from scholarly import scholarly import operator # Retrieve the author's data, fill-in, and print - self.print_fn("Searching Author: " + author_query) + self.print_fn(f"Searching Author: {author_query}") search_result = next(scholarly.search_author(author_query), None) if search_result is not None: author = scholarly.fill(search_result) - author_stats = { + return { 'name': author_query, - 'affiliation': author['affiliation'] if author['affiliation'] else None, + 'affiliation': author['affiliation'] + if author['affiliation'] + else None, 'citedby': author['citedby'] if 'citedby' in author.keys() else 0, - 'most_cited_year': max(author['cites_per_year'].items(), key=operator.itemgetter(1))[0] if len( - author['cites_per_year']) > 0 else None, + 'most_cited_year': max( + author['cites_per_year'].items(), key=operator.itemgetter(1) + )[0] + if len(author['cites_per_year']) > 0 + else None, 'coauthors': [c['name'] for c in author['coauthors']], 'hindex': author['hindex'], 'impact': author['i10index'], 'interests': author['interests'], - 'publications': [{'title': p['bib']['title'], 'citations': p['num_citations']} for p in - author['publications']], + 'publications': [ + {'title': p['bib']['title'], 'citations': p['num_citations']} + for p in author['publications'] + ], 'url_picture': author['url_picture'], } else: self.print_fn("author not found") - author_stats = { + return { 'name': author_query, 'affiliation': "", 'citedby': 0, @@ -1278,18 +1270,13 @@ def lookup_author(self, author_query): 'url_picture': "", } - # pself.print_fn(author_stats) - return 
author_stats - def author_stats(self, papers): all_authors = [] for p in papers: - paper_authors = [a for a in p['authors']] + paper_authors = list(p['authors']) all_authors.extend(paper_authors) - searched_authors = [self.lookup_author(a) for a in set(all_authors)] - - return searched_authors + return [self.lookup_author(a) for a in set(all_authors)] def text_similarity(self, text1, text2): doc1 = self.similarity_nlp(text1) @@ -1316,9 +1303,9 @@ def ask(self, corpus, question): start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions) - self.print_fn("context: " + text) - self.print_fn("question: " + question) - self.print_fn("outputs: " + outputs) + self.print_fn(f"context: {text}") + self.print_fn(f"question: {question}") + self.print_fn(f"outputs: {outputs}") return outputs def zip_outputs(self, dump_dir, zip_name): @@ -1352,7 +1339,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb # arxiv api relevance search and data preparation self.print_fn("\n- searching arXiv for top 100 papers.. ") results, searched_papers = self.search(query, id_list, max_search=max_search) - joblib.dump(searched_papers, survey_dump_dir + 'papers_metadata.dmp') + joblib.dump(searched_papers, f'{survey_dump_dir}papers_metadata.dmp') self.print_fn("\n- found " + str(len(searched_papers)) + " papers") # paper selection by scibert vector embedding relevance scores @@ -1364,27 +1351,27 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb if weigh_authors: authors = self.author_stats(papers_highlighted) - joblib.dump(papers_highlighted, survey_dump_dir + 'papers_highlighted.dmp') + joblib.dump(papers_highlighted, f'{survey_dump_dir}papers_highlighted.dmp') self.print_fn("\n- Standardizing known section headings per paper.. ") papers_standardized = self.standardize_headings(papers_highlighted) - joblib.dump(papers_standardized, survey_dump_dir + 'papers_standardized.dmp') + joblib.dump(papers_standardized, f'{survey_dump_dir}papers_standardized.dmp') self.print_fn("\n- Building paper-wise corpus.. ") corpus = self.build_corpus(papers_highlighted, searched_papers) - joblib.dump(corpus, survey_dump_dir + 'corpus.dmp') + joblib.dump(corpus, f'{survey_dump_dir}corpus.dmp') self.print_fn("\n- Building section-wise corpus.. ") corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized) - joblib.dump(corpus_sectionwise, survey_dump_dir + 'corpus_sectionwise.dmp') + joblib.dump(corpus_sectionwise, f'{survey_dump_dir}corpus_sectionwise.dmp') self.print_fn("\n- Building basic research highlights.. ") research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus) - joblib.dump(research_blocks, survey_dump_dir + 'research_blocks.dmp') + joblib.dump(research_blocks, f'{survey_dump_dir}research_blocks.dmp') self.print_fn("\n- Reducing corpus to lines.. ") corpus_lines = self.get_corpus_lines(corpus) - joblib.dump(corpus_lines, survey_dump_dir + 'corpus_lines.dmp') + joblib.dump(corpus_lines, f'{survey_dump_dir}corpus_lines.dmp') # temp # searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp') @@ -1418,7 +1405,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.print_fn("\n- Building abstract.. 
") abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks) - joblib.dump(abstract_block, survey_dump_dir + 'abstract_block.dmp') + joblib.dump(abstract_block, f'{survey_dump_dir}abstract_block.dmp') ''' self.print_fn("abstract_block type:"+ str(type(abstract_block))) self.print_fn("abstract_block:") @@ -1427,7 +1414,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.print_fn("\n- Building introduction.. ") intro_block = self.get_intro(corpus_sectionwise, research_blocks) - joblib.dump(intro_block, survey_dump_dir + 'intro_block.dmp') + joblib.dump(intro_block, f'{survey_dump_dir}intro_block.dmp') ''' self.print_fn("intro_block type:"+ str(type(intro_block))) self.print_fn("intro_block:") @@ -1435,8 +1422,8 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb ''' self.print_fn("\n- Building custom sections.. ") clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers) - joblib.dump(clustered_sections, survey_dump_dir + 'clustered_sections.dmp') - joblib.dump(clustered_sentences, survey_dump_dir + 'clustered_sentences.dmp') + joblib.dump(clustered_sections, f'{survey_dump_dir}clustered_sections.dmp') + joblib.dump(clustered_sentences, f'{survey_dump_dir}clustered_sentences.dmp') ''' self.print_fn("clusters extracted") @@ -1449,11 +1436,11 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb ''' clustered_sections['abstract'] = abstract_block clustered_sections['introduction'] = intro_block - joblib.dump(clustered_sections, survey_dump_dir + 'research_sections.dmp') + joblib.dump(clustered_sections, f'{survey_dump_dir}research_sections.dmp') self.print_fn("\n- Building conclusion.. ") conclusion_block = self.get_conclusion(clustered_sections) - joblib.dump(conclusion_block, survey_dump_dir + 'conclusion_block.dmp') + joblib.dump(conclusion_block, f'{survey_dump_dir}conclusion_block.dmp') clustered_sections['conclusion'] = conclusion_block ''' self.print_fn("conclusion_block type:"+ str(type(conclusion_block))) @@ -1461,7 +1448,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.print_fn(conclusion_block) ''' if query is None: - query = self.generate_title(' '.join([v for v in clustered_sections.values()])) + query = self.generate_title(' '.join(list(clustered_sections.values()))) survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt' survey_file = Path(survey_dump_dir).resolve() / survey_file @@ -1470,9 +1457,9 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.survey_print_fn("\n-citation-network: ") self.survey_print_fn(cites) - shutil.copytree('arxiv_data/', survey_dump_dir + '/arxiv_data/') + shutil.copytree('arxiv_data/', f'{survey_dump_dir}/arxiv_data/') assert (os.path.exists(survey_file)) - + zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip' zip_name = Path(survey_dump_dir).parent.resolve() / zip_name self.zip_outputs(survey_dump_dir, str(zip_name))