54 changes: 28 additions & 26 deletions app.py
@@ -73,30 +73,32 @@ class ArxivIDsModel(BaseModel):
)

if __name__ == '__main__':
st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always')
st.title('Auto-Research')
st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections'
'(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).')
st.write('##### Data Provider: arXiv Open Archive Initiative OAI')
st.write('##### GitHub: https://github.com/sidphbot/Auto-Research')
download_placeholder = st.container()

with st.sidebar.form(key="survey_keywords_form"):
session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel)
st.write('or')
session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel))
submit = st.form_submit_button(label="Submit")
st.sidebar.write('#### execution log:')

run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
'download_placeholder':download_placeholder}
if submit:
if session_data['research_keywords'] != '':
run_kwargs.update({'research_keywords':session_data['research_keywords'],
'max_search':session_data['max_search'],
'num_papers':session_data['num_papers']})
elif session_data['arxiv_ids'] != '':
run_kwargs.update({'arxiv_ids':[id.strip() for id in session_data['arxiv_ids'].split(',')]})

run_survey(**run_kwargs)
st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always')
st.title('Auto-Research')
st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections'
'(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).')
st.write('##### Data Provider: arXiv Open Archive Initiative OAI')
st.write('##### GitHub: https://github.com/sidphbot/Auto-Research')
download_placeholder = st.container()

with st.sidebar.form(key="survey_keywords_form"):
session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel)
st.write('or')
session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel))
submit = st.form_submit_button(label="Submit")
st.sidebar.write('#### execution log:')

run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
'download_placeholder':download_placeholder}
if submit:
if session_data['research_keywords'] != '':
run_kwargs.update({'research_keywords':session_data['research_keywords'],
'max_search':session_data['max_search'],
'num_papers':session_data['num_papers']})
elif session_data['arxiv_ids'] != '':
run_kwargs['arxiv_ids'] = [
id.strip() for id in session_data['arxiv_ids'].split(',')
]

run_survey(**run_kwargs)
Comment on lines -76 to +103
Lines 76-101 refactored with the following changes:
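The visible change replaces a single-key dict.update() call for arxiv_ids with a direct subscript assignment. For context, the following is a minimal runnable sketch of the surrounding pattern (conditionally assembling keyword arguments and splatting them into the entry point); the stub bodies for get_surveyor_instance and run_survey are illustrative assumptions, not the repository's real implementations.

# Sketch of the kwargs-assembly pattern used in app.py; the two stub
# functions below are placeholders, not the repository's implementations.
def get_surveyor_instance(**kwargs):
    return object()

def run_survey(surveyor, download_placeholder=None, **options):
    print('running survey with', options)

session_data = {                      # stand-in for the Streamlit form output
    'research_keywords': 'graph neural networks',
    'max_search': 20,
    'num_papers': 5,
    'arxiv_ids': '',
}

run_kwargs = {'surveyor': get_surveyor_instance(),
              'download_placeholder': None}

if session_data['research_keywords'] != '':
    run_kwargs.update({'research_keywords': session_data['research_keywords'],
                       'max_search': session_data['max_search'],
                       'num_papers': session_data['num_papers']})
elif session_data['arxiv_ids'] != '':
    # single key: direct assignment instead of dict.update(), as in the refactor
    run_kwargs['arxiv_ids'] = [i.strip() for i in session_data['arxiv_ids'].split(',')]

run_survey(**run_kwargs)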


43 changes: 17 additions & 26 deletions arxiv_public_data/authors.py
@@ -160,7 +160,7 @@ def _parse_author_affil_split(author_line: str) -> Dict:
2), match.group(3), match.group(4))
author_entry = [s, match.group(1), '']
elif mtype == 'name-prefix-name':
s = '{} {}'.format(match.group(2), match.group(3))
s = f'{match.group(2)} {match.group(3)}'
Function _parse_author_affil_split refactored with the following changes:
- replace a str.format() call with an f-string

author_entry = [s, match.group(1), '']
elif mtype == 'name-name-prefix':
author_entry = [match.group(2), match.group(1), match.group(3)]
@@ -197,9 +197,8 @@ def _remove_double_commas(items: List[str]) -> List[str]:
for pt in items:
if pt == ',' and last == ',':
continue
else:
parts.append(pt)
last = pt
parts.append(pt)
last = pt
Comment on lines -200 to +201
Function _remove_double_commas refactored with the following changes:
- remove an unnecessary else block after continue

return parts


@@ -210,13 +209,12 @@ def _tidy_name(name: str) -> str:
return name


def _collaboration_at_start(names: List[str]) \
-> Tuple[List[str], List[List[str]], int]:
def _collaboration_at_start(names: List[str]) -> Tuple[List[str], List[List[str]], int]:
"""Perform special handling of collaboration at start."""
author_list = []

back_propagate_affiliations_to = 0
while len(names) > 0:
while names:
Comment on lines -213 to +217
Function _collaboration_at_start refactored with the following changes:
- simplify the length check (while len(names) > 0 becomes while names)
- replace chained equality comparisons with a membership test (names[0] in [',', ':'])

m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))',
names[0], flags=re.IGNORECASE)
if not m:
@@ -228,13 +226,13 @@ def _collaboration_at_start(names: List[str]) \
# Remove from names
names.pop(0)
# Also swallow and following comma or colon
if names and (names[0] == ',' or names[0] == ':'):
if names and names[0] in [',', ':']:
names.pop(0)

return names, author_list, back_propagate_affiliations_to


def _enum_collaboration_at_end(author_line: str)->Dict:
def _enum_collaboration_at_end(author_line: str) -> Dict:
Comment on lines -237 to +235
Function _enum_collaboration_at_end refactored with the following changes:
- merge the re.match() assignment into the if condition with an assignment expression (walrus operator)

This removes the following comments ( why? ):

# Now expect `1) affil1 ', discard if no match

"""Get separate set of enumerated affiliations from end of author_line."""
# Now see if we have a separate set of enumerated affiliations
# This is indicated by finding '(\s*('
@@ -247,9 +245,7 @@ def _enum_collaboration_at_end(author_line: str)->Dict:

# Now expect to have '1) affil1 (2) affil2 (3) affil3'
for affil in affils.split('('):
# Now expect `1) affil1 ', discard if no match
m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil)
if m:
if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil):
enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2))

return enumaffils
@@ -266,7 +262,7 @@ def _add_affiliation(author_line: str,
Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2)
"""
en = re.escape(name)
namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*'))
namerex = rf"{en.replace(' ', 's*')}\s*\(([^\(\)]+)"
Comment on lines -269 to +265
Function _add_affiliation refactored with the following changes:
- replace a str.format() call with an f-string when building the name regex

m = re.search(namerex, author_line, flags=re.IGNORECASE)
if not m:
return author_entry
@@ -341,21 +337,19 @@ def split_authors(authors: str) -> List:
for bit in aus:
if bit == '':
continue
if bit == '(': # track open parentheses
if bit == '(':
depth += 1
if depth == 1:
blocks.append(c)
c = '('
else:
c = c + bit
elif bit == ')': # track close parentheses
elif bit == ')':
depth -= 1
c = c + bit
if depth == 0:
blocks.append(c)
c = ''
else: # haven't closed, so keep accumulating
continue
Comment on lines -344 to -358
Function split_authors refactored with the following changes:
- remove a redundant else: continue branch in the parenthesis-tracking loop
- merge the strip-then-test sequence into an assignment expression (if name := name.rstrip().lstrip():)
- replace a str.format() call with an f-string when recombining suffixes

This removes the following comments ( why? ):

# track open parentheses
# haven't closed, so keep accumulating
# track close parentheses

else:
c = c + bit
if c:
@@ -373,8 +367,7 @@ def split_authors(authors: str) -> List:
for name in names:
if not name:
continue
name = name.rstrip().lstrip()
if name:
if name := name.rstrip().lstrip():
listx.append(name)

# Recombine suffixes that were separated with a comma
Expand All @@ -386,7 +379,7 @@ def split_authors(authors: str) -> List:
and not re.match(r'\)$', parts[-2]):
separator = parts.pop()
last = parts.pop()
recomb = "{}{} {}".format(last, separator, p)
recomb = f"{last}{separator} {p}"
parts.append(recomb)
else:
parts.append(p)
@@ -429,7 +422,7 @@ def _parse_article_authors(article_author):
try:
return [article_author[0], parse_author_affil_utf(article_author[1])]
except Exception as e:
msg = "Author split failed for article {}".format(article_author[0])
msg = f"Author split failed for article {article_author[0]}"
Comment on lines -432 to +425
Function _parse_article_authors refactored with the following changes:
- replace a str.format() call with an f-string in the error message

logger.error(msg)
logger.exception(e)
return [article_author[0], '']
@@ -455,15 +448,13 @@ def parse_authorline_parallel(article_authors, n_processes=None):
[ author3_keyname, author3_firstnames, author1_suffix ]
]
"""
logger.info(
'Parsing author lines for {} articles...'.format(len(article_authors))
)
logger.info(f'Parsing author lines for {len(article_authors)} articles...')

pool = Pool(n_processes)
parsed = pool.map(_parse_article_authors, article_authors)
outdict = {aid: auth for aid, auth in parsed}
outdict = dict(parsed)

filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz')
logger.info('Saving to {}'.format(filename))
logger.info(f'Saving to {filename}')
Function parse_authorline_parallel refactored with the following changes:
- replace str.format() calls with f-strings
- convert the identity dict comprehension over (key, value) pairs into a dict() call

with gzip.open(filename, 'wb') as fout:
fout.write(json.dumps(outdict).encode('utf-8'))
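Several of the authors.py changes lean on the same Python 3.8+ feature, the assignment expression. Below is a self-contained sketch of the pattern as applied in _enum_collaboration_at_end, using a made-up affiliation string rather than real arXiv data.

import re

# Illustrative input, not repo data: enumerated affiliations as they
# appear at the end of an author line.
affils = '1) Univ A (2) Univ B (3) Univ C'

enumaffils = {}
for affil in affils.split('('):
    # bind the regex match and test it in a single step (walrus operator)
    if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil):
        enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2))

print(enumaffils)  # {'1': 'Univ A', '2': 'Univ B', '3': 'Univ C'}

The same construct replaces the strip-then-test sequence in split_authors; the trade-off is that the module now requires Python 3.8 or newer.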
19 changes: 9 additions & 10 deletions arxiv_public_data/config.py
@@ -22,18 +22,17 @@ def get_outdir():
"""
if os.environ.get(KEY):
out = os.environ.get(KEY)
else:
if os.path.exists(JSONFILE):
js = json.load(open(JSONFILE))
if not KEY in js:
logger.warn('Configuration in "{}" invalid, using default'.format(JSONFILE))
logger.warn("default output directory is {}".format(DEFAULT_PATH))
out = DEFAULT_PATH
else:
out = js[KEY]
elif os.path.exists(JSONFILE):
js = json.load(open(JSONFILE))
if KEY in js:
out = js[KEY]
else:
logger.warn("default output directory is {}".format(DEFAULT_PATH))
logger.warn(f'Configuration in "{JSONFILE}" invalid, using default')
logger.warn(f"default output directory is {DEFAULT_PATH}")
out = DEFAULT_PATH
else:
logger.warn(f"default output directory is {DEFAULT_PATH}")
out = DEFAULT_PATH
Comment on lines +25 to +35
Function get_outdir refactored with the following changes:
- flatten the nested if/else into an if/elif/else chain
- test KEY in js directly instead of not KEY in js
- replace str.format() calls with f-strings in the warnings

return out

try:
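The restructured get_outdir keeps the same precedence, just flattened: an environment variable wins, then the JSON config file, then a built-in default. A minimal sketch of that lookup order follows; KEY, JSONFILE and DEFAULT_PATH are illustrative values, not the module's real constants.

import json
import os

KEY = 'ARXIV_DATA_OUTDIR'                 # illustrative names/values only
JSONFILE = 'config.json'
DEFAULT_PATH = os.path.join(os.path.expanduser('~'), 'arxiv-data')

def get_outdir():
    # 1. an environment variable overrides everything
    if os.environ.get(KEY):
        return os.environ[KEY]
    # 2. otherwise fall back to the JSON config file, if present and valid
    if os.path.exists(JSONFILE):
        with open(JSONFILE) as f:
            js = json.load(f)
        if KEY in js:
            return js[KEY]
    # 3. otherwise use the built-in default
    return DEFAULT_PATH

print(get_outdir())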
6 changes: 3 additions & 3 deletions arxiv_public_data/embeddings/tf_hub.py
@@ -61,7 +61,7 @@ def elmo_strings(batches, filename, batchsize=32):

for i, batch in enumerate(batches):
# grab mean-pooling of contextualized word reps
logger.info("Computing/saving batch {}".format(i))
logger.info(f"Computing/saving batch {i}")
Function elmo_strings refactored with the following changes:
- replace a str.format() call with an f-string in the log message

with open(filename, 'ab') as fout:
pickle.dump(sess.run(
embeddings, feed_dict={text_input: batch}
@@ -125,7 +125,7 @@ def universal_sentence_encoder_lite(batches, filename, spm_path, batchsize=32):
sess.run(init_op)
for i, batch in enumerate(batches):
values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, batch)
logger.info("Computing/saving batch {}".format(i))
logger.info(f"Computing/saving batch {i}")
Function universal_sentence_encoder_lite refactored with the following changes:
- replace a str.format() call with an f-string in the log message

emb = sess.run(
embeddings,
feed_dict={
@@ -180,6 +180,6 @@ def create_save_embeddings(batches, filename, encoder, headers=[], encoder_args=
for h in headers:
pickle.dump(h, fout)

logger.info("Saving embeddings to {}".format(savename))
logger.info(f"Saving embeddings to {savename}")
Function create_save_embeddings refactored with the following changes:
- replace a str.format() call with an f-string in the log message

encoder(batches, savename, *encoder_args,
**encoder_kwargs)
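All three embedding helpers in this file share one persistence scheme: each batch is pickled onto the end of a single file, so reading it back means calling pickle.load() until EOF. A small sketch of that write/read round trip, with made-up batch data and none of the TensorFlow Hub machinery:

import pickle

batches = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]   # stand-in embedding batches
filename = 'embeddings.pkl'

# write: one pickle record per batch, appended to the same file
with open(filename, 'wb') as fout:
    for batch in batches:
        pickle.dump(batch, fout)

# read: keep loading records until the file is exhausted
records = []
with open(filename, 'rb') as fin:
    while True:
        try:
            records.append(pickle.load(fin))
        except EOFError:
            break

assert records == batches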
2 changes: 1 addition & 1 deletion arxiv_public_data/embeddings/util.py
@@ -39,7 +39,7 @@ def id_to_pathname(aid):
"""
if '.' in aid: # new style ArXiv ID
yymm = aid.split('.')[0]
return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')
return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, f'{aid}.txt')
Function id_to_pathname refactored with the following changes:
- replace string concatenation with an f-string when building the .txt filename


# old style ArXiv ID
cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
29 changes: 13 additions & 16 deletions arxiv_public_data/fulltext.py
@@ -24,7 +24,7 @@
def reextension(filename: str, extension: str) -> str:
""" Give a filename a new extension """
name, _ = os.path.splitext(filename)
return '{}.{}'.format(name, extension)
return f'{name}.{extension}'
Function reextension refactored with the following changes:
- replace a str.format() call with an f-string



def average_word_length(txt):
@@ -43,8 +43,7 @@ def average_word_length(txt):
#txt = re.subn(RE_REPEATS, '', txt)[0]
nw = len(txt.split())
nc = len(txt)
avgw = nc / (nw + 1)
return avgw
return nc / (nw + 1)
Function average_word_length refactored with the following changes:
- inline the avgw variable that was immediately returned



def process_timeout(cmd, timeout):
@@ -71,7 +70,7 @@ def run_pdf2txt(pdffile: str, timelimit: int=TIMELIMIT, options: str=''):
output : str
Full plain text output
"""
log.debug('Running {} on {}'.format(PDF2TXT, pdffile))
log.debug(f'Running {PDF2TXT} on {pdffile}')
Comment on lines -74 to +73
Function run_pdf2txt refactored with the following changes:
- replace a str.format() call with an f-string in the debug log

tmpfile = reextension(pdffile, 'pdf2txt')

cmd = '{cmd} {options} -o "{output}" "{pdf}"'.format(
@@ -101,7 +100,7 @@ def run_pdftotext(pdffile: str, timelimit: int = TIMELIMIT) -> str:
output : str
Full plain text output
"""
log.debug('Running {} on {}'.format(PDFTOTEXT, pdffile))
log.debug(f'Running {PDFTOTEXT} on {pdffile}')
Comment on lines -104 to +103
Function run_pdftotext refactored with the following changes:
- replace a str.format() call with an f-string in the debug log

tmpfile = reextension(pdffile, 'pdftotxt')

cmd = '{cmd} "{pdf}" "{output}"'.format(
@@ -161,7 +160,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT):
raise FileNotFoundError(pdffile)

if os.stat(pdffile).st_size == 0: # file is empty
raise RuntimeError('"{}" is an empty file'.format(pdffile))
raise RuntimeError(f'"{pdffile}" is an empty file')
Comment on lines -164 to +163
Function fulltext refactored with the following changes:
- replace str.format() calls with f-strings in the error messages


try:
output = run_pdftotext(pdffile, timelimit=timelimit)
Expand All @@ -188,9 +187,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT):
wordlength = average_word_length(output)

if wordlength > 45:
raise RuntimeError(
'No accurate text could be extracted from "{}"'.format(pdffile)
)
raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"')

try:
os.remove(reextension(pdffile, 'pdftotxt')) # remove the tempfile
@@ -255,8 +252,8 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT):
globber = os.path.join(path, '*.pdf')
pdffiles = sorted_files(globber)

log.info('Searching "{}"...'.format(globber))
log.info('Found: {} pdfs'.format(len(pdffiles)))
log.info(f'Searching "{globber}"...')
log.info(f'Found: {len(pdffiles)} pdfs')
Comment on lines -258 to +256
Function convert_directory refactored with the following changes:
- replace str.format() calls with f-strings in the log messages


for pdffile in pdffiles:
txtfile = reextension(pdffile, 'txt')
Expand All @@ -271,7 +268,7 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT):
with open(txtfile, 'w') as f:
f.write(text)
except Exception as e:
log.error("Conversion failed for '{}'".format(pdffile))
log.error(f"Conversion failed for '{pdffile}'")
log.exception(e)
continue

@@ -297,8 +294,8 @@ def convert_directory_parallel(path: str, processes: int, timelimit: int = TIMEL
globber = os.path.join(path, '**/*.pdf') # search expression for glob.glob
pdffiles = sorted_files(globber) # a list of path

log.info('Searching "{}"...'.format(globber))
log.info('Found: {} pdfs'.format(len(pdffiles)))
log.info(f'Searching "{globber}"...')
log.info(f'Found: {len(pdffiles)} pdfs')
Comment on lines -300 to +298
Function convert_directory_parallel refactored with the following changes:
- replace str.format() calls with f-strings in the log messages


pool = Pool(processes=processes)
result = pool.map(partial(convert_safe, timelimit=timelimit), pdffiles)
Expand All @@ -311,7 +308,7 @@ def convert_safe(pdffile: str, timelimit: int = TIMELIMIT):
try:
convert(pdffile, timelimit=timelimit)
except Exception as e:
log.error('File conversion failed for {}: {}'.format(pdffile, e))
log.error(f'File conversion failed for {pdffile}: {e}')
Comment on lines -314 to +311
Function convert_safe refactored with the following changes:
- replace a str.format() call with an f-string in the error log



def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str:
Expand All @@ -332,7 +329,7 @@ def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str:
Location of text file.
"""
if not os.path.exists(path):
raise RuntimeError('No such path: %s' % path)
raise RuntimeError(f'No such path: {path}')
Comment on lines -335 to +332
Function convert refactored with the following changes:
- replace %-style string formatting with an f-string in the error message

outpath = reextension(path, 'txt')

if os.path.exists(outpath):
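Beyond the f-string swaps, the fulltext.py hunks show the shape of the extraction pipeline: run an external converter, fall back to a second one, then reject output whose average word length suggests garbled text. A rough sketch of what that flow could look like is below; the fallback ordering is an assumption (the diff only shows fragments of fulltext()), and the tool wrappers are stubs rather than the real pdftotext/pdf2txt calls.

import os

TIMELIMIT = 120                      # illustrative value

def run_pdftotext(pdffile, timelimit=TIMELIMIT):
    raise RuntimeError('pretend pdftotext failed')    # stub, not the real wrapper

def run_pdf2txt(pdffile, timelimit=TIMELIMIT):
    return 'recovered plain text from ' + pdffile     # stub, not the real wrapper

def average_word_length(txt):
    # characters per word; very long "words" usually mean garbled extraction
    return len(txt) / (len(txt.split()) + 1)

def fulltext(pdffile, timelimit=TIMELIMIT):
    if not os.path.exists(pdffile):
        raise FileNotFoundError(pdffile)
    if os.stat(pdffile).st_size == 0:
        raise RuntimeError(f'"{pdffile}" is an empty file')
    try:
        output = run_pdftotext(pdffile, timelimit=timelimit)
    except Exception:
        output = run_pdf2txt(pdffile, timelimit=timelimit)   # fallback extractor
    if average_word_length(output) > 45:
        raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"')
    return output

# fulltext('paper.pdf')  # would raise FileNotFoundError unless paper.pdf exists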