54 changes: 28 additions & 26 deletions app.py
@@ -73,30 +73,32 @@ class ArxivIDsModel(BaseModel):
)

if __name__ == '__main__':
st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always')
st.title('Auto-Research')
st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections'
'(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).')
st.write('##### Data Provider: arXiv Open Archive Initiative OAI')
st.write('##### GitHub: https://github.com/sidphbot/Auto-Research')
download_placeholder = st.container()

with st.sidebar.form(key="survey_keywords_form"):
session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel)
st.write('or')
session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel))
submit = st.form_submit_button(label="Submit")
st.sidebar.write('#### execution log:')

run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
'download_placeholder':download_placeholder}
if submit:
if session_data['research_keywords'] != '':
run_kwargs.update({'research_keywords':session_data['research_keywords'],
'max_search':session_data['max_search'],
'num_papers':session_data['num_papers']})
elif session_data['arxiv_ids'] != '':
run_kwargs.update({'arxiv_ids':[id.strip() for id in session_data['arxiv_ids'].split(',')]})

run_survey(**run_kwargs)
st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always')
st.title('Auto-Research')
st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections'
'(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).')
st.write('##### Data Provider: arXiv Open Archive Initiative OAI')
st.write('##### GitHub: https://github.com/sidphbot/Auto-Research')
download_placeholder = st.container()

with st.sidebar.form(key="survey_keywords_form"):
session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel)
st.write('or')
session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel))
submit = st.form_submit_button(label="Submit")
st.sidebar.write('#### execution log:')

run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
'download_placeholder':download_placeholder}
if submit:
if session_data['research_keywords'] != '':
run_kwargs.update({'research_keywords':session_data['research_keywords'],
'max_search':session_data['max_search'],
'num_papers':session_data['num_papers']})
elif session_data['arxiv_ids'] != '':
run_kwargs['arxiv_ids'] = [
id.strip() for id in session_data['arxiv_ids'].split(',')
]

run_survey(**run_kwargs)
Comment on lines -76 to +103
Lines 76-101 refactored with the following changes:
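The visible change replaces a single-key dict.update() call for arxiv_ids with a direct subscript assignment. For context, the following is a minimal runnable sketch of the surrounding pattern (conditionally assembling keyword arguments and splatting them into the entry point); the stub bodies for get_surveyor_instance and run_survey are illustrative assumptions, not the repository's real implementations.

# Sketch of the kwargs-assembly pattern used in app.py; the two stub
# functions below are placeholders, not the repository's implementations.
def get_surveyor_instance(**kwargs):
    return object()

def run_survey(surveyor, download_placeholder=None, **options):
    print('running survey with', options)

session_data = {                      # stand-in for the Streamlit form output
    'research_keywords': 'graph neural networks',
    'max_search': 20,
    'num_papers': 5,
    'arxiv_ids': '',
}

run_kwargs = {'surveyor': get_surveyor_instance(),
              'download_placeholder': None}

if session_data['research_keywords'] != '':
    run_kwargs.update({'research_keywords': session_data['research_keywords'],
                       'max_search': session_data['max_search'],
                       'num_papers': session_data['num_papers']})
elif session_data['arxiv_ids'] != '':
    # single key: direct assignment instead of dict.update(), as in the refactor
    run_kwargs['arxiv_ids'] = [i.strip() for i in session_data['arxiv_ids'].split(',')]

run_survey(**run_kwargs)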


43 changes: 17 additions & 26 deletions arxiv_public_data/authors.py
@@ -160,7 +160,7 @@ def _parse_author_affil_split(author_line: str) -> Dict:
2), match.group(3), match.group(4))
author_entry = [s, match.group(1), '']
elif mtype == 'name-prefix-name':
s = '{} {}'.format(match.group(2), match.group(3))
s = f'{match.group(2)} {match.group(3)}'
Function _parse_author_affil_split refactored with the following changes:
- replace a str.format() call with an f-string

author_entry = [s, match.group(1), '']
elif mtype == 'name-name-prefix':
author_entry = [match.group(2), match.group(1), match.group(3)]
@@ -197,9 +197,8 @@ def _remove_double_commas(items: List[str]) -> List[str]:
for pt in items:
if pt == ',' and last == ',':
continue
else:
parts.append(pt)
last = pt
parts.append(pt)
last = pt
Comment on lines -200 to +201
Function _remove_double_commas refactored with the following changes:
- remove an unnecessary else block after continue

return parts


@@ -210,13 +209,12 @@ def _tidy_name(name: str) -> str:
return name


def _collaboration_at_start(names: List[str]) \
-> Tuple[List[str], List[List[str]], int]:
def _collaboration_at_start(names: List[str]) -> Tuple[List[str], List[List[str]], int]:
"""Perform special handling of collaboration at start."""
author_list = []

back_propagate_affiliations_to = 0
while len(names) > 0:
while names:
Comment on lines -213 to +217
Function _collaboration_at_start refactored with the following changes:
- simplify the length check (while len(names) > 0 becomes while names)
- replace chained equality comparisons with a membership test (names[0] in [',', ':'])

m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))',
names[0], flags=re.IGNORECASE)
if not m:
@@ -228,13 +226,13 @@ def _collaboration_at_start(names: List[str]) \
# Remove from names
names.pop(0)
# Also swallow and following comma or colon
if names and (names[0] == ',' or names[0] == ':'):
if names and names[0] in [',', ':']:
names.pop(0)

return names, author_list, back_propagate_affiliations_to


def _enum_collaboration_at_end(author_line: str)->Dict:
def _enum_collaboration_at_end(author_line: str) -> Dict:
Comment on lines -237 to +235
Function _enum_collaboration_at_end refactored with the following changes:
- merge the re.match() assignment into the if condition with an assignment expression (walrus operator)

This removes the following comments ( why? ):

# Now expect `1) affil1 ', discard if no match

"""Get separate set of enumerated affiliations from end of author_line."""
# Now see if we have a separate set of enumerated affiliations
# This is indicated by finding '(\s*('
@@ -247,9 +245,7 @@ def _enum_collaboration_at_end(author_line: str)->Dict:

# Now expect to have '1) affil1 (2) affil2 (3) affil3'
for affil in affils.split('('):
# Now expect `1) affil1 ', discard if no match
m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil)
if m:
if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil):
enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2))

return enumaffils
@@ -266,7 +262,7 @@ def _add_affiliation(author_line: str,
Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2)
"""
en = re.escape(name)
namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*'))
namerex = rf"{en.replace(' ', 's*')}\s*\(([^\(\)]+)"
Comment on lines -269 to +265
Function _add_affiliation refactored with the following changes:
- replace a str.format() call with an f-string when building the name regex

m = re.search(namerex, author_line, flags=re.IGNORECASE)
if not m:
return author_entry
@@ -341,21 +337,19 @@ def split_authors(authors: str) -> List:
for bit in aus:
if bit == '':
continue
if bit == '(': # track open parentheses
if bit == '(':
depth += 1
if depth == 1:
blocks.append(c)
c = '('
else:
c = c + bit
elif bit == ')': # track close parentheses
elif bit == ')':
depth -= 1
c = c + bit
if depth == 0:
blocks.append(c)
c = ''
else: # haven't closed, so keep accumulating
continue
Comment on lines -344 to -358
Function split_authors refactored with the following changes:
- remove a redundant else: continue branch in the parenthesis-tracking loop
- merge the strip-then-test sequence into an assignment expression (if name := name.rstrip().lstrip():)
- replace a str.format() call with an f-string when recombining suffixes

This removes the following comments ( why? ):

# track open parentheses
# haven't closed, so keep accumulating
# track close parentheses

else:
c = c + bit
if c:
@@ -373,8 +367,7 @@ def split_authors(authors: str) -> List:
for name in names:
if not name:
continue
name = name.rstrip().lstrip()
if name:
if name := name.rstrip().lstrip():
listx.append(name)

# Recombine suffixes that were separated with a comma
Expand All @@ -386,7 +379,7 @@ def split_authors(authors: str) -> List:
and not re.match(r'\)$', parts[-2]):
separator = parts.pop()
last = parts.pop()
recomb = "{}{} {}".format(last, separator, p)
recomb = f"{last}{separator} {p}"
parts.append(recomb)
else:
parts.append(p)
@@ -429,7 +422,7 @@ def _parse_article_authors(article_author):
try:
return [article_author[0], parse_author_affil_utf(article_author[1])]
except Exception as e:
msg = "Author split failed for article {}".format(article_author[0])
msg = f"Author split failed for article {article_author[0]}"
Comment on lines -432 to +425
Function _parse_article_authors refactored with the following changes:
- replace a str.format() call with an f-string in the error message

logger.error(msg)
logger.exception(e)
return [article_author[0], '']
@@ -455,15 +448,13 @@ def parse_authorline_parallel(article_authors, n_processes=None):
[ author3_keyname, author3_firstnames, author1_suffix ]
]
"""
logger.info(
'Parsing author lines for {} articles...'.format(len(article_authors))
)
logger.info(f'Parsing author lines for {len(article_authors)} articles...')

pool = Pool(n_processes)
parsed = pool.map(_parse_article_authors, article_authors)
outdict = {aid: auth for aid, auth in parsed}
outdict = dict(parsed)

filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz')
logger.info('Saving to {}'.format(filename))
logger.info(f'Saving to {filename}')
Function parse_authorline_parallel refactored with the following changes:
- replace str.format() calls with f-strings
- convert the identity dict comprehension over (key, value) pairs into a dict() call

with gzip.open(filename, 'wb') as fout:
fout.write(json.dumps(outdict).encode('utf-8'))
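Several of the authors.py changes lean on the same Python 3.8+ feature, the assignment expression. Below is a self-contained sketch of the pattern as applied in _enum_collaboration_at_end, using a made-up affiliation string rather than real arXiv data.

import re

# Illustrative input, not repo data: enumerated affiliations as they
# appear at the end of an author line.
affils = '1) Univ A (2) Univ B (3) Univ C'

enumaffils = {}
for affil in affils.split('('):
    # bind the regex match and test it in a single step (walrus operator)
    if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil):
        enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2))

print(enumaffils)  # {'1': 'Univ A', '2': 'Univ B', '3': 'Univ C'}

The same construct replaces the strip-then-test sequence in split_authors; the trade-off is that the module now requires Python 3.8 or newer.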
19 changes: 9 additions & 10 deletions arxiv_public_data/config.py
@@ -22,18 +22,17 @@ def get_outdir():
"""
if os.environ.get(KEY):
out = os.environ.get(KEY)
else:
if os.path.exists(JSONFILE):
js = json.load(open(JSONFILE))
if not KEY in js:
logger.warn('Configuration in "{}" invalid, using default'.format(JSONFILE))
logger.warn("default output directory is {}".format(DEFAULT_PATH))
out = DEFAULT_PATH
else:
out = js[KEY]
elif os.path.exists(JSONFILE):
js = json.load(open(JSONFILE))
if KEY in js:
out = js[KEY]
else:
logger.warn("default output directory is {}".format(DEFAULT_PATH))
logger.warn(f'Configuration in "{JSONFILE}" invalid, using default')
logger.warn(f"default output directory is {DEFAULT_PATH}")
out = DEFAULT_PATH
else:
logger.warn(f"default output directory is {DEFAULT_PATH}")
out = DEFAULT_PATH
Comment on lines +25 to +35
Function get_outdir refactored with the following changes:
- flatten the nested if/else into an if/elif/else chain
- test KEY in js directly instead of not KEY in js
- replace str.format() calls with f-strings in the warnings

return out

try:
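The restructured get_outdir keeps the same precedence, just flattened: an environment variable wins, then the JSON config file, then a built-in default. A minimal sketch of that lookup order follows; KEY, JSONFILE and DEFAULT_PATH are illustrative values, not the module's real constants.

import json
import os

KEY = 'ARXIV_DATA_OUTDIR'                 # illustrative names/values only
JSONFILE = 'config.json'
DEFAULT_PATH = os.path.join(os.path.expanduser('~'), 'arxiv-data')

def get_outdir():
    # 1. an environment variable overrides everything
    if os.environ.get(KEY):
        return os.environ[KEY]
    # 2. otherwise fall back to the JSON config file, if present and valid
    if os.path.exists(JSONFILE):
        with open(JSONFILE) as f:
            js = json.load(f)
        if KEY in js:
            return js[KEY]
    # 3. otherwise use the built-in default
    return DEFAULT_PATH

print(get_outdir())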
6 changes: 3 additions & 3 deletions arxiv_public_data/embeddings/tf_hub.py
@@ -61,7 +61,7 @@ def elmo_strings(batches, filename, batchsize=32):

for i, batch in enumerate(batches):
# grab mean-pooling of contextualized word reps
logger.info("Computing/saving batch {}".format(i))
logger.info(f"Computing/saving batch {i}")
Function elmo_strings refactored with the following changes:
- replace a str.format() call with an f-string in the log message

with open(filename, 'ab') as fout:
pickle.dump(sess.run(
embeddings, feed_dict={text_input: batch}
@@ -125,7 +125,7 @@ def universal_sentence_encoder_lite(batches, filename, spm_path, batchsize=32):
sess.run(init_op)
for i, batch in enumerate(batches):
values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, batch)
logger.info("Computing/saving batch {}".format(i))
logger.info(f"Computing/saving batch {i}")
Function universal_sentence_encoder_lite refactored with the following changes:
- replace a str.format() call with an f-string in the log message

emb = sess.run(
embeddings,
feed_dict={
@@ -180,6 +180,6 @@ def create_save_embeddings(batches, filename, encoder, headers=[], encoder_args=
for h in headers:
pickle.dump(h, fout)

logger.info("Saving embeddings to {}".format(savename))
logger.info(f"Saving embeddings to {savename}")
Function create_save_embeddings refactored with the following changes:
- replace a str.format() call with an f-string in the log message

encoder(batches, savename, *encoder_args,
**encoder_kwargs)
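All three embedding helpers in this file share one persistence scheme: each batch is pickled onto the end of a single file, so reading it back means calling pickle.load() until EOF. A small sketch of that write/read round trip, with made-up batch data and none of the TensorFlow Hub machinery:

import pickle

batches = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]   # stand-in embedding batches
filename = 'embeddings.pkl'

# write: one pickle record per batch, appended to the same file
with open(filename, 'wb') as fout:
    for batch in batches:
        pickle.dump(batch, fout)

# read: keep loading records until the file is exhausted
records = []
with open(filename, 'rb') as fin:
    while True:
        try:
            records.append(pickle.load(fin))
        except EOFError:
            break

assert records == batches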
2 changes: 1 addition & 1 deletion arxiv_public_data/embeddings/util.py
@@ -39,7 +39,7 @@ def id_to_pathname(aid):
"""
if '.' in aid: # new style ArXiv ID
yymm = aid.split('.')[0]
return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')
return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, f'{aid}.txt')
Function id_to_pathname refactored with the following changes:
- replace string concatenation with an f-string when building the .txt filename


# old style ArXiv ID
cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
29 changes: 13 additions & 16 deletions arxiv_public_data/fulltext.py
@@ -24,7 +24,7 @@
def reextension(filename: str, extension: str) -> str:
""" Give a filename a new extension """
name, _ = os.path.splitext(filename)
return '{}.{}'.format(name, extension)
return f'{name}.{extension}'
Function reextension refactored with the following changes:
- replace a str.format() call with an f-string



def average_word_length(txt):
@@ -43,8 +43,7 @@ def average_word_length(txt):
#txt = re.subn(RE_REPEATS, '', txt)[0]
nw = len(txt.split())
nc = len(txt)
avgw = nc / (nw + 1)
return avgw
return nc / (nw + 1)
Function average_word_length refactored with the following changes:
- inline the avgw variable that was immediately returned



def process_timeout(cmd, timeout):
@@ -71,7 +70,7 @@ def run_pdf2txt(pdffile: str, timelimit: int=TIMELIMIT, options: str=''):
output : str
Full plain text output
"""
log.debug('Running {} on {}'.format(PDF2TXT, pdffile))
log.debug(f'Running {PDF2TXT} on {pdffile}')
Comment on lines -74 to +73
Function run_pdf2txt refactored with the following changes:
- replace a str.format() call with an f-string in the debug log

tmpfile = reextension(pdffile, 'pdf2txt')

cmd = '{cmd} {options} -o "{output}" "{pdf}"'.format(
@@ -101,7 +100,7 @@ def run_pdftotext(pdffile: str, timelimit: int = TIMELIMIT) -> str:
output : str
Full plain text output
"""
log.debug('Running {} on {}'.format(PDFTOTEXT, pdffile))
log.debug(f'Running {PDFTOTEXT} on {pdffile}')
Comment on lines -104 to +103
Function run_pdftotext refactored with the following changes:
- replace a str.format() call with an f-string in the debug log

tmpfile = reextension(pdffile, 'pdftotxt')

cmd = '{cmd} "{pdf}" "{output}"'.format(
@@ -161,7 +160,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT):
raise FileNotFoundError(pdffile)

if os.stat(pdffile).st_size == 0: # file is empty
raise RuntimeError('"{}" is an empty file'.format(pdffile))
raise RuntimeError(f'"{pdffile}" is an empty file')
Comment on lines -164 to +163
Function fulltext refactored with the following changes:
- replace str.format() calls with f-strings in the error messages


try:
output = run_pdftotext(pdffile, timelimit=timelimit)
Expand All @@ -188,9 +187,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT):
wordlength = average_word_length(output)

if wordlength > 45:
raise RuntimeError(
'No accurate text could be extracted from "{}"'.format(pdffile)
)
raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"')

try:
os.remove(reextension(pdffile, 'pdftotxt')) # remove the tempfile
@@ -255,8 +252,8 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT):
globber = os.path.join(path, '*.pdf')
pdffiles = sorted_files(globber)

log.info('Searching "{}"...'.format(globber))
log.info('Found: {} pdfs'.format(len(pdffiles)))
log.info(f'Searching "{globber}"...')
log.info(f'Found: {len(pdffiles)} pdfs')
Comment on lines -258 to +256
Function convert_directory refactored with the following changes:
- replace str.format() calls with f-strings in the log messages


for pdffile in pdffiles:
txtfile = reextension(pdffile, 'txt')
Expand All @@ -271,7 +268,7 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT):
with open(txtfile, 'w') as f:
f.write(text)
except Exception as e:
log.error("Conversion failed for '{}'".format(pdffile))
log.error(f"Conversion failed for '{pdffile}'")
log.exception(e)
continue

@@ -297,8 +294,8 @@ def convert_directory_parallel(path: str, processes: int, timelimit: int = TIMEL
globber = os.path.join(path, '**/*.pdf') # search expression for glob.glob
pdffiles = sorted_files(globber) # a list of path

log.info('Searching "{}"...'.format(globber))
log.info('Found: {} pdfs'.format(len(pdffiles)))
log.info(f'Searching "{globber}"...')
log.info(f'Found: {len(pdffiles)} pdfs')
Comment on lines -300 to +298
Function convert_directory_parallel refactored with the following changes:
- replace str.format() calls with f-strings in the log messages


pool = Pool(processes=processes)
result = pool.map(partial(convert_safe, timelimit=timelimit), pdffiles)
Expand All @@ -311,7 +308,7 @@ def convert_safe(pdffile: str, timelimit: int = TIMELIMIT):
try:
convert(pdffile, timelimit=timelimit)
except Exception as e:
log.error('File conversion failed for {}: {}'.format(pdffile, e))
log.error(f'File conversion failed for {pdffile}: {e}')
Comment on lines -314 to +311
Function convert_safe refactored with the following changes:
- replace a str.format() call with an f-string in the error log



def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str:
Expand All @@ -332,7 +329,7 @@ def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str:
Location of text file.
"""
if not os.path.exists(path):
raise RuntimeError('No such path: %s' % path)
raise RuntimeError(f'No such path: {path}')
Comment on lines -335 to +332
Function convert refactored with the following changes:
- replace %-style string formatting with an f-string in the error message

outpath = reextension(path, 'txt')

if os.path.exists(outpath):
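Beyond the f-string swaps, the fulltext.py hunks show the shape of the extraction pipeline: run an external converter, fall back to a second one, then reject output whose average word length suggests garbled text. A rough sketch of what that flow could look like is below; the fallback ordering is an assumption (the diff only shows fragments of fulltext()), and the tool wrappers are stubs rather than the real pdftotext/pdf2txt calls.

import os

TIMELIMIT = 120                      # illustrative value

def run_pdftotext(pdffile, timelimit=TIMELIMIT):
    raise RuntimeError('pretend pdftotext failed')    # stub, not the real wrapper

def run_pdf2txt(pdffile, timelimit=TIMELIMIT):
    return 'recovered plain text from ' + pdffile     # stub, not the real wrapper

def average_word_length(txt):
    # characters per word; very long "words" usually mean garbled extraction
    return len(txt) / (len(txt.split()) + 1)

def fulltext(pdffile, timelimit=TIMELIMIT):
    if not os.path.exists(pdffile):
        raise FileNotFoundError(pdffile)
    if os.stat(pdffile).st_size == 0:
        raise RuntimeError(f'"{pdffile}" is an empty file')
    try:
        output = run_pdftotext(pdffile, timelimit=timelimit)
    except Exception:
        output = run_pdf2txt(pdffile, timelimit=timelimit)   # fallback extractor
    if average_word_length(output) > 45:
        raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"')
    return output

# fulltext('paper.pdf')  # would raise FileNotFoundError unless paper.pdf exists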