-
Notifications
You must be signed in to change notification settings - Fork 7
Sourcery Starbot ⭐ refactored sidphbot/Auto-Research #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -160,7 +160,7 @@ def _parse_author_affil_split(author_line: str) -> Dict: | |
| 2), match.group(3), match.group(4)) | ||
| author_entry = [s, match.group(1), ''] | ||
| elif mtype == 'name-prefix-name': | ||
| s = '{} {}'.format(match.group(2), match.group(3)) | ||
| s = f'{match.group(2)} {match.group(3)}' | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| author_entry = [s, match.group(1), ''] | ||
| elif mtype == 'name-name-prefix': | ||
| author_entry = [match.group(2), match.group(1), match.group(3)] | ||
|
|
@@ -197,9 +197,8 @@ def _remove_double_commas(items: List[str]) -> List[str]: | |
| for pt in items: | ||
| if pt == ',' and last == ',': | ||
| continue | ||
| else: | ||
| parts.append(pt) | ||
| last = pt | ||
| parts.append(pt) | ||
| last = pt | ||
|
Comment on lines
-200
to
+201
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| return parts | ||
|
|
||
|
|
||
|
|
@@ -210,13 +209,12 @@ def _tidy_name(name: str) -> str: | |
| return name | ||
|
|
||
|
|
||
| def _collaboration_at_start(names: List[str]) \ | ||
| -> Tuple[List[str], List[List[str]], int]: | ||
| def _collaboration_at_start(names: List[str]) -> Tuple[List[str], List[List[str]], int]: | ||
| """Perform special handling of collaboration at start.""" | ||
| author_list = [] | ||
|
|
||
| back_propagate_affiliations_to = 0 | ||
| while len(names) > 0: | ||
| while names: | ||
|
Comment on lines
-213
to
+217
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))', | ||
| names[0], flags=re.IGNORECASE) | ||
| if not m: | ||
|
|
@@ -228,13 +226,13 @@ def _collaboration_at_start(names: List[str]) \ | |
| # Remove from names | ||
| names.pop(0) | ||
| # Also swallow and following comma or colon | ||
| if names and (names[0] == ',' or names[0] == ':'): | ||
| if names and names[0] in [',', ':']: | ||
| names.pop(0) | ||
|
|
||
| return names, author_list, back_propagate_affiliations_to | ||
|
|
||
|
|
||
| def _enum_collaboration_at_end(author_line: str)->Dict: | ||
| def _enum_collaboration_at_end(author_line: str) -> Dict: | ||
|
Comment on lines
-237
to
+235
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ): |
||
| """Get separate set of enumerated affiliations from end of author_line.""" | ||
| # Now see if we have a separate set of enumerated affiliations | ||
| # This is indicated by finding '(\s*(' | ||
|
|
@@ -247,9 +245,7 @@ def _enum_collaboration_at_end(author_line: str)->Dict: | |
|
|
||
| # Now expect to have '1) affil1 (2) affil2 (3) affil3' | ||
| for affil in affils.split('('): | ||
| # Now expect `1) affil1 ', discard if no match | ||
| m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil) | ||
| if m: | ||
| if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil): | ||
| enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2)) | ||
|
|
||
| return enumaffils | ||
|
|
@@ -266,7 +262,7 @@ def _add_affiliation(author_line: str, | |
| Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2) | ||
| """ | ||
| en = re.escape(name) | ||
| namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*')) | ||
| namerex = f"{en.replace(' ', 's*')}\s*\(([^\(\)]+)" | ||
|
Comment on lines
-269
to
+265
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| m = re.search(namerex, author_line, flags=re.IGNORECASE) | ||
| if not m: | ||
| return author_entry | ||
|
|
@@ -341,21 +337,19 @@ def split_authors(authors: str) -> List: | |
| for bit in aus: | ||
| if bit == '': | ||
| continue | ||
| if bit == '(': # track open parentheses | ||
| if bit == '(': | ||
| depth += 1 | ||
| if depth == 1: | ||
| blocks.append(c) | ||
| c = '(' | ||
| else: | ||
| c = c + bit | ||
| elif bit == ')': # track close parentheses | ||
| elif bit == ')': | ||
| depth -= 1 | ||
| c = c + bit | ||
| if depth == 0: | ||
| blocks.append(c) | ||
| c = '' | ||
| else: # haven't closed, so keep accumulating | ||
| continue | ||
|
Comment on lines
-344
to
-358
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ): |
||
| else: | ||
| c = c + bit | ||
| if c: | ||
|
|
@@ -373,8 +367,7 @@ def split_authors(authors: str) -> List: | |
| for name in names: | ||
| if not name: | ||
| continue | ||
| name = name.rstrip().lstrip() | ||
| if name: | ||
| if name := name.rstrip().lstrip(): | ||
| listx.append(name) | ||
|
|
||
| # Recombine suffixes that were separated with a comma | ||
|
|
@@ -386,7 +379,7 @@ def split_authors(authors: str) -> List: | |
| and not re.match(r'\)$', parts[-2]): | ||
| separator = parts.pop() | ||
| last = parts.pop() | ||
| recomb = "{}{} {}".format(last, separator, p) | ||
| recomb = f"{last}{separator} {p}" | ||
| parts.append(recomb) | ||
| else: | ||
| parts.append(p) | ||
|
|
@@ -429,7 +422,7 @@ def _parse_article_authors(article_author): | |
| try: | ||
| return [article_author[0], parse_author_affil_utf(article_author[1])] | ||
| except Exception as e: | ||
| msg = "Author split failed for article {}".format(article_author[0]) | ||
| msg = f"Author split failed for article {article_author[0]}" | ||
|
Comment on lines
-432
to
+425
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| logger.error(msg) | ||
| logger.exception(e) | ||
| return [article_author[0], ''] | ||
|
|
@@ -455,15 +448,13 @@ def parse_authorline_parallel(article_authors, n_processes=None): | |
| [ author3_keyname, author3_firstnames, author1_suffix ] | ||
| ] | ||
| """ | ||
| logger.info( | ||
| 'Parsing author lines for {} articles...'.format(len(article_authors)) | ||
| ) | ||
| logger.info(f'Parsing author lines for {len(article_authors)} articles...') | ||
|
|
||
| pool = Pool(n_processes) | ||
| parsed = pool.map(_parse_article_authors, article_authors) | ||
| outdict = {aid: auth for aid, auth in parsed} | ||
| outdict = dict(parsed) | ||
|
|
||
| filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz') | ||
| logger.info('Saving to {}'.format(filename)) | ||
| logger.info(f'Saving to {filename}') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| with gzip.open(filename, 'wb') as fout: | ||
| fout.write(json.dumps(outdict).encode('utf-8')) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,18 +22,17 @@ def get_outdir(): | |
| """ | ||
| if os.environ.get(KEY): | ||
| out = os.environ.get(KEY) | ||
| else: | ||
| if os.path.exists(JSONFILE): | ||
| js = json.load(open(JSONFILE)) | ||
| if not KEY in js: | ||
| logger.warn('Configuration in "{}" invalid, using default'.format(JSONFILE)) | ||
| logger.warn("default output directory is {}".format(DEFAULT_PATH)) | ||
| out = DEFAULT_PATH | ||
| else: | ||
| out = js[KEY] | ||
| elif os.path.exists(JSONFILE): | ||
| js = json.load(open(JSONFILE)) | ||
| if KEY in js: | ||
| out = js[KEY] | ||
| else: | ||
| logger.warn("default output directory is {}".format(DEFAULT_PATH)) | ||
| logger.warn(f'Configuration in "{JSONFILE}" invalid, using default') | ||
| logger.warn(f"default output directory is {DEFAULT_PATH}") | ||
| out = DEFAULT_PATH | ||
| else: | ||
| logger.warn(f"default output directory is {DEFAULT_PATH}") | ||
| out = DEFAULT_PATH | ||
|
Comment on lines
+25
to
+35
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| return out | ||
|
|
||
| try: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,7 +61,7 @@ def elmo_strings(batches, filename, batchsize=32): | |
|
|
||
| for i, batch in enumerate(batches): | ||
| # grab mean-pooling of contextualized word reps | ||
| logger.info("Computing/saving batch {}".format(i)) | ||
| logger.info(f"Computing/saving batch {i}") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| with open(filename, 'ab') as fout: | ||
| pickle.dump(sess.run( | ||
| embeddings, feed_dict={text_input: batch} | ||
|
|
@@ -125,7 +125,7 @@ def universal_sentence_encoder_lite(batches, filename, spm_path, batchsize=32): | |
| sess.run(init_op) | ||
| for i, batch in enumerate(batches): | ||
| values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, batch) | ||
| logger.info("Computing/saving batch {}".format(i)) | ||
| logger.info(f"Computing/saving batch {i}") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| emb = sess.run( | ||
| embeddings, | ||
| feed_dict={ | ||
|
|
@@ -180,6 +180,6 @@ def create_save_embeddings(batches, filename, encoder, headers=[], encoder_args= | |
| for h in headers: | ||
| pickle.dump(h, fout) | ||
|
|
||
| logger.info("Saving embeddings to {}".format(savename)) | ||
| logger.info(f"Saving embeddings to {savename}") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| encoder(batches, savename, *encoder_args, | ||
| **encoder_kwargs) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,7 @@ def id_to_pathname(aid): | |
| """ | ||
| if '.' in aid: # new style ArXiv ID | ||
| yymm = aid.split('.')[0] | ||
| return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt') | ||
| return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, f'{aid}.txt') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| # old style ArXiv ID | ||
| cat, arxiv_id = re.split(r'(\d+)', aid)[:2] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,7 @@ | |
| def reextension(filename: str, extension: str) -> str: | ||
| """ Give a filename a new extension """ | ||
| name, _ = os.path.splitext(filename) | ||
| return '{}.{}'.format(name, extension) | ||
| return f'{name}.{extension}' | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
|
|
||
| def average_word_length(txt): | ||
|
|
@@ -43,8 +43,7 @@ def average_word_length(txt): | |
| #txt = re.subn(RE_REPEATS, '', txt)[0] | ||
| nw = len(txt.split()) | ||
| nc = len(txt) | ||
| avgw = nc / (nw + 1) | ||
| return avgw | ||
| return nc / (nw + 1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
|
|
||
| def process_timeout(cmd, timeout): | ||
|
|
@@ -71,7 +70,7 @@ def run_pdf2txt(pdffile: str, timelimit: int=TIMELIMIT, options: str=''): | |
| output : str | ||
| Full plain text output | ||
| """ | ||
| log.debug('Running {} on {}'.format(PDF2TXT, pdffile)) | ||
| log.debug(f'Running {PDF2TXT} on {pdffile}') | ||
|
Comment on lines
-74
to
+73
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| tmpfile = reextension(pdffile, 'pdf2txt') | ||
|
|
||
| cmd = '{cmd} {options} -o "{output}" "{pdf}"'.format( | ||
|
|
@@ -101,7 +100,7 @@ def run_pdftotext(pdffile: str, timelimit: int = TIMELIMIT) -> str: | |
| output : str | ||
| Full plain text output | ||
| """ | ||
| log.debug('Running {} on {}'.format(PDFTOTEXT, pdffile)) | ||
| log.debug(f'Running {PDFTOTEXT} on {pdffile}') | ||
|
Comment on lines
-104
to
+103
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| tmpfile = reextension(pdffile, 'pdftotxt') | ||
|
|
||
| cmd = '{cmd} "{pdf}" "{output}"'.format( | ||
|
|
@@ -161,7 +160,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT): | |
| raise FileNotFoundError(pdffile) | ||
|
|
||
| if os.stat(pdffile).st_size == 0: # file is empty | ||
| raise RuntimeError('"{}" is an empty file'.format(pdffile)) | ||
| raise RuntimeError(f'"{pdffile}" is an empty file') | ||
|
Comment on lines
-164
to
+163
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| try: | ||
| output = run_pdftotext(pdffile, timelimit=timelimit) | ||
|
|
@@ -188,9 +187,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT): | |
| wordlength = average_word_length(output) | ||
|
|
||
| if wordlength > 45: | ||
| raise RuntimeError( | ||
| 'No accurate text could be extracted from "{}"'.format(pdffile) | ||
| ) | ||
| raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"') | ||
|
|
||
| try: | ||
| os.remove(reextension(pdffile, 'pdftotxt')) # remove the tempfile | ||
|
|
@@ -255,8 +252,8 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT): | |
| globber = os.path.join(path, '*.pdf') | ||
| pdffiles = sorted_files(globber) | ||
|
|
||
| log.info('Searching "{}"...'.format(globber)) | ||
| log.info('Found: {} pdfs'.format(len(pdffiles))) | ||
| log.info(f'Searching "{globber}"...') | ||
| log.info(f'Found: {len(pdffiles)} pdfs') | ||
|
Comment on lines
-258
to
+256
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| for pdffile in pdffiles: | ||
| txtfile = reextension(pdffile, 'txt') | ||
|
|
@@ -271,7 +268,7 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT): | |
| with open(txtfile, 'w') as f: | ||
| f.write(text) | ||
| except Exception as e: | ||
| log.error("Conversion failed for '{}'".format(pdffile)) | ||
| log.error(f"Conversion failed for '{pdffile}'") | ||
| log.exception(e) | ||
| continue | ||
|
|
||
|
|
@@ -297,8 +294,8 @@ def convert_directory_parallel(path: str, processes: int, timelimit: int = TIMEL | |
| globber = os.path.join(path, '**/*.pdf') # search expression for glob.glob | ||
| pdffiles = sorted_files(globber) # a list of path | ||
|
|
||
| log.info('Searching "{}"...'.format(globber)) | ||
| log.info('Found: {} pdfs'.format(len(pdffiles))) | ||
| log.info(f'Searching "{globber}"...') | ||
| log.info(f'Found: {len(pdffiles)} pdfs') | ||
|
Comment on lines
-300
to
+298
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| pool = Pool(processes=processes) | ||
| result = pool.map(partial(convert_safe, timelimit=timelimit), pdffiles) | ||
|
|
@@ -311,7 +308,7 @@ def convert_safe(pdffile: str, timelimit: int = TIMELIMIT): | |
| try: | ||
| convert(pdffile, timelimit=timelimit) | ||
| except Exception as e: | ||
| log.error('File conversion failed for {}: {}'.format(pdffile, e)) | ||
| log.error(f'File conversion failed for {pdffile}: {e}') | ||
|
Comment on lines
-314
to
+311
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
|
|
||
| def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str: | ||
|
|
@@ -332,7 +329,7 @@ def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str: | |
| Location of text file. | ||
| """ | ||
| if not os.path.exists(path): | ||
| raise RuntimeError('No such path: %s' % path) | ||
| raise RuntimeError(f'No such path: {path}') | ||
|
Comment on lines
-335
to
+332
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function
|
||
| outpath = reextension(path, 'txt') | ||
|
|
||
| if os.path.exists(outpath): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lines
76-101 refactored with the following changes: simplify-dictionary-update)