From ba62e988fbfc23f43d02261f1436471fbb165096 Mon Sep 17 00:00:00 2001 From: Sourcery AI Date: Sun, 10 Dec 2023 16:34:39 +0000 Subject: [PATCH] 'Refactored by Sourcery' --- app.py | 54 +-- arxiv_public_data/authors.py | 43 +-- arxiv_public_data/config.py | 19 +- arxiv_public_data/embeddings/tf_hub.py | 6 +- arxiv_public_data/embeddings/util.py | 2 +- arxiv_public_data/fulltext.py | 29 +- arxiv_public_data/internal_citations.py | 13 +- arxiv_public_data/oai_metadata.py | 45 +-- arxiv_public_data/pdfstamp.py | 27 +- arxiv_public_data/regex_arxiv.py | 99 ++--- arxiv_public_data/s3_bulk_download.py | 49 ++- arxiv_public_data/slice_pdfs.py | 16 +- src/Surveyor.py | 489 ++++++++++++------------ 13 files changed, 424 insertions(+), 467 deletions(-) diff --git a/app.py b/app.py index 9c318fe..9fd492a 100644 --- a/app.py +++ b/app.py @@ -73,30 +73,32 @@ class ArxivIDsModel(BaseModel): ) if __name__ == '__main__': - st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always') - st.title('Auto-Research') - st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections' - '(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).') - st.write('##### Data Provider: arXiv Open Archive Initiative OAI') - st.write('##### GitHub: https://github.com/sidphbot/Auto-Research') - download_placeholder = st.container() - - with st.sidebar.form(key="survey_keywords_form"): - session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel) - st.write('or') - session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel)) - submit = st.form_submit_button(label="Submit") - st.sidebar.write('#### execution log:') - - run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write), - 'download_placeholder':download_placeholder} - if submit: - if session_data['research_keywords'] != '': - run_kwargs.update({'research_keywords':session_data['research_keywords'], - 'max_search':session_data['max_search'], - 'num_papers':session_data['num_papers']}) - elif session_data['arxiv_ids'] != '': - run_kwargs.update({'arxiv_ids':[id.strip() for id in session_data['arxiv_ids'].split(',')]}) - - run_survey(**run_kwargs) + st.sidebar.image(Image.open('logo_landscape.png'), use_column_width = 'always') + st.title('Auto-Research') + st.write('#### A no-code utility to generate a detailed well-cited survey with topic clustered sections' + '(draft paper format) and other interesting artifacts from a single research query or a curated set of papers(arxiv ids).') + st.write('##### Data Provider: arXiv Open Archive Initiative OAI') + st.write('##### GitHub: https://github.com/sidphbot/Auto-Research') + download_placeholder = st.container() + + with st.sidebar.form(key="survey_keywords_form"): + session_data = sp.pydantic_input(key="keywords_input_model", model=KeywordsModel) + st.write('or') + session_data.update(sp.pydantic_input(key="arxiv_ids_input_model", model=ArxivIDsModel)) + submit = st.form_submit_button(label="Submit") + st.sidebar.write('#### execution log:') + + run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write), + 'download_placeholder':download_placeholder} + if submit: + if session_data['research_keywords'] != '': + run_kwargs.update({'research_keywords':session_data['research_keywords'], + 'max_search':session_data['max_search'], + 'num_papers':session_data['num_papers']}) + 
elif session_data['arxiv_ids'] != '': + run_kwargs['arxiv_ids'] = [ + id.strip() for id in session_data['arxiv_ids'].split(',') + ] + + run_survey(**run_kwargs) diff --git a/arxiv_public_data/authors.py b/arxiv_public_data/authors.py index 955f044..a8b7a23 100644 --- a/arxiv_public_data/authors.py +++ b/arxiv_public_data/authors.py @@ -160,7 +160,7 @@ def _parse_author_affil_split(author_line: str) -> Dict: 2), match.group(3), match.group(4)) author_entry = [s, match.group(1), ''] elif mtype == 'name-prefix-name': - s = '{} {}'.format(match.group(2), match.group(3)) + s = f'{match.group(2)} {match.group(3)}' author_entry = [s, match.group(1), ''] elif mtype == 'name-name-prefix': author_entry = [match.group(2), match.group(1), match.group(3)] @@ -197,9 +197,8 @@ def _remove_double_commas(items: List[str]) -> List[str]: for pt in items: if pt == ',' and last == ',': continue - else: - parts.append(pt) - last = pt + parts.append(pt) + last = pt return parts @@ -210,13 +209,12 @@ def _tidy_name(name: str) -> str: return name -def _collaboration_at_start(names: List[str]) \ - -> Tuple[List[str], List[List[str]], int]: +def _collaboration_at_start(names: List[str]) -> Tuple[List[str], List[List[str]], int]: """Perform special handling of collaboration at start.""" author_list = [] back_propagate_affiliations_to = 0 - while len(names) > 0: + while names: m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))', names[0], flags=re.IGNORECASE) if not m: @@ -228,13 +226,13 @@ def _collaboration_at_start(names: List[str]) \ # Remove from names names.pop(0) # Also swallow and following comma or colon - if names and (names[0] == ',' or names[0] == ':'): + if names and names[0] in [',', ':']: names.pop(0) return names, author_list, back_propagate_affiliations_to -def _enum_collaboration_at_end(author_line: str)->Dict: +def _enum_collaboration_at_end(author_line: str) -> Dict: """Get separate set of enumerated affiliations from end of author_line.""" # Now see if we have a separate set of enumerated affiliations # This is indicated by finding '(\s*(' @@ -247,9 +245,7 @@ def _enum_collaboration_at_end(author_line: str)->Dict: # Now expect to have '1) affil1 (2) affil2 (3) affil3' for affil in affils.split('('): - # Now expect `1) affil1 ', discard if no match - m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil) - if m: + if m := re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil): enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2)) return enumaffils @@ -266,7 +262,7 @@ def _add_affiliation(author_line: str, Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2) """ en = re.escape(name) - namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*')) + namerex = f"{en.replace(' ', 's*')}\s*\(([^\(\)]+)" m = re.search(namerex, author_line, flags=re.IGNORECASE) if not m: return author_entry @@ -341,21 +337,19 @@ def split_authors(authors: str) -> List: for bit in aus: if bit == '': continue - if bit == '(': # track open parentheses + if bit == '(': depth += 1 if depth == 1: blocks.append(c) c = '(' else: c = c + bit - elif bit == ')': # track close parentheses + elif bit == ')': depth -= 1 c = c + bit if depth == 0: blocks.append(c) c = '' - else: # haven't closed, so keep accumulating - continue else: c = c + bit if c: @@ -373,8 +367,7 @@ def split_authors(authors: str) -> List: for name in names: if not name: continue - name = name.rstrip().lstrip() - if name: + if name := name.rstrip().lstrip(): listx.append(name) # Recombine suffixes that were separated with a comma @@ -386,7 +379,7 @@ 
def split_authors(authors: str) -> List: and not re.match(r'\)$', parts[-2]): separator = parts.pop() last = parts.pop() - recomb = "{}{} {}".format(last, separator, p) + recomb = f"{last}{separator} {p}" parts.append(recomb) else: parts.append(p) @@ -429,7 +422,7 @@ def _parse_article_authors(article_author): try: return [article_author[0], parse_author_affil_utf(article_author[1])] except Exception as e: - msg = "Author split failed for article {}".format(article_author[0]) + msg = f"Author split failed for article {article_author[0]}" logger.error(msg) logger.exception(e) return [article_author[0], ''] @@ -455,15 +448,13 @@ def parse_authorline_parallel(article_authors, n_processes=None): [ author3_keyname, author3_firstnames, author1_suffix ] ] """ - logger.info( - 'Parsing author lines for {} articles...'.format(len(article_authors)) - ) + logger.info(f'Parsing author lines for {len(article_authors)} articles...') pool = Pool(n_processes) parsed = pool.map(_parse_article_authors, article_authors) - outdict = {aid: auth for aid, auth in parsed} + outdict = dict(parsed) filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz') - logger.info('Saving to {}'.format(filename)) + logger.info(f'Saving to {filename}') with gzip.open(filename, 'wb') as fout: fout.write(json.dumps(outdict).encode('utf-8')) diff --git a/arxiv_public_data/config.py b/arxiv_public_data/config.py index 7cfbd41..ff7ba04 100644 --- a/arxiv_public_data/config.py +++ b/arxiv_public_data/config.py @@ -22,18 +22,17 @@ def get_outdir(): """ if os.environ.get(KEY): out = os.environ.get(KEY) - else: - if os.path.exists(JSONFILE): - js = json.load(open(JSONFILE)) - if not KEY in js: - logger.warn('Configuration in "{}" invalid, using default'.format(JSONFILE)) - logger.warn("default output directory is {}".format(DEFAULT_PATH)) - out = DEFAULT_PATH - else: - out = js[KEY] + elif os.path.exists(JSONFILE): + js = json.load(open(JSONFILE)) + if KEY in js: + out = js[KEY] else: - logger.warn("default output directory is {}".format(DEFAULT_PATH)) + logger.warn(f'Configuration in "{JSONFILE}" invalid, using default') + logger.warn(f"default output directory is {DEFAULT_PATH}") out = DEFAULT_PATH + else: + logger.warn(f"default output directory is {DEFAULT_PATH}") + out = DEFAULT_PATH return out try: diff --git a/arxiv_public_data/embeddings/tf_hub.py b/arxiv_public_data/embeddings/tf_hub.py index bf06d94..4d8e4ea 100644 --- a/arxiv_public_data/embeddings/tf_hub.py +++ b/arxiv_public_data/embeddings/tf_hub.py @@ -61,7 +61,7 @@ def elmo_strings(batches, filename, batchsize=32): for i, batch in enumerate(batches): # grab mean-pooling of contextualized word reps - logger.info("Computing/saving batch {}".format(i)) + logger.info(f"Computing/saving batch {i}") with open(filename, 'ab') as fout: pickle.dump(sess.run( embeddings, feed_dict={text_input: batch} @@ -125,7 +125,7 @@ def universal_sentence_encoder_lite(batches, filename, spm_path, batchsize=32): sess.run(init_op) for i, batch in enumerate(batches): values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, batch) - logger.info("Computing/saving batch {}".format(i)) + logger.info(f"Computing/saving batch {i}") emb = sess.run( embeddings, feed_dict={ @@ -180,6 +180,6 @@ def create_save_embeddings(batches, filename, encoder, headers=[], encoder_args= for h in headers: pickle.dump(h, fout) - logger.info("Saving embeddings to {}".format(savename)) + logger.info(f"Saving embeddings to {savename}") encoder(batches, savename, *encoder_args, **encoder_kwargs) diff --git 
a/arxiv_public_data/embeddings/util.py b/arxiv_public_data/embeddings/util.py index 9b56ffa..5bc9c65 100644 --- a/arxiv_public_data/embeddings/util.py +++ b/arxiv_public_data/embeddings/util.py @@ -39,7 +39,7 @@ def id_to_pathname(aid): """ if '.' in aid: # new style ArXiv ID yymm = aid.split('.')[0] - return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt') + return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, f'{aid}.txt') # old style ArXiv ID cat, arxiv_id = re.split(r'(\d+)', aid)[:2] diff --git a/arxiv_public_data/fulltext.py b/arxiv_public_data/fulltext.py index a147d8a..d5bfadc 100644 --- a/arxiv_public_data/fulltext.py +++ b/arxiv_public_data/fulltext.py @@ -24,7 +24,7 @@ def reextension(filename: str, extension: str) -> str: """ Give a filename a new extension """ name, _ = os.path.splitext(filename) - return '{}.{}'.format(name, extension) + return f'{name}.{extension}' def average_word_length(txt): @@ -43,8 +43,7 @@ def average_word_length(txt): #txt = re.subn(RE_REPEATS, '', txt)[0] nw = len(txt.split()) nc = len(txt) - avgw = nc / (nw + 1) - return avgw + return nc / (nw + 1) def process_timeout(cmd, timeout): @@ -71,7 +70,7 @@ def run_pdf2txt(pdffile: str, timelimit: int=TIMELIMIT, options: str=''): output : str Full plain text output """ - log.debug('Running {} on {}'.format(PDF2TXT, pdffile)) + log.debug(f'Running {PDF2TXT} on {pdffile}') tmpfile = reextension(pdffile, 'pdf2txt') cmd = '{cmd} {options} -o "{output}" "{pdf}"'.format( @@ -101,7 +100,7 @@ def run_pdftotext(pdffile: str, timelimit: int = TIMELIMIT) -> str: output : str Full plain text output """ - log.debug('Running {} on {}'.format(PDFTOTEXT, pdffile)) + log.debug(f'Running {PDFTOTEXT} on {pdffile}') tmpfile = reextension(pdffile, 'pdftotxt') cmd = '{cmd} "{pdf}" "{output}"'.format( @@ -161,7 +160,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT): raise FileNotFoundError(pdffile) if os.stat(pdffile).st_size == 0: # file is empty - raise RuntimeError('"{}" is an empty file'.format(pdffile)) + raise RuntimeError(f'"{pdffile}" is an empty file') try: output = run_pdftotext(pdffile, timelimit=timelimit) @@ -188,9 +187,7 @@ def fulltext(pdffile: str, timelimit: int = TIMELIMIT): wordlength = average_word_length(output) if wordlength > 45: - raise RuntimeError( - 'No accurate text could be extracted from "{}"'.format(pdffile) - ) + raise RuntimeError(f'No accurate text could be extracted from "{pdffile}"') try: os.remove(reextension(pdffile, 'pdftotxt')) # remove the tempfile @@ -255,8 +252,8 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT): globber = os.path.join(path, '*.pdf') pdffiles = sorted_files(globber) - log.info('Searching "{}"...'.format(globber)) - log.info('Found: {} pdfs'.format(len(pdffiles))) + log.info(f'Searching "{globber}"...') + log.info(f'Found: {len(pdffiles)} pdfs') for pdffile in pdffiles: txtfile = reextension(pdffile, 'txt') @@ -271,7 +268,7 @@ def convert_directory(path: str, timelimit: int = TIMELIMIT): with open(txtfile, 'w') as f: f.write(text) except Exception as e: - log.error("Conversion failed for '{}'".format(pdffile)) + log.error(f"Conversion failed for '{pdffile}'") log.exception(e) continue @@ -297,8 +294,8 @@ def convert_directory_parallel(path: str, processes: int, timelimit: int = TIMEL globber = os.path.join(path, '**/*.pdf') # search expression for glob.glob pdffiles = sorted_files(globber) # a list of path - log.info('Searching "{}"...'.format(globber)) - log.info('Found: {} pdfs'.format(len(pdffiles))) + log.info(f'Searching "{globber}"...') + 
log.info(f'Found: {len(pdffiles)} pdfs') pool = Pool(processes=processes) result = pool.map(partial(convert_safe, timelimit=timelimit), pdffiles) @@ -311,7 +308,7 @@ def convert_safe(pdffile: str, timelimit: int = TIMELIMIT): try: convert(pdffile, timelimit=timelimit) except Exception as e: - log.error('File conversion failed for {}: {}'.format(pdffile, e)) + log.error(f'File conversion failed for {pdffile}: {e}') def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str: @@ -332,7 +329,7 @@ def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str: Location of text file. """ if not os.path.exists(path): - raise RuntimeError('No such path: %s' % path) + raise RuntimeError(f'No such path: {path}') outpath = reextension(path, 'txt') if os.path.exists(outpath): diff --git a/arxiv_public_data/internal_citations.py b/arxiv_public_data/internal_citations.py index 3ab715a..5bade7b 100644 --- a/arxiv_public_data/internal_citations.py +++ b/arxiv_public_data/internal_citations.py @@ -33,10 +33,7 @@ def all_articles(directory=DIR_FULLTEXT): directory = os.path.abspath(os.path.expanduser(directory)) for root, dirs, files in os.walk(directory): - for f in files: - if 'txt' in f: - out.append(os.path.join(root, f)) - + out.extend(os.path.join(root, f) for f in files if 'txt' in f) return out def extract_references(filename, pattern=RE_FLEX): @@ -75,12 +72,12 @@ def citation_list_inner(articles): cites = {} for i, article in enumerate(articles): if i > 0 and i % 1000 == 0: - log.info('Completed {} articles'.format(i)) + log.info(f'Completed {i} articles') try: refs = extract_references(article) cites[path_to_id(article)] = refs except: - log.error("Error in {}".format(article)) + log.error(f"Error in {article}") continue return cites @@ -100,7 +97,7 @@ def citation_list_parallel(N=cpu_count(), directory=DIR_FULLTEXT): all arXiv citations in all articles """ articles = all_articles(directory) - log.info('Calculating citation network for {} articles'.format(len(articles))) + log.info(f'Calculating citation network for {len(articles)} articles') pool = Pool(N) @@ -123,6 +120,6 @@ def default_filename(): def save_to_default_location(citations): filename = default_filename() - log.info('Saving to "{}"'.format(filename)) + log.info(f'Saving to "{filename}"') with gzip.open(filename, 'wb') as fn: fn.write(json.dumps(citations).encode('utf-8')) diff --git a/arxiv_public_data/oai_metadata.py b/arxiv_public_data/oai_metadata.py index 3f98716..66baf9b 100644 --- a/arxiv_public_data/oai_metadata.py +++ b/arxiv_public_data/oai_metadata.py @@ -72,27 +72,24 @@ def get_list_record_chunk(resumptionToken=None, harvest_url=URL_ARXIV_OAI, if response.status_code == 200: return response.text - if response.status_code == 503: - secs = int(response.headers.get('Retry-After', 20)) * 1.5 - log.info('Requested to wait, waiting {} seconds until retry...'.format(secs)) - - time.sleep(secs) - return get_list_record_chunk(resumptionToken=resumptionToken) - else: + if response.status_code != 503: raise Exception( - 'Unknown error in HTTP request {}, status code: {}'.format( - response.url, response.status_code - ) + f'Unknown error in HTTP request {response.url}, status code: {response.status_code}' ) + secs = int(response.headers.get('Retry-After', 20)) * 1.5 + log.info(f'Requested to wait, waiting {secs} seconds until retry...') + + time.sleep(secs) + return get_list_record_chunk(resumptionToken=resumptionToken) def _record_element_text(elm, name): """ XML helper function for extracting text 
from leaf (single-node) elements """ - item = elm.find('arXiv:{}'.format(name), OAI_XML_NAMESPACES) + item = elm.find(f'arXiv:{name}', OAI_XML_NAMESPACES) return item.text if item is not None else None def _record_element_all(elm, name): """ XML helper function for extracting text from queries with multiple nodes """ - return elm.findall('arXiv:{}'.format(name), OAI_XML_NAMESPACES) + return elm.findall(f'arXiv:{name}', OAI_XML_NAMESPACES) def parse_record(elm): """ @@ -160,9 +157,7 @@ def check_xml_errors(root): error = root.find('OAI:error', OAI_XML_NAMESPACES) if error is not None: - raise RuntimeError( - 'OAI service returned error: {}'.format(error.text) - ) + raise RuntimeError(f'OAI service returned error: {error.text}') def find_default_locations(): outfile = os.path.join(DIR_BASE, 'arxiv-metadata-oai-*.json.gz') @@ -172,9 +167,7 @@ def find_default_locations(): fn_outfile = sorted(glob.glob(outfile)) fn_resume = sorted(glob.glob(resume)) - if len(fn_outfile) > 0: - return fn_outfile[-1] - return None + return fn_outfile[-1] if len(fn_outfile) > 0 else None def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True): """ @@ -195,28 +188,26 @@ def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True): date = str(datetime.datetime.now()).split(' ')[0] outfile = ( - outfile or # user-supplied - find_default_locations() or # already in progress - os.path.join( - DIR_BASE, 'arxiv-metadata-oai-{}.json.gz'.format(date) - ) # new file + outfile + or find_default_locations() + or os.path.join(DIR_BASE, f'arxiv-metadata-oai-{date}.json.gz') ) directory = os.path.split(outfile)[0] if directory and not os.path.exists(directory): os.makedirs(directory) - tokenfile = '{}-resumptionToken.txt'.format(outfile) + tokenfile = f'{outfile}-resumptionToken.txt' chunk_index = 0 total_records = 0 - log.info('Saving metadata to "{}"'.format(outfile)) + log.info(f'Saving metadata to "{outfile}"') resumptionToken = None if autoresume: try: resumptionToken = open(tokenfile, 'r').read() except Exception as e: - log.warn("No tokenfile found '{}'".format(tokenfile)) + log.warn(f"No tokenfile found '{tokenfile}'") log.info("Starting download from scratch...") while True: @@ -277,6 +268,6 @@ def validate_abstract_hashes(metadata, metadata_no_abstract): """ Validate that abstracts match the hashes """ for m, n in zip(metadata, metadata_no_abstract): md5 = hashlib.md5(m['abstract'].encode()).hexdigest() - if not md5 == n['abstract_md5']: + if md5 != n['abstract_md5']: return False return True diff --git a/arxiv_public_data/pdfstamp.py b/arxiv_public_data/pdfstamp.py index d8ea220..9cab987 100644 --- a/arxiv_public_data/pdfstamp.py +++ b/arxiv_public_data/pdfstamp.py @@ -1,19 +1,14 @@ import re SPACE_DIGIT = r'\s*\d\s*' -SPACE_NUMBER = r'(?:{})+'.format(SPACE_DIGIT) SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*' -SPACE_WORD = r'(?:{})+'.format(SPACE_CHAR) +SPACE_WORD = f'(?:{SPACE_CHAR})+' # old style ID, 7 digits in a row RE_NUM_OLD = SPACE_DIGIT*7 # new style ID, 4 digits, ., 4,5 digits -RE_NUM_NEW = ( - SPACE_DIGIT*4 + - r'\.' + - SPACE_DIGIT*4 + r'(?:{})?'.format(SPACE_DIGIT) -) +RE_NUM_NEW = SPACE_DIGIT * 4 + r'\.' + SPACE_DIGIT * 4 + f'(?:{SPACE_DIGIT})?' # the version part v1 V2 v 1, etc RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?' 
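As a quick illustration of why every fragment above interleaves `\s*` with single characters: the arXiv stamp text that pdftotext recovers often comes back with stray whitespace between characters, and these pieces are evidently written to absorb that. A minimal, self-contained sketch (the `candidate` string below is a hypothetical extraction, not taken from this module):

import re

SPACE_DIGIT = r'\s*\d\s*'
RE_NUM_NEW = SPACE_DIGIT * 4 + r'\.' + SPACE_DIGIT * 4 + f'(?:{SPACE_DIGIT})?'

# A hypothetical pdftotext extraction of a new-style ID with scattered spacing.
candidate = ' 1 6 1 2 . 0 0 0 1 '
print(bool(re.fullmatch(RE_NUM_NEW, candidate)))     # True -- whitespace between digits is tolerated
print(bool(re.fullmatch(RE_NUM_NEW, '1612.00001')))  # True -- a compact ID still matches

The full RE_ARXIV_ID assembled in the next hunk strings these pieces together with the "a r X i v :" prefix, the category brackets, and the date part.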
@@ -22,20 +17,20 @@ RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*' # any words within square brackets [cs.A I] -RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD) +RE_CATEGORIES = f'\[{SPACE_WORD}\]' +SPACE_NUMBER = f'(?:{SPACE_DIGIT})+' # two digit date, month, year "29 Jan 2012" RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){}'.format(SPACE_DIGIT, '{2,4}') # the full identifier for the banner RE_ARXIV_ID = ( - RE_ARXIV + - r'(?:' + - r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) + - r')' + - RE_VERSION + - RE_CATEGORIES + - RE_DATE + f'{RE_ARXIV}(?:' + + f'(?:{RE_NUM_NEW})|(?:{RE_NUM_OLD})' + + r')' + + RE_VERSION + + RE_CATEGORIES + + RE_DATE ) REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID) @@ -51,7 +46,7 @@ def _extract_arxiv_stamp(txt): return txt, '' s, e = match.span() - return '{} {}'.format(txt[:s].strip(), txt[e:].strip()), txt[s:e].strip() + return f'{txt[:s].strip()} {txt[e:].strip()}', txt[s:e].strip() def remove_stamp(txt, split=1000): diff --git a/arxiv_public_data/regex_arxiv.py b/arxiv_public_data/regex_arxiv.py index 2e620fe..76a7a63 100644 --- a/arxiv_public_data/regex_arxiv.py +++ b/arxiv_public_data/regex_arxiv.py @@ -45,19 +45,18 @@ def strip_version(name): def format_cat(name): """ Strip subcategory, add hyphen to category name if missing """ - if '/' in name: # OLD ID, names contains subcategory - catsubcat, aid = name.split('/') - cat = catsubcat.split('.')[0] - return dashdict.get(cat, cat) + "/" + aid - else: + if '/' not in name: return name + catsubcat, aid = name.split('/') + cat = catsubcat.split('.')[0] + return f"{dashdict.get(cat, cat)}/{aid}" def zeropad_1501(name): """ Arxiv IDs after yymm=1501 are padded to 5 zeros """ - if not '/' in name: # new ID + if '/' not in name: # new ID yymm, num = name.split('.') if int(yymm) > 1500 and len(num) < 5: - return yymm + ".0" + num + return f"{yymm}.0{num}" return name def clean(name): @@ -89,10 +88,10 @@ def clean(name): RE_NUM_OLD = RE_DATE + r'(?:\d{3})' + RE_VERSION # matches: 1612.00001 1203.0023v2 -RE_ID_NEW = r'(?:{})'.format(RE_NUM_NEW) +RE_ID_NEW = f'(?:{RE_NUM_NEW})' # matches: hep-th/11030234 cs/0112345v2 cs.AI/0112345v2 -RE_ID_OLD = r'(?:{}/{})'.format(RE_CATEGORIES, RE_NUM_OLD) +RE_ID_OLD = f'(?:{RE_CATEGORIES}/{RE_NUM_OLD})' # ============================================================================= # matches: https://arxiv.org/abs/ abs/ arxiv.org/abs/ @@ -109,60 +108,64 @@ def clean(name): RE_PREFIX_ARXIV = r'(?i:arxiv\s*[:/\s,.]*\s*)' # matches: cs.AI/ cs.AI nucl-th -RE_PREFIX_CATEGORIES = r'(?i:{})'.format(RE_CATEGORIES) +RE_PREFIX_CATEGORIES = f'(?i:{RE_CATEGORIES})' # matches: e-prints: e-print eprints: RE_PREFIX_EPRINT = r'(?i:e[-]?print[s]?.{1,3})' # ============================================================================= # matches simple old or new identifiers, no fancy business -REGEX_ARXIV_SIMPLE = r'(?:{}|{})'.format(RE_ID_OLD, RE_ID_NEW) +REGEX_ARXIV_SIMPLE = f'(?:{RE_ID_OLD}|{RE_ID_NEW})' # this one follows the guide set forth by: # https://arxiv.org/help/arxiv_identifier REGEX_ARXIV_STRICT = ( - r'(?:{})'.format(RE_PREFIX_ARXIV) + - r'(?:' - r'({})'.format(RE_ID_OLD) + - r'|' - r'({})'.format(RE_ID_NEW) + - r')' -) + (f'(?:{RE_PREFIX_ARXIV})' + f'(?:({RE_ID_OLD})') + f'|({RE_ID_NEW})' +) + r')' # this regex essentially accepts anything that looks like an arxiv id and has # the slightest smell of being one as well. that is, if it is an id and # mentions anything about the arxiv before hand, then it is an id. 
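Before the flexible pattern is assembled below, a short usage sketch of the two patterns already defined above. The import path assumes this repository's layout (`arxiv_public_data/regex_arxiv.py`), and the sample strings are illustrative only:

import re
from arxiv_public_data.regex_arxiv import REGEX_ARXIV_SIMPLE, REGEX_ARXIV_STRICT

text = 'see 1612.00001 for details'
print(bool(re.search(REGEX_ARXIV_SIMPLE, text)))                # True  -- a bare new-style ID is enough
print(bool(re.search(REGEX_ARXIV_STRICT, 'arXiv:1612.00001')))  # True  -- the strict form wants the arXiv prefix
print(bool(re.search(REGEX_ARXIV_STRICT, text)))                # False -- no prefix, so the strict pattern rejects it

The flexible pattern defined next accepts the same IDs plus URL, e-print, and bracketed forms, which is why internal_citations.py appears to rely on it (as RE_FLEX) for reference mining.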
REGEX_ARXIV_FLEXIBLE = ( - r'(?:' - r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture - r')|(?:' - r'(?:' - r'(?:{})?'.format(RE_PREFIX_URL) + - r'(?:{})?'.format(RE_PREFIX_EPRINT) + - r'(?:' - r'(?:{})?'.format(RE_PREFIX_ARXIV) + - r'({})'.format(RE_ID_OLD) + # capture - r'|' - r'(?:{})'.format(RE_PREFIX_ARXIV) + - r'(?:{}/)?'.format(RE_CATEGORIES) + - r'({})'.format(RE_ID_NEW) + # capture - r')' - r')' - r'|' - r'(?:' - r'(?:{})|'.format(RE_PREFIX_URL) + - r'(?:{})|'.format(RE_PREFIX_EPRINT) + - r'(?:{})|'.format(RE_PREFIX_CATEGORIES) + - r'(?:{})'.format(RE_PREFIX_ARXIV) + - r')' - r'.*?' - r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture - r')|(?:' - r'(?:[\[\(]\s*)' - r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture - r'(?:\s*[\]\)])' - r')' -) + ( + ( + ( + ( + ( + ( + ( + ( + ( + ( + ( + ( + f'(?:({REGEX_ARXIV_SIMPLE})' + + f')|(?:(?:(?:{RE_PREFIX_URL})?' + ) + + f'(?:{RE_PREFIX_EPRINT})?' + ) + + f'(?:(?:{RE_PREFIX_ARXIV})?' + ) + + f'({RE_ID_OLD})' + ) + + f'|(?:{RE_PREFIX_ARXIV})' + ) + + f'(?:{RE_CATEGORIES}/)?' + ) + + f'({RE_ID_NEW})' + ) + + f'))|(?:(?:{RE_PREFIX_URL})|' + ) + + f'(?:{RE_PREFIX_EPRINT})|' + ) + + f'(?:{RE_PREFIX_CATEGORIES})|' + ) + + f'(?:{RE_PREFIX_ARXIV})' + ) + + f').*?({REGEX_ARXIV_SIMPLE})' + ) + + f')|(?:(?:[\[\(]\s*)({REGEX_ARXIV_SIMPLE})' +) + r'(?:\s*[\]\)])' r')' TEST_POSITIVE = [ 'arXiv:quant-ph 1503.01017v3', diff --git a/arxiv_public_data/s3_bulk_download.py b/arxiv_public_data/s3_bulk_download.py index 12a0524..14293e5 100644 --- a/arxiv_public_data/s3_bulk_download.py +++ b/arxiv_public_data/s3_bulk_download.py @@ -102,17 +102,17 @@ def download_file(filename, outfile, chunk_size=CHUNK_SIZE, redownload=False, } ) if not dryrun: - logger.info('Requesting "{}" (costs money!)'.format(filename)) + logger.info(f'Requesting "{filename}" (costs money!)') request = requests.get(url, stream=True) response_iter = request.iter_content(chunk_size=chunk_size) - logger.info("\t Writing {}".format(outfile)) + logger.info(f"\t Writing {outfile}") with gzip.open(outfile, 'wb') as fout: - for i, chunk in enumerate(response_iter): + for chunk in response_iter: fout.write(chunk) md5.update(chunk) else: - logger.info('Requesting "{}" (free!)'.format(filename)) - logger.info("\t Writing {}".format(outfile)) + logger.info(f'Requesting "{filename}" (free!)') + logger.info(f"\t Writing {outfile}") return md5.hexdigest() def default_manifest_filename(): @@ -159,7 +159,7 @@ def parse_manifest(manifest): ] def _tar_to_filename(filename): - return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz' + return f'{os.path.join(DIR_PDFTARS, os.path.basename(filename))}.gz' def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=False): """ Download filename, check its md5sum, and form the output path """ @@ -170,9 +170,7 @@ def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=Fals if not dryrun: if md5_expected != md5_downloaded: - msg = "MD5 '{}' does not match expected '{}' for file '{}'".format( - md5_downloaded, md5_expected, filename - ) + msg = f"MD5 '{md5_downloaded}' does not match expected '{md5_expected}' for file '{filename}'" raise AssertionError(msg) return outname @@ -195,13 +193,12 @@ def download_check_tarfiles(list_of_fileinfo, dryrun=False): def call(cmd, dryrun=False, debug=False): """ Spawn a subprocess and execute the string in cmd """ - if dryrun: - logger.info(cmd) - return 0 - else: + if not dryrun: return subprocess.check_call( shlex.split(cmd), stderr=None if debug else open(os.devnull, 'w') ) + 
logger.info(cmd) + return 0 def _make_pathname(filename): """ @@ -235,7 +232,7 @@ def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False, outname = _tar_to_filename(filename) if not os.path.exists(outname): - msg = 'Tarfile from manifest not found {}, skipping...'.format(outname) + msg = f'Tarfile from manifest not found {outname}, skipping...' logger.error(msg) return @@ -245,7 +242,7 @@ def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False, cmd = 'tar --one-top-level -C {} -xf {} {}' cmd = cmd.format(DIR_PDFTARS, outname, namelist) else: - cmd = 'tar --one-top-level -C {} -xf {}'.format(DIR_PDFTARS, outname) + cmd = f'tar --one-top-level -C {DIR_PDFTARS} -xf {outname}' _call(cmd, dryrun) basename = os.path.splitext(os.path.basename(filename))[0] @@ -257,18 +254,18 @@ def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False, ) # move txt into final file structure - txtfiles = glob.glob('{}/*.txt'.format(pdfdir)) + txtfiles = glob.glob(f'{pdfdir}/*.txt') for tf in txtfiles: mvfn = _make_pathname(tf) dirname = os.path.dirname(mvfn) if not os.path.exists(dirname): - _call('mkdir -p {}'.format(dirname), dryrun) + _call(f'mkdir -p {dirname}', dryrun) if not dryrun: shutil.move(tf, mvfn) # clean up pdfs - _call('rm -rf {}'.format(os.path.join(DIR_PDFTARS, basename)), dryrun) + _call(f'rm -rf {os.path.join(DIR_PDFTARS, basename)}', dryrun) def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processes=1): """ @@ -295,10 +292,10 @@ def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processe md5sum = fileinfo['md5sum'] if check_if_any_processed(fileinfo): - logger.info('Tar file appears processed, skipping {}...'.format(filename)) + logger.info(f'Tar file appears processed, skipping {filename}...') return - logger.info('Processing tar "{}" ...'.format(filename)) + logger.info(f'Processing tar "{filename}" ...') process_tarfile_inner(filename, pdfnames=None, processes=processes, dryrun=dryrun) def process_manifest_files(list_of_fileinfo, processes=1, dryrun=False): @@ -341,9 +338,9 @@ def generate_tarfile_indices(manifest): for fileinfo in manifest: name = fileinfo['filename'] - logger.info("Indexing {}...".format(name)) + logger.info(f"Indexing {name}...") - tarname = os.path.join(DIR_PDFTARS, os.path.basename(name))+'.gz' + tarname = f'{os.path.join(DIR_PDFTARS, os.path.basename(name))}.gz' files = [i for i in tarfile.open(tarname).getnames() if i.endswith('.pdf')] index[name] = files @@ -356,7 +353,7 @@ def check_missing_txt_files(index): """ missing = defaultdict(list) for tar, pdflist in index.items(): - logger.info("Checking {}...".format(tar)) + logger.info(f"Checking {tar}...") for pdf in pdflist: txt = _make_pathname(pdf).replace('.pdf', '.txt') @@ -371,12 +368,10 @@ def rerun_missing(missing, processes=1): files which are missing from the conversion. There are various reasons that they can fail. 
""" - sort = list(reversed( - sorted([(k, v) for k, v in missing.items()], key=lambda x: len(x[1])) - )) + sort = list(reversed(sorted(list(missing.items()), key=lambda x: len(x[1])))) for tar, names in sort: - logger.info("Running {} ({} to do)...".format(tar, len(names))) + logger.info(f"Running {tar} ({len(names)} to do)...") process_tarfile_inner( tar, pdfnames=names, processes=processes, timelimit=5 * fulltext.TIMELIMIT diff --git a/arxiv_public_data/slice_pdfs.py b/arxiv_public_data/slice_pdfs.py index 1b99f80..199685a 100644 --- a/arxiv_public_data/slice_pdfs.py +++ b/arxiv_public_data/slice_pdfs.py @@ -8,10 +8,10 @@ def id_to_tarpdf(n): if '.' in n: ym = n.split('.')[0] - return '{}/{}.pdf'.format(ym, n) + return f'{ym}/{n}.pdf' else: ym = n.split('/')[1][:4] - return '{}/{}.pdf'.format(ym, n.replace('/', '')) + return f"{ym}/{n.replace('/', '')}.pdf" def _call(cmd, dryrun=False, debug=False): """ Spawn a subprocess and execute the string in cmd """ @@ -20,7 +20,7 @@ def _call(cmd, dryrun=False, debug=False): ) def _tar_to_filename(filename): - return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz' + return f'{os.path.join(DIR_PDFTARS, os.path.basename(filename))}.gz' def extract_files(tarfile, pdfs, outdir): """ @@ -34,9 +34,9 @@ def extract_files(tarfile, pdfs, outdir): tdir = os.path.join(DIR_PDFTARS, basename) outpdfs = ' '.join([os.path.join(tdir, id_to_tarpdf(i)) for i in pdfs]) - cmd0 = 'tar --one-top-level -C {} -xf {} {}'.format(DIR_PDFTARS, outname, namelist) - cmd1 = 'cp -a {} {}'.format(outpdfs, outdir) - cmd2 = 'rm -rf {}'.format(tdir) + cmd0 = f'tar --one-top-level -C {DIR_PDFTARS} -xf {outname} {namelist}' + cmd1 = f'cp -a {outpdfs} {outdir}' + cmd2 = f'rm -rf {tdir}' _call(cmd0) _call(cmd1) @@ -56,9 +56,9 @@ def call_list(ai, manifest): num = 0 for i in ai: aid = i.get('id') - + tar = id_to_tarpdf(aid) - if not tar in inv: + if tar not in inv: continue tars[inv[id_to_tarpdf(aid)]].append(aid) diff --git a/src/Surveyor.py b/src/Surveyor.py index 3d0657e..b3dbcab 100644 --- a/src/Surveyor.py +++ b/src/Surveyor.py @@ -116,7 +116,7 @@ def __init__( self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name, trust_remote_code=True).to(self.torch_device) self.title_model.eval() if not no_save_models: - self.title_model.save_pretrained(models_dir + "/title_model") + self.title_model.save_pretrained(f"{models_dir}/title_model") #self.title_tokenizer.save_pretrained(models_dir + "/title_tokenizer") # summary model @@ -127,10 +127,10 @@ def __init__( self.torch_device) self.summ_model.eval() if not no_save_models: - self.summ_model.save_pretrained(models_dir + "/summ_model") + self.summ_model.save_pretrained(f"{models_dir}/summ_model") #self.summ_tokenizer.save_pretrained(models_dir + "/summ_tokenizer") self.model = Summarizer(custom_model=self.summ_model, custom_tokenizer=self.summ_tokenizer) - + if 'led' in ledmodel_name: self.ledtokenizer = LEDTokenizer.from_pretrained(ledmodel_name) self.ledmodel = LEDForConditionalGeneration.from_pretrained(ledmodel_name).to(self.torch_device) @@ -142,40 +142,49 @@ def __init__( self.ledmodel = BartForConditionalGeneration.from_pretrained(ledmodel_name).to(self.torch_device) self.ledmodel.eval() if not no_save_models: - self.ledmodel.save_pretrained(models_dir + "/ledmodel") + self.ledmodel.save_pretrained(f"{models_dir}/ledmodel") #self.ledtokenizer.save_pretrained(models_dir + "/ledtokenizer") self.embedder = SentenceTransformer(embedder_name) self.embedder.eval() if not no_save_models: - 
self.embedder.save(models_dir + "/embedder") + self.embedder.save(f"{models_dir}/embedder") else: self.print_fn("\n- Initializing from previously saved models at" + models_dir) self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name) - self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device) + self.title_model = AutoModelForSeq2SeqLM.from_pretrained( + f"{models_dir}/title_model" + ).to(self.torch_device) self.title_model.eval() # summary model #self.summ_config = AutoConfig.from_pretrained(ex_summ_model_name) #self.summ_config.output_hidden_states = True self.summ_tokenizer = AutoTokenizer.from_pretrained(ex_summ_model_name) - self.summ_model = AutoModel.from_pretrained(models_dir + "/summ_model").to( - self.torch_device) + self.summ_model = AutoModel.from_pretrained( + f"{models_dir}/summ_model" + ).to(self.torch_device) self.summ_model.eval() self.model = Summarizer(custom_model=self.summ_model, custom_tokenizer=self.summ_tokenizer) if 'led' in ledmodel_name: self.ledtokenizer = LEDTokenizer.from_pretrained(ledmodel_name) - self.ledmodel = LEDForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device) + self.ledmodel = LEDForConditionalGeneration.from_pretrained( + f"{models_dir}/ledmodel" + ).to(self.torch_device) elif 't5' in ledmodel_name: self.ledtokenizer = AutoTokenizer.from_pretrained(ledmodel_name) - self.ledmodel = T5ForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device) + self.ledmodel = T5ForConditionalGeneration.from_pretrained( + f"{models_dir}/ledmodel" + ).to(self.torch_device) elif 'bart' in ledmodel_name: self.ledtokenizer = AutoTokenizer.from_pretrained(ledmodel_name) - self.ledmodel = BartForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device) + self.ledmodel = BartForConditionalGeneration.from_pretrained( + f"{models_dir}/ledmodel" + ).to(self.torch_device) self.ledmodel.eval() - self.embedder = SentenceTransformer(models_dir + "/embedder") + self.embedder = SentenceTransformer(f"{models_dir}/embedder") self.embedder.eval() self.nlp = spacy.load(nlp_name) @@ -185,33 +194,13 @@ def __init__( def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None): - if pdf_dir: - survey_pdf_dir = pdf_dir - else: - survey_pdf_dir = self.DEFAULTS["pdf_dir"] - - if txt_dir: - survey_txt_dir = txt_dir - else: - survey_txt_dir = self.DEFAULTS["txt_dir"] - - if img_dir: - survey_img_dir = img_dir - else: - survey_img_dir = self.DEFAULTS["img_dir"] - - if tab_dir: - survey_tab_dir = tab_dir - else: - survey_tab_dir = self.DEFAULTS["tab_dir"] - - if dump_dir: - survey_dump_dir = dump_dir - else: - survey_dump_dir = self.DEFAULTS["dump_dir"] - + survey_pdf_dir = pdf_dir if pdf_dir else self.DEFAULTS["pdf_dir"] + survey_txt_dir = txt_dir if txt_dir else self.DEFAULTS["txt_dir"] + survey_img_dir = img_dir if img_dir else self.DEFAULTS["img_dir"] + survey_tab_dir = tab_dir if tab_dir else self.DEFAULTS["tab_dir"] + survey_dump_dir = dump_dir if dump_dir else self.DEFAULTS["dump_dir"] dirs = [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir] - if sum([True for dir in dirs if 'arxiv_data/' in dir]): + if sum(True for dir in dirs if 'arxiv_data/' in dir): base = os.path.dirname("arxiv_data/") if not os.path.exists(base): os.mkdir(base) @@ -251,7 +240,7 @@ def pdf_route(self, pdf_dir, txt_dir, img_dir, tab_dir, dump_dir, papers_meta): self.print_fn("\n- Second stage 
paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers])) papers.extend(new_papers) - joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp') + joblib.dump(papers, f'{dump_dir}papers_extracted_pdf_route.dmp') copy_tree(img_dir, dump_dir + os.path.basename(img_dir)) copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir)) @@ -296,7 +285,7 @@ def fetch_papers(self, dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir, rep # plugging citations to our papers object self.print_fn("\n- plugging in citation network.. ") papers, cites = self.cocitation_network(papers, txt_dir) - joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp') + joblib.dump(papers, f'{dump_dir}papers_selected_pdf_route.dmp') from distutils.dir_util import copy_tree copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir)) copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir)) @@ -335,90 +324,89 @@ def build_doc(self, research_sections, papers, query=None, filename='survey.txt' bibentries = [r.bibtex() for r in bibentries] self.print_fn("\n- building final survey file .. at "+ filename) - file = open(filename, 'w+') - if query is None: - query = 'Internal(existing) research' - self.survey_print_fn("#### Generated_survey:") - file.write("----------------------------------------------------------------------") - file.write("Title: A survey on " + query) - self.survey_print_fn("") - self.survey_print_fn("----------------------------------------------------------------------") - self.survey_print_fn("Title: A survey on " + query) - file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)") - self.survey_print_fn("Author: Auto-Research (github.com/sidphbot/Auto-Research)") - file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") - self.survey_print_fn("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") - file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+ - "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ - "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ - "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ - "\nentries(only to avoid LaTex overhead). ") - self.survey_print_fn("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+ - "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ - "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ - "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ - "\nentries(only to avoid LaTex overhead). 
") - file.write("----------------------------------------------------------------------") - self.survey_print_fn("----------------------------------------------------------------------") - file.write("") - self.survey_print_fn("") - file.write('ABSTRACT') - self.survey_print_fn('ABSTRACT') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(research_sections['abstract']) - self.survey_print_fn(research_sections['abstract']) - file.write("") - self.survey_print_fn("") - file.write('INTRODUCTION') - self.survey_print_fn('INTRODUCTION') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(research_sections['introduction']) - self.survey_print_fn(research_sections['introduction']) - file.write("") - self.survey_print_fn("") - for k, v in research_sections.items(): - if k not in ['abstract', 'introduction', 'conclusion']: - file.write(k.upper()) - self.survey_print_fn(k.upper()) - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(v) - self.survey_print_fn(v) - file.write("") - self.survey_print_fn("") - file.write('CONCLUSION') - self.survey_print_fn('CONCLUSION') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - file.write(research_sections['conclusion']) - self.survey_print_fn(research_sections['conclusion']) - file.write("") - self.survey_print_fn("") - - file.write('REFERENCES') - self.survey_print_fn('REFERENCES') - self.survey_print_fn("=================================================") - file.write("=================================================") - file.write("") - self.survey_print_fn("") - for entry in bibentries: - file.write(entry) - self.survey_print_fn(entry) + with open(filename, 'w+') as file: + if query is None: + query = 'Internal(existing) research' + self.survey_print_fn("#### Generated_survey:") + file.write("----------------------------------------------------------------------") + file.write(f"Title: A survey on {query}") + self.survey_print_fn("") + self.survey_print_fn("----------------------------------------------------------------------") + self.survey_print_fn(f"Title: A survey on {query}") + file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)") + self.survey_print_fn("Author: Auto-Research (github.com/sidphbot/Auto-Research)") + file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") + self.survey_print_fn("Dev: Auto-Research (github.com/sidphbot/Auto-Research)") + file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+ + "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ + "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ + "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ + "\nentries(only to avoid LaTex overhead). ") + self.survey_print_fn("Disclaimer: This survey is intended to be a research starter. 
This Survey is Machine-Summarized, "+ + "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+ + "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+ + "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+ + "\nentries(only to avoid LaTex overhead). ") + file.write("----------------------------------------------------------------------") + self.survey_print_fn("----------------------------------------------------------------------") + file.write("") + self.survey_print_fn("") + file.write('ABSTRACT') + self.survey_print_fn('ABSTRACT') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(research_sections['abstract']) + self.survey_print_fn(research_sections['abstract']) + file.write("") + self.survey_print_fn("") + file.write('INTRODUCTION') + self.survey_print_fn('INTRODUCTION') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(research_sections['introduction']) + self.survey_print_fn(research_sections['introduction']) + file.write("") + self.survey_print_fn("") + for k, v in research_sections.items(): + if k not in ['abstract', 'introduction', 'conclusion']: + file.write(k.upper()) + self.survey_print_fn(k.upper()) + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(v) + self.survey_print_fn(v) + file.write("") + self.survey_print_fn("") + file.write('CONCLUSION') + self.survey_print_fn('CONCLUSION') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + file.write(research_sections['conclusion']) + self.survey_print_fn(research_sections['conclusion']) file.write("") self.survey_print_fn("") - self.survey_print_fn("========================XXX=========================") - file.write("========================XXX=========================") - file.close() + + file.write('REFERENCES') + self.survey_print_fn('REFERENCES') + self.survey_print_fn("=================================================") + file.write("=================================================") + file.write("") + self.survey_print_fn("") + for entry in bibentries: + file.write(entry) + self.survey_print_fn(entry) + file.write("") + self.survey_print_fn("") + self.survey_print_fn("========================XXX=========================") + file.write("========================XXX=========================") def build_basic_blocks(self, corpus_known_sections, corpus): @@ -430,8 +418,8 @@ def build_basic_blocks(self, corpus_known_sections, corpus): with torch.no_grad(): summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5) res = self.nlp(summtext) - res = set([str(sent) for sent in list(res.sents)]) - summtext = ''.join([line for line in res]) + res = {str(sent) for sent in list(res.sents)} + summtext = ''.join(list(res)) # pself.print_fn(summtext) research_blocks[head] = summtext @@ -457,10 +445,8 @@ def abstractive_summary(self, longtext): summary = self.ledtokenizer.batch_decode(summary_ids, skip_special_tokens=True, 
clean_up_tokenization_spaces=True) res = self.nlp(summary[0]) - res = set([str(sent) for sent in list(res.sents)]) - summtext = ''.join([line for line in res]) - #self.print_fn("abstractive summary type:" + str(type(summary))) - return summtext + res = {str(sent) for sent in list(res.sents)} + return ''.join(list(res)) def get_abstract(self, abs_lines, corpus_known_sections, research_blocks): @@ -482,7 +468,7 @@ def get_corpus_lines(self, corpus): for k, v in corpus.items(): # self.print_fn(v) types.add(type(v)) - abstext = k + '. ' + v.replace('\n', ' ') + abstext = f'{k}. ' + v.replace('\n', ' ') abstext = self.nlp(abstext) abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)]) #self.print_fn("unique corpus value types:" + str(types)) @@ -498,10 +484,11 @@ def get_sectioned_docs(self, papers, papers_meta): content = self.extractive_summary(''.join(section['highlights'])) docs.append(content) selected_pids = [p['id'] for p in papers] - meta_abs = [] - for p in papers_meta: - if p['id'] not in selected_pids: - meta_abs.append(self.generate_title(p['abstract'])) + meta_abs = [ + self.generate_title(p['abstract']) + for p in papers_meta + if p['id'] not in selected_pids + ] docs.extend(meta_abs) #self.print_fn("meta_abs num"+str(len(meta_abs))) #self.print_fn("selected_pids num"+str(len(selected_pids))) @@ -609,18 +596,23 @@ def get_clustered_sections(self, clustered_lines): for i, cluster in clustered_lines.items(): # self.print_fn(cluster) try: - clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary( - str(" ".join(cluster)).lower()) + clusters_dict[ + self.generate_title(" ".join(cluster)) + ] = self.abstractive_summary(" ".join(cluster).lower()) except: - clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary( - self.extractive_summary(str(" ".join(cluster)).lower())) + clusters_dict[ + self.generate_title(" ".join(cluster)) + ] = self.abstractive_summary( + self.extractive_summary(" ".join(cluster).lower()) + ) return clusters_dict def get_intro(self, corpus_known_sections, research_blocks): intro_lines = "" - intro_lines += str(" ".join([l.lower() for l in corpus_known_sections['introduction']])) + str( - " ".join([l.lower() for l in corpus_known_sections['conclusion']])) + intro_lines += " ".join( + [l.lower() for l in corpus_known_sections['introduction']] + ) + " ".join([l.lower() for l in corpus_known_sections['conclusion']]) intro_lines += research_blocks['introduction'] + research_blocks['conclusion'] try: return self.abstractive_summary(intro_lines) @@ -628,10 +620,7 @@ def get_intro(self, corpus_known_sections, research_blocks): return self.abstractive_summary(self.extractive_summary(intro_lines)) def get_conclusion(self, research_sections): - paper_body = "" - for k, v in research_sections.items(): - paper_body += v - + paper_body = "".join(v for k, v in research_sections.items()) try: return self.abstractive_summary(paper_body) except: @@ -670,11 +659,11 @@ def build_corpus(self, papers, papers_meta): corpus = self.build_meta_corpus(papers_meta) for p in papers: ph = [] - for sid, section in enumerate(p['sections']): + for section in p['sections']: ph.extend(section['highlights']) for pid, ls in corpus.items(): if pid == p['id']: - corpus[pid] = p['abstract'] + str(' '.join(ph)) + corpus[pid] = p['abstract'] + ' '.join(ph) ''' self.print_fn("================== final corpus ====================") self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers_meta)['title']+" \nhighlight count: " + 
str(len(phs))) for pid, phs in corpus.items()])) @@ -700,7 +689,7 @@ def build_meta_corpus(self, papers): ptext = p['title'] + ". " + p['abstract'] doc = self.nlp(ptext) phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)]) - meta_corpus[pid] = str(' '.join(phs)) + meta_corpus[pid] = ' '.join(phs) ''' self.print_fn("================== meta corpus ====================") self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in meta_corpus.items()])) @@ -732,7 +721,7 @@ def select_papers(self, papers, query, num_papers=20): # self.print_fn("argsort pids("+str(num_papers)+" papers): "+ str(idx)) papers_selected = [p for p in papers if p['id'] in idx] # assert(len(papers_selected)==num_papers) - self.print_fn("num papers selected: " + str(len(papers_selected))) + self.print_fn(f"num papers selected: {len(papers_selected)}") for p in papers_selected: self.print_fn("Selected Paper: " + p['title']) @@ -751,7 +740,7 @@ def extractive_summary(self, text): with torch.no_grad(): res = self.model(text, ratio=0.5) res_doc = self.nlp(res) - return " ".join(set([str(sent) for sent in list(res_doc.sents)])) + return " ".join({str(sent) for sent in list(res_doc.sents)}) def extractive_highlights(self, lines): # text = " ".join(lines) @@ -762,20 +751,25 @@ def extractive_highlights(self, lines): with torch.no_grad(): res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, ) res_doc = self.nlp(res) - res_lines = set([str(sent) for sent in list(res_doc.sents)]) + res_lines = {str(sent) for sent in list(res_doc.sents)} # self.print_fn("\n- ".join(res_sents)) with torch.no_grad(): - keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english') - keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), - keyphrase_ngram_range=(4, 4), - stop_words='english', use_mmr=True, diversity=0.7) + keywords = self.kw_model.extract_keywords( + " ".join([l.lower() for l in lines]), stop_words='english' + ) + keyphrases = self.kw_model.extract_keywords( + " ".join([l.lower() for l in lines]), + keyphrase_ngram_range=(4, 4), + stop_words='english', + use_mmr=True, + diversity=0.7, + ) return res_lines, keywords, keyphrases def extract_highlights(self, papers): for p in papers: - sid = 0 p['sections'] = [] - for heading, lines in p['body_text'].items(): + for sid, (heading, lines) in enumerate(p['body_text'].items()): hs, kws, kps = self.extractive_highlights(lines) p['sections'].append({ 'sid': sid, @@ -785,7 +779,6 @@ def extract_highlights(self, papers): 'keywords': kws, 'keyphrases': kps, }) - sid += 1 return papers def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False): @@ -808,7 +801,7 @@ def extract_parts(self, papers, txt_dir, dump_dir): # model = build_summarizer() #for file in glob.glob(txt_dir + '/*.txt'): for p in papers: - file = txt_dir + '/'+ p['id'] +'.txt' + file = f'{txt_dir}/' + p['id'] + '.txt' refined, headings_extracted = self.extract_headings(file) sections = self.extract_sections(headings_extracted, refined) # highlights = {k: extract_highlights(model,v) for k, v in sections.items()} @@ -831,15 +824,13 @@ def extract_parts(self, papers, txt_dir, dump_dir): # pself.print_fn({f: len(h) for f,h in headings_all.items()}) papers_none = [p for p in papers if p['id'] in ids_none] for p in papers_none: - os.remove(txt_dir + '/'+ p['id'] + '.txt') + os.remove(f'{txt_dir}/' + p['id'] + '.txt') 
papers.remove(p) return papers, ids_none def check_para(self, df): - size = 0 - for col in df.columns: - size += df[col].apply(lambda x: len(str(x))).median() + size = sum(df[col].apply(lambda x: len(str(x))).median() for col in df.columns) return size / len(df.columns) > 25 def scan_blocks(self, lines): @@ -868,7 +859,7 @@ def extract_sections(self, headings, lines, min_part_length=2): sections[start] = section ''' sections[start] = section - return {k: v for k, v in sections.items()} + return dict(sections) def is_rubbish(self, s, rubbish_tolerance=0.2, min_char_len=4): # numbers = sum(c.isdigit() for c in s) @@ -877,10 +868,9 @@ def is_rubbish(self, s, rubbish_tolerance=0.2, min_char_len=4): # others = len(s) - numbers - letters - spaces if len(s) == 0: return False - if ((len(s) - (letters + spaces)) / len(s) >= rubbish_tolerance) or self.alpha_length(s) < min_char_len: - return True - else: - return False + return (len(s) - (letters + spaces)) / len( + s + ) >= rubbish_tolerance or self.alpha_length(s) < min_char_len def get_section(self, first, last, lines): try: @@ -890,21 +880,17 @@ def get_section(self, first, last, lines): # end = lines.index( last, start ) start = [i for i in range(len(lines)) if first is lines[i]][0] end = [i for i in range(len(lines)) if last is lines[i]][0] - section_lines = lines[start + 1:end] - # self.print_fn("heading: " + str(first)) - # self.print_fn("section_lines: "+ str(section_lines)) - # self.print_fn(section_lines) - return section_lines + return lines[start + 1:end] except ValueError: self.print_fn("value error :") - self.print_fn("first heading :" + str(first) + ", second heading :" + str(last)) - self.print_fn("first index :" + str(start) + ", second index :" + str(end)) + self.print_fn(f"first heading :{str(first)}, second heading :{str(last)}") + self.print_fn(f"first index :{str(start)}, second index :{str(end)}") return "" def check_list_elems_in_list(self, headings, lines): import numpy as np # [self.print_fn(head) for head in headings if head not in lines ] - return np.all([True if head in lines else False for head in headings]) + return np.all([head in lines for head in headings]) def check_first_char_upper(self, text): for c in text: @@ -929,17 +915,15 @@ def extract_headings(self, txt_file): # scan_failed - rescan with first match for abstract hook if len(headings) == 0: - # self.print_fn('===================') - # self.print_fn("run 1 failed") - abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())] - if len(abs_cans) != 0: + if abs_cans := [ + line + for line in lines + if 'abstract' in re.sub("\s+", "", line.strip().lower()) + ]: abs_head = abs_cans[0] refined, headings = self.scan_text(lines, abs_head=abs_head) self.check_list_elems_in_list(headings, refined) headings = self.check_duplicates(headings) - # self.print_fn('===================') - # self.print_fn(txt_file +": second scan: \n"+str(len(headings))+" headings") - # if len(headings) == 0: # self.print_fn("heading scan failed completely") @@ -947,8 +931,7 @@ def extract_headings(self, txt_file): def check_duplicates(self, my_list): my_finallist = [] - dups = [s for s in my_list if my_list.count(s) > 1] - if len(dups) > 0: + if dups := [s for s in my_list if my_list.count(s) > 1]: [my_finallist.append(n) for n in my_list if n not in my_finallist] # self.print_fn("original: "+str(len(my_list))+" new: "+str(len(my_finallist))) @@ -961,18 +944,22 @@ def clean_lines(self, text): # lines = [str(sent) for sent in doc.sents] lines = 
text.replace('\r', '').split('\n') lines = [line for line in lines if not self.is_rubbish(line)] - lines = [line for line in lines if - re.match("^[a-zA-Z1-9\.\[\]\(\):\-,\"\"\s]*$", line) and not 'Figure' in line and not 'Table' in line] + lines = [ + line + for line in lines + if re.match("^[a-zA-Z1-9\.\[\]\(\):\-,\"\"\s]*$", line) + and 'Figure' not in line + and 'Table' not in line + ] lengths_cleaned = [self.alpha_length(line) for line in lines] mean_length_cleaned = np.median(lengths_cleaned) lines_standardized = [] for line in lines: if len(line) >= (1.8 * mean_length_cleaned): - first_half = line[0:len(line) // 2] + first_half = line[:len(line) // 2] second_half = line[len(line) // 2 if len(line) % 2 == 0 else ((len(line) // 2) + 1):] - lines_standardized.append(first_half) - lines_standardized.append(second_half) + lines_standardized.extend((first_half, second_half)) else: lines_standardized.append(line) @@ -1009,10 +996,8 @@ def scanline(self, record, headings, refined, id, lines): import re line = lines[id] - if not len(line) == 0: - # self.print_fn("in scanline") - # self.print_fn(line) - if record: + if record: + if len(line) != 0: refined.append(line) if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match( "^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0, 3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$", @@ -1029,21 +1014,18 @@ def scanline(self, record, headings, refined, id, lines): else: known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography'] missing = [h for h in known_headings if not np.any([True for head in headings if h in head])] - # for h in missing: - head = [line for h in missing if h in re.sub("\s+", "", line.strip().lower())] - # head = [line for known] - if len(head) > 0: + if head := [ + line + for h in missing + if h in re.sub("\s+", "", line.strip().lower()) + ]: headings.append(head[0]) assert (head[0] in refined) return refined, headings def char_length(self, s): - # numbers = sum(c.isdigit() for c in s) - letters = sum(c.isalpha() for c in s) - # spaces = sum(c.isspace() for c in s) - # others = len(s) - numbers - letters - spaces - return letters + return sum(c.isalpha() for c in s) def get_by_file(self, file, papers): import os @@ -1063,10 +1045,7 @@ def alpha_length(self, s): return letters + spaces def check_append(self, baselist, addstr): - check = False - for e in baselist: - if addstr in e: - check = True + check = any(addstr in e for e in baselist) if not check: baselist.append(addstr) return baselist @@ -1098,9 +1077,16 @@ def extract_images_from_file(self, pdf_file_name, img_dir): for page_index in range(len(pdf_file)): page = pdf_file[page_index] images.extend(page.getImageList()) - images_files = [self.save_image(pdf_file.extractImage(img[0]), i, pdf_file_name.replace('.pdf', ''), img_dir) for i, img in - enumerate(set(images)) if img[0]] - return images_files + return [ + self.save_image( + pdf_file.extractImage(img[0]), + i, + pdf_file_name.replace('.pdf', ''), + img_dir, + ) + for i, img in enumerate(set(images)) + if img[0] + ] def save_image(self, base_image, img_index, pid, img_dir): from PIL import Image @@ -1111,7 +1097,7 @@ def save_image(self, base_image, img_index, pid, img_dir): # load it to PIL image = Image.open(io.BytesIO(image_bytes)) # save it to local disk - fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." 
+ image_ext + fname = f"{img_dir}/{str(pid)}_{str(img_index + 1)}.{image_ext}" image.save(open(f"{fname}", "wb")) # self.print_fn(fname) return fname @@ -1121,7 +1107,7 @@ def save_tables(self, dfs, pid, tab_dir): dfs = [df for df in dfs if not self.check_para(df)] files = [] for df in dfs: - filename = tab_dir + "/" + str(pid) + ".csv" + filename = f"{tab_dir}/{str(pid)}.csv" files.append(filename) df.to_csv(filename, index=False) return files @@ -1160,7 +1146,7 @@ def search(self, query_text=None, id_list=None, max_search=100): id_list=id_list ) - results = [result for result in search.get()] + results = list(search.get()) searched_papers = [] discarded_ids = [] @@ -1199,7 +1185,7 @@ def download_pdfs(self, papers, pdf_dir): papers_filtered = arxiv.Search(id_list=ids).get() for p in papers_filtered: p_id = str(urlparse(p.entry_id).path.split('/')[-1]).split('v')[0] - download_file = pdf_dir + "/" + p_id + ".pdf" + download_file = f"{pdf_dir}/{p_id}.pdf" p.download_pdf(filename=download_file) @@ -1211,7 +1197,7 @@ def download_sources(self, papers, src_dir): papers_filtered = arxiv.Search(id_list=ids).get() for p in papers_filtered: p_id = str(urlparse(p.entry_id).path.split('/')[-1]).split('v')[0] - download_file = src_dir + "/" + p_id + ".tar.gz" + download_file = f"{src_dir}/{p_id}.tar.gz" p.download_source(filename=download_file) def convert_pdfs(self, pdf_dir, txt_dir): @@ -1221,13 +1207,12 @@ def convert_pdfs(self, pdf_dir, txt_dir): # import arxiv_public_data convert_directory_parallel(pdf_dir, multiprocessing.cpu_count()) - for file in glob.glob(pdf_dir + '/*.txt'): + for file in glob.glob(f'{pdf_dir}/*.txt'): shutil.move(file, txt_dir) def read_paper(self, path): - f = open(path, 'r', encoding="utf-8") - text = str(f.read()) - f.close() + with open(path, 'r', encoding="utf-8") as f: + text = str(f.read()) return text def cocitation_network(self, papers, txt_dir): @@ -1244,28 +1229,35 @@ def lookup_author(self, author_query): from scholarly import scholarly import operator # Retrieve the author's data, fill-in, and print - self.print_fn("Searching Author: " + author_query) + self.print_fn(f"Searching Author: {author_query}") search_result = next(scholarly.search_author(author_query), None) if search_result is not None: author = scholarly.fill(search_result) - author_stats = { + return { 'name': author_query, - 'affiliation': author['affiliation'] if author['affiliation'] else None, + 'affiliation': author['affiliation'] + if author['affiliation'] + else None, 'citedby': author['citedby'] if 'citedby' in author.keys() else 0, - 'most_cited_year': max(author['cites_per_year'].items(), key=operator.itemgetter(1))[0] if len( - author['cites_per_year']) > 0 else None, + 'most_cited_year': max( + author['cites_per_year'].items(), key=operator.itemgetter(1) + )[0] + if len(author['cites_per_year']) > 0 + else None, 'coauthors': [c['name'] for c in author['coauthors']], 'hindex': author['hindex'], 'impact': author['i10index'], 'interests': author['interests'], - 'publications': [{'title': p['bib']['title'], 'citations': p['num_citations']} for p in - author['publications']], + 'publications': [ + {'title': p['bib']['title'], 'citations': p['num_citations']} + for p in author['publications'] + ], 'url_picture': author['url_picture'], } else: self.print_fn("author not found") - author_stats = { + return { 'name': author_query, 'affiliation': "", 'citedby': 0, @@ -1278,18 +1270,13 @@ def lookup_author(self, author_query): 'url_picture': "", } - # pself.print_fn(author_stats) - return 
author_stats - def author_stats(self, papers): all_authors = [] for p in papers: - paper_authors = [a for a in p['authors']] + paper_authors = list(p['authors']) all_authors.extend(paper_authors) - searched_authors = [self.lookup_author(a) for a in set(all_authors)] - - return searched_authors + return [self.lookup_author(a) for a in set(all_authors)] def text_similarity(self, text1, text2): doc1 = self.similarity_nlp(text1) @@ -1316,9 +1303,9 @@ def ask(self, corpus, question): start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions) - self.print_fn("context: " + text) - self.print_fn("question: " + question) - self.print_fn("outputs: " + outputs) + self.print_fn(f"context: {text}") + self.print_fn(f"question: {question}") + self.print_fn(f"outputs: {outputs}") return outputs def zip_outputs(self, dump_dir, zip_name): @@ -1352,7 +1339,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb # arxiv api relevance search and data preparation self.print_fn("\n- searching arXiv for top 100 papers.. ") results, searched_papers = self.search(query, id_list, max_search=max_search) - joblib.dump(searched_papers, survey_dump_dir + 'papers_metadata.dmp') + joblib.dump(searched_papers, f'{survey_dump_dir}papers_metadata.dmp') self.print_fn("\n- found " + str(len(searched_papers)) + " papers") # paper selection by scibert vector embedding relevance scores @@ -1364,27 +1351,27 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb if weigh_authors: authors = self.author_stats(papers_highlighted) - joblib.dump(papers_highlighted, survey_dump_dir + 'papers_highlighted.dmp') + joblib.dump(papers_highlighted, f'{survey_dump_dir}papers_highlighted.dmp') self.print_fn("\n- Standardizing known section headings per paper.. ") papers_standardized = self.standardize_headings(papers_highlighted) - joblib.dump(papers_standardized, survey_dump_dir + 'papers_standardized.dmp') + joblib.dump(papers_standardized, f'{survey_dump_dir}papers_standardized.dmp') self.print_fn("\n- Building paper-wise corpus.. ") corpus = self.build_corpus(papers_highlighted, searched_papers) - joblib.dump(corpus, survey_dump_dir + 'corpus.dmp') + joblib.dump(corpus, f'{survey_dump_dir}corpus.dmp') self.print_fn("\n- Building section-wise corpus.. ") corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized) - joblib.dump(corpus_sectionwise, survey_dump_dir + 'corpus_sectionwise.dmp') + joblib.dump(corpus_sectionwise, f'{survey_dump_dir}corpus_sectionwise.dmp') self.print_fn("\n- Building basic research highlights.. ") research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus) - joblib.dump(research_blocks, survey_dump_dir + 'research_blocks.dmp') + joblib.dump(research_blocks, f'{survey_dump_dir}research_blocks.dmp') self.print_fn("\n- Reducing corpus to lines.. ") corpus_lines = self.get_corpus_lines(corpus) - joblib.dump(corpus_lines, survey_dump_dir + 'corpus_lines.dmp') + joblib.dump(corpus_lines, f'{survey_dump_dir}corpus_lines.dmp') # temp # searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp') @@ -1418,7 +1405,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.print_fn("\n- Building abstract.. 
") abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks) - joblib.dump(abstract_block, survey_dump_dir + 'abstract_block.dmp') + joblib.dump(abstract_block, f'{survey_dump_dir}abstract_block.dmp') ''' self.print_fn("abstract_block type:"+ str(type(abstract_block))) self.print_fn("abstract_block:") @@ -1427,7 +1414,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.print_fn("\n- Building introduction.. ") intro_block = self.get_intro(corpus_sectionwise, research_blocks) - joblib.dump(intro_block, survey_dump_dir + 'intro_block.dmp') + joblib.dump(intro_block, f'{survey_dump_dir}intro_block.dmp') ''' self.print_fn("intro_block type:"+ str(type(intro_block))) self.print_fn("intro_block:") @@ -1435,8 +1422,8 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb ''' self.print_fn("\n- Building custom sections.. ") clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers) - joblib.dump(clustered_sections, survey_dump_dir + 'clustered_sections.dmp') - joblib.dump(clustered_sentences, survey_dump_dir + 'clustered_sentences.dmp') + joblib.dump(clustered_sections, f'{survey_dump_dir}clustered_sections.dmp') + joblib.dump(clustered_sentences, f'{survey_dump_dir}clustered_sentences.dmp') ''' self.print_fn("clusters extracted") @@ -1449,11 +1436,11 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb ''' clustered_sections['abstract'] = abstract_block clustered_sections['introduction'] = intro_block - joblib.dump(clustered_sections, survey_dump_dir + 'research_sections.dmp') + joblib.dump(clustered_sections, f'{survey_dump_dir}research_sections.dmp') self.print_fn("\n- Building conclusion.. ") conclusion_block = self.get_conclusion(clustered_sections) - joblib.dump(conclusion_block, survey_dump_dir + 'conclusion_block.dmp') + joblib.dump(conclusion_block, f'{survey_dump_dir}conclusion_block.dmp') clustered_sections['conclusion'] = conclusion_block ''' self.print_fn("conclusion_block type:"+ str(type(conclusion_block))) @@ -1461,7 +1448,7 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.print_fn(conclusion_block) ''' if query is None: - query = self.generate_title(' '.join([v for v in clustered_sections.values()])) + query = self.generate_title(' '.join(list(clustered_sections.values()))) survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt' survey_file = Path(survey_dump_dir).resolve() / survey_file @@ -1470,9 +1457,9 @@ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, deb self.survey_print_fn("\n-citation-network: ") self.survey_print_fn(cites) - shutil.copytree('arxiv_data/', survey_dump_dir + '/arxiv_data/') + shutil.copytree('arxiv_data/', f'{survey_dump_dir}/arxiv_data/') assert (os.path.exists(survey_file)) - + zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip' zip_name = Path(survey_dump_dir).parent.resolve() / zip_name self.zip_outputs(survey_dump_dir, str(zip_name))