Commits (31)

- 94aa2bb: edit prompts (Mar 31, 2024)
- 9df54fe: edit exception (Mar 31, 2024)
- e3e24ea: test push (linhkid, Mar 31, 2024)
- 284474f: Add other fields and fix JSON format errors (linhkid, Apr 2, 2024)
- cb7341f: add date time to file name (linhkid, Apr 7, 2024)
- 937bbef: Edit some comments (linhkid, Apr 7, 2024)
- 6ec246b: Update README.md (linhkid, Apr 9, 2024)
- d98d8da: Update README.md (linhkid, Apr 9, 2024)
- 126773e: Update README.md (linhkid, Apr 9, 2024)
- 5d885c4: test adding new attributes (linhkid, Apr 21, 2024)
- fc0e67e: Read html version of papers instead of just abstract (linhkid, Apr 26, 2024)
- fc807c3: Add subjects and add more tokens for the model to digest (linhkid, Apr 27, 2024)
- a3848f5: Modify Huggingface app.py (linhkid, Apr 27, 2024)
- ae371ad: Change README (linhkid, Apr 27, 2024)
- 723f383: Change README (linhkid, Apr 27, 2024)
- a332618: Fix crawler error lead to logic's fault in checking subjects (linhkid, May 9, 2024)
- 48da507: Change URL for main page landing, waiting for TODO on abstract (linhkid, May 25, 2024)
- 9b11eb5: Fix the abstract not found error, and also add ssl cert for windows (linhkid, May 26, 2024)
- 16cd86c: Major fix and upgrade for Arxiv digest (linhkid, Apr 6, 2025)
- 23c38b5: ok for now (linhkid, Apr 6, 2025)
- 89ffcf1: just to be safe, it's processing single file ok now (linhkid, Apr 6, 2025)
- 51389ee: 2 stage filtering (linhkid, Apr 6, 2025)
- e09d501: Merge branch 'main' into multiagent_multipurpose (linhkid, Apr 6, 2025)
- 2cc2ce2: Merge pull request #1 from linhkid/multiagent_multipurpose (linhkid, Apr 6, 2025)
- e8da783: refine and refactor (linhkid, Apr 7, 2025)
- 45dd62d: edit README (linhkid, Apr 7, 2025)
- a8eec4d: edit README (linhkid, Apr 7, 2025)
- 01ce725: Merge pull request #2 from linhkid/multiagent_multipurpose (linhkid, Apr 7, 2025)
- 427cf6a: Update README.md (linhkid, Apr 7, 2025)
- bddfee4: edit threshold bug (linhkid, Apr 7, 2025)
- cb8e751: add scrollable sidebar for HTML (linhkid, Apr 7, 2025)
4 changes: 3 additions & 1 deletion README.md
@@ -1,6 +1,8 @@
<p align="center"><img src="./readme_images/banner.png" width=500 /></p>

**ArXiv Digest and Personalized Recommendations using Large Language Models.**
**ArXiv Digest (extra version) and Personalized Recommendations using Large Language Models.**

*(Note: This is an adjusted repo to match my needs. For original repo please refer to **AutoLLM** that I forked from)*
Collaborator comment:

This is a pull request to the original repo 😄

Author reply:

Sorry Richard, pls ignore haha.

This repo aims to provide a better daily digest for newly published arXiv papers based on your own research interests and natural-language descriptions, using relevancy ratings from GPT.

9 changes: 5 additions & 4 deletions config.yaml
@@ -3,13 +3,13 @@ topic: "Computer Science"
# An empty list here will include all categories in a topic
# Use the natural language names of the topics, found here: https://arxiv.org
# Including more categories will result in more calls to the large language model
categories: ["Artificial Intelligence", "Computation and Language"]
categories: ["Artificial Intelligence", "Computation and Language", "Machine Learning"]

# Relevance score threshold. abstracts that receive a score less than this from the large language model
# will have their papers filtered out.
#
# Must be within 1-10
threshold: 7
threshold: 6

# A natural language statement that the large language model will use to judge which papers are relevant
#
@@ -23,5 +23,6 @@ threshold: 7
interest: |
1. Large language model pretraining and finetunings
2. Multimodal machine learning
3. Do not care about specific application, for example, information extraction, summarization, etc.
4. Not interested in paper focus on specific languages, e.g., Arabic, Chinese, etc.
3. RAGs
4. Optimization of LLM and GenAI
5. Do not care about specific application, for example, information extraction, summarization, etc.
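The threshold change from 7 to 6 widens what survives filtering. A minimal sketch of the threshold check, using hypothetical paper records (only the "Relevancy score" key is taken from the diff; the titles are invented):

```python
# Hypothetical scored papers; "Relevancy score" matches the key used in src/action.py.
papers = [
    {"title": "LLM pretraining at scale", "Relevancy score": 8},
    {"title": "RAG pipelines for search", "Relevancy score": 6},
    {"title": "Domain-specific summarization", "Relevancy score": 4},
]

def filter_by_threshold(papers, threshold):
    """Keep only papers whose score meets the configured threshold (1-10)."""
    return [p for p in papers if p["Relevancy score"] >= threshold]

print(len(filter_by_threshold(papers, 7)))  # old threshold keeps only the score-8 paper
print(len(filter_by_threshold(papers, 6)))  # new threshold also admits the score-6 paper
```

Lowering the threshold by one point trades precision for recall: borderline papers now reach the digest instead of being dropped silently.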
19 changes: 14 additions & 5 deletions src/action.py
@@ -1,15 +1,15 @@
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail, Email, To, Content

from datetime import date

import argparse
import yaml
import os
from dotenv import load_dotenv
import openai
from relevancy import generate_relevance_score, process_subject_fields
from download_new_papers import get_papers
from datetime import date



# Hackathon quality code. Don't judge too harshly.
@@ -247,11 +247,15 @@ def generate_body(topic, categories, interest, threshold):
papers,
query={"interest": interest},
threshold_score=threshold,
num_paper_in_prompt=16,
num_paper_in_prompt=20,
)
body = "<br><br>".join(
[
f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}'
f'<b>Title:</b> <a href="{paper["main_page"]}">{paper["title"]}</a><br><b>Authors:</b> {paper["authors"]}<br>'
f'<b>Score:</b> {paper["Relevancy score"]}<br><b>Reason:</b> {paper["Reasons for match"]}<br>'
f'<b>Goal:</b> {paper["Goal"]}<br><b>Data</b>: {paper["Data"]}<br><b>Methodology:</b> {paper["Methodology"]}<br>'
f'<b>Experiments & Results</b>: {paper["Experiments & Results"]}<br><b>Git</b>: {paper["Git"]}<br>'
f'<b>Discussion & Next steps</b>: {paper["Discussion & Next steps"]}'
for paper in relevancy
]
)
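The richer per-paper block above can be sketched in isolation. The record below is hypothetical; only the keys shown in the diff are assumed, and just a few of the template fields are reproduced:

```python
# Hypothetical paper record carrying the new fields the template expects.
paper = {
    "main_page": "https://arxiv.org/abs/2404.00001",
    "title": "An Example Paper",
    "Relevancy score": 8,
    "Goal": "Cut pretraining cost.",
    "Git": "n/a",
}

# Same f-string style as the diff: bold labels, <br> separators.
entry = (
    f'<b>Title:</b> <a href="{paper["main_page"]}">{paper["title"]}</a><br>'
    f'<b>Score:</b> {paper["Relevancy score"]}<br>'
    f'<b>Goal:</b> {paper["Goal"]}<br><b>Git</b>: {paper["Git"]}'
)
```

Any paper missing one of the new keys (Goal, Data, Methodology, ...) would raise a `KeyError` here, so the model output must reliably contain all fields the template references.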
Expand All @@ -269,6 +273,10 @@ def generate_body(topic, categories, interest, threshold):
)
return body

def get_date():
today = date.today()
formatted_date = today.strftime("%d%m%Y")
return formatted_date

if __name__ == "__main__":
# Load the .env file.
Expand All @@ -292,7 +300,8 @@ def generate_body(topic, categories, interest, threshold):
threshold = config["threshold"]
interest = config["interest"]
body = generate_body(topic, categories, interest, threshold)
with open("digest.html", "w") as f:
today_date = get_date()
with open(f"digest_{today_date}.html", "w") as f:
f.write(body)
if os.environ.get("SENDGRID_API_KEY", None):
sg = SendGridAPIClient(api_key=os.environ.get("SENDGRID_API_KEY"))
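The date-stamped output filename can be sketched on its own; this mirrors the `get_date` helper added in the diff:

```python
from datetime import date

def get_date():
    """Return today's date as DDMMYYYY, used to stamp the digest filename."""
    return date.today().strftime("%d%m%Y")

filename = f"digest_{get_date()}.html"
```

One side effect worth noting: the day-first `%d%m%Y` order means filenames do not sort lexicographically by date (`%Y%m%d` would); the sketch keeps the format the diff uses.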
4 changes: 3 additions & 1 deletion src/download_new_papers.py
Expand Up @@ -7,7 +7,7 @@
import datetime
import pytz


#Linh - add new def crawl_html_version(html_link) here
def _download_new_papers(field_abbr):
NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new' # https://arxiv.org/list/cs/new
page = urllib.request.urlopen(NEW_SUB_URL)
Expand All @@ -21,6 +21,7 @@ def _download_new_papers(field_abbr):
dt_list = content.dl.find_all("dt")
dd_list = content.dl.find_all("dd")
arxiv_base = "https://arxiv.org/abs/"
arxiv_html = "https://arxiv.org/html/"

assert len(dt_list) == len(dd_list)
new_paper_list = []
Expand All @@ -29,6 +30,7 @@ def _download_new_papers(field_abbr):
paper_number = dt_list[i].text.strip().split(" ")[2].split(":")[-1]
paper['main_page'] = arxiv_base + paper_number
paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
paper['html'] = arxiv_html + paper_number + "v1"

paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip()
paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \
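The three links the crawler now records per paper can be sketched as a pure function. The paper number below is a hypothetical example, and the hard-coded "v1" suffix mirrors the diff (it assumes the first HTML revision exists):

```python
def build_links(paper_number):
    """Build abstract, PDF, and HTML-version URLs for an arXiv paper number."""
    arxiv_base = "https://arxiv.org/abs/"
    arxiv_html = "https://arxiv.org/html/"
    return {
        "main_page": arxiv_base + paper_number,
        "pdf": arxiv_base.replace("abs", "pdf") + paper_number,
        # the crawl targets the first revision; later revisions would be v2, v3, ...
        "html": arxiv_html + paper_number + "v1",
    }

links = build_links("2404.00001")
```

Pinning "v1" is the simplest choice but can 404 for papers whose HTML rendering only exists for a later revision; the diff accepts that trade-off.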
23 changes: 18 additions & 5 deletions src/relevancy.py
Expand Up @@ -35,6 +35,14 @@ def encode_prompt(query, prompt_papers):
return prompt


def is_json(myjson):
try:
json.loads(myjson)
except ValueError as e:
return False
return True


def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
selected_data = []
if response is None:
Expand All @@ -45,9 +53,14 @@ def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
try:
score_items = [
json.loads(re.sub(pattern, "", line))
for line in json_items if "relevancy score" in line.lower()]
except Exception:
for line in json_items if (is_json(line) and "relevancy score" in line.lower())]
except Exception as e:
pprint.pprint([re.sub(pattern, "", line) for line in json_items if "relevancy score" in line.lower()])
try:
score_items = score_items[:-1]
except Exception:
score_items = []
print(e)
raise RuntimeError("failed")
pprint.pprint(score_items)
scores = []
@@ -91,8 +104,8 @@ def generate_relevance_score(
all_papers,
query,
model_name="gpt-3.5-turbo-16k",
threshold_score=8,
num_paper_in_prompt=4,
threshold_score=7,
num_paper_in_prompt=8,
temperature=0.4,
top_p=1.0,
sorting=True
@@ -136,7 +149,7 @@
return ans_data, hallucination

def run_all_day_paper(
query={"interest":"", "subjects":["Computation and Language", "Artificial Intelligence"]},
query={"interest":"Computer Science", "subjects":["Machine Learning", "Computation and Language", "Artificial Intelligence"]},
date=None,
data_dir="../data",
model_name="gpt-3.5-turbo-16k",
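The new `is_json` guard protects the line-by-line parse from malformed model output. A standalone sketch of the guarded parse; the numbering pattern and the sample lines are assumptions (the actual regex is defined elsewhere in relevancy.py):

```python
import json
import re

def is_json(myjson):
    """Return True if the string parses as JSON, else False."""
    try:
        json.loads(myjson)
    except ValueError:
        return False
    return True

# Hypothetical model output: one numbered JSON object per line, plus one bad line.
lines = [
    '1. {"Relevancy score": "8", "Reasons for match": "LLM pretraining"}',
    '2. oops, the model rambled here instead of emitting JSON',
]
pattern = r"^\d+\. "  # assumed numbering prefix, e.g. "1. "
score_items = [
    json.loads(re.sub(pattern, "", line))
    for line in lines
    if is_json(re.sub(pattern, "", line)) and "relevancy score" in line.lower()
]
```

With the guard, a single rambling line is skipped instead of raising `json.JSONDecodeError` and losing the whole batch, which is the failure mode the diff's fallback (`score_items[:-1]`) also tries to contain.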
9 changes: 5 additions & 4 deletions src/relevancy_prompt.txt
@@ -1,7 +1,8 @@
You have been asked to read a list of a few arxiv papers, each with title, authors and abstract.
Based on my specific research interests, elevancy score out of 10 for each paper, based on my specific research interest, with a higher score indicating greater relevance. A relevance score more than 7 will need person's attention for details.
Additionally, please generate 1-2 sentence summary for each paper explaining why it's relevant to my research interests.
Based on my specific research interests, provide a relevancy score out of 10 for each paper, with a higher score indicating greater relevance. A relevance score of more than 7 will need a person's attention for details.
Additionally, please generate a summary for each paper explaining why it's relevant to my research interests.
Please keep the paper order the same as in the input list, with one json format per line. Example is:
1. {"Relevancy score": "an integer score out of 10", "Reasons for match": "1-2 sentence short reasonings"}

My research interests are:
1. {"Relevancy score": "an integer score out of 10", "Reasons for match": "1-2 sentence short reasonings", "Goal": "What kind of pain points the paper is trying to solve?", "Data": "Summary of the data source used in the paper", "Methodology": "Summary of methodologies used in the paper", "Git": "Link to the code repo (if available)", "Experiments & Results": "Summary of any experiments & its results", "Discussion & Next steps": "Further discussion and next steps of the research"}

My research interests are: NLP, RAGs, LLM, Optimization in Machine learning, Data science, Generative AI, Optimization in LLM, Finance modelling ...
Collaborator comment (@rmfan, Apr 21, 2024):

The interests get appended on here:

    prompt += query['interest']

No need to add them manually to the relevancy prompt.

Author reply:

OK thanks