
Commit d132c6e

[CI] Remove need for operational metrics persistent storage (#501)
Originally, the script for scraping LLVM commit info required persistent storage to keep track of the last commit we had seen. This was to prevent any overlap between scrapes and avoid processing the same commits multiple times. However, the script now only ever scrapes a full day at a time, so there should never be any overlap. We no longer need persistent storage to track the most recently processed commits, since we don't expect the CronJob to be executed twice in the same day.
1 parent 9afab3c commit d132c6e
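
The stateless design is easiest to see in isolation: each run derives its target date purely from the wall clock, so two runs on different days can never select overlapping commits. Below is a minimal sketch of that selection logic, assuming GitPython and a placeholder local clone; the LOOKBACK_DAYS value here is illustrative, not the script's actual setting.

import datetime

import git

LOOKBACK_DAYS = 1  # illustrative buffer; the real script defines its own value


def commits_for_day(repo: git.Repo, target: datetime.date) -> list[git.Commit]:
  """Select exactly the commits committed on one UTC calendar day."""
  matches = []
  for commit in repo.iter_commits():  # newest first
    day = commit.committed_datetime.astimezone(datetime.timezone.utc).date()
    if day == target:
      matches.append(commit)
  return matches


# Each run targets a single fixed calendar day, so runs on different days
# are disjoint by construction -- no stored "last seen commit" is needed.
target = (
    datetime.datetime.now(datetime.timezone.utc)
    - datetime.timedelta(days=LOOKBACK_DAYS)
).date()
repo = git.Repo("./llvm-project")  # placeholder path to an existing clone
print(len(commits_for_day(repo, target)))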

File tree: 1 file changed (+12, −92 lines)

llvm-ops-metrics/ops-container/process_llvm_commits.py

Lines changed: 12 additions & 92 deletions
@@ -9,12 +9,7 @@
 GRAFANA_URL = (
     "https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
 )
-
-# Path to checked out llvm/llvm-project repository
-REPOSITORY_PATH = "/data/llvm-project"
-
-# Path to record of most recently processed commits
-DATA_PATH = "/data/recent_commits.csv"
+REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"
 
 # Number of days to look back for new commits
 # We allow some buffer time between when a commit is made and when it is queried
@@ -61,99 +56,33 @@ class LLVMCommitInfo:
   is_approved: bool = False
 
 
-def read_past_commits() -> list[list[str]]:
-  """Read recently scraped commits from the data path.
-
-  Returns:
-    List of commits that have been scraped.
-  """
-  # If the data path doesn't exist, we haven't scraped any commits yet.
-  if not os.path.exists(DATA_PATH):
-    logging.warning(
-        " Data path %s does not exist. No past commits found.", DATA_PATH
-    )
-    return []
-
-  # Read the past commits from the data path
-  with open(DATA_PATH, "r") as f:
-    f.readline()  # Skip header
-    rows = f.readlines()
-  commit_history = [row.strip().split(",") for row in rows if row.strip()]
-  return commit_history
-
-
-def record_new_commits(new_commits: list[LLVMCommitInfo]) -> None:
-  """Record newly scraped commits to the data path.
-
-  Args:
-    new_commits: List of commits to record.
-
-  Returns:
-    None
-  """
-  with open(DATA_PATH, "w") as f:
-
-    # Write CSV header
-    f.write(
-        ",".join([
-            "commit_sha",
-            "commit_datetime",
-            "has_pull_request",
-            "pull_request_number",
-            "is_reviewed",
-            "is_approved",
-        ])
-        + "\n"
-    )
-
-    # We want the newest commit as the last entry, so iterate backwards
-    for i in range(len(new_commits) - 1, -1, -1):
-      commit_info = new_commits[i]
-      record = ",".join([
-          commit_info.commit_sha,
-          commit_info.commit_datetime.astimezone(
-              datetime.timezone.utc
-          ).isoformat(),
-          str(commit_info.has_pull_request),
-          str(commit_info.pr_number),
-          str(commit_info.is_reviewed),
-          str(commit_info.is_approved),
-      ])
-      f.write(f"{record}\n")
-
-
 def scrape_new_commits_by_date(
-    last_known_commit: str, target_datetime: datetime.datetime
+    target_datetime: datetime.datetime,
 ) -> list[git.Commit]:
   """Scrape new commits from a given dates.
 
   Args:
-    last_known_commit: The last known scraped commit.
     target_datetime: The date to scrape for new commits.
 
   Returns:
     List of new commits made on the given date.
   """
-  # Pull any new commits into local repository
-  repo = git.Repo(REPOSITORY_PATH)
-  repo.remotes.origin.pull()
+  # Clone repository to current working directory
+  repo = git.Repo.clone_from(
+      url=REPOSITORY_URL,
+      to_path="./llvm-project",
+  )
 
   # Scrape for new commits
   # iter_commits() yields commits in reverse chronological order
   new_commits = []
   for commit in repo.iter_commits():
-    # Skip commits that are too new
+    # Skip commits that don't match the target date
    committed_datetime = commit.committed_datetime.astimezone(
        datetime.timezone.utc
    )
-    if committed_datetime.date() > target_datetime.date():
+    if committed_datetime.date() != target_datetime.date():
      continue
-    # Stop scraping if the commit is older than the target date
-    if committed_datetime.date() < target_datetime.date():
-      break
-    # Stop scraping if we've already recorded this commit
-    if commit.hexsha == last_known_commit:
-      break
 
    new_commits.append(commit)
 
@@ -274,20 +203,15 @@ def main() -> None:
   grafana_api_key = os.environ["GRAFANA_API_KEY"]
   grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
 
-  logging.info("Reading recently processed commits.")
-  recorded_commits = read_past_commits()
-
-  last_known_commit = recorded_commits[-1][0] if recorded_commits else ""
-
-  # Scrape new commits, if any
+  # Scrape new commits
   date_to_scrape = datetime.datetime.now(
       datetime.timezone.utc
   ) - datetime.timedelta(days=LOOKBACK_DAYS)
   logging.info(
-      "Scraping checked out llvm/llvm-project for new commits on %s",
+      "Cloning and scraping llvm/llvm-project for new commits on %s",
      date_to_scrape.strftime("%Y-%m-%d"),
  )
-  new_commits = scrape_new_commits_by_date(last_known_commit, date_to_scrape)
+  new_commits = scrape_new_commits_by_date(date_to_scrape)
   if not new_commits:
     logging.info("No new commits found. Exiting.")
     return
@@ -298,11 +222,7 @@ def main() -> None:
   logging.info("Uploading metrics to Grafana.")
   upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
 
-  logging.info("Recording new commits.")
-  record_new_commits(new_commit_info)
-
 
 if __name__ == "__main__":
   logging.basicConfig(level=logging.INFO)
   main()
-
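
One side effect of dropping last_known_commit is that the loop above no longer breaks early: it walks the entire history and skips every commit outside the target day. Since iter_commits() yields commits newest-first, a reader reusing this pattern on a large history could restore the old early exit once commits predate the target day. The following variant is a sketch only, not part of this commit, and it assumes (as the original script did) that history is effectively ordered by commit time:

import datetime

import git


def commits_for_day_early_exit(
    repo: git.Repo, target: datetime.date
) -> list[git.Commit]:
  """Like the committed loop, but stops once commits predate the target day."""
  matches = []
  for commit in repo.iter_commits():  # reverse chronological order
    day = commit.committed_datetime.astimezone(datetime.timezone.utc).date()
    if day > target:
      continue  # still newer than the target day; keep scanning
    if day < target:
      break  # everything from here on is older; no need to continue
    matches.append(commit)
  return matches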
