Skip to content

Commit 1327d34

Browse files
committed
[CI] Validate scraped push commits via GitHub GraphQL API
1 parent d5b79f7 commit 1327d34

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed

llvm-ops-metrics/ops-container/process_llvm_commits.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,13 @@
99
GRAFANA_URL = (
1010
"https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
1111
)
12+
GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql"
1213
REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"
1314

15+
# How many commits to query the GitHub GraphQL API for at a time.
16+
# Querying too many commits at once often leads to the call failing.
17+
GITHUB_API_BATCH_SIZE = 75
18+
1419
# Number of days to look back for new commits
1520
# We allow some buffer time between when a commit is made and when it is queried
1621
# for reviews. This is to allow time for any events to propagate in the GitHub
@@ -44,6 +49,23 @@
4449
AND JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') IS NOT NULL
4550
"""
4651

52+
# Template GraphQL subquery to check if a commit has an associated pull request
53+
# and whether that pull request has been reviewed and approved.
54+
COMMIT_GRAPHQL_SUBQUERY_TEMPLATE = """
55+
commit_{commit_sha}:
56+
object(oid:"{commit_sha}") {{
57+
... on Commit {{
58+
associatedPullRequests(first: 1) {{
59+
totalCount
60+
pullRequest: nodes {{
61+
number
62+
reviewDecision
63+
}}
64+
}}
65+
}}
66+
}}
67+
"""
68+
4769

4870
@dataclasses.dataclass
4971
class LLVMCommitInfo:
@@ -153,6 +175,88 @@ def query_for_reviews(
153175
return list(new_commits.values())
154176

155177

178+
def validate_push_commits(
    new_commits: list[LLVMCommitInfo], github_token: str
) -> None:
  """Validate that push commits don't have a pull request.

  To address lossiness of data from GitHub Archive BigQuery, we check each
  commit that currently has no pull request recorded against the GitHub
  GraphQL API. Commits that turn out to have an associated pull request are
  amended in place (`has_pull_request`, `pr_number`, `is_reviewed`,
  `is_approved`).

  Validation is best effort: a batch whose request fails (network error,
  non-2xx status, or unparseable body) is logged and skipped, leaving the
  commits in that batch unchanged, so the remaining batches are still
  processed.

  Args:
    new_commits: List of commits to validate. Entries are modified in place.
    github_token: The access token to use with the GitHub GraphQL API.

  Returns:
    None
  """
  # Seconds to wait for GitHub before giving up on a batch. Without a timeout
  # a stalled connection would block the whole job indefinitely.
  request_timeout_seconds = 60

  # Collect the commits that currently look like direct pushes, keyed by SHA
  # so that API results can be mapped back to them.
  potential_push_commits = {
      commit.commit_sha: commit
      for commit in new_commits
      if not commit.has_pull_request
  }
  logging.info("Found %d potential push commits", len(potential_push_commits))

  # Query GitHub GraphQL API for pull requests associated with push commits
  # We query in batches as large queries often fail
  api_commit_data = {}
  query_template = """
    query {
      repository(owner:"llvm", name:"llvm-project"){
        %s
      }
    }
  """
  commit_shas = list(potential_push_commits)
  # Ceiling division: no batches for an empty list and no trailing empty
  # batch when the count is an exact multiple of the batch size. An empty
  # selection set would not be a valid GraphQL query.
  num_batches = (
      len(commit_shas) + GITHUB_API_BATCH_SIZE - 1
  ) // GITHUB_API_BATCH_SIZE
  logging.info("Querying GitHub GraphQL API in %d batches", num_batches)
  for batch_index in range(num_batches):
    start = batch_index * GITHUB_API_BATCH_SIZE
    sha_batch = commit_shas[start : start + GITHUB_API_BATCH_SIZE]
    query = query_template % "".join(
        COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit_sha)
        for commit_sha in sha_batch
    )

    logging.info(
        "Querying batch %d of %d (%d commits)",
        batch_index + 1,
        num_batches,
        len(sha_batch),
    )
    try:
      response = requests.post(
          url=GITHUB_GRAPHQL_API_URL,
          headers={
              "Authorization": f"bearer {github_token}",
          },
          json={"query": query},
          timeout=request_timeout_seconds,
      )
    except requests.RequestException as error:
      logging.error("Failed to query GitHub GraphQL API: %s", error)
      continue

    if not 200 <= response.status_code < 300:
      logging.error("Failed to query GitHub GraphQL API: %s", response.text)
      continue

    try:
      payload = response.json()
    except ValueError:
      logging.error(
          "GitHub GraphQL API returned a non-JSON response: %s", response.text
      )
      continue

    # GraphQL can answer with HTTP 200 and still report errors, in which case
    # "data" (or "repository") may be null or only partially populated.
    if payload.get("errors"):
      logging.error("GitHub GraphQL API reported errors: %s", payload["errors"])
    repository_data = (payload.get("data") or {}).get("repository") or {}
    api_commit_data.update(repository_data)

  amend_count = 0
  for alias, data in api_commit_data.items():
    # Skip entries the API could not resolve to a commit: the value may be
    # null for an unknown object or empty when the object is not a commit.
    if not data:
      continue

    # Verify that push commit has no pull requests
    associated_pull_requests = data["associatedPullRequests"]
    if associated_pull_requests["totalCount"] == 0:
      continue

    # Amend fields with new data from API
    commit_sha = alias.removeprefix("commit_")
    pull_request = associated_pull_requests["pullRequest"][0]
    commit_info = potential_push_commits[commit_sha]
    commit_info.has_pull_request = True
    commit_info.pr_number = pull_request["number"]
    commit_info.is_reviewed = pull_request["reviewDecision"] is not None
    commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
    amend_count += 1

  logging.info("Amended %d commits", amend_count)
258+
259+
156260
def upload_daily_metrics(
157261
grafana_api_key: str,
158262
grafana_metrics_userid: str,
@@ -200,6 +304,7 @@ def upload_daily_metrics(
200304

201305

202306
def main() -> None:
307+
github_token = os.environ["GITHUB_TOKEN"]
203308
grafana_api_key = os.environ["GRAFANA_API_KEY"]
204309
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
205310

@@ -219,6 +324,9 @@ def main() -> None:
219324
logging.info("Querying for reviews of new commits.")
220325
new_commit_info = query_for_reviews(new_commits, date_to_scrape)
221326

327+
logging.info("Validating push commits.")
328+
validate_push_commits(new_commit_info, github_token)
329+
222330
logging.info("Uploading metrics to Grafana.")
223331
upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
224332

0 commit comments

Comments
 (0)