-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
feat(release health): Update snuba metrics to know about session unhandled state #96707
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -720,7 +720,7 @@ def _get_errored_sessions_for_overview( | |
end: datetime, | ||
) -> Mapping[tuple[int, str], int]: | ||
""" | ||
Count of errored sessions, incl fatal (abnormal, crashed) sessions, | ||
Count of errored sessions, incl fatal (abnormal, unhandled, crashed) session | ||
excl errored *preaggregated* sessions | ||
""" | ||
project_ids = [p.id for p in projects] | ||
|
@@ -774,12 +774,13 @@ def _get_session_by_status_for_overview( | |
end: datetime, | ||
) -> Mapping[tuple[int, str, str], int]: | ||
""" | ||
Counts of init, abnormal and crashed sessions, purpose-built for overview | ||
Counts of init, abnormal, unhandled and crashed sessions, purpose-built for overview | ||
""" | ||
project_ids = [p.id for p in projects] | ||
|
||
select = [ | ||
MetricField(metric_mri=SessionMRI.ABNORMAL.value, alias="abnormal", op=None), | ||
MetricField(metric_mri=SessionMRI.UNHANDLED.value, alias="unhandled", op=None), | ||
MetricField(metric_mri=SessionMRI.CRASHED.value, alias="crashed", op=None), | ||
MetricField(metric_mri=SessionMRI.ALL.value, alias="init", op=None), | ||
MetricField( | ||
|
@@ -820,7 +821,13 @@ def _get_session_by_status_for_overview( | |
release = by.get("release") | ||
|
||
totals = group.get("totals", {}) | ||
for status in ["abnormal", "crashed", "init", "errored_preaggr"]: | ||
for status in [ | ||
"abnormal", | ||
"unhandled", | ||
"crashed", | ||
"init", | ||
"errored_preaggr", | ||
]: | ||
value = totals.get(status) | ||
if value is not None and value != 0.0: | ||
ret_val[(proj_id, release, status)] = value | ||
|
@@ -842,6 +849,9 @@ def _get_users_and_crashed_users_for_overview( | |
select = [ | ||
MetricField(metric_mri=SessionMRI.ALL_USER.value, alias="all_users", op=None), | ||
MetricField(metric_mri=SessionMRI.CRASHED_USER.value, alias="crashed_users", op=None), | ||
MetricField( | ||
metric_mri=SessionMRI.UNHANDLED_USER.value, alias="unhandled_users", op=None | ||
), | ||
] | ||
|
||
groupby = [ | ||
|
@@ -1037,8 +1047,10 @@ def get_release_health_data_overview( | |
if not has_health_data and summary_stats_period != "90d": | ||
fetch_has_health_data_releases.add((project_id, release)) | ||
|
||
sessions_unhandled = rv_sessions.get((project_id, release, "unhandled"), 0) | ||
sessions_crashed = rv_sessions.get((project_id, release, "crashed"), 0) | ||
|
||
users_unhandled = rv_users.get((project_id, release, "unhandled_users"), 0) | ||
Comment on lines
+1050
to
+1053
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential bug: The `_get_users_and_crashed_users_for_overview` function fetches `unhandled_users` data but doesn't process it, causing `users_unhandled` to always be zero and related metrics to be incorrect.
Did we get this right? 👍 / 👎 to inform future reviews. |
||
users_crashed = rv_users.get((project_id, release, "crashed_users"), 0) | ||
|
||
rv_row = rv[project_id, release] = { | ||
|
@@ -1050,19 +1062,36 @@ def get_release_health_data_overview( | |
"total_project_sessions_24h": adoption_info.get("project_sessions_24h"), | ||
"total_sessions": total_sessions, | ||
"total_users": total_users, | ||
"has_health_data": has_health_data, | ||
"sessions_crashed": sessions_crashed, | ||
# Users where the error was `unhandled`; possibly resulting in a crash | ||
"unhandled_user_rate": ( | ||
(users_unhandled + users_crashed) / total_users * 100 if total_users else None | ||
), | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Users where the error was not a crash (but may have been unhandled) | ||
"crash_free_users": ( | ||
100 - users_crashed / total_users * 100 if total_users else None | ||
), | ||
"has_health_data": has_health_data, | ||
# Sessions where the error was specifically `unhandled`; NOT resulting in a crash | ||
"sessions_unhandled": sessions_unhandled, | ||
# Sessions where the error was a crash | ||
"sessions_crashed": sessions_crashed, | ||
# Sessions where the error was `unhandled`; possibly resulting in a crash | ||
"unhandled_session_rate": ( | ||
(sessions_unhandled + sessions_crashed) / total_sessions * 100 | ||
if total_sessions | ||
else None | ||
), | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Sessions where the error was not a crash (but may have been unhandled) | ||
"crash_free_sessions": ( | ||
100 - sessions_crashed / float(total_sessions) * 100 if total_sessions else None | ||
), | ||
# Sessions where the error was handled | ||
"sessions_errored": max( | ||
0, | ||
rv_errored_sessions.get((project_id, release), 0) | ||
+ rv_sessions.get((project_id, release, "errored_preaggr"), 0) | ||
- sessions_crashed | ||
- sessions_unhandled | ||
- rv_sessions.get((project_id, release, "abnormal"), 0), | ||
), | ||
"duration_p50": None, | ||
|
@@ -1427,6 +1456,9 @@ def get_project_release_stats( | |
MetricField( | ||
metric_mri=SessionMRI.CRASHED_USER.value, alias="users_crashed", op=None | ||
), | ||
MetricField( | ||
metric_mri=SessionMRI.UNHANDLED_USER.value, alias="users_unhandled", op=None | ||
), | ||
MetricField( | ||
metric_mri=SessionMRI.ERRORED_USER.value, alias="users_errored", op=None | ||
), | ||
|
@@ -1440,6 +1472,11 @@ def get_project_release_stats( | |
MetricField( | ||
metric_mri=SessionMRI.ABNORMAL.value, alias="sessions_abnormal", op=None | ||
), | ||
MetricField( | ||
metric_mri=SessionMRI.UNHANDLED.value, | ||
alias="sessions_unhandled", | ||
op=None, | ||
), | ||
MetricField(metric_mri=SessionMRI.CRASHED.value, alias="sessions_crashed", op=None), | ||
MetricField(metric_mri=SessionMRI.ERRORED.value, alias="sessions_errored", op=None), | ||
MetricField(metric_mri=SessionMRI.HEALTHY.value, alias="sessions_healthy", op=None), | ||
|
@@ -1500,6 +1537,7 @@ def get_project_release_stats( | |
f"{stat}": 0, | ||
f"{stat}_abnormal": 0, | ||
f"{stat}_crashed": 0, | ||
f"{stat}_unhandled": 0, | ||
f"{stat}_errored": 0, | ||
f"{stat}_healthy": 0, | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,7 +106,13 @@ def get_snuba_columns(self, raw_groupby) -> list[str]: ... | |
class SessionsField: | ||
def get_snuba_columns(self, raw_groupby): | ||
if "session.status" in raw_groupby: | ||
return ["sessions", "sessions_abnormal", "sessions_crashed", "sessions_errored"] | ||
return [ | ||
"sessions", | ||
"sessions_abnormal", | ||
"sessions_crashed", | ||
"sessions_errored", | ||
"sessions_unhandled", | ||
] | ||
return ["sessions"] | ||
|
||
def extract_from_row(self, row, group): | ||
|
@@ -116,15 +122,20 @@ def extract_from_row(self, row, group): | |
if status is None: | ||
return row["sessions"] | ||
if status == "healthy": | ||
healthy_sessions = row["sessions"] - row["sessions_errored"] | ||
healthy_sessions = row["sessions"] - row["sessions_errored"] - row["sessions_unhandled"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Double-Subtraction Error Inflates Healthy MetricsHealthy session and user counts are inflated due to unhandled sessions/users being double-subtracted. The calculation for healthy metrics subtracts |
||
return max(healthy_sessions, 0) | ||
if status == "abnormal": | ||
return row["sessions_abnormal"] | ||
if status == "unhandled": | ||
return row["sessions_unhandled"] | ||
if status == "crashed": | ||
return row["sessions_crashed"] | ||
if status == "errored": | ||
errored_sessions = ( | ||
row["sessions_errored"] - row["sessions_crashed"] - row["sessions_abnormal"] | ||
row["sessions_errored"] | ||
- row["sessions_unhandled"] | ||
- row["sessions_crashed"] | ||
- row["sessions_abnormal"] | ||
) | ||
return max(errored_sessions, 0) | ||
return 0 | ||
|
@@ -133,7 +144,7 @@ def extract_from_row(self, row, group): | |
class UsersField: | ||
def get_snuba_columns(self, raw_groupby): | ||
if "session.status" in raw_groupby: | ||
return ["users", "users_abnormal", "users_crashed", "users_errored"] | ||
return ["users", "users_abnormal", "users_crashed", "users_errored", "users_unhandled"] | ||
return ["users"] | ||
|
||
def extract_from_row(self, row, group): | ||
|
@@ -143,14 +154,21 @@ def extract_from_row(self, row, group): | |
if status is None: | ||
return row["users"] | ||
if status == "healthy": | ||
healthy_users = row["users"] - row["users_errored"] | ||
healthy_users = row["users"] - row["users_errored"] - row["users_unhandled"] | ||
return max(healthy_users, 0) | ||
if status == "abnormal": | ||
return row["users_abnormal"] | ||
if status == "unhandled": | ||
return row["users_unhandled"] | ||
if status == "crashed": | ||
return row["users_crashed"] | ||
if status == "errored": | ||
errored_users = row["users_errored"] - row["users_crashed"] - row["users_abnormal"] | ||
errored_users = ( | ||
row["users_errored"] | ||
- row["users_crashed"] | ||
- row["users_abnormal"] | ||
- row["users_unhandled"] | ||
) | ||
return max(errored_users, 0) | ||
return 0 | ||
|
||
|
@@ -232,7 +250,10 @@ def get_snuba_groupby(self): | |
return [] | ||
|
||
def get_keys_for_row(self, row): | ||
return [("session.status", key) for key in ["healthy", "abnormal", "crashed", "errored"]] | ||
return [ | ||
("session.status", key) | ||
for key in ["healthy", "abnormal", "crashed", "errored", "unhandled"] | ||
] | ||
|
||
|
||
# NOTE: in the future we might add new `user_agent` and `os` fields | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bug: Unhandled Users Metric Missing
The
_get_users_and_crashed_users_for_overview
method queries for "unhandled_users" data, but the subsequent processing logic fails to store this metric in its return dictionary. Consequently,unhandled_users
metrics are always reported as 0, leading to incorrect calculations for rates likeunhandled_user_rate
.