Skip to content

feat(release health): Update snuba metrics to know about session unhandled state #96707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/sentry/release_health/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
"crash_free_rate(user)",
"anr_rate()",
"foreground_anr_rate()",
"unhandled_rate(session)",
"unhandled_rate(user)",
]

GroupByFieldName = Literal[
Expand Down Expand Up @@ -182,6 +184,9 @@ class ReleaseHealthOverview(TypedDict, total=False):
duration_p50: float | None
duration_p90: float | None
stats: Mapping[StatsPeriod, ReleaseHealthStats]
sessions_unhandled: int
unhandled_session_rate: float | None
unhandled_user_rate: float | None


class CrashFreeBreakdown(TypedDict):
Expand All @@ -202,6 +207,7 @@ class UserCounts(TypedDict):
users_healthy: int
users_crashed: int
users_abnormal: int
users_unhandled: int
users_errored: int


Expand All @@ -214,6 +220,7 @@ class SessionCounts(TypedDict):
sessions_healthy: int
sessions_crashed: int
sessions_abnormal: int
sessions_unhandled: int
sessions_errored: int


Expand Down
48 changes: 43 additions & 5 deletions src/sentry/release_health/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ def _get_errored_sessions_for_overview(
end: datetime,
) -> Mapping[tuple[int, str], int]:
"""
Count of errored sessions, incl fatal (abnormal, crashed) sessions,
Count of errored sessions, incl fatal (abnormal, unhandled, crashed) session
excl errored *preaggregated* sessions
"""
project_ids = [p.id for p in projects]
Expand Down Expand Up @@ -774,12 +774,13 @@ def _get_session_by_status_for_overview(
end: datetime,
) -> Mapping[tuple[int, str, str], int]:
"""
Counts of init, abnormal and crashed sessions, purpose-built for overview
Counts of init, abnormal, unhandled and crashed sessions, purpose-built for overview
"""
project_ids = [p.id for p in projects]

select = [
MetricField(metric_mri=SessionMRI.ABNORMAL.value, alias="abnormal", op=None),
MetricField(metric_mri=SessionMRI.UNHANDLED.value, alias="unhandled", op=None),
MetricField(metric_mri=SessionMRI.CRASHED.value, alias="crashed", op=None),
MetricField(metric_mri=SessionMRI.ALL.value, alias="init", op=None),
MetricField(
Expand Down Expand Up @@ -820,7 +821,13 @@ def _get_session_by_status_for_overview(
release = by.get("release")

totals = group.get("totals", {})
for status in ["abnormal", "crashed", "init", "errored_preaggr"]:
for status in [
"abnormal",
"unhandled",
"crashed",
"init",
"errored_preaggr",
]:
value = totals.get(status)
if value is not None and value != 0.0:
ret_val[(proj_id, release, status)] = value
Expand All @@ -842,6 +849,9 @@ def _get_users_and_crashed_users_for_overview(
select = [
MetricField(metric_mri=SessionMRI.ALL_USER.value, alias="all_users", op=None),
MetricField(metric_mri=SessionMRI.CRASHED_USER.value, alias="crashed_users", op=None),
MetricField(
metric_mri=SessionMRI.UNHANDLED_USER.value, alias="unhandled_users", op=None
),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Unhandled Users Metric Missing

The _get_users_and_crashed_users_for_overview method queries for "unhandled_users" data, but the subsequent processing logic fails to store this metric in its return dictionary. Consequently, unhandled_users metrics are always reported as 0, leading to incorrect calculations for rates like unhandled_user_rate.

Fix in Cursor Fix in Web

]

groupby = [
Expand Down Expand Up @@ -1037,8 +1047,10 @@ def get_release_health_data_overview(
if not has_health_data and summary_stats_period != "90d":
fetch_has_health_data_releases.add((project_id, release))

sessions_unhandled = rv_sessions.get((project_id, release, "unhandled"), 0)
sessions_crashed = rv_sessions.get((project_id, release, "crashed"), 0)

users_unhandled = rv_users.get((project_id, release, "unhandled_users"), 0)
Comment on lines +1050 to +1053
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential bug: The `_get_users_and_crashed_users_for_overview` function fetches `unhandled_users` data but doesn't process it, causing `users_unhandled` to always be zero and related metrics to be incorrect.
  • Description: The function _get_users_and_crashed_users_for_overview was updated to query for unhandled_users from the database. However, the subsequent loop that populates the ret_val dictionary was not updated to handle the "unhandled_users" status. As a result, this data is fetched but then discarded. Consequently, any call to rv_users.get((project_id, release, "unhandled_users"), 0) will always fall back to the default value of 0. This leads to systematically incorrect release health analytics, as metrics like unhandled_user_rate will always be calculated as zero, providing misleading information to users.
  • Suggested fix: Update the loop in _get_users_and_crashed_users_for_overview to process the "unhandled_users" status. Add "unhandled_users" to the list of statuses being iterated over, similar to how "all_users" and "crashed_users" are handled, to ensure the fetched data is correctly populated into the return dictionary.
    severity: 0.7, confidence: 0.95

Did we get this right? 👍 / 👎 to inform future reviews.

users_crashed = rv_users.get((project_id, release, "crashed_users"), 0)

rv_row = rv[project_id, release] = {
Expand All @@ -1050,19 +1062,36 @@ def get_release_health_data_overview(
"total_project_sessions_24h": adoption_info.get("project_sessions_24h"),
"total_sessions": total_sessions,
"total_users": total_users,
"has_health_data": has_health_data,
"sessions_crashed": sessions_crashed,
# Users where the error was `unhandled`; possibly resulting in a crash
"unhandled_user_rate": (
(users_unhandled + users_crashed) / total_users * 100 if total_users else None
),
# Users where the error was not a crash (but may have been unhandled)
"crash_free_users": (
100 - users_crashed / total_users * 100 if total_users else None
),
"has_health_data": has_health_data,
# Sessions where the error was specifically `unhandled`; NOT resulting in a crash
"sessions_unhandled": sessions_unhandled,
# Sessions where the error was a crash
"sessions_crashed": sessions_crashed,
# Sessions where the error was `unhandled`; possibly resulting in a crash
"unhandled_session_rate": (
(sessions_unhandled + sessions_crashed) / total_sessions * 100
if total_sessions
else None
),
# Sessions where the error was not a crash (but may have been unhandled)
"crash_free_sessions": (
100 - sessions_crashed / float(total_sessions) * 100 if total_sessions else None
),
# Sessions where the error was handled
"sessions_errored": max(
0,
rv_errored_sessions.get((project_id, release), 0)
+ rv_sessions.get((project_id, release, "errored_preaggr"), 0)
- sessions_crashed
- sessions_unhandled
- rv_sessions.get((project_id, release, "abnormal"), 0),
),
"duration_p50": None,
Expand Down Expand Up @@ -1427,6 +1456,9 @@ def get_project_release_stats(
MetricField(
metric_mri=SessionMRI.CRASHED_USER.value, alias="users_crashed", op=None
),
MetricField(
metric_mri=SessionMRI.UNHANDLED_USER.value, alias="users_unhandled", op=None
),
MetricField(
metric_mri=SessionMRI.ERRORED_USER.value, alias="users_errored", op=None
),
Expand All @@ -1440,6 +1472,11 @@ def get_project_release_stats(
MetricField(
metric_mri=SessionMRI.ABNORMAL.value, alias="sessions_abnormal", op=None
),
MetricField(
metric_mri=SessionMRI.UNHANDLED.value,
alias="sessions_unhandled",
op=None,
),
MetricField(metric_mri=SessionMRI.CRASHED.value, alias="sessions_crashed", op=None),
MetricField(metric_mri=SessionMRI.ERRORED.value, alias="sessions_errored", op=None),
MetricField(metric_mri=SessionMRI.HEALTHY.value, alias="sessions_healthy", op=None),
Expand Down Expand Up @@ -1500,6 +1537,7 @@ def get_project_release_stats(
f"{stat}": 0,
f"{stat}_abnormal": 0,
f"{stat}_crashed": 0,
f"{stat}_unhandled": 0,
f"{stat}_errored": 0,
f"{stat}_healthy": 0,
}
Expand Down
8 changes: 8 additions & 0 deletions src/sentry/release_health/metrics_sessions_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class SessionStatus(Enum):
CRASHED = "crashed"
ERRORED = "errored"
HEALTHY = "healthy"
UNHANDLED = "unhandled"


ALL_STATUSES = frozenset(iter(SessionStatus))
Expand Down Expand Up @@ -242,6 +243,7 @@ def _get_metric_fields(
self.status_to_metric_field[SessionStatus.ABNORMAL],
self.status_to_metric_field[SessionStatus.CRASHED],
self.status_to_metric_field[SessionStatus.ERRORED],
self.status_to_metric_field[SessionStatus.UNHANDLED],
]
return [self.get_all_field()]

Expand All @@ -265,6 +267,7 @@ class SumSessionField(CountField):
SessionStatus.ABNORMAL: MetricField(None, SessionMRI.ABNORMAL.value),
SessionStatus.CRASHED: MetricField(None, SessionMRI.CRASHED.value),
SessionStatus.ERRORED: MetricField(None, SessionMRI.ERRORED.value),
SessionStatus.UNHANDLED: MetricField(None, SessionMRI.UNHANDLED.value),
None: MetricField(None, SessionMRI.ALL.value),
}

Expand Down Expand Up @@ -298,6 +301,7 @@ def __init__(
SessionStatus.ABNORMAL: MetricField(None, SessionMRI.ABNORMAL_USER.value),
SessionStatus.CRASHED: MetricField(None, SessionMRI.CRASHED_USER.value),
SessionStatus.ERRORED: MetricField(None, SessionMRI.ERRORED_USER.value),
SessionStatus.UNHANDLED: MetricField(None, SessionMRI.UNHANDLED_USER.value),
None: MetricField(None, SessionMRI.ALL_USER.value),
}

Expand Down Expand Up @@ -341,6 +345,8 @@ class SimpleForwardingField(Field):
"crash_free_rate(user)": SessionMRI.CRASH_FREE_USER_RATE,
"anr_rate()": SessionMRI.ANR_RATE,
"foreground_anr_rate()": SessionMRI.FOREGROUND_ANR_RATE,
"unhandled_rate(session)": SessionMRI.UNHANDLED_RATE,
"unhandled_rate(user)": SessionMRI.UNHANDLED_USER_RATE,
}

def __init__(self, name: str, raw_groupby: Sequence[str], status_filter: StatusFilter):
Expand Down Expand Up @@ -379,6 +385,8 @@ def _get_metric_fields(
"crash_free_rate(user)": SimpleForwardingField,
"anr_rate()": SimpleForwardingField,
"foreground_anr_rate()": SimpleForwardingField,
"unhandled_rate(session)": SimpleForwardingField,
"unhandled_rate(user)": SimpleForwardingField,
}
PREFLIGHT_QUERY_COLUMNS = {"release.timestamp"}
VirtualOrderByName = Literal["release.timestamp"]
Expand Down
18 changes: 18 additions & 0 deletions src/sentry/snuba/metrics/fields/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
sum_if_column_snql,
team_key_transaction_snql,
tolerated_count_transaction,
unhandled_sessions,
unhandled_users,
uniq_aggregation_on_metric,
uniq_if_column_snql,
)
Expand Down Expand Up @@ -1375,6 +1377,22 @@ def generate_where_statements(
org_id, metric_ids, alias=alias
),
),
SingularEntityDerivedMetric(
metric_mri=SessionMRI.UNHANDLED.value,
metrics=[SessionMRI.RAW_SESSION.value],
unit="sessions",
snql=lambda project_ids, org_id, metric_ids, alias=None: unhandled_sessions(
org_id, metric_ids, alias=alias
),
),
SingularEntityDerivedMetric(
metric_mri=SessionMRI.UNHANDLED_USER.value,
metrics=[SessionMRI.RAW_USER.value],
unit="users",
snql=lambda project_ids, org_id, metric_ids, alias=None: unhandled_users(
org_id, metric_ids, alias=alias
),
),
SingularEntityDerivedMetric(
metric_mri=SessionMRI.CRASHED.value,
metrics=[SessionMRI.RAW_SESSION.value],
Expand Down
14 changes: 14 additions & 0 deletions src/sentry/snuba/metrics/fields/snql.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ def all_users(org_id: int, metric_ids: Sequence[int], alias: str | None = None)
return uniq_aggregation_on_metric(metric_ids, alias)


def unhandled_sessions(
org_id: int, metric_ids: Sequence[int], alias: str | None = None
) -> Function:
return _counter_sum_aggregation_on_session_status_factory(
org_id, session_status="unhandled", metric_ids=metric_ids, alias=alias
)


def unhandled_users(org_id: int, metric_ids: Sequence[int], alias: str | None = None) -> Function:
return _set_uniq_aggregation_on_session_status_factory(
org_id, session_status="unhandled", metric_ids=metric_ids, alias=alias
)


def crashed_sessions(org_id: int, metric_ids: Sequence[int], alias: str | None = None) -> Function:
return _counter_sum_aggregation_on_session_status_factory(
org_id, session_status="crashed", metric_ids=metric_ids, alias=alias
Expand Down
14 changes: 13 additions & 1 deletion src/sentry/snuba/metrics/naming_layer/mri.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,20 +72,32 @@ class SessionMRI(Enum):
ERRORED_PREAGGREGATED = "e:sessions/error.preaggr@none"
ERRORED_SET = "e:sessions/error.unique@none"
ERRORED_ALL = "e:sessions/all_errored@none"
HANDLED = "e:sessions/handled.unique@none" # all sessions excluding handled and crashed
UNHANDLED = "e:sessions/unhandled@none" # unhandled, does not include crashed
CRASHED_AND_ABNORMAL = "e:sessions/crashed_abnormal@none"
CRASHED = "e:sessions/crashed@none"
CRASH_FREE = "e:sessions/crash_free@none"
ABNORMAL = "e:sessions/abnormal@none"
HANDLED_RATE = "e:sessions/handled_rate@ratio" # all sessions excluding handled and crashed
UNHANDLED_RATE = "e:sessions/unhandled_rate@ratio" # unhandled, does not include crashed
CRASH_RATE = "e:sessions/crash_rate@ratio"
CRASH_FREE_RATE = "e:sessions/crash_free_rate@ratio"
CRASH_FREE_RATE = "e:sessions/crash_free_rate@ratio" # includes handled and unhandled
ALL_USER = "e:sessions/user.all@none"
HEALTHY_USER = "e:sessions/user.healthy@none"
ERRORED_USER = "e:sessions/user.errored@none"
ERRORED_USER_ALL = "e:sessions/user.all_errored@none"
HANDLED_USER = "e:sessions/user.handled@none" # all sessions excluding handled and crashed
UNHANDLED_USER = "e:sessions/user.unhandled@none" # unhandled, does not include crashed
CRASHED_AND_ABNORMAL_USER = "e:sessions/user.crashed_abnormal@none"
CRASHED_USER = "e:sessions/user.crashed@none"
CRASH_FREE_USER = "e:sessions/user.crash_free@none"
ABNORMAL_USER = "e:sessions/user.abnormal@none"
HANDLED_USER_RATE = (
"e:sessions/user.handled_rate@ratio" # all sessions excluding handled and crashed
)
UNHANDLED_USER_RATE = (
"e:sessions/user.unhandled_rate@ratio" # unhandled, does not include crashed
)
CRASH_USER_RATE = "e:sessions/user.crash_rate@ratio"
CRASH_FREE_USER_RATE = "e:sessions/user.crash_free_rate@ratio"
ANR_USER = "e:sessions/user.anr@none"
Expand Down
2 changes: 2 additions & 0 deletions src/sentry/snuba/metrics/naming_layer/public.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class SessionMetricKey(Enum):
DURATION = "session.duration"
ALL = "session.all"
ABNORMAL = "session.abnormal"
UNHANDLED = "session.unhandled"
CRASHED = "session.crashed"
CRASH_FREE = "session.crash_free"
ERRORED = "session.errored"
Expand All @@ -46,6 +47,7 @@ class SessionMetricKey(Enum):
ALL_USER = "session.all_user"
ABNORMAL_USER = "session.abnormal_user"
CRASHED_USER = "session.crashed_user"
UNHANDLED_USER = "session.unhandled_user"
CRASH_FREE_USER = "session.crash_free_user"
ERRORED_USER = "session.errored_user"
HEALTHY_USER = "session.healthy_user"
Expand Down
35 changes: 28 additions & 7 deletions src/sentry/snuba/sessions_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,13 @@ def get_snuba_columns(self, raw_groupby) -> list[str]: ...
class SessionsField:
def get_snuba_columns(self, raw_groupby):
if "session.status" in raw_groupby:
return ["sessions", "sessions_abnormal", "sessions_crashed", "sessions_errored"]
return [
"sessions",
"sessions_abnormal",
"sessions_crashed",
"sessions_errored",
"sessions_unhandled",
]
return ["sessions"]

def extract_from_row(self, row, group):
Expand All @@ -116,15 +122,20 @@ def extract_from_row(self, row, group):
if status is None:
return row["sessions"]
if status == "healthy":
healthy_sessions = row["sessions"] - row["sessions_errored"]
healthy_sessions = row["sessions"] - row["sessions_errored"] - row["sessions_unhandled"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Double-Subtraction Error Inflates Healthy Metrics

Healthy session and user counts are inflated due to unhandled sessions/users being double-subtracted. The calculation for healthy metrics subtracts unhandled metrics separately, despite them already being included within the errored metric, leading to an over-reduction of the total.

Locations (2)
Fix in Cursor Fix in Web

return max(healthy_sessions, 0)
if status == "abnormal":
return row["sessions_abnormal"]
if status == "unhandled":
return row["sessions_unhandled"]
if status == "crashed":
return row["sessions_crashed"]
if status == "errored":
errored_sessions = (
row["sessions_errored"] - row["sessions_crashed"] - row["sessions_abnormal"]
row["sessions_errored"]
- row["sessions_unhandled"]
- row["sessions_crashed"]
- row["sessions_abnormal"]
)
return max(errored_sessions, 0)
return 0
Expand All @@ -133,7 +144,7 @@ def extract_from_row(self, row, group):
class UsersField:
def get_snuba_columns(self, raw_groupby):
if "session.status" in raw_groupby:
return ["users", "users_abnormal", "users_crashed", "users_errored"]
return ["users", "users_abnormal", "users_crashed", "users_errored", "users_unhandled"]
return ["users"]

def extract_from_row(self, row, group):
Expand All @@ -143,14 +154,21 @@ def extract_from_row(self, row, group):
if status is None:
return row["users"]
if status == "healthy":
healthy_users = row["users"] - row["users_errored"]
healthy_users = row["users"] - row["users_errored"] - row["users_unhandled"]
return max(healthy_users, 0)
if status == "abnormal":
return row["users_abnormal"]
if status == "unhandled":
return row["users_unhandled"]
if status == "crashed":
return row["users_crashed"]
if status == "errored":
errored_users = row["users_errored"] - row["users_crashed"] - row["users_abnormal"]
errored_users = (
row["users_errored"]
- row["users_crashed"]
- row["users_abnormal"]
- row["users_unhandled"]
)
return max(errored_users, 0)
return 0

Expand Down Expand Up @@ -232,7 +250,10 @@ def get_snuba_groupby(self):
return []

def get_keys_for_row(self, row):
return [("session.status", key) for key in ["healthy", "abnormal", "crashed", "errored"]]
return [
("session.status", key)
for key in ["healthy", "abnormal", "crashed", "errored", "unhandled"]
]


# NOTE: in the future we might add new `user_agent` and `os` fields
Expand Down
Loading
Loading