Skip to content

feat(release health): Update snuba metrics to know about session unhandled state #96707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
8 changes: 8 additions & 0 deletions src/sentry/release_health/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
"crash_free_rate(user)",
"anr_rate()",
"foreground_anr_rate()",
"unhandled_rate(session)",
"unhandled_rate(user)",
]

GroupByFieldName = Literal[
Expand Down Expand Up @@ -182,6 +184,10 @@ class ReleaseHealthOverview(TypedDict, total=False):
duration_p50: float | None
duration_p90: float | None
stats: Mapping[StatsPeriod, ReleaseHealthStats]
sessions_unhandled: int
handled_sessions: float | None
unhandled_sessions: float | None
unhandled_users: float | None


class CrashFreeBreakdown(TypedDict):
Expand All @@ -202,6 +208,7 @@ class UserCounts(TypedDict):
users_healthy: int
users_crashed: int
users_abnormal: int
users_unhandled: int
users_errored: int


Expand All @@ -214,6 +221,7 @@ class SessionCounts(TypedDict):
sessions_healthy: int
sessions_crashed: int
sessions_abnormal: int
sessions_unhandled: int
sessions_errored: int


Expand Down
47 changes: 42 additions & 5 deletions src/sentry/release_health/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ def _get_errored_sessions_for_overview(
end: datetime,
) -> Mapping[tuple[int, str], int]:
"""
Count of errored sessions, incl fatal (abnormal, crashed) sessions,
Count of errored sessions, incl fatal (abnormal, unhandled, crashed) session
excl errored *preaggregated* sessions
"""
project_ids = [p.id for p in projects]
Expand Down Expand Up @@ -774,12 +774,13 @@ def _get_session_by_status_for_overview(
end: datetime,
) -> Mapping[tuple[int, str, str], int]:
"""
Counts of init, abnormal and crashed sessions, purpose-built for overview
Counts of init, abnormal, unhandled and crashed sessions, purpose-built for overview
"""
project_ids = [p.id for p in projects]

select = [
MetricField(metric_mri=SessionMRI.ABNORMAL.value, alias="abnormal", op=None),
MetricField(metric_mri=SessionMRI.UNHANDLED.value, alias="unhandled", op=None),
MetricField(metric_mri=SessionMRI.CRASHED.value, alias="crashed", op=None),
MetricField(metric_mri=SessionMRI.ALL.value, alias="init", op=None),
MetricField(
Expand Down Expand Up @@ -820,7 +821,13 @@ def _get_session_by_status_for_overview(
release = by.get("release")

totals = group.get("totals", {})
for status in ["abnormal", "crashed", "init", "errored_preaggr"]:
for status in [
"abnormal",
"unhandled",
"crashed",
"init",
"errored_preaggr",
]:
value = totals.get(status)
if value is not None and value != 0.0:
ret_val[(proj_id, release, status)] = value
Expand Down Expand Up @@ -1037,8 +1044,10 @@ def get_release_health_data_overview(
if not has_health_data and summary_stats_period != "90d":
fetch_has_health_data_releases.add((project_id, release))

sessions_unhandled = rv_sessions.get((project_id, release, "unhandled"), 0)
sessions_crashed = rv_sessions.get((project_id, release, "crashed"), 0)

users_unhandled = rv_users.get((project_id, release, "unhandled_users"), 0)
users_crashed = rv_users.get((project_id, release, "crashed_users"), 0)

rv_row = rv[project_id, release] = {
Expand All @@ -1050,19 +1059,38 @@ def get_release_health_data_overview(
"total_project_sessions_24h": adoption_info.get("project_sessions_24h"),
"total_sessions": total_sessions,
"total_users": total_users,
"has_health_data": has_health_data,
"sessions_crashed": sessions_crashed,
# Users where the error was `unhandled`; possibly resulting in a crash
"unhandled_users": (
100 - (users_unhandled + users_crashed) / total_users * 100
if total_users
else None
),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Metric Mismatch: Unhandled Users Not Tracked

The unhandled_users metric is always 0 because the _get_users_and_crashed_users_for_overview function, which populates the rv_users dictionary, does not query for "unhandled_users" data. It only fetches "all_users" and "crashed_users". Consequently, the users_unhandled value defaults to 0, making the calculated unhandled_users percentage incorrect and effectively identical to crash_free_users.

Locations (1)
Fix in Cursor Fix in Web

# Users where the error was not a crash (but may have been unhandled)
"crash_free_users": (
100 - users_crashed / total_users * 100 if total_users else None
),
"has_health_data": has_health_data,
# Sessions where the error was specifically `unhandled`; NOT resulting in a crash
"sessions_unhandled": sessions_unhandled,
# Sessions where the error was a crash
"sessions_crashed": sessions_crashed,
# Sessions where the error was `unhandled`; possibly resulting in a crash
"unhandled_sessions": (
100 - (sessions_unhandled + sessions_crashed) / float(total_sessions) * 100
if total_sessions
else None
),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Error Metrics Calculation Inversion

The unhandled_users and unhandled_sessions fields are incorrectly calculated. Despite their names and comments indicating they represent users/sessions with unhandled errors, their formulas (100 - (unhandled + crashed) / total * 100) compute the percentage of users/sessions that are neither unhandled nor crashed. This results in a semantic mismatch where the fields represent the inverse of their intended meaning.

Locations (1)
Fix in Cursor Fix in Web

# Sessions where the error was not a crash (but may have been unhandled)
"crash_free_sessions": (
100 - sessions_crashed / float(total_sessions) * 100 if total_sessions else None
),
# Sessions where the error was handled
"sessions_errored": max(
0,
rv_errored_sessions.get((project_id, release), 0)
+ rv_sessions.get((project_id, release, "errored_preaggr"), 0)
- sessions_crashed
- sessions_unhandled
- rv_sessions.get((project_id, release, "abnormal"), 0),
),
"duration_p50": None,
Expand Down Expand Up @@ -1427,6 +1455,9 @@ def get_project_release_stats(
MetricField(
metric_mri=SessionMRI.CRASHED_USER.value, alias="users_crashed", op=None
),
MetricField(
metric_mri=SessionMRI.UNHANDLED_USER.value, alias="users_unhandled", op=None
),
MetricField(
metric_mri=SessionMRI.ERRORED_USER.value, alias="users_errored", op=None
),
Expand All @@ -1440,6 +1471,11 @@ def get_project_release_stats(
MetricField(
metric_mri=SessionMRI.ABNORMAL.value, alias="sessions_abnormal", op=None
),
MetricField(
metric_mri=SessionMRI.UNHANDLED.value,
alias="sessions_unhandled",
op=None,
),
MetricField(metric_mri=SessionMRI.CRASHED.value, alias="sessions_crashed", op=None),
MetricField(metric_mri=SessionMRI.ERRORED.value, alias="sessions_errored", op=None),
MetricField(metric_mri=SessionMRI.HEALTHY.value, alias="sessions_healthy", op=None),
Expand Down Expand Up @@ -1500,6 +1536,7 @@ def get_project_release_stats(
f"{stat}": 0,
f"{stat}_abnormal": 0,
f"{stat}_crashed": 0,
f"{stat}_unhandled": 0,
f"{stat}_errored": 0,
f"{stat}_healthy": 0,
}
Expand Down
11 changes: 9 additions & 2 deletions src/sentry/release_health/metrics_sessions_v2.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
""" This module offers the same functionality as sessions_v2, but pulls its data
"""This module offers the same functionality as sessions_v2, but pulls its data
from the `metrics` dataset instead of `sessions`.

Do not call this module directly. Use the `release_health` service instead. """
Do not call this module directly. Use the `release_health` service instead."""

import logging
from abc import ABC, abstractmethod
Expand Down Expand Up @@ -242,6 +242,7 @@ def _get_metric_fields(
self.status_to_metric_field[SessionStatus.ABNORMAL],
self.status_to_metric_field[SessionStatus.CRASHED],
self.status_to_metric_field[SessionStatus.ERRORED],
self.status_to_metric_field[SessionStatus.UNHANDLED],
]
return [self.get_all_field()]

Expand All @@ -265,6 +266,7 @@ class SumSessionField(CountField):
SessionStatus.ABNORMAL: MetricField(None, SessionMRI.ABNORMAL.value),
SessionStatus.CRASHED: MetricField(None, SessionMRI.CRASHED.value),
SessionStatus.ERRORED: MetricField(None, SessionMRI.ERRORED.value),
SessionStatus.UNHANDLED: MetricField(None, SessionMRI.UNHANDLED.value),
None: MetricField(None, SessionMRI.ALL.value),
}

Expand Down Expand Up @@ -298,6 +300,7 @@ def __init__(
SessionStatus.ABNORMAL: MetricField(None, SessionMRI.ABNORMAL_USER.value),
SessionStatus.CRASHED: MetricField(None, SessionMRI.CRASHED_USER.value),
SessionStatus.ERRORED: MetricField(None, SessionMRI.ERRORED_USER.value),
SessionStatus.UNHANDLED: MetricField(None, SessionMRI.UNHANDLED_USER.value),
None: MetricField(None, SessionMRI.ALL_USER.value),
}

Expand Down Expand Up @@ -341,6 +344,8 @@ class SimpleForwardingField(Field):
"crash_free_rate(user)": SessionMRI.CRASH_FREE_USER_RATE,
"anr_rate()": SessionMRI.ANR_RATE,
"foreground_anr_rate()": SessionMRI.FOREGROUND_ANR_RATE,
"unhandled_rate(session)": SessionMRI.UNHANDLED_RATE,
"unhandled_rate(user)": SessionMRI.UNHANDLED_USER_RATE,
}

def __init__(self, name: str, raw_groupby: Sequence[str], status_filter: StatusFilter):
Expand Down Expand Up @@ -379,6 +384,8 @@ def _get_metric_fields(
"crash_free_rate(user)": SimpleForwardingField,
"anr_rate()": SimpleForwardingField,
"foreground_anr_rate()": SimpleForwardingField,
"unhandled_rate(session)": SimpleForwardingField,
"unhandled_rate(user)": SimpleForwardingField,
}
PREFLIGHT_QUERY_COLUMNS = {"release.timestamp"}
VirtualOrderByName = Literal["release.timestamp"]
Expand Down
20 changes: 19 additions & 1 deletion src/sentry/snuba/metrics/fields/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
sum_if_column_snql,
team_key_transaction_snql,
tolerated_count_transaction,
unhandled_sessions,
unhandled_users,
uniq_aggregation_on_metric,
uniq_if_column_snql,
)
Expand Down Expand Up @@ -1375,6 +1377,22 @@ def generate_where_statements(
org_id, metric_ids, alias=alias
),
),
SingularEntityDerivedMetric(
metric_mri=SessionMRI.UNHANDLED.value,
metrics=[SessionMRI.RAW_SESSION.value],
unit="sessions",
snql=lambda project_ids, org_id, metric_ids, alias=None: unhandled_sessions(
org_id, metric_ids, alias=alias
),
),
SingularEntityDerivedMetric(
metric_mri=SessionMRI.UNHANDLED_USER.value,
metrics=[SessionMRI.RAW_USER.value],
unit="users",
snql=lambda project_ids, org_id, metric_ids, alias=None: unhandled_users(
org_id, metric_ids, alias=alias
),
),
SingularEntityDerivedMetric(
metric_mri=SessionMRI.CRASHED.value,
metrics=[SessionMRI.RAW_SESSION.value],
Expand Down Expand Up @@ -1926,7 +1944,7 @@ def metric_object_factory(op: MetricOperationType | None, metric_mri: str) -> Me


def generate_bottom_up_dependency_tree_for_metrics(
metrics_query_fields_set: set[tuple[MetricOperationType | None, str, str]]
metrics_query_fields_set: set[tuple[MetricOperationType | None, str, str]],
) -> list[tuple[MetricOperationType | None, str, str]]:
"""
This function basically generates a dependency list for all instances of
Expand Down
14 changes: 14 additions & 0 deletions src/sentry/snuba/metrics/fields/snql.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ def all_users(org_id: int, metric_ids: Sequence[int], alias: str | None = None)
return uniq_aggregation_on_metric(metric_ids, alias)


def unhandled_sessions(
org_id: int, metric_ids: Sequence[int], alias: str | None = None
) -> Function:
return _counter_sum_aggregation_on_session_status_factory(
org_id, session_status="unhandled", metric_ids=metric_ids, alias=alias
)


def unhandled_users(org_id: int, metric_ids: Sequence[int], alias: str | None = None) -> Function:
return _set_uniq_aggregation_on_session_status_factory(
org_id, session_status="unhandled", metric_ids=metric_ids, alias=alias
)


def crashed_sessions(org_id: int, metric_ids: Sequence[int], alias: str | None = None) -> Function:
return _counter_sum_aggregation_on_session_status_factory(
org_id, session_status="crashed", metric_ids=metric_ids, alias=alias
Expand Down
14 changes: 13 additions & 1 deletion src/sentry/snuba/metrics/naming_layer/mri.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,20 +72,32 @@ class SessionMRI(Enum):
ERRORED_PREAGGREGATED = "e:sessions/error.preaggr@none"
ERRORED_SET = "e:sessions/error.unique@none"
ERRORED_ALL = "e:sessions/all_errored@none"
HANDLED = "e:sessions/handled.unique@none" # all sessions excluding handled and crashed
UNHANDLED = "e:sessions/unhandled@none" # unhandled, does not include crashed
CRASHED_AND_ABNORMAL = "e:sessions/crashed_abnormal@none"
CRASHED = "e:sessions/crashed@none"
CRASH_FREE = "e:sessions/crash_free@none"
ABNORMAL = "e:sessions/abnormal@none"
HANDLED_RATE = "e:sessions/handled_rate@ratio" # all sessions excluding handled and crashed
UNHANDLED_RATE = "e:sessions/unhandled_rate@ratio" # unhandled, does not include crashed
CRASH_RATE = "e:sessions/crash_rate@ratio"
CRASH_FREE_RATE = "e:sessions/crash_free_rate@ratio"
CRASH_FREE_RATE = "e:sessions/crash_free_rate@ratio" # includes handled and unhandled
ALL_USER = "e:sessions/user.all@none"
HEALTHY_USER = "e:sessions/user.healthy@none"
ERRORED_USER = "e:sessions/user.errored@none"
ERRORED_USER_ALL = "e:sessions/user.all_errored@none"
HANDLED_USER = "e:sessions/user.handled@none" # all sessions excluding handled and crashed
UNHANDLED_USER = "e:sessions/user.unhandled@none" # unhandled, does not include crashed
CRASHED_AND_ABNORMAL_USER = "e:sessions/user.crashed_abnormal@none"
CRASHED_USER = "e:sessions/user.crashed@none"
CRASH_FREE_USER = "e:sessions/user.crash_free@none"
ABNORMAL_USER = "e:sessions/user.abnormal@none"
HANDLED_USER_RATE = (
"e:sessions/user.handled_rate@ratio" # all sessions excluding handled and crashed
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Incorrect Comments for Handled Metrics

The comments for HANDLED, HANDLED_RATE, HANDLED_USER, and HANDLED_USER_RATE metrics incorrectly state "all sessions excluding handled and crashed". This is logically inconsistent with metrics representing handled sessions and appears to be a copy-paste error.

Locations (2)
Fix in Cursor Fix in Web

UNHANDLED_USER_RATE = (
"e:sessions/user.unhandled_rate@ratio" # unhandled, does not include crashed
)
CRASH_USER_RATE = "e:sessions/user.crash_rate@ratio"
CRASH_FREE_USER_RATE = "e:sessions/user.crash_free_rate@ratio"
ANR_USER = "e:sessions/user.anr@none"
Expand Down
2 changes: 2 additions & 0 deletions src/sentry/snuba/metrics/naming_layer/public.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class SessionMetricKey(Enum):
DURATION = "session.duration"
ALL = "session.all"
ABNORMAL = "session.abnormal"
UNHANDLED = "session.unhandled"
CRASHED = "session.crashed"
CRASH_FREE = "session.crash_free"
ERRORED = "session.errored"
Expand All @@ -46,6 +47,7 @@ class SessionMetricKey(Enum):
ALL_USER = "session.all_user"
ABNORMAL_USER = "session.abnormal_user"
CRASHED_USER = "session.crashed_user"
UNHANDLED_USER = "session.unhandled_user"
CRASH_FREE_USER = "session.crash_free_user"
ERRORED_USER = "session.errored_user"
HEALTHY_USER = "session.healthy_user"
Expand Down
28 changes: 24 additions & 4 deletions src/sentry/snuba/sessions_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,13 @@ def get_snuba_columns(self, raw_groupby) -> list[str]: ...
class SessionsField:
def get_snuba_columns(self, raw_groupby):
if "session.status" in raw_groupby:
return ["sessions", "sessions_abnormal", "sessions_crashed", "sessions_errored"]
return [
"sessions",
"sessions_abnormal",
"sessions_crashed",
"sessions_errored",
"sessions_unhandled",
]
return ["sessions"]

def extract_from_row(self, row, group):
Expand All @@ -116,15 +122,21 @@ def extract_from_row(self, row, group):
if status is None:
return row["sessions"]
if status == "healthy":
# TODO(ryan953): Should we also subtract sessions_unhandled & sessions_crashed?
healthy_sessions = row["sessions"] - row["sessions_errored"]
return max(healthy_sessions, 0)
if status == "abnormal":
return row["sessions_abnormal"]
if status == "unhandled":
return row["sessions_unhandled"]
if status == "crashed":
return row["sessions_crashed"]
if status == "errored":
errored_sessions = (
row["sessions_errored"] - row["sessions_crashed"] - row["sessions_abnormal"]
row["sessions_errored"]
- row["sessions_unhandled"]
- row["sessions_crashed"]
- row["sessions_abnormal"]
)
return max(errored_sessions, 0)
return 0
Expand All @@ -133,7 +145,7 @@ def extract_from_row(self, row, group):
class UsersField:
def get_snuba_columns(self, raw_groupby):
if "session.status" in raw_groupby:
return ["users", "users_abnormal", "users_crashed", "users_errored"]
return ["users", "users_abnormal", "users_crashed", "users_errored", "users_unhandled"]
return ["users"]

def extract_from_row(self, row, group):
Expand All @@ -143,14 +155,22 @@ def extract_from_row(self, row, group):
if status is None:
return row["users"]
if status == "healthy":
# TODO(ryan953): Should we also subtract users_unhandled & users_crashed?
healthy_users = row["users"] - row["users_errored"]
return max(healthy_users, 0)
if status == "abnormal":
return row["users_abnormal"]
if status == "unhandled":
return row["users_unhandled"]
if status == "crashed":
return row["users_crashed"]
if status == "errored":
errored_users = row["users_errored"] - row["users_crashed"] - row["users_abnormal"]
errored_users = (
row["users_errored"]
- row["users_crashed"]
- row["users_abnormal"]
- row["users_unhandled"]
)
return max(errored_users, 0)
return 0

Expand Down
Loading
Loading