Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 37 additions & 38 deletions src/sentry/deletions/defaults/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
from sentry.snuba.dataset import Dataset
from sentry.tasks.delete_seer_grouping_records import may_schedule_task_to_delete_hashes_from_seer
from sentry.utils import metrics
from sentry.utils.query import RangeQuerySetWrapper

from ..base import BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
from ..manager import DeletionTaskManager
Expand Down Expand Up @@ -215,8 +214,12 @@ def _delete_children(self, instance_list: Sequence[Group]) -> None:
error_group_ids = [group.id for group in error_groups]
issue_platform_group_ids = [group.id for group in issue_platform_groups]

delete_group_hashes(instance_list[0].project_id, error_group_ids, seer_deletion=True)
delete_group_hashes(instance_list[0].project_id, issue_platform_group_ids)
delete_project_group_hashes(
instance_list[0].project_id, group_ids_filter=error_group_ids, seer_deletion=True
)
delete_project_group_hashes(
instance_list[0].project_id, group_ids_filter=issue_platform_group_ids
)

# If this isn't a retention cleanup also remove event data.
if not os.environ.get("_SENTRY_CLEANUP"):
Expand Down Expand Up @@ -246,21 +249,6 @@ def mark_deletion_in_progress(self, instance_list: Sequence[Group]) -> None:
).update(status=GroupStatus.DELETION_IN_PROGRESS, substatus=None)


def delete_project_group_hashes(project_id: int) -> None:
groups = []
for group in RangeQuerySetWrapper(
Group.objects.filter(project_id=project_id), step=GROUP_CHUNK_SIZE
):
groups.append(group)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was basically restricting the deletions to rows with group being set and missing all non-group related rows.

All rows for this project are null:

SELECT count(*) FROM `getsentry.sentry_grouphash` WHERE project_id IN (4506269346365440) and group_id is null;


error_groups, issue_platform_groups = separate_by_group_category(groups)
error_group_ids = [group.id for group in error_groups]
delete_group_hashes(project_id, error_group_ids, seer_deletion=True)

issue_platform_group_ids = [group.id for group in issue_platform_groups]
delete_group_hashes(project_id, issue_platform_group_ids)


def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
"""
Update seer_matched_grouphash to None for GroupHashMetadata rows
Expand Down Expand Up @@ -310,41 +298,52 @@ def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
metrics.incr("deletions.group_hash_metadata.max_iterations_reached", sample_rate=1.0)


def delete_project_group_hashes(
    project_id: int,
    group_ids_filter: Sequence[int] | None = None,
    seer_deletion: bool = False,
) -> None:
    """
    Delete GroupHash records for a project.

    This is the main function for deleting GroupHash records. It can delete all
    hashes for a project (used during project deletion to clean up orphaned
    records) or delete hashes for specific groups (used during group deletion).

    Args:
        project_id: The project to delete hashes for.
        group_ids_filter: Optional filter for specific group IDs.
            - If None: deletes ALL GroupHash records for the project
              (including orphaned rows where group_id is NULL)
            - If empty: returns immediately (no-op for safety)
            - If non-empty: deletes only hashes for those specific groups
        seer_deletion: Whether to notify Seer about the deletion.
    """
    # Safety: an explicitly empty filter means there is nothing to delete.
    if group_ids_filter is not None and not group_ids_filter:
        return

    # Clamp the configured batch size so a misconfigured option (0 or negative)
    # cannot produce an empty/invalid slice below.
    hashes_batch_size = max(1, options.get("deletions.group-hashes-batch-size"))

    # Upper bound on iterations to prevent infinite loops; the loop also
    # terminates naturally once no more hashes are found.
    iterations = 0
    while iterations < GROUP_HASH_ITERATIONS:
        # Base query: all hashes for this project. Without a group filter this
        # also matches rows whose group_id is NULL (orphaned hashes) — the old
        # implementation iterated Group rows first and therefore missed them.
        qs = GroupHash.objects.filter(project_id=project_id)

        # Apply the group filter only when the caller provided one.
        if group_ids_filter is not None:
            qs = qs.filter(group_id__in=group_ids_filter)

        hashes_chunk = list(qs.values_list("id", "hash")[:hashes_batch_size])
        if not hashes_chunk:
            break
        try:
            if seer_deletion:
                # Tell seer to delete grouping records for these groups.
                # It's low priority to delete the hashes from seer, so we don't
                # want any network errors to block the deletion of the groups.
                hash_values = [gh[1] for gh in hashes_chunk]
                may_schedule_task_to_delete_hashes_from_seer(project_id, hash_values)
        except Exception:
            logger.warning("Error scheduling task to delete hashes from seer")
        finally:
            hash_ids = [gh[0] for gh in hashes_chunk]
            # GroupHashMetadata rows can reference GroupHash rows via
            # seer_matched_grouphash_id. Before deleting these GroupHash rows we
            # must either:
            # 1. Update seer_matched_grouphash to None first (to avoid foreign
            #    key constraint errors), OR
            # 2. Delete the GroupHashMetadata rows entirely (they'll be deleted
            #    anyway).
            # Updating the columns first leaves the metadata deletion with less
            # work to do, improving deletion performance.
            update_group_hash_metadata_in_batches(hash_ids)
            GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
            GroupHash.objects.filter(id__in=hash_ids).delete()
        # NOTE(review): the source view collapses a few lines here ("Expand
        # All" fold); the loop-counter increment presumably lives there —
        # confirm against the full file.
        iterations += 1

    if iterations == GROUP_HASH_ITERATIONS:
        metrics.incr("deletions.group_hashes.max_iterations_reached", sample_rate=1.0)
        logger.warning(
            "delete_group_hashes.max_iterations_reached",
            extra={"project_id": project_id, "filtered": group_ids_filter is not None},
        )


Expand Down
51 changes: 51 additions & 0 deletions tests/sentry/deletions/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sentry.models.files.file import File
from sentry.models.group import Group
from sentry.models.groupassignee import GroupAssignee
from sentry.models.grouphash import GroupHash
from sentry.models.groupmeta import GroupMeta
from sentry.models.groupopenperiod import GroupOpenPeriod
from sentry.models.groupopenperiodactivity import GroupOpenPeriodActivity, OpenPeriodActivityType
Expand Down Expand Up @@ -233,6 +234,56 @@ def test_delete_error_events(self) -> None:
)
assert len(events) == 0

def test_delete_orphaned_grouphashes(self) -> None:
    """Test that orphaned GroupHash records (group_id=None) are deleted during project deletion."""
    project = self.create_project(name="test")

    # Create an event to get a group with an associated GroupHash.
    event = self.store_event(
        data={
            "timestamp": before_now(minutes=1).isoformat(),
            "message": "test event",
        },
        project_id=project.id,
    )
    assert event.group is not None
    group = event.group

    # Get the GroupHash created for this group.
    group_hash = GroupHash.objects.get(project=project, group=group)

    # Create additional orphaned GroupHash records (group_id=None).
    # These simulate GroupHash records that were created but never assigned
    # to a Group, or secondary grouping hashes.
    orphan_hash_1 = GroupHash.objects.create(
        project=project,
        hash="a" * 32,
        group=None,  # Orphaned - no group assigned
    )
    orphan_hash_2 = GroupHash.objects.create(
        project=project,
        hash="b" * 32,
        group=None,  # Orphaned - no group assigned
    )

    # Verify all GroupHash records exist before deletion.
    assert GroupHash.objects.filter(project=project).count() == 3
    assert GroupHash.objects.filter(project=project, group__isnull=True).count() == 2

    # Schedule and run the project deletion.
    self.ScheduledDeletion.schedule(instance=project, days=0)
    with self.tasks():
        run_scheduled_deletions()

    # Verify the project is deleted.
    assert not Project.objects.filter(id=project.id).exists()

    # Verify ALL GroupHash records are deleted, including orphans.
    assert not GroupHash.objects.filter(id=group_hash.id).exists()
    assert not GroupHash.objects.filter(id=orphan_hash_1.id).exists()
    assert not GroupHash.objects.filter(id=orphan_hash_2.id).exists()
    assert GroupHash.objects.filter(project_id=project.id).count() == 0

@mock.patch("sentry.quotas.backend.remove_seat")
def test_delete_with_uptime_monitors(self, mock_remove_seat: mock.MagicMock) -> None:
project = self.create_project(name="test")
Expand Down
Loading