-
-
Notifications
You must be signed in to change notification settings - Fork 4.5k
WIP #101561
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP #101561
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| from django.db.models import Model, QuerySet | ||
| from django.utils import timezone | ||
|
|
||
| from sentry import options | ||
| from sentry.runner.decorators import log_options | ||
| from sentry.silo.base import SiloLimit, SiloMode | ||
|
|
||
|
|
@@ -224,6 +225,15 @@ def is_filtered(model: type[Model]) -> bool: | |
|
|
||
| deletes = models_which_use_deletions_code_path() | ||
|
|
||
| if options.get("cleanup.delete-old-groups.enabled") and model == ("sentry.Group",): | ||
| with metrics.timer("cleanup.stage", instance=router, tags={"stage": "old_groups"}): | ||
| try: | ||
| delete_groups_older_than_days( | ||
| days=options.get("cleanup.delete-old-groups.days") | ||
| ) | ||
| except Exception: | ||
| logger.exception("Error in delete_groups_older_than_days") | ||
|
|
||
| # Track timing for each deletion stage to monitor execution progress and identify bottlenecks | ||
| with metrics.timer( | ||
| "cleanup.stage", instance=router, tags={"stage": "specialized_cleanups"} | ||
|
|
@@ -298,6 +308,80 @@ def is_filtered(model: type[Model]) -> bool: | |
| ) | ||
|
|
||
|
|
||
| def delete_groups_older_than_days(days: int) -> None: | ||
| """ | ||
| Delete old groups based on last_seen timestamp. | ||
|
|
||
| Since there's no index with last_seen as the leading column, we get all project_ids | ||
| (which is fast via the primary key index), then process each project using the | ||
| (project_id, last_seen) index. | ||
|
|
||
| Uses batching to avoid long-running transactions and table locks. | ||
| """ | ||
| from sentry import options | ||
| from sentry.models.group import Group | ||
| from sentry.models.project import Project | ||
| from sentry.utils import metrics | ||
|
|
||
| metrics_key = "cleanup.delete-old-groups" | ||
|
|
||
| cutoff_date = timezone.now() - timedelta(days=days) | ||
|
|
||
| # Get all project_ids (uses Project primary key index, very fast) | ||
| project_ids = list(Project.objects.values_list("id", flat=True)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. High severity vulnerability may affect your project—review required: ℹ️ Why this mattersAffected versions of Django are vulnerable to Improper Neutralization of Special Elements used in an SQL Command ('SQL Injection'). SQL injection in Django's ORM column aliases: when using QuerySet.annotate(), QuerySet.alias(), QuerySet.aggregate(), or QuerySet.extra() with dictionary expansion (**kwargs), the dictionary keys are used unescaped as SQL column aliases. On MySQL and MariaDB backends, an attacker who can influence those keys (for example, by passing a crafted dict of annotations) can inject arbitrary SQL into the generated query. To resolve this comment:
💬 Ignore this findingTo ignore this, reply with:
You can view more details on this finding in the Semgrep AppSec Platform here. |
||
|
|
||
| # Track total iterations to respect the configured limit | ||
| total_iterations = 0 | ||
| max_iterations = options.get("cleanup.delete-old-groups.iterations") | ||
| batch_size = options.get("cleanup.delete-old-groups.batch-size") | ||
|
|
||
| # Process each project using the (project_id, last_seen) index | ||
| for project_id in project_ids: | ||
| # This query efficiently uses sentry_groupedmessage_project_id_last_seen index | ||
| # by filtering on project_id first (leading column), then last_seen | ||
| # ORDER BY last_seen DESC, id DESC matches the index for optimal performance | ||
| # CREATE INDEX sentry_groupedmessage_project_id_last_seen ON public.sentry_groupedmessage USING btree (project_id, last_seen DESC) | ||
| while True: | ||
| # Check iteration limit | ||
| if total_iterations >= max_iterations: | ||
| logger.info( | ||
| "Reached iteration limit, stopping deletion", | ||
| extra={"max_iterations": max_iterations}, | ||
| ) | ||
| return | ||
|
|
||
| with metrics.timer(f"{metrics_key}.delete_groups_for_project"): | ||
| try: | ||
| # Use order_by to ensure consistent ordering that matches index | ||
| # This helps PostgreSQL use the index efficiently | ||
| group_ids = list( | ||
| Group.objects.filter(project_id=project_id, last_seen__lt=cutoff_date) | ||
| .order_by("-last_seen", "-id") | ||
| .values_list("id", flat=True)[:batch_size] | ||
|
Comment on lines
+358
to
+360
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. High severity vulnerability may affect your project—review required: ℹ️ Why this mattersAffected versions of Django are vulnerable to Improper Neutralization of Special Elements used in an SQL Command ('SQL Injection'). SQL injection in Django's ORM column aliases: when using QuerySet.annotate(), QuerySet.alias(), QuerySet.aggregate(), or QuerySet.extra() with dictionary expansion (**kwargs), the dictionary keys are used unescaped as SQL column aliases. On MySQL and MariaDB backends, an attacker who can influence those keys (for example, by passing a crafted dict of annotations) can inject arbitrary SQL into the generated query. To resolve this comment:
💬 Ignore this findingTo ignore this, reply with:
You can view more details on this finding in the Semgrep AppSec Platform here. |
||
| ) | ||
|
|
||
| if not group_ids: | ||
| # No more groups to delete for this project | ||
| break | ||
|
|
||
| deleted_count, _ = Group.objects.filter(id__in=group_ids).delete() | ||
| metrics.incr(f"{metrics_key}.deleted_count", deleted_count) | ||
| total_iterations += 1 | ||
|
|
||
| # If we deleted fewer groups than batch size, we're done with this project | ||
| if len(group_ids) < batch_size: | ||
| break | ||
|
|
||
| except Exception: | ||
| logger.exception("Error in delete_groups_batch") | ||
| break | ||
|
|
||
| # Check killswitch after each batch | ||
| if not options.get("cleanup.delete-old-groups.enabled"): | ||
| logger.info("Killswitch disabled, stopping deletion") | ||
| return | ||
|
|
||
|
|
||
| def _validate_and_setup_environment(concurrency: int, silent: bool) -> None: | ||
| """Validate input parameters and set up environment variables.""" | ||
| if concurrency < 1: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bug: Group Cleanup Blocked by Incorrect Model Handling
The condition
model == ("sentry.Group",)prevents the new group cleanup from running. Themodelparameter from CLI arguments typically contains simple names like "Group" or full paths, not("sentry.Group",). This is inconsistent with how models are usually handled (e.g., lowercase names in a list, or all models if none specified), making the new cleanup unreachable.