13 changes: 0 additions & 13 deletions src/sentry/seer/sentry_data_models.py
@@ -22,16 +22,6 @@ class Span(BaseModel):
span_description: str | None


class EvidenceSpan(BaseModel):
span_id: str | None = None
parent_span_id: str | None = None
timestamp: float | None = None
op: str | None = None
description: str | None = None
exclusive_time: float | None = None # duration in milliseconds
data: dict[str, Any] | None = None


class TraceData(BaseModel):
trace_id: str
project_id: int
@@ -42,10 +32,7 @@ class TraceData(BaseModel):

class EvidenceTraceData(BaseModel):
Member
now that we aren't passing any real trace data here, we can use whatever name we want

trace_id: str
project_id: int
transaction_name: str
Member
@roggenkemper Dec 5, 2025

do we need the transaction name in addition to the trace_id when fetching the EAPTrace? or is this just so we still have access to the transaction name for our own things?

Member Author

great q - the transaction name is now just context data that we pass to seer, and seer passes it back in the detected issue because we need it to create the issue.
the EAPTrace only needs trace_id + org_id.
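
For illustration only (not part of this diff): a minimal sketch of that round trip. The values are made up, and the dict shapes assume the EvidenceTraceData and DetectedIssue models in this PR.

```python
# Hypothetical round trip (values are illustrative, not from a real trace).
evidence_trace = {
    "trace_id": "a1b2c3d4e5f6",                  # seer fetches the EAPTrace from this + org_id
    "project_id": 42,
    "transaction_name": "GET /api/0/projects/",  # context only; echoed back by seer
}

seer_request = {
    "telemetry": [{**evidence_trace, "kind": "trace"}],
    "organization_id": 1,
    "project_id": 42,
}

# Each detected issue in seer's response carries the same trace_id and
# transaction_name, so create_issue_occurrence_from_detection() can build the
# occurrence without re-fetching the trace.
detected_issue = {
    "title": "N+1 query on project list",
    "explanation": "...",
    "impact": "...",
    "evidence": "...",
    "trace_id": evidence_trace["trace_id"],
    "transaction_name": evidence_trace["transaction_name"],
}
```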

total_spans: int
spans: list[EvidenceSpan]


class EAPTrace(BaseModel):
177 changes: 76 additions & 101 deletions src/sentry/tasks/llm_issue_detection/detection.py
@@ -16,12 +16,12 @@
from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka
from sentry.models.project import Project
from sentry.net.http import connection_from_url
from sentry.seer.explorer.index_data import get_transactions_for_project
from sentry.seer.models import SeerApiError
from sentry.seer.sentry_data_models import EvidenceTraceData
from sentry.seer.signed_seer_api import make_signed_seer_api_request
from sentry.tasks.base import instrumented_task
from sentry.tasks.llm_issue_detection.trace_data import get_evidence_trace_for_llm_detection
from sentry.tasks.llm_issue_detection.trace_data import (
get_project_top_transaction_traces_for_llm_detection,
)
from sentry.taskworker.namespaces import issues_tasks
from sentry.utils import json

@@ -30,10 +30,8 @@
SEER_ANALYZE_ISSUE_ENDPOINT_PATH = "/v1/automation/issue-detection/analyze"
SEER_TIMEOUT_S = 120
SEER_RETRIES = 1

NUM_TRANSACTIONS_TO_PROCESS = 20
LOWER_SPAN_LIMIT = 20
UPPER_SPAN_LIMIT = 500
Member Author
these will be handled on the seer side

START_TIME_DELTA_MINUTES = 30
TRANSACTION_BATCH_SIZE = 100


seer_issue_detection_connection_pool = connection_from_url(
@@ -45,11 +43,15 @@


class DetectedIssue(BaseModel):
# LLM generated fields
explanation: str
impact: str
evidence: str
missing_telemetry: str | None = None
title: str
# context fields, not LLM generated
trace_id: str
transaction_name: str


class IssueDetectionResponse(BaseModel):
@@ -62,13 +64,13 @@ def __init__(
message: str,
status: int,
project_id: int | None = None,
trace_id: str | None = None,
organization_id: int | None = None,
response_data: str | None = None,
error_message: str | None = None,
):
super().__init__(message, status)
self.project_id = project_id
self.trace_id = trace_id
self.organization_id = organization_id
self.response_data = response_data
self.error_message = error_message

@@ -99,9 +101,7 @@ def get_base_platform(platform: str | None) -> str | None:

def create_issue_occurrence_from_detection(
detected_issue: DetectedIssue,
trace: EvidenceTraceData,
project_id: int,
transaction_name: str,
) -> None:
"""
Create and produce an IssueOccurrence from an LLM-detected issue.
@@ -110,11 +110,13 @@ def create_issue_occurrence_from_detection(
occurrence_id = uuid4().hex
detection_time = datetime.now(UTC)
project = Project.objects.get_from_cache(id=project_id)
trace_id = detected_issue.trace_id
transaction_name = detected_issue.transaction_name
title = detected_issue.title.lower().replace(" ", "-")
fingerprint = [f"llm-detected-{title}-{transaction_name}"]

evidence_data = {
"trace_id": trace.trace_id,
"trace_id": trace_id,
"transaction": transaction_name,
"explanation": detected_issue.explanation,
"impact": detected_issue.impact,
@@ -155,7 +157,7 @@ def create_issue_occurrence_from_detection(
"transaction": transaction_name,
"contexts": {
"trace": {
"trace_id": trace.trace_id,
"trace_id": trace_id,
"type": "trace",
}
},
@@ -206,6 +208,11 @@ def run_llm_issue_detection() -> None:
def detect_llm_issues_for_project(project_id: int) -> None:
"""
Process a single project for LLM issue detection.

    Gets the project's top TRANSACTION_BATCH_SIZE transaction spans from the last START_TIME_DELTA_MINUTES minutes, sorted by -sum(span.duration).
    From those transactions, dedupes on the normalized transaction_name.
    For each deduped transaction, gets the first trace_id from the start of the time window, which has a small random variation.
Sends these trace_ids to seer, which uses get_trace_waterfall to construct an EAPTrace to analyze.
"""
project = Project.objects.get_from_cache(id=project_id)
organization = project.organization
@@ -217,99 +224,67 @@ def detect_llm_issues_for_project(project_id: int) -> None:
if not has_access:
return
Comment on lines 224 to 225

Bug: Unhandled Project.DoesNotExist exception when fetching project from cache, leading to task crashes.
Severity: CRITICAL | Confidence: High

🔍 Detailed Analysis

The Project.objects.get_from_cache(id=project_id) call at src/sentry/tasks/llm_issue_detection/detection.py:224 lacks error handling for Project.DoesNotExist. If a project is deleted after run_llm_issue_detection() dispatches the subtask but before detect_llm_issues_for_project() executes, the task will crash with an unhandled Project.DoesNotExist exception. This is inconsistent with trace_data.py in the same PR, which explicitly handles this exception.

💡 Suggested Fix

Wrap the Project.objects.get_from_cache() call in a try-except Project.DoesNotExist block, log the error, and return early to gracefully handle missing projects.
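
A minimal sketch of that fix (not part of this diff; the log message name is made up, and `logger` is the module-level logger already used in detection.py):

```python
try:
    project = Project.objects.get_from_cache(id=project_id)
except Project.DoesNotExist:
    # Project was deleted between scheduling and task execution; skip gracefully.
    logger.warning(
        "llm_issue_detection.project_does_not_exist",  # hypothetical log key
        extra={"project_id": project_id},
    )
    return
```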



transactions = get_transactions_for_project(
project_id, limit=100, start_time_delta={"minutes": 30}
evidence_traces = get_project_top_transaction_traces_for_llm_detection(
project_id, limit=TRANSACTION_BATCH_SIZE, start_time_delta_minutes=START_TIME_DELTA_MINUTES
)
if not transactions:
if not evidence_traces:
return

# Shuffle transactions to randomize order
random.shuffle(transactions)
# Shuffle to randomize order
random.shuffle(evidence_traces)

processed_count = 0
for transaction in transactions:
if processed_count >= NUM_TRANSACTIONS_TO_PROCESS:
break
seer_request = {
"telemetry": [{**trace.dict(), "kind": "trace"} for trace in evidence_traces],
Member

feels like we could use better variable names here since it's just the id/name instead of an actual trace now

"organization_id": organization_id,
"project_id": project_id,
}
response = make_signed_seer_api_request(
connection_pool=seer_issue_detection_connection_pool,
path=SEER_ANALYZE_ISSUE_ENDPOINT_PATH,
body=json.dumps(seer_request).encode("utf-8"),
)

if response.status < 200 or response.status >= 300:
raise LLMIssueDetectionError(
message="Seer HTTP error",
status=response.status,
project_id=project_id,
organization_id=organization_id,
response_data=response.data.decode("utf-8"),
)

try:
raw_response_data = response.json()
response_data = IssueDetectionResponse.parse_obj(raw_response_data)
except (ValueError, TypeError) as e:
raise LLMIssueDetectionError(
message="Seer response parsing error",
status=response.status,
project_id=project_id,
organization_id=organization_id,
response_data=response.data.decode("utf-8"),
error_message=str(e),
)
Contributor

Bug: Missing pydantic.ValidationError in exception handler

The exception handler catches (ValueError, TypeError) but IssueDetectionResponse.parse_obj() raises pydantic.ValidationError when the Seer response doesn't match the expected schema. Since the DetectedIssue model now requires trace_id and transaction_name fields that Seer must pass back, if Seer fails to return these fields or returns them with incorrect types, the pydantic.ValidationError will propagate uncaught instead of being wrapped in LLMIssueDetectionError. The codebase correctly catches pydantic.ValidationError elsewhere when using parse_obj.
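
If pydantic.ValidationError does escape the current (ValueError, TypeError) clause, a minimal sketch of the suggested handler could look like this (not part of this diff):

```python
import pydantic

try:
    raw_response_data = response.json()
    response_data = IssueDetectionResponse.parse_obj(raw_response_data)
except (ValueError, TypeError, pydantic.ValidationError) as e:
    # ValidationError covers responses that parse as JSON but don't match the
    # IssueDetectionResponse schema, e.g. a detected issue missing trace_id or
    # transaction_name.
    raise LLMIssueDetectionError(
        message="Seer response parsing error",
        status=response.status,
        project_id=project_id,
        organization_id=organization_id,
        response_data=response.data.decode("utf-8"),
        error_message=str(e),
    )
```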



n_found_issues = len(response_data.issues)
logger.info(
"Seer issue detection success",
extra={
"num_traces": len(evidence_traces),
"num_issues": n_found_issues,
"organization_id": organization_id,
"project_id": project_id,
"titles": (
[issue.title for issue in response_data.issues] if n_found_issues > 0 else None
),
},
)
for detected_issue in response_data.issues:
try:
trace = get_evidence_trace_for_llm_detection(transaction.name, transaction.project_id)

if (
not trace
or trace.total_spans < LOWER_SPAN_LIMIT
or trace.total_spans > UPPER_SPAN_LIMIT
):
continue

processed_count += 1
logger.info(
"Found trace for LLM issue detection",
extra={
"trace_id": trace.trace_id,
"project_id": project_id,
"total_spans": trace.total_spans,
"transaction_name": trace.transaction_name,
},
)

seer_request = {
"telemetry": [{**trace.dict(), "kind": "trace"}],
"organization_id": organization_id,
"project_id": project_id,
}
response = make_signed_seer_api_request(
connection_pool=seer_issue_detection_connection_pool,
path=SEER_ANALYZE_ISSUE_ENDPOINT_PATH,
body=json.dumps(seer_request).encode("utf-8"),
)

if response.status < 200 or response.status >= 300:
raise LLMIssueDetectionError(
message="Seer HTTP error",
status=response.status,
project_id=project_id,
trace_id=trace.trace_id,
response_data=response.data.decode("utf-8"),
)

try:
raw_response_data = response.json()
response_data = IssueDetectionResponse.parse_obj(raw_response_data)
except (ValueError, TypeError) as e:
raise LLMIssueDetectionError(
message="Seer response parsing error",
status=response.status,
project_id=project_id,
trace_id=trace.trace_id,
response_data=response.data.decode("utf-8"),
error_message=str(e),
)

n_found_issues = len(response_data.issues)
logger.info(
"Seer issue detection success",
extra={
"num_issues": n_found_issues,
"trace_id": trace.trace_id,
"project_id": project_id,
"titles": (
[issue.title for issue in response_data.issues]
if n_found_issues > 0
else None
),
},
create_issue_occurrence_from_detection(
detected_issue=detected_issue,
project_id=project_id,
)
for detected_issue in response_data.issues:
try:
create_issue_occurrence_from_detection(
detected_issue=detected_issue,
trace=trace,
project_id=project_id,
transaction_name=transaction.name,
)

except Exception as e:
sentry_sdk.capture_exception(e)
except LLMIssueDetectionError as e:
except Exception as e:
sentry_sdk.capture_exception(e)
continue # if one transaction encounters an error, don't block processing of the others
continue