-
-
Notifications
You must be signed in to change notification settings - Fork 4.5k
ref(llm-detection): Refactor Seer integration to fetch traces via RPC #104485
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,16 +22,6 @@ class Span(BaseModel): | |
| span_description: str | None | ||
|
|
||
|
|
||
| class EvidenceSpan(BaseModel): | ||
| span_id: str | None = None | ||
| parent_span_id: str | None = None | ||
| timestamp: float | None = None | ||
| op: str | None = None | ||
| description: str | None = None | ||
| exclusive_time: float | None = None # duration in milliseconds | ||
| data: dict[str, Any] | None = None | ||
|
|
||
|
|
||
| class TraceData(BaseModel): | ||
| trace_id: str | ||
| project_id: int | ||
|
|
@@ -42,10 +32,7 @@ class TraceData(BaseModel): | |
|
|
||
| class EvidenceTraceData(BaseModel): | ||
| trace_id: str | ||
| project_id: int | ||
| transaction_name: str | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need the transaction name in addition to the trace_id when fetching the EAPTrace? or is this just so we still have access to the transaction name for our own things
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. great q - transaction name is now just context data that we pass to seer, and seer passes back in the detected issue, because we need it to create the issue. |
||
| total_spans: int | ||
| spans: list[EvidenceSpan] | ||
|
|
||
|
|
||
| class EAPTrace(BaseModel): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,12 +16,12 @@ | |
| from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka | ||
| from sentry.models.project import Project | ||
| from sentry.net.http import connection_from_url | ||
| from sentry.seer.explorer.index_data import get_transactions_for_project | ||
| from sentry.seer.models import SeerApiError | ||
| from sentry.seer.sentry_data_models import EvidenceTraceData | ||
| from sentry.seer.signed_seer_api import make_signed_seer_api_request | ||
| from sentry.tasks.base import instrumented_task | ||
| from sentry.tasks.llm_issue_detection.trace_data import get_evidence_trace_for_llm_detection | ||
| from sentry.tasks.llm_issue_detection.trace_data import ( | ||
| get_project_top_transaction_traces_for_llm_detection, | ||
| ) | ||
| from sentry.taskworker.namespaces import issues_tasks | ||
| from sentry.utils import json | ||
|
|
||
|
|
@@ -30,10 +30,8 @@ | |
| SEER_ANALYZE_ISSUE_ENDPOINT_PATH = "/v1/automation/issue-detection/analyze" | ||
| SEER_TIMEOUT_S = 120 | ||
| SEER_RETRIES = 1 | ||
|
|
||
| NUM_TRANSACTIONS_TO_PROCESS = 20 | ||
| LOWER_SPAN_LIMIT = 20 | ||
| UPPER_SPAN_LIMIT = 500 | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these will be handled on the seer side |
||
| START_TIME_DELTA_MINUTES = 30 | ||
| TRANSACTION_BATCH_SIZE = 100 | ||
|
|
||
|
|
||
| seer_issue_detection_connection_pool = connection_from_url( | ||
|
|
@@ -45,11 +43,15 @@ | |
|
|
||
|
|
||
| class DetectedIssue(BaseModel): | ||
| # LLM generated fields | ||
| explanation: str | ||
| impact: str | ||
| evidence: str | ||
| missing_telemetry: str | None = None | ||
| title: str | ||
| # context fields, not LLM generated | ||
| trace_id: str | ||
| transaction_name: str | ||
|
|
||
|
|
||
| class IssueDetectionResponse(BaseModel): | ||
|
|
@@ -62,13 +64,13 @@ def __init__( | |
| message: str, | ||
| status: int, | ||
| project_id: int | None = None, | ||
| trace_id: str | None = None, | ||
| organization_id: int | None = None, | ||
| response_data: str | None = None, | ||
| error_message: str | None = None, | ||
| ): | ||
| super().__init__(message, status) | ||
| self.project_id = project_id | ||
| self.trace_id = trace_id | ||
| self.organization_id = organization_id | ||
| self.response_data = response_data | ||
| self.error_message = error_message | ||
|
|
||
|
|
@@ -99,9 +101,7 @@ def get_base_platform(platform: str | None) -> str | None: | |
|
|
||
| def create_issue_occurrence_from_detection( | ||
| detected_issue: DetectedIssue, | ||
| trace: EvidenceTraceData, | ||
| project_id: int, | ||
| transaction_name: str, | ||
| ) -> None: | ||
| """ | ||
| Create and produce an IssueOccurrence from an LLM-detected issue. | ||
|
|
@@ -110,11 +110,13 @@ def create_issue_occurrence_from_detection( | |
| occurrence_id = uuid4().hex | ||
| detection_time = datetime.now(UTC) | ||
| project = Project.objects.get_from_cache(id=project_id) | ||
| trace_id = detected_issue.trace_id | ||
| transaction_name = detected_issue.transaction_name | ||
| title = detected_issue.title.lower().replace(" ", "-") | ||
| fingerprint = [f"llm-detected-{title}-{transaction_name}"] | ||
|
|
||
| evidence_data = { | ||
| "trace_id": trace.trace_id, | ||
| "trace_id": trace_id, | ||
| "transaction": transaction_name, | ||
| "explanation": detected_issue.explanation, | ||
| "impact": detected_issue.impact, | ||
|
|
@@ -155,7 +157,7 @@ def create_issue_occurrence_from_detection( | |
| "transaction": transaction_name, | ||
| "contexts": { | ||
| "trace": { | ||
| "trace_id": trace.trace_id, | ||
| "trace_id": trace_id, | ||
| "type": "trace", | ||
| } | ||
| }, | ||
|
|
@@ -206,6 +208,11 @@ def run_llm_issue_detection() -> None: | |
| def detect_llm_issues_for_project(project_id: int) -> None: | ||
| """ | ||
| Process a single project for LLM issue detection. | ||
|
|
||
| Gets the project's top TRANSACTION_BATCH_SIZE transaction spans from the last START_TIME_DELTA_MINUTES, sorted by -sum(span.duration). | ||
| From those transactions, dedupes on normalized transaction_name. | ||
| For each deduped transaction, gets first trace_id from the start of time window, which has small random variation. | ||
| Sends these trace_ids to seer, which uses get_trace_waterfall to construct an EAPTrace to analyze. | ||
| """ | ||
| project = Project.objects.get_from_cache(id=project_id) | ||
| organization = project.organization | ||
|
|
@@ -217,99 +224,67 @@ def detect_llm_issues_for_project(project_id: int) -> None: | |
| if not has_access: | ||
| return | ||
|
Comment on lines
224
to
225
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Unhandled 🔍 Detailed AnalysisThe 💡 Suggested FixWrap the 🤖 Prompt for AI AgentDid we get this right? 👍 / 👎 to inform future reviews. |
||
|
|
||
| transactions = get_transactions_for_project( | ||
| project_id, limit=100, start_time_delta={"minutes": 30} | ||
| evidence_traces = get_project_top_transaction_traces_for_llm_detection( | ||
| project_id, limit=TRANSACTION_BATCH_SIZE, start_time_delta_minutes=START_TIME_DELTA_MINUTES | ||
| ) | ||
| if not transactions: | ||
| if not evidence_traces: | ||
| return | ||
|
|
||
| # Shuffle transactions to randomize order | ||
| random.shuffle(transactions) | ||
| # Shuffle to randomize order | ||
| random.shuffle(evidence_traces) | ||
|
|
||
| processed_count = 0 | ||
| for transaction in transactions: | ||
| if processed_count >= NUM_TRANSACTIONS_TO_PROCESS: | ||
| break | ||
| seer_request = { | ||
| "telemetry": [{**trace.dict(), "kind": "trace"} for trace in evidence_traces], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. feels like we could use better variable names here since it's just the id/name instead of an actual trace now |
||
| "organization_id": organization_id, | ||
| "project_id": project_id, | ||
| } | ||
| response = make_signed_seer_api_request( | ||
| connection_pool=seer_issue_detection_connection_pool, | ||
| path=SEER_ANALYZE_ISSUE_ENDPOINT_PATH, | ||
| body=json.dumps(seer_request).encode("utf-8"), | ||
| ) | ||
|
|
||
| if response.status < 200 or response.status >= 300: | ||
| raise LLMIssueDetectionError( | ||
| message="Seer HTTP error", | ||
| status=response.status, | ||
| project_id=project_id, | ||
| organization_id=organization_id, | ||
| response_data=response.data.decode("utf-8"), | ||
| ) | ||
|
|
||
| try: | ||
| raw_response_data = response.json() | ||
| response_data = IssueDetectionResponse.parse_obj(raw_response_data) | ||
| except (ValueError, TypeError) as e: | ||
| raise LLMIssueDetectionError( | ||
| message="Seer response parsing error", | ||
| status=response.status, | ||
| project_id=project_id, | ||
| organization_id=organization_id, | ||
| response_data=response.data.decode("utf-8"), | ||
| error_message=str(e), | ||
| ) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Missing pydantic.ValidationError in exception handlerThe exception handler catches |
||
|
|
||
| n_found_issues = len(response_data.issues) | ||
| logger.info( | ||
| "Seer issue detection success", | ||
| extra={ | ||
| "num_traces": len(evidence_traces), | ||
| "num_issues": n_found_issues, | ||
| "organization_id": organization_id, | ||
| "project_id": project_id, | ||
| "titles": ( | ||
| [issue.title for issue in response_data.issues] if n_found_issues > 0 else None | ||
| ), | ||
| }, | ||
| ) | ||
| for detected_issue in response_data.issues: | ||
| try: | ||
| trace = get_evidence_trace_for_llm_detection(transaction.name, transaction.project_id) | ||
|
|
||
| if ( | ||
| not trace | ||
| or trace.total_spans < LOWER_SPAN_LIMIT | ||
| or trace.total_spans > UPPER_SPAN_LIMIT | ||
| ): | ||
| continue | ||
|
|
||
| processed_count += 1 | ||
| logger.info( | ||
| "Found trace for LLM issue detection", | ||
| extra={ | ||
| "trace_id": trace.trace_id, | ||
| "project_id": project_id, | ||
| "total_spans": trace.total_spans, | ||
| "transaction_name": trace.transaction_name, | ||
| }, | ||
| ) | ||
|
|
||
| seer_request = { | ||
| "telemetry": [{**trace.dict(), "kind": "trace"}], | ||
| "organization_id": organization_id, | ||
| "project_id": project_id, | ||
| } | ||
| response = make_signed_seer_api_request( | ||
| connection_pool=seer_issue_detection_connection_pool, | ||
| path=SEER_ANALYZE_ISSUE_ENDPOINT_PATH, | ||
| body=json.dumps(seer_request).encode("utf-8"), | ||
| ) | ||
|
|
||
| if response.status < 200 or response.status >= 300: | ||
| raise LLMIssueDetectionError( | ||
| message="Seer HTTP error", | ||
| status=response.status, | ||
| project_id=project_id, | ||
| trace_id=trace.trace_id, | ||
| response_data=response.data.decode("utf-8"), | ||
| ) | ||
|
|
||
| try: | ||
| raw_response_data = response.json() | ||
| response_data = IssueDetectionResponse.parse_obj(raw_response_data) | ||
| except (ValueError, TypeError) as e: | ||
| raise LLMIssueDetectionError( | ||
| message="Seer response parsing error", | ||
| status=response.status, | ||
| project_id=project_id, | ||
| trace_id=trace.trace_id, | ||
| response_data=response.data.decode("utf-8"), | ||
| error_message=str(e), | ||
| ) | ||
|
|
||
| n_found_issues = len(response_data.issues) | ||
| logger.info( | ||
| "Seer issue detection success", | ||
| extra={ | ||
| "num_issues": n_found_issues, | ||
| "trace_id": trace.trace_id, | ||
| "project_id": project_id, | ||
| "titles": ( | ||
| [issue.title for issue in response_data.issues] | ||
| if n_found_issues > 0 | ||
| else None | ||
| ), | ||
| }, | ||
| create_issue_occurrence_from_detection( | ||
| detected_issue=detected_issue, | ||
| project_id=project_id, | ||
| ) | ||
| for detected_issue in response_data.issues: | ||
| try: | ||
| create_issue_occurrence_from_detection( | ||
| detected_issue=detected_issue, | ||
| trace=trace, | ||
| project_id=project_id, | ||
| transaction_name=transaction.name, | ||
| ) | ||
|
|
||
| except Exception as e: | ||
| sentry_sdk.capture_exception(e) | ||
| except LLMIssueDetectionError as e: | ||
| except Exception as e: | ||
| sentry_sdk.capture_exception(e) | ||
| continue # if one transaction encounters an error, don't block processing of the others | ||
| continue | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
now that we aren't passing any real trace data here, we can use whatever name we want