/**
 * Tool names that indicate the agent did "real work" (mutated state, ran a
 * command, scaffolded files, etc.).
 *
 * Used as a heuristic to detect "false success" runs: the agent reported
 * success but produced no assistant text and no successful result from any of
 * these tools, i.e. the model trivially ended its turn without addressing the
 * prompt.
 */
const MUTATING_TOOL_NAMES = new Set( [
	'Write',
	'Edit',
	'Bash',
	'wp_cli',
	'site_create',
	'site_delete',
	'site_import',
	'site_export',
	'site_pull',
	'site_push',
	'site_start',
	'site_stop',
	'scaffold_theme',
	'install_taxonomy_scripts',
	'preview_create',
	'preview_delete',
	'preview_update',
	'wpcom_request',
] );

/**
 * Upper bound, in UTF-16 code units, for any single text or tool-result string
 * stored in the opt-in transcript, so that one huge tool output doesn't
 * explode the eval artifact.
 */
const TRANSCRIPT_TEXT_MAX_LENGTH = 4000;

/**
 * Shortens `value` to at most `maxLength` UTF-16 code units and appends a
 * marker stating how many code units were dropped.
 *
 * @param value     Text to shorten.
 * @param maxLength Maximum number of code units to keep. Fractions are rounded
 *                  down and negative values are treated as 0. `NaN` and
 *                  `Infinity` disable truncation.
 * @returns `value` unchanged when it already fits; otherwise the kept prefix
 *          followed by `…[truncated N chars]`.
 */
function truncateText( value: string, maxLength = TRANSCRIPT_TEXT_MAX_LENGTH ): string {
	// Written as a negated `>` so NaN (and Infinity) fall through to "no
	// truncation" rather than producing an empty prefix.
	if ( ! ( value.length > maxLength ) ) {
		return value;
	}

	// Normalise the limit: a negative limit would make `slice` count from the
	// end of the string, and a fractional one would yield a fractional
	// "truncated" count. `keep` is always < value.length here.
	let keep = Math.max( 0, Math.floor( maxLength ) );

	// Never cut between the two halves of a surrogate pair; that would leave a
	// lone high surrogate at the end of the kept prefix.
	if ( keep > 0 ) {
		const before = value.charCodeAt( keep - 1 );
		const after = value.charCodeAt( keep );
		const splitsPair =
			before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
		if ( splitsPair ) {
			keep -= 1;
		}
	}

	return `${ value.slice( 0, keep ) }…[truncated ${ value.length - keep } chars]`;
}

/**
 * One entry of the opt-in eval transcript. Every entry carries the common
 * bookkeeping fields; the optional fields are filled in only for the event
 * types they apply to.
 */
type TranscriptEvent = {
	// Position of the event within the transcript.
	index: number;
	// Type tag copied from the originating session event.
	type: AgentSessionEvent[ 'type' ];
	turnIndex: number;
	elapsedMs: number;
	// Assistant text segments, each individually length-capped.
	text?: string[];
	// Tool calls requested by the assistant message, if any.
	toolCalls?: ToolCallRecord[];
	// Outcome of a finished tool execution.
	toolResult?: {
		toolUseId: string;
		// null when no tool name is known for this tool-use id.
		toolName: string | null;
		isError: boolean;
		text?: string;
	};
	stopReason?: StopReason;
	errorMessage?: string;
	compaction?: { reason: string; aborted?: boolean };
	autoRetry?: { attempt: number; success?: boolean; error?: string };
};
{ + index: ++transcriptIndex, + type: event.type, + turnIndex, + elapsedMs: elapsed(), + } + : null; + if ( event.type === 'message_end' && event.message.role === 'assistant' ) { const now = Date.now(); turnDurationsMs.push( now - turnStart ); @@ -185,8 +258,16 @@ async function runEval( input: EvalRunnerInput ) { phaseTimingsMs.first_assistant_message_ms = now - queryStartedAt; } turnStart = now; + if ( transcriptEvent ) { + transcriptEvent.turnIndex = turnIndex; + transcriptEvent.stopReason = event.message.stopReason; + if ( event.message.errorMessage ) { + transcriptEvent.errorMessage = event.message.errorMessage; + } + } } - for ( const tc of extractToolCalls( event ) ) { + const messageToolCalls = extractToolCalls( event ); + for ( const tc of messageToolCalls ) { toolCalls.push( tc ); toolNameById.set( tc.id, tc.name ); const evt: ToolEvent = { @@ -199,7 +280,14 @@ async function runEval( input: EvalRunnerInput ) { toolEvents.push( evt ); toolEventById.set( tc.id, evt ); } - textSegments.push( ...extractTextSegments( event ) ); + if ( transcriptEvent && messageToolCalls.length > 0 ) { + transcriptEvent.toolCalls = messageToolCalls; + } + const messageTextSegments = extractTextSegments( event ); + textSegments.push( ...messageTextSegments ); + if ( transcriptEvent && messageTextSegments.length > 0 ) { + transcriptEvent.text = messageTextSegments.map( ( segment ) => truncateText( segment ) ); + } if ( event.type === 'tool_execution_end' ) { const tr = extractToolResult( event ); @@ -226,6 +314,14 @@ async function runEval( input: EvalRunnerInput ) { isError: tr.isError, ...( tr.text ? { text: tr.text } : {} ), } ); + if ( transcriptEvent ) { + transcriptEvent.toolResult = { + toolUseId: id, + toolName: toolNameById.get( id ) ?? null, + isError: tr.isError, + ...( tr.text ? 
{ text: truncateText( tr.text ) } : {} ), + }; + } } } @@ -233,12 +329,62 @@ async function runEval( input: EvalRunnerInput ) { numTurns += 1; } + if ( event.type === 'compaction_start' || event.type === 'compaction_end' ) { + if ( transcriptEvent ) { + transcriptEvent.compaction = { + reason: event.reason, + ...( event.type === 'compaction_end' ? { aborted: event.aborted } : {} ), + }; + } + } + + if ( event.type === 'auto_retry_start' ) { + if ( transcriptEvent ) { + transcriptEvent.autoRetry = { + attempt: event.attempt, + error: event.errorMessage, + }; + } + } + if ( event.type === 'auto_retry_end' ) { + if ( transcriptEvent ) { + transcriptEvent.autoRetry = { + attempt: event.attempt, + success: event.success, + ...( event.finalError ? { error: event.finalError } : {} ), + }; + } + } + if ( event.type === 'agent_end' ) { const lastAssistant = findLastAssistant( event.messages ); success = ! lastAssistant || ( lastAssistant.stopReason !== 'error' && lastAssistant.stopReason !== 'aborted' ); + interrupted = lastAssistant?.stopReason === 'aborted'; + if ( lastAssistant ) { + resultStopReason = lastAssistant.stopReason; + resultErrorMessage = lastAssistant.errorMessage ?? null; + resultUsage = lastAssistant.usage; + resultText = lastAssistant.content + .filter( ( c ): c is { type: 'text'; text: string } => c.type === 'text' ) + .map( ( c ) => c.text ) + .join( '\n' ) + .trim(); + } numTurnsResult = numTurns; + if ( transcriptEvent ) { + if ( resultStopReason ) { + transcriptEvent.stopReason = resultStopReason; + } + if ( resultErrorMessage ) { + transcriptEvent.errorMessage = resultErrorMessage; + } + } + } + + if ( transcriptEvent ) { + transcript.push( transcriptEvent ); } }; @@ -265,8 +411,15 @@ async function runEval( input: EvalRunnerInput ) { } phaseTimingsMs.total_eval_ms = elapsed(); + const hasAnyAssistantText = textSegments.some( ( segment ) => segment.trim().length > 0 ); + const hasSuccessfulMutatingTool = toolResults.some( + ( tr ) => ! 
tr.isError && tr.toolName !== null && MUTATING_TOOL_NAMES.has( tr.toolName ) + ); + const producedNoUsefulOutput = success && ! hasAnyAssistantText && ! hasSuccessfulMutatingTool; + return { success, + interrupted, error, timedOut, numTurns: numTurnsResult, @@ -277,6 +430,12 @@ async function runEval( input: EvalRunnerInput ) { toolEvents, firstToolError, textSegments, + resultStopReason, + resultText, + resultErrorMessage, + resultUsage, + producedNoUsefulOutput, + ...( includeTranscript ? { transcript } : {} ), }; }