Skip to content

Commit 8134dad

Browse files
committed
feat: support AI SDK runner and Claude 4.1 Opus & Sonnet 4.5
The Genkit runner and community libraries are seemingly falling behind here, and it's not possible to trivially enable Opus 4.1 or Sonnet 4.5. This commit introduces the AI SDK runner and adds support for the latest Claude models that way. Notably, the AI SDK has fairly complete documentation and more discoverable types (by avoiding Zod complexity).
1 parent 5484f1a commit 8134dad

File tree

6 files changed

+288
-2
lines changed

6 files changed

+288
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ You can customize the `web-codegen-scorer eval` script with the following flags:
8888
- Example: `web-codegen-scorer eval --model=gemini-2.5-flash --autorater-model=gemini-2.5-flash --env=<config path>`
8989

9090
- `--runner=<name>`: Specifies the runner to use to execute the eval. Supported runners are
91-
`genkit` (default), `gemini-cli`, `claude-code` or `codex`.
91+
`genkit` (default), `ai-sdk`, `gemini-cli`, `claude-code` or `codex`.
9292

9393
- `--local`: Runs the script in local mode for the initial code generation request. Instead of
9494
calling the LLM, it will attempt to read the initial code from a corresponding file in the

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
"wcs": "./runner/bin/cli.js"
5252
},
5353
"dependencies": {
54+
"@ai-sdk/anthropic": "^2.0.45",
5455
"@anthropic-ai/sdk": "^0.68.0",
5556
"@axe-core/puppeteer": "^4.10.2",
5657
"@genkit-ai/compat-oai": "1.23.0",
@@ -63,6 +64,7 @@
6364
"@types/cli-progress": "^3.11.6",
6465
"@types/node": "^24.2.0",
6566
"@types/yargs": "^17.0.33",
67+
"ai": "^5.0.95",
6668
"axe-core": "^4.10.3",
6769
"boxen": "^8.0.1",
6870
"chalk": "^5.4.1",

pnpm-lock.yaml

Lines changed: 72 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

runner/codegen/ai-sdk-runner.ts

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
import {
2+
LlmRunner,
3+
LocalLlmConstrainedOutputGenerateRequestOptions,
4+
LocalLlmConstrainedOutputGenerateResponse,
5+
LocalLlmGenerateFilesRequestOptions,
6+
LocalLlmGenerateFilesResponse,
7+
LocalLlmGenerateTextRequestOptions,
8+
LocalLlmGenerateTextResponse,
9+
PromptDataMessage,
10+
} from './llm-runner.js';
11+
import {
12+
FilePart,
13+
generateObject,
14+
generateText,
15+
LanguageModel,
16+
ModelMessage,
17+
SystemModelMessage,
18+
TextPart,
19+
} from 'ai';
20+
import {anthropic, AnthropicProviderOptions} from '@ai-sdk/anthropic';
21+
import z from 'zod';
22+
import {callWithTimeout} from '../utils/timeout.js';
23+
import {combineAbortSignals} from '../utils/abort-signal.js';
24+
25+
// Model aliases accepted by this runner. Each alias maps to a concrete
// Anthropic model plus a thinking on/off flag in `_getAiSdkModelOptions`.
const SUPPORTED_MODELS = [
  'claude-opus-4.1-no-thinking',
  'claude-opus-4.1-with-thinking',
  'claude-sonnet-4.5-no-thinking',
  'claude-sonnet-4.5-with-thinking',
] as const;

// Set to a very high value because stuck LLM requests are aborted by a
// separate, explicit timeout instead. WCS prioritizes stability here, even
// if that means sitting through many exponential-backoff retry waits.
const DEFAULT_MAX_RETRIES = 100000;
36+
37+
export class AiSDKRunner implements LlmRunner {
38+
displayName = 'AI SDK';
39+
id = 'ai-sdk';
40+
hasBuiltInRepairLoop = true;
41+
42+
async generateText(
43+
options: LocalLlmGenerateTextRequestOptions,
44+
): Promise<LocalLlmGenerateTextResponse> {
45+
const response = await this._wrapRequestWithTimeoutAndRateLimiting(options, async abortSignal =>
46+
generateText({
47+
...(await this._getAiSdkModelOptions(options)),
48+
abortSignal: abortSignal,
49+
messages: this._convertRequestToMessagesList(options),
50+
maxRetries: DEFAULT_MAX_RETRIES,
51+
}),
52+
);
53+
54+
return {
55+
reasoning: response.reasoningText ?? '',
56+
text: response.text,
57+
usage: response.usage,
58+
// TODO: Consider supporting `toolLogs` and MCP here.
59+
};
60+
}
61+
62+
async generateConstrained<T extends z.ZodTypeAny = z.ZodTypeAny>(
63+
options: LocalLlmConstrainedOutputGenerateRequestOptions<T>,
64+
): Promise<LocalLlmConstrainedOutputGenerateResponse<T>> {
65+
const response = await this._wrapRequestWithTimeoutAndRateLimiting(options, async abortSignal =>
66+
generateObject({
67+
...(await this._getAiSdkModelOptions(options)),
68+
messages: this._convertRequestToMessagesList(options),
69+
schema: options.schema,
70+
abortSignal: abortSignal,
71+
maxRetries: DEFAULT_MAX_RETRIES,
72+
}),
73+
);
74+
75+
return {
76+
reasoning: response.reasoning ?? '',
77+
output: response.object,
78+
usage: response.usage,
79+
// TODO: Consider supporting `toolLogs` and MCP here.
80+
};
81+
}
82+
83+
async generateFiles(
84+
options: LocalLlmGenerateFilesRequestOptions,
85+
): Promise<LocalLlmGenerateFilesResponse> {
86+
const response = await this.generateConstrained({
87+
...options,
88+
prompt: options.context.executablePrompt,
89+
systemPrompt: options.context.systemInstructions,
90+
schema: z.object({
91+
outputFiles: z.array(
92+
z.object({
93+
filePath: z.string().describe('Name of the file that is being changed'),
94+
code: z.string().describe('New code of the file'),
95+
}),
96+
),
97+
}),
98+
});
99+
100+
return {
101+
files: response.output?.outputFiles ?? [],
102+
reasoning: response.reasoning,
103+
usage: response.usage,
104+
// TODO: Consider supporting `toolLogs` and MCP here.
105+
};
106+
}
107+
108+
getSupportedModels(): string[] {
109+
return [...SUPPORTED_MODELS];
110+
}
111+
112+
async dispose(): Promise<void> {}
113+
114+
private async _wrapRequestWithTimeoutAndRateLimiting<T>(
115+
request: LocalLlmGenerateTextRequestOptions | LocalLlmConstrainedOutputGenerateRequestOptions,
116+
fn: (abortSignal: AbortSignal) => Promise<T>,
117+
): Promise<T> {
118+
// TODO: Check if rate-limiting is actually necessary here. AI SDK
119+
// seems to do retrying on its own.
120+
121+
if (request.timeout === undefined) {
122+
return await fn(request.abortSignal);
123+
}
124+
return callWithTimeout(
125+
request.timeout.description,
126+
abortSignal => fn(combineAbortSignals(abortSignal, request.abortSignal)),
127+
request.timeout.durationInMins,
128+
);
129+
}
130+
131+
private async _getAiSdkModelOptions(
132+
request: LocalLlmGenerateTextRequestOptions,
133+
): Promise<{model: LanguageModel; providerOptions: {}}> {
134+
switch (request.model) {
135+
case 'claude-opus-4.1-no-thinking':
136+
case 'claude-opus-4.1-with-thinking': {
137+
const thinkingEnabled = request.model.endsWith('with-thinking');
138+
return {
139+
model: anthropic('claude-opus-4-1'),
140+
providerOptions: {
141+
sendReasoning: thinkingEnabled,
142+
thinking: {type: thinkingEnabled ? 'enabled' : 'disabled'},
143+
} satisfies AnthropicProviderOptions,
144+
};
145+
}
146+
case 'claude-sonnet-4.5-no-thinking':
147+
case 'claude-sonnet-4.5-with-thinking': {
148+
const thinkingEnabled = request.model.endsWith('with-thinking');
149+
return {
150+
model: anthropic('claude-sonnet-4-5'),
151+
providerOptions: {
152+
sendReasoning: true,
153+
thinking: {type: 'enabled'},
154+
} satisfies AnthropicProviderOptions,
155+
};
156+
}
157+
default:
158+
throw new Error(`Unexpected model in AI SDK runner: ${request.model}.`);
159+
}
160+
}
161+
162+
private _convertRequestToMessagesList(
163+
request: LocalLlmConstrainedOutputGenerateRequestOptions | LocalLlmGenerateTextRequestOptions,
164+
): ModelMessage[] {
165+
return [
166+
// System prompt message.
167+
...(request.systemPrompt !== undefined
168+
? [
169+
{
170+
role: 'system',
171+
content: request.systemPrompt,
172+
} satisfies SystemModelMessage,
173+
]
174+
: []),
175+
// Optional additional messages
176+
...this._toAiSDKMessage(request.messages ?? []),
177+
// The main message.
178+
{role: 'user', content: [{type: 'text', text: request.prompt}]},
179+
];
180+
}
181+
182+
private _toAiSDKMessage(messages: PromptDataMessage[]): ModelMessage[] {
183+
const result: ModelMessage[] = [];
184+
185+
for (const message of messages) {
186+
if (message.role === 'model') {
187+
result.push({
188+
role: 'assistant',
189+
content: message.content.map(c =>
190+
'media' in c
191+
? ({type: 'file', data: c.media.url, mediaType: 'image/png'} satisfies FilePart)
192+
: ({type: 'text', text: c.text} satisfies TextPart),
193+
),
194+
});
195+
} else if (message.role === 'user') {
196+
result.push({
197+
role: 'user',
198+
content: message.content.map(c =>
199+
'media' in c
200+
? ({type: 'file', data: c.media.url, mediaType: 'image/png'} satisfies FilePart)
201+
: ({type: 'text', text: c.text} satisfies TextPart),
202+
),
203+
});
204+
}
205+
}
206+
return result;
207+
}
208+
}

runner/codegen/runner-creation.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ import type {ClaudeCodeRunner} from './claude-code-runner.js';
44
import type {GenkitRunner} from './genkit/genkit-runner.js';
55
import type {CodexRunner} from './codex-runner.js';
66
import type {NoopUnimplementedRunner} from './noop-unimplemented-runner.js';
7+
import {AiSDKRunner} from './ai-sdk-runner.js';
78

89
interface AvailableRunners {
910
genkit: GenkitRunner;
11+
'ai-sdk': AiSDKRunner;
1012
'gemini-cli': GeminiCliRunner;
1113
'claude-code': ClaudeCodeRunner;
1214
'codex': CodexRunner;
@@ -27,6 +29,8 @@ export async function getRunnerByName<T extends RunnerName>(name: T): Promise<Av
2729
return import('./genkit/genkit-runner.js').then(
2830
m => new m.GenkitRunner() as AvailableRunners[T],
2931
);
32+
case 'ai-sdk':
33+
return import('./ai-sdk-runner.js').then(m => new m.AiSDKRunner() as AvailableRunners[T]);
3034
case 'gemini-cli':
3135
return import('./gemini-cli-runner.js').then(
3236
m => new m.GeminiCliRunner() as AvailableRunners[T],

runner/eval-cli.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ function builder(argv: Argv): Argv<Options> {
6161
.option('runner', {
6262
type: 'string',
6363
default: 'genkit' as const,
64-
choices: ['genkit', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
64+
choices: ['genkit', 'ai-sdk', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
6565
description: 'Runner to use to execute the eval',
6666
})
6767
.option('local', {

0 commit comments

Comments
 (0)