diff --git a/src/api/providers/__tests__/gemini-handler.spec.ts b/src/api/providers/__tests__/gemini-handler.spec.ts index a9544a0b97f..9e0d4edc431 100644 --- a/src/api/providers/__tests__/gemini-handler.spec.ts +++ b/src/api/providers/__tests__/gemini-handler.spec.ts @@ -5,10 +5,9 @@ import { GeminiHandler } from "../gemini" import type { ApiHandlerOptions } from "../../../shared/api" describe("GeminiHandler backend support", () => { - it("createMessage uses function declarations (URL context and grounding are only for completePrompt)", async () => { - // URL context and grounding are mutually exclusive with function declarations - // in Gemini API, so createMessage only uses function declarations. - // URL context/grounding are only added in completePrompt. + it("createMessage uses function declarations and googleSearch for Gemini 3 models", async () => { + // Gemini 3+ models support combining built-in tools (Google Search) with + // function declarations in a single generation (tool context circulation). const options = { apiProvider: "gemini", enableUrlContext: true, @@ -20,9 +19,9 @@ describe("GeminiHandler backend support", () => { handler["client"].models.generateContentStream = stub await handler.createMessage("instr", [] as any).next() const config = stub.mock.calls[0][0].config - // createMessage always uses function declarations only - // (tools are always present from ALWAYS_AVAILABLE_TOOLS) - expect(config.tools).toEqual([{ functionDeclarations: expect.any(Array) }]) + // Default model is gemini-3.1-pro-preview, a Gemini 3 model, + // so tools should include both function declarations and googleSearch. 
+ expect(config.tools).toEqual([{ functionDeclarations: expect.any(Array) }, { googleSearch: {} }]) }) it("completePrompt passes config overrides without tools when URL context and grounding disabled", async () => { diff --git a/src/api/providers/__tests__/gemini.spec.ts b/src/api/providers/__tests__/gemini.spec.ts index 47ee79dd0d6..3ae083329e0 100644 --- a/src/api/providers/__tests__/gemini.spec.ts +++ b/src/api/providers/__tests__/gemini.spec.ts @@ -257,6 +257,206 @@ describe("GeminiHandler", () => { }) }) + describe("Gemini 3 tool context circulation", () => { + const systemPrompt = "You are a helpful assistant" + const mockMessages: Anthropic.Messages.MessageParam[] = [ + { role: "user", content: "Search the web for the latest API docs" }, + ] + + it("should include googleSearch tool for Gemini 3 models", async () => { + const gemini3Handler = new GeminiHandler({ + apiKey: "test-key", + apiModelId: "gemini-3-pro-preview", + geminiApiKey: "test-key", + }) + + const mockGenerateContentStream = vitest.fn().mockResolvedValue({ + [Symbol.asyncIterator]: async function* () { + yield { text: "Hello" } + yield { usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 5 } } + }, + }) + + gemini3Handler["client"] = { + models: { + generateContentStream: mockGenerateContentStream, + generateContent: vitest.fn(), + }, + } as any + + const stream = gemini3Handler.createMessage(systemPrompt, mockMessages) + for await (const _chunk of stream) { + // consume + } + + const callArgs = mockGenerateContentStream.mock.calls[0][0] + const tools = callArgs.config.tools + expect(tools).toHaveLength(2) + expect(tools[0]).toHaveProperty("functionDeclarations") + expect(tools[1]).toEqual({ googleSearch: {} }) + }) + + it("should NOT include googleSearch tool for pre-Gemini 3 models", async () => { + // The default handler uses geminiDefaultModelId which is gemini-3.1-pro-preview + // Let's create one with a 2.5 model + const gemini25Handler = new GeminiHandler({ + apiKey: 
"test-key", + apiModelId: "gemini-2.5-pro", + geminiApiKey: "test-key", + }) + + const mockGenerateContentStream = vitest.fn().mockResolvedValue({ + [Symbol.asyncIterator]: async function* () { + yield { text: "Hello" } + yield { usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 5 } } + }, + }) + + gemini25Handler["client"] = { + models: { + generateContentStream: mockGenerateContentStream, + generateContent: vitest.fn(), + }, + } as any + + const stream = gemini25Handler.createMessage(systemPrompt, mockMessages) + for await (const _chunk of stream) { + // consume + } + + const callArgs = mockGenerateContentStream.mock.calls[0][0] + const tools = callArgs.config.tools + expect(tools).toHaveLength(1) + expect(tools[0]).toHaveProperty("functionDeclarations") + }) + + it("should handle executableCode parts in streaming response", async () => { + const gemini3Handler = new GeminiHandler({ + apiKey: "test-key", + apiModelId: "gemini-3-pro-preview", + geminiApiKey: "test-key", + }) + + const mockGenerateContentStream = vitest.fn().mockResolvedValue({ + [Symbol.asyncIterator]: async function* () { + yield { + candidates: [ + { + content: { + parts: [ + { + executableCode: { + code: 'print("hello")', + language: "python", + }, + }, + ], + }, + }, + ], + } + yield { + candidates: [ + { + content: { + parts: [ + { + codeExecutionResult: { + output: "hello", + outcome: "OUTCOME_OK", + }, + }, + ], + }, + }, + ], + } + yield { usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 5 } } + }, + }) + + gemini3Handler["client"] = { + models: { + generateContentStream: mockGenerateContentStream, + generateContent: vitest.fn(), + }, + } as any + + const stream = gemini3Handler.createMessage(systemPrompt, mockMessages) + const chunks = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // Should yield text chunks for executableCode and codeExecutionResult + const textChunks = chunks.filter((c) => c.type === "text") + 
expect(textChunks.length).toBe(2) + expect(textChunks[0].text).toContain('print("hello")') + expect(textChunks[1].text).toContain("hello") + + // Should store server-side tool parts for history round-tripping + const storedParts = gemini3Handler.getServerSideToolParts() + expect(storedParts).toHaveLength(2) + expect(storedParts![0].type).toBe("executableCode") + expect(storedParts![0].data).toEqual({ code: 'print("hello")', language: "python" }) + expect(storedParts![1].type).toBe("codeExecutionResult") + expect(storedParts![1].data).toEqual({ output: "hello", outcome: "OUTCOME_OK" }) + }) + + it("should reset server-side tool parts between requests", async () => { + const gemini3Handler = new GeminiHandler({ + apiKey: "test-key", + apiModelId: "gemini-3-pro-preview", + geminiApiKey: "test-key", + }) + + const mockGenerateContentStream = vitest.fn() + + gemini3Handler["client"] = { + models: { + generateContentStream: mockGenerateContentStream, + generateContent: vitest.fn(), + }, + } as any + + // First request: has server-side tool parts + mockGenerateContentStream.mockResolvedValueOnce({ + [Symbol.asyncIterator]: async function* () { + yield { + candidates: [ + { + content: { + parts: [{ executableCode: { code: "x = 1", language: "python" } }], + }, + }, + ], + } + yield { usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 5 } } + }, + }) + + let stream = gemini3Handler.createMessage(systemPrompt, mockMessages) + for await (const _chunk of stream) { + // consume + } + expect(gemini3Handler.getServerSideToolParts()).toHaveLength(1) + + // Second request: no server-side tool parts + mockGenerateContentStream.mockResolvedValueOnce({ + [Symbol.asyncIterator]: async function* () { + yield { text: "plain text" } + yield { usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 5 } } + }, + }) + + stream = gemini3Handler.createMessage(systemPrompt, mockMessages) + for await (const _chunk of stream) { + // consume + } + 
expect(gemini3Handler.getServerSideToolParts()).toBeUndefined() + }) + }) + describe("error telemetry", () => { const mockMessages: Anthropic.Messages.MessageParam[] = [ { diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index a49073ea334..43f92622fa9 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -29,6 +29,27 @@ import { getModelParams } from "../transform/model-params" import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index" import { BaseProvider } from "./base-provider" +/** + * Represents a server-side tool part returned by Gemini 3 when built-in tools + * (Google Search, Code Execution, URL Context) are combined with custom + * function declarations. These parts must be preserved and round-tripped in + * conversation history for the model to maintain context. + */ +export type ServerSideToolPart = { + type: "serverSideToolCall" | "serverSideToolResponse" | "executableCode" | "codeExecutionResult" + /** Raw part data from the Gemini API response, preserved for round-tripping. */ + data: Record<string, unknown> +} + +/** + * Returns true if the model ID corresponds to a Gemini 3+ model that supports + * combining server-side built-in tools (Google Search, URL Context, Code + * Execution) with client-side function declarations in a single generation. 
+ */ +function isGemini3Model(modelId: string): boolean { + return /^gemini-3/.test(modelId) +} + type GeminiHandlerOptions = ApiHandlerOptions & { isVertex?: boolean } @@ -39,6 +60,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl private client: GoogleGenAI private lastThoughtSignature?: string private lastResponseId?: string + private lastServerSideToolParts?: ServerSideToolPart[] private readonly providerName = "Gemini" constructor({ isVertex, ...options }: GeminiHandlerOptions) { @@ -80,6 +102,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl // Reset per-request metadata that we persist into apiConversationHistory. this.lastThoughtSignature = undefined this.lastResponseId = undefined + this.lastServerSideToolParts = undefined // For hybrid/budget reasoning models (e.g. Gemini 2.5 Pro), respect user-configured // modelMaxTokens so the ThinkingBudget slider can control the cap. For effort-only or @@ -129,18 +152,30 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl .flat() // Tools are always present (minimum ALWAYS_AVAILABLE_TOOLS). - // Google built-in tools (Grounding, URL Context) are mutually exclusive - // with function declarations in the Gemini API, so we always use - // function declarations when tools are provided. - const tools: GenerateContentConfig["tools"] = [ - { - functionDeclarations: (metadata?.tools ?? []).map((tool) => ({ - name: (tool as any).function.name, - description: (tool as any).function.description, - parametersJsonSchema: (tool as any).function.parameters, - })), - }, - ] + // For pre-Gemini 3 models, Google built-in tools (Grounding, URL Context) + // are mutually exclusive with function declarations. + // For Gemini 3+, we can combine them, enabling "tool context circulation" + // where the model can use both server-side built-in tools and client-side + // function declarations in a single generation. 
+ const isGemini3 = isGemini3Model(model) + + const functionDeclarationsTool = { + functionDeclarations: (metadata?.tools ?? []).map((tool) => ({ + name: (tool as any).function.name, + description: (tool as any).function.description, + parametersJsonSchema: (tool as any).function.parameters, + })), + } + + const tools: GenerateContentConfig["tools"] = isGemini3 + ? [ + functionDeclarationsTool, + // Enable Google Search as a built-in tool alongside custom function declarations. + // The model can invoke this server-side, and the results will be circulated back + // as context for subsequent turns. + { googleSearch: {} }, + ] + : [functionDeclarationsTool] // Determine temperature respecting model capabilities and defaults: // - If supportsTemperature is explicitly false, ignore user overrides @@ -235,6 +270,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl text?: string thoughtSignature?: string functionCall?: { name: string; args: Record<string, unknown> } + executableCode?: { code: string; language?: string } + codeExecutionResult?: { output: string; outcome?: string } }>) { // Capture thought signatures so they can be persisted into API history. const thoughtSignature = part.thoughtSignature @@ -277,6 +314,37 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl } toolCallCounter++ + } else if (part.executableCode) { + // Server-side code execution part (Gemini 3 built-in tool). + // Surface the code to the user as informational text and + // store the raw part for round-tripping in conversation history. + hasContent = true + const lang = part.executableCode.language ?? 
"python" + yield { + type: "text", + text: `\n\`\`\`${lang}\n${part.executableCode.code}\n\`\`\`\n`, + } + if (!this.lastServerSideToolParts) { + this.lastServerSideToolParts = [] + } + this.lastServerSideToolParts.push({ + type: "executableCode", + data: part.executableCode as unknown as Record<string, unknown>, + }) + } else if (part.codeExecutionResult) { + // Server-side code execution result (Gemini 3 built-in tool). + hasContent = true + yield { + type: "text", + text: `\n**Code Execution Result:**\n\`\`\`\n${part.codeExecutionResult.output}\n\`\`\`\n`, + } + if (!this.lastServerSideToolParts) { + this.lastServerSideToolParts = [] + } + this.lastServerSideToolParts.push({ + type: "codeExecutionResult", + data: part.codeExecutionResult as unknown as Record<string, unknown>, + }) } else { // This is regular content if (part.text) { @@ -463,6 +531,10 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl return this.lastResponseId } + public getServerSideToolParts(): ServerSideToolPart[] | undefined { + return this.lastServerSideToolParts + } + public calculateCost({ info, inputTokens, diff --git a/src/api/transform/__tests__/gemini-format.spec.ts b/src/api/transform/__tests__/gemini-format.spec.ts index 23f752e207f..c97d6296b95 100644 --- a/src/api/transform/__tests__/gemini-format.spec.ts +++ b/src/api/transform/__tests__/gemini-format.spec.ts @@ -484,4 +484,97 @@ describe("convertAnthropicMessageToGemini", () => { }, ]) }) + + describe("server-side tool parts (Gemini 3 tool context circulation)", () => { + it("should convert executableCode content blocks back to Gemini parts", () => { + const anthropicMessage: Anthropic.Messages.MessageParam = { + role: "assistant", + content: [ + { type: "text", text: "Let me run some code" }, + { + type: "executableCode", + data: { code: 'print("hello")', language: "python" }, + }, + ] as any, + } + + const result = convertAnthropicMessageToGemini(anthropicMessage) + + expect(result).toEqual([ + { + role: "model", + parts: [ + { 
text: "Let me run some code" }, + { executableCode: { code: 'print("hello")', language: "python" } }, + ], + }, + ]) + }) + + it("should convert codeExecutionResult content blocks back to Gemini parts", () => { + const anthropicMessage: Anthropic.Messages.MessageParam = { + role: "assistant", + content: [ + { + type: "codeExecutionResult", + data: { output: "hello", outcome: "OUTCOME_OK" }, + }, + ] as any, + } + + const result = convertAnthropicMessageToGemini(anthropicMessage) + + expect(result).toEqual([ + { + role: "model", + parts: [{ codeExecutionResult: { output: "hello", outcome: "OUTCOME_OK" } }], + }, + ]) + }) + + it("should handle mixed function calls and server-side tool parts", () => { + const toolIdToName = new Map([["tool-1", "read_file"]]) + + const anthropicMessage: Anthropic.Messages.MessageParam = { + role: "assistant", + content: [ + { + type: "executableCode", + data: { code: "result = search('api docs')", language: "python" }, + }, + { + type: "codeExecutionResult", + data: { output: "Found 3 results", outcome: "OUTCOME_OK" }, + }, + { + type: "tool_use", + id: "tool-1", + name: "read_file", + input: { path: "README.md" }, + }, + ] as any, + } + + const result = convertAnthropicMessageToGemini(anthropicMessage, { + includeThoughtSignatures: false, + toolIdToName, + }) + + expect(result).toEqual([ + { + role: "model", + parts: [ + { executableCode: { code: "result = search('api docs')", language: "python" } }, + { codeExecutionResult: { output: "Found 3 results", outcome: "OUTCOME_OK" } }, + { + functionCall: { + name: "read_file", + args: { path: "README.md" }, + }, + }, + ], + }, + ]) + }) + }) }) diff --git a/src/api/transform/gemini-format.ts b/src/api/transform/gemini-format.ts index 6f240362960..407ebcd6d75 100644 --- a/src/api/transform/gemini-format.ts +++ b/src/api/transform/gemini-format.ts @@ -11,7 +11,21 @@ type ReasoningContentBlock = { text: string } -type ExtendedContentBlockParam = Anthropic.ContentBlockParam | 
ThoughtSignatureContentBlock | ReasoningContentBlock +/** + * Represents a server-side tool part stored in conversation history. + * These are produced by Gemini 3 built-in tools (Google Search, Code Execution, + * URL Context) and must be round-tripped back to the model for context circulation. + */ +type ServerSideToolContentBlock = { + type: "serverSideToolCall" | "serverSideToolResponse" | "executableCode" | "codeExecutionResult" + data: Record<string, unknown> +} + +type ExtendedContentBlockParam = + | Anthropic.ContentBlockParam + | ThoughtSignatureContentBlock + | ReasoningContentBlock + | ServerSideToolContentBlock type ExtendedAnthropicContent = string | ExtendedContentBlockParam[] // Extension type to safely add thoughtSignature to Part @@ -23,6 +37,15 @@ function isThoughtSignatureContentBlock(block: ExtendedContentBlockParam): block return block.type === "thoughtSignature" } +function isServerSideToolContentBlock(block: ExtendedContentBlockParam): block is ServerSideToolContentBlock { + return ( + block.type === "serverSideToolCall" || + block.type === "serverSideToolResponse" || + block.type === "executableCode" || + block.type === "codeExecutionResult" + ) +} + export function convertAnthropicContentToGemini( content: ExtendedAnthropicContent, options?: { includeThoughtSignatures?: boolean; toolIdToName?: Map<string, string> }, @@ -60,6 +83,23 @@ export function convertAnthropicContentToGemini( return [] } + // Handle server-side tool parts (Gemini 3 built-in tool context circulation). + // These parts are stored in conversation history and must be passed back to the + // model as-is so it can maintain context from previous server-side tool invocations. + if (isServerSideToolContentBlock(block)) { + const data = block.data + switch (block.type) { + case "executableCode": + return { executableCode: data } as Part + case "codeExecutionResult": + return { codeExecutionResult: data } as Part + default: + // For generic server-side tool call/response parts, pass through the raw data. 
+ // The SDK Part type may not have explicit fields for these, so we cast. + return data as unknown as Part + } + } + switch (block.type) { case "text": return { text: block.text } diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 005bb0f292b..d091edc66bf 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -869,6 +869,7 @@ export class Task extends EventEmitter implements TaskLike { getThoughtSignature?: () => string | undefined getSummary?: () => any[] | undefined getReasoningDetails?: () => any[] | undefined + getServerSideToolParts?: () => Array<{ type: string; data: Record<string, unknown> }> | undefined } if (message.role === "assistant") { @@ -983,6 +984,29 @@ export class Task extends EventEmitter implements TaskLike { } } + // For Gemini 3 models, persist server-side tool parts (executableCode, + // codeExecutionResult, etc.) so they can be round-tripped in subsequent turns. + // This enables "tool context circulation" where the model maintains context + // from previous server-side built-in tool invocations. + const serverSideToolParts = handler.getServerSideToolParts?.() + if (serverSideToolParts && serverSideToolParts.length > 0) { + const serverSideBlocks = serverSideToolParts.map((part) => ({ + type: part.type, + data: part.data, + })) + + if (typeof messageWithTs.content === "string") { + messageWithTs.content = [ + { type: "text", text: messageWithTs.content } satisfies Anthropic.Messages.TextBlockParam, + ...serverSideBlocks, + ] + } else if (Array.isArray(messageWithTs.content)) { + messageWithTs.content = [...messageWithTs.content, ...serverSideBlocks] + } else if (!messageWithTs.content) { + messageWithTs.content = serverSideBlocks + } + } + this.apiConversationHistory.push(messageWithTs) } else { // For user messages, validate tool_result IDs ONLY when the immediately previous *effective* message