fix(llm): emit structured image blocks for tool-result media in Anthropic Messages (#28755)

This commit is contained in:
Kit Langton
2026-05-22 12:23:41 -04:00
committed by GitHub
parent 700d012025
commit 9db90a0b76
4 changed files with 177 additions and 3 deletions

View File

@@ -14,6 +14,7 @@ import {
type ProviderMetadata,
type ToolCallPart,
type ToolDefinition,
type ToolResultContentPart,
type ToolResultPart,
} from "../schema"
import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
@@ -96,10 +97,18 @@ const AnthropicServerToolResultBlock = Schema.Struct({
})
type AnthropicServerToolResultBlock = Schema.Schema.Type<typeof AnthropicServerToolResultBlock>
// Element schema for the array form of `tool_result.content`.
//
// Anthropic accepts either a plain string or an ordered array of text/image
// blocks inside `tool_result.content`. The array form is required when a tool
// returns image bytes (screenshot, image search, etc.) so they can be passed
// to the model as proper image inputs instead of being JSON-stringified into
// the prompt — which silently inflates context by megabytes and can push the
// conversation over the model's token limit.
const AnthropicToolResultContent = Schema.Union([AnthropicTextBlock, AnthropicImageBlock])
// Schema for a `tool_result` content block.
//
// `content` is declared exactly once: an object literal with two `content`
// keys is rejected by the TypeScript compiler, and at runtime the later key
// would silently win, so the stale string-only declaration is removed.
const AnthropicToolResultBlock = Schema.Struct({
  type: Schema.tag("tool_result"),
  // Id of the tool call this result answers.
  tool_use_id: Schema.String,
  // Either a plain string or an ordered array of text/image blocks.
  content: Schema.Union([Schema.String, Schema.Array(AnthropicToolResultContent)]),
  is_error: Schema.optional(Schema.Boolean),
  cache_control: Schema.optional(AnthropicCacheControl),
})
@@ -298,6 +307,31 @@ const lowerImage = Effect.fn("AnthropicMessages.lowerImage")(function* (part: Me
} satisfies AnthropicImageBlock
})
// Lowers one structured tool-result part into the matching Anthropic content
// block. Text parts become text blocks and `image/*` media becomes a base64
// image block, so media stays provider-native content rather than being
// JSON-stringified as base64 into a prompt string. Any other media type is
// rejected through `invalid(...)` with a message naming the media type.
const lowerToolResultContentItem = Effect.fn("AnthropicMessages.lowerToolResultContentItem")(function* (
  item: ToolResultContentPart,
) {
  if (item.type === "text") {
    const textBlock = { type: "text" as const, text: item.text } satisfies AnthropicTextBlock
    return textBlock
  }

  // Only image media can be represented here; bail out before touching the payload.
  const isImage = item.mediaType.startsWith("image/")
  if (!isImage) {
    return yield* invalid(`Anthropic Messages tool-result media content only supports images, got ${item.mediaType}`)
  }

  const imageBlock = {
    type: "image" as const,
    source: {
      type: "base64" as const,
      media_type: item.mediaType,
      data: ProviderShared.mediaBase64(item),
    },
  } satisfies AnthropicImageBlock
  return imageBlock
})
// Chooses the `tool_result.content` representation for a tool result part.
// Only results of type "content" are lowered item by item into an array of
// blocks; every other result type keeps the string produced by
// `ProviderShared.toolResultText`, for backward compatibility with existing
// cassettes and provider expectations.
const lowerToolResultContent = Effect.fn("AnthropicMessages.lowerToolResultContent")(function* (part: ToolResultPart) {
  if (part.result.type === "content") {
    const blocks = yield* Effect.forEach(part.result.value, lowerToolResultContentItem)
    return blocks
  }
  return ProviderShared.toolResultText(part)
})
const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
request: LLMRequest,
breakpoints: Cache.Breakpoints,
@@ -360,7 +394,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
// Append the tool result as a `tool_result` block. `content` is set exactly
// once, from `lowerToolResultContent`; the stale duplicate `content` key that
// used `ProviderShared.toolResultText(part)` was shadowed by the later key and
// is a compile error in an object literal, so it is removed.
content.push({
  type: "tool_result",
  tool_use_id: part.id,
  content: yield* lowerToolResultContent(part),
  // Only flag errors explicitly; non-error results leave the field undefined.
  is_error: part.result.type === "error" ? true : undefined,
  cache_control: cacheControl(breakpoints, part.cache),
})

File diff suppressed because one or more lines are too long

View File

@@ -24,6 +24,19 @@ const request = LLM.request({
generation: { maxTokens: 20, temperature: 0 },
})
// Any content block that can appear in a message of a prepared Anthropic Messages body.
type AnthropicBodyContentBlock = AnthropicMessages.AnthropicMessagesBody["messages"][number]["content"][number]

// The `tool_result` member of that content-block union.
type AnthropicToolResult = Extract<AnthropicBodyContentBlock, { readonly type: "tool_result" }>
// Test helper: returns the first `tool_result` block found in the user
// messages of a prepared body, asserting that one exists.
const expectToolResult = (body: AnthropicMessages.AnthropicMessagesBody): AnthropicToolResult => {
  const findFirstToolResult = (): AnthropicToolResult | undefined => {
    for (const message of body.messages) {
      // Only user messages are searched for tool results.
      if (message.role !== "user") continue
      for (const block of message.content) {
        if (block.type === "tool_result") return block
      }
    }
    return undefined
  }

  const toolResult = findFirstToolResult()
  expect(toolResult).toBeDefined()
  return toolResult!
}
describe("Anthropic Messages route", () => {
it.effect("prepares Anthropic Messages target", () =>
Effect.gen(function* () {
@@ -71,6 +84,87 @@ describe("Anthropic Messages route", () => {
}),
)
// Regression: a tool result mixing text and an image must be lowered to an
// ordered array of structured blocks, so the base64 image data is never
// JSON-stringified into `tool_result.content`.
it.effect("lowers image tool-result content as structured image blocks", () =>
  Effect.gen(function* () {
    const imageToolResultRequest = LLM.request({
      id: "req_tool_result_image",
      model,
      messages: [
        Message.user("Show me the screenshot."),
        Message.assistant([ToolCallPart.make({ id: "call_1", name: "read", input: { filePath: "shot.png" } })]),
        Message.tool({
          id: "call_1",
          name: "read",
          resultType: "content",
          result: [
            { type: "text", text: "Image read successfully" },
            { type: "media", mediaType: "image/png", data: "AAECAw==" },
          ],
        }),
      ],
      cache: "none",
    })
    const prepared = yield* LLMClient.prepare<AnthropicMessages.AnthropicMessagesBody>(imageToolResultRequest)

    const loweredContent = expectToolResult(prepared.body).content
    const expectedContent = [
      { type: "text", text: "Image read successfully" },
      { type: "image", source: { type: "base64", media_type: "image/png", data: "AAECAw==" } },
    ]
    expect(loweredContent).toEqual(expectedContent)
  }),
)
// A tool result containing only an image still uses the array form, with a
// single structured image block.
it.effect("lowers single-image tool-result content as a structured image block", () =>
  Effect.gen(function* () {
    const imageOnlyRequest = LLM.request({
      id: "req_tool_result_image_only",
      model,
      messages: [
        Message.assistant([ToolCallPart.make({ id: "call_1", name: "screenshot", input: {} })]),
        Message.tool({
          id: "call_1",
          name: "screenshot",
          resultType: "content",
          result: [{ type: "media", mediaType: "image/jpeg", data: "/9j/AA==" }],
        }),
      ],
      cache: "none",
    })
    const prepared = yield* LLMClient.prepare<AnthropicMessages.AnthropicMessagesBody>(imageOnlyRequest)

    const loweredContent = expectToolResult(prepared.body).content
    const expectedContent = [
      { type: "image", source: { type: "base64", media_type: "image/jpeg", data: "/9j/AA==" } },
    ]
    expect(loweredContent).toEqual(expectedContent)
  }),
)
// Non-image media in a tool result must fail preparation with an error that
// names both the provider and the offending media type.
it.effect("rejects non-image media in tool-result content with a clear error", () =>
  Effect.gen(function* () {
    const unsupportedMediaRequest = LLM.request({
      id: "req_tool_result_unsupported_media",
      model,
      messages: [
        Message.assistant([ToolCallPart.make({ id: "call_1", name: "fetch", input: {} })]),
        Message.tool({
          id: "call_1",
          name: "fetch",
          resultType: "content",
          result: [{ type: "media", mediaType: "audio/mpeg", data: "AAECAw==" }],
        }),
      ],
      cache: "none",
    })

    // Flip the effect so the expected failure becomes the success value.
    const error = yield* Effect.flip(LLMClient.prepare(unsupportedMediaRequest))

    expect(error.message).toContain("Anthropic Messages")
    expect(error.message).toContain("audio/mpeg")
  }),
)
it.effect("prepares the composed native continuation request", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare<AnthropicMessages.AnthropicMessagesBody>(

View File

@@ -113,7 +113,10 @@ describeRecordedGoldenScenarios([
requires: ["ANTHROPIC_API_KEY"],
tags: ["flagship"],
options: { redactor: Redactor.defaults({ requestHeaders: { allow: ["content-type", "anthropic-version"] } }) },
scenarios: [{ id: "tool-loop", temperature: false }],
scenarios: [
{ id: "tool-loop", temperature: false },
{ id: "image-tool-result", temperature: false, maxTokens: 40 },
],
},
{
name: "Gemini 2.5 Flash",