fix(llm): emit structured image blocks for tool-result media in Anthropic Messages (#28755)

2026-05-23 21:04:36 +00:00 · 2026-05-22 12:23:41 -04:00
parent 700d012025
commit 9db90a0b76
4 changed files with 177 additions and 3 deletions
--- a/packages/llm/src/protocols/anthropic-messages.ts
+++ b/packages/llm/src/protocols/anthropic-messages.ts
@@ -14,6 +14,7 @@ import {
  type ProviderMetadata,
  type ToolCallPart,
  type ToolDefinition,
+  type ToolResultContentPart,
  type ToolResultPart,
 } from "../schema"
 import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
@@ -96,10 +97,18 @@ const AnthropicServerToolResultBlock = Schema.Struct({
 })
 type AnthropicServerToolResultBlock = Schema.Schema.Type<typeof AnthropicServerToolResultBlock>

+// Anthropic accepts either a plain string or an ordered array of text/image
+// blocks inside `tool_result.content`. The array form is required when a tool
+// returns image bytes (screenshot, image search, etc.) so they can be passed
+// to the model as proper image inputs instead of being JSON-stringified into
+// the prompt — which silently inflates context by megabytes and can push the
+// conversation over the model's token limit.
+const AnthropicToolResultContent = Schema.Union([AnthropicTextBlock, AnthropicImageBlock])
+
 const AnthropicToolResultBlock = Schema.Struct({
  type: Schema.tag("tool_result"),
  tool_use_id: Schema.String,
-  content: Schema.String,
+  content: Schema.Union([Schema.String, Schema.Array(AnthropicToolResultContent)]),
  is_error: Schema.optional(Schema.Boolean),
  cache_control: Schema.optional(AnthropicCacheControl),
 })
@@ -298,6 +307,31 @@ const lowerImage = Effect.fn("AnthropicMessages.lowerImage")(function* (part: Me
  } satisfies AnthropicImageBlock
 })

+// Lowers one structured tool-result item into a provider-native block (text or
+// base64 image) so image bytes are never JSON-stringified into a prompt string.
+const lowerToolResultContentItem = Effect.fn("AnthropicMessages.lowerToolResultContentItem")(function* (
+  item: ToolResultContentPart,
+) {
+  if (item.type === "text") return { type: "text" as const, text: item.text } satisfies AnthropicTextBlock
+  // Non-text items carry a `mediaType`; anything other than `image/*` is rejected.
+  const isImage = item.mediaType.startsWith("image/")
+  if (!isImage)
+    return yield* invalid(`Anthropic Messages tool-result media content only supports images, got ${item.mediaType}`)
+  const source = {
+    type: "base64" as const,
+    media_type: item.mediaType,
+    data: ProviderShared.mediaBase64(item),
+  }
+  return { type: "image" as const, source } satisfies AnthropicImageBlock
+})
+
+const lowerToolResultContent = Effect.fn("AnthropicMessages.lowerToolResultContent")(function* (part: ToolResultPart) {
+  // Structured `content` results are lowered item by item, preserving their order.
+  if (part.result.type === "content") return yield* Effect.forEach(part.result.value, lowerToolResultContentItem)
+  // Text / json / error results keep the plain-string form for backward
+  // compatibility with existing cassettes and provider expectations.
+  return ProviderShared.toolResultText(part)
+})
+
 const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
  request: LLMRequest,
  breakpoints: Cache.Breakpoints,
@@ -360,7 +394,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
      content.push({
        type: "tool_result",
        tool_use_id: part.id,
-        content: ProviderShared.toolResultText(part),
+        content: yield* lowerToolResultContent(part),
        is_error: part.result.type === "error" ? true : undefined,
        cache_control: cacheControl(breakpoints, part.cache),
      })
--- a/packages/llm/test/fixtures/recordings/anthropic-messages/anthropic-opus-4-7-image-tool-result.json
+++ b/packages/llm/test/fixtures/recordings/anthropic-messages/anthropic-opus-4-7-image-tool-result.json
--- a/packages/llm/test/provider/anthropic-messages.test.ts
+++ b/packages/llm/test/provider/anthropic-messages.test.ts
@@ -24,6 +24,19 @@ const request = LLM.request({
  generation: { maxTokens: 20, temperature: 0 },
 })

+type AnthropicToolResult = Extract<
+  AnthropicMessages.AnthropicMessagesBody["messages"][number]["content"][number],
+  { readonly type: "tool_result" }
+>
+
+const expectToolResult = (body: AnthropicMessages.AnthropicMessagesBody): AnthropicToolResult => {
+  // Returns the first `tool_result` block found in user-role messages; throws if there is none.
+  const userBlocks = body.messages.flatMap((message) => (message.role === "user" ? message.content : []))
+  const found = userBlocks.find((block): block is AnthropicToolResult => block.type === "tool_result")
+  if (found === undefined) throw new Error("expected the prepared body to contain a tool_result block")
+  return found
+}
+
 describe("Anthropic Messages route", () => {
  it.effect("prepares Anthropic Messages target", () =>
    Effect.gen(function* () {
@@ -71,6 +84,87 @@ describe("Anthropic Messages route", () => {
    }),
  )

+  // Regression: screenshot/read tool results must stay structured so base64
+  // image data is not JSON-stringified into `tool_result.content`.
+  it.effect("lowers image tool-result content as structured image blocks", () =>
+    Effect.gen(function* () {
+      const prepared = yield* LLMClient.prepare<AnthropicMessages.AnthropicMessagesBody>(
+        LLM.request({
+          id: "req_tool_result_image",
+          model,
+          messages: [
+            Message.user("Show me the screenshot."),
+            Message.assistant([ToolCallPart.make({ id: "call_1", name: "read", input: { filePath: "shot.png" } })]),
+            Message.tool({
+              id: "call_1",
+              name: "read",
+              resultType: "content",
+              result: [
+                { type: "text", text: "Image read successfully" },
+                { type: "media", mediaType: "image/png", data: "AAECAw==" },
+              ],
+            }),
+          ],
+          cache: "none",
+        }),
+      )
+
+      expect(expectToolResult(prepared.body).content).toEqual([
+        { type: "text", text: "Image read successfully" },
+        { type: "image", source: { type: "base64", media_type: "image/png", data: "AAECAw==" } },
+      ])
+    }),
+  )
+
+  it.effect("lowers single-image tool-result content as a structured image block", () =>
+    Effect.gen(function* () {
+      const prepared = yield* LLMClient.prepare<AnthropicMessages.AnthropicMessagesBody>(
+        LLM.request({
+          id: "req_tool_result_image_only",
+          model,
+          messages: [
+            Message.assistant([ToolCallPart.make({ id: "call_1", name: "screenshot", input: {} })]),
+            Message.tool({
+              id: "call_1",
+              name: "screenshot",
+              resultType: "content",
+              result: [{ type: "media", mediaType: "image/jpeg", data: "/9j/AA==" }],
+            }),
+          ],
+          cache: "none",
+        }),
+      )
+
+      expect(expectToolResult(prepared.body).content).toEqual([
+        { type: "image", source: { type: "base64", media_type: "image/jpeg", data: "/9j/AA==" } },
+      ])
+    }),
+  )
+
+  it.effect("rejects non-image media in tool-result content with a clear error", () =>
+    Effect.gen(function* () {
+      const error = yield* LLMClient.prepare(
+        LLM.request({
+          id: "req_tool_result_unsupported_media",
+          model,
+          messages: [
+            Message.assistant([ToolCallPart.make({ id: "call_1", name: "fetch", input: {} })]),
+            Message.tool({
+              id: "call_1",
+              name: "fetch",
+              resultType: "content",
+              result: [{ type: "media", mediaType: "audio/mpeg", data: "AAECAw==" }],
+            }),
+          ],
+          cache: "none",
+        }),
+      ).pipe(Effect.flip)
+
+      expect(error.message).toContain("Anthropic Messages")
+      expect(error.message).toContain("audio/mpeg")
+    }),
+  )
+
  it.effect("prepares the composed native continuation request", () =>
    Effect.gen(function* () {
      const prepared = yield* LLMClient.prepare<AnthropicMessages.AnthropicMessagesBody>(
--- a/packages/llm/test/provider/golden.recorded.test.ts
+++ b/packages/llm/test/provider/golden.recorded.test.ts
@@ -113,7 +113,10 @@ describeRecordedGoldenScenarios([
    requires: ["ANTHROPIC_API_KEY"],
    tags: ["flagship"],
    options: { redactor: Redactor.defaults({ requestHeaders: { allow: ["content-type", "anthropic-version"] } }) },
-    scenarios: [{ id: "tool-loop", temperature: false }],
+    scenarios: [
+      { id: "tool-loop", temperature: false },
+      { id: "image-tool-result", temperature: false, maxTokens: 40 },
+    ],
  },
  {
    name: "Gemini 2.5 Flash",