feat(llm): cache hint TTL, breakpoint cap, and tool placement (#26779)

Kit Langton
2026-05-10 21:17:38 -04:00
committed by GitHub
parent fed716ada5
commit 77e6c0d329
12 changed files with 555 additions and 39 deletions

View File

@@ -16,6 +16,7 @@ import {
type ToolResultPart,
} from "../schema"
import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
import * as Cache from "./utils/cache"
import { ToolStream } from "./utils/tool-stream"
const ADAPTER = "anthropic-messages"
@@ -25,7 +26,10 @@ export const PATH = "/messages"
// =============================================================================
// Request Body Schema
// =============================================================================
const AnthropicCacheControl = Schema.Struct({ type: Schema.tag("ephemeral") })
const AnthropicCacheControl = Schema.Struct({
type: Schema.tag("ephemeral"),
ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
})
const AnthropicTextBlock = Schema.Struct({
type: Schema.tag("text"),
@@ -193,8 +197,24 @@ const invalid = ProviderShared.invalidRequest
// =============================================================================
// Request Lowering
// =============================================================================
const cacheControl = (cache: CacheHint | undefined) =>
cache?.type === "ephemeral" ? { type: "ephemeral" as const } : undefined
// Anthropic accepts at most 4 explicit cache_control breakpoints per request,
// across `tools`, `system`, and `messages`. Beyond the cap the API returns a
// 400, so the lowering layer counts emitted markers, drops any past the cap,
// and logs a warning with the dropped count.
const ANTHROPIC_BREAKPOINT_CAP = 4
const EPHEMERAL_5M = { type: "ephemeral" as const }
const EPHEMERAL_1H = { type: "ephemeral" as const, ttl: "1h" as const }
const cacheControl = (breakpoints: Cache.Breakpoints, cache: CacheHint | undefined) => {
if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
if (breakpoints.remaining <= 0) {
breakpoints.dropped += 1
return undefined
}
breakpoints.remaining -= 1
return Cache.ttlBucket(cache.ttlSeconds) === "1h" ? EPHEMERAL_1H : EPHEMERAL_5M
}
const anthropicMetadata = (metadata: Record<string, unknown>): ProviderMetadata => ({ anthropic: metadata })
@@ -204,10 +224,11 @@ const signatureFromMetadata = (metadata: ProviderMetadata | undefined): string |
return typeof anthropic.signature === "string" ? anthropic.signature : undefined
}
const lowerTool = (tool: ToolDefinition): AnthropicTool => ({
const lowerTool = (breakpoints: Cache.Breakpoints, tool: ToolDefinition): AnthropicTool => ({
name: tool.name,
description: tool.description,
input_schema: tool.inputSchema,
cache_control: cacheControl(breakpoints, tool.cache),
})
const lowerToolChoice = (toolChoice: NonNullable<LLMRequest["toolChoice"]>) =>
@@ -249,7 +270,10 @@ const lowerServerToolResult = Effect.fn("AnthropicMessages.lowerServerToolResult
return { type: wireType, tool_use_id: part.id, content: part.result.value } satisfies AnthropicServerToolResultBlock
})
const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (request: LLMRequest) {
const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
request: LLMRequest,
breakpoints: Cache.Breakpoints,
) {
const messages: AnthropicMessage[] = []
for (const message of request.messages) {
@@ -258,7 +282,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["text"]))
return yield* ProviderShared.unsupportedContent("Anthropic Messages", "user", ["text"])
content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
}
messages.push({ role: "user", content })
continue
@@ -268,7 +292,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
const content: AnthropicAssistantBlock[] = []
for (const part of message.content) {
if (part.type === "text") {
content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
continue
}
if (part.type === "reasoning") {
@@ -304,6 +328,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
tool_use_id: part.id,
content: ProviderShared.toolResultText(part),
is_error: part.result.type === "error" ? true : undefined,
cache_control: cacheControl(breakpoints, part.cache),
})
}
messages.push({ role: "user", content })
@@ -330,18 +355,33 @@ const lowerThinking = Effect.fn("AnthropicMessages.lowerThinking")(function* (re
const fromRequest = Effect.fn("AnthropicMessages.fromRequest")(function* (request: LLMRequest) {
const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
const generation = request.generation
// Allocate the 4-breakpoint budget in invalidation order: tools → system →
// messages. Tools live highest in the cache hierarchy, so when callers
// over-mark we keep their tool hints and shed the message-tail ones first.
const breakpoints = Cache.newBreakpoints(ANTHROPIC_BREAKPOINT_CAP)
const tools =
request.tools.length === 0 || request.toolChoice?.type === "none"
? undefined
: request.tools.map((tool) => lowerTool(breakpoints, tool))
const system =
request.system.length === 0
? undefined
: request.system.map((part) => ({
type: "text" as const,
text: part.text,
cache_control: cacheControl(breakpoints, part.cache),
}))
const messages = yield* lowerMessages(request, breakpoints)
if (breakpoints.dropped > 0) {
yield* Effect.logWarning(
`Anthropic Messages: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${ANTHROPIC_BREAKPOINT_CAP} per request.`,
)
}
return {
model: request.model.id,
system:
request.system.length === 0
? undefined
: request.system.map((part) => ({
type: "text" as const,
text: part.text,
cache_control: cacheControl(part.cache),
})),
messages: yield* lowerMessages(request),
tools: request.tools.length === 0 || request.toolChoice?.type === "none" ? undefined : request.tools.map(lowerTool),
system,
messages,
tools,
tool_choice: toolChoice,
stream: true as const,
max_tokens: generation?.maxTokens ?? request.model.limits.output ?? 4096,
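
For reference, a self-contained sketch of the budget accounting the hunk above relies on; the names mirror the diff, but the spend helper is hypothetical and only illustrates how markers past the cap get dropped and counted:

interface Breakpoints {
  remaining: number
  dropped: number
}
const newBreakpoints = (cap: number): Breakpoints => ({ remaining: cap, dropped: 0 })
// Hypothetical helper: returns true when a marker may still be emitted,
// otherwise records the drop (the same bookkeeping cacheControl does above).
const spend = (b: Breakpoints): boolean => {
  if (b.remaining <= 0) {
    b.dropped += 1
    return false
  }
  b.remaining -= 1
  return true
}
const budget = newBreakpoints(4)
const markedBlocks = 6 // e.g. six system parts each carrying a CacheHint
let emitted = 0
for (let i = 0; i < markedBlocks; i++) if (spend(budget)) emitted += 1
console.log(emitted, budget.dropped) // 4 2: the last two markers are omitted and a warning is logged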

View File

@@ -108,7 +108,7 @@ type BedrockMessage = Schema.Schema.Type<typeof BedrockMessage>
const BedrockSystemBlock = Schema.Union([BedrockTextBlock, BedrockCache.CachePointBlock])
type BedrockSystemBlock = Schema.Schema.Type<typeof BedrockSystemBlock>
const BedrockTool = Schema.Struct({
const BedrockToolSpec = Schema.Struct({
toolSpec: Schema.Struct({
name: Schema.String,
description: Schema.String,
@@ -117,6 +117,9 @@ const BedrockTool = Schema.Struct({
}),
}),
})
type BedrockToolSpec = Schema.Schema.Type<typeof BedrockToolSpec>
const BedrockTool = Schema.Union([BedrockToolSpec, BedrockCache.CachePointBlock])
type BedrockTool = Schema.Schema.Type<typeof BedrockTool>
const BedrockToolChoice = Schema.Union([
@@ -214,7 +217,7 @@ type BedrockEvent = Schema.Schema.Type<typeof BedrockEvent>
// =============================================================================
// Request Lowering
// =============================================================================
const lowerTool = (tool: ToolDefinition): BedrockTool => ({
const lowerToolSpec = (tool: ToolDefinition): BedrockToolSpec => ({
toolSpec: {
name: tool.name,
description: tool.description,
@@ -222,11 +225,25 @@ const lowerTool = (tool: ToolDefinition): BedrockTool => ({
},
})
const lowerTools = (
breakpoints: BedrockCache.Breakpoints,
tools: ReadonlyArray<ToolDefinition>,
): BedrockTool[] => {
const result: BedrockTool[] = []
for (const tool of tools) {
result.push(lowerToolSpec(tool))
const cachePoint = BedrockCache.block(breakpoints, tool.cache)
if (cachePoint) result.push(cachePoint)
}
return result
}
const textWithCache = (
breakpoints: BedrockCache.Breakpoints,
text: string,
cache: CacheHint | undefined,
): Array<BedrockTextBlock | BedrockCache.CachePointBlock> => {
const cachePoint = BedrockCache.block(cache)
const cachePoint = BedrockCache.block(breakpoints, cache)
return cachePoint ? [{ text }, cachePoint] : [{ text }]
}
@@ -257,7 +274,10 @@ const lowerToolResult = (part: ToolResultPart): BedrockToolResultBlock => ({
},
})
const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (request: LLMRequest) {
const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (
request: LLMRequest,
breakpoints: BedrockCache.Breakpoints,
) {
const messages: BedrockMessage[] = []
for (const message of request.messages) {
@@ -267,7 +287,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
if (!ProviderShared.supportsContent(part, ["text", "media"]))
return yield* ProviderShared.unsupportedContent("Bedrock Converse", "user", ["text", "media"])
if (part.type === "text") {
content.push(...textWithCache(part.text, part.cache))
content.push(...textWithCache(breakpoints, part.text, part.cache))
continue
}
if (part.type === "media") {
@@ -289,7 +309,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
"tool-call",
])
if (part.type === "text") {
content.push(...textWithCache(part.text, part.cache))
content.push(...textWithCache(breakpoints, part.text, part.cache))
continue
}
if (part.type === "reasoning") {
@@ -309,11 +329,13 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
continue
}
const content: BedrockToolResultBlock[] = []
const content: BedrockUserBlock[] = []
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["tool-result"]))
return yield* ProviderShared.unsupportedContent("Bedrock Converse", "tool", ["tool-result"])
content.push(lowerToolResult(part))
const cachePoint = BedrockCache.block(breakpoints, part.cache)
if (cachePoint) content.push(cachePoint)
}
messages.push({ role: "user", content })
}
@@ -323,16 +345,32 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
// System prompts share the cache-point convention: emit the text block, then
// optionally a positional `cachePoint` marker.
const lowerSystem = (system: ReadonlyArray<LLMRequest["system"][number]>): BedrockSystemBlock[] =>
system.flatMap((part) => textWithCache(part.text, part.cache))
const lowerSystem = (
breakpoints: BedrockCache.Breakpoints,
system: ReadonlyArray<LLMRequest["system"][number]>,
): BedrockSystemBlock[] => system.flatMap((part) => textWithCache(breakpoints, part.text, part.cache))
const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request: LLMRequest) {
const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
const generation = request.generation
// Bedrock-Claude shares Anthropic's 4-breakpoint cap. Spend the budget in
// tools → system → messages order to favour the highest-impact prefixes.
const breakpoints = BedrockCache.breakpoints()
const toolConfig =
request.tools.length > 0 && request.toolChoice?.type !== "none"
? { tools: lowerTools(breakpoints, request.tools), toolChoice }
: undefined
const system = request.system.length === 0 ? undefined : lowerSystem(breakpoints, request.system)
const messages = yield* lowerMessages(request, breakpoints)
if (breakpoints.dropped > 0) {
yield* Effect.logWarning(
`Bedrock Converse: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${BedrockCache.BEDROCK_BREAKPOINT_CAP} per request.`,
)
}
return {
modelId: request.model.id,
messages: yield* lowerMessages(request),
system: request.system.length === 0 ? undefined : lowerSystem(request.system),
messages,
system,
inferenceConfig:
generation?.maxTokens === undefined &&
generation?.temperature === undefined &&
@@ -345,10 +383,7 @@ const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request:
topP: generation?.topP,
stopSequences: generation?.stop,
},
toolConfig:
request.tools.length > 0 && request.toolChoice?.type !== "none"
? { tools: request.tools.map(lowerTool), toolChoice }
: undefined,
toolConfig,
}
})

View File

@@ -1,20 +1,37 @@
import { Schema } from "effect"
import type { CacheHint } from "../../schema"
import { newBreakpoints, ttlBucket, type Breakpoints } from "./cache"
// Bedrock cache markers are positional: emit a `cachePoint` block immediately
// after the content the caller wants treated as a cacheable prefix.
// after the content the caller wants treated as a cacheable prefix. Bedrock
// accepts optional `ttl: "5m" | "1h"` on cachePoint, mirroring Anthropic.
export const CachePointBlock = Schema.Struct({
cachePoint: Schema.Struct({ type: Schema.tag("default") }),
cachePoint: Schema.Struct({
type: Schema.tag("default"),
ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
}),
})
export type CachePointBlock = Schema.Schema.Type<typeof CachePointBlock>
// Bedrock recently added optional `ttl: "5m" | "1h"` on cachePoint. Map
// `CacheHint.ttlSeconds` here once a recorded cassette validates the wire shape.
const DEFAULT: CachePointBlock = { cachePoint: { type: "default" } }
// Bedrock-Claude enforces the same 4-breakpoint cap as the Anthropic Messages
// API. Callers pass a shared counter through every `block()` call site so the
// budget is respected across `system`, `messages`, and `tools`.
export const BEDROCK_BREAKPOINT_CAP = 4
export const block = (cache: CacheHint | undefined): CachePointBlock | undefined => {
export type { Breakpoints } from "./cache"
export const breakpoints = () => newBreakpoints(BEDROCK_BREAKPOINT_CAP)
const DEFAULT_5M: CachePointBlock = { cachePoint: { type: "default" } }
const DEFAULT_1H: CachePointBlock = { cachePoint: { type: "default", ttl: "1h" } }
export const block = (breakpoints: Breakpoints, cache: CacheHint | undefined): CachePointBlock | undefined => {
if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
return DEFAULT
if (breakpoints.remaining <= 0) {
breakpoints.dropped += 1
return undefined
}
breakpoints.remaining -= 1
return ttlBucket(cache.ttlSeconds) === "1h" ? DEFAULT_1H : DEFAULT_5M
}
export * as BedrockCache from "./bedrock-cache"
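
A rough, self-contained sketch of the positional convention these helpers support; the types and the lowerSystemPart helper are assumptions for illustration, not the actual module:

type CachePoint = { cachePoint: { type: "default"; ttl?: "1h" } }
type TextBlock = { text: string }
// Hypothetical mini-lowering: emit the text block first, then the positional
// marker that tells Bedrock to cache the prefix up to and including that block.
// (The real block() additionally checks the shared breakpoint budget first.)
const lowerSystemPart = (text: string, hint?: { ttl?: "1h" }): Array<TextBlock | CachePoint> => {
  const blocks: Array<TextBlock | CachePoint> = [{ text }]
  if (hint) blocks.push({ cachePoint: hint.ttl ? { type: "default", ttl: hint.ttl } : { type: "default" } })
  return blocks
}
console.log(lowerSystemPart("large stable prompt", { ttl: "1h" }))
// [ { text: "large stable prompt" }, { cachePoint: { type: "default", ttl: "1h" } } ]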

View File

@@ -0,0 +1,16 @@
// Shared helpers for provider cache-marker lowering. Anthropic and Bedrock
// both enforce a 4-breakpoint cap per request and accept the same `5m`/`1h`
// TTL buckets, so the counter and TTL mapping live here.
export interface Breakpoints {
remaining: number
dropped: number
}
export const newBreakpoints = (cap: number): Breakpoints => ({ remaining: cap, dropped: 0 })
// Returns `"1h"` for any `ttlSeconds >= 3600`, otherwise `undefined` (i.e. the
// provider default of 5m). Anthropic and Bedrock both treat anything shorter
// than an hour as the 5m bucket.
export const ttlBucket = (ttlSeconds: number | undefined): "1h" | undefined =>
ttlSeconds !== undefined && ttlSeconds >= 3600 ? "1h" : undefined
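
A few illustrative calls against the helpers above (return values follow the implementation shown, not recorded output):

ttlBucket(undefined) // -> undefined: no hint, so the provider default (5m) applies
ttlBucket(300)       // -> undefined: anything under an hour stays in the 5m bucket
ttlBucket(3600)      // -> "1h"
ttlBucket(86400)     // -> "1h": there is no longer bucket, everything >= 3600s maps to "1h"
newBreakpoints(4)    // -> { remaining: 4, dropped: 0 }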

View File

@@ -79,6 +79,7 @@ export const ToolResultPart = Object.assign(
name: Schema.String,
result: ToolResultValue,
providerExecuted: Schema.optional(Schema.Boolean),
cache: Schema.optional(CacheHint),
metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
providerMetadata: Schema.optional(ProviderMetadata),
}).annotate({ identifier: "LLM.Content.ToolResult" }),
@@ -94,6 +95,7 @@ export const ToolResultPart = Object.assign(
name: input.name,
result: ToolResultValue.make(input.result, input.resultType),
providerExecuted: input.providerExecuted,
cache: input.cache,
metadata: input.metadata,
providerMetadata: input.providerMetadata,
}),
@@ -151,6 +153,7 @@ export class ToolDefinition extends Schema.Class<ToolDefinition>("LLM.ToolDefini
name: Schema.String,
description: Schema.String,
inputSchema: JsonSchema,
cache: Schema.optional(CacheHint),
metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
}) {}

View File

@@ -0,0 +1,48 @@
import { Redactor } from "@opencode-ai/http-recorder"
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { CacheHint, LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as AnthropicMessages from "../../src/protocols/anthropic-messages"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const model = AnthropicMessages.model({
id: "claude-haiku-4-5-20251001",
apiKey: process.env.ANTHROPIC_API_KEY ?? "fixture",
})
// Two identical generations in a row. The first call writes the prefix into
// Anthropic's cache; the second should report a cache read against the same
// prefix. The cassette captures both interactions in order.
const cacheRequest = LLM.request({
id: "recorded_anthropic_cache",
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})
const recorded = recordedTests({
prefix: "anthropic-messages-cache",
provider: "anthropic",
protocol: "anthropic-messages",
requires: ["ANTHROPIC_API_KEY"],
options: { redactor: Redactor.defaults({ requestHeaders: { allow: ["content-type", "anthropic-version"] } }) },
})
describe("Anthropic Messages cache recorded", () => {
recorded.effect.with("writes then reads cache_control on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
// The first call may write the cache (cacheWriteInputTokens > 0) or it
// may be a fresh miss (both fields 0) depending on whether the prefix is
// already warm on Anthropic's side. The assertion that matters is that
// the SECOND call reports a non-zero cache read.
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})

View File

@@ -374,4 +374,134 @@ describe("Anthropic Messages route", () => {
expect(error.message).toContain("Anthropic Messages user messages only support text content for now")
}),
)
it.effect("maps ttlSeconds >= 3600 to cache_control ttl: '1h'", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: { type: "text", text: "system", cache: new CacheHint({ type: "ephemeral", ttlSeconds: 3600 }) },
prompt: "hi",
}),
)
expect(prepared.body).toMatchObject({
system: [{ type: "text", text: "system", cache_control: { type: "ephemeral", ttl: "1h" } }],
})
}),
)
it.effect("emits cache_control on tool definitions and tool-result blocks", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{
name: "lookup",
description: "lookup tool",
inputSchema: { type: "object", properties: {} },
cache: new CacheHint({ type: "ephemeral" }),
},
],
messages: [
LLM.user("What's the weather?"),
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({
id: "call_1",
name: "lookup",
result: { temp: 72 },
cache: new CacheHint({ type: "ephemeral" }),
}),
],
}),
)
expect(prepared.body).toMatchObject({
tools: [{ name: "lookup", cache_control: { type: "ephemeral" } }],
messages: [
{ role: "user", content: [{ type: "text", text: "What's the weather?" }] },
{ role: "assistant", content: [{ type: "tool_use", id: "call_1", name: "lookup" }] },
{
role: "user",
content: [{ type: "tool_result", tool_use_id: "call_1", cache_control: { type: "ephemeral" } }],
},
],
})
}),
)
it.effect("drops cache_control breakpoints past the 4-per-request cap", () =>
Effect.gen(function* () {
const hint = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [
{ type: "text", text: "a", cache: hint },
{ type: "text", text: "b", cache: hint },
{ type: "text", text: "c", cache: hint },
{ type: "text", text: "d", cache: hint },
{ type: "text", text: "e", cache: hint },
{ type: "text", text: "f", cache: hint },
],
prompt: "hi",
}),
)
const system = (prepared.body as { system: Array<{ cache_control?: unknown }> }).system
const marked = system.filter((part) => part.cache_control !== undefined)
expect(marked).toHaveLength(4)
expect(system[4]?.cache_control).toBeUndefined()
expect(system[5]?.cache_control).toBeUndefined()
}),
)
it.effect("spends breakpoint budget on tools before system before messages", () =>
Effect.gen(function* () {
const hint = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{
name: "t1",
description: "t1",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t2",
description: "t2",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t3",
description: "t3",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t4",
description: "t4",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
],
system: [{ type: "text", text: "system-tail", cache: hint }],
messages: [LLM.user([{ type: "text", text: "message-tail", cache: hint }])],
}),
)
const body = prepared.body as {
tools: Array<{ cache_control?: unknown }>
system: Array<{ cache_control?: unknown }>
messages: Array<{ content: Array<{ cache_control?: unknown }> }>
}
expect(body.tools.every((t) => t.cache_control !== undefined)).toBe(true)
expect(body.system[0]?.cache_control).toBeUndefined()
expect(body.messages[0]?.content[0]?.cache_control).toBeUndefined()
}),
)
})

View File

@@ -0,0 +1,50 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { CacheHint, LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as BedrockConverse from "../../src/protocols/bedrock-converse"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const RECORDING_REGION = process.env.BEDROCK_RECORDING_REGION ?? "us-east-1"
// Use a Claude model on Bedrock — Nova has automatic prefix caching that
// doesn't reliably surface `cacheRead`/`cacheWrite` in usage, so the second
// call wouldn't deterministically prove cache mapping works. Override with
// BEDROCK_CACHE_MODEL_ID if your account has access elsewhere.
const model = BedrockConverse.model({
id: process.env.BEDROCK_CACHE_MODEL_ID ?? "us.anthropic.claude-haiku-4-5-20251001-v1:0",
credentials: {
region: RECORDING_REGION,
accessKeyId: process.env.AWS_ACCESS_KEY_ID ?? "fixture",
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY ?? "fixture",
sessionToken: process.env.AWS_SESSION_TOKEN,
},
})
const cacheRequest = LLM.request({
id: "recorded_bedrock_cache",
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})
const recorded = recordedTests({
prefix: "bedrock-converse-cache",
provider: "amazon-bedrock",
protocol: "bedrock-converse",
requires: ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
})
describe("Bedrock Converse cache recorded", () => {
recorded.effect.with("writes then reads cachePoint on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})

View File

@@ -440,6 +440,79 @@ describe("Bedrock Converse route", () => {
expect(error.message).toContain("Bedrock Converse does not support media type application/x-tar")
}),
)
it.effect("maps ttlSeconds >= 3600 to cachePoint ttl: '1h'", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral", ttlSeconds: 3600 })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [{ type: "text", text: "system", cache }],
prompt: "hi",
}),
)
expect(prepared.body).toMatchObject({
system: [{ text: "system" }, { cachePoint: { type: "default", ttl: "1h" } }],
})
}),
)
it.effect("appends cachePoint after marked tool definitions and tool-result blocks", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{ name: "lookup", description: "lookup", inputSchema: { type: "object", properties: {} }, cache },
],
messages: [
LLM.user("What's the weather?"),
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
],
}),
)
expect(prepared.body).toMatchObject({
toolConfig: {
tools: [{ toolSpec: { name: "lookup" } }, { cachePoint: { type: "default" } }],
},
messages: [
{ role: "user", content: [{ text: "What's the weather?" }] },
{ role: "assistant", content: [{ toolUse: { toolUseId: "call_1" } }] },
{
role: "user",
content: [{ toolResult: { toolUseId: "call_1" } }, { cachePoint: { type: "default" } }],
},
],
})
}),
)
it.effect("drops cachePoint markers past the 4-per-request cap", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [
{ type: "text", text: "a", cache },
{ type: "text", text: "b", cache },
{ type: "text", text: "c", cache },
{ type: "text", text: "d", cache },
{ type: "text", text: "e", cache },
{ type: "text", text: "f", cache },
],
prompt: "hi",
}),
)
const system = (prepared.body as { system: Array<{ cachePoint?: unknown }> }).system
expect(system.filter((part) => "cachePoint" in part)).toHaveLength(4)
}),
)
})
// Live recorded integration tests. Run with `RECORD=true AWS_ACCESS_KEY_ID=...

View File

@@ -0,0 +1,47 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as Gemini from "../../src/protocols/gemini"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const model = Gemini.model({
id: "gemini-2.5-flash",
apiKey: process.env.GEMINI_API_KEY ?? "fixture",
})
// Gemini does implicit prefix caching on 2.5+ models above ~1024 tokens. The
// `CacheHint` is currently a no-op for Gemini (the explicit `CachedContent`
// API is out-of-band and intentionally not wired up). This test exists to
// pin the usage-parsing path: `cachedContentTokenCount` should surface as
// `cacheReadInputTokens` on the second identical call.
const cacheRequest = LLM.request({
id: "recorded_gemini_cache",
model,
system: LARGE_CACHEABLE_SYSTEM,
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})
const recorded = recordedTests({
prefix: "gemini-cache",
provider: "google",
protocol: "gemini",
requires: ["GEMINI_API_KEY"],
})
describe("Gemini cache recorded", () => {
recorded.effect.with("reports cachedContentTokenCount on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
// Implicit caching is best-effort on Gemini's side; we assert the field
// is at least populated and non-negative. When re-recording, verify the
// cassette shows > 0 in the second response's usage.
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
}),
)
})

View File

@@ -0,0 +1,44 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as OpenAIResponses from "../../src/protocols/openai-responses"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const model = OpenAIResponses.model({
id: "gpt-4.1-mini",
apiKey: process.env.OPENAI_API_KEY ?? "fixture",
})
// OpenAI caches prefixes automatically once they cross the 1024-token threshold;
// `CacheHint` is a no-op for the wire body. The stable signal is the
// `prompt_cache_key` routing hint, which keeps repeated calls on the same shard
// so cache hits are observable.
const cacheRequest = LLM.request({
id: "recorded_openai_responses_cache",
model,
system: LARGE_CACHEABLE_SYSTEM,
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
providerOptions: { openai: { promptCacheKey: "recorded-cache-test" } },
})
const recorded = recordedTests({
prefix: "openai-responses-cache",
provider: "openai",
protocol: "openai-responses",
requires: ["OPENAI_API_KEY"],
})
describe("OpenAI Responses cache recorded", () => {
recorded.effect.with("reports cached_tokens on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})

View File

@@ -6,6 +6,19 @@ import { tool } from "../src/tool"
export const weatherToolName = "get_weather"
// A deterministic system prompt long enough to clear every supported provider's
// minimum cacheable-prefix threshold (Anthropic Haiku 3.5: 2048 tokens; Anthropic
// Opus/Haiku 4.5: 4096 tokens; OpenAI/Gemini/Bedrock: lower). Built by repeating
// a fixed sentence — the cassette replays bit-for-bit, so the exact text matters
// only when re-recording with `RECORD=true`.
export const LARGE_CACHEABLE_SYSTEM = (() => {
const sentence =
"You are a concise, factual assistant. Answer precisely and avoid filler. Cite numbers when known. "
// ~100 chars per sentence × 250 repeats ≈ 25,000 chars ≈ 5k+ tokens, safely
// above every provider's threshold.
return sentence.repeat(250)
})()
export const weatherTool = LLM.toolDefinition({
name: weatherToolName,
description: "Get current weather for a city.",