chore(llm): make cache: 'auto' the default (#26798)

2026-05-18 02:22:32 +00:00 · 2026-05-10 22:46:01 -04:00
parent 721ff5121e
commit 9b369ee815
8 changed files with 41 additions and 17 deletions
--- a/packages/llm/README.md
+++ b/packages/llm/README.md
@@ -35,26 +35,25 @@ Run `LLMClient.stream(request)` instead of `generate` when you want incremental

 ## Caching

-Prompt caching is unified across providers. Mark content with a `CacheHint` and each protocol translates it to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI's implicit caching needs no markers).
+Prompt caching is **on by default**. Every `LLMRequest` that omits `cache` resolves to `cache: "auto"`; callers opt out with `cache: "none"` or override placement with a policy object (see Granular policy). Each protocol translates `CacheHint`s to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI and Gemini do implicit caching server-side and don't need inline markers — auto is a no-op there).

 ### Auto placement

-The simplest path is `cache: "auto"` on the request:
+`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
+
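+The math justifies the default: Anthropic's 5-minute cache write is 1.25× base, read is 0.1×, so a single reuse within 5 minutes already wins (1.25× + 0.1× = 1.35× versus 2× for two uncached calls). One-shot completions below the per-model minimum-cacheable-token threshold silently no-op on the wire, so the worst case is a one-time 0.25× write premium on a larger one-shot prompt that is never reused — the case `cache: "none"` exists for.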
+
+### Opting out

 ```ts
 LLM.request({
  model,
  system,
-  messages,
-  tools,
-  cache: "auto",
+  prompt: "one-off question",
+  cache: "none",
 })
 ```

-`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
-
-On OpenAI and Gemini `"auto"` is a no-op (their wire formats don't accept inline markers — both use implicit caching). On Anthropic and Bedrock it emits provider-native cache markers.
-
 ### Granular policy

 ```ts
--- a/packages/llm/src/cache-policy.ts
+++ b/packages/llm/src/cache-policy.ts
@@ -24,15 +24,15 @@ const AUTO: CachePolicyObject = {
 const NONE: CachePolicyObject = {}

 // Resolution rules:
-//   - undefined   → "none" (opt-in default so the policy never changes wire
-//                   shape for existing callers; downstream code can flip to
-//                   `cache: "auto"` once they audit the placement choices).
-//   - "auto"      → the recommended policy: tools + system + latest user msg.
+//   - undefined   → "auto" — caching is on by default. The math favors it:
+//                   Anthropic 5m-cache write is 1.25x base, read is 0.1x, so
+//                   one reuse within 5 minutes (1.35x vs 2x uncached) wins.
+//   - "auto"      → tools + system + latest user msg.
 //   - "none"      → no auto placement; manual `CacheHint`s still flow.
 //   - object form → exactly what the caller asked for.
 const resolve = (policy: CachePolicy | undefined): CachePolicyObject => {
-  if (policy === undefined || policy === "none") return NONE
-  if (policy === "auto") return AUTO
+  if (policy === undefined || policy === "auto") return AUTO
+  if (policy === "none") return NONE
  return policy
 }

--- a/packages/llm/test/cache-policy.test.ts
+++ b/packages/llm/test/cache-policy.test.ts
@@ -33,7 +33,7 @@ const geminiModel = Gemini.model({
 })

 describe("applyCachePolicy", () => {
-  it.effect("undefined cache leaves the request untouched (opt-in default)", () =>
+  it.effect("undefined cache resolves to 'auto' (the recommended default)", () =>
    Effect.gen(function* () {
      const prepared = yield* LLMClient.prepare(
        LLM.request({
@@ -43,8 +43,11 @@ describe("applyCachePolicy", () => {
        }),
      )

+      // No explicit cache field → auto policy fires → last system part + latest
+      // user message both get cache_control markers.
      expect(prepared.body).toMatchObject({
-        system: [{ type: "text", text: "You are concise.", cache_control: undefined }],
+        system: [{ type: "text", text: "You are concise.", cache_control: { type: "ephemeral" } }],
+        messages: [{ role: "user", content: [{ type: "text", text: "hi", cache_control: { type: "ephemeral" } }] }],
      })
    }),
  )
@@ -252,6 +255,7 @@ describe("applyCachePolicy", () => {
    const request = LLM.request({
      model: anthropicModel,
      prompt: "hi",
+      cache: "none",
    })
    expect(applyCachePolicy(request)).toBe(request)
  })
--- a/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
+++ b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
@@ -20,6 +20,9 @@ const cacheRequest = LLM.request({
  model,
  system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
  prompt: "Say hi.",
+  // Manual hint on the system part is the only marker we want here — skip the
+  // auto-policy's latest-user-message breakpoint so the cassette body matches.
+  cache: "none",
  generation: { maxTokens: 16, temperature: 0 },
 })

--- a/packages/llm/test/provider/anthropic-messages.test.ts
+++ b/packages/llm/test/provider/anthropic-messages.test.ts
@@ -18,6 +18,9 @@ const request = LLM.request({
  model,
  system: { type: "text", text: "You are concise.", cache: new CacheHint({ type: "ephemeral" }) },
  prompt: "Say hello.",
+  // This fixture predates the `cache: "auto"` default; pin the policy off so
+  // existing wire-shape assertions only see the manual hint on the system part.
+  cache: "none",
  generation: { maxTokens: 20, temperature: 0 },
 })

@@ -48,6 +51,7 @@ describe("Anthropic Messages route", () => {
            LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: { query: "weather" } })]),
            LLM.toolMessage({ id: "call_1", name: "lookup", result: { forecast: "sunny" } }),
          ],
+          cache: "none",
        }),
      )

--- a/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
+++ b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
@@ -27,6 +27,9 @@ const cacheRequest = LLM.request({
  model,
  system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
  prompt: "Say hi.",
+  // Manual hint on the system part is the only marker we want here — skip the
+  // auto-policy's latest-user-message breakpoint so the cassette body matches.
+  cache: "none",
  generation: { maxTokens: 16, temperature: 0 },
 })

--- a/packages/llm/test/provider/bedrock-converse.test.ts
+++ b/packages/llm/test/provider/bedrock-converse.test.ts
@@ -63,6 +63,9 @@ const baseRequest = LLM.request({
  model,
  system: "You are concise.",
  prompt: "Say hello.",
+  // Wire-shape assertions in this file predate the `cache: "auto"` default;
+  // pin the policy off so they only exercise the lowering path itself.
+  cache: "none",
  generation: { maxTokens: 64, temperature: 0 },
 })

@@ -125,6 +128,7 @@ describe("Bedrock Converse route", () => {
            LLM.assistant([LLM.toolCall({ id: "tool_1", name: "lookup", input: { query: "weather" } })]),
            LLM.toolMessage({ id: "tool_1", name: "lookup", result: { forecast: "sunny" } }),
          ],
+          cache: "none",
        }),
      )

@@ -339,6 +343,7 @@ describe("Bedrock Converse route", () => {
              { type: "media", mediaType: "image/webp", data: "DDDD" },
            ]),
          ],
+          cache: "none",
        }),
      )

@@ -470,6 +475,7 @@ describe("Bedrock Converse route", () => {
            LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
            LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
          ],
+          cache: "none",
        }),
      )

@@ -555,6 +561,7 @@ describe("Bedrock Converse recorded", () => {
          model: recordedModel(),
          system: "Reply with the single word 'Hello'.",
          prompt: "Say hello.",
+          cache: "none",
          generation: { maxTokens: 16, temperature: 0 },
        }),
      )
@@ -577,6 +584,7 @@ describe("Bedrock Converse recorded", () => {
          prompt: "Call get_weather with city exactly Paris.",
          tools: [weatherTool],
          toolChoice: LLM.toolChoice(weatherTool),
+          cache: "none",
          generation: { maxTokens: 80, temperature: 0 },
        }),
      )
--- a/packages/llm/test/recorded-scenarios.ts
+++ b/packages/llm/test/recorded-scenarios.ts
@@ -51,6 +51,7 @@ export const textRequest = (input: {
    model: input.model,
    system: "You are concise.",
    prompt: input.prompt ?? "Reply with exactly: Hello!",
+    cache: "none",
    generation:
      input.temperature === false
        ? { maxTokens: input.maxTokens ?? 20 }
@@ -70,6 +71,7 @@ export const weatherToolRequest = (input: {
    prompt: "Call get_weather with city exactly Paris.",
    tools: [weatherTool],
    toolChoice: LLM.toolChoice(weatherTool),
+    cache: "none",
    generation:
      input.temperature === false
        ? { maxTokens: input.maxTokens ?? 80 }
@@ -88,6 +90,7 @@ export const weatherToolLoopRequest = (input: {
    model: input.model,
    system: input.system ?? "Use the get_weather tool, then answer in one short sentence.",
    prompt: "What is the weather in Paris?",
+    cache: "none",
    generation:
      input.temperature === false
        ? { maxTokens: input.maxTokens ?? 80 }