diff --git a/packages/llm/README.md b/packages/llm/README.md index e164c4bf53..321bf715bb 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -35,26 +35,25 @@ Run `LLMClient.stream(request)` instead of `generate` when you want incremental ## Caching -Prompt caching is unified across providers. Mark content with a `CacheHint` and each protocol translates it to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI's implicit caching needs no markers). +Prompt caching is **on by default**. Every `LLMRequest` resolves to `cache: "auto"` unless the caller opts out with `cache: "none"`. Each protocol translates `CacheHint`s to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI and Gemini do implicit caching server-side and don't need inline markers — `"auto"` is a no-op there). ### Auto placement -The simplest path is `cache: "auto"` on the request: +`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit. + +The math justifies the default: Anthropic's 5-minute cache write is 1.25× base, read is 0.1×, so a single reuse within 5 minutes already wins. One-shot completions below the per-model minimum-cacheable-token threshold silently no-op on the wire; above it, a prefix that is never reused costs only the extra 0.25× write premium on the cached tokens, so the worst case is a small, bounded overhead. + +### Opting out ```ts LLM.request({ model, system, - messages, - tools, - cache: "auto", + prompt: "one-off question", + cache: "none", }) ``` -`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. 
Caching at that boundary lets every intra-turn API call hit. - -On OpenAI and Gemini `"auto"` is a no-op (their wire formats don't accept inline markers — both use implicit caching). On Anthropic and Bedrock it emits provider-native cache markers. - ### Granular policy ```ts diff --git a/packages/llm/src/cache-policy.ts b/packages/llm/src/cache-policy.ts index b9dca4e88f..6ab7a049fe 100644 --- a/packages/llm/src/cache-policy.ts +++ b/packages/llm/src/cache-policy.ts @@ -24,15 +24,15 @@ const AUTO: CachePolicyObject = { const NONE: CachePolicyObject = {} // Resolution rules: -// - undefined → "none" (opt-in default so the policy never changes wire -// shape for existing callers; downstream code can flip to -// `cache: "auto"` once they audit the placement choices). -// - "auto" → the recommended policy: tools + system + latest user msg. +// - undefined → "auto" — caching is on by default. The math favors it: +// Anthropic 5m-cache write is 1.25x base, read is 0.1x, +// so a single reuse within 5 minutes already wins. +// - "auto" → tools + system + latest user msg. // - "none" → no auto placement; manual `CacheHint`s still flow. // - object form → exactly what the caller asked for. 
const resolve = (policy: CachePolicy | undefined): CachePolicyObject => { - if (policy === undefined || policy === "none") return NONE - if (policy === "auto") return AUTO + if (policy === undefined || policy === "auto") return AUTO + if (policy === "none") return NONE return policy } diff --git a/packages/llm/test/cache-policy.test.ts b/packages/llm/test/cache-policy.test.ts index 640556105e..e742ca5e69 100644 --- a/packages/llm/test/cache-policy.test.ts +++ b/packages/llm/test/cache-policy.test.ts @@ -33,7 +33,7 @@ const geminiModel = Gemini.model({ }) describe("applyCachePolicy", () => { - it.effect("undefined cache leaves the request untouched (opt-in default)", () => + it.effect("undefined cache resolves to 'auto' (the recommended default)", () => Effect.gen(function* () { const prepared = yield* LLMClient.prepare( LLM.request({ @@ -43,8 +43,11 @@ describe("applyCachePolicy", () => { }), ) + // No explicit cache field → auto policy fires → last system part + latest + // user message both get cache_control markers. 
expect(prepared.body).toMatchObject({ - system: [{ type: "text", text: "You are concise.", cache_control: undefined }], + system: [{ type: "text", text: "You are concise.", cache_control: { type: "ephemeral" } }], + messages: [{ role: "user", content: [{ type: "text", text: "hi", cache_control: { type: "ephemeral" } }] }], }) }), ) @@ -252,6 +255,7 @@ describe("applyCachePolicy", () => { const request = LLM.request({ model: anthropicModel, prompt: "hi", + cache: "none", }) expect(applyCachePolicy(request)).toBe(request) }) diff --git a/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts index cee31de19b..cb144b1a5d 100644 --- a/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts +++ b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts @@ -20,6 +20,9 @@ const cacheRequest = LLM.request({ model, system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }], prompt: "Say hi.", + // Manual hint on the system part is the only marker we want here — skip the + // auto-policy's latest-user-message breakpoint so the cassette body matches. + cache: "none", generation: { maxTokens: 16, temperature: 0 }, }) diff --git a/packages/llm/test/provider/anthropic-messages.test.ts b/packages/llm/test/provider/anthropic-messages.test.ts index 3be041c94c..a867d16591 100644 --- a/packages/llm/test/provider/anthropic-messages.test.ts +++ b/packages/llm/test/provider/anthropic-messages.test.ts @@ -18,6 +18,9 @@ const request = LLM.request({ model, system: { type: "text", text: "You are concise.", cache: new CacheHint({ type: "ephemeral" }) }, prompt: "Say hello.", + // This fixture predates the `cache: "auto"` default; pin the policy off so + // existing wire-shape assertions only see the manual hint on the system part. 
+ cache: "none", generation: { maxTokens: 20, temperature: 0 }, }) @@ -48,6 +51,7 @@ describe("Anthropic Messages route", () => { LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: { query: "weather" } })]), LLM.toolMessage({ id: "call_1", name: "lookup", result: { forecast: "sunny" } }), ], + cache: "none", }), ) diff --git a/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts index 400e38849e..16c44099ce 100644 --- a/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts +++ b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts @@ -27,6 +27,9 @@ const cacheRequest = LLM.request({ model, system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }], prompt: "Say hi.", + // Manual hint on the system part is the only marker we want here — skip the + // auto-policy's latest-user-message breakpoint so the cassette body matches. + cache: "none", generation: { maxTokens: 16, temperature: 0 }, }) diff --git a/packages/llm/test/provider/bedrock-converse.test.ts b/packages/llm/test/provider/bedrock-converse.test.ts index afadd89ac7..208b565272 100644 --- a/packages/llm/test/provider/bedrock-converse.test.ts +++ b/packages/llm/test/provider/bedrock-converse.test.ts @@ -63,6 +63,9 @@ const baseRequest = LLM.request({ model, system: "You are concise.", prompt: "Say hello.", + // Wire-shape assertions in this file predate the `cache: "auto"` default; + // pin the policy off so they only exercise the lowering path itself. 
+ cache: "none", generation: { maxTokens: 64, temperature: 0 }, }) @@ -125,6 +128,7 @@ describe("Bedrock Converse route", () => { LLM.assistant([LLM.toolCall({ id: "tool_1", name: "lookup", input: { query: "weather" } })]), LLM.toolMessage({ id: "tool_1", name: "lookup", result: { forecast: "sunny" } }), ], + cache: "none", }), ) @@ -339,6 +343,7 @@ describe("Bedrock Converse route", () => { { type: "media", mediaType: "image/webp", data: "DDDD" }, ]), ], + cache: "none", }), ) @@ -470,6 +475,7 @@ describe("Bedrock Converse route", () => { LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]), LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }), ], + cache: "none", }), ) @@ -555,6 +561,7 @@ describe("Bedrock Converse recorded", () => { model: recordedModel(), system: "Reply with the single word 'Hello'.", prompt: "Say hello.", + cache: "none", generation: { maxTokens: 16, temperature: 0 }, }), ) @@ -577,6 +584,7 @@ describe("Bedrock Converse recorded", () => { prompt: "Call get_weather with city exactly Paris.", tools: [weatherTool], toolChoice: LLM.toolChoice(weatherTool), + cache: "none", generation: { maxTokens: 80, temperature: 0 }, }), ) diff --git a/packages/llm/test/recorded-scenarios.ts b/packages/llm/test/recorded-scenarios.ts index 8a02bc3a0a..127a444a16 100644 --- a/packages/llm/test/recorded-scenarios.ts +++ b/packages/llm/test/recorded-scenarios.ts @@ -51,6 +51,7 @@ export const textRequest = (input: { model: input.model, system: "You are concise.", prompt: input.prompt ?? "Reply with exactly: Hello!", + cache: "none", generation: input.temperature === false ? { maxTokens: input.maxTokens ?? 20 } @@ -70,6 +71,7 @@ export const weatherToolRequest = (input: { prompt: "Call get_weather with city exactly Paris.", tools: [weatherTool], toolChoice: LLM.toolChoice(weatherTool), + cache: "none", generation: input.temperature === false ? { maxTokens: input.maxTokens ?? 
80 } @@ -88,6 +90,7 @@ export const weatherToolLoopRequest = (input: { model: input.model, system: input.system ?? "Use the get_weather tool, then answer in one short sentence.", prompt: "What is the weather in Paris?", + cache: "none", generation: input.temperature === false ? { maxTokens: input.maxTokens ?? 80 }