chore(llm): make cache: 'auto' the default (#26798)

This commit is contained in:
Kit Langton
2026-05-10 22:46:01 -04:00
committed by GitHub
parent 721ff5121e
commit 9b369ee815
8 changed files with 41 additions and 17 deletions

View File

@@ -35,26 +35,25 @@ Run `LLMClient.stream(request)` instead of `generate` when you want incremental
## Caching
Prompt caching is unified across providers. Mark content with a `CacheHint` and each protocol translates it to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI's implicit caching needs no markers).
Prompt caching is **on by default**. Every `LLMRequest` resolves to `cache: "auto"` unless the caller opts out with `cache: "none"`. Each protocol translates `CacheHint`s to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI and Gemini do implicit caching server-side and don't need inline markers — auto is a no-op there).
### Auto placement
Because `"auto"` is the default, no `cache` field is needed to get this placement; passing `cache: "auto"` explicitly is equivalent.
`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
The math justifies the default: Anthropic's 5-minute cache write is 1.25× base, read is 0.1×, so a single reuse within 5 minutes already wins. One-shot completions below the per-model minimum-cacheable-token threshold silently no-op on the wire, so the worst case is harmless.
### Opting out
```ts
LLM.request({
  model,
  system,
  messages,
  tools,
  prompt: "one-off question",
  // Opt out of the default "auto" policy for this request.
  cache: "none",
})
```
`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
On OpenAI and Gemini `"auto"` is a no-op (their wire formats don't accept inline markers — both use implicit caching). On Anthropic and Bedrock it emits provider-native cache markers.
### Granular policy
```ts

View File

@@ -24,15 +24,15 @@ const AUTO: CachePolicyObject = {
const NONE: CachePolicyObject = {}
// Resolution rules:
// - undefined   → "auto" — caching is on by default. The math favors it:
//                 Anthropic 5m-cache write is 1.25x base, read is 0.1x,
//                 so a single reuse within 5 minutes already wins.
// - "auto"      → tools + system + latest user msg.
// - "none"      → no auto placement; manual `CacheHint`s still flow.
// - object form → exactly what the caller asked for.
const resolve = (policy: CachePolicy | undefined): CachePolicyObject => {
  // An omitted policy and an explicit "auto" are equivalent: both yield AUTO.
  if (policy === undefined || policy === "auto") return AUTO
  // "none" is the explicit opt-out from automatic placement.
  if (policy === "none") return NONE
  // Anything else is already a CachePolicyObject; pass it through unchanged.
  return policy
}

View File

@@ -33,7 +33,7 @@ const geminiModel = Gemini.model({
})
describe("applyCachePolicy", () => {
it.effect("undefined cache leaves the request untouched (opt-in default)", () =>
it.effect("undefined cache resolves to 'auto' (the recommended default)", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
@@ -43,8 +43,11 @@ describe("applyCachePolicy", () => {
}),
)
// No explicit cache field → auto policy fires → last system part + latest
// user message both get cache_control markers.
expect(prepared.body).toMatchObject({
system: [{ type: "text", text: "You are concise.", cache_control: undefined }],
system: [{ type: "text", text: "You are concise.", cache_control: { type: "ephemeral" } }],
messages: [{ role: "user", content: [{ type: "text", text: "hi", cache_control: { type: "ephemeral" } }] }],
})
}),
)
@@ -252,6 +255,7 @@ describe("applyCachePolicy", () => {
const request = LLM.request({
model: anthropicModel,
prompt: "hi",
cache: "none",
})
expect(applyCachePolicy(request)).toBe(request)
})

View File

@@ -20,6 +20,9 @@ const cacheRequest = LLM.request({
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
// Manual hint on the system part is the only marker we want here — skip the
// auto-policy's latest-user-message breakpoint so the cassette body matches.
cache: "none",
generation: { maxTokens: 16, temperature: 0 },
})

View File

@@ -18,6 +18,9 @@ const request = LLM.request({
model,
system: { type: "text", text: "You are concise.", cache: new CacheHint({ type: "ephemeral" }) },
prompt: "Say hello.",
// This fixture predates the `cache: "auto"` default; pin the policy off so
// existing wire-shape assertions only see the manual hint on the system part.
cache: "none",
generation: { maxTokens: 20, temperature: 0 },
})
@@ -48,6 +51,7 @@ describe("Anthropic Messages route", () => {
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: { query: "weather" } })]),
LLM.toolMessage({ id: "call_1", name: "lookup", result: { forecast: "sunny" } }),
],
cache: "none",
}),
)

View File

@@ -27,6 +27,9 @@ const cacheRequest = LLM.request({
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
// Manual hint on the system part is the only marker we want here — skip the
// auto-policy's latest-user-message breakpoint so the cassette body matches.
cache: "none",
generation: { maxTokens: 16, temperature: 0 },
})

View File

@@ -63,6 +63,9 @@ const baseRequest = LLM.request({
model,
system: "You are concise.",
prompt: "Say hello.",
// Wire-shape assertions in this file predate the `cache: "auto"` default;
// pin the policy off so they only exercise the lowering path itself.
cache: "none",
generation: { maxTokens: 64, temperature: 0 },
})
@@ -125,6 +128,7 @@ describe("Bedrock Converse route", () => {
LLM.assistant([LLM.toolCall({ id: "tool_1", name: "lookup", input: { query: "weather" } })]),
LLM.toolMessage({ id: "tool_1", name: "lookup", result: { forecast: "sunny" } }),
],
cache: "none",
}),
)
@@ -339,6 +343,7 @@ describe("Bedrock Converse route", () => {
{ type: "media", mediaType: "image/webp", data: "DDDD" },
]),
],
cache: "none",
}),
)
@@ -470,6 +475,7 @@ describe("Bedrock Converse route", () => {
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
],
cache: "none",
}),
)
@@ -555,6 +561,7 @@ describe("Bedrock Converse recorded", () => {
model: recordedModel(),
system: "Reply with the single word 'Hello'.",
prompt: "Say hello.",
cache: "none",
generation: { maxTokens: 16, temperature: 0 },
}),
)
@@ -577,6 +584,7 @@ describe("Bedrock Converse recorded", () => {
prompt: "Call get_weather with city exactly Paris.",
tools: [weatherTool],
toolChoice: LLM.toolChoice(weatherTool),
cache: "none",
generation: { maxTokens: 80, temperature: 0 },
}),
)

View File

@@ -51,6 +51,7 @@ export const textRequest = (input: {
model: input.model,
system: "You are concise.",
prompt: input.prompt ?? "Reply with exactly: Hello!",
cache: "none",
generation:
input.temperature === false
? { maxTokens: input.maxTokens ?? 20 }
@@ -70,6 +71,7 @@ export const weatherToolRequest = (input: {
prompt: "Call get_weather with city exactly Paris.",
tools: [weatherTool],
toolChoice: LLM.toolChoice(weatherTool),
cache: "none",
generation:
input.temperature === false
? { maxTokens: input.maxTokens ?? 80 }
@@ -88,6 +90,7 @@ export const weatherToolLoopRequest = (input: {
model: input.model,
system: input.system ?? "Use the get_weather tool, then answer in one short sentence.",
prompt: "What is the weather in Paris?",
cache: "none",
generation:
input.temperature === false
? { maxTokens: input.maxTokens ?? 80 }