mirror of
https://github.com/anomalyco/opencode.git
synced 2026-05-18 02:22:32 +00:00
chore(llm): make cache: 'auto' the default (#26798)
This commit is contained in:
@@ -35,26 +35,25 @@ Run `LLMClient.stream(request)` instead of `generate` when you want incremental
|
||||
|
||||
## Caching
|
||||
|
||||
Prompt caching is unified across providers. Mark content with a `CacheHint` and each protocol translates it to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI's implicit caching needs no markers).
|
||||
Prompt caching is **on by default**. Every `LLMRequest` resolves to `cache: "auto"` unless the caller opts out with `cache: "none"`. Each protocol translates `CacheHint`s to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI and Gemini do implicit caching server-side and don't need inline markers — `"auto"` is a no-op there).
|
||||
|
||||
### Auto placement
|
||||
|
||||
The simplest path is `cache: "auto"` on the request:
|
||||
`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
|
||||
|
||||
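The math justifies the default: Anthropic's 5-minute cache write is 1.25× base, read is 0.1×, so a single reuse within 5 minutes already wins (two calls over the same prefix cost 1.25× + 0.1× = 1.35× instead of 2× uncached). One-shot completions below the per-model minimum-cacheable-token threshold silently no-op on the wire, so the worst case there is harmless; above the threshold, a prefix that is never reused costs only the extra 0.25× write premium.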
|
||||
|
||||
### Opting out
|
||||
|
||||
```ts
|
||||
LLM.request({
|
||||
model,
|
||||
system,
|
||||
messages,
|
||||
tools,
|
||||
cache: "auto",
|
||||
prompt: "one-off question",
|
||||
cache: "none",
|
||||
})
|
||||
```
|
||||
|
||||
`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
|
||||
|
||||
On OpenAI and Gemini `"auto"` is a no-op (their wire formats don't accept inline markers — both use implicit caching). On Anthropic and Bedrock it emits provider-native cache markers.
|
||||
|
||||
### Granular policy
|
||||
|
||||
```ts
|
||||
|
||||
@@ -24,15 +24,15 @@ const AUTO: CachePolicyObject = {
|
||||
const NONE: CachePolicyObject = {}
|
||||
|
||||
// Resolution rules:
|
||||
// - undefined → "none" (opt-in default so the policy never changes wire
|
||||
// shape for existing callers; downstream code can flip to
|
||||
// `cache: "auto"` once they audit the placement choices).
|
||||
// - "auto" → the recommended policy: tools + system + latest user msg.
|
||||
// - undefined → "auto" — caching is on by default. The math favors it:
|
||||
// Anthropic 5m-cache write is 1.25x base, read is 0.1x,
|
||||
// so a single reuse within 5 minutes already wins (1.25x + 0.1x = 1.35x vs. 2x uncached).
|
||||
// - "auto" → tools + system + latest user msg.
|
||||
// - "none" → no auto placement; manual `CacheHint`s still flow.
|
||||
// - object form → exactly what the caller asked for.
|
||||
const resolve = (policy: CachePolicy | undefined): CachePolicyObject => {
|
||||
if (policy === undefined || policy === "none") return NONE
|
||||
if (policy === "auto") return AUTO
|
||||
if (policy === undefined || policy === "auto") return AUTO
|
||||
if (policy === "none") return NONE
|
||||
return policy
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ const geminiModel = Gemini.model({
|
||||
})
|
||||
|
||||
describe("applyCachePolicy", () => {
|
||||
it.effect("undefined cache leaves the request untouched (opt-in default)", () =>
|
||||
it.effect("undefined cache resolves to 'auto' (the recommended default)", () =>
|
||||
Effect.gen(function* () {
|
||||
const prepared = yield* LLMClient.prepare(
|
||||
LLM.request({
|
||||
@@ -43,8 +43,11 @@ describe("applyCachePolicy", () => {
|
||||
}),
|
||||
)
|
||||
|
||||
// No explicit cache field → auto policy fires → last system part + latest
|
||||
// user message both get cache_control markers.
|
||||
expect(prepared.body).toMatchObject({
|
||||
system: [{ type: "text", text: "You are concise.", cache_control: undefined }],
|
||||
system: [{ type: "text", text: "You are concise.", cache_control: { type: "ephemeral" } }],
|
||||
messages: [{ role: "user", content: [{ type: "text", text: "hi", cache_control: { type: "ephemeral" } }] }],
|
||||
})
|
||||
}),
|
||||
)
|
||||
@@ -252,6 +255,7 @@ describe("applyCachePolicy", () => {
|
||||
const request = LLM.request({
|
||||
model: anthropicModel,
|
||||
prompt: "hi",
|
||||
cache: "none",
|
||||
})
|
||||
expect(applyCachePolicy(request)).toBe(request)
|
||||
})
|
||||
|
||||
@@ -20,6 +20,9 @@ const cacheRequest = LLM.request({
|
||||
model,
|
||||
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
|
||||
prompt: "Say hi.",
|
||||
// Manual hint on the system part is the only marker we want here — skip the
|
||||
// auto-policy's latest-user-message breakpoint so the cassette body matches.
|
||||
cache: "none",
|
||||
generation: { maxTokens: 16, temperature: 0 },
|
||||
})
|
||||
|
||||
|
||||
@@ -18,6 +18,9 @@ const request = LLM.request({
|
||||
model,
|
||||
system: { type: "text", text: "You are concise.", cache: new CacheHint({ type: "ephemeral" }) },
|
||||
prompt: "Say hello.",
|
||||
// This fixture predates the `cache: "auto"` default; pin the policy off so
|
||||
// existing wire-shape assertions only see the manual hint on the system part.
|
||||
cache: "none",
|
||||
generation: { maxTokens: 20, temperature: 0 },
|
||||
})
|
||||
|
||||
@@ -48,6 +51,7 @@ describe("Anthropic Messages route", () => {
|
||||
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: { query: "weather" } })]),
|
||||
LLM.toolMessage({ id: "call_1", name: "lookup", result: { forecast: "sunny" } }),
|
||||
],
|
||||
cache: "none",
|
||||
}),
|
||||
)
|
||||
|
||||
|
||||
@@ -27,6 +27,9 @@ const cacheRequest = LLM.request({
|
||||
model,
|
||||
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
|
||||
prompt: "Say hi.",
|
||||
// Manual hint on the system part is the only marker we want here — skip the
|
||||
// auto-policy's latest-user-message breakpoint so the cassette body matches.
|
||||
cache: "none",
|
||||
generation: { maxTokens: 16, temperature: 0 },
|
||||
})
|
||||
|
||||
|
||||
@@ -63,6 +63,9 @@ const baseRequest = LLM.request({
|
||||
model,
|
||||
system: "You are concise.",
|
||||
prompt: "Say hello.",
|
||||
// Wire-shape assertions in this file predate the `cache: "auto"` default;
|
||||
// pin the policy off so they only exercise the lowering path itself.
|
||||
cache: "none",
|
||||
generation: { maxTokens: 64, temperature: 0 },
|
||||
})
|
||||
|
||||
@@ -125,6 +128,7 @@ describe("Bedrock Converse route", () => {
|
||||
LLM.assistant([LLM.toolCall({ id: "tool_1", name: "lookup", input: { query: "weather" } })]),
|
||||
LLM.toolMessage({ id: "tool_1", name: "lookup", result: { forecast: "sunny" } }),
|
||||
],
|
||||
cache: "none",
|
||||
}),
|
||||
)
|
||||
|
||||
@@ -339,6 +343,7 @@ describe("Bedrock Converse route", () => {
|
||||
{ type: "media", mediaType: "image/webp", data: "DDDD" },
|
||||
]),
|
||||
],
|
||||
cache: "none",
|
||||
}),
|
||||
)
|
||||
|
||||
@@ -470,6 +475,7 @@ describe("Bedrock Converse route", () => {
|
||||
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
|
||||
LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
|
||||
],
|
||||
cache: "none",
|
||||
}),
|
||||
)
|
||||
|
||||
@@ -555,6 +561,7 @@ describe("Bedrock Converse recorded", () => {
|
||||
model: recordedModel(),
|
||||
system: "Reply with the single word 'Hello'.",
|
||||
prompt: "Say hello.",
|
||||
cache: "none",
|
||||
generation: { maxTokens: 16, temperature: 0 },
|
||||
}),
|
||||
)
|
||||
@@ -577,6 +584,7 @@ describe("Bedrock Converse recorded", () => {
|
||||
prompt: "Call get_weather with city exactly Paris.",
|
||||
tools: [weatherTool],
|
||||
toolChoice: LLM.toolChoice(weatherTool),
|
||||
cache: "none",
|
||||
generation: { maxTokens: 80, temperature: 0 },
|
||||
}),
|
||||
)
|
||||
|
||||
@@ -51,6 +51,7 @@ export const textRequest = (input: {
|
||||
model: input.model,
|
||||
system: "You are concise.",
|
||||
prompt: input.prompt ?? "Reply with exactly: Hello!",
|
||||
cache: "none",
|
||||
generation:
|
||||
input.temperature === false
|
||||
? { maxTokens: input.maxTokens ?? 20 }
|
||||
@@ -70,6 +71,7 @@ export const weatherToolRequest = (input: {
|
||||
prompt: "Call get_weather with city exactly Paris.",
|
||||
tools: [weatherTool],
|
||||
toolChoice: LLM.toolChoice(weatherTool),
|
||||
cache: "none",
|
||||
generation:
|
||||
input.temperature === false
|
||||
? { maxTokens: input.maxTokens ?? 80 }
|
||||
@@ -88,6 +90,7 @@ export const weatherToolLoopRequest = (input: {
|
||||
model: input.model,
|
||||
system: input.system ?? "Use the get_weather tool, then answer in one short sentence.",
|
||||
prompt: "What is the weather in Paris?",
|
||||
cache: "none",
|
||||
generation:
|
||||
input.temperature === false
|
||||
? { maxTokens: input.maxTokens ?? 80 }
|
||||
|
||||
Reference in New Issue
Block a user