// opencode/packages/llm/src/cache-policy.ts

// Apply an `LLMRequest.cache` policy by injecting `CacheHint`s onto the parts
// the policy designates. Runs once at compile time, before the per-protocol
// body builder, so the existing inline-hint lowering path handles the rest.
//
// The default `"auto"` shape places one breakpoint at the last tool definition,
// one at the last system part, and one at the latest user message. This
// matches what production agent harnesses (LangChain's caching middleware,
// kern-ai's 10x cost-reduction playbook) converge on for tool-use loops: the
// latest user message stays put while a single turn explodes into many
// assistant/tool round-trips, so caching at that boundary lets every
// intra-turn API call hit the prefix.
//
// Manual `cache: CacheHint` placements on individual parts are preserved —
// this function only fills gaps the caller left empty.
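//
// Illustrative example (shapes sketched for exposition, not exact constructor
// calls): a request with three tools, two system parts, and messages
// [user, assistant, tool, user] comes out of the "auto" policy with one hint on
// tools[2], one on system[1], and one on the last text part of messages[3].
// Assistant and tool-result messages appended later in the same turn then
// extend past that cached prefix instead of invalidating it.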
import { CacheHint, type CachePolicy, type CachePolicyObject } from "./schema/options"
import { LLMRequest, Message, ToolDefinition, type ContentPart } from "./schema/messages"

const AUTO: CachePolicyObject = {
  tools: true,
  system: true,
  messages: "latest-user-message",
}
const NONE: CachePolicyObject = {}

// Resolution rules:
// - undefined → "auto" — caching is on by default. The math favors it:
//   Anthropic 5m-cache write is 1.25x base, read is 0.1x,
//   so a single reuse within 5 minutes already wins.
// - "auto" → tools + system + latest user msg.
// - "none" → no auto placement; manual `CacheHint`s still flow.
// - object form → exactly what the caller asked for.
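//   e.g. (a hypothetical caller config) `cache: { system: true, messages: { tail: 2 },
//   ttlSeconds: 300 }` hints the last system part and the last two messages, and
//   leaves tools alone.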
const resolve = (policy: CachePolicy | undefined): CachePolicyObject => {
  if (policy === undefined || policy === "auto") return AUTO
  if (policy === "none") return NONE
  return policy
}

// Protocols whose wire format respects inline cache markers. Everything else
// (OpenAI's implicit prefix caching, Gemini's implicit + out-of-band
// CachedContent) ignores them, so skip the whole policy pass for those routes —
// emitting hints would be harmless but pointless.
const RESPECTS_INLINE_HINTS = new Set(["anthropic-messages", "bedrock-converse"])

const makeHint = (ttlSeconds: number | undefined): CacheHint =>
  ttlSeconds !== undefined ? new CacheHint({ type: "ephemeral", ttlSeconds }) : new CacheHint({ type: "ephemeral" })

const markLastTool = (tools: ReadonlyArray<ToolDefinition>, hint: CacheHint): ReadonlyArray<ToolDefinition> => {
  if (tools.length === 0) return tools
  const last = tools.length - 1
  if (tools[last]!.cache) return tools
  return tools.map((tool, i) => (i === last ? new ToolDefinition({ ...tool, cache: hint }) : tool))
}

const markLastSystem = (system: LLMRequest["system"], hint: CacheHint): LLMRequest["system"] => {
  if (system.length === 0) return system
  const last = system.length - 1
  if (system[last]!.cache) return system
  return system.map((part, i) => (i === last ? { ...part, cache: hint } : part))
}

const lastIndexOfRole = (messages: ReadonlyArray<Message>, role: Message["role"]): number =>
  messages.findLastIndex((m) => m.role === role)

// Mark the last text part of `messages[index]`. If no text part exists, mark
// the last content part regardless of type — that's the breakpoint position
// in tool-result-only messages too.
const markMessageAt = (messages: ReadonlyArray<Message>, index: number, hint: CacheHint): ReadonlyArray<Message> => {
  if (index < 0 || index >= messages.length) return messages
  const target = messages[index]!
  if (target.content.length === 0) return messages
  const lastTextIndex = target.content.findLastIndex((part) => part.type === "text")
  const markAt = lastTextIndex >= 0 ? lastTextIndex : target.content.length - 1
  const existing = target.content[markAt]!
  if ("cache" in existing && existing.cache) return messages
  const nextContent = target.content.map((part, i) => (i === markAt ? ({ ...part, cache: hint } as ContentPart) : part))
  const next = new Message({ ...target, content: nextContent })
  // Single pass over `messages`, substituting the one updated entry. Long
  // conversations call this on every request, so avoid `.map()` here — its
  // closure dispatch and identity copies show up in profiling.
  const result = messages.slice()
  result[index] = next
  return result
}
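
// Message strategies, as a quick reference:
//   "latest-user-message" → hint the most recent user message (the "auto" default)
//   "latest-assistant"    → hint the most recent assistant message
//   { tail: n }           → hint each of the last n messages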
const markMessages = (
  messages: ReadonlyArray<Message>,
  strategy: NonNullable<CachePolicyObject["messages"]>,
  hint: CacheHint,
): ReadonlyArray<Message> => {
  if (messages.length === 0) return messages
  if (strategy === "latest-user-message") return markMessageAt(messages, lastIndexOfRole(messages, "user"), hint)
  if (strategy === "latest-assistant") return markMessageAt(messages, lastIndexOfRole(messages, "assistant"), hint)
  const start = Math.max(0, messages.length - strategy.tail)
  let next = messages
  for (let i = start; i < messages.length; i++) next = markMessageAt(next, i, hint)
  return next
}

export const applyCachePolicy = (request: LLMRequest): LLMRequest => {
  if (!RESPECTS_INLINE_HINTS.has(request.model.route)) return request
  const policy = resolve(request.cache)
  if (!policy.tools && !policy.system && !policy.messages) return request
  const hint = makeHint(policy.ttlSeconds)
  const tools = policy.tools ? markLastTool(request.tools, hint) : request.tools
  const system = policy.system ? markLastSystem(request.system, hint) : request.system
  const messages = policy.messages ? markMessages(request.messages, policy.messages, hint) : request.messages
  if (tools === request.tools && system === request.system && messages === request.messages) return request
  return LLMRequest.update(request, { tools, system, messages })
}
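
// Usage sketch (illustrative; `buildBody` is a hypothetical stand-in for the
// per-protocol body builder named in the header comment):
//
//   const prepared = applyCachePolicy(request) // no-op for routes that ignore inline hints
//   const body = buildBody(prepared)           // existing lowering turns `cache` hints into wire markers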