Mirror of https://github.com/anomalyco/opencode.git, synced 2026-05-16 01:22:58 +00:00
feat(llm): cache hint TTL, breakpoint cap, and tool placement (#26779)
@@ -16,6 +16,7 @@ import {
type ToolResultPart,
} from "../schema"
import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
import * as Cache from "./utils/cache"
import { ToolStream } from "./utils/tool-stream"

const ADAPTER = "anthropic-messages"
@@ -25,7 +26,10 @@ export const PATH = "/messages"
// =============================================================================
// Request Body Schema
// =============================================================================
const AnthropicCacheControl = Schema.Struct({ type: Schema.tag("ephemeral") })
const AnthropicCacheControl = Schema.Struct({
type: Schema.tag("ephemeral"),
ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
})

const AnthropicTextBlock = Schema.Struct({
type: Schema.tag("text"),
@@ -193,8 +197,24 @@ const invalid = ProviderShared.invalidRequest
// =============================================================================
// Request Lowering
// =============================================================================
const cacheControl = (cache: CacheHint | undefined) =>
cache?.type === "ephemeral" ? { type: "ephemeral" as const } : undefined
// Anthropic accepts at most 4 explicit cache_control breakpoints per request,
// across `tools`, `system`, and `messages`. Beyond the cap the API returns a
// 400 — so the lowering layer counts emitted markers and silently drops any
// that exceed it.
const ANTHROPIC_BREAKPOINT_CAP = 4

const EPHEMERAL_5M = { type: "ephemeral" as const }
const EPHEMERAL_1H = { type: "ephemeral" as const, ttl: "1h" as const }

const cacheControl = (breakpoints: Cache.Breakpoints, cache: CacheHint | undefined) => {
if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
if (breakpoints.remaining <= 0) {
breakpoints.dropped += 1
return undefined
}
breakpoints.remaining -= 1
return Cache.ttlBucket(cache.ttlSeconds) === "1h" ? EPHEMERAL_1H : EPHEMERAL_5M
}

const anthropicMetadata = (metadata: Record<string, unknown>): ProviderMetadata => ({ anthropic: metadata })

@@ -204,10 +224,11 @@ const signatureFromMetadata = (metadata: ProviderMetadata | undefined): string |
return typeof anthropic.signature === "string" ? anthropic.signature : undefined
}

const lowerTool = (tool: ToolDefinition): AnthropicTool => ({
const lowerTool = (breakpoints: Cache.Breakpoints, tool: ToolDefinition): AnthropicTool => ({
name: tool.name,
description: tool.description,
input_schema: tool.inputSchema,
cache_control: cacheControl(breakpoints, tool.cache),
})

const lowerToolChoice = (toolChoice: NonNullable<LLMRequest["toolChoice"]>) =>
@@ -249,7 +270,10 @@ const lowerServerToolResult = Effect.fn("AnthropicMessages.lowerServerToolResult
return { type: wireType, tool_use_id: part.id, content: part.result.value } satisfies AnthropicServerToolResultBlock
})

const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (request: LLMRequest) {
const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
request: LLMRequest,
breakpoints: Cache.Breakpoints,
) {
const messages: AnthropicMessage[] = []

for (const message of request.messages) {
@@ -258,7 +282,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["text"]))
return yield* ProviderShared.unsupportedContent("Anthropic Messages", "user", ["text"])
content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
}
messages.push({ role: "user", content })
continue
@@ -268,7 +292,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
const content: AnthropicAssistantBlock[] = []
for (const part of message.content) {
if (part.type === "text") {
content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
continue
}
if (part.type === "reasoning") {
@@ -304,6 +328,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
tool_use_id: part.id,
content: ProviderShared.toolResultText(part),
is_error: part.result.type === "error" ? true : undefined,
cache_control: cacheControl(breakpoints, part.cache),
})
}
messages.push({ role: "user", content })
@@ -330,18 +355,33 @@ const lowerThinking = Effect.fn("AnthropicMessages.lowerThinking")(function* (re
const fromRequest = Effect.fn("AnthropicMessages.fromRequest")(function* (request: LLMRequest) {
const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
const generation = request.generation
// Allocate the 4-breakpoint budget in invalidation order: tools → system →
// messages. Tools live highest in the cache hierarchy, so when callers
// over-mark we keep their tool hints and shed the message-tail ones first.
const breakpoints = Cache.newBreakpoints(ANTHROPIC_BREAKPOINT_CAP)
const tools =
request.tools.length === 0 || request.toolChoice?.type === "none"
? undefined
: request.tools.map((tool) => lowerTool(breakpoints, tool))
const system =
request.system.length === 0
? undefined
: request.system.map((part) => ({
type: "text" as const,
text: part.text,
cache_control: cacheControl(breakpoints, part.cache),
}))
const messages = yield* lowerMessages(request, breakpoints)
if (breakpoints.dropped > 0) {
yield* Effect.logWarning(
`Anthropic Messages: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${ANTHROPIC_BREAKPOINT_CAP} per request.`,
)
}
return {
model: request.model.id,
system:
request.system.length === 0
? undefined
: request.system.map((part) => ({
type: "text" as const,
text: part.text,
cache_control: cacheControl(part.cache),
})),
messages: yield* lowerMessages(request),
tools: request.tools.length === 0 || request.toolChoice?.type === "none" ? undefined : request.tools.map(lowerTool),
system,
messages,
tools,
tool_choice: toolChoice,
stream: true as const,
max_tokens: generation?.maxTokens ?? request.model.limits.output ?? 4096,
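Not part of the diff: a minimal sketch of the budget semantics the Anthropic lowering above relies on, using `newBreakpoints` from the new `utils/cache.ts` helper further down; the inline hint objects are illustrative stand-ins for `CacheHint` values.

import { newBreakpoints } from "./utils/cache"

// Five ephemeral hints against a cap of 4: four markers are emitted, the
// fifth is silently dropped and only counted toward the warning log.
const breakpoints = newBreakpoints(4)
const hints = ["a", "b", "c", "d", "e"].map(() => ({ type: "ephemeral" as const }))
const emitted = hints.filter(() => {
  if (breakpoints.remaining <= 0) {
    breakpoints.dropped += 1
    return false
  }
  breakpoints.remaining -= 1
  return true
})
// emitted.length === 4, breakpoints.dropped === 1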
@@ -108,7 +108,7 @@ type BedrockMessage = Schema.Schema.Type<typeof BedrockMessage>
const BedrockSystemBlock = Schema.Union([BedrockTextBlock, BedrockCache.CachePointBlock])
type BedrockSystemBlock = Schema.Schema.Type<typeof BedrockSystemBlock>

const BedrockTool = Schema.Struct({
const BedrockToolSpec = Schema.Struct({
toolSpec: Schema.Struct({
name: Schema.String,
description: Schema.String,
@@ -117,6 +117,9 @@ const BedrockTool = Schema.Struct({
}),
}),
})
type BedrockToolSpec = Schema.Schema.Type<typeof BedrockToolSpec>

const BedrockTool = Schema.Union([BedrockToolSpec, BedrockCache.CachePointBlock])
type BedrockTool = Schema.Schema.Type<typeof BedrockTool>

const BedrockToolChoice = Schema.Union([
@@ -214,7 +217,7 @@ type BedrockEvent = Schema.Schema.Type<typeof BedrockEvent>
// =============================================================================
// Request Lowering
// =============================================================================
const lowerTool = (tool: ToolDefinition): BedrockTool => ({
const lowerToolSpec = (tool: ToolDefinition): BedrockToolSpec => ({
toolSpec: {
name: tool.name,
description: tool.description,
@@ -222,11 +225,25 @@ const lowerTool = (tool: ToolDefinition): BedrockTool => ({
},
})

const lowerTools = (
breakpoints: BedrockCache.Breakpoints,
tools: ReadonlyArray<ToolDefinition>,
): BedrockTool[] => {
const result: BedrockTool[] = []
for (const tool of tools) {
result.push(lowerToolSpec(tool))
const cachePoint = BedrockCache.block(breakpoints, tool.cache)
if (cachePoint) result.push(cachePoint)
}
return result
}

const textWithCache = (
breakpoints: BedrockCache.Breakpoints,
text: string,
cache: CacheHint | undefined,
): Array<BedrockTextBlock | BedrockCache.CachePointBlock> => {
const cachePoint = BedrockCache.block(cache)
const cachePoint = BedrockCache.block(breakpoints, cache)
return cachePoint ? [{ text }, cachePoint] : [{ text }]
}

@@ -257,7 +274,10 @@ const lowerToolResult = (part: ToolResultPart): BedrockToolResultBlock => ({
},
})

const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (request: LLMRequest) {
const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (
request: LLMRequest,
breakpoints: BedrockCache.Breakpoints,
) {
const messages: BedrockMessage[] = []

for (const message of request.messages) {
@@ -267,7 +287,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
if (!ProviderShared.supportsContent(part, ["text", "media"]))
return yield* ProviderShared.unsupportedContent("Bedrock Converse", "user", ["text", "media"])
if (part.type === "text") {
content.push(...textWithCache(part.text, part.cache))
content.push(...textWithCache(breakpoints, part.text, part.cache))
continue
}
if (part.type === "media") {
@@ -289,7 +309,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
"tool-call",
])
if (part.type === "text") {
content.push(...textWithCache(part.text, part.cache))
content.push(...textWithCache(breakpoints, part.text, part.cache))
continue
}
if (part.type === "reasoning") {
@@ -309,11 +329,13 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
continue
}

const content: BedrockToolResultBlock[] = []
const content: BedrockUserBlock[] = []
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["tool-result"]))
return yield* ProviderShared.unsupportedContent("Bedrock Converse", "tool", ["tool-result"])
content.push(lowerToolResult(part))
const cachePoint = BedrockCache.block(breakpoints, part.cache)
if (cachePoint) content.push(cachePoint)
}
messages.push({ role: "user", content })
}
@@ -323,16 +345,32 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ

// System prompts share the cache-point convention: emit the text block, then
// optionally a positional `cachePoint` marker.
const lowerSystem = (system: ReadonlyArray<LLMRequest["system"][number]>): BedrockSystemBlock[] =>
system.flatMap((part) => textWithCache(part.text, part.cache))
const lowerSystem = (
breakpoints: BedrockCache.Breakpoints,
system: ReadonlyArray<LLMRequest["system"][number]>,
): BedrockSystemBlock[] => system.flatMap((part) => textWithCache(breakpoints, part.text, part.cache))

const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request: LLMRequest) {
const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
const generation = request.generation
// Bedrock-Claude shares Anthropic's 4-breakpoint cap. Spend the budget in
// tools → system → messages order to favour the highest-impact prefixes.
const breakpoints = BedrockCache.breakpoints()
const toolConfig =
request.tools.length > 0 && request.toolChoice?.type !== "none"
? { tools: lowerTools(breakpoints, request.tools), toolChoice }
: undefined
const system = request.system.length === 0 ? undefined : lowerSystem(breakpoints, request.system)
const messages = yield* lowerMessages(request, breakpoints)
if (breakpoints.dropped > 0) {
yield* Effect.logWarning(
`Bedrock Converse: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${BedrockCache.BEDROCK_BREAKPOINT_CAP} per request.`,
)
}
return {
modelId: request.model.id,
messages: yield* lowerMessages(request),
system: request.system.length === 0 ? undefined : lowerSystem(request.system),
messages,
system,
inferenceConfig:
generation?.maxTokens === undefined &&
generation?.temperature === undefined &&
@@ -345,10 +383,7 @@ const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request:
topP: generation?.topP,
stopSequences: generation?.stop,
},
toolConfig:
request.tools.length > 0 && request.toolChoice?.type !== "none"
? { tools: request.tools.map(lowerTool), toolChoice }
: undefined,
toolConfig,
}
})
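Not part of the diff: a small sketch of the positional convention the Bedrock lowering above follows. The marker is a sibling `cachePoint` block appended after the content it caches, rather than an inline `cache_control` attribute as on Anthropic; the literal shapes mirror the route-test expectations further down.

// A marked system part lowers to its text block followed by a cachePoint:
const markedSystem = [
  { text: "You are a helpful assistant." },
  { cachePoint: { type: "default" as const } }, // caches everything up to this block
]

// An unmarked part lowers to just the text block and spends no budget:
const unmarkedSystem = [{ text: "You are a helpful assistant." }]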
@@ -1,20 +1,37 @@
import { Schema } from "effect"
import type { CacheHint } from "../../schema"
import { newBreakpoints, ttlBucket, type Breakpoints } from "./cache"

// Bedrock cache markers are positional: emit a `cachePoint` block immediately
// after the content the caller wants treated as a cacheable prefix.
// after the content the caller wants treated as a cacheable prefix. Bedrock
// accepts optional `ttl: "5m" | "1h"` on cachePoint, mirroring Anthropic.
export const CachePointBlock = Schema.Struct({
cachePoint: Schema.Struct({ type: Schema.tag("default") }),
cachePoint: Schema.Struct({
type: Schema.tag("default"),
ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
}),
})
export type CachePointBlock = Schema.Schema.Type<typeof CachePointBlock>

// Bedrock recently added optional `ttl: "5m" | "1h"` on cachePoint. Map
// `CacheHint.ttlSeconds` here once a recorded cassette validates the wire shape.
const DEFAULT: CachePointBlock = { cachePoint: { type: "default" } }
// Bedrock-Claude enforces the same 4-breakpoint cap as the Anthropic Messages
// API. Callers pass a shared counter through every `block()` call site so the
// budget is respected across `system`, `messages`, and `tools`.
export const BEDROCK_BREAKPOINT_CAP = 4

export const block = (cache: CacheHint | undefined): CachePointBlock | undefined => {
export type { Breakpoints } from "./cache"
export const breakpoints = () => newBreakpoints(BEDROCK_BREAKPOINT_CAP)

const DEFAULT_5M: CachePointBlock = { cachePoint: { type: "default" } }
const DEFAULT_1H: CachePointBlock = { cachePoint: { type: "default", ttl: "1h" } }

export const block = (breakpoints: Breakpoints, cache: CacheHint | undefined): CachePointBlock | undefined => {
if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
return DEFAULT
if (breakpoints.remaining <= 0) {
breakpoints.dropped += 1
return undefined
}
breakpoints.remaining -= 1
return ttlBucket(cache.ttlSeconds) === "1h" ? DEFAULT_1H : DEFAULT_5M
}

export * as BedrockCache from "./bedrock-cache"
packages/llm/src/protocols/utils/cache.ts (new file, +16)
@@ -0,0 +1,16 @@
// Shared helpers for provider cache-marker lowering. Anthropic and Bedrock
// both enforce a 4-breakpoint cap per request and accept the same `5m`/`1h`
// TTL buckets, so the counter and TTL mapping live here.

export interface Breakpoints {
remaining: number
dropped: number
}

export const newBreakpoints = (cap: number): Breakpoints => ({ remaining: cap, dropped: 0 })

// Returns `"1h"` for any `ttlSeconds >= 3600`, otherwise `undefined` (the
// provider default 5m). Anthropic & Bedrock both treat anything shorter than
// an hour as 5m.
export const ttlBucket = (ttlSeconds: number | undefined): "1h" | undefined =>
ttlSeconds !== undefined && ttlSeconds >= 3600 ? "1h" : undefined
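For orientation (not part of the diff), the bucketing rule above maps hints like this:

ttlBucket(undefined) // undefined: no hint, the provider default of 5m applies
ttlBucket(300)       // undefined: anything under an hour stays on the 5m default
ttlBucket(3600)      // "1h"
ttlBucket(86400)     // "1h": longer TTLs clamp down to the 1h bucket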
@@ -79,6 +79,7 @@ export const ToolResultPart = Object.assign(
name: Schema.String,
result: ToolResultValue,
providerExecuted: Schema.optional(Schema.Boolean),
cache: Schema.optional(CacheHint),
metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
providerMetadata: Schema.optional(ProviderMetadata),
}).annotate({ identifier: "LLM.Content.ToolResult" }),
@@ -94,6 +95,7 @@ export const ToolResultPart = Object.assign(
name: input.name,
result: ToolResultValue.make(input.result, input.resultType),
providerExecuted: input.providerExecuted,
cache: input.cache,
metadata: input.metadata,
providerMetadata: input.providerMetadata,
}),
@@ -151,6 +153,7 @@ export class ToolDefinition extends Schema.Class<ToolDefinition>("LLM.ToolDefini
name: Schema.String,
description: Schema.String,
inputSchema: JsonSchema,
cache: Schema.optional(CacheHint),
metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
}) {}
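Not part of the diff: a hedged sketch of the new optional `cache` field in use, modeled on the tests below. Whether `LLM.toolDefinition` accepts exactly this shape is an assumption; the route tests pass equivalent plain objects to `LLM.request`.

// Illustrative only: attach a CacheHint to a tool definition so the
// Anthropic/Bedrock lowering can spend a breakpoint on it.
const lookupTool = LLM.toolDefinition({
  name: "lookup",
  description: "lookup tool",
  inputSchema: { type: "object", properties: {} }, // assumed field shape
  cache: new CacheHint({ type: "ephemeral", ttlSeconds: 3600 }), // lowers to ttl "1h" where supported
})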
@@ -0,0 +1,48 @@
import { Redactor } from "@opencode-ai/http-recorder"
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { CacheHint, LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as AnthropicMessages from "../../src/protocols/anthropic-messages"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"

const model = AnthropicMessages.model({
id: "claude-haiku-4-5-20251001",
apiKey: process.env.ANTHROPIC_API_KEY ?? "fixture",
})

// Two identical generations in a row. The first call writes the prefix into
// Anthropic's cache; the second should report a cache read against the same
// prefix. Cassette captures both interactions in order.
const cacheRequest = LLM.request({
id: "recorded_anthropic_cache",
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})

const recorded = recordedTests({
prefix: "anthropic-messages-cache",
provider: "anthropic",
protocol: "anthropic-messages",
requires: ["ANTHROPIC_API_KEY"],
options: { redactor: Redactor.defaults({ requestHeaders: { allow: ["content-type", "anthropic-version"] } }) },
})

describe("Anthropic Messages cache recorded", () => {
recorded.effect.with("writes then reads cache_control on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
// The first call may write the cache (cacheWriteInputTokens > 0) or it
// may be a fresh miss (both fields 0) depending on whether the prefix is
// already warm on Anthropic's side. The assertion that matters is that
// the SECOND call reports a non-zero cache read.
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)

const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})
@@ -374,4 +374,134 @@ describe("Anthropic Messages route", () => {
expect(error.message).toContain("Anthropic Messages user messages only support text content for now")
}),
)

it.effect("maps ttlSeconds >= 3600 to cache_control ttl: '1h'", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: { type: "text", text: "system", cache: new CacheHint({ type: "ephemeral", ttlSeconds: 3600 }) },
prompt: "hi",
}),
)

expect(prepared.body).toMatchObject({
system: [{ type: "text", text: "system", cache_control: { type: "ephemeral", ttl: "1h" } }],
})
}),
)

it.effect("emits cache_control on tool definitions and tool-result blocks", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{
name: "lookup",
description: "lookup tool",
inputSchema: { type: "object", properties: {} },
cache: new CacheHint({ type: "ephemeral" }),
},
],
messages: [
LLM.user("What's the weather?"),
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({
id: "call_1",
name: "lookup",
result: { temp: 72 },
cache: new CacheHint({ type: "ephemeral" }),
}),
],
}),
)

expect(prepared.body).toMatchObject({
tools: [{ name: "lookup", cache_control: { type: "ephemeral" } }],
messages: [
{ role: "user", content: [{ type: "text", text: "What's the weather?" }] },
{ role: "assistant", content: [{ type: "tool_use", id: "call_1", name: "lookup" }] },
{
role: "user",
content: [{ type: "tool_result", tool_use_id: "call_1", cache_control: { type: "ephemeral" } }],
},
],
})
}),
)

it.effect("drops cache_control breakpoints past the 4-per-request cap", () =>
Effect.gen(function* () {
const hint = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [
{ type: "text", text: "a", cache: hint },
{ type: "text", text: "b", cache: hint },
{ type: "text", text: "c", cache: hint },
{ type: "text", text: "d", cache: hint },
{ type: "text", text: "e", cache: hint },
{ type: "text", text: "f", cache: hint },
],
prompt: "hi",
}),
)

const system = (prepared.body as { system: Array<{ cache_control?: unknown }> }).system
const marked = system.filter((part) => part.cache_control !== undefined)
expect(marked).toHaveLength(4)
expect(system[4]?.cache_control).toBeUndefined()
expect(system[5]?.cache_control).toBeUndefined()
}),
)

it.effect("spends breakpoint budget on tools before system before messages", () =>
Effect.gen(function* () {
const hint = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{
name: "t1",
description: "t1",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t2",
description: "t2",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t3",
description: "t3",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t4",
description: "t4",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
],
system: [{ type: "text", text: "system-tail", cache: hint }],
messages: [LLM.user([{ type: "text", text: "message-tail", cache: hint }])],
}),
)

const body = prepared.body as {
tools: Array<{ cache_control?: unknown }>
system: Array<{ cache_control?: unknown }>
messages: Array<{ content: Array<{ cache_control?: unknown }> }>
}
expect(body.tools.every((t) => t.cache_control !== undefined)).toBe(true)
expect(body.system[0]?.cache_control).toBeUndefined()
expect(body.messages[0]?.content[0]?.cache_control).toBeUndefined()
}),
)
})
@@ -0,0 +1,50 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { CacheHint, LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as BedrockConverse from "../../src/protocols/bedrock-converse"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"

const RECORDING_REGION = process.env.BEDROCK_RECORDING_REGION ?? "us-east-1"

// Use a Claude model on Bedrock — Nova has automatic prefix caching that
// doesn't reliably surface `cacheRead`/`cacheWrite` in usage, so the second
// call wouldn't deterministically prove cache mapping works. Override with
// BEDROCK_CACHE_MODEL_ID if your account has access elsewhere.
const model = BedrockConverse.model({
id: process.env.BEDROCK_CACHE_MODEL_ID ?? "us.anthropic.claude-haiku-4-5-20251001-v1:0",
credentials: {
region: RECORDING_REGION,
accessKeyId: process.env.AWS_ACCESS_KEY_ID ?? "fixture",
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY ?? "fixture",
sessionToken: process.env.AWS_SESSION_TOKEN,
},
})

const cacheRequest = LLM.request({
id: "recorded_bedrock_cache",
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})

const recorded = recordedTests({
prefix: "bedrock-converse-cache",
provider: "amazon-bedrock",
protocol: "bedrock-converse",
requires: ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
})

describe("Bedrock Converse cache recorded", () => {
recorded.effect.with("writes then reads cachePoint on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)

const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})
@@ -440,6 +440,79 @@ describe("Bedrock Converse route", () => {
expect(error.message).toContain("Bedrock Converse does not support media type application/x-tar")
}),
)

it.effect("maps ttlSeconds >= 3600 to cachePoint ttl: '1h'", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral", ttlSeconds: 3600 })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [{ type: "text", text: "system", cache }],
prompt: "hi",
}),
)

expect(prepared.body).toMatchObject({
system: [{ text: "system" }, { cachePoint: { type: "default", ttl: "1h" } }],
})
}),
)

it.effect("appends cachePoint after marked tool definitions and tool-result blocks", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{ name: "lookup", description: "lookup", inputSchema: { type: "object", properties: {} }, cache },
],
messages: [
LLM.user("What's the weather?"),
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
],
}),
)

expect(prepared.body).toMatchObject({
toolConfig: {
tools: [{ toolSpec: { name: "lookup" } }, { cachePoint: { type: "default" } }],
},
messages: [
{ role: "user", content: [{ text: "What's the weather?" }] },
{ role: "assistant", content: [{ toolUse: { toolUseId: "call_1" } }] },
{
role: "user",
content: [{ toolResult: { toolUseId: "call_1" } }, { cachePoint: { type: "default" } }],
},
],
})
}),
)

it.effect("drops cachePoint markers past the 4-per-request cap", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [
{ type: "text", text: "a", cache },
{ type: "text", text: "b", cache },
{ type: "text", text: "c", cache },
{ type: "text", text: "d", cache },
{ type: "text", text: "e", cache },
{ type: "text", text: "f", cache },
],
prompt: "hi",
}),
)

const system = (prepared.body as { system: Array<{ cachePoint?: unknown }> }).system
expect(system.filter((part) => "cachePoint" in part)).toHaveLength(4)
}),
)
})

// Live recorded integration tests. Run with `RECORD=true AWS_ACCESS_KEY_ID=...
packages/llm/test/provider/gemini-cache.recorded.test.ts (new file, +47)
@@ -0,0 +1,47 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as Gemini from "../../src/protocols/gemini"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"

const model = Gemini.model({
id: "gemini-2.5-flash",
apiKey: process.env.GEMINI_API_KEY ?? "fixture",
})

// Gemini does implicit prefix caching on 2.5+ models above ~1024 tokens. The
// `CacheHint` is currently a no-op for Gemini (the explicit `CachedContent`
// API is out-of-band and intentionally not wired up). This test exists to
// pin the usage-parsing path: `cachedContentTokenCount` should surface as
// `cacheReadInputTokens` on the second identical call.
const cacheRequest = LLM.request({
id: "recorded_gemini_cache",
model,
system: LARGE_CACHEABLE_SYSTEM,
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})

const recorded = recordedTests({
prefix: "gemini-cache",
provider: "google",
protocol: "gemini",
requires: ["GEMINI_API_KEY"],
})

describe("Gemini cache recorded", () => {
recorded.effect.with("reports cachedContentTokenCount on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)

const second = yield* LLMClient.generate(cacheRequest)
// Implicit caching is best-effort on Gemini's side; we assert the field
// is at least populated and non-negative. When re-recording, verify the
// cassette shows > 0 in the second response's usage.
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
}),
)
})
@@ -0,0 +1,44 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as OpenAIResponses from "../../src/protocols/openai-responses"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"

const model = OpenAIResponses.model({
id: "gpt-4.1-mini",
apiKey: process.env.OPENAI_API_KEY ?? "fixture",
})

// OpenAI caches prefixes automatically once they cross the 1024-token threshold;
// `CacheHint` is a no-op for the wire body. The stable signal is the
// `prompt_cache_key` routing hint, which keeps repeated calls on the same shard
// so cache hits are observable.
const cacheRequest = LLM.request({
id: "recorded_openai_responses_cache",
model,
system: LARGE_CACHEABLE_SYSTEM,
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
providerOptions: { openai: { promptCacheKey: "recorded-cache-test" } },
})

const recorded = recordedTests({
prefix: "openai-responses-cache",
provider: "openai",
protocol: "openai-responses",
requires: ["OPENAI_API_KEY"],
})

describe("OpenAI Responses cache recorded", () => {
recorded.effect.with("reports cached_tokens on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)

const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})
@@ -6,6 +6,19 @@ import { tool } from "../src/tool"

export const weatherToolName = "get_weather"

// A deterministic system prompt long enough to clear every supported provider's
// minimum cacheable-prefix threshold (Anthropic Haiku 3.5: 2048 tokens; Anthropic
// Opus/Haiku 4.5: 4096 tokens; OpenAI/Gemini/Bedrock: lower). Built by repeating
// a fixed sentence — the cassette replays bit-for-bit, so the exact text matters
// only when re-recording with `RECORD=true`.
export const LARGE_CACHEABLE_SYSTEM = (() => {
const sentence =
"You are a concise, factual assistant. Answer precisely and avoid filler. Cite numbers when known. "
// ~100 chars per sentence × 250 repeats ≈ 25,000 chars ≈ 5k+ tokens, safely
// above every provider's threshold.
return sentence.repeat(250)
})()

export const weatherTool = LLM.toolDefinition({
name: weatherToolName,
description: "Get current weather for a city.",
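A rough sanity check of the size estimate in the `LARGE_CACHEABLE_SYSTEM` comment above (not part of the diff); the ~4 characters-per-token ratio is a heuristic, not a measured count.

const sentence =
  "You are a concise, factual assistant. Answer precisely and avoid filler. Cite numbers when known. "
const chars = sentence.length * 250        // roughly 25,000 characters
const approxTokens = Math.round(chars / 4) // roughly 6,250 tokens at ~4 chars/token, above the 4096-token threshold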