diff --git a/packages/llm/src/protocols/anthropic-messages.ts b/packages/llm/src/protocols/anthropic-messages.ts
index fba785373d..a426807c02 100644
--- a/packages/llm/src/protocols/anthropic-messages.ts
+++ b/packages/llm/src/protocols/anthropic-messages.ts
@@ -16,6 +16,7 @@ import {
   type ToolResultPart,
 } from "../schema"
 import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
+import * as Cache from "./utils/cache"
 import { ToolStream } from "./utils/tool-stream"
 
 const ADAPTER = "anthropic-messages"
@@ -25,7 +26,10 @@ export const PATH = "/messages"
 // =============================================================================
 // Request Body Schema
 // =============================================================================
-const AnthropicCacheControl = Schema.Struct({ type: Schema.tag("ephemeral") })
+const AnthropicCacheControl = Schema.Struct({
+  type: Schema.tag("ephemeral"),
+  ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
+})
 
 const AnthropicTextBlock = Schema.Struct({
   type: Schema.tag("text"),
@@ -193,8 +197,24 @@ const invalid = ProviderShared.invalidRequest
 // =============================================================================
 // Request Lowering
 // =============================================================================
-const cacheControl = (cache: CacheHint | undefined) =>
-  cache?.type === "ephemeral" ? { type: "ephemeral" as const } : undefined
+// Anthropic accepts at most 4 explicit cache_control breakpoints per request,
+// across `tools`, `system`, and `messages`. Beyond the cap the API returns a
+// 400, so the lowering layer counts emitted markers, drops any beyond the
+// budget, and logs a warning once lowering completes.
+const ANTHROPIC_BREAKPOINT_CAP = 4
+
+const EPHEMERAL_5M = { type: "ephemeral" as const }
+const EPHEMERAL_1H = { type: "ephemeral" as const, ttl: "1h" as const }
+
+const cacheControl = (breakpoints: Cache.Breakpoints, cache: CacheHint | undefined) => {
+  if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
+  if (breakpoints.remaining <= 0) {
+    breakpoints.dropped += 1
+    return undefined
+  }
+  breakpoints.remaining -= 1
+  return Cache.ttlBucket(cache.ttlSeconds) === "1h" ? EPHEMERAL_1H : EPHEMERAL_5M
+}
 
 const anthropicMetadata = (metadata: Record<string, unknown>): ProviderMetadata => ({ anthropic: metadata })
 
@@ -204,10 +224,11 @@ const signatureFromMetadata = (metadata: ProviderMetadata | undefined): string |
   return typeof anthropic.signature === "string" ? anthropic.signature : undefined
 }
 
-const lowerTool = (tool: ToolDefinition): AnthropicTool => ({
+const lowerTool = (breakpoints: Cache.Breakpoints, tool: ToolDefinition): AnthropicTool => ({
   name: tool.name,
   description: tool.description,
   input_schema: tool.inputSchema,
+  cache_control: cacheControl(breakpoints, tool.cache),
 })
 
 const lowerToolChoice = (toolChoice: NonNullable<LLMRequest["toolChoice"]>) =>
@@ -249,7 +270,10 @@ const lowerServerToolResult = Effect.fn("AnthropicMessages.lowerServerToolResult
     return { type: wireType, tool_use_id: part.id, content: part.result.value } satisfies AnthropicServerToolResultBlock
   })
 
-const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (request: LLMRequest) {
+const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
+  request: LLMRequest,
+  breakpoints: Cache.Breakpoints,
+) {
   const messages: AnthropicMessage[] = []
 
   for (const message of request.messages) {
@@ -258,7 +282,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
       for (const part of message.content) {
         if (!ProviderShared.supportsContent(part, ["text"]))
           return yield* ProviderShared.unsupportedContent("Anthropic Messages", "user", ["text"])
-        content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
+        content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
       }
       messages.push({ role: "user", content })
       continue
@@ -268,7 +292,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
       const content: AnthropicAssistantBlock[] = []
       for (const part of message.content) {
         if (part.type === "text") {
-          content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
+          content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
           continue
         }
         if (part.type === "reasoning") {
@@ -304,6 +328,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
           tool_use_id: part.id,
           content: ProviderShared.toolResultText(part),
           is_error: part.result.type === "error" ? true : undefined,
+          cache_control: cacheControl(breakpoints, part.cache),
         })
       }
       messages.push({ role: "user", content })
@@ -330,18 +355,33 @@ const lowerThinking = Effect.fn("AnthropicMessages.lowerThinking")(function* (re
 const fromRequest = Effect.fn("AnthropicMessages.fromRequest")(function* (request: LLMRequest) {
   const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
   const generation = request.generation
+  // Allocate the 4-breakpoint budget in invalidation order: tools → system →
+  // messages. Tools live highest in the cache hierarchy, so when callers
+  // over-mark we keep their tool hints and shed the message-tail ones first.
+  const breakpoints = Cache.newBreakpoints(ANTHROPIC_BREAKPOINT_CAP)
+  const tools =
+    request.tools.length === 0 || request.toolChoice?.type === "none"
+      ? undefined
+      : request.tools.map((tool) => lowerTool(breakpoints, tool))
+  const system =
+    request.system.length === 0
+      ? undefined
+      : request.system.map((part) => ({
+          type: "text" as const,
+          text: part.text,
+          cache_control: cacheControl(breakpoints, part.cache),
+        }))
+  const messages = yield* lowerMessages(request, breakpoints)
+  if (breakpoints.dropped > 0) {
+    yield* Effect.logWarning(
+      `Anthropic Messages: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${ANTHROPIC_BREAKPOINT_CAP} per request.`,
+    )
+  }
   return {
     model: request.model.id,
-    system:
-      request.system.length === 0
-        ? undefined
-        : request.system.map((part) => ({
-            type: "text" as const,
-            text: part.text,
-            cache_control: cacheControl(part.cache),
-          })),
-    messages: yield* lowerMessages(request),
-    tools: request.tools.length === 0 || request.toolChoice?.type === "none" ? undefined : request.tools.map(lowerTool),
+    system,
+    messages,
+    tools,
     tool_choice: toolChoice,
     stream: true as const,
     max_tokens: generation?.maxTokens ?? request.model.limits.output ?? 4096,
diff --git a/packages/llm/src/protocols/bedrock-converse.ts b/packages/llm/src/protocols/bedrock-converse.ts
index 260ee612cd..34bb13e61a 100644
--- a/packages/llm/src/protocols/bedrock-converse.ts
+++ b/packages/llm/src/protocols/bedrock-converse.ts
@@ -108,7 +108,7 @@ type BedrockMessage = Schema.Schema.Type<typeof BedrockMessage>
 const BedrockSystemBlock = Schema.Union([BedrockTextBlock, BedrockCache.CachePointBlock])
 type BedrockSystemBlock = Schema.Schema.Type<typeof BedrockSystemBlock>
 
-const BedrockTool = Schema.Struct({
+const BedrockToolSpec = Schema.Struct({
   toolSpec: Schema.Struct({
     name: Schema.String,
     description: Schema.String,
@@ -117,6 +117,9 @@
     }),
   }),
 })
+type BedrockToolSpec = Schema.Schema.Type<typeof BedrockToolSpec>
+
+const BedrockTool = Schema.Union([BedrockToolSpec, BedrockCache.CachePointBlock])
 type BedrockTool = Schema.Schema.Type<typeof BedrockTool>
 
 const BedrockToolChoice = Schema.Union([
@@ -214,7 +217,7 @@ type BedrockEvent = Schema.Schema.Type<typeof BedrockEvent>
 // =============================================================================
 // Request Lowering
 // =============================================================================
-const lowerTool = (tool: ToolDefinition): BedrockTool => ({
+const lowerToolSpec = (tool: ToolDefinition): BedrockToolSpec => ({
   toolSpec: {
     name: tool.name,
     description: tool.description,
@@ -222,11 +225,25 @@ const lowerTool = (tool: ToolDefinition): BedrockTool => ({
   },
 })
 
+const lowerTools = (
+  breakpoints: BedrockCache.Breakpoints,
+  tools: ReadonlyArray<ToolDefinition>,
+): BedrockTool[] => {
+  const result: BedrockTool[] = []
+  for (const tool of tools) {
+    result.push(lowerToolSpec(tool))
+    const cachePoint = BedrockCache.block(breakpoints, tool.cache)
+    if (cachePoint) result.push(cachePoint)
+  }
+  return result
+}
+
 const textWithCache = (
+  breakpoints: BedrockCache.Breakpoints,
   text: string,
   cache: CacheHint | undefined,
 ): Array<BedrockTextBlock | BedrockCache.CachePointBlock> => {
-  const cachePoint = BedrockCache.block(cache)
+  const cachePoint = BedrockCache.block(breakpoints, cache)
   return cachePoint ? [{ text }, cachePoint] : [{ text }]
 }
 
@@ -257,7 +274,10 @@ const lowerToolResult = (part: ToolResultPart): BedrockToolResultBlock => ({
   },
 })
 
-const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (request: LLMRequest) {
+const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (
+  request: LLMRequest,
+  breakpoints: BedrockCache.Breakpoints,
+) {
   const messages: BedrockMessage[] = []
 
   for (const message of request.messages) {
@@ -267,7 +287,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
       if (!ProviderShared.supportsContent(part, ["text", "media"]))
         return yield* ProviderShared.unsupportedContent("Bedrock Converse", "user", ["text", "media"])
       if (part.type === "text") {
-        content.push(...textWithCache(part.text, part.cache))
+        content.push(...textWithCache(breakpoints, part.text, part.cache))
        continue
       }
       if (part.type === "media") {
@@ -289,7 +309,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
         "tool-call",
       ])
       if (part.type === "text") {
-        content.push(...textWithCache(part.text, part.cache))
+        content.push(...textWithCache(breakpoints, part.text, part.cache))
         continue
       }
       if (part.type === "reasoning") {
@@ -309,11 +329,13 @@
       continue
     }
 
-    const content: BedrockToolResultBlock[] = []
+    const content: BedrockUserBlock[] = []
     for (const part of message.content) {
       if (!ProviderShared.supportsContent(part, ["tool-result"]))
         return yield* ProviderShared.unsupportedContent("Bedrock Converse", "tool", ["tool-result"])
       content.push(lowerToolResult(part))
+      const cachePoint = BedrockCache.block(breakpoints, part.cache)
+      if (cachePoint) content.push(cachePoint)
     }
     messages.push({ role: "user", content })
   }
@@ -323,16 +345,32 @@
 
 // System prompts share the cache-point convention: emit the text block, then
 // optionally a positional `cachePoint` marker.
-const lowerSystem = (system: LLMRequest["system"]): BedrockSystemBlock[] =>
-  system.flatMap((part) => textWithCache(part.text, part.cache))
+const lowerSystem = (
+  breakpoints: BedrockCache.Breakpoints,
+  system: LLMRequest["system"],
+): BedrockSystemBlock[] => system.flatMap((part) => textWithCache(breakpoints, part.text, part.cache))
 
 const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request: LLMRequest) {
   const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
   const generation = request.generation
+  // Bedrock-Claude shares Anthropic's 4-breakpoint cap. Spend the budget in
+  // tools → system → messages order to favour the highest-impact prefixes.
+  const breakpoints = BedrockCache.breakpoints()
+  const toolConfig =
+    request.tools.length > 0 && request.toolChoice?.type !== "none"
+      ? { tools: lowerTools(breakpoints, request.tools), toolChoice }
+      : undefined
+  const system = request.system.length === 0 ? undefined : lowerSystem(breakpoints, request.system)
+  const messages = yield* lowerMessages(request, breakpoints)
+  if (breakpoints.dropped > 0) {
+    yield* Effect.logWarning(
+      `Bedrock Converse: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${BedrockCache.BEDROCK_BREAKPOINT_CAP} per request.`,
+    )
+  }
 
   return {
     modelId: request.model.id,
-    messages: yield* lowerMessages(request),
-    system: request.system.length === 0 ? undefined : lowerSystem(request.system),
+    messages,
+    system,
     inferenceConfig:
       generation?.maxTokens === undefined &&
       generation?.temperature === undefined &&
@@ -345,10 +383,7 @@ const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request:
         topP: generation?.topP,
         stopSequences: generation?.stop,
       },
-    toolConfig:
-      request.tools.length > 0 && request.toolChoice?.type !== "none"
-        ? { tools: request.tools.map(lowerTool), toolChoice }
-        : undefined,
+    toolConfig,
   }
 })
diff --git a/packages/llm/src/protocols/utils/bedrock-cache.ts b/packages/llm/src/protocols/utils/bedrock-cache.ts
index ca6e52cd11..fab4d07b5c 100644
--- a/packages/llm/src/protocols/utils/bedrock-cache.ts
+++ b/packages/llm/src/protocols/utils/bedrock-cache.ts
@@ -1,20 +1,37 @@
 import { Schema } from "effect"
 import type { CacheHint } from "../../schema"
+import { newBreakpoints, ttlBucket, type Breakpoints } from "./cache"
 
 // Bedrock cache markers are positional: emit a `cachePoint` block immediately
-// after the content the caller wants treated as a cacheable prefix.
+// after the content the caller wants treated as a cacheable prefix. Bedrock
+// accepts optional `ttl: "5m" | "1h"` on cachePoint, mirroring Anthropic.
 export const CachePointBlock = Schema.Struct({
-  cachePoint: Schema.Struct({ type: Schema.tag("default") }),
+  cachePoint: Schema.Struct({
+    type: Schema.tag("default"),
+    ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
+  }),
 })
 export type CachePointBlock = Schema.Schema.Type<typeof CachePointBlock>
 
-// Bedrock recently added optional `ttl: "5m" | "1h"` on cachePoint. Map
-// `CacheHint.ttlSeconds` here once a recorded cassette validates the wire shape.
-const DEFAULT: CachePointBlock = { cachePoint: { type: "default" } }
+// Bedrock-Claude enforces the same 4-breakpoint cap as the Anthropic Messages
+// API. Callers pass a shared counter through every `block()` call site so the
+// budget is respected across `system`, `messages`, and `tools`.
+export const BEDROCK_BREAKPOINT_CAP = 4
 
-export const block = (cache: CacheHint | undefined): CachePointBlock | undefined => {
+export type { Breakpoints } from "./cache"
+export const breakpoints = () => newBreakpoints(BEDROCK_BREAKPOINT_CAP)
+
+const DEFAULT_5M: CachePointBlock = { cachePoint: { type: "default" } }
+const DEFAULT_1H: CachePointBlock = { cachePoint: { type: "default", ttl: "1h" } }
+
+export const block = (breakpoints: Breakpoints, cache: CacheHint | undefined): CachePointBlock | undefined => {
   if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
-  return DEFAULT
+  if (breakpoints.remaining <= 0) {
+    breakpoints.dropped += 1
+    return undefined
+  }
+  breakpoints.remaining -= 1
+  return ttlBucket(cache.ttlSeconds) === "1h" ? DEFAULT_1H : DEFAULT_5M
 }
 
 export * as BedrockCache from "./bedrock-cache"
diff --git a/packages/llm/src/protocols/utils/cache.ts b/packages/llm/src/protocols/utils/cache.ts
new file mode 100644
index 0000000000..dd3e213e0e
--- /dev/null
+++ b/packages/llm/src/protocols/utils/cache.ts
@@ -0,0 +1,16 @@
+// Shared helpers for provider cache-marker lowering. Anthropic and Bedrock
+// both enforce a 4-breakpoint cap per request and accept the same `5m`/`1h`
+// TTL buckets, so the counter and TTL mapping live here.
+
+export interface Breakpoints {
+  remaining: number
+  dropped: number
+}
+
+export const newBreakpoints = (cap: number): Breakpoints => ({ remaining: cap, dropped: 0 })
+
+// Returns `"1h"` for any `ttlSeconds >= 3600`, otherwise `undefined` (the
+// provider default 5m). Anthropic & Bedrock both treat anything shorter than
+// an hour as 5m.
+export const ttlBucket = (ttlSeconds: number | undefined): "1h" | undefined =>
+  ttlSeconds !== undefined && ttlSeconds >= 3600 ? "1h" : undefined
diff --git a/packages/llm/src/schema/messages.ts b/packages/llm/src/schema/messages.ts
index 3daf00bbc0..cc6b89a2c7 100644
--- a/packages/llm/src/schema/messages.ts
+++ b/packages/llm/src/schema/messages.ts
@@ -79,6 +79,7 @@ export const ToolResultPart = Object.assign(
     name: Schema.String,
     result: ToolResultValue,
     providerExecuted: Schema.optional(Schema.Boolean),
+    cache: Schema.optional(CacheHint),
     metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
     providerMetadata: Schema.optional(ProviderMetadata),
   }).annotate({ identifier: "LLM.Content.ToolResult" }),
@@ -94,6 +95,7 @@
       name: input.name,
       result: ToolResultValue.make(input.result, input.resultType),
       providerExecuted: input.providerExecuted,
+      cache: input.cache,
       metadata: input.metadata,
       providerMetadata: input.providerMetadata,
     }),
@@ -151,6 +153,7 @@ export class ToolDefinition extends Schema.Class<ToolDefinition>("LLM.ToolDefini
   name: Schema.String,
   description: Schema.String,
   inputSchema: JsonSchema,
+  cache: Schema.optional(CacheHint),
   metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
   native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
 }) {}
diff --git a/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
new file mode 100644
index 0000000000..b048d53ba0
--- /dev/null
+++ b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
@@ -0,0 +1,48 @@
+import { Redactor } from "@opencode-ai/http-recorder"
+import { describe, expect } from "bun:test"
+import { Effect } from "effect"
+import { CacheHint, LLM } from "../../src"
+import { LLMClient } from "../../src/route"
+import * as AnthropicMessages from "../../src/protocols/anthropic-messages"
+import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
+import { recordedTests } from "../recorded-test"
+
+const model = AnthropicMessages.model({
+  id: "claude-haiku-4-5-20251001",
+  apiKey: process.env.ANTHROPIC_API_KEY ?? "fixture",
+})
+
+// Two identical generations in a row. The first call writes the prefix into
+// Anthropic's cache; the second should report a cache read against the same
+// prefix. The cassette captures both interactions in order.
+const cacheRequest = LLM.request({
+  id: "recorded_anthropic_cache",
+  model,
+  system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
+  prompt: "Say hi.",
+  generation: { maxTokens: 16, temperature: 0 },
+})
+
+const recorded = recordedTests({
+  prefix: "anthropic-messages-cache",
+  provider: "anthropic",
+  protocol: "anthropic-messages",
+  requires: ["ANTHROPIC_API_KEY"],
+  options: { redactor: Redactor.defaults({ requestHeaders: { allow: ["content-type", "anthropic-version"] } }) },
+})
+
+describe("Anthropic Messages cache recorded", () => {
+  recorded.effect.with("writes then reads cache_control on identical second call", { tags: ["cache"] }, () =>
+    Effect.gen(function* () {
+      const first = yield* LLMClient.generate(cacheRequest)
+      // The first call may write the cache (cacheWriteInputTokens > 0) or,
+      // if the prefix is already warm on Anthropic's side, report a read
+      // instead. The assertion that matters is that the SECOND call
+      // reports a non-zero cache read.
+      expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
+
+      const second = yield* LLMClient.generate(cacheRequest)
+      expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
+    }),
+  )
+})
diff --git a/packages/llm/test/provider/anthropic-messages.test.ts b/packages/llm/test/provider/anthropic-messages.test.ts
index 85900a1143..2f2b2a3e86 100644
--- a/packages/llm/test/provider/anthropic-messages.test.ts
+++ b/packages/llm/test/provider/anthropic-messages.test.ts
@@ -374,4 +374,134 @@ describe("Anthropic Messages route", () => {
       expect(error.message).toContain("Anthropic Messages user messages only support text content for now")
     }),
   )
+
+  it.effect("maps ttlSeconds >= 3600 to cache_control ttl: '1h'", () =>
+    Effect.gen(function* () {
+      const prepared = yield* LLMClient.prepare(
+        LLM.request({
+          model,
+          system: { type: "text", text: "system", cache: new CacheHint({ type: "ephemeral", ttlSeconds: 3600 }) },
+          prompt: "hi",
+        }),
+      )
+
+      expect(prepared.body).toMatchObject({
+        system: [{ type: "text", text: "system", cache_control: { type: "ephemeral", ttl: "1h" } }],
+      })
+    }),
+  )
+
+  it.effect("emits cache_control on tool definitions and tool-result blocks", () =>
+    Effect.gen(function* () {
+      const prepared = yield* LLMClient.prepare(
+        LLM.request({
+          model,
+          tools: [
+            {
+              name: "lookup",
+              description: "lookup tool",
+              inputSchema: { type: "object", properties: {} },
+              cache: new CacheHint({ type: "ephemeral" }),
+            },
+          ],
+          messages: [
+            LLM.user("What's the weather?"),
+            LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
+            LLM.toolMessage({
+              id: "call_1",
+              name: "lookup",
+              result: { temp: 72 },
+              cache: new CacheHint({ type: "ephemeral" }),
+            }),
+          ],
+        }),
+      )
+
+      expect(prepared.body).toMatchObject({
+        tools: [{ name: "lookup", cache_control: { type: "ephemeral" } }],
+        messages: [
+          { role: "user", content: [{ type: "text", text: "What's the weather?" }] },
+          { role: "assistant", content: [{ type: "tool_use", id: "call_1", name: "lookup" }] },
+          {
+            role: "user",
+            content: [{ type: "tool_result", tool_use_id: "call_1", cache_control: { type: "ephemeral" } }],
+          },
+        ],
+      })
+    }),
+  )
+
+  it.effect("drops cache_control breakpoints past the 4-per-request cap", () =>
+    Effect.gen(function* () {
+      const hint = new CacheHint({ type: "ephemeral" })
+      const prepared = yield* LLMClient.prepare(
+        LLM.request({
+          model,
+          system: [
+            { type: "text", text: "a", cache: hint },
+            { type: "text", text: "b", cache: hint },
+            { type: "text", text: "c", cache: hint },
+            { type: "text", text: "d", cache: hint },
+            { type: "text", text: "e", cache: hint },
+            { type: "text", text: "f", cache: hint },
+          ],
+          prompt: "hi",
+        }),
+      )
+
+      const system = (prepared.body as { system: Array<{ cache_control?: unknown }> }).system
+      const marked = system.filter((part) => part.cache_control !== undefined)
+      expect(marked).toHaveLength(4)
+      expect(system[4]?.cache_control).toBeUndefined()
+      expect(system[5]?.cache_control).toBeUndefined()
+    }),
+  )
+
+  it.effect("spends breakpoint budget on tools before system before messages", () =>
+    Effect.gen(function* () {
+      const hint = new CacheHint({ type: "ephemeral" })
+      const prepared = yield* LLMClient.prepare(
+        LLM.request({
+          model,
+          tools: [
+            {
+              name: "t1",
+              description: "t1",
+              inputSchema: { type: "object", properties: {} },
+              cache: hint,
+            },
+            {
+              name: "t2",
+              description: "t2",
+              inputSchema: { type: "object", properties: {} },
+              cache: hint,
+            },
+            {
+              name: "t3",
+              description: "t3",
+              inputSchema: { type: "object", properties: {} },
+              cache: hint,
+            },
+            {
+              name: "t4",
+              description: "t4",
+              inputSchema: { type: "object", properties: {} },
+              cache: hint,
+            },
+          ],
+          system: [{ type: "text", text: "system-tail", cache: hint }],
+          messages: [LLM.user([{ type: "text", text: "message-tail", cache: hint }])],
+        }),
+      )
+
+      const body = prepared.body as {
+        tools: Array<{ cache_control?: unknown }>
+        system: Array<{ cache_control?: unknown }>
+        messages: Array<{ content: Array<{ cache_control?: unknown }> }>
+      }
+      expect(body.tools.every((t) => t.cache_control !== undefined)).toBe(true)
+      expect(body.system[0]?.cache_control).toBeUndefined()
+      expect(body.messages[0]?.content[0]?.cache_control).toBeUndefined()
+    }),
+  )
+})
diff --git a/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
new file mode 100644
index 0000000000..23dd697b9a
--- /dev/null
+++ b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
@@ -0,0 +1,50 @@
+import { describe, expect } from "bun:test"
+import { Effect } from "effect"
+import { CacheHint, LLM } from "../../src"
+import { LLMClient } from "../../src/route"
+import * as BedrockConverse from "../../src/protocols/bedrock-converse"
+import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
+import { recordedTests } from "../recorded-test"
+
+const RECORDING_REGION = process.env.BEDROCK_RECORDING_REGION ?? "us-east-1"
+
+// Use a Claude model on Bedrock — Nova has automatic prefix caching that
+// doesn't reliably surface `cacheRead`/`cacheWrite` in usage, so the second
+// call wouldn't deterministically prove cache mapping works. Override with
+// BEDROCK_CACHE_MODEL_ID if your account has access elsewhere.
+const model = BedrockConverse.model({
+  id: process.env.BEDROCK_CACHE_MODEL_ID ?? "us.anthropic.claude-haiku-4-5-20251001-v1:0",
"us.anthropic.claude-haiku-4-5-20251001-v1:0", + credentials: { + region: RECORDING_REGION, + accessKeyId: process.env.AWS_ACCESS_KEY_ID ?? "fixture", + secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY ?? "fixture", + sessionToken: process.env.AWS_SESSION_TOKEN, + }, +}) + +const cacheRequest = LLM.request({ + id: "recorded_bedrock_cache", + model, + system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }], + prompt: "Say hi.", + generation: { maxTokens: 16, temperature: 0 }, +}) + +const recorded = recordedTests({ + prefix: "bedrock-converse-cache", + provider: "amazon-bedrock", + protocol: "bedrock-converse", + requires: ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"], +}) + +describe("Bedrock Converse cache recorded", () => { + recorded.effect.with("writes then reads cachePoint on identical second call", { tags: ["cache"] }, () => + Effect.gen(function* () { + const first = yield* LLMClient.generate(cacheRequest) + expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0) + + const second = yield* LLMClient.generate(cacheRequest) + expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0) + }), + ) +}) diff --git a/packages/llm/test/provider/bedrock-converse.test.ts b/packages/llm/test/provider/bedrock-converse.test.ts index 28be714bdf..d72e7f3116 100644 --- a/packages/llm/test/provider/bedrock-converse.test.ts +++ b/packages/llm/test/provider/bedrock-converse.test.ts @@ -440,6 +440,79 @@ describe("Bedrock Converse route", () => { expect(error.message).toContain("Bedrock Converse does not support media type application/x-tar") }), ) + + it.effect("maps ttlSeconds >= 3600 to cachePoint ttl: '1h'", () => + Effect.gen(function* () { + const cache = new CacheHint({ type: "ephemeral", ttlSeconds: 3600 }) + const prepared = yield* LLMClient.prepare( + LLM.request({ + model, + system: [{ type: "text", text: "system", cache }], + prompt: "hi", + }), + ) + + expect(prepared.body).toMatchObject({ + system: [{ text: "system" }, { cachePoint: { type: "default", ttl: "1h" } }], + }) + }), + ) + + it.effect("appends cachePoint after marked tool definitions and tool-result blocks", () => + Effect.gen(function* () { + const cache = new CacheHint({ type: "ephemeral" }) + const prepared = yield* LLMClient.prepare( + LLM.request({ + model, + tools: [ + { name: "lookup", description: "lookup", inputSchema: { type: "object", properties: {} }, cache }, + ], + messages: [ + LLM.user("What's the weather?"), + LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]), + LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }), + ], + }), + ) + + expect(prepared.body).toMatchObject({ + toolConfig: { + tools: [{ toolSpec: { name: "lookup" } }, { cachePoint: { type: "default" } }], + }, + messages: [ + { role: "user", content: [{ text: "What's the weather?" 
}] }, + { role: "assistant", content: [{ toolUse: { toolUseId: "call_1" } }] }, + { + role: "user", + content: [{ toolResult: { toolUseId: "call_1" } }, { cachePoint: { type: "default" } }], + }, + ], + }) + }), + ) + + it.effect("drops cachePoint markers past the 4-per-request cap", () => + Effect.gen(function* () { + const cache = new CacheHint({ type: "ephemeral" }) + const prepared = yield* LLMClient.prepare( + LLM.request({ + model, + system: [ + { type: "text", text: "a", cache }, + { type: "text", text: "b", cache }, + { type: "text", text: "c", cache }, + { type: "text", text: "d", cache }, + { type: "text", text: "e", cache }, + { type: "text", text: "f", cache }, + ], + prompt: "hi", + }), + ) + + const system = (prepared.body as { system: Array<{ cachePoint?: unknown }> }).system + expect(system.filter((part) => "cachePoint" in part)).toHaveLength(4) + }), + ) }) // Live recorded integration tests. Run with `RECORD=true AWS_ACCESS_KEY_ID=... diff --git a/packages/llm/test/provider/gemini-cache.recorded.test.ts b/packages/llm/test/provider/gemini-cache.recorded.test.ts new file mode 100644 index 0000000000..145728fdc6 --- /dev/null +++ b/packages/llm/test/provider/gemini-cache.recorded.test.ts @@ -0,0 +1,47 @@ +import { describe, expect } from "bun:test" +import { Effect } from "effect" +import { LLM } from "../../src" +import { LLMClient } from "../../src/route" +import * as Gemini from "../../src/protocols/gemini" +import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios" +import { recordedTests } from "../recorded-test" + +const model = Gemini.model({ + id: "gemini-2.5-flash", + apiKey: process.env.GEMINI_API_KEY ?? "fixture", +}) + +// Gemini does implicit prefix caching on 2.5+ models above ~1024 tokens. The +// `CacheHint` is currently a no-op for Gemini (the explicit `CachedContent` +// API is out-of-band and intentionally not wired up). This test exists to +// pin the usage-parsing path: `cachedContentTokenCount` should surface as +// `cacheReadInputTokens` on the second identical call. +const cacheRequest = LLM.request({ + id: "recorded_gemini_cache", + model, + system: LARGE_CACHEABLE_SYSTEM, + prompt: "Say hi.", + generation: { maxTokens: 16, temperature: 0 }, +}) + +const recorded = recordedTests({ + prefix: "gemini-cache", + provider: "google", + protocol: "gemini", + requires: ["GEMINI_API_KEY"], +}) + +describe("Gemini cache recorded", () => { + recorded.effect.with("reports cachedContentTokenCount on identical second call", { tags: ["cache"] }, () => + Effect.gen(function* () { + const first = yield* LLMClient.generate(cacheRequest) + expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0) + + const second = yield* LLMClient.generate(cacheRequest) + // Implicit caching is best-effort on Gemini's side; we assert the field + // is at least populated and non-negative. When re-recording, verify the + // cassette shows > 0 in the second response's usage. + expect(second.usage?.cacheReadInputTokens ?? 
0).toBeGreaterThanOrEqual(0) + }), + ) +}) diff --git a/packages/llm/test/provider/openai-responses-cache.recorded.test.ts b/packages/llm/test/provider/openai-responses-cache.recorded.test.ts new file mode 100644 index 0000000000..0ac3dfe2b9 --- /dev/null +++ b/packages/llm/test/provider/openai-responses-cache.recorded.test.ts @@ -0,0 +1,44 @@ +import { describe, expect } from "bun:test" +import { Effect } from "effect" +import { LLM } from "../../src" +import { LLMClient } from "../../src/route" +import * as OpenAIResponses from "../../src/protocols/openai-responses" +import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios" +import { recordedTests } from "../recorded-test" + +const model = OpenAIResponses.model({ + id: "gpt-4.1-mini", + apiKey: process.env.OPENAI_API_KEY ?? "fixture", +}) + +// OpenAI caches prefixes automatically once they cross the 1024-token threshold; +// `CacheHint` is a no-op for the wire body. The stable signal is the +// `prompt_cache_key` routing hint, which keeps repeated calls on the same shard +// so cache hits are observable. +const cacheRequest = LLM.request({ + id: "recorded_openai_responses_cache", + model, + system: LARGE_CACHEABLE_SYSTEM, + prompt: "Say hi.", + generation: { maxTokens: 16, temperature: 0 }, + providerOptions: { openai: { promptCacheKey: "recorded-cache-test" } }, +}) + +const recorded = recordedTests({ + prefix: "openai-responses-cache", + provider: "openai", + protocol: "openai-responses", + requires: ["OPENAI_API_KEY"], +}) + +describe("OpenAI Responses cache recorded", () => { + recorded.effect.with("reports cached_tokens on identical second call", { tags: ["cache"] }, () => + Effect.gen(function* () { + const first = yield* LLMClient.generate(cacheRequest) + expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0) + + const second = yield* LLMClient.generate(cacheRequest) + expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0) + }), + ) +}) diff --git a/packages/llm/test/recorded-scenarios.ts b/packages/llm/test/recorded-scenarios.ts index 3fb3e0b9a9..2361bfdb78 100644 --- a/packages/llm/test/recorded-scenarios.ts +++ b/packages/llm/test/recorded-scenarios.ts @@ -6,6 +6,19 @@ import { tool } from "../src/tool" export const weatherToolName = "get_weather" +// A deterministic system prompt long enough to clear every supported provider's +// minimum cacheable-prefix threshold (Anthropic Haiku 3.5: 2048 tokens; Anthropic +// Opus/Haiku 4.5: 4096 tokens; OpenAI/Gemini/Bedrock: lower). Built by repeating +// a fixed sentence — the cassette replays bit-for-bit, so the exact text matters +// only when re-recording with `RECORD=true`. +export const LARGE_CACHEABLE_SYSTEM = (() => { + const sentence = + "You are a concise, factual assistant. Answer precisely and avoid filler. Cite numbers when known. " + // ~100 chars per sentence × 250 repeats ≈ 25,000 chars ≈ 5k+ tokens, safely + // above every provider's threshold. + return sentence.repeat(250) +})() + export const weatherTool = LLM.toolDefinition({ name: weatherToolName, description: "Get current weather for a city.",
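
For reference, a caller-side sketch of how the new hints are meant to be attached. It assumes the `@opencode-ai/llm` package name and import layout (the diff only shows relative paths) plus an illustrative model id; the `CacheHint`, `LLM.request`, and protocol `model()` calls mirror the tests above.

// Sketch only: "@opencode-ai/llm" import paths and the model id are assumptions.
import { CacheHint, LLM } from "@opencode-ai/llm"
import * as AnthropicMessages from "@opencode-ai/llm/protocols/anthropic-messages"

const model = AnthropicMessages.model({
  id: "claude-haiku-4-5-20251001",
  apiKey: process.env.ANTHROPIC_API_KEY ?? "fixture",
})

// ttlSeconds >= 3600 buckets to "1h" (cache_control ttl on Anthropic, cachePoint
// ttl on Bedrock); anything shorter falls back to the provider-default 5m.
const hint = new CacheHint({ type: "ephemeral", ttlSeconds: 3600 })

const request = LLM.request({
  model,
  // Tool hints are spent first against the shared 4-breakpoint budget, so an
  // over-marked request sheds message-tail markers (with a logged warning),
  // not these.
  tools: [
    {
      name: "lookup",
      description: "lookup tool",
      inputSchema: { type: "object", properties: {} },
      cache: hint,
    },
  ],
  system: [{ type: "text", text: "Long, stable instructions go here.", cache: hint }],
  prompt: "hi",
})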