feat(llm): cache hint TTL, breakpoint cap, and tool placement (#26779)

Kit Langton
2026-05-10 21:17:38 -04:00
committed by GitHub
parent fed716ada5
commit 77e6c0d329
12 changed files with 555 additions and 39 deletions

View File

@@ -16,6 +16,7 @@ import {
type ToolResultPart,
} from "../schema"
import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
import * as Cache from "./utils/cache"
import { ToolStream } from "./utils/tool-stream"
const ADAPTER = "anthropic-messages"
@@ -25,7 +26,10 @@ export const PATH = "/messages"
// =============================================================================
// Request Body Schema
// =============================================================================
const AnthropicCacheControl = Schema.Struct({ type: Schema.tag("ephemeral") })
const AnthropicCacheControl = Schema.Struct({
type: Schema.tag("ephemeral"),
ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
})
const AnthropicTextBlock = Schema.Struct({
type: Schema.tag("text"),
@@ -193,8 +197,24 @@ const invalid = ProviderShared.invalidRequest
// =============================================================================
// Request Lowering
// =============================================================================
const cacheControl = (cache: CacheHint | undefined) =>
cache?.type === "ephemeral" ? { type: "ephemeral" as const } : undefined
// Anthropic accepts at most 4 explicit cache_control breakpoints per request,
// across `tools`, `system`, and `messages`. Beyond the cap the API returns a
// 400, so the lowering layer counts emitted markers, drops any past the cap,
// and logs a warning with the dropped count.
const ANTHROPIC_BREAKPOINT_CAP = 4
const EPHEMERAL_5M = { type: "ephemeral" as const }
const EPHEMERAL_1H = { type: "ephemeral" as const, ttl: "1h" as const }
const cacheControl = (breakpoints: Cache.Breakpoints, cache: CacheHint | undefined) => {
if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
if (breakpoints.remaining <= 0) {
breakpoints.dropped += 1
return undefined
}
breakpoints.remaining -= 1
return Cache.ttlBucket(cache.ttlSeconds) === "1h" ? EPHEMERAL_1H : EPHEMERAL_5M
}
const anthropicMetadata = (metadata: Record<string, unknown>): ProviderMetadata => ({ anthropic: metadata })
@@ -204,10 +224,11 @@ const signatureFromMetadata = (metadata: ProviderMetadata | undefined): string |
return typeof anthropic.signature === "string" ? anthropic.signature : undefined
}
const lowerTool = (tool: ToolDefinition): AnthropicTool => ({
const lowerTool = (breakpoints: Cache.Breakpoints, tool: ToolDefinition): AnthropicTool => ({
name: tool.name,
description: tool.description,
input_schema: tool.inputSchema,
cache_control: cacheControl(breakpoints, tool.cache),
})
const lowerToolChoice = (toolChoice: NonNullable<LLMRequest["toolChoice"]>) =>
@@ -249,7 +270,10 @@ const lowerServerToolResult = Effect.fn("AnthropicMessages.lowerServerToolResult
return { type: wireType, tool_use_id: part.id, content: part.result.value } satisfies AnthropicServerToolResultBlock
})
const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (request: LLMRequest) {
const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (
request: LLMRequest,
breakpoints: Cache.Breakpoints,
) {
const messages: AnthropicMessage[] = []
for (const message of request.messages) {
@@ -258,7 +282,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["text"]))
return yield* ProviderShared.unsupportedContent("Anthropic Messages", "user", ["text"])
content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
}
messages.push({ role: "user", content })
continue
@@ -268,7 +292,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
const content: AnthropicAssistantBlock[] = []
for (const part of message.content) {
if (part.type === "text") {
content.push({ type: "text", text: part.text, cache_control: cacheControl(part.cache) })
content.push({ type: "text", text: part.text, cache_control: cacheControl(breakpoints, part.cache) })
continue
}
if (part.type === "reasoning") {
@@ -304,6 +328,7 @@ const lowerMessages = Effect.fn("AnthropicMessages.lowerMessages")(function* (re
tool_use_id: part.id,
content: ProviderShared.toolResultText(part),
is_error: part.result.type === "error" ? true : undefined,
cache_control: cacheControl(breakpoints, part.cache),
})
}
messages.push({ role: "user", content })
@@ -330,18 +355,33 @@ const lowerThinking = Effect.fn("AnthropicMessages.lowerThinking")(function* (re
const fromRequest = Effect.fn("AnthropicMessages.fromRequest")(function* (request: LLMRequest) {
const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
const generation = request.generation
// Allocate the 4-breakpoint budget in invalidation order: tools → system →
// messages. Tools live highest in the cache hierarchy, so when callers
// over-mark we keep their tool hints and shed the message-tail ones first.
const breakpoints = Cache.newBreakpoints(ANTHROPIC_BREAKPOINT_CAP)
const tools =
request.tools.length === 0 || request.toolChoice?.type === "none"
? undefined
: request.tools.map((tool) => lowerTool(breakpoints, tool))
const system =
request.system.length === 0
? undefined
: request.system.map((part) => ({
type: "text" as const,
text: part.text,
cache_control: cacheControl(breakpoints, part.cache),
}))
const messages = yield* lowerMessages(request, breakpoints)
if (breakpoints.dropped > 0) {
yield* Effect.logWarning(
`Anthropic Messages: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${ANTHROPIC_BREAKPOINT_CAP} per request.`,
)
}
return {
model: request.model.id,
system:
request.system.length === 0
? undefined
: request.system.map((part) => ({
type: "text" as const,
text: part.text,
cache_control: cacheControl(part.cache),
})),
messages: yield* lowerMessages(request),
tools: request.tools.length === 0 || request.toolChoice?.type === "none" ? undefined : request.tools.map(lowerTool),
system,
messages,
tools,
tool_choice: toolChoice,
stream: true as const,
max_tokens: generation?.maxTokens ?? request.model.limits.output ?? 4096,
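
For reference, a self-contained sketch of the budget accounting the hunk above relies on; the names mirror the diff, but the spend helper is hypothetical and only illustrates how markers past the cap get dropped and counted:

interface Breakpoints {
  remaining: number
  dropped: number
}
const newBreakpoints = (cap: number): Breakpoints => ({ remaining: cap, dropped: 0 })
// Hypothetical helper: returns true when a marker may still be emitted,
// otherwise records the drop (the same bookkeeping cacheControl does above).
const spend = (b: Breakpoints): boolean => {
  if (b.remaining <= 0) {
    b.dropped += 1
    return false
  }
  b.remaining -= 1
  return true
}
const budget = newBreakpoints(4)
const markedBlocks = 6 // e.g. six system parts each carrying a CacheHint
let emitted = 0
for (let i = 0; i < markedBlocks; i++) if (spend(budget)) emitted += 1
console.log(emitted, budget.dropped) // 4 2: the last two markers are omitted and a warning is logged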

View File

@@ -108,7 +108,7 @@ type BedrockMessage = Schema.Schema.Type<typeof BedrockMessage>
const BedrockSystemBlock = Schema.Union([BedrockTextBlock, BedrockCache.CachePointBlock])
type BedrockSystemBlock = Schema.Schema.Type<typeof BedrockSystemBlock>
const BedrockTool = Schema.Struct({
const BedrockToolSpec = Schema.Struct({
toolSpec: Schema.Struct({
name: Schema.String,
description: Schema.String,
@@ -117,6 +117,9 @@ const BedrockTool = Schema.Struct({
}),
}),
})
type BedrockToolSpec = Schema.Schema.Type<typeof BedrockToolSpec>
const BedrockTool = Schema.Union([BedrockToolSpec, BedrockCache.CachePointBlock])
type BedrockTool = Schema.Schema.Type<typeof BedrockTool>
const BedrockToolChoice = Schema.Union([
@@ -214,7 +217,7 @@ type BedrockEvent = Schema.Schema.Type<typeof BedrockEvent>
// =============================================================================
// Request Lowering
// =============================================================================
const lowerTool = (tool: ToolDefinition): BedrockTool => ({
const lowerToolSpec = (tool: ToolDefinition): BedrockToolSpec => ({
toolSpec: {
name: tool.name,
description: tool.description,
@@ -222,11 +225,25 @@ const lowerTool = (tool: ToolDefinition): BedrockTool => ({
},
})
const lowerTools = (
breakpoints: BedrockCache.Breakpoints,
tools: ReadonlyArray<ToolDefinition>,
): BedrockTool[] => {
const result: BedrockTool[] = []
for (const tool of tools) {
result.push(lowerToolSpec(tool))
const cachePoint = BedrockCache.block(breakpoints, tool.cache)
if (cachePoint) result.push(cachePoint)
}
return result
}
const textWithCache = (
breakpoints: BedrockCache.Breakpoints,
text: string,
cache: CacheHint | undefined,
): Array<BedrockTextBlock | BedrockCache.CachePointBlock> => {
const cachePoint = BedrockCache.block(cache)
const cachePoint = BedrockCache.block(breakpoints, cache)
return cachePoint ? [{ text }, cachePoint] : [{ text }]
}
@@ -257,7 +274,10 @@ const lowerToolResult = (part: ToolResultPart): BedrockToolResultBlock => ({
},
})
const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (request: LLMRequest) {
const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (
request: LLMRequest,
breakpoints: BedrockCache.Breakpoints,
) {
const messages: BedrockMessage[] = []
for (const message of request.messages) {
@@ -267,7 +287,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
if (!ProviderShared.supportsContent(part, ["text", "media"]))
return yield* ProviderShared.unsupportedContent("Bedrock Converse", "user", ["text", "media"])
if (part.type === "text") {
content.push(...textWithCache(part.text, part.cache))
content.push(...textWithCache(breakpoints, part.text, part.cache))
continue
}
if (part.type === "media") {
@@ -289,7 +309,7 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
"tool-call",
])
if (part.type === "text") {
content.push(...textWithCache(part.text, part.cache))
content.push(...textWithCache(breakpoints, part.text, part.cache))
continue
}
if (part.type === "reasoning") {
@@ -309,11 +329,13 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
continue
}
const content: BedrockToolResultBlock[] = []
const content: BedrockUserBlock[] = []
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["tool-result"]))
return yield* ProviderShared.unsupportedContent("Bedrock Converse", "tool", ["tool-result"])
content.push(lowerToolResult(part))
const cachePoint = BedrockCache.block(breakpoints, part.cache)
if (cachePoint) content.push(cachePoint)
}
messages.push({ role: "user", content })
}
@@ -323,16 +345,32 @@ const lowerMessages = Effect.fn("BedrockConverse.lowerMessages")(function* (requ
// System prompts share the cache-point convention: emit the text block, then
// optionally a positional `cachePoint` marker.
const lowerSystem = (system: ReadonlyArray<LLMRequest["system"][number]>): BedrockSystemBlock[] =>
system.flatMap((part) => textWithCache(part.text, part.cache))
const lowerSystem = (
breakpoints: BedrockCache.Breakpoints,
system: ReadonlyArray<LLMRequest["system"][number]>,
): BedrockSystemBlock[] => system.flatMap((part) => textWithCache(breakpoints, part.text, part.cache))
const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request: LLMRequest) {
const toolChoice = request.toolChoice ? yield* lowerToolChoice(request.toolChoice) : undefined
const generation = request.generation
// Bedrock-Claude shares Anthropic's 4-breakpoint cap. Spend the budget in
// tools → system → messages order to favour the highest-impact prefixes.
const breakpoints = BedrockCache.breakpoints()
const toolConfig =
request.tools.length > 0 && request.toolChoice?.type !== "none"
? { tools: lowerTools(breakpoints, request.tools), toolChoice }
: undefined
const system = request.system.length === 0 ? undefined : lowerSystem(breakpoints, request.system)
const messages = yield* lowerMessages(request, breakpoints)
if (breakpoints.dropped > 0) {
yield* Effect.logWarning(
`Bedrock Converse: dropped ${breakpoints.dropped} cache breakpoint(s); the API allows at most ${BedrockCache.BEDROCK_BREAKPOINT_CAP} per request.`,
)
}
return {
modelId: request.model.id,
messages: yield* lowerMessages(request),
system: request.system.length === 0 ? undefined : lowerSystem(request.system),
messages,
system,
inferenceConfig:
generation?.maxTokens === undefined &&
generation?.temperature === undefined &&
@@ -345,10 +383,7 @@ const fromRequest = Effect.fn("BedrockConverse.fromRequest")(function* (request:
topP: generation?.topP,
stopSequences: generation?.stop,
},
toolConfig:
request.tools.length > 0 && request.toolChoice?.type !== "none"
? { tools: request.tools.map(lowerTool), toolChoice }
: undefined,
toolConfig,
}
})

View File

@@ -1,20 +1,37 @@
import { Schema } from "effect"
import type { CacheHint } from "../../schema"
import { newBreakpoints, ttlBucket, type Breakpoints } from "./cache"
// Bedrock cache markers are positional: emit a `cachePoint` block immediately
// after the content the caller wants treated as a cacheable prefix.
// after the content the caller wants treated as a cacheable prefix. Bedrock
// accepts optional `ttl: "5m" | "1h"` on cachePoint, mirroring Anthropic.
export const CachePointBlock = Schema.Struct({
cachePoint: Schema.Struct({ type: Schema.tag("default") }),
cachePoint: Schema.Struct({
type: Schema.tag("default"),
ttl: Schema.optional(Schema.Literals(["5m", "1h"])),
}),
})
export type CachePointBlock = Schema.Schema.Type<typeof CachePointBlock>
// Bedrock recently added optional `ttl: "5m" | "1h"` on cachePoint. Map
// `CacheHint.ttlSeconds` here once a recorded cassette validates the wire shape.
const DEFAULT: CachePointBlock = { cachePoint: { type: "default" } }
// Bedrock-Claude enforces the same 4-breakpoint cap as the Anthropic Messages
// API. Callers pass a shared counter through every `block()` call site so the
// budget is respected across `system`, `messages`, and `tools`.
export const BEDROCK_BREAKPOINT_CAP = 4
export const block = (cache: CacheHint | undefined): CachePointBlock | undefined => {
export type { Breakpoints } from "./cache"
export const breakpoints = () => newBreakpoints(BEDROCK_BREAKPOINT_CAP)
const DEFAULT_5M: CachePointBlock = { cachePoint: { type: "default" } }
const DEFAULT_1H: CachePointBlock = { cachePoint: { type: "default", ttl: "1h" } }
export const block = (breakpoints: Breakpoints, cache: CacheHint | undefined): CachePointBlock | undefined => {
if (cache?.type !== "ephemeral" && cache?.type !== "persistent") return undefined
return DEFAULT
if (breakpoints.remaining <= 0) {
breakpoints.dropped += 1
return undefined
}
breakpoints.remaining -= 1
return ttlBucket(cache.ttlSeconds) === "1h" ? DEFAULT_1H : DEFAULT_5M
}
export * as BedrockCache from "./bedrock-cache"
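
A rough, self-contained sketch of the positional convention these helpers support; the types and the lowerSystemPart helper are assumptions for illustration, not the actual module:

type CachePoint = { cachePoint: { type: "default"; ttl?: "1h" } }
type TextBlock = { text: string }
// Hypothetical mini-lowering: emit the text block first, then the positional
// marker that tells Bedrock to cache the prefix up to and including that block.
// (The real block() additionally checks the shared breakpoint budget first.)
const lowerSystemPart = (text: string, hint?: { ttl?: "1h" }): Array<TextBlock | CachePoint> => {
  const blocks: Array<TextBlock | CachePoint> = [{ text }]
  if (hint) blocks.push({ cachePoint: hint.ttl ? { type: "default", ttl: hint.ttl } : { type: "default" } })
  return blocks
}
console.log(lowerSystemPart("large stable prompt", { ttl: "1h" }))
// [ { text: "large stable prompt" }, { cachePoint: { type: "default", ttl: "1h" } } ]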

View File

@@ -0,0 +1,16 @@
// Shared helpers for provider cache-marker lowering. Anthropic and Bedrock
// both enforce a 4-breakpoint cap per request and accept the same `5m`/`1h`
// TTL buckets, so the counter and TTL mapping live here.
export interface Breakpoints {
remaining: number
dropped: number
}
export const newBreakpoints = (cap: number): Breakpoints => ({ remaining: cap, dropped: 0 })
// Returns `"1h"` for any `ttlSeconds >= 3600`, otherwise `undefined` (i.e. the
// provider default of 5m). Anthropic and Bedrock both treat anything shorter
// than an hour as the 5m bucket.
export const ttlBucket = (ttlSeconds: number | undefined): "1h" | undefined =>
ttlSeconds !== undefined && ttlSeconds >= 3600 ? "1h" : undefined
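
A few illustrative calls against the helpers above (return values follow the implementation shown, not recorded output):

ttlBucket(undefined) // -> undefined: no hint, so the provider default (5m) applies
ttlBucket(300)       // -> undefined: anything under an hour stays in the 5m bucket
ttlBucket(3600)      // -> "1h"
ttlBucket(86400)     // -> "1h": there is no longer bucket, everything >= 3600s maps to "1h"
newBreakpoints(4)    // -> { remaining: 4, dropped: 0 }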

View File

@@ -79,6 +79,7 @@ export const ToolResultPart = Object.assign(
name: Schema.String,
result: ToolResultValue,
providerExecuted: Schema.optional(Schema.Boolean),
cache: Schema.optional(CacheHint),
metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
providerMetadata: Schema.optional(ProviderMetadata),
}).annotate({ identifier: "LLM.Content.ToolResult" }),
@@ -94,6 +95,7 @@ export const ToolResultPart = Object.assign(
name: input.name,
result: ToolResultValue.make(input.result, input.resultType),
providerExecuted: input.providerExecuted,
cache: input.cache,
metadata: input.metadata,
providerMetadata: input.providerMetadata,
}),
@@ -151,6 +153,7 @@ export class ToolDefinition extends Schema.Class<ToolDefinition>("LLM.ToolDefini
name: Schema.String,
description: Schema.String,
inputSchema: JsonSchema,
cache: Schema.optional(CacheHint),
metadata: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
}) {}

View File

@@ -0,0 +1,48 @@
import { Redactor } from "@opencode-ai/http-recorder"
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { CacheHint, LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as AnthropicMessages from "../../src/protocols/anthropic-messages"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const model = AnthropicMessages.model({
id: "claude-haiku-4-5-20251001",
apiKey: process.env.ANTHROPIC_API_KEY ?? "fixture",
})
// Two identical generations in a row. The first call writes the prefix into
// Anthropic's cache; the second should report a cache read against the same
// prefix. The cassette captures both interactions in order.
const cacheRequest = LLM.request({
id: "recorded_anthropic_cache",
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})
const recorded = recordedTests({
prefix: "anthropic-messages-cache",
provider: "anthropic",
protocol: "anthropic-messages",
requires: ["ANTHROPIC_API_KEY"],
options: { redactor: Redactor.defaults({ requestHeaders: { allow: ["content-type", "anthropic-version"] } }) },
})
describe("Anthropic Messages cache recorded", () => {
recorded.effect.with("writes then reads cache_control on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
// The first call may write the cache (cacheWriteInputTokens > 0) or it
// may be a fresh miss (both fields 0) depending on whether the prefix is
// already warm on Anthropic's side. The assertion that matters is that
// the SECOND call reports a non-zero cache read.
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})

View File

@@ -374,4 +374,134 @@ describe("Anthropic Messages route", () => {
expect(error.message).toContain("Anthropic Messages user messages only support text content for now")
}),
)
it.effect("maps ttlSeconds >= 3600 to cache_control ttl: '1h'", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: { type: "text", text: "system", cache: new CacheHint({ type: "ephemeral", ttlSeconds: 3600 }) },
prompt: "hi",
}),
)
expect(prepared.body).toMatchObject({
system: [{ type: "text", text: "system", cache_control: { type: "ephemeral", ttl: "1h" } }],
})
}),
)
it.effect("emits cache_control on tool definitions and tool-result blocks", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{
name: "lookup",
description: "lookup tool",
inputSchema: { type: "object", properties: {} },
cache: new CacheHint({ type: "ephemeral" }),
},
],
messages: [
LLM.user("What's the weather?"),
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({
id: "call_1",
name: "lookup",
result: { temp: 72 },
cache: new CacheHint({ type: "ephemeral" }),
}),
],
}),
)
expect(prepared.body).toMatchObject({
tools: [{ name: "lookup", cache_control: { type: "ephemeral" } }],
messages: [
{ role: "user", content: [{ type: "text", text: "What's the weather?" }] },
{ role: "assistant", content: [{ type: "tool_use", id: "call_1", name: "lookup" }] },
{
role: "user",
content: [{ type: "tool_result", tool_use_id: "call_1", cache_control: { type: "ephemeral" } }],
},
],
})
}),
)
it.effect("drops cache_control breakpoints past the 4-per-request cap", () =>
Effect.gen(function* () {
const hint = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [
{ type: "text", text: "a", cache: hint },
{ type: "text", text: "b", cache: hint },
{ type: "text", text: "c", cache: hint },
{ type: "text", text: "d", cache: hint },
{ type: "text", text: "e", cache: hint },
{ type: "text", text: "f", cache: hint },
],
prompt: "hi",
}),
)
const system = (prepared.body as { system: Array<{ cache_control?: unknown }> }).system
const marked = system.filter((part) => part.cache_control !== undefined)
expect(marked).toHaveLength(4)
expect(system[4]?.cache_control).toBeUndefined()
expect(system[5]?.cache_control).toBeUndefined()
}),
)
it.effect("spends breakpoint budget on tools before system before messages", () =>
Effect.gen(function* () {
const hint = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{
name: "t1",
description: "t1",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t2",
description: "t2",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t3",
description: "t3",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
{
name: "t4",
description: "t4",
inputSchema: { type: "object", properties: {} },
cache: hint,
},
],
system: [{ type: "text", text: "system-tail", cache: hint }],
messages: [LLM.user([{ type: "text", text: "message-tail", cache: hint }])],
}),
)
const body = prepared.body as {
tools: Array<{ cache_control?: unknown }>
system: Array<{ cache_control?: unknown }>
messages: Array<{ content: Array<{ cache_control?: unknown }> }>
}
expect(body.tools.every((t) => t.cache_control !== undefined)).toBe(true)
expect(body.system[0]?.cache_control).toBeUndefined()
expect(body.messages[0]?.content[0]?.cache_control).toBeUndefined()
}),
)
})

View File

@@ -0,0 +1,50 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { CacheHint, LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as BedrockConverse from "../../src/protocols/bedrock-converse"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const RECORDING_REGION = process.env.BEDROCK_RECORDING_REGION ?? "us-east-1"
// Use a Claude model on Bedrock — Nova has automatic prefix caching that
// doesn't reliably surface `cacheRead`/`cacheWrite` in usage, so the second
// call wouldn't deterministically prove cache mapping works. Override with
// BEDROCK_CACHE_MODEL_ID if your account has access elsewhere.
const model = BedrockConverse.model({
id: process.env.BEDROCK_CACHE_MODEL_ID ?? "us.anthropic.claude-haiku-4-5-20251001-v1:0",
credentials: {
region: RECORDING_REGION,
accessKeyId: process.env.AWS_ACCESS_KEY_ID ?? "fixture",
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY ?? "fixture",
sessionToken: process.env.AWS_SESSION_TOKEN,
},
})
const cacheRequest = LLM.request({
id: "recorded_bedrock_cache",
model,
system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})
const recorded = recordedTests({
prefix: "bedrock-converse-cache",
provider: "amazon-bedrock",
protocol: "bedrock-converse",
requires: ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
})
describe("Bedrock Converse cache recorded", () => {
recorded.effect.with("writes then reads cachePoint on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})

View File

@@ -440,6 +440,79 @@ describe("Bedrock Converse route", () => {
expect(error.message).toContain("Bedrock Converse does not support media type application/x-tar")
}),
)
it.effect("maps ttlSeconds >= 3600 to cachePoint ttl: '1h'", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral", ttlSeconds: 3600 })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [{ type: "text", text: "system", cache }],
prompt: "hi",
}),
)
expect(prepared.body).toMatchObject({
system: [{ text: "system" }, { cachePoint: { type: "default", ttl: "1h" } }],
})
}),
)
it.effect("appends cachePoint after marked tool definitions and tool-result blocks", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
tools: [
{ name: "lookup", description: "lookup", inputSchema: { type: "object", properties: {} }, cache },
],
messages: [
LLM.user("What's the weather?"),
LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
],
}),
)
expect(prepared.body).toMatchObject({
toolConfig: {
tools: [{ toolSpec: { name: "lookup" } }, { cachePoint: { type: "default" } }],
},
messages: [
{ role: "user", content: [{ text: "What's the weather?" }] },
{ role: "assistant", content: [{ toolUse: { toolUseId: "call_1" } }] },
{
role: "user",
content: [{ toolResult: { toolUseId: "call_1" } }, { cachePoint: { type: "default" } }],
},
],
})
}),
)
it.effect("drops cachePoint markers past the 4-per-request cap", () =>
Effect.gen(function* () {
const cache = new CacheHint({ type: "ephemeral" })
const prepared = yield* LLMClient.prepare(
LLM.request({
model,
system: [
{ type: "text", text: "a", cache },
{ type: "text", text: "b", cache },
{ type: "text", text: "c", cache },
{ type: "text", text: "d", cache },
{ type: "text", text: "e", cache },
{ type: "text", text: "f", cache },
],
prompt: "hi",
}),
)
const system = (prepared.body as { system: Array<{ cachePoint?: unknown }> }).system
expect(system.filter((part) => "cachePoint" in part)).toHaveLength(4)
}),
)
})
// Live recorded integration tests. Run with `RECORD=true AWS_ACCESS_KEY_ID=...

View File

@@ -0,0 +1,47 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as Gemini from "../../src/protocols/gemini"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const model = Gemini.model({
id: "gemini-2.5-flash",
apiKey: process.env.GEMINI_API_KEY ?? "fixture",
})
// Gemini does implicit prefix caching on 2.5+ models above ~1024 tokens. The
// `CacheHint` is currently a no-op for Gemini (the explicit `CachedContent`
// API is out-of-band and intentionally not wired up). This test exists to
// pin the usage-parsing path: `cachedContentTokenCount` should surface as
// `cacheReadInputTokens` on the second identical call.
const cacheRequest = LLM.request({
id: "recorded_gemini_cache",
model,
system: LARGE_CACHEABLE_SYSTEM,
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
})
const recorded = recordedTests({
prefix: "gemini-cache",
provider: "google",
protocol: "gemini",
requires: ["GEMINI_API_KEY"],
})
describe("Gemini cache recorded", () => {
recorded.effect.with("reports cachedContentTokenCount on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
// Implicit caching is best-effort on Gemini's side; we assert the field
// is at least populated and non-negative. When re-recording, verify the
// cassette shows > 0 in the second response's usage.
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
}),
)
})

View File

@@ -0,0 +1,44 @@
import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as OpenAIResponses from "../../src/protocols/openai-responses"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"
const model = OpenAIResponses.model({
id: "gpt-4.1-mini",
apiKey: process.env.OPENAI_API_KEY ?? "fixture",
})
// OpenAI caches prefixes automatically once they cross the 1024-token threshold;
// `CacheHint` is a no-op for the wire body. The stable signal is the
// `prompt_cache_key` routing hint, which keeps repeated calls on the same shard
// so cache hits are observable.
const cacheRequest = LLM.request({
id: "recorded_openai_responses_cache",
model,
system: LARGE_CACHEABLE_SYSTEM,
prompt: "Say hi.",
generation: { maxTokens: 16, temperature: 0 },
providerOptions: { openai: { promptCacheKey: "recorded-cache-test" } },
})
const recorded = recordedTests({
prefix: "openai-responses-cache",
provider: "openai",
protocol: "openai-responses",
requires: ["OPENAI_API_KEY"],
})
describe("OpenAI Responses cache recorded", () => {
recorded.effect.with("reports cached_tokens on identical second call", { tags: ["cache"] }, () =>
Effect.gen(function* () {
const first = yield* LLMClient.generate(cacheRequest)
expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
const second = yield* LLMClient.generate(cacheRequest)
expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThan(0)
}),
)
})

View File

@@ -6,6 +6,19 @@ import { tool } from "../src/tool"
export const weatherToolName = "get_weather"
// A deterministic system prompt long enough to clear every supported provider's
// minimum cacheable-prefix threshold (Anthropic Haiku 3.5: 2048 tokens; Anthropic
// Opus/Haiku 4.5: 4096 tokens; OpenAI/Gemini/Bedrock: lower). Built by repeating
// a fixed sentence — the cassette replays bit-for-bit, so the exact text matters
// only when re-recording with `RECORD=true`.
export const LARGE_CACHEABLE_SYSTEM = (() => {
const sentence =
"You are a concise, factual assistant. Answer precisely and avoid filler. Cite numbers when known. "
// ~100 chars per sentence × 250 repeats ≈ 25,000 chars ≈ 5k+ tokens, safely
// above every provider's threshold.
return sentence.repeat(250)
})()
export const weatherTool = LLM.toolDefinition({
name: weatherToolName,
description: "Get current weather for a city.",