// Mirror of https://github.com/anomalyco/opencode.git — synced 2026-06-01 19:05:38 +00:00 — 438 lines, 16 KiB, TypeScript.
import { expect } from "bun:test"
import { Effect, Schema, Stream } from "effect"
import {
  LLM,
  LLMEvent,
  LLMResponse,
  Message,
  ToolChoice,
  ToolDefinition,
  type ContentPart,
  type FinishReason,
  type LLMRequest,
  type Model,
} from "../src"
import { LLMClient } from "../src/route"
import { tool } from "../src/tool"

/** Name shared by the weather tool definition, the runtime tool registration, and the assertions below. */
export const weatherToolName = "get_weather"

// A deterministic system prompt long enough to clear every supported provider's
// minimum cacheable-prefix threshold (Anthropic Haiku 3.5: 2048 tokens; Anthropic
// Opus/Haiku 4.5: 4096 tokens; OpenAI/Gemini/Bedrock: lower). Built by repeating
// a fixed sentence — the cassette replays bit-for-bit, so the exact text matters
// only when re-recording with `RECORD=true`.
export const LARGE_CACHEABLE_SYSTEM = (() => {
  const sentence = "You are a concise, factual assistant. Answer precisely and avoid filler. Cite numbers when known. "
  // ~100 chars per sentence × 250 repeats ≈ 25,000 chars ≈ 5k+ tokens, safely
  // above every provider's threshold.
  return sentence.repeat(250)
})()

/**
 * Provider-facing declaration of the weather tool: a JSON-schema object input
 * with a single required string `city` and no additional properties.
 */
export const weatherTool = ToolDefinition.make({
  name: weatherToolName,
  description: "Get current weather for a city.",
  inputSchema: {
    type: "object",
    properties: { city: { type: "string" } },
    required: ["city"],
    additionalProperties: false,
  },
})

/**
 * Executable counterpart of `weatherTool` used by the tool-loop helpers.
 *
 * The implementation is a deterministic stub: "Paris" yields 22 / "sunny",
 * every other city yields 0 / "unknown".
 */
export const weatherRuntimeTool = tool({
  description: weatherTool.description,
  parameters: Schema.Struct({ city: Schema.String }),
  success: Schema.Struct({ temperature: Schema.Number, condition: Schema.String }),
  execute: ({ city }) =>
    Effect.succeed(
      city === "Paris" ? { temperature: 22, condition: "sunny" } : { temperature: 0, condition: "unknown" },
    ),
})

/**
 * Builds the request that asks for the weather in Paris.
 *
 * @param input.id          Request id, passed through unchanged.
 * @param input.model       Model to call.
 * @param input.system      Optional system prompt; defaults to an instruction to use the weather tool.
 * @param input.maxTokens   Output token budget; defaults to 80.
 * @param input.temperature Sampling temperature; defaults to 0. Pass `false` to omit
 *                          the `temperature` field from the generation options entirely.
 */
export const weatherToolLoopRequest = (input: {
  readonly id: string
  readonly model: Model
  readonly system?: string
  readonly maxTokens?: number
  readonly temperature?: number | false
}) =>
  LLM.request({
    id: input.id,
    model: input.model,
    system: input.system ?? "Use the get_weather tool, then answer in one short sentence.",
    prompt: "What is the weather in Paris?",
    cache: "none",
    generation:
      input.temperature === false
        ? { maxTokens: input.maxTokens ?? 80 }
        : { maxTokens: input.maxTokens ?? 80, temperature: input.temperature ?? 0 },
  })

/**
 * Variant of `weatherToolLoopRequest` with a fixed system prompt that demands a
 * single tool call followed by the exact reply "Paris is sunny.".
 * All other fields of `input` are forwarded as given.
 */
export const goldenWeatherToolLoopRequest = (input: {
  readonly id: string
  readonly model: Model
  readonly maxTokens?: number
  readonly temperature?: number | false
}) =>
  weatherToolLoopRequest({
    ...input,
    system: "Use the get_weather tool exactly once. After the tool result, reply exactly: Paris is sunny.",
  })

// Normalized text the image scenario expects the model to read from the restroom fixture.
const RESTROOM_IMAGE_TEXT = "jiggling restroom prison"
/**
 * Reads `./fixtures/media/restroom.png` (resolved relative to this module) and
 * yields its contents as a base64 string.
 */
const restroomImage = () =>
  Effect.promise(() => Bun.file(new URL("./fixtures/media/restroom.png", import.meta.url)).bytes()).pipe(
    Effect.map((bytes) => Buffer.from(bytes).toString("base64")),
  )

/**
 * Streams `request` through `LLMClient.stream` with the runtime weather tool
 * registered and collects every emitted event into a plain array.
 *
 * The stream is given a `stepCountIs(10)` stop condition (presumably a cap of
 * ten steps — confirm against `LLMClient`).
 */
export const runWeatherToolLoop = (request: LLMRequest) =>
  LLMClient.stream({
    request,
    tools: { [weatherToolName]: weatherRuntimeTool },
    stopWhen: LLMClient.stepCountIs(10),
  }).pipe(
    Stream.runCollect,
    Effect.map((events) => Array.from(events)),
  )

/** Asserts that the last event in `events` is a `finish` event with the given `reason`. */
export const expectFinish = (
  events: ReadonlyArray<LLMEvent>,
  reason: Extract<LLMEvent, { readonly type: "finish" }>["reason"],
) => expect(events.at(-1)).toMatchObject({ type: "finish", reason })

/**
 * Asserts that `response` contains exactly one tool call: the weather tool,
 * invoked with `{ city: "Paris" }` and carrying some string id.
 */
export const expectWeatherToolCall = (response: LLMResponse) =>
  expect(response.toolCalls).toMatchObject([
    { type: "tool-call", id: expect.any(String), name: weatherToolName, input: { city: "Paris" } },
  ])

/**
 * Asserts the full shape of a successful weather tool loop:
 * one `finish` event with reason "stop", step finishes of "tool-calls" then
 * "stop", a single Paris tool call, a single sunny/22 JSON tool result, and a
 * final text that mentions "Paris".
 */
export const expectWeatherToolLoop = (events: ReadonlyArray<LLMEvent>) => {
  // Exactly one overall finish, and it must be a normal stop.
  const finishes = events.filter(LLMEvent.is.finish)
  expect(finishes).toHaveLength(1)
  expect(finishes[0]?.reason).toBe("stop")

  // Two steps: the tool-call step, then the answering step.
  const stepFinishes = events.filter(LLMEvent.is.stepFinish)
  expect(stepFinishes.map((event) => event.reason)).toEqual(["tool-calls", "stop"])

  const toolCalls = events.filter(LLMEvent.is.toolCall)
  expect(toolCalls).toHaveLength(1)
  expect(toolCalls[0]).toMatchObject({ type: "tool-call", name: weatherToolName, input: { city: "Paris" } })

  // The result must be what the stub runtime tool returns for Paris.
  const toolResults = events.filter(LLMEvent.is.toolResult)
  expect(toolResults).toHaveLength(1)
  expect(toolResults[0]).toMatchObject({
    type: "tool-result",
    name: weatherToolName,
    result: { type: "json", value: { temperature: 22, condition: "sunny" } },
  })

  const output = LLMResponse.text({ events })
  expect(output).toContain("Paris")
  expect(output.trim().length).toBeGreaterThan(0)
}

/**
 * Runs every `expectWeatherToolLoop` check and additionally requires the
 * trimmed final text to be "Paris is sunny" with an optional trailing period.
 */
export const expectGoldenWeatherToolLoop = (events: ReadonlyArray<LLMEvent>) => {
  expectWeatherToolLoop(events)
  expect(LLMResponse.text({ events }).trim()).toMatch(/^Paris is sunny\.?$/)
}

/** Per-run inputs shared by every golden scenario. */
export interface GoldenScenarioContext {
  /** Base request id; conversation scenarios append a per-step suffix to it. */
  readonly id: string
  /** Model every request in the scenario is sent to. */
  readonly model: Model
  /** Overrides each scenario's default output token budget when set. */
  readonly maxTokens?: number
  /** Sampling temperature (defaults to 0); `false` omits the temperature field. */
  readonly temperature?: number | false
}

/** Thin alias for `LLMClient.generate`, used by the conversation runner. */
const generate = (request: LLMRequest) => LLMClient.generate(request)

/**
 * Builds the generation options for one request.
 *
 * Only `context.temperature` is read, so the parameter is typed structurally;
 * a full `GoldenScenarioContext` is still accepted.
 *
 * @param context   Source of the temperature setting. `false` omits `temperature`
 *                  from the result; `undefined` falls back to 0.
 * @param maxTokens Output token budget, copied into the result.
 */
const generation = (context: { readonly temperature?: number | false }, maxTokens: number) =>
  context.temperature === false ? { maxTokens } : { maxTokens, temperature: context.temperature ?? 0 }

/**
 * Canonicalizes model output before comparing it with the expected image text:
 * lowercases, drops every character that is neither `a`-`z` nor whitespace,
 * collapses whitespace runs to a single space, and trims both ends.
 */
const normalizeImageText = (value: string) =>
  value
    .toLowerCase()
    .replace(/[^a-z\s]/g, "")
    .replace(/\s+/g, " ")
    .trim()

// OpenAI provider options used by the encrypted-reasoning steps: no server-side
// storage, encrypted reasoning included in the response, low effort, auto summary.
const encryptedReasoningOptions = {
  openai: {
    store: false,
    includeEncryptedReasoning: true,
    reasoningEffort: "low",
    reasoningSummary: "auto",
  },
} as const

/** Expected assistant text: an exact string or a pattern to match. */
type AssistantTextExpectation = string | RegExp

/** A scripted user turn; `content` is appended to the conversation as a user message. */
type UserStep = { readonly type: "user"; readonly content: Message.ContentInput }

/** A turn that triggers one model call and describes what the response must look like. */
type AssistantStep = {
  readonly type: "assistant"
  /** Expected response text, if the step checks text. */
  readonly text?: AssistantTextExpectation
  /** Expected single tool call, if the step checks tool calls. */
  readonly toolCall?: { readonly name: string; readonly input: unknown }
  /** When set, the response's encrypted OpenAI reasoning is carried into the history. */
  readonly reasoning?: "openai-encrypted"
  /** Suffix for the request id; the 1-based generation index is used when absent. */
  readonly id?: string
  /** System prompt for this request. */
  readonly system?: string
  /** Output token budget for this request. */
  readonly maxTokens?: number
  /** Expected finish reason; the runner assumes "stop" when absent. */
  readonly finish?: FinishReason
  readonly tools?: LLM.RequestInput["tools"]
  readonly toolChoice?: LLM.RequestInput["toolChoice"]
  readonly providerOptions?: LLMRequest["providerOptions"]
  /** Extra assertions run against the response. */
  readonly assert?: (response: LLMResponse) => void
}

type ConversationStep = UserStep | AssistantStep

/** Creates a user step carrying `content`. */
const user = (content: Message.ContentInput): ConversationStep => ({ type: "user", content })

/** Builders for the three assistant step shapes the golden scenarios use. */
const assistant = {
  /** Step that expects `text` in the response. */
  expectText: (
    text: AssistantTextExpectation,
    options?: Omit<AssistantStep, "type" | "text" | "reasoning" | "toolCall">,
  ): ConversationStep => ({ type: "assistant", text, ...options }),

  /** Step that expects one call to `name` with `input`; the finish reason is fixed to "tool-calls". */
  expectToolCall: (
    name: string,
    input: unknown,
    options?: Omit<AssistantStep, "type" | "text" | "reasoning" | "toolCall" | "finish">,
  ): ConversationStep => ({ type: "assistant", toolCall: { name, input }, finish: "tool-calls", ...options }),

  /**
   * Step that expects `text` and encrypted OpenAI reasoning. Provider options
   * are fixed to `encryptedReasoningOptions` and cannot be overridden.
   */
  expectEncryptedReasoningText: (
    text: AssistantTextExpectation,
    options?: Omit<AssistantStep, "type" | "text" | "reasoning" | "toolCall" | "providerOptions">,
  ): ConversationStep => ({
    type: "assistant",
    text,
    reasoning: "openai-encrypted",
    providerOptions: encryptedReasoningOptions,
    ...options,
  }),
}

/**
 * Compares the trimmed `actual` text with `expected`: strict equality for a
 * string expectation, a pattern match for a `RegExp`.
 */
const assertAssistantText = (actual: string, expected: AssistantTextExpectation) => {
  if (typeof expected === "string") {
    expect(actual.trim()).toBe(expected)
    return
  }
  expect(actual.trim()).toMatch(expected)
}

/**
 * Asserts that `response` holds exactly one tool call matching the expected
 * name and input, with some string id.
 */
const assertAssistantToolCall = (response: LLMResponse, expected: NonNullable<AssistantStep["toolCall"]>) => {
  expect(response.toolCalls).toMatchObject([
    { type: "tool-call", id: expect.any(String), name: expected.name, input: expected.input },
  ])
}

// The generated golden scenarios only model one assistant shape at a time:
// encrypted reasoning + text, text, or tool call. Keep mixed interleavings in
// focused protocol tests where event order can be asserted directly.
//
// Converts a response into the assistant message appended to the history.
// Content order is: reasoning (only for "openai-encrypted" steps), then text
// (only when non-empty), then every tool call.
// Throws when an "openai-encrypted" step's response has no `reasoning-end`
// event carrying a string OpenAI item id.
const assistantMessageFromResponse = (response: LLMResponse, step: AssistantStep) => {
  const content: ContentPart[] = []
  if (step.reasoning === "openai-encrypted") {
    const reasoning = response.events.find(
      (event): event is Extract<LLMEvent, { readonly type: "reasoning-end" }> =>
        LLMEvent.is.reasoningEnd(event) && typeof event.providerMetadata?.openai?.itemId === "string",
    )
    if (!reasoning) throw new Error("OpenAI Responses did not return reasoning metadata")
    // The encrypted payload must be present so it can be replayed on the next turn.
    expect(reasoning.providerMetadata?.openai?.reasoningEncryptedContent).toEqual(expect.any(String))
    content.push({ type: "reasoning", text: response.reasoning, providerMetadata: reasoning.providerMetadata })
  }

  if (response.text.length > 0) content.push({ type: "text", text: response.text })
  content.push(...response.toolCalls)
  return Message.assistant(content)
}

/**
 * Plays a scripted conversation against `context.model`.
 *
 * User steps are appended to the history. Each assistant step issues one
 * request with the history so far, runs the step's text / tool-call / custom
 * assertions, checks the finish reason (default "stop"), and appends the
 * response to the history as an assistant message.
 *
 * Request ids are `${context.id}_${step.id}` when the step has an id, otherwise
 * `${context.id}_${n}` where `n` is the 1-based count of generated turns.
 */
const runGeneratedConversation = (context: GoldenScenarioContext, steps: ReadonlyArray<ConversationStep>) =>
  Effect.gen(function* () {
    const messages: Message[] = []
    let generated = 0
    for (const step of steps) {
      if (step.type === "user") {
        messages.push(Message.user(step.content))
        continue
      }

      generated += 1
      const response = yield* generate(
        LLM.request({
          id: step.id ? `${context.id}_${step.id}` : `${context.id}_${generated}`,
          model: context.model,
          system: step.system,
          cache: "none",
          // Snapshot the history so later pushes cannot alter a request already built.
          messages: [...messages],
          tools: step.tools,
          toolChoice: step.toolChoice,
          providerOptions: step.providerOptions,
          generation: generation(context, step.maxTokens ?? context.maxTokens ?? 80),
        }),
      )
      if (step.text !== undefined) assertAssistantText(response.text, step.text)
      if (step.toolCall) assertAssistantToolCall(response, step.toolCall)
      step.assert?.(response)
      expectFinish(response.events, step.finish ?? "stop")
      messages.push(assistantMessageFromResponse(response, step))
    }
  })

/**
 * Golden "text" scenario: asks for "Hello!" and expects "Hello" with an
 * optional "!". Default budget is 40 tokens. On the "gemini" route the thinking
 * budget is set to 0; other routes get no provider options.
 */
const runTextScenario = (context: GoldenScenarioContext) =>
  runGeneratedConversation(context, [
    user("Reply exactly with: Hello!"),
    assistant.expectText(/^Hello!?$/, {
      system: "You are concise.",
      maxTokens: context.maxTokens ?? 40,
      providerOptions:
        context.model.route.id === "gemini" ? { gemini: { thinkingConfig: { thinkingBudget: 0 } } } : undefined,
    }),
  ])

/**
 * Golden "tool-call" scenario: offers only the weather tool, sets the tool
 * choice to it, and expects a single call with `{ city: "Paris" }`.
 * Default budget is 80 tokens.
 */
const runToolCallScenario = (context: GoldenScenarioContext) =>
  runGeneratedConversation(context, [
    user("Call get_weather with city exactly Paris."),
    assistant.expectToolCall(
      weatherToolName,
      { city: "Paris" },
      {
        system: "Call tools exactly as requested.",
        tools: [weatherTool],
        toolChoice: ToolChoice.make(weatherTool),
        maxTokens: context.maxTokens ?? 80,
      },
    ),
  ])

/**
 * Golden "image" scenario: sends the restroom PNG fixture with an instruction
 * to read its three words, then checks that the normalized reply equals
 * `RESTROOM_IMAGE_TEXT`. Default budget is 20 tokens.
 */
const runImageScenario = (context: GoldenScenarioContext) =>
  Effect.gen(function* () {
    const imageData = yield* restroomImage()
    yield* runGeneratedConversation(context, [
      user([
        {
          type: "text",
          text: "The image contains exactly three lowercase English words. Read them left to right and reply with only those words.",
        },
        { type: "media", mediaType: "image/png", data: imageData },
      ]),
      // The pattern only requires non-empty text; the real check is the normalized comparison.
      assistant.expectText(/.+/, {
        system: "Read images carefully. Reply only with the visible text.",
        maxTokens: context.maxTokens ?? 20,
        assert: (response) => expect(normalizeImageText(response.text)).toBe(RESTROOM_IMAGE_TEXT),
      }),
    ])
  })

/**
 * Golden "reasoning" scenario: requests low-effort OpenAI reasoning with an
 * auto summary, expects the reply "Hello" (optional "!"), and asserts that the
 * reported reasoning token count is greater than zero. Default budget is 120 tokens.
 */
const runReasoningScenario = (context: GoldenScenarioContext) =>
  runGeneratedConversation(context, [
    user("Think briefly, then reply exactly with: Hello!"),
    assistant.expectText(/^Hello!?$/, {
      system: "Show concise reasoning when the provider supports visible reasoning summaries.",
      providerOptions: { openai: { reasoningEffort: "low", reasoningSummary: "auto" } },
      maxTokens: context.maxTokens ?? 120,
      assert: (response) => expect(response.usage?.reasoningTokens ?? 0).toBeGreaterThan(0),
    }),
  ])

/**
 * Golden "reasoning-continuation" scenario, two turns:
 *  1. "first"  — expects "Hello" with encrypted reasoning, which is carried into the history.
 *  2. "second" — sent with that history and the same encrypted-reasoning options; expects "Done".
 *
 * The second turn uses a fixed 40-token budget regardless of `context.maxTokens`.
 */
const runReasoningContinuationScenario = (context: GoldenScenarioContext) =>
  runGeneratedConversation(context, [
    user("Think briefly, then reply exactly with: Hello!"),
    assistant.expectEncryptedReasoningText(/^Hello!?$/, {
      id: "first",
      system: "Show concise reasoning when the provider supports visible reasoning summaries.",
      maxTokens: context.maxTokens ?? 120,
    }),
    user("Now reply exactly with: Done."),
    assistant.expectText(/^Done\.?$/, { id: "second", maxTokens: 40, providerOptions: encryptedReasoningOptions }),
  ])

/**
 * Golden "tool-loop" scenario: runs the golden weather request through the
 * streaming tool loop and applies `expectGoldenWeatherToolLoop` to the
 * collected events. Default budget is 80 tokens.
 */
const runToolLoopScenario = (context: GoldenScenarioContext) =>
  Effect.gen(function* () {
    const events = yield* runWeatherToolLoop(
      goldenWeatherToolLoopRequest({
        id: context.id,
        model: context.model,
        maxTokens: context.maxTokens ?? 80,
        temperature: context.temperature,
      }),
    )
    expectGoldenWeatherToolLoop(events)
  })

// Registry of golden scenarios keyed by id. Each entry has a display title,
// a list of tags, and the runner that executes it.
const goldenScenarios = {
  text: { title: "streams text", tags: ["text", "golden"], run: runTextScenario },
  "tool-call": { title: "streams tool call", tags: ["tool", "tool-call", "golden"], run: runToolCallScenario },
  "tool-loop": { title: "drives a tool loop", tags: ["tool", "tool-loop", "golden"], run: runToolLoopScenario },
  image: { title: "reads image text", tags: ["media", "image", "vision", "golden"], run: runImageScenario },
  reasoning: { title: "uses reasoning", tags: ["reasoning", "golden"], run: runReasoningScenario },
  "reasoning-continuation": {
    title: "continues encrypted reasoning",
    tags: ["reasoning", "continuation", "encrypted-reasoning", "golden"],
    run: runReasoningContinuationScenario,
  },
} as const

/** Identifier of a registered golden scenario. */
export type GoldenScenarioID = keyof typeof goldenScenarios

/** Display title of the scenario `id`. */
export const goldenScenarioTitle = (id: GoldenScenarioID) => goldenScenarios[id].title

/** Tags of the scenario `id`, returned as a fresh array. */
export const goldenScenarioTags = (id: GoldenScenarioID) => [...goldenScenarios[id].tags]

/** Runs the scenario `id` with `context` and returns its Effect. */
export const runGoldenScenario = (id: GoldenScenarioID, context: GoldenScenarioContext) =>
  goldenScenarios[id].run(context)

/**
 * Reduces a usage record to a plain object holding only the token counters
 * that are defined. Returns `undefined` when `usage` itself is missing.
 */
const usageSummary = (usage: LLMResponse["usage"] | undefined) => {
  if (!usage) return undefined
  return Object.fromEntries(
    [
      ["inputTokens", usage.inputTokens],
      ["outputTokens", usage.outputTokens],
      ["reasoningTokens", usage.reasoningTokens],
      ["cacheReadInputTokens", usage.cacheReadInputTokens],
      ["cacheWriteInputTokens", usage.cacheWriteInputTokens],
      ["totalTokens", usage.totalTokens],
    ].filter((entry) => entry[1] !== undefined),
  )
}

/**
 * Appends a text or reasoning chunk to `summary`, merging it into the last
 * entry when that entry has the same `type`. Mutates `summary` in place.
 *
 * If the last entry's `value` is not a string, it is treated as empty before
 * `value` is appended.
 */
const pushText = (summary: Array<Record<string, unknown>>, type: "text" | "reasoning", value: string) => {
  const last = summary.at(-1)
  if (last?.type === type) {
    last.value = `${typeof last.value === "string" ? last.value : ""}${value}`
    return
  }
  summary.push({ type, value })
}

/**
 * Condenses an event stream into a compact list of plain objects.
 *
 * - Consecutive `text-delta` / `reasoning-delta` events are merged into one
 *   `text` / `reasoning` entry each.
 * - `tool-call`, `tool-result`, `tool-error`, and `finish` events are copied
 *   with a fixed subset of their fields.
 * - All other event types are ignored.
 *
 * Keys whose value is `undefined` are removed from every entry.
 */
export const eventSummary = (events: ReadonlyArray<LLMEvent>) => {
  const summary: Array<Record<string, unknown>> = []
  for (const event of events) {
    switch (event.type) {
      case "text-delta":
        pushText(summary, "text", event.text)
        break
      case "reasoning-delta":
        pushText(summary, "reasoning", event.text)
        break
      case "tool-call":
        summary.push({
          type: "tool-call",
          name: event.name,
          input: event.input,
          providerExecuted: event.providerExecuted,
        })
        break
      case "tool-result":
        summary.push({
          type: "tool-result",
          name: event.name,
          result: event.result,
          providerExecuted: event.providerExecuted,
        })
        break
      case "tool-error":
        summary.push({ type: "tool-error", name: event.name, message: event.message })
        break
      case "finish":
        summary.push({ type: "finish", reason: event.reason, usage: usageSummary(event.usage) })
        break
      default:
        break
    }
  }
  return summary.map((item) => Object.fromEntries(Object.entries(item).filter((entry) => entry[1] !== undefined)))
}