Compare commits

...

5 Commits

Author SHA1 Message Date
Eason Goodale
ba45d2f601 test cache discount 2025-04-26 12:15:34 -07:00
Eason Goodale
b051fcb804 whitespace 2025-04-20 04:04:12 -07:00
Eason Goodale
ada5e2249a format 2025-04-20 01:28:42 -07:00
Eason Goodale
0613fd35e2 lint, formatting 2025-04-18 03:20:10 -07:00
Eason Goodale
cdc0897a25 initial cost tracking 2025-04-18 03:10:54 -07:00
Signed-off-by: Eason Goodale <easong@openai.com>
11 changed files with 622 additions and 4 deletions

View File

@@ -9,6 +9,7 @@ import { TerminalChatCommandReview } from "./terminal-chat-command-review.js";
import { log, isLoggingEnabled } from "../../utils/agent/log.js";
import { loadConfig } from "../../utils/config.js";
import { createInputItem } from "../../utils/input-utils.js";
import { printAndResetSessionSummary } from "../../utils/session-cost.js";
import { setSessionId } from "../../utils/session.js";
import {
loadCommandHistory,
@@ -199,8 +200,14 @@ export default function TerminalChatInput({
setInput("");
setSessionId("");
setLastResponseId("");
// Clear the terminal first so the summary is printed on a fresh
// screen before the new session starts.
clearTerminal();
// Show the token/cost summary for the session that just ended.
printAndResetSessionSummary();
// Emit a system message to confirm the clear action. We *append*
// it so Ink's <Static> treats it as new output and actually renders it.
setItems((prev) => [

View File

@@ -11,6 +11,7 @@ import { TerminalChatCommandReview } from "./terminal-chat-command-review.js";
import { log, isLoggingEnabled } from "../../utils/agent/log.js";
import { loadConfig } from "../../utils/config.js";
import { createInputItem } from "../../utils/input-utils.js";
import { printAndResetSessionSummary } from "../../utils/session-cost.js";
import { setSessionId } from "../../utils/session.js";
import {
loadCommandHistory,
@@ -286,8 +287,12 @@ export default function TerminalChatInput({
setInput("");
setSessionId("");
setLastResponseId("");
// Clear screen then display session summary so the user sees it.
clearTerminal();
printAndResetSessionSummary();
// Emit a system message to confirm the clear action. We *append*
// it so Ink's <Static> treats it as new output and actually renders it.
setItems((prev) => [

View File

@@ -24,6 +24,25 @@ function isUserMessage(
*/
export function maxTokensForModel(model: string): number {
const lower = model.toLowerCase();
// Heuristics for common context window sizes. Keep the checks loosely
// ordered from *largest* to *smallest* so that more specific long-context
// models are detected before their shorter generic counterparts.
// Special-case for the 1,047,576-token demo model (gpt-4-long). We match either
// the literal number or "gpt-4.1" variants we occasionally encounter.
if (lower.includes("1,047,576") || /gpt-4\.1/i.test(lower)) {
return 1047576;
}
if (lower.includes("128k") || /gpt-4\.5|gpt-4o-mini|gpt-4o\b/i.test(lower)) {
return 128000;
}
// Experimental o-series advertised at ~200k context
if (/\bo[134]\b|o[134]-mini|o1[- ]?pro/i.test(lower)) {
return 200000;
}
if (lower.includes("32k")) {
return 32000;
}
@@ -46,8 +65,11 @@ export function maxTokensForModel(model: string): number {
export function calculateContextPercentRemaining(
items: Array<ResponseItem>,
model: string,
extraContextChars = 0,
): number {
const used = approximateTokensUsed(items);
const tokensFromItems = approximateTokensUsed(items);
const extraTokens = Math.ceil(extraContextChars / 4);
const used = tokensFromItems + extraTokens;
const max = maxTokensForModel(model);
const remaining = Math.max(0, max - used);
return (remaining / max) * 100;
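
The new extraContextChars parameter folds static instructions into the context budget using the same rough 4-characters-per-token heuristic the rest of the file uses. A minimal sketch of how a caller might use it (the instructions string, model name, and import path are made up for illustration):

import type { ResponseItem } from "openai/resources/responses/responses.mjs";
import { calculateContextPercentRemaining } from "./terminal-chat-utils.js";

// Hypothetical static system instructions: ~2,000 chars count as ~500 extra tokens.
const instructions = "Always answer concisely. ".repeat(80);
const items: Array<ResponseItem> = []; // conversation history accumulated so far

// "gpt-4o" resolves to a 128k window via maxTokensForModel(); the extra
// characters are converted with Math.ceil(chars / 4) and added to the token usage.
const percentLeft = calculateContextPercentRemaining(items, "gpt-4o", instructions.length);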

View File

@@ -427,8 +427,14 @@ export default function TerminalChat({
).length;
const contextLeftPercent = useMemo(
() => calculateContextPercentRemaining(items, model),
[items, model],
() =>
calculateContextPercentRemaining(
items,
model,
// static system instructions count towards the context budget too
config.instructions?.length ?? 0,
),
[items, model, config.instructions],
);
return (

View File

@@ -1,6 +1,7 @@
import type { ReviewDecision } from "./review.js";
import type { ApplyPatchCommand, ApprovalPolicy } from "../../approvals.js";
import type { AppConfig } from "../config.js";
import type { UsageBreakdown } from "../estimate-cost.js";
import type {
ResponseFunctionToolCall,
ResponseInputItem,
@@ -11,6 +12,7 @@ import type { Reasoning } from "openai/resources.mjs";
import { log, isLoggingEnabled } from "./log.js";
import { OPENAI_BASE_URL, OPENAI_TIMEOUT_MS } from "../config.js";
import { parseToolCallArguments } from "../parsers.js";
import { ensureSessionTracker } from "../session-cost.js";
import {
ORIGIN,
CLI_VERSION,
@@ -56,6 +58,13 @@ type AgentLoopParams = {
onLastResponseId: (lastResponseId: string) => void;
};
type Usage = {
total_tokens?: number;
input_tokens?: number;
output_tokens?: number;
};
type MaybeUsageEvent = { response?: { usage?: Usage } };
export class AgentLoop {
private model: string;
private instructions?: string;
@@ -235,7 +244,18 @@ export class AgentLoop {
instructions: instructions ?? "",
} as AppConfig);
this.additionalWritableRoots = additionalWritableRoots;
this.onItem = onItem;
// Capture usage for cost-tracking before delegating to the caller-supplied
// callback. Wrapping here avoids repeating the bookkeeping logic across
// every UI surface.
this.onItem = (item: ResponseItem) => {
try {
ensureSessionTracker(this.model).addItems([item]);
} catch {
/* best-effort: never block user-visible updates */
}
onItem(item);
};
this.onLoading = onLoading;
this.getCommandConfirmation = getCommandConfirmation;
this.onLastResponseId = onLastResponseId;
@@ -778,6 +798,21 @@ export class AgentLoop {
}
lastResponseId = event.response.id;
this.onLastResponseId(event.response.id);
// Capture exact token usage for cost tracking when provided by
// the API. `responses.completed` events include a `usage` field
// with {input_tokens, output_tokens, total_tokens}. We record
// the total (or fall back to summing the parts if needed).
try {
const usage = (event as MaybeUsageEvent).response?.usage;
if (usage && typeof usage === "object") {
ensureSessionTracker(this.model).addUsage(
usage as unknown as UsageBreakdown,
);
}
} catch {
/* best-effort only */
}
}
}
} catch (err: unknown) {
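
For reference, the usage object the loop reads off a `response.completed` event has roughly the shape below; the event envelope is condensed and the numbers are made up, but the field names match the UsageBreakdown type used by the cost tracker.

// Condensed sketch of a `response.completed` streaming event (illustrative values).
const event = {
  type: "response.completed",
  response: {
    id: "resp_abc123", // hypothetical response id
    usage: {
      input_tokens: 1_000,
      input_tokens_details: { cached_tokens: 600 },
      output_tokens: 500,
      total_tokens: 1_500,
    },
  },
};

// What the wrapped handler effectively does with it:
ensureSessionTracker("gpt-4.1").addUsage(event.response.usage);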

View File

@@ -0,0 +1,212 @@
/**
* Cost-estimation helpers for OpenAI responses.
*
* The implementation now distinguishes between *input*, *cached input* and
* *output* tokens, reflecting OpenAI's 2025-04 pricing scheme. For models
* where we only have a single blended rate we gracefully fall back to the
* legacy logic so existing call sites continue to work.
*/
import type { ResponseItem } from "openai/resources/responses/responses.mjs";
import { approximateTokensUsed } from "./approximate-tokens-used.js";
// ────────────────────────────────────────────────────────────────────────────
// Pricing tables
// ────────────────────────────────────────────────────────────────────────────
/** Breakdown of per-token prices (in USD). */
type TokenRates = {
/** Price for *non-cached* input prompt tokens. */
input: number;
/** Preferential price for *cached* input tokens. */
cachedInput: number;
/** Price for completion / output tokens. */
output: number;
};
/**
* Pricing table (exact model name -> per-token rates).
* All keys must be lowercase.
*/
const detailedPriceMap: Record<string, TokenRates> = {
// OpenAI “o-series” experimental
"o3": {
input: 10 / 1_000_000,
cachedInput: 2.5 / 1_000_000,
output: 40 / 1_000_000,
},
"o4-mini": {
input: 1.1 / 1_000_000,
cachedInput: 0.275 / 1_000_000,
output: 4.4 / 1_000_000,
},
// GPT-4.1 family
"gpt-4.1-nano": {
input: 0.1 / 1_000_000,
cachedInput: 0.025 / 1_000_000,
output: 0.4 / 1_000_000,
},
"gpt-4.1-mini": {
input: 0.4 / 1_000_000,
cachedInput: 0.1 / 1_000_000,
output: 1.6 / 1_000_000,
},
"gpt-4.1": {
input: 2 / 1_000_000,
cachedInput: 0.5 / 1_000_000,
output: 8 / 1_000_000,
},
// GPT-4o family
"gpt-4o-mini": {
input: 0.6 / 1_000_000,
cachedInput: 0.3 / 1_000_000,
output: 2.4 / 1_000_000,
},
"gpt-4o": {
input: 5 / 1_000_000,
cachedInput: 2.5 / 1_000_000,
output: 20 / 1_000_000,
},
};
/**
* Legacy singlerate pricing entries (per *thousand* tokens). These are kept
* to provide sensible fallbacks for models that do not yet expose a detailed
* breakdown or where we have no published split pricing. The figures stem
* from older OpenAI announcements and are only meant for *approximation*;
* callers that rely on exact accounting should upgrade to models covered by
* {@link detailedPriceMap}.
*/
const blendedPriceMap: Record<string, number> = {
// GPT-4 Turbo (Apr 2024)
"gpt-4-turbo": 0.01,
// Legacy GPT-4 8k / 32k context models
"gpt-4": 0.03,
// GPT-3.5 Turbo family
"gpt-3.5-turbo": 0.0005,
// Remaining preview variants (exact names)
"gpt-4o-search-preview": 0.0025,
"gpt-4o-mini-search-preview": 0.00015,
"gpt-4o-realtime-preview": 0.005,
"gpt-4o-audio-preview": 0.0025,
"gpt-4o-mini-audio-preview": 0.00015,
"gpt-4o-mini-realtime-preview": 0.0006,
"gpt-4o-mini": 0.00015,
// Older experimental o-series rates
"o3-mini": 0.0011,
"o1-mini": 0.0011,
"o1-pro": 0.15,
"o1": 0.015,
// Additional internal preview models
"computer-use-preview": 0.003,
};
// ────────────────────────────────────────────────────────────────────────────
// Public helpers
// ────────────────────────────────────────────────────────────────────────────
/**
* Return the per-token input/cached/output rates for the supplied model, or
* `null` when no detailed pricing is available.
*/
function normalize(model: string): string {
// Lowercase and strip date/version suffixes like “2025-04-14”.
const lower = model.toLowerCase();
const dateSuffix = /-\d{4}-\d{2}-\d{2}$/;
return lower.replace(dateSuffix, "");
}
export function priceRates(model: string): TokenRates | null {
return detailedPriceMap[normalize(model)] ?? null;
}
/**
* Fallback that returns a *single* blended per-token rate when no detailed
* split is available. This mirrors the behaviour of the pre-2025 version so
* that existing callers keep working unmodified.
*/
export function pricePerToken(model: string): number | null {
// Prefer an *average* of the detailed rates when we have them; this avoids
// surprises where callers mix `pricePerToken()` with the new detailed
// helpers.
const rates = priceRates(model);
if (rates) {
return (rates.input + rates.output) / 2; // simple average heuristic
}
const entry = blendedPriceMap[normalize(model)];
if (entry == null) {
return null;
}
return entry / 1000;
}
// ────────────────────────────────────────────────────────────────────────────
// Cost estimation
// ────────────────────────────────────────────────────────────────────────────
/** Shape of the `usage` object returned by OpenAI's Responses API. */
export type UsageBreakdown = {
input_tokens?: number;
input_tokens_details?: { cached_tokens?: number } | null;
output_tokens?: number;
total_tokens?: number;
};
/**
* Calculate the exact cost (in USD) for a single usage breakdown. Returns
* `null` when the model is unknown.
*/
export function estimateCostFromUsage(
usage: UsageBreakdown,
model: string,
): number | null {
const rates = priceRates(model);
if (!rates) {
// fall back to blended pricing
const per = pricePerToken(model);
if (per == null) {
return null;
}
const tokens =
usage.total_tokens ??
(usage.input_tokens ?? 0) + (usage.output_tokens ?? 0);
return tokens * per;
}
const input = usage.input_tokens ?? 0;
const cached = usage.input_tokens_details?.cached_tokens ?? 0;
const nonCachedInput = Math.max(0, input - cached);
const output = usage.output_tokens ?? 0;
return (
nonCachedInput * rates.input +
cached * rates.cachedInput +
output * rates.output
);
}
/**
* Rough cost estimate (USD) for a series of {@link ResponseItem}s when using
* the specified model. When no detailed usage object is available we fall
* back to estimating token counts based on the message contents.
*/
export function estimateCostUSD(
items: Array<ResponseItem>,
model: string,
): number | null {
const per = pricePerToken(model);
if (per == null) {
return null;
}
return approximateTokensUsed(items) * per;
}
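
As a worked example, a single gpt-4.1 turn with partially cached input prices out like this under the table above (the same figures are asserted in the cached-discount test further down):

import { estimateCostFromUsage } from "./estimate-cost.js";

const cost = estimateCostFromUsage(
  {
    input_tokens: 1000,
    input_tokens_details: { cached_tokens: 600 },
    output_tokens: 500,
    total_tokens: 1500,
  },
  "gpt-4.1",
);
// (1000 - 600) * $2/1M + 600 * $0.50/1M + 500 * $8/1M
//   = $0.0008 + $0.0003 + $0.0040 = $0.0051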

View File

@@ -0,0 +1,138 @@
import type { ResponseItem } from "openai/resources/responses/responses.mjs";
import { approximateTokensUsed } from "./approximate-tokens-used.js";
import {
estimateCostFromUsage,
pricePerToken,
type UsageBreakdown,
} from "./estimate-cost.js";
/**
* Simple accumulator for {@link ResponseItem}s that exposes aggregate token
* and (approximate) dollar-cost statistics for the current conversation.
*/
export class SessionCostTracker {
private readonly model: string;
private readonly items: Array<ResponseItem> = [];
private tokensUsedPrecise: number | null = null;
/**
* Aggregated exact cost when we have detailed `usage` information from the
* OpenAI API. Falls back to `null` when we only have the rough estimate
* path available.
*/
private costPrecise: number | null = null;
constructor(model: string) {
this.model = model;
}
/** Append newly-received items to the internal history. */
addItems(items: Array<ResponseItem>): void {
this.items.push(...items);
}
/**
* Add a full usage breakdown as returned by the Responses API. This gives
* us exact token counts and allows true-to-spec cost accounting that
* factors in cached tokens.
*/
addUsage(usage: UsageBreakdown): void {
const tokens =
usage.total_tokens ??
(usage.input_tokens ?? 0) + (usage.output_tokens ?? 0);
if (Number.isFinite(tokens) && tokens > 0) {
this.tokensUsedPrecise = (this.tokensUsedPrecise ?? 0) + tokens;
}
const cost = estimateCostFromUsage(usage, this.model);
if (cost != null) {
this.costPrecise = (this.costPrecise ?? 0) + cost;
}
}
/** Legacy helper for callers that only know the total token count. */
addTokens(count: number): void {
if (Number.isFinite(count) && count > 0) {
this.tokensUsedPrecise = (this.tokensUsedPrecise ?? 0) + count;
// We deliberately do *not* update costPrecise here: without a detailed
// breakdown we cannot know whether tokens were input/output/cached. We
// therefore fall back to the blended rate during `getCostUSD()`.
}
}
/** Approximate total token count so far. */
getTokensUsed(): number {
if (this.tokensUsedPrecise != null) {
return this.tokensUsedPrecise;
}
return approximateTokensUsed(this.items);
}
/** Best-effort USD cost estimate. Returns `null` when the model is unknown. */
getCostUSD(): number | null {
if (this.costPrecise != null) {
return this.costPrecise;
}
const per = pricePerToken(this.model);
if (per == null) {
return null;
}
return this.getTokensUsed() * per;
}
/**
* Human-readable one-liner suitable for printing at session end (e.g. on
* Ctrl-C or `/clear`).
*/
summary(): string {
const tokens = this.getTokensUsed();
const cost = this.getCostUSD();
if (cost == null) {
return `Session complete: approx. ${tokens} tokens used.`;
}
return `Session complete: approx. ${tokens} tokens, $${cost.toFixed(
4,
)} USD.`;
}
}
// ────────────────────────────────────────────────────────────────────────────
// Global helpers so disparate parts of the codebase can share a single
// tracker instance without threading it through countless function calls.
// ────────────────────────────────────────────────────────────────────────────
let globalTracker: SessionCostTracker | null = null;
export function getSessionTracker(): SessionCostTracker | null {
return globalTracker;
}
export function ensureSessionTracker(model: string): SessionCostTracker {
if (!globalTracker) {
globalTracker = new SessionCostTracker(model);
}
return globalTracker;
}
export function resetSessionTracker(): void {
globalTracker = null;
}
/**
* Convenience helper that prints the session summary (if any) and resets the
* global tracker so that the next conversation starts with a clean slate.
*/
export function printAndResetSessionSummary(): void {
if (!globalTracker) {
return; // nothing to do
}
// eslint-disable-next-line no-console -- explicit, user-visible log
console.log("\n" + globalTracker.summary() + "\n");
resetSessionTracker();
}
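
A condensed sketch of the tracker's lifecycle as the rest of the changeset wires it up: the agent loop feeds usage into the shared instance, and the /clear handler (or the exit hook) prints and resets it. The numbers below are illustrative only.

import {
  ensureSessionTracker,
  printAndResetSessionSummary,
} from "./session-cost.js";

// During the conversation the agent loop records exact usage when the API provides it.
ensureSessionTracker("gpt-4.1").addUsage({
  input_tokens: 1200,
  input_tokens_details: { cached_tokens: 200 },
  output_tokens: 300,
  total_tokens: 1500,
});

// On /clear or Ctrl-C the accumulated summary is printed, e.g.
// "Session complete: approx. 1500 tokens, $0.0045 USD.", and the tracker is reset.
printAndResetSessionSummary();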

View File

@@ -1,6 +1,9 @@
import type { Instance } from "ink";
import type React from "react";
// Cost-tracking
import { printAndResetSessionSummary } from "./session-cost.js";
let inkRenderer: Instance | null = null;
// Track whether the cleanup routine has already executed so repeat calls are
@@ -79,4 +82,12 @@ export function onExit(): void {
/* best-effort: continue even if Ink throws */
}
}
// Finally, print a brief token/cost summary for the session; best-effort
// only, errors are swallowed so that shutdown always succeeds.
try {
printAndResetSessionSummary();
} catch {
/* ignore */
}
}

View File

@@ -0,0 +1,28 @@
import { describe, expect, it } from "vitest";
import type { ResponseItem } from "openai/resources/responses/responses.mjs";
import { calculateContextPercentRemaining } from "../src/components/chat/terminal-chat-utils.js";
function makeUserMessage(id: string, text: string): ResponseItem {
return {
id,
type: "message",
role: "user",
content: [{ type: "input_text", text }],
} as ResponseItem;
}
describe("calculateContextPercentRemaining", () => {
it("includes extra context characters in calculation", () => {
const msgText = "a".repeat(40); // 40 chars → 10 tokens
const items = [makeUserMessage("1", msgText)];
const model = "gpt-4-16k";
const base = calculateContextPercentRemaining(items, model);
const withExtra = calculateContextPercentRemaining(items, model, 8); // +8 chars → +2 tokens
expect(withExtra).toBeLessThan(base);
});
});

View File

@@ -0,0 +1,69 @@
import { describe, expect, test } from "vitest";
import {
estimateCostUSD,
estimateCostFromUsage,
} from "../src/utils/estimate-cost.js";
import { SessionCostTracker } from "../src/utils/session-cost.js";
import type { ResponseItem } from "openai/resources/responses/responses.mjs";
// Helper to craft a minimal ResponseItem for tests
function makeMessage(
id: string,
role: "user" | "assistant",
text: string,
): ResponseItem {
return {
id,
type: "message",
role,
content: [{ type: role === "user" ? "input_text" : "output_text", text }],
} as ResponseItem;
}
describe("estimateCostUSD", () => {
test("returns a proportional, positive estimate for known models", () => {
const items: Array<ResponseItem> = [
makeMessage("1", "user", "hello world"),
makeMessage("2", "assistant", "hi there"),
];
const cost = estimateCostUSD(items, "gpt-3.5-turbo");
expect(cost).not.toBeNull();
expect(cost!).toBeGreaterThan(0);
// Adding another token should increase the estimate
const cost2 = estimateCostUSD(
items.concat([makeMessage("3", "user", "extra")]),
"gpt-3.5-turbo",
);
expect(cost2!).toBeGreaterThan(cost!);
});
test("cost calculation honours cached input token discount", () => {
const usage = {
input_tokens: 1000,
input_tokens_details: { cached_tokens: 600 },
output_tokens: 500,
total_tokens: 1500,
} as any; // simple literal structure for test
const cost = estimateCostFromUsage(usage, "gpt-4.1");
// Expected: (1000-600)*0.000002 + 600*0.0000005 + 500*0.000008
const expected = 400 * 0.000002 + 600 * 0.0000005 + 500 * 0.000008;
expect(cost).not.toBeNull();
expect(cost!).toBeCloseTo(expected, 8);
});
});
describe("SessionCostTracker", () => {
test("accumulates items and reports tokens & cost", () => {
const tracker = new SessionCostTracker("gpt-3.5-turbo");
tracker.addItems([makeMessage("1", "user", "foo")]);
tracker.addItems([makeMessage("2", "assistant", "bar baz")]);
expect(tracker.getTokensUsed()).toBeGreaterThan(0);
expect(tracker.getCostUSD()!).toBeGreaterThan(0);
});
});

View File

@@ -0,0 +1,85 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import type { ResponseItem } from "openai/resources/responses/responses.mjs";
import {
ensureSessionTracker,
getSessionTracker,
printAndResetSessionSummary,
} from "../src/utils/session-cost.js";
function makeMessage(
id: string,
role: "user" | "assistant",
text: string,
): ResponseItem {
return {
id,
type: "message",
role,
content: [{ type: role === "user" ? "input_text" : "output_text", text }],
} as ResponseItem;
}
describe("printAndResetSessionSummary", () => {
afterEach(() => {
vi.restoreAllMocks();
});
it("/clear resets tracker so successive conversations start fresh", () => {
const spy = vi.spyOn(console, "log").mockImplementation(() => {});
const perSessionTokens: Array<number> = [];
for (let i = 1; i <= 3; i++) {
const tracker = ensureSessionTracker("gpt-3.5-turbo");
tracker.addTokens(i * 10); // 10, 20, 30
perSessionTokens.push(tracker.getTokensUsed());
// Simulate user typing /clear which prints & resets
printAndResetSessionSummary();
expect(getSessionTracker()).toBeNull();
}
expect(perSessionTokens).toEqual([10, 20, 30]);
spy.mockRestore();
});
it("prints a summary and resets the global tracker", () => {
const spy = vi.spyOn(console, "log").mockImplementation(() => {});
const tracker = ensureSessionTracker("gpt-3.5-turbo");
tracker.addItems([
makeMessage("1", "user", "hello"),
makeMessage("2", "assistant", "hi"),
]);
printAndResetSessionSummary();
expect(spy).toHaveBeenCalled();
expect(getSessionTracker()).toBeNull();
});
it("prefers exact token counts added via addTokens() over heuristic", () => {
const tracker = ensureSessionTracker("gpt-3.5-turbo");
// Add a long message (heuristic would count >1 token)
tracker.addItems([
makeMessage("x", "user", "a".repeat(400)), // ~100 tokens
]);
const heuristicTokens = tracker.getTokensUsed();
expect(heuristicTokens).toBeGreaterThan(50);
// Now inject an exact low token count and ensure it overrides
tracker.addTokens(10);
expect(tracker.getTokensUsed()).toBe(10);
const cost = tracker.getCostUSD();
expect(cost).not.toBeNull();
});
});