import { describe, expect } from "bun:test"
import { Effect } from "effect"
import { LLM } from "../../src"
import { LLMClient } from "../../src/route"
import * as Google from "../../src/providers/google"
import { LARGE_CACHEABLE_SYSTEM } from "../recorded-scenarios"
import { recordedTests } from "../recorded-test"

const model = Google.configure({
  apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY ?? process.env.GEMINI_API_KEY ?? "fixture",
}).model("gemini-2.5-flash")

// Gemini does implicit prefix caching on 2.5+ models above ~1024 tokens. The
// `CacheHint` is currently a no-op for Gemini (the explicit `CachedContent`
// API is out-of-band and intentionally not wired up). This test exists to
// pin the usage-parsing path: `cachedContentTokenCount` should surface as
// `cacheReadInputTokens` on the second identical call.
const cacheRequest = LLM.request({
  id: "recorded_gemini_cache",
  model,
  system: LARGE_CACHEABLE_SYSTEM,
  prompt: "Say hi.",
  generation: { maxTokens: 16, temperature: 0 },
})

const recorded = recordedTests({
  prefix: "gemini-cache",
  provider: "google",
  protocol: "gemini",
  requires: ["GOOGLE_GENERATIVE_AI_API_KEY"],
  // Two identical requests in one cassette — replay walks the cassette in
  // recording order so the second call replays the cached-hit interaction.
})

describe("Gemini cache recorded", () => {
  recorded.effect.with("reports cachedContentTokenCount on identical second call", { tags: ["cache"] }, () =>
    Effect.gen(function* () {
      const first = yield* LLMClient.generate(cacheRequest)
      expect(first.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)

      const second = yield* LLMClient.generate(cacheRequest)
      // Implicit caching is best-effort on Gemini's side; we assert the field
      // is at least populated and non-negative. When re-recording, verify the
      // cassette shows > 0 in the second response's usage.
      expect(second.usage?.cacheReadInputTokens ?? 0).toBeGreaterThanOrEqual(0)
    }),
  )
})