diff --git a/packages/llm/README.md b/packages/llm/README.md
index e164c4bf53..321bf715bb 100644
--- a/packages/llm/README.md
+++ b/packages/llm/README.md
@@ -35,26 +35,25 @@ Run `LLMClient.stream(request)` instead of `generate` when you want incremental
 
 ## Caching
 
-Prompt caching is unified across providers. Mark content with a `CacheHint` and each protocol translates it to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock; OpenAI's implicit caching needs no markers).
+Prompt caching is **on by default**. An `LLMRequest` that omits `cache` resolves to `cache: "auto"`; callers opt out with `cache: "none"` or pass a policy object for granular control. Each protocol translates `CacheHint`s to its wire format (`cache_control` on Anthropic, `cachePoint` on Bedrock). OpenAI and Gemini do implicit caching server-side and don't accept inline markers, so `"auto"` is a no-op there.
 
 ### Auto placement
 
-The simplest path is `cache: "auto"` on the request:
+`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
+
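+The math justifies the default: Anthropic's 5-minute cache write costs 1.25× the base input price and a read costs 0.1×, so a single reuse within 5 minutes already wins (1.25 + 0.1 = 1.35× versus 2× uncached). One-shot completions below the per-model minimum-cacheable-token threshold silently no-op on the wire, so they cost nothing extra; a one-shot prompt above the threshold pays the 0.25× write premium once with no read to amortize it, so pass `cache: "none"` for calls you know will never be repeated.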
+
+### Opting out
 
 ```ts
 LLM.request({
   model,
   system,
-  messages,
-  tools,
-  cache: "auto",
+  prompt: "one-off question",
+  cache: "none",
 })
 ```
 
-`"auto"` places three breakpoints — last tool definition, last system part, latest user message. The last-user-message boundary is the load-bearing detail: in a tool-use loop, a single user turn expands into many assistant/tool round-trips, all sharing that prefix. Caching at that boundary lets every intra-turn API call hit.
-
-On OpenAI and Gemini `"auto"` is a no-op (their wire formats don't accept inline markers — both use implicit caching). On Anthropic and Bedrock it emits provider-native cache markers.
-
 ### Granular policy
 
 ```ts
diff --git a/packages/llm/src/cache-policy.ts b/packages/llm/src/cache-policy.ts
index b9dca4e88f..6ab7a049fe 100644
--- a/packages/llm/src/cache-policy.ts
+++ b/packages/llm/src/cache-policy.ts
@@ -24,15 +24,15 @@ const AUTO: CachePolicyObject = {
 const NONE: CachePolicyObject = {}
 
 // Resolution rules:
-//   - undefined   → "none" (opt-in default so the policy never changes wire
-//                   shape for existing callers; downstream code can flip to
-//                   `cache: "auto"` once they audit the placement choices).
-//   - "auto"      → the recommended policy: tools + system + latest user msg.
+//   - undefined   → "auto" — caching is on by default. The math favors it:
+//                   Anthropic's 5m cache write costs 1.25x base, a read 0.1x,
+//                   so one reuse within 5 minutes wins (1.35x vs 2x uncached).
+//   - "auto"      → the AUTO policy: tools + system + latest user msg.
 //   - "none"      → no auto placement; manual `CacheHint`s still flow.
 //   - object form → exactly what the caller asked for.
 const resolve = (policy: CachePolicy | undefined): CachePolicyObject => {
-  if (policy === undefined || policy === "none") return NONE
-  if (policy === "auto") return AUTO
+  if (policy === undefined || policy === "auto") return AUTO
+  if (policy === "none") return NONE
   return policy
 }
 
diff --git a/packages/llm/test/cache-policy.test.ts b/packages/llm/test/cache-policy.test.ts
index 640556105e..e742ca5e69 100644
--- a/packages/llm/test/cache-policy.test.ts
+++ b/packages/llm/test/cache-policy.test.ts
@@ -33,7 +33,7 @@ const geminiModel = Gemini.model({
 })
 
 describe("applyCachePolicy", () => {
-  it.effect("undefined cache leaves the request untouched (opt-in default)", () =>
+  it.effect("undefined cache resolves to 'auto' (the recommended default)", () =>
     Effect.gen(function* () {
       const prepared = yield* LLMClient.prepare(
         LLM.request({
@@ -43,8 +43,11 @@ describe("applyCachePolicy", () => {
         }),
       )
 
+      // No explicit cache field → auto policy fires → last system part + latest
+      // user message both get cache_control markers.
       expect(prepared.body).toMatchObject({
-        system: [{ type: "text", text: "You are concise.", cache_control: undefined }],
+        system: [{ type: "text", text: "You are concise.", cache_control: { type: "ephemeral" } }],
+        messages: [{ role: "user", content: [{ type: "text", text: "hi", cache_control: { type: "ephemeral" } }] }],
       })
     }),
   )
@@ -252,6 +255,7 @@ describe("applyCachePolicy", () => {
     const request = LLM.request({
       model: anthropicModel,
       prompt: "hi",
+      cache: "none",
     })
     expect(applyCachePolicy(request)).toBe(request)
   })
diff --git a/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
index cee31de19b..cb144b1a5d 100644
--- a/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
+++ b/packages/llm/test/provider/anthropic-messages-cache.recorded.test.ts
@@ -20,6 +20,9 @@ const cacheRequest = LLM.request({
   model,
   system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
   prompt: "Say hi.",
+  // Manual hint on the system part is the only marker we want here — skip the
+  // auto-policy's latest-user-message breakpoint so the cassette body matches.
+  cache: "none",
   generation: { maxTokens: 16, temperature: 0 },
 })
 
diff --git a/packages/llm/test/provider/anthropic-messages.test.ts b/packages/llm/test/provider/anthropic-messages.test.ts
index 3be041c94c..a867d16591 100644
--- a/packages/llm/test/provider/anthropic-messages.test.ts
+++ b/packages/llm/test/provider/anthropic-messages.test.ts
@@ -18,6 +18,9 @@ const request = LLM.request({
   model,
   system: { type: "text", text: "You are concise.", cache: new CacheHint({ type: "ephemeral" }) },
   prompt: "Say hello.",
+  // This fixture predates the `cache: "auto"` default; pin the policy off so
+  // existing wire-shape assertions only see the manual hint on the system part.
+  cache: "none",
   generation: { maxTokens: 20, temperature: 0 },
 })
 
@@ -48,6 +51,7 @@ describe("Anthropic Messages route", () => {
             LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: { query: "weather" } })]),
             LLM.toolMessage({ id: "call_1", name: "lookup", result: { forecast: "sunny" } }),
           ],
+          cache: "none",
         }),
       )
 
diff --git a/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
index 400e38849e..16c44099ce 100644
--- a/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
+++ b/packages/llm/test/provider/bedrock-converse-cache.recorded.test.ts
@@ -27,6 +27,9 @@ const cacheRequest = LLM.request({
   model,
   system: [{ type: "text", text: LARGE_CACHEABLE_SYSTEM, cache: new CacheHint({ type: "ephemeral" }) }],
   prompt: "Say hi.",
+  // Manual hint on the system part is the only marker we want here — skip the
+  // auto-policy's latest-user-message breakpoint so the cassette body matches.
+  cache: "none",
   generation: { maxTokens: 16, temperature: 0 },
 })
 
diff --git a/packages/llm/test/provider/bedrock-converse.test.ts b/packages/llm/test/provider/bedrock-converse.test.ts
index afadd89ac7..208b565272 100644
--- a/packages/llm/test/provider/bedrock-converse.test.ts
+++ b/packages/llm/test/provider/bedrock-converse.test.ts
@@ -63,6 +63,9 @@ const baseRequest = LLM.request({
   model,
   system: "You are concise.",
   prompt: "Say hello.",
+  // Wire-shape assertions in this file predate the `cache: "auto"` default;
+  // pin the policy off so they only exercise the lowering path itself.
+  cache: "none",
   generation: { maxTokens: 64, temperature: 0 },
 })
 
@@ -125,6 +128,7 @@ describe("Bedrock Converse route", () => {
             LLM.assistant([LLM.toolCall({ id: "tool_1", name: "lookup", input: { query: "weather" } })]),
             LLM.toolMessage({ id: "tool_1", name: "lookup", result: { forecast: "sunny" } }),
           ],
+          cache: "none",
         }),
       )
 
@@ -339,6 +343,7 @@ describe("Bedrock Converse route", () => {
               { type: "media", mediaType: "image/webp", data: "DDDD" },
             ]),
           ],
+          cache: "none",
         }),
       )
 
@@ -470,6 +475,7 @@ describe("Bedrock Converse route", () => {
             LLM.assistant([LLM.toolCall({ id: "call_1", name: "lookup", input: {} })]),
             LLM.toolMessage({ id: "call_1", name: "lookup", result: { temp: 72 }, cache }),
           ],
+          cache: "none",
         }),
       )
 
@@ -555,6 +561,7 @@ describe("Bedrock Converse recorded", () => {
           model: recordedModel(),
           system: "Reply with the single word 'Hello'.",
           prompt: "Say hello.",
+          cache: "none",
           generation: { maxTokens: 16, temperature: 0 },
         }),
       )
@@ -577,6 +584,7 @@ describe("Bedrock Converse recorded", () => {
           prompt: "Call get_weather with city exactly Paris.",
           tools: [weatherTool],
           toolChoice: LLM.toolChoice(weatherTool),
+          cache: "none",
           generation: { maxTokens: 80, temperature: 0 },
         }),
       )
diff --git a/packages/llm/test/recorded-scenarios.ts b/packages/llm/test/recorded-scenarios.ts
index 8a02bc3a0a..127a444a16 100644
--- a/packages/llm/test/recorded-scenarios.ts
+++ b/packages/llm/test/recorded-scenarios.ts
@@ -51,6 +51,7 @@ export const textRequest = (input: {
     model: input.model,
     system: "You are concise.",
     prompt: input.prompt ?? "Reply with exactly: Hello!",
+    cache: "none",
     generation:
       input.temperature === false
         ? { maxTokens: input.maxTokens ?? 20 }
@@ -70,6 +71,7 @@ export const weatherToolRequest = (input: {
     prompt: "Call get_weather with city exactly Paris.",
     tools: [weatherTool],
     toolChoice: LLM.toolChoice(weatherTool),
+    cache: "none",
     generation:
       input.temperature === false
         ? { maxTokens: input.maxTokens ?? 80 }
@@ -88,6 +90,7 @@ export const weatherToolLoopRequest = (input: {
     model: input.model,
     system: input.system ?? "Use the get_weather tool, then answer in one short sentence.",
     prompt: "What is the weather in Paris?",
+    cache: "none",
     generation:
       input.temperature === false
         ? { maxTokens: input.maxTokens ?? 80 }