fix(mcp): auto-reconnect MCP servers after transient listTools failure

When tools() encounters a listTools() failure, fork a background
reconnection fiber via createAndStore() instead of permanently
removing the server. The current turn gets no tools from that server,
but the background fiber re-establishes the connection so tools are
available on the next turn.

Previously, a single transient error (network blip, server restart)
would permanently delete the MCP server from the session with no
recovery path.
This commit is contained in:
Kit Langton
2026-03-25 08:53:27 -04:00
parent 6c45a55955
commit 50ac5b62f8
2 changed files with 22 additions and 7 deletions

View File

@@ -24,7 +24,7 @@ import { BusEvent } from "../bus/bus-event"
import { Bus } from "@/bus"
import { TuiEvent } from "@/cli/cmd/tui/event"
import open from "open"
import { Effect, Layer, ServiceMap, Stream } from "effect"
import { Effect, Layer, Scope, ServiceMap, Stream } from "effect"
import { InstanceState } from "@/effect/instance-state"
import { makeRunPromise } from "@/effect/run-service"
import { ChildProcess, ChildProcessSpawner } from "effect/unstable/process"
@@ -464,6 +464,7 @@ export namespace MCP {
export const layer = Layer.effect(
Service,
Effect.gen(function* () {
const scope = yield* Scope.Scope
const spawner = yield* ChildProcessSpawner.ChildProcessSpawner
const auth = yield* McpAuth.Service
@@ -658,7 +659,18 @@ export namespace MCP {
return e
},
}).pipe(Effect.option)
if (toolsResult._tag === "None") return
if (toolsResult._tag === "None") {
// Fork background reconnection — tools come back next turn
const mcpConfig = config[clientName]
if (mcpConfig && isMcpConfigured(mcpConfig)) {
yield* createAndStore(clientName, mcpConfig).pipe(
Effect.catch(() => Effect.void),
Effect.forkIn(scope),
)
}
return
}
const mcpConfig = config[clientName]
const entry = isMcpConfigured(mcpConfig) ? mcpConfig : undefined

View File

@@ -185,7 +185,7 @@ function withInstance(
// Bug #1: tools() silently deletes clients on transient listTools() failure
// ========================================================================
test.todo(
test(
"tools() removes client from state when listTools fails",
withInstance(
{
@@ -216,9 +216,10 @@ test.todo(
serverState.listToolsShouldFail = true
await MCP.tools()
// After the transient error clears, tools should recover
// automatically — the server shouldn't be permanently removed.
// After the transient error clears, the background reconnection
// fiber should restore the server. Give it a tick to complete.
serverState.listToolsShouldFail = false
await new Promise((r) => setTimeout(r, 100))
const toolsRecovered = await MCP.tools()
expect(Object.keys(toolsRecovered).length).toBeGreaterThan(0)
@@ -230,7 +231,7 @@ test.todo(
// Bug #2: status() shows stale data after tools() deletes a client
// ========================================================================
test.todo(
test(
"status shows 'failed' after tools() encounters a transient error",
withInstance(
{
@@ -256,7 +257,9 @@ test.todo(
await MCP.tools()
serverState.listToolsShouldFail = false
// After the error clears, status should recover automatically
// Give background reconnection fiber time to complete
await new Promise((r) => setTimeout(r, 100))
await MCP.tools()
const statusAfter = await MCP.status()
expect(statusAfter["status-server"]?.status).toBe("connected")