Compare commits

...

5 Commits

Author SHA1 Message Date
sdcoffey
4b14ab36b9 update doc 2026-03-08 21:53:47 -07:00
sdcoffey
81f2b2a0d9 diagrams 2026-03-08 21:28:27 -07:00
sdcoffey
814f0623a5 prototype sdkv2 on app-server 2026-03-08 21:13:07 -07:00
sdcoffey
15ba6609c9 wip full delegation 2026-03-07 09:47:05 -08:00
sdcoffey
dd9cc542ed plan 2026-03-06 16:54:21 -08:00
78 changed files with 5020 additions and 26 deletions

View File

@@ -33,6 +33,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User has approved this command, but wants the agent to execute a replacement command instead of the originally proposed one.",
"properties": {
"approved_with_command_override": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"approved_with_command_override"
],
"title": "ApprovedWithCommandOverrideReviewDecision",
"type": "object"
},
{
"additionalProperties": false,
"description": "User has approved this command and wants to apply the proposed execpolicy amendment so future matching commands are permitted.",

View File

@@ -1790,6 +1790,34 @@
}
]
},
"SdkDelegationConfig": {
"properties": {
"bridgeUrl": {
"description": "Base URL for the host-managed Responses bridge reachable by the Codex runtime.",
"type": "string"
},
"modelProviderId": {
"description": "Optional model-provider id to register for this thread. Defaults to `codex-sdk-v2`.",
"type": [
"string",
"null"
]
},
"streamIdleTimeoutMs": {
"description": "Optional stream idle timeout override for the delegated provider.",
"format": "uint64",
"minimum": 0.0,
"type": [
"integer",
"null"
]
}
},
"required": [
"bridgeUrl"
],
"type": "object"
},
"ServiceTier": {
"enum": [
"fast",

View File

@@ -216,6 +216,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User approved execution, but wants to replace the command before it runs.",
"properties": {
"acceptWithCommandOverride": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"acceptWithCommandOverride"
],
"title": "AcceptWithCommandOverrideCommandExecutionApprovalDecision",
"type": "object"
},
{
"description": "User approved the command and future prompts in the same session-scoped approval cache should run without prompting.",
"enum": [

View File

@@ -10,6 +10,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User approved execution, but wants to replace the command before it runs.",
"properties": {
"acceptWithCommandOverride": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"acceptWithCommandOverride"
],
"title": "AcceptWithCommandOverrideCommandExecutionApprovalDecision",
"type": "object"
},
{
"description": "User approved the command and future prompts in the same session-scoped approval cache should run without prompting.",
"enum": [

View File

@@ -5252,6 +5252,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User has approved this command, but wants the agent to execute a replacement command instead of the originally proposed one.",
"properties": {
"approved_with_command_override": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"approved_with_command_override"
],
"title": "ApprovedWithCommandOverrideReviewDecision",
"type": "object"
},
{
"additionalProperties": false,
"description": "User has approved this command and wants to apply the proposed execpolicy amendment so future matching commands are permitted.",

View File

@@ -33,6 +33,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User has approved this command, but wants the agent to execute a replacement command instead of the originally proposed one.",
"properties": {
"approved_with_command_override": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"approved_with_command_override"
],
"title": "ApprovedWithCommandOverrideReviewDecision",
"type": "object"
},
{
"additionalProperties": false,
"description": "User has approved this command and wants to apply the proposed execpolicy amendment so future matching commands are permitted.",

View File

@@ -1450,6 +1450,25 @@
}
]
},
"SdkDelegationConfiguredNotification": {
"properties": {
"bridgeUrl": {
"type": "string"
},
"modelProvider": {
"type": "string"
},
"threadId": {
"type": "string"
}
},
"required": [
"bridgeUrl",
"modelProvider",
"threadId"
],
"type": "object"
},
"ServerRequestResolvedNotification": {
"properties": {
"requestId": {
@@ -3648,6 +3667,26 @@
"title": "App/list/updatedNotification",
"type": "object"
},
{
"properties": {
"method": {
"enum": [
"codexSdk/delegationConfigured"
],
"title": "CodexSdk/delegationConfiguredNotificationMethod",
"type": "string"
},
"params": {
"$ref": "#/definitions/SdkDelegationConfiguredNotification"
}
},
"required": [
"method",
"params"
],
"title": "CodexSdk/delegationConfiguredNotification",
"type": "object"
},
{
"properties": {
"method": {

View File

@@ -282,6 +282,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User approved execution, but wants to replace the command before it runs.",
"properties": {
"acceptWithCommandOverride": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"acceptWithCommandOverride"
],
"title": "AcceptWithCommandOverrideCommandExecutionApprovalDecision",
"type": "object"
},
{
"description": "User approved the command and future prompts in the same session-scoped approval cache should run without prompting.",
"enum": [

View File

@@ -1514,6 +1514,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User approved execution, but wants to replace the command before it runs.",
"properties": {
"acceptWithCommandOverride": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"acceptWithCommandOverride"
],
"title": "AcceptWithCommandOverrideCommandExecutionApprovalDecision",
"type": "object"
},
{
"description": "User approved the command and future prompts in the same session-scoped approval cache should run without prompting.",
"enum": [
@@ -6587,6 +6612,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User has approved this command, but wants the agent to execute a replacement command instead of the originally proposed one.",
"properties": {
"approved_with_command_override": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"approved_with_command_override"
],
"title": "ApprovedWithCommandOverrideReviewDecision",
"type": "object"
},
{
"additionalProperties": false,
"description": "User has approved this command and wants to apply the proposed execpolicy amendment so future matching commands are permitted.",
@@ -7261,6 +7311,26 @@
"title": "App/list/updatedNotification",
"type": "object"
},
{
"properties": {
"method": {
"enum": [
"codexSdk/delegationConfigured"
],
"title": "CodexSdk/delegationConfiguredNotificationMethod",
"type": "string"
},
"params": {
"$ref": "#/definitions/v2/SdkDelegationConfiguredNotification"
}
},
"required": [
"method",
"params"
],
"title": "CodexSdk/delegationConfiguredNotification",
"type": "object"
},
{
"properties": {
"method": {
@@ -13161,6 +13231,55 @@
},
"type": "object"
},
"SdkDelegationConfig": {
"properties": {
"bridgeUrl": {
"description": "Base URL for the host-managed Responses bridge reachable by the Codex runtime.",
"type": "string"
},
"modelProviderId": {
"description": "Optional model-provider id to register for this thread. Defaults to `codex-sdk-v2`.",
"type": [
"string",
"null"
]
},
"streamIdleTimeoutMs": {
"description": "Optional stream idle timeout override for the delegated provider.",
"format": "uint64",
"minimum": 0.0,
"type": [
"integer",
"null"
]
}
},
"required": [
"bridgeUrl"
],
"type": "object"
},
"SdkDelegationConfiguredNotification": {
"$schema": "http://json-schema.org/draft-07/schema#",
"properties": {
"bridgeUrl": {
"type": "string"
},
"modelProvider": {
"type": "string"
},
"threadId": {
"type": "string"
}
},
"required": [
"bridgeUrl",
"modelProvider",
"threadId"
],
"title": "SdkDelegationConfiguredNotification",
"type": "object"
},
"ServerRequestResolvedNotification": {
"$schema": "http://json-schema.org/draft-07/schema#",
"properties": {

View File

@@ -9801,6 +9801,31 @@
],
"type": "string"
},
{
"additionalProperties": false,
"description": "User has approved this command, but wants the agent to execute a replacement command instead of the originally proposed one.",
"properties": {
"approved_with_command_override": {
"properties": {
"command": {
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"command"
],
"type": "object"
}
},
"required": [
"approved_with_command_override"
],
"title": "ApprovedWithCommandOverrideReviewDecision",
"type": "object"
},
{
"additionalProperties": false,
"description": "User has approved this command and wants to apply the proposed execpolicy amendment so future matching commands are permitted.",
@@ -10245,6 +10270,55 @@
},
"type": "object"
},
"SdkDelegationConfig": {
"properties": {
"bridgeUrl": {
"description": "Base URL for the host-managed Responses bridge reachable by the Codex runtime.",
"type": "string"
},
"modelProviderId": {
"description": "Optional model-provider id to register for this thread. Defaults to `codex-sdk-v2`.",
"type": [
"string",
"null"
]
},
"streamIdleTimeoutMs": {
"description": "Optional stream idle timeout override for the delegated provider.",
"format": "uint64",
"minimum": 0.0,
"type": [
"integer",
"null"
]
}
},
"required": [
"bridgeUrl"
],
"type": "object"
},
"SdkDelegationConfiguredNotification": {
"$schema": "http://json-schema.org/draft-07/schema#",
"properties": {
"bridgeUrl": {
"type": "string"
},
"modelProvider": {
"type": "string"
},
"threadId": {
"type": "string"
}
},
"required": [
"bridgeUrl",
"modelProvider",
"threadId"
],
"title": "SdkDelegationConfiguredNotification",
"type": "object"
},
"ServerNotification": {
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Notification sent from the server to the client.",
@@ -10771,6 +10845,26 @@
"title": "App/list/updatedNotification",
"type": "object"
},
{
"properties": {
"method": {
"enum": [
"codexSdk/delegationConfigured"
],
"title": "CodexSdk/delegationConfiguredNotificationMethod",
"type": "string"
},
"params": {
"$ref": "#/definitions/SdkDelegationConfiguredNotification"
}
},
"required": [
"method",
"params"
],
"title": "CodexSdk/delegationConfiguredNotification",
"type": "object"
},
{
"properties": {
"method": {

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"properties": {
"bridgeUrl": {
"type": "string"
},
"modelProvider": {
"type": "string"
},
"threadId": {
"type": "string"
}
},
"required": [
"bridgeUrl",
"modelProvider",
"threadId"
],
"title": "SdkDelegationConfiguredNotification",
"type": "object"
}

View File

@@ -76,6 +76,34 @@
],
"type": "string"
},
"SdkDelegationConfig": {
"properties": {
"bridgeUrl": {
"description": "Base URL for the host-managed Responses bridge reachable by the Codex runtime.",
"type": "string"
},
"modelProviderId": {
"description": "Optional model-provider id to register for this thread. Defaults to `codex-sdk-v2`.",
"type": [
"string",
"null"
]
},
"streamIdleTimeoutMs": {
"description": "Optional stream idle timeout override for the delegated provider.",
"format": "uint64",
"minimum": 0.0,
"type": [
"integer",
"null"
]
}
},
"required": [
"bridgeUrl"
],
"type": "object"
},
"ServiceTier": {
"enum": [
"fast",

View File

@@ -7,4 +7,4 @@ import type { NetworkPolicyAmendment } from "./NetworkPolicyAmendment";
/**
* User's decision in response to an ExecApprovalRequest.
*/
export type ReviewDecision = "approved" | { "approved_execpolicy_amendment": { proposed_execpolicy_amendment: ExecPolicyAmendment, } } | "approved_for_session" | { "network_policy_amendment": { network_policy_amendment: NetworkPolicyAmendment, } } | "denied" | "abort";
export type ReviewDecision = "approved" | { "approved_with_command_override": { command: Array<string>, } } | { "approved_execpolicy_amendment": { proposed_execpolicy_amendment: ExecPolicyAmendment, } } | "approved_for_session" | { "network_policy_amendment": { network_policy_amendment: NetworkPolicyAmendment, } } | "denied" | "abort";

View File

@@ -24,6 +24,7 @@ import type { RawResponseItemCompletedNotification } from "./v2/RawResponseItemC
import type { ReasoningSummaryPartAddedNotification } from "./v2/ReasoningSummaryPartAddedNotification";
import type { ReasoningSummaryTextDeltaNotification } from "./v2/ReasoningSummaryTextDeltaNotification";
import type { ReasoningTextDeltaNotification } from "./v2/ReasoningTextDeltaNotification";
import type { SdkDelegationConfiguredNotification } from "./v2/SdkDelegationConfiguredNotification";
import type { ServerRequestResolvedNotification } from "./v2/ServerRequestResolvedNotification";
import type { SkillsChangedNotification } from "./v2/SkillsChangedNotification";
import type { TerminalInteractionNotification } from "./v2/TerminalInteractionNotification";
@@ -49,4 +50,4 @@ import type { WindowsWorldWritableWarningNotification } from "./v2/WindowsWorldW
/**
* Notification sent from the server to the client.
*/
export type ServerNotification = { "method": "error", "params": ErrorNotification } | { "method": "thread/started", "params": ThreadStartedNotification } | { "method": "thread/status/changed", "params": ThreadStatusChangedNotification } | { "method": "thread/archived", "params": ThreadArchivedNotification } | { "method": "thread/unarchived", "params": ThreadUnarchivedNotification } | { "method": "thread/closed", "params": ThreadClosedNotification } | { "method": "skills/changed", "params": SkillsChangedNotification } | { "method": "thread/name/updated", "params": ThreadNameUpdatedNotification } | { "method": "thread/tokenUsage/updated", "params": ThreadTokenUsageUpdatedNotification } | { "method": "turn/started", "params": TurnStartedNotification } | { "method": "turn/completed", "params": TurnCompletedNotification } | { "method": "turn/diff/updated", "params": TurnDiffUpdatedNotification } | { "method": "turn/plan/updated", "params": TurnPlanUpdatedNotification } | { "method": "item/started", "params": ItemStartedNotification } | { "method": "item/completed", "params": ItemCompletedNotification } | { "method": "rawResponseItem/completed", "params": RawResponseItemCompletedNotification } | { "method": "item/agentMessage/delta", "params": AgentMessageDeltaNotification } | { "method": "item/plan/delta", "params": PlanDeltaNotification } | { "method": "item/commandExecution/outputDelta", "params": CommandExecutionOutputDeltaNotification } | { "method": "item/commandExecution/terminalInteraction", "params": TerminalInteractionNotification } | { "method": "item/fileChange/outputDelta", "params": FileChangeOutputDeltaNotification } | { "method": "serverRequest/resolved", "params": ServerRequestResolvedNotification } | { "method": "item/mcpToolCall/progress", "params": McpToolCallProgressNotification } | { "method": "mcpServer/oauthLogin/completed", "params": McpServerOauthLoginCompletedNotification } | { "method": "account/updated", "params": AccountUpdatedNotification } 
| { "method": "account/rateLimits/updated", "params": AccountRateLimitsUpdatedNotification } | { "method": "app/list/updated", "params": AppListUpdatedNotification } | { "method": "item/reasoning/summaryTextDelta", "params": ReasoningSummaryTextDeltaNotification } | { "method": "item/reasoning/summaryPartAdded", "params": ReasoningSummaryPartAddedNotification } | { "method": "item/reasoning/textDelta", "params": ReasoningTextDeltaNotification } | { "method": "thread/compacted", "params": ContextCompactedNotification } | { "method": "model/rerouted", "params": ModelReroutedNotification } | { "method": "deprecationNotice", "params": DeprecationNoticeNotification } | { "method": "configWarning", "params": ConfigWarningNotification } | { "method": "fuzzyFileSearch/sessionUpdated", "params": FuzzyFileSearchSessionUpdatedNotification } | { "method": "fuzzyFileSearch/sessionCompleted", "params": FuzzyFileSearchSessionCompletedNotification } | { "method": "thread/realtime/started", "params": ThreadRealtimeStartedNotification } | { "method": "thread/realtime/itemAdded", "params": ThreadRealtimeItemAddedNotification } | { "method": "thread/realtime/outputAudio/delta", "params": ThreadRealtimeOutputAudioDeltaNotification } | { "method": "thread/realtime/error", "params": ThreadRealtimeErrorNotification } | { "method": "thread/realtime/closed", "params": ThreadRealtimeClosedNotification } | { "method": "windows/worldWritableWarning", "params": WindowsWorldWritableWarningNotification } | { "method": "windowsSandbox/setupCompleted", "params": WindowsSandboxSetupCompletedNotification } | { "method": "account/login/completed", "params": AccountLoginCompletedNotification };
export type ServerNotification = { "method": "error", "params": ErrorNotification } | { "method": "thread/started", "params": ThreadStartedNotification } | { "method": "thread/status/changed", "params": ThreadStatusChangedNotification } | { "method": "thread/archived", "params": ThreadArchivedNotification } | { "method": "thread/unarchived", "params": ThreadUnarchivedNotification } | { "method": "thread/closed", "params": ThreadClosedNotification } | { "method": "skills/changed", "params": SkillsChangedNotification } | { "method": "thread/name/updated", "params": ThreadNameUpdatedNotification } | { "method": "thread/tokenUsage/updated", "params": ThreadTokenUsageUpdatedNotification } | { "method": "turn/started", "params": TurnStartedNotification } | { "method": "turn/completed", "params": TurnCompletedNotification } | { "method": "turn/diff/updated", "params": TurnDiffUpdatedNotification } | { "method": "turn/plan/updated", "params": TurnPlanUpdatedNotification } | { "method": "item/started", "params": ItemStartedNotification } | { "method": "item/completed", "params": ItemCompletedNotification } | { "method": "rawResponseItem/completed", "params": RawResponseItemCompletedNotification } | { "method": "item/agentMessage/delta", "params": AgentMessageDeltaNotification } | { "method": "item/plan/delta", "params": PlanDeltaNotification } | { "method": "item/commandExecution/outputDelta", "params": CommandExecutionOutputDeltaNotification } | { "method": "item/commandExecution/terminalInteraction", "params": TerminalInteractionNotification } | { "method": "item/fileChange/outputDelta", "params": FileChangeOutputDeltaNotification } | { "method": "serverRequest/resolved", "params": ServerRequestResolvedNotification } | { "method": "item/mcpToolCall/progress", "params": McpToolCallProgressNotification } | { "method": "mcpServer/oauthLogin/completed", "params": McpServerOauthLoginCompletedNotification } | { "method": "account/updated", "params": AccountUpdatedNotification } 
| { "method": "account/rateLimits/updated", "params": AccountRateLimitsUpdatedNotification } | { "method": "app/list/updated", "params": AppListUpdatedNotification } | { "method": "codexSdk/delegationConfigured", "params": SdkDelegationConfiguredNotification } | { "method": "item/reasoning/summaryTextDelta", "params": ReasoningSummaryTextDeltaNotification } | { "method": "item/reasoning/summaryPartAdded", "params": ReasoningSummaryPartAddedNotification } | { "method": "item/reasoning/textDelta", "params": ReasoningTextDeltaNotification } | { "method": "thread/compacted", "params": ContextCompactedNotification } | { "method": "model/rerouted", "params": ModelReroutedNotification } | { "method": "deprecationNotice", "params": DeprecationNoticeNotification } | { "method": "configWarning", "params": ConfigWarningNotification } | { "method": "fuzzyFileSearch/sessionUpdated", "params": FuzzyFileSearchSessionUpdatedNotification } | { "method": "fuzzyFileSearch/sessionCompleted", "params": FuzzyFileSearchSessionCompletedNotification } | { "method": "thread/realtime/started", "params": ThreadRealtimeStartedNotification } | { "method": "thread/realtime/itemAdded", "params": ThreadRealtimeItemAddedNotification } | { "method": "thread/realtime/outputAudio/delta", "params": ThreadRealtimeOutputAudioDeltaNotification } | { "method": "thread/realtime/error", "params": ThreadRealtimeErrorNotification } | { "method": "thread/realtime/closed", "params": ThreadRealtimeClosedNotification } | { "method": "windows/worldWritableWarning", "params": WindowsWorldWritableWarningNotification } | { "method": "windowsSandbox/setupCompleted", "params": WindowsSandboxSetupCompletedNotification } | { "method": "account/login/completed", "params": AccountLoginCompletedNotification };

View File

@@ -4,4 +4,4 @@
import type { ExecPolicyAmendment } from "./ExecPolicyAmendment";
import type { NetworkPolicyAmendment } from "./NetworkPolicyAmendment";
export type CommandExecutionApprovalDecision = "accept" | "acceptForSession" | { "acceptWithExecpolicyAmendment": { execpolicy_amendment: ExecPolicyAmendment, } } | { "applyNetworkPolicyAmendment": { network_policy_amendment: NetworkPolicyAmendment, } } | "decline" | "cancel";
export type CommandExecutionApprovalDecision = "accept" | { "acceptWithCommandOverride": { command: Array<string>, } } | "acceptForSession" | { "acceptWithExecpolicyAmendment": { execpolicy_amendment: ExecPolicyAmendment, } } | { "applyNetworkPolicyAmendment": { network_policy_amendment: NetworkPolicyAmendment, } } | "decline" | "cancel";

View File

@@ -0,0 +1,18 @@
// GENERATED CODE! DO NOT MODIFY BY HAND!
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
export type SdkDelegationConfig = {
/**
* Base URL for the host-managed Responses bridge reachable by the Codex runtime.
*/
bridgeUrl: string,
/**
* Optional model-provider id to register for this thread.
* Defaults to `codex-sdk-v2`.
*/
modelProviderId: string | null,
/**
* Optional stream idle timeout override for the delegated provider.
*/
streamIdleTimeoutMs: bigint | null, };

View File

@@ -0,0 +1,5 @@
// GENERATED CODE! DO NOT MODIFY BY HAND!
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
export type SdkDelegationConfiguredNotification = { threadId: string, modelProvider: string, bridgeUrl: string, };

View File

@@ -8,6 +8,9 @@ import type { AskForApproval } from "./AskForApproval";
import type { SandboxMode } from "./SandboxMode";
export type ThreadStartParams = {model?: string | null, modelProvider?: string | null, serviceTier?: ServiceTier | null | null, cwd?: string | null, approvalPolicy?: AskForApproval | null, sandbox?: SandboxMode | null, config?: { [key in string]?: JsonValue } | null, serviceName?: string | null, baseInstructions?: string | null, developerInstructions?: string | null, personality?: Personality | null, ephemeral?: boolean | null, /**
* If true, require host-visible approval before executing built-in tools.
*/
manualToolExecution?: boolean, /**
* If true, opt into emitting raw Responses API items on the event stream.
* This is for internal use only (e.g. Codex Cloud).
*/

View File

@@ -176,6 +176,8 @@ export type { ReviewTarget } from "./ReviewTarget";
export type { SandboxMode } from "./SandboxMode";
export type { SandboxPolicy } from "./SandboxPolicy";
export type { SandboxWorkspaceWrite } from "./SandboxWorkspaceWrite";
export type { SdkDelegationConfig } from "./SdkDelegationConfig";
export type { SdkDelegationConfiguredNotification } from "./SdkDelegationConfiguredNotification";
export type { ServerRequestResolvedNotification } from "./ServerRequestResolvedNotification";
export type { SessionSource } from "./SessionSource";
export type { SkillDependencies } from "./SkillDependencies";

View File

@@ -790,6 +790,8 @@ server_notification_definitions! {
AccountUpdated => "account/updated" (v2::AccountUpdatedNotification),
AccountRateLimitsUpdated => "account/rateLimits/updated" (v2::AccountRateLimitsUpdatedNotification),
AppListUpdated => "app/list/updated" (v2::AppListUpdatedNotification),
#[experimental("thread/start.sdkDelegation")]
SdkDelegationConfigured => "codexSdk/delegationConfigured" (v2::SdkDelegationConfiguredNotification),
ReasoningSummaryTextDelta => "item/reasoning/summaryTextDelta" (v2::ReasoningSummaryTextDeltaNotification),
ReasoningSummaryPartAdded => "item/reasoning/summaryPartAdded" (v2::ReasoningSummaryPartAddedNotification),
ReasoningTextDelta => "item/reasoning/textDelta" (v2::ReasoningTextDeltaNotification),

View File

@@ -744,12 +744,15 @@ pub struct ConfigEdit {
pub merge_strategy: MergeStrategy,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)]
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS, ExperimentalApi)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
pub enum CommandExecutionApprovalDecision {
/// User approved the command.
Accept,
/// User approved execution, but wants to replace the command before it runs.
#[experimental("item/commandExecution/requestApproval.overrideCommand")]
AcceptWithCommandOverride { command: Vec<String> },
/// User approved the command and future prompts in the same session-scoped
/// approval cache should run without prompting.
AcceptForSession,
@@ -772,6 +775,9 @@ impl From<CoreReviewDecision> for CommandExecutionApprovalDecision {
fn from(value: CoreReviewDecision) -> Self {
match value {
CoreReviewDecision::Approved => Self::Accept,
CoreReviewDecision::ApprovedWithCommandOverride { command } => {
Self::AcceptWithCommandOverride { command }
}
CoreReviewDecision::ApprovedExecpolicyAmendment {
proposed_execpolicy_amendment,
} => Self::AcceptWithExecpolicyAmendment {
@@ -1868,6 +1874,13 @@ pub struct ThreadStartParams {
#[experimental("thread/start.dynamicTools")]
#[ts(optional = nullable)]
pub dynamic_tools: Option<Vec<DynamicToolSpec>>,
#[experimental("thread/start.builtinTools")]
#[ts(optional = nullable)]
pub builtin_tools: Option<Vec<String>>,
/// If true, require host-visible approval before executing built-in tools.
#[experimental("thread/start.manualToolExecution")]
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub manual_tool_execution: bool,
/// Test-only experimental field used to validate experimental gating and
/// schema filtering behavior in a stable way.
#[experimental("thread/start.mockExperimentalField")]
@@ -1883,6 +1896,24 @@ pub struct ThreadStartParams {
#[experimental("thread/start.persistFullHistory")]
#[serde(default)]
pub persist_extended_history: bool,
/// EXPERIMENTAL - route this thread's model traffic through a host-managed
/// bridge instead of the user's default provider configuration.
#[experimental("thread/start.sdkDelegation")]
#[ts(optional = nullable)]
pub sdk_delegation: Option<SdkDelegationConfig>,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
pub struct SdkDelegationConfig {
/// Base URL for the host-managed Responses bridge reachable by the Codex runtime.
pub bridge_url: String,
/// Optional model-provider id to register for this thread.
/// Defaults to `codex-sdk-v2`.
pub model_provider_id: Option<String>,
/// Optional stream idle timeout override for the delegated provider.
pub stream_idle_timeout_ms: Option<u64>,
}
#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, JsonSchema, TS)]
@@ -3729,6 +3760,15 @@ pub struct ThreadStartedNotification {
pub thread: Thread,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
pub struct SdkDelegationConfiguredNotification {
pub thread_id: String,
pub model_provider: String,
pub bridge_url: String,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
@@ -5552,6 +5592,74 @@ mod tests {
assert_eq!(serialized_without_override.get("serviceTier"), None);
}
#[test]
fn thread_start_params_round_trip_sdk_delegation() {
let params: ThreadStartParams = serde_json::from_value(json!({
"sdkDelegation": {
"bridgeUrl": "http://127.0.0.1:8080/v1",
"modelProviderId": "codex-sdk-v2",
"streamIdleTimeoutMs": 5000
}
}))
.expect("params should deserialize");
assert_eq!(
params.sdk_delegation,
Some(SdkDelegationConfig {
bridge_url: "http://127.0.0.1:8080/v1".to_string(),
model_provider_id: Some("codex-sdk-v2".to_string()),
stream_idle_timeout_ms: Some(5000),
})
);
let serialized = serde_json::to_value(&params).expect("params should serialize");
let mut expected =
serde_json::to_value(ThreadStartParams::default()).expect("params should serialize");
expected
.as_object_mut()
.expect("serialized params should be an object")
.insert(
"sdkDelegation".to_string(),
json!({
"bridgeUrl": "http://127.0.0.1:8080/v1",
"modelProviderId": "codex-sdk-v2",
"streamIdleTimeoutMs": 5000
}),
);
assert_eq!(serialized, expected);
}
#[test]
fn thread_start_params_round_trip_manual_tool_execution() {
let params: ThreadStartParams =
serde_json::from_value(json!({ "manualToolExecution": true }))
.expect("params should deserialize");
assert!(params.manual_tool_execution);
let serialized = serde_json::to_value(&params).expect("params should serialize");
let mut expected =
serde_json::to_value(ThreadStartParams::default()).expect("params should serialize");
expected
.as_object_mut()
.expect("serialized params should be an object")
.insert("manualToolExecution".to_string(), json!(true));
assert_eq!(serialized, expected);
}
#[test]
fn command_execution_request_approval_response_round_trip_command_override() {
let response = CommandExecutionRequestApprovalResponse {
decision: CommandExecutionApprovalDecision::AcceptWithCommandOverride {
command: vec!["ls".to_string(), "-la".to_string()],
},
};
let json = serde_json::to_value(&response).expect("serialize response");
let parsed: CommandExecutionRequestApprovalResponse =
serde_json::from_value(json).expect("deserialize response");
assert_eq!(parsed, response);
}
#[test]
fn turn_start_params_preserve_explicit_null_service_tier() {
let params: TurnStartParams = serde_json::from_value(json!({

View File

@@ -120,7 +120,7 @@ Example with notification opt-out:
## API Overview
- `thread/start` — create a new thread; emits `thread/started` (including the current `thread.status`) and auto-subscribes you to turn/item events for that thread.
- `thread/start` — create a new thread; emits `thread/started` (including the current `thread.status`) and auto-subscribes you to turn/item events for that thread. Experimental: `thread/start.sdkDelegation` lets a host integration register a per-thread delegated Responses provider that points Codex at a host-managed bridge. In the current prototype shape, Codex sends the raw Responses request body to that bridge and the host bridge injects upstream authorization before forwarding the request. Experimental: `thread/start.builtinTools` lets a client provide an exact allowlist of built-in Codex tool names for that thread. Experimental: `thread/start.manualToolExecution` forces built-in tool executions to pause for host approval before Codex invokes them.
- `thread/resume` — reopen an existing thread by id so subsequent `turn/start` calls append to it.
- `thread/fork` — fork an existing thread into a new thread id by copying the stored history; emits `thread/started` (including the current `thread.status`) and auto-subscribes you to turn/item events for the new thread.
- `thread/list` — page through stored rollouts; supports cursor-based pagination and optional `modelProviders`, `sourceKinds`, `archived`, `cwd`, and `searchTerm` filters. Each returned `thread` includes `status` (`ThreadStatus`), defaulting to `notLoaded` when the thread is not currently loaded.
@@ -128,6 +128,7 @@ Example with notification opt-out:
- `thread/read` — read a stored thread by id without resuming it; optionally include turns via `includeTurns`. The returned `thread` includes `status` (`ThreadStatus`), defaulting to `notLoaded` when the thread is not currently loaded.
- `thread/metadata/update` — patch stored thread metadata in sqlite; currently supports updating persisted `gitInfo` fields and returns the refreshed `thread`.
- `thread/status/changed` — notification emitted when a loaded thread's status changes (`threadId` + new `status`).
- `codexSdk/delegationConfigured` — experimental notification emitted after `thread/start` when `sdkDelegation` is active for that thread.
- `thread/archive` — move a thread's rollout file into the archived directory; returns `{}` on success and emits `thread/archived`.
- `thread/unsubscribe` — unsubscribe this connection from thread turn/item events. If this was the last subscriber, the server shuts down and unloads the thread, then emits `thread/closed`.
- `thread/name/set` — set or update a thread's user-facing name for either a loaded thread or a persisted rollout; returns `{}` on success and emits `thread/name/updated` to initialized, opted-in clients. Thread names are not required to be unique; name lookups resolve to the most recently updated thread.
@@ -197,6 +198,13 @@ Start a fresh thread when you need a new Codex conversation.
}
}
],
"builtinTools": [
"exec_command",
"write_stdin",
"apply_patch",
"view_image"
],
"manualToolExecution": true
} }
{ "id": 10, "result": {
"thread": {
@@ -764,7 +772,7 @@ Certain actions (shell commands or modifying files) may require explicit user ap
Order of messages:
1. `item/started` — shows the pending `commandExecution` item with `command`, `cwd`, and other fields so you can render the proposed action.
2. `item/commandExecution/requestApproval` (request) — carries the same `itemId`, `threadId`, `turnId`, optionally `approvalId` (for subcommand callbacks), and `reason`. For normal command approvals, it also includes `command`, `cwd`, and `commandActions` for friendly display. When `initialize.params.capabilities.experimentalApi = true`, it may also include experimental `additionalPermissions` describing requested per-command sandbox access; any filesystem paths in that payload are absolute on the wire, and network access is represented as `additionalPermissions.network.enabled`. For network-only approvals, those command fields may be omitted and `networkApprovalContext` is provided instead. Optional persistence hints may also be included via `proposedExecpolicyAmendment` and `proposedNetworkPolicyAmendments`. Clients can prefer `availableDecisions` when present to render the exact set of choices the server wants to expose, while still falling back to the older heuristics if it is omitted.
2. `item/commandExecution/requestApproval` (request) — carries the same `itemId`, `threadId`, `turnId`, optionally `approvalId` (for subcommand callbacks), and `reason`. For normal command approvals, it also includes `command`, `cwd`, and `commandActions` for friendly display. When `initialize.params.capabilities.experimentalApi = true`, it may also include experimental `additionalPermissions` describing requested per-command sandbox access; any filesystem paths in that payload are absolute on the wire, and network access is represented as `additionalPermissions.network.enabled`. For network-only approvals, those command fields may be omitted and `networkApprovalContext` is provided instead. Optional persistence hints may also be included via `proposedExecpolicyAmendment` and `proposedNetworkPolicyAmendments`. Experimental clients may also answer with `acceptWithCommandOverride` to replace the proposed built-in command before execution. Clients can prefer `availableDecisions` when present to render the exact set of choices the server wants to expose, while still falling back to the older heuristics if it is omitted.
3. Client response — for example `{ "decision": "accept" }`, `{ "decision": "acceptForSession" }`, `{ "decision": { "acceptWithExecpolicyAmendment": { "execpolicy_amendment": [...] } } }`, `{ "decision": { "applyNetworkPolicyAmendment": { "network_policy_amendment": { "host": "example.com", "action": "allow" } } } }`, `{ "decision": "decline" }`, or `{ "decision": "cancel" }`.
4. `serverRequest/resolved``{ threadId, requestId }` confirms the pending request has been resolved or cleared, including lifecycle cleanup on turn start/complete/interrupt.
5. `item/completed` — final `commandExecution` item with `status: "completed" | "failed" | "declined"` and execution output. Render this as the authoritative result.

View File

@@ -2256,6 +2256,10 @@ async fn on_command_execution_request_approval_response(
let (decision, completion_status) = match decision {
CommandExecutionApprovalDecision::Accept => (ReviewDecision::Approved, None),
CommandExecutionApprovalDecision::AcceptWithCommandOverride { command } => (
ReviewDecision::ApprovedWithCommandOverride { command },
None,
),
CommandExecutionApprovalDecision::AcceptForSession => {
(ReviewDecision::ApprovedForSession, None)
}

View File

@@ -92,6 +92,8 @@ use codex_app_server_protocol::ReviewStartParams;
use codex_app_server_protocol::ReviewStartResponse;
use codex_app_server_protocol::ReviewTarget as ApiReviewTarget;
use codex_app_server_protocol::SandboxMode;
use codex_app_server_protocol::SdkDelegationConfig;
use codex_app_server_protocol::SdkDelegationConfiguredNotification;
use codex_app_server_protocol::ServerNotification;
use codex_app_server_protocol::ServerRequestResolvedNotification;
use codex_app_server_protocol::SkillsConfigWriteParams;
@@ -1602,11 +1604,14 @@ impl CodexMessageProcessor {
base_instructions,
developer_instructions,
dynamic_tools,
builtin_tools,
manual_tool_execution,
mock_experimental_field: _mock_experimental_field,
experimental_raw_events,
personality,
ephemeral,
persist_extended_history,
sdk_delegation,
} = params;
let mut typesafe_overrides = self.build_thread_config_overrides(
model,
@@ -1620,6 +1625,11 @@ impl CodexMessageProcessor {
personality,
);
typesafe_overrides.ephemeral = ephemeral;
let mut config = config.unwrap_or_default();
let sdk_delegation = sdk_delegation.inspect(|delegation| {
apply_sdk_delegation_overrides(&mut config, &mut typesafe_overrides, delegation);
});
let config = (!config.is_empty()).then_some(config);
let cli_overrides = self.cli_overrides.clone();
let cloud_requirements = self.current_cloud_requirements();
let listener_task_context = ListenerTaskContext {
@@ -1640,9 +1650,12 @@ impl CodexMessageProcessor {
config,
typesafe_overrides,
dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
service_name,
experimental_raw_events,
sdk_delegation,
)
.await;
});
@@ -1657,9 +1670,12 @@ impl CodexMessageProcessor {
config_overrides: Option<HashMap<String, serde_json::Value>>,
typesafe_overrides: ConfigOverrides,
dynamic_tools: Option<Vec<ApiDynamicToolSpec>>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
service_name: Option<String>,
experimental_raw_events: bool,
sdk_delegation: Option<SdkDelegationConfig>,
) {
let config = match derive_config_from_params(
&cli_overrides,
@@ -1685,6 +1701,20 @@ impl CodexMessageProcessor {
};
let dynamic_tools = dynamic_tools.unwrap_or_default();
if let Some(builtin_tools) = builtin_tools.as_ref()
&& let Err(message) = validate_builtin_tools(builtin_tools)
{
let error = JSONRPCErrorError {
code: INVALID_REQUEST_ERROR_CODE,
message,
data: None,
};
listener_task_context
.outgoing
.send_error(request_id, error)
.await;
return;
}
let core_dynamic_tools = if dynamic_tools.is_empty() {
Vec::new()
} else {
@@ -1715,6 +1745,8 @@ impl CodexMessageProcessor {
.start_thread_with_tools_and_service_name(
config,
core_dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
service_name,
)
@@ -1772,17 +1804,36 @@ impl CodexMessageProcessor {
sandbox: config_snapshot.sandbox_policy.into(),
reasoning_effort: config_snapshot.reasoning_effort,
};
let response_thread_id = response.thread.id.clone();
let response_model_provider = response.model_provider.clone();
listener_task_context
.outgoing
.send_response(request_id, response)
.await;
info!("thread/start created thread {response_thread_id}");
let notif = ThreadStartedNotification { thread };
listener_task_context
.outgoing
.send_server_notification(ServerNotification::ThreadStarted(notif))
.await;
info!("thread/start sent thread/started for {response_thread_id}");
if let Some(sdk_delegation) = sdk_delegation {
let notification = SdkDelegationConfiguredNotification {
thread_id: response_thread_id,
model_provider: response_model_provider,
bridge_url: sdk_delegation.bridge_url,
};
listener_task_context
.outgoing
.send_server_notification(ServerNotification::SdkDelegationConfigured(
notification,
))
.await;
info!("thread/start sent codexSdk/delegationConfigured");
}
}
Err(err) => {
let error = JSONRPCErrorError {
@@ -6877,6 +6928,25 @@ fn validate_dynamic_tools(tools: &[ApiDynamicToolSpec]) -> Result<(), String> {
Ok(())
}
/// Validates a client-supplied allowlist of built-in tool names.
///
/// Rejects names that are empty after trimming, names carrying leading or
/// trailing whitespace, and duplicate entries. Returns the first violation
/// found as a human-readable error message.
fn validate_builtin_tools(tools: &[String]) -> Result<(), String> {
    // Names seen so far; `insert` returning false flags a duplicate.
    let mut unique_names: HashSet<&str> = HashSet::new();
    for raw in tools {
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            return Err("builtin tool name must not be empty".to_string());
        }
        // trim() only removes edge whitespace, so a length change means the
        // caller sent padding around the name.
        if trimmed.len() != raw.len() {
            return Err(format!(
                "builtin tool name has leading/trailing whitespace: {raw}"
            ));
        }
        if !unique_names.insert(raw.as_str()) {
            return Err(format!("duplicate builtin tool name: {raw}"));
        }
    }
    Ok(())
}
fn replace_cloud_requirements_loader(
cloud_requirements: &RwLock<CloudRequirementsLoader>,
auth_manager: Arc<AuthManager>,
@@ -6948,6 +7018,47 @@ async fn derive_config_from_params(
.await
}
/// Registers a per-thread delegated Responses provider for the SDK bridge.
///
/// Points the thread's model provider at the host-managed bridge URL by
/// writing `model_providers.<id>.*` keys into the raw config overrides and
/// selecting that provider id in the typesafe overrides.
fn apply_sdk_delegation_overrides(
    request_overrides: &mut HashMap<String, serde_json::Value>,
    typesafe_overrides: &mut ConfigOverrides,
    sdk_delegation: &SdkDelegationConfig,
) {
    // Default provider id when the host does not choose one explicitly.
    let provider_id = sdk_delegation
        .model_provider_id
        .as_deref()
        .unwrap_or("codex-sdk-v2")
        .to_string();
    typesafe_overrides.model_provider = Some(provider_id.clone());
    let prefix = format!("model_providers.{provider_id}");
    // Small helper so every provider field is written under the same prefix.
    let mut set = |key: &str, value: serde_json::Value| {
        request_overrides.insert(format!("{prefix}.{key}"), value);
    };
    set(
        "name",
        serde_json::Value::String("Codex SDK v2 Delegated Provider".to_string()),
    );
    set(
        "base_url",
        serde_json::Value::String(sdk_delegation.bridge_url.clone()),
    );
    set(
        "wire_api",
        serde_json::Value::String("responses".to_string()),
    );
    set("supports_websockets", serde_json::Value::Bool(false));
    set("requires_openai_auth", serde_json::Value::Bool(false));
    // Optional stream idle timeout override; omitted when not provided.
    if let Some(timeout_ms) = sdk_delegation.stream_idle_timeout_ms {
        set(
            "stream_idle_timeout_ms",
            serde_json::Value::Number(timeout_ms.into()),
        );
    }
}
async fn derive_config_for_cwd(
cli_overrides: &[(String, TomlValue)],
request_overrides: Option<HashMap<String, serde_json::Value>>,

View File

@@ -28,6 +28,8 @@ use core_test_support::responses;
use pretty_assertions::assert_eq;
use serde_json::Value;
use serde_json::json;
use std::collections::HashMap;
use std::collections::HashSet;
use std::path::Path;
use std::time::Duration;
use tempfile::TempDir;
@@ -118,6 +120,85 @@ async fn thread_start_injects_dynamic_tools_into_model_requests() -> Result<()>
Ok(())
}
#[tokio::test]
async fn thread_start_builtin_tools_filters_model_requests() -> Result<()> {
    // One canned assistant reply is enough: the assertion below only inspects
    // the request body Codex sends to the mock Responses server.
    let responses = vec![create_final_assistant_message_sse_response("Done")?];
    let server = create_mock_responses_server_sequence_unchecked(responses).await;
    let codex_home = TempDir::new()?;
    create_config_toml(codex_home.path(), &server.uri())?;
    let mut mcp = McpProcess::new(codex_home.path()).await?;
    timeout(DEFAULT_READ_TIMEOUT, mcp.initialize()).await??;
    // Enable the unified-exec and apply_patch tool families via config, then
    // narrow what is advertised to the model with an explicit builtin_tools
    // allowlist on thread/start.
    let thread_req = mcp
        .send_thread_start_request(ThreadStartParams {
            config: Some(HashMap::from([
                (
                    "experimental_use_unified_exec_tool".to_string(),
                    json!(true),
                ),
                ("include_apply_patch_tool".to_string(), json!(true)),
            ])),
            builtin_tools: Some(vec![
                "exec_command".to_string(),
                "write_stdin".to_string(),
                "update_plan".to_string(),
                "view_image".to_string(),
            ]),
            ..Default::default()
        })
        .await?;
    let thread_resp: JSONRPCResponse = timeout(
        DEFAULT_READ_TIMEOUT,
        mcp.read_stream_until_response_message(RequestId::Integer(thread_req)),
    )
    .await??;
    let ThreadStartResponse { thread, .. } = to_response::<ThreadStartResponse>(thread_resp)?;
    // Drive one turn so Codex issues a model request we can capture.
    let turn_req = mcp
        .send_turn_start_request(TurnStartParams {
            thread_id: thread.id,
            input: vec![V2UserInput::Text {
                text: "Hello".to_string(),
                text_elements: Vec::new(),
            }],
            ..Default::default()
        })
        .await?;
    let turn_resp: JSONRPCResponse = timeout(
        DEFAULT_READ_TIMEOUT,
        mcp.read_stream_until_response_message(RequestId::Integer(turn_req)),
    )
    .await??;
    let _turn: TurnStartResponse = to_response::<TurnStartResponse>(turn_resp)?;
    timeout(
        DEFAULT_READ_TIMEOUT,
        mcp.read_stream_until_notification_message("turn/completed"),
    )
    .await??;
    // The tools advertised in the first model request must match the
    // allowlist exactly — nothing extra, nothing missing.
    let bodies = responses_bodies(&server).await?;
    let body = bodies
        .first()
        .context("expected at least one responses request")?;
    let tool_names = body
        .get("tools")
        .and_then(Value::as_array)
        .context("expected tools array in request body")?
        .iter()
        .filter_map(|tool| tool.get("name").and_then(Value::as_str))
        .collect::<HashSet<_>>();
    assert_eq!(
        tool_names,
        HashSet::from(["exec_command", "write_stdin", "update_plan", "view_image",])
    );
    Ok(())
}
/// Exercises the full dynamic tool call path (server request, client response, model output).
#[tokio::test]
async fn dynamic_tool_call_round_trip_sends_text_content_items_to_model() -> Result<()> {

View File

@@ -10,6 +10,7 @@ use codex_app_server_protocol::JSONRPCMessage;
use codex_app_server_protocol::JSONRPCResponse;
use codex_app_server_protocol::MockExperimentalMethodParams;
use codex_app_server_protocol::RequestId;
use codex_app_server_protocol::SdkDelegationConfig;
use codex_app_server_protocol::ThreadRealtimeStartParams;
use codex_app_server_protocol::ThreadStartParams;
use codex_app_server_protocol::ThreadStartResponse;
@@ -121,6 +122,46 @@ async fn thread_start_mock_field_requires_experimental_api_capability() -> Resul
Ok(())
}
#[tokio::test]
async fn thread_start_sdk_delegation_requires_experimental_api_capability() -> Result<()> {
    // No canned model responses needed: the request should be rejected before
    // any model traffic happens.
    let server = create_mock_responses_server_sequence_unchecked(Vec::new()).await;
    let codex_home = TempDir::new()?;
    create_config_toml(codex_home.path(), &server.uri())?;
    let mut mcp = McpProcess::new(codex_home.path()).await?;
    // Initialize with experimental_api explicitly disabled so the gated
    // sdkDelegation field must be refused.
    let init = mcp
        .initialize_with_capabilities(
            default_client_info(),
            Some(InitializeCapabilities {
                experimental_api: false,
                opt_out_notification_methods: None,
            }),
        )
        .await?;
    let JSONRPCMessage::Response(_) = init else {
        anyhow::bail!("expected initialize response, got {init:?}");
    };
    let request_id = mcp
        .send_thread_start_request(ThreadStartParams {
            sdk_delegation: Some(SdkDelegationConfig {
                bridge_url: "http://127.0.0.1:8080/v1".to_string(),
                model_provider_id: None,
                stream_idle_timeout_ms: None,
            }),
            ..Default::default()
        })
        .await?;
    // thread/start must fail with the standard experimental-capability error.
    let error = timeout(
        DEFAULT_TIMEOUT,
        mcp.read_stream_until_error_message(RequestId::Integer(request_id)),
    )
    .await??;
    assert_experimental_capability_error(error, "thread/start.sdkDelegation");
    Ok(())
}
#[tokio::test]
async fn thread_start_without_dynamic_tools_allows_without_experimental_api_capability()
-> Result<()> {

View File

@@ -241,9 +241,12 @@ async fn skills_changed_notification_is_emitted_after_skill_change() -> Result<(
personality: None,
ephemeral: None,
dynamic_tools: None,
builtin_tools: None,
manual_tool_execution: false,
mock_experimental_field: None,
experimental_raw_events: false,
persist_extended_history: false,
sdk_delegation: None,
})
.await?;
let _: JSONRPCResponse = timeout(

View File

@@ -6,6 +6,8 @@ use codex_app_server_protocol::JSONRPCError;
use codex_app_server_protocol::JSONRPCMessage;
use codex_app_server_protocol::JSONRPCResponse;
use codex_app_server_protocol::RequestId;
use codex_app_server_protocol::SdkDelegationConfig;
use codex_app_server_protocol::SdkDelegationConfiguredNotification;
use codex_app_server_protocol::ThreadStartParams;
use codex_app_server_protocol::ThreadStartResponse;
use codex_app_server_protocol::ThreadStartedNotification;
@@ -181,6 +183,55 @@ model_reasoning_effort = "high"
Ok(())
}
#[tokio::test]
async fn thread_start_emits_sdk_delegation_configured_notification() -> Result<()> {
    let server = create_mock_responses_server_repeating_assistant("Done").await;
    let codex_home = TempDir::new()?;
    create_config_toml(codex_home.path(), &server.uri())?;
    let mut mcp = McpProcess::new(codex_home.path()).await?;
    timeout(DEFAULT_READ_TIMEOUT, mcp.initialize()).await??;
    // Start a thread with sdkDelegation so the server should emit
    // codexSdk/delegationConfigured after thread/started.
    let req_id = mcp
        .send_thread_start_request(ThreadStartParams {
            sdk_delegation: Some(SdkDelegationConfig {
                bridge_url: "http://127.0.0.1:8080/v1".to_string(),
                model_provider_id: Some("sdk-provider".to_string()),
                stream_idle_timeout_ms: Some(5000),
            }),
            ..Default::default()
        })
        .await?;
    let resp: JSONRPCResponse = timeout(
        DEFAULT_READ_TIMEOUT,
        mcp.read_stream_until_response_message(RequestId::Integer(req_id)),
    )
    .await??;
    let ThreadStartResponse { thread, .. } = to_response::<ThreadStartResponse>(resp)?;
    // Scan subsequent notifications — bounded by DEFAULT_READ_TIMEOUT overall
    // via the shrinking `remaining` budget — until the delegation-configured
    // notification arrives.
    let deadline = tokio::time::Instant::now() + DEFAULT_READ_TIMEOUT;
    let notification = loop {
        let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
        let message = timeout(remaining, mcp.read_next_message()).await??;
        let JSONRPCMessage::Notification(notification) = message else {
            continue;
        };
        if notification.method == "codexSdk/delegationConfigured" {
            break notification;
        }
    };
    // The payload must echo the new thread id plus the configured provider id
    // and bridge URL.
    let configured: SdkDelegationConfiguredNotification =
        serde_json::from_value(notification.params.expect("params must be present"))?;
    assert_eq!(configured.thread_id, thread.id);
    assert_eq!(configured.model_provider, "sdk-provider");
    assert_eq!(configured.bridge_url, "http://127.0.0.1:8080/v1");
    Ok(())
}
#[tokio::test]
async fn thread_start_accepts_flex_service_tier() -> Result<()> {
let server = create_mock_responses_server_repeating_assistant("Done").await;

View File

@@ -0,0 +1,75 @@
use crate::tools::spec::ToolsConfig;
const CORE_BASE_INSTRUCTIONS: &str = include_str!("../templates/base_instructions/core.md");
const APPLY_PATCH_INSTRUCTIONS: &str =
include_str!("../templates/base_instructions/capabilities/apply_patch.md");
const UNIFIED_EXEC_INSTRUCTIONS: &str =
include_str!("../templates/base_instructions/capabilities/unified_exec.md");
const UPDATE_PLAN_INSTRUCTIONS: &str =
include_str!("../templates/base_instructions/capabilities/update_plan.md");
/// Builds the session's base instructions from the core template plus one
/// capability section per enabled built-in tool, joined by blank lines.
pub(crate) fn compose_base_instructions(tools_config: &ToolsConfig) -> String {
    // Capability sections in their fixed output order; each is appended only
    // when its backing built-in tool is enabled on this config.
    let capability_sections: [(&str, &str); 3] = [
        ("exec_command", UNIFIED_EXEC_INSTRUCTIONS),
        ("apply_patch", APPLY_PATCH_INSTRUCTIONS),
        ("update_plan", UPDATE_PLAN_INSTRUCTIONS),
    ];
    let mut sections = vec![CORE_BASE_INSTRUCTIONS.trim().to_string()];
    sections.extend(
        capability_sections
            .iter()
            .filter(|(tool, _)| tools_config.has_builtin_tool(tool))
            .map(|(_, text)| text.trim().to_string()),
    );
    sections.join("\n\n")
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::features::Feature;
    use crate::features::Features;
    use crate::models_manager::model_info::model_info_from_slug;
    use crate::tools::spec::ToolsConfigParams;
    use codex_protocol::protocol::SessionSource;
    // Builds a ToolsConfig with the shell, unified-exec, and apply-patch
    // features enabled; each test then narrows it via with_builtin_tools.
    fn tools_config() -> ToolsConfig {
        let mut features = Features::new();
        let _ = features.enable(Feature::ShellTool);
        let _ = features.enable(Feature::UnifiedExec);
        let _ = features.enable(Feature::ApplyPatchFreeform);
        let model_info = model_info_from_slug("gpt-5.2-codex");
        ToolsConfig::new(&ToolsConfigParams {
            model_info: &model_info,
            features: &features,
            web_search_mode: None,
            session_source: SessionSource::Cli,
        })
    }
    #[test]
    fn omits_unenabled_capability_sections() {
        // Only exec_command is allowlisted, so only the Unified Exec section
        // should be composed alongside the core instructions.
        let tools_config = tools_config().with_builtin_tools(Some(vec!["exec_command".to_string()]));
        let instructions = compose_base_instructions(&tools_config);
        assert!(instructions.contains("# Unified Exec"));
        assert!(!instructions.contains("# Apply Patch"));
        assert!(!instructions.contains("# Update Plan"));
    }
    #[test]
    fn includes_capability_sections_for_enabled_tools() {
        // All three capability tools are allowlisted, so every section appears.
        let tools_config = tools_config().with_builtin_tools(Some(vec![
            "exec_command".to_string(),
            "apply_patch".to_string(),
            "update_plan".to_string(),
        ]));
        let instructions = compose_base_instructions(&tools_config);
        assert!(instructions.contains("# Unified Exec"));
        assert!(instructions.contains("# Apply Patch"));
        assert!(instructions.contains("# Update Plan"));
    }
}

View File

@@ -18,6 +18,7 @@ use crate::analytics_client::InvocationType;
use crate::analytics_client::build_track_events_context;
use crate::apps::render_apps_section;
use crate::commit_attribution::commit_message_trailer_instruction;
use crate::base_instructions::compose_base_instructions;
use crate::compact;
use crate::compact::InitialContextInjection;
use crate::compact::run_inline_auto_compact_task;
@@ -356,6 +357,8 @@ impl Codex {
session_source: SessionSource,
agent_control: AgentControl,
dynamic_tools: Vec<DynamicToolSpec>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
metrics_service_name: Option<String>,
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
@@ -428,16 +431,24 @@ impl Codex {
.get_default_model(&config.model, refresh_strategy)
.await;
let model_info = models_manager.get_model_info(model.as_str(), &config).await;
let tools_config = ToolsConfig::new(&ToolsConfigParams {
model_info: &model_info,
features: &config.features,
web_search_mode: config.web_search_request_mode.as_ref(),
session_source: session_source.clone(),
})
.with_builtin_tools(builtin_tools.clone())
.with_manual_tool_execution(manual_tool_execution);
// Resolve base instructions for the session. Priority order:
// 1. config.base_instructions override
// 2. conversation history => session_meta.base_instructions
// 3. base_instructions for current model
let model_info = models_manager.get_model_info(model.as_str(), &config).await;
// 3. composed base instructions for the current tool/capability set
let base_instructions = config
.base_instructions
.clone()
.or_else(|| conversation_history.get_base_instructions().map(|s| s.text))
.unwrap_or_else(|| model_info.get_model_instructions(config.personality));
.unwrap_or_else(|| compose_base_instructions(&tools_config));
// Respect thread-start tools. When missing (resumed/forked threads), read from the db
// first, then fall back to rollout-file tools.
@@ -497,6 +508,8 @@ impl Codex {
app_server_client_name: None,
session_source,
dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
inherited_shell_snapshot,
};
@@ -745,7 +758,9 @@ impl TurnContext {
session_source: self.session_source.clone(),
})
.with_allow_login_shell(self.tools_config.allow_login_shell)
.with_agent_roles(config.agent_roles.clone());
.with_agent_roles(config.agent_roles.clone())
.with_builtin_tools(self.tools_config.builtin_tools.clone())
.with_manual_tool_execution(self.tools_config.manual_tool_execution);
Self {
sub_id: self.sub_id.clone(),
@@ -901,6 +916,8 @@ pub(crate) struct SessionConfiguration {
/// Source of the session (cli, vscode, exec, mcp, ...)
session_source: SessionSource,
dynamic_tools: Vec<DynamicToolSpec>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
}
@@ -1120,7 +1137,9 @@ impl Session {
session_source: session_source.clone(),
})
.with_allow_login_shell(per_turn_config.permissions.allow_login_shell)
.with_agent_roles(per_turn_config.agent_roles.clone());
.with_agent_roles(per_turn_config.agent_roles.clone())
.with_builtin_tools(session_configuration.builtin_tools.clone())
.with_manual_tool_execution(session_configuration.manual_tool_execution);
let cwd = session_configuration.cwd.clone();
let turn_metadata_state = Arc::new(TurnMetadataState::new(
@@ -4912,7 +4931,9 @@ async fn spawn_review_thread(
session_source: parent_turn_context.session_source.clone(),
})
.with_allow_login_shell(config.permissions.allow_login_shell)
.with_agent_roles(config.agent_roles.clone());
.with_agent_roles(config.agent_roles.clone())
.with_builtin_tools(parent_turn_context.tools_config.builtin_tools.clone())
.with_manual_tool_execution(parent_turn_context.tools_config.manual_tool_execution);
let review_prompt = resolved.prompt.clone();
let provider = parent_turn_context.provider.clone();

View File

@@ -60,6 +60,8 @@ pub(crate) async fn run_codex_thread_interactive(
SessionSource::SubAgent(SubAgentSource::Review),
parent_session.services.agent_control.clone(),
Vec::new(),
None,
false,
false,
None,
None,

View File

@@ -1425,6 +1425,8 @@ async fn set_rate_limits_retains_previous_credits() {
app_server_client_name: None,
session_source: SessionSource::Exec,
dynamic_tools: Vec::new(),
builtin_tools: None,
manual_tool_execution: false,
persist_extended_history: false,
inherited_shell_snapshot: None,
};
@@ -1519,6 +1521,8 @@ async fn set_rate_limits_updates_plan_type_when_present() {
app_server_client_name: None,
session_source: SessionSource::Exec,
dynamic_tools: Vec::new(),
builtin_tools: None,
manual_tool_execution: false,
persist_extended_history: false,
inherited_shell_snapshot: None,
};
@@ -1871,6 +1875,8 @@ pub(crate) async fn make_session_configuration_for_tests() -> SessionConfigurati
app_server_client_name: None,
session_source: SessionSource::Exec,
dynamic_tools: Vec::new(),
builtin_tools: None,
manual_tool_execution: false,
persist_extended_history: false,
inherited_shell_snapshot: None,
}
@@ -1928,6 +1934,8 @@ async fn session_new_fails_when_zsh_fork_enabled_without_zsh_path() {
app_server_client_name: None,
session_source: SessionSource::Exec,
dynamic_tools: Vec::new(),
builtin_tools: None,
manual_tool_execution: false,
persist_extended_history: false,
inherited_shell_snapshot: None,
};
@@ -2018,6 +2026,8 @@ pub(crate) async fn make_session_and_context() -> (Session, TurnContext) {
app_server_client_name: None,
session_source: SessionSource::Exec,
dynamic_tools: Vec::new(),
builtin_tools: None,
manual_tool_execution: false,
persist_extended_history: false,
inherited_shell_snapshot: None,
};
@@ -2423,6 +2433,8 @@ pub(crate) async fn make_session_and_context_with_dynamic_tools_and_rx(
app_server_client_name: None,
session_source: SessionSource::Exec,
dynamic_tools,
builtin_tools: None,
manual_tool_execution: false,
persist_extended_history: false,
inherited_shell_snapshot: None,
};

View File

@@ -175,6 +175,7 @@ pub(crate) struct ExecApprovalRequest<'a> {
pub(crate) sandbox_policy: &'a SandboxPolicy,
pub(crate) sandbox_permissions: SandboxPermissions,
pub(crate) prefix_rule: Option<Vec<String>>,
pub(crate) manual_tool_execution: bool,
}
impl ExecPolicyManager {
@@ -206,6 +207,7 @@ impl ExecPolicyManager {
sandbox_policy,
sandbox_permissions,
prefix_rule,
manual_tool_execution,
} = req;
let exec_policy = self.current();
let (commands, used_complex_parsing) = commands_for_exec_policy(command);
@@ -220,6 +222,7 @@ impl ExecPolicyManager {
cmd,
sandbox_permissions,
used_complex_parsing,
manual_tool_execution,
)
};
let match_options = MatchOptions {
@@ -491,7 +494,11 @@ pub fn render_decision_for_unmatched_command(
command: &[String],
sandbox_permissions: SandboxPermissions,
used_complex_parsing: bool,
manual_tool_execution: bool,
) -> Decision {
if manual_tool_execution {
return Decision::Prompt;
}
if is_known_safe_command(command) && !used_complex_parsing {
return Decision::Allow;
}
@@ -1234,6 +1241,7 @@ prefix_rule(pattern=["rm"], decision="forbidden")
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1284,6 +1292,7 @@ prefix_rule(pattern=["rm"], decision="forbidden")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1311,6 +1320,7 @@ prefix_rule(pattern=["rm"], decision="forbidden")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1339,6 +1349,7 @@ prefix_rule(pattern=["rm"], decision="forbidden")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: Some(requested_prefix.clone()),
manual_tool_execution: false,
})
.await;
@@ -1378,6 +1389,7 @@ prefix_rule(
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1407,6 +1419,7 @@ prefix_rule(
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1443,6 +1456,7 @@ prefix_rule(pattern=["git"], decision="allow")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1485,6 +1499,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1513,6 +1528,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: Some(vec!["cargo".to_string(), "install".to_string()]),
manual_tool_execution: false,
})
.await;
@@ -1546,6 +1562,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1573,6 +1590,7 @@ prefix_rule(pattern=["git"], decision="prompt")
&command,
SandboxPermissions::RequireEscalated,
false,
false,
)
);
}
@@ -1592,6 +1610,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::RequireEscalated,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1628,6 +1647,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::RequireEscalated,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1662,6 +1682,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::RequireEscalated,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1685,6 +1706,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1709,6 +1731,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1737,6 +1760,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1765,6 +1789,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::RequireEscalated,
prefix_rule: Some(vec!["cargo".to_string(), "install".to_string()]),
manual_tool_execution: false,
})
.await;
@@ -1796,6 +1821,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::RequireEscalated,
prefix_rule: Some(vec!["cargo".to_string(), "install".to_string()]),
manual_tool_execution: false,
})
.await;
@@ -1834,6 +1860,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await,
ExecApprovalRequirement::NeedsApproval {
@@ -1908,6 +1935,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1938,6 +1966,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -1965,6 +1994,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -2003,6 +2033,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await,
ExecApprovalRequirement::NeedsApproval {
@@ -2026,6 +2057,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -2038,6 +2070,31 @@ prefix_rule(pattern=["git"], decision="prompt")
);
}
#[tokio::test]
async fn manual_tool_execution_requires_approval_for_safe_command() {
    // Even a trivially safe command must be surfaced for user approval when
    // the caller opted into manual tool execution.
    let command = vec!["echo".to_string(), "safe".to_string()];

    let requirement = ExecPolicyManager::default()
        .create_exec_approval_requirement_for_command(ExecApprovalRequest {
            command: &command,
            approval_policy: AskForApproval::OnRequest,
            sandbox_policy: &SandboxPolicy::new_read_only_policy(),
            sandbox_permissions: SandboxPermissions::UseDefault,
            prefix_rule: None,
            manual_tool_execution: true,
        })
        .await;

    let expected = ExecApprovalRequirement::NeedsApproval {
        reason: None,
        proposed_execpolicy_amendment: Some(ExecPolicyAmendment::new(command)),
    };
    assert_eq!(requirement, expected);
}
#[tokio::test]
async fn proposed_execpolicy_amendment_is_suppressed_when_policy_matches_allow() {
let policy_src = r#"prefix_rule(pattern=["echo"], decision="allow")"#;
@@ -2056,6 +2113,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -2229,6 +2287,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::DangerFullAccess,
sandbox_permissions: SandboxPermissions::UseDefault,
prefix_rule: None,
manual_tool_execution: false,
})
.await;
@@ -2295,6 +2354,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: permissions,
prefix_rule: None,
manual_tool_execution: false,
})
.await,
"{pwsh_approval_reason}"
@@ -2318,6 +2378,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: permissions,
prefix_rule: None,
manual_tool_execution: false,
})
.await,
r#"On all platforms, a forbidden command should require approval
@@ -2337,6 +2398,7 @@ prefix_rule(pattern=["git"], decision="prompt")
sandbox_policy: &SandboxPolicy::new_read_only_policy(),
sandbox_permissions: permissions,
prefix_rule: None,
manual_tool_execution: false,
})
.await,
r#"On all platforms, a forbidden command should require approval

View File

@@ -10,6 +10,7 @@ pub mod api_bridge;
mod apply_patch;
mod apps;
pub mod auth;
mod base_instructions;
mod client;
mod client_common;
pub mod codex;

View File

@@ -314,18 +314,22 @@ impl ThreadManager {
pub async fn start_thread(&self, config: Config) -> CodexResult<NewThread> {
// Box delegated thread-spawn futures so these convenience wrappers do
// not inline the full spawn path into every caller's async state.
Box::pin(self.start_thread_with_tools(config, Vec::new(), false)).await
Box::pin(self.start_thread_with_tools(config, Vec::new(), None, false, false)).await
}
pub async fn start_thread_with_tools(
&self,
config: Config,
dynamic_tools: Vec<codex_protocol::dynamic_tools::DynamicToolSpec>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
) -> CodexResult<NewThread> {
Box::pin(self.start_thread_with_tools_and_service_name(
config,
dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
None,
))
@@ -336,6 +340,8 @@ impl ThreadManager {
&self,
config: Config,
dynamic_tools: Vec<codex_protocol::dynamic_tools::DynamicToolSpec>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
metrics_service_name: Option<String>,
) -> CodexResult<NewThread> {
@@ -345,6 +351,8 @@ impl ThreadManager {
Arc::clone(&self.state.auth_manager),
self.agent_control(),
dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
metrics_service_name,
))
@@ -375,6 +383,8 @@ impl ThreadManager {
auth_manager,
self.agent_control(),
Vec::new(),
None,
false,
persist_extended_history,
None,
))
@@ -416,6 +426,8 @@ impl ThreadManager {
Arc::clone(&self.state.auth_manager),
self.agent_control(),
Vec::new(),
None,
false,
persist_extended_history,
None,
))
@@ -499,6 +511,8 @@ impl ThreadManagerState {
agent_control,
session_source,
Vec::new(),
None,
false,
persist_extended_history,
metrics_service_name,
inherited_shell_snapshot,
@@ -522,6 +536,8 @@ impl ThreadManagerState {
agent_control,
session_source,
Vec::new(),
None,
false,
false,
None,
inherited_shell_snapshot,
@@ -545,6 +561,8 @@ impl ThreadManagerState {
agent_control,
session_source,
Vec::new(),
None,
false,
persist_extended_history,
None,
inherited_shell_snapshot,
@@ -561,6 +579,8 @@ impl ThreadManagerState {
auth_manager: Arc<AuthManager>,
agent_control: AgentControl,
dynamic_tools: Vec<codex_protocol::dynamic_tools::DynamicToolSpec>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
metrics_service_name: Option<String>,
) -> CodexResult<NewThread> {
@@ -571,6 +591,8 @@ impl ThreadManagerState {
agent_control,
self.session_source.clone(),
dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
metrics_service_name,
None,
@@ -587,6 +609,8 @@ impl ThreadManagerState {
agent_control: AgentControl,
session_source: SessionSource,
dynamic_tools: Vec<codex_protocol::dynamic_tools::DynamicToolSpec>,
builtin_tools: Option<Vec<String>>,
manual_tool_execution: bool,
persist_extended_history: bool,
metrics_service_name: Option<String>,
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
@@ -608,6 +632,8 @@ impl ThreadManagerState {
session_source,
agent_control,
dynamic_tools,
builtin_tools,
manual_tool_execution,
persist_extended_history,
metrics_service_name,
inherited_shell_snapshot,

View File

@@ -129,7 +129,7 @@ impl ToolHandler for ApplyPatchHandler {
);
emitter.begin(event_ctx).await;
let req = ApplyPatchRequest {
let mut req = ApplyPatchRequest {
action: apply.action,
file_paths,
changes,
@@ -149,7 +149,7 @@ impl ToolHandler for ApplyPatchHandler {
let out = orchestrator
.run(
&mut runtime,
&req,
&mut req,
&tool_ctx,
turn.as_ref(),
turn.approval_policy.value(),
@@ -231,7 +231,7 @@ pub(crate) async fn intercept_apply_patch(
);
emitter.begin(event_ctx).await;
let req = ApplyPatchRequest {
let mut req = ApplyPatchRequest {
action: apply.action,
file_paths: approval_keys,
changes,
@@ -251,7 +251,7 @@ pub(crate) async fn intercept_apply_patch(
let out = orchestrator
.run(
&mut runtime,
&req,
&mut req,
&tool_ctx,
turn.as_ref(),
turn.approval_policy.value(),

View File

@@ -389,10 +389,11 @@ impl ShellHandler {
sandbox_policy: turn.sandbox_policy.get(),
sandbox_permissions: exec_params.sandbox_permissions,
prefix_rule,
manual_tool_execution: false,
})
.await;
let req = ShellRequest {
let mut req = ShellRequest {
command: exec_params.command.clone(),
cwd: exec_params.cwd.clone(),
timeout_ms: exec_params.expiration.timeout_ms(),
@@ -423,7 +424,7 @@ impl ShellHandler {
let out = orchestrator
.run(
&mut runtime,
&req,
&mut req,
&tool_ctx,
&turn,
turn.approval_policy.value(),

View File

@@ -349,7 +349,9 @@ impl NetworkApprovalService {
let mut cache_session_deny = false;
let resolved = match approval_decision {
ReviewDecision::Approved | ReviewDecision::ApprovedExecpolicyAmendment { .. } => {
ReviewDecision::Approved
| ReviewDecision::ApprovedWithCommandOverride { .. }
| ReviewDecision::ApprovedExecpolicyAmendment { .. } => {
PendingApprovalDecision::AllowOnce
}
ReviewDecision::ApprovedForSession => PendingApprovalDecision::AllowForSession,

View File

@@ -100,7 +100,7 @@ impl ToolOrchestrator {
pub async fn run<Rq, Out, T>(
&mut self,
tool: &mut T,
req: &Rq,
req: &mut Rq,
tool_ctx: &ToolCtx,
turn_ctx: &crate::codex::TurnContext,
approval_policy: AskForApproval,
@@ -144,10 +144,11 @@ impl ToolOrchestrator {
return Err(ToolError::Rejected("rejected by user".to_string()));
}
ReviewDecision::Approved
| ReviewDecision::ApprovedWithCommandOverride { .. }
| ReviewDecision::ApprovedExecpolicyAmendment { .. }
| ReviewDecision::ApprovedForSession => {}
ReviewDecision::NetworkPolicyAmendment {
network_policy_amendment,
ref network_policy_amendment,
} => match network_policy_amendment.action {
NetworkPolicyRuleAction::Allow => {}
NetworkPolicyRuleAction::Deny => {
@@ -155,6 +156,7 @@ impl ToolOrchestrator {
}
},
}
tool.apply_approval_decision(req, &decision);
already_approved = true;
}
}
@@ -280,10 +282,11 @@ impl ToolOrchestrator {
return Err(ToolError::Rejected("rejected by user".to_string()));
}
ReviewDecision::Approved
| ReviewDecision::ApprovedWithCommandOverride { .. }
| ReviewDecision::ApprovedExecpolicyAmendment { .. }
| ReviewDecision::ApprovedForSession => {}
ReviewDecision::NetworkPolicyAmendment {
network_policy_amendment,
ref network_policy_amendment,
} => match network_policy_amendment.action {
NetworkPolicyRuleAction::Allow => {}
NetworkPolicyRuleAction::Deny => {
@@ -291,6 +294,7 @@ impl ToolOrchestrator {
}
},
}
tool.apply_approval_decision(req, &decision);
}
let escalated_attempt = SandboxAttempt {

View File

@@ -1,8 +1,11 @@
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use crate::client_common::tools::FreeformTool;
use crate::client_common::tools::ResponsesApiTool;
use crate::client_common::tools::ToolSpec;
use crate::features::Feature;
use crate::function_tool::FunctionCallError;
@@ -283,6 +286,21 @@ impl ToolRegistryBuilder {
}
}
/// Drops every known builtin tool that was not explicitly enabled, from both
/// the spec list and the handler map. Tools outside `builtin_tools` are
/// always retained.
pub fn filter_builtin_tools(
    mut self,
    builtin_tools: &BTreeSet<String>,
    enabled_builtin_tools: &BTreeSet<String>,
) -> Self {
    // A tool survives unless it is a known builtin missing from the
    // enabled set; share one predicate so specs and handlers stay in sync.
    let keep = |name: &str| !builtin_tools.contains(name) || enabled_builtin_tools.contains(name);
    self.specs.retain(|tool| keep(tool_spec_name(&tool.spec)));
    self.handlers.retain(|name, _| keep(name.as_str()));
    self
}
// TODO(jif) for dynamic tools.
// pub fn register_many<I>(&mut self, names: I, handler: Arc<dyn ToolHandler>)
// where
@@ -307,6 +325,16 @@ impl ToolRegistryBuilder {
}
}
/// Returns the stable name used to identify a tool spec in the registry.
/// Named variants carry their own `name`; the marker variants map to fixed
/// identifiers.
fn tool_spec_name(spec: &ToolSpec) -> &str {
    match spec {
        ToolSpec::Freeform(FreeformTool { name, .. }) => name,
        ToolSpec::Function(ResponsesApiTool { name, .. }) => name,
        ToolSpec::ImageGeneration { .. } => "image_generation",
        ToolSpec::LocalShell {} => "local_shell",
        ToolSpec::WebSearch { .. } => "web_search",
    }
}
fn unsupported_tool_call_message(payload: &ToolPayload, tool_name: &str) -> String {
match payload {
ToolPayload::Custom { .. } => format!("unsupported custom tool call: {tool_name}"),

View File

@@ -452,6 +452,7 @@ impl CoreShellActionProvider {
.await?
{
ReviewDecision::Approved
| ReviewDecision::ApprovedWithCommandOverride { .. }
| ReviewDecision::ApprovedExecpolicyAmendment { .. } => {
if needs_escalation {
EscalationDecision::escalate(escalation_execution.clone())
@@ -677,6 +678,7 @@ fn evaluate_intercepted_exec_policy(
cmd,
sandbox_permissions,
used_complex_parsing,
false,
)
};

View File

@@ -144,6 +144,12 @@ impl Approvable<UnifiedExecRequest> for UnifiedExecRuntime<'_> {
Some(req.exec_approval_requirement.clone())
}
fn apply_approval_decision(&self, req: &mut UnifiedExecRequest, decision: &ReviewDecision) {
    // The user may approve while substituting a replacement command; adopt
    // the override so the runtime executes what was actually approved.
    match decision {
        ReviewDecision::ApprovedWithCommandOverride { command } => {
            req.command = command.clone();
        }
        _ => {}
    }
}
fn sandbox_mode_for_first_attempt(&self, req: &UnifiedExecRequest) -> SandboxOverride {
sandbox_override_for_first_attempt(req.sandbox_permissions, &req.exec_approval_requirement)
}

View File

@@ -271,6 +271,8 @@ pub(crate) trait Approvable<Req> {
req: &'a Req,
ctx: ApprovalCtx<'a>,
) -> BoxFuture<'a, ReviewDecision>;
/// Optional hook invoked after the user approves a request, letting the tool
/// fold decision details (e.g. a command override) back into `_req`.
/// The default implementation is a no-op.
fn apply_approval_decision(&self, _req: &mut Req, _decision: &ReviewDecision) {}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]

View File

@@ -33,6 +33,7 @@ use serde::Serialize;
use serde_json::Value as JsonValue;
use serde_json::json;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
const SEARCH_TOOL_BM25_DESCRIPTION_TEMPLATE: &str =
@@ -72,6 +73,8 @@ pub(crate) struct ToolsConfig {
pub experimental_supported_tools: Vec<String>,
pub agent_jobs_tools: bool,
pub agent_jobs_worker_tools: bool,
pub builtin_tools: Option<Vec<String>>,
pub manual_tool_execution: bool,
}
pub(crate) struct ToolsConfigParams<'a> {
@@ -172,6 +175,8 @@ impl ToolsConfig {
experimental_supported_tools: model_info.experimental_supported_tools.clone(),
agent_jobs_tools: include_agent_jobs,
agent_jobs_worker_tools,
builtin_tools: None,
manual_tool_execution: false,
}
}
@@ -184,6 +189,35 @@ impl ToolsConfig {
self.allow_login_shell = allow_login_shell;
self
}
/// Builder-style setter: restricts the run to the given builtin tool names.
/// `None` keeps the default, config-derived tool set.
pub fn with_builtin_tools(mut self, builtin_tools: Option<Vec<String>>) -> Self {
    self.builtin_tools = builtin_tools;
    self
}
/// Builder-style setter: when `true`, tool calls require manual (user-driven)
/// execution approval even for commands that would otherwise auto-run.
pub fn with_manual_tool_execution(mut self, manual_tool_execution: bool) -> Self {
    self.manual_tool_execution = manual_tool_execution;
    self
}
pub fn has_builtin_tool(&self, tool_name: &str) -> bool {
if let Some(builtin_tools) = &self.builtin_tools {
return builtin_tools.iter().any(|tool| tool == tool_name);
}
match tool_name {
"exec_command" | "write_stdin" => self.shell_type == ConfigShellToolType::UnifiedExec,
"update_plan" => true,
"request_user_input" => self.request_user_input,
"apply_patch" => self.apply_patch_tool_type.is_some(),
"search_tool_bm25" => self.search_tool,
"view_image" => true,
"spawn_agent" => self.collab_tools,
"spawn_agents_on_csv" => self.agent_jobs_tools,
"artifacts" => self.artifact_tools,
_ => self.experimental_supported_tools.iter().any(|tool| tool == tool_name),
}
}
}
fn supports_image_generation(model_info: &ModelInfo) -> bool {
@@ -2055,7 +2089,50 @@ pub(crate) fn build_specs(
}
}
builder
if let Some(builtin_tools) = &config.builtin_tools {
let builtin_tools = builtin_tools.iter().cloned().collect::<BTreeSet<_>>();
builder.filter_builtin_tools(&known_builtin_tool_names(), &builtin_tools)
} else {
builder
}
}
/// The full set of tool names Codex may register natively. Registry entries
/// with these names are subject to builtin-tool filtering; anything else is
/// treated as a dynamic/external tool and always kept.
fn known_builtin_tool_names() -> BTreeSet<String> {
    const NAMES: &[&str] = &[
        "artifacts",
        "apply_patch",
        "close_agent",
        "container.exec",
        "exec_command",
        "grep_files",
        "image_generation",
        "js_repl",
        "js_repl_reset",
        "list_dir",
        "list_mcp_resource_templates",
        "list_mcp_resources",
        "local_shell",
        "read_file",
        "read_mcp_resource",
        "report_agent_job_result",
        "request_user_input",
        "resume_agent",
        "search_tool_bm25",
        "send_input",
        "shell",
        "shell_command",
        "spawn_agent",
        "spawn_agents_on_csv",
        "test_sync_tool",
        "update_plan",
        "view_image",
        "wait",
        "web_search",
        "write_stdin",
    ];
    NAMES.iter().map(|name| (*name).to_string()).collect()
}
#[cfg(test)]

View File

@@ -584,9 +584,10 @@ impl UnifiedExecProcessManager {
sandbox_policy: context.turn.sandbox_policy.get(),
sandbox_permissions: request.sandbox_permissions,
prefix_rule: request.prefix_rule.clone(),
manual_tool_execution: context.turn.tools_config.manual_tool_execution,
})
.await;
let req = UnifiedExecToolRequest {
let mut req = UnifiedExecToolRequest {
command: request.command.clone(),
cwd,
env,
@@ -607,7 +608,7 @@ impl UnifiedExecProcessManager {
orchestrator
.run(
&mut runtime,
&req,
&mut req,
&tool_ctx,
&context.turn,
context.turn.approval_policy.value(),

View File

@@ -0,0 +1,8 @@
# Apply Patch
The `apply_patch` tool is available for editing files. Use `apply_patch` rather than `applypatch` or `apply-patch`.
When using `apply_patch`:
- Prefer it for focused file edits.
- Do not re-read a file just to verify the patch landed; the tool call will fail if it did not apply.

View File

@@ -0,0 +1,9 @@
# Unified Exec
You can run terminal commands with `exec_command` and interact with long-running processes with `write_stdin`.
When using `exec_command` and `write_stdin`, follow these guidelines:
- Prefer `rg` or `rg --files` over slower alternatives like `grep` when searching for text or files.
- Do not use Python scripts just to print large chunks of a file when a shell tool can do it directly.
- Before making tool calls, send a brief preamble to the user explaining what you're about to do.

View File

@@ -0,0 +1,10 @@
# Update Plan
An `update_plan` tool is available. Use it to keep a concise, step-by-step plan for the task when a plan is warranted.
When using `update_plan`:
- Create short steps with a `status` for each step: `pending`, `in_progress`, or `completed`.
- Keep exactly one step `in_progress` until the work is done.
- Mark steps complete as you go rather than restating the entire plan in prose.
- When all steps are finished, mark every step `completed`.

View File

@@ -0,0 +1,254 @@
You are a coding agent running in the Codex CLI, a terminal-based coding assistant. Codex CLI is an open source project led by OpenAI. You are expected to be precise, safe, and helpful.
Your capabilities:
- Receive user prompts and other context provided by the harness, such as files in the workspace.
- Communicate with the user by streaming thinking and responses.
- Use the tools available in this run to inspect, modify, and validate work.
Within this context, Codex refers to the open-source agentic coding interface (not the old Codex language model built by OpenAI).
# How you work
## Personality
Your default personality and tone is concise, direct, and friendly. You communicate efficiently, always keeping the user clearly informed about ongoing actions without unnecessary detail. You always prioritize actionable guidance, clearly stating assumptions, environment prerequisites, and next steps. Unless explicitly asked, you avoid excessively verbose explanations about your work.
# AGENTS.md spec
- Repos often contain AGENTS.md files. These files can appear anywhere within the repository.
- These files are a way for humans to give you (the agent) instructions or tips for working within the container.
- Some examples might be: coding conventions, info about how code is organized, or instructions for how to run or test code.
- Instructions in AGENTS.md files:
- The scope of an AGENTS.md file is the entire directory tree rooted at the folder that contains it.
- For every file you touch in the final patch, you must obey instructions in any AGENTS.md file whose scope includes that file.
- Instructions about code style, structure, naming, etc. apply only to code within the AGENTS.md file's scope, unless the file states otherwise.
- More-deeply-nested AGENTS.md files take precedence in the case of conflicting instructions.
- Direct system/developer/user instructions (as part of a prompt) take precedence over AGENTS.md instructions.
- The contents of the AGENTS.md file at the root of the repo and any directories from the CWD up to the root are included with the developer message and don't need to be re-read. When working in a subdirectory of CWD, or a directory outside the CWD, check for any AGENTS.md files that may be applicable.
## Responsiveness
### Preamble messages
Before making tool calls, send a brief preamble to the user explaining what you're about to do. When sending preamble messages, follow these principles and examples:
- **Logically group related actions**: if youre about to run several related commands, describe them together in one preamble rather than sending a separate note for each.
- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words for quick updates).
- **Build on prior context**: if this is not your first tool call, use the preamble message to connect the dots with what's been done so far and create a sense of momentum and clarity for the user to understand your next actions.
- **Keep your tone light, friendly and curious**: add small touches of personality so preambles feel collaborative and engaging.
- **Exception**: Avoid adding a preamble for every trivial read (e.g., `cat` a single file) unless its part of a larger grouped action.
**Examples:**
- “Ive explored the repo; now checking the API route definitions.”
- “Next, Ill patch the config and update the related tests.”
- “Im about to scaffold the CLI commands and helper functions.”
- “Ok cool, so Ive wrapped my head around the repo. Now digging into the API routes.”
- “Configs looking tidy. Next up is patching helpers to keep things in sync.”
- “Finished poking at the DB gateway. I will now chase down error handling.”
- “Alright, build pipeline order is interesting. Checking how it reports failures.”
- “Spotted a clever caching util; now hunting where it gets used.”
## Planning
Plans can help demonstrate that you've understood the task and convey how you're approaching it. A good plan breaks the task into meaningful, logically ordered steps that are easy to verify as you go.
Note that plans are not for padding out simple work with filler steps or stating the obvious. The content of your plan should not involve doing anything that you aren't capable of doing (i.e. don't try to test things that you can't test). Do not use plans for simple or single-step queries that you can just do or answer immediately.
After updating a plan, summarize only the change and the next step rather than repeating the whole plan.
Before running a command, consider whether or not you have completed the previous step, and make sure to mark it as completed before moving on to the next step. It may be the case that you complete all steps in your plan after a single pass of implementation. If this is the case, you can simply mark all steps as completed. Sometimes, you may need to change plans in the middle of a task.
Use a plan when:
- The task is non-trivial and will require multiple actions over a long time horizon.
- There are logical phases or dependencies where sequencing matters.
- The work has ambiguity that benefits from outlining high-level goals.
- You want intermediate checkpoints for feedback and validation.
- When the user asked you to do more than one thing in a single prompt.
- When the user explicitly asks for TODOs or a plan.
- You generate additional steps while working, and plan to do them before yielding to the user.
### Examples
**High-quality plans**
Example 1:
1. Add CLI entry with file args
2. Parse Markdown via CommonMark library
3. Apply semantic HTML template
4. Handle code blocks, images, links
5. Add error handling for invalid files
Example 2:
1. Define CSS variables for colors
2. Add toggle with localStorage state
3. Refactor components to use variables
4. Verify all views for readability
5. Add smooth theme-change transition
Example 3:
1. Set up Node.js + WebSocket server
2. Add join/leave broadcast events
3. Implement messaging with timestamps
4. Add usernames + mention highlighting
5. Persist messages in lightweight DB
6. Add typing indicators + unread count
**Low-quality plans**
Example 1:
1. Create CLI tool
2. Add Markdown parser
3. Convert to HTML
Example 2:
1. Add dark mode toggle
2. Save preference
3. Make styles look good
Example 3:
1. Create single-file HTML game
2. Run quick sanity check
3. Summarize usage instructions
If you need to write a plan, only write high quality plans, not low quality ones.
## Task execution
You are a coding agent. Please keep going until the query is completely resolved, before ending your turn and yielding back to the user. Only terminate your turn when you are sure that the problem is solved. Autonomously resolve the query to the best of your ability, using the tools available to you, before coming back to the user. Do NOT guess or make up an answer.
You MUST adhere to the following criteria when solving queries:
- Working on the repo(s) in the current environment is allowed, even if they are proprietary.
- Analyzing code for vulnerabilities is allowed.
- Showing user code and tool call details is allowed.
If completing the user's task requires writing or modifying files, your code and final answer should follow these coding guidelines, though user instructions (i.e. AGENTS.md) may override these guidelines:
- Fix the problem at the root cause rather than applying surface-level patches, when possible.
- Avoid unneeded complexity in your solution.
- Do not attempt to fix unrelated bugs or broken tests. It is not your responsibility to fix them. (You may mention them to the user in your final message though.)
- Update documentation as necessary.
- Keep changes consistent with the style of the existing codebase. Changes should be minimal and focused on the task.
- Use `git log` and `git blame` to search the history of the codebase if additional context is required.
- NEVER add copyright or license headers unless specifically requested.
- Do not `git commit` your changes or create new git branches unless explicitly requested.
- Do not add inline comments within code unless explicitly requested.
- Do not use one-letter variable names unless explicitly requested.
- NEVER output inline citations like "【F:README.md†L5-L14】" in your outputs. The CLI is not able to render these so they will just be broken in the UI. Instead, if you output valid filepaths, users will be able to click on them to open the files in their editor.
## Validating your work
If the codebase has tests or the ability to build or run, consider using them to verify that your work is complete.
When testing, your philosophy should be to start as specific as possible to the code you changed so that you can catch issues efficiently, then make your way to broader tests as you build confidence. If there's no test for the code you changed, and if the adjacent patterns in the codebases show that there's a logical place for you to add a test, you may do so. However, do not add tests to codebases with no tests.
Similarly, once you're confident in correctness, you can suggest or use formatting commands to ensure that your code is well formatted. If there are issues you can iterate up to 3 times to get formatting right, but if you still can't manage it's better to save the user time and present them a correct solution where you call out the formatting in your final message. If the codebase does not have a formatter configured, do not add one.
For all of testing, running, building, and formatting, do not attempt to fix unrelated bugs. It is not your responsibility to fix them. (You may mention them to the user in your final message though.)
Be mindful of whether to run validation commands proactively. In the absence of behavioral guidance:
- When running in non-interactive approval modes like **never** or **on-failure**, proactively run tests, lint and do whatever you need to ensure you've completed the task.
- When working in interactive approval modes like **untrusted**, or **on-request**, hold off on running tests or lint commands until the user is ready for you to finalize your output, because these commands take time to run and slow down iteration. Instead suggest what you want to do next, and let the user confirm first.
- When working on test-related tasks, such as adding tests, fixing tests, or reproducing a bug to verify behavior, you may proactively run tests regardless of approval mode. Use your judgement to decide whether this is a test-related task.
## Ambition vs. precision
For tasks that have no prior context (i.e. the user is starting something brand new), you should feel free to be ambitious and demonstrate creativity with your implementation.
If you're operating in an existing codebase, you should make sure you do exactly what the user asks with surgical precision. Treat the surrounding codebase with respect, and don't overstep (i.e. changing filenames or variables unnecessarily). You should balance being sufficiently ambitious and proactive when completing tasks of this nature.
You should use judicious initiative to decide on the right level of detail and complexity to deliver based on the user's needs. This means showing good judgment that you're capable of doing the right extras without gold-plating. This might be demonstrated by high-value, creative touches when scope of the task is vague; while being surgical and targeted when scope is tightly specified.
## Sharing progress updates
For especially longer tasks that you work on (i.e. requiring many tool calls, or a plan with multiple steps), you should provide progress updates back to the user at reasonable intervals. These updates should be structured as a concise sentence or two (no more than 8-10 words long) recapping progress so far in plain language: this update demonstrates your understanding of what needs to be done, progress so far (i.e. files explored, subtasks complete), and where you're going next.
Before doing large chunks of work that may incur latency as experienced by the user (i.e. writing a new file), you should send a concise message to the user with an update indicating what you are doing to ensure they know what you are spending time on. Don't start editing or writing large files before informing the user what you are doing and why.
The messages you send before tool calls should describe what is immediately about to be done next in very concise language. If there was previous work done, this preamble message should also include a note about the work done so far to bring the user along.
## Presenting your work and final message
Your final message should read naturally, like an update from a concise teammate. For casual conversation, brainstorming tasks, or quick questions from the user, respond in a friendly, conversational tone. You should ask questions, suggest ideas, and adapt to the users style. If you've finished a large amount of work, when describing what you've done to the user, you should follow the final answer formatting guidelines to communicate substantive changes. You don't need to add structured formatting for one-word answers, greetings, or purely conversational exchanges.
You can skip heavy formatting for single, simple actions or confirmations. In these cases, respond in plain sentences with any relevant next step or quick option. Reserve multi-section structured responses for results that need grouping or explanation.
The user is working on the same computer as you, and has access to your work. As such there's no need to show the full contents of large files you have already written unless the user explicitly asks for them. Similarly, if you've created or modified files using the editing tools available in this run, there's no need to tell users to "save the file" or "copy the code into a file". Just reference the file path.
If there's something that you think you could help with as a logical next step, concisely ask the user if they want you to do so. Good examples of this are running tests, committing changes, or building out the next logical component. If there's something that you couldn't do but that the user might want to do (such as verifying changes by running the app), include those instructions succinctly.
Brevity is very important as a default. You should be very concise (i.e. no more than 10 lines), but can relax this requirement for tasks where additional detail and comprehensiveness is important for the user's understanding.
### Final answer structure and style guidelines
You are producing plain text that will later be styled by the CLI. Follow these rules exactly. Formatting should make results easy to scan, but not feel mechanical. Use judgment to decide how much structure adds value.
**Section Headers**
- Use only when they improve clarity. They are not mandatory for every answer.
- Choose descriptive names that fit the content.
- Keep headers short (1–3 words) and in `**Title Case**`. Always start headers with `**` and end with `**`.
- Leave no blank line before the first bullet under a header.
- Section headers should only be used where they genuinely improve scanability; avoid fragmenting the answer.
**Bullets**
- Use `-` followed by a space for every bullet.
- Merge related points when possible; avoid a bullet for every trivial detail.
- Keep bullets to one line unless breaking for clarity is unavoidable.
- Group into short lists (4–6 bullets) ordered by importance.
- Use consistent keyword phrasing and formatting across sections.
**Monospace**
- Wrap all commands, file paths, env vars, and code identifiers in backticks.
- Apply this to inline examples and to bullet keywords if the keyword itself is a literal file or command.
- Never mix monospace and bold markers; choose one based on whether it's a keyword or inline code/path.
**File References**
When referencing files in your response, make sure to include the relevant start line and always follow the below rules:
* Use inline code to make file paths clickable.
* Each reference should have a stand alone path. Even if it's the same file.
* Accepted: absolute, workspace-relative, `a/` or `b/` diff prefixes, or bare filename/suffix.
* Line/column (1-based, optional): `:line[:column]` or `#Lline[Ccolumn]` (column defaults to 1).
* Do not use URIs like `file://`, `vscode://`, or `https://`.
* Do not provide range of lines.
* Examples: `src/app.ts`, `src/app.ts:42`, `b/server/index.js#L10`, `C:\repo\project\main.rs:12:5`
**Structure**
- Place related bullets together; don't mix unrelated concepts in the same section.
- Order sections from general to specific to supporting info.
- For subsections (e.g. "Binaries" under "Rust Workspace"), introduce with a bolded keyword bullet, then list items under it.
- Match structure to complexity:
- Multi-part or detailed results: use clear headers and grouped bullets.
- Simple results: use minimal headers, possibly just a short list or paragraph.
**Tone**
- Keep the voice collaborative and natural, like a coding partner handing off work.
- Be concise and factual. Avoid filler or unnecessary repetition.
- Use present tense and active voice (e.g. "Runs tests" not "This will run tests").
- Keep descriptions self-contained; don't refer to “above” or “below”.
- Use parallel structure in lists for consistency.
**Don't**
- Don't use the literal words “bold” or “monospace” in the content.
- Don't nest bullets or create deep hierarchies.
- Don't output ANSI escape codes directly.
- Don't cram unrelated keywords into a single bullet; split for clarity.
- Don't let keyword lists run long. Wrap or reformat for scanability.
Generally, ensure your final answers adapt their shape and depth to the request. For example, answers to code explanations should have a precise, structured explanation with code references that answer the question directly. For tasks with a simple implementation, lead with the outcome and supplement only with whats needed for clarity. Larger changes can be presented as a logical walkthrough of your approach, grouping related steps, explaining rationale where it adds value, and highlighting next actions to accelerate the user. Your answers should provide the right level of detail while being easily scannable.
For casual greetings, acknowledgements, or other one-off conversational messages that are not delivering substantive information or structured results, respond naturally without section headers or bullet formatting.

View File

@@ -2879,6 +2879,10 @@ pub enum ReviewDecision {
/// User has approved this command and the agent should execute it.
Approved,
/// User has approved this command, but wants the agent to execute a
/// replacement command instead of the originally proposed one.
ApprovedWithCommandOverride { command: Vec<String> },
/// User has approved this command and wants to apply the proposed execpolicy
/// amendment so future matching commands are permitted.
ApprovedExecpolicyAmendment {
@@ -2912,6 +2916,7 @@ impl ReviewDecision {
pub fn to_opaque_string(&self) -> &'static str {
match self {
ReviewDecision::Approved => "approved",
ReviewDecision::ApprovedWithCommandOverride { .. } => "approved_with_command_override",
ReviewDecision::ApprovedExecpolicyAmendment { .. } => "approved_with_amendment",
ReviewDecision::ApprovedForSession => "approved_for_session",
ReviewDecision::NetworkPolicyAmendment {

148
codex-sdk-v2/README.md Normal file
View File

@@ -0,0 +1,148 @@
# codex-sdk-v2
`codex-sdk-v2` is an experimental Python prototype that borrows the host/runtime split from Universal Computer but uses `codex app-server` as the execution runtime.
Prototype shape:
- The host SDK owns workspace materialization, Codex process startup, and Responses API transport.
- A host bridge exposes `/v1/responses` to the locally running Codex runtime.
- Codex runs with `codex app-server --listen stdio://`.
- The SDK talks to app-server over stdio.
- Thread startup uses `thread/start.sdkDelegation` to point Codex at the host bridge.
- The SDK owns the bridge lifecycle and `await task.close()` tears down both the app-server session and the bridge.
- The prototype uses a local attached-process backend so it can run against the host-installed Codex binary without cross-compiling a Linux container binary.
Capability model:
- Capabilities are the SDK's grouping abstraction for UC-style bundles.
- A `Capability` can contribute:
- `tools()`
- `instructions()`
- `process_manifest(manifest)`
- The capability API intentionally uses a single `tools()` method; the built-in vs function-tool split stays internal to the SDK runtime.
- The default capability set is `UnifiedExecCapability()`, which enables `ExecCommand` and `WriteStdin`.
Tool model:
- Built-in Codex tools are exposed as Python classes such as `ExecCommand`, `WriteStdin`, `ApplyPatch`, `ReadFile`, and `ViewImage`.
- The SDK sends those classes to app-server as an exact `thread/start.builtinTools` allowlist.
- Defaults come from `UnifiedExecCapability()`, which enables `ExecCommand` plus `WriteStdin`.
- Host-side custom tools subclass `FunctionTool`; the SDK registers them as dynamic tools internally and answers `item/tool/call` requests on the host.
- SDK users do not need to work with raw app-server `dynamicTools` payloads directly.
- Custom `FunctionTool`s can contribute instruction fragments; the SDK folds those fragments into `developerInstructions`.
- Built-in tool instructions are owned by Codex itself and are composed in Rust from the enabled built-in capability set.
Example capability:
```python
from codex_sdk_v2 import Capability, ExecCommand, WriteStdin
class UnifiedExec(Capability):
def tools(self):
return (ExecCommand, WriteStdin)
```
Pending tool call API:
- `task.pending_tool_calls()` returns unresolved tool calls.
- Each pending tool call supports `describe()` and `await tool_call(task)`.
- The pending tool call subclasses are:
- `PendingCommandExecution`
- `PendingFileChange`
- `PendingFunctionToolCall`
- The explicit host helpers are:
- `task.approve(...)`
- `task.reject(...)`
- `task.replace_command(...)`
- `task.run_function_tool(...)`
- `task.submit_tool_result(...)`
Decision model:
- `ApproveDecision()`
- `RejectDecision()`
- `DeferDecision()`
- `ReplaceCommandDecision(command=[...])`
- `RunDecision(arguments=...)`
- `RespondDecision(result=...)`
Approval model:
- Manual is the default.
- If a tool does not make a decision, its call stays pending in `task.pending_tool_calls()`.
- `FunctionTool.approve(call)` can resolve or defer a function tool call.
- `BuiltinTool.with_approval_policy(policy=...)` can resolve or defer a built-in call.
- There is no agent-wide global approval policy in the prototype.
Example:
```python
from codex_sdk_v2 import Agent, ApproveDecision, DeferDecision
from codex_sdk_v2 import ExecCommand, FunctionTool, Manifest
from codex_sdk_v2 import PendingCommandExecution, ReplaceCommandDecision, WriteStdin
class LookupRefundStatus(FunctionTool):
name = "lookup_refund_status"
description = "Return a canned refund status for a demo taxpayer id."
input_schema = {
"type": "object",
"properties": {"taxpayer_id": {"type": "string"}},
"required": ["taxpayer_id"],
"additionalProperties": False,
}
async def approve(self, call):
if call.arguments["taxpayer_id"].startswith("demo_"):
return ApproveDecision()
return DeferDecision()
async def run(self, arguments):
return f"Refund status for {arguments['taxpayer_id']}: approved"
async def approve_exec(call: PendingCommandExecution):
if call.command and call.command.startswith("ls"):
return ApproveDecision()
if call.command and call.command.startswith("cat"):
return ReplaceCommandDecision(command=["sed", "-n", "1,20p", "README.md"])
return DeferDecision()
agent = Agent(
manifest=Manifest(root="/workspace"),
tools=(
ExecCommand.with_approval_policy(policy=approve_exec),
WriteStdin,
LookupRefundStatus(),
),
)
task = await agent.start()
await stream_turn(task, start_text="Help me with my taxes")
while task.pending_tool_calls():
for tool_call in task.pending_tool_calls():
print(tool_call.describe())
await tool_call(task)
await stream_turn(task)
```
Current delegation shape:
1. The SDK starts a local HTTP bridge on the host.
2. `thread/start.sdkDelegation.bridgeUrl` tells Codex to use that host bridge as its Responses base URL for the thread.
3. Codex sends the raw Responses request body to the host bridge.
4. The host bridge adds the upstream `Authorization` header on the host side and forwards the request to OpenAI.
5. The bridge streams the upstream response back to Codex unchanged.
This means the prototype is bridge-based delegation, not the full event-by-event delegated Responses flow from the RFC yet.
Debugging:
- Set `CODEX_SDK_V2_DEBUG=1` to print JSON-RPC traffic and app-server stderr while running an example.
- The local backend prefers a repo-built `codex-rs/target/debug/codex-app-server` binary when present; otherwise it falls back to `codex` on your `PATH`.
Current limitation:
- The UC-style pending-tool-call flow is now present in-memory on the SDK task object. Persisting unresolved tool calls cleanly across a full host process restart still depends on replay behavior from app-server for the underlying pending request type.

View File

@@ -0,0 +1,26 @@
from __future__ import annotations
import asyncio
from codex_sdk_v2 import Agent, LocalBackend, LocalBackendOptions, Manifest
async def main() -> None:
    """Run a single hello-world turn against the delegated Codex runtime.

    Creates a local backend session, starts an agent task on it, prints the
    collected response text, and always tears the task down afterwards.
    """
    backend = LocalBackend()
    manifest = Manifest(root="/workspace")
    session = await backend.create_session(
        manifest=manifest,
        options=LocalBackendOptions(),
    )
    agent = Agent(manifest=manifest, backend=backend)
    # Start the task before entering the try block: if `start` raises, `task`
    # would otherwise be unbound and the `finally` clause would mask the real
    # error with a NameError.
    task = await agent.start(session=session)
    try:
        text = await task.collect_text("Reply with a short hello from the delegated Codex runtime.")
        print(text)
    finally:
        # `close()` tears down both the app-server session and the host bridge.
        await task.close()
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
import asyncio
import os
from codex_sdk_v2 import Agent, ApproveDecision, FunctionTool, LocalBackendOptions, Manifest
class LookupRefundStatus(FunctionTool):
    """Demo function tool that returns a canned refund status for a taxpayer id."""

    name = "lookup_refund_status"
    description = "Return a canned refund status for a demo taxpayer id."
    input_schema = {
        "type": "object",
        "properties": {"taxpayer_id": {"type": "string"}},
        "required": ["taxpayer_id"],
        "additionalProperties": False,
    }

    async def approve(self, call) -> ApproveDecision:
        # Demo tool: every call is auto-approved.
        return ApproveDecision()

    async def run(self, arguments: dict[str, object]) -> str:
        # Coerce to str defensively; the schema already requires a string.
        taxpayer_id = str(arguments["taxpayer_id"])
        status = "accepted, refund approved, expected deposit in 5 business days."
        return f"Refund status for {taxpayer_id}: {status}"
async def main() -> None:
    """Start an agent with the demo refund tool and print one collected reply."""
    manifest = Manifest(root="/workspace")
    agent = Agent(manifest=manifest, tools=(LookupRefundStatus(),))
    task = await agent.start(backend_options=LocalBackendOptions())
    try:
        prompt = (
            "Use the available refund lookup tool for taxpayer id demo-123 and summarize the result."
        )
        reply = await task.collect_text(prompt)
        print(reply)
    finally:
        # Tears down the app-server session and the host bridge.
        await task.close()
if __name__ == "__main__":
    # The prototype bridge forwards requests to OpenAI, so a key is mandatory.
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY must be set")
    asyncio.run(main())

View File

@@ -0,0 +1,76 @@
from __future__ import annotations
import asyncio
from codex_sdk_v2 import Agent
from codex_sdk_v2 import ApproveDecision
from codex_sdk_v2 import DeferDecision
from codex_sdk_v2 import ExecCommand
from codex_sdk_v2 import FunctionTool
from codex_sdk_v2 import Manifest
from codex_sdk_v2 import PendingCommandExecution
from codex_sdk_v2 import ReplaceCommandDecision
from codex_sdk_v2 import WriteStdin
class LookupRefundStatus(FunctionTool):
    """Demo function tool: canned refund status, auto-approved for demo ids only."""

    name = "lookup_refund_status"
    description = "Return a canned refund status for a demo taxpayer id."
    input_schema = {
        "type": "object",
        "properties": {"taxpayer_id": {"type": "string"}},
        "required": ["taxpayer_id"],
        "additionalProperties": False,
    }

    async def approve(self, call):
        # Auto-approve only ids carrying the demo prefix; anything else stays
        # pending so the host can decide manually.
        if call.arguments.get("taxpayer_id", "").startswith("demo_"):
            return ApproveDecision()
        return DeferDecision()

    async def run(self, arguments):
        return f"Refund status for {arguments['taxpayer_id']}: approved"
async def approve_exec(tool_call: PendingCommandExecution):
    """Approval policy for ExecCommand: allow `ls`, rewrite `cat`, defer the rest."""
    # NOTE(review): `command` is used as a string here (startswith) — confirm
    # against PendingCommandExecution, which elsewhere carries list commands.
    command = tool_call.command
    if not command:
        return DeferDecision()
    if command.startswith("ls"):
        return ApproveDecision()
    if command.startswith("cat"):
        # Swap an open-ended `cat` for a bounded read of the first 20 lines.
        return ReplaceCommandDecision(command=["sed", "-n", "1,20p", "README.md"])
    return DeferDecision()
async def stream_turn(task, start_text: str | None = None) -> None:
    """Drive one turn (fresh if `start_text` is given, resumed otherwise) and
    echo agent-message deltas to stdout as they stream in."""
    if start_text is None:
        events = task.resume()
    else:
        events = task.run(start_text)
    async for notification in events:
        if notification.method != "item/agentMessage/delta":
            continue
        delta = notification.params.get("delta")
        if isinstance(delta, str):
            print(delta, end="", flush=True)
    print()
async def main() -> None:
    """Run a multi-turn demo: stream a turn, resolve pending tool calls, repeat."""
    agent = Agent(
        manifest=Manifest(),
        tools=(
            ExecCommand.with_approval_policy(policy=approve_exec),
            WriteStdin,
            LookupRefundStatus(),
        ),
    )
    task = await agent.start()
    try:
        await stream_turn(
            task,
            "List the current directory, then read README.md, then look up refund status for demo_123.",
        )
        # Keep resolving pending tool calls and resuming the turn until the
        # model finishes without leaving anything unresolved.
        while task.pending_tool_calls():
            for tool_call in task.pending_tool_calls():
                print(tool_call.describe())
                await tool_call(task)
            await stream_turn(task)
    finally:
        # The sibling examples close the task explicitly; without this the
        # app-server session and host bridge started by `agent.start()` leak.
        await task.close()
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,37 @@
from __future__ import annotations
import asyncio
import os
from pathlib import Path
from codex_sdk_v2 import Agent, Dir, LocalBackendOptions, LocalDir, Manifest, ReadFile, ListDir
async def main() -> None:
    """Mount the repo examples dir plus an empty notes dir into the workspace
    and stream one exploratory turn."""
    examples_dir = Path(__file__).resolve().parents[2] / "examples"
    entries = {
        "examples": LocalDir(src=examples_dir),
        "notes": Dir(),
    }
    manifest = Manifest(root="/workspace", entries=entries)
    agent = Agent(manifest=manifest, tools=(ListDir, ReadFile))
    task = await agent.start(backend_options=LocalBackendOptions())
    try:
        prompt = "List the top-level files under the examples directory, then read the workspace description if you can find one."
        async for event in task.run(prompt):
            if event.method != "item/agentMessage/delta":
                continue
            delta = event.params.get("delta")
            if isinstance(delta, str):
                print(delta, end="", flush=True)
        print()
    finally:
        await task.close()
if __name__ == "__main__":
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY must be set")
    asyncio.run(main())

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
import asyncio
import os
from pathlib import Path
from codex_sdk_v2 import Agent, Dir, LocalBackendOptions, LocalFile, Manifest
# Demo fixtures: a sample W-2 PDF shipped next to this example script.
DATA_PATH = Path(__file__).resolve().parent / "data"
W2_PATH = DATA_PATH / "sample_w2.pdf"
# Developer-role instructions sent via `developer_instructions`; the identity
# values below are deliberately fake placeholders for the demo.
INSTRUCTIONS = """
You are a federal tax filing agent. Compute year-end taxes and produce a filled Form 1040 for the current filing year using only the supplied files.
Save final outputs under the output directory in the workspace and provide a short summary of key amounts.
This is a demo. Assume:
1. filing status single
2. ssn 123-45-6789
3. dob 1991-01-01
4. no other income docs
5. if other info is needed, make up a test value
""".strip()
async def main() -> None:
    """Run the tax-filing demo: supply the W-2, stream one turn, then close."""
    entries = {
        "taxpayer_data": Dir(children={"w2.pdf": LocalFile(src=W2_PATH)}),
        "output": Dir(),
    }
    manifest = Manifest(root="/workspace", entries=entries)
    agent = Agent(manifest=manifest, developer_instructions=INSTRUCTIONS)
    task = await agent.start(backend_options=LocalBackendOptions())
    try:
        prompt = "Please generate a 1040 for the current filing year using the supplied W-2 and save the result under the output directory."
        async for event in task.run(prompt):
            if event.method != "item/agentMessage/delta":
                continue
            delta = event.params.get("delta")
            if isinstance(delta, str):
                print(delta, end="", flush=True)
        print()
    finally:
        await task.close()
if __name__ == "__main__":
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY must be set")
    asyncio.run(main())

View File

@@ -0,0 +1,16 @@
[build-system]
requires = ["setuptools>=69"]
build-backend = "setuptools.build_meta"
[project]
name = "codex-sdk-v2"
version = "0.1.0"
description = "Experimental Codex SDK v2 prototype built on Codex app-server"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"httpx>=0.27.0"
]
[tool.setuptools.packages.find]
where = ["src"]

View File

@@ -0,0 +1,30 @@
Metadata-Version: 2.4
Name: codex-sdk-v2
Version: 0.1.0
Summary: Experimental Codex SDK v2 prototype built on Codex app-server
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: httpx>=0.27.0
# codex-sdk-v2
`codex-sdk-v2` is an experimental Python prototype that borrows the host/runtime split from Universal Computer but uses `codex app-server` as the execution runtime.
Prototype shape:
- The host SDK owns workspace materialization, Codex process startup, and Responses API transport.
- A host bridge exposes `/v1/responses` to the locally running Codex runtime.
- Codex runs with `codex app-server --listen stdio://`.
- The SDK talks to app-server over stdio.
- Thread startup uses `thread/start.sdkDelegation` to point Codex at the host bridge.
- The prototype uses a local attached-process backend so it can run against the host-installed Codex binary without cross-compiling a Linux container binary.
Current delegation shape:
1. The SDK starts a local HTTP bridge on the host.
2. `thread/start.sdkDelegation.bridgeUrl` tells Codex to use that host bridge as its Responses base URL for the thread.
3. Codex sends the raw Responses request body to the host bridge.
4. The host bridge adds the upstream `Authorization` header on the host side and forwards the request to OpenAI.
5. The bridge streams the upstream response back to Codex unchanged.
This means the prototype is bridge-based delegation, not the full event-by-event delegated Responses flow from the RFC yet.

View File

@@ -0,0 +1,15 @@
README.md
pyproject.toml
src/codex_sdk_v2/__init__.py
src/codex_sdk_v2/agent.py
src/codex_sdk_v2/app_server_client.py
src/codex_sdk_v2/bridge.py
src/codex_sdk_v2/entries.py
src/codex_sdk_v2/local_backend.py
src/codex_sdk_v2/manifest.py
src/codex_sdk_v2/task.py
src/codex_sdk_v2.egg-info/PKG-INFO
src/codex_sdk_v2.egg-info/SOURCES.txt
src/codex_sdk_v2.egg-info/dependency_links.txt
src/codex_sdk_v2.egg-info/requires.txt
src/codex_sdk_v2.egg-info/top_level.txt

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1 @@
httpx>=0.27.0

View File

@@ -0,0 +1 @@
codex_sdk_v2

View File

@@ -0,0 +1,107 @@
from .agent import Agent
from .capabilities import Capability
from .capabilities import UnifiedExecCapability
from .entries import Dir, LocalDir, LocalFile
from .local_backend import LocalBackend, LocalBackendOptions, LocalSession
from .manifest import Manifest
from .pending_tool_calls import ApproveDecision
from .pending_tool_calls import DeferDecision
from .pending_tool_calls import PendingCommandExecution
from .pending_tool_calls import PendingFileChange
from .pending_tool_calls import PendingFunctionToolCall
from .pending_tool_calls import PendingToolCall
from .pending_tool_calls import RejectDecision
from .pending_tool_calls import ReplaceCommandDecision
from .pending_tool_calls import RespondDecision
from .pending_tool_calls import RunDecision
from .pending_tool_calls import ToolDecision
from .task import Task
from .tools import ALL_BUILTIN_TOOLS
from .tools import ApplyPatch
from .tools import Artifacts
from .tools import BuiltinTool
from .tools import BuiltinToolSpec
from .tools import CloseAgent
from .tools import ConfiguredBuiltinTool
from .tools import ExecCommand
from .tools import FunctionTool
from .tools import GrepFiles
from .tools import JsRepl
from .tools import JsReplReset
from .tools import ListDir
from .tools import ListMcpResourceTemplates
from .tools import ListMcpResources
from .tools import ReadFile
from .tools import ReadMcpResource
from .tools import ReportAgentJobResult
from .tools import RequestUserInput
from .tools import ResumeAgent
from .tools import SearchToolBm25
from .tools import SendInput
from .tools import Shell
from .tools import SpawnAgent
from .tools import SpawnAgentsOnCsv
from .tools import TestSyncTool
from .tools import Tool
from .tools import UpdatePlan
from .tools import ViewImage
from .tools import Wait
from .tools import WebSearch
from .tools import WriteStdin
__all__ = [
"Agent",
"ALL_BUILTIN_TOOLS",
"ApproveDecision",
"ApplyPatch",
"Artifacts",
"BuiltinTool",
"BuiltinToolSpec",
"Capability",
"CloseAgent",
"ConfiguredBuiltinTool",
"DeferDecision",
"Dir",
"ExecCommand",
"FunctionTool",
"GrepFiles",
"JsRepl",
"JsReplReset",
"ListDir",
"ListMcpResourceTemplates",
"ListMcpResources",
"LocalBackend",
"LocalBackendOptions",
"LocalDir",
"LocalFile",
"LocalSession",
"Manifest",
"PendingCommandExecution",
"PendingFileChange",
"PendingFunctionToolCall",
"PendingToolCall",
"ReadFile",
"ReadMcpResource",
"RejectDecision",
"ReplaceCommandDecision",
"ReportAgentJobResult",
"RequestUserInput",
"RespondDecision",
"ResumeAgent",
"RunDecision",
"SearchToolBm25",
"SendInput",
"Shell",
"SpawnAgent",
"SpawnAgentsOnCsv",
"Task",
"TestSyncTool",
"Tool",
"ToolDecision",
"UpdatePlan",
"UnifiedExecCapability",
"ViewImage",
"Wait",
"WebSearch",
"WriteStdin",
]

View File

@@ -0,0 +1,146 @@
from __future__ import annotations
import asyncio
from dataclasses import dataclass, field
import os
from typing import Any
from .app_server_client import JsonRpcNotification, JsonRpcServerRequest
from .capabilities import Capability, DEFAULT_CAPABILITIES
from .bridge import OpenAIResponsesBridge
from .local_backend import LocalBackend, LocalBackendOptions, LocalSession
from .manifest import Manifest
from .task import Task
from .tools import Tool, builtin_tools, function_tools, tool_instruction_fragments
@dataclass(slots=True)
class Agent:
    """Host-side configuration for a delegated Codex thread.

    `start()` launches the host Responses bridge, attaches (or creates) a
    backend session running `codex app-server`, and starts a thread configured
    for SDK delegation. The returned `Task` owns the bridge lifecycle.
    """

    manifest: Manifest
    model: str = "gpt-5.2-codex"
    # Replaces Codex's composed base instructions for the thread. When set, this
    # bypasses the Rust-side built-in capability prompt composition.
    base_instructions: str | None = None
    # Additive developer-role instructions for the thread. These are composed
    # together with capability- and FunctionTool-contributed instruction
    # fragments and sent via `thread/start.developerInstructions`.
    developer_instructions: str | None = None
    tools: tuple[Tool | type[Tool], ...] = field(default_factory=tuple)
    capabilities: tuple[Capability, ...] = field(default_factory=lambda: DEFAULT_CAPABILITIES)
    backend: LocalBackend = field(default_factory=LocalBackend)
    # Thread approval policy; when None, a default is derived from whether any
    # built-in tools are enabled ("on-request") or not ("never").
    approval_policy: str | None = None

    async def start(
        self,
        *,
        backend_options: LocalBackendOptions | None = None,
        session: LocalSession | None = None,
    ) -> Task:
        """Start the bridge, session, and thread; return the live `Task`.

        Raises `RuntimeError` when `OPENAI_API_KEY` is unset, the
        `thread/start` request fails, or `codexSdk/delegationConfigured`
        never arrives.
        """
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY must be set for the prototype bridge")
        bridge = OpenAIResponsesBridge(api_key=api_key)
        bridge.start()
        try:
            return await self._start_with_bridge(
                bridge,
                backend_options=backend_options,
                session=session,
            )
        except BaseException:
            # On success the returned Task owns the bridge; on any startup
            # failure we must stop it here or the bridge server thread and
            # socket leak. (`shutdown()` per the ResponsesBridge protocol.)
            bridge.shutdown()
            raise

    async def _start_with_bridge(
        self,
        bridge: OpenAIResponsesBridge,
        *,
        backend_options: LocalBackendOptions | None,
        session: LocalSession | None,
    ) -> Task:
        """Post-bridge startup: resolve tools, start the session, handshake."""
        # Let capabilities rewrite the manifest before materialization.
        manifest = self.manifest
        for capability in self.capabilities:
            manifest = capability.process_manifest(manifest)
        # Capability-contributed tools come first, then explicit agent tools.
        resolved_tools: tuple[Tool | type[Tool], ...] = (
            *(tool for capability in self.capabilities for tool in capability.tools()),
            *self.tools,
        )
        builtin_tool_names, builtin_tool_policies = builtin_tools(resolved_tools)
        resolved_function_tools = function_tools(resolved_tools)
        tool_fragments = tool_instruction_fragments(resolved_tools)
        capability_fragments = [
            fragment
            for capability in self.capabilities
            if (fragment := capability.instructions()) is not None
        ]
        dynamic_tools = [type(tool).dynamic_tool_spec() for tool in resolved_function_tools]
        function_tool_map = {
            type(tool).dynamic_tool_spec()["name"]: tool for tool in resolved_function_tools
        }
        # Fold capability- and tool-contributed instruction fragments into the
        # developer instructions, blank-line separated.
        developer_instructions = self.developer_instructions
        if capability_fragments or tool_fragments:
            sections = [
                fragment
                for fragment in [
                    developer_instructions,
                    *capability_fragments,
                    *tool_fragments,
                ]
                if fragment
            ]
            developer_instructions = "\n\n".join(sections) if sections else None
        if session is None:
            session = await self.backend.create_session(
                manifest=manifest,
                options=backend_options,
            )
        client = await session.start_app_server()
        await client.initialize(
            client_name="codex_sdk_v2",
            client_title="Codex SDK v2 Prototype",
            client_version="0.1.0",
        )
        approval_policy = self.approval_policy
        if approval_policy is None:
            approval_policy = "on-request" if builtin_tool_names else "never"
        thread_start_params: dict[str, Any] = {
            "model": self.model,
            "cwd": str(session.workspace_root),
            "sandbox": "danger-full-access",
            "approvalPolicy": approval_policy,
            "config": {
                "experimental_use_unified_exec_tool": True,
            },
            "baseInstructions": self.base_instructions,
            "developerInstructions": developer_instructions,
            "sdkDelegation": {
                "bridgeUrl": bridge.bridge_url,
            },
            "builtinTools": builtin_tool_names,
            "manualToolExecution": bool(builtin_tool_names),
        }
        if dynamic_tools:
            thread_start_params["dynamicTools"] = dynamic_tools
        result = await client.request("thread/start", thread_start_params)
        # Wait for `thread/started`, deferring interleaved messages so the Task
        # can replay them later in arrival order.
        thread_started_notification: JsonRpcNotification | None = None
        deferred_messages: list[JsonRpcNotification | JsonRpcServerRequest] = []
        seen_message_methods: list[str] = []
        while thread_started_notification is None:
            message = await asyncio.wait_for(client.next_message(), timeout=5)
            seen_message_methods.append(message.method)
            if isinstance(message, JsonRpcNotification) and message.method == "thread/started":
                thread_started_notification = message
            else:
                deferred_messages.append(message)
        # Then wait for delegation confirmation; a timeout here usually means an
        # app-server binary without the sdkDelegation changes is running.
        delegation_notification: JsonRpcNotification | None = None
        while delegation_notification is None:
            try:
                message = await asyncio.wait_for(client.next_message(), timeout=5)
            except TimeoutError as exc:
                raise RuntimeError(
                    "did not receive codexSdk/delegationConfigured after thread/started; "
                    "if you intended to use the repo changes, make sure the example is launching "
                    "the locally built app-server binary instead of the installed Codex binary; "
                    f"seen={seen_message_methods}"
                ) from exc
            seen_message_methods.append(message.method)
            if isinstance(message, JsonRpcNotification) and message.method == "codexSdk/delegationConfigured":
                delegation_notification = message
            else:
                deferred_messages.append(message)
        # Re-queue deferred messages ahead of anything read later.
        client.prepend_messages(deferred_messages)
        return Task(
            session=session,
            thread_id=result["thread"]["id"],
            initial_thread_started=thread_started_notification.params,
            function_tools=function_tool_map,
            builtin_tool_policies=builtin_tool_policies,
            _owned_bridge=bridge,
        )

View File

@@ -0,0 +1,136 @@
from __future__ import annotations
import asyncio
import json
from dataclasses import dataclass
import os
import sys
from typing import Any, TypeAlias
@dataclass(slots=True)
class JsonRpcNotification:
    """JSON-RPC notification: a method call with no id, expecting no reply."""
    method: str
    params: dict[str, Any]
@dataclass(slots=True)
class JsonRpcServerRequest:
    """Server-initiated JSON-RPC request; must be answered using `request_id`
    via `AppServerClient.send_result` or `send_error`."""
    request_id: int | str
    method: str
    params: dict[str, Any]
# Union of everything app-server can push to the client unprompted.
IncomingMessage: TypeAlias = JsonRpcNotification | JsonRpcServerRequest
class AppServerClient:
    """Minimal JSON-RPC client for a `codex app-server` subprocess over stdio.

    Framing is one compact JSON object per line on stdin/stdout. The client is
    sequential: `request()` blocks until its matching response arrives and
    queues any interleaved notifications/server requests for later delivery
    via `next_message()` / `next_notification()`.
    """
    def __init__(self, process: asyncio.subprocess.Process) -> None:
        if process.stdout is None or process.stdin is None:
            raise RuntimeError("app-server process must be started with stdin/stdout pipes")
        self._process = process
        self._stdout = process.stdout
        self._stdin = process.stdin
        # Monotonic id for client-initiated requests, starting at 0.
        self._request_id = 0
        # Messages read while waiting for a response; delivered FIFO later.
        self._pending_messages: list[IncomingMessage] = []
        self._debug_enabled = os.environ.get("CODEX_SDK_V2_DEBUG") == "1"
    async def initialize(self, *, client_name: str, client_title: str, client_version: str) -> None:
        """Perform the `initialize` request / `initialized` notification handshake."""
        await self.request(
            "initialize",
            {
                "clientInfo": {
                    "name": client_name,
                    "title": client_title,
                    "version": client_version,
                },
                "capabilities": {"experimentalApi": True},
            },
        )
        await self.notify("initialized", {})
    async def notify(self, method: str, params: dict[str, Any]) -> None:
        """Send a notification (no id, no response expected)."""
        await self._write({"method": method, "params": params})
    async def request(self, method: str, params: dict[str, Any]) -> dict[str, Any]:
        """Send a request and block until its response arrives.

        Notifications or server requests received while waiting are queued for
        `next_message()`. Raises RuntimeError on a JSON-RPC error response.
        """
        request_id = self._request_id
        self._request_id += 1
        await self._write({"id": request_id, "method": method, "params": params})
        while True:
            message = await self._read_message()
            # A response to our request: carries our id and no method field.
            if message.get("id") == request_id and "method" not in message:
                if "error" in message:
                    raise RuntimeError(f"app-server {method} failed: {message['error']}")
                return message["result"]
            queued = self._decode_incoming(message)
            if queued is not None:
                self._pending_messages.append(queued)
    async def send_result(self, request_id: int | str, result: dict[str, Any]) -> None:
        """Answer a server-initiated request with a success result."""
        await self._write({"id": request_id, "result": result})
    async def send_error(self, request_id: int | str, code: int, message: str) -> None:
        """Answer a server-initiated request with a JSON-RPC error object."""
        await self._write(
            {
                "id": request_id,
                "error": {
                    "code": code,
                    "message": message,
                },
            }
        )
    async def next_message(self) -> IncomingMessage:
        """Return the next queued, or else freshly read, incoming message."""
        if self._pending_messages:
            return self._pending_messages.pop(0)
        while True:
            message = await self._read_message()
            incoming = self._decode_incoming(message)
            if incoming is not None:
                return incoming
    async def next_notification(self) -> JsonRpcNotification:
        """Return the next notification, skipping past queued server requests.

        Server requests read while searching remain queued in arrival order.
        """
        for index, pending in enumerate(self._pending_messages):
            if isinstance(pending, JsonRpcNotification):
                return self._pending_messages.pop(index)
        while True:
            message = await self._read_message()
            incoming = self._decode_incoming(message)
            if incoming is None:
                continue
            if isinstance(incoming, JsonRpcNotification):
                return incoming
            self._pending_messages.append(incoming)
    def prepend_messages(self, messages: list[IncomingMessage]) -> None:
        """Push messages onto the front of the queue (earliest delivered first)."""
        if messages:
            self._pending_messages = messages + self._pending_messages
    async def _write(self, payload: dict[str, Any]) -> None:
        # One compact JSON object per line; drain applies backpressure.
        if self._debug_enabled:
            print(f"[codex-sdk-v2] -> {payload}", file=sys.stderr)
        data = json.dumps(payload, separators=(",", ":")).encode("utf-8") + b"\n"
        self._stdin.write(data)
        await self._stdin.drain()
    async def _read_message(self) -> dict[str, Any]:
        # Raises RuntimeError at EOF (app-server exited or closed stdout).
        line = await self._stdout.readline()
        if not line:
            raise RuntimeError("app-server closed the transport")
        message = json.loads(line.decode("utf-8"))
        if self._debug_enabled:
            print(f"[codex-sdk-v2] <- {message}", file=sys.stderr)
        return message
    def _decode_incoming(self, message: dict[str, Any]) -> IncomingMessage | None:
        """Classify a raw message: server request (has id + method), notification
        (method only), or None for response payloads (no method)."""
        method = message.get("method")
        if method is None:
            return None
        params = message.get("params", {})
        if "id" in message:
            return JsonRpcServerRequest(
                request_id=message["id"],
                method=method,
                params=params,
            )
        return JsonRpcNotification(method=method, params=params)

View File

@@ -0,0 +1,121 @@
from __future__ import annotations
from dataclasses import dataclass
from http import HTTPStatus
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
import threading
from typing import Protocol
import httpx
class ResponsesBridge(Protocol):
    """Structural interface for a local proxy serving the Responses API."""

    def serve_forever(self) -> None: ...
    def shutdown(self) -> None: ...
    @property
    def bridge_url(self) -> str: ...
@dataclass(slots=True)
class _BridgeConfig:
    """Settings shared between the bridge server and its request handler."""

    bind_host: str  # local interface the bridge listens on
    port: int  # requested port; 0 lets the OS choose
    upstream_url: str  # absolute URL of the upstream responses endpoint
    auth_header: str  # full Authorization header value forwarded upstream
class _ResponsesHandler(BaseHTTPRequestHandler):
    """Relays POST /v1/responses to the upstream endpoint, streaming the reply.

    The handler strips the caller's Authorization header and substitutes the
    bridge's own credentials, so the local client never needs the real key.
    """

    server: "_BridgeServer"

    def do_POST(self) -> None:  # noqa: N802
        config = self.server.config
        # Only the responses endpoint is proxied; refuse anything else.
        if self.path != "/v1/responses":
            self.send_error(HTTPStatus.FORBIDDEN)
            return
        content_length = int(self.headers.get("Content-Length", "0"))
        body = self.rfile.read(content_length)
        # Drop headers the upstream must not see verbatim, then re-auth.
        upstream_headers = {
            key: value
            for key, value in self.headers.items()
            if key.lower() not in {"authorization", "host", "content-length"}
        }
        upstream_headers["Authorization"] = config.auth_header
        with httpx.stream(
            "POST",
            config.upstream_url,
            headers=upstream_headers,
            content=body,
            timeout=None,
        ) as response:
            self.send_response(response.status_code)
            for key, value in response.headers.items():
                # Hop-by-hop / length headers are recomputed for this hop.
                if key.lower() in {"content-length", "transfer-encoding", "connection"}:
                    continue
                self.send_header(key, value)
            self.end_headers()
            try:
                for chunk in response.iter_raw():
                    self.wfile.write(chunk)
                    self.wfile.flush()
            except (BrokenPipeError, ConnectionResetError):
                # The downstream client disconnected mid-stream; stop relaying
                # quietly instead of letting the exception unwind through the
                # handler (which would log a traceback per aborted request).
                return

    def log_message(self, _format: str, *args: object) -> None:
        # Silence BaseHTTPRequestHandler's default per-request stderr log.
        _ = args
        return
class _BridgeServer(ThreadingHTTPServer):
    """Threaded HTTP server that exposes its _BridgeConfig to handlers."""

    def __init__(self, config: _BridgeConfig) -> None:
        super().__init__((config.bind_host, config.port), _ResponsesHandler)
        # Handlers reach this via self.server.config inside do_POST.
        self.config = config
class OpenAIResponsesBridge:
    """Local HTTP proxy that injects an OpenAI API key into Responses calls.

    The server binds immediately in __init__ (so bridge_url is known even with
    port=0); the accept loop only runs once start()/serve_forever is called.
    """

    def __init__(
        self,
        *,
        api_key: str,
        bind_host: str = "127.0.0.1",
        port: int = 0,
        upstream_url: str = "https://api.openai.com/v1/responses",
    ) -> None:
        self._config = _BridgeConfig(
            bind_host=bind_host,
            port=port,
            upstream_url=upstream_url,
            auth_header=f"Bearer {api_key}",
        )
        self._server = _BridgeServer(self._config)
        self._thread: threading.Thread | None = None

    @property
    def bridge_url(self) -> str:
        """Base URL (http://host:port/v1) the Codex runtime should target."""
        address = self._server.server_address
        return f"http://{address[0]}:{address[1]}/v1"

    def serve_forever(self) -> None:
        """Run the accept loop on the calling thread until shutdown()."""
        self._server.serve_forever(poll_interval=0.1)

    def start(self) -> None:
        """Launch the accept loop on a background daemon thread (idempotent)."""
        if self._thread is not None:
            return
        # `ThreadingHTTPServer` is synchronous. Running it on a daemon thread keeps
        # the bridge loop independent from the caller's asyncio event loop.
        worker = threading.Thread(target=self.serve_forever, daemon=True)
        self._thread = worker
        worker.start()

    def shutdown(self) -> None:
        """Stop the accept loop, close the socket, and reap the worker thread."""
        self._server.shutdown()
        self._server.server_close()
        worker = self._thread
        if worker is not None:
            worker.join(timeout=1)
            self._thread = None

    def __enter__(self) -> "OpenAIResponsesBridge":
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        _ = (exc_type, exc, tb)
        self.shutdown()

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
from .manifest import Manifest
from .tools import ExecCommand
from .tools import Tool
from .tools import WriteStdin
class Capability:
    """A pluggable bundle of tools and prompt/manifest hooks for a session.

    The base implementation is inert on every axis; subclasses override only
    the hooks they need.
    """

    def tools(self) -> tuple[Tool | type[Tool], ...]:
        """Tools contributed by this capability (default: none)."""
        return ()

    def instructions(self) -> str | None:
        """Extra instruction text contributed by this capability, or None."""
        return None

    def process_manifest(self, manifest: Manifest) -> Manifest:
        """Hook to transform the workspace manifest; default passes it through."""
        return manifest
@dataclass(frozen=True, slots=True)
class UnifiedExecCapability(Capability):
    """Capability exposing the exec_command/write_stdin built-in tool pair."""

    def tools(self) -> tuple[Tool | type[Tool], ...]:
        return (ExecCommand, WriteStdin)


# Capabilities enabled when the host does not specify any.
DEFAULT_CAPABILITIES: tuple[Capability, ...] = (UnifiedExecCapability(),)

View File

@@ -0,0 +1,43 @@
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
import shutil
@dataclass(slots=True)
class Entry:
    """Abstract node in a workspace manifest tree."""

    def materialize(self, destination: Path) -> None:
        """Write this entry to `destination` on disk; subclasses override."""
        raise NotImplementedError
@dataclass(slots=True)
class Dir(Entry):
    """Directory entry whose children are materialized beneath it."""

    children: dict[str | Path, Entry] = field(default_factory=dict)  # name -> child entry
    description: str | None = None  # optional human-readable note; not written to disk

    def materialize(self, destination: Path) -> None:
        # Create the directory (and any missing parents), then recurse.
        destination.mkdir(parents=True, exist_ok=True)
        for name, entry in self.children.items():
            entry.materialize(destination / Path(name))
@dataclass(slots=True)
class LocalFile(Entry):
    """File entry copied from a path on the host machine."""

    src: Path  # source file on the host
    mode: int = 0o644  # permission bits applied after the copy

    def materialize(self, destination: Path) -> None:
        # Ensure the parent exists, copy with metadata, then force the mode
        # (copy2 preserves the source mode, which may differ from `mode`).
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(self.src, destination)
        destination.chmod(self.mode)
@dataclass(slots=True)
class LocalDir(Entry):
    """Directory entry copied wholesale from a path on the host machine."""

    src: Path  # source directory on the host

    def materialize(self, destination: Path) -> None:
        """Replace `destination` with a fresh copy of `src`.

        Removes any pre-existing entry at `destination` first, whether it is
        a directory, a plain file, or a symlink. (The previous rmtree-only
        cleanup raised NotADirectoryError when a file occupied the target.)
        """
        if destination.is_dir() and not destination.is_symlink():
            shutil.rmtree(destination)
        elif destination.exists() or destination.is_symlink():
            destination.unlink()
        shutil.copytree(self.src, destination)

View File

@@ -0,0 +1,53 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Literal, TypeAlias
from .app_server_client import JsonRpcNotification, JsonRpcServerRequest
@dataclass(frozen=True, slots=True)
class ApprovalDecision:
    """Host verdict for an approval request."""

    # One of: approve (this call), approve_for_session, reject, cancel.
    decision: Literal["approve", "approve_for_session", "reject", "cancel"]
@dataclass(frozen=True, slots=True)
class CommandApprovalRequestEvent:
    """Server request asking the host to approve a command execution."""

    request_id: int | str  # JSON-RPC id to answer with the decision
    thread_id: str
    turn_id: str
    item_id: str
    approval_id: str | None
    reason: str | None  # agent-supplied justification, when present
    command: str | None  # rendered command line, when provided
    cwd: str | None  # working directory for the command, when provided
    command_actions: list[dict[str, Any]] | None
    raw_request: JsonRpcServerRequest  # original request, kept for replies
@dataclass(frozen=True, slots=True)
class FileChangeApprovalRequestEvent:
    """Server request asking the host to approve proposed file changes."""

    request_id: int | str  # JSON-RPC id to answer with the decision
    thread_id: str
    turn_id: str
    item_id: str
    reason: str | None  # agent-supplied justification, when present
    grant_root: str | None  # root path the approval would grant, when provided
    raw_request: JsonRpcServerRequest  # original request, kept for replies
@dataclass(frozen=True, slots=True)
class FunctionToolCallEvent:
    """Server request invoking a host-defined function tool."""

    request_id: int | str  # JSON-RPC id to answer with the tool result
    thread_id: str
    turn_id: str
    call_id: str
    tool_name: str  # name of the dynamic tool being invoked
    arguments: dict[str, Any]  # raw arguments object from the agent
    raw_request: JsonRpcServerRequest  # original request, kept for replies
# Any of the three approval-style server requests surfaced to the host.
ApprovalRequestEvent: TypeAlias = (
    CommandApprovalRequestEvent | FileChangeApprovalRequestEvent | FunctionToolCallEvent
)
# Everything a task stream can surface: raw notifications plus approval events.
TaskEvent: TypeAlias = JsonRpcNotification | ApprovalRequestEvent

View File

@@ -0,0 +1,136 @@
from __future__ import annotations
import asyncio
from dataclasses import dataclass
import os
from pathlib import Path
import shutil
import sys
from .app_server_client import AppServerClient
from .manifest import Manifest
APP_SERVER_STREAM_LIMIT = 16 * 1024 * 1024
@dataclass(slots=True)
class LocalBackendOptions:
    """Per-session knobs for LocalBackend.create_session."""

    workspace_root: Path | None = None  # reuse this dir; None -> fresh temp dir owned by session
    codex_binary: Path | None = None  # override the backend's default binary
class LocalSession:
    """Owns one codex app-server subprocess and its workspace directory."""

    def __init__(
        self,
        *,
        workspace_root: Path,
        app_server_binary: Path,
        app_server_args: tuple[str, ...],
        owned_workspace: bool,
    ) -> None:
        self.workspace_root = workspace_root
        self.app_server_binary = app_server_binary
        self.app_server_args = app_server_args
        # When True, stop() deletes workspace_root (it was created for us).
        self.owned_workspace = owned_workspace
        self.app_server_process: asyncio.subprocess.Process | None = None
        self.app_server_client: AppServerClient | None = None
        self._stderr_task: asyncio.Task[None] | None = None

    async def start_app_server(self) -> AppServerClient:
        """Spawn the app-server (once) and return a client bound to its pipes.

        With CODEX_SDK_V2_DEBUG=1, RUST_LOG defaults to codex_app_server=info
        and the subprocess's stderr is mirrored to ours.
        """
        if self.app_server_client is not None:
            return self.app_server_client
        env = os.environ.copy()
        debug_enabled = env.get("CODEX_SDK_V2_DEBUG") == "1"
        if debug_enabled and "RUST_LOG" not in env:
            env["RUST_LOG"] = "codex_app_server=info"
        process = await asyncio.create_subprocess_exec(
            str(self.app_server_binary),
            *self.app_server_args,
            cwd=str(self.workspace_root),
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=env,
            limit=APP_SERVER_STREAM_LIMIT,
        )
        self.app_server_process = process
        if debug_enabled and process.stderr is not None:
            self._stderr_task = asyncio.create_task(self._pump_stderr(process.stderr))
        self.app_server_client = AppServerClient(process)
        return self.app_server_client

    async def stop(self) -> None:
        """Terminate the subprocess, drain stderr, and clean up the workspace."""
        if self.app_server_process is not None:
            try:
                self.app_server_process.terminate()
            except ProcessLookupError:
                # The server already exited on its own; nothing to signal.
                pass
            await self.app_server_process.wait()
            self.app_server_process = None
            self.app_server_client = None
        if self._stderr_task is not None:
            # Safe to await: the pump exits at EOF once the process is gone.
            await self._stderr_task
            self._stderr_task = None
        if self.owned_workspace:
            shutil.rmtree(self.workspace_root, ignore_errors=True)

    async def _pump_stderr(self, stream: asyncio.StreamReader) -> None:
        """Mirror app-server stderr lines to our stderr until EOF."""
        while True:
            line = await stream.readline()
            if not line:
                return
            print(f"[codex-app-server] {line.decode('utf-8', errors='replace').rstrip()}", file=sys.stderr)
class LocalBackend:
    """Creates LocalSessions by materializing a manifest next to a codex binary."""

    def __init__(self, *, codex_binary: Path | None = None) -> None:
        # Fall back to the repo debug build or a PATH lookup (see helper below).
        self.codex_binary = codex_binary or self._default_app_server_binary()

    async def create_session(
        self,
        *,
        manifest: Manifest,
        options: LocalBackendOptions | None = None,
    ) -> LocalSession:
        """Materialize `manifest` into a workspace and return a (not yet
        started) LocalSession pointed at it.

        With no explicit workspace_root, the manifest's temp dir becomes the
        workspace and the session owns (and later deletes) it. With an
        explicit root, the manifest is materialized to a staging dir and its
        top-level entries are moved in, replacing any same-named entries;
        the caller keeps ownership of the directory.

        Raises RuntimeError when the codex binary does not exist.
        """
        options = options or LocalBackendOptions()
        codex_binary = options.codex_binary or self.codex_binary
        if not codex_binary.exists():
            raise RuntimeError(f"codex binary not found at {codex_binary}")
        app_server_args = self._app_server_args_for_binary(codex_binary)
        if options.workspace_root is None:
            workspace_root = manifest.materialize()
            owned_workspace = True
        else:
            workspace_root = options.workspace_root
            workspace_root.mkdir(parents=True, exist_ok=True)
            materialized = manifest.materialize()
            try:
                for child in materialized.iterdir():
                    destination = workspace_root / child.name
                    # Replace whatever already occupies the destination name.
                    if destination.exists():
                        if destination.is_dir():
                            shutil.rmtree(destination)
                        else:
                            destination.unlink()
                    shutil.move(str(child), str(destination))
            finally:
                # Always clean the staging dir, even on a partial move.
                shutil.rmtree(materialized, ignore_errors=True)
            owned_workspace = False
        return LocalSession(
            workspace_root=workspace_root,
            app_server_binary=codex_binary,
            app_server_args=app_server_args,
            owned_workspace=owned_workspace,
        )

    @staticmethod
    def _default_app_server_binary() -> Path:
        """Prefer the in-repo debug build; else `codex` from PATH, else a
        hard-coded Homebrew location as a last resort."""
        repo_app_server = Path(__file__).resolve().parents[3] / "codex-rs" / "target" / "debug" / "codex-app-server"
        if repo_app_server.exists():
            return repo_app_server
        return Path(shutil.which("codex") or "/opt/homebrew/bin/codex")

    @staticmethod
    def _app_server_args_for_binary(binary: Path) -> tuple[str, ...]:
        """Dedicated app-server binaries take --listen directly; the combined
        codex CLI needs the app-server subcommand first."""
        if binary.name == "codex-app-server":
            return ("--listen", "stdio://")
        return ("app-server", "--listen", "stdio://")

View File

@@ -0,0 +1,19 @@
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
import tempfile
from .entries import Entry
@dataclass(slots=True)
class Manifest:
root: str = "/workspace"
entries: dict[str | Path, Entry] = field(default_factory=dict)
def materialize(self) -> Path:
tempdir = Path(tempfile.mkdtemp(prefix="codex-sdk-v2-manifest-"))
for name, entry in self.entries.items():
entry.materialize(tempdir / Path(name))
return tempdir

View File

@@ -0,0 +1,103 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, ClassVar, Literal, Mapping
class ToolDecision:
    """Marker base class for verdicts an approval hook can return."""
    pass
@dataclass(frozen=True, slots=True)
class ApproveDecision(ToolDecision):
    """Approve the call; for_session extends the approval to the session."""

    for_session: bool = False
@dataclass(frozen=True, slots=True)
class RejectDecision(ToolDecision):
    """Reject the call; cancel=True sends the protocol's "cancel" decision
    instead of "decline"."""

    cancel: bool = False
@dataclass(frozen=True, slots=True)
class DeferDecision(ToolDecision):
    """Make no decision now; the call stays pending for the host to resolve."""
    pass
@dataclass(frozen=True, slots=True)
class RunDecision(ToolDecision):
    """Run the function tool now; `arguments`, when set, replace the agent's."""

    arguments: Mapping[str, Any] | None = None
@dataclass(frozen=True, slots=True)
class ReplaceCommandDecision(ToolDecision):
    """Approve the command approval, but execute `command` instead."""

    command: list[str]
@dataclass(frozen=True, slots=True)
class RespondDecision(ToolDecision):
    """Skip running the function tool and answer with `result` directly."""

    result: Any
    success: bool = True
class PendingToolCall:
    """Base for unresolved tool/approval requests surfaced to the host."""

    kind: ClassVar[str]

    async def __call__(self, task: Any) -> None:
        # Awaiting the pending call applies the task's default resolution.
        await task.resolve_tool_call(self)

    def describe(self) -> str:
        """One-line human-readable summary; subclasses must override."""
        raise NotImplementedError
@dataclass(slots=True)
class PendingCommandExecution(PendingToolCall):
    """An exec-command approval request awaiting a host decision."""

    kind: ClassVar[str] = "command_execution"
    request_id: int | str
    thread_id: str
    turn_id: str
    item_id: str
    approval_id: str | None
    reason: str | None
    command: str | None
    cwd: str | None
    command_actions: list[dict[str, Any]] | None
    resolved: bool = False

    def describe(self) -> str:
        """Summarize the request, preferring the concrete command line."""
        if self.command:
            return f"Approve command: {self.command}"
        return (
            f"Approve command execution: {self.reason}"
            if self.reason
            else "Approve command execution"
        )
@dataclass(slots=True)
class PendingFileChange(PendingToolCall):
    """A file-change approval request awaiting a host decision."""

    kind: ClassVar[str] = "file_change"
    request_id: int | str
    thread_id: str
    turn_id: str
    item_id: str
    reason: str | None
    grant_root: str | None
    resolved: bool = False

    def describe(self) -> str:
        """Summarize the request, including the agent's reason when present."""
        return (
            f"Approve file changes: {self.reason}"
            if self.reason
            else "Approve file changes"
        )
@dataclass(slots=True)
class PendingFunctionToolCall(PendingToolCall):
    """A host-defined function tool invocation awaiting execution."""

    kind: ClassVar[str] = "function_tool"
    request_id: int | str
    thread_id: str
    turn_id: str
    call_id: str
    tool_name: str  # dynamic tool name the agent invoked
    arguments: dict[str, Any]  # raw arguments object from the agent
    resolved: bool = False

    def describe(self) -> str:
        """Summarize the call with its name and raw arguments."""
        return f"Run function tool {self.tool_name}({self.arguments})"

View File

@@ -0,0 +1,385 @@
from __future__ import annotations
import inspect
import json
from dataclasses import dataclass, field
from typing import Any, AsyncIterator, Mapping
from .app_server_client import JsonRpcNotification, JsonRpcServerRequest
from .pending_tool_calls import ApproveDecision
from .pending_tool_calls import DeferDecision
from .pending_tool_calls import PendingCommandExecution
from .pending_tool_calls import PendingFileChange
from .pending_tool_calls import PendingFunctionToolCall
from .pending_tool_calls import PendingToolCall
from .pending_tool_calls import RejectDecision
from .pending_tool_calls import ReplaceCommandDecision
from .pending_tool_calls import RespondDecision
from .pending_tool_calls import RunDecision
from .pending_tool_calls import ToolDecision
@dataclass(slots=True)
class Task:
    """Drives one app-server thread: starts turns, streams notifications, and
    resolves tool/approval requests raised by the server.

    `run` and `resume` are async generators over JSON-RPC notifications. They
    return early ("pause") when a server request is left unresolved so the
    host can decide; `resume` then continues the same turn. The duplicated
    streaming loop the two previously carried lives in _stream_turn_events.
    """

    session: Any  # LocalSession (or compatible) exposing app_server_client/stop()
    thread_id: str
    initial_thread_started: dict[str, Any]  # payload captured at thread start — presumably thread/start response; confirm against caller
    function_tools: Mapping[str, Any] = field(default_factory=dict)  # tool name -> object with run()/approve()
    builtin_tool_policies: Mapping[str, Any] = field(default_factory=dict)  # builtin name -> approval callable
    _owned_bridge: Any | None = None  # bridge shut down with the task, when we own one
    _pending_tool_calls: list[PendingToolCall] = field(default_factory=list)
    _active_turn_id: str | None = None
    _turn_complete: bool = True

    async def close(self) -> None:
        """Stop the session, then shut down the owned bridge (if any)."""
        try:
            await self.session.stop()
        finally:
            if self._owned_bridge is not None:
                self._owned_bridge.shutdown()
                self._owned_bridge = None

    async def run(self, user_text: str) -> AsyncIterator[JsonRpcNotification]:
        """Start a new turn with `user_text` and stream its notifications.

        Raises RuntimeError when no client is attached or unresolved tool
        calls remain from a previous pause.
        """
        self._require_client()
        if self.pending_tool_calls():
            raise RuntimeError("cannot start a new turn while tool calls are pending")
        # Drop resolved entries so the pending list cannot grow without bound
        # across turns (pending_tool_calls() already ignores them).
        self._pending_tool_calls = [c for c in self._pending_tool_calls if not c.resolved]
        response = await self.session.app_server_client.request(
            "turn/start",
            {
                "threadId": self.thread_id,
                "input": [{"type": "text", "text": user_text}],
            },
        )
        self._active_turn_id = response["turn"]["id"]
        self._turn_complete = False
        async for event in self._stream_turn_events():
            yield event

    async def resume(self) -> AsyncIterator[JsonRpcNotification]:
        """Continue streaming the current turn after a pause; no-op when the
        turn already completed.

        Raises RuntimeError when no client is attached or unresolved tool
        calls remain.
        """
        self._require_client()
        if self.pending_tool_calls():
            raise RuntimeError("cannot resume while tool calls are pending")
        if self._turn_complete or self._active_turn_id is None:
            return
        async for event in self._stream_turn_events():
            yield event

    async def _stream_turn_events(self) -> AsyncIterator[JsonRpcNotification]:
        """Shared run/resume loop.

        Yields notifications until the active turn completes, or returns
        early (without marking the turn complete) when a server request is
        left unresolved for the host.
        """
        while True:
            message = await self.session.app_server_client.next_message()
            if isinstance(message, JsonRpcServerRequest):
                if await self._handle_server_request(message):
                    return  # pause: a tool call is pending for the host
                continue
            yield message
            if message.method == "turn/completed" and self._matches_active_turn(message.params):
                self._turn_complete = True
                self._active_turn_id = None
                return

    def _matches_active_turn(self, params: Mapping[str, Any]) -> bool:
        """True when a notification's params refer to the turn we started."""
        if params.get("turnId") == self._active_turn_id:
            return True
        return params.get("turn", {}).get("id") == self._active_turn_id

    def _require_client(self) -> None:
        """Raise unless the session has an attached app-server client."""
        if self.session.app_server_client is None:
            raise RuntimeError("app-server client is not attached")

    async def collect_text(self, user_text: str) -> str:
        """Run a turn to completion and concatenate agent-message deltas.

        Auto-resumes after pauses as long as no tool calls remain pending.
        """
        text_chunks: list[str] = []
        async for event in self.run(user_text):
            self._append_agent_delta(event, text_chunks)
        while not self._turn_complete and not self.pending_tool_calls():
            async for event in self.resume():
                self._append_agent_delta(event, text_chunks)
        return "".join(text_chunks)

    @staticmethod
    def _append_agent_delta(event: JsonRpcNotification, chunks: list[str]) -> None:
        """Accumulate the delta text of item/agentMessage/delta events."""
        if event.method == "item/agentMessage/delta":
            delta = event.params.get("delta")
            if isinstance(delta, str):
                chunks.append(delta)

    def pending_tool_calls(self) -> list[PendingToolCall]:
        """Unresolved tool calls currently awaiting host action."""
        return [tool_call for tool_call in self._pending_tool_calls if not tool_call.resolved]

    async def resolve_tool_call(self, tool_call: PendingToolCall) -> None:
        """Default resolution: approve built-in requests, run function tools."""
        if isinstance(tool_call, (PendingCommandExecution, PendingFileChange)):
            await self.approve(tool_call)
            return
        await self.run_function_tool(tool_call)

    async def apply_tool_decision(self, tool_call: PendingToolCall) -> bool:
        """Ask the registered hook for a decision and enact it.

        Returns False when there is no hook or it defers (the call stays
        pending); True once the call has been resolved. Raises TypeError for
        a decision kind that does not fit the call kind.
        """
        decision = await self._call_tool_approval(tool_call)
        if decision is None or isinstance(decision, DeferDecision):
            return False
        if isinstance(decision, ApproveDecision):
            await self.approve(tool_call, for_session=decision.for_session)
            return True
        if isinstance(decision, RejectDecision):
            await self.reject(tool_call, cancel=decision.cancel)
            return True
        if isinstance(decision, ReplaceCommandDecision):
            if not isinstance(tool_call, PendingCommandExecution):
                raise TypeError(
                    "ReplaceCommandDecision can only be used with built-in command approvals"
                )
            await self.replace_command(tool_call, decision.command)
            return True
        if isinstance(decision, RunDecision):
            if not isinstance(tool_call, PendingFunctionToolCall):
                raise TypeError("RunDecision can only be used with function tools")
            await self.run_function_tool(tool_call, arguments=decision.arguments)
            return True
        if isinstance(decision, RespondDecision):
            if not isinstance(tool_call, PendingFunctionToolCall):
                raise TypeError("RespondDecision is only valid for function tool calls")
            await self.submit_tool_result(tool_call, decision.result, success=decision.success)
            return True
        raise TypeError(f"unsupported tool decision: {type(decision)!r}")

    async def approve(self, tool_call: PendingToolCall, *, for_session: bool = False) -> None:
        """Approve a built-in request; function tools are executed instead
        (for_session is ignored for them)."""
        if isinstance(tool_call, PendingCommandExecution):
            await self._respond_to_command_approval(tool_call, for_session=for_session)
            tool_call.resolved = True
            return
        if isinstance(tool_call, PendingFileChange):
            await self._respond_to_file_change_approval(tool_call, for_session=for_session)
            tool_call.resolved = True
            return
        await self.run_function_tool(tool_call)

    async def replace_command(
        self,
        tool_call: PendingCommandExecution,
        command: list[str],
    ) -> None:
        """Approve the command but have the server run `command` instead."""
        await self._respond_to_command_override(tool_call, command=command)
        # Keep the local record in sync with what will actually run.
        tool_call.command = " ".join(command)
        tool_call.resolved = True

    async def reject(self, tool_call: PendingToolCall, *, cancel: bool = False) -> None:
        """Reject a built-in request, or fail a function tool call."""
        if isinstance(tool_call, PendingCommandExecution):
            await self._respond_to_command_rejection(tool_call, cancel=cancel)
            tool_call.resolved = True
            return
        if isinstance(tool_call, PendingFileChange):
            await self._respond_to_file_change_rejection(tool_call, cancel=cancel)
            tool_call.resolved = True
            return
        await self._reject_function_tool(tool_call, cancel=cancel)

    async def run_function_tool(
        self,
        tool_call: PendingFunctionToolCall,
        *,
        arguments: Mapping[str, Any] | None = None,
    ) -> None:
        """Execute the registered tool and submit its result.

        Unknown tools and tool exceptions are reported as failed results
        rather than raised.
        """
        tool = self.function_tools.get(tool_call.tool_name)
        if tool is None:
            await self.submit_tool_result(
                tool_call,
                f"unknown function tool: {tool_call.tool_name}",
                success=False,
            )
            return
        call_arguments = dict(arguments) if arguments is not None else tool_call.arguments
        try:
            result = await tool.run(call_arguments)
        except Exception as exc:
            await self.submit_tool_result(
                tool_call,
                f"function tool {tool_call.tool_name} failed: {exc}",
                success=False,
            )
            return
        await self.submit_tool_result(tool_call, result, success=True)

    async def submit_tool_result(
        self,
        tool_call: PendingFunctionToolCall,
        result: Any,
        *,
        success: bool = True,
    ) -> None:
        """Serialize `result` (JSON when not a string) and answer the request.

        A non-serializable result is downgraded to a failure message rather
        than raising.
        """
        try:
            text = result if isinstance(result, str) else json.dumps(result, indent=2, sort_keys=True)
        except TypeError as exc:
            text = f"function tool {tool_call.tool_name} returned a non-serializable result: {exc}"
            success = False
        await self.session.app_server_client.send_result(
            tool_call.request_id,
            {
                "contentItems": [{"type": "inputText", "text": text}],
                "success": success,
            },
        )
        tool_call.resolved = True

    async def _handle_server_request(self, request: JsonRpcServerRequest) -> bool:
        """Decode and dispatch a server request.

        Returns True when the stream should pause (an unresolved tool call
        awaits the host); unknown methods are answered with a JSON-RPC
        method-not-found error and do not pause.
        """
        tool_call = self._decode_tool_call(request)
        if tool_call is None:
            await self.session.app_server_client.send_error(
                request.request_id,
                -32601,
                f"unsupported server request method: {request.method}",
            )
            return False
        self._pending_tool_calls.append(tool_call)
        handled = await self.apply_tool_decision(tool_call)
        if handled:
            return False
        return not tool_call.resolved

    def _decode_tool_call(self, request: JsonRpcServerRequest) -> PendingToolCall | None:
        """Map a server request onto a pending tool-call record, or None for
        methods this task does not understand."""
        if request.method == "item/commandExecution/requestApproval":
            return PendingCommandExecution(
                request_id=request.request_id,
                thread_id=str(request.params.get("threadId", self.thread_id)),
                turn_id=str(request.params.get("turnId", "")),
                item_id=str(request.params.get("itemId", "")),
                approval_id=self._optional_str(request.params.get("approvalId")),
                reason=self._optional_str(request.params.get("reason")),
                command=self._optional_str(request.params.get("command")),
                cwd=self._optional_str(request.params.get("cwd")),
                command_actions=self._command_actions(request.params.get("commandActions")),
            )
        if request.method == "item/fileChange/requestApproval":
            return PendingFileChange(
                request_id=request.request_id,
                thread_id=str(request.params.get("threadId", self.thread_id)),
                turn_id=str(request.params.get("turnId", "")),
                item_id=str(request.params.get("itemId", "")),
                reason=self._optional_str(request.params.get("reason")),
                grant_root=self._optional_str(request.params.get("grantRoot")),
            )
        if request.method != "item/tool/call":
            return None
        tool_name = request.params.get("tool")
        arguments = request.params.get("arguments", {})
        if not isinstance(tool_name, str):
            raise RuntimeError("tool call is missing a string tool name")
        if not isinstance(arguments, dict):
            raise RuntimeError(f"tool call arguments for {tool_name} must be an object")
        return PendingFunctionToolCall(
            request_id=request.request_id,
            thread_id=str(request.params.get("threadId", self.thread_id)),
            turn_id=str(request.params.get("turnId", "")),
            call_id=str(request.params.get("callId", "")),
            tool_name=tool_name,
            arguments=arguments,
        )

    async def _call_tool_approval(self, tool_call: PendingToolCall) -> ToolDecision | None:
        """Invoke the matching approval hook (sync or async) for this call.

        Function tools use the tool's own approve(); command executions use
        the registered builtin policy; file changes have no hook. Returns
        None when no hook exists or the hook declines to decide.
        """
        decision: Any = None
        if isinstance(tool_call, PendingFunctionToolCall):
            tool = self.function_tools.get(tool_call.tool_name)
            if tool is None:
                return None
            decision = tool.approve(tool_call)
        elif isinstance(tool_call, PendingCommandExecution):
            command_name = self._command_name(tool_call.command)
            policy = self.builtin_tool_policies.get(command_name)
            if policy is None:
                return None
            decision = policy(tool_call)
        elif isinstance(tool_call, PendingFileChange):
            return None
        if inspect.isawaitable(decision):
            decision = await decision
        if decision is None or isinstance(decision, ToolDecision):
            return decision
        raise TypeError(f"tool approval must return ToolDecision or None, got {type(decision)!r}")

    async def _respond_to_command_approval(
        self,
        tool_call: PendingCommandExecution,
        *,
        for_session: bool,
    ) -> None:
        """Answer a command approval with accept / acceptForSession."""
        await self.session.app_server_client.send_result(
            tool_call.request_id,
            {"decision": "acceptForSession" if for_session else "accept"},
        )

    async def _respond_to_command_rejection(
        self,
        tool_call: PendingCommandExecution,
        *,
        cancel: bool,
    ) -> None:
        """Answer a command approval with decline / cancel."""
        await self.session.app_server_client.send_result(
            tool_call.request_id,
            {"decision": "cancel" if cancel else "decline"},
        )

    async def _respond_to_command_override(
        self,
        tool_call: PendingCommandExecution,
        *,
        command: list[str],
    ) -> None:
        """Answer a command approval with a replacement command."""
        await self.session.app_server_client.send_result(
            tool_call.request_id,
            {
                "decision": {
                    "acceptWithCommandOverride": {
                        "command": command,
                    }
                }
            },
        )

    async def _respond_to_file_change_approval(
        self,
        tool_call: PendingFileChange,
        *,
        for_session: bool,
    ) -> None:
        """Answer a file-change approval with accept / acceptForSession."""
        await self.session.app_server_client.send_result(
            tool_call.request_id,
            {"decision": "acceptForSession" if for_session else "accept"},
        )

    async def _respond_to_file_change_rejection(
        self,
        tool_call: PendingFileChange,
        *,
        cancel: bool,
    ) -> None:
        """Answer a file-change approval with decline / cancel."""
        await self.session.app_server_client.send_result(
            tool_call.request_id,
            {"decision": "cancel" if cancel else "decline"},
        )

    async def _reject_function_tool(self, tool_call: PendingFunctionToolCall, *, cancel: bool) -> None:
        """Report a host rejection/cancellation as a failed tool result."""
        action = "canceled" if cancel else "rejected"
        await self.submit_tool_result(
            tool_call,
            f"function tool {tool_call.tool_name} was {action} by the host",
            success=False,
        )

    @staticmethod
    def _command_actions(value: Any) -> list[dict[str, Any]] | None:
        """Accept only a list of dicts; anything else is treated as absent."""
        if isinstance(value, list) and all(isinstance(action, dict) for action in value):
            return value
        return None

    @staticmethod
    def _optional_str(value: Any) -> str | None:
        """Pass through strings; coerce any other value to None."""
        return value if isinstance(value, str) else None

    @staticmethod
    def _command_name(command: str | None) -> str:
        """Heuristically map a rendered command to a builtin policy name."""
        if not command:
            return "exec_command"
        return "write_stdin" if command.startswith("write_stdin:") else "exec_command"

View File

@@ -0,0 +1,283 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Awaitable, Callable, ClassVar, Mapping, Sequence
if TYPE_CHECKING:
from .pending_tool_calls import PendingCommandExecution
from .pending_tool_calls import PendingFunctionToolCall
from .pending_tool_calls import ToolDecision
# Precisely typed for checkers; at runtime it degrades to a loose callable so
# pending_tool_calls is not imported here — presumably to keep this module
# import-light / avoid a cycle (TODO confirm).
if TYPE_CHECKING:
    BuiltinApprovalPolicy = Callable[
        [PendingCommandExecution],
        Awaitable[ToolDecision | None] | ToolDecision | None,
    ]
else:
    BuiltinApprovalPolicy = Callable[[Any], Awaitable[Any] | Any]
@dataclass(frozen=True, slots=True)
class BuiltinToolSpec:
    """Wire-level identifier for a codex built-in tool."""

    tool_name: str  # name codex uses for the tool (e.g. "exec_command")
class Tool:
    """Marker base class for everything a host can hand to a session."""
    pass
@dataclass(frozen=True, slots=True)
class ConfiguredBuiltinTool(Tool):
    """A built-in tool paired with an optional host approval policy."""

    tool_type: type["BuiltinTool"]
    approval_policy: BuiltinApprovalPolicy | None = None

    def builtin_spec(self) -> BuiltinToolSpec:
        """Delegate to the wrapped tool type's spec."""
        return self.tool_type.builtin_spec()
class BuiltinTool(Tool):
    """Base for tools implemented by codex itself, identified by a class attr."""

    codex_builtin_tool: ClassVar[str]

    @classmethod
    def builtin_spec(cls) -> BuiltinToolSpec:
        """Build the spec, failing loudly when the class attr is missing/empty."""
        name = getattr(cls, "codex_builtin_tool", None)
        if name:
            return BuiltinToolSpec(tool_name=name)
        raise TypeError(f"{cls.__name__} must define codex_builtin_tool")

    @classmethod
    def with_approval_policy(
        cls,
        *,
        policy: BuiltinApprovalPolicy,
    ) -> ConfiguredBuiltinTool:
        """Pair this tool type with a host approval policy."""
        return ConfiguredBuiltinTool(tool_type=cls, approval_policy=policy)
class FunctionTool(Tool):
    """Base for host-implemented tools exposed to codex as dynamic tools."""

    name: ClassVar[str]
    description: ClassVar[str]
    input_schema: ClassVar[dict[str, Any]]

    @classmethod
    def dynamic_tool_spec(cls) -> dict[str, Any]:
        """Validate the class attrs and render the dynamic-tool payload.

        Raises TypeError when name/description are missing or input_schema
        is not a dict.
        """
        spec = {
            "name": getattr(cls, "name", None),
            "description": getattr(cls, "description", None),
            "inputSchema": getattr(cls, "input_schema", None),
        }
        if not spec["name"]:
            raise TypeError(f"{cls.__name__} must define name")
        if not spec["description"]:
            raise TypeError(f"{cls.__name__} must define description")
        if not isinstance(spec["inputSchema"], dict):
            raise TypeError(f"{cls.__name__} must define input_schema as a dict")
        return spec

    async def approve(self, call: PendingFunctionToolCall) -> ToolDecision | None:
        """Optional pre-execution hook; returning None means no decision here."""
        return None

    def instructions(self) -> str | None:
        """Optional instruction fragment contributed by this tool."""
        return None

    async def run(self, arguments: Mapping[str, Any]) -> Any:
        """Execute the tool; subclasses must override."""
        raise NotImplementedError
# Concrete markers for every codex built-in tool; each class only pins the
# wire name via codex_builtin_tool.
class ExecCommand(BuiltinTool):
    codex_builtin_tool = "exec_command"
class WriteStdin(BuiltinTool):
    codex_builtin_tool = "write_stdin"
class Shell(BuiltinTool):
    codex_builtin_tool = "shell"
class UpdatePlan(BuiltinTool):
    codex_builtin_tool = "update_plan"
class RequestUserInput(BuiltinTool):
    codex_builtin_tool = "request_user_input"
class ApplyPatch(BuiltinTool):
    codex_builtin_tool = "apply_patch"
class WebSearch(BuiltinTool):
    codex_builtin_tool = "web_search"
class ViewImage(BuiltinTool):
    codex_builtin_tool = "view_image"
class SearchToolBm25(BuiltinTool):
    codex_builtin_tool = "search_tool_bm25"
class ReadFile(BuiltinTool):
    codex_builtin_tool = "read_file"
class ListDir(BuiltinTool):
    codex_builtin_tool = "list_dir"
class GrepFiles(BuiltinTool):
    codex_builtin_tool = "grep_files"
class ListMcpResources(BuiltinTool):
    codex_builtin_tool = "list_mcp_resources"
class ListMcpResourceTemplates(BuiltinTool):
    codex_builtin_tool = "list_mcp_resource_templates"
class ReadMcpResource(BuiltinTool):
    codex_builtin_tool = "read_mcp_resource"
class SpawnAgent(BuiltinTool):
    codex_builtin_tool = "spawn_agent"
class SendInput(BuiltinTool):
    codex_builtin_tool = "send_input"
class ResumeAgent(BuiltinTool):
    codex_builtin_tool = "resume_agent"
class Wait(BuiltinTool):
    codex_builtin_tool = "wait"
class CloseAgent(BuiltinTool):
    codex_builtin_tool = "close_agent"
class SpawnAgentsOnCsv(BuiltinTool):
    codex_builtin_tool = "spawn_agents_on_csv"
class JsRepl(BuiltinTool):
    codex_builtin_tool = "js_repl"
class JsReplReset(BuiltinTool):
    codex_builtin_tool = "js_repl_reset"
class Artifacts(BuiltinTool):
    codex_builtin_tool = "artifacts"
class ReportAgentJobResult(BuiltinTool):
    codex_builtin_tool = "report_agent_job_result"
class TestSyncTool(BuiltinTool):
    codex_builtin_tool = "test_sync_tool"
# Every built-in tool marker; used below to detect name collisions with
# host-defined function tools.
ALL_BUILTIN_TOOLS: tuple[type[BuiltinTool], ...] = (
    ExecCommand,
    WriteStdin,
    Shell,
    UpdatePlan,
    RequestUserInput,
    ApplyPatch,
    WebSearch,
    ViewImage,
    SearchToolBm25,
    ReadFile,
    ListDir,
    GrepFiles,
    ListMcpResources,
    ListMcpResourceTemplates,
    ReadMcpResource,
    SpawnAgent,
    SendInput,
    ResumeAgent,
    Wait,
    CloseAgent,
    SpawnAgentsOnCsv,
    JsRepl,
    JsReplReset,
    Artifacts,
    ReportAgentJobResult,
    TestSyncTool,
)
def builtin_tools(tools: Sequence[Tool | type[Tool]]) -> tuple[list[str], dict[str, BuiltinApprovalPolicy]]:
    """Collect built-in tool names (first-seen order) and approval policies.

    Non-built-in entries are ignored. Policies come from ConfiguredBuiltinTool
    wrappers; a later policy for the same name overwrites an earlier one.
    """
    names: list[str] = []
    policies: dict[str, BuiltinApprovalPolicy] = {}
    seen: set[str] = set()
    for entry in tools:
        configured = entry if isinstance(entry, ConfiguredBuiltinTool) else None
        if configured is not None:
            tool_type = configured.tool_type
        elif isinstance(entry, type):
            tool_type = entry
        else:
            tool_type = type(entry)
        if not (isinstance(tool_type, type) and issubclass(tool_type, BuiltinTool)):
            continue
        name = tool_type.builtin_spec().tool_name
        if name not in seen:
            seen.add(name)
            names.append(name)
        if configured is not None and configured.approval_policy is not None:
            policies[name] = configured.approval_policy
    return names, policies
def function_tools(tools: Sequence[Tool | type[Tool]]) -> list[FunctionTool]:
    """Instantiate and validate the FunctionTool entries in `tools`.

    Built-in wrappers and non-function entries are skipped. Raises ValueError
    when a tool name collides with a codex built-in or appears twice.
    """
    builtin_names = {tool.builtin_spec().tool_name for tool in ALL_BUILTIN_TOOLS}
    resolved: list[FunctionTool] = []
    seen: set[str] = set()
    for entry in tools:
        if isinstance(entry, ConfiguredBuiltinTool):
            continue
        instance = entry() if isinstance(entry, type) else entry
        if not isinstance(instance, FunctionTool):
            continue
        tool_name = type(instance).dynamic_tool_spec()["name"]
        if tool_name in builtin_names:
            raise ValueError(f"function tool name collides with codex built-in: {tool_name}")
        if tool_name in seen:
            raise ValueError(f"duplicate function tool name: {tool_name}")
        seen.add(tool_name)
        resolved.append(instance)
    return resolved
def tool_instruction_fragments(tools: Sequence[Tool | type[Tool]]) -> list[str]:
    """Collect non-empty instruction fragments from function tools.

    Fragments are deduplicated by tool name; built-in wrappers and
    non-function entries contribute nothing.
    """
    fragments: list[str] = []
    seen: set[str] = set()
    for entry in tools:
        if isinstance(entry, ConfiguredBuiltinTool):
            continue
        instance = entry() if isinstance(entry, type) else entry
        if not isinstance(instance, FunctionTool):
            continue
        fragment = instance.instructions()
        key = f"function:{type(instance).dynamic_tool_spec()['name']}"
        if fragment and key not in seen:
            seen.add(key)
            fragments.append(fragment)
    return fragments
# Tools enabled when the host does not pass an explicit tool list.
DEFAULT_TOOLS: tuple[type[BuiltinTool], ...] = (
    ExecCommand,
    WriteStdin,
)

91
codex-sdk-v2/uv.lock generated Normal file
View File

@@ -0,0 +1,91 @@
version = 1
revision = 3
requires-python = ">=3.11"
[[package]]
name = "anyio"
version = "4.12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "idna" },
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/96/f0/5eb65b2bb0d09ac6776f2eb54adee6abe8228ea05b20a5ad0e4945de8aac/anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703", size = 228685, upload-time = "2026-01-06T11:45:21.246Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" },
]
[[package]]
name = "certifi"
version = "2026.2.25"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" },
]
[[package]]
name = "codex-sdk-v2"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "httpx" },
]
[package.metadata]
requires-dist = [{ name = "httpx", specifier = ">=0.27.0" }]
[[package]]
name = "h11"
version = "0.16.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
]
[[package]]
name = "httpcore"
version = "1.0.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "h11" },
]
sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
]
[[package]]
name = "httpx"
version = "0.28.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
{ name = "certifi" },
{ name = "httpcore" },
{ name = "idna" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
[[package]]
name = "idna"
version = "3.11"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
]
[[package]]
name = "typing-extensions"
version = "4.15.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
]

View File

@@ -0,0 +1,482 @@
# codex-sdk-v2 Prototype Summary
## 1. Goals of the prototype
This prototype was meant to answer one core question:
Can we reuse Codex as the runtime and tool-execution engine while preserving the Universal Computer model in which the host SDK owns orchestration, configuration, approvals, and provider access?
More concretely, the prototype set out to prove that we can:
- run Codex through app-server instead of re-implementing tool behavior in Python
- let the host SDK own Responses API credentials and transport
- choose which Codex built-in tools are enabled from the SDK
- add host-defined function tools in a Universal Computer-style API
- keep the SDK in the loop for all tool calls via pending tool call pause-points
- support programmatic approval, defer, reject, and argument/command rewriting
- start moving away from monolithic prompt construction toward capability-scoped instruction composition
- preserve the Universal Computer ergonomics where possible while adopting Codex's runtime model
The result is a successful prototype, with one important caveat:
The current transport is still bridge-based delegation, not the final event-by-event host-delegation model from the RFC.
### Prototype architecture
```mermaid
flowchart LR
subgraph Host["Host SDK"]
SDK["AgentSDKv2"]
Bridge["Host Responses bridge"]
Provider["Upstream provider"]
end
subgraph Runtime["Runtime"]
AppServer["Codex app-server"]
Codex["Codex runtime"]
end
SDK --> AppServer
AppServer --> Codex
Codex --> Bridge
Bridge --> Provider
```
## 2. Changes made in Codex
To support the prototype, Codex needed a handful of structural changes.
### App-server and thread/session configuration
We added new thread-start knobs so the SDK can shape the runtime explicitly:
- a thread-scoped delegation configuration pointing Codex at a host-managed Responses bridge
- a built-in tool allowlist so the SDK can choose which Codex tools are exposed
- a manual tool execution mode so built-in tool calls can become host-visible pause-points
This is a meaningful shift in ownership. Before, tool availability and model transport were mostly internal to Codex. With this prototype, the SDK becomes an active configuration authority.
### Model transport delegation
Codex was extended so a thread can target a host-local bridge for `/v1/responses` traffic instead of always talking directly to the upstream provider.
That required:
- per-thread provider override behavior
- app-server awareness of the delegation configuration
- explicit startup signaling so the SDK can verify that delegation was actually applied
This is enough for the prototype, but it is still a proxy model rather than true delegated transport.
In the prototype:
- Codex still constructs a provider-shaped HTTP request
- Codex still opens the streaming HTTP connection itself
- that request is aimed at the host-owned bridge rather than directly at OpenAI
- the host bridge then makes the real upstream HTTP request, injects `Authorization`, and streams the provider response bytes back down to Codex unchanged
So the host really is performing the upstream HTTP request in this prototype, but Codex still thinks it is talking to a Responses-compatible HTTP endpoint. The transport contract is still HTTP proxying, not app-server-level request delegation.
```mermaid
sequenceDiagram
participant SDK as "Host SDK"
participant Codex as "Codex runtime"
participant Bridge as "Host bridge"
participant Provider as "Model provider"
SDK->>Codex: thread/start + sdkDelegation.bridgeUrl
SDK->>Codex: turn/start
Codex->>Bridge: HTTP POST /v1/responses
Bridge->>Provider: HTTP POST /v1/responses + auth
Provider-->>Bridge: streaming response
Bridge-->>Codex: streaming response
Codex-->>SDK: app-server events + tool pause-points
```
### Built-in tool selection and host-visible control flow
Codex already had built-in tools, but the prototype needed the SDK to decide which ones are present on a thread.
That led to:
- explicit built-in tool filtering at thread startup
- manual execution mode for unified-exec so the SDK can pause on built-in calls before Codex executes them
- command-override support for unified-exec so the SDK can replace a proposed command rather than only approve or reject it
This is a real architectural improvement for host control, but it also increases surface area around tool semantics and approval behavior.
### Prompt composition changes
Codex previously relied on a fairly monolithic base prompt. To support tool-conditional guidance, we introduced capability-scoped prompt fragments for built-ins and composed them into the session base instructions.
The important structural change here is not just “more prompt files.” It is that built-in tool guidance is no longer conceptually part of one indivisible system prompt. It is now attached to enabled capabilities.
That is the right direction, but it introduces a maintenance obligation: prompt behavior now depends on both model metadata and tool configuration, so drift between those layers becomes a real risk.
### Risks and maintenance challenges introduced in Codex
- The bridge-based delegation path is a temporary architecture and will be easy to over-invest in if we are not disciplined.
- Tool semantics now exist at the intersection of prompt composition, tool registry configuration, and approval handling, which increases the chance of subtle mismatches.
- Manual built-in pause-points make the runtime more host-friendly, but they also make turn progression and resume behavior more stateful and therefore more failure-prone.
- Built-in capability prompts now need to stay aligned with the actual available tool surface. If we add tools and forget to add or adjust capability fragments, the prompt can become misleading again.
- The prototype only exercises a narrow built-in set, mainly unified-exec. Expanding to the full Codex built-in surface will add complexity.
## 3. Changes from the original Universal Computer model
To make Universal Computer work in this world, we had to reshape several assumptions from the original package.
### Tool implementation ownership
Original Universal Computer treated the Python SDK as the home of tool execution for built-ins like filesystem and shell behavior.
In the new world:
- Codex owns built-in tool execution
- the SDK only enables or disables built-ins and participates in approvals/control flow
- host-defined tools remain host-executed, but they are expressed as Codex dynamic tools under the hood
This is probably the biggest philosophical change in the whole prototype.
### Plugin architecture became capability architecture
Universal Computer plugins mixed together:
- tool groups
- instructions
- manifest mutation
- request shaping
For the prototype, that concept was reintroduced as capabilities:
- a capability exposes a single `tools()` method
- it can contribute instructions
- it can mutate the manifest
The built-in/function distinction is now internal to the SDK rather than part of the public composition API.
### Approval and execution flow
Original Universal Computer exposed a very host-centric tool call loop. We restored that shape, but the semantics changed:
- built-in Codex tools now pause for host approval rather than being host-executed
- host function tools are still host-executed
- both are surfaced through one pending tool call abstraction
This preserves the ergonomics while changing the underlying runtime ownership model.
### Session instruction model
Universal Computer previously had a stronger notion of separate base, developer, and user instructions.
In the prototype:
- `base_instructions` remains a replacement channel
- `developer_instructions` remains an additive channel
- `user_instructions` was removed rather than carrying forward a misleading prefix-based approximation
That is a simplification, but also an admission that the old user-instructions shape was not yet properly mapped.
### Transport and backend model
Universal Computers original long-term direction is backend-agnostic remote execution.
The prototype narrows that significantly:
- it currently uses a local attached-process backend
- the SDK owns a local bridge
- the “copy Codex binary into a destination container” story is not yet real
This was the right tradeoff for a prototype, but it means the transport/backend layer is still far from feature-complete.
## 4. Remaining work to fully port the Universal Computer paradigm
There is still substantial work left before this becomes a full Universal Computer-on-Codex implementation.
### Replace bridge-based delegation with true app-server delegation
The RFC's intended architecture is:
- Codex prepares the upstream model request
- app-server emits host-directed delegation events
- the host SDK makes the provider call
- the host streams upstream events back into Codex
The prototype does not do that yet. It uses a bridge/proxy instead.
The difference matters:
- in the prototype bridge model, Codex speaks HTTP to a Responses-compatible endpoint and the host pretends to be that endpoint
- in the intended full-delegation model, Codex does not speak provider HTTP at all; it emits structured app-server events and the SDK owns the provider call lifecycle explicitly
Said another way:
- today, the host owns the real upstream HTTP request, but Codex still owns the HTTP client behavior and stream shape it expects to speak
- in the future design, the host owns both the upstream HTTP request and the transport contract between Codex and the host
That future-state is what unlocks:
- provider switching without pretending every provider is a Responses-compatible bridge
- clean host-side request persistence and replay
- first-class interception, cancellation, and routing at the SDK layer
- a better multi-container story because the host is in the loop at the app-server event layer rather than only behind an HTTP shim
This is the most important architectural gap to close.
```mermaid
sequenceDiagram
participant SDK as "Host SDK"
participant AppServer as "Codex app-server"
participant Codex as "Codex runtime"
participant Provider as "Model provider"
SDK->>AppServer: thread/start
SDK->>AppServer: turn/start
AppServer->>SDK: model/request
SDK->>Provider: provider request + auth
Provider-->>SDK: streaming provider events
SDK-->>AppServer: model/streamEvent*
AppServer->>Codex: normalized turn progression
Codex-->>SDK: tool approvals / pending tool calls / output
```
### Generate typed SDK protocol models
The prototype still hand-rolls Python-side JSON-RPC parsing and event handling.
To make this production-worthy we should generate typed SDK protocol models from the Rust app-server source of truth, including:
- Python Pydantic models for app-server requests, responses, and notifications
- publishing those generated Python models as a dedicated package to a PyPI repository
- pinning that package version to Codex releases so SDK/runtime compatibility is explicit
- continued TypeScript generation from the same Rust source
- a clearer separation between wire-level protocol models and higher-level SDK runtime objects like pending tool calls
This matters for correctness and maintainability. As the app-server surface grows, hand-maintained Python shapes will drift.
### Reintroduce durable rollout ownership on the host
The prototype has in-memory pending tool calls, but it does not yet fully restore the Universal Computer model where the host owns durable rollout state and can cleanly pause, spin down, and resume later.
To finish that work we need:
- stable serialization of unresolved pending tool calls
- robust replay/rehydration for built-in approvals and function-tool calls
- host-owned transcript and turn state as the source of truth
### Complete the capability system
The prototype capability model is intentionally small. A fuller port would need:
- richer capability composition
- skill-like capability bundles
- memory capability support
- manifest-processing conventions
- clearer precedence rules when multiple capabilities contribute instructions or tools
### Provider abstraction on the host
Universal Computer wants host-side multi-provider support. The prototype still assumes an OpenAI-shaped host bridge.
A real port needs:
- provider-neutral request abstractions on the host
- OpenAI and Anthropic support at minimum
- streaming normalization back into Codex's expectations
### Remote backend support
The prototype does not yet solve:
- pinned Codex version acquisition
- copying the right Codex binary to the destination environment
- attached-process support across all intended backends
- reliable host-container transport for app-server
That backend work is core to the Universal Computer value proposition and still remains.
### Complete prompt/capability alignment
We now have the beginning of built-in capability prompt composition, but not the final state.
Still needed:
- expand capability fragments beyond the current narrow built-in set
- unify prompt composition conventions across all built-ins
- make capability ownership and prompt ownership obvious and durable
## 5. Risks and challenges to productionize
### Architecture risk
The biggest risk is shipping too much around the bridge transport and then having to unwind it when moving to true full delegation mode. The bridge is a useful prototype tool, but it is not the right final abstraction.
### State and resume complexity
Host-visible pause-points are powerful, but productionizing them means solving:
- durable pending tool state
- replay correctness
- no double-execution on resume
- clear ownership of partially completed turns
This is tricky and easy to get subtly wrong.
### Prompt and capability drift
We are now explicitly tying enabled capabilities to prompt sections. That is a better model, but it creates a new kind of maintenance burden:
- adding or removing a tool may require prompt updates
- prompt fragments may drift from actual runtime behavior
- capability bundles may accumulate overlapping or contradictory instructions
### Cross-runtime compatibility
Codex is a Rust runtime with strong internal assumptions. Universal Computer wants host orchestration across heterogeneous backends and providers. The seam between those two worlds needs to stay disciplined or the SDK will slowly become a shadow runtime.
### Operational complexity
Productionizing this means dealing with:
- version pinning
- binary distribution
- backend compatibility
- network transport
- auth boundaries
- observability for both the host SDK and the remote Codex runtime
That is a larger operational surface than either original system had in isolation.
## 6. Suggested engineering roadmap
### Phase 0: Generated protocol models and package distribution
Goals:
- frontload generated Python protocol models before broader SDK implementation
- make app-server wire types a versioned dependency rather than a copied internal detail
- tie the Python protocol package version explicitly to Codex releases
Deliverables:
- generated Python Pydantic models for app-server protocol payloads
- continued TypeScript generation from the same Rust source
- published Python protocol package in the target PyPI repository
- explicit versioning and compatibility policy between Codex and the protocol package
### Phase 1: Architecture foundation and protocol contracts
Goals:
- stabilize the `codex-sdk-v2` API
- keep capabilities as the public composition abstraction
- expand and clean up built-in capability prompt composition
- remove obviously prototype-only rough edges
Deliverables:
- clear capability API
- consistent base/developer instruction semantics
- better example coverage
- tighter prompt composition ownership rules
### Phase 2: Replace bridge delegation with real app-server full delegation
Goals:
- make the host SDK the true owner of provider transport
- stop relying on a bridge/proxy architecture
Deliverables:
- new app-server delegation events for model requests and streamed upstream events
- generated protocol types covering the new delegation events
- host-side transport driver
- Codex-side external stream ingestion
- explicit cancellation and failure semantics
This phase is the real architectural transition.
### Phase 3: Host-owned rollout and durable pause/resume
Goals:
- restore the original Universal Computer durability model
- make pending tool calls and turn state resumable across process restarts
Deliverables:
- serialized pending tool state
- replay-safe resume logic
- host-owned rollout persistence as source of truth
- strong idempotency guarantees where possible
### Phase 4: Backend generalization
Goals:
- move beyond local attached-process execution
- support the Universal Computer backend model in earnest
Deliverables:
- pinned Codex version management
- binary acquisition and staging
- attached-process support across all supported backends
- robust host-runtime transport recommendations and implementations
### Phase 5: Provider and capability expansion
Goals:
- make the host SDK genuinely multi-provider
- expand the capability model to cover more of the original Universal Computer ecosystem
Deliverables:
- provider abstraction for OpenAI and Anthropic
- richer capability bundles
- memory/skills-style capabilities
- better story for apps and connector-driven capabilities
### Phase 6: Hardening and production readiness
Goals:
- make the system operable and debuggable in real workloads
Deliverables:
- observability across host and runtime
- clear failure semantics
- migration strategy from Universal Computer package users
- load, reliability, and recovery testing
- documentation and support model
### Roadmap at a glance
```mermaid
flowchart LR
P0["Phase 0<br/>Generated protocol package"]
P1["Phase 1<br/>Architecture + protocol contracts"]
P2["Phase 2<br/>Full delegation transport"]
P3["Phase 3<br/>Host-owned rollout + resume"]
P4["Phase 4<br/>Backend generalization"]
P5["Phase 5<br/>Provider + capability expansion"]
P6["Phase 6<br/>Production hardening"]
P0 --> P1 --> P2 --> P3 --> P4 --> P5 --> P6
```
## Final assessment
This prototype successfully demonstrates that Codex can act as the execution/runtime layer for a Universal Computer-style SDK without forcing the SDK to re-implement Codex's built-in tools in Python.
That is the right strategic result.
At the same time, it is still a prototype in the most important sense:
- the delegation transport is not final
- durability is not final
- backend generality is not final
The good news is that the prototype reduced uncertainty in the right places. The remaining work is substantial, but it now looks like engineering, not research.

View File

@@ -0,0 +1,884 @@
# RFC: Host-Delegated Codex App-Server for Universal Computer
## Summary
Universal Computer already has the right high-level instinct: the host SDK should own orchestration, credentials, approvals, persistence, and backend selection, while the remote runtime should own local execution against the target filesystem and sandbox.
Today, those responsibilities are split awkwardly. Universal Computer's Python SDK builds the Responses request, defines tools in Python, and interprets raw model output into actionable tool calls. Codex app-server, by contrast, already has a richer Rust-native execution engine, tool surface, approval model, and event model, but it assumes Codex itself is the party speaking to the Responses API and, in normal operation, the party managing rollout persistence.
The proposal is to add a new **full delegation mode** to codex app-server:
- Codex still runs inside the destination container or locally.
- Codex still owns prompt assembly, tool registration, tool execution, approvals, and turn semantics.
- The **host SDK** becomes the sole party that talks to the Responses API.
- The **host SDK** also becomes the source of truth for rollout persistence.
- The app-server protocol grows a small set of new server-initiated requests and client responses so Codex can ask the host to create, stream, cancel, and finalize upstream model requests.
This gives Universal Computer what it wants: reuse of Codex's Rust-native tool/runtime behavior without giving up host-side orchestration, multi-container routing, approval policy, or host-managed conversation state.
## Context
### What Universal Computer does well today
From the Universal Computer side, the architecture is already clean:
- `Agent` owns declarative configuration:
- `base_instructions`
- `developer_instructions`
- `user_instructions`
- `plugins`
- `tools`
- `sampling_params`
- `TaskContext` owns startup, manifest injection, snapshotting, and session binding.
- `Task` is the durable rollout object:
- it stores context
- it stores resumable session state
- it streams raw Responses events
- it pauses when tool calls are pending
- plugins are more than tool bundles:
- they can contribute instructions
- mutate context
- mutate sampling params
- mutate manifest/session setup
That is an important constraint: Universal Computer is not merely a remote shell. It is a **host orchestration framework**.
### What Codex app-server already provides
Codex app-server is already surprisingly close to what we need:
- thread and turn lifecycle APIs
- streaming turn/item notifications
- server-initiated approval requests
- dynamic tools
- apps/plugins/skills integration
- configurable developer instructions and other session settings
- client-managed notification transport
But the current model assumes:
- Codex itself makes the Responses API request
- Codex owns the upstream stream lifecycle
- Codex is the natural home for thread persistence
That assumption is the seam that needs to change.
## Design principle
The right boundary is:
- **Host SDK owns external orchestration**
- **Remote Codex owns local execution semantics**
More concretely:
### Host-owned
- Responses API transport and credentials
- rollout persistence
- backend selection
- multi-container routing
- approval UX and policy
- high-level session bootstrap
### Codex-owned
- instruction compilation
- model request planning
- tool schema materialization
- tool execution against the live workspace/container
- item/turn state machine
- normalization of model events into Codex semantics
This is the key pushback: the host should not have to reconstruct Codex's prompts, tool schemas, or internal turn loop. If we force the SDK to do that, we reintroduce the exact duplication you want to eliminate.
## Goals
1. Support running Codex app-server in a target container or locally.
2. Allow the host SDK to be the only component that talks to the Responses API.
3. Preserve Codex as the implementation of the default tool surface.
4. Preserve host-side approvals for all tools.
5. Preserve host-side rollout persistence as the source of truth.
6. Allow full client-provided configuration:
- base instructions
- developer instructions
- user instructions
- tool/plugin/app config
7. Allow the host to override or replace the default tool set.
8. Keep the protocol high-level and transport-agnostic enough for non-Docker backends.
## Non-goals
1. Re-implement Codex tool behavior in the Python SDK.
2. Make the host responsible for prompt assembly.
3. Force app-server to lose its current direct-to-Responses mode.
4. Solve every multi-agent routing problem in the first iteration.
## Proposed model: Full Delegation Mode
Add a new app-server execution mode, conceptually:
- `direct` mode: current behavior
- `fullDelegation` mode: new behavior
In `fullDelegation` mode:
1. The host starts app-server inside the target environment.
2. The host provides all desired configuration at thread/session startup.
3. Codex prepares the next upstream Responses request, but does not send it.
4. Codex emits a server-initiated request to the host containing the prepared upstream request envelope.
5. The host executes that request against the Responses API.
6. The host streams upstream events back into app-server.
7. Codex consumes those events, updates turn state, emits its normal item/turn notifications, and requests approvals or user input as needed.
8. The host persists the resulting rollout externally.
This is not "remote shell plus JSON." It is better understood as **remote Codex with externalized model transport**.
## Why this fits Universal Computer
Today Universal Computer's `Task.run()` does three important jobs:
1. build the request
2. stream events
3. pause for tool calls
Under this RFC:
- job 1 moves from Python to Codex
- job 2 remains host-owned at the transport layer
- job 3 becomes cleaner, because Codex itself now owns tool interpretation and execution
That is a net simplification.
## Protocol additions
The existing app-server pattern to imitate is the approval flow: Codex can already issue server-initiated JSON-RPC requests to the client and resume when the client responds.
Full delegation should reuse that same pattern.
## New concepts
### 1. Delegated model request
Codex needs a way to say:
> "Here is the exact upstream request I want to make. Please make it for me, and stream the result back."
Proposed request:
- `model/request`
Purpose:
- server-initiated request from Codex to host
- carries a canonicalized Responses request envelope
This envelope should include, at minimum:
- model
- instructions or compiled system input
- input items/messages
- tool definitions
- reasoning config
- tool-choice config
- request-level overrides derived from SDK/user configuration, such as reasoning effort, summary mode, verbosity, and other per-turn sampling controls
- metadata needed for correlation
- optional previous-response linkage if Codex wants it
- stream expectation
- opaque session/turn correlation ids
The important point is that this is **Codex-authored**. The host forwards it, it does not reinterpret it.
### 2. Delegated model stream injection
The host needs a way to stream upstream events back into Codex.
Proposed client method:
- `model/streamEvent`
Purpose:
- client-to-server notification or request delivering one upstream Responses stream event at a time
The server should accept:
- raw upstream event payload
- correlation id tying the event to the outstanding `model/request`
This lets Codex continue using its native event handling logic.
### 3. Terminal stream semantics
For normal operation, Codex should infer terminal model state from the raw upstream Responses events themselves, especially `response.completed` and `response.failed`. In other words, the canonical end-of-turn signal should come from the same event stream Codex is already consuming.
A separate client method is only needed for cases where the host cannot provide a terminal Responses event, for example:
- the host canceled the upstream request before a terminal event was emitted
- the network stream disconnected mid-flight
- the host rejected the delegated request before sending it upstream
In that narrower case, a small escape hatch such as `model/streamAborted` is useful. It should carry:
- delegated request id
- abort reason such as `canceled`, `disconnected`, or `requestRejected`
- normalized error info if relevant
This keeps the happy path simple while still giving Codex a way to distinguish "the model finished" from "the host-side transport broke."
### 4. Delegated model cancellation
Codex may need to ask the host to cancel an in-flight upstream request.
Proposed server request:
- `model/cancel`
This is important for:
- turn interruption
- approval denial during streaming
- client disconnect handling
- compaction or reroute logic
### 5. External rollout mode
Codex needs to know it is not the durable source of truth.
Proposed thread/session config:
- `rolloutOwnership: "server" | "client"`
In the new mode, use `"client"`.
Behaviorally, this means:
- server may still keep ephemeral in-memory turn state
- server should not assume persisted thread state is canonical
- resume/fork semantics should allow the client to provide prior rollout context explicitly
### 6. External history hydrate
If the host owns persistence, Codex needs a way to rehydrate a thread from client-supplied history.
Proposed startup field or dedicated method:
- `thread/start` or `thread/resume` with `initialItems` / `turnHistory`
This should be the normalized Codex-facing history representation, not raw Responses-only items.
That keeps Codex's turn engine informed without forcing SQLite/file rollout ownership back into the container.
There is already an implicit translation boundary here today: Codex does not operate on raw SSE events as its durable thread model. It turns upstream Responses output into a richer internal history made of turns and items. Client-owned rollout mode would make that boundary explicit. The host would persist the Codex-facing item history it receives over app-server notifications, then feed that normalized history back on resume, rather than trying to reconstruct a thread from raw Responses API events alone.
## Proposed event surface
A clean high-level set could be:
### Server -> client
- `model/request`
- `model/cancel`
- `delegation/request` for subagents or cross-container execution
- existing approval requests remain unchanged
- existing `item/tool/requestUserInput` remains unchanged
### Client -> server
- `model/streamEvent`
- `model/streamAborted`
- `model/requestRejected`
- `delegation/result`
- existing approval decisions remain unchanged
- existing tool/user-input responses remain unchanged
## Mermaid: end-to-end flow
```mermaid
sequenceDiagram
participant Host as "Universal Computer SDK (host)"
participant Codex as "Codex app-server (container/local)"
participant API as "Responses API"
Host->>Codex: thread/start + full config + fullDelegation
Host->>Codex: turn/start(user input)
Codex-->>Host: model/request(request envelope)
Host->>API: POST /v1/responses (stream=true)
loop Streaming
API-->>Host: response event
Host->>Codex: model/streamEvent(event)
Codex-->>Host: item/turn notifications
end
Codex-->>Host: item/commandExecution/requestApproval
Host->>Host: programmatic approval policy
Host-->>Codex: approval decision
Codex->>Codex: execute tool in container
Codex-->>Host: model/request(next request after tool output)
Host->>API: next Responses call
API-->>Host: terminal event
Host->>Codex: model/streamEvent(response.completed)
Codex-->>Host: turn/completed
Host->>Host: persist rollout as source of truth
```
## Mermaid: state ownership
In plain English: the host SDK remains the control plane, and Codex inside the container remains the execution plane. The host is responsible for the things that need global visibility or trust: talking to the Responses API, persisting rollout state, deciding approval policy, and deciding where delegated work should run. Codex is responsible for the things that need local workspace context: assembling the actual model request, running the turn state machine, choosing and executing tools, and applying side effects inside the container. The diagram below simply shows that split of responsibilities rather than a strict request-by-request sequence.
```mermaid
flowchart LR
subgraph Host["Host SDK"]
H1["Responses auth + transport"]
H2["Rollout persistence"]
H3["Approval policy"]
H4["Backend routing / multi-container"]
end
subgraph Remote["Remote Codex app-server"]
C1["Prompt + request synthesis"]
C2["Turn state machine"]
C3["Tool registry + execution"]
C4["Workspace-local side effects"]
end
H1 --> C2
H2 --> C2
H3 --> C3
H4 --> C2
C1 --> H1
C2 --> H2
C3 --> H3
```
## Behavioral changes required inside Codex
### 1. Separate "prepare request" from "send request"
Today those are effectively fused. Full delegation requires Codex to:
- build the canonical upstream request
- stop before transport
- wait for externally streamed events
That is the fundamental internal refactor.
### 2. Accept externally sourced Responses events as first-class input
Codex must be able to ingest a Responses event stream that it did not open itself.
This means:
- correlation of event stream to active turn/request
- same parsing, validation, and item synthesis path as direct mode
- same terminal handling and retry semantics where applicable
### 3. Make thread persistence optional, not authoritative
In client-owned rollout mode, Codex should treat persistence as operational cache, not source of truth.
A good discipline is:
- in-memory state for active turn execution
- explicit rehydrate from client history on resume
- no hidden reliance on local rollout files for correctness
### 4. Make tool registry fully session-configurable
This is already partly present through dynamic tools, plugins, and apps, but the new mode should make it explicit that the tool surface may be:
- default Codex tools
- default Codex tools plus client additions
- a full client override
- a minimal safe subset
The important policy question is precedence. My recommendation:
- `default`
- `default + additive overrides`
- `replace entirely`
as three explicit modes, not implicit merging.
### 5. Preserve current approval semantics across all tools
Approvals must remain server-initiated from Codex to host, because that is the clean point where the host can inject policy without reimplementing runtime behavior.
Operationally, this likely means Codex should be started with an approval configuration that never blocks on an in-container human prompt and instead always routes approval decisions through app-server requests to the host. The host SDK then becomes the policy engine and UI surface for tool approvals, while Codex remains the party that formulates the execution request and enforces the answer.
The host should not be approving raw Responses tool call output. It should be approving Codex's normalized execution intent: "run this command," "apply this patch," "grant this network access," and so on.
### 6. Support host-intercepted delegation as a future sibling of model delegation
If you want multi-container delegation, do not hide subagent creation entirely inside the container runtime. Give it a parallel host-visible control point.
Concretely, app-server should emit a `delegation/request` event whenever Codex wants to spawn a subagent. That event should include:
- the parent thread/turn context
- the requested subagent instructions and input items
- the requested tool/profile configuration
- execution hints such as preferred cwd, sandbox, or model
- enough metadata for the host to correlate the child back to the parent
The host SDK can then choose one of two paths:
1. run the subagent in the same container and return a `delegation/result`
2. materialize it as a separate top-level agent on another backend or container and still return a `delegation/result`
In both cases, Codex should treat the result as a structured child outcome rather than assuming where or how the subagent ran. That gives the SDK user real control over topology without making Codex blind to delegated work.
## Configuration model
Full delegation mode must support all the configuration Universal Computer already treats as first-class:
- base instructions
- developer instructions
- user instructions
- model choice
- sampling params
- plugin declarations
- app/plugin auth context
- tool override policy
- approval policy
- cwd / manifest / workspace metadata
The cleanest way to do this is:
1. client sends declarative config to app-server
2. Codex composes the actual upstream request
3. host transmits that request unchanged
This preserves client control without creating dual prompt builders.
In practice, this means there are two useful layers of configuration:
- session-level defaults supplied when establishing the thread or runtime
- request-level overrides supplied per turn, such as reasoning effort, summaries, verbosity, or other sampling controls
Codex should own how those layers merge into the final upstream request, but the SDK user should still be able to express both layers declaratively.
## Tooling model
Universal Computer's current plugins can affect:
- tools
- instructions
- sampling params
- context
- manifest
Codex app-server should not try to mimic Python plugin objects. Instead, the protocol should expose the resulting configuration effects in transportable form.
Three buckets matter:
### 1. Native Codex tools
Examples:
- shell
- apply patch
- filesystem-like behavior
- skills/apps tooling
These should stay implemented in Rust.
### 2. Declarative client-added tools
These are already conceptually close to dynamic tools.
### 3. Host policy wrappers
The host may still want to:
- require approval
- deny certain tools
- redirect certain actions
- attach metadata
This should be policy/config, not alternative execution logic.
## Rollout ownership
This deserves explicit treatment.
If the host is the source of truth, then app-server should not quietly persist a more authoritative local reality than the host sees.
Recommended behavior in client-owned rollout mode:
- active turn state exists in memory inside Codex
- the host receives all canonical turn/item notifications
- the host persists them
- resume requires the host to resupply prior normalized history
- local persistence, if any, is cache-only and discardable
That keeps recovery honest.
## Failure semantics
Full delegation mode needs explicit failure boundaries.
### Host-side failures
Examples:
- Responses auth failure
- network failure
- stream disconnect
- host policy rejection
These should arrive back in Codex as delegated request failures and surface through normal turn failure notifications.
### Codex-side failures
Examples:
- malformed upstream event
- incompatible tool result
- internal turn-state fault
These should surface as Codex errors to the host.
### Split-brain prevention
At most one outstanding delegated model request should be active per active turn segment unless Codex explicitly supports multiplexing. Start single-flight.
That constraint is worth being conservative about.
It is different from "only one tool runs at a time" or "only one turn exists at a time." The point is narrower: for a given live turn segment, there should be one authoritative upstream model stream that Codex is currently interpreting. If the host allowed two overlapping delegated Responses streams to feed the same turn state, Codex would need a much more complicated merge model for deltas, tool calls, and terminal events. Starting with single-flight keeps turn state deterministic.
## Security and trust boundaries
This design is stronger than today's Python-tool model in one important way: the canonical executor of shell and file actions moves into the same Rust runtime that already knows Codex's approval and event semantics.
That said, the host now becomes highly privileged because it owns:
- auth
- transcript persistence
- upstream transport
- approval decisions
That is acceptable, because Universal Computer already lives at that privilege level.
## Migration path
### Phase 1
- add `fullDelegation` execution mode
- add `model/request`
- add `model/streamEvent`
- add `model/streamAborted`
- add `model/cancel`
- add client-owned rollout mode with startup rehydrate
This is enough for a single-container Universal Computer integration.
### Phase 2
- add explicit tool-set override modes
- harden resume/fork semantics for externally persisted history
- support more complete correlation and retry rules
### Phase 3
- add host-visible delegation/subagent interception
- route subagents to alternate containers/backends
## Open questions
1. What is the canonical wire format for delegated model requests? My recommendation: a Codex-defined envelope that is close to Responses payloads, but explicitly versioned and correlation-safe.
2. Should the host stream raw Responses events or normalized Codex events back? Raw Responses events. Normalization should remain inside Codex.
3. Should local persistence be disabled entirely in client-owned rollout mode? Prefer "non-authoritative cache" over "disabled," but correctness must not depend on it.
4. Should tool overrides be merged or replaced? There is a real difference:
- merged means "start with Codex defaults, then add or selectively override entries"
- replaced means "the client supplies the entire tool surface and Codex defaults are not implicitly present"
Support both, explicitly. Implicit merge will become a policy trap.
5. How much of plugin behavior should be representable over protocol? Only the effects, not the Python object model.
## Changes required in Universal Computer
The Codex-side protocol changes are only half of the story. To make this architecture real, Universal Computer also needs to grow a host-side integration layer that treats Codex app-server as a remote execution runtime rather than treating the Responses API as the only runtime boundary.
At a high level, Universal Computer should stop being responsible for implementing the default Codex tool surface in Python and instead become responsible for:
- provisioning a compatible Codex binary
- starting and supervising app-server
- relaying delegated model traffic to the selected provider
- persisting rollout state as the canonical host-side record
- exposing SDK ergonomics for tool configuration, approvals, and delegation routing
### 1. Pin and provision a Codex version
Universal Computer will need an explicit notion of the Codex runtime version it expects to launch.
That likely means:
- adding a pinned Codex version field to the agent or runtime configuration
- defining how that resolves to a concrete binary artifact for the current host platform
- making the app-server protocol version part of compatibility checks
This should be treated as a first-class runtime dependency, not an incidental local executable lookup. If the host and container disagree about protocol shape, delegation mode will fail in confusing ways, so version pinning should be deliberate.
Recommended direction:
- Universal Computer pins a Codex release or build identifier explicitly
- the host resolves and caches that artifact
- the runtime startup path verifies the binary version before starting app-server
### 2. Reuse existing backends to place the Codex binary in the destination environment
Universal Computer already knows how to create and resume execution environments. It should reuse that backend abstraction for Codex provisioning rather than inventing a separate deployment system.
Concretely, the current backend model is already a good fit for binary staging:
- `BaseSandboxClient` creates and resumes sessions
- `BaseSandboxSession` exposes `write`, `read`, `exec`, and workspace materialization
- manifest entries such as `LocalFile` already support copying a host file into the workspace and applying permissions via `chmod`
So the binary-placement story does not need a brand-new distribution mechanism. Universal Computer can either:
- stage the pinned Codex binary as a manifest artifact with executable permissions, or
- push it into the workspace during session startup with `session.write(...)` followed by `chmod`
The first option is especially attractive because it fits the existing manifest/snapshot model and keeps provisioning declarative.
At a high level, each backend would need to support:
- ensuring the Codex binary is present in the target environment
- placing any required companion assets if Codex needs them
- starting `codex app-server` with the right arguments
- returning a live transport handle back to the host SDK
For local execution, this step can degenerate into "use a local binary and skip copy." For remote or containerized execution, this becomes an explicit staging step.
The important design point is that backend-specific logic stays confined to:
- binary placement
- process startup
- transport attachment
- snapshot and manifest lifecycle
and not tool execution.
One nuance from the codebase: backend reuse is straightforward for file placement, but not yet for long-lived supervised process attachment. Universal Computer's shared session API supports one-shot `exec` everywhere, while PTY-style attached process interaction exists only on some backends. If Codex app-server is going to be launched as a long-running child process, Universal Computer will likely need one additional backend-neutral capability for "start a process and keep a live byte stream attached," rather than trying to shoehorn everything through one-shot exec.
### 3. Replace Python implementations of the default tool surface with symbolic tool references
Universal Computer can likely delete or de-emphasize the Python implementations of the default filesystem and shell tool behavior once Codex is the executor.
The code today makes this fairly concrete: the built-in tool surface is assembled from Python plugins like `Filesystem`, `Shell`, `ApplyPatch`, and `Compaction`. The first three are thin wrappers that bind to a `SandboxSession`, expose tool schemas, and add instruction fragments; they are not deep subsystems in their own right.
But the SDK still needs a way to express tool policy and shape the tool surface. So instead of Python tool implementations being the source of truth, they should become declarative references, for example:
- enable Codex shell
- disable Codex apply-patch
- use the default Codex tool set
- replace the default tool set with a minimal subset
In other words, the Python layer should continue to speak in terms of tool identities and policy, but not carry the execution logic for the built-in tools.
This is important for UX. SDK users still want to write things like:
- "enable shell but not apply patch"
- "disable filesystem writes"
- "use only custom tools"
Those should remain easy, but they should compile down to app-server configuration rather than selecting Python classes that implement the behavior directly.
The one built-in plugin that does not fit the "just replace it with a Codex tool" bucket is compaction. In Universal Computer today, compaction is expressed as sampling-parameter and context-processing behavior rather than as a shell/filesystem tool. So the migration should separate:
- built-in execution tools that move to Codex
- host-side request shaping policies, like compaction thresholds, that may still belong in the SDK and need to be forwarded into delegated model requests
### 4. Add a dedicated app-server package or module
Universal Computer should grow a dedicated host-side app-server integration package rather than smearing the logic across the existing agent runtime.
Conceptually, that package would own:
- app-server process lifecycle
- connection management
- protocol type definitions
- delegated model request handling
- approval request handling
- delegated subagent handling
- rollout event capture and persistence hooks
A clean package boundary here matters because this integration is not just "another tool." It is a new runtime substrate.
A useful mental split would be:
- core Universal Computer agent model
- backend/session abstractions
- provider adapters
- app-server bridge
That keeps the Codex-specific transport logic from leaking into unrelated parts of the SDK.
### 5. Support the new delegated app-server events
Universal Computer will need host-side handlers for the new protocol surface proposed above.
At minimum, that means understanding and responding to:
- `model/request`
- `model/streamEvent`
- `model/streamAborted`
- `model/cancel`
- `delegation/request`
- `delegation/result`
- existing approval requests
In practice, the host runtime loop changes from:
- call `responses.create(...)`
- stream raw events
- inspect pending tool calls
to:
- wait for `model/request` from Codex
- execute that request against the selected provider
- feed raw upstream events back with `model/streamEvent`
- honor `model/cancel` and approval flows
- optionally route `delegation/request` to a different container or backend
That is a meaningful runtime refactor, but it is conceptually clean: Universal Computer becomes an orchestrator around Codex rather than a reimplementation of Codex behavior.
### 6. Add a host-side multi-provider abstraction
Today Universal Computer is structurally very OpenAI-shaped because the runtime path is built around the Responses API client. In delegated mode, that logic becomes even more central, so it should be abstracted intentionally.
The current code is explicit about this: `Task` stores an `openai.AsyncClient` and its default producer literally calls `client.responses.create(...)`. So multi-provider support is not a small configuration tweak; it is a real runtime abstraction change.
The host needs a provider abstraction capable of:
- taking a Codex-authored delegated model request
- translating it to the selected upstream provider call shape
- streaming provider events back into the common app-server event format
- surfacing provider-specific failures in a normalized way
For OpenAI-backed flows, that can stay close to raw Responses semantics.
For Anthropic or other providers, the host may need an adapter layer that maps:
- request fields
- tool-calling events
- reasoning/summary controls where supported
- terminal and error events
back into the event shape Codex expects.
This is precisely why the translation boundary should live on the host, not in the container. Provider choice is a host concern.
Recommended direction:
- define a `ModelProvider` or similarly named host-side interface
- keep OpenAI as the reference implementation
- add provider capability metadata so unsupported delegated-request features can fail clearly rather than degrade silently
There is already a hint of the right design elsewhere in Universal Computer: the memory subsystem defines normalized result schemas specifically so the rest of the system does not need to understand provider-specific formats. The delegated app-server bridge should follow the same principle for streamed model events.
### 7. Add host-side rollout persistence built around Codex item history
If the host is now the source of truth, Universal Computer should persist the Codex-facing event history it receives from app-server, not just the raw upstream Responses interaction.
That likely means persisting:
- thread identity
- turns
- normalized items
- approval decisions
- delegation edges between parent and child agents
- provider and runtime metadata
This persistence layer should support:
- resume into the same container
- resume into a fresh container with rehydrated history
- cross-backend continuation when the SDK chooses to re-home the work
### 8. Transport recommendation: prefer stdio over a reliable byte stream bridge
For the host-to-container app-server transport, the safest recommendation is:
- first choice: stdio over an attached process handle
- second choice: a reliable byte-stream tunnel such as SSH or a backend-managed TCP stream
Why:
- app-server traffic is ordered, stateful, and request-response oriented
- JSON-RPC + streaming notifications want reliable delivery and backpressure
`stdio` is still the right target transport because Codex app-server already supports it as the primary mode. But after a deeper look at Universal Computer, there is an important implementation detail: the current shared session abstraction does not yet provide a backend-neutral "launch a long-lived child process and keep stdin/stdout attached" API. It provides:
- one-shot `exec` everywhere
- optional PTY process support on some backends such as local Unix and Modal
- no equivalent attached-process primitive on Docker today
So the recommendation should be more precise:
- standardize on app-server `stdio` as the protocol transport
- add a new backend-neutral attached-process capability to Universal Computer for long-lived bridge processes
- make that capability part of the expected contract for all supported backends, instead of treating it as an optimization for only a few environments
- implement that capability per backend, instead of introducing a separate network protocol just to compensate for the missing primitive
If Universal Computer can directly attach to the launched process, `stdio` is ideal because:
- it matches app-server's primary supported transport
- it avoids inventing network semantics
- it inherits process lifecycle naturally
- it is easy to secure because nothing is exposed on a network port
For Docker specifically, that likely means adding a backend implementation that can launch Codex as an attached process rather than relying only on detached one-shot execs. For example, the backend could use an attached `docker exec` session or make Codex the supervised long-lived process inside the container and bridge its stdio back to the host.
If a direct process attachment is impossible because of the backend, the next best choice is a reliable stream transport tunneled over something the backend already trusts:
- SSH port forwarding or command execution with pipes
- a backend-provided TCP tunnel
I would not recommend treating the app-server websocket transport as the default fallback here, because Codex currently describes it as experimental and unsupported. If a backend absolutely forces a bridged network transport, prefer a reliable stream that still carries stdio-like semantics over inventing a new public network surface.
Recommendation:
- standardize on `stdio` as the canonical transport
- add a UC session-level attached-process abstraction to make `stdio` practical across backends
- require all supported backends to implement an attached-process bridge capable of launching and supervising app-server with a live byte stream
- use SSH or another reliable stream tunnel only when direct attachment is impossible
- treat websocket support as an implementation detail of last resort, not the preferred contract
This keeps the transport boring, which is exactly what you want for the control plane of a remote agent runtime.
### 9. Suggested Universal Computer rollout plan
A pragmatic order of operations would be:
1. add a Codex runtime abstraction with version pinning and binary provisioning
2. add an app-server bridge package with stdio-based transport
3. implement OpenAI delegated model handling end to end
4. persist Codex-facing history host-side and support resume
5. replace Python built-in tool execution with declarative tool enablement
6. add subagent interception and routing
7. add additional provider adapters such as Anthropic
That sequence gets a single-container OpenAI-backed flow working early while leaving room for multi-provider and multi-container sophistication later.
## Recommendation
Build **full delegation mode** as an app-server-level capability, not as a Universal Computer-specific shim.
The winning shape is:
- remote Codex prepares
- host transmits
- remote Codex interprets
- host persists
That preserves the best properties of both systems:
- Universal Computer keeps its orchestration superpower
- Codex becomes the reusable execution engine and tool runtime you actually want to standardize on