codex/sdk/python/tests/test_app_server_run.py
Ahmed Ibrahim 3e10e09e24 [7/8] Add Python SDK app-server integration harness (#22014)
## Why

The SDK had behavioral tests that replaced SDK client internals with test
doubles. Those tests could catch wrapper mistakes, but they did not prove
that the pinned app-server runtime, the generated notification models,
request routing, and the sync/async public clients worked together.

This PR adds deterministic integration coverage that starts the pinned
`codex app-server` process and mocks only the upstream Responses HTTP
boundary.

## What

- Add `AppServerHarness` and `MockResponsesServer` helpers for an isolated
`CODEX_HOME`, mock-provider config, queued SSE responses, and captured
`/v1/responses` requests (see the sketch after this list).
- Add shared helpers for SSE construction, stream assertions,
approval-policy inspection, and image fixtures.
- Split integration coverage into focused modules for run behavior,
inputs, streaming, turn controls, approvals, and thread lifecycle.
- Cover sync and async `Thread.run`, `TurnHandle.stream`, interleaved
streams, approval-mode persistence, lifecycle helpers, final-answer
phase handling, image inputs, loaded skill input injection, steering,
interruption, listing, history reads, run overrides, and token usage
mapping.
- Replace public-wrapper tests that duplicated this integration coverage,
keeping lower-level client tests only where direct client behavior is itself
the thing under test.
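
Every test below follows one shape: boot the harness, queue a canned
Responses payload, drive the public client, and assert on both the SDK
result and the captured HTTP request. A minimal sketch of that pattern (the
helper and client names are the real ones from this PR; the message text,
prompt, and expected values are illustrative):

```python
from app_server_harness import AppServerHarness
from openai_codex import Codex


def test_sketch(tmp_path) -> None:
    # Starts the pinned `codex app-server` plus a mock /v1/responses endpoint.
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_assistant_message("Hi!", response_id="r-1")
        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("prompt")
            assert result.final_response == "Hi!"
            # The mock captures the outbound request for inspection.
            request = harness.responses.single_request()
            assert request.body_json()["stream"] is True
```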

## Stack

1. #21891 `[1/8]` Pin Python SDK runtime dependency
2. #21893 `[2/8]` Generate Python SDK types from pinned runtime
3. #21895 `[3/8]` Run Python SDK tests in CI
4. #21896 `[4/8]` Define Python SDK public API surface
5. #21905 `[5/8]` Rename Python SDK package to `openai-codex`
6. #21910 `[6/8]` Add high-level Python SDK approval mode
7. This PR `[7/8]` Add Python SDK app-server integration harness
8. #22021 `[8/8]` Add Python SDK Ruff formatting

## Verification

- Added pinned app-server integration tests under
`sdk/python/tests/test_app_server_*.py` and
`test_real_app_server_integration.py`.
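
These are ordinary pytest modules; assuming the usual checkout layout,
something like `pytest tests/test_app_server_run.py` run from `sdk/python`
exercises the file below (CI wiring landed in `[3/8]`).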

---------

Co-authored-by: Codex <noreply@openai.com>
2026-05-12 01:06:41 +03:00


from __future__ import annotations

import asyncio

import pytest
from app_server_harness import (
    AppServerHarness,
    ev_assistant_message,
    ev_completed,
    ev_completed_with_usage,
    ev_failed,
    ev_response_created,
    sse,
)
from app_server_helpers import (
    agent_message_texts_from_items,
    assistant_message_with_phase,
)
from openai_codex import AsyncCodex, Codex
from openai_codex.generated.v2_all import MessagePhase
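
# The app_server_* modules imported above live next to these tests:
# `app_server_harness` starts the pinned `codex app-server` with an isolated
# CODEX_HOME and a mock /v1/responses server, while `app_server_helpers`
# builds SSE events and inspects run results.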


def test_sync_thread_run_uses_mock_responses(tmp_path) -> None:
    """Drive Thread.run through the pinned app-server and inspect the HTTP request."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_assistant_message("Hello from the mock.", response_id="run-1")
        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
            result = thread.run("hello")
            request = harness.responses.single_request()
            body = request.body_json()
            # Assert on one dict of observed values so a failure reports every
            # mismatched field at once.
            assert {
                "final_response": result.final_response,
                "agent_messages": agent_message_texts_from_items(result.items),
                "has_usage": result.usage is not None,
                "request_model": body["model"],
                "request_stream": body["stream"],
                "request_user_texts": request.message_input_texts("user")[-1:],
            } == {
                "final_response": "Hello from the mock.",
                "agent_messages": ["Hello from the mock."],
                "has_usage": True,
                "request_model": "mock-model",
                "request_stream": True,
                "request_user_texts": ["hello"],
            }


def test_run_params_and_usage_cross_app_server_boundary(tmp_path) -> None:
    """Thread.run should pass overrides and collect app-server token usage."""
    with AppServerHarness(tmp_path) as harness:
        # Queue a complete SSE stream: created -> assistant message -> completed
        # with token usage attached.
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("run-overrides"),
                    ev_assistant_message("msg-run-overrides", "overrides applied"),
                    ev_completed_with_usage(
                        "run-overrides",
                        input_tokens=11,
                        cached_input_tokens=3,
                        output_tokens=7,
                        reasoning_output_tokens=5,
                        total_tokens=18,
                    ),
                ]
            )
        )
        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
            result = thread.run(
                "use overrides",
                model="mock-model-override",
            )
            request = harness.responses.single_request()
            usage_payload = None
            if result.usage is not None:
                dumped_usage = result.usage.model_dump(by_alias=True, mode="json")
                usage_payload = {
                    "last": dumped_usage["last"],
                    "total": dumped_usage["total"],
                }
            assert {
                "final_response": result.final_response,
                "request_model": request.body_json()["model"],
                "usage": usage_payload,
            } == {
                "final_response": "overrides applied",
                "request_model": "mock-model-override",
                "usage": {
                    "last": {
                        "cachedInputTokens": 3,
                        "inputTokens": 11,
                        "outputTokens": 7,
                        "reasoningOutputTokens": 5,
                        "totalTokens": 18,
                    },
                    "total": {
                        "cachedInputTokens": 3,
                        "inputTokens": 11,
                        "outputTokens": 7,
                        "reasoningOutputTokens": 5,
                        "totalTokens": 18,
                    },
                },
            }


def test_async_thread_run_uses_mock_responses(tmp_path) -> None:
    """Async Thread.run should exercise the same app-server boundary."""

    async def scenario() -> None:
        """Run the async client against a real app-server process."""
        with AppServerHarness(tmp_path) as harness:
            harness.responses.enqueue_assistant_message(
                "Hello async.",
                response_id="async-run-1",
            )
            async with AsyncCodex(config=harness.app_server_config()) as codex:
                thread = await codex.thread_start()
                result = await thread.run("async hello")
                request = harness.responses.single_request()
                assert {
                    "final_response": result.final_response,
                    "agent_messages": agent_message_texts_from_items(result.items),
                    "request_user_texts": request.message_input_texts("user")[-1:],
                } == {
                    "final_response": "Hello async.",
                    "agent_messages": ["Hello async."],
                    "request_user_texts": ["async hello"],
                }

    asyncio.run(scenario())


def test_sync_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
    """RunResult should use the last unknown-phase agent message as final text."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("items-last"),
                    ev_assistant_message("msg-items-first", "First message"),
                    ev_assistant_message("msg-items-second", "Second message"),
                    ev_completed("items-last"),
                ]
            )
        )
        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("case: last unknown phase wins")
            assert {
                "final_response": result.final_response,
                "agent_messages": agent_message_texts_from_items(result.items),
            } == {
                "final_response": "Second message",
                "agent_messages": ["First message", "Second message"],
            }


def test_sync_run_result_preserves_empty_last_message(tmp_path) -> None:
    """RunResult should preserve an empty final agent message instead of skipping it."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("items-empty"),
                    ev_assistant_message("msg-items-nonempty", "First message"),
                    ev_assistant_message("msg-items-empty", ""),
                    ev_completed("items-empty"),
                ]
            )
        )
        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("case: empty last message")
            assert {
                "final_response": result.final_response,
                "agent_messages": agent_message_texts_from_items(result.items),
            } == {
                "final_response": "",
                "agent_messages": ["First message", ""],
            }


def test_sync_run_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
    """RunResult final_response should stay unset when app-server marks only commentary."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("items-commentary"),
                    assistant_message_with_phase(
                        "msg-items-commentary",
                        "Commentary",
                        MessagePhase.commentary,
                    ),
                    ev_completed("items-commentary"),
                ]
            )
        )
        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("case: commentary only")
            assert {
                "final_response": result.final_response,
                "agent_messages": agent_message_texts_from_items(result.items),
            } == {
                "final_response": None,
                "agent_messages": ["Commentary"],
            }


def test_async_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
    """Async RunResult should use the last unknown-phase agent message."""

    async def scenario() -> None:
        """Run one async result-mapping case against a pinned app-server."""
        with AppServerHarness(tmp_path) as harness:
            harness.responses.enqueue_sse(
                sse(
                    [
                        ev_response_created("async-items-last"),
                        ev_assistant_message(
                            "msg-async-items-first",
                            "First async message",
                        ),
                        ev_assistant_message(
                            "msg-async-items-second",
                            "Second async message",
                        ),
                        ev_completed("async-items-last"),
                    ]
                )
            )
            async with AsyncCodex(config=harness.app_server_config()) as codex:
                result = await (await codex.thread_start()).run(
                    "case: async last unknown phase"
                )
                assert {
                    "final_response": result.final_response,
                    "agent_messages": agent_message_texts_from_items(result.items),
                } == {
                    "final_response": "Second async message",
                    "agent_messages": ["First async message", "Second async message"],
                }

    asyncio.run(scenario())


def test_async_run_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
    """Async RunResult final_response should stay unset for commentary-only output."""

    async def scenario() -> None:
        """Run one async commentary mapping case against a pinned app-server."""
        with AppServerHarness(tmp_path) as harness:
            harness.responses.enqueue_sse(
                sse(
                    [
                        ev_response_created("async-items-commentary"),
                        assistant_message_with_phase(
                            "msg-async-items-commentary",
                            "Async commentary",
                            MessagePhase.commentary,
                        ),
                        ev_completed("async-items-commentary"),
                    ]
                )
            )
            async with AsyncCodex(config=harness.app_server_config()) as codex:
                result = await (await codex.thread_start()).run(
                    "case: async commentary only"
                )
                assert {
                    "final_response": result.final_response,
                    "agent_messages": agent_message_texts_from_items(result.items),
                } == {
                    "final_response": None,
                    "agent_messages": ["Async commentary"],
                }

    asyncio.run(scenario())


def test_thread_run_raises_when_real_app_server_reports_failed_turn(tmp_path) -> None:
    """Thread.run should surface the failed turn error emitted by app-server."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("failed-run"),
                    ev_failed("failed-run", "boom from mock model"),
                ]
            )
        )
        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
            with pytest.raises(RuntimeError, match="boom from mock model"):
                thread.run("trigger failure")


def test_final_answer_phase_survives_real_app_server_mapping(tmp_path) -> None:
    """RunResult should use the final-answer item emitted by app-server."""
    with AppServerHarness(tmp_path) as harness:
        # The shared helper stamps an explicit phase on each assistant-message
        # event, the same construction the commentary-only tests above use.
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("phase-1"),
                    assistant_message_with_phase(
                        "msg-commentary",
                        "Commentary",
                        MessagePhase.commentary,
                    ),
                    assistant_message_with_phase(
                        "msg-final",
                        "Final answer",
                        MessagePhase.final_answer,
                    ),
                    ev_completed("phase-1"),
                ]
            )
        )
        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("choose final answer")
            assert {
                "final_response": result.final_response,
                "items": [
                    {
                        "text": item.root.text,
                        "phase": None if item.root.phase is None else item.root.phase.value,
                    }
                    for item in result.items
                    if item.root.type == "agentMessage"
                ],
            } == {
                "final_response": "Final answer",
                "items": [
                    {"text": "Commentary", "phase": MessagePhase.commentary.value},
                    {"text": "Final answer", "phase": MessagePhase.final_answer.value},
                ],
            }