Mirror of https://github.com/openai/codex.git, synced 2026-05-15 08:42:34 +00:00
## Why

The SDK had behavioral tests that replaced SDK client internals. Those tests could catch wrapper mistakes, but they did not prove the pinned app-server runtime, generated notification models, request routing, and sync/async public clients worked together. This PR adds deterministic integration coverage that starts the pinned `codex app-server` process and mocks only the upstream Responses HTTP boundary.

## What

- Add `AppServerHarness` and `MockResponsesServer` helpers for isolated `CODEX_HOME`, mock-provider config, queued SSE responses, and captured `/v1/responses` requests.
- Add shared helpers for SSE construction, stream assertions, approval-policy inspection, and image fixtures.
- Split integration coverage into focused modules for run behavior, inputs, streaming, turn controls, approvals, and thread lifecycle.
- Cover sync and async `Thread.run`, `TurnHandle.stream`, interleaved streams, approval-mode persistence, lifecycle helpers, final-answer phase handling, image inputs, loaded skill input injection, steering, interruption, listing, history reads, run overrides, and token usage mapping.
- Replace public-wrapper tests that duplicated integration-test behavior with lower-level client tests only where direct client behavior is the thing under test.

## Stack

1. #21891 `[1/8]` Pin Python SDK runtime dependency
2. #21893 `[2/8]` Generate Python SDK types from pinned runtime
3. #21895 `[3/8]` Run Python SDK tests in CI
4. #21896 `[4/8]` Define Python SDK public API surface
5. #21905 `[5/8]` Rename Python SDK package to `openai-codex`
6. #21910 `[6/8]` Add high-level Python SDK approval mode
7. This PR `[7/8]` Add Python SDK app-server integration harness
8. #22021 `[8/8]` Add Python SDK Ruff formatting

## Verification

- Added pinned app-server integration tests under `sdk/python/tests/test_app_server_*.py` and `test_real_app_server_integration.py`.

---------

Co-authored-by: Codex <noreply@openai.com>
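For orientation, the pattern shared by the new tests is sketched below. It is a condensed copy of `test_sync_thread_run_uses_mock_responses` from the file that follows, not an additional test; the helpers it calls (`AppServerHarness`, `enqueue_assistant_message`, `app_server_config`, `single_request`) come from the new test-support modules, not the public SDK surface.

```python
# Condensed sketch of the harness pattern used throughout the integration tests below.
from app_server_harness import AppServerHarness
from openai_codex import Codex


def test_sync_run_sketch(tmp_path) -> None:
    # Start the pinned `codex app-server` process with an isolated CODEX_HOME and a
    # mock-provider config; only the upstream /v1/responses HTTP boundary is faked.
    with AppServerHarness(tmp_path) as harness:
        # Queue the assistant reply the mock Responses server will stream back as SSE.
        harness.responses.enqueue_assistant_message("Hello from the mock.", response_id="run-1")

        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("hello")

        # The mock server records the single /v1/responses request it received.
        request = harness.responses.single_request()

    assert result.final_response == "Hello from the mock."
    assert request.body_json()["model"] == "mock-model"
```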
370 lines · 13 KiB · Python
from __future__ import annotations

import asyncio

import pytest

from app_server_harness import (
    AppServerHarness,
    ev_assistant_message,
    ev_completed,
    ev_completed_with_usage,
    ev_failed,
    ev_response_created,
    sse,
)
from openai_codex import AsyncCodex, Codex
from openai_codex.generated.v2_all import MessagePhase
from app_server_helpers import (
    agent_message_texts_from_items,
    assistant_message_with_phase,
)


def test_sync_thread_run_uses_mock_responses(
    tmp_path,
) -> None:
    """Drive Thread.run through the pinned app-server and inspect the HTTP request."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_assistant_message("Hello from the mock.", response_id="run-1")

        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
            result = thread.run("hello")

        request = harness.responses.single_request()

    body = request.body_json()
    assert {
        "final_response": result.final_response,
        "agent_messages": agent_message_texts_from_items(result.items),
        "has_usage": result.usage is not None,
        "request_model": body["model"],
        "request_stream": body["stream"],
        "request_user_texts": request.message_input_texts("user")[-1:],
    } == {
        "final_response": "Hello from the mock.",
        "agent_messages": ["Hello from the mock."],
        "has_usage": True,
        "request_model": "mock-model",
        "request_stream": True,
        "request_user_texts": ["hello"],
    }


def test_run_params_and_usage_cross_app_server_boundary(tmp_path) -> None:
    """Thread.run should pass overrides and collect app-server token usage."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("run-overrides"),
                    ev_assistant_message("msg-run-overrides", "overrides applied"),
                    ev_completed_with_usage(
                        "run-overrides",
                        input_tokens=11,
                        cached_input_tokens=3,
                        output_tokens=7,
                        reasoning_output_tokens=5,
                        total_tokens=18,
                    ),
                ]
            )
        )

        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
            result = thread.run(
                "use overrides",
                model="mock-model-override",
            )
        request = harness.responses.single_request()

    usage_payload = None
    if result.usage is not None:
        dumped_usage = result.usage.model_dump(by_alias=True, mode="json")
        usage_payload = {
            "last": dumped_usage["last"],
            "total": dumped_usage["total"],
        }
    assert {
        "final_response": result.final_response,
        "request_model": request.body_json()["model"],
        "usage": usage_payload,
    } == {
        "final_response": "overrides applied",
        "request_model": "mock-model-override",
        "usage": {
            "last": {
                "cachedInputTokens": 3,
                "inputTokens": 11,
                "outputTokens": 7,
                "reasoningOutputTokens": 5,
                "totalTokens": 18,
            },
            "total": {
                "cachedInputTokens": 3,
                "inputTokens": 11,
                "outputTokens": 7,
                "reasoningOutputTokens": 5,
                "totalTokens": 18,
            },
        },
    }


def test_async_thread_run_uses_mock_responses(
    tmp_path,
) -> None:
    """Async Thread.run should exercise the same app-server boundary."""

    async def scenario() -> None:
        """Run the async client against a real app-server process."""
        with AppServerHarness(tmp_path) as harness:
            harness.responses.enqueue_assistant_message(
                "Hello async.",
                response_id="async-run-1",
            )

            async with AsyncCodex(config=harness.app_server_config()) as codex:
                thread = await codex.thread_start()
                result = await thread.run("async hello")

            request = harness.responses.single_request()

        assert {
            "final_response": result.final_response,
            "agent_messages": agent_message_texts_from_items(result.items),
            "request_user_texts": request.message_input_texts("user")[-1:],
        } == {
            "final_response": "Hello async.",
            "agent_messages": ["Hello async."],
            "request_user_texts": ["async hello"],
        }

    asyncio.run(scenario())


def test_sync_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
    """RunResult should use the last unknown-phase agent message as final text."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("items-last"),
                    ev_assistant_message("msg-items-first", "First message"),
                    ev_assistant_message("msg-items-second", "Second message"),
                    ev_completed("items-last"),
                ]
            )
        )

        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("case: last unknown phase wins")

    assert {
        "final_response": result.final_response,
        "agent_messages": agent_message_texts_from_items(result.items),
    } == {
        "final_response": "Second message",
        "agent_messages": ["First message", "Second message"],
    }


def test_sync_run_result_preserves_empty_last_message(tmp_path) -> None:
    """RunResult should preserve an empty final agent message instead of skipping it."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("items-empty"),
                    ev_assistant_message("msg-items-nonempty", "First message"),
                    ev_assistant_message("msg-items-empty", ""),
                    ev_completed("items-empty"),
                ]
            )
        )

        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("case: empty last message")

    assert {
        "final_response": result.final_response,
        "agent_messages": agent_message_texts_from_items(result.items),
    } == {
        "final_response": "",
        "agent_messages": ["First message", ""],
    }


def test_sync_run_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
    """RunResult final_response should stay unset when app-server marks only commentary."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("items-commentary"),
                    assistant_message_with_phase(
                        "msg-items-commentary",
                        "Commentary",
                        MessagePhase.commentary,
                    ),
                    ev_completed("items-commentary"),
                ]
            )
        )

        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("case: commentary only")

    assert {
        "final_response": result.final_response,
        "agent_messages": agent_message_texts_from_items(result.items),
    } == {
        "final_response": None,
        "agent_messages": ["Commentary"],
    }


def test_async_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
    """Async RunResult should use the last unknown-phase agent message."""

    async def scenario() -> None:
        """Run one async result-mapping case against a pinned app-server."""
        with AppServerHarness(tmp_path) as harness:
            harness.responses.enqueue_sse(
                sse(
                    [
                        ev_response_created("async-items-last"),
                        ev_assistant_message(
                            "msg-async-items-first",
                            "First async message",
                        ),
                        ev_assistant_message(
                            "msg-async-items-second",
                            "Second async message",
                        ),
                        ev_completed("async-items-last"),
                    ]
                )
            )

            async with AsyncCodex(config=harness.app_server_config()) as codex:
                result = await (await codex.thread_start()).run(
                    "case: async last unknown phase"
                )

        assert {
            "final_response": result.final_response,
            "agent_messages": agent_message_texts_from_items(result.items),
        } == {
            "final_response": "Second async message",
            "agent_messages": ["First async message", "Second async message"],
        }

    asyncio.run(scenario())


def test_async_run_result_does_not_promote_commentary_only_to_final(
    tmp_path,
) -> None:
    """Async RunResult final_response should stay unset for commentary-only output."""

    async def scenario() -> None:
        """Run one async commentary mapping case against a pinned app-server."""
        with AppServerHarness(tmp_path) as harness:
            harness.responses.enqueue_sse(
                sse(
                    [
                        ev_response_created("async-items-commentary"),
                        assistant_message_with_phase(
                            "msg-async-items-commentary",
                            "Async commentary",
                            MessagePhase.commentary,
                        ),
                        ev_completed("async-items-commentary"),
                    ]
                )
            )

            async with AsyncCodex(config=harness.app_server_config()) as codex:
                result = await (await codex.thread_start()).run(
                    "case: async commentary only"
                )

        assert {
            "final_response": result.final_response,
            "agent_messages": agent_message_texts_from_items(result.items),
        } == {
            "final_response": None,
            "agent_messages": ["Async commentary"],
        }

    asyncio.run(scenario())


def test_thread_run_raises_when_real_app_server_reports_failed_turn(tmp_path) -> None:
    """Thread.run should surface the failed turn error emitted by app-server."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("failed-run"),
                    ev_failed("failed-run", "boom from mock model"),
                ]
            )
        )

        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
            with pytest.raises(RuntimeError, match="boom from mock model"):
                thread.run("trigger failure")


def test_final_answer_phase_survives_real_app_server_mapping(tmp_path) -> None:
    """RunResult should use the final-answer item emitted by app-server."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
                [
                    ev_response_created("phase-1"),
                    {
                        **ev_assistant_message("msg-commentary", "Commentary"),
                        "item": {
                            **ev_assistant_message("msg-commentary", "Commentary")["item"],
                            "phase": MessagePhase.commentary.value,
                        },
                    },
                    {
                        **ev_assistant_message("msg-final", "Final answer"),
                        "item": {
                            **ev_assistant_message("msg-final", "Final answer")["item"],
                            "phase": MessagePhase.final_answer.value,
                        },
                    },
                    ev_completed("phase-1"),
                ]
            )
        )

        with Codex(config=harness.app_server_config()) as codex:
            result = codex.thread_start().run("choose final answer")

    assert {
        "final_response": result.final_response,
        "items": [
            {
                "text": item.root.text,
                "phase": None if item.root.phase is None else item.root.phase.value,
            }
            for item in result.items
            if item.root.type == "agentMessage"
        ],
    } == {
        "final_response": "Final answer",
        "items": [
            {"text": "Commentary", "phase": MessagePhase.commentary.value},
            {"text": "Final answer", "phase": MessagePhase.final_answer.value},
        ],
    }