[codex] Return TurnResult from Python turn handles (#23151)

## Why

`TurnHandle.run()` returned the raw app-server `Turn`, whose live
start/completed payloads do not include loaded `items`, so users saw
empty `items` after starting a turn. That made the handle-based path
behave differently from `Thread.run(...)`, and pushed examples toward
persisted-thread reads plus helper extraction.

This PR makes the run APIs standalone: starting a turn and running it
returns collected turn data directly, or fails visibly when required
stream events are missing.

## What Changed

- Replaces the public `RunResult` export with `TurnResult`.
- Adds turn metadata to `TurnResult`: `id`, `status`, `error`,
`started_at`, `completed_at`, and `duration_ms`, alongside
`final_response`, `items`, and `usage`.
- Changes `TurnHandle.run()` and `AsyncTurnHandle.run()` to consume
stream events with the same collector used by `Thread.run(...)`.
- Exports `TurnError` from `openai_codex.types` for the new result
shape.
- Updates tests, examples, docs, and the walkthrough notebook to use
`result.final_response` and `result.items` directly.
- Removes persisted-thread helper paths and placeholder/skipped control
flows from the public examples and notebook.

## Verification

- `python3 -m py_compile ...` over changed SDK, example, and test Python
files.
- `python3 -c "import json;
json.load(open('sdk/python/notebooks/sdk_walkthrough.ipynb'))"`
- `git diff --check`
- `PYTHONPATH=sdk/python/src python3 -c ...` import/signature smoke for
`TurnResult`, `TurnHandle.run`, and `AsyncTurnHandle.run`.
This commit is contained in:
Ahmed Ibrahim
2026-05-17 06:17:22 -07:00
committed by GitHub
parent 4c89772314
commit f0166cadbb
42 changed files with 399 additions and 677 deletions

View File

@@ -111,7 +111,7 @@ def agent_message_texts(events: list[Notification]) -> list[str]:
def agent_message_texts_from_items(items: Iterable[Any]) -> list[str]:
"""Extract agent-message text from completed run result items."""
"""Extract agent-message text from completed turn result items."""
texts: list[str] = []
for item in items:
root = item.root

View File

@@ -157,7 +157,7 @@ def test_async_lifecycle_methods_round_trip(tmp_path) -> None:
async with AsyncCodex(config=harness.app_server_config()) as codex:
thread = await codex.thread_start()
run_result = await thread.run("materialize async thread")
turn_result = await thread.run("materialize async thread")
await thread.set_name("async lifecycle")
named = await thread.read()
resumed = await codex.thread_resume(thread.id)
@@ -166,14 +166,14 @@ def test_async_lifecycle_methods_round_trip(tmp_path) -> None:
unarchived = await codex.thread_unarchive(thread.id)
assert {
"run_final_response": run_result.final_response,
"turn_final_response": turn_result.final_response,
"named_thread": named.thread.name,
"resumed_id": resumed.id,
"forked_is_distinct": forked.id != thread.id,
"archive_response": archive_response.model_dump(by_alias=True, mode="json"),
"unarchived_id": unarchived.id,
} == {
"run_final_response": "async materialized",
"turn_final_response": "async materialized",
"named_thread": "async lifecycle",
"resumed_id": thread.id,
"forked_is_distinct": True,
@@ -253,19 +253,19 @@ def test_compact_rpc_hits_mock_responses(tmp_path) -> None:
with Codex(config=harness.app_server_config()) as codex:
thread = codex.thread_start()
run_result = thread.run("create history")
turn_result = thread.run("create history")
compact_response = thread.compact()
requests = harness.responses.wait_for_requests(2)
assert {
"run_final_response": run_result.final_response,
"turn_final_response": turn_result.final_response,
"compact_response": compact_response.model_dump(
by_alias=True,
mode="json",
),
"request_kinds": [request_kind(request.path) for request in requests],
} == {
"run_final_response": "history",
"turn_final_response": "history",
"compact_response": {},
"request_kinds": ["responses", "responses"],
}

View File

@@ -145,8 +145,8 @@ def test_async_thread_run_uses_mock_responses(
asyncio.run(scenario())
def test_sync_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
"""RunResult should use the last unknown-phase agent message as final text."""
def test_sync_turn_result_uses_last_unknown_phase_message(tmp_path) -> None:
"""TurnResult should use the last unknown-phase agent message as final text."""
with AppServerHarness(tmp_path) as harness:
harness.responses.enqueue_sse(
sse(
@@ -171,8 +171,8 @@ def test_sync_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
}
def test_sync_run_result_preserves_empty_last_message(tmp_path) -> None:
"""RunResult should preserve an empty final agent message instead of skipping it."""
def test_sync_turn_result_preserves_empty_last_message(tmp_path) -> None:
"""TurnResult should preserve an empty final agent message instead of skipping it."""
with AppServerHarness(tmp_path) as harness:
harness.responses.enqueue_sse(
sse(
@@ -197,8 +197,8 @@ def test_sync_run_result_preserves_empty_last_message(tmp_path) -> None:
}
def test_sync_run_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
"""RunResult final_response should stay unset when app-server marks only commentary."""
def test_sync_turn_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
"""TurnResult final_response should stay unset when app-server marks only commentary."""
with AppServerHarness(tmp_path) as harness:
harness.responses.enqueue_sse(
sse(
@@ -226,8 +226,8 @@ def test_sync_run_result_does_not_promote_commentary_only_to_final(tmp_path) ->
}
def test_async_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
"""Async RunResult should use the last unknown-phase agent message."""
def test_async_turn_result_uses_last_unknown_phase_message(tmp_path) -> None:
"""Async TurnResult should use the last unknown-phase agent message."""
async def scenario() -> None:
"""Run one async result-mapping case against a pinned app-server."""
@@ -263,10 +263,10 @@ def test_async_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
asyncio.run(scenario())
def test_async_run_result_does_not_promote_commentary_only_to_final(
def test_async_turn_result_does_not_promote_commentary_only_to_final(
tmp_path,
) -> None:
"""Async RunResult final_response should stay unset for commentary-only output."""
"""Async TurnResult final_response should stay unset for commentary-only output."""
async def scenario() -> None:
"""Run one async commentary mapping case against a pinned app-server."""
@@ -318,7 +318,7 @@ def test_thread_run_raises_when_real_app_server_reports_failed_turn(tmp_path) ->
def test_final_answer_phase_survives_real_app_server_mapping(tmp_path) -> None:
"""RunResult should use the final-answer item emitted by app-server."""
"""TurnResult should use the final-answer item emitted by app-server."""
with AppServerHarness(tmp_path) as harness:
harness.responses.enqueue_sse(
sse(

View File

@@ -5,6 +5,7 @@ import asyncio
from app_server_harness import AppServerHarness
from app_server_helpers import (
agent_message_texts,
agent_message_texts_from_items,
next_async_delta,
next_sync_delta,
streaming_response,
@@ -48,7 +49,7 @@ def test_sync_stream_routes_text_deltas_and_completion(tmp_path) -> None:
def test_turn_run_returns_completed_turn(tmp_path) -> None:
"""TurnHandle.run should wait for the app-server completion notification."""
"""TurnHandle.run should collect output and completion metadata."""
with AppServerHarness(tmp_path) as harness:
harness.responses.enqueue_assistant_message("turn complete", response_id="turn-run-1")
@@ -60,11 +61,13 @@ def test_turn_run_returns_completed_turn(tmp_path) -> None:
assert {
"turn_id": completed.id,
"status": completed.status,
"items": completed.items,
"agent_messages": agent_message_texts_from_items(completed.items),
"final_response": completed.final_response,
} == {
"turn_id": turn.id,
"status": TurnStatus.completed,
"items": [],
"agent_messages": ["turn complete"],
"final_response": "turn complete",
}

View File

@@ -14,9 +14,11 @@ from openai_codex import (
AppServerConfig,
AsyncCodex,
AsyncThread,
AsyncTurnHandle,
Codex,
RunResult,
Thread,
TurnHandle,
TurnResult,
)
from openai_codex._initialize_metadata import validate_initialize_metadata
from openai_codex.types import InitializeResponse
@@ -35,7 +37,7 @@ EXPECTED_ROOT_EXPORTS = [
"AsyncThread",
"TurnHandle",
"AsyncTurnHandle",
"RunResult",
"TurnResult",
"Input",
"InputItem",
"TextInput",
@@ -92,6 +94,7 @@ EXPECTED_TYPES_EXPORTS = [
"ThreadTokenUsageUpdatedNotification",
"Turn",
"TurnCompletedNotification",
"TurnError",
"TurnInterruptResponse",
"TurnStatus",
"TurnSteerResponse",
@@ -128,9 +131,39 @@ def test_root_exports_app_server_config() -> None:
assert AppServerConfig.__name__ == "AppServerConfig"
def test_root_exports_run_result() -> None:
"""The root package should expose the common-case run result wrapper."""
assert RunResult.__name__ == "RunResult"
def test_root_exports_turn_result() -> None:
"""The root package should expose the collected turn result wrapper."""
assert {
"name": TurnResult.__name__,
"fields": list(TurnResult.__dataclass_fields__),
} == {
"name": "TurnResult",
"fields": [
"id",
"status",
"error",
"started_at",
"completed_at",
"duration_ms",
"final_response",
"items",
"usage",
],
}
def test_turn_run_methods_return_turn_result() -> None:
"""Both convenience and handle-based run APIs return the same result shape."""
funcs = [
Thread.run,
TurnHandle.run,
AsyncThread.run,
AsyncTurnHandle.run,
]
assert {fn: inspect.signature(fn).return_annotation for fn in funcs} == dict.fromkeys(
funcs, "TurnResult"
)
def test_root_exports_approval_mode() -> None:

View File

@@ -135,7 +135,7 @@ def _run_python(
)
def _runtime_compatibility_hint(
def _runtime_schema_hint(
runtime_env: PreparedRuntimeEnv,
*,
stdout: str,
@@ -144,7 +144,7 @@ def _runtime_compatibility_hint(
combined = f"{stdout}\n{stderr}"
if "ThreadStartResponse" in combined and "approvalsReviewer" in combined:
return (
"\nCompatibility hint:\n"
"\nSchema hint:\n"
f"Pinned runtime {runtime_env.runtime_version} returned a thread/start payload "
"that is older than the current SDK schema and is missing "
"`approvalsReviewer`. Bump `sdk/python/_runtime_setup.py` to a matching "
@@ -165,7 +165,7 @@ def _run_json_python(
"Python snippet failed.\n"
f"STDOUT:\n{result.stdout}\n"
f"STDERR:\n{result.stderr}"
f"{_runtime_compatibility_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
f"{_runtime_schema_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
)
return json.loads(result.stdout)
@@ -242,17 +242,12 @@ def test_real_thread_and_turn_start_smoke(runtime_env: PreparedRuntimeEnv) -> No
config={"model_reasoning_effort": "high"},
)
result = thread.turn(TextInput("hello")).run()
persisted = thread.read(include_turns=True)
persisted_turn = next(
(turn for turn in persisted.thread.turns or [] if turn.id == result.id),
None,
)
print(json.dumps({
"thread_id": thread.id,
"turn_id": result.id,
"status": result.status.value,
"items_count": len(result.items or []),
"persisted_items_count": 0 if persisted_turn is None else len(persisted_turn.items or []),
"items_count": len(result.items),
"final_response_is_text": isinstance(result.final_response, str) and bool(result.final_response.strip()),
}))
"""
),
@@ -261,8 +256,8 @@ def test_real_thread_and_turn_start_smoke(runtime_env: PreparedRuntimeEnv) -> No
assert isinstance(data["thread_id"], str) and data["thread_id"].strip()
assert isinstance(data["turn_id"], str) and data["turn_id"].strip()
assert data["status"] == "completed"
assert isinstance(data["items_count"], int)
assert isinstance(data["persisted_items_count"], int)
assert data["items_count"] > 0
assert data["final_response_is_text"] is True
def test_real_thread_run_convenience_smoke(runtime_env: PreparedRuntimeEnv) -> None:
@@ -345,17 +340,12 @@ def test_real_async_thread_turn_usage_and_ids_smoke(
config={"model_reasoning_effort": "high"},
)
result = await (await thread.turn(TextInput("say ok"))).run()
persisted = await thread.read(include_turns=True)
persisted_turn = next(
(turn for turn in persisted.thread.turns or [] if turn.id == result.id),
None,
)
print(json.dumps({
"thread_id": thread.id,
"turn_id": result.id,
"status": result.status.value,
"items_count": len(result.items or []),
"persisted_items_count": 0 if persisted_turn is None else len(persisted_turn.items or []),
"items_count": len(result.items),
"final_response_is_text": isinstance(result.final_response, str) and bool(result.final_response.strip()),
}))
asyncio.run(main())
@@ -366,8 +356,8 @@ def test_real_async_thread_turn_usage_and_ids_smoke(
assert isinstance(data["thread_id"], str) and data["thread_id"].strip()
assert isinstance(data["turn_id"], str) and data["turn_id"].strip()
assert data["status"] == "completed"
assert isinstance(data["items_count"], int)
assert isinstance(data["persisted_items_count"], int)
assert data["items_count"] > 0
assert data["final_response_is_text"] is True
def test_real_async_thread_run_convenience_smoke(
@@ -531,7 +521,7 @@ def test_real_examples_run_and_assert(
f"Example failed: {folder}/{script}\n"
f"STDOUT:\n{result.stdout}\n"
f"STDERR:\n{result.stderr}"
f"{_runtime_compatibility_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
f"{_runtime_schema_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
)
out = result.stdout
@@ -541,7 +531,7 @@ def test_real_examples_run_and_assert(
assert "Server: unknown" not in out
elif folder == "02_turn_run":
assert "thread_id:" in out and "turn_id:" in out and "status:" in out
assert "persisted.items.count:" in out
assert "items.count:" in out
elif folder == "03_turn_stream_events":
assert "stream.completed:" in out
assert "assistant>" in out