[codex] Return TurnResult from Python turn handles (#23151)

## Why `TurnHandle.run()` returned the raw app-server `Turn`, whose live start/completed payloads do not include loaded `items`, so users saw empty `items` after starting a turn. That made the handle-based path behave differently from `Thread.run(...)`, and pushed examples toward persisted-thread reads plus helper extraction. This PR makes the run APIs standalone: starting a turn and running it returns collected turn data directly, or fails visibly when required stream events are missing. ## What Changed - Replaces the public `RunResult` export with `TurnResult`. - Adds turn metadata to `TurnResult`: `id`, `status`, `error`, `started_at`, `completed_at`, and `duration_ms`, alongside `final_response`, `items`, and `usage`. - Changes `TurnHandle.run()` and `AsyncTurnHandle.run()` to consume stream events with the same collector used by `Thread.run(...)`. - Exports `TurnError` from `openai_codex.types` for the new result shape. - Updates tests, examples, docs, and the walkthrough notebook to use `result.final_response` and `result.items` directly. - Removes persisted-thread helper paths and placeholder/skipped control flows from the public examples and notebook. ## Verification - `python3 -m py_compile ...` over changed SDK, example, and test Python files. - `python3 -c "import json; json.load(open('sdk/python/notebooks/sdk_walkthrough.ipynb'))"` - `git diff --check` - `PYTHONPATH=sdk/python/src python3 -c ...` import/signature smoke for `TurnResult`, `TurnHandle.run`, and `AsyncTurnHandle.run`.
2026-05-21 03:33:41 +00:00 · 2026-05-17 06:17:22 -07:00
parent 4c89772314
commit f0166cadbb
42 changed files with 399 additions and 677 deletions
--- a/sdk/python/tests/app_server_helpers.py
+++ b/sdk/python/tests/app_server_helpers.py
@@ -111,7 +111,7 @@ def agent_message_texts(events: list[Notification]) -> list[str]:


 def agent_message_texts_from_items(items: Iterable[Any]) -> list[str]:
-    """Extract agent-message text from completed run result items."""
+    """Extract agent-message text from completed turn result items."""
    texts: list[str] = []
    for item in items:
        root = item.root
--- a/sdk/python/tests/test_app_server_lifecycle.py
+++ b/sdk/python/tests/test_app_server_lifecycle.py
@@ -157,7 +157,7 @@ def test_async_lifecycle_methods_round_trip(tmp_path) -> None:

            async with AsyncCodex(config=harness.app_server_config()) as codex:
                thread = await codex.thread_start()
-                run_result = await thread.run("materialize async thread")
+                turn_result = await thread.run("materialize async thread")
                await thread.set_name("async lifecycle")
                named = await thread.read()
                resumed = await codex.thread_resume(thread.id)
@@ -166,14 +166,14 @@ def test_async_lifecycle_methods_round_trip(tmp_path) -> None:
                unarchived = await codex.thread_unarchive(thread.id)

        assert {
-            "run_final_response": run_result.final_response,
+            "turn_final_response": turn_result.final_response,
            "named_thread": named.thread.name,
            "resumed_id": resumed.id,
            "forked_is_distinct": forked.id != thread.id,
            "archive_response": archive_response.model_dump(by_alias=True, mode="json"),
            "unarchived_id": unarchived.id,
        } == {
-            "run_final_response": "async materialized",
+            "turn_final_response": "async materialized",
            "named_thread": "async lifecycle",
            "resumed_id": thread.id,
            "forked_is_distinct": True,
@@ -253,19 +253,19 @@ def test_compact_rpc_hits_mock_responses(tmp_path) -> None:

        with Codex(config=harness.app_server_config()) as codex:
            thread = codex.thread_start()
-            run_result = thread.run("create history")
+            turn_result = thread.run("create history")
            compact_response = thread.compact()
            requests = harness.responses.wait_for_requests(2)

    assert {
-        "run_final_response": run_result.final_response,
+        "turn_final_response": turn_result.final_response,
        "compact_response": compact_response.model_dump(
            by_alias=True,
            mode="json",
        ),
        "request_kinds": [request_kind(request.path) for request in requests],
    } == {
-        "run_final_response": "history",
+        "turn_final_response": "history",
        "compact_response": {},
        "request_kinds": ["responses", "responses"],
    }
--- a/sdk/python/tests/test_app_server_run.py
+++ b/sdk/python/tests/test_app_server_run.py
@@ -145,8 +145,8 @@ def test_async_thread_run_uses_mock_responses(
    asyncio.run(scenario())


-def test_sync_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
-    """RunResult should use the last unknown-phase agent message as final text."""
+def test_sync_turn_result_uses_last_unknown_phase_message(tmp_path) -> None:
+    """TurnResult should use the last unknown-phase agent message as final text."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
@@ -171,8 +171,8 @@ def test_sync_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
    }


-def test_sync_run_result_preserves_empty_last_message(tmp_path) -> None:
-    """RunResult should preserve an empty final agent message instead of skipping it."""
+def test_sync_turn_result_preserves_empty_last_message(tmp_path) -> None:
+    """TurnResult should preserve an empty final agent message instead of skipping it."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
@@ -197,8 +197,8 @@ def test_sync_run_result_preserves_empty_last_message(tmp_path) -> None:
    }


-def test_sync_run_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
-    """RunResult final_response should stay unset when app-server marks only commentary."""
+def test_sync_turn_result_does_not_promote_commentary_only_to_final(tmp_path) -> None:
+    """TurnResult final_response should stay unset when app-server marks only commentary."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
@@ -226,8 +226,8 @@ def test_sync_run_result_does_not_promote_commentary_only_to_final(tmp_path) ->
    }


-def test_async_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
-    """Async RunResult should use the last unknown-phase agent message."""
+def test_async_turn_result_uses_last_unknown_phase_message(tmp_path) -> None:
+    """Async TurnResult should use the last unknown-phase agent message."""

    async def scenario() -> None:
        """Run one async result-mapping case against a pinned app-server."""
@@ -263,10 +263,10 @@ def test_async_run_result_uses_last_unknown_phase_message(tmp_path) -> None:
    asyncio.run(scenario())


-def test_async_run_result_does_not_promote_commentary_only_to_final(
+def test_async_turn_result_does_not_promote_commentary_only_to_final(
    tmp_path,
 ) -> None:
-    """Async RunResult final_response should stay unset for commentary-only output."""
+    """Async TurnResult final_response should stay unset for commentary-only output."""

    async def scenario() -> None:
        """Run one async commentary mapping case against a pinned app-server."""
@@ -318,7 +318,7 @@ def test_thread_run_raises_when_real_app_server_reports_failed_turn(tmp_path) ->


 def test_final_answer_phase_survives_real_app_server_mapping(tmp_path) -> None:
-    """RunResult should use the final-answer item emitted by app-server."""
+    """TurnResult should use the final-answer item emitted by app-server."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_sse(
            sse(
--- a/sdk/python/tests/test_app_server_streaming.py
+++ b/sdk/python/tests/test_app_server_streaming.py
@@ -5,6 +5,7 @@ import asyncio
 from app_server_harness import AppServerHarness
 from app_server_helpers import (
    agent_message_texts,
+    agent_message_texts_from_items,
    next_async_delta,
    next_sync_delta,
    streaming_response,
@@ -48,7 +49,7 @@ def test_sync_stream_routes_text_deltas_and_completion(tmp_path) -> None:


 def test_turn_run_returns_completed_turn(tmp_path) -> None:
-    """TurnHandle.run should wait for the app-server completion notification."""
+    """TurnHandle.run should collect output and completion metadata."""
    with AppServerHarness(tmp_path) as harness:
        harness.responses.enqueue_assistant_message("turn complete", response_id="turn-run-1")

@@ -60,11 +61,13 @@ def test_turn_run_returns_completed_turn(tmp_path) -> None:
    assert {
        "turn_id": completed.id,
        "status": completed.status,
-        "items": completed.items,
+        "agent_messages": agent_message_texts_from_items(completed.items),
+        "final_response": completed.final_response,
    } == {
        "turn_id": turn.id,
        "status": TurnStatus.completed,
-        "items": [],
+        "agent_messages": ["turn complete"],
+        "final_response": "turn complete",
    }


--- a/sdk/python/tests/test_public_api_signatures.py
+++ b/sdk/python/tests/test_public_api_signatures.py
@@ -14,9 +14,11 @@ from openai_codex import (
    AppServerConfig,
    AsyncCodex,
    AsyncThread,
+    AsyncTurnHandle,
    Codex,
-    RunResult,
    Thread,
+    TurnHandle,
+    TurnResult,
 )
 from openai_codex._initialize_metadata import validate_initialize_metadata
 from openai_codex.types import InitializeResponse
@@ -35,7 +37,7 @@ EXPECTED_ROOT_EXPORTS = [
    "AsyncThread",
    "TurnHandle",
    "AsyncTurnHandle",
-    "RunResult",
+    "TurnResult",
    "Input",
    "InputItem",
    "TextInput",
@@ -92,6 +94,7 @@ EXPECTED_TYPES_EXPORTS = [
    "ThreadTokenUsageUpdatedNotification",
    "Turn",
    "TurnCompletedNotification",
+    "TurnError",
    "TurnInterruptResponse",
    "TurnStatus",
    "TurnSteerResponse",
@@ -128,9 +131,39 @@ def test_root_exports_app_server_config() -> None:
    assert AppServerConfig.__name__ == "AppServerConfig"


-def test_root_exports_run_result() -> None:
-    """The root package should expose the common-case run result wrapper."""
-    assert RunResult.__name__ == "RunResult"
+def test_root_exports_turn_result() -> None:
+    """The root package should expose the collected turn result wrapper."""
+    assert {
+        "name": TurnResult.__name__,
+        "fields": list(TurnResult.__dataclass_fields__),
+    } == {
+        "name": "TurnResult",
+        "fields": [
+            "id",
+            "status",
+            "error",
+            "started_at",
+            "completed_at",
+            "duration_ms",
+            "final_response",
+            "items",
+            "usage",
+        ],
+    }
+
+
+def test_turn_run_methods_return_turn_result() -> None:
+    """Both convenience and handle-based run APIs return the same result shape."""
+    funcs = [
+        Thread.run,
+        TurnHandle.run,
+        AsyncThread.run,
+        AsyncTurnHandle.run,
+    ]
+
+    assert {fn: inspect.signature(fn).return_annotation for fn in funcs} == dict.fromkeys(
+        funcs, "TurnResult"
+    )


 def test_root_exports_approval_mode() -> None:
--- a/sdk/python/tests/test_real_app_server_integration.py
+++ b/sdk/python/tests/test_real_app_server_integration.py
@@ -135,7 +135,7 @@ def _run_python(
    )


-def _runtime_compatibility_hint(
+def _runtime_schema_hint(
    runtime_env: PreparedRuntimeEnv,
    *,
    stdout: str,
@@ -144,7 +144,7 @@ def _runtime_compatibility_hint(
    combined = f"{stdout}\n{stderr}"
    if "ThreadStartResponse" in combined and "approvalsReviewer" in combined:
        return (
-            "\nCompatibility hint:\n"
+            "\nSchema hint:\n"
            f"Pinned runtime {runtime_env.runtime_version} returned a thread/start payload "
            "that is older than the current SDK schema and is missing "
            "`approvalsReviewer`. Bump `sdk/python/_runtime_setup.py` to a matching "
@@ -165,7 +165,7 @@ def _run_json_python(
        "Python snippet failed.\n"
        f"STDOUT:\n{result.stdout}\n"
        f"STDERR:\n{result.stderr}"
-        f"{_runtime_compatibility_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
+        f"{_runtime_schema_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
    )
    return json.loads(result.stdout)

@@ -242,17 +242,12 @@ def test_real_thread_and_turn_start_smoke(runtime_env: PreparedRuntimeEnv) -> No
                    config={"model_reasoning_effort": "high"},
                )
                result = thread.turn(TextInput("hello")).run()
-                persisted = thread.read(include_turns=True)
-                persisted_turn = next(
-                    (turn for turn in persisted.thread.turns or [] if turn.id == result.id),
-                    None,
-                )
                print(json.dumps({
                    "thread_id": thread.id,
                    "turn_id": result.id,
                    "status": result.status.value,
-                    "items_count": len(result.items or []),
-                    "persisted_items_count": 0 if persisted_turn is None else len(persisted_turn.items or []),
+                    "items_count": len(result.items),
+                    "final_response_is_text": isinstance(result.final_response, str) and bool(result.final_response.strip()),
                }))
            """
        ),
@@ -261,8 +256,8 @@ def test_real_thread_and_turn_start_smoke(runtime_env: PreparedRuntimeEnv) -> No
    assert isinstance(data["thread_id"], str) and data["thread_id"].strip()
    assert isinstance(data["turn_id"], str) and data["turn_id"].strip()
    assert data["status"] == "completed"
-    assert isinstance(data["items_count"], int)
-    assert isinstance(data["persisted_items_count"], int)
+    assert data["items_count"] > 0
+    assert data["final_response_is_text"] is True


 def test_real_thread_run_convenience_smoke(runtime_env: PreparedRuntimeEnv) -> None:
@@ -345,17 +340,12 @@ def test_real_async_thread_turn_usage_and_ids_smoke(
                        config={"model_reasoning_effort": "high"},
                    )
                    result = await (await thread.turn(TextInput("say ok"))).run()
-                    persisted = await thread.read(include_turns=True)
-                    persisted_turn = next(
-                        (turn for turn in persisted.thread.turns or [] if turn.id == result.id),
-                        None,
-                    )
                    print(json.dumps({
                        "thread_id": thread.id,
                        "turn_id": result.id,
                        "status": result.status.value,
-                        "items_count": len(result.items or []),
-                        "persisted_items_count": 0 if persisted_turn is None else len(persisted_turn.items or []),
+                        "items_count": len(result.items),
+                        "final_response_is_text": isinstance(result.final_response, str) and bool(result.final_response.strip()),
                    }))

            asyncio.run(main())
@@ -366,8 +356,8 @@ def test_real_async_thread_turn_usage_and_ids_smoke(
    assert isinstance(data["thread_id"], str) and data["thread_id"].strip()
    assert isinstance(data["turn_id"], str) and data["turn_id"].strip()
    assert data["status"] == "completed"
-    assert isinstance(data["items_count"], int)
-    assert isinstance(data["persisted_items_count"], int)
+    assert data["items_count"] > 0
+    assert data["final_response_is_text"] is True


 def test_real_async_thread_run_convenience_smoke(
@@ -531,7 +521,7 @@ def test_real_examples_run_and_assert(
        f"Example failed: {folder}/{script}\n"
        f"STDOUT:\n{result.stdout}\n"
        f"STDERR:\n{result.stderr}"
-        f"{_runtime_compatibility_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
+        f"{_runtime_schema_hint(runtime_env, stdout=result.stdout, stderr=result.stderr)}"
    )

    out = result.stdout
@@ -541,7 +531,7 @@ def test_real_examples_run_and_assert(
        assert "Server: unknown" not in out
    elif folder == "02_turn_run":
        assert "thread_id:" in out and "turn_id:" in out and "status:" in out
-        assert "persisted.items.count:" in out
+        assert "items.count:" in out
    elif folder == "03_turn_stream_events":
        assert "stream.completed:" in out
        assert "assistant>" in out