python-sdk: use pinned runtime package in real coverage (2026-03-12)

2026-03-12 Switch the repo-source Python SDK real coverage over to a pinned runtime-package flow backed by GitHub release artifacts instead of PATH or explicit binary overrides. - add sdk/python/_runtime_setup.py to download the release codex archive for a requested CODEX_PYTHON_RUNTIME_VERSION, stage a temporary codex-cli-bin package, and install it into a target Python environment with cleanup - refactor real integration tests to run repo-source SDK code against an isolated site-packages target that contains the staged codex-cli-bin runtime - update examples and notebook bootstrap to install and use the runtime package, and stop consulting CODEX_PYTHON_SDK_CODEX_BIN or PATH - switch the failing turn-run and model-selection examples to runtime-compatible model selection for the pinned release binary - keep the main SDK runtime resolution model unchanged: explicit codex_bin or installed codex-cli-bin Validation: - python3 -m pytest sdk/python/tests - RUN_REAL_CODEX_TESTS=1 CODEX_PYTHON_RUNTIME_VERSION=0.115.0-alpha.11 python3 -m pytest sdk/python/tests/test_real_app_server_integration.py Co-authored-by: Codex <noreply@openai.com>
2026-04-29 08:56:38 +00:00 · 2026-03-12 09:51:07 -07:00
parent fd4beb8b37
commit 3cf1306968
9 changed files with 708 additions and 120 deletions
--- a/sdk/python/tests/test_real_app_server_integration.py
+++ b/sdk/python/tests/test_real_app_server_integration.py
@@ -1,22 +1,26 @@
 from __future__ import annotations

-import asyncio
 import json
 import os
-import shutil
 import subprocess
 import sys
 import tempfile
+import textwrap
+from dataclasses import dataclass
 from pathlib import Path

 import pytest

-from codex_app_server import AppServerConfig, AsyncCodex, Codex, TextInput
-
 ROOT = Path(__file__).resolve().parents[1]
 EXAMPLES_DIR = ROOT / "examples"
 NOTEBOOK_PATH = ROOT / "notebooks" / "sdk_walkthrough.ipynb"

+root_str = str(ROOT)
+if root_str not in sys.path:
+    sys.path.insert(0, root_str)
+
+from _runtime_setup import ensure_runtime_package_installed, required_runtime_version
+
 RUN_REAL_CODEX_TESTS = os.environ.get("RUN_REAL_CODEX_TESTS") == "1"
 pytestmark = pytest.mark.skipif(
    not RUN_REAL_CODEX_TESTS,
@@ -54,24 +58,59 @@ EXAMPLE_CASES: list[tuple[str, str]] = [
 ]


-def _run_example(
-    folder: str, script: str, *, timeout_s: int = 150
-) -> subprocess.CompletedProcess[str]:
-    path = EXAMPLES_DIR / folder / script
-    assert path.exists(), f"Missing example script: {path}"
+@dataclass(frozen=True)
+class PreparedRuntimeEnv:
+    python: str
+    env: dict[str, str]
+    runtime_version: str

-    env = os.environ.copy()
-    env.setdefault(
-        "CODEX_PYTHON_SDK_CODEX_BIN",
-        _real_test_config().codex_bin or "",
+
+@pytest.fixture(scope="session")
+def runtime_env(tmp_path_factory: pytest.TempPathFactory) -> PreparedRuntimeEnv:
+    runtime_version = required_runtime_version()
+    temp_root = tmp_path_factory.mktemp("python-runtime-env")
+    isolated_site = temp_root / "site-packages"
+    python = sys.executable
+
+    _run_command(
+        [
+            python,
+            "-m",
+            "pip",
+            "install",
+            "--target",
+            str(isolated_site),
+            "pydantic>=2.12",
+        ],
+        cwd=ROOT,
+        env=os.environ.copy(),
+        timeout_s=240,
+    )
+    ensure_runtime_package_installed(
+        python,
+        ROOT,
+        runtime_version,
+        install_target=isolated_site,
    )

-    # Feed '/exit' only to interactive mini-cli examples.
-    stdin = "/exit\n" if folder == "11_cli_mini_app" else None
+    env = os.environ.copy()
+    env["PYTHONPATH"] = os.pathsep.join([str(isolated_site), str(ROOT / "src")])
+    env["CODEX_PYTHON_RUNTIME_VERSION"] = runtime_version
+    env["CODEX_PYTHON_SDK_DIR"] = str(ROOT)
+    return PreparedRuntimeEnv(python=python, env=env, runtime_version=runtime_version)

+
+def _run_command(
+    args: list[str],
+    *,
+    cwd: Path,
+    env: dict[str, str],
+    timeout_s: int,
+    stdin: str | None = None,
+) -> subprocess.CompletedProcess[str]:
    return subprocess.run(
-        [sys.executable, str(path)],
-        cwd=str(ROOT),
+        args,
+        cwd=str(cwd),
        env=env,
        input=stdin,
        text=True,
@@ -81,76 +120,172 @@ def _run_example(
    )


+def _run_python(
+    runtime_env: PreparedRuntimeEnv,
+    source: str,
+    *,
+    cwd: Path | None = None,
+    timeout_s: int = 180,
+) -> subprocess.CompletedProcess[str]:
+    return _run_command(
+        [str(runtime_env.python), "-c", source],
+        cwd=cwd or ROOT,
+        env=runtime_env.env,
+        timeout_s=timeout_s,
+    )
+
+
+def _run_json_python(
+    runtime_env: PreparedRuntimeEnv,
+    source: str,
+    *,
+    cwd: Path | None = None,
+    timeout_s: int = 180,
+) -> dict[str, object]:
+    result = _run_python(runtime_env, source, cwd=cwd, timeout_s=timeout_s)
+    assert result.returncode == 0, (
+        f"Python snippet failed.\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+    )
+    return json.loads(result.stdout)
+
+
+def _run_example(
+    runtime_env: PreparedRuntimeEnv,
+    folder: str,
+    script: str,
+    *,
+    timeout_s: int = 180,
+) -> subprocess.CompletedProcess[str]:
+    path = EXAMPLES_DIR / folder / script
+    assert path.exists(), f"Missing example script: {path}"
+
+    stdin = "/exit\n" if folder == "11_cli_mini_app" else None
+    return _run_command(
+        [str(runtime_env.python), str(path)],
+        cwd=ROOT,
+        env=runtime_env.env,
+        timeout_s=timeout_s,
+        stdin=stdin,
+    )
+
+
 def _notebook_cell_source(cell_index: int) -> str:
    notebook = json.loads(NOTEBOOK_PATH.read_text())
    return "".join(notebook["cells"][cell_index]["source"])


-def _real_test_config() -> AppServerConfig:
-    codex_bin = os.environ.get("CODEX_PYTHON_SDK_CODEX_BIN") or shutil.which("codex")
-    if codex_bin is None:
-        raise RuntimeError(
-            "Real SDK integration tests require a Codex CLI binary.\n"
-            "Set RUN_REAL_CODEX_TESTS=1 and CODEX_PYTHON_SDK_CODEX_BIN=/absolute/path/to/codex, "
-            "or ensure `codex` is on PATH."
-        )
-    return AppServerConfig(codex_bin=codex_bin)
+def test_real_initialize_and_model_list(runtime_env: PreparedRuntimeEnv) -> None:
+    data = _run_json_python(
+        runtime_env,
+        textwrap.dedent(
+            """
+            import json
+            from codex_app_server import Codex
+
+            with Codex() as codex:
+                models = codex.models(include_hidden=True)
+                print(json.dumps({
+                    "user_agent": codex.metadata.user_agent,
+                    "server_name": codex.metadata.server_name,
+                    "server_version": codex.metadata.server_version,
+                    "model_count": len(models.data),
+                }))
+            """
+        ),
+    )
+
+    assert isinstance(data["user_agent"], str) and data["user_agent"].strip()
+    assert isinstance(data["server_name"], str) and data["server_name"].strip()
+    assert isinstance(data["server_version"], str) and data["server_version"].strip()
+    assert isinstance(data["model_count"], int)


-def test_real_initialize_and_model_list():
-    with Codex(config=_real_test_config()) as codex:
-        metadata = codex.metadata
-        assert isinstance(metadata.user_agent, str) and metadata.user_agent.strip()
-        assert isinstance(metadata.server_name, str) and metadata.server_name.strip()
-        assert isinstance(metadata.server_version, str) and metadata.server_version.strip()
+def test_real_thread_and_turn_start_smoke(runtime_env: PreparedRuntimeEnv) -> None:
+    data = _run_json_python(
+        runtime_env,
+        textwrap.dedent(
+            """
+            import json
+            from codex_app_server import Codex, TextInput

-        models = codex.models(include_hidden=True)
-        assert isinstance(models.data, list)
+            with Codex() as codex:
+                thread = codex.thread_start(
+                    model="gpt-5.4",
+                    config={"model_reasoning_effort": "high"},
+                )
+                result = thread.turn(TextInput("hello")).run()
+                print(json.dumps({
+                    "thread_id": result.thread_id,
+                    "turn_id": result.turn_id,
+                    "items_count": len(result.items),
+                    "has_usage": result.usage is not None,
+                    "usage_thread_id": None if result.usage is None else result.usage.thread_id,
+                    "usage_turn_id": None if result.usage is None else result.usage.turn_id,
+                }))
+            """
+        ),
+    )
+
+    assert isinstance(data["thread_id"], str) and data["thread_id"].strip()
+    assert isinstance(data["turn_id"], str) and data["turn_id"].strip()
+    assert isinstance(data["items_count"], int)
+    assert data["has_usage"] is True
+    assert data["usage_thread_id"] == data["thread_id"]
+    assert data["usage_turn_id"] == data["turn_id"]


-def test_real_thread_and_turn_start_smoke():
-    with Codex(config=_real_test_config()) as codex:
-        thread = codex.thread_start(model="gpt-5", config={"model_reasoning_effort": "high"})
-        result = thread.turn(TextInput("hello")).run()
+def test_real_async_thread_turn_usage_and_ids_smoke(
+    runtime_env: PreparedRuntimeEnv,
+) -> None:
+    data = _run_json_python(
+        runtime_env,
+        textwrap.dedent(
+            """
+            import asyncio
+            import json
+            from codex_app_server import AsyncCodex, TextInput

-        assert isinstance(result.thread_id, str) and result.thread_id.strip()
-        assert isinstance(result.turn_id, str) and result.turn_id.strip()
-        assert isinstance(result.items, list)
-        assert result.usage is not None
-        assert result.usage.thread_id == result.thread_id
-        assert result.usage.turn_id == result.turn_id
+            async def main():
+                async with AsyncCodex() as codex:
+                    thread = await codex.thread_start(
+                        model="gpt-5.4",
+                        config={"model_reasoning_effort": "high"},
+                    )
+                    result = await (await thread.turn(TextInput("say ok"))).run()
+                    print(json.dumps({
+                        "thread_id": result.thread_id,
+                        "turn_id": result.turn_id,
+                        "items_count": len(result.items),
+                        "has_usage": result.usage is not None,
+                        "usage_thread_id": None if result.usage is None else result.usage.thread_id,
+                        "usage_turn_id": None if result.usage is None else result.usage.turn_id,
+                    }))
+
+            asyncio.run(main())
+            """
+        ),
+    )
+
+    assert isinstance(data["thread_id"], str) and data["thread_id"].strip()
+    assert isinstance(data["turn_id"], str) and data["turn_id"].strip()
+    assert isinstance(data["items_count"], int)
+    assert data["has_usage"] is True
+    assert data["usage_thread_id"] == data["thread_id"]
+    assert data["usage_turn_id"] == data["turn_id"]


-def test_real_async_thread_turn_usage_and_ids_smoke() -> None:
-    async def _run() -> None:
-        async with AsyncCodex(config=_real_test_config()) as codex:
-            thread = await codex.thread_start(model="gpt-5", config={"model_reasoning_effort": "high"})
-            result = await (await thread.turn(TextInput("say ok"))).run()
-
-            assert isinstance(result.thread_id, str) and result.thread_id.strip()
-            assert isinstance(result.turn_id, str) and result.turn_id.strip()
-            assert isinstance(result.items, list)
-            assert result.usage is not None
-            assert result.usage.thread_id == result.thread_id
-            assert result.usage.turn_id == result.turn_id
-
-    asyncio.run(_run())
-
-
-def test_notebook_bootstrap_resolves_sdk_from_unrelated_cwd() -> None:
+def test_notebook_bootstrap_resolves_sdk_and_runtime_from_unrelated_cwd(
+    runtime_env: PreparedRuntimeEnv,
+) -> None:
    cell_1_source = _notebook_cell_source(1)
-    env = os.environ.copy()
-    env["CODEX_PYTHON_SDK_DIR"] = str(ROOT)
+    env = runtime_env.env.copy()

    with tempfile.TemporaryDirectory() as temp_cwd:
-        result = subprocess.run(
-            [sys.executable, "-c", cell_1_source],
-            cwd=temp_cwd,
+        result = _run_command(
+            [str(runtime_env.python), "-c", cell_1_source],
+            cwd=Path(temp_cwd),
            env=env,
-            text=True,
-            capture_output=True,
-            timeout=60,
-            check=False,
+            timeout_s=180,
        )

    assert result.returncode == 0, (
@@ -159,43 +294,89 @@ def test_notebook_bootstrap_resolves_sdk_from_unrelated_cwd() -> None:
        f"STDERR:\n{result.stderr}"
    )
    assert "SDK source:" in result.stdout
-    assert "codex_app_server" in result.stdout or "sdk/python/src" in result.stdout
+    assert f"Runtime package: {runtime_env.runtime_version}" in result.stdout


-def test_real_streaming_smoke_turn_completed():
-    with Codex(config=_real_test_config()) as codex:
-        thread = codex.thread_start(model="gpt-5", config={"model_reasoning_effort": "high"})
-        turn = thread.turn(TextInput("Reply with one short sentence."))
-
-        saw_delta = False
-        saw_completed = False
-        for evt in turn.stream():
-            if evt.method == "item/agentMessage/delta":
-                saw_delta = True
-            if evt.method == "turn/completed":
-                saw_completed = True
-
-        assert saw_completed
-        # Some environments can produce zero deltas for very short output;
-        # this assert keeps the smoke test informative but non-flaky.
-        assert isinstance(saw_delta, bool)
+def test_notebook_sync_cell_smoke(runtime_env: PreparedRuntimeEnv) -> None:
+    source = "\n\n".join(
+        [
+            _notebook_cell_source(1),
+            _notebook_cell_source(2),
+            _notebook_cell_source(3),
+        ]
+    )
+    result = _run_python(runtime_env, source, timeout_s=240)
+    assert result.returncode == 0, (
+        f"Notebook sync smoke failed.\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+    )
+    assert "status:" in result.stdout
+    assert "server:" in result.stdout


-def test_real_turn_interrupt_smoke():
-    with Codex(config=_real_test_config()) as codex:
-        thread = codex.thread_start(model="gpt-5", config={"model_reasoning_effort": "high"})
-        turn = thread.turn(TextInput("Count from 1 to 200 with commas."))
+def test_real_streaming_smoke_turn_completed(runtime_env: PreparedRuntimeEnv) -> None:
+    data = _run_json_python(
+        runtime_env,
+        textwrap.dedent(
+            """
+            import json
+            from codex_app_server import Codex, TextInput

-        # Best effort: interrupting quickly may race with completion on fast models.
-        _ = turn.interrupt()
+            with Codex() as codex:
+                thread = codex.thread_start(
+                    model="gpt-5.4",
+                    config={"model_reasoning_effort": "high"},
+                )
+                turn = thread.turn(TextInput("Reply with one short sentence."))
+                saw_delta = False
+                saw_completed = False
+                for event in turn.stream():
+                    if event.method == "item/agentMessage/delta":
+                        saw_delta = True
+                    if event.method == "turn/completed":
+                        saw_completed = True
+                print(json.dumps({
+                    "saw_delta": saw_delta,
+                    "saw_completed": saw_completed,
+                }))
+            """
+        ),
+    )
+
+    assert data["saw_completed"] is True
+    assert isinstance(data["saw_delta"], bool)
+
+
+def test_real_turn_interrupt_smoke(runtime_env: PreparedRuntimeEnv) -> None:
+    data = _run_json_python(
+        runtime_env,
+        textwrap.dedent(
+            """
+            import json
+            from codex_app_server import Codex, TextInput
+
+            with Codex() as codex:
+                thread = codex.thread_start(
+                    model="gpt-5.4",
+                    config={"model_reasoning_effort": "high"},
+                )
+                turn = thread.turn(TextInput("Count from 1 to 200 with commas."))
+                turn.interrupt()
+                follow_up = thread.turn(TextInput("Say 'ok' only.")).run()
+                print(json.dumps({"status": follow_up.status.value}))
+            """
+        ),
+    )
+
+    assert data["status"] in {"completed", "failed"}

-        # Confirm the session is still usable after interrupt race.
-        follow_up = thread.turn(TextInput("Say 'ok' only.")).run()
-        assert follow_up.status.value in {"completed", "failed"}

@pytest.mark.parametrize(("folder", "script"), EXAMPLE_CASES)
-def test_real_examples_run_and_assert(folder: str, script: str):
-    result = _run_example(folder, script)
+def test_real_examples_run_and_assert(
+    runtime_env: PreparedRuntimeEnv,
+    folder: str,
+    script: str,
+) -> None:
+    result = _run_example(runtime_env, folder, script)

    assert result.returncode == 0, (
        f"Example failed: {folder}/{script}\n"
@@ -205,7 +386,6 @@ def test_real_examples_run_and_assert(folder: str, script: str):

    out = result.stdout

-    # Minimal content assertions so we validate behavior, not just exit code.
    if folder == "01_quickstart_constructor":
        assert "Status:" in out and "Text:" in out
        assert "Server: None None" not in out