Files
codex/scripts/codex_package/ripgrep.py
Michael Bolin 59f262a2b4 build: fetch rg for Codex packages (#23526)
## Why

The Codex package builder should produce a complete package without
requiring callers to pre-populate `rg` under `codex-cli/vendor` or have
`dotslash` installed on `PATH`. The repo already tracks the
authoritative DotSlash manifest in `codex-cli/bin/rg`, so the builder
can read that metadata directly and fetch the correct ripgrep archive
for the target it is packaging.

## What changed

- Added `scripts/codex_package/ripgrep.py` to parse `codex-cli/bin/rg`
after stripping the shebang, select the target platform entry, download
the configured artifact, and verify the recorded size and SHA-256
digest.
- Added a cache under `$TMPDIR/codex-package/<target>-rg` so verified
archives can be reused without fetching again.
- Extracted `rg`/`rg.exe` from `tar.gz` and `zip` artifacts into the
package-builder cache, then copied that into `codex-path` through the
existing package layout flow.
- Kept `--rg-bin` as an explicit local override for offline tests and
unusual local workflows.
- Documented the default `rg` fetch/cache behavior in
`scripts/codex_package/README.md`.

## Verification

- Ran wrapper/module syntax compilation.
- Ran `scripts/build_codex_package.py --help` from `/private/tmp`.
- Ran a local manifest fetch test covering shebang-stripped manifest
parsing, `tar.gz` extraction, `zip` extraction, size/SHA-256
verification, and cache reuse after deleting the original source
archives.
- Ran fake-cargo package/archive builds for macOS, Linux, and Windows
target layouts with `--rg-bin`, including an assertion that generated
tar archives contain no duplicate member names.




---
[//]: # (BEGIN SAPLING FOOTER)
Stack created with [Sapling](https://sapling-scm.com). Best reviewed
with [ReviewStack](https://reviewstack.dev/openai/codex/pull/23526).
* #23541
* __->__ #23526
2026-05-19 15:52:17 -07:00

196 lines
6.1 KiB
Python

"""Fetch ripgrep from the DotSlash manifest used by the npm package."""
import hashlib
import json
import shutil
import stat
import tarfile
import tempfile
import zipfile
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen
from .targets import REPO_ROOT
from .targets import TargetSpec
from .targets import resolve_input_path
RG_MANIFEST = REPO_ROOT / "codex-cli" / "bin" / "rg"
DOWNLOAD_TIMEOUT_SECS = 60
@dataclass(frozen=True)
class RgArtifact:
size: int
digest: str
archive_format: str
archive_member: str
url: str
def resolve_rg_bin(spec: TargetSpec, rg_bin: Path | None) -> Path:
if rg_bin is not None:
return resolve_input_path(rg_bin, "ripgrep executable", "--rg-bin")
return fetch_rg(spec)
def fetch_rg(
spec: TargetSpec,
*,
manifest_path: Path = RG_MANIFEST,
cache_root: Path | None = None,
) -> Path:
artifact = artifact_for_target(spec, manifest_path)
cache_dir = (cache_root or default_cache_root()) / f"{spec.target}-rg"
archive_path = cache_dir / archive_filename(artifact.url)
if not archive_is_valid(archive_path, artifact):
download_archive(artifact.url, archive_path)
try:
verify_archive(archive_path, artifact)
except RuntimeError:
archive_path.unlink(missing_ok=True)
raise
dest = cache_dir / spec.rg_name
extract_rg(archive_path, artifact, dest)
if not spec.is_windows:
mode = dest.stat().st_mode
dest.chmod(mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
return dest
def artifact_for_target(spec: TargetSpec, manifest_path: Path) -> RgArtifact:
manifest = load_manifest(manifest_path)
try:
platform_info = manifest["platforms"][spec.dotslash_platform]
except KeyError as exc:
raise RuntimeError(
f"ripgrep manifest {manifest_path} is missing platform {spec.dotslash_platform!r}"
) from exc
providers = platform_info.get("providers")
if not providers:
raise RuntimeError(
f"ripgrep manifest {manifest_path} has no providers for {spec.dotslash_platform!r}"
)
hash_name = platform_info.get("hash")
if hash_name != "sha256":
raise RuntimeError(
f"Unsupported ripgrep hash {hash_name!r} for "
f"{spec.dotslash_platform!r}; expected sha256"
)
return RgArtifact(
size=int(platform_info["size"]),
digest=str(platform_info["digest"]),
archive_format=str(platform_info["format"]),
archive_member=str(platform_info["path"]),
url=str(providers[0]["url"]),
)
def load_manifest(manifest_path: Path) -> dict:
text = manifest_path.read_text(encoding="utf-8")
if text.startswith("#!"):
text = "\n".join(text.splitlines()[1:])
return json.loads(text)
def default_cache_root() -> Path:
return Path(tempfile.gettempdir()) / "codex-package"
def archive_filename(url: str) -> str:
filename = Path(urlparse(url).path).name
if not filename:
raise RuntimeError(f"Unable to determine archive filename from {url}")
return filename
def archive_is_valid(archive_path: Path, artifact: RgArtifact) -> bool:
if not archive_path.is_file():
return False
try:
verify_archive(archive_path, artifact)
except RuntimeError:
archive_path.unlink(missing_ok=True)
return False
return True
def verify_archive(archive_path: Path, artifact: RgArtifact) -> None:
actual_size = archive_path.stat().st_size
if actual_size != artifact.size:
raise RuntimeError(
f"ripgrep archive {archive_path} has size {actual_size}, expected {artifact.size}"
)
digest = hashlib.sha256()
with open(archive_path, "rb") as fh:
for chunk in iter(lambda: fh.read(1024 * 1024), b""):
digest.update(chunk)
actual_digest = digest.hexdigest()
if actual_digest != artifact.digest:
raise RuntimeError(
f"ripgrep archive {archive_path} has sha256 {actual_digest}, "
f"expected {artifact.digest}"
)
def download_archive(url: str, archive_path: Path) -> None:
archive_path.parent.mkdir(parents=True, exist_ok=True)
temp_path = archive_path.with_suffix(f"{archive_path.suffix}.tmp")
temp_path.unlink(missing_ok=True)
try:
with urlopen(url, timeout=DOWNLOAD_TIMEOUT_SECS) as response:
with open(temp_path, "wb") as out:
shutil.copyfileobj(response, out)
temp_path.replace(archive_path)
finally:
temp_path.unlink(missing_ok=True)
def extract_rg(archive_path: Path, artifact: RgArtifact, dest: Path) -> None:
dest.parent.mkdir(parents=True, exist_ok=True)
dest.unlink(missing_ok=True)
if artifact.archive_format == "tar.gz":
with tarfile.open(archive_path, "r:gz") as archive:
try:
member = archive.getmember(artifact.archive_member)
except KeyError as exc:
raise RuntimeError(
f"ripgrep archive {archive_path} is missing {artifact.archive_member!r}"
) from exc
extracted = archive.extractfile(member)
if extracted is None:
raise RuntimeError(
f"ripgrep archive member {artifact.archive_member!r} is not a file"
)
with extracted, open(dest, "wb") as out:
shutil.copyfileobj(extracted, out)
return
if artifact.archive_format == "zip":
with zipfile.ZipFile(archive_path) as archive:
try:
with archive.open(artifact.archive_member) as extracted:
with open(dest, "wb") as out:
shutil.copyfileobj(extracted, out)
except KeyError as exc:
raise RuntimeError(
f"ripgrep archive {archive_path} is missing {artifact.archive_member!r}"
) from exc
return
raise RuntimeError(
f"Unsupported ripgrep archive format {artifact.archive_format!r}; expected tar.gz or zip"
)