diff --git a/scripts/codex_package/README.md b/scripts/codex_package/README.md index 8c3c7ac58b..6eb45eef47 100644 --- a/scripts/codex_package/README.md +++ b/scripts/codex_package/README.md @@ -34,6 +34,8 @@ one grouped `cargo build` command per package: The default cargo profile is `dev-small` because local iteration should favor fast, small builds. Release jobs should pass `--cargo-profile release`. -`rg` is not built from this repository, so it remains an input. If `--rg-bin` is -omitted, the builder looks in the existing `codex-cli/vendor/<target>/path/` -location. +`rg` is not built from this repository, so the builder fetches it from the +DotSlash manifest at `codex-cli/bin/rg`. Downloaded archives are cached under +`$TMPDIR/codex-package/<target>-rg` and are reused only after the recorded size +and SHA-256 digest have been verified. Pass `--rg-bin` to use a local ripgrep +executable instead. diff --git a/scripts/codex_package/cli.py b/scripts/codex_package/cli.py index beddffa3b9..c80f50a1a0 100644 --- a/scripts/codex_package/cli.py +++ b/scripts/codex_package/cli.py @@ -8,9 +8,9 @@ from .cargo import build_source_binaries from .layout import build_package_dir from .layout import prepare_package_dir from .layout import validate_package_dir +from .ripgrep import resolve_rg_bin from .targets import TARGET_SPECS from .targets import PackageInputs -from .targets import resolve_rg_bin def parse_args() -> argparse.Namespace: @@ -69,7 +69,10 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--rg-bin", type=Path, - help="Path to the ripgrep executable to place in codex-path/.", + help=( + "Optional local ripgrep executable override instead of fetching from " + "codex-cli/bin/rg."
"""Fetch ripgrep from the DotSlash manifest used by the npm package.

The manifest at ``codex-cli/bin/rg`` records, per platform, the archive URL,
its size and SHA-256 digest, the archive format, and the path of the ``rg``
executable inside the archive.  Archives are cached on disk and are only
reused (or accepted after download) once size and digest have been verified.
"""

import hashlib
import json
import os
import shutil
import stat
import tarfile
import tempfile
import zipfile
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen

from .targets import REPO_ROOT
from .targets import TargetSpec
from .targets import resolve_input_path


RG_MANIFEST = REPO_ROOT / "codex-cli" / "bin" / "rg"
# Timeout (seconds) handed to urlopen() when downloading the archive.
DOWNLOAD_TIMEOUT_SECS = 60


@dataclass(frozen=True)
class RgArtifact:
    """Manifest entry describing the ripgrep archive for one platform."""

    # Expected archive size in bytes.
    size: int
    # Expected SHA-256 hex digest of the archive.
    digest: str
    # Archive container type; extract_rg() understands "tar.gz" and "zip".
    archive_format: str
    # Name of the ripgrep executable inside the archive.
    archive_member: str
    # Location the archive is downloaded from.
    url: str


def resolve_rg_bin(spec: TargetSpec, rg_bin: Path | None) -> Path:
    """Return the ripgrep executable to package for *spec*.

    An explicit ``--rg-bin`` path is validated via ``resolve_input_path``;
    otherwise the binary is fetched from the DotSlash manifest.
    """
    if rg_bin is not None:
        return resolve_input_path(rg_bin, "ripgrep executable", "--rg-bin")

    return fetch_rg(spec)


def fetch_rg(
    spec: TargetSpec,
    *,
    manifest_path: Path = RG_MANIFEST,
    cache_root: Path | None = None,
) -> Path:
    """Download (if needed), verify, and extract ripgrep for *spec*.

    Args:
        spec: Target whose ``dotslash_platform`` selects the manifest entry.
        manifest_path: DotSlash manifest to read; defaults to ``RG_MANIFEST``.
        cache_root: Directory holding per-target caches; defaults to
            ``default_cache_root()``.

    Returns:
        Path of the extracted ripgrep executable inside the cache directory.

    Raises:
        RuntimeError: If the manifest entry is unusable, the downloaded
            archive fails verification, or the archive cannot be extracted.
    """
    artifact = artifact_for_target(spec, manifest_path)
    cache_dir = (cache_root or default_cache_root()) / f"{spec.target}-rg"
    archive_path = cache_dir / archive_filename(artifact.url)

    if not archive_is_valid(archive_path, artifact):
        download_archive(artifact.url, archive_path)
        try:
            verify_archive(archive_path, artifact)
        except RuntimeError:
            # Never leave an unverified archive behind in the cache.
            archive_path.unlink(missing_ok=True)
            raise

    dest = cache_dir / spec.rg_name
    extract_rg(archive_path, artifact, dest)
    if not spec.is_windows:
        # Streaming the member out of the archive does not carry over its
        # permission bits, so mark the binary executable explicitly.
        mode = dest.stat().st_mode
        dest.chmod(mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
    return dest


def artifact_for_target(spec: TargetSpec, manifest_path: Path) -> RgArtifact:
    """Build the ``RgArtifact`` for *spec* from the manifest.

    The first provider that carries a ``url`` is used, so provider entries
    without one (they are skipped rather than indexed blindly) do not break
    the lookup.

    Raises:
        RuntimeError: If the platform is absent, has no providers, has no
            provider with a URL, uses a hash other than sha256, or is missing
            or has malformed ``size``/``digest``/``format``/``path`` fields.
    """
    manifest = load_manifest(manifest_path)
    platform = spec.dotslash_platform
    try:
        platform_info = manifest["platforms"][platform]
    except KeyError as exc:
        raise RuntimeError(
            f"ripgrep manifest {manifest_path} is missing platform {platform!r}"
        ) from exc

    providers = platform_info.get("providers")
    if not providers:
        raise RuntimeError(
            f"ripgrep manifest {manifest_path} has no providers for {platform!r}"
        )

    hash_name = platform_info.get("hash")
    if hash_name != "sha256":
        raise RuntimeError(
            f"Unsupported ripgrep hash {hash_name!r} for "
            f"{platform!r}; expected sha256"
        )

    url = next(
        (
            provider["url"]
            for provider in providers
            if isinstance(provider, dict) and provider.get("url")
        ),
        None,
    )
    if url is None:
        raise RuntimeError(
            f"ripgrep manifest {manifest_path} has no provider with a URL "
            f"for {platform!r}"
        )

    try:
        return RgArtifact(
            size=int(platform_info["size"]),
            digest=str(platform_info["digest"]),
            archive_format=str(platform_info["format"]),
            archive_member=str(platform_info["path"]),
            url=str(url),
        )
    except (KeyError, TypeError, ValueError) as exc:
        # Report malformed entries the same way as every other manifest error.
        raise RuntimeError(
            f"ripgrep manifest {manifest_path} has an invalid entry for "
            f"{platform!r}: {exc!r}"
        ) from exc


def load_manifest(manifest_path: Path) -> dict:
    """Parse the manifest as JSON, ignoring a leading ``#!`` line if present."""
    text = manifest_path.read_text(encoding="utf-8")
    if text.startswith("#!"):
        # Drop only the first line; partition() leaves the JSON payload
        # untouched, unlike splitlines(), which also splits on exotic line
        # separators that may legitimately occur inside JSON strings.
        _, _, text = text.partition("\n")
    return json.loads(text)


def default_cache_root() -> Path:
    """Return the ``codex-package`` directory under the system temp dir."""
    return Path(tempfile.gettempdir()) / "codex-package"


def archive_filename(url: str) -> str:
    """Return the final path component of *url*.

    Raises:
        RuntimeError: If the URL path has no final component.
    """
    filename = Path(urlparse(url).path).name
    if not filename:
        raise RuntimeError(f"Unable to determine archive filename from {url}")
    return filename


def archive_is_valid(archive_path: Path, artifact: RgArtifact) -> bool:
    """Report whether a cached archive exists and passes verification.

    A cached file that fails verification is deleted as a side effect so the
    caller can download a fresh copy.
    """
    if not archive_path.is_file():
        return False
    try:
        verify_archive(archive_path, artifact)
    except RuntimeError:
        archive_path.unlink(missing_ok=True)
        return False
    return True


def verify_archive(archive_path: Path, artifact: RgArtifact) -> None:
    """Check the archive's size and SHA-256 digest against *artifact*.

    Raises:
        RuntimeError: If either the size or the digest does not match.
    """
    actual_size = archive_path.stat().st_size
    if actual_size != artifact.size:
        raise RuntimeError(
            f"ripgrep archive {archive_path} has size {actual_size}, expected {artifact.size}"
        )

    digest = hashlib.sha256()
    with open(archive_path, "rb") as fh:
        # Hash in 1 MiB chunks so the archive is never fully held in memory.
        while chunk := fh.read(1024 * 1024):
            digest.update(chunk)

    actual_digest = digest.hexdigest()
    # hexdigest() is lowercase; tolerate an uppercase digest in the manifest.
    if actual_digest != artifact.digest.lower():
        raise RuntimeError(
            f"ripgrep archive {archive_path} has sha256 {actual_digest}, "
            f"expected {artifact.digest}"
        )


def _temp_sibling(path: Path) -> Path:
    """Return a per-process scratch path in the same directory as *path*."""
    return path.with_name(f"{path.name}.{os.getpid()}.tmp")


def download_archive(url: str, archive_path: Path) -> None:
    """Download *url* to *archive_path*, creating parent directories.

    The payload is written to a per-process scratch file and moved into place
    with a single rename, so concurrent builds sharing the cache neither
    clobber each other's partial download nor observe a partial archive.
    The caller is responsible for verifying the result.
    """
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    temp_path = _temp_sibling(archive_path)
    try:
        with urlopen(url, timeout=DOWNLOAD_TIMEOUT_SECS) as response:
            with open(temp_path, "wb") as out:
                shutil.copyfileobj(response, out)
        temp_path.replace(archive_path)
    finally:
        # No-op after a successful replace; cleans up after a failure.
        temp_path.unlink(missing_ok=True)


def _copy_tar_member(archive_path: Path, member_name: str, out_path: Path) -> None:
    """Stream one member of a gzip-compressed tar archive to *out_path*."""
    with tarfile.open(archive_path, "r:gz") as archive:
        try:
            member = archive.getmember(member_name)
        except KeyError as exc:
            raise RuntimeError(
                f"ripgrep archive {archive_path} is missing {member_name!r}"
            ) from exc

        extracted = archive.extractfile(member)
        if extracted is None:
            # extractfile() returns None for members that are not files.
            raise RuntimeError(
                f"ripgrep archive member {member_name!r} is not a file"
            )
        with extracted, open(out_path, "wb") as out:
            shutil.copyfileobj(extracted, out)


def _copy_zip_member(archive_path: Path, member_name: str, out_path: Path) -> None:
    """Stream one member of a zip archive to *out_path*."""
    with zipfile.ZipFile(archive_path) as archive:
        try:
            extracted = archive.open(member_name)
        except KeyError as exc:
            raise RuntimeError(
                f"ripgrep archive {archive_path} is missing {member_name!r}"
            ) from exc
        with extracted, open(out_path, "wb") as out:
            shutil.copyfileobj(extracted, out)


def extract_rg(archive_path: Path, artifact: RgArtifact, dest: Path) -> None:
    """Extract ``artifact.archive_member`` from the archive to *dest*.

    The member is written to a scratch file next to *dest* and renamed into
    place only after it has been copied completely, so a failed extraction
    never leaves a truncated executable at *dest*.

    Raises:
        RuntimeError: If the archive format is unsupported, the member is
            missing, or (tar only) the member is not a regular file.
    """
    if artifact.archive_format == "tar.gz":
        copy_member = _copy_tar_member
    elif artifact.archive_format == "zip":
        copy_member = _copy_zip_member
    else:
        raise RuntimeError(
            f"Unsupported ripgrep archive format {artifact.archive_format!r}; expected tar.gz or zip"
        )

    dest.parent.mkdir(parents=True, exist_ok=True)
    temp_path = _temp_sibling(dest)
    try:
        copy_member(archive_path, artifact.archive_member, temp_path)
        temp_path.replace(dest)
    finally:
        # No-op after a successful replace; cleans up after a failure.
        temp_path.unlink(missing_ok=True)
b/scripts/codex_package/targets.py @@ -14,6 +14,7 @@ class TargetSpec: target: str is_windows: bool is_linux: bool + dotslash_platform: str @property def exe_suffix(self) -> str: @@ -42,53 +43,43 @@ TARGET_SPECS: dict[str, TargetSpec] = { target="x86_64-unknown-linux-musl", is_windows=False, is_linux=True, + dotslash_platform="linux-x86_64", ), "aarch64-unknown-linux-musl": TargetSpec( target="aarch64-unknown-linux-musl", is_windows=False, is_linux=True, + dotslash_platform="linux-aarch64", ), "x86_64-apple-darwin": TargetSpec( target="x86_64-apple-darwin", is_windows=False, is_linux=False, + dotslash_platform="macos-x86_64", ), "aarch64-apple-darwin": TargetSpec( target="aarch64-apple-darwin", is_windows=False, is_linux=False, + dotslash_platform="macos-aarch64", ), "x86_64-pc-windows-msvc": TargetSpec( target="x86_64-pc-windows-msvc", is_windows=True, is_linux=False, + dotslash_platform="windows-x86_64", ), "aarch64-pc-windows-msvc": TargetSpec( target="aarch64-pc-windows-msvc", is_windows=True, is_linux=False, + dotslash_platform="windows-aarch64", ), } -def resolve_rg_bin(spec: TargetSpec, rg_bin: Path | None) -> Path: - return resolve_input_path( - rg_bin, - default_rg_candidates(spec), - "ripgrep executable", - "--rg-bin", - ) - - -def default_rg_candidates(spec: TargetSpec) -> list[Path]: - return [ - REPO_ROOT / "codex-cli" / "vendor" / spec.target / "path" / spec.rg_name, - ] - - def resolve_input_path( explicit_path: Path | None, - default_candidates: list[Path], description: str, flag_name: str, ) -> Path: @@ -100,14 +91,7 @@ def resolve_input_path( raise RuntimeError(f"{description} is not executable: {path}") return path - for candidate in default_candidates: - if candidate.is_file(): - return candidate.resolve() - - candidates = "\n".join(f" - {candidate}" for candidate in default_candidates) - raise RuntimeError( - f"Could not find {description}. 
Pass {flag_name}, or create one of:\n{candidates}" - ) + raise RuntimeError(f"Must specify {flag_name} for {description}.") def is_executable(path: Path) -> bool: