diff --git a/.github/blob-size-allowlist.txt b/.github/blob-size-allowlist.txt
new file mode 100644
index 000000000..4c9462e8e
--- /dev/null
+++ b/.github/blob-size-allowlist.txt
@@ -0,0 +1,8 @@
+# Paths are matched exactly, relative to the repository root.
+# Keep this list short and limited to intentional large checked-in assets.
+
+.github/codex-cli-splash.png
+MODULE.bazel.lock
+codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
+codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
+codex-rs/tui/tests/fixtures/oss-story.jsonl
diff --git a/.github/workflows/blob-size-policy.yml b/.github/workflows/blob-size-policy.yml
new file mode 100644
index 000000000..441775c0e
--- /dev/null
+++ b/.github/workflows/blob-size-policy.yml
@@ -0,0 +1,29 @@
+name: blob-size-policy
+
+on:
+  pull_request: {}
+
+jobs:
+  check:
+    name: Blob size policy
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Determine PR comparison range
+        id: range
+        shell: bash
+        run: |
+          set -euo pipefail
+          echo "base=$(git rev-parse HEAD^1)" >> "$GITHUB_OUTPUT"
+          echo "head=$(git rev-parse HEAD^2)" >> "$GITHUB_OUTPUT"
+
+      - name: Check changed blob sizes
+        run: |
+          python3 scripts/check_blob_size.py \
+            --base "${{ steps.range.outputs.base }}" \
+            --head "${{ steps.range.outputs.head }}" \
+            --max-bytes 512000 \
+            --allowlist .github/blob-size-allowlist.txt
diff --git a/scripts/check_blob_size.py b/scripts/check_blob_size.py
new file mode 100755
index 000000000..455145f18
--- /dev/null
+++ b/scripts/check_blob_size.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""Fail a pull request when a changed file exceeds the blob size budget.
+
+The script diffs two git revisions, measures every added or modified file at
+the head revision, and exits non-zero if any file that is not allowlisted is
+larger than the configured limit. When ``GITHUB_STEP_SUMMARY`` is set, a
+Markdown report is appended to that file as well.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+DEFAULT_MAX_BYTES = 500 * 1024
+
+
+@dataclass(frozen=True)
+class ChangedBlob:
+    """A file added or modified between the base and head revisions."""
+
+    path: str  # repo-relative path reported by `git diff --name-only`
+    size_bytes: int  # size of the blob at the head revision
+    is_allowlisted: bool  # whether `path` is listed in the allowlist
+    is_binary: bool  # whether `git diff --numstat` reports the change as binary
+
+
+def run_git(*args: str) -> str:
+    """Run ``git`` with *args* and return its stdout as text.
+
+    Raises:
+        subprocess.CalledProcessError: If git exits with a non-zero status.
+            Git's captured stderr is echoed first so the failure is visible.
+    """
+    try:
+        result = subprocess.run(
+            ["git", *args],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as error:
+        # stderr is captured, so without this the traceback would hide git's
+        # own diagnostic.
+        if error.stderr:
+            sys.stderr.write(error.stderr)
+        raise
+    return result.stdout
+
+
+def load_allowlist(path: Path) -> set[str]:
+    """Return the set of paths listed in the allowlist file at *path*.
+
+    Everything from the first ``#`` on a line is treated as a comment, and
+    lines that are empty after stripping are skipped.
+    """
+    allowlist: set[str] = set()
+    for raw_line in path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.split("#", 1)[0].strip()
+        if line:
+            allowlist.add(line)
+    return allowlist
+
+
+def get_changed_paths(base: str, head: str) -> list[str]:
+    """Return the paths added or modified between *base* and *head*."""
+    output = run_git(
+        "diff",
+        "--name-only",
+        "--diff-filter=AM",
+        "--no-renames",
+        "-z",
+        base,
+        head,
+        # End the revision list explicitly so neither argument can be
+        # interpreted as a path.
+        "--",
+    )
+    return [path for path in output.split("\0") if path]
+
+
+def is_binary_change(base: str, head: str, path: str) -> bool:
+    """Return True when git reports the change to *path* as binary.
+
+    ``git diff --numstat`` prints ``-`` for both line counts of a binary file.
+    """
+    output = run_git(
+        "diff",
+        "--numstat",
+        "--diff-filter=AM",
+        "--no-renames",
+        base,
+        head,
+        "--",
+        path,
+    ).strip()
+    if not output:
+        return False
+
+    added, deleted, _ = output.split("\t", 2)
+    return added == "-" and deleted == "-"
+
+
+def blob_size(commit: str, path: str) -> int:
+    """Return the size in bytes of *path* as stored in *commit*."""
+    return int(run_git("cat-file", "-s", f"{commit}:{path}").strip())
+
+
+def collect_changed_blobs(base: str, head: str, allowlist: set[str]) -> list[ChangedBlob]:
+    """Describe every file added or modified between *base* and *head*."""
+    return [
+        ChangedBlob(
+            path=path,
+            size_bytes=blob_size(head, path),
+            is_allowlisted=path in allowlist,
+            is_binary=is_binary_change(base, head, path),
+        )
+        for path in get_changed_paths(base, head)
+    ]
+
+
+def format_kib(size_bytes: int) -> str:
+    """Format *size_bytes* as KiB with one decimal place."""
+    return f"{size_bytes / 1024:.1f} KiB"
+
+
+def _describe(blob: ChangedBlob, blocked: set[ChangedBlob]) -> tuple[str, str]:
+    """Return the ``(kind, status)`` labels used when reporting *blob*."""
+    kind = "binary" if blob.is_binary else "non-binary"
+    if blob in blocked:
+        status = "blocked"
+    elif blob.is_allowlisted:
+        status = "allowlisted"
+    else:
+        status = "ok"
+    return kind, status
+
+
+def write_step_summary(
+    max_bytes: int,
+    blobs: list[ChangedBlob],
+    violations: list[ChangedBlob],
+) -> None:
+    """Append a Markdown report to the file named by ``GITHUB_STEP_SUMMARY``.
+
+    Does nothing when the environment variable is unset or empty.
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_path:
+        return
+
+    lines = [
+        "## Blob Size Policy",
+        "",
+        f"Default max: `{max_bytes}` bytes ({format_kib(max_bytes)})",
+        f"Changed files checked: `{len(blobs)}`",
+        f"Violations: `{len(violations)}`",
+        "",
+    ]
+
+    if blobs:
+        blocked = set(violations)
+        lines.extend(
+            [
+                "| Path | Kind | Size | Status |",
+                "| --- | --- | ---: | --- |",
+            ]
+        )
+        for blob in blobs:
+            kind, status = _describe(blob, blocked)
+            lines.append(
+                f"| `{blob.path}` | {kind} | `{blob.size_bytes}` bytes ({format_kib(blob.size_bytes)}) | {status} |"
+            )
+    else:
+        lines.append("No changed files were detected.")
+
+    lines.append("")
+    # Append instead of overwriting so anything already written to the step
+    # summary file is kept.
+    with open(summary_path, "a", encoding="utf-8") as summary_file:
+        summary_file.write("\n".join(lines))
+
+
+def main() -> int:
+    """Parse arguments, run the check, and return the process exit code.
+
+    Returns:
+        1 when at least one changed file violates the limit, otherwise 0.
+    """
+    parser = argparse.ArgumentParser(
+        description="Fail if changed blobs exceed the configured size budget."
+    )
+    parser.add_argument("--base", required=True, help="Base git revision to diff against.")
+    parser.add_argument("--head", required=True, help="Head git revision to inspect.")
+    parser.add_argument(
+        "--max-bytes",
+        type=int,
+        default=DEFAULT_MAX_BYTES,
+        help=f"Maximum allowed blob size in bytes. Default: {DEFAULT_MAX_BYTES}.",
+    )
+    parser.add_argument(
+        "--allowlist",
+        type=Path,
+        required=True,
+        help="Path to the newline-delimited allowlist file.",
+    )
+    args = parser.parse_args()
+
+    allowlist = load_allowlist(args.allowlist)
+    blobs = collect_changed_blobs(args.base, args.head, allowlist)
+    violations = [
+        blob for blob in blobs if blob.size_bytes > args.max_bytes and not blob.is_allowlisted
+    ]
+
+    write_step_summary(args.max_bytes, blobs, violations)
+
+    if not blobs:
+        print("No changed files were detected.")
+        return 0
+
+    print(f"Checked {len(blobs)} changed file(s) against the {args.max_bytes}-byte limit.")
+    blocked = set(violations)
+    for blob in blobs:
+        kind, status = _describe(blob, blocked)
+        print(
+            f"- {blob.path}: {blob.size_bytes} bytes ({format_kib(blob.size_bytes)}) [{kind}, {status}]"
+        )
+
+    if violations:
+        print("\nFile(s) exceed the configured limit:")
+        for blob in violations:
+            print(f"- {blob.path}: {blob.size_bytes} bytes > {args.max_bytes} bytes")
+        print(
+            "\nIf one of these is a real checked-in asset we want to keep, add its "
+            "repo-relative path to .github/blob-size-allowlist.txt. Otherwise, "
+            "shrink it or keep it out of git."
+        )
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())