check for large binaries in CI (#14382)
Prevent binaries >500KB from being committed, and maintain an allowlist so we can bypass the limit on a case-by-case basis.

I checked the currently tracked binary-like assets in the repo. There are only 5 obvious committed binaries by extension/MIME type:

- `.github/codex-cli-splash.png`: `838,131` bytes, about `818 KiB`
- `codex-rs/vendor/bubblewrap/bubblewrap.jpg`: `40,239` bytes, about `39 KiB`
- `codex-rs/skills/src/assets/samples/skill-creator/assets/skill-creator.png`: `1,563` bytes
- `codex-rs/skills/src/assets/samples/openai-docs/assets/openai.png`: `1,429` bytes
- `codex-rs/skills/src/assets/samples/skill-installer/assets/skill-installer.png`: `1,086` bytes

So `500 KB` looks like a good default for this repo. It would only trip on one existing intentional asset, which keeps the allowlist small and the policy easy to understand.

Here's a smoke-test from a throwaway branch that tries to commit a large binary: https://github.com/openai/codex/actions/runs/22971558828/job/66689330435?pr=14383
This commit is contained in:
parent
8791f0ab9a
commit
f50e88db82
3 changed files with 230 additions and 0 deletions
8
.github/blob-size-allowlist.txt
vendored
Normal file
8
.github/blob-size-allowlist.txt
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# Paths are matched exactly, relative to the repository root.
|
||||
# Keep this list short and limited to intentional large checked-in assets.
|
||||
|
||||
.github/codex-cli-splash.png
|
||||
MODULE.bazel.lock
|
||||
codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
|
||||
codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
|
||||
codex-rs/tui/tests/fixtures/oss-story.jsonl
|
||||
29
.github/workflows/blob-size-policy.yml
vendored
Normal file
29
.github/workflows/blob-size-policy.yml
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
name: blob-size-policy

on:
  pull_request: {}

jobs:
  check:
    name: Blob size policy
    runs-on: ubuntu-24.04
    steps:
      # Full history is required so the merge base of the PR can be resolved.
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Determine PR comparison range
        id: range
        shell: bash
        run: |
          set -euo pipefail
          # On pull_request events HEAD is the synthetic merge commit:
          # HEAD^1 is the base branch tip and HEAD^2 is the PR head.
          # Diff against the merge base rather than the base branch tip so
          # that commits landed on the base branch after the PR branched off
          # are not reported as changes made by the PR.
          # Assign before echoing so a failing git command aborts the step
          # (a failure inside `echo "$(...)"` is not caught by `set -e`).
          head="$(git rev-parse HEAD^2)"
          base="$(git merge-base HEAD^1 HEAD^2)"
          echo "base=${base}" >> "$GITHUB_OUTPUT"
          echo "head=${head}" >> "$GITHUB_OUTPUT"

      - name: Check changed blob sizes
        run: |
          python3 scripts/check_blob_size.py \
            --base "${{ steps.range.outputs.base }}" \
            --head "${{ steps.range.outputs.head }}" \
            --max-bytes 512000 \
            --allowlist .github/blob-size-allowlist.txt
|
||||
193
scripts/check_blob_size.py
Executable file
193
scripts/check_blob_size.py
Executable file
|
|
@ -0,0 +1,193 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Default per-file size budget used when --max-bytes is not given.
DEFAULT_MAX_BYTES = 512_000  # 500 KiB (500 * 1024 bytes)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChangedBlob:
    """Immutable record describing one file added or modified in the diff."""

    # Repository-relative path as reported by `git diff --name-only`.
    path: str
    # Size of the blob at the head revision, in bytes.
    size_bytes: int
    # True when `path` appears verbatim in the allowlist.
    is_allowlisted: bool
    # True when git's numstat reports the change as binary.
    is_binary: bool
|
||||
|
||||
|
||||
def run_git(*args: str) -> str:
    """Run ``git`` with *args* and return its captured standard output.

    Args:
        *args: Command-line arguments passed to ``git`` verbatim (no shell).

    Returns:
        The command's stdout, decoded as text.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
            Git's captured stderr is echoed to ``sys.stderr`` first, because
            the exception message itself only carries the command line and
            the exit code, which makes CI failures hard to diagnose.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as error:
        # stderr was captured, so it would otherwise never reach the log.
        if error.stderr:
            print(error.stderr.rstrip(), file=sys.stderr)
        raise
    return result.stdout
|
||||
|
||||
|
||||
def _strip_allowlist_comment(line: str) -> str:
    """Return *line* without its trailing ``#`` comment and outer whitespace.

    A ``#`` starts a comment only at the beginning of the line or when it is
    preceded by whitespace, so paths that contain ``#`` are kept intact.
    """
    for index, char in enumerate(line):
        if char == "#" and (index == 0 or line[index - 1].isspace()):
            return line[:index].strip()
    return line.strip()


def load_allowlist(path: Path) -> set[str]:
    """Read the allowlist file and return the set of exempt paths.

    The file is UTF-8 text with one repository-relative path per line. Blank
    lines are ignored. A ``#`` begins a comment when it is the first
    character of the line or follows whitespace; a ``#`` embedded in a path
    (for example ``docs/C#/sample.bin``) is part of the path, otherwise such
    files could never be allowlisted.

    Args:
        path: Location of the allowlist file.

    Returns:
        The set of allowlisted paths, stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be read.
    """
    allowlist: set[str] = set()
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = _strip_allowlist_comment(raw_line)
        if entry:
            allowlist.add(entry)
    return allowlist
|
||||
|
||||
|
||||
def get_changed_paths(base: str, head: str) -> list[str]:
    """List the paths added or modified between *base* and *head*.

    Runs ``git diff --name-only`` restricted to added/modified entries with
    rename detection disabled. Output is NUL-delimited (``-z``), so the
    names are split on ``\\0`` and empty fragments are discarded.
    """
    diff_args = (
        "diff",
        "--name-only",
        "--diff-filter=AM",
        "--no-renames",
        "-z",
        base,
        head,
    )
    raw_names = run_git(*diff_args)
    # The NUL-terminated output leaves a trailing empty field after split.
    return list(filter(None, raw_names.split("\0")))
|
||||
|
||||
|
||||
def is_binary_change(base: str, head: str, path: str) -> bool:
    """Report whether git treats the change to *path* as binary.

    Asks ``git diff --numstat`` for the single path and inspects the first
    two tab-separated columns (added / deleted line counts). The change is
    considered binary when both columns are ``-``. Empty output yields
    ``False``.
    """
    numstat_args = (
        "diff",
        "--numstat",
        "--diff-filter=AM",
        "--no-renames",
        base,
        head,
        "--",
        path,
    )
    stats = run_git(*numstat_args).strip()
    if stats:
        insertions, deletions, _rest = stats.split("\t", 2)
        return (insertions, deletions) == ("-", "-")
    return False
|
||||
|
||||
|
||||
def blob_size(commit: str, path: str) -> int:
    """Return the size in bytes of *path* as stored at revision *commit*.

    Uses ``git cat-file -s <commit>:<path>`` and parses the printed number.
    """
    # NOTE(review): presumably this fails for gitlink (submodule) entries,
    # whose object is not stored in this repository - confirm whether such
    # paths can reach this function.
    object_spec = f"{commit}:{path}"
    reported_size = run_git("cat-file", "-s", object_spec)
    return int(reported_size.strip())
|
||||
|
||||
|
||||
def collect_changed_blobs(base: str, head: str, allowlist: set[str]) -> list[ChangedBlob]:
    """Build a ``ChangedBlob`` record for every path changed in the range.

    For each path returned by ``get_changed_paths`` the record holds the
    blob's size at *head*, whether the path is in *allowlist*, and whether
    git reports the change as binary. Records keep the order of the diff.
    """
    return [
        ChangedBlob(
            path=changed_path,
            size_bytes=blob_size(head, changed_path),
            is_allowlisted=changed_path in allowlist,
            is_binary=is_binary_change(base, head, changed_path),
        )
        for changed_path in get_changed_paths(base, head)
    ]
|
||||
|
||||
|
||||
def format_kib(size_bytes: int) -> str:
    """Render *size_bytes* as kibibytes with one decimal, e.g. ``1.5 KiB``."""
    kibibytes = size_bytes / 1024
    return f"{kibibytes:.1f} KiB"
|
||||
|
||||
|
||||
def write_step_summary(
    max_bytes: int,
    blobs: list[ChangedBlob],
    violations: list[ChangedBlob],
) -> None:
    """Append a Markdown report of the check to the GitHub step summary.

    Does nothing when the ``GITHUB_STEP_SUMMARY`` environment variable is
    unset or empty (for example when the script is run locally).

    Args:
        max_bytes: The size limit that was enforced, in bytes.
        blobs: Every changed file that was inspected.
        violations: The subset of *blobs* that exceeded the limit and is not
            allowlisted; these rows are marked ``blocked``.

    Raises:
        OSError: If the summary file cannot be opened or written.
    """
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_path:
        return

    # ChangedBlob is a frozen dataclass and therefore hashable; a set avoids
    # rescanning the violations list for every table row.
    blocked = set(violations)

    lines = [
        "## Blob Size Policy",
        "",
        f"Default max: `{max_bytes}` bytes ({format_kib(max_bytes)})",
        f"Changed files checked: `{len(blobs)}`",
        f"Violations: `{len(violations)}`",
        "",
    ]

    if blobs:
        lines.extend(
            [
                "| Path | Kind | Size | Status |",
                "| --- | --- | ---: | --- |",
            ]
        )
        for blob in blobs:
            status = "allowlisted" if blob.is_allowlisted else "ok"
            if blob in blocked:
                status = "blocked"
            kind = "binary" if blob.is_binary else "non-binary"
            # A literal pipe would terminate the table cell, even inside a
            # code span, so it must be backslash-escaped.
            cell_path = blob.path.replace("|", "\\|")
            lines.append(
                f"| `{cell_path}` | {kind} | `{blob.size_bytes}` bytes ({format_kib(blob.size_bytes)}) | {status} |"
            )
    else:
        lines.append("No changed files were detected.")

    lines.append("")
    # Append instead of overwriting: the step summary file is meant to be
    # added to, and anything already written to it must be preserved.
    with open(summary_path, "a", encoding="utf-8") as summary_file:
        summary_file.write("\n".join(lines))
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point for the blob size check.

    Parses ``--base``, ``--head``, ``--max-bytes`` and ``--allowlist``,
    inspects every file changed between the two revisions, writes the step
    summary, and prints a per-file report.

    Returns:
        ``1`` when at least one non-allowlisted file is larger than the
        limit, otherwise ``0``.
    """
    cli = argparse.ArgumentParser(
        description="Fail if changed blobs exceed the configured size budget."
    )
    cli.add_argument("--base", required=True, help="Base git revision to diff against.")
    cli.add_argument("--head", required=True, help="Head git revision to inspect.")
    cli.add_argument(
        "--max-bytes",
        type=int,
        default=DEFAULT_MAX_BYTES,
        help=f"Maximum allowed blob size in bytes. Default: {DEFAULT_MAX_BYTES}.",
    )
    cli.add_argument(
        "--allowlist",
        type=Path,
        required=True,
        help="Path to the newline-delimited allowlist file.",
    )
    options = cli.parse_args()
    limit = options.max_bytes

    exempt_paths = load_allowlist(options.allowlist)
    blobs = collect_changed_blobs(options.base, options.head, exempt_paths)
    # A blob is a violation unless it is allowlisted or within the limit.
    violations = [
        blob for blob in blobs if not (blob.is_allowlisted or blob.size_bytes <= limit)
    ]

    # The summary is written even when nothing changed.
    write_step_summary(limit, blobs, violations)

    if not blobs:
        print("No changed files were detected.")
        return 0

    print(f"Checked {len(blobs)} changed file(s) against the {limit}-byte limit.")
    for blob in blobs:
        if blob in violations:
            status = "blocked"
        elif blob.is_allowlisted:
            status = "allowlisted"
        else:
            status = "ok"
        kind = "binary" if blob.is_binary else "non-binary"
        print(
            f"- {blob.path}: {blob.size_bytes} bytes ({format_kib(blob.size_bytes)}) [{kind}, {status}]"
        )

    if not violations:
        return 0

    print("\nFile(s) exceed the configured limit:")
    for blob in violations:
        print(f"- {blob.path}: {blob.size_bytes} bytes > {limit} bytes")
    print(
        "\nIf one of these is a real checked-in asset we want to keep, add its "
        "repo-relative path to .github/blob-size-allowlist.txt. Otherwise, "
        "shrink it or keep it out of git."
    )
    return 1
|
||||
|
||||
|
||||
# Run the check when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Reference in a new issue