check for large binaries in CI (#14382)

Prevent binaries larger than 500 KB from being committed, and maintain an
allowlist for the cases where we need to bypass the check on a case-by-case
basis.

I checked the currently tracked binary-like assets in the repo. There
are only 5 obvious committed binaries by extension/MIME type:
- `.github/codex-cli-splash.png`: `838,131` bytes, about `818 KiB`
- `codex-rs/vendor/bubblewrap/bubblewrap.jpg`: `40,239` bytes, about `39 KiB`
- `codex-rs/skills/src/assets/samples/skill-creator/assets/skill-creator.png`: `1,563` bytes
- `codex-rs/skills/src/assets/samples/openai-docs/assets/openai.png`: `1,429` bytes
- `codex-rs/skills/src/assets/samples/skill-installer/assets/skill-installer.png`: `1,086` bytes

So `500 KB` looks like a good default for this repo. It would only trip
on one existing intentional asset, which keeps the allowlist small and
the policy easy to understand.

Here's a smoke-test from a throwaway branch that tries to commit a large
binary:
https://github.com/openai/codex/actions/runs/22971558828/job/66689330435?pr=14383
This commit is contained in:
Owen Lin 2026-03-11 15:39:08 -07:00 committed by GitHub
parent 8791f0ab9a
commit f50e88db82
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 230 additions and 0 deletions

8
.github/blob-size-allowlist.txt vendored Normal file
View file

@ -0,0 +1,8 @@
# Paths are matched exactly, relative to the repository root.
# Keep this list short and limited to intentional large checked-in assets.
.github/codex-cli-splash.png
MODULE.bazel.lock
codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
codex-rs/tui/tests/fixtures/oss-story.jsonl

29
.github/workflows/blob-size-policy.yml vendored Normal file
View file

@ -0,0 +1,29 @@
# Fails a pull request when a changed file exceeds the size budget, unless the
# file is listed in .github/blob-size-allowlist.txt.
name: blob-size-policy

on:
  pull_request: {}

jobs:
  check:
    name: Blob size policy
    runs-on: ubuntu-24.04
    steps:
      # Full history so both comparison revisions are available locally.
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0

      # NOTE(review): assumes the checked-out HEAD is the pull request merge
      # commit, so HEAD^1 is the base branch and HEAD^2 is the PR head - confirm.
      - name: Determine PR comparison range
        id: range
        shell: bash
        run: |
          set -euo pipefail
          echo "base=$(git rev-parse HEAD^1)" >> "$GITHUB_OUTPUT"
          echo "head=$(git rev-parse HEAD^2)" >> "$GITHUB_OUTPUT"

      # 512000 bytes = 500 * 1024, i.e. 500 KiB.
      - name: Check changed blob sizes
        run: |
          python3 scripts/check_blob_size.py \
            --base "${{ steps.range.outputs.base }}" \
            --head "${{ steps.range.outputs.head }}" \
            --max-bytes 512000 \
            --allowlist .github/blob-size-allowlist.txt

193
scripts/check_blob_size.py Executable file
View file

@ -0,0 +1,193 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
# Size limit used when --max-bytes is not given: 500 KiB (512,000 bytes).
DEFAULT_MAX_BYTES = 500 * 1024
@dataclass(frozen=True)
class ChangedBlob:
    """Immutable record describing one file changed between two revisions."""

    # Path of the changed file as reported by git.
    path: str
    # Size of the file's content, in bytes.
    size_bytes: int
    # Whether the path is exempt from the size limit.
    is_allowlisted: bool
    # Whether git classifies the change as binary.
    is_binary: bool
def run_git(*args: str) -> str:
    """Run ``git`` with the given arguments and return its standard output.

    Args:
        *args: Arguments passed to ``git`` as-is; no shell is involved.

    Returns:
        The command's standard output, decoded as text.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
            Git's captured stderr is written to this process's stderr before
            the exception propagates.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as error:
        # stderr is captured, so without this the log would show only the
        # exit status and never git's own explanation of what went wrong.
        if error.stderr:
            sys.stderr.write(error.stderr)
        raise
    return result.stdout
def load_allowlist(path: Path) -> set[str]:
    """Read the allowlist file at *path* and return its entries as a set.

    Everything from the first ``#`` on a line onward is treated as a comment
    and dropped. Surrounding whitespace is trimmed and lines that end up
    empty are skipped.
    """
    text = path.read_text(encoding="utf-8")
    candidates = (row.partition("#")[0].strip() for row in text.splitlines())
    return {entry for entry in candidates if entry}
def get_changed_paths(base: str, head: str) -> list[str]:
    """List the files added or modified between *base* and *head*.

    Args:
        base: Revision to diff from.
        head: Revision to diff to.

    Returns:
        Paths as printed by ``git diff --name-only -z``, in git's order.
        Deleted files are excluded by the ``AM`` filter, rename detection is
        off so a moved file shows up as an addition, and submodule entries
        are left out.
    """
    output = run_git(
        "diff",
        "--name-only",
        "--diff-filter=AM",
        "--no-renames",
        # A submodule entry points at a commit in another repository, not at
        # a blob here, so asking git for its object size would fail.
        "--ignore-submodules=all",
        # NUL-separated output keeps unusual file names unquoted and intact.
        "-z",
        base,
        head,
    )
    return [path for path in output.split("\0") if path]
def is_binary_change(base: str, head: str, path: str) -> bool:
    """Report whether git treats the change to *path* as a binary change.

    Args:
        base: Revision to diff from.
        head: Revision to diff to.
        path: File to inspect, taken as a literal file name.

    Returns:
        True when ``git diff --numstat`` prints ``-`` for both the added and
        the deleted line counts, which is how it marks binary files. False
        otherwise, including when git prints nothing for the path.
    """
    output = run_git(
        # Without this, glob characters or a leading ":" in a file name are
        # interpreted as pathspec syntax and can match other files or fail.
        "--literal-pathspecs",
        "diff",
        "--numstat",
        "--diff-filter=AM",
        "--no-renames",
        base,
        head,
        "--",
        path,
    ).strip()
    if not output:
        return False
    # Line format is "<added>\t<deleted>\t<path>"; only the counts matter.
    added, deleted, _ = output.split("\t", 2)
    return added == "-" and deleted == "-"
def blob_size(commit: str, path: str) -> int:
    """Return the size in bytes that git reports for *path* at *commit*."""
    object_spec = f"{commit}:{path}"
    reported = run_git("cat-file", "-s", object_spec)
    return int(reported.strip())
def collect_changed_blobs(base: str, head: str, allowlist: set[str]) -> list[ChangedBlob]:
    """Build one ``ChangedBlob`` per file added or modified between the revisions.

    Size is measured at *head*, allowlisting is an exact membership test of
    the path in *allowlist*, and the binary flag comes from the diff between
    *base* and *head*.
    """
    return [
        ChangedBlob(
            path=changed,
            size_bytes=blob_size(head, changed),
            is_allowlisted=changed in allowlist,
            is_binary=is_binary_change(base, head, changed),
        )
        for changed in get_changed_paths(base, head)
    ]
def format_kib(size_bytes: int) -> str:
    """Render a byte count in KiB with one decimal place, e.g. ``"1.5 KiB"``."""
    kib = size_bytes / 1024
    return f"{kib:.1f} KiB"
def write_step_summary(
    max_bytes: int,
    blobs: list[ChangedBlob],
    violations: list[ChangedBlob],
) -> None:
    """Write a Markdown report of the check to the file named by ``GITHUB_STEP_SUMMARY``.

    Does nothing when that environment variable is unset or empty. Otherwise
    the file is overwritten with a short header followed by either one table
    row per changed file or a note that no files changed.

    Args:
        max_bytes: Size limit that was enforced, in bytes.
        blobs: Every changed file that was checked.
        violations: The subset of *blobs* that exceeded the limit; these rows
            are reported as ``blocked``.
    """
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_path:
        return

    # Set lookup instead of scanning the list for every row; equal blobs hash
    # equally, so the result matches ``blob in violations``.
    blocked = set(violations)

    lines = [
        "## Blob Size Policy",
        "",
        f"Default max: `{max_bytes}` bytes ({format_kib(max_bytes)})",
        f"Changed files checked: `{len(blobs)}`",
        f"Violations: `{len(violations)}`",
        "",
    ]
    if blobs:
        lines.extend(
            [
                "| Path | Kind | Size | Status |",
                "| --- | --- | ---: | --- |",
            ]
        )
        for blob in blobs:
            if blob in blocked:
                status = "blocked"
            elif blob.is_allowlisted:
                status = "allowlisted"
            else:
                status = "ok"
            kind = "binary" if blob.is_binary else "non-binary"
            # An unescaped "|" ends a table cell early, even inside a code span.
            safe_path = blob.path.replace("|", "\\|")
            lines.append(
                f"| `{safe_path}` | {kind} | `{blob.size_bytes}` bytes ({format_kib(blob.size_bytes)}) | {status} |"
            )
    else:
        lines.append("No changed files were detected.")
    # Trailing empty element so the written file ends with a newline.
    lines.append("")
    Path(summary_path).write_text("\n".join(lines), encoding="utf-8")
def main() -> int:
    """Command-line entry point for the blob size check.

    Parses ``--base``, ``--head``, ``--max-bytes`` and ``--allowlist``, sizes
    every file changed between the two revisions, writes the step summary and
    prints a per-file report.

    Returns:
        ``1`` when at least one changed file is over the limit and not
        allowlisted, otherwise ``0``.
    """
    cli = argparse.ArgumentParser(
        description="Fail if changed blobs exceed the configured size budget."
    )
    cli.add_argument("--base", required=True, help="Base git revision to diff against.")
    cli.add_argument("--head", required=True, help="Head git revision to inspect.")
    cli.add_argument(
        "--max-bytes",
        type=int,
        default=DEFAULT_MAX_BYTES,
        help=f"Maximum allowed blob size in bytes. Default: {DEFAULT_MAX_BYTES}.",
    )
    cli.add_argument(
        "--allowlist",
        type=Path,
        required=True,
        help="Path to the newline-delimited allowlist file.",
    )
    options = cli.parse_args()
    limit = options.max_bytes

    exempt_paths = load_allowlist(options.allowlist)
    blobs = collect_changed_blobs(options.base, options.head, exempt_paths)
    violations = [
        blob for blob in blobs if not blob.is_allowlisted and blob.size_bytes > limit
    ]

    # The summary is written even when nothing changed.
    write_step_summary(limit, blobs, violations)

    if not blobs:
        print("No changed files were detected.")
        return 0

    print(f"Checked {len(blobs)} changed file(s) against the {limit}-byte limit.")
    for blob in blobs:
        if blob in violations:
            status = "blocked"
        elif blob.is_allowlisted:
            status = "allowlisted"
        else:
            status = "ok"
        kind = "binary" if blob.is_binary else "non-binary"
        size_text = f"{blob.size_bytes} bytes ({format_kib(blob.size_bytes)})"
        print(f"- {blob.path}: {size_text} [{kind}, {status}]")

    if not violations:
        return 0

    print("\nFile(s) exceed the configured limit:")
    for blob in violations:
        print(f"- {blob.path}: {blob.size_bytes} bytes > {limit} bytes")
    print(
        "\nIf one of these is a real checked-in asset we want to keep, add its "
        "repo-relative path to .github/blob-size-allowlist.txt. Otherwise, "
        "shrink it or keep it out of git."
    )
    return 1
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())