EverOS/scripts/check_cjk.py

#!/usr/bin/env python3
"""Scan tracked text files for CJK characters outside the language-policy allowlist.

Replaces the grep-based reference command that used to live in
``.claude/rules/language-policy.md``. That command silently produced false
negatives on this repo: the ``grep -zZv`` + ``xargs -0`` NUL pipeline
mis-parsed the path list and reported "clean" even when violations existed.

Exit code 0 = clean, 1 = violations found (paths + line numbers printed).

Usage:
    python scripts/check_cjk.py            # scan all tracked files
    python scripts/check_cjk.py a.py b.md  # scan specific files (pre-commit)
    python scripts/check_cjk.py --quiet    # per-file counts only
"""

from __future__ import annotations

import argparse
import os
import re
import subprocess

# CJK / fullwidth code points: CJK symbols & ideographs, Hangul syllables,
# and halfwidth/fullwidth forms. Kept as escapes so this file stays ASCII.
_CJK = re.compile("[\\u3000-\\u9fff\\uac00-\\ud7af\\uff00-\\uffef]")


def _is_allowlisted(path: str) -> bool:
    """Return True if CJK is permitted in this path (see language-policy.md)."""
    name = os.path.basename(path)
    # 1. Tests: fixtures, sample inputs, and CJK-behavior assertions.
    if path.startswith("tests/"):
        return True
    # 2. Tokenizer NLP resources (stopword lists, segmentation examples).
    if path.startswith("src/everos/component/tokenizer/"):
        return True
    # 3. Locale-suffixed sample data, e.g. data/solo_chat_zh.json.
    if re.match(r"data/.*_(zh|ja|ko)\.", path):
        return True
    # 4. Translated doc mirrors, e.g. README.zh.md.
    if re.search(r"\.(zh|ja|ko)\.md$", path):
        return True
    # 5. Filenames explicitly marked with a CJK/locale token.
    return bool(re.search(r"(^|[._-])(cjk|zh|ja|ko)([._-]|$)", name))


def _tracked_files() -> list[str]:
    out = subprocess.check_output(["git", "ls-files"], text=True)
    return out.splitlines()


def main() -> int:
    parser = argparse.ArgumentParser(description="CJK language-policy scanner.")
    parser.add_argument("files", nargs="*", help="files to scan (default: all tracked)")
    parser.add_argument("--quiet", action="store_true", help="per-file counts only")
    args = parser.parse_args()

    paths = args.files or _tracked_files()
    violations: list[tuple[str, int, str]] = []
    for path in paths:
        if _is_allowlisted(path):
            continue
        try:
            with open(path, encoding="utf-8") as fh:
                lines = fh.readlines()
        except (UnicodeDecodeError, FileNotFoundError, IsADirectoryError):
            continue  # binary / missing / directory: nothing to scan
        for i, line in enumerate(lines, start=1):
            if _CJK.search(line):
                violations.append((path, i, line.strip()))

    if not violations:
        print("CJK language-policy: clean")
        return 0

    by_file: dict[str, int] = {}
    for path, _lineno, _text in violations:
        by_file[path] = by_file.get(path, 0) + 1

    print(f"CJK language-policy: {len(violations)} hit(s) in {len(by_file)} file(s)\n")
    if args.quiet:
        for path, count in sorted(by_file.items(), key=lambda kv: -kv[1]):
            print(f"  {count:4d}  {path}")
    else:
        for path, lineno, text in violations:
            print(f"  {path}:{lineno}: {text[:100]}")
    print("\nAllowed CJK locations are defined in .claude/rules/language-policy.md")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())