md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
94 lines
3.4 KiB
Python
94 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Scan tracked text files for CJK characters outside the language-policy allowlist.
|
|
|
|
Replaces the grep-based reference command that used to live in
|
|
``.claude/rules/language-policy.md``. That command silently produced false
|
|
negatives on this repo: the ``grep -zZv`` + ``xargs -0`` NUL pipeline
|
|
mis-parsed the path list and reported "clean" even when violations existed.
|
|
|
|
Exit code 0 = clean, 1 = violations found (paths + line numbers printed).
|
|
|
|
Usage:
|
|
python scripts/check_cjk.py # scan all tracked files
|
|
python scripts/check_cjk.py a.py b.md # scan specific files (pre-commit)
|
|
python scripts/check_cjk.py --quiet # per-file counts only
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import subprocess
|
|
|
|
# CJK / fullwidth code points: CJK symbols & ideographs, Hangul syllables,
|
|
# and halfwidth/fullwidth forms. Kept as escapes so this file stays ASCII.
|
|
_CJK = re.compile("[\\u3000-\\u9fff\\uac00-\\ud7af\\uff00-\\uffef]")
|
|
|
|
|
|
def _is_allowlisted(path: str) -> bool:
|
|
"""Return True if CJK is permitted in this path (see language-policy.md)."""
|
|
name = os.path.basename(path)
|
|
# 1. Tests: fixtures, sample inputs, and CJK-behavior assertions.
|
|
if path.startswith("tests/"):
|
|
return True
|
|
# 2. Tokenizer NLP resources (stopword lists, segmentation examples).
|
|
if path.startswith("src/everos/component/tokenizer/"):
|
|
return True
|
|
# 3. Locale-suffixed sample data, e.g. data/solo_chat_zh.json.
|
|
if re.match(r"data/.*_(zh|ja|ko)\.", path):
|
|
return True
|
|
# 4. Translated doc mirrors, e.g. README.zh.md.
|
|
if re.search(r"\.(zh|ja|ko)\.md$", path):
|
|
return True
|
|
# 5. Filenames explicitly marked with a CJK/locale token.
|
|
return bool(re.search(r"(^|[._-])(cjk|zh|ja|ko)([._-]|$)", name))
|
|
|
|
|
|
def _tracked_files() -> list[str]:
    """Return the paths of all files tracked by git, as git reports them.

    ``git ls-files`` is run with ``-z`` so that entries are NUL-terminated
    and never quoted. Without it, git (with its default ``core.quotePath``
    setting) wraps paths containing non-ASCII bytes in double quotes with
    octal escapes; such a string cannot be opened, so the caller would skip
    the file as "missing" and silently report a false negative.

    Returns:
        One path per tracked file, decoded with the filesystem encoding.

    Raises:
        subprocess.CalledProcessError: If ``git ls-files`` exits non-zero.
        OSError: If the ``git`` executable cannot be started.
    """
    raw = subprocess.check_output(["git", "ls-files", "-z"])
    # Every entry ends with NUL, so the final split element is empty: drop it.
    return [os.fsdecode(entry) for entry in raw.split(b"\0") if entry]
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Scan files for CJK characters outside the allowlist and print a report.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) lets argparse read them from ``sys.argv``, so existing
            ``main()`` callers are unaffected.

    Returns:
        0 when no violation was found, 1 otherwise.
    """
    import sys  # local import: only needed for the stdout guard below

    parser = argparse.ArgumentParser(description="CJK language-policy scanner.")
    parser.add_argument("files", nargs="*", help="files to scan (default: all tracked)")
    parser.add_argument("--quiet", action="store_true", help="per-file counts only")
    args = parser.parse_args(argv)

    # The report echoes offending lines, which by definition contain
    # non-ASCII text. On a stdout whose encoding cannot represent them,
    # print() would raise UnicodeEncodeError; fall back to backslash escapes.
    # Stream replacements without reconfigure() (e.g. io.StringIO) are left
    # as they are.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="backslashreplace")

    paths = args.files or _tracked_files()
    violations: list[tuple[str, int, str]] = []
    for path in paths:
        if _is_allowlisted(path):
            continue
        try:
            with open(path, encoding="utf-8") as fh:
                # Collect per file first: if decoding fails part-way through,
                # the file is treated as binary and none of its lines count.
                hits = [
                    (path, lineno, line.strip())
                    for lineno, line in enumerate(fh, start=1)
                    if _CJK.search(line)
                ]
        except (UnicodeDecodeError, FileNotFoundError, IsADirectoryError):
            continue  # binary / missing / directory: nothing to scan
        violations.extend(hits)

    if not violations:
        print("CJK language-policy: clean")
        return 0

    # Number of offending lines per file, in first-seen order.
    by_file: dict[str, int] = {}
    for path, _lineno, _text in violations:
        by_file[path] = by_file.get(path, 0) + 1

    print(f"CJK language-policy: {len(violations)} hit(s) in {len(by_file)} file(s)\n")
    if args.quiet:
        # Worst offenders first; ties keep first-seen order (stable sort).
        for path, count in sorted(by_file.items(), key=lambda kv: -kv[1]):
            print(f" {count:4d} {path}")
    else:
        for path, lineno, text in violations:
            # Cap each echoed line at 100 characters to keep the report readable.
            print(f" {path}:{lineno}: {text[:100]}")
    print("\nAllowed CJK locations are defined in .claude/rules/language-policy.md")
    return 1
|
|
|
|
|
|
# Script entry point: run the scanner and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|