chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state, and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
93
scripts/check_cjk.py
Normal file
93
scripts/check_cjk.py
Normal file
@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Scan tracked text files for CJK characters outside the language-policy allowlist.
|
||||
|
||||
Replaces the grep-based reference command that used to live in
|
||||
``.claude/rules/language-policy.md``. That command silently produced false
|
||||
negatives on this repo: the ``grep -zZv`` + ``xargs -0`` NUL pipeline
|
||||
mis-parsed the path list and reported "clean" even when violations existed.
|
||||
|
||||
Exit code 0 = clean, 1 = violations found (paths + line numbers printed).
|
||||
|
||||
Usage:
|
||||
python scripts/check_cjk.py # scan all tracked files
|
||||
python scripts/check_cjk.py a.py b.md # scan specific files (pre-commit)
|
||||
python scripts/check_cjk.py --quiet # per-file counts only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
# CJK / fullwidth code points, matched one character at a time:
#   U+3000-U+9FFF  CJK symbols & punctuation through CJK unified ideographs
#   U+AC00-U+D7AF  Hangul syllables
#   U+FF00-U+FFEF  halfwidth / fullwidth forms
# Code points outside these three ranges are not matched.
# Kept as escapes so this file stays ASCII.
_CJK = re.compile("[\\u3000-\\u9fff\\uac00-\\ud7af\\uff00-\\uffef]")
|
||||
|
||||
|
||||
def _is_allowlisted(path: str) -> bool:
|
||||
"""Return True if CJK is permitted in this path (see language-policy.md)."""
|
||||
name = os.path.basename(path)
|
||||
# 1. Tests: fixtures, sample inputs, and CJK-behavior assertions.
|
||||
if path.startswith("tests/"):
|
||||
return True
|
||||
# 2. Tokenizer NLP resources (stopword lists, segmentation examples).
|
||||
if path.startswith("src/everos/component/tokenizer/"):
|
||||
return True
|
||||
# 3. Locale-suffixed sample data, e.g. data/solo_chat_zh.json.
|
||||
if re.match(r"data/.*_(zh|ja|ko)\.", path):
|
||||
return True
|
||||
# 4. Translated doc mirrors, e.g. README.zh.md.
|
||||
if re.search(r"\.(zh|ja|ko)\.md$", path):
|
||||
return True
|
||||
# 5. Filenames explicitly marked with a CJK/locale token.
|
||||
return bool(re.search(r"(^|[._-])(cjk|zh|ja|ko)([._-]|$)", name))
|
||||
|
||||
|
||||
def _tracked_files() -> list[str]:
    """Return the paths printed by ``git ls-files`` for the current directory.

    ``-z`` is used so git emits each path verbatim and NUL-terminated.
    Without it, git C-quotes paths containing non-ASCII bytes (the default
    ``core.quotePath`` behavior); such quoted names cannot be opened and
    would be silently skipped by the scanner.

    Returns:
        One entry per tracked file, decoded with the filesystem encoding.

    Raises:
        subprocess.CalledProcessError: if ``git ls-files`` exits non-zero.
    """
    raw = subprocess.check_output(["git", "ls-files", "-z"])
    # The output ends with a trailing NUL, so drop the empty final chunk.
    return [os.fsdecode(entry) for entry in raw.split(b"\0") if entry]
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Scan files for CJK characters and report language-policy violations.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) lets argparse read them from ``sys.argv``, so
            running the module as a script behaves as before; passing a list
            allows programmatic invocation.

    Returns:
        0 when no violation was found, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="CJK language-policy scanner.")
    parser.add_argument("files", nargs="*", help="files to scan (default: all tracked)")
    parser.add_argument("--quiet", action="store_true", help="per-file counts only")
    args = parser.parse_args(argv)

    # Explicit file arguments (pre-commit) win; otherwise scan everything tracked.
    paths = args.files or _tracked_files()
    violations: list[tuple[str, int, str]] = []
    for path in paths:
        if _is_allowlisted(path):
            continue
        try:
            # Read the whole file inside the try so a decode error anywhere
            # in the file skips it entirely instead of reporting partial hits.
            with open(path, encoding="utf-8") as fh:
                lines = fh.readlines()
        except (UnicodeDecodeError, FileNotFoundError, IsADirectoryError):
            continue  # binary / missing / directory: nothing to scan
        violations.extend(
            (path, lineno, line.strip())
            for lineno, line in enumerate(lines, start=1)
            if _CJK.search(line)
        )

    if not violations:
        print("CJK language-policy: clean")
        return 0

    # Hits per file, in first-seen order.
    by_file: dict[str, int] = {}
    for path, _lineno, _text in violations:
        by_file[path] = by_file.get(path, 0) + 1

    print(f"CJK language-policy: {len(violations)} hit(s) in {len(by_file)} file(s)\n")
    if args.quiet:
        # Highest count first; ties keep first-seen order (stable sort).
        for path, count in sorted(by_file.items(), key=lambda kv: -kv[1]):
            print(f"  {count:4d}  {path}")
    else:
        for path, lineno, text in violations:
            # Cap the echoed line at 100 characters to keep the report readable.
            print(f"  {path}:{lineno}: {text[:100]}")
    print("\nAllowed CJK locations are defined in .claude/rules/language-policy.md")
    return 1
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user