Files
EverOS/src/everos/component/tokenizer/jieba_provider.py
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

142 lines
3.7 KiB
Python

"""Jieba-based tokenizer — covers CJK + English mixed content.
Uses ``jieba.cut_for_search`` (search-mode segmentation: yields both the
greedy max-match segment and its finer sub-segments for compound CJK
words). This is the same mode the legacy enterprise keyword-search path
uses on the query side — keeping cascade-write and search-query
tokenization symmetric is the hard contract for BM25 recall to work.
After segmentation we drop:
* whitespace / empty tokens (so the join-on-space output stays clean),
* tokens shorter than ``min_token_length`` (default 2 — same threshold
enterprise's ``filter_stopwords(min_length=2)`` uses; single-char
fragments mostly hurt BM25 precision),
* tokens in a small bilingual stopword set (Chinese function words +
English articles / prepositions / aux verbs).
"""
from __future__ import annotations
from collections.abc import Sequence
from typing import Final
import jieba
# Small bilingual stopword set. Intentionally tight (not a full
# Chinese stopword list) so the behaviour is predictable; callers
# tuning recall can subclass / extend.
#
# All entries are lowercase because tokens are lowercased before the
# stopword lookup.
_DEFAULT_STOPWORDS: Final[frozenset[str]] = frozenset(
    {
        # English — articles / prepositions / aux verbs that dominate BM25
        # idf-noise but add no recall value.
        "the",
        "a",
        "an",
        "and",
        "or",
        "but",
        "if",
        "of",
        "to",
        "in",
        "on",
        "at",
        "by",
        "for",
        "with",
        "as",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "do",
        "does",
        "did",
        "has",
        "have",
        "had",
        "this",
        "that",
        "these",
        "those",
        "it",
        "its",
        # Chinese — function words / particles. ``cut_for_search`` emits
        # these as single-char tokens anyway, and the min_length=2 floor
        # would drop most; listing them explicitly makes the intent clear
        # and is a no-op when min_length filtering also kicks in.
        #
        # NOTE(review): the non-ASCII literals in this section had been
        # lost (every entry had degraded to the empty string, which can
        # never match a token). The entries below are a reconstruction
        # using common single-character function words — confirm against
        # the canonical list before relying on min_token_length=1.
        "的",
        "了",
        "和",
        "是",
        "在",
        "有",
        "也",
        "就",
        "都",
        "而",
        "及",
        "与",
        "或",
        "但",
        "这",
        "那",
        "之",
        "于",
        "以",
        "为",
        "被",
        "把",
        "吗",
        "呢",
        "吧",
        "啊",
        "着",
        "过",
    }
)

# Default minimum token length: tokens shorter than this are discarded.
_DEFAULT_MIN_TOKEN_LENGTH: Final[int] = 2
class JiebaTokenizer:
    """Tokenizer that calls into ``jieba.cut_for_search`` and filters.

    Each raw segment is stripped and lowercased, then dropped when it is
    empty, shorter than ``min_token_length``, or present in the stopword
    set.
    """

    def __init__(
        self,
        *,
        min_token_length: int = _DEFAULT_MIN_TOKEN_LENGTH,
        extra_stopwords: frozenset[str] | None = None,
    ) -> None:
        """Configure the token filters.

        Args:
            min_token_length: Tokens with fewer characters than this are
                discarded.
            extra_stopwords: Additional stopwords merged with the default
                set. Entries are lowercased so they compare equal to the
                lowercased tokens produced by :meth:`tokenize`.
        """
        # Touching ``jieba.initialize()`` here would force eager dict load
        # and balloon test-collection latency. ``jieba.cut*`` lazy-loads
        # on first call instead.
        self._min_len = min_token_length
        if extra_stopwords:
            # Tokens are lowercased before the stopword lookup, so an
            # extra stopword containing uppercase characters could never
            # match unless it is normalised the same way.
            self._stopwords = _DEFAULT_STOPWORDS | frozenset(
                word.lower() for word in extra_stopwords
            )
        else:
            self._stopwords = _DEFAULT_STOPWORDS

    def tokenize(self, text: str) -> list[str]:
        """Segment ``text`` and return the surviving tokens in order.

        Returns an empty list for empty input.
        """
        if not text:
            return []
        out: list[str] = []
        for raw in jieba.cut_for_search(text):
            tok = raw.strip().lower()
            # ``strip()`` already reduces whitespace-only segments to "",
            # so a single emptiness check covers both cases (and still
            # applies when ``min_token_length`` is 0).
            if not tok:
                continue
            if len(tok) < self._min_len:
                continue
            if tok in self._stopwords:
                continue
            out.append(tok)
        return out

    def tokenize_batch(self, texts: Sequence[str]) -> list[list[str]]:
        """Tokenize every text in ``texts``; one token list per input."""
        return [self.tokenize(t) for t in texts]