md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
142 lines
3.7 KiB
Python
142 lines
3.7 KiB
Python
"""Jieba-based tokenizer — covers CJK + English mixed content.
|
|
|
|
Uses ``jieba.cut_for_search`` (search-mode segmentation: yields both the
|
|
greedy max-match segment and its finer sub-segments for compound CJK
|
|
words). Same mode as the legacy enterprise keyword-search path uses on
|
|
the query side — keeping cascade write and search query symmetric is
|
|
the hard contract for BM25 recall to work.
|
|
|
|
After segmentation we drop:
|
|
|
|
* whitespace / empty tokens (so the join-on-space output stays clean),
|
|
* tokens shorter than ``min_token_length`` (default 2 — same threshold
|
|
enterprise's ``filter_stopwords(min_length=2)`` uses; single-char
|
|
fragments mostly hurt BM25 precision),
|
|
* tokens in a small bilingual stopword set (Chinese function words +
|
|
English articles / prepositions / aux verbs).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import Sequence
|
|
from typing import Final
|
|
|
|
import jieba
|
|
|
|
# Compact bilingual stopword list. It is kept deliberately small (this is
# not a full Chinese stopword list) so filtering stays predictable; callers
# tuning recall can subclass / extend.
_DEFAULT_STOPWORDS: Final[frozenset[str]] = frozenset(
    (
        # English: articles, prepositions and auxiliary verbs. They dominate
        # BM25 idf-noise but add no recall value.
        "the", "a", "an", "and", "or", "but", "if",
        "of", "to", "in", "on", "at", "by", "for", "with", "as",
        "is", "are", "was", "were", "be", "been", "being",
        "do", "does", "did", "has", "have", "had",
        "this", "that", "these", "those", "it", "its",
        # Chinese: function words and particles. ``cut_for_search`` emits
        # these as single-char tokens anyway and the min_length=2 floor
        # would drop most of them; spelling them out documents the intent
        # and is a no-op whenever the length filter also applies.
        "的", "了", "和", "是", "在",
        "我", "你", "他", "她", "它",
        "也", "都", "就", "还", "或", "及", "与", "对",
        "把", "被", "有", "没", "不",
        "啊", "吗", "呢", "吧", "哦",
    )
)

# Default minimum token length; anything shorter is discarded.
_DEFAULT_MIN_TOKEN_LENGTH: Final[int] = 2
|
|
|
|
|
|
class JiebaTokenizer:
    """Tokenizer that calls into ``jieba.cut_for_search`` and filters.

    Each raw segment is stripped and lower-cased, then discarded when it is
    empty, shorter than ``min_token_length``, or a stopword. Caller-supplied
    stopwords go through the same normalisation as tokens, so an entry such
    as ``"API"`` or ``" foo "`` still matches.
    """

    def __init__(
        self,
        *,
        min_token_length: int = _DEFAULT_MIN_TOKEN_LENGTH,
        extra_stopwords: frozenset[str] | None = None,
    ) -> None:
        """Configure the filters applied after segmentation.

        Args:
            min_token_length: Tokens with fewer characters than this are
                dropped.
            extra_stopwords: Additional stopwords merged with the built-in
                bilingual set. Entries are stripped and lower-cased before
                being stored; entries that normalise to the empty string
                are ignored.
        """
        # Deliberately no ``jieba.initialize()`` here: eager dictionary
        # loading balloons test-collection latency. ``jieba.cut*`` lazy-loads
        # on first call instead.
        self._min_len = min_token_length

        # Tokens are compared in normalised form, so a verbatim entry with
        # upper-case letters or padding could never match.
        extra = frozenset(
            self._normalize(word) for word in (extra_stopwords or ())
        ) - {""}
        self._stopwords = (
            _DEFAULT_STOPWORDS | extra if extra else _DEFAULT_STOPWORDS
        )

    @staticmethod
    def _normalize(raw: str) -> str:
        """Return *raw* stripped and lower-cased, the form used for lookups."""
        return raw.strip().lower()

    def tokenize(self, text: str) -> list[str]:
        """Segment *text* and return the surviving tokens in order.

        Args:
            text: Input string; an empty string yields an empty list.

        Returns:
            Lower-cased tokens that are non-empty, at least the configured
            minimum length, and not stopwords. Duplicates are kept.
        """
        if not text:
            return []

        min_len = self._min_len
        stopwords = self._stopwords
        tokens: list[str] = []
        for raw in jieba.cut_for_search(text):
            tok = self._normalize(raw)
            # The explicit emptiness check matters when min_len <= 0;
            # otherwise whitespace-only segments would slip through.
            if not tok:
                continue
            if len(tok) < min_len or tok in stopwords:
                continue
            tokens.append(tok)
        return tokens

    def tokenize_batch(self, texts: Sequence[str]) -> list[list[str]]:
        """Tokenize every string in *texts*, preserving input order."""
        return [self.tokenize(t) for t in texts]
|