# EverOS/src/everos/component/tokenizer/jieba_provider.py

"""Jieba-based tokenizer — covers CJK + English mixed content.

Uses ``jieba.cut_for_search`` (search-mode segmentation: yields both the
greedy max-match segment and its finer sub-segments for compound CJK
words). Same mode as the legacy enterprise keyword-search path uses on
the query side — keeping cascade write and search query symmetric is
the hard contract for BM25 recall to work.

After segmentation we drop:

* whitespace / empty tokens (so the join-on-space output stays clean),
* tokens shorter than ``min_token_length`` (default 2 — same threshold
  enterprise's ``filter_stopwords(min_length=2)`` uses; single-char
  fragments mostly hurt BM25 precision),
* tokens in a small bilingual stopword set (Chinese function words +
  English articles / prepositions / aux verbs).
"""

from __future__ import annotations

from collections.abc import Sequence
from typing import Final

import jieba

# Default bilingual stopword set. Deliberately small (this is not a full
# Chinese stopword list) so filtering stays predictable; callers that
# want to tune recall can subclass / extend.
_DEFAULT_STOPWORDS: Final[frozenset[str]] = frozenset(
    (
        # English — articles / prepositions / aux verbs that dominate BM25
        # idf-noise but add no recall value.
        "the a an and or but if of to in on at by for with as "
        "is are was were be been being do does did has have had "
        "this that these those it its "
        # Chinese — function words / particles. ``cut_for_search`` emits
        # these as single-char tokens anyway, and the min_length=2 floor
        # would drop most; spelling them out documents the intent and is
        # a no-op whenever min_length filtering also applies.
        "的 了 和 是 在 我 你 他 她 它 也 都 就 还 或 及 与 对 把 被 有 没 不 "
        "啊 吗 呢 吧 哦"
    ).split()
)

# Default minimum length a token must have to be kept.
_DEFAULT_MIN_TOKEN_LENGTH: Final[int] = 2


class JiebaTokenizer:
    """Tokenizer that calls into ``jieba.cut_for_search`` and filters.

    Every segment produced by jieba is stripped and lowercased, then
    dropped when it is empty, shorter than ``min_token_length``, or
    contained in the stopword set.

    Args:
        min_token_length: Minimum number of characters (measured after
            strip + lowercase) a token needs in order to be kept.
        extra_stopwords: Optional additional stopwords merged with
            ``_DEFAULT_STOPWORDS``. Entries are stripped and lowercased
            the same way tokens are, so mixed-case entries such as
            ``"The"`` still match.
    """

    def __init__(
        self,
        *,
        min_token_length: int = _DEFAULT_MIN_TOKEN_LENGTH,
        extra_stopwords: frozenset[str] | None = None,
    ) -> None:
        # Deliberately no ``jieba.initialize()`` here: it would load the
        # dictionary eagerly as soon as a tokenizer is constructed and
        # balloon test-collection latency. ``jieba.cut*`` lazy-loads on
        # first call instead.
        self._min_len = min_token_length
        if extra_stopwords:
            # Tokens are matched after ``strip().lower()``; normalise the
            # caller's entries identically, otherwise any non-lowercase
            # stopword could never match and would be silently ignored.
            normalized = frozenset(
                word.strip().lower() for word in extra_stopwords
            )
            self._stopwords: frozenset[str] = _DEFAULT_STOPWORDS | normalized
        else:
            self._stopwords = _DEFAULT_STOPWORDS

    def tokenize(self, text: str) -> list[str]:
        """Segment *text* and return the surviving tokens in order.

        Args:
            text: Raw input; may mix CJK and English.

        Returns:
            Lowercased tokens that are non-empty, at least
            ``min_token_length`` characters long, and not stopwords.
            An empty (falsy) *text* yields ``[]`` without calling jieba.
        """
        if not text:
            return []
        min_len = self._min_len
        stopwords = self._stopwords
        tokens: list[str] = []
        for raw in jieba.cut_for_search(text):
            tok = raw.strip().lower()
            # ``strip()`` already collapses whitespace-only segments to
            # "", so the emptiness test covers them; it is kept separate
            # from the length test so a ``min_token_length`` <= 0 still
            # cannot let empty tokens through.
            if not tok or len(tok) < min_len or tok in stopwords:
                continue
            tokens.append(tok)
        return tokens

    def tokenize_batch(self, texts: Sequence[str]) -> list[list[str]]:
        """Tokenize each entry of *texts* with :meth:`tokenize`.

        Returns:
            One token list per input text, in input order.
        """
        return [self.tokenize(t) for t in texts]