Files
EverOS/tests/unit/test_component/test_tokenizer/test_jieba.py
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

99 lines
3.3 KiB
Python

"""Unit tests for :class:`JiebaTokenizer`.
Verify the contract that callers downstream depend on:
* clean token list (no whitespace, no empty strings),
* CJK + ASCII pass-through under ``cut_for_search`` segmentation,
* default stopword + ``min_length=2`` filter applied,
* batch preserves order.
The tokenizer is symmetric — cascade write side and search query side
both go through this code path, so changes here change BM25 recall on
both ends.
"""
from __future__ import annotations
from everos.component.tokenizer import JiebaTokenizer, build_tokenizer
def test_tokenize_returns_list_for_english() -> None:
    """Two plain ASCII words come back as a two-element token list."""
    tokenizer = JiebaTokenizer()
    assert tokenizer.tokenize("hello world") == ["hello", "world"]
def test_tokenize_drops_pure_whitespace() -> None:
    """No emitted token may be empty or consist solely of whitespace."""
    tokenizer = JiebaTokenizer()
    emitted = tokenizer.tokenize("foo bar")
    # Collect offenders explicitly so a failure shows which tokens were blank.
    blank_tokens = [token for token in emitted if not token.strip()]
    assert not blank_tokens
def test_tokenize_empty_input() -> None:
    """An empty string tokenizes to an empty list."""
    tokenizer = JiebaTokenizer()
    result = tokenizer.tokenize("")
    assert result == []
def test_tokenize_cjk_keeps_multichar_words() -> None:
    """``cut_for_search`` keeps multi-character compounds usable by BM25.

    Also pins the other half of the contract: the single-character
    tokens in the sample sentence must not appear in the output.
    """
    tokens = JiebaTokenizer().tokenize("我爱北京天安门")
    # The clean-token contract: no empty strings in the output.
    assert "" not in tokens
    # Single-char tokens (我 / 爱) are filtered by min_length=2 (and 我
    # is also in the default stopword set). Assert each one explicitly --
    # checking only for "" would pass even if they leaked through.
    assert "我" not in tokens
    assert "爱" not in tokens
    # Multi-char compounds survive.
    assert "北京" in tokens
    # Either the full compound or its search-mode sub-word is acceptable.
    assert any(t in {"天安门", "天安"} for t in tokens)
def test_tokenize_drops_default_english_stopwords() -> None:
    """The article ``"the"`` is removed while the content words remain."""
    tokenizer = JiebaTokenizer()
    tokens = tokenizer.tokenize("the quick brown fox")
    assert "the" not in tokens
    for content_word in ("quick", "brown", "fox"):
        assert content_word in tokens
def test_tokenize_drops_short_tokens_below_min_length() -> None:
    """With default settings, one-character ASCII tokens are not emitted."""
    tokenizer = JiebaTokenizer()
    tokens = tokenizer.tokenize("a quick b run")
    for single_char in ("a", "b"):
        assert single_char not in tokens
    for longer_word in ("quick", "run"):
        assert longer_word in tokens
def test_tokenize_is_case_insensitive() -> None:
    """Mixed-case input is emitted as lowercase tokens."""
    tokenizer = JiebaTokenizer()
    lowered = tokenizer.tokenize("HELLO World")
    assert lowered == ["hello", "world"]
def test_extra_stopwords_extend_defaults() -> None:
    """A word passed via ``extra_stopwords`` is filtered; others are kept."""
    custom_stopwords = frozenset({"hello"})
    tokenizer = JiebaTokenizer(extra_stopwords=custom_stopwords)
    result = tokenizer.tokenize("hello world")
    assert "hello" not in result
    assert "world" in result
def test_custom_min_token_length_relaxes_filter() -> None:
    """``min_token_length=1`` admits one-character tokens.

    The stopword filter is independent of the length filter: ``"a"`` is
    still expected to be dropped, whereas ``"b"`` is now let through.
    """
    relaxed = JiebaTokenizer(min_token_length=1)
    result = relaxed.tokenize("a quick b")
    # "a" stays out even though the length threshold no longer excludes it.
    assert "a" not in result
    for expected in ("b", "quick"):
        assert expected in result
def test_tokenize_batch_preserves_order() -> None:
    """``tokenize_batch`` returns one result per input, in input order.

    Checking only the length and the trailing empty entry would still
    pass if the first two results were swapped, so the batch output is
    compared position by position against per-item ``tokenize`` calls.
    """
    tk = JiebaTokenizer()
    texts = ["foo bar", "baz", ""]
    out = tk.tokenize_batch(texts)
    assert len(out) == 3
    assert out[2] == []
    # list(...) keeps the comparison valid for any sequence type returned.
    assert list(out) == [tk.tokenize(text) for text in texts]
def test_build_tokenizer_returns_jieba_default() -> None:
    """Calling ``build_tokenizer()`` with no arguments yields a JiebaTokenizer."""
    built = build_tokenizer()
    assert isinstance(built, JiebaTokenizer)