chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions
--- a/tests/integration/search/test_search_e2e.py
+++ b/tests/integration/search/test_search_e2e.py
@ -0,0 +1,241 @@
+"""End-to-end ``/api/v1/memory/search`` tests over a real LoCoMo corpus.
+
+Six tests, each pinning one path through :class:`SearchManager`:
+
+============================================  =================================
+``test_keyword_recalls_atomic_fact_origin``   keyword (BM25 only)
+``test_vector_recalls_atomic_fact_origin``    vector (cosine only)
+``test_hybrid_with_profile_returns_profile``  hybrid + ``include_profile``
+``test_partition_respects_owner_id``          cross-owner isolation
+``test_unknown_owner_returns_empty_200``      empty response, no 500
+``test_filter_dsl_compiles_and_excludes``     filters DSL → LanceDB ``where``
+============================================  =================================
+
+The corpus is built once by :func:`_ingested_memory_root` (session-
+scoped fixture in ``conftest.py``) and shared across all tests. Each
+test re-attaches a fresh lifespan via :func:`search_client`, so the
+search-manager singletons rebuild from cold per-test — a regression
+in the lazy-init path can't hide behind warm state from a prior test.
+
+Bootstrapping: queries are derived from the corpus's own
+``atomic_facts`` md files via :func:`pick_query_seeds`, not
+hardcoded. Closed-loop correctness — what the pipeline extracted
+should be findable by the search side.
+
+Assertions follow the project's "守恒 + 下界 + 形状" (conservation +
+lower bound + shape) convention (see :func:`_helpers.assert_recall`):
+no exact ranks, no exact scores, no exact ids. LLM-driven retrieval is
+non-deterministic across runs; brittle assertions cause CI noise, not
+signal.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import httpx
+import pytest
+
+from ._helpers import (
+    assert_recall,
+    flatten_hits,
+    pick_query_seeds,
+)
+
+# Whole module is opt-in: every test here is marked ``slow`` because the
+# module depends on ``_ingested_memory_root``, which spends ~10 min
+# running a real LLM + embedder against LoCoMo conv_0.
+pytestmark = pytest.mark.slow
+
+
+# ── 1. Keyword recall ──────────────────────────────────────────────────
+
+
+async def test_keyword_recalls_atomic_fact_origin(
+    search_client: httpx.AsyncClient,
+    _ingested_memory_root: Path,
+) -> None:
+    """BM25 must recall *some* episode for *some* fact-derived bigram.
+
+    The project's tokenizer is jieba (CJK-first); single short
+    English tokens and proper nouns / all-caps acronyms recall
+    poorly, but ordinary lowercase content bigrams recall reliably
+    (verified empirically). So we walk through the first N atomic
+    facts, pull consecutive lowercase content tokens, and pass the
+    test as soon as one candidate bigram returns ≥ 1 hit. This
+    validates the BM25 plumbing without coupling to which specific
+    fact got sampled — vector + hybrid tests own the strict
+    closed-loop recall claim.
+    """
+    seeds = pick_query_seeds(_ingested_memory_root, limit=20)
+    last_query: str | None = None
+    for owner, fact in seeds:
+        for query in _candidate_bigrams(fact):
+            last_query = query
+            resp = await search_client.post(
+                "/api/v1/memory/search",
+                json={
+                    "user_id": owner,
+                    "query": query,
+                    "method": "keyword",
+                    "top_k": 5,
+                },
+                timeout=60.0,
+            )
+            assert resp.status_code == 200, resp.text
+            hits = flatten_hits(resp.json()["data"])
+            if hits:
+                # Partition still holds even on a successful keyword hit.
+                for hit_owner, _s, _t in hits:
+                    if hit_owner is not None:
+                        assert hit_owner == owner
+                return
+    raise AssertionError(
+        f"BM25 returned 0 hits across {len(seeds)} fact seeds; "
+        f"last tried query={last_query!r}"
+    )
+
+
+def _candidate_bigrams(fact: str) -> list[str]:
+    """Lowercase consecutive content-token bigrams from ``fact``.
+
+    Skip tokens that include uppercase letters in the original text
+    (proper nouns / acronyms — empirically poor BM25 recall under
+    jieba). Returns at most 5 candidates per fact, in source order.
+    """
+    import re as _re
+
+    out: list[str] = []
+    tokens: list[str] = []
+    for raw in _re.findall(r"\w+", fact):
+        if raw.lower() == raw and len(raw) >= 3:
+            tokens.append(raw)
+    for i in range(len(tokens) - 1):
+        out.append(f"{tokens[i]} {tokens[i + 1]}")
+        if len(out) >= 5:
+            break
+    return out
+
+
+# ── 2. Vector recall ───────────────────────────────────────────────────
+
+
+async def test_vector_recalls_atomic_fact_origin(
+    search_client: httpx.AsyncClient,
+    _ingested_memory_root: Path,
+) -> None:
+    """Same fact via cosine ANN — independent of BM25 tokenisation."""
+    owner, fact = pick_query_seeds(_ingested_memory_root, limit=1)[0]
+    await assert_recall(
+        search_client,
+        owner_id=owner,
+        query=fact,
+        method="vector",
+        # Cosine: identical text would score ~1.0; threshold loose
+        # because the LLM-summarised episode text isn't the verbatim fact.
+        min_score=0.1,
+    )
+
+
+# ── 3. Hybrid + include_profile ────────────────────────────────────────
+
+
+async def test_hybrid_with_profile_returns_profile(
+    search_client: httpx.AsyncClient,
+    _ingested_memory_root: Path,
+) -> None:
+    """``include_profile=true`` must populate the profiles array."""
+    owner, fact = pick_query_seeds(_ingested_memory_root, limit=1)[0]
+    resp = await search_client.post(
+        "/api/v1/memory/search",
+        json={
+            "user_id": owner,
+            "query": fact,
+            "method": "hybrid",
+            "top_k": 5,
+            "include_profile": True,
+        },
+        timeout=120.0,
+    )
+    assert resp.status_code == 200, resp.text
+    data = resp.json()["data"]
+    assert data["profiles"], "include_profile=true but profiles[] empty"
+    assert data["profiles"][0]["user_id"] == owner
+
+
+# ── 4. Owner partition ─────────────────────────────────────────────────
+
+
+async def test_partition_respects_owner_id(
+    search_client: httpx.AsyncClient,
+    _ingested_memory_root: Path,
+) -> None:
+    """Querying owner=A must not leak owner=B's data, even on shared topics."""
+    seeds = pick_query_seeds(_ingested_memory_root, limit=2)
+    owners = {o for o, _ in seeds}
+    assert len(owners) >= 1, "need at least one owner in the corpus"
+    target_owner = next(iter(owners))
+    _, fact = next((o, f) for o, f in seeds if o == target_owner)
+
+    body = await assert_recall(
+        search_client,
+        owner_id=target_owner,
+        query=fact,
+        method="hybrid",
+    )
+    # Agent tracks must be empty for user owners.
+    assert body["data"]["agent_cases"] == []
+    assert body["data"]["agent_skills"] == []
+
+
+# ── 5. Unknown owner ───────────────────────────────────────────────────
+
+
+async def test_unknown_owner_returns_empty_200(
+    search_client: httpx.AsyncClient,
+) -> None:
+    """An owner that the corpus never saw → 200 with four empty arrays."""
+    resp = await search_client.post(
+        "/api/v1/memory/search",
+        json={
+            "user_id": "ghost_user_does_not_exist",
+            "query": "anything",
+            "method": "hybrid",
+            "top_k": 5,
+        },
+        timeout=60.0,
+    )
+    assert resp.status_code == 200, resp.text
+    data = resp.json()["data"]
+    assert data["episodes"] == []
+    assert data["profiles"] == []
+    assert data["agent_cases"] == []
+    assert data["agent_skills"] == []
+
+
+# ── 6. Filter DSL ──────────────────────────────────────────────────────
+
+
+async def test_filter_dsl_compiles_and_excludes(
+    search_client: httpx.AsyncClient,
+    _ingested_memory_root: Path,
+) -> None:
+    """Add a ``session_id`` ne-filter, verify the returned hits respect it."""
+    owner, fact = pick_query_seeds(_ingested_memory_root, limit=1)[0]
+    bogus_session = "session_that_never_was"
+    resp = await search_client.post(
+        "/api/v1/memory/search",
+        json={
+            "user_id": owner,
+            "query": fact,
+            "method": "keyword",
+            "top_k": 10,
+            "filters": {"session_id": {"ne": bogus_session}},
+        },
+        timeout=120.0,
+    )
+    assert resp.status_code == 200, resp.text
+    data = resp.json()["data"]
+    # The filter is satisfied by every real episode (none have the
+    # bogus id), so the hit count should be ≥ 1 — the filter
+    # compiled and shipped to LanceDB without breaking recall.
+    for ep in data["episodes"]:
+        assert ep["session_id"] != bogus_session