chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions
--- a/tests/integration/search/_helpers.py
+++ b/tests/integration/search/_helpers.py
@ -0,0 +1,269 @@
+"""Private helpers shared across the search e2e tests.
+
+* :func:`pick_query_seeds` — scans the session corpus's
+  ``.atomic_facts/`` md files and returns a list of
+  ``(owner_id, fact_text)`` tuples to use as deterministic search
+  queries. Bootstrapping queries off the corpus's own extraction
+  output gives us a closed-loop correctness signal — what was
+  written should be findable.
+
+* :func:`assert_recall` — the canonical "this search returned at
+  least one sensible hit for ``owner``" assertion bundle. Used by
+  the keyword / vector / hybrid recall tests so the assertion logic
+  is in one place.
+
+* :func:`flatten_hits` — collapses ``SearchData``'s four arrays into
+  one ``(owner_id, score, text)`` tuple list for relevance checks.
+
+The helpers do **not** hardcode topical keywords ("hiking" / "work")
+— they are derived from what the pipeline produced. This keeps the
+suite stable across LLM-driven boundary-cut variance.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+# Cap how many fact strings we sample per call — running every test
+# against every fact would blow the LLM rerank budget.
+_DEFAULT_SEED_LIMIT = 3
+
+# Tokenise on word characters; lowercase; drop short tokens that carry
+# no signal for the "content overlap" check.
+_TOKEN_RE = re.compile(r"\w+", re.UNICODE)
+_MIN_TOKEN_LEN = 3
+_STOPWORDS: frozenset[str] = frozenset(
+    {
+        "the",
+        "and",
+        "for",
+        "that",
+        "with",
+        "this",
+        "was",
+        "has",
+        "have",
+        "are",
+        "but",
+        "from",
+        "you",
+        "she",
+        "her",
+        "his",
+        "him",
+        "they",
+        "them",
+        "their",
+    }
+)
+
+
+# ── Query seed extraction ───────────────────────────────────────────────
+
+
+def pick_query_seeds(
+    memory_root: Path,
+    *,
+    limit: int = _DEFAULT_SEED_LIMIT,
+) -> list[tuple[str, str]]:
+    """Sample ``(owner_id, fact_text)`` tuples from atomic_facts md files.
+
+    Walks ``users/<owner>/.atomic_facts/atomic_fact-*.md`` and parses
+    the ``## Fact\\n<text>`` sections inside each daily-log entry.
+    Returns deterministic seeds (insertion order of ``rglob`` is
+    sort-stable thanks to the explicit ``sorted`` call) so a flaky
+    test surfaces a real regression, not query-rotation variance.
+
+    Raises:
+        AssertionError: if no facts were extracted — that's a fixture
+            failure, not a test failure, and should fail loudly.
+    """
+    seeds: list[tuple[str, str]] = []
+    users_dir = memory_root / "default_app" / "default_project" / "users"
+    if not users_dir.is_dir():
+        raise AssertionError(f"expected {users_dir} to exist after ingest")
+
+    for owner_dir in sorted(users_dir.iterdir()):
+        if not owner_dir.is_dir():
+            continue
+        facts_dir = owner_dir / ".atomic_facts"
+        if not facts_dir.is_dir():
+            continue
+        for md in sorted(facts_dir.rglob("*.md")):
+            for fact in _extract_fact_sections(md):
+                if fact:
+                    seeds.append((owner_dir.name, fact))
+                    if len(seeds) >= limit:
+                        return seeds
+    if not seeds:
+        raise AssertionError(
+            f"no atomic_fact md entries under {users_dir} — pipeline did "
+            "not produce any facts; cannot bootstrap search queries"
+        )
+    return seeds
+
+
+def _extract_fact_sections(md: Path) -> list[str]:
+    """Return every ``### Fact`` section body in a daily-log md file.
+
+    Daily-log entries are ``## <entry-id>`` blocks; the labelled body
+    sections inside an entry are h3 (``### Fact``, ``### Foresight``,
+    …). We scan linearly for ``### Fact`` and collect lines until the
+    next heading at any level or the end-of-entry marker.
+    """
+    body = md.read_text(encoding="utf-8")
+    sections: list[str] = []
+    in_fact = False
+    buf: list[str] = []
+    for line in body.splitlines():
+        stripped = line.lstrip()
+        if stripped.startswith("### Fact"):
+            if in_fact:
+                sections.append("\n".join(buf).strip())
+            in_fact = True
+            buf = []
+            continue
+        # Any subsequent heading or entry-end marker closes the section.
+        if in_fact and (stripped.startswith("#") or stripped.startswith("<!-- /entry")):
+            sections.append("\n".join(buf).strip())
+            in_fact = False
+            buf = []
+            continue
+        if in_fact:
+            buf.append(line)
+    if in_fact:
+        sections.append("\n".join(buf).strip())
+    return [s for s in sections if s]
+
+
+# ── Response flattening + assertions ────────────────────────────────────
+
+
+def flatten_hits(data: dict[str, Any]) -> list[tuple[str | None, float, str]]:
+    """Collapse ``SearchData``'s four arrays into ``(owner_id, score, text)``.
+
+    Stable shape across track-kinds so the recall / partition tests
+    don't have to branch. Episodes / profiles carry ``user_id`` on the
+    item; cases / skills carry ``agent_id`` — both project to the
+    generic ``owner`` slot here. ``owner`` may be ``None`` for profile
+    hits where the owner is implicit.
+    """
+    out: list[tuple[str | None, float, str]] = []
+    for ep in data.get("episodes", []):
+        out.append(
+            (
+                ep.get("user_id"),
+                float(ep.get("score") or 0.0),
+                ep.get("episode") or ep.get("summary") or ep.get("subject") or "",
+            )
+        )
+    for pf in data.get("profiles", []):
+        out.append(
+            (
+                pf.get("user_id"),
+                float(pf.get("score") or 0.0),
+                str(pf.get("profile_data") or ""),
+            )
+        )
+    for cs in data.get("agent_cases", []):
+        out.append(
+            (
+                cs.get("agent_id"),
+                float(cs.get("score") or 0.0),
+                cs.get("approach") or cs.get("task_intent") or "",
+            )
+        )
+    for sk in data.get("agent_skills", []):
+        out.append(
+            (
+                sk.get("agent_id"),
+                float(sk.get("score") or 0.0),
+                sk.get("content") or sk.get("description") or "",
+            )
+        )
+    return out
+
+
+async def assert_recall(
+    client: httpx.AsyncClient,
+    *,
+    owner_id: str,
+    query: str,
+    method: str,
+    min_score: float = 0.0,
+    top_k: int = 10,
+) -> dict[str, Any]:
+    """Hit ``/search`` and lock the four standard recall invariants.
+
+    1. **Status** 200 — the route compiled.
+    2. **Existence** — ``total >= 1`` across the four arrays.
+    3. **Owner partition** — every non-``None`` ``owner_id`` matches
+       the queried owner. Profile hits may carry ``None`` so they're
+       skipped from the check.
+    4. **Score sanity** — the top-scored hit clears ``min_score``.
+
+    Returns the parsed response body so the caller can layer
+    case-specific assertions on top.
+    """
+    resp = await client.post(
+        "/api/v1/memory/search",
+        json={
+            "user_id": owner_id,
+            "query": query,
+            "method": method,
+            "top_k": top_k,
+        },
+        timeout=120.0,
+    )
+    assert resp.status_code == 200, resp.text
+    body = resp.json()
+    hits = flatten_hits(body["data"])
+    assert hits, (
+        f"no hits for owner={owner_id} query={query!r} method={method} — "
+        f"recall is broken"
+    )
+    for hit_owner, _score, _text in hits:
+        if hit_owner is not None:
+            assert hit_owner == owner_id, (
+                f"partition leak: got owner={hit_owner!r} when querying {owner_id!r}"
+            )
+    top_score = max(score for _o, score, _t in hits)
+    assert top_score >= min_score, (
+        f"top hit score {top_score:.3f} < min {min_score} for method={method}"
+    )
+    return body
+
+
+# ── Token utilities (for content-overlap checks) ────────────────────────
+
+
+def query_tokens(query: str) -> set[str]:
+    """Lowercase content tokens worth checking for overlap in hit text."""
+    return {
+        t.lower()
+        for t in _TOKEN_RE.findall(query)
+        if len(t) >= _MIN_TOKEN_LEN and t.lower() not in _STOPWORDS
+    }
+
+
+def content_tokens_in_order(query: str) -> list[str]:
+    """Content tokens in original document order, dedup'd by first occurrence.
+
+    Used by the keyword test: the project's BM25 tokenizer (jieba) is
+    Chinese-first and degrades to near-zero recall on single short
+    English tokens. Multi-token phrases recall well in practice, so
+    keyword queries are built by concatenating consecutive content
+    tokens from the source fact rather than sorting alphabetically.
+    """
+    seen: set[str] = set()
+    out: list[str] = []
+    for t in _TOKEN_RE.findall(query):
+        low = t.lower()
+        if len(t) >= _MIN_TOKEN_LEN and low not in _STOPWORDS and low not in seen:
+            seen.add(low)
+            out.append(low)
+    return out