chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
0
tests/integration/search/__init__.py
Normal file
0
tests/integration/search/__init__.py
Normal file
269
tests/integration/search/_helpers.py
Normal file
269
tests/integration/search/_helpers.py
Normal file
@ -0,0 +1,269 @@
|
||||
"""Private helpers shared across the search e2e tests.
|
||||
|
||||
* :func:`pick_query_seeds` — scans the session corpus's
|
||||
``.atomic_facts/`` md files and returns a list of
|
||||
``(owner_id, fact_text)`` tuples to use as deterministic search
|
||||
queries. Bootstrapping queries off the corpus's own extraction
|
||||
output gives us a closed-loop correctness signal — what was
|
||||
written should be findable.
|
||||
|
||||
* :func:`assert_recall` — the canonical "this search returned at
|
||||
least one sensible hit for ``owner``" assertion bundle. Used by
|
||||
the keyword / vector / hybrid recall tests so the assertion logic
|
||||
is in one place.
|
||||
|
||||
* :func:`flatten_hits` — collapses ``SearchData``'s four arrays into
|
||||
one ``(owner_id, score, text)`` tuple list for relevance checks.
|
||||
|
||||
The helpers do **not** hardcode topical keywords ("hiking" / "work")
|
||||
— they are derived from what the pipeline produced. This keeps the
|
||||
suite stable across LLM-driven boundary-cut variance.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
# Upper bound on the number of fact strings sampled per call; issuing a
# query for every extracted fact would blow the LLM rerank budget.
_DEFAULT_SEED_LIMIT = 3

# Tokens are runs of word characters. The token helpers below lowercase
# them and discard anything shorter than ``_MIN_TOKEN_LEN`` or listed in
# ``_STOPWORDS`` — such tokens carry no signal for the "content overlap"
# check.
_TOKEN_RE = re.compile(r"\w+", re.UNICODE)
_MIN_TOKEN_LEN = 3
_STOPWORDS: frozenset[str] = frozenset(
    (
        "the and for that with this was has have are "
        "but from you she her his him they them their"
    ).split()
)
|
||||
|
||||
|
||||
# ── Query seed extraction ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def pick_query_seeds(
    memory_root: Path,
    *,
    limit: int = _DEFAULT_SEED_LIMIT,
) -> list[tuple[str, str]]:
    """Collect up to ``limit`` ``(owner_id, fact_text)`` seeds from the corpus.

    Looks under ``<memory_root>/default_app/default_project/users`` and, for
    each owner directory that has an ``.atomic_facts`` sub-directory, reads
    every ``*.md`` file found recursively below it. Each ``### Fact`` section
    body becomes one seed, paired with the owner directory's name.

    Owner directories and md files are both visited in sorted order, so the
    same corpus always yields the same seeds — a failing test then points at
    a real regression instead of query-rotation variance. Scanning stops as
    soon as ``limit`` seeds have been gathered.

    Args:
        memory_root: Root directory of the memory corpus on disk.
        limit: Number of seeds after which scanning stops.

    Returns:
        The collected ``(owner_id, fact_text)`` tuples, in discovery order.

    Raises:
        AssertionError: if the users directory is missing or no fact could
            be extracted — that's a fixture failure, not a test failure,
            and should fail loudly.
    """
    users_dir = memory_root / "default_app" / "default_project" / "users"
    if not users_dir.is_dir():
        raise AssertionError(f"expected {users_dir} to exist after ingest")

    def _iter_seeds():
        # Lazy on purpose: files past the point where ``limit`` is reached
        # are never opened.
        for owner_dir in sorted(users_dir.iterdir()):
            if not owner_dir.is_dir():
                continue
            facts_dir = owner_dir / ".atomic_facts"
            if not facts_dir.is_dir():
                continue
            for md_path in sorted(facts_dir.rglob("*.md")):
                for fact_text in _extract_fact_sections(md_path):
                    if fact_text:
                        yield owner_dir.name, fact_text

    seeds: list[tuple[str, str]] = []
    for seed in _iter_seeds():
        seeds.append(seed)
        if len(seeds) >= limit:
            return seeds

    if not seeds:
        raise AssertionError(
            f"no atomic_fact md entries under {users_dir} — pipeline did "
            "not produce any facts; cannot bootstrap search queries"
        )
    return seeds
|
||||
|
||||
|
||||
def _extract_fact_sections(md: Path) -> list[str]:
    """Collect the body text of every ``### Fact`` section in an md file.

    The file is scanned line by line. A line whose left-stripped text
    starts with ``### Fact`` opens a section; the section runs until the
    next line that starts with ``#`` (a heading of any level, including
    another ``### Fact``), a line starting with ``<!-- /entry``, or the end
    of the file.

    Args:
        md: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        Section bodies in file order, each stripped of surrounding
        whitespace; sections that are empty after stripping are dropped.
    """
    collected: list[str] = []
    # ``None`` means "not inside a Fact section"; a list is the open buffer.
    current: list[str] | None = None

    def _close() -> None:
        # Flush the open buffer (if any), keeping only non-empty bodies.
        if current is None:
            return
        text = "\n".join(current).strip()
        if text:
            collected.append(text)

    for raw in md.read_text(encoding="utf-8").splitlines():
        head = raw.lstrip()
        if head.startswith("### Fact"):
            _close()
            current = []
        elif current is None:
            continue
        elif head.startswith(("#", "<!-- /entry")):
            # Any other heading or the entry-end marker ends the section.
            _close()
            current = None
        else:
            current.append(raw)

    _close()
    return collected
|
||||
|
||||
|
||||
# ── Response flattening + assertions ────────────────────────────────────
|
||||
|
||||
|
||||
def flatten_hits(data: dict[str, Any]) -> list[tuple[str | None, float, str]]:
|
||||
"""Collapse ``SearchData``'s four arrays into ``(owner_id, score, text)``.
|
||||
|
||||
Stable shape across track-kinds so the recall / partition tests
|
||||
don't have to branch. Episodes / profiles carry ``user_id`` on the
|
||||
item; cases / skills carry ``agent_id`` — both project to the
|
||||
generic ``owner`` slot here. ``owner`` may be ``None`` for profile
|
||||
hits where the owner is implicit.
|
||||
"""
|
||||
out: list[tuple[str | None, float, str]] = []
|
||||
for ep in data.get("episodes", []):
|
||||
out.append(
|
||||
(
|
||||
ep.get("user_id"),
|
||||
float(ep.get("score") or 0.0),
|
||||
ep.get("episode") or ep.get("summary") or ep.get("subject") or "",
|
||||
)
|
||||
)
|
||||
for pf in data.get("profiles", []):
|
||||
out.append(
|
||||
(
|
||||
pf.get("user_id"),
|
||||
float(pf.get("score") or 0.0),
|
||||
str(pf.get("profile_data") or ""),
|
||||
)
|
||||
)
|
||||
for cs in data.get("agent_cases", []):
|
||||
out.append(
|
||||
(
|
||||
cs.get("agent_id"),
|
||||
float(cs.get("score") or 0.0),
|
||||
cs.get("approach") or cs.get("task_intent") or "",
|
||||
)
|
||||
)
|
||||
for sk in data.get("agent_skills", []):
|
||||
out.append(
|
||||
(
|
||||
sk.get("agent_id"),
|
||||
float(sk.get("score") or 0.0),
|
||||
sk.get("content") or sk.get("description") or "",
|
||||
)
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
async def assert_recall(
    client: httpx.AsyncClient,
    *,
    owner_id: str,
    query: str,
    method: str,
    min_score: float = 0.0,
    top_k: int = 10,
) -> dict[str, Any]:
    """POST one search request and assert the four standard recall invariants.

    1. **Status** — the response is HTTP 200.
    2. **Existence** — at least one hit across the four result arrays.
    3. **Owner partition** — every hit that carries an owner carries
       ``owner_id``; hits whose owner is ``None`` (e.g. profile hits) are
       exempt.
    4. **Score sanity** — the best score among the hits is at least
       ``min_score``.

    NOTE(review): this helper sends the owner under the ``user_id`` key,
    while ``_run_full_report.run_probes`` posts ``owner_id`` / ``owner_type``
    to the same route — confirm which request schema is current.

    Args:
        client: Async HTTP client bound to the app under test.
        owner_id: Owner whose memories are being searched.
        query: Search query text.
        method: Retrieval method forwarded to the API unchanged.
        min_score: Lower bound for the top hit's score.
        top_k: Maximum number of hits requested.

    Returns:
        The parsed response body, so callers can layer case-specific
        assertions on top.

    Raises:
        AssertionError: when any of the four invariants is violated.
    """
    payload = {
        "user_id": owner_id,
        "query": query,
        "method": method,
        "top_k": top_k,
    }
    resp = await client.post("/api/v1/memory/search", json=payload, timeout=120.0)
    assert resp.status_code == 200, resp.text

    body = resp.json()
    hits = flatten_hits(body["data"])
    assert hits, (
        f"no hits for owner={owner_id} query={query!r} method={method} — "
        "recall is broken"
    )

    # Only hits that actually name an owner take part in the partition check.
    for hit_owner in (owner for owner, _score, _text in hits if owner is not None):
        assert hit_owner == owner_id, (
            f"partition leak: got owner={hit_owner!r} when querying {owner_id!r}"
        )

    top_score = max(hit[1] for hit in hits)
    assert top_score >= min_score, (
        f"top hit score {top_score:.3f} < min {min_score} for method={method}"
    )
    return body
|
||||
|
||||
|
||||
# ── Token utilities (for content-overlap checks) ────────────────────────
|
||||
|
||||
|
||||
def query_tokens(query: str) -> set[str]:
    """Return the distinct lowercase content tokens of ``query``.

    A token is kept when it is at least ``_MIN_TOKEN_LEN`` characters long
    and its lowercase form is not a stopword; the result is what the tests
    look for when checking overlap with hit text.
    """
    kept: set[str] = set()
    for raw in _TOKEN_RE.findall(query):
        if len(raw) < _MIN_TOKEN_LEN:
            continue
        lowered = raw.lower()
        if lowered not in _STOPWORDS:
            kept.add(lowered)
    return kept
|
||||
|
||||
|
||||
def content_tokens_in_order(query: str) -> list[str]:
    """Return lowercase content tokens in document order, without repeats.

    Applies the same length and stopword filter as the set-based token
    helper but preserves the position of each token's first occurrence.

    The keyword test relies on this ordering: the project's BM25 tokenizer
    (jieba) is Chinese-first and degrades to near-zero recall on single
    short English tokens, whereas multi-token phrases recall well in
    practice. Keyword queries are therefore built from consecutive content
    tokens of the source fact rather than from an alphabetical sort.
    """
    lowered = (
        raw.lower()
        for raw in _TOKEN_RE.findall(query)
        if len(raw) >= _MIN_TOKEN_LEN
    )
    # dict keys keep insertion order, so the first occurrence of each token wins.
    return list(dict.fromkeys(tok for tok in lowered if tok not in _STOPWORDS))
|
||||
83
tests/integration/search/_rerun_probes.py
Normal file
83
tests/integration/search/_rerun_probes.py
Normal file
@ -0,0 +1,83 @@
|
||||
"""Re-run probes against an existing corpus + regenerate the report.
|
||||
|
||||
Reuses everything from :mod:`_run_full_report` except the ingest step —
|
||||
points at the already-populated ``~/.everos-report-corpus`` and only
|
||||
re-runs the search probes + report rendering. Useful when the corpus
|
||||
is already there from a previous run and you just want to refresh the
|
||||
retrieval section without paying for LLM ingestion again.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Resolve the project root from this file's location
# (tests/integration/search/<file> → three directories up) and load its
# ``.env`` before the ``_run_full_report`` / ``everos`` imports that follow.
# ``override=False`` leaves variables that are already set in the
# environment untouched.
_PROJECT_ROOT = Path(__file__).resolve().parents[3]
load_dotenv(_PROJECT_ROOT / ".env", override=False)
|
||||
|
||||
|
||||
from _run_full_report import ( # noqa: E402
|
||||
CONVERSATION,
|
||||
CORPUS_ROOT,
|
||||
REPORT_PATH,
|
||||
inspect_artifacts,
|
||||
render_report,
|
||||
run_probes,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
    """Re-run the search probes against the existing corpus and rewrite the report.

    Steps:

    1. Verify the corpus directory is populated, point
       ``EVEROS_MEMORY__ROOT`` at it and clear the cached settings so the
       app picks the new root up.
    2. Start the app in-process (ASGI transport, lifespan entered), read
       the on-disk artifacts and execute every probe.
    3. Render the report with placeholder ingest / cascade summaries (no
       ingest happens here) and write it to ``REPORT_PATH``.

    Raises:
        SystemExit: if the corpus has no users directory, i.e. the full
            report script has not populated it yet.
    """
    # The artifact inspection imported above reads users from
    # ``<root>/default_app/default_project/users``; the bare ``<root>/users``
    # path is kept as a fallback so a corpus the previous guard accepted
    # still passes.
    users_dirs = (
        CORPUS_ROOT / "default_app" / "default_project" / "users",
        CORPUS_ROOT / "users",
    )
    if not any(candidate.is_dir() for candidate in users_dirs):
        raise SystemExit(f"{CORPUS_ROOT} not populated — run _run_full_report.py first")
    os.environ["EVEROS_MEMORY__ROOT"] = str(CORPUS_ROOT)
    from everos.config import load_settings

    # Drop the cached settings so the env var set above takes effect.
    load_settings.cache_clear()

    print(f"[1/3] using corpus at {CORPUS_ROOT}")

    from everos.entrypoints.api.app import create_app

    app = create_app()
    transport = httpx.ASGITransport(app=app)

    async with (
        app.router.lifespan_context(app),
        httpx.AsyncClient(transport=transport, base_url="http://test") as client,
    ):
        print("[2/3] inspecting artifacts + running probes ...")
        artifacts = await inspect_artifacts(CORPUS_ROOT)
        probes = await run_probes(client)

        print("[3/3] re-rendering report ...")
        md = render_report(
            memory_root=CORPUS_ROOT,
            # Nothing is ingested in this script; mirror the conversation
            # batches with a "cached" status so the report table keeps its
            # shape.
            ingest_summary={
                "batches": [
                    {
                        "idx": i,
                        "msg_count": len(b),
                        "status": "extracted (cached)",
                        "returned_count": len(b),
                    }
                    for i, b in enumerate(CONVERSATION)
                ],
                "flush_status": "extracted (cached)",
            },
            cascade_summary={
                "note": "cascade was force-completed via _rerun_probes.py "
                "after initial run; counts below are post-completion."
            },
            artifacts=artifacts,
            probes=probes,
        )
        REPORT_PATH.write_text(md, encoding="utf-8")
        print(f" → {REPORT_PATH}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
660
tests/integration/search/_run_full_report.py
Normal file
660
tests/integration/search/_run_full_report.py
Normal file
@ -0,0 +1,660 @@
|
||||
"""End-to-end report generator: fresh corpus → ingest → retrieve → markdown report.
|
||||
|
||||
Run with::
|
||||
|
||||
PYTHONPATH=src python tests/integration/search/_run_full_report.py
|
||||
|
||||
Writes a fresh ``~/.everos-report-corpus/`` memory_root, runs a small
|
||||
synthetic 16-message conversation between two new users (``u_diana`` +
|
||||
``u_ethan``) through ``/add`` + ``/flush``, waits for cascade drain, then
|
||||
runs a curated set of search probes and dumps a structured markdown
|
||||
report to ``tests/integration/search/SEARCH_REPORT.md``.
|
||||
|
||||
Not a pytest test — pure investigative script, real LLM, real embedder.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load .env BEFORE any everos import so settings are correct.
# ``parents[3]`` walks up from tests/integration/search/ to the project
# root; ``override=False`` leaves variables that are already set in the
# environment untouched.
_PROJECT_ROOT = Path(__file__).resolve().parents[3]
load_dotenv(_PROJECT_ROOT / ".env", override=False)


# ── Corpus location ────────────────────────────────────────────────────

# Dedicated memory root for report runs. ``main()`` deletes and recreates
# this directory on every run, so it must never point at data worth keeping.
CORPUS_ROOT = Path.home() / ".everos-report-corpus"
# Destination of the rendered markdown report.
REPORT_PATH = _PROJECT_ROOT / "tests/integration/search/SEARCH_REPORT.md"
# Session id sent with every /add and /flush request and echoed in the report.
SESSION_ID = "report_session_diana_ethan"
|
||||
|
||||
|
||||
# ── Synthetic conversation (16 msgs, 2 batches) ────────────────────────
|
||||
|
||||
|
||||
# Two batches of eight messages each; every inner list is posted as one
# ``/add`` request. Timestamps are epoch milliseconds, one minute apart
# inside a batch, with batch 2 starting one hour after batch 1. They must
# increase monotonically across the whole conversation — the opening
# message previously carried a timestamp two hours *after* the start,
# which placed it later than every other message in both batches.
CONVERSATION = [
    # Batch 1 — introducing hobbies
    [
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407200000,
            "content": "Hey Ethan! Just got back from a 3-day hike in Yosemite. "
            "My new Sony A7 camera is amazing for landscape shots.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407260000,
            "content": "Wow that sounds intense! I'd never survive without my "
            "espresso. How's the Rust programming learning going?",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407320000,
            "content": "Slow but steady. Working through the official book. "
            "The borrow checker still trips me up.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407380000,
            "content": "I'm marathon training — up to 15 miles long runs now. "
            "Plus I joined a jazz quartet on weekends.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407440000,
            "content": "That's awesome! Saxophone again?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407500000,
            "content": "Yeah, alto sax. We're playing at the Blue Note next month.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407560000,
            "content": "I'll come watch! Speaking of trips, want to do "
            "that Iceland thing this summer?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407620000,
            "content": "100% yes. I've been researching ring road photography spots.",
        },
    ],
    # Batch 2 — Iceland trip planning
    [
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778410800000,
            "content": "I want to see the Northern Lights and shoot some "
            "volcanic landscapes.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778410860000,
            "content": "We should rent a 4x4. The F-roads are insane I hear.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778410920000,
            "content": "And I want to try Icelandic lamb stew. You cook, right?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778410980000,
            "content": (
                "Yeah, I'll bring my Dutch oven. Maybe a cast iron pan for fish."
            ),
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778411040000,
            "content": "Perfect. Mid-July works for me — I have a Rust conference "
            "in late August.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778411100000,
            "content": "July it is. I have the Boston Marathon qualifier in October "
            "so I can't go after.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778411160000,
            "content": "Let's book flights this weekend?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778411220000,
            "content": "Deal. Also bringing my Olympus E-M1 for the landscapes.",
        },
    ],
]
|
||||
|
||||
|
||||
# ── Probe set ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# One row per probe: (section, owner, query, method, expect[, extra fields]).
# Rows are expanded into dicts with the keys ``section`` / ``owner`` /
# ``query`` / ``method`` / ``expect``; optional extra fields (currently only
# ``include_profile``) are placed just before ``expect``. The ``expect``
# text is both shown in the report and pattern-matched by the grader.
PROBES: list[dict] = [
    {
        "section": section,
        "owner": owner,
        "query": query,
        "method": method,
        **(extras[0] if extras else {}),
        "expect": expect,
    }
    for section, owner, query, method, expect, *extras in (
        # Owner-specific topical: should recall the right owner's episodes.
        (
            "Owner-specific topical (diana)",
            "u_diana",
            "hiking",
            "hybrid",
            "diana's Yosemite episode",
        ),
        (
            "Owner-specific topical (diana)",
            "u_diana",
            "Rust programming",
            "hybrid",
            "diana's Rust learning facts",
        ),
        (
            "Owner-specific topical (diana)",
            "u_diana",
            "photography",
            "hybrid",
            "diana's camera (Sony A7) facts",
        ),
        (
            "Owner-specific topical (ethan)",
            "u_ethan",
            "jazz",
            "hybrid",
            "ethan's jazz quartet / sax facts",
        ),
        (
            "Owner-specific topical (ethan)",
            "u_ethan",
            "marathon training",
            "hybrid",
            "ethan's marathon facts",
        ),
        (
            "Owner-specific topical (ethan)",
            "u_ethan",
            "cooking",
            "hybrid",
            "ethan's Dutch oven / lamb stew facts",
        ),
        # Shared topic — both should recall their own perspective.
        (
            "Shared topic (Iceland)",
            "u_diana",
            "Iceland trip",
            "hybrid",
            "diana's planning episode",
        ),
        (
            "Shared topic (Iceland)",
            "u_ethan",
            "Iceland trip",
            "hybrid",
            "ethan's planning episode",
        ),
        # Method comparison on the same query.
        (
            "Method comparison (diana + 'Rust')",
            "u_diana",
            "Rust",
            "keyword",
            "BM25 single token",
        ),
        (
            "Method comparison (diana + 'Rust')",
            "u_diana",
            "Rust",
            "vector",
            "cosine ANN",
        ),
        (
            "Method comparison (diana + 'Rust')",
            "u_diana",
            "Rust",
            "hybrid",
            "fusion of BM25 + vector",
        ),
        # Owner partition: diana searching for ethan's exclusive topic.
        (
            "Owner partition",
            "u_diana",
            "jazz quartet",
            "hybrid",
            "should NOT leak ethan's content",
        ),
        (
            "Owner partition",
            "u_ethan",
            "Rust programming",
            "hybrid",
            "should NOT leak diana's content",
        ),
        # Phrase + bigram.
        (
            "Phrase queries",
            "u_diana",
            "Northern Lights",
            "keyword",
            "diana's Iceland aurora plans",
        ),
        (
            "Phrase queries",
            "u_ethan",
            "Boston Marathon",
            "keyword",
            "ethan's qualifier date",
        ),
        # include_profile.
        (
            "Profile attach",
            "u_diana",
            "anything",
            "hybrid",
            "should return diana's profile object",
            {"include_profile": True},
        ),
        # Unknown owner.
        (
            "Unknown owner",
            "u_ghost_does_not_exist",
            "hiking",
            "hybrid",
            "empty arrays, status 200",
        ),
        # Non-existent term.
        (
            "Non-existent term",
            "u_diana",
            "quantum blockchain pizza",
            "keyword",
            "0 hits, status 200",
        ),
    )
]
|
||||
|
||||
|
||||
# ── Pipeline runners ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def ingest(client: httpx.AsyncClient) -> dict:
    """Feed the synthetic conversation through ``/add`` and ``/flush``.

    Each batch of ``CONVERSATION`` is posted to ``/api/v1/memory/add`` under
    ``SESSION_ID``; afterwards a single ``/api/v1/memory/flush`` is issued
    for the same session. Any non-success HTTP status aborts the run via
    ``raise_for_status``.

    Args:
        client: Async HTTP client bound to the app under test.

    Returns:
        ``{"batches": [...], "flush_status": ...}`` where each batch entry
        holds ``idx``, ``msg_count`` (messages sent), ``status`` and
        ``returned_count`` (the ``message_count`` reported by the API).
    """
    batch_rows: list[dict] = []
    for idx, messages in enumerate(CONVERSATION):
        add_resp = await client.post(
            "/api/v1/memory/add",
            json={"session_id": SESSION_ID, "messages": messages},
            timeout=600.0,
        )
        add_resp.raise_for_status()
        add_data = add_resp.json()["data"]
        batch_rows.append(
            {
                "idx": idx,
                "msg_count": len(messages),
                "status": add_data["status"],
                "returned_count": add_data["message_count"],
            }
        )

    flush_resp = await client.post(
        "/api/v1/memory/flush",
        json={"session_id": SESSION_ID},
        timeout=600.0,
    )
    flush_resp.raise_for_status()
    return {
        "batches": batch_rows,
        "flush_status": flush_resp.json()["data"]["status"],
    }
|
||||
|
||||
|
||||
async def wait_cascade(
    *,
    expected_md_paths: int = 8,
    stable_checks: int = 5,
    deadline_seconds: float = 600.0,
) -> dict:
    """Poll the md-change queue until the cascade is *stably* drained.

    Checking ``pending == 0`` once is racy: the async OME strategies
    (extract_foresight / extract_user_profile) keep emitting md writes
    after ``/flush`` has returned, so the queue can look empty for a moment
    before those writes arrive. A poll therefore only counts as "drained"
    when both hold:

    1. The queue has seen at least ``expected_md_paths`` rows in total
       (pending + done + failed). The default of 8 is one row per
       owner × kind — episodes, atomic_facts, foresights and user_profile
       for each of the two owners — and guards against returning before
       OME has emitted anything.
    2. ``pending`` is zero.

    The function returns once ``stable_checks`` consecutive polls, taken
    one second apart, were drained; any non-drained poll resets the streak.

    Args:
        expected_md_paths: Minimum total number of queue rows required.
        stable_checks: Number of consecutive drained polls required.
        deadline_seconds: Overall time budget for the wait.

    Returns:
        The final queue counters: ``done``, ``failed_retryable``,
        ``failed_permanent``, ``max_lsn`` and ``last_processed_lsn``.

    Raises:
        TimeoutError: if the queue is not stably drained within
            ``deadline_seconds``.
    """
    from everos.infra.persistence.sqlite import md_change_state_repo

    streak = 0
    async with asyncio.timeout(deadline_seconds):
        while True:
            summary = await md_change_state_repo.queue_summary()
            rows_seen = sum(
                (
                    summary.pending,
                    summary.done,
                    summary.failed_retryable,
                    summary.failed_permanent,
                )
            )
            drained = summary.pending == 0 and rows_seen >= expected_md_paths
            streak = streak + 1 if drained else 0
            if drained and streak >= stable_checks:
                return {
                    "done": summary.done,
                    "failed_retryable": summary.failed_retryable,
                    "failed_permanent": summary.failed_permanent,
                    "max_lsn": summary.max_lsn,
                    "last_processed_lsn": summary.last_processed_lsn,
                }
            await asyncio.sleep(1.0)
|
||||
|
||||
|
||||
async def inspect_artifacts(memory_root: Path) -> dict:
    """Collect LanceDB row counts and the list of md files on disk.

    Opens the LanceDB connection, verifies the business schemas and counts
    the rows of the episode, atomic-fact, foresight and user-profile
    tables. The connection is disposed afterwards even when verification
    or counting fails, so an error does not leave it open. Then every
    ``*.md`` file below ``<memory_root>/default_app/default_project/users``
    is listed.

    Args:
        memory_root: Root directory of the memory corpus on disk.

    Returns:
        A dict with ``episode_rows``, ``atomic_fact_rows``,
        ``foresight_rows`` and ``user_profile_rows``, plus ``md_files``: the
        sorted md paths relative to ``memory_root`` (empty when the users
        directory does not exist).
    """
    from everos.infra.persistence.lancedb import (
        atomic_fact_repo,
        dispose_connection,
        episode_repo,
        foresight_repo,
        get_connection,
        user_profile_repo,
        verify_business_schemas,
    )

    await get_connection()
    try:
        await verify_business_schemas()
        counts: dict = {
            "episode_rows": await episode_repo.count(),
            "atomic_fact_rows": await atomic_fact_repo.count(),
            "foresight_rows": await foresight_repo.count(),
            "user_profile_rows": await user_profile_repo.count(),
        }
    finally:
        # Release the connection on the error path as well.
        await dispose_connection()

    users_dir = memory_root / "default_app" / "default_project" / "users"
    md_files: list[str] = []
    if users_dir.is_dir():
        md_files = [
            str(f.relative_to(memory_root)) for f in sorted(users_dir.rglob("*.md"))
        ]
    counts["md_files"] = md_files
    return counts
|
||||
|
||||
|
||||
async def run_probes(client: httpx.AsyncClient) -> list[dict]:
    """Execute every probe in :data:`PROBES`; return captured rows.

    Each probe becomes one POST to ``/api/v1/memory/search`` with
    ``top_k=5``. The HTTP status is recorded rather than enforced, so a
    failing probe shows up in the report instead of aborting the run. For
    the same reason a body that is not JSON, is not an object, or has a
    missing / ``null`` ``data`` member is recorded as a row with no
    episodes and no profiles.

    NOTE(review): the request uses ``owner_id`` / ``owner_type`` and
    episodes are read via ``e["owner_id"]``, whereas
    ``_helpers.assert_recall`` / ``flatten_hits`` use ``user_id`` — confirm
    which schema the API currently exposes.

    Args:
        client: Async HTTP client bound to the app under test.

    Returns:
        One dict per probe with ``section``, ``expect``, ``request`` (the
        payload sent), ``status``, ``episodes`` (id, owner_id, score rounded
        to 3 decimals, summary cut to 150 chars, atomic_facts_count) and
        ``profiles`` (owner_id, score, ``profile_data`` stringified and cut
        to 200 chars).
    """
    rows: list[dict] = []
    for p in PROBES:
        payload: dict = {
            "owner_id": p["owner"],
            "owner_type": "user",
            "query": p["query"],
            "method": p["method"],
            "top_k": 5,
        }
        if p.get("include_profile"):
            payload["include_profile"] = True
        resp = await client.post("/api/v1/memory/search", json=payload, timeout=120.0)

        # Error responses may carry a non-JSON body or ``"data": null``.
        try:
            body = resp.json()
        except ValueError:
            body = {}
        data = (body.get("data") if isinstance(body, dict) else None) or {}

        rows.append(
            {
                "section": p["section"],
                "expect": p["expect"],
                "request": payload,
                "status": resp.status_code,
                "episodes": [
                    {
                        "id": e["id"],
                        "owner_id": e["owner_id"],
                        "score": round(float(e["score"]), 3),
                        "summary": (e.get("summary") or "")[:150],
                        "atomic_facts_count": len(e.get("atomic_facts") or []),
                    }
                    for e in data.get("episodes") or []
                ],
                "profiles": [
                    {
                        "owner_id": p_.get("owner_id"),
                        "score": p_.get("score"),
                        "summary_excerpt": str(p_.get("profile_data", {}))[:200],
                    }
                    for p_ in data.get("profiles") or []
                ],
            }
        )
    return rows
|
||||
|
||||
|
||||
# ── Markdown report renderer ───────────────────────────────────────────
|
||||
|
||||
|
||||
def render_report(
    *,
    memory_root: Path,
    ingest_summary: dict,
    cascade_summary: dict,
    artifacts: dict,
    probes: list[dict],
) -> str:
    """Render the full search report as one markdown string.

    Sections, in order: setup, ingest stats, cascade drain numbers, on-disk
    artifacts, per-probe retrieval results and a pass/fail summary.

    Args:
        memory_root: Corpus root, shown in the setup section.
        ingest_summary: Dict with ``batches`` (each having ``idx``,
            ``msg_count``, ``status``) and ``flush_status``.
        cascade_summary: JSON-serialisable dict, dumped verbatim.
        artifacts: Dict with the four ``*_rows`` counters and ``md_files``.
        probes: Captured probe rows (``section``, ``expect``, ``request``,
            ``status``, ``episodes``, ``profiles``).

    Returns:
        The markdown document.
    """
    lines: list[str] = []
    lines.append("# Search E2E Report — fresh corpus (u_diana + u_ethan)\n")
    lines.append(
        "Generated by [`_run_full_report.py`](_run_full_report.py). "
        "Two synthetic users with distinct hobbies feed a 16-message "
        "conversation through the full pipeline; the report below "
        "captures ingest stats, cascade drain numbers, on-disk "
        "artifacts, and the response of every curated search probe.\n"
    )

    # ── Section: Setup ────────────────────────────────────────────────
    lines.append("## 1. Setup\n")
    lines.append(f"- **Memory root**: `{memory_root}`\n")
    lines.append(f"- **Session id**: `{SESSION_ID}`\n")
    lines.append(
        "- **Users**: `u_diana` (hiking / Rust / photography), "
        "`u_ethan` (jazz / marathon / cooking)\n"
    )
    lines.append(
        f"- **Batches**: {len(CONVERSATION)} "
        f"({sum(len(b) for b in CONVERSATION)} messages total)\n"
    )

    # ── Section: Ingest stats ─────────────────────────────────────────
    lines.append("\n## 2. Ingest (`/add` × N + `/flush`)\n")
    lines.append("| batch | msg_count | status |\n")
    lines.append("|---|---|---|\n")
    for b in ingest_summary["batches"]:
        lines.append(f"| {b['idx']} | {b['msg_count']} | `{b['status']}` |\n")
    lines.append(f"\n**Flush status**: `{ingest_summary['flush_status']}`\n")

    # ── Section: Cascade drain ────────────────────────────────────────
    lines.append("\n## 3. Cascade drain (md → LanceDB sync)\n")
    lines.append("```\n")
    lines.append(json.dumps(cascade_summary, indent=2) + "\n")
    lines.append("```\n")

    # ── Section: Artifacts ────────────────────────────────────────────
    lines.append("\n## 4. On-disk artifacts\n")
    lines.append("### LanceDB row counts\n\n")
    lines.append("| table | rows |\n")
    lines.append("|---|---|\n")
    for k in (
        "episode_rows",
        "atomic_fact_rows",
        "foresight_rows",
        "user_profile_rows",
    ):
        lines.append(f"| {k.replace('_rows', '')} | {artifacts[k]} |\n")
    lines.append("\n### Markdown files\n\n")
    for f in artifacts["md_files"]:
        lines.append(f"- `{f}`\n")

    # ── Section: Probes ───────────────────────────────────────────────
    lines.append("\n## 5. Retrieval probes\n")
    lines.append(
        "Every row below is one POST to `/api/v1/memory/search`. "
        "`expected` is what the test designer expects to see; "
        "actual results are captured verbatim.\n"
    )
    current_section = None
    for row in probes:
        # Emit the section heading once per run of rows sharing a section.
        if row["section"] != current_section:
            lines.append(f"\n### {row['section']}\n")
            current_section = row["section"]
        req = row["request"]
        lines.append(
            f"\n#### `{req['query']}` (method=`{req['method']}`, "
            f"owner=`{req['owner_id']}`)\n"
        )
        lines.append(f"\n- **Expected**: {row['expect']}\n")
        lines.append(f"- **Status**: {row['status']}\n")
        lines.append(f"- **Episodes returned**: {len(row['episodes'])}\n")
        if row["episodes"]:
            lines.append("\n| rank | score | owner | atomic_facts | summary |\n")
            lines.append("|---|---|---|---|---|\n")
            for i, ep in enumerate(row["episodes"], 1):
                # A table cell must stay on one line and must not contain a
                # bare pipe: flatten line breaks first, then escape pipes.
                summary = " ".join(ep["summary"].splitlines()).replace("|", "\\|")
                lines.append(
                    f"| {i} | {ep['score']} | `{ep['owner_id']}` | "
                    f"{ep['atomic_facts_count']} | {summary} |\n"
                )
        else:
            lines.append("\n_(no episodes)_\n")
        if row["profiles"]:
            lines.append(
                "\n**Profile attached**: "
                f"`{row['profiles'][0]['owner_id']}` "
                f"(excerpt: {row['profiles'][0]['summary_excerpt']!r})\n"
            )

    # ── Section: Pass/Fail summary ────────────────────────────────────
    lines.append("\n## 6. Pass / Fail summary\n")
    pf = _grade(probes)
    lines.append("| # | section | query | result |\n")
    lines.append("|---|---|---|---|\n")
    for r in pf:
        lines.append(
            f"| {r['idx']} | {r['section']} | `{r['query']}` | {r['verdict']} |\n"
        )
    passed = sum(1 for r in pf if r["verdict"].startswith("✅"))
    lines.append(f"\n**Total: {passed}/{len(pf)} passed.**\n")

    return "".join(lines)
|
||||
|
||||
|
||||
def _grade(probes: list[dict]) -> list[dict]:
|
||||
"""Apply soft heuristic pass/fail to each probe based on its 'expect'."""
|
||||
graded: list[dict] = []
|
||||
for i, row in enumerate(probes, 1):
|
||||
req = row["request"]
|
||||
expect = row["expect"].lower()
|
||||
verdict = "—"
|
||||
if "should not leak" in expect:
|
||||
leaked = any(ep["owner_id"] != req["owner_id"] for ep in row["episodes"])
|
||||
verdict = "❌ leaked" if leaked else "✅ no leak"
|
||||
elif "empty arrays" in expect or "0 hits" in expect:
|
||||
verdict = "✅" if not row["episodes"] else f"❌ got {len(row['episodes'])}"
|
||||
elif "profile" in expect:
|
||||
verdict = "✅" if row["profiles"] else "❌ no profile"
|
||||
elif row["episodes"]:
|
||||
top_owner = row["episodes"][0]["owner_id"]
|
||||
verdict = (
|
||||
"✅" if top_owner == req["owner_id"] else f"❌ wrong owner: {top_owner}"
|
||||
)
|
||||
else:
|
||||
verdict = "❌ no hits"
|
||||
graded.append(
|
||||
{
|
||||
"idx": i,
|
||||
"section": row["section"],
|
||||
"query": req["query"],
|
||||
"verdict": verdict,
|
||||
}
|
||||
)
|
||||
return graded
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def main() -> None:
    """Rebuild the corpus from scratch, run the search probes, write the report.

    The six ``[n/6]`` progress lines mirror the stages: wipe and recreate
    ``CORPUS_ROOT``, ingest through the HTTP API, wait for the cascade to
    drain, inspect on-disk artifacts, run every entry of ``PROBES``, and
    render the markdown report to ``REPORT_PATH``.

    Side effects: deletes any existing ``CORPUS_ROOT`` directory, sets
    ``EVEROS_MEMORY__ROOT`` in ``os.environ``, and overwrites
    ``REPORT_PATH``.
    """
    # Reset corpus to a known empty state.
    if CORPUS_ROOT.exists():
        shutil.rmtree(CORPUS_ROOT)
    CORPUS_ROOT.mkdir(parents=True)
    os.environ["EVEROS_MEMORY__ROOT"] = str(CORPUS_ROOT)

    # Reset cached singletons so they pick up the new env.
    from everos.config import load_settings

    load_settings.cache_clear()

    print(f"[1/6] fresh corpus at {CORPUS_ROOT}")

    # Imported only after the environment is prepared, as in the original.
    from everos.entrypoints.api.app import create_app

    app = create_app()
    transport = httpx.ASGITransport(app=app)

    async with (
        app.router.lifespan_context(app),
        httpx.AsyncClient(transport=transport, base_url="http://test") as client,
    ):
        print("[2/6] ingesting via /add + /flush ...")
        ingest_summary = await ingest(client)
        print(f" batches={ingest_summary['batches']}")

        print("[3/6] waiting for cascade drain ...")
        cascade_summary = await wait_cascade()
        print(f" drained: {cascade_summary}")

        print("[4/6] inspecting on-disk artifacts ...")
        artifacts = await inspect_artifacts(CORPUS_ROOT)
        # Fix: this line used to be a plain string literal (missing ``f``
        # prefix), so it printed the comprehension source text instead of
        # the actual ``*_rows`` counts.
        row_counts = {k: v for k, v in artifacts.items() if k.endswith("_rows")}
        print(f" lancedb: {row_counts}")

        print(f"[5/6] running {len(PROBES)} search probes ...")
        probes = await run_probes(client)

        print("[6/6] rendering report ...")
        md = render_report(
            memory_root=CORPUS_ROOT,
            ingest_summary=ingest_summary,
            cascade_summary=cascade_summary,
            artifacts=artifacts,
            probes=probes,
        )
        REPORT_PATH.write_text(md, encoding="utf-8")
        print(f" → {REPORT_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
269
tests/integration/search/conftest.py
Normal file
269
tests/integration/search/conftest.py
Normal file
@ -0,0 +1,269 @@
|
||||
"""Session-scoped corpus fixture for ``tests/integration/search/``.
|
||||
|
||||
The pipeline that produces the search corpus (`/add` × 19 + `/flush` +
|
||||
cascade drain) is the same one exercised by
|
||||
``tests/integration/test_add_flush_pipeline_e2e.py`` — and it costs
|
||||
~10 minutes against real LLMs. To keep the search test suite usable
|
||||
in CI we run that pipeline **once per session** here, persist the
|
||||
resulting memory_root to a session ``tmp_path``, and let every test
|
||||
re-attach a fresh FastAPI lifespan against the on-disk corpus.
|
||||
|
||||
Layout::
|
||||
|
||||
_ingested_memory_root (session-scoped)
|
||||
└── ingests LoCoMo conv_0 via the HTTP API, then tears
|
||||
lifespan down. Returns the memory_root path with md +
|
||||
sqlite + lancedb populated on disk.
|
||||
|
||||
search_client (function-scoped)
|
||||
└── per-test ``httpx.AsyncClient`` wired to a freshly built
|
||||
FastAPI app, ``EVEROS_MEMORY__ROOT`` pointed at the
|
||||
session corpus. Singletons are reset so each test starts
|
||||
with cold caches and the lifespan is the only thing
|
||||
constructing them.
|
||||
|
||||
This is intentionally separate from ``tests/integration/conftest.py``
|
||||
fixtures (which are function-scoped). Cross-suite isolation: tests
|
||||
under ``search/`` cannot poison or be poisoned by the ones above.
|
||||
|
||||
All tests in this folder are marked ``slow`` via the module-level
|
||||
``pytestmark`` in ``test_search_e2e.py`` — a non-``-m slow`` run skips
|
||||
the whole suite cleanly without paying the ingest cost.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import os
|
||||
from collections.abc import AsyncIterator, Awaitable, Callable, Generator
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from sqlalchemy import text
|
||||
|
||||
# Set ``EVEROS_REUSE_CORPUS=<path>`` to skip ingest and point the
|
||||
# session fixture at an existing memory_root (md + lancedb already
|
||||
# populated). Search is a read-only path, so no copy is needed — the
|
||||
# fixture just sets ``EVEROS_MEMORY__ROOT`` to that directory.
|
||||
_REUSE_ENV = "EVEROS_REUSE_CORPUS"
|
||||
|
||||
# Memorize-service module-level lazy singletons; reset between phases so
|
||||
# stale clients / engines don't leak from ingest into per-test lifespans.
|
||||
_MEMORIZE_SINGLETONS: tuple[str, ...] = (
|
||||
"_episode_writer",
|
||||
"_prompt_loader",
|
||||
"_user_pipeline",
|
||||
"_agent_pipeline",
|
||||
"_ome_engine",
|
||||
)
|
||||
|
||||
|
||||
# ── Session-scoped MonkeyPatch ─────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def _session_monkeypatch() -> Generator[pytest.MonkeyPatch, None, None]:
    """Yield one ``MonkeyPatch`` that lives for the whole test session.

    The built-in ``monkeypatch`` fixture is function-scoped, so the
    session-scoped ingest fixture cannot request it. This fixture owns
    its own instance instead: env vars and attribute patches applied
    through it stay in force until session teardown, at which point
    they are all undone.
    """
    with pytest.MonkeyPatch.context() as session_mp:
        yield session_mp
|
||||
|
||||
|
||||
# ── Singleton reset helper ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def _reset_memorize_singletons(mp: pytest.MonkeyPatch) -> None:
    """Clear the settings cache and null the lazy module-level singletons.

    Used in two places: before the session ingest (so the freshly set
    ``EVEROS_MEMORY__ROOT`` takes effect) and before each test (so the
    per-test lifespan starts from clean caches).

    Args:
        mp: The ``MonkeyPatch`` that records the attribute overrides;
            its scope decides when they are rolled back.
    """
    from everos.config import load_settings

    load_settings.cache_clear()

    # Import every target module first (same order as before), then patch.
    memorize_svc, llm_client_mod, atomic_facts_mod, foresight_mod = (
        importlib.import_module(dotted)
        for dotted in (
            "everos.service.memorize",
            "everos.component.llm.client",
            "everos.memory.strategies.extract_atomic_facts",
            "everos.memory.strategies.extract_foresight",
        )
    )

    targets = [(memorize_svc, name) for name in _MEMORIZE_SINGLETONS]
    targets += [
        (llm_client_mod, "_llm_client"),
        (atomic_facts_mod, "_writer"),
        (foresight_mod, "_writer"),
    ]
    for module, name in targets:
        # raising=False: tolerate a module that does not define the attribute.
        mp.setattr(module, name, None, raising=False)
|
||||
|
||||
|
||||
# ── Session corpus: ingest once ────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def _ingested_memory_root(
    tmp_path_factory: pytest.TempPathFactory,
    _session_monkeypatch: pytest.MonkeyPatch,
    long_conversation: dict,
) -> Path:
    """Provide a populated memory_root, building it at most once per session.

    Two modes:

    * ``EVEROS_REUSE_CORPUS`` set — the given directory is validated
      (it must contain ``default_app/default_project/users/``) and used
      in place, with no ingest and no copy.
    * otherwise — a fresh temp directory is created and the
      conversation fixture is pushed through ``_ingest`` in a dedicated
      event loop.

    In both modes ``EVEROS_MEMORY__ROOT`` is pointed at the chosen root
    and the lazy singletons are reset before anything else runs.

    The module is only selected under ``-m slow`` (see ``pytestmark`` in
    ``test_search_e2e.py``), so this fixture is never instantiated in a
    default run.

    Raises:
        AssertionError: the reuse directory lacks the expected ``users``
            subdirectory.
    """
    reuse_path = os.environ.get(_REUSE_ENV)
    reusing = bool(reuse_path)

    if reusing:
        corpus_root = Path(reuse_path).expanduser().resolve()
        expected_users = corpus_root / "default_app" / "default_project" / "users"
        if not expected_users.is_dir():
            raise AssertionError(
                f"{_REUSE_ENV}={corpus_root} has no "
                "default_app/default_project/users/ subdir — point it at a "
                "fully-ingested memory_root or unset to rebuild from scratch"
            )
    else:
        corpus_root = tmp_path_factory.mktemp("search_corpus")

    _session_monkeypatch.setenv("EVEROS_MEMORY__ROOT", str(corpus_root))
    _reset_memorize_singletons(_session_monkeypatch)

    if not reusing:
        # Ingest runs in its own event loop; the lifespan inside
        # ``_ingest`` is exited before we return, so per-test lifespans
        # can re-open the on-disk stores.
        asyncio.run(_ingest(corpus_root, long_conversation))

    # In reuse mode the corpus is consumed in place (search is read-only).
    return corpus_root
|
||||
|
||||
|
||||
async def _ingest(memory_root: Path, long_conversation: dict) -> None:
    """Start the app once and feed the conversation through /add and /flush.

    Every batch is posted to ``/api/v1/memory/add`` with its messages
    reduced to the four wire fields, then the session is flushed and we
    wait for the cascade queue to drain. Any non-2xx response raises via
    ``raise_for_status``.

    Args:
        memory_root: Not read in this function.
            NOTE(review): the app presumably locates the corpus through
            ``EVEROS_MEMORY__ROOT`` set by the caller — confirm.
        long_conversation: Mapping with ``everos_session_id`` and
            ``batches`` (each batch holding a ``messages`` list).
    """
    from everos.entrypoints.api.app import create_app

    wire_fields = ("sender_id", "role", "timestamp", "content")
    api = create_app()
    asgi = httpx.ASGITransport(app=api)

    async with (
        api.router.lifespan_context(api),
        httpx.AsyncClient(transport=asgi, base_url="http://test") as http,
    ):
        sid = long_conversation["everos_session_id"]

        async def _post_ok(path: str, payload: dict) -> None:
            # Generous timeout: each call can take minutes.
            reply = await http.post(path, json=payload, timeout=600.0)
            reply.raise_for_status()

        for batch in long_conversation["batches"]:
            outgoing = [
                {field: msg[field] for field in wire_fields}
                for msg in batch["messages"]
            ]
            await _post_ok(
                "/api/v1/memory/add",
                {"session_id": sid, "messages": outgoing},
            )

        await _post_ok("/api/v1/memory/flush", {"session_id": sid})

        await _poll_cascade_drained(deadline_seconds=600.0)
|
||||
|
||||
|
||||
async def _poll_cascade_drained(*, deadline_seconds: float) -> None:
    """Wait until the md-change queue reports zero pending items.

    The queue summary is re-read every 0.5 s inside an
    ``asyncio.timeout`` scope, so overrunning ``deadline_seconds``
    surfaces as ``TimeoutError`` rather than a silent return.

    Args:
        deadline_seconds: Upper bound, in seconds, for the whole wait.
    """
    from everos.infra.persistence.sqlite import md_change_state_repo

    async with asyncio.timeout(deadline_seconds):
        while (await md_change_state_repo.queue_summary()).pending != 0:
            await asyncio.sleep(0.5)
|
||||
|
||||
|
||||
# ── Per-test client against the session corpus ─────────────────────────
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def search_client(
    _ingested_memory_root: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> AsyncIterator[httpx.AsyncClient]:
    """Yield a per-test ``AsyncClient`` bound to the session corpus.

    Before the app is built, ``EVEROS_MEMORY__ROOT`` is pointed at the
    corpus and both the memorize-side and search-side module singletons
    are put back to their cold values. Each test therefore exercises the
    lazy-init path itself instead of inheriting warm clients from a
    previous test.
    """
    monkeypatch.setenv("EVEROS_MEMORY__ROOT", str(_ingested_memory_root))
    _reset_memorize_singletons(monkeypatch)

    # Search-service singletons and the cold value each one goes back to:
    # object slots become None, ``*_resolved`` flags become False.
    search_svc = importlib.import_module("everos.service.search")
    cold_values = {
        "_manager": None,
        "_embedding": None,
        "_reranker": None,
        "_llm_client": None,
        "_embedding_resolved": False,
        "_rerank_resolved": False,
        "_llm_resolved": False,
    }
    for name, cold in cold_values.items():
        if not hasattr(search_svc, name):
            continue  # only touch attributes the module actually defines
        monkeypatch.setattr(search_svc, name, cold, raising=False)

    from everos.entrypoints.api.app import create_app

    api = create_app()
    asgi = httpx.ASGITransport(app=api)
    async with (
        api.router.lifespan_context(api),
        httpx.AsyncClient(transport=asgi, base_url="http://test") as http,
    ):
        yield http
|
||||
|
||||
|
||||
# ── Diagnostic helpers (handy for tests that probe SQLite directly) ───
|
||||
|
||||
|
||||
@pytest.fixture
def memcell_count() -> Callable[[], Awaitable[int]]:
    """Provide an async callable that counts rows in the ``memcell`` table.

    Usage: ``n = await memcell_count()``. The engine is looked up at call
    time, not at fixture setup.
    """

    async def _count() -> int:
        from everos.infra.persistence.sqlite import get_engine

        async with get_engine().connect() as conn:
            rows = await conn.execute(text("SELECT COUNT(*) FROM memcell"))
            total = rows.scalar()
        # A missing or falsy scalar is reported as zero.
        return int(total) if total else 0

    return _count
|
||||
241
tests/integration/search/test_search_e2e.py
Normal file
241
tests/integration/search/test_search_e2e.py
Normal file
@ -0,0 +1,241 @@
|
||||
"""End-to-end ``/api/v1/memory/search`` tests over a real LoCoMo corpus.
|
||||
|
||||
Six tests, each pinning one path through :class:`SearchManager`:
|
||||
|
||||
============================================ =================================
|
||||
``test_keyword_recalls_atomic_fact_origin`` keyword (BM25 only)
|
||||
``test_vector_recalls_atomic_fact_origin`` vector (cosine only)
|
||||
``test_hybrid_with_profile_returns_profile`` hybrid + ``include_profile``
|
||||
``test_partition_respects_owner_id`` cross-owner isolation
|
||||
``test_unknown_owner_returns_empty_200`` empty response, no 500
|
||||
``test_filter_dsl_compiles_and_excludes`` filters DSL → LanceDB ``where``
|
||||
============================================ =================================
|
||||
|
||||
The corpus is built once by :func:`_ingested_memory_root` (session-
|
||||
scoped fixture in ``conftest.py``) and shared across all tests. Each
|
||||
test re-attaches a fresh lifespan via :func:`search_client`, so the
|
||||
search-manager singletons rebuild from cold per-test — a regression
|
||||
in the lazy-init path can't hide behind warm state from a prior test.
|
||||
|
||||
Bootstrapping: queries are derived from the corpus's own
|
||||
``atomic_facts`` md files via :func:`pick_query_seeds`, not
|
||||
hardcoded. Closed-loop correctness — what the pipeline extracted
|
||||
should be findable by the search side.
|
||||
|
||||
Assertions follow the project's "守恒 + 下界 + 形状" ("conservation +
lower bound + shape") convention
|
||||
(see :func:`_helpers.assert_recall`): no exact ranks, no exact
|
||||
scores, no exact ids. LLM-driven retrieval is non-deterministic
|
||||
across runs; brittle assertions cause CI noise, not signal.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from ._helpers import (
|
||||
assert_recall,
|
||||
flatten_hits,
|
||||
pick_query_seeds,
|
||||
)
|
||||
|
||||
# Whole module is opt-in — it depends on ``_ingested_memory_root`` which
|
||||
# spends ~10 min running real LLM + embedder against LoCoMo conv_0.
|
||||
pytestmark = pytest.mark.slow
|
||||
|
||||
|
||||
# ── 1. Keyword recall ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def test_keyword_recalls_atomic_fact_origin(
    search_client: httpx.AsyncClient,
    _ingested_memory_root: Path,
) -> None:
    """BM25 must return a hit for at least one fact-derived bigram.

    Up to 20 atomic-fact seeds are expanded into lowercase content
    bigrams (see ``_candidate_bigrams``). Each bigram is sent as a
    ``keyword`` query for the seed's owner, and the test passes on the
    first query that returns any hit, after checking that every hit
    carrying an owner belongs to the requested one. This checks the BM25
    plumbing without depending on which fact happened to be sampled;
    the vector and hybrid tests carry the stricter recall claim.

    Every response must be HTTP 200, including the ones with no hits.
    """
    seeds = pick_query_seeds(_ingested_memory_root, limit=20)
    # Lazily flatten (owner, fact) seeds into (owner, bigram) candidates.
    candidates = (
        (seed_owner, bigram)
        for seed_owner, fact in seeds
        for bigram in _candidate_bigrams(fact)
    )

    last_query: str | None = None
    for owner, last_query in candidates:
        payload = {
            "user_id": owner,
            "query": last_query,
            "method": "keyword",
            "top_k": 5,
        }
        resp = await search_client.post(
            "/api/v1/memory/search",
            json=payload,
            timeout=60.0,
        )
        assert resp.status_code == 200, resp.text

        hits = flatten_hits(resp.json()["data"])
        if not hits:
            continue

        # Partition still holds even on a successful keyword hit.
        foreign = [
            hit_owner
            for hit_owner, _score, _text in hits
            if hit_owner is not None and hit_owner != owner
        ]
        assert not foreign
        return

    raise AssertionError(
        f"BM25 returned 0 hits across {len(seeds)} fact seeds; "
        f"last tried query={last_query!r}"
    )
|
||||
|
||||
|
||||
def _candidate_bigrams(fact: str) -> list[str]:
|
||||
"""Lowercase consecutive content-token bigrams from ``fact``.
|
||||
|
||||
Skip tokens that include uppercase letters in the original text
|
||||
(proper nouns / acronyms — empirically poor BM25 recall under
|
||||
jieba). Returns at most 5 candidates per fact, in source order.
|
||||
"""
|
||||
import re as _re
|
||||
|
||||
out: list[str] = []
|
||||
tokens: list[str] = []
|
||||
for raw in _re.findall(r"\w+", fact):
|
||||
if raw.lower() == raw and len(raw) >= 3:
|
||||
tokens.append(raw)
|
||||
for i in range(len(tokens) - 1):
|
||||
out.append(f"{tokens[i]} {tokens[i + 1]}")
|
||||
if len(out) >= 5:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
# ── 2. Vector recall ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def test_vector_recalls_atomic_fact_origin(
    search_client: httpx.AsyncClient,
    _ingested_memory_root: Path,
) -> None:
    """The first seed fact must be recalled through the ``vector`` method.

    The full fact text is used as the query, so the result does not
    depend on BM25 tokenisation.
    """
    seeds = pick_query_seeds(_ingested_memory_root, limit=1)
    owner, fact = seeds[0]

    # The score floor is deliberately loose: the indexed episode text is
    # an LLM summary, not the verbatim fact, so near-1.0 similarity is
    # not expected.
    recall_kwargs = {
        "owner_id": owner,
        "query": fact,
        "method": "vector",
        "min_score": 0.1,
    }
    await assert_recall(search_client, **recall_kwargs)
|
||||
|
||||
|
||||
# ── 3. Hybrid + include_profile ────────────────────────────────────────
|
||||
|
||||
|
||||
async def test_hybrid_with_profile_returns_profile(
    search_client: httpx.AsyncClient,
    _ingested_memory_root: Path,
) -> None:
    """A hybrid search with ``include_profile`` must return the owner's profile.

    Checks that the response is 200, that ``profiles`` is non-empty, and
    that the first profile's ``user_id`` is the requested owner.
    """
    seeds = pick_query_seeds(_ingested_memory_root, limit=1)
    owner, fact = seeds[0]

    payload = {
        "user_id": owner,
        "query": fact,
        "method": "hybrid",
        "top_k": 5,
        "include_profile": True,
    }
    resp = await search_client.post(
        "/api/v1/memory/search",
        json=payload,
        timeout=120.0,
    )
    assert resp.status_code == 200, resp.text

    profiles = resp.json()["data"]["profiles"]
    assert profiles, "include_profile=true but profiles[] empty"
    first_profile = profiles[0]
    assert first_profile["user_id"] == owner
|
||||
|
||||
|
||||
# ── 4. Owner partition ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def test_partition_respects_owner_id(
    search_client: httpx.AsyncClient,
    _ingested_memory_root: Path,
) -> None:
    """Querying owner=A must not leak owner=B's data, even on shared topics.

    The first seed supplies both the target owner and the query fact.
    Previously the owner came from ``next(iter(set_of_owners))``, whose
    result depends on per-process string-hash randomisation; with two
    owners in the seeds the probed owner/fact pair could differ between
    runs, making failures hard to reproduce. Taking ``seeds[0]`` keeps
    the selection stable.

    NOTE(review): the owner-partition check on returned hits is
    presumably performed inside ``assert_recall`` — confirm. This test
    itself only adds the agent-track emptiness assertions.
    """
    seeds = pick_query_seeds(_ingested_memory_root, limit=2)
    assert seeds, "need at least one owner in the corpus"
    target_owner, fact = seeds[0]

    body = await assert_recall(
        search_client,
        owner_id=target_owner,
        query=fact,
        method="hybrid",
    )
    # Agent tracks must be empty for user owners.
    assert body["data"]["agent_cases"] == []
    assert body["data"]["agent_skills"] == []
|
||||
|
||||
|
||||
# ── 5. Unknown owner ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def test_unknown_owner_returns_empty_200(
    search_client: httpx.AsyncClient,
) -> None:
    """An owner absent from the corpus gets HTTP 200 with four empty arrays.

    Guards against the search path turning "nothing to search" into a
    server error.
    """
    payload = {
        "user_id": "ghost_user_does_not_exist",
        "query": "anything",
        "method": "hybrid",
        "top_k": 5,
    }
    resp = await search_client.post(
        "/api/v1/memory/search",
        json=payload,
        timeout=60.0,
    )
    assert resp.status_code == 200, resp.text

    data = resp.json()["data"]
    # Every result track must be present and empty.
    for track in ("episodes", "profiles", "agent_cases", "agent_skills"):
        assert data[track] == []
|
||||
|
||||
|
||||
# ── 6. Filter DSL ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def test_filter_dsl_compiles_and_excludes(
    search_client: httpx.AsyncClient,
    _ingested_memory_root: Path,
) -> None:
    """A ``session_id`` ``ne`` filter must be accepted and respected by hits.

    The excluded session id never existed, so every real episode
    satisfies the filter. The test checks that the request succeeds
    (the filter compiled and was accepted) and that no returned episode
    carries the excluded id.
    """
    seeds = pick_query_seeds(_ingested_memory_root, limit=1)
    owner, fact = seeds[0]
    bogus_session = "session_that_never_was"

    payload = {
        "user_id": owner,
        "query": fact,
        "method": "keyword",
        "top_k": 10,
        "filters": {"session_id": {"ne": bogus_session}},
    }
    resp = await search_client.post(
        "/api/v1/memory/search",
        json=payload,
        timeout=120.0,
    )
    assert resp.status_code == 200, resp.text

    episodes = resp.json()["data"]["episodes"]
    # NOTE(review): an empty ``episodes`` list passes this check
    # vacuously. A hit count of >= 1 is expected but deliberately not
    # asserted here; consider adding it if keyword recall on full fact
    # text proves stable.
    assert all(ep["session_id"] != bogus_session for ep in episodes)
|
||||
Reference in New Issue
Block a user