Files
EverOS/tests/integration/search/_run_full_report.py
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

661 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""End-to-end report generator: fresh corpus → ingest → retrieve → markdown report.

Run with::

    PYTHONPATH=src python tests/integration/search/_run_full_report.py

Writes a fresh ``~/.everos-report-corpus/`` memory_root, runs a small
synthetic 16-message conversation between two new users (``u_diana`` +
``u_ethan``) through ``/add`` + ``/flush``, waits for cascade drain, then
runs a curated set of search probes and dumps a structured markdown
report to ``tests/integration/search/SEARCH_REPORT.md``.

Not a pytest test — pure investigative script, real LLM, real embedder.
"""
from __future__ import annotations
import asyncio
import json
import os
import shutil
from pathlib import Path
import httpx
from dotenv import load_dotenv
# The repo-level .env has to be loaded ahead of every everos import so that
# the settings are resolved with these variables present. ``override=False``
# is passed so the .env file is not asked to override anything.
_PROJECT_ROOT = Path(__file__).resolve().parents[3]
load_dotenv(_PROJECT_ROOT.joinpath(".env"), override=False)

# ── Corpus location ────────────────────────────────────────────────────
# Scratch memory root under the user's home directory; main() wipes and
# recreates it on every run.
CORPUS_ROOT = Path.home().joinpath(".everos-report-corpus")
# Where the rendered markdown report is written.
REPORT_PATH = _PROJECT_ROOT.joinpath("tests/integration/search/SEARCH_REPORT.md")
# Session id attached to every /add and /flush request.
SESSION_ID = "report_session_diana_ethan"
# ── Synthetic conversation (16 msgs, 2 batches) ────────────────────────
# Two batches of eight messages each; ``ingest`` posts every batch as one
# ``/add`` request. Timestamps look like epoch milliseconds (TODO confirm
# against the /add schema) and are strictly increasing across the whole
# conversation: one minute apart inside a batch, with batch 2 starting later
# than the end of batch 1. The opening message therefore carries the earliest
# stamp, one minute before Ethan's reply.
CONVERSATION = [
    # Batch 1 — introducing hobbies
    [
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407200000,
            "content": "Hey Ethan! Just got back from a 3-day hike in Yosemite. "
            "My new Sony A7 camera is amazing for landscape shots.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407260000,
            "content": "Wow that sounds intense! I'd never survive without my "
            "espresso. How's the Rust programming learning going?",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407320000,
            "content": "Slow but steady. Working through the official book. "
            "The borrow checker still trips me up.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407380000,
            "content": "I'm marathon training — up to 15 miles long runs now. "
            "Plus I joined a jazz quartet on weekends.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407440000,
            "content": "That's awesome! Saxophone again?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407500000,
            "content": "Yeah, alto sax. We're playing at the Blue Note next month.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778407560000,
            "content": "I'll come watch! Speaking of trips, want to do "
            "that Iceland thing this summer?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778407620000,
            "content": "100% yes. I've been researching ring road photography spots.",
        },
    ],
    # Batch 2 — Iceland trip planning
    [
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778410800000,
            "content": "I want to see the Northern Lights and shoot some "
            "volcanic landscapes.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778410860000,
            "content": "We should rent a 4x4. The F-roads are insane I hear.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778410920000,
            "content": "And I want to try Icelandic lamb stew. You cook, right?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778410980000,
            "content": (
                "Yeah, I'll bring my Dutch oven. Maybe a cast iron pan for fish."
            ),
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778411040000,
            "content": "Perfect. Mid-July works for me — I have a Rust conference "
            "in late August.",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778411100000,
            "content": "July it is. I have the Boston Marathon qualifier in October "
            "so I can't go after.",
        },
        {
            "sender_id": "u_diana",
            "role": "user",
            "timestamp": 1778411160000,
            "content": "Let's book flights this weekend?",
        },
        {
            "sender_id": "u_ethan",
            "role": "user",
            "timestamp": 1778411220000,
            "content": "Deal. Also bringing my Olympus E-M1 for the landscapes.",
        },
    ],
]
# ── Probe set ───────────────────────────────────────────────────────────
def _probe(
    section: str,
    owner: str,
    query: str,
    method: str,
    expect: str,
    *,
    include_profile: bool = False,
) -> dict:
    """Build one probe spec; ``include_profile`` is stored only when set."""
    spec: dict = {
        "section": section,
        "owner": owner,
        "query": query,
        "method": method,
    }
    if include_profile:
        spec["include_profile"] = True
    spec["expect"] = expect
    return spec


# Curated search probes, grouped by report section. ``expect`` is a
# human-readable expectation; its wording also drives the heuristics in
# ``_grade`` (e.g. "should NOT leak", "0 hits", "profile").
PROBES: list[dict] = [
    # Owner-specific topical: should recall the right owner's episodes.
    _probe(
        "Owner-specific topical (diana)", "u_diana", "hiking", "hybrid",
        "diana's Yosemite episode",
    ),
    _probe(
        "Owner-specific topical (diana)", "u_diana", "Rust programming", "hybrid",
        "diana's Rust learning facts",
    ),
    _probe(
        "Owner-specific topical (diana)", "u_diana", "photography", "hybrid",
        "diana's camera (Sony A7) facts",
    ),
    _probe(
        "Owner-specific topical (ethan)", "u_ethan", "jazz", "hybrid",
        "ethan's jazz quartet / sax facts",
    ),
    _probe(
        "Owner-specific topical (ethan)", "u_ethan", "marathon training", "hybrid",
        "ethan's marathon facts",
    ),
    _probe(
        "Owner-specific topical (ethan)", "u_ethan", "cooking", "hybrid",
        "ethan's Dutch oven / lamb stew facts",
    ),
    # Shared topic — both should recall their own perspective.
    _probe(
        "Shared topic (Iceland)", "u_diana", "Iceland trip", "hybrid",
        "diana's planning episode",
    ),
    _probe(
        "Shared topic (Iceland)", "u_ethan", "Iceland trip", "hybrid",
        "ethan's planning episode",
    ),
    # Method comparison on the same query.
    _probe(
        "Method comparison (diana + 'Rust')", "u_diana", "Rust", "keyword",
        "BM25 single token",
    ),
    _probe(
        "Method comparison (diana + 'Rust')", "u_diana", "Rust", "vector",
        "cosine ANN",
    ),
    _probe(
        "Method comparison (diana + 'Rust')", "u_diana", "Rust", "hybrid",
        "fusion of BM25 + vector",
    ),
    # Owner partition: diana searching for ethan's exclusive topic.
    _probe(
        "Owner partition", "u_diana", "jazz quartet", "hybrid",
        "should NOT leak ethan's content",
    ),
    _probe(
        "Owner partition", "u_ethan", "Rust programming", "hybrid",
        "should NOT leak diana's content",
    ),
    # Phrase + bigram.
    _probe(
        "Phrase queries", "u_diana", "Northern Lights", "keyword",
        "diana's Iceland aurora plans",
    ),
    _probe(
        "Phrase queries", "u_ethan", "Boston Marathon", "keyword",
        "ethan's qualifier date",
    ),
    # include_profile.
    _probe(
        "Profile attach", "u_diana", "anything", "hybrid",
        "should return diana's profile object",
        include_profile=True,
    ),
    # Unknown owner.
    _probe(
        "Unknown owner", "u_ghost_does_not_exist", "hiking", "hybrid",
        "empty arrays, status 200",
    ),
    # Non-existent term.
    _probe(
        "Non-existent term", "u_diana", "quantum blockchain pizza", "keyword",
        "0 hits, status 200",
    ),
]
# ── Pipeline runners ───────────────────────────────────────────────────
async def ingest(client: httpx.AsyncClient) -> dict:
    """Push the synthetic conversation through ``/add`` and then ``/flush``.

    Every batch of :data:`CONVERSATION` is posted as one ``/add`` request
    under :data:`SESSION_ID`; a single ``/flush`` follows. Any error status
    aborts the run via ``raise_for_status()``.

    Returns:
        ``{"batches": [...], "flush_status": ...}`` where each batch entry
        holds the batch index, the number of messages sent, and the
        ``status`` / ``message_count`` fields echoed in the response's
        ``data`` object.
    """
    batch_reports: list[dict] = []
    for batch_no, messages in enumerate(CONVERSATION):
        add_resp = await client.post(
            "/api/v1/memory/add",
            json={"session_id": SESSION_ID, "messages": messages},
            timeout=600.0,
        )
        add_resp.raise_for_status()
        echoed = add_resp.json()["data"]
        batch_reports.append(
            {
                "idx": batch_no,
                "msg_count": len(messages),
                "status": echoed["status"],
                "returned_count": echoed["message_count"],
            }
        )

    flush_resp = await client.post(
        "/api/v1/memory/flush",
        json={"session_id": SESSION_ID},
        timeout=600.0,
    )
    flush_resp.raise_for_status()
    return {
        "batches": batch_reports,
        "flush_status": flush_resp.json()["data"]["status"],
    }
async def wait_cascade(
    *,
    expected_md_paths: int = 8,
    stable_checks: int = 5,
    deadline_seconds: float = 600.0,
) -> dict:
    """Wait until the cascade queue has been *stably* drained.

    Checking ``pending == 0`` once is racy: the OME async strategies
    (extract_foresight / extract_user_profile) write their md files
    asynchronously after ``/flush`` has returned, so the queue can look
    empty for a moment before those writes show up. Two conditions are
    therefore required:

    1. ``md_change_state`` holds at least ``expected_md_paths`` rows (one
       per expected owner × kind — episodes + atomic_facts + foresights +
       user_profile, per owner), so we never return before OME has emitted
       anything at all.
    2. ``pending == 0`` is observed on ``stable_checks`` consecutive polls,
       one second apart, so a transiently empty queue during a strategy's
       write does not count as drained.

    Returns:
        The final queue summary as a dict with ``done``,
        ``failed_retryable``, ``failed_permanent``, ``max_lsn`` and
        ``last_processed_lsn``.

    Raises:
        TimeoutError: when the conditions are not met within
            ``deadline_seconds`` (enforced by ``asyncio.timeout``).
    """
    from everos.infra.persistence.sqlite import md_change_state_repo

    quiet_polls = 0
    async with asyncio.timeout(deadline_seconds):
        while True:
            snapshot = await md_change_state_repo.queue_summary()
            rows_seen = (
                snapshot.pending
                + snapshot.done
                + snapshot.failed_retryable
                + snapshot.failed_permanent
            )
            if snapshot.pending != 0 or rows_seen < expected_md_paths:
                # Still busy (or nothing emitted yet): restart the streak.
                quiet_polls = 0
            else:
                quiet_polls += 1
                if quiet_polls >= stable_checks:
                    return {
                        "done": snapshot.done,
                        "failed_retryable": snapshot.failed_retryable,
                        "failed_permanent": snapshot.failed_permanent,
                        "max_lsn": snapshot.max_lsn,
                        "last_processed_lsn": snapshot.last_processed_lsn,
                    }
            await asyncio.sleep(1.0)
async def inspect_artifacts(memory_root: Path) -> dict:
    """Collect LanceDB row counts and the markdown files written on disk.

    Intended to run after the cascade has drained.

    Args:
        memory_root: Root of the corpus; markdown files are looked up under
            ``default_app/default_project/users`` below it.

    Returns:
        A dict with ``episode_rows``, ``atomic_fact_rows``,
        ``foresight_rows`` and ``user_profile_rows`` (the repos' ``count()``
        results) plus ``md_files``, the sorted ``*.md`` paths relative to
        ``memory_root`` (empty when the users directory does not exist).
    """
    from everos.infra.persistence.lancedb import (
        atomic_fact_repo,
        dispose_connection,
        episode_repo,
        foresight_repo,
        get_connection,
        user_profile_repo,
        verify_business_schemas,
    )

    await get_connection()
    try:
        await verify_business_schemas()
        counts: dict = {
            "episode_rows": await episode_repo.count(),
            "atomic_fact_rows": await atomic_fact_repo.count(),
            "foresight_rows": await foresight_repo.count(),
            "user_profile_rows": await user_profile_repo.count(),
        }
    finally:
        # Release the connection even when schema verification or a count
        # fails, so an error here does not leave it open.
        await dispose_connection()

    users_dir = memory_root / "default_app" / "default_project" / "users"
    counts["md_files"] = (
        [str(md.relative_to(memory_root)) for md in sorted(users_dir.rglob("*.md"))]
        if users_dir.is_dir()
        else []
    )
    return counts
async def run_probes(client: httpx.AsyncClient) -> list[dict]:
    """Execute every probe in :data:`PROBES`; return captured rows.

    Each probe becomes one POST to ``/api/v1/memory/search`` with
    ``top_k=5``. The HTTP status is recorded rather than raised on, so a
    failing probe still yields a row: a non-JSON body, a non-object body, or
    a null/missing ``data`` / ``episodes`` / ``profiles`` field is treated as
    "no results" instead of crashing the whole report run.

    Returns:
        One dict per probe with ``section``, ``expect``, the ``request``
        payload, the response ``status`` code, a trimmed ``episodes`` list
        (id, owner, score rounded to 3 places, first 150 chars of the
        summary, atomic-fact count) and a trimmed ``profiles`` list.
    """
    rows: list[dict] = []
    for p in PROBES:
        payload: dict = {
            "owner_id": p["owner"],
            "owner_type": "user",
            "query": p["query"],
            "method": p["method"],
            "top_k": 5,
        }
        if p.get("include_profile"):
            payload["include_profile"] = True
        resp = await client.post("/api/v1/memory/search", json=payload, timeout=120.0)

        # Error responses may carry no JSON at all, or an envelope whose
        # "data" is null; keep the status code and report zero results.
        try:
            body = resp.json()
        except ValueError:
            body = {}
        data = (body.get("data") if isinstance(body, dict) else None) or {}

        rows.append(
            {
                "section": p["section"],
                "expect": p["expect"],
                "request": payload,
                "status": resp.status_code,
                "episodes": [
                    {
                        "id": e["id"],
                        "owner_id": e["owner_id"],
                        "score": round(float(e["score"]), 3),
                        "summary": (e.get("summary") or "")[:150],
                        "atomic_facts_count": len(e.get("atomic_facts") or []),
                    }
                    for e in data.get("episodes") or []
                ],
                "profiles": [
                    {
                        "owner_id": p_.get("owner_id"),
                        "score": p_.get("score"),
                        "summary_excerpt": str(p_.get("profile_data", {}))[:200],
                    }
                    for p_ in data.get("profiles") or []
                ],
            }
        )
    return rows
# ── Markdown report renderer ───────────────────────────────────────────
def render_report(
    *,
    memory_root: Path,
    ingest_summary: dict,
    cascade_summary: dict,
    artifacts: dict,
    probes: list[dict],
) -> str:
    """Render the full markdown report as a single string.

    Sections, in order: setup, ingest stats, cascade drain summary, on-disk
    artifacts, one subsection per retrieval probe, and a pass/fail table
    produced by ``_grade``.

    Args:
        memory_root: Corpus root, shown in the setup section.
        ingest_summary: Result of ``ingest`` (``batches`` + ``flush_status``).
        cascade_summary: Result of ``wait_cascade``; dumped as JSON.
        artifacts: Result of ``inspect_artifacts`` (``*_rows`` counts and
            ``md_files``).
        probes: Rows captured by ``run_probes``.

    Returns:
        The markdown document.
    """
    lines: list[str] = []
    lines.append("# Search E2E Report — fresh corpus (u_diana + u_ethan)\n")
    lines.append(
        "Generated by [`_run_full_report.py`](_run_full_report.py). "
        "Two synthetic users with distinct hobbies feed a 16-message "
        "conversation through the full pipeline; the report below "
        "captures ingest stats, cascade drain numbers, on-disk "
        "artifacts, and the response of every curated search probe.\n"
    )

    # ── Section: Setup ────────────────────────────────────────────────
    lines.append("## 1. Setup\n")
    lines.append(f"- **Memory root**: `{memory_root}`\n")
    lines.append(f"- **Session id**: `{SESSION_ID}`\n")
    lines.append(
        "- **Users**: `u_diana` (hiking / Rust / photography), "
        "`u_ethan` (jazz / marathon / cooking)\n"
    )
    lines.append(
        f"- **Batches**: {len(CONVERSATION)} "
        f"({sum(len(b) for b in CONVERSATION)} messages total)\n"
    )

    # ── Section: Ingest stats ─────────────────────────────────────────
    lines.append("\n## 2. Ingest (`/add` × N + `/flush`)\n")
    lines.append("| batch | msg_count | status |\n")
    lines.append("|---|---|---|\n")
    for b in ingest_summary["batches"]:
        lines.append(f"| {b['idx']} | {b['msg_count']} | `{b['status']}` |\n")
    lines.append(f"\n**Flush status**: `{ingest_summary['flush_status']}`\n")

    # ── Section: Cascade drain ────────────────────────────────────────
    lines.append("\n## 3. Cascade drain (md → LanceDB sync)\n")
    lines.append("```\n")
    lines.append(json.dumps(cascade_summary, indent=2) + "\n")
    lines.append("```\n")

    # ── Section: Artifacts ────────────────────────────────────────────
    lines.append("\n## 4. On-disk artifacts\n")
    lines.append("### LanceDB row counts\n\n")
    lines.append("| table | rows |\n")
    lines.append("|---|---|\n")
    for k in (
        "episode_rows",
        "atomic_fact_rows",
        "foresight_rows",
        "user_profile_rows",
    ):
        lines.append(f"| {k.replace('_rows', '')} | {artifacts[k]} |\n")
    lines.append("\n### Markdown files\n\n")
    for f in artifacts["md_files"]:
        lines.append(f"- `{f}`\n")

    # ── Section: Probes ───────────────────────────────────────────────
    lines.append("\n## 5. Retrieval probes\n")
    lines.append(
        "Every row below is one POST to `/api/v1/memory/search`. "
        "`expected` is what the test designer expects to see; "
        "actual results are captured verbatim.\n"
    )
    current_section = None
    for row in probes:
        # Emit a section heading only when the section changes.
        if row["section"] != current_section:
            lines.append(f"\n### {row['section']}\n")
            current_section = row["section"]
        req = row["request"]
        lines.append(
            f"\n#### `{req['query']}` (method=`{req['method']}`, "
            f"owner=`{req['owner_id']}`)\n"
        )
        lines.append(f"\n- **Expected**: {row['expect']}\n")
        lines.append(f"- **Status**: {row['status']}\n")
        lines.append(f"- **Episodes returned**: {len(row['episodes'])}\n")
        if row["episodes"]:
            lines.append("\n| rank | score | owner | atomic_facts | summary |\n")
            lines.append("|---|---|---|---|---|\n")
            for i, ep in enumerate(row["episodes"], 1):
                # A table row must stay on one line and must not contain a
                # bare pipe, so escape pipes and flatten newlines.
                summary = ep["summary"].replace("|", "\\|").replace("\n", " ")
                lines.append(
                    f"| {i} | {ep['score']} | `{ep['owner_id']}` | "
                    f"{ep['atomic_facts_count']} | {summary} |\n"
                )
        else:
            lines.append("\n_(no episodes)_\n")
        if row["profiles"]:
            lines.append(
                "\n**Profile attached**: "
                f"`{row['profiles'][0]['owner_id']}` "
                f"(excerpt: {row['profiles'][0]['summary_excerpt']!r})\n"
            )

    # ── Section: Pass/Fail summary ────────────────────────────────────
    lines.append("\n## 6. Pass / Fail summary\n")
    pf = _grade(probes)
    lines.append("| # | section | query | result |\n")
    lines.append("|---|---|---|---|\n")
    for r in pf:
        lines.append(
            f"| {r['idx']} | {r['section']} | `{r['query']}` | {r['verdict']} |\n"
        )
    # Every failing verdict produced by _grade starts with "❌"; anything
    # else is a pass. (Testing startswith("") would match every verdict and
    # always report a perfect score.)
    passed = sum(1 for r in pf if not r["verdict"].startswith("❌"))
    lines.append(f"\n**Total: {passed}/{len(pf)} passed.**\n")
    return "".join(lines)
def _grade(probes: list[dict]) -> list[dict]:
"""Apply soft heuristic pass/fail to each probe based on its 'expect'."""
graded: list[dict] = []
for i, row in enumerate(probes, 1):
req = row["request"]
expect = row["expect"].lower()
verdict = ""
if "should not leak" in expect:
leaked = any(ep["owner_id"] != req["owner_id"] for ep in row["episodes"])
verdict = "❌ leaked" if leaked else "✅ no leak"
elif "empty arrays" in expect or "0 hits" in expect:
verdict = "" if not row["episodes"] else f"❌ got {len(row['episodes'])}"
elif "profile" in expect:
verdict = "" if row["profiles"] else "❌ no profile"
elif row["episodes"]:
top_owner = row["episodes"][0]["owner_id"]
verdict = (
"" if top_owner == req["owner_id"] else f"❌ wrong owner: {top_owner}"
)
else:
verdict = "❌ no hits"
graded.append(
{
"idx": i,
"section": row["section"],
"query": req["query"],
"verdict": verdict,
}
)
return graded
# ── Main ────────────────────────────────────────────────────────────────
async def main() -> None:
    """Run the whole pipeline and write the markdown report.

    Steps: recreate an empty corpus directory, point everos at it through
    ``EVEROS_MEMORY__ROOT``, start the app in-process behind an ASGI
    transport, ingest the conversation, wait for the cascade to drain,
    inspect the artifacts, run the search probes, and finally render the
    report to :data:`REPORT_PATH`.
    """
    # Reset corpus to a known empty state.
    if CORPUS_ROOT.exists():
        shutil.rmtree(CORPUS_ROOT)
    CORPUS_ROOT.mkdir(parents=True)
    os.environ["EVEROS_MEMORY__ROOT"] = str(CORPUS_ROOT)

    # Reset cached singletons so they pick up the new env.
    from everos.config import load_settings

    load_settings.cache_clear()
    print(f"[1/6] fresh corpus at {CORPUS_ROOT}")

    from everos.entrypoints.api.app import create_app

    app = create_app()
    transport = httpx.ASGITransport(app=app)
    async with (
        app.router.lifespan_context(app),
        httpx.AsyncClient(transport=transport, base_url="http://test") as client,
    ):
        print("[2/6] ingesting via /add + /flush ...")
        ingest_summary = await ingest(client)
        print(f" batches={ingest_summary['batches']}")

        print("[3/6] waiting for cascade drain ...")
        cascade_summary = await wait_cascade()
        print(f" drained: {cascade_summary}")

        print("[4/6] inspecting on-disk artifacts ...")
        artifacts = await inspect_artifacts(CORPUS_ROOT)
        # Build the dict first: a dict comprehension cannot be written
        # directly inside an f-string replacement field.
        row_counts = {k: v for k, v in artifacts.items() if k.endswith("_rows")}
        print(f" lancedb: {row_counts}")

        print(f"[5/6] running {len(PROBES)} search probes ...")
        probes = await run_probes(client)

    print("[6/6] rendering report ...")
    md = render_report(
        memory_root=CORPUS_ROOT,
        ingest_summary=ingest_summary,
        cascade_summary=cascade_summary,
        artifacts=artifacts,
        probes=probes,
    )
    REPORT_PATH.write_text(md, encoding="utf-8")
    print(f"{REPORT_PATH}")


if __name__ == "__main__":
    asyncio.run(main())