chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
Elliot Chen
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions

0
tests/e2e/__init__.py Normal file
View File

286
tests/e2e/conftest.py Normal file
View File

@ -0,0 +1,286 @@
"""Shared fixtures for ``tests/e2e/``.
Provides:
- ``core_pipeline_runtime``: tmp memory root + reset memorize singletons.
Uses the **real** LLM / embedding / rerank creds from ``.env`` per the
project test policy.
- ``async_client``: ``httpx.AsyncClient`` wired into ``create_app()`` with
the full lifespan stack (SQLite + LanceDB + Cascade + OME).
- ``cascade_done_poll``: wait until ``md_change_state`` queue is fully
drained (``pending`` rows == 0; includes the internal ``processing``).
- ``pipeline_done_poll``: composite drain — waits until OME strategy runs AND
``md_change_state`` queue both drain (use for tests that exercise the full
OME → md → cascade pipeline).
- ``buffer_count`` / ``memcell_count``: raw counts for buffer-delta and
memcell-growth assertions.
The ``long_conversation`` fixture (LoCoMo conv_0) lives in
:mod:`tests.conftest` so both ``tests/e2e/`` and
``tests/integration/search/`` can depend on it.
Conventions:
- ``.env`` is loaded at import time (before any everos module reads
settings) — overrides for ``EVEROS_MEMORY__ROOT`` happen per-test.
- This file does **not** define ``cascade_runtime`` — that name belongs
to ``tests/integration/test_cascade_integration.py``'s local fixture.
The pipeline test uses ``core_pipeline_runtime`` to avoid name
collision.
"""
from __future__ import annotations
import asyncio
import importlib
import json
from collections.abc import AsyncIterator, Awaitable, Callable
from pathlib import Path
import httpx
import pytest
import pytest_asyncio
from dotenv import load_dotenv
from sqlalchemy import text
# Load real .env creds before any everos import touches load_settings().
# ``parents[2]`` walks up from ``tests/e2e/conftest.py`` to the repository
# root. ``override=False`` is python-dotenv's "do not replace variables that
# are already set in the process environment" mode.
_PROJECT_ROOT = Path(__file__).resolve().parents[2]
load_dotenv(_PROJECT_ROOT / ".env", override=False)
# On-disk test data consumed by the ``search_seed`` fixture below.
_FIXTURE_DIR = _PROJECT_ROOT / "tests" / "fixtures"
_SEARCH_SEED_DIR = _FIXTURE_DIR / "search_seed"
# Memorize service module-level singletons that survive across tests; we
# null them out so each test rebuilds against its own ``tmp_path``.
# Each entry is an attribute name on ``everos.service.memorize``.
_MEMORIZE_SINGLETONS: tuple[str, ...] = (
    "_episode_writer",
    "_prompt_loader",
    "_user_pipeline",
    "_agent_pipeline",
    "_ome_engine",
)
# OME strategy modules carry module-level lazy singletons (``_writer`` /
# ``_reader``) that capture ``MemoryRoot.default()`` at first call. They
# survive across tests, so the second test writes its output to the
# **first test's** tmp_path. Reset all of them per-test.
# Shape: ``(dotted module path, (attribute names to reset, ...))``.
_STRATEGY_SINGLETONS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("everos.memory.strategies.extract_atomic_facts", ("_writer",)),
    ("everos.memory.strategies.extract_foresight", ("_writer",)),
    ("everos.memory.strategies.extract_user_profile", ("_writer", "_reader")),
    ("everos.memory.strategies.extract_agent_case", ("_writer",)),
    ("everos.memory.strategies.extract_agent_skill", ("_writer",)),
)
def _reset_strategy_singletons(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch every strategy-module singleton listed in the table to ``None``.

    Walks ``_STRATEGY_SINGLETONS``, imports each strategy module once and
    sets each listed attribute (``_writer`` / ``_reader``) to ``None``
    through ``monkeypatch``. The next test then rebuilds them against its
    own ``MemoryRoot.default()``, which is driven by the fresh
    ``EVEROS_MEMORY__ROOT`` env var set by the calling fixture.

    ``raising=False`` tolerates a module that does not define one of the
    listed attributes.
    """
    for module_path, singleton_names in _STRATEGY_SINGLETONS:
        strategy_module = importlib.import_module(module_path)
        for singleton_name in singleton_names:
            monkeypatch.setattr(strategy_module, singleton_name, None, raising=False)
# ---------------------------------------------------------------------------
# Data fixture
# ---------------------------------------------------------------------------
@pytest.fixture(scope="session")
def search_seed() -> dict[str, list[dict]]:
    """Load the search seed slice produced by ``_dump_search_seed.py``.

    Returns a dict with four keys (``episode`` / ``atomic_fact`` /
    ``foresight`` / ``user_profile``); each value is a list of raw row
    dicts ready to be fed into ``Model.model_validate`` for LanceDB.

    Tests pick the subset they need and may adjust per-row fields
    (e.g. set distinct ``session_id`` values to exercise filter DSL)
    before instantiating the pydantic model.

    NOTE: the fixture is session-scoped, so every test receives the same
    dict / list / row objects. Copy a row before changing it, otherwise
    the edit is visible to every later test in the session.

    Raises:
        FileNotFoundError: if one of the four ``<name>.json`` seed files
            is missing from ``_SEARCH_SEED_DIR``.
    """
    return {
        # Decode explicitly as UTF-8: the default for ``read_text`` follows
        # the platform locale, which is not guaranteed to be UTF-8.
        name: json.loads(
            (_SEARCH_SEED_DIR / f"{name}.json").read_text(encoding="utf-8")
        )
        for name in ("episode", "atomic_fact", "foresight", "user_profile")
    }
# ---------------------------------------------------------------------------
# Runtime fixture: tmp memory root + singleton reset (no app lifespan)
# ---------------------------------------------------------------------------
@pytest_asyncio.fixture
async def core_pipeline_runtime(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> AsyncIterator[Path]:
    """Prepare clean memory root + reset memorize singletons.

    Keeps real LLM / embedding settings from ``.env`` (do NOT overwrite
    ``EVEROS_LLM__*`` or ``EVEROS_EMBEDDING__*``).

    Setup:
        1. Point ``EVEROS_MEMORY__ROOT`` at this test's ``tmp_path``.
        2. Drop the cached settings so the new env var is actually read.
        3. Null the memorize-service, LLM-client and strategy singletons
           so they are rebuilt for this test.

    Teardown:
        Drop the settings cache again. Without this, settings cached
        during the test (memory root = this test's ``tmp_path``) would
        outlive the fixture and be served to later tests that do not
        clear the cache themselves.

    Yields:
        The temporary directory used as the memory root.
    """
    monkeypatch.setenv("EVEROS_MEMORY__ROOT", str(tmp_path))
    # Imported lazily so the env var above is in place before everos
    # configuration code is first touched by this fixture.
    from everos.config import load_settings

    load_settings.cache_clear()
    svc = importlib.import_module("everos.service.memorize")
    client_mod = importlib.import_module("everos.component.llm.client")
    for attr in _MEMORIZE_SINGLETONS:
        monkeypatch.setattr(svc, attr, None, raising=False)
    monkeypatch.setattr(client_mod, "_llm_client", None, raising=False)
    _reset_strategy_singletons(monkeypatch)
    try:
        yield tmp_path
    finally:
        # ``monkeypatch`` restores the env var after this fixture finishes;
        # emptying the cache here makes the next ``load_settings()`` call
        # re-read the restored environment instead of the stale tmp root.
        load_settings.cache_clear()
# ---------------------------------------------------------------------------
# Async client fixture (full app lifespan)
# ---------------------------------------------------------------------------
@pytest_asyncio.fixture
async def async_client(
    core_pipeline_runtime: Path,
) -> AsyncIterator[httpx.AsyncClient]:
    """Start the everos app under its lifespan and yield an httpx client.

    The app is built with ``create_app()`` and served in-process through
    ``httpx.ASGITransport``. Per the app's lifespan contract this brings
    up the SQLite engine, the LanceDB connection + business indexes, the
    Cascade orchestrator (watcher + scanner + worker) and the OME engine;
    leaving the context tears them down in reverse.

    Depends on ``core_pipeline_runtime`` so the memory root and the
    singletons are reset before the app is created.
    """
    from everos.entrypoints.api.app import create_app

    application = create_app()
    asgi_transport = httpx.ASGITransport(app=application)
    # httpx.ASGITransport does not run startup / shutdown on its own, so
    # starlette's lifespan_context is entered explicitly around the client.
    async with application.router.lifespan_context(application):
        async with httpx.AsyncClient(
            transport=asgi_transport, base_url="http://test"
        ) as http_client:
            yield http_client
# ---------------------------------------------------------------------------
# Poll helpers
# ---------------------------------------------------------------------------
async def _poll(
condition: Callable[[], Awaitable[bool]],
*,
deadline_seconds: float,
interval: float = 0.5,
) -> None:
"""Poll an async predicate until truthy; ``TimeoutError`` on deadline."""
async with asyncio.timeout(deadline_seconds):
while True:
if await condition():
return
await asyncio.sleep(interval)
@pytest.fixture
def cascade_done_poll() -> Callable[..., Awaitable[None]]:
    """Provide a waiter that blocks until the ``md_change_state`` queue is empty.

    The returned coroutine function accepts a keyword-only
    ``deadline_seconds`` (default 180 s) and raises ``TimeoutError`` via
    ``_poll`` when the queue has not drained in time.
    """

    async def _wait_for_drain(*, deadline_seconds: float = 180.0) -> None:
        from everos.infra.persistence.sqlite import md_change_state_repo

        async def _queue_is_empty() -> bool:
            # `pending` includes the internal `processing` rows (see
            # QueueSummary), so zero pending means nothing is in flight.
            return (await md_change_state_repo.queue_summary()).pending == 0

        await _poll(_queue_is_empty, deadline_seconds=deadline_seconds)

    return _wait_for_drain
@pytest.fixture
def pipeline_done_poll() -> Callable[..., Awaitable[None]]:
    """Provide a waiter for "OME engine idle AND ``md_change_state`` drained".

    Composite drain. ``cascade_done_poll`` on its own can return
    immediately while a slow LLM-driven strategy is still running: the
    strategy has not written md yet, so the cascade queue looks empty for
    a moment. Tests that cover the whole async chain
    (OME -> md -> cascade -> LanceDB) should wait with this fixture
    instead.

    The returned coroutine function accepts a keyword-only
    ``deadline_seconds`` (default 180 s) and raises ``TimeoutError`` via
    ``_poll`` when the pipeline has not quiesced in time.
    """

    async def _wait_for_quiescence(*, deadline_seconds: float = 180.0) -> None:
        from everos.infra.persistence.sqlite import md_change_state_repo
        from everos.service.memorize import _get_engine

        ome_engine = _get_engine()

        async def _fully_drained() -> bool:
            # OME side first: cascade can only fire after a strategy has
            # written md, so while a run is in flight the queue check
            # would be premature.
            ome_idle = await ome_engine.wait_idle(timeout=0.5)
            if ome_idle:
                # `pending` includes the internal `processing` rows (see
                # QueueSummary).
                queue = await md_change_state_repo.queue_summary()
                return queue.pending == 0
            return False

        await _poll(_fully_drained, deadline_seconds=deadline_seconds)

    return _wait_for_quiescence
# ---------------------------------------------------------------------------
# Count helpers (used directly by tests for buffer-delta assertions)
# ---------------------------------------------------------------------------
@pytest.fixture
def buffer_count() -> Callable[[str], Awaitable[int]]:
    """Provide ``await buffer_count(session_id) -> int``.

    The callable counts the ``unprocessed_buffer`` rows belonging to the
    given session, using a bound parameter for the session id.
    """

    async def _count_rows(session_id: str) -> int:
        from everos.infra.persistence.sqlite import get_engine

        query = text(
            "SELECT COUNT(*) FROM unprocessed_buffer WHERE session_id = :sid"
        )
        async with get_engine().connect() as connection:
            rows = await connection.execute(query, {"sid": session_id})
            count = rows.scalar()
        # A missing / NULL scalar is reported as zero rows.
        return int(count) if count else 0

    return _count_rows
@pytest.fixture
def memcell_count() -> Callable[..., Awaitable[int]]:
    """Return an async callable: ``await memcell_count(session_id=None) -> int``.

    Counts ``memcell`` rows. Pass a ``session_id`` to count only that
    session's rows, or call with no argument to count every row.

    The return annotation is ``Callable[..., Awaitable[int]]`` (not
    ``Callable[[str], ...]``) because the argument is optional.
    """

    async def _count(session_id: str | None = None) -> int:
        from everos.infra.persistence.sqlite import get_engine

        engine = get_engine()
        async with engine.connect() as conn:
            if session_id is None:
                result = await conn.execute(text("SELECT COUNT(*) FROM memcell"))
            else:
                # Bound parameter rather than string formatting.
                result = await conn.execute(
                    text("SELECT COUNT(*) FROM memcell WHERE session_id = :sid"),
                    {"sid": session_id},
                )
            # A missing / NULL scalar is reported as zero rows.
            return int(result.scalar() or 0)

    return _count

View File

@ -0,0 +1,206 @@
"""Agent pipeline e2e: 5 SWE-bench trajectories drive /add + /flush.
Drives the full HTTP route through to storage, exercising the agent-track
pipeline (boundary → memcell → extract_agent_case → trigger_skill_clustering
→ extract_agent_skill) with real LLM and real embedder credentials.
Mixed tenancy by design (sender_id alignment from fixture):
agent_pytest (1 session, pytest-dev/pytest-7236) ┐ independent
agent_sympy (1 session, sympy/sympy-18763) ┘ owners
agent_django (3 sessions, django/django-{14311,16255,16263}) shared
Concurrency strategy (workaround for the known
``trigger_skill_clustering`` read-modify-write race on a shared owner_id):
Phase 1: pytest + sympy concurrent via asyncio.gather (disjoint owners)
Phase 2: 3 django sessions sequential (same owner, would race)
Once the cluster race is fixed in production, Phase 2 can collapse into
the same gather and the test will still pass — the assertions are
race-free, only the driver is conservative.
White-box assertions (audit trail of internal surfaces touched):
- sqlite ``memcell`` rows per session_id
- filesystem ``<root>/agents/<agent>/.cases/*.md`` presence
- LanceDB ``agent_case`` rows by ``owner_id`` (count + session_id set)
- LanceDB ``agent_skill`` rows by ``owner_id`` (soft — LLM-dependent)
"""
from __future__ import annotations
import asyncio
import json
from collections.abc import Awaitable, Callable
from pathlib import Path
import httpx
import pytest
from everos.infra.persistence.lancedb import agent_case_repo, agent_skill_repo
from everos.infra.persistence.markdown import AgentCaseDailyFrontmatter
# ``parents[1]`` is the parent of this file's directory, so the fixtures
# are read from the sibling ``fixtures/agent_trajectories`` directory.
_FIXTURE_DIR = Path(__file__).resolve().parents[1] / "fixtures" / "agent_trajectories"
# Hand-picked trajectories (kept in-tree as fixtures; this selection is
# the source of truth — the original converter is not in the repo).
# Each session id doubles as the fixture file stem (``<session_id>.json``).
_PYTEST_SESSION = "session_pytest_7236"
_SYMPY_SESSION = "session_sympy_18763"
_DJANGO_SESSIONS = (
    "session_django_14311",
    "session_django_16255",
    "session_django_16263",
)
# Owner ids the test filters on when querying LanceDB (``owner_id = '...'``)
# and when locating each agent's case directory on disk.
_AGENT_PYTEST = "agent_pytest"
_AGENT_SYMPY = "agent_sympy"
_AGENT_DJANGO = "agent_django"
# Phase 3 drain budget: OME chain (case → cluster → skill) writes md in
# stages, each picked up by cascade. Multiple drain rounds with brief
# sleeps let the chain quiesce without false-positive completion.
# The timeout applies to each round separately, not to the whole phase.
_DRAIN_ROUNDS = 4
_DRAIN_TIMEOUT_SECONDS = 300.0
_DRAIN_INTER_ROUND_SLEEP_SECONDS = 5.0
def _load_fixture(session_id: str) -> dict:
    """Load the trajectory fixture ``<session_id>.json`` from ``_FIXTURE_DIR``.

    The file is decoded explicitly as UTF-8; the default for
    ``read_text`` follows the platform locale, which is not guaranteed to
    be UTF-8.

    Raises:
        FileNotFoundError: if no fixture exists for ``session_id``.
    """
    fixture_path = _FIXTURE_DIR / f"{session_id}.json"
    return json.loads(fixture_path.read_text(encoding="utf-8"))
async def _drive_session(
client: httpx.AsyncClient, session_data: dict
) -> tuple[str, str]:
"""Run /add followed by /flush for one trajectory; return status."""
sid = session_data["everos_session_id"]
msgs = session_data["messages"]
# MessageItemDTO.max_length=500; our largest fixture has 324 messages.
r = await client.post(
"/api/v1/memory/add",
json={"session_id": sid, "messages": msgs},
timeout=600.0,
)
assert r.status_code == 200, (
f"{sid}: /add returned {r.status_code}{r.text[:300]}"
)
r = await client.post(
"/api/v1/memory/flush",
json={"session_id": sid},
timeout=600.0,
)
assert r.status_code == 200, (
f"{sid}: /flush returned {r.status_code}{r.text[:300]}"
)
return sid, r.json()["data"]["status"]
@pytest.mark.slow
@pytest.mark.live_llm
async def test_agent_pipeline_e2e_mixed_tenancy(
    async_client: httpx.AsyncClient,
    core_pipeline_runtime: Path,
    pipeline_done_poll: Callable[..., Awaitable[None]],
    memcell_count: Callable[..., Awaitable[int]],
) -> None:
    """5 SWE-bench trajectories → agent_case + agent_skill on three agents.

    Drives the pytest and sympy sessions concurrently, the three django
    sessions one after another, waits for the pipeline to drain, then
    checks sqlite memcells, on-disk case markdown, LanceDB rows per owner
    and strict md ↔ LanceDB parity.
    """
    memory_root = core_pipeline_runtime
    independent_fixtures = (
        _load_fixture(_PYTEST_SESSION),
        _load_fixture(_SYMPY_SESSION),
    )
    shared_owner_fixtures = [_load_fixture(sid) for sid in _DJANGO_SESSIONS]

    # ── Phase 1: independent owners concurrent ────────────────────────────
    await asyncio.gather(
        *(_drive_session(async_client, fixture) for fixture in independent_fixtures)
    )

    # ── Phase 2: shared owner_id, sequential to dodge cluster race ────────
    for fixture in shared_owner_fixtures:
        await _drive_session(async_client, fixture)

    # ── Phase 3: drain OME chain + cascade ────────────────────────────────
    for _round in range(_DRAIN_ROUNDS):
        await pipeline_done_poll(deadline_seconds=_DRAIN_TIMEOUT_SECONDS)
        await asyncio.sleep(_DRAIN_INTER_ROUND_SLEEP_SECONDS)

    # ── Phase 4: assertions ───────────────────────────────────────────────
    agent_ids = (_AGENT_PYTEST, _AGENT_SYMPY, _AGENT_DJANGO)

    # 4.1 every session produced ≥1 memcell
    for sid in (_PYTEST_SESSION, _SYMPY_SESSION, *_DJANGO_SESSIONS):
        n = await memcell_count(sid)
        assert n >= 1, f"no memcell for session {sid!r} (got {n})"

    # 4.2 each agent has a .cases dir with ≥1 .md file
    agents_dir = memory_root / "default_app" / "default_project" / "agents"
    case_dir_name = AgentCaseDailyFrontmatter.DIR_NAME
    for agent_id in agent_ids:
        case_dir = agents_dir / agent_id / case_dir_name
        assert case_dir.is_dir(), f"missing {case_dir!s} for agent={agent_id!r}"
        assert any(case_dir.glob("*.md")), f"no agent_case md under {case_dir!s}"

    # 4.3 LanceDB agent_case rows per owner
    async def _cases_for(owner_id: str):
        return await agent_case_repo.find_where(f"owner_id = '{owner_id}'")

    pytest_cases = await _cases_for(_AGENT_PYTEST)
    sympy_cases = await _cases_for(_AGENT_SYMPY)
    django_cases = await _cases_for(_AGENT_DJANGO)
    assert len(pytest_cases) >= 1, (
        f"no agent_pytest rows in LanceDB (got {len(pytest_cases)})"
    )
    assert len(sympy_cases) >= 1, (
        f"no agent_sympy rows in LanceDB (got {len(sympy_cases)})"
    )
    # Each django session writes at least one cell → at least one case per
    # session. Lower bound 3 covers the minimum; LLM may produce more.
    assert len(django_cases) >= 3, (
        f"agent_django expected ≥3 LanceDB cases (3 sessions), got {len(django_cases)}"
    )

    # 4.4 cross-owner isolation — each agent's cases trace back only to
    # its own sessions
    pytest_session_ids = {case.session_id for case in pytest_cases}
    assert pytest_session_ids == {_PYTEST_SESSION}, (
        f"agent_pytest cases leaked across sessions: {pytest_session_ids}"
    )
    sympy_session_ids = {case.session_id for case in sympy_cases}
    assert sympy_session_ids == {_SYMPY_SESSION}, (
        f"agent_sympy cases leaked across sessions: {sympy_session_ids}"
    )
    django_session_ids = {case.session_id for case in django_cases}
    assert django_session_ids == set(_DJANGO_SESSIONS), (
        f"agent_django session set mismatch — got {django_session_ids}, "
        f"want {set(_DJANGO_SESSIONS)}"
    )

    # 4.5 agent_skill — soft: emission depends on LLM clustering quality
    # gate (skip_quality_threshold + cluster size). pytest/sympy are
    # single-case clusters and may legitimately yield 0 skills. django
    # has 3 cases and should aggregate into ≥1 cluster of size ≥2,
    # producing ≥1 skill — but we keep this informational (LLM-dependent)
    # rather than a hard floor to avoid flaky CI signal.
    async def _skills_for(owner_id: str):
        return await agent_skill_repo.find_where(f"owner_id = '{owner_id}'")

    pytest_skills = await _skills_for(_AGENT_PYTEST)
    sympy_skills = await _skills_for(_AGENT_SYMPY)
    django_skills = await _skills_for(_AGENT_DJANGO)
    # Hard sanity: the three queries above completed and returned sized
    # results (the repo isn't broken). The comparisons cannot fail.
    assert len(pytest_skills) >= 0
    assert len(sympy_skills) >= 0
    assert len(django_skills) >= 0

    # 4.6 strict md ↔ LanceDB parity across every cascade kind
    #
    # The per-owner counts above are loose (LLM-emission-dependent); this
    # check enforces byte-exact id-set + content_sha256 parity across
    # every md the agent pipeline wrote.
    #
    # ``expect_at_least`` pins agent_case (every session writes ≥1 case)
    # so an empty glob would fail loudly. agent_skill is NOT pinned —
    # emission depends on the LLM clustering quality gate per 4.5; a
    # legitimately empty agent_skill md set is still a passing run.
    from tests._consistency_assertions import assert_md_lance_strict_consistent

    await assert_md_lance_strict_consistent(
        memory_root,
        expect_at_least={"agent_case": 1},
    )

View File

@ -0,0 +1,337 @@
"""Add + Flush core pipeline smoke — long real-conversation drive.
Goal: prove the user-side add/flush chain is end-to-end live. Feeds
**419 real LoCoMo messages** through ``POST /api/v1/memory/add`` (in 19
batches sharing one session_id) then a final ``POST /flush``, and
verifies:
1. Each /add returns a sane status and the unprocessed_buffer delta
matches what the service claims (accumulated → grew by batch size;
extracted → shrank or stayed flat).
2. After /flush the buffer is empty and the memcell table has rows.
3. After cascade drains, episode md files exist and LanceDB rows
reflect them with valid content_sha256 + vector.
4. OME-driven async strategies have produced atomic_fact / foresight /
profile md files.
Real LLM + real embedder (creds via ``.env``). Marked ``slow`` —
``pytest -m slow tests/integration/test_add_flush_core_pipeline_smoke.py``.
"""
from __future__ import annotations
import os
import shutil
from collections.abc import Awaitable, Callable
from pathlib import Path
import httpx
import pytest
from everos.infra.persistence.markdown import (
AtomicFactDailyFrontmatter,
EpisodeDailyFrontmatter,
ForesightDailyFrontmatter,
)
# Directory names live on the frontmatter schemas (single source of truth);
# atomic_facts / foresights are dotfile-hidden so users only see episodes.
# ``_EPISODE_DIR`` and ``_ATOMIC_FACT_DIR`` are passed to ``_list_md_files``
# as the per-user sub-path when locating that memory type on disk.
_EPISODE_DIR = EpisodeDailyFrontmatter.DIR_NAME
_ATOMIC_FACT_DIR = AtomicFactDailyFrontmatter.DIR_NAME
_FORESIGHT_DIR = ForesightDailyFrontmatter.DIR_NAME
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _to_add_messages(batch: dict) -> list[dict]:
"""Strip ``_audit_*`` fields; keep only what MessageItemDTO accepts."""
return [
{
"sender_id": m["sender_id"],
"role": m["role"],
"timestamp": m["timestamp"],
"content": m["content"],
}
for m in batch["messages"]
]
def _list_md_files(memory_root: Path, subpath: str) -> list[Path]:
"""List .md files under
``<memory_root>/default_app/default_project/users/<user>/<subpath>/``."""
user_dir = memory_root / "default_app" / "default_project" / "users"
if not user_dir.exists():
return []
out: list[Path] = []
for user_dir_child in user_dir.iterdir():
target = user_dir_child / subpath
if target.is_dir():
out.extend(target.rglob("*.md"))
elif target.with_suffix(".md").exists():
out.append(target.with_suffix(".md"))
return out
def _count_episode_entries(md_files: list[Path]) -> int:
"""Count ``## entry-*`` blocks across all episode md files."""
n = 0
for f in md_files:
for line in f.read_text().splitlines():
stripped = line.strip()
# Daily-log entries start with `## ` followed by an id token.
# We count any second-level heading that isn't the standard
# subsection headers used inside an entry.
if stripped.startswith("## ") and not stripped.startswith(
("## Subject", "## Summary", "## Content", "## Fact", "## Foresight")
):
n += 1
return n
def _maybe_snapshot_memory_root(memory_root: Path) -> None:
"""Copy ``memory_root`` to ``$EVEROS_KEEP_CORPUS_TO`` when set.
Used to harvest a known-good corpus (md + sqlite + lancedb three-piece
set) after a green test run, for later upload as the /search e2e
fixture. Pure sync I/O — kept out of the async test body so ASYNC240
doesn't complain about pathlib usage on the async path.
"""
keep_to = os.environ.get("EVEROS_KEEP_CORPUS_TO")
if not keep_to:
return
dest = Path(keep_to).resolve()
if dest.exists():
shutil.rmtree(dest)
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copytree(memory_root, dest)
# ---------------------------------------------------------------------------
# The test (slow — hits real LLM + embedder; opt in via `pytest -m slow`)
# ---------------------------------------------------------------------------
@pytest.mark.slow
@pytest.mark.live_llm
# Retries cover transient real-LLM flakes: OME profile clustering
# occasionally fails to emit user.md within the drain deadline
# (LLM timeout, empty response), but is reliably stable on retry.
# reruns_delay leaves the cascade workers idle between attempts so we
# don't pile state on top of a prior run.
@pytest.mark.flaky(reruns=2, reruns_delay=5)
async def test_long_conversation_produces_all_memory_types(
    long_conversation: dict,
    async_client: httpx.AsyncClient,
    core_pipeline_runtime: Path,
    pipeline_done_poll: Callable[..., Awaitable[None]],
    buffer_count: Callable[[str], Awaitable[int]],
    memcell_count: Callable[..., Awaitable[int]],
) -> None:
    """One big seamless run: add 19 batches, flush, poll, assert everything.

    Stages:
        0. Baseline: buffer and memcell counts start at zero.
        1. POST every batch to ``/add`` and check the buffer / memcell
           deltas against the status the service reports.
        2. POST ``/flush`` and check the buffer is drained.
        3. Wait for the OME engine to go idle AND the cascade queue to
           drain (``pipeline_done_poll``).
        5. Check md artifacts and LanceDB rows per memory type, then
           strict md ↔ LanceDB parity.
        6. Optionally snapshot the memory root.
    """
    session_id = long_conversation["everos_session_id"]
    memory_root = core_pipeline_runtime

    # ── Stage 0: baseline ─────────────────────────────────────────────────
    assert await buffer_count(session_id) == 0
    assert await memcell_count(session_id) == 0

    # ── Stage 1: drip 19 batches into /add, asserting buffer delta ────────
    last_status: str | None = None
    for idx, batch in enumerate(long_conversation["batches"]):
        msg_count = batch["message_count"]
        buf_before = await buffer_count(session_id)
        cells_before = await memcell_count(session_id)
        resp = await async_client.post(
            "/api/v1/memory/add",
            json={"session_id": session_id, "messages": _to_add_messages(batch)},
            timeout=600.0,  # boundary detection may call LLM
        )
        assert resp.status_code == 200, (
            f"batch {idx} ({batch['locomo_session']}): {resp.status_code} {resp.text}"
        )
        body = resp.json()
        status: str = body["data"]["status"]
        returned_count: int = body["data"]["message_count"]
        assert status in {"accumulated", "extracted"}, body
        assert returned_count == msg_count, body
        last_status = status
        buf_after = await buffer_count(session_id)
        cells_after = await memcell_count(session_id)
        # Buffer-delta invariants:
        if status == "accumulated":
            # No boundary cut → entire batch piled into the buffer.
            assert buf_after == buf_before + msg_count, (
                f"batch {idx} accumulated: expected buf {buf_before + msg_count}, "
                f"got {buf_after}"
            )
            assert cells_after == cells_before, (
                f"batch {idx} accumulated: memcell should not change "
                f"({cells_before} -> {cells_after})"
            )
        else:  # "extracted"
            # Boundary fired: some messages turned into memcell(s), tail
            # (if any) stays in the buffer. We can't predict the exact tail
            # length but two invariants must hold.
            assert cells_after > cells_before, (
                f"batch {idx} extracted: memcell should grow "
                f"({cells_before} -> {cells_after})"
            )
            assert buf_after >= 0
            # Conservation: nothing should silently vanish — the union of
            # (buffer carry-over + this batch) must equal (new buffer +
            # messages carved into cells). We approximate by asserting the
            # new buffer is at most the carry-over + this batch size.
            assert buf_after <= buf_before + msg_count, (
                f"batch {idx} extracted: buffer overflow "
                f"({buf_before} + {msg_count} < {buf_after})"
            )

    # ── Stage 2: flush ────────────────────────────────────────────────────
    cells_pre_flush = await memcell_count(session_id)
    resp = await async_client.post(
        "/api/v1/memory/flush",
        json={"session_id": session_id},
        timeout=600.0,
    )
    assert resp.status_code == 200, resp.text
    flush_status = resp.json()["data"]["status"]
    assert flush_status in {"extracted", "no_extraction"}, resp.json()
    assert await buffer_count(session_id) == 0, "buffer must be drained after flush"
    cells_after_flush = await memcell_count(session_id)
    # If the last /add was already 'extracted' and emptied the buffer,
    # flush returns 'no_extraction'. Otherwise flush must produce ≥ 1
    # cell to satisfy the boundary semantics.
    if flush_status == "extracted":
        assert cells_after_flush > cells_pre_flush
    # 419 LoCoMo messages produce ~19 memcells in practice (LLM boundary
    # decides semantic cuts; daily-life chat carves coarsely). Threshold
    # 15 leaves room for run-to-run variance from the boundary LLM.
    assert cells_after_flush >= 15, (
        f"expected ≥ 15 memcells from 419 messages, got {cells_after_flush}; "
        f"last add status was {last_status!r}, flush was {flush_status!r}"
    )

    # ── Stage 3 + 4: wait for OME + cascade to drain ──────────────────────
    # Cascade syncs md → LanceDB. OME async strategies (atomic / foresight /
    # profile) also write md, which cascade then picks up. A cascade-only
    # wait can return while a slow LLM strategy is still in flight (it has
    # not written md yet, so the queue is momentarily empty), so this waits
    # on the composite condition: OME idle AND cascade queue empty.
    await pipeline_done_poll(deadline_seconds=600.0)

    # ── Stage 5: artifacts on disk + LanceDB ──────────────────────────────
    # 5.1 episodes
    episode_files = _list_md_files(memory_root, _EPISODE_DIR)
    assert episode_files, "no episode md files written"
    episode_entries = _count_episode_entries(episode_files)
    # 19 memcells × 2 owners (caroline + melanie) ≈ 36 episode rows seen
    # in practice; threshold 15 leaves variance room.
    assert episode_entries >= 15, (
        f"expected ≥ 15 episode entries across {len(episode_files)} files, "
        f"got {episode_entries}"
    )

    # 5.2 episode → LanceDB
    from everos.infra.persistence.lancedb import episode_repo

    lance_episode_count = await episode_repo.count()
    # The message states the threshold actually checked; the md entry
    # count is included only as context.
    assert lance_episode_count >= 15, (
        f"expected ≥ 15 LanceDB episode rows, got {lance_episode_count} "
        f"(md entries: {episode_entries})"
    )

    # 5.3 atomic_fact
    af_files = _list_md_files(memory_root, _ATOMIC_FACT_DIR)
    assert af_files, "no atomic_fact md files — extract_atomic_facts did not emit"
    from everos.infra.persistence.lancedb import atomic_fact_repo

    lance_af_count = await atomic_fact_repo.count()
    assert lance_af_count >= 1, (
        f"LanceDB atomic_fact rows = {lance_af_count}; expected ≥ 1"
    )

    # 5.4 foresight
    # Foresight extractor is correctly invoked (log: ``foresights_extracted``
    # per memcell) but daily-life chat about kids / work / hobbies rarely
    # yields explicit future-intent statements, so count is usually 0.
    # We assert the LanceDB table exists (count returns 0 cleanly) — not
    # that any row was emitted.
    from everos.infra.persistence.lancedb import foresight_repo

    lance_fs_count = await foresight_repo.count()
    assert lance_fs_count >= 0, f"foresight table broken: count={lance_fs_count}"

    # 5.5 profile (md only — profile retrieval path is stub; we only assert
    # the writer wrote something). Profile lives as a single file
    # ``users/<user_id>/user.md`` (schema: ``UserProfileFrontmatter.PROFILE_FILENAME``).
    from everos.infra.persistence.markdown import UserProfileFrontmatter

    profile_filename = UserProfileFrontmatter.PROFILE_FILENAME
    profile_files: list[Path] = []
    users_root = memory_root / "default_app" / "default_project" / "users"
    if users_root.is_dir():
        for ud in users_root.iterdir():
            candidate = ud / profile_filename
            if candidate.exists():
                profile_files.append(candidate)
    assert profile_files, (
        f"no {profile_filename} written — extract_user_profile / "
        "trigger_profile_clustering did not emit"
    )
    # At least one profile file has non-trivial content.
    assert any(f.read_text(encoding="utf-8").strip() for f in profile_files), (
        f"all {profile_filename} files are empty"
    )

    # ── Stage 5b: strict md ↔ LanceDB parity (every cascade kind) ─────────
    # Counts above are looser ``>=`` checks against LLM non-determinism;
    # here we enforce byte-exact id-set + content_sha256 parity across
    # every md the pipeline wrote. Catches: missing rows, orphan rows,
    # content drift between md and the indexed projection.
    #
    # ``expect_at_least`` pins the kinds this pipeline MUST produce so an
    # empty glob (kind not emitted at all) fails loudly — without this
    # guard the parity check would silently pass on zero files. Foresight
    # is NOT pinned because the LLM frequently yields 0 future-intent
    # statements on daily-life chat (see commentary above stage 5.4).
    from tests._consistency_assertions import assert_md_lance_strict_consistent

    await assert_md_lance_strict_consistent(
        memory_root,
        expect_at_least={
            "episode": 1,
            "atomic_fact": 1,
            "user_profile": 1,
        },
    )

    # ── Stage 6: optional corpus snapshot ─────────────────────────────────
    # When ``EVEROS_KEEP_CORPUS_TO=<dest>`` is set, copy the post-test
    # ``memory_root`` to ``<dest>`` so it can be tarred + uploaded as a
    # test corpus for the /search e2e suite. Skipped silently when the
    # env var is absent (default test runs don't snapshot).
    _maybe_snapshot_memory_root(memory_root)
# ---------------------------------------------------------------------------
# Diagnostic: lighter smoke that doesn't depend on the long fixture, used
# to validate the conftest fixtures themselves are wired correctly.
# ---------------------------------------------------------------------------
async def test_async_client_starts_and_health_responds(
    async_client: httpx.AsyncClient,
) -> None:
    """Minimal wiring check: the app comes up and ``GET /health`` answers 200.

    Does not depend on the long-conversation fixture; a failure here
    points at the fixture wiring rather than at the pipeline.
    """
    health = await async_client.get("/health")
    assert health.status_code == 200, health.text

View File

@ -0,0 +1,219 @@
"""Real full-pipeline timezone e2e — the gold-standard anti-drift test.
Exercises the **complete stack** under a display-tz switch:
POST /add → unprocessed_buffer → POST /flush
boundary detection (memcell)
markdown writer (episode.md)
cascade scanner / worker
LanceDB index (episode row)
then POST /search and POST /get under display tz = Shanghai,
switch display tz to UTC, repeat /search + /get.
Pin: the **UTC instant** of every returned ``timestamp`` field is
identical across all four renders. Only the offset / wall-clock
changes. This is the user-facing contract of the storage-UTC discipline.
Real LLM (boundary detection + episode extraction) + real embedder
(LanceDB vector + FTS) — marked ``@slow`` ``@live_llm``.
"""
from __future__ import annotations
import datetime as dt
from collections.abc import Awaitable, Callable
import httpx
import pytest
from everos.component.utils import datetime as dt_module
from everos.component.utils.datetime import from_iso_format
from everos.config import load_settings
async def _switch_display_tz(monkeypatch: pytest.MonkeyPatch, tz: str) -> None:
    """Point ``EVEROS_MEMORY__TIMEZONE`` at *tz* and invalidate both caches.

    Args:
        monkeypatch: pytest fixture used to set the env var (undone at teardown).
        tz: timezone name to expose as the display timezone, e.g. ``"UTC"``.

    Both ``load_settings`` and ``dt_module._display_tz`` expose
    ``cache_clear``; each must be cleared, otherwise the previously cached
    value keeps being served and the new env var is never read.
    """
    monkeypatch.setenv("EVEROS_MEMORY__TIMEZONE", tz)
    # Settings cache first, then the display-tz resolver that derives from it.
    for cached in (load_settings, dt_module._display_tz):
        cached.cache_clear()
@pytest.mark.slow
@pytest.mark.live_llm
async def test_full_pipeline_tz_switch_preserves_utc_instant(
    async_client: httpx.AsyncClient,
    pipeline_done_poll: Callable[..., Awaitable[None]],
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Real /add → /flush → cascade → LanceDB → /search /get under tz switch.

    Steps:
      1. Configure ``EVEROS_MEMORY__TIMEZONE=Asia/Shanghai``.
      2. POST /add a single message with a pinned epoch-ms timestamp.
      3. POST /flush — forces boundary detection to carve a memcell out
         of the single-message buffer.
      4. Wait for the pipeline to drain (md → LanceDB indexed).
      5. POST /search + POST /get: capture episode timestamp strings.
      6. Switch ``EVEROS_MEMORY__TIMEZONE=UTC``.
      7. POST /search + POST /get again: capture episode timestamp strings.
      8. Parse all four timestamp strings back to UTC instants. They must
         all be equal. The offsets and wall-clock numbers will differ
         between Shanghai and UTC renders — that's expected; what must
         NOT differ is the absolute UTC instant.

    Anti-drift contract is end-to-end: writes under one display tz
    must read back under another with zero data drift.
    """
    user_id = "alice_full_tz"
    session_id = "sess_full_tz"
    # 1748498400000 ms = 2025-05-29T06:00:00Z = 2025-05-29T14:00:00+08:00
    pinned_ms = 1748498400000
    expected_instant = dt.datetime.fromtimestamp(pinned_ms / 1000, tz=dt.UTC)

    # The same two read requests are replayed under both display timezones,
    # so build their payloads once.
    search_payload = {
        "user_id": user_id,
        "query": "climbing",
        "method": "keyword",  # no embedder cost; FTS index built by cascade
        "filters": {"session_id": session_id},
    }
    get_payload = {
        "user_id": user_id,
        "memory_type": "episode",
        "page": 1,
        "page_size": 20,
    }
    utc_suffixes = ("Z", "+00:00")

    # ── Step 1+2: configure Shanghai + write via /add ──
    await _switch_display_tz(monkeypatch, "Asia/Shanghai")
    resp = await async_client.post(
        "/api/v1/memory/add",
        json={
            "user_id": user_id,
            "session_id": session_id,
            "messages": [
                {
                    "sender_id": user_id,
                    "role": "user",
                    "timestamp": pinned_ms,
                    "content": "I love climbing in Yosemite every spring.",
                },
            ],
        },
        timeout=60.0,
    )
    assert resp.status_code == 200, resp.text

    # ── Step 3: /flush forces boundary detection on the single-message buffer ──
    resp = await async_client.post(
        "/api/v1/memory/flush",
        json={"user_id": user_id, "session_id": session_id},
        timeout=60.0,
    )
    assert resp.status_code == 200, resp.text

    # ── Step 4: wait for OME strategies + cascade to fully drain ──
    # 10-minute deadline: extract_episode + extract_atomic_facts run under
    # real LLM and the cascade worker only fires after md lands. The
    # `pipeline_done_poll` fixture covers both OME idle and cascade queue
    # empty.
    await pipeline_done_poll(deadline_seconds=600.0)

    # ── Step 5: /search + /get under Shanghai display tz ──
    resp_search_sh = await async_client.post(
        "/api/v1/memory/search",
        json=search_payload,
        timeout=60.0,
    )
    assert resp_search_sh.status_code == 200, resp_search_sh.text
    eps_search_sh = resp_search_sh.json()["data"]["episodes"]
    assert eps_search_sh, (
        f"/search must return an episode after flush+cascade; got {eps_search_sh!r}"
    )
    ts_search_sh = eps_search_sh[0]["timestamp"]
    assert ts_search_sh.endswith("+08:00"), (
        f"Shanghai display tz should render offset +08:00; got {ts_search_sh!r}"
    )

    resp_get_sh = await async_client.post(
        "/api/v1/memory/get",
        json=get_payload,
        timeout=60.0,
    )
    assert resp_get_sh.status_code == 200, resp_get_sh.text
    eps_get_sh = resp_get_sh.json()["data"]["episodes"]
    assert eps_get_sh, "/get must return the same episode /search did"
    ts_get_sh = eps_get_sh[0]["timestamp"]
    assert ts_get_sh.endswith("+08:00"), ts_get_sh

    # ── Step 6: switch to UTC display tz (drops caches) ──
    await _switch_display_tz(monkeypatch, "UTC")

    # ── Step 7: /search + /get again, same on-disk row, new render ──
    resp_search_utc = await async_client.post(
        "/api/v1/memory/search",
        json=search_payload,
        timeout=60.0,
    )
    assert resp_search_utc.status_code == 200, resp_search_utc.text
    eps_search_utc = resp_search_utc.json()["data"]["episodes"]
    assert eps_search_utc
    ts_search_utc = eps_search_utc[0]["timestamp"]
    assert ts_search_utc.endswith(utc_suffixes), (
        f"UTC display tz should render Z / +00:00; got {ts_search_utc!r}"
    )

    resp_get_utc = await async_client.post(
        "/api/v1/memory/get",
        json=get_payload,
        timeout=60.0,
    )
    assert resp_get_utc.status_code == 200, resp_get_utc.text
    eps_get_utc = resp_get_utc.json()["data"]["episodes"]
    # Guard the index below: an empty page should fail with a readable
    # assertion, not an IndexError.
    assert eps_get_utc, (
        f"/get must still return the episode after the tz switch; got {eps_get_utc!r}"
    )
    ts_get_utc = eps_get_utc[0]["timestamp"]
    assert ts_get_utc.endswith(utc_suffixes), ts_get_utc

    # ── Step 8: anti-drift assertion — all four UTC instants identical ──
    instants = {
        "search/Shanghai": from_iso_format(ts_search_sh).astimezone(dt.UTC),
        "get/Shanghai": from_iso_format(ts_get_sh).astimezone(dt.UTC),
        "search/UTC": from_iso_format(ts_search_utc).astimezone(dt.UTC),
        "get/UTC": from_iso_format(ts_get_utc).astimezone(dt.UTC),
    }
    distinct = set(instants.values())
    assert len(distinct) == 1, (
        f"display-tz switch must NOT drift the UTC instant. Got distinct "
        f"instants across renders: {instants!r}"
    )
    actual_instant = next(iter(distinct))
    # Episode timestamp inherits from the last message's epoch ms — the
    # pinned input value must round-trip exactly.
    assert actual_instant == expected_instant, (
        f"episode UTC instant must equal the pinned input ms epoch; "
        f"expected {expected_instant.isoformat()}, got {actual_instant.isoformat()}"
    )

    # ── Sanity: across the four renders, identical instant projects to the
    #    correct wall-clock under each display tz ──
    # Shanghai: 14:00 wall clock; UTC: 06:00 wall clock.
    assert "T14:00:00" in ts_search_sh, ts_search_sh
    assert "T14:00:00" in ts_get_sh, ts_get_sh
    assert "T06:00:00" in ts_search_utc, ts_search_utc
    assert "T06:00:00" in ts_get_utc, ts_get_utc

View File

@ -0,0 +1,829 @@
"""End-to-end integration tests for ``POST /api/v1/memory/get``.
These tests spin up the FastAPI app with **no lifespan providers**
against a tmp ``EVEROS_MEMORY__ROOT``, populate a real LanceDB
``episode`` table directly via the repo singleton, and exercise the
HTTP route. They cover the wiring that unit tests cannot: pydantic
422s from the route, JSON envelope shape, and the full
``request → service → manager → LanceDB`` path.
"""
from __future__ import annotations
import asyncio
import datetime as _dt
from collections.abc import AsyncIterator
from importlib import import_module
from pathlib import Path
import pytest
from httpx import ASGITransport, AsyncClient
from everos.config import load_settings
from everos.entrypoints.api.app import create_app
from everos.infra.persistence.lancedb import (
AgentCase,
AgentSkill,
Episode,
UserProfile,
agent_case_repo,
agent_skill_repo,
episode_repo,
lancedb_manager,
user_profile_repo,
)
# ``everos.service.__init__`` re-exports the ``get`` function under the
# same name as the submodule (``from .get import get as get``), which
# shadows the submodule when imported normally. Pull the actual module
# via importlib so the test can poke at its ``_manager`` singleton.
get_service_mod = import_module("everos.service.get")
def _ts(day: int) -> _dt.datetime:
return _dt.datetime(2026, 1, day, tzinfo=_dt.UTC)
def _episode(
    entry: str,
    *,
    owner: str = "u1",
    session: str = "sess_a",
    parent_id: str = "mc_1",
    sender_ids: list[str] | None = None,
    day: int = 1,
) -> Episode:
    """Build an ``Episode`` row for seeding the LanceDB ``episode`` table.

    Args:
        entry: entry id; also embedded in the row id, text fields and md path.
        owner: owner id (``owner_type`` is always ``"user"``).
        session: session id stored on the row.
        parent_id: id of the parent memcell.
        sender_ids: explicit sender list; ``None`` means ``[owner, "assistant"]``.
        day: day of January 2026 used for the row timestamp (via ``_ts``).

    Returns:
        An ``Episode`` with a 1024-element zero vector and a dummy content hash.
    """
    senders = [owner, "assistant"] if sender_ids is None else sender_ids
    body = f"body of {entry}"
    return Episode(
        id=f"{owner}_{entry}",
        entry_id=entry,
        owner_id=owner,
        owner_type="user",
        session_id=session,
        timestamp=_ts(day),
        parent_type="memcell",
        parent_id=parent_id,
        sender_ids=senders,
        subject=f"subj {entry}",
        summary=f"summary {entry}",
        # The raw body and its tokenised column carry the same text here.
        episode=body,
        episode_tokens=body,
        md_path=f"users/{owner}/episodes/{entry}.md",
        content_sha256="abc",
        vector=[0.0] * 1024,
    )
def _agent_case(
    entry: str,
    *,
    owner: str = "a1",
    session: str = "sess_x",
    day: int = 1,
) -> AgentCase:
    """Build an ``AgentCase`` row for seeding the LanceDB agent-case table.

    Args:
        entry: entry id; also embedded in the row id, text fields and md path.
        owner: owning agent id (``owner_type`` is always ``"agent"``).
        session: session id stored on the row.
        day: day of January 2026 used for the row timestamp (via ``_ts``).

    Returns:
        An ``AgentCase`` with ``quality_score=0.8``, no ``key_insight`` and a
        1024-element zero vector.
    """
    intent = f"intent {entry}"
    approach = f"approach {entry}"
    return AgentCase(
        id=f"{owner}_{entry}",
        entry_id=entry,
        owner_id=owner,
        owner_type="agent",
        session_id=session,
        timestamp=_ts(day),
        parent_type="memcell",
        parent_id="mc_99",
        quality_score=0.8,
        # Each text column and its ``*_tokens`` twin carry the same string.
        task_intent=intent,
        task_intent_tokens=intent,
        approach=approach,
        approach_tokens=approach,
        key_insight=None,
        md_path=f"agents/{owner}/cases/{entry}.md",
        content_sha256="abc",
        vector=[0.0] * 1024,
    )
def _agent_skill(
    name: str,
    *,
    owner: str = "a1",
) -> AgentSkill:
    """Build an ``AgentSkill`` row for seeding the LanceDB agent-skill table.

    Args:
        name: skill name; also embedded in the row id, text fields and md path.
        owner: owning agent id (``owner_type`` is always ``"agent"``).

    Returns:
        An ``AgentSkill`` with fixed confidence / maturity scores, a single
        source case id derived from *owner*, and a 1024-element zero vector.
    """
    description = f"desc {name}"
    content = f"content {name}"
    return AgentSkill(
        id=f"{owner}_{name}",
        owner_id=owner,
        owner_type="agent",
        name=name,
        # Each text column and its ``*_tokens`` twin carry the same string.
        description=description,
        description_tokens=description,
        content=content,
        content_tokens=content,
        confidence=0.9,
        maturity_score=0.7,
        source_case_ids=[f"{owner}_ac_1"],
        md_path=f"agents/{owner}/skills/{name}/SKILL.md",
        content_sha256="abc",
        vector=[0.0] * 1024,
    )
@pytest.fixture
async def client(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> AsyncIterator[AsyncClient]:
    """Yield an ``AsyncClient`` bound to a lifespan-free app on a tmp memory root.

    Setup points ``EVEROS_MEMORY__ROOT`` at ``tmp_path``, drops the cached
    settings and resets the module-level singletons used by the get path.
    Teardown disposes the LanceDB connection and drops the settings cache
    again so the next test re-reads its own environment.
    """
    monkeypatch.setenv("EVEROS_MEMORY__ROOT", str(tmp_path))
    load_settings.cache_clear()

    # Start from a clean slate: no cached connection, tables or manager.
    lancedb_manager._conn = None
    lancedb_manager._tables.clear()
    get_service_mod._manager = None

    transport = ASGITransport(app=create_app(lifespan_providers=[]))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        yield http

    await lancedb_manager.dispose_connection()
    load_settings.cache_clear()
# ── Happy path ──────────────────────────────────────────────────────────
async def test_get_episodes_returns_page_and_total(
    client: AsyncClient,
) -> None:
    """Five seeded rows, ``page_size=2`` → two episodes back and ``total_count=5``."""
    seeded = [_episode(f"ep_{n:03d}", day=n) for n in range(1, 6)]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "page": 1,
        "page_size": 2,
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    # ``request_id`` must be 32 lowercase hex characters.
    request_id = envelope["request_id"]
    assert len(request_id) == 32
    assert all(ch in "0123456789abcdef" for ch in request_id)

    data = envelope["data"]
    assert data["total_count"] == 5
    assert data["count"] == 2
    episodes = data["episodes"]
    assert len(episodes) == 2
    # Default sort is timestamp DESC, so the highest day comes first.
    assert episodes[0]["id"] == "u1_ep_005"
    assert episodes[1]["id"] == "u1_ep_004"
    # Envelope invariant: kinds that were not requested are empty arrays.
    assert data["profiles"] == []
    assert data["agent_cases"] == []
    assert data["agent_skills"] == []
async def test_get_episodes_filtered_by_session_id(
    client: AsyncClient,
) -> None:
    """A scalar ``session_id`` filter returns only the rows of that session."""
    seeded = [
        _episode("ep_001", session="sess_a"),
        _episode("ep_002", session="sess_a"),
        _episode("ep_003", session="sess_b"),
    ]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"session_id": "sess_a"},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    assert envelope["data"]["count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_001", "u1_ep_002"}
async def test_get_empty_returns_zero_counts(client: AsyncClient) -> None:
    """Querying an owner with no rows gives zero counts and an empty list."""
    request_body = {"user_id": "ghost", "memory_type": "episode"}
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    data = response.json()["data"]
    assert data["total_count"] == 0
    assert data["count"] == 0
    assert data["episodes"] == []
async def test_get_profile_miss_returns_empty(client: AsyncClient) -> None:
    """With no profile row seeded, ``profiles`` is empty and ``total_count`` is 0."""
    request_body = {"user_id": "u1", "memory_type": "profile"}
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    data = response.json()["data"]
    assert data["profiles"] == []
    assert data["total_count"] == 0
async def test_get_profile_returns_seeded_row(client: AsyncClient) -> None:
    """A seeded ``user_profile`` row comes back through /get with JSON decoded.

    The row is written straight into the table through ``user_profile_repo``
    and then read back over HTTP. The ``*_json`` string columns must surface
    as decoded lists under ``profile_data``.
    """
    seeded_profile = UserProfile(
        id="u1",
        owner_id="u1",
        owner_type="user",
        app_id="default",
        project_id="default",
        summary="u1 loves climbing in Yosemite",
        explicit_info_json='[{"category": "Hobby", "description": "climbing"}]',
        implicit_traits_json='[{"trait": "Outdoorsy"}]',
        profile_timestamp_ms=1780304400000,
        md_path="users/u1/user.md",
        content_sha256="abc",
    )
    await user_profile_repo.add([seeded_profile])

    response = await client.post(
        "/api/v1/memory/get",
        json={"user_id": "u1", "memory_type": "profile"},
    )
    assert response.status_code == 200

    data = response.json()["data"]
    assert data["total_count"] == 1
    assert data["count"] == 1
    assert len(data["profiles"]) == 1

    profile = data["profiles"][0]
    assert profile["id"] == "u1"
    assert profile["user_id"] == "u1"
    profile_data = profile["profile_data"]
    assert profile_data["summary"] == "u1 loves climbing in Yosemite"
    assert profile_data["explicit_info"] == [
        {"category": "Hobby", "description": "climbing"}
    ]
    assert profile_data["implicit_traits"] == [{"trait": "Outdoorsy"}]
# ── Pagination + sort ───────────────────────────────────────────────────
async def test_get_episodes_page_two_returns_correct_slice(
    client: AsyncClient,
) -> None:
    """Five rows, ``page_size=2``, ``page=2`` → the third and fourth newest rows."""
    seeded = [_episode(f"ep_{n:03d}", day=n) for n in range(1, 6)]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "page": 2,
        "page_size": 2,
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    data = response.json()["data"]
    assert data["total_count"] == 5
    assert data["count"] == 2
    # Default sort is timestamp DESC (day 5, 4, 3, 2, 1); page 2 with two
    # rows per page covers offsets 2 and 3, i.e. day 3 then day 2.
    page_ids = [item["id"] for item in data["episodes"]]
    assert page_ids == ["u1_ep_003", "u1_ep_002"]
async def test_get_episodes_sort_order_asc(client: AsyncClient) -> None:
    """``sort_order="asc"`` returns the oldest episode first."""
    seeded = [_episode(f"ep_{n:03d}", day=n) for n in range(1, 4)]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "sort_order": "asc",
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    ordered_ids = [item["id"] for item in response.json()["data"]["episodes"]]
    assert ordered_ids == ["u1_ep_001", "u1_ep_002", "u1_ep_003"]
# ── Agent-side kinds ────────────────────────────────────────────────────
async def test_get_agent_cases_happy_path(client: AsyncClient) -> None:
    """Listing ``agent_case`` fills only ``agent_cases`` with correctly shaped items."""
    seeded = [_agent_case(f"ac_{n:03d}", day=n) for n in range(1, 4)]
    await agent_case_repo.add(seeded)

    request_body = {"agent_id": "a1", "memory_type": "agent_case"}
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    data = response.json()["data"]
    assert data["total_count"] == 3
    assert data["count"] == 3
    # Newest first: the rows were seeded on days 1..3.
    case_ids = [item["id"] for item in data["agent_cases"]]
    assert case_ids == [
        "a1_ac_003",
        "a1_ac_002",
        "a1_ac_001",
    ]
    # The other kinds in the envelope stay empty.
    assert data["episodes"] == []
    assert data["agent_skills"] == []
    # Item shape: no ``score`` key on /get items, ``quality_score`` round-trips.
    newest = data["agent_cases"][0]
    assert "score" not in newest
    assert newest["quality_score"] == 0.8
    assert newest["agent_id"] == "a1"
async def test_get_agent_cases_filtered_by_session(client: AsyncClient) -> None:
    """A ``session_id`` filter limits ``agent_case`` rows to that session."""
    seeded = [
        _agent_case("ac_001", session="sess_x"),
        _agent_case("ac_002", session="sess_x"),
        _agent_case("ac_003", session="sess_y"),
    ]
    await agent_case_repo.add(seeded)

    request_body = {
        "agent_id": "a1",
        "memory_type": "agent_case",
        "filters": {"session_id": "sess_x"},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["agent_cases"]}
    assert returned_ids == {"a1_ac_001", "a1_ac_002"}
async def test_get_agent_skills_happy_path(client: AsyncClient) -> None:
    """Listing ``agent_skill`` with no explicit sort returns both seeded skills."""
    seeded = [_agent_skill(skill_name) for skill_name in ("planner", "summariser")]
    await agent_skill_repo.add(seeded)

    request_body = {"agent_id": "a1", "memory_type": "agent_skill"}
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    data = response.json()["data"]
    assert data["total_count"] == 2
    # Order is not asserted, only membership.
    returned_names = {item["name"] for item in data["agent_skills"]}
    assert returned_names == {"planner", "summariser"}
async def test_get_agent_skills_sort_by_timestamp_silently_downgraded(
    client: AsyncClient,
) -> None:
    """An explicit ``sort_by=timestamp`` on ``agent_skill`` still answers 200.

    The seeded ``AgentSkill`` row is built without a ``timestamp`` field; the
    request must succeed and report the single row rather than erroring.
    """
    await agent_skill_repo.add([_agent_skill("planner")])

    request_body = {
        "agent_id": "a1",
        "memory_type": "agent_skill",
        "sort_by": "timestamp",
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200
    assert response.json()["data"]["total_count"] == 1
# ── Filter coverage end-to-end ──────────────────────────────────────────
async def test_get_episodes_filtered_by_ne_session(client: AsyncClient) -> None:
    """The ``ne`` operator on ``session_id`` drops the matching rows."""
    seeded = [
        _episode("ep_001", session="sess_a"),
        _episode("ep_002", session="sess_internal"),
        _episode("ep_003", session="sess_b"),
    ]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"session_id": {"ne": "sess_internal"}},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_001", "u1_ep_003"}
async def test_get_episodes_filtered_by_iso_timestamp(
    client: AsyncClient,
) -> None:
    """An ISO 8601 string is accepted as the ``timestamp`` filter literal."""
    seeded = [
        _episode("ep_001", day=1),  # 2026-01-01
        _episode("ep_002", day=5),  # 2026-01-05
        _episode("ep_003", day=9),  # 2026-01-09
    ]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"timestamp": {"gte": "2026-01-04T00:00:00+00:00"}},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    # Only the rows on or after Jan 4 survive the ``gte`` bound.
    returned_ids = {item["id"] for item in response.json()["data"]["episodes"]}
    assert returned_ids == {"u1_ep_002", "u1_ep_003"}
async def test_get_episodes_filtered_by_parent_id(client: AsyncClient) -> None:
    """A ``parent_id`` filter returns every episode derived from that memcell."""
    seeded = [
        _episode("ep_001", parent_id="mc_target"),
        _episode("ep_002", parent_id="mc_target"),
        _episode("ep_003", parent_id="mc_other"),
    ]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"parent_id": "mc_target"},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_001", "u1_ep_002"}
async def test_get_episodes_filtered_by_sender_id_in(
    client: AsyncClient,
) -> None:
    """``sender_id: {"in": [...]}`` matches rows whose ``sender_ids`` hold any listed id."""
    seeded = [
        _episode("ep_001", sender_ids=["alice", "assistant"]),
        _episode("ep_002", sender_ids=["bob", "assistant"]),
        _episode("ep_003", sender_ids=["carol", "assistant"]),
    ]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"sender_id": {"in": ["alice", "bob"]}},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_001", "u1_ep_002"}
async def test_get_episodes_nested_and_inside_or(client: AsyncClient) -> None:
    """Two ``AND`` groups nested inside one ``OR`` select the union of both groups."""
    seeded = [
        _episode("ep_001", session="sess_a", parent_id="mc_target"),
        _episode("ep_002", session="sess_a", parent_id="mc_other"),
        _episode("ep_003", session="sess_b", parent_id="mc_target"),
        _episode("ep_004", session="sess_c", parent_id="mc_other"),
    ]
    await episode_repo.add(seeded)

    # (session=sess_a AND parent_id=mc_target)
    #   OR (parent_id=mc_other AND session=sess_c)
    # → ep_001 + ep_004
    first_group = {
        "AND": [
            {"session_id": "sess_a"},
            {"parent_id": "mc_target"},
        ]
    }
    second_group = {
        "AND": [
            {"parent_id": "mc_other"},
            {"session_id": "sess_c"},
        ]
    }
    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"OR": [first_group, second_group]},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_001", "u1_ep_004"}
# ── Filter combinators (200 — happy path) ──────────────────────────────
# Pure 422 / validation cases moved to
# tests/unit/test_entrypoints/test_api/test_routes/test_get_route_validation.py
async def test_get_top_level_and_or_compiles_and_filters(
    client: AsyncClient,
) -> None:
    """A top-level ``OR`` combinator is accepted and selects either session."""
    seeded = [
        _episode("ep_001", session="sess_a"),
        _episode("ep_002", session="sess_b"),
        _episode("ep_003", session="sess_c"),
    ]
    await episode_repo.add(seeded)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"OR": [{"session_id": "sess_a"}, {"session_id": "sess_b"}]},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_001", "u1_ep_002"}
async def test_get_episodes_filtered_by_timestamp_range(
    client: AsyncClient,
) -> None:
    """``gte`` and ``lt`` on the same ``timestamp`` field act as a half-open window."""
    seeded = [
        _episode("ep_001", day=1),  # 2026-01-01
        _episode("ep_002", day=3),  # 2026-01-03
        _episode("ep_003", day=5),  # 2026-01-05
        _episode("ep_004", day=7),  # 2026-01-07
        _episode("ep_005", day=9),  # 2026-01-09
    ]
    await episode_repo.add(seeded)

    # Window [Jan 3, Jan 7) → ep_002 + ep_003 (Jan 7 excluded by `lt`).
    window = {
        "gte": "2026-01-03T00:00:00+00:00",
        "lt": "2026-01-07T00:00:00+00:00",
    }
    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"timestamp": window},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 2
    returned_ids = {item["id"] for item in envelope["data"]["episodes"]}
    assert returned_ids == {"u1_ep_002", "u1_ep_003"}
async def test_get_episodes_top_level_and_filter(client: AsyncClient) -> None:
    """An explicit top-level ``AND`` keeps only rows matching every clause."""
    seeded = [
        _episode("ep_001", session="sess_a", parent_id="mc_target"),
        _episode("ep_002", session="sess_a", parent_id="mc_other"),
        _episode("ep_003", session="sess_b", parent_id="mc_target"),
    ]
    await episode_repo.add(seeded)

    # session=sess_a AND parent_id=mc_target → ep_001 only
    conjunction = [
        {"session_id": "sess_a"},
        {"parent_id": "mc_target"},
    ]
    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "filters": {"AND": conjunction},
    }
    response = await client.post("/api/v1/memory/get", json=request_body)
    assert response.status_code == 200

    envelope = response.json()
    assert envelope["data"]["total_count"] == 1
    assert envelope["data"]["episodes"][0]["id"] == "u1_ep_001"
# ── max_fetch limit trigger ─────────────────────────────────────────────
async def test_get_truncates_above_max_fetch(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """More matches than ``max_fetch`` → warning logged, ``total_count`` stays exact.

    ``episode_repo.find_where_paginated`` is wrapped so every call runs with
    ``max_fetch=5`` while ten rows are seeded. This reaches the truncation
    branch without seeding a production-sized table. The response must
    still report ``total_count == 10`` and a full page of three, and the
    truncation warning must appear in the captured log text.
    """
    # The ``client`` fixture builds the app with no lifespan providers;
    # configure logging explicitly so the warning is routed to where
    # ``caplog`` can see it.
    from everos.core.observability.logging import configure_logging

    configure_logging(level="WARNING")

    seeded = [_episode(f"ep_{n:03d}", day=n) for n in range(1, 11)]
    await episode_repo.add(seeded)

    real_find = episode_repo.find_where_paginated

    async def capped_find(*args: object, **kwargs: object) -> object:
        # Force the low cap regardless of what the caller asked for.
        kwargs["max_fetch"] = 5
        return await real_find(*args, **kwargs)  # type: ignore[arg-type]

    monkeypatch.setattr(episode_repo, "find_where_paginated", capped_find)

    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
        "page": 1,
        "page_size": 3,
    }
    with caplog.at_level("WARNING"):
        response = await client.post("/api/v1/memory/get", json=request_body)

    assert response.status_code == 200
    envelope = response.json()
    # The true match count is still 10 even though the fetch was capped at 5.
    assert envelope["data"]["total_count"] == 10
    assert envelope["data"]["count"] == 3
    # The truncation warning must be observable through ``caplog``.
    assert "find_where_paginated truncated" in caplog.text
# ── Concurrency ─────────────────────────────────────────────────────────
async def test_get_concurrent_owners_no_cross_contamination(
    client: AsyncClient,
) -> None:
    """Concurrent /get calls for different owners each see only their own rows.

    The ``client`` fixture resets the get-service manager to ``None``, so the
    three requests also race on its first-use initialisation.
    """
    seeded = [
        _episode("ep_001", owner="u1"),
        _episode("ep_002", owner="u1"),
        _episode("ep_001", owner="u2"),
        _episode("ep_001", owner="u3"),
    ]
    await episode_repo.add(seeded)

    async def fetch(owner: str) -> dict[str, object]:
        request_body = {"user_id": owner, "memory_type": "episode"}
        response = await client.post("/api/v1/memory/get", json=request_body)
        assert response.status_code == 200, f"{owner}: {response.text}"
        return response.json()

    u1, u2, u3 = await asyncio.gather(
        fetch("u1"),
        fetch("u2"),
        fetch("u3"),
    )

    assert u1["data"]["total_count"] == 2  # type: ignore[index]
    assert u2["data"]["total_count"] == 1  # type: ignore[index]
    assert u3["data"]["total_count"] == 1  # type: ignore[index]
    assert {item["id"] for item in u1["data"]["episodes"]} == {  # type: ignore[index]
        "u1_ep_001",
        "u1_ep_002",
    }
    assert {item["id"] for item in u2["data"]["episodes"]} == {"u2_ep_001"}  # type: ignore[index]
    assert {item["id"] for item in u3["data"]["episodes"]} == {"u3_ep_001"}  # type: ignore[index]
async def test_get_concurrent_different_memory_types(client: AsyncClient) -> None:
    """Concurrent /get calls for three memory types each fill only their own slot."""
    await episode_repo.add([_episode("ep_001", owner="u1")])
    await agent_case_repo.add([_agent_case("ac_001", owner="a1")])
    await agent_skill_repo.add([_agent_skill("planner", owner="a1")])

    async def fetch(payload: dict[str, object]) -> dict[str, object]:
        response = await client.post("/api/v1/memory/get", json=payload)
        assert response.status_code == 200, response.text
        return response.json()

    episode_request = {"user_id": "u1", "memory_type": "episode"}
    case_request = {"agent_id": "a1", "memory_type": "agent_case"}
    skill_request = {"agent_id": "a1", "memory_type": "agent_skill"}
    ep_body, case_body, skill_body = await asyncio.gather(
        fetch(episode_request),
        fetch(case_request),
        fetch(skill_request),
    )

    # Episode envelope: only ``episodes`` populated.
    assert len(ep_body["data"]["episodes"]) == 1  # type: ignore[index]
    assert ep_body["data"]["agent_cases"] == []  # type: ignore[index]
    assert ep_body["data"]["agent_skills"] == []  # type: ignore[index]
    # Case envelope: only ``agent_cases`` populated.
    assert len(case_body["data"]["agent_cases"]) == 1  # type: ignore[index]
    assert case_body["data"]["episodes"] == []  # type: ignore[index]
    # Skill envelope: only ``agent_skills`` populated.
    assert len(skill_body["data"]["agent_skills"]) == 1  # type: ignore[index]
    assert skill_body["data"]["episodes"] == []  # type: ignore[index]
async def test_get_concurrent_lazy_init_builds_one_manager(
    client: AsyncClient,
) -> None:
    """Eight concurrent first requests all succeed and leave a cached manager.

    The test starts from ``_manager is None`` (reset by the ``client``
    fixture) and checks that the module-level slot is populated afterwards.
    """
    # Precondition: nothing has built the manager yet.
    assert get_service_mod._manager is None

    await episode_repo.add([_episode("ep_001")])
    request_body = {
        "user_id": "u1",
        "memory_type": "episode",
    }
    in_flight = [
        client.post("/api/v1/memory/get", json=request_body) for _ in range(8)
    ]
    responses = await asyncio.gather(*in_flight)

    assert all(response.status_code == 200 for response in responses)
    # After the burst the module-level manager slot is populated.
    assert get_service_mod._manager is not None

View File

@ -0,0 +1,140 @@
"""E2E: multimodal /add parses HTML (base64) and http(s) uri end-to-end.
Scope: full HTTP stack (``create_app()`` + ``AsyncClient``) → ingest →
multimodal parse → unprocessed_buffer. Proves the three paths the unit
tests can only mock:
1. ``type="html"`` + base64 + ``ext="html"`` — the normal HTML-file call.
2. ``type="html"`` + ``https`` uri — everalgo fetches the page and
dispatches by the response Content-Type.
3. ``type="html"`` + ``file://`` uri — EverOS reads the file locally and
hands everalgo hydrated bytes (the library never touches the fs).
Real multimodal LLM (creds via ``.env``) + real public internet, so the
module is marked ``live_llm``. Skipped when the ``[multimodal]`` extra is
absent.
White-box surface: reads the ``text`` column of ``unprocessed_buffer``
(the derived text the ingest stage produced from the parsed content) to
assert the parsed payload actually flowed into the buffer.
"""
from __future__ import annotations
import base64
from pathlib import Path
import httpx
import pytest
from sqlalchemy import text as sql_text
pytest.importorskip("everalgo.parser")
pytestmark = pytest.mark.live_llm
async def _buffer_text(session_id: str) -> str:
    """Return the ``text`` of every buffer row for *session_id*, newline-joined.

    Reads ``unprocessed_buffer`` directly through the SQLite engine. The
    query has no ``ORDER BY``, so callers should only rely on substring
    membership, not on row order.
    """
    from everos.infra.persistence.sqlite import get_engine

    query = sql_text("SELECT text FROM unprocessed_buffer WHERE session_id = :sid")
    async with get_engine().connect() as conn:
        result = await conn.execute(query, {"sid": session_id})
        rows = result.all()
    return "\n".join(str(row[0]) for row in rows)
async def test_add_html_base64_parsed_into_buffer(
    async_client: httpx.AsyncClient,
) -> None:
    """A base64-encoded HTML item sent to /add ends up as text in the buffer."""
    html = (
        b"<html><body><h1>Release</h1>"
        b"<p>Version 9.9.9 ships Dark Mode.</p></body></html>"
    )
    sid = "e2e-mm-html-b64"
    html_item = {
        "type": "html",
        "base64": base64.b64encode(html).decode(),
        "ext": "html",
        "name": "notes.html",
    }
    message = {
        "sender_id": "alice",
        "role": "user",
        "timestamp": 1780304400000,
        "content": [html_item],
    }
    response = await async_client.post(
        "/api/v1/memory/add",
        json={"session_id": sid, "messages": [message]},
    )
    assert response.status_code == 200, response.text

    # The version string only exists inside the HTML payload, so finding it
    # in the buffer shows the parsed content was stored.
    buffered = await _buffer_text(sid)
    assert "9.9.9" in buffered
async def test_add_html_https_uri_parsed_into_buffer(
    async_client: httpx.AsyncClient,
) -> None:
    """An ``https`` uri item sent to /add ends up as page text in the buffer.

    Requires outbound network access to ``https://example.com``.
    """
    sid = "e2e-mm-html-uri"
    message = {
        "sender_id": "alice",
        "role": "user",
        "timestamp": 1780304400000,
        "content": [{"type": "html", "uri": "https://example.com"}],
    }
    response = await async_client.post(
        "/api/v1/memory/add",
        json={"session_id": sid, "messages": [message]},
    )
    assert response.status_code == 200, response.text

    # Case-insensitive match on the page's well-known heading text.
    buffered = await _buffer_text(sid)
    assert "example domain" in buffered.lower()
async def test_add_html_file_uri_parsed_into_buffer(
    async_client: httpx.AsyncClient,
    tmp_path: Path,
) -> None:
    """A file:// html asset is read locally (hydrated) + parsed into buffer.

    Exercises EverOS-side file:// support: the parser receives bytes, never
    the path. Default allowlist is empty (local-first) so the temp file reads.

    The URI is produced by ``Path.as_uri()`` rather than by prefixing
    ``file://`` to the path string, so it stays a well-formed RFC 8089 URI on
    Windows (drive letters, backslashes) and for paths containing characters
    that need percent-encoding.
    """
    doc = tmp_path / "release.html"
    # Explicit encoding keeps the fixture independent of the host locale.
    doc.write_text(
        "<html><body><p>Version 9.9.9 ships Dark Mode.</p></body></html>",
        encoding="utf-8",
    )
    sid = "e2e-mm-html-file"
    resp = await async_client.post(
        "/api/v1/memory/add",
        json={
            "session_id": sid,
            "messages": [
                {
                    "sender_id": "alice",
                    "role": "user",
                    "timestamp": 1780304400000,
                    # ``tmp_path`` is absolute, which ``as_uri()`` requires.
                    "content": [{"type": "html", "uri": doc.as_uri()}],
                }
            ],
        },
    )
    assert resp.status_code == 200, resp.text
    buffered = await _buffer_text(sid)
    assert "9.9.9" in buffered

View File

@ -0,0 +1,87 @@
"""Belt-and-braces gate: dev-mode ``GET /openapi.json`` ≡ ``docs/openapi.json``.
The lint-time ``make check-openapi`` already diffs ``app.openapi()``
against the committed ``docs/openapi.json``. This e2e test closes the
remaining theoretical gap: if anyone ever adds a *lifespan-mutated*
OpenAPI schema (e.g. ``app.openapi_schema = ...`` inside a startup
handler), the in-memory ``app.openapi()`` and the runtime
``GET /openapi.json`` response would diverge — the lint gate would
miss it, but this test wouldn't.
How:
1. Force ``ENV=DEV`` so the ``openapi_url`` route is enabled.
2. Construct the app via ``create_app(lifespan_providers=[])`` to skip
SQLite / LanceDB / OME (the schema is route-driven, not state-
driven) — but *do* run the lifespan context, so any startup hook
that mutates ``app.openapi_schema`` is exercised.
3. ``GET /openapi.json`` through ``httpx.AsyncClient``.
4. Compare against ``docs/openapi.json`` as parsed JSON (both sides are
decoded first, so key ordering and whitespace never count as drift).
"""
from __future__ import annotations
import json
import os
from pathlib import Path
import httpx
import pytest
# ``parents[2]`` is two directories above the one that contains this file;
# the name indicates it is expected to be the repository root.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Committed OpenAPI snapshot that the runtime schema is compared against.
_COMMITTED_OPENAPI = _REPO_ROOT / "docs" / "openapi.json"
async def test_dev_mode_openapi_endpoint_matches_committed_docs(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Runtime ``GET /openapi.json`` (dev mode) must equal ``docs/openapi.json``.

    Both documents are compared as parsed JSON, so key order and whitespace
    never count as drift. On mismatch the test raises ``AssertionError``
    carrying the first 120 lines of a unified diff (committed -> runtime).
    The diff is rendered with sorted keys so it shows only the semantic
    drift, not key-order noise that the equality check already ignores.
    """
    # The gate's own committed snapshot must exist — otherwise the dev
    # workflow ``make openapi`` has been skipped.
    assert _COMMITTED_OPENAPI.is_file(), (
        f"{_COMMITTED_OPENAPI} not found — run `make openapi`"
    )

    # Force dev-mode so ``openapi_url="/openapi.json"`` is registered.
    monkeypatch.setenv("ENV", "DEV")

    # Import deferred until after the env override (presumably so the app
    # factory observes ENV=DEV — confirm against the settings loader).
    from everos.entrypoints.api.app import create_app

    app = create_app(lifespan_providers=[])
    transport = httpx.ASGITransport(app=app)
    # Run the lifespan context explicitly so any startup hook that mutates
    # the schema executes before the request is served.
    async with (
        app.router.lifespan_context(app),
        httpx.AsyncClient(transport=transport, base_url="http://test") as client,
    ):
        resp = await client.get("/openapi.json")

    assert resp.status_code == 200, resp.text
    runtime_schema = resp.json()
    committed_schema = json.loads(_COMMITTED_OPENAPI.read_text(encoding="utf-8"))
    if runtime_schema == committed_schema:
        return

    # Emit a concise diff to help locate the drift cause.
    import difflib

    # ``sort_keys=True`` canonicalises both renderings; without it a pure
    # key-order difference could fill the 120-line cap and hide real drift.
    committed_lines = json.dumps(
        committed_schema, indent=2, ensure_ascii=False, sort_keys=True
    ).splitlines()
    runtime_lines = json.dumps(
        runtime_schema, indent=2, ensure_ascii=False, sort_keys=True
    ).splitlines()
    diff_lines = list(
        difflib.unified_diff(
            committed_lines,
            runtime_lines,
            fromfile="docs/openapi.json (committed)",
            tofile="GET /openapi.json (runtime)",
            lineterm="",
        )
    )[:120]
    raise AssertionError(
        "runtime /openapi.json drifts from docs/openapi.json; "
        "run `make openapi` and commit the result.\n\n" + "\n".join(diff_lines)
    )
# Keep ``os`` legit in case future scenarios need direct env reads.
# Binding it to ``_`` references the otherwise-unused import (presumably to
# keep unused-import linting quiet — confirm against the ruff config).
_ = os

File diff suppressed because it is too large Load Diff