Files
EverOS/tests/e2e/test_add_flush_agent_pipeline_e2e.py
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

207 lines
8.6 KiB
Python

"""Agent pipeline e2e: 5 SWE-bench trajectories drive /add + /flush.
Drives the full HTTP route through to storage, exercising the agent-track
pipeline (boundary → memcell → extract_agent_case → trigger_skill_clustering
→ extract_agent_skill) with real LLM and real embedder credentials.
Mixed tenancy by design (sender_id alignment from fixture):
agent_pytest (1 session, pytest-dev/pytest-7236) ┐ independent
agent_sympy (1 session, sympy/sympy-18763) ┘ owners
agent_django (3 sessions, django/django-{14311,16255,16263}) shared
Concurrency strategy (workaround for the known
``trigger_skill_clustering`` read-modify-write race on a shared owner_id):
Phase 1: pytest + sympy concurrent via asyncio.gather (disjoint owners)
Phase 2: 3 django sessions sequential (same owner, would race)
Once the cluster race is fixed in production, Phase 2 can collapse into
the same gather and the test will still pass — the assertions are
race-free, only the driver is conservative.
White-box assertions (audit trail of internal surfaces touched):
- sqlite ``memcell`` rows per session_id
- filesystem ``<root>/agents/<agent>/.cases/*.md`` presence
- LanceDB ``agent_case`` rows by ``owner_id`` (count + session_id set)
- LanceDB ``agent_skill`` rows by ``owner_id`` (soft — LLM-dependent)
"""
from __future__ import annotations
import asyncio
import json
from collections.abc import Awaitable, Callable
from pathlib import Path
import httpx
import pytest
from everos.infra.persistence.lancedb import agent_case_repo, agent_skill_repo
from everos.infra.persistence.markdown import AgentCaseDailyFrontmatter
# Fixtures live in ``fixtures/agent_trajectories`` under the parent of this
# module's directory.
_FIXTURE_DIR = Path(__file__).resolve().parents[1].joinpath(
    "fixtures", "agent_trajectories"
)

# Session ids of the hand-picked trajectories. The in-tree fixture files are
# the source of truth for this selection; the converter that originally
# produced them is not part of the repo.
_PYTEST_SESSION = "session_pytest_7236"
_SYMPY_SESSION = "session_sympy_18763"
_DJANGO_SESSIONS = (
    "session_django_14311",
    "session_django_16255",
    "session_django_16263",
)

# Owner ids the assertions query by.
_AGENT_PYTEST = "agent_pytest"
_AGENT_SYMPY = "agent_sympy"
_AGENT_DJANGO = "agent_django"

# Drain budget for phase 3. The OME chain (case → cluster → skill) writes md
# in stages and cascade picks up each stage, so one drain can report "done"
# between stages. Several rounds separated by a short sleep let the whole
# chain settle and avoid a false-positive completion.
_DRAIN_ROUNDS = 4
_DRAIN_TIMEOUT_SECONDS = 300.0
_DRAIN_INTER_ROUND_SLEEP_SECONDS = 5.0
def _load_fixture(session_id: str, fixture_dir: Path | None = None) -> dict:
    """Load one trajectory fixture and return its parsed JSON content.

    Args:
        session_id: Stem of the fixture file; ``<session_id>.json`` is read.
        fixture_dir: Directory containing the fixture files. When omitted,
            the module-level ``_FIXTURE_DIR`` is used.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If the fixture file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    base_dir = _FIXTURE_DIR if fixture_dir is None else fixture_dir
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    raw = (base_dir / f"{session_id}.json").read_text(encoding="utf-8")
    return json.loads(raw)
async def _drive_session(
    client: httpx.AsyncClient, session_data: dict
) -> tuple[str, str]:
    """Run /add followed by /flush for one trajectory.

    Args:
        client: Async HTTP client used to POST to the memory endpoints.
        session_data: Parsed fixture; must contain ``everos_session_id``
            and ``messages``.

    Returns:
        ``(session_id, status)`` where ``status`` is ``data.status`` from
        the /flush response body.

    Raises:
        AssertionError: If either endpoint answers with a non-200 status.
    """
    sid = session_data["everos_session_id"]
    msgs = session_data["messages"]

    # MessageItemDTO.max_length=500; our largest fixture has 324 messages.
    r = await client.post(
        "/api/v1/memory/add",
        json={"session_id": sid, "messages": msgs},
        timeout=600.0,
    )
    # Separate the status code from the (truncated) body so the failure
    # message stays readable.
    assert r.status_code == 200, (
        f"{sid}: /add returned {r.status_code}: {r.text[:300]}"
    )

    r = await client.post(
        "/api/v1/memory/flush",
        json={"session_id": sid},
        timeout=600.0,
    )
    assert r.status_code == 200, (
        f"{sid}: /flush returned {r.status_code}: {r.text[:300]}"
    )
    return sid, r.json()["data"]["status"]
@pytest.mark.slow
@pytest.mark.live_llm
async def test_agent_pipeline_e2e_mixed_tenancy(
    async_client: httpx.AsyncClient,
    core_pipeline_runtime: Path,
    pipeline_done_poll: Callable[..., Awaitable[None]],
    memcell_count: Callable[..., Awaitable[int]],
) -> None:
    """Drive 5 SWE-bench trajectories and check agent_case / agent_skill output.

    Three agents are involved: two own a single session each, the third
    owns three sessions. After /add + /flush and a drain, the test checks
    sqlite memcells, on-disk case markdown, LanceDB rows and md ↔ LanceDB
    parity.
    """
    memory_root = core_pipeline_runtime

    # All fixtures are loaded before the first HTTP call is made.
    pytest_fx = _load_fixture(_PYTEST_SESSION)
    sympy_fx = _load_fixture(_SYMPY_SESSION)
    django_fxs = [_load_fixture(session) for session in _DJANGO_SESSIONS]

    # ── Phase 1: disjoint owners, safe to run concurrently ────────────────
    await asyncio.gather(
        _drive_session(async_client, pytest_fx),
        _drive_session(async_client, sympy_fx),
    )

    # ── Phase 2: one shared owner_id, one session at a time (cluster race) ─
    for django_fx in django_fxs:
        await _drive_session(async_client, django_fx)

    # ── Phase 3: let the OME chain + cascade settle ───────────────────────
    for _round in range(_DRAIN_ROUNDS):
        await pipeline_done_poll(deadline_seconds=_DRAIN_TIMEOUT_SECONDS)
        await asyncio.sleep(_DRAIN_INTER_ROUND_SLEEP_SECONDS)

    # ── Phase 4: assertions ───────────────────────────────────────────────
    # 4.1 at least one memcell per session
    for session_id in (_PYTEST_SESSION, _SYMPY_SESSION, *_DJANGO_SESSIONS):
        cell_total = await memcell_count(session_id)
        assert cell_total >= 1, (
            f"no memcell for session {session_id!r} (got {cell_total})"
        )

    # 4.2 every agent owns a case directory holding at least one .md file
    agents_root = memory_root / "default_app" / "default_project" / "agents"
    cases_subdir = AgentCaseDailyFrontmatter.DIR_NAME
    for agent_id in (_AGENT_PYTEST, _AGENT_SYMPY, _AGENT_DJANGO):
        case_dir = agents_root / agent_id / cases_subdir
        assert case_dir.is_dir(), f"missing {case_dir!s} for agent={agent_id!r}"
        assert any(case_dir.glob("*.md")), f"no agent_case md under {case_dir!s}"

    # 4.3 LanceDB agent_case rows per owner (all three queries run first,
    # in pytest → sympy → django order, before any count is asserted)
    case_rows = {
        agent_id: await agent_case_repo.find_where(f"owner_id = '{agent_id}'")
        for agent_id in (_AGENT_PYTEST, _AGENT_SYMPY, _AGENT_DJANGO)
    }
    pytest_rows = case_rows[_AGENT_PYTEST]
    sympy_rows = case_rows[_AGENT_SYMPY]
    django_rows = case_rows[_AGENT_DJANGO]

    for label, rows in (("agent_pytest", pytest_rows), ("agent_sympy", sympy_rows)):
        assert len(rows) >= 1, f"no {label} rows in LanceDB (got {len(rows)})"
    # Three django sessions, each writing at least one cell, means at least
    # three cases; the LLM may emit more, so this is only a lower bound.
    assert len(django_rows) >= 3, (
        f"agent_django expected ≥3 LanceDB cases (3 sessions), got {len(django_rows)}"
    )

    # 4.4 cross-owner isolation: an agent's cases reference only the
    # sessions that agent drove
    for label, rows, own_session in (
        ("agent_pytest", pytest_rows, _PYTEST_SESSION),
        ("agent_sympy", sympy_rows, _SYMPY_SESSION),
    ):
        seen_sessions = {row.session_id for row in rows}
        assert seen_sessions == {own_session}, (
            f"{label} cases leaked across sessions: {seen_sessions}"
        )
    django_seen = {row.session_id for row in django_rows}
    assert django_seen == set(_DJANGO_SESSIONS), (
        f"agent_django session set mismatch — got {django_seen}, "
        f"want {set(_DJANGO_SESSIONS)}"
    )

    # 4.5 agent_skill is soft. Whether a skill is emitted depends on the
    # LLM clustering quality gate (skip_quality_threshold + cluster size).
    # pytest/sympy are single-case clusters and may legitimately yield 0
    # skills; django has 3 cases and should form ≥1 cluster of size ≥2 and
    # thus ≥1 skill. That stays informational rather than a hard floor, to
    # avoid a flaky CI signal.
    skill_rows = [
        await agent_skill_repo.find_where(f"owner_id = '{agent_id}'")
        for agent_id in (_AGENT_PYTEST, _AGENT_SYMPY, _AGENT_DJANGO)
    ]
    # Sanity only: the queries completed and returned sized results.
    for rows in skill_rows:
        assert len(rows) >= 0

    # 4.6 strict md ↔ LanceDB parity across every cascade kind
    #
    # The per-owner counts above are loose because they depend on what the
    # LLM emits. This check enforces exact id-set + content_sha256 parity
    # for every md the agent pipeline wrote.
    #
    # ``expect_at_least`` pins agent_case (each session writes ≥1 case), so
    # an empty glob fails loudly. agent_skill is deliberately not pinned: an
    # empty agent_skill md set is still a passing run (see 4.5).
    from tests._consistency_assertions import assert_md_lance_strict_consistent

    await assert_md_lance_strict_consistent(
        memory_root,
        expect_at_least={"agent_case": 1},
    )