Files
EverOS/tests/integration/test_memorize_window_segmentation.py
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

434 lines
15 KiB
Python

"""Window-segmentation white-box integration tests for boundary stage.
Verifies the **read-merge-boundary-write** semantics of one ``memorize()``
invocation, especially the buffer-as-tail invariant and the **buffer
replacement** behaviour on successive calls:
Invariants under test
---------------------
I1. After one ``add`` with ``boundaries=[k]``:
- memcell rows: prefix of merged input (first k messages)
- buffer rows: tail (the remaining messages)
- every input message_id lands in exactly one of {memcell, buffer}
(covered ∧ disjoint)
I2. Tail ordering: every buffer row's timestamp ≥ every memcell row's
timestamp (the tail is the **last** part of the time-ordered slice).
I3. Successive ``add`` consumes prior buffer:
- Round 2's boundary sees ``prior_buffer + new_batch`` merged.
- The prior tail (m3 say) ends up in **Round 2's memcell** if the
boundary cuts past it, NOT in any buffer row.
- The new buffer is the **fresh** tail, with the old buffer rows
replaced entirely (semantics of ``_replace_buffer``).
I4. ``flush`` with ``is_final=True`` drains the buffer entirely — every
remaining message ends up in some memcell.
This is **single-threaded sequential** (the concurrent race is covered
separately in test_memorize_concurrent_session_lock.py). FakeLLM scripts
boundary decisions deterministically so we own exact slicing.
"""
from __future__ import annotations
import importlib
import json
from collections.abc import AsyncIterator, Callable
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock
import pytest
import pytest_asyncio
from everalgo.llm.types import ChatMessage as LLMChatMessage
from everalgo.llm.types import ChatResponse
from everalgo.testing.fake_llm import FakeLLMClient
from sqlalchemy import text
from sqlmodel import SQLModel
from everos.core.persistence import MemoryRoot
from everos.service.memorize import memorize
# ---------------------------------------------------------------------------
# FakeLLM with scripted boundary responses (FIFO queue, one pop per call)
# ---------------------------------------------------------------------------
def _boundary_response(boundaries: list[int]) -> str:
return json.dumps(
{"reasoning": "test", "boundaries": boundaries, "should_wait": False}
)
def _episode_response(title: str = "T", content: str = "B") -> str:
return json.dumps({"title": title, "content": content})
def _make_scripted_llm(
    boundary_responses: list[list[int]],
) -> FakeLLMClient:
    """Build a ``FakeLLMClient`` whose boundary answers follow a script.

    A call whose first message mentions ``boundaries`` or ``memcell``
    (case-insensitive) consumes the next entry of ``boundary_responses`` in
    FIFO order; once the script is exhausted such calls get an empty cut
    list. Every other call receives the canned episode response.
    """
    # Iterate over a private copy so caller-side mutation of the argument
    # after construction cannot alter the script.
    pending = iter(list(boundary_responses))
    boundary_markers = ("boundaries", "memcell")

    def handler(messages: list[LLMChatMessage], **_: Any) -> ChatResponse:
        lowered = messages[0].content.lower()
        if not any(marker in lowered for marker in boundary_markers):
            return ChatResponse(content=_episode_response(), model="fake")
        cuts = next(pending, [])
        return ChatResponse(content=_boundary_response(cuts), model="fake")

    return FakeLLMClient(handler=handler)
# ---------------------------------------------------------------------------
# Fixture — mirrors the locked-env fixture in the concurrent test
# ---------------------------------------------------------------------------
@pytest_asyncio.fixture
async def memorize_env_scripted(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> AsyncIterator[Callable[..., AsyncMock]]:
    """Yield an async ``_setup(fake_llm=...)`` that boots an isolated env.

    Before yielding, the fixture:

    - points ``MemoryRoot.default`` at ``tmp_path`` and creates
      ``tmp_path/.index/sqlite``;
    - resets the cached module-level attributes of the memorize service,
      the LLM client module and both extractor strategy modules to ``None``;
    - calls ``_reset_for_tests()`` on the session-lock module.

    The test then awaits ``_setup(fake_llm=...)``, which sets the
    ``EVEROS_*`` environment variables, clears the ``load_settings`` cache,
    installs ``fake_llm`` as the LLM client, creates the SQLModel tables,
    replaces the atomic-fact / foresight extractors with stubs whose
    ``aextract`` returns ``[]``, and starts the service engine.

    On teardown the engine (if ``_setup`` started one) is stopped and the
    persistence engine is disposed.

    NOTE(review): ``_setup`` is a coroutine function returning ``None``; the
    ``AsyncMock`` in the return annotation looks inaccurate. It is kept
    unchanged because the tests in this module annotate the fixture
    parameter the same way.
    """
    monkeypatch.setattr(
        MemoryRoot, "default", classmethod(lambda cls: MemoryRoot(root=tmp_path))
    )
    (tmp_path / ".index" / "sqlite").mkdir(parents=True, exist_ok=True)

    svc = importlib.import_module("everos.service.memorize")
    af_mod = importlib.import_module("everos.memory.strategies.extract_atomic_facts")
    fs_mod = importlib.import_module("everos.memory.strategies.extract_foresight")
    client_mod = importlib.import_module("everos.component.llm.client")
    lock_mod = importlib.import_module("everos.service._session_lock")

    # Clear cached module state so nothing built by an earlier test is reused.
    for attr in (
        "_episode_writer",
        "_prompt_loader",
        "_user_pipeline",
        "_agent_pipeline",
        "_ome_engine",
    ):
        monkeypatch.setattr(svc, attr, None, raising=False)
    monkeypatch.setattr(client_mod, "_llm_client", None, raising=False)
    monkeypatch.setattr(af_mod, "_writer", None, raising=False)
    monkeypatch.setattr(fs_mod, "_writer", None, raising=False)
    lock_mod._reset_for_tests()

    # Mutable holder so teardown can see whether ``_setup`` started an engine.
    started: dict[str, Any] = {"engine": None}

    async def _setup(*, fake_llm: FakeLLMClient) -> None:
        monkeypatch.setenv("EVEROS_MEMORIZE__MODE", "chat")
        monkeypatch.setenv("EVEROS_LLM__API_KEY", "fake-key")
        monkeypatch.setenv("EVEROS_LLM__BASE_URL", "https://fake.example.com")

        from everos.config import load_settings

        # Drop cached settings so the env vars set above take effect.
        load_settings.cache_clear()
        monkeypatch.setattr(client_mod, "_llm_client", fake_llm)

        from everos.infra.persistence.sqlite import get_engine

        db_engine = get_engine()
        async with db_engine.begin() as conn:
            await conn.run_sync(SQLModel.metadata.create_all)

        # Silence OME strategies — orthogonal to boundary segmentation.
        mock_af = AsyncMock(return_value=[])
        mock_fs = AsyncMock(return_value=[])
        monkeypatch.setattr(
            af_mod,
            "AtomicFactExtractor",
            lambda *a, **k: type("M", (), {"aextract": mock_af})(),
        )
        monkeypatch.setattr(
            fs_mod,
            "ForesightExtractor",
            lambda *a, **k: type("M", (), {"aextract": mock_fs})(),
        )

        engine = svc._get_engine()
        await engine.start()
        started["engine"] = engine

    yield _setup

    from everos.infra.persistence.sqlite import dispose_engine

    try:
        if started["engine"] is not None:
            await started["engine"].stop()
    finally:
        # Dispose even when ``stop()`` raises, so a failing shutdown cannot
        # skip the dispose step; the original error still propagates.
        await dispose_engine()
# ---------------------------------------------------------------------------
# Helpers — message factory + state inspectors
# ---------------------------------------------------------------------------
_BASE_TS = 1_700_000_000_000 # 2023-11-14, plenty of headroom
def _msg(idx: int, sender: str = "alice") -> dict[str, Any]:
"""Build one canonical /add message with monotonically increasing ts."""
return {
"sender_id": sender,
"role": "user",
"timestamp": _BASE_TS + idx * 1000,
"content": f"msg-{idx}",
}
async def _buffer_rows(session_id: str) -> list[tuple[str, int]]:
    """Fetch the session's buffer rows as ``(message_id, timestamp_ms)`` pairs.

    Rows come back ordered by the ``timestamp`` column of
    ``unprocessed_buffer``; each stored timestamp is converted to epoch
    milliseconds.
    """
    from everos.component.utils.datetime import from_iso_format, to_timestamp_ms
    from everos.infra.persistence.sqlite import get_engine

    query = text(
        "SELECT message_id, timestamp FROM unprocessed_buffer "
        "WHERE session_id = :s ORDER BY timestamp"
    )
    async with get_engine().connect() as conn:
        fetched = (await conn.execute(query, {"s": session_id})).fetchall()
        # sqlite stores DateTime as ISO 8601 string via SQLAlchemy.
        return [
            (message_id, to_timestamp_ms(from_iso_format(raw_ts)))
            for message_id, raw_ts in fetched
        ]
async def _memcell_rows(session_id: str) -> list[tuple[str, list[str]]]:
    """Fetch the session's memcells as ``(memcell_id, message_ids)`` pairs.

    Rows are ordered by ``created_at``; the ``message_ids_json`` column is
    decoded from JSON into a Python list.
    """
    from everos.infra.persistence.sqlite import get_engine

    query = text(
        "SELECT memcell_id, message_ids_json FROM memcell "
        "WHERE session_id = :s ORDER BY created_at"
    )
    async with get_engine().connect() as conn:
        fetched = (await conn.execute(query, {"s": session_id})).fetchall()
        decoded: list[tuple[str, list[str]]] = []
        for memcell_id, ids_json in fetched:
            decoded.append((memcell_id, json.loads(ids_json)))
        return decoded
# ---------------------------------------------------------------------------
# I1 + I2: single add with boundaries=[k] — prefix→memcell, suffix→buffer
# ---------------------------------------------------------------------------
async def test_single_add_no_cut_accumulates_full_batch_in_buffer(
    memorize_env_scripted: Callable[..., AsyncMock],
) -> None:
    """An empty boundary list carves no memcell; the whole batch is buffered."""
    scripted_llm = _make_scripted_llm([[]])
    await memorize_env_scripted(fake_llm=scripted_llm)

    session = "s_no_cut"
    batch = list(map(_msg, range(3)))
    await memorize({"session_id": session, "messages": batch})

    cells = await _memcell_rows(session)
    buffer = await _buffer_rows(session)

    assert cells == [], f"expected no memcell, got {cells}"
    assert len(buffer) == 3, f"expected 3 buffer rows, got {len(buffer)}"

    # The buffered rows must be in non-decreasing timestamp order.
    timestamps = [row[1] for row in buffer]
    assert timestamps == sorted(timestamps)
async def test_single_add_with_cut_splits_prefix_to_cell_suffix_to_buffer(
    memorize_env_scripted: Callable[..., AsyncMock],
) -> None:
    """A cut at 2 on a 3-message batch gives cell=[m0,m1] and buffer=[m2]."""
    scripted_llm = _make_scripted_llm([[2]])
    await memorize_env_scripted(fake_llm=scripted_llm)

    session = "s_cut"
    batch = list(map(_msg, range(3)))
    await memorize({"session_id": session, "messages": batch})

    cells = await _memcell_rows(session)
    buffer = await _buffer_rows(session)

    # One memcell holding two messages was carved off the front.
    assert len(cells) == 1, cells
    _, carved_ids = cells[0]
    cell_msg_ids = set(carved_ids)
    assert len(cell_msg_ids) == 2

    # The single leftover message is the only buffer row.
    assert len(buffer) == 1
    buf_msg_id, buf_ts = buffer[0]

    # Disjointness: the buffered message was not also written to the cell.
    assert buf_msg_id not in cell_msg_ids, (
        "buffer row leaked into memcell — buffer should be the tail only"
    )

    # I2 — the tail is no earlier than the latest message of the prefix.
    cell_max_ts = max(_BASE_TS + offset for offset in (0, 1000))
    assert buf_ts >= cell_max_ts, (
        f"tail ts ({buf_ts}) must be >= max cell ts ({cell_max_ts})"
    )
# ---------------------------------------------------------------------------
# I3: successive add — prior buffer feeds into next memcell, then is REPLACED
# ---------------------------------------------------------------------------
async def test_second_add_consumes_prior_buffer_and_replaces_tail(
    memorize_env_scripted: Callable[..., AsyncMock],
) -> None:
    """The tail left by round 1 must move into round 2's memcell.

    Script:
      - round 1 cuts after 2 of 3 → cell=[m0,m1], buffer=[m2]
      - round 2 sees merged [m2,m3,m4,m5] and cuts after 3 →
        cell=[m2,m3,m4], buffer=[m5]
    """
    scripted_llm = _make_scripted_llm([[2], [3]])
    await memorize_env_scripted(fake_llm=scripted_llm)
    session = "s_replace"

    # ---- Round 1: m0..m2 ----
    first_batch = list(map(_msg, range(3)))
    await memorize({"session_id": session, "messages": first_batch})

    r1_cells = await _memcell_rows(session)
    r1_buffer = await _buffer_rows(session)
    assert len(r1_cells) == 1
    assert len(r1_buffer) == 1
    prior_tail_msg_id, _ = r1_buffer[0]

    # ---- Round 2: fresh messages m3..m5 ----
    second_batch = list(map(_msg, range(3, 6)))
    await memorize({"session_id": session, "messages": second_batch})

    r2_cells = await _memcell_rows(session)
    r2_buffer = await _buffer_rows(session)

    # One memcell per round.
    assert len(r2_cells) == 2, r2_cells
    (_, first_ids), (_, second_ids) = r2_cells
    round1_cell_msgs = set(first_ids)
    round2_cell_msgs = set(second_ids)

    # KEY: the previous tail was consumed by round 2's memcell.
    assert prior_tail_msg_id in round2_cell_msgs, (
        f"prior buffer msg {prior_tail_msg_id} should have been consumed "
        f"into round 2's memcell, but it's missing from {round2_cell_msgs}"
    )
    # Round 2's cell = prior tail + first 2 of the new batch.
    assert len(round2_cell_msgs) == 3
    # Round 1's cell is untouched by round 2.
    assert len(round1_cell_msgs) == 2
    assert prior_tail_msg_id not in round1_cell_msgs

    # The buffer now holds exactly one row: the fresh tail.
    assert len(r2_buffer) == 1
    new_tail_id, _ = r2_buffer[0]

    # KEY: the old buffer row was replaced, not kept alongside the new one.
    assert new_tail_id != prior_tail_msg_id, (
        "old buffer entry survived into round 2's buffer — "
        "_replace_buffer is supposed to wipe + reinsert, not append"
    )

    # The buffered id appears in no memcell.
    all_cell_msgs = round1_cell_msgs | round2_cell_msgs
    assert new_tail_id not in all_cell_msgs

    # Conservation: 6 distinct message ids covered across cells + buffer.
    # (The id format is deliberately not hard-coded — gen_message_id encodes
    # the per-batch index, not a global one.)
    covered = all_cell_msgs | {new_tail_id}
    assert len(covered) == 6, (
        f"expected 6 distinct ids covered, got {len(covered)}: {covered}"
    )
# ---------------------------------------------------------------------------
# I4: flush drains buffer entirely (is_final=True path)
# ---------------------------------------------------------------------------
async def test_flush_after_accumulation_drains_buffer_into_memcell(
    memorize_env_scripted: Callable[..., AsyncMock],
) -> None:
    """An uncut add fills the buffer; a final flush moves it all to one cell.

    Script:
      - add: boundaries=[] → no cut, all 3 messages are buffered
      - flush: ``is_final=True`` with empty boundaries → the tail is closed
        into a memcell
    """
    scripted_llm = _make_scripted_llm([[], []])
    await memorize_env_scripted(fake_llm=scripted_llm)

    session = "s_flush"
    batch = list(map(_msg, range(3)))
    await memorize({"session_id": session, "messages": batch})

    # After the add: no memcell yet, everything is buffered.
    cells = await _memcell_rows(session)
    buffer = await _buffer_rows(session)
    assert cells == []
    assert len(buffer) == 3

    # Flush with no new messages.
    await memorize({"session_id": session, "messages": []}, is_final=True)

    cells = await _memcell_rows(session)
    buffer = await _buffer_rows(session)
    assert len(cells) == 1, cells
    _, flushed_ids = cells[0]
    assert len(flushed_ids) == 3
    assert buffer == []
# ---------------------------------------------------------------------------
# Sanity: mixed boundaries across multiple sequential adds keep conservation
# ---------------------------------------------------------------------------
async def test_three_sequential_adds_conservation_no_loss(
    memorize_env_scripted: Callable[..., AsyncMock],
) -> None:
    """3 sequential adds with mixed cuts: every input id covered exactly once.

    Script:
      - add 1: 3 msgs, no cut → buffer holds [m0,m1,m2]
      - add 2: 3 msgs, cut after 4 of merged [m0..m5] →
        cell=[m0..m3], buffer=[m4,m5]
      - add 3: 3 msgs, cut after 3 of merged [m4..m8] →
        cell=[m4,m5,m6], buffer=[m7,m8]

    Conservation is checked after every add, so a transient loss or
    duplication is caught in the round that causes it.
    """
    await memorize_env_scripted(
        fake_llm=_make_scripted_llm([[], [4], [3]]),
    )
    session = "s_seq"
    total_inputs = 0
    for batch_start in (0, 3, 6):
        batch = [_msg(i) for i in range(batch_start, batch_start + 3)]
        await memorize({"session_id": session, "messages": batch})
        total_inputs += len(batch)

        cells = await _memcell_rows(session)
        buffer = await _buffer_rows(session)

        # Keep ids as lists first: a set union would silently hide an id
        # that was written to two memcells (or twice to the buffer).
        cell_ids = [mid for _, msg_ids in cells for mid in msg_ids]
        buffer_ids = [mid for mid, _ in buffer]
        in_cells = set(cell_ids)
        in_buffer = set(buffer_ids)

        assert len(cell_ids) == len(in_cells), (
            f"message id duplicated across memcells: {sorted(cell_ids)}"
        )
        assert len(buffer_ids) == len(in_buffer), (
            f"message id duplicated in buffer: {sorted(buffer_ids)}"
        )

        covered = in_cells | in_buffer
        assert len(covered) == total_inputs, (
            f"expected {total_inputs} ids covered, got {len(covered)}"
        )
        # Disjoint
        assert not (in_cells & in_buffer)