md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
434 lines
15 KiB
Python
434 lines
15 KiB
Python
"""Window-segmentation white-box integration tests for boundary stage.
|
|
|
|
Verifies the **read-merge-boundary-write** semantics of one ``memorize()``
|
|
invocation, especially the buffer-as-tail invariant and the **buffer
|
|
replacement** behaviour on successive calls:
|
|
|
|
Invariants under test
|
|
---------------------
|
|
I1. After one ``add`` with ``boundaries=[k]``:
|
|
- memcell rows: prefix of merged input (first k messages)
|
|
- buffer rows: tail (the remaining messages)
|
|
- every input message_id lands in exactly one of {memcell, buffer}
|
|
(covered ∧ disjoint)
|
|
|
|
I2. Tail ordering: every buffer row's timestamp ≥ every memcell row's
|
|
timestamp (the tail is the **last** part of the time-ordered slice).
|
|
|
|
I3. Successive ``add`` consumes prior buffer:
|
|
- Round 2's boundary sees ``prior_buffer + new_batch`` merged.
|
|
- The prior tail (m3 say) ends up in **Round 2's memcell** if the
|
|
boundary cuts past it, NOT in any buffer row.
|
|
- The new buffer is the **fresh** tail, with the old buffer rows
|
|
replaced entirely (semantics of ``_replace_buffer``).
|
|
|
|
I4. ``flush`` with ``is_final=True`` drains the buffer entirely — every
|
|
remaining message ends up in some memcell.
|
|
|
|
This is **single-threaded sequential** (the concurrent race is covered
|
|
separately in test_memorize_concurrent_session_lock.py). FakeLLM scripts
|
|
boundary decisions deterministically so we own exact slicing.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
import json
|
|
from collections.abc import AsyncIterator, Callable
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from unittest.mock import AsyncMock
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from everalgo.llm.types import ChatMessage as LLMChatMessage
|
|
from everalgo.llm.types import ChatResponse
|
|
from everalgo.testing.fake_llm import FakeLLMClient
|
|
from sqlalchemy import text
|
|
from sqlmodel import SQLModel
|
|
|
|
from everos.core.persistence import MemoryRoot
|
|
from everos.service.memorize import memorize
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# FakeLLM with scripted boundary responses (FIFO queue, one pop per call)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _boundary_response(boundaries: list[int]) -> str:
|
|
return json.dumps(
|
|
{"reasoning": "test", "boundaries": boundaries, "should_wait": False}
|
|
)
|
|
|
|
|
|
def _episode_response(title: str = "T", content: str = "B") -> str:
|
|
return json.dumps({"title": title, "content": content})
|
|
|
|
|
|
def _make_scripted_llm(
|
|
boundary_responses: list[list[int]],
|
|
) -> FakeLLMClient:
|
|
"""Boundary calls FIFO-pop from ``boundary_responses``.
|
|
|
|
Episode calls (for downstream pipeline) get a canned response.
|
|
"""
|
|
queue: list[list[int]] = list(boundary_responses)
|
|
|
|
def handler(messages: list[LLMChatMessage], **_: Any) -> ChatResponse:
|
|
prompt = messages[0].content
|
|
if "boundaries" in prompt.lower() or "memcell" in prompt.lower():
|
|
cuts = queue.pop(0) if queue else []
|
|
return ChatResponse(content=_boundary_response(cuts), model="fake")
|
|
return ChatResponse(content=_episode_response(), model="fake")
|
|
|
|
return FakeLLMClient(handler=handler)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixture — mirrors the locked-env fixture in the concurrent test
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest_asyncio.fixture
|
|
async def memorize_env_scripted(
|
|
tmp_path: Path,
|
|
monkeypatch: pytest.MonkeyPatch,
|
|
) -> AsyncIterator[Callable[..., AsyncMock]]:
|
|
monkeypatch.setattr(
|
|
MemoryRoot, "default", classmethod(lambda cls: MemoryRoot(root=tmp_path))
|
|
)
|
|
(tmp_path / ".index" / "sqlite").mkdir(parents=True, exist_ok=True)
|
|
|
|
svc = importlib.import_module("everos.service.memorize")
|
|
af_mod = importlib.import_module("everos.memory.strategies.extract_atomic_facts")
|
|
fs_mod = importlib.import_module("everos.memory.strategies.extract_foresight")
|
|
client_mod = importlib.import_module("everos.component.llm.client")
|
|
lock_mod = importlib.import_module("everos.service._session_lock")
|
|
|
|
for attr in (
|
|
"_episode_writer",
|
|
"_prompt_loader",
|
|
"_user_pipeline",
|
|
"_agent_pipeline",
|
|
"_ome_engine",
|
|
):
|
|
monkeypatch.setattr(svc, attr, None, raising=False)
|
|
monkeypatch.setattr(client_mod, "_llm_client", None, raising=False)
|
|
monkeypatch.setattr(af_mod, "_writer", None, raising=False)
|
|
monkeypatch.setattr(fs_mod, "_writer", None, raising=False)
|
|
lock_mod._reset_for_tests()
|
|
|
|
started: dict[str, Any] = {"engine": None}
|
|
|
|
async def _setup(*, fake_llm: FakeLLMClient) -> None:
|
|
monkeypatch.setenv("EVEROS_MEMORIZE__MODE", "chat")
|
|
monkeypatch.setenv("EVEROS_LLM__API_KEY", "fake-key")
|
|
monkeypatch.setenv("EVEROS_LLM__BASE_URL", "https://fake.example.com")
|
|
from everos.config import load_settings
|
|
|
|
load_settings.cache_clear()
|
|
|
|
monkeypatch.setattr(client_mod, "_llm_client", fake_llm)
|
|
|
|
from everos.infra.persistence.sqlite import get_engine
|
|
|
|
db_engine = get_engine()
|
|
async with db_engine.begin() as conn:
|
|
await conn.run_sync(SQLModel.metadata.create_all)
|
|
|
|
# Silence OME strategies — orthogonal to boundary segmentation.
|
|
mock_af = AsyncMock(return_value=[])
|
|
mock_fs = AsyncMock(return_value=[])
|
|
monkeypatch.setattr(
|
|
af_mod,
|
|
"AtomicFactExtractor",
|
|
lambda *a, **k: type("M", (), {"aextract": mock_af})(),
|
|
)
|
|
monkeypatch.setattr(
|
|
fs_mod,
|
|
"ForesightExtractor",
|
|
lambda *a, **k: type("M", (), {"aextract": mock_fs})(),
|
|
)
|
|
|
|
engine = svc._get_engine()
|
|
await engine.start()
|
|
started["engine"] = engine
|
|
|
|
yield _setup
|
|
|
|
if started["engine"] is not None:
|
|
await started["engine"].stop()
|
|
from everos.infra.persistence.sqlite import dispose_engine
|
|
|
|
await dispose_engine()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers — message factory + state inspectors
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_BASE_TS = 1_700_000_000_000 # 2023-11-14, plenty of headroom
|
|
|
|
|
|
def _msg(idx: int, sender: str = "alice") -> dict[str, Any]:
|
|
"""Build one canonical /add message with monotonically increasing ts."""
|
|
return {
|
|
"sender_id": sender,
|
|
"role": "user",
|
|
"timestamp": _BASE_TS + idx * 1000,
|
|
"content": f"msg-{idx}",
|
|
}
|
|
|
|
|
|
async def _buffer_rows(session_id: str) -> list[tuple[str, int]]:
|
|
"""Return ``[(message_id, timestamp_ms)]`` for buffer rows, time-ordered."""
|
|
from everos.component.utils.datetime import from_iso_format, to_timestamp_ms
|
|
from everos.infra.persistence.sqlite import get_engine
|
|
|
|
eng = get_engine()
|
|
async with eng.connect() as conn:
|
|
result = await conn.execute(
|
|
text(
|
|
"SELECT message_id, timestamp FROM unprocessed_buffer "
|
|
"WHERE session_id = :s ORDER BY timestamp"
|
|
),
|
|
{"s": session_id},
|
|
)
|
|
rows: list[tuple[str, int]] = []
|
|
for mid, ts in result.fetchall():
|
|
# sqlite stores DateTime as ISO 8601 string via SQLAlchemy.
|
|
ts_ms = to_timestamp_ms(from_iso_format(ts))
|
|
rows.append((mid, ts_ms))
|
|
return rows
|
|
|
|
|
|
async def _memcell_rows(session_id: str) -> list[tuple[str, list[str]]]:
|
|
"""Return ``[(memcell_id, message_ids[])]`` in insertion order."""
|
|
from everos.infra.persistence.sqlite import get_engine
|
|
|
|
eng = get_engine()
|
|
async with eng.connect() as conn:
|
|
result = await conn.execute(
|
|
text(
|
|
"SELECT memcell_id, message_ids_json FROM memcell "
|
|
"WHERE session_id = :s ORDER BY created_at"
|
|
),
|
|
{"s": session_id},
|
|
)
|
|
return [(mid, json.loads(raw)) for mid, raw in result.fetchall()]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# I1 + I2: single add with boundaries=[k] — prefix→memcell, suffix→buffer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def test_single_add_no_cut_accumulates_full_batch_in_buffer(
|
|
memorize_env_scripted: Callable[..., AsyncMock],
|
|
) -> None:
|
|
"""boundaries=[] → no memcell, entire batch sits in buffer."""
|
|
await memorize_env_scripted(fake_llm=_make_scripted_llm([[]]))
|
|
|
|
session = "s_no_cut"
|
|
inputs = [_msg(i) for i in range(3)]
|
|
await memorize({"session_id": session, "messages": inputs})
|
|
|
|
cells = await _memcell_rows(session)
|
|
buffer = await _buffer_rows(session)
|
|
|
|
assert cells == [], f"expected no memcell, got {cells}"
|
|
assert len(buffer) == 3, f"expected 3 buffer rows, got {len(buffer)}"
|
|
# buffer holds all 3 input message_ids, time-ordered
|
|
buffer_ts = [ts for _, ts in buffer]
|
|
assert buffer_ts == sorted(buffer_ts)
|
|
|
|
|
|
async def test_single_add_with_cut_splits_prefix_to_cell_suffix_to_buffer(
|
|
memorize_env_scripted: Callable[..., AsyncMock],
|
|
) -> None:
|
|
"""boundaries=[2] on a 3-msg batch → cell=[m0,m1], buffer=[m2]."""
|
|
await memorize_env_scripted(fake_llm=_make_scripted_llm([[2]]))
|
|
|
|
session = "s_cut"
|
|
inputs = [_msg(i) for i in range(3)]
|
|
await memorize({"session_id": session, "messages": inputs})
|
|
|
|
cells = await _memcell_rows(session)
|
|
buffer = await _buffer_rows(session)
|
|
|
|
# Exactly one memcell carved.
|
|
assert len(cells) == 1, cells
|
|
cell_msg_ids = set(cells[0][1])
|
|
assert len(cell_msg_ids) == 2
|
|
|
|
# Buffer holds the remaining one message.
|
|
assert len(buffer) == 1
|
|
buf_msg_id = buffer[0][0]
|
|
|
|
# Disjoint: buffer message NOT in the memcell.
|
|
assert buf_msg_id not in cell_msg_ids, (
|
|
"buffer row leaked into memcell — buffer should be the tail only"
|
|
)
|
|
|
|
# I2 — tail comes AFTER prefix in time.
|
|
cell_max_ts = max(_BASE_TS + i * 1000 for i in (0, 1))
|
|
buf_ts = buffer[0][1]
|
|
assert buf_ts >= cell_max_ts, (
|
|
f"tail ts ({buf_ts}) must be >= max cell ts ({cell_max_ts})"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# I3: successive add — prior buffer feeds into next memcell, then is REPLACED
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def test_second_add_consumes_prior_buffer_and_replaces_tail(
|
|
memorize_env_scripted: Callable[..., AsyncMock],
|
|
) -> None:
|
|
"""Core test: prior tail must end up in next memcell, NOT remain in buffer."""
|
|
# Round 1: cut after 2 of 3 → cell=[m0,m1], buffer=[m2]
|
|
# Round 2: merged input = [m2,m3,m4,m5]; cut after 3 → cell=[m2,m3,m4],
|
|
# buffer=[m5]
|
|
await memorize_env_scripted(
|
|
fake_llm=_make_scripted_llm([[2], [3]]),
|
|
)
|
|
|
|
session = "s_replace"
|
|
|
|
# Round 1
|
|
r1_inputs = [_msg(i) for i in range(3)]
|
|
await memorize({"session_id": session, "messages": r1_inputs})
|
|
|
|
r1_cells = await _memcell_rows(session)
|
|
r1_buffer = await _buffer_rows(session)
|
|
assert len(r1_cells) == 1
|
|
assert len(r1_buffer) == 1
|
|
prior_tail_msg_id = r1_buffer[0][0]
|
|
|
|
# Round 2 — fresh messages m3, m4, m5
|
|
r2_inputs = [_msg(i) for i in range(3, 6)]
|
|
await memorize({"session_id": session, "messages": r2_inputs})
|
|
|
|
r2_cells = await _memcell_rows(session)
|
|
r2_buffer = await _buffer_rows(session)
|
|
|
|
# Two memcells total: one from round 1, one from round 2.
|
|
assert len(r2_cells) == 2, r2_cells
|
|
round1_cell_msgs = set(r2_cells[0][1])
|
|
round2_cell_msgs = set(r2_cells[1][1])
|
|
|
|
# ★ KEY ASSERTION ★ — prior buffer's message landed in round 2 cell.
|
|
assert prior_tail_msg_id in round2_cell_msgs, (
|
|
f"prior buffer msg {prior_tail_msg_id} should have been consumed "
|
|
f"into round 2's memcell, but it's missing from {round2_cell_msgs}"
|
|
)
|
|
# Round 2 cell should have exactly 3 messages (prior tail + first 2 of new).
|
|
assert len(round2_cell_msgs) == 3
|
|
|
|
# Round 1 cell unchanged.
|
|
assert len(round1_cell_msgs) == 2
|
|
assert prior_tail_msg_id not in round1_cell_msgs
|
|
|
|
# Buffer is the NEW tail — exactly 1 fresh row.
|
|
assert len(r2_buffer) == 1
|
|
new_tail_id = r2_buffer[0][0]
|
|
|
|
# ★ KEY ASSERTION ★ — the OLD buffer entry is gone (replaced, not appended).
|
|
assert new_tail_id != prior_tail_msg_id, (
|
|
"old buffer entry survived into round 2's buffer — "
|
|
"_replace_buffer is supposed to wipe + reinsert, not append"
|
|
)
|
|
|
|
# Buffer ∩ all memcells = ∅
|
|
all_cell_msgs = round1_cell_msgs | round2_cell_msgs
|
|
assert new_tail_id not in all_cell_msgs
|
|
|
|
# Conservation: 6 distinct message ids covered across cells + buffer.
|
|
# (We avoid hard-coding id format here — gen_message_id encodes the
|
|
# per-batch index, not a global one.)
|
|
covered = all_cell_msgs | {new_tail_id}
|
|
assert len(covered) == 6, (
|
|
f"expected 6 distinct ids covered, got {len(covered)}: {covered}"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# I4: flush drains buffer entirely (is_final=True path)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def test_flush_after_accumulation_drains_buffer_into_memcell(
|
|
memorize_env_scripted: Callable[..., AsyncMock],
|
|
) -> None:
|
|
"""add(boundaries=[]) → buffer accumulates → flush → cell=all, buffer=[]."""
|
|
# Round 1 add: boundaries=[] → no cut, all into buffer.
|
|
# Flush: is_final=True passes empty boundaries → algo closes tail into cell.
|
|
await memorize_env_scripted(
|
|
fake_llm=_make_scripted_llm([[], []]),
|
|
)
|
|
|
|
session = "s_flush"
|
|
inputs = [_msg(i) for i in range(3)]
|
|
await memorize({"session_id": session, "messages": inputs})
|
|
|
|
# Post-add: nothing in memcell yet.
|
|
cells = await _memcell_rows(session)
|
|
buffer = await _buffer_rows(session)
|
|
assert cells == []
|
|
assert len(buffer) == 3
|
|
|
|
# Flush
|
|
await memorize({"session_id": session, "messages": []}, is_final=True)
|
|
|
|
cells = await _memcell_rows(session)
|
|
buffer = await _buffer_rows(session)
|
|
|
|
assert len(cells) == 1, cells
|
|
assert len(cells[0][1]) == 3
|
|
assert buffer == []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Sanity: empty boundaries + multiple sequential adds keep conservation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def test_three_sequential_adds_conservation_no_loss(
|
|
memorize_env_scripted: Callable[..., AsyncMock],
|
|
) -> None:
|
|
"""3 sequential adds with mixed cuts: every input id covered exactly once."""
|
|
# add 1: 3 msgs, no cut → buffer holds [m0,m1,m2]
|
|
# add 2: 3 msgs, cut after 4 of merged [m0..m5] → cell=[m0..m3], buffer=[m4,m5]
|
|
# add 3: 3 msgs, cut after 3 of merged [m4..m8] → cell=[m4,m5,m6], buffer=[m7,m8]
|
|
await memorize_env_scripted(
|
|
fake_llm=_make_scripted_llm([[], [4], [3]]),
|
|
)
|
|
|
|
session = "s_seq"
|
|
total_inputs = 0
|
|
for batch_start in (0, 3, 6):
|
|
await memorize(
|
|
{
|
|
"session_id": session,
|
|
"messages": [_msg(i) for i in range(batch_start, batch_start + 3)],
|
|
}
|
|
)
|
|
total_inputs += 3
|
|
|
|
cells = await _memcell_rows(session)
|
|
buffer = await _buffer_rows(session)
|
|
|
|
in_cells: set[str] = set()
|
|
for _, msg_ids in cells:
|
|
in_cells.update(msg_ids)
|
|
in_buffer = {mid for mid, _ in buffer}
|
|
|
|
covered = in_cells | in_buffer
|
|
assert len(covered) == total_inputs, (
|
|
f"expected {total_inputs} ids covered, got {len(covered)}"
|
|
)
|
|
# Disjoint
|
|
assert not (in_cells & in_buffer)
|