# EverOS/tests/integration/test_memorize_integration.py

"""End-to-end memorize integration tests.

Drives ``service.memorize.memorize()`` with a ``FakeLLMClient`` so the
full chain (ingest → boundary → user / agent pipeline → md + OME emit)
runs without real LLM calls. Each test isolates state by:

- redirecting ``MemoryRoot.default()`` to a ``tmp_path``
- resetting service-layer lazy singletons
- starting / stopping a per-test ``OfflineEngine``
- patching ``get_llm_client`` (boundary + strategies) onto a fake

OME strategies (atomic / foresight) are silenced by swapping their extractor
classes for stubs whose ``aextract`` is an ``AsyncMock`` returning ``[]``, so
this test focuses on the synchronous boundary + pipeline + md path —
strategy dispatch correctness already has its own coverage in
``test_ome_strategies_integration.py``.
"""

from __future__ import annotations

import importlib
import json
import sqlite3
from collections.abc import AsyncIterator, Callable
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock

import pytest
import pytest_asyncio
from everalgo.llm.types import ChatMessage as LLMChatMessage
from everalgo.llm.types import ChatResponse
from everalgo.testing.fake_llm import FakeLLMClient
from sqlmodel import SQLModel

from everos.core.persistence import MemoryRoot
from everos.service.memorize import MemorizeResult, memorize

# ---------------------------------------------------------------------------
# Canned LLM responses
# ---------------------------------------------------------------------------


def _boundary_response(boundaries: list[int]) -> str:
    """Build a ``detect_boundaries`` JSON response (algo schema)."""
    payload = {
        "reasoning": "test",
        "boundaries": boundaries,
        "should_wait": False,
    }
    return json.dumps(payload)


def _episode_response(title: str = "Test Subject", content: str = "Test body") -> str:
    """Build an ``EpisodeExtractor`` JSON response (algo schema)."""
    return json.dumps({"title": title, "content": content})


def _make_fake_llm(
    boundary_responses: list[list[int]] | None = None,
    *,
    episode_title: str = "Test Subject",
    episode_content: str = "Test body",
) -> FakeLLMClient:
    """Create a ``FakeLLMClient`` whose reply depends on the prompt text.

    A prompt whose first message mentions ``boundaries`` or ``memcell``
    (case-insensitive) is answered with a boundary response; each such
    prompt consumes the next entry of ``boundary_responses`` and falls back
    to "no cuts" once the list is exhausted. Any other prompt is answered
    with the canned ``{title, content}`` episode response.
    """
    # Private copy so the caller's list is never mutated by the handler.
    pending_cuts: list[list[int]] = list(boundary_responses or [])

    def handler(messages: list[LLMChatMessage], **_: Any) -> ChatResponse:
        prompt_lc = messages[0].content.lower()
        is_boundary_prompt = "boundaries" in prompt_lc or "memcell" in prompt_lc

        if not is_boundary_prompt:
            # Everything else is treated as an episode prompt (this also
            # catches atomic/foresight prompts — they'll return
            # success-but-empty in their mocked extractor).
            body = _episode_response(episode_title, episode_content)
        elif pending_cuts:
            body = _boundary_response(pending_cuts.pop(0))
        else:
            body = _boundary_response([])

        return ChatResponse(content=body, model="fake")

    return FakeLLMClient(handler=handler)


# ---------------------------------------------------------------------------
# Shared setup fixture
# ---------------------------------------------------------------------------


@pytest_asyncio.fixture
async def memorize_env(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> AsyncIterator[Callable[..., Any]]:
    """Yield an async builder that configures a clean memorize environment.

    Usage::

        async def test_x(memorize_env):
            await memorize_env(mode="chat", fake_llm=_make_fake_llm([...]))
            outcome = await memorize({"session_id": "s", "messages": [...]})

    Before yielding, the fixture redirects ``MemoryRoot.default()`` to
    ``tmp_path``, creates ``tmp_path/.index/sqlite`` and resets the
    module-level singletons of the service, LLM client and strategy modules
    to ``None``.

    The builder must be called exactly once per test (it primes singletons
    + starts the OME engine). It takes keyword-only arguments ``mode``,
    ``fake_llm``, ``hard_token_limit`` and ``hard_msg_limit`` and resolves
    to ``None``.

    Teardown stops the engine and disposes the sqlite engine. The dispose
    step runs even when stopping the engine raises, so a failing shutdown
    cannot leave the sqlite engine alive for the next test.
    """
    monkeypatch.setattr(
        MemoryRoot, "default", classmethod(lambda cls: MemoryRoot(root=tmp_path))
    )
    (tmp_path / ".index" / "sqlite").mkdir(parents=True, exist_ok=True)

    svc = importlib.import_module("everos.service.memorize")
    af_mod = importlib.import_module("everos.memory.strategies.extract_atomic_facts")
    fs_mod = importlib.import_module("everos.memory.strategies.extract_foresight")
    client_mod = importlib.import_module("everos.component.llm.client")

    # Reset singletons. ``raising=False`` tolerates attributes that a given
    # module does not define.
    for attr in (
        "_episode_writer",
        "_prompt_loader",
        "_user_pipeline",
        "_agent_pipeline",
        "_ome_engine",
    ):
        monkeypatch.setattr(svc, attr, None, raising=False)
    monkeypatch.setattr(client_mod, "_llm_client", None, raising=False)
    monkeypatch.setattr(af_mod, "_writer", None, raising=False)
    monkeypatch.setattr(fs_mod, "_writer", None, raising=False)

    # Resources created by ``_setup`` that teardown has to release.
    started: dict[str, Any] = {"engine": None, "dispose_sqlite": None}

    async def _setup(
        *,
        mode: str = "chat",
        fake_llm: FakeLLMClient,
        hard_token_limit: int = 65536,
        hard_msg_limit: int = 500,
    ) -> None:
        # Provide a non-None API key + base_url so get_llm_client doesn't
        # raise; we replace the cached singleton with our fake right after.
        monkeypatch.setenv("EVEROS_MEMORIZE__MODE", mode)
        monkeypatch.setenv("EVEROS_LLM__API_KEY", "fake-key")
        monkeypatch.setenv("EVEROS_LLM__BASE_URL", "https://fake.example.com")
        monkeypatch.setenv(
            "EVEROS_BOUNDARY_DETECTION__HARD_TOKEN_LIMIT", str(hard_token_limit)
        )
        monkeypatch.setenv(
            "EVEROS_BOUNDARY_DETECTION__HARD_MSG_LIMIT", str(hard_msg_limit)
        )
        from everos.config import load_settings

        # Clear the settings cache after the env vars are in place.
        load_settings.cache_clear()

        # Replace the cached client singleton with our fake so get_llm_client
        # returns the fake on subsequent calls.
        monkeypatch.setattr(client_mod, "_llm_client", fake_llm)

        # Build sqlite schema.
        from everos.infra.persistence.sqlite import dispose_engine, get_engine

        db_engine = get_engine()
        async with db_engine.begin() as conn:
            await conn.run_sync(SQLModel.metadata.create_all)
        # Registered before the OME engine starts so a failing start still
        # gets the sqlite engine disposed at teardown.
        started["dispose_sqlite"] = dispose_engine

        # Mock the OME extractors so the async strategy chain is a no-op
        # (the strategy itself still runs; it just sees no facts/foresights).
        mock_af = AsyncMock(return_value=[])
        mock_fs = AsyncMock(return_value=[])
        monkeypatch.setattr(
            af_mod,
            "AtomicFactExtractor",
            lambda *a, **k: type("M", (), {"aextract": mock_af})(),
        )
        monkeypatch.setattr(
            fs_mod,
            "ForesightExtractor",
            lambda *a, **k: type("M", (), {"aextract": mock_fs})(),
        )

        engine = svc._get_engine()
        await engine.start()
        started["engine"] = engine

    yield _setup

    engine = started["engine"]
    dispose = started["dispose_sqlite"]
    try:
        if engine is not None:
            await engine.stop()
    finally:
        # Always release the sqlite engine, even if ``stop()`` raised.
        if dispose is not None:
            await dispose()


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _msg(
    role: str,
    content: str,
    *,
    sender_id: str = "u_alice",
    timestamp: int = 1_700_000_000_000,
    tool_calls: list[dict] | None = None,
    tool_call_id: str | None = None,
) -> dict[str, Any]:
    out: dict[str, Any] = {
        "sender_id": sender_id,
        "role": role,
        "content": content,
        "timestamp": timestamp,
    }
    if tool_calls is not None:
        out["tool_calls"] = tool_calls
    if tool_call_id is not None:
        out["tool_call_id"] = tool_call_id
    return out


def _user(content: str, ts: int, *, sender: str = "u_alice") -> dict[str, Any]:
    """Shorthand for a ``role="user"`` message sent by ``sender`` at ``ts``."""
    return _msg(role="user", content=content, timestamp=ts, sender_id=sender)


def _assistant(content: str, ts: int, *, sender: str = "assistant") -> dict[str, Any]:
    """Shorthand for a ``role="assistant"`` message sent by ``sender`` at ``ts``."""
    return _msg(role="assistant", content=content, timestamp=ts, sender_id=sender)


def _memcell_rows(tmp_path: Path) -> list[sqlite3.Row]:
    db = tmp_path / ".index" / "sqlite" / "system.db"
    if not db.is_file():
        return []
    conn = sqlite3.connect(db)
    conn.row_factory = sqlite3.Row
    try:
        return list(conn.execute("SELECT * FROM memcell ORDER BY timestamp"))
    finally:
        conn.close()


def _buffer_count(tmp_path: Path) -> int:
    db = tmp_path / ".index" / "sqlite" / "system.db"
    if not db.is_file():
        return 0
    conn = sqlite3.connect(db)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM unprocessed_buffer WHERE track='memorize'"
        ).fetchone()[0]
    finally:
        conn.close()


def _episode_paths(tmp_path: Path) -> list[Path]:
    base = tmp_path / "default_app" / "default_project" / "users"
    return sorted(base.rglob("episode-*.md"))


# ---------------------------------------------------------------------------
# Happy path baseline
# ---------------------------------------------------------------------------


async def test_chat_baseline_two_msgs_one_cell(
    tmp_path: Path,
    memorize_env: Callable[..., Any],
) -> None:
    """Baseline: a final two-message chat batch yields exactly one cell.

    Expects status ``extracted``, one ``memcell`` row, an empty buffer and
    a single episode markdown file carrying the canned title and body.
    """
    # One queued boundary reply with no internal cuts.
    await memorize_env(mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[]]))

    conversation = [
        _user("hello", 1_700_000_000_000),
        _assistant("hi there", 1_700_000_001_000),
    ]
    result = await memorize(
        {"session_id": "test_chat_1", "messages": conversation}, is_final=True
    )

    assert isinstance(result, MemorizeResult)
    assert result.status == "extracted"
    assert result.message_count == 2

    cells = _memcell_rows(tmp_path)
    assert len(cells) == 1
    cell = cells[0]
    assert cell["track"] == "memorize"
    assert cell["raw_type"] == "Conversation"
    # MemCell has no single owner — sender_ids carries the participants.
    participants = json.loads(cell["sender_ids_json"])
    assert "u_alice" in participants

    assert _buffer_count(tmp_path) == 0

    episodes = _episode_paths(tmp_path)
    assert len(episodes) == 1
    markdown = episodes[0].read_text()
    for expected in ("Test Subject", "Test body"):
        assert expected in markdown


# ---------------------------------------------------------------------------
# Input-shape boundary cases (6)
# ---------------------------------------------------------------------------


async def test_empty_batch_non_final_is_skipped(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """``messages=[]`` + ``is_final=False`` → ``accumulated``, no side effects.

    No memcell rows and no episode files may exist afterwards.
    """
    await memorize_env(mode="chat", fake_llm=_make_fake_llm())

    empty_payload = {"session_id": "test_empty_nonfinal", "messages": []}
    outcome = await memorize(empty_payload, is_final=False)

    assert outcome.status == "accumulated"
    assert outcome.message_count == 0
    assert len(_memcell_rows(tmp_path)) == 0
    assert len(_episode_paths(tmp_path)) == 0


async def test_empty_batch_final_drains_empty_buffer(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Final flush with ``messages=[]`` on a virgin session: no cells, no md."""
    llm = _make_fake_llm()
    await memorize_env(mode="chat", fake_llm=llm)

    request = {"session_id": "test_empty_final", "messages": []}
    outcome = await memorize(request, is_final=True)

    assert outcome.status == "accumulated"
    assert len(_memcell_rows(tmp_path)) == 0
    assert len(_episode_paths(tmp_path)) == 0


async def test_assistant_only_batch_accumulates(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """A non-final batch without any ``role=user`` message stays in the buffer.

    Expects status ``accumulated``, no memcell rows and both messages
    counted in the buffer.
    """
    # Empty response queue: no LLM call is expected for this batch.
    await memorize_env(mode="chat", fake_llm=_make_fake_llm(boundary_responses=[]))

    assistant_msgs = [
        _assistant("hi", 1_700_000_000_000),
        _assistant("anyone here?", 1_700_000_001_000),
    ]
    payload = {"session_id": "test_asst_only", "messages": assistant_msgs}
    outcome = await memorize(payload, is_final=False)

    assert outcome.status == "accumulated"
    assert len(_memcell_rows(tmp_path)) == 0
    # Both messages are parked in the buffer.
    assert _buffer_count(tmp_path) == 2


async def test_single_user_message_accumulates(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """A lone non-final user message produces no cell and is buffered."""
    # One boundary reply queued, carrying no cuts.
    await memorize_env(mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[]]))

    lone_message = _user("hello?", 1_700_000_000_000)
    payload = {"session_id": "test_single", "messages": [lone_message]}
    outcome = await memorize(payload, is_final=False)

    assert outcome.status == "accumulated"
    assert len(_memcell_rows(tmp_path)) == 0
    assert _buffer_count(tmp_path) == 1


async def test_chat_mode_filters_tool_messages(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Chat mode keeps only the plain user / assistant text messages.

    The batch mixes a user message, an assistant message carrying
    ``tool_calls``, a ``role=tool`` result and a plain assistant reply; the
    resulting single cell must reference exactly two message ids.
    """
    await memorize_env(mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[]]))

    tool_call = {
        "id": "c1",
        "type": "function",
        "function": {"name": "x", "arguments": "{}"},
    }
    question = _user("debug this", 1_700_000_000_000)
    assistant_tool_request = _msg(
        "assistant",
        "calling tool",
        timestamp=1_700_000_001_000,
        tool_calls=[tool_call],
    )
    tool_result = _msg(
        "tool",
        "result",
        sender_id="tool",
        timestamp=1_700_000_002_000,
        tool_call_id="c1",
    )
    answer = _assistant("here's the answer", 1_700_000_003_000)

    outcome = await memorize(
        {
            "session_id": "test_chat_filter",
            "messages": [question, assistant_tool_request, tool_result, answer],
        },
        is_final=True,
    )

    # After filter: 1 user + 1 assistant text = 2 msgs → 1 cell on flush.
    assert outcome.status == "extracted"
    cells = _memcell_rows(tmp_path)
    assert len(cells) == 1
    message_ids = json.loads(cells[0]["message_ids_json"])
    # tool + assistant-with-tool_calls dropped
    assert len(message_ids) == 2


async def test_duplicate_message_id_dedup_across_adds(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Replaying the same batch on a second ``/add`` must not duplicate ids."""
    # Two boundary calls are expected; both report no cuts.
    await memorize_env(
        mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[], []])
    )

    sid = "test_dedup"
    # message_id is derived from (session_id, ts_ms, idx); same payload twice
    # produces the same id, so the second add should be a no-op insert.
    batch = {
        "session_id": sid,
        "messages": [
            _user("hi", 1_700_000_000_000),
            _assistant("hi back", 1_700_000_001_000),
        ],
    }
    for _attempt in range(2):  # first add, then the replay
        await memorize(batch, is_final=False)
    await memorize({"session_id": sid, "messages": []}, is_final=True)

    cells = _memcell_rows(tmp_path)
    assert len(cells) == 1
    message_ids = json.loads(cells[0]["message_ids_json"])
    assert len(message_ids) == 2  # not 4 — dedup worked
    assert len(set(message_ids)) == 2  # unique


# ---------------------------------------------------------------------------
# Hard-limit cases (2)
# ---------------------------------------------------------------------------


async def test_hard_msg_limit_force_split(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Five messages with ``hard_msg_limit=3`` must end up in >= 2 cells."""
    # hard_msg_limit=3 → batch of 5 msgs forces ~1 split before LLM; the
    # single queued boundary reply serves the LLM call after the force-split.
    await memorize_env(
        mode="chat",
        fake_llm=_make_fake_llm(boundary_responses=[[]]),
        hard_msg_limit=3,
        hard_token_limit=10_000,
    )

    base_ts = 1_700_000_000_000
    msgs = []
    for i in range(5):
        ts = base_ts + i * 1000
        # Even positions are user turns, odd positions assistant turns.
        if i % 2 == 0:
            msgs.append(_user(f"u{i}", ts, sender="u_alice"))
        else:
            msgs.append(_assistant(f"a{i}", ts))

    outcome = await memorize(
        {"session_id": "test_hardmsg", "messages": msgs}, is_final=True
    )

    assert outcome.status == "extracted"
    # Force-split + LLM final → at least 2 cells (force + remaining).
    cell_count = len(_memcell_rows(tmp_path))
    assert cell_count >= 2


async def test_hard_token_limit_force_split(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Four 200-char messages with ``hard_token_limit=20`` give >= 2 cells."""
    # Very small token budget → even tiny content triggers force-split.
    await memorize_env(
        mode="chat",
        fake_llm=_make_fake_llm(boundary_responses=[[]]),
        hard_msg_limit=500,
        hard_token_limit=20,
    )

    base_ts = 1_700_000_000_000
    msgs = []
    for idx, letter in enumerate("abcd"):
        text = letter * 200
        ts = base_ts + idx * 1000
        # Alternate user / assistant, starting with the user.
        if idx % 2 == 0:
            msgs.append(_user(text, ts, sender="u_alice"))
        else:
            msgs.append(_assistant(text, ts))

    outcome = await memorize(
        {"session_id": "test_hardtok", "messages": msgs}, is_final=True
    )

    assert outcome.status == "extracted"
    cell_count = len(_memcell_rows(tmp_path))
    assert cell_count >= 2


# ---------------------------------------------------------------------------
# Flush state-machine cases (4)
# ---------------------------------------------------------------------------


async def test_flush_on_virgin_session_is_noop(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Flushing a session that never saw an ``/add`` must not crash.

    The call reports ``accumulated`` and writes no memcell rows.
    """
    llm = _make_fake_llm()
    await memorize_env(mode="chat", fake_llm=llm)

    flush_request = {"session_id": "test_virgin_flush", "messages": []}
    outcome = await memorize(flush_request, is_final=True)

    assert outcome.status == "accumulated"
    assert len(_memcell_rows(tmp_path)) == 0


async def test_add_then_flush_then_add(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """A session keeps working after its buffer has been flushed.

    Sequence: add → flush (1 cell) → add with ``is_final=True`` (2 cells in
    total).
    """
    await memorize_env(
        mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[], []])
    )

    sid = "test_add_flush_add"

    first_turn = [
        _user("first", 1_700_000_000_000),
        _assistant("ack", 1_700_000_001_000),
    ]
    await memorize({"session_id": sid, "messages": first_turn}, is_final=False)
    await memorize({"session_id": sid, "messages": []}, is_final=True)

    assert len(_memcell_rows(tmp_path)) == 1

    # Second turn after the flush.
    second_turn = [
        _user("second turn", 1_700_000_010_000),
        _assistant("ok", 1_700_000_011_000),
    ]
    await memorize({"session_id": sid, "messages": second_turn}, is_final=True)

    # Cumulative count across both flushes.
    assert len(_memcell_rows(tmp_path)) == 2


async def test_consecutive_flushes_second_is_noop(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Two flushes in a row: the first extracts, the second finds nothing."""
    await memorize_env(mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[]]))

    sid = "test_double_flush"

    async def _flush() -> MemorizeResult:
        # Fresh empty payload for every flush call.
        return await memorize({"session_id": sid, "messages": []}, is_final=True)

    turn = [
        _user("hi", 1_700_000_000_000),
        _assistant("ok", 1_700_000_001_000),
    ]
    await memorize({"session_id": sid, "messages": turn}, is_final=False)

    first_flush = await _flush()
    second_flush = await _flush()

    assert first_flush.status == "extracted"
    assert second_flush.status == "accumulated"  # nothing left
    assert len(_memcell_rows(tmp_path)) == 1


async def test_flush_drains_assistant_only_buffer(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Buffered assistant-only messages are drained by a later final add.

    Two assistant-only adds leave two messages in the buffer; a final add
    carrying one user message must report ``extracted`` and empty the buffer.
    """
    await memorize_env(mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[]]))

    sid = "test_asst_then_flush"

    # Two assistant-only adds → both park in buffer.
    assistant_turns = (("a1", 1_700_000_000_000), ("a2", 1_700_000_001_000))
    for text, ts in assistant_turns:
        await memorize(
            {"session_id": sid, "messages": [_assistant(text, ts)]},
            is_final=False,
        )
    assert _buffer_count(tmp_path) == 2

    # Add a user message + flush — boundary should now run.
    user_turn = _user("anyone there?", 1_700_000_002_000)
    outcome = await memorize(
        {"session_id": sid, "messages": [user_turn]}, is_final=True
    )

    assert outcome.status == "extracted"
    assert _buffer_count(tmp_path) == 0


# ---------------------------------------------------------------------------
# Multi-session cases (2)
# ---------------------------------------------------------------------------


async def test_two_sessions_are_isolated(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Two session ids on one engine produce one separate cell each.

    Each cell must carry its own session id and list its own user among
    the participants.
    """
    # One boundary reply per session.
    await memorize_env(
        mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[], []])
    )

    # (session_id, user sender, user text, assistant text, first timestamp)
    turns = (
        ("sess_A", "u_alice", "hi from A", "ack A", 1_700_000_000_000),
        ("sess_B", "u_bob", "hi from B", "ack B", 1_700_000_010_000),
    )
    for sid, sender, greeting, ack, ts in turns:
        await memorize(
            {
                "session_id": sid,
                "messages": [
                    _user(greeting, ts, sender=sender),
                    _assistant(ack, ts + 1000),
                ],
            },
            is_final=True,
        )

    cells = _memcell_rows(tmp_path)
    assert len(cells) == 2
    session_ids = [cell["session_id"] for cell in cells]
    session_ids.sort()
    assert session_ids == ["sess_A", "sess_B"]

    # MemCell has no single owner — sender_ids carries who participated.
    participants: dict[str, Any] = {}
    for cell in cells:
        participants[cell["session_id"]] = json.loads(cell["sender_ids_json"])
    assert "u_alice" in participants["sess_A"]
    assert "u_bob" in participants["sess_B"]


async def test_same_session_multi_add_concatenates(
    tmp_path: Path, memorize_env: Callable[..., Any]
) -> None:
    """Three non-final adds pile up in one buffer; the flush folds them into one cell."""
    await memorize_env(
        mode="chat", fake_llm=_make_fake_llm(boundary_responses=[[], [], []])
    )

    sid = "test_multi_add"
    for turn in range(3):
        offset = turn * 2000
        exchange = [
            _user(f"u{turn}", 1_700_000_000_000 + offset),
            _assistant(f"a{turn}", 1_700_000_001_000 + offset),
        ]
        await memorize({"session_id": sid, "messages": exchange}, is_final=False)

    # Buffer should have 6 messages now (no boundary cuts).
    assert _buffer_count(tmp_path) == 6

    outcome = await memorize({"session_id": sid, "messages": []}, is_final=True)
    assert outcome.status == "extracted"

    cells = _memcell_rows(tmp_path)
    assert len(cells) == 1  # one cell from the flush
    message_ids = json.loads(cells[0]["message_ids_json"])
    assert len(message_ids) == 6  # all 6 messages folded in