chore: initialize EverOS 1.0.0

An md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state, and LanceDB provides the rebuildable vector + BM25 + scalar index.

The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting), enforced by import-linter.

Engineering surface:
- Coding conventions live in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and guards against OpenAPI drift.
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions
--- a/tests/unit/test_memory/test_extract/__init__.py
+++ b/tests/unit/test_memory/test_extract/__init__.py
--- a/tests/unit/test_memory/test_extract/test_ingest/__init__.py
+++ b/tests/unit/test_memory/test_extract/test_ingest/__init__.py
--- a/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py
+++ b/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py
@ -0,0 +1,45 @@
+"""Tests for ingest content coercion + text derivation (tagged rendering)."""
+
+from __future__ import annotations
+
+from everos.memory.extract.ingest.multimodal import (
+    coerce_items,
+    derive_text,
+    normalise_content,
+)
+
+
+def test_coerce_str_to_text_item() -> None:
+    assert coerce_items("hi") == [{"type": "text", "text": "hi"}]
+
+
+def test_derive_text_renders_parsed_nontext_as_tag() -> None:
+    items = [
+        {"type": "text", "text": "before"},
+        {"type": "image", "name": "p.png", "parsed_content": "OCR TEXT"},
+        {"type": "text", "text": "after"},
+    ]
+    text, non_text = derive_text(items)
+
+    assert "[IMAGE: p.png]\nOCR TEXT" in text
+    assert text.startswith("before")
+    assert text.endswith("after")
+    assert non_text == 0
+
+
+def test_derive_text_counts_unparsed_nontext() -> None:
+    text, non_text = derive_text([{"type": "image", "uri": "x"}])
+    assert text == ""
+    assert non_text == 1
+
+
+def test_derive_text_tag_without_name() -> None:
+    text, _ = derive_text([{"type": "pdf", "parsed_content": "DOC"}])
+    assert text == "[PDF]\nDOC"
+
+
+def test_normalise_content_text_only_unchanged() -> None:
+    items, text, non_text = normalise_content("hello")
+    assert items == [{"type": "text", "text": "hello"}]
+    assert text == "hello"
+    assert non_text == 0
--- a/tests/unit/test_memory/test_extract/test_parser/__init__.py
+++ b/tests/unit/test_memory/test_extract/test_parser/__init__.py
--- a/tests/unit/test_memory/test_extract/test_parser/test_availability.py
+++ b/tests/unit/test_memory/test_extract/test_parser/test_availability.py
@ -0,0 +1,38 @@
+"""Tests for the multimodal capability guard."""
+
+from __future__ import annotations
+
+import pytest
+
+from everos.core.errors import MultimodalNotEnabledError
+from everos.memory.extract.parser import availability
+
+
+def test_has_unparsed_multimodal_true_for_unparsed_nontext() -> None:
+    items = [{"type": "text", "text": "hi"}, {"type": "image", "uri": "x"}]
+    assert availability.has_unparsed_multimodal(items) is True
+
+
+def test_has_unparsed_multimodal_false_when_all_text() -> None:
+    items = [{"type": "text", "text": "hi"}]
+    assert availability.has_unparsed_multimodal(items) is False
+
+
+def test_has_unparsed_multimodal_false_when_already_parsed() -> None:
+    items = [{"type": "image", "uri": "x", "parsed_content": "ocr"}]
+    assert availability.has_unparsed_multimodal(items) is False
+
+
+def test_require_multimodal_raises_when_unavailable(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(availability, "multimodal_available", lambda: False)
+    with pytest.raises(MultimodalNotEnabledError):
+        availability.require_multimodal()
+
+
+def test_require_multimodal_ok_when_available(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(availability, "multimodal_available", lambda: True)
+    availability.require_multimodal()  # must not raise
--- a/tests/unit/test_memory/test_extract/test_parser/test_enrich.py
+++ b/tests/unit/test_memory/test_extract/test_parser/test_enrich.py
@ -0,0 +1,183 @@
+"""Tests for enrich_content_items (everalgo.parser.aparse is monkeypatched)."""
+
+from __future__ import annotations
+
+import base64
+from typing import Any
+
+import pytest
+
+# ``everalgo.parser`` ships under the ``[multimodal]`` extra (see
+# pyproject.toml). CI doesn't install that extra by default, and these
+# tests monkeypatch ``everalgo.parser.aparse`` — which requires the
+# module to actually be importable, otherwise ``monkeypatch.setattr``
+# fails at resolve-time. Skip the whole module when the optional
+# dependency isn't present; we still run when ``multimodal`` is installed.
+pytest.importorskip("everalgo.parser")
+
+from everalgo.llm import LLMError  # noqa: E402
+from everalgo.types import ParsedContent  # noqa: E402
+
+from everos.core.errors import UnsupportedModalityError  # noqa: E402
+from everos.memory.extract.parser import enrich_content_items  # noqa: E402
+
+
+def _img_item() -> dict[str, Any]:
+    return {
+        "type": "image",
+        "base64": base64.b64encode(b"\x89PNG").decode(),
+        "ext": "png",
+    }
+
+
+def _html_b64_item() -> dict[str, Any]:
+    return {
+        "type": "html",
+        "base64": base64.b64encode(b"<html><body>v9.9.9</body></html>").decode(),
+        "ext": "html",
+    }
+
+
+def _html_uri_item() -> dict[str, Any]:
+    return {"type": "html", "uri": "https://example.com/page.html"}
+
+
+async def test_enrich_backfills_parsed_content(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        return ParsedContent(text="OCR RESULT")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    items: list[dict[str, Any]] = [{"type": "text", "text": "hi"}, _img_item()]
+    await enrich_content_items(items, llm=object(), max_concurrency=2)
+
+    assert items[1]["parsed_content"] == "OCR RESULT"
+    assert items[1]["parse_status"] == "success"
+    assert "parsed_content" not in items[0]  # text item untouched
+
+
+async def test_enrich_unsupported_modality_raises(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        raise NotImplementedError("video deferred")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    with pytest.raises(UnsupportedModalityError):
+        await enrich_content_items([_img_item()], llm=object())
+
+
+async def test_enrich_transient_llm_error_degrades(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        raise LLMError("provider down")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    items = [_img_item()]
+    await enrich_content_items(items, llm=object())  # must not raise
+
+    assert items[0]["parse_status"] == "failed"
+    assert "parsed_content" not in items[0]
+
+
+async def test_enrich_html_base64_routes_as_html_bytes(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A type=html base64 item reaches the parser as html-extension bytes.
+
+    Locks the "normal HTML file call" contract: base64 + ext=html maps to
+    a RawFile the parser dispatches as HTML (vs the 415 that a text-only
+    html item produces — see test_ingest for that negative path).
+    """
+    seen: dict[str, Any] = {}
+
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        seen["extension"] = raw_file.extension
+        seen["content"] = raw_file.content
+        return ParsedContent(text="HTML PARSED")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    items = [_html_b64_item()]
+    await enrich_content_items(items, llm=object())
+
+    assert items[0]["parsed_content"] == "HTML PARSED"
+    assert items[0]["parse_status"] == "success"
+    assert seen["extension"] == "html"
+    assert b"v9.9.9" in seen["content"]
+
+
+async def test_enrich_http_uri_routes_as_uri(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """An http(s) uri item reaches the parser as a uri RawFile (no bytes).
+
+    Proves everos forwards uri-backed items to the parser, which is what
+    drives everalgo's URL-fetch dispatch path (http/https only; file:// is
+    rejected downstream).
+    """
+    seen: dict[str, Any] = {}
+
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        seen["uri"] = raw_file.uri
+        seen["content"] = raw_file.content
+        return ParsedContent(text="URL PARSED")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    items = [_html_uri_item()]
+    await enrich_content_items(items, llm=object())
+
+    assert items[0]["parsed_content"] == "URL PARSED"
+    assert items[0]["parse_status"] == "success"
+    assert seen["uri"] == "https://example.com/page.html"
+    assert seen["content"] == b""
+
+
+async def test_enrich_html_text_only_raises_unsupported(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """type=html carrying only ``text`` (no uri/base64) is undispatchable.
+
+    Any non-text item is routed to the parser, which needs a fetchable or
+    decodable payload; a bare ``text`` has neither, so it surfaces as a
+    MultimodalError (the route maps it to HTTP 415). To inline HTML *as
+    text*, callers must use ``type="text"`` instead.
+    """
+
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        return ParsedContent(text="should-not-be-reached")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    with pytest.raises(UnsupportedModalityError):
+        await enrich_content_items(
+            [{"type": "html", "text": "<p>hi</p>"}], llm=object()
+        )
+
+
+async def test_enrich_file_uri_hydrates_and_parses(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Any,
+) -> None:
+    """A ``file://`` item is read locally and handed to the parser as bytes.
+
+    Proves EverOS hydrates the file (everalgo never sees the path / fs) — the
+    parser receives ``content`` bytes, not a uri.
+    """
+    seen: dict[str, Any] = {}
+
+    async def fake_aparse(raw_file: Any, *, llm: Any) -> ParsedContent:
+        seen["content"] = raw_file.content
+        seen["uri"] = raw_file.uri
+        return ParsedContent(text="FILE PARSED")
+
+    monkeypatch.setattr("everalgo.parser.aparse", fake_aparse)
+    f = tmp_path / "doc.html"
+    f.write_bytes(b"<html>hello</html>")
+    items = [{"type": "html", "uri": f"file://{f}"}]
+    await enrich_content_items(items, llm=object())
+
+    assert items[0]["parsed_content"] == "FILE PARSED"
+    assert items[0]["parse_status"] == "success"
+    assert seen["content"] == b"<html>hello</html>"  # hydrated, not a pointer
+    assert seen["uri"] == ""
--- a/tests/unit/test_memory/test_extract/test_parser/test_mapping.py
+++ b/tests/unit/test_memory/test_extract/test_parser/test_mapping.py
@ -0,0 +1,105 @@
+"""Tests for ContentItem -> everalgo RawFile mapping + file:// hydration."""
+
+from __future__ import annotations
+
+import base64
+from pathlib import Path
+
+import pytest
+
+from everos.config import load_settings
+from everos.memory.extract.parser.mapping import build_raw_file, to_raw_file
+
+
+@pytest.fixture(autouse=True)
+def _clear_settings_cache():
+    """Clear the ``load_settings`` cache before and after every test.
+
+    The file:// guardrails read settings, so env overrides applied by one
+    test must not leak into the next through the lru_cache.
+    """
+    # Start from a clean cache so earlier overrides are not visible.
+    load_settings.cache_clear()
+    yield
+    # Drop whatever this test cached (possibly built from patched env vars).
+    load_settings.cache_clear()
+
+
+def test_uri_item_maps_to_rawfile_uri() -> None:
+    rf = to_raw_file({"type": "image", "uri": "https://x/y.png"})
+    assert rf.uri == "https://x/y.png"
+    assert rf.content == b""
+
+
+def test_base64_item_decodes_and_lowercases_extension() -> None:
+    raw = b"\x89PNG\r\n"
+    rf = to_raw_file(
+        {"type": "image", "base64": base64.b64encode(raw).decode(), "ext": ".PNG"}
+    )
+    assert rf.content == raw
+    assert rf.extension == "png"
+
+
+def test_item_without_uri_or_base64_raises() -> None:
+    with pytest.raises(ValueError):
+        to_raw_file({"type": "image"})
+
+
+# ── build_raw_file: file:// hydration + guardrails ──────────────────────
+
+
+async def test_build_raw_file_delegates_http_uri() -> None:
+    """http(s) uris stay in uri form (everalgo fetches), not hydrated."""
+    rf = await build_raw_file({"type": "html", "uri": "https://example.com"})
+    assert rf.uri == "https://example.com"
+    assert rf.content == b""
+
+
+async def test_build_raw_file_hydrates_file_uri(tmp_path: Path) -> None:
+    """file:// is read locally into a hydrated RawFile (content + ext)."""
+    f = tmp_path / "notes.html"
+    f.write_bytes(b"<html><body>v9.9.9</body></html>")
+    rf = await build_raw_file({"type": "html", "uri": f"file://{f}"})
+    assert rf.content == b"<html><body>v9.9.9</body></html>"
+    assert rf.extension == "html"
+    assert rf.uri == ""  # hydrated, not a pointer
+
+
+async def test_build_raw_file_file_uri_ext_hint_wins(tmp_path: Path) -> None:
+    f = tmp_path / "blob"  # no suffix
+    f.write_bytes(b"%PDF-1.4 ...")
+    rf = await build_raw_file({"type": "pdf", "uri": f"file://{f}", "ext": "pdf"})
+    assert rf.extension == "pdf"
+
+
+async def test_build_raw_file_missing_file_raises(tmp_path: Path) -> None:
+    with pytest.raises(ValueError):
+        await build_raw_file({"type": "pdf", "uri": f"file://{tmp_path}/nope.pdf"})
+
+
+async def test_build_raw_file_oversize_raises(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    f = tmp_path / "big.html"
+    f.write_bytes(b"x" * 100)
+    monkeypatch.setenv("EVEROS_MULTIMODAL__FILE_URI_MAX_BYTES", "10")
+    load_settings.cache_clear()
+    with pytest.raises(ValueError, match="too large"):
+        await build_raw_file({"type": "html", "uri": f"file://{f}"})
+
+
+async def test_build_raw_file_outside_allowlist_raises(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    f = tmp_path / "secret.html"
+    f.write_bytes(b"<html></html>")
+    monkeypatch.setenv("EVEROS_MULTIMODAL__FILE_URI_ALLOW_DIRS", '["/some/other/root"]')
+    load_settings.cache_clear()
+    with pytest.raises(ValueError, match="outside the allowed roots"):
+        await build_raw_file({"type": "html", "uri": f"file://{f}"})
+
+
+async def test_build_raw_file_inside_allowlist_ok(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    f = tmp_path / "ok.html"
+    f.write_bytes(b"<html>ok</html>")
+    monkeypatch.setenv("EVEROS_MULTIMODAL__FILE_URI_ALLOW_DIRS", f'["{tmp_path}"]')
+    load_settings.cache_clear()
+    rf = await build_raw_file({"type": "html", "uri": f"file://{f}"})
+    assert rf.content == b"<html>ok</html>"
--- a/tests/unit/test_memory/test_extract/test_pipeline/__init__.py
+++ b/tests/unit/test_memory/test_extract/test_pipeline/__init__.py
--- a/tests/unit/test_memory/test_extract/test_pipeline/test_agent_memory.py
+++ b/tests/unit/test_memory/test_extract/test_pipeline/test_agent_memory.py
@ -0,0 +1,61 @@
+"""``AgentMemoryPipeline.run`` — empty short-circuit + per-cell event emit."""
+
+from __future__ import annotations
+
+from everalgo.types import ChatMessage, MemCell
+
+from everos.memory import IngestResult
+from everos.memory.events import AgentPipelineStarted
+from everos.memory.extract.pipeline.agent_memory import AgentMemoryPipeline
+
+
+class _FakeEngine:
+    """Engine double that captures emitted events.
+
+    ``emit`` is async so it mirrors the ``OfflineEngine.emit`` signature.
+    """
+
+    def __init__(self) -> None:
+        # Events in the order they were emitted.
+        self.events: list[AgentPipelineStarted] = []
+
+    async def emit(self, event: AgentPipelineStarted) -> None:
+        """Record ``event`` in ``events``."""
+        self.events.append(event)
+
+
+def _make_cell(n_items: int, ts: int = 1_700_000_000_000) -> MemCell:
+    items = [
+        ChatMessage(
+            id=f"m{i}",
+            role="user",
+            sender_id="u1",
+            sender_name="u",
+            content="hi",
+            timestamp=ts,
+        )
+        for i in range(n_items)
+    ]
+    return MemCell(items=items, timestamp=ts)
+
+
+async def test_empty_cells_short_circuit() -> None:
+    engine = _FakeEngine()
+    pipeline = AgentMemoryPipeline(engine)  # type: ignore[arg-type]
+    ingested = IngestResult(session_id="s1", messages=[])
+    out = await pipeline.run(ingested, cells=[], memcell_ids=[])
+    assert out.track == "agent_memory"
+    assert out.status == "accumulated"
+    assert out.message_count == 0
+    assert engine.events == []
+
+
+async def test_emits_one_event_per_cell() -> None:
+    engine = _FakeEngine()
+    pipeline = AgentMemoryPipeline(engine)  # type: ignore[arg-type]
+    ingested = IngestResult(session_id="s1", messages=[])
+    cells = [_make_cell(n_items=2), _make_cell(n_items=3)]
+    memcell_ids = ["mc_a", "mc_b"]
+    out = await pipeline.run(ingested, cells=cells, memcell_ids=memcell_ids)
+
+    assert out.track == "agent_memory"
+    assert out.status == "extracted"
+    assert out.message_count == 5  # 2 + 3
+    assert [e.memcell_id for e in engine.events] == ["mc_a", "mc_b"]
+    assert all(e.session_id == "s1" for e in engine.events)
+    assert all(isinstance(e, AgentPipelineStarted) for e in engine.events)
--- a/tests/unit/test_memory/test_extract/test_pipeline/test_user_memory_emits.py
+++ b/tests/unit/test_memory/test_extract/test_pipeline/test_user_memory_emits.py
@ -0,0 +1,123 @@
+from __future__ import annotations
+
+import datetime as _dt
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from everalgo.types import ChatMessage, MemCell
+from everalgo.types import Episode as AlgoEpisode
+
+from everos.core.persistence import EntryId
+from everos.memory import IngestResult
+from everos.memory.events import EpisodeExtracted, UserPipelineStarted
+from everos.memory.extract.pipeline.user_memory import UserMemoryPipeline
+from everos.memory.models import CanonicalMessage
+
+
+def _sample_memcell() -> MemCell:
+    return MemCell(
+        items=[
+            ChatMessage(
+                id="m1",
+                role="user",
+                content="hello",
+                timestamp=1_700_000_000_000,
+                sender_id="u1",
+            ),
+        ],
+        timestamp=1_700_000_000_000,
+    )
+
+
+class _CapturingEngine:
+    """Engine double that records every emitted event in ``emitted``."""
+
+    def __init__(self) -> None:
+        # Events in emission order; typed ``object`` so any event kind fits.
+        self.emitted: list[object] = []
+
+    async def emit(self, event: object) -> None:
+        """Append ``event`` to ``emitted``."""
+        self.emitted.append(event)
+
+
+async def test_emit_pipeline_started_routes_through_engine() -> None:
+    engine = _CapturingEngine()
+    pipeline = UserMemoryPipeline(
+        episode_writer=MagicMock(),
+        prompt_loader=MagicMock(),
+        llm_client=MagicMock(),
+        engine=engine,
+    )
+
+    cell = _sample_memcell()
+    await pipeline._emit_pipeline_started(  # noqa: SLF001 — test introspection
+        memcell_id="mc_a",
+        session_id="s1",
+        app_id="claude_code",
+        project_id="oss",
+        cell=cell,
+    )
+
+    started = [e for e in engine.emitted if isinstance(e, UserPipelineStarted)]
+    assert len(started) == 1
+    assert started[0].memcell_id == "mc_a"
+    assert started[0].session_id == "s1"
+    assert started[0].app_id == "claude_code"
+    assert started[0].project_id == "oss"
+    assert started[0].memcell is cell
+
+
+@pytest.mark.asyncio
+async def test_emit_episode_extracted_after_md_write() -> None:
+    """Each per-sender Episode write emits EpisodeExtracted with the md entry id."""
+    engine = _CapturingEngine()
+    episode_writer = MagicMock()
+    episode_writer.append_entry = AsyncMock(
+        return_value=EntryId(prefix="ep", date=_dt.date(2026, 5, 17), seq=1)
+    )
+    episode_writer.path_for = MagicMock(
+        return_value="users/u1/episodes/episode-2026-05-17.md"
+    )
+    prompt_loader = MagicMock()
+    prompt_loader.load = MagicMock(return_value="<prompt>")
+    llm_client = MagicMock()
+
+    pipeline = UserMemoryPipeline(
+        episode_writer=episode_writer,
+        prompt_loader=prompt_loader,
+        llm_client=llm_client,
+        engine=engine,
+    )
+
+    cell = _sample_memcell()
+    ingested = IngestResult(
+        session_id="s1",
+        messages=[
+            CanonicalMessage(
+                message_id="m1",
+                session_id="s1",
+                sender_id="u1",
+                role="user",
+                timestamp=_dt.datetime.fromtimestamp(1_700_000_000, tz=_dt.UTC),
+                text="hello",
+            )
+        ],
+    )
+    algo_ep = AlgoEpisode(
+        owner_id="u1", episode="they said hello", timestamp=1_700_000_000_000
+    )
+    with patch.object(  # noqa: SLF001
+        pipeline._ep_ext, "aextract", new=AsyncMock(return_value=algo_ep)
+    ):
+        outcome = await pipeline.run(
+            ingested=ingested,
+            cells=[cell],
+            memcell_ids=["mc_a"],
+            per_cell_all_senders=[["u1"]],
+        )
+
+    assert outcome.status == "extracted"
+    extracted = [e for e in engine.emitted if isinstance(e, EpisodeExtracted)]
+    assert len(extracted) == 1
+    assert extracted[0].memcell_id == "mc_a"
+    assert extracted[0].episode_entry_id == "ep_20260517_00000001"
+    assert extracted[0].episode_text == "they said hello"
+    assert extracted[0].episode_timestamp_ms == 1_700_000_000_000
+    assert extracted[0].owner_id == "u1"