chore: initialize EverOS 1.0.0

An md-first memory extraction framework for AI agents. Markdown is the
single source of truth; SQLite holds state and LanceDB provides the
rebuildable vector + BM25 + scalar index.

The codebase follows a single-direction DDD layering (entrypoints ->
service -> memory -> infra, with component / core / config cross-cutting),
enforced by import-linter.

Engineering surface:

- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and guards against
  OpenAPI drift.
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions
--- a/src/everos/infra/persistence/sqlite/tables/__init__.py
+++ b/src/everos/infra/persistence/sqlite/tables/__init__.py
@ -0,0 +1,24 @@
+"""Business SQLModel table schemas.
+
+Each business table lives in its own module here (e.g. ``memcell.py``,
+``unprocessed_buffer.py``). The package ``__init__`` re-exports them so
+``SQLModel.metadata.create_all`` (run by
+:class:`everos.core.lifespan.SqliteLifespanProvider` at startup) sees
+every registered table.
+"""
+
+from .cluster import Cluster as Cluster
+from .cluster import ClusterMember as ClusterMember
+from .conversation_status import ConversationStatus as ConversationStatus
+from .md_change_state import MdChangeState as MdChangeState
+from .memcell import Memcell as Memcell
+from .unprocessed_buffer import UnprocessedBuffer as UnprocessedBuffer
+
+__all__ = [  # public API: the six table classes re-exported above
+    "Cluster",
+    "ClusterMember",
+    "ConversationStatus",
+    "MdChangeState",
+    "Memcell",
+    "UnprocessedBuffer",
+]
--- a/src/everos/infra/persistence/sqlite/tables/cluster.py
+++ b/src/everos/infra/persistence/sqlite/tables/cluster.py
@ -0,0 +1,99 @@
+"""``cluster`` — persisted snapshot of one ``everalgo.clustering.Cluster``.
+
+Mirrors the algo-side frozen value object (centroid + count + last_ts +
+preview) plus everos engineering metadata (``owner_id`` / ``owner_type``
+/ ``kind``) so a single SQLite table can hold both the user-memory cluster
+track (episode embeddings) and the agent-case cluster track (task_intent
+embeddings). The ``members`` field on the algo type is persisted in the
+sibling :class:`ClusterMember` table to keep the relation queryable from
+both directions (forward by ``cluster_id``, reverse by ``(member_type,
+member_id)``).
+"""
+
+from __future__ import annotations
+
+from sqlalchemy import Index, LargeBinary
+
+from everos.component.utils.datetime import UtcDatetime
+from everos.core.persistence.sqlite import BaseTable, Field
+from everos.core.persistence.sqlite.base import UtcDateTimeColumn
+
+
+class Cluster(BaseTable, table=True):
+    """One row per cluster. PK ``cluster_id`` (``cl_<12hex>``)."""
+
+    __tablename__ = "cluster"  # type: ignore[assignment]
+    __table_args__ = (
+        # List all clusters for one (app, project, owner, kind) on each strategy
+        # invocation; scope-first composite so clustering never mixes spaces.
+        Index("ix_cluster_owner_kind", "app_id", "project_id", "owner_id", "kind"),
+    )
+
+    cluster_id: str = Field(primary_key=True)  # NOTE(review): Memcell doc says uuid4
+    """Caller-minted opaque id (algo type carries it through verbatim).
+    Format: ``cl_<12 hex chars>`` to mirror :func:`memcell._mint_memcell_id`."""
+
+    app_id: str = Field(default="default")  # covered by the docstring under project_id
+    project_id: str = Field(default="default")
+    """App / project scope segments. The aggregation key is
+    ``(app_id, project_id, owner_id, kind)`` so a cluster set never spans
+    two spaces."""
+
+    owner_id: str = Field(index=True)  # also 3rd column of ix_cluster_owner_kind
+    """``user_id`` (kind=``user_memory``) or ``agent_id`` (kind=``agent_case``)."""
+
+    owner_type: str  # no default declared
+    """``"user"`` or ``"agent"`` — redundant with ``kind`` today but kept
+    explicit so future kinds (e.g. tenant-level) can plug in without a
+    schema change."""
+
+    kind: str  # 4th (last) column of ix_cluster_owner_kind
+    """``"user_memory"`` (episode-vector cluster, drives profile extraction)
+    or ``"agent_case"`` (task_intent-vector cluster, drives skill extraction)."""
+
+    centroid_blob: bytes = Field(sa_type=LargeBinary)
+    """``np.float32`` centroid serialised via ``ndarray.tobytes()``. The
+    repo round-trips bytes ↔ ``np.ndarray`` so callers see the algo type."""
+
+    count: int
+    """Number of members merged into this cluster (algo-maintained)."""
+
+    last_ts_ms: int
+    """Most recent member's timestamp as Unix epoch milliseconds — matches
+    :attr:`everalgo.clustering.Cluster.last_ts` exactly so no lossy
+    datetime ↔ int conversion is needed across the storage boundary."""
+
+    preview_json: str
+    """JSON-encoded ``list[str]`` — short text samples used by
+    :func:`cluster_by_llm` ranking. Repo round-trips JSON ↔ list."""
+
+
+class ClusterMember(BaseTable, table=True):
+    """One row per (cluster, entity) link.
+
+    Forward lookup (``cluster_id → list[member_id]``) is the algo-side
+    ``Cluster.members`` view. Reverse lookup (``(member_type, member_id)
+    → cluster_id``) is served by the composite index below — needed when
+    a downstream consumer holds an entity id and wants its cluster.
+
+    ``member_type`` is informational on the row (the parent ``Cluster.kind``
+    already disambiguates), but kept explicit so the reverse index can be
+    a single composite (member_type, member_id) without joining back.
+    """
+
+    __tablename__ = "cluster_member"  # type: ignore[assignment]
+    __table_args__ = (Index("ix_cluster_member_reverse", "member_type", "member_id"),)
+
+    cluster_id: str = Field(primary_key=True, foreign_key="cluster.cluster_id")
+    """Parent cluster id."""
+
+    member_id: str = Field(primary_key=True)  # composite PK together with cluster_id
+    """``memcell_id`` (member_type=``memcell``) or md entry_id
+    (member_type=``case``) — the entity grouped into this cluster."""
+
+    member_type: str  # leading column of ix_cluster_member_reverse
+    """``"memcell"`` or ``"case"``. Echoes the parent cluster's ``kind``
+    domain but kept on the row so the reverse index is self-contained."""
+
+    added_ts: UtcDatetime = Field(sa_type=UtcDateTimeColumn)  # no default declared
+    """When this entity was first attached to the cluster."""
--- a/src/everos/infra/persistence/sqlite/tables/conversation_status.py
+++ b/src/everos/infra/persistence/sqlite/tables/conversation_status.py
@ -0,0 +1,38 @@
+"""``conversation_status`` — window pointer per (app, project, session, track).
+
+The window pointer is scoped by ``app_id`` / ``project_id`` so the same
+``session_id`` may recur in different spaces without colliding; those two
+segments lead the composite ``UniqueConstraint``.
+"""
+
+from __future__ import annotations
+
+from sqlalchemy import UniqueConstraint
+
+from everos.component.utils.datetime import UtcDatetime
+from everos.core.persistence.sqlite import BaseTable, Field
+from everos.core.persistence.sqlite.base import UtcDateTimeColumn
+
+
+class ConversationStatus(BaseTable, table=True):
+    """One row per (app, project, session, track). Tracks latest msg / memcell ts."""
+
+    __tablename__ = "conversation_status"  # type: ignore[assignment]
+    __table_args__ = (
+        UniqueConstraint(
+            "app_id",
+            "project_id",
+            "session_id",
+            "track",
+            name="uq_conversation_status_session_track",
+        ),
+    )
+
+    id: int | None = Field(default=None, primary_key=True)  # surrogate primary key
+    app_id: str = Field(default="default")  # 1st column of the unique constraint
+    project_id: str = Field(default="default")
+    """App / project scope segments (default ``"default"``)."""
+    session_id: str = Field(index=True)  # also 3rd column of the unique constraint
+    track: str  # 4th (last) column of the unique constraint
+    last_message_ts: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
+    last_memcell_ts: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
--- a/src/everos/infra/persistence/sqlite/tables/md_change_state.py
+++ b/src/everos/infra/persistence/sqlite/tables/md_change_state.py
@ -0,0 +1,119 @@
+"""``md_change_state`` — cascade work queue.
+
+One row per markdown path. Both watcher (real-time fsevents) and
+scanner (periodic sweep) UPSERT into this table; the worker consumes
+``pending`` rows in ``lsn`` order, transitions them through an
+internal ``processing`` claim state, and lands them in ``done`` or
+``failed`` (with a ``retryable`` flag).
+
+Schema sourced from ``12_cascade_design.md`` §4.1 + decisions DD-3 …
+DD-12; the four indexes below are required by ``13_cascade_design.md``
+§7 status / fix queries.
+"""
+
+from __future__ import annotations
+
+from sqlalchemy import Index, text
+
+from everos.component.utils.datetime import UtcDatetime, get_utc_now
+from everos.core.persistence.sqlite import BaseTable, Field
+from everos.core.persistence.sqlite.base import UtcDateTimeColumn
+
+
+class MdChangeState(BaseTable, table=True):
+    """One row per markdown path; UPSERT-driven work queue for cascade.
+
+    The public state machine is the 3-tuple ``pending`` / ``done`` /
+    ``failed`` (12 doc §6). ``processing`` is an internal claim state
+    used by :meth:`MdChangeStateRepo.claim_one` and rolled back into
+    ``pending`` for CLI / status output (16 doc §4.2 — DD-12 keeps the
+    public surface clean).
+    """
+
+    __tablename__ = "md_change_state"  # type: ignore[assignment]
+    __table_args__ = (
+        # Worker scans pending rows in lsn order — partial index drops
+        # done/failed rows from the b-tree and keeps it tight.
+        Index(
+            "idx_md_change_pending",
+            "status",
+            "lsn",
+            sqlite_where=text("status = 'pending'"),
+        ),
+        # `cascade fix --apply` only ever touches failed + retryable=TRUE
+        # rows — partial index makes that pass essentially O(retryable).
+        Index(
+            "idx_md_change_retryable",
+            "status",
+            "retryable",
+            sqlite_where=text("status = 'failed' AND retryable = 1"),
+        ),
+        # Scanner reverse-reconcile (disk → state) compares mtime.
+        Index("idx_md_change_mtime", "mtime"),
+        # `cascade status` aggregates by kind. NOTE(review): field also has index=True.
+        Index("idx_md_change_kind", "kind"),
+    )
+
+    md_path: str = Field(primary_key=True)  # one row per path, so the path is the PK
+    """Path relative to the memory-root (e.g. ``users/u_jason/
+    episodes/episode-2026-05-12.md``). Every reverse-link anchors here."""
+
+    kind: str = Field(nullable=False, index=True)
+    """Kind registry name (e.g. ``"episode"``); worker dispatches the
+    matching handler."""
+
+    change_type: str = Field(nullable=False)
+    """``"added"`` | ``"modified"`` | ``"deleted"``. A hint for the
+    worker — handler re-derives truth from the actual file state."""
+
+    mtime: float = Field(default=0.0, nullable=False)
+    """File mtime captured when the row was last UPSERTed. Scanner
+    compares this against the on-disk mtime to identify dirty paths."""
+
+    first_seen_at: UtcDatetime = Field(
+        default_factory=get_utc_now, sa_type=UtcDateTimeColumn
+    )
+    """When the path was first enqueued."""
+
+    last_changed_at: UtcDatetime = Field(
+        default_factory=get_utc_now, sa_type=UtcDateTimeColumn
+    )
+    """Most recent UPSERT timestamp (re-stamped on every re-enqueue)."""
+
+    lsn: int = Field(nullable=False, index=True)  # no default declared
+    """Global monotonic sequence (``MAX(lsn) + 1`` per UPSERT). Worker
+    processes pending rows in ascending lsn order; the gap between
+    ``MAX(lsn)`` and the last processed lsn is the queue lag."""
+
+    status: str = Field(default="pending", nullable=False, index=True)
+    """Lifecycle:
+
+    - ``"pending"`` — waiting for the worker.
+    - ``"processing"`` — claimed by a worker (internal; CLI rolls into
+      pending for display).
+    - ``"done"`` — handler completed successfully.
+    - ``"failed"`` — handler exhausted retries or hit an
+      unrecoverable error (see :attr:`retryable`).
+    """
+
+    retryable: bool | None = Field(default=None)  # tri-state: True / False / None
+    """Meaningful only when ``status='failed'``.
+
+    - ``TRUE`` — RecoverableError exhausted MAX_RETRY; ``cascade fix
+      --apply`` will re-enqueue this row (pending, retry_count reset).
+    - ``FALSE`` — UnrecoverableError (malformed YAML, schema error
+      etc.); requires editing the md and re-saving.
+    - ``NULL`` — not a failed row (pending / processing / done).
+    """
+
+    last_attempt_at: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
+    """Timestamp of the most recent worker attempt (success or
+    failure)."""
+
+    retry_count: int = Field(default=0, nullable=False)
+    """Number of retries the worker has *actually issued* (the first
+    attempt does not count). Reaches MAX_RETRY (default 3) before the
+    row transitions to ``failed`` with ``retryable=TRUE``."""
+
+    error: str | None = Field(default=None)
+    """Most recent failure message (truncated upstream if needed)."""
--- a/src/everos/infra/persistence/sqlite/tables/memcell.py
+++ b/src/everos/infra/persistence/sqlite/tables/memcell.py
@ -0,0 +1,55 @@
+"""``memcell`` — metadata + payload archive for boundary-detected MemCells.
+
+Holds ``message_ids_json`` / ``sender_ids_json`` (JSON arrays of audit
+ids) plus ``payload_json`` — the full :class:`everalgo.types.MemCell`
+serialised via ``model_dump_json``. The payload is what
+``unprocessed_buffer`` cannot keep (boundary's delete-then-insert clears
+the staging slice once messages fold into a cell): downstream offline
+strategies that need the raw chat messages (e.g. profile extraction)
+deserialise the payload back into an algo ``MemCell``. Episode markdown
+still carries the LLM-synthesised narrative; ``payload_json`` is the
+chat-stream archive that narrative was distilled from.
+"""
+
+from __future__ import annotations
+
+from sqlalchemy import Index
+
+from everos.component.utils.datetime import UtcDatetime
+from everos.core.persistence.sqlite import BaseTable, Field
+from everos.core.persistence.sqlite.base import UtcDateTimeColumn
+
+
+class Memcell(BaseTable, table=True):
+    """One row per MemCell. PK ``memcell_id`` (uuid4)."""
+
+    __tablename__ = "memcell"  # type: ignore[assignment]
+    __table_args__ = (
+        # Scope-first composite: app/project partition the lookup before the
+        # session window so cross-(app, project) rows never share an index slot.
+        Index(
+            "ix_memcell_session",
+            "app_id",
+            "project_id",
+            "session_id",
+            "track",
+            "timestamp",
+        ),
+    )
+
+    memcell_id: str = Field(primary_key=True)
+    app_id: str = Field(default="default")  # covered by the docstring under project_id
+    project_id: str = Field(default="default")
+    """App / project scope segments. Default to ``"default"`` so the column is
+    always populated; callers in a non-default space pass real ids."""
+    session_id: str = Field(index=True)  # also 3rd column of ix_memcell_session
+    track: str  # 4th column of ix_memcell_session
+    raw_type: str  # NOTE(review): allowed values are not documented here
+    message_ids_json: str  # JSON array of ids, held as a str
+    sender_ids_json: str  # JSON array of ids, held as a str
+    payload_json: str
+    """``MemCell.model_dump_json()`` — the full algo-side MemCell (items =
+    chat messages / tool calls) serialised at boundary time so offline
+    strategies can deserialise it back into an algo MemCell long after
+    ``unprocessed_buffer`` has dropped the staging rows."""
+    timestamp: UtcDatetime = Field(sa_type=UtcDateTimeColumn)  # last index column
--- a/src/everos/infra/persistence/sqlite/tables/unprocessed_buffer.py
+++ b/src/everos/infra/persistence/sqlite/tables/unprocessed_buffer.py
@ -0,0 +1,52 @@
+"""``unprocessed_buffer`` — chat-stream messages waiting on boundary detection.
+
+Schema property: presence in the table = pending; absence = consumed.
+There is no ``consumed`` column. Pipeline uses ``replace(session, track,
+remaining)`` to atomically rewrite the (session, track) slice each turn.
+"""
+
+from __future__ import annotations
+
+from sqlalchemy import Index
+
+from everos.component.utils.datetime import UtcDatetime
+from everos.core.persistence.sqlite import BaseTable, Field
+from everos.core.persistence.sqlite.base import UtcDateTimeColumn
+
+
+class UnprocessedBuffer(BaseTable, table=True):
+    """One row per unprocessed message. PK ``message_id``."""
+
+    __tablename__ = "unprocessed_buffer"  # type: ignore[assignment]
+    __table_args__ = (
+        # Scope-first composite: app/project partition the (session, track)
+        # staging slice so different spaces never share a buffer window.
+        Index(
+            "ix_unprocessed_buffer_lookup",
+            "app_id",
+            "project_id",
+            "session_id",
+            "track",
+            "timestamp",
+        ),
+    )
+
+    message_id: str = Field(primary_key=True)
+    app_id: str = Field(default="default")  # covered by the docstring under project_id
+    project_id: str = Field(default="default")
+    """App / project scope segments (default ``"default"``)."""
+    session_id: str = Field(index=True)  # also in ix_unprocessed_buffer_lookup
+    track: str = Field(index=True)  # also in ix_unprocessed_buffer_lookup
+    sender_id: str
+    sender_name: str | None = None  # optional; defaults to None
+    role: str
+    timestamp: UtcDatetime = Field(sa_type=UtcDateTimeColumn)
+    # JSON-serialised raw ContentItem list (mirrors src_old
+    # RawMessage.content_items). Keeps the original multimodal payload
+    # available so a future parser can reach back to image / audio / etc.
+    content_items_json: str
+    # Derived plain-text concatenation of ``type=text`` entries — what
+    # downstream LLM-facing extractors and md writer consume today.
+    text: str
+    tool_calls_json: str | None = None  # optional; presumably JSON text — TODO confirm
+    tool_call_id: str | None = None  # optional; defaults to None