chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
Elliot Chen
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions

View File

@ -0,0 +1,24 @@
"""Business SQLModel table schemas.
Each business table lives in its own module here (e.g. ``memcell.py``,
``unprocessed_buffer.py``). The package ``__init__`` re-exports them so
``SQLModel.metadata.create_all`` (run by
:class:`everos.core.lifespan.SqliteLifespanProvider` at startup) sees
every registered table.
"""
from .cluster import Cluster as Cluster
from .cluster import ClusterMember as ClusterMember
from .conversation_status import ConversationStatus as ConversationStatus
from .md_change_state import MdChangeState as MdChangeState
from .memcell import Memcell as Memcell
from .unprocessed_buffer import UnprocessedBuffer as UnprocessedBuffer
# Public surface of the package: one entry per table class re-exported by the
# imports above. Keep the two lists in sync when a table module is added.
__all__ = [
    "Cluster",
    "ClusterMember",
    "ConversationStatus",
    "MdChangeState",
    "Memcell",
    "UnprocessedBuffer",
]

View File

@ -0,0 +1,99 @@
"""``cluster`` — persisted snapshot of one ``everalgo.clustering.Cluster``.
Mirrors the algo-side frozen value object (centroid + count + last_ts +
preview) plus everos engineering metadata (``owner_id`` / ``owner_type``
/ ``kind``) so a single SQLite table can hold both the user-memory cluster
track (episode embeddings) and the agent-case cluster track (task_intent
embeddings). The ``members`` field on the algo type is persisted in the
sibling :class:`ClusterMember` table to keep the relation queryable from
both directions (forward by ``cluster_id``, reverse by ``(member_type,
member_id)``).
"""
from __future__ import annotations
from sqlalchemy import Index, LargeBinary
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class Cluster(BaseTable, table=True):
    """Persisted cluster snapshot — one row per cluster.

    The primary key is ``cluster_id`` (``cl_<12hex>``). Each row stores the
    algo-side snapshot fields (centroid, count, last timestamp, preview)
    next to the everos scope / ownership columns (``app_id``,
    ``project_id``, ``owner_id``, ``owner_type``, ``kind``).
    """

    __tablename__ = "cluster"  # type: ignore[assignment]
    __table_args__ = (
        # Every strategy invocation lists the clusters of exactly one
        # (app, project, owner, kind). The composite leads with the scope
        # columns so clustering never mixes spaces.
        Index(
            "ix_cluster_owner_kind",
            "app_id",
            "project_id",
            "owner_id",
            "kind",
        ),
    )

    cluster_id: str = Field(primary_key=True)
    """Caller-minted opaque id (algo type carries it through verbatim).
    Format: ``cl_<12 hex chars>`` to mirror :func:`memcell._mint_memcell_id`."""

    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """App / project scope segments. The aggregation key is
    ``(app_id, project_id, owner_id, kind)`` so a cluster set never spans
    two spaces."""

    owner_id: str = Field(index=True)
    """``user_id`` (kind=``user_memory``) or ``agent_id`` (kind=``agent_case``)."""

    owner_type: str
    """``"user"`` or ``"agent"`` — redundant with ``kind`` today but kept
    explicit so future kinds (e.g. tenant-level) can plug in without a
    schema change."""

    kind: str
    """``"user_memory"`` (episode-vector cluster, drives profile extraction)
    or ``"agent_case"`` (task_intent-vector cluster, drives skill extraction)."""

    centroid_blob: bytes = Field(sa_type=LargeBinary)
    """``np.float32`` centroid serialised via ``ndarray.tobytes()``. The
    repo round-trips bytes ↔ ``np.ndarray`` so callers see the algo type."""

    count: int
    """Number of members merged into this cluster (algo-maintained)."""

    last_ts_ms: int
    """Most recent member's timestamp as Unix epoch milliseconds — matches
    :attr:`everalgo.clustering.Cluster.last_ts` exactly so no lossy
    datetime ↔ int conversion is needed across the storage boundary."""

    preview_json: str
    """JSON-encoded ``list[str]`` — short text samples used by
    :func:`cluster_by_llm` ranking. Repo round-trips JSON ↔ list."""
class ClusterMember(BaseTable, table=True):
    """Link row joining one cluster to one grouped entity.

    The composite primary key is ``(cluster_id, member_id)``.

    * Forward lookup (``cluster_id → list[member_id]``) reproduces the
      algo-side ``Cluster.members`` view.
    * Reverse lookup (``(member_type, member_id) → cluster_id``) is served
      by the composite index declared below, for consumers that hold an
      entity id and need its cluster.

    ``member_type`` is informational (the parent ``Cluster.kind`` already
    disambiguates) but stays on the row so the reverse index is a single
    composite ``(member_type, member_id)`` with no join back to the parent.
    """

    __tablename__ = "cluster_member"  # type: ignore[assignment]
    __table_args__ = (
        # Reverse-lookup index: entity → owning cluster.
        Index(
            "ix_cluster_member_reverse",
            "member_type",
            "member_id",
        ),
    )

    cluster_id: str = Field(primary_key=True, foreign_key="cluster.cluster_id")
    """Parent cluster id."""

    member_id: str = Field(primary_key=True)
    """``memcell_id`` (member_type=``memcell``) or md entry_id
    (member_type=``case``) — the entity grouped into this cluster."""

    member_type: str
    """``"memcell"`` or ``"case"``. Echoes the parent cluster's ``kind``
    domain but kept on the row so the reverse index is self-contained."""

    added_ts: UtcDatetime = Field(sa_type=UtcDateTimeColumn)
    """When this entity was first attached to the cluster."""

View File

@ -0,0 +1,38 @@
"""``conversation_status`` — window pointer per (app, project, session, track).
The window pointer is scoped by ``app_id`` / ``project_id`` so the same
``session_id`` may recur in different spaces without colliding; those two
segments lead the composite ``UniqueConstraint``.
"""
from __future__ import annotations
from sqlalchemy import UniqueConstraint
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class ConversationStatus(BaseTable, table=True):
    """Window pointer: one row per (app, project, session, track).

    Records the latest message timestamp and the latest memcell timestamp
    for that scope. Uniqueness of the 4-tuple is enforced by the
    ``uq_conversation_status_session_track`` constraint.
    """

    __tablename__ = "conversation_status"  # type: ignore[assignment]
    __table_args__ = (
        # Scope segments lead the constraint so the same ``session_id`` can
        # recur in different (app, project) spaces without colliding.
        UniqueConstraint(
            "app_id",
            "project_id",
            "session_id",
            "track",
            name="uq_conversation_status_session_track",
        ),
    )

    # Surrogate integer key; ``None`` until the database assigns one.
    id: int | None = Field(default=None, primary_key=True)

    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """App / project scope segments (default ``"default"``)."""

    # Session / track half of the unique 4-tuple.
    session_id: str = Field(index=True)
    track: str

    # Latest message / memcell timestamps for this pointer. Both default to
    # ``None`` (no value recorded yet).
    last_message_ts: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
    last_memcell_ts: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)

View File

@ -0,0 +1,119 @@
"""``md_change_state`` — cascade work queue.
One row per markdown path. Both watcher (real-time fsevents) and
scanner (periodic sweep) UPSERT into this table; the worker consumes
``pending`` rows in ``lsn`` order, transitions them through an
internal ``processing`` claim state, and lands them in ``done`` or
``failed`` (with a ``retryable`` flag).
Schema sourced from ``12_cascade_design.md`` §4.1 + decisions DD-3 …
DD-12; the four indexes below are required by ``13_cascade_design.md``
§7 status / fix queries.
"""
from __future__ import annotations
from sqlalchemy import Index, text
from everos.component.utils.datetime import UtcDatetime, get_utc_now
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class MdChangeState(BaseTable, table=True):
    """One row per markdown path; UPSERT-driven work queue for cascade.

    The public state machine is the 3-tuple ``pending`` / ``done`` /
    ``failed`` (12 doc §6). ``processing`` is an internal claim state
    used by :meth:`MdChangeStateRepo.claim_one` and rolled back into
    ``pending`` for CLI / status output (16 doc §4.2 — DD-12 keeps the
    public surface clean).

    Index layout: the four named indexes in ``__table_args__`` are the
    ones the design docs call for. ``status`` and ``lsn`` additionally
    carry a plain single-column index via ``Field(index=True)``; ``kind``
    does not, because that would duplicate ``idx_md_change_kind``.
    """

    __tablename__ = "md_change_state"  # type: ignore[assignment]
    __table_args__ = (
        # Worker scans pending rows in lsn order — partial index drops
        # done/failed rows from the b-tree and keeps it tight.
        Index(
            "idx_md_change_pending",
            "status",
            "lsn",
            sqlite_where=text("status = 'pending'"),
        ),
        # `cascade fix --apply` only ever touches failed + retryable=TRUE
        # rows — partial index makes that pass essentially O(retryable).
        Index(
            "idx_md_change_retryable",
            "status",
            "retryable",
            sqlite_where=text("status = 'failed' AND retryable = 1"),
        ),
        # Scanner reverse-reconcile (disk → state) compares mtime.
        Index("idx_md_change_mtime", "mtime"),
        # `cascade status` aggregates by kind.
        Index("idx_md_change_kind", "kind"),
    )

    md_path: str = Field(primary_key=True)
    """Path relative to the memory-root (e.g. ``users/u_jason/
    episodes/episode-2026-05-12.md``). Every reverse-link anchors here."""

    # No ``index=True`` here: ``idx_md_change_kind`` in ``__table_args__``
    # already indexes this column. Declaring both would create a second,
    # identical single-column index that only adds write cost.
    kind: str = Field(nullable=False)
    """Kind registry name (e.g. ``"episode"``); worker dispatches the
    matching handler."""

    change_type: str = Field(nullable=False)
    """``"added"`` | ``"modified"`` | ``"deleted"``. A hint for the
    worker — handler re-derives truth from the actual file state."""

    mtime: float = Field(default=0.0, nullable=False)
    """File mtime captured when the row was last UPSERTed. Scanner
    compares this against the on-disk mtime to identify dirty paths."""

    first_seen_at: UtcDatetime = Field(
        default_factory=get_utc_now, sa_type=UtcDateTimeColumn
    )
    """When the path was first enqueued."""

    last_changed_at: UtcDatetime = Field(
        default_factory=get_utc_now, sa_type=UtcDateTimeColumn
    )
    """Most recent UPSERT timestamp (re-stamped on every re-enqueue)."""

    lsn: int = Field(nullable=False, index=True)
    """Global monotonic sequence (``MAX(lsn) + 1`` per UPSERT). Worker
    processes pending rows in ascending lsn order; the gap between
    ``MAX(lsn)`` and the last processed lsn is the queue lag."""

    status: str = Field(default="pending", nullable=False, index=True)
    """Lifecycle:
    - ``"pending"`` — waiting for the worker.
    - ``"processing"`` — claimed by a worker (internal; CLI rolls into
      pending for display).
    - ``"done"`` — handler completed successfully.
    - ``"failed"`` — handler exhausted retries or hit an
      unrecoverable error (see :attr:`retryable`).
    """

    retryable: bool | None = Field(default=None)
    """Meaningful only when ``status='failed'``.
    - ``TRUE`` — RecoverableError exhausted MAX_RETRY; ``cascade fix
      --apply`` will re-enqueue this row (pending, retry_count reset).
    - ``FALSE`` — UnrecoverableError (malformed YAML, schema error
      etc.); requires editing the md and re-saving.
    - ``NULL`` — not a failed row (pending / processing / done).
    """

    last_attempt_at: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
    """Timestamp of the most recent worker attempt (success or
    failure)."""

    retry_count: int = Field(default=0, nullable=False)
    """Number of retries the worker has *actually issued* (the first
    attempt does not count). Reaches MAX_RETRY (default 3) before the
    row transitions to ``failed`` with ``retryable=TRUE``."""

    error: str | None = Field(default=None)
    """Most recent failure message (truncated upstream if needed)."""

View File

@ -0,0 +1,55 @@
"""``memcell`` — metadata + payload archive for boundary-detected MemCells.
Holds ``message_ids_json`` / ``sender_ids_json`` (JSON arrays of audit
ids) plus ``payload_json`` — the full :class:`everalgo.types.MemCell`
serialised via ``model_dump_json``. The payload is what
``unprocessed_buffer`` cannot keep (boundary's delete-then-insert clears
the staging slice once messages fold into a cell): downstream offline
strategies that need the raw chat messages (e.g. profile extraction)
deserialise the payload back into an algo ``MemCell``. Episode markdown
still carries the LLM-synthesised narrative; ``payload_json`` is the
chat-stream archive that narrative was distilled from.
"""
from __future__ import annotations
from sqlalchemy import Index
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class Memcell(BaseTable, table=True):
    """Metadata + payload row for a single MemCell.

    Primary key is ``memcell_id`` (uuid4).
    """

    # NOTE(review): the ``Cluster.cluster_id`` docs describe a prefixed
    # 12-hex id that "mirrors" ``memcell._mint_memcell_id``. Confirm whether
    # "uuid4" above is still the accurate id format for this table.

    __tablename__ = "memcell"  # type: ignore[assignment]
    __table_args__ = (
        # Scope-first composite: app / project partition the lookup ahead of
        # the session window, so rows from different (app, project) spaces
        # never share an index slot.
        Index(
            "ix_memcell_session",
            "app_id",
            "project_id",
            "session_id",
            "track",
            "timestamp",
        ),
    )

    memcell_id: str = Field(primary_key=True)

    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """App / project scope segments. Default to ``"default"`` so the column is
    always populated; callers in a non-default space pass real ids."""

    session_id: str = Field(index=True)
    track: str
    raw_type: str

    # JSON arrays of audit ids, stored as text.
    message_ids_json: str
    sender_ids_json: str

    payload_json: str
    """``MemCell.model_dump_json()`` — the full algo-side MemCell (items =
    chat messages / tool calls) serialised at boundary time so offline
    strategies can deserialise it back into an algo MemCell long after
    ``unprocessed_buffer`` has dropped the staging rows."""

    timestamp: UtcDatetime = Field(sa_type=UtcDateTimeColumn)

View File

@ -0,0 +1,52 @@
"""``unprocessed_buffer`` — chat-stream messages waiting on boundary detection.
Schema property: presence in the table = pending; absence = consumed.
There is no ``consumed`` column. Pipeline uses ``replace(session, track,
remaining)`` to atomically rewrite the (session, track) slice each turn.
"""
from __future__ import annotations
from sqlalchemy import Index
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class UnprocessedBuffer(BaseTable, table=True):
    """Staging row for one message awaiting boundary detection.

    Primary key is ``message_id``. A row's presence means the message is
    still pending; there is no ``consumed`` column.
    """

    __tablename__ = "unprocessed_buffer"  # type: ignore[assignment]
    __table_args__ = (
        # Scope-first composite: app / project partition the (session, track)
        # staging slice, so different spaces never share a buffer window.
        Index(
            "ix_unprocessed_buffer_lookup",
            "app_id",
            "project_id",
            "session_id",
            "track",
            "timestamp",
        ),
    )

    message_id: str = Field(primary_key=True)

    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """App / project scope segments (default ``"default"``)."""

    session_id: str = Field(index=True)
    track: str = Field(index=True)
    sender_id: str
    sender_name: str | None = None
    role: str
    timestamp: UtcDatetime = Field(sa_type=UtcDateTimeColumn)

    # Raw ContentItem list, JSON-serialised (mirrors src_old
    # RawMessage.content_items). Retains the original multimodal payload so
    # a future parser can reach back to image / audio / etc.
    content_items_json: str

    # Plain text derived by concatenating the ``type=text`` entries — this
    # is what downstream LLM-facing extractors and the md writer consume
    # today.
    text: str

    tool_calls_json: str | None = None
    tool_call_id: str | None = None