chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
Elliot Chen
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions

View File

@ -0,0 +1,35 @@
"""LanceDB table schemas (one ``BaseLanceTable`` subclass per business table).
Schemas live here; cascade-daemon-driven row population is wired
through the matching repo singletons in :mod:`..repos`.
External usage::

    from everos.infra.persistence.lancedb.tables import (
        Episode,
        AtomicFact,
        Foresight,
        AgentCase,
        AgentSkill,
        UserProfile,
        ParentType,
    )
"""
from ._parent_type import ParentType as ParentType
from .agent_case import AgentCase as AgentCase
from .agent_skill import AgentSkill as AgentSkill
from .atomic_fact import AtomicFact as AtomicFact
from .episode import Episode as Episode
from .foresight import Foresight as Foresight
from .user_profile import UserProfile as UserProfile
# Public API of this package: the table schemas and ``ParentType`` imported
# above. Entries are kept in alphabetical order.
__all__ = [
    "AgentCase",
    "AgentSkill",
    "AtomicFact",
    "Episode",
    "Foresight",
    "ParentType",
    "UserProfile",
]

View File

@ -0,0 +1,24 @@
"""``ParentType`` — provenance label for memory records linked back to a source.
Currently the only value is :attr:`ParentType.MEMCELL`: every business row
(episode / foresight / atomic_fact / agent_case) points back to a source
MemCell. The earlier open-source design enumerated ``"episode"`` as an
alternative parent but the production path never wrote that value, so the
new framework collapses the enum to its single in-use member.
Kept as an :class:`enum.StrEnum` (rather than a bare string constant) so that
adding a future parent kind stays a non-breaking enum extension. LanceDB's
pydantic-to-arrow conversion does not accept ``Enum`` field annotations,
so table schemas declare ``parent_type: str = ParentType.MEMCELL.value``
and reference the enum only at the default-value level.
"""
from __future__ import annotations
from enum import StrEnum
class ParentType(StrEnum):
"""Provenance label of a memory record's parent."""
MEMCELL = "memcell"

View File

@ -0,0 +1,84 @@
"""LanceDB ``agent_case`` table schema.
Field set per 17_lancedb_tables_design.md §3.4. Each row records one
task an agent worked on: intent, approach, optional pivotal insight,
and a quality score. A MemCell extracted on the agent's own execution
log yields at most one AgentCase.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
_DIM = 1024  # dimension passed to ``Vector(...)`` for the ``vector`` column


class AgentCase(BaseLanceTable):
    """One agent case indexed in LanceDB.

    Rows live in the ``agent_case`` table. BM25 runs over the two
    pre-tokenised columns named in ``BM25_FIELDS``; the untokenised
    ``task_intent`` / ``approach`` columns carry the display text.
    """

    TABLE_NAME: ClassVar[str] = "agent_case"
    BM25_FIELDS: ClassVar[list[str]] = ["task_intent_tokens", "approach_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>``."""
    entry_id: str
    """md-side seq id ``ac_<YYYYMMDD>_<NNNN>``."""
    owner_id: str
    """The owning ``agent_id``."""
    owner_type: str
    """Fixed ``"agent"`` for this table."""

    # The string after ``project_id`` describes both scope columns.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    # Audit columns: not part of ``content_sha256`` (see its description).
    session_id: str
    timestamp: _dt.datetime

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for agent case."""
    parent_id: str
    """Source memcell id (one memcell ↔ one case)."""

    quality_score: float
    """0.0-1.0; task completion / quality estimate."""

    task_intent: str
    """≤ 50 tokens; original surface form (returned for display)."""
    task_intent_tokens: str
    """App-layer pre-tokenised ``task_intent`` — BM25 main field
    (whitespace tokenizer); display goes through ``task_intent``."""

    approach: str
    """≤ 1000 tokens; step-by-step approach (display)."""
    approach_tokens: str
    """App-layer pre-tokenised ``approach`` — BM25 secondary field
    (whitespace tokenizer). Searched in parallel with
    ``task_intent_tokens`` then merged by max score in the recall
    layer; task_intent typically scores higher because it's the
    retrieval anchor, but approach catches queries that match a step
    detail."""

    key_insight: str | None = None
    """≤ 40 tokens; pivotal strategy shift, optional."""

    md_path: str  # presumably the path of the source md file — confirm
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the md entry — TaskIntent / Approach / KeyInsight sections plus
    the ``quality_score`` inline. Audit inline (owner_id /
    session_id / timestamp / parent_id) is NOT in the hash. See
    :attr:`AgentCaseHandler.content_change_keys`."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,80 @@
"""LanceDB ``agent_skill`` table schema.
Field set per 17_lancedb_tables_design.md §3.5. AgentSkill is a *named
entity* rather than a daily-log entry — PK is ``<owner_id>_<skill_name>``
(no date / seq), and same agent + same name is the same row (upsert).
``content`` is cascade-assembled from ``SKILL.md`` body plus every
``references/*.md`` sibling; ``scripts/`` is not indexed.
"""
from __future__ import annotations
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
_DIM = 1024  # dimension passed to ``Vector(...)`` for the ``vector`` column


class AgentSkill(BaseLanceTable):
    """One agent skill indexed in LanceDB.

    Rows live in the ``agent_skill`` table and are keyed by
    ``<owner_id>_<skill_name>``. BM25 runs over the two pre-tokenised
    columns named in ``BM25_FIELDS``.
    """

    TABLE_NAME: ClassVar[str] = "agent_skill"
    BM25_FIELDS: ClassVar[list[str]] = ["description_tokens", "content_tokens"]

    id: str
    """PK = ``<owner_id>_<skill_name>``."""
    owner_id: str
    """The owning ``agent_id``."""
    owner_type: str
    """Fixed ``"agent"`` for this table."""

    # The string after ``project_id`` describes both scope columns.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    name: str
    """Skill identifier; half of the PK."""

    description: str
    """When-to-use / purpose — original surface form (Tier-1 ad copy)."""
    description_tokens: str
    """App-layer pre-tokenised ``description`` — BM25 main field
    (whitespace tokenizer); display goes through ``description``."""

    content: str
    """Cascade-assembled body: ``SKILL.md`` main text concatenated with
    every ``references/*.md`` sibling. ``scripts/`` files are excluded."""
    content_tokens: str
    """App-layer pre-tokenised ``content`` (secondary BM25 field).
    Tokenised by cascade when assembling ``content`` from md sources."""

    confidence: float
    """0.0-1.0; LLM-emitted confidence in the skill."""
    maturity_score: float
    """0.0-1.0; LLM-emitted maturity score. The retrieval-time threshold
    (``maturity_threshold``) lives in MemorizeConfig, not in this row."""

    source_case_ids: list[str]
    """AgentCase ids that fed into this skill's synthesis (lineage)."""
    cluster_id: str | None = None
    """Optional MemScene clustering tag."""

    md_path: str  # presumably the path of the source md file — confirm
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the skill: ``name`` / ``description`` (frontmatter) + SKILL.md
    body + concatenated references content + ``confidence`` /
    ``maturity_score``. Cascade handler diffs by this digest to skip
    re-upsert + re-embed when neither retrieval-anchor text nor scores
    changed (e.g. the watcher fires for unrelated stat updates). See
    :attr:`AgentSkillHandler.content_change_keys`."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,62 @@
"""LanceDB ``atomic_fact`` table schema.
Field set per 17_lancedb_tables_design.md §3.2. Each row carries one
atomic fact extracted by the algo layer; the parent is always the source
MemCell — recorded via ``parent_type`` / ``parent_id``.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
_DIM = 1024  # dimension passed to ``Vector(...)`` for the ``vector`` column


class AtomicFact(BaseLanceTable):
    """One atomic fact indexed in LanceDB.

    Rows live in the ``atomic_fact`` table. BM25 runs over the
    pre-tokenised ``fact_tokens`` column; ``fact`` carries the display
    text.
    """

    TABLE_NAME: ClassVar[str] = "atomic_fact"
    BM25_FIELDS: ClassVar[list[str]] = ["fact_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>``."""
    entry_id: str
    """md-side seq id ``af_<YYYYMMDD>_<NNNN>``."""

    owner_id: str
    owner_type: str

    # The string after ``project_id`` describes both scope columns.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    # Audit columns (with owner_id / parent_id / sender_ids): not part of
    # ``content_sha256`` (see its description).
    session_id: str
    timestamp: _dt.datetime

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for atomic fact."""
    parent_id: str
    """Source memcell id."""

    sender_ids: list[str]

    fact: str
    """Atomic fact text — original surface form (returned for display)."""
    fact_tokens: str
    """App-layer pre-tokenised ``fact`` text — space-joined tokens.
    BM25 index is built on this column (whitespace tokenizer);
    ``fact`` itself is what callers display."""

    md_path: str  # presumably the path of the source md file — confirm
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the md entry (per :attr:`AtomicFactHandler.content_change_keys`).
    Matching digest → skip re-upsert + re-embed. Audit inline fields
    (owner_id / session_id / timestamp / parent_id / sender_ids) are
    NOT in the hash."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,78 @@
"""LanceDB ``episode`` table schema.
Field set is fixed by the LanceDB tables design spec. Rows are populated
by the cascade daemon from ``users/<owner_id>/episodes/episode-<YYYY-MM-DD>.md``
and from ``agents/<owner_id>/episodes/...`` symmetrically.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
# Vector dimension is settings-managed at runtime; the class-level
# constant pins the schema dim used at table creation.
_DIM = 1024


class Episode(BaseLanceTable):
    """One episode record indexed in LanceDB.

    Rows live in the ``episode`` table. BM25 runs over the pre-tokenised
    ``episode_tokens`` column; ``episode`` carries the display text.
    """

    TABLE_NAME: ClassVar[str] = "episode"
    BM25_FIELDS: ClassVar[list[str]] = ["episode_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>`` (scalar PK)."""
    entry_id: str
    """md-side seq id ``ep_<YYYYMMDD>_<NNNN>`` (cascade reverse-lookup)."""

    owner_id: str
    owner_type: str

    # The string after ``project_id`` describes both scope columns.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    # Audit columns (with owner_id / parent_id / sender_ids): not part of
    # ``content_sha256`` (see its description).
    session_id: str
    timestamp: _dt.datetime

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for episode."""
    parent_id: str
    """Source memcell id. The pipeline knows the memcell currently being
    processed and writes its id into the md entry's inline block; the
    cascade handler reads it back. The new everalgo Episode type no
    longer emits ``parent_id`` itself (collapsed to caller-managed),
    so this is filled entirely from everos's engineering context."""

    sender_ids: list[str]
    """Distinct ``role=user|assistant`` senders behind the episode."""

    # Optional columns; both default to ``None``.
    subject: str | None = None
    summary: str | None = None

    episode: str
    """Full narrative text — original surface form (returned for display)."""
    episode_tokens: str
    """App-layer pre-tokenised ``episode`` text — space-joined tokens
    (e.g. produced by jieba). LanceDB FTS index is built on **this**
    column using a whitespace tokenizer; the original ``episode`` field
    is what callers display. Two-field BM25 scheme keeps tokenisation
    deterministic and provider-pluggable at the app layer."""

    md_path: str  # presumably the path of the source md file — confirm
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of the
    md entry (per :attr:`EpisodeHandler.content_change_keys`). On
    re-reconcile, a matching digest means none of the persistence /
    embedding-relevant fields changed — the entry is skipped (no
    re-upsert, no re-embed). Inline audit fields (owner_id /
    session_id / timestamp / parent_id / sender_ids) are intentionally
    NOT in the hash so editing them doesn't waste an embedding call.
    See ``16_cascade_impl_design.md`` §3.3."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,79 @@
"""LanceDB ``foresight`` table schema.
Field set per 17_lancedb_tables_design.md §3.3. Each row carries a
forward-looking inference about the user (intent window, planned
action, projected need); ``start_time`` / ``end_time`` describe the
window the foresight applies to.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
_DIM = 1024  # dimension passed to ``Vector(...)`` for the ``vector`` column


class Foresight(BaseLanceTable):
    """One foresight record indexed in LanceDB.

    Rows live in the ``foresight`` table. BM25 runs over the two
    pre-tokenised columns named in ``BM25_FIELDS``; ``foresight`` /
    ``evidence`` carry the display text.
    """

    TABLE_NAME: ClassVar[str] = "foresight"
    BM25_FIELDS: ClassVar[list[str]] = ["foresight_tokens", "evidence_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>``."""
    entry_id: str
    """md-side seq id ``fs_<YYYYMMDD>_<NNNN>``."""

    owner_id: str
    owner_type: str

    # The string after ``project_id`` describes both scope columns.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    session_id: str
    timestamp: _dt.datetime
    """Foresight generation time."""

    start_time: _dt.datetime | None = None
    """Foresight coverage window start; tz-aware."""
    end_time: _dt.datetime | None = None
    """Foresight coverage window end; tz-aware."""
    # One of the time-window fields hashed into ``content_sha256``.
    duration_days: int | None = None

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for foresight."""
    parent_id: str
    """Source memcell id."""

    sender_ids: list[str]

    foresight: str
    """Foresight body — original surface form (returned for display)."""
    foresight_tokens: str
    """App-layer pre-tokenised ``foresight`` text — space-joined tokens.
    BM25 index is built on this column (whitespace tokenizer)."""

    evidence: str | None = None
    """Supporting evidence excerpt; may be empty."""
    evidence_tokens: str | None = None
    """App-layer pre-tokenised ``evidence`` (secondary BM25 field).
    ``None`` whenever ``evidence`` is None."""

    md_path: str  # presumably the path of the source md file — confirm
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the md entry — Foresight / Evidence sections plus the time-window
    inline fields (start_time / end_time / duration_days). Audit inline
    (owner_id / session_id / timestamp / parent_id / sender_ids) is NOT
    in the hash. See :attr:`ForesightHandler.content_change_keys`."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,68 @@
"""LanceDB ``user_profile`` table schema.
Profile is a single-file kind: one ``users/<user_id>/user.md`` per
user, replaced wholesale on edit (mirrors ``AgentSkill`` for the
upsert/single-row contract). The LanceDB row is a typed projection
of the md frontmatter that the cascade keeps in sync; it carries no
vector / no BM25 because the recall surface is pure KV-by-owner
(``fetch(owner_id)``) — when query-aware profile lookup ships later
the schema will gain ``vector`` + ``*_tokens`` columns then.
``explicit_info`` / ``implicit_traits`` are heterogeneous LLM
emissions (mostly small dicts mixed with strings) — LanceDB has no
``list[dict]`` column type, so we stash them as JSON strings and
unpack at the recall boundary into ``profile_data`` of the DTO.
"""
from __future__ import annotations
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable
class UserProfile(BaseLanceTable):
    """One ``users/<user_id>/user.md`` indexed in LanceDB.

    Rows live in the ``user_profile`` table, one per user. The schema
    declares no ``vector`` column and no ``BM25_FIELDS``; the two
    list-valued buckets are stored as JSON strings.
    """

    TABLE_NAME: ClassVar[str] = "user_profile"
    # No BM25 columns: profile recall is KV-by-owner today.

    id: str
    """PK = ``owner_id`` (one row per user)."""
    owner_id: str
    owner_type: str
    """Always ``"user"`` for this schema; agent-side profiles would
    live in a sibling table once that schema lands."""

    # The string after ``project_id`` describes both scope columns.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    summary: str
    """Free-form one-paragraph user summary (retrieval anchor for the
    future query-aware lookup; today returned verbatim to the caller)."""

    explicit_info_json: str
    """JSON-serialised ``list[Any]`` — the algo's verbatim evidence
    bucket. Stored as a string because LanceDB has no
    ``list[dict]`` column type. The recaller json-decodes it back into
    ``profile_data['explicit_info']`` at the DTO boundary."""
    implicit_traits_json: str
    """Same shape as :attr:`explicit_info_json`, for the LLM-inferred
    preference bucket."""

    profile_timestamp_ms: int
    """Algo-emitted profile timestamp (ms epoch) — pinned to the
    timestamp of the freshest MemCell that fed into the synthesis.
    Mirrored from :attr:`UserProfileFrontmatter.profile_timestamp_ms`
    so downstream code can compare freshness without re-reading md."""

    md_path: str  # presumably the path of the source md file — confirm
    content_sha256: str
    """SHA-256 over the content-bearing frontmatter fields (summary +
    explicit_info_json + implicit_traits_json). Matches → cascade
    skips re-upsert. ``profile_timestamp_ms`` is intentionally not in
    the hash: it drifts with every synthesis even when the underlying
    content is identical, and the LanceDB row treats it as audit."""