chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
106
src/everos/core/persistence/__init__.py
Normal file
106
src/everos/core/persistence/__init__.py
Normal file
@ -0,0 +1,106 @@
|
||||
"""Persistence primitives.
|
||||
|
||||
Read/write toolkit for markdown files, async wrappers around the SQLite
|
||||
system DB and LanceDB index, plus a memory-root path manager. Higher
|
||||
layers (``memory``, ``infra``) layer business semantics on top of these
|
||||
building blocks; this subpackage knows nothing about Entry / MemCell /
|
||||
Episode or any other business model.
|
||||
|
||||
External usage:
|
||||
from everos.core.persistence import (
|
||||
# Path manager + lock
|
||||
MemoryRoot, memory_root_lock, LockError,
|
||||
# Markdown IO toolkit
|
||||
MarkdownReader, MarkdownWriter, ParsedMarkdown, Entry,
|
||||
parse_frontmatter, dump_frontmatter, split_entries, find_entry,
|
||||
# Frontmatter schema chassis
|
||||
BaseFrontmatter, UserScopedFrontmatter, AgentScopedFrontmatter,
|
||||
DailyLogPathMixin, SkillPathMixin,
|
||||
# Async SQLite (SQLModel / SA 2.0)
|
||||
create_system_engine, create_session_factory, session_scope,
|
||||
SQLModel, Field, Relationship, BaseTable, RepoBase,
|
||||
# Async LanceDB
|
||||
open_lancedb_connection, LanceModel, Vector, BaseLanceTable, touch,
|
||||
LanceRepoBase,
|
||||
)
|
||||
"""
|
||||
|
||||
from .lancedb import BaseLanceTable as BaseLanceTable
|
||||
from .lancedb import LanceModel as LanceModel
|
||||
from .lancedb import LanceRepoBase as LanceRepoBase
|
||||
from .lancedb import Vector as Vector
|
||||
from .lancedb import open_lancedb_connection as open_lancedb_connection
|
||||
from .lancedb import touch as touch
|
||||
from .locking import LockError as LockError
|
||||
from .locking import memory_root_lock as memory_root_lock
|
||||
from .markdown import AgentScopedFrontmatter as AgentScopedFrontmatter
|
||||
from .markdown import BaseFrontmatter as BaseFrontmatter
|
||||
from .markdown import DailyLogPathMixin as DailyLogPathMixin
|
||||
from .markdown import Entry as Entry
|
||||
from .markdown import EntryId as EntryId
|
||||
from .markdown import MarkdownReader as MarkdownReader
|
||||
from .markdown import MarkdownWriter as MarkdownWriter
|
||||
from .markdown import ParsedMarkdown as ParsedMarkdown
|
||||
from .markdown import SkillPathMixin as SkillPathMixin
|
||||
from .markdown import StructuredEntry as StructuredEntry
|
||||
from .markdown import UserScopedFrontmatter as UserScopedFrontmatter
|
||||
from .markdown import dump_frontmatter as dump_frontmatter
|
||||
from .markdown import find_entry as find_entry
|
||||
from .markdown import parse_frontmatter as parse_frontmatter
|
||||
from .markdown import parse_structured_entry as parse_structured_entry
|
||||
from .markdown import render_structured_entry as render_structured_entry
|
||||
from .markdown import split_entries as split_entries
|
||||
from .memory_root import MemoryRoot as MemoryRoot
|
||||
from .memory_root import app_dir_name as app_dir_name
|
||||
from .memory_root import app_id_from_dir as app_id_from_dir
|
||||
from .memory_root import project_dir_name as project_dir_name
|
||||
from .memory_root import project_id_from_dir as project_id_from_dir
|
||||
from .sqlite import BaseTable as BaseTable
|
||||
from .sqlite import Field as Field
|
||||
from .sqlite import Relationship as Relationship
|
||||
from .sqlite import RepoBase as RepoBase
|
||||
from .sqlite import SQLModel as SQLModel
|
||||
from .sqlite import create_session_factory as create_session_factory
|
||||
from .sqlite import create_system_engine as create_system_engine
|
||||
from .sqlite import session_scope as session_scope
|
||||
|
||||
__all__ = [
|
||||
"AgentScopedFrontmatter",
|
||||
"BaseFrontmatter",
|
||||
"BaseLanceTable",
|
||||
"BaseTable",
|
||||
"DailyLogPathMixin",
|
||||
"Entry",
|
||||
"EntryId",
|
||||
"Field",
|
||||
"LanceModel",
|
||||
"LanceRepoBase",
|
||||
"LockError",
|
||||
"MarkdownReader",
|
||||
"MarkdownWriter",
|
||||
"MemoryRoot",
|
||||
"ParsedMarkdown",
|
||||
"Relationship",
|
||||
"RepoBase",
|
||||
"SkillPathMixin",
|
||||
"StructuredEntry",
|
||||
"SQLModel",
|
||||
"UserScopedFrontmatter",
|
||||
"Vector",
|
||||
"app_dir_name",
|
||||
"app_id_from_dir",
|
||||
"create_session_factory",
|
||||
"create_system_engine",
|
||||
"dump_frontmatter",
|
||||
"find_entry",
|
||||
"memory_root_lock",
|
||||
"project_dir_name",
|
||||
"project_id_from_dir",
|
||||
"open_lancedb_connection",
|
||||
"parse_frontmatter",
|
||||
"parse_structured_entry",
|
||||
"render_structured_entry",
|
||||
"session_scope",
|
||||
"split_entries",
|
||||
"touch",
|
||||
]
|
||||
34
src/everos/core/persistence/lancedb/__init__.py
Normal file
34
src/everos/core/persistence/lancedb/__init__.py
Normal file
@ -0,0 +1,34 @@
|
||||
"""LanceDB async persistence.
|
||||
|
||||
External usage (connection):
|
||||
from everos.core.persistence.lancedb import open_lancedb_connection
|
||||
|
||||
External usage (schema basics — ``LanceModel`` / ``Vector`` are re-exported from ``lancedb.pydantic``; ``BaseLanceTable`` / ``touch`` are defined in this package):
|
||||
from everos.core.persistence.lancedb import (
|
||||
LanceModel, Vector, BaseLanceTable, touch,
|
||||
)
|
||||
|
||||
External usage (generic CRUD repository base):
|
||||
from everos.core.persistence.lancedb import LanceRepoBase
|
||||
"""
|
||||
|
||||
# Re-export the LanceDB-flavoured Pydantic primitives so business code has a
|
||||
# single canonical entry point for table schemas.
|
||||
from lancedb.pydantic import LanceModel as LanceModel
|
||||
from lancedb.pydantic import Vector as Vector
|
||||
|
||||
from .base import BaseLanceTable as BaseLanceTable
|
||||
from .base import touch as touch
|
||||
from .connection import open_lancedb_connection as open_lancedb_connection
|
||||
from .repository import LanceDailyLogRepoBase as LanceDailyLogRepoBase
|
||||
from .repository import LanceRepoBase as LanceRepoBase
|
||||
|
||||
__all__ = [
|
||||
"BaseLanceTable",
|
||||
"LanceDailyLogRepoBase",
|
||||
"LanceModel",
|
||||
"LanceRepoBase",
|
||||
"Vector",
|
||||
"open_lancedb_connection",
|
||||
"touch",
|
||||
]
|
||||
158
src/everos/core/persistence/lancedb/base.py
Normal file
158
src/everos/core/persistence/lancedb/base.py
Normal file
@ -0,0 +1,158 @@
|
||||
"""Common LanceDB base for everos tables.
|
||||
|
||||
:class:`BaseLanceTable` adds ``created_at`` / ``updated_at`` columns and
|
||||
the :attr:`BM25_FIELDS` declaration + :meth:`ensure_fts_indexes`
|
||||
classmethod so each schema owns *both* its column shape **and** its
|
||||
BM25 index spec — repos stay focused on queries.
|
||||
|
||||
Note:
|
||||
LanceDB has no SQL ``onupdate`` equivalent — the application must
|
||||
explicitly set ``updated_at = get_utc_now()`` before calling
|
||||
:meth:`AsyncTable.update` / :meth:`AsyncTable.merge_insert`. The
|
||||
convenience :func:`touch` helper does this in one call.
|
||||
|
||||
**Every datetime column automatically carries ``tz=UTC`` in the
|
||||
Arrow schema.** LanceDB's Pydantic→PyArrow converter does not
|
||||
understand ``typing.Annotated`` metadata, so :data:`UtcDatetime`
|
||||
cannot be used as the field type annotation. Instead,
|
||||
:meth:`BaseLanceTable.to_arrow_schema` walks the inferred schema
|
||||
and rewrites every ``timestamp[us]`` (naive) column to
|
||||
``timestamp[us, tz=UTC]``. PyArrow then auto-``astimezone(UTC)``
|
||||
aware inputs on write **and** returns aware UTC datetimes on read
|
||||
— no per-table configuration, no caller-side ``ensure_utc``.
|
||||
|
||||
Subclasses just declare ``datetime`` fields normally::
|
||||
|
||||
class Episode(BaseLanceTable):
|
||||
timestamp: dt.datetime
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as dt
|
||||
from typing import ClassVar
|
||||
|
||||
import pyarrow as pa
|
||||
from lancedb import AsyncTable
|
||||
from lancedb.index import FTS
|
||||
from lancedb.pydantic import LanceModel
|
||||
from pydantic import Field
|
||||
|
||||
from everos.component.utils.datetime import get_utc_now
|
||||
|
||||
|
||||
class BaseLanceTable(LanceModel):
    """Pydantic / LanceDB base with ``created_at`` / ``updated_at`` and
    schema-level LanceDB metadata (``TABLE_NAME`` / ``BM25_FIELDS``).

    The schema is the single source of truth for everything LanceDB
    needs to materialise the table: column shape, table name, vector
    dim (declared per-subclass), and which columns carry an FTS index.
    Repos read these ClassVars; they do not duplicate them.
    """

    TABLE_NAME: ClassVar[str] = ""
    """LanceDB table name. Business schemas must override (e.g.
    ``"episode"``). Left empty on chassis / test schemas that construct
    their table inline."""

    BM25_FIELDS: ClassVar[list[str]] = []
    """Columns to build LanceDB FTS (BM25) indexes on.

    Each declared column must already exist as a ``str`` (or
    ``str | None``) field on the schema. Tokens are assumed to be
    **app-layer pre-tokenised** (space-joined); the FTS index uses
    ``base_tokenizer="whitespace"`` so segmentation is owned by the
    app layer (:class:`JiebaTokenizer`). The same boundary owns stop-
    word filtering (English + Chinese); FTS-side ``remove_stop_words``
    is OFF. FTS *does* keep lightweight English-aware normalisation
    (``lower_case`` / ``stem`` / ``ascii_folding``) as a belt-and-
    braces layer on the same English tokens that survive jieba.
    See ``17_lancedb_tables_design.md`` §2.4.1 and
    :meth:`ensure_fts_indexes` below for the exact knobs."""

    created_at: dt.datetime = Field(default_factory=get_utc_now)
    updated_at: dt.datetime = Field(default_factory=get_utc_now)

    @classmethod
    def to_arrow_schema(cls) -> pa.Schema:
        """Patch the default Arrow schema: force every timestamp to ``tz=UTC``.

        The base ``LanceModel.to_arrow_schema()`` infers Arrow types from
        Pydantic field annotations and emits naive ``timestamp[us]`` for
        every :class:`datetime.datetime` column. We rewrite **every**
        top-level timestamp column to ``timestamp[us, tz=UTC]``:

        * **on write** — PyArrow ``astimezone(UTC)``-s aware input
          automatically before serialising the i64 epoch micros.
        * **on read** — PyArrow returns aware UTC datetimes.

        Zero per-table configuration. The rewrite also **overrides any
        non-UTC tz** a subclass might have declared explicitly, because
        project convention is: storage is always UTC. Mixed-tz columns
        would violate the two-zone discipline (see
        ``docs/datetime.md``); enforcing UTC at the schema level closes
        that loophole.

        Only the column *type* is replaced: each field keeps its name,
        nullability and field-level metadata, and the schema-level
        metadata of the inferred schema is carried over unchanged.

        Returns:
            A new :class:`pyarrow.Schema` with the same fields, in the
            same order, as the inferred one.
        """
        base = super().to_arrow_schema()
        utc_micros = pa.timestamp("us", tz="UTC")
        return pa.schema(
            [
                # ``with_type`` swaps the type only, so whatever metadata the
                # upstream converter attached to the field survives.
                f.with_type(utc_micros) if pa.types.is_timestamp(f.type) else f
                for f in base
            ],
            # Rebuilding from a field list would otherwise drop this.
            metadata=base.metadata,
        )

    @classmethod
    async def ensure_fts_indexes(cls, table: AsyncTable) -> None:
        """Create FTS indexes on every column in :attr:`BM25_FIELDS`.

        Idempotent: columns that already have an index are skipped, so
        this is safe to call on every startup. The FTS config is fixed
        to the app-layer pre-tokenisation + LanceDB normalisation
        convention (designed for **multilingual mixed content**):

        - ``base_tokenizer="whitespace"`` — split on the spaces our
          app-layer tokenizer provider already inserted between tokens.
        - ``lower_case=True`` — Unicode-aware case-fold (English A→a;
          no-op on CJK characters).
        - ``stem=True`` — Porter / Snowball English stemmer per
          ``language="English"`` (tantivy default). CJK tokens have no
          stemmer and pass through untouched.
        - ``remove_stop_words=False`` — **stop-word removal is owned by
          the app-layer** (:class:`JiebaTokenizer`), which already drops
          both Chinese and English stop-words before tokens reach the
          FTS index. Keeping FTS-side filtering off avoids double-
          filtering and a divided source of truth.
        - ``ascii_folding=True`` — strips diacritics (é→e) on Latin
          characters; no-op on CJK.
        - ``with_position=True`` — enables phrase queries.

        Subclasses normally do not need to override this — declaring
        :attr:`BM25_FIELDS` is enough.

        Args:
            table: The open table whose indexes are inspected / created.
        """
        if not cls.BM25_FIELDS:
            return
        indices = await table.list_indices()
        # NOTE(review): this collects columns covered by an index of *any*
        # type, not only FTS — a column that already carries another kind of
        # index is skipped as well. Confirm that is the intended contract.
        indexed_cols = {col for idx in indices for col in (idx.columns or [])}
        for column in cls.BM25_FIELDS:
            if column in indexed_cols:
                continue
            await table.create_index(
                column=column,
                config=FTS(
                    with_position=True,
                    base_tokenizer="whitespace",
                    lower_case=True,
                    stem=True,
                    remove_stop_words=False,
                    ascii_folding=True,
                ),
            )
|
||||
|
||||
|
||||
def touch(record: BaseLanceTable) -> BaseLanceTable:
|
||||
"""Set ``record.updated_at = now`` and return the record (chainable)."""
|
||||
record.updated_at = get_utc_now()
|
||||
return record
|
||||
68
src/everos/core/persistence/lancedb/connection.py
Normal file
68
src/everos/core/persistence/lancedb/connection.py
Normal file
@ -0,0 +1,68 @@
|
||||
"""Async LanceDB connection factory.
|
||||
|
||||
LanceDB does not live inside the SQLAlchemy ecosystem; it has its own
|
||||
``connect_async`` returning :class:`lancedb.AsyncConnection`. This module
|
||||
is a thin wrapper that:
|
||||
|
||||
1. ensures the lancedb root directory exists
|
||||
2. converts ``LanceDBSettings.read_consistency_seconds`` into the
|
||||
:class:`datetime.timedelta` value LanceDB expects
|
||||
3. installs a capped :class:`lancedb.Session` so the global index
|
||||
cache cannot grow unbounded and exhaust file descriptors
|
||||
(see :attr:`LanceDBSettings.index_cache_size_bytes` for the
|
||||
full rationale)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as dt
|
||||
from pathlib import Path
|
||||
|
||||
import lancedb
|
||||
from lancedb import AsyncConnection
|
||||
|
||||
from everos.config import LanceDBSettings
|
||||
|
||||
|
||||
async def open_lancedb_connection(
    lancedb_dir: Path,
    lancedb_settings: LanceDBSettings,
) -> AsyncConnection:
    """Return an async LanceDB connection whose root is ``lancedb_dir``.

    Args:
        lancedb_dir: Directory that holds the LanceDB data (typically
            ``MemoryRoot.lancedb_dir``); it is created, parents included,
            when it does not exist yet.
        lancedb_settings: Connection tunables. ``read_consistency_seconds``
            becomes the :class:`~datetime.timedelta` passed as the read
            consistency interval (``None`` is forwarded as ``None``), and
            ``index_cache_size_bytes`` bounds the session's index cache.

    Returns:
        The :class:`AsyncConnection` produced by ``lancedb.connect_async``.
    """
    # A blocking mkdir is deliberate: it is a very cheap syscall that only
    # does real work on the first connect, so an async filesystem wrapper
    # (anyio.Path / aiofiles) would add nothing.
    lancedb_dir.mkdir(parents=True, exist_ok=True)  # noqa: ASYNC240

    consistency_seconds = lancedb_settings.read_consistency_seconds
    interval: dt.timedelta | None = (
        None
        if consistency_seconds is None
        else dt.timedelta(seconds=consistency_seconds)
    )

    # Cap the index cache: every cached reader keeps the file descriptors of
    # an opened ``_indices/<uuid>/...`` directory, so an uncapped cache makes
    # a long-running daemon's FD count climb until ``EMFILE``; with a cap the
    # readers are LRU-evicted instead. The metadata cache stays at the
    # lancedb default (unbounded) on purpose — it only holds parsed
    # in-memory manifests, costs no FDs, and capping it would just thrash.
    # ``LanceDBSettings`` documents the measurement behind the default size.
    capped_session = lancedb.Session(
        index_cache_size_bytes=lancedb_settings.index_cache_size_bytes,
        metadata_cache_size_bytes=None,
    )

    connection = await lancedb.connect_async(
        str(lancedb_dir),
        read_consistency_interval=interval,
        session=capped_session,
    )
    return connection
|
||||
530
src/everos/core/persistence/lancedb/repository.py
Normal file
530
src/everos/core/persistence/lancedb/repository.py
Normal file
@ -0,0 +1,530 @@
|
||||
"""Generic CRUD repository for LanceDB-backed tables.
|
||||
|
||||
``LanceRepoBase`` mirrors the SQLite ``RepoBase`` shape: a pure generic
|
||||
CRUD helper that knows nothing about a storage runtime. Concrete repos
|
||||
either pass an :class:`AsyncTable` explicitly (typical in tests) or
|
||||
override :meth:`_table_lookup` to pull the cached table from their
|
||||
storage manager (typical in
|
||||
:mod:`everos.infra.persistence.lancedb.repos`).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import datetime as dt
|
||||
from collections.abc import Sequence
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from lancedb import AsyncTable
|
||||
|
||||
from everos.core.observability.logging import get_logger
|
||||
|
||||
from .base import BaseLanceTable
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def _q(value: str) -> str:
|
||||
"""Escape single quotes for a LanceDB SQL-like ``where`` predicate.
|
||||
|
||||
LanceDB has no parameterised query API; predicates are strings.
|
||||
Doubling the quote (``'`` → ``''``) is the SQL-standard way to keep
|
||||
a literal single quote inside a single-quoted string. everos's PK
|
||||
convention (``<owner_id>_<entry_id>``) never carries quotes — this
|
||||
is defensive.
|
||||
"""
|
||||
return value.replace("'", "''")
|
||||
|
||||
|
||||
class LanceRepoBase[T: BaseLanceTable]:
|
||||
"""Generic CRUD repository for one LanceDB table.
|
||||
|
||||
Subclass and bind to a schema. Two ways to provide the table:
|
||||
|
||||
1. **Explicit (tests / DI)** — pass it to ``__init__``::
|
||||
|
||||
repo = EpisodeRepo(table)
|
||||
|
||||
2. **Lazy hook (production singletons)** — override
|
||||
:meth:`_table_lookup` so the repo can be instantiated as a
|
||||
module-level singleton with no live connection yet::
|
||||
|
||||
class _EpisodeRepo(LanceRepoBase[Episode]):
|
||||
schema = Episode
|
||||
|
||||
async def _table_lookup(self):
|
||||
from everos.infra.persistence.lancedb.lancedb_manager import (
|
||||
get_table,
|
||||
)
|
||||
return await get_table(self.schema.TABLE_NAME, self.schema)
|
||||
|
||||
episode_repo = _EpisodeRepo()
|
||||
await episode_repo.add([Episode(text=..., vector=[...])])
|
||||
|
||||
The LanceDB table name lives on the schema (``BaseLanceTable.TABLE_NAME``)
|
||||
so every LanceDB-side metadatum — column shape, table name,
|
||||
vector dim, BM25 index spec — sits in one place. ``table_name``
|
||||
here is a thin pass-through; subclasses normally do **not**
|
||||
override it.
|
||||
|
||||
Write paths (``add`` / ``upsert`` / ``delete`` / ``delete_by_md_path``)
|
||||
are serialised by a per-``table_name`` :class:`asyncio.Lock`. LanceDB's
|
||||
``merge_insert`` is a read-modify-write at the storage layer with no
|
||||
application-visible OCC contract — two concurrent calls against the
|
||||
same table can race on the version manifest and lose updates even
|
||||
when the row sets are disjoint (observed: cascade worker
|
||||
``asyncio.gather`` over a batch of ``user_profile`` rows where one
|
||||
write disappears). Serialising on the table name closes that window;
|
||||
reads stay unlocked so search QPS is not impacted by writers.
|
||||
|
||||
Locks live in a class-level dict keyed by table name and are never
|
||||
evicted (mirrors :mod:`everos.memory.strategies._partition_locks`
|
||||
on bpo-28427 — a lock with pending waiters must outlive any dict
|
||||
entry that points to it).
|
||||
"""
|
||||
|
||||
schema: type[T]
|
||||
|
||||
_table_locks: ClassVar[dict[str, asyncio.Lock]] = {}
|
||||
"""Per-table-name write lock pool (process-wide, lazily populated)."""
|
||||
|
||||
@property
|
||||
def table_name(self) -> str:
|
||||
"""LanceDB table name, resolved from :attr:`schema.TABLE_NAME`."""
|
||||
return self.schema.TABLE_NAME
|
||||
|
||||
@classmethod
|
||||
def _write_lock(cls, table_name: str) -> asyncio.Lock:
|
||||
"""Return the write lock for ``table_name``; create on first use.
|
||||
|
||||
``dict.setdefault`` is atomic under single-threaded asyncio (no
|
||||
``await`` between check and insert), so no meta-lock is needed.
|
||||
"""
|
||||
return cls._table_locks.setdefault(table_name, asyncio.Lock())
|
||||
|
||||
@classmethod
|
||||
def _reset_locks_for_tests(cls) -> None:
|
||||
"""Test-only: drop the write-lock pool.
|
||||
|
||||
``asyncio.Lock`` binds to the current event loop on first
|
||||
``acquire()``; pytest-asyncio creates a fresh loop per test, so
|
||||
a module-level lock surviving across tests fails with "bound to
|
||||
a different event loop". The production cascade worker runs on
|
||||
one loop forever and does not need this hook. Mirrors
|
||||
:func:`everos.memory.strategies._partition_locks._reset_for_tests`.
|
||||
"""
|
||||
cls._table_locks.clear()
|
||||
|
||||
def __init__(self, table: AsyncTable | None = None) -> None:
|
||||
"""Bind to a table directly; if ``None``, defer to ``_table_lookup``."""
|
||||
self._table_override = table
|
||||
|
||||
async def _table_lookup(self) -> AsyncTable:
|
||||
"""Resolve the table on first use. Override in subclass.
|
||||
|
||||
``LanceRepoBase`` itself has no idea where the runtime singleton
|
||||
lives. The default raises so a missing override is loud rather
|
||||
than silently broken.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
f"{type(self).__name__}: pass table= to __init__ "
|
||||
"or override _table_lookup() to wire the storage manager."
|
||||
)
|
||||
|
||||
async def _table(self) -> AsyncTable:
|
||||
if self._table_override is not None:
|
||||
return self._table_override
|
||||
return await self._table_lookup()
|
||||
|
||||
# ── Create ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def add(self, records: Sequence[T]) -> None:
|
||||
"""Insert one or more records."""
|
||||
table = await self._table()
|
||||
async with self._write_lock(self.table_name):
|
||||
await table.add(list(records))
|
||||
|
||||
# ── Upsert ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def upsert(
|
||||
self,
|
||||
records: Sequence[T],
|
||||
*,
|
||||
by: str = "id",
|
||||
) -> None:
|
||||
"""Upsert records keyed by ``by`` (PK column, default ``"id"``).
|
||||
|
||||
Wraps LanceDB's ``merge_insert(on=...)`` fluent builder with the
|
||||
equivalent of ``INSERT ... ON CONFLICT(by) DO UPDATE`` — matching
|
||||
rows are replaced wholesale, non-matching rows inserted.
|
||||
|
||||
Cascade uses this when reconciling md → LanceDB: an entry seen
|
||||
for the first time inserts; an entry that was edited in md
|
||||
updates its existing row.
|
||||
"""
|
||||
table = await self._table()
|
||||
async with self._write_lock(self.table_name):
|
||||
await (
|
||||
table.merge_insert(by)
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.execute(list(records))
|
||||
)
|
||||
|
||||
# ── Maintenance ────────────────────────────────────────────────────────
|
||||
|
||||
async def optimize(self, *, cleanup_older_than: dt.timedelta | None = None) -> None:
|
||||
"""Compact fragments + merge new data into the FTS / vector indexes.
|
||||
|
||||
LanceDB's ``merge_insert`` writes new data into a fresh fragment.
|
||||
The FTS (BM25) index built by :meth:`ensure_fts_indexes` only
|
||||
covers fragments visible at index-build time, so rows written
|
||||
after the initial build can become **invisible to BM25 queries**
|
||||
until ``optimize()`` runs and merges those fragments into the
|
||||
index segment that the query engine reads.
|
||||
|
||||
Symptom this guards against (verified on LoCoMo conv0): after
|
||||
steady-state cascade ingest, ``nearest_to_text("any_common_word")``
|
||||
returns 0 hits even though the column literally contains the
|
||||
token in 100% of rows — the new fragments simply hadn't been
|
||||
indexed.
|
||||
|
||||
Cascade triggers this through a per-kind throttle + trailing
|
||||
edge scheduler (``CascadeWorker._schedule_optimize``): at most
|
||||
one run per ~1s window per kind, decoupled from the drain
|
||||
loop, with a 60s heartbeat sweep as a safety net. Cost is
|
||||
O(N) data-rewrite per optimized fragment; the throttle is how
|
||||
we cap it under sustained write pressure.
|
||||
|
||||
Args:
|
||||
cleanup_older_than: When set, also prune (physically delete)
|
||||
files belonging to dataset versions older than this
|
||||
interval. ``None`` (default) compacts only — historical
|
||||
manifests, replaced data fragments, and stale index
|
||||
UUID files are kept on disk forever, which inflates the
|
||||
file count (and FD usage at scan time) without bound.
|
||||
Cascade passes a non-None value on a slower beat
|
||||
(``CascadeWorker._optimize_prune_interval``) so the
|
||||
hot drain path stays cheap. Note: this does *not*
|
||||
shrink **active** index internals (FTS ``part_N`` count
|
||||
or vector index UUID count) — those only collapse via
|
||||
``drop_index + create_index``, which is not done here.
|
||||
"""
|
||||
table = await self._table()
|
||||
await table.optimize(cleanup_older_than=cleanup_older_than)
|
||||
|
||||
async def rebuild_indexes(self) -> None:
|
||||
"""Drop and re-create every index on this table.
|
||||
|
||||
**Why this exists** — workaround for an upstream Python API gap:
|
||||
|
||||
Lance's Rust ``OptimizeOptions`` has a ``num_indices_to_merge``
|
||||
knob (default 1) that bounds the number of active index UUIDs
|
||||
per column. With ``Some(1)``, every ``optimize_indices()`` call
|
||||
merges its delta into the base — active UUID count stays at 1.
|
||||
|
||||
Two problems block us from using it from the application layer:
|
||||
|
||||
1. ``lancedb.AsyncTable.optimize()`` does **not expose** this
|
||||
parameter (verified on lancedb main 2026-05-28). It forwards
|
||||
only ``cleanup_since_ms`` and ``delete_unverified`` to Rust.
|
||||
2. Even calling Lance directly via ``pylance``, the merge
|
||||
behaviour itself is buggy on ``lance crate 4.0`` (what
|
||||
lancedb 0.30.2 embeds) — ``num_indices_to_merge=1`` does
|
||||
nothing. Fix landed in ``lance 7.x``, but ``pylance 7.x``
|
||||
can not collapse indexes on a ``lance 4.0``-format dataset
|
||||
(verified by experiment).
|
||||
|
||||
So in our current stack there is **no application-level path**
|
||||
to bound active index UUID growth. ``optimize()`` keeps
|
||||
accumulating one new UUID (vector) / one new ``part_N`` (FTS)
|
||||
per call.
|
||||
|
||||
This method is the workaround: drop every existing index and
|
||||
rebuild from the schema's ``ensure_fts_indexes`` contract. The
|
||||
rebuild is **O(N) full retrain** but cheap in practice (~0.3s
|
||||
for 50k rows × 2 FTS columns on local SSD), and during the
|
||||
window LanceDB transparently falls back to brute-force scan so
|
||||
queries and writes stay available.
|
||||
|
||||
**Cadence** — :class:`CascadeWorker` runs this on a slow loop
|
||||
(default 12h per kind). Frequency is bounded by the rebuild
|
||||
cost, not by correctness — even daily is fine functionally;
|
||||
12h is a conservative pick to keep file/UUID counts well below
|
||||
any FD ceiling under steady-state ingest.
|
||||
|
||||
**When to remove** — once lancedb exposes ``num_indices_to_merge``
|
||||
on the async Python API **and** the embedded ``lance crate``
|
||||
ships the working merge implementation, delete this method and
|
||||
switch to ``optimize(num_indices_to_merge=1)`` in the regular
|
||||
``optimize()`` path. Tracking issues / context:
|
||||
|
||||
- https://github.com/lancedb/lancedb/issues/2193
|
||||
- https://github.com/lancedb/lancedb/issues/3177
|
||||
- https://github.com/lance-format/lance/pull/6711 (partial fix
|
||||
in lance v7.0.0)
|
||||
- https://docs.rs/lancedb/latest/lancedb/table/struct.OptimizeOptions.html
|
||||
"""
|
||||
table = await self._table()
|
||||
async with self._write_lock(self.table_name):
|
||||
for idx in await table.list_indices():
|
||||
await table.drop_index(idx.name)
|
||||
await self.schema.ensure_fts_indexes(table)
|
||||
|
||||
# ── Read ───────────────────────────────────────────────────────────────
|
||||
|
||||
async def count(self) -> int:
|
||||
"""Total row count."""
|
||||
table = await self._table()
|
||||
return await table.count_rows()
|
||||
|
||||
async def get_by_id(
    self,
    id_value: str,
    *,
    id_field: str = "id",
) -> T | None:
    """Look up a single row by its scalar primary key.

    Runs the LanceDB scalar filter ``<id_field> = '<id_value>'`` limited
    to one row. ``id_value`` goes through ``_q`` so embedded single
    quotes are doubled and cannot break the SQL-like predicate; the
    everos PK convention (``<owner_id>_<entry_id>``) never contains
    quotes, so that escaping is defensive. ``id_field`` is interpolated
    as given.

    Args:
        id_value: Primary-key value to match.
        id_field: Name of the key column.

    Returns:
        The matching row validated into the repo's schema type, or
        ``None`` when nothing matches.
    """
    table = await self._table()
    scan = table.query()
    predicate = f"{id_field} = '{_q(id_value)}'"
    matches = await scan.where(predicate).limit(1).to_list()
    if matches:
        return self.schema.model_validate(matches[0])
    return None
|
||||
|
||||
async def find_where(
|
||||
self,
|
||||
where: str,
|
||||
*,
|
||||
limit: int = 100,
|
||||
) -> list[T]:
|
||||
"""Scalar query returning *typed* schema instances.
|
||||
|
||||
Like :meth:`search` but returns ``list[T]`` rather than raw
|
||||
LanceDB row dicts. No vector ANN; pure scalar filter only.
|
||||
Use :meth:`search` when you need ``_distance`` or want to mix
|
||||
ANN with filters.
|
||||
"""
|
||||
table = await self._table()
|
||||
rows = await table.query().where(where).limit(limit).to_list()
|
||||
return [self.schema.model_validate(r) for r in rows]
|
||||
|
||||
async def find_one_where(self, where: str) -> T | None:
|
||||
"""Single-row variant of :meth:`find_where` (``None`` if no match)."""
|
||||
rows = await self.find_where(where, limit=1)
|
||||
return rows[0] if rows else None
|
||||
|
||||
async def find_where_paginated(
    self,
    where: str,
    *,
    sort_by: str,
    descending: bool = True,
    page: int = 1,
    page_size: int = 20,
    max_fetch: int = 20000,
) -> tuple[list[T], int]:
    """Paginated scalar query with in-memory sort.

    The method counts the rows matching ``where``, fetches up to
    ``max_fetch`` of them as an Arrow table, sorts that table by
    ``sort_by`` and slices out ``page`` x ``page_size``. The predicate's
    match count is returned next to the page so callers can render
    pagination controls.

    When the requested page starts at or beyond the rows that could be
    fetched (``min(total, max_fetch)``) — which includes the "nothing
    matches" case — the row fetch and sort are skipped and an empty page
    is returned directly.

    Args:
        where: SQL-like scalar predicate. Required (no implicit
            full-table scan from ``find_where_paginated``).
        sort_by: Column name to sort the fetched rows by.
        descending: ``True`` (default) sorts descending; ``False``
            ascending.
        page: 1-indexed page number. Values below 1 produce a negative
            offset, which is passed to the Arrow slice unchanged.
        page_size: Rows per page.
        max_fetch: Cap on rows pulled before the in-memory sort. When
            the predicate matches more rows than this cap, the sort
            only sees the fetched subset, so page contents are only
            approximately correct; a warning is logged in that case.

    Returns:
        ``(rows, total)`` — ``rows`` is the typed page, ``total`` is
        ``count_rows(filter=where)`` regardless of ``max_fetch``.
    """
    table = await self._table()
    total = await table.count_rows(filter=where)
    if total > max_fetch:
        logger.warning(
            "find_where_paginated truncated",
            extra={
                "table": self.table_name,
                "where": where,
                "total": total,
                "max_fetch": max_fetch,
            },
        )

    offset = (page - 1) * page_size
    # No more than min(total, max_fetch) rows can come back from the
    # fetch below, so a page starting past that point is empty — skip
    # the fetch + sort entirely. Negative offsets fall through so the
    # slice call reports them exactly as before.
    if offset >= min(total, max_fetch):
        return [], total

    arrow_tbl = await table.query().where(where).limit(max_fetch).to_arrow()
    order = "descending" if descending else "ascending"
    page_rows = arrow_tbl.sort_by([(sort_by, order)]).slice(offset, page_size)
    return (
        [self.schema.model_validate(r) for r in page_rows.to_pylist()],
        total,
    )
|
||||
|
||||
async def find_by_owner(
    self,
    owner_id: str,
    *,
    limit: int = 100,
) -> list[T]:
    """Return up to ``limit`` rows whose ``owner_id`` column equals ``owner_id``.

    The value is escaped with ``_q`` before being embedded in the
    predicate handed to :meth:`find_where`.
    """
    predicate = f"owner_id = '{_q(owner_id)}'"
    return await self.find_where(predicate, limit=limit)
|
||||
|
||||
async def find_by_md_path(self, md_path: str) -> T | None:
    """Return the first row whose ``md_path`` column equals ``md_path``.

    Reverse lookup from a markdown path to its indexed row; ``None``
    when no row matches.
    """
    predicate = f"md_path = '{_q(md_path)}'"
    return await self.find_one_where(predicate)
|
||||
|
||||
async def search(
    self,
    *,
    vector: Sequence[float] | None = None,
    where: str | None = None,
    limit: int = 10,
) -> list[dict[str, Any]]:
    """Query the table with an optional ANN vector and optional scalar filter.

    Args:
        vector: Embedding to rank rows against; when ``None`` the
            nearest-neighbour step is skipped.
        where: SQL-like predicate (e.g. ``"tags = 'meeting'"``); when
            ``None`` no filter is applied.
        limit: Maximum number of rows to return.

    Returns:
        Raw row dicts as produced by the query's ``to_list()`` — not
        validated through ``self.schema``.
    """
    tbl = await self._table()
    query = tbl.query()
    if vector is not None:
        # Materialise to a list; callers may pass any float sequence.
        query = query.nearest_to(list(vector))
    if where is not None:
        query = query.where(where)
    rows = await query.limit(limit).to_list()
    return rows
|
||||
|
||||
# ── Delete ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def delete(self, predicate: str) -> None:
    """Remove every row matching the SQL-like ``predicate``.

    The table handle is resolved first; the delete itself runs while
    holding this table's write lock.
    """
    tbl = await self._table()
    lock = self._write_lock(self.table_name)
    async with lock:
        await tbl.delete(predicate)
|
||||
|
||||
async def delete_by_md_path(self, md_path: str) -> int:
    """Remove all rows whose ``md_path`` equals ``md_path``.

    Intended for the cascade handler (md file removed on disk, or an
    orphaned index row found during reverse-reconcile). ``md_path`` is
    escaped with ``_q`` before it is embedded in the predicate, and the
    delete runs under this table's write lock.

    Returns:
        The number of rows deleted, taken from the delete result's
        ``num_deleted_rows``.
    """
    tbl = await self._table()
    async with self._write_lock(self.table_name):
        predicate = f"md_path = '{_q(md_path)}'"
        outcome = await tbl.delete(predicate)
    deleted = int(outcome.num_deleted_rows)
    return deleted
|
||||
|
||||
|
||||
class LanceDailyLogRepoBase[T: BaseLanceTable](LanceRepoBase[T]):
|
||||
"""LanceRepoBase + queries unique to daily-log tables.
|
||||
|
||||
Daily-log tables (``episode`` / ``atomic_fact`` / ``foresight`` /
|
||||
``agent_case``) share a fixed schema slice: ``entry_id`` (md seq
|
||||
id), ``session_id`` (conversation scope), and ``parent_type`` /
|
||||
``parent_id`` (record lineage). The queries below compose those
|
||||
columns; ``agent_skill`` is *not* a daily-log (it is a named
|
||||
single-file entity) and uses :class:`LanceRepoBase` directly.
|
||||
"""
|
||||
|
||||
async def find_by_owner_entry(
|
||||
self,
|
||||
owner_id: str,
|
||||
entry_id: str,
|
||||
*,
|
||||
app_id: str = "default",
|
||||
project_id: str = "default",
|
||||
) -> T | None:
|
||||
"""Single point-query by ``(app, project, owner_id, entry_id)``.
|
||||
|
||||
``entry_id`` is only unique within a (app, project, owner) scope —
|
||||
the same ``ac_<date>_<seq>`` recurs in another space — so the
|
||||
scope segments are part of the predicate to avoid a cross-space hit.
|
||||
"""
|
||||
return await self.find_one_where(
|
||||
f"owner_id = '{_q(owner_id)}' AND entry_id = '{_q(entry_id)}' "
|
||||
f"AND app_id = '{_q(app_id)}' AND project_id = '{_q(project_id)}'"
|
||||
)
|
||||
|
||||
async def find_by_owner_entries(
|
||||
self,
|
||||
owner_id: str,
|
||||
entry_ids: Sequence[str],
|
||||
*,
|
||||
app_id: str = "default",
|
||||
project_id: str = "default",
|
||||
) -> list[T]:
|
||||
"""Bulk point-query by ``(app, project, owner_id, entry_id IN ...)``.
|
||||
|
||||
Empty ``entry_ids`` short-circuits to ``[]`` rather than emit a
|
||||
``WHERE entry_id IN ()`` predicate (LanceDB rejects empty
|
||||
tuples). The query's ``limit`` is bound to ``len(entry_ids)``
|
||||
because at most one row per id can exist under one (app, project,
|
||||
owner) scope.
|
||||
"""
|
||||
if not entry_ids:
|
||||
return []
|
||||
quoted = ", ".join(f"'{_q(eid)}'" for eid in entry_ids)
|
||||
return await self.find_where(
|
||||
f"owner_id = '{_q(owner_id)}' AND entry_id IN ({quoted}) "
|
||||
f"AND app_id = '{_q(app_id)}' AND project_id = '{_q(project_id)}'",
|
||||
limit=len(entry_ids),
|
||||
)
|
||||
|
||||
async def find_by_session(
|
||||
self,
|
||||
owner_id: str,
|
||||
session_id: str,
|
||||
*,
|
||||
limit: int = 100,
|
||||
) -> list[T]:
|
||||
"""Every row in one conversation ``session_id`` under ``owner_id``."""
|
||||
return await self.find_where(
|
||||
f"owner_id = '{_q(owner_id)}' AND session_id = '{_q(session_id)}'",
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
async def find_by_parent(
|
||||
self,
|
||||
parent_type: str,
|
||||
parent_id: str,
|
||||
*,
|
||||
limit: int = 100,
|
||||
) -> list[T]:
|
||||
"""Every row whose parent matches ``(parent_type, parent_id)``."""
|
||||
return await self.find_where(
|
||||
f"parent_type = '{_q(parent_type)}' AND parent_id = '{_q(parent_id)}'",
|
||||
limit=limit,
|
||||
)
|
||||
76
src/everos/core/persistence/locking.py
Normal file
76
src/everos/core/persistence/locking.py
Normal file
@ -0,0 +1,76 @@
|
||||
"""Process-wide exclusive lock on a memory-root.
|
||||
|
||||
Uses ``fcntl.flock`` (POSIX advisory locking, available on Linux + macOS;
|
||||
Windows is not supported — see project README on platform scope). The
|
||||
public surface is an :func:`contextlib.asynccontextmanager` so callers
|
||||
use ``async with memory_root_lock(mr):``; the underlying syscalls have
|
||||
no async equivalent so they run in a worker thread via
|
||||
:func:`anyio.to_thread.run_sync`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import fcntl
|
||||
import os
|
||||
from collections.abc import AsyncIterator
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import anyio
|
||||
|
||||
from .memory_root import MemoryRoot
|
||||
|
||||
|
||||
class LockError(RuntimeError):
    """Signals that a non-blocking attempt to take the memory-root lock failed."""
|
||||
|
||||
|
||||
@asynccontextmanager
async def memory_root_lock(
    memory_root: MemoryRoot,
    *,
    blocking: bool = True,
) -> AsyncIterator[None]:
    """Acquire an exclusive process lock on the memory-root.

    The memory-root directory is created if missing, the anchor file is
    opened (and created on first use), and an exclusive ``flock`` is
    taken on its descriptor. On exit the lock is released and the
    descriptor closed. Cleanup runs inside a shielded cancel scope so a
    cancelled caller still releases the lock, and the descriptor is
    closed on *any* acquisition failure, not just contention.

    Args:
        memory_root: The memory-root to lock; its ``root`` directory and
            ``lock_file`` path are used.
        blocking: If ``True`` (default), wait until the lock is free. If
            ``False``, raise :class:`LockError` immediately when the
            lock is already held.

    Raises:
        LockError: When ``blocking=False`` and the lock is already held.
    """
    await anyio.Path(memory_root.root).mkdir(parents=True, exist_ok=True)
    lock_path = memory_root.lock_file

    # fcntl operates on the descriptor, not the path. The open is
    # offloaded like the rest of the acquisition flow.
    fd = await anyio.to_thread.run_sync(
        lambda: os.open(lock_path, os.O_RDWR | os.O_CREAT, 0o644)
    )

    flags = fcntl.LOCK_EX
    if not blocking:
        flags |= fcntl.LOCK_NB

    try:
        await anyio.to_thread.run_sync(fcntl.flock, fd, flags)
    except BaseException as exc:
        # Acquisition failed (contention, another OSError, or
        # cancellation): the descriptor must not outlive this call.
        # Shielded so the close itself cannot be cancelled away.
        with anyio.CancelScope(shield=True):
            await anyio.to_thread.run_sync(os.close, fd)
        if isinstance(exc, BlockingIOError):
            raise LockError(
                f"another process already holds the memory-root lock at {lock_path}"
            ) from exc
        raise

    # Lock acquired — release + close strictly on exit. Failure paths
    # above have already closed the fd and never reach this block.
    try:
        yield
    finally:
        # Without the shield, a cancelled enclosing scope would cancel
        # both awaits at their first checkpoint and leave the lock held.
        with anyio.CancelScope(shield=True):
            try:
                await anyio.to_thread.run_sync(fcntl.flock, fd, fcntl.LOCK_UN)
            finally:
                await anyio.to_thread.run_sync(os.close, fd)
|
||||
62
src/everos/core/persistence/markdown/__init__.py
Normal file
62
src/everos/core/persistence/markdown/__init__.py
Normal file
@ -0,0 +1,62 @@
|
||||
"""Markdown file IO toolkit.
|
||||
|
||||
Atomic write + YAML frontmatter parse/dump + entry marker parse +
|
||||
audit-form structured-entry parsing. Knows nothing about business
|
||||
models (no MemCell / Episode); the :class:`Entry` here is a
|
||||
*marker-delimited* span within a markdown body, not a business record.
|
||||
|
||||
External usage (IO + parse):
|
||||
from everos.core.persistence.markdown import (
|
||||
Entry, EntryId, StructuredEntry,
|
||||
MarkdownReader, MarkdownWriter, ParsedMarkdown,
|
||||
parse_frontmatter, dump_frontmatter,
|
||||
split_entries, find_entry,
|
||||
parse_structured_entry, render_structured_entry,
|
||||
)
|
||||
|
||||
External usage (frontmatter schema chassis):
|
||||
from everos.core.persistence.markdown import (
|
||||
BaseFrontmatter, UserScopedFrontmatter, AgentScopedFrontmatter,
|
||||
DailyLogPathMixin, SkillPathMixin, ProfilePathMixin,
|
||||
)
|
||||
"""
|
||||
|
||||
from .entries import Entry as Entry
|
||||
from .entries import EntryId as EntryId
|
||||
from .entries import StructuredEntry as StructuredEntry
|
||||
from .entries import find_entry as find_entry
|
||||
from .entries import parse_structured_entry as parse_structured_entry
|
||||
from .entries import render_structured_entry as render_structured_entry
|
||||
from .entries import split_entries as split_entries
|
||||
from .frontmatter import AgentScopedFrontmatter as AgentScopedFrontmatter
|
||||
from .frontmatter import BaseFrontmatter as BaseFrontmatter
|
||||
from .frontmatter import DailyLogPathMixin as DailyLogPathMixin
|
||||
from .frontmatter import ProfilePathMixin as ProfilePathMixin
|
||||
from .frontmatter import SkillPathMixin as SkillPathMixin
|
||||
from .frontmatter import UserScopedFrontmatter as UserScopedFrontmatter
|
||||
from .frontmatter import dump_frontmatter as dump_frontmatter
|
||||
from .frontmatter import parse_frontmatter as parse_frontmatter
|
||||
from .parsed import ParsedMarkdown as ParsedMarkdown
|
||||
from .reader import MarkdownReader as MarkdownReader
|
||||
from .writer import MarkdownWriter as MarkdownWriter
|
||||
|
||||
__all__ = [
|
||||
"AgentScopedFrontmatter",
|
||||
"BaseFrontmatter",
|
||||
"DailyLogPathMixin",
|
||||
"Entry",
|
||||
"EntryId",
|
||||
"MarkdownReader",
|
||||
"MarkdownWriter",
|
||||
"ParsedMarkdown",
|
||||
"ProfilePathMixin",
|
||||
"SkillPathMixin",
|
||||
"StructuredEntry",
|
||||
"UserScopedFrontmatter",
|
||||
"dump_frontmatter",
|
||||
"find_entry",
|
||||
"parse_frontmatter",
|
||||
"parse_structured_entry",
|
||||
"render_structured_entry",
|
||||
"split_entries",
|
||||
]
|
||||
368
src/everos/core/persistence/markdown/entries.py
Normal file
368
src/everos/core/persistence/markdown/entries.py
Normal file
@ -0,0 +1,368 @@
|
||||
"""Markdown entries — id format, marker spans, and audit-form parsing.
|
||||
|
||||
Three closely-related entry concepts live together here so a reader
|
||||
sees the whole entry surface in one file:
|
||||
|
||||
1. :class:`EntryId` — the ``<prefix>_<YYYYMMDD>_<NNNN>`` structured id
|
||||
stamped into each daily-log entry's open / close markers. Carries
|
||||
the prefix declared by the frontmatter schema, the date bucket, and
|
||||
the in-file zero-padded sequence.
|
||||
|
||||
2. :class:`Entry` — a marker-delimited span inside a markdown body::
|
||||
|
||||
<!-- entry:abc123 -->
|
||||
...content...
|
||||
<!-- /entry:abc123 -->
|
||||
|
||||
:func:`split_entries` and :func:`find_entry` locate these spans
|
||||
without interpreting the inner content. Higher layers (writers,
|
||||
cascade) parse it per record type.
|
||||
|
||||
3. :class:`StructuredEntry` — :class:`Entry` extended with the parsed
|
||||
audit-form body fields (header / inline / sections). Built either
|
||||
from a raw body string via :func:`parse_structured_entry` or from
|
||||
an existing :class:`Entry` via :meth:`Entry.as_structured`.
|
||||
|
||||
Audit-form layout::
|
||||
|
||||
## <header> ← optional H2 (usually entry id, for grep)
|
||||
|
||||
**key**: value ← inline fields, one per line
|
||||
**key2**: value2
|
||||
|
||||
### Section Title ← section fields: H3 + free-form text
|
||||
body content...
|
||||
|
||||
### Another Section
|
||||
more content...
|
||||
|
||||
The audit chassis is intentionally **type-agnostic** — every field
|
||||
round-trips as a string. Inline values are stringified on render
|
||||
(lists become ``[a, b, c]``, scalars use ``str()``); on parse
|
||||
everything is the raw text after the colon. Section titles are kept
|
||||
verbatim. This keeps parsing tolerant of stray fields, wrapped
|
||||
strings, and manually-typed timestamps; the strong-typed model lives
|
||||
in business writers + the SQLite/LanceDB indexes.
|
||||
|
||||
Cross-user uniqueness is handled at the database layer via a composite
|
||||
``<user_id>_<entry_id>`` field; it is *not* encoded into the
|
||||
:class:`EntryId` string itself.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as _dt
|
||||
import re
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Self
|
||||
|
||||
# ── EntryId — structured id for marker stamping ─────────────────────────
|
||||
|
||||
# strftime / strptime pattern for the date segment of an entry id.
_DATE_FMT = "%Y%m%d"
# Zero-pad width applied to the seq segment when an EntryId is formatted.
_SEQ_DIGITS = 8
"""Minimum zero-padding for the in-file seq.

8 digits keeps lexicographic order == numeric order up to 10**8
entries per file (per user, per day). ``format()`` is "at least 8" —
larger seqs emit more digits without truncation. ``parse`` is
permissive: shorter (legacy 4-digit) and longer seq strings both
parse cleanly; format normalises to >= 8 digits on round-trip.
"""
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class EntryId:
|
||||
"""Parsed components of an entry id (``<prefix>_<YYYYMMDD>_<NNNN>``)."""
|
||||
|
||||
prefix: str
|
||||
date: _dt.date
|
||||
seq: int
|
||||
|
||||
def format(self) -> str:
|
||||
"""Render as ``<prefix>_<YYYYMMDD>_<NNNN>``."""
|
||||
return (
|
||||
f"{self.prefix}_{self.date.strftime(_DATE_FMT)}_{self.seq:0{_SEQ_DIGITS}d}"
|
||||
)
|
||||
|
||||
def __str__(self) -> str: # noqa: D401
|
||||
return self.format()
|
||||
|
||||
@classmethod
|
||||
def parse(cls, s: str) -> Self:
|
||||
"""Parse ``<prefix>_<YYYYMMDD>_<NNNN>``.
|
||||
|
||||
Uses ``rsplit("_", 2)`` so a multi-segment prefix (rare, but
|
||||
possible) is preserved as-is.
|
||||
"""
|
||||
parts = s.rsplit("_", 2)
|
||||
if len(parts) != 3:
|
||||
raise ValueError(f"invalid entry id format: {s!r}")
|
||||
prefix, date_str, seq_str = parts
|
||||
if not prefix:
|
||||
raise ValueError(f"empty prefix in entry id: {s!r}")
|
||||
try:
|
||||
d = _dt.datetime.strptime(date_str, _DATE_FMT).date()
|
||||
except ValueError as exc:
|
||||
raise ValueError(f"invalid date in entry id: {s!r}") from exc
|
||||
try:
|
||||
seq = int(seq_str)
|
||||
except ValueError as exc:
|
||||
raise ValueError(f"invalid seq in entry id: {s!r}") from exc
|
||||
if seq < 0:
|
||||
raise ValueError(f"negative seq in entry id: {s!r}")
|
||||
return cls(prefix=prefix, date=d, seq=seq)
|
||||
|
||||
@classmethod
|
||||
def next_for(cls, prefix: str, date: _dt.date, current_count: int) -> Self:
|
||||
"""Build the id for the next entry given the file's current count.
|
||||
|
||||
``current_count`` is the value of ``frontmatter.entry_count``
|
||||
*before* this append. The new id gets ``seq = current_count + 1``.
|
||||
"""
|
||||
if current_count < 0:
|
||||
raise ValueError(f"current_count must be >= 0, got {current_count}")
|
||||
return cls(prefix=prefix, date=date, seq=current_count + 1)
|
||||
|
||||
|
||||
# ── Entry — marker-delimited span inside a body ─────────────────────────
|
||||
|
||||
# Marker ids are limited to a filename- and URL-safe alphabet.
_ID_PATTERN = r"[A-Za-z0-9_-]+"
# Open marker ``<!-- entry:<id> -->``; group 1 captures the id.
_OPEN_RE = re.compile("<!-- entry:(" + _ID_PATTERN + ") -->")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Entry:
    """A single marker-delimited span found inside a markdown body.

    Attributes:
        id: The id carried by the ``<!-- entry:<id> -->`` open marker.
        body: Text between the open and close markers, minus one leading
            and one trailing newline.
        start: Offset of the open marker in the source body.
        end: Offset just past the close marker in the source body.
    """

    id: str
    body: str
    start: int
    end: int

    def as_structured(self) -> StructuredEntry:
        """Return this entry with its body parsed as audit-form.

        Delegates to :func:`parse_structured_entry`, passing ``self`` as
        the origin so ``id`` / ``body`` / ``start`` / ``end`` carry over
        and ``header`` / ``inline`` / ``sections`` are added.
        """
        structured = parse_structured_entry(self.body, _origin=self)
        return structured
|
||||
|
||||
|
||||
def split_entries(body: str) -> list[Entry]:
    """Collect every complete entry in ``body``, in document order.

    Scanning proceeds from one open marker to its matching close marker
    and resumes after that close marker. The first open marker without a
    matching close marker ends the scan; that partial entry and anything
    after it are not returned. Callers needing strict validation should
    add their own check.
    """
    found: list[Entry] = []
    cursor = 0
    while (opener := _OPEN_RE.search(body, cursor)) is not None:
        marker_id = opener.group(1)
        # The close marker is fixed text, so a literal search suffices.
        close_marker = f"<!-- /entry:{marker_id} -->"
        close_at = body.find(close_marker, opener.end())
        if close_at == -1:
            # Unterminated entry — stop scanning altogether.
            break
        cursor = close_at + len(close_marker)
        inner = body[opener.end() : close_at]
        found.append(
            Entry(
                id=marker_id,
                body=_strip_one_newline(inner),
                start=opener.start(),
                end=cursor,
            )
        )
    return found
|
||||
|
||||
|
||||
def find_entry(body: str, entry_id: str) -> Entry | None:
    """Return the first complete entry carrying ``entry_id``, else ``None``.

    ``None`` is returned both when no open marker for ``entry_id``
    exists and when the first such open marker has no close marker
    after it.
    """
    open_marker = f"<!-- entry:{entry_id} -->"
    open_at = body.find(open_marker)
    if open_at == -1:
        return None
    inner_start = open_at + len(open_marker)

    close_marker = f"<!-- /entry:{entry_id} -->"
    close_at = body.find(close_marker, inner_start)
    if close_at == -1:
        return None

    return Entry(
        id=entry_id,
        body=_strip_one_newline(body[inner_start:close_at]),
        start=open_at,
        end=close_at + len(close_marker),
    )
|
||||
|
||||
|
||||
def _close_re_for(entry_id: str) -> re.Pattern[str]:
    """Compile a pattern matching the close marker of ``entry_id`` literally."""
    literal_id = re.escape(entry_id)
    return re.compile("<!-- /entry:" + literal_id + " -->")
|
||||
|
||||
|
||||
def _strip_one_newline(text: str) -> str:
    """Drop at most one newline from each end of ``text``.

    ``\\r\\n`` is tried before ``\\n`` on both sides, so a CRLF is
    removed as a unit rather than leaving a stray ``\\r`` behind.
    """
    newlines = ("\r\n", "\n")
    for newline in newlines:
        if text.startswith(newline):
            text = text[len(newline) :]
            break
    for newline in newlines:
        if text.endswith(newline):
            text = text[: -len(newline)]
            break
    return text
|
||||
|
||||
|
||||
# ── StructuredEntry — Entry + parsed audit-form fields ──────────────────
|
||||
|
||||
# NOTE: in all three patterns the whitespace between the marker (``##``,
# ``###``, ``**key**:``) and its payload is ``[ \t]`` rather than ``\s``.
# ``\s`` also matches newlines, which let an empty payload (for example
# ``**key**: `` as rendered for a ``None`` value) capture the *next*
# line as its value and hide that line from later matches.

# H2 line: ``## <header>``.
_H2_RE = re.compile(r"^##[ \t]+(.+?)\s*$", re.MULTILINE)
# Inline field: ``**key**: value``. Anchored to line start so a stray
# ``**emphasis**`` mid-paragraph isn't mistaken for a field. The value
# may be empty; trailing whitespace (including ``\r``) is not captured.
_INLINE_RE = re.compile(
    r"^\*\*(?P<key>[^*\n]+?)\*\*:[ \t]*(?P<value>.*?)\s*$",
    re.MULTILINE,
)
# H3 line: ``### Title``.
_H3_RE = re.compile(r"^###[ \t]+(.+?)\s*$", re.MULTILINE)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class StructuredEntry(Entry):
    """An :class:`Entry` together with the audit-form view of its body.

    ``id`` / ``body`` / ``start`` / ``end`` come from :class:`Entry`.
    Three parsed views are added on top: the optional H2 ``header``,
    the ``**key**: value`` map (``inline``) and the ``### Title``
    sections. All parsed values are plain strings; converting them to
    richer types is left to the caller.
    """

    header: str | None = None
    inline: dict[str, str] = field(default_factory=dict)
    sections: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
def render_structured_entry(
    *,
    header: str | None = None,
    inline: Mapping[str, object] | None = None,
    sections: Mapping[str, str] | None = None,
) -> str:
    """Render an audit-form entry body.

    Args:
        header: Optional H2 line placed first, followed by a blank line.
            Skipped when falsy.
        inline: ``{key: value}`` pairs, each emitted as
            ``**key**: <value>`` with the value stringified by
            ``_render_value``.
        sections: ``{title: body}`` pairs, each emitted as a blank line,
            ``### <title>`` verbatim, then the body with trailing
            whitespace removed.

    Returns:
        The lines joined with ``\\n``, without a trailing newline;
        markers and surrounding newlines are the caller's concern.
    """
    out: list[str] = []

    if header:
        out += [f"## {header}", ""]

    out.extend(
        f"**{key}**: {_render_value(value)}" for key, value in (inline or {}).items()
    )

    for title, text in (sections or {}).items():
        out += ["", f"### {title}", text.rstrip()]

    return "\n".join(out)
|
||||
|
||||
|
||||
def parse_structured_entry(
    body: str, *, _origin: Entry | None = None
) -> StructuredEntry:
    """Parse an audit-form entry body into strings — no type coercion.

    The body is split on H3 lines. Everything before the first H3 is
    the head: the first H2 line found there becomes ``header`` (``None``
    if absent) and every ``**key**: value`` line there becomes an
    ``inline`` item. Each H3 title maps to the text that follows it, up
    to the next H3; unrecognised lines stay inside that section's text.
    A repeated key or title keeps the last occurrence.

    Args:
        body: Raw entry body.
        _origin: Entry the body came from. When given, its ``id`` /
            ``body`` / ``start`` / ``end`` are copied to the result;
            otherwise they are ``""`` / ``body`` / ``0`` / ``len(body)``.

    Returns:
        A :class:`StructuredEntry` whose parsed values are all strings.
    """
    text = body.strip("\n")

    # Splitting on a one-group pattern yields
    # [head, title1, content1, title2, content2, ...].
    head, *tail = _H3_RE.split(text)
    sections: dict[str, str] = {}
    for raw_title, raw_content in zip(tail[0::2], tail[1::2]):
        sections[raw_title.strip()] = raw_content.strip("\n").rstrip()

    h2_match = _H2_RE.search(head)
    header = h2_match.group(1).strip() if h2_match else None

    inline: dict[str, str] = {}
    for field_match in _INLINE_RE.finditer(head):
        inline[field_match.group("key").strip()] = field_match.group("value").strip()

    if _origin is None:
        ident, raw_body, start, end = "", body, 0, len(body)
    else:
        ident, raw_body = _origin.id, _origin.body
        start, end = _origin.start, _origin.end

    return StructuredEntry(
        id=ident,
        body=raw_body,
        start=start,
        end=end,
        header=header,
        inline=inline,
        sections=sections,
    )
|
||||
|
||||
|
||||
def _render_value(value: object) -> str:
    """Turn an inline value into its audit-form text.

    ``None`` becomes the empty string, a ``list`` or ``tuple`` becomes
    ``[a, b, c]`` with each item passed through ``str()``, and anything
    else is ``str(value)``.
    """
    if value is None:
        return ""
    if not isinstance(value, (list, tuple)):
        return str(value)
    joined = ", ".join(map(str, value))
    return f"[{joined}]"
|
||||
300
src/everos/core/persistence/markdown/frontmatter.py
Normal file
300
src/everos/core/persistence/markdown/frontmatter.py
Normal file
@ -0,0 +1,300 @@
|
||||
"""Frontmatter — YAML block parse / dump + L1 schema chassis.
|
||||
|
||||
Frontmatter is the leading ``---``-delimited YAML block at the top of
|
||||
a markdown document::
|
||||
|
||||
---
|
||||
title: Hello
|
||||
tags: [a, b]
|
||||
---
|
||||
# Body starts here
|
||||
|
||||
Two complementary surfaces live here:
|
||||
|
||||
1. :func:`parse_frontmatter` / :func:`dump_frontmatter` — schema-free
|
||||
YAML helpers (``yaml.safe_load`` / ``yaml.safe_dump``,
|
||||
``sort_keys=False`` so caller-controlled key order is preserved).
|
||||
|
||||
2. The L1 chassis classes — :class:`BaseFrontmatter`,
|
||||
:class:`UserScopedFrontmatter`, :class:`AgentScopedFrontmatter` —
|
||||
which fix the *absolute-readonly* fields (``id`` / ``type`` /
|
||||
``schema_version``) plus scope (``user_id`` / ``agent_id`` +
|
||||
``track``). Every business frontmatter schema in
|
||||
``infra/persistence/markdown/mds/`` subclasses one of these.
|
||||
|
||||
Concrete business schemas (``UserMemcellDailyFrontmatter``,
|
||||
``SkillFrontmatter``, …) live in ``infra``; they add per-record
|
||||
business fields plus the path-resolution metadata daily-log writers
|
||||
need (``ENTRY_ID_PREFIX`` / ``DIR_NAME`` / ``FILE_PREFIX``).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, ClassVar, Literal
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
# ── YAML helpers ────────────────────────────────────────────────────────
|
||||
|
||||
# Delimiter line that opens and closes the frontmatter block.
_DELIM = "---"
|
||||
|
||||
|
||||
def parse_frontmatter(text: str) -> tuple[dict[str, Any], str]:
    """Split a leading ``---\\n...\\n---\\n`` YAML block off ``text``.

    Returns:
        ``(meta, remainder)``. ``meta`` is the parsed mapping and
        ``remainder`` is everything after the closing delimiter line
        (the single newline following that line is dropped).

    Notes:
        - ``({}, text)`` is returned unchanged when ``text`` does not
          start with ``---``, when the opening ``---`` is not directly
          followed by a newline, when no closing ``---`` line exists,
          or when the YAML parses to something other than a mapping.
        - An empty or whitespace-only block (``---\\n---\\n``) yields
          ``({}, remainder)``.
        - YAML errors are not handled here: whatever ``yaml.safe_load``
          raises for an invalid block propagates to the caller.
    """
    if not text.startswith(_DELIM):
        return {}, text

    # The opening delimiter must be followed immediately by a newline.
    after_open = text[len(_DELIM) :]
    for newline in ("\r\n", "\n"):
        if after_open.startswith(newline):
            after_open = after_open[len(newline) :]
            break
    else:
        return {}, text

    close_at = _find_closing_delim(after_open)
    if close_at is None:
        return {}, text

    yaml_src = after_open[:close_at]
    remainder = after_open[close_at + len(_DELIM) :]
    # Drop the one newline that terminates the closing delimiter line.
    for newline in ("\r\n", "\n"):
        if remainder.startswith(newline):
            remainder = remainder[len(newline) :]
            break

    meta: Any = yaml.safe_load(yaml_src) if yaml_src.strip() else {}
    if meta is None:
        return {}, remainder
    if isinstance(meta, dict):
        return meta, remainder
    # Frontmatter must be a mapping; otherwise treat the text as body.
    return {}, text
|
||||
|
||||
|
||||
def dump_frontmatter(meta: Mapping[str, Any]) -> str:
    """Serialise ``meta`` as a ``---\\n<yaml>\\n---\\n`` block.

    A falsy (empty) mapping produces the empty string with no
    delimiters. Keys are emitted in the caller's order
    (``sort_keys=False``), in block style, with unicode left unescaped.
    """
    if not meta:
        return ""
    payload = yaml.safe_dump(
        dict(meta),
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
    return "".join((_DELIM, "\n", payload, _DELIM, "\n"))
|
||||
|
||||
|
||||
def _find_closing_delim(text: str) -> int | None:
    """Locate the first line of ``text`` that is exactly ``---``.

    Lines are delimited by ``\\n``; trailing ``\\r`` characters on a
    line are ignored for the comparison.

    Returns:
        Offset of the first character of the matching line, or ``None``
        when no line matches.
    """
    size = len(text)
    start = 0
    while start < size:
        stop = text.find("\n", start)
        if stop == -1:
            # Last line has no terminating newline.
            stop = size
        if text[start:stop].rstrip("\r") == _DELIM:
            return start
        start = stop + 1
    return None
|
||||
|
||||
|
||||
# ── L1 schema chassis ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
class BaseFrontmatter(BaseModel):
    """Common L1 fields carried by every markdown frontmatter.

    Per the EverOS Markdown First spec these belong to the
    *absolute-readonly* tier: they identify a record across
    markdown ↔ LanceDB and must never be rewritten by a human edit.

    Concrete schemas derive from a scope class
    (``UserScopedFrontmatter`` / ``AgentScopedFrontmatter``) and add their
    own per-record business fields.
    """

    # Top-level directory under the memory-root that holds this kind.
    # Scope classes set it to ``"users"`` / ``"agents"``. Scope-agnostic
    # schemas (rare) keep the empty default; consumers that must resolve a
    # path (writers, layout reverse-lookup) must reject schemas whose
    # ``SCOPE_DIR`` is empty.
    SCOPE_DIR: ClassVar[str] = ""

    id: str
    type: str
    schema_version: int = 1

    # ``extra="allow"`` lets L2 system-managed metadata (``md_sha256``,
    # ``last_indexed_at``, ``lsn``, …) travel on the same model without
    # every subclass having to redeclare those fields.
    model_config = ConfigDict(extra="allow")

    @classmethod
    def path_glob(cls) -> str:
        """Return the ``fnmatch``-style glob (relative to memory-root)
        matching every markdown file described by this schema.

        The cascade kind registry walks each kind's ``path_glob()`` to
        enumerate eligible files, so path patterns are never hard-coded in
        cascade: the schema is the single source of truth for both the
        writer's path resolution and the scanner's enumeration.

        This base version is an abstract stub. Subclasses override it,
        typically by listing a path mixin (:class:`DailyLogPathMixin`,
        :class:`SkillPathMixin`, :class:`ProfilePathMixin`) *before* the
        scope class so the mixin's implementation wins in the MRO.

        Raises:
            NotImplementedError: Always, naming the offending class.
        """
        remedy = "(mix in DailyLogPathMixin / SkillPathMixin, or override directly)"
        raise NotImplementedError(f"{cls.__name__} must declare path_glob() {remedy}")
|
||||
|
||||
|
||||
class DailyLogPathMixin:
    """Glob strategy for daily-log markdown files.

    Layout: ``<SCOPE_DIR>/<scope_id>/<DIR_NAME>/<FILE_PREFIX>-<YYYY-MM-DD>.md``.

    ``SCOPE_DIR`` is expected to come from a scope class
    (``UserScopedFrontmatter`` / ``AgentScopedFrontmatter``); the concrete
    schema declares ``DIR_NAME`` and ``FILE_PREFIX`` itself.

    List **this mixin first** in the bases so the MRO picks this
    ``path_glob()`` instead of the ``NotImplementedError`` stub on
    :meth:`BaseFrontmatter.path_glob`::

        class EpisodeDailyFrontmatter(DailyLogPathMixin, UserScopedFrontmatter):
            DIR_NAME: ClassVar[str] = "episodes"
            FILE_PREFIX: ClassVar[str] = "episode"
            ...
    """

    DIR_NAME: ClassVar[str]
    FILE_PREFIX: ClassVar[str]
    SCOPE_DIR: ClassVar[str]

    @classmethod
    def path_glob(cls) -> str:
        """Return the glob covering every daily-log file of this kind."""
        scope, directory, prefix = cls.SCOPE_DIR, cls.DIR_NAME, cls.FILE_PREFIX
        # The leading ``*/*/`` stands for the <app>/<project> prefix in front
        # of every user-visible dir. The scanner's ``root.glob`` is anchored
        # at root, so nothing would match without it; the watcher's
        # right-anchored ``PurePosixPath.match`` accepts the same shape.
        return f"*/*/{scope}/*/{directory}/{prefix}-*.md"
|
||||
|
||||
|
||||
class SkillPathMixin:
    """Glob strategy for skill-directory markdown files.

    Layout: ``<SCOPE_DIR>/<scope_id>/<SKILLS_CONTAINER_NAME>/
    <SKILL_DIR_PREFIX><skill_name>/<SKILL_MAIN_FILENAME>``.

    Only each skill's main file is matched. Sibling ``references/*.md`` and
    ``scripts/*`` files are not part of the glob; how they are folded into
    the index is described on :class:`AgentSkillFrontmatter`.

    List **this mixin first** so the MRO resolves ``path_glob()`` here::

        class AgentSkillFrontmatter(SkillPathMixin, AgentScopedFrontmatter):
            SKILLS_CONTAINER_NAME: ClassVar[str] = "skills"
            SKILL_DIR_PREFIX: ClassVar[str] = "skill_"
            SKILL_MAIN_FILENAME: ClassVar[str] = "SKILL.md"
            ...
    """

    SKILLS_CONTAINER_NAME: ClassVar[str]
    SKILL_DIR_PREFIX: ClassVar[str]
    SKILL_MAIN_FILENAME: ClassVar[str]
    SCOPE_DIR: ClassVar[str]

    @classmethod
    def path_glob(cls) -> str:
        """Return the glob covering the main file of every skill."""
        scope = cls.SCOPE_DIR
        container = cls.SKILLS_CONTAINER_NAME
        dir_prefix = cls.SKILL_DIR_PREFIX
        main_file = cls.SKILL_MAIN_FILENAME
        # The leading ``*/*/`` stands for the <app>/<project> scope prefix.
        return f"*/*/{scope}/*/{container}/{dir_prefix}*/{main_file}"
|
||||
|
||||
|
||||
class ProfilePathMixin:
    """Glob strategy for single-file profile markdown.

    Layout: ``<SCOPE_DIR>/<scope_id>/<PROFILE_FILENAME>`` — one fixed-name
    file sitting directly in the owner's directory. There is no
    intermediate ``<dir>/`` segment (as daily-logs have) and no per-name
    subdirectory (as skills have).

    ``SCOPE_DIR`` is expected to come from a scope class
    (``UserScopedFrontmatter`` / ``AgentScopedFrontmatter``); the concrete
    schema declares ``PROFILE_FILENAME`` itself.

    List **this mixin first** so the MRO resolves ``path_glob()`` here::

        class UserProfileFrontmatter(ProfilePathMixin, UserScopedFrontmatter):
            PROFILE_FILENAME: ClassVar[str] = "user.md"
            ...
    """

    PROFILE_FILENAME: ClassVar[str]
    SCOPE_DIR: ClassVar[str]

    @classmethod
    def path_glob(cls) -> str:
        """Return the glob covering every owner's profile file."""
        scope, filename = cls.SCOPE_DIR, cls.PROFILE_FILENAME
        # The leading ``*/*/`` stands for the <app>/<project> scope prefix.
        return f"*/*/{scope}/*/{filename}"
|
||||
|
||||
|
||||
class UserScopedFrontmatter(BaseFrontmatter):
    """Frontmatter for records owned by one user (``track`` = ``"user"``).

    Only the *file-level* scope is stored here: ``user_id``, which the
    file's path already expresses. Business attributes such as
    ``group_id`` are kept inside each entry's structured body instead —
    see :class:`StructuredEntry` in :mod:`.entries`.
    """

    # Records of this scope live under ``users/`` in the memory-root.
    SCOPE_DIR: ClassVar[str] = "users"

    user_id: str
    # Fixed discriminator: the only accepted value is ``"user"``.
    track: Literal["user"] = "user"
|
||||
|
||||
|
||||
class AgentScopedFrontmatter(BaseFrontmatter):
    """Frontmatter for records owned by one agent (``track`` = ``"agent"``).

    Follows the same scope-vs-business split as
    :class:`UserScopedFrontmatter`: ``agent_id`` is the file-level scope,
    while ``group_id`` and similar attributes travel on each entry rather
    than on the file frontmatter.
    """

    # Records of this scope live under ``agents/`` in the memory-root.
    SCOPE_DIR: ClassVar[str] = "agents"

    agent_id: str
    # Fixed discriminator: the only accepted value is ``"agent"``.
    track: Literal["agent"] = "agent"
|
||||
31
src/everos/core/persistence/markdown/parsed.py
Normal file
31
src/everos/core/persistence/markdown/parsed.py
Normal file
@ -0,0 +1,31 @@
|
||||
"""Parsed-markdown data type.
|
||||
|
||||
The output shape of :class:`MarkdownReader` is held here, separate
|
||||
from the reader implementation: callers that only consume parse
|
||||
results don't need to import the reader machinery, and downstream
|
||||
modules (writer, business readers) can produce :class:`ParsedMarkdown`
|
||||
without going through ``MarkdownReader.read`` if they already hold
|
||||
the pieces.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from .entries import Entry
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParsedMarkdown:
    """Immutable result of parsing one markdown document.

    Attributes:
        frontmatter: The parsed YAML mapping; an empty dict when the
            document has no frontmatter block.
        body: The text that follows the frontmatter block. Entries are
            *not* stripped out of it.
        entries: The marker-delimited entries found inside ``body``.
            Defaults to a fresh empty list per instance.
    """

    frontmatter: dict[str, Any]
    body: str
    entries: list[Entry] = field(default_factory=list)
|
||||
42
src/everos/core/persistence/markdown/reader.py
Normal file
42
src/everos/core/persistence/markdown/reader.py
Normal file
@ -0,0 +1,42 @@
|
||||
"""Markdown file reader.
|
||||
|
||||
Loads a markdown document and splits it into:
|
||||
|
||||
1. ``frontmatter`` — parsed YAML (empty dict if absent)
|
||||
2. ``body`` — raw text after the closing ``---`` delimiter
|
||||
3. ``entries`` — marker-delimited spans inside ``body``
|
||||
|
||||
The reader is purely parsing; it does not validate frontmatter shape,
|
||||
entry content, or cross-references. Higher layers add business-aware
|
||||
checks. The :class:`ParsedMarkdown` data type lives in :mod:`.parsed`.
|
||||
|
||||
``parse`` is sync (pure in-memory string processing). ``read`` is async
|
||||
and uses :class:`anyio.Path` so file I/O does not block the event loop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import anyio
|
||||
|
||||
from .entries import split_entries
|
||||
from .frontmatter import parse_frontmatter
|
||||
from .parsed import ParsedMarkdown
|
||||
|
||||
|
||||
class MarkdownReader:
    """Turn markdown text or files into :class:`ParsedMarkdown` values."""

    @staticmethod
    def parse(text: str) -> ParsedMarkdown:
        """Parse text that is already in memory; performs no IO.

        The frontmatter is split off first, then entries are discovered
        in the remaining body.
        """
        frontmatter, remainder = parse_frontmatter(text)
        return ParsedMarkdown(
            frontmatter=frontmatter,
            body=remainder,
            entries=split_entries(remainder),
        )

    @staticmethod
    async def read(path: Path) -> ParsedMarkdown:
        """Load ``path`` as UTF-8 via :class:`anyio.Path` and parse it."""
        source = anyio.Path(path)
        content = await source.read_text(encoding="utf-8")
        return MarkdownReader.parse(content)
|
||||
269
src/everos/core/persistence/markdown/writer.py
Normal file
269
src/everos/core/persistence/markdown/writer.py
Normal file
@ -0,0 +1,269 @@
|
||||
"""Markdown file writer with atomic write semantics.
|
||||
|
||||
Atomicity is provided by writing to a same-directory temp file
|
||||
(``.<name>.tmp.<uuid>``) and using :func:`os.replace` to rename it onto
|
||||
the target. Keeping the temp file in the same directory guarantees the
|
||||
rename is on the same filesystem (POSIX rename is atomic only within a
|
||||
single fs).
|
||||
|
||||
All public methods are async. File I/O (``read_text`` / ``write_text``
|
||||
/ ``mkdir``) goes through :class:`anyio.Path`; the few syscalls without
|
||||
a native async equivalent (``os.fsync`` / ``os.replace`` / ``unlink``
|
||||
in the cleanup path) are offloaded via :func:`anyio.to_thread.run_sync`.
|
||||
|
||||
In-process per-path locking
|
||||
---------------------------
|
||||
:meth:`append_entry` / :meth:`append_entries` are read-modify-write of
|
||||
the whole file (load frontmatter+body, merge an entry block, atomic
|
||||
write the result). The atomic write itself is safe, but the read→write
|
||||
window crosses ``await`` points. Concurrent asyncio tasks targeting the
|
||||
same path would otherwise lose-update each other (both read N entries,
|
||||
both produce N+1, second write overwrites the first → 1 entry lost).
|
||||
|
||||
To prevent this, an in-process per-path :class:`asyncio.Lock` is held
|
||||
across the entire read-modify-write sequence. Lock objects live on the
|
||||
writer instance (not class-level) so they bind to the event loop active
|
||||
when the writer was constructed — this avoids the
|
||||
"Lock bound to different loop" failure mode that surfaces when
|
||||
pytest-asyncio rebuilds the loop between tests but module-level writer
|
||||
singletons leak Lock objects across boundaries.
|
||||
|
||||
Process-level coordination (multi-process writers against the same
|
||||
memory-root) remains the job of
|
||||
:func:`everos.core.persistence.locking.memory_root_lock`, which uses
|
||||
``fcntl.flock``. The two locks compose: per-path async lock serialises
|
||||
tasks within one process, ``memory_root_lock`` serialises processes
|
||||
against each other.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import os
|
||||
import uuid
|
||||
from collections.abc import Mapping, Sequence
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import anyio
|
||||
|
||||
from ..memory_root import MemoryRoot
|
||||
from .entries import EntryId
|
||||
from .frontmatter import dump_frontmatter
|
||||
from .reader import MarkdownReader
|
||||
|
||||
|
||||
class MarkdownWriter:
    """Atomic writer for markdown files inside a memory-root.

    Each write is staged in a temp file next to the target and then moved
    onto it with :func:`os.replace`. Read-modify-write helpers
    (:meth:`append_entry` / :meth:`append_entries`) additionally hold an
    in-process per-path :class:`asyncio.Lock` for the whole cycle.

    The ``memory_root`` reference is held to enable future enforcement that
    targets stay within the configured root; current writes do not depend on
    it for the rename itself (same-dir temp file).
    """

    def __init__(self, memory_root: MemoryRoot) -> None:
        """Bind the writer to ``memory_root`` and start an empty lock registry."""
        self._memory_root = memory_root
        # Per-path async lock registry, keyed by resolved path. New locks
        # are published with ``dict.setdefault`` (see ``lock_for``), so two
        # callers racing on the first use of a path end up with the same
        # Lock object.
        # Plain dict (not WeakValueDictionary): a Lock with pending waiters
        # must outlive any task awaiting it; ref-counted GC would race with
        # those waiters. See Python bpo-28427 for the WeakValueDictionary
        # multithreading hazard that bites the weak-ref approach.
        self._path_locks: dict[Path, asyncio.Lock] = {}

    @property
    def memory_root(self) -> MemoryRoot:
        """The memory-root this writer was constructed with."""
        return self._memory_root

    def lock_for(self, path: Path) -> asyncio.Lock:
        """Return the per-path lock; create on first use.

        Public so that higher-level writers (e.g. :class:`BaseDailyWriter`)
        can serialise their own multi-step ``read → compute → write``
        sequences against this writer's single-step ``append`` paths.
        Pair with :meth:`_append_entries_unlocked` to avoid re-acquiring
        the same lock from within an already-locked critical section
        (``asyncio.Lock`` is *not* reentrant).

        Args:
            path: Target file; relative paths and symlinks are resolved.

        Returns:
            The single lock object shared by every alias of ``path``.
        """
        # Resolve to an absolute canonical path so aliases (relative vs.
        # absolute, symlinks) share the same lock object.
        key = Path(path).resolve()
        lock = self._path_locks.get(key)
        if lock is None:
            # ``setdefault`` returns the already-registered lock if another
            # caller inserted one first, so a lock is never replaced.
            lock = self._path_locks.setdefault(key, asyncio.Lock())
        return lock

    async def write(self, path: Path, content: str) -> Path:
        """Atomically write ``content`` to ``path``.

        Steps:
            1. ``mkdir -p`` the parent directory.
            2. Write to ``<parent>/.<name>.tmp.<uuid>``.
            3. ``flush`` + ``fsync`` the temp file.
            4. ``os.replace`` the temp file onto ``path`` (atomic on POSIX).

        If staging or the rename fails — including when the task is
        cancelled in between — the temp file is removed (best effort) and
        the original exception propagates.

        Returns:
            ``path`` as a :class:`~pathlib.Path` (not resolved).
        """
        target = Path(path)
        await anyio.Path(target.parent).mkdir(parents=True, exist_ok=True)
        tmp = target.parent / f".{target.name}.tmp.{uuid.uuid4().hex}"
        try:
            await anyio.to_thread.run_sync(_write_and_fsync, tmp, content)
            await anyio.to_thread.run_sync(os.replace, tmp, target)
        except BaseException:
            # ``BaseException`` rather than ``Exception``: cancellation is
            # delivered as ``CancelledError`` (a ``BaseException``) and would
            # otherwise leave the staging file behind. The cleanup awaits,
            # so shield it — inside a cancelled scope an unshielded await
            # would be cancelled again before the unlink runs.
            with anyio.CancelScope(shield=True):
                await _unlink_quiet(tmp)
            raise
        return target

    async def write_markdown(
        self,
        path: Path,
        *,
        frontmatter: Mapping[str, Any] | None = None,
        body: str = "",
    ) -> Path:
        """Assemble ``frontmatter`` + ``body`` then atomic-write to ``path``.

        ``None`` or an empty ``frontmatter`` is passed to
        ``dump_frontmatter`` as an empty mapping.

        Returns:
            ``path`` as a :class:`~pathlib.Path` (not resolved).
        """
        head = dump_frontmatter(frontmatter or {})
        return await self.write(path, head + body)

    async def append_entry(
        self,
        path: Path,
        *,
        entry_body: str,
        entry_id: EntryId,
        frontmatter_updates: Mapping[str, Any] | None = None,
    ) -> Path:
        """Append a single entry block to a markdown file, merging frontmatter.

        Convenience wrapper around :meth:`append_entries` for single-entry
        callers. See that method for full semantics.

        Args:
            path: Target markdown file. Created if missing.
            entry_body: Content between the open and close markers.
                A newline is inserted after the open marker and before
                the close marker automatically.
            entry_id: The id to stamp on this entry. The caller normally
                builds it with :meth:`EntryId.next_for`.
            frontmatter_updates: Mapping shallow-merged into existing
                frontmatter (later wins). ``None`` skips the merge.

        Returns:
            ``path`` as a :class:`~pathlib.Path` (not resolved).
        """
        return await self.append_entries(
            path,
            [(entry_body, entry_id)],
            frontmatter_updates=frontmatter_updates,
        )

    async def append_entries(
        self,
        path: Path,
        entries: Sequence[tuple[str, EntryId]],
        *,
        frontmatter_updates: Mapping[str, Any] | None = None,
    ) -> Path:
        """Append ``N`` entry blocks in a single locked read-modify-write cycle.

        Compared with calling :meth:`append_entry` ``N`` times, this:

        * Performs one file read + one file write instead of ``N`` of each
          (IO complexity drops from ``O(N²)`` to ``O(N)`` when the file
          already holds many entries).
        * Holds the per-path lock for one short critical section instead of
          ``N`` separate acquisitions.
        * Updates ``frontmatter`` once (no intermediate ``entry_count``
          flapping).

        The caller assigns and supplies every :class:`EntryId`; this method
        only stamps them. The order in ``entries`` is the order the blocks
        land in the file.

        Args:
            path: Target markdown file. Created if missing.
            entries: ``(entry_body, entry_id)`` pairs to append, in order.
                Empty sequence is allowed; the file is still rewritten,
                including any frontmatter updates supplied.
            frontmatter_updates: Mapping shallow-merged into existing
                frontmatter (later wins). ``None`` skips the merge.

        Returns:
            ``path`` as a :class:`~pathlib.Path` (not resolved).
        """
        target = Path(path)
        async with self.lock_for(target):
            return await self._append_entries_unlocked(
                target,
                entries,
                frontmatter_updates=frontmatter_updates,
            )

    async def _append_entries_unlocked(
        self,
        path: Path,
        entries: Sequence[tuple[str, EntryId]],
        *,
        frontmatter_updates: Mapping[str, Any] | None = None,
    ) -> Path:
        """Same as :meth:`append_entries` but assumes the caller already
        holds :meth:`lock_for` ``(path)``.

        For use by higher-level writers that perform a multi-step
        ``read → compute eid → write`` sequence and need to keep the lock
        held across the read and the write. Public ``append_entries`` /
        ``append_entry`` always wrap this with the lock.

        This method never acquires the lock itself (``asyncio.Lock`` is not
        reentrant, so doing so under a caller-held lock would deadlock).
        Calling it *without* holding the lock forfeits the lost-update
        protection.
        """
        target = Path(path)

        # 1. Load existing markdown (or initialise empty).
        if await anyio.Path(target).is_file():
            parsed = await MarkdownReader.read(target)
            meta: dict[str, Any] = dict(parsed.frontmatter)
            body = parsed.body
        else:
            meta = {}
            body = ""

        # 2. Shallow-merge frontmatter updates (later wins).
        if frontmatter_updates:
            meta.update(frontmatter_updates)

        # 3. Append all entry blocks in order.
        if entries:
            # Make sure the first open marker starts on its own line.
            if body and not body.endswith("\n"):
                body += "\n"
            appended_blocks: list[str] = []
            for entry_body, entry_id in entries:
                eid_str = entry_id.format()
                appended_blocks.append(
                    f"<!-- entry:{eid_str} -->\n{entry_body}\n"
                    f"<!-- /entry:{eid_str} -->\n"
                )
            body = body + "".join(appended_blocks)

        # 4. Atomic write.
        return await self.write_markdown(target, frontmatter=meta, body=body)
|
||||
|
||||
|
||||
def _write_and_fsync(tmp: Path, content: str) -> None:
    """Write ``content`` to the staging file ``tmp`` and fsync it.

    Synchronous on purpose: the async writer runs it in a worker thread.
    The file is opened in text mode with UTF-8 encoding and truncated if
    it already exists.
    """
    with open(tmp, "w", encoding="utf-8") as staging:
        staging.write(content)
        # Drain Python's own buffer first; ``os.fsync`` acts on the file
        # descriptor and knows nothing about data still buffered here.
        staging.flush()
        descriptor = staging.fileno()
        os.fsync(descriptor)
|
||||
|
||||
|
||||
async def _unlink_quiet(tmp: Path) -> None:
    """Remove ``tmp`` if it exists, ignoring any ``OSError``.

    Used on failure paths, where a cleanup error must not mask the
    exception that triggered the cleanup.
    """
    staging = anyio.Path(tmp)
    with contextlib.suppress(OSError):
        await staging.unlink(missing_ok=True)
|
||||
243
src/everos/core/persistence/memory_root.py
Normal file
243
src/everos/core/persistence/memory_root.py
Normal file
@ -0,0 +1,243 @@
|
||||
"""memory-root path manager.
|
||||
|
||||
Single root directory holding all persisted memory:
|
||||
|
||||
User-visible (no dot prefix, edited by humans / agents):
|
||||
agents/ per-agent records
|
||||
users/ per-user records
|
||||
knowledge/ global shared knowledge
|
||||
|
||||
System-managed (dotfile prefix, hidden by default in ls / Finder):
|
||||
.index/ derived indexes (rebuildable from markdown)
|
||||
sqlite/ system.db (+ WAL/SHM), ome.db, ome.aps.db
|
||||
lancedb/ LanceDB tables
|
||||
.tmp/ atomic-write staging directory
|
||||
.lock single-process lock anchor (created on demand by
|
||||
``memory_root_lock``)
|
||||
|
||||
User-editable (at the root):
|
||||
ome.toml OME strategy overrides (hot-reloaded)
|
||||
|
||||
The cascade queue, LSN watermark, and change audit all live in
|
||||
``system.db`` (table ``md_change_state``), not in separate dotfiles.
|
||||
|
||||
The default location and tunables come from :class:`everos.config.Settings`
|
||||
(loaded from ``config/default.toml`` + ``EVEROS_*`` environment variables);
|
||||
:meth:`MemoryRoot.default` resolves the configured path.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# ── app / project directory-name convention ──────────────────────────────────
|
||||
#
|
||||
# A memory root is partitioned by ``<app>/<project>`` *before* the user-visible
|
||||
# scope dirs (``agents`` / ``users`` / ``knowledge``), so memory for different
|
||||
# (app, project) pairs never shares a directory. The reserved id ``"default"``
|
||||
# materialises as ``default_app`` / ``default_project`` on disk (rather than a
|
||||
# bare ``default``) so a default space is visually distinct from a user-named
|
||||
# directory; every other id maps to itself.
|
||||
#
|
||||
# The mapping is symmetric: the cascade path parser reverses it (see
|
||||
# :func:`app_id_from_dir`) to recover the ids from an on-disk path. The write
|
||||
# side (here) and the read side (cascade) MUST stay in lockstep, or rebuilt
|
||||
# rows carry app/project that disagree with what was written. ``default_app`` /
|
||||
# ``default_project`` are therefore reserved directory names.
|
||||
# Reserved scope id and the directory names it materialises as on disk.
_DEFAULT_SCOPE_ID = "default"
_DEFAULT_APP_DIR = "default_app"
_DEFAULT_PROJECT_DIR = "default_project"

# Location of the shipped OME override template. ``ensure()`` copies it to
# ``<root>/ome.toml`` on first run so users start from a real file instead
# of creating one from scratch. ``parents[2]`` is the ``src/everos/``
# package root (this module sits at ``core/persistence/memory_root.py``).
_OME_TEMPLATE_PATH = Path(__file__).parents[2].joinpath("config", "default_ome.toml")
|
||||
|
||||
|
||||
def app_dir_name(app_id: str) -> str:
    """Return the on-disk directory name for ``app_id``.

    The reserved default id maps to the dedicated default-app directory
    name; every other id is used verbatim.
    """
    if app_id == _DEFAULT_SCOPE_ID:
        return _DEFAULT_APP_DIR
    return app_id
|
||||
|
||||
|
||||
def project_dir_name(project_id: str) -> str:
    """Return the on-disk directory name for ``project_id``.

    The reserved default id maps to the dedicated default-project
    directory name; every other id is used verbatim.
    """
    if project_id == _DEFAULT_SCOPE_ID:
        return _DEFAULT_PROJECT_DIR
    return project_id
|
||||
|
||||
|
||||
def app_id_from_dir(dir_name: str) -> str:
    """Recover the ``app_id`` from a directory name.

    Inverse of :func:`app_dir_name`: the default-app directory name maps
    back to the reserved default id, anything else is returned unchanged.
    """
    if dir_name == _DEFAULT_APP_DIR:
        return _DEFAULT_SCOPE_ID
    return dir_name
|
||||
|
||||
|
||||
def project_id_from_dir(dir_name: str) -> str:
    """Recover the ``project_id`` from a directory name.

    Inverse of :func:`project_dir_name`: the default-project directory
    name maps back to the reserved default id, anything else is returned
    unchanged.
    """
    if dir_name == _DEFAULT_PROJECT_DIR:
        return _DEFAULT_SCOPE_ID
    return dir_name
|
||||
|
||||
|
||||
@dataclass(frozen=True, init=False)
class MemoryRoot:
    """Path manager for a memory-root directory.

    Constructor accepts any path-like (``str`` or ``Path``); it is normalised
    to an absolute, resolved ``Path`` so equality and hashing are stable
    regardless of how the caller spells the path. ``init=False`` is paired
    with a hand-written ``__init__`` so the input type (``Path | str``) is
    decoupled from the stored field type (``Path``) — stdlib dataclass has
    no converter slot, and Pyright would otherwise reject ``MemoryRoot(s)``
    where ``s`` is a ``str``.

    Apart from :meth:`ensure`, every member only computes a path; nothing
    is created or checked on disk.
    """

    root: Path

    def __init__(self, root: Path | str) -> None:
        """Store ``root`` with ``~`` expanded and the path resolved."""
        # ``frozen=True`` forbids attribute assignment, so go through
        # ``object.__setattr__`` to install the normalised Path field.
        resolved = Path(root).expanduser().resolve()
        object.__setattr__(self, "root", resolved)

    @classmethod
    def default(cls) -> MemoryRoot:
        """Return the memory-root from :class:`everos.config.Settings`.

        The effective default lives in ``config/default.toml`` (``[memory]
        root``); environment variable ``EVEROS_MEMORY__ROOT`` overrides it.
        """
        # Lazy import to keep this module dependency-free at import time.
        from everos.config import load_settings

        return cls(load_settings().memory.root)

    # ── User-visible (partitioned by app / project) ──────────────────────────
    #
    # These take ``(app_id, project_id)`` because the scope dirs hang off the
    # ``<root>/<app>/<project>/`` prefix; they are request-level inputs, never
    # instance state. Both default to ``"default"`` so call sites that don't
    # yet carry scope still resolve to the default space.

    def _project_prefix(self, app_id: str, project_id: str) -> Path:
        """``<root>/<app>/<project>/`` — shared parent of the scope dirs."""
        return self.root / app_dir_name(app_id) / project_dir_name(project_id)

    def agents_dir(self, app_id: str = "default", project_id: str = "default") -> Path:
        """``<root>/<app>/<project>/agents/`` — per-agent records."""
        return self._project_prefix(app_id, project_id) / "agents"

    def users_dir(self, app_id: str = "default", project_id: str = "default") -> Path:
        """``<root>/<app>/<project>/users/`` — per-user records."""
        return self._project_prefix(app_id, project_id) / "users"

    def knowledge_dir(
        self, app_id: str = "default", project_id: str = "default"
    ) -> Path:
        """``<root>/<app>/<project>/knowledge/`` — shared knowledge."""
        return self._project_prefix(app_id, project_id) / "knowledge"

    # ── System-managed (dotfiles) ───────────────────────────────────────────

    @property
    def index_dir(self) -> Path:
        """``<root>/.index/`` — derived index root."""
        return self.root / ".index"

    @property
    def lancedb_dir(self) -> Path:
        """``<root>/.index/lancedb/`` — LanceDB table root."""
        return self.index_dir / "lancedb"

    @property
    def sqlite_dir(self) -> Path:
        """``<root>/.index/sqlite/`` — SQLite system DB root.

        Holds ``system.db`` plus its sidecars (``-wal`` / ``-shm`` in WAL
        mode). Symmetric with :attr:`lancedb_dir`.
        """
        return self.index_dir / "sqlite"

    @property
    def system_db(self) -> Path:
        """``<root>/.index/sqlite/system.db`` — SQLite DB for system
        state, audit log, task queue, LSN watermark, and other metadata.
        """
        return self.sqlite_dir / "system.db"

    @property
    def ome_db(self) -> Path:
        """``<root>/.index/sqlite/ome.db`` — SQLite DB backing the Offline
        Memory Engine's own state: run records, counter store, idle store.
        Symmetric with :attr:`system_db`.
        """
        return self.sqlite_dir / "ome.db"

    @property
    def ome_aps_db(self) -> Path:
        """``<root>/.index/sqlite/ome.aps.db`` — SQLite DB holding the
        APScheduler jobstore for the Offline Memory Engine. Split from
        :attr:`ome_db` so APS's sync SQLAlchemy writer and OME's async
        aiosqlite writer never contend for the same sqlite file lock.
        """
        return self.sqlite_dir / "ome.aps.db"

    @property
    def ome_config(self) -> Path:
        """``<root>/ome.toml`` — user-editable OME strategy overrides.

        Drop a file here to toggle strategies on/off or tweak per-strategy
        knobs (max_retries, gate, cron …) without restarting the server.
        The engine watches this file and hot-reloads changes within ~2 s.

        Example to disable foresight and user-profile extraction::

            [strategies.extract_foresight]
            enabled = false

            [strategies.extract_user_profile]
            enabled = false
        """
        return self.root / "ome.toml"

    @property
    def lock_file(self) -> Path:
        """``<root>/.lock`` — single-process exclusive lock anchor."""
        return self.root / ".lock"

    @property
    def tmp_dir(self) -> Path:
        """``<root>/.tmp/`` — staging directory for batch / multi-step writes.

        Note:
            ``MarkdownWriter`` does *not* use this for atomic single-file
            writes; it uses a same-directory temp file to guarantee a
            same-filesystem rename. This directory is reserved for callers
            that need scratch space outside any single target directory.
        """
        return self.root / ".tmp"

    # ── Operations ──────────────────────────────────────────────────────────

    def ensure(self) -> None:
        """Create the memory-root and the runtime-required dotfile dirs.

        User-visible directories (``agents/`` / ``users/`` / ``knowledge/``)
        are *not* pre-created — they appear on first write of their records.
        Only directories the runtime infrastructure requires are made:

            <root>/
            <root>/.index/
            <root>/.index/sqlite/
            <root>/.index/lancedb/
            <root>/.tmp/

        ``<root>/ome.toml`` is additionally seeded from the shipped
        template when it does not exist yet; an existing file is never
        overwritten. Safe to call repeatedly.

        Raises:
            OSError: If a directory cannot be created or the template
                cannot be read.
        """
        for directory in (
            self.root,
            self.index_dir,
            self.sqlite_dir,
            self.lancedb_dir,
            self.tmp_dir,
        ):
            directory.mkdir(parents=True, exist_ok=True)

        # Materialize the OME override template on first run. The
        # ``exists()`` test is only a fast path that skips reading the
        # template; the guarantee that existing content is preserved comes
        # from the exclusive-create mode below.
        if self.ome_config.exists():
            return
        # Read before creating, so a missing template cannot leave an
        # empty ``ome.toml`` behind.
        template = _OME_TEMPLATE_PATH.read_bytes()
        try:
            # "x" fails instead of truncating if the file appeared since
            # the check above (another process, or the user).
            with self.ome_config.open("xb") as handle:
                handle.write(template)
        except FileExistsError:
            # Lost the race: keep whatever the other writer put there.
            pass
|
||||
42
src/everos/core/persistence/sqlite/__init__.py
Normal file
42
src/everos/core/persistence/sqlite/__init__.py
Normal file
@ -0,0 +1,42 @@
|
||||
"""SQLite async persistence (SQLModel + SQLAlchemy 2.0 + aiosqlite).
|
||||
|
||||
External usage (engine + sessions):
|
||||
from everos.core.persistence.sqlite import (
|
||||
create_system_engine, create_session_factory, session_scope,
|
||||
)
|
||||
|
||||
External usage (ORM model basics — re-exported from sqlmodel):
|
||||
from everos.core.persistence.sqlite import (
|
||||
SQLModel, Field, Relationship, BaseTable,
|
||||
)
|
||||
|
||||
External usage (generic CRUD repository base):
|
||||
from everos.core.persistence.sqlite import RepoBase
|
||||
|
||||
The ``system_db`` is the SQLite file at
``<memory_root>/.index/sqlite/system.db``; it holds everos system
state, the audit log, task queue, LSN watermark, and other metadata.
|
||||
"""
|
||||
|
||||
# Re-export key sqlmodel symbols so business code has a single canonical
|
||||
# entry point (``everos.core.persistence.sqlite``) for ORM authoring.
|
||||
from sqlmodel import Field as Field
|
||||
from sqlmodel import Relationship as Relationship
|
||||
from sqlmodel import SQLModel as SQLModel
|
||||
|
||||
from .base import BaseTable as BaseTable
|
||||
from .engine import create_system_engine as create_system_engine
|
||||
from .repository import RepoBase as RepoBase
|
||||
from .session import create_session_factory as create_session_factory
|
||||
from .session import session_scope as session_scope
|
||||
|
||||
__all__ = [
|
||||
"BaseTable",
|
||||
"Field",
|
||||
"Relationship",
|
||||
"RepoBase",
|
||||
"SQLModel",
|
||||
"create_session_factory",
|
||||
"create_system_engine",
|
||||
"session_scope",
|
||||
]
|
||||
112
src/everos/core/persistence/sqlite/base.py
Normal file
112
src/everos/core/persistence/sqlite/base.py
Normal file
@ -0,0 +1,112 @@
|
||||
"""Common SQLModel base for everos tables.
|
||||
|
||||
:class:`BaseTable` adds ``created_at`` / ``updated_at`` columns. The
|
||||
``updated_at`` column auto-refreshes on UPDATE through SA's ``onupdate``
|
||||
hook (no explicit assignment needed in business code).
|
||||
|
||||
The **two-zone storage-UTC discipline** is enforced by a SQLAlchemy
|
||||
:class:`TypeDecorator` (:class:`UtcDateTimeColumn`) used as the SQL
|
||||
column type for every datetime field:
|
||||
|
||||
* **on write** — ``process_bind_param`` converts every datetime to
|
||||
aware UTC before SQLAlchemy emits the bound parameter. This covers
|
||||
*every* SQLAlchemy write path uniformly:
|
||||
|
||||
- ORM ``session.add()`` / ``session.merge()`` (unit-of-work flush)
|
||||
- Core ``session.execute(insert(...).values(...))``
|
||||
- Core ``session.execute(update(...).values(...))``
|
||||
- Bulk ``bulk_insert_mappings`` / ``bulk_save_objects``
|
||||
- Raw SQL with bound parameters
|
||||
|
||||
Reaching into the column type is the only place SQLAlchemy guarantees
|
||||
*every* write path passes through. Mapper events (``before_insert`` /
|
||||
``before_update``) only fire on the ORM unit-of-work path and would
|
||||
silently miss Core statements — which
:mod:`everos.infra.persistence.sqlite.repos.md_change_state` uses heavily.
|
||||
|
||||
* **on read** — ``process_result_value`` re-attaches ``tzinfo=UTC`` to
|
||||
every naive datetime returned from SQLite (which has no native tz
|
||||
storage and always returns naive). Callers therefore never observe a
|
||||
naive datetime regardless of which read API they use.
|
||||
|
||||
Subclass with ``table=True`` to declare a real SQLite table::
|
||||
|
||||
from sqlmodel import Field
|
||||
|
||||
class Sender(BaseTable, table=True):
|
||||
id: int | None = Field(default=None, primary_key=True)
|
||||
name: str
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as _dt
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import DateTime
|
||||
from sqlalchemy import types as sa_types
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from everos.component.utils.datetime import UtcDatetime, ensure_utc, get_utc_now
|
||||
|
||||
|
||||
class UtcDateTimeColumn(sa_types.TypeDecorator[_dt.datetime]):
    """Datetime column type that keeps stored values in UTC.

    How it works:

    * ``impl = DateTime`` — the underlying SQL type is the dialect's
      standard DateTime (TEXT ISO-8601 on SQLite; ``TIMESTAMP`` on
      Postgres etc.).
    * ``process_bind_param`` — write side. Every ``datetime`` is handed
      to :func:`ensure_utc` (aware values are converted to UTC, naive
      values are taken to be UTC already — see that function's
      docstring). ``None`` and non-datetime values are returned as-is.
    * ``process_result_value`` — read side. A naive ``datetime`` gets
      ``tzinfo=UTC`` attached; anything else (``None``, aware datetimes)
      is returned unchanged.

    ``cache_ok = True`` tells SQLAlchemy that statements using this type
    may be cached: the type carries no per-instance mutable state.
    """

    impl = DateTime
    cache_ok = True

    def process_bind_param(
        self, value: _dt.datetime | None, _dialect: Any
    ) -> _dt.datetime | None:
        """Normalize an outgoing datetime to UTC before it is bound."""
        if isinstance(value, _dt.datetime):
            return ensure_utc(value)
        # ``None`` and any non-datetime value are forwarded untouched.
        return value

    def process_result_value(
        self, value: _dt.datetime | None, _dialect: Any
    ) -> _dt.datetime | None:
        """Attach ``tzinfo=UTC`` to a naive datetime coming back from the DB."""
        is_naive_datetime = isinstance(value, _dt.datetime) and value.tzinfo is None
        if not is_naive_datetime:
            # ``None`` and already-aware datetimes need no adjustment.
            return value
        return value.replace(tzinfo=_dt.UTC)
|
||||
|
||||
|
||||
class BaseTable(SQLModel):
    """Mixin that contributes ``created_at`` and ``updated_at`` columns.

    * Both fields take their default from :func:`get_utc_now` when a row
      object is created.
    * ``updated_at`` additionally registers :func:`get_utc_now` as the
      column's ``onupdate`` callable, so SQLAlchemy refreshes it on every
      UPDATE. Assign it by hand only when you mean to override that.
    * Both columns are typed as :class:`UtcDateTimeColumn`, which applies
      the storage-UTC rule at the SQLAlchemy bind layer and therefore on
      every write path (ORM + Core + bulk + raw bound params).
    """

    created_at: UtcDatetime = Field(
        sa_type=UtcDateTimeColumn,
        default_factory=get_utc_now,
    )
    updated_at: UtcDatetime = Field(
        sa_type=UtcDateTimeColumn,
        sa_column_kwargs={"onupdate": get_utc_now},
        default_factory=get_utc_now,
    )
|
||||
74
src/everos/core/persistence/sqlite/engine.py
Normal file
74
src/everos/core/persistence/sqlite/engine.py
Normal file
@ -0,0 +1,74 @@
|
||||
"""Async SQLAlchemy engine factory + per-connection PRAGMA listener.
|
||||
|
||||
The engine connects through ``aiosqlite`` (SA URL ``sqlite+aiosqlite://``).
|
||||
PRAGMAs are *per-connection* — they must be re-applied every time the
|
||||
SA pool opens a new connection. We attach a ``connect`` event listener on
|
||||
the engine's underlying sync engine for that purpose.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import event
|
||||
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
|
||||
|
||||
from everos.config import SqliteSettings
|
||||
|
||||
|
||||
def create_system_engine(
    db_path: Path,
    sqlite_settings: SqliteSettings,
    *,
    echo: bool = False,
) -> AsyncEngine:
    """Create an async SQLAlchemy engine for the everos system DB.

    ``MemoryRoot.system_db`` is the conventional path; the DB holds system
    state, audit log, task queue, LSN watermark, and other metadata.

    Args:
        db_path: Filesystem path to the system DB file. Parent directory is
            created if missing.
        sqlite_settings: Tunables (journal_mode, synchronous, foreign_keys,
            temp_store, busy_timeout, journal_size_limit, cache_size) that
            are applied as PRAGMAs on every new connection.
        echo: When ``True``, SQLAlchemy logs every statement (development).

    Returns:
        An :class:`AsyncEngine` ready for use with :class:`AsyncSession`.
    """
    # Imported locally so this function does not depend on an edit to the
    # module's import block.
    from sqlalchemy.engine import URL

    db_path.parent.mkdir(parents=True, exist_ok=True)

    # Build the URL from components instead of interpolating the path into
    # a string. When SQLAlchemy parses a URL string, everything after the
    # first "?" becomes the query string, so a path containing "?" would be
    # silently truncated. ``URL.create`` keeps the path verbatim and renders
    # the same ``sqlite+aiosqlite:///<path>`` form for ordinary paths.
    url = URL.create(drivername="sqlite+aiosqlite", database=str(db_path))
    engine = create_async_engine(url, echo=echo, future=True)

    _register_pragma_listener(engine, sqlite_settings)
    return engine
|
||||
|
||||
|
||||
def _register_pragma_listener(
    engine: AsyncEngine,
    sqlite_settings: SqliteSettings,
) -> None:
    """Hook a ``connect`` listener onto ``engine`` that issues the PRAGMAs.

    The listener is attached to ``engine.sync_engine`` and runs once per
    new DBAPI connection, executing one PRAGMA per tunable in
    ``sqlite_settings`` on a cursor that is always closed afterwards.
    """

    def _pragma_statements():
        # Generator on purpose: each setting is read right before its
        # statement is executed, in this exact order.
        yield f"PRAGMA journal_mode={sqlite_settings.journal_mode}"
        yield f"PRAGMA synchronous={sqlite_settings.synchronous}"
        yield f"PRAGMA foreign_keys={'ON' if sqlite_settings.foreign_keys else 'OFF'}"
        yield f"PRAGMA temp_store={sqlite_settings.temp_store}"
        yield f"PRAGMA busy_timeout={sqlite_settings.busy_timeout_ms}"
        yield f"PRAGMA journal_size_limit={sqlite_settings.journal_size_limit_bytes}"
        # cache_size: a negative value is a size in KB, a positive one a
        # page count — hence the leading minus sign.
        yield f"PRAGMA cache_size=-{sqlite_settings.cache_size_kb}"

    def _apply_pragmas(dbapi_connection, _connection_record) -> None:  # type: ignore[no-untyped-def]
        pragma_cursor = dbapi_connection.cursor()
        try:
            for statement in _pragma_statements():
                pragma_cursor.execute(statement)
        finally:
            pragma_cursor.close()

    event.listen(engine.sync_engine, "connect", _apply_pragmas)
|
||||
166
src/everos/core/persistence/sqlite/repository.py
Normal file
166
src/everos/core/persistence/sqlite/repository.py
Normal file
@ -0,0 +1,166 @@
|
||||
"""Generic CRUD repository for SQLModel-backed tables.
|
||||
|
||||
``RepoBase`` is a pure generic CRUD helper that sits alongside
|
||||
:class:`BaseTable`. It knows nothing about a storage runtime — concrete
|
||||
repos either pass ``session_factory`` explicitly (typical in tests) or
|
||||
override :meth:`_factory_lookup` to pull the singleton from their
|
||||
storage manager (typical in :mod:`everos.infra.persistence.sqlite.repos`).
|
||||
|
||||
Each method opens its own ``session_scope`` (auto rollback on exception,
|
||||
session closed at end). For multi-step transactional work, use the
|
||||
session factory directly via :attr:`session_factory`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
||||
from sqlmodel import SQLModel, select
|
||||
|
||||
from .session import session_scope
|
||||
|
||||
|
||||
class RepoBase[T: SQLModel]:
|
||||
"""Generic CRUD repository for one SQLModel table.
|
||||
|
||||
Subclass and bind to a model. Two ways to provide the session factory:
|
||||
|
||||
1. **Explicit (tests / DI)** — pass it to ``__init__``::
|
||||
|
||||
repo = SenderRepo(session_factory)
|
||||
|
||||
2. **Lazy hook (production singletons)** — override
|
||||
:meth:`_factory_lookup` so the repo can be instantiated as a
|
||||
module-level singleton with no factory bound yet::
|
||||
|
||||
class _SenderRepo(RepoBase[Sender]):
|
||||
model = Sender
|
||||
def _factory_lookup(self):
|
||||
from everos.infra.persistence.sqlite.sqlite_manager import (
|
||||
get_session_factory,
|
||||
)
|
||||
return get_session_factory()
|
||||
|
||||
sender_repo = _SenderRepo()
|
||||
await sender_repo.add(Sender(name="alice"))
|
||||
"""
|
||||
|
||||
model: type[T]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
session_factory: async_sessionmaker[AsyncSession] | None = None,
|
||||
) -> None:
|
||||
"""Bind to a session factory; if ``None``, defer to ``_factory_lookup``."""
|
||||
self._factory_override = session_factory
|
||||
|
||||
def _factory_lookup(self) -> async_sessionmaker[AsyncSession]:
|
||||
"""Resolve a session factory on first use. Override in subclass.
|
||||
|
||||
``RepoBase`` itself has no idea where the runtime singleton lives
|
||||
— that knowledge belongs to the infra subclass. The default raises
|
||||
so a missing override is loud rather than silently broken.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
f"{type(self).__name__}: pass session_factory= to __init__ "
|
||||
"or override _factory_lookup() to wire the storage manager."
|
||||
)
|
||||
|
||||
@property
|
||||
def _factory(self) -> async_sessionmaker[AsyncSession]:
|
||||
if self._factory_override is not None:
|
||||
return self._factory_override
|
||||
return self._factory_lookup()
|
||||
|
||||
@property
|
||||
def session_factory(self) -> async_sessionmaker[AsyncSession]:
|
||||
"""Underlying session factory (for multi-step transactions)."""
|
||||
return self._factory
|
||||
|
||||
# ── Create ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def add(self, instance: T) -> T:
|
||||
"""Insert one row, commit, refresh, return the instance."""
|
||||
async with session_scope(self._factory) as s:
|
||||
s.add(instance)
|
||||
await s.commit()
|
||||
await s.refresh(instance)
|
||||
return instance
|
||||
|
||||
async def add_many(self, instances: Sequence[T]) -> list[T]:
|
||||
"""Insert many rows in one transaction."""
|
||||
items = list(instances)
|
||||
async with session_scope(self._factory) as s:
|
||||
s.add_all(items)
|
||||
await s.commit()
|
||||
for inst in items:
|
||||
await s.refresh(inst)
|
||||
return items
|
||||
|
||||
# ── Read ───────────────────────────────────────────────────────────────
|
||||
|
||||
async def get_by_id(self, id_value: Any) -> T | None:
|
||||
"""Get a row by primary key. Returns ``None`` if not found."""
|
||||
async with session_scope(self._factory) as s:
|
||||
return await s.get(self.model, id_value)
|
||||
|
||||
async def list_all(self) -> list[T]:
|
||||
"""Return all rows (no filter, no order)."""
|
||||
async with session_scope(self._factory) as s:
|
||||
stmt = select(self.model)
|
||||
return list((await s.execute(stmt)).scalars().all())
|
||||
|
||||
async def find_where(self, **filters: Any) -> list[T]:
|
||||
"""Equality-only filtering, e.g. ``find_where(name="alice", active=True)``."""
|
||||
async with session_scope(self._factory) as s:
|
||||
stmt = select(self.model).filter_by(**filters)
|
||||
return list((await s.execute(stmt)).scalars().all())
|
||||
|
||||
async def find_one(self, **filters: Any) -> T | None:
|
||||
"""First row matching ``filters`` (no ordering); ``None`` if not found."""
|
||||
async with session_scope(self._factory) as s:
|
||||
stmt = select(self.model).filter_by(**filters).limit(1)
|
||||
return (await s.execute(stmt)).scalars().first()
|
||||
|
||||
async def count(self) -> int:
|
||||
"""Total row count (no filter)."""
|
||||
async with session_scope(self._factory) as s:
|
||||
stmt = select(func.count()).select_from(self.model)
|
||||
return int((await s.execute(stmt)).scalar_one())
|
||||
|
||||
# ── Update ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def update(self, instance: T) -> T:
|
||||
"""Persist changes on an instance whose primary key already exists.
|
||||
|
||||
Uses ``session.merge`` so detached / fresh-from-Pydantic instances
|
||||
are reattached. ``BaseTable.updated_at`` auto-bumps via SA's
|
||||
``onupdate`` hook.
|
||||
"""
|
||||
async with session_scope(self._factory) as s:
|
||||
merged = await s.merge(instance)
|
||||
await s.commit()
|
||||
await s.refresh(merged)
|
||||
return merged
|
||||
|
||||
# ── Delete ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def delete(self, instance: T) -> None:
|
||||
"""Delete by instance (primary key must be set)."""
|
||||
async with session_scope(self._factory) as s:
|
||||
merged = await s.merge(instance)
|
||||
await s.delete(merged)
|
||||
await s.commit()
|
||||
|
||||
async def delete_by_id(self, id_value: Any) -> bool:
|
||||
"""Delete by primary key. Returns ``True`` if a row was removed."""
|
||||
async with session_scope(self._factory) as s:
|
||||
instance = await s.get(self.model, id_value)
|
||||
if instance is None:
|
||||
return False
|
||||
await s.delete(instance)
|
||||
await s.commit()
|
||||
return True
|
||||
45
src/everos/core/persistence/sqlite/session.py
Normal file
45
src/everos/core/persistence/sqlite/session.py
Normal file
@ -0,0 +1,45 @@
|
||||
"""Async session factory + session scope context manager."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncIterator
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker
|
||||
|
||||
|
||||
def create_session_factory(engine: AsyncEngine) -> async_sessionmaker[AsyncSession]:
    """Return an :class:`async_sessionmaker` that produces sessions on ``engine``.

    The factory is configured with ``expire_on_commit=False`` so that
    attributes of loaded instances remain readable after a commit — the
    conventional setup for async SA usage.
    """
    factory = async_sessionmaker(
        expire_on_commit=False,
        class_=AsyncSession,
        bind=engine,
    )
    return factory
|
||||
|
||||
|
||||
@asynccontextmanager
async def session_scope(
    session_factory: async_sessionmaker[AsyncSession],
) -> AsyncIterator[AsyncSession]:
    """Open a session from ``session_factory`` and guard it with a rollback.

    If the ``async with`` body raises an :class:`Exception`, the session
    is rolled back and the exception is re-raised; in every case the
    session's own context manager is exited afterwards. Nothing is
    committed automatically — the caller must ``await session.commit()``
    on success.

    Usage:
        factory = create_session_factory(engine)
        async with session_scope(factory) as session:
            session.add(some_record)
            await session.commit()
    """
    async with session_factory() as active_session:
        try:
            yield active_session
        except Exception:
            # Undo any pending work, then let the caller see the error.
            await active_session.rollback()
            raise
|
||||
Reference in New Issue
Block a user