chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
Elliot Chen
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions

View File

@ -0,0 +1,5 @@
"""Infrastructure layer.
Adapts to external storage and persists domain models. Contains no
business rules.
"""

View File

@ -0,0 +1,63 @@
"""Async offline strategy scheduling chassis.
Provides decorator-based strategy registration, event-driven triggers
(Cron/Idle/Manual), and gate-based concurrency control.
"""
from everos.infra.ome.config import OMEConfig as OMEConfig
from everos.infra.ome.context import StrategyContext as StrategyContext
from everos.infra.ome.decorator import offline_strategy as offline_strategy
from everos.infra.ome.engine import OfflineEngine as OfflineEngine
from everos.infra.ome.events import BaseEvent as BaseEvent
from everos.infra.ome.events import CronTick as CronTick
from everos.infra.ome.events import IdleTick as IdleTick
from everos.infra.ome.events import ManualTick as ManualTick
from everos.infra.ome.exceptions import (
EmitNotDeclaredError as EmitNotDeclaredError,
)
from everos.infra.ome.exceptions import (
EngineCallFromStrategyError as EngineCallFromStrategyError,
)
from everos.infra.ome.exceptions import (
EngineLockHeldError as EngineLockHeldError,
)
from everos.infra.ome.exceptions import OMEError as OMEError
from everos.infra.ome.exceptions import (
StartupValidationError as StartupValidationError,
)
from everos.infra.ome.exceptions import (
StrategyContractError as StrategyContractError,
)
from everos.infra.ome.gates import Counter as Counter
from everos.infra.ome.records import RunRecord as RunRecord
from everos.infra.ome.records import RunStatus as RunStatus
from everos.infra.ome.records import StrategyRouteInfo as StrategyRouteInfo
from everos.infra.ome.triggers import Cron as Cron
from everos.infra.ome.triggers import Idle as Idle
from everos.infra.ome.triggers import Immediate as Immediate
from everos.infra.ome.triggers import Trigger as Trigger
# Public API of the package: each name listed here is re-exported by one of
# the explicit ``from ... import X as X`` lines above.
__all__ = [
    "BaseEvent",
    "Counter",
    "Cron",
    "CronTick",
    "EmitNotDeclaredError",
    "EngineCallFromStrategyError",
    "EngineLockHeldError",
    "Idle",
    "IdleTick",
    "Immediate",
    "ManualTick",
    "OfflineEngine",
    "OMEConfig",
    "OMEError",
    "RunRecord",
    "RunStatus",
    "StartupValidationError",
    "StrategyContext",
    "StrategyContractError",
    "StrategyRouteInfo",
    "Trigger",
    "offline_strategy",
]

View File

@ -0,0 +1 @@
"""Internal: background loops (idle scan / config reload / crash recovery)."""

View File

@ -0,0 +1,254 @@
"""Config hot-reload — watchfiles + tomllib + shallow merge.
Hot-updatable fields: enabled / max_retries / gate / cron / idle_seconds /
scan_interval_seconds. Trigger type swap (Immediate ↔ Cron ↔ Idle),
event subscription (Immediate.on / Idle.on), and Idle.event_field
remain immutable — these define strategy routing and changing them
requires a code change and redeploy.
Per-strategy two-phase commit: enabled is applied independently for
emergency-stop semantics; max_retries / gate / trigger parameters
form one atomic group that fully rolls back on any failure inside it.
"""
from __future__ import annotations
import asyncio
import tomllib
from contextlib import suppress
from dataclasses import replace
from pathlib import Path
from typing import TYPE_CHECKING
from pydantic import ValidationError
from watchfiles import awatch
from everos.core.observability.logging import get_logger
from everos.infra.ome._dispatch.registry import StrategyRegistry
from everos.infra.ome.config import StrategyOverride, TomlRoot
from everos.infra.ome.decorator import StrategyMeta
from everos.infra.ome.gates import Counter
from everos.infra.ome.triggers import Cron, Idle, Trigger
if TYPE_CHECKING:
from everos.infra.ome.engine import OfflineEngine
logger = get_logger(__name__)
class _SkipAtomicGroupError(Exception):
    """Internal sentinel: skip one strategy's non-``enabled`` atomic group.

    Raised by ``_build_atomic_meta`` when an override cannot be applied to
    the strategy it targets (a ``cron`` / ``idle_*`` field given for the
    wrong trigger type, or a gate introduced without a threshold).
    ``_apply_one_strategy`` catches it, logs a warning, and moves on, so
    other strategies are not affected.
    """
def _apply_enabled(
meta: StrategyMeta,
override: StrategyOverride,
name: str,
registry: StrategyRegistry,
) -> StrategyMeta:
"""Step 1: apply `enabled` independently — never blocked by other fields."""
if override.enabled is None or override.enabled == meta.enabled:
return meta
new_meta = replace(meta, enabled=override.enabled)
registry.replace(name, new_meta)
return new_meta
def _build_atomic_meta(
meta: StrategyMeta,
override: StrategyOverride,
) -> tuple[StrategyMeta, Trigger]:
"""Step 2 pure-compute: build (new_meta, new_trigger) from current state.
Raises `_SkipAtomicGroupError` on type mismatches or invalid gate intros.
No registry / engine writes happen here.
"""
new_meta = meta
new_trigger: Trigger = meta.trigger
if override.max_retries is not None:
new_meta = replace(new_meta, max_retries=override.max_retries)
if override.gate is not None:
# Introducing a gate on a strategy that has none requires an explicit
# threshold — silently defaulting to 1 would mean "fire on every
# event", which is almost certainly not what the user intended.
if meta.gate is None and override.gate.threshold is None:
raise _SkipAtomicGroupError(
"introducing a gate requires explicit threshold"
)
base = meta.gate.model_dump() if meta.gate is not None else {}
for k, v in override.gate.model_dump(exclude_unset=True).items():
if v is not None:
base[k] = v
new_meta = replace(new_meta, gate=Counter(**base))
if override.cron is not None:
if not isinstance(meta.trigger, Cron):
raise _SkipAtomicGroupError(
f"cron given on non-Cron strategy "
f"(actual: {type(meta.trigger).__name__})"
)
new_trigger = Cron(expr=override.cron)
if override.idle_seconds is not None or override.scan_interval_seconds is not None:
if not isinstance(meta.trigger, Idle):
raise _SkipAtomicGroupError(
f"idle_* given on non-Idle strategy "
f"(actual: {type(meta.trigger).__name__})"
)
updates: dict[str, int] = {}
if override.idle_seconds is not None:
updates["idle_seconds"] = override.idle_seconds
if override.scan_interval_seconds is not None:
updates["scan_interval_seconds"] = override.scan_interval_seconds
# model_validate (not model_copy) re-runs Idle._validate_event_field on
# the merged dict; model_copy(update=...) would skip it and let an
# invalid event_field reach the registry.
new_trigger = Idle.model_validate({**meta.trigger.model_dump(), **updates})
if new_trigger is not meta.trigger:
new_meta = replace(new_meta, trigger=new_trigger)
return new_meta, new_trigger
def _needs_aps_reschedule(old_trigger: Trigger, new_trigger: Trigger) -> bool:
"""True iff scheduler-driving fields changed (cron expr / Idle scan_interval)."""
if new_trigger is old_trigger:
return False
if isinstance(new_trigger, Cron) and isinstance(old_trigger, Cron):
return new_trigger.expr != old_trigger.expr
if isinstance(new_trigger, Idle) and isinstance(old_trigger, Idle):
return new_trigger.scan_interval_seconds != old_trigger.scan_interval_seconds
return False
def _maybe_reschedule_aps(
engine: OfflineEngine, name: str, new_trigger: Trigger
) -> None:
"""Push the new trigger's APS-relevant fields to the scheduler."""
if isinstance(new_trigger, Cron):
engine.reschedule_cron_job(name, new_trigger.expr)
elif isinstance(new_trigger, Idle):
engine.reschedule_idle_job(
name, scan_interval_seconds=new_trigger.scan_interval_seconds
)
def _apply_one_strategy(
    name: str,
    override: StrategyOverride,
    registry: StrategyRegistry,
    engine: OfflineEngine,
) -> None:
    """Apply one strategy's override: ``enabled`` first, then the atomic group.

    The atomic group (max_retries / gate / trigger parameters) is computed
    up front, the scheduler is updated if needed, and only then is the
    registry written. Any exception along the way is logged and swallowed,
    so the registry keeps the pre-group meta and the caller's loop over
    the remaining strategies continues.
    """
    current = _apply_enabled(registry.get(name), override, name, registry)
    try:
        staged_meta, staged_trigger = _build_atomic_meta(current, override)
        reschedule = _needs_aps_reschedule(current.trigger, staged_trigger)
        if reschedule:
            _maybe_reschedule_aps(engine, name, staged_trigger)
        # Registry write comes last: a scheduler failure leaves it untouched.
        registry.replace(name, staged_meta)
    except Exception as exc:  # noqa: BLE001
        # Config typo, trigger-type mismatch, or scheduler runtime failure:
        # skip this strategy's atomic group only.
        logger.warning(
            "strategy_atomic_group_skipped",
            strategy_name=name,
            error_type=type(exc).__name__,
            exc_info=True,
        )
def apply_overrides(
    registry: StrategyRegistry,
    root: TomlRoot,
    engine: OfflineEngine,
) -> None:
    """Merge every per-strategy override in ``root`` into the registry.

    Each strategy is handled on its own in two steps:
        1. ``enabled`` is applied independently, so an emergency stop is
           never blocked by a mistake in another field.
        2. ``max_retries`` / ``gate`` / trigger parameters are applied as
           one atomic group; any failure (type mismatch, invalid cron,
           reschedule error, ...) leaves that group at its prior values.

    Overrides naming a strategy that is not registered are logged and
    skipped.
    """
    registered = {m.name for m in registry.all()}
    for name, override in root.strategies.items():
        if name in registered:
            _apply_one_strategy(name, override, registry, engine)
        else:
            logger.warning("config_override_unknown_strategy", strategy_name=name)
class ConfigReloader:
"""Watch a TOML file and apply overrides on change."""
def __init__(
self,
*,
config_path: Path,
registry: StrategyRegistry,
engine: OfflineEngine,
debounce_ms: int = 1600,
) -> None:
self._path = config_path
self._registry = registry
self._engine = engine
self._debounce_ms = debounce_ms
self._task: asyncio.Task[None] | None = None
def start(self) -> None:
"""Fire-and-forget the watch loop. Idempotent: raises on double-start."""
if self._path is None:
return
if self._task is not None and not self._task.done():
raise RuntimeError("ConfigReloader already started")
self._task = asyncio.create_task(self._loop())
async def stop(self) -> None:
"""Cancel the watch task and await it; safe to call multiple times."""
if self._task is not None:
self._task.cancel()
with suppress(asyncio.CancelledError):
await self._task
self._task = None
async def _loop(self) -> None:
"""Initial load + per-FS-change reload; survives single-iteration failures."""
try:
await self._load_once()
except Exception: # noqa: BLE001
logger.exception("config_reload_iteration_failed")
async for _changes in awatch(self._path, debounce=self._debounce_ms):
try:
await self._load_once()
except Exception: # noqa: BLE001
logger.exception("config_reload_iteration_failed")
async def _load_once(self) -> None:
"""Read TOML off the loop, parse + validate, apply overrides."""
def _read_and_parse() -> TomlRoot:
with open(self._path, "rb") as f:
content = f.read()
parsed = tomllib.loads(content.decode("utf-8"))
return TomlRoot.model_validate(parsed)
try:
root = await asyncio.to_thread(_read_and_parse)
except (OSError, tomllib.TOMLDecodeError, ValidationError) as e:
logger.warning(
"config_reload_failed",
error_type=type(e).__name__,
error=str(e),
path=str(self._path),
)
return
apply_overrides(self._registry, root, self._engine)
logger.info("config_reloaded", path=str(self._path))

View File

@ -0,0 +1,79 @@
"""Startup crash recovery — stale RUNNING rows → CRASHED + re-enqueue.
Runs once at engine.start() before normal dispatching begins. Rows
whose started_at is older than ``timeout_seconds`` are marked CRASHED
and re-enqueued with a fresh run_id reusing the original event payload.
Fresher RUNNING rows are skipped — APScheduler's own jobstore may have
already reattached them.
At-most-once: ``mark_crashed`` and ``add_job`` are not atomic. If
``add_job`` fails after ``mark_crashed``, the row stays CRASHED and
the event is lost. Strategies needing at-least-once must add their own
retry / monitor layer.
"""
from __future__ import annotations
from collections.abc import Awaitable, Callable
from datetime import timedelta
from uuid import uuid4
from everos.component.utils.datetime import get_utc_now
from everos.core.observability.logging import get_logger
from everos.infra.ome._stores.run_record import RunRecordStore
logger = get_logger(__name__)
async def scan_and_resume(
    *,
    run_record_store: RunRecordStore,
    timeout_seconds: int,
    add_job: Callable[[str, str, str, str, int], Awaitable[None]],
) -> None:
    """Mark stale RUNNING rows CRASHED and re-enqueue each one.

    A row is stale when its ``started_at`` is older than
    ``timeout_seconds`` before the scan time; fresher rows are left alone.
    Each stale row is first marked CRASHED, then ``add_job`` is awaited
    with positional args
    ``(strategy_name, run_id, event_topic, event_payload, max_retries)``
    where ``run_id`` is a freshly generated id. The two steps are not
    atomic: if ``add_job`` fails the failure is logged, the row stays
    CRASHED, and the scan continues with the next row.

    Raises:
        ValueError: If ``timeout_seconds`` is not positive.
    """
    if timeout_seconds <= 0:
        raise ValueError(f"timeout_seconds must be > 0, got {timeout_seconds}")

    scan_time = get_utc_now()
    stale_before = scan_time - timedelta(seconds=timeout_seconds)

    for rec in await run_record_store.find_running():
        if rec.started_at >= stale_before:
            continue  # still within the timeout window

        await run_record_store.mark_crashed(
            run_id=rec.run_id,
            finished_at=scan_time,
            error="crash recovery: marked CRASHED after start scan",
        )

        fresh_run_id = uuid4().hex
        log_fields = {
            "strategy_name": rec.strategy_name,
            "event_topic": rec.event_topic,
            "old_run_id": rec.run_id,
        }
        try:
            await add_job(
                rec.strategy_name,
                fresh_run_id,
                rec.event_topic,
                rec.event_payload,
                rec.max_retries_snapshot,
            )
            logger.info(
                "crash_recovery_resumed", **log_fields, new_run_id=fresh_run_id
            )
        except Exception:  # noqa: BLE001
            logger.exception("crash_recovery_resume_failed", **log_fields)

View File

@ -0,0 +1,60 @@
"""IdleScanner — periodic scan of idle_store, emits IdleTick for overdue buckets."""
from __future__ import annotations
from collections.abc import Awaitable, Callable
from datetime import datetime
from everos.component.utils.datetime import get_utc_now
from everos.core.observability.logging import get_logger
from everos.infra.ome._stores.idle import IdleStore
from everos.infra.ome.events import BaseEvent, IdleTick
from everos.infra.ome.triggers import Idle
logger = get_logger(__name__)
class IdleScanner:
    """Periodic scanner: turns overdue idle buckets into ``IdleTick`` events."""

    def __init__(
        self,
        *,
        strategy_name: str,
        trigger: Idle,
        idle_store: IdleStore,
        emit: Callable[[BaseEvent], Awaitable[None]],
    ) -> None:
        """Bind the scanner to one strategy.

        Args:
            strategy_name: Strategy whose buckets are scanned.
            trigger: Idle trigger supplying ``idle_seconds``.
            idle_store: Store queried through ``scan_idle``.
            emit: Async callback that receives each ``IdleTick``.
        """
        self._emit = emit
        self._idle_store = idle_store
        self._trigger = trigger
        self._name = strategy_name

    async def scan_once(self, *, now: datetime | None = None) -> None:
        """Emit one ``IdleTick`` per bucket the store reports as overdue.

        Args:
            now: Reference time for the scan; defaults to the current UTC
                time from ``get_utc_now()``.

        A failure while emitting for one bucket is logged and swallowed so
        the remaining buckets are still notified in the same round.
        """
        reference = get_utc_now() if now is None else now
        overdue_buckets = await self._idle_store.scan_idle(
            self._name,
            idle_seconds=self._trigger.idle_seconds,
            now=reference,
        )
        for bucket_key in overdue_buckets:
            try:
                tick = IdleTick(
                    strategy_name=self._name,
                    bucket_key=bucket_key,
                    idle_seconds=self._trigger.idle_seconds,
                )
                await self._emit(tick)
            except Exception:  # noqa: BLE001
                logger.exception(
                    "idle_emit_failed",
                    strategy_name=self._name,
                    bucket_key=bucket_key,
                )

View File

@ -0,0 +1 @@
"""Internal: event dispatch core (registry / dispatcher / runner)."""

View File

@ -0,0 +1,23 @@
"""ContextVar shared between Runner and OfflineEngine.
Python copies ContextVar values into child tasks at
``asyncio.create_task`` (by design, for trace propagation), so
``@_refuse_inside_strategy`` reliably catches only *same-task* calls.
Never attach it to APS callback methods (``dispatch_run`` /
``run_idle_scan``) — cascade emits would misfire.
``test_engine_chain_emit_through_ctx`` is the regression.
TODO: a ``sys._getframe`` walk looking for a ``Runner.run`` frame would be leak-proof.
"""
from __future__ import annotations
from contextvars import ContextVar
from everos.infra.ome.decorator import StrategyMeta
# Context-local marker for "executing inside a strategy body"; defaults to
# None, i.e. no strategy frame is active.
_CURRENT_STRATEGY: ContextVar[StrategyMeta | None] = ContextVar(
    "current_strategy", default=None
)
"""Set by ``Runner.run`` around ``meta.func(event, ctx)``; read by
``@_refuse_inside_strategy``. ``None`` = not inside a strategy frame."""

View File

@ -0,0 +1,205 @@
"""EventDispatcher — routing layer applying the three OME gates.
For each dispatched event, every candidate strategy is run through three
gates in order:
1. ``enabled`` — strategy may be hot-disabled via config
2. ``applies_to`` — per-strategy predicate over the event payload
3. ``Counter`` — N-of-M rate/threshold gate against
:class:`CounterStore`
:meth:`dispatch` is the read-write entry point — passing the counter
gate increments the counter and returns ``(meta, run_id)`` pairs to
enqueue. :meth:`inspect` is its dry-run twin — same gates, no counter
mutation; returns one :class:`StrategyRouteInfo` per matched strategy
including a snapshot of the counter so debug callers can see why a
strategy will or won't fire.
By design ``inspect`` does not accept ``force_enabled`` /
``strategy_filter``: those are runtime overrides for the routing side
(``trigger_manual``), not properties a debugger should second-guess.
"""
from __future__ import annotations
from collections.abc import Callable
from uuid import uuid4
from everos.core.observability.logging import get_logger
from everos.infra.ome._dispatch.registry import StrategyRegistry
from everos.infra.ome._stores.counter import CounterStore
from everos.infra.ome.decorator import StrategyMeta
from everos.infra.ome.events import BaseEvent
from everos.infra.ome.records import CounterProgress, StrategyRouteInfo
logger = get_logger(__name__)
class EventDispatcher:
    """Route one event through the ``enabled / applies_to / Counter`` gates."""

    def __init__(
        self,
        *,
        registry: StrategyRegistry,
        counter_store: CounterStore,
    ) -> None:
        """Keep references to the strategy registry and the counter store."""
        self._counter_store = counter_store
        self._registry = registry

    async def dispatch(
        self,
        event: BaseEvent,
        *,
        force_enabled: bool = False,
        strategy_filter: str | None = None,
    ) -> list[tuple[StrategyMeta, str]]:
        """Gate the event and return the ``(meta, run_id)`` pairs to enqueue.

        Args:
            event: The event to route.
            force_enabled: Skip the ``meta.enabled`` gate; ``applies_to``
                and the counter gate still apply. Used by manual triggers
                with ``force=True``.
            strategy_filter: Consider only the named strategy, whether or
                not it subscribes to ``type(event)``. Raises ``KeyError``
                if the name is not registered.

        A strategy whose ``applies_to`` callable raises is logged and
        treated as non-matching; its siblings still dispatch. Framework
        errors (e.g. counter-store I/O) propagate. Passing the counter
        gate mutates the counter.
        """
        candidates: list[StrategyMeta] = (
            [self._registry.get(strategy_filter)]
            if strategy_filter is not None
            else list(self._registry.lookup_by_event(type(event)))
        )

        scheduled: list[tuple[StrategyMeta, str]] = []
        for meta in candidates:
            # Gates short-circuit in order: routing, enabled, applies_to.
            rejected = (
                not _routes_to(event, meta)
                or not (force_enabled or meta.enabled)
                or not _safe_applies(meta, event)
            )
            if rejected:
                continue

            gate = meta.gate
            if gate is not None:
                passed, _ = await self._counter_store.incr_and_check(
                    meta.name,
                    _bucket_key(event, gate.event_field),
                    threshold=gate.threshold,
                    cooldown_seconds=gate.cooldown_seconds,
                )
                if not passed:
                    continue

            scheduled.append((meta, uuid4().hex))
        return scheduled

    async def inspect(self, event: BaseEvent) -> list[StrategyRouteInfo]:
        """Dry-run counterpart of :meth:`dispatch`; never mutates counters.

        Produces one :class:`StrategyRouteInfo` per routed strategy with
        per-gate pass flags. When a gate exists and the earlier gates
        pass, the counter is read via ``get_progress`` and reported as it
        would look after this event (current value + 1). A faulty
        ``applies_to`` callable is logged and reported as
        ``applies_to_pass=False`` instead of aborting the inspection.
        """
        report: list[StrategyRouteInfo] = []
        for meta in self._registry.lookup_by_event(type(event)):
            if not _routes_to(event, meta):
                continue

            is_enabled = bool(meta.enabled)
            applies = is_enabled and _safe_applies(meta, event)
            gate = meta.gate

            progress: CounterProgress | None = None
            if not applies:
                counter_ok = False
            elif gate is None:
                counter_ok = True
            else:
                stored = await self._counter_store.get_progress(
                    meta.name,
                    _bucket_key(event, gate.event_field),
                )
                projected = stored + 1
                progress = CounterProgress(
                    current=projected, threshold=gate.threshold
                )
                counter_ok = projected >= gate.threshold

            report.append(
                StrategyRouteInfo(
                    strategy_name=meta.name,
                    enabled_pass=is_enabled,
                    applies_to_pass=applies,
                    counter_pass=counter_ok,
                    counter_progress=progress,
                )
            )
        return report
def _routes_to(event: BaseEvent, meta: StrategyMeta) -> bool:
"""Narrow engine-emitted ticks to their single target strategy.
Cron / Idle / Manual ticks carry a ``strategy_name`` naming the
intended recipient — without this filter two strategies listening
on the same tick class would cross-fire. Business events have no
such field and therefore fan out to every matching strategy.
"""
target = getattr(event, "strategy_name", None)
return target is None or target == meta.name
def _safe_applies(meta: StrategyMeta, event: BaseEvent) -> bool:
    """Evaluate ``meta.applies_to`` without letting its errors escape.

    Any exception raised while evaluating the predicate is logged with the
    strategy name and event topic, and the strategy is treated as not
    applying — one buggy predicate must not break the fan-out for the
    other strategies.
    """
    try:
        verdict = _applies(meta.applies_to, event)
    except Exception:  # noqa: BLE001
        logger.exception(
            "applies_to_callable_raised",
            strategy_name=meta.name,
            event_topic=type(event).topic(),
        )
        verdict = False
    return verdict
def _applies(
spec: str | Callable[[BaseEvent], bool] | None,
event: BaseEvent,
) -> bool:
"""Resolve ``applies_to`` semantics.
* ``None`` — strategy applies to every event in its subscription
* callable — invoke and bool-cast the result
* str — read the named event attribute and bool-cast it; falsy
values (``""``, ``0``, ``None``, empty containers) are treated
as "field unset", so the strategy does NOT apply
Exceptions raised by a user callable propagate; the dispatcher wraps
this call in :func:`_safe_applies` to localise blast radius.
"""
if spec is None:
return True
if callable(spec):
return bool(spec(event))
return bool(getattr(event, spec, None))
def _bucket_key(event: BaseEvent, field: str | None) -> str:
"""Compute a Counter-store bucket key from an event field.
``field=None`` means the gate is un-bucketed → single shared bucket
``"__all__"``. Missing or ``None`` field values map to ``"__none__"``
so a typo doesn't accidentally collapse every event into ``"__all__"``
(the StrategyRegistry validator catches typos at startup; the sentinel
here is the runtime safety net).
"""
if field is None:
return "__all__"
val = getattr(event, field, None)
return str(val) if val is not None else "__none__"

View File

@ -0,0 +1,152 @@
"""StrategyRegistry — registration + DAG cycle detection.
Mutated at startup via :meth:`register` / :meth:`validate`, and at
runtime via :meth:`replace` (config hot-reload). Cycle detection is a
Kahn-style topological pass on the event-flow DAG implied by
``trigger.on`` (incoming) and ``emits`` (outgoing).
"""
from __future__ import annotations
from collections import defaultdict, deque
from collections.abc import Callable
from typing import Any
from everos.infra.ome.decorator import StrategyMeta
from everos.infra.ome.events import BaseEvent, CronTick, IdleTick
from everos.infra.ome.exceptions import StartupValidationError
from everos.infra.ome.triggers import Cron, Idle, Immediate, Trigger
class StrategyRegistry:
    """Name-keyed registry of offline strategies with startup validation."""

    def __init__(self) -> None:
        """Start with no registered strategies."""
        self._strategies: dict[str, StrategyMeta] = {}

    def register(self, func: Callable[..., Any]) -> None:
        """Add the strategy described by ``func._ome_strategy_meta``.

        Raises:
            StartupValidationError: ``func`` carries no ``StrategyMeta``
                (i.e. is not decorated with ``@offline_strategy``), or a
                strategy of the same name is already registered.
        """
        meta = getattr(func, "_ome_strategy_meta", None)
        if not isinstance(meta, StrategyMeta):
            fn_name = getattr(func, "__name__", repr(func))
            raise StartupValidationError(
                f"register: {fn_name} is not decorated with @offline_strategy"
            )
        strategy_name = meta.name
        if strategy_name in self._strategies:
            raise StartupValidationError(
                f"register: duplicate strategy name {meta.name!r}"
            )
        self._strategies[strategy_name] = meta

    def replace(self, name: str, new_meta: StrategyMeta) -> None:
        """Overwrite the meta stored under ``name`` (hot-reload entry point).

        No cycle or gate validation is re-run, so callers must only pass
        metas whose DAG-shaping fields (``trigger.on``, ``emits``, trigger
        type) match the original.

        Raises:
            KeyError: ``name`` is not registered.
        """
        if name not in self._strategies:
            raise KeyError(name)
        self._strategies[name] = new_meta

    def get(self, name: str) -> StrategyMeta:
        """Look up a meta by strategy name; ``KeyError`` when absent."""
        return self._strategies[name]

    def all(self) -> list[StrategyMeta]:
        """Return a new list holding every registered meta."""
        return [*self._strategies.values()]

    def lookup_by_event(self, event_cls: type[BaseEvent]) -> list[StrategyMeta]:
        """List the strategies that may receive an event of ``event_cls``.

        * ``Immediate`` trigger — matches when ``event_cls`` is in its ``on``.
        * ``Cron`` trigger — matches ``CronTick``.
        * ``Idle`` trigger — matches ``IdleTick``.

        For ticks this returns every Cron / Idle strategy; the dispatcher
        narrows the set to the tick's target strategy afterwards.
        """

        def receives(trigger: Trigger) -> bool:
            if isinstance(trigger, Immediate):
                return event_cls in trigger.on
            if isinstance(trigger, Cron):
                return event_cls is CronTick
            if isinstance(trigger, Idle):
                return event_cls is IdleTick
            return False

        return [m for m in self._strategies.values() if receives(m.trigger)]

    def validate(self) -> None:
        """Run all startup checks: cycle detection, then gate field existence."""
        self._validate_no_cycles()
        self._validate_gate_event_fields()

    def _validate_no_cycles(self) -> None:
        """Reject cyclic event flow using Kahn's topological sort.

        There is an edge ``a → b`` when some event class in ``a.emits`` is
        in the ``on`` of ``b``'s ``Immediate`` trigger.

        Raises:
            StartupValidationError: Not every strategy could be ordered,
                i.e. the graph contains a cycle.
        """
        successors: dict[str, set[str]] = defaultdict(set)
        pending: dict[str, int] = dict.fromkeys(self._strategies, 0)
        listeners = [
            m for m in self._strategies.values() if isinstance(m.trigger, Immediate)
        ]
        for src in self._strategies.values():
            targets = successors[src.name]
            for ev in src.emits:
                for dst in listeners:
                    # Count each src → dst edge once, however many events link them.
                    if ev in dst.trigger.on and dst.name not in targets:
                        targets.add(dst.name)
                        pending[dst.name] += 1

        ready = deque(name for name, count in pending.items() if count == 0)
        processed = 0
        while ready:
            node = ready.popleft()
            processed += 1
            for succ in successors[node]:
                pending[succ] -= 1
                if pending[succ] == 0:
                    ready.append(succ)

        if processed < len(self._strategies):
            raise StartupValidationError("cycle detected in strategy DAG")

    def _validate_gate_event_fields(self) -> None:
        """Ensure every ``gate.event_field`` exists on each receivable event class.

        Catches a mistyped field name at startup instead of letting the
        gate's bucketing silently degrade at runtime.

        Raises:
            StartupValidationError: The field is missing from one of the
                event classes the strategy's trigger receives.
        """
        for meta in self._strategies.values():
            gate = meta.gate
            if gate is None or gate.event_field is None:
                continue
            field = gate.event_field
            for ev_cls in _event_classes_for_trigger(meta.trigger):
                if field in ev_cls.model_fields:  # type: ignore[operator] # Pydantic model_fields → dict via @deprecated_instance_property (pydantic/main.py:277)
                    continue
                raise StartupValidationError(
                    f"strategy {meta.name!r}: gate.event_field {field!r} "
                    f"not found in {ev_cls.__name__} fields "
                    f"(available: {list(ev_cls.model_fields)})"  # type: ignore[arg-type] # same as above
                )
def _event_classes_for_trigger(trigger: Trigger) -> list[type[BaseEvent]]:
"""Enumerate event classes a strategy with the given trigger receives."""
if isinstance(trigger, Immediate):
return list(trigger.on)
if isinstance(trigger, Cron):
return [CronTick]
if isinstance(trigger, Idle):
return [IdleTick]
raise NotImplementedError(f"unknown trigger type: {type(trigger).__name__}")

View File

@ -0,0 +1,247 @@
"""Runner — single-strategy execution with attempt-level retry + DLQ.
Acquires ``engine_sem`` (FIFO), drives the per-attempt RunRecord state
machine (``RUNNING → SUCCESS / FAILED / DEAD_LETTER``), and fires
``on_dead_letter`` after exhausted retries or contract violations.
Per attempt, binds ``strategy_name`` / ``run_id`` / ``attempt`` into
``structlog.contextvars`` (so every log record carries those fields
automatically) and sets ``_CURRENT_STRATEGY`` ContextVar around
``meta.func`` (so ``engine.emit`` can refuse direct calls from inside
a strategy — strategies emit via ``ctx.emit``).
**Idempotency contract**: if ``mark_success`` / ``mark_failed`` /
``mark_dead_letter`` fails after the strategy body returned, the
``RUNNING`` row stays and crash recovery on next start will treat the
run as crashed and re-enqueue the same event. Strategy bodies must
therefore be safe to re-execute with the same payload.
"""
from __future__ import annotations
import asyncio
import traceback
from collections.abc import Awaitable, Callable
from uuid import uuid4
from structlog.contextvars import bound_contextvars
from everos.component.utils.datetime import get_utc_now
from everos.core.observability.logging import get_logger
from everos.infra.ome._dispatch._state import _CURRENT_STRATEGY
from everos.infra.ome._stores.run_record import RunRecordStore
from everos.infra.ome.decorator import StrategyMeta
from everos.infra.ome.events import BaseEvent
from everos.infra.ome.exceptions import EmitNotDeclaredError, StrategyContractError
from everos.infra.ome.records import RunRecord
logger = get_logger(__name__)
class _RunCtx:
"""Per-invocation context handed to ``meta.func(event, ctx)``.
Carries ``run_id``, a strategy-scoped logger, and the ``emit``
callback that enforces the declared ``emits=[...]`` contract.
"""
def __init__(
self,
*,
run_id: str,
strategy_name: str,
emit_hook: Callable[[BaseEvent], Awaitable[None]],
declared_emits: frozenset[type[BaseEvent]],
) -> None:
self.run_id = run_id
self.logger = get_logger("ome.strategy")
self._emit_hook = emit_hook
self._declared = declared_emits
self._strategy_name = strategy_name
async def emit(self, event: BaseEvent) -> None:
if type(event) not in self._declared:
raise EmitNotDeclaredError(
strategy=self._strategy_name,
event=event,
)
await self._emit_hook(event)
class Runner:
"""Drive one strategy invocation through retries to a terminal state."""
def __init__(
self,
*,
run_record_store: RunRecordStore,
engine_sem: asyncio.Semaphore,
emit_hook: Callable[[BaseEvent], Awaitable[None]],
on_dead_letter: Callable[[RunRecord], None] | None = None,
) -> None:
self._rec = run_record_store
self._sem = engine_sem
self._emit_hook = emit_hook
self._on_dead_letter = on_dead_letter
async def run(
    self,
    meta: StrategyMeta,
    event: BaseEvent,
    *,
    run_id: str,
    max_retries_snapshot: int,
) -> None:
    """Drive ``meta.func(event, ctx)`` through up to ``max_retries_snapshot + 1`` attempts.

    The engine semaphore is held across the whole retry chain, so the
    concurrency cap covers retries too. The first attempt uses the given
    ``run_id``; every later attempt gets a fresh id, so each try has its
    own row in the run history.

    Raises:
        ValueError: ``max_retries_snapshot`` is negative.
    """
    if max_retries_snapshot < 0:
        raise ValueError(
            f"max_retries_snapshot must be >= 0, got {max_retries_snapshot}"
        )

    async with self._sem:
        # Topic and payload are computed once and reused by every attempt.
        topic = type(event).topic()
        payload = event.model_dump_json()
        for attempt in range(max_retries_snapshot + 1):
            attempt_run_id = run_id if attempt == 0 else uuid4().hex
            finished = await self._run_one_attempt(
                meta=meta,
                event=event,
                current_run_id=attempt_run_id,
                attempt=attempt,
                event_topic=topic,
                event_payload=payload,
                max_retries_snapshot=max_retries_snapshot,
            )
            if finished:
                return
async def _run_one_attempt(
self,
*,
meta: StrategyMeta,
event: BaseEvent,
current_run_id: str,
attempt: int,
event_topic: str,
event_payload: str,
max_retries_snapshot: int,
) -> bool:
"""Run one attempt; return ``True`` if a terminal state was
written (success / dead-letter or persistence failure), ``False``
if FAILED and the caller should loop into the next attempt.
"""
ctx = _RunCtx(
run_id=current_run_id,
strategy_name=meta.name,
emit_hook=self._emit_hook,
declared_emits=meta.emits,
)
with bound_contextvars( # type: ignore[arg-type] # structlog typed as Generator; @contextmanager wraps at runtime (structlog/contextvars.py:170)
strategy_name=meta.name,
run_id=current_run_id,
attempt=attempt,
):
if not await self._record_start(
run_id=current_run_id,
strategy_name=meta.name,
attempt=attempt,
event_topic=event_topic,
event_payload=event_payload,
max_retries_snapshot=max_retries_snapshot,
):
return True # mark_running failed; abort run, no DB row exists
try:
token = _CURRENT_STRATEGY.set(meta)
try:
await meta.func(event, ctx)
finally:
_CURRENT_STRATEGY.reset(token)
except StrategyContractError as e:
await self._terminate_dead_letter(current_run_id, _format_error(e))
return True
except Exception as e: # noqa: BLE001
err = _format_error(e)
if attempt < max_retries_snapshot:
await self._rec.mark_failed(
run_id=current_run_id,
finished_at=get_utc_now(),
error=err,
)
return False # caller will retry
await self._terminate_dead_letter(current_run_id, err)
return True
else:
await self._rec.mark_success(
run_id=current_run_id,
finished_at=get_utc_now(),
)
return True
async def _record_start(
self,
*,
run_id: str,
strategy_name: str,
attempt: int,
event_topic: str,
event_payload: str,
max_retries_snapshot: int,
) -> bool:
"""Persist this attempt as RUNNING; return ``False`` on write failure.
When the write fails (DB lock, disk full, ...) the caller
aborts the retry loop — without a RUNNING row crash recovery
cannot rediscover the run, and it is silently lost. The
exception log emitted here is the only audit trail.
"""
try:
await self._rec.mark_running(
run_id=run_id,
strategy_name=strategy_name,
attempt=attempt,
event_topic=event_topic,
event_payload=event_payload,
max_retries_snapshot=max_retries_snapshot,
)
except Exception: # noqa: BLE001
logger.exception(
"mark_running_failed",
run_id=run_id,
strategy_name=strategy_name,
attempt=attempt,
)
return False
return True
async def _terminate_dead_letter(self, run_id: str, error: str) -> None:
"""Mark DEAD_LETTER and fire ``on_dead_letter`` callback if set."""
await self._rec.mark_dead_letter(
run_id=run_id,
finished_at=get_utc_now(),
error=error,
)
await self._fire_dead_letter_callback(run_id)
async def _fire_dead_letter_callback(self, run_id: str) -> None:
if self._on_dead_letter is None:
return
rec = await self._rec.get(run_id)
if rec is None:
return
try:
self._on_dead_letter(rec)
except Exception: # noqa: BLE001
logger.exception("on_dead_letter_failed")
def _format_error(e: BaseException) -> str:
    """Render ``e`` as ``"<Type>: <message>"`` followed by its full traceback.

    The traceback is taken from ``e`` itself (``e.__traceback__``) rather
    than from the exception currently being handled, so the result is
    correct even when this is called outside an ``except`` block or while
    a different exception is active. When called from the handler of
    ``e`` the output is the same as formatting the active exception.

    Args:
        e: The exception to describe.

    Returns:
        A multi-line string: a summary line, then the formatted traceback
        (including any chained ``__cause__`` / ``__context__``).
    """
    rendered = "".join(traceback.format_exception(type(e), e, e.__traceback__))
    return f"{type(e).__name__}: {e}\n{rendered}"

View File

@ -0,0 +1 @@
"""Internal: SQLite-backed state stores (counter / idle / run_record)."""

View File

@ -0,0 +1,107 @@
"""CounterStore — persistent (strategy_name, bucket_key) → counter rows.
Backs the ``Counter`` gate in OME's dispatch pipeline: each call to
:meth:`CounterStore.incr_and_check` atomically increments the bucket's
counter and reports whether the strategy should fire this time.
Pass semantics:
- ``counter >= threshold`` AND cooldown elapsed → ``passed=True``
- On pass, the row's counter resets to 0 and ``last_passed_ts``
advances to ``now``; the next pass needs a fresh accumulation.
- ``cooldown_seconds=0`` disables the cooldown gate (threshold alone).
"""
from __future__ import annotations
from datetime import timedelta
from everos.component.utils.datetime import (
from_iso_format,
get_utc_now,
to_iso_format,
)
from everos.infra.ome._stores.storage import OMEStorage
class CounterStore:
    """Persistent per-``(strategy_name, bucket_key)`` counters for the ``Counter`` gate."""

    def __init__(self, storage: OMEStorage) -> None:
        """Keep a reference to the storage used for connections and transactions."""
        self._storage = storage

    async def incr_and_check(
        self,
        strategy_name: str,
        bucket_key: str,
        *,
        threshold: int,
        cooldown_seconds: int,
    ) -> tuple[bool, int]:
        """Bump the bucket's counter by one and decide whether the gate passes.

        The read, the decision and the write all happen inside a single
        storage transaction.

        Args:
            strategy_name: Strategy owning the counter.
            bucket_key: Bucket within the strategy.
            threshold: The gate can pass once the incremented counter is
                ``>=`` this value.
            cooldown_seconds: Minimum seconds since the previous pass;
                ``0`` turns the cooldown check off.

        Returns:
            ``(passed, counter)`` where ``counter`` is the incremented
            value *before* any reset. On a pass the stored counter is
            reset to 0 and ``last_passed_ts`` is set to the current time;
            otherwise the incremented counter is stored and
            ``last_passed_ts`` is left as it was.
        """
        now = get_utc_now()
        async with self._storage.transaction() as conn:
            cursor = await conn.execute(
                "SELECT counter, last_passed_ts FROM counter_store "
                "WHERE strategy_name = ? AND bucket_key = ?",
                (strategy_name, bucket_key),
            )
            existing = await cursor.fetchone()
            if existing:
                previous_count, raw_last_passed = existing[0], existing[1]
            else:
                previous_count, raw_last_passed = 0, None

            counter = previous_count + 1
            last_passed = from_iso_format(raw_last_passed) if raw_last_passed else None

            # Cooldown is satisfied when disabled, never passed before,
            # or enough time has elapsed since the last pass.
            if cooldown_seconds == 0 or last_passed is None:
                cooldown_ok = True
            else:
                cooldown_ok = now - last_passed >= timedelta(seconds=cooldown_seconds)
            passed = counter >= threshold and cooldown_ok

            if passed:
                stored_counter, stored_ts = 0, to_iso_format(now)
            elif last_passed:
                stored_counter, stored_ts = counter, to_iso_format(last_passed)
            else:
                stored_counter, stored_ts = counter, None

            await conn.execute(
                "INSERT INTO counter_store (strategy_name, bucket_key, "
                "counter, last_passed_ts) "
                "VALUES (?, ?, ?, ?) "
                "ON CONFLICT(strategy_name, bucket_key) DO UPDATE SET "
                "counter = excluded.counter, "
                "last_passed_ts = excluded.last_passed_ts",
                (strategy_name, bucket_key, stored_counter, stored_ts),
            )
        return passed, counter

    async def get_progress(self, strategy_name: str, bucket_key: str) -> int:
        """Read the stored counter for a bucket without modifying it.

        Returns:
            The persisted counter, or ``0`` when the bucket has no row.
        """
        async with self._storage.connect() as conn:
            cursor = await conn.execute(
                "SELECT counter FROM counter_store "
                "WHERE strategy_name = ? AND bucket_key = ?",
                (strategy_name, bucket_key),
            )
            found = await cursor.fetchone()
        if not found:
            return 0
        return found[0]

View File

@ -0,0 +1,64 @@
"""IdleStore — last_activity_ts rows backing the Idle trigger.
All writes pass through ``to_iso_format`` over a tz-aware datetime, so
``last_activity_ts`` is a fixed-format ISO 8601 string whose
lexicographic order matches temporal order — :meth:`scan_idle` relies
on this to keep the column un-wrapped in its predicate so SQLite can
use ``idx_idle_scan``.
"""
from __future__ import annotations
from datetime import datetime, timedelta
from everos.component.utils.datetime import from_iso_format, to_iso_format
from everos.infra.ome._stores.storage import OMEStorage
class IdleStore:
    """Last-activity timestamps per ``(strategy_name, bucket_key)`` for the ``Idle`` trigger."""

    def __init__(self, storage: OMEStorage) -> None:
        """Keep a reference to the storage used to open connections."""
        self._storage = storage

    async def touch(self, strategy_name: str, bucket_key: str, *, at: datetime) -> None:
        """Insert or overwrite the bucket's ``last_activity_ts`` with ``at``."""
        stamp = to_iso_format(at)
        async with self._storage.connect() as conn:
            await conn.execute(
                "INSERT INTO idle_store "
                "(strategy_name, bucket_key, last_activity_ts) "
                "VALUES (?, ?, ?) "
                "ON CONFLICT(strategy_name, bucket_key) DO UPDATE SET "
                "last_activity_ts = excluded.last_activity_ts",
                (strategy_name, bucket_key, stamp),
            )
            await conn.commit()

    async def scan_idle(
        self, strategy_name: str, *, idle_seconds: int, now: datetime
    ) -> list[str]:
        """List buckets whose last activity is at or before ``now - idle_seconds``.

        Results are ordered by ``last_activity_ts`` ascending (least
        recently active first).
        """
        # The cutoff is computed in Python and compared as a string so the
        # stored column is never wrapped in a function inside the predicate.
        threshold = now - timedelta(seconds=idle_seconds)
        cutoff = to_iso_format(threshold)
        async with self._storage.connect() as conn:
            cursor = await conn.execute(
                "SELECT bucket_key FROM idle_store "
                "WHERE strategy_name = ? AND last_activity_ts <= ? "
                "ORDER BY last_activity_ts ASC",
                (strategy_name, cutoff),
            )
            matches = await cursor.fetchall()
        return [match[0] for match in matches]

    async def get_last_activity(
        self, strategy_name: str, bucket_key: str
    ) -> datetime | None:
        """Fetch the bucket's stored timestamp, or ``None`` if it has no row."""
        async with self._storage.connect() as conn:
            cursor = await conn.execute(
                "SELECT last_activity_ts FROM idle_store "
                "WHERE strategy_name = ? AND bucket_key = ?",
                (strategy_name, bucket_key),
            )
            found = await cursor.fetchone()
        if not found:
            return None
        return from_iso_format(found[0])

View File

@ -0,0 +1,168 @@
"""RunRecord persistence — state machine writes + same-transaction ring-buffer trim.
State machine (one row per ``run_id``):
RUNNING → SUCCESS / FAILED / DEAD_LETTER / CRASHED
Every :meth:`RunRecordStore.mark_running` INSERT runs inside one
``BEGIN IMMEDIATE`` transaction with a paired DELETE that keeps only
the newest ``max_records_per_strategy`` rows for that strategy. Bound
is enforced atomically — no background sweeper, no transient
over-budget state.
"""
from __future__ import annotations
from datetime import datetime
from typing import Any
from everos.component.utils.datetime import (
from_iso_format,
get_utc_now,
to_iso_format,
)
from everos.infra.ome._stores.storage import OMEStorage
from everos.infra.ome.records import RunRecord, RunStatus
class RunRecordStore:
    """Read/write access to ``run_record`` rows, with a per-strategy size bound."""

    def __init__(self, storage: OMEStorage, max_records_per_strategy: int) -> None:
        """Remember the storage and the number of rows kept per strategy."""
        self._max = max_records_per_strategy
        self._storage = storage

    async def mark_running(
        self,
        *,
        run_id: str,
        strategy_name: str,
        attempt: int,
        event_topic: str,
        event_payload: str,
        max_retries_snapshot: int,
    ) -> None:
        """Insert a RUNNING row and prune the strategy's oldest rows.

        Both statements run in one storage transaction. After the insert,
        only the newest ``max_records_per_strategy`` rows (by
        ``started_at``) of this strategy are kept.
        """
        started_at = to_iso_format(get_utc_now())
        new_row = (
            run_id,
            strategy_name,
            RunStatus.RUNNING.value,
            attempt,
            started_at,
            event_topic,
            event_payload,
            max_retries_snapshot,
        )
        async with self._storage.transaction() as conn:
            await conn.execute(
                "INSERT INTO run_record "
                "(run_id, strategy_name, status, attempt, started_at, "
                " event_topic, event_payload, max_retries_snapshot) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                new_row,
            )
            await conn.execute(
                "DELETE FROM run_record "
                "WHERE strategy_name = ? AND run_id NOT IN ("
                "  SELECT run_id FROM run_record WHERE strategy_name = ? "
                "  ORDER BY started_at DESC LIMIT ?)",
                (strategy_name, strategy_name, self._max),
            )

    async def mark_success(self, *, run_id: str, finished_at: datetime) -> None:
        """Set the row to SUCCESS and clear its error."""
        await self._update_status(run_id, RunStatus.SUCCESS, finished_at, None)

    async def mark_failed(
        self, *, run_id: str, finished_at: datetime, error: str
    ) -> None:
        """Set the row to FAILED with ``error`` (a retry is expected to follow)."""
        await self._update_status(run_id, RunStatus.FAILED, finished_at, error)

    async def mark_dead_letter(
        self, *, run_id: str, finished_at: datetime, error: str
    ) -> None:
        """Set the row to DEAD_LETTER with ``error`` (no further retries)."""
        await self._update_status(run_id, RunStatus.DEAD_LETTER, finished_at, error)

    async def mark_crashed(
        self, *, run_id: str, finished_at: datetime, error: str
    ) -> None:
        """Set the row to CRASHED with ``error`` (used by crash recovery)."""
        await self._update_status(run_id, RunStatus.CRASHED, finished_at, error)

    async def _update_status(
        self,
        run_id: str,
        status: RunStatus,
        finished_at: datetime,
        error: str | None,
    ) -> None:
        """Overwrite status, finish time and error of ``run_id`` and commit.

        The UPDATE is keyed on ``run_id`` only; the row's current status
        is not checked.
        """
        params = (status.value, to_iso_format(finished_at), error, run_id)
        async with self._storage.connect() as conn:
            await conn.execute(
                "UPDATE run_record "
                "SET status = ?, finished_at = ?, error = ? "
                "WHERE run_id = ?",
                params,
            )
            await conn.commit()

    async def get(self, run_id: str) -> RunRecord | None:
        """Load one record by id; ``None`` when no such row exists."""
        query = _SELECT_COLUMNS + " WHERE run_id = ?"
        async with self._storage.connect() as conn:
            cursor = await conn.execute(query, (run_id,))
            found = await cursor.fetchone()
        if not found:
            return None
        return _row_to_record(found)

    async def list_runs(
        self,
        *,
        strategy_name: str,
        status: RunStatus | None = None,
        limit: int = 100,
    ) -> list[RunRecord]:
        """List a strategy's records ordered by ``started_at`` descending.

        Args:
            strategy_name: Strategy whose records to return.
            status: When given, only rows with this status are returned.
            limit: Maximum number of rows.
        """
        fragments = [_SELECT_COLUMNS, " WHERE strategy_name = ?"]
        params: list[Any] = [strategy_name]
        if status is not None:
            fragments.append(" AND status = ?")
            params.append(status.value)
        fragments.append(" ORDER BY started_at DESC LIMIT ?")
        params.append(limit)
        query = "".join(fragments)
        async with self._storage.connect() as conn:
            cursor = await conn.execute(query, params)
            fetched = await cursor.fetchall()
        return [_row_to_record(item) for item in fetched]

    async def find_running(self) -> list[RunRecord]:
        """Return all rows whose status is RUNNING (input to crash recovery)."""
        query = _SELECT_COLUMNS + " WHERE status = ?"
        async with self._storage.connect() as conn:
            cursor = await conn.execute(query, (RunStatus.RUNNING.value,))
            fetched = await cursor.fetchall()
        return [_row_to_record(item) for item in fetched]
_SELECT_COLUMNS = (
"SELECT run_id, strategy_name, status, attempt, started_at, finished_at, "
" error, event_topic, event_payload, max_retries_snapshot "
"FROM run_record"
)
def _row_to_record(row: tuple) -> RunRecord:
    """Build a ``RunRecord`` from a row laid out in ``_SELECT_COLUMNS`` order.

    ``started_at`` is always parsed; ``finished_at`` is parsed only when
    the stored value is truthy and is ``None`` otherwise.
    """
    raw_finished = row[5]
    finished_at = from_iso_format(raw_finished) if raw_finished else None
    started_at = from_iso_format(row[4])
    status = RunStatus(row[2])
    return RunRecord(
        run_id=row[0],
        strategy_name=row[1],
        status=status,
        attempt=row[3],
        started_at=started_at,
        finished_at=finished_at,
        error=row[6],
        event_topic=row[7],
        event_payload=row[8],
        max_retries_snapshot=row[9],
    )

View File

@ -0,0 +1,115 @@
"""OME SQLite storage — schema initialization + connection factory.
Single file (default ``MemoryRoot.default().ome_db`` ≡
``<memory-root>/.index/sqlite/ome.db``). Holds 3 OME-managed tables
(counter_store / idle_store / run_record); APS jobstore table is created
by APScheduler itself when its SQLAlchemyJobStore connects.
PRAGMA scopes (see https://www.sqlite.org/pragma.html):
- ``journal_mode=WAL`` is file-level — persisted in the db header,
applied once in :meth:`OMEStorage.init`.
- ``synchronous=NORMAL``, ``cache_size=-65536``, ``busy_timeout=5000``
are connection-level and reset on every new connection, so they are
re-applied inside :meth:`OMEStorage.connect` (which is why
``connect`` is an ``@asynccontextmanager`` rather than a passthrough).
This mirrors SQLAlchemy's canonical ``@event.listens_for(Engine,
"connect")`` pattern for SQLite — aiosqlite exposes no equivalent
hook. ``busy_timeout=5000`` matters because the APS jobstore writes
its own table in the same db file; without it, WAL writer-vs-writer
contention surfaces as ``SQLITE_BUSY`` instead of brief backoff.
"""
from __future__ import annotations
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from pathlib import Path
import aiosqlite
_SCHEMA = """
CREATE TABLE IF NOT EXISTS counter_store (
strategy_name TEXT NOT NULL,
bucket_key TEXT NOT NULL,
counter INTEGER NOT NULL DEFAULT 0,
last_passed_ts TIMESTAMP,
PRIMARY KEY (strategy_name, bucket_key)
);
CREATE TABLE IF NOT EXISTS idle_store (
strategy_name TEXT NOT NULL,
bucket_key TEXT NOT NULL,
last_activity_ts TIMESTAMP NOT NULL,
PRIMARY KEY (strategy_name, bucket_key)
);
CREATE INDEX IF NOT EXISTS idx_idle_scan
ON idle_store (strategy_name, last_activity_ts);
CREATE TABLE IF NOT EXISTS run_record (
run_id TEXT PRIMARY KEY,
strategy_name TEXT NOT NULL,
status TEXT NOT NULL,
attempt INTEGER NOT NULL DEFAULT 0,
started_at TIMESTAMP NOT NULL,
finished_at TIMESTAMP,
error TEXT,
event_topic TEXT NOT NULL,
event_payload TEXT NOT NULL,
max_retries_snapshot INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_run_strategy_started
ON run_record (strategy_name, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_run_status_started
ON run_record (status, started_at DESC);
"""
_INIT_PRAGMAS = ("PRAGMA journal_mode=WAL",)
_CONN_PRAGMAS = (
"PRAGMA synchronous=NORMAL",
"PRAGMA cache_size=-65536",
"PRAGMA busy_timeout=5000",
)
class OMEStorage:
    """Schema bootstrap and connection/transaction helpers for the OME SQLite file."""

    def __init__(self, db_path: Path) -> None:
        """Record the database location; nothing is opened or created here."""
        self.db_path = db_path

    async def init(self) -> None:
        """Prepare the database file.

        Creates the parent directory if needed, then on one connection
        runs the init pragmas, executes the schema script and commits.
        """
        parent_dir = self.db_path.parent
        parent_dir.mkdir(parents=True, exist_ok=True)
        async with aiosqlite.connect(self.db_path) as db:
            for statement in _INIT_PRAGMAS:
                await db.execute(statement)
            await db.executescript(_SCHEMA)
            await db.commit()

    @asynccontextmanager
    async def connect(self) -> AsyncIterator[aiosqlite.Connection]:
        """Open a connection, apply the connection pragmas, and yield it.

        The connection is closed when the ``async with`` block exits.
        Nothing is committed here; callers commit their own writes.
        """
        async with aiosqlite.connect(self.db_path) as db:
            for statement in _CONN_PRAGMAS:
                await db.execute(statement)
            yield db

    @asynccontextmanager
    async def transaction(self) -> AsyncIterator[aiosqlite.Connection]:
        """Yield a connection with a ``BEGIN IMMEDIATE`` transaction open.

        The transaction is committed when the body finishes normally. If
        ``BEGIN``, the body, or the commit raises an ``Exception``, the
        transaction is rolled back and the exception re-raised.
        """
        async with self.connect() as db:
            try:
                await db.execute("BEGIN IMMEDIATE")
                yield db
                await db.commit()
            except Exception:
                await db.rollback()
                raise

View File

@ -0,0 +1,157 @@
"""OMEConfig (engine-level) + TomlRoot (per-strategy override schema).
All models forbid extra keys so configuration typos surface at startup
as StartupValidationError instead of being silently ignored.
"""
from __future__ import annotations
from pathlib import Path
from typing import Annotated, Self
from apscheduler.triggers.cron import CronTrigger
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from everos.core.persistence.memory_root import MemoryRoot
def _default_jobstore_path() -> Path:
    """Return the ``ome_db`` path of the default memory root."""
    default_root = MemoryRoot.default()
    return default_root.ome_db
class CounterOverride(BaseModel):
    """TOML override for a strategy's Counter gate (per-key None means keep).

    Unknown keys are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")
    # Must be > 0 when provided.
    threshold: Annotated[int, Field(gt=0)] | None = None
    # Must be >= 0 when provided.
    cooldown_seconds: Annotated[int, Field(ge=0)] | None = None
    # Must be a non-empty string when provided.
    event_field: Annotated[str, Field(min_length=1)] | None = None
class StrategyOverride(BaseModel):
    """Per-strategy overrides read from TOML; ``None`` leaves a setting untouched."""

    model_config = ConfigDict(extra="forbid")
    enabled: bool | None = None
    max_retries: Annotated[int, Field(ge=0)] | None = None
    gate: CounterOverride | None = None
    cron: str | None = None
    idle_seconds: Annotated[int, Field(gt=0)] | None = None
    scan_interval_seconds: Annotated[int, Field(gt=0)] | None = None

    @field_validator("cron")
    @classmethod
    def _validate_crontab(cls, v: str | None) -> str | None:
        """Reject crontab strings that ``CronTrigger.from_crontab`` cannot parse."""
        if v is None:
            return v
        # Parsed only for validation; the resulting trigger is discarded.
        CronTrigger.from_crontab(v)
        return v

    @model_validator(mode="after")
    def _check_idle_pair_consistency(self) -> Self:
        """Require ``scan_interval_seconds <= idle_seconds // 2`` when both are set.

        If only one of the two is present the check is skipped, since the
        missing half is merged from existing metadata downstream.
        """
        both_present = (
            self.idle_seconds is not None and self.scan_interval_seconds is not None
        )
        if not both_present:
            return self
        if self.scan_interval_seconds > self.idle_seconds // 2:
            raise ValueError(
                "StrategyOverride: scan_interval_seconds "
                f"({self.scan_interval_seconds}) must be <= idle_seconds // 2 "
                f"({self.idle_seconds // 2})"
            )
        return self
class TomlRoot(BaseModel):
    """Top-level TOML schema for ome.toml.

    Unknown top-level keys are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")
    # Keys of this mapping are presumably strategy names — confirm against
    # the config reloader; values are that entry's overrides. Defaults to
    # an empty mapping.
    strategies: dict[str, StrategyOverride] = Field(default_factory=dict)
class OMEConfig(BaseModel):
    """Engine-level configuration consumed by OfflineEngine.

    Unknown keys are rejected (``extra="forbid"``). Each field's meaning is
    given in its ``description``. After validation ``aps_jobstore_path`` is
    never ``None``: when left unset it is derived from ``jobstore_path``.
    """

    model_config = ConfigDict(extra="forbid")
    jobstore_path: Path = Field(
        default_factory=_default_jobstore_path,
        description="SQLite DB path holding OME's own state (run records, "
        "counter store, idle store). Defaults to "
        "``MemoryRoot.default().ome_db`` (``<memory-root>/.index/sqlite/ome.db``).",
    )
    aps_jobstore_path: Path | None = Field(
        default=None,
        description="SQLite DB path holding the APScheduler jobstore. Kept "
        "in a separate file from ``jobstore_path`` so APS's sync SQLAlchemy "
        "writer never contends with OME's async aiosqlite writer for the "
        "same SQLite file lock. When unset, defaults to a sibling "
        "``<stem>.aps.db`` next to ``jobstore_path``.",
    )
    max_concurrent_runs: Annotated[
        int,
        Field(
            gt=0,
            description="Engine-wide cap on concurrent strategy invocations "
            "(asyncio.Semaphore in Runner).",
        ),
    ] = 20
    max_retries: Annotated[
        int,
        Field(
            ge=0,
            description="Default retry budget per run, overridable via "
            "@offline_strategy(max_retries=...) or StrategyOverride.max_retries. "
            "0 disables retries.",
        ),
    ] = 1
    max_records_per_strategy: Annotated[
        int,
        Field(
            gt=0,
            description="Per-strategy RunRecord ring-buffer size; oldest "
            "entries are pruned on insert.",
        ),
    ] = 1000
    crash_recovery_timeout_seconds: Annotated[
        int,
        Field(
            gt=0,
            description="A run lingering in RUNNING longer than this is "
            "treated as crashed, marked CRASHED, and re-enqueued with a "
            "fresh run_id.",
        ),
    ] = 1800
    config_path: Path | None = Field(
        default=None,
        description="Path to ome.toml for per-strategy overrides. None "
        "disables TOML-driven hot reload.",
    )
    config_watch: bool = Field(
        default=True,
        description="When true and config_path is set, watch the file for "
        "edits and apply overrides at runtime.",
    )
    config_watch_debounce_ms: Annotated[
        int,
        Field(
            gt=0,
            description="Debounce window collapsing bursts of filesystem "
            "events (e.g. editor saves) into one reload.",
        ),
    ] = 1600

    @model_validator(mode="after")
    def _derive_aps_jobstore_path(self) -> Self:
        """Fill in ``aps_jobstore_path`` from ``jobstore_path`` when it is unset.

        The derived path sits in the same directory and is named
        ``<jobstore stem>.aps.db`` (e.g. ``ome.db`` -> ``ome.aps.db``). An
        explicitly provided value is left untouched.
        """
        # When unset, materialize as a sibling of jobstore_path so callers
        # that pass only jobstore_path (e.g. tests using tmp_path) still get
        # an isolated APS db rather than the global default root.
        if self.aps_jobstore_path is None:
            self.aps_jobstore_path = self.jobstore_path.with_name(
                self.jobstore_path.stem + ".aps.db"
            )
        return self

View File

@ -0,0 +1,33 @@
"""StrategyContext Protocol — injected as second arg to every strategy.
Strategies access run-local state through `run_id` and `logger`, and
chain-emit follow-up events via `emit(event)`. Business IO is NOT mediated
by this Protocol — strategies directly import their persistence adapters
(memory → infra is allowed under the project's DDD layering).
"""
from __future__ import annotations
from typing import Protocol
from structlog.types import FilteringBoundLogger
from everos.infra.ome.events import BaseEvent
class StrategyContext(Protocol):
    """Per-run context handed to a strategy function.

    - run_id: the current RunRecord id (string).
    - logger: structlog logger; ``strategy_name`` / ``run_id`` /
      ``attempt`` are auto-injected into every log record in this call
      — strategies don't have to use this specific logger to get those
      fields.
    - emit(event): chain-emit a follow-up event (must be in decorator's
      ``emits=[...]``, else EmitNotDeclaredError).
    """

    # Structural interface only: any object exposing these two attributes
    # and an async ``emit`` satisfies it.
    run_id: str
    logger: FilteringBoundLogger

    async def emit(self, event: BaseEvent) -> None: ...

View File

@ -0,0 +1,69 @@
"""@offline_strategy decorator — attaches StrategyMeta to the function.
Decorator is side-effect-free; engine collects via explicit
`engine.register(func)`.
"""
from __future__ import annotations
import inspect
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from everos.infra.ome.context import StrategyContext
from everos.infra.ome.events import BaseEvent
from everos.infra.ome.gates import Counter
from everos.infra.ome.triggers import Trigger
# Accepted forms of the ``applies_to`` decorator argument: a string, a
# predicate over the event, or None. How each form is interpreted is
# decided by the dispatcher — not visible in this module.
type AppliesTo = str | Callable[[BaseEvent], bool] | None
# Signature every strategy function must have: an async callable taking
# ``(event, ctx)`` and returning nothing.
type StrategyFn = Callable[[BaseEvent, StrategyContext], Awaitable[None]]
@dataclass(frozen=True)
class StrategyMeta:
    """Captured at decoration time; consumed by engine.register().

    Immutable (``frozen=True``): the values are exactly those passed to
    :func:`offline_strategy`, with ``emits`` converted to a frozenset.
    """

    # Non-blank strategy name (validated by offline_strategy).
    name: str
    # Trigger declared for the strategy.
    trigger: Trigger
    # Event types the strategy declared in ``emits=[...]``.
    emits: frozenset[type[BaseEvent]]
    # Optional applicability filter; see AppliesTo.
    applies_to: AppliesTo
    # Optional Counter gate; None means no gate was declared.
    gate: Counter | None
    # Per-strategy retry budget; None means none was given to the decorator.
    max_retries: int | None
    # Value of the decorator's ``enabled`` flag.
    enabled: bool
    # The decorated coroutine function itself.
    func: StrategyFn
def offline_strategy(
    *,
    name: str,
    trigger: Trigger,
    emits: list[type[BaseEvent]],
    applies_to: AppliesTo = None,
    gate: Counter | None = None,
    max_retries: int | None = None,
    enabled: bool = True,
) -> Callable[[StrategyFn], StrategyFn]:
    """Mark an async function as an OME strategy.

    The returned decorator attaches a :class:`StrategyMeta` to the function
    as ``_ome_strategy_meta`` and returns the same function object; nothing
    is registered with any engine here.

    Args:
        name: Strategy name; must contain at least one non-whitespace
            character.
        trigger: Trigger stored on the metadata.
        emits: Event types the strategy may emit; stored as a frozenset.
        applies_to: Optional applicability filter stored on the metadata.
        gate: Optional Counter gate stored on the metadata.
        max_retries: Optional per-strategy retry budget.
        enabled: Initial enabled flag.

    Returns:
        A decorator for coroutine functions.

    Raises:
        ValueError: ``name`` is empty or whitespace-only (raised when
            ``offline_strategy(...)`` is called).
        TypeError: The decorated object is not a coroutine function
            (raised when the decorator is applied).
    """
    if not name or not name.strip():
        raise ValueError("offline_strategy: name must be a non-empty string")

    def wrap(func: StrategyFn) -> StrategyFn:
        if not inspect.iscoroutinefunction(func):
            # Not every callable has __name__ (e.g. functools.partial or an
            # instance with __call__); fall back to repr so the intended
            # TypeError is raised instead of an AttributeError.
            label = getattr(func, "__name__", None) or repr(func)
            raise TypeError(
                f"offline_strategy: {label} must be async (coroutine function)"
            )
        meta = StrategyMeta(
            name=name,
            trigger=trigger,
            emits=frozenset(emits),
            applies_to=applies_to,
            gate=gate,
            max_retries=max_retries,
            enabled=enabled,
            func=func,
        )
        func._ome_strategy_meta = meta  # type: ignore[attr-defined]
        return func

    return wrap

View File

@ -0,0 +1,797 @@
"""OfflineEngine — OME runtime and scheduler.
Manages strategy registration, start-stop lifecycle, event dispatch, and
scheduling of Cron and Idle triggers via APScheduler. Enforces single-engine
guard via portalocker for concurrent access safety.
"""
from __future__ import annotations
import asyncio
import functools
import inspect
from collections.abc import Callable
from pathlib import Path
from typing import Any
from uuid import uuid4
import portalocker
from apscheduler.executors.asyncio import AsyncIOExecutor
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from apscheduler.triggers.interval import IntervalTrigger
from everos.component.utils.datetime import get_utc_now
from everos.core.observability.logging import get_logger
from everos.infra.ome._background.config_reloader import ConfigReloader
from everos.infra.ome._background.crash_recovery import scan_and_resume
from everos.infra.ome._background.idle_scanner import IdleScanner
from everos.infra.ome._dispatch._state import _CURRENT_STRATEGY
from everos.infra.ome._dispatch.dispatcher import EventDispatcher
from everos.infra.ome._dispatch.registry import StrategyRegistry
from everos.infra.ome._dispatch.runner import Runner
from everos.infra.ome._stores.counter import CounterStore
from everos.infra.ome._stores.idle import IdleStore
from everos.infra.ome._stores.run_record import RunRecordStore
from everos.infra.ome._stores.storage import OMEStorage
from everos.infra.ome.config import OMEConfig
from everos.infra.ome.decorator import StrategyMeta
from everos.infra.ome.events import BaseEvent, CronTick, ManualTick, resolve_topic
from everos.infra.ome.exceptions import (
EngineCallFromStrategyError,
EngineLockHeldError,
OMEError,
)
from everos.infra.ome.records import RunRecord, RunStatus, StrategyRouteInfo
from everos.infra.ome.triggers import Cron, Idle
logger = get_logger(__name__)
# Engines indexed by their ``_engine_id``. The module-level APS callbacks
# below receive only the id and use this mapping to find the engine object;
# OfflineEngine.start() inserts the entry.
_ENGINES: dict[str, OfflineEngine] = {}
def _refuse_inside_strategy(method: Any) -> Any:
    """Decorate an engine method so it cannot be called from strategy code.

    Before delegating, the wrapper checks ``_CURRENT_STRATEGY``; if a
    strategy is currently executing it raises
    :class:`EngineCallFromStrategyError` naming that strategy and the
    method. Coroutine functions get an async wrapper, everything else a
    plain one; both preserve the wrapped method's metadata.
    """

    def _ensure_outside_strategy() -> None:
        # Shared guard for both wrapper flavours.
        active = _CURRENT_STRATEGY.get()
        if active is not None:
            raise EngineCallFromStrategyError(
                strategy=active.name, method=method.__name__
            )

    if not inspect.iscoroutinefunction(method):

        @functools.wraps(method)
        def sync_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            _ensure_outside_strategy()
            return method(self, *args, **kwargs)

        return sync_wrapper

    @functools.wraps(method)
    async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
        _ensure_outside_strategy()
        return await method(self, *args, **kwargs)

    return async_wrapper
async def _runner_entry(
    engine_id: str,
    strategy_name: str,
    run_id: str,
    event_topic: str,
    event_payload: str,
    max_retries_snapshot: int,
) -> None:
    """APS job target that executes one strategy run.

    Resolves the engine from ``_ENGINES`` by ``engine_id`` and forwards all
    remaining arguments to :meth:`OfflineEngine.dispatch_run`. If no engine
    is registered under that id, an error is logged and nothing runs.
    Takes only plain values so the job arguments stay pickle-safe.
    """
    engine = _ENGINES.get(engine_id)
    if engine is not None:
        await engine.dispatch_run(
            strategy_name=strategy_name,
            run_id=run_id,
            event_topic=event_topic,
            event_payload=event_payload,
            max_retries_snapshot=max_retries_snapshot,
        )
        return
    logger.error(
        "no_engine_for_runner",
        engine_id=engine_id,
        run_id=run_id,
    )
async def _cron_entry(engine_id: str, strategy_name: str) -> None:
    """APS job target for Cron triggers.

    Resolves the engine from ``_ENGINES`` and emits a ``CronTick`` for
    ``strategy_name`` through it. Logs an error and does nothing when no
    engine is registered under ``engine_id``.
    """
    engine = _ENGINES.get(engine_id)
    if engine is not None:
        tick = CronTick(strategy_name=strategy_name)
        await engine.emit(tick)
        return
    logger.error(
        "no_engine_for_cron",
        engine_id=engine_id,
        strategy_name=strategy_name,
    )
async def _idle_entry(engine_id: str, strategy_name: str) -> None:
    """APS job target for Idle interval scans.

    Resolves the engine from ``_ENGINES`` and calls
    :meth:`OfflineEngine.run_idle_scan` for ``strategy_name``. Logs an
    error and does nothing when no engine is registered under
    ``engine_id``.
    """
    engine = _ENGINES.get(engine_id)
    if engine is not None:
        await engine.run_idle_scan(strategy_name)
        return
    logger.error(
        "no_engine_for_idle",
        engine_id=engine_id,
        strategy_name=strategy_name,
    )
class OfflineEngine:
"""Offline Memory Engine — orchestrates strategy registration, scheduling,
and event dispatch.
Lifecycle::
engine = OfflineEngine(config=cfg)
engine.register(my_strategy) # before start()
engine.on_dead_letter(cb) # before start()
await engine.start() # acquires file lock, boots scheduler
await engine.emit(SomeEvent(...)) # fan out through dispatcher
await engine.stop() # graceful shutdown
Single-process invariant: a file lock on
``<jobstore_path>.lock`` guarantees at most one engine per jobstore
at any time (cross-process safe via ``portalocker``).
"""
def __init__(
    self,
    *,
    config: OMEConfig,
) -> None:
    """Capture the configuration and set up pre-start state.

    Creates the strategy registry and the storage handle for
    ``config.jobstore_path`` and generates this engine's id. Stores,
    dispatcher, runner, semaphore, scheduler and config reloader are
    left as ``None`` here and assigned during :meth:`start`.

    Args:
        config: Engine-level settings.
    """
    self._config = config
    self._registry = StrategyRegistry()
    self._storage = OMEStorage(db_path=config.jobstore_path)
    self._lock_handle: Any = None
    self._started = False
    self._on_dead_letter: Callable[[RunRecord], None] | None = None
    # late-bound (set in start())
    self._counter_store: CounterStore | None = None
    self._run_record_store: RunRecordStore | None = None
    self._dispatcher: EventDispatcher | None = None
    self._runner: Runner | None = None
    self._engine_sem: asyncio.Semaphore | None = None
    self._idle_store: IdleStore | None = None
    # Key under which this engine is stored in the module-level _ENGINES map.
    self._engine_id = uuid4().hex
    self._scheduler: AsyncIOScheduler | None = None
    self._config_reloader: ConfigReloader | None = None
    # In-flight strategy-run accounting. Incremented at the moment a
    # run is enqueued onto APS (so callers that emit-then-wait observe
    # a non-zero count immediately), decremented in dispatch_run's
    # finally. APS 3.x AsyncIOExecutor.shutdown(wait=True) does NOT
    # honor wait for async coroutines (see apscheduler/executors/
    # asyncio.py:24); this counter is how stop() / drain() learn the
    # engine is genuinely idle.
    self._active_runs = 0
    # Created in start(); presumably signals "no active runs" — confirm
    # against dispatch_run / stop().
    self._idle_event: asyncio.Event | None = None
def register(self, func: Callable[..., Any]) -> None:
    """Add a function decorated with :func:`offline_strategy` to the registry.

    Only allowed before :meth:`start`.

    Raises:
        OMEError: The engine has already been started.
    """
    if not self._started:
        self._registry.register(func)
        return
    raise OMEError("register: cannot register after start()")
@_refuse_inside_strategy
def reschedule_cron_job(self, name: str, expr: str) -> None:
    """Point the ``cron::<name>`` scheduler job at a new crontab expression.

    Args:
        name: Strategy name; the job id is ``f"cron::{name}"``.
        expr: Crontab string parsed with ``CronTrigger.from_crontab``.

    Raises:
        OMEError: The engine has no scheduler yet (not started).

    Errors raised while parsing ``expr`` or by the scheduler propagate to
    the caller.
    """
    scheduler = self._scheduler
    if scheduler is None:
        raise OMEError("reschedule_cron_job: engine not started")
    new_trigger = CronTrigger.from_crontab(expr)
    scheduler.reschedule_job(job_id=f"cron::{name}", trigger=new_trigger)
@_refuse_inside_strategy
def reschedule_idle_job(self, name: str, scan_interval_seconds: int) -> None:
    """Change the scan interval of an Idle strategy's APScheduler job.

    Args:
        name: Strategy name; the job addressed is ``idle::<name>``.
        scan_interval_seconds: New number of seconds between scans.

    Raises:
        OMEError: If the engine has no scheduler yet (not started).
    """
    scheduler = self._scheduler
    if scheduler is None:
        raise OMEError("reschedule_idle_job: engine not started")
    new_trigger = IntervalTrigger(seconds=scan_interval_seconds)
    scheduler.reschedule_job(job_id=f"idle::{name}", trigger=new_trigger)
def on_dead_letter(self, callback: Callable[[RunRecord], None]) -> None:
    """Install the callback invoked after a run is marked DEAD_LETTER.

    Has to happen before :meth:`start`. Once the engine is running the
    Runner already holds a snapshot of the callback, so a late call is
    dropped (and logged at WARNING) rather than raced against it.
    Repeated calls before start overwrite each other: the last callback
    wins, nothing is chained.

    Args:
        callback: Callable receiving the dead-lettered ``RunRecord``.
    """
    if not self._started:
        self._on_dead_letter = callback
        return
    logger.warning("on_dead_letter_after_start_ignored")
async def start(self) -> None:
    """Bring the engine up.

    Steps, in order: initialise storage, take the jobstore file lock,
    validate the strategy DAG, build the late-bound components, create
    the idle flag, launch APScheduler, publish the engine in
    :data:`_ENGINES`, run crash recovery, register Cron / Idle jobs and
    optionally start the config reloader.

    Idempotent: calling it again while running does nothing. If any step
    after the lock acquisition raises, :meth:`_rollback_partial_start`
    undoes whatever was already wired up (lock, scheduler,
    :data:`_ENGINES` slot, config reloader) before the error is
    re-raised, so a retry starts from a clean state.
    """
    if self._started:
        return
    await self._storage.init()
    self._acquire_lock()
    try:
        self._registry.validate()
        self._init_components()
        # Nothing is in flight on a fresh engine, so the idle flag starts set.
        idle_flag = asyncio.Event()
        idle_flag.set()
        self._idle_event = idle_flag
        self._launch_scheduler()
        # Published before recovery so re-enqueued jobs can look the engine up.
        _ENGINES[self._engine_id] = self
        await self._run_crash_recovery()
        self._register_scheduled_jobs()
        self._start_config_reloader()
    except Exception:
        await self._rollback_partial_start()
        raise
    else:
        self._started = True
def _init_components(self) -> None:
    """Build the stores, dispatcher, semaphore and runner.

    Invoked only by :meth:`start`, after the file lock is held and DAG
    validation has passed.
    """
    cfg = self._config
    storage = self._storage

    counters = CounterStore(storage=storage)
    self._counter_store = counters

    run_records = RunRecordStore(
        storage=storage,
        max_records_per_strategy=cfg.max_records_per_strategy,
    )
    self._run_record_store = run_records

    self._dispatcher = EventDispatcher(
        registry=self._registry,
        counter_store=counters,
    )

    # Engine-wide cap on concurrently executing runs.
    run_slots = asyncio.Semaphore(cfg.max_concurrent_runs)
    self._engine_sem = run_slots

    # The runner captures the dead-letter callback as it is right now and
    # emits through the unguarded internal dispatch path.
    self._runner = Runner(
        run_record_store=run_records,
        engine_sem=run_slots,
        emit_hook=self._dispatch_event,
        on_dead_letter=self._on_dead_letter,
    )
    self._idle_store = IdleStore(storage=storage)
def _launch_scheduler(self) -> None:
    """Create the AsyncIOScheduler backed by SQLAlchemyJobStore and start it.

    The APS jobstore gets its own SQLite file (``aps_jobstore_path``) so
    that APS's sync SQLAlchemy writes never compete with OME's async
    aiosqlite writes for one file lock. Sharing a single ``ome.db``
    previously surfaced as flaky ``SQLITE_BUSY: database is locked``
    errors during concurrent strategy dispatch.
    """
    jobstore_url = f"sqlite:///{self._config.aps_jobstore_path}"
    scheduler = AsyncIOScheduler(
        jobstores={"default": SQLAlchemyJobStore(url=jobstore_url)},
        executors={"default": AsyncIOExecutor()},
    )
    # Stored before start() so a failed start can still be cleaned up.
    self._scheduler = scheduler
    scheduler.start()
async def _run_crash_recovery(self) -> None:
    """Hand stale RUNNING rows in ``run_record`` to :func:`scan_and_resume`.

    Rows whose ``started_at`` is older than
    ``crash_recovery_timeout_seconds`` are treated as leftovers of a
    crashed engine session: they are marked CRASHED and re-added to APS
    under a fresh ``run_id``, reusing the original event payload. The
    re-add goes through :meth:`_enqueue_recovery_job`.
    """
    stale_after = self._config.crash_recovery_timeout_seconds
    await scan_and_resume(
        run_record_store=self._run_record_store,
        timeout_seconds=stale_after,
        add_job=self._enqueue_recovery_job,
    )
async def _enqueue_recovery_job(
    self,
    name: str,
    run_id: str,
    event_topic: str,
    event_payload: str,
    max_retries: int,
) -> None:
    """Schedule one re-enqueued crashed run (callback for :func:`scan_and_resume`).

    Uses the same enqueue-time bookkeeping as :meth:`_enqueue_run`: the
    job ends up in :meth:`dispatch_run` like any other run, so the
    in-flight counter is bumped before ``add_job`` and rolled back if
    ``add_job`` raises.

    Args:
        name: Strategy name of the run being resumed.
        run_id: Run id for the new attempt; also used as the APS job id.
        event_topic: Persisted event topic string.
        event_payload: Persisted JSON event payload.
        max_retries: Retry budget passed through to the runner entry.
    """
    job_args = [
        self._engine_id,
        name,
        run_id,
        event_topic,
        event_payload,
        max_retries,
    ]
    self._on_run_enqueued()
    try:
        self._scheduler.add_job(
            _runner_entry,
            trigger="date",
            run_date=get_utc_now(),
            args=job_args,
            id=run_id,
            replace_existing=False,
            misfire_grace_time=None,  # type: ignore[arg-type]  # APS accepts None ("no expiry"); stub omits it (apscheduler/job.py:213)
        )
    except Exception:
        # The job never reached APS, so dispatch_run will not release the +1.
        self._on_run_completed()
        raise
def _register_scheduled_jobs(self) -> None:
    """Create the recurring APS job for every Cron / Idle strategy.

    Strategies with any other trigger get nothing scheduled here; they
    fire only when their declared event class is dispatched.
    """
    for meta in self._registry.all():
        trig = meta.trigger
        if isinstance(trig, Cron):
            entry = _cron_entry
            aps_trigger = CronTrigger.from_crontab(trig.expr)
            job_id = f"cron::{meta.name}"
        elif isinstance(trig, Idle):
            entry = _idle_entry
            aps_trigger = IntervalTrigger(seconds=trig.scan_interval_seconds)
            job_id = f"idle::{meta.name}"
        else:
            continue
        # replace_existing=True overwrites a job left under the same id.
        self._scheduler.add_job(
            entry,
            trigger=aps_trigger,
            args=[self._engine_id, meta.name],
            id=job_id,
            replace_existing=True,
        )
def _start_config_reloader(self) -> None:
    """Start a :class:`ConfigReloader` when config watching is configured.

    Does nothing unless ``config_watch`` is on and a ``config_path`` is
    provided.
    """
    cfg = self._config
    if not cfg.config_watch or cfg.config_path is None:
        return
    reloader = ConfigReloader(
        config_path=cfg.config_path,
        registry=self._registry,
        engine=self,
        debounce_ms=cfg.config_watch_debounce_ms,
    )
    self._config_reloader = reloader
    reloader.start()
async def _rollback_partial_start(self) -> None:
    """Undo, in reverse order, whatever :meth:`start` wired up before failing.

    Steps: stop the config reloader, drain in-flight runs (best-effort,
    short timeout, because a startup failure should not block on recovery
    jobs), shut the scheduler down, drop the ``_ENGINES`` slot, release
    the file lock and reset the in-flight accounting.

    Same ``wait_idle → shutdown(wait=False)`` order as :meth:`stop`, for
    the same reason: pausing first would freeze recovery jobs that
    already own a +1 in ``_active_runs``.

    Every step runs even if an earlier one raises. Without that, an error
    while stopping the reloader or shutting the scheduler down would skip
    the lock release and leave the ``_ENGINES`` slot populated, so a
    retried :meth:`start` could not begin from a clean state. The first
    error raised still propagates once cleanup has finished.
    """
    try:
        if self._config_reloader is not None:
            try:
                await self._config_reloader.stop()
            finally:
                self._config_reloader = None
    finally:
        try:
            if self._scheduler is not None:
                try:
                    await self.wait_idle(timeout=5.0)
                finally:
                    try:
                        # start() may have failed inside scheduler.start();
                        # APS rejects shutdown() on a scheduler that never
                        # reached the running state
                        # (SchedulerNotRunningError), so only shut down a
                        # running one.
                        if self._scheduler.running:
                            self._scheduler.shutdown(wait=False)
                    finally:
                        self._scheduler = None
        finally:
            _ENGINES.pop(self._engine_id, None)
            try:
                self._release_lock()
            finally:
                self._idle_event = None
                self._active_runs = 0
async def wait_idle(self, *, timeout: float = 30.0) -> bool: # noqa: ASYNC109
"""Block until every in-flight strategy run has settled.
Returns ``True`` on idle, ``False`` if ``timeout`` elapses with
runs still active. "In flight" means anywhere between
:meth:`_enqueue_run` (which bumps the counter just before the
``add_job`` call) and the end of :meth:`dispatch_run` (which
releases it in ``finally``).
Why this exists: APS 3.x ``AsyncIOExecutor.shutdown(wait=True)``
documents — in the executor source — that it cannot honor wait
for async coroutines and simply cancels their futures
(``apscheduler/executors/asyncio.py:24``). Anything depending on
"all jobs really completed" has to drain through this counter,
not the scheduler.
"""
if self._idle_event is None:
return self._active_runs == 0
try:
await asyncio.wait_for(self._idle_event.wait(), timeout=timeout)
return True
except TimeoutError:
return False
async def stop(self) -> None:
    """Shut the engine down gracefully.

    Stops the config reloader, drains in-flight strategy runs, shuts the
    scheduler down, drops the global ``_ENGINES`` slot and releases the
    jobstore lock.

    Idempotent: calling stop on an already-stopped engine is a no-op.

    Drain ordering matters and is *deliberately* not
    ``pause → wait_idle → shutdown``.

    - We cannot ``pause()`` first: APS ``pause()`` freezes jobstore
      dispatch including jobs already enqueued (see
      ``apscheduler/schedulers/base.py:pause``: "prevent the scheduler
      from waking up to do job processing"). Each such job already
      owns a +1 in ``_active_runs`` from :meth:`_enqueue_run`, so
      freezing dispatch deadlocks :meth:`wait_idle`.
    - We cannot use ``shutdown(wait=True)``: APS 3.x
      ``AsyncIOExecutor.shutdown`` documents in its own source that
      it cannot honor wait for async coroutines and cancels their
      futures (``apscheduler/executors/asyncio.py:24``). Cascade
      ``CancelledError`` / "Event loop is closed" warnings follow.

    Order used here: ``wait_idle`` first (lets APS finish dispatching
    everything in the jobstore and lets every dispatch_run release its
    counter), then ``shutdown(wait=False)``.

    ``_ENGINES`` is popped only after the drain so ``_runner_entry``
    can still find this engine via its id while finishing the last
    few jobs.

    Teardown is exception-safe: if stopping the reloader, draining, or
    shutting the scheduler down raises (including cancellation during
    the 30 s drain), the remaining steps still run, so the jobstore lock
    is released and the engine is marked stopped. The first error raised
    propagates after cleanup.
    """
    if not self._started:
        return
    try:
        if self._config_reloader is not None:
            try:
                await self._config_reloader.stop()
            finally:
                self._config_reloader = None
    finally:
        try:
            if self._scheduler is not None:
                try:
                    drained = await self.wait_idle(timeout=30.0)
                    if not drained:
                        logger.warning(
                            "ome_stop_drain_timeout",
                            engine_id=self._engine_id,
                            active_runs=self._active_runs,
                        )
                finally:
                    try:
                        self._scheduler.shutdown(wait=False)
                    finally:
                        self._scheduler = None
        finally:
            _ENGINES.pop(self._engine_id, None)
            try:
                self._release_lock()
            finally:
                self._started = False
                self._idle_event = None
                self._active_runs = 0
def _acquire_lock(self) -> None:
    """Take the exclusive, non-blocking file lock guarding the jobstore.

    The lock file is ``<jobstore_path>.lock``; its parent directory is
    created if missing. On success the open handle is kept in
    ``self._lock_handle`` until :meth:`_release_lock`.

    Raises:
        EngineLockHeldError: If the lock cannot be taken because another
            holder already has it.
    """
    lock_path = Path(str(self._config.jobstore_path) + ".lock")
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    # Kept open for the engine's lifetime, hence no ``with`` block.
    handle = open(lock_path, "a+")  # noqa: SIM115
    try:
        portalocker.lock(handle, portalocker.LOCK_EX | portalocker.LOCK_NB)
    except portalocker.LockException as e:
        # Close the handle we just opened; otherwise every failed attempt
        # leaks a file descriptor.
        handle.close()
        raise EngineLockHeldError(
            f"another OfflineEngine instance already holds {lock_path}"
        ) from e
    except BaseException:
        handle.close()
        raise
    self._lock_handle = handle
def _release_lock(self) -> None:
    """Unlock and close the jobstore lock file, if one is held.

    A no-op when no handle is stored. The handle is closed and forgotten
    even when the unlock call itself raises.
    """
    handle = self._lock_handle
    if handle is None:
        return
    try:
        portalocker.unlock(handle)
    finally:
        handle.close()
        self._lock_handle = None
@_refuse_inside_strategy
async def emit(self, event: BaseEvent) -> None:
    """Dispatch ``event`` on behalf of an external caller.

    This is the public entry point. Strategy code must use ``ctx.emit``
    instead, since only ``ctx.emit`` enforces the strategy's declared
    ``emits=[...]`` contract. The :func:`_refuse_inside_strategy` guard
    raises :class:`EngineCallFromStrategyError` when this method is
    called from inside a strategy.

    Args:
        event: The event to route through dispatch.
    """
    await self._dispatch_event(event)
async def _dispatch_event(self, event: BaseEvent) -> None:
    """Run one event through dispatch (unguarded internal path).

    Runner's ``emit_hook`` points here so that ``ctx.emit`` reaches
    dispatch without tripping the guard on the public :meth:`emit`.

    Args:
        event: The event to route.

    Raises:
        OMEError: If the engine has not been started.
    """
    if not self._started:
        raise OMEError("emit: engine not started")

    # Record activity for every Idle strategy listening on this exact
    # event type. Best-effort: a failed touch is logged and never blocks
    # dispatch.
    event_type = type(event)
    for meta in self._registry.all():
        trig = meta.trigger
        if not isinstance(trig, Idle) or event_type not in trig.on:
            continue
        bucket = getattr(event, trig.event_field, None)
        if bucket is None:
            continue
        try:
            await self._idle_store.touch(  # type: ignore[union-attr]
                meta.name,
                str(bucket),
                at=get_utc_now(),
            )
        except Exception as e:
            logger.warning(
                "idle_touch_failed",
                strategy_name=meta.name,
                event_field=trig.event_field,
                error=str(e),
            )

    # Each route is a (strategy meta, run_id) pair that should run.
    for meta, run_id in await self._dispatcher.dispatch(event):
        self._enqueue_run(meta, event, run_id)
@_refuse_inside_strategy
async def trigger_manual(
    self,
    name: str,
    *,
    event: BaseEvent | None = None,
    force: bool = False,
) -> None:
    """Fire a single strategy by hand.

    Goes through :meth:`EventDispatcher.dispatch` with
    ``strategy_filter=name``, so the same three-gate logic applies as for
    engine-driven dispatch.

    Args:
        name: Strategy to trigger.
        event: Event to deliver. When ``None`` the engine self-emits
            ``ManualTick(strategy_name=name)``.
        force: When ``True`` the ``enabled`` gate is bypassed
            (``applies_to`` and ``Counter`` still apply).

    Raises:
        OMEError: If the engine has not been started.
    """
    if not self._started:
        raise OMEError("trigger_manual: engine not started")
    tick = ManualTick(strategy_name=name) if event is None else event
    routes = await self._dispatcher.dispatch(
        tick,
        force_enabled=force,
        strategy_filter=name,
    )
    for meta, run_id in routes:
        self._enqueue_run(meta, tick, run_id)
def _enqueue_run(self, meta: StrategyMeta, event: BaseEvent, run_id: str) -> None:
    """Schedule a one-shot APScheduler job that hands ``event`` to the Runner.

    This is the dispatch tail shared by :meth:`_dispatch_event` and
    :meth:`trigger_manual`. The retry budget is snapshotted from the
    strategy meta, falling back to the engine default, and packed with
    the serialized event into a pickle-safe argument list.

    ``self._active_runs`` is bumped *before* ``add_job`` so that a caller
    who emits and then immediately calls ``wait_idle`` sees a non-zero
    count. The matching decrement lives in :meth:`dispatch_run`, which
    runs for every job APS dispatches. If ``add_job`` itself raises, the
    counter is rolled back here.

    Args:
        meta: Strategy the run belongs to.
        event: Event to deliver; persisted as JSON.
        run_id: Id of the run; also used as the APS job id.
    """
    retry_budget = meta.max_retries
    if retry_budget is None:
        retry_budget = self._config.max_retries
    topic = type(event).topic()

    self._on_run_enqueued()
    try:
        self._scheduler.add_job(
            _runner_entry,
            trigger="date",
            run_date=get_utc_now(),
            args=[
                self._engine_id,
                meta.name,
                run_id,
                topic,
                event.model_dump_json(),
                retry_budget,
            ],
            id=run_id,
            replace_existing=False,
            misfire_grace_time=None,  # type: ignore[arg-type]  # APS accepts None ("no expiry"); stub omits it (apscheduler/job.py:213)
        )
    except Exception:
        self._on_run_completed()
        raise
def _on_run_enqueued(self) -> None:
    """Count one more in-flight run and clear the idle flag, if present."""
    self._active_runs += 1
    idle_flag = self._idle_event
    if idle_flag is not None:
        idle_flag.clear()
def _on_run_completed(self) -> None:
    """Release one in-flight slot and raise the idle flag when none remain.

    The counter is clamped at zero. An attempt to go below it is logged
    as ``active_runs_underflow`` instead of being applied, because a
    negative count would hide a bookkeeping bug and an idle flag stuck in
    the cleared state would deadlock ``wait_idle``.
    """
    if self._active_runs > 0:
        self._active_runs -= 1
        if self._active_runs != 0:
            # Other runs are still in flight; the flag stays cleared.
            return
    else:
        logger.error(
            "active_runs_underflow",
            engine_id=self._engine_id,
        )
        self._active_runs = 0
    if self._idle_event is not None:
        self._idle_event.set()
async def dispatch_run(
    self,
    *,
    strategy_name: str,
    run_id: str,
    event_topic: str,
    event_payload: str,
    max_retries_snapshot: int,
) -> None:
    """Execute one strategy run; the target of the APS jobstore callback.

    Public only because the module-level :func:`_runner_entry` callback
    has to cross the pickle boundary, and a bound method on ``self``
    cannot be pickled into the APS jobstore. It is not part of the
    strategy-author API and is meant to be called only by
    ``_runner_entry`` (and crash recovery).

    Deliberately not wrapped in ``_refuse_inside_strategy``: APS
    executors may inherit the calling task's ContextVar, so a strategy
    that ``ctx.emit``s and triggers a cascade would falsely trip the
    guard here.

    The +1 opened by the enqueue path is released in ``finally`` so that
    cancellation, retries and crashes all give the count back.

    Args:
        strategy_name: Registered name of the strategy to run.
        run_id: Id of this run.
        event_topic: ``"<module>:<class>"`` topic used to re-import the
            event class.
        event_payload: JSON payload validated into that event class.
        max_retries_snapshot: Retry budget captured at enqueue time.
    """
    try:
        event = resolve_topic(event_topic).model_validate_json(event_payload)
        strategy_meta = self._registry.get(strategy_name)
        await self._runner.run(
            strategy_meta,
            event,
            run_id=run_id,
            max_retries_snapshot=max_retries_snapshot,
        )
    finally:
        self._on_run_completed()
async def run_idle_scan(self, strategy_name: str) -> None:
    """Run one idle scan for an Idle strategy (APS IntervalTrigger target).

    Builds an :class:`IdleScanner` over the engine's idle store and runs
    a single scan, emitting :class:`IdleTick` for each overdue bucket.

    Public for the same APS-pickle reason as :meth:`dispatch_run`, and
    unguarded for the same ContextVar-inheritance reason: the APS task
    running this scan may have inherited a strategy's context. For that
    reason the scanner is given the internal :meth:`_dispatch_event`
    rather than the public, guarded :meth:`emit`. Passing ``emit`` would
    reintroduce the guard one call deeper and could raise
    :class:`EngineCallFromStrategyError` for an engine-originated tick.

    Args:
        strategy_name: Registered name of the strategy to scan for. If
            its trigger is not ``Idle`` the mismatch is logged and the
            scan is skipped.
    """
    meta = self._registry.get(strategy_name)
    if not isinstance(meta.trigger, Idle):
        logger.error(
            "idle_entry_bad_trigger_type",
            strategy_name=strategy_name,
            trigger_type=type(meta.trigger).__name__,
        )
        return
    scanner = IdleScanner(
        strategy_name=strategy_name,
        trigger=meta.trigger,
        idle_store=self._idle_store,  # type: ignore[arg-type]
        # Same unguarded path the Runner's emit_hook uses.
        emit=self._dispatch_event,
    )
    await scanner.scan_once()
@_refuse_inside_strategy
async def inspect_dispatch(self, event: BaseEvent) -> list[StrategyRouteInfo]:
    """Report how ``event`` would be routed, one entry per strategy.

    Read-only: the dispatcher is queried in inspect mode, which does not
    mutate counters.

    Args:
        event: Event to evaluate.

    Returns:
        The dispatcher's per-strategy routing info.

    Raises:
        OMEError: If the engine has not been started.
    """
    if self._started:
        return await self._dispatcher.inspect(event)
    raise OMEError("inspect_dispatch: engine not started")
@_refuse_inside_strategy
async def list_runs(
    self,
    strategy_name: str,
    *,
    status: RunStatus | None = None,
    limit: int = 100,
) -> list[RunRecord]:
    """Fetch run records of ``strategy_name``, optionally narrowed by status.

    Args:
        strategy_name: Strategy whose runs to fetch.
        status: Status filter (e.g., ``RunStatus.SUCCESS``); ``None``
            returns runs in any state.
        limit: Maximum number of records to return; results are ordered
            ``started_at DESC``.

    Returns:
        Up to ``limit`` ``RunRecord`` instances, newest first.

    Raises:
        OMEError: Engine has not been started.
    """
    if self._started:
        return await self._run_record_store.list_runs(
            strategy_name=strategy_name,
            status=status,
            limit=limit,
        )
    raise OMEError("list_runs: engine not started")
@_refuse_inside_strategy
async def get_run_status(self, run_id: str) -> RunRecord | None:
    """Look up one run record by its ``run_id``.

    Args:
        run_id: The 32-character ``uuid4().hex`` assigned at dispatch.

    Returns:
        The matching ``RunRecord``, or ``None`` if no row exists for that id.

    Raises:
        OMEError: Engine has not been started.
    """
    if self._started:
        return await self._run_record_store.get(run_id)
    raise OMEError("get_run_status: engine not started")

View File

@ -0,0 +1,78 @@
"""OME event base class + built-in tick events.
All business events should subclass BaseEvent. OME emits three built-in
ticks for engine-driven triggers (Cron / Idle / Manual).
"""
from __future__ import annotations
import importlib
from datetime import datetime
from functools import cache
from typing import Any
from uuid import uuid4
from pydantic import BaseModel, ConfigDict, Field
from everos.component.utils.datetime import get_utc_now
class BaseEvent(BaseModel):
    """Base for all events flowing through OME.

    Subclasses must be Pydantic v2 models (immutable) so `model_dump_json` /
    `model_validate_json` work for crash-recovery payload persistence.
    """

    # frozen=True makes instances immutable; extra="forbid" rejects fields
    # the model does not declare.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Per-instance identifier; defaults to a fresh uuid4 hex string.
    event_id: str = Field(default_factory=lambda: uuid4().hex)
    # Timestamp; defaults to get_utc_now() evaluated at construction.
    ts: datetime = Field(default_factory=get_utc_now)

    @classmethod
    def topic(cls) -> str:
        """Stable cross-process identifier of this event class.

        Returns ``"<module>:<class>"`` (colon-separated, mirroring the
        Python event-sourcing community convention). Used by OME to
        persist event identity into RunRecord.event_topic and to re-import
        the class during crash recovery via ``resolve_topic``.
        """
        # Built from the defining module and class name, so moving or
        # renaming the class changes the topic string.
        return f"{cls.__module__}:{cls.__name__}"
@cache
def resolve_topic(topic: str) -> type[BaseEvent]:
    """Import and return the event class named by ``topic``.

    Inverse of ``BaseEvent.topic()``. Results are cached because crash
    recovery may resolve the same topic many times in a tight loop, and
    ``importlib.import_module`` is non-trivial.

    Args:
        topic: ``"<module>:<class>"`` string.

    Returns:
        The ``BaseEvent`` subclass the topic points to.

    Raises:
        ValueError: If ``topic`` has no ``:`` separator or no class part.
        TypeError: If the named attribute is missing or is not a
            ``BaseEvent`` subclass.
    """
    module_name, sep, cls_name = topic.partition(":")
    if not (sep and cls_name):
        raise ValueError(f"invalid event topic: {topic!r}")
    candidate = getattr(importlib.import_module(module_name), cls_name, None)
    if isinstance(candidate, type) and issubclass(candidate, BaseEvent):
        return candidate
    raise TypeError(f"topic {topic!r} did not resolve to a BaseEvent subclass")
class CronTick(BaseEvent):
    """Engine-emitted event for a strategy with `trigger=Cron(...)`."""

    # Presumably the name of the Cron strategy this tick is for; the
    # emitting code is outside this module. TODO confirm against the cron
    # job entry point.
    strategy_name: str
class IdleTick(BaseEvent):
    """Engine-emitted event for a strategy with `trigger=Idle(...)`."""

    # NOTE(review): field meanings below are inferred from their names; the
    # code that builds IdleTick is outside this module. Confirm there.
    # Presumably the Idle strategy the tick is for.
    strategy_name: str
    # Presumably the bucket (value of the trigger's event_field) found idle.
    bucket_key: str
    # Presumably the idle duration, in whole seconds, for that bucket.
    idle_seconds: int
class ManualTick(BaseEvent):
    """Engine-emitted event for `engine.trigger_manual(name, event=None)`."""

    # Name of the strategy being triggered by hand.
    strategy_name: str

View File

@ -0,0 +1,61 @@
"""OME exception hierarchy."""
from __future__ import annotations
from everos.infra.ome.events import BaseEvent
class OMEError(Exception):
    """Root of the OME exception hierarchy; every OME-internal error derives from it."""
class StartupValidationError(OMEError):
    """Any validation failure detected at startup; raised by ``engine.start()``."""
class EngineLockHeldError(OMEError):
    """The jobstore lock is already held by another OfflineEngine instance."""
class StrategyContractError(OMEError):
    """Common base for contract violations committed by strategy code.

    A subclass signals a programming bug in the strategy that no retry
    can fix (wrong API usage, undeclared emit). Runner therefore
    short-circuits the attempt loop on these and dead-letters
    immediately, because spending the retry budget would only delay the
    inevitable and spam logs. External callers can ``except
    StrategyContractError`` to handle the whole category at once.
    """
class EngineCallFromStrategyError(StrategyContractError):
    """Strategy code invoked a public OfflineEngine method directly.

    By convention a strategy talks to the engine only through the
    ``(event, ctx)`` parameters Runner supplies. The engine's public
    methods (``emit``, ``trigger_manual``, ``inspect_dispatch``,
    ``list_runs``, ``get_run_status``, ``reschedule_*``) exist for
    external callers; a strategy calling them bypasses the framework's
    contracts.

    Attributes:
        strategy: Name of the offending strategy.
        method: Name of the engine method that was called.
    """

    def __init__(self, strategy: str, method: str) -> None:
        """Store the offender details and build the error message."""
        self.strategy = strategy
        self.method = method
        message = (
            f"strategy {strategy!r} called engine.{method}() directly; "
            "strategies must interact with the engine only via the "
            "(event, ctx) parameters"
        )
        super().__init__(message)
class EmitNotDeclaredError(StrategyContractError):
    """A strategy emitted an event type missing from its decorator's ``emits``.

    Attributes:
        strategy: Name of the offending strategy.
        event: The event instance that was emitted.
    """

    def __init__(self, strategy: str, event: BaseEvent) -> None:
        """Store the offender details and build the error message."""
        self.strategy = strategy
        self.event = event
        message = (
            f"strategy {strategy!r} emitted {type(event).__name__!r} "
            "which is not in its declared emits"
        )
        super().__init__(message)

View File

@ -0,0 +1,52 @@
"""OME gate types — declarative configuration only.
Counter is the only built-in gate. The actual N-counting lives in
_stores/counter.py keyed by (strategy_name, bucket_key).
"""
from __future__ import annotations
from typing import Annotated
from pydantic import BaseModel, ConfigDict, Field
class Counter(BaseModel):
    """Counter gate: batch trigger by accumulated event count per bucket.

    Each event increments the bucket counter; the `threshold`-th event
    passes and resets.
    """

    # Declarative configuration only: frozen (immutable) and strict about
    # unknown fields. No counting happens in this class.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required; must be a positive integer (gt=0).
    threshold: Annotated[
        int,
        Field(
            gt=0,
            description=(
                "Pass once every `threshold` events; threshold=1 lets every event pass."
            ),
        ),
    ]
    # Optional; non-negative, defaults to 0.
    cooldown_seconds: Annotated[
        int,
        Field(
            ge=0,
            description=(
                "Minimum seconds between consecutive passes per bucket; 0 disables."
            ),
        ),
    ] = 0
    # Optional; name of the event attribute used to pick the bucket.
    event_field: Annotated[
        str | None,
        Field(
            description=(
                'Bucket dimension on the event (e.g. "user_id"); '
                "None means a single global bucket."
            ),
        ),
    ] = None
# Single-member alias today; becomes a union as more gate types land.
# Code that annotates against ``Gate`` rather than ``Counter`` will not need
# to change when that happens.
Gate = Counter

View File

@ -0,0 +1,99 @@
"""RunRecord / RunStatus / StrategyRouteInfo / CounterProgress — pure data classes.
Persistence in _stores/run_record.py.
"""
from __future__ import annotations
from enum import StrEnum
from typing import Annotated, NamedTuple, Self
from pydantic import (
AwareDatetime,
BaseModel,
ConfigDict,
Field,
computed_field,
model_validator,
)
class RunStatus(StrEnum):
    """Terminal-or-running state of a single strategy run."""

    # The only state in which a RunRecord may lack ``finished_at``.
    RUNNING = "running"
    # Finished without an error message.
    SUCCESS = "success"
    # The three states below all require ``finished_at`` and ``error`` on
    # the RunRecord.
    # NOTE(review): how FAILED differs from DEAD_LETTER (presumably
    # "attempt failed" vs "retry budget exhausted / contract violation")
    # is decided by the Runner, which is not in this module. Confirm there.
    FAILED = "failed"
    DEAD_LETTER = "dead_letter"
    # Presumably set by crash recovery on stale RUNNING rows; confirm.
    CRASHED = "crashed"
class RunRecord(BaseModel):
    """One row of the run_record table."""

    model_config = ConfigDict(frozen=True, extra="forbid")

    run_id: Annotated[str, Field(min_length=1)]
    strategy_name: Annotated[str, Field(min_length=1)]
    status: RunStatus
    attempt: Annotated[int, Field(ge=0)]
    # Timestamps must be timezone-aware (AwareDatetime).
    started_at: AwareDatetime
    finished_at: AwareDatetime | None = None
    # When present, the error text must be non-empty.
    error: Annotated[str, Field(min_length=1)] | None = None
    event_topic: Annotated[
        str,
        Field(
            min_length=1,
            description="Stable cross-process event identifier in "
            "``<module>:<class>`` form (see ``BaseEvent.topic()``).",
        ),
    ]
    event_payload: Annotated[
        str,
        Field(
            min_length=1,
            description="JSON-encoded event (``BaseEvent.model_dump_json`` output).",
        ),
    ]
    max_retries_snapshot: Annotated[int, Field(ge=0)]

    @model_validator(mode="after")
    def _check_status_invariants(self) -> Self:
        # Cross-field rules:
        #   RUNNING     -> finished_at is None and error is None
        #   SUCCESS     -> finished_at set, error is None
        #   any other   -> finished_at set, error set
        status = self.status
        if status == RunStatus.RUNNING:
            if self.finished_at is not None:
                raise ValueError("RunRecord: RUNNING must have finished_at=None")
            if self.error is not None:
                raise ValueError("RunRecord: RUNNING must have error=None")
            return self

        if self.finished_at is None:
            raise ValueError(f"RunRecord: {self.status} must have finished_at set")
        if status == RunStatus.SUCCESS:
            if self.error is not None:
                raise ValueError("RunRecord: SUCCESS must have error=None")
        elif self.error is None:
            raise ValueError(f"RunRecord: {self.status} must have error set")
        return self
class CounterProgress(NamedTuple):
    """Per-bucket counter progress at inspect_dispatch time."""

    # Presumably the number of events counted so far for the bucket; the
    # producing code is in the dispatcher. TODO confirm.
    current: int
    # Presumably the Counter gate's configured threshold; confirm.
    threshold: int
class StrategyRouteInfo(BaseModel):
    """Per-strategy dispatch decision — returned by inspect_dispatch."""

    model_config = ConfigDict(frozen=True, extra="forbid")

    strategy_name: Annotated[str, Field(min_length=1)]
    # One flag per gate; the strategy would run only if all three pass.
    enabled_pass: bool
    applies_to_pass: bool
    counter_pass: bool
    counter_progress: CounterProgress | None = None

    @computed_field  # type: ignore[prop-decorator]
    @property
    def will_run(self) -> bool:
        # Conjunction of the three gate flags.
        gate_results = (self.enabled_pass, self.applies_to_pass, self.counter_pass)
        return all(gate_results)

View File

@ -0,0 +1,9 @@
"""OME testing helpers.
Fake strategy context and test harness for unit testing strategies.
"""
from everos.infra.ome.testing.fakes import FakeStrategyContext as FakeStrategyContext
from everos.infra.ome.testing.harness import StrategyTestHarness as StrategyTestHarness
__all__ = ["FakeStrategyContext", "StrategyTestHarness"]

View File

@ -0,0 +1,38 @@
"""In-memory test doubles for the OME StrategyContext Protocol.
Use FakeStrategyContext when you want to unit-test a strategy function
in isolation without spinning up a full OfflineEngine.
"""
from __future__ import annotations
from everos.core.observability.logging import get_logger
from everos.infra.ome.events import BaseEvent
class FakeStrategyContext:
    """In-memory stand-in for the StrategyContext Protocol.

    Instead of dispatching, ``emit()`` just records each event so a test
    can assert on what the strategy tried to emit.

    Attributes:
        run_id: Identifier reported for the run (default: "fake_run").
        logger: Logger obtained from ``get_logger("ome.fake_ctx")``.
        emitted: Events passed to ``emit()``, in call order.
    """

    def __init__(self, *, run_id: str = "fake_run") -> None:
        """Create a context with an empty ``emitted`` list.

        Args:
            run_id: Run identifier to expose, defaults to "fake_run".
        """
        self.run_id = run_id
        self.logger = get_logger("ome.fake_ctx")
        self.emitted: list[BaseEvent] = []

    async def emit(self, event: BaseEvent) -> None:
        """Record ``event`` at the end of ``emitted``.

        Args:
            event: The event the strategy emitted.
        """
        self.emitted.append(event)

View File

@ -0,0 +1,118 @@
"""StrategyTestHarness — full OfflineEngine on a tmp SQLite db.
Designed for end-to-end strategy tests: register, start, emit, drain
until terminal, inspect run records. Cleans up the tmp directory on exit.
"""
from __future__ import annotations
import shutil
from pathlib import Path
from tempfile import mkdtemp
from typing import Any
from everos.infra.ome.config import OMEConfig
from everos.infra.ome.engine import OfflineEngine
from everos.infra.ome.events import BaseEvent
from everos.infra.ome.records import RunRecord, RunStatus
class StrategyTestHarness:
    """Async context manager wrapping OfflineEngine on a tmp SQLite db.

    Provides a test-friendly interface to register strategies, emit events,
    and inspect run records. The temp directory is removed on exit.

    Example:
        async with StrategyTestHarness() as h:
            h.register(my_strategy_func)
            await h.start()
            await h.emit(MyEvent())
            await h.drain(timeout=5)
            runs = await h.list_runs("my_strategy")
            assert len(runs) == 1
    """

    def __init__(self) -> None:
        """Create the temp directory and an engine whose jobstore lives in it.

        If building the config or the engine fails, the freshly created
        temp directory is removed before the error propagates.
        ``__aexit__`` never runs for an object whose constructor raised,
        so the directory would otherwise be leaked.
        """
        self._tmpdir = Path(mkdtemp(prefix="ome_test_"))
        try:
            cfg = OMEConfig(
                jobstore_path=self._tmpdir / "ome.db",
                config_watch=False,
                max_concurrent_runs=20,
                max_retries=1,
            )
            self._engine = OfflineEngine(config=cfg)
        except BaseException:
            shutil.rmtree(self._tmpdir, ignore_errors=True)
            raise

    async def __aenter__(self) -> StrategyTestHarness:
        """Enter the async context; the engine is NOT started here."""
        return self

    async def __aexit__(self, *exc: Any) -> None:
        """Stop the engine and delete the temp directory.

        The directory is removed even if stopping the engine raises.
        """
        try:
            await self._engine.stop()
        finally:
            shutil.rmtree(self._tmpdir, ignore_errors=True)

    def register(self, func: Any) -> None:
        """Register a strategy function.

        Args:
            func: A function decorated with @offline_strategy.
        """
        self._engine.register(func)

    async def start(self) -> None:
        """Start the OfflineEngine."""
        await self._engine.start()

    async def emit(self, event: BaseEvent) -> None:
        """Emit an event to the engine.

        Args:
            event: A BaseEvent subclass instance.
        """
        await self._engine.emit(event)

    async def drain(self, *, timeout: float = 30.0) -> None:  # noqa: ASYNC109
        """Wait until every enqueued strategy run has finished.

        Delegates to :meth:`OfflineEngine.wait_idle`, which tracks runs
        from the moment ``_enqueue_run`` bumps the counter (so a caller
        that ``emit``s then immediately ``drain``s does NOT see false-
        idle while APS is still launching the coroutine). Polling
        ``find_running`` alone — the previous implementation — missed
        that gap between ``add_job`` and ``mark_running`` and let tests
        race past in-flight jobs.

        Args:
            timeout: Maximum seconds to wait, defaults to 30.0.

        Raises:
            TimeoutError: if runs remain in flight after ``timeout`` seconds.
        """
        if not await self._engine.wait_idle(timeout=timeout):
            raise TimeoutError(
                f"drain: engine still has "
                f"{self._engine._active_runs} in-flight runs after {timeout}s"  # noqa: SLF001
            )

    async def list_runs(
        self,
        strategy_name: str,
        status: RunStatus | None = None,
    ) -> list[RunRecord]:
        """List run records for a strategy, optionally filtered by status.

        Reads the engine's run-record store directly, so the engine must
        have been started (the store is created in ``start()``).

        Args:
            strategy_name: The name of the strategy.
            status: Optional status filter (e.g. RunStatus.SUCCESS).

        Returns:
            A list of RunRecord objects.
        """
        return await self._engine._run_record_store.list_runs(  # noqa: SLF001
            strategy_name=strategy_name,
            status=status,
        )

View File

@ -0,0 +1,76 @@
"""OME trigger types — declarative descriptors of when a strategy fires.
Three concrete triggers: Immediate / Cron / Idle. Engine dispatches via
`isinstance(meta.trigger, ...)` to pick the registration path.
"""
from __future__ import annotations
from typing import Annotated, Self
from apscheduler.triggers.cron import CronTrigger
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from everos.infra.ome.events import BaseEvent
class _TriggerBase(BaseModel):
    # Shared pydantic settings for every trigger type: instances are
    # immutable (frozen) and undeclared fields are rejected (extra="forbid").
    model_config = ConfigDict(frozen=True, extra="forbid")
class Immediate(_TriggerBase):
    """Fire as soon as an event of any class in `on` is dispatched."""

    # Event classes that trigger the strategy; at least one is required.
    on: Annotated[list[type[BaseEvent]], Field(min_length=1)]
class Cron(_TriggerBase):
    """Fire on a cron schedule. Engine emits CronTick to the strategy."""

    # Crontab expression; must be non-empty and parseable by APScheduler.
    expr: Annotated[str, Field(min_length=1)]

    @field_validator("expr")
    @classmethod
    def _validate_crontab(cls, v: str) -> str:
        # Delegates to APS's own parser so the trigger object cannot
        # represent any crontab that APS would later refuse.
        # The parsed trigger is discarded; only a parse failure matters.
        CronTrigger.from_crontab(v)
        return v
class Idle(_TriggerBase):
    """Fire after every class in `on` has been silent (bucketed by
    `event_field`) for `idle_seconds` — AND across classes. Engine
    emits IdleTick.
    """

    on: Annotated[list[type[BaseEvent]], Field(min_length=1)]
    # Attribute read from each event to choose the bucket; must exist on
    # every class in `on` (checked below).
    event_field: str
    idle_seconds: Annotated[int, Field(gt=0)]
    scan_interval_seconds: Annotated[
        int,
        Field(gt=0, description="Per-strategy scan cadence; <= idle_seconds / 2."),
    ] = 60

    @model_validator(mode="after")
    def _validate_event_field(self) -> Self:
        # Reject the trigger if any listed event class lacks `event_field`.
        for event_cls in self.on:
            if self.event_field in event_cls.model_fields:  # type: ignore[operator]  # Pydantic model_fields → dict via @deprecated_instance_property (pydantic/main.py:277)
                continue
            available = list(event_cls.model_fields)  # type: ignore[arg-type]  # same as above
            raise ValueError(
                f"event_field {self.event_field!r} not found in "
                f"{event_cls.__name__} fields (available: {available})"
            )
        return self

    @model_validator(mode="after")
    def _validate_scan_interval_bound(self) -> Self:
        # The scan cadence may be at most half the idle window (floor
        # division), so the default of 60 requires idle_seconds >= 120.
        if self.scan_interval_seconds <= self.idle_seconds // 2:
            return self
        raise ValueError(
            f"Idle: scan_interval_seconds ({self.scan_interval_seconds}) "
            f"must be <= idle_seconds // 2 ({self.idle_seconds // 2})"
        )
# Union of the three concrete trigger descriptors defined in this module.
Trigger = Immediate | Cron | Idle

View File

View File

@ -0,0 +1,132 @@
"""LanceDB business persistence layer.
Sits on top of :mod:`everos.core.persistence.lancedb` (connection
factory + ``BaseLanceTable`` + ``LanceRepoBase``) and provides:
* lazy process-wide connection + per-name table cache
(:mod:`.lancedb_manager`)
* concrete schemas under :mod:`.tables`
* concrete repository singletons under :mod:`.repos`
External usage::
from everos.infra.persistence.lancedb import (
get_connection, get_table, dispose_connection,
Episode, AtomicFact, Foresight, AgentCase, AgentSkill, UserProfile,
episode_repo, atomic_fact_repo, foresight_repo,
agent_case_repo, agent_skill_repo, user_profile_repo,
)
Three index kinds: scalar / BM25 / vector. Tables are created lazily on
first access; row population is the cascade daemon's job (see
``12_cascade_design.md``).
"""
# Importing ``tables`` registers every business :class:`BaseLanceTable`
# schema so callers can rely on the package alone to surface every schema.
from . import tables as tables # noqa: F401
from .lancedb_manager import dispose_connection as dispose_connection
from .lancedb_manager import get_connection as get_connection
from .lancedb_manager import get_table as get_table
from .repos import agent_case_repo as agent_case_repo
from .repos import agent_skill_repo as agent_skill_repo
from .repos import atomic_fact_repo as atomic_fact_repo
from .repos import episode_repo as episode_repo
from .repos import foresight_repo as foresight_repo
from .repos import user_profile_repo as user_profile_repo
from .tables import AgentCase as AgentCase
from .tables import AgentSkill as AgentSkill
from .tables import AtomicFact as AtomicFact
from .tables import Episode as Episode
from .tables import Foresight as Foresight
from .tables import ParentType as ParentType
from .tables import UserProfile as UserProfile
# Every business table schema; iterated by ``ensure_business_indexes`` and
# ``verify_business_schemas`` in this module.
_BUSINESS_SCHEMAS = (
    Episode,
    AtomicFact,
    Foresight,
    AgentCase,
    AgentSkill,
    UserProfile,
)
class LanceDBSchemaMismatchError(RuntimeError):
    """Raised at startup when an on-disk LanceDB table's columns drift
    from the corresponding Pydantic schema.

    Cascade re-builds LanceDB from md (the SoT), so the recovery is
    deterministic: delete the index directory and let it reindex.
    The message built by :func:`verify_business_schemas` carries the
    explicit ``rm -rf ~/.everos/.index/lancedb`` instruction; see
    ``docs/cascade_runbook.md`` for the wider context.
    """
async def ensure_business_indexes() -> None:
    """Make sure each business table has its FTS (BM25) indexes.

    Called once at startup by :class:`LanceDBLifespanProvider`. For every
    schema listed in ``_BUSINESS_SCHEMAS`` the table is resolved through
    :func:`get_table` (keyed by the schema's ``TABLE_NAME``) and handed to
    ``schema.ensure_fts_indexes(table)``. That helper is expected to skip
    columns that are already indexed, which is what makes re-runs no-ops.

    Adding a new business table = adding it to ``_BUSINESS_SCHEMAS``;
    the table name and the columns to index are read off the schema's
    ClassVars.
    """
    for table_schema in _BUSINESS_SCHEMAS:
        table_name = table_schema.TABLE_NAME
        handle = await get_table(table_name, table_schema)
        await table_schema.ensure_fts_indexes(handle)
async def verify_business_schemas() -> None:
    """Abort startup when a LanceDB table's columns differ from its schema.

    LanceDB doesn't migrate columns automatically; an older index dir
    (e.g. with the pre-``content_sha256`` shape) would fail unpredictably
    on upsert. Comparing column names up-front turns that into a clean
    startup error that points at the recovery path
    (``rm -rf ~/.everos/.index/lancedb`` — the index is rebuildable from
    md, see ``12_cascade_design.md``).

    Only column *names* are compared, as two sets; column types and
    ordering are not inspected.

    Raises:
        LanceDBSchemaMismatchError: On the first table whose on-disk
            column names are not exactly the schema's field names.
    """
    for table_schema in _BUSINESS_SCHEMAS:
        handle = await get_table(table_schema.TABLE_NAME, table_schema)
        on_disk = set((await handle.schema()).names)
        declared = set(table_schema.model_fields.keys())
        if on_disk == declared:
            continue

        missing = declared - on_disk
        extra = on_disk - declared
        raise LanceDBSchemaMismatchError(
            f"LanceDB table {table_schema.TABLE_NAME!r} schema drift: "
            f"missing={sorted(missing)}, extra={sorted(extra)}. "
            "The index is rebuildable from md — recover with "
            "`rm -rf ~/.everos/.index/lancedb` and restart."
        )
# Public surface of the package: the re-exported schemas, repo singletons
# and connection helpers, plus the startup helpers defined in this module.
__all__ = [
    "AgentCase",
    "AgentSkill",
    "AtomicFact",
    "Episode",
    "Foresight",
    "LanceDBSchemaMismatchError",
    "ParentType",
    "UserProfile",
    "agent_case_repo",
    "agent_skill_repo",
    "atomic_fact_repo",
    "dispose_connection",
    "ensure_business_indexes",
    "episode_repo",
    "foresight_repo",
    "get_connection",
    "get_table",
    "user_profile_repo",
    "verify_business_schemas",
]

View File

@ -0,0 +1,82 @@
"""LanceDB connection + table singletons (lazy + process-wide, async).
The single place that owns the LanceDB **runtime state**: the async
connection and per-name table cache. Connection opens lazily on first
:func:`get_connection` call; tables are cached after first
:func:`get_table`. The :class:`LanceDBLifespanProvider` calls
:func:`dispose_connection` on shutdown; in scripts you can call it
manually.
"""
from __future__ import annotations
import asyncio
from lancedb import AsyncConnection, AsyncTable
from everos.config import load_settings
from everos.core.observability.logging import get_logger
from everos.core.persistence import BaseLanceTable, MemoryRoot, open_lancedb_connection
logger = get_logger(__name__)

# Process-wide runtime state. The functions in this module read and write
# these names while holding ``_lock``.
# Connection handle; ``None`` until ``_ensure_connection_locked`` opens it.
_conn: AsyncConnection | None = None
# Table name -> table handle, filled on first ``get_table`` per name.
_tables: dict[str, AsyncTable] = {}
# Serialises connection open/close and table-cache population.
_lock = asyncio.Lock()
async def get_connection() -> AsyncConnection:
    """Hand out the shared async LanceDB connection, opening it if needed.

    The first call opens the connection from
    ``MemoryRoot.default().lancedb_dir`` and ``Settings.lancedb``; later
    calls get the same instance back. The open-or-reuse step runs under
    the module lock.
    """
    async with _lock:
        connection = await _ensure_connection_locked()
    return connection
async def get_table(
    name: str,
    schema: type[BaseLanceTable],
) -> AsyncTable:
    """Return the cached handle for table ``name``, opening or creating it once.

    On a cache miss the connection's table listing is consulted: an
    existing table is opened as-is, a missing one is created from
    ``schema``. Either way the handle is stored in the per-name cache so
    later calls return it directly. The whole lookup runs under the
    module lock.

    Args:
        name: LanceDB table name (also the cache key).
        schema: Table schema; only used when the table has to be created.
    """
    async with _lock:
        cached = _tables.get(name)
        if cached is not None:
            return cached

        conn = await _ensure_connection_locked()
        listing = await conn.list_tables()
        if name in list(listing.tables):
            handle = await conn.open_table(name)
            event = "lancedb_table_opened"
        else:
            handle = await conn.create_table(name, schema=schema)
            event = "lancedb_table_created"

        _tables[name] = handle
        logger.info(event, name=name)
        return handle
async def dispose_connection() -> None:
    """Close the connection + clear table cache. Idempotent.

    Runs under the module lock. Closing is best-effort: if ``close()``
    raises, the failure is logged and the module state is still reset so
    the next :func:`get_connection` call opens a fresh connection. The
    ``lancedb_connection_closed`` event is emitted only when ``close()``
    returned normally, so it never follows a ``lancedb_close_failed``
    record for the same shutdown.
    """
    global _conn
    async with _lock:
        if _conn is not None:
            try:
                _conn.close()  # AsyncConnection.close() is sync in lancedb 0.30
            except Exception:
                # Deliberate best-effort: record the failure with traceback
                # and fall through to the state reset below.
                logger.exception("lancedb_close_failed")
            else:
                logger.info("lancedb_connection_closed")
        # Reset unconditionally so a second call is a no-op.
        _conn = None
        _tables.clear()
async def _ensure_connection_locked() -> AsyncConnection:
    """Return the shared connection, opening it on first use.

    Caller must hold ``_lock``. When no connection exists yet, settings
    are loaded, the default memory root is ensured, and the connection is
    opened against the root's ``lancedb_dir`` with ``settings.lancedb``.
    """
    global _conn
    if _conn is not None:
        return _conn

    app_settings = load_settings()
    root = MemoryRoot.default()
    root.ensure()
    index_dir = root.lancedb_dir
    _conn = await open_lancedb_connection(index_dir, app_settings.lancedb)
    logger.info("lancedb_connection_opened", path=str(root.lancedb_dir))
    return _conn

View File

@ -0,0 +1,37 @@
"""LanceDB repo singletons (one per business table).
Each repo is a module-level singleton — the table connection is
resolved lazily on first call via :func:`..lancedb_manager.get_table`.
Subclassing :class:`LanceRepoBase` lets each repo carry table-specific
helpers later (e.g. ``find_by_owner``, ``search_for_owner``) without
needing a separate factory.
External usage::
from everos.infra.persistence.lancedb.repos import (
episode_repo,
atomic_fact_repo,
foresight_repo,
agent_case_repo,
agent_skill_repo,
user_profile_repo,
)
await episode_repo.add([Episode(...)])
"""
from .agent_case import agent_case_repo as agent_case_repo
from .agent_skill import agent_skill_repo as agent_skill_repo
from .atomic_fact import atomic_fact_repo as atomic_fact_repo
from .episode import episode_repo as episode_repo
from .foresight import foresight_repo as foresight_repo
from .user_profile import user_profile_repo as user_profile_repo
# Public surface: one repo singleton per business table.
__all__ = [
    "agent_case_repo",
    "agent_skill_repo",
    "atomic_fact_repo",
    "episode_repo",
    "foresight_repo",
    "user_profile_repo",
]

View File

@ -0,0 +1,20 @@
"""LanceDB repo singleton for the ``agent_case`` table."""
from __future__ import annotations
from lancedb import AsyncTable
from everos.core.persistence.lancedb import LanceDailyLogRepoBase
from ..lancedb_manager import get_table
from ..tables.agent_case import AgentCase
class _AgentCaseRepo(LanceDailyLogRepoBase[AgentCase]):
    """Repository for :class:`AgentCase` rows, built on ``LanceDailyLogRepoBase``."""

    # Row schema handled by this repo; also supplies ``TABLE_NAME``.
    schema = AgentCase

    async def _table_lookup(self) -> AsyncTable:
        """Resolve this repo's table through the shared ``get_table`` cache.

        NOTE(review): presumably the hook the base repo calls to obtain
        its table — confirm in ``LanceDailyLogRepoBase``.
        """
        return await get_table(self.schema.TABLE_NAME, self.schema)


# Module-level singleton; constructing it does not call ``get_table``.
agent_case_repo = _AgentCaseRepo()

View File

@ -0,0 +1,84 @@
"""LanceDB repo singleton for the ``agent_skill`` table."""
from __future__ import annotations
from collections.abc import Sequence
from lancedb import AsyncTable
from everos.core.persistence.lancedb import LanceRepoBase
from ..lancedb_manager import get_table
from ..tables.agent_skill import AgentSkill
class _AgentSkillRepo(LanceRepoBase[AgentSkill]):
    """Repository for :class:`AgentSkill` rows with cluster-scoped helpers."""

    schema = AgentSkill

    async def _table_lookup(self) -> AsyncTable:
        """Resolve this repo's table through the shared ``get_table`` cache."""
        return await get_table(self.schema.TABLE_NAME, self.schema)

    async def count_in_cluster(self, *, owner_id: str, cluster_id: str) -> int:
        """Return how many skills exist under one ``(owner_id, cluster_id)``."""
        table = await self._table()
        predicate = _in_cluster(owner_id, cluster_id)
        return await table.count_rows(filter=predicate)

    async def find_in_cluster(
        self, *, owner_id: str, cluster_id: str, limit: int
    ) -> list[AgentSkill]:
        """Fetch up to ``limit`` skills of one cluster by scalar filter only.

        No relevance ranking is applied.
        """
        predicate = _in_cluster(owner_id, cluster_id)
        return await self.find_where(predicate, limit=limit)

    async def find_topk_relevant_in_cluster(
        self,
        *,
        owner_id: str,
        cluster_id: str,
        query_vector: Sequence[float],
        top_k: int,
    ) -> list[AgentSkill]:
        """Return the ``top_k`` skills of one cluster nearest to ``query_vector``.

        The ranking runs inside LanceDB (``nearest_to`` with
        ``distance_type("cosine")``) and is restricted to the cluster by a
        ``where`` predicate. The cosine metric is intended to match
        :class:`AgentSkillRecaller.dense_recall` (defined elsewhere) so
        ranking semantics stay consistent across read paths.

        Raises:
            ValueError: When ``query_vector`` is empty. Having no
                relevance signal is a caller-side policy decision; use
                :meth:`find_in_cluster` for the scalar fallback.
        """
        if not query_vector:
            raise ValueError(
                "query_vector must be non-empty; "
                "call find_in_cluster for the scalar fallback"
            )

        table = await self._table()
        search = table.query().nearest_to(list(query_vector))
        search = search.distance_type("cosine")
        search = search.where(_in_cluster(owner_id, cluster_id))
        ranked_rows = await search.limit(top_k).to_list()

        skills: list[AgentSkill] = []
        for row in ranked_rows:
            # Ranked rows carry an extra ``_distance`` key; drop it before
            # validation so the result does not depend on the schema's
            # pydantic ``extra`` mode.
            payload = dict(row)
            payload.pop("_distance", None)
            skills.append(self.schema.model_validate(payload))
        return skills
def _q(value: str) -> str:
"""SQL single-quote escape for LanceDB ``where`` predicate literals."""
return value.replace("'", "''")
def _in_cluster(owner_id: str, cluster_id: str) -> str:
    """Build the ``where`` predicate selecting one ``(owner_id, cluster_id)``.

    Both values are quote-escaped with ``_q`` before being embedded.
    """
    owner_clause = f"owner_id = '{_q(owner_id)}'"
    cluster_clause = f"cluster_id = '{_q(cluster_id)}'"
    return " AND ".join((owner_clause, cluster_clause))
# Module-level singleton; constructing it does not call ``get_table``.
agent_skill_repo = _AgentSkillRepo()

View File

@ -0,0 +1,20 @@
"""LanceDB repo singleton for the ``atomic_fact`` table."""
from __future__ import annotations
from lancedb import AsyncTable
from everos.core.persistence.lancedb import LanceDailyLogRepoBase
from ..lancedb_manager import get_table
from ..tables.atomic_fact import AtomicFact
class _AtomicFactRepo(LanceDailyLogRepoBase[AtomicFact]):
    """Repository for :class:`AtomicFact` rows, built on ``LanceDailyLogRepoBase``."""

    # Row schema handled by this repo; also supplies ``TABLE_NAME``.
    schema = AtomicFact

    async def _table_lookup(self) -> AsyncTable:
        """Resolve this repo's table through the shared ``get_table`` cache.

        NOTE(review): presumably the hook the base repo calls to obtain
        its table — confirm in ``LanceDailyLogRepoBase``.
        """
        return await get_table(self.schema.TABLE_NAME, self.schema)


# Module-level singleton; constructing it does not call ``get_table``.
atomic_fact_repo = _AtomicFactRepo()

View File

@ -0,0 +1,20 @@
"""LanceDB repo singleton for the ``episode`` table."""
from __future__ import annotations
from lancedb import AsyncTable
from everos.core.persistence.lancedb import LanceDailyLogRepoBase
from ..lancedb_manager import get_table
from ..tables.episode import Episode
class _EpisodeRepo(LanceDailyLogRepoBase[Episode]):
    """Repository for :class:`Episode` rows, built on ``LanceDailyLogRepoBase``."""

    # Row schema handled by this repo; also supplies ``TABLE_NAME``.
    schema = Episode

    async def _table_lookup(self) -> AsyncTable:
        """Resolve this repo's table through the shared ``get_table`` cache.

        NOTE(review): presumably the hook the base repo calls to obtain
        its table — confirm in ``LanceDailyLogRepoBase``.
        """
        return await get_table(self.schema.TABLE_NAME, self.schema)


# Module-level singleton; constructing it does not call ``get_table``.
episode_repo = _EpisodeRepo()

View File

@ -0,0 +1,20 @@
"""LanceDB repo singleton for the ``foresight`` table."""
from __future__ import annotations
from lancedb import AsyncTable
from everos.core.persistence.lancedb import LanceDailyLogRepoBase
from ..lancedb_manager import get_table
from ..tables.foresight import Foresight
class _ForesightRepo(LanceDailyLogRepoBase[Foresight]):
    """Repository for :class:`Foresight` rows, built on ``LanceDailyLogRepoBase``."""

    # Row schema handled by this repo; also supplies ``TABLE_NAME``.
    schema = Foresight

    async def _table_lookup(self) -> AsyncTable:
        """Resolve this repo's table through the shared ``get_table`` cache.

        NOTE(review): presumably the hook the base repo calls to obtain
        its table — confirm in ``LanceDailyLogRepoBase``.
        """
        return await get_table(self.schema.TABLE_NAME, self.schema)


# Module-level singleton; constructing it does not call ``get_table``.
foresight_repo = _ForesightRepo()

View File

@ -0,0 +1,20 @@
"""LanceDB repo singleton for the ``user_profile`` table."""
from __future__ import annotations
from lancedb import AsyncTable
from everos.core.persistence.lancedb import LanceRepoBase
from ..lancedb_manager import get_table
from ..tables.user_profile import UserProfile
class _UserProfileRepo(LanceRepoBase[UserProfile]):
    """Repository for :class:`UserProfile` rows, built on ``LanceRepoBase``."""

    # Row schema handled by this repo; also supplies ``TABLE_NAME``.
    schema = UserProfile

    async def _table_lookup(self) -> AsyncTable:
        """Resolve this repo's table through the shared ``get_table`` cache.

        NOTE(review): presumably the hook the base repo calls to obtain
        its table — confirm in ``LanceRepoBase``.
        """
        return await get_table(self.schema.TABLE_NAME, self.schema)


# Module-level singleton; constructing it does not call ``get_table``.
user_profile_repo = _UserProfileRepo()

View File

@ -0,0 +1,35 @@
"""LanceDB table schemas (one ``BaseLanceTable`` subclass per business table).
Schemas live here; cascade-daemon-driven row population is wired
through the matching repo singletons in :mod:`..repos`.
External usage::
from everos.infra.persistence.lancedb.tables import (
Episode,
AtomicFact,
Foresight,
AgentCase,
AgentSkill,
UserProfile,
ParentType,
)
"""
from ._parent_type import ParentType as ParentType
from .agent_case import AgentCase as AgentCase
from .agent_skill import AgentSkill as AgentSkill
from .atomic_fact import AtomicFact as AtomicFact
from .episode import Episode as Episode
from .foresight import Foresight as Foresight
from .user_profile import UserProfile as UserProfile
# Public surface: the six table schemas plus the ``ParentType`` enum.
__all__ = [
    "AgentCase",
    "AgentSkill",
    "AtomicFact",
    "Episode",
    "Foresight",
    "ParentType",
    "UserProfile",
]

View File

@ -0,0 +1,24 @@
"""``ParentType`` — provenance label for memory records linked back to a source.
Currently the only value is :attr:`ParentType.MEMCELL`: every business row
(episode / foresight / atomic_fact / agent_case) points back to a source
MemCell. The earlier opensource design enumerated ``"episode"`` as an
alternative parent but the production path never wrote that value, so the
new framework collapses the enum to its single in-use member.
Kept as an :class:`enum.Enum` (rather than a bare string constant) so that
adding a future parent kind stays a non-breaking enum extension. LanceDB's
pydantic-to-arrow conversion does not accept ``Enum`` field annotations,
so table schemas declare ``parent_type: str = ParentType.MEMCELL.value``
and reference the enum only at the default-value level.
"""
from __future__ import annotations
from enum import StrEnum
class ParentType(StrEnum):
    """Provenance label of a memory record's parent."""

    # The only member today: the parent is a source MemCell. Being a
    # ``StrEnum`` member, it compares equal to the plain string "memcell".
    MEMCELL = "memcell"

View File

@ -0,0 +1,84 @@
"""LanceDB ``agent_case`` table schema.
Field set per 17_lancedb_tables_design.md §3.4. Each row records one
task an agent worked on: intent, approach, optional pivotal insight,
and a quality score. A MemCell extracted on the agent's own execution
log yields at most one AgentCase.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
# Width passed to ``Vector(...)`` for the ``vector`` column below.
_DIM = 1024


class AgentCase(BaseLanceTable):
    """One agent case indexed in LanceDB.

    Fields are kept in their declared order. NOTE(review): the on-disk
    column order presumably follows declaration order — confirm in
    ``BaseLanceTable`` before reordering anything.
    """

    TABLE_NAME: ClassVar[str] = "agent_case"
    BM25_FIELDS: ClassVar[list[str]] = ["task_intent_tokens", "approach_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>``."""
    entry_id: str
    """md-side seq id ``ac_<YYYYMMDD>_<NNNN>``."""
    owner_id: str
    """The owning ``agent_id``."""
    owner_type: str
    """Fixed ``"agent"`` for this table."""
    # The string after ``project_id`` documents both ``app_id`` and
    # ``project_id``.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    session_id: str
    timestamp: _dt.datetime

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for agent case."""
    parent_id: str
    """Source memcell id (one memcell ↔ one case)."""

    quality_score: float
    """0.0–1.0; task completion / quality estimate."""

    task_intent: str
    """≤ 50 tokens; original surface form (returned for display)."""
    task_intent_tokens: str
    """App-layer pre-tokenised ``task_intent`` — BM25 main field
    (whitespace tokenizer); display goes through ``task_intent``."""
    approach: str
    """≤ 1000 tokens; step-by-step approach (display)."""
    approach_tokens: str
    """App-layer pre-tokenised ``approach`` — BM25 secondary field
    (whitespace tokenizer). Searched in parallel with
    ``task_intent_tokens`` then merged by max score in the recall
    layer; task_intent typically scores higher because it's the
    retrieval anchor, but approach catches queries that match a step
    detail."""
    key_insight: str | None = None
    """≤ 40 tokens; pivotal strategy shift, optional."""

    # NOTE(review): presumably the path of the md file this row was
    # indexed from — confirm against the cascade handler.
    md_path: str
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the md entry — TaskIntent / Approach / KeyInsight sections plus
    the ``quality_score`` inline. Audit inline (owner_id /
    session_id / timestamp / parent_id) is NOT in the hash. See
    :attr:`AgentCaseHandler.content_change_keys`."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,80 @@
"""LanceDB ``agent_skill`` table schema.
Field set per 17_lancedb_tables_design.md §3.5. AgentSkill is a *named
entity* rather than a daily-log entry — PK is ``<owner_id>_<skill_name>``
(no date / seq), and same agent + same name is the same row (upsert).
``content`` is cascade-assembled from ``SKILL.md`` body plus every
``references/*.md`` sibling; ``scripts/`` is not indexed.
"""
from __future__ import annotations
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
# Width passed to ``Vector(...)`` for the ``vector`` column below.
_DIM = 1024


class AgentSkill(BaseLanceTable):
    """One agent skill indexed in LanceDB.

    Fields are kept in their declared order. NOTE(review): the on-disk
    column order presumably follows declaration order — confirm in
    ``BaseLanceTable`` before reordering anything.
    """

    TABLE_NAME: ClassVar[str] = "agent_skill"
    BM25_FIELDS: ClassVar[list[str]] = ["description_tokens", "content_tokens"]

    id: str
    """PK = ``<owner_id>_<skill_name>``."""
    owner_id: str
    """The owning ``agent_id``."""
    owner_type: str
    """Fixed ``"agent"`` for this table."""
    # The string after ``project_id`` documents both ``app_id`` and
    # ``project_id``.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    name: str
    """Skill identifier; half of the PK."""
    description: str
    """When-to-use / purpose — original surface form (Tier-1 ad copy)."""
    description_tokens: str
    """App-layer pre-tokenised ``description`` — BM25 main field
    (whitespace tokenizer); display goes through ``description``."""
    content: str
    """Cascade-assembled body: ``SKILL.md`` main text concatenated with
    every ``references/*.md`` sibling. ``scripts/`` files are excluded."""
    content_tokens: str
    """App-layer pre-tokenised ``content`` (secondary BM25 field).
    Tokenised by cascade when assembling ``content`` from md sources."""

    confidence: float
    """0.0–1.0; LLM-emitted confidence in the skill."""
    maturity_score: float
    """0.0–1.0; LLM-emitted maturity score. The retrieval-time threshold
    (``maturity_threshold``) lives in MemorizeConfig, not in this row."""

    source_case_ids: list[str]
    """AgentCase ids that fed into this skill's synthesis (lineage)."""
    cluster_id: str | None = None
    """Optional MemScene clustering tag."""

    # NOTE(review): presumably the path of the md file this row was
    # indexed from — confirm against the cascade handler.
    md_path: str
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the skill: ``name`` / ``description`` (frontmatter) + SKILL.md
    body + concatenated references content + ``confidence`` /
    ``maturity_score``. Cascade handler diffs by this digest to skip
    re-upsert + re-embed when neither retrieval-anchor text nor scores
    changed (e.g. the watcher fires for unrelated stat updates). See
    :attr:`AgentSkillHandler.content_change_keys`."""

    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,62 @@
"""LanceDB ``atomic_fact`` table schema.
Field set per 17_lancedb_tables_design.md §3.2. Each row carries one
atomic fact extracted by the algo layer; the parent is always the source
MemCell — recorded via ``parent_type`` / ``parent_id``.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
# Width passed to ``Vector(...)`` for the ``vector`` column below.
_DIM = 1024


class AtomicFact(BaseLanceTable):
    """One atomic fact indexed in LanceDB.

    NOTE(review): the on-disk column order presumably follows field
    declaration order — confirm in ``BaseLanceTable`` before reordering.
    """

    # Table name and the columns that get an FTS (BM25) index.
    TABLE_NAME: ClassVar[str] = "atomic_fact"
    BM25_FIELDS: ClassVar[list[str]] = ["fact_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>``."""
    entry_id: str
    """md-side seq id ``af_<YYYYMMDD>_<NNNN>``."""
    # NOTE(review): owner_type presumably distinguishes user-owned from
    # agent-owned rows — confirm the allowed values with the cascade handler.
    owner_id: str
    owner_type: str
    # The string after ``project_id`` documents both ``app_id`` and
    # ``project_id``.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    session_id: str
    timestamp: _dt.datetime

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for atomic fact."""
    parent_id: str
    """Source memcell id."""

    sender_ids: list[str]

    fact: str
    """Atomic fact text — original surface form (returned for display)."""
    fact_tokens: str
    """App-layer pre-tokenised ``fact`` text — space-joined tokens.
    BM25 index is built on this column (whitespace tokenizer);
    ``fact`` itself is what callers display."""

    # NOTE(review): presumably the path of the md file this row was
    # indexed from — confirm against the cascade handler.
    md_path: str
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the md entry (per :attr:`AtomicFactHandler.content_change_keys`).
    Matching digest → skip re-upsert + re-embed. Audit inline fields
    (owner_id / session_id / timestamp / parent_id / sender_ids) are
    NOT in the hash."""

    # Vector column declared with ``_DIM`` components.
    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,78 @@
"""LanceDB ``episode`` table schema.
Field set is fixed by the LanceDB tables design spec. Rows are populated
by the cascade daemon from ``users/<owner_id>/episodes/episode-<YYYY-MM-DD>.md``
and from ``agents/<owner_id>/episodes/...`` symmetrically.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
# Vector dimension is settings-managed at runtime; the class-level
# constant pins the schema dim used at table creation.
_DIM = 1024


class Episode(BaseLanceTable):
    """One episode record indexed in LanceDB.

    NOTE(review): the on-disk column order presumably follows field
    declaration order — confirm in ``BaseLanceTable`` before reordering.
    """

    # Table name and the columns that get an FTS (BM25) index.
    TABLE_NAME: ClassVar[str] = "episode"
    BM25_FIELDS: ClassVar[list[str]] = ["episode_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>`` (scalar PK)."""
    entry_id: str
    """md-side seq id ``ep_<YYYYMMDD>_<NNNN>`` (cascade reverse-lookup)."""
    # NOTE(review): owner_type presumably distinguishes the ``users/`` and
    # ``agents/`` tracks named in the module docstring — confirm the values.
    owner_id: str
    owner_type: str
    # The string after ``project_id`` documents both ``app_id`` and
    # ``project_id``.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    session_id: str
    timestamp: _dt.datetime

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for episode."""
    parent_id: str
    """Source memcell id. The pipeline knows the memcell currently being
    processed and writes its id into the md entry's inline block; the
    cascade handler reads it back. The new everalgo Episode type no
    longer emits ``parent_id`` itself (collapsed to caller-managed),
    so this is filled entirely from everos's engineering context."""

    sender_ids: list[str]
    """Distinct ``role=user|assistant`` senders behind the episode."""

    # Optional short fields; both default to ``None``.
    subject: str | None = None
    summary: str | None = None
    episode: str
    """Full narrative text — original surface form (returned for display)."""
    episode_tokens: str
    """App-layer pre-tokenised ``episode`` text — space-joined tokens
    (e.g. produced by jieba). LanceDB FTS index is built on **this**
    column using a whitespace tokenizer; the original ``episode`` field
    is what callers display. Two-field BM25 scheme keeps tokenisation
    deterministic and provider-pluggable at the app layer."""

    # NOTE(review): presumably the path of the md file this row was
    # indexed from — confirm against the cascade handler.
    md_path: str
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of the
    md entry (per :attr:`EpisodeHandler.content_change_keys`). On
    re-reconcile, a matching digest means none of the persistence /
    embedding-relevant fields changed — the entry is skipped (no
    re-upsert, no re-embed). Inline audit fields (owner_id /
    session_id / timestamp / parent_id / sender_ids) are intentionally
    NOT in the hash so editing them doesn't waste an embedding call.
    See ``16_cascade_impl_design.md`` §3.3."""

    # Vector column declared with ``_DIM`` components.
    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,79 @@
"""LanceDB ``foresight`` table schema.
Field set per 17_lancedb_tables_design.md §3.3. Each row carries a
forward-looking inference about the user (intent window, planned
action, projected need); ``start_time`` / ``end_time`` describe the
window the foresight applies to.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable, Vector
from ._parent_type import ParentType
# Width passed to ``Vector(...)`` for the ``vector`` column below.
_DIM = 1024


class Foresight(BaseLanceTable):
    """One foresight record indexed in LanceDB.

    NOTE(review): the on-disk column order presumably follows field
    declaration order — confirm in ``BaseLanceTable`` before reordering.
    """

    # Table name and the columns that get an FTS (BM25) index. Note that
    # ``evidence_tokens`` is nullable (see its declaration below).
    TABLE_NAME: ClassVar[str] = "foresight"
    BM25_FIELDS: ClassVar[list[str]] = ["foresight_tokens", "evidence_tokens"]

    id: str
    """PK = ``<owner_id>_<entry_id>``."""
    entry_id: str
    """md-side seq id ``fs_<YYYYMMDD>_<NNNN>``."""
    # NOTE(review): owner_type presumably distinguishes user-owned from
    # agent-owned rows — confirm the allowed values with the cascade handler.
    owner_id: str
    owner_type: str
    # The string after ``project_id`` documents both ``app_id`` and
    # ``project_id``.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    session_id: str
    timestamp: _dt.datetime
    """Foresight generation time."""

    start_time: _dt.datetime | None = None
    """Foresight coverage window start; tz-aware."""
    end_time: _dt.datetime | None = None
    """Foresight coverage window end; tz-aware."""
    # NOTE(review): presumably the window length in days — confirm whether
    # it is derived from start_time / end_time or emitted independently.
    duration_days: int | None = None

    parent_type: str = ParentType.MEMCELL.value
    """Source pointer — always :attr:`ParentType.MEMCELL` for foresight."""
    parent_id: str
    """Source memcell id."""

    sender_ids: list[str]

    foresight: str
    """Foresight body — original surface form (returned for display)."""
    foresight_tokens: str
    """App-layer pre-tokenised ``foresight`` text — space-joined tokens.
    BM25 index is built on this column (whitespace tokenizer)."""
    evidence: str | None = None
    """Supporting evidence excerpt; may be empty."""
    evidence_tokens: str | None = None
    """App-layer pre-tokenised ``evidence`` (secondary BM25 field).
    ``None`` whenever ``evidence`` is None."""

    # NOTE(review): presumably the path of the md file this row was
    # indexed from — confirm against the cascade handler.
    md_path: str
    content_sha256: str
    """SHA-256 hex digest over the **content-bearing fields only** of
    the md entry — Foresight / Evidence sections plus the time-window
    inline fields (start_time / end_time / duration_days). Audit inline
    (owner_id / session_id / timestamp / parent_id / sender_ids) is NOT
    in the hash. See :attr:`ForesightHandler.content_change_keys`."""

    # Vector column declared with ``_DIM`` components.
    vector: Vector(_DIM)  # type: ignore[valid-type]

View File

@ -0,0 +1,68 @@
"""LanceDB ``user_profile`` table schema.
Profile is a single-file kind: one ``users/<user_id>/user.md`` per
user, replaced wholesale on edit (mirrors ``AgentSkill`` for the
upsert/single-row contract). The LanceDB row is a typed projection
of the md frontmatter that the cascade keeps in sync; it carries no
vector / no BM25 because the recall surface is pure KV-by-owner
(``fetch(owner_id)``) — when query-aware profile lookup ships later
the schema will gain ``vector`` + ``*_tokens`` columns then.
``explicit_info`` / ``implicit_traits`` are heterogeneous LLM
emissions (mostly small dicts mixed with strings) — LanceDB has no
``list[dict]`` column type, so we stash them as JSON strings and
unpack at the recall boundary into ``profile_data`` of the DTO.
"""
from __future__ import annotations
from typing import ClassVar
from everos.core.persistence.lancedb import BaseLanceTable
class UserProfile(BaseLanceTable):
    """One ``users/<user_id>/user.md`` indexed in LanceDB.

    Unlike the other business tables this schema declares neither a
    ``vector`` column nor ``BM25_FIELDS``.
    """

    TABLE_NAME: ClassVar[str] = "user_profile"
    # No BM25 columns: profile recall is KV-by-owner today.
    # NOTE(review): ``BM25_FIELDS`` is not overridden here, so whatever
    # ``BaseLanceTable`` defines applies when ``ensure_fts_indexes`` runs —
    # confirm the inherited default is empty.

    id: str
    """PK = ``owner_id`` (one row per user)."""
    owner_id: str
    owner_type: str
    """Always ``"user"`` for this schema; agent-side profiles would
    live in a sibling table once that schema lands."""
    # The string after ``project_id`` documents both ``app_id`` and
    # ``project_id``.
    app_id: str = "default"
    project_id: str = "default"
    """App / project scope (default ``"default"``); cascade fills from md path."""

    summary: str
    """Free-form one-paragraph user summary (retrieval anchor for the
    future query-aware lookup; today returned verbatim to the caller)."""

    explicit_info_json: str
    """JSON-serialised ``list[Any]`` — the algo's verbatim evidence
    bucket. Stored as a string because LanceDB has no
    ``list[dict]`` column type. The recaller json-decodes it back into
    ``profile_data['explicit_info']`` at the DTO boundary."""
    implicit_traits_json: str
    """Same shape as :attr:`explicit_info_json`, for the LLM-inferred
    preference bucket."""

    profile_timestamp_ms: int
    """Algo-emitted profile timestamp (ms epoch) — pinned to the
    timestamp of the freshest MemCell that fed into the synthesis.
    Mirrored from :attr:`UserProfileFrontmatter.profile_timestamp_ms`
    so downstream code can compare freshness without re-reading md."""

    # NOTE(review): presumably the path of the user's ``user.md`` file —
    # confirm against the cascade handler.
    md_path: str
    content_sha256: str
    """SHA-256 over the content-bearing frontmatter fields (summary +
    explicit_info_json + implicit_traits_json). Matches → cascade
    skips re-upsert. ``profile_timestamp_ms`` is intentionally not in
    the hash: it drifts with every synthesis even when the underlying
    content is identical, and the LanceDB row treats it as audit."""

View File

@ -0,0 +1,73 @@
"""Markdown business persistence layer.
Sits on top of :mod:`everos.core.persistence.markdown` (atomic write +
parse + frontmatter chassis) and provides:
* concrete frontmatter schemas under :mod:`.mds`
* concrete business writers under :mod:`.writers`
(``BaseDailyWriter`` + subclasses, ``AgentSkillWriter``,
``ProfileWriter``)
* concrete business readers under :mod:`.readers`
(``BaseDailyReader`` + subclasses, ``AgentSkillReader``,
``ProfileReader``)
External usage::
from everos.infra.persistence.markdown import (
BaseDailyWriter, BaseDailyReader,
EpisodeWriter, EpisodeReader, EpisodeDailyFrontmatter,
AtomicFactDailyFrontmatter,
ForesightDailyFrontmatter,
AgentCaseDailyFrontmatter,
AgentSkillFrontmatter, AgentSkillWriter, AgentSkillReader,
ProfileWriter, ProfileReader,
)
Outer layers MUST go through this top-level package because
``infra.persistence.markdown.**`` (sub-packages) are forbidden to outer
layers by import-linter.
"""
from .mds import AgentCaseDailyFrontmatter as AgentCaseDailyFrontmatter
from .mds import AgentSkillFrontmatter as AgentSkillFrontmatter
from .mds import AtomicFactDailyFrontmatter as AtomicFactDailyFrontmatter
from .mds import EpisodeDailyFrontmatter as EpisodeDailyFrontmatter
from .mds import ForesightDailyFrontmatter as ForesightDailyFrontmatter
from .mds import UserProfileFrontmatter as UserProfileFrontmatter
from .readers import AgentCaseReader as AgentCaseReader
from .readers import AgentSkillReader as AgentSkillReader
from .readers import AtomicFactReader as AtomicFactReader
from .readers import BaseDailyReader as BaseDailyReader
from .readers import EpisodeReader as EpisodeReader
from .readers import ForesightReader as ForesightReader
from .readers import ProfileReader as ProfileReader
from .writers import AgentCaseWriter as AgentCaseWriter
from .writers import AgentSkillWriter as AgentSkillWriter
from .writers import AtomicFactWriter as AtomicFactWriter
from .writers import BaseDailyWriter as BaseDailyWriter
from .writers import EpisodeWriter as EpisodeWriter
from .writers import ForesightWriter as ForesightWriter
from .writers import ProfileWriter as ProfileWriter
# Public surface: frontmatter schemas, readers and writers re-exported
# from the ``mds`` / ``readers`` / ``writers`` sub-packages.
__all__ = [
    "AgentCaseDailyFrontmatter",
    "AgentCaseReader",
    "AgentCaseWriter",
    "AgentSkillFrontmatter",
    "AgentSkillReader",
    "AgentSkillWriter",
    "AtomicFactDailyFrontmatter",
    "AtomicFactReader",
    "AtomicFactWriter",
    "BaseDailyReader",
    "BaseDailyWriter",
    "EpisodeDailyFrontmatter",
    "EpisodeReader",
    "EpisodeWriter",
    "ForesightDailyFrontmatter",
    "ForesightReader",
    "ForesightWriter",
    "ProfileReader",
    "ProfileWriter",
    "UserProfileFrontmatter",
]

View File

@ -0,0 +1,40 @@
"""Business markdown frontmatter schemas (mds = "markdown schemas").
Each business record kind that is stored as markdown gets a concrete
frontmatter class here, subclassing one of the chassis classes from
:mod:`everos.core.persistence.markdown`:
* :class:`UserScopedFrontmatter` for user-track records
* :class:`AgentScopedFrontmatter` for agent-track records
* :class:`BaseFrontmatter` for scope-agnostic records (rare)
Schemas drive path resolution via ClassVars; each storage strategy has
its own conventions:
- **Daily-log** schemas declare ``ENTRY_ID_PREFIX`` (token in
``<prefix>_<date>_<seq>``), ``DIR_NAME`` (sub-directory under
``<scope>/<id>/``) and ``FILE_PREFIX`` (leading token of the daily
filename joined with ``-<YYYY-MM-DD>.md``).
- **Skill** schemas (:class:`AgentSkillFrontmatter`) pin the directory
layout via five ``SKILL_*`` ClassVars (container / dir prefix /
main filename / references / scripts).
- **Profile** schemas declare ``PROFILE_FILENAME`` (``"user.md"`` /
``"agent.md"`` / …) and inherit ``SCOPE_DIR`` from a scope mixin; no
profile base class — the writer/reader pair is duck-typed.
"""
from .agent_case import AgentCaseDailyFrontmatter as AgentCaseDailyFrontmatter
from .agent_skill import AgentSkillFrontmatter as AgentSkillFrontmatter
from .atomic_fact import AtomicFactDailyFrontmatter as AtomicFactDailyFrontmatter
from .episode import EpisodeDailyFrontmatter as EpisodeDailyFrontmatter
from .foresight import ForesightDailyFrontmatter as ForesightDailyFrontmatter
from .profile import UserProfileFrontmatter as UserProfileFrontmatter
__all__ = [
"AgentCaseDailyFrontmatter",
"AgentSkillFrontmatter",
"AtomicFactDailyFrontmatter",
"EpisodeDailyFrontmatter",
"ForesightDailyFrontmatter",
"UserProfileFrontmatter",
]

View File

@ -0,0 +1,37 @@
"""AgentCase frontmatter — daily-log markdown for agent-scoped cases.
Path: ``agents/<scope_id>/.cases/agent_case-<YYYY-MM-DD>.md``.
The directory is dotfile-hidden (``.cases``) so users only see the
curated ``agent_skills/`` view, not the raw per-task case log — same
convention as ``.atomic_facts`` / ``.foresights``.
Each entry records one task an agent worked on: intent, approach taken,
quality score, and an optional pivotal insight. A MemCell extracted on
the agent's own execution log yields at most one AgentCase.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar, Literal
from everos.core.persistence.markdown import (
AgentScopedFrontmatter,
DailyLogPathMixin,
)
class AgentCaseDailyFrontmatter(DailyLogPathMixin, AgentScopedFrontmatter):
    """Frontmatter for ``agents/<scope>/.cases/agent_case-<YYYY-MM-DD>.md``.

    File-level metadata for one day's agent-case log. The three ClassVars
    feed path / entry-id construction; the remaining fields are the values
    stored in the file's frontmatter.
    """

    # Token used in entry ids of the form ``<prefix>_<date>_<seq>``.
    ENTRY_ID_PREFIX: ClassVar[str] = "ac"
    # Sub-directory under ``agents/<scope_id>/``; dot-prefixed so it stays hidden.
    DIR_NAME: ClassVar[str] = ".cases"
    # Leading token of the daily filename, joined with ``-<YYYY-MM-DD>.md``.
    FILE_PREFIX: ClassVar[str] = "agent_case"

    # Both tags are pinned to the same literal for this record kind.
    type: Literal["agent_case_daily"] = "agent_case_daily"
    file_type: Literal["agent_case_daily"] = "agent_case_daily"
    # Calendar day covered by this file (required, no default).
    date: _dt.date
    # AgentCaseWriter stamps ``entry_count`` and ``last_appended_at`` on each
    # append via its ``_frontmatter_updates`` hook.
    entry_count: int = 0
    # NOTE(review): nothing visible here sets ``created_at`` — presumably the
    # base writer does on file creation; confirm.
    created_at: _dt.datetime | None = None
    last_appended_at: _dt.datetime | None = None

View File

@ -0,0 +1,63 @@
"""AgentSkill frontmatter — single SKILL.md inside a skill directory.
Path: ``agents/<scope_id>/skills/skill_<name>/SKILL.md`` (plus sibling
``references/*.md`` and ``scripts/*.<ext>`` files that are not part of
the frontmatter contract).
Skills are *named entities* rather than daily-log entries: the
LanceDB primary key is ``<owner_id>_<skill_name>`` (no date / seq).
Upserts replace the file wholesale; the cascade daemon recomputes the
``content`` index column by concatenating ``SKILL.md`` body with every
``references/*.md`` sibling.
Five directory-shape ClassVars pin the layout in one place so the
writer / reader pair reads off them — no duplicated string literals.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar, Literal
from everos.core.persistence.markdown import (
AgentScopedFrontmatter,
SkillPathMixin,
)
class AgentSkillFrontmatter(SkillPathMixin, AgentScopedFrontmatter):
    """Frontmatter for ``agents/<scope>/skills/skill_<name>/SKILL.md``.

    The five ``SKILL*`` ClassVars pin the on-disk layout; the skill reader
    builds every path from them, so the literals live only here. The
    remaining fields are the per-skill values stored in the frontmatter.
    """

    # Container directory under ``agents/<agent_id>/``.
    SKILLS_CONTAINER_NAME: ClassVar[str] = "skills"
    # Prefix joined with the skill name to form the per-skill directory.
    SKILL_DIR_PREFIX: ClassVar[str] = "skill_"
    # Main file inside the skill directory.
    SKILL_MAIN_FILENAME: ClassVar[str] = "SKILL.md"
    # Sibling directory holding ``<reference_name>.md`` attachments.
    SKILL_REFERENCES_DIR_NAME: ClassVar[str] = "references"
    # Sibling directory holding script attachments.
    SKILL_SCRIPTS_DIR_NAME: ClassVar[str] = "scripts"

    type: Literal["agent_skill"] = "agent_skill"

    name: str
    """Skill identifier — also the directory suffix
    (``skills/skill_<name>/``). Keep snake_case so it is filesystem-safe
    and ID-stable."""

    description: str
    """One-line summary surfaced at Tier-1 prompt injection. Short — the
    agent's startup-time scanner reads ``(name, description)`` for every
    skill, so the token budget is tight."""

    confidence: float
    """LLM-emitted confidence in the skill's correctness, 0.0–1.0."""

    maturity_score: float
    """LLM-emitted maturity score, 0.0–1.0. The retrieval-time threshold
    (``maturity_threshold``) lives in MemorizeConfig, not on this file."""

    source_case_ids: list[str] = []
    """AgentCase ids that fed into this skill's synthesis (lineage)."""

    cluster_id: str | None = None
    """Optional MemScene clustering tag; may be unset early on."""

    created_at: _dt.datetime | None = None
    updated_at: _dt.datetime | None = None

View File

@ -0,0 +1,38 @@
"""AtomicFact frontmatter — daily-log markdown for user-scoped atomic facts.
Path: ``users/<scope_id>/.atomic_facts/atomic_fact-<YYYY-MM-DD>.md``.
The directory is dot-prefixed so it is hidden from end users (same
convention as ``.index``); atomic facts are framework-internal derived md,
not material the user is expected to read by hand.
Each entry carries one atomic fact extracted by the algo layer; the fact
always hangs off the source MemCell (see ``parent_type`` in each entry's
inline fields — handled at the StructuredEntry layer, not on the
file-level frontmatter).
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar, Literal
from everos.core.persistence.markdown import (
DailyLogPathMixin,
UserScopedFrontmatter,
)
class AtomicFactDailyFrontmatter(DailyLogPathMixin, UserScopedFrontmatter):
    """Frontmatter for ``users/<scope>/.atomic_facts/atomic_fact-<YYYY-MM-DD>.md``.

    File-level metadata for one day's atomic-fact log. Per-entry data such
    as ``parent_type`` is carried by the entries, not by these fields.
    """

    # Token used in entry ids of the form ``<prefix>_<date>_<seq>``.
    ENTRY_ID_PREFIX: ClassVar[str] = "af"
    # Sub-directory under ``users/<scope_id>/``; dot-prefixed so it stays hidden.
    DIR_NAME: ClassVar[str] = ".atomic_facts"
    # Leading token of the daily filename, joined with ``-<YYYY-MM-DD>.md``.
    FILE_PREFIX: ClassVar[str] = "atomic_fact"

    # Both tags are pinned to the same literal for this record kind.
    type: Literal["atomic_fact_daily"] = "atomic_fact_daily"
    file_type: Literal["atomic_fact_daily"] = "atomic_fact_daily"
    # Calendar day covered by this file (required, no default).
    date: _dt.date
    # NOTE(review): counters / timestamps are presumably maintained by
    # AtomicFactWriter on append — confirm against the writer.
    entry_count: int = 0
    created_at: _dt.datetime | None = None
    last_appended_at: _dt.datetime | None = None

View File

@ -0,0 +1,33 @@
"""Episode frontmatter — daily-log markdown for user-scoped episodes.
Path: ``users/<scope_id>/episodes/episode-<YYYY-MM-DD>.md``.
This milestone uses ``session_id`` as the scope key (since owner inference
is out of scope). When owner inference lands the scope key will switch to
``owner_id`` while the schema stays compatible.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar, Literal
from everos.core.persistence.markdown import (
DailyLogPathMixin,
UserScopedFrontmatter,
)
class EpisodeDailyFrontmatter(DailyLogPathMixin, UserScopedFrontmatter):
    """Frontmatter for ``users/<scope>/episodes/episode-<YYYY-MM-DD>.md``.

    File-level metadata for one day's episode log. Unlike the atomic-fact
    and foresight logs, the directory is not dot-prefixed.
    """

    # Token used in entry ids of the form ``<prefix>_<date>_<seq>``.
    ENTRY_ID_PREFIX: ClassVar[str] = "ep"
    # Sub-directory under ``users/<scope_id>/``.
    DIR_NAME: ClassVar[str] = "episodes"
    # Leading token of the daily filename, joined with ``-<YYYY-MM-DD>.md``.
    FILE_PREFIX: ClassVar[str] = "episode"

    # Both tags are pinned to the same literal for this record kind.
    type: Literal["episode_daily"] = "episode_daily"
    file_type: Literal["episode_daily"] = "episode_daily"
    # Calendar day covered by this file (required, no default).
    date: _dt.date
    # NOTE(review): counters / timestamps are presumably maintained by
    # EpisodeWriter on append — confirm against the writer.
    entry_count: int = 0
    created_at: _dt.datetime | None = None
    last_appended_at: _dt.datetime | None = None

View File

@ -0,0 +1,38 @@
"""Foresight frontmatter — daily-log markdown for user-scoped foresights.
Path: ``users/<scope_id>/.foresights/foresight-<YYYY-MM-DD>.md``.
The directory is dot-prefixed so it is hidden from end users (same
convention as ``.index``); foresights are framework-internal derived md,
not material the user is expected to read by hand.
Each entry carries a forward-looking inference about the user (intent
window, planned action, projected need) with ``start_time`` /
``end_time`` describing the covered time range. ``parent_type`` always
points back to a MemCell.
"""
from __future__ import annotations
import datetime as _dt
from typing import ClassVar, Literal
from everos.core.persistence.markdown import (
DailyLogPathMixin,
UserScopedFrontmatter,
)
class ForesightDailyFrontmatter(DailyLogPathMixin, UserScopedFrontmatter):
    """Frontmatter for ``users/<scope>/.foresights/foresight-<YYYY-MM-DD>.md``.

    File-level metadata for one day's foresight log. The covered time range
    (``start_time`` / ``end_time``) belongs to each entry, not to this file.
    """

    # Token used in entry ids of the form ``<prefix>_<date>_<seq>``.
    ENTRY_ID_PREFIX: ClassVar[str] = "fs"
    # Sub-directory under ``users/<scope_id>/``; dot-prefixed so it stays hidden.
    DIR_NAME: ClassVar[str] = ".foresights"
    # Leading token of the daily filename, joined with ``-<YYYY-MM-DD>.md``.
    FILE_PREFIX: ClassVar[str] = "foresight"

    # Both tags are pinned to the same literal for this record kind.
    type: Literal["foresight_daily"] = "foresight_daily"
    file_type: Literal["foresight_daily"] = "foresight_daily"
    # Calendar day covered by this file (required, no default).
    date: _dt.date
    # NOTE(review): counters / timestamps are presumably maintained by
    # ForesightWriter on append — confirm against the writer.
    entry_count: int = 0
    created_at: _dt.datetime | None = None
    last_appended_at: _dt.datetime | None = None

View File

@ -0,0 +1,40 @@
"""UserProfile frontmatter — single-file profile markdown for users.
Path: ``users/<user_id>/user.md``.
Carries the LLM-synthesised user profile: a free-form ``summary`` plus the
two evidence buckets emitted by :class:`everalgo.user_memory.ProfileExtractor`
(``explicit_info`` / ``implicit_traits``). ``profile_timestamp_ms``
mirrors :attr:`everalgo.types.Profile.timestamp` so the
``extract_user_profile`` strategy can compare per-user freshness against
cluster ``last_ts`` without re-parsing the body.
"""
from __future__ import annotations
from typing import Any, ClassVar, Literal
from everos.core.persistence.markdown import ProfilePathMixin, UserScopedFrontmatter
class UserProfileFrontmatter(ProfilePathMixin, UserScopedFrontmatter):
    """Frontmatter for ``users/<user_id>/user.md``.

    Single-file profile schema: every field has a default, so an empty
    profile can be instantiated without arguments beyond what the base
    classes require.
    """

    # Fixed filename; ProfileReader resolves the file as
    # ``<scope root>/<scope_id>/<PROFILE_FILENAME>``.
    PROFILE_FILENAME: ClassVar[str] = "user.md"

    type: Literal["user_profile"] = "user_profile"

    summary: str = ""
    """Free-form one-paragraph summary of the user — the retrieval anchor."""

    # Element shape is left open (``Any``) for both buckets below.
    explicit_info: list[Any] = []
    """Algo-side ``explicit_info`` bucket (verbatim facts the user stated)."""

    implicit_traits: list[Any] = []
    """Algo-side ``implicit_traits`` bucket (LLM-inferred preferences)."""

    profile_timestamp_ms: int = 0
    """Algo-emitted profile timestamp (ms epoch); equals the timestamp of
    the most recent MemCell that fed into the synthesis. Compared with
    :attr:`everos.infra.persistence.sqlite.Cluster.last_ts_ms` to decide
    whether a cluster is fresh enough to drive a profile re-extraction."""

View File

@ -0,0 +1,49 @@
"""Business markdown readers — symmetric with the writers.
Daily-log markdown is parsed via :class:`MarkdownReader` from ``core``
(the base reader returns frontmatter dict + body + entry markers, all
schema-agnostic). Reader classes here add the **business-aware
locator** layer:
* :class:`BaseDailyReader` + subclasses — bind a daily-log schema,
resolve ``(scope_id, date)`` to a file, locate entries by id,
and optionally upgrade to :class:`StructuredEntry`. Symmetric
with :class:`BaseDailyWriter`.
* :class:`AgentSkillReader` — reads ``SKILL.md`` and parses the
frontmatter into the caller-supplied ``AgentSkillFrontmatter``
subclass; also reads individual reference / script files.
* :class:`ProfileReader` — reads a fixed-name profile file
(``user.md`` / ``agent.md`` / ``soul.md`` / …) and parses its
frontmatter into the caller-supplied schema.
By design, no batch / list APIs live here: bulk enumeration for
prompt-budget or cross-record queries goes through sqlite/lancedb
(see the cascade daemon's index sync), not a markdown directory walk.
External usage::
from everos.infra.persistence.markdown.readers import (
BaseDailyReader,
EpisodeReader,
AgentSkillReader,
ProfileReader,
)
"""
from .agent_case_reader import AgentCaseReader as AgentCaseReader
from .agent_skill_reader import AgentSkillReader as AgentSkillReader
from .atomic_fact_reader import AtomicFactReader as AtomicFactReader
from .base import BaseDailyReader as BaseDailyReader
from .episode_reader import EpisodeReader as EpisodeReader
from .foresight_reader import ForesightReader as ForesightReader
from .profile_reader import ProfileReader as ProfileReader
__all__ = [
"AgentCaseReader",
"AgentSkillReader",
"AtomicFactReader",
"BaseDailyReader",
"EpisodeReader",
"ForesightReader",
"ProfileReader",
]

View File

@ -0,0 +1,31 @@
"""AgentCase daily-log reader — symmetric with :class:`AgentCaseWriter`."""
from __future__ import annotations
import datetime as _dt
from pathlib import Path
from everos.core.persistence import MemoryRoot
from ..mds import AgentCaseDailyFrontmatter
from .base import BaseDailyReader
class AgentCaseReader(BaseDailyReader):
    """Daily-log reader bound to :class:`AgentCaseDailyFrontmatter`."""

    schema = AgentCaseDailyFrontmatter

    def __init__(self, root: MemoryRoot) -> None:
        # No extra state — the base constructor validates ``schema`` and
        # stores ``root``.
        super().__init__(root)

    def path_for(
        self,
        agent_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the agent-case daily-log path for ``agent_id`` on ``date``.

        Thin override that only names the scope parameter ``agent_id``; the
        resolution itself (``<app>/<project>`` prefix, today-by-default
        date) is delegated to :meth:`BaseDailyReader.path_for`.
        """
        return super().path_for(
            agent_id,
            date,
            app_id=app_id,
            project_id=project_id,
        )

View File

@ -0,0 +1,161 @@
"""AgentSkillReader — typed read for the AgentSkill directory layout.
Pairs with :class:`AgentSkillWriter`:
- :meth:`read_main` reads ``SKILL.md`` and returns the caller's
:class:`AgentSkillFrontmatter` subclass instance + the Tier-2 body, so
the caller never deals with raw dicts.
- :meth:`read_reference` / :meth:`read_script` are plain text reads;
no frontmatter, no schema.
All three return ``None`` when the target is missing — readers do not
raise on absence, since "skill not yet created" is a normal state for
the upsert-style workflow. Callers that need to distinguish "missing"
from "empty body" check for ``None`` explicitly.
Path resolution mirrors :class:`AgentSkillWriter` and reads the same
ClassVars off :class:`AgentSkillFrontmatter`.
"""
from __future__ import annotations
from pathlib import Path
from typing import TypeVar
import anyio
from everos.core.persistence import MarkdownReader, MemoryRoot
from ..mds import AgentSkillFrontmatter
T = TypeVar("T", bound=AgentSkillFrontmatter)
class AgentSkillReader:
    """Read one skill from the ``skills/skill_<name>/`` directory layout.

    Every public read returns ``None`` when the target file is absent
    instead of raising. Paths are derived from the ClassVars on
    :class:`AgentSkillFrontmatter`.
    """

    def __init__(self, root: MemoryRoot) -> None:
        self._root = root

    # ── Public API ────────────────────────────────────────────────────────

    async def read_main(
        self,
        agent_id: str,
        skill_name: str,
        *,
        schema: type[T],
        app_id: str = "default",
        project_id: str = "default",
    ) -> tuple[T, str] | None:
        """Load ``SKILL.md`` as a typed frontmatter instance plus its body.

        Args:
            agent_id: Agent directory name under the agents root.
            skill_name: Skill name without the directory prefix.
            schema: Concrete :class:`AgentSkillFrontmatter` subclass; the
                parsed frontmatter dict is validated with
                ``schema.model_validate``.
            app_id: App scope segment.
            project_id: Project scope segment.

        Returns:
            ``(frontmatter, body)`` where ``body`` has its trailing newlines
            stripped, or ``None`` when ``SKILL.md`` is not a file on disk.
        """
        skill_md = self._main_path(agent_id, skill_name, app_id, project_id)
        present = await anyio.Path(skill_md).is_file()
        if not present:
            return None
        document = await MarkdownReader.read(skill_md)
        # Validation happens before the body is touched, so a schema error
        # surfaces first.
        return schema.model_validate(document.frontmatter), document.body.rstrip("\n")

    async def read_reference(
        self,
        agent_id: str,
        skill_name: str,
        reference_name: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> str | None:
        """Return the text of ``references/<reference_name>.md``.

        Trailing newlines are stripped; ``None`` when the file is absent.
        """
        target = self._reference_path(
            agent_id, skill_name, reference_name, app_id, project_id
        )
        return await self._read_text_or_none(target)

    async def read_script(
        self,
        agent_id: str,
        skill_name: str,
        script_filename: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> str | None:
        """Return the source text of ``scripts/<script_filename>``.

        Trailing newlines are stripped; ``None`` when the file is absent.
        The script is only read, never executed — execution policy is the
        caller's decision.
        """
        target = self._script_path(
            agent_id, skill_name, script_filename, app_id, project_id
        )
        return await self._read_text_or_none(target)

    # ── Internals — same shape as AgentSkillWriter ────────────────────────────

    @staticmethod
    async def _read_text_or_none(path: Path) -> str | None:
        """Read ``path`` as UTF-8 minus trailing newlines, ``None`` if not a file."""
        handle = anyio.Path(path)
        if await handle.is_file():
            content = await handle.read_text(encoding="utf-8")
            return content.rstrip("\n")
        return None

    def _skill_dir(
        self, agent_id: str, skill_name: str, app_id: str, project_id: str
    ) -> Path:
        """Directory that holds everything belonging to one skill."""
        agent_home = self._root.agents_dir(app_id, project_id) / agent_id
        container = agent_home / AgentSkillFrontmatter.SKILLS_CONTAINER_NAME
        return container / f"{AgentSkillFrontmatter.SKILL_DIR_PREFIX}{skill_name}"

    def _main_path(
        self, agent_id: str, skill_name: str, app_id: str, project_id: str
    ) -> Path:
        """Path of the skill's main markdown file."""
        skill_home = self._skill_dir(agent_id, skill_name, app_id, project_id)
        return skill_home / AgentSkillFrontmatter.SKILL_MAIN_FILENAME

    def _reference_path(
        self,
        agent_id: str,
        skill_name: str,
        reference_name: str,
        app_id: str,
        project_id: str,
    ) -> Path:
        """Path of one reference attachment (``.md`` suffix is appended)."""
        skill_home = self._skill_dir(agent_id, skill_name, app_id, project_id)
        references = skill_home / AgentSkillFrontmatter.SKILL_REFERENCES_DIR_NAME
        return references / f"{reference_name}.md"

    def _script_path(
        self,
        agent_id: str,
        skill_name: str,
        script_filename: str,
        app_id: str,
        project_id: str,
    ) -> Path:
        """Path of one script attachment (filename used as given)."""
        skill_home = self._skill_dir(agent_id, skill_name, app_id, project_id)
        scripts = skill_home / AgentSkillFrontmatter.SKILL_SCRIPTS_DIR_NAME
        return scripts / script_filename

View File

@ -0,0 +1,31 @@
"""AtomicFact daily-log reader — symmetric with :class:`AtomicFactWriter`."""
from __future__ import annotations
import datetime as _dt
from pathlib import Path
from everos.core.persistence import MemoryRoot
from ..mds import AtomicFactDailyFrontmatter
from .base import BaseDailyReader
class AtomicFactReader(BaseDailyReader):
    """Daily-log reader bound to :class:`AtomicFactDailyFrontmatter`."""

    schema = AtomicFactDailyFrontmatter

    def __init__(self, root: MemoryRoot) -> None:
        # No extra state — the base constructor validates ``schema`` and
        # stores ``root``.
        super().__init__(root)

    def path_for(
        self,
        owner_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the atomic-fact daily-log path for ``owner_id`` on ``date``.

        Thin override that only names the scope parameter ``owner_id``; the
        resolution itself (``<app>/<project>`` prefix, today-by-default
        date) is delegated to :meth:`BaseDailyReader.path_for`.
        """
        return super().path_for(
            owner_id,
            date,
            app_id=app_id,
            project_id=project_id,
        )

View File

@ -0,0 +1,177 @@
"""Base business reader for daily-log markdown files.
Symmetric to :class:`BaseDailyWriter`: reads the daily-log file for
a given ``(scope_id, date)``, locates entries by id within it, and
optionally upgrades them to :class:`StructuredEntry` so service-layer
callers don't have to re-do that plumbing each time.
Subclass usage::
class _MemcellReader(BaseDailyReader):
schema = UserMemcellDailyFrontmatter
reader = _MemcellReader(root)
parsed = await reader.read_for("u_jason") # today's file
entry = await reader.find_entry("u_jason", "umc_20260422_0001")
structured = await reader.find_structured("u_jason", entry.id)
The reader does **not** typed-parse the file's frontmatter dict — the
schema is used only for path resolution (matching what the appender
writes). Frontmatter validation belongs to higher-level callers that
know the business rules.
Path resolution is identical to :class:`BaseDailyWriter` (same
``SCOPE_DIR`` / ``DIR_NAME`` / ``FILE_PREFIX`` ClassVars), so a
reader and writer bound to the same schema agree on every path.
"""
from __future__ import annotations
import datetime as _dt
from pathlib import Path
from typing import ClassVar
import anyio
from everos.component.utils.datetime import today_with_timezone
from everos.core.persistence import (
BaseFrontmatter,
Entry,
EntryId,
MarkdownReader,
MemoryRoot,
ParsedMarkdown,
StructuredEntry,
find_entry,
)
class BaseDailyReader:
    """Locate and read single daily-log markdown files.

    A subclass binds a frontmatter class through the ``schema`` ClassVar.
    Only ``SCOPE_DIR``, ``DIR_NAME`` and ``FILE_PREFIX`` are required on
    that schema — entry ids always come from the caller, so no
    ``ENTRY_ID_PREFIX`` check is made here.
    """

    schema: ClassVar[type[BaseFrontmatter]]  # subclass must declare

    def __init__(self, root: MemoryRoot) -> None:
        """Validate the bound schema and remember ``root``.

        Raises:
            TypeError: if the subclass declares no ``schema``, or the schema
                lacks (or leaves empty) one of the required ClassVars.
        """
        bound = getattr(type(self), "schema", None)
        if bound is None:
            raise TypeError(
                f"{type(self).__name__} must declare a class-level ``schema`` attribute"
            )
        required = ("SCOPE_DIR", "DIR_NAME", "FILE_PREFIX")
        # Report the first missing / falsy ClassVar, in declaration order.
        absent = next((name for name in required if not getattr(bound, name, None)), None)
        if absent is not None:
            raise TypeError(f"{bound.__name__} missing ClassVar {absent!r}")
        self._root = root

    # ── Public API ────────────────────────────────────────────────────────

    async def read_for(
        self,
        scope_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> ParsedMarkdown | None:
        """Parse the daily-log file of ``scope_id`` for ``date``.

        Args:
            scope_id: ``user_id`` or ``agent_id``.
            date: Day bucket; when omitted, ``today_with_timezone()`` is used.
            app_id: App scope segment (``"default"`` unless given).
            project_id: Project scope segment (``"default"`` unless given).

        Returns:
            The :class:`ParsedMarkdown` produced by ``MarkdownReader.read``,
            or ``None`` when the resolved path is not a file — a missing
            file is treated as a normal state rather than an error.
        """
        day = date or today_with_timezone()
        target = self._resolve_path(scope_id, day, app_id, project_id)
        if await anyio.Path(target).is_file():
            return await MarkdownReader.read(target)
        return None

    async def find_entry(
        self,
        scope_id: str,
        entry_id: str | EntryId,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Entry | None:
        """Return the entry identified by ``entry_id``, if present.

        The day bucket is read off the entry id itself (``eid.date``), so
        no date argument is needed. A string id is parsed with
        ``EntryId.parse`` first. ``None`` means the file is missing;
        otherwise the result of the module-level ``find_entry`` lookup is
        returned as-is.
        """
        if isinstance(entry_id, EntryId):
            eid = entry_id
        else:
            eid = EntryId.parse(entry_id)
        wanted = eid.format()
        document = await self.read_for(
            scope_id, eid.date, app_id=app_id, project_id=project_id
        )
        # ``find_entry`` below is the imported module-level helper, not this method.
        return None if document is None else find_entry(document.body, wanted)

    async def find_structured(
        self,
        scope_id: str,
        entry_id: str | EntryId,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> StructuredEntry | None:
        """Find the entry and return ``entry.as_structured()``.

        Convenience wrapper around :meth:`find_entry`; yields ``None`` when
        that lookup yields ``None``.
        """
        located = await self.find_entry(
            scope_id, entry_id, app_id=app_id, project_id=project_id
        )
        return located.as_structured() if located is not None else None

    def path_for(
        self,
        scope_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Compute the daily-log path for ``scope_id`` (today when ``date`` is omitted).

        Public face of :meth:`_resolve_path`. Purely computes the path —
        nothing is checked on disk.
        """
        day = date or today_with_timezone()
        return self._resolve_path(scope_id, day, app_id, project_id)

    # ── Internals ─────────────────────────────────────────────────────────

    def _resolve_path(
        self, scope_id: str, date: _dt.date, app_id: str, project_id: str
    ) -> Path:
        """Assemble ``<scope root>/<scope_id>/<DIR_NAME>/<FILE_PREFIX>-<date>.md``."""
        # The schema's SCOPE_DIR selects the MemoryRoot method
        # (``<SCOPE_DIR>_dir``) that yields the <app>/<project>-prefixed root.
        root_for_scope = getattr(self._root, f"{self.schema.SCOPE_DIR}_dir")
        base = root_for_scope(app_id, project_id)
        filename = f"{self.schema.FILE_PREFIX}-{date.isoformat()}.md"
        return base / scope_id / self.schema.DIR_NAME / filename

View File

@ -0,0 +1,41 @@
"""Episode daily-log reader — symmetric with :class:`EpisodeWriter`.
md is the source of truth for Episode memories; this reader gives
cascade / search / verification scripts a typed locator instead of
raw :class:`MarkdownReader` calls.
"""
from __future__ import annotations
import datetime as _dt
from pathlib import Path
from everos.core.persistence import MemoryRoot
from ..mds import EpisodeDailyFrontmatter
from .base import BaseDailyReader
class EpisodeReader(BaseDailyReader):
    """Daily-log reader bound to :class:`EpisodeDailyFrontmatter`."""

    schema = EpisodeDailyFrontmatter

    def __init__(self, root: MemoryRoot) -> None:
        # No extra state — the base constructor validates ``schema`` and
        # stores ``root``.
        super().__init__(root)

    def path_for(
        self,
        owner_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the episode daily-log path for ``owner_id`` on ``date``.

        Lets callers locate the file for a given owner / day (under the
        ``<app>/<project>`` prefix) without a writer instance. ``date``
        falls back to today; the resolution itself is delegated to
        :meth:`BaseDailyReader.path_for`.
        """
        return super().path_for(
            owner_id,
            date,
            app_id=app_id,
            project_id=project_id,
        )

View File

@ -0,0 +1,31 @@
"""Foresight daily-log reader — symmetric with :class:`ForesightWriter`."""
from __future__ import annotations
import datetime as _dt
from pathlib import Path
from everos.core.persistence import MemoryRoot
from ..mds import ForesightDailyFrontmatter
from .base import BaseDailyReader
class ForesightReader(BaseDailyReader):
    """Daily-log reader bound to :class:`ForesightDailyFrontmatter`."""

    schema = ForesightDailyFrontmatter

    def __init__(self, root: MemoryRoot) -> None:
        # No extra state — the base constructor validates ``schema`` and
        # stores ``root``.
        super().__init__(root)

    def path_for(
        self,
        owner_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the foresight daily-log path for ``owner_id`` on ``date``.

        Thin override that only names the scope parameter ``owner_id``; the
        resolution itself (``<app>/<project>`` prefix, today-by-default
        date) is delegated to :meth:`BaseDailyReader.path_for`.
        """
        return super().path_for(
            owner_id,
            date,
            app_id=app_id,
            project_id=project_id,
        )

View File

@ -0,0 +1,96 @@
"""ProfileReader — typed read for the single-file profile layout.
Pairs with :class:`ProfileWriter`. The schema (concrete profile
frontmatter class) is supplied per call; the reader pulls
``SCOPE_DIR`` + ``PROFILE_FILENAME`` ClassVars off it to build the
path, then ``MarkdownReader.read`` + ``schema.model_validate`` give
back a typed frontmatter instance plus the body string.
Returns ``None`` when the profile file does not exist — "not yet
written" is a normal early state for the upsert-style workflow.
"""
from __future__ import annotations
from pathlib import Path
from typing import TypeVar
import anyio
from everos.core.persistence import BaseFrontmatter, MarkdownReader, MemoryRoot
T = TypeVar("T", bound=BaseFrontmatter)
class ProfileReader:
    """Read fixed-name profile markdown files into typed frontmatter.

    The schema is passed per call; its ``SCOPE_DIR`` and
    ``PROFILE_FILENAME`` ClassVars determine where the file lives.
    """

    def __init__(self, root: MemoryRoot) -> None:
        self._root = root

    # ── Public API ────────────────────────────────────────────────────────

    async def read(
        self,
        scope_id: str,
        *,
        schema: type[T],
        app_id: str = "default",
        project_id: str = "default",
    ) -> tuple[T, str] | None:
        """Load the profile file as ``(frontmatter, body)``.

        Args:
            scope_id: ``user_id`` or ``agent_id`` — should match the scope
                the schema belongs to.
            schema: Concrete profile frontmatter class declaring
                ``SCOPE_DIR`` and ``PROFILE_FILENAME``; the parsed
                frontmatter dict is validated with ``schema.model_validate``.
            app_id: App scope segment (``"default"`` unless given).
            project_id: Project scope segment (``"default"`` unless given).

        Returns:
            The validated frontmatter and the body with trailing newlines
            stripped, or ``None`` when the resolved path is not a file.

        Raises:
            TypeError: if ``schema`` lacks a required ClassVar.
        """
        target = self._resolve_path(scope_id, schema, app_id, project_id)
        present = await anyio.Path(target).is_file()
        if not present:
            return None
        document = await MarkdownReader.read(target)
        # Validation happens before the body is touched, so a schema error
        # surfaces first.
        return schema.model_validate(document.frontmatter), document.body.rstrip("\n")

    def path_for(
        self,
        scope_id: str,
        *,
        schema: type[BaseFrontmatter],
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Compute the profile path without touching the filesystem."""
        return self._resolve_path(scope_id, schema, app_id, project_id)

    # ── Internals — same shape as ProfileWriter ───────────────────────────

    def _resolve_path(
        self,
        scope_id: str,
        schema: type[BaseFrontmatter],
        app_id: str,
        project_id: str,
    ) -> Path:
        """Assemble ``<scope root>/<scope_id>/<PROFILE_FILENAME>``.

        Raises:
            TypeError: when ``SCOPE_DIR`` or ``PROFILE_FILENAME`` is missing
                or empty on ``schema`` (``SCOPE_DIR`` is checked first).
        """
        scope_dir = getattr(schema, "SCOPE_DIR", "")
        filename = getattr(schema, "PROFILE_FILENAME", None)
        if not scope_dir:
            raise TypeError(
                f"{schema.__name__} missing ``SCOPE_DIR`` ClassVar — "
                "must inherit a scope mixin (UserScopedFrontmatter / "
                "AgentScopedFrontmatter)."
            )
        if not filename:
            raise TypeError(f"{schema.__name__} missing ``PROFILE_FILENAME`` ClassVar.")
        # SCOPE_DIR selects the MemoryRoot method (``<SCOPE_DIR>_dir``) that
        # yields the <app>/<project>-prefixed root.
        root_for_scope = getattr(self._root, f"{scope_dir}_dir")
        return root_for_scope(app_id, project_id) / scope_id / filename

View File

@ -0,0 +1,43 @@
"""Business markdown writers.
Each storage strategy from the EverOS Markdown First spec gets a class
here:
* :class:`BaseDailyWriter` — daily-log append (episode / atomic
fact / foresight / agent case). Subclass and bind ``schema``.
* :class:`AgentSkillWriter` — directory + progressive disclosure
(``skills/skill_<name>/{SKILL.md, references/, scripts/}``).
Single class, no subclassing.
* :class:`ProfileWriter` — single-file rewrite at a fixed name
(``user.md`` / ``behaviors.md`` / ``agent.md`` / ``soul.md`` /
``tools.md``). Single class, no subclassing — caller hands in a
frontmatter instance whose ``PROFILE_FILENAME`` ClassVar pins
the filename.
External usage::
from everos.infra.persistence.markdown.writers import (
BaseDailyWriter,
EpisodeWriter,
AgentSkillWriter,
ProfileWriter,
)
"""
from .agent_case_writer import AgentCaseWriter as AgentCaseWriter
from .agent_skill_writer import AgentSkillWriter as AgentSkillWriter
from .atomic_fact_writer import AtomicFactWriter as AtomicFactWriter
from .base import BaseDailyWriter as BaseDailyWriter
from .episode_writer import EpisodeWriter as EpisodeWriter
from .foresight_writer import ForesightWriter as ForesightWriter
from .profile_writer import ProfileWriter as ProfileWriter
__all__ = [
"AgentCaseWriter",
"AgentSkillWriter",
"AtomicFactWriter",
"BaseDailyWriter",
"EpisodeWriter",
"ForesightWriter",
"ProfileWriter",
]

View File

@ -0,0 +1,63 @@
"""AgentCase daily-log writer — md is the SoT for agent cases.
Lives on the agent track (``agents/<agent_id>/.cases/...``).
Inline carries audit + scoring fields (``owner_id`` / ``session_id`` /
``timestamp`` / ``parent_id`` / ``quality_score``); sections carry
``TaskIntent`` (required, primary BM25/embed), ``Approach`` (verbatim,
not indexed — too long), and optional ``KeyInsight`` (verbatim).
"""
from __future__ import annotations
import datetime as _dt
from collections.abc import Mapping
from pathlib import Path
from typing import Any
import anyio
from everos.component.utils.datetime import (
get_now_with_timezone,
to_iso_format,
)
from everos.core.persistence import MarkdownReader
from ..mds import AgentCaseDailyFrontmatter
from .base import BaseDailyWriter
class AgentCaseWriter(BaseDailyWriter):
    """Daily-log writer bound to the AgentCase schema (md = SoT).

    The append methods are inherited from :class:`BaseDailyWriter`; this
    class only supplies the frontmatter values and the entry counter. The
    generic ``scope_id`` handed in by the base class is the agent id on
    this track.
    """

    schema = AgentCaseDailyFrontmatter

    def _frontmatter_updates(
        self,
        scope_id: str,
        date: _dt.date,
        *,
        next_count: int,
    ) -> Mapping[str, Any] | None:
        """Build the frontmatter values to store for this append.

        ``date`` is serialised as an ISO string and ``last_appended_at`` is
        stamped with the current timezone-aware time in ISO format.
        """
        day = date.isoformat()
        stamped_at = to_iso_format(get_now_with_timezone())
        return {
            "id": f"agent_case_log_{scope_id}_{day}",
            "type": "agent_case_daily",
            "file_type": "agent_case_daily",
            "schema_version": 1,
            "agent_id": scope_id,
            "track": "agent",
            "date": day,
            "entry_count": next_count,
            "last_appended_at": stamped_at,
        }

    async def _current_count(self, path: Path) -> int:
        """Return the ``entry_count`` stored in ``path``'s frontmatter.

        Yields ``0`` when the file does not exist or carries no
        ``entry_count`` key.
        """
        if await anyio.Path(path).is_file():
            document = await MarkdownReader.read(path)
            return document.frontmatter.get("entry_count", 0)
        return 0

View File

@ -0,0 +1,204 @@
"""AgentSkillWriter — upsert skill main file + reference / script attachments.
Skill storage is **directory + progressive disclosure** (wiki "Memory
Types Markdown Format" v4): each skill lives under
``agents/<agent_id>/skills/skill_<name>/`` with a ``SKILL.md`` main
file plus ``references/*.md`` and ``scripts/*.<ext>`` siblings.
This writer is intentionally distinct from :class:`BaseDailyWriter`:
- **Upsert, not append.** Each ``write_*`` call overwrites the target
file in full. Skills don't accumulate entry markers — the body of
``SKILL.md`` is the latest revision; references / scripts are
individually replaceable files.
- **Single-skill API.** The chassis is *not* responsible for bulk
enumeration (Tier-1 prompt scanning is a sqlite/lancedb concern,
not a markdown-walk concern). One skill in, one skill out.
- **No counters / hooks.** No frontmatter merging, no entry-id
generation, no _frontmatter_updates hook — the caller hands in a
fully-built :class:`AgentSkillFrontmatter` subclass instance and the body
string; the writer atomically replaces the file.
Path resolution comes from :class:`MemoryRoot` + the ClassVars on
:class:`AgentSkillFrontmatter` (``SKILLS_CONTAINER_NAME`` /
``SKILL_DIR_PREFIX`` / etc.). The writer + reader pair is the single
addressing API for skills.
"""
from __future__ import annotations
from pathlib import Path
from everos.core.persistence import MarkdownWriter, MemoryRoot
from ..mds import AgentSkillFrontmatter
class AgentSkillWriter:
    """Whole-file upsert writer for one skill's directory tree.

    ``SKILL.md`` (frontmatter + body) is written through
    ``MarkdownWriter.write_markdown``; reference and script files carry
    no frontmatter and are written through ``MarkdownWriter.write``.
    """

    def __init__(
        self,
        root: MemoryRoot,
        *,
        writer: MarkdownWriter | None = None,
    ) -> None:
        """Bind the memory root and an optional pre-built writer.

        Args:
            root: Memory root used for path resolution.
            writer: Writer to reuse; when falsy, a new
                ``MarkdownWriter(root)`` is created.
        """
        self._root = root
        self._writer = writer if writer else MarkdownWriter(root)

    # ── Public API ────────────────────────────────────────────────────────

    async def write_main(
        self,
        agent_id: str,
        skill_name: str,
        *,
        frontmatter: AgentSkillFrontmatter,
        body: str,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Upsert the skill's main file (``SKILL.md``).

        The call passes a complete frontmatter dump and the complete
        body to the writer — nothing is merged with what is on disk.

        Args:
            agent_id: Owning agent.
            skill_name: Skill identifier without the directory prefix;
                the prefix is added during path resolution.
            frontmatter: Schema instance; its
                ``model_dump(exclude_none=False)`` becomes the YAML head.
            body: Body text, passed through ``_ensure_trailing_newline``.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            Whatever ``MarkdownWriter.write_markdown`` returns for the
            resolved path.
        """
        target = self._main_path(agent_id, skill_name, app_id, project_id)
        yaml_head = frontmatter.model_dump(exclude_none=False)
        normalised_body = _ensure_trailing_newline(body)
        return await self._writer.write_markdown(
            target,
            frontmatter=yaml_head,
            body=normalised_body,
        )

    async def write_reference(
        self,
        agent_id: str,
        skill_name: str,
        reference_name: str,
        content: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Upsert one reference file under the skill's references directory.

        Args:
            agent_id: Owning agent.
            skill_name: Skill identifier without the directory prefix.
            reference_name: Filename stem; ``.md`` is appended here.
            content: File text, passed through
                ``_ensure_trailing_newline``; no frontmatter is added.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            Whatever ``MarkdownWriter.write`` returns for the resolved path.
        """
        target = self._reference_path(
            agent_id, skill_name, reference_name, app_id, project_id
        )
        payload = _ensure_trailing_newline(content)
        return await self._writer.write(target, payload)

    async def write_script(
        self,
        agent_id: str,
        skill_name: str,
        script_filename: str,
        content: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Upsert one script file under the skill's scripts directory.

        Args:
            agent_id: Owning agent.
            skill_name: Skill identifier without the directory prefix.
            script_filename: Full filename including its extension; no
                suffix is added here.
            content: File text, passed through
                ``_ensure_trailing_newline``.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            Whatever ``MarkdownWriter.write`` returns for the resolved path.
        """
        target = self._script_path(
            agent_id, skill_name, script_filename, app_id, project_id
        )
        payload = _ensure_trailing_newline(content)
        return await self._writer.write(target, payload)

    # ── Path API (callers that need to echo paths in responses) ──────────

    def main_path(
        self,
        agent_id: str,
        skill_name: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the main-file path for a skill without touching the disk."""
        return self._main_path(agent_id, skill_name, app_id, project_id)

    # ── Internals — path resolution from AgentSkillFrontmatter ClassVars ──

    def _skill_dir(
        self, agent_id: str, skill_name: str, app_id: str, project_id: str
    ) -> Path:
        """Resolve ``<agents dir>/<agent_id>/<container>/<prefix><skill_name>``."""
        agent_home = self._root.agents_dir(app_id, project_id) / agent_id
        container = agent_home / AgentSkillFrontmatter.SKILLS_CONTAINER_NAME
        return container / f"{AgentSkillFrontmatter.SKILL_DIR_PREFIX}{skill_name}"

    def _main_path(
        self, agent_id: str, skill_name: str, app_id: str, project_id: str
    ) -> Path:
        """Resolve the main file inside the skill directory."""
        skill_home = self._skill_dir(agent_id, skill_name, app_id, project_id)
        return skill_home / AgentSkillFrontmatter.SKILL_MAIN_FILENAME

    def _reference_path(
        self,
        agent_id: str,
        skill_name: str,
        reference_name: str,
        app_id: str,
        project_id: str,
    ) -> Path:
        """Resolve ``<skill dir>/<references dir>/<reference_name>.md``."""
        skill_home = self._skill_dir(agent_id, skill_name, app_id, project_id)
        references = skill_home / AgentSkillFrontmatter.SKILL_REFERENCES_DIR_NAME
        return references / f"{reference_name}.md"

    def _script_path(
        self,
        agent_id: str,
        skill_name: str,
        script_filename: str,
        app_id: str,
        project_id: str,
    ) -> Path:
        """Resolve ``<skill dir>/<scripts dir>/<script_filename>``."""
        skill_home = self._skill_dir(agent_id, skill_name, app_id, project_id)
        scripts = skill_home / AgentSkillFrontmatter.SKILL_SCRIPTS_DIR_NAME
        return scripts / script_filename
def _ensure_trailing_newline(text: str) -> str:
"""End the body with exactly one newline (POSIX text-file convention)."""
if not text:
return ""
return text if text.endswith("\n") else text + "\n"

View File

@ -0,0 +1,58 @@
"""AtomicFact daily-log writer — md is the SoT for atomic facts.
Caller hands pre-built ``inline`` (``owner_id`` / ``session_id`` /
``timestamp`` / ``parent_id`` / ``sender_ids``) plus the single
``Fact`` section. The chassis manages the in-file ``entry_id`` sequence
(``af_<YYYYMMDD>_<NNNN>``). ``append_entry`` / ``append_entries`` come
from :class:`BaseDailyWriter`; this subclass only declares the schema
and the per-schema frontmatter / counter hooks.
"""
from __future__ import annotations
import datetime as _dt
from collections.abc import Mapping
from pathlib import Path
from typing import Any
import anyio
from everos.component.utils.datetime import (
get_now_with_timezone,
to_iso_format,
)
from everos.core.persistence import MarkdownReader
from ..mds import AtomicFactDailyFrontmatter
from .base import BaseDailyWriter
class AtomicFactWriter(BaseDailyWriter):
    """Daily-log writer bound to :class:`AtomicFactDailyFrontmatter`."""

    schema = AtomicFactDailyFrontmatter

    def _frontmatter_updates(
        self,
        scope_id: str,
        date: _dt.date,
        *,
        next_count: int,
    ) -> Mapping[str, Any] | None:
        """Build the frontmatter fields written with an atomic-fact append.

        Args:
            scope_id: User id; stored as ``user_id`` and embedded in ``id``.
            date: Date bucket of the daily-log file.
            next_count: Value stored as ``entry_count``.

        Returns:
            Mapping of frontmatter keys to their new values.
        """
        day = date.isoformat()
        kind = "atomic_fact_daily"
        stamped_at = to_iso_format(get_now_with_timezone())
        return {
            "id": f"atomic_fact_log_{scope_id}_{day}",
            "type": kind,
            "file_type": kind,
            "schema_version": 1,
            "user_id": scope_id,
            "track": "user",
            "date": day,
            "entry_count": next_count,
            "last_appended_at": stamped_at,
        }

    async def _current_count(self, path: Path) -> int:
        """Return the ``entry_count`` stored in the file's frontmatter.

        A path that is not an existing file yields ``0``; so does a
        frontmatter that has no ``entry_count`` key.
        """
        target = anyio.Path(path)
        if await target.is_file():
            document = await MarkdownReader.read(path)
            return document.frontmatter.get("entry_count", 0)
        return 0

View File

@ -0,0 +1,301 @@
"""Base business writer for daily-log markdown files.
Daily-log files (memcell / episode / case / atomic_fact / foresight)
share three things:
* scope (user-track or agent-track, derived from the schema)
* filename pattern: ``<FILE_PREFIX>-<YYYY-MM-DD>.md`` under
``<scope_root>/<scope_id>/<DIR_NAME>/``
* entry id pattern: ``<ENTRY_ID_PREFIX>_<YYYYMMDD>_<NNNN>``
:class:`BaseDailyWriter` factors out **path resolution + entry-id
construction + today's date default**, leaving frontmatter field
maintenance (e.g. ``entry_count`` / ``last_appended_at``) to concrete
business subclasses.
Subclass usage::
class _MemcellWriter(BaseDailyWriter):
schema = UserMemcellDailyFrontmatter
writer = _MemcellWriter(root)
eid = await writer.append("u_jason", "...")
"""
from __future__ import annotations
import datetime as _dt
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar
import anyio
from everos.component.utils.datetime import today_with_timezone
from everos.core.persistence import (
BaseFrontmatter,
EntryId,
MarkdownReader,
MarkdownWriter,
MemoryRoot,
render_structured_entry,
)
class BaseDailyWriter:
    """Shared append logic for per-date daily-log markdown files.

    A concrete writer binds exactly one frontmatter schema through the
    ``schema`` ClassVar. ``__init__`` rejects a schema that lacks any of
    ``SCOPE_DIR``, ``ENTRY_ID_PREFIX``, ``DIR_NAME`` or ``FILE_PREFIX``.

    The file path is derived from those ClassVars plus the
    :class:`MemoryRoot`:
    ``<scope root>/<scope_id>/<DIR_NAME>/<FILE_PREFIX>-<YYYY-MM-DD>.md``.
    """

    schema: ClassVar[type[BaseFrontmatter]]  # concrete subclasses assign this

    def __init__(
        self,
        root: MemoryRoot,
        *,
        writer: MarkdownWriter | None = None,
    ) -> None:
        """Validate the bound schema and store the collaborators.

        Args:
            root: Memory root used for path resolution.
            writer: Writer to reuse; when falsy, a new
                ``MarkdownWriter(root)`` is created.

        Raises:
            TypeError: If the class has no ``schema`` attribute, or the
                schema is missing (or has a falsy value for) one of the
                required ClassVars.
        """
        owner = type(self)
        bound_schema = getattr(owner, "schema", None)
        if bound_schema is None:
            raise TypeError(
                f"{owner.__name__} must declare a class-level ``schema`` attribute"
            )
        required = ("SCOPE_DIR", "ENTRY_ID_PREFIX", "DIR_NAME", "FILE_PREFIX")
        absent = next(
            (name for name in required if not getattr(bound_schema, name, None)),
            None,
        )
        if absent is not None:
            raise TypeError(f"{bound_schema.__name__} missing ClassVar {absent!r}")
        self._root = root
        self._writer = writer if writer else MarkdownWriter(root)

    # ── Public API ────────────────────────────────────────────────────────

    async def append_entry(
        self,
        scope_id: str,
        *,
        inline: Mapping[str, object],
        sections: Mapping[str, str],
        date: _dt.date | None = None,
        app_id: str = "default",
        project_id: str = "default",
    ) -> EntryId:
        """Append one structured entry and return its ``EntryId``.

        Thin wrapper: delegates to :meth:`append_entries` with a
        one-element batch and returns the first id of the result.

        Args:
            scope_id: ``user_id`` or ``agent_id``, matching the schema's
                scope.
            inline: Inline metadata handed to ``render_structured_entry``.
            sections: ``{title: body}`` blocks handed to
                ``render_structured_entry``.
            date: Date bucket; ``today_with_timezone()`` when omitted.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            The :class:`EntryId` allocated for the new entry.
        """
        minted = await self.append_entries(
            scope_id,
            [(inline, sections)],
            date=date,
            app_id=app_id,
            project_id=project_id,
        )
        return minted[0]

    async def append_entries(
        self,
        scope_id: str,
        items: Sequence[tuple[Mapping[str, object], Mapping[str, str]]],
        *,
        date: _dt.date | None = None,
        app_id: str = "default",
        project_id: str = "default",
    ) -> list[EntryId]:
        """Append a batch of structured entries under one lock acquisition.

        Inside ``self._writer.lock_for(path)`` the method reads the
        current count once, allocates consecutive ids starting at that
        count, renders each entry, asks :meth:`_frontmatter_updates` for
        the frontmatter (with ``next_count`` = old count + batch size)
        and hands everything to the writer in a single call. An empty
        ``items`` returns ``[]`` before the lock is taken.

        Args:
            scope_id: Subject scope (user / agent id).
            items: ``(inline, sections)`` pairs; ids are allocated in
                this order.
            date: Date bucket; ``today_with_timezone()`` when omitted.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            One :class:`EntryId` per item, in the order of ``items``.
        """
        bucket = date if date else today_with_timezone()
        path = self._resolve_path(scope_id, bucket, app_id, project_id)
        if not items:
            return []
        async with self._writer.lock_for(path):
            start = await self._current_count(path)
            eids = [
                EntryId.next_for(self.schema.ENTRY_ID_PREFIX, bucket, start + offset)
                for offset, _ in enumerate(items)
            ]
            rendered = []
            for eid, (inline, sections) in zip(eids, items, strict=True):
                block = render_structured_entry(
                    header=eid.format(),
                    inline=inline,
                    sections=sections,
                )
                rendered.append((block, eid))
            head_changes = self._frontmatter_updates(
                scope_id, bucket, next_count=start + len(items)
            )
            await self._writer._append_entries_unlocked(  # noqa: SLF001
                path,
                rendered,
                frontmatter_updates=head_changes,
            )
        return eids

    async def append(
        self,
        scope_id: str,
        entry_body: str,
        *,
        date: _dt.date | None = None,
        frontmatter_updates: Mapping[str, Any] | None = None,
        app_id: str = "default",
        project_id: str = "default",
    ) -> EntryId:
        """Append an already-rendered entry body to the daily-log file.

        Counting, id allocation and the write all happen inside
        ``self._writer.lock_for(path)``.

        Args:
            scope_id: ``user_id`` or ``agent_id``, matching the schema's
                scope.
            entry_body: Pre-rendered entry content, passed to the writer
                unchanged.
            date: Date bucket; ``today_with_timezone()`` when omitted.
            frontmatter_updates: Frontmatter fields to hand to the
                writer. When ``None``, :meth:`_frontmatter_updates` is
                called with ``next_count`` = current count + 1; when
                given, the hook is not consulted.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            The :class:`EntryId` allocated for the new entry.
        """
        bucket = date if date else today_with_timezone()
        path = self._resolve_path(scope_id, bucket, app_id, project_id)
        async with self._writer.lock_for(path):
            existing = await self._current_count(path)
            eid = EntryId.next_for(self.schema.ENTRY_ID_PREFIX, bucket, existing)
            head_changes = frontmatter_updates
            if head_changes is None:
                # No explicit updates supplied: fall back to the subclass hook.
                head_changes = self._frontmatter_updates(
                    scope_id, bucket, next_count=existing + 1
                )
            await self._writer._append_entries_unlocked(  # noqa: SLF001
                path,
                [(entry_body, eid)],
                frontmatter_updates=head_changes,
            )
        return eid

    # ── Hooks (subclass override) ─────────────────────────────────────────

    async def _current_count(self, path: Path) -> int:
        """Return how many entries the file currently holds.

        This default returns ``0`` when ``path`` is not an existing file
        and otherwise the length of ``entries`` on the parsed document.
        Subclasses may override it to use a different source for the
        count.
        """
        if await anyio.Path(path).is_file():
            document = await MarkdownReader.read(path)
            return len(document.entries)
        return 0

    def _frontmatter_updates(
        self,
        scope_id: str,
        date: _dt.date,
        *,
        next_count: int,
    ) -> Mapping[str, Any] | None:
        """Build the frontmatter fields for an append (subclass hook).

        :meth:`append_entries` always calls this hook; :meth:`append`
        calls it only when its ``frontmatter_updates`` argument is
        ``None``. This default returns ``None``.

        Args:
            scope_id: Subject scope (user / agent id).
            date: Date bucket of the file being appended to.
            next_count: Entry count after the pending append.
        """
        return None

    # ── Path API ──────────────────────────────────────────────────────────

    def path_for(
        self,
        scope_id: str,
        date: _dt.date | None = None,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the daily-log path for ``scope_id`` on ``date``.

        ``date`` falls back to ``today_with_timezone()``. Public
        counterpart of :meth:`_resolve_path`.
        """
        bucket = date if date else today_with_timezone()
        return self._resolve_path(scope_id, bucket, app_id, project_id)

    # ── Internals ─────────────────────────────────────────────────────────

    def _resolve_path(
        self, scope_id: str, date: _dt.date, app_id: str, project_id: str
    ) -> Path:
        """Compose the daily-log path from the schema ClassVars."""
        # The MemoryRoot method is looked up by name: "<SCOPE_DIR>_dir".
        scope_root_for = getattr(self._root, f"{self.schema.SCOPE_DIR}_dir")
        log_dir = scope_root_for(app_id, project_id) / scope_id / self.schema.DIR_NAME
        return log_dir / f"{self.schema.FILE_PREFIX}-{date.isoformat()}.md"

View File

@ -0,0 +1,69 @@
"""Episode daily-log writer — md is the SoT for Episode memories.
Stays in the chassis style: caller hands in pre-built ``inline`` and
``sections`` dicts plus the scope id (``owner_id``). Domain →
structured-entry shaping lives in the calling pipeline (cf. architecture
rule: ``infra`` may not import ``memory``).
This milestone assumes well-behaved callers (no retransmit dedupe needed).
The writer just appends; the chassis manages the in-file ``entry_id``
sequence, which is the single source of identity for an md entry.
"""
from __future__ import annotations
import datetime as _dt
from collections.abc import Mapping
from pathlib import Path
from typing import Any
import anyio
from everos.component.utils.datetime import (
get_now_with_timezone,
to_iso_format,
)
from everos.core.persistence import MarkdownReader
from ..mds import EpisodeDailyFrontmatter
from .base import BaseDailyWriter
class EpisodeWriter(BaseDailyWriter):
    """Daily-log writer bound to :class:`EpisodeDailyFrontmatter`.

    Only the schema binding and the two hooks below are defined here;
    the append methods are inherited from :class:`BaseDailyWriter`.
    """

    schema = EpisodeDailyFrontmatter

    # ── Hooks: frontmatter fields + entry counter ────────────────────────

    def _frontmatter_updates(
        self,
        scope_id: str,
        date: _dt.date,
        *,
        next_count: int,
    ) -> Mapping[str, Any] | None:
        """Build the frontmatter fields written with an episode append.

        Args:
            scope_id: User id; stored as ``user_id`` and embedded in ``id``.
            date: Date bucket of the daily-log file.
            next_count: Value stored as ``entry_count``.

        Returns:
            Mapping of frontmatter keys to their new values.
        """
        day = date.isoformat()
        kind = "episode_daily"
        stamped_at = to_iso_format(get_now_with_timezone())
        return {
            "id": f"episode_log_{scope_id}_{day}",
            "type": kind,
            "file_type": kind,
            "schema_version": 1,
            "user_id": scope_id,
            "track": "user",
            "date": day,
            "entry_count": next_count,
            "last_appended_at": stamped_at,
        }

    async def _current_count(self, path: Path) -> int:
        """Return the ``entry_count`` stored in the file's frontmatter.

        A path that is not an existing file yields ``0``; so does a
        frontmatter that has no ``entry_count`` key.
        """
        target = anyio.Path(path)
        if await target.is_file():
            document = await MarkdownReader.read(path)
            return document.frontmatter.get("entry_count", 0)
        return 0

View File

@ -0,0 +1,58 @@
"""Foresight daily-log writer — md is the SoT for foresights.
Inline carries the audit / scope + time-window fields (``owner_id`` /
``session_id`` / ``timestamp`` / ``parent_id`` / ``sender_ids`` plus
optional ``start_time`` / ``end_time`` / ``duration_days``). Sections
carry the BM25-indexed content: ``Foresight`` (required, primary
field) and optional ``Evidence`` (secondary BM25 field).
``append_entry`` / ``append_entries`` come from :class:`BaseDailyWriter`.
"""
from __future__ import annotations
import datetime as _dt
from collections.abc import Mapping
from pathlib import Path
from typing import Any
import anyio
from everos.component.utils.datetime import (
get_now_with_timezone,
to_iso_format,
)
from everos.core.persistence import MarkdownReader
from ..mds import ForesightDailyFrontmatter
from .base import BaseDailyWriter
class ForesightWriter(BaseDailyWriter):
    """Daily-log writer bound to :class:`ForesightDailyFrontmatter`."""

    schema = ForesightDailyFrontmatter

    def _frontmatter_updates(
        self,
        scope_id: str,
        date: _dt.date,
        *,
        next_count: int,
    ) -> Mapping[str, Any] | None:
        """Build the frontmatter fields written with a foresight append.

        Args:
            scope_id: User id; stored as ``user_id`` and embedded in ``id``.
            date: Date bucket of the daily-log file.
            next_count: Value stored as ``entry_count``.

        Returns:
            Mapping of frontmatter keys to their new values.
        """
        day = date.isoformat()
        kind = "foresight_daily"
        stamped_at = to_iso_format(get_now_with_timezone())
        return {
            "id": f"foresight_log_{scope_id}_{day}",
            "type": kind,
            "file_type": kind,
            "schema_version": 1,
            "user_id": scope_id,
            "track": "user",
            "date": day,
            "entry_count": next_count,
            "last_appended_at": stamped_at,
        }

    async def _current_count(self, path: Path) -> int:
        """Return the ``entry_count`` stored in the file's frontmatter.

        A path that is not an existing file yields ``0``; so does a
        frontmatter that has no ``entry_count`` key.
        """
        target = anyio.Path(path)
        if await target.is_file():
            document = await MarkdownReader.read(path)
            return document.frontmatter.get("entry_count", 0)
        return 0

View File

@ -0,0 +1,127 @@
"""ProfileWriter — upsert a single-file, fixed-name profile markdown.
Profile storage is **single-file rewrite** (the third storage strategy
in the EverOS Markdown First spec). Each profile lives at a fixed
filename under the agent or user directory::
users/<user_id>/user.md ← user profile
users/<user_id>/behaviors.md ← user behaviour patterns
agents/<agent_id>/agent.md ← agent playbook
agents/<agent_id>/soul.md ← agent identity / values
agents/<agent_id>/tools.md ← agent tool declarations
Compared with :class:`AgentSkillWriter` (directory + progressive disclosure)
and :class:`BaseDailyWriter` (per-date append + entry markers), the
profile writer is the simplest of the three:
- **Upsert, not append.** Each ``write`` overwrites the file in full.
- **Fixed path.** Caller passes ``scope_id`` only — no ``name``
parameter; the filename is fixed by the schema's
``PROFILE_FILENAME`` ClassVar.
- **No business hooks.** No frontmatter merging, no entry-id
generation. The caller hands in a fully-built schema instance.
The schema must declare two ClassVars:
- ``SCOPE_DIR`` (``"users"`` / ``"agents"``) — inherited from
:class:`UserScopedFrontmatter` / :class:`AgentScopedFrontmatter`.
- ``PROFILE_FILENAME`` (``"user.md"`` / ``"agent.md"`` / …) —
declared on the concrete profile schema itself.
There is no ``ProfileFrontmatter`` base class: profile schemas are
duck-typed via the two ClassVars. Subclasses inherit the scope mixin
and add ``PROFILE_FILENAME`` plus their business fields directly.
"""
from __future__ import annotations
from pathlib import Path
from everos.core.persistence import BaseFrontmatter, MarkdownWriter, MemoryRoot
class ProfileWriter:
    """Whole-file upsert writer for fixed-name profile markdown files."""

    def __init__(
        self,
        root: MemoryRoot,
        *,
        writer: MarkdownWriter | None = None,
    ) -> None:
        """Bind the memory root and an optional pre-built writer.

        Args:
            root: Memory root used for path resolution.
            writer: Writer to reuse; when falsy, a new
                ``MarkdownWriter(root)`` is created.
        """
        self._root = root
        self._writer = writer if writer else MarkdownWriter(root)

    # ── Public API ────────────────────────────────────────────────────────

    async def write(
        self,
        scope_id: str,
        *,
        frontmatter: BaseFrontmatter,
        body: str,
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Upsert the profile file selected by ``type(frontmatter)``.

        Args:
            scope_id: ``user_id`` or ``agent_id``; becomes the directory
                under the scope root.
            frontmatter: Schema instance. Its class supplies
                ``SCOPE_DIR`` / ``PROFILE_FILENAME`` for the path, and
                its ``model_dump(exclude_none=False)`` becomes the YAML
                head.
            body: Profile body, passed through
                ``_ensure_trailing_newline``.
            app_id: App segment of the path.
            project_id: Project segment of the path.

        Returns:
            Whatever ``MarkdownWriter.write_markdown`` returns for the
            resolved path.

        Raises:
            TypeError: If the schema class lacks ``SCOPE_DIR`` or
                ``PROFILE_FILENAME``.
        """
        target = self._resolve_path(scope_id, type(frontmatter), app_id, project_id)
        yaml_head = frontmatter.model_dump(exclude_none=False)
        normalised_body = _ensure_trailing_newline(body)
        return await self._writer.write_markdown(
            target,
            frontmatter=yaml_head,
            body=normalised_body,
        )

    def path_for(
        self,
        scope_id: str,
        *,
        schema: type[BaseFrontmatter],
        app_id: str = "default",
        project_id: str = "default",
    ) -> Path:
        """Return the profile path for ``schema`` without touching the disk."""
        return self._resolve_path(scope_id, schema, app_id, project_id)

    # ── Internals ─────────────────────────────────────────────────────────

    def _resolve_path(
        self,
        scope_id: str,
        schema: type[BaseFrontmatter],
        app_id: str,
        project_id: str,
    ) -> Path:
        """Compose ``<scope root>/<scope_id>/<PROFILE_FILENAME>``.

        Raises:
            TypeError: If ``schema`` has no (or a falsy) ``SCOPE_DIR`` or
                ``PROFILE_FILENAME``; ``SCOPE_DIR`` is checked first.
        """
        scope_name = getattr(schema, "SCOPE_DIR", "")
        profile_file = getattr(schema, "PROFILE_FILENAME", None)
        if not scope_name:
            raise TypeError(
                f"{schema.__name__} missing ``SCOPE_DIR`` ClassVar — "
                "must inherit a scope mixin (UserScopedFrontmatter / "
                "AgentScopedFrontmatter)."
            )
        if not profile_file:
            raise TypeError(f"{schema.__name__} missing ``PROFILE_FILENAME`` ClassVar.")
        # The MemoryRoot method is looked up by name: "<SCOPE_DIR>_dir".
        scope_root_for = getattr(self._root, f"{scope_name}_dir")
        return scope_root_for(app_id, project_id) / scope_id / profile_file
def _ensure_trailing_newline(text: str) -> str:
"""End the body with exactly one newline (POSIX text-file convention)."""
if not text:
return ""
return text if text.endswith("\n") else text + "\n"

View File

@ -0,0 +1,66 @@
"""SQLite business persistence layer.
Sits on top of :mod:`everos.core.persistence.sqlite` (engine + sessions +
``BaseTable`` + ``RepoBase``) and provides:
* lazy process-wide engine + session-factory singletons
(:mod:`.sqlite_manager`)
* concrete table schemas under :mod:`.tables`
* concrete repository singletons under :mod:`.repos`
External usage::
from everos.infra.persistence.sqlite import (
get_engine, get_session_factory, dispose_engine,
# business tables / repos are re-exported here too —
# callers MUST go through this top-level package because
# ``infra.persistence.sqlite.**`` (sub-packages) are forbidden
# to ``service`` / ``memory`` / ``entrypoints`` by import-linter.
UnprocessedBuffer, Memcell, ConversationStatus,
unprocessed_buffer_repo, memcell_repo, conversation_status_repo,
)
The :class:`SqliteLifespanProvider` runs ``SQLModel.metadata.create_all``
on app startup and ``dispose_engine`` on shutdown, so business code does
not need to manage either.
"""
# Importing ``tables`` registers every business SQLModel in
# ``SQLModel.metadata`` so ``SqliteLifespanProvider.startup`` can
# ``create_all`` without callers having to import each model module.
from . import tables as tables # noqa: F401
from .repos import QueueSummary as QueueSummary
from .repos import cluster_repo as cluster_repo
from .repos import conversation_status_repo as conversation_status_repo
from .repos import md_change_state_repo as md_change_state_repo
from .repos import memcell_repo as memcell_repo
from .repos import mint_cluster_id as mint_cluster_id
from .repos import unprocessed_buffer_repo as unprocessed_buffer_repo
from .sqlite_manager import dispose_engine as dispose_engine
from .sqlite_manager import get_engine as get_engine
from .sqlite_manager import get_session_factory as get_session_factory
from .tables import Cluster as Cluster
from .tables import ClusterMember as ClusterMember
from .tables import ConversationStatus as ConversationStatus
from .tables import MdChangeState as MdChangeState
from .tables import Memcell as Memcell
from .tables import UnprocessedBuffer as UnprocessedBuffer
# Public names of this package: every name imported above except the
# ``tables`` module itself, listed in ASCII sort order (classes first,
# then functions / repo singletons).
__all__ = [
"Cluster",
"ClusterMember",
"ConversationStatus",
"MdChangeState",
"Memcell",
"QueueSummary",
"UnprocessedBuffer",
"cluster_repo",
"conversation_status_repo",
"dispose_engine",
"get_engine",
"get_session_factory",
"md_change_state_repo",
"memcell_repo",
"mint_cluster_id",
"unprocessed_buffer_repo",
]

View File

@ -0,0 +1,23 @@
"""Business SQLite repository singletons.
Repository instances for business tables, wired to the process-wide
engine singleton.
"""
from .cluster import cluster_repo as cluster_repo
from .cluster import mint_cluster_id as mint_cluster_id
from .conversation_status import conversation_status_repo as conversation_status_repo
from .md_change_state import QueueSummary as QueueSummary
from .md_change_state import md_change_state_repo as md_change_state_repo
from .memcell import memcell_repo as memcell_repo
from .unprocessed_buffer import unprocessed_buffer_repo as unprocessed_buffer_repo
# Public names of this sub-package: exactly the seven names imported
# above, listed in ASCII sort order.
__all__ = [
"QueueSummary",
"cluster_repo",
"conversation_status_repo",
"md_change_state_repo",
"memcell_repo",
"mint_cluster_id",
"unprocessed_buffer_repo",
]

View File

@ -0,0 +1,240 @@
"""Repository for the ``cluster`` + ``cluster_member`` pair.
Bridges between the storage row shape and the algo-side
:class:`everalgo.clustering.Cluster` value object. Callers always work in
the algo type — this repo handles the centroid bytes round-trip, the
preview JSON round-trip, and the membership join so the algo's
``members: list[str]`` field is always fully populated on read. The
``last_ts`` field is stored as int milliseconds (matches the algo type
exactly) to keep the round-trip lossless across SQLite's tz-naive
``DateTime`` storage.
The single ``upsert_with_members`` write path is what every cluster
strategy invokes after a merge / new-cluster decision: it stamps the
``cluster`` row (UPSERT) and reconciles the ``cluster_member`` rows
(diff-then-insert; pre-existing members are kept, new members appended)
so calls are idempotent even if a strategy retries.
"""
from __future__ import annotations
import json
import uuid
import numpy as np
from everalgo.clustering import Cluster as AlgoCluster
from sqlalchemy import select
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from everos.component.utils.datetime import get_utc_now
from everos.core.persistence.sqlite import RepoBase, session_scope
from ..sqlite_manager import get_session_factory
from ..tables import Cluster, ClusterMember
_CENTROID_DTYPE = np.float32
def mint_cluster_id() -> str:
    """Return a new cluster id of the form ``cl_<12 hex chars>``.

    The hex part is the first 12 characters of a random UUID4's hex
    representation.
    """
    token = uuid.uuid4().hex
    return "cl_" + token[:12]
class _ClusterRepo(RepoBase[Cluster]):
    """Repository mapping ``cluster`` / ``cluster_member`` rows to algo clusters.

    Read methods return :class:`AlgoCluster` objects whose ``members``
    list is filled from :class:`ClusterMember`; the only write path is
    :meth:`upsert_with_members`.
    """

    model = Cluster

    def _factory_lookup(self) -> async_sessionmaker[AsyncSession]:
        """Return the session factory from ``get_session_factory()``."""
        return get_session_factory()

    # ── Reads ──────────────────────────────────────────────────────────────

    async def get_with_members(self, cluster_id: str) -> AlgoCluster | None:
        """Fetch one cluster, with its members, as an algo value object.

        Args:
            cluster_id: Primary key of the ``cluster`` row.

        Returns:
            The hydrated :class:`AlgoCluster`, or ``None`` when no row
            matches ``cluster_id``.
        """
        async with session_scope(self._factory) as s:
            row = await s.get(Cluster, cluster_id)
            if row is None:
                return None
            members_by_cluster = await _load_members_by_cluster(s, [cluster_id])
            return _row_to_algo(row, members_by_cluster.get(cluster_id, []))

    async def list_for_owner(
        self,
        owner_id: str,
        kind: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> list[AlgoCluster]:
        """Return all clusters matching ``(app, project, owner, kind)``.

        Members for every matched cluster are loaded with a single extra
        query and attached to the returned objects.

        Args:
            owner_id: Value matched against ``Cluster.owner_id``.
            kind: Value matched against ``Cluster.kind``.
            app_id: Value matched against ``Cluster.app_id``.
            project_id: Value matched against ``Cluster.project_id``.

        Returns:
            One :class:`AlgoCluster` per matching row; ``[]`` when none
            match.
        """
        async with session_scope(self._factory) as s:
            rows = list(
                (
                    await s.execute(
                        select(Cluster)
                        .where(Cluster.app_id == app_id)
                        .where(Cluster.project_id == project_id)
                        .where(Cluster.owner_id == owner_id)
                        .where(Cluster.kind == kind)
                    )
                )
                .scalars()
                .all()
            )
            if not rows:
                return []
            ids = [r.cluster_id for r in rows]
            members_by_cluster = await _load_members_by_cluster(s, ids)
            return [
                _row_to_algo(row, members_by_cluster.get(row.cluster_id, []))
                for row in rows
            ]

    async def find_cluster_id_for_member(
        self,
        member_type: str,
        member_id: str,
    ) -> str | None:
        """Reverse lookup: ``(member_type, member_id) → cluster_id``.

        The query is limited to one row, so if the member were attached
        to several clusters an arbitrary one of them is returned.

        NOTE(review): presumably served by the
        ``ix_cluster_member_reverse`` index — confirm against the table
        definition.

        Returns:
            The cluster id, or ``None`` when the member is not attached
            to any cluster.
        """
        async with session_scope(self._factory) as s:
            stmt = (
                select(ClusterMember.cluster_id)
                .where(ClusterMember.member_type == member_type)
                .where(ClusterMember.member_id == member_id)
                .limit(1)
            )
            return (await s.execute(stmt)).scalar_one_or_none()

    # ── Write ──────────────────────────────────────────────────────────────

    async def upsert_with_members(
        self,
        algo_cluster: AlgoCluster,
        *,
        owner_id: str,
        owner_type: str,
        kind: str,
        member_type: str,
        app_id: str = "default",
        project_id: str = "default",
    ) -> None:
        """Persist one algo cluster snapshot plus its membership rows.

        The ``cluster`` row is inserted, or on a ``cluster_id`` conflict
        has its centroid, count, last timestamp and preview overwritten
        (owner / kind / app / project are left as first written).
        Membership is reconciled by diffing ``algo_cluster.members``
        against the stored member ids and inserting only the ones not
        yet present, so repeating the call with the same snapshot adds
        no rows.

        Args:
            algo_cluster: Snapshot to store; ``id`` must be non-empty and
                ``members`` is treated as the full member list.
            owner_id: Stored on a newly inserted cluster row.
            owner_type: Stored on a newly inserted cluster row.
            kind: Stored on a newly inserted cluster row.
            member_type: Stored on every newly inserted member row.
            app_id: Stored on a newly inserted cluster row.
            project_id: Stored on a newly inserted cluster row.

        Raises:
            ValueError: If ``algo_cluster.id`` is empty or ``None``.
        """
        cluster_id = algo_cluster.id
        if not cluster_id:
            raise ValueError(
                "upsert_with_members requires algo_cluster.id (mint via "
                "mint_cluster_id() before passing in)."
            )
        now = get_utc_now()
        centroid_blob = np.asarray(
            algo_cluster.centroid, dtype=_CENTROID_DTYPE
        ).tobytes()
        preview_json = json.dumps(list(algo_cluster.preview), ensure_ascii=False)
        async with session_scope(self._factory) as s:
            cluster_stmt = (
                sqlite_insert(Cluster)
                .values(
                    cluster_id=cluster_id,
                    app_id=app_id,
                    project_id=project_id,
                    owner_id=owner_id,
                    owner_type=owner_type,
                    kind=kind,
                    centroid_blob=centroid_blob,
                    count=algo_cluster.count,
                    last_ts_ms=algo_cluster.last_ts,
                    preview_json=preview_json,
                )
                .on_conflict_do_update(
                    index_elements=["cluster_id"],
                    set_={
                        "centroid_blob": centroid_blob,
                        "count": algo_cluster.count,
                        "last_ts_ms": algo_cluster.last_ts,
                        "preview_json": preview_json,
                    },
                )
            )
            await s.execute(cluster_stmt)
            existing = set(
                (
                    await s.execute(
                        select(ClusterMember.member_id).where(
                            ClusterMember.cluster_id == cluster_id
                        )
                    )
                )
                .scalars()
                .all()
            )
            # dict.fromkeys drops ids repeated inside the incoming list while
            # keeping first-seen order; without it a repeated id would be
            # added twice in this flush, since ``existing`` only reflects
            # rows that were already stored.
            new_member_rows = [
                ClusterMember(
                    cluster_id=cluster_id,
                    member_id=mid,
                    member_type=member_type,
                    added_ts=now,
                )
                for mid in dict.fromkeys(algo_cluster.members)
                if mid not in existing
            ]
            if new_member_rows:
                s.add_all(new_member_rows)
            await s.commit()
def _row_to_algo(row: Cluster, members: list[str]) -> AlgoCluster:
    """Rebuild the algo-side cluster value from a stored row and its member ids.

    The centroid is decoded from ``centroid_blob`` with ``_CENTROID_DTYPE``;
    an empty / missing ``preview_json`` becomes an empty list. ``members``
    is copied so the returned value does not alias the caller's list.
    """
    raw_preview = row.preview_json
    return AlgoCluster(
        id=row.cluster_id,
        centroid=np.frombuffer(row.centroid_blob, dtype=_CENTROID_DTYPE),
        count=row.count,
        last_ts=row.last_ts_ms,
        preview=json.loads(raw_preview) if raw_preview else [],
        members=list(members),
    )
async def _load_members_by_cluster(
    session: AsyncSession,
    cluster_ids: list[str],
) -> dict[str, list[str]]:
    """One query → ``{cluster_id: [member_id, ...]}`` ordered by ``added_ts``.

    Clusters without any member row are absent from the result. Rows that
    share an ``added_ts`` have no ordering guarantee relative to each
    other, because the query sorts on that column alone.

    Args:
        session: Open session to run the query on.
        cluster_ids: Cluster ids whose members should be loaded.

    Returns:
        Mapping from cluster id to its member ids; ``{}`` for empty input.
    """
    # Nothing to look up — skip the round-trip entirely.
    if not cluster_ids:
        return {}
    stmt = (
        select(ClusterMember.cluster_id, ClusterMember.member_id)
        .where(ClusterMember.cluster_id.in_(cluster_ids))
        .order_by(ClusterMember.added_ts)
    )
    buckets: dict[str, list[str]] = {}
    for cluster_id, member_id in (await session.execute(stmt)).all():
        buckets.setdefault(cluster_id, []).append(member_id)
    return buckets
# Module-level repository instance; importers use this object rather than
# constructing ``_ClusterRepo`` themselves.
cluster_repo = _ClusterRepo()

View File

@ -0,0 +1,90 @@
"""Repository for ``conversation_status`` — singleton bound to ``sqlite_manager``.
Upsert helpers for the (session_id, track) window pointer.
"""
from __future__ import annotations
import datetime as dt
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from everos.core.persistence.sqlite import RepoBase, session_scope
from ..sqlite_manager import get_session_factory
from ..tables import ConversationStatus
class _ConversationStatusRepo(RepoBase[ConversationStatus]):
    """Upsert helpers for the (app, project, session, track) window pointer."""

    model = ConversationStatus

    def _factory_lookup(self) -> async_sessionmaker[AsyncSession]:
        """Return the session factory provided by ``get_session_factory``."""
        return get_session_factory()

    async def touch_last_message_ts(
        self,
        session_id: str,
        track: str,
        ts: dt.datetime,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> None:
        """Upsert (app, project, session, track); set ``last_message_ts``."""
        await self._upsert(
            session_id,
            track,
            app_id=app_id,
            project_id=project_id,
            last_message_ts=ts,
        )

    async def touch_last_memcell_ts(
        self,
        session_id: str,
        track: str,
        ts: dt.datetime,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> None:
        """Upsert (app, project, session, track); set ``last_memcell_ts``."""
        await self._upsert(
            session_id,
            track,
            app_id=app_id,
            project_id=project_id,
            last_memcell_ts=ts,
        )

    async def _upsert(
        self,
        session_id: str,
        track: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
        last_message_ts: dt.datetime | None = None,
        last_memcell_ts: dt.datetime | None = None,
    ) -> None:
        """Create the pointer row, or update the timestamps that were passed.

        On an existing row a ``None`` timestamp means "leave untouched";
        on a fresh row both columns are written as given (``None``
        included).
        """
        # NOTE(review): this is select-then-insert, not a single UPSERT
        # statement; two overlapping calls for a brand-new key could both
        # take the insert branch. Confirm callers serialise per session.
        async with session_scope(self._factory) as s:
            lookup = select(ConversationStatus).where(
                ConversationStatus.app_id == app_id,
                ConversationStatus.project_id == project_id,
                ConversationStatus.session_id == session_id,
                ConversationStatus.track == track,
            )
            row = (await s.execute(lookup)).scalars().first()
            if row is not None:
                if last_message_ts is not None:
                    row.last_message_ts = last_message_ts
                if last_memcell_ts is not None:
                    row.last_memcell_ts = last_memcell_ts
            else:
                fresh = ConversationStatus(
                    app_id=app_id,
                    project_id=project_id,
                    session_id=session_id,
                    track=track,
                    last_message_ts=last_message_ts,
                    last_memcell_ts=last_memcell_ts,
                )
                s.add(fresh)
            await s.commit()
# Module-level repository instance shared by importers of this module.
conversation_status_repo = _ConversationStatusRepo()

View File

@ -0,0 +1,434 @@
"""Repository for ``md_change_state`` — cascade work queue.
Sole writer of the table. The worker, watcher, scanner, and CLI all
go through this repo so the state-machine invariants (``processing``
claim semantics, retryable flag lifecycle) live in one place.
LSN ordering is **best-effort**, not strictly monotonic across
concurrent writers: :meth:`upsert` derives ``lsn = MAX(lsn) + 1``
which is a classic read-modify-write that two parallel writers could
race on (BEGIN DEFERRED leaves the SELECT half unprotected; cross-
process this is even more visible). The table schema does **not**
declare ``lsn UNIQUE`` and no caller depends on strict monotonicity —
the worker uses ``ORDER BY lsn LIMIT N`` for fairness only, and a
collision merely reorders two rows by a few ms; both rows are still
processed and the next upsert bumps the counter past the duplicate.
If a future feature needs strict monotonicity (e.g. CDC / audit log),
revisit by giving ``upsert`` its own ``BEGIN IMMEDIATE`` transaction.
Status values:
- ``pending`` — visible to the worker.
- ``processing`` — internal claim state (one worker is on it).
- ``done`` — handler succeeded.
- ``failed`` — handler exhausted retries or hit unrecoverable error
(see ``retryable`` for the eligibility flag).
"""
from __future__ import annotations
import dataclasses
from sqlalchemy import func, select, update
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from everos.component.utils.datetime import get_utc_now
from everos.core.persistence.sqlite import RepoBase, session_scope
from ..sqlite_manager import get_session_factory
from ..tables import MdChangeState
@dataclasses.dataclass(frozen=True)
class QueueSummary:
    """Aggregate counts for ``cascade status`` CLI output.

    ``pending`` includes the internal ``processing`` rows so the public
    state machine (12 doc §6) stays three-valued.

    Instances are immutable (``frozen=True``); fields are declared in the
    order ``pending``, ``done``, ``failed_retryable``, ``failed_permanent``,
    ``max_lsn``, ``last_processed_lsn``.
    """

    pending: int
    """Rows the worker hasn't completed yet (includes ``processing``)."""
    done: int
    """Rows landed successfully."""
    failed_retryable: int
    """``status='failed' AND retryable=TRUE`` — eligible for
    ``cascade fix --apply`` re-enqueue."""
    failed_permanent: int
    """``status='failed' AND retryable=FALSE`` — requires the user to
    edit the md and re-save."""
    max_lsn: int
    """Largest ``lsn`` ever assigned; 0 if the table is empty."""
    last_processed_lsn: int
    """Largest ``lsn`` whose row has reached a terminal state
    (``done`` or ``failed``); 0 if no terminal rows yet."""
class _MdChangeStateRepo(RepoBase[MdChangeState]):
    """Repository over ``md_change_state``: enqueue, claim, report, recover.

    Every state transition of the cascade queue goes through one of the
    methods below so the ``pending → processing → done | failed``
    invariants are enforced in a single place.
    """

    model = MdChangeState

    def _factory_lookup(self) -> async_sessionmaker[AsyncSession]:
        """Return the session factory provided by ``get_session_factory``."""
        return get_session_factory()

    # ── Writers: watcher / scanner / CLI sync ──────────────────────────────
    async def upsert(
        self,
        md_path: str,
        *,
        kind: str,
        change_type: str,
        mtime: float,
    ) -> int:
        """Enqueue or re-enqueue ``md_path``; return the assigned LSN.

        Behaviour:

        - **New row** → insert with ``status='pending'``,
          ``lsn = MAX(lsn) + 1``.
        - **Existing row** → bump ``last_changed_at``, refresh
          ``kind`` / ``change_type`` / ``mtime``, reset status back to
          ``pending``, zero ``retry_count`` / ``error`` / ``retryable``,
          and assign a fresh ``MAX(lsn) + 1`` so the worker re-processes
          this path *after* anything queued in between.
          ``first_seen_at`` is left as originally written.

        The fresh LSN on re-enqueue is the property that lets the worker
        rely on ``ORDER BY lsn`` for ordering without losing fairness
        when a file flickers in and out of the queue. The ``MAX(lsn)+1``
        derivation is best-effort under concurrent writers — see module
        docstring for the trade-off.
        """
        now = get_utc_now()
        async with session_scope(self._factory) as s:
            new_lsn = await _next_lsn(s)
            stmt = (
                sqlite_insert(MdChangeState)
                .values(
                    md_path=md_path,
                    kind=kind,
                    change_type=change_type,
                    mtime=mtime,
                    first_seen_at=now,
                    last_changed_at=now,
                    lsn=new_lsn,
                    status="pending",
                    retryable=None,
                    last_attempt_at=None,
                    retry_count=0,
                    error=None,
                )
                .on_conflict_do_update(
                    index_elements=["md_path"],
                    set_={
                        "kind": kind,
                        "change_type": change_type,
                        "mtime": mtime,
                        "last_changed_at": now,
                        "lsn": new_lsn,
                        "status": "pending",
                        "retryable": None,
                        "last_attempt_at": None,
                        "retry_count": 0,
                        "error": None,
                    },
                )
            )
            await s.execute(stmt)
            await s.commit()
            return new_lsn

    async def force_enqueue(self, md_path: str, kind: str) -> int:
        """`cascade sync --path` entry: re-enqueue regardless of status.

        Semantically the same as :meth:`upsert` with ``change_type
        ='modified'``; named separately because the CLI flow has no
        watcher / scanner event to attribute the row to. The stored
        ``mtime`` is written as ``0.0``.
        """
        return await self.upsert(
            md_path,
            kind=kind,
            change_type="modified",
            mtime=0.0,
        )

    # ── Worker claim ───────────────────────────────────────────────────────
    async def claim_one(self, md_path: str) -> MdChangeState | None:
        """Atomically transition one row ``pending → processing``.

        Implements the worker's claim contract: only the caller whose
        ``UPDATE`` returns ``rowcount == 1`` "owns" the row and should
        run the handler. All other concurrent callers get ``None`` and
        must move on (no exception — claim contention is not an error).
        """
        now = get_utc_now()
        async with session_scope(self._factory) as s:
            result = await s.execute(
                update(MdChangeState)
                .where(MdChangeState.md_path == md_path)
                .where(MdChangeState.status == "pending")
                .values(status="processing", last_attempt_at=now)
            )
            await s.commit()
            if result.rowcount != 1:
                return None
            row = await s.get(MdChangeState, md_path)
            return row

    async def claim_pending_batch(self, limit: int = 100) -> list[MdChangeState]:
        """Claim up to ``limit`` pending rows in LSN order.

        Returns the rows *this call* claimed (now ``status='processing'``);
        empty list if none were pending or every candidate was taken by
        a sibling first.

        Sibling workers / processes may race on the same prefix. Each
        candidate is therefore claimed with its own guarded ``UPDATE …
        WHERE status='pending'`` and kept only when ``rowcount == 1`` —
        the same ownership rule as :meth:`claim_one`. Re-selecting every
        ``processing`` row among the candidates instead would also pick
        up rows a sibling claimed in the meantime and hand them to two
        batches.
        """
        if limit <= 0:
            return []
        now = get_utc_now()
        async with session_scope(self._factory) as s:
            picks = (
                (
                    await s.execute(
                        select(MdChangeState.md_path)
                        .where(MdChangeState.status == "pending")
                        .order_by(MdChangeState.lsn)
                        .limit(limit)
                    )
                )
                .scalars()
                .all()
            )
            if not picks:
                return []
            # Paths whose pending → processing flip was performed by us.
            claimed: list[str] = []
            for md_path in picks:
                claim_result = await s.execute(
                    update(MdChangeState)
                    .where(MdChangeState.md_path == md_path)
                    .where(MdChangeState.status == "pending")
                    .values(status="processing", last_attempt_at=now)
                )
                if claim_result.rowcount == 1:
                    claimed.append(md_path)
            await s.commit()
            if not claimed:
                return []
            rows = (
                (
                    await s.execute(
                        select(MdChangeState)
                        .where(MdChangeState.md_path.in_(claimed))
                        .where(MdChangeState.status == "processing")
                        .order_by(MdChangeState.lsn)
                    )
                )
                .scalars()
                .all()
            )
            return list(rows)

    # ── Worker result reporting ────────────────────────────────────────────
    async def mark_done(self, md_path: str) -> None:
        """Transition the row to ``done`` after a successful handler run.

        Guarded by ``WHERE status='processing'`` so the call is a no-op
        if a concurrent :meth:`upsert` (watcher / scanner re-enqueue)
        has flipped the row back to ``pending`` while the worker was
        running the handler. In that case the next
        :meth:`claim_pending_batch` drain re-runs the handler against
        the latest md state — losing the stale ``done`` write rather
        than the new ``pending`` is the correct trade.
        """
        now = get_utc_now()
        async with session_scope(self._factory) as s:
            await s.execute(
                update(MdChangeState)
                .where(MdChangeState.md_path == md_path)
                .where(MdChangeState.status == "processing")
                .values(
                    status="done",
                    last_attempt_at=now,
                    error=None,
                    retryable=None,
                )
            )
            await s.commit()

    async def mark_failed(
        self,
        md_path: str,
        *,
        retryable: bool,
        error: str,
        new_retry_count: int,
    ) -> None:
        """Transition the row to ``failed`` with the given diagnostic.

        Guarded by ``WHERE status='processing'`` for the same reason as
        :meth:`mark_done` — a concurrent re-enqueue must win over a
        terminal write tied to a stale claim.

        Args:
            md_path: The row's primary key.
            retryable: ``True`` for transient failures (HTTP 5xx,
                connection reset, 429) — ``cascade fix --apply`` will
                re-enqueue. ``False`` for unrecoverable failures
                (YAML parse, schema mismatch) — needs user edit.
            error: Truncated failure message for ``cascade fix`` output.
            new_retry_count: The retry count *after* this attempt (the
                caller knows whether it was a retry or the final
                attempt).
        """
        now = get_utc_now()
        async with session_scope(self._factory) as s:
            # Same guard as ``mark_done``: only flip ``processing → failed``.
            # A concurrent watcher / scanner upsert may have reset the row
            # back to ``pending`` (file changed during processing) — in
            # that case the failure verdict is stale and we let the next
            # drain re-attempt against the new md state instead of
            # stamping ``failed`` over the live pending row.
            await s.execute(
                update(MdChangeState)
                .where(MdChangeState.md_path == md_path)
                .where(MdChangeState.status == "processing")
                .values(
                    status="failed",
                    retryable=retryable,
                    last_attempt_at=now,
                    error=error,
                    retry_count=new_retry_count,
                )
            )
            await s.commit()

    # ── Startup recovery ───────────────────────────────────────────────────
    async def recover_orphan_processing(self) -> int:
        """Reset every ``processing`` row to ``pending``; return the count.

        Cascade runs single-process today, so any row in ``processing``
        when the orchestrator boots is leftover from a prior crash
        (the worker died between :meth:`claim_pending_batch` and
        ``mark_done`` / ``mark_failed``). Idempotent — no rows in
        ``processing`` is a clean no-op.
        """
        async with session_scope(self._factory) as s:
            result = await s.execute(
                update(MdChangeState)
                .where(MdChangeState.status == "processing")
                .values(status="pending", last_attempt_at=None)
            )
            await s.commit()
            return int(result.rowcount or 0)

    # ── CLI fix / status ───────────────────────────────────────────────────
    async def list_failed(self) -> list[MdChangeState]:
        """Return every ``status='failed'`` row, oldest LSN first.

        Drives the ``cascade fix`` (no ``--apply``) preview table — the
        CLI splits the result by ``retryable`` into two sections.
        """
        async with session_scope(self._factory) as s:
            rows = (
                (
                    await s.execute(
                        select(MdChangeState)
                        .where(MdChangeState.status == "failed")
                        .order_by(MdChangeState.lsn)
                    )
                )
                .scalars()
                .all()
            )
            return list(rows)

    async def reset_retryable_to_pending(self) -> int:
        """`cascade fix --apply` engine: re-enqueue every retryable row.

        Affects only ``status='failed' AND retryable=TRUE``. Rows with
        ``retryable=FALSE`` are left untouched — they need the user to
        edit the md and re-save (the scanner / watcher will pick up the
        change and re-enqueue them naturally). The rows keep their
        existing ``lsn``; only status, retry bookkeeping and
        ``last_changed_at`` are rewritten.

        Returns the number of rows transitioned.
        """
        now = get_utc_now()
        async with session_scope(self._factory) as s:
            result = await s.execute(
                update(MdChangeState)
                .where(MdChangeState.status == "failed")
                .where(MdChangeState.retryable.is_(True))
                .values(
                    status="pending",
                    retryable=None,
                    retry_count=0,
                    error=None,
                    last_changed_at=now,
                )
            )
            await s.commit()
            return int(result.rowcount or 0)

    async def queue_summary(self) -> QueueSummary:
        """Aggregate the table for the ``cascade status`` CLI.

        ``processing`` rows are folded into ``pending``; failed rows are
        split by the ``retryable`` flag.
        """
        async with session_scope(self._factory) as s:
            pending = await _count_where(
                s, MdChangeState.status.in_(["pending", "processing"])
            )
            done = await _count_where(s, MdChangeState.status == "done")
            failed_retryable = await _count_where(
                s,
                (MdChangeState.status == "failed")
                & (MdChangeState.retryable.is_(True)),
            )
            failed_permanent = await _count_where(
                s,
                (MdChangeState.status == "failed")
                & (MdChangeState.retryable.is_(False)),
            )
            max_lsn_stmt = select(func.coalesce(func.max(MdChangeState.lsn), 0))
            max_lsn = int((await s.execute(max_lsn_stmt)).scalar_one())
            last_processed_lsn = int(
                (
                    await s.execute(
                        select(func.coalesce(func.max(MdChangeState.lsn), 0)).where(
                            MdChangeState.status.in_(["done", "failed"])
                        )
                    )
                ).scalar_one()
            )
            return QueueSummary(
                pending=pending,
                done=done,
                failed_retryable=failed_retryable,
                failed_permanent=failed_permanent,
                max_lsn=max_lsn,
                last_processed_lsn=last_processed_lsn,
            )
async def _next_lsn(session: AsyncSession) -> int:
    """Pick the next global LSN (``MAX(lsn) + 1``); an empty table yields 1.

    Runs on the caller's session, i.e. the one that subsequently issues
    the UPSERT. That does **not** make the read-modify-write atomic: as
    the module docstring explains, the ``SELECT MAX`` half is unprotected
    under ``BEGIN DEFERRED``, so two concurrent writers can observe the
    same maximum and receive the same LSN. The value is best-effort
    ordering only — do not rely on it being unique.
    """
    stmt = select(func.coalesce(func.max(MdChangeState.lsn), 0))
    current_max = (await session.execute(stmt)).scalar_one()
    return int(current_max) + 1
async def _count_where(session: AsyncSession, predicate: object) -> int:
    """Count ``md_change_state`` rows matching ``predicate``; return a Python int."""
    counting = select(func.count()).select_from(MdChangeState)
    result = await session.execute(counting.where(predicate))  # type: ignore[arg-type]
    return int(result.scalar_one())
# Module-level repository instance shared by importers of this module.
md_change_state_repo = _MdChangeStateRepo()

View File

@ -0,0 +1,52 @@
"""Repository for ``memcell`` table — singleton bound to ``sqlite_manager``.
Pure persistence: callers build the SQLModel ``Memcell`` rows (including
``message_ids_json`` / ``sender_ids_json``) and hand them in. The pipeline
is responsible for mapping algo-side messages back to everos
``message_id`` because algo's ``Message`` does not carry per-message
identifiers.
"""
from __future__ import annotations
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from everos.core.persistence.sqlite import RepoBase, session_scope
from ..sqlite_manager import get_session_factory
from ..tables import Memcell
class _MemcellRepo(RepoBase[Memcell]):
    """Insert and bulk-fetch helpers for the ``memcell`` table."""

    model = Memcell

    def _factory_lookup(self) -> async_sessionmaker[AsyncSession]:
        """Return the session factory provided by ``get_session_factory``."""
        return get_session_factory()

    async def insert_many(self, rows: list[Memcell]) -> list[Memcell]:
        """Insert caller-built MemCell rows in a single transaction.

        Each row is refreshed after the commit; the same list object is
        handed back.
        """
        async with session_scope(self._factory) as s:
            s.add_all(rows)
            await s.commit()
            for row in rows:
                await s.refresh(row)
            return rows

    async def find_by_ids(self, memcell_ids: list[str]) -> list[Memcell]:
        """Bulk fetch rows by primary key list — preserves caller order.

        Ids with no matching row are silently dropped from the result.
        Used by offline strategies that pull every memcell in a cluster
        (membership lives in :class:`ClusterMember` and is supplied to
        the strategy via :class:`everalgo.clustering.Cluster.members`).
        """
        if not memcell_ids:
            return []
        async with session_scope(self._factory) as s:
            query = select(Memcell).where(Memcell.memcell_id.in_(memcell_ids))
            fetched = list((await s.execute(query)).scalars().all())
            lookup = {row.memcell_id: row for row in fetched}
            # Re-sequence by the caller's id order, not the query's.
            return [lookup[mid] for mid in memcell_ids if mid in lookup]
# Module-level repository instance shared by importers of this module.
memcell_repo = _MemcellRepo()

View File

@ -0,0 +1,83 @@
"""Repository for ``unprocessed_buffer`` — chat message accumulator.
Singleton bound to the process-wide ``sqlite_manager`` session factory.
Pure SQLModel persistence: row ↔ domain conversion lives in
``everos.memory.extract.pipeline`` (the only caller that needs it).
Exposes:
- :meth:`list_for_track` — load all rows of (session_id, track), ordered by ts.
- :meth:`replace` — atomically swap all rows of (session_id, track) for a
freshly-built list of :class:`UnprocessedBuffer` rows.
"""
from __future__ import annotations
from sqlalchemy import delete, select
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from everos.core.persistence.sqlite import RepoBase, session_scope
from ..sqlite_manager import get_session_factory
from ..tables import UnprocessedBuffer
class _UnprocessedBufferRepo(RepoBase[UnprocessedBuffer]):
    """List / replace helpers for one (app, project, session, track) buffer slice."""

    model = UnprocessedBuffer

    def _factory_lookup(self) -> async_sessionmaker[AsyncSession]:
        """Return the session factory provided by ``get_session_factory``."""
        return get_session_factory()

    async def list_for_track(
        self,
        session_id: str,
        track: str,
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> list[UnprocessedBuffer]:
        """Return all rows of (app, project, session, track), ts asc."""
        slice_filter = (
            UnprocessedBuffer.app_id == app_id,
            UnprocessedBuffer.project_id == project_id,
            UnprocessedBuffer.session_id == session_id,
            UnprocessedBuffer.track == track,
        )
        async with session_scope(self._factory) as s:
            query = (
                select(UnprocessedBuffer)
                .where(*slice_filter)
                .order_by(UnprocessedBuffer.timestamp.asc())  # type: ignore[union-attr]
            )
            found = await s.execute(query)
            return list(found.scalars().all())

    async def replace(
        self,
        session_id: str,
        track: str,
        rows: list[UnprocessedBuffer],
        *,
        app_id: str = "default",
        project_id: str = "default",
    ) -> None:
        """Atomically rewrite all rows of (app, project, session, track).

        Delete-then-insert in one transaction. Empty ``rows`` clears the
        slice. The delete is scoped to the same (app, project) as the
        incoming rows so one space's buffer never wipes another's.
        """
        slice_filter = (
            UnprocessedBuffer.app_id == app_id,
            UnprocessedBuffer.project_id == project_id,
            UnprocessedBuffer.session_id == session_id,
            UnprocessedBuffer.track == track,
        )
        async with session_scope(self._factory) as s:
            await s.execute(delete(UnprocessedBuffer).where(*slice_filter))
            if rows:
                s.add_all(rows)
            await s.commit()
# Module-level repository instance shared by importers of this module.
unprocessed_buffer_repo = _UnprocessedBufferRepo()

View File

@ -0,0 +1,63 @@
"""SQLite engine + session-factory singletons (lazy + process-wide).
The single place that owns the SQLite **runtime state**: the async
SQLAlchemy engine and the session factory bound to it. Built lazily on
first :func:`get_engine` / :func:`get_session_factory` call from
:func:`everos.config.load_settings` + :meth:`MemoryRoot.default`. The
:class:`SqliteLifespanProvider` calls :func:`dispose_engine` on shutdown
to drain the connection pool; in scripts you can call it manually.
"""
from __future__ import annotations
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker
from everos.config import load_settings
from everos.core.observability.logging import get_logger
from everos.core.persistence import (
MemoryRoot,
create_session_factory,
create_system_engine,
)
logger = get_logger(__name__)
# Lazily populated process-wide singletons: both start as ``None``, are
# filled in by ``get_engine`` / ``get_session_factory`` on first use, and
# are reset to ``None`` by ``dispose_engine``.
_engine: AsyncEngine | None = None
_session_factory: async_sessionmaker[AsyncSession] | None = None
def get_engine() -> AsyncEngine:
    """Return the process-wide async SQLAlchemy engine.

    The first call loads settings, resolves ``MemoryRoot.default()``,
    calls its ``ensure()``, builds the engine from ``system_db`` and
    ``Settings.sqlite``, and logs ``sqlite_engine_built``. Later calls
    return the cached instance.
    """
    global _engine
    if _engine is not None:
        return _engine
    settings = load_settings()
    root = MemoryRoot.default()
    root.ensure()
    _engine = create_system_engine(root.system_db, settings.sqlite)
    logger.info(
        "sqlite_engine_built",
        db_path=str(root.system_db),
    )
    return _engine
def get_session_factory() -> async_sessionmaker[AsyncSession]:
    """Return the process-wide async session factory, building it on first use."""
    global _session_factory
    if _session_factory is not None:
        return _session_factory
    _session_factory = create_session_factory(get_engine())
    return _session_factory
async def dispose_engine() -> None:
    """Dispose the engine + connection pool. Idempotent.

    Both cached singletons are cleared afterwards, so the next
    ``get_engine`` / ``get_session_factory`` call rebuilds them.
    """
    global _engine, _session_factory
    if _engine is not None:
        await _engine.dispose()
        logger.info("sqlite_engine_disposed")
    _engine, _session_factory = None, None

View File

@ -0,0 +1,24 @@
"""Business SQLModel table schemas.
Each business table lives in its own module here (e.g. ``memcell.py``,
``unprocessed_buffer.py``). The package ``__init__`` re-exports them so
``SQLModel.metadata.create_all`` (run by
:class:`everos.core.lifespan.SqliteLifespanProvider` at startup) sees
every registered table.
"""
from .cluster import Cluster as Cluster
from .cluster import ClusterMember as ClusterMember
from .conversation_status import ConversationStatus as ConversationStatus
from .md_change_state import MdChangeState as MdChangeState
from .memcell import Memcell as Memcell
from .unprocessed_buffer import UnprocessedBuffer as UnprocessedBuffer
__all__ = [
"Cluster",
"ClusterMember",
"ConversationStatus",
"MdChangeState",
"Memcell",
"UnprocessedBuffer",
]

View File

@ -0,0 +1,99 @@
"""``cluster`` — persisted snapshot of one ``everalgo.clustering.Cluster``.
Mirrors the algo-side frozen value object (centroid + count + last_ts +
preview) plus everos engineering metadata (``owner_id`` / ``owner_type``
/ ``kind``) so a single SQLite table can hold both the user-memory cluster
track (episode embeddings) and the agent-case cluster track (task_intent
embeddings). The ``members`` field on the algo type is persisted in the
sibling :class:`ClusterMember` table to keep the relation queryable from
both directions (forward by ``cluster_id``, reverse by ``(member_type,
member_id)``).
"""
from __future__ import annotations
from sqlalchemy import Index, LargeBinary
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class Cluster(BaseTable, table=True):
    """One row per cluster. PK ``cluster_id`` (``cl_<12hex>``).

    Holds the cluster snapshot (centroid, count, last timestamp, preview)
    together with the scope / owner columns. Member links are not stored
    here; they live in :class:`ClusterMember`.
    """

    __tablename__ = "cluster"  # type: ignore[assignment]
    __table_args__ = (
        # List all clusters for one (app, project, owner, kind) on each strategy
        # invocation; scope-first composite so clustering never mixes spaces.
        Index("ix_cluster_owner_kind", "app_id", "project_id", "owner_id", "kind"),
    )
    cluster_id: str = Field(primary_key=True)
    """Caller-minted opaque id (algo type carries it through verbatim).
    Format: ``cl_<12 hex chars>`` to mirror :func:`memcell._mint_memcell_id`."""
    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """App / project scope segments. The aggregation key is
    ``(app_id, project_id, owner_id, kind)`` so a cluster set never spans
    two spaces."""
    # ``index=True`` adds a single-column index on top of the composite
    # ``ix_cluster_owner_kind`` declared above.
    owner_id: str = Field(index=True)
    """``user_id`` (kind=``user_memory``) or ``agent_id`` (kind=``agent_case``)."""
    owner_type: str
    """``"user"`` or ``"agent"`` — redundant with ``kind`` today but kept
    explicit so future kinds (e.g. tenant-level) can plug in without a
    schema change."""
    kind: str
    """``"user_memory"`` (episode-vector cluster, drives profile extraction)
    or ``"agent_case"`` (task_intent-vector cluster, drives skill extraction)."""
    # Raw bytes only — the row does not record dtype or dimensionality.
    centroid_blob: bytes = Field(sa_type=LargeBinary)
    """``np.float32`` centroid serialised via ``ndarray.tobytes()``. The
    repo round-trips bytes ↔ ``np.ndarray`` so callers see the algo type."""
    count: int
    """Number of members merged into this cluster (algo-maintained)."""
    last_ts_ms: int
    """Most recent member's timestamp as Unix epoch milliseconds — matches
    :attr:`everalgo.clustering.Cluster.last_ts` exactly so no lossy
    datetime ↔ int conversion is needed across the storage boundary."""
    preview_json: str
    """JSON-encoded ``list[str]`` — short text samples used by
    :func:`cluster_by_llm` ranking. Repo round-trips JSON ↔ list."""
class ClusterMember(BaseTable, table=True):
    """One row per (cluster, entity) link.

    Forward lookup (``cluster_id → list[member_id]``) is the algo-side
    ``Cluster.members`` view. Reverse lookup (``(member_type, member_id)
    → cluster_id``) is served by the composite index below — needed when
    a downstream consumer holds an entity id and wants its cluster.

    ``member_type`` is informational on the row (the parent ``Cluster.kind``
    already disambiguates), but kept explicit so the reverse index can be
    a single composite (member_type, member_id) without joining back.
    """

    __tablename__ = "cluster_member"  # type: ignore[assignment]
    __table_args__ = (Index("ix_cluster_member_reverse", "member_type", "member_id"),)
    # Composite primary key: (cluster_id, member_id) — one link per pair.
    cluster_id: str = Field(primary_key=True, foreign_key="cluster.cluster_id")
    """Parent cluster id."""
    member_id: str = Field(primary_key=True)
    """``memcell_id`` (member_type=``memcell``) or md entry_id
    (member_type=``case``) — the entity grouped into this cluster."""
    member_type: str
    """``"memcell"`` or ``"case"``. Echoes the parent cluster's ``kind``
    domain but kept on the row so the reverse index is self-contained."""
    added_ts: UtcDatetime = Field(sa_type=UtcDateTimeColumn)
    """When this entity was first attached to the cluster."""

View File

@ -0,0 +1,38 @@
"""``conversation_status`` — window pointer per (app, project, session, track).
The window pointer is scoped by ``app_id`` / ``project_id`` so the same
``session_id`` may recur in different spaces without colliding; those two
segments lead the composite ``UniqueConstraint``.
"""
from __future__ import annotations
from sqlalchemy import UniqueConstraint
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class ConversationStatus(BaseTable, table=True):
    """One row per (app, project, session, track). Tracks latest msg / memcell ts.

    Uniqueness of the four-part key is enforced by
    ``uq_conversation_status_session_track``; ``id`` is only a surrogate
    primary key.
    """

    __tablename__ = "conversation_status"  # type: ignore[assignment]
    __table_args__ = (
        UniqueConstraint(
            "app_id",
            "project_id",
            "session_id",
            "track",
            name="uq_conversation_status_session_track",
        ),
    )
    # Surrogate key; ``None`` on a row that has not been given one yet.
    id: int | None = Field(default=None, primary_key=True)
    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """App / project scope segments (default ``"default"``)."""
    session_id: str = Field(index=True)
    track: str
    # Both timestamps are nullable and default to ``None``.
    last_message_ts: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
    last_memcell_ts: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)

View File

@ -0,0 +1,119 @@
"""``md_change_state`` — cascade work queue.
One row per markdown path. Both watcher (real-time fsevents) and
scanner (periodic sweep) UPSERT into this table; the worker consumes
``pending`` rows in ``lsn`` order, transitions them through an
internal ``processing`` claim state, and lands them in ``done`` or
``failed`` (with a ``retryable`` flag).
Schema sourced from ``12_cascade_design.md`` §4.1 + decisions DD-3 …
DD-12; the four indexes below are required by ``13_cascade_design.md``
§7 status / fix queries.
"""
from __future__ import annotations
from sqlalchemy import Index, text
from everos.component.utils.datetime import UtcDatetime, get_utc_now
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class MdChangeState(BaseTable, table=True):
    """One row per markdown path; UPSERT-driven work queue for cascade.

    The public state machine is the 3-tuple ``pending`` / ``done`` /
    ``failed`` (12 doc §6). ``processing`` is an internal claim state
    used by the repository's ``claim_one`` / ``claim_pending_batch`` and
    rolled back into ``pending`` for CLI / status output (16 doc §4.2 —
    DD-12 keeps the public surface clean).
    """

    __tablename__ = "md_change_state"  # type: ignore[assignment]
    __table_args__ = (
        # Worker scans pending rows in lsn order — partial index drops
        # done/failed rows from the b-tree and keeps it tight.
        Index(
            "idx_md_change_pending",
            "status",
            "lsn",
            sqlite_where=text("status = 'pending'"),
        ),
        # `cascade fix --apply` only ever touches failed + retryable=TRUE
        # rows — partial index makes that pass essentially O(retryable).
        Index(
            "idx_md_change_retryable",
            "status",
            "retryable",
            sqlite_where=text("status = 'failed' AND retryable = 1"),
        ),
        # Scanner reverse-reconcile (disk → state) compares mtime.
        Index("idx_md_change_mtime", "mtime"),
        # `cascade status` aggregates by kind.
        Index("idx_md_change_kind", "kind"),
    )
    md_path: str = Field(primary_key=True)
    """Path relative to the memory-root (e.g. ``users/u_jason/
    episodes/episode-2026-05-12.md``). Every reverse-link anchors here."""
    # NOTE(review): ``index=True`` here creates a second single-column
    # index on ``kind`` next to the explicit ``idx_md_change_kind`` above —
    # confirm whether both are intended.
    kind: str = Field(nullable=False, index=True)
    """Kind registry name (e.g. ``"episode"``); worker dispatches the
    matching handler."""
    change_type: str = Field(nullable=False)
    """``"added"`` | ``"modified"`` | ``"deleted"``. A hint for the
    worker — handler re-derives truth from the actual file state."""
    mtime: float = Field(default=0.0, nullable=False)
    """File mtime captured when the row was last UPSERTed. Scanner
    compares this against the on-disk mtime to identify dirty paths."""
    first_seen_at: UtcDatetime = Field(
        default_factory=get_utc_now, sa_type=UtcDateTimeColumn
    )
    """When the path was first enqueued."""
    last_changed_at: UtcDatetime = Field(
        default_factory=get_utc_now, sa_type=UtcDateTimeColumn
    )
    """Most recent UPSERT timestamp (re-stamped on every re-enqueue)."""
    # The column is indexed but not declared unique, so nothing at the
    # schema level prevents two rows from carrying the same lsn.
    lsn: int = Field(nullable=False, index=True)
    """Global monotonic sequence (``MAX(lsn) + 1`` per UPSERT). Worker
    processes pending rows in ascending lsn order; the gap between
    ``MAX(lsn)`` and the last processed lsn is the queue lag."""
    status: str = Field(default="pending", nullable=False, index=True)
    """Lifecycle:
    - ``"pending"`` — waiting for the worker.
    - ``"processing"`` — claimed by a worker (internal; CLI rolls into
    pending for display).
    - ``"done"`` — handler completed successfully.
    - ``"failed"`` — handler exhausted retries or hit an
    unrecoverable error (see :attr:`retryable`).
    """
    retryable: bool | None = Field(default=None)
    """Meaningful only when ``status='failed'``.
    - ``TRUE`` — RecoverableError exhausted MAX_RETRY; ``cascade fix
    --apply`` will re-enqueue this row (pending, retry_count reset).
    - ``FALSE`` — UnrecoverableError (malformed YAML, schema error
    etc.); requires editing the md and re-saving.
    - ``NULL`` — not a failed row (pending / processing / done).
    """
    last_attempt_at: UtcDatetime | None = Field(default=None, sa_type=UtcDateTimeColumn)
    """Timestamp of the most recent worker attempt (success or
    failure)."""
    retry_count: int = Field(default=0, nullable=False)
    """Number of retries the worker has *actually issued* (the first
    attempt does not count). Reaches MAX_RETRY (default 3) before the
    row transitions to ``failed`` with ``retryable=TRUE``."""
    error: str | None = Field(default=None)
    """Most recent failure message (truncated upstream if needed)."""

View File

@ -0,0 +1,55 @@
"""``memcell`` — metadata + payload archive for boundary-detected MemCells.
Holds ``message_ids_json`` / ``sender_ids_json`` (JSON arrays of audit
ids) plus ``payload_json`` — the full :class:`everalgo.types.MemCell`
serialised via ``model_dump_json``. The payload is what
``unprocessed_buffer`` cannot keep (boundary's delete-then-insert clears
the staging slice once messages fold into a cell): downstream offline
strategies that need the raw chat messages (e.g. profile extraction)
deserialise the payload back into an algo ``MemCell``. Episode markdown
still carries the LLM-synthesised narrative; ``payload_json`` is the
chat-stream archive that narrative was distilled from.
"""
from __future__ import annotations
from sqlalchemy import Index
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class Memcell(BaseTable, table=True):
    """Archive row for a single boundary-detected MemCell.

    Keyed by ``memcell_id`` (uuid4). Besides the scope / session columns,
    each row stores the cell's audit id lists and its serialised payload.
    """

    __tablename__ = "memcell"  # type: ignore[assignment]
    __table_args__ = (
        Index(
            "ix_memcell_session",
            # Scope prefix leads, so the lookup is partitioned by
            # (app, project) and rows from different spaces never share an
            # index slot ...
            "app_id",
            "project_id",
            # ... followed by the session window columns.
            "session_id",
            "track",
            "timestamp",
        ),
    )

    # --- identity -----------------------------------------------------
    memcell_id: str = Field(primary_key=True)

    # --- scope --------------------------------------------------------
    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """Scope segments for app and project. Both fall back to ``"default"``
    so the columns are always populated; callers working in a non-default
    space supply real ids."""

    # --- session window -----------------------------------------------
    session_id: str = Field(index=True)
    track: str
    raw_type: str

    # --- archived content (JSON text) ---------------------------------
    # The two ``*_ids_json`` columns hold JSON arrays of audit ids.
    message_ids_json: str
    sender_ids_json: str
    payload_json: str
    """Output of ``MemCell.model_dump_json()``: the complete algo-side
    MemCell (its items are chat messages / tool calls), captured at boundary
    time. Offline strategies deserialise it back into an algo MemCell long
    after ``unprocessed_buffer`` has dropped the staging rows."""

    timestamp: UtcDatetime = Field(sa_type=UtcDateTimeColumn)

View File

@ -0,0 +1,52 @@
"""``unprocessed_buffer`` — chat-stream messages waiting on boundary detection.
Schema property: presence in the table = pending; absence = consumed.
There is no ``consumed`` column. Pipeline uses ``replace(session, track,
remaining)`` to atomically rewrite the (session, track) slice each turn.
"""
from __future__ import annotations
from sqlalchemy import Index
from everos.component.utils.datetime import UtcDatetime
from everos.core.persistence.sqlite import BaseTable, Field
from everos.core.persistence.sqlite.base import UtcDateTimeColumn
class UnprocessedBuffer(BaseTable, table=True):
    """Staging row for one message that has not been processed yet.

    Keyed by ``message_id``.
    """

    __tablename__ = "unprocessed_buffer"  # type: ignore[assignment]
    __table_args__ = (
        Index(
            "ix_unprocessed_buffer_lookup",
            # Scope prefix leads, so the (session, track) staging slice is
            # partitioned by app / project and different spaces never share
            # a buffer window ...
            "app_id",
            "project_id",
            # ... followed by the staging slice columns.
            "session_id",
            "track",
            "timestamp",
        ),
    )

    # --- identity -----------------------------------------------------
    message_id: str = Field(primary_key=True)

    # --- scope --------------------------------------------------------
    app_id: str = Field(default="default")
    project_id: str = Field(default="default")
    """Scope segments for app and project (each defaults to
    ``"default"``)."""

    # --- staging slice ------------------------------------------------
    session_id: str = Field(index=True)
    track: str = Field(index=True)

    # --- message envelope ---------------------------------------------
    sender_id: str
    sender_name: str | None = None
    role: str
    timestamp: UtcDatetime = Field(sa_type=UtcDateTimeColumn)

    # --- message content ----------------------------------------------
    # Raw ContentItem list, JSON-serialised (mirrors src_old
    # RawMessage.content_items). Retaining the original multimodal payload
    # lets a future parser reach back to image / audio / etc.
    content_items_json: str
    # Plain text derived by concatenating the ``type=text`` entries; this
    # is what the downstream LLM-facing extractors and the md writer
    # consume today.
    text: str
    tool_calls_json: str | None = None
    tool_call_id: str | None = None