Files
EverOS/tests/unit/test_infra/test_ome/test_crash_recovery.py
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

150 lines
4.5 KiB
Python

from __future__ import annotations
from datetime import timedelta
from pathlib import Path
import pytest
from everos.component.utils.datetime import get_now_with_timezone, to_iso_format
from everos.infra.ome._background.crash_recovery import scan_and_resume
from everos.infra.ome._stores.run_record import RunRecordStore
from everos.infra.ome._stores.storage import OMEStorage
from everos.infra.ome.records import RunStatus
@pytest.fixture
async def rec_store(tmp_path: Path) -> RunRecordStore:
    """Provide a RunRecordStore over a freshly initialised per-test database.

    The database file ``ome.db`` is created under pytest's ``tmp_path``, so
    each test gets its own isolated storage.
    """
    db_file = tmp_path / "ome.db"
    backing = OMEStorage(db_path=db_file)
    await backing.init()
    store = RunRecordStore(storage=backing, max_records_per_strategy=1000)
    return store
@pytest.mark.asyncio
async def test_marks_old_running_as_crashed(rec_store: RunRecordStore) -> None:
    """A RUNNING row older than the timeout becomes CRASHED and is re-queued.

    The re-queued job must carry the original strategy name, event topic,
    event payload and max-retries snapshot, but a different run id.
    """
    stale_id = "r_old"
    await rec_store.mark_running(
        run_id=stale_id,
        strategy_name="s",
        attempt=0,
        event_topic="x:E",
        event_payload="{}",
        max_retries_snapshot=1,
    )

    # Backdate started_at by two hours so the row is older than the
    # 1800-second timeout passed to scan_and_resume below.
    async with rec_store._storage.connect() as db:
        two_hours_ago = get_now_with_timezone() - timedelta(hours=2)
        backdated = to_iso_format(two_hours_ago)
        await db.execute(
            "UPDATE run_record SET started_at = ? WHERE run_id = ?",
            (backdated, stale_id),
        )
        await db.commit()

    requeued: list = []

    # Parameter names are kept as-is in case the hook is invoked by keyword.
    async def record_job(name, run_id, event_topic, event_payload, max_retries):
        requeued.append((name, run_id, event_topic, event_payload, max_retries))

    await scan_and_resume(
        run_record_store=rec_store,
        timeout_seconds=1800,
        add_job=record_job,
    )

    stored = await rec_store.get(stale_id)
    assert stored.status == RunStatus.CRASHED

    assert len(requeued) == 1
    strategy, fresh_run_id, topic, payload, retries = requeued[0]
    assert strategy == "s"
    assert fresh_run_id != stale_id
    assert topic == "x:E"
    assert payload == "{}"
    assert retries == 1
@pytest.mark.asyncio
async def test_recent_running_skipped(rec_store: RunRecordStore) -> None:
    """A just-started RUNNING row is left untouched and nothing is re-queued."""
    fresh_id = "r_fresh"
    await rec_store.mark_running(
        run_id=fresh_id,
        strategy_name="s",
        attempt=0,
        event_topic="x:E",
        event_payload="{}",
        max_retries_snapshot=1,
    )

    captured: list = []

    # Only positional arguments are recorded; any call at all leaves the
    # list non-empty and fails the final assertion.
    async def capture_job(*positional, **_named):
        captured.append(positional)

    await scan_and_resume(
        run_record_store=rec_store,
        timeout_seconds=1800,
        add_job=capture_job,
    )

    stored = await rec_store.get(fresh_id)
    assert stored.status == RunStatus.RUNNING
    assert captured == []
@pytest.mark.parametrize("bad_timeout", [0, -1])
@pytest.mark.asyncio
async def test_scan_and_resume_non_positive_timeout_raises(
    rec_store: RunRecordStore, bad_timeout: int
) -> None:
    """N6: a zero or negative timeout is rejected with ValueError.

    The call must fail fast instead of quietly doing nothing.
    """

    async def _ignore_job(*_args: object, **_kwargs: object) -> None:
        return None

    expected_message = r"timeout_seconds must be > 0"
    with pytest.raises(ValueError, match=expected_message):
        await scan_and_resume(
            run_record_store=rec_store,
            timeout_seconds=bad_timeout,
            add_job=_ignore_job,
        )
@pytest.mark.asyncio
async def test_add_job_failure_does_not_abort_loop(
    rec_store: RunRecordStore,
) -> None:
    """A failing add_job for one stale row must not stop the others.

    mark_crashed runs before add_job, so both rows end up CRASHED even
    though add_job raises for the first one it sees. This pins the
    at-most-once contract documented in the module docstring.
    """
    stale_ids = ("r_old_1", "r_old_2")
    for stale_id in stale_ids:
        await rec_store.mark_running(
            run_id=stale_id,
            strategy_name="s",
            attempt=0,
            event_topic="x:E",
            event_payload="{}",
            max_retries_snapshot=1,
        )

    # Backdate both rows by two hours so each exceeds the 1800-second timeout.
    async with rec_store._storage.connect() as db:
        two_hours_ago = get_now_with_timezone() - timedelta(hours=2)
        backdated = to_iso_format(two_hours_ago)
        await db.execute(
            "UPDATE run_record SET started_at = ? WHERE run_id IN (?, ?)",
            (backdated, *stale_ids),
        )
        await db.commit()

    attempts: list[tuple] = []

    # Fails on the first invocation only; later invocations succeed.
    async def flaky_add_job(name, run_id, event_topic, event_payload, max_retries):
        is_first_call = not attempts
        attempts.append((name, run_id, event_topic, event_payload, max_retries))
        if is_first_call:
            raise RuntimeError("APS jobstore unavailable")

    await scan_and_resume(
        run_record_store=rec_store,
        timeout_seconds=1800,
        add_job=flaky_add_job,
    )

    first, second = [await rec_store.get(stale_id) for stale_id in stale_ids]
    assert first.status == RunStatus.CRASHED
    assert second.status == RunStatus.CRASHED
    assert len(attempts) == 2