chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
This commit is contained in:
Elliot Chen
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions

264
tests/run_locomo_10x3.sh Executable file
View File

@ -0,0 +1,264 @@
#!/usr/bin/env bash
# Run the LoCoMo benchmark across all 10 conversations × 3 retrieval
# methods (keyword, vector, hybrid).
#
# Wraps tests/run_locomo_batch.sh with the defaults that match the
# everos post-fix benchmark protocol:
# - all 10 LoCoMo conversations (conv 0..9)
# - keyword + vector + hybrid (agentic is skipped — costs 2-3× more
# LLM tokens and the rerank loop hasn't been benchmarked yet)
# - speaker_a partition (the LoCoMo "Plan C" single-owner eval)
# - judge runs = 1 (single-pass LLM judge, no majority vote)
# - top-K 10
#
# Two ingest modes:
#
# --skip-add (default) reuse the corpus that already lives at
# ~/.everos-report-corpus. Skips the
# ~5 min/conv ingest phase × 10 = ~50 min
# saved. Note: the existing corpus may
# still carry artefacts from the OLD code
# (conv-5 missing episode rows,
# MRAG score=0.0 facts). For a strictly
# clean benchmark of the *fixed* code,
# use --fresh-corpus instead.
#
# --fresh-corpus wipe ~/.everos-report-corpus, restart
# the server, and re-ingest every conv
# with the current bug-fixed cascade.
# Adds ~50 min to the run.
#
# Server must already be running on :8000 with the current code loaded
# (i.e. the OR + optimize fixes). Health check confirmed before launch.
#
# Output structure:
#
# benchmark_results/run_<ts>_10x3/
# ├── conv0.json ... conv9.json ← per-conv final results
# ├── conv0_checkpoints/ ... ← phase-level snapshots
# └── SUMMARY.md ← cross-conv accuracy table
set -euo pipefail
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd)"
# ── Defaults ──────────────────────────────────────────────────────────
BASE_URL="${BASE_URL:-http://localhost:8000}"
DATA_PATH="${DATA_PATH:-data/locomo10.json}"
MEMORY_ROOT="${EVEROS_MEMORY__ROOT:-$HOME/.everos-report-corpus}"
MODE="skip-add" # default; toggle via --fresh-corpus
TS="$(date +%Y%m%d_%H%M%S)"
OUTPUT_ROOT="$REPO_ROOT/benchmark_results/run_${TS}_10x3"
# ── Parse args ────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
case "$1" in
--skip-add) MODE="skip-add"; shift ;;
--fresh-corpus) MODE="fresh"; shift ;;
--base-url) BASE_URL="$2"; shift 2 ;;
--memory-root) MEMORY_ROOT="$2"; shift 2 ;;
--output-root) OUTPUT_ROOT="$2"; shift 2 ;;
-h|--help)
grep -E "^# " "$0" | sed 's/^# //;s/^#//'
exit 0
;;
*)
echo "unknown arg: $1" >&2
exit 1
;;
esac
done
# ── Preflight ─────────────────────────────────────────────────────────
echo "═════════════════════════════════════════════════════════════════"
echo " LoCoMo 10 × 3 benchmark"
echo "═════════════════════════════════════════════════════════════════"
echo " mode: $MODE"
echo " base_url: $BASE_URL"
echo " memory_root: $MEMORY_ROOT"
echo " output: $OUTPUT_ROOT"
echo
# 1. Server up?
if ! curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
echo "❌ server at $BASE_URL is not responding"
echo " start with: EVEROS_MEMORY__ROOT=$MEMORY_ROOT PYTHONPATH=src \\"
echo " python -m everos.entrypoints.cli.main server start --port 8000"
exit 1
fi
echo "✓ server healthy"
# 2. LLM env (test_locomo.py reads bare LLM_* — bridge from EVEROS_LLM__*)
if [[ -z "${LLM_API_KEY:-}" ]] || [[ -z "${LLM_BASE_URL:-}" ]] || [[ -z "${LLM_MODEL:-}" ]]; then
if [[ -f "$REPO_ROOT/.env" ]]; then
set -a
# shellcheck source=/dev/null
source <(grep -E "^EVEROS_LLM__" "$REPO_ROOT/.env" | sed 's/EVEROS_LLM__/LLM_/')
set +a
fi
fi
if [[ -z "${LLM_API_KEY:-}" ]]; then
echo "❌ LLM_API_KEY not set (and .env has no EVEROS_LLM__API_KEY to bridge from)"
exit 1
fi
echo "✓ LLM credentials: model=$LLM_MODEL @ $LLM_BASE_URL"
echo
# 3. Fresh corpus mode → wipe + restart server
if [[ "$MODE" == "fresh" ]]; then
echo "═════════════════════════════════════════════════════════════════"
echo " --fresh-corpus: wiping $MEMORY_ROOT and restarting server"
echo "═════════════════════════════════════════════════════════════════"
# Find and kill existing server (best-effort)
pids="$(pgrep -f "everos.entrypoints.cli.main server" || true)"
if [[ -n "$pids" ]]; then
echo " stopping server pid(s): $pids"
# shellcheck disable=SC2086
kill $pids
sleep 3
fi
rm -rf "$MEMORY_ROOT"
mkdir -p "$MEMORY_ROOT"
# Restart in background; the server picks up the empty memory root.
echo " starting fresh server..."
(
cd "$REPO_ROOT"
EVEROS_MEMORY__ROOT="$MEMORY_ROOT" \
PYTHONPATH=src \
nohup python -m everos.entrypoints.cli.main server start --port 8000 \
> /tmp/everos-server-${TS}.log 2>&1 &
echo " server pid=$!"
)
# Wait for lifespan ready
for i in $(seq 1 60); do
if curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
echo " server ready after ${i}s"
break
fi
sleep 1
done
if ! curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
echo "❌ server failed to come up; see /tmp/everos-server-${TS}.log"
exit 1
fi
echo
fi
# ── Build the batch invocation ────────────────────────────────────────
BATCH_ARGS=(
--conv-indices 0-9
--methods keyword,vector,hybrid
--base-url "$BASE_URL"
--top-k 10
--eval-owner speaker_a
--judge-runs 1
--output-root "$OUTPUT_ROOT"
)
[[ "$MODE" == "skip-add" ]] && BATCH_ARGS+=( --skip-add )
echo "═════════════════════════════════════════════════════════════════"
echo " Launching: tests/run_locomo_batch.sh ${BATCH_ARGS[*]}"
echo "═════════════════════════════════════════════════════════════════"
echo
cd "$REPO_ROOT"
bash tests/run_locomo_batch.sh "${BATCH_ARGS[@]}"
# ── Summary markdown ──────────────────────────────────────────────────
echo
echo "═════════════════════════════════════════════════════════════════"
echo " Rendering SUMMARY.md"
echo "═════════════════════════════════════════════════════════════════"
python - <<PYEOF
import json
from pathlib import Path
root = Path("$OUTPUT_ROOT")
out_md = root / "SUMMARY.md"
files = sorted(root.glob("conv*.json"))
if not files:
print(f"no result files under {root}")
raise SystemExit
methods_seen: list[str] = []
for p in files:
d = json.load(open(p))
for m in d["methods"]:
if m not in methods_seen:
methods_seen.append(m)
cat_names = {"1": "single-hop", "2": "multi-hop", "3": "open-domain", "4": "temporal"}
lines: list[str] = []
lines.append(f"# LoCoMo 10×3 — run_${TS}\n")
lines.append(
f"- mode: `{'$MODE'}`\n"
f"- base_url: \`$BASE_URL\`\n"
f"- memory_root: \`$MEMORY_ROOT\`\n"
f"- methods: \`{', '.join(methods_seen)}\`\n"
)
# Per-conv table
lines.append("\n## Per-conv accuracy\n\n")
lines.append("| conv | " + " | ".join(f"**{m}**" for m in methods_seen) + " |\n")
lines.append("|---|" + "|".join(["---"] * len(methods_seen)) + "|\n")
agg_correct = {m: 0 for m in methods_seen}
agg_total = {m: 0 for m in methods_seen}
cat_correct: dict[str, dict[str, int]] = {m: {} for m in methods_seen}
cat_total: dict[str, dict[str, int]] = {m: {} for m in methods_seen}
for p in files:
d = json.load(open(p))
cells = []
for m in methods_seen:
mr = d["methods"].get(m)
if mr is None:
cells.append("—")
continue
s = mr["summary"]
cells.append(f"{s['accuracy']*100:.1f}%")
agg_correct[m] += s["correct"]
agg_total[m] += s["total"]
for cat, st in s["category_stats"].items():
cat_correct[m][cat] = cat_correct[m].get(cat, 0) + st["correct"]
cat_total[m][cat] = cat_total[m].get(cat, 0) + st["total"]
lines.append(f"| {p.stem} | " + " | ".join(cells) + " |\n")
# Overall
overall = []
for m in methods_seen:
if agg_total[m]:
overall.append(f"**{agg_correct[m]/agg_total[m]*100:.1f}%**")
else:
overall.append("—")
lines.append(f"| **OVERALL** | " + " | ".join(overall) + " |\n")
# Per-category
lines.append("\n## Per-category accuracy (across all 10 convs)\n\n")
lines.append("| cat | kind | " + " | ".join(f"**{m}**" for m in methods_seen) + " |\n")
lines.append("|---|---|" + "|".join(["---"] * len(methods_seen)) + "|\n")
for cat in ["1", "2", "3", "4"]:
cells = []
for m in methods_seen:
tot = cat_total[m].get(cat, 0)
if tot:
cells.append(f"{cat_correct[m][cat] / tot * 100:.1f}%")
else:
cells.append("—")
lines.append(f"| {cat} | {cat_names[cat]} | " + " | ".join(cells) + " |\n")
out_md.write_text("".join(lines), encoding="utf-8")
print(f" → {out_md}")
print()
print("".join(lines))
PYEOF
echo
echo "Done."