chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions
--- a/tests/run_locomo_10x3.sh
+++ b/tests/run_locomo_10x3.sh
@ -0,0 +1,264 @@
+#!/usr/bin/env bash
+# Run the LoCoMo benchmark across all 10 conversations × 3 retrieval
+# methods (keyword, vector, hybrid).
+#
+# Wraps tests/run_locomo_batch.sh with the defaults that match the
+# everos post-fix benchmark protocol:
+#   - all 10 LoCoMo conversations (conv 0..9)
+#   - keyword + vector + hybrid (agentic is skipped — costs 2-3× more
+#     LLM tokens and the rerank loop hasn't been benchmarked yet)
+#   - speaker_a partition (the LoCoMo "Plan C" single-owner eval)
+#   - judge runs = 1 (single-pass LLM judge, no majority vote)
+#   - top-K 10
+#
+# Two ingest modes:
+#
+#   --skip-add (default)        reuse the corpus that already lives at
+#                               ~/.everos-report-corpus. Skips the
+#                               ~5 min/conv ingest phase × 10 = ~50 min
+#                               saved. Note: the existing corpus may
+#                               still carry artefacts from the OLD code
+#                               (conv-5 missing episode rows,
+#                               MRAG score=0.0 facts). For a strictly
+#                               clean benchmark of the *fixed* code,
+#                               use --fresh-corpus instead.
+#
+#   --fresh-corpus              wipe ~/.everos-report-corpus, restart
+#                               the server, and re-ingest every conv
+#                               with the current bug-fixed cascade.
+#                               Adds ~50 min to the run.
+#
+# Server must already be running on :8000 with the current code loaded
+# (i.e. the OR + optimize fixes). Health check confirmed before launch.
+#
+# Output structure:
+#
+#   benchmark_results/run_<ts>_10x3/
+#   ├── conv0.json  ...  conv9.json     ← per-conv final results
+#   ├── conv0_checkpoints/  ...         ← phase-level snapshots
+#   └── SUMMARY.md                       ← cross-conv accuracy table
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd)"
+
+# ── Defaults ──────────────────────────────────────────────────────────
+BASE_URL="${BASE_URL:-http://localhost:8000}"
+DATA_PATH="${DATA_PATH:-data/locomo10.json}"
+MEMORY_ROOT="${EVEROS_MEMORY__ROOT:-$HOME/.everos-report-corpus}"
+MODE="skip-add"        # default; toggle via --fresh-corpus
+TS="$(date +%Y%m%d_%H%M%S)"
+OUTPUT_ROOT="$REPO_ROOT/benchmark_results/run_${TS}_10x3"
+
+# ── Parse args ────────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --skip-add)        MODE="skip-add"; shift ;;
+    --fresh-corpus)    MODE="fresh"; shift ;;
+    --base-url)        BASE_URL="$2"; shift 2 ;;
+    --memory-root)     MEMORY_ROOT="$2"; shift 2 ;;
+    --output-root)     OUTPUT_ROOT="$2"; shift 2 ;;
+    -h|--help)
+      grep -E "^# " "$0" | sed 's/^# //;s/^#//'
+      exit 0
+      ;;
+    *)
+      echo "unknown arg: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# ── Preflight ─────────────────────────────────────────────────────────
+echo "═════════════════════════════════════════════════════════════════"
+echo "  LoCoMo 10 × 3 benchmark"
+echo "═════════════════════════════════════════════════════════════════"
+echo "  mode:        $MODE"
+echo "  base_url:    $BASE_URL"
+echo "  memory_root: $MEMORY_ROOT"
+echo "  output:      $OUTPUT_ROOT"
+echo
+
+# 1. Server up?
+if ! curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
+  echo "❌ server at $BASE_URL is not responding"
+  echo "   start with: EVEROS_MEMORY__ROOT=$MEMORY_ROOT PYTHONPATH=src \\"
+  echo "     python -m everos.entrypoints.cli.main server start --port 8000"
+  exit 1
+fi
+echo "✓ server healthy"
+
+# 2. LLM env (test_locomo.py reads bare LLM_* — bridge from EVEROS_LLM__*)
+if [[ -z "${LLM_API_KEY:-}" ]] || [[ -z "${LLM_BASE_URL:-}" ]] || [[ -z "${LLM_MODEL:-}" ]]; then
+  if [[ -f "$REPO_ROOT/.env" ]]; then
+    set -a
+    # shellcheck source=/dev/null
+    source <(grep -E "^EVEROS_LLM__" "$REPO_ROOT/.env" | sed 's/EVEROS_LLM__/LLM_/')
+    set +a
+  fi
+fi
+if [[ -z "${LLM_API_KEY:-}" ]]; then
+  echo "❌ LLM_API_KEY not set (and .env has no EVEROS_LLM__API_KEY to bridge from)"
+  exit 1
+fi
+echo "✓ LLM credentials: model=$LLM_MODEL @ $LLM_BASE_URL"
+echo
+
+# 3. Fresh corpus mode → wipe + restart server
+if [[ "$MODE" == "fresh" ]]; then
+  echo "═════════════════════════════════════════════════════════════════"
+  echo "  --fresh-corpus: wiping $MEMORY_ROOT and restarting server"
+  echo "═════════════════════════════════════════════════════════════════"
+
+  # Find and kill existing server (best-effort)
+  pids="$(pgrep -f "everos.entrypoints.cli.main server" || true)"
+  if [[ -n "$pids" ]]; then
+    echo "  stopping server pid(s): $pids"
+    # shellcheck disable=SC2086
+    kill $pids
+    sleep 3
+  fi
+
+  rm -rf "$MEMORY_ROOT"
+  mkdir -p "$MEMORY_ROOT"
+
+  # Restart in background; the server picks up the empty memory root.
+  echo "  starting fresh server..."
+  (
+    cd "$REPO_ROOT"
+    EVEROS_MEMORY__ROOT="$MEMORY_ROOT" \
+    PYTHONPATH=src \
+    nohup python -m everos.entrypoints.cli.main server start --port 8000 \
+      > /tmp/everos-server-${TS}.log 2>&1 &
+    echo "  server pid=$!"
+  )
+
+  # Wait for lifespan ready
+  for i in $(seq 1 60); do
+    if curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
+      echo "  server ready after ${i}s"
+      break
+    fi
+    sleep 1
+  done
+  if ! curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
+    echo "❌ server failed to come up; see /tmp/everos-server-${TS}.log"
+    exit 1
+  fi
+  echo
+fi
+
+# ── Build the batch invocation ────────────────────────────────────────
+BATCH_ARGS=(
+  --conv-indices 0-9
+  --methods keyword,vector,hybrid
+  --base-url "$BASE_URL"
+  --top-k 10
+  --eval-owner speaker_a
+  --judge-runs 1
+  --output-root "$OUTPUT_ROOT"
+)
+[[ "$MODE" == "skip-add" ]] && BATCH_ARGS+=( --skip-add )
+
+echo "═════════════════════════════════════════════════════════════════"
+echo "  Launching: tests/run_locomo_batch.sh ${BATCH_ARGS[*]}"
+echo "═════════════════════════════════════════════════════════════════"
+echo
+
+cd "$REPO_ROOT"
+bash tests/run_locomo_batch.sh "${BATCH_ARGS[@]}"
+
+# ── Summary markdown ──────────────────────────────────────────────────
+echo
+echo "═════════════════════════════════════════════════════════════════"
+echo "  Rendering SUMMARY.md"
+echo "═════════════════════════════════════════════════════════════════"
+
+python - <<PYEOF
+import json
+from pathlib import Path
+
+root = Path("$OUTPUT_ROOT")
+out_md = root / "SUMMARY.md"
+files = sorted(root.glob("conv*.json"))
+if not files:
+    print(f"no result files under {root}")
+    raise SystemExit
+
+methods_seen: list[str] = []
+for p in files:
+    d = json.load(open(p))
+    for m in d["methods"]:
+        if m not in methods_seen:
+            methods_seen.append(m)
+
+cat_names = {"1": "single-hop", "2": "multi-hop", "3": "open-domain", "4": "temporal"}
+
+lines: list[str] = []
+lines.append(f"# LoCoMo 10×3 — run_${TS}\n")
+lines.append(
+    f"- mode: `{'$MODE'}`\n"
+    f"- base_url: \`$BASE_URL\`\n"
+    f"- memory_root: \`$MEMORY_ROOT\`\n"
+    f"- methods: \`{', '.join(methods_seen)}\`\n"
+)
+
+# Per-conv table
+lines.append("\n## Per-conv accuracy\n\n")
+lines.append("| conv | " + " | ".join(f"**{m}**" for m in methods_seen) + " |\n")
+lines.append("|---|" + "|".join(["---"] * len(methods_seen)) + "|\n")
+
+agg_correct = {m: 0 for m in methods_seen}
+agg_total = {m: 0 for m in methods_seen}
+cat_correct: dict[str, dict[str, int]] = {m: {} for m in methods_seen}
+cat_total: dict[str, dict[str, int]] = {m: {} for m in methods_seen}
+
+for p in files:
+    d = json.load(open(p))
+    cells = []
+    for m in methods_seen:
+        mr = d["methods"].get(m)
+        if mr is None:
+            cells.append("—")
+            continue
+        s = mr["summary"]
+        cells.append(f"{s['accuracy']*100:.1f}%")
+        agg_correct[m] += s["correct"]
+        agg_total[m] += s["total"]
+        for cat, st in s["category_stats"].items():
+            cat_correct[m][cat] = cat_correct[m].get(cat, 0) + st["correct"]
+            cat_total[m][cat] = cat_total[m].get(cat, 0) + st["total"]
+    lines.append(f"| {p.stem} | " + " | ".join(cells) + " |\n")
+
+# Overall
+overall = []
+for m in methods_seen:
+    if agg_total[m]:
+        overall.append(f"**{agg_correct[m]/agg_total[m]*100:.1f}%**")
+    else:
+        overall.append("—")
+lines.append(f"| **OVERALL** | " + " | ".join(overall) + " |\n")
+
+# Per-category
+lines.append("\n## Per-category accuracy (across all 10 convs)\n\n")
+lines.append("| cat | kind | " + " | ".join(f"**{m}**" for m in methods_seen) + " |\n")
+lines.append("|---|---|" + "|".join(["---"] * len(methods_seen)) + "|\n")
+for cat in ["1", "2", "3", "4"]:
+    cells = []
+    for m in methods_seen:
+        tot = cat_total[m].get(cat, 0)
+        if tot:
+            cells.append(f"{cat_correct[m][cat] / tot * 100:.1f}%")
+        else:
+            cells.append("—")
+    lines.append(f"| {cat} | {cat_names[cat]} | " + " | ".join(cells) + " |\n")
+
+out_md.write_text("".join(lines), encoding="utf-8")
+print(f"  → {out_md}")
+print()
+print("".join(lines))
+PYEOF
+
+echo
+echo "Done."