md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter. Engineering surface: - Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr). - GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg). - Commit messages follow Conventional Commits, enforced by gitlint. - make lint also enforces datetime two-zone discipline and OpenAPI drift.
265 lines
10 KiB
Bash
Executable File
265 lines
10 KiB
Bash
Executable File
#!/usr/bin/env bash
# Run the LoCoMo benchmark across all 10 conversations × 3 retrieval
# methods (keyword, vector, hybrid).
#
# Wraps tests/run_locomo_batch.sh with the defaults that match the
# everos post-fix benchmark protocol:
#   - all 10 LoCoMo conversations (conv 0..9)
#   - keyword + vector + hybrid (agentic is skipped — it costs 2-3× more
#     LLM tokens and the rerank loop hasn't been benchmarked yet)
#   - speaker_a partition (the LoCoMo "Plan C" single-owner eval)
#   - judge runs = 1 (single-pass LLM judge, no majority vote)
#   - top-K 10
#
# Two ingest modes:
#
#   --skip-add (default)   reuse the corpus that already lives at
#                          ~/.everos-report-corpus. Skips the ingest
#                          phase (~5 min/conv × 10 = ~50 min saved).
#                          Note: the existing corpus may still carry
#                          artefacts from the OLD code (conv-5 missing
#                          episode rows, MRAG score=0.0 facts). For a
#                          strictly clean benchmark of the *fixed* code,
#                          use --fresh-corpus instead.
#
#   --fresh-corpus         wipe ~/.everos-report-corpus, restart
#                          the server, and re-ingest every conv
#                          with the current bug-fixed cascade.
#                          Adds ~50 min to the run.
#
# Other options:
#
#   --base-url URL         server to benchmark (default: $BASE_URL, else
#                          http://localhost:8000)
#   --memory-root DIR      corpus directory (default: $EVEROS_MEMORY__ROOT,
#                          else ~/.everos-report-corpus)
#   --output-root DIR      where results are written (default:
#                          benchmark_results/run_<ts>_10x3 in the repo)
#   -h, --help             print this header and exit
#
# Server must already be running on :8000 with the current code loaded
# (i.e. the OR + optimize fixes). A health check confirms this before
# launch.
#
# Output structure:
#
#   benchmark_results/run_<ts>_10x3/
#   ├── conv0.json ... conv9.json     ← per-conv final results
#   ├── conv0_checkpoints/ ...        ← phase-level snapshots
#   └── SUMMARY.md                    ← cross-conv accuracy table

# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Absolute directory holding this script, and the repository one level up.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." > /dev/null 2>&1 && pwd)"

# ── Defaults ──────────────────────────────────────────────────────────
# BASE_URL and DATA_PATH keep a non-empty value inherited from the
# environment; otherwise they fall back to the literals below.
# NOTE(review): DATA_PATH is not read anywhere else in this script —
# confirm whether tests/run_locomo_batch.sh is meant to receive it.
: "${BASE_URL:=http://localhost:8000}"
: "${DATA_PATH:=data/locomo10.json}"
MEMORY_ROOT="${EVEROS_MEMORY__ROOT:-${HOME}/.everos-report-corpus}"
MODE="skip-add"  # default; toggle via --fresh-corpus
TS="$(date +%Y%m%d_%H%M%S)"
OUTPUT_ROOT="${REPO_ROOT}/benchmark_results/run_${TS}_10x3"

# ── Parse args ────────────────────────────────────────────────────────
# Command-line flags override the MODE / BASE_URL / MEMORY_ROOT /
# OUTPUT_ROOT values set before this loop. Value-taking options are
# checked explicitly: under `set -u`, reading a missing "$2" would abort
# with an opaque "unbound variable" error instead of a usage message.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --skip-add)     MODE="skip-add"; shift ;;
        --fresh-corpus) MODE="fresh"; shift ;;
        --base-url|--memory-root|--output-root)
            if [[ $# -lt 2 ]]; then
                echo "missing value for $1" >&2
                exit 1
            fi
            case "$1" in
                --base-url)    BASE_URL="$2" ;;
                --memory-root) MEMORY_ROOT="$2" ;;
                --output-root) OUTPUT_ROOT="$2" ;;
            esac
            shift 2
            ;;
        -h|--help)
            # Print only the leading header comment: skip the shebang, strip
            # the "# " prefix, keep bare "#" lines as blank separators, and
            # stop at the first non-comment line so that section comments
            # further down the script do not leak into the help text.
            awk 'NR == 1 && /^#!/ { next }
                 /^#/ { sub(/^# ?/, ""); print; next }
                 { exit }' "$0"
            exit 0
            ;;
        *)
            echo "unknown arg: $1" >&2
            exit 1
            ;;
    esac
done

# ── Preflight ─────────────────────────────────────────────────────────
# Echo the effective configuration, then verify the two external
# prerequisites (reachable server, LLM credentials) before any work runs.
echo "═════════════════════════════════════════════════════════════════"
echo " LoCoMo 10 × 3 benchmark"
echo "═════════════════════════════════════════════════════════════════"
echo " mode: $MODE"
echo " base_url: $BASE_URL"
echo " memory_root: $MEMORY_ROOT"
echo " output: $OUTPUT_ROOT"
echo

# 1. Server up?
if ! curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
    {
        echo "❌ server at $BASE_URL is not responding"
        echo " start with: EVEROS_MEMORY__ROOT=$MEMORY_ROOT PYTHONPATH=src \\"
        echo " python -m everos.entrypoints.cli.main server start --port 8000"
    } >&2
    exit 1
fi
echo "✓ server healthy"

# 2. LLM env (test_locomo.py reads bare LLM_* — bridge from EVEROS_LLM__*)
# When any of the three variables is missing, import every EVEROS_LLM__*
# assignment from the repo's .env with the prefix rewritten to LLM_.
# `set -a` exports whatever the sourced snippet assigns.
if [[ -z "${LLM_API_KEY:-}" || -z "${LLM_BASE_URL:-}" || -z "${LLM_MODEL:-}" ]]; then
    if [[ -f "$REPO_ROOT/.env" ]]; then
        set -a
        # shellcheck source=/dev/null
        source <(grep -E "^EVEROS_LLM__" "$REPO_ROOT/.env" | sed 's/EVEROS_LLM__/LLM_/')
        set +a
    fi
fi
if [[ -z "${LLM_API_KEY:-}" ]]; then
    echo "❌ LLM_API_KEY not set (and .env has no EVEROS_LLM__API_KEY to bridge from)" >&2
    exit 1
fi
# LLM_BASE_URL and LLM_MODEL are expanded just below; under `set -u` an
# unset one would kill the script with "unbound variable", so report the
# missing names explicitly instead.
missing_llm_vars=()
for llm_var in LLM_BASE_URL LLM_MODEL; do
    [[ -n "${!llm_var:-}" ]] || missing_llm_vars+=("$llm_var")
done
if [[ ${#missing_llm_vars[@]} -gt 0 ]]; then
    echo "❌ ${missing_llm_vars[*]} not set (and .env has no matching EVEROS_LLM__* entry to bridge from)" >&2
    exit 1
fi
echo "✓ LLM credentials: model=$LLM_MODEL @ $LLM_BASE_URL"
echo

# 3. Fresh corpus mode → wipe + restart server
if [[ "$MODE" == "fresh" ]]; then
    echo "═════════════════════════════════════════════════════════════════"
    echo " --fresh-corpus: wiping $MEMORY_ROOT and restarting server"
    echo "═════════════════════════════════════════════════════════════════"

    # Refuse catastrophic wipe targets (empty string, "/", or the home
    # directory itself) before stopping the server or deleting anything.
    home_dir="${HOME:-}"
    case "${MEMORY_ROOT%/}" in
        ""|"${home_dir%/}")
            echo "❌ refusing to wipe memory root '$MEMORY_ROOT'" >&2
            exit 1
            ;;
    esac

    # Find and kill existing server (best-effort)
    server_pattern="everos.entrypoints.cli.main server"
    pids="$(pgrep -f "$server_pattern" || true)"
    if [[ -n "$pids" ]]; then
        echo " stopping server pid(s): $pids"
        # A pid may already be gone by the time kill runs; without the
        # `|| true`, `set -e` would abort the whole benchmark here.
        # shellcheck disable=SC2086
        kill $pids 2>/dev/null || true
        # Give the server up to 10 s to exit, continuing as soon as no
        # matching process is left.
        for _ in $(seq 1 10); do
            pgrep -f "$server_pattern" > /dev/null || break
            sleep 1
        done
    fi

    rm -rf "$MEMORY_ROOT"
    mkdir -p "$MEMORY_ROOT"

    # Restart in background; the server picks up the empty memory root.
    # NOTE(review): the port is fixed at 8000 while readiness is probed at
    # $BASE_URL — a non-default --base-url will never see this server.
    echo " starting fresh server..."
    server_log="/tmp/everos-server-${TS}.log"
    (
        cd "$REPO_ROOT"
        EVEROS_MEMORY__ROOT="$MEMORY_ROOT" \
        PYTHONPATH=src \
            nohup python -m everos.entrypoints.cli.main server start --port 8000 \
            > "$server_log" 2>&1 &
        echo " server pid=$!"
    )

    # Wait for lifespan ready: probe the health endpoint once per second
    # for up to 60 attempts.
    server_ready=0
    for i in $(seq 1 60); do
        if curl -fsS -o /dev/null "$BASE_URL/health" 2>/dev/null; then
            echo " server ready after ${i}s"
            server_ready=1
            break
        fi
        sleep 1
    done
    if [[ "$server_ready" -ne 1 ]]; then
        echo "❌ server failed to come up; see $server_log" >&2
        exit 1
    fi
    echo
fi

# ── Build the batch invocation ────────────────────────────────────────
# Fixed protocol arguments first; --skip-add is appended only when the
# existing corpus is being reused.
BATCH_ARGS=()
BATCH_ARGS+=( --conv-indices 0-9 )
BATCH_ARGS+=( --methods keyword,vector,hybrid )
BATCH_ARGS+=( --base-url "$BASE_URL" )
BATCH_ARGS+=( --top-k 10 )
BATCH_ARGS+=( --eval-owner speaker_a )
BATCH_ARGS+=( --judge-runs 1 )
BATCH_ARGS+=( --output-root "$OUTPUT_ROOT" )
if [[ "$MODE" == "skip-add" ]]; then
    BATCH_ARGS+=( --skip-add )
fi

launch_rule="═════════════════════════════════════════════════════════════════"
echo "$launch_rule"
echo " Launching: tests/run_locomo_batch.sh ${BATCH_ARGS[*]}"
echo "$launch_rule"
echo

# The batch script is addressed relative to the repository root.
cd "$REPO_ROOT"
bash tests/run_locomo_batch.sh "${BATCH_ARGS[@]}"

# ── Summary markdown ──────────────────────────────────────────────────
echo
echo "═════════════════════════════════════════════════════════════════"
echo " Rendering SUMMARY.md"
echo "═════════════════════════════════════════════════════════════════"

# The heredoc delimiter is quoted so the shell expands nothing inside the
# Python program. Run values are passed as arguments instead of being
# spliced into the source: with an unquoted heredoc, a backtick pair in the
# Markdown was executed as a shell command (leaving the "mode" line empty),
# and a quote or backslash in a path would have broken the Python syntax.
python - "$OUTPUT_ROOT" "$TS" "$MODE" "$BASE_URL" "$MEMORY_ROOT" <<'PYEOF'
"""Aggregate the per-conversation result files into SUMMARY.md.

Arguments (argv[1:6]): output root, run timestamp, ingest mode, base URL,
memory root. Reads every conv*.json under the output root, writes
SUMMARY.md next to them, and echoes the rendered Markdown to stdout.
"""
import json
import sys
from pathlib import Path

output_root, run_ts, mode, base_url, memory_root = sys.argv[1:6]

root = Path(output_root)
out_md = root / "SUMMARY.md"
files = sorted(root.glob("conv*.json"))
if not files:
    print(f"no result files under {root}")
    raise SystemExit

# Parse each result file exactly once, closing the handle afterwards.
results: list[tuple[str, dict]] = []
for p in files:
    with p.open(encoding="utf-8") as fh:
        results.append((p.stem, json.load(fh)))

# Column order = first appearance of each method across the result files.
methods_seen: list[str] = []
for _, d in results:
    for m in d["methods"]:
        if m not in methods_seen:
            methods_seen.append(m)

cat_names = {"1": "single-hop", "2": "multi-hop", "3": "open-domain", "4": "temporal"}

lines: list[str] = []
lines.append(f"# LoCoMo 10×3 — run_{run_ts}\n")
lines.append(
    f"- mode: `{mode}`\n"
    f"- base_url: `{base_url}`\n"
    f"- memory_root: `{memory_root}`\n"
    f"- methods: `{', '.join(methods_seen)}`\n"
)

# Per-conv table
lines.append("\n## Per-conv accuracy\n\n")
lines.append("| conv | " + " | ".join(f"**{m}**" for m in methods_seen) + " |\n")
lines.append("|---|" + "|".join(["---"] * len(methods_seen)) + "|\n")

agg_correct = {m: 0 for m in methods_seen}
agg_total = {m: 0 for m in methods_seen}
cat_correct: dict[str, dict[str, int]] = {m: {} for m in methods_seen}
cat_total: dict[str, dict[str, int]] = {m: {} for m in methods_seen}

for stem, d in results:
    cells = []
    for m in methods_seen:
        mr = d["methods"].get(m)
        if mr is None:
            # This conversation has no entry for the method.
            cells.append("—")
            continue
        s = mr["summary"]
        cells.append(f"{s['accuracy']*100:.1f}%")
        agg_correct[m] += s["correct"]
        agg_total[m] += s["total"]
        for cat, st in s["category_stats"].items():
            cat_correct[m][cat] = cat_correct[m].get(cat, 0) + st["correct"]
            cat_total[m][cat] = cat_total[m].get(cat, 0) + st["total"]
    lines.append(f"| {stem} | " + " | ".join(cells) + " |\n")

# Overall
overall = []
for m in methods_seen:
    if agg_total[m]:
        overall.append(f"**{agg_correct[m]/agg_total[m]*100:.1f}%**")
    else:
        overall.append("—")
lines.append("| **OVERALL** | " + " | ".join(overall) + " |\n")

# Per-category (only the four categories named in cat_names are tabulated)
lines.append("\n## Per-category accuracy (across all 10 convs)\n\n")
lines.append("| cat | kind | " + " | ".join(f"**{m}**" for m in methods_seen) + " |\n")
lines.append("|---|---|" + "|".join(["---"] * len(methods_seen)) + "|\n")
for cat in ["1", "2", "3", "4"]:
    cells = []
    for m in methods_seen:
        tot = cat_total[m].get(cat, 0)
        if tot:
            cells.append(f"{cat_correct[m][cat] / tot * 100:.1f}%")
        else:
            cells.append("—")
    lines.append(f"| {cat} | {cat_names[cat]} | " + " | ".join(cells) + " |\n")

out_md.write_text("".join(lines), encoding="utf-8")
print(f" → {out_md}")
print()
print("".join(lines))
PYEOF

# Final marker: one blank line, then the completion message.
printf '\n%s\n' "Done."