Files
EverOS/tests/run_locomo_batch.sh
Elliot Chen 518b8eca85 chore: initialize EverOS 1.0.0
md-first memory extraction framework for AI agents.

Markdown is the single source of truth; SQLite holds state and LanceDB
provides the rebuildable vector + BM25 + scalar index. The codebase follows
a single-direction DDD layering (entrypoints -> service -> memory -> infra,
with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in
  .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors
  the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-06 07:33:17 +08:00

288 lines
11 KiB
Bash
Executable File

#!/usr/bin/env bash
# Batch driver for LoCoMo benchmark across multiple conversations + methods.
#
# Wraps tests/test_locomo.py in an outer ``--conv-index`` loop. test_locomo.py
# already loops over ``--methods`` internally, so one invocation per
# conversation runs the full method matrix for that conv.
#
# Per-conv outputs (separate JSON + checkpoint dir) live under
# ``benchmark_results/run_<timestamp>/conv<N>.json`` so reports never collide.
# An aggregate accuracy table is printed at the end.
#
# Examples
# ────────
# # all 10 convs, hybrid only:
# bash tests/run_locomo_batch.sh --conv-indices 0-9 --methods hybrid
#
# # 3 specific convs, two methods, skip the ~5min Add phase (corpus already loaded):
# bash tests/run_locomo_batch.sh \
# --conv-indices 0,3,7 --methods keyword,hybrid --skip-add
#
# # one conv, all 4 methods comparison:
# bash tests/run_locomo_batch.sh --conv-indices 0 --methods keyword,vector,hybrid,agentic
set -euo pipefail
# ── Defaults (override via flags) ─────────────────────────────────────
BASE_URL="${BASE_URL:-http://localhost:8000}"
DATA_PATH="${DATA_PATH:-data/locomo10.json}"
CONV_INDICES="${CONV_INDICES:-0}"
METHODS="${METHODS:-hybrid}"
TOP_K="${TOP_K:-10}"
EVAL_OWNER="${EVAL_OWNER:-speaker_a}"
JUDGE_RUNS="${JUDGE_RUNS:-1}"
SKIP_ADD="false"
OUTPUT_ROOT=""
CONCURRENCY="${CONCURRENCY:-1}"
# Default to polling cascade pending==0 (not fixed sleep). Falls back to
# ~/.everos to match the server's default data root; override via env or
# EVEROS_MEMORY__ROOT (which the server consumes). post-flush-wait becomes
# the MAX wait when corpus-path is set.
CORPUS_PATH="${CORPUS_PATH:-${EVEROS_MEMORY__ROOT:-$HOME/.everos}}"
POST_FLUSH_WAIT="${POST_FLUSH_WAIT:-600}"
EXTRA_ARGS=()
usage() {
cat <<EOF
Usage: bash tests/run_locomo_batch.sh [options]
--conv-indices <spec> conv list — "0,1,2" | "0-9" | "all" (default: $CONV_INDICES)
--methods <list> comma-separated, e.g. "keyword,hybrid" (default: $METHODS)
--base-url <url> everos server (default: $BASE_URL)
--data-path <file> LoCoMo dataset path (default: $DATA_PATH)
--top-k <int> per-question recall depth (default: $TOP_K)
--eval-owner <a|b> speaker_a | speaker_b (default: $EVAL_OWNER)
--judge-runs <int> LLM judge majority-vote runs (default: $JUDGE_RUNS)
--skip-add reuse existing corpus, skip ingest
--output-root <dir> parent dir for results
(default: benchmark_results/run_<ts>)
--concurrency <N> run up to N convs in parallel (default: 1 = serial)
per-conv stdout/stderr is redirected to
\$OUTPUT_ROOT/conv<i>.log so streams don't interleave
-h | --help show this help
-- everything after is forwarded to test_locomo.py
Any positional or unknown arg goes through to test_locomo.py untouched.
EOF
}
while [[ $# -gt 0 ]]; do
case "$1" in
--conv-indices) CONV_INDICES="$2"; shift 2 ;;
--methods) METHODS="$2"; shift 2 ;;
--base-url) BASE_URL="$2"; shift 2 ;;
--data-path) DATA_PATH="$2"; shift 2 ;;
--top-k) TOP_K="$2"; shift 2 ;;
--eval-owner) EVAL_OWNER="$2"; shift 2 ;;
--judge-runs) JUDGE_RUNS="$2"; shift 2 ;;
--skip-add) SKIP_ADD="true"; shift ;;
--output-root) OUTPUT_ROOT="$2"; shift 2 ;;
--concurrency) CONCURRENCY="$2"; shift 2 ;;
-h|--help) usage; exit 0 ;;
--) shift; EXTRA_ARGS+=("$@"); break ;;
*) EXTRA_ARGS+=("$1"); shift ;;
esac
done
# ── Expand conv-indices spec ──────────────────────────────────────────
expand_indices() {
local spec="$1"
if [[ "$spec" == "all" ]]; then
echo "0 1 2 3 4 5 6 7 8 9"
return
fi
if [[ "$spec" =~ ^([0-9]+)-([0-9]+)$ ]]; then
seq "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}"
return
fi
echo "$spec" | tr ',' ' '
}
INDICES=$(expand_indices "$CONV_INDICES")
TS="$(date +%Y%m%d_%H%M%S)"
OUTPUT_ROOT="${OUTPUT_ROOT:-benchmark_results/run_${TS}}"
mkdir -p "$OUTPUT_ROOT"
# ── Plan banner ───────────────────────────────────────────────────────
echo "═════════════════════════════════════════════════════════════════"
echo " LoCoMo batch run"
echo "═════════════════════════════════════════════════════════════════"
printf " base_url : %s\n" "$BASE_URL"
printf " conv_indices : %s\n" "$(echo "$INDICES" | tr '\n' ' ')"
printf " methods : %s\n" "$METHODS"
printf " top_k : %s\n" "$TOP_K"
printf " eval_owner : %s\n" "$EVAL_OWNER"
printf " judge_runs : %s\n" "$JUDGE_RUNS"
printf " skip_add : %s\n" "$SKIP_ADD"
printf " concurrency : %s\n" "$CONCURRENCY"
printf " output_root : %s\n" "$OUTPUT_ROOT"
[[ ${#EXTRA_ARGS[@]} -gt 0 ]] && printf " forwarded args : %s\n" "${EXTRA_ARGS[*]}"
echo
# ── Build per-conv command and launch ────────────────────────────────
#
# bash 3.2 (macOS default) lacks namerefs (`local -n`) and `wait -n`, so
# build_cmd populates a global array CMD and the parallel scheduler
# uses a poll-loop with `kill -0` instead of `wait -n`.
build_cmd() {
local _ci="$1"
CMD=(
PYTHONPATH=src
python tests/test_locomo.py
--base-url "$BASE_URL"
--data-path "$DATA_PATH"
--conv-index "$_ci"
--methods "$METHODS"
--top-k "$TOP_K"
--eval-owner "$EVAL_OWNER"
--judge-runs "$JUDGE_RUNS"
--output "$OUTPUT_ROOT/conv${_ci}.json"
--checkpoint-dir "$OUTPUT_ROOT/conv${_ci}_checkpoints"
--corpus-path "$CORPUS_PATH"
--post-flush-wait "$POST_FLUSH_WAIT"
--quiet
)
[[ "$SKIP_ADD" == "true" ]] && CMD+=( --skip-add )
[[ ${#EXTRA_ARGS[@]} -gt 0 ]] && CMD+=( "${EXTRA_ARGS[@]}" )
# Final no-op: the trailing [[ ]] above can be false (e.g. no extra
# args), which would make the function's exit status non-zero and
# trip `set -e` in the caller. Explicit success keeps the contract.
return 0
}
FAILED=()
if [[ "$CONCURRENCY" -le 1 ]]; then
# ── Serial path (legacy behaviour) ──────────────────────────────────
for CI in $INDICES; do
echo "═════════════════════════════════════════════════════════════════"
echo " conv $CI$OUTPUT_ROOT/conv${CI}.json"
echo "═════════════════════════════════════════════════════════════════"
build_cmd "$CI"
set +e
env "${CMD[@]}"
rc=$?
set -e
if [[ $rc -ne 0 ]]; then
FAILED+=("conv${CI}")
fi
done
else
# ── Parallel path: job pool of $CONCURRENCY workers ─────────────────
#
# Each conv runs in its own python process, streaming to a per-conv
# log file (conv<i>.log) so interleaved stdout doesn't turn into
# confetti. Status is collected via `wait $pid`; one conv's failure
# does not abort the rest.
echo "─────────────────────────────────────────────────────────────────"
echo " Parallel mode: up to $CONCURRENCY convs concurrent"
echo " Per-conv logs: $OUTPUT_ROOT/conv<i>.log"
echo "─────────────────────────────────────────────────────────────────"
# Parallel arrays (no associative arrays in bash 3.2).
RUN_PIDS=()
RUN_CIS=()
# Wait for *any* worker to exit, reap it, prune the slot, record
# failures. Polls because `wait -n` is bash 4.3+.
reap_one() {
while true; do
local idx
for idx in "${!RUN_PIDS[@]}"; do
local pid="${RUN_PIDS[$idx]}"
if ! kill -0 "$pid" 2>/dev/null; then
set +e
wait "$pid"
local rc=$?
set -e
local ci="${RUN_CIS[$idx]}"
if [[ $rc -eq 0 ]]; then
echo " ✓ conv${ci} done (pid $pid)"
else
echo " ✗ conv${ci} failed (pid $pid, status $rc) — see $OUTPUT_ROOT/conv${ci}.log"
FAILED+=("conv${ci}")
fi
unset 'RUN_PIDS[idx]'
unset 'RUN_CIS[idx]'
# Re-pack arrays so ${#RUN_PIDS[@]} stays accurate.
RUN_PIDS=("${RUN_PIDS[@]}")
RUN_CIS=("${RUN_CIS[@]}")
return 0
fi
done
sleep 2
done
}
for CI in $INDICES; do
build_cmd "$CI"
LOG="$OUTPUT_ROOT/conv${CI}.log"
echo " → launching conv${CI} (log: $LOG)"
env "${CMD[@]}" > "$LOG" 2>&1 &
pid=$!
RUN_PIDS+=("$pid")
RUN_CIS+=("$CI")
if [[ ${#RUN_PIDS[@]} -ge $CONCURRENCY ]]; then
reap_one
fi
done
# Drain the remaining workers.
while [[ ${#RUN_PIDS[@]} -gt 0 ]]; do
reap_one
done
fi
if [[ ${#FAILED[@]} -gt 0 ]]; then
echo
echo "${#FAILED[@]} conv(s) failed: ${FAILED[*]}"
fi
# ── Aggregate summary ─────────────────────────────────────────────────
echo
echo "═════════════════════════════════════════════════════════════════"
echo " Aggregate accuracy"
echo "═════════════════════════════════════════════════════════════════"
python - <<EOF
import json
from pathlib import Path
root = Path("$OUTPUT_ROOT")
files = sorted(root.glob("conv*.json"))
if not files:
print(" (no result files found)")
raise SystemExit
# header
methods_seen = []
for p in files:
d = json.load(open(p))
for m in d["methods"]:
if m not in methods_seen:
methods_seen.append(m)
w = max(20, max(len(p.stem) + 4 for p in files))
header = f"{'conversation':<{w}} " + " ".join(f"{m:>10}" for m in methods_seen)
print(header)
print("─" * len(header))
for p in files:
d = json.load(open(p))
label = p.stem
cells = []
for m in methods_seen:
mr = d["methods"].get(m)
if mr is None:
cells.append(f"{'—':>10}")
else:
raw = mr["summary"]["accuracy"]
acc = float(str(raw).rstrip("%")) if isinstance(raw, str) else float(raw) * 100
cells.append(f"{acc:>9.1f}%")
print(f"{label:<{w}} " + " ".join(cells))
print()
print(f" detailed JSONs: {root}/conv*.json")
print(f" phase checkpoints: {root}/conv*_checkpoints/")
EOF