#!/usr/bin/env bash
# Batch driver for LoCoMo benchmark across multiple conversations + methods.
#
# Wraps tests/test_locomo.py in an outer ``--conv-index`` loop. test_locomo.py
# already loops over ``--methods`` internally, so one invocation per
# conversation runs the full method matrix for that conv.
#
# Per-conv outputs (separate JSON + checkpoint dir) live under
# ``benchmark_results/run_<ts>/conv<N>.json`` so reports never collide.
# An aggregate accuracy table is printed at the end.
#
# Exit status
# ───────────
#   0  every conv succeeded
#   1  at least one conv failed (the aggregate table is still printed)
#   2  bad command-line usage
#
# Examples
# ────────
#   # all 10 convs, hybrid only:
#   bash tests/run_locomo_batch.sh --conv-indices 0-9 --methods hybrid
#
#   # 3 specific convs, two methods, skip the ~5min Add phase (corpus already loaded):
#   bash tests/run_locomo_batch.sh \
#       --conv-indices 0,3,7 --methods keyword,hybrid --skip-add
#
#   # one conv, all 4 methods comparison:
#   bash tests/run_locomo_batch.sh --conv-indices 0 --methods keyword,vector,hybrid,agentic

set -euo pipefail

# ── Defaults (override via flags) ─────────────────────────────────────
BASE_URL="${BASE_URL:-http://localhost:8000}"
DATA_PATH="${DATA_PATH:-data/locomo10.json}"
CONV_INDICES="${CONV_INDICES:-0}"
METHODS="${METHODS:-hybrid}"
TOP_K="${TOP_K:-10}"
EVAL_OWNER="${EVAL_OWNER:-speaker_a}"
JUDGE_RUNS="${JUDGE_RUNS:-1}"
SKIP_ADD="false"
OUTPUT_ROOT=""
CONCURRENCY="${CONCURRENCY:-1}"
# Default to polling cascade pending==0 (not fixed sleep). Falls back to
# ~/.everos to match the server's default data root; override via env or
# EVEROS_MEMORY__ROOT (which the server consumes). post-flush-wait becomes
# the MAX wait when corpus-path is set.
CORPUS_PATH="${CORPUS_PATH:-${EVEROS_MEMORY__ROOT:-$HOME/.everos}}"
POST_FLUSH_WAIT="${POST_FLUSH_WAIT:-600}"
EXTRA_ARGS=()

readonly HEAVY_RULE="═════════════════════════════════════════════════════════════════"
readonly LIGHT_RULE="─────────────────────────────────────────────────────────────────"

# Print a usage-level error to stderr and exit with status 2.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}

# Print the help text to stdout. Defaults shown reflect the current values
# of the corresponding variables (env overrides included).
#
# NOTE(review): the "Usage:" header line and the <placeholder> tokens were
# missing from the reviewed copy of this file and are reconstructed here;
# confirm the wording against the original.
usage() {
  cat <<EOF
Usage: ${0##*/} [options] [-- extra args for test_locomo.py]

  --conv-indices <spec>   conv list — "0,1,2" | "0-9" | "0-2,5" | "all"  (default: $CONV_INDICES)
  --methods <list>        comma-separated, e.g. "keyword,hybrid"  (default: $METHODS)
  --base-url <url>        everos server  (default: $BASE_URL)
  --data-path <path>      LoCoMo dataset path  (default: $DATA_PATH)
  --top-k <n>             per-question recall depth  (default: $TOP_K)
  --eval-owner <owner>    speaker_a | speaker_b  (default: $EVAL_OWNER)
  --judge-runs <n>        LLM judge majority-vote runs  (default: $JUDGE_RUNS)
  --skip-add              reuse existing corpus, skip ingest
  --output-root <dir>     parent dir for results  (default: benchmark_results/run_<ts>)
  --concurrency <N>       run up to N convs in parallel  (default: 1 = serial)
                          per-conv stdout/stderr is redirected to
                          \$OUTPUT_ROOT/conv<N>.log so streams don't interleave
  -h | --help             show this help
  --                      everything after is forwarded to test_locomo.py

Any positional or unknown arg goes through to test_locomo.py untouched.
EOF
}

# Abort with a readable message when a value-taking option is the last
# argument. Without this, "$2" trips `set -u` with a cryptic
# "unbound variable" error.
#   $1 - option name (for the message)
#   $2 - number of remaining positional args, including the option itself
require_value() {
  [[ "$2" -ge 2 ]] || die "option $1 requires a value"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --conv-indices) require_value "$1" "$#"; CONV_INDICES="$2"; shift 2 ;;
    --methods)      require_value "$1" "$#"; METHODS="$2";      shift 2 ;;
    --base-url)     require_value "$1" "$#"; BASE_URL="$2";     shift 2 ;;
    --data-path)    require_value "$1" "$#"; DATA_PATH="$2";    shift 2 ;;
    --top-k)        require_value "$1" "$#"; TOP_K="$2";        shift 2 ;;
    --eval-owner)   require_value "$1" "$#"; EVAL_OWNER="$2";   shift 2 ;;
    --judge-runs)   require_value "$1" "$#"; JUDGE_RUNS="$2";   shift 2 ;;
    --skip-add)     SKIP_ADD="true"; shift ;;
    --output-root)  require_value "$1" "$#"; OUTPUT_ROOT="$2";  shift 2 ;;
    --concurrency)  require_value "$1" "$#"; CONCURRENCY="$2";  shift 2 ;;
    -h|--help)      usage; exit 0 ;;
    --)             shift; EXTRA_ARGS+=("$@"); break ;;
    *)              EXTRA_ARGS+=("$1"); shift ;;
  esac
done

# `[[ x -le y ]]` evaluates its operands as arithmetic expressions, so an
# unvalidated value could error out or be evaluated as an expression.
# Accept plain digits only and force base 10 so "08" is not read as octal.
[[ "$CONCURRENCY" =~ ^[0-9]+$ ]] \
  || die "--concurrency must be a non-negative integer (got '$CONCURRENCY')"
CONCURRENCY=$((10#$CONCURRENCY))

# ── Expand conv-indices spec ──────────────────────────────────────────
# Turn a conv-indices spec into a single space-separated line on stdout.
#   $1 - "all" (= 0..9), or a list of tokens separated by commas and/or
#        whitespace, where each token is an integer or an "A-B" range.
#        Ranges may be mixed with plain indices ("0-2,5,7-9"). A reversed
#        range ("5-3") counts down, independent of the platform's `seq`.
# Exits with status 2 (via die) on an empty spec or a malformed token.
expand_indices() {
  local spec="$1"
  if [[ "$spec" == "all" ]]; then
    echo "0 1 2 3 4 5 6 7 8 9"
    return 0
  fi

  # Normalise every separator to a space so one `read` splits the spec.
  local normalized
  normalized="$(printf '%s' "$spec" | tr ',\n\t' '   ')"
  local -a tokens=() expanded=()
  read -r -a tokens <<<"$normalized"
  [[ ${#tokens[@]} -gt 0 ]] || die "--conv-indices: empty spec"

  local token lo hi i
  for token in "${tokens[@]}"; do
    if [[ "$token" =~ ^([0-9]+)-([0-9]+)$ ]]; then
      # 10# forces decimal so zero-padded bounds are not parsed as octal.
      lo=$((10#${BASH_REMATCH[1]}))
      hi=$((10#${BASH_REMATCH[2]}))
      if ((lo <= hi)); then
        for ((i = lo; i <= hi; i++)); do expanded+=("$i"); done
      else
        for ((i = lo; i >= hi; i--)); do expanded+=("$i"); done
      fi
    elif [[ "$token" =~ ^-?[0-9]+$ ]]; then
      expanded+=("$token")
    else
      die "--conv-indices: invalid token '$token' in spec '$spec'"
    fi
  done
  echo "${expanded[*]}"
}

# Separate the assignment from the `read` so a failure inside
# expand_indices aborts the script (set -e) instead of being masked.
INDICES_LINE="$(expand_indices "$CONV_INDICES")"
INDICES=()
read -r -a INDICES <<<"$INDICES_LINE"

TS="$(date +%Y%m%d_%H%M%S)"
OUTPUT_ROOT="${OUTPUT_ROOT:-benchmark_results/run_${TS}}"
mkdir -p "$OUTPUT_ROOT"

# ── Plan banner ───────────────────────────────────────────────────────
echo "$HEAVY_RULE"
echo "  LoCoMo batch run"
echo "$HEAVY_RULE"
printf "  base_url        : %s\n" "$BASE_URL"
printf "  conv_indices    : %s\n" "${INDICES[*]}"
printf "  methods         : %s\n" "$METHODS"
printf "  top_k           : %s\n" "$TOP_K"
printf "  eval_owner      : %s\n" "$EVAL_OWNER"
printf "  judge_runs      : %s\n" "$JUDGE_RUNS"
printf "  skip_add        : %s\n" "$SKIP_ADD"
printf "  concurrency     : %s\n" "$CONCURRENCY"
printf "  output_root     : %s\n" "$OUTPUT_ROOT"
if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then
  printf "  forwarded args  : %s\n" "${EXTRA_ARGS[*]}"
fi
echo

# ── Build per-conv command and launch ────────────────────────────────
#
# bash 3.2 (macOS default) lacks namerefs (`local -n`) and `wait -n`, so
# build_cmd populates a global array CMD and the parallel scheduler
# uses a poll-loop with `kill -0` instead of `wait -n`.

# Populate the global array CMD with the argv for one conversation.
# The first element is a NAME=VALUE pair, so CMD must be run through
# `env "${CMD[@]}"`.
#   $1 - conversation index
# Globals: reads the option variables above; writes CMD.
build_cmd() {
  local conv_index="$1"
  CMD=(
    PYTHONPATH=src
    python tests/test_locomo.py
    --base-url "$BASE_URL"
    --data-path "$DATA_PATH"
    --conv-index "$conv_index"
    --methods "$METHODS"
    --top-k "$TOP_K"
    --eval-owner "$EVAL_OWNER"
    --judge-runs "$JUDGE_RUNS"
    --output "$OUTPUT_ROOT/conv${conv_index}.json"
    --checkpoint-dir "$OUTPUT_ROOT/conv${conv_index}_checkpoints"
    --corpus-path "$CORPUS_PATH"
    --post-flush-wait "$POST_FLUSH_WAIT"
    --quiet
  )
  # Plain `if` blocks (rather than `[[ … ]] && …`) keep the function's
  # exit status 0 when a condition is false, so `set -e` is not tripped.
  if [[ "$SKIP_ADD" == "true" ]]; then
    CMD+=(--skip-add)
  fi
  if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then
    CMD+=("${EXTRA_ARGS[@]}")
  fi
}

FAILED=()

# Parallel arrays (no associative arrays in bash 3.2): RUN_PIDS[i] is the
# worker pid for conversation RUN_CIS[i].
RUN_PIDS=()
RUN_CIS=()

# Wait for *any* worker to exit, reap it, prune the slot, record
# failures. Polls because `wait -n` is bash 4.3+.
# Must only be called while RUN_PIDS is non-empty.
# Globals: reads/writes RUN_PIDS, RUN_CIS; appends to FAILED.
reap_one() {
  local idx pid ci rc
  while true; do
    for idx in "${!RUN_PIDS[@]}"; do
      pid="${RUN_PIDS[$idx]}"
      if kill -0 "$pid" 2>/dev/null; then
        continue # still running
      fi

      rc=0
      wait "$pid" || rc=$?
      ci="${RUN_CIS[$idx]}"
      if [[ $rc -eq 0 ]]; then
        echo "  ✓ conv${ci} done (pid $pid)"
      else
        echo "  ✗ conv${ci} failed (pid $pid, status $rc) — see $OUTPUT_ROOT/conv${ci}.log"
        FAILED+=("conv${ci}")
      fi

      unset "RUN_PIDS[$idx]" "RUN_CIS[$idx]"
      # Re-pack arrays so ${#RUN_PIDS[@]} stays accurate. The ${arr[@]+…}
      # form is required: on bash < 4.4, expanding an empty array under
      # `set -u` is an "unbound variable" error, which would abort the
      # script as soon as the last worker is reaped.
      RUN_PIDS=(${RUN_PIDS[@]+"${RUN_PIDS[@]}"})
      RUN_CIS=(${RUN_CIS[@]+"${RUN_CIS[@]}"})
      return 0
    done
    sleep 2
  done
}

if [[ "$CONCURRENCY" -le 1 ]]; then
  # ── Serial path (legacy behaviour) ──────────────────────────────────
  for CI in "${INDICES[@]}"; do
    echo "$HEAVY_RULE"
    echo "  conv $CI  →  $OUTPUT_ROOT/conv${CI}.json"
    echo "$HEAVY_RULE"
    build_cmd "$CI"
    # Capture the status without letting `set -e` abort the batch: one
    # conv's failure must not stop the remaining convs.
    rc=0
    env "${CMD[@]}" || rc=$?
    if [[ $rc -ne 0 ]]; then
      FAILED+=("conv${CI}")
    fi
  done
else
  # ── Parallel path: job pool of $CONCURRENCY workers ─────────────────
  #
  # Each conv runs in its own python process, streaming to a per-conv
  # log file (conv<N>.log) so interleaved stdout doesn't turn into
  # confetti. Status is collected via `wait $pid`; one conv's failure
  # does not abort the rest.
  echo "$LIGHT_RULE"
  echo "  Parallel mode: up to $CONCURRENCY convs concurrent"
  echo "  Per-conv logs: $OUTPUT_ROOT/conv<N>.log"
  echo "$LIGHT_RULE"

  for CI in "${INDICES[@]}"; do
    build_cmd "$CI"
    LOG="$OUTPUT_ROOT/conv${CI}.log"
    echo "  → launching conv${CI}  (log: $LOG)"
    env "${CMD[@]}" >"$LOG" 2>&1 &
    pid=$!
    RUN_PIDS+=("$pid")
    RUN_CIS+=("$CI")
    # Pool is full: block until one slot frees up before launching more.
    if [[ ${#RUN_PIDS[@]} -ge $CONCURRENCY ]]; then
      reap_one
    fi
  done

  # Drain the remaining workers.
  while [[ ${#RUN_PIDS[@]} -gt 0 ]]; do
    reap_one
  done
fi

if [[ ${#FAILED[@]} -gt 0 ]]; then
  echo
  echo "⚠  ${#FAILED[@]} conv(s) failed: ${FAILED[*]}"
fi

# ── Aggregate summary ─────────────────────────────────────────────────
echo
echo "$HEAVY_RULE"
echo "  Aggregate accuracy"
echo "$HEAVY_RULE"
# The heredoc delimiter is quoted so the shell performs no expansion on
# the Python source; the results directory is passed as argv[1] instead
# of being spliced into the code, which is safe for any path.
python - "$OUTPUT_ROOT" <<'EOF'
import json
import sys
from pathlib import Path

# NOTE(review): everything from here down to the definition of `header`
# was missing from the reviewed copy of this file and is reconstructed
# from how `root`, `files`, `methods_seen` and `w` are used further
# down; confirm against the original.
root = Path(sys.argv[1])
files = sorted(root.glob("conv*.json"))
if not files:
    print(f"  (no result files found under {root})")
    sys.exit(0)

# Load every report once; collect method names in first-seen order so the
# table columns are stable across convs.
reports = {}
methods_seen = []
for p in files:
    with open(p) as fh:
        reports[p] = json.load(fh)
    for m in reports[p]["methods"]:
        if m not in methods_seen:
            methods_seen.append(m)

w = max(len("conv"), max(len(p.stem) for p in files))
header = f"{'conv':<{w}} " + " ".join(f"{m:>10}" for m in methods_seen)
print(header)
print("─" * len(header))
for p in files:
    d = reports[p]
    label = p.stem
    cells = []
    for m in methods_seen:
        mr = d["methods"].get(m)
        if mr is None:
            # This conv was not evaluated with method m.
            cells.append(f"{'—':>10}")
        else:
            raw = mr["summary"]["accuracy"]
            # A string is treated as an already-formatted percentage
            # ("87.5%"); a number is treated as a fraction in [0, 1].
            acc = float(str(raw).rstrip("%")) if isinstance(raw, str) else float(raw) * 100
            cells.append(f"{acc:>9.1f}%")
    print(f"{label:<{w}} " + " ".join(cells))
print()
print(f"  detailed JSONs:     {root}/conv*.json")
print(f"  phase checkpoints:  {root}/conv*_checkpoints/")
EOF

# Surface conv failures in the exit status so CI and wrapper scripts do
# not mistake a partially failed batch for a clean run.
if [[ ${#FAILED[@]} -gt 0 ]]; then
  exit 1
fi