chore: initialize EverOS 1.0.0

md-first memory extraction framework for AI agents. Markdown is the single source of truth; SQLite holds state and LanceDB provides the rebuildable vector + BM25 + scalar index. The codebase follows a single-direction DDD layering (entrypoints -> service -> memory -> infra, with component / core / config cross-cutting) enforced by import-linter.

Engineering surface:
- Coding conventions in .claude/rules/ (path-scoped) and workflows in .claude/skills/ (/commit, /new-branch, /pr).
- GitHub Actions CI runs make lint + test + integration; pre-commit mirrors the gates locally (ruff, hygiene hooks, gitlint commit-msg).
- Commit messages follow Conventional Commits, enforced by gitlint.
- make lint also enforces datetime two-zone discipline and OpenAPI drift.
2026-06-05 22:35:51 +08:00
commit 518b8eca85
636 changed files with 160553 additions and 0 deletions
--- a/tests/run_locomo_batch.sh
+++ b/tests/run_locomo_batch.sh
@ -0,0 +1,287 @@
+#!/usr/bin/env bash
+# Batch driver for LoCoMo benchmark across multiple conversations + methods.
+#
+# Wraps tests/test_locomo.py in an outer ``--conv-index`` loop. test_locomo.py
+# already loops over ``--methods`` internally, so one invocation per
+# conversation runs the full method matrix for that conv.
+#
+# Per-conv outputs (separate JSON + checkpoint dir) live under
+# ``benchmark_results/run_<timestamp>/conv<N>.json`` so reports never collide.
+# An aggregate accuracy table is printed at the end.
+#
+# Examples
+# ────────
+#   # all 10 convs, hybrid only:
+#   bash tests/run_locomo_batch.sh --conv-indices 0-9 --methods hybrid
+#
+#   # 3 specific convs, two methods, skip the ~5min Add phase (corpus already loaded):
+#   bash tests/run_locomo_batch.sh \
+#     --conv-indices 0,3,7 --methods keyword,hybrid --skip-add
+#
+#   # one conv, all 4 methods comparison:
+#   bash tests/run_locomo_batch.sh --conv-indices 0 --methods keyword,vector,hybrid,agentic
+
+set -euo pipefail
+
+# ── Defaults (override via flags) ─────────────────────────────────────
+# Each `: "${VAR:=value}"` keeps a non-empty value inherited from the
+# environment and otherwise assigns the built-in default.
+: "${BASE_URL:=http://localhost:8000}"
+: "${DATA_PATH:=data/locomo10.json}"
+: "${CONV_INDICES:=0}"
+: "${METHODS:=hybrid}"
+: "${TOP_K:=10}"
+: "${EVAL_OWNER:=speaker_a}"
+: "${JUDGE_RUNS:=1}"
+: "${CONCURRENCY:=1}"
+
+# Passed to test_locomo.py as --corpus-path / --post-flush-wait. With a
+# corpus path set, the run polls for cascade pending==0 instead of a
+# fixed sleep, and post-flush-wait acts as the MAX wait. The path comes
+# from CORPUS_PATH, then EVEROS_MEMORY__ROOT (which the server consumes),
+# then ~/.everos to match the server's default data root.
+: "${CORPUS_PATH:=${EVEROS_MEMORY__ROOT:-$HOME/.everos}}"
+: "${POST_FLUSH_WAIT:=600}"
+
+# These three are set by flags only; any inherited value is discarded.
+SKIP_ADD="false"
+OUTPUT_ROOT=""
+EXTRA_ARGS=()
+
+# Print the option summary to stdout.
+#
+# The here-doc delimiter is unquoted on purpose: the "(default: ...)"
+# columns interpolate the current values of CONV_INDICES, METHODS,
+# BASE_URL, DATA_PATH, TOP_K, EVAL_OWNER and JUDGE_RUNS, so the help text
+# reflects any environment overrides. `\$OUTPUT_ROOT` is escaped so it is
+# printed literally rather than expanded.
+usage() {
+  cat <<EOF
+Usage: bash tests/run_locomo_batch.sh [options]
+
+  --conv-indices <spec>   conv list — "0,1,2" | "0-9" | "all"    (default: $CONV_INDICES)
+  --methods <list>        comma-separated, e.g. "keyword,hybrid"  (default: $METHODS)
+  --base-url <url>        everos server                          (default: $BASE_URL)
+  --data-path <file>      LoCoMo dataset path                     (default: $DATA_PATH)
+  --top-k <int>           per-question recall depth               (default: $TOP_K)
+  --eval-owner <a|b>      speaker_a | speaker_b                   (default: $EVAL_OWNER)
+  --judge-runs <int>      LLM judge majority-vote runs            (default: $JUDGE_RUNS)
+  --skip-add              reuse existing corpus, skip ingest
+  --output-root <dir>     parent dir for results
+                          (default: benchmark_results/run_<ts>)
+  --concurrency <N>       run up to N convs in parallel (default: 1 = serial)
+                          per-conv stdout/stderr is redirected to
+                          \$OUTPUT_ROOT/conv<i>.log so streams don't interleave
+  -h | --help             show this help
+  --                      everything after is forwarded to test_locomo.py
+
+Any positional or unknown arg goes through to test_locomo.py untouched.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --conv-indices) CONV_INDICES="$2"; shift 2 ;;
+    --methods)      METHODS="$2"; shift 2 ;;
+    --base-url)     BASE_URL="$2"; shift 2 ;;
+    --data-path)    DATA_PATH="$2"; shift 2 ;;
+    --top-k)        TOP_K="$2"; shift 2 ;;
+    --eval-owner)   EVAL_OWNER="$2"; shift 2 ;;
+    --judge-runs)   JUDGE_RUNS="$2"; shift 2 ;;
+    --skip-add)     SKIP_ADD="true"; shift ;;
+    --output-root)  OUTPUT_ROOT="$2"; shift 2 ;;
+    --concurrency)  CONCURRENCY="$2"; shift 2 ;;
+    -h|--help)      usage; exit 0 ;;
+    --)             shift; EXTRA_ARGS+=("$@"); break ;;
+    *)              EXTRA_ARGS+=("$1"); shift ;;
+  esac
+done
+
+# ── Expand conv-indices spec ──────────────────────────────────────────
+expand_indices() {
+  local spec="$1"
+  if [[ "$spec" == "all" ]]; then
+    echo "0 1 2 3 4 5 6 7 8 9"
+    return
+  fi
+  if [[ "$spec" =~ ^([0-9]+)-([0-9]+)$ ]]; then
+    seq "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}"
+    return
+  fi
+  echo "$spec" | tr ',' ' '
+}
+
+INDICES=$(expand_indices "$CONV_INDICES")
+TS="$(date +%Y%m%d_%H%M%S)"
+OUTPUT_ROOT="${OUTPUT_ROOT:-benchmark_results/run_${TS}}"
+mkdir -p "$OUTPUT_ROOT"
+
+# ── Plan banner ───────────────────────────────────────────────────────
+echo "═════════════════════════════════════════════════════════════════"
+echo "  LoCoMo batch run"
+echo "═════════════════════════════════════════════════════════════════"
+printf "  base_url        : %s\n" "$BASE_URL"
+printf "  conv_indices    : %s\n" "$(echo "$INDICES" | tr '\n' ' ')"
+printf "  methods         : %s\n" "$METHODS"
+printf "  top_k           : %s\n" "$TOP_K"
+printf "  eval_owner      : %s\n" "$EVAL_OWNER"
+printf "  judge_runs      : %s\n" "$JUDGE_RUNS"
+printf "  skip_add        : %s\n" "$SKIP_ADD"
+printf "  concurrency     : %s\n" "$CONCURRENCY"
+printf "  output_root     : %s\n" "$OUTPUT_ROOT"
+[[ ${#EXTRA_ARGS[@]} -gt 0 ]] && printf "  forwarded args  : %s\n" "${EXTRA_ARGS[*]}"
+echo
+
+# ── Build per-conv command and launch ────────────────────────────────
+#
+# This script targets bash 3.2 (the macOS default), which has neither
+# namerefs (`local -n`) nor `wait -n`. build_cmd therefore hands its
+# result back through the global array CMD, and the parallel scheduler
+# polls with `kill -0` rather than blocking in `wait -n`.
+#
+# Arguments: $1 - conversation index, passed on as --conv-index
+# Globals:   writes CMD; reads BASE_URL, DATA_PATH, METHODS, TOP_K,
+#            EVAL_OWNER, JUDGE_RUNS, OUTPUT_ROOT, CORPUS_PATH,
+#            POST_FLUSH_WAIT, SKIP_ADD, EXTRA_ARGS
+# Returns:   always 0
+build_cmd() {
+  local conv_idx="$1"
+  local out_prefix="$OUTPUT_ROOT/conv${conv_idx}"
+
+  # The leading NAME=value word is an environment assignment; callers run
+  # the array through `env` so it is honoured.
+  CMD=(PYTHONPATH=src python tests/test_locomo.py)
+  CMD+=(--base-url "$BASE_URL")
+  CMD+=(--data-path "$DATA_PATH")
+  CMD+=(--conv-index "$conv_idx")
+  CMD+=(--methods "$METHODS")
+  CMD+=(--top-k "$TOP_K")
+  CMD+=(--eval-owner "$EVAL_OWNER")
+  CMD+=(--judge-runs "$JUDGE_RUNS")
+  CMD+=(--output "${out_prefix}.json")
+  CMD+=(--checkpoint-dir "${out_prefix}_checkpoints")
+  CMD+=(--corpus-path "$CORPUS_PATH")
+  CMD+=(--post-flush-wait "$POST_FLUSH_WAIT")
+  CMD+=(--quiet)
+
+  if [[ "$SKIP_ADD" == "true" ]]; then
+    CMD+=(--skip-add)
+  fi
+  if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then
+    CMD+=("${EXTRA_ARGS[@]}")
+  fi
+
+  # Always report success so a caller running under `set -e` is never
+  # aborted by this function.
+  return 0
+}
+
+# Labels ("conv<N>") of conversations whose test_locomo.py run exited
+# non-zero; filled by both the serial and the parallel path.
+FAILED=()
+
+# CONCURRENCY <= 1 selects the serial path; anything larger uses the
+# worker pool in the else-branch.
+if [[ "$CONCURRENCY" -le 1 ]]; then
+  # ── Serial path (legacy behaviour) ──────────────────────────────────
+  # $INDICES is left unquoted on purpose so it word-splits into one
+  # index per iteration.
+  for CI in $INDICES; do
+    echo "═════════════════════════════════════════════════════════════════"
+    echo "  conv $CI  →  $OUTPUT_ROOT/conv${CI}.json"
+    echo "═════════════════════════════════════════════════════════════════"
+    build_cmd "$CI"
+    # errexit is suspended around the run so a failing conversation is
+    # recorded in FAILED instead of aborting the whole batch. `env` is
+    # needed because CMD starts with a PYTHONPATH=src assignment word.
+    set +e
+    env "${CMD[@]}"
+    rc=$?
+    set -e
+    if [[ $rc -ne 0 ]]; then
+      FAILED+=("conv${CI}")
+    fi
+  done
+else
+  # ── Parallel path: job pool of $CONCURRENCY workers ─────────────────
+  #
+  # Each conv runs in its own python process, streaming to a per-conv
+  # log file (conv<i>.log) so interleaved stdout doesn't turn into
+  # confetti. Status is collected via `wait $pid`; one conv's failure
+  # does not abort the rest.
+  echo "─────────────────────────────────────────────────────────────────"
+  echo "  Parallel mode: up to $CONCURRENCY convs concurrent"
+  echo "  Per-conv logs: $OUTPUT_ROOT/conv<i>.log"
+  echo "─────────────────────────────────────────────────────────────────"
+
+  # Parallel arrays (no associative arrays in bash 3.2).
+  # RUN_PIDS[i] is the PID of a live worker and RUN_CIS[i] the
+  # conversation index it is processing.
+  RUN_PIDS=()
+  RUN_CIS=()
+
+  # Wait for *any* worker to exit, reap it, prune its slot and record a
+  # failure if it exited non-zero. Polls every 2 seconds because
+  # `wait -n` is bash 4.3+.
+  #
+  # Globals: reads/writes RUN_PIDS and RUN_CIS (parallel arrays);
+  #          appends to FAILED; reads OUTPUT_ROOT for the log hint.
+  # Returns: 0 once exactly one finished worker has been reaped. Must
+  #          only be called while RUN_PIDS is non-empty, otherwise it
+  #          polls forever.
+  reap_one() {
+    local idx pid ci rc
+    while true; do
+      for idx in "${!RUN_PIDS[@]}"; do
+        pid="${RUN_PIDS[$idx]}"
+        # `kill -0` sends no signal; it only checks that the PID is
+        # still alive. Live workers are skipped.
+        if kill -0 "$pid" 2>/dev/null; then
+          continue
+        fi
+        # Collect the exit status without letting `set -e` abort the
+        # batch when the worker failed.
+        rc=0
+        wait "$pid" || rc=$?
+        ci="${RUN_CIS[$idx]}"
+        if [[ $rc -eq 0 ]]; then
+          echo "  ✓ conv${ci} done (pid $pid)"
+        else
+          echo "  ✗ conv${ci} failed (pid $pid, status $rc) — see $OUTPUT_ROOT/conv${ci}.log"
+          FAILED+=("conv${ci}")
+        fi
+        unset 'RUN_PIDS[idx]'
+        unset 'RUN_CIS[idx]'
+        # Re-pack arrays so ${#RUN_PIDS[@]} stays accurate. The
+        # ${arr[@]+"${arr[@]}"} form is required: once the last worker
+        # has been removed the arrays are empty, and a plain
+        # "${arr[@]}" on an empty array is an "unbound variable" error
+        # under `set -u` on bash releases older than 4.4 (including the
+        # 3.2 this script targets).
+        RUN_PIDS=(${RUN_PIDS[@]+"${RUN_PIDS[@]}"})
+        RUN_CIS=(${RUN_CIS[@]+"${RUN_CIS[@]}"})
+        return 0
+      done
+      sleep 2
+    done
+  }
+
+  # Launch one background worker per conversation, never keeping more
+  # than $CONCURRENCY alive at once.
+  for CI in $INDICES; do
+    build_cmd "$CI"
+    LOG="$OUTPUT_ROOT/conv${CI}.log"
+    echo "  → launching conv${CI}  (log: $LOG)"
+    # stdout and stderr both go to the per-conv log; `$!` is read right
+    # after the launch so it refers to this worker.
+    env "${CMD[@]}" > "$LOG" 2>&1 &
+    pid=$!
+    RUN_PIDS+=("$pid")
+    RUN_CIS+=("$CI")
+
+    # Pool is full: block until one worker finishes before launching
+    # the next conversation.
+    if [[ ${#RUN_PIDS[@]} -ge $CONCURRENCY ]]; then
+      reap_one
+    fi
+  done
+
+  # Drain the remaining workers.
+  while [[ ${#RUN_PIDS[@]} -gt 0 ]]; do
+    reap_one
+  done
+fi
+
+# Report which conversations failed, if any, before the summary table.
+if [[ ${#FAILED[@]} -gt 0 ]]; then
+  echo
+  echo "⚠ ${#FAILED[@]} conv(s) failed: ${FAILED[*]}"
+fi
+
+# ── Aggregate summary ─────────────────────────────────────────────────
+#
+# Prints one row per conv*.json under $OUTPUT_ROOT and one column per
+# method found in any of them.
+#
+# The results directory is handed to Python as argv[1] and the here-doc
+# delimiter is quoted, so the shell never interpolates into the Python
+# source: a path containing quotes, backslashes or `$` can neither break
+# the program nor inject code into it.
+echo
+echo "═════════════════════════════════════════════════════════════════"
+echo "  Aggregate accuracy"
+echo "═════════════════════════════════════════════════════════════════"
+python - "$OUTPUT_ROOT" <<'PY'
+import json
+import sys
+from pathlib import Path
+
+root = Path(sys.argv[1])
+files = sorted(root.glob("conv*.json"))
+if not files:
+    print("  (no result files found)")
+    raise SystemExit
+
+# Parse every report exactly once. A report that cannot be read (for
+# example a partial file left behind by a failed conversation) is named
+# and skipped instead of taking the whole table down with it.
+reports = []
+for p in files:
+    try:
+        with open(p) as fh:
+            methods = json.load(fh)["methods"]
+        if not isinstance(methods, dict):
+            raise TypeError("'methods' is not an object")
+    except (OSError, ValueError, KeyError, TypeError) as exc:
+        print(f"  (skipping {p.name}: {exc!r})")
+        continue
+    reports.append((p.stem, methods))
+
+if not reports:
+    print("  (no readable result files)")
+    raise SystemExit
+
+# header: methods in first-seen order across all reports
+methods_seen = []
+for _, methods in reports:
+    for m in methods:
+        if m not in methods_seen:
+            methods_seen.append(m)
+
+w = max(20, max(len(label) + 4 for label, _ in reports))
+header = f"{'conversation':<{w}} " + "  ".join(f"{m:>10}" for m in methods_seen)
+print(header)
+print("─" * len(header))
+
+for label, methods in reports:
+    cells = []
+    for m in methods_seen:
+        mr = methods.get(m)
+        if mr is None:
+            # This conversation was not run with method m.
+            cells.append(f"{'—':>10}")
+        else:
+            raw = mr["summary"]["accuracy"]
+            # A string is read as an already-scaled percentage ("83.3%");
+            # a number is treated as a 0..1 fraction and scaled by 100.
+            acc = float(str(raw).rstrip("%")) if isinstance(raw, str) else float(raw) * 100
+            cells.append(f"{acc:>9.1f}%")
+    print(f"{label:<{w}} " + "  ".join(cells))
+
+print()
+print(f"  detailed JSONs: {root}/conv*.json")
+print(f"  phase checkpoints: {root}/conv*_checkpoints/")
+PY