Add Hermes memory evaluation framework with LoCoMo dataset support

- Implement HermesClient for interacting with the Hermes CLI.
- Create judge module for grading QA outputs from Hermes memory.
- Develop LoCoMo dataset parsing and formatting utilities.
- Introduce run_eval script to facilitate memory evaluation using LoCoMo-style datasets.
2026-05-27 17:06:26 +08:00
parent ba59133d80
commit c173fa45a7
11 changed files with 68338 additions and 0 deletions
--- a/eval/hermes_memory_eval/hermes_client.py
+++ b/eval/hermes_memory_eval/hermes_client.py
@ -0,0 +1,51 @@
+"""Hermes CLI client used by the memory evaluation runner."""
+
+from __future__ import annotations
+
+import os
+import subprocess
+from dataclasses import dataclass, field
+from typing import Mapping
+
+
+@dataclass(frozen=True)
+class HermesClientConfig:
+    """Frozen settings that control how ``HermesClient`` invokes the CLI.
+
+    ``frozen=True`` blocks rebinding of fields after construction; the
+    ``extra_args`` list itself can still be mutated in place.
+    """
+
+    # Executable placed first on the command line.
+    command: str = "hermes"
+    # Passed as ``timeout`` to ``subprocess.run`` for each chat call.
+    timeout_seconds: int = 600
+    # When true, the ``-Q`` flag is added to the command line.
+    quiet: bool = True
+    # Value sent with ``--source``; an empty string omits the flag entirely.
+    source: str = "memory-eval"
+    # Extra arguments inserted just before the trailing ``-q <message>`` pair.
+    extra_args: list[str] = field(default_factory=list)
+
+
+class HermesClient:
+    """Thin wrapper that sends one message at a time to the Hermes CLI.
+
+    Each call to :meth:`chat` launches a fresh ``<command> chat`` subprocess
+    built from the configuration supplied at construction time.
+    """
+
+    def __init__(self, config: HermesClientConfig):
+        self._config = config
+
+    def chat(self, message: str, *, user_id: str, env: Mapping[str, str] | None = None) -> str:
+        """Run a single chat command and return its trimmed standard output.
+
+        Args:
+            message: Text passed to the CLI as the value of ``-q``.
+            user_id: Exported to the child process as ``MEMORY_SYSTEM_USER_ID``.
+            env: Optional extra environment variables. Entries whose value is
+                ``None`` are skipped; the rest are stringified and applied
+                last, so they override both the inherited environment and
+                ``MEMORY_SYSTEM_USER_ID``.
+
+        Returns:
+            The command's stdout with surrounding whitespace stripped.
+
+        Raises:
+            RuntimeError: If the command exits with a non-zero status. The
+                message carries stderr, else stdout, else the exit code.
+            subprocess.TimeoutExpired: If the command runs longer than the
+                configured ``timeout_seconds``.
+        """
+        cfg = self._config
+
+        # Argument order: executable, subcommand, optional flags,
+        # caller-supplied extras, and finally the message.
+        argv = [cfg.command, "chat"]
+        if cfg.quiet:
+            argv += ["-Q"]
+        if cfg.source:
+            argv += ["--source", cfg.source]
+        argv += [*cfg.extra_args, "-q", message]
+
+        # Start from the parent environment so the CLI keeps PATH and friends.
+        child_env = dict(os.environ)
+        child_env["MEMORY_SYSTEM_USER_ID"] = user_id
+        for name, value in (env or {}).items():
+            if value is not None:
+                child_env[name] = str(value)
+
+        completed = subprocess.run(
+            argv,
+            env=child_env,
+            capture_output=True,
+            text=True,
+            timeout=cfg.timeout_seconds,
+            check=False,
+        )
+        stdout = completed.stdout.strip()
+        if completed.returncode == 0:
+            return stdout
+
+        # Prefer stderr for diagnostics, then stdout, then the bare exit code.
+        detail = completed.stderr.strip() or stdout or f"exit code {completed.returncode}"
+        raise RuntimeError(f"Hermes command failed: {detail}")