Add Hermes memory evaluation framework with LoCoMo dataset support
- Implement HermesClient for interacting with the Hermes CLI.
- Create judge module for grading QA outputs from Hermes memory.
- Develop LoCoMo dataset parsing and formatting utilities.
- Introduce run_eval script to facilitate memory evaluation using LoCoMo-style datasets.
This commit is contained in:
51
eval/hermes_memory_eval/hermes_client.py
Normal file
51
eval/hermes_memory_eval/hermes_client.py
Normal file
@ -0,0 +1,51 @@
|
||||
"""Hermes CLI client used by the memory evaluation runner."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Mapping
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class HermesClientConfig:
    """Immutable settings describing how ``HermesClient`` invokes the Hermes CLI.

    Attributes:
        command: Executable placed first in the argument vector.
        timeout_seconds: Per-invocation timeout handed to ``subprocess.run``.
        quiet: When true, the ``-Q`` flag is added to the command line.
        source: Value passed after ``--source``; the flag is omitted when
            this is an empty string.
        extra_args: Additional arguments inserted after the built-in flags
            and before the trailing ``-q <message>`` pair.
    """

    command: str = "hermes"
    timeout_seconds: int = 600
    quiet: bool = True
    source: str = "memory-eval"
    # ``frozen=True`` only blocks rebinding the field; the list itself can
    # still be mutated in place. ``default_factory`` gives every instance its
    # own list so instances never share one default object.
    extra_args: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class HermesClient:
    """Thin wrapper that sends single chat messages to the Hermes CLI.

    The client only stores its configuration; every call to :meth:`chat`
    launches a new subprocess.
    """

    def __init__(self, config: HermesClientConfig):
        """Store the configuration used for all subsequent invocations."""
        self._config = config

    def chat(self, message: str, *, user_id: str, env: Mapping[str, str] | None = None) -> str:
        """Run one ``<command> chat`` invocation and return its output.

        Args:
            message: Text passed to the CLI after the ``-q`` flag.
            user_id: Exported to the child process as
                ``MEMORY_SYSTEM_USER_ID``.
            env: Optional extra environment variables. Values are converted
                with ``str``; entries whose value is ``None`` are skipped.
                These are applied last, so they override both the inherited
                environment and ``MEMORY_SYSTEM_USER_ID``.

        Returns:
            The child's standard output with surrounding whitespace stripped.

        Raises:
            RuntimeError: The process exited with a non-zero status. The
                message carries stderr, else stdout, else the exit code.
            subprocess.TimeoutExpired: The process ran longer than the
                configured ``timeout_seconds``; propagated unchanged from
                ``subprocess.run``.
        """
        command = self._build_command(message)
        process_env = self._build_env(user_id, env)

        result = subprocess.run(
            command,
            capture_output=True,
            check=False,  # the exit status is inspected manually below
            env=process_env,
            text=True,
            timeout=self._config.timeout_seconds,
        )
        if result.returncode != 0:
            stderr = result.stderr.strip()
            stdout = result.stdout.strip()
            # Prefer the most informative non-empty detail available.
            detail = stderr or stdout or f"exit code {result.returncode}"
            raise RuntimeError(f"Hermes command failed: {detail}")
        return result.stdout.strip()

    def _build_command(self, message: str) -> list[str]:
        """Assemble the argument vector for a single ``chat`` invocation."""
        config = self._config
        command = [config.command, "chat"]
        if config.quiet:
            command.append("-Q")
        if config.source:
            command.extend(["--source", config.source])
        command.extend(config.extra_args)
        # The message goes last so configured extra arguments precede it.
        command.extend(["-q", message])
        return command

    def _build_env(self, user_id: str, env: Mapping[str, str] | None) -> dict[str, str]:
        """Return a copy of the current environment with per-call overrides."""
        process_env = os.environ.copy()
        process_env["MEMORY_SYSTEM_USER_ID"] = user_id
        if env:
            process_env.update({key: str(value) for key, value in env.items() if value is not None})
        return process_env
|
||||
Reference in New Issue
Block a user