feat(skill-learning): produce replay eval reports

This commit is contained in:
2026-06-08 13:35:58 +08:00
parent cc1bf85517
commit 64d789a3d0
3 changed files with 271 additions and 3 deletions

View File

@ -4,17 +4,28 @@ from __future__ import annotations
from uuid import uuid4 from uuid import uuid4
from beaver.engine.context import SkillContext
from beaver.engine.providers import ProviderBundle from beaver.engine.providers import ProviderBundle
from beaver.memory.runs import RunMemoryStore from beaver.memory.runs import RunMemoryStore
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.skills.learning.case_selection import select_replay_cases
from beaver.skills.learning.preservation import check_preservation
from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
from beaver.skills.specs import SkillDraft from beaver.skills.specs import SkillDraft
class SkillDraftEvaluator: class SkillDraftEvaluator:
"""Builds a bounded eval report without writing user-visible sessions.""" """Builds a bounded eval report without writing user-visible sessions."""
def __init__(self, run_store: RunMemoryStore) -> None: def __init__(
self,
run_store: RunMemoryStore,
*,
surrogate_evaluator: SurrogateToolEvaluator | None = None,
) -> None:
self.run_store = run_store self.run_store = run_store
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
async def evaluate( async def evaluate(
self, self,
@ -22,11 +33,30 @@ class SkillDraftEvaluator:
candidate: SkillLearningCandidate, candidate: SkillLearningCandidate,
draft: SkillDraft, draft: SkillDraft,
provider_bundle: ProviderBundle | None, provider_bundle: ProviderBundle | None,
replay_runner: ReplayRunner | None = None,
) -> SkillDraftEvalReport: ) -> SkillDraftEvalReport:
if provider_bundle is None or provider_bundle.main_provider is None: if provider_bundle is None or provider_bundle.main_provider is None:
return self._skipped(candidate, draft) return self._skipped(candidate, draft)
runs_by_id = {record.run_id: record for record in self.run_store.list_runs()} runs = self.run_store.list_runs()
replay_cases = select_replay_cases(candidate, runs)
if replay_runner is not None and replay_cases:
return await self._evaluate_replay(
candidate=candidate,
draft=draft,
replay_cases=replay_cases,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
)
return self._evaluate_heuristic(candidate, draft, runs)
def _evaluate_heuristic(
self,
candidate: SkillLearningCandidate,
draft: SkillDraft,
runs: list,
) -> SkillDraftEvalReport:
runs_by_id = {record.run_id: record for record in runs}
cases: list[dict] = [] cases: list[dict] = []
for run_id in candidate.source_run_ids[:8]: for run_id in candidate.source_run_ids[:8]:
record = runs_by_id.get(run_id) record = runs_by_id.get(run_id)
@ -78,6 +108,78 @@ class SkillDraftEvaluator:
created_at=_utc_now(), created_at=_utc_now(),
) )
async def _evaluate_replay(
    self,
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    replay_cases: list[dict],
    provider_bundle: ProviderBundle,
    replay_runner: ReplayRunner,
) -> SkillDraftEvalReport:
    """Replay every selected case as a baseline arm and a candidate arm.

    For each case the baseline arm is run with the case's
    ``baseline_skill_names`` pinned, then the candidate arm is run with the
    draft injected as a pinned skill context. Both arm results are scored by
    ``self.surrogate_evaluator`` and folded into a detailed case report plus
    a compact legacy case entry. The collected entries are aggregated by
    ``_report_from_case_reports``.
    """
    # Both arms use the same bounded settings; each request gets its own copy.
    arm_settings = {"max_tool_iterations": 4, "temperature": 0.0}

    detailed_reports: list[dict] = []
    summary_cases: list[dict] = []

    for replay_case in replay_cases:
        source_run_id = replay_case["run_id"]
        prompt = str(replay_case["task_text"])

        baseline_request = ReplayArmRequest(
            case_id=f"{source_run_id}:baseline",
            arm="baseline",
            task_text=prompt,
            pinned_skill_names=list(replay_case.get("baseline_skill_names") or []),
            pinned_skill_contexts=[],
            provider_bundle=provider_bundle,
            model_settings=dict(arm_settings),
        )
        baseline = await replay_runner.run_arm(baseline_request)

        candidate_request = ReplayArmRequest(
            case_id=f"{source_run_id}:candidate",
            arm="candidate",
            task_text=prompt,
            pinned_skill_names=[],
            pinned_skill_contexts=[_draft_skill_context(draft)],
            provider_bundle=provider_bundle,
            model_settings=dict(arm_settings),
        )
        candidate_arm = await replay_runner.run_arm(candidate_request)

        verdict = await self.surrogate_evaluator.evaluate(
            task_text=prompt,
            baseline=baseline,
            candidate=candidate_arm,
        )
        baseline_score = verdict["baseline_score"]
        candidate_score = verdict["candidate_score"]
        delta = round(candidate_score - baseline_score, 4)

        # Detailed entry: raw arm results plus per-case coverage figures.
        detailed_reports.append(
            {
                "run_id": source_run_id,
                "task_id": replay_case.get("task_id"),
                "session_id": replay_case.get("session_id"),
                "baseline": baseline,
                "candidate": candidate_arm,
                "baseline_score": baseline_score,
                "candidate_score": candidate_score,
                "delta": delta,
                "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
                "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
                "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
                "confidence": verdict["confidence"],
                "tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
                "artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
                "side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
                "validator_notes": list(verdict.get("notes") or []),
            }
        )
        # Compact entry: scores only, stored in the report's ``cases`` field.
        summary_cases.append(
            {
                "run_id": source_run_id,
                "session_id": replay_case.get("session_id") or "",
                "baseline_score": baseline_score,
                "candidate_score": candidate_score,
                "delta": delta,
            }
        )

    preservation = _preservation_report(candidate, draft)
    return _report_from_case_reports(candidate, draft, detailed_reports, summary_cases, preservation)
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport: def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
return SkillDraftEvalReport( return SkillDraftEvalReport(
report_id=uuid4().hex, report_id=uuid4().hex,
@ -115,6 +217,108 @@ def _candidate_score(baseline: float, draft: SkillDraft) -> float:
return min(1.0, max(0.75, baseline + 0.05)) return min(1.0, max(0.75, baseline + 0.05))
def _draft_skill_context(draft: SkillDraft) -> SkillContext:
    """Wrap a draft as a ``SkillContext`` so it can be pinned in a replay arm.

    Tool hints are taken from the ``tools`` entry of the draft's proposed
    frontmatter, but only when that entry is a list; items whose string form
    is blank are dropped. Any other shape yields no hints.
    """
    declared_tools = draft.proposed_frontmatter.get("tools")
    hints: list[str] = []
    if isinstance(declared_tools, list):
        hints = [str(entry) for entry in declared_tools if str(entry).strip()]
    return SkillContext(
        name=f"draft:{draft.skill_name}",
        content=draft.proposed_content,
        version=draft.draft_id,
        # Fixed placeholder; no hash of the draft content is computed here.
        content_hash="draft",
        activation_reason="skill_replay_eval_candidate",
        tool_hints=hints,
    )
def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
if candidate.kind not in {"revise_skill", "merge_skills"}:
return None
base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else ""
if not base_content.strip():
return None
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
def _report_from_case_reports(
candidate: SkillLearningCandidate,
draft: SkillDraft,
case_reports: list[dict],
legacy_cases: list[dict],
preservation_report: dict | None,
) -> SkillDraftEvalReport:
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
unchanged = len(legacy_cases) - len(regressions) - len(improved)
execution, surrogate, blocked = _coverage(case_reports)
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
score_delta = candidate_avg - baseline_avg
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
draft_id=draft.draft_id,
candidate_id=candidate.candidate_id,
passed=passed,
baseline_score_avg=round(baseline_avg, 4),
candidate_score_avg=round(candidate_avg, 4),
score_delta=round(score_delta, 4),
regression_count=len(regressions),
improved_count=len(improved),
unchanged_count=unchanged,
cases=legacy_cases,
status="completed",
created_at=_utc_now(),
eval_version="replay-v1",
mode="replay",
execution_coverage=execution,
surrogate_coverage=surrogate,
blocked_coverage=blocked,
confidence=confidence,
case_reports=case_reports,
tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
preservation_report=preservation_report,
)
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
for report in case_reports:
for call in report.get("tool_calls") or []:
if isinstance(call, dict) and call.get("mode") in counts:
counts[str(call["mode"])] += 1
total = sum(counts.values())
if total == 0:
return 1.0, 0.0, 0.0
return (
round(counts["executed"] / total, 4),
round(counts["surrogate"] / total, 4),
round(counts["blocked"] / total, 4),
)
def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str:
if blocked > 0.0:
return "low"
if execution >= 0.75 and surrogate <= 0.25:
return "high"
if execution >= 0.25 or "medium" in case_confidences:
return "medium"
return "low"
def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float:
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
if not calls:
return 1.0 if mode == "executed" else 0.0
return round(sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) / len(calls), 4)
def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int:
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
def _utc_now() -> str: def _utc_now() -> str:
from datetime import datetime, timezone from datetime import datetime, timezone

View File

@ -8,6 +8,7 @@ from beaver.engine.providers import ProviderBundle
from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore
from beaver.skills.drafts import DraftService from beaver.skills.drafts import DraftService
from beaver.skills.learning.eval import SkillDraftEvaluator from beaver.skills.learning.eval import SkillDraftEvaluator
from beaver.skills.learning.replay import ReplayRunner
from beaver.skills.learning.service import SkillLearningService from beaver.skills.learning.service import SkillLearningService
from beaver.skills.learning.safety import SkillDraftSafetyChecker from beaver.skills.learning.safety import SkillDraftSafetyChecker
from beaver.skills.publisher import SkillPublisher from beaver.skills.publisher import SkillPublisher
@ -285,11 +286,17 @@ class SkillLearningPipelineService:
draft_id: str, draft_id: str,
*, *,
provider_bundle: ProviderBundle | None, provider_bundle: ProviderBundle | None,
replay_runner: ReplayRunner | None = None,
) -> SkillDraftEvalReport: ) -> SkillDraftEvalReport:
draft = self.get_draft(skill_name, draft_id) draft = self.get_draft(skill_name, draft_id)
candidate = self.get_candidate(candidate_id) candidate = self.get_candidate(candidate_id)
evaluator = self.evaluator or SkillDraftEvaluator(self.learning_service.run_store) evaluator = self.evaluator or SkillDraftEvaluator(self.learning_service.run_store)
report = await evaluator.evaluate(candidate=candidate, draft=draft, provider_bundle=provider_bundle) report = await evaluator.evaluate(
candidate=candidate,
draft=draft,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
)
self.learning_store.write_eval_report(report) self.learning_store.write_eval_report(report)
if report.status == "skipped_provider_unavailable": if report.status == "skipped_provider_unavailable":
status = "draft_ready" status = "draft_ready"

View File

@ -44,6 +44,7 @@ def _pipeline(tmp_path: Path, *, task_score: float = 0.8) -> SkillLearningPipeli
ended_at="end", ended_at="end",
success=True, success=True,
finish_reason="stop", finish_reason="stop",
feedback={"acceptance_type": "accept"},
validation_result={"score": task_score, "passed": True}, validation_result={"score": task_score, "passed": True},
) )
) )
@ -156,3 +157,59 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
assert safety.passed is False assert safety.passed is False
assert report.passed is True assert report.passed is True
assert pipeline.get_candidate("candidate-1").status == "safety_failed" assert pipeline.get_candidate("candidate-1").status == "safety_failed"
class FakeReplayRunner:
    """Replay-runner test double that returns a canned arm result."""

    async def run_arm(self, request):
        """Echo the request's identifiers alongside one executed ``write_file`` call."""
        executed_write = {
            "tool_name": "write_file",
            "mode": "executed",
            "arguments": {"path": "README.md"},
            "result": {"success": True, "content": "ok"},
        }
        outcome = {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "done",
        }
        outcome["tool_calls"] = [executed_write]
        outcome["artifacts"] = []
        outcome["side_effects"] = []
        return outcome
def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    """Evaluating with a replay runner yields a replay-mode report with coverage data."""
    pipeline = _pipeline(tmp_path)
    frontmatter = {"description": "release", "tools": []}
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter=frontmatter,
        created_by="test",
        reason="test",
    )
    # Attach the draft to the candidate that the pipeline fixture provides.
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=FakeReplayRunner(),
    )
    report = asyncio.run(evaluation)

    assert report.mode == "replay"
    assert report.eval_version == "replay-v1"
    assert report.case_reports
    for share in (report.execution_coverage, report.surrogate_coverage):
        assert 0.0 <= share <= 1.0
    assert report.confidence in {"low", "medium", "high"}