From 64d789a3d094053ce6ad8af7faa2a99c7147bc3a Mon Sep 17 00:00:00 2001
From: steven_li
Date: Mon, 8 Jun 2026 13:35:58 +0800
Subject: [PATCH] feat(skill-learning): produce replay eval reports

SkillDraftEvaluator.evaluate() accepts an optional replay_runner. When a
runner is given and select_replay_cases() yields cases, every case is run
as a "baseline" arm and a "candidate" arm (the draft pinned as a
SkillContext), scored by the surrogate evaluator, and folded into a
report with mode="replay" / eval_version="replay-v1", per-mode tool-call
coverage, a confidence label and, for revise/merge candidates that carry
base content, a preservation report. Without a runner or without cases
the existing heuristic path is used (moved into _evaluate_heuristic()).

SkillLearningPipelineService forwards replay_runner to the evaluator.

---
Review notes (dropped by git am, not part of the commit message):
- Line structure was restored from a whitespace-collapsed copy of this
  patch; context-line indentation is inferred, so run
  `git apply --check` before applying.
- Replay arm lists (tool_calls / artifacts / side_effects) are read with
  `or []` so an explicit None is treated as empty, as _coverage() already
  does. Line counts are unchanged, so hunk headers and diffstat still hold.
- NOTE(review): _confidence() only looks for "medium" in the per-case
  confidences; confirm whether "high" should count as well.
- NOTE(review): _report_from_case_reports() divides by len(legacy_cases);
  evaluate() only reaches it with a non-empty replay_cases list.

 .../backend/beaver/skills/learning/eval.py    | 208 +++++++++++++++++-
 .../beaver/skills/learning/pipeline.py        |   9 +-
 .../tests/unit/test_skill_learning_eval.py    |  57 +++++
 3 files changed, 271 insertions(+), 3 deletions(-)

diff --git a/app-instance/backend/beaver/skills/learning/eval.py b/app-instance/backend/beaver/skills/learning/eval.py
index cd6f06d..ba65a30 100644
--- a/app-instance/backend/beaver/skills/learning/eval.py
+++ b/app-instance/backend/beaver/skills/learning/eval.py
@@ -4,17 +4,28 @@ from __future__ import annotations
 
 from uuid import uuid4
 
+from beaver.engine.context import SkillContext
 from beaver.engine.providers import ProviderBundle
 from beaver.memory.runs import RunMemoryStore
 from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
+from beaver.skills.learning.case_selection import select_replay_cases
+from beaver.skills.learning.preservation import check_preservation
+from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
+from beaver.skills.learning.surrogate import SurrogateToolEvaluator
 from beaver.skills.specs import SkillDraft
 
 
 class SkillDraftEvaluator:
     """Builds a bounded eval report without writing user-visible sessions."""
 
-    def __init__(self, run_store: RunMemoryStore) -> None:
+    def __init__(
+        self,
+        run_store: RunMemoryStore,
+        *,
+        surrogate_evaluator: SurrogateToolEvaluator | None = None,
+    ) -> None:
         self.run_store = run_store
+        self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
 
     async def evaluate(
         self,
@@ -22,11 +33,30 @@ class SkillDraftEvaluator:
         candidate: SkillLearningCandidate,
         draft: SkillDraft,
         provider_bundle: ProviderBundle | None,
+        replay_runner: ReplayRunner | None = None,
     ) -> SkillDraftEvalReport:
         if provider_bundle is None or provider_bundle.main_provider is None:
             return self._skipped(candidate, draft)
 
-        runs_by_id = {record.run_id: record for record in self.run_store.list_runs()}
+        runs = self.run_store.list_runs()
+        replay_cases = select_replay_cases(candidate, runs)
+        if replay_runner is not None and replay_cases:
+            return await self._evaluate_replay(
+                candidate=candidate,
+                draft=draft,
+                replay_cases=replay_cases,
+                provider_bundle=provider_bundle,
+                replay_runner=replay_runner,
+            )
+        return self._evaluate_heuristic(candidate, draft, runs)
+
+    def _evaluate_heuristic(
+        self,
+        candidate: SkillLearningCandidate,
+        draft: SkillDraft,
+        runs: list,
+    ) -> SkillDraftEvalReport:
+        runs_by_id = {record.run_id: record for record in runs}
         cases: list[dict] = []
         for run_id in candidate.source_run_ids[:8]:
             record = runs_by_id.get(run_id)
@@ -78,6 +108,78 @@ class SkillDraftEvaluator:
             created_at=_utc_now(),
         )
 
+    async def _evaluate_replay(
+        self,
+        *,
+        candidate: SkillLearningCandidate,
+        draft: SkillDraft,
+        replay_cases: list[dict],
+        provider_bundle: ProviderBundle,
+        replay_runner: ReplayRunner,
+    ) -> SkillDraftEvalReport:
+        case_reports: list[dict] = []
+        legacy_cases: list[dict] = []
+        for case in replay_cases:
+            baseline = await replay_runner.run_arm(
+                ReplayArmRequest(
+                    case_id=f"{case['run_id']}:baseline",
+                    arm="baseline",
+                    task_text=str(case["task_text"]),
+                    pinned_skill_names=list(case.get("baseline_skill_names") or []),
+                    pinned_skill_contexts=[],
+                    provider_bundle=provider_bundle,
+                    model_settings={"max_tool_iterations": 4, "temperature": 0.0},
+                )
+            )
+            candidate_arm = await replay_runner.run_arm(
+                ReplayArmRequest(
+                    case_id=f"{case['run_id']}:candidate",
+                    arm="candidate",
+                    task_text=str(case["task_text"]),
+                    pinned_skill_names=[],
+                    pinned_skill_contexts=[_draft_skill_context(draft)],
+                    provider_bundle=provider_bundle,
+                    model_settings={"max_tool_iterations": 4, "temperature": 0.0},
+                )
+            )
+            surrogate = await self.surrogate_evaluator.evaluate(
+                task_text=str(case["task_text"]),
+                baseline=baseline,
+                candidate=candidate_arm,
+            )
+            baseline_score = surrogate["baseline_score"]
+            candidate_score = surrogate["candidate_score"]
+            case_report = {
+                "run_id": case["run_id"],
+                "task_id": case.get("task_id"),
+                "session_id": case.get("session_id"),
+                "baseline": baseline,
+                "candidate": candidate_arm,
+                "baseline_score": baseline_score,
+                "candidate_score": candidate_score,
+                "delta": round(candidate_score - baseline_score, 4),
+                "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
+                "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
+                "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
+                "confidence": surrogate["confidence"],
+                "tool_calls": [*(baseline.get("tool_calls") or []), *(candidate_arm.get("tool_calls") or [])],
+                "artifacts": [*(baseline.get("artifacts") or []), *(candidate_arm.get("artifacts") or [])],
+                "side_effects": [*(baseline.get("side_effects") or []), *(candidate_arm.get("side_effects") or [])],
+                "validator_notes": list(surrogate.get("notes") or []),
+            }
+            case_reports.append(case_report)
+            legacy_cases.append(
+                {
+                    "run_id": case["run_id"],
+                    "session_id": case.get("session_id") or "",
+                    "baseline_score": baseline_score,
+                    "candidate_score": candidate_score,
+                    "delta": round(candidate_score - baseline_score, 4),
+                }
+            )
+        preservation_report = _preservation_report(candidate, draft)
+        return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report)
+
     def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
         return SkillDraftEvalReport(
             report_id=uuid4().hex,
@@ -115,6 +217,108 @@ def _candidate_score(baseline: float, draft: SkillDraft) -> float:
     return min(1.0, max(0.75, baseline + 0.05))
 
 
+def _draft_skill_context(draft: SkillDraft) -> SkillContext:
+    tool_hints = draft.proposed_frontmatter.get("tools")
+    return SkillContext(
+        name=f"draft:{draft.skill_name}",
+        content=draft.proposed_content,
+        version=draft.draft_id,
+        content_hash="draft",
+        activation_reason="skill_replay_eval_candidate",
+        tool_hints=[str(item) for item in tool_hints if str(item).strip()] if isinstance(tool_hints, list) else [],
+    )
+
+
+def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
+    if candidate.kind not in {"revise_skill", "merge_skills"}:
+        return None
+    base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else ""
+    if not base_content.strip():
+        return None
+    return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
+
+
+def _report_from_case_reports(
+    candidate: SkillLearningCandidate,
+    draft: SkillDraft,
+    case_reports: list[dict],
+    legacy_cases: list[dict],
+    preservation_report: dict | None,
+) -> SkillDraftEvalReport:
+    baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
+    candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
+    regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
+    improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
+    unchanged = len(legacy_cases) - len(regressions) - len(improved)
+    execution, surrogate, blocked = _coverage(case_reports)
+    confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
+    score_delta = candidate_avg - baseline_avg
+    passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
+    return SkillDraftEvalReport(
+        report_id=uuid4().hex,
+        skill_name=draft.skill_name,
+        draft_id=draft.draft_id,
+        candidate_id=candidate.candidate_id,
+        passed=passed,
+        baseline_score_avg=round(baseline_avg, 4),
+        candidate_score_avg=round(candidate_avg, 4),
+        score_delta=round(score_delta, 4),
+        regression_count=len(regressions),
+        improved_count=len(improved),
+        unchanged_count=unchanged,
+        cases=legacy_cases,
+        status="completed",
+        created_at=_utc_now(),
+        eval_version="replay-v1",
+        mode="replay",
+        execution_coverage=execution,
+        surrogate_coverage=surrogate,
+        blocked_coverage=blocked,
+        confidence=confidence,
+        case_reports=case_reports,
+        tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
+        preservation_report=preservation_report,
+    )
+
+
+def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
+    counts = {"executed": 0, "surrogate": 0, "blocked": 0}
+    for report in case_reports:
+        for call in report.get("tool_calls") or []:
+            if isinstance(call, dict) and call.get("mode") in counts:
+                counts[str(call["mode"])] += 1
+    total = sum(counts.values())
+    if total == 0:
+        return 1.0, 0.0, 0.0
+    return (
+        round(counts["executed"] / total, 4),
+        round(counts["surrogate"] / total, 4),
+        round(counts["blocked"] / total, 4),
+    )
+
+
+def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str:
+    if blocked > 0.0:
+        return "low"
+    if execution >= 0.75 and surrogate <= 0.25:
+        return "high"
+    if execution >= 0.25 or "medium" in case_confidences:
+        return "medium"
+    return "low"
+
+
+def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float:
+    calls = [*(baseline.get("tool_calls") or []), *(candidate.get("tool_calls") or [])]
+    if not calls:
+        return 1.0 if mode == "executed" else 0.0
+    return round(sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) / len(calls), 4)
+
+
+def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int:
+    calls = [*(baseline.get("tool_calls") or []), *(candidate.get("tool_calls") or [])]
+    return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
+
+
 def _utc_now() -> str:
     from datetime import datetime, timezone
 
diff --git a/app-instance/backend/beaver/skills/learning/pipeline.py b/app-instance/backend/beaver/skills/learning/pipeline.py
index 3194710..65f0cbc 100644
--- a/app-instance/backend/beaver/skills/learning/pipeline.py
+++ b/app-instance/backend/beaver/skills/learning/pipeline.py
@@ -8,6 +8,7 @@ from beaver.engine.providers import ProviderBundle
 from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore
 from beaver.skills.drafts import DraftService
 from beaver.skills.learning.eval import SkillDraftEvaluator
+from beaver.skills.learning.replay import ReplayRunner
 from beaver.skills.learning.service import SkillLearningService
 from beaver.skills.learning.safety import SkillDraftSafetyChecker
 from beaver.skills.publisher import SkillPublisher
@@ -285,11 +286,17 @@ class SkillLearningPipelineService:
         draft_id: str,
         *,
         provider_bundle: ProviderBundle | None,
+        replay_runner: ReplayRunner | None = None,
     ) -> SkillDraftEvalReport:
         draft = self.get_draft(skill_name, draft_id)
         candidate = self.get_candidate(candidate_id)
         evaluator = self.evaluator or SkillDraftEvaluator(self.learning_service.run_store)
-        report = await evaluator.evaluate(candidate=candidate, draft=draft, provider_bundle=provider_bundle)
+        report = await evaluator.evaluate(
+            candidate=candidate,
+            draft=draft,
+            provider_bundle=provider_bundle,
+            replay_runner=replay_runner,
+        )
         self.learning_store.write_eval_report(report)
         if report.status == "skipped_provider_unavailable":
             status = "draft_ready"
diff --git a/app-instance/backend/tests/unit/test_skill_learning_eval.py b/app-instance/backend/tests/unit/test_skill_learning_eval.py
index 61c7d56..61b70e4 100644
--- a/app-instance/backend/tests/unit/test_skill_learning_eval.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_eval.py
@@ -44,6 +44,7 @@ def _pipeline(tmp_path: Path, *, task_score: float = 0.8) -> SkillLearningPipeli
             ended_at="end",
             success=True,
             finish_reason="stop",
+            feedback={"acceptance_type": "accept"},
             validation_result={"score": task_score, "passed": True},
         )
     )
@@ -156,3 +157,59 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
     assert safety.passed is False
     assert report.passed is True
     assert pipeline.get_candidate("candidate-1").status == "safety_failed"
+
+
+class FakeReplayRunner:
+    async def run_arm(self, request):
+        return {
+            "case_id": request.case_id,
+            "arm": request.arm,
+            "session_id": "session-replay",
+            "run_id": f"{request.arm}-run",
+            "task_text": request.task_text,
+            "finish_reason": "stop",
+            "final_answer": "done",
+            "tool_calls": [
+                {
+                    "tool_name": "write_file",
+                    "mode": "executed",
+                    "arguments": {"path": "README.md"},
+                    "result": {"success": True, "content": "ok"},
+                }
+            ],
+            "artifacts": [],
+            "side_effects": [],
+        }
+
+
+def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="release-checklist",
+        proposed_content="# Release\n\nRun tests.",
+        proposed_frontmatter={"description": "release", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.update_learning_candidate(
+        "candidate-1",
+        draft_skill_name=draft.skill_name,
+        draft_id=draft.draft_id,
+    )
+
+    report = asyncio.run(
+        pipeline.evaluate_draft(
+            "candidate-1",
+            draft.skill_name,
+            draft.draft_id,
+            provider_bundle=_bundle(),
+            replay_runner=FakeReplayRunner(),
+        )
+    )
+
+    assert report.mode == "replay"
+    assert report.eval_version == "replay-v1"
+    assert report.case_reports
+    assert 0.0 <= report.execution_coverage <= 1.0
+    assert 0.0 <= report.surrogate_coverage <= 1.0
+    assert report.confidence in {"low", "medium", "high"}