From 0fd4df3c1736f1dc2bf23585ab28c8911d0afde1 Mon Sep 17 00:00:00 2001 From: steven_li Date: Mon, 8 Jun 2026 11:26:07 +0800 Subject: [PATCH] docs: plan skill replay eval implementation --- .../plans/2026-06-08-skill-replay-eval.md | 2163 +++++++++++++++++ 1 file changed, 2163 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-08-skill-replay-eval.md diff --git a/docs/superpowers/plans/2026-06-08-skill-replay-eval.md b/docs/superpowers/plans/2026-06-08-skill-replay-eval.md new file mode 100644 index 0000000..d04b3a8 --- /dev/null +++ b/docs/superpowers/plans/2026-06-08-skill-replay-eval.md @@ -0,0 +1,2163 @@ +# Skill Replay Eval Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace heuristic-only skill draft evaluation with replay-style reports that cover all tools through either safe execution or LLM surrogate judgment, while preserving base skill content during draft synthesis. + +**Architecture:** Extend the existing skill learning pipeline instead of replacing it. Add replay report fields to `SkillDraftEvalReport`, introduce focused helper modules under `beaver/skills/learning/`, then wire the enhanced evaluator through the existing `SkillLearningPipelineService.evaluate_draft()` path and Skills UI. + +**Tech Stack:** Python dataclasses, existing file-backed stores, pytest, FastAPI response payloads, Next.js/TypeScript Skills page components. + +--- + +## File Structure + +Create focused backend modules: + +- `app-instance/backend/beaver/skills/learning/case_selection.py` selects up to 10 historical replay cases. +- `app-instance/backend/beaver/skills/learning/preservation.py` builds base skill snapshots and checks draft preservation. 
+- `app-instance/backend/beaver/skills/learning/replay.py` defines replay tool policy, trace executor, arm reports, and the AgentLoop-backed replay runner. +- `app-instance/backend/beaver/skills/learning/surrogate.py` scores non-executed tool calls with an LLM-backed or deterministic fallback surrogate. + +Modify existing backend modules: + +- `app-instance/backend/beaver/memory/skills/models.py` extends `SkillDraftEvalReport` with replay fields while preserving old payload compatibility. +- `app-instance/backend/beaver/skills/learning/eval.py` orchestrates replay eval and keeps heuristic fallback. +- `app-instance/backend/beaver/skills/learning/synthesizer.py` includes base skill snapshots for revision and merge prompts. +- `app-instance/backend/beaver/skills/learning/service.py` passes `SkillSpecStore` context into synthesis helpers where needed. +- `app-instance/backend/beaver/skills/learning/pipeline.py` updates publish gate confidence checks. +- `app-instance/backend/beaver/engine/loop.py` accepts a replay executor override for isolated tool execution. +- `app-instance/backend/beaver/interfaces/web/app.py` injects `ReplayRunner(agent_loop=loop)` for draft evaluation requests. + +Modify frontend: + +- `app-instance/frontend/types/index.ts` adds replay report fields. +- `app-instance/frontend/app/(app)/skills/page.tsx` shows execution coverage, surrogate coverage, confidence, replay cases, and preservation details. 
+ +Add focused tests: + +- `app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py` +- `app-instance/backend/tests/unit/test_skill_learning_case_selection.py` +- `app-instance/backend/tests/unit/test_skill_learning_preservation.py` +- `app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py` +- `app-instance/backend/tests/unit/test_skill_learning_replay.py` +- `app-instance/backend/tests/unit/test_skill_learning_replay_runner.py` +- `app-instance/backend/tests/unit/test_agent_loop_replay_executor.py` +- `app-instance/backend/tests/unit/test_skill_learning_surrogate.py` +- Extend `app-instance/backend/tests/unit/test_skill_learning_eval.py` +- Extend `app-instance/backend/tests/unit/test_skill_learning_pipeline.py` + +--- + +### Task 1: Extend Eval Report Model Compatibly + +**Files:** +- Modify: `app-instance/backend/beaver/memory/skills/models.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py` + +- [ ] **Step 1: Write failing model compatibility tests** + +Create `app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py`: + +```python +from __future__ import annotations + +from beaver.memory.skills import SkillDraftEvalReport + + +def test_eval_report_defaults_preserve_legacy_payload_shape() -> None: + report = SkillDraftEvalReport( + report_id="eval-1", + skill_name="debug", + draft_id="draft-1", + candidate_id="candidate-1", + passed=True, + baseline_score_avg=0.5, + candidate_score_avg=0.8, + score_delta=0.3, + regression_count=0, + improved_count=2, + unchanged_count=0, + cases=[{"run_id": "run-1"}], + status="completed", + created_at="now", + ) + + payload = report.to_dict() + + assert payload["eval_version"] == "heuristic-v1" + assert payload["mode"] == "heuristic" + assert payload["execution_coverage"] == 0.0 + assert payload["surrogate_coverage"] == 0.0 + assert payload["blocked_coverage"] == 0.0 + assert payload["confidence"] == "low" + assert payload["case_reports"] == [] + assert payload["tool_mode_summary"] == {} + assert 
payload["preservation_report"] is None + assert payload["cases"] == [{"run_id": "run-1"}] + + +def test_eval_report_reads_legacy_payload_without_replay_fields() -> None: + report = SkillDraftEvalReport.from_dict( + { + "report_id": "eval-legacy", + "skill_name": "debug", + "draft_id": "draft-1", + "candidate_id": "candidate-1", + "passed": True, + "baseline_score_avg": 0.4, + "candidate_score_avg": 0.8, + "score_delta": 0.4, + "regression_count": 0, + "improved_count": 1, + "unchanged_count": 0, + "cases": [{"run_id": "run-1"}], + "status": "completed", + "created_at": "now", + } + ) + + assert report.eval_version == "heuristic-v1" + assert report.mode == "heuristic" + assert report.confidence == "low" + assert report.case_reports == [] +``` + +- [ ] **Step 2: Run the new tests to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_eval_report_model.py -v +``` + +Expected: FAIL because `SkillDraftEvalReport` has no replay fields. + +- [ ] **Step 3: Extend the dataclass** + +In `app-instance/backend/beaver/memory/skills/models.py`, add fields to `SkillDraftEvalReport` after `created_at`: + +```python + eval_version: str = "heuristic-v1" + mode: str = "heuristic" + execution_coverage: float = 0.0 + surrogate_coverage: float = 0.0 + blocked_coverage: float = 0.0 + confidence: str = "low" + case_reports: list[dict[str, Any]] = field(default_factory=list) + tool_mode_summary: dict[str, Any] = field(default_factory=dict) + preservation_report: dict[str, Any] | None = None +``` + +Update `to_dict()` to include these fields: + +```python + "eval_version": self.eval_version, + "mode": self.mode, + "execution_coverage": self.execution_coverage, + "surrogate_coverage": self.surrogate_coverage, + "blocked_coverage": self.blocked_coverage, + "confidence": self.confidence, + "case_reports": [dict(item) for item in self.case_reports], + "tool_mode_summary": dict(self.tool_mode_summary), + "preservation_report": 
dict(self.preservation_report) if self.preservation_report is not None else None, +``` + +Update `from_dict()` with bounded numeric parsing: + +```python + eval_version=str(payload.get("eval_version") or "heuristic-v1"), + mode=str(payload.get("mode") or "heuristic"), + execution_coverage=_bounded_float(payload.get("execution_coverage"), default=0.0), + surrogate_coverage=_bounded_float(payload.get("surrogate_coverage"), default=0.0), + blocked_coverage=_bounded_float(payload.get("blocked_coverage"), default=0.0), + confidence=str(payload.get("confidence") or "low"), + case_reports=[dict(item) for item in payload.get("case_reports") or [] if isinstance(item, dict)], + tool_mode_summary=dict(payload.get("tool_mode_summary") or {}), + preservation_report=( + dict(payload["preservation_report"]) + if isinstance(payload.get("preservation_report"), dict) + else None + ), +``` + +Add helper near `_optional_float`: + +```python +def _bounded_float(value: Any, *, default: float = 0.0) -> float: + if value in (None, ""): + return default + try: + return max(0.0, min(1.0, float(value))) + except (TypeError, ValueError): + return default +``` + +- [ ] **Step 4: Run model tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_eval_report_model.py -v +``` + +Expected: PASS. + +- [ ] **Step 5: Run existing eval store test** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_candidate_state.py::test_reports_are_stored_and_retrieved -v +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit** + +```bash +git add app-instance/backend/beaver/memory/skills/models.py app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py +git commit -m "feat(skill-learning): extend eval report payload" +``` + +--- + +### Task 2: Add Base Skill Preservation Helpers + +**Files:** +- Create: `app-instance/backend/beaver/skills/learning/preservation.py` +- Modify: `app-instance/backend/beaver/skills/learning/__init__.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_preservation.py` + +- [ ] **Step 1: Write failing preservation tests** + +Create `app-instance/backend/tests/unit/test_skill_learning_preservation.py`: + +```python +from __future__ import annotations + +from beaver.skills.learning.preservation import check_preservation + + +def test_preservation_passes_when_base_sections_remain() -> None: + base = "# Skill\n\n## Workflow\n\n- Read first.\n\n## Safety\n\n- Do not delete files.\n" + draft = "# Skill\n\n## Workflow\n\n- Read first.\n- Then write.\n\n## Safety\n\n- Do not delete files.\n" + + report = check_preservation(base_content=base, draft_content=draft) + + assert report["passed"] is True + assert report["risk_level"] == "low" + assert "Workflow" in report["preserved_sections"] + assert "Safety" in report["preserved_sections"] + assert report["dropped_sections"] == [] + + +def test_preservation_flags_dropped_section() -> None: + base = "# Skill\n\n## Workflow\n\n- Read first.\n\n## Safety\n\n- Do not delete files.\n" + draft = "# Skill\n\n## Workflow\n\n- Read first.\n" + + report = check_preservation(base_content=base, draft_content=draft) + + assert report["passed"] is False + assert report["risk_level"] == "high" + assert "Safety" in report["dropped_sections"] +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_preservation.py -v +``` + +Expected: FAIL because `preservation.py` does not exist. 
+ +- [ ] **Step 3: Implement preservation helpers** + +Create `app-instance/backend/beaver/skills/learning/preservation.py`: + +```python +"""Preservation checks for skill revision drafts.""" + +from __future__ import annotations + +import re +from typing import Any + + +def check_preservation(*, base_content: str, draft_content: str) -> dict[str, Any]: + base_sections = _sections(base_content) + draft_sections = _sections(draft_content) + preserved: list[str] = [] + changed: list[str] = [] + dropped: list[str] = [] + + for heading, body in base_sections.items(): + draft_body = draft_sections.get(heading) + if draft_body is None: + dropped.append(heading) + continue + if _normalize(body) in _normalize(draft_body): # additive edits still keep the base text intact + preserved.append(heading) + else: + changed.append(heading) + + risk_level = "high" if dropped else ("medium" if changed else "low") + return { + "passed": not dropped, + "risk_level": risk_level, + "preserved_sections": preserved, + "changed_sections": changed, + "dropped_sections": dropped, + } + + +def _sections(content: str) -> dict[str, str]: + current = "body" + sections: dict[str, list[str]] = {current: []} + for line in (content or "").splitlines(): + match = re.match(r"^#{1,6}\s+(.+?)\s*$", line) + if match: + current = match.group(1).strip() + sections.setdefault(current, []) + continue + sections.setdefault(current, []).append(line) + return {heading: "\n".join(lines).strip() for heading, lines in sections.items() if "\n".join(lines).strip()} + + +def _normalize(value: str) -> str: + return re.sub(r"\s+", " ", value or "").strip().lower() +``` + +Export helpers from `app-instance/backend/beaver/skills/learning/__init__.py`: + +```python +from .preservation import check_preservation +``` + +Add `"check_preservation"` to `__all__`. + +- [ ] **Step 4: Run preservation tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_preservation.py -v +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning/preservation.py app-instance/backend/beaver/skills/learning/__init__.py app-instance/backend/tests/unit/test_skill_learning_preservation.py +git commit -m "feat(skill-learning): add draft preservation checks" +``` + +--- + +### Task 3: Include Base Skill Snapshots During Synthesis + +**Files:** +- Modify: `app-instance/backend/beaver/skills/learning/synthesizer.py` +- Modify: `app-instance/backend/beaver/skills/learning/service.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py` + +- [ ] **Step 1: Write failing prompt test** + +Create `app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py`: + +```python +from __future__ import annotations + +from beaver.memory.skills import SkillLearningCandidate +from beaver.skills.learning.evidence import EvidencePacket +from beaver.skills.learning.synthesizer import SkillDraftSynthesizer + + +def test_revision_prompt_includes_base_skill_snapshot() -> None: + candidate = SkillLearningCandidate( + candidate_id="candidate-1", + kind="revise_skill", + source_run_ids=["run-1"], + source_session_ids=["session-1"], + related_skill_names=["debug-skill"], + reason="Improve debugging flow.", + ) + packet = EvidencePacket( + run_ids=["run-1"], + session_ids=["session-1"], + task_summaries=["debug a failing test"], + session_excerpts=["assistant: fixed it"], + ) + prompt = SkillDraftSynthesizer._build_prompt( + candidate, + packet, + "revise", + base_skill={ + "skill_name": "debug-skill", + "version": "v0001", + "frontmatter": {"description": "Debug tests", "tools": ["read_file"]}, + "content": "# Debug Skill\n\n## Safety\n\nDo not delete files.", + "summary": "Debug tests safely.", + "tool_hints": ["read_file"], + }, + ) + + assert "Base skill snapshot" in prompt + assert "# Debug Skill" in prompt + assert "Do not delete files." 
in prompt + assert "preserved_sections" in prompt + assert "dropped_sections" in prompt +``` + +- [ ] **Step 2: Run test to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_synthesizer_preservation.py -v +``` + +Expected: FAIL because `_build_prompt()` does not accept `base_skill`. + +- [ ] **Step 3: Update synthesizer signatures** + +In `app-instance/backend/beaver/skills/learning/synthesizer.py`, change these signatures: + +```python + async def synthesize_revision( + self, + candidate: SkillLearningCandidate, + evidence_packet: EvidencePacket, + provider: LLMProvider, + model: str, + base_skill: dict[str, Any] | None = None, + ) -> dict[str, Any]: + return await self._synthesize(candidate, evidence_packet, provider, model, "revise", base_skill=base_skill) +``` + +Apply the same optional `base_skill` parameter to `synthesize_merge()` and `_synthesize()`. Keep `synthesize_new_skill()` unchanged except for calling `_synthesize(..., base_skill=None)`. + +- [ ] **Step 4: Update prompt builder** + +Update `_build_prompt()` signature: + +```python + def _build_prompt( + candidate: SkillLearningCandidate, + evidence_packet: EvidencePacket, + action: str, + base_skill: dict[str, Any] | None = None, + ) -> str: +``` + +Before the return, build a base section: + +```python + base_section = "" + if base_skill: + base_section = ( + "\n\nBase skill snapshot:\n" + f"- skill_name: {base_skill.get('skill_name')}\n" + f"- version: {base_skill.get('version')}\n" + f"- frontmatter: {json.dumps(base_skill.get('frontmatter') or {}, ensure_ascii=False, sort_keys=True)}\n" + f"- tool_hints: {base_skill.get('tool_hints') or []}\n" + f"- summary: {base_skill.get('summary') or ''}\n" + "Base skill content:\n" + f"{base_skill.get('content') or ''}\n" + "Preserve existing instructions unless the evidence requires a change. " + "If any section is changed or dropped, explain it in changed_sections or dropped_sections." 
+ ) +``` + +Append `base_section` before the JSON instructions. Extend the JSON instruction: + +```python + + "\nThe JSON may include preserved_sections, changed_sections, and dropped_sections arrays." +``` + +- [ ] **Step 5: Preserve parsed metadata from model output** + +In `_parse_payload()`, include optional arrays: + +```python + "preserved_sections": _coerce_string_list(payload.get("preserved_sections")), + "changed_sections": _coerce_string_list(payload.get("changed_sections")), + "dropped_sections": _coerce_string_list(payload.get("dropped_sections")), +``` + +In `_normalize_payload()`, return those fields as well. + +- [ ] **Step 6: Add base skill snapshot lookup in service** + +In `app-instance/backend/beaver/skills/learning/service.py`, import `SkillSpecStore` only if a type-checking-only import is not enough. The service already has `draft_service.store`; use it to read published skills. + +Add helper: + +```python + def _base_skill_snapshot(self, skill_name: str, version: str | None) -> dict[str, Any] | None: + loaded = self.draft_service.store.read_published_skill(skill_name, version) + if loaded is None: + return None + return { + "skill_name": loaded.version.skill_name, + "version": loaded.version.version, + "frontmatter": dict(loaded.version.frontmatter), + "content": loaded.content, + "summary": loaded.version.summary, + "tool_hints": list(loaded.version.tool_hints), + } +``` + +Pass this snapshot to `synthesize_revision()` and `synthesize_merge()`. + +- [ ] **Step 7: Run synthesizer test** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_synthesizer_preservation.py -v +``` + +Expected: PASS. + +- [ ] **Step 8: Run existing learning tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_eval.py tests/unit/test_skill_learning_worker.py -v +``` + +Expected: PASS. 
+ +- [ ] **Step 9: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning/synthesizer.py app-instance/backend/beaver/skills/learning/service.py app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py +git commit -m "feat(skill-learning): preserve base skill during synthesis" +``` + +--- + +### Task 4: Add Historical Replay Case Selection + +**Files:** +- Create: `app-instance/backend/beaver/skills/learning/case_selection.py` +- Modify: `app-instance/backend/beaver/skills/learning/__init__.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_case_selection.py` + +- [ ] **Step 1: Write failing case selection tests** + +Create `app-instance/backend/tests/unit/test_skill_learning_case_selection.py`: + +```python +from __future__ import annotations + +from beaver.memory.runs import RunRecord +from beaver.memory.skills import SkillLearningCandidate +from beaver.skills.learning.case_selection import select_replay_cases +from beaver.skills.specs import SkillActivationReceipt + + +def _run( + run_id: str, + *, + task_id: str = "task", + session_id: str = "session", + task_text: str = "debug task", + skill_name: str | None = None, + skill_version: str = "v0001", +) -> RunRecord: + receipts = [] + if skill_name: + receipts.append( + SkillActivationReceipt( + run_id=run_id, + session_id=session_id, + skill_name=skill_name, + skill_version=skill_version, + content_hash="hash", + activated_at="now", + activation_reason="selected", + ) + ) + return RunRecord( + run_id=run_id, + session_id=session_id, + task_id=task_id, + attempt_index=1, + task_text=task_text, + started_at=f"2026-06-08T00:00:{run_id[-2:]}+00:00", + ended_at="end", + success=True, + finish_reason="stop", + feedback={"acceptance_type": "accept"}, + activated_skills=receipts, + ) + + +def test_select_revise_cases_caps_at_ten_and_prefers_related_skill() -> None: + runs = [_run(f"run-{index:02d}", task_id=f"task-{index}", skill_name="debug", skill_version="v0001") 
for index in range(12)] + candidate = SkillLearningCandidate( + candidate_id="candidate-1", + kind="revise_skill", + source_run_ids=[], + source_session_ids=[], + related_skill_names=["debug"], + reason="revise", + evidence={"skill_version": "v0001"}, + ) + + cases = select_replay_cases(candidate, runs) + + assert len(cases) == 10 + assert all(case["baseline_skill_names"] == ["debug"] for case in cases) + assert cases[0]["run_id"] == "run-11" + + +def test_select_new_skill_uses_all_available_source_runs_under_ten() -> None: + runs = [_run(f"run-{index:02d}", task_id=f"task-{index}") for index in range(3)] + candidate = SkillLearningCandidate( + candidate_id="candidate-1", + kind="new_skill", + source_run_ids=["run-00", "run-01", "run-02"], + source_session_ids=["session"], + related_skill_names=[], + reason="new", + ) + + cases = select_replay_cases(candidate, runs) + + assert [case["run_id"] for case in cases] == ["run-02", "run-01", "run-00"] + assert all(case["baseline_skill_names"] == [] for case in cases) +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_case_selection.py -v +``` + +Expected: FAIL because `case_selection.py` does not exist. 
+ +- [ ] **Step 3: Implement selector** + +Create `app-instance/backend/beaver/skills/learning/case_selection.py`: + +```python +"""Historical replay case selection for skill draft evaluation.""" + +from __future__ import annotations + +from typing import Any + +from beaver.memory.runs import RunRecord +from beaver.memory.skills import SkillLearningCandidate + +MAX_REPLAY_CASES = 10 + + +def select_replay_cases(candidate: SkillLearningCandidate, runs: list[RunRecord]) -> list[dict[str, Any]]: + accepted = [record for record in runs if _is_accepted(record)] + if candidate.kind == "revise_skill": + selected = _select_revise(candidate, accepted) + elif candidate.kind == "merge_skills": + selected = _select_merge(candidate, accepted) + else: + selected = _select_new(candidate, accepted) + return [_case_payload(candidate, record) for record in selected[:MAX_REPLAY_CASES]] + + +def _select_revise(candidate: SkillLearningCandidate, runs: list[RunRecord]) -> list[RunRecord]: + target = candidate.related_skill_names[0] if candidate.related_skill_names else "" + version = str(candidate.evidence.get("skill_version") or "") + matches = [ + record for record in runs + if any(receipt.skill_name == target and (not version or receipt.skill_version == version) for receipt in record.activated_skills) + ] + return _recent_diverse(matches) + + +def _select_merge(candidate: SkillLearningCandidate, runs: list[RunRecord]) -> list[RunRecord]: + targets = set(candidate.related_skill_names) + matches = [ + record for record in runs + if targets and targets.issubset({receipt.skill_name for receipt in record.activated_skills}) + ] + return _recent_diverse(matches) + + +def _select_new(candidate: SkillLearningCandidate, runs: list[RunRecord]) -> list[RunRecord]: + source_ids = set(candidate.source_run_ids) + if source_ids: + matches = [record for record in runs if record.run_id in source_ids] + else: + theme = str(candidate.evidence.get("theme") or "").lower().strip() + matches = [record for 
record in runs if theme and theme in record.task_text.lower()] + return _recent_diverse(matches) + + +def _case_payload(candidate: SkillLearningCandidate, record: RunRecord) -> dict[str, Any]: + baseline_skill_names = [] + if candidate.kind == "revise_skill": + baseline_skill_names = list(candidate.related_skill_names[:1]) + elif candidate.kind == "merge_skills": + baseline_skill_names = list(candidate.related_skill_names) + return { + "run_id": record.run_id, + "task_id": record.task_id, + "session_id": record.session_id, + "task_text": record.task_text, + "baseline_skill_names": baseline_skill_names, + "candidate_skill_name": candidate.draft_skill_name, + "accepted_score": _score(record), + } + + +def _recent_diverse(runs: list[RunRecord]) -> list[RunRecord]: + sorted_runs = sorted(runs, key=lambda item: (item.started_at, item.run_id), reverse=True) + result: list[RunRecord] = [] + seen_tasks: set[str] = set() + for record in sorted_runs: + task_key = record.task_id or record.task_text + if task_key in seen_tasks and len(sorted_runs) > MAX_REPLAY_CASES: + continue + seen_tasks.add(task_key) + result.append(record) + if len(result) >= MAX_REPLAY_CASES: + break + if len(result) < min(len(sorted_runs), MAX_REPLAY_CASES): + seen_run_ids = {record.run_id for record in result} + result.extend(record for record in sorted_runs if record.run_id not in seen_run_ids) + return result[:MAX_REPLAY_CASES] + + +def _is_accepted(record: RunRecord) -> bool: + feedback = record.feedback or {} + acceptance = feedback.get("acceptance_type") + if acceptance is None and feedback.get("feedback_type") == "satisfied": + acceptance = "accept" + return bool(record.success) and acceptance == "accept" + + +def _score(record: RunRecord) -> float: + validation = record.validation_result or {} + value = validation.get("score") if isinstance(validation, dict) else None + if value is not None: + try: + return max(0.0, min(1.0, float(value))) + except (TypeError, ValueError): + pass + return 0.8 if 
record.success else 0.4 +``` + +Export `select_replay_cases` from `app-instance/backend/beaver/skills/learning/__init__.py`. + +- [ ] **Step 4: Run selector tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_case_selection.py -v +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning/case_selection.py app-instance/backend/beaver/skills/learning/__init__.py app-instance/backend/tests/unit/test_skill_learning_case_selection.py +git commit -m "feat(skill-learning): select replay eval cases" +``` + +--- + +### Task 5: Add Replay Tool Policy And Trace Executor + +**Files:** +- Create: `app-instance/backend/beaver/skills/learning/replay.py` +- Modify: `app-instance/backend/beaver/skills/learning/__init__.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_replay.py` + +- [ ] **Step 1: Write failing replay policy tests** + +Create `app-instance/backend/tests/unit/test_skill_learning_replay.py`: + +```python +from __future__ import annotations + +import asyncio + +from beaver.tools.base import BaseTool, ToolContext, ToolResult, ToolSpec +from beaver.tools.registry.tool_registry import ToolRegistry +from beaver.tools.runtime.executor import ToolExecutor +from beaver.skills.learning.replay import ReplayToolExecutor, ReplayToolPolicy, classify_tool_mode + + +class FakeTool(BaseTool): + def __init__(self, name: str, *, toolset: str = "filesystem", metadata: dict | None = None) -> None: + self._spec = ToolSpec( + name=name, + description=f"{name} tool", + input_schema={"type": "object", "properties": {"path": {"type": "string"}}}, + toolset=toolset, + metadata=metadata or {}, + ) + + @property + def spec(self) -> ToolSpec: + return self._spec + + async def invoke(self, arguments: dict, context: ToolContext) -> ToolResult: + return ToolResult(success=True, content=f"executed:{arguments}", tool_name=self.spec.name) + + +def _executor(*tools: FakeTool) -> 
ReplayToolExecutor: + registry = ToolRegistry() + for tool in tools: + registry.register(tool) + return ReplayToolExecutor(ToolExecutor(registry), registry=registry, policy=ReplayToolPolicy()) + + +def test_classify_tool_modes_from_spec() -> None: + assert classify_tool_mode(FakeTool("read_file").spec) == "executed" + assert classify_tool_mode(FakeTool("write_file").spec) == "executed" + assert classify_tool_mode(FakeTool("mcp_outlook_send_email", toolset="mcp", metadata={"transport": "mcp"}).spec) == "surrogate" + assert classify_tool_mode(FakeTool("delete_account", toolset="mcp", metadata={"transport": "mcp"}).spec) == "blocked" + + +def test_replay_executor_executes_safe_tool_and_records_trace() -> None: + executor = _executor(FakeTool("write_file")) + + result = asyncio.run(executor.execute("write_file", {"path": "a.txt"}, context=ToolContext(workspace="/tmp/replay"))) + + assert result.success is True + assert result.content.startswith("executed:") + assert executor.traces[0]["mode"] == "executed" + assert executor.traces[0]["tool_name"] == "write_file" + + +def test_replay_executor_surrogates_external_write_and_blocks_destructive() -> None: + executor = _executor( + FakeTool("mcp_outlook_send_email", toolset="mcp", metadata={"transport": "mcp"}), + FakeTool("delete_account", toolset="mcp", metadata={"transport": "mcp"}), + ) + + send = asyncio.run(executor.execute("mcp_outlook_send_email", {"to": "ada@example.com"}, context=ToolContext())) + delete = asyncio.run(executor.execute("delete_account", {"id": "1"}, context=ToolContext())) + + assert send.success is True + assert send.error == "replay_surrogate" + assert delete.success is False + assert delete.error == "replay_blocked" + assert [trace["mode"] for trace in executor.traces] == ["surrogate", "blocked"] +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_replay.py -v +``` + +Expected: FAIL because `replay.py` does not 
exist. + +- [ ] **Step 3: Implement replay policy and executor** + +Create `app-instance/backend/beaver/skills/learning/replay.py`: + +```python +"""Replay execution helpers for skill draft evaluation.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Literal +from uuid import uuid4 + +from beaver.tools.base import ToolContext, ToolResult, ToolSpec +from beaver.tools.registry.tool_registry import ToolRegistry +from beaver.tools.runtime.executor import ToolExecutor + +ToolExecutionMode = Literal["executed", "surrogate", "blocked"] + + +@dataclass(slots=True) +class ReplayToolPolicy: + safe_toolsets: set[str] = field(default_factory=lambda: {"filesystem", "user_files", "core", "web", "search"}) + surrogate_transports: set[str] = field(default_factory=lambda: {"mcp", "connector"}) + destructive_terms: tuple[str, ...] = ( + "delete", + "remove", + "destroy", + "revoke", + "permission", + "credential", + "payment", + "pay", + ) + external_write_terms: tuple[str, ...] 
= ( + "send", + "post", + "publish", + "create", + "update", + "invite", + "reply", + "forward", + ) + + +class ReplayToolExecutor: + def __init__( + self, + inner: ToolExecutor, + *, + registry: ToolRegistry, + policy: ReplayToolPolicy | None = None, + ) -> None: + self.inner = inner + self.registry = registry + self.policy = policy or ReplayToolPolicy() + self.traces: list[dict[str, Any]] = [] + + async def execute( + self, + tool_name: str, + arguments: dict[str, Any] | None, + *, + context: ToolContext | None = None, + ) -> ToolResult: + tool = self.registry.get(tool_name) + spec = tool.spec if tool is not None else ToolSpec( + name=tool_name, + description="unregistered tool", + input_schema={"type": "object", "properties": {}}, + toolset="unknown", + ) + mode = classify_tool_mode(spec, self.policy) + trace = { + "trace_id": uuid4().hex, + "tool_name": tool_name, + "mode": mode, + "arguments": dict(arguments or {}), + "schema": dict(spec.input_schema), + "toolset": spec.toolset, + "metadata": dict(spec.metadata), + "classification_reason": _classification_reason(spec, mode), + } + if mode == "executed": + result = await self.inner.execute(tool_name, arguments or {}, context=context) + trace["result"] = {"success": result.success, "error": result.error, "content": result.content[:2000]} + self.traces.append(trace) + return result + if mode == "surrogate": + trace["result"] = {"success": True, "error": "replay_surrogate", "content": "Tool call recorded for surrogate evaluation."} + self.traces.append(trace) + return ToolResult( + success=True, + content="Tool call recorded for surrogate evaluation.", + tool_name=tool_name, + error="replay_surrogate", + raw_output=trace, + ) + trace["result"] = {"success": False, "error": "replay_blocked", "content": "Tool call blocked by replay policy."} + self.traces.append(trace) + return ToolResult( + success=False, + content="Tool call blocked by replay policy.", + tool_name=tool_name, + error="replay_blocked", + 
raw_output=trace, + ) + + async def execute_tool_call(self, tool_call: Any, *, context: ToolContext | None = None) -> ToolResult: + tool_name, arguments = ToolExecutor._normalize_tool_call(tool_call) + return await self.execute(tool_name, arguments, context=context) + + +def classify_tool_mode(spec: ToolSpec, policy: ReplayToolPolicy | None = None) -> ToolExecutionMode: + policy = policy or ReplayToolPolicy() + name = spec.name.lower() + toolset = spec.toolset.lower() + metadata = {str(key).lower(): str(value).lower() for key, value in spec.metadata.items()} + if any(term in name for term in policy.destructive_terms): + return "blocked" + if toolset in policy.safe_toolsets: + return "executed" + if metadata.get("transport") in policy.surrogate_transports or toolset in {"mcp", "connector", "external"}: + if any(term in name for term in policy.external_write_terms): + return "surrogate" + return "executed" + return "surrogate" + + +def _classification_reason(spec: ToolSpec, mode: ToolExecutionMode) -> str: + return f"{spec.name} classified as {mode} from toolset={spec.toolset} metadata={spec.metadata}" +``` + +Export `ReplayToolExecutor`, `ReplayToolPolicy`, and `classify_tool_mode` from `app-instance/backend/beaver/skills/learning/__init__.py`. + +- [ ] **Step 4: Run replay policy tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_replay.py -v +``` + +Expected: PASS. 
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app-instance/backend/beaver/skills/learning/replay.py app-instance/backend/beaver/skills/learning/__init__.py app-instance/backend/tests/unit/test_skill_learning_replay.py
+git commit -m "feat(skill-learning): add replay tool policy"
+```
+
+---
+
+### Task 6: Add AgentLoop Replay Executor Injection
+
+**Files:**
+- Modify: `app-instance/backend/beaver/engine/loop.py`
+- Test: `app-instance/backend/tests/unit/test_agent_loop_replay_executor.py`
+
+- [ ] **Step 1: Write failing replay injection test**
+
+Create `app-instance/backend/tests/unit/test_agent_loop_replay_executor.py`:
+
+```python
+"""Check that AgentLoop sends tool calls through an injected replay executor."""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+from types import SimpleNamespace
+
+from beaver.engine.loop import AgentLoop
+from beaver.engine.providers.base import LLMProvider, LLMResponse, ToolCallRequest
+from beaver.engine.providers.factory import ProviderBundle
+from beaver.skills.learning.replay import ReplayToolExecutor, ReplayToolPolicy
+
+
+class ToolCallingProvider(LLMProvider):
+    """Stub provider: asks for one `read_file` call, then answers "done"."""
+
+    def __init__(self) -> None:
+        self.calls = 0
+
+    async def chat(self, messages: list[dict], tools: list[dict] | None = None, model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7) -> LLMResponse:
+        self.calls += 1
+        if self.calls == 1:
+            return LLMResponse(
+                content="",
+                tool_calls=[
+                    ToolCallRequest(
+                        id="call-1",
+                        name="read_file",
+                        arguments={"path": "README.md"},
+                    )
+                ],
+            )
+        return LLMResponse(content="done")
+
+    def get_default_model(self) -> str:
+        return "stub"
+
+
+def test_process_direct_uses_replay_tool_executor(tmp_path: Path) -> None:
+    loop = AgentLoop(workspace=tmp_path)
+    loaded = loop.boot()
+    provider = ToolCallingProvider()
+    runtime = SimpleNamespace(model="stub", provider_name="stub")
+    replay_executor = ReplayToolExecutor(
+        loaded.tool_executor,
+        registry=loaded.tool_registry,
+        policy=ReplayToolPolicy(),
+    )
+
+    # Drive the coroutine with asyncio.run, like the other async tests in this
+    # plan, so the test does not depend on the pytest-asyncio plugin.
+    result = asyncio.run(
+        loop.process_direct(
+            "Read the README.",
+            provider_bundle=ProviderBundle(main_runtime=runtime, main_provider=provider),  # type: ignore[arg-type]
+            include_skill_assembly=False,
+            pinned_skill_names=[],
+            tool_executor_override=replay_executor,
+            max_tool_iterations=2,
+            source="skill_replay_eval",
+        )
+    )
+
+    assert result.output_text == "done"
+    assert replay_executor.traces
+    assert replay_executor.traces[0]["tool_name"] == "read_file"
+```
+
+- [ ] **Step 2: Run test to verify failure**
+
+Run:
+
+```bash
+cd app-instance/backend
+pytest tests/unit/test_agent_loop_replay_executor.py -v
+```
+
+Expected: FAIL because `process_direct()` does not accept `tool_executor_override`.
+
+- [ ] **Step 3: Add optional executor override to AgentLoop**
+
+In `app-instance/backend/beaver/engine/loop.py`, add `tool_executor_override: Any = None` to `process_direct()` and `_process_direct_impl()` keyword parameters. Pass the argument from `process_direct()` into `_process_direct_impl()`.
+
+After `tool_executor = self._require_loaded("tool_executor")`, add:
+
+```python
+        # Compare against None instead of relying on truthiness, so an override
+        # object that happens to be falsy is still honored.
+        effective_tool_executor = tool_executor if tool_executor_override is None else tool_executor_override
+```
+
+Replace this line inside the tool loop:
+
+```python
+                result = await tool_executor.execute_tool_call(tool_call, context=tool_context)
+```
+
+with:
+
+```python
+                result = await effective_tool_executor.execute_tool_call(tool_call, context=tool_context)
+```
+
+- [ ] **Step 4: Run replay injection test**
+
+Run:
+
+```bash
+cd app-instance/backend
+pytest tests/unit/test_agent_loop_replay_executor.py -v
+```
+
+Expected: PASS.
+
+- [ ] **Step 5: Run a direct-loop regression test**
+
+Run:
+
+```bash
+cd app-instance/backend
+pytest tests/unit/test_skill_learning_eval.py::test_eval_provider_unavailable_is_skipped_not_failed -v
+```
+
+Expected: PASS.
+ +- [ ] **Step 6: Commit** + +```bash +git add app-instance/backend/beaver/engine/loop.py app-instance/backend/tests/unit/test_agent_loop_replay_executor.py +git commit -m "feat(engine): allow replay tool executor injection" +``` + +--- + +### Task 7: Add Surrogate Evaluator + +**Files:** +- Create: `app-instance/backend/beaver/skills/learning/surrogate.py` +- Modify: `app-instance/backend/beaver/skills/learning/__init__.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_surrogate.py` + +- [ ] **Step 1: Write failing surrogate tests** + +Create `app-instance/backend/tests/unit/test_skill_learning_surrogate.py`: + +```python +from __future__ import annotations + +import asyncio + +from beaver.skills.learning.surrogate import SurrogateToolEvaluator + + +def test_surrogate_scores_complete_candidate_higher_than_missing_baseline() -> None: + evaluator = SurrogateToolEvaluator() + baseline = { + "arm": "baseline", + "tool_calls": [ + {"tool_name": "mcp_outlook_send_email", "mode": "surrogate", "arguments": {"to": "", "subject": ""}}, + ], + } + candidate = { + "arm": "candidate", + "tool_calls": [ + { + "tool_name": "mcp_outlook_send_email", + "mode": "surrogate", + "arguments": {"to": "ada@example.com", "subject": "Status", "body": "Done"}, + }, + ], + } + + result = asyncio.run(evaluator.evaluate(task_text="Send a status email to Ada.", baseline=baseline, candidate=candidate)) + + assert result["candidate_score"] > result["baseline_score"] + assert result["surrogate_tool_count"] == 2 + assert result["confidence"] in {"low", "medium"} +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_surrogate.py -v +``` + +Expected: FAIL because `surrogate.py` does not exist. 
+
+- [ ] **Step 3: Implement deterministic surrogate v1**
+
+Create `app-instance/backend/beaver/skills/learning/surrogate.py`:
+
+```python
+"""Surrogate evaluation for replay tool calls that cannot execute safely."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# Task words shorter than this are ignored when checking relevance: one- and
+# two-letter words ("a", "to") are substrings of almost any argument text.
+_MIN_TASK_TOKEN_LENGTH = 3
+# Only the first few meaningful task words are compared against the arguments.
+_MAX_TASK_TOKENS = 16
+# Word characters only, so trailing punctuation ("Ada.") does not block a match.
+_TASK_TOKEN_PATTERN = re.compile(r"\w+")
+
+
+class SurrogateToolEvaluator:
+    """Deterministic v1 scorer for replay arms; it never calls a model or a tool."""
+
+    async def evaluate(self, *, task_text: str, baseline: dict[str, Any], candidate: dict[str, Any]) -> dict[str, Any]:
+        """Score both arms and summarize how much of the evidence was surrogate or blocked.
+
+        Args:
+            task_text: The task both arms were asked to perform.
+            baseline: Arm report whose ``tool_calls`` entries carry ``mode``,
+                ``arguments`` and, for executed calls, ``result``.
+            candidate: Arm report with the same shape as ``baseline``.
+
+        Returns:
+            A dict with ``baseline_score``, ``candidate_score``, ``delta``,
+            ``surrogate_tool_count``, ``blocked_tool_count``, ``confidence``
+            ("low" or "medium") and ``notes``.
+        """
+        baseline_score = _score_arm(task_text, baseline)
+        candidate_score = _score_arm(task_text, candidate)
+        surrogate_count = _mode_count(baseline, "surrogate") + _mode_count(candidate, "surrogate")
+        blocked_count = _mode_count(baseline, "blocked") + _mode_count(candidate, "blocked")
+        # Any blocked call, or more than two surrogate calls, leaves too little
+        # real evidence to claim more than low confidence.
+        confidence = "low" if blocked_count else ("medium" if surrogate_count <= 2 else "low")
+        return {
+            "baseline_score": baseline_score,
+            "candidate_score": candidate_score,
+            "delta": round(candidate_score - baseline_score, 4),
+            "surrogate_tool_count": surrogate_count,
+            "blocked_tool_count": blocked_count,
+            "confidence": confidence,
+            "notes": [
+                "Surrogate score is based on intended tool calls, schemas, arguments, and task relevance.",
+            ],
+        }
+
+
+def _score_arm(task_text: str, arm: dict[str, Any]) -> float:
+    """Return the mean call score of an arm, or 0.5 when it made no tool calls."""
+    calls = [item for item in arm.get("tool_calls") or [] if isinstance(item, dict)]
+    if not calls:
+        return 0.5
+    scores = [_score_call(task_text, call) for call in calls]
+    return round(sum(scores) / len(scores), 4)
+
+
+def _score_call(task_text: str, call: dict[str, Any]) -> float:
+    """Score one traced tool call on a 0-1 scale.
+
+    Blocked calls score 0.2. Executed calls score 0.85, or 0.35 when the
+    recorded result reports ``success`` as False. Surrogate calls start at 0.5
+    and gain up to 0.3 for argument completeness plus 0.15 when a task word
+    appears in the argument values, capped at 0.9.
+    """
+    mode = call.get("mode")
+    if mode == "blocked":
+        return 0.2
+    if mode == "executed":
+        result = call.get("result") if isinstance(call.get("result"), dict) else {}
+        return 0.85 if result.get("success") is not False else 0.35
+    arguments = call.get("arguments")
+    # Missing, empty, or malformed (non-dict) arguments all score as "no arguments".
+    if not isinstance(arguments, dict) or not arguments:
+        return 0.45
+    filled = [value for value in arguments.values() if _has_value(value)]
+    completeness = len(filled) / len(arguments)
+    argument_text = " ".join(str(value).lower() for value in filled)
+    task_tokens = [
+        token
+        for token in _TASK_TOKEN_PATTERN.findall(task_text.lower())
+        if len(token) >= _MIN_TASK_TOKEN_LENGTH
+    ][:_MAX_TASK_TOKENS]
+    relevance = 0.15 if any(token in argument_text for token in task_tokens) else 0.0
+    return round(min(0.9, 0.5 + 0.3 * completeness + relevance), 4)
+
+
+def _has_value(value: Any) -> bool:
+    """Return True when an argument value carries content.
+
+    ``None``, blank strings and empty containers count as missing. Other
+    values, including ``0`` and ``False``, count as provided.
+    """
+    if value is None:
+        return False
+    if isinstance(value, str):
+        return bool(value.strip())
+    if isinstance(value, (list, tuple, set, frozenset, dict)):
+        return bool(value)
+    return True
+
+
+def _mode_count(arm: dict[str, Any], mode: str) -> int:
+    """Count the arm's traced tool calls whose ``mode`` equals ``mode``."""
+    return sum(1 for item in arm.get("tool_calls") or [] if isinstance(item, dict) and item.get("mode") == mode)
+```
+
+Export `SurrogateToolEvaluator` from `__init__.py`.
+
+- [ ] **Step 4: Run surrogate tests**
+
+Run:
+
+```bash
+cd app-instance/backend
+pytest tests/unit/test_skill_learning_surrogate.py -v
+```
+
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app-instance/backend/beaver/skills/learning/surrogate.py app-instance/backend/beaver/skills/learning/__init__.py app-instance/backend/tests/unit/test_skill_learning_surrogate.py
+git commit -m "feat(skill-learning): add surrogate tool evaluator"
+```
+
+---
+
+### Task 8: Add Replay Arm Runner
+
+**Files:**
+- Modify: `app-instance/backend/beaver/skills/learning/replay.py`
+- Test: `app-instance/backend/tests/unit/test_skill_learning_replay_runner.py`
+
+- [ ] **Step 1: Write failing arm runner test**
+
+Create `app-instance/backend/tests/unit/test_skill_learning_replay_runner.py`:
+
+```python
+from __future__ import annotations
+
+import asyncio
+from types import SimpleNamespace
+
+from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
+
+
+class FakeAgentLoop:
+    def boot(self):
+        return SimpleNamespace(tool_executor=SimpleNamespace(), tool_registry=SimpleNamespace(get=lambda name: None))
+
+    async def process_direct(self, task: str, **kwargs):
+        executor = kwargs["tool_executor_override"]
+        await executor.execute("mcp_outlook_send_email", {"to": "ada@example.com"})
+        return SimpleNamespace(session_id="session-replay", run_id="run-replay", output_text="done", finish_reason="stop")
+
+
+def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
+    runner =
ReplayRunner(agent_loop=FakeAgentLoop()) + request = ReplayArmRequest( + case_id="case-1", + arm="candidate", + task_text="Send a status email to Ada.", + pinned_skill_names=[], + pinned_skill_contexts=[], + provider_bundle=object(), + model_settings={"max_tool_iterations": 2}, + ) + + report = asyncio.run(runner.run_arm(request)) + + assert report["case_id"] == "case-1" + assert report["arm"] == "candidate" + assert report["finish_reason"] == "stop" + assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email" +``` + +- [ ] **Step 2: Run test to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_replay_runner.py -v +``` + +Expected: FAIL because `ReplayRunner` and `ReplayArmRequest` are not implemented. + +- [ ] **Step 3: Add arm request and runner** + +Append to `app-instance/backend/beaver/skills/learning/replay.py`: + +```python +@dataclass(slots=True) +class ReplayArmRequest: + case_id: str + arm: str + task_text: str + pinned_skill_names: list[str] = field(default_factory=list) + pinned_skill_contexts: list[Any] = field(default_factory=list) + provider_bundle: Any | None = None + model_settings: dict[str, Any] = field(default_factory=dict) + + +class ReplayRunner: + def __init__(self, *, agent_loop: Any, policy: ReplayToolPolicy | None = None) -> None: + self.agent_loop = agent_loop + self.policy = policy or ReplayToolPolicy() + + async def run_arm(self, request: ReplayArmRequest) -> dict[str, Any]: + loaded = self.agent_loop.boot() + replay_executor = ReplayToolExecutor( + loaded.tool_executor, + registry=loaded.tool_registry, + policy=self.policy, + ) + result = await self.agent_loop.process_direct( + request.task_text, + provider_bundle=request.provider_bundle, + include_skill_assembly=False, + include_tools=True, + pinned_skill_names=request.pinned_skill_names, + pinned_skill_contexts=request.pinned_skill_contexts, + max_tool_iterations=int(request.model_settings.get("max_tool_iterations") or 
4), + temperature=float(request.model_settings.get("temperature") or 0.0), + source="skill_replay_eval", + tool_executor_override=replay_executor, + ) + return { + "case_id": request.case_id, + "arm": request.arm, + "session_id": result.session_id, + "run_id": result.run_id, + "task_text": request.task_text, + "finish_reason": result.finish_reason, + "final_answer": result.output_text, + "tool_calls": list(replay_executor.traces), + "artifacts": [], + "side_effects": _side_effects_from_traces(replay_executor.traces), + } + + +def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]: + effects: list[dict[str, Any]] = [] + for trace in traces: + if trace.get("mode") in {"surrogate", "blocked"}: + effects.append( + { + "tool_name": trace.get("tool_name"), + "mode": trace.get("mode"), + "arguments": trace.get("arguments"), + "classification_reason": trace.get("classification_reason"), + } + ) + return effects +``` + +Export `ReplayRunner` and `ReplayArmRequest` from `__init__.py`. + +- [ ] **Step 4: Run arm runner test** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_replay_runner.py -v +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning/replay.py app-instance/backend/beaver/skills/learning/__init__.py app-instance/backend/tests/unit/test_skill_learning_replay_runner.py +git commit -m "feat(skill-learning): run replay arms through agent loop" +``` + +--- + +### Task 9: Orchestrate Replay Eval Reports + +**Files:** +- Modify: `app-instance/backend/beaver/skills/learning/eval.py` +- Modify: `app-instance/backend/beaver/skills/learning/pipeline.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_eval.py` + +- [ ] **Step 1: Add failing replay eval test** + +Append to `app-instance/backend/tests/unit/test_skill_learning_eval.py`: + +```python +class FakeReplayRunner: + async def run_arm(self, request): + return { + "case_id": request.case_id, + "arm": request.arm, + "session_id": "session-replay", + "run_id": f"{request.arm}-run", + "task_text": request.task_text, + "finish_reason": "stop", + "final_answer": "done", + "tool_calls": [ + { + "tool_name": "write_file", + "mode": "executed", + "arguments": {"path": "README.md"}, + "result": {"success": True, "content": "ok"}, + } + ], + "artifacts": [], + "side_effects": [], + } + + +def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="release-checklist", + proposed_content="# Release\n\nRun tests.", + proposed_frontmatter={"description": "release", "tools": []}, + created_by="test", + reason="test", + ) + pipeline.learning_store.update_learning_candidate( + "candidate-1", + draft_skill_name=draft.skill_name, + draft_id=draft.draft_id, + ) + + report = asyncio.run( + pipeline.evaluate_draft( + "candidate-1", + draft.skill_name, + draft.draft_id, + provider_bundle=_bundle(), + replay_runner=FakeReplayRunner(), + ) + ) + + assert report.mode == "replay" + assert report.eval_version == "replay-v1" + assert report.case_reports + 
assert 0.0 <= report.execution_coverage <= 1.0 + assert 0.0 <= report.surrogate_coverage <= 1.0 + assert report.confidence in {"low", "medium", "high"} +``` + +- [ ] **Step 2: Run the new test to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_eval.py::test_eval_report_includes_replay_case_and_coverage -v +``` + +Expected: FAIL because evaluator still returns `mode="heuristic"`. + +- [ ] **Step 3: Update evaluator imports** + +In `app-instance/backend/beaver/skills/learning/eval.py`, import: + +```python +from beaver.engine.context.builder import SkillContext +from beaver.skills.learning.case_selection import select_replay_cases +from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner +from beaver.skills.learning.surrogate import SurrogateToolEvaluator +from beaver.skills.learning.preservation import check_preservation +``` + +- [ ] **Step 4: Add optional replay runner dependency** + +Update `SkillDraftEvaluator.__init__()`: + +```python + def __init__( + self, + run_store: RunMemoryStore, + *, + surrogate_evaluator: SurrogateToolEvaluator | None = None, + ) -> None: + self.run_store = run_store + self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator() +``` + +Update `evaluate()` to accept the runner explicitly: + +```python + async def evaluate( + self, + candidate: SkillLearningCandidate, + draft: SkillDraft, + *, + provider_bundle: ProviderBundle | None = None, + replay_runner: ReplayRunner | None = None, + ) -> SkillDraftEvalReport: +``` + +Keep the existing provider unavailable branch. If `provider_bundle` is absent, return the current skipped-provider report. If `replay_runner` is absent, use the extracted heuristic evaluator so tests and workers without a loop do not crash. 
+ +- [ ] **Step 5: Pass replay runner through pipeline** + +In `app-instance/backend/beaver/skills/learning/pipeline.py`, update `evaluate_draft()`: + +```python + async def evaluate_draft( + self, + candidate_id: str, + skill_name: str, + draft_id: str, + *, + provider_bundle: ProviderBundle | None = None, + replay_runner: ReplayRunner | None = None, + ) -> SkillDraftEvalReport: +``` + +Pass `replay_runner=replay_runner` to `evaluator.evaluate(...)`. + +- [ ] **Step 6: Build replay cases in evaluate()** + +Inside `evaluate()`, replace the heuristic-only case loop with: + +```python + runs = self.run_store.list_runs() + replay_cases = select_replay_cases(candidate, runs) + if replay_runner is not None and replay_cases: + return await self._evaluate_replay( + candidate=candidate, + draft=draft, + replay_cases=replay_cases, + provider_bundle=provider_bundle, + replay_runner=replay_runner, + ) +``` + +Keep the existing heuristic body as `_evaluate_heuristic()` and call it when no replay cases exist. 
+ +- [ ] **Step 7: Add `_evaluate_replay()`** + +Add method: + +```python + async def _evaluate_replay( + self, + *, + candidate: SkillLearningCandidate, + draft: SkillDraft, + replay_cases: list[dict], + provider_bundle: ProviderBundle, + replay_runner: ReplayRunner, + ) -> SkillDraftEvalReport: + case_reports: list[dict] = [] + legacy_cases: list[dict] = [] + for case in replay_cases: + baseline = await replay_runner.run_arm( + ReplayArmRequest( + case_id=f"{case['run_id']}:baseline", + arm="baseline", + task_text=str(case["task_text"]), + pinned_skill_names=list(case.get("baseline_skill_names") or []), + pinned_skill_contexts=[], + provider_bundle=provider_bundle, + model_settings={"max_tool_iterations": 4, "temperature": 0.0}, + ) + ) + candidate_arm = await replay_runner.run_arm( + ReplayArmRequest( + case_id=f"{case['run_id']}:candidate", + arm="candidate", + task_text=str(case["task_text"]), + pinned_skill_names=[], + pinned_skill_contexts=[_draft_skill_context(draft)], + provider_bundle=provider_bundle, + model_settings={"max_tool_iterations": 4, "temperature": 0.0}, + ) + ) + surrogate = await self.surrogate_evaluator.evaluate( + task_text=str(case["task_text"]), + baseline=baseline, + candidate=candidate_arm, + ) + baseline_score = surrogate["baseline_score"] + candidate_score = surrogate["candidate_score"] + case_report = { + "run_id": case["run_id"], + "task_id": case.get("task_id"), + "session_id": case.get("session_id"), + "baseline": baseline, + "candidate": candidate_arm, + "baseline_score": baseline_score, + "candidate_score": candidate_score, + "delta": round(candidate_score - baseline_score, 4), + "confidence": surrogate["confidence"], + "validator_notes": list(surrogate.get("notes") or []), + } + case_reports.append(case_report) + legacy_cases.append( + { + "run_id": case["run_id"], + "session_id": case.get("session_id") or "", + "baseline_score": baseline_score, + "candidate_score": candidate_score, + "delta": round(candidate_score - 
baseline_score, 4), + } + ) + preservation_report = _preservation_report(candidate, draft) + return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report) +``` + +- [ ] **Step 8: Add helper functions** + +Add module-level helpers in `eval.py`: + +```python +def _draft_skill_context(draft: SkillDraft) -> SkillContext: + tool_hints = draft.proposed_frontmatter.get("tools") + return SkillContext( + name=f"draft:{draft.skill_name}", + content=draft.proposed_content, + version=draft.draft_id, + content_hash="draft", + activation_reason="skill_replay_eval_candidate", + tool_hints=[str(item) for item in tool_hints if str(item).strip()] if isinstance(tool_hints, list) else [], + ) + + +def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None: + if candidate.kind not in {"revise_skill", "merge_skills"}: + return None + base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else "" + if not base_content.strip(): + return None + return check_preservation(base_content=base_content, draft_content=draft.proposed_content) + + +def _report_from_case_reports( + candidate: SkillLearningCandidate, + draft: SkillDraft, + case_reports: list[dict], + legacy_cases: list[dict], + preservation_report: dict | None, +) -> SkillDraftEvalReport: + baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases) + candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases) + regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]] + improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]] + unchanged = len(legacy_cases) - len(regressions) - len(improved) + execution, surrogate, blocked = _coverage(case_reports) + confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports]) + score_delta = 
candidate_avg - baseline_avg + passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0 + return SkillDraftEvalReport( + report_id=uuid4().hex, + skill_name=draft.skill_name, + draft_id=draft.draft_id, + candidate_id=candidate.candidate_id, + passed=passed, + baseline_score_avg=round(baseline_avg, 4), + candidate_score_avg=round(candidate_avg, 4), + score_delta=round(score_delta, 4), + regression_count=len(regressions), + improved_count=len(improved), + unchanged_count=unchanged, + cases=legacy_cases, + status="completed", + created_at=_utc_now(), + eval_version="replay-v1", + mode="replay", + execution_coverage=execution, + surrogate_coverage=surrogate, + blocked_coverage=blocked, + confidence=confidence, + case_reports=case_reports, + tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked}, + preservation_report=preservation_report, + ) +``` + +Add `_coverage()` and `_confidence()`: + +```python +def _coverage(case_reports: list[dict]) -> tuple[float, float, float]: + counts = {"executed": 0, "surrogate": 0, "blocked": 0} + for report in case_reports: + for arm_name in ("baseline", "candidate"): + arm = report.get(arm_name) if isinstance(report.get(arm_name), dict) else {} + for call in arm.get("tool_calls") or []: + if isinstance(call, dict) and call.get("mode") in counts: + counts[str(call["mode"])] += 1 + total = sum(counts.values()) + if total == 0: + return 1.0, 0.0, 0.0 + return ( + round(counts["executed"] / total, 4), + round(counts["surrogate"] / total, 4), + round(counts["blocked"] / total, 4), + ) + + +def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str: + if blocked > 0.0: + return "low" + if execution >= 0.75 and surrogate <= 0.25: + return "high" + if execution >= 0.25 or "medium" in case_confidences: + return "medium" + return "low" +``` + +- [ ] **Step 9: Run eval test** + +Run: + +```bash +cd app-instance/backend +pytest 
tests/unit/test_skill_learning_eval.py::test_eval_report_includes_replay_case_and_coverage -v +``` + +Expected: PASS. + +- [ ] **Step 10: Run all skill learning backend tests touched so far** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_eval_report_model.py tests/unit/test_skill_learning_case_selection.py tests/unit/test_skill_learning_preservation.py tests/unit/test_skill_learning_replay.py tests/unit/test_skill_learning_replay_runner.py tests/unit/test_skill_learning_surrogate.py tests/unit/test_skill_learning_eval.py -v +``` + +Expected: PASS. + +- [ ] **Step 11: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning/eval.py app-instance/backend/beaver/skills/learning/pipeline.py app-instance/backend/tests/unit/test_skill_learning_eval.py +git commit -m "feat(skill-learning): produce replay eval reports" +``` + +--- + +### Task 10: Apply Preservation Report During Eval And Publish Gates + +**Files:** +- Modify: `app-instance/backend/beaver/skills/learning/eval.py` +- Modify: `app-instance/backend/beaver/skills/learning/pipeline.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_pipeline.py` + +- [ ] **Step 1: Write failing publish gate test** + +Append to `app-instance/backend/tests/unit/test_skill_learning_pipeline.py`: + +```python +def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="low-confidence", + proposed_content="# Low\n\nDo it.", + proposed_frontmatter={"description": "low", "tools": ["mcp_outlook_send_email"]}, + created_by="test", + reason="test", + ) + pipeline.learning_store.write_eval_report( + SkillDraftEvalReport( + report_id="eval-low", + skill_name=draft.skill_name, + draft_id=draft.draft_id, + candidate_id="candidate-1", + passed=True, + baseline_score_avg=0.7, + candidate_score_avg=0.9, + score_delta=0.2, + regression_count=0, + 
improved_count=1, + unchanged_count=0, + confidence="low", + mode="replay", + eval_version="replay-v1", + execution_coverage=0.0, + surrogate_coverage=1.0, + blocked_coverage=0.0, + ) + ) + pipeline.check_safety(draft.skill_name, draft.draft_id) + pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester") + pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester") + + with pytest.raises(ValueError, match="low confidence"): + pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester") +``` + +Add import if missing: + +```python +from beaver.memory.skills import SkillDraftEvalReport +``` + +- [ ] **Step 2: Run test to verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_pipeline.py::test_publish_blocks_low_confidence_replay_report -v +``` + +Expected: FAIL because publish gate does not check confidence. + +- [ ] **Step 3: Update publish gates** + +In `SkillLearningPipelineService._validate_publish_gates()`, after the existing eval pass check, add: + +```python + if eval_report is not None and eval_report.mode == "replay": + if eval_report.confidence == "low": + raise ValueError("Draft replay eval has low confidence and requires revision before publish") + if eval_report.blocked_coverage >= 1.0: + raise ValueError("Draft replay eval blocked all important tool calls") + preservation = eval_report.preservation_report or {} + if preservation.get("passed") is False: + raise ValueError("Draft preservation check did not pass") +``` + +- [ ] **Step 4: Add preservation gate test** + +Append to `app-instance/backend/tests/unit/test_skill_learning_pipeline.py`: + +```python +def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="dropped-section", + proposed_content="# Skill\n\n## Workflow\n\nDo it.", + proposed_frontmatter={"description": "dropped", "tools": []}, + 
created_by="test", + reason="test", + ) + pipeline.learning_store.write_eval_report( + SkillDraftEvalReport( + report_id="eval-preservation", + skill_name=draft.skill_name, + draft_id=draft.draft_id, + candidate_id="candidate-1", + passed=True, + baseline_score_avg=0.7, + candidate_score_avg=0.9, + score_delta=0.2, + regression_count=0, + improved_count=1, + unchanged_count=0, + confidence="medium", + mode="replay", + eval_version="replay-v1", + preservation_report={"passed": False, "risk_level": "high", "dropped_sections": ["Safety"]}, + ) + ) + pipeline.check_safety(draft.skill_name, draft.draft_id) + pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester") + pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester") + + with pytest.raises(ValueError, match="preservation"): + pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester") +``` + +- [ ] **Step 5: Run pipeline tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_pipeline.py::test_publish_blocks_low_confidence_replay_report tests/unit/test_skill_learning_pipeline.py::test_publish_blocks_failed_preservation_report -v +``` + +Expected: PASS. + +- [ ] **Step 6: Run pipeline suite** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_pipeline.py tests/unit/test_skill_learning_eval.py -v +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning/eval.py app-instance/backend/beaver/skills/learning/pipeline.py app-instance/backend/tests/unit/test_skill_learning_pipeline.py +git commit -m "feat(skill-learning): gate publish on replay confidence" +``` + +--- + +### Task 11: Update Web API Types And Skills UI + +**Files:** +- Modify: `app-instance/backend/beaver/interfaces/web/app.py` +- Modify: `app-instance/frontend/types/index.ts` +- Modify: `app-instance/frontend/app/(app)/skills/page.tsx` + +- [ ] **Step 1: Inject replay runner in Web API draft evaluation** + +In `app-instance/backend/beaver/interfaces/web/app.py`, import: + +```python +from beaver.skills.learning.replay import ReplayRunner +``` + +In both `synthesize_skill_draft()` and `regenerate_skill_draft()`, keep the loop object instead of only the loaded result: + +```python + loop = agent_service.create_loop() + loaded = loop.boot() +``` + +Update each `evaluate_draft(...)` call: + +```python + await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr] + candidate_id, + draft.skill_name, + draft.draft_id, + provider_bundle=provider_bundle, + replay_runner=ReplayRunner(agent_loop=loop), + ) +``` + +This makes the normal UI path use real replay arms through the isolated `skill_replay_eval` source. 
+ +- [ ] **Step 2: Update TypeScript types** + +In `app-instance/frontend/types/index.ts`, extend `SkillDraftEvalReport`: + +```ts +export interface SkillDraftEvalReport { + report_id: string; + skill_name: string; + draft_id: string; + candidate_id: string; + passed: boolean; + baseline_score_avg: number; + candidate_score_avg: number; + score_delta: number; + regression_count: number; + improved_count: number; + unchanged_count: number; + cases: Array<Record<string, unknown>>; + status: string; + created_at: string; + eval_version?: string; + mode?: 'heuristic' | 'replay' | string; + execution_coverage?: number; + surrogate_coverage?: number; + blocked_coverage?: number; + confidence?: 'low' | 'medium' | 'high' | string; + case_reports?: Array<Record<string, unknown>>; + tool_mode_summary?: Record<string, number>; + preservation_report?: Record<string, unknown> | null; +} +``` + +- [ ] **Step 3: Add UI metric tiles** + +In `EvalReportPanel` in `app-instance/frontend/app/(app)/skills/page.tsx`, add metric tiles after Delta: + +```tsx + {/* TODO(restore): execution coverage tile showing formatPercent(report.execution_coverage); the original element markup was stripped from this copy of the plan */} + {/* TODO(restore): surrogate coverage tile showing formatPercent(report.surrogate_coverage) */} + {/* TODO(restore): confidence tile showing report.confidence */} +``` + +Add helper near `formatScore`: + +```tsx +function formatPercent(value?: number | null): string { + if (typeof value !== 'number' || Number.isNaN(value)) return '0%'; + return `${Math.round(value * 100)}%`; +} +``` + +- [ ] **Step 4: Add replay details sections** + +Inside `EvalReportPanel`, after existing replay cases table, add: + +```tsx + {Array.isArray(report.case_reports) && report.case_reports.length > 0 ? ( + {/* TODO(restore): replay case details element rendering report.case_reports; the original element markup was stripped from this copy of the plan */} + ) : null} + {report.preservation_report ? ( + {/* TODO(restore): preservation details element rendering report.preservation_report */} + ) : null} +``` + +- [ ] **Step 5: Run backend web API test** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_web_api.py -v +``` + +Expected: PASS. + +- [ ] **Step 6: Run frontend type check or lint** + +Run: + +```bash +cd app-instance/frontend +npm run lint +``` + +Expected: PASS. If this repo does not define `lint`, run: + +```bash +cd app-instance/frontend +npm test -- --runInBand +``` + +Expected: PASS or existing unrelated failures documented in the final implementation report.
+ +- [ ] **Step 7: Commit** + +```bash +git add app-instance/backend/beaver/interfaces/web/app.py app-instance/frontend/types/index.ts app-instance/frontend/app/\(app\)/skills/page.tsx +git commit -m "feat(skills-ui): show replay eval coverage" +``` + +--- + +### Task 12: Final Verification + +**Files:** +- No new source files. + +- [ ] **Step 1: Run backend skill learning tests** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_eval_report_model.py tests/unit/test_skill_learning_case_selection.py tests/unit/test_skill_learning_preservation.py tests/unit/test_skill_learning_replay.py tests/unit/test_skill_learning_replay_runner.py tests/unit/test_agent_loop_replay_executor.py tests/unit/test_skill_learning_surrogate.py tests/unit/test_skill_learning_eval.py tests/unit/test_skill_learning_pipeline.py tests/unit/test_skill_learning_worker.py tests/unit/test_skill_learning_web_api.py -v +``` + +Expected: PASS. + +- [ ] **Step 2: Run frontend verification** + +Run: + +```bash +cd app-instance/frontend +npm run lint +``` + +Expected: PASS. If unavailable, run the closest existing frontend test command and record the exact command. + +- [ ] **Step 3: Inspect git status** + +Run: + +```bash +git status --short +``` + +Expected: only intentional implementation changes are present; unrelated user changes remain untouched. + +- [ ] **Step 4: Commit any final fixes** + +If verification required fixes: + +```bash +git add <files-changed-by-the-fixes> +git commit -m "fix(skill-learning): stabilize replay eval" +``` + +Expected: commit succeeds.