feat(skill-learning): produce replay eval reports
This commit is contained in:
@ -4,17 +4,28 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from beaver.engine.context import SkillContext
|
||||||
from beaver.engine.providers import ProviderBundle
|
from beaver.engine.providers import ProviderBundle
|
||||||
from beaver.memory.runs import RunMemoryStore
|
from beaver.memory.runs import RunMemoryStore
|
||||||
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
|
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
|
||||||
|
from beaver.skills.learning.case_selection import select_replay_cases
|
||||||
|
from beaver.skills.learning.preservation import check_preservation
|
||||||
|
from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
|
||||||
|
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
|
||||||
from beaver.skills.specs import SkillDraft
|
from beaver.skills.specs import SkillDraft
|
||||||
|
|
||||||
|
|
||||||
class SkillDraftEvaluator:
|
class SkillDraftEvaluator:
|
||||||
"""Builds a bounded eval report without writing user-visible sessions."""
|
"""Builds a bounded eval report without writing user-visible sessions."""
|
||||||
|
|
||||||
def __init__(self, run_store: RunMemoryStore) -> None:
|
def __init__(
|
||||||
|
self,
|
||||||
|
run_store: RunMemoryStore,
|
||||||
|
*,
|
||||||
|
surrogate_evaluator: SurrogateToolEvaluator | None = None,
|
||||||
|
) -> None:
|
||||||
self.run_store = run_store
|
self.run_store = run_store
|
||||||
|
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
|
||||||
|
|
||||||
async def evaluate(
|
async def evaluate(
|
||||||
self,
|
self,
|
||||||
@ -22,11 +33,30 @@ class SkillDraftEvaluator:
|
|||||||
candidate: SkillLearningCandidate,
|
candidate: SkillLearningCandidate,
|
||||||
draft: SkillDraft,
|
draft: SkillDraft,
|
||||||
provider_bundle: ProviderBundle | None,
|
provider_bundle: ProviderBundle | None,
|
||||||
|
replay_runner: ReplayRunner | None = None,
|
||||||
) -> SkillDraftEvalReport:
|
) -> SkillDraftEvalReport:
|
||||||
if provider_bundle is None or provider_bundle.main_provider is None:
|
if provider_bundle is None or provider_bundle.main_provider is None:
|
||||||
return self._skipped(candidate, draft)
|
return self._skipped(candidate, draft)
|
||||||
|
|
||||||
runs_by_id = {record.run_id: record for record in self.run_store.list_runs()}
|
runs = self.run_store.list_runs()
|
||||||
|
replay_cases = select_replay_cases(candidate, runs)
|
||||||
|
if replay_runner is not None and replay_cases:
|
||||||
|
return await self._evaluate_replay(
|
||||||
|
candidate=candidate,
|
||||||
|
draft=draft,
|
||||||
|
replay_cases=replay_cases,
|
||||||
|
provider_bundle=provider_bundle,
|
||||||
|
replay_runner=replay_runner,
|
||||||
|
)
|
||||||
|
return self._evaluate_heuristic(candidate, draft, runs)
|
||||||
|
|
||||||
|
def _evaluate_heuristic(
|
||||||
|
self,
|
||||||
|
candidate: SkillLearningCandidate,
|
||||||
|
draft: SkillDraft,
|
||||||
|
runs: list,
|
||||||
|
) -> SkillDraftEvalReport:
|
||||||
|
runs_by_id = {record.run_id: record for record in runs}
|
||||||
cases: list[dict] = []
|
cases: list[dict] = []
|
||||||
for run_id in candidate.source_run_ids[:8]:
|
for run_id in candidate.source_run_ids[:8]:
|
||||||
record = runs_by_id.get(run_id)
|
record = runs_by_id.get(run_id)
|
||||||
@ -78,6 +108,78 @@ class SkillDraftEvaluator:
|
|||||||
created_at=_utc_now(),
|
created_at=_utc_now(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def _evaluate_replay(
    self,
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    replay_cases: list[dict],
    provider_bundle: ProviderBundle,
    replay_runner: ReplayRunner,
) -> SkillDraftEvalReport:
    """Replay each case with and without the draft and aggregate the outcome.

    For every entry in ``replay_cases`` two arms are run through
    ``replay_runner``, one after the other: a ``"baseline"`` arm pinned to
    the case's ``baseline_skill_names`` and a ``"candidate"`` arm pinned to
    a skill context built from ``draft``. Both arms use the same model
    settings (at most 4 tool iterations, temperature 0.0). The pair is then
    scored by ``self.surrogate_evaluator``.

    Two records are kept per case: a detailed one (both arm results,
    per-case coverage figures, merged tool calls / artifacts / side
    effects, surrogate notes) and a compact one holding only ids, scores
    and the delta. Both lists, plus the preservation report, are handed to
    ``_report_from_case_reports`` to build the returned report.
    """
    detailed_reports: list[dict] = []
    score_rows: list[dict] = []

    for case in replay_cases:
        run_id = case["run_id"]
        task_text = str(case["task_text"])

        # Baseline arm: only the skills that were pinned for the original run.
        baseline_request = ReplayArmRequest(
            case_id=f"{run_id}:baseline",
            arm="baseline",
            task_text=task_text,
            pinned_skill_names=list(case.get("baseline_skill_names") or []),
            pinned_skill_contexts=[],
            provider_bundle=provider_bundle,
            model_settings={"max_tool_iterations": 4, "temperature": 0.0},
        )
        baseline = await replay_runner.run_arm(baseline_request)

        # Candidate arm: no named skills, just the draft injected as a context.
        candidate_request = ReplayArmRequest(
            case_id=f"{run_id}:candidate",
            arm="candidate",
            task_text=task_text,
            pinned_skill_names=[],
            pinned_skill_contexts=[_draft_skill_context(draft)],
            provider_bundle=provider_bundle,
            model_settings={"max_tool_iterations": 4, "temperature": 0.0},
        )
        candidate_arm = await replay_runner.run_arm(candidate_request)

        verdict = await self.surrogate_evaluator.evaluate(
            task_text=task_text,
            baseline=baseline,
            candidate=candidate_arm,
        )
        baseline_score = verdict["baseline_score"]
        candidate_score = verdict["candidate_score"]
        delta = round(candidate_score - baseline_score, 4)

        detailed_reports.append(
            {
                "run_id": run_id,
                "task_id": case.get("task_id"),
                "session_id": case.get("session_id"),
                "baseline": baseline,
                "candidate": candidate_arm,
                "baseline_score": baseline_score,
                "candidate_score": candidate_score,
                "delta": delta,
                "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
                "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
                "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
                "confidence": verdict["confidence"],
                # Baseline entries first, candidate entries second, for each list.
                **{
                    key: [*baseline.get(key, []), *candidate_arm.get(key, [])]
                    for key in ("tool_calls", "artifacts", "side_effects")
                },
                "validator_notes": list(verdict.get("notes") or []),
            }
        )
        score_rows.append(
            {
                "run_id": run_id,
                "session_id": case.get("session_id") or "",
                "baseline_score": baseline_score,
                "candidate_score": candidate_score,
                "delta": delta,
            }
        )

    preservation = _preservation_report(candidate, draft)
    return _report_from_case_reports(candidate, draft, detailed_reports, score_rows, preservation)
|
||||||
|
|
||||||
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
|
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
|
||||||
return SkillDraftEvalReport(
|
return SkillDraftEvalReport(
|
||||||
report_id=uuid4().hex,
|
report_id=uuid4().hex,
|
||||||
@ -115,6 +217,108 @@ def _candidate_score(baseline: float, draft: SkillDraft) -> float:
|
|||||||
return min(1.0, max(0.75, baseline + 0.05))
|
return min(1.0, max(0.75, baseline + 0.05))
|
||||||
|
|
||||||
|
|
||||||
|
def _draft_skill_context(draft: SkillDraft) -> SkillContext:
    """Wrap a draft's proposed content in a ``SkillContext`` for replay.

    The context is named ``draft:<skill_name>``, carries the draft id as its
    version and the fixed marker ``"draft"`` as its content hash. Tool hints
    come from the ``tools`` entry of the proposed frontmatter: when that
    entry is a list, every item whose string form is not blank is kept (as
    a string); any other value yields no hints.
    """
    declared_tools = draft.proposed_frontmatter.get("tools")
    if isinstance(declared_tools, list):
        hints = [str(tool) for tool in declared_tools if str(tool).strip()]
    else:
        hints = []

    return SkillContext(
        name=f"draft:{draft.skill_name}",
        content=draft.proposed_content,
        version=draft.draft_id,
        content_hash="draft",
        activation_reason="skill_replay_eval_candidate",
        tool_hints=hints,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
|
||||||
|
if candidate.kind not in {"revise_skill", "merge_skills"}:
|
||||||
|
return None
|
||||||
|
base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else ""
|
||||||
|
if not base_content.strip():
|
||||||
|
return None
|
||||||
|
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
|
||||||
|
|
||||||
|
|
||||||
|
def _report_from_case_reports(
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    case_reports: list[dict],
    legacy_cases: list[dict],
    preservation_report: dict | None,
) -> SkillDraftEvalReport:
    """Fold per-case replay results into one completed ``SkillDraftEvalReport``.

    Score averages and the regression / improved / unchanged counts are
    derived from ``legacy_cases``; coverage ratios and the confidence label
    are derived from ``case_reports``. The draft passes when the candidate
    average is at least 0.75, the run is not entirely blocked, and it is
    not the case that some case regressed while the average failed to
    improve.

    ``legacy_cases`` must be non-empty: the averages divide by its length.
    """
    case_count = len(legacy_cases)
    baseline_avg = sum(row["baseline_score"] for row in legacy_cases) / case_count
    candidate_avg = sum(row["candidate_score"] for row in legacy_cases) / case_count

    regression_count = 0
    improved_count = 0
    for row in legacy_cases:
        if row["candidate_score"] < row["baseline_score"]:
            regression_count += 1
        elif row["candidate_score"] > row["baseline_score"]:
            improved_count += 1
    unchanged_count = case_count - regression_count - improved_count

    execution, surrogate, blocked = _coverage(case_reports)
    per_case_confidence = [report.get("confidence") for report in case_reports]
    confidence = _confidence(execution, surrogate, blocked, per_case_confidence)

    score_delta = candidate_avg - baseline_avg
    # A regression only fails the draft when the overall average did not improve.
    net_regression = regression_count > 0 and score_delta <= 0
    passed = candidate_avg >= 0.75 and not net_regression and blocked < 1.0

    return SkillDraftEvalReport(
        report_id=uuid4().hex,
        skill_name=draft.skill_name,
        draft_id=draft.draft_id,
        candidate_id=candidate.candidate_id,
        passed=passed,
        baseline_score_avg=round(baseline_avg, 4),
        candidate_score_avg=round(candidate_avg, 4),
        score_delta=round(score_delta, 4),
        regression_count=regression_count,
        improved_count=improved_count,
        unchanged_count=unchanged_count,
        cases=legacy_cases,
        status="completed",
        created_at=_utc_now(),
        eval_version="replay-v1",
        mode="replay",
        execution_coverage=execution,
        surrogate_coverage=surrogate,
        blocked_coverage=blocked,
        confidence=confidence,
        case_reports=case_reports,
        tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
        preservation_report=preservation_report,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
|
||||||
|
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
|
||||||
|
for report in case_reports:
|
||||||
|
for call in report.get("tool_calls") or []:
|
||||||
|
if isinstance(call, dict) and call.get("mode") in counts:
|
||||||
|
counts[str(call["mode"])] += 1
|
||||||
|
total = sum(counts.values())
|
||||||
|
if total == 0:
|
||||||
|
return 1.0, 0.0, 0.0
|
||||||
|
return (
|
||||||
|
round(counts["executed"] / total, 4),
|
||||||
|
round(counts["surrogate"] / total, 4),
|
||||||
|
round(counts["blocked"] / total, 4),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str:
|
||||||
|
if blocked > 0.0:
|
||||||
|
return "low"
|
||||||
|
if execution >= 0.75 and surrogate <= 0.25:
|
||||||
|
return "high"
|
||||||
|
if execution >= 0.25 or "medium" in case_confidences:
|
||||||
|
return "medium"
|
||||||
|
return "low"
|
||||||
|
|
||||||
|
|
||||||
|
def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float:
|
||||||
|
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
|
||||||
|
if not calls:
|
||||||
|
return 1.0 if mode == "executed" else 0.0
|
||||||
|
return round(sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) / len(calls), 4)
|
||||||
|
|
||||||
|
|
||||||
|
def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int:
|
||||||
|
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
|
||||||
|
return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
|
||||||
|
|
||||||
|
|
||||||
def _utc_now() -> str:
|
def _utc_now() -> str:
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ from beaver.engine.providers import ProviderBundle
|
|||||||
from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore
|
from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore
|
||||||
from beaver.skills.drafts import DraftService
|
from beaver.skills.drafts import DraftService
|
||||||
from beaver.skills.learning.eval import SkillDraftEvaluator
|
from beaver.skills.learning.eval import SkillDraftEvaluator
|
||||||
|
from beaver.skills.learning.replay import ReplayRunner
|
||||||
from beaver.skills.learning.service import SkillLearningService
|
from beaver.skills.learning.service import SkillLearningService
|
||||||
from beaver.skills.learning.safety import SkillDraftSafetyChecker
|
from beaver.skills.learning.safety import SkillDraftSafetyChecker
|
||||||
from beaver.skills.publisher import SkillPublisher
|
from beaver.skills.publisher import SkillPublisher
|
||||||
@ -285,11 +286,17 @@ class SkillLearningPipelineService:
|
|||||||
draft_id: str,
|
draft_id: str,
|
||||||
*,
|
*,
|
||||||
provider_bundle: ProviderBundle | None,
|
provider_bundle: ProviderBundle | None,
|
||||||
|
replay_runner: ReplayRunner | None = None,
|
||||||
) -> SkillDraftEvalReport:
|
) -> SkillDraftEvalReport:
|
||||||
draft = self.get_draft(skill_name, draft_id)
|
draft = self.get_draft(skill_name, draft_id)
|
||||||
candidate = self.get_candidate(candidate_id)
|
candidate = self.get_candidate(candidate_id)
|
||||||
evaluator = self.evaluator or SkillDraftEvaluator(self.learning_service.run_store)
|
evaluator = self.evaluator or SkillDraftEvaluator(self.learning_service.run_store)
|
||||||
report = await evaluator.evaluate(candidate=candidate, draft=draft, provider_bundle=provider_bundle)
|
report = await evaluator.evaluate(
|
||||||
|
candidate=candidate,
|
||||||
|
draft=draft,
|
||||||
|
provider_bundle=provider_bundle,
|
||||||
|
replay_runner=replay_runner,
|
||||||
|
)
|
||||||
self.learning_store.write_eval_report(report)
|
self.learning_store.write_eval_report(report)
|
||||||
if report.status == "skipped_provider_unavailable":
|
if report.status == "skipped_provider_unavailable":
|
||||||
status = "draft_ready"
|
status = "draft_ready"
|
||||||
|
|||||||
@ -44,6 +44,7 @@ def _pipeline(tmp_path: Path, *, task_score: float = 0.8) -> SkillLearningPipeli
|
|||||||
ended_at="end",
|
ended_at="end",
|
||||||
success=True,
|
success=True,
|
||||||
finish_reason="stop",
|
finish_reason="stop",
|
||||||
|
feedback={"acceptance_type": "accept"},
|
||||||
validation_result={"score": task_score, "passed": True},
|
validation_result={"score": task_score, "passed": True},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -156,3 +157,59 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
|
|||||||
assert safety.passed is False
|
assert safety.passed is False
|
||||||
assert report.passed is True
|
assert report.passed is True
|
||||||
assert pipeline.get_candidate("candidate-1").status == "safety_failed"
|
assert pipeline.get_candidate("candidate-1").status == "safety_failed"
|
||||||
|
|
||||||
|
|
||||||
|
class FakeReplayRunner:
    """Replay-runner test double that answers every arm with a canned result."""

    async def run_arm(self, request):
        """Return a fixed arm result that echoes the request's case id, arm and task text.

        The result always reports a single executed ``write_file`` tool call
        and no artifacts or side effects.
        """
        executed_write = {
            "tool_name": "write_file",
            "mode": "executed",
            "arguments": {"path": "README.md"},
            "result": {"success": True, "content": "ok"},
        }
        arm_result = {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "done",
        }
        arm_result["tool_calls"] = [executed_write]
        arm_result["artifacts"] = []
        arm_result["side_effects"] = []
        return arm_result
|
||||||
|
|
||||||
|
|
||||||
|
def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    """Evaluating a draft with a replay runner yields a replay-mode report.

    The report must be tagged ``replay`` / ``replay-v1``, carry per-case
    reports, expose coverage ratios within [0, 1] and use one of the three
    confidence labels.
    """
    pipeline = _pipeline(tmp_path)
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    # Link the seeded candidate to the freshly created draft.
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=FakeReplayRunner(),
    )
    report = asyncio.run(evaluation)

    assert report.mode == "replay"
    assert report.eval_version == "replay-v1"
    assert report.case_reports
    for coverage in (report.execution_coverage, report.surrogate_coverage):
        assert 0.0 <= coverage <= 1.0
    assert report.confidence in {"low", "medium", "high"}
|
||||||
|
|||||||
Reference in New Issue
Block a user