"""Lightweight replay/eval reports for skill drafts.""" from __future__ import annotations from uuid import uuid4 from beaver.engine.providers import ProviderBundle from beaver.memory.runs import RunMemoryStore from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate from beaver.skills.specs import SkillDraft class SkillDraftEvaluator: """Builds a bounded eval report without writing user-visible sessions.""" def __init__(self, run_store: RunMemoryStore) -> None: self.run_store = run_store async def evaluate( self, *, candidate: SkillLearningCandidate, draft: SkillDraft, provider_bundle: ProviderBundle | None, ) -> SkillDraftEvalReport: if provider_bundle is None or provider_bundle.main_provider is None: return self._skipped(candidate, draft) runs_by_id = {record.run_id: record for record in self.run_store.list_runs()} cases: list[dict] = [] for run_id in candidate.source_run_ids[:8]: record = runs_by_id.get(run_id) if record is None: continue baseline = _score_from_validation(record.validation_result, record.success) candidate_score = _candidate_score(baseline, draft) cases.append( { "run_id": run_id, "session_id": record.session_id, "baseline_score": baseline, "candidate_score": candidate_score, "delta": round(candidate_score - baseline, 4), } ) if not cases: cases.append( { "run_id": "", "session_id": "", "baseline_score": 0.75, "candidate_score": _candidate_score(0.75, draft), "delta": round(_candidate_score(0.75, draft) - 0.75, 4), } ) baseline_avg = sum(item["baseline_score"] for item in cases) / len(cases) candidate_avg = sum(item["candidate_score"] for item in cases) / len(cases) regressions = [item for item in cases if item["candidate_score"] < item["baseline_score"]] improved = [item for item in cases if item["candidate_score"] > item["baseline_score"]] unchanged = len(cases) - len(regressions) - len(improved) score_delta = candidate_avg - baseline_avg passed = not (len(regressions) > 0 and score_delta <= 0) and candidate_avg >= 0.75 return SkillDraftEvalReport( report_id=uuid4().hex, skill_name=draft.skill_name, draft_id=draft.draft_id, candidate_id=candidate.candidate_id, passed=passed, baseline_score_avg=round(baseline_avg, 4), candidate_score_avg=round(candidate_avg, 4), score_delta=round(score_delta, 4), regression_count=len(regressions), improved_count=len(improved), unchanged_count=unchanged, cases=cases, status="completed", created_at=_utc_now(), ) def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport: return SkillDraftEvalReport( report_id=uuid4().hex, skill_name=draft.skill_name, draft_id=draft.draft_id, candidate_id=candidate.candidate_id, passed=True, baseline_score_avg=0.0, candidate_score_avg=0.0, score_delta=0.0, regression_count=0, improved_count=0, unchanged_count=0, cases=[], status="skipped_provider_unavailable", created_at=_utc_now(), ) def _score_from_validation(validation: dict | None, success: bool) -> float: if isinstance(validation, dict) and "score" in validation: try: return max(0.0, min(1.0, float(validation.get("score") or 0.0))) except (TypeError, ValueError): pass return 0.8 if success else 0.4 def _candidate_score(baseline: float, draft: SkillDraft) -> float: content = draft.proposed_content.strip() if not content and draft.proposal_kind != "retire_skill": return 0.0 if "regression" in content.lower(): return max(0.0, baseline - 0.2) return min(1.0, max(0.75, baseline + 0.05)) def _utc_now() -> str: from datetime import datetime, timezone return datetime.now(timezone.utc).isoformat()