"""Lightweight replay/eval reports for skill drafts.""" from __future__ import annotations import json from typing import Any from uuid import uuid4 from beaver.engine.context import SkillContext from beaver.engine.providers import ProviderBundle from beaver.memory.runs import RunMemoryStore from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate from beaver.skills.learning.case_selection import select_replay_cases from beaver.skills.learning.preservation import check_preservation from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner from beaver.skills.learning.surrogate import SurrogateToolEvaluator from beaver.skills.specs import SkillDraft class SkillDraftEvaluator: """Builds a bounded eval report without writing user-visible sessions.""" def __init__( self, run_store: RunMemoryStore, *, surrogate_evaluator: SurrogateToolEvaluator | None = None, ) -> None: self.run_store = run_store self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator() async def evaluate( self, *, candidate: SkillLearningCandidate, draft: SkillDraft, provider_bundle: ProviderBundle | None, replay_runner: ReplayRunner | None = None, ) -> SkillDraftEvalReport: if provider_bundle is None or provider_bundle.main_provider is None: return self._skipped(candidate, draft) runs = self.run_store.list_runs() if replay_runner is not None: replay_cases, case_selection_meta = await _prepare_eval_cases( candidate=candidate, draft=draft, historical_cases=select_replay_cases(candidate, runs), provider_bundle=provider_bundle, ) else: replay_cases = [] case_selection_meta = {} if replay_runner is not None and replay_cases: return await self._evaluate_replay( candidate=candidate, draft=draft, replay_cases=replay_cases, provider_bundle=provider_bundle, replay_runner=replay_runner, case_selection_meta=case_selection_meta, ) return self._evaluate_heuristic(candidate, draft, runs) def _evaluate_heuristic( self, candidate: SkillLearningCandidate, draft: SkillDraft, runs: list, ) -> SkillDraftEvalReport: runs_by_id = {record.run_id: record for record in runs} cases: list[dict] = [] for run_id in candidate.source_run_ids[:10]: record = runs_by_id.get(run_id) if record is None: continue baseline = _score_from_validation(record.validation_result, record.success) candidate_score = _candidate_score(baseline, draft) cases.append( { "run_id": run_id, "session_id": record.session_id, "baseline_score": baseline, "candidate_score": candidate_score, "delta": round(candidate_score - baseline, 4), } ) if not cases: cases.append( { "run_id": "", "session_id": "", "baseline_score": 0.75, "candidate_score": _candidate_score(0.75, draft), "delta": round(_candidate_score(0.75, draft) - 0.75, 4), } ) baseline_avg = sum(item["baseline_score"] for item in cases) / len(cases) candidate_avg = sum(item["candidate_score"] for item in cases) / len(cases) regressions = [item for item in cases if item["candidate_score"] < item["baseline_score"]] improved = [item for item in cases if item["candidate_score"] > item["baseline_score"]] unchanged = len(cases) - len(regressions) - len(improved) score_delta = candidate_avg - baseline_avg passed = not (len(regressions) > 0 and score_delta <= 0) and candidate_avg >= 0.75 return SkillDraftEvalReport( report_id=uuid4().hex, skill_name=draft.skill_name, draft_id=draft.draft_id, candidate_id=candidate.candidate_id, passed=passed, baseline_score_avg=round(baseline_avg, 4), candidate_score_avg=round(candidate_avg, 4), score_delta=round(score_delta, 4), regression_count=len(regressions), improved_count=len(improved), unchanged_count=unchanged, cases=cases, status="completed", created_at=_utc_now(), ) async def _evaluate_replay( self, *, candidate: SkillLearningCandidate, draft: SkillDraft, replay_cases: list[dict], provider_bundle: ProviderBundle, replay_runner: ReplayRunner, case_selection_meta: dict[str, Any] | None = None, ) -> SkillDraftEvalReport: case_reports: list[dict] = [] legacy_cases: list[dict] = [] for case in replay_cases: baseline = await replay_runner.run_arm( ReplayArmRequest( case_id=f"{case['run_id']}:baseline", arm="baseline", task_text=str(case["task_text"]), pinned_skill_names=list(case.get("baseline_skill_names") or []), pinned_skill_contexts=[], provider_bundle=provider_bundle, model_settings={"max_tool_iterations": 4, "temperature": 0.0}, ) ) candidate_arm = await replay_runner.run_arm( ReplayArmRequest( case_id=f"{case['run_id']}:candidate", arm="candidate", task_text=str(case["task_text"]), pinned_skill_names=[], pinned_skill_contexts=[_draft_skill_context(draft)], provider_bundle=provider_bundle, model_settings={"max_tool_iterations": 4, "temperature": 0.0}, ) ) surrogate = await self.surrogate_evaluator.evaluate( task_text=str(case["task_text"]), baseline=baseline, candidate=candidate_arm, ) baseline_ability = _ability_score( case=case, arm=baseline, arm_name="baseline", ) candidate_ability = _ability_score( case=case, arm=candidate_arm, arm_name="candidate", ) baseline_score = baseline_ability["final_score"] candidate_score = candidate_ability["final_score"] tool_execution_score = { "baseline_score": surrogate["baseline_score"], "candidate_score": surrogate["candidate_score"], "delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4), "score_role": "diagnostic_only", } case_report = { "run_id": case["run_id"], "task_id": case.get("task_id"), "session_id": case.get("session_id"), "task_text": case.get("task_text"), "synthetic": bool(case.get("synthetic")), "tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"), "validator": case.get("validator"), "baseline": baseline, "candidate": candidate_arm, "baseline_score": baseline_score, "candidate_score": candidate_score, "delta": round(candidate_score - baseline_score, 4), "ability_score": { "baseline": baseline_ability, "candidate": candidate_ability, "delta": round(candidate_score - baseline_score, 4), }, "tool_execution_score": tool_execution_score, "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"), "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"), "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"), "confidence": surrogate["confidence"], "tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])], "artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])], "side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])], "validator_notes": list(surrogate.get("notes") or []), } case_reports.append(case_report) legacy_cases.append( { "run_id": case["run_id"], "session_id": case.get("session_id") or "", "task_text": case.get("task_text") or "", "synthetic": bool(case.get("synthetic")), "tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"), "baseline_score": baseline_score, "candidate_score": candidate_score, "delta": round(candidate_score - baseline_score, 4), } ) preservation_report = _preservation_report(candidate, draft) return _report_from_case_reports( candidate, draft, case_reports, legacy_cases, preservation_report, case_selection_meta or {}, ) def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport: return SkillDraftEvalReport( report_id=uuid4().hex, skill_name=draft.skill_name, draft_id=draft.draft_id, candidate_id=candidate.candidate_id, passed=True, baseline_score_avg=0.0, candidate_score_avg=0.0, score_delta=0.0, regression_count=0, improved_count=0, unchanged_count=0, cases=[], status="skipped_provider_unavailable", created_at=_utc_now(), ) def _score_from_validation(validation: dict | None, success: bool) -> float: if isinstance(validation, dict) and "score" in validation: try: return max(0.0, min(1.0, float(validation.get("score") or 0.0))) except (TypeError, ValueError): pass return 0.8 if success else 0.4 def _candidate_score(baseline: float, draft: SkillDraft) -> float: content = draft.proposed_content.strip() if not content and draft.proposal_kind != "retire_skill": return 0.0 if "regression" in content.lower(): return max(0.0, baseline - 0.2) return min(1.0, max(0.75, baseline + 0.05)) def _draft_skill_context(draft: SkillDraft) -> SkillContext: tool_hints = draft.proposed_frontmatter.get("tools") return SkillContext( name=f"draft:{draft.skill_name}", content=draft.proposed_content, version=draft.draft_id, content_hash="draft", activation_reason="skill_replay_eval_candidate", tool_hints=[str(item) for item in tool_hints if str(item).strip()] if isinstance(tool_hints, list) else [], ) def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None: if candidate.kind not in {"revise_skill", "merge_skills"}: return None base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else "" if not base_content.strip(): return None return check_preservation(base_content=base_content, draft_content=draft.proposed_content) async def _prepare_eval_cases( *, candidate: SkillLearningCandidate, draft: SkillDraft, historical_cases: list[dict[str, Any]], provider_bundle: ProviderBundle, ) -> tuple[list[dict[str, Any]], dict[str, Any]]: explicit_cases = _explicit_eval_cases(candidate) merged = _dedupe_cases([*explicit_cases, *historical_cases]) usable, excluded = _filter_unscorable_cases(merged) missing = max(0, 10 - len(usable)) generated: list[dict[str, Any]] = [] if missing: generated = await _generate_synthetic_cases( candidate=candidate, draft=draft, historical_cases=usable, provider_bundle=provider_bundle, count=missing, ) generated, generated_excluded = _filter_unscorable_cases(generated) excluded["synthetic_without_validator"] += generated_excluded["synthetic_without_validator"] if len(generated) < missing: generated.extend( _fallback_synthetic_cases( candidate=candidate, historical_cases=usable, start_index=len(generated) + 1, count=missing - len(generated), ) ) prepared = [*usable, *generated] return prepared[:10], { "requested_case_count": 10, "historical_case_count": len(historical_cases), "explicit_case_count": len(explicit_cases), "generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")), "excluded_synthetic_without_validator": excluded["synthetic_without_validator"], } def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]: raw_cases = candidate.evidence.get("eval_cases") if isinstance(candidate.evidence, dict) else None if not isinstance(raw_cases, list): return [] result: list[dict[str, Any]] = [] for index, raw in enumerate(raw_cases, start=1): if not isinstance(raw, dict): continue task_text = str(raw.get("task_text") or "").strip() if not task_text: continue case = { "run_id": str(raw.get("run_id") or f"explicit:{candidate.candidate_id}:{index:02d}"), "task_id": raw.get("task_id") or f"explicit-{index:02d}", "session_id": raw.get("session_id") or "explicit-eval", "task_text": task_text, "baseline_skill_names": list(raw.get("baseline_skill_names") or _baseline_skill_names(candidate)), "candidate_skill_name": raw.get("candidate_skill_name") or candidate.draft_skill_name, "accepted_score": _bounded_score(raw.get("accepted_score"), default=0.75), "synthetic": bool(raw.get("synthetic")), "tier": raw.get("tier") or ("bronze" if raw.get("synthetic") else "gold"), } if isinstance(raw.get("validator"), dict): case["validator"] = dict(raw["validator"]) result.append(case) return result def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]: result: list[dict[str, Any]] = [] seen: set[str] = set() for case in cases: run_id = str(case.get("run_id") or "") task_text = str(case.get("task_text") or "") key = run_id or task_text if not key or key in seen: continue seen.add(key) result.append(case) return result def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]: result: list[dict[str, Any]] = [] excluded = {"synthetic_without_validator": 0} for case in cases: if case.get("synthetic") and not isinstance(case.get("validator"), dict): excluded["synthetic_without_validator"] += 1 continue result.append(case) return result, excluded async def _generate_synthetic_cases( *, candidate: SkillLearningCandidate, draft: SkillDraft, historical_cases: list[dict[str, Any]], provider_bundle: ProviderBundle, count: int, ) -> list[dict[str, Any]]: provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime model = getattr(runtime, "model", None) try: response = await provider.chat( messages=[ { "role": "system", "content": ( "You generate validator-first Beaver skill evaluation cases. " "Return only JSON with key cases. Each case must include task_text and validator. " "Validator type should be final_answer_contains with required_terms and optional forbidden_terms." ), }, { "role": "user", "content": _synthetic_case_prompt( candidate=candidate, draft=draft, historical_cases=historical_cases, count=count, ), }, ], model=model, max_tokens=2200, temperature=0.4, ) except Exception: return [] payload = _parse_json_payload(response.content or "") raw_cases = payload.get("cases") if isinstance(payload, dict) else None if not isinstance(raw_cases, list): return [] return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count) def _synthetic_case_prompt( *, candidate: SkillLearningCandidate, draft: SkillDraft, historical_cases: list[dict[str, Any]], count: int, ) -> str: historical = [ { "run_id": item.get("run_id"), "task_text": item.get("task_text"), "validator": item.get("validator"), } for item in historical_cases ] return ( f"Generate {count} synthetic evaluation cases for this skill draft.\n\n" f"Candidate kind: {candidate.kind}\n" f"Candidate reason: {candidate.reason}\n" f"Draft skill name: {draft.skill_name}\n" f"Related skills: {candidate.related_skill_names}\n" f"Historical cases:\n{json.dumps(historical, ensure_ascii=False)}\n\n" "Every synthetic case must be validator-first. Return exactly:\n" '{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",' '"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}' ) def _parse_json_payload(content: str) -> dict[str, Any]: cleaned = content.strip() if cleaned.startswith("```"): cleaned = cleaned.strip("`") if cleaned.startswith("json"): cleaned = cleaned[4:] try: payload = json.loads(cleaned) except json.JSONDecodeError: start = cleaned.find("{") end = cleaned.rfind("}") if start < 0 or end <= start: return {} try: payload = json.loads(cleaned[start : end + 1]) except json.JSONDecodeError: return {} return payload if isinstance(payload, dict) else {} def _synthetic_case_payloads( candidate: SkillLearningCandidate, raw_cases: list[Any], *, start_index: int, limit: int, ) -> list[dict[str, Any]]: result: list[dict[str, Any]] = [] for raw in raw_cases: if not isinstance(raw, dict): continue task_text = str(raw.get("task_text") or "").strip() validator = raw.get("validator") if not task_text or not isinstance(validator, dict): continue result.append( _synthetic_case_payload( candidate, task_text, start_index + len(result), validator=dict(validator), tier=str(raw.get("tier") or "bronze"), ) ) if len(result) >= limit: break return result def _fallback_synthetic_cases( *, candidate: SkillLearningCandidate, historical_cases: list[dict[str, Any]], start_index: int, count: int, ) -> list[dict[str, Any]]: seed_text = "" if historical_cases: seed_text = str(historical_cases[(start_index - 1) % len(historical_cases)].get("task_text") or "") if not seed_text: seed_text = candidate.reason or candidate.draft_skill_name or "the candidate skill" required_terms = _terms(seed_text)[:2] or ["done"] return [ _synthetic_case_payload( candidate, f"Complete a realistic task related to {seed_text}. Scenario {index}.", index, validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []}, tier="bronze", ) for index in range(start_index, start_index + count) ] def _synthetic_case_payload( candidate: SkillLearningCandidate, task_text: str, index: int, *, validator: dict[str, Any], tier: str, ) -> dict[str, Any]: return { "run_id": f"synthetic:{candidate.candidate_id}:{index:02d}", "task_id": f"synthetic-{index:02d}", "session_id": "synthetic-eval", "task_text": task_text, "baseline_skill_names": _baseline_skill_names(candidate), "candidate_skill_name": candidate.draft_skill_name, "accepted_score": 0.75, "synthetic": True, "tier": tier, "validator": validator, } def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]: if candidate.kind == "revise_skill": return list(candidate.related_skill_names[:1]) if candidate.kind == "merge_skills": return list(candidate.related_skill_names) return [] def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]: validator = case.get("validator") if isinstance(case.get("validator"), dict) else None if validator is not None: return _ability_from_validator(validator, arm) if not case.get("synthetic"): score = _bounded_score(case.get("accepted_score"), default=0.75) if arm_name == "baseline" else _ability_from_output(arm)["final_score"] return _ability_breakdown(score=score, source="user_feedback" if arm_name == "baseline" else "llm_judge") return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."]) def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]: validator_type = str(validator.get("type") or "") if validator_type != "final_answer_contains": return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"]) answer = str(arm.get("final_answer") or "").lower() required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()] forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()] matched = sum(1 for term in required_terms if term in answer) outcome = matched / max(1, len(required_terms)) unsafe = any(term in answer for term in forbidden_terms) safety = 0.0 if unsafe else 1.0 final_score = ( 0.40 * outcome + 0.25 * outcome + 0.15 * _process_validity(arm) + 0.10 * safety + 0.10 * _path_efficiency(arm, outcome) ) return { **_ability_breakdown(score=final_score, source="auto_validator"), "outcome_correctness": round(outcome, 4), "artifact_correctness": round(outcome, 4), "safety_no_regression": round(safety, 4), "validator_type": validator_type, } def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]: answer = str(arm.get("final_answer") or "").strip() score = 0.7 if answer and arm.get("finish_reason") != "error" else 0.3 return _ability_breakdown(score=score, source=source, notes=notes) def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]: bounded = _bounded_score(score, default=0.0) return { "outcome_correctness": bounded, "artifact_correctness": bounded, "process_validity": bounded, "safety_no_regression": bounded, "path_efficiency": bounded, "final_score": round(bounded, 4), "source": source, "notes": list(notes or []), } def _process_validity(arm: dict[str, Any]) -> float: if arm.get("finish_reason") == "error": return 0.2 return 0.8 if arm.get("tool_calls") else 0.6 def _path_efficiency(arm: dict[str, Any], outcome: float) -> float: if outcome < 0.5: return 0.3 call_count = len([item for item in arm.get("tool_calls") or [] if isinstance(item, dict)]) if call_count <= 3: return 1.0 if call_count <= 6: return 0.7 return 0.4 def _bounded_score(value: Any, *, default: float) -> float: try: return max(0.0, min(1.0, float(value))) except (TypeError, ValueError): return default def _terms(text: str) -> list[str]: return [part.strip(".,:;!?()[]{}").lower() for part in text.split() if len(part.strip(".,:;!?()[]{}")) > 3] def _report_from_case_reports( candidate: SkillLearningCandidate, draft: SkillDraft, case_reports: list[dict], legacy_cases: list[dict], preservation_report: dict | None, case_selection_meta: dict[str, Any] | None = None, ) -> SkillDraftEvalReport: baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases) candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases) regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]] improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]] unchanged = len(legacy_cases) - len(regressions) - len(improved) real_cases = [item for item in legacy_cases if not item.get("synthetic")] synthetic_cases = [item for item in legacy_cases if item.get("synthetic")] execution, surrogate, blocked = _coverage(case_reports) confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports]) score_delta = candidate_avg - baseline_avg passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0 selection_meta = dict(case_selection_meta or {}) real_score_avg = _avg([item["candidate_score"] for item in real_cases]) synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases]) overall_score_avg = round(candidate_avg, 4) ability_summary = { "score_role": "primary", "real_case_count": len(real_cases), "synthetic_case_count": len(synthetic_cases), "real_score_avg": real_score_avg, "synthetic_score_avg": synthetic_score_avg, "overall_score_avg": overall_score_avg, } tool_execution_summary = { "score_role": "diagnostic_only", "executed": execution, "surrogate": surrogate, "blocked": blocked, } return SkillDraftEvalReport( report_id=uuid4().hex, skill_name=draft.skill_name, draft_id=draft.draft_id, candidate_id=candidate.candidate_id, passed=passed, baseline_score_avg=round(baseline_avg, 4), candidate_score_avg=round(candidate_avg, 4), score_delta=round(score_delta, 4), regression_count=len(regressions), improved_count=len(improved), unchanged_count=unchanged, cases=legacy_cases, status="completed", created_at=_utc_now(), eval_version="replay-v1", mode="replay", execution_coverage=execution, surrogate_coverage=surrogate, blocked_coverage=blocked, confidence=confidence, case_reports=case_reports, tool_mode_summary={ "executed": execution, "surrogate": surrogate, "blocked": blocked, "score_role": "diagnostic_only", "real_case_count": len(real_cases), "synthetic_case_count": len(synthetic_cases), "real_score_avg": real_score_avg, "synthetic_score_avg": synthetic_score_avg, "overall_score_avg": overall_score_avg, **selection_meta, }, ability_score_summary=ability_summary, tool_execution_summary=tool_execution_summary, case_selection_summary=selection_meta, real_score_avg=real_score_avg, synthetic_score_avg=synthetic_score_avg, overall_score_avg=overall_score_avg, preservation_report=preservation_report, ) def _avg(values: list[float]) -> float | None: if not values: return None return round(sum(values) / len(values), 4) def _coverage(case_reports: list[dict]) -> tuple[float, float, float]: counts = {"executed": 0, "surrogate": 0, "blocked": 0} for report in case_reports: for call in report.get("tool_calls") or []: if isinstance(call, dict) and call.get("mode") in counts: counts[str(call["mode"])] += 1 total = sum(counts.values()) if total == 0: return 1.0, 0.0, 0.0 return ( round(counts["executed"] / total, 4), round(counts["surrogate"] / total, 4), round(counts["blocked"] / total, 4), ) def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str: if blocked > 0.0: return "low" if execution >= 0.75 and surrogate <= 0.25: return "high" if execution >= 0.25 or "medium" in case_confidences: return "medium" return "low" def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float: calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])] if not calls: return 1.0 if mode == "executed" else 0.0 return round(sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) / len(calls), 4) def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int: calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])] return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) def _utc_now() -> str: from datetime import datetime, timezone return datetime.now(timezone.utc).isoformat()