From 3a16dc283d6df1dd5719a62f2acfbb8e14c5cdea Mon Sep 17 00:00:00 2001
From: steven_li
Date: Mon, 8 Jun 2026 13:26:12 +0800
Subject: [PATCH] feat(skill-learning): extend eval report payload

Coverage values read by from_dict are clamped to [0.0, 1.0]; NaN,
non-numeric and overflowing values fall back to the default, and a
non-dict tool_mode_summary is read as an empty dict.

---
 .../backend/beaver/memory/skills/models.py   | 65 +++++++++++++
 .../test_skill_learning_eval_report_model.py | 91 +++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py

diff --git a/app-instance/backend/beaver/memory/skills/models.py b/app-instance/backend/beaver/memory/skills/models.py
index 7151511..c3cea30 100644
--- a/app-instance/backend/beaver/memory/skills/models.py
+++ b/app-instance/backend/beaver/memory/skills/models.py
@@ -227,6 +227,18 @@ class SkillDraftEvalReport:
     cases: list[dict[str, Any]] = field(default_factory=list)
     status: str = "completed"
     created_at: str = ""
+    # Extended eval metadata. The defaults keep reports serialized before these
+    # fields existed loadable; from_dict clamps the *_coverage values to
+    # [0.0, 1.0].
+    eval_version: str = "heuristic-v1"
+    mode: str = "heuristic"
+    execution_coverage: float = 0.0
+    surrogate_coverage: float = 0.0
+    blocked_coverage: float = 0.0
+    confidence: str = "low"
+    case_reports: list[dict[str, Any]] = field(default_factory=list)
+    tool_mode_summary: dict[str, Any] = field(default_factory=dict)
+    preservation_report: dict[str, Any] | None = None
 
     def to_dict(self) -> dict[str, Any]:
         return {
@@ -244,6 +256,17 @@ class SkillDraftEvalReport:
             "cases": [dict(item) for item in self.cases],
             "status": self.status,
             "created_at": self.created_at,
+            "eval_version": self.eval_version,
+            "mode": self.mode,
+            "execution_coverage": self.execution_coverage,
+            "surrogate_coverage": self.surrogate_coverage,
+            "blocked_coverage": self.blocked_coverage,
+            "confidence": self.confidence,
+            "case_reports": [dict(item) for item in self.case_reports],
+            "tool_mode_summary": dict(self.tool_mode_summary),
+            "preservation_report": (
+                dict(self.preservation_report) if self.preservation_report is not None else None
+            ),
         }
 
     @classmethod
@@ -263,6 +286,28 @@ class SkillDraftEvalReport:
             cases=[dict(item) for item in payload.get("cases") or [] if isinstance(item, dict)],
             status=str(payload.get("status") or "completed"),
             created_at=str(payload.get("created_at") or ""),
+            eval_version=str(payload.get("eval_version") or "heuristic-v1"),
+            mode=str(payload.get("mode") or "heuristic"),
+            execution_coverage=_bounded_float(payload.get("execution_coverage"), default=0.0),
+            surrogate_coverage=_bounded_float(payload.get("surrogate_coverage"), default=0.0),
+            blocked_coverage=_bounded_float(payload.get("blocked_coverage"), default=0.0),
+            confidence=str(payload.get("confidence") or "low"),
+            case_reports=[
+                dict(item)
+                for item in payload.get("case_reports") or []
+                if isinstance(item, dict)
+            ],
+            # A non-dict value is read as {} rather than raising inside dict().
+            tool_mode_summary=(
+                dict(payload["tool_mode_summary"])
+                if isinstance(payload.get("tool_mode_summary"), dict)
+                else {}
+            ),
+            preservation_report=(
+                dict(payload["preservation_report"])
+                if isinstance(payload.get("preservation_report"), dict)
+                else None
+            ),
         )
 
 
@@ -272,6 +317,26 @@ def _optional_str(value: Any) -> str | None:
     return str(value)
 
 
+def _bounded_float(value: Any, *, default: float = 0.0) -> float:
+    """Coerce ``value`` to a float clamped to the inclusive range [0.0, 1.0].
+
+    ``default`` is returned when the value is ``None``, an empty string, not
+    convertible to ``float``, or NaN.
+    """
+    if value in (None, ""):
+        return default
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: float() raises it for integers too large to represent.
+        return default
+    if number != number:
+        # NaN never compares equal to itself; without this guard min()/max()
+        # would silently turn NaN into 1.0.
+        return default
+    return max(0.0, min(1.0, number))
+
+
 def _summarize_evidence(payload: dict[str, Any]) -> str:
     evidence = payload.get("evidence")
     if isinstance(evidence, dict):
diff --git a/app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py b/app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py
new file mode 100644
index 0000000..8d3cd7d
--- /dev/null
+++ b/app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from beaver.memory.skills import SkillDraftEvalReport
+
+
+def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
+    report = SkillDraftEvalReport(
+        report_id="eval-1",
+        skill_name="debug",
+        draft_id="draft-1",
+        candidate_id="candidate-1",
+        passed=True,
+        baseline_score_avg=0.5,
+        candidate_score_avg=0.8,
+        score_delta=0.3,
+        regression_count=0,
+        improved_count=2,
+        unchanged_count=0,
+        cases=[{"run_id": "run-1"}],
+        status="completed",
+        created_at="now",
+    )
+
+    payload = report.to_dict()
+
+    assert payload["eval_version"] == "heuristic-v1"
+    assert payload["mode"] == "heuristic"
+    assert payload["execution_coverage"] == 0.0
+    assert payload["surrogate_coverage"] == 0.0
+    assert payload["blocked_coverage"] == 0.0
+    assert payload["confidence"] == "low"
+    assert payload["case_reports"] == []
+    assert payload["tool_mode_summary"] == {}
+    assert payload["preservation_report"] is None
+    assert payload["cases"] == [{"run_id": "run-1"}]
+
+
+def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
+    report = SkillDraftEvalReport.from_dict(
+        {
+            "report_id": "eval-legacy",
+            "skill_name": "debug",
+            "draft_id": "draft-1",
+            "candidate_id": "candidate-1",
+            "passed": True,
+            "baseline_score_avg": 0.4,
+            "candidate_score_avg": 0.8,
+            "score_delta": 0.4,
+            "regression_count": 0,
+            "improved_count": 1,
+            "unchanged_count": 0,
+            "cases": [{"run_id": "run-1"}],
+            "status": "completed",
+            "created_at": "now",
+        }
+    )
+
+    assert report.eval_version == "heuristic-v1"
+    assert report.mode == "heuristic"
+    assert report.confidence == "low"
+    assert report.case_reports == []
+
+
+def test_eval_report_sanitizes_malformed_replay_fields() -> None:
+    report = SkillDraftEvalReport.from_dict(
+        {
+            "report_id": "eval-malformed",
+            "skill_name": "debug",
+            "draft_id": "draft-1",
+            "candidate_id": "candidate-1",
+            "passed": True,
+            "baseline_score_avg": 0.4,
+            "candidate_score_avg": 0.8,
+            "score_delta": 0.4,
+            "regression_count": 0,
+            "improved_count": 1,
+            "unchanged_count": 0,
+            "cases": [{"run_id": "run-1"}],
+            "status": "completed",
+            "created_at": "now",
+            "execution_coverage": float("nan"),
+            "surrogate_coverage": 10**400,
+            "blocked_coverage": 1.5,
+            "tool_mode_summary": "not-a-mapping",
+        }
+    )
+
+    assert report.execution_coverage == 0.0
+    assert report.surrogate_coverage == 0.0
+    assert report.blocked_coverage == 1.0
+    assert report.tool_mode_summary == {}