feat(skill-learning): extend eval report payload

This commit is contained in:
2026-06-08 13:26:12 +08:00
parent 0fd4df3c17
commit 3a16dc283d
2 changed files with 107 additions and 0 deletions

View File

@ -227,6 +227,15 @@ class SkillDraftEvalReport:
cases: list[dict[str, Any]] = field(default_factory=list) cases: list[dict[str, Any]] = field(default_factory=list)
status: str = "completed" status: str = "completed"
created_at: str = "" created_at: str = ""
eval_version: str = "heuristic-v1"
mode: str = "heuristic"
execution_coverage: float = 0.0
surrogate_coverage: float = 0.0
blocked_coverage: float = 0.0
confidence: str = "low"
case_reports: list[dict[str, Any]] = field(default_factory=list)
tool_mode_summary: dict[str, Any] = field(default_factory=dict)
preservation_report: dict[str, Any] | None = None
def to_dict(self) -> dict[str, Any]: def to_dict(self) -> dict[str, Any]:
return { return {
@ -244,6 +253,17 @@ class SkillDraftEvalReport:
"cases": [dict(item) for item in self.cases], "cases": [dict(item) for item in self.cases],
"status": self.status, "status": self.status,
"created_at": self.created_at, "created_at": self.created_at,
"eval_version": self.eval_version,
"mode": self.mode,
"execution_coverage": self.execution_coverage,
"surrogate_coverage": self.surrogate_coverage,
"blocked_coverage": self.blocked_coverage,
"confidence": self.confidence,
"case_reports": [dict(item) for item in self.case_reports],
"tool_mode_summary": dict(self.tool_mode_summary),
"preservation_report": (
dict(self.preservation_report) if self.preservation_report is not None else None
),
} }
@classmethod @classmethod
@ -263,6 +283,23 @@ class SkillDraftEvalReport:
cases=[dict(item) for item in payload.get("cases") or [] if isinstance(item, dict)], cases=[dict(item) for item in payload.get("cases") or [] if isinstance(item, dict)],
status=str(payload.get("status") or "completed"), status=str(payload.get("status") or "completed"),
created_at=str(payload.get("created_at") or ""), created_at=str(payload.get("created_at") or ""),
eval_version=str(payload.get("eval_version") or "heuristic-v1"),
mode=str(payload.get("mode") or "heuristic"),
execution_coverage=_bounded_float(payload.get("execution_coverage"), default=0.0),
surrogate_coverage=_bounded_float(payload.get("surrogate_coverage"), default=0.0),
blocked_coverage=_bounded_float(payload.get("blocked_coverage"), default=0.0),
confidence=str(payload.get("confidence") or "low"),
case_reports=[
dict(item)
for item in payload.get("case_reports") or []
if isinstance(item, dict)
],
tool_mode_summary=dict(payload.get("tool_mode_summary") or {}),
preservation_report=(
dict(payload["preservation_report"])
if isinstance(payload.get("preservation_report"), dict)
else None
),
) )
@ -272,6 +309,15 @@ def _optional_str(value: Any) -> str | None:
return str(value) return str(value)
def _bounded_float(value: Any, *, default: float = 0.0) -> float:
if value in (None, ""):
return default
try:
return max(0.0, min(1.0, float(value)))
except (TypeError, ValueError):
return default
def _summarize_evidence(payload: dict[str, Any]) -> str: def _summarize_evidence(payload: dict[str, Any]) -> str:
evidence = payload.get("evidence") evidence = payload.get("evidence")
if isinstance(evidence, dict): if isinstance(evidence, dict):

View File

@ -0,0 +1,61 @@
from __future__ import annotations
from beaver.memory.skills import SkillDraftEvalReport
def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
    """A report built without the new fields serializes them with their defaults."""
    constructor_kwargs = {
        "report_id": "eval-1",
        "skill_name": "debug",
        "draft_id": "draft-1",
        "candidate_id": "candidate-1",
        "passed": True,
        "baseline_score_avg": 0.5,
        "candidate_score_avg": 0.8,
        "score_delta": 0.3,
        "regression_count": 0,
        "improved_count": 2,
        "unchanged_count": 0,
        "cases": [{"run_id": "run-1"}],
        "status": "completed",
        "created_at": "now",
    }
    serialized = SkillDraftEvalReport(**constructor_kwargs).to_dict()

    # Fields omitted from the constructor call must appear with these values.
    expected_defaults = {
        "eval_version": "heuristic-v1",
        "mode": "heuristic",
        "execution_coverage": 0.0,
        "surrogate_coverage": 0.0,
        "blocked_coverage": 0.0,
        "confidence": "low",
        "case_reports": [],
        "tool_mode_summary": {},
    }
    for key, expected in expected_defaults.items():
        assert serialized[key] == expected
    assert serialized["preservation_report"] is None

    # The explicitly supplied cases list is still emitted unchanged.
    assert serialized["cases"] == [{"run_id": "run-1"}]
def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
    """``from_dict`` fills in defaults when the payload omits the new fields."""
    legacy_payload = {
        "report_id": "eval-legacy",
        "skill_name": "debug",
        "draft_id": "draft-1",
        "candidate_id": "candidate-1",
        "passed": True,
        "baseline_score_avg": 0.4,
        "candidate_score_avg": 0.8,
        "score_delta": 0.4,
        "regression_count": 0,
        "improved_count": 1,
        "unchanged_count": 0,
        "cases": [{"run_id": "run-1"}],
        "status": "completed",
        "created_at": "now",
    }
    restored = SkillDraftEvalReport.from_dict(legacy_payload)

    # None of these keys are present in the payload above.
    expected_attributes = (
        ("eval_version", "heuristic-v1"),
        ("mode", "heuristic"),
        ("confidence", "low"),
        ("case_reports", []),
    )
    for attribute, expected in expected_attributes:
        assert getattr(restored, attribute) == expected