feat(skill-learning): extend eval report payload
This commit is contained in:
@ -227,6 +227,15 @@ class SkillDraftEvalReport:
|
|||||||
cases: list[dict[str, Any]] = field(default_factory=list)
|
cases: list[dict[str, Any]] = field(default_factory=list)
|
||||||
status: str = "completed"
|
status: str = "completed"
|
||||||
created_at: str = ""
|
created_at: str = ""
|
||||||
|
eval_version: str = "heuristic-v1"
|
||||||
|
mode: str = "heuristic"
|
||||||
|
execution_coverage: float = 0.0
|
||||||
|
surrogate_coverage: float = 0.0
|
||||||
|
blocked_coverage: float = 0.0
|
||||||
|
confidence: str = "low"
|
||||||
|
case_reports: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
tool_mode_summary: dict[str, Any] = field(default_factory=dict)
|
||||||
|
preservation_report: dict[str, Any] | None = None
|
||||||
|
|
||||||
def to_dict(self) -> dict[str, Any]:
|
def to_dict(self) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
@ -244,6 +253,17 @@ class SkillDraftEvalReport:
|
|||||||
"cases": [dict(item) for item in self.cases],
|
"cases": [dict(item) for item in self.cases],
|
||||||
"status": self.status,
|
"status": self.status,
|
||||||
"created_at": self.created_at,
|
"created_at": self.created_at,
|
||||||
|
"eval_version": self.eval_version,
|
||||||
|
"mode": self.mode,
|
||||||
|
"execution_coverage": self.execution_coverage,
|
||||||
|
"surrogate_coverage": self.surrogate_coverage,
|
||||||
|
"blocked_coverage": self.blocked_coverage,
|
||||||
|
"confidence": self.confidence,
|
||||||
|
"case_reports": [dict(item) for item in self.case_reports],
|
||||||
|
"tool_mode_summary": dict(self.tool_mode_summary),
|
||||||
|
"preservation_report": (
|
||||||
|
dict(self.preservation_report) if self.preservation_report is not None else None
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -263,6 +283,23 @@ class SkillDraftEvalReport:
|
|||||||
cases=[dict(item) for item in payload.get("cases") or [] if isinstance(item, dict)],
|
cases=[dict(item) for item in payload.get("cases") or [] if isinstance(item, dict)],
|
||||||
status=str(payload.get("status") or "completed"),
|
status=str(payload.get("status") or "completed"),
|
||||||
created_at=str(payload.get("created_at") or ""),
|
created_at=str(payload.get("created_at") or ""),
|
||||||
|
eval_version=str(payload.get("eval_version") or "heuristic-v1"),
|
||||||
|
mode=str(payload.get("mode") or "heuristic"),
|
||||||
|
execution_coverage=_bounded_float(payload.get("execution_coverage"), default=0.0),
|
||||||
|
surrogate_coverage=_bounded_float(payload.get("surrogate_coverage"), default=0.0),
|
||||||
|
blocked_coverage=_bounded_float(payload.get("blocked_coverage"), default=0.0),
|
||||||
|
confidence=str(payload.get("confidence") or "low"),
|
||||||
|
case_reports=[
|
||||||
|
dict(item)
|
||||||
|
for item in payload.get("case_reports") or []
|
||||||
|
if isinstance(item, dict)
|
||||||
|
],
|
||||||
|
tool_mode_summary=dict(payload.get("tool_mode_summary") or {}),
|
||||||
|
preservation_report=(
|
||||||
|
dict(payload["preservation_report"])
|
||||||
|
if isinstance(payload.get("preservation_report"), dict)
|
||||||
|
else None
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -272,6 +309,15 @@ def _optional_str(value: Any) -> str | None:
|
|||||||
return str(value)
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _bounded_float(value: Any, *, default: float = 0.0) -> float:
|
||||||
|
if value in (None, ""):
|
||||||
|
return default
|
||||||
|
try:
|
||||||
|
return max(0.0, min(1.0, float(value)))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
def _summarize_evidence(payload: dict[str, Any]) -> str:
|
def _summarize_evidence(payload: dict[str, Any]) -> str:
|
||||||
evidence = payload.get("evidence")
|
evidence = payload.get("evidence")
|
||||||
if isinstance(evidence, dict):
|
if isinstance(evidence, dict):
|
||||||
|
|||||||
@ -0,0 +1,61 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from beaver.memory.skills import SkillDraftEvalReport
|
||||||
|
|
||||||
|
|
||||||
|
def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
    """A report built without the added fields serializes them with defaults."""
    legacy_kwargs = {
        "report_id": "eval-1",
        "skill_name": "debug",
        "draft_id": "draft-1",
        "candidate_id": "candidate-1",
        "passed": True,
        "baseline_score_avg": 0.5,
        "candidate_score_avg": 0.8,
        "score_delta": 0.3,
        "regression_count": 0,
        "improved_count": 2,
        "unchanged_count": 0,
        "cases": [{"run_id": "run-1"}],
        "status": "completed",
        "created_at": "now",
    }
    serialized = SkillDraftEvalReport(**legacy_kwargs).to_dict()

    # Added keys must be present and carry their default values.
    expected_defaults = {
        "eval_version": "heuristic-v1",
        "mode": "heuristic",
        "execution_coverage": 0.0,
        "surrogate_coverage": 0.0,
        "blocked_coverage": 0.0,
        "confidence": "low",
        "case_reports": [],
        "tool_mode_summary": {},
    }
    for key, expected in expected_defaults.items():
        assert serialized[key] == expected
    assert serialized["preservation_report"] is None

    # The explicitly supplied cases are serialized unchanged.
    assert serialized["cases"] == [{"run_id": "run-1"}]
|
||||||
|
|
||||||
|
|
||||||
|
def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
    """Loading a payload that lacks the added keys falls back to defaults."""
    legacy_payload = {
        "report_id": "eval-legacy",
        "skill_name": "debug",
        "draft_id": "draft-1",
        "candidate_id": "candidate-1",
        "passed": True,
        "baseline_score_avg": 0.4,
        "candidate_score_avg": 0.8,
        "score_delta": 0.4,
        "regression_count": 0,
        "improved_count": 1,
        "unchanged_count": 0,
        "cases": [{"run_id": "run-1"}],
        "status": "completed",
        "created_at": "now",
    }
    restored = SkillDraftEvalReport.from_dict(legacy_payload)

    # Attributes absent from the payload must take their default values.
    expected_attributes = (
        ("eval_version", "heuristic-v1"),
        ("mode", "heuristic"),
        ("confidence", "low"),
        ("case_reports", []),
    )
    for attribute, expected in expected_attributes:
        assert getattr(restored, attribute) == expected
|
||||||
Reference in New Issue
Block a user