feat(skill-learning): produce replay eval reports

This commit is contained in:
2026-06-08 13:35:58 +08:00
parent cc1bf85517
commit 64d789a3d0
3 changed files with 271 additions and 3 deletions

View File

@@ -44,6 +44,7 @@ def _pipeline(tmp_path: Path, *, task_score: float = 0.8) -> SkillLearningPipeli
ended_at="end",
success=True,
finish_reason="stop",
feedback={"acceptance_type": "accept"},
validation_result={"score": task_score, "passed": True},
)
)
@@ -156,3 +157,59 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
assert safety.passed is False
assert report.passed is True
assert pipeline.get_candidate("candidate-1").status == "safety_failed"
class FakeReplayRunner:
    """Test double for a replay runner that returns a canned arm result."""

    async def run_arm(self, request):
        """Return a fixed replay result describing one executed tool call.

        ``case_id``, ``arm`` and ``task_text`` are copied from ``request``,
        and ``run_id`` is derived from the arm name. Every other field is a
        constant, so the result is fully determined by the request.
        """
        return {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "done",
            "tool_calls": [
                {
                    "tool_name": "write_file",
                    "mode": "executed",
                    "arguments": {"path": "README.md"},
                    "result": {"success": True, "content": "ok"},
                },
            ],
            "artifacts": [],
            "side_effects": [],
        }
def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    """Evaluating a draft with a replay runner yields a replay-mode report.

    Creates a new skill draft, links it to ``candidate-1``, runs
    ``evaluate_draft`` with ``FakeReplayRunner`` and checks the report's
    mode, version, case reports, coverage ranges and confidence label.
    """
    pipeline = _pipeline(tmp_path)
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    # Link the draft to the candidate before it is evaluated.
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    report = asyncio.run(
        pipeline.evaluate_draft(
            "candidate-1",
            draft.skill_name,
            draft.draft_id,
            provider_bundle=_bundle(),
            replay_runner=FakeReplayRunner(),
        )
    )

    assert report.mode == "replay"
    assert report.eval_version == "replay-v1"
    assert report.case_reports
    # Both coverage figures must lie within [0, 1].
    assert 0.0 <= report.execution_coverage <= 1.0
    assert 0.0 <= report.surrogate_coverage <= 1.0
    assert report.confidence in {"low", "medium", "high"}