feat(skill-learning): produce replay eval reports
This commit is contained in:
@ -44,6 +44,7 @@ def _pipeline(tmp_path: Path, *, task_score: float = 0.8) -> SkillLearningPipeli
|
||||
ended_at="end",
|
||||
success=True,
|
||||
finish_reason="stop",
|
||||
feedback={"acceptance_type": "accept"},
|
||||
validation_result={"score": task_score, "passed": True},
|
||||
)
|
||||
)
|
||||
@ -156,3 +157,59 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
|
||||
assert safety.passed is False
|
||||
assert report.passed is True
|
||||
assert pipeline.get_candidate("candidate-1").status == "safety_failed"
|
||||
|
||||
|
||||
class FakeReplayRunner:
    """Test double for a replay runner that returns a canned arm result."""

    async def run_arm(self, request):
        """Return a fixed replay record echoing the request's identifiers.

        Only ``request.case_id``, ``request.arm`` and ``request.task_text``
        are read; everything else in the record is hard-coded.
        """
        # A single successful, executed tool invocation.
        write_call = {
            "tool_name": "write_file",
            "mode": "executed",
            "arguments": {"path": "README.md"},
            "result": {"success": True, "content": "ok"},
        }
        arm_name = request.arm
        record = {
            "case_id": request.case_id,
            "arm": arm_name,
            "session_id": "session-replay",
            "run_id": f"{arm_name}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "done",
            "tool_calls": [write_call],
            "artifacts": [],
            "side_effects": [],
        }
        return record
|
||||
|
||||
|
||||
def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    """Evaluating a draft with a replay runner yields a replay-mode report.

    Checks the report's mode and version tags, that case reports are present,
    that both coverage figures lie in [0, 1], and that confidence is one of
    the three allowed labels.
    """
    learning_pipeline = _pipeline(tmp_path)

    # Create a fresh skill draft and attach it to the existing candidate.
    new_draft = learning_pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    learning_pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=new_draft.skill_name,
        draft_id=new_draft.draft_id,
    )

    # Build the coroutine first, then drive it to completion synchronously.
    pending_eval = learning_pipeline.evaluate_draft(
        "candidate-1",
        new_draft.skill_name,
        new_draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=FakeReplayRunner(),
    )
    report = asyncio.run(pending_eval)

    assert report.mode == "replay"
    assert report.eval_version == "replay-v1"
    assert report.case_reports

    executed_share = report.execution_coverage
    assert 0.0 <= executed_share <= 1.0
    surrogate_share = report.surrogate_coverage
    assert 0.0 <= surrogate_share <= 1.0

    allowed_confidence = {"low", "medium", "high"}
    assert report.confidence in allowed_confidence
|
||||
|
||||
Reference in New Issue
Block a user