feat(tasks): add skill-templated task graph execution
This commit is contained in:
@@ -395,6 +395,52 @@ def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> No
|
||||
assert report.synthetic_score_avg is not None
|
||||
|
||||
|
||||
def test_replay_real_case_without_validator_uses_same_output_scoring_for_both_arms(tmp_path: Path) -> None:
    """Check that identical replay answers get identical scores on both arms.

    The candidate's evidence holds one eval case whose dict carries no
    validator entry, and the fake replay runner is configured with the same
    answer text for the baseline and the candidate. The report entry for
    that run is expected to show equal baseline and candidate scores and a
    zero delta, both in ``report.case_reports`` and in ``report.cases``.
    """
    candidate_id = "candidate-1"
    run_id = "real-no-validator"
    identical_answer = "Release checklist summarized."

    pipeline = _pipeline(tmp_path, task_score=0.8)

    # Register a single eval case on the candidate; note it has no validator key.
    eval_case = {
        "run_id": run_id,
        "task_id": run_id,
        "session_id": "eval",
        "task_text": "Summarize the release checklist.",
        "accepted_score": 0.8,
    }
    pipeline.learning_store.update_learning_candidate(
        candidate_id,
        evidence={"eval_cases": [eval_case]},
    )

    # Create the draft under evaluation and link it to the candidate.
    draft_fields = {
        "skill_name": "release-checklist",
        "proposed_content": "# Release\n\nRun tests.",
        "proposed_frontmatter": {"description": "release", "tools": []},
        "created_by": "test",
        "reason": "test",
    }
    draft = pipeline.draft_service.create_new_skill_draft(**draft_fields)
    pipeline.learning_store.update_learning_candidate(
        candidate_id,
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    # Both arms replay to the very same answer text.
    bundle = _bundle()
    runner = FakeReplayRunner(
        baseline_answer=identical_answer,
        candidate_answer=identical_answer,
    )
    evaluation = pipeline.evaluate_draft(
        candidate_id,
        draft.skill_name,
        draft.draft_id,
        provider_bundle=bundle,
        replay_runner=runner,
    )
    report = asyncio.run(evaluation)

    def _entry_for_run(entries):
        # First entry recorded for the eval case registered above.
        return next(entry for entry in entries if entry["run_id"] == run_id)

    case = _entry_for_run(report.case_reports)
    legacy_case = _entry_for_run(report.cases)

    # NOTE(review): 0.7 is the score expected from the pipeline for this
    # answer; how it is derived is not visible in this block.
    assert case["baseline_score"] == 0.7
    assert case["candidate_score"] == 0.7
    assert case["delta"] == 0.0
    assert legacy_case["delta"] == 0.0
|
||||
|
||||
|
||||
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
|
||||
pipeline = _pipeline(tmp_path)
|
||||
pipeline.learning_store.update_learning_candidate(
|
||||
|
||||
Reference in New Issue
Block a user