feat(tasks): add skill-templated task graph execution

This commit is contained in:
2026-06-23 10:22:58 +08:00
parent 6843d89b2c
commit 53b13e8eac
53 changed files with 4773 additions and 756 deletions

View File

@@ -395,6 +395,52 @@ def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> No
assert report.synthetic_score_avg is not None
def test_replay_real_case_without_validator_uses_same_output_scoring_for_both_arms(tmp_path: Path) -> None:
    """Check that identical replay answers produce identical scores for both arms.

    The candidate is seeded with one eval case whose dict carries no validator
    entry, and the fake replay runner returns the same answer for the baseline
    and the candidate arm. The test then expects both per-arm scores to be 0.7
    and the delta to be 0.0, in ``report.case_reports`` as well as in
    ``report.cases``.
    """
    candidate_id = "candidate-1"
    target_run_id = "real-no-validator"

    def _find_case(entries):
        # Raises StopIteration if the run is missing, which fails the test.
        return next(entry for entry in entries if entry["run_id"] == target_run_id)

    pipeline = _pipeline(tmp_path, task_score=0.8)

    # Attach a single eval case to the candidate.
    eval_case = {
        "run_id": "real-no-validator",
        "task_id": "real-no-validator",
        "session_id": "eval",
        "task_text": "Summarize the release checklist.",
        "accepted_score": 0.8,
    }
    pipeline.learning_store.update_learning_candidate(
        candidate_id,
        evidence={"eval_cases": [eval_case]},
    )

    # Create the skill draft and link it back to the candidate.
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        candidate_id,
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    # Both arms answer with exactly the same text.
    bundle = _bundle()
    runner = FakeReplayRunner(
        baseline_answer="Release checklist summarized.",
        candidate_answer="Release checklist summarized.",
    )
    evaluation = pipeline.evaluate_draft(
        candidate_id,
        draft.skill_name,
        draft.draft_id,
        provider_bundle=bundle,
        replay_runner=runner,
    )
    report = asyncio.run(evaluation)

    case = _find_case(report.case_reports)
    legacy_case = _find_case(report.cases)

    assert case["baseline_score"] == 0.7
    assert case["candidate_score"] == 0.7
    assert case["delta"] == 0.0
    assert legacy_case["delta"] == 0.0
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
pipeline = _pipeline(tmp_path)
pipeline.learning_store.update_learning_candidate(