feat(tasks): add skill-templated task graph execution

This commit is contained in:
2026-06-23 10:22:58 +08:00
parent 6843d89b2c
commit 53b13e8eac
53 changed files with 4773 additions and 756 deletions

View File

@ -284,6 +284,9 @@ def _build_replay_case_reports(
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
"validator_notes": list(surrogate.get("notes") or []),
}
historical_accepted_score = _historical_accepted_score(case)
if historical_accepted_score is not None:
case_report["historical_accepted_score"] = historical_accepted_score
return case_report, {
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
@ -293,6 +296,7 @@ def _build_replay_case_reports(
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
**({"historical_accepted_score": historical_accepted_score} if historical_accepted_score is not None else {}),
}
@ -658,8 +662,11 @@ def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str)
if validator is not None:
return _ability_from_validator(validator, arm)
if not case.get("synthetic"):
score = _bounded_score(case.get("accepted_score"), default=0.75) if arm_name == "baseline" else _ability_from_output(arm)["final_score"]
return _ability_breakdown(score=score, source="user_feedback" if arm_name == "baseline" else "llm_judge")
result = _ability_from_output(arm, source="output_heuristic")
historical_accepted_score = _historical_accepted_score(case)
if historical_accepted_score is not None:
result["historical_accepted_score"] = historical_accepted_score
return result
return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])
@ -697,6 +704,12 @@ def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", note
return _ability_breakdown(score=score, source=source, notes=notes)
def _historical_accepted_score(case: dict[str, Any]) -> float | None:
    """Return the case's recorded ``accepted_score``, or ``None`` if not applicable.

    ``None`` is returned when any of the following holds:

    * the case has a truthy ``"synthetic"`` entry,
    * the case's ``"validator"`` entry is a ``dict``,
    * the case has no ``"accepted_score"`` key at all.

    Otherwise the stored value is passed through ``_bounded_score`` with a
    default of ``0.75``.

    NOTE(review): ``_bounded_score`` is defined elsewhere; presumably the
    default is used when the stored value is unusable (e.g. ``None``) —
    confirm against that helper.
    """
    # Guard clauses are checked in the same order as the original condition.
    if case.get("synthetic"):
        return None
    if isinstance(case.get("validator"), dict):
        return None
    if "accepted_score" not in case:
        return None

    recorded_score = case.get("accepted_score")
    return _bounded_score(recorded_score, default=0.75)
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
bounded = _bounded_score(score, default=0.0)
return {