feat(tasks): add skill-templated task graph execution

2026-06-23 10:22:58 +08:00
parent 6843d89b2c
commit 53b13e8eac
53 changed files with 4773 additions and 756 deletions
--- a/app-instance/backend/beaver/skills/assembler/task_assembler.py
+++ b/app-instance/backend/beaver/skills/assembler/task_assembler.py
@ -83,6 +83,12 @@ class SkillAssembler:
            return SkillAssemblyResult()
        llm_interactions: list[dict[str, Any]] = []

+        if len(candidates) == 1:
+            return SkillAssemblyResult(
+                activated_skills=self._activate_skill_contexts([candidates[0]["name"]]),
+                llm_interactions=llm_interactions,
+            )
+
        if len(candidates) <= self.max_detailed_candidates:
            shortlisted_names = [item["name"] for item in candidates]
        else:
@ -115,6 +121,10 @@ class SkillAssembler:
        if not selected_names:
            return SkillAssemblyResult(llm_interactions=llm_interactions)

+        activated_skills = self._activate_skill_contexts(selected_names)
+        return SkillAssemblyResult(activated_skills=activated_skills, llm_interactions=llm_interactions)
+
+    def _activate_skill_contexts(self, selected_names: list[str]) -> list[SkillContext]:
        activated_skills: list[SkillContext] = []
        for name in selected_names:
            record = self.loader.get_skill_record(name)
@ -130,10 +140,11 @@ class SkillAssembler:
                    content_hash=record.content_hash or "" if record is not None else "",
                    activation_reason="llm_selected",
                    tool_hints=list(record.tool_hints) if record is not None else [],
+                    team_template=getattr(record, "team_template", None) if record is not None else None,
+                    team_template_warnings=list(getattr(record, "team_template_warnings", [])) if record is not None else [],
                )
            )
-
-        return SkillAssemblyResult(activated_skills=activated_skills, llm_interactions=llm_interactions)
+        return activated_skills

    async def _select_skill_names(
        self,
--- a/app-instance/backend/beaver/skills/catalog/loader.py
+++ b/app-instance/backend/beaver/skills/catalog/loader.py
@ -28,6 +28,7 @@ from .utils import (
    check_requirements,
    escape_xml,
    extract_required_tool_names,
+    extract_skill_team_template,
    get_missing_requirements,
    parse_frontmatter,
    parse_skill_metadata_blob,
@ -49,6 +50,8 @@ class SkillRecord:
    tool_hints: list[str] = field(default_factory=list)
    frontmatter: dict[str, Any] = field(default_factory=dict)
    description: str = ""
+    team_template: dict[str, Any] | None = None
+    team_template_warnings: list[str] = field(default_factory=list)


 class SkillsLoader:
@ -113,6 +116,7 @@ class SkillsLoader:
                    continue
                normalized_frontmatter = dict(frontmatter)
                meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
+                template_result = extract_skill_team_template(body)
                record = SkillRecord(
                    name=name,
                    path=skill_file,
@ -127,6 +131,8 @@ class SkillsLoader:
                    ),
                    frontmatter=normalized_frontmatter,
                    description=str(frontmatter.get("description") or summarize_body(body) or name),
+                    team_template=template_result.template,
+                    team_template_warnings=template_result.warnings,
                )
                if filter_unavailable and not self._record_available(record):
                    continue
@ -146,6 +152,7 @@ class SkillsLoader:
            else:
                path = self.workspace_skills / name / "versions" / loaded.version.version / "SKILL.md"
            _frontmatter, body = parse_frontmatter(loaded.content)
+            template_result = extract_skill_team_template(body)
            record = SkillRecord(
                name=name,
                path=path,
@ -160,6 +167,8 @@ class SkillsLoader:
                ),
                frontmatter=dict(loaded.version.frontmatter),
                description=str(loaded.version.frontmatter.get("description") or loaded.version.summary or name),
+                team_template=template_result.template,
+                team_template_warnings=template_result.warnings,
            )
            if filter_unavailable and not self._record_available(record):
                continue
--- a/app-instance/backend/beaver/skills/catalog/utils.py
+++ b/app-instance/backend/beaver/skills/catalog/utils.py
@ -17,6 +17,7 @@ import json
 import os
 import re
 import shutil
+from dataclasses import dataclass, field
 from typing import Any


@ -84,6 +85,27 @@ def strip_frontmatter(content: str) -> str:
    return body


+@dataclass(slots=True)
+class SkillTeamTemplateParseResult:
+    """Result of looking for a team template inside a skill body.
+
+    As populated by ``extract_skill_team_template``: ``template`` is the
+    decoded JSON object when exactly one valid block was found and ``None``
+    otherwise; ``warnings`` carries the reason a block was rejected and is
+    empty both on success and when the body contains no template block.
+    """
+
+    template: dict[str, Any] | None = None
+    warnings: list[str] = field(default_factory=list)
+
+
+def extract_skill_team_template(body: str) -> SkillTeamTemplateParseResult:
+    """Extract the JSON team template embedded in a skill body.
+
+    Searches ``body`` for fenced code blocks tagged ``beaver-team-template``
+    and decodes the block content as JSON.
+
+    Args:
+        body: Skill text to scan (callers pass the text after frontmatter
+            parsing).
+
+    Returns:
+        A ``SkillTeamTemplateParseResult``. With no matching block, both
+        fields keep their defaults. With more than one block, invalid JSON,
+        or a payload that is not an object with a list-valued ``"nodes"``,
+        ``template`` stays ``None`` and ``warnings`` holds one message.
+        Otherwise ``template`` is the decoded dict and ``warnings`` is empty.
+    """
+    # DOTALL lets the lazy group span multiple lines up to the first closing fence.
+    matches = re.findall(r"```beaver-team-template\s*\n(.*?)\n```", body, re.DOTALL)
+    if not matches:
+        return SkillTeamTemplateParseResult()
+    # More than one block is ambiguous, so none of them is used.
+    if len(matches) != 1:
+        return SkillTeamTemplateParseResult(warnings=["skill defines multiple team templates"])
+    try:
+        template = json.loads(matches[0])
+    except json.JSONDecodeError:
+        return SkillTeamTemplateParseResult(warnings=["team template JSON is invalid"])
+    # NOTE: a missing "nodes" key passes this check because the lookup
+    # defaults to an empty list; only a present, non-list value is rejected.
+    if not isinstance(template, dict) or not isinstance(template.get("nodes", []), list):
+        return SkillTeamTemplateParseResult(warnings=["team template must be an object with a nodes list"])
+    return SkillTeamTemplateParseResult(template=template)
+
+
 def extract_required_tool_names(body: str) -> list[str]:
    """从 canonical skill 正文的 `## Required Tools` 段落提取工具名。

--- a/app-instance/backend/beaver/skills/learning/eval.py
+++ b/app-instance/backend/beaver/skills/learning/eval.py
@ -284,6 +284,9 @@ def _build_replay_case_reports(
        "side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
        "validator_notes": list(surrogate.get("notes") or []),
    }
+    historical_accepted_score = _historical_accepted_score(case)
+    if historical_accepted_score is not None:
+        case_report["historical_accepted_score"] = historical_accepted_score
    return case_report, {
        "run_id": case["run_id"],
        "session_id": case.get("session_id") or "",
@ -293,6 +296,7 @@ def _build_replay_case_reports(
        "baseline_score": baseline_score,
        "candidate_score": candidate_score,
        "delta": round(candidate_score - baseline_score, 4),
+        **({"historical_accepted_score": historical_accepted_score} if historical_accepted_score is not None else {}),
    }


@ -658,8 +662,11 @@ def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str)
    if validator is not None:
        return _ability_from_validator(validator, arm)
    if not case.get("synthetic"):
-        score = _bounded_score(case.get("accepted_score"), default=0.75) if arm_name == "baseline" else _ability_from_output(arm)["final_score"]
-        return _ability_breakdown(score=score, source="user_feedback" if arm_name == "baseline" else "llm_judge")
+        result = _ability_from_output(arm, source="output_heuristic")
+        historical_accepted_score = _historical_accepted_score(case)
+        if historical_accepted_score is not None:
+            result["historical_accepted_score"] = historical_accepted_score
+        return result
    return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])


@ -697,6 +704,12 @@ def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", note
    return _ability_breakdown(score=score, source=source, notes=notes)


+def _historical_accepted_score(case: dict[str, Any]) -> float | None:
+    """Return the case's recorded ``accepted_score``, if one applies.
+
+    Returns ``None`` when the case is flagged ``synthetic``, when its
+    ``validator`` entry is a dict, or when it has no ``accepted_score`` key.
+    Otherwise the stored value is passed through ``_bounded_score`` with a
+    default of 0.75.
+    """
+    if case.get("synthetic") or isinstance(case.get("validator"), dict) or "accepted_score" not in case:
+        return None
+    # NOTE(review): the key-presence check above still lets an explicit
+    # ``accepted_score: None`` through; presumably ``_bounded_score`` then
+    # falls back to the 0.75 default — confirm that is the intended report.
+    return _bounded_score(case.get("accepted_score"), default=0.75)
+
+
 def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    bounded = _bounded_score(score, default=0.0)
    return {