feat(tasks): add skill-templated task graph execution

2026-06-23 10:22:58 +08:00
parent 6843d89b2c
commit 53b13e8eac
53 changed files with 4773 additions and 756 deletions
--- a/app-instance/backend/tests/unit/test_task_execution_planner.py
+++ b/app-instance/backend/tests/unit/test_task_execution_planner.py
@@ -3,15 +3,19 @@ from __future__ import annotations
 import asyncio
 from types import SimpleNamespace

+from beaver.engine.context import SkillContext
 from beaver.engine.providers.base import LLMProvider, LLMResponse
 from beaver.engine.providers.factory import ProviderBundle
-from beaver.tasks import TaskExecutionPlanner, TaskRecord
+from beaver.tasks import SkillResolutionReport, TaskExecutionPlanner, TaskRecord
+from beaver.tools.base import BaseTool, ToolContext, ToolResult, ToolSpec
+from beaver.tools.registry import ToolRegistry


 class PlannerProvider(LLMProvider):
    def __init__(self, response: str) -> None:
        super().__init__()
        self.response = response
+        self.calls: list[dict] = []

    async def chat(
        self,
@@ -21,6 +25,15 @@ class PlannerProvider(LLMProvider):
        max_tokens: int = 4096,
        temperature: float = 0.7,
    ) -> LLMResponse:
+        self.calls.append(
+            {
+                "messages": messages,
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "model": model,
+                "tools": tools,
+            }
+        )
        return LLMResponse(content=self.response, finish_reason="stop", provider_name="stub", model="stub-model")

    def get_default_model(self) -> str:
@@ -43,6 +56,28 @@ class HangingPlannerProvider(LLMProvider):
        return "stub-model"


+class SequencedPlannerProvider(PlannerProvider):
+    def __init__(self, responses: list[str]) -> None:
+        super().__init__(responses[0])
+        self.responses = list(responses)
+
+    async def chat(self, *args, **kwargs) -> LLMResponse:
+        self.response = self.responses.pop(0)
+        return await super().chat(*args, **kwargs)
+
+
+class StubTool(BaseTool):
+    def __init__(self, name: str) -> None:
+        self._spec = ToolSpec(name=name, description=name, input_schema={"type": "object"})
+
+    @property
+    def spec(self) -> ToolSpec:
+        return self._spec
+
+    async def invoke(self, arguments: dict, context: ToolContext) -> ToolResult:
+        raise AssertionError("Planner tests do not execute tools")
+
+
 def _task() -> TaskRecord:
    return TaskRecord(
        task_id="task-1",
@@ -59,12 +94,26 @@ def _task() -> TaskRecord:


 def _bundle(response: str) -> ProviderBundle:
+    provider = PlannerProvider(response)
    return ProviderBundle(
        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
-        main_provider=PlannerProvider(response),
+        main_provider=provider,
    )


+def _bundle_with_provider(provider: LLMProvider) -> ProviderBundle:
+    return ProviderBundle(
+        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
+        main_provider=provider,
+    )
+
+
+def _registry() -> ToolRegistry:
+    registry = ToolRegistry()
+    registry.register_many([StubTool("web_search"), StubTool("web_fetch"), StubTool("terminal")])
+    return registry
+
+
 def _hanging_bundle() -> ProviderBundle:
    return ProviderBundle(
        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
@@ -87,26 +136,55 @@ def test_planner_selects_single_mode() -> None:
    assert plan.reason == "main agent is enough"


+def test_planner_skips_llm_for_simple_task() -> None:
+    provider = PlannerProvider('{"mode":"team","reason":"should not be used"}')
+    bundle = ProviderBundle(
+        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
+        main_provider=provider,
+    )
+    task = _task()
+    task.description = "查询深圳天气"
+    task.goal = "查询深圳天气"
+
+    plan = asyncio.run(
+        TaskExecutionPlanner().plan(
+            task=task,
+            user_message="帮我查一下今天深圳天气",
+            attempt_index=1,
+            provider_bundle=bundle,
+        )
+    )
+
+    assert plan.mode == "single"
+    assert plan.graph is None
+    assert plan.reason == "planner_skipped_simple_task"
+    assert provider.calls == []
+
+
 def test_planner_builds_team_graph() -> None:
+    bundle = _bundle(
+        """
+        {
+          "mode": "team",
+          "reason": "needs parallel review",
+          "strategy": "dag",
+          "nodes": [
+            {"node_id": "research", "task": "research options"},
+            {"node_id": "review", "task": "review result", "depends_on": ["research"]}
+          ],
+          "final_synthesis_instruction": "merge the findings"
+        }
+        """
+    )
+    provider = bundle.main_provider
    plan = asyncio.run(
        TaskExecutionPlanner().plan(
            task=_task(),
            user_message="implement workflow",
            attempt_index=1,
-            provider_bundle=_bundle(
-                """
-                {
-                  "mode": "team",
-                  "reason": "needs parallel review",
-                  "strategy": "dag",
-                  "nodes": [
-                    {"node_id": "research", "task": "research options", "agent": {"name": "researcher"}},
-                    {"node_id": "review", "task": "review result", "agent": {"name": "reviewer"}, "depends_on": ["research"]}
-                  ],
-                  "final_synthesis_instruction": "merge the findings"
-                }
-                """
-            ),
+            provider_bundle=bundle,
+            skill_summaries=["docker-debug: Use docker logs before editing config."],
+            tool_hints=["terminal", "search_files"],
        )
    )

@@ -116,6 +194,12 @@ def test_planner_builds_team_graph() -> None:
    assert [node.node_id for node in plan.graph.nodes] == ["research", "review"]
    assert plan.graph.nodes[1].depends_on == ["research"]
    assert plan.final_synthesis_instruction == "merge the findings"
+    assert isinstance(provider, PlannerProvider)
+    prompt = provider.calls[0]["messages"][1]["content"]
+    assert "Activated skill summaries" in prompt
+    assert "docker-debug: Use docker logs before editing config." in prompt
+    assert "terminal" in prompt
+    assert "search_files" in prompt


 def test_planner_timeout_falls_back_to_single() -> None:
@@ -134,7 +218,7 @@ def test_planner_timeout_falls_back_to_single() -> None:
    assert "TimeoutError" in (plan.fallback_error or "")


-def test_planner_team_nodes_can_target_skills_without_agent_roles() -> None:
+def test_planner_team_nodes_use_task_as_internal_skill_query() -> None:
    plan = TaskExecutionPlanner().from_json(
        """
        {
@@ -144,9 +228,7 @@ def test_planner_team_nodes_can_target_skills_without_agent_roles() -> None:
          "nodes": [
            {
              "node_id": "api_review",
-              "task": "review API compatibility",
-              "skill_query": "API contract compatibility review",
-              "required_capabilities": ["schema compatibility"]
+              "task": "review API compatibility"
            }
          ]
        }
@@ -158,8 +240,77 @@ def test_planner_team_nodes_can_target_skills_without_agent_roles() -> None:
    node = plan.graph.nodes[0]
    assert node.agent.name == "api_review"
    assert node.agent.role == ""
-    assert node.agent.metadata["skill_query"] == "API contract compatibility review"
-    assert node.agent.metadata["required_capabilities"] == ["schema compatibility"]
+    assert node.agent.metadata["skill_query"] == "review API compatibility"
+    assert node.agent.metadata["required_capabilities"] == []
+
+
+def test_planner_accepts_use_skill_and_skill_query() -> None:
+    plan = TaskExecutionPlanner().from_json(
+        """
+        {
+          "mode": "team",
+          "strategy": "sequence",
+          "nodes": [
+            {
+              "node_id": "collect",
+              "task": "Collect official sources",
+              "use_skill": "official-source-research",
+              "skill_query": "official source verification"
+            }
+          ]
+        }
+        """
+    )
+
+    assert plan.is_team
+    assert plan.graph is not None
+    node = plan.graph.nodes[0]
+    assert node.agent.metadata["use_skill"] == "official-source-research"
+    assert node.agent.metadata["skill_query"] == "official source verification"
+    assert node.inherited_pinned_skills == []
+    assert node.allowed_tool_names is None
+    assert plan.planner_adaptation["node_skill_bindings"] == [
+        {
+            "node_id": "collect",
+            "use_skill": "official-source-research",
+            "skill_query": "official source verification",
+        }
+    ]
+
+
+def test_planner_defaults_skill_query_to_node_task_when_absent() -> None:
+    plan = TaskExecutionPlanner().from_json(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"extract","task":"Extract financial metrics","use_skill":"financial-extraction"}]}'
+    )
+
+    assert plan.is_team
+    assert plan.graph is not None
+    assert plan.graph.nodes[0].agent.metadata["skill_query"] == "Extract financial metrics"
+
+
+def test_planner_adaptation_records_unresolved_use_skill_fallback() -> None:
+    planner = TaskExecutionPlanner()
+    plan = planner.from_json(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"extract","task":"Extract metrics","use_skill":"missing-skill",'
+        '"skill_query":"financial extraction"}]}'
+    )
+    report = SkillResolutionReport(
+        node_id="extract",
+        skill_query="financial extraction",
+        requested_skill_name="missing-skill",
+        exact_binding_used=False,
+        warnings=["use_skill unresolved: missing-skill"],
+        reason="matched published skill",
+    )
+
+    planner._merge_skill_resolution_adaptation(plan, [report])
+
+    assert plan.planner_adaptation["warnings"] == ["use_skill unresolved: missing-skill"]
+    assert plan.planner_adaptation["node_skill_bindings"][0]["fallback_reason"] == (
+        "use_skill unresolved; matched published skill"
+    )


 def test_planner_invalid_outputs_fallback_to_single() -> None:
@@ -193,3 +344,216 @@ def test_planner_invalid_outputs_fallback_to_single() -> None:
    assert unknown_strategy.mode == "single"
    assert too_many_nodes.mode == "single"
    assert cyclic.mode == "single"
+
+
+def test_template_plan_builds_generic_worker_and_preserves_v1_contract_fields() -> None:
+    plan = TaskExecutionPlanner(tool_registry=_registry()).from_json(
+        """
+        {
+          "mode": "team",
+          "strategy": "dag",
+          "nodes": [
+            {
+              "node_id": "collect",
+              "task": "Collect official sources",
+              "requested_tools": ["web_search"],
+              "evidence_contract": {"entities": ["MGM", "Galaxy"]},
+              "block_downstream_on_partial": true
+            }
+          ],
+          "adaptation": {"template_used": true}
+        }
+        """
+    )
+
+    assert plan.is_team
+    assert plan.graph is not None
+    node = plan.graph.nodes[0]
+    assert node.agent.name == "collect"
+    assert node.agent.role == ""
+    assert node.agent.metadata["sub_agent_kind"] == "generic_skill_worker"
+    assert node.allowed_tool_names == ["web_search"]
+    assert node.evidence_contract == {"entities": ["MGM", "Galaxy"]}
+    assert node.block_downstream_on_partial is True
+    assert plan.planner_adaptation["template_used"] is True
+
+
+def test_unknown_tool_is_removed_and_warned() -> None:
+    plan = TaskExecutionPlanner(tool_registry=_registry()).from_json(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"collect","task":"Collect","requested_tools":["web_search","not_real"]}]}'
+    )
+
+    assert plan.is_team
+    assert plan.graph is not None
+    assert plan.graph.nodes[0].allowed_tool_names == ["web_search"]
+    assert "unknown tool removed: not_real" in plan.planner_adaptation["warnings"]
+
+
+def test_high_risk_tool_is_removed_without_failing_low_risk_plan() -> None:
+    plan = TaskExecutionPlanner(tool_registry=_registry()).from_json(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"collect","task":"Collect","requested_tools":["web_search","terminal"]}]}'
+    )
+
+    assert plan.is_team
+    assert plan.graph is not None
+    assert plan.graph.nodes[0].allowed_tool_names == ["web_search"]
+    assert "requires_high_risk_review: terminal" in plan.planner_adaptation["warnings"]
+
+
+def test_planner_rejects_agent_and_role_node_fields() -> None:
+    planner = TaskExecutionPlanner(tool_registry=_registry())
+
+    agent_plan = planner.from_json(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"collect","task":"Collect","agent":{"name":"researcher"}}]}'
+    )
+    role_plan = planner.from_json(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"collect","task":"Collect","role":"researcher"}]}'
+    )
+
+    assert agent_plan.mode == "single"
+    assert "agent" in (agent_plan.fallback_error or "")
+    assert role_plan.mode == "single"
+    assert "role" in (role_plan.fallback_error or "")
+
+
+def test_planner_records_primary_template_selection_and_ignored_templates() -> None:
+    primary = SkillContext(
+        name="financial-comparison",
+        version="v1",
+        content="Compare official financial disclosures.",
+        team_template={"version": 1, "nodes": [{"node_id": "collect", "task": "Collect"}]},
+    )
+    secondary = SkillContext(
+        name="chart-reporting",
+        version="v2",
+        content="Render chart-ready Markdown.",
+        team_template={"version": 1, "nodes": [{"node_id": "report", "task": "Report"}]},
+    )
+    provider = PlannerProvider(
+        '{"mode":"team","strategy":"sequence","nodes":['
+        '{"node_id":"collect","task":"Collect official sources"}],'
+        '"adaptation":{"template_used":true}}'
+    )
+
+    plan = asyncio.run(
+        TaskExecutionPlanner(tool_registry=_registry()).plan(
+            task=_task(),
+            user_message="compare financial workflow",
+            attempt_index=1,
+            provider_bundle=_bundle_with_provider(provider),
+            activated_skills=[primary, secondary],
+        )
+    )
+
+    assert plan.planner_adaptation == {
+        "template_used": True,
+        "selected_template": "financial-comparison",
+        "selection_reason": "first activated skill with a valid team template",
+        "ignored_templates": ["chart-reporting"],
+        "warnings": [],
+    }
+    prompt = provider.calls[0]["messages"][1]["content"]
+    assert '"skill_name": "financial-comparison"' in prompt
+    assert "Compare official financial disclosures." in prompt
+    assert "Render chart-ready Markdown." in prompt
+
+
+def test_malformed_planner_output_repairs_once_without_tools() -> None:
+    provider = SequencedPlannerProvider(
+        [
+            "not json",
+            '{"mode":"team","strategy":"sequence","nodes":[{"node_id":"collect","task":"Collect"}]}',
+        ]
+    )
+
+    plan = asyncio.run(
+        TaskExecutionPlanner(tool_registry=_registry()).plan(
+            task=_task(),
+            user_message="implement workflow",
+            attempt_index=1,
+            provider_bundle=_bundle_with_provider(provider),
+        )
+    )
+
+    assert plan.is_team
+    assert len(provider.calls) == 2
+    assert provider.calls[1]["tools"] is None
+    assert "Repair the invalid planner JSON" in provider.calls[1]["messages"][1]["content"]
+
+
+def test_failed_planner_repair_falls_back_to_single() -> None:
+    provider = SequencedPlannerProvider(["not json", "still not json"])
+
+    plan = asyncio.run(
+        TaskExecutionPlanner(tool_registry=_registry()).plan(
+            task=_task(),
+            user_message="implement workflow",
+            attempt_index=1,
+            provider_bundle=_bundle_with_provider(provider),
+        )
+    )
+
+    assert plan.mode == "single"
+    assert plan.reason == "planner_fallback_single"
+    assert len(provider.calls) == 2
+
+
+def test_finance_template_adapts_to_task_oriented_read_only_graph() -> None:
+    plan = TaskExecutionPlanner(tool_registry=_registry()).from_json(
+        """
+        {
+          "mode": "team",
+          "strategy": "dag",
+          "nodes": [
+            {
+              "node_id": "collect_official_sources",
+              "task": "Collect MGM and Galaxy official financial disclosures",
+              "requested_tools": ["web_search", "web_fetch"],
+              "required_evidence": ["tool_result", "url"]
+            },
+            {
+              "node_id": "extract_financial_metrics",
+              "task": "Extract comparable financial metrics from collected sources",
+              "depends_on": ["collect_official_sources"],
+              "requested_tools": ["web_fetch"],
+              "required_evidence": ["output"]
+            },
+            {
+              "node_id": "validate_metrics",
+              "task": "Validate metric units, periods, and source consistency",
+              "depends_on": ["extract_financial_metrics"],
+              "required_evidence": ["output"]
+            },
+            {
+              "node_id": "generate_chart_report",
+              "task": "Generate a Markdown comparison table and chart-ready data without claiming an image or file artifact",
+              "depends_on": ["validate_metrics"],
+              "requested_tools": [],
+              "required_evidence": ["output"]
+            }
+          ]
+        }
+        """
+    )
+
+    assert plan.is_team
+    assert plan.graph is not None
+    assert [node.node_id for node in plan.graph.nodes] == [
+        "collect_official_sources",
+        "extract_financial_metrics",
+        "validate_metrics",
+        "generate_chart_report",
+    ]
+    assert all(node.agent.role == "" for node in plan.graph.nodes)
+    assert not {"researcher", "writer", "reviewer", "analyst"}.intersection(
+        node.node_id for node in plan.graph.nodes
+    )
+    assert plan.graph.nodes[0].allowed_tool_names == ["web_search", "web_fetch"]
+    assert plan.graph.nodes[-1].allowed_tool_names == []
+    report_task = plan.graph.nodes[-1].task.lower()
+    assert "markdown" in report_task
+    assert "without claiming an image or file artifact" in report_task