From 60605a74e0b3054bfac3c5b682221e623dbd4b6a Mon Sep 17 00:00:00 2001 From: steven_li Date: Fri, 22 May 2026 11:30:19 +0800 Subject: [PATCH] feat(team): preserve node run evidence --- .../beaver/coordinator/execution/scheduler.py | 2 +- .../backend/beaver/coordinator/local.py | 9 ++++++++ .../backend/beaver/coordinator/models.py | 3 +++ .../backend/tests/unit/test_agent_team_v1.py | 22 ++++++++++++++++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/app-instance/backend/beaver/coordinator/execution/scheduler.py b/app-instance/backend/beaver/coordinator/execution/scheduler.py index 234407a..61f554f 100644 --- a/app-instance/backend/beaver/coordinator/execution/scheduler.py +++ b/app-instance/backend/beaver/coordinator/execution/scheduler.py @@ -241,7 +241,7 @@ class TeamGraphScheduler: failed = [item for item in results if not item.success] if failed: failure_lines = [ - f"- {item.node_id}: {item.error or item.finish_reason}" + f"- {item.node_id}: {item.error or item.finish_reason} evidence={'yes' if item.evidence else 'no'}" for item in failed ] summary_parts.append("Failed nodes:\n" + "\n".join(failure_lines)) diff --git a/app-instance/backend/beaver/coordinator/local.py b/app-instance/backend/beaver/coordinator/local.py index df448bb..b1e3cc7 100644 --- a/app-instance/backend/beaver/coordinator/local.py +++ b/app-instance/backend/beaver/coordinator/local.py @@ -6,6 +6,7 @@ from uuid import uuid4 from beaver.engine import AgentLoop from beaver.engine.providers import ProviderBundle +from beaver.tasks.evidence import EvidenceBuilder from .models import DelegationEnvelope, NodeRunResult @@ -47,6 +48,13 @@ class LocalAgentRunner: pinned_skill_contexts=envelope.inherited_pinned_skill_contexts, allow_candidate_generation=allow_candidate_generation, ) + loaded = self.loop.boot() + evidence = EvidenceBuilder(loaded.session_manager).build_run_evidence( + result.session_id, + result.run_id, + result.output_text, + result.finish_reason, + ) success = result.finish_reason == "stop" return NodeRunResult( node_id=envelope.node_id or envelope.agent.name, @@ -56,6 +64,7 @@ class LocalAgentRunner: session_id=result.session_id, finish_reason=result.finish_reason, error=None if success else (result.output_text or result.finish_reason), + evidence=evidence, ) @staticmethod diff --git a/app-instance/backend/beaver/coordinator/models.py b/app-instance/backend/beaver/coordinator/models.py index 88ed554..f54f036 100644 --- a/app-instance/backend/beaver/coordinator/models.py +++ b/app-instance/backend/beaver/coordinator/models.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: from beaver.engine.context import SkillContext + from beaver.tasks.evidence import RunEvidence TeamStrategy = Literal[ @@ -116,6 +117,7 @@ class NodeRunResult: session_id: str | None = None finish_reason: str = "stop" error: str | None = None + evidence: "RunEvidence | None" = None def to_dict(self) -> dict[str, Any]: return { @@ -126,6 +128,7 @@ class NodeRunResult: "session_id": self.session_id, "finish_reason": self.finish_reason, "error": self.error, + "evidence": self.evidence.to_dict() if self.evidence is not None else None, } diff --git a/app-instance/backend/tests/unit/test_agent_team_v1.py b/app-instance/backend/tests/unit/test_agent_team_v1.py index bed4579..942cbc7 100644 --- a/app-instance/backend/tests/unit/test_agent_team_v1.py +++ b/app-instance/backend/tests/unit/test_agent_team_v1.py @@ -153,6 +153,26 @@ def test_local_agent_runner_uses_shared_loop_and_records_parent_task(tmp_path: P assert child_session["parent_session_id"] == "session-root" +def test_team_node_preserves_evidence_when_finish_reason_is_not_stop(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider([_response("partial evidence", finish_reason="max_tool_iterations")]) + envelope = DelegationEnvelope( + parent_task_id="task-parent", + parent_session_id="session-root", + parent_run_id="run-root", + agent=AgentDescriptor(name="researcher", role="research"), + task="research the requested topic", + node_id="research", + ) + + result = asyncio.run(LocalAgentRunner(loop).run(envelope, provider_bundle=_bundle(provider))) + + assert result.success is False + assert result.evidence is not None + assert result.evidence.output_text == "partial evidence" + assert result.evidence.finish_reason == "max_tool_iterations" + + def test_pinned_skill_is_injected_into_delegated_run(tmp_path: Path) -> None: _publish_skill( tmp_path, @@ -438,7 +458,7 @@ def test_team_summary_lists_only_failed_nodes_when_all_nodes_fail(tmp_path: Path ) assert result.success is False - assert result.summary == "Failed nodes:\n- one: one down\n- two: two down" + assert result.summary == "Failed nodes:\n- one: one down evidence=no\n- two: two down evidence=no" def test_graph_structure_errors_still_raise(tmp_path: Path) -> None: