Files
beaver_project/app-instance/backend/tests/unit/test_task_evidence.py
steven_li 520a21a027 feat(coordinator): 添加团队节点默认最大工具迭代次数配置
添加 DEFAULT_TEAM_NODE_MAX_TOOL_ITERATIONS 配置项以控制团队节点的最大工具迭代次数,
并修改 LocalAgentRunner 中的逻辑,在 envelope 中未指定该值时使用此默认值。

fix(runtime): 修复团队节点运行成功判断逻辑

更新运行成功判断条件,将 finish_reason 为 "max_tool_iterations_finalized" 的情况
视为运行失败,并添加对原始工具调用输出的检测,避免将其误判为成功完成。

feat(mcp): 添加团队工作流MCP工具类别支持

增加新的本地MCP工具类别 "team_workflow" 及其对应的工具创建功能,
为团队工作流提供本地工具支持。

refactor(engine): 调整AgentLoop最大工具迭代次数设置

将 AgentProfile 中的默认 max_tool_iterations 从 30 增加到 100,
同时移除 TaskExecutionPlanner 构造函数中的重复参数传递。

perf(mcp): 优化MCP连接管理避免重复连接

添加 mcp_connected 标志来跟踪MCP连接状态,确保 connect_all 只执行一次,
提高性能并避免不必要的重复连接。

refactor(skills): 移除技能团队模板相关功能

移除与技能团队模板相关的代码,包括解析、存储和处理逻辑,
简化技能记录结构和加载流程。

feat(process): 增强会话过程投影器功能

添加技能激活快照事件处理,改进团队运行完成消息显示,
并增强技能激活事件的时间戳记录功能。

refactor(tasks): 简化任务尝试编排器团队执行逻辑

移除团队执行相关代码,将所有任务统一按单步执行处理,
简化任务编排器的复杂度并提升执行效率。

fix(evidence): 修复节点证据评估中需求验证逻辑

更新节点证据评估逻辑,跳过自然语言证据需求的确定性验证,
只执行机器可读的需求验证,避免因自然语言需求导致的节点失败。
2026-06-26 16:36:29 +08:00

149 lines
4.5 KiB
Python

from __future__ import annotations
from pathlib import Path
from beaver.engine.session.manager import SessionManager
from beaver.tasks.evidence import (
EvidenceBuilder,
RunEvidence,
TaskEvidencePacket,
ToolEvidence,
evaluate_node_evidence,
render_task_evidence,
)
def _run_evidence(*, tool_results: list[ToolEvidence] | None = None) -> RunEvidence:
    """Build a minimal ``RunEvidence`` fixture for the evaluation tests.

    The run uses fixed identifiers, an empty ``output_text`` and a ``"stop"``
    finish reason, so only the attached tool results vary between tests.

    Args:
        tool_results: Tool evidence entries to attach. ``None`` (the default)
            or an empty list produces a run without tool results.

    Returns:
        A ``RunEvidence`` constructed with a fresh list of the given entries.
    """
    # Always hand over a new list object rather than the caller's own.
    attached = list(tool_results) if tool_results else []
    return RunEvidence(
        run_id="run-1",
        session_id="session-1",
        output_text="",
        finish_reason="stop",
        tool_results=attached,
    )
def test_evaluate_node_evidence_requires_successful_tool_result() -> None:
    """A tool result flagged ``success: False`` must not satisfy ``tool_result``."""
    failed_fetch = ToolEvidence(
        tool_name="web_fetch",
        tool_call_id="call-1",
        content="failed",
        event_payload={"success": False},
    )
    run = _run_evidence(tool_results=[failed_fetch])

    problems = evaluate_node_evidence(run, ["tool_result"], "done")

    assert problems == ["missing required evidence: tool_result"]
def test_evaluate_node_evidence_accepts_url_in_successful_tool_content() -> None:
    """A successful tool result whose content holds a URL meets both requirements."""
    successful_fetch = ToolEvidence(
        tool_name="web_fetch",
        tool_call_id="call-1",
        content="Source: https://example.test/report",
        event_payload={"success": True},
    )
    run = _run_evidence(tool_results=[successful_fetch])

    problems = evaluate_node_evidence(run, ["tool_result", "url"], "done")

    # An empty list means no requirement was reported as missing.
    assert problems == []
def test_evaluate_node_evidence_checks_output_and_ignores_natural_language_requirements() -> None:
    """Whitespace-only output fails ``output``; the free-text requirement adds no failure."""
    run = _run_evidence()
    # The second entry is a natural-language sentence, not a machine-readable key.
    requirements = ["output", "至少3个价格信息来源"]

    problems = evaluate_node_evidence(run, requirements, " ")

    assert problems == ["missing required evidence: output"]
def test_evidence_builder_preserves_full_tool_result(tmp_path: Path) -> None:
    """Long tool output survives evidence building and rendering untruncated.

    Records a user message, a tool result of more than 700 characters and a
    run-completed message in a session, then checks that the built evidence
    keeps the full tool content and that the rendered packet contains both the
    tail of that content and the URL from the tool event payload.

    Args:
        tmp_path: Directory handed to ``SessionManager`` for this test.
    """
    sessions = SessionManager(tmp_path)
    sid = "session-1"
    rid = "run-1"
    final_answer = "Manchester United won 3-2."
    # The marker sits at the very end so any truncation would drop it.
    tool_output = f"prefix {'x' * 700} MAN 3 FT 2 NFO"

    sessions.ensure_session(sid, source="test")
    sessions.append_message(
        sid,
        run_id=rid,
        role="user",
        event_type="user_message_added",
        content="score?",
    )
    sessions.append_message(
        sid,
        run_id=rid,
        role="tool",
        event_type="tool_result_recorded",
        event_payload={"success": True, "url": "https://example.test/match"},
        content=tool_output,
        tool_name="web_fetch",
        tool_call_id="call-1",
    )
    sessions.append_message(
        sid,
        run_id=rid,
        role="system",
        event_type="run_completed",
        event_payload={"finish_reason": "stop"},
        content=final_answer,
        finish_reason="stop",
        context_visible=False,
    )

    builder = EvidenceBuilder(sessions)
    run_evidence = builder.build_run_evidence(sid, rid, final_answer, "stop")
    packet = TaskEvidencePacket(
        task_id="task-1",
        attempt_index=1,
        main_run=run_evidence,
        team_runs=[],
        team_node_results=[],
        final_output=final_answer,
    )
    report = render_task_evidence(packet)

    assert run_evidence.tool_results[0].content == tool_output
    assert "MAN 3 FT 2 NFO" in report
    assert "https://example.test/match" in report
def test_render_task_evidence_includes_failed_team_run_tool_results() -> None:
    """Rendering keeps tool results from a team run that hit ``max_tool_iterations``.

    The packet has no main run and a single team run; the rendered text must
    contain the run's warning, the final output, the tool content and the tool
    result's ``created_at`` timestamp.
    """
    recorded_at = "2026-05-22T12:00:00Z"
    partial_fetch = ToolEvidence(
        tool_name="web_fetch",
        tool_call_id="call-team",
        content="Recovered partial source content.",
        event_payload={"success": True, "created_at": recorded_at},
        created_at=recorded_at,
    )
    team_run = RunEvidence(
        run_id="run-team",
        session_id="session-team",
        output_text="Tool loop stopped.",
        finish_reason="max_tool_iterations",
        transcript=[],
        tool_results=[partial_fetch],
        warnings=["finish_reason=max_tool_iterations"],
    )

    report = render_task_evidence(
        TaskEvidencePacket(
            task_id="task-1",
            attempt_index=2,
            main_run=None,
            team_runs=[team_run],
            team_node_results=[],
            final_output="partial answer",
        )
    )

    expected_fragments = (
        "finish_reason=max_tool_iterations",
        "partial answer",
        "Recovered partial source content.",
        "created_at=2026-05-22T12:00:00Z",
    )
    for fragment in expected_fragments:
        assert fragment in report