Files
beaver_project/app-instance/backend/beaver/tasks/evidence.py
steven_li 520a21a027 feat(coordinator): 添加团队节点默认最大工具迭代次数配置
添加 DEFAULT_TEAM_NODE_MAX_TOOL_ITERATIONS 配置项以控制团队节点的最大工具迭代次数,
并修改 LocalAgentRunner 中的逻辑,当 envelope 中未指定该值时使用此默认值。

fix(runtime): 修复团队节点运行成功判断逻辑

更新运行成功判断条件,将 finish_reason 为 "max_tool_iterations_finalized" 的情况
视为运行失败,并添加对原始工具调用输出的检测,避免将其误判为成功完成。

feat(mcp): 添加团队工作流MCP工具类别支持

增加新的本地MCP工具类别 "team_workflow" 及其对应的工具创建功能,
为团队工作流提供本地工具支持。

refactor(engine): 调整AgentLoop最大工具迭代次数设置

将 AgentProfile 中的默认 max_tool_iterations 从 30 增加到 100,
同时移除 TaskExecutionPlanner 构造函数中的重复参数传递。

perf(mcp): 优化MCP连接管理避免重复连接

添加 mcp_connected 标志来跟踪MCP连接状态,确保 connect_all 只执行一次,
提高性能并避免不必要的重复连接。

refactor(skills): 移除技能团队模板相关功能

移除与技能团队模板相关的代码,包括解析、存储和处理逻辑,
简化技能记录结构和加载流程。

feat(process): 增强会话过程投影器功能

添加技能激活快照事件处理,改进团队运行完成消息显示,
并增强技能激活事件的时间戳记录功能。

refactor(tasks): 简化任务尝试编排器团队执行逻辑

移除团队执行相关代码,将所有任务统一按单步执行处理,
简化任务编排器的复杂度并提升执行效率。

fix(evidence): 修复节点证据评估中需求验证逻辑

更新节点证据评估逻辑,跳过自然语言证据需求的确定性验证,
只执行机器可读的需求验证,避免因自然语言需求导致的节点失败。
2026-06-26 16:36:29 +08:00

237 lines
8.0 KiB
Python

"""Structured evidence for task synthesis and validation."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass, field
from typing import Any
@dataclass(slots=True)
class ToolEvidence:
tool_name: str
tool_call_id: str | None
content: str
event_payload: dict[str, Any] = field(default_factory=dict)
url: str | None = None
title: str | None = None
created_at: str | None = None
def to_dict(self) -> dict[str, Any]:
return {
"tool_name": self.tool_name,
"tool_call_id": self.tool_call_id,
"content": self.content,
"event_payload": dict(self.event_payload),
"url": self.url,
"title": self.title,
"created_at": self.created_at,
}
@dataclass(slots=True)
class RunEvidence:
run_id: str
session_id: str
output_text: str
finish_reason: str
transcript: list[dict[str, Any]] = field(default_factory=list)
tool_results: list[ToolEvidence] = field(default_factory=list)
warnings: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
return {
"run_id": self.run_id,
"session_id": self.session_id,
"output_text": self.output_text,
"finish_reason": self.finish_reason,
"transcript": list(self.transcript),
"tool_results": [item.to_dict() for item in self.tool_results],
"warnings": list(self.warnings),
}
@dataclass(slots=True)
class TaskEvidencePacket:
task_id: str
attempt_index: int
main_run: RunEvidence | None
team_runs: list[RunEvidence] = field(default_factory=list)
team_node_results: list[Any] = field(default_factory=list)
final_output: str = ""
def to_dict(self) -> dict[str, Any]:
return {
"task_id": self.task_id,
"attempt_index": self.attempt_index,
"main_run": self.main_run.to_dict() if self.main_run else None,
"team_runs": [item.to_dict() for item in self.team_runs],
"team_node_results": [
item.to_dict() if hasattr(item, "to_dict") else dict(item)
for item in self.team_node_results
],
"final_output": self.final_output,
}
class EvidenceBuilder:
    """Build ``RunEvidence`` objects from the event records of a run."""

    def __init__(self, session_manager: Any) -> None:
        # The manager only needs to provide get_run_event_records(session_id, run_id).
        self.session_manager = session_manager

    def build_run_evidence(
        self,
        session_id: str,
        run_id: str,
        output_text: str,
        finish_reason: str,
    ) -> RunEvidence:
        """Collect transcript entries and tool results for one run.

        Every event record becomes a transcript entry. Records whose
        ``event_type`` is ``"tool_result_recorded"`` additionally become a
        ``ToolEvidence``. A non-empty ``finish_reason`` other than ``"stop"``
        is reported as a warning.
        """
        records = self.session_manager.get_run_event_records(session_id, run_id)
        transcript: list[dict[str, Any]] = []
        tool_results: list[ToolEvidence] = []

        for record in records:
            record_payload = dict(record.event_payload or {})
            entry = {
                "role": record.role,
                "event_type": record.event_type,
                "content": record.content,
                "tool_name": record.tool_name,
                "tool_call_id": record.tool_call_id,
                "finish_reason": record.finish_reason,
                "event_payload": record_payload,
            }
            transcript.append(entry)

            if record.event_type != "tool_result_recorded":
                continue

            # The transcript entry and the tool evidence share one payload copy.
            tool_results.append(
                ToolEvidence(
                    tool_name=record.tool_name or "tool",
                    tool_call_id=record.tool_call_id,
                    content=record.content or "",
                    event_payload=record_payload,
                    url=_optional_str(record_payload.get("url")),
                    title=_optional_str(record_payload.get("title")),
                    created_at=_optional_str(record_payload.get("created_at")),
                )
            )

        abnormal_finish = bool(finish_reason) and finish_reason != "stop"
        warnings: list[str] = (
            [f"finish_reason={finish_reason}"] if abnormal_finish else []
        )

        return RunEvidence(
            run_id=run_id,
            session_id=session_id,
            output_text=output_text,
            finish_reason=finish_reason,
            transcript=transcript,
            tool_results=tool_results,
            warnings=warnings,
        )
def evaluate_node_evidence(
evidence: RunEvidence,
required_evidence: list[str],
output_text: str,
) -> list[str]:
"""Evaluate v1 coarse-grained node evidence requirements."""
gaps: list[str] = []
successful_tools = [
item
for item in evidence.tool_results
if item.event_payload.get("success") is True
]
for raw_requirement in required_evidence:
requirement = str(raw_requirement).strip()
if not requirement:
continue
if requirement == "tool_result":
if not successful_tools:
_append_unique(gaps, "missing required evidence: tool_result")
elif requirement == "url":
if not any(_tool_evidence_contains_url(item) for item in successful_tools):
_append_unique(gaps, "missing required evidence: url")
elif requirement == "output":
if not output_text.strip():
_append_unique(gaps, "missing required evidence: output")
else:
# v1 only enforces the coarse machine-readable requirements above.
# Natural-language evidence requirements are preserved for later
# LLM-based validation and must not fail a node deterministically.
continue
return gaps
def _node_result_field(item: Any, name: str, default: Any) -> Any:
    """Read ``name`` from a node result that is either a mapping or an object."""
    # Imported locally so the module's import block does not need to change.
    from collections.abc import Mapping

    if isinstance(item, Mapping):
        return item.get(name, default)
    return getattr(item, name, default)


def render_task_evidence(packet: TaskEvidencePacket) -> str:
    """Render a task evidence packet as human-readable text.

    Sections are separated by blank lines, in this order: a header line, the
    final output, the main run (if present), the team runs (if any) and one
    bullet per team node result (if any).

    Team node results may be objects or mappings. Both are supported, so that
    mapping-shaped results are not rendered with empty fields.
    """
    sections = [
        f"Task evidence packet: task_id={packet.task_id} attempt={packet.attempt_index}",
        f"Final output:\n{packet.final_output}",
    ]
    if packet.main_run is not None:
        sections.append("Main run evidence:\n" + render_run_evidence(packet.main_run))
    if packet.team_runs:
        sections.append(
            "Team run evidence:\n"
            + "\n\n".join(render_run_evidence(item) for item in packet.team_runs)
        )
    if packet.team_node_results:
        lines = []
        for item in packet.team_node_results:
            node_id = _node_result_field(item, "node_id", "")
            success = _node_result_field(item, "success", False)
            finish_reason = _node_result_field(item, "finish_reason", "")
            # A missing or None error is shown as an empty string.
            error = _node_result_field(item, "error", "") or ""
            lines.append(
                f"- {node_id}: success={success} "
                f"finish_reason={finish_reason} error={error}"
            )
        sections.append("Team node results:\n" + "\n".join(lines))
    return "\n\n".join(section for section in sections if section.strip())
def render_run_evidence(evidence: RunEvidence) -> str:
    """Render one run's evidence as newline-separated text.

    The run id, session id and finish reason are always emitted. The output,
    warnings and tool results sections are added only when non-empty.
    """
    parts = [
        f"run_id={evidence.run_id}",
        f"session_id={evidence.session_id}",
        f"finish_reason={evidence.finish_reason}",
    ]

    if evidence.output_text:
        parts.append(f"output:\n{evidence.output_text}")

    if evidence.warnings:
        bullet_list = "\n".join(f"- {item}" for item in evidence.warnings)
        parts.append("warnings:\n" + bullet_list)

    if evidence.tool_results:
        rendered_tools = [_render_tool_evidence(tool) for tool in evidence.tool_results]
        parts.append("tool_results:\n" + "\n\n".join(rendered_tools))

    return "\n".join(parts)
def _render_tool_evidence(item: ToolEvidence) -> str:
    """Render one tool result: a header line, optional metadata lines, then the content."""
    rendered = [f"- tool={item.tool_name} call_id={item.tool_call_id or ''}"]
    # Each metadata line is emitted only when its value is truthy.
    optional_lines = (
        f"url={item.url}" if item.url else None,
        f"title={item.title}" if item.title else None,
        f"created_at={item.created_at}" if item.created_at else None,
    )
    rendered.extend(line for line in optional_lines if line is not None)
    rendered.append(item.content)
    return "\n".join(rendered)
def _optional_str(value: Any) -> str | None:
    """Return ``str(value)``, passing ``None`` through unchanged."""
    if value is None:
        return None
    return str(value)
# Matches an http(s) URL up to the next whitespace, angle bracket or quote.
_URL_RE = re.compile(r"https?://[^\s<>'\"]+", re.IGNORECASE)


def _tool_evidence_contains_url(item: ToolEvidence) -> bool:
    """Report whether the tool result mentions an http(s) URL anywhere.

    Searches the explicit ``url`` field, the textual content and the
    JSON-serialized event payload.
    """
    explicit_url = item.url or ""
    payload_text = json.dumps(item.event_payload, ensure_ascii=False, default=str)
    # All haystacks are built up front, before any of them is searched.
    for haystack in (explicit_url, item.content, payload_text):
        if _URL_RE.search(haystack) is not None:
            return True
    return False
def _append_unique(values: list[str], value: str) -> None:
    """Append ``value`` to ``values`` in place unless it is already present."""
    if value in values:
        return
    values.append(value)