feat(engine): 优化智能体循环中的助手消息处理逻辑 - 在没有工具调用时才添加助手消息到上下文 - 确保工具调用响应正确添加到消息上下文中 - 修复了消息构建的条件逻辑 fix(cron): 改进定时任务调度的时间解析功能 - 添加正则表达式导入用于时间显示解析 - 实现从显示文本中提取毫秒间隔的功能 - 增强整数转换的安全性,避免类型错误 - 优化定时任务配置的解析逻辑 feat(outlook): 增强Outlook集成的功能和稳定性 - 将默认超时时间从10秒增加到180秒 - 为状态检查函数添加可选的验证参数 - 串行执行邮件概览获取操作而非并行 - 改进连接状态验证逻辑 feat(channel): 添加设备名称作为会话标识的选项 - 为终端WebSocket适配器添加新的配置选项 - 实现基于设备名称生成会话对等ID的功能 - 记录原始对等ID和设备名称的元数据 - 支持从设备名称创建会话对等ID feat(skills): 完善技能学习评估系统和进度跟踪 - 在应用启动时自动调度待评估的技能草稿 - 为技能评估工作创建独立的循环工厂 - 实现异步技能评估任务的取消和清理机制 - 添加技能评估进度报告和状态跟踪功能 - 扩展会话列表API以包含更多详细信息 - 防止对不存在的会话进行操作 - 优化技能草稿提交和评估的业务逻辑 perf(skills): 提升技能评估的并发性能 - 实现并行技能案例评估以提高效率 - 添加最大并行案例数的环境变量控制 - 实现实时评估进度更新和回调机制 - 优化评估过程中的资源管理和同步 refactor(services): 创建隔离的智能体循环实例 - 添加创建独立智能体循环的工厂方法 - 确保新循环继承运行时服务配置 - 支持技能评估等需要隔离环境的场景 ```
249 lines
9.3 KiB
Python
249 lines
9.3 KiB
Python
"""Replay execution helpers for skill draft evaluation."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from time import perf_counter
|
|
from typing import Any, Callable, Literal
|
|
from uuid import uuid4
|
|
|
|
from beaver.tools.base import ToolContext, ToolResult, ToolSpec
|
|
from beaver.tools.registry.tool_registry import ToolRegistry
|
|
from beaver.tools.runtime.executor import ToolExecutor
|
|
|
|
# The three ways a tool call can be handled during replay; this is the value
# returned by classify_tool_mode and stored under "mode" in each trace.
ToolExecutionMode = Literal["executed", "surrogate", "blocked"]
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class ReplayToolPolicy:
|
|
safe_toolsets: set[str] = field(default_factory=lambda: {"filesystem", "user_files", "core", "web", "search"})
|
|
surrogate_transports: set[str] = field(default_factory=lambda: {"mcp", "connector"})
|
|
destructive_terms: tuple[str, ...] = (
|
|
"delete",
|
|
"remove",
|
|
"destroy",
|
|
"revoke",
|
|
"permission",
|
|
"credential",
|
|
"payment",
|
|
"pay",
|
|
)
|
|
external_write_terms: tuple[str, ...] = (
|
|
"send",
|
|
"post",
|
|
"publish",
|
|
"create",
|
|
"update",
|
|
"invite",
|
|
"reply",
|
|
"forward",
|
|
)
|
|
|
|
|
|
class ReplayToolExecutor:
    """Wrap a ``ToolExecutor`` so every call is classified, traced and gated.

    Each call is classified with ``classify_tool_mode``:

    * ``"executed"``  - forwarded to the wrapped executor.
    * ``"surrogate"`` - not run; a successful placeholder result is returned.
    * ``"blocked"``   - not run; a failed result is returned.

    One trace dict per call is appended to ``self.traces`` in call order.
    """

    def __init__(
        self,
        inner: ToolExecutor,
        *,
        registry: ToolRegistry,
        policy: ReplayToolPolicy | None = None,
    ) -> None:
        """Store the wrapped executor, the registry used for spec lookup and the policy.

        Args:
            inner: Executor that actually runs tools classified as "executed".
            registry: Registry queried with ``registry.get(tool_name)``.
            policy: Classification rules; a default ``ReplayToolPolicy`` when omitted.
        """
        self.inner = inner
        self.registry = registry
        self.policy = policy or ReplayToolPolicy()
        self.traces: list[dict[str, Any]] = []

    def _record(
        self,
        trace: dict[str, Any],
        started_at: float,
        *,
        success: Any,
        error: Any,
        content: Any,
    ) -> None:
        """Attach the outcome and elapsed milliseconds to ``trace`` and store it."""
        trace["result"] = {"success": success, "error": error, "content": content}
        trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
        self.traces.append(trace)

    async def execute(
        self,
        tool_name: str,
        arguments: dict[str, Any] | None,
        *,
        context: ToolContext | None = None,
    ) -> ToolResult:
        """Classify a tool call, run or stub it accordingly, and record a trace.

        Args:
            tool_name: Name looked up in the registry.
            arguments: Call arguments; ``None`` is treated as an empty dict.
            context: Passed through to the wrapped executor for executed calls.

        Returns:
            The wrapped executor's result for executed calls, otherwise a
            synthetic ``ToolResult`` whose ``raw_output`` is the trace dict.

        Raises:
            Whatever the wrapped executor raises; a failed trace is recorded
            first so the attempted call is not lost from ``self.traces``.
        """
        started_at = perf_counter()
        tool = self.registry.get(tool_name)
        if tool is not None:
            spec = tool.spec
        else:
            # Unregistered tools get a placeholder spec in the "unknown" toolset
            # so they still go through classification and tracing.
            spec = ToolSpec(
                name=tool_name,
                description="unregistered tool",
                input_schema={"type": "object", "properties": {}},
                toolset="unknown",
            )
        mode = classify_tool_mode(spec, self.policy)
        trace: dict[str, Any] = {
            "trace_id": uuid4().hex,
            "tool_name": tool_name,
            "mode": mode,
            "arguments": dict(arguments or {}),
            "schema": dict(spec.input_schema),
            "toolset": spec.toolset,
            "metadata": dict(spec.metadata),
            "classification_reason": _classification_reason(spec, mode),
        }

        if mode == "executed":
            try:
                result = await self.inner.execute(tool_name, arguments or {}, context=context)
            except Exception as exc:
                # Keep a record of the attempted call before propagating the failure.
                self._record(
                    trace,
                    started_at,
                    success=False,
                    error=f"{type(exc).__name__}: {exc}",
                    content="",
                )
                raise
            # Only the first 2000 elements of the content are kept in the trace.
            self._record(
                trace,
                started_at,
                success=result.success,
                error=result.error,
                content=result.content[:2000],
            )
            return result

        if mode == "surrogate":
            success, error, content = True, "replay_surrogate", "Tool call recorded for surrogate evaluation."
        else:
            # Anything that is neither executed nor surrogate is refused.
            success, error, content = False, "replay_blocked", "Tool call blocked by replay policy."
        self._record(trace, started_at, success=success, error=error, content=content)
        return ToolResult(
            success=success,
            content=content,
            tool_name=tool_name,
            error=error,
            raw_output=trace,
        )

    async def execute_tool_call(self, tool_call: Any, *, context: ToolContext | None = None) -> ToolResult:
        """Split ``tool_call`` via ``ToolExecutor._normalize_tool_call`` and delegate to ``execute``."""
        tool_name, arguments = ToolExecutor._normalize_tool_call(tool_call)
        return await self.execute(tool_name, arguments, context=context)
|
|
|
|
|
|
def classify_tool_mode(spec: ToolSpec, policy: ReplayToolPolicy | None = None) -> ToolExecutionMode:
    """Decide whether a tool is executed, stubbed, or blocked during replay.

    Rules are applied in this order, on lower-cased name / toolset / metadata:

    1. Name contains a destructive term -> ``"blocked"``.
    2. Toolset is in the policy's safe toolsets -> ``"executed"``.
    3. Tool is external (``transport`` metadata in the policy's surrogate
       transports, or toolset is mcp/connector/external): ``"surrogate"``
       when the name contains an external-write term, else ``"executed"``.
    4. Everything else -> ``"surrogate"``.

    Args:
        spec: Tool specification; ``name``, ``toolset`` and ``metadata`` are read.
        policy: Rules to apply; a default ``ReplayToolPolicy`` when omitted.
    """
    rules = policy or ReplayToolPolicy()
    tool_name = spec.name.lower()
    toolset_name = spec.toolset.lower()
    lowered_metadata = {str(k).lower(): str(v).lower() for k, v in spec.metadata.items()}

    def name_mentions(terms: tuple[str, ...]) -> bool:
        # Plain substring match against the lower-cased tool name.
        return any(term in tool_name for term in terms)

    if name_mentions(rules.destructive_terms):
        return "blocked"
    if toolset_name in rules.safe_toolsets:
        return "executed"

    is_external = (
        lowered_metadata.get("transport") in rules.surrogate_transports
        or toolset_name in {"mcp", "connector", "external"}
    )
    if not is_external:
        return "surrogate"
    return "surrogate" if name_mentions(rules.external_write_terms) else "executed"
|
|
|
|
|
|
def _classification_reason(spec: ToolSpec, mode: ToolExecutionMode) -> str:
|
|
return f"{spec.name} classified as {mode} from toolset={spec.toolset} metadata={spec.metadata}"
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class ReplayArmRequest:
|
|
case_id: str
|
|
arm: str
|
|
task_text: str
|
|
pinned_skill_names: list[str] = field(default_factory=list)
|
|
pinned_skill_contexts: list[Any] = field(default_factory=list)
|
|
provider_bundle: Any | None = None
|
|
model_settings: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
class ReplayRunner:
    """Run one replay arm of a skill evaluation through an agent loop.

    Tool calls made during the run go through a ``ReplayToolExecutor`` so they
    are classified by the replay policy and traced.
    """

    def __init__(
        self,
        *,
        agent_loop: Any,
        policy: ReplayToolPolicy | None = None,
        isolated_loop_factory: Callable[[], Any] | None = None,
    ) -> None:
        """Store the shared loop, the replay policy and the optional loop factory.

        Args:
            agent_loop: Loop used when no ``isolated_loop_factory`` is given.
            policy: Replay rules; a default ``ReplayToolPolicy`` when omitted.
            isolated_loop_factory: When set, called once per ``run_arm`` to
                create a separate loop that is closed after the run.
        """
        self.agent_loop = agent_loop
        self.policy = policy or ReplayToolPolicy()
        self.isolated_loop_factory = isolated_loop_factory

    @staticmethod
    async def _close_mcp_manager(loaded: Any) -> None:
        """Await ``loaded.mcp_manager.close()`` if present, then drop it from ``loaded.closeables``."""
        mcp_manager = getattr(loaded, "mcp_manager", None)
        if mcp_manager is None or not hasattr(mcp_manager, "close"):
            return
        try:
            await mcp_manager.close()
        finally:
            # The manager has been handled here (successfully or not), so remove
            # its entry from the closeables list.
            # NOTE(review): presumably target_loop.close() walks this list and
            # would otherwise close the manager a second time - confirm.
            closeables = getattr(loaded, "closeables", None)
            if isinstance(closeables, list):
                loaded.closeables = [
                    (name, close_fn)
                    for name, close_fn in closeables
                    if name != "mcp_manager"
                ]

    async def run_arm(self, request: ReplayArmRequest) -> dict[str, Any]:
        """Execute ``request.task_text`` once and return the run summary.

        The task is submitted with ``process_direct``; if that raises the
        "disabled while run() is active" ``RuntimeError`` and the loop offers
        ``submit_direct``, that method is used instead. Any other
        ``RuntimeError`` is re-raised.

        Returns:
            A dict with ``case_id``, ``arm``, ``session_id``, ``run_id``,
            ``task_text``, ``finish_reason``, ``final_answer``, ``tool_calls``
            (all recorded traces), ``artifacts`` (always empty) and
            ``side_effects`` (surrogate/blocked calls only).
        """
        if self.isolated_loop_factory is not None:
            target_loop = self.isolated_loop_factory()
        else:
            target_loop = self.agent_loop
        loaded = target_loop.boot()
        replay_executor = ReplayToolExecutor(
            loaded.tool_executor,
            registry=loaded.tool_registry,
            policy=self.policy,
        )
        settings = request.model_settings
        direct_kwargs = {
            "provider_bundle": request.provider_bundle,
            "include_skill_assembly": False,
            "include_tools": True,
            "pinned_skill_names": request.pinned_skill_names,
            "pinned_skill_contexts": request.pinned_skill_contexts,
            # Missing or falsy settings fall back to 4 iterations / temperature 0.0.
            "max_tool_iterations": int(settings.get("max_tool_iterations") or 4),
            "temperature": float(settings.get("temperature") or 0.0),
            "source": "skill_replay_eval",
            "tool_executor_override": replay_executor,
        }
        try:
            try:
                result = await target_loop.process_direct(request.task_text, **direct_kwargs)
            except RuntimeError as exc:
                if not _is_process_direct_disabled_while_running(exc) or not hasattr(target_loop, "submit_direct"):
                    raise
                result = await target_loop.submit_direct(request.task_text, **direct_kwargs)

            session_manager = getattr(loaded, "session_manager", None)
            if session_manager is not None and hasattr(session_manager, "end_session"):
                session_manager.end_session(result.session_id, "evaluation_complete")

            return {
                "case_id": request.case_id,
                "arm": request.arm,
                "session_id": result.session_id,
                "run_id": result.run_id,
                "task_text": request.task_text,
                "finish_reason": result.finish_reason,
                "final_answer": result.output_text,
                "tool_calls": list(replay_executor.traces),
                "artifacts": [],
                "side_effects": _side_effects_from_traces(replay_executor.traces),
            }
        finally:
            # Only a loop created by the factory is torn down; the shared
            # agent_loop is left as it was.
            if target_loop is not self.agent_loop and hasattr(target_loop, "close"):
                try:
                    await self._close_mcp_manager(loaded)
                finally:
                    # Close the isolated loop even when closing the MCP manager
                    # raised, so the loop is not leaked; the error still propagates.
                    target_loop.close()
|
|
|
|
|
|
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
|
|
message = str(exc)
|
|
return (
|
|
"AgentLoop.process_direct() is disabled while run() is active" in message
|
|
and "submit tasks via submit_direct() instead" in message
|
|
)
|
|
|
|
|
|
def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
effects: list[dict[str, Any]] = []
|
|
for trace in traces:
|
|
if trace.get("mode") in {"surrogate", "blocked"}:
|
|
effects.append(
|
|
{
|
|
"tool_name": trace.get("tool_name"),
|
|
"mode": trace.get("mode"),
|
|
"arguments": trace.get("arguments"),
|
|
"classification_reason": trace.get("classification_reason"),
|
|
}
|
|
)
|
|
return effects
|