feat(engine): 优化智能体循环中的助手消息处理逻辑

- 在没有工具调用时才添加助手消息到上下文
- 确保工具调用响应正确添加到消息上下文中
- 修复了消息构建的条件逻辑

fix(cron): 改进定时任务调度的时间解析功能

- 添加正则表达式导入用于时间显示解析
- 实现从显示文本中提取毫秒间隔的功能
- 增强整数转换的安全性,避免类型错误
- 优化定时任务配置的解析逻辑

feat(outlook): 增强Outlook集成的功能和稳定性

- 将默认超时时间从10秒增加到180秒
- 为状态检查函数添加可选的验证参数
- 串行执行邮件概览获取操作而非并行
- 改进连接状态验证逻辑

feat(channel): 添加设备名称作为会话标识的选项

- 为终端WebSocket适配器添加新的配置选项
- 实现基于设备名称生成会话对等ID的功能
- 记录原始对等ID和设备名称的元数据
- 支持从设备名称创建会话对等ID

feat(skills): 完善技能学习评估系统和进度跟踪

- 在应用启动时自动调度待评估的技能草稿
- 为技能评估工作创建独立的循环工厂
- 实现异步技能评估任务的取消和清理机制
- 添加技能评估进度报告和状态跟踪功能
- 扩展会话列表API以包含更多详细信息
- 防止对不存在的会话进行操作
- 优化技能草稿提交和评估的业务逻辑

perf(skills): 提升技能评估的并发性能

- 实现并行技能案例评估以提高效率
- 添加最大并行案例数的环境变量控制
- 实现实时评估进度更新和回调机制
- 优化评估过程中的资源管理和同步

refactor(services): 创建隔离的智能体循环实例

- 添加创建独立智能体循环的工厂方法
- 确保新循环继承运行时服务配置
- 支持技能评估等需要隔离环境的场景
```
This commit is contained in:
2026-06-15 14:48:16 +08:00
parent 8aeb97a5fc
commit 4b0bf65ace
53 changed files with 4328 additions and 292 deletions

View File

@ -749,14 +749,12 @@ class AgentLoop:
model=final_model,
user_id=user_id,
)
context_builder.add_assistant_message(
messages,
content=response.content,
tool_calls=assistant_tool_calls or None,
reasoning_content=response.reasoning_content,
)
if not response.has_tool_calls:
context_builder.add_assistant_message(
messages,
content=response.content,
reasoning_content=response.reasoning_content,
)
final_text = response.content or ""
if self._looks_like_raw_tool_call(final_text):
final_text = RAW_TOOL_CALL_FALLBACK
@ -795,6 +793,12 @@ class AgentLoop:
)
break
context_builder.add_assistant_message(
messages,
content=response.content,
tool_calls=assistant_tool_calls or None,
reasoning_content=response.reasoning_content,
)
iterations += 1
for tool_call in response.tool_calls:
result = await effective_tool_executor.execute_tool_call(tool_call, context=tool_context)

View File

@ -6,6 +6,7 @@ normal Task instead of a detached agent turn.
from __future__ import annotations
import re
from dataclasses import dataclass, field
from typing import Any, Literal
from uuid import uuid4
@ -37,13 +38,18 @@ class CronSchedule:
@classmethod
def from_dict(cls, payload: dict[str, Any]) -> "CronSchedule":
kind = str(payload.get("kind") or "every")
display = _optional_str(payload.get("display"))
every_ms = _optional_int(payload.get("every_ms") or payload.get("everyMs"))
if kind == "every" and every_ms is None:
every_ms = _every_ms_from_display(display)
return cls(
kind=str(payload.get("kind") or "every"), # type: ignore[arg-type]
kind=kind, # type: ignore[arg-type]
at_ms=_optional_int(payload.get("at_ms") or payload.get("atMs")),
every_ms=_optional_int(payload.get("every_ms") or payload.get("everyMs")),
every_ms=every_ms,
expr=_optional_str(payload.get("expr")),
tz=_optional_str(payload.get("tz")),
display=_optional_str(payload.get("display")),
display=display,
)
@ -250,6 +256,17 @@ def _optional_str(value: Any) -> str | None:
def _optional_int(value: Any) -> int | None:
if value in (None, ""):
return None
try:
return int(value)
except (TypeError, ValueError):
return None
def _every_ms_from_display(display: str | None) -> int | None:
match = re.fullmatch(r"every\s+(\d+)s", (display or "").strip(), re.IGNORECASE)
if match is None:
return None
return int(match.group(1)) * 1000
def _payload_mode(value: Any, *, default: CronPayloadMode = "notification") -> CronPayloadMode:
@ -259,7 +276,3 @@ def _payload_mode(value: Any, *, default: CronPayloadMode = "notification") -> C
if cleaned == "task":
return "task"
return "notification"
try:
return int(value)
except (TypeError, ValueError):
return None

View File

@ -73,9 +73,9 @@ OUTLOOK_TOOL_NAMES = [
def _call_timeout_seconds() -> float:
raw = os.getenv("BEAVER_OUTLOOK_MCP_CALL_TIMEOUT_SECONDS", "").strip()
try:
return max(1.0, float(raw)) if raw else 10.0
return max(1.0, float(raw)) if raw else 180.0
except ValueError:
return 10.0
return 180.0
def _use_authz_mode(config: BeaverConfig) -> bool:
@ -340,7 +340,7 @@ async def disconnect_workspace(config: BeaverConfig) -> dict[str, Any]:
return {"ok": True, "removed_state": removed, "removed_mcp": False, "server_id": OUTLOOK_SERVER_ID}
async def outlook_status(config: BeaverConfig, workspace: Path) -> dict[str, Any]:
async def outlook_status(config: BeaverConfig, workspace: Path, *, verify: bool = False) -> dict[str, Any]:
meta = _load_meta(workspace)
if not _use_authz_mode(config):
return {
@ -364,7 +364,7 @@ async def outlook_status(config: BeaverConfig, workspace: Path) -> dict[str, Any
connected = False
auth_status: dict[str, Any] | None = None
error: str | None = None
if configured:
if configured and verify:
try:
auth_status = await _call_outlook_mcp_tool(config, "auth_status", {}, scopes=["list_tools", "tool:auth_status"])
connected = bool(auth_status.get("authenticated"))
@ -403,38 +403,36 @@ async def get_overview(config: BeaverConfig, workspace: Path) -> dict[str, Any]:
warnings.append(f"{label} unavailable: {exc}")
return {"value": []}
inbox, sent, calendar = await asyncio.gather(
_load_section(
"inbox",
_call_outlook_mcp_tool(
config,
"mail_list_messages",
{"folder": "inbox", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0},
scopes=["list_tools", "tool:mail_list_messages"],
),
inbox = await _load_section(
"inbox",
_call_outlook_mcp_tool(
config,
"mail_list_messages",
{"folder": "inbox", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0},
scopes=["list_tools", "tool:mail_list_messages"],
),
_load_section(
"sent items",
_call_outlook_mcp_tool(
config,
"mail_list_messages",
{"folder": "sentitems", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0},
scopes=["list_tools", "tool:mail_list_messages"],
),
)
sent = await _load_section(
"sent items",
_call_outlook_mcp_tool(
config,
"mail_list_messages",
{"folder": "sentitems", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0},
scopes=["list_tools", "tool:mail_list_messages"],
),
_load_section(
"calendar",
_call_outlook_mcp_tool(
config,
"calendar_list_events",
{
"start_time": start_of_day.isoformat(),
"end_time": end_of_day.isoformat(),
"top": OUTLOOK_OVERVIEW_EVENT_LIMIT,
"skip": 0,
},
scopes=["list_tools", "tool:calendar_list_events"],
),
)
calendar = await _load_section(
"calendar",
_call_outlook_mcp_tool(
config,
"calendar_list_events",
{
"start_time": start_of_day.isoformat(),
"end_time": end_of_day.isoformat(),
"top": OUTLOOK_OVERVIEW_EVENT_LIMIT,
"skip": 0,
},
scopes=["list_tools", "tool:calendar_list_events"],
),
)
meta = _update_meta(workspace, last_overview_refresh_at=datetime.now().isoformat())

View File

@ -331,6 +331,10 @@ class ChannelRuntime:
event_recorder=self.record_event,
heartbeat_seconds=float(cfg.config.get("heartbeat_seconds") or 30),
max_message_chars=int(cfg.config.get("max_message_chars") or 20000),
session_peer_from_device_name=bool(
cfg.config.get("session_peer_from_device_name")
or cfg.config.get("sessionPeerFromDeviceName")
),
)
if cfg.kind == "telegram" and cfg.mode in {"polling", "webhook"}:

View File

@ -51,6 +51,7 @@ class TerminalWebSocketAdapter:
event_recorder: Callable[..., None] | None = None,
heartbeat_seconds: float = 30,
max_message_chars: int = 20000,
session_peer_from_device_name: bool = False,
) -> None:
self.channel_id = channel_id
self.kind = kind
@ -61,6 +62,7 @@ class TerminalWebSocketAdapter:
self.event_recorder = event_recorder
self.heartbeat_seconds = max(1.0, float(heartbeat_seconds))
self.max_message_chars = max(1, int(max_message_chars))
self.session_peer_from_device_name = bool(session_peer_from_device_name)
self.started = False
self._connections_by_session: dict[str, TerminalConnection] = {}
self._session_by_peer: dict[str, str] = {}
@ -131,14 +133,15 @@ class TerminalWebSocketAdapter:
*,
current: TerminalConnection | None,
) -> TerminalConnection | None:
peer_id = _clean(payload.get("peer_id"))
if not peer_id:
raw_peer_id = _clean(payload.get("peer_id"))
if not raw_peer_id:
await websocket.send_json({"type": "error", "error": "peer_id is required"})
return current
thread_id = _clean(payload.get("thread_id")) or None
user_id = _clean(payload.get("user_id")) or None
device_name = _clean(payload.get("device_name"))
peer_id = self._session_peer_id(raw_peer_id, device_name)
capabilities = [str(item) for item in payload.get("capabilities") or [] if item is not None]
identity = ChannelIdentity(
channel_id=self.channel_id,
@ -171,7 +174,12 @@ class TerminalWebSocketAdapter:
self._record(
kind="terminal_connected",
session_id=session_id,
metadata={"peer_id": peer_id, "device_name": device_name, "capabilities": capabilities},
metadata={
"peer_id": peer_id,
"raw_peer_id": raw_peer_id,
"device_name": device_name,
"capabilities": capabilities,
},
)
await websocket.send_json(
{
@ -299,3 +307,13 @@ class TerminalWebSocketAdapter:
error=error,
metadata=metadata,
)
def _session_peer_id(self, peer_id: str, device_name: str) -> str:
if self.session_peer_from_device_name and device_name:
return f"device-{_clean_session_part(device_name)}"
return peer_id
def _clean_session_part(value: str) -> str:
cleaned = "-".join(str(value or "").strip().split())
return cleaned.replace(":", "_") or "unknown"

View File

@ -264,6 +264,25 @@ async def _app_lifespan(
)
app.state.channel_runtime = channel_runtime
await channel_runtime.start()
for candidate in loaded.skill_learning_pipeline.list_candidates(status="review_pending"): # type: ignore[union-attr]
skill_name = candidate.draft_skill_name
draft_id = candidate.draft_id
if not skill_name or not draft_id:
continue
if loaded.skill_learning_pipeline.get_eval_report(skill_name, draft_id) is not None: # type: ignore[union-attr]
continue
draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr]
if draft.status != "in_review":
continue
_schedule_skill_draft_eval(
app,
agent_service=attached_service,
loop=attached_service.create_loop(),
loaded=loaded,
candidate_id=candidate.candidate_id,
skill_name=skill_name,
draft_id=draft_id,
)
except BaseException:
if owns_service and started:
with suppress(BaseException):
@ -280,7 +299,10 @@ async def _app_lifespan(
worker = SkillLearningWorker(
pipeline=loaded.skill_learning_pipeline, # type: ignore[arg-type]
provider_bundle_factory=lambda: attached_service._make_provider_bundle_for_task(loaded, {}), # noqa: SLF001
replay_runner_factory=lambda: ReplayRunner(agent_loop=attached_service.create_loop()),
replay_runner_factory=lambda: ReplayRunner(
agent_loop=attached_service.create_loop(),
isolated_loop_factory=attached_service.create_isolated_loop,
),
config=worker_config,
)
worker_task = asyncio.create_task(worker.run_forever())
@ -289,6 +311,13 @@ async def _app_lifespan(
try:
yield
finally:
skill_eval_tasks = getattr(app.state, "skill_eval_tasks", {})
for task in list(skill_eval_tasks.values()):
task.cancel()
for task in list(skill_eval_tasks.values()):
with suppress(BaseException):
await task
skill_eval_tasks.clear()
runtime = getattr(app.state, "channel_runtime", None)
if isinstance(runtime, ChannelRuntime):
with suppress(BaseException):
@ -587,6 +616,7 @@ def create_app(
)
app.state.auth_tokens = {}
app.state.handoff_codes = {}
app.state.skill_eval_tasks = {}
app.state.auth_file = Path(os.getenv("BEAVER_AUTH_FILE") or "")
max_file_size = 50 * 1024 * 1024
max_user_file_upload_size = _int_env("BEAVER_USER_FILES_MAX_UPLOAD_BYTES", 5 * 1024 * 1024 * 1024)
@ -1250,7 +1280,7 @@ def create_app(
session_manager = loaded.session_manager
rows = session_manager.list_sessions_rich(
limit=100,
exclude_sources=["subagent", "notification"],
exclude_sources=["subagent", "notification", "skill_replay_eval"],
exclude_end_reasons=["archived", "deleted"],
) # type: ignore[union-attr]
return [
@ -1259,6 +1289,9 @@ def create_app(
"created_at": _iso_from_timestamp(row.get("started_at")),
"updated_at": _iso_from_timestamp(row.get("last_active")),
"path": str(row.get("id")),
"source": row.get("source"),
"title": row.get("title"),
"preview": row.get("preview"),
}
for row in rows
]
@ -1337,7 +1370,9 @@ def create_app(
async def get_session(session_id: str, request: Request) -> dict[str, Any]:
loaded = get_agent_service(request).create_loop().boot()
session_manager = loaded.session_manager
session = session_manager.get_or_create(session_id, source="web") # type: ignore[union-attr]
session = session_manager.get_session(session_id) # type: ignore[union-attr]
if session is None:
raise HTTPException(status_code=404, detail="Session not found")
return _session_detail(session_manager, session_id, session) # type: ignore[arg-type]
@app.delete("/api/sessions/{session_id:path}")
@ -2216,21 +2251,33 @@ def create_app(
try:
safety = loaded.skill_learning_pipeline.check_safety(skill_name, draft_id) # type: ignore[union-attr]
if safety.passed and safety.risk_level != "critical":
loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
skill_name,
draft_id,
requested_by=str((payload or {}).get("requested_by") or "web"),
notes=str((payload or {}).get("notes") or ""),
)
candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id)
if candidate_id is not None:
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr]
if draft.status == "draft":
loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
skill_name,
draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
requested_by=str((payload or {}).get("requested_by") or "web"),
notes=str((payload or {}).get("notes") or ""),
)
elif draft.status not in {"in_review", "approved"}:
raise ValueError("Draft cannot be submitted from its current status")
candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id)
eval_report = loaded.skill_learning_pipeline.get_eval_report(skill_name, draft_id) # type: ignore[union-attr]
if candidate_id is not None and eval_report is None:
loaded.skill_learning_store.transition_learning_candidate( # type: ignore[union-attr]
candidate_id,
"review_pending",
event_type="eval_queued",
last_error=None,
)
_schedule_skill_draft_eval(
app,
agent_service=agent_service,
loop=loop,
loaded=loaded,
candidate_id=candidate_id,
skill_name=skill_name,
draft_id=draft_id,
)
except ValueError as exc:
raise _skill_draft_http_error(exc) from exc
@ -3810,14 +3857,88 @@ def _skill_learning_candidate_task_text(loaded: Any, candidate: Any) -> str:
return str(evidence.get("task_text") or "").strip()
def _schedule_skill_draft_eval(
    app: FastAPI,
    *,
    agent_service: AgentService,
    loop: Any,
    loaded: Any,
    candidate_id: str,
    skill_name: str,
    draft_id: str,
) -> None:
    """Start a background evaluation task for a skill draft unless one is running.

    Tasks are tracked in ``app.state.skill_eval_tasks`` under the key
    ``"<skill_name>:<draft_id>"``. If an unfinished task already exists for
    that key, nothing happens. Otherwise the candidate's progress is set to a
    "preparing" snapshot and a new asyncio task is created that runs
    ``evaluate_draft`` on the pipeline. Any exception other than cancellation
    is recorded through ``mark_eval_failed``; cancellation is re-raised. The
    task removes itself from the registry when it finishes.

    Must be called while an event loop is running, because it uses
    ``asyncio.create_task``.
    """
    registry: dict[str, asyncio.Task[None]] = app.state.skill_eval_tasks
    task_key = f"{skill_name}:{draft_id}"
    existing = registry.get(task_key)
    if existing is not None and not existing.done():
        return

    # Placeholder totals reported before the evaluator knows the real counts.
    # NOTE(review): presumably overwritten by the first progress callback -- confirm.
    initial_progress = {
        "phase": "preparing",
        "completed_arms": 0,
        "total_arms": 20,
        "completed_cases": 0,
        "total_cases": 10,
    }
    loaded.skill_learning_pipeline.mark_eval_progress(candidate_id, initial_progress)  # type: ignore[union-attr]

    def forward_progress(progress: Any) -> Any:
        # Persist each progress snapshot on the candidate record.
        return loaded.skill_learning_pipeline.mark_eval_progress(  # type: ignore[union-attr]
            candidate_id,
            progress,
        )

    async def run_eval() -> None:
        try:
            provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {})  # noqa: SLF001
            await loaded.skill_learning_pipeline.evaluate_draft(  # type: ignore[union-attr]
                candidate_id,
                skill_name,
                draft_id,
                provider_bundle=provider_bundle,
                replay_runner=ReplayRunner(
                    agent_loop=loop,
                    isolated_loop_factory=agent_service.create_isolated_loop,
                ),
                progress_callback=forward_progress,
            )
        except asyncio.CancelledError:
            # Let cancellation propagate so the awaiting shutdown code sees it.
            raise
        except Exception as exc:
            loaded.skill_learning_pipeline.mark_eval_failed(candidate_id, str(exc))  # type: ignore[union-attr]

    task = asyncio.create_task(run_eval())
    registry[task_key] = task

    def discard_if_current(finished: asyncio.Task[None]) -> None:
        # Only drop the entry if it still points at this task; a newer task
        # registered under the same key must stay.
        if registry.get(task_key) is finished:
            registry.pop(task_key, None)

    task.add_done_callback(discard_if_current)
def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include_reviews: bool = False) -> dict[str, Any]:
draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr]
safety = loaded.skill_learning_pipeline.get_safety_report(skill_name, draft_id) # type: ignore[union-attr]
eval_report = loaded.skill_learning_pipeline.get_eval_report(skill_name, draft_id) # type: ignore[union-attr]
candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id)
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) if candidate_id is not None else None # type: ignore[union-attr]
if eval_report is not None:
eval_status = eval_report.status
elif candidate is None:
eval_status = "not_applicable"
elif candidate.status == "eval_failed":
eval_status = "failed"
elif draft.status in {"in_review", "approved"}:
eval_status = "pending"
else:
eval_status = "not_started"
payload = {
**draft.to_dict(),
"safety_report": safety.to_dict() if safety is not None else None,
"eval_report": eval_report.to_dict() if eval_report is not None else None,
"eval_status": eval_status,
"eval_error": candidate.last_error if candidate is not None and candidate.status == "eval_failed" else None,
"eval_progress": dict(candidate.eval_progress) if candidate is not None else None,
"target_version": _skill_draft_target_version(loaded, draft.skill_name, draft.proposal_kind),
"base_skill": _skill_draft_base_skill_payload(loaded, draft),
}

View File

@ -82,6 +82,7 @@ class SkillLearningCandidate:
draft_id: str | None = None
safety_report_id: str | None = None
eval_report_id: str | None = None
eval_progress: dict[str, Any] = field(default_factory=dict)
created_at: str = ""
updated_at: str = ""
@ -107,6 +108,7 @@ class SkillLearningCandidate:
"draft_id": self.draft_id,
"safety_report_id": self.safety_report_id,
"eval_report_id": self.eval_report_id,
"eval_progress": dict(self.eval_progress),
"created_at": self.created_at,
"updated_at": self.updated_at,
}
@ -137,6 +139,7 @@ class SkillLearningCandidate:
draft_id=_optional_str(payload.get("draft_id")),
safety_report_id=_optional_str(payload.get("safety_report_id")),
eval_report_id=_optional_str(payload.get("eval_report_id")),
eval_progress=dict(payload.get("eval_progress") or {}),
created_at=str(payload.get("created_at") or now),
updated_at=str(payload.get("updated_at") or payload.get("created_at") or now),
)

View File

@ -91,6 +91,11 @@ class AgentService:
self._loop.boot()
return self._loop
def create_isolated_loop(self) -> AgentLoop:
    """Build a fresh ``AgentLoop`` that shares this service's configuration.

    The new loop is constructed from the service's ``profile`` and ``loader``
    and its ``runtime_services`` mapping is updated with the services
    registered on this service. The loop is neither booted nor stored here.
    """
    isolated = AgentLoop(profile=self.profile, loader=self.loader)
    isolated.runtime_services.update(self._runtime_services)
    return isolated
def register_runtime_service(self, name: str, service: Any) -> None:
"""Expose process-level services to tools during agent runs."""

View File

@ -2,8 +2,10 @@
from __future__ import annotations
import asyncio
import json
from typing import Any
import os
from typing import Any, Callable
from uuid import uuid4
from beaver.engine.context import SkillContext
@ -25,9 +27,17 @@ class SkillDraftEvaluator:
run_store: RunMemoryStore,
*,
surrogate_evaluator: SurrogateToolEvaluator | None = None,
max_parallel_cases: int | None = None,
) -> None:
self.run_store = run_store
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
configured_parallelism = max_parallel_cases
if configured_parallelism is None:
try:
configured_parallelism = int(os.getenv("BEAVER_SKILL_EVAL_MAX_PARALLEL_CASES", "3") or "3")
except ValueError:
configured_parallelism = 3
self.max_parallel_cases = max(1, configured_parallelism)
async def evaluate(
self,
@ -36,6 +46,7 @@ class SkillDraftEvaluator:
draft: SkillDraft,
provider_bundle: ProviderBundle | None,
replay_runner: ReplayRunner | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
if provider_bundle is None or provider_bundle.main_provider is None:
return self._skipped(candidate, draft)
@ -59,6 +70,7 @@ class SkillDraftEvaluator:
provider_bundle=provider_bundle,
replay_runner=replay_runner,
case_selection_meta=case_selection_meta,
progress_callback=progress_callback,
)
return self._evaluate_heuristic(candidate, draft, runs)
@ -129,96 +141,72 @@ class SkillDraftEvaluator:
provider_bundle: ProviderBundle,
replay_runner: ReplayRunner,
case_selection_meta: dict[str, Any] | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
case_reports: list[dict] = []
legacy_cases: list[dict] = []
for case in replay_cases:
baseline = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:baseline",
arm="baseline",
task_text=str(case["task_text"]),
pinned_skill_names=list(case.get("baseline_skill_names") or []),
pinned_skill_contexts=[],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
total_cases = len(replay_cases)
total_arms = total_cases * 2
completed_arms = 0
completed_cases = 0
progress_lock = asyncio.Lock()
semaphore = asyncio.Semaphore(self.max_parallel_cases)
_report_progress(
progress_callback,
completed_arms=completed_arms,
total_arms=total_arms,
completed_cases=0,
total_cases=total_cases,
)
async def mark_progress(*, case_completed: bool) -> None:
nonlocal completed_arms, completed_cases
async with progress_lock:
completed_arms += 1
if case_completed:
completed_cases += 1
_report_progress(
progress_callback,
completed_arms=completed_arms,
total_arms=total_arms,
completed_cases=completed_cases,
total_cases=total_cases,
)
)
candidate_arm = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:candidate",
arm="candidate",
task_text=str(case["task_text"]),
pinned_skill_names=[],
pinned_skill_contexts=[_draft_skill_context(draft)],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
async def evaluate_case(case: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
async with semaphore:
baseline = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:baseline",
arm="baseline",
task_text=str(case["task_text"]),
pinned_skill_names=list(case.get("baseline_skill_names") or []),
pinned_skill_contexts=[],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
)
)
)
surrogate = await self.surrogate_evaluator.evaluate(
task_text=str(case["task_text"]),
baseline=baseline,
candidate=candidate_arm,
)
baseline_ability = _ability_score(
case=case,
arm=baseline,
arm_name="baseline",
)
candidate_ability = _ability_score(
case=case,
arm=candidate_arm,
arm_name="candidate",
)
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tool_execution_score = {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
}
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": tool_execution_score,
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
"confidence": surrogate["confidence"],
"tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
"artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
"validator_notes": list(surrogate.get("notes") or []),
}
case_reports.append(case_report)
legacy_cases.append(
{
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
)
await mark_progress(case_completed=False)
candidate_arm = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:candidate",
arm="candidate",
task_text=str(case["task_text"]),
pinned_skill_names=[],
pinned_skill_contexts=[_draft_skill_context(draft)],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
)
)
await mark_progress(case_completed=True)
surrogate = await self.surrogate_evaluator.evaluate(
task_text=str(case["task_text"]),
baseline=baseline,
candidate=candidate_arm,
)
return _build_replay_case_reports(case, baseline, candidate_arm, surrogate)
results = await asyncio.gather(*(evaluate_case(case) for case in replay_cases))
case_reports = [case_report for case_report, _ in results]
legacy_cases = [legacy_case for _, legacy_case in results]
preservation_report = _preservation_report(candidate, draft)
return _report_from_case_reports(
candidate,
@ -248,6 +236,83 @@ class SkillDraftEvaluator:
)
def _build_replay_case_reports(
    case: dict[str, Any],
    baseline: dict[str, Any],
    candidate_arm: dict[str, Any],
    surrogate: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Assemble the detailed and the legacy report entries for one replay case.

    Args:
        case: The replay case; must contain ``run_id``.
        baseline: Result of the baseline arm.
        candidate_arm: Result of the candidate arm.
        surrogate: Surrogate evaluation; must contain ``baseline_score``,
            ``candidate_score`` and ``confidence``.

    Returns:
        ``(case_report, legacy_case)`` where ``legacy_case`` is the reduced
        summary with scores only.
    """
    baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline")
    candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate")
    baseline_score = baseline_ability["final_score"]
    candidate_score = candidate_ability["final_score"]
    score_delta = round(candidate_score - baseline_score, 4)

    is_synthetic = bool(case.get("synthetic"))
    # An explicit tier wins; otherwise synthetic cases are bronze, real ones gold.
    tier = case.get("tier") or ("bronze" if is_synthetic else "gold")

    def combined(field_name: str) -> list[Any]:
        # Baseline entries first, then candidate entries.
        return [*baseline.get(field_name, []), *candidate_arm.get(field_name, [])]

    case_report = {
        "run_id": case["run_id"],
        "task_id": case.get("task_id"),
        "session_id": case.get("session_id"),
        "task_text": case.get("task_text"),
        "synthetic": is_synthetic,
        "tier": tier,
        "validator": case.get("validator"),
        "baseline": baseline,
        "candidate": candidate_arm,
        "baseline_score": baseline_score,
        "candidate_score": candidate_score,
        "delta": score_delta,
        "ability_score": {
            "baseline": baseline_ability,
            "candidate": candidate_ability,
            "delta": score_delta,
        },
        "tool_execution_score": {
            "baseline_score": surrogate["baseline_score"],
            "candidate_score": surrogate["candidate_score"],
            "delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
            "score_role": "diagnostic_only",
        },
        "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
        "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
        "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
        "confidence": surrogate["confidence"],
        "tool_calls": combined("tool_calls"),
        "artifacts": combined("artifacts"),
        "side_effects": combined("side_effects"),
        "validator_notes": list(surrogate.get("notes") or []),
    }
    legacy_case = {
        "run_id": case["run_id"],
        "session_id": case.get("session_id") or "",
        "task_text": case.get("task_text") or "",
        "synthetic": is_synthetic,
        "tier": tier,
        "baseline_score": baseline_score,
        "candidate_score": candidate_score,
        "delta": score_delta,
    }
    return case_report, legacy_case
def _report_progress(
callback: Callable[[dict[str, Any]], None] | None,
*,
completed_arms: int,
total_arms: int,
completed_cases: int,
total_cases: int,
) -> None:
if callback is None:
return
callback(
{
"phase": "replaying",
"completed_arms": completed_arms,
"total_arms": total_arms,
"completed_cases": completed_cases,
"total_cases": total_cases,
}
)
def _score_from_validation(validation: dict | None, success: bool) -> float:
if isinstance(validation, dict) and "score" in validation:
try:

View File

@ -2,7 +2,7 @@
from __future__ import annotations
from typing import Any
from typing import Any, Callable
from beaver.engine.providers import ProviderBundle
from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore
@ -174,12 +174,20 @@ class SkillLearningPipelineService:
safety = self.get_safety_report(skill_name, draft_id)
if safety is not None and (not safety.passed or safety.risk_level == "critical"):
raise ValueError("Draft cannot enter review because safety check failed")
return self.review_service.submit_for_review(
review = self.review_service.submit_for_review(
skill_name,
draft_id,
reviewer_request=notes,
requested_by=requested_by,
)
self._mark_candidate_by_draft(
skill_name,
draft_id,
"review_pending",
"review_submitted",
last_error=None,
)
return review
def approve(
self,
@ -258,9 +266,13 @@ class SkillLearningPipelineService:
draft = self.get_draft(skill_name, draft_id)
report = self.safety_checker.check(draft)
self.learning_store.write_safety_report(report)
status = "safety_failed" if not report.passed or report.risk_level == "critical" else "draft_ready"
status = (
"safety_failed"
if not report.passed or report.risk_level == "critical"
else self._candidate_status_for_draft(draft)
)
current = self._candidate_by_draft(skill_name, draft_id)
if current is not None and current.status == "eval_failed" and status == "draft_ready":
if current is not None and current.status == "eval_failed" and status != "safety_failed":
status = "eval_failed"
self._mark_candidate_by_draft(
skill_name,
@ -287,6 +299,7 @@ class SkillLearningPipelineService:
*,
provider_bundle: ProviderBundle | None,
replay_runner: ReplayRunner | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
draft = self.get_draft(skill_name, draft_id)
candidate = self.get_candidate(candidate_id)
@ -296,13 +309,14 @@ class SkillLearningPipelineService:
draft=draft,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
progress_callback=progress_callback,
)
self.learning_store.write_eval_report(report)
if report.status == "skipped_provider_unavailable":
status = "draft_ready"
status = self._candidate_status_for_draft(draft)
error = "eval skipped: provider unavailable"
elif report.passed:
status = "draft_ready"
status = self._candidate_status_for_draft(draft)
error = None
else:
status = "eval_failed"
@ -316,11 +330,43 @@ class SkillLearningPipelineService:
status,
event_type="eval_completed",
eval_report_id=report.report_id,
eval_progress={
"phase": "completed",
"completed_arms": len(report.cases) * 2 if report.mode == "replay" else 0,
"total_arms": len(report.cases) * 2 if report.mode == "replay" else 0,
"completed_cases": len(report.cases),
"total_cases": len(report.cases),
},
last_error=error,
payload=report.to_dict(),
)
return report
def mark_eval_progress(self, candidate_id: str, progress: dict[str, Any]) -> SkillLearningCandidate:
    """Store a copy of ``progress`` as the candidate's ``eval_progress``.

    Returns the store's updated candidate after it has been passed through
    ``_require_updated``.
    """
    snapshot = dict(progress)
    updated = self.learning_store.update_learning_candidate(
        candidate_id,
        eval_progress=snapshot,
    )
    return self._require_updated(updated, candidate_id)
def mark_eval_failed(self, candidate_id: str, error: str) -> SkillLearningCandidate:
    """Transition the candidate to ``eval_failed`` and record ``error``.

    The candidate's existing progress is kept, with its ``phase`` set to
    ``"failed"``. Returns the transitioned candidate after it has been passed
    through ``_require_updated``.
    """
    current = self.get_candidate(candidate_id)
    # Preserve the counters reached so far; only the phase changes.
    failed_progress = {**current.eval_progress, "phase": "failed"}
    transitioned = self.learning_store.transition_learning_candidate(
        candidate_id,
        "eval_failed",
        eval_progress=failed_progress,
        event_type="eval_failed",
        last_error=error,
        payload={"error": error},
    )
    return self._require_updated(transitioned, candidate_id)
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
@ -372,6 +418,14 @@ class SkillLearningPipelineService:
return candidate
return None
@staticmethod
def _candidate_status_for_draft(draft: SkillDraft) -> str:
    """Return the candidate status that corresponds to a draft's review state.

    Yields ``"approved"`` for an approved draft, ``"review_pending"`` for a
    draft in review, and ``"draft_ready"`` for any other status.
    """
    review_labels = (
        (SkillReviewState.APPROVED, "approved"),
        (SkillReviewState.IN_REVIEW, "review_pending"),
    )
    for state, label in review_labels:
        if draft.status == state.value:
            return label
    return "draft_ready"
@staticmethod
def _require_updated(candidate: SkillLearningCandidate | None, candidate_id: str) -> SkillLearningCandidate:
if candidate is None:

View File

@ -3,7 +3,8 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Literal
from time import perf_counter
from typing import Any, Callable, Literal
from uuid import uuid4
from beaver.tools.base import ToolContext, ToolResult, ToolSpec
@ -59,6 +60,7 @@ class ReplayToolExecutor:
*,
context: ToolContext | None = None,
) -> ToolResult:
started_at = perf_counter()
tool = self.registry.get(tool_name)
spec = tool.spec if tool is not None else ToolSpec(
name=tool_name,
@ -84,6 +86,7 @@ class ReplayToolExecutor:
"error": result.error,
"content": result.content[:2000],
}
trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
self.traces.append(trace)
return result
if mode == "surrogate":
@ -92,6 +95,7 @@ class ReplayToolExecutor:
"error": "replay_surrogate",
"content": "Tool call recorded for surrogate evaluation.",
}
trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
self.traces.append(trace)
return ToolResult(
success=True,
@ -105,6 +109,7 @@ class ReplayToolExecutor:
"error": "replay_blocked",
"content": "Tool call blocked by replay policy.",
}
trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
self.traces.append(trace)
return ToolResult(
success=False,
@ -151,12 +156,20 @@ class ReplayArmRequest:
class ReplayRunner:
def __init__(self, *, agent_loop: Any, policy: ReplayToolPolicy | None = None) -> None:
def __init__(
    self,
    *,
    agent_loop: Any,
    policy: ReplayToolPolicy | None = None,
    isolated_loop_factory: Callable[[], Any] | None = None,
) -> None:
    """Configure the runner.

    Args:
        agent_loop: Agent loop kept on the instance as ``agent_loop``.
        policy: Replay tool policy; a default ``ReplayToolPolicy()`` is
            created when a falsy value is given.
        isolated_loop_factory: Optional zero-argument callable kept on the
            instance as ``isolated_loop_factory``.
    """
    self.isolated_loop_factory = isolated_loop_factory
    self.policy = policy if policy else ReplayToolPolicy()
    self.agent_loop = agent_loop
async def run_arm(self, request: ReplayArmRequest) -> dict[str, Any]:
loaded = self.agent_loop.boot()
target_loop = self.isolated_loop_factory() if self.isolated_loop_factory is not None else self.agent_loop
loaded = target_loop.boot()
replay_executor = ReplayToolExecutor(
loaded.tool_executor,
registry=loaded.tool_registry,
@ -174,23 +187,42 @@ class ReplayRunner:
"tool_executor_override": replay_executor,
}
try:
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
except RuntimeError as exc:
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
raise
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
return {
"case_id": request.case_id,
"arm": request.arm,
"session_id": result.session_id,
"run_id": result.run_id,
"task_text": request.task_text,
"finish_reason": result.finish_reason,
"final_answer": result.output_text,
"tool_calls": list(replay_executor.traces),
"artifacts": [],
"side_effects": _side_effects_from_traces(replay_executor.traces),
}
try:
result = await target_loop.process_direct(request.task_text, **direct_kwargs)
except RuntimeError as exc:
if not _is_process_direct_disabled_while_running(exc) or not hasattr(target_loop, "submit_direct"):
raise
result = await target_loop.submit_direct(request.task_text, **direct_kwargs)
session_manager = getattr(loaded, "session_manager", None)
if session_manager is not None and hasattr(session_manager, "end_session"):
session_manager.end_session(result.session_id, "evaluation_complete")
return {
"case_id": request.case_id,
"arm": request.arm,
"session_id": result.session_id,
"run_id": result.run_id,
"task_text": request.task_text,
"finish_reason": result.finish_reason,
"final_answer": result.output_text,
"tool_calls": list(replay_executor.traces),
"artifacts": [],
"side_effects": _side_effects_from_traces(replay_executor.traces),
}
finally:
if target_loop is not self.agent_loop and hasattr(target_loop, "close"):
mcp_manager = getattr(loaded, "mcp_manager", None)
if mcp_manager is not None and hasattr(mcp_manager, "close"):
try:
await mcp_manager.close()
finally:
closeables = getattr(loaded, "closeables", None)
if isinstance(closeables, list):
loaded.closeables = [
(name, close_fn)
for name, close_fn in closeables
if name != "mcp_manager"
]
target_loop.close()
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:

View File

@ -2,6 +2,7 @@
from __future__ import annotations
import asyncio
from dataclasses import dataclass, field
from html import unescape
import json
@ -51,7 +52,8 @@ class WebFetchTool:
try:
safe_url = _safe_url(url)
limit = max(1000, min(int(max_chars or 12000), 50000))
async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=True) as client:
timeout = httpx.Timeout(connect=5, read=12, write=5, pool=5)
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client:
response = await client.get(
safe_url,
headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"},
@ -76,7 +78,7 @@ class WebFetchTool:
@dataclass(slots=True)
class WebSearchTool:
name: str = "web_search"
description: str = "Search the web using DuckDuckGo HTML results. No API key required."
description: str = "Search the public web using HTML results. No API key required."
toolset: str = "web"
always_available: bool = False
parameters: dict[str, Any] = field(
@ -95,23 +97,102 @@ class WebSearchTool:
if not str(query).strip():
raise ValueError("query is required")
bounded = max(1, min(int(limit or 5), 10))
url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=True) as client:
response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"})
response.raise_for_status()
html = response.text
results: list[dict[str, str]] = []
pattern = re.compile(
r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
re.I | re.S,
)
for match in pattern.finditer(html):
title = _strip_html(match.group("title"))
result_url = unescape(match.group("url"))
if title and result_url:
results.append({"title": title, "url": result_url, "snippet": ""})
if len(results) >= bounded:
break
return _json_result(True, query=query, results=results)
headers = {"User-Agent": "Mozilla/5.0 Beaver/1.0"}
timeout = httpx.Timeout(connect=5, read=8, write=5, pool=5)
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client:
tasks = [
asyncio.create_task(
_search_bing(
client,
query=query,
limit=bounded,
headers=headers,
)
),
asyncio.create_task(
_search_duckduckgo(
client,
query=query,
limit=bounded,
headers=headers,
)
),
]
errors: list[str] = []
try:
for completed in asyncio.as_completed(tasks):
try:
engine, results = await completed
except Exception as exc:
errors.append(str(exc))
continue
if results:
return _json_result(True, query=query, engine=engine, results=results)
detail = "; ".join(error for error in errors if error) or "no search results"
return _json_result(False, query=query, error=detail)
finally:
for task in tasks:
if not task.done():
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
except Exception as exc:
return _json_result(False, query=query, error=str(exc))
async def _search_bing(
    client: httpx.AsyncClient,
    *,
    query: str,
    limit: int,
    headers: dict[str, str],
) -> tuple[str, list[dict[str, str]]]:
    """Fetch Bing's HTML results page for *query* and parse it.

    Args:
        client: HTTP client used to issue the GET request.
        query: Search terms; URL-encoded with ``quote_plus``.
        limit: Maximum number of results passed on to the parser.
        headers: Request headers sent with the GET.

    Returns:
        The pair ``("bing", results)``.

    Any exception raised by the request or by ``raise_for_status()``
    propagates to the caller.
    """
    search_url = f"https://www.bing.com/search?q={quote_plus(query)}"
    page = await client.get(search_url, headers=headers)
    page.raise_for_status()
    hits = _parse_bing_results(page.text, limit)
    return "bing", hits
async def _search_duckduckgo(
    client: httpx.AsyncClient,
    *,
    query: str,
    limit: int,
    headers: dict[str, str],
) -> tuple[str, list[dict[str, str]]]:
    """Fetch DuckDuckGo's HTML results page for *query* and parse it.

    Args:
        client: HTTP client used to issue the GET request.
        query: Search terms; URL-encoded with ``quote_plus``.
        limit: Maximum number of results passed on to the parser.
        headers: Request headers sent with the GET.

    Returns:
        The pair ``("duckduckgo", results)``.

    Any exception raised by the request or by ``raise_for_status()``
    propagates to the caller.
    """
    search_url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
    page = await client.get(search_url, headers=headers)
    page.raise_for_status()
    hits = _parse_duckduckgo_results(page.text, limit)
    return "duckduckgo", hits
def _parse_bing_results(html: str, limit: int) -> list[dict[str, str]]:
    """Extract organic results from a Bing HTML results page.

    Every ``<li class="... b_algo ...">`` element is treated as one result.
    Its title link is the first ``<h2><a href=...>`` inside the item and its
    snippet is the first ``<p>`` paragraph after that link, still inside the
    item (empty string when there is none).

    Args:
        html: Raw HTML of the results page.
        limit: Maximum number of results to return; non-positive yields ``[]``.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts in page order.
    """
    results: list[dict[str, str]] = []
    if limit <= 0:
        return results

    item_open = re.compile(r'<li[^>]+class="[^"]*\bb_algo\b[^"]*"[^>]*>', re.I)
    title_link = re.compile(
        r'<h2[^>]*>\s*<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
        re.I | re.S,
    )
    # ``\b`` keeps tags such as <pre> or <path> from being taken for <p>.
    paragraph = re.compile(r'<p\b[^>]*>(?P<snippet>.*?)</p>', re.I | re.S)

    openers = list(item_open.finditer(html))
    for index, opener in enumerate(openers):
        # One item's markup runs until the next item opens (or the page ends).
        # Searching inside that window is what lets the snippet be captured:
        # a single pattern ending in lazy ``.*?`` plus an optional group always
        # stops right after ``</a>`` and leaves the snippet empty.
        stop = openers[index + 1].start() if index + 1 < len(openers) else len(html)
        link = title_link.search(html, opener.end(), stop)
        if link is None:
            continue
        title = _strip_html(link.group("title"))
        result_url = unescape(link.group("url"))
        if not (title and result_url):
            continue
        found = paragraph.search(html, link.end(), stop)
        snippet = _strip_html(found.group("snippet") if found else "")
        results.append({"title": title, "url": result_url, "snippet": snippet})
        if len(results) >= limit:
            break
    return results
def _parse_duckduckgo_results(html: str, limit: int) -> list[dict[str, str]]:
    """Extract result links from a DuckDuckGo HTML results page.

    Each ``<a class="result__a" href=...>`` anchor becomes one result; the
    snippet is always the empty string.

    Args:
        html: Raw HTML of the results page.
        limit: Parsing stops once this many results have been collected.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts in page order.
    """
    anchor_re = re.compile(
        r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
        re.I | re.S,
    )
    hits: list[dict[str, str]] = []
    for anchor in anchor_re.finditer(html):
        heading = _strip_html(anchor.group("title"))
        link = unescape(anchor.group("url"))
        if heading and link:
            hits.append({"title": heading, "url": link, "snippet": ""})
        if len(hits) >= limit:
            break
    return hits