feat(engine): 优化智能体循环中的助手消息处理逻辑

- 在没有工具调用时才添加助手消息到上下文
- 确保工具调用响应正确添加到消息上下文中
- 修复了消息构建的条件逻辑

fix(cron): 改进定时任务调度的时间解析功能

- 添加正则表达式导入用于时间显示解析
- 实现从显示文本中提取毫秒间隔的功能
- 增强整数转换的安全性,避免类型错误
- 优化定时任务配置的解析逻辑

feat(outlook): 增强Outlook集成的功能和稳定性

- 将默认超时时间从10秒增加到180秒
- 为状态检查函数添加可选的验证参数
- 串行执行邮件概览获取操作而非并行
- 改进连接状态验证逻辑

feat(channel): 添加设备名称作为会话标识的选项

- 为终端WebSocket适配器添加新的配置选项
- 实现基于设备名称生成会话对等ID的功能
- 记录原始对等ID和设备名称的元数据
- 支持从设备名称创建会话对等ID

feat(skills): 完善技能学习评估系统和进度跟踪

- 在应用启动时自动调度待评估的技能草稿
- 为技能评估工作创建独立的循环工厂
- 实现异步技能评估任务的取消和清理机制
- 添加技能评估进度报告和状态跟踪功能
- 扩展会话列表API以包含更多详细信息
- 防止对不存在的会话进行操作
- 优化技能草稿提交和评估的业务逻辑

perf(skills): 提升技能评估的并发性能

- 实现并行技能案例评估以提高效率
- 添加最大并行案例数的环境变量控制
- 实现实时评估进度更新和回调机制
- 优化评估过程中的资源管理和同步

refactor(services): 创建隔离的智能体循环实例

- 添加创建独立智能体循环的工厂方法
- 确保新循环继承运行时服务配置
- 支持技能评估等需要隔离环境的场景
```
This commit is contained in:
2026-06-15 14:48:16 +08:00
parent 8aeb97a5fc
commit 4b0bf65ace
53 changed files with 4328 additions and 292 deletions

View File

@ -2,8 +2,10 @@
from __future__ import annotations
import asyncio
import json
from typing import Any
import os
from typing import Any, Callable
from uuid import uuid4
from beaver.engine.context import SkillContext
@ -25,9 +27,17 @@ class SkillDraftEvaluator:
run_store: RunMemoryStore,
*,
surrogate_evaluator: SurrogateToolEvaluator | None = None,
max_parallel_cases: int | None = None,
) -> None:
self.run_store = run_store
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
configured_parallelism = max_parallel_cases
if configured_parallelism is None:
try:
configured_parallelism = int(os.getenv("BEAVER_SKILL_EVAL_MAX_PARALLEL_CASES", "3") or "3")
except ValueError:
configured_parallelism = 3
self.max_parallel_cases = max(1, configured_parallelism)
async def evaluate(
self,
@ -36,6 +46,7 @@ class SkillDraftEvaluator:
draft: SkillDraft,
provider_bundle: ProviderBundle | None,
replay_runner: ReplayRunner | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
if provider_bundle is None or provider_bundle.main_provider is None:
return self._skipped(candidate, draft)
@ -59,6 +70,7 @@ class SkillDraftEvaluator:
provider_bundle=provider_bundle,
replay_runner=replay_runner,
case_selection_meta=case_selection_meta,
progress_callback=progress_callback,
)
return self._evaluate_heuristic(candidate, draft, runs)
@ -129,96 +141,72 @@ class SkillDraftEvaluator:
provider_bundle: ProviderBundle,
replay_runner: ReplayRunner,
case_selection_meta: dict[str, Any] | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
case_reports: list[dict] = []
legacy_cases: list[dict] = []
for case in replay_cases:
baseline = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:baseline",
arm="baseline",
task_text=str(case["task_text"]),
pinned_skill_names=list(case.get("baseline_skill_names") or []),
pinned_skill_contexts=[],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
total_cases = len(replay_cases)
total_arms = total_cases * 2
completed_arms = 0
completed_cases = 0
progress_lock = asyncio.Lock()
semaphore = asyncio.Semaphore(self.max_parallel_cases)
_report_progress(
progress_callback,
completed_arms=completed_arms,
total_arms=total_arms,
completed_cases=0,
total_cases=total_cases,
)
async def mark_progress(*, case_completed: bool) -> None:
nonlocal completed_arms, completed_cases
async with progress_lock:
completed_arms += 1
if case_completed:
completed_cases += 1
_report_progress(
progress_callback,
completed_arms=completed_arms,
total_arms=total_arms,
completed_cases=completed_cases,
total_cases=total_cases,
)
)
candidate_arm = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:candidate",
arm="candidate",
task_text=str(case["task_text"]),
pinned_skill_names=[],
pinned_skill_contexts=[_draft_skill_context(draft)],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
async def evaluate_case(case: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
async with semaphore:
baseline = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:baseline",
arm="baseline",
task_text=str(case["task_text"]),
pinned_skill_names=list(case.get("baseline_skill_names") or []),
pinned_skill_contexts=[],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
)
)
)
surrogate = await self.surrogate_evaluator.evaluate(
task_text=str(case["task_text"]),
baseline=baseline,
candidate=candidate_arm,
)
baseline_ability = _ability_score(
case=case,
arm=baseline,
arm_name="baseline",
)
candidate_ability = _ability_score(
case=case,
arm=candidate_arm,
arm_name="candidate",
)
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tool_execution_score = {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
}
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": tool_execution_score,
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
"confidence": surrogate["confidence"],
"tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
"artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
"validator_notes": list(surrogate.get("notes") or []),
}
case_reports.append(case_report)
legacy_cases.append(
{
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
)
await mark_progress(case_completed=False)
candidate_arm = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:candidate",
arm="candidate",
task_text=str(case["task_text"]),
pinned_skill_names=[],
pinned_skill_contexts=[_draft_skill_context(draft)],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
)
)
await mark_progress(case_completed=True)
surrogate = await self.surrogate_evaluator.evaluate(
task_text=str(case["task_text"]),
baseline=baseline,
candidate=candidate_arm,
)
return _build_replay_case_reports(case, baseline, candidate_arm, surrogate)
results = await asyncio.gather(*(evaluate_case(case) for case in replay_cases))
case_reports = [case_report for case_report, _ in results]
legacy_cases = [legacy_case for _, legacy_case in results]
preservation_report = _preservation_report(candidate, draft)
return _report_from_case_reports(
candidate,
@ -248,6 +236,83 @@ class SkillDraftEvaluator:
)
def _build_replay_case_reports(
case: dict[str, Any],
baseline: dict[str, Any],
candidate_arm: dict[str, Any],
surrogate: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any]]:
baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline")
candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate")
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tier = case.get("tier") or ("bronze" if case.get("synthetic") else "gold")
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": tier,
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
},
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
"confidence": surrogate["confidence"],
"tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
"artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
"validator_notes": list(surrogate.get("notes") or []),
}
return case_report, {
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": tier,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
def _report_progress(
callback: Callable[[dict[str, Any]], None] | None,
*,
completed_arms: int,
total_arms: int,
completed_cases: int,
total_cases: int,
) -> None:
if callback is None:
return
callback(
{
"phase": "replaying",
"completed_arms": completed_arms,
"total_arms": total_arms,
"completed_cases": completed_cases,
"total_cases": total_cases,
}
)
def _score_from_validation(validation: dict | None, success: bool) -> float:
if isinstance(validation, dict) and "score" in validation:
try: