```
feat(engine): 优化智能体循环中的助手消息处理逻辑 - 在没有工具调用时才添加助手消息到上下文 - 确保工具调用响应正确添加到消息上下文中 - 修复了消息构建的条件逻辑 fix(cron): 改进定时任务调度的时间解析功能 - 添加正则表达式导入用于时间显示解析 - 实现从显示文本中提取毫秒间隔的功能 - 增强整数转换的安全性,避免类型错误 - 优化定时任务配置的解析逻辑 feat(outlook): 增强Outlook集成的功能和稳定性 - 将默认超时时间从10秒增加到180秒 - 为状态检查函数添加可选的验证参数 - 串行执行邮件概览获取操作而非并行 - 改进连接状态验证逻辑 feat(channel): 添加设备名称作为会话标识的选项 - 为终端WebSocket适配器添加新的配置选项 - 实现基于设备名称生成会话对等ID的功能 - 记录原始对等ID和设备名称的元数据 - 支持从设备名称创建会话对等ID feat(skills): 完善技能学习评估系统和进度跟踪 - 在应用启动时自动调度待评估的技能草稿 - 为技能评估工作创建独立的循环工厂 - 实现异步技能评估任务的取消和清理机制 - 添加技能评估进度报告和状态跟踪功能 - 扩展会话列表API以包含更多详细信息 - 防止对不存在的会话进行操作 - 优化技能草稿提交和评估的业务逻辑 perf(skills): 提升技能评估的并发性能 - 实现并行技能案例评估以提高效率 - 添加最大并行案例数的环境变量控制 - 实现实时评估进度更新和回调机制 - 优化评估过程中的资源管理和同步 refactor(services): 创建隔离的智能体循环实例 - 添加创建独立智能体循环的工厂方法 - 确保新循环继承运行时服务配置 - 支持技能评估等需要隔离环境的场景 ```
This commit is contained in:
@ -2,8 +2,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any
|
||||
import os
|
||||
from typing import Any, Callable
|
||||
from uuid import uuid4
|
||||
|
||||
from beaver.engine.context import SkillContext
|
||||
@ -25,9 +27,17 @@ class SkillDraftEvaluator:
|
||||
run_store: RunMemoryStore,
|
||||
*,
|
||||
surrogate_evaluator: SurrogateToolEvaluator | None = None,
|
||||
max_parallel_cases: int | None = None,
|
||||
) -> None:
|
||||
self.run_store = run_store
|
||||
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
|
||||
configured_parallelism = max_parallel_cases
|
||||
if configured_parallelism is None:
|
||||
try:
|
||||
configured_parallelism = int(os.getenv("BEAVER_SKILL_EVAL_MAX_PARALLEL_CASES", "3") or "3")
|
||||
except ValueError:
|
||||
configured_parallelism = 3
|
||||
self.max_parallel_cases = max(1, configured_parallelism)
|
||||
|
||||
async def evaluate(
|
||||
self,
|
||||
@ -36,6 +46,7 @@ class SkillDraftEvaluator:
|
||||
draft: SkillDraft,
|
||||
provider_bundle: ProviderBundle | None,
|
||||
replay_runner: ReplayRunner | None = None,
|
||||
progress_callback: Callable[[dict[str, Any]], None] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
if provider_bundle is None or provider_bundle.main_provider is None:
|
||||
return self._skipped(candidate, draft)
|
||||
@ -59,6 +70,7 @@ class SkillDraftEvaluator:
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=replay_runner,
|
||||
case_selection_meta=case_selection_meta,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
return self._evaluate_heuristic(candidate, draft, runs)
|
||||
|
||||
@ -129,96 +141,72 @@ class SkillDraftEvaluator:
|
||||
provider_bundle: ProviderBundle,
|
||||
replay_runner: ReplayRunner,
|
||||
case_selection_meta: dict[str, Any] | None = None,
|
||||
progress_callback: Callable[[dict[str, Any]], None] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
case_reports: list[dict] = []
|
||||
legacy_cases: list[dict] = []
|
||||
for case in replay_cases:
|
||||
baseline = await replay_runner.run_arm(
|
||||
ReplayArmRequest(
|
||||
case_id=f"{case['run_id']}:baseline",
|
||||
arm="baseline",
|
||||
task_text=str(case["task_text"]),
|
||||
pinned_skill_names=list(case.get("baseline_skill_names") or []),
|
||||
pinned_skill_contexts=[],
|
||||
provider_bundle=provider_bundle,
|
||||
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
|
||||
total_cases = len(replay_cases)
|
||||
total_arms = total_cases * 2
|
||||
completed_arms = 0
|
||||
completed_cases = 0
|
||||
progress_lock = asyncio.Lock()
|
||||
semaphore = asyncio.Semaphore(self.max_parallel_cases)
|
||||
_report_progress(
|
||||
progress_callback,
|
||||
completed_arms=completed_arms,
|
||||
total_arms=total_arms,
|
||||
completed_cases=0,
|
||||
total_cases=total_cases,
|
||||
)
|
||||
|
||||
async def mark_progress(*, case_completed: bool) -> None:
|
||||
nonlocal completed_arms, completed_cases
|
||||
async with progress_lock:
|
||||
completed_arms += 1
|
||||
if case_completed:
|
||||
completed_cases += 1
|
||||
_report_progress(
|
||||
progress_callback,
|
||||
completed_arms=completed_arms,
|
||||
total_arms=total_arms,
|
||||
completed_cases=completed_cases,
|
||||
total_cases=total_cases,
|
||||
)
|
||||
)
|
||||
candidate_arm = await replay_runner.run_arm(
|
||||
ReplayArmRequest(
|
||||
case_id=f"{case['run_id']}:candidate",
|
||||
arm="candidate",
|
||||
task_text=str(case["task_text"]),
|
||||
pinned_skill_names=[],
|
||||
pinned_skill_contexts=[_draft_skill_context(draft)],
|
||||
provider_bundle=provider_bundle,
|
||||
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
|
||||
|
||||
async def evaluate_case(case: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
|
||||
async with semaphore:
|
||||
baseline = await replay_runner.run_arm(
|
||||
ReplayArmRequest(
|
||||
case_id=f"{case['run_id']}:baseline",
|
||||
arm="baseline",
|
||||
task_text=str(case["task_text"]),
|
||||
pinned_skill_names=list(case.get("baseline_skill_names") or []),
|
||||
pinned_skill_contexts=[],
|
||||
provider_bundle=provider_bundle,
|
||||
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
|
||||
)
|
||||
)
|
||||
)
|
||||
surrogate = await self.surrogate_evaluator.evaluate(
|
||||
task_text=str(case["task_text"]),
|
||||
baseline=baseline,
|
||||
candidate=candidate_arm,
|
||||
)
|
||||
baseline_ability = _ability_score(
|
||||
case=case,
|
||||
arm=baseline,
|
||||
arm_name="baseline",
|
||||
)
|
||||
candidate_ability = _ability_score(
|
||||
case=case,
|
||||
arm=candidate_arm,
|
||||
arm_name="candidate",
|
||||
)
|
||||
baseline_score = baseline_ability["final_score"]
|
||||
candidate_score = candidate_ability["final_score"]
|
||||
tool_execution_score = {
|
||||
"baseline_score": surrogate["baseline_score"],
|
||||
"candidate_score": surrogate["candidate_score"],
|
||||
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
|
||||
"score_role": "diagnostic_only",
|
||||
}
|
||||
case_report = {
|
||||
"run_id": case["run_id"],
|
||||
"task_id": case.get("task_id"),
|
||||
"session_id": case.get("session_id"),
|
||||
"task_text": case.get("task_text"),
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
|
||||
"validator": case.get("validator"),
|
||||
"baseline": baseline,
|
||||
"candidate": candidate_arm,
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
"ability_score": {
|
||||
"baseline": baseline_ability,
|
||||
"candidate": candidate_ability,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
},
|
||||
"tool_execution_score": tool_execution_score,
|
||||
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
|
||||
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
|
||||
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
|
||||
"confidence": surrogate["confidence"],
|
||||
"tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
|
||||
"artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
|
||||
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
|
||||
"validator_notes": list(surrogate.get("notes") or []),
|
||||
}
|
||||
case_reports.append(case_report)
|
||||
legacy_cases.append(
|
||||
{
|
||||
"run_id": case["run_id"],
|
||||
"session_id": case.get("session_id") or "",
|
||||
"task_text": case.get("task_text") or "",
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
}
|
||||
)
|
||||
await mark_progress(case_completed=False)
|
||||
candidate_arm = await replay_runner.run_arm(
|
||||
ReplayArmRequest(
|
||||
case_id=f"{case['run_id']}:candidate",
|
||||
arm="candidate",
|
||||
task_text=str(case["task_text"]),
|
||||
pinned_skill_names=[],
|
||||
pinned_skill_contexts=[_draft_skill_context(draft)],
|
||||
provider_bundle=provider_bundle,
|
||||
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
|
||||
)
|
||||
)
|
||||
await mark_progress(case_completed=True)
|
||||
surrogate = await self.surrogate_evaluator.evaluate(
|
||||
task_text=str(case["task_text"]),
|
||||
baseline=baseline,
|
||||
candidate=candidate_arm,
|
||||
)
|
||||
return _build_replay_case_reports(case, baseline, candidate_arm, surrogate)
|
||||
|
||||
results = await asyncio.gather(*(evaluate_case(case) for case in replay_cases))
|
||||
case_reports = [case_report for case_report, _ in results]
|
||||
legacy_cases = [legacy_case for _, legacy_case in results]
|
||||
preservation_report = _preservation_report(candidate, draft)
|
||||
return _report_from_case_reports(
|
||||
candidate,
|
||||
@ -248,6 +236,83 @@ class SkillDraftEvaluator:
|
||||
)
|
||||
|
||||
|
||||
def _build_replay_case_reports(
|
||||
case: dict[str, Any],
|
||||
baseline: dict[str, Any],
|
||||
candidate_arm: dict[str, Any],
|
||||
surrogate: dict[str, Any],
|
||||
) -> tuple[dict[str, Any], dict[str, Any]]:
|
||||
baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline")
|
||||
candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate")
|
||||
baseline_score = baseline_ability["final_score"]
|
||||
candidate_score = candidate_ability["final_score"]
|
||||
tier = case.get("tier") or ("bronze" if case.get("synthetic") else "gold")
|
||||
case_report = {
|
||||
"run_id": case["run_id"],
|
||||
"task_id": case.get("task_id"),
|
||||
"session_id": case.get("session_id"),
|
||||
"task_text": case.get("task_text"),
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": tier,
|
||||
"validator": case.get("validator"),
|
||||
"baseline": baseline,
|
||||
"candidate": candidate_arm,
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
"ability_score": {
|
||||
"baseline": baseline_ability,
|
||||
"candidate": candidate_ability,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
},
|
||||
"tool_execution_score": {
|
||||
"baseline_score": surrogate["baseline_score"],
|
||||
"candidate_score": surrogate["candidate_score"],
|
||||
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
|
||||
"score_role": "diagnostic_only",
|
||||
},
|
||||
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
|
||||
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
|
||||
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
|
||||
"confidence": surrogate["confidence"],
|
||||
"tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
|
||||
"artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
|
||||
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
|
||||
"validator_notes": list(surrogate.get("notes") or []),
|
||||
}
|
||||
return case_report, {
|
||||
"run_id": case["run_id"],
|
||||
"session_id": case.get("session_id") or "",
|
||||
"task_text": case.get("task_text") or "",
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": tier,
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
}
|
||||
|
||||
|
||||
def _report_progress(
|
||||
callback: Callable[[dict[str, Any]], None] | None,
|
||||
*,
|
||||
completed_arms: int,
|
||||
total_arms: int,
|
||||
completed_cases: int,
|
||||
total_cases: int,
|
||||
) -> None:
|
||||
if callback is None:
|
||||
return
|
||||
callback(
|
||||
{
|
||||
"phase": "replaying",
|
||||
"completed_arms": completed_arms,
|
||||
"total_arms": total_arms,
|
||||
"completed_cases": completed_cases,
|
||||
"total_cases": total_cases,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _score_from_validation(validation: dict | None, success: bool) -> float:
|
||||
if isinstance(validation, dict) and "score" in validation:
|
||||
try:
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
from typing import Any, Callable
|
||||
|
||||
from beaver.engine.providers import ProviderBundle
|
||||
from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore
|
||||
@ -174,12 +174,20 @@ class SkillLearningPipelineService:
|
||||
safety = self.get_safety_report(skill_name, draft_id)
|
||||
if safety is not None and (not safety.passed or safety.risk_level == "critical"):
|
||||
raise ValueError("Draft cannot enter review because safety check failed")
|
||||
return self.review_service.submit_for_review(
|
||||
review = self.review_service.submit_for_review(
|
||||
skill_name,
|
||||
draft_id,
|
||||
reviewer_request=notes,
|
||||
requested_by=requested_by,
|
||||
)
|
||||
self._mark_candidate_by_draft(
|
||||
skill_name,
|
||||
draft_id,
|
||||
"review_pending",
|
||||
"review_submitted",
|
||||
last_error=None,
|
||||
)
|
||||
return review
|
||||
|
||||
def approve(
|
||||
self,
|
||||
@ -258,9 +266,13 @@ class SkillLearningPipelineService:
|
||||
draft = self.get_draft(skill_name, draft_id)
|
||||
report = self.safety_checker.check(draft)
|
||||
self.learning_store.write_safety_report(report)
|
||||
status = "safety_failed" if not report.passed or report.risk_level == "critical" else "draft_ready"
|
||||
status = (
|
||||
"safety_failed"
|
||||
if not report.passed or report.risk_level == "critical"
|
||||
else self._candidate_status_for_draft(draft)
|
||||
)
|
||||
current = self._candidate_by_draft(skill_name, draft_id)
|
||||
if current is not None and current.status == "eval_failed" and status == "draft_ready":
|
||||
if current is not None and current.status == "eval_failed" and status != "safety_failed":
|
||||
status = "eval_failed"
|
||||
self._mark_candidate_by_draft(
|
||||
skill_name,
|
||||
@ -287,6 +299,7 @@ class SkillLearningPipelineService:
|
||||
*,
|
||||
provider_bundle: ProviderBundle | None,
|
||||
replay_runner: ReplayRunner | None = None,
|
||||
progress_callback: Callable[[dict[str, Any]], None] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
draft = self.get_draft(skill_name, draft_id)
|
||||
candidate = self.get_candidate(candidate_id)
|
||||
@ -296,13 +309,14 @@ class SkillLearningPipelineService:
|
||||
draft=draft,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=replay_runner,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
self.learning_store.write_eval_report(report)
|
||||
if report.status == "skipped_provider_unavailable":
|
||||
status = "draft_ready"
|
||||
status = self._candidate_status_for_draft(draft)
|
||||
error = "eval skipped: provider unavailable"
|
||||
elif report.passed:
|
||||
status = "draft_ready"
|
||||
status = self._candidate_status_for_draft(draft)
|
||||
error = None
|
||||
else:
|
||||
status = "eval_failed"
|
||||
@ -316,11 +330,43 @@ class SkillLearningPipelineService:
|
||||
status,
|
||||
event_type="eval_completed",
|
||||
eval_report_id=report.report_id,
|
||||
eval_progress={
|
||||
"phase": "completed",
|
||||
"completed_arms": len(report.cases) * 2 if report.mode == "replay" else 0,
|
||||
"total_arms": len(report.cases) * 2 if report.mode == "replay" else 0,
|
||||
"completed_cases": len(report.cases),
|
||||
"total_cases": len(report.cases),
|
||||
},
|
||||
last_error=error,
|
||||
payload=report.to_dict(),
|
||||
)
|
||||
return report
|
||||
|
||||
def mark_eval_progress(self, candidate_id: str, progress: dict[str, Any]) -> SkillLearningCandidate:
|
||||
return self._require_updated(
|
||||
self.learning_store.update_learning_candidate(
|
||||
candidate_id,
|
||||
eval_progress=dict(progress),
|
||||
),
|
||||
candidate_id,
|
||||
)
|
||||
|
||||
def mark_eval_failed(self, candidate_id: str, error: str) -> SkillLearningCandidate:
|
||||
candidate = self.get_candidate(candidate_id)
|
||||
progress = dict(candidate.eval_progress)
|
||||
progress["phase"] = "failed"
|
||||
return self._require_updated(
|
||||
self.learning_store.transition_learning_candidate(
|
||||
candidate_id,
|
||||
"eval_failed",
|
||||
eval_progress=progress,
|
||||
event_type="eval_failed",
|
||||
last_error=error,
|
||||
payload={"error": error},
|
||||
),
|
||||
candidate_id,
|
||||
)
|
||||
|
||||
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
|
||||
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
|
||||
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
|
||||
@ -372,6 +418,14 @@ class SkillLearningPipelineService:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _candidate_status_for_draft(draft: SkillDraft) -> str:
|
||||
if draft.status == SkillReviewState.APPROVED.value:
|
||||
return "approved"
|
||||
if draft.status == SkillReviewState.IN_REVIEW.value:
|
||||
return "review_pending"
|
||||
return "draft_ready"
|
||||
|
||||
@staticmethod
|
||||
def _require_updated(candidate: SkillLearningCandidate | None, candidate_id: str) -> SkillLearningCandidate:
|
||||
if candidate is None:
|
||||
|
||||
@ -3,7 +3,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Literal
|
||||
from time import perf_counter
|
||||
from typing import Any, Callable, Literal
|
||||
from uuid import uuid4
|
||||
|
||||
from beaver.tools.base import ToolContext, ToolResult, ToolSpec
|
||||
@ -59,6 +60,7 @@ class ReplayToolExecutor:
|
||||
*,
|
||||
context: ToolContext | None = None,
|
||||
) -> ToolResult:
|
||||
started_at = perf_counter()
|
||||
tool = self.registry.get(tool_name)
|
||||
spec = tool.spec if tool is not None else ToolSpec(
|
||||
name=tool_name,
|
||||
@ -84,6 +86,7 @@ class ReplayToolExecutor:
|
||||
"error": result.error,
|
||||
"content": result.content[:2000],
|
||||
}
|
||||
trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
|
||||
self.traces.append(trace)
|
||||
return result
|
||||
if mode == "surrogate":
|
||||
@ -92,6 +95,7 @@ class ReplayToolExecutor:
|
||||
"error": "replay_surrogate",
|
||||
"content": "Tool call recorded for surrogate evaluation.",
|
||||
}
|
||||
trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
|
||||
self.traces.append(trace)
|
||||
return ToolResult(
|
||||
success=True,
|
||||
@ -105,6 +109,7 @@ class ReplayToolExecutor:
|
||||
"error": "replay_blocked",
|
||||
"content": "Tool call blocked by replay policy.",
|
||||
}
|
||||
trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2)
|
||||
self.traces.append(trace)
|
||||
return ToolResult(
|
||||
success=False,
|
||||
@ -151,12 +156,20 @@ class ReplayArmRequest:
|
||||
|
||||
|
||||
class ReplayRunner:
|
||||
def __init__(self, *, agent_loop: Any, policy: ReplayToolPolicy | None = None) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
agent_loop: Any,
|
||||
policy: ReplayToolPolicy | None = None,
|
||||
isolated_loop_factory: Callable[[], Any] | None = None,
|
||||
) -> None:
|
||||
self.agent_loop = agent_loop
|
||||
self.policy = policy or ReplayToolPolicy()
|
||||
self.isolated_loop_factory = isolated_loop_factory
|
||||
|
||||
async def run_arm(self, request: ReplayArmRequest) -> dict[str, Any]:
|
||||
loaded = self.agent_loop.boot()
|
||||
target_loop = self.isolated_loop_factory() if self.isolated_loop_factory is not None else self.agent_loop
|
||||
loaded = target_loop.boot()
|
||||
replay_executor = ReplayToolExecutor(
|
||||
loaded.tool_executor,
|
||||
registry=loaded.tool_registry,
|
||||
@ -174,23 +187,42 @@ class ReplayRunner:
|
||||
"tool_executor_override": replay_executor,
|
||||
}
|
||||
try:
|
||||
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
|
||||
except RuntimeError as exc:
|
||||
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
|
||||
raise
|
||||
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
|
||||
return {
|
||||
"case_id": request.case_id,
|
||||
"arm": request.arm,
|
||||
"session_id": result.session_id,
|
||||
"run_id": result.run_id,
|
||||
"task_text": request.task_text,
|
||||
"finish_reason": result.finish_reason,
|
||||
"final_answer": result.output_text,
|
||||
"tool_calls": list(replay_executor.traces),
|
||||
"artifacts": [],
|
||||
"side_effects": _side_effects_from_traces(replay_executor.traces),
|
||||
}
|
||||
try:
|
||||
result = await target_loop.process_direct(request.task_text, **direct_kwargs)
|
||||
except RuntimeError as exc:
|
||||
if not _is_process_direct_disabled_while_running(exc) or not hasattr(target_loop, "submit_direct"):
|
||||
raise
|
||||
result = await target_loop.submit_direct(request.task_text, **direct_kwargs)
|
||||
session_manager = getattr(loaded, "session_manager", None)
|
||||
if session_manager is not None and hasattr(session_manager, "end_session"):
|
||||
session_manager.end_session(result.session_id, "evaluation_complete")
|
||||
return {
|
||||
"case_id": request.case_id,
|
||||
"arm": request.arm,
|
||||
"session_id": result.session_id,
|
||||
"run_id": result.run_id,
|
||||
"task_text": request.task_text,
|
||||
"finish_reason": result.finish_reason,
|
||||
"final_answer": result.output_text,
|
||||
"tool_calls": list(replay_executor.traces),
|
||||
"artifacts": [],
|
||||
"side_effects": _side_effects_from_traces(replay_executor.traces),
|
||||
}
|
||||
finally:
|
||||
if target_loop is not self.agent_loop and hasattr(target_loop, "close"):
|
||||
mcp_manager = getattr(loaded, "mcp_manager", None)
|
||||
if mcp_manager is not None and hasattr(mcp_manager, "close"):
|
||||
try:
|
||||
await mcp_manager.close()
|
||||
finally:
|
||||
closeables = getattr(loaded, "closeables", None)
|
||||
if isinstance(closeables, list):
|
||||
loaded.closeables = [
|
||||
(name, close_fn)
|
||||
for name, close_fn in closeables
|
||||
if name != "mcp_manager"
|
||||
]
|
||||
target_loop.close()
|
||||
|
||||
|
||||
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user