```

feat(engine): 优化智能体循环中的助手消息处理逻辑
- 在没有工具调用时才添加助手消息到上下文
- 确保工具调用响应正确添加到消息上下文中
- 修复了消息构建的条件逻辑

fix(cron): 改进定时任务调度的时间解析功能
- 添加正则表达式导入用于时间显示解析
- 实现从显示文本中提取毫秒间隔的功能
- 增强整数转换的安全性，避免类型错误
- 优化定时任务配置的解析逻辑

feat(outlook): 增强Outlook集成的功能和稳定性
- 将默认超时时间从10秒增加到180秒
- 为状态检查函数添加可选的验证参数
- 串行执行邮件概览获取操作而非并行
- 改进连接状态验证逻辑

feat(channel): 添加设备名称作为会话标识的选项
- 为终端WebSocket适配器添加新的配置选项
- 实现基于设备名称生成会话对等ID的功能
- 记录原始对等ID和设备名称的元数据
- 支持从设备名称创建会话对等ID

feat(skills): 完善技能学习评估系统和进度跟踪
- 在应用启动时自动调度待评估的技能草稿
- 为技能评估工作创建独立的循环工厂
- 实现异步技能评估任务的取消和清理机制
- 添加技能评估进度报告和状态跟踪功能
- 扩展会话列表API以包含更多详细信息
- 防止对不存在的会话进行操作
- 优化技能草稿提交和评估的业务逻辑

perf(skills): 提升技能评估的并发性能
- 实现并行技能案例评估以提高效率
- 添加最大并行案例数的环境变量控制
- 实现实时评估进度更新和回调机制
- 优化评估过程中的资源管理和同步

refactor(services): 创建隔离的智能体循环实例
- 添加创建独立智能体循环的工厂方法
- 确保新循环继承运行时服务配置
- 支持技能评估等需要隔离环境的场景
```
2026-06-15 14:48:16 +08:00
parent 8aeb97a5fc
commit 4b0bf65ace
53 changed files with 4328 additions and 292 deletions
--- a/app-instance/backend/tests/unit/test_cron_service.py
+++ b/app-instance/backend/tests/unit/test_cron_service.py
@ -29,6 +29,18 @@ def test_schedule_from_frontend_payload() -> None:
    assert cron.kind == "cron"


+def test_legacy_interval_schedule_recovers_duration_from_display() -> None:
+    schedule = CronSchedule.from_dict(
+        {
+            "kind": "every",
+            "every_ms": None,
+            "display": "every 1800s",
+        }
+    )
+
+    assert schedule.every_ms == 30 * 60 * 1000
+
+
 def test_compute_next_run_skips_missed_interval() -> None:
    schedule = CronSchedule(kind="every", every_ms=60_000)
    assert compute_next_run(schedule, now_ms=1_000_000, last_run_at_ms=0) > 1_000_000
@ -80,6 +92,22 @@ def test_manual_run_records_scheduled_run_output(tmp_path) -> None:
    assert updated.to_api_dict()["last_scheduled_run_id"] == run.scheduled_run_id


+def test_persisted_interval_job_keeps_schedule_and_next_run(tmp_path) -> None:
+    store_path = tmp_path / "jobs.json"
+    service = CronService(store_path)
+    job = service.add_job(
+        name="Hydration reminder",
+        message="Drink water",
+        schedule=CronSchedule(kind="every", every_ms=30 * 60 * 1000),
+    )
+
+    reloaded = CronService(store_path).get_job(job.id)
+
+    assert reloaded is not None
+    assert reloaded.schedule.every_ms == 30 * 60 * 1000
+    assert reloaded.next_run_at_ms == job.next_run_at_ms
+
+
 def test_cron_tool_uses_runtime_service(tmp_path) -> None:
    service = CronService(tmp_path / "jobs.json")
    tool = CronTool()
--- a/app-instance/backend/tests/unit/test_outlook_integration.py
+++ b/app-instance/backend/tests/unit/test_outlook_integration.py
@ -0,0 +1,71 @@
+import asyncio
+
+import pytest
+
+from beaver.foundation.config.schema import AuthzConfig, BackendIdentityConfig, BeaverConfig
+from beaver.integrations import outlook
+
+
+class _FakeAuthzClient:
+    async def get_outlook_settings(self, backend_id: str) -> dict:
+        assert backend_id == "steven"
+        return {
+            "configured": True,
+            "email": "steven.yx.li@boardware.com",
+            "server": "mail.boardware.com.mo",
+        }
+
+
+def _authz_config() -> BeaverConfig:
+    return BeaverConfig(
+        authz=AuthzConfig(
+            enabled=True,
+            base_url="http://authz.example",
+            outlook_mcp_url="http://outlook-mcp.example/mcp",
+        ),
+        backend_identity=BackendIdentityConfig(
+            backend_id="steven",
+            client_id="steven",
+            client_secret="secret",
+        ),
+    )
+
+
+def test_outlook_status_does_not_probe_mcp_by_default(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None:
+    monkeypatch.setattr(outlook, "_authz_client", lambda _config: _FakeAuthzClient())
+
+    async def fail_if_called(*_args, **_kwargs):
+        raise AssertionError("status should not call Outlook MCP by default")
+
+    monkeypatch.setattr(outlook, "_call_outlook_mcp_tool", fail_if_called)
+
+    result = asyncio.run(outlook.outlook_status(_authz_config(), tmp_path))
+
+    assert result["configured"] is True
+    assert result["connected"] is False
+    assert result["auth_status"] is None
+    assert result["error"] is None
+
+
+def test_outlook_overview_loads_sections_serially(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None:
+    monkeypatch.setattr(outlook, "_authz_client", lambda _config: _FakeAuthzClient())
+    active_calls = 0
+    max_active_calls = 0
+    tool_names: list[str] = []
+
+    async def fake_call(_config, tool_name: str, _arguments, **_kwargs):
+        nonlocal active_calls, max_active_calls
+        tool_names.append(tool_name)
+        active_calls += 1
+        max_active_calls = max(max_active_calls, active_calls)
+        await asyncio.sleep(0.01)
+        active_calls -= 1
+        return {"value": []}
+
+    monkeypatch.setattr(outlook, "_call_outlook_mcp_tool", fake_call)
+
+    result = asyncio.run(outlook.get_overview(_authz_config(), tmp_path))
+
+    assert result["warnings"] == []
+    assert tool_names == ["mail_list_messages", "mail_list_messages", "calendar_list_events"]
+    assert max_active_calls == 1
--- a/app-instance/backend/tests/unit/test_phase5_skills_runtime.py
+++ b/app-instance/backend/tests/unit/test_phase5_skills_runtime.py
@ -27,6 +27,7 @@ class StubProvider(LLMProvider):
    def __init__(self, responses: list[LLMResponse]) -> None:
        super().__init__()
        self._responses = list(responses)
+        self.calls: list[dict] = []

    async def chat(
        self,
@ -37,6 +38,16 @@ class StubProvider(LLMProvider):
        temperature: float = 0.7,
        thinking_enabled: bool | None = None,
    ) -> LLMResponse:
+        self.calls.append(
+            {
+                "messages": messages,
+                "tools": tools,
+                "model": model,
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "thinking_enabled": thinking_enabled,
+            }
+        )
        if not self._responses:
            raise AssertionError("No stubbed provider responses left")
        return self._responses.pop(0)
@ -704,32 +715,33 @@ def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path:
        skill_assembler=StubSkillAssembler([skill]),
    )
    loop = AgentLoop(loader=loader)
+    provider = StubProvider(
+        [
+            LLMResponse(
+                content="Need a tool.",
+                finish_reason="tool_calls",
+                tool_calls=[_tool_call()],
+                provider_name="stub",
+                model="stub-model",
+            ),
+            LLMResponse(
+                content="Need another tool.",
+                finish_reason="tool_calls",
+                tool_calls=[_tool_call(call_id="call-2")],
+                provider_name="stub",
+                model="stub-model",
+            ),
+            LLMResponse(
+                content="Based on the available tool result, the container likely failed during startup.",
+                finish_reason="stop",
+                provider_name="stub",
+                model="stub-model",
+            ),
+        ]
+    )
    bundle = ProviderBundle(
        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
-        main_provider=StubProvider(
-            [
-                LLMResponse(
-                    content="Need a tool.",
-                    finish_reason="tool_calls",
-                    tool_calls=[_tool_call()],
-                    provider_name="stub",
-                    model="stub-model",
-                ),
-                LLMResponse(
-                    content="Need another tool.",
-                    finish_reason="tool_calls",
-                    tool_calls=[_tool_call(call_id="call-2")],
-                    provider_name="stub",
-                    model="stub-model",
-                ),
-                LLMResponse(
-                    content="Based on the available tool result, the container likely failed during startup.",
-                    finish_reason="stop",
-                    provider_name="stub",
-                    model="stub-model",
-                ),
-            ]
-        ),
+        main_provider=provider,
    )

    result = asyncio.run(
@ -744,6 +756,21 @@ def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path:
    assert result.finish_reason == "max_tool_iterations_finalized"
    assert "Based on the available tool result" in result.output_text
    assert "Tool loop stopped" not in result.output_text
+    finalization_messages = provider.calls[-1]["messages"]
+    assistant_tool_call_ids = [
+        call["id"]
+        for message in finalization_messages
+        for call in message.get("tool_calls", [])
+        if message.get("role") == "assistant"
+    ]
+    tool_result_ids = [
+        message.get("tool_call_id")
+        for message in finalization_messages
+        if message.get("role") == "tool"
+    ]
+    assert "call-1" in assistant_tool_call_ids
+    assert "call-2" not in assistant_tool_call_ids
+    assert set(assistant_tool_call_ids).issubset(set(tool_result_ids))
    effect_records = loaded.run_memory_store.list_skill_effects("docker-debug", version="v0007")
    assert effect_records[-1].run_id == result.run_id
    assert effect_records[-1].success is False
--- a/app-instance/backend/tests/unit/test_session_archive.py
+++ b/app-instance/backend/tests/unit/test_session_archive.py
@ -105,3 +105,29 @@ def test_web_archive_route_does_not_create_archive_suffix_session(tmp_path: Path
    assert loaded.session_manager.get_session("web:alpha")["end_reason"] == "archived"  # type: ignore[union-attr]
    assert loaded.session_manager.get_session("web:alpha/archive") is None  # type: ignore[union-attr]
    assert sessions_response.json() == []
+
+
+def test_web_session_list_hides_skill_replay_evaluation_sessions(tmp_path: Path) -> None:
+    service = AgentService(workspace=tmp_path)
+    loaded = service.create_loop().boot()
+    loaded.session_manager.ensure_session("eval-session", source="skill_replay_eval")  # type: ignore[union-attr]
+    loaded.session_manager.ensure_session("web:visible", source="web")  # type: ignore[union-attr]
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        response = client.get("/api/sessions")
+
+    assert response.status_code == 200
+    assert [item["key"] for item in response.json()] == ["web:visible"]
+
+
+def test_get_missing_session_returns_404_without_creating_it(tmp_path: Path) -> None:
+    service = AgentService(workspace=tmp_path)
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        response = client.get("/api/sessions/missing-session")
+
+    assert response.status_code == 404
+    loaded = service.create_loop().boot()
+    assert loaded.session_manager.get_session("missing-session") is None  # type: ignore[union-attr]
--- a/app-instance/backend/tests/unit/test_skill_learning_eval.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_eval.py
@ -201,6 +201,22 @@ class FakeReplayRunner:
        }


+class ConcurrentReplayRunner(FakeReplayRunner):
+    def __init__(self) -> None:
+        super().__init__()
+        self.active = 0
+        self.max_active = 0
+
+    async def run_arm(self, request):
+        self.active += 1
+        self.max_active = max(self.max_active, self.active)
+        await asyncio.sleep(0.02)
+        try:
+            return await super().run_arm(request)
+        finally:
+            self.active -= 1
+
+
 def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    pipeline = _pipeline(tmp_path)
    draft = pipeline.draft_service.create_new_skill_draft(
@ -238,6 +254,94 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    assert report.tool_execution_summary["score_role"] == "diagnostic_only"


+def test_replay_eval_reports_arm_progress(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="release-checklist",
+        proposed_content="# Release\n\nRun tests.",
+        proposed_frontmatter={"description": "release", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.update_learning_candidate(
+        "candidate-1",
+        draft_skill_name=draft.skill_name,
+        draft_id=draft.draft_id,
+    )
+    progress: list[dict] = []
+
+    asyncio.run(
+        pipeline.evaluate_draft(
+            "candidate-1",
+            draft.skill_name,
+            draft.draft_id,
+            provider_bundle=_bundle(),
+            replay_runner=FakeReplayRunner(),
+            progress_callback=progress.append,
+        )
+    )
+
+    assert progress[0] == {
+        "phase": "replaying",
+        "completed_arms": 0,
+        "total_arms": 20,
+        "completed_cases": 0,
+        "total_cases": 10,
+    }
+    assert progress[-1] == {
+        "phase": "replaying",
+        "completed_arms": 20,
+        "total_arms": 20,
+        "completed_cases": 10,
+        "total_cases": 10,
+    }
+
+
+def test_replay_eval_runs_cases_with_bounded_parallelism(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    pipeline.evaluator = SkillDraftEvaluator(
+        pipeline.learning_service.run_store,
+        max_parallel_cases=2,
+    )
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="release-checklist",
+        proposed_content="# Release\n\nRun tests.",
+        proposed_frontmatter={"description": "release", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.update_learning_candidate(
+        "candidate-1",
+        draft_skill_name=draft.skill_name,
+        draft_id=draft.draft_id,
+    )
+    replay_runner = ConcurrentReplayRunner()
+
+    report = asyncio.run(
+        pipeline.evaluate_draft(
+            "candidate-1",
+            draft.skill_name,
+            draft.draft_id,
+            provider_bundle=_bundle(),
+            replay_runner=replay_runner,
+        )
+    )
+
+    assert replay_runner.max_active == 2
+    assert [case["run_id"] for case in report.cases] == [
+        "run-1",
+        "synthetic:candidate-1:01",
+        "synthetic:candidate-1:02",
+        "synthetic:candidate-1:03",
+        "synthetic:candidate-1:04",
+        "synthetic:candidate-1:05",
+        "synthetic:candidate-1:06",
+        "synthetic:candidate-1:07",
+        "synthetic:candidate-1:08",
+        "synthetic:candidate-1:09",
+    ]
+
+
 def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
    pipeline = _pipeline(tmp_path)
    pipeline.learning_store.update_learning_candidate(
--- a/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
@ -98,6 +98,27 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
        pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")


+def test_safety_recheck_keeps_submitted_candidate_in_review(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="reviewed-skill",
+        proposed_content="# Reviewed Skill\n\nDo the thing.",
+        proposed_frontmatter={"description": "reviewed"},
+        created_by="test",
+        reason="test",
+    )
+    candidate = pipeline.get_candidate("candidate-1")
+    candidate.draft_skill_name = draft.skill_name
+    candidate.draft_id = draft.draft_id
+    pipeline.learning_store.record_learning_candidate(candidate)
+
+    pipeline.check_safety(draft.skill_name, draft.draft_id)
+    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
+    pipeline.check_safety(draft.skill_name, draft.draft_id)
+
+    assert pipeline.get_candidate("candidate-1").status == "review_pending"
+
+
 def test_pipeline_reject_blocks_publish(tmp_path: Path) -> None:
    pipeline = _pipeline(tmp_path)
    draft = pipeline.draft_service.create_new_skill_draft(
--- a/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py
@ -7,8 +7,17 @@ from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner


 class FakeAgentLoop:
+    def __init__(self) -> None:
+        self.ended_sessions: list[tuple[str, str]] = []
+
    def boot(self):
-        return SimpleNamespace(tool_executor=SimpleNamespace(), tool_registry=SimpleNamespace(get=lambda name: None))
+        return SimpleNamespace(
+            tool_executor=SimpleNamespace(),
+            tool_registry=SimpleNamespace(get=lambda name: None),
+            session_manager=SimpleNamespace(
+                end_session=lambda session_id, reason: self.ended_sessions.append((session_id, reason))
+            ),
+        )

    async def process_direct(self, task: str, **kwargs):
        executor = kwargs["tool_executor_override"]
@ -18,6 +27,7 @@ class FakeAgentLoop:

 class FakeRunningAgentLoop(FakeAgentLoop):
    def __init__(self) -> None:
+        super().__init__()
        self.process_direct_calls = 0
        self.submit_direct_calls: list[tuple[str, dict]] = []

@ -35,6 +45,29 @@ class FakeRunningAgentLoop(FakeAgentLoop):
        return SimpleNamespace(session_id="session-queued", run_id="run-queued", output_text="queued done", finish_reason="stop")


+class FakeIsolatedAgentLoop(FakeAgentLoop):
+    def __init__(self) -> None:
+        super().__init__()
+        self.closed = False
+        self.mcp_manager = SimpleNamespace(close=self._close_mcp)
+        self.mcp_closed = False
+        self.loaded = None
+
+    async def _close_mcp(self) -> None:
+        self.mcp_closed = True
+
+    def close(self) -> None:
+        assert self.mcp_closed is True
+        self.closed = True
+
+    def boot(self):
+        if self.loaded is None:
+            self.loaded = super().boot()
+            self.loaded.mcp_manager = self.mcp_manager
+            self.loaded.closeables = [("mcp_manager", lambda: None)]
+        return self.loaded
+
+
 def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
    runner = ReplayRunner(agent_loop=FakeAgentLoop())
    request = ReplayArmRequest(
@ -53,6 +86,8 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
    assert report["arm"] == "candidate"
    assert report["finish_reason"] == "stop"
    assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
+    assert report["tool_calls"][0]["duration_ms"] >= 0
+    assert runner.agent_loop.ended_sessions == [("session-replay", "evaluation_complete")]


 def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
@ -83,3 +118,31 @@ def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
    assert report["session_id"] == "session-queued"
    assert report["run_id"] == "run-queued"
    assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
+    assert agent_loop.ended_sessions == [("session-queued", "evaluation_complete")]
+
+
+def test_replay_runner_uses_and_closes_isolated_loop() -> None:
+    shared_loop = FakeRunningAgentLoop()
+    isolated_loops: list[FakeIsolatedAgentLoop] = []
+
+    def create_isolated_loop() -> FakeIsolatedAgentLoop:
+        loop = FakeIsolatedAgentLoop()
+        isolated_loops.append(loop)
+        return loop
+
+    runner = ReplayRunner(agent_loop=shared_loop, isolated_loop_factory=create_isolated_loop)
+    request = ReplayArmRequest(
+        case_id="case-isolated",
+        arm="candidate",
+        task_text="Fetch current weather.",
+        provider_bundle=object(),
+    )
+
+    report = asyncio.run(runner.run_arm(request))
+
+    assert report["session_id"] == "session-replay"
+    assert shared_loop.process_direct_calls == 0
+    assert shared_loop.submit_direct_calls == []
+    assert len(isolated_loops) == 1
+    assert isolated_loops[0].mcp_closed is True
+    assert isolated_loops[0].closed is True
--- a/app-instance/backend/tests/unit/test_skill_learning_web_api.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_web_api.py
@ -1,5 +1,7 @@
 from __future__ import annotations

+import asyncio
+import time
 from pathlib import Path
 from types import SimpleNamespace

@ -16,7 +18,7 @@ class StubEvaluator:
    def __init__(self) -> None:
        self.calls = 0

-    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
+    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None, progress_callback=None):
        self.calls += 1
        return SkillDraftEvalReport(
            report_id="eval-existing",
@ -34,6 +36,18 @@ class StubEvaluator:
        )


+class SlowEvaluator(StubEvaluator):
+    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None, progress_callback=None):
+        await asyncio.sleep(0.15)
+        return await super().evaluate(
+            candidate=candidate,
+            draft=draft,
+            provider_bundle=provider_bundle,
+            replay_runner=replay_runner,
+            progress_callback=progress_callback,
+        )
+
+
 def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
@ -193,15 +207,79 @@ def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:

    with TestClient(app) as client:
        response = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit")
+        deadline = time.monotonic() + 1
+        payload = response.json()
+        while payload["eval_report"] is None and time.monotonic() < deadline:
+            time.sleep(0.02)
+            payload = client.get(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}").json()

    assert response.status_code == 200
-    payload = response.json()
    assert evaluator.calls == 1
    assert payload["status"] == "in_review"
    assert payload["safety_report"]["passed"] is True
    assert payload["eval_report"]["report_id"] == "eval-existing"


+def test_submit_draft_returns_before_eval_and_is_idempotent(tmp_path: Path, monkeypatch) -> None:
+    service = AgentService(workspace=tmp_path)
+    loaded = service.create_loop().boot()
+    draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
+        skill_name="weather-search",
+        proposed_content="# Weather Search\n\nUse current weather sources.",
+        proposed_frontmatter={"description": "weather", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    loaded.skill_learning_store.record_learning_candidate(  # type: ignore[union-attr]
+        SkillLearningCandidate(
+            candidate_id="candidate-weather",
+            kind="revise_skill",
+            source_run_ids=["run-1"],
+            source_session_ids=["session-1"],
+            related_skill_names=["weather-search"],
+            reason="revise",
+            status="draft_ready",
+            draft_skill_name=draft.skill_name,
+            draft_id=draft.draft_id,
+        )
+    )
+    evaluator = SlowEvaluator()
+    loaded.skill_learning_pipeline.evaluator = evaluator  # type: ignore[union-attr]
+    monkeypatch.setattr(
+        service,
+        "_make_provider_bundle_for_task",
+        lambda loaded, kwargs: SimpleNamespace(main_provider=object()),
+    )
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        started = time.monotonic()
+        first = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit")
+        elapsed = time.monotonic() - started
+        second = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit")
+        deadline = time.monotonic() + 2
+        payload = second.json()
+        while payload["eval_report"] is None and time.monotonic() < deadline:
+            time.sleep(0.05)
+            payload = client.get(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}").json()
+
+    assert first.status_code == 200
+    assert elapsed < 0.12
+    assert first.json()["status"] == "in_review"
+    assert first.json()["eval_status"] == "pending"
+    assert first.json()["eval_progress"] == {
+        "phase": "preparing",
+        "completed_arms": 0,
+        "total_arms": 20,
+        "completed_cases": 0,
+        "total_cases": 10,
+    }
+    assert second.status_code == 200
+    assert evaluator.calls == 1
+    assert payload["eval_report"]["report_id"] == "eval-existing"
+    assert loaded.skill_learning_pipeline.get_candidate("candidate-weather").status == "review_pending"  # type: ignore[union-attr]
+
+
 def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
--- a/app-instance/backend/tests/unit/test_terminal_websocket_channel.py
+++ b/app-instance/backend/tests/unit/test_terminal_websocket_channel.py
@ -57,6 +57,14 @@ def write_terminal_config(tmp_path: Path) -> Path:
    return config_path


+def write_terminal_config_with_device_session(tmp_path: Path) -> Path:
+    config_path = write_terminal_config(tmp_path)
+    payload = json.loads(config_path.read_text(encoding="utf-8"))
+    payload["channels"]["terminal-dev"]["config"]["sessionPeerFromDeviceName"] = True
+    config_path.write_text(json.dumps(payload), encoding="utf-8")
+    return config_path
+
+
 def test_terminal_websocket_connect_ping_and_message_roundtrip(tmp_path: Path) -> None:
    config_path = write_terminal_config(tmp_path)
    service = TerminalFakeAgentService(config_path=config_path)
@ -117,6 +125,98 @@ def test_terminal_websocket_connect_ping_and_message_roundtrip(tmp_path: Path) -
    assert inbound.channel_identity.message_id == "device-001-000001"


+def test_terminal_websocket_can_use_device_name_as_stable_session_peer(tmp_path: Path) -> None:
+    config_path = write_terminal_config_with_device_session(tmp_path)
+    service = TerminalFakeAgentService(config_path=config_path)
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        with client.websocket_connect("/api/channels/terminal-dev/ws") as websocket:
+            websocket.send_json(
+                {
+                    "type": "connect",
+                    "peer_id": "livekit-test-livekit-07291699",
+                    "device_name": "desk-terminal",
+                }
+            )
+            first = websocket.receive_json()
+
+        with client.websocket_connect("/api/channels/terminal-dev/ws") as websocket:
+            websocket.send_json(
+                {
+                    "type": "connect",
+                    "peer_id": "livekit-test-livekit-3fb03fff",
+                    "device_name": "desk-terminal",
+                }
+            )
+            second = websocket.receive_json()
+            websocket.send_json(
+                {
+                    "type": "message",
+                    "message_id": "livekit-test-livekit-3fb03fff-000001",
+                    "text": "hello",
+                }
+            )
+            ack = websocket.receive_json()
+            reply = websocket.receive_json()
+
+    service.close()
+    assert first["session_id"] == "terminal-dev:local:device-desk-terminal"
+    assert second["session_id"] == first["session_id"]
+    assert ack["session_id"] == first["session_id"]
+    assert reply["text"] == "echo:hello"
+    assert service.inbound_calls[0].session_id == first["session_id"]
+    assert service.inbound_calls[0].channel_identity is not None
+    assert service.inbound_calls[0].channel_identity.peer_id == "device-desk-terminal"
+
+
+def test_terminal_websocket_reconnect_delivers_pending_reply_to_latest_device_connection(tmp_path: Path) -> None:
+    config_path = write_terminal_config_with_device_session(tmp_path)
+    service = TerminalFakeAgentService(config_path=config_path, delay_seconds=0.05)
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        with client.websocket_connect("/api/channels/terminal-dev/ws") as first_websocket:
+            first_websocket.send_json(
+                {
+                    "type": "connect",
+                    "peer_id": "livekit-test-livekit-old",
+                    "device_name": "desk-terminal",
+                }
+            )
+            first = first_websocket.receive_json()
+            first_websocket.send_json(
+                {
+                    "type": "message",
+                    "message_id": "livekit-test-livekit-old-000001",
+                    "text": "slow",
+                }
+            )
+            assert first_websocket.receive_json()["accepted"] is True
+
+            with client.websocket_connect("/api/channels/terminal-dev/ws") as latest_websocket:
+                latest_websocket.send_json(
+                    {
+                        "type": "connect",
+                        "peer_id": "livekit-test-livekit-new",
+                        "device_name": "desk-terminal",
+                    }
+                )
+                latest = latest_websocket.receive_json()
+                reply = latest_websocket.receive_json()
+
+    service.close()
+    assert latest["session_id"] == first["session_id"]
+    assert reply == {
+        "type": "message",
+        "role": "assistant",
+        "message_id": "livekit-test-livekit-old-000001",
+        "run_id": "run-1",
+        "text": "echo:slow",
+        "finish_reason": "stop",
+    }
+
+
 def test_terminal_websocket_rejects_message_before_connect(tmp_path: Path) -> None:
    config_path = write_terminal_config(tmp_path)
    service = TerminalFakeAgentService(config_path=config_path)
--- a/app-instance/backend/tests/unit/test_web_tools.py
+++ b/app-instance/backend/tests/unit/test_web_tools.py
@ -1,6 +1,7 @@
 from __future__ import annotations

 import asyncio
+import json

 from beaver.tools.builtins import web

@ -8,8 +9,16 @@ from beaver.tools.builtins import web
 class _FakeResponse:
    headers = {"content-type": "text/html"}
    status_code = 200
-    text = '<a class="result__a" href="https://example.com">Example</a>'
-    url = "https://example.com"
+
+    def __init__(self, url: str = "https://example.com") -> None:
+        self.url = url
+        if "duckduckgo.com" in url:
+            self.text = '<a class="result__a" href="https://duck.example.com">Duck Example</a>'
+        else:
+            self.text = (
+                '<li class="b_algo"><h2><a href="https://example.com">Example</a></h2>'
+                "<p>Example result</p></li>"
+            )

    def raise_for_status(self) -> None:
        return None
@ -17,6 +26,8 @@ class _FakeResponse:

 class _FakeAsyncClient:
    calls: list[dict[str, object]] = []
+    urls: list[str] = []
+    fail_bing = False

    def __init__(self, **kwargs: object) -> None:
        self.calls.append(kwargs)
@ -28,7 +39,11 @@ class _FakeAsyncClient:
        return None

    async def get(self, *args: object, **kwargs: object) -> _FakeResponse:
-        return _FakeResponse()
+        url = str(args[0])
+        self.urls.append(url)
+        if self.fail_bing and "bing.com" in url:
+            raise web.httpx.ConnectTimeout("bing unavailable")
+        return _FakeResponse(url)


 def test_web_tools_use_environment_proxy_settings(monkeypatch) -> None:
@ -42,3 +57,56 @@ def test_web_tools_use_environment_proxy_settings(monkeypatch) -> None:
    asyncio.run(_run())

    assert [call.get("trust_env") for call in _FakeAsyncClient.calls] == [True, True]
+
+
+def test_web_fetch_uses_short_connect_timeout(monkeypatch) -> None:
+    _FakeAsyncClient.calls = []
+    _FakeAsyncClient.urls = []
+    _FakeAsyncClient.fail_bing = False
+    monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient)
+
+    asyncio.run(web.WebFetchTool().execute(url="https://example.com"))
+
+    timeout = _FakeAsyncClient.calls[0]["timeout"]
+    assert isinstance(timeout, web.httpx.Timeout)
+    assert timeout.connect == 5
+    assert timeout.read == 12
+
+
+def test_web_search_uses_reachable_bing_endpoint_first(monkeypatch) -> None:
+    _FakeAsyncClient.calls = []
+    _FakeAsyncClient.urls = []
+    _FakeAsyncClient.fail_bing = False
+    monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient)
+
+    raw = asyncio.run(web.WebSearchTool().execute(query="weather beijing"))
+
+    payload = json.loads(raw)
+    assert payload["success"] is True
+    assert payload["engine"] in {"bing", "duckduckgo"}
+    assert set(_FakeAsyncClient.urls) == {
+        "https://www.bing.com/search?q=weather+beijing",
+        "https://duckduckgo.com/html/?q=weather+beijing",
+    }
+
+    timeout = _FakeAsyncClient.calls[0]["timeout"]
+    assert isinstance(timeout, web.httpx.Timeout)
+    assert timeout.connect == 5
+    assert timeout.read == 8
+
+
+def test_web_search_falls_back_when_bing_is_unavailable(monkeypatch) -> None:
+    _FakeAsyncClient.calls = []
+    _FakeAsyncClient.urls = []
+    _FakeAsyncClient.fail_bing = True
+    monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient)
+
+    raw = asyncio.run(web.WebSearchTool().execute(query="weather beijing"))
+
+    payload = json.loads(raw)
+    assert payload["success"] is True
+    assert payload["engine"] == "duckduckgo"
+    assert set(_FakeAsyncClient.urls) == {
+        "https://www.bing.com/search?q=weather+beijing",
+        "https://duckduckgo.com/html/?q=weather+beijing",
+    }