```
feat(engine): 优化智能体循环中的助手消息处理逻辑
- 在没有工具调用时才添加助手消息到上下文
- 确保工具调用响应正确添加到消息上下文中
- 修复了消息构建的条件逻辑

fix(cron): 改进定时任务调度的时间解析功能
- 添加正则表达式导入用于时间显示解析
- 实现从显示文本中提取毫秒间隔的功能
- 增强整数转换的安全性,避免类型错误
- 优化定时任务配置的解析逻辑

feat(outlook): 增强Outlook集成的功能和稳定性
- 将默认超时时间从10秒增加到180秒
- 为状态检查函数添加可选的验证参数
- 串行执行邮件概览获取操作而非并行
- 改进连接状态验证逻辑

feat(channel): 添加设备名称作为会话标识的选项
- 为终端WebSocket适配器添加新的配置选项
- 实现基于设备名称生成会话对等ID的功能
- 记录原始对等ID和设备名称的元数据
- 支持从设备名称创建会话对等ID

feat(skills): 完善技能学习评估系统和进度跟踪
- 在应用启动时自动调度待评估的技能草稿
- 为技能评估工作创建独立的循环工厂
- 实现异步技能评估任务的取消和清理机制
- 添加技能评估进度报告和状态跟踪功能
- 扩展会话列表API以包含更多详细信息
- 防止对不存在的会话进行操作
- 优化技能草稿提交和评估的业务逻辑

perf(skills): 提升技能评估的并发性能
- 实现并行技能案例评估以提高效率
- 添加最大并行案例数的环境变量控制
- 实现实时评估进度更新和回调机制
- 优化评估过程中的资源管理和同步

refactor(services): 创建隔离的智能体循环实例
- 添加创建独立智能体循环的工厂方法
- 确保新循环继承运行时服务配置
- 支持技能评估等需要隔离环境的场景
```
This commit is contained in:
@ -29,6 +29,18 @@ def test_schedule_from_frontend_payload() -> None:
|
||||
assert cron.kind == "cron"
|
||||
|
||||
|
||||
def test_legacy_interval_schedule_recovers_duration_from_display() -> None:
    """An ``every`` payload with ``every_ms=None`` must still yield a 30-minute interval.

    The only duration information in the payload is the ``display`` text
    ``"every 1800s"``.
    """
    legacy_payload = {
        "kind": "every",
        "every_ms": None,
        "display": "every 1800s",
    }
    thirty_minutes_ms = 30 * 60 * 1000

    schedule = CronSchedule.from_dict(legacy_payload)

    assert schedule.every_ms == thirty_minutes_ms
|
||||
|
||||
|
||||
def test_compute_next_run_skips_missed_interval() -> None:
|
||||
schedule = CronSchedule(kind="every", every_ms=60_000)
|
||||
assert compute_next_run(schedule, now_ms=1_000_000, last_run_at_ms=0) > 1_000_000
|
||||
@ -80,6 +92,22 @@ def test_manual_run_records_scheduled_run_output(tmp_path) -> None:
|
||||
assert updated.to_api_dict()["last_scheduled_run_id"] == run.scheduled_run_id
|
||||
|
||||
|
||||
def test_persisted_interval_job_keeps_schedule_and_next_run(tmp_path) -> None:
    """A job added through one ``CronService`` reads back unchanged through another.

    Both services point at the same ``jobs.json`` path; the reloaded job must
    keep its interval and its ``next_run_at_ms``.
    """
    half_hour_ms = 30 * 60 * 1000
    jobs_file = tmp_path / "jobs.json"

    writer = CronService(jobs_file)
    created = writer.add_job(
        name="Hydration reminder",
        message="Drink water",
        schedule=CronSchedule(kind="every", every_ms=half_hour_ms),
    )

    # A second service instance built on the same store path.
    restored = CronService(jobs_file).get_job(created.id)

    assert restored is not None
    assert restored.schedule.every_ms == half_hour_ms
    assert restored.next_run_at_ms == created.next_run_at_ms
|
||||
|
||||
|
||||
def test_cron_tool_uses_runtime_service(tmp_path) -> None:
|
||||
service = CronService(tmp_path / "jobs.json")
|
||||
tool = CronTool()
|
||||
|
||||
71
app-instance/backend/tests/unit/test_outlook_integration.py
Normal file
71
app-instance/backend/tests/unit/test_outlook_integration.py
Normal file
@ -0,0 +1,71 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from beaver.foundation.config.schema import AuthzConfig, BackendIdentityConfig, BeaverConfig
|
||||
from beaver.integrations import outlook
|
||||
|
||||
|
||||
class _FakeAuthzClient:
    """Stand-in authz client that returns canned Outlook settings."""

    async def get_outlook_settings(self, backend_id: str) -> dict:
        """Return a fixed "configured" settings payload.

        Asserts that the lookup is made for backend ``"steven"``.
        """
        assert backend_id == "steven"
        canned_settings: dict = dict(
            configured=True,
            email="steven.yx.li@boardware.com",
            server="mail.boardware.com.mo",
        )
        return canned_settings
|
||||
|
||||
|
||||
def _authz_config() -> BeaverConfig:
    """Build a ``BeaverConfig`` with authz enabled and backend identity ``steven``."""
    authz_section = AuthzConfig(
        enabled=True,
        base_url="http://authz.example",
        outlook_mcp_url="http://outlook-mcp.example/mcp",
    )
    identity_section = BackendIdentityConfig(
        backend_id="steven",
        client_id="steven",
        client_secret="secret",
    )
    return BeaverConfig(authz=authz_section, backend_identity=identity_section)
|
||||
|
||||
|
||||
def test_outlook_status_does_not_probe_mcp_by_default(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None:
    """``outlook_status`` called with only config and path must not invoke the MCP tool.

    The MCP call helper is replaced by a coroutine that fails the test if it
    is ever awaited.
    """

    async def _mcp_must_not_run(*_args, **_kwargs):
        raise AssertionError("status should not call Outlook MCP by default")

    monkeypatch.setattr(outlook, "_authz_client", lambda _config: _FakeAuthzClient())
    monkeypatch.setattr(outlook, "_call_outlook_mcp_tool", _mcp_must_not_run)

    status = asyncio.run(outlook.outlook_status(_authz_config(), tmp_path))

    # Configured according to the fake authz client, but no connection was probed.
    assert status["configured"] is True
    assert status["connected"] is False
    assert status["auth_status"] is None
    assert status["error"] is None
|
||||
|
||||
|
||||
def test_outlook_overview_loads_sections_serially(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None:
    """``get_overview`` must issue its three MCP calls one at a time.

    The fake MCP call records each tool name and tracks how many calls are in
    flight at once; the peak must stay at one.
    """
    in_flight = {"now": 0, "peak": 0}
    requested_tools: list[str] = []

    async def _recording_call(_config, tool_name: str, _arguments, **_kwargs):
        requested_tools.append(tool_name)
        in_flight["now"] += 1
        in_flight["peak"] = max(in_flight["peak"], in_flight["now"])
        # Yield to the event loop so an overlapping call would be counted.
        await asyncio.sleep(0.01)
        in_flight["now"] -= 1
        return {"value": []}

    monkeypatch.setattr(outlook, "_authz_client", lambda _config: _FakeAuthzClient())
    monkeypatch.setattr(outlook, "_call_outlook_mcp_tool", _recording_call)

    overview = asyncio.run(outlook.get_overview(_authz_config(), tmp_path))

    assert overview["warnings"] == []
    assert requested_tools == ["mail_list_messages", "mail_list_messages", "calendar_list_events"]
    assert in_flight["peak"] == 1
|
||||
@ -27,6 +27,7 @@ class StubProvider(LLMProvider):
|
||||
def __init__(self, responses: list[LLMResponse]) -> None:
|
||||
super().__init__()
|
||||
self._responses = list(responses)
|
||||
self.calls: list[dict] = []
|
||||
|
||||
async def chat(
|
||||
self,
|
||||
@ -37,6 +38,16 @@ class StubProvider(LLMProvider):
|
||||
temperature: float = 0.7,
|
||||
thinking_enabled: bool | None = None,
|
||||
) -> LLMResponse:
|
||||
self.calls.append(
|
||||
{
|
||||
"messages": messages,
|
||||
"tools": tools,
|
||||
"model": model,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature,
|
||||
"thinking_enabled": thinking_enabled,
|
||||
}
|
||||
)
|
||||
if not self._responses:
|
||||
raise AssertionError("No stubbed provider responses left")
|
||||
return self._responses.pop(0)
|
||||
@ -704,32 +715,33 @@ def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path:
|
||||
skill_assembler=StubSkillAssembler([skill]),
|
||||
)
|
||||
loop = AgentLoop(loader=loader)
|
||||
provider = StubProvider(
|
||||
[
|
||||
LLMResponse(
|
||||
content="Need a tool.",
|
||||
finish_reason="tool_calls",
|
||||
tool_calls=[_tool_call()],
|
||||
provider_name="stub",
|
||||
model="stub-model",
|
||||
),
|
||||
LLMResponse(
|
||||
content="Need another tool.",
|
||||
finish_reason="tool_calls",
|
||||
tool_calls=[_tool_call(call_id="call-2")],
|
||||
provider_name="stub",
|
||||
model="stub-model",
|
||||
),
|
||||
LLMResponse(
|
||||
content="Based on the available tool result, the container likely failed during startup.",
|
||||
finish_reason="stop",
|
||||
provider_name="stub",
|
||||
model="stub-model",
|
||||
),
|
||||
]
|
||||
)
|
||||
bundle = ProviderBundle(
|
||||
main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
|
||||
main_provider=StubProvider(
|
||||
[
|
||||
LLMResponse(
|
||||
content="Need a tool.",
|
||||
finish_reason="tool_calls",
|
||||
tool_calls=[_tool_call()],
|
||||
provider_name="stub",
|
||||
model="stub-model",
|
||||
),
|
||||
LLMResponse(
|
||||
content="Need another tool.",
|
||||
finish_reason="tool_calls",
|
||||
tool_calls=[_tool_call(call_id="call-2")],
|
||||
provider_name="stub",
|
||||
model="stub-model",
|
||||
),
|
||||
LLMResponse(
|
||||
content="Based on the available tool result, the container likely failed during startup.",
|
||||
finish_reason="stop",
|
||||
provider_name="stub",
|
||||
model="stub-model",
|
||||
),
|
||||
]
|
||||
),
|
||||
main_provider=provider,
|
||||
)
|
||||
|
||||
result = asyncio.run(
|
||||
@ -744,6 +756,21 @@ def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path:
|
||||
assert result.finish_reason == "max_tool_iterations_finalized"
|
||||
assert "Based on the available tool result" in result.output_text
|
||||
assert "Tool loop stopped" not in result.output_text
|
||||
finalization_messages = provider.calls[-1]["messages"]
|
||||
assistant_tool_call_ids = [
|
||||
call["id"]
|
||||
for message in finalization_messages
|
||||
for call in message.get("tool_calls", [])
|
||||
if message.get("role") == "assistant"
|
||||
]
|
||||
tool_result_ids = [
|
||||
message.get("tool_call_id")
|
||||
for message in finalization_messages
|
||||
if message.get("role") == "tool"
|
||||
]
|
||||
assert "call-1" in assistant_tool_call_ids
|
||||
assert "call-2" not in assistant_tool_call_ids
|
||||
assert set(assistant_tool_call_ids).issubset(set(tool_result_ids))
|
||||
effect_records = loaded.run_memory_store.list_skill_effects("docker-debug", version="v0007")
|
||||
assert effect_records[-1].run_id == result.run_id
|
||||
assert effect_records[-1].success is False
|
||||
|
||||
@ -105,3 +105,29 @@ def test_web_archive_route_does_not_create_archive_suffix_session(tmp_path: Path
|
||||
assert loaded.session_manager.get_session("web:alpha")["end_reason"] == "archived" # type: ignore[union-attr]
|
||||
assert loaded.session_manager.get_session("web:alpha/archive") is None # type: ignore[union-attr]
|
||||
assert sessions_response.json() == []
|
||||
|
||||
|
||||
def test_web_session_list_hides_skill_replay_evaluation_sessions(tmp_path: Path) -> None:
    """``GET /api/sessions`` lists the ``web`` session but not the ``skill_replay_eval`` one."""
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    seeded_sessions = (
        ("eval-session", "skill_replay_eval"),
        ("web:visible", "web"),
    )
    for session_key, session_source in seeded_sessions:
        loaded.session_manager.ensure_session(session_key, source=session_source)  # type: ignore[union-attr]

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.get("/api/sessions")

    assert response.status_code == 200
    listed_keys = [item["key"] for item in response.json()]
    assert listed_keys == ["web:visible"]
|
||||
|
||||
|
||||
def test_get_missing_session_returns_404_without_creating_it(tmp_path: Path) -> None:
    """Fetching an unknown session answers 404 and leaves no session record behind."""
    missing_key = "missing-session"
    service = AgentService(workspace=tmp_path)

    with TestClient(create_app(service=service, manage_service_lifecycle=False)) as client:
        response = client.get(f"/api/sessions/{missing_key}")

    assert response.status_code == 404

    # Look the key up directly to confirm the GET did not create it.
    session_manager = service.create_loop().boot().session_manager
    assert session_manager.get_session(missing_key) is None  # type: ignore[union-attr]
|
||||
|
||||
@ -201,6 +201,22 @@ class FakeReplayRunner:
|
||||
}
|
||||
|
||||
|
||||
class ConcurrentReplayRunner(FakeReplayRunner):
    """Replay runner that records how many ``run_arm`` calls overlap.

    ``active`` is the number of calls currently in flight; ``max_active`` is
    the highest value ``active`` has reached.
    """

    def __init__(self) -> None:
        super().__init__()
        self.active = 0
        self.max_active = 0

    async def run_arm(self, request):
        """Run one arm through the parent runner after a short pause.

        The pause gives other scheduled calls a chance to start, so that
        overlapping calls show up in ``max_active``.
        """
        self.active += 1
        self.max_active = max(self.max_active, self.active)
        try:
            # The pause is inside the ``try`` so that a cancellation raised
            # while sleeping still releases the in-flight slot.
            await asyncio.sleep(0.02)
            return await super().run_arm(request)
        finally:
            self.active -= 1
|
||||
|
||||
|
||||
def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
|
||||
pipeline = _pipeline(tmp_path)
|
||||
draft = pipeline.draft_service.create_new_skill_draft(
|
||||
@ -238,6 +254,94 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
|
||||
assert report.tool_execution_summary["score_role"] == "diagnostic_only"
|
||||
|
||||
|
||||
def test_replay_eval_reports_arm_progress(tmp_path: Path) -> None:
    """``evaluate_draft`` reports progress from 0/20 arms to 20/20 arms.

    Every value passed to ``progress_callback`` is collected; the first and
    last entries are compared against the expected ``replaying`` snapshots.
    """

    def _replaying(arms_done: int, cases_done: int) -> dict:
        return {
            "phase": "replaying",
            "completed_arms": arms_done,
            "total_arms": 20,
            "completed_cases": cases_done,
            "total_cases": 10,
        }

    pipeline = _pipeline(tmp_path)
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    snapshots: list[dict] = []

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=FakeReplayRunner(),
        progress_callback=snapshots.append,
    )
    asyncio.run(evaluation)

    assert snapshots[0] == _replaying(0, 0)
    assert snapshots[-1] == _replaying(20, 10)
|
||||
|
||||
|
||||
def test_replay_eval_runs_cases_with_bounded_parallelism(tmp_path: Path) -> None:
    """With ``max_parallel_cases=2`` at most two arms overlap and case order is kept.

    The report must list ``run-1`` followed by the nine synthetic cases
    ``01`` to ``09`` in ascending order.
    """
    pipeline = _pipeline(tmp_path)
    pipeline.evaluator = SkillDraftEvaluator(
        pipeline.learning_service.run_store,
        max_parallel_cases=2,
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    replay_runner = ConcurrentReplayRunner()
    expected_run_ids = [
        "run-1",
        *(f"synthetic:candidate-1:{index:02d}" for index in range(1, 10)),
    ]

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=replay_runner,
    )
    report = asyncio.run(evaluation)

    assert replay_runner.max_active == 2
    assert [case["run_id"] for case in report.cases] == expected_run_ids
|
||||
|
||||
|
||||
def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
|
||||
pipeline = _pipeline(tmp_path)
|
||||
pipeline.learning_store.update_learning_candidate(
|
||||
|
||||
@ -98,6 +98,27 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
|
||||
|
||||
def test_safety_recheck_keeps_submitted_candidate_in_review(tmp_path: Path) -> None:
    """Running ``check_safety`` again after ``submit_review`` leaves the candidate ``review_pending``."""
    pipeline = _pipeline(tmp_path)
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="reviewed-skill",
        proposed_content="# Reviewed Skill\n\nDo the thing.",
        proposed_frontmatter={"description": "reviewed"},
        created_by="test",
        reason="test",
    )
    skill_name, draft_id = draft.skill_name, draft.draft_id

    # Point the existing candidate at the new draft and store it.
    linked = pipeline.get_candidate("candidate-1")
    linked.draft_skill_name = skill_name
    linked.draft_id = draft_id
    pipeline.learning_store.record_learning_candidate(linked)

    pipeline.check_safety(skill_name, draft_id)
    pipeline.submit_review(skill_name, draft_id, requested_by="tester")
    pipeline.check_safety(skill_name, draft_id)  # the re-check under test

    assert pipeline.get_candidate("candidate-1").status == "review_pending"
|
||||
|
||||
|
||||
def test_pipeline_reject_blocks_publish(tmp_path: Path) -> None:
|
||||
pipeline = _pipeline(tmp_path)
|
||||
draft = pipeline.draft_service.create_new_skill_draft(
|
||||
|
||||
@ -7,8 +7,17 @@ from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
|
||||
|
||||
|
||||
class FakeAgentLoop:
|
||||
def __init__(self) -> None:
|
||||
self.ended_sessions: list[tuple[str, str]] = []
|
||||
|
||||
def boot(self):
|
||||
return SimpleNamespace(tool_executor=SimpleNamespace(), tool_registry=SimpleNamespace(get=lambda name: None))
|
||||
return SimpleNamespace(
|
||||
tool_executor=SimpleNamespace(),
|
||||
tool_registry=SimpleNamespace(get=lambda name: None),
|
||||
session_manager=SimpleNamespace(
|
||||
end_session=lambda session_id, reason: self.ended_sessions.append((session_id, reason))
|
||||
),
|
||||
)
|
||||
|
||||
async def process_direct(self, task: str, **kwargs):
|
||||
executor = kwargs["tool_executor_override"]
|
||||
@ -18,6 +27,7 @@ class FakeAgentLoop:
|
||||
|
||||
class FakeRunningAgentLoop(FakeAgentLoop):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.process_direct_calls = 0
|
||||
self.submit_direct_calls: list[tuple[str, dict]] = []
|
||||
|
||||
@ -35,6 +45,29 @@ class FakeRunningAgentLoop(FakeAgentLoop):
|
||||
return SimpleNamespace(session_id="session-queued", run_id="run-queued", output_text="queued done", finish_reason="stop")
|
||||
|
||||
|
||||
class FakeIsolatedAgentLoop(FakeAgentLoop):
    """Fake agent loop that records whether its MCP manager and itself were closed.

    ``close`` asserts that the MCP manager's async ``close`` has already run.
    """

    def __init__(self) -> None:
        super().__init__()
        self.loaded = None
        self.closed = False
        self.mcp_closed = False
        self.mcp_manager = SimpleNamespace(close=self._close_mcp)

    async def _close_mcp(self) -> None:
        """Record that the MCP manager was closed."""
        self.mcp_closed = True

    def close(self) -> None:
        """Mark the loop closed; the MCP manager must have been closed first."""
        assert self.mcp_closed is True
        self.closed = True

    def boot(self):
        """Return the booted namespace, building and caching it on first use."""
        if self.loaded is not None:
            return self.loaded
        booted = super().boot()
        booted.mcp_manager = self.mcp_manager
        booted.closeables = [("mcp_manager", lambda: None)]
        self.loaded = booted
        return booted
|
||||
|
||||
|
||||
def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
|
||||
runner = ReplayRunner(agent_loop=FakeAgentLoop())
|
||||
request = ReplayArmRequest(
|
||||
@ -53,6 +86,8 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
|
||||
assert report["arm"] == "candidate"
|
||||
assert report["finish_reason"] == "stop"
|
||||
assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
|
||||
assert report["tool_calls"][0]["duration_ms"] >= 0
|
||||
assert runner.agent_loop.ended_sessions == [("session-replay", "evaluation_complete")]
|
||||
|
||||
|
||||
def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
|
||||
@ -83,3 +118,31 @@ def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
|
||||
assert report["session_id"] == "session-queued"
|
||||
assert report["run_id"] == "run-queued"
|
||||
assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
|
||||
assert agent_loop.ended_sessions == [("session-queued", "evaluation_complete")]
|
||||
|
||||
|
||||
def test_replay_runner_uses_and_closes_isolated_loop() -> None:
    """With an ``isolated_loop_factory`` the arm runs on a new loop that is then closed.

    The shared loop must receive no direct or queued work; exactly one
    isolated loop is created, and both it and its MCP manager end up closed.
    """
    shared_loop = FakeRunningAgentLoop()
    created_loops: list[FakeIsolatedAgentLoop] = []

    def _factory() -> FakeIsolatedAgentLoop:
        fresh = FakeIsolatedAgentLoop()
        created_loops.append(fresh)
        return fresh

    runner = ReplayRunner(agent_loop=shared_loop, isolated_loop_factory=_factory)
    arm_request = ReplayArmRequest(
        case_id="case-isolated",
        arm="candidate",
        task_text="Fetch current weather.",
        provider_bundle=object(),
    )

    report = asyncio.run(runner.run_arm(arm_request))

    assert report["session_id"] == "session-replay"
    # The shared loop must stay untouched.
    assert shared_loop.process_direct_calls == 0
    assert shared_loop.submit_direct_calls == []
    assert len(created_loops) == 1
    assert created_loops[0].mcp_closed is True
    assert created_loops[0].closed is True
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
@ -16,7 +18,7 @@ class StubEvaluator:
|
||||
def __init__(self) -> None:
|
||||
self.calls = 0
|
||||
|
||||
async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
|
||||
async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None, progress_callback=None):
|
||||
self.calls += 1
|
||||
return SkillDraftEvalReport(
|
||||
report_id="eval-existing",
|
||||
@ -34,6 +36,18 @@ class StubEvaluator:
|
||||
)
|
||||
|
||||
|
||||
class SlowEvaluator(StubEvaluator):
    """Evaluator stub that waits 0.15 seconds before delegating to the parent stub."""

    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None, progress_callback=None):
        """Sleep briefly, then return whatever the parent ``evaluate`` returns."""
        forwarded = dict(
            candidate=candidate,
            draft=draft,
            provider_bundle=provider_bundle,
            replay_runner=replay_runner,
            progress_callback=progress_callback,
        )
        await asyncio.sleep(0.15)
        return await super().evaluate(**forwarded)
|
||||
|
||||
|
||||
def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
|
||||
service = AgentService(workspace=tmp_path)
|
||||
loaded = service.create_loop().boot()
|
||||
@ -193,15 +207,79 @@ def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:
|
||||
|
||||
with TestClient(app) as client:
|
||||
response = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit")
|
||||
deadline = time.monotonic() + 1
|
||||
payload = response.json()
|
||||
while payload["eval_report"] is None and time.monotonic() < deadline:
|
||||
time.sleep(0.02)
|
||||
payload = client.get(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}").json()
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert evaluator.calls == 1
|
||||
assert payload["status"] == "in_review"
|
||||
assert payload["safety_report"]["passed"] is True
|
||||
assert payload["eval_report"]["report_id"] == "eval-existing"
|
||||
|
||||
|
||||
def test_submit_draft_returns_before_eval_and_is_idempotent(tmp_path: Path, monkeypatch) -> None:
    """Submitting a draft answers before evaluation ends, and a repeat submit does not re-evaluate.

    The evaluator sleeps 0.15 seconds, so the first submit must return in under
    0.12 seconds with ``eval_status`` ``pending``. After polling for the report,
    the evaluator must have been called exactly once.
    """
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="weather-search",
        proposed_content="# Weather Search\n\nUse current weather sources.",
        proposed_frontmatter={"description": "weather", "tools": []},
        created_by="test",
        reason="test",
    )
    candidate = SkillLearningCandidate(
        candidate_id="candidate-weather",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["weather-search"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(candidate)  # type: ignore[union-attr]

    evaluator = SlowEvaluator()
    loaded.skill_learning_pipeline.evaluator = evaluator  # type: ignore[union-attr]
    monkeypatch.setattr(
        service,
        "_make_provider_bundle_for_task",
        lambda loaded, kwargs: SimpleNamespace(main_provider=object()),
    )
    app = create_app(service=service, manage_service_lifecycle=False)

    draft_url = f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}"
    submit_url = f"{draft_url}/submit"
    initial_progress = {
        "phase": "preparing",
        "completed_arms": 0,
        "total_arms": 20,
        "completed_cases": 0,
        "total_cases": 10,
    }

    with TestClient(app) as client:
        started = time.monotonic()
        first = client.post(submit_url)
        elapsed = time.monotonic() - started
        second = client.post(submit_url)

        # Poll the draft until the evaluation report appears or 2 seconds pass.
        deadline = time.monotonic() + 2
        payload = second.json()
        while payload["eval_report"] is None and time.monotonic() < deadline:
            time.sleep(0.05)
            payload = client.get(draft_url).json()

    assert first.status_code == 200
    assert elapsed < 0.12
    first_body = first.json()
    assert first_body["status"] == "in_review"
    assert first_body["eval_status"] == "pending"
    assert first_body["eval_progress"] == initial_progress
    assert second.status_code == 200
    assert evaluator.calls == 1
    assert payload["eval_report"]["report_id"] == "eval-existing"
    final_candidate = loaded.skill_learning_pipeline.get_candidate("candidate-weather")  # type: ignore[union-attr]
    assert final_candidate.status == "review_pending"
|
||||
|
||||
|
||||
def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
|
||||
service = AgentService(workspace=tmp_path)
|
||||
loaded = service.create_loop().boot()
|
||||
|
||||
@ -57,6 +57,14 @@ def write_terminal_config(tmp_path: Path) -> Path:
|
||||
return config_path
|
||||
|
||||
|
||||
def write_terminal_config_with_device_session(tmp_path: Path) -> Path:
    """Write the terminal config, then set ``sessionPeerFromDeviceName`` to true.

    The flag is set on the ``terminal-dev`` channel's ``config`` section and
    the file is rewritten in place. Returns the config file path.
    """
    config_path = write_terminal_config(tmp_path)
    document = json.loads(config_path.read_text(encoding="utf-8"))
    terminal_options = document["channels"]["terminal-dev"]["config"]
    terminal_options["sessionPeerFromDeviceName"] = True
    config_path.write_text(json.dumps(document), encoding="utf-8")
    return config_path
|
||||
|
||||
|
||||
def test_terminal_websocket_connect_ping_and_message_roundtrip(tmp_path: Path) -> None:
|
||||
config_path = write_terminal_config(tmp_path)
|
||||
service = TerminalFakeAgentService(config_path=config_path)
|
||||
@ -117,6 +125,98 @@ def test_terminal_websocket_connect_ping_and_message_roundtrip(tmp_path: Path) -
|
||||
assert inbound.channel_identity.message_id == "device-001-000001"
|
||||
|
||||
|
||||
def test_terminal_websocket_can_use_device_name_as_stable_session_peer(tmp_path: Path) -> None:
    """Two connections with different ``peer_id`` but the same ``device_name`` share one session.

    The session id and the inbound channel identity are both expected to be
    based on ``device-desk-terminal``, not on either ``peer_id``.
    """
    ws_path = "/api/channels/terminal-dev/ws"

    def _connect_frame(peer_id: str) -> dict:
        return {
            "type": "connect",
            "peer_id": peer_id,
            "device_name": "desk-terminal",
        }

    service = TerminalFakeAgentService(config_path=write_terminal_config_with_device_session(tmp_path))
    app = create_app(service=service, manage_service_lifecycle=False)

    with TestClient(app) as client:
        with client.websocket_connect(ws_path) as websocket:
            websocket.send_json(_connect_frame("livekit-test-livekit-07291699"))
            first = websocket.receive_json()

        with client.websocket_connect(ws_path) as websocket:
            websocket.send_json(_connect_frame("livekit-test-livekit-3fb03fff"))
            second = websocket.receive_json()
            websocket.send_json(
                {
                    "type": "message",
                    "message_id": "livekit-test-livekit-3fb03fff-000001",
                    "text": "hello",
                }
            )
            ack = websocket.receive_json()
            reply = websocket.receive_json()

    service.close()
    shared_session_id = first["session_id"]
    assert shared_session_id == "terminal-dev:local:device-desk-terminal"
    assert second["session_id"] == shared_session_id
    assert ack["session_id"] == shared_session_id
    assert reply["text"] == "echo:hello"
    assert service.inbound_calls[0].session_id == shared_session_id
    assert service.inbound_calls[0].channel_identity is not None
    assert service.inbound_calls[0].channel_identity.peer_id == "device-desk-terminal"
|
||||
|
||||
|
||||
def test_terminal_websocket_reconnect_delivers_pending_reply_to_latest_device_connection(tmp_path: Path) -> None:
    """A reply still pending when the device reconnects arrives on the new connection.

    The first connection sends a message to a service configured with
    ``delay_seconds=0.05`` and closes after the ack; the second connection for
    the same ``device_name`` must receive the assistant reply.
    """
    ws_path = "/api/channels/terminal-dev/ws"

    def _connect_frame(peer_id: str) -> dict:
        return {
            "type": "connect",
            "peer_id": peer_id,
            "device_name": "desk-terminal",
        }

    expected_reply = {
        "type": "message",
        "role": "assistant",
        "message_id": "livekit-test-livekit-old-000001",
        "run_id": "run-1",
        "text": "echo:slow",
        "finish_reason": "stop",
    }
    service = TerminalFakeAgentService(
        config_path=write_terminal_config_with_device_session(tmp_path),
        delay_seconds=0.05,
    )
    app = create_app(service=service, manage_service_lifecycle=False)

    with TestClient(app) as client:
        with client.websocket_connect(ws_path) as first_websocket:
            first_websocket.send_json(_connect_frame("livekit-test-livekit-old"))
            first = first_websocket.receive_json()
            first_websocket.send_json(
                {
                    "type": "message",
                    "message_id": "livekit-test-livekit-old-000001",
                    "text": "slow",
                }
            )
            ack = first_websocket.receive_json()
            assert ack["accepted"] is True

        with client.websocket_connect(ws_path) as latest_websocket:
            latest_websocket.send_json(_connect_frame("livekit-test-livekit-new"))
            latest = latest_websocket.receive_json()
            reply = latest_websocket.receive_json()

    service.close()
    assert latest["session_id"] == first["session_id"]
    assert reply == expected_reply
|
||||
|
||||
|
||||
def test_terminal_websocket_rejects_message_before_connect(tmp_path: Path) -> None:
|
||||
config_path = write_terminal_config(tmp_path)
|
||||
service = TerminalFakeAgentService(config_path=config_path)
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from beaver.tools.builtins import web
|
||||
|
||||
@ -8,8 +9,16 @@ from beaver.tools.builtins import web
|
||||
class _FakeResponse:
|
||||
headers = {"content-type": "text/html"}
|
||||
status_code = 200
|
||||
text = '<a class="result__a" href="https://example.com">Example</a>'
|
||||
url = "https://example.com"
|
||||
|
||||
def __init__(self, url: str = "https://example.com") -> None:
|
||||
self.url = url
|
||||
if "duckduckgo.com" in url:
|
||||
self.text = '<a class="result__a" href="https://duck.example.com">Duck Example</a>'
|
||||
else:
|
||||
self.text = (
|
||||
'<li class="b_algo"><h2><a href="https://example.com">Example</a></h2>'
|
||||
"<p>Example result</p></li>"
|
||||
)
|
||||
|
||||
def raise_for_status(self) -> None:
|
||||
return None
|
||||
@ -17,6 +26,8 @@ class _FakeResponse:
|
||||
|
||||
class _FakeAsyncClient:
|
||||
calls: list[dict[str, object]] = []
|
||||
urls: list[str] = []
|
||||
fail_bing = False
|
||||
|
||||
def __init__(self, **kwargs: object) -> None:
|
||||
self.calls.append(kwargs)
|
||||
@ -28,7 +39,11 @@ class _FakeAsyncClient:
|
||||
return None
|
||||
|
||||
async def get(self, *args: object, **kwargs: object) -> _FakeResponse:
|
||||
return _FakeResponse()
|
||||
url = str(args[0])
|
||||
self.urls.append(url)
|
||||
if self.fail_bing and "bing.com" in url:
|
||||
raise web.httpx.ConnectTimeout("bing unavailable")
|
||||
return _FakeResponse(url)
|
||||
|
||||
|
||||
def test_web_tools_use_environment_proxy_settings(monkeypatch) -> None:
|
||||
@ -42,3 +57,56 @@ def test_web_tools_use_environment_proxy_settings(monkeypatch) -> None:
|
||||
asyncio.run(_run())
|
||||
|
||||
assert [call.get("trust_env") for call in _FakeAsyncClient.calls] == [True, True]
|
||||
|
||||
|
||||
def test_web_fetch_uses_short_connect_timeout(monkeypatch) -> None:
    """``WebFetchTool`` must build its client with an ``httpx.Timeout`` of connect=5, read=12."""
    # Reset the class-level recorders on the fake client.
    _FakeAsyncClient.fail_bing = False
    _FakeAsyncClient.urls = []
    _FakeAsyncClient.calls = []
    monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient)

    fetch = web.WebFetchTool().execute(url="https://example.com")
    asyncio.run(fetch)

    first_client_kwargs = _FakeAsyncClient.calls[0]
    timeout = first_client_kwargs["timeout"]
    assert isinstance(timeout, web.httpx.Timeout)
    assert timeout.connect == 5
    assert timeout.read == 12
|
||||
|
||||
|
||||
def test_web_search_uses_reachable_bing_endpoint_first(monkeypatch) -> None:
    """With Bing reachable, the search succeeds and both engine URLs are requested.

    Also checks that the first client is built with an ``httpx.Timeout`` of
    connect=5 and read=8.
    """
    # Reset the class-level recorders on the fake client.
    _FakeAsyncClient.fail_bing = False
    _FakeAsyncClient.urls = []
    _FakeAsyncClient.calls = []
    monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient)
    expected_urls = {
        "https://www.bing.com/search?q=weather+beijing",
        "https://duckduckgo.com/html/?q=weather+beijing",
    }

    search = web.WebSearchTool().execute(query="weather beijing")
    payload = json.loads(asyncio.run(search))

    assert payload["success"] is True
    assert payload["engine"] in {"bing", "duckduckgo"}
    assert set(_FakeAsyncClient.urls) == expected_urls

    timeout = _FakeAsyncClient.calls[0]["timeout"]
    assert isinstance(timeout, web.httpx.Timeout)
    assert timeout.connect == 5
    assert timeout.read == 8
|
||||
|
||||
|
||||
def test_web_search_falls_back_when_bing_is_unavailable(monkeypatch) -> None:
    """When the Bing request raises, the search still succeeds using DuckDuckGo."""
    # Reset the recorders and make the fake client fail for bing.com URLs.
    _FakeAsyncClient.fail_bing = True
    _FakeAsyncClient.urls = []
    _FakeAsyncClient.calls = []
    monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient)
    expected_urls = {
        "https://www.bing.com/search?q=weather+beijing",
        "https://duckduckgo.com/html/?q=weather+beijing",
    }

    search = web.WebSearchTool().execute(query="weather beijing")
    payload = json.loads(asyncio.run(search))

    assert payload["success"] is True
    assert payload["engine"] == "duckduckgo"
    assert set(_FakeAsyncClient.urls) == expected_urls
|
||||
|
||||
Reference in New Issue
Block a user