From 4b0bf65acefa7ce4e59c51d37b732ebbfcee1e37 Mon Sep 17 00:00:00 2001 From: steven_li Date: Mon, 15 Jun 2026 14:48:16 +0800 Subject: [PATCH] =?UTF-8?q?```=20feat(engine):=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E6=99=BA=E8=83=BD=E4=BD=93=E5=BE=AA=E7=8E=AF=E4=B8=AD=E7=9A=84?= =?UTF-8?q?=E5=8A=A9=E6=89=8B=E6=B6=88=E6=81=AF=E5=A4=84=E7=90=86=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在没有工具调用时才添加助手消息到上下文 - 确保工具调用响应正确添加到消息上下文中 - 修复了消息构建的条件逻辑 fix(cron): 改进定时任务调度的时间解析功能 - 添加正则表达式导入用于时间显示解析 - 实现从显示文本中提取毫秒间隔的功能 - 增强整数转换的安全性,避免类型错误 - 优化定时任务配置的解析逻辑 feat(outlook): 增强Outlook集成的功能和稳定性 - 将默认超时时间从10秒增加到180秒 - 为状态检查函数添加可选的验证参数 - 串行执行邮件概览获取操作而非并行 - 改进连接状态验证逻辑 feat(channel): 添加设备名称作为会话标识的选项 - 为终端WebSocket适配器添加新的配置选项 - 实现基于设备名称生成会话对等ID的功能 - 记录原始对等ID和设备名称的元数据 - 支持从设备名称创建会话对等ID feat(skills): 完善技能学习评估系统和进度跟踪 - 在应用启动时自动调度待评估的技能草稿 - 为技能评估工作创建独立的循环工厂 - 实现异步技能评估任务的取消和清理机制 - 添加技能评估进度报告和状态跟踪功能 - 扩展会话列表API以包含更多详细信息 - 防止对不存在的会话进行操作 - 优化技能草稿提交和评估的业务逻辑 perf(skills): 提升技能评估的并发性能 - 实现并行技能案例评估以提高效率 - 添加最大并行案例数的环境变量控制 - 实现实时评估进度更新和回调机制 - 优化评估过程中的资源管理和同步 refactor(services): 创建隔离的智能体循环实例 - 添加创建独立智能体循环的工厂方法 - 确保新循环继承运行时服务配置 - 支持技能评估等需要隔离环境的场景 ``` --- app-instance/backend/beaver/engine/loop.py | 18 +- .../backend/beaver/foundation/models/cron.py | 27 +- .../beaver/integrations/outlook/__init__.py | 66 +- .../beaver/interfaces/channels/runtime.py | 4 + .../interfaces/channels/terminal_websocket.py | 24 +- .../backend/beaver/interfaces/web/app.py | 153 +- .../backend/beaver/memory/skills/models.py | 3 + .../backend/beaver/services/agent_service.py | 5 + .../backend/beaver/skills/learning/eval.py | 241 ++- .../beaver/skills/learning/pipeline.py | 66 +- .../backend/beaver/skills/learning/replay.py | 72 +- .../backend/beaver/tools/builtins/web.py | 121 +- .../backend/tests/unit/test_cron_service.py | 28 + .../tests/unit/test_outlook_integration.py | 71 + .../tests/unit/test_phase5_skills_runtime.py | 75 +- 
.../tests/unit/test_session_archive.py | 26 + .../tests/unit/test_skill_learning_eval.py | 104 + .../unit/test_skill_learning_pipeline.py | 21 + .../unit/test_skill_learning_replay_runner.py | 65 +- .../tests/unit/test_skill_learning_web_api.py | 82 +- .../unit/test_terminal_websocket_channel.py | 100 + .../backend/tests/unit/test_web_tools.py | 74 +- .../frontend/app/(app)/notifications/page.tsx | 8 +- .../frontend/app/(app)/outlook/page.tsx | 75 +- app-instance/frontend/app/(app)/page.tsx | 25 +- .../frontend/app/(app)/skills/page.tsx | 69 +- app-instance/frontend/lib/api.ts | 5 +- .../frontend/lib/notification-runtime.test.ts | 28 + .../frontend/lib/notification-runtime.ts | 12 + .../lib/outlook-counts-visibility.test.ts | 16 + .../frontend/lib/outlook-page-state.test.ts | 29 + .../frontend/lib/outlook-page-state.ts | 20 + app-instance/frontend/types/index.ts | 12 + .../beaver-management-demo.md | 435 ++++ .../upload-files/README.md | 24 + .../upload-files/customer-feedback-q2.md | 37 + .../upload-files/meeting-notes.md | 39 + .../upload-files/project-risks.md | 57 + .../upload-files/project-status.md | 77 + .../upload-files/sales-weekly.csv | 9 + .../upload-files/support-tickets.csv | 11 + .../upload-files/weekly-ops-metrics.csv | 11 + .../assets/animations/animations.css | 0 .../assets/base.css | 0 .../assets/fonts.css | 0 .../assets/runtime.js | 0 .../index.html | 0 .../style.css | 2 +- docs/product-discovery/beaver/README.md | 2 +- .../skill-replay-eval/README.md | 2 +- .../product-discovery-report.md | 2 +- .../2026-06-15-plugin-skill-mirroring.md | 1758 +++++++++++++++++ ...026-06-15-plugin-skill-mirroring-design.md | 409 ++++ 53 files changed, 4328 insertions(+), 292 deletions(-) create mode 100644 app-instance/backend/tests/unit/test_outlook_integration.py create mode 100644 app-instance/frontend/lib/notification-runtime.test.ts create mode 100644 app-instance/frontend/lib/notification-runtime.ts create mode 100644 
app-instance/frontend/lib/outlook-counts-visibility.test.ts create mode 100644 app-instance/frontend/lib/outlook-page-state.test.ts create mode 100644 app-instance/frontend/lib/outlook-page-state.ts create mode 100644 docs/presentations/beaver-management-demo/beaver-management-demo.md create mode 100644 docs/presentations/beaver-management-demo/upload-files/README.md create mode 100644 docs/presentations/beaver-management-demo/upload-files/customer-feedback-q2.md create mode 100644 docs/presentations/beaver-management-demo/upload-files/meeting-notes.md create mode 100644 docs/presentations/beaver-management-demo/upload-files/project-risks.md create mode 100644 docs/presentations/beaver-management-demo/upload-files/project-status.md create mode 100644 docs/presentations/beaver-management-demo/upload-files/sales-weekly.csv create mode 100644 docs/presentations/beaver-management-demo/upload-files/support-tickets.csv create mode 100644 docs/presentations/beaver-management-demo/upload-files/weekly-ops-metrics.csv rename docs/presentations/{skill-replay-eval => beaver-project}/assets/animations/animations.css (100%) rename docs/presentations/{skill-replay-eval => beaver-project}/assets/base.css (100%) rename docs/presentations/{skill-replay-eval => beaver-project}/assets/fonts.css (100%) rename docs/presentations/{skill-replay-eval => beaver-project}/assets/runtime.js (100%) rename docs/presentations/{skill-replay-eval => beaver-project}/index.html (100%) rename docs/presentations/{skill-replay-eval => beaver-project}/style.css (99%) create mode 100644 docs/superpowers/plans/2026-06-15-plugin-skill-mirroring.md create mode 100644 docs/superpowers/specs/2026-06-15-plugin-skill-mirroring-design.md diff --git a/app-instance/backend/beaver/engine/loop.py b/app-instance/backend/beaver/engine/loop.py index a1a98c2..588421c 100644 --- a/app-instance/backend/beaver/engine/loop.py +++ b/app-instance/backend/beaver/engine/loop.py @@ -749,14 +749,12 @@ class AgentLoop: 
model=final_model, user_id=user_id, ) - context_builder.add_assistant_message( - messages, - content=response.content, - tool_calls=assistant_tool_calls or None, - reasoning_content=response.reasoning_content, - ) - if not response.has_tool_calls: + context_builder.add_assistant_message( + messages, + content=response.content, + reasoning_content=response.reasoning_content, + ) final_text = response.content or "" if self._looks_like_raw_tool_call(final_text): final_text = RAW_TOOL_CALL_FALLBACK @@ -795,6 +793,12 @@ class AgentLoop: ) break + context_builder.add_assistant_message( + messages, + content=response.content, + tool_calls=assistant_tool_calls or None, + reasoning_content=response.reasoning_content, + ) iterations += 1 for tool_call in response.tool_calls: result = await effective_tool_executor.execute_tool_call(tool_call, context=tool_context) diff --git a/app-instance/backend/beaver/foundation/models/cron.py b/app-instance/backend/beaver/foundation/models/cron.py index 4b9cf5d..6f54ef4 100644 --- a/app-instance/backend/beaver/foundation/models/cron.py +++ b/app-instance/backend/beaver/foundation/models/cron.py @@ -6,6 +6,7 @@ normal Task instead of a detached agent turn. 
from __future__ import annotations +import re from dataclasses import dataclass, field from typing import Any, Literal from uuid import uuid4 @@ -37,13 +38,18 @@ class CronSchedule: @classmethod def from_dict(cls, payload: dict[str, Any]) -> "CronSchedule": + kind = str(payload.get("kind") or "every") + display = _optional_str(payload.get("display")) + every_ms = _optional_int(payload.get("every_ms") or payload.get("everyMs")) + if kind == "every" and every_ms is None: + every_ms = _every_ms_from_display(display) return cls( - kind=str(payload.get("kind") or "every"), # type: ignore[arg-type] + kind=kind, # type: ignore[arg-type] at_ms=_optional_int(payload.get("at_ms") or payload.get("atMs")), - every_ms=_optional_int(payload.get("every_ms") or payload.get("everyMs")), + every_ms=every_ms, expr=_optional_str(payload.get("expr")), tz=_optional_str(payload.get("tz")), - display=_optional_str(payload.get("display")), + display=display, ) @@ -250,6 +256,17 @@ def _optional_str(value: Any) -> str | None: def _optional_int(value: Any) -> int | None: if value in (None, ""): return None + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _every_ms_from_display(display: str | None) -> int | None: + match = re.fullmatch(r"every\s+(\d+)s", (display or "").strip(), re.IGNORECASE) + if match is None: + return None + return int(match.group(1)) * 1000 def _payload_mode(value: Any, *, default: CronPayloadMode = "notification") -> CronPayloadMode: @@ -259,7 +276,3 @@ def _payload_mode(value: Any, *, default: CronPayloadMode = "notification") -> C if cleaned == "task": return "task" return "notification" - try: - return int(value) - except (TypeError, ValueError): - return None diff --git a/app-instance/backend/beaver/integrations/outlook/__init__.py b/app-instance/backend/beaver/integrations/outlook/__init__.py index 8c2b6ca..1f05d75 100644 --- a/app-instance/backend/beaver/integrations/outlook/__init__.py +++ 
b/app-instance/backend/beaver/integrations/outlook/__init__.py @@ -73,9 +73,9 @@ OUTLOOK_TOOL_NAMES = [ def _call_timeout_seconds() -> float: raw = os.getenv("BEAVER_OUTLOOK_MCP_CALL_TIMEOUT_SECONDS", "").strip() try: - return max(1.0, float(raw)) if raw else 10.0 + return max(1.0, float(raw)) if raw else 180.0 except ValueError: - return 10.0 + return 180.0 def _use_authz_mode(config: BeaverConfig) -> bool: @@ -340,7 +340,7 @@ async def disconnect_workspace(config: BeaverConfig) -> dict[str, Any]: return {"ok": True, "removed_state": removed, "removed_mcp": False, "server_id": OUTLOOK_SERVER_ID} -async def outlook_status(config: BeaverConfig, workspace: Path) -> dict[str, Any]: +async def outlook_status(config: BeaverConfig, workspace: Path, *, verify: bool = False) -> dict[str, Any]: meta = _load_meta(workspace) if not _use_authz_mode(config): return { @@ -364,7 +364,7 @@ async def outlook_status(config: BeaverConfig, workspace: Path) -> dict[str, Any connected = False auth_status: dict[str, Any] | None = None error: str | None = None - if configured: + if configured and verify: try: auth_status = await _call_outlook_mcp_tool(config, "auth_status", {}, scopes=["list_tools", "tool:auth_status"]) connected = bool(auth_status.get("authenticated")) @@ -403,38 +403,36 @@ async def get_overview(config: BeaverConfig, workspace: Path) -> dict[str, Any]: warnings.append(f"{label} unavailable: {exc}") return {"value": []} - inbox, sent, calendar = await asyncio.gather( - _load_section( - "inbox", - _call_outlook_mcp_tool( - config, - "mail_list_messages", - {"folder": "inbox", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0}, - scopes=["list_tools", "tool:mail_list_messages"], - ), + inbox = await _load_section( + "inbox", + _call_outlook_mcp_tool( + config, + "mail_list_messages", + {"folder": "inbox", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0}, + scopes=["list_tools", "tool:mail_list_messages"], ), - _load_section( - "sent items", - _call_outlook_mcp_tool( - 
config, - "mail_list_messages", - {"folder": "sentitems", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0}, - scopes=["list_tools", "tool:mail_list_messages"], - ), + ) + sent = await _load_section( + "sent items", + _call_outlook_mcp_tool( + config, + "mail_list_messages", + {"folder": "sentitems", "top": OUTLOOK_OVERVIEW_MESSAGE_LIMIT, "skip": 0}, + scopes=["list_tools", "tool:mail_list_messages"], ), - _load_section( - "calendar", - _call_outlook_mcp_tool( - config, - "calendar_list_events", - { - "start_time": start_of_day.isoformat(), - "end_time": end_of_day.isoformat(), - "top": OUTLOOK_OVERVIEW_EVENT_LIMIT, - "skip": 0, - }, - scopes=["list_tools", "tool:calendar_list_events"], - ), + ) + calendar = await _load_section( + "calendar", + _call_outlook_mcp_tool( + config, + "calendar_list_events", + { + "start_time": start_of_day.isoformat(), + "end_time": end_of_day.isoformat(), + "top": OUTLOOK_OVERVIEW_EVENT_LIMIT, + "skip": 0, + }, + scopes=["list_tools", "tool:calendar_list_events"], ), ) meta = _update_meta(workspace, last_overview_refresh_at=datetime.now().isoformat()) diff --git a/app-instance/backend/beaver/interfaces/channels/runtime.py b/app-instance/backend/beaver/interfaces/channels/runtime.py index f55e910..c2b81e0 100644 --- a/app-instance/backend/beaver/interfaces/channels/runtime.py +++ b/app-instance/backend/beaver/interfaces/channels/runtime.py @@ -331,6 +331,10 @@ class ChannelRuntime: event_recorder=self.record_event, heartbeat_seconds=float(cfg.config.get("heartbeat_seconds") or 30), max_message_chars=int(cfg.config.get("max_message_chars") or 20000), + session_peer_from_device_name=bool( + cfg.config.get("session_peer_from_device_name") + or cfg.config.get("sessionPeerFromDeviceName") + ), ) if cfg.kind == "telegram" and cfg.mode in {"polling", "webhook"}: diff --git a/app-instance/backend/beaver/interfaces/channels/terminal_websocket.py b/app-instance/backend/beaver/interfaces/channels/terminal_websocket.py index e13a846..f56fa92 
100644 --- a/app-instance/backend/beaver/interfaces/channels/terminal_websocket.py +++ b/app-instance/backend/beaver/interfaces/channels/terminal_websocket.py @@ -51,6 +51,7 @@ class TerminalWebSocketAdapter: event_recorder: Callable[..., None] | None = None, heartbeat_seconds: float = 30, max_message_chars: int = 20000, + session_peer_from_device_name: bool = False, ) -> None: self.channel_id = channel_id self.kind = kind @@ -61,6 +62,7 @@ class TerminalWebSocketAdapter: self.event_recorder = event_recorder self.heartbeat_seconds = max(1.0, float(heartbeat_seconds)) self.max_message_chars = max(1, int(max_message_chars)) + self.session_peer_from_device_name = bool(session_peer_from_device_name) self.started = False self._connections_by_session: dict[str, TerminalConnection] = {} self._session_by_peer: dict[str, str] = {} @@ -131,14 +133,15 @@ class TerminalWebSocketAdapter: *, current: TerminalConnection | None, ) -> TerminalConnection | None: - peer_id = _clean(payload.get("peer_id")) - if not peer_id: + raw_peer_id = _clean(payload.get("peer_id")) + if not raw_peer_id: await websocket.send_json({"type": "error", "error": "peer_id is required"}) return current thread_id = _clean(payload.get("thread_id")) or None user_id = _clean(payload.get("user_id")) or None device_name = _clean(payload.get("device_name")) + peer_id = self._session_peer_id(raw_peer_id, device_name) capabilities = [str(item) for item in payload.get("capabilities") or [] if item is not None] identity = ChannelIdentity( channel_id=self.channel_id, @@ -171,7 +174,12 @@ class TerminalWebSocketAdapter: self._record( kind="terminal_connected", session_id=session_id, - metadata={"peer_id": peer_id, "device_name": device_name, "capabilities": capabilities}, + metadata={ + "peer_id": peer_id, + "raw_peer_id": raw_peer_id, + "device_name": device_name, + "capabilities": capabilities, + }, ) await websocket.send_json( { @@ -299,3 +307,13 @@ class TerminalWebSocketAdapter: error=error, metadata=metadata, ) 
+ + def _session_peer_id(self, peer_id: str, device_name: str) -> str: + if self.session_peer_from_device_name and device_name: + return f"device-{_clean_session_part(device_name)}" + return peer_id + + +def _clean_session_part(value: str) -> str: + cleaned = "-".join(str(value or "").strip().split()) + return cleaned.replace(":", "_") or "unknown" diff --git a/app-instance/backend/beaver/interfaces/web/app.py b/app-instance/backend/beaver/interfaces/web/app.py index 945ffda..a3a6ac1 100644 --- a/app-instance/backend/beaver/interfaces/web/app.py +++ b/app-instance/backend/beaver/interfaces/web/app.py @@ -264,6 +264,25 @@ async def _app_lifespan( ) app.state.channel_runtime = channel_runtime await channel_runtime.start() + for candidate in loaded.skill_learning_pipeline.list_candidates(status="review_pending"): # type: ignore[union-attr] + skill_name = candidate.draft_skill_name + draft_id = candidate.draft_id + if not skill_name or not draft_id: + continue + if loaded.skill_learning_pipeline.get_eval_report(skill_name, draft_id) is not None: # type: ignore[union-attr] + continue + draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr] + if draft.status != "in_review": + continue + _schedule_skill_draft_eval( + app, + agent_service=attached_service, + loop=attached_service.create_loop(), + loaded=loaded, + candidate_id=candidate.candidate_id, + skill_name=skill_name, + draft_id=draft_id, + ) except BaseException: if owns_service and started: with suppress(BaseException): @@ -280,7 +299,10 @@ async def _app_lifespan( worker = SkillLearningWorker( pipeline=loaded.skill_learning_pipeline, # type: ignore[arg-type] provider_bundle_factory=lambda: attached_service._make_provider_bundle_for_task(loaded, {}), # noqa: SLF001 - replay_runner_factory=lambda: ReplayRunner(agent_loop=attached_service.create_loop()), + replay_runner_factory=lambda: ReplayRunner( + agent_loop=attached_service.create_loop(), + 
isolated_loop_factory=attached_service.create_isolated_loop, + ), config=worker_config, ) worker_task = asyncio.create_task(worker.run_forever()) @@ -289,6 +311,13 @@ async def _app_lifespan( try: yield finally: + skill_eval_tasks = getattr(app.state, "skill_eval_tasks", {}) + for task in list(skill_eval_tasks.values()): + task.cancel() + for task in list(skill_eval_tasks.values()): + with suppress(BaseException): + await task + skill_eval_tasks.clear() runtime = getattr(app.state, "channel_runtime", None) if isinstance(runtime, ChannelRuntime): with suppress(BaseException): @@ -587,6 +616,7 @@ def create_app( ) app.state.auth_tokens = {} app.state.handoff_codes = {} + app.state.skill_eval_tasks = {} app.state.auth_file = Path(os.getenv("BEAVER_AUTH_FILE") or "") max_file_size = 50 * 1024 * 1024 max_user_file_upload_size = _int_env("BEAVER_USER_FILES_MAX_UPLOAD_BYTES", 5 * 1024 * 1024 * 1024) @@ -1250,7 +1280,7 @@ def create_app( session_manager = loaded.session_manager rows = session_manager.list_sessions_rich( limit=100, - exclude_sources=["subagent", "notification"], + exclude_sources=["subagent", "notification", "skill_replay_eval"], exclude_end_reasons=["archived", "deleted"], ) # type: ignore[union-attr] return [ @@ -1259,6 +1289,9 @@ def create_app( "created_at": _iso_from_timestamp(row.get("started_at")), "updated_at": _iso_from_timestamp(row.get("last_active")), "path": str(row.get("id")), + "source": row.get("source"), + "title": row.get("title"), + "preview": row.get("preview"), } for row in rows ] @@ -1337,7 +1370,9 @@ def create_app( async def get_session(session_id: str, request: Request) -> dict[str, Any]: loaded = get_agent_service(request).create_loop().boot() session_manager = loaded.session_manager - session = session_manager.get_or_create(session_id, source="web") # type: ignore[union-attr] + session = session_manager.get_session(session_id) # type: ignore[union-attr] + if session is None: + raise HTTPException(status_code=404, detail="Session 
not found") return _session_detail(session_manager, session_id, session) # type: ignore[arg-type] @app.delete("/api/sessions/{session_id:path}") @@ -2216,21 +2251,33 @@ def create_app( try: safety = loaded.skill_learning_pipeline.check_safety(skill_name, draft_id) # type: ignore[union-attr] if safety.passed and safety.risk_level != "critical": - loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr] - skill_name, - draft_id, - requested_by=str((payload or {}).get("requested_by") or "web"), - notes=str((payload or {}).get("notes") or ""), - ) - candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id) - if candidate_id is not None: - provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001 - await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr] - candidate_id, + draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr] + if draft.status == "draft": + loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr] skill_name, draft_id, - provider_bundle=provider_bundle, - replay_runner=ReplayRunner(agent_loop=loop), + requested_by=str((payload or {}).get("requested_by") or "web"), + notes=str((payload or {}).get("notes") or ""), + ) + elif draft.status not in {"in_review", "approved"}: + raise ValueError("Draft cannot be submitted from its current status") + candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id) + eval_report = loaded.skill_learning_pipeline.get_eval_report(skill_name, draft_id) # type: ignore[union-attr] + if candidate_id is not None and eval_report is None: + loaded.skill_learning_store.transition_learning_candidate( # type: ignore[union-attr] + candidate_id, + "review_pending", + event_type="eval_queued", + last_error=None, + ) + _schedule_skill_draft_eval( + app, + agent_service=agent_service, + loop=loop, + loaded=loaded, + candidate_id=candidate_id, + 
skill_name=skill_name, + draft_id=draft_id, ) except ValueError as exc: raise _skill_draft_http_error(exc) from exc @@ -3810,14 +3857,88 @@ def _skill_learning_candidate_task_text(loaded: Any, candidate: Any) -> str: return str(evidence.get("task_text") or "").strip() +def _schedule_skill_draft_eval( + app: FastAPI, + *, + agent_service: AgentService, + loop: Any, + loaded: Any, + candidate_id: str, + skill_name: str, + draft_id: str, +) -> None: + key = f"{skill_name}:{draft_id}" + tasks: dict[str, asyncio.Task[None]] = app.state.skill_eval_tasks + current = tasks.get(key) + if current is not None and not current.done(): + return + + loaded.skill_learning_pipeline.mark_eval_progress( # type: ignore[union-attr] + candidate_id, + { + "phase": "preparing", + "completed_arms": 0, + "total_arms": 20, + "completed_cases": 0, + "total_cases": 10, + }, + ) + + async def run_eval() -> None: + try: + provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001 + await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr] + candidate_id, + skill_name, + draft_id, + provider_bundle=provider_bundle, + replay_runner=ReplayRunner( + agent_loop=loop, + isolated_loop_factory=agent_service.create_isolated_loop, + ), + progress_callback=lambda progress: loaded.skill_learning_pipeline.mark_eval_progress( # type: ignore[union-attr] + candidate_id, + progress, + ), + ) + except asyncio.CancelledError: + raise + except Exception as exc: + loaded.skill_learning_pipeline.mark_eval_failed(candidate_id, str(exc)) # type: ignore[union-attr] + + task = asyncio.create_task(run_eval()) + tasks[key] = task + + def remove_completed(completed: asyncio.Task[None]) -> None: + if tasks.get(key) is completed: + tasks.pop(key, None) + + task.add_done_callback(remove_completed) + + def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include_reviews: bool = False) -> dict[str, Any]: draft = 
loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr] safety = loaded.skill_learning_pipeline.get_safety_report(skill_name, draft_id) # type: ignore[union-attr] eval_report = loaded.skill_learning_pipeline.get_eval_report(skill_name, draft_id) # type: ignore[union-attr] + candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id) + candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) if candidate_id is not None else None # type: ignore[union-attr] + if eval_report is not None: + eval_status = eval_report.status + elif candidate is None: + eval_status = "not_applicable" + elif candidate.status == "eval_failed": + eval_status = "failed" + elif draft.status in {"in_review", "approved"}: + eval_status = "pending" + else: + eval_status = "not_started" payload = { **draft.to_dict(), "safety_report": safety.to_dict() if safety is not None else None, "eval_report": eval_report.to_dict() if eval_report is not None else None, + "eval_status": eval_status, + "eval_error": candidate.last_error if candidate is not None and candidate.status == "eval_failed" else None, + "eval_progress": dict(candidate.eval_progress) if candidate is not None else None, "target_version": _skill_draft_target_version(loaded, draft.skill_name, draft.proposal_kind), "base_skill": _skill_draft_base_skill_payload(loaded, draft), } diff --git a/app-instance/backend/beaver/memory/skills/models.py b/app-instance/backend/beaver/memory/skills/models.py index 8b98167..cdeac7e 100644 --- a/app-instance/backend/beaver/memory/skills/models.py +++ b/app-instance/backend/beaver/memory/skills/models.py @@ -82,6 +82,7 @@ class SkillLearningCandidate: draft_id: str | None = None safety_report_id: str | None = None eval_report_id: str | None = None + eval_progress: dict[str, Any] = field(default_factory=dict) created_at: str = "" updated_at: str = "" @@ -107,6 +108,7 @@ class SkillLearningCandidate: "draft_id": self.draft_id, 
"safety_report_id": self.safety_report_id, "eval_report_id": self.eval_report_id, + "eval_progress": dict(self.eval_progress), "created_at": self.created_at, "updated_at": self.updated_at, } @@ -137,6 +139,7 @@ class SkillLearningCandidate: draft_id=_optional_str(payload.get("draft_id")), safety_report_id=_optional_str(payload.get("safety_report_id")), eval_report_id=_optional_str(payload.get("eval_report_id")), + eval_progress=dict(payload.get("eval_progress") or {}), created_at=str(payload.get("created_at") or now), updated_at=str(payload.get("updated_at") or payload.get("created_at") or now), ) diff --git a/app-instance/backend/beaver/services/agent_service.py b/app-instance/backend/beaver/services/agent_service.py index 3cefd0d..904ccc2 100644 --- a/app-instance/backend/beaver/services/agent_service.py +++ b/app-instance/backend/beaver/services/agent_service.py @@ -91,6 +91,11 @@ class AgentService: self._loop.boot() return self._loop + def create_isolated_loop(self) -> AgentLoop: + loop = AgentLoop(profile=self.profile, loader=self.loader) + loop.runtime_services.update(self._runtime_services) + return loop + def register_runtime_service(self, name: str, service: Any) -> None: """Expose process-level services to tools during agent runs.""" diff --git a/app-instance/backend/beaver/skills/learning/eval.py b/app-instance/backend/beaver/skills/learning/eval.py index 2bcd42f..db49016 100644 --- a/app-instance/backend/beaver/skills/learning/eval.py +++ b/app-instance/backend/beaver/skills/learning/eval.py @@ -2,8 +2,10 @@ from __future__ import annotations +import asyncio import json -from typing import Any +import os +from typing import Any, Callable from uuid import uuid4 from beaver.engine.context import SkillContext @@ -25,9 +27,17 @@ class SkillDraftEvaluator: run_store: RunMemoryStore, *, surrogate_evaluator: SurrogateToolEvaluator | None = None, + max_parallel_cases: int | None = None, ) -> None: self.run_store = run_store self.surrogate_evaluator = 
surrogate_evaluator or SurrogateToolEvaluator() + configured_parallelism = max_parallel_cases + if configured_parallelism is None: + try: + configured_parallelism = int(os.getenv("BEAVER_SKILL_EVAL_MAX_PARALLEL_CASES", "3") or "3") + except ValueError: + configured_parallelism = 3 + self.max_parallel_cases = max(1, configured_parallelism) async def evaluate( self, @@ -36,6 +46,7 @@ class SkillDraftEvaluator: draft: SkillDraft, provider_bundle: ProviderBundle | None, replay_runner: ReplayRunner | None = None, + progress_callback: Callable[[dict[str, Any]], None] | None = None, ) -> SkillDraftEvalReport: if provider_bundle is None or provider_bundle.main_provider is None: return self._skipped(candidate, draft) @@ -59,6 +70,7 @@ class SkillDraftEvaluator: provider_bundle=provider_bundle, replay_runner=replay_runner, case_selection_meta=case_selection_meta, + progress_callback=progress_callback, ) return self._evaluate_heuristic(candidate, draft, runs) @@ -129,96 +141,72 @@ class SkillDraftEvaluator: provider_bundle: ProviderBundle, replay_runner: ReplayRunner, case_selection_meta: dict[str, Any] | None = None, + progress_callback: Callable[[dict[str, Any]], None] | None = None, ) -> SkillDraftEvalReport: - case_reports: list[dict] = [] - legacy_cases: list[dict] = [] - for case in replay_cases: - baseline = await replay_runner.run_arm( - ReplayArmRequest( - case_id=f"{case['run_id']}:baseline", - arm="baseline", - task_text=str(case["task_text"]), - pinned_skill_names=list(case.get("baseline_skill_names") or []), - pinned_skill_contexts=[], - provider_bundle=provider_bundle, - model_settings={"max_tool_iterations": 4, "temperature": 0.0}, + total_cases = len(replay_cases) + total_arms = total_cases * 2 + completed_arms = 0 + completed_cases = 0 + progress_lock = asyncio.Lock() + semaphore = asyncio.Semaphore(self.max_parallel_cases) + _report_progress( + progress_callback, + completed_arms=completed_arms, + total_arms=total_arms, + completed_cases=0, + 
total_cases=total_cases, + ) + + async def mark_progress(*, case_completed: bool) -> None: + nonlocal completed_arms, completed_cases + async with progress_lock: + completed_arms += 1 + if case_completed: + completed_cases += 1 + _report_progress( + progress_callback, + completed_arms=completed_arms, + total_arms=total_arms, + completed_cases=completed_cases, + total_cases=total_cases, ) - ) - candidate_arm = await replay_runner.run_arm( - ReplayArmRequest( - case_id=f"{case['run_id']}:candidate", - arm="candidate", - task_text=str(case["task_text"]), - pinned_skill_names=[], - pinned_skill_contexts=[_draft_skill_context(draft)], - provider_bundle=provider_bundle, - model_settings={"max_tool_iterations": 4, "temperature": 0.0}, + + async def evaluate_case(case: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]: + async with semaphore: + baseline = await replay_runner.run_arm( + ReplayArmRequest( + case_id=f"{case['run_id']}:baseline", + arm="baseline", + task_text=str(case["task_text"]), + pinned_skill_names=list(case.get("baseline_skill_names") or []), + pinned_skill_contexts=[], + provider_bundle=provider_bundle, + model_settings={"max_tool_iterations": 4, "temperature": 0.0}, + ) ) - ) - surrogate = await self.surrogate_evaluator.evaluate( - task_text=str(case["task_text"]), - baseline=baseline, - candidate=candidate_arm, - ) - baseline_ability = _ability_score( - case=case, - arm=baseline, - arm_name="baseline", - ) - candidate_ability = _ability_score( - case=case, - arm=candidate_arm, - arm_name="candidate", - ) - baseline_score = baseline_ability["final_score"] - candidate_score = candidate_ability["final_score"] - tool_execution_score = { - "baseline_score": surrogate["baseline_score"], - "candidate_score": surrogate["candidate_score"], - "delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4), - "score_role": "diagnostic_only", - } - case_report = { - "run_id": case["run_id"], - "task_id": case.get("task_id"), - "session_id": 
case.get("session_id"), - "task_text": case.get("task_text"), - "synthetic": bool(case.get("synthetic")), - "tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"), - "validator": case.get("validator"), - "baseline": baseline, - "candidate": candidate_arm, - "baseline_score": baseline_score, - "candidate_score": candidate_score, - "delta": round(candidate_score - baseline_score, 4), - "ability_score": { - "baseline": baseline_ability, - "candidate": candidate_ability, - "delta": round(candidate_score - baseline_score, 4), - }, - "tool_execution_score": tool_execution_score, - "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"), - "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"), - "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"), - "confidence": surrogate["confidence"], - "tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])], - "artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])], - "side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])], - "validator_notes": list(surrogate.get("notes") or []), - } - case_reports.append(case_report) - legacy_cases.append( - { - "run_id": case["run_id"], - "session_id": case.get("session_id") or "", - "task_text": case.get("task_text") or "", - "synthetic": bool(case.get("synthetic")), - "tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"), - "baseline_score": baseline_score, - "candidate_score": candidate_score, - "delta": round(candidate_score - baseline_score, 4), - } - ) + await mark_progress(case_completed=False) + candidate_arm = await replay_runner.run_arm( + ReplayArmRequest( + case_id=f"{case['run_id']}:candidate", + arm="candidate", + task_text=str(case["task_text"]), + pinned_skill_names=[], + pinned_skill_contexts=[_draft_skill_context(draft)], + provider_bundle=provider_bundle, + 
model_settings={"max_tool_iterations": 4, "temperature": 0.0}, + ) + ) + await mark_progress(case_completed=True) + surrogate = await self.surrogate_evaluator.evaluate( + task_text=str(case["task_text"]), + baseline=baseline, + candidate=candidate_arm, + ) + return _build_replay_case_reports(case, baseline, candidate_arm, surrogate) + + results = await asyncio.gather(*(evaluate_case(case) for case in replay_cases)) + case_reports = [case_report for case_report, _ in results] + legacy_cases = [legacy_case for _, legacy_case in results] preservation_report = _preservation_report(candidate, draft) return _report_from_case_reports( candidate, @@ -248,6 +236,83 @@ class SkillDraftEvaluator: ) +def _build_replay_case_reports( + case: dict[str, Any], + baseline: dict[str, Any], + candidate_arm: dict[str, Any], + surrogate: dict[str, Any], +) -> tuple[dict[str, Any], dict[str, Any]]: + baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline") + candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate") + baseline_score = baseline_ability["final_score"] + candidate_score = candidate_ability["final_score"] + tier = case.get("tier") or ("bronze" if case.get("synthetic") else "gold") + case_report = { + "run_id": case["run_id"], + "task_id": case.get("task_id"), + "session_id": case.get("session_id"), + "task_text": case.get("task_text"), + "synthetic": bool(case.get("synthetic")), + "tier": tier, + "validator": case.get("validator"), + "baseline": baseline, + "candidate": candidate_arm, + "baseline_score": baseline_score, + "candidate_score": candidate_score, + "delta": round(candidate_score - baseline_score, 4), + "ability_score": { + "baseline": baseline_ability, + "candidate": candidate_ability, + "delta": round(candidate_score - baseline_score, 4), + }, + "tool_execution_score": { + "baseline_score": surrogate["baseline_score"], + "candidate_score": surrogate["candidate_score"], + "delta": 
round(surrogate["candidate_score"] - surrogate["baseline_score"], 4), + "score_role": "diagnostic_only", + }, + "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"), + "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"), + "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"), + "confidence": surrogate["confidence"], + "tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])], + "artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])], + "side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])], + "validator_notes": list(surrogate.get("notes") or []), + } + return case_report, { + "run_id": case["run_id"], + "session_id": case.get("session_id") or "", + "task_text": case.get("task_text") or "", + "synthetic": bool(case.get("synthetic")), + "tier": tier, + "baseline_score": baseline_score, + "candidate_score": candidate_score, + "delta": round(candidate_score - baseline_score, 4), + } + + +def _report_progress( + callback: Callable[[dict[str, Any]], None] | None, + *, + completed_arms: int, + total_arms: int, + completed_cases: int, + total_cases: int, +) -> None: + if callback is None: + return + callback( + { + "phase": "replaying", + "completed_arms": completed_arms, + "total_arms": total_arms, + "completed_cases": completed_cases, + "total_cases": total_cases, + } + ) + + def _score_from_validation(validation: dict | None, success: bool) -> float: if isinstance(validation, dict) and "score" in validation: try: diff --git a/app-instance/backend/beaver/skills/learning/pipeline.py b/app-instance/backend/beaver/skills/learning/pipeline.py index 283b36a..b7e38ce 100644 --- a/app-instance/backend/beaver/skills/learning/pipeline.py +++ b/app-instance/backend/beaver/skills/learning/pipeline.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Any +from typing import Any, Callable from 
beaver.engine.providers import ProviderBundle from beaver.memory.skills import SkillDraftEvalReport, SkillDraftSafetyReport, SkillLearningCandidate, SkillLearningStore @@ -174,12 +174,20 @@ class SkillLearningPipelineService: safety = self.get_safety_report(skill_name, draft_id) if safety is not None and (not safety.passed or safety.risk_level == "critical"): raise ValueError("Draft cannot enter review because safety check failed") - return self.review_service.submit_for_review( + review = self.review_service.submit_for_review( skill_name, draft_id, reviewer_request=notes, requested_by=requested_by, ) + self._mark_candidate_by_draft( + skill_name, + draft_id, + "review_pending", + "review_submitted", + last_error=None, + ) + return review def approve( self, @@ -258,9 +266,13 @@ class SkillLearningPipelineService: draft = self.get_draft(skill_name, draft_id) report = self.safety_checker.check(draft) self.learning_store.write_safety_report(report) - status = "safety_failed" if not report.passed or report.risk_level == "critical" else "draft_ready" + status = ( + "safety_failed" + if not report.passed or report.risk_level == "critical" + else self._candidate_status_for_draft(draft) + ) current = self._candidate_by_draft(skill_name, draft_id) - if current is not None and current.status == "eval_failed" and status == "draft_ready": + if current is not None and current.status == "eval_failed" and status != "safety_failed": status = "eval_failed" self._mark_candidate_by_draft( skill_name, @@ -287,6 +299,7 @@ class SkillLearningPipelineService: *, provider_bundle: ProviderBundle | None, replay_runner: ReplayRunner | None = None, + progress_callback: Callable[[dict[str, Any]], None] | None = None, ) -> SkillDraftEvalReport: draft = self.get_draft(skill_name, draft_id) candidate = self.get_candidate(candidate_id) @@ -296,13 +309,14 @@ class SkillLearningPipelineService: draft=draft, provider_bundle=provider_bundle, replay_runner=replay_runner, + 
progress_callback=progress_callback, ) self.learning_store.write_eval_report(report) if report.status == "skipped_provider_unavailable": - status = "draft_ready" + status = self._candidate_status_for_draft(draft) error = "eval skipped: provider unavailable" elif report.passed: - status = "draft_ready" + status = self._candidate_status_for_draft(draft) error = None else: status = "eval_failed" @@ -316,11 +330,43 @@ class SkillLearningPipelineService: status, event_type="eval_completed", eval_report_id=report.report_id, + eval_progress={ + "phase": "completed", + "completed_arms": len(report.cases) * 2 if report.mode == "replay" else 0, + "total_arms": len(report.cases) * 2 if report.mode == "replay" else 0, + "completed_cases": len(report.cases), + "total_cases": len(report.cases), + }, last_error=error, payload=report.to_dict(), ) return report + def mark_eval_progress(self, candidate_id: str, progress: dict[str, Any]) -> SkillLearningCandidate: + return self._require_updated( + self.learning_store.update_learning_candidate( + candidate_id, + eval_progress=dict(progress), + ), + candidate_id, + ) + + def mark_eval_failed(self, candidate_id: str, error: str) -> SkillLearningCandidate: + candidate = self.get_candidate(candidate_id) + progress = dict(candidate.eval_progress) + progress["phase"] = "failed" + return self._require_updated( + self.learning_store.transition_learning_candidate( + candidate_id, + "eval_failed", + eval_progress=progress, + event_type="eval_failed", + last_error=error, + payload={"error": error}, + ), + candidate_id, + ) + def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None: reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id) if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews): @@ -372,6 +418,14 @@ class SkillLearningPipelineService: return candidate return None + @staticmethod + def _candidate_status_for_draft(draft: 
SkillDraft) -> str: + if draft.status == SkillReviewState.APPROVED.value: + return "approved" + if draft.status == SkillReviewState.IN_REVIEW.value: + return "review_pending" + return "draft_ready" + @staticmethod def _require_updated(candidate: SkillLearningCandidate | None, candidate_id: str) -> SkillLearningCandidate: if candidate is None: diff --git a/app-instance/backend/beaver/skills/learning/replay.py b/app-instance/backend/beaver/skills/learning/replay.py index debec50..cc7f42f 100644 --- a/app-instance/backend/beaver/skills/learning/replay.py +++ b/app-instance/backend/beaver/skills/learning/replay.py @@ -3,7 +3,8 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Any, Literal +from time import perf_counter +from typing import Any, Callable, Literal from uuid import uuid4 from beaver.tools.base import ToolContext, ToolResult, ToolSpec @@ -59,6 +60,7 @@ class ReplayToolExecutor: *, context: ToolContext | None = None, ) -> ToolResult: + started_at = perf_counter() tool = self.registry.get(tool_name) spec = tool.spec if tool is not None else ToolSpec( name=tool_name, @@ -84,6 +86,7 @@ class ReplayToolExecutor: "error": result.error, "content": result.content[:2000], } + trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2) self.traces.append(trace) return result if mode == "surrogate": @@ -92,6 +95,7 @@ class ReplayToolExecutor: "error": "replay_surrogate", "content": "Tool call recorded for surrogate evaluation.", } + trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2) self.traces.append(trace) return ToolResult( success=True, @@ -105,6 +109,7 @@ class ReplayToolExecutor: "error": "replay_blocked", "content": "Tool call blocked by replay policy.", } + trace["duration_ms"] = round((perf_counter() - started_at) * 1000, 2) self.traces.append(trace) return ToolResult( success=False, @@ -151,12 +156,20 @@ class ReplayArmRequest: class ReplayRunner: - def __init__(self, *, 
agent_loop: Any, policy: ReplayToolPolicy | None = None) -> None: + def __init__( + self, + *, + agent_loop: Any, + policy: ReplayToolPolicy | None = None, + isolated_loop_factory: Callable[[], Any] | None = None, + ) -> None: self.agent_loop = agent_loop self.policy = policy or ReplayToolPolicy() + self.isolated_loop_factory = isolated_loop_factory async def run_arm(self, request: ReplayArmRequest) -> dict[str, Any]: - loaded = self.agent_loop.boot() + target_loop = self.isolated_loop_factory() if self.isolated_loop_factory is not None else self.agent_loop + loaded = target_loop.boot() replay_executor = ReplayToolExecutor( loaded.tool_executor, registry=loaded.tool_registry, @@ -174,23 +187,42 @@ class ReplayRunner: "tool_executor_override": replay_executor, } try: - result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs) - except RuntimeError as exc: - if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"): - raise - result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs) - return { - "case_id": request.case_id, - "arm": request.arm, - "session_id": result.session_id, - "run_id": result.run_id, - "task_text": request.task_text, - "finish_reason": result.finish_reason, - "final_answer": result.output_text, - "tool_calls": list(replay_executor.traces), - "artifacts": [], - "side_effects": _side_effects_from_traces(replay_executor.traces), - } + try: + result = await target_loop.process_direct(request.task_text, **direct_kwargs) + except RuntimeError as exc: + if not _is_process_direct_disabled_while_running(exc) or not hasattr(target_loop, "submit_direct"): + raise + result = await target_loop.submit_direct(request.task_text, **direct_kwargs) + session_manager = getattr(loaded, "session_manager", None) + if session_manager is not None and hasattr(session_manager, "end_session"): + session_manager.end_session(result.session_id, "evaluation_complete") + return { + 
"case_id": request.case_id, + "arm": request.arm, + "session_id": result.session_id, + "run_id": result.run_id, + "task_text": request.task_text, + "finish_reason": result.finish_reason, + "final_answer": result.output_text, + "tool_calls": list(replay_executor.traces), + "artifacts": [], + "side_effects": _side_effects_from_traces(replay_executor.traces), + } + finally: + if target_loop is not self.agent_loop and hasattr(target_loop, "close"): + mcp_manager = getattr(loaded, "mcp_manager", None) + if mcp_manager is not None and hasattr(mcp_manager, "close"): + try: + await mcp_manager.close() + finally: + closeables = getattr(loaded, "closeables", None) + if isinstance(closeables, list): + loaded.closeables = [ + (name, close_fn) + for name, close_fn in closeables + if name != "mcp_manager" + ] + target_loop.close() def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool: diff --git a/app-instance/backend/beaver/tools/builtins/web.py b/app-instance/backend/beaver/tools/builtins/web.py index 8b5d469..90e55b3 100644 --- a/app-instance/backend/beaver/tools/builtins/web.py +++ b/app-instance/backend/beaver/tools/builtins/web.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio from dataclasses import dataclass, field from html import unescape import json @@ -51,7 +52,8 @@ class WebFetchTool: try: safe_url = _safe_url(url) limit = max(1000, min(int(max_chars or 12000), 50000)) - async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=True) as client: + timeout = httpx.Timeout(connect=5, read=12, write=5, pool=5) + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client: response = await client.get( safe_url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"}, @@ -76,7 +78,7 @@ class WebFetchTool: @dataclass(slots=True) class WebSearchTool: name: str = "web_search" - description: str = "Search the web using DuckDuckGo HTML results. No API key required." 
+ description: str = "Search the public web using HTML results. No API key required." toolset: str = "web" always_available: bool = False parameters: dict[str, Any] = field( @@ -95,23 +97,102 @@ class WebSearchTool: if not str(query).strip(): raise ValueError("query is required") bounded = max(1, min(int(limit or 5), 10)) - url = f"https://duckduckgo.com/html/?q={quote_plus(query)}" - async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=True) as client: - response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"}) - response.raise_for_status() - html = response.text - results: list[dict[str, str]] = [] - pattern = re.compile( - r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>', - re.I | re.S, - ) - for match in pattern.finditer(html): - title = _strip_html(match.group("title")) - result_url = unescape(match.group("url")) - if title and result_url: - results.append({"title": title, "url": result_url, "snippet": ""}) - if len(results) >= bounded: - break - return _json_result(True, query=query, results=results) + headers = {"User-Agent": "Mozilla/5.0 Beaver/1.0"} + timeout = httpx.Timeout(connect=5, read=8, write=5, pool=5) + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client: + tasks = [ + asyncio.create_task( + _search_bing( + client, + query=query, + limit=bounded, + headers=headers, + ) + ), + asyncio.create_task( + _search_duckduckgo( + client, + query=query, + limit=bounded, + headers=headers, + ) + ), + ] + errors: list[str] = [] + try: + for completed in asyncio.as_completed(tasks): + try: + engine, results = await completed + except Exception as exc: + errors.append(str(exc)) + continue + if results: + return _json_result(True, query=query, engine=engine, results=results) + detail = "; ".join(error for error in errors if error) or "no search results" + return _json_result(False, query=query, error=detail) + finally: + for task in tasks: + if not task.done(): + 
task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) except Exception as exc: return _json_result(False, query=query, error=str(exc)) + + +async def _search_bing( + client: httpx.AsyncClient, + *, + query: str, + limit: int, + headers: dict[str, str], +) -> tuple[str, list[dict[str, str]]]: + response = await client.get(f"https://www.bing.com/search?q={quote_plus(query)}", headers=headers) + response.raise_for_status() + return "bing", _parse_bing_results(response.text, limit) + + +async def _search_duckduckgo( + client: httpx.AsyncClient, + *, + query: str, + limit: int, + headers: dict[str, str], +) -> tuple[str, list[dict[str, str]]]: + response = await client.get(f"https://duckduckgo.com/html/?q={quote_plus(query)}", headers=headers) + response.raise_for_status() + return "duckduckgo", _parse_duckduckgo_results(response.text, limit) + + +def _parse_bing_results(html: str, limit: int) -> list[dict[str, str]]: + results: list[dict[str, str]] = [] + pattern = re.compile( + r'<li[^>]+class="[^"]*\bb_algo\b[^"]*"[^>]*>.*?<h2[^>]*>\s*' + r'<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>.*?' 
+ r'(?:<p[^>]*>(?P<snippet>.*?)</p>)?', + re.I | re.S, + ) + for match in pattern.finditer(html): + title = _strip_html(match.group("title")) + result_url = unescape(match.group("url")) + snippet = _strip_html(match.group("snippet") or "") + if title and result_url: + results.append({"title": title, "url": result_url, "snippet": snippet}) + if len(results) >= limit: + break + return results + + +def _parse_duckduckgo_results(html: str, limit: int) -> list[dict[str, str]]: + results: list[dict[str, str]] = [] + pattern = re.compile( + r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>', + re.I | re.S, + ) + for match in pattern.finditer(html): + title = _strip_html(match.group("title")) + result_url = unescape(match.group("url")) + if title and result_url: + results.append({"title": title, "url": result_url, "snippet": ""}) + if len(results) >= limit: + break + return results diff --git a/app-instance/backend/tests/unit/test_cron_service.py b/app-instance/backend/tests/unit/test_cron_service.py index 2aeb6b6..3bbe880 100644 --- a/app-instance/backend/tests/unit/test_cron_service.py +++ b/app-instance/backend/tests/unit/test_cron_service.py @@ -29,6 +29,18 @@ def test_schedule_from_frontend_payload() -> None: assert cron.kind == "cron" +def test_legacy_interval_schedule_recovers_duration_from_display() -> None: + schedule = CronSchedule.from_dict( + { + "kind": "every", + "every_ms": None, + "display": "every 1800s", + } + ) + + assert schedule.every_ms == 30 * 60 * 1000 + + def test_compute_next_run_skips_missed_interval() -> None: schedule = CronSchedule(kind="every", every_ms=60_000) assert compute_next_run(schedule, now_ms=1_000_000, last_run_at_ms=0) > 1_000_000 @@ -80,6 +92,22 @@ def test_manual_run_records_scheduled_run_output(tmp_path) -> None: assert updated.to_api_dict()["last_scheduled_run_id"] == run.scheduled_run_id +def test_persisted_interval_job_keeps_schedule_and_next_run(tmp_path) -> None: + store_path = tmp_path / 
"jobs.json" + service = CronService(store_path) + job = service.add_job( + name="Hydration reminder", + message="Drink water", + schedule=CronSchedule(kind="every", every_ms=30 * 60 * 1000), + ) + + reloaded = CronService(store_path).get_job(job.id) + + assert reloaded is not None + assert reloaded.schedule.every_ms == 30 * 60 * 1000 + assert reloaded.next_run_at_ms == job.next_run_at_ms + + def test_cron_tool_uses_runtime_service(tmp_path) -> None: service = CronService(tmp_path / "jobs.json") tool = CronTool() diff --git a/app-instance/backend/tests/unit/test_outlook_integration.py b/app-instance/backend/tests/unit/test_outlook_integration.py new file mode 100644 index 0000000..be6b871 --- /dev/null +++ b/app-instance/backend/tests/unit/test_outlook_integration.py @@ -0,0 +1,71 @@ +import asyncio + +import pytest + +from beaver.foundation.config.schema import AuthzConfig, BackendIdentityConfig, BeaverConfig +from beaver.integrations import outlook + + +class _FakeAuthzClient: + async def get_outlook_settings(self, backend_id: str) -> dict: + assert backend_id == "steven" + return { + "configured": True, + "email": "steven.yx.li@boardware.com", + "server": "mail.boardware.com.mo", + } + + +def _authz_config() -> BeaverConfig: + return BeaverConfig( + authz=AuthzConfig( + enabled=True, + base_url="http://authz.example", + outlook_mcp_url="http://outlook-mcp.example/mcp", + ), + backend_identity=BackendIdentityConfig( + backend_id="steven", + client_id="steven", + client_secret="secret", + ), + ) + + +def test_outlook_status_does_not_probe_mcp_by_default(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + monkeypatch.setattr(outlook, "_authz_client", lambda _config: _FakeAuthzClient()) + + async def fail_if_called(*_args, **_kwargs): + raise AssertionError("status should not call Outlook MCP by default") + + monkeypatch.setattr(outlook, "_call_outlook_mcp_tool", fail_if_called) + + result = asyncio.run(outlook.outlook_status(_authz_config(), tmp_path)) + + assert 
result["configured"] is True + assert result["connected"] is False + assert result["auth_status"] is None + assert result["error"] is None + + +def test_outlook_overview_loads_sections_serially(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + monkeypatch.setattr(outlook, "_authz_client", lambda _config: _FakeAuthzClient()) + active_calls = 0 + max_active_calls = 0 + tool_names: list[str] = [] + + async def fake_call(_config, tool_name: str, _arguments, **_kwargs): + nonlocal active_calls, max_active_calls + tool_names.append(tool_name) + active_calls += 1 + max_active_calls = max(max_active_calls, active_calls) + await asyncio.sleep(0.01) + active_calls -= 1 + return {"value": []} + + monkeypatch.setattr(outlook, "_call_outlook_mcp_tool", fake_call) + + result = asyncio.run(outlook.get_overview(_authz_config(), tmp_path)) + + assert result["warnings"] == [] + assert tool_names == ["mail_list_messages", "mail_list_messages", "calendar_list_events"] + assert max_active_calls == 1 diff --git a/app-instance/backend/tests/unit/test_phase5_skills_runtime.py b/app-instance/backend/tests/unit/test_phase5_skills_runtime.py index 688a1c6..4d45e66 100644 --- a/app-instance/backend/tests/unit/test_phase5_skills_runtime.py +++ b/app-instance/backend/tests/unit/test_phase5_skills_runtime.py @@ -27,6 +27,7 @@ class StubProvider(LLMProvider): def __init__(self, responses: list[LLMResponse]) -> None: super().__init__() self._responses = list(responses) + self.calls: list[dict] = [] async def chat( self, @@ -37,6 +38,16 @@ class StubProvider(LLMProvider): temperature: float = 0.7, thinking_enabled: bool | None = None, ) -> LLMResponse: + self.calls.append( + { + "messages": messages, + "tools": tools, + "model": model, + "max_tokens": max_tokens, + "temperature": temperature, + "thinking_enabled": thinking_enabled, + } + ) if not self._responses: raise AssertionError("No stubbed provider responses left") return self._responses.pop(0) @@ -704,32 +715,33 @@ def 
test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path: skill_assembler=StubSkillAssembler([skill]), ) loop = AgentLoop(loader=loader) + provider = StubProvider( + [ + LLMResponse( + content="Need a tool.", + finish_reason="tool_calls", + tool_calls=[_tool_call()], + provider_name="stub", + model="stub-model", + ), + LLMResponse( + content="Need another tool.", + finish_reason="tool_calls", + tool_calls=[_tool_call(call_id="call-2")], + provider_name="stub", + model="stub-model", + ), + LLMResponse( + content="Based on the available tool result, the container likely failed during startup.", + finish_reason="stop", + provider_name="stub", + model="stub-model", + ), + ] + ) bundle = ProviderBundle( main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), - main_provider=StubProvider( - [ - LLMResponse( - content="Need a tool.", - finish_reason="tool_calls", - tool_calls=[_tool_call()], - provider_name="stub", - model="stub-model", - ), - LLMResponse( - content="Need another tool.", - finish_reason="tool_calls", - tool_calls=[_tool_call(call_id="call-2")], - provider_name="stub", - model="stub-model", - ), - LLMResponse( - content="Based on the available tool result, the container likely failed during startup.", - finish_reason="stop", - provider_name="stub", - model="stub-model", - ), - ] - ), + main_provider=provider, ) result = asyncio.run( @@ -744,6 +756,21 @@ def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path: assert result.finish_reason == "max_tool_iterations_finalized" assert "Based on the available tool result" in result.output_text assert "Tool loop stopped" not in result.output_text + finalization_messages = provider.calls[-1]["messages"] + assistant_tool_call_ids = [ + call["id"] + for message in finalization_messages + for call in message.get("tool_calls", []) + if message.get("role") == "assistant" + ] + tool_result_ids = [ + message.get("tool_call_id") + for message in 
finalization_messages + if message.get("role") == "tool" + ] + assert "call-1" in assistant_tool_call_ids + assert "call-2" not in assistant_tool_call_ids + assert set(assistant_tool_call_ids).issubset(set(tool_result_ids)) effect_records = loaded.run_memory_store.list_skill_effects("docker-debug", version="v0007") assert effect_records[-1].run_id == result.run_id assert effect_records[-1].success is False diff --git a/app-instance/backend/tests/unit/test_session_archive.py b/app-instance/backend/tests/unit/test_session_archive.py index 5b5eee6..fa9d613 100644 --- a/app-instance/backend/tests/unit/test_session_archive.py +++ b/app-instance/backend/tests/unit/test_session_archive.py @@ -105,3 +105,29 @@ def test_web_archive_route_does_not_create_archive_suffix_session(tmp_path: Path assert loaded.session_manager.get_session("web:alpha")["end_reason"] == "archived" # type: ignore[union-attr] assert loaded.session_manager.get_session("web:alpha/archive") is None # type: ignore[union-attr] assert sessions_response.json() == [] + + +def test_web_session_list_hides_skill_replay_evaluation_sessions(tmp_path: Path) -> None: + service = AgentService(workspace=tmp_path) + loaded = service.create_loop().boot() + loaded.session_manager.ensure_session("eval-session", source="skill_replay_eval") # type: ignore[union-attr] + loaded.session_manager.ensure_session("web:visible", source="web") # type: ignore[union-attr] + app = create_app(service=service, manage_service_lifecycle=False) + + with TestClient(app) as client: + response = client.get("/api/sessions") + + assert response.status_code == 200 + assert [item["key"] for item in response.json()] == ["web:visible"] + + +def test_get_missing_session_returns_404_without_creating_it(tmp_path: Path) -> None: + service = AgentService(workspace=tmp_path) + app = create_app(service=service, manage_service_lifecycle=False) + + with TestClient(app) as client: + response = client.get("/api/sessions/missing-session") + + assert 
response.status_code == 404 + loaded = service.create_loop().boot() + assert loaded.session_manager.get_session("missing-session") is None # type: ignore[union-attr] diff --git a/app-instance/backend/tests/unit/test_skill_learning_eval.py b/app-instance/backend/tests/unit/test_skill_learning_eval.py index 7bf11ed..2a16b77 100644 --- a/app-instance/backend/tests/unit/test_skill_learning_eval.py +++ b/app-instance/backend/tests/unit/test_skill_learning_eval.py @@ -201,6 +201,22 @@ class FakeReplayRunner: } +class ConcurrentReplayRunner(FakeReplayRunner): + def __init__(self) -> None: + super().__init__() + self.active = 0 + self.max_active = 0 + + async def run_arm(self, request): + self.active += 1 + self.max_active = max(self.max_active, self.active) + await asyncio.sleep(0.02) + try: + return await super().run_arm(request) + finally: + self.active -= 1 + + def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None: pipeline = _pipeline(tmp_path) draft = pipeline.draft_service.create_new_skill_draft( @@ -238,6 +254,94 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None: assert report.tool_execution_summary["score_role"] == "diagnostic_only" +def test_replay_eval_reports_arm_progress(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="release-checklist", + proposed_content="# Release\n\nRun tests.", + proposed_frontmatter={"description": "release", "tools": []}, + created_by="test", + reason="test", + ) + pipeline.learning_store.update_learning_candidate( + "candidate-1", + draft_skill_name=draft.skill_name, + draft_id=draft.draft_id, + ) + progress: list[dict] = [] + + asyncio.run( + pipeline.evaluate_draft( + "candidate-1", + draft.skill_name, + draft.draft_id, + provider_bundle=_bundle(), + replay_runner=FakeReplayRunner(), + progress_callback=progress.append, + ) + ) + + assert progress[0] == { + "phase": "replaying", + "completed_arms": 
0, + "total_arms": 20, + "completed_cases": 0, + "total_cases": 10, + } + assert progress[-1] == { + "phase": "replaying", + "completed_arms": 20, + "total_arms": 20, + "completed_cases": 10, + "total_cases": 10, + } + + +def test_replay_eval_runs_cases_with_bounded_parallelism(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path) + pipeline.evaluator = SkillDraftEvaluator( + pipeline.learning_service.run_store, + max_parallel_cases=2, + ) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="release-checklist", + proposed_content="# Release\n\nRun tests.", + proposed_frontmatter={"description": "release", "tools": []}, + created_by="test", + reason="test", + ) + pipeline.learning_store.update_learning_candidate( + "candidate-1", + draft_skill_name=draft.skill_name, + draft_id=draft.draft_id, + ) + replay_runner = ConcurrentReplayRunner() + + report = asyncio.run( + pipeline.evaluate_draft( + "candidate-1", + draft.skill_name, + draft.draft_id, + provider_bundle=_bundle(), + replay_runner=replay_runner, + ) + ) + + assert replay_runner.max_active == 2 + assert [case["run_id"] for case in report.cases] == [ + "run-1", + "synthetic:candidate-1:01", + "synthetic:candidate-1:02", + "synthetic:candidate-1:03", + "synthetic:candidate-1:04", + "synthetic:candidate-1:05", + "synthetic:candidate-1:06", + "synthetic:candidate-1:07", + "synthetic:candidate-1:08", + "synthetic:candidate-1:09", + ] + + def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None: pipeline = _pipeline(tmp_path) pipeline.learning_store.update_learning_candidate( diff --git a/app-instance/backend/tests/unit/test_skill_learning_pipeline.py b/app-instance/backend/tests/unit/test_skill_learning_pipeline.py index d3e999d..5b82dd9 100644 --- a/app-instance/backend/tests/unit/test_skill_learning_pipeline.py +++ b/app-instance/backend/tests/unit/test_skill_learning_pipeline.py @@ -98,6 +98,27 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) 
-> None: pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester") +def test_safety_recheck_keeps_submitted_candidate_in_review(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="reviewed-skill", + proposed_content="# Reviewed Skill\n\nDo the thing.", + proposed_frontmatter={"description": "reviewed"}, + created_by="test", + reason="test", + ) + candidate = pipeline.get_candidate("candidate-1") + candidate.draft_skill_name = draft.skill_name + candidate.draft_id = draft.draft_id + pipeline.learning_store.record_learning_candidate(candidate) + + pipeline.check_safety(draft.skill_name, draft.draft_id) + pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester") + pipeline.check_safety(draft.skill_name, draft.draft_id) + + assert pipeline.get_candidate("candidate-1").status == "review_pending" + + def test_pipeline_reject_blocks_publish(tmp_path: Path) -> None: pipeline = _pipeline(tmp_path) draft = pipeline.draft_service.create_new_skill_draft( diff --git a/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py b/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py index 93c09db..886c19a 100644 --- a/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py +++ b/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py @@ -7,8 +7,17 @@ from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner class FakeAgentLoop: + def __init__(self) -> None: + self.ended_sessions: list[tuple[str, str]] = [] + def boot(self): - return SimpleNamespace(tool_executor=SimpleNamespace(), tool_registry=SimpleNamespace(get=lambda name: None)) + return SimpleNamespace( + tool_executor=SimpleNamespace(), + tool_registry=SimpleNamespace(get=lambda name: None), + session_manager=SimpleNamespace( + end_session=lambda session_id, reason: self.ended_sessions.append((session_id, reason)) + ), + ) async def 
process_direct(self, task: str, **kwargs): executor = kwargs["tool_executor_override"] @@ -18,6 +27,7 @@ class FakeAgentLoop: class FakeRunningAgentLoop(FakeAgentLoop): def __init__(self) -> None: + super().__init__() self.process_direct_calls = 0 self.submit_direct_calls: list[tuple[str, dict]] = [] @@ -35,6 +45,29 @@ class FakeRunningAgentLoop(FakeAgentLoop): return SimpleNamespace(session_id="session-queued", run_id="run-queued", output_text="queued done", finish_reason="stop") +class FakeIsolatedAgentLoop(FakeAgentLoop): + def __init__(self) -> None: + super().__init__() + self.closed = False + self.mcp_manager = SimpleNamespace(close=self._close_mcp) + self.mcp_closed = False + self.loaded = None + + async def _close_mcp(self) -> None: + self.mcp_closed = True + + def close(self) -> None: + assert self.mcp_closed is True + self.closed = True + + def boot(self): + if self.loaded is None: + self.loaded = super().boot() + self.loaded.mcp_manager = self.mcp_manager + self.loaded.closeables = [("mcp_manager", lambda: None)] + return self.loaded + + def test_replay_runner_returns_arm_report_with_tool_trace() -> None: runner = ReplayRunner(agent_loop=FakeAgentLoop()) request = ReplayArmRequest( @@ -53,6 +86,8 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None: assert report["arm"] == "candidate" assert report["finish_reason"] == "stop" assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email" + assert report["tool_calls"][0]["duration_ms"] >= 0 + assert runner.agent_loop.ended_sessions == [("session-replay", "evaluation_complete")] def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None: @@ -83,3 +118,31 @@ def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None: assert report["session_id"] == "session-queued" assert report["run_id"] == "run-queued" assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email" + assert agent_loop.ended_sessions == [("session-queued", "evaluation_complete")] + + 
+def test_replay_runner_uses_and_closes_isolated_loop() -> None: + shared_loop = FakeRunningAgentLoop() + isolated_loops: list[FakeIsolatedAgentLoop] = [] + + def create_isolated_loop() -> FakeIsolatedAgentLoop: + loop = FakeIsolatedAgentLoop() + isolated_loops.append(loop) + return loop + + runner = ReplayRunner(agent_loop=shared_loop, isolated_loop_factory=create_isolated_loop) + request = ReplayArmRequest( + case_id="case-isolated", + arm="candidate", + task_text="Fetch current weather.", + provider_bundle=object(), + ) + + report = asyncio.run(runner.run_arm(request)) + + assert report["session_id"] == "session-replay" + assert shared_loop.process_direct_calls == 0 + assert shared_loop.submit_direct_calls == [] + assert len(isolated_loops) == 1 + assert isolated_loops[0].mcp_closed is True + assert isolated_loops[0].closed is True diff --git a/app-instance/backend/tests/unit/test_skill_learning_web_api.py b/app-instance/backend/tests/unit/test_skill_learning_web_api.py index 199ad6e..e7ad7a6 100644 --- a/app-instance/backend/tests/unit/test_skill_learning_web_api.py +++ b/app-instance/backend/tests/unit/test_skill_learning_web_api.py @@ -1,5 +1,7 @@ from __future__ import annotations +import asyncio +import time from pathlib import Path from types import SimpleNamespace @@ -16,7 +18,7 @@ class StubEvaluator: def __init__(self) -> None: self.calls = 0 - async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None): + async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None, progress_callback=None): self.calls += 1 return SkillDraftEvalReport( report_id="eval-existing", @@ -34,6 +36,18 @@ class StubEvaluator: ) +class SlowEvaluator(StubEvaluator): + async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None, progress_callback=None): + await asyncio.sleep(0.15) + return await super().evaluate( + candidate=candidate, + draft=draft, + provider_bundle=provider_bundle, + replay_runner=replay_runner, 
+ progress_callback=progress_callback, + ) + + def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None: service = AgentService(workspace=tmp_path) loaded = service.create_loop().boot() @@ -193,15 +207,79 @@ def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None: with TestClient(app) as client: response = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit") + deadline = time.monotonic() + 1 + payload = response.json() + while payload["eval_report"] is None and time.monotonic() < deadline: + time.sleep(0.02) + payload = client.get(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}").json() assert response.status_code == 200 - payload = response.json() assert evaluator.calls == 1 assert payload["status"] == "in_review" assert payload["safety_report"]["passed"] is True assert payload["eval_report"]["report_id"] == "eval-existing" +def test_submit_draft_returns_before_eval_and_is_idempotent(tmp_path: Path, monkeypatch) -> None: + service = AgentService(workspace=tmp_path) + loaded = service.create_loop().boot() + draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft( # type: ignore[union-attr] + skill_name="weather-search", + proposed_content="# Weather Search\n\nUse current weather sources.", + proposed_frontmatter={"description": "weather", "tools": []}, + created_by="test", + reason="test", + ) + loaded.skill_learning_store.record_learning_candidate( # type: ignore[union-attr] + SkillLearningCandidate( + candidate_id="candidate-weather", + kind="revise_skill", + source_run_ids=["run-1"], + source_session_ids=["session-1"], + related_skill_names=["weather-search"], + reason="revise", + status="draft_ready", + draft_skill_name=draft.skill_name, + draft_id=draft.draft_id, + ) + ) + evaluator = SlowEvaluator() + loaded.skill_learning_pipeline.evaluator = evaluator # type: ignore[union-attr] + monkeypatch.setattr( + service, + "_make_provider_bundle_for_task", + lambda 
loaded, kwargs: SimpleNamespace(main_provider=object()), + ) + app = create_app(service=service, manage_service_lifecycle=False) + + with TestClient(app) as client: + started = time.monotonic() + first = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit") + elapsed = time.monotonic() - started + second = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit") + deadline = time.monotonic() + 2 + payload = second.json() + while payload["eval_report"] is None and time.monotonic() < deadline: + time.sleep(0.05) + payload = client.get(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}").json() + + assert first.status_code == 200 + assert elapsed < 0.12 + assert first.json()["status"] == "in_review" + assert first.json()["eval_status"] == "pending" + assert first.json()["eval_progress"] == { + "phase": "preparing", + "completed_arms": 0, + "total_arms": 20, + "completed_cases": 0, + "total_cases": 10, + } + assert second.status_code == 200 + assert evaluator.calls == 1 + assert payload["eval_report"]["report_id"] == "eval-existing" + assert loaded.skill_learning_pipeline.get_candidate("candidate-weather").status == "review_pending" # type: ignore[union-attr] + + def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None: service = AgentService(workspace=tmp_path) loaded = service.create_loop().boot() diff --git a/app-instance/backend/tests/unit/test_terminal_websocket_channel.py b/app-instance/backend/tests/unit/test_terminal_websocket_channel.py index 0246805..152a17c 100644 --- a/app-instance/backend/tests/unit/test_terminal_websocket_channel.py +++ b/app-instance/backend/tests/unit/test_terminal_websocket_channel.py @@ -57,6 +57,14 @@ def write_terminal_config(tmp_path: Path) -> Path: return config_path +def write_terminal_config_with_device_session(tmp_path: Path) -> Path: + config_path = write_terminal_config(tmp_path) + payload = json.loads(config_path.read_text(encoding="utf-8")) + 
payload["channels"]["terminal-dev"]["config"]["sessionPeerFromDeviceName"] = True + config_path.write_text(json.dumps(payload), encoding="utf-8") + return config_path + + def test_terminal_websocket_connect_ping_and_message_roundtrip(tmp_path: Path) -> None: config_path = write_terminal_config(tmp_path) service = TerminalFakeAgentService(config_path=config_path) @@ -117,6 +125,98 @@ def test_terminal_websocket_connect_ping_and_message_roundtrip(tmp_path: Path) - assert inbound.channel_identity.message_id == "device-001-000001" +def test_terminal_websocket_can_use_device_name_as_stable_session_peer(tmp_path: Path) -> None: + config_path = write_terminal_config_with_device_session(tmp_path) + service = TerminalFakeAgentService(config_path=config_path) + app = create_app(service=service, manage_service_lifecycle=False) + + with TestClient(app) as client: + with client.websocket_connect("/api/channels/terminal-dev/ws") as websocket: + websocket.send_json( + { + "type": "connect", + "peer_id": "livekit-test-livekit-07291699", + "device_name": "desk-terminal", + } + ) + first = websocket.receive_json() + + with client.websocket_connect("/api/channels/terminal-dev/ws") as websocket: + websocket.send_json( + { + "type": "connect", + "peer_id": "livekit-test-livekit-3fb03fff", + "device_name": "desk-terminal", + } + ) + second = websocket.receive_json() + websocket.send_json( + { + "type": "message", + "message_id": "livekit-test-livekit-3fb03fff-000001", + "text": "hello", + } + ) + ack = websocket.receive_json() + reply = websocket.receive_json() + + service.close() + assert first["session_id"] == "terminal-dev:local:device-desk-terminal" + assert second["session_id"] == first["session_id"] + assert ack["session_id"] == first["session_id"] + assert reply["text"] == "echo:hello" + assert service.inbound_calls[0].session_id == first["session_id"] + assert service.inbound_calls[0].channel_identity is not None + assert service.inbound_calls[0].channel_identity.peer_id == 
"device-desk-terminal" + + +def test_terminal_websocket_reconnect_delivers_pending_reply_to_latest_device_connection(tmp_path: Path) -> None: + config_path = write_terminal_config_with_device_session(tmp_path) + service = TerminalFakeAgentService(config_path=config_path, delay_seconds=0.05) + app = create_app(service=service, manage_service_lifecycle=False) + + with TestClient(app) as client: + with client.websocket_connect("/api/channels/terminal-dev/ws") as first_websocket: + first_websocket.send_json( + { + "type": "connect", + "peer_id": "livekit-test-livekit-old", + "device_name": "desk-terminal", + } + ) + first = first_websocket.receive_json() + first_websocket.send_json( + { + "type": "message", + "message_id": "livekit-test-livekit-old-000001", + "text": "slow", + } + ) + assert first_websocket.receive_json()["accepted"] is True + + with client.websocket_connect("/api/channels/terminal-dev/ws") as latest_websocket: + latest_websocket.send_json( + { + "type": "connect", + "peer_id": "livekit-test-livekit-new", + "device_name": "desk-terminal", + } + ) + latest = latest_websocket.receive_json() + reply = latest_websocket.receive_json() + + service.close() + assert latest["session_id"] == first["session_id"] + assert reply == { + "type": "message", + "role": "assistant", + "message_id": "livekit-test-livekit-old-000001", + "run_id": "run-1", + "text": "echo:slow", + "finish_reason": "stop", + } + + def test_terminal_websocket_rejects_message_before_connect(tmp_path: Path) -> None: config_path = write_terminal_config(tmp_path) service = TerminalFakeAgentService(config_path=config_path) diff --git a/app-instance/backend/tests/unit/test_web_tools.py b/app-instance/backend/tests/unit/test_web_tools.py index 0f621f4..de5f8a9 100644 --- a/app-instance/backend/tests/unit/test_web_tools.py +++ b/app-instance/backend/tests/unit/test_web_tools.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import json from beaver.tools.builtins import web @@ -8,8 
+9,16 @@ from beaver.tools.builtins import web class _FakeResponse: headers = {"content-type": "text/html"} status_code = 200 - text = '<a class="result__a" href="https://example.com">Example</a>' - url = "https://example.com" + + def __init__(self, url: str = "https://example.com") -> None: + self.url = url + if "duckduckgo.com" in url: + self.text = '<a class="result__a" href="https://duck.example.com">Duck Example</a>' + else: + self.text = ( + '<li class="b_algo"><h2><a href="https://example.com">Example</a></h2>' + "<p>Example result</p></li>" + ) def raise_for_status(self) -> None: return None @@ -17,6 +26,8 @@ class _FakeResponse: class _FakeAsyncClient: calls: list[dict[str, object]] = [] + urls: list[str] = [] + fail_bing = False def __init__(self, **kwargs: object) -> None: self.calls.append(kwargs) @@ -28,7 +39,11 @@ class _FakeAsyncClient: return None async def get(self, *args: object, **kwargs: object) -> _FakeResponse: - return _FakeResponse() + url = str(args[0]) + self.urls.append(url) + if self.fail_bing and "bing.com" in url: + raise web.httpx.ConnectTimeout("bing unavailable") + return _FakeResponse(url) def test_web_tools_use_environment_proxy_settings(monkeypatch) -> None: @@ -42,3 +57,56 @@ def test_web_tools_use_environment_proxy_settings(monkeypatch) -> None: asyncio.run(_run()) assert [call.get("trust_env") for call in _FakeAsyncClient.calls] == [True, True] + + +def test_web_fetch_uses_short_connect_timeout(monkeypatch) -> None: + _FakeAsyncClient.calls = [] + _FakeAsyncClient.urls = [] + _FakeAsyncClient.fail_bing = False + monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient) + + asyncio.run(web.WebFetchTool().execute(url="https://example.com")) + + timeout = _FakeAsyncClient.calls[0]["timeout"] + assert isinstance(timeout, web.httpx.Timeout) + assert timeout.connect == 5 + assert timeout.read == 12 + + +def test_web_search_uses_reachable_bing_endpoint_first(monkeypatch) -> None: + _FakeAsyncClient.calls = [] + 
_FakeAsyncClient.urls = [] + _FakeAsyncClient.fail_bing = False + monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient) + + raw = asyncio.run(web.WebSearchTool().execute(query="weather beijing")) + + payload = json.loads(raw) + assert payload["success"] is True + assert payload["engine"] in {"bing", "duckduckgo"} + assert set(_FakeAsyncClient.urls) == { + "https://www.bing.com/search?q=weather+beijing", + "https://duckduckgo.com/html/?q=weather+beijing", + } + + timeout = _FakeAsyncClient.calls[0]["timeout"] + assert isinstance(timeout, web.httpx.Timeout) + assert timeout.connect == 5 + assert timeout.read == 8 + + +def test_web_search_falls_back_when_bing_is_unavailable(monkeypatch) -> None: + _FakeAsyncClient.calls = [] + _FakeAsyncClient.urls = [] + _FakeAsyncClient.fail_bing = True + monkeypatch.setattr(web.httpx, "AsyncClient", _FakeAsyncClient) + + raw = asyncio.run(web.WebSearchTool().execute(query="weather beijing")) + + payload = json.loads(raw) + assert payload["success"] is True + assert payload["engine"] == "duckduckgo" + assert set(_FakeAsyncClient.urls) == { + "https://www.bing.com/search?q=weather+beijing", + "https://duckduckgo.com/html/?q=weather+beijing", + } diff --git a/app-instance/frontend/app/(app)/notifications/page.tsx b/app-instance/frontend/app/(app)/notifications/page.tsx index 5fa3b06..5551f6e 100644 --- a/app-instance/frontend/app/(app)/notifications/page.tsx +++ b/app-instance/frontend/app/(app)/notifications/page.tsx @@ -8,6 +8,7 @@ import { listNotifications } from '@/lib/api'; import type { NotificationRun } from '@/types'; import { pickAppText } from '@/lib/i18n/core'; import { useAppI18n } from '@/lib/i18n/provider'; +import { scheduleNotificationRefresh } from '@/lib/notification-runtime'; import { containedLongTextClass } from '@/lib/text-wrapping'; import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; @@ -19,20 +20,21 @@ export default function NotificationsPage() { const 
[loading, setLoading] = useState(true); const [error, setError] = useState<string | null>(null); - const load = React.useCallback(async () => { - setLoading(true); + const load = React.useCallback(async (background = false) => { + if (!background) setLoading(true); setError(null); try { setItems(await listNotifications()); } catch (err: any) { setError(err.message || pickAppText(locale, '加载通知失败', 'Failed to load notifications')); } finally { - setLoading(false); + if (!background) setLoading(false); } }, [locale]); useEffect(() => { void load(); + return scheduleNotificationRefresh(() => load(true)); }, [load]); const formatTime = (value?: string | null) => { diff --git a/app-instance/frontend/app/(app)/outlook/page.tsx b/app-instance/frontend/app/(app)/outlook/page.tsx index d069725..8553294 100644 --- a/app-instance/frontend/app/(app)/outlook/page.tsx +++ b/app-instance/frontend/app/(app)/outlook/page.tsx @@ -57,6 +57,7 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs'; import type { AppLocale } from '@/lib/i18n/core'; import { pickAppText } from '@/lib/i18n/core'; import { useAppI18n } from '@/lib/i18n/provider'; +import { nextOutlookAutoLoadTarget, type OutlookAutoLoadView } from '@/lib/outlook-page-state'; type OutlookFormState = OutlookConnectionPayload; type OutlookView = 'inbox' | 'sent' | 'calendar' | 'settings'; @@ -368,6 +369,11 @@ export default function OutlookPage() { sent: false, }); const [calendarLoading, setCalendarLoading] = useState(false); + const [autoLoadAttempted, setAutoLoadAttempted] = useState<Record<OutlookAutoLoadView, boolean>>({ + inbox: false, + sent: false, + calendar: false, + }); const formDirtyRef = React.useRef(formDirty); useEffect(() => { @@ -399,6 +405,7 @@ export default function OutlookPage() { }, [t]); const loadMailboxPage = useCallback(async (view: OutlookMailboxView, skip = 0) => { + setAutoLoadAttempted((current) => ({ ...current, [view]: true })); setMailboxLoading((current) => ({ 
...current, [view]: true })); try { const nextPage = await getOutlookMessages(view === 'inbox' ? 'inbox' : 'sentitems', { @@ -425,6 +432,7 @@ export default function OutlookPage() { }, [t]); const loadCalendarPage = useCallback(async (anchorKey: string) => { + setAutoLoadAttempted((current) => ({ ...current, calendar: true })); setCalendarLoading(true); try { const range = buildCalendarRange(anchorKey); @@ -461,9 +469,7 @@ export default function OutlookPage() { if (!background) { setStatusLoading(false); } - if (nextStatus.configured) { - await loadOverview(options?.preserveOverview ?? background); - } else { + if (!nextStatus.configured) { setOverview(null); setOverviewLoading(false); } @@ -523,9 +529,6 @@ export default function OutlookPage() { ); const isConfigured = Boolean(status?.configured); const isConnected = Boolean(status?.connected); - const inboxCount = overview?.recentInbox.length ?? 0; - const sentCount = overview?.recentSent.length ?? 0; - const eventCount = overview?.todayEvents.length ?? 0; const overviewWarnings = overview?.warnings || []; const testWarnings = testResult?.warnings || []; const statusPending = statusLoading && !status; @@ -538,7 +541,6 @@ export default function OutlookPage() { label: t('设置', 'Settings'), hint: t('配置 Outlook 连接', 'Configure the Outlook connection'), icon: Settings2, - count: null, }, ]; } @@ -549,31 +551,27 @@ export default function OutlookPage() { label: t('收件箱', 'Inbox'), hint: t('最近接收邮件', 'Recently received mail'), icon: Inbox, - count: null, }, { id: 'sent' as const, label: t('发件箱', 'Sent'), hint: t('最近发送记录', 'Recently sent messages'), icon: Send, - count: null, }, { id: 'calendar' as const, label: t('日程', 'Calendar'), hint: t('未来 7 天', 'Next 7 days'), icon: CalendarDays, - count: overviewPending ? 
null : eventCount, }, { id: 'settings' as const, label: t('设置', 'Settings'), hint: t('连接与状态', 'Connection and status'), icon: Settings2, - count: null, }, ]; - }, [eventCount, inboxCount, isConfigured, overviewPending, sentCount, t]); + }, [isConfigured, t]); useEffect(() => { if (!availableViews.some((view) => view.id === activeView)) { @@ -582,20 +580,31 @@ export default function OutlookPage() { }, [activeView, availableViews]); useEffect(() => { - if (!isConfigured) { - return; - } - if (activeView === 'inbox' && !inboxPage && !mailboxLoading.inbox) { + const target = nextOutlookAutoLoadTarget({ + isConfigured, + activeView, + loaded: { + inbox: Boolean(inboxPage), + sent: Boolean(sentPage), + calendar: Boolean(calendarPage), + }, + loading: { + inbox: mailboxLoading.inbox, + sent: mailboxLoading.sent, + calendar: calendarLoading, + }, + attempted: autoLoadAttempted, + }); + if (target === 'inbox') { void loadMailboxPage('inbox', 0); - } - if (activeView === 'sent' && !sentPage && !mailboxLoading.sent) { + } else if (target === 'sent') { void loadMailboxPage('sent', 0); - } - if (activeView === 'calendar' && !calendarPage && !calendarLoading) { + } else if (target === 'calendar') { void loadCalendarPage(calendarAnchorKey); } }, [ activeView, + autoLoadAttempted, calendarAnchorKey, calendarLoading, calendarPage, @@ -638,6 +647,7 @@ export default function OutlookPage() { setInboxPage(null); setSentPage(null); setCalendarPage(null); + setAutoLoadAttempted({ inbox: false, sent: false, calendar: false }); setCalendarAnchorKey(toLocalDateKey(new Date())); await loadStatus(true, { forceFormSync: true }); setActiveView('inbox'); @@ -663,6 +673,7 @@ export default function OutlookPage() { setInboxPage(null); setSentPage(null); setCalendarPage(null); + setAutoLoadAttempted({ inbox: false, sent: false, calendar: false }); setCalendarAnchorKey(toLocalDateKey(new Date())); setActiveView('settings'); setFormDirty(false); @@ -676,6 +687,7 @@ export default function 
OutlookPage() { const refreshOverview = async () => { await loadStatus(true, { preserveOverview: true }); + await loadOverview(true); if (activeView === 'inbox') { await loadMailboxPage('inbox', inboxPage?.page.skip ?? 0); } else if (activeView === 'sent') { @@ -723,13 +735,6 @@ export default function OutlookPage() { </div> <div className="flex flex-wrap items-center gap-2"> - {isConfigured ? ( - <> - <TopStat label={t('收件箱', 'Inbox')} value={String(inboxCount)} loading={overviewPending} /> - <TopStat label={t('发件箱', 'Sent')} value={String(sentCount)} loading={overviewPending} /> - <TopStat label={t('日程', 'Calendar')} value={String(eventCount)} loading={overviewPending} /> - </> - ) : null} <Button variant="outline" size="sm" className="h-11" onClick={() => void refreshOverview()}> <RefreshCw className={`mr-2 h-4 w-4 ${refreshing ? 'animate-spin' : ''}`} /> {t('刷新', 'Refresh')} @@ -783,9 +788,6 @@ export default function OutlookPage() { </span> <div className="text-left"> <p className="text-sm font-semibold">{view.label}</p> - {typeof view.count === 'number' ? ( - <p className="text-xs text-muted-foreground">{t(`${view.count} 条`, `${view.count} items`)}</p> - ) : null} </div> </div> </div> @@ -1210,19 +1212,6 @@ function MiniStat({ label, value }: { label: string; value: string }) { ); } -function TopStat({ label, value, loading = false }: { label: string; value: string; loading?: boolean }) { - return ( - <div className="rounded-full border bg-background px-3 py-1 text-sm"> - <span className="text-muted-foreground">{label}</span> - {loading ? 
( - <Skeleton className="ml-2 inline-flex h-4 w-8 align-middle" /> - ) : ( - <span className="ml-2 font-semibold text-foreground">{value}</span> - )} - </div> - ); -} - function MessageCard({ title, icon, diff --git a/app-instance/frontend/app/(app)/page.tsx b/app-instance/frontend/app/(app)/page.tsx index 1dc67b7..b30b671 100644 --- a/app-instance/frontend/app/(app)/page.tsx +++ b/app-instance/frontend/app/(app)/page.tsx @@ -39,7 +39,7 @@ import { pickAppText } from '@/lib/i18n/core'; import { useAppI18n } from '@/lib/i18n/provider'; import { useChatStore } from '@/lib/store'; import { buildTaskTimelineView } from '@/lib/task-timeline-view'; -import type { ActiveTask, BackendTask, ChatMessage, FileAttachment, SessionUpdatedEvent, WsEvent } from '@/types'; +import type { ActiveTask, BackendTask, ChatMessage, FileAttachment, Session, SessionUpdatedEvent, WsEvent } from '@/types'; function isSessionUpdatedEvent(data: WsEvent | Record<string, unknown>): data is SessionUpdatedEvent { return data.type === 'session_updated' && typeof data.session_id === 'string'; @@ -149,7 +149,15 @@ export default function ChatPage() { const loadSessions = useCallback(async () => { try { const list = await listSessions(); - useChatStore.getState().setSessions(list); + const store = useChatStore.getState(); + store.setSessions(list); + const currentSessionId = store.sessionId; + const isOrphanedGeneratedSession = + /^[0-9a-f]{32}$/i.test(currentSessionId) && + !list.some((session) => session.key === currentSessionId); + if (isOrphanedGeneratedSession) { + store.setSessionId(list[0]?.key || 'web:default'); + } } catch { // backend may be offline during first render } @@ -576,7 +584,9 @@ export default function ChatPage() { }); }, []); - const formatSessionName = (key: string) => { + const formatSessionName = (key: string, session?: Session) => { + const descriptiveName = session?.title?.trim() || session?.preview?.trim(); + if (descriptiveName) return descriptiveName; if 
(key.startsWith('web:')) { const id = key.slice(4); if (id === 'default') return pickAppText(locale, '默认', 'Default'); @@ -594,7 +604,12 @@ export default function ChatPage() { return key; }; - const archiveTargetSessionName = archiveTargetSessionId ? formatSessionName(archiveTargetSessionId) : ''; + const archiveTargetSessionName = archiveTargetSessionId + ? formatSessionName( + archiveTargetSessionId, + sessions.find((session) => session.key === archiveTargetSessionId) + ) + : ''; const renderSessionSidebar = (variant: 'desktop' | 'drawer') => ( <> @@ -618,7 +633,7 @@ export default function ChatPage() { <p className="px-3 py-4 text-sm text-muted-foreground">{pickAppText(locale, '暂无对话记录', 'No chat history yet')}</p> )} {sessions.map((session) => { - const sessionName = formatSessionName(session.key); + const sessionName = formatSessionName(session.key, session); const isCurrent = session.key === sessionId; return ( diff --git a/app-instance/frontend/app/(app)/skills/page.tsx b/app-instance/frontend/app/(app)/skills/page.tsx index 160451b..771461f 100644 --- a/app-instance/frontend/app/(app)/skills/page.tsx +++ b/app-instance/frontend/app/(app)/skills/page.tsx @@ -130,6 +130,16 @@ export default function SkillsPage() { void load(); }, [load]); + useEffect(() => { + if (!drafts.some((draft) => draft.eval_status === 'pending')) return; + const timer = window.setInterval(() => { + void listSkillDrafts() + .then((items) => setDrafts(Array.isArray(items) ? 
items : [])) + .catch(() => null); + }, 5000); + return () => window.clearInterval(timer); + }, [drafts]); + useEffect(() => { setActiveTab(normalizeSkillsTab(searchParams?.get('tab'))); }, [searchParams]); @@ -825,7 +835,8 @@ function DraftCard({ safety?.suggested_fix, ].filter(Boolean).join('\n'); const safetyBlocksReview = Boolean(safety && (!safety.passed || safety.risk_level === 'critical')); - const submitBlocked = draft.status !== 'draft' || safetyBlocksReview; + const canRetryEval = draft.status === 'in_review' && draft.eval_status === 'failed'; + const submitBlocked = (draft.status !== 'draft' && !canRetryEval) || safetyBlocksReview; const rejectBlocked = !REJECTABLE_DRAFT_STATUSES.has(draft.status); const canPublishLabel = publishBlocked ? publishBlockReason(draft, t) @@ -912,7 +923,7 @@ function DraftCard({ <div className="flex flex-wrap gap-2"> <Button variant="outline" size="sm" className="h-11" disabled={busy || submitBlocked} onClick={() => void onSubmit()}> <Send className="mr-2 h-4 w-4" /> - {t('送审', 'Submit')} + {canRetryEval ? 
t('重试评估', 'Retry eval') : t('送审', 'Submit')} </Button> <Button variant="outline" size="sm" className="h-11" disabled={busy || rejectBlocked} onClick={() => void onReject()}> <XCircle className="mr-2 h-4 w-4" /> @@ -988,7 +999,12 @@ function DraftCard({ <div className="mt-3 grid min-w-0 gap-3 md:grid-cols-2"> <SafetyReportPanel report={safety} /> - <EvalReportPanel report={evalReport} /> + <EvalReportPanel + report={evalReport} + status={draft.eval_status} + error={draft.eval_error} + progress={draft.eval_progress} + /> </div> </div> ); @@ -1111,10 +1127,55 @@ function lineDiffSummary(baseContent: string, proposedContent: string): { added: return { added, removed, changed }; } -function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) { +function EvalReportPanel({ + report, + status, + error, + progress, +}: { + report?: SkillDraftEvalReport | null; + status?: SkillDraft['eval_status']; + error?: string | null; + progress?: SkillDraft['eval_progress']; +}) { const { locale } = useAppI18n(); const t = (zh: string, en: string) => pickAppText(locale, zh, en); if (!report) { + if (status === 'pending') { + const completedArms = Math.max(0, Number(progress?.completed_arms || 0)); + const totalArms = Math.max(0, Number(progress?.total_arms || 0)); + const progressText = totalArms > 0 + ? 
t( + `评估正在后台运行:已完成 ${completedArms}/${totalArms} 次回放(共 ${progress?.total_cases || 10} 个案例,每个案例包含 baseline 和 candidate)。`, + `Evaluation is running: ${completedArms}/${totalArms} replays completed (${progress?.total_cases || 10} cases, each with baseline and candidate).` + ) + : t('评估正在准备案例,完成后会自动更新。', 'Evaluation cases are being prepared and will update automatically.'); + return ( + <ReadablePanel + icon={<Loader2 className="h-4 w-4 animate-spin" />} + title={t('评估报告', 'Eval report')} + empty={progressText} + /> + ); + } + if (status === 'failed') { + return ( + <ReadablePanel + icon={<BarChart3 className="h-4 w-4 text-destructive" />} + title={t('评估报告', 'Eval report')} + empty={`${t('评估失败,可再次点击送审重试。', 'Evaluation failed. Submit again to retry.')} ${error || ''}`.trim()} + /> + ); + } + if (status === 'not_applicable') { + return ( + <ReadablePanel + icon={<BarChart3 className="h-4 w-4" />} + title={t('评估报告', 'Eval report')} + empty={t('该草稿没有关联学习候选,不运行 replay eval。', 'This draft has no linked learning candidate, so replay eval does not run.')} + /> + ); + } return ( <ReadablePanel icon={<BarChart3 className="h-4 w-4" />} diff --git a/app-instance/frontend/lib/api.ts b/app-instance/frontend/lib/api.ts index a284562..dc78b0d 100644 --- a/app-instance/frontend/lib/api.ts +++ b/app-instance/frontend/lib/api.ts @@ -60,7 +60,7 @@ const ACCESS_TOKEN_KEY = 'beaver_access_token'; const REFRESH_TOKEN_KEY = 'beaver_refresh_token'; export const AUTH_CLEARED_EVENT = 'beaver-auth-cleared'; const REQUEST_TIMEOUT_MS = 8000; -const OUTLOOK_REQUEST_TIMEOUT_MS = 45000; +const OUTLOOK_REQUEST_TIMEOUT_MS = 360000; const SKILL_LEARNING_REQUEST_TIMEOUT_MS = 120000; export type PromptLocale = 'zh-Hans' | 'zh-Hant' | 'en'; @@ -902,10 +902,11 @@ export async function submitSkillDraft( skillName: string, draftId: string, notes: string = '' -): Promise<SkillReviewRecord> { +): Promise<SkillDraft> { return 
fetchJSON(`/api/skills/${encodeURIComponent(skillName)}/drafts/${encodeURIComponent(draftId)}/submit`, { method: 'POST', body: JSON.stringify({ notes }), + timeoutMs: SKILL_LEARNING_REQUEST_TIMEOUT_MS, }); } diff --git a/app-instance/frontend/lib/notification-runtime.test.ts b/app-instance/frontend/lib/notification-runtime.test.ts new file mode 100644 index 0000000..db0f8ce --- /dev/null +++ b/app-instance/frontend/lib/notification-runtime.test.ts @@ -0,0 +1,28 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import { + NOTIFICATION_REFRESH_INTERVAL_MS, + scheduleNotificationRefresh, +} from '@/lib/notification-runtime'; + +describe('notification refresh scheduling', () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it('refreshes notifications periodically until cleanup', async () => { + const refresh = vi.fn(); + const cleanup = scheduleNotificationRefresh(refresh); + + await vi.advanceTimersByTimeAsync(NOTIFICATION_REFRESH_INTERVAL_MS); + expect(refresh).toHaveBeenCalledTimes(1); + + cleanup(); + await vi.advanceTimersByTimeAsync(NOTIFICATION_REFRESH_INTERVAL_MS); + expect(refresh).toHaveBeenCalledTimes(1); + }); +}); diff --git a/app-instance/frontend/lib/notification-runtime.ts b/app-instance/frontend/lib/notification-runtime.ts new file mode 100644 index 0000000..bbbdbb0 --- /dev/null +++ b/app-instance/frontend/lib/notification-runtime.ts @@ -0,0 +1,12 @@ +export const NOTIFICATION_REFRESH_INTERVAL_MS = 5_000; + +export function scheduleNotificationRefresh( + refresh: () => void | Promise<void>, + intervalMs = NOTIFICATION_REFRESH_INTERVAL_MS, +): () => void { + const timer = setInterval(() => { + void refresh(); + }, intervalMs); + + return () => clearInterval(timer); +} diff --git a/app-instance/frontend/lib/outlook-counts-visibility.test.ts b/app-instance/frontend/lib/outlook-counts-visibility.test.ts new file mode 100644 index 0000000..04b012c --- /dev/null +++ 
b/app-instance/frontend/lib/outlook-counts-visibility.test.ts @@ -0,0 +1,16 @@ +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; + +import { describe, expect, it } from 'vitest'; + +describe('Outlook count presentation', () => { + it('does not render summary count chips or tab count labels', () => { + const source = readFileSync( + resolve(process.cwd(), 'app/(app)/outlook/page.tsx'), + 'utf8', + ); + + expect(source).not.toContain('<TopStat'); + expect(source).not.toContain('view.count'); + }); +}); diff --git a/app-instance/frontend/lib/outlook-page-state.test.ts b/app-instance/frontend/lib/outlook-page-state.test.ts new file mode 100644 index 0000000..9519d75 --- /dev/null +++ b/app-instance/frontend/lib/outlook-page-state.test.ts @@ -0,0 +1,29 @@ +import { describe, expect, it } from 'vitest'; + +import { nextOutlookAutoLoadTarget } from '@/lib/outlook-page-state'; + +describe('nextOutlookAutoLoadTarget', () => { + it('loads the active mailbox once when it has not been attempted', () => { + expect( + nextOutlookAutoLoadTarget({ + isConfigured: true, + activeView: 'inbox', + loaded: { inbox: false, sent: false, calendar: false }, + loading: { inbox: false, sent: false, calendar: false }, + attempted: { inbox: false, sent: false, calendar: false }, + }) + ).toBe('inbox'); + }); + + it('does not auto-retry the same mailbox after a failed attempt', () => { + expect( + nextOutlookAutoLoadTarget({ + isConfigured: true, + activeView: 'inbox', + loaded: { inbox: false, sent: false, calendar: false }, + loading: { inbox: false, sent: false, calendar: false }, + attempted: { inbox: true, sent: false, calendar: false }, + }) + ).toBeNull(); + }); +}); diff --git a/app-instance/frontend/lib/outlook-page-state.ts b/app-instance/frontend/lib/outlook-page-state.ts new file mode 100644 index 0000000..5b98540 --- /dev/null +++ b/app-instance/frontend/lib/outlook-page-state.ts @@ -0,0 +1,20 @@ +export type OutlookAutoLoadView = 'inbox' | 'sent' | 
'calendar'; + +export interface OutlookAutoLoadState { + isConfigured: boolean; + activeView: OutlookAutoLoadView | 'settings'; + loaded: Record<OutlookAutoLoadView, boolean>; + loading: Record<OutlookAutoLoadView, boolean>; + attempted: Record<OutlookAutoLoadView, boolean>; +} + +export function nextOutlookAutoLoadTarget(state: OutlookAutoLoadState): OutlookAutoLoadView | null { + if (!state.isConfigured || state.activeView === 'settings') { + return null; + } + const view = state.activeView; + if (state.loaded[view] || state.loading[view] || state.attempted[view]) { + return null; + } + return view; +} diff --git a/app-instance/frontend/types/index.ts b/app-instance/frontend/types/index.ts index 471d3a1..1844689 100644 --- a/app-instance/frontend/types/index.ts +++ b/app-instance/frontend/types/index.ts @@ -63,6 +63,9 @@ export interface Session { created_at?: string; updated_at?: string; path?: string; + source?: string | null; + title?: string | null; + preview?: string | null; } export interface SessionDetail { @@ -1028,6 +1031,15 @@ export interface SkillDraft { reviews?: SkillReviewRecord[]; safety_report?: SkillDraftSafetyReport | null; eval_report?: SkillDraftEvalReport | null; + eval_status?: 'not_started' | 'not_applicable' | 'pending' | 'failed' | 'completed' | 'skipped_provider_unavailable'; + eval_error?: string | null; + eval_progress?: { + phase?: 'preparing' | 'replaying' | 'completed' | 'failed'; + completed_arms?: number; + total_arms?: number; + completed_cases?: number; + total_cases?: number; + } | null; } export interface SkillReviewRecord { diff --git a/docs/presentations/beaver-management-demo/beaver-management-demo.md b/docs/presentations/beaver-management-demo/beaver-management-demo.md new file mode 100644 index 0000000..313148d --- /dev/null +++ b/docs/presentations/beaver-management-demo/beaver-management-demo.md @@ -0,0 +1,435 @@ +# Beaver 管理层演示方案 + +对象:公司管理层 +时长:60 分钟 +目标:让老板看懂 Beaver 是什么、现在已经能做什么、可以用在公司哪些地方,以及为什么值得继续投入。 + +## 一句话定位 + 
+Beaver 不是一个聊天机器人,而是一个企业内部 Agent 工作台:它能执行任务、使用文件和工具、保留过程证据、等待人工验收,并把成功的工作方式沉淀成可复用的企业技能。 + +## 演示主线 + +不要按页面逐个介绍,而是讲一个业务故事: + +> 假设这是公司里普通的一天:老板需要经营晨报,产品团队需要从客户反馈里判断优先级,项目团队需要提前识别风险,团队还要准备管理层汇报、沉淀可复用方法,并让周期性工作自动运行。Beaver 就是承载这些 Agent 工作的地方。 + +## 60 分钟流程 + +| 时间 | 环节 | 目的 | +| --- | --- | --- | +| 0-5 分钟 | 开场 | 定义 Beaver 是 Agent 工作系统,不是聊天产品 | +| 5-12 分钟 | 场景 1:老板晨报 | 展示多信息源汇总和管理层摘要 | +| 12-20 分钟 | 场景 2:客户反馈到产品决策 | 展示从杂乱反馈中提炼业务判断 | +| 20-28 分钟 | 场景 3:项目风险与行动计划 | 展示风险识别和管理层决策支持 | +| 28-38 分钟 | 场景 4:复杂任务与可追踪执行 | 展示聊天转任务、过程、修订和验收 | +| 38-48 分钟 | 场景 5:企业技能复用 | 展示 Beaver 的长期复利价值 | +| 48-55 分钟 | 场景 6:定时任务与治理 | 展示主动执行、状态、日志和控制能力 | +| 55-60 分钟 | 收尾讨论 | 讨论 Beaver 下一步适合在哪些内部场景试点 | + +## 需要提前上传的文件 + +文件目录: + +```text +docs/presentations/beaver-management-demo/upload-files/ +``` + +建议上传顺序: + +1. `sales-weekly.csv` +2. `project-risks.md` +3. `customer-feedback-q2.md` +4. `meeting-notes.md` +5. `project-status.md` +6. `support-tickets.csv` +7. `weekly-ops-metrics.csv` + +## 开场话术 + +可以这样开场: + +> 今天不把 Beaver 当成聊天机器人演示。我们把它当成一个企业内部 Agent 工作台来看:员工可以把真实工作交给 Beaver,Beaver 可以使用文件和工具,生成可交付结果,留下执行过程,等待人来验收或要求修改。如果这个工作以后会重复,Beaver 还可以把被认可的方法沉淀成可复用技能。 + +然后补充业务背景: + +- 聊天工具能回答问题,但企业工作需要可交付结果。 +- 管理层需要过程证据,而不是只有一段看起来流畅的文字。 +- 企业落地 AI 需要私有部署、边界、权限和运维控制。 +- 重复发生的工作应该沉淀成组织能力,而不是每个人反复写提示词。 + +## 场景 1:老板晨报 + +### 业务问题 + +老板每天不想手动看销售表、项目记录、客户反馈和会议纪要,只想快速知道今天最重要的经营判断和需要拍板的事项。 + +### 演示目标 + +展示 Beaver 可以把分散的内部信息整理成管理层能直接看的经营晨报,并标注信息来源。 + +### 使用文件 + +- `sales-weekly.csv` +- `project-risks.md` +- `customer-feedback-q2.md` +- `meeting-notes.md` +- `weekly-ops-metrics.csv` + +### 提示词 + +```text +请基于我上传的文件,生成一份给 CEO 的今日经营晨报。 + +要求: +1. 用管理层语言,不要技术细节 +2. 分为:关键结论、风险预警、需要老板决策的事项、建议行动 +3. 每个关键结论都标注来自哪个文件 +4. 最后给出今天最重要的 3 件事 +5. 控制在 800 字以内 +``` + +### 演示步骤 + +1. 打开 Beaver 聊天工作台。 +2. 到 `Files` 页面快速展示已经上传的文件。 +3. 回到聊天页,发送提示词。 +4. 打开生成的任务或任务详情页。 +5. 展示结果、时间线,以及文件/工具相关证据。 +6. 现场要求修改: + +```text +把这份晨报改成更适合 10 分钟管理层晨会使用的版本,只保留最关键的判断和行动。 +``` + +7. 
展示修订结果,并点击接受。 + +### 讲解话术 + +> 这里重点不是 Beaver 写了一份摘要,而是这件事已经变成了一项可追踪任务:有原始材料、有执行过程、有结果、有修订、有人工验收。这比一个普通聊天回答更接近真实工作。 + +### 老板视角价值 + +- 减少阅读分散信息的时间。 +- 把多个信息源整理成决策导向的简报。 +- 过程和来源可查看,方便追问和复核。 + +### 翻车预案 + +如果现场生成较慢,就先展示上传文件和预期输出结构,然后打开提前跑好的任务或聊天历史。 + +## 场景 2:客户反馈到产品决策 + +### 业务问题 + +客户反馈通常很杂:销售记录、客服工单、访谈纪要里都有不同声音。管理层真正关心的是哪些问题影响收入、续约和试点成功,哪些可以后排。 + +### 演示目标 + +展示 Beaver 能从非结构化反馈中提炼主题、判断优先级,并形成产品投入建议。 + +### 使用文件 + +- `customer-feedback-q2.md` +- `support-tickets.csv` + +### 提示词 + +```text +请分析这些客户反馈和支持工单,输出一份产品决策建议。 + +要求: +1. 聚类出 5 类主要问题 +2. 判断每类问题的业务影响 +3. 给出优先级:P0 / P1 / P2 +4. 区分“必须马上做”和“可以进入路线图” +5. 给老板一个 90 天产品投入建议 +6. 最后列出还需要进一步验证的假设 +``` + +### 演示步骤 + +1. 打开 `Files`,展示 `customer-feedback-q2.md` 和 `support-tickets.csv`。 +2. 回到聊天页发起分析任务。 +3. 展示输出结构:主题聚类、优先级、业务影响、90 天建议。 +4. 要求 Beaver 改写成一页管理层备忘录: + +```text +请把这个结果改成一页管理层备忘录,重点突出投入产出比和不做的风险。 +``` + +### 讲解话术 + +> 这个场景说明 Beaver 对管理层的价值不只是写文案,而是把大量不规整的信息转成可以讨论和决策的材料。 + +### 老板视角价值 + +- 更快从客户噪声里抓住信号。 +- 让产品优先级讨论更有依据。 +- 把产品投入和业务影响连接起来。 + +### 翻车预案 + +如果输出太长,就直接追问: + +```text +请压缩成老板只需要看 5 分钟的一页摘要。 +``` + +## 场景 3:项目风险与行动计划 + +### 业务问题 + +项目延期通常不是突然发生的,早期信号可能已经出现在会议纪要、状态周报、风险记录里,例如验收标准不清、依赖延期、资源不足、审批阻塞。 + +### 演示目标 + +展示 Beaver 可以作为 PMO 助手,提前识别项目风险,并给出管理层应该介入的事项。 + +### 使用文件 + +- `project-status.md` +- `project-risks.md` +- `meeting-notes.md` + +### 提示词 + +```text +你现在是项目管理办公室 PMO。 +请基于这些项目材料,判断哪些风险可能导致延期。 + +输出: +1. 风险清单 +2. 每个风险的影响、概率、责任人建议 +3. 本周必须推进的行动项 +4. 哪些事项需要管理层介入 +5. 一份可以发给项目负责人的跟进邮件 +``` + +### 演示步骤 + +1. 在聊天页发送 PMO 提示词。 +2. 展示 Beaver 生成的风险矩阵和行动项。 +3. 打开任务详情页,说明过程证据。 +4. 
追问一个管理层问题: + +```text +如果老板今天只能拍板 2 件事,应该是哪 2 件?请说明原因和不拍板的后果。 +``` + +### 讲解话术 + +> Beaver 适合处理这种需要判断、需要留下结果、还需要人来审核的工作。这里它把项目材料转成了风险清单、决策清单和跟进邮件。 + +### 老板视角价值 + +- 更早发现项目风险。 +- 明确责任人和行动项。 +- 提高向上升级问题的质量。 + +### 翻车预案 + +如果 Beaver 漏掉某个风险,不要回避,可以把它变成修订演示: + +```text +你漏掉了“验收标准变化”这个风险,请重新评估它的影响,并更新行动计划。 +``` + +## 场景 4:复杂任务与可追踪执行 + +### 业务问题 + +真实企业工作不是一个问题一个答案,而是需要拆解、分析、起草、审核和修改。 + +### 演示目标 + +展示 Beaver 和普通聊天工具的核心区别:复杂请求可以变成可管理的任务,而不是一次性聊天回复。 + +### 使用文件 + +这个场景可以复用前面文件,也可以不依赖文件。 + +### 提示词 + +```text +请帮我为 Beaver 准备一份给公司老板看的项目汇报框架。 + +目标是说明: +1. Beaver 是什么 +2. 现在已经能做什么 +3. 可以用在哪些企业场景 +4. 为什么值得继续投入 +5. 下一阶段建议做什么 + +请先拆解任务,再生成最终汇报大纲。少讲技术,多讲业务价值、风险控制和投入产出。 +``` + +### 演示步骤 + +1. 在聊天页发送提示词。 +2. 展示 Beaver 如何从对话进入任务执行。 +3. 打开任务详情页。 +4. 展示时间线、中间步骤、最终结果和验收控件。 +5. 要求修改: + +```text +把这个汇报框架改得更像董事会材料:每一部分都要回答“为什么重要、现在有什么进展、下一步要什么资源”。 +``` + +6. 展示修订后的结果,并点击接受。 + +### 讲解话术 + +> Beaver 的核心产品想法是让 AI 工作可检查。对管理层来说,重要的是能看到问了什么、做出了什么、怎么修改过、什么时候被人接受。 + +### 老板视角价值 + +- 把模糊需求转成结构化工作。 +- 支持带上下文的连续修订。 +- 让 AI 工作具备内部使用所需的可审查性。 + +### 翻车预案 + +如果任务模式没有明显触发,就继续在聊天里演示,然后打开 `Tasks` 页面展示历史任务记录。 + +## 场景 5:企业技能复用 + +### 业务问题 + +企业里很多好方法会反复使用:周报、风险复盘、客户反馈分析、项目更新、事故总结。普通 AI 聊天每次都要重新教,经验无法自然沉淀。 + +### 演示目标 + +展示 Beaver 可以把成功工作保留下来,形成可复用技能,从而产生长期组织能力。 + +### 使用文件 + +复用前面场景的输出即可,不需要新增上传文件。 + +### 演示步骤 + +1. 打开 `Skills` 页面。 +2. 展示已发布技能,例如文件操作、搜索、Outlook、定时任务、终端、技能编写等。 +3. 解释技能生命周期: + - 已接受任务 + - 技能候选 + - 草稿生成 + - 安全检查和 replay 评测 + - 人工审核 + - 发布 + - 后续任务复用 +4. 如果页面展示评测覆盖率或报告,顺手点出来。 +5. 
回到聊天页,发起一个类似任务: + +```text +请按刚才的管理层汇报风格,再生成一版项目周报。保留同样的结构:关键结论、风险、需要老板决策的事项、下一步行动。 +``` + +### 讲解话术 + +> 这是 Beaver 的复利价值。第一次运行得到一个结果;一次被接受的成功工作,可以变成可复用的方法。时间久了,公司积累的是自己的 Agent 能力库,而不是每个人自己的提示词经验。 + +### 老板视角价值 + +- 减少重复说明。 +- 沉淀公司自己的工作方法。 +- 在广泛复用前保留审核和治理环节。 + +### 翻车预案 + +如果现场完整技能生成流程不够稳,不要强行演示。展示 `Skills` 页面和生命周期即可,把它作为可治理能力说明。 + +## 场景 6:定时任务与治理 + +### 业务问题 + +很多管理动作应该周期性发生,而不是靠人每天想起来:日报、周报、风险检查、客户反馈汇总、项目提醒。 + +### 演示目标 + +展示 Beaver 可以从被动聊天变成主动运营,并且管理员可以看到状态和日志。 + +### 使用文件 + +- `sales-weekly.csv` +- `project-risks.md` +- `customer-feedback-q2.md` +- `weekly-ops-metrics.csv` + +### 演示步骤 + +1. 打开 `Cron` 页面。 +2. 新建或展示一个定时任务: + +```text +每天上午 9 点生成经营晨报,汇总销售、项目风险、客户反馈和运营指标。 +``` + +3. 展示启停、运行记录,或手动触发一次。 +4. 如果已有结果,打开 `Notifications` 展示定时运行产物。 +5. 打开 `Status` 和 `Logs`。 +6. 说明管理员可以查看 provider 配置、运行状态、连接器状态和失败记录。 + +### 讲解话术 + +> 这一步说明 Beaver 可以从助手变成运营系统:周期性 Agent 工作可以被配置、监控和审核。 + +### 老板视角价值 + +- 让重复工作主动发生。 +- 管理员能看到运行状态。 +- 有失败记录和配置入口,企业落地更可控。 + +### 翻车预案 + +如果现场没有可用的定时运行结果,就只演示创建配置,并说明生成结果会进入任务或通知。 + +## 收尾话术 + +可以这样收尾: + +> Beaver 当前最适合先在三类内部场景试点。第一,管理层信息汇总,比如晨报、周报和项目汇报。第二,围绕客户、产品、运营、项目的重复分析工作。第三,需要证据、审核和人工验收的 AI 任务。它的战略价值不是替代某个人,而是把 AI 从临时问答变成可控制、可复用、可治理的工作系统。 + +## 推荐试点场景 + +先选 2-3 个窄场景,不要一开始铺太大。 + +| 试点工作流 | 为什么适合 Beaver | 成功信号 | +| --- | --- | --- | +| CEO 或部门周报 | 多文件输入,需要简洁管理层输出 | 一轮以内修订后可接受 | +| 客户反馈分析 | 输入混乱,但输出能支持决策 | 产品负责人把结果用于优先级会议 | +| 项目风险评审 | 需要证据和管理层行动 | 风险在升级会议前被识别 | +| 每周支持工单总结 | 高频重复,适合技能复用 | 同一技能连续复用 3 周 | +| 内部事故复盘 | 需要时间线、证据和后续行动 | 审核人能从 Beaver 输出理解事件经过 | + +## 演示前检查清单 + +演示前: + +- 确认 Beaver 实例能登录。 +- 确认 provider/model 配置可用。 +- 上传 `upload-files/` 里的所有文件。 +- 提前跑一遍场景 1,并保留结果。 +- 提前跑一遍场景 4,并保留任务详情页。 +- 提前打开这些页面:Chat、Files、Tasks、Skills、Cron、Status、Logs。 +- 准备一份提示词备份,本 Markdown 可以直接作为备份。 + +演示中: + +- 不要解释每一个页面。 +- 反复回到同一个主线:任务、证据、验收、复用、治理。 +- 如果现场生成慢,切到提前跑好的历史任务。 +- 如果输出不完美,就用它演示修订和人工验收。 + +## 可放进 PPT 的一页总结 + +```text +Beaver = 企业 Agent 工作台 + +1. 执行真实工作,不只是聊天 +2. 使用文件、工具、任务和连接器 +3. 保留过程证据,方便审核 +4. 通过人工验收保证可信输出 +5. 把成功工作沉淀成可复用技能 +6. 
支持私有部署和运维治理 +``` diff --git a/docs/presentations/beaver-management-demo/upload-files/README.md b/docs/presentations/beaver-management-demo/upload-files/README.md new file mode 100644 index 0000000..4c924e4 --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/README.md @@ -0,0 +1,24 @@ +# Beaver 管理层演示上传文件 + +这些文件是 Beaver 管理层演示用的样例业务输入。 + +演示前建议全部上传到 Beaver: + +1. `sales-weekly.csv` +2. `project-risks.md` +3. `customer-feedback-q2.md` +4. `meeting-notes.md` +5. `project-status.md` +6. `support-tickets.csv` +7. `weekly-ops-metrics.csv` + +建议场景映射: + +| 场景 | 文件 | +| --- | --- | +| 老板晨报 | `sales-weekly.csv`, `project-risks.md`, `customer-feedback-q2.md`, `meeting-notes.md`, `weekly-ops-metrics.csv` | +| 客户反馈分析 | `customer-feedback-q2.md`, `support-tickets.csv` | +| 项目风险评审 | `project-status.md`, `project-risks.md`, `meeting-notes.md` | +| 定时经营汇总 | `sales-weekly.csv`, `project-risks.md`, `customer-feedback-q2.md`, `weekly-ops-metrics.csv` | + +文件内容是虚构数据,但按照真实管理层演示场景设计,方便现场上传和测试。 diff --git a/docs/presentations/beaver-management-demo/upload-files/customer-feedback-q2.md b/docs/presentations/beaver-management-demo/upload-files/customer-feedback-q2.md new file mode 100644 index 0000000..4c7c2f0 --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/customer-feedback-q2.md @@ -0,0 +1,37 @@ +# Q2 Customer Feedback + +Source: sales calls, support notes, product interviews, and pilot discussions +Period: 2026 Q2 + +## Feedback Items + +1. "The AI answer is useful, but I do not know what source material it used." +2. "Our compliance team needs to see a trace of tool calls and file access before approving a pilot." +3. "The demo is strong when it turns a request into a task. Please make that the first thing users see." +4. "We want daily and weekly reports to run automatically, not only when someone asks in chat." +5. "The Outlook connector would be valuable if it can summarize customer emails and draft replies." +6. 
"We do not want every employee pasting company data into public SaaS tools." +7. "The Files page is useful, but users need clearer examples of what to upload." +8. "The task detail page helps reviewers understand what happened." +9. "The Skills concept is important. It means our team's best working methods can be reused." +10. "Skill publishing should require human approval. We do not want low-quality automations spreading." +11. "The interface has many pages. New users need a guided first workflow." +12. "Management will ask how this is different from ChatGPT Team or Copilot." +13. "The strongest value is repeatable knowledge work: weekly reports, customer feedback summaries, project risk reviews." +14. "We need a clear admin story: status, logs, provider configuration, connector health." +15. "Some users asked whether Beaver can run terminal commands. Security wants policy controls around that." +16. "The first pilot should avoid too many external integrations." +17. "We need to measure accepted tasks, revision rounds, and time saved." +18. "The model sometimes gives too much detail. Executive summaries should be shorter." +19. "Private deployment and per-user instance boundaries are important for enterprise buyers." +20. "The demo should show a failed or revised answer, because review is part of real work." 
+ +## Raw Themes Observed + +- Trust and auditability +- Task lifecycle beyond chat +- Reusable skills and method capture +- Scheduled recurring work +- Private deployment and admin control +- Connector demand, especially email +- Need for simpler onboarding and clearer demo story diff --git a/docs/presentations/beaver-management-demo/upload-files/meeting-notes.md b/docs/presentations/beaver-management-demo/upload-files/meeting-notes.md new file mode 100644 index 0000000..c5419f4 --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/meeting-notes.md @@ -0,0 +1,39 @@ +# Management Prep Meeting Notes + +Date: 2026-06-11 +Participants: Product, Engineering, Operations, Sales + +## Purpose + +Prepare a leadership demo that explains what Beaver is, what progress has been made, and what use cases are realistic for the company. + +## Discussion + +Product team recommended avoiding a page-by-page product tour. Leadership should see how Beaver supports real business work: summarize information, create a task, show evidence, revise output, accept result, and reuse the method. + +Engineering confirmed that the current system can show login, files, chat workspace, task records, task detail, skills, cron, status, and logs. The most stable story is the core loop: chat-to-task, evidence, revision, acceptance, and skill reuse explanation. + +Operations noted that management will care about governance. The demo should mention private deployment, instance boundaries, model provider configuration, connector configuration, status, and logs. The team should avoid overpromising fully autonomous actions. + +Sales said the clearest executive scenarios are: + +- CEO morning brief +- Customer feedback analysis +- Project risk review +- Weekly support summary +- AI task governance and evidence + +## Decisions + +1. Use a 60-minute demo format. +2. Target company leadership, not external customers. +3. Start with business outcomes, then show product capabilities. +4. 
Use realistic but fictional sample files. +5. Keep Outlook and external connector demo optional. +6. Prepare backup outputs in case live model generation is slow. + +## Open Questions + +1. Which internal workflow should become the first pilot? +2. What metric should be used to evaluate Beaver: time saved, accepted tasks, quality, or risk reduction? +3. Should the next milestone focus on polish, connector hardening, or skill lifecycle? diff --git a/docs/presentations/beaver-management-demo/upload-files/project-risks.md b/docs/presentations/beaver-management-demo/upload-files/project-risks.md new file mode 100644 index 0000000..b11e50c --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/project-risks.md @@ -0,0 +1,57 @@ +# Project Risk Notes + +Date: 2026-06-12 +Owner: PMO + +## Executive Summary + +The Beaver internal demo project is on track for a management review next week, but several risks require attention. The core product loop is demoable: login, files, chat-to-task, task detail, evidence, revision, acceptance, skills, cron, status, and logs. The main risks are demo stability, connector maturity, and clarity of business story. + +## Risks + +### R1: Demo scope is too broad + +- Impact: High +- Probability: Medium +- Signal: The product has many pages: chat, files, tasks, skills, marketplace, agents, MCP, cron, connectors, status, logs. +- Concern: If the demo becomes a feature tour, leadership may not understand the main business value. +- Suggested response: Use one storyline and only show pages that support it. + +### R2: Connector demo may be unstable + +- Impact: Medium +- Probability: Medium +- Signal: Outlook and external connector paths exist, but live external dependency can fail. +- Concern: A connector failure could distract from the core Agent workspace story. +- Suggested response: Treat connectors as optional. Demo configuration and explain target workflow if live connector is not stable. 
+ +### R3: Skill learning flow may be too long for live presentation + +- Impact: Medium +- Probability: High +- Signal: Skill candidate, draft, safety, replay evaluation, review, and publish are powerful but require time. +- Concern: Waiting for background learning may break the demo rhythm. +- Suggested response: Show Skills page, explain lifecycle, and use pre-created examples. + +### R4: Leadership may ask for ROI + +- Impact: High +- Probability: High +- Signal: Management audience cares about adoption, risk, and next investment. +- Concern: Technical progress alone will not answer "why continue?" +- Suggested response: Position first pilots around repeated knowledge work, measurable accepted tasks, revision rounds, and time saved. + +### R5: Model output quality can vary + +- Impact: Medium +- Probability: Medium +- Signal: Live model generation may be verbose, miss details, or produce uneven structure. +- Concern: Output quality variance may look like product instability. +- Suggested response: Use revision as part of the story: Beaver supports feedback, continuation, and acceptance. + +## Management Decisions Needed + +1. Confirm the first 2-3 internal pilot workflows. +2. Decide whether the next milestone optimizes for demo polish or pilot readiness. +3. Pick one connector to harden first, preferably the one with the clearest business value. +4. Define what evidence is required before a task can be considered accepted. 
diff --git a/docs/presentations/beaver-management-demo/upload-files/project-status.md b/docs/presentations/beaver-management-demo/upload-files/project-status.md new file mode 100644 index 0000000..c5a03df --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/project-status.md @@ -0,0 +1,77 @@ +# Project Status: Beaver Leadership Demo + +Date: 2026-06-12 +Project owner: Product and Engineering +Target review: Next week + +## Overall Status + +Status: Yellow + +The core Beaver demonstration is feasible, but the team needs to tighten the story and prepare backup paths. The product has enough implemented surfaces to explain the Agent workspace concept: files, chat, tasks, evidence, acceptance, skills, cron, status, and logs. + +## Workstreams + +### 1. Product Story + +- Status: Yellow +- Owner: Product +- Progress: Drafted 6 management scenarios. +- Risk: If the story is too technical, leadership may see Beaver as another chatbot or internal tool experiment. +- Next action: Rehearse the opening and closing talk tracks. + +### 2. Demo Environment + +- Status: Yellow +- Owner: Engineering +- Progress: Local instance is available. Provider configuration is being checked. +- Risk: Live model response can be slow or verbose. +- Next action: Run the main scenarios once and keep completed tasks available. + +### 3. Sample Data + +- Status: Green +- Owner: Product +- Progress: Sales, customer feedback, project risk, support, and operations files prepared. +- Risk: Sample data must look realistic without exposing actual company data. +- Next action: Upload all files to Beaver before the demo. + +### 4. Skills Story + +- Status: Yellow +- Owner: Engineering +- Progress: Skills page and lifecycle exist. Replay evaluation and review flow can be explained. +- Risk: Full candidate-to-publish flow may take too long live. +- Next action: Use page walkthrough and a short reuse example. + +### 5. 
Scheduled Work + +- Status: Yellow +- Owner: Engineering +- Progress: Cron page can show scheduled task configuration. +- Risk: A live scheduled run may not complete within the meeting. +- Next action: Use manual trigger or show configuration and run records. + +### 6. Governance + +- Status: Green +- Owner: Operations +- Progress: Status and logs can support the governance message. +- Risk: Leadership may ask about security policy details that are not finalized. +- Next action: Keep the message clear: private deployment, task evidence, human acceptance, and controlled tool rollout. + +## Key Risks + +| Risk | Impact | Probability | Owner | Mitigation | +| --- | --- | --- | --- | --- | +| Demo becomes feature tour | High | Medium | Product | Use one storyline and 6 scenarios | +| Live output quality varies | Medium | Medium | Engineering | Prepare previous completed tasks | +| Skill flow takes too long | Medium | High | Engineering | Explain lifecycle and show page state | +| Connector dependency fails | Medium | Medium | Engineering | Keep connector optional | +| ROI question lacks answer | High | Medium | Product | Propose 2-3 measurable internal pilots | + +## Management Decisions Requested + +1. Choose the first internal pilot workflow. +2. Decide whether next sprint should prioritize demo polish, pilot hardening, or connector reliability. +3. Confirm what governance controls are required before wider internal rollout. 
diff --git a/docs/presentations/beaver-management-demo/upload-files/sales-weekly.csv b/docs/presentations/beaver-management-demo/upload-files/sales-weekly.csv new file mode 100644 index 0000000..3dd2672 --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/sales-weekly.csv @@ -0,0 +1,9 @@ +week,region,product,new_pipeline_cny,closed_won_cny,forecast_cny,win_rate,top_account,risk_note +2026-W23,North China,Beaver Enterprise,1280000,520000,910000,0.31,Hengyuan Manufacturing,"Procurement asks for private deployment proof before signing" +2026-W23,East China,Beaver Enterprise,1860000,740000,1380000,0.37,Jianghai Finance,"Security review is positive but legal review is still open" +2026-W23,South China,Beaver Team,760000,210000,430000,0.24,Nanfang Retail,"Champion changed team; sales needs executive sponsor" +2026-W23,Overseas,Beaver Enterprise,940000,360000,690000,0.28,Atlas Components,"Customer wants Outlook connector demo before commercial discussion" +2026-W24,North China,Beaver Enterprise,1510000,680000,1050000,0.34,Hengyuan Manufacturing,"Pilot environment requested by June 18" +2026-W24,East China,Beaver Enterprise,2030000,810000,1520000,0.39,Jianghai Finance,"Deal depends on audit trail and task evidence explanation" +2026-W24,South China,Beaver Team,820000,250000,500000,0.25,Nanfang Retail,"Budget owner wants clearer ROI story" +2026-W24,Overseas,Beaver Enterprise,1010000,410000,760000,0.30,Atlas Components,"Connector reliability remains the main objection" diff --git a/docs/presentations/beaver-management-demo/upload-files/support-tickets.csv b/docs/presentations/beaver-management-demo/upload-files/support-tickets.csv new file mode 100644 index 0000000..f73fe35 --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/support-tickets.csv @@ -0,0 +1,11 @@ +ticket_id,date,account,segment,category,severity,summary,status +SUP-1021,2026-05-28,Hengyuan Manufacturing,Enterprise,Deployment,P1,"Customer needs private deployment 
checklist for security review",Open +SUP-1028,2026-05-30,Jianghai Finance,Enterprise,Auditability,P0,"Reviewer asks how task evidence records file usage and tool calls",Open +SUP-1044,2026-06-02,Nanfang Retail,Team,Onboarding,P2,"New users do not know which first workflow to try",In Progress +SUP-1051,2026-06-03,Atlas Components,Enterprise,Connector,P1,"Outlook connector setup requires clearer success and failure status",Open +SUP-1060,2026-06-04,Hengyuan Manufacturing,Enterprise,Skills,P1,"Team wants accepted weekly report workflow to become reusable template",In Progress +SUP-1067,2026-06-05,Jianghai Finance,Enterprise,Governance,P0,"Compliance wants human approval before publishing reusable skills",Open +SUP-1075,2026-06-07,Nanfang Retail,Team,UX,P2,"Task output is too long for department managers",Resolved +SUP-1082,2026-06-08,Atlas Components,Enterprise,Cron,P1,"Customer wants weekly customer email summary to run every Monday",Open +SUP-1090,2026-06-10,Hengyuan Manufacturing,Enterprise,Model Config,P2,"Admin wants clearer provider configuration status",In Progress +SUP-1096,2026-06-11,Jianghai Finance,Enterprise,Security,P0,"Security asks whether terminal tools can be disabled for pilot users",Open diff --git a/docs/presentations/beaver-management-demo/upload-files/weekly-ops-metrics.csv b/docs/presentations/beaver-management-demo/upload-files/weekly-ops-metrics.csv new file mode 100644 index 0000000..06fbdf9 --- /dev/null +++ b/docs/presentations/beaver-management-demo/upload-files/weekly-ops-metrics.csv @@ -0,0 +1,11 @@ +metric,current_week,previous_week,target,status,note +accepted_tasks,42,31,40,Green,"Accepted task count exceeded weekly target" +average_revision_rounds,1.4,1.8,1.5,Green,"Output quality improved after prompt and skill updates" +tasks_with_evidence_percent,88,82,90,Yellow,"Close to target; some simple chat tasks lack useful evidence" +skill_reuse_count,11,6,10,Green,"Weekly report and risk review skills reused by pilot users" 
+failed_tool_runs,7,9,3,Red,"Most failures came from connector timeout and missing credentials" +scheduled_runs_completed,18,12,20,Yellow,"Cron usage is growing but several jobs are still manual" +new_skill_candidates,5,3,4,Green,"Accepted work is generating reusable workflow candidates" +open_p0_support_items,3,2,0,Red,"Auditability and security control questions need management attention" +active_pilot_users,16,12,20,Yellow,"Usage increased but onboarding still depends on guided examples" +average_task_completion_minutes,7.8,9.6,8.0,Green,"Average task completion time is improving" diff --git a/docs/presentations/skill-replay-eval/assets/animations/animations.css b/docs/presentations/beaver-project/assets/animations/animations.css similarity index 100% rename from docs/presentations/skill-replay-eval/assets/animations/animations.css rename to docs/presentations/beaver-project/assets/animations/animations.css diff --git a/docs/presentations/skill-replay-eval/assets/base.css b/docs/presentations/beaver-project/assets/base.css similarity index 100% rename from docs/presentations/skill-replay-eval/assets/base.css rename to docs/presentations/beaver-project/assets/base.css diff --git a/docs/presentations/skill-replay-eval/assets/fonts.css b/docs/presentations/beaver-project/assets/fonts.css similarity index 100% rename from docs/presentations/skill-replay-eval/assets/fonts.css rename to docs/presentations/beaver-project/assets/fonts.css diff --git a/docs/presentations/skill-replay-eval/assets/runtime.js b/docs/presentations/beaver-project/assets/runtime.js similarity index 100% rename from docs/presentations/skill-replay-eval/assets/runtime.js rename to docs/presentations/beaver-project/assets/runtime.js diff --git a/docs/presentations/skill-replay-eval/index.html b/docs/presentations/beaver-project/index.html similarity index 100% rename from docs/presentations/skill-replay-eval/index.html rename to docs/presentations/beaver-project/index.html diff --git 
a/docs/presentations/skill-replay-eval/style.css b/docs/presentations/beaver-project/style.css similarity index 99% rename from docs/presentations/skill-replay-eval/style.css rename to docs/presentations/beaver-project/style.css index e45060c..97ef4d9 100644 --- a/docs/presentations/skill-replay-eval/style.css +++ b/docs/presentations/beaver-project/style.css @@ -1,4 +1,4 @@ -/* Beaver Skill Replay Eval deck, based on html-ppt tech-sharing template. */ +/* Beaver Project deck, based on html-ppt tech-sharing template. */ .replay-root { background: #08111d; } diff --git a/docs/product-discovery/beaver/README.md b/docs/product-discovery/beaver/README.md index 1b4acb2..2e64d67 100644 --- a/docs/product-discovery/beaver/README.md +++ b/docs/product-discovery/beaver/README.md @@ -23,7 +23,7 @@ Beaver is an enterprise Agent sandbox and execution platform. It combines privat - [Backend README](../../../app-instance/backend/README.md) - [Recent Backend Features](../../../projcet_review/backend_recent_completed_features.md) - [UI/UX Page Docs](../../ui-ux/README.md) -- [Customer Presentation](../../presentations/skill-replay-eval/index.html) +- [Customer Presentation](../../presentations/beaver-project/index.html) ## Related Feature Discovery diff --git a/docs/product-discovery/skill-replay-eval/README.md b/docs/product-discovery/skill-replay-eval/README.md index 7dee067..81ef7cb 100644 --- a/docs/product-discovery/skill-replay-eval/README.md +++ b/docs/product-discovery/skill-replay-eval/README.md @@ -10,4 +10,4 @@ Related source material: - [Skill Replay Eval Design](../../superpowers/specs/2026-06-08-skill-replay-eval-design.md) - [Skill Replay Eval Implementation Plan](../../superpowers/plans/2026-06-08-skill-replay-eval.md) -- [Beaver customer presentation](../../presentations/skill-replay-eval/index.html) +- [Beaver customer presentation](../../presentations/beaver-project/index.html) diff --git a/docs/product-discovery/skill-replay-eval/product-discovery-report.md 
b/docs/product-discovery/skill-replay-eval/product-discovery-report.md index 9e03cf2..3e46245 100644 --- a/docs/product-discovery/skill-replay-eval/product-discovery-report.md +++ b/docs/product-discovery/skill-replay-eval/product-discovery-report.md @@ -12,7 +12,7 @@ Source context: - Feature design: `docs/superpowers/specs/2026-06-08-skill-replay-eval-design.md` - Delivery plan: `docs/superpowers/plans/2026-06-08-skill-replay-eval.md` - Current implementation signals: `beaver/skills/learning/{case_selection,preservation,replay,surrogate,eval}.py`, Skills page replay report UI, publish gate checks -- Customer positioning: `docs/presentations/skill-replay-eval/index.html` +- Customer positioning: `docs/presentations/beaver-project/index.html` ## Executive Summary diff --git a/docs/superpowers/plans/2026-06-15-plugin-skill-mirroring.md b/docs/superpowers/plans/2026-06-15-plugin-skill-mirroring.md new file mode 100644 index 0000000..c984448 --- /dev/null +++ b/docs/superpowers/plans/2026-06-15-plugin-skill-mirroring.md @@ -0,0 +1,1758 @@ +# Plugin Skill Mirroring And Upgrade Learning Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add declarative Beaver plugins whose skills are mirrored as normal managed skills, learn normally, and merge plugin upgrades through the existing safety, replay evaluation, review, publish, and rollback lifecycle. + +**Architecture:** A new `beaver.plugins` package discovers and validates `beaver.plugin.json`, computes content and full-tree hashes, persists enable/sync state, and stages immutable upstream/version trees before atomic promotion under a workspace write lock. 
Plugin upgrades become deterministic `plugin_skill_update` learning candidates using old upstream, current local, and new upstream inputs; the existing learning pipeline remains the only path for update publication, with sync-time reconciliation repairing failed state acknowledgements. + +**Tech Stack:** Python dataclasses and file-backed JSON stores, existing `SkillSpecStore` and skill-learning pipeline, FastAPI, pytest, Next.js/TypeScript, existing shadcn UI components. + +--- + +## Scope + +This plan implements declarative skill plugins only. Do not add Python plugin entrypoints, +hooks, providers, channels, dependency installation, or marketplace download support. +Plugin-provided tools continue to use MCP. + +## File Structure + +Create focused plugin modules: + +- `app-instance/backend/beaver/plugins/models.py`: manifest, discovery, state, and sync result dataclasses. +- `app-instance/backend/beaver/plugins/manifest.py`: JSON parsing, identifier validation, and contained-path validation. +- `app-instance/backend/beaver/plugins/hashing.py`: canonical skill-content and full-tree hashing. +- `app-instance/backend/beaver/plugins/tree_merge.py`: deterministic three-way supporting-file merge plans. +- `app-instance/backend/beaver/plugins/state.py`: atomic `.beaver/plugins/state.json` persistence. +- `app-instance/backend/beaver/plugins/discovery.py`: scan workspace and configured plugin roots. +- `app-instance/backend/beaver/plugins/transaction.py`: same-filesystem staging and immutable directory promotion. +- `app-instance/backend/beaver/plugins/skills.py`: initial mirror, update classification, candidate creation, reconciliation, pause/resume, disable, and adopt. +- `app-instance/backend/beaver/plugins/__init__.py`: public exports. +- `app-instance/backend/beaver/foundation/utils/file_lock.py`: reentrant cross-process workspace write lock. 
+ +Modify skill lifecycle modules: + +- `app-instance/backend/beaver/skills/specs/models.py`: add upstream snapshot and draft provenance models. +- `app-instance/backend/beaver/skills/specs/storage.py`: persist immutable upstream snapshots and safely copy supporting files. +- `app-instance/backend/beaver/memory/skills/store.py`: lock candidate existence checks and JSONL mutations. +- `app-instance/backend/beaver/skills/drafts/service.py`: create plugin update drafts. +- `app-instance/backend/beaver/skills/learning/service.py`: synthesize `plugin_skill_update`. +- `app-instance/backend/beaver/skills/learning/synthesizer.py`: three-way plugin merge prompt and result. +- `app-instance/backend/beaver/skills/learning/eval.py`: plugin merge preservation report. +- `app-instance/backend/beaver/skills/learning/pipeline.py`: acknowledge successful plugin update publication. +- `app-instance/backend/beaver/skills/publisher/service.py`: carry draft provenance into published versions. + +Modify runtime and management surfaces: + +- `app-instance/backend/beaver/foundation/config/schema.py` +- `app-instance/backend/beaver/foundation/config/loader.py` +- `app-instance/backend/beaver/engine/loader.py` +- `app-instance/backend/beaver/interfaces/web/app.py` +- `app-instance/frontend/types/index.ts` +- `app-instance/frontend/lib/api.ts` +- `app-instance/frontend/app/(app)/skills/page.tsx` + +Add tests: + +- `app-instance/backend/tests/unit/test_plugin_manifest.py` +- `app-instance/backend/tests/unit/test_plugin_hashing.py` +- `app-instance/backend/tests/unit/test_plugin_state.py` +- `app-instance/backend/tests/unit/test_workspace_write_lock.py` +- `app-instance/backend/tests/unit/test_plugin_skill_storage.py` +- `app-instance/backend/tests/unit/test_plugin_skill_sync.py` +- `app-instance/backend/tests/unit/test_plugin_skill_learning.py` +- `app-instance/backend/tests/unit/test_plugin_runtime.py` +- `app-instance/backend/tests/unit/test_plugin_web_api.py` +- 
`app-instance/frontend/lib/plugin-api.test.ts` + +--- + +### Task 1: Add Plugin Configuration And Manifest Models + +**Files:** +- Create: `app-instance/backend/beaver/plugins/models.py` +- Create: `app-instance/backend/beaver/plugins/manifest.py` +- Create: `app-instance/backend/beaver/plugins/hashing.py` +- Create: `app-instance/backend/beaver/plugins/__init__.py` +- Modify: `app-instance/backend/beaver/foundation/config/schema.py` +- Modify: `app-instance/backend/beaver/foundation/config/loader.py` +- Modify: `app-instance/backend/beaver/foundation/config/__init__.py` +- Test: `app-instance/backend/tests/unit/test_plugin_manifest.py` +- Test: `app-instance/backend/tests/unit/test_plugin_hashing.py` +- Test: `app-instance/backend/tests/unit/test_config_loader.py` + +- [ ] **Step 1: Write failing manifest validation tests** + +Create tests covering: + +```python +def test_load_plugin_manifest_accepts_declared_skill(tmp_path: Path) -> None: + root = tmp_path / "comic" + (root / "skills" / "comic").mkdir(parents=True) + (root / "skills" / "comic" / "SKILL.md").write_text("# Comic\n", encoding="utf-8") + (root / "beaver.plugin.json").write_text( + json.dumps( + { + "schema_version": 1, + "id": "baoyu-comic", + "name": "Baoyu Comic", + "version": "1.2.0", + "skills": [{"name": "baoyu-comic", "path": "skills/comic"}], + } + ), + encoding="utf-8", + ) + + manifest = load_plugin_manifest(root / "beaver.plugin.json") + + assert manifest.plugin_id == "baoyu-comic" + assert manifest.skills[0].name == "baoyu-comic" + assert manifest.skills[0].root == root / "skills" / "comic" + + +@pytest.mark.parametrize("value", ["../outside", "/absolute", "skills/../../outside"]) +def test_load_plugin_manifest_rejects_escaping_skill_path(tmp_path: Path, value: str) -> None: + path = tmp_path / "beaver.plugin.json" + path.write_text( + json.dumps( + { + "schema_version": 1, + "id": "unsafe", + "name": "Unsafe", + "version": "1.0.0", + "skills": [{"name": "unsafe", "path": value}], + } + ), 
+ encoding="utf-8", + ) + + with pytest.raises(ValueError, match="contained"): + load_plugin_manifest(path) +``` + +Also test invalid IDs, duplicate skill names, unsupported schema versions, missing +`SKILL.md`, and symlinked skill roots. + +Add tree-hash tests: + +```python +def test_skill_tree_hash_changes_when_supporting_file_changes(tmp_path: Path) -> None: + root = tmp_path / "skill" + root.mkdir() + (root / "SKILL.md").write_text("# Skill\n", encoding="utf-8") + (root / "templates").mkdir() + template = root / "templates" / "report.md" + template.write_text("v1", encoding="utf-8") + + first = hash_plugin_skill_tree(root) + template.write_text("v2", encoding="utf-8") + second = hash_plugin_skill_tree(root) + + assert first.skill_content_hash == second.skill_content_hash + assert first.skill_tree_hash != second.skill_tree_hash +``` + +Also verify path changes and executable-bit changes affect `skill_tree_hash`, while mtime +and non-executable permission changes do not. + +- [ ] **Step 2: Run tests and verify failure** + +Run: + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_manifest.py tests/unit/test_plugin_hashing.py tests/unit/test_config_loader.py -q +``` + +Expected: FAIL because `beaver.plugins` and `PluginsConfig` do not exist. + +- [ ] **Step 3: Implement immutable plugin models and config** + +Put plugin package models in `beaver/plugins/models.py`: + +```python +@dataclass(frozen=True, slots=True) +class PluginSkillDeclaration: + name: str + relative_path: str + root: Path + + +@dataclass(frozen=True, slots=True) +class PluginManifest: + schema_version: int + plugin_id: str + name: str + version: str + root: Path + manifest_path: Path + display_path: str + skills: tuple[PluginSkillDeclaration, ...] 
+ + +@dataclass(frozen=True, slots=True) +class PluginSkillFileDigest: + path: str + size: int + executable: bool + content_hash: str + + +@dataclass(frozen=True, slots=True) +class PluginSkillTreeDigest: + skill_content_hash: str + skill_tree_hash: str + files: tuple[PluginSkillFileDigest, ...] + + +``` + +Put configuration in `beaver/foundation/config/schema.py` to preserve the foundation layer +and avoid importing plugin runtime modules from config: + +```python +@dataclass(slots=True) +class PluginsConfig: + search_paths: list[str] = field(default_factory=list) + auto_sync: bool = True +``` + +Add `plugins: PluginsConfig` to `BeaverConfig`. Parse both camelCase and snake_case: + +```python +def _parse_plugins(raw: Any) -> PluginsConfig: + data = _as_dict(raw) + return PluginsConfig( + search_paths=_string_list(data.get("searchPaths") or data.get("search_paths")), + auto_sync=_bool(data.get("autoSync") if "autoSync" in data else data.get("auto_sync"), default=True), + ) +``` + +- [ ] **Step 4: Implement strict JSON manifest loading** + +`load_plugin_manifest()` must: + +1. parse a JSON object; +2. require schema version `1`; +3. validate identifiers with `^[a-z0-9][a-z0-9_-]*$`; +4. resolve every skill root and check `resolved.is_relative_to(plugin_root)`; +5. reject symlinks in the path from plugin root to skill root; +6. require a regular `SKILL.md`; +7. initialize `display_path` without exposing an absolute path; +8. return frozen dataclasses. + +- [ ] **Step 5: Implement deterministic dual hashing** + +`hash_plugin_skill_tree(root)` must: + +1. reject symlinks and non-regular files; +2. enumerate regular files by normalized POSIX relative path; +3. compute `skill_content_hash` from normalized `SKILL.md`; +4. compute `skill_tree_hash` from each path, byte length, file bytes, and one normalized + executable-bit flag; +5. include `SKILL.md` and every supporting file; +6. exclude Beaver metadata such as `version.json` and `upstream.json`; +7. 
ignore mtime, uid/gid, and non-executable mode bits. + +Use length-prefixed binary fields in the digest input instead of ambiguous string +concatenation. + +- [ ] **Step 6: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_manifest.py tests/unit/test_plugin_hashing.py tests/unit/test_config_loader.py -q +``` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add app-instance/backend/beaver/plugins app-instance/backend/beaver/foundation/config app-instance/backend/tests/unit/test_plugin_manifest.py app-instance/backend/tests/unit/test_plugin_hashing.py app-instance/backend/tests/unit/test_config_loader.py +git commit -m "feat(plugins): add declarative skill manifest" +``` + +--- + +### Task 2: Add Discovery And Atomic Plugin State + +**Files:** +- Create: `app-instance/backend/beaver/plugins/discovery.py` +- Create: `app-instance/backend/beaver/plugins/state.py` +- Create: `app-instance/backend/beaver/foundation/utils/file_lock.py` +- Modify: `app-instance/backend/beaver/plugins/models.py` +- Modify: `app-instance/backend/beaver/plugins/__init__.py` +- Test: `app-instance/backend/tests/unit/test_plugin_state.py` +- Test: `app-instance/backend/tests/unit/test_workspace_write_lock.py` + +- [ ] **Step 1: Write failing discovery and state tests** + +Cover workspace discovery, configured search paths, duplicate plugin IDs, malformed +manifests reported as errors instead of crashing the full scan, and state round trips: + +```python +def test_plugin_state_round_trip_is_atomic(tmp_path: Path) -> None: + store = PluginStateStore(tmp_path) + store.set_enabled("baoyu-comic", True) + store.update_skill_binding( + "baoyu-comic", + "baoyu-comic", + PluginSkillBinding( + accepted_upstream_tree_hash="old", + observed_upstream_tree_hash="new", + accepted_beaver_version="v0001", + current_beaver_version="v0002", + pending_candidate_id="plugin-update:baoyu-comic:baoyu-comic:new", + status="update_pending", + ), + ) + + reloaded = 
PluginStateStore(tmp_path).get_plugin("baoyu-comic") + + assert reloaded is not None + assert reloaded.enabled is True + assert reloaded.skills["baoyu-comic"].accepted_upstream_tree_hash == "old" + assert not (tmp_path / ".beaver" / "plugins" / "state.json.tmp").exists() +``` + +Add a multiprocess lock test in which two processes enter the same workspace lock and +assert their critical sections never overlap. Add a reentrancy test in which nested +acquisitions in one process complete without deadlock. + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_state.py tests/unit/test_workspace_write_lock.py -q +``` + +Expected: FAIL because discovery and state stores are missing. + +- [ ] **Step 3: Implement state dataclasses** + +Add backward-compatible `to_dict()` and `from_dict()` methods for: + +```python +@dataclass(slots=True) +class PluginSkillBinding: + accepted_upstream_tree_hash: str | None = None + observed_upstream_tree_hash: str | None = None + accepted_beaver_version: str | None = None + current_beaver_version: str | None = None + pending_candidate_id: str | None = None + status: str = "discovered" + last_error: str | None = None + + +@dataclass(slots=True) +class PluginState: + plugin_id: str + enabled: bool = False + updates_paused: bool = False + installed_version: str | None = None + manifest_path: str | None = None + status: str = "discovered" + last_error: str | None = None + skills: dict[str, PluginSkillBinding] = field(default_factory=dict) +``` + +- [ ] **Step 4: Implement atomic state persistence** + +Store data at `<workspace>/.beaver/plugins/state.json`. Write a complete JSON document to +`state.json.tmp`, flush it, then replace `state.json`. 
Public methods: + +```python +list_plugins() +get_plugin(plugin_id) +set_enabled(plugin_id, enabled) +upsert_plugin(plugin_state) +update_skill_binding(plugin_id, skill_name, binding) +``` + +- [ ] **Step 5: Implement the shared workspace write lock** + +Add: + +```python +class WorkspaceWriteLock: + def __init__(self, workspace: str | Path) -> None: + self.path = Path(workspace) / ".beaver" / "locks" / "plugin-skill-write.lock" + + @contextmanager + def acquire(self, *, timeout_seconds: float | None = None, blocking: bool = True): + ... +``` + +Requirements: + +- use `fcntl.flock()` on POSIX and `msvcrt.locking()` on Windows, matching + `memory/curated/store.py`; +- guard with a process-local `threading.RLock`; +- track per-thread recursion depth so nested store calls reuse the OS lock; +- support non-blocking acquisition for Engine boot; +- raise `WorkspaceWriteLockBusy` on timeout/contention; +- keep the lock file separate from atomically replaced data files. + +- [ ] **Step 6: Implement discovery** + +Scan: + +1. `<workspace>/plugins`; +2. each configured `plugins.search_paths`. + +Only direct child directories containing `beaver.plugin.json` are plugins. Return a +`PluginDiscoveryResult` containing valid manifests and per-path errors. Duplicate IDs are +errors and neither duplicate is activated. Discovery records a workspace-relative +manifest display path when possible and a redacted +`<external>/<plugin-dir>/beaver.plugin.json` path otherwise; absolute paths remain +internal. + +- [ ] **Step 7: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_state.py tests/unit/test_workspace_write_lock.py tests/unit/test_plugin_manifest.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 8: Commit** + +```bash +git add app-instance/backend/beaver/plugins app-instance/backend/beaver/foundation/utils/file_lock.py app-instance/backend/tests/unit/test_plugin_state.py app-instance/backend/tests/unit/test_workspace_write_lock.py +git commit -m "feat(plugins): discover packages and persist state" +``` + +--- + +### Task 3: Persist Immutable Upstream Skill Snapshots + +**Files:** +- Create: `app-instance/backend/beaver/plugins/transaction.py` +- Modify: `app-instance/backend/beaver/skills/specs/models.py` +- Modify: `app-instance/backend/beaver/skills/specs/storage.py` +- Modify: `app-instance/backend/beaver/skills/specs/__init__.py` +- Test: `app-instance/backend/tests/unit/test_plugin_skill_storage.py` + +- [ ] **Step 1: Write failing snapshot storage tests** + +Test exact content, supporting files, idempotence, symlink rejection, and source +immutability: + +```python +def test_write_upstream_snapshot_copies_skill_without_mutating_source(tmp_path: Path) -> None: + source = tmp_path / "plugin" / "skills" / "comic" + source.mkdir(parents=True) + (source / "SKILL.md").write_text("# Comic\n\nOriginal.\n", encoding="utf-8") + (source / "templates").mkdir() + (source / "templates" / "panel.txt").write_text("panel", encoding="utf-8") + store = SkillSpecStore(tmp_path / "workspace") + transaction = PluginSkillTransaction(tmp_path / "workspace") + + snapshot = store.stage_upstream_snapshot( + transaction, + skill_name="baoyu-comic", + source_kind="plugin", + source_id="baoyu-comic", + source_version="1.0.0", + source_path="skills/comic", + source_root=source, + ) + store.promote_upstream_snapshot(transaction, snapshot) + + loaded = store.read_upstream_snapshot("baoyu-comic", "baoyu-comic", snapshot.skill_tree_hash) + assert loaded is not None + assert loaded.content == "# Comic\n\nOriginal.\n" + assert (loaded.root / "templates" / "panel.txt").read_text(encoding="utf-8") == "panel" + assert (source / "SKILL.md").read_text(encoding="utf-8") == "# 
Comic\n\nOriginal.\n" +``` + +Also test: + +- changing only `templates/panel.txt` creates a different snapshot directory; +- `SkillVersion.from_dict()` remains compatible without `tree_hash`; +- reading a legacy version derives its complete tree hash; +- staging does not make a snapshot visible to `read_upstream_snapshot()`; +- promoting a staged snapshot uses `os.replace()` and is idempotent; +- a failed metadata write leaves no current pointer to the staged version. + +- [ ] **Step 2: Run test and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_storage.py -q +``` + +Expected: FAIL because upstream snapshot APIs do not exist. + +- [ ] **Step 3: Add upstream snapshot models** + +Add: + +```python +@dataclass(slots=True) +class SkillUpstreamSnapshot: + skill_name: str + source_kind: str + source_id: str + source_version: str + source_path: str + skill_content_hash: str + skill_tree_hash: str + created_at: str + frontmatter: dict[str, Any] = field(default_factory=dict) +``` + +Add `LoadedSkillUpstreamSnapshot(snapshot, content, root)` for storage reads. Extend +`SkillVersion` with a backward-compatible `tree_hash: str = ""`; new versions persist the +complete version-tree hash, while `read_published_skill()` derives it for legacy metadata +that lacks the field. + +- [ ] **Step 4: Add safe tree-copy helper** + +Refactor a private `SkillSpecStore._copy_regular_tree(source_root, target_root)` that: + +- rejects any symlink; +- rejects paths containing empty, `.`, or `..` segments; +- copies regular files only; +- creates parents; +- never writes outside `target_root`. + +Use it for transaction staging now; Task 4 will reuse it for mirrored versions. + +- [ ] **Step 5: Implement same-filesystem staging and promotion** + +`PluginSkillTransaction` creates: + +```text +<workspace>/.beaver/staging/plugin-skills/<transaction-id>/ +``` + +The staging root must be on the same filesystem as `<workspace>/skills`. 
It exposes: + +```python +stage_upstream_snapshot(...) +stage_skill_version(...) +promote_directory(staged, final) +cleanup() +``` + +`promote_directory()` uses `os.replace()` and never replaces an existing non-identical +immutable directory. Cleanup removes only the transaction's staging root. + +- [ ] **Step 6: Implement snapshot APIs** + +Write snapshots to: + +```text +skills/<skill>/upstreams/<source-id>/<skill-tree-hash>/ +``` + +The snapshot metadata stores both hashes. If the directory already exists, verify all +stored metadata and return it without rewriting. + +Public methods: + +```python +stage_upstream_snapshot(transaction, ...) +promote_upstream_snapshot(transaction, snapshot) +read_upstream_snapshot(skill_name, source_id, skill_tree_hash) +``` + +- [ ] **Step 7: Make JSON/current/index writes atomic** + +Change `SkillSpecStore._write_json()` and current/index pointer writes to create a temporary +file in the target directory, flush and `fsync`, then `os.replace()`. Immutable version +directories are promoted first; runtime visibility changes only when `current.json`, +`skill.json`, and the published index are atomically replaced under the workspace lock. + +- [ ] **Step 8: Run focused and existing storage tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_storage.py tests/unit/test_phase5_skills_runtime.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 9: Commit** + +```bash +git add app-instance/backend/beaver/plugins/transaction.py app-instance/backend/beaver/skills/specs app-instance/backend/tests/unit/test_plugin_skill_storage.py +git commit -m "feat(skills): store immutable plugin upstream snapshots" +``` + +--- + +### Task 4: Mirror Initial Plugin Skills As First-Class Skills + +**Files:** +- Create: `app-instance/backend/beaver/plugins/skills.py` +- Modify: `app-instance/backend/beaver/plugins/models.py` +- Modify: `app-instance/backend/beaver/plugins/__init__.py` +- Modify: `app-instance/backend/beaver/skills/specs/storage.py` +- Test: `app-instance/backend/tests/unit/test_plugin_skill_sync.py` + +- [ ] **Step 1: Write failing initial mirror tests** + +Cover: + +- enabling mirrors `SKILL.md` and supporting files; +- mirrored skill is returned by `SkillsLoader.list_published_skills()`; +- `source_kind` is `plugin`, but runtime source is still workspace; +- existing non-plugin name collision fails without modification; +- any validation/safety failure in a multi-skill plugin occurs before promotion and leaves + every linked skill unchanged; +- repeated sync is idempotent; +- supporting files are present in the promoted version; +- concurrent enable calls allocate only one version.
+ +Core assertion: + +```python +result = manager.enable("baoyu-comic") +record = SkillsLoader(workspace).get_skill_record("baoyu-comic") +loaded = SkillSpecStore(workspace).read_published_skill("baoyu-comic") + +assert result.status == "synced" +assert record is not None and record.source == "workspace" +assert record.source_kind == "plugin" +assert loaded is not None +assert loaded.version.version == "v0001" +assert loaded.version.provenance["plugin_id"] == "baoyu-comic" +assert loaded.version.provenance["upstream_skill_content_hash"] +assert loaded.version.provenance["upstream_skill_tree_hash"] +``` + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_sync.py -q +``` + +Expected: FAIL because `PluginManager` does not exist. + +- [ ] **Step 3: Implement `PluginManager` constructor and discovery view** + +Constructor dependencies: + +```python +class PluginManager: + def __init__( + self, + *, + workspace: Path, + manifests: dict[str, PluginManifest], + discovery_errors: list[PluginDiscoveryError], + state_store: PluginStateStore, + skill_store: SkillSpecStore, + learning_store: SkillLearningStore, + publisher: SkillPublisher, + safety_checker: SkillDraftSafetyChecker, + write_lock: WorkspaceWriteLock, + ) -> None: + ... +``` + +Keep all filesystem and lifecycle dependencies injectable for tests. + +- [ ] **Step 4: Implement exact initial mirror publication** + +Acquire the workspace write lock before reading state, allocating versions, or writing +candidates. For each declared skill: + +1. persist the upstream snapshot; +2. validate ownership conflict; +3. parse frontmatter/body and create an in-memory `SkillDraft` with + `proposal_kind="plugin_initial_mirror"`; +4. run `SkillDraftSafetyChecker.check()` and reject failed or critical reports; +5. allocate the next `vNNNN` while holding the lock; +6. stage a `SkillVersion` whose content exactly equals upstream `SKILL.md`; +7. 
stage snapshot supporting files into the version directory; +8. generate the complete next `SkillSpec`, current pointer, index, and plugin-state JSON + payloads in memory. + +Use provenance: + +```python +{ + "source_kind": "plugin", + "plugin_id": manifest.plugin_id, + "plugin_version": manifest.version, + "plugin_skill_path": declaration.relative_path, + "upstream_skill_content_hash": snapshot.skill_content_hash, + "upstream_skill_tree_hash": snapshot.skill_tree_hash, + "merge_mode": "initial_mirror", +} +``` + +- [ ] **Step 5: Promote the complete staged transaction** + +After every declared skill passes validation: + +1. for a new skill, promote its complete staged skill directory with one `os.replace()`; +2. for an existing skill, promote immutable upstream/version directories, atomically + replace spec/index metadata, and replace `current.json` last as the visibility switch; +3. atomically write plugin state last; +4. clean the staging directory. + +Do not implement reverse rollback across already-promoted immutable directories. If a +metadata write fails, those directories remain unreferenced and harmless; the previous +current pointers remain authoritative. Add startup cleanup for staging directories older +than 24 hours. + +- [ ] **Step 6: Run focused and loader tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_sync.py tests/unit/test_phase5_skills_runtime.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add app-instance/backend/beaver/plugins app-instance/backend/beaver/skills/specs/storage.py app-instance/backend/tests/unit/test_plugin_skill_sync.py +git commit -m "feat(plugins): mirror enabled plugin skills" +``` + +--- + +### Task 5: Detect Upgrades And Create Idempotent Learning Candidates + +**Files:** +- Modify: `app-instance/backend/beaver/plugins/skills.py` +- Modify: `app-instance/backend/beaver/memory/skills/models.py` +- Modify: `app-instance/backend/beaver/memory/skills/store.py` +- Test: `app-instance/backend/tests/unit/test_plugin_skill_sync.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_candidate_state.py` + +- [ ] **Step 1: Write failing upgrade classification tests** + +Create four tree-hash fixtures covering the old upstream base (`B`), current local (`L`), and new upstream (`U`) hashes: + +```python +@pytest.mark.parametrize( + ("base", "local", "upstream", "expected"), + [ + ("A", "A", "A", "unchanged"), + ("A", "B", "B", "already_applied"), + ("A", "A", "B", "fast_forward"), + ("A", "LOCAL", "UPSTREAM", "three_way"), + ], +) +def test_classify_plugin_skill_update(base: str, local: str, upstream: str, expected: str) -> None: + assert classify_plugin_skill_update(base, local, upstream) == expected +``` + +Also test: + +- a supporting-file-only change returns `fast_forward` or `three_way`, never `unchanged`; +- candidate ID stability across repeated sync; +- new upstream supersedes an older pending candidate; +- candidate evidence contains hashes/version references but no raw skill body; +- legacy candidate payloads still parse; +- two processes syncing the same update append only one candidate record. + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_sync.py tests/unit/test_skill_learning_candidate_state.py -q +``` + +Expected: FAIL because update classification and candidate kind are missing.
+ +- [ ] **Step 3: Add `plugin_skill_update` candidate support** + +Do not add a special status. Existing candidate statuses remain sufficient. Ensure +`SkillLearningCandidate.from_dict()` accepts the new `kind` without changing legacy +defaults. + +Use evidence: + +```python +{ + "plugin_id": plugin_id, + "plugin_version": manifest.version, + "skill_name": skill_name, + "merge_mode": merge_mode, + "base_upstream_tree_hash": accepted_tree_hash, + "new_upstream_tree_hash": snapshot.skill_tree_hash, + "local_version": current.version.version, +} +``` + +Set `priority=10`, `confidence=1.0`, `trigger_reason="plugin_update"`. + +- [ ] **Step 4: Implement update classification and candidate creation** + +Use canonical hashes and deterministic IDs: + +```python +candidate_id = ( + f"plugin-update:{plugin_id}:{skill_name}:" + f"{new_upstream_tree_hash[:12]}" +) +``` + +For `already_applied`, advance state without a candidate. For `fast_forward` and +`three_way`, record an open candidate. If the same ID exists in any status, do not append +another JSONL record. + +- [ ] **Step 5: Make candidate mutation atomic under the shared lock** + +Add an optional `WorkspaceWriteLock` to `SkillLearningStore`; EngineLoader supplies the +shared workspace instance, while isolated unit-test construction falls back to a +store-local lock. Add: + +```python +record_learning_candidate_if_absent(candidate) -> tuple[SkillLearningCandidate, bool] +``` + +Inside one lock acquisition, read current candidates, check the deterministic ID, and +atomically rewrite or append the JSONL record. Apply the same lock to candidate update and +transition methods. Nested calls from `PluginManager` reuse the reentrant lock. 
+ +- [ ] **Step 6: Supersede stale pending updates** + +When a different pending candidate exists for the same plugin skill: + +```python +learning_store.transition_learning_candidate( + old_candidate_id, + "superseded", + event_type="plugin_update_superseded", + payload={"replacement_candidate_id": new_candidate_id}, +) +``` + +- [ ] **Step 7: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_sync.py tests/unit/test_skill_learning_candidate_state.py -q +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add app-instance/backend/beaver/plugins/skills.py app-instance/backend/beaver/memory/skills/models.py app-instance/backend/beaver/memory/skills/store.py app-instance/backend/tests/unit/test_plugin_skill_sync.py app-instance/backend/tests/unit/test_skill_learning_candidate_state.py +git commit -m "feat(plugins): enqueue skill upgrade candidates" +``` + +--- + +### Task 6: Add Plugin Update Draft Provenance And Fast-Forward Synthesis + +**Files:** +- Modify: `app-instance/backend/beaver/skills/specs/models.py` +- Modify: `app-instance/backend/beaver/skills/drafts/service.py` +- Modify: `app-instance/backend/beaver/skills/publisher/service.py` +- Modify: `app-instance/backend/beaver/skills/learning/service.py` +- Test: `app-instance/backend/tests/unit/test_plugin_skill_learning.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_pipeline.py` + +- [ ] **Step 1: Write failing model and fast-forward tests** + +Test backward-compatible draft parsing and exact upstream fast-forward: + +```python +draft = asyncio.run(service.synthesize_draft(candidate.candidate_id, provider_bundle)) + +assert draft.proposal_kind == "plugin_skill_update" +assert draft.proposed_content == new_upstream.content +assert draft.base_version == "v0001" +assert draft.provenance["merge_mode"] == "fast_forward" +assert draft.provenance["new_upstream_tree_hash"] == new_upstream.snapshot.skill_tree_hash +assert provider.calls == [] 
+``` + +After publish, assert the new version contains the new upstream supporting files even when +`SKILL.md` did not change. + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_learning.py tests/unit/test_skill_learning_pipeline.py -q +``` + +Expected: FAIL because drafts have no provenance and the learning service has no plugin +update branch. + +- [ ] **Step 3: Add backward-compatible draft provenance** + +Extend `SkillDraft`: + +```python +provenance: dict[str, Any] = field(default_factory=dict) +``` + +Include it in `to_dict()` and parse missing values as `{}` in `from_dict()`. + +- [ ] **Step 4: Add a focused draft constructor** + +Add: + +```python +def create_plugin_update_draft( + self, + *, + skill_name: str, + base_version: str, + proposed_content: str, + proposed_frontmatter: dict, + created_by: str, + reason: str, + provenance: dict, + evidence_refs: list[dict] | None = None, +) -> SkillDraft: +``` + +It writes `proposal_kind="plugin_skill_update"`. + +- [ ] **Step 5: Implement fast-forward synthesis** + +In `SkillLearningService.synthesize_draft()`, branch before ordinary revision: + +```python +if candidate.kind == "plugin_skill_update": + return await self._synthesize_plugin_update(candidate, provider_bundle) +``` + +For `merge_mode == "fast_forward"`, load `U` from `SkillSpecStore`, parse its +frontmatter/body, and create a draft exactly equal to `U`. Do not call the provider. + +- [ ] **Step 6: Serialize all skill publication** + +Add an optional `WorkspaceWriteLock` to `SkillPublisher`; EngineLoader supplies the shared +workspace instance and isolated tests use a publisher-local fallback. Hold it across +`_next_version()`, version staging/promotion, spec/current/index replacement, rollback, +and disable. This protects ordinary learned skills as well as plugin-origin skills from +racing with boot or explicit plugin sync. 
+ +- [ ] **Step 7: Materialize referenced supporting files during publish** + +For `proposal_kind="plugin_skill_update"`, resolve the snapshot and supporting-file plan +from draft provenance. Stage the complete next version directory, including `SKILL.md` +and supporting files, before promoting it. Reject missing snapshots, path conflicts, or +tree-hash mismatches. Ordinary skill publication keeps its current behavior. + +- [ ] **Step 8: Preserve draft provenance on publish** + +Change `SkillPublisher.publish()` provenance construction to: + +```python +provenance={ + **dict(draft.provenance), + "draft_id": draft_id, + "proposal_kind": draft.proposal_kind, + "trigger_run_id": draft.trigger_run_id, + "trigger_session_id": draft.trigger_session_id, +} +``` + +- [ ] **Step 9: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_learning.py tests/unit/test_skill_learning_pipeline.py -q +``` + +Expected: PASS. + +- [ ] **Step 10: Commit** + +```bash +git add app-instance/backend/beaver/skills app-instance/backend/tests/unit/test_plugin_skill_learning.py app-instance/backend/tests/unit/test_skill_learning_pipeline.py +git commit -m "feat(skill-learning): create plugin update drafts" +``` + +--- + +### Task 7: Implement Three-Way Plugin Skill Synthesis + +**Files:** +- Create: `app-instance/backend/beaver/plugins/tree_merge.py` +- Modify: `app-instance/backend/beaver/skills/learning/synthesizer.py` +- Modify: `app-instance/backend/beaver/skills/learning/service.py` +- Test: `app-instance/backend/tests/unit/test_plugin_skill_learning.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py` + +- [ ] **Step 1: Write failing three-way prompt and parse tests** + +Assert the prompt contains labeled `OLD UPSTREAM`, `CURRENT LOCAL`, and `NEW UPSTREAM` +sections and does not confuse the current local version with the merge base. 
+ +Test response parsing for: + +```json +{ + "frontmatter": {"name": "baoyu-comic", "description": "Comic workflow", "tools": []}, + "content": "# Baoyu Comic\n...", + "change_reason": "Adopt upstream layout while preserving learned review step.", + "preserved_local_sections": ["Review"], + "adopted_upstream_sections": ["Panel Layout"], + "resolved_conflicts": ["Output ordering"], + "dropped_sections": [] +} +``` + +Add supporting-file merge tests: + +```python +def test_supporting_file_merge_adopts_upstream_when_local_is_unchanged() -> None: + plan = merge_supporting_file_trees(base={"a.txt": "A"}, local={"a.txt": "A"}, upstream={"a.txt": "U"}) + assert plan.files["a.txt"].source == "upstream" + assert plan.conflicts == [] + + +def test_supporting_file_merge_blocks_divergent_edits() -> None: + plan = merge_supporting_file_trees(base={"a.txt": "A"}, local={"a.txt": "L"}, upstream={"a.txt": "U"}) + assert plan.conflicts[0].path == "a.txt" +``` + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_learning.py tests/unit/test_skill_learning_synthesizer_preservation.py -q +``` + +Expected: FAIL because three-way synthesis does not exist. + +- [ ] **Step 3: Add `synthesize_plugin_update()`** + +Signature: + +```python +async def synthesize_plugin_update( + self, + candidate: SkillLearningCandidate, + evidence_packet: EvidencePacket, + provider: LLMProvider, + model: str, + *, + old_upstream: dict[str, Any], + current_local: dict[str, Any], + new_upstream: dict[str, Any], +) -> dict[str, Any]: +``` + +The system message must require JSON only and state: + +- preserve valid local learning; +- adopt upstream fixes and safety changes; +- do not concatenate duplicate sections; +- list every intentional drop; +- leave `resolved_conflicts` empty only when no semantic conflict exists. 
+ +- [ ] **Step 4: Load all three snapshots in the learning service** + +Resolve: + +- `B` using `base_upstream_tree_hash`; +- `L` using `local_version`; +- `U` using `new_upstream_tree_hash`. + +Raise a specific `ValueError` when any referenced snapshot/version is missing. Do not +fall back to a two-way merge. + +- [ ] **Step 5: Build the deterministic supporting-file merge plan** + +Compare files by path and content/executable digest: + +- `L == B`: use `U`; +- `U == B`: use `L`; +- `L == U`: use either; +- one-sided addition: use the added file; +- divergent edit, different same-path additions, and delete-versus-edit: conflict. + +Exclude `SKILL.md` because the synthesizer handles it. Store selected source references +and conflict records in draft provenance; do not duplicate file bytes in JSON. + +- [ ] **Step 6: Create the plugin update draft** + +Store merge decisions in draft provenance: + +```python +{ + **plugin_reference_fields, + "merge_mode": "three_way", + "preserved_local_sections": payload["preserved_local_sections"], + "adopted_upstream_sections": payload["adopted_upstream_sections"], + "resolved_conflicts": payload["resolved_conflicts"], + "dropped_sections": payload["dropped_sections"], + "supporting_file_plan": supporting_file_plan.to_dict(), +} +``` + +If the supporting-file plan contains conflicts, the draft may be inspected but cannot be +published. V1 does not ask the LLM to merge arbitrary or binary files. + +- [ ] **Step 7: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_learning.py tests/unit/test_skill_learning_synthesizer_preservation.py -q +``` + +Expected: PASS.
+ +- [ ] **Step 8: Commit** + +```bash +git add app-instance/backend/beaver/plugins/tree_merge.py app-instance/backend/beaver/skills/learning app-instance/backend/tests/unit/test_plugin_skill_learning.py app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py +git commit -m "feat(skill-learning): synthesize three-way plugin updates" +``` + +--- + +### Task 8: Extend Replay Preservation For Plugin Merges + +**Files:** +- Modify: `app-instance/backend/beaver/skills/learning/preservation.py` +- Modify: `app-instance/backend/beaver/skills/learning/eval.py` +- Modify: `app-instance/backend/beaver/skills/learning/pipeline.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_preservation.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_eval.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_pipeline.py` + +- [ ] **Step 1: Write failing plugin merge preservation tests** + +Cover: + +- merged draft preserves local Safety and adopts new upstream Safety; +- silently dropping either Safety section fails; +- explicitly resolved non-safety conflicts pass; +- unresolved conflicts block publish; +- unresolved supporting-file conflicts block publish; +- baseline replay remains current local `L`. + +Expected report shape: + +```python +assert report.preservation_report == { + "mode": "plugin_three_way", + "passed": True, + "local": {...}, + "upstream": {...}, + "unresolved_conflicts": [], +} +``` + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_preservation.py tests/unit/test_skill_learning_eval.py tests/unit/test_skill_learning_pipeline.py -q +``` + +Expected: FAIL because preservation only checks one base skill. 
+ +- [ ] **Step 3: Add plugin merge preservation helper** + +Add: + +```python +def check_plugin_merge_preservation( + *, + local_content: str, + upstream_content: str, + draft_content: str, + merge_decisions: dict[str, Any], +) -> dict[str, Any]: +``` + +It calls existing `check_preservation()` for local and upstream content, gives Safety and +Required Tools sections blocking weight, and reports unresolved conflicts separately. + +- [ ] **Step 4: Use current local as replay baseline** + +When `draft.proposal_kind == "plugin_skill_update"`, load `draft.base_version` as the +baseline skill. Continue to run the candidate arm with the draft context. Do not use raw +upstream `B` or `U` as the replay baseline. + +- [ ] **Step 5: Tighten publish gate** + +Add: + +```python +if draft.proposal_kind == "plugin_skill_update": + preservation = eval_report.preservation_report or {} + if preservation.get("mode") != "plugin_three_way" and draft.provenance.get("merge_mode") == "three_way": + raise ValueError("Plugin update requires a three-way preservation report") + if preservation.get("unresolved_conflicts"): + raise ValueError("Plugin update has unresolved merge conflicts") + if draft.provenance.get("supporting_file_plan", {}).get("conflicts"): + raise ValueError("Plugin update has unresolved supporting-file conflicts") +``` + +The existing `passed is False` gate remains active. + +- [ ] **Step 6: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_skill_learning_preservation.py tests/unit/test_skill_learning_eval.py tests/unit/test_skill_learning_pipeline.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add app-instance/backend/beaver/skills/learning app-instance/backend/tests/unit/test_skill_learning_preservation.py app-instance/backend/tests/unit/test_skill_learning_eval.py app-instance/backend/tests/unit/test_skill_learning_pipeline.py +git commit -m "feat(skill-learning): gate plugin merge preservation" +``` + +--- + +### Task 9: Reconcile Publication And Implement Pause/Disable/Adopt + +**Files:** +- Modify: `app-instance/backend/beaver/plugins/skills.py` +- Modify: `app-instance/backend/beaver/skills/learning/pipeline.py` +- Modify: `app-instance/backend/beaver/skills/publisher/service.py` +- Test: `app-instance/backend/tests/unit/test_plugin_skill_sync.py` +- Test: `app-instance/backend/tests/unit/test_skill_learning_pipeline.py` + +- [ ] **Step 1: Write failing lifecycle tests** + +Test: + +- publishing a plugin update advances accepted upstream tree hash; +- pending candidate clears; +- simulated observer failure leaves the published version intact; +- the next sync reconciles state from current version provenance and does not recreate the + candidate; +- reconciliation never moves `accepted_beaver_version` backwards after rollback; +- pause leaves linked skills active and creates no update candidates; +- resume reconciles and syncs; +- disabling plugin disables linked skills without deletion; +- re-enable restores and syncs; +- missing package sets plugin status `missing`, suspends sync, and leaves linked skills + active; +- adopt changes `source_kind` to `managed`, removes binding, and keeps the skill active. + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_sync.py tests/unit/test_skill_learning_pipeline.py -q +``` + +Expected: FAIL because publication has no plugin acknowledgement callback. 
+ +- [ ] **Step 3: Add a narrow publication observer** + +Extend pipeline construction with: + +```python +publish_observer: Callable[[SkillDraft, SkillVersion | SkillSpec], None] | None = None +``` + +After successful publish, call it before returning. Observer failure must be recorded and +audited as `plugin_publish_ack_failed`; it must not delete the already-published version +or turn the publish API response into a failure. Mark the learning candidate published +before invoking the best-effort observer so clients do not retry a successful publish. +The next sync is responsible for reconciliation. + +- [ ] **Step 4: Implement `PluginManager.on_skill_published()`** + +For `proposal_kind="plugin_skill_update"`: + +1. validate plugin ID, skill name, and new upstream tree hash from draft provenance; +2. set `accepted_upstream_tree_hash = new_upstream_tree_hash`; +3. set `observed_upstream_tree_hash = new_upstream_tree_hash`; +4. set `accepted_beaver_version = published.version`; +5. set `current_beaver_version = published.version`; +6. clear `pending_candidate_id`; +7. set status `synced`. + +- [ ] **Step 5: Implement sync-time reconciliation** + +At the beginning of `sync_enabled()`, inspect each linked skill's current published +version. When provenance contains: + +```python +{ + "proposal_kind": "plugin_skill_update", + "plugin_id": plugin_id, + "new_upstream_tree_hash": tree_hash, +} +``` + +and the referenced upstream snapshot exists, advance state only if the current version +number is newer than `accepted_beaver_version`. Clear only the matching pending candidate. +Never regress state when the runtime current pointer was rolled back to an older version. + +- [ ] **Step 6: Implement pause, resume, disable, missing, and adopt** + +`pause(plugin_id)` sets `updates_paused=True` and leaves linked skills unchanged. +`resume(plugin_id)` clears the flag and performs reconciliation/sync. 
+ +`disable(plugin_id, disable_linked_skills=True)` rejects calls without the explicit +confirmation and calls `SkillPublisher.disable()` for every still-linked skill. +`adopt(plugin_id, skill_name)`: + +- requires an existing binding; +- changes `SkillSpec.source_kind` to `managed`; +- appends `adopted_from_plugin:<plugin-id>` to lineage; +- removes the binding; +- leaves the current version active. + +When discovery cannot find a previously known plugin, set status `missing`, preserve +`enabled` and `updates_paused`, skip update generation, and do not disable any linked +skill. + +- [ ] **Step 7: Run focused tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_skill_sync.py tests/unit/test_skill_learning_pipeline.py -q +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add app-instance/backend/beaver/plugins/skills.py app-instance/backend/beaver/skills/learning/pipeline.py app-instance/backend/beaver/skills/publisher/service.py app-instance/backend/tests/unit/test_plugin_skill_sync.py app-instance/backend/tests/unit/test_skill_learning_pipeline.py +git commit -m "feat(plugins): track published updates and ownership" +``` + +--- + +### Task 10: Wire Plugin Sync Into Engine Loading + +**Files:** +- Modify: `app-instance/backend/beaver/engine/loader.py` +- Modify: `app-instance/backend/beaver/plugins/__init__.py` +- Test: `app-instance/backend/tests/unit/test_plugin_runtime.py` +- Test: `app-instance/backend/tests/unit/test_phase5_skills_runtime.py` + +- [ ] **Step 1: Write failing runtime assembly tests** + +Test: + +- discovered disabled plugins do not mirror; +- enabled plugin mirrors before `EngineLoadResult.skills` is calculated; +- changed plugin creates a candidate but never calls an LLM during boot; +- repeated boot creates no duplicate versions/candidates; +- concurrent multi-process boot creates no duplicate versions/candidates; +- boot skips auto-sync and reports `deferred_lock_busy` when an explicit sync holds the + 
workspace lock; +- `EngineLoadResult.plugin_manager` and plugin summaries are available. + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_runtime.py tests/unit/test_phase5_skills_runtime.py -q +``` + +Expected: FAIL because `EngineLoader` does not assemble plugin services. + +- [ ] **Step 3: Extend `EngineLoadResult` and loader injection** + +Add: + +```python +plugin_manager: PluginManager | None = None +plugins: list[dict] = field(default_factory=list) +``` + +Allow `plugin_manager` injection in `EngineLoader.__init__()` for tests. + +- [ ] **Step 4: Assemble in dependency order** + +Required order: + +1. config/workspace; +2. `SkillSpecStore`, learning store, and `SkillsLoader`; +3. tool registry and builtins, including skill-view tools using that loader; +4. draft/review/publisher and a safety checker using the completed tool registry; +5. discovery and `PluginStateStore`; +6. `PluginManager`; +7. `plugin_manager.sync_enabled(blocking=False)` when `config.plugins.auto_sync`; +8. learning service/pipeline with publication observer; +9. result summaries. + +Do not use `SkillsLoader.extra_dirs` for plugin skills. Explicit API enable/sync uses a +bounded blocking lock timeout; Engine boot uses a non-blocking attempt and proceeds with +the current published skill set if another writer owns the lock. + +- [ ] **Step 5: Run runtime tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_runtime.py tests/unit/test_phase5_skills_runtime.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit** + +```bash +git add app-instance/backend/beaver/engine/loader.py app-instance/backend/beaver/plugins app-instance/backend/tests/unit/test_plugin_runtime.py app-instance/backend/tests/unit/test_phase5_skills_runtime.py +git commit -m "feat(runtime): sync declarative plugins at boot" +``` + +--- + +### Task 11: Add Plugin Management API + +**Files:** +- Modify: `app-instance/backend/beaver/interfaces/web/app.py` +- Test: `app-instance/backend/tests/unit/test_plugin_web_api.py` + +- [ ] **Step 1: Write failing API tests** + +Cover: + +```text +GET /api/plugins +POST /api/plugins/sync +POST /api/plugins/{plugin_id}/enable +POST /api/plugins/{plugin_id}/pause +POST /api/plugins/{plugin_id}/resume +POST /api/plugins/{plugin_id}/disable +POST /api/plugins/{plugin_id}/skills/{skill_name}/adopt +``` + +Assert `404` for unknown plugin, `409` for skill ownership conflict, and `400` for invalid +manifest/sync errors. Assert lock timeout maps to `409 plugin_write_busy`. Assert no +payload contains the real absolute workspace or external search-root path. Assert disable +without `{"disable_linked_skills": true}` is rejected. + +- [ ] **Step 2: Run tests and verify failure** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_web_api.py -q +``` + +Expected: FAIL with missing routes. 
+ +- [ ] **Step 3: Add normalized plugin payload helper** + +Return: + +```python +{ + "id": manifest.plugin_id, + "name": manifest.name, + "discovered_version": manifest.version, + "installed_version": state.installed_version, + "enabled": state.enabled, + "status": state.status, + "last_error": state.last_error, + "manifest_path": manifest.display_path, + "updates_paused": state.updates_paused, + "skills": [ + { + "name": declaration.name, + "status": binding.status, + "current_beaver_version": binding.current_beaver_version, + "accepted_upstream_tree_hash": binding.accepted_upstream_tree_hash, + "observed_upstream_tree_hash": binding.observed_upstream_tree_hash, + "accepted_beaver_version": binding.accepted_beaver_version, + "pending_candidate_id": binding.pending_candidate_id, + } + ], +} +``` + +Never return arbitrary plugin file content, secrets, or absolute server paths. + +- [ ] **Step 4: Implement routes** + +Each mutating endpoint boots one runtime, invokes its `plugin_manager`, and returns the +updated plugin payload. Map `ValueError` messages to stable HTTP status codes. + +- [ ] **Step 5: Run focused and existing web tests** + +```bash +cd app-instance/backend +pytest tests/unit/test_plugin_web_api.py tests/unit/test_skill_learning_web_api.py -q +``` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add app-instance/backend/beaver/interfaces/web/app.py app-instance/backend/tests/unit/test_plugin_web_api.py +git commit -m "feat(api): manage declarative plugins" +``` + +--- + +### Task 12: Add Plugin Management To The Skills UI + +**Files:** +- Modify: `app-instance/frontend/types/index.ts` +- Modify: `app-instance/frontend/lib/api.ts` +- Modify: `app-instance/frontend/app/(app)/skills/page.tsx` +- Test: `app-instance/frontend/lib/plugin-api.test.ts` + +- [ ] **Step 1: Write failing API client tests** + +Test URL, method, and response typing for list, sync, enable, pause, resume, disable, and +adopt. 
+ +- [ ] **Step 2: Run frontend test and verify failure** + +Run the repository's existing frontend test command targeting: + +```bash +cd app-instance/frontend +npx vitest run lib/plugin-api.test.ts +``` + +Expected: FAIL because plugin API functions do not exist. + +- [ ] **Step 3: Add frontend types** + +Add: + +```typescript +export interface PluginSkillBinding { + name: string; + status: string; + current_beaver_version?: string | null; + accepted_upstream_tree_hash?: string | null; + observed_upstream_tree_hash?: string | null; + accepted_beaver_version?: string | null; + pending_candidate_id?: string | null; +} + +export interface BeaverPlugin { + id: string; + name: string; + discovered_version?: string | null; + installed_version?: string | null; + enabled: boolean; + updates_paused: boolean; + status: string; + last_error?: string | null; + manifest_path?: string | null; + skills: PluginSkillBinding[]; +} +``` + +- [ ] **Step 4: Add API functions** + +Implement: + +```typescript +listPlugins() +syncPlugins() +enablePlugin(pluginId) +pausePlugin(pluginId) +resumePlugin(pluginId) +disablePlugin(pluginId, { disable_linked_skills: true }) +adoptPluginSkill(pluginId, skillName) +``` + +- [ ] **Step 5: Add a `plugins` Skills tab** + +Extend `SkillsTab` and render a compact table with: + +- plugin name and versions; +- enabled/status badges; +- linked skills and pending candidate link; +- icon buttons with tooltips for sync, enable, pause, resume, disable, and adopt; +- confirmation before disable/adopt; +- missing-source warning stating that current skills remain active but updates are + suspended; +- existing `runAction()` and error handling. + +Do not add a separate marketing-style page or nested cards. 
+ +- [ ] **Step 6: Label plugin-origin skills and update candidates** + +In existing Published/Candidates/Drafts views: + +- show `Plugin` source badge when `source_kind === "plugin"`; +- render `plugin_skill_update` as `插件升级合并 / Plugin update merge`; +- show `fast_forward` or `three_way` from candidate evidence/provenance. + +- [ ] **Step 7: Run frontend tests and type checks** + +```bash +cd app-instance/frontend +npx vitest run lib/plugin-api.test.ts +npm run lint +npx tsc --noEmit +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add app-instance/frontend/types/index.ts app-instance/frontend/lib/api.ts app-instance/frontend/lib/plugin-api.test.ts 'app-instance/frontend/app/(app)/skills/page.tsx' +git commit -m "feat(skills-ui): manage plugin skill mirrors" +``` + +--- + +### Task 13: Add End-To-End Lifecycle Coverage And Documentation + +**Files:** +- Create: `app-instance/backend/tests/integration/test_plugin_skill_lifecycle.py` +- Create: `docs/plugins/skill-plugins.md` +- Modify: `docs/product-discovery/beaver/README.md` + +- [ ] **Step 1: Write the end-to-end lifecycle test** + +The test must: + +1. create plugin `1.0.0`; +2. enable it and assert mirror `v0001`; +3. publish a normal learned local revision `v0002`; +4. replace the package with plugin `1.1.0`; +5. sync and assert one `three_way` candidate; +6. synthesize with a stub provider; +7. run safety and replay evaluation with a stub runner; +8. submit, approve, and publish `v0003`; +9. assert accepted upstream tree hash and provenance advanced; +10. rollback to `v0002`; +11. assert plugin source files were never modified; +12. update only a supporting file and assert a new update candidate is created; +13. simulate publish-observer failure and assert the next sync reconciles state; +14. remove the plugin package and assert the plugin is `missing` while the current skill + remains active; +15. run two sync processes and assert no duplicate version or candidate is created. 
+ +- [ ] **Step 2: Run the integration test and fix only lifecycle defects** + +```bash +cd app-instance/backend +pytest tests/integration/test_plugin_skill_lifecycle.py -v +``` + +Expected: PASS. + +- [ ] **Step 3: Write operator documentation** + +Document: + +- package layout and manifest; +- discovery roots; +- explicit enable requirement; +- mirror and three-way merge behavior; +- dual content/tree hashing and supporting-file merge conflicts; +- update candidate review flow; +- pause/resume versus disable/adopt; +- recovery from missing/invalid plugins; +- workspace locking, deferred boot sync, and publication reconciliation; +- why plugin Python code is not executed in V1. + +- [ ] **Step 4: Run the complete relevant backend suite** + +```bash +cd app-instance/backend +pytest \ + tests/unit/test_plugin_manifest.py \ + tests/unit/test_plugin_hashing.py \ + tests/unit/test_plugin_state.py \ + tests/unit/test_workspace_write_lock.py \ + tests/unit/test_plugin_skill_storage.py \ + tests/unit/test_plugin_skill_sync.py \ + tests/unit/test_plugin_skill_learning.py \ + tests/unit/test_plugin_runtime.py \ + tests/unit/test_plugin_web_api.py \ + tests/unit/test_skill_learning_candidate_state.py \ + tests/unit/test_skill_learning_pipeline.py \ + tests/unit/test_skill_learning_eval.py \ + tests/unit/test_skill_learning_worker.py \ + tests/unit/test_phase5_skills_runtime.py \ + tests/integration/test_plugin_skill_lifecycle.py \ + -q +``` + +Expected: PASS. + +- [ ] **Step 5: Run frontend verification** + +```bash +cd app-instance/frontend +npx vitest run lib/plugin-api.test.ts +npm run lint +npx tsc --noEmit +``` + +Expected: PASS. + +- [ ] **Step 6: Run a dirty-worktree-safe diff review** + +```bash +git status --short +git diff --check +git diff --stat +``` + +Expected: + +- no whitespace errors; +- only plugin/skill lifecycle files and planned docs/tests are included in this feature; +- unrelated pre-existing user changes remain untouched. 
+ +- [ ] **Step 7: Commit** + +```bash +git add app-instance/backend/tests/integration/test_plugin_skill_lifecycle.py docs/plugins/skill-plugins.md docs/product-discovery/beaver/README.md +git commit -m "docs(plugins): document skill mirror lifecycle" +``` + +--- + +## Release Sequence + +1. Ship backend manifest, state, snapshots, and initial mirror behind the Plugins API. +2. Enable update candidate generation after initial mirror tests pass in a real workspace. +3. Enable three-way synthesis and replay publish gates. +4. Ship the Plugins UI. +5. Keep executable plugin code disabled; design it separately with process isolation and + permission boundaries. + +## Rollout Metrics + +Track: + +- plugin discovery and manifest error count; +- initial mirror success/failure count; +- plugin update candidates created, superseded, rejected, and published; +- plugin update candidates caused by supporting-file-only changes; +- fast-forward versus three-way update ratio; +- write-lock contention and deferred boot sync count; +- publication reconciliation repair count; +- replay regression and preservation failure rate; +- time from upstream discovery to accepted publication; +- rollback count for plugin-origin versions. + +## Final Acceptance Test + +The feature is complete only when a plugin-origin skill can: + +1. be enabled and used with normal skill priority; +2. accumulate a normal Beaver-learned revision; +3. receive a newer upstream plugin version; +4. produce a three-way update draft without editing the plugin package; +5. pass the same safety, replay, review, and publish gates as ordinary skills; +6. retain full upstream and local provenance; +7. detect and publish supporting-file-only updates; +8. survive concurrent boot/sync without duplicate versions or candidates; +9. recover plugin state after observer failure; +10. remain active when its plugin package is temporarily missing; +11. 
be paused, resumed, rolled back, disabled, re-enabled, or adopted without data loss. diff --git a/docs/superpowers/specs/2026-06-15-plugin-skill-mirroring-design.md b/docs/superpowers/specs/2026-06-15-plugin-skill-mirroring-design.md new file mode 100644 index 0000000..68d6e2e --- /dev/null +++ b/docs/superpowers/specs/2026-06-15-plugin-skill-mirroring-design.md @@ -0,0 +1,409 @@ +# Beaver Plugin Skill Mirroring Design + +## Decision + +Beaver V1 plugins are declarative skill bundles. Enabling a plugin mirrors each declared +`SKILL.md` and its supporting files into `SkillSpecStore`. From that point onward, the +mirrored skill is a normal Beaver skill: + +- it has the same resolver priority as any workspace-managed skill; +- runtime activation, receipts, performance scoring, replay evaluation, review, publish, + rollback, and disable all use the existing skill lifecycle; +- self-learning only writes Beaver-managed versions and never edits the plugin package; +- plugin origin remains metadata, not a separate runtime class. + +Arbitrary in-process Python entrypoints, hooks, providers, and custom runtime code are +out of scope for this plan. Tool-providing plugins should continue to use MCP until a +separate executable-plugin security design is approved. + +## Why The Proposed Flow Is Correct + +The proposed "mirror, learn on the mirror, merge on plugin update, then evaluate" flow is +correct with one important refinement: plugin upgrades must be treated as a three-way +merge, not a two-document rewrite. + +The three inputs are: + +1. `B`, the last accepted upstream plugin snapshot; +2. `L`, the current Beaver-published skill, including local self-learning; +3. `U`, the newly discovered upstream plugin snapshot. + +This distinction prevents a plugin update from silently deleting local learning and +prevents local learning from silently discarding new upstream safety or workflow changes.
+ +## Package Contract + +Each plugin directory contains `beaver.plugin.json`: + +```json +{ + "schema_version": 1, + "id": "baoyu-comic", + "name": "Baoyu Comic", + "version": "1.2.0", + "skills": [ + { + "name": "baoyu-comic", + "path": "skills/baoyu-comic" + } + ] +} +``` + +Rules: + +- `id` and skill names use lowercase letters, digits, `_`, and `-`. +- Skill paths are relative to the plugin root and cannot escape it. +- Every skill directory must contain `SKILL.md`. +- Symlinks are rejected while mirroring. +- Two enabled plugins cannot own the same Beaver skill name. +- A plugin cannot overwrite an existing non-plugin workspace skill. +- Discovery does not enable a plugin. Enablement is an explicit admin action. + +## Storage Model + +Plugin packages remain outside the managed skill version tree: + +```text +workspace/ + plugins/ + baoyu-comic/ + beaver.plugin.json + skills/baoyu-comic/SKILL.md + .beaver/ + plugins/state.json + skills/ + baoyu-comic/ + skill.json + current.json + upstreams/ + baoyu-comic/ + <tree-hash>/ + upstream.json + SKILL.md + assets/... + versions/ + v0001/ + version.json + SKILL.md + assets/... +``` + +`upstreams/` stores immutable raw plugin snapshots. `versions/` stores runtime-visible +Beaver versions. A merged Beaver version may differ from its upstream snapshot. + +Every upstream snapshot has two hashes: + +- `skill_content_hash`: canonical hash of normalized `SKILL.md`; used by the LLM merge and + skill-content preservation checks. +- `skill_tree_hash`: hash of every regular file in the skill tree, including normalized + relative path, byte length, bytes, and executable-bit metadata. Symlinks are rejected. + This is the supply-chain identity used for update detection and state. + +The tree hash includes `SKILL.md`, templates, assets, examples, and scripts. Full Unix +mode bits are not hashed because umask and extraction tools can change them; only whether +any executable bit is set is normalized into the hash. 
Beaver metadata files such as +`version.json` and `upstream.json` are excluded. + +Every Beaver `SkillVersion` also stores a backward-compatible `tree_hash`. New versions +compute it from the complete promoted version directory. Older versions without the field +derive it on read, so `L.tree` is available for upgrade classification. + +Plugin state records: + +```json +{ + "plugins": { + "baoyu-comic": { + "enabled": true, + "installed_version": "1.2.0", + "manifest_path": "plugins/baoyu-comic/beaver.plugin.json", + "updates_paused": false, + "skills": { + "baoyu-comic": { + "accepted_upstream_tree_hash": "sha256...", + "observed_upstream_tree_hash": "sha256...", + "accepted_beaver_version": "v0003", + "current_beaver_version": "v0003", + "pending_candidate_id": null, + "status": "synced" + } + } + } + } +} +``` + +Skill versions and drafts also carry plugin provenance. State is operational metadata; +version provenance is the durable audit record. + +## Initial Enable Flow + +When an admin enables a valid plugin: + +1. Discover and validate the manifest. +2. Copy each declared skill into an immutable upstream snapshot. +3. Reject ownership/name conflicts before changing any skill. +4. Run the existing deterministic skill safety checker against an in-memory initial-mirror + draft and reject failed or critical results. +5. Publish an exact Beaver mirror as the next skill version. +6. Copy supporting files into that version. +7. Mark the skill `source_kind="plugin"` and active. +8. Record plugin ID, plugin version, source path, upstream hash, and mirror mode in + `SkillVersion.provenance`. +9. Update plugin state only after all declared skills succeed. + +Initial enable is an explicit trust action, so it does not require LLM synthesis. Manifest +validation, path validation, and the existing static skill safety checks still apply. + +All files are first written below a transaction staging directory on the same filesystem. 
+Only after manifest validation, tree hashing, conflict checks, and safety checks pass are +immutable upstream/version directories promoted with `os.replace()`. `current.json`, +`skill.json`, and indexes are atomically replaced under the workspace write lock; plugin +state is written last. A failed transaction may leave an unreferenced immutable directory, +which cleanup can remove, but it cannot make a partial version runtime-visible. + +For a new skill, the complete staged skill directory is promoted once. For an existing +skill, immutable directories and metadata are promoted first and `current.json` is +replaced last as the runtime visibility switch. This provides per-skill atomic visibility; +the workspace lock serializes writers across a multi-skill plugin operation. + +## Runtime Priority + +Mirrored plugin skills are loaded exclusively from `SkillSpecStore`. They are not supplied +through `SkillsLoader.extra_dirs`. + +This makes priority deterministic: + +1. active published workspace versions, including plugin-origin versions; +2. builtin skills. + +`source_kind="plugin"` is displayed for provenance but does not lower selection priority +or exclude the skill from self-learning. + +## Upgrade Classification + +For each linked skill, compare tree hashes, where `B` is the last accepted upstream snapshot, `L` is the current local Beaver version, and `U` is the newly observed upstream snapshot. The first matching row applies: + +| Condition | Action | +| --- | --- | +| `U.tree == B.tree` | No upstream change; no action. | +| `L.tree == U.tree` | Acknowledge the new upstream snapshot; no draft needed. | +| `L.tree == B.tree` and `U.tree != B.tree` | Create a deterministic `fast_forward` plugin update draft containing `U`. | +| `L.tree != B.tree` and `U.tree != B.tree` | Create a `three_way` plugin update candidate using `B`, `L`, and `U`. | + +Even the `fast_forward` case goes through safety, replay evaluation, review, and publish. +It skips LLM merge synthesis because there is no local divergence.
+ +Candidate IDs are deterministic: + +```text +plugin-update:<plugin-id>:<skill-name>:<new-upstream-hash-prefix> +``` + +This makes boot-time sync idempotent. + +Supporting files use a deterministic path-level three-way merge: + +- local unchanged from `B`: take `U`; +- upstream unchanged from `B`: keep `L`; +- both sides equal: keep either; +- a file added on only one side: keep it; +- divergent edits, delete-versus-edit, or different additions at the same path: record an + unresolved file conflict and block publication. + +The LLM merges only `SKILL.md`. It does not attempt to merge arbitrary or binary +supporting files. + +## Learning Integration + +Add candidate kind `plugin_skill_update`. Its evidence contains only references: + +```json +{ + "plugin_id": "baoyu-comic", + "plugin_version": "1.2.0", + "skill_name": "baoyu-comic", + "merge_mode": "three_way", + "base_upstream_tree_hash": "old-hash", + "new_upstream_tree_hash": "new-hash", + "local_version": "v0003" +} +``` + +The learning service resolves the actual snapshots from `SkillSpecStore`; raw skill +content is not duplicated into `learning-candidates.jsonl`. + +For `three_way`, the synthesizer receives: + +- old upstream `B`; +- current local skill `L`; +- new upstream `U`; +- relevant historical run evidence for `L`, when available. + +The synthesizer must return the merged skill plus explicit merge decisions: + +```json +{ + "frontmatter": {}, + "content": "...", + "change_reason": "...", + "preserved_local_sections": [], + "adopted_upstream_sections": [], + "resolved_conflicts": [], + "dropped_sections": [] +} +``` + +The generated draft uses `proposal_kind="plugin_skill_update"` and carries the complete +plugin merge provenance. 
+ +## Evaluation And Publish Gates + +The existing flow remains authoritative: + +```text +candidate + -> draft + -> static safety + -> replay eval + -> review + -> publish + -> rollback if needed +``` + +Replay eval compares: + +- baseline arm: current local version `L`; +- candidate arm: merged draft `M`. + +The preservation report is extended for plugin updates: + +- local preservation: important instructions from `L` are not silently removed; +- upstream adoption: new important instructions from `U` are represented; +- safety/tool preservation: Safety and Required Tools changes require explicit handling; +- unresolved conflicts cause evaluation failure. + +Publishing is blocked when: + +- static safety fails; +- replay evaluation regresses; +- confidence is low under the existing gate; +- local or upstream preservation fails; +- merge decisions contain unresolved `SKILL.md` conflicts; +- the supporting-file merge plan contains unresolved path/content conflicts. + +On publish, the pipeline notifies `PluginManager`, which advances +`accepted_upstream_tree_hash`, clears the pending candidate, and records the new Beaver +version. + +Observer delivery is not the source of truth. At the start of every sync, reconciliation +inspects the current published version provenance. If it contains a valid, newer +`plugin_skill_update` result and its upstream snapshot exists, plugin state is repaired: + +- advance `accepted_upstream_tree_hash`; +- advance `accepted_beaver_version`; +- clear the matching pending candidate; +- set status to `synced`. + +Reconciliation never moves `accepted_beaver_version` backwards after a runtime rollback. +An observer failure is audited but does not make an already-successful publish request +fail, which avoids client retries creating misleading duplicate operations. 
+ +## Concurrent And Failure Behavior + +- All plugin sync, skill version allocation/publication, plugin state mutation, and + learning-candidate mutation share a reentrant cross-process workspace write lock at + `.beaver/locks/plugin-skill-write.lock`. +- The lock uses the repository's existing `fcntl`/`msvcrt` pattern plus an in-process + reentrant guard. Nested store calls reuse the held lock instead of deadlocking. +- Candidate existence checks and JSONL writes happen inside the lock. +- Version-number allocation and version promotion happen inside the lock. +- Explicit enable/sync waits for the lock with a bounded timeout and returns a busy error + on timeout. +- Engine boot never calls an LLM. Its auto-sync uses a non-blocking lock attempt; when the + lock is busy, boot proceeds with the current published skills and reports sync deferred. +- Repeated and concurrent boot/sync is idempotent across processes, not only within one + Python object. +- If another active draft targets the same skill, the plugin update remains pending and + is not synthesized until the skill is free. +- If a newer plugin version appears while an older update is pending, the old candidate is + marked superseded and a new candidate is created against the last accepted upstream. +- Rejecting a draft preserves the plugin package, upstream snapshots, current skill, and + candidate audit history. Regeneration remains possible. +- Partial multi-skill plugin enable never promotes metadata/current pointers until every + staged skill passes validation. +- Plugin files are never modified by learning or publication. + +## Pause, Disable, Missing, And Adopt + +- Pausing updates suspends discovery-to-candidate sync while linked skills remain active. +- Resuming updates reconciles state and performs sync. +- Disabling a plugin is an explicit destructive runtime action: it pauses updates and + disables linked skills, but never deletes versions or upstream snapshots. 
The API + requires an explicit `disable_linked_skills=true` confirmation. +- Re-enabling restores linked skills and performs sync. +- A missing plugin package is a supply-chain status only. It marks the plugin `missing`, + suspends sync/update, and leaves the current Beaver skills active. +- An explicit `adopt` action detaches a skill from its plugin, changes + `source_kind` to `managed`, keeps the current version active, and prevents future plugin + updates from targeting it. + +## Management API And UI + +Backend endpoints: + +```text +GET /api/plugins +POST /api/plugins/sync +POST /api/plugins/{plugin_id}/enable +POST /api/plugins/{plugin_id}/pause +POST /api/plugins/{plugin_id}/resume +POST /api/plugins/{plugin_id}/disable +POST /api/plugins/{plugin_id}/skills/{skill_name}/adopt +``` + +API payloads never expose absolute server paths. Workspace manifests use workspace-relative +paths. External manifests use a redacted display path such as +`<external>/baoyu-comic/beaver.plugin.json`. + +The existing Skills page gains a Plugins tab showing: + +- discovered/enabled/missing/error state; +- installed and discovered plugin versions; +- declared skills and their current Beaver versions; +- sync state and pending learning candidate; +- enable, pause, resume, disable, sync, and adopt actions. + +Plugin-origin skills continue to appear in the normal Published, Candidates, and Drafts +tabs with provenance and merge-mode labels. + +## Non-Goals + +- Importing arbitrary plugin Python modules into the Beaver process. +- Plugin-defined hooks, providers, channels, or frontend bundles. +- Automatic downloading from a plugin marketplace. +- Automatically publishing plugin upgrades without review. +- Editing or rebasing plugin source files in place. + +## Acceptance Criteria + +1. Enabling a plugin mirrors all declared skills and supporting files into managed + versions. +2. Mirrored skills have the same resolver priority and learning eligibility as ordinary + workspace skills. 
+3. Self-learning never modifies the plugin package. +4. Plugin updates create idempotent `plugin_skill_update` candidates. +5. Local divergence triggers a three-way merge; no divergence triggers a deterministic + fast-forward draft. +6. Every plugin update passes the existing safety, replay, review, and publish gates. +7. Publishing advances plugin state and preserves complete provenance. +8. Pause, disable, missing package, rejection, restart, and newer-update races do not lose + the current skill or its learned versions; missing packages leave current skills active. +9. Existing non-plugin skills and legacy candidate payloads remain backward compatible. +10. Supporting-file-only updates change the upstream tree hash and create an update + candidate. +11. Concurrent boot, sync, and enable operations do not allocate duplicate versions or + append duplicate candidates. +12. Sync reconciliation repairs plugin state when a publish succeeds but its + observer/state update fails.