feat(app): 移除内置agents并添加CORS支持和技能上传优化

移除了agents/registry.json中的所有内置agents配置,将agents数组清空。
为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。
重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。
新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。
更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。
修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
2026-06-12 13:25:20 +08:00
parent fc9fd93c36
commit 8aeb97a5fc
76 changed files with 3382 additions and 553 deletions

View File

@ -1,145 +1,4 @@
{
"agents": [
{
"agent_id": "researcher",
"capabilities": [
"research",
"analysis",
"source review",
"requirements"
],
"created_at": "2026-05-11T03:13:06.912240+00:00",
"description": "Finds facts, references, constraints, and implementation options.",
"display_name": "Researcher",
"metadata": {},
"model": null,
"name": "researcher",
"priority": 50,
"provider_name": null,
"role": "research",
"skill_names": [],
"source": "builtin",
"status": "active",
"system_prompt": "You are a research specialist. Gather concise evidence and tradeoffs for the parent task.",
"tags": [
"planning",
"research"
],
"tool_hints": [],
"updated_at": "2026-05-11T03:13:06.912247+00:00"
},
{
"agent_id": "implementer",
"capabilities": [
"implementation",
"coding",
"refactor",
"integration"
],
"created_at": "2026-05-11T03:13:06.912250+00:00",
"description": "Builds scoped implementation slices and proposes concrete changes.",
"display_name": "Implementer",
"metadata": {},
"model": null,
"name": "implementer",
"priority": 45,
"provider_name": null,
"role": "implementation",
"skill_names": [],
"source": "builtin",
"status": "active",
"system_prompt": "You are an implementation specialist. Produce practical, scoped implementation output.",
"tags": [
"coding",
"build"
],
"tool_hints": [],
"updated_at": "2026-05-11T03:13:06.912251+00:00"
},
{
"agent_id": "reviewer",
"capabilities": [
"review",
"quality",
"risk",
"verification"
],
"created_at": "2026-05-11T03:13:06.912252+00:00",
"description": "Reviews plans, code, outputs, and risks before final synthesis.",
"display_name": "Reviewer",
"metadata": {},
"model": null,
"name": "reviewer",
"priority": 45,
"provider_name": null,
"role": "review",
"skill_names": [],
"source": "builtin",
"status": "active",
"system_prompt": "You are a review specialist. Focus on defects, missing requirements, and risks.",
"tags": [
"review",
"quality"
],
"tool_hints": [],
"updated_at": "2026-05-11T03:13:06.912253+00:00"
},
{
"agent_id": "tester",
"capabilities": [
"testing",
"verification",
"regression",
"qa"
],
"created_at": "2026-05-11T03:13:06.912255+00:00",
"description": "Designs and executes verification checks for task outputs.",
"display_name": "Tester",
"metadata": {},
"model": null,
"name": "tester",
"priority": 40,
"provider_name": null,
"role": "testing",
"skill_names": [],
"source": "builtin",
"status": "active",
"system_prompt": "You are a testing specialist. Identify focused checks and report pass/fail evidence.",
"tags": [
"test",
"quality"
],
"tool_hints": [],
"updated_at": "2026-05-11T03:13:06.912256+00:00"
},
{
"agent_id": "documenter",
"capabilities": [
"documentation",
"explanation",
"migration notes",
"release notes"
],
"created_at": "2026-05-11T03:13:06.912257+00:00",
"description": "Writes and reconciles user-facing and internal documentation updates.",
"display_name": "Documenter",
"metadata": {},
"model": null,
"name": "documenter",
"priority": 35,
"provider_name": null,
"role": "documentation",
"skill_names": [],
"source": "builtin",
"status": "active",
"system_prompt": "You are a documentation specialist. Produce concise docs aligned with the implementation.",
"tags": [
"docs",
"communication"
],
"tool_hints": [],
"updated_at": "2026-05-11T03:13:06.912258+00:00"
}
],
"agents": [],
"version": 1
}

View File

@ -7,6 +7,7 @@ import asyncio
import io
import mimetypes
import os
import re
import secrets
import shutil
import time
@ -49,9 +50,11 @@ from beaver.services.user_file_resolver import (
UserFileStorageResolver,
build_file_auth_context,
)
from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
from beaver.skills.authoring.format import parse_skill_rewrite_json
from beaver.skills.learning import SkillLearningService, SkillLearningWorker, SkillLearningWorkerConfig
from beaver.skills.learning.replay import ReplayRunner
from beaver.skills.catalog.utils import parse_frontmatter
from beaver.skills.catalog.utils import extract_required_tool_names, parse_frontmatter
from .deps import get_agent_service
from .files import (
@ -96,8 +99,11 @@ from .schemas import (
try:
from fastapi import FastAPI, File, Form, Header, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response
except ModuleNotFoundError: # pragma: no cover - fallback for skeleton-only environments
CORSMiddleware = None # type: ignore[assignment]
def File(default: Any = None) -> Any: # type: ignore[override]
return default
@ -274,6 +280,7 @@ async def _app_lifespan(
worker = SkillLearningWorker(
pipeline=loaded.skill_learning_pipeline, # type: ignore[arg-type]
provider_bundle_factory=lambda: attached_service._make_provider_bundle_for_task(loaded, {}), # noqa: SLF001
replay_runner_factory=lambda: ReplayRunner(agent_loop=attached_service.create_loop()),
config=worker_config,
)
worker_task = asyncio.create_task(worker.run_forever())
@ -516,6 +523,20 @@ def _self_restart_enabled() -> bool:
return os.getenv("BEAVER_ENABLE_SELF_RESTART", "1").strip() not in {"0", "false", "False"}
def _cors_allow_origins() -> list[str]:
    """Return the origins allowed to make cross-origin requests.

    Reads the comma-separated ``BEAVER_CORS_ALLOW_ORIGINS`` environment
    variable. Each entry is stripped of surrounding whitespace and trailing
    slashes. Entries that are empty after that normalization (for example a
    lone ``/``) and repeated entries are dropped, keeping first-seen order.

    Returns:
        The configured origins when the variable is set to a non-blank value
        (possibly an empty list if every entry was empty); otherwise a fixed
        list of local development origins.
    """
    raw = os.getenv("BEAVER_CORS_ALLOW_ORIGINS", "").strip()
    if raw:
        origins: list[str] = []
        for entry in raw.split(","):
            origin = entry.strip().rstrip("/")
            # Filter AFTER normalizing: "/" would otherwise become an empty
            # origin string in the allow-list.
            if origin and origin not in origins:
                origins.append(origin)
        return origins
    return [
        "http://127.0.0.1:3000",
        "http://localhost:3000",
        "http://127.0.0.1:3080",
        "http://localhost:3080",
        "http://127.0.0.1:3081",
        "http://localhost:3081",
    ]
def _schedule_self_restart(delay_seconds: float = 0.75) -> None:
import threading
@ -556,6 +577,14 @@ def create_app(
shutdown_force=shutdown_force,
),
)
if CORSMiddleware is not None:
app.add_middleware(
CORSMiddleware,
allow_origins=_cors_allow_origins(),
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.state.auth_tokens = {}
app.state.handoff_codes = {}
app.state.auth_file = Path(os.getenv("BEAVER_AUTH_FILE") or "")
@ -1992,13 +2021,19 @@ def create_app(
filename = file.filename or ""
if not filename.endswith(".zip"):
raise HTTPException(status_code=400, detail="File must be a .zip archive")
loaded = get_agent_service(request).create_loop().boot()
agent_service = get_agent_service(request)
loaded = agent_service.create_loop().boot()
try:
content = await file.read()
draft = _create_skill_upload_draft(loaded, filename, content)
draft_payload = _create_skill_upload_draft(loaded, filename, content)
draft = loaded.draft_service.get_draft(draft_payload["skill_name"], draft_payload["draft_id"])
if draft is not None:
await _rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename=filename)
draft = loaded.draft_service.get_draft(draft.skill_name, draft.draft_id) or draft
draft_payload = draft.to_dict()
except ValueError as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
return draft
return draft_payload
@app.get("/api/marketplaces/skills/search")
async def search_skillhub(
@ -2068,13 +2103,17 @@ def create_app(
@app.get("/api/skills/candidates")
async def list_skill_candidates(request: Request, status: str | None = None) -> list[dict[str, Any]]:
loaded = get_agent_service(request).create_loop().boot()
return [item.to_dict() for item in loaded.skill_learning_pipeline.list_candidates(status=status)] # type: ignore[union-attr]
return [
_skill_learning_candidate_payload(loaded, item)
for item in loaded.skill_learning_pipeline.list_candidates(status=status) # type: ignore[union-attr]
]
@app.get("/api/skills/candidates/{candidate_id}")
async def get_skill_candidate(candidate_id: str, request: Request) -> dict[str, Any]:
loaded = get_agent_service(request).create_loop().boot()
try:
return loaded.skill_learning_pipeline.get_candidate(candidate_id).to_dict() # type: ignore[union-attr]
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
return _skill_learning_candidate_payload(loaded, candidate)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
@ -2087,25 +2126,19 @@ def create_app(
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
if candidate.draft_skill_name and candidate.draft_id:
try:
return _skill_draft_payload(loaded, candidate.draft_skill_name, candidate.draft_id)
loaded.skill_learning_pipeline.get_draft(candidate.draft_skill_name, candidate.draft_id) # type: ignore[union-attr]
except ValueError:
pass
else:
return _skill_draft_payload(loaded, candidate.draft_skill_name, candidate.draft_id)
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
draft = await loaded.skill_learning_pipeline.synthesize_draft( # type: ignore[union-attr]
candidate_id,
provider_bundle=provider_bundle,
)
loaded.skill_learning_pipeline.check_safety(draft.skill_name, draft.draft_id) # type: ignore[union-attr]
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
draft.skill_name,
draft.draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
return draft.to_dict()
return _skill_draft_payload(loaded, draft.skill_name, draft.draft_id)
@app.post("/api/skills/candidates/{candidate_id}/regenerate")
async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
@ -2118,17 +2151,9 @@ def create_app(
candidate_id,
provider_bundle=provider_bundle,
)
loaded.skill_learning_pipeline.check_safety(draft.skill_name, draft.draft_id) # type: ignore[union-attr]
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
draft.skill_name,
draft.draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
return draft.to_dict()
return _skill_draft_payload(loaded, draft.skill_name, draft.draft_id)
@app.post("/api/skills/learning/run-once")
async def run_skill_learning_once(request: Request) -> dict[str, Any]:
@ -2185,17 +2210,31 @@ def create_app(
@app.post("/api/skills/{skill_name}/drafts/{draft_id}/submit")
async def submit_skill_draft(skill_name: str, draft_id: str, request: Request, payload: dict[str, Any] | None = None) -> dict[str, Any]:
loaded = get_agent_service(request).create_loop().boot()
agent_service = get_agent_service(request)
loop = agent_service.create_loop()
loaded = loop.boot()
try:
review = loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
skill_name,
draft_id,
requested_by=str((payload or {}).get("requested_by") or "web"),
notes=str((payload or {}).get("notes") or ""),
)
safety = loaded.skill_learning_pipeline.check_safety(skill_name, draft_id) # type: ignore[union-attr]
if safety.passed and safety.risk_level != "critical":
loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
skill_name,
draft_id,
requested_by=str((payload or {}).get("requested_by") or "web"),
notes=str((payload or {}).get("notes") or ""),
)
candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id)
if candidate_id is not None:
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
skill_name,
draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise _skill_draft_http_error(exc) from exc
return review.to_dict()
return _skill_draft_payload(loaded, skill_name, draft_id)
@app.post("/api/skills/{skill_name}/drafts/{draft_id}/approve")
async def approve_skill_draft(skill_name: str, draft_id: str, request: Request, payload: dict[str, Any] | None = None) -> dict[str, Any]:
@ -2719,47 +2758,70 @@ def _create_skill_upload_draft(loaded: Any, filename: str, content: bytes) -> di
if not file_infos:
raise ValueError("Zip archive is empty")
skill_entries = []
for info in file_infos:
parts = Path(info.filename.replace("\\", "/")).parts
if "__MACOSX" in parts or Path(info.filename).name == ".DS_Store":
continue
if info.filename.replace("\\", "/").startswith("/") or any(part in {"", ".", ".."} for part in parts):
raise ValueError(f"Unsafe archive entry: {info.filename}")
if parts[-1] == "SKILL.md":
if len(parts) not in (1, 2):
raise ValueError("SKILL.md must be at root or inside one top-level directory")
skill_entries.append(info.filename)
if not skill_entries:
raise ValueError("Zip must contain SKILL.md")
skill_entry = skill_entries[0]
top = Path(skill_entry).parts[0] if len(Path(skill_entry).parts) == 2 else ""
raw_skill = archive.read(skill_entry).decode("utf-8", errors="replace")
frontmatter, body = parse_frontmatter(raw_skill)
skill_name = str(frontmatter.get("name") or top or Path(filename).stem).strip().replace(" ", "-")
if not skill_name or "/" in skill_name or "\\" in skill_name or skill_name in {".", ".."}:
raise ValueError("Could not determine a safe skill name")
files: list[tuple[str, bytes]] = []
safe_entries: list[tuple[Any, str, tuple[str, ...]]] = []
for info in file_infos:
raw = info.filename.replace("\\", "/")
parts = Path(raw).parts
if "__MACOSX" in parts or Path(raw).name == ".DS_Store":
continue
if raw.startswith("/"):
if raw.startswith("/") or any(part in {"", ".", ".."} for part in parts):
raise ValueError(f"Unsafe archive entry: {info.filename}")
if top and parts and parts[0] != top:
raise ValueError("Zip archive must contain a single top-level skill directory")
rel_parts = parts[1:] if top and parts and parts[0] == top else parts
safe_entries.append((info, raw, tuple(parts)))
if _is_skill_markdown_entry(parts[-1]):
skill_entries.append(raw)
if not skill_entries:
raise ValueError("Zip must contain SKILL.md")
if len(skill_entries) > 1:
raise ValueError("Zip must contain exactly one SKILL.md")
skill_entry = skill_entries[0]
skill_root = tuple(Path(skill_entry).parts[:-1])
raw_skill = archive.read(skill_entry).decode("utf-8", errors="replace")
frontmatter, body = parse_frontmatter(raw_skill)
skill_name = str(frontmatter.get("name") or (skill_root[-1] if skill_root else "") or Path(filename).stem).strip().replace(" ", "-")
if not skill_name or "/" in skill_name or "\\" in skill_name or skill_name in {".", ".."}:
raise ValueError("Could not determine a safe skill name")
proposed_frontmatter = normalize_skill_frontmatter(
{
**dict(frontmatter),
"name": skill_name,
"description": frontmatter.get("description") or skill_name,
},
skill_name=skill_name,
)
proposed_frontmatter["tools"] = _merge_tool_names(
proposed_frontmatter.get("tools"),
extract_required_tool_names(body),
_infer_uploaded_skill_tools(
skill_name=skill_name,
filename=filename,
frontmatter=proposed_frontmatter,
content=body,
loaded=loaded,
),
)
proposed_content = ensure_canonical_skill_body(
body,
title=skill_name,
description=str(proposed_frontmatter.get("description") or ""),
tools=list(proposed_frontmatter.get("tools") or []),
)
files: list[tuple[str, bytes]] = []
for info, raw, parts in safe_entries:
if raw == skill_entry:
continue
if skill_root:
if parts[: len(skill_root)] != skill_root:
continue
rel_parts = parts[len(skill_root):]
else:
rel_parts = parts
if not rel_parts or any(part in {"", ".", ".."} for part in rel_parts):
raise ValueError(f"Unsafe archive entry: {info.filename}")
files.append(("/".join(rel_parts), archive.read(info)))
draft = loaded.draft_service.create_new_skill_draft(
skill_name=skill_name,
proposed_content=body,
proposed_frontmatter={
**dict(frontmatter),
"name": skill_name,
"description": frontmatter.get("description") or skill_name,
},
proposed_content=proposed_content,
proposed_frontmatter=proposed_frontmatter,
created_by="web-upload",
reason=f"Uploaded {filename}",
evidence_refs=[{"kind": "upload", "filename": filename, "files": sorted(path for path, _ in files)}],
@ -2784,6 +2846,162 @@ def _create_skill_upload_draft(loaded: Any, filename: str, content: bytes) -> di
return draft.to_dict()
def _is_skill_markdown_entry(filename: str) -> bool:
    """Tell whether an archive entry name is the skill manifest file.

    The comparison ignores surrounding whitespace and letter case, and accepts
    both ``skill.md`` and ``skills.md``.
    """
    normalized = filename.strip().lower()
    return normalized == "skill.md" or normalized == "skills.md"
def _merge_tool_names(*groups: Any) -> list[str]:
    """Merge several tool-name groups into one de-duplicated list.

    Args:
        *groups: Each group may be a comma-separated string, a list, a tuple,
            a set or a frozenset. Any other value (including ``None``) is
            ignored.

    Returns:
        The stripped, non-empty names in first-seen order. Items are converted
        with ``str()`` before comparison.
    """
    merged: list[str] = []
    seen: set[str] = set()
    for group in groups:
        if isinstance(group, str):
            raw_items = group.split(",")
        elif isinstance(group, (set, frozenset)):
            # Set iteration order depends on string hash randomization; sort
            # so the merged result is reproducible across processes.
            raw_items = sorted(group, key=str)
        elif isinstance(group, (list, tuple)):
            raw_items = list(group)
        else:
            continue
        for item in raw_items:
            cleaned = str(item).strip()
            if cleaned and cleaned not in seen:
                seen.add(cleaned)
                merged.append(cleaned)
    return merged
def _infer_uploaded_skill_tools(
    *,
    skill_name: str,
    filename: str,
    frontmatter: dict[str, Any],
    content: str,
    loaded: Any,
) -> list[str]:
    """Guess which runtime tools an uploaded skill relies on.

    The skill name, upload filename, JSON-serialized frontmatter and body are
    joined and lowercased into one search text. Tool names that appear in it
    as standalone identifiers are collected first (in sorted name order), then
    keyword heuristics append ``web_fetch`` / ``web_search``.

    Returns:
        Inferred tool names without duplicates.
    """
    available = _available_runtime_tool_names(loaded)
    serialized_frontmatter = json.dumps(frontmatter, ensure_ascii=False, sort_keys=True)
    text = "\n".join((skill_name, filename, serialized_frontmatter, content)).lower()

    # Pass 1: literal mentions of known tool names, not embedded in a longer
    # identifier (the lookarounds reject adjacent [a-z0-9_]).
    inferred: list[str] = [
        tool_name
        for tool_name in sorted(available or _COMMON_RUNTIME_TOOL_NAMES)
        if re.search(rf"(?<![a-z0-9_]){re.escape(tool_name.lower())}(?![a-z0-9_])", text)
    ]

    def add_if_available(*tool_names: str) -> None:
        # When the registry could be listed, only add tools it contains.
        for tool_name in tool_names:
            if available is None or tool_name in available:
                if tool_name not in inferred:
                    inferred.append(tool_name)

    # Pass 2: topic keywords that imply web access.
    keyword_rules = (
        (
            r"\b(weather|forecast|temperature|precipitation|rain|snow|humidity|wind|air quality|aqi)\b",
            ("web_fetch", "web_search"),
        ),
        (
            r"\b(latest|current|today|tomorrow|news|search|query|lookup|find online|web search)\b",
            ("web_search",),
        ),
        (
            r"\b(url|http|https|website|webpage|page|fetch|crawl|browser|online source)\b",
            ("web_fetch",),
        ),
    )
    for pattern, implied_tools in keyword_rules:
        if re.search(pattern, text):
            add_if_available(*implied_tools)
    return inferred
def _available_runtime_tool_names(loaded: Any) -> set[str] | None:
    """Collect the tool names exposed by ``loaded.tool_registry``.

    Returns:
        The set of ``spec.name`` values from ``registry.list_specs()``, or
        ``None`` when there is no registry or listing it raises.
    """
    registry = getattr(loaded, "tool_registry", None)
    if registry is None:
        return None
    names: set[str] = set()
    try:
        for spec in registry.list_specs():
            names.add(spec.name)
    except Exception:
        # Best effort: callers treat None as "registry unknown".
        return None
    return names
# Fallback tool names. Callers in this module use this set in place of the
# registry listing when `_available_runtime_tool_names(...)` returns None or
# an empty set (`available or _COMMON_RUNTIME_TOOL_NAMES`).
_COMMON_RUNTIME_TOOL_NAMES = {
"web_fetch",
"web_search",
"read_file",
"write_file",
"patch_file",
"search_files",
"list_directory",
"memory",
"terminal",
"process",
"execute_code",
"skill_view",
"skills_list",
"skill_manage",
"cron",
}
async def _rewrite_uploaded_skill_draft_with_llm(agent_service: Any, loaded: Any, draft: Any, *, filename: str) -> None:
    """Best-effort LLM rewrite of an uploaded skill draft into the canonical format.

    Asks the auxiliary provider (falling back to the main provider) to return
    JSON with ``frontmatter``, ``content`` and ``change_reason``. The reply is
    parsed, its tool list is merged with tools found in the content and with
    heuristically inferred tools, the body is forced into canonical form, and
    the updated draft is written through ``loaded.skill_spec_store``.

    This function never raises for ordinary errors. If no provider is
    available or the reply cannot be parsed it returns without changes. Any
    other failure is logged with its traceback and then ignored, so an upload
    is not rejected because the optional rewrite failed.

    Args:
        agent_service: Object providing ``_make_provider_bundle_for_task``.
        loaded: Booted runtime; ``tool_registry`` and ``skill_spec_store`` are read.
        draft: Draft to rewrite; mutated in place before it is written.
        filename: Name of the uploaded archive, passed to the model as context.
    """
    import logging  # function-scope import: the module's import block is outside this view

    try:
        provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {})  # noqa: SLF001
        provider = getattr(provider_bundle, "auxiliary_provider", None) or getattr(provider_bundle, "main_provider", None)
        runtime = getattr(provider_bundle, "auxiliary_runtime", None) or getattr(provider_bundle, "main_runtime", None)
        if provider is None:
            return
        available_tool_names = sorted(_available_runtime_tool_names(loaded) or _COMMON_RUNTIME_TOOL_NAMES)
        response = await provider.chat(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You rewrite uploaded Beaver skills into the required house style. "
                        "Return only JSON with keys: frontmatter, content, change_reason. "
                        "Do not include markdown fences."
                    ),
                },
                {
                    "role": "user",
                    "content": (
                        f"Uploaded filename: {filename}\n"
                        f"Skill name: {draft.skill_name}\n"
                        f"Current frontmatter:\n{json.dumps(draft.proposed_frontmatter, ensure_ascii=False, sort_keys=True)}\n\n"
                        f"Current content:\n{draft.proposed_content}\n\n"
                        f"Available runtime tool names:\n{json.dumps(available_tool_names, ensure_ascii=False)}\n\n"
                        f"{canonical_skill_format_instructions()}\n\n"
                        "Rewrite the skill so it is operational, concrete, and ready for review/publish. "
                        "Infer exact required runtime tools from the uploaded content when the workflow depends on tools. "
                        "Keep frontmatter.tools and the Required Tools section consistent."
                    ),
                },
            ],
            tools=None,
            model=getattr(runtime, "model", None),
            max_tokens=4096,
            temperature=0,
        )
        payload = parse_skill_rewrite_json(response.content or "", skill_name=draft.skill_name)
        if payload is None:
            return
        # Merge model-declared tools with those found in the rewritten body
        # and with the keyword heuristics.
        payload["frontmatter"]["tools"] = _merge_tool_names(
            payload["frontmatter"].get("tools"),
            extract_required_tool_names(payload["content"]),
            _infer_uploaded_skill_tools(
                skill_name=draft.skill_name,
                filename=filename,
                frontmatter=payload["frontmatter"],
                content=payload["content"],
                loaded=loaded,
            ),
        )
        payload["content"] = ensure_canonical_skill_body(
            payload["content"],
            title=str(payload["frontmatter"].get("name") or draft.skill_name),
            description=str(payload["frontmatter"].get("description") or ""),
            tools=list(payload["frontmatter"].get("tools") or []),
        )
        draft.proposed_frontmatter = payload["frontmatter"]
        draft.proposed_content = payload["content"]
        if payload.get("change_reason"):
            draft.reason = f"{draft.reason}; LLM rewrite: {payload['change_reason']}"
        loaded.skill_spec_store.write_draft(draft)
    except Exception:
        # Deliberately best-effort, but leave a trace so provider or storage
        # failures can be diagnosed.
        logging.getLogger(__name__).warning(
            "LLM rewrite of uploaded skill draft %r (from %r) failed; continuing without it",
            getattr(draft, "skill_name", None),
            filename,
            exc_info=True,
        )
        return
def _debug_runs_for_session(session_manager: Any, session_id: str) -> list[dict[str, Any]]:
grouped: dict[str, list[Any]] = {}
run_order: list[str] = []
@ -3559,6 +3777,39 @@ def _skill_detail_payload(loaded: Any, name: str, version: str | None) -> dict[s
}
# Build the API payload for a skill-learning candidate: `candidate.to_dict()`
# with its `evidence` entry enriched by the representative task text and a
# theme derived from that text.
def _skill_learning_candidate_payload(loaded: Any, candidate: Any) -> dict[str, Any]:
payload = candidate.to_dict()
# Work on a copy of the evidence mapping (or an empty dict when absent).
evidence = dict(payload.get("evidence") or {})
task_text = _skill_learning_candidate_task_text(loaded, candidate)
if task_text:
evidence["task_text"] = task_text
evidence["theme"] = SkillLearningService._task_theme(task_text)
payload["evidence"] = evidence
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# check below is nested under `if task_text:`. If it is not, evidence['theme']
# raises KeyError for a "new_skill" candidate without task text or a stored
# theme — confirm against the real file.
if candidate.kind == "new_skill":
payload["evidence_summary"] = f"Theme: {evidence['theme']}"
return payload
def _skill_learning_candidate_task_text(loaded: Any, candidate: Any) -> str:
    """Pick the task text that best represents a candidate.

    Order of preference: runs whose ``task_id`` equals the evidence's
    ``task_id``, then runs whose ``run_id`` is in the candidate's
    ``source_run_ids``, then the ``task_text`` stored in the evidence. The
    stored text is also used when the run store cannot be read.
    """
    evidence = candidate.evidence if isinstance(candidate.evidence, dict) else {}
    task_id = str(evidence.get("task_id") or "").strip()
    source_run_ids = set(candidate.source_run_ids or [])

    def stored_text() -> str:
        return str(evidence.get("task_text") or "").strip()

    try:
        runs = loaded.skill_learning_pipeline.learning_service.run_store.list_runs()
    except Exception:
        return stored_text()

    if task_id:
        same_task = [record for record in runs if record.task_id == task_id]
        if same_task:
            return SkillLearningService._representative_task_text(same_task)

    from_sources = [record for record in runs if record.run_id in source_run_ids]
    if from_sources:
        return SkillLearningService._representative_task_text(from_sources)
    return stored_text()
def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include_reviews: bool = False) -> dict[str, Any]:
draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr]
safety = loaded.skill_learning_pipeline.get_safety_report(skill_name, draft_id) # type: ignore[union-attr]
@ -3567,6 +3818,8 @@ def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include
**draft.to_dict(),
"safety_report": safety.to_dict() if safety is not None else None,
"eval_report": eval_report.to_dict() if eval_report is not None else None,
"target_version": _skill_draft_target_version(loaded, draft.skill_name, draft.proposal_kind),
"base_skill": _skill_draft_base_skill_payload(loaded, draft),
}
if include_reviews:
payload["reviews"] = [
@ -3576,6 +3829,45 @@ def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include
return payload
def _skill_draft_base_skill_payload(loaded: Any, draft: Any) -> dict[str, Any] | None:
    """Describe the published version a draft is based on.

    Returns:
        ``None`` for ``new_skill`` drafts, for drafts without a
        ``base_version``, or when the store has no such published version;
        otherwise a dict with that version's name, version, frontmatter,
        content, summary and tool hints.
    """
    is_new_skill = draft.proposal_kind == "new_skill"
    if is_new_skill or not draft.base_version:
        return None

    publisher = loaded.skill_learning_pipeline.publisher  # type: ignore[union-attr]
    published = publisher.store.read_published_skill(draft.skill_name, draft.base_version)
    if published is None:
        return None

    meta = published.version
    base_payload: dict[str, Any] = {}
    base_payload["skill_name"] = meta.skill_name
    base_payload["version"] = meta.version
    base_payload["frontmatter"] = dict(meta.frontmatter)
    base_payload["content"] = published.content
    base_payload["summary"] = meta.summary
    base_payload["tool_hints"] = list(meta.tool_hints)
    return base_payload
def _skill_draft_target_version(loaded: Any, skill_name: str, proposal_kind: str) -> str | None:
    """Compute the version label a draft would be published as.

    Args:
        loaded: Booted runtime; the publisher's store is asked for the
            existing versions of ``skill_name``.
        skill_name: Skill whose versions are inspected.
        proposal_kind: Draft kind; ``"retire_skill"`` has no target version.

    Returns:
        ``None`` for retire proposals, ``"v0001"`` when no numbered version
        exists, otherwise the highest ``vNNNN`` number plus one, zero-padded
        to four digits.
    """
    if proposal_kind == "retire_skill":
        return None
    store = loaded.skill_learning_pipeline.publisher.store  # type: ignore[union-attr]
    numbers = [
        int(item[1:])
        for item in store.list_versions(skill_name)
        # isdecimal(), not isdigit(): isdigit() accepts characters such as
        # "²" that int() rejects, which would raise ValueError here.
        if isinstance(item, str) and item.startswith("v") and item[1:].isdecimal()
    ]
    if not numbers:
        return "v0001"
    return f"v{max(numbers) + 1:04d}"
def _skill_learning_candidate_id_for_draft(loaded: Any, skill_name: str, draft_id: str) -> str | None:
    """Find the candidate that produced a given draft.

    Returns:
        The ``candidate_id`` of the first listed candidate whose
        ``draft_skill_name`` and ``draft_id`` both match, or ``None``.
    """
    matching_ids = (
        entry.candidate_id
        for entry in loaded.skill_learning_pipeline.list_candidates()  # type: ignore[union-attr]
        if entry.draft_skill_name == skill_name and entry.draft_id == draft_id
    )
    return next(matching_ids, None)
def _skill_versions_payload(loaded: Any, record: Any) -> list[dict[str, Any]]:
if record.source != "workspace":
return [

View File

@ -235,6 +235,12 @@ class SkillDraftEvalReport:
confidence: str = "low"
case_reports: list[dict[str, Any]] = field(default_factory=list)
tool_mode_summary: dict[str, Any] = field(default_factory=dict)
ability_score_summary: dict[str, Any] = field(default_factory=dict)
tool_execution_summary: dict[str, Any] = field(default_factory=dict)
case_selection_summary: dict[str, Any] = field(default_factory=dict)
real_score_avg: float | None = None
synthetic_score_avg: float | None = None
overall_score_avg: float | None = None
preservation_report: dict[str, Any] | None = None
def to_dict(self) -> dict[str, Any]:
@ -261,6 +267,12 @@ class SkillDraftEvalReport:
"confidence": self.confidence,
"case_reports": [dict(item) for item in self.case_reports],
"tool_mode_summary": dict(self.tool_mode_summary),
"ability_score_summary": dict(self.ability_score_summary),
"tool_execution_summary": dict(self.tool_execution_summary),
"case_selection_summary": dict(self.case_selection_summary),
"real_score_avg": self.real_score_avg,
"synthetic_score_avg": self.synthetic_score_avg,
"overall_score_avg": self.overall_score_avg,
"preservation_report": (
dict(self.preservation_report) if self.preservation_report is not None else None
),
@ -295,6 +307,12 @@ class SkillDraftEvalReport:
if isinstance(item, dict)
],
tool_mode_summary=dict(payload.get("tool_mode_summary") or {}),
ability_score_summary=dict(payload.get("ability_score_summary") or {}),
tool_execution_summary=dict(payload.get("tool_execution_summary") or {}),
case_selection_summary=dict(payload.get("case_selection_summary") or {}),
real_score_avg=_optional_bounded_float(payload.get("real_score_avg")),
synthetic_score_avg=_optional_bounded_float(payload.get("synthetic_score_avg")),
overall_score_avg=_optional_bounded_float(payload.get("overall_score_avg")),
preservation_report=(
dict(payload["preservation_report"])
if isinstance(payload.get("preservation_report"), dict)
@ -309,6 +327,12 @@ def _optional_str(value: Any) -> str | None:
return str(value)
def _optional_bounded_float(value: Any) -> float | None:
    """Convert ``value`` with ``_bounded_float``, keeping "missing" as ``None``.

    ``None`` and the empty string are treated as missing and yield ``None``;
    anything else is passed to ``_bounded_float`` with a default of ``0.0``.
    """
    is_missing = value in (None, "")
    return None if is_missing else _bounded_float(value, default=0.0)
def _bounded_float(value: Any, *, default: float = 0.0) -> float:
if value in (None, ""):
return default

View File

@ -0,0 +1,19 @@
"""Skill authoring helpers."""
from .format import (
CANONICAL_SKILL_SECTION_HEADINGS,
canonical_skill_format_instructions,
canonicalize_skill_body,
ensure_canonical_skill_body,
is_canonical_skill_body,
normalize_skill_frontmatter,
)
__all__ = [
"CANONICAL_SKILL_SECTION_HEADINGS",
"canonical_skill_format_instructions",
"canonicalize_skill_body",
"ensure_canonical_skill_body",
"is_canonical_skill_body",
"normalize_skill_frontmatter",
]

View File

@ -0,0 +1,250 @@
"""Canonical Beaver skill authoring format."""
from __future__ import annotations
import json
import re
from typing import Any
from beaver.skills.catalog.utils import extract_required_tool_names
# H2 headings of a canonical skill body, in required order. Used to render
# the format instructions and to check whether a body is already canonical.
CANONICAL_SKILL_SECTION_HEADINGS: tuple[str, ...] = (
"## Overview",
"## When to Use",
"## Required Tools",
"## Workflow",
"## Validation",
"## Boundaries",
"## Anti-Patterns",
)
def canonical_skill_format_instructions() -> str:
    """Render the prompt text that describes the canonical SKILL.md format.

    The required H2 headings are listed as bullets, in their required order,
    between instruction items 4 and 5.
    """
    headings = "\n".join(f"- {heading}" for heading in CANONICAL_SKILL_SECTION_HEADINGS)
    lines = [
        "Canonical Beaver SKILL.md format:",
        "1. Return a frontmatter object with `name`, `description`, and `tools`.",
        "2. `name` must be lowercase kebab-case. `description` must explain when the skill should be used.",
        "3. `tools` must be an explicit JSON array of exact runtime tool names. Use [] only if no tool is required.",
        "4. The Markdown content must start with one H1 title and include these H2 sections in this exact order:",
        headings,
        "5. Write concrete operational guidance, not a story about a past task.",
        "6. Include validation steps and anti-patterns so future runs know how to avoid false completion.",
    ]
    return "\n".join(lines)
def normalize_skill_frontmatter(frontmatter: dict[str, Any] | None, *, skill_name: str) -> dict[str, Any]:
    """Return a frontmatter dict that starts with ``name``, ``description``, ``tools``.

    ``name`` is the slug of the given name (or of ``skill_name`` when absent),
    ``description`` falls back to a generic sentence, and ``tools`` is coerced
    to a list of strings. Every other key is carried over in its original
    order; string values of ``always`` and ``internal`` become booleans.
    """
    raw = dict(frontmatter or {})
    name = _slug(str(raw.get("name") or skill_name))
    description = str(raw.get("description") or f"Use when {name} guidance is needed.").strip()
    tools = _coerce_string_list(raw.get("tools"))

    result: dict[str, Any] = {
        "name": name,
        "description": description,
        "tools": tools,
    }
    reserved_keys = ("name", "description", "tools")
    boolean_keys = ("always", "internal")
    for key, value in raw.items():
        if key in reserved_keys:
            continue
        if key in boolean_keys and isinstance(value, str):
            # Textual flags such as "yes" / "on" count as True.
            result[key] = value.strip().lower() in {"1", "true", "yes", "on"}
        else:
            result[key] = value
    return result
def is_canonical_skill_body(body: str) -> bool:
    """Check whether a Markdown body already follows the canonical layout.

    A body is canonical when it contains an H1 line and every heading in
    ``CANONICAL_SKILL_SECTION_HEADINGS`` appears, in that order, as a line of
    its own. Trailing spaces, tabs and a carriage return are tolerated.

    Matching whole lines means an ``### Overview`` sub-heading or a sentence
    that mentions ``## Workflow`` is not mistaken for the required H2 section.
    """
    text = body.strip()
    if not re.search(r"^#\s+\S", text, flags=re.MULTILINE):
        return False
    position = 0
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        # `^` still anchors to real line starts when searching from an offset.
        pattern = re.compile(rf"^{re.escape(heading)}[ \t]*\r?$", flags=re.MULTILINE)
        match = pattern.search(text, position)
        if match is None:
            return False
        position = match.end()
    return True
def ensure_canonical_skill_body(
    body: str,
    *,
    title: str,
    description: str = "",
    tools: list[str] | None = None,
) -> str:
    """Return ``body`` in canonical form, wrapping it when it is not already.

    A body that is already canonical is stripped, has its Required Tools
    section replaced when ``tools`` is non-empty, and gets one trailing
    newline. Any other body is compacted and embedded as source guidance
    inside a generated canonical skeleton with generic steps.
    """
    if not is_canonical_skill_body(body):
        source = _compact_source_guidance(body)
        generic_workflow = [
            "Identify whether the user's request matches the skill's trigger conditions.",
            "Read the relevant source guidance below and apply only the steps that fit the current task.",
            "Use the required tools deliberately and keep tool output tied to the user's goal.",
        ]
        generic_validation = [
            "Verify the requested outcome with the most direct available check.",
            "Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.",
        ]
        generic_boundaries = [
            "Do not broaden the task beyond the user's request.",
            "Do not use tools that are not listed or clearly available in the current runtime.",
        ]
        generic_anti_patterns = [
            "Do not summarize the skill instead of applying it.",
            "Do not claim completion without validation evidence.",
        ]
        return canonicalize_skill_body(
            title=title,
            overview=description or source or f"Use this skill for {title}.",
            tools=list(tools or []),
            workflow=generic_workflow,
            validation=generic_validation,
            boundaries=generic_boundaries,
            anti_patterns=generic_anti_patterns,
            source_guidance=source,
        )

    canonical = body.strip()
    if tools:
        canonical = _replace_required_tools_section(canonical, tools)
    return canonical + "\n"
def canonicalize_skill_body(
    *,
    title: str,
    overview: str,
    tools: list[str] | None = None,
    workflow: list[str] | None = None,
    validation: list[str] | None = None,
    boundaries: list[str] | None = None,
    anti_patterns: list[str] | None = None,
    when_to_use: list[str] | None = None,
    source_guidance: str = "",
) -> str:
    """Render a skill body in the canonical section layout.

    Sections are emitted in a fixed order: Overview, When to Use, Required
    Tools, Workflow (optionally followed by a "Source Guidance" subsection),
    Validation, Boundaries, Anti-Patterns. Any list argument that is empty or
    ``None`` falls back to a single generic bullet.

    Returns:
        The rendered Markdown, ending with one newline.
    """
    heading = _title(title)
    guidance = source_guidance.strip()
    guidance_block = f"\n\n### Source Guidance\n\n{guidance}" if guidance else ""
    overview_text = overview.strip() or f"Use this skill for {heading}."

    sections = [
        ("## Overview", overview_text),
        (
            "## When to Use",
            _bullet_lines(when_to_use or [f"Use when the task requires {heading} guidance."]),
        ),
        ("## Required Tools", _tool_lines(tools or [])),
        (
            "## Workflow",
            _bullet_lines(workflow or ["Follow the workflow described by the current task and evidence."])
            + guidance_block,
        ),
        (
            "## Validation",
            _bullet_lines(validation or ["Validate the result before reporting completion."]),
        ),
        (
            "## Boundaries",
            _bullet_lines(boundaries or ["Stay within the current task and workspace boundaries."]),
        ),
        ("## Anti-Patterns", _bullet_lines(anti_patterns or ["Do not skip validation."])),
    ]
    rendered = "\n\n".join(f"{name}\n\n{content}" for name, content in sections)
    return f"# {heading}\n\n{rendered}\n"
def parse_skill_rewrite_json(content: str, *, skill_name: str) -> dict[str, Any] | None:
    """Parse a JSON skill-rewrite payload into normalized frontmatter and body.

    The payload must be a JSON object with a dict ``frontmatter`` and a string
    ``content``; an enclosing Markdown code fence is removed first when both
    the first and last lines are fence lines.

    Returns:
        A dict with ``frontmatter``, ``content`` and ``change_reason``, or
        ``None`` when the text is not valid JSON or has the wrong shape.
    """
    text = content.strip()
    if text.startswith("```"):
        fence_lines = text.splitlines()
        is_fenced = (
            len(fence_lines) >= 3
            and fence_lines[0].startswith("```")
            and fence_lines[-1].startswith("```")
        )
        if is_fenced:
            text = "\n".join(fence_lines[1:-1]).strip()

    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None

    raw_frontmatter = payload.get("frontmatter")
    raw_body = payload.get("content")
    if not (isinstance(raw_frontmatter, dict) and isinstance(raw_body, str)):
        return None

    frontmatter = normalize_skill_frontmatter(raw_frontmatter, skill_name=skill_name)
    # Tools named in the body's Required Tools section supplement the
    # frontmatter list.
    frontmatter["tools"] = _merge_string_lists(
        frontmatter.get("tools"),
        extract_required_tool_names(raw_body),
    )
    canonical_body = ensure_canonical_skill_body(
        raw_body,
        title=frontmatter["name"],
        description=frontmatter["description"],
        tools=frontmatter["tools"],
    )
    return {
        "frontmatter": frontmatter,
        "content": canonical_body,
        "change_reason": str(payload.get("change_reason") or ""),
    }
def _compact_source_guidance(body: str, *, max_chars: int = 20000) -> str:
    """Condense raw skill text so it can be embedded as a guidance subsection.

    Removes a leading ``---`` frontmatter block, collapses runs of three or
    more newlines to a blank line, pushes level 1-4 headings two levels
    deeper, and truncates the result to *max_chars* characters.
    """
    stripped = body.strip()
    if stripped == "":
        return ""
    without_frontmatter = re.sub(r"^---\n.*?\n---\n?", "", stripped, flags=re.DOTALL).strip()
    collapsed = re.sub(r"\n{3,}", "\n\n", without_frontmatter)
    # Demote headings so they nest below the section that embeds this text.
    demoted = re.sub(r"^(#{1,4})\s+", r"##\1 ", collapsed, flags=re.MULTILINE)
    return demoted[:max_chars].rstrip()
def _tool_lines(tools: list[str]) -> str:
    """Render tool names as backtick-quoted Markdown bullets.

    An empty list yields a single placeholder bullet instead.
    """
    if tools:
        return "\n".join("- `{}`".format(tool) for tool in tools)
    return "- No dedicated tools are required."
def _bullet_lines(items: list[str]) -> str:
    """Render non-blank items as Markdown bullets, one per line.

    Items are stringified and stripped; if nothing remains a placeholder
    bullet is returned.
    """
    stripped_items = (str(item).strip() for item in items)
    bullets = ["- " + text for text in stripped_items if text]
    if bullets:
        return "\n".join(bullets)
    return "- No additional guidance."
def _coerce_string_list(value: Any) -> list[str]:
    """Coerce *value* into a de-duplicated list of non-empty stripped strings.

    Lists are used as-is, strings are split on commas, and anything else
    produces an empty list. First-seen order is preserved.
    """
    if isinstance(value, list):
        candidates = value
    elif isinstance(value, str):
        candidates = value.split(",")
    else:
        candidates = []

    # An insertion-ordered dict doubles as an ordered set.
    unique: dict[str, None] = {}
    for candidate in candidates:
        text = str(candidate).strip()
        if text:
            unique.setdefault(text, None)
    return list(unique)
def _merge_string_lists(*values: Any) -> list[str]:
    """Merge several list-like values into one de-duplicated string list.

    Each value is coerced with ``_coerce_string_list``; first-seen order
    across all values is preserved.
    """
    merged: dict[str, None] = {}
    for value in values:
        for entry in _coerce_string_list(value):
            merged.setdefault(entry, None)
    return list(merged)
def _replace_required_tools_section(body: str, tools: list[str]) -> str:
    """Overwrite the first ``## Required Tools`` section of *body* with *tools*.

    Args:
        body: Skill body text (Markdown).
        tools: Tool names rendered via ``_tool_lines``.

    Returns:
        The stripped body with the section replaced, or the stripped body
        unchanged when no such section exists.
    """
    stripped = body.strip()
    replacement = "## Required Tools\n\n" + _tool_lines(tools) + "\n\n"
    # Pass the replacement through a callable so it is inserted literally.
    # A plain string is treated as a template, where a backslash in a tool
    # name would be read as an escape/group reference and could raise
    # ``re.error`` or corrupt the output.
    updated, count = re.subn(
        r"(?ms)^##\s+Required\s+Tools\s*\n.*?(?=^##\s+|\Z)",
        lambda _match: replacement,
        stripped,
        count=1,
    )
    return updated.strip() if count else stripped
def _slug(value: str) -> str:
    """Turn *value* into a lowercase hyphen-separated slug.

    Every run of characters outside ``a-z0-9-`` becomes a hyphen, repeated
    hyphens are collapsed, and edge hyphens are trimmed. An empty result
    falls back to ``"generated-skill"``.
    """
    lowered = value.strip().lower()
    hyphenated = re.sub(r"[^a-z0-9-]+", "-", lowered)
    slug = re.sub(r"-{2,}", "-", hyphenated).strip("-")
    if slug:
        return slug
    return "generated-skill"
def _title(value: str) -> str:
    """Build a title-cased heading from *value*, treating hyphens as spaces.

    Falsy or blank input yields ``"Generated Skill"``.
    """
    words = str(value or "").strip().replace("-", " ")
    if not words:
        return "Generated Skill"
    return words.title()

View File

@ -28,12 +28,13 @@ Choose `new_task` when the user asks for anything that needs the main Task agent
The Intent Agent has no tools. If a request needs a tool, do not apologize and do not say you cannot access it. Route it to Task mode so the main agent can use tools.
When there is an active task, do not force every new user message into that task. Use the active task and recent conversation to decide:
When there is an active task, do not force every new user message into that task. A Session is the durable conversation/device/group context; a Task is one unit of work inside that Session. Use the active task and recent conversation to decide:
- Choose `revise_task` when the user asks to change, correct, refine, expand, reformat, or redo the latest active task result.
- Choose `continue_task` for neutral follow-up questions or additional next steps that still belong to the active task.
- Choose `continue_task` for neutral follow-up questions or additional next steps that explicitly depend on or extend the active task's latest result.
- Choose `simple_chat` for unrelated lightweight conversation. This starts a new topic and the previous task will be accepted automatically.
- Choose `new_task` when the user asks for clearly unrelated work that needs Task capabilities. This starts a new topic and the previous task will be accepted automatically.
- Choose `new_task` for a standalone tool-dependent request even when it resembles the active task. Repeating "珠海天气怎么样" later is a fresh task unless the user clearly says to continue or revise the old result.
- Choose `close_task` when the user says the task is satisfactory or finished, such as "可以了", "就这样", or "that's good".
- Choose `abandon_task` when the user says to stop, cancel, or no longer do the active task.
@ -46,6 +47,7 @@ Examples with an active weather task:
- "再详细一点" -> `revise_task`
- "加上明后天穿衣建议" -> `revise_task`
- "顺便查一下深圳" -> `continue_task`
- "珠海天气怎么样" -> `new_task` when asked as a standalone later request
- "帮我写一个采购合同" -> `new_task`
- "吃饭没" -> `simple_chat`
- "我在冰岛" -> `simple_chat`

View File

@ -27,6 +27,7 @@ from beaver.skills.specs.storage import SkillSpecStore
from .utils import (
check_requirements,
escape_xml,
extract_required_tool_names,
get_missing_requirements,
parse_frontmatter,
parse_skill_metadata_blob,
@ -111,13 +112,19 @@ class SkillsLoader:
if not include_internal and _truthy(frontmatter.get("internal")):
continue
normalized_frontmatter = dict(frontmatter)
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
record = SkillRecord(
name=name,
path=skill_file,
source=source,
version="legacy",
source_kind=source,
tool_hints=self._coerce_tool_names(frontmatter.get("tools")),
tool_hints=self._merge_tool_names(
self._coerce_tool_names(frontmatter.get("tools")),
self._coerce_tool_names(meta_blob.get("tools")),
self._coerce_tool_names(meta_blob.get("required_tools")),
extract_required_tool_names(body),
),
frontmatter=normalized_frontmatter,
description=str(frontmatter.get("description") or summarize_body(body) or name),
)
@ -138,6 +145,7 @@ class SkillsLoader:
path = self.workspace_skills / name / "SKILL.md"
else:
path = self.workspace_skills / name / "versions" / loaded.version.version / "SKILL.md"
_frontmatter, body = parse_frontmatter(loaded.content)
record = SkillRecord(
name=name,
path=path,
@ -146,7 +154,10 @@ class SkillsLoader:
content_hash=loaded.version.content_hash,
source_kind=str(loaded.version.provenance.get("source_kind") or "workspace"),
status=str(loaded.version.review_state or "published"),
tool_hints=list(loaded.version.tool_hints),
tool_hints=self._merge_tool_names(
loaded.version.tool_hints,
extract_required_tool_names(body),
),
frontmatter=dict(loaded.version.frontmatter),
description=str(loaded.version.frontmatter.get("description") or loaded.version.summary or name),
)
@ -201,23 +212,32 @@ class SkillsLoader:
- read_file
- search_files
- 兼容 metadata JSON blob 里的 `tools`
- 兼容 canonical 正文 `## Required Tools` 段落
"""
record = self._find_record(name)
if record is not None and record.tool_hints:
return list(record.tool_hints)
frontmatter = self.get_skill_metadata(name) or {}
content = self.load_published_skill(name) or self.load_skill(name) or ""
frontmatter, body = parse_frontmatter(content)
frontmatter = frontmatter or self.get_skill_metadata(name) or {}
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
names = [
*self._coerce_tool_names(frontmatter.get("tools")),
*self._coerce_tool_names(meta_blob.get("tools")),
*self._coerce_tool_names(meta_blob.get("required_tools")),
]
names = self._merge_tool_names(
self._coerce_tool_names(frontmatter.get("tools")),
self._coerce_tool_names(meta_blob.get("tools")),
self._coerce_tool_names(meta_blob.get("required_tools")),
extract_required_tool_names(body),
)
return names
@staticmethod
def _merge_tool_names(*groups: Any) -> list[str]:
result: list[str] = []
for item in names:
if item and item not in result:
result.append(item)
for group in groups:
for item in SkillsLoader._coerce_tool_names(group):
if item and item not in result:
result.append(item)
return result
def load_skills_for_context(self, skill_names: list[str]) -> str:

View File

@ -84,6 +84,41 @@ def strip_frontmatter(content: str) -> str:
return body
def extract_required_tool_names(body: str) -> list[str]:
    """Extract tool names from the ``## Required Tools`` section of a skill body.

    This is a tolerant supplement to the frontmatter ``tools`` field; it does
    not guess tools from arbitrary prose. Only bullet lines (``-`` or ``*``)
    inside the explicitly named section are read. Backtick code spans are
    preferred; otherwise the bullet is split on commas and the first word of
    each item is taken.

    Prose placeholder bullets such as "- No dedicated tools are required.",
    "- None" or "- N/A" are skipped so they are not mistaken for a tool
    called ``No``/``None``.

    Args:
        body: Skill body text (Markdown, frontmatter already removed).

    Returns:
        Tool names in first-seen order without duplicates; empty when the
        section is missing or lists no tools.
    """
    if not body:
        return []
    match = re.search(
        r"(?ims)^##\s+Required\s+Tools\s*$\n(?P<section>.*?)(?=^##\s+|\Z)",
        body,
    )
    if match is None:
        return []
    names: list[str] = []
    for line in match.group("section").splitlines():
        stripped = line.strip()
        if not stripped.startswith(("-", "*")):
            continue
        candidate = stripped[1:].strip()
        code_matches = re.findall(r"`([^`]+)`", candidate)
        # A bullet without code spans that opens with "no"/"none"/"n/a" as a
        # standalone word is a "nothing required" sentence, not a tool list.
        if not code_matches and re.match(r"(?i)(?:none|no|n/a)(?=\s|\.?$)", candidate):
            continue
        raw_items = code_matches or re.split(r"[,]", candidate)
        for raw_item in raw_items:
            name = raw_item.strip().strip("`\"' ")
            if not name:
                continue
            # Keep only the leading word so "read_file: reads files" -> "read_file".
            token = name.split()[0].strip("`\"' :-")
            if re.fullmatch(r"[A-Za-z0-9_.:-]+", token) and token not in names:
                names.append(token)
    return names
def parse_skill_metadata_blob(raw: str) -> dict[str, Any]:
"""解析 metadata 字段里的 JSON 扩展配置。

View File

@ -2,6 +2,8 @@
from __future__ import annotations
import json
from typing import Any
from uuid import uuid4
from beaver.engine.context import SkillContext
@ -39,7 +41,16 @@ class SkillDraftEvaluator:
return self._skipped(candidate, draft)
runs = self.run_store.list_runs()
replay_cases = select_replay_cases(candidate, runs)
if replay_runner is not None:
replay_cases, case_selection_meta = await _prepare_eval_cases(
candidate=candidate,
draft=draft,
historical_cases=select_replay_cases(candidate, runs),
provider_bundle=provider_bundle,
)
else:
replay_cases = []
case_selection_meta = {}
if replay_runner is not None and replay_cases:
return await self._evaluate_replay(
candidate=candidate,
@ -47,6 +58,7 @@ class SkillDraftEvaluator:
replay_cases=replay_cases,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
case_selection_meta=case_selection_meta,
)
return self._evaluate_heuristic(candidate, draft, runs)
@ -58,7 +70,7 @@ class SkillDraftEvaluator:
) -> SkillDraftEvalReport:
runs_by_id = {record.run_id: record for record in runs}
cases: list[dict] = []
for run_id in candidate.source_run_ids[:8]:
for run_id in candidate.source_run_ids[:10]:
record = runs_by_id.get(run_id)
if record is None:
continue
@ -116,6 +128,7 @@ class SkillDraftEvaluator:
replay_cases: list[dict],
provider_bundle: ProviderBundle,
replay_runner: ReplayRunner,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
case_reports: list[dict] = []
legacy_cases: list[dict] = []
@ -147,17 +160,43 @@ class SkillDraftEvaluator:
baseline=baseline,
candidate=candidate_arm,
)
baseline_score = surrogate["baseline_score"]
candidate_score = surrogate["candidate_score"]
baseline_ability = _ability_score(
case=case,
arm=baseline,
arm_name="baseline",
)
candidate_ability = _ability_score(
case=case,
arm=candidate_arm,
arm_name="candidate",
)
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tool_execution_score = {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
}
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": tool_execution_score,
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
@ -172,13 +211,23 @@ class SkillDraftEvaluator:
{
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
)
preservation_report = _preservation_report(candidate, draft)
return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report)
return _report_from_case_reports(
candidate,
draft,
case_reports,
legacy_cases,
preservation_report,
case_selection_meta or {},
)
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
return SkillDraftEvalReport(
@ -238,22 +287,400 @@ def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
async def _prepare_eval_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Assemble up to ten evaluation cases plus a summary of how they were chosen.

    Explicit cases from the candidate come first, then historical cases;
    duplicates and synthetic cases lacking a validator are dropped. Any
    shortfall below ten is filled with provider-generated synthetic cases and,
    if that is still not enough, with deterministic fallback cases.

    Returns:
        ``(cases, meta)`` where ``cases`` holds at most ten entries and
        ``meta`` records the selection counts.
    """
    explicit_cases = _explicit_eval_cases(candidate)
    usable, excluded = _filter_unscorable_cases(
        _dedupe_cases([*explicit_cases, *historical_cases])
    )
    shortfall = max(0, 10 - len(usable))

    synthetic: list[dict[str, Any]] = []
    if shortfall:
        raw_generated = await _generate_synthetic_cases(
            candidate=candidate,
            draft=draft,
            historical_cases=usable,
            provider_bundle=provider_bundle,
            count=shortfall,
        )
        synthetic, dropped = _filter_unscorable_cases(raw_generated)
        excluded["synthetic_without_validator"] += dropped["synthetic_without_validator"]
        still_missing = shortfall - len(synthetic)
        if still_missing > 0:
            # Generation failed or under-delivered: top up deterministically.
            synthetic.extend(
                _fallback_synthetic_cases(
                    candidate=candidate,
                    historical_cases=usable,
                    start_index=len(synthetic) + 1,
                    count=still_missing,
                )
            )

    prepared = [*usable, *synthetic]
    selection_meta = {
        "requested_case_count": 10,
        "historical_case_count": len(historical_cases),
        "explicit_case_count": len(explicit_cases),
        "generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
        "excluded_synthetic_without_validator": excluded["synthetic_without_validator"],
    }
    return prepared[:10], selection_meta
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
    """Build evaluation cases from ``candidate.evidence["eval_cases"]``.

    Entries that are not dicts or have no task text are ignored. Missing
    fields get defaults derived from the candidate and the entry's position
    (1-based).
    """
    evidence = candidate.evidence
    declared = evidence.get("eval_cases") if isinstance(evidence, dict) else None
    if not isinstance(declared, list):
        return []

    cases: list[dict[str, Any]] = []
    for position, entry in enumerate(declared, start=1):
        if not isinstance(entry, dict):
            continue
        prompt = str(entry.get("task_text") or "").strip()
        if not prompt:
            continue
        is_synthetic = bool(entry.get("synthetic"))
        case = {
            "run_id": str(entry.get("run_id") or f"explicit:{candidate.candidate_id}:{position:02d}"),
            "task_id": entry.get("task_id") or f"explicit-{position:02d}",
            "session_id": entry.get("session_id") or "explicit-eval",
            "task_text": prompt,
            "baseline_skill_names": list(entry.get("baseline_skill_names") or _baseline_skill_names(candidate)),
            "candidate_skill_name": entry.get("candidate_skill_name") or candidate.draft_skill_name,
            "accepted_score": _bounded_score(entry.get("accepted_score"), default=0.75),
            "synthetic": is_synthetic,
            "tier": entry.get("tier") or ("bronze" if is_synthetic else "gold"),
        }
        validator = entry.get("validator")
        if isinstance(validator, dict):
            # Copy so later edits to the case do not touch the evidence.
            case["validator"] = dict(validator)
        cases.append(case)
    return cases
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop duplicate cases, keeping the first occurrence of each identity.

    A case is identified by its ``run_id`` or, when that is empty, by its
    ``task_text``. Cases with neither are discarded.
    """
    first_by_identity: dict[str, dict[str, Any]] = {}
    for case in cases:
        identity = str(case.get("run_id") or "") or str(case.get("task_text") or "")
        if identity and identity not in first_by_identity:
            first_by_identity[identity] = case
    return list(first_by_identity.values())
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Separate out synthetic cases that carry no validator dict.

    Returns:
        ``(kept, excluded)`` where ``excluded`` counts the removed cases under
        the ``synthetic_without_validator`` key.
    """

    def _is_unscorable(case: dict[str, Any]) -> bool:
        return bool(case.get("synthetic")) and not isinstance(case.get("validator"), dict)

    kept = [case for case in cases if not _is_unscorable(case)]
    return kept, {"synthetic_without_validator": len(cases) - len(kept)}
async def _generate_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
    count: int,
) -> list[dict[str, Any]]:
    """Ask a chat provider for *count* validator-backed synthetic cases.

    The auxiliary provider/runtime is preferred over the main one. This is
    best-effort: any exception raised while building or sending the request,
    or a reply without a ``cases`` list, results in an empty list.
    """
    provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
    runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
    system_prompt = (
        "You generate validator-first Beaver skill evaluation cases. "
        "Return only JSON with key cases. Each case must include task_text and validator. "
        "Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
    )
    try:
        user_prompt = _synthetic_case_prompt(
            candidate=candidate,
            draft=draft,
            historical_cases=historical_cases,
            count=count,
        )
        response = await provider.chat(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model=getattr(runtime, "model", None),
            max_tokens=2200,
            temperature=0.4,
        )
    except Exception:
        # Deliberate best-effort: the caller falls back to deterministic cases.
        return []

    payload = _parse_json_payload(response.content or "")
    generated = payload.get("cases") if isinstance(payload, dict) else None
    if isinstance(generated, list):
        return _synthetic_case_payloads(candidate, generated, start_index=1, limit=count)
    return []
def _synthetic_case_prompt(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    count: int,
) -> str:
    """Compose the user prompt requesting *count* synthetic evaluation cases.

    Only ``run_id``, ``task_text`` and ``validator`` of each historical case
    are serialized into the prompt.
    """
    exposed_fields = ("run_id", "task_text", "validator")
    history_json = json.dumps(
        [{field: item.get(field) for field in exposed_fields} for item in historical_cases],
        ensure_ascii=False,
    )
    prompt_parts = [
        f"Generate {count} synthetic evaluation cases for this skill draft.\n\n",
        f"Candidate kind: {candidate.kind}\n",
        f"Candidate reason: {candidate.reason}\n",
        f"Draft skill name: {draft.skill_name}\n",
        f"Related skills: {candidate.related_skill_names}\n",
        f"Historical cases:\n{history_json}\n\n",
        "Every synthetic case must be validator-first. Return exactly:\n",
        '{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",',
        '"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}',
    ]
    return "".join(prompt_parts)
def _parse_json_payload(content: str) -> dict[str, Any]:
    """Decode a JSON object from model output, tolerating fences and noise.

    Backtick fences and a leading ``json`` tag are removed first. If the text
    still fails to parse, the span from the first ``{`` to the last ``}`` is
    tried. Anything that does not decode to a dict yields ``{}``.
    """
    text = content.strip()
    if text.startswith("```"):
        text = text.strip("`")
        if text.startswith("json"):
            text = text[4:]

    failed = object()  # sentinel: distinguishes "parse error" from JSON null

    def _decode(fragment: str) -> Any:
        try:
            return json.loads(fragment)
        except json.JSONDecodeError:
            return failed

    decoded = _decode(text)
    if decoded is failed:
        first, last = text.find("{"), text.rfind("}")
        if 0 <= first < last:
            decoded = _decode(text[first : last + 1])
    return decoded if isinstance(decoded, dict) else {}
def _synthetic_case_payloads(
    candidate: SkillLearningCandidate,
    raw_cases: list[Any],
    *,
    start_index: int,
    limit: int,
) -> list[dict[str, Any]]:
    """Convert raw generated entries into synthetic case payloads.

    Entries must be dicts with non-empty ``task_text`` and a dict
    ``validator``; others are skipped. Indices start at *start_index* and
    collection stops once *limit* payloads have been built.
    """
    payloads: list[dict[str, Any]] = []
    for entry in raw_cases:
        if not isinstance(entry, dict):
            continue
        prompt = str(entry.get("task_text") or "").strip()
        rule = entry.get("validator")
        if prompt and isinstance(rule, dict):
            payloads.append(
                _synthetic_case_payload(
                    candidate,
                    prompt,
                    start_index + len(payloads),
                    validator=dict(rule),
                    tier=str(entry.get("tier") or "bronze"),
                )
            )
            if len(payloads) >= limit:
                break
    return payloads
def _fallback_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    historical_cases: list[dict[str, Any]],
    start_index: int,
    count: int,
) -> list[dict[str, Any]]:
    """Produce *count* deterministic bronze-tier cases without calling a model.

    One seed text is chosen (a historical task text picked by *start_index*,
    else the candidate's reason or draft skill name) and reused for every
    scenario; the validator requires up to two terms taken from that seed.
    """
    seed = ""
    if historical_cases:
        picked = historical_cases[(start_index - 1) % len(historical_cases)]
        seed = str(picked.get("task_text") or "")
    seed = seed or candidate.reason or candidate.draft_skill_name or "the candidate skill"
    required_terms = _terms(seed)[:2] or ["done"]

    cases: list[dict[str, Any]] = []
    for scenario in range(start_index, start_index + count):
        cases.append(
            _synthetic_case_payload(
                candidate,
                f"Complete a realistic task related to {seed}. Scenario {scenario}.",
                scenario,
                validator={
                    "type": "final_answer_contains",
                    "required_terms": required_terms,
                    "forbidden_terms": [],
                },
                tier="bronze",
            )
        )
    return cases
def _synthetic_case_payload(
    candidate: SkillLearningCandidate,
    task_text: str,
    index: int,
    *,
    validator: dict[str, Any],
    tier: str,
) -> dict[str, Any]:
    """Build one synthetic evaluation case dict for *candidate*.

    Identifiers are derived from the candidate id and the zero-padded
    *index*; the accepted score is fixed at 0.75.
    """
    suffix = f"{index:02d}"
    return dict(
        run_id=f"synthetic:{candidate.candidate_id}:{suffix}",
        task_id=f"synthetic-{suffix}",
        session_id="synthetic-eval",
        task_text=task_text,
        baseline_skill_names=_baseline_skill_names(candidate),
        candidate_skill_name=candidate.draft_skill_name,
        accepted_score=0.75,
        synthetic=True,
        tier=tier,
        validator=validator,
    )
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
    """Return the skill names the baseline arm should use for *candidate*.

    ``revise_skill`` uses the first related skill, ``merge_skills`` uses all
    related skills, and every other kind uses none.
    """
    kind = candidate.kind
    if kind not in ("revise_skill", "merge_skills"):
        return []
    related = candidate.related_skill_names
    return list(related[:1]) if kind == "revise_skill" else list(related)
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
    """Score one arm of an evaluation case and return its ability breakdown.

    A dict validator on the case always decides the score. Without one,
    synthetic cases are unscored (0.0); real cases use the accepted score for
    the ``baseline`` arm and an output-based score for any other arm.
    """
    validator = case.get("validator")
    if isinstance(validator, dict):
        return _ability_from_validator(validator, arm)

    if case.get("synthetic"):
        return _ability_breakdown(
            score=0.0,
            source="unscored",
            notes=["Synthetic cases require a validator."],
        )

    if arm_name == "baseline":
        accepted = _bounded_score(case.get("accepted_score"), default=0.75)
        return _ability_breakdown(score=accepted, source="user_feedback")
    return _ability_breakdown(score=_ability_from_output(arm)["final_score"], source="llm_judge")
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
    """Score an arm against a ``final_answer_contains`` validator.

    The final score is a weighted blend: outcome (0.40) and artifact (0.25)
    both use the share of required terms found in the answer, plus process
    validity (0.15), safety (0.10, zero if any forbidden term appears) and
    path efficiency (0.10). Matching is case-insensitive substring search.

    Args:
        validator: Validator spec with ``type``, ``required_terms`` and
            optional ``forbidden_terms``.
        arm: Replay result for one arm; ``final_answer`` is the text checked.

    Returns:
        An ability breakdown dict. Unsupported validator types fall back to
        the output-based score with an explanatory note.
    """
    validator_type = str(validator.get("type") or "")
    if validator_type != "final_answer_contains":
        return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])
    answer = str(arm.get("final_answer") or "").lower()
    required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
    forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]
    matched = sum(1 for term in required_terms if term in answer)
    outcome = matched / max(1, len(required_terms))
    unsafe = any(term in answer for term in forbidden_terms)
    safety = 0.0 if unsafe else 1.0
    process = _process_validity(arm)
    efficiency = _path_efficiency(arm, outcome)
    final_score = (
        0.40 * outcome
        + 0.25 * outcome
        + 0.15 * process
        + 0.10 * safety
        + 0.10 * efficiency
    )
    return {
        **_ability_breakdown(score=final_score, source="auto_validator"),
        # Override the defaults from _ability_breakdown (which copies the
        # final score into every component) with the real component values,
        # including process validity and path efficiency.
        "outcome_correctness": round(outcome, 4),
        "artifact_correctness": round(outcome, 4),
        "process_validity": round(process, 4),
        "safety_no_regression": round(safety, 4),
        "path_efficiency": round(efficiency, 4),
        "validator_type": validator_type,
    }
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
    """Give a coarse ability score based only on whether the arm answered.

    Scores 0.7 when there is a non-blank final answer and the finish reason
    is not ``"error"``; otherwise 0.3.
    """
    has_answer = bool(str(arm.get("final_answer") or "").strip())
    succeeded = has_answer and arm.get("finish_reason") != "error"
    return _ability_breakdown(score=0.7 if succeeded else 0.3, source=source, notes=notes)
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    """Expand a single score into the full ability-breakdown dict.

    Every component receives the same clamped score; ``final_score`` is that
    value rounded to four decimals. ``notes`` is always a fresh list.
    """
    clamped = _bounded_score(score, default=0.0)
    component_names = (
        "outcome_correctness",
        "artifact_correctness",
        "process_validity",
        "safety_no_regression",
        "path_efficiency",
    )
    breakdown: dict[str, Any] = dict.fromkeys(component_names, clamped)
    breakdown["final_score"] = round(clamped, 4)
    breakdown["source"] = source
    breakdown["notes"] = list(notes or [])
    return breakdown
def _process_validity(arm: dict[str, Any]) -> float:
    """Rate the arm's process: 0.2 on error, 0.8 with tool calls, else 0.6."""
    errored = arm.get("finish_reason") == "error"
    if errored:
        return 0.2
    if arm.get("tool_calls"):
        return 0.8
    return 0.6
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
    """Rate how economically the arm reached its outcome.

    Outcomes below 0.5 score 0.3 regardless of path. Otherwise the score
    depends on the number of dict-shaped tool calls: up to 3 gives 1.0, up to
    6 gives 0.7, more gives 0.4.
    """
    if outcome < 0.5:
        return 0.3
    calls = sum(1 for entry in (arm.get("tool_calls") or []) if isinstance(entry, dict))
    for ceiling, rating in ((3, 1.0), (6, 0.7)):
        if calls <= ceiling:
            return rating
    return 0.4
def _bounded_score(value: Any, *, default: float) -> float:
    """Convert *value* to a float clamped to ``[0.0, 1.0]``.

    Args:
        value: Anything ``float()`` accepts; other inputs use *default*.
        default: Returned when *value* cannot be converted or is NaN.

    Returns:
        The clamped score, or *default* for unusable input.
    """
    import math

    try:
        score = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large to represent as a float.
        return default
    if math.isnan(score):
        # NaN compares false with everything, so min/max clamping would turn
        # it into a perfect 1.0; treat it as missing instead.
        return default
    return max(0.0, min(1.0, score))
def _terms(text: str) -> list[str]:
    """Split *text* on whitespace into lowercase terms longer than three characters.

    Surrounding punctuation (``.,:;!?()[]{}``) is trimmed from each word
    before its length is checked.
    """
    punctuation = ".,:;!?()[]{}"
    terms: list[str] = []
    for word in text.split():
        core = word.strip(punctuation)
        if len(core) > 3:
            terms.append(core.lower())
    return terms
def _report_from_case_reports(
candidate: SkillLearningCandidate,
draft: SkillDraft,
case_reports: list[dict],
legacy_cases: list[dict],
preservation_report: dict | None,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
unchanged = len(legacy_cases) - len(regressions) - len(improved)
real_cases = [item for item in legacy_cases if not item.get("synthetic")]
synthetic_cases = [item for item in legacy_cases if item.get("synthetic")]
execution, surrogate, blocked = _coverage(case_reports)
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
score_delta = candidate_avg - baseline_avg
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
selection_meta = dict(case_selection_meta or {})
real_score_avg = _avg([item["candidate_score"] for item in real_cases])
synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
overall_score_avg = round(candidate_avg, 4)
ability_summary = {
"score_role": "primary",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
}
tool_execution_summary = {
"score_role": "diagnostic_only",
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
}
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
@ -276,11 +703,34 @@ def _report_from_case_reports(
blocked_coverage=blocked,
confidence=confidence,
case_reports=case_reports,
tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
tool_mode_summary={
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
"score_role": "diagnostic_only",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
**selection_meta,
},
ability_score_summary=ability_summary,
tool_execution_summary=tool_execution_summary,
case_selection_summary=selection_meta,
real_score_avg=real_score_avg,
synthetic_score_avg=synthetic_score_avg,
overall_score_avg=overall_score_avg,
preservation_report=preservation_report,
)
def _avg(values: list[float]) -> float | None:
    """Return the mean of *values* rounded to four decimals, or ``None`` if empty."""
    return round(sum(values) / len(values), 4) if values else None
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
for report in case_reports:

View File

@ -323,8 +323,8 @@ class SkillLearningPipelineService:
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
if not any(review.status == SkillReviewState.APPROVED.value for review in reviews):
raise ValueError("Draft must have an approved review before publish")
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
raise ValueError("Draft must be submitted for review before publish")
safety = self.get_safety_report(draft.skill_name, draft.draft_id)
if safety is None:
raise ValueError("Draft requires a passing safety report before publish")

View File

@ -162,18 +162,23 @@ class ReplayRunner:
registry=loaded.tool_registry,
policy=self.policy,
)
result = await self.agent_loop.process_direct(
request.task_text,
provider_bundle=request.provider_bundle,
include_skill_assembly=False,
include_tools=True,
pinned_skill_names=request.pinned_skill_names,
pinned_skill_contexts=request.pinned_skill_contexts,
max_tool_iterations=int(request.model_settings.get("max_tool_iterations") or 4),
temperature=float(request.model_settings.get("temperature") or 0.0),
source="skill_replay_eval",
tool_executor_override=replay_executor,
)
direct_kwargs = {
"provider_bundle": request.provider_bundle,
"include_skill_assembly": False,
"include_tools": True,
"pinned_skill_names": request.pinned_skill_names,
"pinned_skill_contexts": request.pinned_skill_contexts,
"max_tool_iterations": int(request.model_settings.get("max_tool_iterations") or 4),
"temperature": float(request.model_settings.get("temperature") or 0.0),
"source": "skill_replay_eval",
"tool_executor_override": replay_executor,
}
try:
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
except RuntimeError as exc:
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
raise
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
return {
"case_id": request.case_id,
"arm": request.arm,
@ -188,6 +193,14 @@ class ReplayRunner:
}
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
    """Tell whether *exc* is the "process_direct() disabled while run() is active" error.

    Detection is by message text: both expected fragments must be present.
    """
    text = str(exc)
    expected_fragments = (
        "AgentLoop.process_direct() is disabled while run() is active",
        "submit tasks via submit_direct() instead",
    )
    return all(fragment in text for fragment in expected_fragments)
def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]:
effects: list[dict[str, Any]] = []
for trace in traces:

View File

@ -99,6 +99,7 @@ class SkillLearningService:
]
source_run_ids = [record.run_id for record in source_runs]
source_session_ids = list(dict.fromkeys(record.session_id for record in source_runs))
representative_task_text = self._representative_task_text(source_runs, fallback=final_run.task_text)
if not published_receipts:
candidates.append(
@ -113,7 +114,8 @@ class SkillLearningService:
"task_id": task_id,
"final_accepted_run_id": final_accepted_run_id,
"source_run_ids": source_run_ids,
"theme": self._task_theme(final_run.task_text),
"task_text": representative_task_text,
"theme": self._task_theme(representative_task_text),
},
status="open",
priority=1,
@ -329,8 +331,14 @@ class SkillLearningService:
def _build_new_skill_candidates(self) -> list[SkillLearningCandidate]:
groups: dict[str, list[RunRecord]] = {}
for record in self.run_store.list_runs():
key = self._task_theme(record.task_text)
all_runs = self.run_store.list_runs()
runs_by_task: dict[str, list[RunRecord]] = {}
for record in all_runs:
if record.task_id:
runs_by_task.setdefault(record.task_id, []).append(record)
for record in all_runs:
task_runs = runs_by_task.get(record.task_id, [record])
key = self._task_theme(self._representative_task_text(task_runs, fallback=record.task_text))
if not key:
continue
groups.setdefault(key, []).append(record)
@ -443,12 +451,24 @@ class SkillLearningService:
@staticmethod
def _task_theme(task_text: str) -> str:
cleaned = re.sub(r"\s+", " ", task_text.strip().lower())
cleaned = re.sub(r"\s+", " ", task_text.strip())
if not cleaned:
return ""
words = cleaned.split(" ")
first_sentence = re.split(r"[。!?.!?]", cleaned, maxsplit=1)[0].strip()
if not first_sentence:
first_sentence = cleaned
words = first_sentence.split(" ")
return " ".join(words[:8]).strip()
@staticmethod
def _representative_task_text(runs: list[RunRecord], *, fallback: str = "") -> str:
ordered = sorted(runs, key=lambda item: (item.attempt_index, item.started_at, item.run_id))
for record in ordered:
text = record.task_text.strip()
if text:
return text
return fallback.strip()
@staticmethod
def _suggest_skill_name(
candidate: SkillLearningCandidate,

View File

@ -15,12 +15,15 @@ class SurrogateToolEvaluator:
return {
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"baseline_tool_execution_score": baseline_score,
"candidate_tool_execution_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"surrogate_tool_count": surrogate_count,
"blocked_tool_count": blocked_count,
"score_role": "diagnostic_only",
"confidence": confidence,
"notes": [
"Surrogate score is based on intended tool calls, schemas, arguments, and task relevance.",
"Tool execution score is diagnostic only and is not the main task ability score.",
],
}

View File

@ -6,6 +6,7 @@ import json
from typing import Any
from beaver.engine.providers.base import LLMProvider
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
from beaver.skills.learning.evidence import EvidencePacket
from beaver.memory.skills.models import SkillLearningCandidate
@ -58,7 +59,8 @@ class SkillDraftSynthesizer:
"content": (
"You synthesize Beaver skill drafts from execution evidence. "
"Return only JSON with keys: frontmatter, content, change_reason, "
"preserved_sections, changed_sections, dropped_sections."
"preserved_sections, changed_sections, dropped_sections. "
"The content must follow the Canonical Beaver SKILL.md format."
),
},
{"role": "user", "content": prompt},
@ -113,6 +115,7 @@ class SkillDraftSynthesizer:
+ "\n- tools: an explicit JSON array of exact tool names this skill needs. "
+ "Prefer called tool names when the workflow depends on them; use run-selected tool names only when clearly required. "
+ "Use [] only when no tool is required."
+ "\n\n" + canonical_skill_format_instructions()
+ "\nThe JSON may include preserved_sections, changed_sections, and dropped_sections arrays."
)
@ -144,14 +147,23 @@ class SkillDraftSynthesizer:
@staticmethod
def _normalize_payload(payload: dict[str, Any], evidence_packet: EvidencePacket) -> dict[str, Any]:
frontmatter = dict(payload.get("frontmatter") or {})
frontmatter = normalize_skill_frontmatter(
dict(payload.get("frontmatter") or {}),
skill_name=str((payload.get("frontmatter") or {}).get("name") or "generated-skill"),
)
tool_hints = _coerce_string_list(frontmatter.get("tools"))
if not tool_hints:
tool_hints = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
frontmatter["tools"] = tool_hints
content = ensure_canonical_skill_body(
str(payload.get("content") or "").strip(),
title=str(frontmatter.get("name") or "generated-skill"),
description=str(frontmatter.get("description") or ""),
tools=tool_hints,
)
return {
"frontmatter": frontmatter,
"content": str(payload.get("content") or "").strip(),
"content": content,
"change_reason": str(payload.get("change_reason") or ""),
"preserved_sections": _coerce_string_list(payload.get("preserved_sections")),
"changed_sections": _coerce_string_list(payload.get("changed_sections")),
@ -162,13 +174,20 @@ class SkillDraftSynthesizer:
def _fallback_payload(candidate: SkillLearningCandidate, evidence_packet: EvidencePacket, action: str) -> dict[str, Any]:
related = candidate.related_skill_names[0] if candidate.related_skill_names else "generated-skill"
title = related.replace("_", "-")
content = "\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured."
tools = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
content = ensure_canonical_skill_body(
"\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured.",
title=title,
description=candidate.reason or f"Auto-generated {action} draft for {title}.",
tools=tools,
)
return {
"frontmatter": {
"name": title,
"description": candidate.reason or f"Auto-generated {action} draft for {title}.",
"tools": _coerce_string_list(evidence_packet.metadata.get("tool_names")),
"tools": tools,
},
"content": f"# {title}\n\n## Evidence\n\n{content}\n",
"content": content,
"change_reason": candidate.reason or f"Fallback {action} synthesis.",
"preserved_sections": [],
"changed_sections": [],

View File

@ -10,6 +10,7 @@ from typing import Callable
from beaver.engine.providers import ProviderBundle
from beaver.memory.skills import SkillLearningCandidate
from beaver.skills.learning.pipeline import SkillLearningPipelineService
from beaver.skills.learning.replay import ReplayRunner
@dataclass(slots=True)
@ -57,10 +58,12 @@ class SkillLearningWorker:
*,
pipeline: SkillLearningPipelineService,
provider_bundle_factory: Callable[[], ProviderBundle],
replay_runner_factory: Callable[[], ReplayRunner] | None = None,
config: SkillLearningWorkerConfig | None = None,
) -> None:
self.pipeline = pipeline
self.provider_bundle_factory = provider_bundle_factory
self.replay_runner_factory = replay_runner_factory
self.config = config or SkillLearningWorkerConfig.from_env()
self._running = False
self._lock = asyncio.Lock()
@ -126,6 +129,7 @@ class SkillLearningWorker:
draft.skill_name,
draft.draft_id,
provider_bundle=self.provider_bundle_factory(),
replay_runner=self.replay_runner_factory() if self.replay_runner_factory is not None else None,
)
return True

View File

@ -16,8 +16,8 @@ class SkillPublisher:
def publish(self, skill_name: str, draft_id: str, publisher: str, notes: str = "") -> SkillVersion:
draft = self._require_draft(skill_name, draft_id)
if draft.status != SkillReviewState.APPROVED.value:
raise ValueError("Draft must be approved before publish")
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
raise ValueError("Draft must be submitted for review before publish")
if draft.proposal_kind == "retire_skill":
raise ValueError("Retire proposals must be applied through apply_retire_proposal")
@ -81,8 +81,8 @@ class SkillPublisher:
def apply_retire_proposal(self, skill_name: str, draft_id: str, actor: str, notes: str = "") -> SkillSpec:
draft = self._require_draft(skill_name, draft_id)
if draft.status != SkillReviewState.APPROVED.value:
raise ValueError("Retire proposal must be approved before apply")
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
raise ValueError("Retire proposal must be submitted for review before apply")
if draft.proposal_kind != "retire_skill":
raise ValueError("Only retire_skill proposals can be applied as retire proposals")

View File

@ -25,7 +25,11 @@ class MainAgentRouter:
timeout_seconds: float = 8.0,
) -> MainAgentDecision:
if provider is None:
return self._fallback(active_task=active_task, reason="router_provider_unavailable")
return self._apply_active_task_boundary(
self._fallback(active_task=active_task, reason="router_provider_unavailable"),
message=message,
active_task=active_task,
)
chat_kwargs: dict[str, Any] = {
"messages": [
{
@ -58,10 +62,18 @@ class MainAgentRouter:
for attempt_timeout in (timeout_seconds, 12.0):
try:
response = await asyncio.wait_for(provider.chat(**chat_kwargs), timeout=attempt_timeout)
return self.from_json(response.content or "", active_task=active_task)
return self._apply_active_task_boundary(
self.from_json(response.content or "", active_task=active_task),
message=message,
active_task=active_task,
)
except Exception as exc:
last_error = exc
return self._fallback(active_task=active_task, reason=f"router_failed: {last_error}")
return self._apply_active_task_boundary(
self._fallback(active_task=active_task, reason=f"router_failed: {last_error}"),
message=message,
active_task=active_task,
)
def from_json(self, text: str, *, active_task: TaskRecord | None = None) -> MainAgentDecision:
payload = self._parse_json_object(text)
@ -121,6 +133,31 @@ class MainAgentRouter:
return MainAgentDecision(mode="task", reason=reason, action="continue_task")
return MainAgentDecision(mode="simple", reason=reason, action="simple_chat")
def _apply_active_task_boundary(
    self,
    decision: MainAgentDecision,
    *,
    message: str,
    active_task: TaskRecord | None,
) -> MainAgentDecision:
    """Turn a ``continue_task`` decision into ``create_task`` for standalone requests.

    The incoming decision is returned untouched unless all of the following
    hold: there is an active task, the decision's action is
    ``continue_task``, the message looks like a fresh task request, and it
    does not look like an explicit follow-up.

    Args:
        decision: Decision produced by the router or its fallback.
        message: The user message being classified.
        active_task: The currently active task, if any.

    Returns:
        Either ``decision`` itself or a new task-mode decision that starts a
        new task, titled with the decision's short title or, failing that,
        the active task's ``short_title`` metadata.
    """
    keep_router_decision = (
        active_task is None
        or decision.action != "continue_task"
        or not _looks_like_fresh_task_request(message)
        or _looks_like_explicit_task_followup(message)
    )
    if keep_router_decision:
        return decision

    override_reason = (
        "fresh standalone task request in the same session; "
        "do not attach it to the active task without explicit follow-up wording"
    )
    return MainAgentDecision(
        mode="task",
        reason=override_reason,
        starts_new_task=True,
        short_title=decision.short_title or active_task.metadata.get("short_title"),
        action="create_task",
    )
@staticmethod
def _prompt(
*,
@ -159,15 +196,19 @@ class MainAgentRouter:
"- close_task: user explicitly says the active Task is done/satisfactory/finished.\n"
"- abandon_task: user explicitly says to stop, cancel, abandon, or no longer do the active Task.\n\n"
"Critical policy:\n"
"- If there is an active Task, choose continue_task or revise_task unless the user's topic is completely unrelated "
"to that Task or the user explicitly closes/abandons it.\n"
"- A Session is the durable conversation/device/group context. A Task is one unit of work inside that Session. "
"Do not use an active Task as a reason to merge every later message into the same work item.\n"
"- If there is an active Task, choose continue_task only when the current message explicitly depends on, extends, "
"or asks a direct follow-up about that active Task's latest result.\n"
"- With an active Task, choose simple_chat for unrelated lightweight conversation and new_task for unrelated work "
"that needs Task capabilities. Either decision starts a new topic.\n"
"- An unrelated lightweight conversation must not be classified as revise_task merely because the active Task is awaiting acceptance.\n"
"- Choose revise_task when the active Task is awaiting feedback or needs revision and the user asks for changes "
"such as '改一下', '加上', '删除', '换成', '再详细点', '格式改成', '不要', or equivalent wording.\n"
"- Choose continue_task for neutral follow-up questions or additional next steps that do not imply dissatisfaction with the previous result.\n"
"- Use new_task only when the user clearly asks to start a different task.\n"
"- Choose continue_task for neutral follow-up questions or additional next steps that refer to the previous result, "
"for example '顺便查一下深圳', '这个也加上', or '继续'.\n"
"- A standalone tool-dependent request such as a fresh weather/search/file/run/test request is new_task even when it is "
"similar to the active Task. Repeating '珠海天气怎么样' later is a new Task unless the user says to revise or continue the old result.\n"
"- If there is no active Task, choose new_task only for work that requires execution, iteration, tools, files, "
"implementation, validation, or multi-step completion. Otherwise choose simple_chat.\n"
"- Requests that need current, real-time, external, user-private, local-file, web, weather, price, news, "
@ -203,3 +244,99 @@ def _clean_short_title(value: Any) -> str | None:
return None
title = " ".join(str(value).strip().split())
return title[:40] or None
def _looks_like_explicit_task_followup(message: str) -> bool:
    """Return True when the message uses wording that refers back to earlier work.

    The message is normalised with ``_compact_text`` and then scanned for any
    of the follow-up markers below using plain substring matching.

    Args:
        message: Raw user message.

    Returns:
        False for an empty message; otherwise whether any marker occurs in
        the normalised text.
    """
    text = _compact_text(message)
    if not text:
        return False
    # Every marker must be non-empty: "" is a substring of every string, so a
    # single empty entry would make this function return True for any
    # non-empty message, and callers could no longer tell a standalone
    # request apart from a follow-up.
    # NOTE(review): the blank entries dropped here may have been
    # single-character markers lost in an encoding step — restore them from
    # the authoritative source if so.
    markers = (
        "继续",
        "接着",
        "上面",
        "刚才",
        "前面",
        "这个",
        "那个",
        "结果",
        "顺便",
        "补充",
        "加上",
        "加入",
        "删除",
        "去掉",
        "换成",
        "重做",
        "详细",
        "展开",
        "格式",
        "continue",
        "same task",
        "previous",
        "above",
        "that result",
        "revise",
        "update it",
        "add",
        "remove",
        "change",
        "also",
    )
    return any(marker in text for marker in markers)
def _looks_like_fresh_task_request(message: str) -> bool:
    """Return True when the message contains wording typical of a tool-dependent request.

    The message is normalised with ``_compact_text`` and checked against the
    keyword list below by substring containment.

    Args:
        message: Raw user message.

    Returns:
        False for an empty message; otherwise whether any keyword occurs in
        the normalised text.
    """
    normalized = _compact_text(message)
    if not normalized:
        return False
    keywords = (
        # Chinese: weather
        "天气", "气温", "下雨", "降雨", "空气质量", "预报",
        # Chinese: lookup / recency
        "查一下", "帮我查", "搜索", "搜一下", "看看最新", "最新", "今天", "明天",
        # Chinese: files and execution
        "上传", "下载", "文件", "运行", "执行", "测试", "构建", "部署", "修复",
        # English: weather
        "weather", "forecast", "temperature",
        # English: lookup / recency
        "search", "look up", "latest", "today", "tomorrow",
        # English: files and execution
        "upload", "download", "file", "run", "execute", "test", "build", "deploy", "fix",
    )
    for keyword in keywords:
        if keyword in normalized:
            return True
    return False
def _compact_text(message: str) -> str:
return " ".join(str(message or "").strip().lower().split())

View File

@ -4,6 +4,7 @@ import json
from pathlib import Path
from beaver.engine import EngineLoader
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.catalog.utils import parse_frontmatter
@ -69,6 +70,16 @@ def test_skill_authoring_admin_is_seeded_but_not_initial() -> None:
assert version["tool_hints"] == expected_tools
def test_seeded_skill_bodies_use_canonical_format() -> None:
    """Every skill listed in the published and disabled indexes has a canonical v0001 body."""
    skills_root = REPO_ROOT / "skills"
    for index_name in ("published", "disabled"):
        index_path = skills_root / "_index" / f"{index_name}.json"
        index = json.loads(index_path.read_text(encoding="utf-8"))
        for skill_name in index["items"]:
            skill_markdown = skills_root / skill_name / "versions" / "v0001" / "SKILL.md"
            _frontmatter, body = parse_frontmatter(skill_markdown.read_text(encoding="utf-8"))
            # The skill name is the assertion message so a failure names the offender.
            assert is_canonical_skill_body(body), skill_name
def test_default_runtime_registers_skill_view_tool(tmp_path: Path) -> None:
loaded = EngineLoader(workspace=tmp_path).load()
try:

View File

@ -87,6 +87,14 @@ def _task() -> TaskRecord:
)
def _weather_task() -> TaskRecord:
    """Build the base task fixture, re-labelled as a Zhuhai weather query."""
    question = "珠海天气怎样"
    task = _task()
    task.description = task.goal = question
    task.metadata["short_title"] = "查询珠海天气"
    return task
def test_router_continues_active_task_from_llm_decision() -> None:
provider = RouterProvider('{"action":"continue_task","reason":"related","short_title":"任务连续性"}')
decision = asyncio.run(
@ -103,6 +111,35 @@ def test_router_continues_active_task_from_llm_decision() -> None:
assert provider.calls[0]["max_tokens"] == 256
def test_router_keeps_same_session_but_starts_new_task_for_standalone_weather_repeat() -> None:
    """A repeated standalone weather question becomes a new task even if the LLM says continue."""
    llm_reply = '{"action":"continue_task","reason":"neutral follow-up","short_title":"查询珠海天气"}'
    router = MainAgentRouter()
    pending = router.classify(
        "珠海天气怎么样",
        active_task=_weather_task(),
        provider=RouterProvider(llm_reply),
    )
    decision = asyncio.run(pending)

    assert decision.is_task
    assert decision.action == "create_task"
    assert decision.starts_new_task is True
    assert "fresh standalone task request" in decision.reason
def test_router_allows_explicit_followup_to_continue_active_weather_task() -> None:
    """Explicit follow-up wording keeps the LLM's continue_task decision."""
    llm_reply = '{"action":"continue_task","reason":"related follow-up","short_title":"查询珠海天气"}'
    router = MainAgentRouter()
    pending = router.classify(
        "顺便查一下深圳",
        active_task=_weather_task(),
        provider=RouterProvider(llm_reply),
    )
    decision = asyncio.run(pending)

    assert decision.is_task
    assert decision.action == "continue_task"
    assert decision.starts_new_task is False
def test_router_marks_revision_from_llm_decision() -> None:
decision = asyncio.run(
MainAgentRouter().classify(
@ -163,6 +200,8 @@ def test_router_prompt_treats_unrelated_lightweight_conversation_as_new_topic()
prompt = provider.calls[0]["messages"][1]["content"]
assert "unrelated lightweight conversation" in prompt
assert "must not be classified as revise_task merely because the active Task is awaiting acceptance" in prompt
assert "A Session is the durable conversation/device/group context" in prompt
assert "Repeating '珠海天气怎么样' later is a new Task" in prompt
def test_router_closes_active_task_from_llm_decision() -> None:

View File

@ -5,13 +5,40 @@ from types import SimpleNamespace
import pytest
from beaver.interfaces.web.app import _create_skill_upload_draft
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.interfaces.web.app import _create_skill_upload_draft, _rewrite_uploaded_skill_draft_with_llm
from beaver.services.skillhub_service import SkillHubService
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.catalog.utils import extract_required_tool_names
from beaver.skills.drafts import DraftService
from beaver.skills.specs import SkillSpecStore
from beaver.tools.mcp.wrapper import MCPToolWrapper
class RewriteProvider(LLMProvider):
    """Stub provider that records the prompt and replies with a fixed skill-rewrite JSON."""

    def __init__(self) -> None:
        super().__init__()
        # Messages passed to the most recent chat() call.
        self.messages = []

    async def chat(self, messages, tools=None, model=None, max_tokens=None, temperature=0.7, thinking_enabled=None):
        """Remember ``messages`` and answer with the canned rewrite payload."""
        self.messages = messages
        canned_reply = """{
"frontmatter": {
"name": "skill",
"description": "Use when uploaded skill guidance needs QA formatting.",
"tools": ["read_file"]
},
"content": "# Skill\\n\\n## Overview\\n\\nLLM rewritten overview.\\n\\n## When to Use\\n\\n- Use when testing upload rewrite.\\n\\n## Required Tools\\n\\n- `read_file`\\n\\n## Workflow\\n\\n- Follow the rewritten workflow.\\n\\n## Validation\\n\\n- Verify the result.\\n\\n## Boundaries\\n\\n- Stay in scope.\\n\\n## Anti-Patterns\\n\\n- Do not skip rewrite validation.\\n",
"change_reason": "normalized upload"
}"""
        return LLMResponse(content=canned_reply, model=model)

    def get_default_model(self):
        """Return the fixed model name used by this stub."""
        return "rewrite-model"
class FakeSkillHubService(SkillHubService):
async def _get_json(self, path, *, params=None):
if path == "/skills":
@ -99,6 +126,106 @@ def test_upload_skill_zip_keeps_supporting_files_on_draft(tmp_path):
assert upload_dir.endswith(draft["draft_id"])
def test_upload_skill_zip_canonicalizes_uploaded_skill_body(tmp_path):
    """An uploaded SKILL.md with a free-form body is stored as a canonical draft body."""
    raw_skill_markdown = (
        "---\nname: skill\ndescription: raw upload\ntools:\n - read_file\n---\nBody without our format.\n"
    )
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        archive.writestr("skill/SKILL.md", raw_skill_markdown)

    draft = _create_skill_upload_draft(loaded, "skill.zip", zip_bytes.getvalue())

    assert draft["proposed_frontmatter"]["name"] == "skill"
    assert draft["proposed_frontmatter"]["tools"] == ["read_file"]
    assert is_canonical_skill_body(draft["proposed_content"])
def test_upload_skill_zip_infers_weather_web_tools_from_content(tmp_path):
    """A weather skill uploaded without a tools list ends up with the web tools."""
    skill_markdown = (
        "---\nname: weather-search\ndescription: weather lookup\n---\n"
        "Look up current weather and forecast for a city online.\n"
    )
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        archive.writestr("weather_search/skills.md", skill_markdown)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", zip_bytes.getvalue())

    # Both the frontmatter and the body's Required Tools section must agree.
    assert draft["proposed_frontmatter"]["tools"] == ["web_fetch", "web_search"]
    assert extract_required_tool_names(draft["proposed_content"]) == ["web_fetch", "web_search"]
    assert is_canonical_skill_body(draft["proposed_content"])
def test_upload_skill_llm_rewrite_updates_draft(tmp_path):
    """The LLM rewrite step replaces the draft content and prompts with the canonical format."""
    spec_store = SkillSpecStore(tmp_path)
    drafts = DraftService(spec_store)
    draft = drafts.create_new_skill_draft(
        skill_name="skill",
        proposed_content="# Skill\n\n## Overview\n\nFallback.",
        proposed_frontmatter={"name": "skill", "description": "fallback", "tools": ["read_file"]},
        created_by="test",
        reason="upload",
    )
    provider = RewriteProvider()

    def _provider_bundle_for_task(_loaded, _kwargs):
        # Hands the recording stub provider to the rewrite step.
        return SimpleNamespace(
            main_provider=provider,
            main_runtime=SimpleNamespace(model="rewrite-model"),
        )

    agent_service = SimpleNamespace(_make_provider_bundle_for_task=_provider_bundle_for_task)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=drafts)

    asyncio.run(_rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename="skill.zip"))

    rewritten = drafts.get_draft("skill", draft.draft_id)
    assert rewritten is not None
    assert "LLM rewritten overview" in rewritten.proposed_content
    assert is_canonical_skill_body(rewritten.proposed_content)
    user_prompt = provider.messages[1]["content"]
    assert "Canonical Beaver SKILL.md format" in user_prompt
    assert "Available runtime tool names" in user_prompt
def test_upload_skill_zip_accepts_nested_single_skill_directory(tmp_path):
    """A skill nested under plugin/skills/ is found and only its own files are kept."""
    archive_members = {
        "plugin/skills/nested-skill/SKILL.md": "---\nname: nested-skill\ndescription: nested\n---\nBody\n",
        "plugin/skills/nested-skill/references/a.txt": "context",
        "plugin/README.md": "ignore package file",
    }
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        for member_name, member_text in archive_members.items():
            archive.writestr(member_name, member_text)

    draft = _create_skill_upload_draft(loaded, "plugin.zip", zip_bytes.getvalue())

    assert draft["skill_name"] == "nested-skill"
    # NOTE(review): the value is never asserted; the lookup only checks the key exists.
    upload_dir = draft["evidence_refs"][0]["supporting_upload_dir"]
    copied_reference = (
        tmp_path / "skills" / "nested-skill" / "draft_uploads" / draft["draft_id"] / "references" / "a.txt"
    )
    assert copied_reference.read_text() == "context"
    assert "README.md" not in draft["evidence_refs"][0]["files"]
def test_upload_skill_zip_accepts_common_skill_markdown_name_aliases(tmp_path):
    """A skill file named skills.md is accepted in place of SKILL.md."""
    skill_markdown = "---\nname: weather-search\ndescription: weather lookup\n---\nBody\n"
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        archive.writestr("weather_search/skills.md", skill_markdown)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", zip_bytes.getvalue())

    assert draft["skill_name"] == "weather-search"
    assert draft["proposed_frontmatter"]["name"] == "weather-search"
    assert is_canonical_skill_body(draft["proposed_content"])
def test_mcp_wrapper_metadata_preserves_server_id_with_underscores():
tool_def = SimpleNamespace(name="auth_status", description="Auth", inputSchema={"type": "object", "properties": {}})

View File

@ -184,7 +184,7 @@ def test_skill_lifecycle_publish_revision_and_rollback(tmp_path: Path) -> None:
assert published.version == "v0002"
assert store.get_current_version("release-checklist") == "v0002"
with pytest.raises(ValueError, match="approved"):
with pytest.raises(ValueError, match="submitted for review"):
publisher.publish("release-checklist", revision.draft_id, publisher="reviewer", notes="duplicate")
rolled_back = publisher.rollback("release-checklist", "v0001", actor="reviewer", reason="regression")
@ -529,6 +529,66 @@ def test_skill_learning_service_generates_new_skill_for_task_without_published_s
assert candidates[0].source_run_ids == ["task-run-1"]
def test_skill_learning_service_uses_original_task_text_for_new_skill_theme(tmp_path: Path) -> None:
    """The candidate theme comes from the first attempt's task text, not the later feedback text."""
    original_request = "Compare direct production restart with staging rollout"
    store = SkillSpecStore(tmp_path)
    run_store = RunMemoryStore(tmp_path / "memory" / "runs")
    learning_store = SkillLearningStore(tmp_path / "memory" / "skills")
    service = SkillLearningService(
        run_store=run_store,
        learning_store=learning_store,
        draft_service=DraftService(store),
        evidence_selector=EvidenceSelector(run_store),
    )
    now = datetime.now(timezone.utc).isoformat()

    def _attempt(run_id, attempt_index, task_text, success, feedback, validation_result):
        # Both attempts share session, task, timestamps and finish reason.
        return RunRecord(
            run_id=run_id,
            session_id="session-task",
            task_id="task-1",
            attempt_index=attempt_index,
            task_text=task_text,
            started_at=now,
            ended_at=now,
            success=success,
            finish_reason="stop",
            feedback=feedback,
            activated_skills=[],
            validation_result=validation_result,
        )

    run_store.append_run_record(
        _attempt(
            "task-run-1",
            1,
            original_request,
            False,
            {"feedback_type": "revise", "comment": "I do not see the docs"},
            None,
        )
    )
    run_store.append_run_record(
        _attempt(
            "task-run-2",
            2,
            "I do not see the docs",
            True,
            {"feedback_type": "satisfied", "acceptance_type": "accept"},
            {"accepted": True, "score": 0.9},
        )
    )

    candidates = service.build_learning_candidates_for_task("task-1", trigger_run_id="task-run-2")

    assert [candidate.candidate_id for candidate in candidates] == ["new:task:task-1"]
    assert candidates[0].evidence["theme"] == "Compare direct production restart with staging rollout"
    assert candidates[0].evidence["task_text"] == "Compare direct production restart with staging rollout"
def test_task_theme_uses_first_sentence_for_chinese_text() -> None:
    """The theme of a two-sentence Chinese request is its first sentence without the terminator."""
    task_text = (
        "帮我比较两种发布流程的风险A 是直接重启线上容器B 是先部署 staging 再切 production。请给出推荐方案、原因、验证步骤和回滚策略。"
    )
    expected_theme = "帮我比较两种发布流程的风险A 是直接重启线上容器B 是先部署 staging 再切 production"
    assert SkillLearningService._task_theme(task_text) == expected_theme
def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None:
skill = SkillContext(
name="docker-debug",

View File

@ -0,0 +1,54 @@
from __future__ import annotations
from beaver.skills.authoring.format import (
CANONICAL_SKILL_SECTION_HEADINGS,
canonical_skill_format_instructions,
canonicalize_skill_body,
is_canonical_skill_body,
parse_skill_rewrite_json,
)
def test_canonical_skill_body_contains_required_sections() -> None:
    """A body built by canonicalize_skill_body is canonical and carries every section heading."""
    sections = {
        "title": "Filesystem Operation",
        "overview": "Read and update project files safely.",
        "tools": ["read_file", "write_file"],
        "workflow": ["Inspect the file before editing.", "Use the smallest safe edit."],
        "validation": ["Re-read changed files before reporting completion."],
        "boundaries": ["Do not edit files outside the workspace."],
        "anti_patterns": ["Do not overwrite files without reading them first."],
    }
    body = canonicalize_skill_body(**sections)

    assert is_canonical_skill_body(body)
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        assert heading in body
def test_canonical_skill_format_instructions_are_prompt_ready() -> None:
    """The prompt instructions name the format, the frontmatter keys and every section heading."""
    instructions = canonical_skill_format_instructions()
    required_terms = (
        "Canonical Beaver SKILL.md format",
        "frontmatter",
        "name",
        "description",
        "tools",
    )
    for term in required_terms:
        assert term in instructions
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        assert heading in instructions
def test_parse_skill_rewrite_json_backfills_frontmatter_tools_from_required_tools_section() -> None:
    """An empty frontmatter tools list is filled from the body's Required Tools section."""
    rewrite_json = """{
"frontmatter": {
"name": "weather-search",
"description": "weather lookup",
"tools": []
},
"content": "# Weather Search\\n\\n## Overview\\n\\nLook up weather.\\n\\n## When to Use\\n\\n- Weather requests.\\n\\n## Required Tools\\n\\n- `web_fetch`\\n- `web_search`\\n\\n## Workflow\\n\\n- Fetch current weather.\\n\\n## Validation\\n\\n- Check source freshness.\\n\\n## Boundaries\\n\\n- Do not guess.\\n\\n## Anti-Patterns\\n\\n- Do not fabricate data.\\n"
}"""
    payload = parse_skill_rewrite_json(rewrite_json, skill_name="weather-search")

    assert payload is not None
    assert payload["frontmatter"]["tools"] == ["web_fetch", "web_search"]

View File

@ -19,8 +19,22 @@ from beaver.skills.specs import SkillSpecStore
class StubProvider(LLMProvider):
async def chat(self, messages: list[dict], tools: list[dict] | None = None, model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7) -> LLMResponse:
return LLMResponse(content="ok")
def __init__(self, content: str = "ok") -> None:
super().__init__()
self.content = content
self.calls: list[dict] = []
async def chat(
self,
messages: list[dict],
tools: list[dict] | None = None,
model: str | None = None,
max_tokens: int = 4096,
temperature: float = 0.7,
thinking_enabled: bool | None = None,
) -> LLMResponse:
self.calls.append({"messages": messages, "model": model, "max_tokens": max_tokens, "temperature": temperature})
return LLMResponse(content=self.content)
def get_default_model(self) -> str:
return "stub"
@ -92,7 +106,6 @@ def test_eval_pass_allows_publish_after_safety_and_review(tmp_path: Path) -> Non
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
published = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert report.passed is True
@ -114,7 +127,6 @@ def test_eval_regression_blocks_publish(tmp_path: Path) -> None:
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is False
assert pipeline.get_candidate("candidate-1").status == "eval_failed"
@ -160,7 +172,14 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
class FakeReplayRunner:
def __init__(self, *, baseline_answer: str = "done", candidate_answer: str = "done") -> None:
self.baseline_answer = baseline_answer
self.candidate_answer = candidate_answer
self.requests = []
async def run_arm(self, request):
self.requests.append(request)
final_answer = self.candidate_answer if request.arm == "candidate" else self.baseline_answer
return {
"case_id": request.case_id,
"arm": request.arm,
@ -168,7 +187,7 @@ class FakeReplayRunner:
"run_id": f"{request.arm}-run",
"task_text": request.task_text,
"finish_reason": "stop",
"final_answer": "done",
"final_answer": final_answer,
"tool_calls": [
{
"tool_name": "write_file",
@ -213,3 +232,102 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
assert 0.0 <= report.execution_coverage <= 1.0
assert 0.0 <= report.surrogate_coverage <= 1.0
assert report.confidence in {"low", "medium", "high"}
assert "ability_score" in report.case_reports[0]
assert "tool_execution_score" in report.case_reports[0]
assert report.ability_score_summary["score_role"] == "primary"
assert report.tool_execution_summary["score_role"] == "diagnostic_only"
def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
    """With equal tool-execution scores, the validator on the final answer separates the arms."""
    validator_case = {
        "run_id": "validator-case",
        "task_id": "validator-case",
        "session_id": "eval",
        "task_text": "Write the release verdict.",
        "validator": {
            "type": "final_answer_contains",
            "required_terms": ["ship"],
            "forbidden_terms": ["do not ship"],
        },
        "accepted_score": 0.5,
    }
    pipeline = _pipeline(tmp_path)
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [validator_case]},
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=FakeReplayRunner(
            baseline_answer="Do not ship. Tests are failing.",
            candidate_answer="Ship after smoke tests pass.",
        ),
    )
    report = asyncio.run(evaluation)

    case = report.case_reports[0]
    tool_scores = case["tool_execution_score"]
    assert tool_scores["baseline_score"] == 0.85
    assert tool_scores["candidate_score"] == 0.85
    assert case["baseline_score"] < case["candidate_score"]
    assert report.tool_mode_summary["score_role"] == "diagnostic_only"
    assert report.ability_score_summary["score_role"] == "primary"
    assert report.real_score_avg is not None
    assert report.synthetic_score_avg is not None
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
    """Check that a synthetic eval case lacking a validator is excluded from replay.

    The case must not show up in the case reports, must not reach the replay
    runner, and must be counted in the case selection summary.
    """
    candidate_id = "candidate-1"
    excluded_run_id = "synthetic:no-validator"
    oracle_free_case = {
        "run_id": excluded_run_id,
        "task_id": "synthetic-no-validator",
        "session_id": "synthetic-eval",
        "task_text": "Synthetic task without an oracle.",
        "synthetic": True,
        "accepted_score": 0.75,
    }
    pipeline = _pipeline(tmp_path)
    pipeline.learning_store.update_learning_candidate(
        candidate_id,
        evidence={"eval_cases": [oracle_free_case]},
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        candidate_id,
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    replay_runner = FakeReplayRunner()
    evaluation = pipeline.evaluate_draft(
        candidate_id,
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=replay_runner,
    )
    report = asyncio.run(evaluation)

    scored_run_ids = {case["run_id"] for case in report.case_reports}
    assert excluded_run_id not in scored_run_ids
    # No replay request may mention the excluded case in its case id.
    assert not any(excluded_run_id in request.case_id for request in replay_runner.requests)
    assert report.case_selection_summary["excluded_synthetic_without_validator"] == 1

View File

@ -31,6 +31,12 @@ def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
assert payload["confidence"] == "low"
assert payload["case_reports"] == []
assert payload["tool_mode_summary"] == {}
assert payload["ability_score_summary"] == {}
assert payload["tool_execution_summary"] == {}
assert payload["case_selection_summary"] == {}
assert payload["real_score_avg"] is None
assert payload["synthetic_score_avg"] is None
assert payload["overall_score_avg"] is None
assert payload["preservation_report"] is None
assert payload["cases"] == [{"run_id": "run-1"}]
@ -59,3 +65,37 @@ def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
assert report.mode == "heuristic"
assert report.confidence == "low"
assert report.case_reports == []
def test_eval_report_persists_ability_and_case_split_fields() -> None:
    """Check that the score averages and summary dicts survive ``to_dict``/``from_dict``."""
    expected_ability = {"score_role": "primary", "real_case_count": 1}
    expected_tool_execution = {"score_role": "diagnostic_only", "executed": 1.0}
    expected_case_selection = {"excluded_synthetic_without_validator": 2}

    # Copies are handed to the report so the expected values stay independent
    # of whatever the report does with its inputs.
    report = SkillDraftEvalReport(
        report_id="eval-replay",
        skill_name="debug",
        draft_id="draft-1",
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.5,
        candidate_score_avg=0.8,
        score_delta=0.3,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        mode="replay",
        eval_version="replay-v2",
        real_score_avg=0.9,
        synthetic_score_avg=0.6,
        overall_score_avg=0.8,
        ability_score_summary=dict(expected_ability),
        tool_execution_summary=dict(expected_tool_execution),
        case_selection_summary=dict(expected_case_selection),
    )

    serialized = report.to_dict()
    round_tripped = SkillDraftEvalReport.from_dict(serialized)

    assert serialized["real_score_avg"] == 0.9
    assert serialized["synthetic_score_avg"] == 0.6
    assert serialized["overall_score_avg"] == 0.8
    assert round_tripped.ability_score_summary == expected_ability
    assert round_tripped.tool_execution_summary == expected_tool_execution
    assert round_tripped.case_selection_summary == expected_case_selection

View File

@ -55,14 +55,12 @@ def test_pipeline_lists_candidates_and_moves_draft_through_review(tmp_path: Path
reason="test",
)
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
approved = pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
version = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert pipeline.list_candidates()[0].candidate_id == "candidate-1"
assert review.status == SkillReviewState.IN_REVIEW.value
assert approved.status == SkillReviewState.APPROVED.value
assert safety.passed is True
assert version.skill_name == "new-skill"
assert pipeline.get_draft(draft.skill_name, draft.draft_id).status == SkillReviewState.PUBLISHED.value
@ -93,7 +91,6 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
@ -165,7 +162,6 @@ def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
)
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
with pytest.raises(ValueError, match="low confidence"):
@ -201,7 +197,6 @@ def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
)
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
with pytest.raises(ValueError, match="preservation"):

View File

@ -16,6 +16,25 @@ class FakeAgentLoop:
return SimpleNamespace(session_id="session-replay", run_id="run-replay", output_text="done", finish_reason="stop")
class FakeRunningAgentLoop(FakeAgentLoop):
    """Agent-loop double whose ``process_direct`` always raises, forcing ``submit_direct``.

    Every call is recorded so tests can inspect how the loop was used.
    """

    def __init__(self) -> None:
        # Number of times process_direct() was attempted.
        self.process_direct_calls = 0
        # (task, kwargs) pairs received by submit_direct(), in call order.
        self.submit_direct_calls: list[tuple[str, dict]] = []

    async def process_direct(self, task: str, **kwargs):
        """Count the attempt, then raise the "run() is active" ``RuntimeError``."""
        self.process_direct_calls += 1
        message = (
            "AgentLoop.process_direct() is disabled while run() is active; "
            "submit tasks via submit_direct() instead."
        )
        raise RuntimeError(message)

    async def submit_direct(self, task: str, **kwargs):
        """Record the call, run one tool via the supplied executor, and return a result."""
        self.submit_direct_calls.append((task, kwargs))
        tool_executor = kwargs["tool_executor_override"]
        await tool_executor.execute("mcp_outlook_send_email", {"to": "ada@example.com"})
        return SimpleNamespace(
            session_id="session-queued",
            run_id="run-queued",
            output_text="queued done",
            finish_reason="stop",
        )
def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
runner = ReplayRunner(agent_loop=FakeAgentLoop())
request = ReplayArmRequest(
@ -34,3 +53,33 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
assert report["arm"] == "candidate"
assert report["finish_reason"] == "stop"
assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
    """Check that ``run_arm`` goes through ``submit_direct`` when ``process_direct`` raises."""
    agent_loop = FakeRunningAgentLoop()
    runner = ReplayRunner(agent_loop=agent_loop)
    request = ReplayArmRequest(
        case_id="case-queued",
        arm="baseline",
        task_text="Send a status email to Ada.",
        pinned_skill_names=["filesystem-operation"],
        pinned_skill_contexts=[{"name": "filesystem-operation"}],
        provider_bundle=object(),
        model_settings={"max_tool_iterations": 3, "temperature": 0.1},
    )

    report = asyncio.run(runner.run_arm(request))

    # The direct path is tried exactly once before the queued path is used.
    assert agent_loop.process_direct_calls == 1
    assert len(agent_loop.submit_direct_calls) == 1
    queued_task, queued_kwargs = agent_loop.submit_direct_calls[0]
    assert queued_task == "Send a status email to Ada."
    assert queued_kwargs["source"] == "skill_replay_eval"
    assert queued_kwargs["include_skill_assembly"] is False
    assert queued_kwargs["include_tools"] is True
    assert queued_kwargs["pinned_skill_names"] == ["filesystem-operation"]
    assert queued_kwargs["max_tool_iterations"] == 3
    assert queued_kwargs["temperature"] == 0.1
    assert (report["session_id"], report["run_id"]) == ("session-queued", "run-queued")
    first_tool_call = report["tool_calls"][0]
    assert first_tool_call["tool_name"] == "mcp_outlook_send_email"

View File

@ -74,7 +74,6 @@ def test_safety_marks_dangerous_tools_high_and_requires_confirm(tmp_path: Path)
report = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is True
assert report.risk_level == "high"
@ -94,7 +93,6 @@ def test_publish_requires_safety_report(tmp_path: Path) -> None:
reason="test",
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
with pytest.raises(ValueError, match="safety report"):
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from beaver.memory.skills import SkillLearningCandidate
from beaver.skills.authoring.format import CANONICAL_SKILL_SECTION_HEADINGS
from beaver.skills.learning.evidence import EvidencePacket
from beaver.skills.learning.synthesizer import SkillDraftSynthesizer
@ -39,3 +40,6 @@ def test_revision_prompt_includes_base_skill_snapshot() -> None:
assert "Do not delete files." in prompt
assert "preserved_sections" in prompt
assert "dropped_sections" in prompt
assert "Canonical Beaver SKILL.md format" in prompt
for heading in CANONICAL_SKILL_SECTION_HEADINGS:
assert heading in prompt

View File

@ -1,12 +1,37 @@
from __future__ import annotations
from pathlib import Path
from types import SimpleNamespace
from fastapi.testclient import TestClient
from beaver.memory.runs import RunRecord
from beaver.interfaces.web.app import create_app
from beaver.memory.skills import SkillLearningCandidate
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.services.agent_service import AgentService
from beaver.skills.specs import SkillVersion
class StubEvaluator:
    """Evaluator double that counts invocations and returns a fixed passing report."""

    def __init__(self) -> None:
        # Number of times evaluate() has been awaited.
        self.calls = 0

    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
        """Return a canned ``SkillDraftEvalReport`` tied to ``candidate`` and ``draft``."""
        self.calls += 1
        canned_scores = {
            "baseline_score_avg": 0.5,
            "candidate_score_avg": 0.8,
            "score_delta": 0.3,
            "regression_count": 0,
            "improved_count": 1,
            "unchanged_count": 0,
        }
        return SkillDraftEvalReport(
            report_id="eval-existing",
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=True,
            status="completed",
            **canned_scores,
        )
def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
@ -31,3 +56,191 @@ def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
assert candidates[0]["candidate_id"] == "candidate-1"
assert "risk_level" in candidates[0]
assert run_once["processed"] >= 0
def test_skill_learning_candidates_payload_prefers_original_task_text(tmp_path: Path) -> None:
    """Check that the candidates API reports the first attempt's task text as the theme.

    Two runs of the same task are stored: the original request and a follow-up
    whose text is the user's feedback. The stored candidate theme is the
    feedback text, yet the API payload must expose the original task text.
    """
    original_text = "Compare direct production restart with staging rollout"
    timestamp = "2026-06-11T00:00:00+00:00"
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    run_store = loaded.skill_learning_service.run_store  # type: ignore[union-attr]

    attempts = [
        RunRecord(
            run_id="run-original",
            session_id="session-task",
            task_id="task-1",
            attempt_index=1,
            task_text=original_text,
            started_at=timestamp,
            ended_at=timestamp,
            success=False,
            finish_reason="stop",
            feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
            activated_skills=[],
            validation_result=None,
        ),
        RunRecord(
            run_id="run-final",
            session_id="session-task",
            task_id="task-1",
            attempt_index=2,
            task_text="I do not see the docs",
            started_at=timestamp,
            ended_at=timestamp,
            success=True,
            finish_reason="stop",
            feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
            activated_skills=[],
            validation_result={"accepted": True, "score": 0.9},
        ),
    ]
    for attempt in attempts:
        run_store.append_run_record(attempt)

    loaded.skill_learning_store.record_learning_candidate(  # type: ignore[union-attr]
        SkillLearningCandidate(
            candidate_id="new:task:task-1",
            kind="new_skill",
            source_run_ids=["run-original", "run-final"],
            source_session_ids=["session-task"],
            related_skill_names=[],
            reason="test",
            evidence={"task_id": "task-1", "theme": "i do not see the docs"},
        )
    )

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        candidates = client.get("/api/skills/candidates").json()

    payload = next(item for item in candidates if item["candidate_id"] == "new:task:task-1")
    evidence = payload["evidence"]
    assert evidence["theme"] == original_text
    assert evidence["task_text"] == original_text
def test_generate_draft_does_not_run_review_checks(tmp_path: Path, monkeypatch) -> None:
    """Check that the draft endpoint returns the existing draft without safety or eval runs."""
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    pipeline = loaded.skill_learning_pipeline
    draft = pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    existing_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(existing_candidate)  # type: ignore[union-attr]

    evaluator = StubEvaluator()
    pipeline.evaluator = evaluator  # type: ignore[union-attr]

    def _fake_provider_bundle(loaded, kwargs):
        # Stand-in bundle; only a ``main_provider`` attribute is supplied.
        return SimpleNamespace(main_provider=object())

    monkeypatch.setattr(service, "_make_provider_bundle_for_task", _fake_provider_bundle)

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.post("/api/skills/candidates/candidate-existing/draft")

    assert response.status_code == 200
    payload = response.json()
    assert evaluator.calls == 0
    assert payload["draft_id"] == draft.draft_id
    assert payload["safety_report"] is None
    assert payload["eval_report"] is None
    stored_report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)  # type: ignore[union-attr]
    assert stored_report is None
def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:
    """Check that submitting a draft runs the evaluator once and returns both reports."""
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    pipeline = loaded.skill_learning_pipeline
    draft = pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    existing_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(existing_candidate)  # type: ignore[union-attr]

    evaluator = StubEvaluator()
    pipeline.evaluator = evaluator  # type: ignore[union-attr]

    def _fake_provider_bundle(loaded, kwargs):
        # Stand-in bundle; only a ``main_provider`` attribute is supplied.
        return SimpleNamespace(main_provider=object())

    monkeypatch.setattr(service, "_make_provider_bundle_for_task", _fake_provider_bundle)

    submit_url = f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit"
    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.post(submit_url)

    assert response.status_code == 200
    payload = response.json()
    assert evaluator.calls == 1
    assert payload["status"] == "in_review"
    assert payload["safety_report"]["passed"] is True
    assert payload["eval_report"]["report_id"] == "eval-existing"
def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
    """Check that a revision draft payload carries base/target versions and the base skill."""
    skill_name = "filesystem-operation"
    published_frontmatter = {"description": "filesystem", "name": "filesystem-operation", "tools": []}
    published_body = "# Filesystem Operation\n\nUse files."

    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    spec_store = loaded.skill_spec_store

    # Seed a published v0001 so the revision draft has a base to compare with.
    published_version = SkillVersion(
        skill_name=skill_name,
        version="v0001",
        content_hash="hash-v1",
        summary_hash="summary-v1",
        created_at="2026-06-01T00:00:00+00:00",
        created_by="test",
        change_reason="initial",
        parent_version=None,
        review_state="published",
        frontmatter=dict(published_frontmatter),
        summary="filesystem",
        tool_hints=[],
    )
    spec_store.write_skill_version(published_version, published_body)  # type: ignore[union-attr]
    spec_store.set_current_version(skill_name, "v0001")  # type: ignore[union-attr]

    draft = loaded.skill_learning_pipeline.draft_service.create_revision_draft(  # type: ignore[union-attr]
        skill_name=skill_name,
        base_version="v0001",
        proposed_content="# Filesystem Operation\n\nUse files better.",
        proposed_frontmatter=dict(published_frontmatter),
        created_by="test",
        reason="revise",
    )

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.get("/api/skills/drafts")

    assert response.status_code == 200
    payload = next(item for item in response.json() if item["draft_id"] == draft.draft_id)
    assert payload["proposal_kind"] == "revise_skill"
    assert payload["base_version"] == "v0001"
    assert payload["target_version"] == "v0002"
    base_skill = payload["base_skill"]
    assert base_skill["version"] == "v0001"
    assert base_skill["content"] == "# Filesystem Operation\n\nUse files."
    assert base_skill["frontmatter"]["name"] == "filesystem-operation"

View File

@ -10,6 +10,7 @@ from beaver.engine.providers.factory import ProviderBundle
from beaver.engine.session import SessionManager
from beaver.memory.runs import RunMemoryStore, RunRecord
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.drafts import DraftService
from beaver.skills.learning import (
EvidenceSelector,
@ -48,6 +49,33 @@ def _bundle(provider: LLMProvider) -> ProviderBundle:
return ProviderBundle(main_runtime=runtime, main_provider=provider) # type: ignore[arg-type]
class FakeReplayRunner:
    """Replay-runner double that records each request and returns a canned arm report."""

    def __init__(self) -> None:
        # Every request passed to run_arm(), in call order.
        self.requests = []

    async def run_arm(self, request):
        """Record ``request`` and return a fixed report echoing its case id, arm and task."""
        self.requests.append(request)
        echo_call = {
            "tool_name": "echo",
            "mode": "executed",
            "arguments": {"text": "ok"},
            "result": {"success": True, "content": "ok"},
        }
        arm_report = {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "debug deployment startup done",
            "tool_calls": [echo_call],
            "artifacts": [],
            "side_effects": [],
        }
        return arm_report
def _pipeline(tmp_path: Path) -> SkillLearningPipelineService:
spec_store = SkillSpecStore(tmp_path)
run_store = RunMemoryStore(tmp_path / "memory" / "runs")
@ -109,6 +137,28 @@ def test_worker_synthesizes_open_candidate_without_publish(tmp_path: Path) -> No
assert pipeline.list_drafts(candidate.draft_skill_name)[0].status == "draft"
def test_worker_evaluates_draft_with_replay_runner_when_available(tmp_path: Path) -> None:
    """Check that a worker given a replay runner factory produces a replay-mode eval report."""
    pipeline = _pipeline(tmp_path)
    replay_runner = FakeReplayRunner()
    worker_config = SkillLearningWorkerConfig(max_drafts_per_run=5, max_retries=3, interval_seconds=1)
    worker = SkillLearningWorker(
        pipeline=pipeline,
        provider_bundle_factory=lambda: _bundle(JsonProvider()),
        replay_runner_factory=lambda: replay_runner,
        config=worker_config,
    )

    result = asyncio.run(worker.run_once())

    candidate = pipeline.get_candidate("candidate-1")
    # Missing draft identifiers fall back to empty strings for the lookup.
    draft_skill_name = candidate.draft_skill_name or ""
    draft_id = candidate.draft_id or ""
    draft = pipeline.get_draft(draft_skill_name, draft_id)
    report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)

    assert result.succeeded == 1
    assert report is not None
    assert report.mode == "replay"
    assert report.case_reports
    assert replay_runner.requests
def test_worker_retries_and_marks_failed_after_limit(tmp_path: Path) -> None:
pipeline = _pipeline(tmp_path)
worker = SkillLearningWorker(
@ -147,6 +197,7 @@ def test_synthesizer_fills_missing_tools_from_evidence(tmp_path: Path) -> None:
)
assert payload["frontmatter"]["tools"] == ["web_fetch", "memory"]
assert is_canonical_skill_body(payload["content"])
def test_evidence_selector_records_run_tool_names(tmp_path: Path) -> None:

View File

@ -218,6 +218,45 @@ def test_unrelated_new_task_auto_accepts_previous_task(tmp_path: Path) -> None:
assert current.run_ids == [second.run_id]
def test_standalone_realtime_repeat_creates_new_task_in_same_session(tmp_path: Path) -> None:
    """Check that a near-identical repeat request opens a new task in the same session.

    The second request's bundle is built with ``route_action="continue_task"``,
    yet the test expects the first task to be closed and a separate task to be
    awaiting acceptance.
    """
    session_id = "feishu:group-weather"
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner(),
    )
    service = AgentService(loader=loader)

    first = asyncio.run(
        service.process_direct(
            "珠海天气怎样",
            session_id=session_id,
            provider_bundle=_bundle("Weather result"),
        )
    )
    second = asyncio.run(
        service.process_direct(
            "珠海天气怎么样",
            session_id=session_id,
            provider_bundle=_bundle("Fresh weather result", route_action="continue_task"),
        )
    )

    task_service = service.create_loop().boot().task_service
    assert task_service is not None
    previous = task_service.get_task(first.task_id or "")
    current = task_service.get_task(second.task_id or "")
    assert previous is not None
    assert current is not None

    # Both tasks live in the same session but are distinct tasks.
    assert previous.session_id == session_id
    assert current.session_id == session_id
    assert current.task_id != previous.task_id

    assert previous.status == "closed"
    assert previous.run_ids == [first.run_id]
    assert current.status == "awaiting_acceptance"
    assert current.run_ids == [second.run_id]
def test_related_follow_up_continues_active_task_without_accepting_it(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(

View File

@ -102,6 +102,58 @@ tools:
assert [spec.name for spec in selected] == ["memory", "terminal", "search_files"]
def test_tool_assembler_uses_required_tools_section_when_frontmatter_omits_tools(tmp_path: Path) -> None:
    """Check that tool hints come from the body's Required Tools list when frontmatter has none.

    The skill's frontmatter declares no ``tools`` key; the loader record is
    still expected to expose ``terminal`` and ``search_files`` as tool hints,
    and the assembler is expected to select them.
    """
    skill_markdown = """---
name: docker-debug
description: Debug Docker issues.
---
# Docker Debug
## Overview
Debug Docker issues.
## Required Tools
- `terminal`
- `search_files`
## Workflow
Inspect logs and search related files.
"""
    skill_dir = tmp_path / "skills" / "docker-debug"
    skill_dir.mkdir(parents=True)
    skill_file = skill_dir / "SKILL.md"
    skill_file.write_text(skill_markdown, encoding="utf-8")

    registry = ToolRegistry()
    registered_tools = [
        DummyTool("memory", toolset="memory", always_available=True),
        DummyTool("terminal", toolset="shell"),
        DummyTool("search_files", toolset="file"),
        DummyTool("echo", toolset="debug"),
    ]
    for tool in registered_tools:
        registry.register(tool)

    assembler = ToolAssembler(retriever=StaticRetriever())
    loader = SkillsLoader(tmp_path)
    record = loader.get_skill_record("docker-debug")
    assert record is not None
    assert record.tool_hints == ["terminal", "search_files"]

    activated = SkillContext(name="docker-debug", content="", tool_hints=record.tool_hints)
    selected = asyncio.run(
        assembler.assemble(
            task_description="排查 Docker 容器日志",
            registry=registry,
            skills_loader=loader,
            activated_skills=[activated],
            top_k=1,
        )
    )

    selected_names = [spec.name for spec in selected]
    assert selected_names == ["memory", "terminal", "search_files", "echo"]
def test_embedding_fallback_can_return_all_or_top_k() -> None:
candidates = [{"name": f"tool_{index}", "description": "", "input_schema": "{}"} for index in range(3)]
retriever = EmbeddingRetriever(api_key_env="MISSING_EMBEDDING_KEY", api_base_env="MISSING_EMBEDDING_BASE")

View File

@ -0,0 +1,21 @@
from fastapi.testclient import TestClient
from beaver.interfaces.web.app import create_app
def test_local_frontend_origin_can_preflight_api_requests() -> None:
    """Check that a CORS preflight from the local frontend origin is accepted."""
    frontend_origin = "http://127.0.0.1:3080"
    preflight_headers = {
        "Origin": frontend_origin,
        "Access-Control-Request-Method": "GET",
        "Access-Control-Request-Headers": "authorization",
    }
    app = create_app(service=None, manage_service_lifecycle=False)
    client = TestClient(app)

    response = client.options("/api/auth/me", headers=preflight_headers)

    assert response.status_code == 200
    assert response.headers["access-control-allow-origin"] == frontend_origin
    allowed_headers = response.headers["access-control-allow-headers"].lower()
    assert "authorization" in allowed_headers

View File

@ -28,8 +28,10 @@ import {
deleteUserFile,
createUserFileDir,
getAccessToken,
isApiError,
} from '@/lib/api';
import type { UserFileContent, UserFileItem } from '@/lib/api';
import { canMutateUserFilesPath } from '@/lib/user-file-paths';
import { Button } from '@/components/ui/button';
import { ScrollArea } from '@/components/ui/scroll-area';
import { type AppLocale, pickAppText } from '@/lib/i18n/core';
@ -44,6 +46,10 @@ function sleep(ms: number): Promise<void> {
});
}
/**
 * Reports whether `error` matches status 401 according to `isApiError`
 * (presumably the HTTP "Unauthorized" status — confirm against `@/lib/api`).
 */
function isAuthError(error: unknown): boolean {
  const unauthorizedStatus = 401;
  return isApiError(error, unauthorizedStatus);
}
export default function FilesPage() {
const { locale } = useAppI18n();
const [items, setItems] = useState<UserFileItem[]>([]);
@ -78,6 +84,9 @@ export default function FilesPage() {
return;
} catch (err) {
lastError = err;
if (isAuthError(err)) {
break;
}
}
}
const message = lastError instanceof Error ? lastError.message : pickAppText(locale, '加载文件失败', 'Failed to load files');
@ -156,6 +165,15 @@ export default function FilesPage() {
const handleUpload = async (e: React.ChangeEvent<HTMLInputElement>) => {
const files = e.target.files;
if (!files || files.length === 0) return;
if (!canMutateUserFilesPath(currentPath)) {
setLoadError(pickAppText(
locale,
'请先进入 uploads、outputs、shared 或 tasks 目录后再上传。',
'Open uploads, outputs, shared, or tasks before uploading.'
));
if (fileInputRef.current) fileInputRef.current.value = '';
return;
}
setUploading(true);
setUploadProgress(0);
@ -178,6 +196,14 @@ export default function FilesPage() {
const handleCreateDir = async () => {
const name = newDirName.trim();
if (!name) return;
if (!canMutateUserFilesPath(currentPath)) {
setLoadError(pickAppText(
locale,
'请先进入 uploads、outputs、shared 或 tasks 目录后再新建文件夹。',
'Open uploads, outputs, shared, or tasks before creating a folder.'
));
return;
}
try {
const dirPath = currentPath ? `${currentPath}/${name}` : name;
await createUserFileDir(dirPath);
@ -191,6 +217,7 @@ export default function FilesPage() {
// Build breadcrumbs
const breadcrumbs = currentPath ? currentPath.split('/') : [];
const canMutateCurrentPath = canMutateUserFilesPath(currentPath);
const formatSize = (bytes: number | null) => {
if (bytes === null || bytes === undefined) return '';
@ -224,7 +251,12 @@ export default function FilesPage() {
size="sm"
className="h-11"
onClick={() => setShowMkdir(true)}
disabled={loading}
disabled={loading || !canMutateCurrentPath}
title={
canMutateCurrentPath
? undefined
: pickAppText(locale, '先进入 uploads、outputs、shared 或 tasks', 'Open uploads, outputs, shared, or tasks first')
}
>
<FolderPlus className="w-4 h-4 mr-1" />
{pickAppText(locale, '新建文件夹', 'New folder')}
@ -234,7 +266,12 @@ export default function FilesPage() {
size="sm"
className="h-11"
onClick={() => fileInputRef.current?.click()}
disabled={uploading}
disabled={uploading || !canMutateCurrentPath}
title={
canMutateCurrentPath
? undefined
: pickAppText(locale, '先进入 uploads、outputs、shared 或 tasks', 'Open uploads, outputs, shared, or tasks first')
}
>
{uploading ? (
<>
@ -272,6 +309,15 @@ export default function FilesPage() {
</Button>
</div>
</div>
{!canMutateCurrentPath && !loading && (
<p className="mb-4 rounded-md border border-[#E6E1DE] bg-muted/40 px-3 py-2 text-sm text-muted-foreground">
{pickAppText(
locale,
'请选择 uploads、outputs、shared 或 tasks 后再上传或新建文件夹。',
'Select uploads, outputs, shared, or tasks before uploading or creating folders.'
)}
</p>
)}
{/* Breadcrumbs */}
<div className="flex items-center gap-1 mb-4 text-sm text-muted-foreground flex-wrap">

View File

@ -5,7 +5,6 @@ import { usePathname, useRouter, useSearchParams } from 'next/navigation';
import {
AlertCircle,
BarChart3,
Check,
CheckCircle2,
ChevronDown,
ClipboardList,
@ -31,7 +30,6 @@ import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import {
approveSkillDraft,
deleteSkill,
disablePublishedSkill,
downloadSkill,
@ -436,11 +434,6 @@ export default function SkillsPage() {
submitSkillDraft(draft.skill_name, draft.draft_id)
)
}
onApprove={() =>
runAction(`approve:${draft.draft_id}`, () =>
approveSkillDraft(draft.skill_name, draft.draft_id)
)
}
onReject={() =>
runAction(`reject:${draft.draft_id}`, () =>
rejectSkillDraft(draft.skill_name, draft.draft_id)
@ -799,7 +792,6 @@ function DraftCard({
draft,
actionId,
onSubmit,
onApprove,
onReject,
onRecheckSafety,
onPublish,
@ -807,7 +799,6 @@ function DraftCard({
draft: SkillDraft;
actionId: string | null;
onSubmit: () => Promise<unknown>;
onApprove: () => Promise<unknown>;
onReject: () => Promise<unknown>;
onRecheckSafety: () => Promise<unknown>;
onPublish: (confirmHighRisk: boolean) => Promise<unknown>;
@ -820,8 +811,10 @@ function DraftCard({
const frontmatter = draft.proposed_frontmatter || {};
const description = String(frontmatter.description || '').trim();
const toolHints = normalizeStringList(frontmatter.tools);
const submittedForReview = draft.status === 'in_review' || draft.status === 'approved';
const isRevision = draft.proposal_kind === 'revise_skill' && Boolean(draft.base_skill);
const publishBlocked =
draft.status !== 'approved'
!submittedForReview
|| !safety
|| safety.risk_level === 'critical'
|| (evalReport?.status !== 'skipped_provider_unavailable' && evalReport?.passed === false);
@ -833,7 +826,6 @@ function DraftCard({
].filter(Boolean).join('\n');
const safetyBlocksReview = Boolean(safety && (!safety.passed || safety.risk_level === 'critical'));
const submitBlocked = draft.status !== 'draft' || safetyBlocksReview;
const approveBlocked = draft.status !== 'in_review' || safetyBlocksReview;
const rejectBlocked = !REJECTABLE_DRAFT_STATUSES.has(draft.status);
const canPublishLabel = publishBlocked
? publishBlockReason(draft, t)
@ -878,7 +870,12 @@ function DraftCard({
<p className={`mt-1 text-sm leading-6 text-muted-foreground ${containedLongTextClass}`}>
{draft.reason || description || t('没有提供草稿说明。', 'No draft notes were provided.')}
</p>
<div className="mt-3 grid gap-3 md:grid-cols-3">
{draft.proposal_kind === 'revise_skill' && draft.base_version && (
<div className="mt-2 text-sm font-medium text-muted-foreground">
{draft.skill_name}: {draft.base_version} {draft.target_version || t('下一版本', 'Next version')}
</div>
)}
<div className="mt-3 grid gap-3 md:grid-cols-4">
<ReadableFact
icon={<FileCode2 className="h-4 w-4" />}
label={t('草稿内容', 'Draft content')}
@ -889,6 +886,11 @@ function DraftCard({
label={t('基线版本', 'Base version')}
value={draft.base_version || t('新增技能,无基线', 'New skill, no base')}
/>
<ReadableFact
icon={<GitCompare className="h-4 w-4" />}
label={t('目标版本', 'Target version')}
value={draft.target_version || '-'}
/>
<ReadableFact
icon={<Info className="h-4 w-4" />}
label={t('来源', 'Source')}
@ -912,10 +914,6 @@ function DraftCard({
<Send className="mr-2 h-4 w-4" />
{t('送审', 'Submit')}
</Button>
<Button variant="outline" size="sm" className="h-11" disabled={busy || approveBlocked} onClick={() => void onApprove()}>
<Check className="mr-2 h-4 w-4" />
{t('批准', 'Approve')}
</Button>
<Button variant="outline" size="sm" className="h-11" disabled={busy || rejectBlocked} onClick={() => void onReject()}>
<XCircle className="mr-2 h-4 w-4" />
{t('拒绝', 'Reject')}
@ -926,7 +924,7 @@ function DraftCard({
</Button>
<Button size="sm" className="h-11" disabled={busy || publishBlocked} onClick={handlePublish}>
<Rocket className="mr-2 h-4 w-4" />
{t('发布', 'Publish')}
{draft.proposal_kind === 'revise_skill' ? t('发布修订', 'Publish revision') : t('发布', 'Publish')}
</Button>
</div>
</div>
@ -936,7 +934,7 @@ function DraftCard({
<div className="mb-3 flex flex-wrap items-center justify-between gap-2">
<div className="flex items-center gap-2 text-sm font-medium">
<FileText className="h-4 w-4 text-muted-foreground" />
{t('拟发布的技能正文', 'Proposed skill body')}
{isRevision ? t('修改对比', 'Revision comparison') : t('拟发布的技能正文', 'Proposed skill body')}
</div>
{toolHints.length > 0 && (
<div className="flex flex-wrap gap-1">
@ -948,7 +946,14 @@ function DraftCard({
</div>
)}
</div>
{draft.proposed_content.trim() ? (
{isRevision && draft.base_skill ? (
<RevisionComparison
baseVersion={draft.base_version || draft.base_skill.version}
targetVersion={draft.target_version || t('下一版本', 'Next version')}
baseContent={draft.base_skill.content}
proposedContent={draft.proposed_content}
/>
) : draft.proposed_content.trim() ? (
<MarkdownPreview content={draft.proposed_content} />
) : (
<p className="text-sm text-muted-foreground">{t('草稿没有正文内容。', 'This draft has no body content.')}</p>
@ -960,7 +965,7 @@ function DraftCard({
title={t('发布门禁', 'Publish gates')}
summary={canPublishLabel}
items={[
{ label: t('草稿已批准', 'Draft approved'), ok: draft.status === 'approved' },
{ label: t('草稿已送审', 'Draft submitted'), ok: submittedForReview },
{ label: t('安全报告通过', 'Safety passed'), ok: Boolean(safety?.passed) && safety?.risk_level !== 'critical' },
{
label: t('评估未回退', 'No eval regression'),
@ -971,6 +976,7 @@ function DraftCard({
<RawDetails
title={t('原始草稿内容', 'Raw draft payload')}
payload={{
base_skill: draft.base_skill,
proposed_frontmatter: draft.proposed_frontmatter,
proposed_content: draft.proposed_content,
evidence_refs: draft.evidence_refs,
@ -1040,6 +1046,71 @@ function SafetyReportPanel({ report }: { report?: SkillDraftSafetyReport | null
);
}
/**
 * Renders a revision draft next to the skill version it is based on.
 *
 * The header row shows the base and target version badges followed by the
 * added / removed / changed line counts from `lineDiffSummary`; below it the
 * current content and the drafted content are shown in two `DiffPane`s.
 */
function RevisionComparison({
  baseVersion,
  targetVersion,
  baseContent,
  proposedContent,
}: {
  baseVersion: string;
  targetVersion: string;
  baseContent: string;
  proposedContent: string;
}) {
  const { locale } = useAppI18n();
  const t = (zh: string, en: string) => pickAppText(locale, zh, en);
  const { added, removed, changed } = lineDiffSummary(baseContent, proposedContent);
  // Label/value pairs in the order they appear after the version badges.
  const counters: Array<[string, number]> = [
    [t('新增', 'Added'), added],
    [t('删除', 'Removed'), removed],
    [t('修改', 'Changed'), changed],
  ];
  return (
    <div className="space-y-3">
      <div className="flex flex-wrap gap-2 text-xs text-muted-foreground">
        <Badge variant="outline">{baseVersion}</Badge>
        {/* NOTE(review): this separator span is empty; presumably it was meant to hold an arrow between the two versions. Confirm. */}
        <span></span>
        <Badge variant="default">{targetVersion}</Badge>
        {counters.map(([label, count]) => (
          <span key={label}>{label}: {count}</span>
        ))}
      </div>
      <div className="grid min-w-0 gap-3 lg:grid-cols-2">
        <DiffPane title={t('当前版本', 'Current version')} content={baseContent} />
        <DiffPane title={t('草稿修订', 'Draft revision')} content={proposedContent} />
      </div>
    </div>
  );
}
/**
 * A titled, scrollable pane that shows one side of a comparison as
 * preformatted text. Content that is empty after trimming is shown as '-'.
 */
function DiffPane({ title, content }: { title: string; content: string }) {
  const trimmed = content.trim();
  const body = trimmed === '' ? '-' : trimmed;
  const preClassName = `max-h-[520px] overflow-auto p-3 text-xs leading-5 ${containedLongTextClass}`;
  return (
    <div className="min-w-0 rounded-md border border-border bg-white">
      <div className="border-b border-border px-3 py-2 text-xs font-medium text-muted-foreground">{title}</div>
      <pre className={preClassName}>{body}</pre>
    </div>
  );
}
/**
 * Counts line-level differences between two versions of a text.
 *
 * Lines are aligned with a longest-common-subsequence (LCS) match, so that
 * inserting or deleting a single line affects only that line instead of
 * marking every following line as changed (which a position-by-position
 * comparison would do). Inside each run of differing lines between two
 * matched lines, removed and added lines are paired up and reported as
 * `changed`; the surplus on either side is reported as `removed` or `added`.
 *
 * Empty content is treated as zero lines. Both `\n` and `\r\n` line endings
 * are accepted.
 *
 * @param baseContent - Text of the current version.
 * @param proposedContent - Text of the proposed version.
 * @returns Number of added, removed and changed lines.
 */
function lineDiffSummary(baseContent: string, proposedContent: string): { added: number; removed: number; changed: number } {
  const toLines = (text: string): string[] => (text === '' ? [] : text.split(/\r?\n/));
  const baseLines = toLines(baseContent);
  const proposedLines = toLines(proposedContent);

  // Skip the common prefix and suffix so the LCS table only covers the part
  // of the two texts that actually differs.
  let start = 0;
  while (start < baseLines.length && start < proposedLines.length && baseLines[start] === proposedLines[start]) {
    start += 1;
  }
  let baseEnd = baseLines.length;
  let proposedEnd = proposedLines.length;
  while (baseEnd > start && proposedEnd > start && baseLines[baseEnd - 1] === proposedLines[proposedEnd - 1]) {
    baseEnd -= 1;
    proposedEnd -= 1;
  }

  const rows = baseEnd - start;
  const cols = proposedEnd - start;
  const width = cols + 1;
  // lcs[i * width + j] = LCS length of base[start + i ..] and proposed[start + j ..].
  const lcs = new Uint32Array((rows + 1) * width);
  for (let i = rows - 1; i >= 0; i -= 1) {
    for (let j = cols - 1; j >= 0; j -= 1) {
      lcs[i * width + j] =
        baseLines[start + i] === proposedLines[start + j]
          ? lcs[(i + 1) * width + j + 1] + 1
          : Math.max(lcs[(i + 1) * width + j], lcs[i * width + j + 1]);
    }
  }

  let added = 0;
  let removed = 0;
  let changed = 0;
  let pendingRemoved = 0;
  let pendingAdded = 0;
  // Closes the current run of differing lines: pairs count as changes, the rest as pure adds/removes.
  const flush = () => {
    const paired = Math.min(pendingRemoved, pendingAdded);
    changed += paired;
    removed += pendingRemoved - paired;
    added += pendingAdded - paired;
    pendingRemoved = 0;
    pendingAdded = 0;
  };

  let i = 0;
  let j = 0;
  while (i < rows && j < cols) {
    if (baseLines[start + i] === proposedLines[start + j]) {
      flush();
      i += 1;
      j += 1;
    } else if (lcs[(i + 1) * width + j] >= lcs[i * width + j + 1]) {
      pendingRemoved += 1;
      i += 1;
    } else {
      pendingAdded += 1;
      j += 1;
    }
  }
  // Whatever is left on either side after the walk belongs to the final run.
  pendingRemoved += rows - i;
  pendingAdded += cols - j;
  flush();

  return { added, removed, changed };
}
function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
const { locale } = useAppI18n();
const t = (zh: string, en: string) => pickAppText(locale, zh, en);
@ -1066,6 +1137,15 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
);
}
const abilitySummary = report.ability_score_summary || {};
const toolExecutionSummary = report.tool_execution_summary || report.tool_mode_summary || {};
const caseSelectionSummary = report.case_selection_summary || {};
const realScore = report.real_score_avg ?? abilitySummary.real_score_avg;
const syntheticScore = report.synthetic_score_avg ?? abilitySummary.synthetic_score_avg;
const overallScore = report.overall_score_avg ?? abilitySummary.overall_score_avg ?? report.candidate_score_avg;
const realCaseCount = toNumber(abilitySummary.real_case_count);
const syntheticCaseCount = toNumber(abilitySummary.synthetic_case_count);
const excludedSynthetic = toNumber(caseSelectionSummary.excluded_synthetic_without_validator);
return (
<div className="min-w-0 rounded-md border border-border bg-muted/20 p-4">
<div className="mb-3 flex flex-wrap items-center justify-between gap-2">
@ -1079,8 +1159,8 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
<div className="grid gap-2 sm:grid-cols-3">
<MetricTile label={t('基线均分', 'Baseline avg')} value={formatScore(report.baseline_score_avg)} />
<MetricTile label={t('候选均分', 'Candidate avg')} value={formatScore(report.candidate_score_avg)} />
<MetricTile label={t('基线能力均分', 'Baseline ability')} value={formatScore(report.baseline_score_avg)} />
<MetricTile label={t('候选能力均分', 'Candidate ability')} value={formatScore(report.candidate_score_avg)} />
<MetricTile
label={t('变化', 'Delta')}
value={`${report.score_delta >= 0 ? '+' : ''}${formatScore(report.score_delta)}`}
@ -1089,8 +1169,14 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<MetricTile label={t('执行覆盖', 'Execution')} value={formatPercent(report.execution_coverage)} />
<MetricTile label={t('替代评估', 'Surrogate')} value={formatPercent(report.surrogate_coverage)} />
<MetricTile label={t('真实案例均分', 'Real avg')} value={formatOptionalScore(realScore)} />
<MetricTile label={t('模拟案例均分', 'Synthetic avg')} value={formatOptionalScore(syntheticScore)} />
<MetricTile label={t('总体能力分', 'Overall ability')} value={formatOptionalScore(overallScore)} />
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<MetricTile label={t('工具执行覆盖', 'Tool execution')} value={formatPercent(toOptionalNumber(toolExecutionSummary.executed) ?? report.execution_coverage)} />
<MetricTile label={t('替代工具评估', 'Tool surrogate')} value={formatPercent(toOptionalNumber(toolExecutionSummary.surrogate) ?? report.surrogate_coverage)} />
<MetricTile label={t('置信度', 'Confidence')} value={report.confidence || 'low'} />
</div>
@ -1100,6 +1186,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('不变', 'Unchanged')} value={String(report.unchanged_count)} />
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('真实案例', 'Real cases')} value={String(realCaseCount)} />
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('模拟案例', 'Synthetic cases')} value={String(syntheticCaseCount)} />
<ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('无验证器已排除', 'No-validator excluded')} value={String(excludedSynthetic)} />
</div>
{report.cases.length > 0 && (
<div className="mt-3 overflow-hidden rounded-md border border-border bg-white">
<div className="border-b border-border px-3 py-2 text-xs font-medium text-muted-foreground">
@ -1114,6 +1206,10 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
<MetricTile label={t('候选', 'Candidate')} value={formatScore(toNumber(item.candidate_score))} />
<MetricTile label={t('变化', 'Delta')} value={formatSignedScore(toNumber(item.delta))} />
</div>
<div className="mt-2 text-muted-foreground">
{String(item.synthetic) === 'true' ? t('模拟案例', 'Synthetic case') : t('真实案例', 'Real case')}
{item.tier ? ` · ${String(item.tier)}` : ''}
</div>
</div>
))}
</div>
@ -1122,6 +1218,7 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
<thead className="bg-muted/40 text-muted-foreground">
<tr>
<th className="px-3 py-2 font-medium">{t('运行', 'Run')}</th>
<th className="px-3 py-2 font-medium">{t('来源', 'Source')}</th>
<th className="px-3 py-2 font-medium">{t('基线', 'Baseline')}</th>
<th className="px-3 py-2 font-medium">{t('候选', 'Candidate')}</th>
<th className="px-3 py-2 font-medium">{t('变化', 'Delta')}</th>
@ -1131,6 +1228,10 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
{report.cases.map((item, index) => (
<tr key={`${String(item.run_id || index)}:${index}`} className="border-t border-border">
<td className="max-w-[160px] truncate px-3 py-2 font-mono">{String(item.run_id || '-')}</td>
<td className="px-3 py-2">
{String(item.synthetic) === 'true' ? t('模拟', 'Synthetic') : t('真实', 'Real')}
{item.tier ? ` · ${String(item.tier)}` : ''}
</td>
<td className="px-3 py-2">{formatScore(toNumber(item.baseline_score))}</td>
<td className="px-3 py-2">{formatScore(toNumber(item.candidate_score))}</td>
<td className="px-3 py-2">{formatSignedScore(toNumber(item.delta))}</td>
@ -1144,6 +1245,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
{Array.isArray(report.case_reports) && report.case_reports.length > 0 ? (
<RawDetails title={t('Replay case reports', 'Replay case reports')} payload={report.case_reports} />
) : null}
{Object.keys(abilitySummary).length > 0 ? (
<RawDetails title={t('能力评分汇总', 'Ability score summary')} payload={abilitySummary} />
) : null}
{Object.keys(toolExecutionSummary).length > 0 ? (
<RawDetails title={t('工具诊断汇总', 'Tool diagnostic summary')} payload={toolExecutionSummary} />
) : null}
{report.preservation_report ? (
<RawDetails title={t('Preservation report', 'Preservation report')} payload={report.preservation_report} />
) : null}
@ -1366,7 +1473,9 @@ function triggerReasonLabel(reason: string, t: (zh: string, en: string) => strin
}
function publishBlockReason(draft: SkillDraft, t: (zh: string, en: string) => string): string {
if (draft.status !== 'approved') return t('草稿还没有批准,不能发布。', 'The draft is not approved yet.');
if (draft.status !== 'in_review' && draft.status !== 'approved') {
return t('草稿还没有送审,不能发布。', 'The draft has not been submitted yet.');
}
if (!draft.safety_report) return t('缺少安全报告,不能发布。', 'A safety report is required before publishing.');
if (draft.safety_report.risk_level === 'critical' || !draft.safety_report.passed) {
return t('安全报告存在阻断项,不能发布。', 'The safety report has blockers.');
@ -1399,6 +1508,11 @@ function formatScore(value: number): string {
return value.toFixed(2);
}
/**
 * Formats a score that may be missing: values that `toOptionalNumber`
 * cannot turn into a number render as '-', everything else goes through
 * `formatScore`.
 */
function formatOptionalScore(value: unknown): string {
  const score = toOptionalNumber(value);
  if (score === null) {
    return '-';
  }
  return formatScore(score);
}
function formatPercent(value?: number | null): string {
if (typeof value !== 'number' || Number.isNaN(value)) return '0%';
return `${Math.round(value * 100)}%`;
@ -1414,6 +1528,12 @@ function toNumber(value: unknown): number {
return Number.isFinite(parsed) ? parsed : 0;
}
/**
 * Converts a loosely-typed value to a finite number, or `null` when there is
 * no usable number.
 *
 * `null`, `undefined`, empty strings and whitespace-only strings are treated
 * as missing. The whitespace case matters because `Number('  ')` is 0, which
 * would otherwise present a blank field as a real zero. Anything that does
 * not coerce to a finite number (NaN, ±Infinity, non-numeric text) also
 * yields `null`.
 */
function toOptionalNumber(value: unknown): number | null {
  if (value === null || value === undefined) return null;
  if (typeof value === 'string' && value.trim() === '') return null;
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : null;
}
function EmptyState({ icon, text }: { icon: React.ReactNode; text: string }) {
return (
<div className="py-12 text-center text-muted-foreground">
@ -1475,7 +1595,7 @@ function UploadSkillForm({
className="block w-full cursor-pointer text-sm text-muted-foreground file:mr-4 file:rounded-md file:border-0 file:bg-primary file:px-4 file:py-2 file:text-sm file:font-medium file:text-primary-foreground hover:file:bg-primary/90"
/>
<p className="text-xs text-muted-foreground">
{pickAppText(locale, '上传后进入草稿评审,并自动运行 safety 和 eval。', 'After upload, the skill enters draft review and runs safety and eval automatically.')}
{pickAppText(locale, '上传后生成草稿;送审后再运行 safety 和 eval。', 'After upload, a draft is created; safety and eval run after submission.')}
</p>
</div>
<div className="flex justify-end gap-2">

View File

@ -3,7 +3,7 @@
import { useEffect } from 'react';
import { usePathname, useRouter, useSearchParams } from 'next/navigation';
import { buildAuthPortalUrl } from '@/lib/auth-portal';
import { clearTokens, getMe, isLoggedIn } from '@/lib/api';
import { AUTH_CLEARED_EVENT, clearTokens, getMe, isLoggedIn } from '@/lib/api';
import { pickAppText } from '@/lib/i18n/core';
import { useAppI18n } from '@/lib/i18n/provider';
import { useChatStore } from '@/lib/store';
@ -66,6 +66,18 @@ export default function AuthGuard({
};
}, [setIsAuthLoading, setUser]);
useEffect(() => {
const handleAuthCleared = () => {
setUser(null);
setIsAuthLoading(false);
};
window.addEventListener(AUTH_CLEARED_EVENT, handleAuthCleared);
return () => {
window.removeEventListener(AUTH_CLEARED_EVENT, handleAuthCleared);
};
}, [setIsAuthLoading, setUser]);
useEffect(() => {
if (isAuthLoading) {
return;

View File

@ -58,6 +58,7 @@ const WS_URL = process.env.NEXT_PUBLIC_WS_URL?.trim();
const DEFAULT_API_URL = 'http://127.0.0.1:18080';
const ACCESS_TOKEN_KEY = 'beaver_access_token';
const REFRESH_TOKEN_KEY = 'beaver_refresh_token';
export const AUTH_CLEARED_EVENT = 'beaver-auth-cleared';
const REQUEST_TIMEOUT_MS = 8000;
const OUTLOOK_REQUEST_TIMEOUT_MS = 45000;
const SKILL_LEARNING_REQUEST_TIMEOUT_MS = 120000;
@ -117,6 +118,34 @@ type FetchJsonOptions = RequestInit & {
timeoutMs?: number;
};
/**
 * Error that carries an HTTP status code and a detail string alongside the
 * regular error message, so callers can branch on `status` instead of
 * parsing the message text.
 */
export class ApiError extends Error {
  status: number;
  detail: string;

  constructor(message: string, options: { status: number; detail: string }) {
    super(message);
    const { status, detail } = options;
    this.name = 'ApiError';
    this.status = status;
    this.detail = detail;
  }
}
/**
 * Type guard for `ApiError`. When `status` is given, the error must also
 * carry exactly that status code.
 */
export function isApiError(error: unknown, status?: number): error is ApiError {
  if (!(error instanceof ApiError)) {
    return false;
  }
  return status === undefined || error.status === status;
}
/**
 * Extracts a readable message from an error response body.
 *
 * Returns the `detail` field when the body is JSON with a string `detail`;
 * otherwise (not JSON, no `detail`, or a non-string `detail`) returns the
 * body text unchanged.
 */
function parseErrorDetail(text: string): string {
  let payload: { detail?: unknown } | null | undefined;
  try {
    payload = JSON.parse(text);
  } catch {
    // Not JSON: keep the raw text.
    return text;
  }
  const detail = payload?.detail;
  return typeof detail === 'string' ? detail : text;
}
function withTimeout(
signal?: AbortSignal,
timeoutMs: number = REQUEST_TIMEOUT_MS
@ -163,6 +192,7 @@ export function clearTokens(): void {
if (!isBrowser()) return;
localStorage.removeItem(ACCESS_TOKEN_KEY);
localStorage.removeItem(REFRESH_TOKEN_KEY);
window.dispatchEvent(new CustomEvent(AUTH_CLEARED_EVENT));
}
export function isLoggedIn(): boolean {
@ -215,16 +245,11 @@ async function fetchJSON<T>(path: string, options?: FetchJsonOptions): Promise<T
if (res.status === 401) {
clearTokens();
}
let detail = text;
try {
const parsed = JSON.parse(text);
if (parsed && typeof parsed.detail === 'string') {
detail = parsed.detail;
}
} catch {
// keep raw text
}
throw new Error(`${pickAppText(locale, '接口错误', 'API error')} ${res.status}: ${detail}`);
const detail = parseErrorDetail(text);
throw new ApiError(`${pickAppText(locale, '接口错误', 'API error')} ${res.status}: ${detail}`, {
status: res.status,
detail,
});
}
return res.json();
}
@ -1216,7 +1241,7 @@ export async function uploadSkill(file: File): Promise<Skill> {
if (!res.ok) {
const text = await res.text();
throw new Error(`接口错误 ${res.status}: ${text}`);
throw new Error(`接口错误 ${res.status}: ${parseErrorDetail(text)}`);
}
return res.json();
}

View File

@ -0,0 +1,8 @@
// Top-level folders under which mutating file actions are allowed.
const USER_FILE_MUTABLE_ROOTS = new Set(['uploads', 'outputs', 'shared', 'tasks']);

/**
 * Tells whether mutating file actions may be offered for `path`.
 *
 * Surrounding whitespace and leading/trailing slashes are ignored. The path
 * qualifies only when its first segment is one of the mutable roots and no
 * segment is `..`: a path such as `uploads/../x` starts with a mutable root
 * but resolves outside it, so it is rejected. Any `..` segment is rejected,
 * even one that would resolve back inside the same root.
 *
 * @param path - Slash-separated path relative to the user-file root.
 * @returns `true` when the path is inside a mutable root.
 */
export function canMutateUserFilesPath(path: string): boolean {
  const cleaned = path.trim().replace(/^\/+|\/+$/g, '');
  if (!cleaned) return false;
  const segments = cleaned.split('/');
  if (segments.includes('..')) return false;
  const [root] = segments;
  return USER_FILE_MUTABLE_ROOTS.has(root);
}

View File

@ -3,9 +3,23 @@ import { resolve } from 'node:path';
import { describe, expect, it } from 'vitest';
import { canMutateUserFilesPath } from './user-file-paths';
const root = resolve(__dirname, '..');
describe('user file system frontend wiring', () => {
it('only enables mutating file actions inside concrete user-file roots', () => {
expect(canMutateUserFilesPath('')).toBe(false);
expect(canMutateUserFilesPath('/')).toBe(false);
expect(canMutateUserFilesPath('qa-folder')).toBe(false);
expect(canMutateUserFilesPath('uploads')).toBe(true);
expect(canMutateUserFilesPath('uploads/qa-folder')).toBe(true);
expect(canMutateUserFilesPath('outputs/report.md')).toBe(true);
expect(canMutateUserFilesPath('shared')).toBe(true);
expect(canMutateUserFilesPath('tasks/task-1')).toBe(true);
});
it('routes API client helpers to user file endpoints', () => {
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
@ -17,6 +31,13 @@ describe('user file system frontend wiring', () => {
expect(apiSource).toContain('/api/user-files/mkdir');
});
it('notifies the app shell when API auth is cleared', () => {
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
expect(apiSource).toContain('AUTH_CLEARED_EVENT');
expect(apiSource).toContain("window.dispatchEvent(new CustomEvent(AUTH_CLEARED_EVENT))");
});
it('does not wire the Files page to workspace or MinIO management APIs', () => {
const pageSource = readFileSync(resolve(root, 'app/(app)/files/page.tsx'), 'utf8');
@ -29,4 +50,18 @@ describe('user file system frontend wiring', () => {
expect(pageSource).not.toContain('accessKey');
expect(pageSource).not.toContain('secretKey');
});
it('does not retry user-file loads after an auth failure', () => {
const pageSource = readFileSync(resolve(root, 'app/(app)/files/page.tsx'), 'utf8');
expect(pageSource).toContain('isAuthError');
expect(pageSource).toContain('if (isAuthError(err))');
});
it('shows backend upload error details instead of raw JSON payloads', () => {
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
expect(apiSource).toContain('function parseErrorDetail');
expect(apiSource).toContain('throw new Error(`接口错误 ${res.status}: ${parseErrorDetail(text)}`)');
});
});

View File

@ -993,6 +993,12 @@ export interface SkillDraftEvalReport {
confidence?: 'low' | 'medium' | 'high' | string;
case_reports?: Array<Record<string, unknown>>;
tool_mode_summary?: Record<string, unknown>;
ability_score_summary?: Record<string, unknown>;
tool_execution_summary?: Record<string, unknown>;
case_selection_summary?: Record<string, unknown>;
real_score_avg?: number | null;
synthetic_score_avg?: number | null;
overall_score_avg?: number | null;
preservation_report?: Record<string, unknown> | null;
}
@ -1000,6 +1006,15 @@ export interface SkillDraft {
draft_id: string;
skill_name: string;
base_version?: string | null;
target_version?: string | null;
base_skill?: {
skill_name: string;
version: string;
frontmatter: Record<string, unknown>;
content: string;
summary?: string;
tool_hints?: string[];
} | null;
proposed_content: string;
proposed_frontmatter: Record<string, unknown>;
created_at: string;

View File

@ -47,6 +47,8 @@ http {
location /api/ {
proxy_pass http://127.0.0.1:18080;
proxy_read_timeout 3600;
proxy_send_timeout 3600;
}
location /docs {

View File

@ -99,7 +99,11 @@ def provision_user_file_minio_settings(
policy = _namespace_policy(bucket=cfg.bucket, namespace=namespace)
admin.policy_add(policy_name, policy=policy)
admin.attach_policy(policies=[policy_name], user=access_key)
try:
admin.attach_policy(policies=[policy_name], user=access_key)
except Exception as exc:
if not _is_policy_attach_already_applied(exc):
raise
except Exception as exc:
raise MinIOProvisioningError(f"MinIO user file provisioning failed: {exc}") from exc
@ -304,6 +308,15 @@ def _is_absent_error(exc: Exception) -> bool:
return any(marker in text for marker in absent_markers)
def _is_policy_attach_already_applied(exc: Exception) -> bool:
    """Return True when *exc* says the policy attachment changed nothing.

    The text from ``_safe_error_text`` is checked for the
    ``XMinioAdminPolicyChangeAlreadyApplied`` code (case-sensitive) and for
    two human-readable messages (case-insensitive).
    """
    message = _safe_error_text(exc)
    if "XMinioAdminPolicyChangeAlreadyApplied" in message:
        return True
    lowered = message.lower()
    return any(
        phrase in lowered
        for phrase in (
            "specified policy change is already in effect",
            "policy update has no net effect",
        )
    )
def _safe_error_text(exc: object) -> str:
text = str(exc).strip()
return text or exc.__class__.__name__

View File

@ -10,6 +10,7 @@ from fastapi.testclient import TestClient
from app.minio_provisioning import (
MinIOProvisioningConfig,
deprovision_user_file_minio_resources,
provision_user_file_minio_settings,
)
from app.models import MinIOSettings
@ -23,6 +24,7 @@ class _FakeMinio:
bucket_exists_value = True
objects: list[str] = []
removed_objects: list[str] = []
made_buckets: list[str] = []
def __init__(self, **_kwargs: Any) -> None:
pass
@ -30,6 +32,9 @@ class _FakeMinio:
def bucket_exists(self, bucket: str) -> bool:
return self.bucket_exists_value
def make_bucket(self, bucket: str, location: str | None = None) -> None:
self.made_buckets.append(bucket)
def list_objects(self, bucket: str, *, prefix: str, recursive: bool) -> list[_FakeObject]:
return [_FakeObject(name) for name in self.objects if name.startswith(prefix)]
@ -41,10 +46,26 @@ class _FakeMinio:
class _FakeAdmin:
calls: list[tuple[str, Any]] = []
missing = False
attach_policy_already_applied = False
def __init__(self, **_kwargs: Any) -> None:
pass
def user_add(self, access_key: str, secret_key: str) -> None:
self.calls.append(("user_add", access_key))
def policy_add(self, policy_name: str, *, policy: dict[str, Any]) -> None:
self.calls.append(("policy_add", policy_name))
def attach_policy(self, **kwargs: Any) -> None:
self.calls.append(("attach_policy", kwargs))
if self.attach_policy_already_applied:
raise RuntimeError(
"admin request failed; Status: 400, Body: "
'{"Code":"XMinioAdminPolicyChangeAlreadyApplied",'
'"Message":"The specified policy change is already in effect."}'
)
def detach_policy(self, **kwargs: Any) -> None:
self.calls.append(("detach_policy", kwargs))
if self.missing:
@ -88,8 +109,10 @@ def _install_fake_minio(monkeypatch) -> None:
_FakeMinio.bucket_exists_value = True
_FakeMinio.objects = []
_FakeMinio.removed_objects = []
_FakeMinio.made_buckets = []
_FakeAdmin.calls = []
_FakeAdmin.missing = False
_FakeAdmin.attach_policy_already_applied = False
def _config() -> MinIOProvisioningConfig:
@ -159,6 +182,25 @@ def test_deprovision_removes_namespace_resources_without_secrets(monkeypatch) ->
assert "secret" not in str(result).lower()
def test_provision_treats_already_attached_policy_as_idempotent(monkeypatch) -> None:
    """Provisioning still returns settings when attach_policy reports "already applied"."""
    _install_fake_minio(monkeypatch)
    # Make the fake admin raise the "already applied" error from attach_policy.
    _FakeAdmin.attach_policy_already_applied = True

    provisioned = provision_user_file_minio_settings(
        backend_id="alice",
        existing=None,
        config=_config(),
    )

    assert provisioned is not None
    assert provisioned.endpoint == "minio.local:9000"
    assert provisioned.access_key == "beaver-alice"
    assert provisioned.bucket == "beaver-user-files"
    assert provisioned.namespace == "users/alice"
    assert provisioned.secret_key
    # The attach call must have been attempted even though it raised.
    expected_attach_call = (
        "attach_policy",
        {"policies": ["beaver-user-files-alice"], "user": "beaver-alice"},
    )
    assert expected_attach_call in _FakeAdmin.calls
def test_deprovision_is_idempotent_when_resources_are_absent(monkeypatch) -> None:
_install_fake_minio(monkeypatch)
_FakeMinio.bucket_exists_value = False

View File

@ -8,7 +8,7 @@ Beaver is an enterprise Agent sandbox and execution platform. It combines privat
- [Business Strategy HTML](./index.html): business-style product discovery, strategy canvas, target users, segmentation, and competitors.
- [Product PRD HTML](./product-prd.html): product PRD, outcome roadmap, module job stories, WWA backlog items, and test scenarios.
- [Product Discovery Report](./product-discovery-report.md): product understanding, users, JTBD, opportunities, assumptions, experiments, priorities, metrics, and 30/90 day recommendations.
- [Product Discovery Report](./product-discovery-report.md): product understanding, users, JTBD, opportunities, assumptions, experiments, priorities, and 30/90 day recommendations.
- [Product Architecture Brief](./product-architecture-brief.md): product-facing architecture across auth, deployment control, routing, app instances, frontend, backend, Agent runtime, tools, skills, memory, files, connectors, and operations.
- [PRD](./PRD-beaver-agent-sandbox.md): full-product PRD for the Beaver Agent Sandbox.
- [Validation Plan](./validation-plan.md): customer, product, technical, security, usability, and business validation plan.

View File

@ -738,7 +738,6 @@
<a href="#personas">用户画像</a>
<a href="#behavior">行为分群</a>
<a href="#competitors">竞品</a>
<a href="#metrics">验收指标</a>
</nav>
</div>
</header>
@ -758,7 +757,7 @@
<div class="kpi"><span>产品主线</span><b>执行</b>不是聊天</div>
<div class="kpi"><span>商业切口</span><b>团队</b>知识工作</div>
<div class="kpi"><span>核心壁垒</span><b>复用</b>技能与记忆</div>
<div class="kpi"><span>试点指标</span><b>验收</b>真实任务</div>
<div class="kpi"><span>价值判断</span><b>交付</b>真实任务</div>
</div>
</div>
@ -853,10 +852,9 @@
<article class="card accent-amber"><span class="tag amber">3. Relative Costs</span><h3>不打最低价,强调可控价值</h3><p>Beaver 应走“私有部署 + 执行治理 + 复用资产”的高价值路线,而不是和通用 SaaS 聊天工具比低价。</p></article>
<article class="card"><span class="tag">4. Value Proposition</span><h3>从回答到交付</h3><p>Before:AI 输出散落在聊天里;How:任务化执行、工具证据、用户验收;After:产物可交付,经验可沉淀。</p></article>
<article class="card"><span class="tag">5. Trade-offs</span><h3>明确不做什么</h3><p>不先做大众聊天 SaaS,不先铺满所有连接器,不默认自动发布技能,不在无控制台前大规模启用敏感长期记忆。</p></article>
<article class="card"><span class="tag">6. Metrics</span><h3>北极星是“已验收工作”</h3><p>核心指标不是消息数,而是每个试点团队每周完成并被接受的 Agent 工作数。季度 OMTM首批试点的已验收任务数</p></article>
<article class="card"><span class="tag">7. Growth</span><h3>销售驱动 + 试点转扩展</h3><p>先通过高价值工作流试点进入客户,再从一个团队扩展到部门,最后以技能、模板、连接器和治理能力形成扩张</p></article>
<article class="card"><span class="tag">8. Capabilities</span><h3>需要补强的能力</h3><p>工作流模板、证据叙事、Memory Control Center、Admin Health Console、连接器安全策略、技能评估门禁</p></article>
<article class="card"><span class="tag">9. Can't / Won't</span><h3>护城河来自运行闭环</h3><p>单个聊天 UI 容易复制;难复制的是私有实例、任务证据、验收反馈、技能记忆沉淀和客户真实工作流数据。</p></article>
<article class="card"><span class="tag">6. Growth</span><h3>销售驱动 + 试点转扩展</h3><p>先通过高价值工作流试点进入客户,再从一个团队扩展到部门,最后以技能、模板、连接器和治理能力形成扩张。</p></article>
<article class="card"><span class="tag">7. Capabilities</span><h3>需要补强的能力</h3><p>工作流模板、证据叙事、Memory Control Center、Admin Health Console、连接器安全策略、技能评估门禁。</p></article>
<article class="card"><span class="tag">8. Can't / Won't</span><h3>护城河来自运行闭环</h3><p>单个聊天 UI 容易复制;难复制的是私有实例、任务证据、验收反馈、技能记忆沉淀和客户真实工作流数据。</p></article>
</div>
</section>
@ -1209,29 +1207,12 @@
<li>不要先做所有人的通用 AI 助手。</li>
<li>不要和 Dify/Stack AI 正面比“谁更会搭 Agent”。</li>
<li>不要过早承诺所有连接器和完全自治。</li>
<li>不要把验收指标、路线图和上线计划放在前面抢主线。</li>
<li>不要把路线图和上线计划放在前面抢产品发现主线。</li>
</ul>
</article>
</div>
</section>
<section id="metrics">
<div class="section-head">
<div>
<div class="eyebrow">Acceptance Metrics</div>
<h2>验收指标放在最后</h2>
</div>
<p>这些指标只作为后续试点验收的出口,不在当前页面前半段展开路线图和上线维护。</p>
</div>
<div class="grid-4">
<div class="kpi"><span>北极星</span><b>已验收任务</b>每周/每团队</div>
<div class="kpi"><span>30 天目标</span><b>30+</b>真实验收任务</div>
<div class="kpi"><span>复用目标</span><b>5</b>技能,其中 3 个复用</div>
<div class="kpi"><span>安全目标</span><b>0</b>关键事故</div>
</div>
</section>
<section id="sources">
<div class="section-head">
<div>

View File

@ -87,7 +87,6 @@ For product pilots:
| Connector maturity varies by channel | Customer demos must avoid overpromising |
| Multi-instance deployment is powerful but operationally sensitive | Pilot success depends on stable setup and clear runbooks |
| Skill learning needs strong governance | Reuse can become risk if publishing is weak |
| Metrics are not yet productized | Hard to prove pilot value without baseline and target |
| Customer research is not yet captured | Current roadmap is inferred from implementation and product judgment |
## User Segments
@ -345,51 +344,6 @@ Opportunity 3: I need successful work to become reusable.
| Production writes through connectors without review | Trust risk |
| Complex enterprise RBAC before pilot validation | May overbuild before segment clarity |
## Metrics Dashboard
### North Star Metric
Accepted Agent Workflows:
> Number of AI-assisted tasks or scheduled workflows accepted by users per active pilot team per week.
Why this metric: it captures real delivered value better than messages sent, tokens used, or model calls.
### Input Metrics
| Metric | Definition | Target For Pilot |
| --- | --- | --- |
| Task Creation Rate | Tasks created / active users / week | Increasing weekly |
| Acceptance Rate | Accepted task runs / completed task runs | >=60% in pilot |
| Revision Rate | Runs needing revision / completed runs | Track down over time |
| Evidence Coverage | Task runs with timeline/tool/artifact evidence / task runs | >=90% |
| Skill Candidate Rate | Accepted tasks producing candidates / accepted tasks | >=20% after week 2 |
| Skill Reuse Rate | Runs activating published pilot skills / task runs | >=15% after skills exist |
| Scheduled Success Rate | Accepted scheduled outputs / scheduled runs | >=50% for selected workflows |
| Deployment Success Time | Fresh deployment time to first working user | <2 hours for pilot |
### Guardrail Metrics
| Metric | Alert |
| --- | --- |
| Critical tool/security incident | Any occurrence |
| Instance creation failure rate | >10% in pilot |
| Provider configuration failure rate | >20% |
| Task run failure rate | >20% for 2 consecutive days |
| Connector side-effect incident | Any unintended external write |
| User file permission/storage incident | Any cross-user or cross-instance leak |
| p95 task completion latency | Exceeds pilot workflow tolerance |
### Business Metrics
- Pilot activation: teams reaching first accepted task.
- Time to first accepted task.
- Weekly active task users.
- Repeated workflow count.
- Skill reuse per team.
- Customer-reported time saved.
- Pilot conversion intent.
## Customer Research Plan
No direct interview transcripts were provided. Research should start immediately before locking roadmap.
@ -454,7 +408,7 @@ We are studying how teams move AI from chat into real work. We are not asking wh
1. Pick 2-3 pilot workflows: project brief, weekly report, document review, support triage, or file processing.
2. Run fresh deployment rehearsal from README/deployment guide and record gaps.
3. Define pilot metrics and instrument accepted tasks, revisions, skill candidates, skill reuse, and run failures.
3. Define pilot learning questions and instrument the events needed to answer them.
4. Create a task evidence narrative prototype on top of existing timeline data.
5. Package pilot workflow templates as skills or documented demos.
6. Validate provider onboarding with 3 non-engineer users.

View File

@ -733,7 +733,7 @@
<span class="tag green">2. Contacts</span>
<h3>关键角色</h3>
<ul>
<li>产品负责人:定义首批场景、验收指标和模块优先级。</li>
<li>产品负责人:定义首批场景、试点问题和模块优先级。</li>
<li>工程负责人:保证实例、任务、工具、技能和连接器架构可落地。</li>
<li>设计负责人:保证工作台、任务详情、技能审核和配置体验可理解。</li>
<li>运维负责人:保证部署、路由、日志、备份和故障恢复可执行。</li>

View File

@ -5,9 +5,16 @@
"display_name": "cron-scheduler",
"lineage": [],
"name": "cron-scheduler",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["cron", "scheduler", "timer", "periodic"],
"tags": [
"cron",
"scheduler",
"timer",
"periodic"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}
}

View File

@ -5,13 +5,35 @@ tools:
- cron
---
# Cron Scheduler — 定时任务调度
# Cron Scheduler
## Overview
定时任务和周期性调度。支持标准 cron 表达式、一次性提醒和持久化任务。
## When to Use
- Use when the task requires Cron Scheduler guidance.
## Required Tools
- `cron`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Cron Scheduler — 定时任务调度
基于 cron 表达式的定时任务和一次性提醒。
## 工具说明
#### 工具说明
### cron
##### cron
创建和管理 Beaver 定时通知或 Task。
- `action` (str): `add` | `list` | `remove` | `toggle` | `run`
- `message` (str): 触发时执行的任务说明,`add` 时必填
@ -25,10 +47,25 @@ tools:
- `mode` (str | None): `notification` 或 `task`
- `requires_followup` (bool | None): task 模式下是否需要用户跟进
## 使用原则
#### 使用原则
1. 避开 :00 和 :30 整点分钟,分散负载
2. 一次性提醒优先使用 `at_iso` 或清晰的 `schedule`
3. 需要持续提醒时使用 `mode="notification"`,需要 Task 跟踪时才用 `mode="task"`
4. 定期用 `action="list"` 确认任务是否按预期调度
5. 任务触发时 `message` 会完整执行,确保内容自包含
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,14 @@
{
"change_reason": "Initial skill for cron scheduling",
"content_hash": "placeholder",
"content_hash": "1826b1b2921197045bccce45b4e1997ee212d10cc28b3ea5f42bf7b1982beacc",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "定时任务和周期性调度。支持标准 cron 表达式、一次性提醒和持久化任务。",
"name": "cron-scheduler",
"tools": ["cron"]
"tools": [
"cron"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +17,10 @@
},
"review_state": "published",
"skill_name": "cron-scheduler",
"summary": "Cron Scheduler — 基于 cron 表达式的定时任务和一次性提醒",
"summary_hash": "placeholder",
"tool_hints": ["cron"],
"summary": "# Cron Scheduler ## Overview 定时任务和周期性调度。支持标准 cron 表达式、一次性提醒和持久化任务。",
"summary_hash": "66b35720f0eb98008c5e53408bb8f13961f7e733deb5e01409f7cb6d017ba002",
"tool_hints": [
"cron"
],
"version": "v0001"
}

View File

@ -5,9 +5,16 @@
"display_name": "filesystem-operation",
"lineage": [],
"name": "filesystem-operation",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["filesystem", "file", "io", "directory"],
"tags": [
"filesystem",
"file",
"io",
"directory"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}
}

View File

@ -9,42 +9,83 @@ tools:
- list_directory
---
# Filesystem Operation — 文件系统操作
# Filesystem Operation
## Overview
本地文件系统读写、搜索和目录操作。支持读取、写入、修改、搜索文件和目录遍历。
## When to Use
- Use when the task requires Filesystem Operation guidance.
## Required Tools
- `read_file`
- `write_file`
- `patch_file`
- `search_files`
- `list_directory`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Filesystem Operation — 文件系统操作
本地文件系统工具集,用于读写和搜索项目文件。
## 工具说明
#### 工具说明
### read_file
##### read_file
读取本地文件内容。
- 使用 `skill_view` 查看文件预览
- 大文件会分页返回,可通过 offset/limit 控制
### write_file
##### write_file
写入新文件或覆盖已有文件。
- 创建新文件时自动创建父目录
- 写入前确认不会覆盖重要配置
### patch_file
##### patch_file
精确修改文件中的指定内容。
- 通过搜索-替换方式修改
- 适用于局部更新,避免整文件重写
### search_files
##### search_files
在项目中搜索文件名或内容。
- 支持 glob 模式匹配
- 支持按内容搜索
- 支持限制搜索目录深度
### list_directory
##### list_directory
列出目录内容。
- 可递归列出子目录
- 支持过滤文件类型
## 使用原则
#### 使用原则
1. 优先使用 `read_file` 查看文件内容,再决定修改方案
2. 小范围修改用 `patch_file`,大范围用 `write_file`
3. 搜索文件时先确认路径是否存在
4. 修改前确认文件编码(默认 UTF-8)
5. 敏感文件(.env、密钥等)不写入版本控制
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,18 @@
{
"change_reason": "Initial skill for local filesystem operations",
"content_hash": "placeholder",
"content_hash": "d462cfff23d0a7c79e5c7319c66952133482193f063150062a93853a489e1160",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "本地文件系统读写、搜索和目录操作。支持读取、写入、修改、搜索文件和目录遍历。",
"name": "filesystem-operation",
"tools": ["read_file", "write_file", "patch_file", "search_files", "list_directory"]
"tools": [
"read_file",
"write_file",
"patch_file",
"search_files",
"list_directory"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +21,14 @@
},
"review_state": "published",
"skill_name": "filesystem-operation",
"summary": "Filesystem Operation — 本地文件系统操作工具集",
"summary_hash": "placeholder",
"tool_hints": ["read_file", "write_file", "patch_file", "search_files", "list_directory"],
"summary": "# Filesystem Operation ## Overview 本地文件系统读写、搜索和目录操作。支持读取、写入、修改、搜索文件和目录遍历。",
"summary_hash": "aa53a9010f1f28469aecbdc81e382a2a6ff1a1335cce3abba56ae9a084535605",
"tool_hints": [
"read_file",
"write_file",
"patch_file",
"search_files",
"list_directory"
],
"version": "v0001"
}
}

View File

@ -5,9 +5,16 @@
"display_name": "memory-management",
"lineage": [],
"name": "memory-management",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["memory", "persistence", "context", "preferences"],
"tags": [
"memory",
"persistence",
"context",
"preferences"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}
}

View File

@ -5,13 +5,35 @@ tools:
- memory
---
# Memory Management — 记忆管理
# Memory Management
## Overview
持久化记忆管理。存储用户信息、项目上下文、偏好和反馈,实现跨会话记忆。
## When to Use
- Use when the task requires Memory Management guidance.
## Required Tools
- `memory`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Memory Management — 记忆管理
持久化记忆系统,保存用户角色、项目上下文、偏好反馈等跨会话信息。
## 工具说明
#### 工具说明
### memory
##### memory
管理记忆条目(增删改查)。
- `action` (str): `add` | `replace` | `remove`
- `target` (str): `user` 或 `memory`
@ -23,10 +45,25 @@ tools:
- 支持自动保存和检索
- 跨会话持久化
## 使用原则
#### 使用原则
1. 了解用户角色偏好后及时保存到 `user` 类型
2. 用户明确要求记住的内容立即保存
3. 过时的记忆及时更新或删除
4. 不保存可以从代码/git 推导出的信息
5. 记忆是辅助参考,当前上下文和文件状态优先级更高
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,14 @@
{
"change_reason": "Initial skill for memory management",
"content_hash": "placeholder",
"content_hash": "2d6d3f35c8f0fedbfd4d3e999298f516846e512931241c157c8f978cbcd8d697",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "持久化记忆管理。存储用户信息、项目上下文、偏好和反馈,实现跨会话记忆。",
"name": "memory-management",
"tools": ["memory"]
"tools": [
"memory"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +17,10 @@
},
"review_state": "published",
"skill_name": "memory-management",
"summary": "Memory Management — 持久化记忆系统,支持跨会话信息存储",
"summary_hash": "placeholder",
"tool_hints": ["memory"],
"summary": "# Memory Management ## Overview 持久化记忆管理。存储用户信息、项目上下文、偏好和反馈,实现跨会话记忆。",
"summary_hash": "9a90dbc4b11315e936a752395efc0df32b0d02cad57e9ebd1de341512beff197",
"tool_hints": [
"memory"
],
"version": "v0001"
}

View File

@ -7,10 +7,32 @@ tools:
# Multi Search Engine
Integration of 16 search engines for web crawling without API keys.
## Overview
Multi search engine integration with 16 engines (7 CN + 9 Global). Supports advanced search operators, time filters, site search, privacy engines, and WolframAlpha knowledge queries. No API keys required.
## When to Use
- Use when the task requires Multi Search Engine guidance.
## Required Tools
- `web_fetch`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Multi Search Engine
Integration of 16 search engines for web crawling without API keys.
#### Workflow
1. **Preparation**: AI Agent initializes an empty in-memory cookie store. Cookies are only acquired dynamically during search operations when access is denied
2. **Language Evaluation**: Detect the language attribute of the search query. If the query is in Chinese, use Domestic search engines (Baidu, Bing CN, Bing INT, 360, Sogou, WeChat, Shenma). If the query is non-Chinese, use International search engines (Google, Google HK, DuckDuckGo, Yahoo, Startpage, Brave, Ecosia, Qwant, WolframAlpha). Select engines based on query relevance and availability.
@ -32,9 +54,9 @@ Integration of 16 search engines for web crawling without API keys.
6. **Result Aggregation**: Consolidate successful results from search engines, organize and summarize them to output a core search report
## Search Engines
#### Search Engines
### Domestic (7)
##### Domestic (7)
- **Baidu**: `https://www.baidu.com/s?wd={keyword}`
- **Bing CN**: `https://cn.bing.com/search?q={keyword}&ensearch=0`
- **Bing INT**: `https://cn.bing.com/search?q={keyword}&ensearch=1`
@ -43,7 +65,7 @@ Integration of 16 search engines for web crawling without API keys.
- **WeChat**: `https://wx.sogou.com/weixin?type=2&query={keyword}`
- **Shenma**: `https://m.sm.cn/s?q={keyword}`
### International (9)
##### International (9)
- **Google**: `https://www.google.com/search?q={keyword}`
- **Google HK**: `https://www.google.com.hk/search?q={keyword}`
- **DuckDuckGo**: `https://duckduckgo.com/html/?q={keyword}`
@ -54,7 +76,7 @@ Integration of 16 search engines for web crawling without API keys.
- **Qwant**: `https://www.qwant.com/?q={keyword}`
- **WolframAlpha**: `https://www.wolframalpha.com/input?i={keyword}`
## Quick Examples
#### Quick Examples
```javascript
// Basic search
@ -79,7 +101,7 @@ web_fetch({"url": "https://duckduckgo.com/html/?q=!gh+tensorflow"})
web_fetch({"url": "https://www.wolframalpha.com/input?i=100+USD+to+CNY"})
```
## Advanced Operators
#### Advanced Operators
| Operator | Example | Description |
|----------|---------|-------------|
@ -89,7 +111,7 @@ web_fetch({"url": "https://www.wolframalpha.com/input?i=100+USD+to+CNY"})
| `-` | `python -snake` | Exclude term |
| `OR` | `cat OR dog` | Either term |
## Time Filters
#### Time Filters
| Parameter | Description |
|-----------|-------------|
@ -99,14 +121,14 @@ web_fetch({"url": "https://www.wolframalpha.com/input?i=100+USD+to+CNY"})
| `tbs=qdr:m` | Past month |
| `tbs=qdr:y` | Past year |
## Privacy Engines
#### Privacy Engines
- **DuckDuckGo**: No tracking
- **Startpage**: Google results + privacy
- **Brave**: Independent index
- **Qwant**: EU GDPR compliant
## Bangs Shortcuts (DuckDuckGo)
#### Bangs Shortcuts (DuckDuckGo)
| Bang | Destination |
|------|-------------|
@ -116,26 +138,26 @@ web_fetch({"url": "https://www.wolframalpha.com/input?i=100+USD+to+CNY"})
| `!w` | Wikipedia |
| `!yt` | YouTube |
## WolframAlpha Queries
#### WolframAlpha Queries
- Math: `integrate x^2 dx`
- Conversion: `100 USD to CNY`
- Stocks: `AAPL stock`
- Weather: `weather in Beijing`
## Documentation
#### Documentation
- `references/advanced-search.md` - Domestic search guide
- `references/international-search.md` - International search guide
- `CHANGELOG.md` - Version history
## License
#### License
MIT
## Security & Privacy Notice
#### Security & Privacy Notice
### Cookie Handling
##### Cookie Handling
- **Purpose**: Cookies are used ONLY to maintain search session state when access is denied (403/429 errors)
- **Storage**: Cookies are kept STRICTLY in memory during runtime - NEVER persisted to disk or config files
- **Acquisition**: Cookies are acquired on-demand from search engine homepages only when search requests fail
@ -144,13 +166,28 @@ MIT
- **No Pre-configuration**: No cookies are loaded from config.json or any external file at startup
- **No API Keys**: This tool uses standard web search URLs, no authentication required
### Crawling Ethics
##### Crawling Ethics
- **Rate Limiting**: Implement reasonable delays between requests (recommend 1-2 seconds)
- **Respect robots.txt**: Honor search engine crawling policies
- **Terms of Service**: Users are responsible for complying with search engine ToS
- **Purpose**: Designed for legitimate search aggregation, not mass data scraping
### Data Handling
##### Data Handling
- **No Personal Data**: Tool does not collect or transmit user personal information
- **Local Execution**: All operations run locally, no external data transmission
- **Session Isolation**: Cookies are session-specific and cleared after use
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,6 +1,6 @@
{
"change_reason": "Initial skill seeded from SkillHub global/multi-search-engine@20260413.065325",
"content_hash": "fd2d3fecd923622e6fda6c607ae4913a9a88601cbb266c7b6a25ea856e4d7f91",
"content_hash": "0b46644d3b97b94b0a4b8b0747165ef083e4f5a30b90f6dbea3337fd4ca48cb9",
"created_at": "2026-06-04T09:44:11.388282+00:00",
"created_by": "skillhub",
"frontmatter": {
@ -17,13 +17,13 @@
"slug": "multi-search-engine",
"source": "initial_skills",
"source_kind": "initial",
"upstream_source": "skillhub",
"source_url": "https://skillhub.bwgdi.com/space/global/multi-search-engine"
"source_url": "https://skillhub.bwgdi.com/space/global/multi-search-engine",
"upstream_source": "skillhub"
},
"review_state": "published",
"skill_name": "multi-search-engine",
"summary": "# Multi Search Engine Integration of 16 search engines for web crawling without API keys. ## Workflow",
"summary_hash": "214e55914a70eabf8635c1d0bd4df1f46e01f988bed9ef42070aeab6aaf12c3b",
"summary": "# Multi Search Engine ## Overview Multi search engine integration with 16 engines (7 CN + 9 Global). Supports advanced search operators, time filters, site search, privacy engines, and WolframAlpha knowledge queries. No API keys required.",
"summary_hash": "ce97577b548d0e554c02471bcf8a4082f1024ff8cd7535359713b90f655f32e5",
"tool_hints": [
"web_fetch"
],

View File

@ -18,4 +18,3 @@
],
"updated_at": "2026-05-27T00:00:00.000000+00:00"
}

View File

@ -1,7 +1,6 @@
---
name: officebench-mcp
description: Guidance for OfficeBench evaluation tasks. Use the registered mcp_officebench_* tools to inspect and edit OfficeBench files, spreadsheets, documents, emails, calendars, PDFs, and answer files.
always: true
tools:
- mcp_officebench_excel_read_file
- mcp_officebench_excel_set_cell
@ -30,13 +29,62 @@ tools:
- mcp_officebench_system_finish_task
- mcp_officebench_system_get_status
- mcp_officebench_image_convert_to_pdf
always: True
---
# OfficeBench MCP Skill
# Officebench Mcp
## Overview
Guidance for OfficeBench evaluation tasks. Use the registered mcp_officebench_* tools to inspect and edit OfficeBench files, spreadsheets, documents, emails, calendars, PDFs, and answer files.
## When to Use
- Use when the task requires Officebench Mcp guidance.
## Required Tools
- `mcp_officebench_excel_read_file`
- `mcp_officebench_excel_set_cell`
- `mcp_officebench_excel_delete_cell`
- `mcp_officebench_excel_create_new_file`
- `mcp_officebench_excel_convert_to_pdf`
- `mcp_officebench_word_read_file`
- `mcp_officebench_word_write_to_file`
- `mcp_officebench_word_create_new_file`
- `mcp_officebench_word_convert_to_pdf`
- `mcp_officebench_email_list_emails`
- `mcp_officebench_email_read_email`
- `mcp_officebench_email_send_email`
- `mcp_officebench_calendar_create_event`
- `mcp_officebench_calendar_list_events`
- `mcp_officebench_calendar_delete_event`
- `mcp_officebench_pdf_read_file`
- `mcp_officebench_pdf_convert_to_word`
- `mcp_officebench_pdf_convert_to_image`
- `mcp_officebench_ocr_recognize_file`
- `mcp_officebench_shell_command`
- `mcp_officebench_shell_list_directory`
- `mcp_officebench_shell_read_file`
- `mcp_officebench_shell_write_file`
- `mcp_officebench_shell_copy_file`
- `mcp_officebench_system_finish_task`
- `mcp_officebench_system_get_status`
- `mcp_officebench_image_convert_to_pdf`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### OfficeBench MCP Skill
Use this skill for OfficeBench evaluation runs. OfficeBench task files live in the OfficeBench MCP server, not in Beaver's local filesystem. Complete the task by calling real `mcp_officebench_*` tools.
## Critical Rules
#### Critical Rules
1. Use actual Beaver tool calls only. Do not print XML, DSML, JSON, or markdown that describes a tool call.
2. Never invent tool names. If you need to find files, use `mcp_officebench_shell_list_directory` or `mcp_officebench_shell_command`.
@ -47,9 +95,9 @@ Use this skill for OfficeBench evaluation runs. OfficeBench task files live in t
7. Verify the requested output file or edited cell exists before finishing.
8. Finish every task with `mcp_officebench_system_finish_task`.
## Tool Names And Use
#### Tool Names And Use
### Excel
##### Excel
Use these for `.xlsx` files:
@ -81,7 +129,7 @@ Typical Excel sequence:
For the common task "change Bob's midterm1 score to 100 in score.xlsx", inspect `data/score.xlsx`, find Bob's row and the `midterm1` column, then call `mcp_officebench_excel_set_cell` with that row, that column, and value `100`.
### Word
##### Word
Use these for `.docx` files:
@ -100,7 +148,7 @@ Use these for `.docx` files:
Preserve exact spelling, capitalization, punctuation, and line order from source files.
### Email
##### Email
Use these for email tasks:
@ -115,7 +163,7 @@ Use these for email tasks:
For email-search tasks, final answers should use plain text with literal lines like `Subject: ...`. Do not add markdown labels.
### Calendar
##### Calendar
Use these for calendar `.ics` tasks:
@ -130,7 +178,7 @@ Use these for calendar `.ics` tasks:
Use the task's current date/time context when interpreting relative dates.
### PDF, OCR, And Images
##### PDF, OCR, And Images
Use these for PDF/image tasks:
@ -152,7 +200,7 @@ Use these for PDF/image tasks:
For conversion tasks, create the exact requested filename and verify it exists.
### Shell And System
##### Shell And System
Use these for safe file discovery and text files:
@ -177,7 +225,7 @@ Use these for safe file discovery and text files:
Prefer dedicated Office tools for Office documents. Use shell tools for listing directories, copying/renaming files, and reading/writing plain text.
## Anti-Patterns
#### Anti-Patterns
Do not do any of the following:
@ -188,3 +236,17 @@ Do not do any of the following:
- Do not use `/testbed` as a literal prefix in path arguments unless a tool explicitly asks for an absolute path.
- Do not correct misspellings found in source data. Preserve source text exactly.
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,6 +1,6 @@
{
"change_reason": "Initial OfficeBench MCP skill for evaluation runs",
"content_hash": "6afdd5a93ce552f39c1e285fc552059cfada7971e0d5bb91bcd56c6ca608ba17",
"content_hash": "54547e8b2b5de5700d57c464a19e941a2cddd6c42af69c91122f8bd4b9c6726c",
"created_at": "2026-05-27T00:00:00.000000+00:00",
"created_by": "codex",
"frontmatter": {
@ -44,8 +44,8 @@
},
"review_state": "published",
"skill_name": "officebench-mcp",
"summary": "OfficeBench MCP skill for using registered mcp_officebench_* tools correctly during evaluation runs.",
"summary_hash": "914d6759650fce29884f648b84929e0482475c3ccd6601e9903c9b8b826dd874",
"summary": "# Officebench Mcp ## Overview Guidance for OfficeBench evaluation tasks. Use the registered mcp_officebench_* tools to inspect and edit OfficeBench files, spreadsheets, documents, emails, calendars, PDFs, and answer files.",
"summary_hash": "c8702c29954060ae65ca49e5c1a0fbfcd68c40e0522c64d75c7bb3f8c705ee66",
"tool_hints": [
"mcp_officebench_excel_read_file",
"mcp_officebench_excel_set_cell",
@ -77,4 +77,3 @@
],
"version": "v0001"
}

View File

@ -5,9 +5,17 @@
"display_name": "outlook-mail",
"lineage": [],
"name": "outlook-mail",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["outlook", "email", "calendar", "mcp", "microsoft"],
"tags": [
"outlook",
"email",
"calendar",
"mcp",
"microsoft"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}
}

View File

@ -19,35 +19,71 @@ tools:
- mcp_outlook_mcp_calendar_delta_sync
---
# Outlook MCP — 邮件与日历管理
# Outlook Mail
## Overview
通过 Outlook MCP 进行邮件收发、日历管理和会议安排。支持 Graph API 和 on-prem Exchange。
## When to Use
- Use when the task requires Outlook Mail guidance.
## Required Tools
- `mcp_outlook_mcp_mail_list_folders`
- `mcp_outlook_mcp_mail_list_messages`
- `mcp_outlook_mcp_mail_search_messages`
- `mcp_outlook_mcp_mail_get_message`
- `mcp_outlook_mcp_mail_send_email`
- `mcp_outlook_mcp_mail_reply_to_message`
- `mcp_outlook_mcp_mail_forward_message`
- `mcp_outlook_mcp_mail_move_message`
- `mcp_outlook_mcp_mail_delta_sync`
- `mcp_outlook_mcp_calendar_list_events`
- `mcp_outlook_mcp_calendar_create_event`
- `mcp_outlook_mcp_calendar_update_event`
- `mcp_outlook_mcp_calendar_get_schedule`
- `mcp_outlook_mcp_calendar_find_meeting_times`
- `mcp_outlook_mcp_calendar_delta_sync`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Outlook MCP — 邮件与日历管理
通过 MCP server 连接 Outlook(Microsoft Graph / on-prem Exchange),提供邮件和日历的完整操作能力。
## 邮件工具
#### 邮件工具
### mcp_outlook_mcp_mail_list_folders
##### mcp_outlook_mcp_mail_list_folders
列出 Outlook 邮件文件夹。
- `top` (int, 默认 50): 返回数量上限
### mcp_outlook_mcp_mail_list_messages
##### mcp_outlook_mcp_mail_list_messages
列出指定文件夹的邮件。
- `folder` (str, 默认 "inbox"): 文件夹名
- `top` (int, 默认 20): 返回条数
- `skip` (int, 默认 0): 跳过的条数
- `unread_only` (bool, 默认 false): 仅未读
### mcp_outlook_mcp_mail_search_messages
##### mcp_outlook_mcp_mail_search_messages
搜索邮件(使用 Graph search 语义)。
- `query` (str): 搜索关键词
- `folder` (str | None): 限定文件夹
- `top` (int, 默认 20): 返回条数
### mcp_outlook_mcp_mail_get_message
##### mcp_outlook_mcp_mail_get_message
读取单封邮件的完整内容。
- `message_id` (str): 邮件 ID
- `changekey` (str | None): EWS changekey(on-prem 需要)
### mcp_outlook_mcp_mail_send_email
##### mcp_outlook_mcp_mail_send_email
发送新邮件。**幂等操作**,支持 idempotency_key。
- `subject` (str): 主题
- `body` (str): 正文(支持 HTML)
@ -56,14 +92,14 @@ tools:
- `bcc_recipients` (list[str] | None): 密送
- `idempotency_key` (str | None): 幂等键,防止重复发送
### mcp_outlook_mcp_mail_reply_to_message
##### mcp_outlook_mcp_mail_reply_to_message
回复一封邮件。
- `message_id` (str): 原邮件 ID
- `comment` (str): 回复内容
- `changekey` (str | None): EWS changekey
- `idempotency_key` (str | None)
### mcp_outlook_mcp_mail_forward_message
##### mcp_outlook_mcp_mail_forward_message
转发邮件给其他人。
- `message_id` (str): 原邮件 ID
- `to_recipients` (list[str]): 转发目标
@ -72,30 +108,30 @@ tools:
- `changekey` (str | None)
- `idempotency_key` (str | None)
### mcp_outlook_mcp_mail_move_message
##### mcp_outlook_mcp_mail_move_message
移动邮件到其他文件夹。
- `message_id` (str): 邮件 ID
- `destination_folder` (str): 目标文件夹
- `changekey` (str | None)
- `idempotency_key` (str | None)
### mcp_outlook_mcp_mail_delta_sync
##### mcp_outlook_mcp_mail_delta_sync
增量同步邮件变更。支持游标持久化,适合长期同步场景。
- `folder` (str, 默认 "inbox"): 文件夹
- `delta_link` (str | None): 增量链接(续传时提供)
- `top` (int, 默认 50)
- `persist_cursor` (bool, 默认 true): 是否持久化游标
## 日历工具
#### 日历工具
### mcp_outlook_mcp_calendar_list_events
##### mcp_outlook_mcp_calendar_list_events
列出日历事件或日历视图。
- `start_time` (str | None): ISO 开始时间,与 end_time 成对提供
- `end_time` (str | None): ISO 结束时间
- `top` (int, 默认 20)
- `skip` (int, 默认 0)
### mcp_outlook_mcp_calendar_create_event
##### mcp_outlook_mcp_calendar_create_event
创建日历事件或正式会议邀请。**幂等操作**。
- `subject` (str): 主题
- `start_time` (str): ISO 开始时间
@ -109,13 +145,13 @@ tools:
- `transaction_id` (str | None): 事务 ID
- `idempotency_key` (str | None)
### mcp_outlook_mcp_calendar_update_event
##### mcp_outlook_mcp_calendar_update_event
更新已有日历事件。
- `event_id` (str): 事件 ID
- `subject` / `start_time` / `end_time` / `timezone` / `body` / `location` / `attendees`: 可选更新字段
- `idempotency_key` (str | None)
### mcp_outlook_mcp_calendar_get_schedule
##### mcp_outlook_mcp_calendar_get_schedule
查询与会人忙闲状态。
- `schedules` (list[str]): 要查询的人员列表
- `start_time` (str): ISO 开始
@ -123,7 +159,7 @@ tools:
- `availability_view_interval` (int, 默认 30): 时间间隔(分钟)
- `timezone` (str, 默认 "UTC")
### mcp_outlook_mcp_calendar_find_meeting_times
##### mcp_outlook_mcp_calendar_find_meeting_times
推荐最佳会议时间。
- `attendees` (list[str]): 参会人
- `start_time` (str): 时间范围开始
@ -132,7 +168,7 @@ tools:
- `timezone` (str, 默认 "UTC")
- `max_candidates` (int, 默认 10): 候选数
### mcp_outlook_mcp_calendar_delta_sync
##### mcp_outlook_mcp_calendar_delta_sync
增量同步日历事件变更。
- `start_time` (str): 同步窗口开始
- `end_time` (str): 同步窗口结束
@ -141,10 +177,25 @@ tools:
- `persist_cursor` (bool, 默认 true)
- `cursor_key` (str, 默认 "calendar:primary")
## 使用原则
#### 使用原则
1. 邮件操作优先使用幂等键(idempotency_key)防止重复发送
2. 日历时间参数统一使用 ISO 8601 格式
3. 增量同步时优先使用返回的 delta_link 续传,避免全量拉取
4. 发送邮件前确认收件人地址格式正确
5. 创建会议时明确时区,避免跨时区混淆
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,28 @@
{
"change_reason": "Initial skill for Outlook MCP mail and calendar operations",
"content_hash": "placeholder",
"content_hash": "b63cb304dccb498387044c36d257a32cbf84ebe34ed003df209d7094f93f7599",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "通过 Outlook MCP 进行邮件收发、日历管理和会议安排。支持 Graph API 和 on-prem Exchange。",
"name": "outlook-mail",
"tools": ["mcp_outlook_mcp_mail_list_folders", "mcp_outlook_mcp_mail_list_messages", "mcp_outlook_mcp_mail_search_messages", "mcp_outlook_mcp_mail_get_message", "mcp_outlook_mcp_mail_send_email", "mcp_outlook_mcp_mail_reply_to_message", "mcp_outlook_mcp_mail_forward_message", "mcp_outlook_mcp_mail_move_message", "mcp_outlook_mcp_mail_delta_sync", "mcp_outlook_mcp_calendar_list_events", "mcp_outlook_mcp_calendar_create_event", "mcp_outlook_mcp_calendar_update_event", "mcp_outlook_mcp_calendar_get_schedule", "mcp_outlook_mcp_calendar_find_meeting_times", "mcp_outlook_mcp_calendar_delta_sync"]
"tools": [
"mcp_outlook_mcp_mail_list_folders",
"mcp_outlook_mcp_mail_list_messages",
"mcp_outlook_mcp_mail_search_messages",
"mcp_outlook_mcp_mail_get_message",
"mcp_outlook_mcp_mail_send_email",
"mcp_outlook_mcp_mail_reply_to_message",
"mcp_outlook_mcp_mail_forward_message",
"mcp_outlook_mcp_mail_move_message",
"mcp_outlook_mcp_mail_delta_sync",
"mcp_outlook_mcp_calendar_list_events",
"mcp_outlook_mcp_calendar_create_event",
"mcp_outlook_mcp_calendar_update_event",
"mcp_outlook_mcp_calendar_get_schedule",
"mcp_outlook_mcp_calendar_find_meeting_times",
"mcp_outlook_mcp_calendar_delta_sync"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +31,24 @@
},
"review_state": "published",
"skill_name": "outlook-mail",
"summary": "Outlook MCP — 邮件与日历管理。通过 MCP server 连接 Outlook提供邮件和日历的完整操作能力。",
"summary_hash": "placeholder",
"tool_hints": ["mcp_outlook_mcp_mail_list_folders", "mcp_outlook_mcp_mail_list_messages", "mcp_outlook_mcp_mail_search_messages", "mcp_outlook_mcp_mail_get_message", "mcp_outlook_mcp_mail_send_email", "mcp_outlook_mcp_mail_reply_to_message", "mcp_outlook_mcp_mail_forward_message", "mcp_outlook_mcp_mail_move_message", "mcp_outlook_mcp_mail_delta_sync", "mcp_outlook_mcp_calendar_list_events", "mcp_outlook_mcp_calendar_create_event", "mcp_outlook_mcp_calendar_update_event", "mcp_outlook_mcp_calendar_get_schedule", "mcp_outlook_mcp_calendar_find_meeting_times", "mcp_outlook_mcp_calendar_delta_sync"],
"summary": "# Outlook Mail ## Overview 通过 Outlook MCP 进行邮件收发、日历管理和会议安排。支持 Graph API 和 on-prem Exchange。",
"summary_hash": "b4c9b010447a1df9fe4196f9e1af7c962529445382cfed8d17b3796afc79a6bb",
"tool_hints": [
"mcp_outlook_mcp_mail_list_folders",
"mcp_outlook_mcp_mail_list_messages",
"mcp_outlook_mcp_mail_search_messages",
"mcp_outlook_mcp_mail_get_message",
"mcp_outlook_mcp_mail_send_email",
"mcp_outlook_mcp_mail_reply_to_message",
"mcp_outlook_mcp_mail_forward_message",
"mcp_outlook_mcp_mail_move_message",
"mcp_outlook_mcp_mail_delta_sync",
"mcp_outlook_mcp_calendar_list_events",
"mcp_outlook_mcp_calendar_create_event",
"mcp_outlook_mcp_calendar_update_event",
"mcp_outlook_mcp_calendar_get_schedule",
"mcp_outlook_mcp_calendar_find_meeting_times",
"mcp_outlook_mcp_calendar_delta_sync"
],
"version": "v0001"
}

View File

@ -5,9 +5,15 @@
"display_name": "skills-admin",
"lineage": [],
"name": "skills-admin",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["skills", "admin", "inspection"],
"tags": [
"skills",
"admin",
"inspection"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}

View File

@ -6,27 +6,65 @@ tools:
- skill_view
---
# Skills Admin — 技能查看
# Skills Admin
## Overview
技能(Skill)列表查看和内容加载。用于浏览已发布技能、读取技能正文和支持文件。
## When to Use
- Use when the task requires Skills Admin guidance.
## Required Tools
- `skills_list`
- `skill_view`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Skills Admin — 技能查看
查看已发布的技能列表并加载技能详情。
## 工具说明
#### 工具说明
### skills_list
##### skills_list
列出系统中所有可用技能及其描述。
- 返回技能名称、描述和版本
- 用于浏览当前可用的技能
### skill_view
##### skill_view
加载某个技能的完整正文或支持文件。
- `name` (str): 技能名称
- `file_path` (str | None): 可选的支持文件路径
- 不传文件路径时返回 SKILL.md 主内容
- 支持按需加载 references/、templates/ 等目录
## 使用原则
#### 使用原则
1. 需要参考某个技能的详细内容时,先 `skills_list` 找到名称,再用 `skill_view` 加载
2. 用户问“你有哪些技能”时,优先使用 `skills_list` 获取当前可见技能
3. 用户问某个技能如何工作时,用 `skill_view` 读取正文或支持文件
4. 这个默认技能不创建草稿;技能创作能力属于单独的 authoring/admin skill
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,15 @@
{
"change_reason": "Initial skill for skills inspection",
"content_hash": "placeholder",
"content_hash": "62238f16c6fe63d178a8557f391fc1f6f424d5f64eb940eb32c8ba73f8c77a05",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "技能Skill列表查看和内容加载。用于浏览已发布技能、读取技能正文和支持文件。",
"name": "skills-admin",
"tools": ["skills_list", "skill_view"]
"tools": [
"skills_list",
"skill_view"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +18,11 @@
},
"review_state": "published",
"skill_name": "skills-admin",
"summary": "Skills Admin — 技能列表查看和内容加载",
"summary_hash": "placeholder",
"tool_hints": ["skills_list", "skill_view"],
"summary": "# Skills Admin ## Overview 技能Skill列表查看和内容加载。用于浏览已发布技能、读取技能正文和支持文件。",
"summary_hash": "f7b43e2ab596c025cfc9396f3f5d82eaaec1d36daf0c5be97ce46afb046b16a2",
"tool_hints": [
"skills_list",
"skill_view"
],
"version": "v0001"
}

View File

@ -5,9 +5,16 @@
"display_name": "skills-authoring-admin",
"lineage": [],
"name": "skills-authoring-admin",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "disabled",
"tags": ["skills", "admin", "authoring", "draft"],
"tags": [
"skills",
"admin",
"authoring",
"draft"
],
"updated_at": "2026-06-04T00:00:00.000000+00:00"
}

View File

@ -5,13 +5,35 @@ tools:
- skill_manage
---
# Skills Authoring Admin — 技能草稿创建
# Skills Authoring Admin
## Overview
技能草稿创建管理。用于显式创建新 Skill draft,默认不向普通 Agent 暴露。
## When to Use
- Use when the task requires Skills Authoring Admin guidance.
## Required Tools
- `skill_manage`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Skills Authoring Admin — 技能草稿创建
创建新的技能草稿。这个能力用于管理员、开发者或受控的技能创作流程,不属于默认初始 Agent 能力。
## 工具说明
#### 工具说明
### skill_manage
##### skill_manage
创建新技能草稿(draft)。
- `action` (str): 仅支持 "create_draft"
- `name` (str): 技能名称
@ -19,10 +41,25 @@ tools:
- `content` (str): 技能正文(SKILL.md 格式)
- 创建的草稿需经过 review → publish 流程
## 使用原则
#### 使用原则
1. 只有用户明确要求创建或沉淀一个 Skill 时才使用
2. 创建草稿前确认 skill 名称、触发场景、工具依赖和正文边界
3. 技能正文使用标准 frontmatter + Markdown 格式
4. Draft 创建后必须经过 review → publish 流程才能生效
5. 自学习候选生成草稿不依赖这个 tool,自学习流程走 SkillLearningPipelineService
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,14 @@
{
"change_reason": "Split skill draft authoring out of default skills admin",
"content_hash": "placeholder",
"content_hash": "6dfc5011e61cdc4cdf5a5c6f3c91b3a6b815f2a94df643cb367c0fa9c4176ec3",
"created_at": "2026-06-04T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "技能草稿创建管理。用于显式创建新 Skill draft默认不向普通 Agent 暴露。",
"name": "skills-authoring-admin",
"tools": ["skill_manage"]
"tools": [
"skill_manage"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +17,10 @@
},
"review_state": "disabled",
"skill_name": "skills-authoring-admin",
"summary": "Skills Authoring Admin — 技能草稿创建",
"summary_hash": "placeholder",
"tool_hints": ["skill_manage"],
"summary": "# Skills Authoring Admin ## Overview 技能草稿创建管理。用于显式创建新 Skill draft默认不向普通 Agent 暴露。",
"summary_hash": "6ec2f68be143cbebb24b1958e298f2a0b05c6749541025d131f0da9c1be30a65",
"tool_hints": [
"skill_manage"
],
"version": "v0001"
}

View File

@ -5,9 +5,17 @@
"display_name": "terminal-operation",
"lineage": [],
"name": "terminal-operation",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["terminal", "shell", "command", "process", "execution"],
"tags": [
"terminal",
"shell",
"command",
"process",
"execution"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}
}

View File

@ -7,13 +7,37 @@ tools:
- execute_code
---
# Terminal Operation — 终端与进程管理
# Terminal Operation
## Overview
Shell 命令执行、后台进程管理和 Python 代码执行。支持超时控制和后台运行。
## When to Use
- Use when the task requires Terminal Operation guidance.
## Required Tools
- `terminal`
- `process`
- `execute_code`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Terminal Operation — 终端与进程管理
Shell 命令执行、后台进程管理和 Python 代码执行工具集。
## 工具说明
#### 工具说明
### terminal
##### terminal
执行 shell 命令。
- `command` (str): 要执行的命令
- `working_dir` (str, 默认 "."): 工作目录
@ -21,7 +45,7 @@ Shell 命令执行、后台进程管理和 Python 代码执行工具集。
- `background` (bool, 默认 false): 是否后台运行
- 后台运行时返回 process_id,可通过 process 工具管理
### process
##### process
管理后台进程。
- `action` (str): `list` | `log` | `kill`
- `process_id` (str | None): 进程 ID
@ -29,7 +53,7 @@ Shell 命令执行、后台进程管理和 Python 代码执行工具集。
- `log`: 查看进程日志(最后 12000 字节)
- `kill`: 终止进程(先 SIGTERM,5 秒后 SIGKILL)
### execute_code
##### execute_code
执行 Python 代码片段。
- `code` (str): Python 代码
- `language` (str, 默认 "python"): 仅支持 python
@ -37,10 +61,25 @@ Shell 命令执行、后台进程管理和 Python 代码执行工具集。
- `working_dir` (str, 默认 "."): 工作目录
- 适合快速验证脚本逻辑,不适合长期运行任务
## 使用原则
#### 使用原则
1. 长期运行任务使用 `background=true`
2. 执行危险命令(rm -rf、dd、格式化等)前必须确认用户意图
3. `execute_code` 适合轻量脚本验证,重型任务用 `terminal`
4. 后台进程用完后及时 kill 清理
5. 注意命令注入风险,不要直接拼接用户输入
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,16 @@
{
"change_reason": "Initial skill for terminal and process management",
"content_hash": "placeholder",
"content_hash": "2d122feb0963e072faa627ca644fff0b39aa7ff3a6a502f8b313bb26d7aee154",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "Shell 命令执行、后台进程管理和 Python 代码执行。支持超时控制和后台运行。",
"name": "terminal-operation",
"tools": ["terminal", "process", "execute_code"]
"tools": [
"terminal",
"process",
"execute_code"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +19,12 @@
},
"review_state": "published",
"skill_name": "terminal-operation",
"summary": "Terminal Operation Shell 命令执行、后台进程管理Python 代码执行",
"summary_hash": "placeholder",
"tool_hints": ["terminal", "process", "execute_code"],
"summary": "# Terminal Operation ## Overview Shell 命令执行、后台进程管理Python 代码执行。支持超时控制和后台运行。",
"summary_hash": "8571fa76cc5e5aa682bd9503d45e91e4f111e6ef9d64152a69efa0462ae04294",
"tool_hints": [
"terminal",
"process",
"execute_code"
],
"version": "v0001"
}
}

View File

@ -1,13 +1,21 @@
{
"created_at": "2026-05-26T00:00:00.000000+00:00",
"current_version": "v0001",
"description": "辅助工具集包括任务分解Todo、任务委托Delegate、子 Agent 生成Spawn、消息发送和需求澄清Clarify。",
"description": "辅助工具集包括任务分解Todo、任务委托Delegate、子 Agent 生成Spawn、消息发送和需求澄清。",
"display_name": "utility-tools",
"lineage": [],
"name": "utility-tools",
"owners": ["system"],
"owners": [
"system"
],
"source_kind": "initial",
"status": "active",
"tags": ["utility", "delegate", "todo", "spawn", "clarify"],
"tags": [
"utility",
"delegate",
"todo",
"spawn",
"clarify"
],
"updated_at": "2026-05-26T00:00:00.000000+00:00"
}
}

View File

@ -9,44 +9,85 @@ tools:
- todo
---
# Utility Tools — 辅助工具集
# Utility Tools
## Overview
辅助工具集包括任务分解Todo、任务委托Delegate、子 Agent 生成Spawn、消息发送和需求澄清。
## When to Use
- Use when the task requires Utility Tools guidance.
## Required Tools
- `clarify`
- `delegate`
- `send_message`
- `spawn`
- `todo`
## Workflow
- Identify whether the user's request matches the skill's trigger conditions.
- Read the relevant source guidance below and apply only the steps that fit the current task.
- Use the required tools deliberately and keep tool output tied to the user's goal.
### Source Guidance
### Utility Tools — 辅助工具集
任务管理、委托和协作的辅助工具。
## 工具说明
#### 工具说明
### todo (TodoWrite)
##### todo (TodoWrite)
创建和管理任务列表,跟踪复杂任务的进度。
- 适合多步骤、复杂任务时使用
- 标记当前正在进行的任务
- 完成后立即更新状态
### delegate (DelegateTool)
##### delegate (DelegateTool)
将任务委托给专门的子 Agent 执行。
- 适合独立、可并行的工作
- 委托时提供清晰的上下文和目标
- 子 Agent 完成后再整合结果
### spawn (SpawnTool)
##### spawn (SpawnTool)
启动新的 Agent 实例执行特定任务。
- 适合需要独立运行的工作
- 支持后台运行(不阻塞主流程)
### send_message (SendMessageTool)
##### send_message (SendMessageTool)
与其他 Agent 或团队成员通信。
- 适合多 Agent 协作场景
- 消息会直接送达目标
### clarify (ClarifyTool)
##### clarify (ClarifyTool)
当需求不明确时向用户提问澄清。
- 提供 2-4 个选项供用户选择
- 附带推荐选项和理由
- 避免模糊提问,给出明确建议
## 使用原则
#### 使用原则
1. 复杂任务先创建 Todo 列表,明确步骤
2. 可并行的工作使用 Delegate/Spawn 分散执行
3. 需求不明确时主动 Clarify,不要猜测
4. 多 Agent 协作时保持通信简洁
5. 记得到 todo list 更新进度
## Validation
- Verify the requested outcome with the most direct available check.
- Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.
## Boundaries
- Do not broaden the task beyond the user's request.
- Do not use tools that are not listed or clearly available in the current runtime.
## Anti-Patterns
- Do not summarize the skill instead of applying it.
- Do not claim completion without validation evidence.

View File

@ -1,12 +1,18 @@
{
"change_reason": "Initial skill for utility and delegation tools",
"content_hash": "placeholder",
"content_hash": "1f3f6db4ad2844ba1587531a17b2e044e11742c20d7d0bc5efdc2358f9c27b9b",
"created_at": "2026-05-26T00:00:00.000000+00:00",
"created_by": "system",
"frontmatter": {
"description": "辅助工具集包括任务分解Todo、任务委托Delegate、子 Agent 生成Spawn、消息发送和需求澄清。",
"name": "utility-tools",
"tools": ["clarify", "delegate", "send_message", "spawn", "todo"]
"tools": [
"clarify",
"delegate",
"send_message",
"spawn",
"todo"
]
},
"parent_version": null,
"provenance": {
@ -15,8 +21,14 @@
},
"review_state": "published",
"skill_name": "utility-tools",
"summary": "Utility Tools — 任务管理、委托和协作辅助工具集",
"summary_hash": "placeholder",
"tool_hints": ["clarify", "delegate", "send_message", "spawn", "todo"],
"summary": "# Utility Tools ## Overview 辅助工具集包括任务分解Todo、任务委托Delegate、子 Agent 生成Spawn、消息发送和需求澄清。",
"summary_hash": "7c24c7da7f8d53bc57475f177fb1aea3c33b0d012baa578d6438befee4db2045",
"tool_hints": [
"clarify",
"delegate",
"send_message",
"spawn",
"todo"
],
"version": "v0001"
}
}