feat(app): 移除内置agents并添加CORS支持和技能上传优化

移除了agents/registry.json中的所有内置agents配置,将agents数组清空。
为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。
重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。
新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。
更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。
修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
2026-06-12 13:25:20 +08:00
parent fc9fd93c36
commit 8aeb97a5fc
76 changed files with 3382 additions and 553 deletions

View File

@ -7,6 +7,7 @@ import asyncio
import io
import mimetypes
import os
import re
import secrets
import shutil
import time
@ -49,9 +50,11 @@ from beaver.services.user_file_resolver import (
UserFileStorageResolver,
build_file_auth_context,
)
from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
from beaver.skills.authoring.format import parse_skill_rewrite_json
from beaver.skills.learning import SkillLearningService, SkillLearningWorker, SkillLearningWorkerConfig
from beaver.skills.learning.replay import ReplayRunner
from beaver.skills.catalog.utils import parse_frontmatter
from beaver.skills.catalog.utils import extract_required_tool_names, parse_frontmatter
from .deps import get_agent_service
from .files import (
@ -96,8 +99,11 @@ from .schemas import (
try:
from fastapi import FastAPI, File, Form, Header, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response
except ModuleNotFoundError: # pragma: no cover - fallback for skeleton-only environments
CORSMiddleware = None # type: ignore[assignment]
def File(default: Any = None) -> Any: # type: ignore[override]
return default
@ -274,6 +280,7 @@ async def _app_lifespan(
worker = SkillLearningWorker(
pipeline=loaded.skill_learning_pipeline, # type: ignore[arg-type]
provider_bundle_factory=lambda: attached_service._make_provider_bundle_for_task(loaded, {}), # noqa: SLF001
replay_runner_factory=lambda: ReplayRunner(agent_loop=attached_service.create_loop()),
config=worker_config,
)
worker_task = asyncio.create_task(worker.run_forever())
@ -516,6 +523,20 @@ def _self_restart_enabled() -> bool:
return os.getenv("BEAVER_ENABLE_SELF_RESTART", "1").strip() not in {"0", "false", "False"}
def _cors_allow_origins() -> list[str]:
raw = os.getenv("BEAVER_CORS_ALLOW_ORIGINS", "").strip()
if raw:
return [origin.strip().rstrip("/") for origin in raw.split(",") if origin.strip()]
return [
"http://127.0.0.1:3000",
"http://localhost:3000",
"http://127.0.0.1:3080",
"http://localhost:3080",
"http://127.0.0.1:3081",
"http://localhost:3081",
]
def _schedule_self_restart(delay_seconds: float = 0.75) -> None:
import threading
@ -556,6 +577,14 @@ def create_app(
shutdown_force=shutdown_force,
),
)
if CORSMiddleware is not None:
app.add_middleware(
CORSMiddleware,
allow_origins=_cors_allow_origins(),
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.state.auth_tokens = {}
app.state.handoff_codes = {}
app.state.auth_file = Path(os.getenv("BEAVER_AUTH_FILE") or "")
@ -1992,13 +2021,19 @@ def create_app(
filename = file.filename or ""
if not filename.endswith(".zip"):
raise HTTPException(status_code=400, detail="File must be a .zip archive")
loaded = get_agent_service(request).create_loop().boot()
agent_service = get_agent_service(request)
loaded = agent_service.create_loop().boot()
try:
content = await file.read()
draft = _create_skill_upload_draft(loaded, filename, content)
draft_payload = _create_skill_upload_draft(loaded, filename, content)
draft = loaded.draft_service.get_draft(draft_payload["skill_name"], draft_payload["draft_id"])
if draft is not None:
await _rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename=filename)
draft = loaded.draft_service.get_draft(draft.skill_name, draft.draft_id) or draft
draft_payload = draft.to_dict()
except ValueError as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
return draft
return draft_payload
@app.get("/api/marketplaces/skills/search")
async def search_skillhub(
@ -2068,13 +2103,17 @@ def create_app(
@app.get("/api/skills/candidates")
async def list_skill_candidates(request: Request, status: str | None = None) -> list[dict[str, Any]]:
loaded = get_agent_service(request).create_loop().boot()
return [item.to_dict() for item in loaded.skill_learning_pipeline.list_candidates(status=status)] # type: ignore[union-attr]
return [
_skill_learning_candidate_payload(loaded, item)
for item in loaded.skill_learning_pipeline.list_candidates(status=status) # type: ignore[union-attr]
]
@app.get("/api/skills/candidates/{candidate_id}")
async def get_skill_candidate(candidate_id: str, request: Request) -> dict[str, Any]:
loaded = get_agent_service(request).create_loop().boot()
try:
return loaded.skill_learning_pipeline.get_candidate(candidate_id).to_dict() # type: ignore[union-attr]
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
return _skill_learning_candidate_payload(loaded, candidate)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
@ -2087,25 +2126,19 @@ def create_app(
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
if candidate.draft_skill_name and candidate.draft_id:
try:
return _skill_draft_payload(loaded, candidate.draft_skill_name, candidate.draft_id)
loaded.skill_learning_pipeline.get_draft(candidate.draft_skill_name, candidate.draft_id) # type: ignore[union-attr]
except ValueError:
pass
else:
return _skill_draft_payload(loaded, candidate.draft_skill_name, candidate.draft_id)
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
draft = await loaded.skill_learning_pipeline.synthesize_draft( # type: ignore[union-attr]
candidate_id,
provider_bundle=provider_bundle,
)
loaded.skill_learning_pipeline.check_safety(draft.skill_name, draft.draft_id) # type: ignore[union-attr]
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
draft.skill_name,
draft.draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
return draft.to_dict()
return _skill_draft_payload(loaded, draft.skill_name, draft.draft_id)
@app.post("/api/skills/candidates/{candidate_id}/regenerate")
async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
@ -2118,17 +2151,9 @@ def create_app(
candidate_id,
provider_bundle=provider_bundle,
)
loaded.skill_learning_pipeline.check_safety(draft.skill_name, draft.draft_id) # type: ignore[union-attr]
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
draft.skill_name,
draft.draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
return draft.to_dict()
return _skill_draft_payload(loaded, draft.skill_name, draft.draft_id)
@app.post("/api/skills/learning/run-once")
async def run_skill_learning_once(request: Request) -> dict[str, Any]:
@ -2185,17 +2210,31 @@ def create_app(
@app.post("/api/skills/{skill_name}/drafts/{draft_id}/submit")
async def submit_skill_draft(skill_name: str, draft_id: str, request: Request, payload: dict[str, Any] | None = None) -> dict[str, Any]:
loaded = get_agent_service(request).create_loop().boot()
agent_service = get_agent_service(request)
loop = agent_service.create_loop()
loaded = loop.boot()
try:
review = loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
skill_name,
draft_id,
requested_by=str((payload or {}).get("requested_by") or "web"),
notes=str((payload or {}).get("notes") or ""),
)
safety = loaded.skill_learning_pipeline.check_safety(skill_name, draft_id) # type: ignore[union-attr]
if safety.passed and safety.risk_level != "critical":
loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
skill_name,
draft_id,
requested_by=str((payload or {}).get("requested_by") or "web"),
notes=str((payload or {}).get("notes") or ""),
)
candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id)
if candidate_id is not None:
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
candidate_id,
skill_name,
draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise _skill_draft_http_error(exc) from exc
return review.to_dict()
return _skill_draft_payload(loaded, skill_name, draft_id)
@app.post("/api/skills/{skill_name}/drafts/{draft_id}/approve")
async def approve_skill_draft(skill_name: str, draft_id: str, request: Request, payload: dict[str, Any] | None = None) -> dict[str, Any]:
@ -2719,47 +2758,70 @@ def _create_skill_upload_draft(loaded: Any, filename: str, content: bytes) -> di
if not file_infos:
raise ValueError("Zip archive is empty")
skill_entries = []
for info in file_infos:
parts = Path(info.filename.replace("\\", "/")).parts
if "__MACOSX" in parts or Path(info.filename).name == ".DS_Store":
continue
if info.filename.replace("\\", "/").startswith("/") or any(part in {"", ".", ".."} for part in parts):
raise ValueError(f"Unsafe archive entry: {info.filename}")
if parts[-1] == "SKILL.md":
if len(parts) not in (1, 2):
raise ValueError("SKILL.md must be at root or inside one top-level directory")
skill_entries.append(info.filename)
if not skill_entries:
raise ValueError("Zip must contain SKILL.md")
skill_entry = skill_entries[0]
top = Path(skill_entry).parts[0] if len(Path(skill_entry).parts) == 2 else ""
raw_skill = archive.read(skill_entry).decode("utf-8", errors="replace")
frontmatter, body = parse_frontmatter(raw_skill)
skill_name = str(frontmatter.get("name") or top or Path(filename).stem).strip().replace(" ", "-")
if not skill_name or "/" in skill_name or "\\" in skill_name or skill_name in {".", ".."}:
raise ValueError("Could not determine a safe skill name")
files: list[tuple[str, bytes]] = []
safe_entries: list[tuple[Any, str, tuple[str, ...]]] = []
for info in file_infos:
raw = info.filename.replace("\\", "/")
parts = Path(raw).parts
if "__MACOSX" in parts or Path(raw).name == ".DS_Store":
continue
if raw.startswith("/"):
if raw.startswith("/") or any(part in {"", ".", ".."} for part in parts):
raise ValueError(f"Unsafe archive entry: {info.filename}")
if top and parts and parts[0] != top:
raise ValueError("Zip archive must contain a single top-level skill directory")
rel_parts = parts[1:] if top and parts and parts[0] == top else parts
safe_entries.append((info, raw, tuple(parts)))
if _is_skill_markdown_entry(parts[-1]):
skill_entries.append(raw)
if not skill_entries:
raise ValueError("Zip must contain SKILL.md")
if len(skill_entries) > 1:
raise ValueError("Zip must contain exactly one SKILL.md")
skill_entry = skill_entries[0]
skill_root = tuple(Path(skill_entry).parts[:-1])
raw_skill = archive.read(skill_entry).decode("utf-8", errors="replace")
frontmatter, body = parse_frontmatter(raw_skill)
skill_name = str(frontmatter.get("name") or (skill_root[-1] if skill_root else "") or Path(filename).stem).strip().replace(" ", "-")
if not skill_name or "/" in skill_name or "\\" in skill_name or skill_name in {".", ".."}:
raise ValueError("Could not determine a safe skill name")
proposed_frontmatter = normalize_skill_frontmatter(
{
**dict(frontmatter),
"name": skill_name,
"description": frontmatter.get("description") or skill_name,
},
skill_name=skill_name,
)
proposed_frontmatter["tools"] = _merge_tool_names(
proposed_frontmatter.get("tools"),
extract_required_tool_names(body),
_infer_uploaded_skill_tools(
skill_name=skill_name,
filename=filename,
frontmatter=proposed_frontmatter,
content=body,
loaded=loaded,
),
)
proposed_content = ensure_canonical_skill_body(
body,
title=skill_name,
description=str(proposed_frontmatter.get("description") or ""),
tools=list(proposed_frontmatter.get("tools") or []),
)
files: list[tuple[str, bytes]] = []
for info, raw, parts in safe_entries:
if raw == skill_entry:
continue
if skill_root:
if parts[: len(skill_root)] != skill_root:
continue
rel_parts = parts[len(skill_root):]
else:
rel_parts = parts
if not rel_parts or any(part in {"", ".", ".."} for part in rel_parts):
raise ValueError(f"Unsafe archive entry: {info.filename}")
files.append(("/".join(rel_parts), archive.read(info)))
draft = loaded.draft_service.create_new_skill_draft(
skill_name=skill_name,
proposed_content=body,
proposed_frontmatter={
**dict(frontmatter),
"name": skill_name,
"description": frontmatter.get("description") or skill_name,
},
proposed_content=proposed_content,
proposed_frontmatter=proposed_frontmatter,
created_by="web-upload",
reason=f"Uploaded {filename}",
evidence_refs=[{"kind": "upload", "filename": filename, "files": sorted(path for path, _ in files)}],
@ -2784,6 +2846,162 @@ def _create_skill_upload_draft(loaded: Any, filename: str, content: bytes) -> di
return draft.to_dict()
def _is_skill_markdown_entry(filename: str) -> bool:
return filename.strip().lower() in {"skill.md", "skills.md"}
def _merge_tool_names(*groups: Any) -> list[str]:
result: list[str] = []
for group in groups:
if isinstance(group, str):
raw_items = group.split(",")
elif isinstance(group, (list, tuple, set)):
raw_items = list(group)
else:
raw_items = []
for item in raw_items:
cleaned = str(item).strip()
if cleaned and cleaned not in result:
result.append(cleaned)
return result
def _infer_uploaded_skill_tools(
    *,
    skill_name: str,
    filename: str,
    frontmatter: dict[str, Any],
    content: str,
    loaded: Any,
) -> list[str]:
    """Guess which runtime tools an uploaded skill depends on.

    The skill name, upload filename, JSON-serialized frontmatter, and body are
    lower-cased and searched in two passes:

    1. Every known tool name (the runtime registry's names, or the common
       fallback set when the registry yields nothing) that appears as a
       standalone identifier is reported, in sorted order.
    2. Keyword groups (weather, search, and URL vocabulary) add ``web_fetch``
       and/or ``web_search``, skipping tools the registry does not list.

    Returns the inferred tool names without duplicates.
    """
    known = _available_runtime_tool_names(loaded)
    haystack = "\n".join(
        (
            skill_name,
            filename,
            json.dumps(frontmatter, ensure_ascii=False, sort_keys=True),
            content,
        )
    ).lower()

    # Pass 1: literal mentions of a tool name, bounded by non-identifier characters.
    matched: list[str] = [
        candidate
        for candidate in sorted(known or _COMMON_RUNTIME_TOOL_NAMES)
        if re.search(rf"(?<![a-z0-9_]){re.escape(candidate.lower())}(?![a-z0-9_])", haystack)
    ]

    # Pass 2: topic keywords that imply web tools.
    keyword_rules = (
        (
            r"\b(weather|forecast|temperature|precipitation|rain|snow|humidity|wind|air quality|aqi)\b",
            ("web_fetch", "web_search"),
        ),
        (
            r"\b(latest|current|today|tomorrow|news|search|query|lookup|find online|web search)\b",
            ("web_search",),
        ),
        (
            r"\b(url|http|https|website|webpage|page|fetch|crawl|browser|online source)\b",
            ("web_fetch",),
        ),
    )
    for pattern, implied_tools in keyword_rules:
        if not re.search(pattern, haystack):
            continue
        for implied in implied_tools:
            # When the registry could be read, only suggest tools it actually lists.
            if known is not None and implied not in known:
                continue
            if implied not in matched:
                matched.append(implied)
    return matched
def _available_runtime_tool_names(loaded: Any) -> set[str] | None:
registry = getattr(loaded, "tool_registry", None)
if registry is None:
return None
try:
return {spec.name for spec in registry.list_specs()}
except Exception:
return None
# Fallback tool names used when the runtime tool registry is missing, fails,
# or reports no tools (see the ``available or _COMMON_RUNTIME_TOOL_NAMES``
# call sites in this module).
_COMMON_RUNTIME_TOOL_NAMES = {
    "web_fetch",
    "web_search",
    "read_file",
    "write_file",
    "patch_file",
    "search_files",
    "list_directory",
    "memory",
    "terminal",
    "process",
    "execute_code",
    "skill_view",
    "skills_list",
    "skill_manage",
    "cron",
}
async def _rewrite_uploaded_skill_draft_with_llm(agent_service: Any, loaded: Any, draft: Any, *, filename: str) -> None:
try:
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
provider = getattr(provider_bundle, "auxiliary_provider", None) or getattr(provider_bundle, "main_provider", None)
runtime = getattr(provider_bundle, "auxiliary_runtime", None) or getattr(provider_bundle, "main_runtime", None)
if provider is None:
return
available_tool_names = sorted(_available_runtime_tool_names(loaded) or _COMMON_RUNTIME_TOOL_NAMES)
response = await provider.chat(
messages=[
{
"role": "system",
"content": (
"You rewrite uploaded Beaver skills into the required house style. "
"Return only JSON with keys: frontmatter, content, change_reason. "
"Do not include markdown fences."
),
},
{
"role": "user",
"content": (
f"Uploaded filename: {filename}\n"
f"Skill name: {draft.skill_name}\n"
f"Current frontmatter:\n{json.dumps(draft.proposed_frontmatter, ensure_ascii=False, sort_keys=True)}\n\n"
f"Current content:\n{draft.proposed_content}\n\n"
f"Available runtime tool names:\n{json.dumps(available_tool_names, ensure_ascii=False)}\n\n"
f"{canonical_skill_format_instructions()}\n\n"
"Rewrite the skill so it is operational, concrete, and ready for review/publish. "
"Infer exact required runtime tools from the uploaded content when the workflow depends on tools. "
"Keep frontmatter.tools and the Required Tools section consistent."
),
},
],
tools=None,
model=getattr(runtime, "model", None),
max_tokens=4096,
temperature=0,
)
payload = parse_skill_rewrite_json(response.content or "", skill_name=draft.skill_name)
if payload is None:
return
payload["frontmatter"]["tools"] = _merge_tool_names(
payload["frontmatter"].get("tools"),
extract_required_tool_names(payload["content"]),
_infer_uploaded_skill_tools(
skill_name=draft.skill_name,
filename=filename,
frontmatter=payload["frontmatter"],
content=payload["content"],
loaded=loaded,
),
)
payload["content"] = ensure_canonical_skill_body(
payload["content"],
title=str(payload["frontmatter"].get("name") or draft.skill_name),
description=str(payload["frontmatter"].get("description") or ""),
tools=list(payload["frontmatter"].get("tools") or []),
)
draft.proposed_frontmatter = payload["frontmatter"]
draft.proposed_content = payload["content"]
if payload.get("change_reason"):
draft.reason = f"{draft.reason}; LLM rewrite: {payload['change_reason']}"
loaded.skill_spec_store.write_draft(draft)
except Exception:
return
def _debug_runs_for_session(session_manager: Any, session_id: str) -> list[dict[str, Any]]:
grouped: dict[str, list[Any]] = {}
run_order: list[str] = []
@ -3559,6 +3777,39 @@ def _skill_detail_payload(loaded: Any, name: str, version: str | None) -> dict[s
}
def _skill_learning_candidate_payload(loaded: Any, candidate: Any) -> dict[str, Any]:
    """Serialize a learning candidate and enrich its evidence with task text.

    Starts from ``candidate.to_dict()``. When a task text can be resolved, it
    is stored under ``evidence["task_text"]`` together with the derived
    ``evidence["theme"]``; for ``new_skill`` candidates the theme is also
    surfaced as ``evidence_summary``.
    """
    payload = candidate.to_dict()
    # Work on a copy of the evidence mapping rather than the one inside the payload.
    evidence = dict(payload.get("evidence") or {})
    task_text = _skill_learning_candidate_task_text(loaded, candidate)
    if task_text:
        evidence["task_text"] = task_text
        evidence["theme"] = SkillLearningService._task_theme(task_text)
        payload["evidence"] = evidence
        # NOTE(review): evidence["theme"] is only guaranteed to exist inside this
        # branch; the summary is assumed to be nested here - confirm against the file.
        if candidate.kind == "new_skill":
            payload["evidence_summary"] = f"Theme: {evidence['theme']}"
    return payload
def _skill_learning_candidate_task_text(loaded: Any, candidate: Any) -> str:
evidence = candidate.evidence if isinstance(candidate.evidence, dict) else {}
task_id = str(evidence.get("task_id") or "").strip()
source_run_ids = set(candidate.source_run_ids or [])
try:
run_store = loaded.skill_learning_pipeline.learning_service.run_store
runs = run_store.list_runs()
except Exception:
return str(evidence.get("task_text") or "").strip()
if task_id:
task_runs = [record for record in runs if record.task_id == task_id]
if task_runs:
return SkillLearningService._representative_task_text(task_runs)
source_runs = [record for record in runs if record.run_id in source_run_ids]
if source_runs:
return SkillLearningService._representative_task_text(source_runs)
return str(evidence.get("task_text") or "").strip()
def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include_reviews: bool = False) -> dict[str, Any]:
draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr]
safety = loaded.skill_learning_pipeline.get_safety_report(skill_name, draft_id) # type: ignore[union-attr]
@ -3567,6 +3818,8 @@ def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include
**draft.to_dict(),
"safety_report": safety.to_dict() if safety is not None else None,
"eval_report": eval_report.to_dict() if eval_report is not None else None,
"target_version": _skill_draft_target_version(loaded, draft.skill_name, draft.proposal_kind),
"base_skill": _skill_draft_base_skill_payload(loaded, draft),
}
if include_reviews:
payload["reviews"] = [
@ -3576,6 +3829,45 @@ def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include
return payload
def _skill_draft_base_skill_payload(loaded: Any, draft: Any) -> dict[str, Any] | None:
if draft.proposal_kind == "new_skill" or not draft.base_version:
return None
store = loaded.skill_learning_pipeline.publisher.store # type: ignore[union-attr]
loaded_version = store.read_published_skill(draft.skill_name, draft.base_version)
if loaded_version is None:
return None
version = loaded_version.version
return {
"skill_name": version.skill_name,
"version": version.version,
"frontmatter": dict(version.frontmatter),
"content": loaded_version.content,
"summary": version.summary,
"tool_hints": list(version.tool_hints),
}
def _skill_draft_target_version(loaded: Any, skill_name: str, proposal_kind: str) -> str | None:
if proposal_kind == "retire_skill":
return None
versions = [
item
for item in loaded.skill_learning_pipeline.publisher.store.list_versions(skill_name) # type: ignore[union-attr]
if isinstance(item, str) and item.startswith("v") and item[1:].isdigit()
]
if not versions:
return "v0001"
latest = max(int(item[1:]) for item in versions)
return f"v{latest + 1:04d}"
def _skill_learning_candidate_id_for_draft(loaded: Any, skill_name: str, draft_id: str) -> str | None:
for candidate in loaded.skill_learning_pipeline.list_candidates(): # type: ignore[union-attr]
if candidate.draft_skill_name == skill_name and candidate.draft_id == draft_id:
return candidate.candidate_id
return None
def _skill_versions_payload(loaded: Any, record: Any) -> list[dict[str, Any]]:
if record.source != "workspace":
return [

View File

@ -235,6 +235,12 @@ class SkillDraftEvalReport:
confidence: str = "low"
case_reports: list[dict[str, Any]] = field(default_factory=list)
tool_mode_summary: dict[str, Any] = field(default_factory=dict)
ability_score_summary: dict[str, Any] = field(default_factory=dict)
tool_execution_summary: dict[str, Any] = field(default_factory=dict)
case_selection_summary: dict[str, Any] = field(default_factory=dict)
real_score_avg: float | None = None
synthetic_score_avg: float | None = None
overall_score_avg: float | None = None
preservation_report: dict[str, Any] | None = None
def to_dict(self) -> dict[str, Any]:
@ -261,6 +267,12 @@ class SkillDraftEvalReport:
"confidence": self.confidence,
"case_reports": [dict(item) for item in self.case_reports],
"tool_mode_summary": dict(self.tool_mode_summary),
"ability_score_summary": dict(self.ability_score_summary),
"tool_execution_summary": dict(self.tool_execution_summary),
"case_selection_summary": dict(self.case_selection_summary),
"real_score_avg": self.real_score_avg,
"synthetic_score_avg": self.synthetic_score_avg,
"overall_score_avg": self.overall_score_avg,
"preservation_report": (
dict(self.preservation_report) if self.preservation_report is not None else None
),
@ -295,6 +307,12 @@ class SkillDraftEvalReport:
if isinstance(item, dict)
],
tool_mode_summary=dict(payload.get("tool_mode_summary") or {}),
ability_score_summary=dict(payload.get("ability_score_summary") or {}),
tool_execution_summary=dict(payload.get("tool_execution_summary") or {}),
case_selection_summary=dict(payload.get("case_selection_summary") or {}),
real_score_avg=_optional_bounded_float(payload.get("real_score_avg")),
synthetic_score_avg=_optional_bounded_float(payload.get("synthetic_score_avg")),
overall_score_avg=_optional_bounded_float(payload.get("overall_score_avg")),
preservation_report=(
dict(payload["preservation_report"])
if isinstance(payload.get("preservation_report"), dict)
@ -309,6 +327,12 @@ def _optional_str(value: Any) -> str | None:
return str(value)
def _optional_bounded_float(value: Any) -> float | None:
if value in (None, ""):
return None
return _bounded_float(value, default=0.0)
def _bounded_float(value: Any, *, default: float = 0.0) -> float:
if value in (None, ""):
return default

View File

@ -0,0 +1,19 @@
"""Skill authoring helpers."""
from .format import (
CANONICAL_SKILL_SECTION_HEADINGS,
canonical_skill_format_instructions,
canonicalize_skill_body,
ensure_canonical_skill_body,
is_canonical_skill_body,
normalize_skill_frontmatter,
)
__all__ = [
"CANONICAL_SKILL_SECTION_HEADINGS",
"canonical_skill_format_instructions",
"canonicalize_skill_body",
"ensure_canonical_skill_body",
"is_canonical_skill_body",
"normalize_skill_frontmatter",
]

View File

@ -0,0 +1,250 @@
"""Canonical Beaver skill authoring format."""
from __future__ import annotations
import json
import re
from typing import Any
from beaver.skills.catalog.utils import extract_required_tool_names
# H2 headings a canonical skill body must contain, in this exact order.
CANONICAL_SKILL_SECTION_HEADINGS: tuple[str, ...] = (
    "## Overview",
    "## When to Use",
    "## Required Tools",
    "## Workflow",
    "## Validation",
    "## Boundaries",
    "## Anti-Patterns",
)
def canonical_skill_format_instructions() -> str:
    """Return the prompt text that describes the canonical SKILL.md format.

    The text lists the frontmatter requirements and the required H2 sections
    (taken from ``CANONICAL_SKILL_SECTION_HEADINGS``) as a numbered list.
    """
    heading_bullets = [f"- {heading}" for heading in CANONICAL_SKILL_SECTION_HEADINGS]
    instruction_lines = [
        "Canonical Beaver SKILL.md format:",
        "1. Return a frontmatter object with `name`, `description`, and `tools`.",
        "2. `name` must be lowercase kebab-case. `description` must explain when the skill should be used.",
        "3. `tools` must be an explicit JSON array of exact runtime tool names. Use [] only if no tool is required.",
        "4. The Markdown content must start with one H1 title and include these H2 sections in this exact order:",
        "\n".join(heading_bullets),
        "5. Write concrete operational guidance, not a story about a past task.",
        "6. Include validation steps and anti-patterns so future runs know how to avoid false completion.",
    ]
    return "\n".join(instruction_lines)
def normalize_skill_frontmatter(frontmatter: dict[str, Any] | None, *, skill_name: str) -> dict[str, Any]:
    """Return a cleaned copy of skill frontmatter.

    ``name`` is slugified (falling back to ``skill_name``), ``description``
    gets a generic default when empty, and ``tools`` is coerced to a list of
    unique strings. String values of ``always`` / ``internal`` are converted
    to booleans; every other key is copied through unchanged after the three
    core keys.
    """
    source = dict(frontmatter or {})
    name = _slug(str(source.get("name") or skill_name))
    result: dict[str, Any] = {
        "name": name,
        "description": str(source.get("description") or f"Use when {name} guidance is needed.").strip(),
        "tools": _coerce_string_list(source.get("tools")),
    }
    for key, value in source.items():
        if key in {"name", "description", "tools"}:
            continue
        if key in {"always", "internal"} and isinstance(value, str):
            # Flags written as text, e.g. "true" / "yes" / "on".
            value = value.strip().lower() in {"1", "true", "yes", "on"}
        result[key] = value
    return result
def is_canonical_skill_body(body: str) -> bool:
    """Check whether a skill body already follows the canonical layout.

    A body is canonical when it contains an H1 title and every heading in
    ``CANONICAL_SKILL_SECTION_HEADINGS`` appears, in order, on a line of its
    own. Headings are matched as whole lines (trailing blanks allowed), so a
    deeper heading such as ``### Overview`` or a longer one such as
    ``## Overview: notes`` does not count as the required ``## Overview``.
    """
    text = body.strip()
    if not re.search(r"^#\s+\S", text, flags=re.MULTILINE):
        return False
    position = 0
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        # Anchor to line boundaries; with MULTILINE, ``^`` only matches at real
        # line starts even when the search begins at ``position``.
        pattern = re.compile(rf"^{re.escape(heading)}[ \t\r]*$", flags=re.MULTILINE)
        found = pattern.search(text, position)
        if found is None:
            return False
        position = found.end()
    return True
def ensure_canonical_skill_body(
    body: str,
    *,
    title: str,
    description: str = "",
    tools: list[str] | None = None,
) -> str:
    """Return ``body`` in canonical form, wrapping it when necessary.

    A body that is already canonical is only stripped, has its Required Tools
    section replaced when ``tools`` is non-empty, and gets a single trailing
    newline. Any other body is embedded as source guidance inside a freshly
    generated canonical skeleton with generic workflow/validation text.
    """
    if not is_canonical_skill_body(body):
        source = _compact_source_guidance(body)
        return canonicalize_skill_body(
            title=title,
            overview=description or source or f"Use this skill for {title}.",
            tools=list(tools or []),
            workflow=[
                "Identify whether the user's request matches the skill's trigger conditions.",
                "Read the relevant source guidance below and apply only the steps that fit the current task.",
                "Use the required tools deliberately and keep tool output tied to the user's goal.",
            ],
            validation=[
                "Verify the requested outcome with the most direct available check.",
                "Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.",
            ],
            boundaries=[
                "Do not broaden the task beyond the user's request.",
                "Do not use tools that are not listed or clearly available in the current runtime.",
            ],
            anti_patterns=[
                "Do not summarize the skill instead of applying it.",
                "Do not claim completion without validation evidence.",
            ],
            source_guidance=source,
        )
    canonical = body.strip()
    if tools:
        canonical = _replace_required_tools_section(canonical, tools)
    return canonical + "\n"
def canonicalize_skill_body(
    *,
    title: str,
    overview: str,
    tools: list[str] | None = None,
    workflow: list[str] | None = None,
    validation: list[str] | None = None,
    boundaries: list[str] | None = None,
    anti_patterns: list[str] | None = None,
    when_to_use: list[str] | None = None,
    source_guidance: str = "",
) -> str:
    """Render a canonical skill body from its individual parts.

    Each list argument becomes a bullet list under its H2 section; missing or
    empty lists fall back to a generic one-line default. ``source_guidance``,
    when non-blank, is appended to the Workflow section under a
    ``### Source Guidance`` sub-heading. The result ends with one newline.
    """
    heading_title = _title(title)
    overview_text = overview.strip() or f"Use this skill for {heading_title}."
    workflow_text = _bullet_lines(workflow or ["Follow the workflow described by the current task and evidence."])
    guidance = source_guidance.strip()
    if guidance:
        workflow_text += f"\n\n### Source Guidance\n\n{guidance}"

    blocks = [
        f"# {heading_title}",
        "## Overview",
        overview_text,
        "## When to Use",
        _bullet_lines(when_to_use or [f"Use when the task requires {heading_title} guidance."]),
        "## Required Tools",
        _tool_lines(tools or []),
        "## Workflow",
        workflow_text,
        "## Validation",
        _bullet_lines(validation or ["Validate the result before reporting completion."]),
        "## Boundaries",
        _bullet_lines(boundaries or ["Stay within the current task and workspace boundaries."]),
        "## Anti-Patterns",
        _bullet_lines(anti_patterns or ["Do not skip validation."]),
    ]
    # Every heading and section body is separated by exactly one blank line.
    return "\n\n".join(blocks) + "\n"
def parse_skill_rewrite_json(content: str, *, skill_name: str) -> dict[str, Any] | None:
cleaned = content.strip()
if cleaned.startswith("```"):
lines = cleaned.splitlines()
if len(lines) >= 3 and lines[0].startswith("```") and lines[-1].startswith("```"):
cleaned = "\n".join(lines[1:-1]).strip()
try:
payload = json.loads(cleaned)
except json.JSONDecodeError:
return None
if not isinstance(payload, dict):
return None
frontmatter = payload.get("frontmatter")
body = payload.get("content")
if not isinstance(frontmatter, dict) or not isinstance(body, str):
return None
normalized = normalize_skill_frontmatter(frontmatter, skill_name=skill_name)
normalized["tools"] = _merge_string_lists(
normalized.get("tools"),
extract_required_tool_names(body),
)
normalized_body = ensure_canonical_skill_body(
body,
title=normalized["name"],
description=normalized["description"],
tools=normalized["tools"],
)
return {
"frontmatter": normalized,
"content": normalized_body,
"change_reason": str(payload.get("change_reason") or ""),
}
def _compact_source_guidance(body: str, *, max_chars: int = 20000) -> str:
text = body.strip()
if not text:
return ""
text = re.sub(r"^---\n.*?\n---\n?", "", text, flags=re.DOTALL).strip()
text = re.sub(r"\n{3,}", "\n\n", text)
text = re.sub(r"^(#{1,4})\s+", r"##\1 ", text, flags=re.MULTILINE)
return text[:max_chars].rstrip()
def _tool_lines(tools: list[str]) -> str:
if not tools:
return "- No dedicated tools are required."
return "\n".join(f"- `{tool}`" for tool in tools)
def _bullet_lines(items: list[str]) -> str:
cleaned = [str(item).strip() for item in items if str(item).strip()]
if not cleaned:
return "- No additional guidance."
return "\n".join(f"- {item}" for item in cleaned)
def _coerce_string_list(value: Any) -> list[str]:
if isinstance(value, list):
raw_items = value
elif isinstance(value, str):
raw_items = value.split(",")
else:
raw_items = []
result: list[str] = []
for item in raw_items:
cleaned = str(item).strip()
if cleaned and cleaned not in result:
result.append(cleaned)
return result
def _merge_string_lists(*values: Any) -> list[str]:
result: list[str] = []
for value in values:
for item in _coerce_string_list(value):
if item not in result:
result.append(item)
return result
def _replace_required_tools_section(body: str, tools: list[str]) -> str:
    """Replace the body's ``## Required Tools`` section with a fresh tool list.

    The section runs from its heading to the next H2 heading or the end of the
    text. Only the first such section is replaced; when none exists the
    stripped body is returned unchanged.
    """
    replacement = "## Required Tools\n\n" + _tool_lines(tools) + "\n\n"
    updated, count = re.subn(
        r"(?ms)^##\s+Required\s+Tools\s*\n.*?(?=^##\s+|\Z)",
        # A callable inserts the text literally. Passed as a string it would be
        # treated as a template, so a tool name containing a backslash could
        # raise re.error or expand as a group reference.
        lambda _match: replacement,
        body.strip(),
        count=1,
    )
    return updated.strip() if count else body.strip()
def _slug(value: str) -> str:
text = value.strip().lower()
text = re.sub(r"[^a-z0-9-]+", "-", text)
text = re.sub(r"-{2,}", "-", text).strip("-")
return text or "generated-skill"
def _title(value: str) -> str:
cleaned = str(value or "").strip().replace("-", " ")
return cleaned.title() if cleaned else "Generated Skill"

View File

@ -28,12 +28,13 @@ Choose `new_task` when the user asks for anything that needs the main Task agent
The Intent Agent has no tools. If a request needs a tool, do not apologize and do not say you cannot access it. Route it to Task mode so the main agent can use tools.
When there is an active task, do not force every new user message into that task. Use the active task and recent conversation to decide:
When there is an active task, do not force every new user message into that task. A Session is the durable conversation/device/group context; a Task is one unit of work inside that Session. Use the active task and recent conversation to decide:
- Choose `revise_task` when the user asks to change, correct, refine, expand, reformat, or redo the latest active task result.
- Choose `continue_task` for neutral follow-up questions or additional next steps that still belong to the active task.
- Choose `continue_task` for neutral follow-up questions or additional next steps that explicitly depend on or extend the active task's latest result.
- Choose `simple_chat` for unrelated lightweight conversation. This starts a new topic and the previous task will be accepted automatically.
- Choose `new_task` when the user asks for clearly unrelated work that needs Task capabilities. This starts a new topic and the previous task will be accepted automatically.
- Choose `new_task` for a standalone tool-dependent request even when it resembles the active task. Repeating "珠海天气怎么样" later is a fresh task unless the user clearly says to continue or revise the old result.
- Choose `close_task` when the user says the task is satisfactory or finished, such as "可以了", "就这样", or "that's good".
- Choose `abandon_task` when the user says to stop, cancel, or no longer do the active task.
@ -46,6 +47,7 @@ Examples with an active weather task:
- "再详细一点" -> `revise_task`
- "加上明后天穿衣建议" -> `revise_task`
- "顺便查一下深圳" -> `continue_task`
- "珠海天气怎么样" -> `new_task` when asked as a standalone later request
- "帮我写一个采购合同" -> `new_task`
- "吃饭没" -> `simple_chat`
- "我在冰岛" -> `simple_chat`

View File

@ -27,6 +27,7 @@ from beaver.skills.specs.storage import SkillSpecStore
from .utils import (
check_requirements,
escape_xml,
extract_required_tool_names,
get_missing_requirements,
parse_frontmatter,
parse_skill_metadata_blob,
@ -111,13 +112,19 @@ class SkillsLoader:
if not include_internal and _truthy(frontmatter.get("internal")):
continue
normalized_frontmatter = dict(frontmatter)
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
record = SkillRecord(
name=name,
path=skill_file,
source=source,
version="legacy",
source_kind=source,
tool_hints=self._coerce_tool_names(frontmatter.get("tools")),
tool_hints=self._merge_tool_names(
self._coerce_tool_names(frontmatter.get("tools")),
self._coerce_tool_names(meta_blob.get("tools")),
self._coerce_tool_names(meta_blob.get("required_tools")),
extract_required_tool_names(body),
),
frontmatter=normalized_frontmatter,
description=str(frontmatter.get("description") or summarize_body(body) or name),
)
@ -138,6 +145,7 @@ class SkillsLoader:
path = self.workspace_skills / name / "SKILL.md"
else:
path = self.workspace_skills / name / "versions" / loaded.version.version / "SKILL.md"
_frontmatter, body = parse_frontmatter(loaded.content)
record = SkillRecord(
name=name,
path=path,
@ -146,7 +154,10 @@ class SkillsLoader:
content_hash=loaded.version.content_hash,
source_kind=str(loaded.version.provenance.get("source_kind") or "workspace"),
status=str(loaded.version.review_state or "published"),
tool_hints=list(loaded.version.tool_hints),
tool_hints=self._merge_tool_names(
loaded.version.tool_hints,
extract_required_tool_names(body),
),
frontmatter=dict(loaded.version.frontmatter),
description=str(loaded.version.frontmatter.get("description") or loaded.version.summary or name),
)
@ -201,23 +212,32 @@ class SkillsLoader:
- read_file
- search_files
- 兼容 metadata JSON blob 里的 `tools`
- 兼容 canonical 正文 `## Required Tools` 段落
"""
record = self._find_record(name)
if record is not None and record.tool_hints:
return list(record.tool_hints)
frontmatter = self.get_skill_metadata(name) or {}
content = self.load_published_skill(name) or self.load_skill(name) or ""
frontmatter, body = parse_frontmatter(content)
frontmatter = frontmatter or self.get_skill_metadata(name) or {}
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
names = [
*self._coerce_tool_names(frontmatter.get("tools")),
*self._coerce_tool_names(meta_blob.get("tools")),
*self._coerce_tool_names(meta_blob.get("required_tools")),
]
names = self._merge_tool_names(
self._coerce_tool_names(frontmatter.get("tools")),
self._coerce_tool_names(meta_blob.get("tools")),
self._coerce_tool_names(meta_blob.get("required_tools")),
extract_required_tool_names(body),
)
return names
@staticmethod
def _merge_tool_names(*groups: Any) -> list[str]:
result: list[str] = []
for item in names:
if item and item not in result:
result.append(item)
for group in groups:
for item in SkillsLoader._coerce_tool_names(group):
if item and item not in result:
result.append(item)
return result
def load_skills_for_context(self, skill_names: list[str]) -> str:

View File

@ -84,6 +84,41 @@ def strip_frontmatter(content: str) -> str:
return body
def extract_required_tool_names(body: str) -> list[str]:
    """Extract tool names from the ``## Required Tools`` section of a skill body.

    This is a tolerant supplement to the frontmatter ``tools`` field. It does
    not guess tools from arbitrary prose: only an explicitly named
    "Required Tools" level-2 section is read (heading matched
    case-insensitively), and only its bullet lines (``-`` or ``*``) are
    considered. Backtick-quoted names are preferred; otherwise the bullet is
    treated as a comma-separated list and the first word of each item is used.

    Args:
        body: Skill markdown text to scan.

    Returns:
        Tool names in first-seen order without duplicates; an empty list when
        the body is empty, has no such section, or lists no valid names.
    """
    if not body:
        return []
    match = re.search(
        r"(?ims)^##\s+Required\s+Tools\s*$\n(?P<section>.*?)(?=^##\s+|\Z)",
        body,
    )
    if match is None:
        return []
    names: list[str] = []
    for line in match.group("section").splitlines():
        stripped = line.strip()
        if not stripped.startswith(("-", "*")):
            continue
        candidate = stripped[1:].strip()
        code_matches = re.findall(r"`([^`]+)`", candidate)
        raw_items = code_matches or re.split(r"[,]", candidate)
        # NOTE(review): an unquoted prose bullet such as
        # "- No dedicated tools are required." yields its first word ("No") as
        # a tool name - confirm whether placeholder bullets should be skipped.
        for raw_item in raw_items:
            words = raw_item.strip().strip("`\"' ").split()
            if not words:
                # Empty item, or one that is only whitespace once quotes are
                # removed (e.g. a quoted tab); indexing [0] would raise.
                continue
            token = words[0].strip("`\"' :-")
            if re.fullmatch(r"[A-Za-z0-9_.:-]+", token) and token not in names:
                names.append(token)
    return names
def parse_skill_metadata_blob(raw: str) -> dict[str, Any]:
"""解析 metadata 字段里的 JSON 扩展配置。

View File

@ -2,6 +2,8 @@
from __future__ import annotations
import json
from typing import Any
from uuid import uuid4
from beaver.engine.context import SkillContext
@ -39,7 +41,16 @@ class SkillDraftEvaluator:
return self._skipped(candidate, draft)
runs = self.run_store.list_runs()
replay_cases = select_replay_cases(candidate, runs)
if replay_runner is not None:
replay_cases, case_selection_meta = await _prepare_eval_cases(
candidate=candidate,
draft=draft,
historical_cases=select_replay_cases(candidate, runs),
provider_bundle=provider_bundle,
)
else:
replay_cases = []
case_selection_meta = {}
if replay_runner is not None and replay_cases:
return await self._evaluate_replay(
candidate=candidate,
@ -47,6 +58,7 @@ class SkillDraftEvaluator:
replay_cases=replay_cases,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
case_selection_meta=case_selection_meta,
)
return self._evaluate_heuristic(candidate, draft, runs)
@ -58,7 +70,7 @@ class SkillDraftEvaluator:
) -> SkillDraftEvalReport:
runs_by_id = {record.run_id: record for record in runs}
cases: list[dict] = []
for run_id in candidate.source_run_ids[:8]:
for run_id in candidate.source_run_ids[:10]:
record = runs_by_id.get(run_id)
if record is None:
continue
@ -116,6 +128,7 @@ class SkillDraftEvaluator:
replay_cases: list[dict],
provider_bundle: ProviderBundle,
replay_runner: ReplayRunner,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
case_reports: list[dict] = []
legacy_cases: list[dict] = []
@ -147,17 +160,43 @@ class SkillDraftEvaluator:
baseline=baseline,
candidate=candidate_arm,
)
baseline_score = surrogate["baseline_score"]
candidate_score = surrogate["candidate_score"]
baseline_ability = _ability_score(
case=case,
arm=baseline,
arm_name="baseline",
)
candidate_ability = _ability_score(
case=case,
arm=candidate_arm,
arm_name="candidate",
)
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tool_execution_score = {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
}
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": tool_execution_score,
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
@ -172,13 +211,23 @@ class SkillDraftEvaluator:
{
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
)
preservation_report = _preservation_report(candidate, draft)
return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report)
return _report_from_case_reports(
candidate,
draft,
case_reports,
legacy_cases,
preservation_report,
case_selection_meta or {},
)
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
return SkillDraftEvalReport(
@ -238,22 +287,400 @@ def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
async def _prepare_eval_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Assemble up to ten evaluation cases plus a summary of how they were chosen.

    Explicit cases from the candidate come first, followed by historical
    cases; duplicates and synthetic cases without a validator are dropped.
    When fewer than ten remain, the shortfall is filled with generated
    synthetic cases and, if that is still not enough, with fallback ones.

    Returns:
        ``(cases, summary)`` where ``cases`` holds at most ten entries.
        ``generated_synthetic_count`` in the summary counts every entry flagged
        ``synthetic`` in the combined list, not only the newly generated ones.
    """
    explicit = _explicit_eval_cases(candidate)
    scorable, skipped = _filter_unscorable_cases(_dedupe_cases([*explicit, *historical_cases]))
    shortfall = max(0, 10 - len(scorable))
    synthetic: list[dict[str, Any]] = []
    if shortfall > 0:
        llm_cases = await _generate_synthetic_cases(
            candidate=candidate,
            draft=draft,
            historical_cases=scorable,
            provider_bundle=provider_bundle,
            count=shortfall,
        )
        synthetic, llm_skipped = _filter_unscorable_cases(llm_cases)
        skipped["synthetic_without_validator"] += llm_skipped["synthetic_without_validator"]
        still_missing = shortfall - len(synthetic)
        if still_missing > 0:
            # Fallback numbering continues after the generated cases.
            synthetic.extend(
                _fallback_synthetic_cases(
                    candidate=candidate,
                    historical_cases=scorable,
                    start_index=len(synthetic) + 1,
                    count=still_missing,
                )
            )
    combined = scorable + synthetic
    summary = {
        "requested_case_count": 10,
        "historical_case_count": len(historical_cases),
        "explicit_case_count": len(explicit),
        "generated_synthetic_count": sum(1 for entry in combined if entry.get("synthetic")),
        "excluded_synthetic_without_validator": skipped["synthetic_without_validator"],
    }
    return combined[:10], summary
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
    """Build evaluation cases from ``candidate.evidence["eval_cases"]``.

    Entries that are not dicts or have a blank ``task_text`` are skipped.
    Missing identifiers are filled with ``explicit``-prefixed defaults derived
    from the entry's 1-based position, and a ``validator`` is copied only when
    it is a dict.

    Returns:
        The normalized cases, or ``[]`` when the evidence is not a dict or has
        no ``eval_cases`` list.
    """
    evidence = candidate.evidence
    declared = evidence.get("eval_cases") if isinstance(evidence, dict) else None
    if not isinstance(declared, list):
        return []
    cases: list[dict[str, Any]] = []
    for position, entry in enumerate(declared, start=1):
        if not isinstance(entry, dict):
            continue
        prompt = str(entry.get("task_text") or "").strip()
        if not prompt:
            continue
        is_synthetic = bool(entry.get("synthetic"))
        built: dict[str, Any] = {
            "run_id": str(entry.get("run_id") or f"explicit:{candidate.candidate_id}:{position:02d}"),
            "task_id": entry.get("task_id") or f"explicit-{position:02d}",
            "session_id": entry.get("session_id") or "explicit-eval",
            "task_text": prompt,
            "baseline_skill_names": list(entry.get("baseline_skill_names") or _baseline_skill_names(candidate)),
            "candidate_skill_name": entry.get("candidate_skill_name") or candidate.draft_skill_name,
            "accepted_score": _bounded_score(entry.get("accepted_score"), default=0.75),
            "synthetic": is_synthetic,
            "tier": entry.get("tier") or ("bronze" if is_synthetic else "gold"),
        }
        validator = entry.get("validator")
        if isinstance(validator, dict):
            built["validator"] = dict(validator)
        cases.append(built)
    return cases
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
seen: set[str] = set()
for case in cases:
run_id = str(case.get("run_id") or "")
task_text = str(case.get("task_text") or "")
key = run_id or task_text
if not key or key in seen:
continue
seen.add(key)
result.append(case)
return result
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
result: list[dict[str, Any]] = []
excluded = {"synthetic_without_validator": 0}
for case in cases:
if case.get("synthetic") and not isinstance(case.get("validator"), dict):
excluded["synthetic_without_validator"] += 1
continue
result.append(case)
return result, excluded
async def _generate_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
    count: int,
) -> list[dict[str, Any]]:
    """Ask the model for ``count`` validator-first synthetic evaluation cases.

    The auxiliary provider/runtime is used when present, otherwise the main
    one. Generation is best-effort: any exception raised while building the
    prompt or calling the provider results in an empty list, as does a reply
    without a ``cases`` list.
    """
    llm = provider_bundle.auxiliary_provider or provider_bundle.main_provider
    llm_runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
    model_name = getattr(llm_runtime, "model", None)
    system_prompt = (
        "You generate validator-first Beaver skill evaluation cases. "
        "Return only JSON with key cases. Each case must include task_text and validator. "
        "Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
    )
    try:
        user_prompt = _synthetic_case_prompt(
            candidate=candidate,
            draft=draft,
            historical_cases=historical_cases,
            count=count,
        )
        response = await llm.chat(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model=model_name,
            max_tokens=2200,
            temperature=0.4,
        )
    except Exception:
        # Deliberately swallowed: the caller falls back to templated cases.
        return []
    parsed = _parse_json_payload(response.content or "")
    proposed = parsed.get("cases") if isinstance(parsed, dict) else None
    if isinstance(proposed, list):
        return _synthetic_case_payloads(candidate, proposed, start_index=1, limit=count)
    return []
def _synthetic_case_prompt(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    count: int,
) -> str:
    """Build the user prompt requesting ``count`` synthetic evaluation cases.

    The prompt lists the candidate kind and reason, the draft skill name, the
    related skills, the historical cases (``run_id``, ``task_text`` and
    ``validator`` only, as JSON) and the exact JSON shape expected back.
    """
    history_json = json.dumps(
        [
            {
                "run_id": past.get("run_id"),
                "task_text": past.get("task_text"),
                "validator": past.get("validator"),
            }
            for past in historical_cases
        ],
        ensure_ascii=False,
    )
    sections = [
        f"Generate {count} synthetic evaluation cases for this skill draft.\n\n",
        f"Candidate kind: {candidate.kind}\n",
        f"Candidate reason: {candidate.reason}\n",
        f"Draft skill name: {draft.skill_name}\n",
        f"Related skills: {candidate.related_skill_names}\n",
        f"Historical cases:\n{history_json}\n\n",
        "Every synthetic case must be validator-first. Return exactly:\n",
        '{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",',
        '"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}',
    ]
    return "".join(sections)
def _parse_json_payload(content: str) -> dict[str, Any]:
cleaned = content.strip()
if cleaned.startswith("```"):
cleaned = cleaned.strip("`")
if cleaned.startswith("json"):
cleaned = cleaned[4:]
try:
payload = json.loads(cleaned)
except json.JSONDecodeError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start < 0 or end <= start:
return {}
try:
payload = json.loads(cleaned[start : end + 1])
except json.JSONDecodeError:
return {}
return payload if isinstance(payload, dict) else {}
def _synthetic_case_payloads(
    candidate: SkillLearningCandidate,
    raw_cases: list[Any],
    *,
    start_index: int,
    limit: int,
) -> list[dict[str, Any]]:
    """Convert raw model-proposed cases into synthetic case payloads.

    Entries that are not dicts, have a blank ``task_text`` or lack a dict
    ``validator`` are skipped. Accepted cases are numbered consecutively from
    ``start_index``; conversion stops once ``limit`` cases have been collected
    (the check runs after each accepted case).
    """
    payloads: list[dict[str, Any]] = []
    for entry in raw_cases:
        if not isinstance(entry, dict):
            continue
        prompt = str(entry.get("task_text") or "").strip()
        validator = entry.get("validator")
        if prompt and isinstance(validator, dict):
            payloads.append(
                _synthetic_case_payload(
                    candidate,
                    prompt,
                    start_index + len(payloads),
                    validator=dict(validator),
                    tier=str(entry.get("tier") or "bronze"),
                )
            )
            if len(payloads) >= limit:
                break
    return payloads
def _fallback_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    historical_cases: list[dict[str, Any]],
    start_index: int,
    count: int,
) -> list[dict[str, Any]]:
    """Create ``count`` templated synthetic cases without calling a model.

    The seed text is the ``task_text`` of one historical case (picked by
    ``start_index``), else the candidate reason, else the draft skill name,
    else a generic phrase. Every case shares one ``final_answer_contains``
    validator built from the first two terms of the seed (``["done"]`` when
    the seed yields none).
    """
    seed = ""
    if historical_cases:
        picked = historical_cases[(start_index - 1) % len(historical_cases)]
        seed = str(picked.get("task_text") or "")
    if not seed:
        seed = candidate.reason or candidate.draft_skill_name or "the candidate skill"
    terms = _terms(seed)[:2] or ["done"]
    cases: list[dict[str, Any]] = []
    for scenario in range(start_index, start_index + count):
        cases.append(
            _synthetic_case_payload(
                candidate,
                f"Complete a realistic task related to {seed}. Scenario {scenario}.",
                scenario,
                validator={"type": "final_answer_contains", "required_terms": terms, "forbidden_terms": []},
                tier="bronze",
            )
        )
    return cases
def _synthetic_case_payload(
    candidate: SkillLearningCandidate,
    task_text: str,
    index: int,
    *,
    validator: dict[str, Any],
    tier: str,
) -> dict[str, Any]:
    """Build one synthetic evaluation case dict.

    Identifiers embed the candidate id and the zero-padded ``index``; the case
    is flagged ``synthetic`` with a fixed ``accepted_score`` of 0.75.
    """
    return dict(
        run_id=f"synthetic:{candidate.candidate_id}:{index:02d}",
        task_id=f"synthetic-{index:02d}",
        session_id="synthetic-eval",
        task_text=task_text,
        baseline_skill_names=_baseline_skill_names(candidate),
        candidate_skill_name=candidate.draft_skill_name,
        accepted_score=0.75,
        synthetic=True,
        tier=tier,
        validator=validator,
    )
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
    """Return the skills the baseline arm should use for this candidate.

    ``revise_skill`` uses only the first related skill, ``merge_skills`` uses
    all related skills, and any other kind uses none.
    """
    kind = candidate.kind
    if kind == "merge_skills":
        return list(candidate.related_skill_names)
    if kind == "revise_skill":
        return list(candidate.related_skill_names[:1])
    return []
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
    """Score one arm of an evaluation case.

    A dict ``validator`` on the case takes precedence. Without one, synthetic
    cases are reported as unscored (0.0); for other cases the baseline arm
    takes the case's ``accepted_score`` (default 0.75) and any other arm is
    scored from its output.
    """
    validator = case.get("validator")
    if isinstance(validator, dict):
        return _ability_from_validator(validator, arm)
    if case.get("synthetic"):
        return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])
    if arm_name == "baseline":
        accepted = _bounded_score(case.get("accepted_score"), default=0.75)
        return _ability_breakdown(score=accepted, source="user_feedback")
    return _ability_breakdown(score=_ability_from_output(arm)["final_score"], source="llm_judge")
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
    """Score an arm's final answer against a ``final_answer_contains`` validator.

    The outcome is the fraction of ``required_terms`` found in the lowercased
    final answer; safety is 0.0 when any ``forbidden_terms`` entry appears,
    else 1.0. The final score weights outcome (0.40 + 0.25), process validity
    (0.15), safety (0.10) and path efficiency (0.10).

    Any other validator type is not evaluated; the arm is scored from its
    output with a note naming the unsupported type.

    Returns:
        The ability breakdown with each dimension set to its own component
        value, plus ``validator_type``.
    """
    validator_type = str(validator.get("type") or "")
    if validator_type != "final_answer_contains":
        return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])
    answer = str(arm.get("final_answer") or "").lower()
    required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
    forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]
    matched = sum(1 for term in required_terms if term in answer)
    # max(1, ...) avoids division by zero; no required terms gives outcome 0.0.
    outcome = matched / max(1, len(required_terms))
    unsafe = any(term in answer for term in forbidden_terms)
    safety = 0.0 if unsafe else 1.0
    process = _process_validity(arm)
    efficiency = _path_efficiency(arm, outcome)
    final_score = (
        0.40 * outcome
        + 0.25 * outcome
        + 0.15 * process
        + 0.10 * safety
        + 0.10 * efficiency
    )
    return {
        **_ability_breakdown(score=final_score, source="auto_validator"),
        "outcome_correctness": round(outcome, 4),
        "artifact_correctness": round(outcome, 4),
        # The generic breakdown fills every dimension with the final score;
        # report the real component values for these two as well.
        "process_validity": round(process, 4),
        "safety_no_regression": round(safety, 4),
        "path_efficiency": round(efficiency, 4),
        "validator_type": validator_type,
    }
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
    """Score an arm from its output alone.

    The score is 0.7 when the arm has a non-blank ``final_answer`` and its
    ``finish_reason`` is not ``"error"``, otherwise 0.3.
    """
    has_answer = bool(str(arm.get("final_answer") or "").strip())
    succeeded = has_answer and arm.get("finish_reason") != "error"
    return _ability_breakdown(score=0.7 if succeeded else 0.3, source=source, notes=notes)
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    """Build an ability-score dict in which every dimension equals ``score``.

    The score is clamped via ``_bounded_score`` (default 0.0);
    ``final_score`` is additionally rounded to four decimals. ``notes`` is
    copied into a new list.
    """
    clamped = _bounded_score(score, default=0.0)
    dimensions = (
        "outcome_correctness",
        "artifact_correctness",
        "process_validity",
        "safety_no_regression",
        "path_efficiency",
    )
    breakdown: dict[str, Any] = dict.fromkeys(dimensions, clamped)
    breakdown["final_score"] = round(clamped, 4)
    breakdown["source"] = source
    breakdown["notes"] = list(notes or [])
    return breakdown
def _process_validity(arm: dict[str, Any]) -> float:
if arm.get("finish_reason") == "error":
return 0.2
return 0.8 if arm.get("tool_calls") else 0.6
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
if outcome < 0.5:
return 0.3
call_count = len([item for item in arm.get("tool_calls") or [] if isinstance(item, dict)])
if call_count <= 3:
return 1.0
if call_count <= 6:
return 0.7
return 0.4
def _bounded_score(value: Any, *, default: float) -> float:
try:
return max(0.0, min(1.0, float(value)))
except (TypeError, ValueError):
return default
def _terms(text: str) -> list[str]:
return [part.strip(".,:;!?()[]{}").lower() for part in text.split() if len(part.strip(".,:;!?()[]{}")) > 3]
def _report_from_case_reports(
candidate: SkillLearningCandidate,
draft: SkillDraft,
case_reports: list[dict],
legacy_cases: list[dict],
preservation_report: dict | None,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
unchanged = len(legacy_cases) - len(regressions) - len(improved)
real_cases = [item for item in legacy_cases if not item.get("synthetic")]
synthetic_cases = [item for item in legacy_cases if item.get("synthetic")]
execution, surrogate, blocked = _coverage(case_reports)
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
score_delta = candidate_avg - baseline_avg
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
selection_meta = dict(case_selection_meta or {})
real_score_avg = _avg([item["candidate_score"] for item in real_cases])
synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
overall_score_avg = round(candidate_avg, 4)
ability_summary = {
"score_role": "primary",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
}
tool_execution_summary = {
"score_role": "diagnostic_only",
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
}
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
@ -276,11 +703,34 @@ def _report_from_case_reports(
blocked_coverage=blocked,
confidence=confidence,
case_reports=case_reports,
tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
tool_mode_summary={
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
"score_role": "diagnostic_only",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
**selection_meta,
},
ability_score_summary=ability_summary,
tool_execution_summary=tool_execution_summary,
case_selection_summary=selection_meta,
real_score_avg=real_score_avg,
synthetic_score_avg=synthetic_score_avg,
overall_score_avg=overall_score_avg,
preservation_report=preservation_report,
)
def _avg(values: list[float]) -> float | None:
if not values:
return None
return round(sum(values) / len(values), 4)
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
for report in case_reports:

View File

@ -323,8 +323,8 @@ class SkillLearningPipelineService:
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
if not any(review.status == SkillReviewState.APPROVED.value for review in reviews):
raise ValueError("Draft must have an approved review before publish")
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
raise ValueError("Draft must be submitted for review before publish")
safety = self.get_safety_report(draft.skill_name, draft.draft_id)
if safety is None:
raise ValueError("Draft requires a passing safety report before publish")

View File

@ -162,18 +162,23 @@ class ReplayRunner:
registry=loaded.tool_registry,
policy=self.policy,
)
result = await self.agent_loop.process_direct(
request.task_text,
provider_bundle=request.provider_bundle,
include_skill_assembly=False,
include_tools=True,
pinned_skill_names=request.pinned_skill_names,
pinned_skill_contexts=request.pinned_skill_contexts,
max_tool_iterations=int(request.model_settings.get("max_tool_iterations") or 4),
temperature=float(request.model_settings.get("temperature") or 0.0),
source="skill_replay_eval",
tool_executor_override=replay_executor,
)
direct_kwargs = {
"provider_bundle": request.provider_bundle,
"include_skill_assembly": False,
"include_tools": True,
"pinned_skill_names": request.pinned_skill_names,
"pinned_skill_contexts": request.pinned_skill_contexts,
"max_tool_iterations": int(request.model_settings.get("max_tool_iterations") or 4),
"temperature": float(request.model_settings.get("temperature") or 0.0),
"source": "skill_replay_eval",
"tool_executor_override": replay_executor,
}
try:
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
except RuntimeError as exc:
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
raise
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
return {
"case_id": request.case_id,
"arm": request.arm,
@ -188,6 +193,14 @@ class ReplayRunner:
}
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
message = str(exc)
return (
"AgentLoop.process_direct() is disabled while run() is active" in message
and "submit tasks via submit_direct() instead" in message
)
def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]:
effects: list[dict[str, Any]] = []
for trace in traces:

View File

@ -99,6 +99,7 @@ class SkillLearningService:
]
source_run_ids = [record.run_id for record in source_runs]
source_session_ids = list(dict.fromkeys(record.session_id for record in source_runs))
representative_task_text = self._representative_task_text(source_runs, fallback=final_run.task_text)
if not published_receipts:
candidates.append(
@ -113,7 +114,8 @@ class SkillLearningService:
"task_id": task_id,
"final_accepted_run_id": final_accepted_run_id,
"source_run_ids": source_run_ids,
"theme": self._task_theme(final_run.task_text),
"task_text": representative_task_text,
"theme": self._task_theme(representative_task_text),
},
status="open",
priority=1,
@ -329,8 +331,14 @@ class SkillLearningService:
def _build_new_skill_candidates(self) -> list[SkillLearningCandidate]:
groups: dict[str, list[RunRecord]] = {}
for record in self.run_store.list_runs():
key = self._task_theme(record.task_text)
all_runs = self.run_store.list_runs()
runs_by_task: dict[str, list[RunRecord]] = {}
for record in all_runs:
if record.task_id:
runs_by_task.setdefault(record.task_id, []).append(record)
for record in all_runs:
task_runs = runs_by_task.get(record.task_id, [record])
key = self._task_theme(self._representative_task_text(task_runs, fallback=record.task_text))
if not key:
continue
groups.setdefault(key, []).append(record)
@ -443,12 +451,24 @@ class SkillLearningService:
@staticmethod
def _task_theme(task_text: str) -> str:
cleaned = re.sub(r"\s+", " ", task_text.strip().lower())
cleaned = re.sub(r"\s+", " ", task_text.strip())
if not cleaned:
return ""
words = cleaned.split(" ")
first_sentence = re.split(r"[。!?.!?]", cleaned, maxsplit=1)[0].strip()
if not first_sentence:
first_sentence = cleaned
words = first_sentence.split(" ")
return " ".join(words[:8]).strip()
@staticmethod
def _representative_task_text(runs: list[RunRecord], *, fallback: str = "") -> str:
    """Pick the task text that represents a group of runs.

    Runs are ordered by ``(attempt_index, started_at, run_id)`` and the first
    non-blank ``task_text`` (trimmed) is returned. When every run has a blank
    text, the trimmed ``fallback`` is returned instead.
    """
    ordered = sorted(runs, key=lambda item: (item.attempt_index, item.started_at, item.run_id))
    first_text = next(
        (stripped for record in ordered if (stripped := record.task_text.strip())),
        None,
    )
    if first_text is not None:
        return first_text
    return fallback.strip()
@staticmethod
def _suggest_skill_name(
candidate: SkillLearningCandidate,

View File

@ -15,12 +15,15 @@ class SurrogateToolEvaluator:
return {
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"baseline_tool_execution_score": baseline_score,
"candidate_tool_execution_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"surrogate_tool_count": surrogate_count,
"blocked_tool_count": blocked_count,
"score_role": "diagnostic_only",
"confidence": confidence,
"notes": [
"Surrogate score is based on intended tool calls, schemas, arguments, and task relevance.",
"Tool execution score is diagnostic only and is not the main task ability score.",
],
}

View File

@ -6,6 +6,7 @@ import json
from typing import Any
from beaver.engine.providers.base import LLMProvider
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
from beaver.skills.learning.evidence import EvidencePacket
from beaver.memory.skills.models import SkillLearningCandidate
@ -58,7 +59,8 @@ class SkillDraftSynthesizer:
"content": (
"You synthesize Beaver skill drafts from execution evidence. "
"Return only JSON with keys: frontmatter, content, change_reason, "
"preserved_sections, changed_sections, dropped_sections."
"preserved_sections, changed_sections, dropped_sections. "
"The content must follow the Canonical Beaver SKILL.md format."
),
},
{"role": "user", "content": prompt},
@ -113,6 +115,7 @@ class SkillDraftSynthesizer:
+ "\n- tools: an explicit JSON array of exact tool names this skill needs. "
+ "Prefer called tool names when the workflow depends on them; use run-selected tool names only when clearly required. "
+ "Use [] only when no tool is required."
+ "\n\n" + canonical_skill_format_instructions()
+ "\nThe JSON may include preserved_sections, changed_sections, and dropped_sections arrays."
)
@ -144,14 +147,23 @@ class SkillDraftSynthesizer:
@staticmethod
def _normalize_payload(payload: dict[str, Any], evidence_packet: EvidencePacket) -> dict[str, Any]:
frontmatter = dict(payload.get("frontmatter") or {})
frontmatter = normalize_skill_frontmatter(
dict(payload.get("frontmatter") or {}),
skill_name=str((payload.get("frontmatter") or {}).get("name") or "generated-skill"),
)
tool_hints = _coerce_string_list(frontmatter.get("tools"))
if not tool_hints:
tool_hints = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
frontmatter["tools"] = tool_hints
content = ensure_canonical_skill_body(
str(payload.get("content") or "").strip(),
title=str(frontmatter.get("name") or "generated-skill"),
description=str(frontmatter.get("description") or ""),
tools=tool_hints,
)
return {
"frontmatter": frontmatter,
"content": str(payload.get("content") or "").strip(),
"content": content,
"change_reason": str(payload.get("change_reason") or ""),
"preserved_sections": _coerce_string_list(payload.get("preserved_sections")),
"changed_sections": _coerce_string_list(payload.get("changed_sections")),
@ -162,13 +174,20 @@ class SkillDraftSynthesizer:
def _fallback_payload(candidate: SkillLearningCandidate, evidence_packet: EvidencePacket, action: str) -> dict[str, Any]:
related = candidate.related_skill_names[0] if candidate.related_skill_names else "generated-skill"
title = related.replace("_", "-")
content = "\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured."
tools = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
content = ensure_canonical_skill_body(
"\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured.",
title=title,
description=candidate.reason or f"Auto-generated {action} draft for {title}.",
tools=tools,
)
return {
"frontmatter": {
"name": title,
"description": candidate.reason or f"Auto-generated {action} draft for {title}.",
"tools": _coerce_string_list(evidence_packet.metadata.get("tool_names")),
"tools": tools,
},
"content": f"# {title}\n\n## Evidence\n\n{content}\n",
"content": content,
"change_reason": candidate.reason or f"Fallback {action} synthesis.",
"preserved_sections": [],
"changed_sections": [],

View File

@ -10,6 +10,7 @@ from typing import Callable
from beaver.engine.providers import ProviderBundle
from beaver.memory.skills import SkillLearningCandidate
from beaver.skills.learning.pipeline import SkillLearningPipelineService
from beaver.skills.learning.replay import ReplayRunner
@dataclass(slots=True)
@ -57,10 +58,12 @@ class SkillLearningWorker:
*,
pipeline: SkillLearningPipelineService,
provider_bundle_factory: Callable[[], ProviderBundle],
replay_runner_factory: Callable[[], ReplayRunner] | None = None,
config: SkillLearningWorkerConfig | None = None,
) -> None:
self.pipeline = pipeline
self.provider_bundle_factory = provider_bundle_factory
self.replay_runner_factory = replay_runner_factory
self.config = config or SkillLearningWorkerConfig.from_env()
self._running = False
self._lock = asyncio.Lock()
@ -126,6 +129,7 @@ class SkillLearningWorker:
draft.skill_name,
draft.draft_id,
provider_bundle=self.provider_bundle_factory(),
replay_runner=self.replay_runner_factory() if self.replay_runner_factory is not None else None,
)
return True

View File

@ -16,8 +16,8 @@ class SkillPublisher:
def publish(self, skill_name: str, draft_id: str, publisher: str, notes: str = "") -> SkillVersion:
draft = self._require_draft(skill_name, draft_id)
if draft.status != SkillReviewState.APPROVED.value:
raise ValueError("Draft must be approved before publish")
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
raise ValueError("Draft must be submitted for review before publish")
if draft.proposal_kind == "retire_skill":
raise ValueError("Retire proposals must be applied through apply_retire_proposal")
@ -81,8 +81,8 @@ class SkillPublisher:
def apply_retire_proposal(self, skill_name: str, draft_id: str, actor: str, notes: str = "") -> SkillSpec:
draft = self._require_draft(skill_name, draft_id)
if draft.status != SkillReviewState.APPROVED.value:
raise ValueError("Retire proposal must be approved before apply")
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
raise ValueError("Retire proposal must be submitted for review before apply")
if draft.proposal_kind != "retire_skill":
raise ValueError("Only retire_skill proposals can be applied as retire proposals")

View File

@ -25,7 +25,11 @@ class MainAgentRouter:
timeout_seconds: float = 8.0,
) -> MainAgentDecision:
if provider is None:
return self._fallback(active_task=active_task, reason="router_provider_unavailable")
return self._apply_active_task_boundary(
self._fallback(active_task=active_task, reason="router_provider_unavailable"),
message=message,
active_task=active_task,
)
chat_kwargs: dict[str, Any] = {
"messages": [
{
@ -58,10 +62,18 @@ class MainAgentRouter:
for attempt_timeout in (timeout_seconds, 12.0):
try:
response = await asyncio.wait_for(provider.chat(**chat_kwargs), timeout=attempt_timeout)
return self.from_json(response.content or "", active_task=active_task)
return self._apply_active_task_boundary(
self.from_json(response.content or "", active_task=active_task),
message=message,
active_task=active_task,
)
except Exception as exc:
last_error = exc
return self._fallback(active_task=active_task, reason=f"router_failed: {last_error}")
return self._apply_active_task_boundary(
self._fallback(active_task=active_task, reason=f"router_failed: {last_error}"),
message=message,
active_task=active_task,
)
def from_json(self, text: str, *, active_task: TaskRecord | None = None) -> MainAgentDecision:
payload = self._parse_json_object(text)
@ -121,6 +133,31 @@ class MainAgentRouter:
return MainAgentDecision(mode="task", reason=reason, action="continue_task")
return MainAgentDecision(mode="simple", reason=reason, action="simple_chat")
def _apply_active_task_boundary(
    self,
    decision: MainAgentDecision,
    *,
    message: str,
    active_task: TaskRecord | None,
) -> MainAgentDecision:
    """Override a ``continue_task`` decision when the message reads as a fresh request.

    The incoming decision is returned untouched unless all of the following hold:
    there is an active task, the decision's action is ``continue_task``, the message
    matches the fresh-task heuristics, and it does not match the explicit follow-up
    heuristics. In that case a new ``create_task`` decision is returned, reusing the
    decision's short title or, failing that, the active task's ``short_title`` metadata.
    """
    wants_to_continue = active_task is not None and decision.action == "continue_task"
    # Short-circuit order mirrors the guard sequence: the follow-up heuristic is
    # only consulted once the message already looks like a fresh request.
    split_into_new_task = (
        wants_to_continue
        and _looks_like_fresh_task_request(message)
        and not _looks_like_explicit_task_followup(message)
    )
    if not split_into_new_task:
        return decision

    boundary_reason = (
        "fresh standalone task request in the same session; "
        "do not attach it to the active task without explicit follow-up wording"
    )
    return MainAgentDecision(
        mode="task",
        reason=boundary_reason,
        starts_new_task=True,
        short_title=decision.short_title or active_task.metadata.get("short_title"),
        action="create_task",
    )
@staticmethod
def _prompt(
*,
@ -159,15 +196,19 @@ class MainAgentRouter:
"- close_task: user explicitly says the active Task is done/satisfactory/finished.\n"
"- abandon_task: user explicitly says to stop, cancel, abandon, or no longer do the active Task.\n\n"
"Critical policy:\n"
"- If there is an active Task, choose continue_task or revise_task unless the user's topic is completely unrelated "
"to that Task or the user explicitly closes/abandons it.\n"
"- A Session is the durable conversation/device/group context. A Task is one unit of work inside that Session. "
"Do not use an active Task as a reason to merge every later message into the same work item.\n"
"- If there is an active Task, choose continue_task only when the current message explicitly depends on, extends, "
"or asks a direct follow-up about that active Task's latest result.\n"
"- With an active Task, choose simple_chat for unrelated lightweight conversation and new_task for unrelated work "
"that needs Task capabilities. Either decision starts a new topic.\n"
"- An unrelated lightweight conversation must not be classified as revise_task merely because the active Task is awaiting acceptance.\n"
"- Choose revise_task when the active Task is awaiting feedback or needs revision and the user asks for changes "
"such as '改一下', '加上', '删除', '换成', '再详细点', '格式改成', '不要', or equivalent wording.\n"
"- Choose continue_task for neutral follow-up questions or additional next steps that do not imply dissatisfaction with the previous result.\n"
"- Use new_task only when the user clearly asks to start a different task.\n"
"- Choose continue_task for neutral follow-up questions or additional next steps that refer to the previous result, "
"for example '顺便查一下深圳', '这个也加上', or '继续'.\n"
"- A standalone tool-dependent request such as a fresh weather/search/file/run/test request is new_task even when it is "
"similar to the active Task. Repeating '珠海天气怎么样' later is a new Task unless the user says to revise or continue the old result.\n"
"- If there is no active Task, choose new_task only for work that requires execution, iteration, tools, files, "
"implementation, validation, or multi-step completion. Otherwise choose simple_chat.\n"
"- Requests that need current, real-time, external, user-private, local-file, web, weather, price, news, "
@ -203,3 +244,99 @@ def _clean_short_title(value: Any) -> str | None:
return None
title = " ".join(str(value).strip().split())
return title[:40] or None
def _looks_like_explicit_task_followup(message: str) -> bool:
    """Return True when *message* contains wording that refers back to earlier work.

    The message is normalized with ``_compact_text`` and then checked for any of
    the follow-up markers below as a plain substring. An empty or
    whitespace-only message is never a follow-up.
    """
    text = _compact_text(message)
    if not text:
        return False
    # NOTE(review): this tuple previously contained several empty-string entries.
    # ``"" in text`` is always True, so every non-empty message was classified as
    # a follow-up. They have been removed; if single-character markers were
    # intended in those slots, restore them with their real characters.
    markers = (
        "继续",
        "接着",
        "上面",
        "刚才",
        "前面",
        "这个",
        "那个",
        "结果",
        "顺便",
        "补充",
        "加上",
        "加入",
        "删除",
        "去掉",
        "换成",
        "重做",
        "详细",
        "展开",
        "格式",
        "continue",
        "same task",
        "previous",
        "above",
        "that result",
        "revise",
        "update it",
        "add",
        "remove",
        "change",
        "also",
    )
    # Skip falsy markers so an accidental empty entry can never match everything.
    return any(marker and marker in text for marker in markers)
def _looks_like_fresh_task_request(message: str) -> bool:
    """Return True when *message* contains a keyword typical of a standalone tool request.

    Matching is a plain substring test against the text produced by
    ``_compact_text``. An empty normalized message matches nothing.
    """
    normalized = _compact_text(message)
    chinese_markers = (
        "天气",
        "气温",
        "下雨",
        "降雨",
        "空气质量",
        "预报",
        "查一下",
        "帮我查",
        "搜索",
        "搜一下",
        "看看最新",
        "最新",
        "今天",
        "明天",
        "上传",
        "下载",
        "文件",
        "运行",
        "执行",
        "测试",
        "构建",
        "部署",
        "修复",
    )
    english_markers = (
        "weather",
        "forecast",
        "temperature",
        "search",
        "look up",
        "latest",
        "today",
        "tomorrow",
        "upload",
        "download",
        "file",
        "run",
        "execute",
        "test",
        "build",
        "deploy",
        "fix",
    )
    # Every marker is non-empty, so an empty normalized message falls through
    # the loop and yields False without needing a separate guard.
    for marker in chinese_markers + english_markers:
        if marker in normalized:
            return True
    return False
def _compact_text(message: str) -> str:
return " ".join(str(message or "").strip().lower().split())

View File

@ -4,6 +4,7 @@ import json
from pathlib import Path
from beaver.engine import EngineLoader
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.catalog.utils import parse_frontmatter
@ -69,6 +70,16 @@ def test_skill_authoring_admin_is_seeded_but_not_initial() -> None:
assert version["tool_hints"] == expected_tools
def test_seeded_skill_bodies_use_canonical_format() -> None:
    """Each skill listed in the published and disabled indexes has a canonical v0001 body."""
    skills_root = REPO_ROOT / "skills"
    for index_name in ("published", "disabled"):
        index_path = skills_root / "_index" / f"{index_name}.json"
        listed_skills = json.loads(index_path.read_text(encoding="utf-8"))["items"]
        for skill_name in listed_skills:
            skill_markdown = skills_root / skill_name / "versions" / "v0001" / "SKILL.md"
            _frontmatter, body = parse_frontmatter(skill_markdown.read_text(encoding="utf-8"))
            # The skill name is the assertion message so a failure names the offender.
            assert is_canonical_skill_body(body), skill_name
def test_default_runtime_registers_skill_view_tool(tmp_path: Path) -> None:
loaded = EngineLoader(workspace=tmp_path).load()
try:

View File

@ -87,6 +87,14 @@ def _task() -> TaskRecord:
)
def _weather_task() -> TaskRecord:
    """Return the base ``_task()`` fixture relabelled as a Zhuhai weather query."""
    weather_query = "珠海天气怎样"
    record = _task()
    record.description = weather_query
    record.goal = weather_query
    record.metadata["short_title"] = "查询珠海天气"
    return record
def test_router_continues_active_task_from_llm_decision() -> None:
provider = RouterProvider('{"action":"continue_task","reason":"related","short_title":"任务连续性"}')
decision = asyncio.run(
@ -103,6 +111,35 @@ def test_router_continues_active_task_from_llm_decision() -> None:
assert provider.calls[0]["max_tokens"] == 256
def test_router_keeps_same_session_but_starts_new_task_for_standalone_weather_repeat() -> None:
    """A bare repeat of the weather question opens a new task even if the LLM says continue."""
    llm_verdict = '{"action":"continue_task","reason":"neutral follow-up","short_title":"查询珠海天气"}'
    classification = MainAgentRouter().classify(
        "珠海天气怎么样",
        active_task=_weather_task(),
        provider=RouterProvider(llm_verdict),
    )
    decision = asyncio.run(classification)

    assert decision.is_task
    assert decision.action == "create_task"
    assert decision.starts_new_task is True
    assert "fresh standalone task request" in decision.reason
def test_router_allows_explicit_followup_to_continue_active_weather_task() -> None:
    """Explicit follow-up wording keeps the LLM's continue_task decision in place."""
    llm_verdict = '{"action":"continue_task","reason":"related follow-up","short_title":"查询珠海天气"}'
    classification = MainAgentRouter().classify(
        "顺便查一下深圳",
        active_task=_weather_task(),
        provider=RouterProvider(llm_verdict),
    )
    decision = asyncio.run(classification)

    assert decision.is_task
    assert decision.action == "continue_task"
    assert decision.starts_new_task is False
def test_router_marks_revision_from_llm_decision() -> None:
decision = asyncio.run(
MainAgentRouter().classify(
@ -163,6 +200,8 @@ def test_router_prompt_treats_unrelated_lightweight_conversation_as_new_topic()
prompt = provider.calls[0]["messages"][1]["content"]
assert "unrelated lightweight conversation" in prompt
assert "must not be classified as revise_task merely because the active Task is awaiting acceptance" in prompt
assert "A Session is the durable conversation/device/group context" in prompt
assert "Repeating '珠海天气怎么样' later is a new Task" in prompt
def test_router_closes_active_task_from_llm_decision() -> None:

View File

@ -5,13 +5,40 @@ from types import SimpleNamespace
import pytest
from beaver.interfaces.web.app import _create_skill_upload_draft
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.interfaces.web.app import _create_skill_upload_draft, _rewrite_uploaded_skill_draft_with_llm
from beaver.services.skillhub_service import SkillHubService
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.catalog.utils import extract_required_tool_names
from beaver.skills.drafts import DraftService
from beaver.skills.specs import SkillSpecStore
from beaver.tools.mcp.wrapper import MCPToolWrapper
class RewriteProvider(LLMProvider):
    """Test double that records the chat messages and answers with a fixed rewrite payload."""

    def __init__(self) -> None:
        super().__init__()
        # Replaced by the messages of the most recent chat() call.
        self.messages = []

    async def chat(self, messages, tools=None, model=None, max_tokens=None, temperature=0.7, thinking_enabled=None):
        """Remember *messages* and return the canned JSON rewrite, echoing *model*."""
        self.messages = messages
        canned_rewrite = """{
"frontmatter": {
"name": "skill",
"description": "Use when uploaded skill guidance needs QA formatting.",
"tools": ["read_file"]
},
"content": "# Skill\\n\\n## Overview\\n\\nLLM rewritten overview.\\n\\n## When to Use\\n\\n- Use when testing upload rewrite.\\n\\n## Required Tools\\n\\n- `read_file`\\n\\n## Workflow\\n\\n- Follow the rewritten workflow.\\n\\n## Validation\\n\\n- Verify the result.\\n\\n## Boundaries\\n\\n- Stay in scope.\\n\\n## Anti-Patterns\\n\\n- Do not skip rewrite validation.\\n",
"change_reason": "normalized upload"
}"""
        return LLMResponse(content=canned_rewrite, model=model)

    def get_default_model(self):
        """Return the fixed model name used by this double."""
        return "rewrite-model"
class FakeSkillHubService(SkillHubService):
async def _get_json(self, path, *, params=None):
if path == "/skills":
@ -99,6 +126,106 @@ def test_upload_skill_zip_keeps_supporting_files_on_draft(tmp_path):
assert upload_dir.endswith(draft["draft_id"])
def test_upload_skill_zip_canonicalizes_uploaded_skill_body(tmp_path):
    """An uploaded SKILL.md with a free-form body is stored as a canonical draft body."""
    raw_skill_markdown = (
        "---\nname: skill\ndescription: raw upload\ntools:\n  - read_file\n---\nBody without our format.\n"
    )
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        archive.writestr("skill/SKILL.md", raw_skill_markdown)

    draft = _create_skill_upload_draft(loaded, "skill.zip", zip_bytes.getvalue())

    frontmatter = draft["proposed_frontmatter"]
    assert frontmatter["name"] == "skill"
    assert frontmatter["tools"] == ["read_file"]
    assert is_canonical_skill_body(draft["proposed_content"])
def test_upload_skill_zip_infers_weather_web_tools_from_content(tmp_path):
    """A weather skill uploaded without a tools list ends up with the web fetch/search tools."""
    expected_tools = ["web_fetch", "web_search"]
    raw_skill_markdown = (
        "---\nname: weather-search\ndescription: weather lookup\n---\n"
        "Look up current weather and forecast for a city online.\n"
    )
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        archive.writestr("weather_search/skills.md", raw_skill_markdown)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", zip_bytes.getvalue())

    assert draft["proposed_frontmatter"]["tools"] == expected_tools
    assert extract_required_tool_names(draft["proposed_content"]) == expected_tools
    assert is_canonical_skill_body(draft["proposed_content"])
def test_upload_skill_llm_rewrite_updates_draft(tmp_path):
    """The LLM rewrite step replaces the stored draft body and prompts with the format rules."""
    spec_store = SkillSpecStore(tmp_path)
    drafts = DraftService(spec_store)
    draft = drafts.create_new_skill_draft(
        skill_name="skill",
        proposed_content="# Skill\n\n## Overview\n\nFallback.",
        proposed_frontmatter={"name": "skill", "description": "fallback", "tools": ["read_file"]},
        created_by="test",
        reason="upload",
    )
    provider = RewriteProvider()

    def _make_bundle(_loaded, _kwargs):
        # Minimal stand-in for a provider bundle: only the two attributes set here.
        return SimpleNamespace(
            main_provider=provider,
            main_runtime=SimpleNamespace(model="rewrite-model"),
        )

    agent_service = SimpleNamespace(_make_provider_bundle_for_task=_make_bundle)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=drafts)

    asyncio.run(_rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename="skill.zip"))

    rewritten = drafts.get_draft("skill", draft.draft_id)
    assert rewritten is not None
    assert "LLM rewritten overview" in rewritten.proposed_content
    assert is_canonical_skill_body(rewritten.proposed_content)
    user_prompt = provider.messages[1]["content"]
    assert "Canonical Beaver SKILL.md format" in user_prompt
    assert "Available runtime tool names" in user_prompt
def test_upload_skill_zip_accepts_nested_single_skill_directory(tmp_path):
    """A skill nested inside a plugin package is found; files outside the skill dir are dropped."""
    archive_entries = (
        ("plugin/skills/nested-skill/SKILL.md", "---\nname: nested-skill\ndescription: nested\n---\nBody\n"),
        ("plugin/skills/nested-skill/references/a.txt", "context"),
        ("plugin/README.md", "ignore package file"),
    )
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        for entry_name, entry_text in archive_entries:
            archive.writestr(entry_name, entry_text)

    draft = _create_skill_upload_draft(loaded, "plugin.zip", zip_bytes.getvalue())

    assert draft["skill_name"] == "nested-skill"
    upload_evidence = draft["evidence_refs"][0]
    # Looked up only to require that the key is present; the value is not inspected.
    upload_dir = upload_evidence["supporting_upload_dir"]
    copied_reference = (
        tmp_path / "skills" / "nested-skill" / "draft_uploads" / draft["draft_id"] / "references" / "a.txt"
    )
    assert copied_reference.read_text() == "context"
    assert "README.md" not in draft["evidence_refs"][0]["files"]
def test_upload_skill_zip_accepts_common_skill_markdown_name_aliases(tmp_path):
    """A manifest named ``skills.md`` instead of ``SKILL.md`` is still accepted."""
    raw_skill_markdown = "---\nname: weather-search\ndescription: weather lookup\n---\nBody\n"
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    zip_bytes = io.BytesIO()
    with zipfile.ZipFile(zip_bytes, "w") as archive:
        archive.writestr("weather_search/skills.md", raw_skill_markdown)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", zip_bytes.getvalue())

    assert draft["skill_name"] == "weather-search"
    assert draft["proposed_frontmatter"]["name"] == "weather-search"
    assert is_canonical_skill_body(draft["proposed_content"])
def test_mcp_wrapper_metadata_preserves_server_id_with_underscores():
tool_def = SimpleNamespace(name="auth_status", description="Auth", inputSchema={"type": "object", "properties": {}})

View File

@ -184,7 +184,7 @@ def test_skill_lifecycle_publish_revision_and_rollback(tmp_path: Path) -> None:
assert published.version == "v0002"
assert store.get_current_version("release-checklist") == "v0002"
with pytest.raises(ValueError, match="approved"):
with pytest.raises(ValueError, match="submitted for review"):
publisher.publish("release-checklist", revision.draft_id, publisher="reviewer", notes="duplicate")
rolled_back = publisher.rollback("release-checklist", "v0001", actor="reviewer", reason="regression")
@ -529,6 +529,66 @@ def test_skill_learning_service_generates_new_skill_for_task_without_published_s
assert candidates[0].source_run_ids == ["task-run-1"]
def test_skill_learning_service_uses_original_task_text_for_new_skill_theme(tmp_path: Path) -> None:
    """The learned theme comes from the first attempt's task text, not the revision comment."""
    original_request = "Compare direct production restart with staging rollout"
    memory_root = tmp_path / "memory"
    spec_store = SkillSpecStore(tmp_path)
    run_store = RunMemoryStore(memory_root / "runs")
    learning_store = SkillLearningStore(memory_root / "skills")
    service = SkillLearningService(
        run_store=run_store,
        learning_store=learning_store,
        draft_service=DraftService(spec_store),
        evidence_selector=EvidenceSelector(run_store),
    )
    timestamp = datetime.now(timezone.utc).isoformat()

    # Attempt 1 fails and is sent back for revision.
    rejected_attempt = RunRecord(
        run_id="task-run-1",
        session_id="session-task",
        task_id="task-1",
        attempt_index=1,
        task_text=original_request,
        started_at=timestamp,
        ended_at=timestamp,
        success=False,
        finish_reason="stop",
        feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
        activated_skills=[],
        validation_result=None,
    )
    # Attempt 2 carries the revision comment as its task text and is accepted.
    accepted_attempt = RunRecord(
        run_id="task-run-2",
        session_id="session-task",
        task_id="task-1",
        attempt_index=2,
        task_text="I do not see the docs",
        started_at=timestamp,
        ended_at=timestamp,
        success=True,
        finish_reason="stop",
        feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
        activated_skills=[],
        validation_result={"accepted": True, "score": 0.9},
    )
    for record in (rejected_attempt, accepted_attempt):
        run_store.append_run_record(record)

    candidates = service.build_learning_candidates_for_task("task-1", trigger_run_id="task-run-2")

    assert [candidate.candidate_id for candidate in candidates] == ["new:task:task-1"]
    evidence = candidates[0].evidence
    assert evidence["theme"] == "Compare direct production restart with staging rollout"
    assert evidence["task_text"] == "Compare direct production restart with staging rollout"
def test_task_theme_uses_first_sentence_for_chinese_text() -> None:
    """``_task_theme`` keeps only the text before the first Chinese full stop."""
    task_text = "帮我比较两种发布流程的风险A 是直接重启线上容器B 是先部署 staging 再切 production。请给出推荐方案、原因、验证步骤和回滚策略。"
    expected_theme = "帮我比较两种发布流程的风险A 是直接重启线上容器B 是先部署 staging 再切 production"

    assert SkillLearningService._task_theme(task_text) == expected_theme
def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None:
skill = SkillContext(
name="docker-debug",

View File

@ -0,0 +1,54 @@
from __future__ import annotations
from beaver.skills.authoring.format import (
CANONICAL_SKILL_SECTION_HEADINGS,
canonical_skill_format_instructions,
canonicalize_skill_body,
is_canonical_skill_body,
parse_skill_rewrite_json,
)
def test_canonical_skill_body_contains_required_sections() -> None:
    """A body built by ``canonicalize_skill_body`` is canonical and carries every heading."""
    section_inputs = {
        "title": "Filesystem Operation",
        "overview": "Read and update project files safely.",
        "tools": ["read_file", "write_file"],
        "workflow": ["Inspect the file before editing.", "Use the smallest safe edit."],
        "validation": ["Re-read changed files before reporting completion."],
        "boundaries": ["Do not edit files outside the workspace."],
        "anti_patterns": ["Do not overwrite files without reading them first."],
    }

    body = canonicalize_skill_body(**section_inputs)

    assert is_canonical_skill_body(body)
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        assert heading in body
def test_canonical_skill_format_instructions_are_prompt_ready() -> None:
    """The prompt instructions name the format, the frontmatter keys and every section heading."""
    instructions = canonical_skill_format_instructions()

    required_fragments = (
        "Canonical Beaver SKILL.md format",
        "frontmatter",
        "name",
        "description",
        "tools",
    )
    for fragment in required_fragments:
        assert fragment in instructions
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        assert heading in instructions
def test_parse_skill_rewrite_json_backfills_frontmatter_tools_from_required_tools_section() -> None:
    """An empty frontmatter ``tools`` list is filled from the body's Required Tools section."""
    rewrite_json = """{
"frontmatter": {
"name": "weather-search",
"description": "weather lookup",
"tools": []
},
"content": "# Weather Search\\n\\n## Overview\\n\\nLook up weather.\\n\\n## When to Use\\n\\n- Weather requests.\\n\\n## Required Tools\\n\\n- `web_fetch`\\n- `web_search`\\n\\n## Workflow\\n\\n- Fetch current weather.\\n\\n## Validation\\n\\n- Check source freshness.\\n\\n## Boundaries\\n\\n- Do not guess.\\n\\n## Anti-Patterns\\n\\n- Do not fabricate data.\\n"
}"""

    payload = parse_skill_rewrite_json(rewrite_json, skill_name="weather-search")

    assert payload is not None
    assert payload["frontmatter"]["tools"] == ["web_fetch", "web_search"]

View File

@ -19,8 +19,22 @@ from beaver.skills.specs import SkillSpecStore
class StubProvider(LLMProvider):
async def chat(self, messages: list[dict], tools: list[dict] | None = None, model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7) -> LLMResponse:
return LLMResponse(content="ok")
def __init__(self, content: str = "ok") -> None:
super().__init__()
self.content = content
self.calls: list[dict] = []
async def chat(
self,
messages: list[dict],
tools: list[dict] | None = None,
model: str | None = None,
max_tokens: int = 4096,
temperature: float = 0.7,
thinking_enabled: bool | None = None,
) -> LLMResponse:
self.calls.append({"messages": messages, "model": model, "max_tokens": max_tokens, "temperature": temperature})
return LLMResponse(content=self.content)
def get_default_model(self) -> str:
return "stub"
@ -92,7 +106,6 @@ def test_eval_pass_allows_publish_after_safety_and_review(tmp_path: Path) -> Non
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
published = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert report.passed is True
@ -114,7 +127,6 @@ def test_eval_regression_blocks_publish(tmp_path: Path) -> None:
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is False
assert pipeline.get_candidate("candidate-1").status == "eval_failed"
@ -160,7 +172,14 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
class FakeReplayRunner:
def __init__(self, *, baseline_answer: str = "done", candidate_answer: str = "done") -> None:
self.baseline_answer = baseline_answer
self.candidate_answer = candidate_answer
self.requests = []
async def run_arm(self, request):
self.requests.append(request)
final_answer = self.candidate_answer if request.arm == "candidate" else self.baseline_answer
return {
"case_id": request.case_id,
"arm": request.arm,
@ -168,7 +187,7 @@ class FakeReplayRunner:
"run_id": f"{request.arm}-run",
"task_text": request.task_text,
"finish_reason": "stop",
"final_answer": "done",
"final_answer": final_answer,
"tool_calls": [
{
"tool_name": "write_file",
@ -213,3 +232,102 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
assert 0.0 <= report.execution_coverage <= 1.0
assert 0.0 <= report.surrogate_coverage <= 1.0
assert report.confidence in {"low", "medium", "high"}
assert "ability_score" in report.case_reports[0]
assert "tool_execution_score" in report.case_reports[0]
assert report.ability_score_summary["score_role"] == "primary"
assert report.tool_execution_summary["score_role"] == "diagnostic_only"
def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
    """With equal tool-execution scores, the validator verdict separates baseline from candidate."""
    pipeline = _pipeline(tmp_path)
    validator_case = {
        "run_id": "validator-case",
        "task_id": "validator-case",
        "session_id": "eval",
        "task_text": "Write the release verdict.",
        "validator": {
            "type": "final_answer_contains",
            "required_terms": ["ship"],
            "forbidden_terms": ["do not ship"],
        },
        "accepted_score": 0.5,
    }
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [validator_case]},
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    # The baseline answer contains the forbidden phrase; the candidate answer does not.
    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=FakeReplayRunner(
            baseline_answer="Do not ship. Tests are failing.",
            candidate_answer="Ship after smoke tests pass.",
        ),
    )
    report = asyncio.run(evaluation)

    case = report.case_reports[0]
    tool_scores = case["tool_execution_score"]
    assert tool_scores["baseline_score"] == 0.85
    assert tool_scores["candidate_score"] == 0.85
    assert case["baseline_score"] < case["candidate_score"]
    assert report.tool_mode_summary["score_role"] == "diagnostic_only"
    assert report.ability_score_summary["score_role"] == "primary"
    assert report.real_score_avg is not None
    assert report.synthetic_score_avg is not None
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
    """A synthetic eval case with no validator is neither replayed nor reported, only counted."""
    excluded_run_id = "synthetic:no-validator"
    pipeline = _pipeline(tmp_path)
    oracle_free_case = {
        "run_id": "synthetic:no-validator",
        "task_id": "synthetic-no-validator",
        "session_id": "synthetic-eval",
        "task_text": "Synthetic task without an oracle.",
        "synthetic": True,
        "accepted_score": 0.75,
    }
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [oracle_free_case]},
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    replay_runner = FakeReplayRunner()

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=replay_runner,
    )
    report = asyncio.run(evaluation)

    reported_run_ids = {case["run_id"] for case in report.case_reports}
    assert excluded_run_id not in reported_run_ids
    assert all(excluded_run_id not in request.case_id for request in replay_runner.requests)
    assert report.case_selection_summary["excluded_synthetic_without_validator"] == 1

View File

@ -31,6 +31,12 @@ def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
assert payload["confidence"] == "low"
assert payload["case_reports"] == []
assert payload["tool_mode_summary"] == {}
assert payload["ability_score_summary"] == {}
assert payload["tool_execution_summary"] == {}
assert payload["case_selection_summary"] == {}
assert payload["real_score_avg"] is None
assert payload["synthetic_score_avg"] is None
assert payload["overall_score_avg"] is None
assert payload["preservation_report"] is None
assert payload["cases"] == [{"run_id": "run-1"}]
@ -59,3 +65,37 @@ def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
assert report.mode == "heuristic"
assert report.confidence == "low"
assert report.case_reports == []
def test_eval_report_persists_ability_and_case_split_fields() -> None:
    """The split score averages and summary dicts survive a to_dict/from_dict round trip."""
    report_fields = {
        "report_id": "eval-replay",
        "skill_name": "debug",
        "draft_id": "draft-1",
        "candidate_id": "candidate-1",
        "passed": True,
        "baseline_score_avg": 0.5,
        "candidate_score_avg": 0.8,
        "score_delta": 0.3,
        "regression_count": 0,
        "improved_count": 1,
        "unchanged_count": 0,
        "mode": "replay",
        "eval_version": "replay-v2",
        "real_score_avg": 0.9,
        "synthetic_score_avg": 0.6,
        "overall_score_avg": 0.8,
        "ability_score_summary": {"score_role": "primary", "real_case_count": 1},
        "tool_execution_summary": {"score_role": "diagnostic_only", "executed": 1.0},
        "case_selection_summary": {"excluded_synthetic_without_validator": 2},
    }
    report = SkillDraftEvalReport(**report_fields)

    payload = report.to_dict()
    restored = SkillDraftEvalReport.from_dict(payload)

    # Scalar averages are checked on the serialized payload.
    assert payload["real_score_avg"] == 0.9
    assert payload["synthetic_score_avg"] == 0.6
    assert payload["overall_score_avg"] == 0.8
    # Summary dicts are checked on the restored object, against fresh literals.
    assert restored.ability_score_summary == {"score_role": "primary", "real_case_count": 1}
    assert restored.tool_execution_summary == {"score_role": "diagnostic_only", "executed": 1.0}
    assert restored.case_selection_summary == {"excluded_synthetic_without_validator": 2}

View File

@ -55,14 +55,12 @@ def test_pipeline_lists_candidates_and_moves_draft_through_review(tmp_path: Path
reason="test",
)
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
approved = pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
version = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert pipeline.list_candidates()[0].candidate_id == "candidate-1"
assert review.status == SkillReviewState.IN_REVIEW.value
assert approved.status == SkillReviewState.APPROVED.value
assert safety.passed is True
assert version.skill_name == "new-skill"
assert pipeline.get_draft(draft.skill_name, draft.draft_id).status == SkillReviewState.PUBLISHED.value
@ -93,7 +91,6 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
@ -165,7 +162,6 @@ def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
)
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
with pytest.raises(ValueError, match="low confidence"):
@ -201,7 +197,6 @@ def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
)
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
with pytest.raises(ValueError, match="preservation"):

View File

@ -16,6 +16,25 @@ class FakeAgentLoop:
return SimpleNamespace(session_id="session-replay", run_id="run-replay", output_text="done", finish_reason="stop")
class FakeRunningAgentLoop(FakeAgentLoop):
    """Agent-loop double that rejects ``process_direct`` and serves ``submit_direct``.

    Every ``process_direct`` call is counted and then fails with a
    ``RuntimeError``; every ``submit_direct`` call is recorded, drives one tool
    call through the supplied executor override, and returns a canned result.
    """

    def __init__(self) -> None:
        # Number of rejected process_direct() attempts.
        self.process_direct_calls = 0
        # (task, kwargs) pairs captured from submit_direct().
        self.submit_direct_calls: list[tuple[str, dict]] = []

    async def process_direct(self, task: str, **kwargs):
        """Count the attempt, then raise ``RuntimeError`` unconditionally."""
        self.process_direct_calls += 1
        message = (
            "AgentLoop.process_direct() is disabled while run() is active; "
            "submit tasks via submit_direct() instead."
        )
        raise RuntimeError(message)

    async def submit_direct(self, task: str, **kwargs):
        """Record the submission, execute one tool call, and return a fixed result."""
        self.submit_direct_calls.append((task, kwargs))
        tool_executor = kwargs["tool_executor_override"]
        tool_arguments = {"to": "ada@example.com"}
        await tool_executor.execute("mcp_outlook_send_email", tool_arguments)
        return SimpleNamespace(
            session_id="session-queued",
            run_id="run-queued",
            output_text="queued done",
            finish_reason="stop",
        )
def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
runner = ReplayRunner(agent_loop=FakeAgentLoop())
request = ReplayArmRequest(
@ -34,3 +53,33 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
assert report["arm"] == "candidate"
assert report["finish_reason"] == "stop"
assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
    """Check that a replay arm ends up in ``submit_direct`` when ``process_direct`` raises.

    The fake loop rejects ``process_direct``; the test expects exactly one such
    attempt, exactly one ``submit_direct`` call carrying the replay settings,
    and a report built from the queued result.
    """
    running_loop = FakeRunningAgentLoop()
    arm_request = ReplayArmRequest(
        case_id="case-queued",
        arm="baseline",
        task_text="Send a status email to Ada.",
        pinned_skill_names=["filesystem-operation"],
        pinned_skill_contexts=[{"name": "filesystem-operation"}],
        provider_bundle=object(),
        model_settings={"max_tool_iterations": 3, "temperature": 0.1},
    )

    report = asyncio.run(ReplayRunner(agent_loop=running_loop).run_arm(arm_request))

    assert running_loop.process_direct_calls == 1
    assert len(running_loop.submit_direct_calls) == 1

    queued_task, queued_kwargs = running_loop.submit_direct_calls[0]
    assert queued_task == "Send a status email to Ada."
    assert queued_kwargs["source"] == "skill_replay_eval"
    # Boolean flags are compared by identity, not mere truthiness.
    assert queued_kwargs["include_skill_assembly"] is False
    assert queued_kwargs["include_tools"] is True
    expected_settings = {
        "pinned_skill_names": ["filesystem-operation"],
        "max_tool_iterations": 3,
        "temperature": 0.1,
    }
    for key, expected in expected_settings.items():
        assert queued_kwargs[key] == expected

    assert report["session_id"] == "session-queued"
    assert report["run_id"] == "run-queued"
    first_tool_call = report["tool_calls"][0]
    assert first_tool_call["tool_name"] == "mcp_outlook_send_email"

View File

@ -74,7 +74,6 @@ def test_safety_marks_dangerous_tools_high_and_requires_confirm(tmp_path: Path)
report = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is True
assert report.risk_level == "high"
@ -94,7 +93,6 @@ def test_publish_requires_safety_report(tmp_path: Path) -> None:
reason="test",
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
with pytest.raises(ValueError, match="safety report"):
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from beaver.memory.skills import SkillLearningCandidate
from beaver.skills.authoring.format import CANONICAL_SKILL_SECTION_HEADINGS
from beaver.skills.learning.evidence import EvidencePacket
from beaver.skills.learning.synthesizer import SkillDraftSynthesizer
@ -39,3 +40,6 @@ def test_revision_prompt_includes_base_skill_snapshot() -> None:
assert "Do not delete files." in prompt
assert "preserved_sections" in prompt
assert "dropped_sections" in prompt
assert "Canonical Beaver SKILL.md format" in prompt
for heading in CANONICAL_SKILL_SECTION_HEADINGS:
assert heading in prompt

View File

@ -1,12 +1,37 @@
from __future__ import annotations
from pathlib import Path
from types import SimpleNamespace
from fastapi.testclient import TestClient
from beaver.memory.runs import RunRecord
from beaver.interfaces.web.app import create_app
from beaver.memory.skills import SkillLearningCandidate
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.services.agent_service import AgentService
from beaver.skills.specs import SkillVersion
class StubEvaluator:
    """Evaluator double that counts invocations and returns a fixed passing report."""

    def __init__(self) -> None:
        # Incremented once per evaluate() call so tests can assert on it.
        self.calls = 0

    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
        """Return a canned ``SkillDraftEvalReport`` keyed to the given draft and candidate.

        ``provider_bundle`` and ``replay_runner`` are accepted but not used.
        """
        self.calls += 1
        report_fields = {
            "report_id": "eval-existing",
            "skill_name": draft.skill_name,
            "draft_id": draft.draft_id,
            "candidate_id": candidate.candidate_id,
            "passed": True,
            "baseline_score_avg": 0.5,
            "candidate_score_avg": 0.8,
            "score_delta": 0.3,
            "regression_count": 0,
            "improved_count": 1,
            "unchanged_count": 0,
            "status": "completed",
        }
        return SkillDraftEvalReport(**report_fields)
def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
@ -31,3 +56,191 @@ def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
assert candidates[0]["candidate_id"] == "candidate-1"
assert "risk_level" in candidates[0]
assert run_once["processed"] >= 0
def test_skill_learning_candidates_payload_prefers_original_task_text(tmp_path: Path) -> None:
    """Check that the candidates API reports the first attempt's task text as the theme.

    Two runs of the same task are stored: the original request and a follow-up
    whose text is only the user's feedback comment. The stored candidate theme
    is the feedback text, yet the API payload is expected to expose the
    original request as both ``theme`` and ``task_text``.
    """
    original_text = "Compare direct production restart with staging rollout"
    timestamp = "2026-06-11T00:00:00+00:00"

    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()

    first_attempt = RunRecord(
        run_id="run-original",
        session_id="session-task",
        task_id="task-1",
        attempt_index=1,
        task_text=original_text,
        started_at=timestamp,
        ended_at=timestamp,
        success=False,
        finish_reason="stop",
        feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
        activated_skills=[],
        validation_result=None,
    )
    second_attempt = RunRecord(
        run_id="run-final",
        session_id="session-task",
        task_id="task-1",
        attempt_index=2,
        task_text="I do not see the docs",
        started_at=timestamp,
        ended_at=timestamp,
        success=True,
        finish_reason="stop",
        feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
        activated_skills=[],
        validation_result={"accepted": True, "score": 0.9},
    )
    for record in (first_attempt, second_attempt):
        loaded.skill_learning_service.run_store.append_run_record(record)  # type: ignore[union-attr]

    candidate = SkillLearningCandidate(
        candidate_id="new:task:task-1",
        kind="new_skill",
        source_run_ids=["run-original", "run-final"],
        source_session_ids=["session-task"],
        related_skill_names=[],
        reason="test",
        evidence={"task_id": "task-1", "theme": "i do not see the docs"},
    )
    loaded.skill_learning_store.record_learning_candidate(candidate)  # type: ignore[union-attr]

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        candidates = client.get("/api/skills/candidates").json()

    payload = next(item for item in candidates if item["candidate_id"] == "new:task:task-1")
    evidence = payload["evidence"]
    assert evidence["theme"] == "Compare direct production restart with staging rollout"
    assert evidence["task_text"] == "Compare direct production restart with staging rollout"
def test_generate_draft_does_not_run_review_checks(tmp_path: Path, monkeypatch) -> None:
    """Check that requesting a candidate's draft triggers neither safety nor eval.

    A draft already linked to a ``draft_ready`` candidate is requested through
    the draft endpoint; the stub evaluator must stay untouched and the payload
    must carry no safety or eval report.
    """
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()

    draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    linked_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(linked_candidate)  # type: ignore[union-attr]

    evaluator = StubEvaluator()
    loaded.skill_learning_pipeline.evaluator = evaluator  # type: ignore[union-attr]

    def _fake_bundle(loaded, kwargs):
        # Minimal stand-in exposing only a main_provider attribute.
        return SimpleNamespace(main_provider=object())

    monkeypatch.setattr(service, "_make_provider_bundle_for_task", _fake_bundle)

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.post("/api/skills/candidates/candidate-existing/draft")

    assert response.status_code == 200
    payload = response.json()
    assert evaluator.calls == 0
    assert payload["draft_id"] == draft.draft_id
    assert payload["safety_report"] is None
    assert payload["eval_report"] is None
    stored_report = loaded.skill_learning_pipeline.get_eval_report(draft.skill_name, draft.draft_id)  # type: ignore[union-attr]
    assert stored_report is None
def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:
    """Check that submitting a draft runs the evaluator once and attaches both reports.

    After the submit endpoint is called the draft is expected to be
    ``in_review`` with a passing safety report and the stub's eval report.
    """
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()

    draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    linked_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(linked_candidate)  # type: ignore[union-attr]

    evaluator = StubEvaluator()
    loaded.skill_learning_pipeline.evaluator = evaluator  # type: ignore[union-attr]

    def _fake_bundle(loaded, kwargs):
        # Minimal stand-in exposing only a main_provider attribute.
        return SimpleNamespace(main_provider=object())

    monkeypatch.setattr(service, "_make_provider_bundle_for_task", _fake_bundle)

    submit_url = f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit"
    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.post(submit_url)

    assert response.status_code == 200
    payload = response.json()
    assert evaluator.calls == 1
    assert payload["status"] == "in_review"
    assert payload["safety_report"]["passed"] is True
    assert payload["eval_report"]["report_id"] == "eval-existing"
def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
    """Check that a revision draft's payload exposes the base and target versions.

    A published ``v0001`` is written and made current, a revision draft is
    created on top of it, and the drafts listing is expected to report
    ``v0002`` as the target together with a snapshot of the base skill.
    """
    skill_name = "filesystem-operation"
    base_body = "# Filesystem Operation\n\nUse files."

    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()

    published = SkillVersion(
        skill_name="filesystem-operation",
        version="v0001",
        content_hash="hash-v1",
        summary_hash="summary-v1",
        created_at="2026-06-01T00:00:00+00:00",
        created_by="test",
        change_reason="initial",
        parent_version=None,
        review_state="published",
        frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
        summary="filesystem",
        tool_hints=[],
    )
    loaded.skill_spec_store.write_skill_version(published, base_body)  # type: ignore[union-attr]
    loaded.skill_spec_store.set_current_version(skill_name, "v0001")  # type: ignore[union-attr]

    draft = loaded.skill_learning_pipeline.draft_service.create_revision_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        base_version="v0001",
        proposed_content="# Filesystem Operation\n\nUse files better.",
        proposed_frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
        created_by="test",
        reason="revise",
    )

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.get("/api/skills/drafts")

    assert response.status_code == 200
    payload = next(item for item in response.json() if item["draft_id"] == draft.draft_id)
    assert payload["proposal_kind"] == "revise_skill"
    assert payload["base_version"] == "v0001"
    assert payload["target_version"] == "v0002"

    base_skill = payload["base_skill"]
    assert base_skill["version"] == "v0001"
    assert base_skill["content"] == "# Filesystem Operation\n\nUse files."
    assert base_skill["frontmatter"]["name"] == "filesystem-operation"

View File

@ -10,6 +10,7 @@ from beaver.engine.providers.factory import ProviderBundle
from beaver.engine.session import SessionManager
from beaver.memory.runs import RunMemoryStore, RunRecord
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.drafts import DraftService
from beaver.skills.learning import (
EvidenceSelector,
@ -48,6 +49,33 @@ def _bundle(provider: LLMProvider) -> ProviderBundle:
return ProviderBundle(main_runtime=runtime, main_provider=provider) # type: ignore[arg-type]
class FakeReplayRunner:
    """Replay-runner double that records requests and returns a canned arm report."""

    def __init__(self) -> None:
        # Every request passed to run_arm(), in call order.
        self.requests = []

    async def run_arm(self, request):
        """Store ``request`` and return a fixed report echoing its case, arm and task text."""
        self.requests.append(request)
        echo_call = {
            "tool_name": "echo",
            "mode": "executed",
            "arguments": {"text": "ok"},
            "result": {"success": True, "content": "ok"},
        }
        report = {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "debug deployment startup done",
            "tool_calls": [echo_call],
            "artifacts": [],
            "side_effects": [],
        }
        return report
def _pipeline(tmp_path: Path) -> SkillLearningPipelineService:
spec_store = SkillSpecStore(tmp_path)
run_store = RunMemoryStore(tmp_path / "memory" / "runs")
@ -109,6 +137,28 @@ def test_worker_synthesizes_open_candidate_without_publish(tmp_path: Path) -> No
assert pipeline.list_drafts(candidate.draft_skill_name)[0].status == "draft"
def test_worker_evaluates_draft_with_replay_runner_when_available(tmp_path: Path) -> None:
    """Check that a worker given a replay-runner factory produces a replay-mode eval report.

    After one worker pass the candidate's draft is expected to have an eval
    report in ``replay`` mode with case reports, and the fake runner must have
    received at least one request.
    """
    pipeline = _pipeline(tmp_path)
    replay_runner = FakeReplayRunner()
    worker_config = SkillLearningWorkerConfig(max_drafts_per_run=5, max_retries=3, interval_seconds=1)
    worker = SkillLearningWorker(
        pipeline=pipeline,
        provider_bundle_factory=lambda: _bundle(JsonProvider()),
        replay_runner_factory=lambda: replay_runner,
        config=worker_config,
    )

    result = asyncio.run(worker.run_once())

    candidate = pipeline.get_candidate("candidate-1")
    # Missing identifiers fall back to empty strings, as get_draft() takes str arguments.
    draft_skill_name = candidate.draft_skill_name or ""
    draft_id = candidate.draft_id or ""
    draft = pipeline.get_draft(draft_skill_name, draft_id)
    report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)

    assert result.succeeded == 1
    assert report is not None
    assert report.mode == "replay"
    assert report.case_reports
    assert replay_runner.requests
def test_worker_retries_and_marks_failed_after_limit(tmp_path: Path) -> None:
pipeline = _pipeline(tmp_path)
worker = SkillLearningWorker(
@ -147,6 +197,7 @@ def test_synthesizer_fills_missing_tools_from_evidence(tmp_path: Path) -> None:
)
assert payload["frontmatter"]["tools"] == ["web_fetch", "memory"]
assert is_canonical_skill_body(payload["content"])
def test_evidence_selector_records_run_tool_names(tmp_path: Path) -> None:

View File

@ -218,6 +218,45 @@ def test_unrelated_new_task_auto_accepts_previous_task(tmp_path: Path) -> None:
assert current.run_ids == [second.run_id]
def test_standalone_realtime_repeat_creates_new_task_in_same_session(tmp_path: Path) -> None:
    """Check that a near-identical repeat question opens a new task in the same session.

    The second request's bundle is built with ``route_action="continue_task"``,
    yet the test expects a distinct task: the first one closed with only its
    own run, the second awaiting acceptance with only its own run.
    """
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner(),
    )
    service = AgentService(loader=loader)
    session_id = "feishu:group-weather"

    def _ask(prompt: str, bundle):
        # Run one direct request to completion inside the shared session.
        return asyncio.run(
            service.process_direct(
                prompt,
                session_id=session_id,
                provider_bundle=bundle,
            )
        )

    first = _ask("珠海天气怎样", _bundle("Weather result"))
    second = _ask("珠海天气怎么样", _bundle("Fresh weather result", route_action="continue_task"))

    task_service = service.create_loop().boot().task_service
    assert task_service is not None

    previous = task_service.get_task(first.task_id or "")
    current = task_service.get_task(second.task_id or "")
    assert previous is not None
    assert current is not None

    assert previous.session_id == session_id
    assert current.session_id == session_id
    assert current.task_id != previous.task_id

    assert previous.status == "closed"
    assert previous.run_ids == [first.run_id]
    assert current.status == "awaiting_acceptance"
    assert current.run_ids == [second.run_id]
def test_related_follow_up_continues_active_task_without_accepting_it(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(

View File

@ -102,6 +102,58 @@ tools:
assert [spec.name for spec in selected] == ["memory", "terminal", "search_files"]
# Checks that tool hints are taken from the skill body's "Required Tools"
# section when the SKILL.md frontmatter declares no `tools` key, and that the
# assembler then selects those tools.
def test_tool_assembler_uses_required_tools_section_when_frontmatter_omits_tools(tmp_path: Path) -> None:
# Write a SKILL.md whose frontmatter carries only name and description; the
# tool names appear solely as bullets under "## Required Tools".
skill_dir = tmp_path / "skills" / "docker-debug"
skill_dir.mkdir(parents=True)
(skill_dir / "SKILL.md").write_text(
"""---
name: docker-debug
description: Debug Docker issues.
---
# Docker Debug
## Overview
Debug Docker issues.
## Required Tools
- `terminal`
- `search_files`
## Workflow
Inspect logs and search related files.
""",
encoding="utf-8",
)
# Register four tools; only `memory` is flagged always_available.
registry = ToolRegistry()
registry.register(DummyTool("memory", toolset="memory", always_available=True))
registry.register(DummyTool("terminal", toolset="shell"))
registry.register(DummyTool("search_files", toolset="file"))
registry.register(DummyTool("echo", toolset="debug"))
assembler = ToolAssembler(retriever=StaticRetriever())
loader = SkillsLoader(tmp_path)
record = loader.get_skill_record("docker-debug")
assert record is not None
# The hints must match the bulleted tool names, in the order they are listed.
assert record.tool_hints == ["terminal", "search_files"]
# Assemble with the skill activated through its extracted hints.
selected = asyncio.run(
assembler.assemble(
task_description="排查 Docker 容器日志",
registry=registry,
skills_loader=loader,
activated_skills=[SkillContext(name="docker-debug", content="", tool_hints=record.tool_hints)],
top_k=1,
)
)
# Expected order: `memory`, then both hinted tools, then `echo`.
# NOTE(review): `echo` is presumably the single top_k=1 pick from StaticRetriever — confirm against its definition.
assert [spec.name for spec in selected] == ["memory", "terminal", "search_files", "echo"]
def test_embedding_fallback_can_return_all_or_top_k() -> None:
candidates = [{"name": f"tool_{index}", "description": "", "input_schema": "{}"} for index in range(3)]
retriever = EmbeddingRetriever(api_key_env="MISSING_EMBEDDING_KEY", api_base_env="MISSING_EMBEDDING_BASE")

View File

@ -0,0 +1,21 @@
from fastapi.testclient import TestClient
from beaver.interfaces.web.app import create_app
def test_local_frontend_origin_can_preflight_api_requests() -> None:
    """Check that a CORS preflight from the local frontend origin is accepted.

    An ``OPTIONS`` request for ``/api/auth/me`` from ``http://127.0.0.1:3080``
    asking to send ``authorization`` must return 200, echo the origin, and
    list ``authorization`` among the allowed headers.
    """
    frontend_origin = "http://127.0.0.1:3080"
    preflight_headers = {
        "Origin": "http://127.0.0.1:3080",
        "Access-Control-Request-Method": "GET",
        "Access-Control-Request-Headers": "authorization",
    }

    app = create_app(service=None, manage_service_lifecycle=False)
    client = TestClient(app)
    response = client.options("/api/auth/me", headers=preflight_headers)

    assert response.status_code == 200
    assert response.headers["access-control-allow-origin"] == frontend_origin
    allowed_headers = response.headers["access-control-allow-headers"].lower()
    assert "authorization" in allowed_headers

View File

@ -28,8 +28,10 @@ import {
deleteUserFile,
createUserFileDir,
getAccessToken,
isApiError,
} from '@/lib/api';
import type { UserFileContent, UserFileItem } from '@/lib/api';
import { canMutateUserFilesPath } from '@/lib/user-file-paths';
import { Button } from '@/components/ui/button';
import { ScrollArea } from '@/components/ui/scroll-area';
import { type AppLocale, pickAppText } from '@/lib/i18n/core';
@ -44,6 +46,10 @@ function sleep(ms: number): Promise<void> {
});
}
/** Returns true when `error` is an API error carrying HTTP status 401. */
function isAuthError(error: unknown): boolean {
  const unauthorizedStatus = 401;
  return isApiError(error, unauthorizedStatus);
}
export default function FilesPage() {
const { locale } = useAppI18n();
const [items, setItems] = useState<UserFileItem[]>([]);
@ -78,6 +84,9 @@ export default function FilesPage() {
return;
} catch (err) {
lastError = err;
if (isAuthError(err)) {
break;
}
}
}
const message = lastError instanceof Error ? lastError.message : pickAppText(locale, '加载文件失败', 'Failed to load files');
@ -156,6 +165,15 @@ export default function FilesPage() {
const handleUpload = async (e: React.ChangeEvent<HTMLInputElement>) => {
const files = e.target.files;
if (!files || files.length === 0) return;
if (!canMutateUserFilesPath(currentPath)) {
setLoadError(pickAppText(
locale,
'请先进入 uploads、outputs、shared 或 tasks 目录后再上传。',
'Open uploads, outputs, shared, or tasks before uploading.'
));
if (fileInputRef.current) fileInputRef.current.value = '';
return;
}
setUploading(true);
setUploadProgress(0);
@ -178,6 +196,14 @@ export default function FilesPage() {
const handleCreateDir = async () => {
const name = newDirName.trim();
if (!name) return;
if (!canMutateUserFilesPath(currentPath)) {
setLoadError(pickAppText(
locale,
'请先进入 uploads、outputs、shared 或 tasks 目录后再新建文件夹。',
'Open uploads, outputs, shared, or tasks before creating a folder.'
));
return;
}
try {
const dirPath = currentPath ? `${currentPath}/${name}` : name;
await createUserFileDir(dirPath);
@ -191,6 +217,7 @@ export default function FilesPage() {
// Build breadcrumbs
const breadcrumbs = currentPath ? currentPath.split('/') : [];
const canMutateCurrentPath = canMutateUserFilesPath(currentPath);
const formatSize = (bytes: number | null) => {
if (bytes === null || bytes === undefined) return '';
@ -224,7 +251,12 @@ export default function FilesPage() {
size="sm"
className="h-11"
onClick={() => setShowMkdir(true)}
disabled={loading}
disabled={loading || !canMutateCurrentPath}
title={
canMutateCurrentPath
? undefined
: pickAppText(locale, '先进入 uploads、outputs、shared 或 tasks', 'Open uploads, outputs, shared, or tasks first')
}
>
<FolderPlus className="w-4 h-4 mr-1" />
{pickAppText(locale, '新建文件夹', 'New folder')}
@ -234,7 +266,12 @@ export default function FilesPage() {
size="sm"
className="h-11"
onClick={() => fileInputRef.current?.click()}
disabled={uploading}
disabled={uploading || !canMutateCurrentPath}
title={
canMutateCurrentPath
? undefined
: pickAppText(locale, '先进入 uploads、outputs、shared 或 tasks', 'Open uploads, outputs, shared, or tasks first')
}
>
{uploading ? (
<>
@ -272,6 +309,15 @@ export default function FilesPage() {
</Button>
</div>
</div>
{!canMutateCurrentPath && !loading && (
<p className="mb-4 rounded-md border border-[#E6E1DE] bg-muted/40 px-3 py-2 text-sm text-muted-foreground">
{pickAppText(
locale,
'请选择 uploads、outputs、shared 或 tasks 后再上传或新建文件夹。',
'Select uploads, outputs, shared, or tasks before uploading or creating folders.'
)}
</p>
)}
{/* Breadcrumbs */}
<div className="flex items-center gap-1 mb-4 text-sm text-muted-foreground flex-wrap">

View File

@ -5,7 +5,6 @@ import { usePathname, useRouter, useSearchParams } from 'next/navigation';
import {
AlertCircle,
BarChart3,
Check,
CheckCircle2,
ChevronDown,
ClipboardList,
@ -31,7 +30,6 @@ import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import {
approveSkillDraft,
deleteSkill,
disablePublishedSkill,
downloadSkill,
@ -436,11 +434,6 @@ export default function SkillsPage() {
submitSkillDraft(draft.skill_name, draft.draft_id)
)
}
onApprove={() =>
runAction(`approve:${draft.draft_id}`, () =>
approveSkillDraft(draft.skill_name, draft.draft_id)
)
}
onReject={() =>
runAction(`reject:${draft.draft_id}`, () =>
rejectSkillDraft(draft.skill_name, draft.draft_id)
@ -799,7 +792,6 @@ function DraftCard({
draft,
actionId,
onSubmit,
onApprove,
onReject,
onRecheckSafety,
onPublish,
@ -807,7 +799,6 @@ function DraftCard({
draft: SkillDraft;
actionId: string | null;
onSubmit: () => Promise<unknown>;
onApprove: () => Promise<unknown>;
onReject: () => Promise<unknown>;
onRecheckSafety: () => Promise<unknown>;
onPublish: (confirmHighRisk: boolean) => Promise<unknown>;
@ -820,8 +811,10 @@ function DraftCard({
const frontmatter = draft.proposed_frontmatter || {};
const description = String(frontmatter.description || '').trim();
const toolHints = normalizeStringList(frontmatter.tools);
const submittedForReview = draft.status === 'in_review' || draft.status === 'approved';
const isRevision = draft.proposal_kind === 'revise_skill' && Boolean(draft.base_skill);
const publishBlocked =
draft.status !== 'approved'
!submittedForReview
|| !safety
|| safety.risk_level === 'critical'
|| (evalReport?.status !== 'skipped_provider_unavailable' && evalReport?.passed === false);
@ -833,7 +826,6 @@ function DraftCard({
].filter(Boolean).join('\n');
const safetyBlocksReview = Boolean(safety && (!safety.passed || safety.risk_level === 'critical'));
const submitBlocked = draft.status !== 'draft' || safetyBlocksReview;
const approveBlocked = draft.status !== 'in_review' || safetyBlocksReview;
const rejectBlocked = !REJECTABLE_DRAFT_STATUSES.has(draft.status);
const canPublishLabel = publishBlocked
? publishBlockReason(draft, t)
@ -878,7 +870,12 @@ function DraftCard({
<p className={`mt-1 text-sm leading-6 text-muted-foreground ${containedLongTextClass}`}>
{draft.reason || description || t('没有提供草稿说明。', 'No draft notes were provided.')}
</p>
<div className="mt-3 grid gap-3 md:grid-cols-3">
{draft.proposal_kind === 'revise_skill' && draft.base_version && (
<div className="mt-2 text-sm font-medium text-muted-foreground">
{draft.skill_name}: {draft.base_version} {draft.target_version || t('下一版本', 'Next version')}
</div>
)}
<div className="mt-3 grid gap-3 md:grid-cols-4">
<ReadableFact
icon={<FileCode2 className="h-4 w-4" />}
label={t('草稿内容', 'Draft content')}
@ -889,6 +886,11 @@ function DraftCard({
label={t('基线版本', 'Base version')}
value={draft.base_version || t('新增技能,无基线', 'New skill, no base')}
/>
<ReadableFact
icon={<GitCompare className="h-4 w-4" />}
label={t('目标版本', 'Target version')}
value={draft.target_version || '-'}
/>
<ReadableFact
icon={<Info className="h-4 w-4" />}
label={t('来源', 'Source')}
@ -912,10 +914,6 @@ function DraftCard({
<Send className="mr-2 h-4 w-4" />
{t('送审', 'Submit')}
</Button>
<Button variant="outline" size="sm" className="h-11" disabled={busy || approveBlocked} onClick={() => void onApprove()}>
<Check className="mr-2 h-4 w-4" />
{t('批准', 'Approve')}
</Button>
<Button variant="outline" size="sm" className="h-11" disabled={busy || rejectBlocked} onClick={() => void onReject()}>
<XCircle className="mr-2 h-4 w-4" />
{t('拒绝', 'Reject')}
@ -926,7 +924,7 @@ function DraftCard({
</Button>
<Button size="sm" className="h-11" disabled={busy || publishBlocked} onClick={handlePublish}>
<Rocket className="mr-2 h-4 w-4" />
{t('发布', 'Publish')}
{draft.proposal_kind === 'revise_skill' ? t('发布修订', 'Publish revision') : t('发布', 'Publish')}
</Button>
</div>
</div>
@ -936,7 +934,7 @@ function DraftCard({
<div className="mb-3 flex flex-wrap items-center justify-between gap-2">
<div className="flex items-center gap-2 text-sm font-medium">
<FileText className="h-4 w-4 text-muted-foreground" />
{t('拟发布的技能正文', 'Proposed skill body')}
{isRevision ? t('修改对比', 'Revision comparison') : t('拟发布的技能正文', 'Proposed skill body')}
</div>
{toolHints.length > 0 && (
<div className="flex flex-wrap gap-1">
@ -948,7 +946,14 @@ function DraftCard({
</div>
)}
</div>
{draft.proposed_content.trim() ? (
{isRevision && draft.base_skill ? (
<RevisionComparison
baseVersion={draft.base_version || draft.base_skill.version}
targetVersion={draft.target_version || t('下一版本', 'Next version')}
baseContent={draft.base_skill.content}
proposedContent={draft.proposed_content}
/>
) : draft.proposed_content.trim() ? (
<MarkdownPreview content={draft.proposed_content} />
) : (
<p className="text-sm text-muted-foreground">{t('草稿没有正文内容。', 'This draft has no body content.')}</p>
@ -960,7 +965,7 @@ function DraftCard({
title={t('发布门禁', 'Publish gates')}
summary={canPublishLabel}
items={[
{ label: t('草稿已批准', 'Draft approved'), ok: draft.status === 'approved' },
{ label: t('草稿已送审', 'Draft submitted'), ok: submittedForReview },
{ label: t('安全报告通过', 'Safety passed'), ok: Boolean(safety?.passed) && safety?.risk_level !== 'critical' },
{
label: t('评估未回退', 'No eval regression'),
@ -971,6 +976,7 @@ function DraftCard({
<RawDetails
title={t('原始草稿内容', 'Raw draft payload')}
payload={{
base_skill: draft.base_skill,
proposed_frontmatter: draft.proposed_frontmatter,
proposed_content: draft.proposed_content,
evidence_refs: draft.evidence_refs,
@ -1040,6 +1046,71 @@ function SafetyReportPanel({ report }: { report?: SkillDraftSafetyReport | null
);
}
/**
 * Side-by-side view of a skill revision: version badges, line-level
 * added/removed/changed counts, and one pane each for the current body and
 * the proposed body.
 */
function RevisionComparison({
  baseVersion,
  targetVersion,
  baseContent,
  proposedContent,
}: {
  baseVersion: string;
  targetVersion: string;
  baseContent: string;
  proposedContent: string;
}) {
  const { locale } = useAppI18n();
  const t = (zh: string, en: string) => pickAppText(locale, zh, en);
  const { added, removed, changed } = lineDiffSummary(baseContent, proposedContent);
  // Counters are rendered in a fixed order: added, removed, changed.
  const counters = [
    { id: 'added', label: t('新增', 'Added'), count: added },
    { id: 'removed', label: t('删除', 'Removed'), count: removed },
    { id: 'changed', label: t('修改', 'Changed'), count: changed },
  ];

  return (
    <div className="space-y-3">
      <div className="flex flex-wrap gap-2 text-xs text-muted-foreground">
        <Badge variant="outline">{baseVersion}</Badge>
        <span></span>
        <Badge variant="default">{targetVersion}</Badge>
        {counters.map((counter) => (
          <span key={counter.id}>{counter.label}: {counter.count}</span>
        ))}
      </div>
      <div className="grid min-w-0 gap-3 lg:grid-cols-2">
        <DiffPane title={t('当前版本', 'Current version')} content={baseContent} />
        <DiffPane title={t('草稿修订', 'Draft revision')} content={proposedContent} />
      </div>
    </div>
  );
}
/**
 * Titled, scrollable pane showing one side of a revision comparison.
 * Content that is empty after trimming is shown as a single dash.
 */
function DiffPane({ title, content }: { title: string; content: string }) {
  const visibleText = content.trim() || '-';
  const preClassName = `max-h-[520px] overflow-auto p-3 text-xs leading-5 ${containedLongTextClass}`;

  return (
    <div className="min-w-0 rounded-md border border-border bg-white">
      <div className="border-b border-border px-3 py-2 text-xs font-medium text-muted-foreground">{title}</div>
      <pre className={preClassName}>
        {visibleText}
      </pre>
    </div>
  );
}
/**
 * Summarise how `proposedContent` differs from `baseContent`, line by line.
 *
 * Lines shared at the start and at the end of both texts are skipped first,
 * so a single inserted or deleted line does not make every following line
 * count as changed. Only the differing middle span is then compared by
 * position:
 * - `changed`: paired lines in the span whose text differs;
 * - `added`: lines the proposed span has beyond the base span;
 * - `removed`: lines the base span has beyond the proposed span.
 *
 * An empty string counts as zero lines (not one blank line), so writing a
 * body onto an empty base reports only additions.
 *
 * This is a cheap summary, not a minimal diff: several separate edits inside
 * the middle span are still paired positionally.
 */
function lineDiffSummary(baseContent: string, proposedContent: string): { added: number; removed: number; changed: number } {
  const toLines = (content: string): string[] => (content === '' ? [] : content.split(/\r?\n/));
  const baseLines = toLines(baseContent);
  const proposedLines = toLines(proposedContent);
  const shorter = Math.min(baseLines.length, proposedLines.length);

  // Length of the common leading run.
  let prefix = 0;
  while (prefix < shorter && baseLines[prefix] === proposedLines[prefix]) {
    prefix += 1;
  }

  // Length of the common trailing run, never overlapping the prefix.
  let suffix = 0;
  while (
    suffix < shorter - prefix
    && baseLines[baseLines.length - 1 - suffix] === proposedLines[proposedLines.length - 1 - suffix]
  ) {
    suffix += 1;
  }

  const baseSpan = baseLines.length - prefix - suffix;
  const proposedSpan = proposedLines.length - prefix - suffix;
  const paired = Math.min(baseSpan, proposedSpan);

  let changed = 0;
  for (let offset = 0; offset < paired; offset += 1) {
    if (baseLines[prefix + offset] !== proposedLines[prefix + offset]) {
      changed += 1;
    }
  }

  const added = Math.max(0, proposedSpan - baseSpan);
  const removed = Math.max(0, baseSpan - proposedSpan);
  return { added, removed, changed };
}
function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
const { locale } = useAppI18n();
const t = (zh: string, en: string) => pickAppText(locale, zh, en);
@ -1066,6 +1137,15 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
);
}
const abilitySummary = report.ability_score_summary || {};
const toolExecutionSummary = report.tool_execution_summary || report.tool_mode_summary || {};
const caseSelectionSummary = report.case_selection_summary || {};
const realScore = report.real_score_avg ?? abilitySummary.real_score_avg;
const syntheticScore = report.synthetic_score_avg ?? abilitySummary.synthetic_score_avg;
const overallScore = report.overall_score_avg ?? abilitySummary.overall_score_avg ?? report.candidate_score_avg;
const realCaseCount = toNumber(abilitySummary.real_case_count);
const syntheticCaseCount = toNumber(abilitySummary.synthetic_case_count);
const excludedSynthetic = toNumber(caseSelectionSummary.excluded_synthetic_without_validator);
return (
<div className="min-w-0 rounded-md border border-border bg-muted/20 p-4">
<div className="mb-3 flex flex-wrap items-center justify-between gap-2">
@ -1079,8 +1159,8 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
<div className="grid gap-2 sm:grid-cols-3">
<MetricTile label={t('基线均分', 'Baseline avg')} value={formatScore(report.baseline_score_avg)} />
<MetricTile label={t('候选均分', 'Candidate avg')} value={formatScore(report.candidate_score_avg)} />
<MetricTile label={t('基线能力均分', 'Baseline ability')} value={formatScore(report.baseline_score_avg)} />
<MetricTile label={t('候选能力均分', 'Candidate ability')} value={formatScore(report.candidate_score_avg)} />
<MetricTile
label={t('变化', 'Delta')}
value={`${report.score_delta >= 0 ? '+' : ''}${formatScore(report.score_delta)}`}
@ -1089,8 +1169,14 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<MetricTile label={t('执行覆盖', 'Execution')} value={formatPercent(report.execution_coverage)} />
<MetricTile label={t('替代评估', 'Surrogate')} value={formatPercent(report.surrogate_coverage)} />
<MetricTile label={t('真实案例均分', 'Real avg')} value={formatOptionalScore(realScore)} />
<MetricTile label={t('模拟案例均分', 'Synthetic avg')} value={formatOptionalScore(syntheticScore)} />
<MetricTile label={t('总体能力分', 'Overall ability')} value={formatOptionalScore(overallScore)} />
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<MetricTile label={t('工具执行覆盖', 'Tool execution')} value={formatPercent(toOptionalNumber(toolExecutionSummary.executed) ?? report.execution_coverage)} />
<MetricTile label={t('替代工具评估', 'Tool surrogate')} value={formatPercent(toOptionalNumber(toolExecutionSummary.surrogate) ?? report.surrogate_coverage)} />
<MetricTile label={t('置信度', 'Confidence')} value={report.confidence || 'low'} />
</div>
@ -1100,6 +1186,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('不变', 'Unchanged')} value={String(report.unchanged_count)} />
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('真实案例', 'Real cases')} value={String(realCaseCount)} />
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('模拟案例', 'Synthetic cases')} value={String(syntheticCaseCount)} />
<ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('无验证器已排除', 'No-validator excluded')} value={String(excludedSynthetic)} />
</div>
{report.cases.length > 0 && (
<div className="mt-3 overflow-hidden rounded-md border border-border bg-white">
<div className="border-b border-border px-3 py-2 text-xs font-medium text-muted-foreground">
@ -1114,6 +1206,10 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
<MetricTile label={t('候选', 'Candidate')} value={formatScore(toNumber(item.candidate_score))} />
<MetricTile label={t('变化', 'Delta')} value={formatSignedScore(toNumber(item.delta))} />
</div>
<div className="mt-2 text-muted-foreground">
{String(item.synthetic) === 'true' ? t('模拟案例', 'Synthetic case') : t('真实案例', 'Real case')}
{item.tier ? ` · ${String(item.tier)}` : ''}
</div>
</div>
))}
</div>
@ -1122,6 +1218,7 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
<thead className="bg-muted/40 text-muted-foreground">
<tr>
<th className="px-3 py-2 font-medium">{t('运行', 'Run')}</th>
<th className="px-3 py-2 font-medium">{t('来源', 'Source')}</th>
<th className="px-3 py-2 font-medium">{t('基线', 'Baseline')}</th>
<th className="px-3 py-2 font-medium">{t('候选', 'Candidate')}</th>
<th className="px-3 py-2 font-medium">{t('变化', 'Delta')}</th>
@ -1131,6 +1228,10 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
{report.cases.map((item, index) => (
<tr key={`${String(item.run_id || index)}:${index}`} className="border-t border-border">
<td className="max-w-[160px] truncate px-3 py-2 font-mono">{String(item.run_id || '-')}</td>
<td className="px-3 py-2">
{String(item.synthetic) === 'true' ? t('模拟', 'Synthetic') : t('真实', 'Real')}
{item.tier ? ` · ${String(item.tier)}` : ''}
</td>
<td className="px-3 py-2">{formatScore(toNumber(item.baseline_score))}</td>
<td className="px-3 py-2">{formatScore(toNumber(item.candidate_score))}</td>
<td className="px-3 py-2">{formatSignedScore(toNumber(item.delta))}</td>
@ -1144,6 +1245,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
{Array.isArray(report.case_reports) && report.case_reports.length > 0 ? (
<RawDetails title={t('Replay case reports', 'Replay case reports')} payload={report.case_reports} />
) : null}
{Object.keys(abilitySummary).length > 0 ? (
<RawDetails title={t('能力评分汇总', 'Ability score summary')} payload={abilitySummary} />
) : null}
{Object.keys(toolExecutionSummary).length > 0 ? (
<RawDetails title={t('工具诊断汇总', 'Tool diagnostic summary')} payload={toolExecutionSummary} />
) : null}
{report.preservation_report ? (
<RawDetails title={t('Preservation report', 'Preservation report')} payload={report.preservation_report} />
) : null}
@ -1366,7 +1473,9 @@ function triggerReasonLabel(reason: string, t: (zh: string, en: string) => strin
}
function publishBlockReason(draft: SkillDraft, t: (zh: string, en: string) => string): string {
if (draft.status !== 'approved') return t('草稿还没有批准,不能发布。', 'The draft is not approved yet.');
if (draft.status !== 'in_review' && draft.status !== 'approved') {
return t('草稿还没有送审,不能发布。', 'The draft has not been submitted yet.');
}
if (!draft.safety_report) return t('缺少安全报告,不能发布。', 'A safety report is required before publishing.');
if (draft.safety_report.risk_level === 'critical' || !draft.safety_report.passed) {
return t('安全报告存在阻断项,不能发布。', 'The safety report has blockers.');
@ -1399,6 +1508,11 @@ function formatScore(value: number): string {
return value.toFixed(2);
}
/**
 * Format a loosely typed score for display.
 *
 * The value is normalised with `toOptionalNumber`; when that yields no number
 * the placeholder `'-'` is returned, otherwise the number is rendered with
 * `formatScore`.
 */
function formatOptionalScore(value: unknown): string {
  const score = toOptionalNumber(value);
  if (typeof score !== 'number') {
    return '-';
  }
  return formatScore(score);
}
function formatPercent(value?: number | null): string {
if (typeof value !== 'number' || Number.isNaN(value)) return '0%';
return `${Math.round(value * 100)}%`;
@ -1414,6 +1528,12 @@ function toNumber(value: unknown): number {
return Number.isFinite(parsed) ? parsed : 0;
}
/**
 * Convert a loosely typed value into a finite number, or `null` when the
 * value is absent or not numeric.
 *
 * Returns `null` for `null`, `undefined`, empty or whitespace-only strings,
 * and anything `Number()` cannot turn into a finite number (NaN, ±Infinity).
 */
function toOptionalNumber(value: unknown): number | null {
  if (value === null || value === undefined) return null;
  // Number('') and Number('   ') both evaluate to 0, so blank strings must be
  // rejected before conversion or a missing value would be shown as a real 0.
  if (typeof value === 'string' && value.trim() === '') return null;
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : null;
}
function EmptyState({ icon, text }: { icon: React.ReactNode; text: string }) {
return (
<div className="py-12 text-center text-muted-foreground">
@ -1475,7 +1595,7 @@ function UploadSkillForm({
className="block w-full cursor-pointer text-sm text-muted-foreground file:mr-4 file:rounded-md file:border-0 file:bg-primary file:px-4 file:py-2 file:text-sm file:font-medium file:text-primary-foreground hover:file:bg-primary/90"
/>
<p className="text-xs text-muted-foreground">
{pickAppText(locale, '上传后进入草稿评审,并自动运行 safety 和 eval。', 'After upload, the skill enters draft review and runs safety and eval automatically.')}
{pickAppText(locale, '上传后生成草稿;送审后再运行 safety 和 eval。', 'After upload, a draft is created; safety and eval run after submission.')}
</p>
</div>
<div className="flex justify-end gap-2">

View File

@ -3,7 +3,7 @@
import { useEffect } from 'react';
import { usePathname, useRouter, useSearchParams } from 'next/navigation';
import { buildAuthPortalUrl } from '@/lib/auth-portal';
import { clearTokens, getMe, isLoggedIn } from '@/lib/api';
import { AUTH_CLEARED_EVENT, clearTokens, getMe, isLoggedIn } from '@/lib/api';
import { pickAppText } from '@/lib/i18n/core';
import { useAppI18n } from '@/lib/i18n/provider';
import { useChatStore } from '@/lib/store';
@ -66,6 +66,18 @@ export default function AuthGuard({
};
}, [setIsAuthLoading, setUser]);
useEffect(() => {
const handleAuthCleared = () => {
setUser(null);
setIsAuthLoading(false);
};
window.addEventListener(AUTH_CLEARED_EVENT, handleAuthCleared);
return () => {
window.removeEventListener(AUTH_CLEARED_EVENT, handleAuthCleared);
};
}, [setIsAuthLoading, setUser]);
useEffect(() => {
if (isAuthLoading) {
return;

View File

@ -58,6 +58,7 @@ const WS_URL = process.env.NEXT_PUBLIC_WS_URL?.trim();
const DEFAULT_API_URL = 'http://127.0.0.1:18080';
const ACCESS_TOKEN_KEY = 'beaver_access_token';
const REFRESH_TOKEN_KEY = 'beaver_refresh_token';
export const AUTH_CLEARED_EVENT = 'beaver-auth-cleared';
const REQUEST_TIMEOUT_MS = 8000;
const OUTLOOK_REQUEST_TIMEOUT_MS = 45000;
const SKILL_LEARNING_REQUEST_TIMEOUT_MS = 120000;
@ -117,6 +118,34 @@ type FetchJsonOptions = RequestInit & {
timeoutMs?: number;
};
/**
 * Error carrying the HTTP status and the extracted error detail of a failed
 * API call, in addition to the human-readable message.
 */
export class ApiError extends Error {
  /** HTTP status code supplied by the caller. */
  status: number;
  /** Error detail text supplied by the caller. */
  detail: string;

  constructor(message: string, options: { status: number; detail: string }) {
    super(message);
    const { status, detail } = options;
    this.name = 'ApiError';
    this.status = status;
    this.detail = detail;
  }
}
/**
 * Type guard for `ApiError`.
 *
 * Returns true when `error` is an `ApiError` instance and, if `status` is
 * given, its `status` field equals it.
 */
export function isApiError(error: unknown, status?: number): error is ApiError {
  if (!(error instanceof ApiError)) {
    return false;
  }
  if (status === undefined) {
    return true;
  }
  return error.status === status;
}
/**
 * Extract a human-readable message from an error response body.
 *
 * - A JSON object with a string `detail` yields that string.
 * - A JSON object whose `detail` is a list (the shape used for request
 *   validation errors, where each entry carries a `msg`) yields the entries'
 *   messages joined with `'; '`.
 * - Anything else (non-JSON text, other shapes) is returned unchanged.
 */
function parseErrorDetail(text: string): string {
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    // Not JSON: keep the raw text.
    return text;
  }
  if (!parsed || typeof parsed !== 'object') {
    return text;
  }
  const detail = (parsed as { detail?: unknown }).detail;
  if (typeof detail === 'string') {
    return detail;
  }
  if (Array.isArray(detail)) {
    // Validation-style payloads: collect each entry's message; plain string
    // entries are accepted as-is.
    const messages = detail
      .map((item: unknown) =>
        item && typeof item === 'object' ? (item as { msg?: unknown }).msg : item
      )
      .filter((msg: unknown): msg is string => typeof msg === 'string' && msg.length > 0);
    if (messages.length > 0) {
      return messages.join('; ');
    }
  }
  // Unknown shape: fall back to the raw body so no information is lost.
  return text;
}
function withTimeout(
signal?: AbortSignal,
timeoutMs: number = REQUEST_TIMEOUT_MS
@ -163,6 +192,7 @@ export function clearTokens(): void {
if (!isBrowser()) return;
localStorage.removeItem(ACCESS_TOKEN_KEY);
localStorage.removeItem(REFRESH_TOKEN_KEY);
window.dispatchEvent(new CustomEvent(AUTH_CLEARED_EVENT));
}
export function isLoggedIn(): boolean {
@ -215,16 +245,11 @@ async function fetchJSON<T>(path: string, options?: FetchJsonOptions): Promise<T
if (res.status === 401) {
clearTokens();
}
let detail = text;
try {
const parsed = JSON.parse(text);
if (parsed && typeof parsed.detail === 'string') {
detail = parsed.detail;
}
} catch {
// keep raw text
}
throw new Error(`${pickAppText(locale, '接口错误', 'API error')} ${res.status}: ${detail}`);
const detail = parseErrorDetail(text);
throw new ApiError(`${pickAppText(locale, '接口错误', 'API error')} ${res.status}: ${detail}`, {
status: res.status,
detail,
});
}
return res.json();
}
@ -1216,7 +1241,7 @@ export async function uploadSkill(file: File): Promise<Skill> {
if (!res.ok) {
const text = await res.text();
throw new Error(`接口错误 ${res.status}: ${text}`);
throw new Error(`接口错误 ${res.status}: ${parseErrorDetail(text)}`);
}
return res.json();
}

View File

@ -0,0 +1,8 @@
// Top-level folders under which mutating file actions are allowed.
const USER_FILE_MUTABLE_ROOTS = new Set(['uploads', 'outputs', 'shared', 'tasks']);

/**
 * Tell whether `path` lies inside one of the mutable user-file roots.
 *
 * Surrounding whitespace and leading/trailing slashes are ignored. An empty
 * path (the listing root) is never mutable; otherwise the first path segment
 * must be one of `USER_FILE_MUTABLE_ROOTS`.
 */
export function canMutateUserFilesPath(path: string): boolean {
  const normalized = path.trim().replace(/^\/+|\/+$/g, '');
  if (normalized.length === 0) {
    return false;
  }
  const slashIndex = normalized.indexOf('/');
  const topLevel = slashIndex === -1 ? normalized : normalized.slice(0, slashIndex);
  return USER_FILE_MUTABLE_ROOTS.has(topLevel);
}

View File

@ -3,9 +3,23 @@ import { resolve } from 'node:path';
import { describe, expect, it } from 'vitest';
import { canMutateUserFilesPath } from './user-file-paths';
const root = resolve(__dirname, '..');
describe('user file system frontend wiring', () => {
it('only enables mutating file actions inside concrete user-file roots', () => {
expect(canMutateUserFilesPath('')).toBe(false);
expect(canMutateUserFilesPath('/')).toBe(false);
expect(canMutateUserFilesPath('qa-folder')).toBe(false);
expect(canMutateUserFilesPath('uploads')).toBe(true);
expect(canMutateUserFilesPath('uploads/qa-folder')).toBe(true);
expect(canMutateUserFilesPath('outputs/report.md')).toBe(true);
expect(canMutateUserFilesPath('shared')).toBe(true);
expect(canMutateUserFilesPath('tasks/task-1')).toBe(true);
});
it('routes API client helpers to user file endpoints', () => {
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
@ -17,6 +31,13 @@ describe('user file system frontend wiring', () => {
expect(apiSource).toContain('/api/user-files/mkdir');
});
it('notifies the app shell when API auth is cleared', () => {
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
expect(apiSource).toContain('AUTH_CLEARED_EVENT');
expect(apiSource).toContain("window.dispatchEvent(new CustomEvent(AUTH_CLEARED_EVENT))");
});
it('does not wire the Files page to workspace or MinIO management APIs', () => {
const pageSource = readFileSync(resolve(root, 'app/(app)/files/page.tsx'), 'utf8');
@ -29,4 +50,18 @@ describe('user file system frontend wiring', () => {
expect(pageSource).not.toContain('accessKey');
expect(pageSource).not.toContain('secretKey');
});
it('does not retry user-file loads after an auth failure', () => {
const pageSource = readFileSync(resolve(root, 'app/(app)/files/page.tsx'), 'utf8');
expect(pageSource).toContain('isAuthError');
expect(pageSource).toContain('if (isAuthError(err))');
});
it('shows backend upload error details instead of raw JSON payloads', () => {
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
expect(apiSource).toContain('function parseErrorDetail');
expect(apiSource).toContain('throw new Error(`接口错误 ${res.status}: ${parseErrorDetail(text)}`)');
});
});

View File

@ -993,6 +993,12 @@ export interface SkillDraftEvalReport {
confidence?: 'low' | 'medium' | 'high' | string;
case_reports?: Array<Record<string, unknown>>;
tool_mode_summary?: Record<string, unknown>;
ability_score_summary?: Record<string, unknown>;
tool_execution_summary?: Record<string, unknown>;
case_selection_summary?: Record<string, unknown>;
real_score_avg?: number | null;
synthetic_score_avg?: number | null;
overall_score_avg?: number | null;
preservation_report?: Record<string, unknown> | null;
}
@ -1000,6 +1006,15 @@ export interface SkillDraft {
draft_id: string;
skill_name: string;
base_version?: string | null;
target_version?: string | null;
base_skill?: {
skill_name: string;
version: string;
frontmatter: Record<string, unknown>;
content: string;
summary?: string;
tool_hints?: string[];
} | null;
proposed_content: string;
proposed_frontmatter: Record<string, unknown>;
created_at: string;

View File

@ -47,6 +47,8 @@ http {
location /api/ {
proxy_pass http://127.0.0.1:18080;
proxy_read_timeout 3600;
proxy_send_timeout 3600;
}
location /docs {