feat(app): 移除内置agents并添加CORS支持和技能上传优化
移除了 agents/registry.json 中的所有内置 agents 配置,将 agents 数组清空。为 web 应用添加了 CORS 中间件支持,允许指定的前端地址跨域访问。重构了技能上传功能,增加了 LLM 重写机制,自动规范化上传的技能格式。新增了工具名称提取逻辑,从技能正文中自动识别 Required Tools 段落。更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
@ -7,6 +7,7 @@ import asyncio
|
||||
import io
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import secrets
|
||||
import shutil
|
||||
import time
|
||||
@ -49,9 +50,11 @@ from beaver.services.user_file_resolver import (
|
||||
UserFileStorageResolver,
|
||||
build_file_auth_context,
|
||||
)
|
||||
from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig
|
||||
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
|
||||
from beaver.skills.authoring.format import parse_skill_rewrite_json
|
||||
from beaver.skills.learning import SkillLearningService, SkillLearningWorker, SkillLearningWorkerConfig
|
||||
from beaver.skills.learning.replay import ReplayRunner
|
||||
from beaver.skills.catalog.utils import parse_frontmatter
|
||||
from beaver.skills.catalog.utils import extract_required_tool_names, parse_frontmatter
|
||||
|
||||
from .deps import get_agent_service
|
||||
from .files import (
|
||||
@ -96,8 +99,11 @@ from .schemas import (
|
||||
|
||||
try:
|
||||
from fastapi import FastAPI, File, Form, Header, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
except ModuleNotFoundError: # pragma: no cover - fallback for skeleton-only environments
|
||||
CORSMiddleware = None # type: ignore[assignment]
|
||||
|
||||
def File(default: Any = None) -> Any: # type: ignore[override]
|
||||
return default
|
||||
|
||||
@ -274,6 +280,7 @@ async def _app_lifespan(
|
||||
worker = SkillLearningWorker(
|
||||
pipeline=loaded.skill_learning_pipeline, # type: ignore[arg-type]
|
||||
provider_bundle_factory=lambda: attached_service._make_provider_bundle_for_task(loaded, {}), # noqa: SLF001
|
||||
replay_runner_factory=lambda: ReplayRunner(agent_loop=attached_service.create_loop()),
|
||||
config=worker_config,
|
||||
)
|
||||
worker_task = asyncio.create_task(worker.run_forever())
|
||||
@ -516,6 +523,20 @@ def _self_restart_enabled() -> bool:
|
||||
return os.getenv("BEAVER_ENABLE_SELF_RESTART", "1").strip() not in {"0", "false", "False"}
|
||||
|
||||
|
||||
def _cors_allow_origins() -> list[str]:
|
||||
raw = os.getenv("BEAVER_CORS_ALLOW_ORIGINS", "").strip()
|
||||
if raw:
|
||||
return [origin.strip().rstrip("/") for origin in raw.split(",") if origin.strip()]
|
||||
return [
|
||||
"http://127.0.0.1:3000",
|
||||
"http://localhost:3000",
|
||||
"http://127.0.0.1:3080",
|
||||
"http://localhost:3080",
|
||||
"http://127.0.0.1:3081",
|
||||
"http://localhost:3081",
|
||||
]
|
||||
|
||||
|
||||
def _schedule_self_restart(delay_seconds: float = 0.75) -> None:
|
||||
import threading
|
||||
|
||||
@ -556,6 +577,14 @@ def create_app(
|
||||
shutdown_force=shutdown_force,
|
||||
),
|
||||
)
|
||||
if CORSMiddleware is not None:
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=_cors_allow_origins(),
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
app.state.auth_tokens = {}
|
||||
app.state.handoff_codes = {}
|
||||
app.state.auth_file = Path(os.getenv("BEAVER_AUTH_FILE") or "")
|
||||
@ -1992,13 +2021,19 @@ def create_app(
|
||||
filename = file.filename or ""
|
||||
if not filename.endswith(".zip"):
|
||||
raise HTTPException(status_code=400, detail="File must be a .zip archive")
|
||||
loaded = get_agent_service(request).create_loop().boot()
|
||||
agent_service = get_agent_service(request)
|
||||
loaded = agent_service.create_loop().boot()
|
||||
try:
|
||||
content = await file.read()
|
||||
draft = _create_skill_upload_draft(loaded, filename, content)
|
||||
draft_payload = _create_skill_upload_draft(loaded, filename, content)
|
||||
draft = loaded.draft_service.get_draft(draft_payload["skill_name"], draft_payload["draft_id"])
|
||||
if draft is not None:
|
||||
await _rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename=filename)
|
||||
draft = loaded.draft_service.get_draft(draft.skill_name, draft.draft_id) or draft
|
||||
draft_payload = draft.to_dict()
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
||||
return draft
|
||||
return draft_payload
|
||||
|
||||
@app.get("/api/marketplaces/skills/search")
|
||||
async def search_skillhub(
|
||||
@ -2068,13 +2103,17 @@ def create_app(
|
||||
@app.get("/api/skills/candidates")
|
||||
async def list_skill_candidates(request: Request, status: str | None = None) -> list[dict[str, Any]]:
|
||||
loaded = get_agent_service(request).create_loop().boot()
|
||||
return [item.to_dict() for item in loaded.skill_learning_pipeline.list_candidates(status=status)] # type: ignore[union-attr]
|
||||
return [
|
||||
_skill_learning_candidate_payload(loaded, item)
|
||||
for item in loaded.skill_learning_pipeline.list_candidates(status=status) # type: ignore[union-attr]
|
||||
]
|
||||
|
||||
@app.get("/api/skills/candidates/{candidate_id}")
|
||||
async def get_skill_candidate(candidate_id: str, request: Request) -> dict[str, Any]:
|
||||
loaded = get_agent_service(request).create_loop().boot()
|
||||
try:
|
||||
return loaded.skill_learning_pipeline.get_candidate(candidate_id).to_dict() # type: ignore[union-attr]
|
||||
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
|
||||
return _skill_learning_candidate_payload(loaded, candidate)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
||||
|
||||
@ -2087,25 +2126,19 @@ def create_app(
|
||||
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
|
||||
if candidate.draft_skill_name and candidate.draft_id:
|
||||
try:
|
||||
return _skill_draft_payload(loaded, candidate.draft_skill_name, candidate.draft_id)
|
||||
loaded.skill_learning_pipeline.get_draft(candidate.draft_skill_name, candidate.draft_id) # type: ignore[union-attr]
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
return _skill_draft_payload(loaded, candidate.draft_skill_name, candidate.draft_id)
|
||||
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
|
||||
draft = await loaded.skill_learning_pipeline.synthesize_draft( # type: ignore[union-attr]
|
||||
candidate_id,
|
||||
provider_bundle=provider_bundle,
|
||||
)
|
||||
loaded.skill_learning_pipeline.check_safety(draft.skill_name, draft.draft_id) # type: ignore[union-attr]
|
||||
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
|
||||
candidate_id,
|
||||
draft.skill_name,
|
||||
draft.draft_id,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=ReplayRunner(agent_loop=loop),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
||||
return draft.to_dict()
|
||||
return _skill_draft_payload(loaded, draft.skill_name, draft.draft_id)
|
||||
|
||||
@app.post("/api/skills/candidates/{candidate_id}/regenerate")
|
||||
async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
|
||||
@ -2118,17 +2151,9 @@ def create_app(
|
||||
candidate_id,
|
||||
provider_bundle=provider_bundle,
|
||||
)
|
||||
loaded.skill_learning_pipeline.check_safety(draft.skill_name, draft.draft_id) # type: ignore[union-attr]
|
||||
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
|
||||
candidate_id,
|
||||
draft.skill_name,
|
||||
draft.draft_id,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=ReplayRunner(agent_loop=loop),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
||||
return draft.to_dict()
|
||||
return _skill_draft_payload(loaded, draft.skill_name, draft.draft_id)
|
||||
|
||||
@app.post("/api/skills/learning/run-once")
|
||||
async def run_skill_learning_once(request: Request) -> dict[str, Any]:
|
||||
@ -2185,17 +2210,31 @@ def create_app(
|
||||
|
||||
@app.post("/api/skills/{skill_name}/drafts/{draft_id}/submit")
|
||||
async def submit_skill_draft(skill_name: str, draft_id: str, request: Request, payload: dict[str, Any] | None = None) -> dict[str, Any]:
|
||||
loaded = get_agent_service(request).create_loop().boot()
|
||||
agent_service = get_agent_service(request)
|
||||
loop = agent_service.create_loop()
|
||||
loaded = loop.boot()
|
||||
try:
|
||||
review = loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
|
||||
skill_name,
|
||||
draft_id,
|
||||
requested_by=str((payload or {}).get("requested_by") or "web"),
|
||||
notes=str((payload or {}).get("notes") or ""),
|
||||
)
|
||||
safety = loaded.skill_learning_pipeline.check_safety(skill_name, draft_id) # type: ignore[union-attr]
|
||||
if safety.passed and safety.risk_level != "critical":
|
||||
loaded.skill_learning_pipeline.submit_review( # type: ignore[union-attr]
|
||||
skill_name,
|
||||
draft_id,
|
||||
requested_by=str((payload or {}).get("requested_by") or "web"),
|
||||
notes=str((payload or {}).get("notes") or ""),
|
||||
)
|
||||
candidate_id = _skill_learning_candidate_id_for_draft(loaded, skill_name, draft_id)
|
||||
if candidate_id is not None:
|
||||
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
|
||||
await loaded.skill_learning_pipeline.evaluate_draft( # type: ignore[union-attr]
|
||||
candidate_id,
|
||||
skill_name,
|
||||
draft_id,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=ReplayRunner(agent_loop=loop),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise _skill_draft_http_error(exc) from exc
|
||||
return review.to_dict()
|
||||
return _skill_draft_payload(loaded, skill_name, draft_id)
|
||||
|
||||
@app.post("/api/skills/{skill_name}/drafts/{draft_id}/approve")
|
||||
async def approve_skill_draft(skill_name: str, draft_id: str, request: Request, payload: dict[str, Any] | None = None) -> dict[str, Any]:
|
||||
@ -2719,47 +2758,70 @@ def _create_skill_upload_draft(loaded: Any, filename: str, content: bytes) -> di
|
||||
if not file_infos:
|
||||
raise ValueError("Zip archive is empty")
|
||||
skill_entries = []
|
||||
for info in file_infos:
|
||||
parts = Path(info.filename.replace("\\", "/")).parts
|
||||
if "__MACOSX" in parts or Path(info.filename).name == ".DS_Store":
|
||||
continue
|
||||
if info.filename.replace("\\", "/").startswith("/") or any(part in {"", ".", ".."} for part in parts):
|
||||
raise ValueError(f"Unsafe archive entry: {info.filename}")
|
||||
if parts[-1] == "SKILL.md":
|
||||
if len(parts) not in (1, 2):
|
||||
raise ValueError("SKILL.md must be at root or inside one top-level directory")
|
||||
skill_entries.append(info.filename)
|
||||
if not skill_entries:
|
||||
raise ValueError("Zip must contain SKILL.md")
|
||||
skill_entry = skill_entries[0]
|
||||
top = Path(skill_entry).parts[0] if len(Path(skill_entry).parts) == 2 else ""
|
||||
raw_skill = archive.read(skill_entry).decode("utf-8", errors="replace")
|
||||
frontmatter, body = parse_frontmatter(raw_skill)
|
||||
skill_name = str(frontmatter.get("name") or top or Path(filename).stem).strip().replace(" ", "-")
|
||||
if not skill_name or "/" in skill_name or "\\" in skill_name or skill_name in {".", ".."}:
|
||||
raise ValueError("Could not determine a safe skill name")
|
||||
files: list[tuple[str, bytes]] = []
|
||||
safe_entries: list[tuple[Any, str, tuple[str, ...]]] = []
|
||||
for info in file_infos:
|
||||
raw = info.filename.replace("\\", "/")
|
||||
parts = Path(raw).parts
|
||||
if "__MACOSX" in parts or Path(raw).name == ".DS_Store":
|
||||
continue
|
||||
if raw.startswith("/"):
|
||||
if raw.startswith("/") or any(part in {"", ".", ".."} for part in parts):
|
||||
raise ValueError(f"Unsafe archive entry: {info.filename}")
|
||||
if top and parts and parts[0] != top:
|
||||
raise ValueError("Zip archive must contain a single top-level skill directory")
|
||||
rel_parts = parts[1:] if top and parts and parts[0] == top else parts
|
||||
safe_entries.append((info, raw, tuple(parts)))
|
||||
if _is_skill_markdown_entry(parts[-1]):
|
||||
skill_entries.append(raw)
|
||||
if not skill_entries:
|
||||
raise ValueError("Zip must contain SKILL.md")
|
||||
if len(skill_entries) > 1:
|
||||
raise ValueError("Zip must contain exactly one SKILL.md")
|
||||
skill_entry = skill_entries[0]
|
||||
skill_root = tuple(Path(skill_entry).parts[:-1])
|
||||
raw_skill = archive.read(skill_entry).decode("utf-8", errors="replace")
|
||||
frontmatter, body = parse_frontmatter(raw_skill)
|
||||
skill_name = str(frontmatter.get("name") or (skill_root[-1] if skill_root else "") or Path(filename).stem).strip().replace(" ", "-")
|
||||
if not skill_name or "/" in skill_name or "\\" in skill_name or skill_name in {".", ".."}:
|
||||
raise ValueError("Could not determine a safe skill name")
|
||||
proposed_frontmatter = normalize_skill_frontmatter(
|
||||
{
|
||||
**dict(frontmatter),
|
||||
"name": skill_name,
|
||||
"description": frontmatter.get("description") or skill_name,
|
||||
},
|
||||
skill_name=skill_name,
|
||||
)
|
||||
proposed_frontmatter["tools"] = _merge_tool_names(
|
||||
proposed_frontmatter.get("tools"),
|
||||
extract_required_tool_names(body),
|
||||
_infer_uploaded_skill_tools(
|
||||
skill_name=skill_name,
|
||||
filename=filename,
|
||||
frontmatter=proposed_frontmatter,
|
||||
content=body,
|
||||
loaded=loaded,
|
||||
),
|
||||
)
|
||||
proposed_content = ensure_canonical_skill_body(
|
||||
body,
|
||||
title=skill_name,
|
||||
description=str(proposed_frontmatter.get("description") or ""),
|
||||
tools=list(proposed_frontmatter.get("tools") or []),
|
||||
)
|
||||
files: list[tuple[str, bytes]] = []
|
||||
for info, raw, parts in safe_entries:
|
||||
if raw == skill_entry:
|
||||
continue
|
||||
if skill_root:
|
||||
if parts[: len(skill_root)] != skill_root:
|
||||
continue
|
||||
rel_parts = parts[len(skill_root):]
|
||||
else:
|
||||
rel_parts = parts
|
||||
if not rel_parts or any(part in {"", ".", ".."} for part in rel_parts):
|
||||
raise ValueError(f"Unsafe archive entry: {info.filename}")
|
||||
files.append(("/".join(rel_parts), archive.read(info)))
|
||||
draft = loaded.draft_service.create_new_skill_draft(
|
||||
skill_name=skill_name,
|
||||
proposed_content=body,
|
||||
proposed_frontmatter={
|
||||
**dict(frontmatter),
|
||||
"name": skill_name,
|
||||
"description": frontmatter.get("description") or skill_name,
|
||||
},
|
||||
proposed_content=proposed_content,
|
||||
proposed_frontmatter=proposed_frontmatter,
|
||||
created_by="web-upload",
|
||||
reason=f"Uploaded {filename}",
|
||||
evidence_refs=[{"kind": "upload", "filename": filename, "files": sorted(path for path, _ in files)}],
|
||||
@ -2784,6 +2846,162 @@ def _create_skill_upload_draft(loaded: Any, filename: str, content: bytes) -> di
|
||||
return draft.to_dict()
|
||||
|
||||
|
||||
def _is_skill_markdown_entry(filename: str) -> bool:
|
||||
return filename.strip().lower() in {"skill.md", "skills.md"}
|
||||
|
||||
|
||||
def _merge_tool_names(*groups: Any) -> list[str]:
|
||||
result: list[str] = []
|
||||
for group in groups:
|
||||
if isinstance(group, str):
|
||||
raw_items = group.split(",")
|
||||
elif isinstance(group, (list, tuple, set)):
|
||||
raw_items = list(group)
|
||||
else:
|
||||
raw_items = []
|
||||
for item in raw_items:
|
||||
cleaned = str(item).strip()
|
||||
if cleaned and cleaned not in result:
|
||||
result.append(cleaned)
|
||||
return result
|
||||
|
||||
|
||||
def _infer_uploaded_skill_tools(
    *,
    skill_name: str,
    filename: str,
    frontmatter: dict[str, Any],
    content: str,
    loaded: Any,
) -> list[str]:
    """Guess which runtime tools an uploaded skill needs from its text.

    The skill name, upload filename, JSON-serialized frontmatter and body are
    joined into one lower-cased haystack. Two passes then collect tool names:

    1. Every known tool name (the runtime registry's names, or
       ``_COMMON_RUNTIME_TOOL_NAMES`` when the registry is unavailable or
       empty) that appears as a whole token in the haystack.
    2. Keyword hints (weather, news/search, URL/browser vocabulary) that map
       to ``web_fetch`` / ``web_search``. A hinted tool is only added when the
       registry is unavailable or lists that tool.

    Returns:
        Inferred tool names without duplicates: literal matches first (sorted
        by name), then keyword-hinted tools in rule order.
    """
    available = _available_runtime_tool_names(loaded)
    haystack = "\n".join(
        [
            skill_name,
            filename,
            # The JSON is only used as searchable text, so values JSON cannot
            # encode (e.g. dates) are stringified instead of raising TypeError.
            json.dumps(frontmatter, ensure_ascii=False, sort_keys=True, default=str),
            content,
        ]
    ).lower()
    inferred: list[str] = []

    # Pass 1: literal mentions, delimited so "process" does not match inside
    # "preprocess_data".
    for tool_name in sorted(available or _COMMON_RUNTIME_TOOL_NAMES):
        if re.search(rf"(?<![a-z0-9_]){re.escape(tool_name.lower())}(?![a-z0-9_])", haystack):
            inferred.append(tool_name)

    # Pass 2: vocabulary hints, evaluated in this order.
    keyword_rules: tuple[tuple[str, tuple[str, ...]], ...] = (
        (
            r"\b(weather|forecast|temperature|precipitation|rain|snow|humidity|wind|air quality|aqi)\b",
            ("web_fetch", "web_search"),
        ),
        (
            r"\b(latest|current|today|tomorrow|news|search|query|lookup|find online|web search)\b",
            ("web_search",),
        ),
        (
            r"\b(url|http|https|website|webpage|page|fetch|crawl|browser|online source)\b",
            ("web_fetch",),
        ),
    )
    for pattern, hinted_tools in keyword_rules:
        if not re.search(pattern, haystack):
            continue
        for tool_name in hinted_tools:
            # `available is None` means the registry could not be read; only
            # then are hints accepted without checking them against it.
            if available is not None and tool_name not in available:
                continue
            if tool_name not in inferred:
                inferred.append(tool_name)

    return inferred
|
||||
|
||||
|
||||
def _available_runtime_tool_names(loaded: Any) -> set[str] | None:
|
||||
registry = getattr(loaded, "tool_registry", None)
|
||||
if registry is None:
|
||||
return None
|
||||
try:
|
||||
return {spec.name for spec in registry.list_specs()}
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
_COMMON_RUNTIME_TOOL_NAMES = {
|
||||
"web_fetch",
|
||||
"web_search",
|
||||
"read_file",
|
||||
"write_file",
|
||||
"patch_file",
|
||||
"search_files",
|
||||
"list_directory",
|
||||
"memory",
|
||||
"terminal",
|
||||
"process",
|
||||
"execute_code",
|
||||
"skill_view",
|
||||
"skills_list",
|
||||
"skill_manage",
|
||||
"cron",
|
||||
}
|
||||
|
||||
|
||||
async def _rewrite_uploaded_skill_draft_with_llm(agent_service: Any, loaded: Any, draft: Any, *, filename: str) -> None:
    """Ask the LLM to rewrite an uploaded skill draft into the canonical format.

    The auxiliary provider is preferred and the main provider is the
    fallback. The model's reply is parsed with ``parse_skill_rewrite_json``;
    its tool list is merged with tools found in the rewritten body and tools
    inferred from the text, the body is passed through
    ``ensure_canonical_skill_body``, and the updated draft is persisted via
    ``loaded.skill_spec_store.write_draft``.

    This step is best effort: if no provider is configured, the reply cannot
    be parsed, or anything raises, the draft is left as uploaded. Failures
    are logged instead of being propagated so an upload never fails because
    of the rewrite.

    Args:
        agent_service: Service used to build the provider bundle.
        loaded: Booted runtime exposing the tool registry and draft store.
        draft: Draft to rewrite; mutated in place on success.
        filename: Original upload filename, included in the prompt.
    """
    import logging  # Local import: this block cannot assume a module-level one.

    try:
        provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {})  # noqa: SLF001
        provider = getattr(provider_bundle, "auxiliary_provider", None) or getattr(provider_bundle, "main_provider", None)
        runtime = getattr(provider_bundle, "auxiliary_runtime", None) or getattr(provider_bundle, "main_runtime", None)
        if provider is None:
            return
        available_tool_names = sorted(_available_runtime_tool_names(loaded) or _COMMON_RUNTIME_TOOL_NAMES)
        response = await provider.chat(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You rewrite uploaded Beaver skills into the required house style. "
                        "Return only JSON with keys: frontmatter, content, change_reason. "
                        "Do not include markdown fences."
                    ),
                },
                {
                    "role": "user",
                    "content": (
                        f"Uploaded filename: {filename}\n"
                        f"Skill name: {draft.skill_name}\n"
                        f"Current frontmatter:\n{json.dumps(draft.proposed_frontmatter, ensure_ascii=False, sort_keys=True)}\n\n"
                        f"Current content:\n{draft.proposed_content}\n\n"
                        f"Available runtime tool names:\n{json.dumps(available_tool_names, ensure_ascii=False)}\n\n"
                        f"{canonical_skill_format_instructions()}\n\n"
                        "Rewrite the skill so it is operational, concrete, and ready for review/publish. "
                        "Infer exact required runtime tools from the uploaded content when the workflow depends on tools. "
                        "Keep frontmatter.tools and the Required Tools section consistent."
                    ),
                },
            ],
            tools=None,
            model=getattr(runtime, "model", None),
            max_tokens=4096,
            temperature=0,
        )
        payload = parse_skill_rewrite_json(response.content or "", skill_name=draft.skill_name)
        if payload is None:
            return
        # Union of: tools the model declared, tools listed in the rewritten
        # body, and tools inferred from the rewritten text.
        payload["frontmatter"]["tools"] = _merge_tool_names(
            payload["frontmatter"].get("tools"),
            extract_required_tool_names(payload["content"]),
            _infer_uploaded_skill_tools(
                skill_name=draft.skill_name,
                filename=filename,
                frontmatter=payload["frontmatter"],
                content=payload["content"],
                loaded=loaded,
            ),
        )
        payload["content"] = ensure_canonical_skill_body(
            payload["content"],
            title=str(payload["frontmatter"].get("name") or draft.skill_name),
            description=str(payload["frontmatter"].get("description") or ""),
            tools=list(payload["frontmatter"].get("tools") or []),
        )
        draft.proposed_frontmatter = payload["frontmatter"]
        draft.proposed_content = payload["content"]
        if payload.get("change_reason"):
            draft.reason = f"{draft.reason}; LLM rewrite: {payload['change_reason']}"
        loaded.skill_spec_store.write_draft(draft)
    except Exception:
        # Deliberately non-fatal, but no longer silent: record why the
        # rewrite was skipped so provider/parse problems can be diagnosed.
        logging.getLogger(__name__).warning(
            "LLM rewrite of uploaded skill draft %r failed; keeping the uploaded content",
            getattr(draft, "skill_name", None),
            exc_info=True,
        )
        return
|
||||
|
||||
|
||||
def _debug_runs_for_session(session_manager: Any, session_id: str) -> list[dict[str, Any]]:
|
||||
grouped: dict[str, list[Any]] = {}
|
||||
run_order: list[str] = []
|
||||
@ -3559,6 +3777,39 @@ def _skill_detail_payload(loaded: Any, name: str, version: str | None) -> dict[s
|
||||
}
|
||||
|
||||
|
||||
def _skill_learning_candidate_payload(loaded: Any, candidate: Any) -> dict[str, Any]:
    """Serialize a learning candidate, enriching its evidence with task text.

    Starts from ``candidate.to_dict()``. When a task text can be resolved for
    the candidate, the payload's ``evidence`` gains ``task_text`` and a
    ``theme`` derived from it, and ``new_skill`` candidates get an
    ``evidence_summary`` naming that theme. Without a task text the
    serialized candidate is returned unchanged.
    """
    payload = candidate.to_dict()
    enriched = dict(payload.get("evidence") or {})
    resolved_text = _skill_learning_candidate_task_text(loaded, candidate)
    if not resolved_text:
        return payload

    theme = SkillLearningService._task_theme(resolved_text)
    enriched["task_text"] = resolved_text
    enriched["theme"] = theme
    payload["evidence"] = enriched
    if candidate.kind == "new_skill":
        payload["evidence_summary"] = f"Theme: {enriched['theme']}"
    return payload
|
||||
|
||||
|
||||
def _skill_learning_candidate_task_text(loaded: Any, candidate: Any) -> str:
|
||||
evidence = candidate.evidence if isinstance(candidate.evidence, dict) else {}
|
||||
task_id = str(evidence.get("task_id") or "").strip()
|
||||
source_run_ids = set(candidate.source_run_ids or [])
|
||||
try:
|
||||
run_store = loaded.skill_learning_pipeline.learning_service.run_store
|
||||
runs = run_store.list_runs()
|
||||
except Exception:
|
||||
return str(evidence.get("task_text") or "").strip()
|
||||
|
||||
if task_id:
|
||||
task_runs = [record for record in runs if record.task_id == task_id]
|
||||
if task_runs:
|
||||
return SkillLearningService._representative_task_text(task_runs)
|
||||
source_runs = [record for record in runs if record.run_id in source_run_ids]
|
||||
if source_runs:
|
||||
return SkillLearningService._representative_task_text(source_runs)
|
||||
return str(evidence.get("task_text") or "").strip()
|
||||
|
||||
|
||||
def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include_reviews: bool = False) -> dict[str, Any]:
|
||||
draft = loaded.skill_learning_pipeline.get_draft(skill_name, draft_id) # type: ignore[union-attr]
|
||||
safety = loaded.skill_learning_pipeline.get_safety_report(skill_name, draft_id) # type: ignore[union-attr]
|
||||
@ -3567,6 +3818,8 @@ def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include
|
||||
**draft.to_dict(),
|
||||
"safety_report": safety.to_dict() if safety is not None else None,
|
||||
"eval_report": eval_report.to_dict() if eval_report is not None else None,
|
||||
"target_version": _skill_draft_target_version(loaded, draft.skill_name, draft.proposal_kind),
|
||||
"base_skill": _skill_draft_base_skill_payload(loaded, draft),
|
||||
}
|
||||
if include_reviews:
|
||||
payload["reviews"] = [
|
||||
@ -3576,6 +3829,45 @@ def _skill_draft_payload(loaded: Any, skill_name: str, draft_id: str, *, include
|
||||
return payload
|
||||
|
||||
|
||||
def _skill_draft_base_skill_payload(loaded: Any, draft: Any) -> dict[str, Any] | None:
|
||||
if draft.proposal_kind == "new_skill" or not draft.base_version:
|
||||
return None
|
||||
store = loaded.skill_learning_pipeline.publisher.store # type: ignore[union-attr]
|
||||
loaded_version = store.read_published_skill(draft.skill_name, draft.base_version)
|
||||
if loaded_version is None:
|
||||
return None
|
||||
version = loaded_version.version
|
||||
return {
|
||||
"skill_name": version.skill_name,
|
||||
"version": version.version,
|
||||
"frontmatter": dict(version.frontmatter),
|
||||
"content": loaded_version.content,
|
||||
"summary": version.summary,
|
||||
"tool_hints": list(version.tool_hints),
|
||||
}
|
||||
|
||||
|
||||
def _skill_draft_target_version(loaded: Any, skill_name: str, proposal_kind: str) -> str | None:
|
||||
if proposal_kind == "retire_skill":
|
||||
return None
|
||||
versions = [
|
||||
item
|
||||
for item in loaded.skill_learning_pipeline.publisher.store.list_versions(skill_name) # type: ignore[union-attr]
|
||||
if isinstance(item, str) and item.startswith("v") and item[1:].isdigit()
|
||||
]
|
||||
if not versions:
|
||||
return "v0001"
|
||||
latest = max(int(item[1:]) for item in versions)
|
||||
return f"v{latest + 1:04d}"
|
||||
|
||||
|
||||
def _skill_learning_candidate_id_for_draft(loaded: Any, skill_name: str, draft_id: str) -> str | None:
|
||||
for candidate in loaded.skill_learning_pipeline.list_candidates(): # type: ignore[union-attr]
|
||||
if candidate.draft_skill_name == skill_name and candidate.draft_id == draft_id:
|
||||
return candidate.candidate_id
|
||||
return None
|
||||
|
||||
|
||||
def _skill_versions_payload(loaded: Any, record: Any) -> list[dict[str, Any]]:
|
||||
if record.source != "workspace":
|
||||
return [
|
||||
|
||||
@ -235,6 +235,12 @@ class SkillDraftEvalReport:
|
||||
confidence: str = "low"
|
||||
case_reports: list[dict[str, Any]] = field(default_factory=list)
|
||||
tool_mode_summary: dict[str, Any] = field(default_factory=dict)
|
||||
ability_score_summary: dict[str, Any] = field(default_factory=dict)
|
||||
tool_execution_summary: dict[str, Any] = field(default_factory=dict)
|
||||
case_selection_summary: dict[str, Any] = field(default_factory=dict)
|
||||
real_score_avg: float | None = None
|
||||
synthetic_score_avg: float | None = None
|
||||
overall_score_avg: float | None = None
|
||||
preservation_report: dict[str, Any] | None = None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
@ -261,6 +267,12 @@ class SkillDraftEvalReport:
|
||||
"confidence": self.confidence,
|
||||
"case_reports": [dict(item) for item in self.case_reports],
|
||||
"tool_mode_summary": dict(self.tool_mode_summary),
|
||||
"ability_score_summary": dict(self.ability_score_summary),
|
||||
"tool_execution_summary": dict(self.tool_execution_summary),
|
||||
"case_selection_summary": dict(self.case_selection_summary),
|
||||
"real_score_avg": self.real_score_avg,
|
||||
"synthetic_score_avg": self.synthetic_score_avg,
|
||||
"overall_score_avg": self.overall_score_avg,
|
||||
"preservation_report": (
|
||||
dict(self.preservation_report) if self.preservation_report is not None else None
|
||||
),
|
||||
@ -295,6 +307,12 @@ class SkillDraftEvalReport:
|
||||
if isinstance(item, dict)
|
||||
],
|
||||
tool_mode_summary=dict(payload.get("tool_mode_summary") or {}),
|
||||
ability_score_summary=dict(payload.get("ability_score_summary") or {}),
|
||||
tool_execution_summary=dict(payload.get("tool_execution_summary") or {}),
|
||||
case_selection_summary=dict(payload.get("case_selection_summary") or {}),
|
||||
real_score_avg=_optional_bounded_float(payload.get("real_score_avg")),
|
||||
synthetic_score_avg=_optional_bounded_float(payload.get("synthetic_score_avg")),
|
||||
overall_score_avg=_optional_bounded_float(payload.get("overall_score_avg")),
|
||||
preservation_report=(
|
||||
dict(payload["preservation_report"])
|
||||
if isinstance(payload.get("preservation_report"), dict)
|
||||
@ -309,6 +327,12 @@ def _optional_str(value: Any) -> str | None:
|
||||
return str(value)
|
||||
|
||||
|
||||
def _optional_bounded_float(value: Any) -> float | None:
|
||||
if value in (None, ""):
|
||||
return None
|
||||
return _bounded_float(value, default=0.0)
|
||||
|
||||
|
||||
def _bounded_float(value: Any, *, default: float = 0.0) -> float:
|
||||
if value in (None, ""):
|
||||
return default
|
||||
|
||||
19
app-instance/backend/beaver/skills/authoring/__init__.py
Normal file
19
app-instance/backend/beaver/skills/authoring/__init__.py
Normal file
@ -0,0 +1,19 @@
|
||||
"""Skill authoring helpers."""
|
||||
|
||||
from .format import (
|
||||
CANONICAL_SKILL_SECTION_HEADINGS,
|
||||
canonical_skill_format_instructions,
|
||||
canonicalize_skill_body,
|
||||
ensure_canonical_skill_body,
|
||||
is_canonical_skill_body,
|
||||
normalize_skill_frontmatter,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CANONICAL_SKILL_SECTION_HEADINGS",
|
||||
"canonical_skill_format_instructions",
|
||||
"canonicalize_skill_body",
|
||||
"ensure_canonical_skill_body",
|
||||
"is_canonical_skill_body",
|
||||
"normalize_skill_frontmatter",
|
||||
]
|
||||
250
app-instance/backend/beaver/skills/authoring/format.py
Normal file
250
app-instance/backend/beaver/skills/authoring/format.py
Normal file
@ -0,0 +1,250 @@
|
||||
"""Canonical Beaver skill authoring format."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from beaver.skills.catalog.utils import extract_required_tool_names
|
||||
|
||||
|
||||
# H2 headings every canonical skill body must contain, in this order.
CANONICAL_SKILL_SECTION_HEADINGS: tuple[str, ...] = tuple(
    f"## {section}"
    for section in (
        "Overview",
        "When to Use",
        "Required Tools",
        "Workflow",
        "Validation",
        "Boundaries",
        "Anti-Patterns",
    )
)
|
||||
|
||||
|
||||
def canonical_skill_format_instructions() -> str:
    """Return the prompt text that describes the canonical SKILL.md format.

    The text is a numbered list of rules; rule 4 is followed by one bullet
    per entry of ``CANONICAL_SKILL_SECTION_HEADINGS``. The result has no
    trailing newline.
    """
    heading_bullets = "\n".join(f"- {heading}" for heading in CANONICAL_SKILL_SECTION_HEADINGS)
    lines = [
        "Canonical Beaver SKILL.md format:",
        "1. Return a frontmatter object with `name`, `description`, and `tools`.",
        "2. `name` must be lowercase kebab-case. `description` must explain when the skill should be used.",
        "3. `tools` must be an explicit JSON array of exact runtime tool names. Use [] only if no tool is required.",
        "4. The Markdown content must start with one H1 title and include these H2 sections in this exact order:",
        heading_bullets,
        "5. Write concrete operational guidance, not a story about a past task.",
        "6. Include validation steps and anti-patterns so future runs know how to avoid false completion.",
    ]
    return "\n".join(lines)
|
||||
|
||||
|
||||
def normalize_skill_frontmatter(frontmatter: dict[str, Any] | None, *, skill_name: str) -> dict[str, Any]:
    """Return a copy of ``frontmatter`` with normalized core fields.

    ``name`` is slugged (falling back to ``skill_name``), ``description``
    falls back to a generic sentence and is stripped, and ``tools`` is
    coerced to a list of strings. String values of the ``always`` and
    ``internal`` keys become booleans. All other keys pass through unchanged,
    after the three core keys.
    """
    source = dict(frontmatter or {})
    slug = _slug(str(source.get("name") or skill_name))
    summary = str(source.get("description") or f"Use when {slug} guidance is needed.").strip()
    result: dict[str, Any] = {
        "name": slug,
        "description": summary,
        "tools": _coerce_string_list(source.get("tools")),
    }

    core_keys = ("name", "description", "tools")
    flag_keys = ("always", "internal")
    truthy_words = ("1", "true", "yes", "on")
    for key, value in source.items():
        if key in core_keys:
            continue
        is_text_flag = key in flag_keys and isinstance(value, str)
        # Textual flags such as "Yes" or "on" are turned into real booleans.
        result[key] = (value.strip().lower() in truthy_words) if is_text_flag else value
    return result
|
||||
|
||||
|
||||
def is_canonical_skill_body(body: str) -> bool:
    """Check whether ``body`` already follows the canonical section layout.

    A body is canonical when it contains an H1 line (``# Title``) and every
    heading in ``CANONICAL_SKILL_SECTION_HEADINGS`` occurs at the start of a
    line, in the listed order. Trailing text after a heading (for example
    ``## When to Use This Skill``) is tolerated.

    Headings are matched at line starts rather than as plain substrings, so
    ``### Overview`` or a heading quoted mid-sentence no longer counts.
    """
    text = body.strip()
    # [ \t]+ instead of \s+: \s can cross a newline, which would let a bare
    # "#" line followed by text on a later line pass as an H1.
    if not re.search(r"^#[ \t]+\S", text, flags=re.MULTILINE):
        return False
    position = 0
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        # (?!\w) stops "## Overview" from matching "## Overviews".
        pattern = re.compile(rf"^{re.escape(heading)}(?!\w)", flags=re.MULTILINE)
        # With a start position, "^" still only matches at real line starts,
        # so each heading must begin a line after the previous match.
        match = pattern.search(text, position)
        if match is None:
            return False
        position = match.end()
    return True
|
||||
|
||||
|
||||
def ensure_canonical_skill_body(
    body: str,
    *,
    title: str,
    description: str = "",
    tools: list[str] | None = None,
) -> str:
    """Return ``body`` in canonical skill layout, rebuilding it when needed.

    A body that is already canonical is stripped, has its Required Tools
    section replaced when ``tools`` is non-empty, and gets a single trailing
    newline. Any other body is wrapped into a generated canonical document
    whose workflow section embeds the compacted original text as source
    guidance.
    """
    if not is_canonical_skill_body(body):
        source = _compact_source_guidance(body)
        return canonicalize_skill_body(
            title=title,
            overview=description or source or f"Use this skill for {title}.",
            tools=list(tools or []),
            workflow=[
                "Identify whether the user's request matches the skill's trigger conditions.",
                "Read the relevant source guidance below and apply only the steps that fit the current task.",
                "Use the required tools deliberately and keep tool output tied to the user's goal.",
            ],
            validation=[
                "Verify the requested outcome with the most direct available check.",
                "Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.",
            ],
            boundaries=[
                "Do not broaden the task beyond the user's request.",
                "Do not use tools that are not listed or clearly available in the current runtime.",
            ],
            anti_patterns=[
                "Do not summarize the skill instead of applying it.",
                "Do not claim completion without validation evidence.",
            ],
            source_guidance=source,
        )

    canonical = body.strip()
    if tools:
        canonical = _replace_required_tools_section(canonical, tools)
    return canonical + "\n"
|
||||
|
||||
|
||||
def canonicalize_skill_body(
    *,
    title: str,
    overview: str,
    tools: list[str] | None = None,
    workflow: list[str] | None = None,
    validation: list[str] | None = None,
    boundaries: list[str] | None = None,
    anti_patterns: list[str] | None = None,
    when_to_use: list[str] | None = None,
    source_guidance: str = "",
) -> str:
    """Render a canonical skill body from its individual sections.

    The document consists of an H1 title followed by the H2 sections Overview,
    When to Use, Required Tools, Workflow, Validation, Boundaries and
    Anti-Patterns. Missing or empty section lists are replaced by a generic
    default bullet; non-blank ``source_guidance`` is appended to the Workflow
    section under a ``### Source Guidance`` heading.
    """
    heading_title = _title(title)

    guidance = source_guidance.strip()
    workflow_text = _bullet_lines(workflow or ["Follow the workflow described by the current task and evidence."])
    if guidance:
        workflow_text += f"\n\n### Source Guidance\n\n{guidance}"

    sections = [
        ("## Overview", overview.strip() or f"Use this skill for {heading_title}."),
        ("## When to Use", _bullet_lines(when_to_use or [f"Use when the task requires {heading_title} guidance."])),
        ("## Required Tools", _tool_lines(tools or [])),
        ("## Workflow", workflow_text),
        ("## Validation", _bullet_lines(validation or ["Validate the result before reporting completion."])),
        ("## Boundaries", _bullet_lines(boundaries or ["Stay within the current task and workspace boundaries."])),
        ("## Anti-Patterns", _bullet_lines(anti_patterns or ["Do not skip validation."])),
    ]

    # Blocks are separated by one blank line; the document ends with a newline.
    blocks = [f"# {heading_title}"]
    blocks.extend(f"{heading}\n\n{content}" for heading, content in sections)
    return "\n\n".join(blocks) + "\n"
|
||||
|
||||
|
||||
def parse_skill_rewrite_json(content: str, *, skill_name: str) -> dict[str, Any] | None:
    """Parse a skill-rewrite JSON reply into normalized frontmatter and body.

    ``content`` must be a JSON object (optionally wrapped in a Markdown code
    fence) with a ``frontmatter`` object and a string ``content``. Tool names
    found in the body's Required Tools section are merged into the frontmatter
    tools, and the body is brought into canonical layout.

    Returns:
        A dict with ``frontmatter``, ``content`` and ``change_reason``, or
        ``None`` when the reply is not valid JSON of the expected shape.
    """
    text = content.strip()
    if text.startswith("```"):
        fenced = text.splitlines()
        # Only unwrap when the fence is closed on its own last line.
        if len(fenced) >= 3 and fenced[0].startswith("```") and fenced[-1].startswith("```"):
            text = "\n".join(fenced[1:-1]).strip()

    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None

    raw_frontmatter = payload.get("frontmatter")
    raw_body = payload.get("content")
    if not (isinstance(raw_frontmatter, dict) and isinstance(raw_body, str)):
        return None

    frontmatter = normalize_skill_frontmatter(raw_frontmatter, skill_name=skill_name)
    frontmatter["tools"] = _merge_string_lists(
        frontmatter.get("tools"),
        extract_required_tool_names(raw_body),
    )
    canonical_body = ensure_canonical_skill_body(
        raw_body,
        title=frontmatter["name"],
        description=frontmatter["description"],
        tools=frontmatter["tools"],
    )
    return {
        "frontmatter": frontmatter,
        "content": canonical_body,
        "change_reason": str(payload.get("change_reason") or ""),
    }
|
||||
|
||||
|
||||
def _compact_source_guidance(body: str, *, max_chars: int = 20000) -> str:
|
||||
text = body.strip()
|
||||
if not text:
|
||||
return ""
|
||||
text = re.sub(r"^---\n.*?\n---\n?", "", text, flags=re.DOTALL).strip()
|
||||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||
text = re.sub(r"^(#{1,4})\s+", r"##\1 ", text, flags=re.MULTILINE)
|
||||
return text[:max_chars].rstrip()
|
||||
|
||||
|
||||
def _tool_lines(tools: list[str]) -> str:
|
||||
if not tools:
|
||||
return "- No dedicated tools are required."
|
||||
return "\n".join(f"- `{tool}`" for tool in tools)
|
||||
|
||||
|
||||
def _bullet_lines(items: list[str]) -> str:
|
||||
cleaned = [str(item).strip() for item in items if str(item).strip()]
|
||||
if not cleaned:
|
||||
return "- No additional guidance."
|
||||
return "\n".join(f"- {item}" for item in cleaned)
|
||||
|
||||
|
||||
def _coerce_string_list(value: Any) -> list[str]:
|
||||
if isinstance(value, list):
|
||||
raw_items = value
|
||||
elif isinstance(value, str):
|
||||
raw_items = value.split(",")
|
||||
else:
|
||||
raw_items = []
|
||||
result: list[str] = []
|
||||
for item in raw_items:
|
||||
cleaned = str(item).strip()
|
||||
if cleaned and cleaned not in result:
|
||||
result.append(cleaned)
|
||||
return result
|
||||
|
||||
|
||||
def _merge_string_lists(*values: Any) -> list[str]:
    """Merge several list-like values into one ordered list of unique strings.

    Every argument is normalized with ``_coerce_string_list``; the first
    occurrence of each string determines its position in the result.
    """
    merged: dict[str, None] = {}
    for value in values:
        for item in _coerce_string_list(value):
            merged.setdefault(item, None)
    return list(merged)
|
||||
|
||||
|
||||
def _replace_required_tools_section(body: str, tools: list[str]) -> str:
    """Replace the first ``## Required Tools`` section of ``body``.

    The section runs from its heading up to the next ``## `` heading or the
    end of the text, and is replaced by a freshly rendered tool list.

    Args:
        body: Skill body in Markdown.
        tools: Tool names to render in the section.

    Returns:
        The stripped body with the section replaced, or the stripped body
        unchanged when it has no Required Tools heading.
    """
    replacement = "## Required Tools\n\n" + _tool_lines(tools) + "\n\n"
    updated, count = re.subn(
        r"(?ms)^##\s+Required\s+Tools\s*\n.*?(?=^##\s+|\Z)",
        # A callable replacement is inserted literally. A plain string would
        # be parsed as a template, so a backslash in a tool name would raise
        # re.error or be misread as an escape or group reference.
        lambda _match: replacement,
        body.strip(),
        count=1,
    )
    return updated.strip() if count else body.strip()
|
||||
|
||||
|
||||
def _slug(value: str) -> str:
|
||||
text = value.strip().lower()
|
||||
text = re.sub(r"[^a-z0-9-]+", "-", text)
|
||||
text = re.sub(r"-{2,}", "-", text).strip("-")
|
||||
return text or "generated-skill"
|
||||
|
||||
|
||||
def _title(value: str) -> str:
|
||||
cleaned = str(value or "").strip().replace("-", " ")
|
||||
return cleaned.title() if cleaned else "Generated Skill"
|
||||
@ -28,12 +28,13 @@ Choose `new_task` when the user asks for anything that needs the main Task agent
|
||||
|
||||
The Intent Agent has no tools. If a request needs a tool, do not apologize and do not say you cannot access it. Route it to Task mode so the main agent can use tools.
|
||||
|
||||
When there is an active task, do not force every new user message into that task. Use the active task and recent conversation to decide:
|
||||
When there is an active task, do not force every new user message into that task. A Session is the durable conversation/device/group context; a Task is one unit of work inside that Session. Use the active task and recent conversation to decide:
|
||||
|
||||
- Choose `revise_task` when the user asks to change, correct, refine, expand, reformat, or redo the latest active task result.
|
||||
- Choose `continue_task` for neutral follow-up questions or additional next steps that still belong to the active task.
|
||||
- Choose `continue_task` for neutral follow-up questions or additional next steps that explicitly depend on or extend the active task's latest result.
|
||||
- Choose `simple_chat` for unrelated lightweight conversation. This starts a new topic and the previous task will be accepted automatically.
|
||||
- Choose `new_task` when the user asks for clearly unrelated work that needs Task capabilities. This starts a new topic and the previous task will be accepted automatically.
|
||||
- Choose `new_task` for a standalone tool-dependent request even when it resembles the active task. Repeating "珠海天气怎么样" later is a fresh task unless the user clearly says to continue or revise the old result.
|
||||
- Choose `close_task` when the user says the task is satisfactory or finished, such as "可以了", "就这样", or "that's good".
|
||||
- Choose `abandon_task` when the user says to stop, cancel, or no longer do the active task.
|
||||
|
||||
@ -46,6 +47,7 @@ Examples with an active weather task:
|
||||
- "再详细一点" -> `revise_task`
|
||||
- "加上明后天穿衣建议" -> `revise_task`
|
||||
- "顺便查一下深圳" -> `continue_task`
|
||||
- "珠海天气怎么样" -> `new_task` when asked as a standalone later request
|
||||
- "帮我写一个采购合同" -> `new_task`
|
||||
- "吃饭没" -> `simple_chat`
|
||||
- "我在冰岛" -> `simple_chat`
|
||||
|
||||
@ -27,6 +27,7 @@ from beaver.skills.specs.storage import SkillSpecStore
|
||||
from .utils import (
|
||||
check_requirements,
|
||||
escape_xml,
|
||||
extract_required_tool_names,
|
||||
get_missing_requirements,
|
||||
parse_frontmatter,
|
||||
parse_skill_metadata_blob,
|
||||
@ -111,13 +112,19 @@ class SkillsLoader:
|
||||
if not include_internal and _truthy(frontmatter.get("internal")):
|
||||
continue
|
||||
normalized_frontmatter = dict(frontmatter)
|
||||
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
|
||||
record = SkillRecord(
|
||||
name=name,
|
||||
path=skill_file,
|
||||
source=source,
|
||||
version="legacy",
|
||||
source_kind=source,
|
||||
tool_hints=self._coerce_tool_names(frontmatter.get("tools")),
|
||||
tool_hints=self._merge_tool_names(
|
||||
self._coerce_tool_names(frontmatter.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("required_tools")),
|
||||
extract_required_tool_names(body),
|
||||
),
|
||||
frontmatter=normalized_frontmatter,
|
||||
description=str(frontmatter.get("description") or summarize_body(body) or name),
|
||||
)
|
||||
@ -138,6 +145,7 @@ class SkillsLoader:
|
||||
path = self.workspace_skills / name / "SKILL.md"
|
||||
else:
|
||||
path = self.workspace_skills / name / "versions" / loaded.version.version / "SKILL.md"
|
||||
_frontmatter, body = parse_frontmatter(loaded.content)
|
||||
record = SkillRecord(
|
||||
name=name,
|
||||
path=path,
|
||||
@ -146,7 +154,10 @@ class SkillsLoader:
|
||||
content_hash=loaded.version.content_hash,
|
||||
source_kind=str(loaded.version.provenance.get("source_kind") or "workspace"),
|
||||
status=str(loaded.version.review_state or "published"),
|
||||
tool_hints=list(loaded.version.tool_hints),
|
||||
tool_hints=self._merge_tool_names(
|
||||
loaded.version.tool_hints,
|
||||
extract_required_tool_names(body),
|
||||
),
|
||||
frontmatter=dict(loaded.version.frontmatter),
|
||||
description=str(loaded.version.frontmatter.get("description") or loaded.version.summary or name),
|
||||
)
|
||||
@ -201,23 +212,32 @@ class SkillsLoader:
|
||||
- read_file
|
||||
- search_files
|
||||
- 兼容 metadata JSON blob 里的 `tools`
|
||||
- 兼容 canonical 正文 `## Required Tools` 段落
|
||||
"""
|
||||
|
||||
record = self._find_record(name)
|
||||
if record is not None and record.tool_hints:
|
||||
return list(record.tool_hints)
|
||||
|
||||
frontmatter = self.get_skill_metadata(name) or {}
|
||||
content = self.load_published_skill(name) or self.load_skill(name) or ""
|
||||
frontmatter, body = parse_frontmatter(content)
|
||||
frontmatter = frontmatter or self.get_skill_metadata(name) or {}
|
||||
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
|
||||
names = [
|
||||
*self._coerce_tool_names(frontmatter.get("tools")),
|
||||
*self._coerce_tool_names(meta_blob.get("tools")),
|
||||
*self._coerce_tool_names(meta_blob.get("required_tools")),
|
||||
]
|
||||
names = self._merge_tool_names(
|
||||
self._coerce_tool_names(frontmatter.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("required_tools")),
|
||||
extract_required_tool_names(body),
|
||||
)
|
||||
return names
|
||||
|
||||
@staticmethod
def _merge_tool_names(*groups: Any) -> list[str]:
    """Merge several groups of tool names into one ordered, unique list.

    Each group is normalized with ``SkillsLoader._coerce_tool_names``; falsy
    names are skipped and the first occurrence of a name fixes its position.
    """
    # The interleaved loop over an undefined ``names`` variable (left over
    # from the removed inline de-duplication) is dropped; only the per-group
    # merge remains.
    result: list[str] = []
    for group in groups:
        for item in SkillsLoader._coerce_tool_names(group):
            if item and item not in result:
                result.append(item)
    return result
|
||||
|
||||
def load_skills_for_context(self, skill_names: list[str]) -> str:
|
||||
|
||||
@ -84,6 +84,41 @@ def strip_frontmatter(content: str) -> str:
|
||||
return body
|
||||
|
||||
|
||||
def extract_required_tool_names(body: str) -> list[str]:
    """Extract tool names from the ``## Required Tools`` section of a skill body.

    This is a tolerant supplement to the frontmatter ``tools`` field. It never
    guesses tools from arbitrary prose: only the explicitly named Required
    Tools section is read, and only its bullet lines (``-`` or ``*``).
    Backtick-quoted names are preferred; otherwise the bullet is split on
    ASCII or full-width commas and the first word of each part is used.

    Placeholder bullets such as ``- No dedicated tools are required.`` or
    ``- None`` are ignored, so a canonical body that declares no tools does
    not yield a bogus tool called ``No``.

    Args:
        body: Skill body in Markdown (without frontmatter).

    Returns:
        Unique tool names in order of first appearance; empty when the section
        is missing or lists no tools.
    """
    if not body:
        return []

    match = re.search(
        r"(?ims)^##\s+Required\s+Tools\s*$\n(?P<section>.*?)(?=^##\s+|\Z)",
        body,
    )
    if match is None:
        return []

    names: list[str] = []
    for line in match.group("section").splitlines():
        stripped = line.strip()
        if not stripped.startswith(("-", "*")):
            continue
        candidate = stripped[1:].strip()
        code_matches = re.findall(r"`([^`]+)`", candidate)
        if not code_matches and candidate:
            # A plain-text bullet that opens with "No"/"None" is the
            # "no tools" placeholder sentence, not a tool list.
            first_word = candidate.split(None, 1)[0].strip(".,:;").lower()
            if first_word in {"no", "none"}:
                continue
        raw_items = code_matches or re.split(r"[,\uff0c]", candidate)
        for raw_item in raw_items:
            name = raw_item.strip().strip("`\"' ")
            if not name:
                continue
            # Keep only the leading word so "tool - description" still works.
            token = name.split()[0].strip("`\"' ::-")
            if re.fullmatch(r"[A-Za-z0-9_.:-]+", token) and token not in names:
                names.append(token)
    return names
|
||||
|
||||
|
||||
def parse_skill_metadata_blob(raw: str) -> dict[str, Any]:
|
||||
"""解析 metadata 字段里的 JSON 扩展配置。
|
||||
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from beaver.engine.context import SkillContext
|
||||
@ -39,7 +41,16 @@ class SkillDraftEvaluator:
|
||||
return self._skipped(candidate, draft)
|
||||
|
||||
runs = self.run_store.list_runs()
|
||||
replay_cases = select_replay_cases(candidate, runs)
|
||||
if replay_runner is not None:
|
||||
replay_cases, case_selection_meta = await _prepare_eval_cases(
|
||||
candidate=candidate,
|
||||
draft=draft,
|
||||
historical_cases=select_replay_cases(candidate, runs),
|
||||
provider_bundle=provider_bundle,
|
||||
)
|
||||
else:
|
||||
replay_cases = []
|
||||
case_selection_meta = {}
|
||||
if replay_runner is not None and replay_cases:
|
||||
return await self._evaluate_replay(
|
||||
candidate=candidate,
|
||||
@ -47,6 +58,7 @@ class SkillDraftEvaluator:
|
||||
replay_cases=replay_cases,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=replay_runner,
|
||||
case_selection_meta=case_selection_meta,
|
||||
)
|
||||
return self._evaluate_heuristic(candidate, draft, runs)
|
||||
|
||||
@ -58,7 +70,7 @@ class SkillDraftEvaluator:
|
||||
) -> SkillDraftEvalReport:
|
||||
runs_by_id = {record.run_id: record for record in runs}
|
||||
cases: list[dict] = []
|
||||
for run_id in candidate.source_run_ids[:8]:
|
||||
for run_id in candidate.source_run_ids[:10]:
|
||||
record = runs_by_id.get(run_id)
|
||||
if record is None:
|
||||
continue
|
||||
@ -116,6 +128,7 @@ class SkillDraftEvaluator:
|
||||
replay_cases: list[dict],
|
||||
provider_bundle: ProviderBundle,
|
||||
replay_runner: ReplayRunner,
|
||||
case_selection_meta: dict[str, Any] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
case_reports: list[dict] = []
|
||||
legacy_cases: list[dict] = []
|
||||
@ -147,17 +160,43 @@ class SkillDraftEvaluator:
|
||||
baseline=baseline,
|
||||
candidate=candidate_arm,
|
||||
)
|
||||
baseline_score = surrogate["baseline_score"]
|
||||
candidate_score = surrogate["candidate_score"]
|
||||
baseline_ability = _ability_score(
|
||||
case=case,
|
||||
arm=baseline,
|
||||
arm_name="baseline",
|
||||
)
|
||||
candidate_ability = _ability_score(
|
||||
case=case,
|
||||
arm=candidate_arm,
|
||||
arm_name="candidate",
|
||||
)
|
||||
baseline_score = baseline_ability["final_score"]
|
||||
candidate_score = candidate_ability["final_score"]
|
||||
tool_execution_score = {
|
||||
"baseline_score": surrogate["baseline_score"],
|
||||
"candidate_score": surrogate["candidate_score"],
|
||||
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
|
||||
"score_role": "diagnostic_only",
|
||||
}
|
||||
case_report = {
|
||||
"run_id": case["run_id"],
|
||||
"task_id": case.get("task_id"),
|
||||
"session_id": case.get("session_id"),
|
||||
"task_text": case.get("task_text"),
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
|
||||
"validator": case.get("validator"),
|
||||
"baseline": baseline,
|
||||
"candidate": candidate_arm,
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
"ability_score": {
|
||||
"baseline": baseline_ability,
|
||||
"candidate": candidate_ability,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
},
|
||||
"tool_execution_score": tool_execution_score,
|
||||
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
|
||||
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
|
||||
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
|
||||
@ -172,13 +211,23 @@ class SkillDraftEvaluator:
|
||||
{
|
||||
"run_id": case["run_id"],
|
||||
"session_id": case.get("session_id") or "",
|
||||
"task_text": case.get("task_text") or "",
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
}
|
||||
)
|
||||
preservation_report = _preservation_report(candidate, draft)
|
||||
return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report)
|
||||
return _report_from_case_reports(
|
||||
candidate,
|
||||
draft,
|
||||
case_reports,
|
||||
legacy_cases,
|
||||
preservation_report,
|
||||
case_selection_meta or {},
|
||||
)
|
||||
|
||||
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
|
||||
return SkillDraftEvalReport(
|
||||
@ -238,22 +287,400 @@ def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -
|
||||
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
|
||||
|
||||
|
||||
async def _prepare_eval_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Assemble up to ten evaluation cases plus metadata about their origin.

    Explicit cases from the candidate evidence come first, followed by the
    historical cases; duplicates and synthetic cases without a validator are
    dropped. When fewer than ten cases remain, synthetic cases are requested
    from the provider and any shortfall is filled with fallback cases.

    Returns:
        The first ten prepared cases and a metadata dict. Note that
        ``generated_synthetic_count`` counts every synthetic case in the full
        prepared list, before it is cut to ten.
    """
    explicit_cases = _explicit_eval_cases(candidate)
    usable, excluded = _filter_unscorable_cases(
        _dedupe_cases([*explicit_cases, *historical_cases])
    )

    synthetic: list[dict[str, Any]] = []
    shortfall = max(0, 10 - len(usable))
    if shortfall:
        raw_generated = await _generate_synthetic_cases(
            candidate=candidate,
            draft=draft,
            historical_cases=usable,
            provider_bundle=provider_bundle,
            count=shortfall,
        )
        synthetic, dropped = _filter_unscorable_cases(raw_generated)
        excluded["synthetic_without_validator"] += dropped["synthetic_without_validator"]
        still_missing = shortfall - len(synthetic)
        if still_missing > 0:
            # Fallback cases continue the numbering of the generated ones.
            synthetic.extend(
                _fallback_synthetic_cases(
                    candidate=candidate,
                    historical_cases=usable,
                    start_index=len(synthetic) + 1,
                    count=still_missing,
                )
            )

    prepared = usable + synthetic
    selection_meta = {
        "requested_case_count": 10,
        "historical_case_count": len(historical_cases),
        "explicit_case_count": len(explicit_cases),
        "generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
        "excluded_synthetic_without_validator": excluded["synthetic_without_validator"],
    }
    return prepared[:10], selection_meta
|
||||
|
||||
|
||||
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
    """Build evaluation cases from ``candidate.evidence["eval_cases"]``.

    Entries that are not dicts or have no task text are skipped. Missing ids
    are generated from the entry's 1-based position in the raw list, the
    accepted score is clamped to ``[0, 1]`` (default ``0.75``), and a
    ``validator`` is copied over only when it is a dict.
    """
    evidence = candidate.evidence
    raw_cases = evidence.get("eval_cases") if isinstance(evidence, dict) else None
    if not isinstance(raw_cases, list):
        return []

    cases: list[dict[str, Any]] = []
    for position, entry in enumerate(raw_cases, start=1):
        if not isinstance(entry, dict):
            continue
        task_text = str(entry.get("task_text") or "").strip()
        if not task_text:
            continue

        is_synthetic = bool(entry.get("synthetic"))
        case = {
            "run_id": str(entry.get("run_id") or f"explicit:{candidate.candidate_id}:{position:02d}"),
            "task_id": entry.get("task_id") or f"explicit-{position:02d}",
            "session_id": entry.get("session_id") or "explicit-eval",
            "task_text": task_text,
            "baseline_skill_names": list(entry.get("baseline_skill_names") or _baseline_skill_names(candidate)),
            "candidate_skill_name": entry.get("candidate_skill_name") or candidate.draft_skill_name,
            "accepted_score": _bounded_score(entry.get("accepted_score"), default=0.75),
            "synthetic": is_synthetic,
            "tier": entry.get("tier") or ("bronze" if is_synthetic else "gold"),
        }
        validator = entry.get("validator")
        if isinstance(validator, dict):
            case["validator"] = dict(validator)
        cases.append(case)
    return cases
|
||||
|
||||
|
||||
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
result: list[dict[str, Any]] = []
|
||||
seen: set[str] = set()
|
||||
for case in cases:
|
||||
run_id = str(case.get("run_id") or "")
|
||||
task_text = str(case.get("task_text") or "")
|
||||
key = run_id or task_text
|
||||
if not key or key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
result.append(case)
|
||||
return result
|
||||
|
||||
|
||||
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
|
||||
result: list[dict[str, Any]] = []
|
||||
excluded = {"synthetic_without_validator": 0}
|
||||
for case in cases:
|
||||
if case.get("synthetic") and not isinstance(case.get("validator"), dict):
|
||||
excluded["synthetic_without_validator"] += 1
|
||||
continue
|
||||
result.append(case)
|
||||
return result, excluded
|
||||
|
||||
|
||||
async def _generate_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
    count: int,
) -> list[dict[str, Any]]:
    """Ask the model for ``count`` validator-first synthetic evaluation cases.

    The auxiliary provider/runtime is preferred and the main one is used as a
    fallback. Generation is best-effort: any provider failure or unusable
    reply yields an empty list so the caller can fall back to locally built
    cases.

    Returns:
        At most ``count`` normalized synthetic case dicts, numbered from 1.
    """
    import logging

    provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
    runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
    model = getattr(runtime, "model", None)
    try:
        response = await provider.chat(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You generate validator-first Beaver skill evaluation cases. "
                        "Return only JSON with key cases. Each case must include task_text and validator. "
                        "Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
                    ),
                },
                {
                    "role": "user",
                    "content": _synthetic_case_prompt(
                        candidate=candidate,
                        draft=draft,
                        historical_cases=historical_cases,
                        count=count,
                    ),
                },
            ],
            model=model,
            max_tokens=2200,
            temperature=0.4,
        )
    except Exception:
        # Still best-effort, but leave a trace: otherwise a broken provider
        # silently degrades every evaluation to fallback cases.
        logging.getLogger(__name__).warning(
            "Synthetic eval case generation failed; using fallback cases.",
            exc_info=True,
        )
        return []

    # Tolerate a reply object without usable text instead of raising.
    payload = _parse_json_payload(getattr(response, "content", None) or "")
    raw_cases = payload.get("cases") if isinstance(payload, dict) else None
    if not isinstance(raw_cases, list):
        return []
    return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count)
|
||||
|
||||
|
||||
def _synthetic_case_prompt(
|
||||
*,
|
||||
candidate: SkillLearningCandidate,
|
||||
draft: SkillDraft,
|
||||
historical_cases: list[dict[str, Any]],
|
||||
count: int,
|
||||
) -> str:
|
||||
historical = [
|
||||
{
|
||||
"run_id": item.get("run_id"),
|
||||
"task_text": item.get("task_text"),
|
||||
"validator": item.get("validator"),
|
||||
}
|
||||
for item in historical_cases
|
||||
]
|
||||
return (
|
||||
f"Generate {count} synthetic evaluation cases for this skill draft.\n\n"
|
||||
f"Candidate kind: {candidate.kind}\n"
|
||||
f"Candidate reason: {candidate.reason}\n"
|
||||
f"Draft skill name: {draft.skill_name}\n"
|
||||
f"Related skills: {candidate.related_skill_names}\n"
|
||||
f"Historical cases:\n{json.dumps(historical, ensure_ascii=False)}\n\n"
|
||||
"Every synthetic case must be validator-first. Return exactly:\n"
|
||||
'{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",'
|
||||
'"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}'
|
||||
)
|
||||
|
||||
|
||||
def _parse_json_payload(content: str) -> dict[str, Any]:
|
||||
cleaned = content.strip()
|
||||
if cleaned.startswith("```"):
|
||||
cleaned = cleaned.strip("`")
|
||||
if cleaned.startswith("json"):
|
||||
cleaned = cleaned[4:]
|
||||
try:
|
||||
payload = json.loads(cleaned)
|
||||
except json.JSONDecodeError:
|
||||
start = cleaned.find("{")
|
||||
end = cleaned.rfind("}")
|
||||
if start < 0 or end <= start:
|
||||
return {}
|
||||
try:
|
||||
payload = json.loads(cleaned[start : end + 1])
|
||||
except json.JSONDecodeError:
|
||||
return {}
|
||||
return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def _synthetic_case_payloads(
    candidate: SkillLearningCandidate,
    raw_cases: list[Any],
    *,
    start_index: int,
    limit: int,
) -> list[dict[str, Any]]:
    """Convert raw model-provided cases into normalized synthetic cases.

    Entries without task text or without a dict validator are skipped.
    Accepted entries are numbered consecutively from ``start_index`` and
    collection stops once ``limit`` cases have been gathered.
    """
    payloads: list[dict[str, Any]] = []
    for entry in raw_cases:
        if not isinstance(entry, dict):
            continue
        task_text = str(entry.get("task_text") or "").strip()
        validator = entry.get("validator")
        if task_text and isinstance(validator, dict):
            payloads.append(
                _synthetic_case_payload(
                    candidate,
                    task_text,
                    start_index + len(payloads),
                    validator=dict(validator),
                    tier=str(entry.get("tier") or "bronze"),
                )
            )
            # The limit is checked only after an entry has been accepted.
            if len(payloads) >= limit:
                break
    return payloads
|
||||
|
||||
|
||||
def _fallback_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    historical_cases: list[dict[str, Any]],
    start_index: int,
    count: int,
) -> list[dict[str, Any]]:
    """Build ``count`` locally generated synthetic cases without a model.

    One seed text is chosen for all cases: the task text of a historical case
    selected by ``start_index``, else the candidate's reason or draft skill
    name. Its first two terms longer than three characters become the
    validator's required terms (``["done"]`` when there are none).
    """
    seed_text = ""
    if historical_cases:
        seed_case = historical_cases[(start_index - 1) % len(historical_cases)]
        seed_text = str(seed_case.get("task_text") or "")
    if not seed_text:
        seed_text = candidate.reason or candidate.draft_skill_name or "the candidate skill"

    required_terms = _terms(seed_text)[:2] or ["done"]

    cases: list[dict[str, Any]] = []
    for index in range(start_index, start_index + count):
        cases.append(
            _synthetic_case_payload(
                candidate,
                f"Complete a realistic task related to {seed_text}. Scenario {index}.",
                index,
                validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []},
                tier="bronze",
            )
        )
    return cases
|
||||
|
||||
|
||||
def _synthetic_case_payload(
    candidate: SkillLearningCandidate,
    task_text: str,
    index: int,
    *,
    validator: dict[str, Any],
    tier: str,
) -> dict[str, Any]:
    """Create one synthetic evaluation case dict.

    Run and task ids are derived from the candidate id and the two-digit
    ``index``; the accepted score is fixed at ``0.75`` and the case is flagged
    as synthetic.
    """
    ordinal = f"{index:02d}"
    payload: dict[str, Any] = {
        "run_id": f"synthetic:{candidate.candidate_id}:{ordinal}",
        "task_id": f"synthetic-{ordinal}",
        "session_id": "synthetic-eval",
        "task_text": task_text,
    }
    payload["baseline_skill_names"] = _baseline_skill_names(candidate)
    payload["candidate_skill_name"] = candidate.draft_skill_name
    payload.update(accepted_score=0.75, synthetic=True, tier=tier, validator=validator)
    return payload
|
||||
|
||||
|
||||
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
|
||||
if candidate.kind == "revise_skill":
|
||||
return list(candidate.related_skill_names[:1])
|
||||
if candidate.kind == "merge_skills":
|
||||
return list(candidate.related_skill_names)
|
||||
return []
|
||||
|
||||
|
||||
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
    """Score one arm (``"baseline"`` or ``"candidate"``) of an evaluation case.

    Cases with a dict validator are scored by that validator. Real cases
    without one use the case's accepted score for the baseline arm and an
    output-based score for the other arm. Synthetic cases without a validator
    are reported as unscored with a score of ``0.0``.
    """
    validator = case.get("validator")
    if isinstance(validator, dict):
        return _ability_from_validator(validator, arm)

    if case.get("synthetic"):
        return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])

    # NOTE(review): the output-based score is at most 0.7 while the accepted
    # score defaults to 0.75, so the non-baseline arm looks unable to match a
    # default baseline on validator-less real cases -- confirm this is intended.
    if arm_name == "baseline":
        score = _bounded_score(case.get("accepted_score"), default=0.75)
        source = "user_feedback"
    else:
        score = _ability_from_output(arm)["final_score"]
        source = "llm_judge"
    return _ability_breakdown(score=score, source=source)
|
||||
|
||||
|
||||
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
    """Score an arm against a ``final_answer_contains`` validator.

    The outcome is the fraction of required terms found in the lower-cased
    final answer (``0.0`` when no terms are required); safety is ``0.0`` if
    any forbidden term occurs, else ``1.0``. The final score weights outcome
    (0.40), artifact (0.25, same value as outcome), process validity (0.15),
    safety (0.10) and path efficiency (0.10). Unsupported validator types fall
    back to the output-based score with an explanatory note.

    Returns:
        An ability breakdown dict whose component fields hold the individual
        component values, plus ``validator_type``.
    """
    validator_type = str(validator.get("type") or "")
    if validator_type != "final_answer_contains":
        return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])

    answer = str(arm.get("final_answer") or "").lower()
    required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
    forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]

    matched = sum(1 for term in required_terms if term in answer)
    outcome = matched / max(1, len(required_terms))
    safety = 0.0 if any(term in answer for term in forbidden_terms) else 1.0
    process = _process_validity(arm)
    efficiency = _path_efficiency(arm, outcome)
    final_score = (
        0.40 * outcome
        + 0.25 * outcome
        + 0.15 * process
        + 0.10 * safety
        + 0.10 * efficiency
    )
    return {
        **_ability_breakdown(score=final_score, source="auto_validator"),
        "outcome_correctness": round(outcome, 4),
        "artifact_correctness": round(outcome, 4),
        # Report the real component values; otherwise these two fields would
        # repeat the blended final score filled in by _ability_breakdown.
        "process_validity": round(process, 4),
        "safety_no_regression": round(safety, 4),
        "path_efficiency": round(efficiency, 4),
        "validator_type": validator_type,
    }
|
||||
|
||||
|
||||
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
    """Derive a coarse score from the arm's output alone.

    The score is ``0.7`` when the arm produced a non-blank final answer and
    did not finish with ``"error"``, otherwise ``0.3``.
    """
    has_answer = bool(str(arm.get("final_answer") or "").strip())
    succeeded = has_answer and arm.get("finish_reason") != "error"
    if succeeded:
        score = 0.7
    else:
        score = 0.3
    return _ability_breakdown(score=score, source=source, notes=notes)
|
||||
|
||||
|
||||
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    """Create an ability report in which every component equals ``score``.

    The score is clamped to ``[0, 1]``; ``final_score`` is additionally
    rounded to four decimals. ``notes`` is copied into a new list.
    """
    clamped = _bounded_score(score, default=0.0)
    components = (
        "outcome_correctness",
        "artifact_correctness",
        "process_validity",
        "safety_no_regression",
        "path_efficiency",
    )
    breakdown: dict[str, Any] = dict.fromkeys(components, clamped)
    breakdown["final_score"] = round(clamped, 4)
    breakdown["source"] = source
    breakdown["notes"] = list(notes or [])
    return breakdown
|
||||
|
||||
|
||||
def _process_validity(arm: dict[str, Any]) -> float:
|
||||
if arm.get("finish_reason") == "error":
|
||||
return 0.2
|
||||
return 0.8 if arm.get("tool_calls") else 0.6
|
||||
|
||||
|
||||
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
|
||||
if outcome < 0.5:
|
||||
return 0.3
|
||||
call_count = len([item for item in arm.get("tool_calls") or [] if isinstance(item, dict)])
|
||||
if call_count <= 3:
|
||||
return 1.0
|
||||
if call_count <= 6:
|
||||
return 0.7
|
||||
return 0.4
|
||||
|
||||
|
||||
def _bounded_score(value: Any, *, default: float) -> float:
|
||||
try:
|
||||
return max(0.0, min(1.0, float(value)))
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _terms(text: str) -> list[str]:
|
||||
return [part.strip(".,:;!?()[]{}").lower() for part in text.split() if len(part.strip(".,:;!?()[]{}")) > 3]
|
||||
|
||||
|
||||
def _report_from_case_reports(
|
||||
candidate: SkillLearningCandidate,
|
||||
draft: SkillDraft,
|
||||
case_reports: list[dict],
|
||||
legacy_cases: list[dict],
|
||||
preservation_report: dict | None,
|
||||
case_selection_meta: dict[str, Any] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
|
||||
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
|
||||
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
|
||||
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
|
||||
unchanged = len(legacy_cases) - len(regressions) - len(improved)
|
||||
real_cases = [item for item in legacy_cases if not item.get("synthetic")]
|
||||
synthetic_cases = [item for item in legacy_cases if item.get("synthetic")]
|
||||
execution, surrogate, blocked = _coverage(case_reports)
|
||||
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
|
||||
score_delta = candidate_avg - baseline_avg
|
||||
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
|
||||
selection_meta = dict(case_selection_meta or {})
|
||||
real_score_avg = _avg([item["candidate_score"] for item in real_cases])
|
||||
synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
|
||||
overall_score_avg = round(candidate_avg, 4)
|
||||
ability_summary = {
|
||||
"score_role": "primary",
|
||||
"real_case_count": len(real_cases),
|
||||
"synthetic_case_count": len(synthetic_cases),
|
||||
"real_score_avg": real_score_avg,
|
||||
"synthetic_score_avg": synthetic_score_avg,
|
||||
"overall_score_avg": overall_score_avg,
|
||||
}
|
||||
tool_execution_summary = {
|
||||
"score_role": "diagnostic_only",
|
||||
"executed": execution,
|
||||
"surrogate": surrogate,
|
||||
"blocked": blocked,
|
||||
}
|
||||
return SkillDraftEvalReport(
|
||||
report_id=uuid4().hex,
|
||||
skill_name=draft.skill_name,
|
||||
@ -276,11 +703,34 @@ def _report_from_case_reports(
|
||||
blocked_coverage=blocked,
|
||||
confidence=confidence,
|
||||
case_reports=case_reports,
|
||||
tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
|
||||
tool_mode_summary={
|
||||
"executed": execution,
|
||||
"surrogate": surrogate,
|
||||
"blocked": blocked,
|
||||
"score_role": "diagnostic_only",
|
||||
"real_case_count": len(real_cases),
|
||||
"synthetic_case_count": len(synthetic_cases),
|
||||
"real_score_avg": real_score_avg,
|
||||
"synthetic_score_avg": synthetic_score_avg,
|
||||
"overall_score_avg": overall_score_avg,
|
||||
**selection_meta,
|
||||
},
|
||||
ability_score_summary=ability_summary,
|
||||
tool_execution_summary=tool_execution_summary,
|
||||
case_selection_summary=selection_meta,
|
||||
real_score_avg=real_score_avg,
|
||||
synthetic_score_avg=synthetic_score_avg,
|
||||
overall_score_avg=overall_score_avg,
|
||||
preservation_report=preservation_report,
|
||||
)
|
||||
|
||||
|
||||
def _avg(values: list[float]) -> float | None:
|
||||
if not values:
|
||||
return None
|
||||
return round(sum(values) / len(values), 4)
|
||||
|
||||
|
||||
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
|
||||
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
|
||||
for report in case_reports:
|
||||
|
||||
@ -323,8 +323,8 @@ class SkillLearningPipelineService:
|
||||
|
||||
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
|
||||
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
|
||||
if not any(review.status == SkillReviewState.APPROVED.value for review in reviews):
|
||||
raise ValueError("Draft must have an approved review before publish")
|
||||
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
|
||||
raise ValueError("Draft must be submitted for review before publish")
|
||||
safety = self.get_safety_report(draft.skill_name, draft.draft_id)
|
||||
if safety is None:
|
||||
raise ValueError("Draft requires a passing safety report before publish")
|
||||
|
||||
@ -162,18 +162,23 @@ class ReplayRunner:
|
||||
registry=loaded.tool_registry,
|
||||
policy=self.policy,
|
||||
)
|
||||
result = await self.agent_loop.process_direct(
|
||||
request.task_text,
|
||||
provider_bundle=request.provider_bundle,
|
||||
include_skill_assembly=False,
|
||||
include_tools=True,
|
||||
pinned_skill_names=request.pinned_skill_names,
|
||||
pinned_skill_contexts=request.pinned_skill_contexts,
|
||||
max_tool_iterations=int(request.model_settings.get("max_tool_iterations") or 4),
|
||||
temperature=float(request.model_settings.get("temperature") or 0.0),
|
||||
source="skill_replay_eval",
|
||||
tool_executor_override=replay_executor,
|
||||
)
|
||||
direct_kwargs = {
|
||||
"provider_bundle": request.provider_bundle,
|
||||
"include_skill_assembly": False,
|
||||
"include_tools": True,
|
||||
"pinned_skill_names": request.pinned_skill_names,
|
||||
"pinned_skill_contexts": request.pinned_skill_contexts,
|
||||
"max_tool_iterations": int(request.model_settings.get("max_tool_iterations") or 4),
|
||||
"temperature": float(request.model_settings.get("temperature") or 0.0),
|
||||
"source": "skill_replay_eval",
|
||||
"tool_executor_override": replay_executor,
|
||||
}
|
||||
try:
|
||||
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
|
||||
except RuntimeError as exc:
|
||||
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
|
||||
raise
|
||||
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
|
||||
return {
|
||||
"case_id": request.case_id,
|
||||
"arm": request.arm,
|
||||
@ -188,6 +193,14 @@ class ReplayRunner:
|
||||
}
|
||||
|
||||
|
||||
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
|
||||
message = str(exc)
|
||||
return (
|
||||
"AgentLoop.process_direct() is disabled while run() is active" in message
|
||||
and "submit tasks via submit_direct() instead" in message
|
||||
)
|
||||
|
||||
|
||||
def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
effects: list[dict[str, Any]] = []
|
||||
for trace in traces:
|
||||
|
||||
@ -99,6 +99,7 @@ class SkillLearningService:
|
||||
]
|
||||
source_run_ids = [record.run_id for record in source_runs]
|
||||
source_session_ids = list(dict.fromkeys(record.session_id for record in source_runs))
|
||||
representative_task_text = self._representative_task_text(source_runs, fallback=final_run.task_text)
|
||||
|
||||
if not published_receipts:
|
||||
candidates.append(
|
||||
@ -113,7 +114,8 @@ class SkillLearningService:
|
||||
"task_id": task_id,
|
||||
"final_accepted_run_id": final_accepted_run_id,
|
||||
"source_run_ids": source_run_ids,
|
||||
"theme": self._task_theme(final_run.task_text),
|
||||
"task_text": representative_task_text,
|
||||
"theme": self._task_theme(representative_task_text),
|
||||
},
|
||||
status="open",
|
||||
priority=1,
|
||||
@ -329,8 +331,14 @@ class SkillLearningService:
|
||||
|
||||
def _build_new_skill_candidates(self) -> list[SkillLearningCandidate]:
|
||||
groups: dict[str, list[RunRecord]] = {}
|
||||
for record in self.run_store.list_runs():
|
||||
key = self._task_theme(record.task_text)
|
||||
all_runs = self.run_store.list_runs()
|
||||
runs_by_task: dict[str, list[RunRecord]] = {}
|
||||
for record in all_runs:
|
||||
if record.task_id:
|
||||
runs_by_task.setdefault(record.task_id, []).append(record)
|
||||
for record in all_runs:
|
||||
task_runs = runs_by_task.get(record.task_id, [record])
|
||||
key = self._task_theme(self._representative_task_text(task_runs, fallback=record.task_text))
|
||||
if not key:
|
||||
continue
|
||||
groups.setdefault(key, []).append(record)
|
||||
@ -443,12 +451,24 @@ class SkillLearningService:
|
||||
|
||||
@staticmethod
|
||||
def _task_theme(task_text: str) -> str:
|
||||
cleaned = re.sub(r"\s+", " ", task_text.strip().lower())
|
||||
cleaned = re.sub(r"\s+", " ", task_text.strip())
|
||||
if not cleaned:
|
||||
return ""
|
||||
words = cleaned.split(" ")
|
||||
first_sentence = re.split(r"[。!?.!?]", cleaned, maxsplit=1)[0].strip()
|
||||
if not first_sentence:
|
||||
first_sentence = cleaned
|
||||
words = first_sentence.split(" ")
|
||||
return " ".join(words[:8]).strip()
|
||||
|
||||
@staticmethod
|
||||
def _representative_task_text(runs: list[RunRecord], *, fallback: str = "") -> str:
|
||||
ordered = sorted(runs, key=lambda item: (item.attempt_index, item.started_at, item.run_id))
|
||||
for record in ordered:
|
||||
text = record.task_text.strip()
|
||||
if text:
|
||||
return text
|
||||
return fallback.strip()
|
||||
|
||||
@staticmethod
|
||||
def _suggest_skill_name(
|
||||
candidate: SkillLearningCandidate,
|
||||
|
||||
@ -15,12 +15,15 @@ class SurrogateToolEvaluator:
|
||||
return {
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"baseline_tool_execution_score": baseline_score,
|
||||
"candidate_tool_execution_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
"surrogate_tool_count": surrogate_count,
|
||||
"blocked_tool_count": blocked_count,
|
||||
"score_role": "diagnostic_only",
|
||||
"confidence": confidence,
|
||||
"notes": [
|
||||
"Surrogate score is based on intended tool calls, schemas, arguments, and task relevance.",
|
||||
"Tool execution score is diagnostic only and is not the main task ability score.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ import json
|
||||
from typing import Any
|
||||
|
||||
from beaver.engine.providers.base import LLMProvider
|
||||
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
|
||||
from beaver.skills.learning.evidence import EvidencePacket
|
||||
from beaver.memory.skills.models import SkillLearningCandidate
|
||||
|
||||
@ -58,7 +59,8 @@ class SkillDraftSynthesizer:
|
||||
"content": (
|
||||
"You synthesize Beaver skill drafts from execution evidence. "
|
||||
"Return only JSON with keys: frontmatter, content, change_reason, "
|
||||
"preserved_sections, changed_sections, dropped_sections."
|
||||
"preserved_sections, changed_sections, dropped_sections. "
|
||||
"The content must follow the Canonical Beaver SKILL.md format."
|
||||
),
|
||||
},
|
||||
{"role": "user", "content": prompt},
|
||||
@ -113,6 +115,7 @@ class SkillDraftSynthesizer:
|
||||
+ "\n- tools: an explicit JSON array of exact tool names this skill needs. "
|
||||
+ "Prefer called tool names when the workflow depends on them; use run-selected tool names only when clearly required. "
|
||||
+ "Use [] only when no tool is required."
|
||||
+ "\n\n" + canonical_skill_format_instructions()
|
||||
+ "\nThe JSON may include preserved_sections, changed_sections, and dropped_sections arrays."
|
||||
)
|
||||
|
||||
@ -144,14 +147,23 @@ class SkillDraftSynthesizer:
|
||||
|
||||
@staticmethod
|
||||
def _normalize_payload(payload: dict[str, Any], evidence_packet: EvidencePacket) -> dict[str, Any]:
|
||||
frontmatter = dict(payload.get("frontmatter") or {})
|
||||
frontmatter = normalize_skill_frontmatter(
|
||||
dict(payload.get("frontmatter") or {}),
|
||||
skill_name=str((payload.get("frontmatter") or {}).get("name") or "generated-skill"),
|
||||
)
|
||||
tool_hints = _coerce_string_list(frontmatter.get("tools"))
|
||||
if not tool_hints:
|
||||
tool_hints = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
|
||||
frontmatter["tools"] = tool_hints
|
||||
content = ensure_canonical_skill_body(
|
||||
str(payload.get("content") or "").strip(),
|
||||
title=str(frontmatter.get("name") or "generated-skill"),
|
||||
description=str(frontmatter.get("description") or ""),
|
||||
tools=tool_hints,
|
||||
)
|
||||
return {
|
||||
"frontmatter": frontmatter,
|
||||
"content": str(payload.get("content") or "").strip(),
|
||||
"content": content,
|
||||
"change_reason": str(payload.get("change_reason") or ""),
|
||||
"preserved_sections": _coerce_string_list(payload.get("preserved_sections")),
|
||||
"changed_sections": _coerce_string_list(payload.get("changed_sections")),
|
||||
@ -162,13 +174,20 @@ class SkillDraftSynthesizer:
|
||||
def _fallback_payload(candidate: SkillLearningCandidate, evidence_packet: EvidencePacket, action: str) -> dict[str, Any]:
|
||||
related = candidate.related_skill_names[0] if candidate.related_skill_names else "generated-skill"
|
||||
title = related.replace("_", "-")
|
||||
content = "\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured."
|
||||
tools = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
|
||||
content = ensure_canonical_skill_body(
|
||||
"\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured.",
|
||||
title=title,
|
||||
description=candidate.reason or f"Auto-generated {action} draft for {title}.",
|
||||
tools=tools,
|
||||
)
|
||||
return {
|
||||
"frontmatter": {
|
||||
"name": title,
|
||||
"description": candidate.reason or f"Auto-generated {action} draft for {title}.",
|
||||
"tools": _coerce_string_list(evidence_packet.metadata.get("tool_names")),
|
||||
"tools": tools,
|
||||
},
|
||||
"content": f"# {title}\n\n## Evidence\n\n{content}\n",
|
||||
"content": content,
|
||||
"change_reason": candidate.reason or f"Fallback {action} synthesis.",
|
||||
"preserved_sections": [],
|
||||
"changed_sections": [],
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import Callable
|
||||
from beaver.engine.providers import ProviderBundle
|
||||
from beaver.memory.skills import SkillLearningCandidate
|
||||
from beaver.skills.learning.pipeline import SkillLearningPipelineService
|
||||
from beaver.skills.learning.replay import ReplayRunner
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@ -57,10 +58,12 @@ class SkillLearningWorker:
|
||||
*,
|
||||
pipeline: SkillLearningPipelineService,
|
||||
provider_bundle_factory: Callable[[], ProviderBundle],
|
||||
replay_runner_factory: Callable[[], ReplayRunner] | None = None,
|
||||
config: SkillLearningWorkerConfig | None = None,
|
||||
) -> None:
|
||||
self.pipeline = pipeline
|
||||
self.provider_bundle_factory = provider_bundle_factory
|
||||
self.replay_runner_factory = replay_runner_factory
|
||||
self.config = config or SkillLearningWorkerConfig.from_env()
|
||||
self._running = False
|
||||
self._lock = asyncio.Lock()
|
||||
@ -126,6 +129,7 @@ class SkillLearningWorker:
|
||||
draft.skill_name,
|
||||
draft.draft_id,
|
||||
provider_bundle=self.provider_bundle_factory(),
|
||||
replay_runner=self.replay_runner_factory() if self.replay_runner_factory is not None else None,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
@ -16,8 +16,8 @@ class SkillPublisher:
|
||||
|
||||
def publish(self, skill_name: str, draft_id: str, publisher: str, notes: str = "") -> SkillVersion:
|
||||
draft = self._require_draft(skill_name, draft_id)
|
||||
if draft.status != SkillReviewState.APPROVED.value:
|
||||
raise ValueError("Draft must be approved before publish")
|
||||
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
|
||||
raise ValueError("Draft must be submitted for review before publish")
|
||||
if draft.proposal_kind == "retire_skill":
|
||||
raise ValueError("Retire proposals must be applied through apply_retire_proposal")
|
||||
|
||||
@ -81,8 +81,8 @@ class SkillPublisher:
|
||||
|
||||
def apply_retire_proposal(self, skill_name: str, draft_id: str, actor: str, notes: str = "") -> SkillSpec:
|
||||
draft = self._require_draft(skill_name, draft_id)
|
||||
if draft.status != SkillReviewState.APPROVED.value:
|
||||
raise ValueError("Retire proposal must be approved before apply")
|
||||
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
|
||||
raise ValueError("Retire proposal must be submitted for review before apply")
|
||||
if draft.proposal_kind != "retire_skill":
|
||||
raise ValueError("Only retire_skill proposals can be applied as retire proposals")
|
||||
|
||||
|
||||
@ -25,7 +25,11 @@ class MainAgentRouter:
|
||||
timeout_seconds: float = 8.0,
|
||||
) -> MainAgentDecision:
|
||||
if provider is None:
|
||||
return self._fallback(active_task=active_task, reason="router_provider_unavailable")
|
||||
return self._apply_active_task_boundary(
|
||||
self._fallback(active_task=active_task, reason="router_provider_unavailable"),
|
||||
message=message,
|
||||
active_task=active_task,
|
||||
)
|
||||
chat_kwargs: dict[str, Any] = {
|
||||
"messages": [
|
||||
{
|
||||
@ -58,10 +62,18 @@ class MainAgentRouter:
|
||||
for attempt_timeout in (timeout_seconds, 12.0):
|
||||
try:
|
||||
response = await asyncio.wait_for(provider.chat(**chat_kwargs), timeout=attempt_timeout)
|
||||
return self.from_json(response.content or "", active_task=active_task)
|
||||
return self._apply_active_task_boundary(
|
||||
self.from_json(response.content or "", active_task=active_task),
|
||||
message=message,
|
||||
active_task=active_task,
|
||||
)
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
return self._fallback(active_task=active_task, reason=f"router_failed: {last_error}")
|
||||
return self._apply_active_task_boundary(
|
||||
self._fallback(active_task=active_task, reason=f"router_failed: {last_error}"),
|
||||
message=message,
|
||||
active_task=active_task,
|
||||
)
|
||||
|
||||
def from_json(self, text: str, *, active_task: TaskRecord | None = None) -> MainAgentDecision:
|
||||
payload = self._parse_json_object(text)
|
||||
@ -121,6 +133,31 @@ class MainAgentRouter:
|
||||
return MainAgentDecision(mode="task", reason=reason, action="continue_task")
|
||||
return MainAgentDecision(mode="simple", reason=reason, action="simple_chat")
|
||||
|
||||
def _apply_active_task_boundary(
    self,
    decision: MainAgentDecision,
    *,
    message: str,
    active_task: TaskRecord | None,
) -> MainAgentDecision:
    """Override a ``continue_task`` decision for fresh standalone requests.

    The decision is returned unchanged unless all of the following hold:
    there is an active task, the decision's action is ``"continue_task"``,
    the message looks like a fresh task request, and it does not look like
    an explicit follow-up. In that case a new ``create_task`` decision is
    returned, titled with the decision's ``short_title`` or, failing that,
    the active task's ``short_title`` metadata.
    """
    if active_task is None or decision.action != "continue_task":
        return decision

    # Short-circuit keeps the follow-up check from running for messages
    # that are not fresh task requests.
    standalone_request = _looks_like_fresh_task_request(message) and not _looks_like_explicit_task_followup(message)
    if not standalone_request:
        return decision

    boundary_reason = (
        "fresh standalone task request in the same session; "
        "do not attach it to the active task without explicit follow-up wording"
    )
    return MainAgentDecision(
        mode="task",
        reason=boundary_reason,
        starts_new_task=True,
        short_title=decision.short_title or active_task.metadata.get("short_title"),
        action="create_task",
    )
|
||||
|
||||
@staticmethod
|
||||
def _prompt(
|
||||
*,
|
||||
@ -159,15 +196,19 @@ class MainAgentRouter:
|
||||
"- close_task: user explicitly says the active Task is done/satisfactory/finished.\n"
|
||||
"- abandon_task: user explicitly says to stop, cancel, abandon, or no longer do the active Task.\n\n"
|
||||
"Critical policy:\n"
|
||||
"- If there is an active Task, choose continue_task or revise_task unless the user's topic is completely unrelated "
|
||||
"to that Task or the user explicitly closes/abandons it.\n"
|
||||
"- A Session is the durable conversation/device/group context. A Task is one unit of work inside that Session. "
|
||||
"Do not use an active Task as a reason to merge every later message into the same work item.\n"
|
||||
"- If there is an active Task, choose continue_task only when the current message explicitly depends on, extends, "
|
||||
"or asks a direct follow-up about that active Task's latest result.\n"
|
||||
"- With an active Task, choose simple_chat for unrelated lightweight conversation and new_task for unrelated work "
|
||||
"that needs Task capabilities. Either decision starts a new topic.\n"
|
||||
"- An unrelated lightweight conversation must not be classified as revise_task merely because the active Task is awaiting acceptance.\n"
|
||||
"- Choose revise_task when the active Task is awaiting feedback or needs revision and the user asks for changes "
|
||||
"such as '改一下', '加上', '删除', '换成', '再详细点', '格式改成', '不要', or equivalent wording.\n"
|
||||
"- Choose continue_task for neutral follow-up questions or additional next steps that do not imply dissatisfaction with the previous result.\n"
|
||||
"- Use new_task only when the user clearly asks to start a different task.\n"
|
||||
"- Choose continue_task for neutral follow-up questions or additional next steps that refer to the previous result, "
|
||||
"for example '顺便查一下深圳', '这个也加上', or '继续'.\n"
|
||||
"- A standalone tool-dependent request such as a fresh weather/search/file/run/test request is new_task even when it is "
|
||||
"similar to the active Task. Repeating '珠海天气怎么样' later is a new Task unless the user says to revise or continue the old result.\n"
|
||||
"- If there is no active Task, choose new_task only for work that requires execution, iteration, tools, files, "
|
||||
"implementation, validation, or multi-step completion. Otherwise choose simple_chat.\n"
|
||||
"- Requests that need current, real-time, external, user-private, local-file, web, weather, price, news, "
|
||||
@ -203,3 +244,99 @@ def _clean_short_title(value: Any) -> str | None:
|
||||
return None
|
||||
title = " ".join(str(value).strip().split())
|
||||
return title[:40] or None
|
||||
|
||||
|
||||
def _looks_like_explicit_task_followup(message: str) -> bool:
    """Heuristically detect wording that refers back to the active task.

    The message is normalised with ``_compact_text`` and then scanned for
    any of a fixed set of Chinese and English follow-up markers. Matching
    is by plain substring, so a short marker such as "add" also matches
    inside longer words.

    Returns ``False`` for an empty message.
    """
    text = _compact_text(message)
    if not text:
        return False
    chinese_markers = (
        "继续",
        "接着",
        "上面",
        "刚才",
        "前面",
        "这个",
        "那个",
        "它",
        "结果",
        "再",
        "也",
        "顺便",
        "补充",
        "加上",
        "加入",
        "删除",
        "去掉",
        "改",
        "换成",
        "重做",
        "详细",
        "展开",
        "格式",
    )
    english_markers = (
        "continue",
        "same task",
        "previous",
        "above",
        "that result",
        "revise",
        "update it",
        "add",
        "remove",
        "change",
        "also",
    )
    for marker in chinese_markers + english_markers:
        if marker in text:
            return True
    return False
|
||||
|
||||
|
||||
def _looks_like_fresh_task_request(message: str) -> bool:
    """Heuristically detect a standalone, tool-dependent request.

    The message is normalised with ``_compact_text`` and then scanned for
    any of a fixed set of Chinese and English markers (weather, search,
    file, run/test/build/deploy wording, ...). Matching is by plain
    substring, so a short marker such as "run" also matches inside longer
    words.

    Returns ``False`` for an empty message.
    """
    text = _compact_text(message)
    if not text:
        return False
    chinese_markers = (
        "天气",
        "气温",
        "下雨",
        "降雨",
        "空气质量",
        "预报",
        "查一下",
        "帮我查",
        "搜索",
        "搜一下",
        "看看最新",
        "最新",
        "今天",
        "明天",
        "上传",
        "下载",
        "文件",
        "运行",
        "执行",
        "测试",
        "构建",
        "部署",
        "修复",
    )
    english_markers = (
        "weather",
        "forecast",
        "temperature",
        "search",
        "look up",
        "latest",
        "today",
        "tomorrow",
        "upload",
        "download",
        "file",
        "run",
        "execute",
        "test",
        "build",
        "deploy",
        "fix",
    )
    for marker in chinese_markers + english_markers:
        if marker in text:
            return True
    return False
|
||||
|
||||
|
||||
def _compact_text(message: str) -> str:
|
||||
return " ".join(str(message or "").strip().lower().split())
|
||||
|
||||
@ -4,6 +4,7 @@ import json
|
||||
from pathlib import Path
|
||||
|
||||
from beaver.engine import EngineLoader
|
||||
from beaver.skills.authoring.format import is_canonical_skill_body
|
||||
from beaver.skills.catalog.utils import parse_frontmatter
|
||||
|
||||
|
||||
@ -69,6 +70,16 @@ def test_skill_authoring_admin_is_seeded_but_not_initial() -> None:
|
||||
assert version["tool_hints"] == expected_tools
|
||||
|
||||
|
||||
def test_seeded_skill_bodies_use_canonical_format() -> None:
    """Every skill listed in the published/disabled indexes has a canonical v0001 body."""
    skills_root = REPO_ROOT / "skills"
    for index_name in ("published", "disabled"):
        index_path = skills_root / "_index" / f"{index_name}.json"
        index = json.loads(index_path.read_text(encoding="utf-8"))
        for skill_name in index["items"]:
            skill_md = skills_root / skill_name / "versions" / "v0001" / "SKILL.md"
            _frontmatter, body = parse_frontmatter(skill_md.read_text(encoding="utf-8"))

            # The skill name is the assertion message so a failure names the offender.
            assert is_canonical_skill_body(body), skill_name
|
||||
|
||||
|
||||
def test_default_runtime_registers_skill_view_tool(tmp_path: Path) -> None:
|
||||
loaded = EngineLoader(workspace=tmp_path).load()
|
||||
try:
|
||||
|
||||
@ -87,6 +87,14 @@ def _task() -> TaskRecord:
|
||||
)
|
||||
|
||||
|
||||
def _weather_task() -> TaskRecord:
    """Return the base test task re-labelled as a Zhuhai weather query."""
    weather_query = "珠海天气怎样"
    task = _task()
    task.description = weather_query
    task.goal = weather_query
    task.metadata["short_title"] = "查询珠海天气"
    return task
|
||||
|
||||
|
||||
def test_router_continues_active_task_from_llm_decision() -> None:
|
||||
provider = RouterProvider('{"action":"continue_task","reason":"related","short_title":"任务连续性"}')
|
||||
decision = asyncio.run(
|
||||
@ -103,6 +111,35 @@ def test_router_continues_active_task_from_llm_decision() -> None:
|
||||
assert provider.calls[0]["max_tokens"] == 256
|
||||
|
||||
|
||||
def test_router_keeps_same_session_but_starts_new_task_for_standalone_weather_repeat() -> None:
    """A repeated standalone weather question becomes a new task even if the LLM says continue."""
    llm_reply = '{"action":"continue_task","reason":"neutral follow-up","short_title":"查询珠海天气"}'
    router = MainAgentRouter()
    classification = router.classify(
        "珠海天气怎么样",
        active_task=_weather_task(),
        provider=RouterProvider(llm_reply),
    )
    decision = asyncio.run(classification)

    assert decision.is_task
    assert decision.action == "create_task"
    assert decision.starts_new_task is True
    assert "fresh standalone task request" in decision.reason
|
||||
|
||||
|
||||
def test_router_allows_explicit_followup_to_continue_active_weather_task() -> None:
    """Explicit follow-up wording keeps the message attached to the active task."""
    llm_reply = '{"action":"continue_task","reason":"related follow-up","short_title":"查询珠海天气"}'
    router = MainAgentRouter()
    classification = router.classify(
        "顺便查一下深圳",
        active_task=_weather_task(),
        provider=RouterProvider(llm_reply),
    )
    decision = asyncio.run(classification)

    assert decision.is_task
    assert decision.action == "continue_task"
    assert decision.starts_new_task is False
|
||||
|
||||
|
||||
def test_router_marks_revision_from_llm_decision() -> None:
|
||||
decision = asyncio.run(
|
||||
MainAgentRouter().classify(
|
||||
@ -163,6 +200,8 @@ def test_router_prompt_treats_unrelated_lightweight_conversation_as_new_topic()
|
||||
prompt = provider.calls[0]["messages"][1]["content"]
|
||||
assert "unrelated lightweight conversation" in prompt
|
||||
assert "must not be classified as revise_task merely because the active Task is awaiting acceptance" in prompt
|
||||
assert "A Session is the durable conversation/device/group context" in prompt
|
||||
assert "Repeating '珠海天气怎么样' later is a new Task" in prompt
|
||||
|
||||
|
||||
def test_router_closes_active_task_from_llm_decision() -> None:
|
||||
|
||||
@ -5,13 +5,40 @@ from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from beaver.interfaces.web.app import _create_skill_upload_draft
|
||||
from beaver.engine.providers.base import LLMProvider, LLMResponse
|
||||
from beaver.interfaces.web.app import _create_skill_upload_draft, _rewrite_uploaded_skill_draft_with_llm
|
||||
from beaver.services.skillhub_service import SkillHubService
|
||||
from beaver.skills.authoring.format import is_canonical_skill_body
|
||||
from beaver.skills.catalog.utils import extract_required_tool_names
|
||||
from beaver.skills.drafts import DraftService
|
||||
from beaver.skills.specs import SkillSpecStore
|
||||
from beaver.tools.mcp.wrapper import MCPToolWrapper
|
||||
|
||||
|
||||
class RewriteProvider(LLMProvider):
    """Stub provider that always answers with one fixed skill-rewrite JSON payload."""

    def __init__(self) -> None:
        super().__init__()
        # Messages from the most recent chat() call, kept for assertions.
        self.messages = []

    async def chat(self, messages, tools=None, model=None, max_tokens=None, temperature=0.7, thinking_enabled=None):
        """Record ``messages`` and return the canned rewrite response."""
        self.messages = messages
        canned_payload = """{
"frontmatter": {
"name": "skill",
"description": "Use when uploaded skill guidance needs QA formatting.",
"tools": ["read_file"]
},
"content": "# Skill\\n\\n## Overview\\n\\nLLM rewritten overview.\\n\\n## When to Use\\n\\n- Use when testing upload rewrite.\\n\\n## Required Tools\\n\\n- `read_file`\\n\\n## Workflow\\n\\n- Follow the rewritten workflow.\\n\\n## Validation\\n\\n- Verify the result.\\n\\n## Boundaries\\n\\n- Stay in scope.\\n\\n## Anti-Patterns\\n\\n- Do not skip rewrite validation.\\n",
"change_reason": "normalized upload"
}"""
        return LLMResponse(
            content=canned_payload,
            model=model,
        )

    def get_default_model(self):
        """Return the fixed model name used by this stub."""
        return "rewrite-model"
|
||||
|
||||
|
||||
class FakeSkillHubService(SkillHubService):
|
||||
async def _get_json(self, path, *, params=None):
|
||||
if path == "/skills":
|
||||
@ -99,6 +126,106 @@ def test_upload_skill_zip_keeps_supporting_files_on_draft(tmp_path):
|
||||
assert upload_dir.endswith(draft["draft_id"])
|
||||
|
||||
|
||||
def test_upload_skill_zip_canonicalizes_uploaded_skill_body(tmp_path):
    """An uploaded SKILL.md with a free-form body is stored in canonical format."""
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    raw_skill_md = "---\nname: skill\ndescription: raw upload\ntools:\n  - read_file\n---\nBody without our format.\n"
    payload = io.BytesIO()
    with zipfile.ZipFile(payload, "w") as archive:
        archive.writestr("skill/SKILL.md", raw_skill_md)

    draft = _create_skill_upload_draft(loaded, "skill.zip", payload.getvalue())

    frontmatter = draft["proposed_frontmatter"]
    assert frontmatter["name"] == "skill"
    assert frontmatter["tools"] == ["read_file"]
    assert is_canonical_skill_body(draft["proposed_content"])
|
||||
|
||||
|
||||
def test_upload_skill_zip_infers_weather_web_tools_from_content(tmp_path):
    """A weather skill uploaded without a tools list gets web tools inferred from its body."""
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    raw_skill_md = (
        "---\nname: weather-search\ndescription: weather lookup\n---\n"
        "Look up current weather and forecast for a city online.\n"
    )
    payload = io.BytesIO()
    with zipfile.ZipFile(payload, "w") as archive:
        archive.writestr("weather_search/skills.md", raw_skill_md)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", payload.getvalue())

    expected_tools = ["web_fetch", "web_search"]
    assert draft["proposed_frontmatter"]["tools"] == expected_tools
    assert extract_required_tool_names(draft["proposed_content"]) == expected_tools
    assert is_canonical_skill_body(draft["proposed_content"])
|
||||
|
||||
|
||||
def test_upload_skill_llm_rewrite_updates_draft(tmp_path):
    """The LLM rewrite step replaces the draft body and prompts with the canonical format."""
    spec_store = SkillSpecStore(tmp_path)
    drafts = DraftService(spec_store)
    draft = drafts.create_new_skill_draft(
        skill_name="skill",
        proposed_content="# Skill\n\n## Overview\n\nFallback.",
        proposed_frontmatter={"name": "skill", "description": "fallback", "tools": ["read_file"]},
        created_by="test",
        reason="upload",
    )
    provider = RewriteProvider()

    def _bundle_for_task(_loaded, _kwargs):
        # Minimal provider bundle: just the attributes set here.
        return SimpleNamespace(
            main_provider=provider,
            main_runtime=SimpleNamespace(model="rewrite-model"),
        )

    agent_service = SimpleNamespace(_make_provider_bundle_for_task=_bundle_for_task)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=drafts)

    asyncio.run(_rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename="skill.zip"))
    rewritten = drafts.get_draft("skill", draft.draft_id)

    assert rewritten is not None
    assert "LLM rewritten overview" in rewritten.proposed_content
    assert is_canonical_skill_body(rewritten.proposed_content)
    user_prompt = provider.messages[1]["content"]
    assert "Canonical Beaver SKILL.md format" in user_prompt
    assert "Available runtime tool names" in user_prompt
|
||||
|
||||
|
||||
def test_upload_skill_zip_accepts_nested_single_skill_directory(tmp_path):
    """Upload a zip whose only skill sits two directories deep.

    The archive holds one ``SKILL.md``, one reference file next to it, and a
    package-level README. The resulting draft must be named after the nested
    skill, keep the reference file, and leave the README out.
    """
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    members = (
        (
            "plugin/skills/nested-skill/SKILL.md",
            "---\nname: nested-skill\ndescription: nested\n---\nBody\n",
        ),
        ("plugin/skills/nested-skill/references/a.txt", "context"),
        ("plugin/README.md", "ignore package file"),
    )
    payload = io.BytesIO()
    # The archive must be closed before its bytes are read back.
    with zipfile.ZipFile(payload, "w") as archive:
        for member_name, text in members:
            archive.writestr(member_name, text)

    draft = _create_skill_upload_draft(loaded, "plugin.zip", payload.getvalue())

    assert draft["skill_name"] == "nested-skill"
    evidence = draft["evidence_refs"][0]
    # Looked up only so a missing key fails the test; the value is not inspected.
    _ = evidence["supporting_upload_dir"]
    copied_reference = (
        tmp_path / "skills" / "nested-skill" / "draft_uploads" / draft["draft_id"] / "references" / "a.txt"
    )
    assert copied_reference.read_text() == "context"
    assert "README.md" not in evidence["files"]
|
||||
|
||||
|
||||
def test_upload_skill_zip_accepts_common_skill_markdown_name_aliases(tmp_path):
    """Upload a zip whose skill file is named ``skills.md`` instead of ``SKILL.md``.

    The draft must still take its name from the frontmatter and end up with a
    canonical skill body.
    """
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))
    skill_markdown = "---\nname: weather-search\ndescription: weather lookup\n---\nBody\n"
    payload = io.BytesIO()
    with zipfile.ZipFile(payload, "w") as archive:
        archive.writestr("weather_search/skills.md", skill_markdown)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", payload.getvalue())

    expected_name = "weather-search"
    assert draft["skill_name"] == expected_name
    assert draft["proposed_frontmatter"]["name"] == expected_name
    assert is_canonical_skill_body(draft["proposed_content"])
|
||||
|
||||
|
||||
def test_mcp_wrapper_metadata_preserves_server_id_with_underscores():
|
||||
tool_def = SimpleNamespace(name="auth_status", description="Auth", inputSchema={"type": "object", "properties": {}})
|
||||
|
||||
|
||||
@ -184,7 +184,7 @@ def test_skill_lifecycle_publish_revision_and_rollback(tmp_path: Path) -> None:
|
||||
assert published.version == "v0002"
|
||||
assert store.get_current_version("release-checklist") == "v0002"
|
||||
|
||||
with pytest.raises(ValueError, match="approved"):
|
||||
with pytest.raises(ValueError, match="submitted for review"):
|
||||
publisher.publish("release-checklist", revision.draft_id, publisher="reviewer", notes="duplicate")
|
||||
|
||||
rolled_back = publisher.rollback("release-checklist", "v0001", actor="reviewer", reason="regression")
|
||||
@ -529,6 +529,66 @@ def test_skill_learning_service_generates_new_skill_for_task_without_published_s
|
||||
assert candidates[0].source_run_ids == ["task-run-1"]
|
||||
|
||||
|
||||
def test_skill_learning_service_uses_original_task_text_for_new_skill_theme(tmp_path: Path) -> None:
    """Two attempts of one task: the candidate theme must come from the first attempt.

    The second run's ``task_text`` repeats the user's revise comment, so the
    test checks that the candidate evidence still carries the first run's text.
    """
    spec_store = SkillSpecStore(tmp_path)
    run_store = RunMemoryStore(tmp_path / "memory" / "runs")
    learning_store = SkillLearningStore(tmp_path / "memory" / "skills")
    service = SkillLearningService(
        run_store=run_store,
        learning_store=learning_store,
        draft_service=DraftService(spec_store),
        evidence_selector=EvidenceSelector(run_store),
    )
    timestamp = datetime.now(timezone.utc).isoformat()
    original_text = "Compare direct production restart with staging rollout"

    # Attempt 1 fails and receives a "revise" comment; attempt 2 is accepted.
    attempts = (
        {
            "run_id": "task-run-1",
            "attempt_index": 1,
            "task_text": "Compare direct production restart with staging rollout",
            "success": False,
            "feedback": {"feedback_type": "revise", "comment": "I do not see the docs"},
            "activated_skills": [],
            "validation_result": None,
        },
        {
            "run_id": "task-run-2",
            "attempt_index": 2,
            "task_text": "I do not see the docs",
            "success": True,
            "feedback": {"feedback_type": "satisfied", "acceptance_type": "accept"},
            "activated_skills": [],
            "validation_result": {"accepted": True, "score": 0.9},
        },
    )
    for attempt in attempts:
        run_store.append_run_record(
            RunRecord(
                session_id="session-task",
                task_id="task-1",
                started_at=timestamp,
                ended_at=timestamp,
                finish_reason="stop",
                **attempt,
            )
        )

    candidates = service.build_learning_candidates_for_task("task-1", trigger_run_id="task-run-2")

    candidate_ids = [candidate.candidate_id for candidate in candidates]
    assert candidate_ids == ["new:task:task-1"]
    evidence = candidates[0].evidence
    assert evidence["theme"] == original_text
    assert evidence["task_text"] == original_text
|
||||
|
||||
|
||||
def test_task_theme_uses_first_sentence_for_chinese_text() -> None:
    """``_task_theme`` must return only the text before the first Chinese full stop."""
    task_text = "帮我比较两种发布流程的风险:A 是直接重启线上容器,B 是先部署 staging 再切 production。请给出推荐方案、原因、验证步骤和回滚策略。"
    expected_theme = "帮我比较两种发布流程的风险:A 是直接重启线上容器,B 是先部署 staging 再切 production"

    theme = SkillLearningService._task_theme(task_text)

    assert theme == expected_theme
|
||||
|
||||
|
||||
def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None:
|
||||
skill = SkillContext(
|
||||
name="docker-debug",
|
||||
|
||||
@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from beaver.skills.authoring.format import (
|
||||
CANONICAL_SKILL_SECTION_HEADINGS,
|
||||
canonical_skill_format_instructions,
|
||||
canonicalize_skill_body,
|
||||
is_canonical_skill_body,
|
||||
parse_skill_rewrite_json,
|
||||
)
|
||||
|
||||
|
||||
def test_canonical_skill_body_contains_required_sections() -> None:
    """A body built by ``canonicalize_skill_body`` must pass the canonical check
    and mention every heading in ``CANONICAL_SKILL_SECTION_HEADINGS``."""
    section_inputs = {
        "title": "Filesystem Operation",
        "overview": "Read and update project files safely.",
        "tools": ["read_file", "write_file"],
        "workflow": ["Inspect the file before editing.", "Use the smallest safe edit."],
        "validation": ["Re-read changed files before reporting completion."],
        "boundaries": ["Do not edit files outside the workspace."],
        "anti_patterns": ["Do not overwrite files without reading them first."],
    }

    body = canonicalize_skill_body(**section_inputs)

    assert is_canonical_skill_body(body)
    missing = [heading for heading in CANONICAL_SKILL_SECTION_HEADINGS if heading not in body]
    assert not missing
|
||||
|
||||
|
||||
def test_canonical_skill_format_instructions_are_prompt_ready() -> None:
    """The instruction text must name the format, the frontmatter keys, and every
    canonical section heading."""
    instructions = canonical_skill_format_instructions()

    fixed_fragments = (
        "Canonical Beaver SKILL.md format",
        "frontmatter",
        "name",
        "description",
        "tools",
    )
    for fragment in fixed_fragments:
        assert fragment in instructions
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        assert heading in instructions
|
||||
|
||||
|
||||
def test_parse_skill_rewrite_json_backfills_frontmatter_tools_from_required_tools_section() -> None:
    """Parse a rewrite payload whose frontmatter ``tools`` list is empty.

    The body's "Required Tools" section lists ``web_fetch`` and ``web_search``;
    the parsed frontmatter is expected to contain exactly those two names.
    """
    # The payload text is kept exactly as it appears in this file.
    payload = parse_skill_rewrite_json(
        """{
"frontmatter": {
"name": "weather-search",
"description": "weather lookup",
"tools": []
},
"content": "# Weather Search\\n\\n## Overview\\n\\nLook up weather.\\n\\n## When to Use\\n\\n- Weather requests.\\n\\n## Required Tools\\n\\n- `web_fetch`\\n- `web_search`\\n\\n## Workflow\\n\\n- Fetch current weather.\\n\\n## Validation\\n\\n- Check source freshness.\\n\\n## Boundaries\\n\\n- Do not guess.\\n\\n## Anti-Patterns\\n\\n- Do not fabricate data.\\n"
}""",
        skill_name="weather-search",
    )

    assert payload is not None
    assert payload["frontmatter"]["tools"] == ["web_fetch", "web_search"]
|
||||
@ -19,8 +19,22 @@ from beaver.skills.specs import SkillSpecStore
|
||||
|
||||
|
||||
class StubProvider(LLMProvider):
    """LLM provider test double that returns a fixed reply and records each call.

    The class previously carried two ``chat`` definitions; the first one (a
    one-liner returning ``"ok"``) was shadowed by the second and has been
    removed so that only the effective implementation remains.
    """

    def __init__(self, content: str = "ok") -> None:
        """Store the canned reply text and start with an empty call log.

        Args:
            content: Text returned as the ``content`` of every chat response.
        """
        super().__init__()
        self.content = content
        self.calls: list[dict] = []

    async def chat(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
        model: str | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
        thinking_enabled: bool | None = None,
    ) -> LLMResponse:
        """Record the request and return the canned reply.

        Only ``messages``, ``model``, ``max_tokens`` and ``temperature`` are
        logged; ``tools`` and ``thinking_enabled`` are accepted but ignored.
        """
        self.calls.append({"messages": messages, "model": model, "max_tokens": max_tokens, "temperature": temperature})
        return LLMResponse(content=self.content)

    def get_default_model(self) -> str:
        """Return the fixed model name ``"stub"``."""
        return "stub"
|
||||
@ -92,7 +106,6 @@ def test_eval_pass_allows_publish_after_safety_and_review(tmp_path: Path) -> Non
|
||||
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
|
||||
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
published = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
|
||||
|
||||
assert report.passed is True
|
||||
@ -114,7 +127,6 @@ def test_eval_regression_blocks_publish(tmp_path: Path) -> None:
|
||||
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
|
||||
pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
|
||||
assert report.passed is False
|
||||
assert pipeline.get_candidate("candidate-1").status == "eval_failed"
|
||||
@ -160,7 +172,14 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
|
||||
|
||||
|
||||
class FakeReplayRunner:
|
||||
def __init__(self, *, baseline_answer: str = "done", candidate_answer: str = "done") -> None:
|
||||
self.baseline_answer = baseline_answer
|
||||
self.candidate_answer = candidate_answer
|
||||
self.requests = []
|
||||
|
||||
async def run_arm(self, request):
|
||||
self.requests.append(request)
|
||||
final_answer = self.candidate_answer if request.arm == "candidate" else self.baseline_answer
|
||||
return {
|
||||
"case_id": request.case_id,
|
||||
"arm": request.arm,
|
||||
@ -168,7 +187,7 @@ class FakeReplayRunner:
|
||||
"run_id": f"{request.arm}-run",
|
||||
"task_text": request.task_text,
|
||||
"finish_reason": "stop",
|
||||
"final_answer": "done",
|
||||
"final_answer": final_answer,
|
||||
"tool_calls": [
|
||||
{
|
||||
"tool_name": "write_file",
|
||||
@ -213,3 +232,102 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
|
||||
assert 0.0 <= report.execution_coverage <= 1.0
|
||||
assert 0.0 <= report.surrogate_coverage <= 1.0
|
||||
assert report.confidence in {"low", "medium", "high"}
|
||||
assert "ability_score" in report.case_reports[0]
|
||||
assert "tool_execution_score" in report.case_reports[0]
|
||||
assert report.ability_score_summary["score_role"] == "primary"
|
||||
assert report.tool_execution_summary["score_role"] == "diagnostic_only"
|
||||
|
||||
|
||||
def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
    """Replay scoring must follow the answer validator, not the tool-execution score.

    Both arms are expected to report the same tool-execution score (0.85),
    while the baseline answer contains the forbidden phrase and the candidate
    answer contains the required term, so the candidate must score higher.
    """
    pipeline = _pipeline(tmp_path)
    learning_store = pipeline.learning_store
    validator_case = {
        "run_id": "validator-case",
        "task_id": "validator-case",
        "session_id": "eval",
        "task_text": "Write the release verdict.",
        "validator": {
            "type": "final_answer_contains",
            "required_terms": ["ship"],
            "forbidden_terms": ["do not ship"],
        },
        "accepted_score": 0.5,
    }
    learning_store.update_learning_candidate("candidate-1", evidence={"eval_cases": [validator_case]})
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    runner = FakeReplayRunner(
        baseline_answer="Do not ship. Tests are failing.",
        candidate_answer="Ship after smoke tests pass.",
    )

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=runner,
    )
    report = asyncio.run(evaluation)

    first_case = report.case_reports[0]
    tool_scores = first_case["tool_execution_score"]
    assert tool_scores["baseline_score"] == 0.85
    assert tool_scores["candidate_score"] == 0.85
    assert first_case["baseline_score"] < first_case["candidate_score"]
    assert report.tool_mode_summary["score_role"] == "diagnostic_only"
    assert report.ability_score_summary["score_role"] == "primary"
    assert report.real_score_avg is not None
    assert report.synthetic_score_avg is not None
|
||||
|
||||
|
||||
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
    """A synthetic eval case that has no validator must be left out of replay.

    The case may not appear in the report, may not reach the replay runner,
    and must be counted under ``excluded_synthetic_without_validator``.
    """
    excluded_run_id = "synthetic:no-validator"
    pipeline = _pipeline(tmp_path)
    learning_store = pipeline.learning_store
    synthetic_case = {
        "run_id": "synthetic:no-validator",
        "task_id": "synthetic-no-validator",
        "session_id": "synthetic-eval",
        "task_text": "Synthetic task without an oracle.",
        "synthetic": True,
        "accepted_score": 0.75,
    }
    learning_store.update_learning_candidate("candidate-1", evidence={"eval_cases": [synthetic_case]})
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    replay_runner = FakeReplayRunner()

    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=replay_runner,
    )
    report = asyncio.run(evaluation)

    reported_run_ids = {case["run_id"] for case in report.case_reports}
    assert excluded_run_id not in reported_run_ids
    assert not any(excluded_run_id in request.case_id for request in replay_runner.requests)
    assert report.case_selection_summary["excluded_synthetic_without_validator"] == 1
|
||||
|
||||
@ -31,6 +31,12 @@ def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
|
||||
assert payload["confidence"] == "low"
|
||||
assert payload["case_reports"] == []
|
||||
assert payload["tool_mode_summary"] == {}
|
||||
assert payload["ability_score_summary"] == {}
|
||||
assert payload["tool_execution_summary"] == {}
|
||||
assert payload["case_selection_summary"] == {}
|
||||
assert payload["real_score_avg"] is None
|
||||
assert payload["synthetic_score_avg"] is None
|
||||
assert payload["overall_score_avg"] is None
|
||||
assert payload["preservation_report"] is None
|
||||
assert payload["cases"] == [{"run_id": "run-1"}]
|
||||
|
||||
@ -59,3 +65,37 @@ def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
|
||||
assert report.mode == "heuristic"
|
||||
assert report.confidence == "low"
|
||||
assert report.case_reports == []
|
||||
|
||||
|
||||
def test_eval_report_persists_ability_and_case_split_fields() -> None:
    """Round-trip a replay report through ``to_dict`` / ``from_dict``.

    The three split averages must appear in the payload and the three summary
    dicts must come back unchanged on the restored report.
    """
    ability_summary = {"score_role": "primary", "real_case_count": 1}
    tool_summary = {"score_role": "diagnostic_only", "executed": 1.0}
    selection_summary = {"excluded_synthetic_without_validator": 2}
    report = SkillDraftEvalReport(
        report_id="eval-replay",
        skill_name="debug",
        draft_id="draft-1",
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.5,
        candidate_score_avg=0.8,
        score_delta=0.3,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        mode="replay",
        eval_version="replay-v2",
        real_score_avg=0.9,
        synthetic_score_avg=0.6,
        overall_score_avg=0.8,
        # Copies are passed so the expected values above stay untouched.
        ability_score_summary=dict(ability_summary),
        tool_execution_summary=dict(tool_summary),
        case_selection_summary=dict(selection_summary),
    )

    payload = report.to_dict()
    restored = SkillDraftEvalReport.from_dict(payload)

    assert payload["real_score_avg"] == 0.9
    assert payload["synthetic_score_avg"] == 0.6
    assert payload["overall_score_avg"] == 0.8
    assert restored.ability_score_summary == ability_summary
    assert restored.tool_execution_summary == tool_summary
    assert restored.case_selection_summary == selection_summary
|
||||
|
||||
@ -55,14 +55,12 @@ def test_pipeline_lists_candidates_and_moves_draft_through_review(tmp_path: Path
|
||||
reason="test",
|
||||
)
|
||||
|
||||
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
approved = pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
version = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
|
||||
|
||||
assert pipeline.list_candidates()[0].candidate_id == "candidate-1"
|
||||
assert review.status == SkillReviewState.IN_REVIEW.value
|
||||
assert approved.status == SkillReviewState.APPROVED.value
|
||||
assert safety.passed is True
|
||||
assert version.skill_name == "new-skill"
|
||||
assert pipeline.get_draft(draft.skill_name, draft.draft_id).status == SkillReviewState.PUBLISHED.value
|
||||
@ -93,7 +91,6 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
|
||||
)
|
||||
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
|
||||
|
||||
@ -165,7 +162,6 @@ def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
|
||||
)
|
||||
)
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
|
||||
with pytest.raises(ValueError, match="low confidence"):
|
||||
@ -201,7 +197,6 @@ def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
|
||||
)
|
||||
)
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
|
||||
with pytest.raises(ValueError, match="preservation"):
|
||||
|
||||
@ -16,6 +16,25 @@ class FakeAgentLoop:
|
||||
return SimpleNamespace(session_id="session-replay", run_id="run-replay", output_text="done", finish_reason="stop")
|
||||
|
||||
|
||||
class FakeRunningAgentLoop(FakeAgentLoop):
    """Agent-loop fake whose ``process_direct`` always raises.

    ``process_direct`` counts the attempt and raises ``RuntimeError``;
    ``submit_direct`` records the call, runs one tool through the supplied
    executor override, and returns a canned result.
    """

    def __init__(self) -> None:
        """Start with zero direct calls and an empty submit log."""
        self.process_direct_calls = 0
        self.submit_direct_calls: list[tuple[str, dict]] = []

    async def process_direct(self, task: str, **kwargs):
        """Count the attempt, then refuse it with ``RuntimeError``."""
        self.process_direct_calls += 1
        message = (
            "AgentLoop.process_direct() is disabled while run() is active; "
            "submit tasks via submit_direct() instead."
        )
        raise RuntimeError(message)

    async def submit_direct(self, task: str, **kwargs):
        """Log the submission, execute one tool call, and return a fixed result.

        ``kwargs`` must contain ``tool_executor_override``; a ``KeyError`` is
        raised otherwise.
        """
        self.submit_direct_calls.append((task, kwargs))
        tool_executor = kwargs["tool_executor_override"]
        await tool_executor.execute("mcp_outlook_send_email", {"to": "ada@example.com"})
        return SimpleNamespace(
            session_id="session-queued",
            run_id="run-queued",
            output_text="queued done",
            finish_reason="stop",
        )
|
||||
|
||||
|
||||
def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
|
||||
runner = ReplayRunner(agent_loop=FakeAgentLoop())
|
||||
request = ReplayArmRequest(
|
||||
@ -34,3 +53,33 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
|
||||
assert report["arm"] == "candidate"
|
||||
assert report["finish_reason"] == "stop"
|
||||
assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
|
||||
|
||||
|
||||
def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
    """When ``process_direct`` is refused, the runner must go through ``submit_direct``.

    The fake loop raises from ``process_direct``; the test expects exactly one
    such attempt, one queued submission carrying the replay settings, and a
    report built from the queued result.
    """
    agent_loop = FakeRunningAgentLoop()
    runner = ReplayRunner(agent_loop=agent_loop)
    request = ReplayArmRequest(
        case_id="case-queued",
        arm="baseline",
        task_text="Send a status email to Ada.",
        pinned_skill_names=["filesystem-operation"],
        pinned_skill_contexts=[{"name": "filesystem-operation"}],
        provider_bundle=object(),
        model_settings={"max_tool_iterations": 3, "temperature": 0.1},
    )

    report = asyncio.run(runner.run_arm(request))

    assert agent_loop.process_direct_calls == 1
    submissions = agent_loop.submit_direct_calls
    assert len(submissions) == 1
    submitted_task, submitted_kwargs = submissions[0]
    assert submitted_task == "Send a status email to Ada."
    assert submitted_kwargs["source"] == "skill_replay_eval"
    # Boolean flags are compared by identity, not just truthiness.
    assert submitted_kwargs["include_skill_assembly"] is False
    assert submitted_kwargs["include_tools"] is True
    assert submitted_kwargs["pinned_skill_names"] == ["filesystem-operation"]
    assert submitted_kwargs["max_tool_iterations"] == 3
    assert submitted_kwargs["temperature"] == 0.1
    assert report["session_id"] == "session-queued"
    assert report["run_id"] == "run-queued"
    first_tool_call = report["tool_calls"][0]
    assert first_tool_call["tool_name"] == "mcp_outlook_send_email"
|
||||
|
||||
@ -74,7 +74,6 @@ def test_safety_marks_dangerous_tools_high_and_requires_confirm(tmp_path: Path)
|
||||
|
||||
report = pipeline.check_safety(draft.skill_name, draft.draft_id)
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
|
||||
assert report.passed is True
|
||||
assert report.risk_level == "high"
|
||||
@ -94,7 +93,6 @@ def test_publish_requires_safety_report(tmp_path: Path) -> None:
|
||||
reason="test",
|
||||
)
|
||||
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
|
||||
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
|
||||
|
||||
with pytest.raises(ValueError, match="safety report"):
|
||||
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from beaver.memory.skills import SkillLearningCandidate
|
||||
from beaver.skills.authoring.format import CANONICAL_SKILL_SECTION_HEADINGS
|
||||
from beaver.skills.learning.evidence import EvidencePacket
|
||||
from beaver.skills.learning.synthesizer import SkillDraftSynthesizer
|
||||
|
||||
@ -39,3 +40,6 @@ def test_revision_prompt_includes_base_skill_snapshot() -> None:
|
||||
assert "Do not delete files." in prompt
|
||||
assert "preserved_sections" in prompt
|
||||
assert "dropped_sections" in prompt
|
||||
assert "Canonical Beaver SKILL.md format" in prompt
|
||||
for heading in CANONICAL_SKILL_SECTION_HEADINGS:
|
||||
assert heading in prompt
|
||||
|
||||
@ -1,12 +1,37 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from beaver.memory.runs import RunRecord
|
||||
from beaver.interfaces.web.app import create_app
|
||||
from beaver.memory.skills import SkillLearningCandidate
|
||||
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
|
||||
from beaver.services.agent_service import AgentService
|
||||
from beaver.skills.specs import SkillVersion
|
||||
|
||||
|
||||
class StubEvaluator:
    """Evaluator fake that counts invocations and returns a fixed passing report."""

    def __init__(self) -> None:
        """Start with a call counter of zero."""
        self.calls = 0

    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
        """Increment the counter and build a passing ``SkillDraftEvalReport``.

        The report's identifiers are taken from ``draft`` and ``candidate``;
        ``provider_bundle`` and ``replay_runner`` are accepted but unused.
        """
        self.calls += 1
        identity = {
            "report_id": "eval-existing",
            "skill_name": draft.skill_name,
            "draft_id": draft.draft_id,
            "candidate_id": candidate.candidate_id,
        }
        scores = {
            "passed": True,
            "baseline_score_avg": 0.5,
            "candidate_score_avg": 0.8,
            "score_delta": 0.3,
            "regression_count": 0,
            "improved_count": 1,
            "unchanged_count": 0,
        }
        return SkillDraftEvalReport(**identity, **scores, status="completed")
|
||||
|
||||
|
||||
def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
|
||||
@ -31,3 +56,191 @@ def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
|
||||
assert candidates[0]["candidate_id"] == "candidate-1"
|
||||
assert "risk_level" in candidates[0]
|
||||
assert run_once["processed"] >= 0
|
||||
|
||||
|
||||
def test_skill_learning_candidates_payload_prefers_original_task_text(tmp_path: Path) -> None:
    """The candidates API must report the first attempt's task text as the theme.

    The stored candidate carries a theme taken from the follow-up comment;
    the API payload is expected to replace it with the original task text.
    """
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    timestamp = "2026-06-11T00:00:00+00:00"
    original_text = "Compare direct production restart with staging rollout"
    run_store = loaded.skill_learning_service.run_store  # type: ignore[union-attr]

    # Attempt 1 fails with a "revise" comment; attempt 2 is accepted.
    attempts = (
        {
            "run_id": "run-original",
            "attempt_index": 1,
            "task_text": "Compare direct production restart with staging rollout",
            "success": False,
            "feedback": {"feedback_type": "revise", "comment": "I do not see the docs"},
            "activated_skills": [],
            "validation_result": None,
        },
        {
            "run_id": "run-final",
            "attempt_index": 2,
            "task_text": "I do not see the docs",
            "success": True,
            "feedback": {"feedback_type": "satisfied", "acceptance_type": "accept"},
            "activated_skills": [],
            "validation_result": {"accepted": True, "score": 0.9},
        },
    )
    for attempt in attempts:
        run_store.append_run_record(
            RunRecord(
                session_id="session-task",
                task_id="task-1",
                started_at=timestamp,
                ended_at=timestamp,
                finish_reason="stop",
                **attempt,
            )
        )

    stored_candidate = SkillLearningCandidate(
        candidate_id="new:task:task-1",
        kind="new_skill",
        source_run_ids=["run-original", "run-final"],
        source_session_ids=["session-task"],
        related_skill_names=[],
        reason="test",
        evidence={"task_id": "task-1", "theme": "i do not see the docs"},
    )
    loaded.skill_learning_store.record_learning_candidate(stored_candidate)  # type: ignore[union-attr]
    app = create_app(service=service, manage_service_lifecycle=False)

    with TestClient(app) as client:
        candidates = client.get("/api/skills/candidates").json()

    payload = next(entry for entry in candidates if entry["candidate_id"] == "new:task:task-1")
    evidence = payload["evidence"]
    assert evidence["theme"] == original_text
    assert evidence["task_text"] == original_text
|
||||
|
||||
|
||||
def test_generate_draft_does_not_run_review_checks(tmp_path: Path, monkeypatch) -> None:
    """Requesting the draft of a ``draft_ready`` candidate must not trigger evaluation.

    The response must return the existing draft with no safety or eval report,
    the stub evaluator must stay uncalled, and no eval report may be stored.
    """
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    pipeline = loaded.skill_learning_pipeline
    draft = pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    existing_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(existing_candidate)  # type: ignore[union-attr]
    evaluator = StubEvaluator()
    pipeline.evaluator = evaluator  # type: ignore[union-attr]

    def _fake_bundle(loaded, kwargs):
        # Stand-in for the real provider bundle factory.
        return SimpleNamespace(main_provider=object())

    monkeypatch.setattr(service, "_make_provider_bundle_for_task", _fake_bundle)
    app = create_app(service=service, manage_service_lifecycle=False)

    with TestClient(app) as client:
        response = client.post("/api/skills/candidates/candidate-existing/draft")

    assert response.status_code == 200
    payload = response.json()
    assert evaluator.calls == 0
    assert payload["draft_id"] == draft.draft_id
    assert payload["safety_report"] is None
    assert payload["eval_report"] is None
    stored_report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)  # type: ignore[union-attr]
    assert stored_report is None
|
||||
|
||||
|
||||
def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:
    """Submitting a draft must run the evaluator once and attach both reports.

    The response is expected to be ``in_review`` with a passing safety report
    and the eval report produced by the stub evaluator.
    """
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    pipeline = loaded.skill_learning_pipeline
    draft = pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    existing_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(existing_candidate)  # type: ignore[union-attr]
    evaluator = StubEvaluator()
    pipeline.evaluator = evaluator  # type: ignore[union-attr]

    def _fake_bundle(loaded, kwargs):
        # Stand-in for the real provider bundle factory.
        return SimpleNamespace(main_provider=object())

    monkeypatch.setattr(service, "_make_provider_bundle_for_task", _fake_bundle)
    app = create_app(service=service, manage_service_lifecycle=False)
    submit_url = f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit"

    with TestClient(app) as client:
        response = client.post(submit_url)

    assert response.status_code == 200
    payload = response.json()
    assert evaluator.calls == 1
    assert payload["status"] == "in_review"
    assert payload["safety_report"]["passed"] is True
    assert payload["eval_report"]["report_id"] == "eval-existing"
|
||||
|
||||
|
||||
def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
    """Check that a revision draft's list payload carries version and base-skill data.

    A published ``v0001`` of ``filesystem-operation`` is written and made
    current, then a revision draft based on it is created. The draft returned
    by ``GET /api/skills/drafts`` must identify itself as ``revise_skill``,
    report ``v0001`` as base and ``v0002`` as target, and embed the base
    skill's version, content and frontmatter.
    """
    skill_name = "filesystem-operation"
    base_body = "# Filesystem Operation\n\nUse files."

    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()

    published_version = SkillVersion(
        skill_name=skill_name,
        version="v0001",
        content_hash="hash-v1",
        summary_hash="summary-v1",
        created_at="2026-06-01T00:00:00+00:00",
        created_by="test",
        change_reason="initial",
        parent_version=None,
        review_state="published",
        frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
        summary="filesystem",
        tool_hints=[],
    )
    loaded.skill_spec_store.write_skill_version(published_version, base_body)  # type: ignore[union-attr]
    loaded.skill_spec_store.set_current_version(skill_name, "v0001")  # type: ignore[union-attr]

    draft = loaded.skill_learning_pipeline.draft_service.create_revision_draft(  # type: ignore[union-attr]
        skill_name=skill_name,
        base_version="v0001",
        proposed_content="# Filesystem Operation\n\nUse files better.",
        proposed_frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
        created_by="test",
        reason="revise",
    )
    app = create_app(service=service, manage_service_lifecycle=False)

    with TestClient(app) as client:
        response = client.get("/api/skills/drafts")

    assert response.status_code == 200
    # Pick out the draft created above from the full listing.
    listed_drafts = response.json()
    payload = next(entry for entry in listed_drafts if entry["draft_id"] == draft.draft_id)

    assert payload["proposal_kind"] == "revise_skill"
    assert payload["base_version"] == "v0001"
    assert payload["target_version"] == "v0002"

    base_skill = payload["base_skill"]
    assert base_skill["version"] == "v0001"
    assert base_skill["content"] == base_body
    assert base_skill["frontmatter"]["name"] == "filesystem-operation"
|
||||
|
||||
@ -10,6 +10,7 @@ from beaver.engine.providers.factory import ProviderBundle
|
||||
from beaver.engine.session import SessionManager
|
||||
from beaver.memory.runs import RunMemoryStore, RunRecord
|
||||
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
|
||||
from beaver.skills.authoring.format import is_canonical_skill_body
|
||||
from beaver.skills.drafts import DraftService
|
||||
from beaver.skills.learning import (
|
||||
EvidenceSelector,
|
||||
@ -48,6 +49,33 @@ def _bundle(provider: LLMProvider) -> ProviderBundle:
|
||||
return ProviderBundle(main_runtime=runtime, main_provider=provider) # type: ignore[arg-type]
|
||||
|
||||
|
||||
class FakeReplayRunner:
    """Replay-runner test double: records each request and returns a canned result."""

    def __init__(self) -> None:
        # Every request handed to ``run_arm``, in call order.
        self.requests = []

    async def run_arm(self, request):
        """Record ``request`` and return a fixed, successful-looking arm result.

        Only ``case_id``, ``arm`` and ``task_text`` are read from ``request``;
        every other field of the returned dict is constant.
        """
        self.requests.append(request)

        # A single executed ``echo`` call, identical for every request.
        echo_call = {
            "tool_name": "echo",
            "mode": "executed",
            "arguments": {"text": "ok"},
            "result": {"success": True, "content": "ok"},
        }
        arm_result = {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "debug deployment startup done",
            "tool_calls": [echo_call],
            "artifacts": [],
            "side_effects": [],
        }
        return arm_result
|
||||
|
||||
|
||||
def _pipeline(tmp_path: Path) -> SkillLearningPipelineService:
|
||||
spec_store = SkillSpecStore(tmp_path)
|
||||
run_store = RunMemoryStore(tmp_path / "memory" / "runs")
|
||||
@ -109,6 +137,28 @@ def test_worker_synthesizes_open_candidate_without_publish(tmp_path: Path) -> No
|
||||
assert pipeline.list_drafts(candidate.draft_skill_name)[0].status == "draft"
|
||||
|
||||
|
||||
def test_worker_evaluates_draft_with_replay_runner_when_available(tmp_path: Path) -> None:
    """Check that a worker given a replay-runner factory produces a replay eval report.

    After one ``run_once`` pass the draft linked to ``candidate-1`` must have an
    eval report in ``replay`` mode with case reports, and the fake runner must
    have received at least one request.
    """
    pipeline = _pipeline(tmp_path)
    replay_runner = FakeReplayRunner()

    def _make_bundle():
        return _bundle(JsonProvider())

    def _make_replay_runner():
        # Always hand back the same fake so its recorded requests can be inspected.
        return replay_runner

    worker_config = SkillLearningWorkerConfig(max_drafts_per_run=5, max_retries=3, interval_seconds=1)
    worker = SkillLearningWorker(
        pipeline=pipeline,
        provider_bundle_factory=_make_bundle,
        replay_runner_factory=_make_replay_runner,
        config=worker_config,
    )

    result = asyncio.run(worker.run_once())

    candidate = pipeline.get_candidate("candidate-1")
    draft_skill_name = candidate.draft_skill_name or ""
    draft_id = candidate.draft_id or ""
    draft = pipeline.get_draft(draft_skill_name, draft_id)
    report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)

    assert result.succeeded == 1
    assert report is not None
    assert report.mode == "replay"
    assert report.case_reports
    assert replay_runner.requests
|
||||
|
||||
|
||||
def test_worker_retries_and_marks_failed_after_limit(tmp_path: Path) -> None:
|
||||
pipeline = _pipeline(tmp_path)
|
||||
worker = SkillLearningWorker(
|
||||
@ -147,6 +197,7 @@ def test_synthesizer_fills_missing_tools_from_evidence(tmp_path: Path) -> None:
|
||||
)
|
||||
|
||||
assert payload["frontmatter"]["tools"] == ["web_fetch", "memory"]
|
||||
assert is_canonical_skill_body(payload["content"])
|
||||
|
||||
|
||||
def test_evidence_selector_records_run_tool_names(tmp_path: Path) -> None:
|
||||
|
||||
@ -218,6 +218,45 @@ def test_unrelated_new_task_auto_accepts_previous_task(tmp_path: Path) -> None:
|
||||
assert current.run_ids == [second.run_id]
|
||||
|
||||
|
||||
def test_standalone_realtime_repeat_creates_new_task_in_same_session(tmp_path: Path) -> None:
    """Check that a near-identical repeat question opens a new task in the session.

    Two weather questions are sent in the same session; the second provider
    bundle is built with ``route_action="continue_task"``. The test expects two
    distinct tasks on that session: the first ``closed`` holding only the first
    run, the second ``awaiting_acceptance`` holding only the second run.
    """
    session_id = "feishu:group-weather"
    planner_loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner(),
    )
    service = AgentService(loader=planner_loader)

    def _ask(text, bundle):
        # One direct, synchronous round trip through the service.
        return asyncio.run(
            service.process_direct(
                text,
                session_id=session_id,
                provider_bundle=bundle,
            )
        )

    first = _ask("珠海天气怎样", _bundle("Weather result"))
    second = _ask("珠海天气怎么样", _bundle("Fresh weather result", route_action="continue_task"))

    task_service = service.create_loop().boot().task_service
    assert task_service is not None

    previous = task_service.get_task(first.task_id or "")
    current = task_service.get_task(second.task_id or "")
    assert previous is not None
    assert current is not None

    assert previous.session_id == session_id
    assert current.session_id == session_id
    assert current.task_id != previous.task_id

    assert previous.status == "closed"
    assert previous.run_ids == [first.run_id]
    assert current.status == "awaiting_acceptance"
    assert current.run_ids == [second.run_id]
|
||||
|
||||
|
||||
def test_related_follow_up_continues_active_task_without_accepting_it(tmp_path: Path) -> None:
|
||||
service = AgentService(
|
||||
loader=EngineLoader(
|
||||
|
||||
@ -102,6 +102,58 @@ tools:
|
||||
assert [spec.name for spec in selected] == ["memory", "terminal", "search_files"]
|
||||
|
||||
|
||||
def test_tool_assembler_uses_required_tools_section_when_frontmatter_omits_tools(tmp_path: Path) -> None:
    """Check that tool hints come from the body's ``## Required Tools`` section.

    The skill's frontmatter has no ``tools`` key, while its body lists
    ``terminal`` and ``search_files`` under ``## Required Tools``. The loaded
    record must expose those two names as ``tool_hints``, and assembling with
    that skill activated must select ``memory``, ``terminal``,
    ``search_files`` and ``echo`` in that order.
    """
    skill_markdown = """---
name: docker-debug
description: Debug Docker issues.
---

# Docker Debug

## Overview

Debug Docker issues.

## Required Tools

- `terminal`
- `search_files`

## Workflow

Inspect logs and search related files.
"""
    skill_dir = tmp_path / "skills" / "docker-debug"
    skill_dir.mkdir(parents=True)
    skill_file = skill_dir / "SKILL.md"
    skill_file.write_text(skill_markdown, encoding="utf-8")

    registry = ToolRegistry()
    registered_tools = (
        DummyTool("memory", toolset="memory", always_available=True),
        DummyTool("terminal", toolset="shell"),
        DummyTool("search_files", toolset="file"),
        DummyTool("echo", toolset="debug"),
    )
    for tool in registered_tools:
        registry.register(tool)

    assembler = ToolAssembler(retriever=StaticRetriever())
    loader = SkillsLoader(tmp_path)

    record = loader.get_skill_record("docker-debug")
    assert record is not None
    assert record.tool_hints == ["terminal", "search_files"]

    activated = [SkillContext(name="docker-debug", content="", tool_hints=record.tool_hints)]
    selected = asyncio.run(
        assembler.assemble(
            task_description="排查 Docker 容器日志",
            registry=registry,
            skills_loader=loader,
            activated_skills=activated,
            top_k=1,
        )
    )

    selected_names = [spec.name for spec in selected]
    assert selected_names == ["memory", "terminal", "search_files", "echo"]
|
||||
|
||||
|
||||
def test_embedding_fallback_can_return_all_or_top_k() -> None:
|
||||
candidates = [{"name": f"tool_{index}", "description": "", "input_schema": "{}"} for index in range(3)]
|
||||
retriever = EmbeddingRetriever(api_key_env="MISSING_EMBEDDING_KEY", api_base_env="MISSING_EMBEDDING_BASE")
|
||||
|
||||
21
app-instance/backend/tests/unit/test_web_cors.py
Normal file
21
app-instance/backend/tests/unit/test_web_cors.py
Normal file
@ -0,0 +1,21 @@
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from beaver.interfaces.web.app import create_app
|
||||
|
||||
|
||||
def test_local_frontend_origin_can_preflight_api_requests() -> None:
    """Check that a CORS preflight from the local frontend origin is accepted.

    An ``OPTIONS`` request to ``/api/auth/me`` from ``http://127.0.0.1:3080``
    asking for ``GET`` with the ``authorization`` header must return 200, echo
    the origin in ``access-control-allow-origin`` and list ``authorization``
    among the allowed headers.
    """
    frontend_origin = "http://127.0.0.1:3080"
    preflight_headers = {
        "Origin": frontend_origin,
        "Access-Control-Request-Method": "GET",
        "Access-Control-Request-Headers": "authorization",
    }

    app = create_app(service=None, manage_service_lifecycle=False)
    client = TestClient(app)
    response = client.options("/api/auth/me", headers=preflight_headers)

    assert response.status_code == 200
    assert response.headers["access-control-allow-origin"] == frontend_origin
    allowed_headers = response.headers["access-control-allow-headers"].lower()
    assert "authorization" in allowed_headers
|
||||
@ -28,8 +28,10 @@ import {
|
||||
deleteUserFile,
|
||||
createUserFileDir,
|
||||
getAccessToken,
|
||||
isApiError,
|
||||
} from '@/lib/api';
|
||||
import type { UserFileContent, UserFileItem } from '@/lib/api';
|
||||
import { canMutateUserFilesPath } from '@/lib/user-file-paths';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { ScrollArea } from '@/components/ui/scroll-area';
|
||||
import { type AppLocale, pickAppText } from '@/lib/i18n/core';
|
||||
@ -44,6 +46,10 @@ function sleep(ms: number): Promise<void> {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Report whether `error` matches HTTP status 401 according to `isApiError`.
 */
function isAuthError(error: unknown): boolean {
  const unauthorizedStatus = 401;
  return isApiError(error, unauthorizedStatus);
}
|
||||
|
||||
export default function FilesPage() {
|
||||
const { locale } = useAppI18n();
|
||||
const [items, setItems] = useState<UserFileItem[]>([]);
|
||||
@ -78,6 +84,9 @@ export default function FilesPage() {
|
||||
return;
|
||||
} catch (err) {
|
||||
lastError = err;
|
||||
if (isAuthError(err)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
const message = lastError instanceof Error ? lastError.message : pickAppText(locale, '加载文件失败', 'Failed to load files');
|
||||
@ -156,6 +165,15 @@ export default function FilesPage() {
|
||||
const handleUpload = async (e: React.ChangeEvent<HTMLInputElement>) => {
|
||||
const files = e.target.files;
|
||||
if (!files || files.length === 0) return;
|
||||
if (!canMutateUserFilesPath(currentPath)) {
|
||||
setLoadError(pickAppText(
|
||||
locale,
|
||||
'请先进入 uploads、outputs、shared 或 tasks 目录后再上传。',
|
||||
'Open uploads, outputs, shared, or tasks before uploading.'
|
||||
));
|
||||
if (fileInputRef.current) fileInputRef.current.value = '';
|
||||
return;
|
||||
}
|
||||
|
||||
setUploading(true);
|
||||
setUploadProgress(0);
|
||||
@ -178,6 +196,14 @@ export default function FilesPage() {
|
||||
const handleCreateDir = async () => {
|
||||
const name = newDirName.trim();
|
||||
if (!name) return;
|
||||
if (!canMutateUserFilesPath(currentPath)) {
|
||||
setLoadError(pickAppText(
|
||||
locale,
|
||||
'请先进入 uploads、outputs、shared 或 tasks 目录后再新建文件夹。',
|
||||
'Open uploads, outputs, shared, or tasks before creating a folder.'
|
||||
));
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const dirPath = currentPath ? `${currentPath}/${name}` : name;
|
||||
await createUserFileDir(dirPath);
|
||||
@ -191,6 +217,7 @@ export default function FilesPage() {
|
||||
|
||||
// Build breadcrumbs
|
||||
const breadcrumbs = currentPath ? currentPath.split('/') : [];
|
||||
const canMutateCurrentPath = canMutateUserFilesPath(currentPath);
|
||||
|
||||
const formatSize = (bytes: number | null) => {
|
||||
if (bytes === null || bytes === undefined) return '';
|
||||
@ -224,7 +251,12 @@ export default function FilesPage() {
|
||||
size="sm"
|
||||
className="h-11"
|
||||
onClick={() => setShowMkdir(true)}
|
||||
disabled={loading}
|
||||
disabled={loading || !canMutateCurrentPath}
|
||||
title={
|
||||
canMutateCurrentPath
|
||||
? undefined
|
||||
: pickAppText(locale, '先进入 uploads、outputs、shared 或 tasks', 'Open uploads, outputs, shared, or tasks first')
|
||||
}
|
||||
>
|
||||
<FolderPlus className="w-4 h-4 mr-1" />
|
||||
{pickAppText(locale, '新建文件夹', 'New folder')}
|
||||
@ -234,7 +266,12 @@ export default function FilesPage() {
|
||||
size="sm"
|
||||
className="h-11"
|
||||
onClick={() => fileInputRef.current?.click()}
|
||||
disabled={uploading}
|
||||
disabled={uploading || !canMutateCurrentPath}
|
||||
title={
|
||||
canMutateCurrentPath
|
||||
? undefined
|
||||
: pickAppText(locale, '先进入 uploads、outputs、shared 或 tasks', 'Open uploads, outputs, shared, or tasks first')
|
||||
}
|
||||
>
|
||||
{uploading ? (
|
||||
<>
|
||||
@ -272,6 +309,15 @@ export default function FilesPage() {
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
{!canMutateCurrentPath && !loading && (
|
||||
<p className="mb-4 rounded-md border border-[#E6E1DE] bg-muted/40 px-3 py-2 text-sm text-muted-foreground">
|
||||
{pickAppText(
|
||||
locale,
|
||||
'请选择 uploads、outputs、shared 或 tasks 后再上传或新建文件夹。',
|
||||
'Select uploads, outputs, shared, or tasks before uploading or creating folders.'
|
||||
)}
|
||||
</p>
|
||||
)}
|
||||
|
||||
{/* Breadcrumbs */}
|
||||
<div className="flex items-center gap-1 mb-4 text-sm text-muted-foreground flex-wrap">
|
||||
|
||||
@ -5,7 +5,6 @@ import { usePathname, useRouter, useSearchParams } from 'next/navigation';
|
||||
import {
|
||||
AlertCircle,
|
||||
BarChart3,
|
||||
Check,
|
||||
CheckCircle2,
|
||||
ChevronDown,
|
||||
ClipboardList,
|
||||
@ -31,7 +30,6 @@ import ReactMarkdown from 'react-markdown';
|
||||
import remarkGfm from 'remark-gfm';
|
||||
|
||||
import {
|
||||
approveSkillDraft,
|
||||
deleteSkill,
|
||||
disablePublishedSkill,
|
||||
downloadSkill,
|
||||
@ -436,11 +434,6 @@ export default function SkillsPage() {
|
||||
submitSkillDraft(draft.skill_name, draft.draft_id)
|
||||
)
|
||||
}
|
||||
onApprove={() =>
|
||||
runAction(`approve:${draft.draft_id}`, () =>
|
||||
approveSkillDraft(draft.skill_name, draft.draft_id)
|
||||
)
|
||||
}
|
||||
onReject={() =>
|
||||
runAction(`reject:${draft.draft_id}`, () =>
|
||||
rejectSkillDraft(draft.skill_name, draft.draft_id)
|
||||
@ -799,7 +792,6 @@ function DraftCard({
|
||||
draft,
|
||||
actionId,
|
||||
onSubmit,
|
||||
onApprove,
|
||||
onReject,
|
||||
onRecheckSafety,
|
||||
onPublish,
|
||||
@ -807,7 +799,6 @@ function DraftCard({
|
||||
draft: SkillDraft;
|
||||
actionId: string | null;
|
||||
onSubmit: () => Promise<unknown>;
|
||||
onApprove: () => Promise<unknown>;
|
||||
onReject: () => Promise<unknown>;
|
||||
onRecheckSafety: () => Promise<unknown>;
|
||||
onPublish: (confirmHighRisk: boolean) => Promise<unknown>;
|
||||
@ -820,8 +811,10 @@ function DraftCard({
|
||||
const frontmatter = draft.proposed_frontmatter || {};
|
||||
const description = String(frontmatter.description || '').trim();
|
||||
const toolHints = normalizeStringList(frontmatter.tools);
|
||||
const submittedForReview = draft.status === 'in_review' || draft.status === 'approved';
|
||||
const isRevision = draft.proposal_kind === 'revise_skill' && Boolean(draft.base_skill);
|
||||
const publishBlocked =
|
||||
draft.status !== 'approved'
|
||||
!submittedForReview
|
||||
|| !safety
|
||||
|| safety.risk_level === 'critical'
|
||||
|| (evalReport?.status !== 'skipped_provider_unavailable' && evalReport?.passed === false);
|
||||
@ -833,7 +826,6 @@ function DraftCard({
|
||||
].filter(Boolean).join('\n');
|
||||
const safetyBlocksReview = Boolean(safety && (!safety.passed || safety.risk_level === 'critical'));
|
||||
const submitBlocked = draft.status !== 'draft' || safetyBlocksReview;
|
||||
const approveBlocked = draft.status !== 'in_review' || safetyBlocksReview;
|
||||
const rejectBlocked = !REJECTABLE_DRAFT_STATUSES.has(draft.status);
|
||||
const canPublishLabel = publishBlocked
|
||||
? publishBlockReason(draft, t)
|
||||
@ -878,7 +870,12 @@ function DraftCard({
|
||||
<p className={`mt-1 text-sm leading-6 text-muted-foreground ${containedLongTextClass}`}>
|
||||
{draft.reason || description || t('没有提供草稿说明。', 'No draft notes were provided.')}
|
||||
</p>
|
||||
<div className="mt-3 grid gap-3 md:grid-cols-3">
|
||||
{draft.proposal_kind === 'revise_skill' && draft.base_version && (
|
||||
<div className="mt-2 text-sm font-medium text-muted-foreground">
|
||||
{draft.skill_name}: {draft.base_version} → {draft.target_version || t('下一版本', 'Next version')}
|
||||
</div>
|
||||
)}
|
||||
<div className="mt-3 grid gap-3 md:grid-cols-4">
|
||||
<ReadableFact
|
||||
icon={<FileCode2 className="h-4 w-4" />}
|
||||
label={t('草稿内容', 'Draft content')}
|
||||
@ -889,6 +886,11 @@ function DraftCard({
|
||||
label={t('基线版本', 'Base version')}
|
||||
value={draft.base_version || t('新增技能,无基线', 'New skill, no base')}
|
||||
/>
|
||||
<ReadableFact
|
||||
icon={<GitCompare className="h-4 w-4" />}
|
||||
label={t('目标版本', 'Target version')}
|
||||
value={draft.target_version || '-'}
|
||||
/>
|
||||
<ReadableFact
|
||||
icon={<Info className="h-4 w-4" />}
|
||||
label={t('来源', 'Source')}
|
||||
@ -912,10 +914,6 @@ function DraftCard({
|
||||
<Send className="mr-2 h-4 w-4" />
|
||||
{t('送审', 'Submit')}
|
||||
</Button>
|
||||
<Button variant="outline" size="sm" className="h-11" disabled={busy || approveBlocked} onClick={() => void onApprove()}>
|
||||
<Check className="mr-2 h-4 w-4" />
|
||||
{t('批准', 'Approve')}
|
||||
</Button>
|
||||
<Button variant="outline" size="sm" className="h-11" disabled={busy || rejectBlocked} onClick={() => void onReject()}>
|
||||
<XCircle className="mr-2 h-4 w-4" />
|
||||
{t('拒绝', 'Reject')}
|
||||
@ -926,7 +924,7 @@ function DraftCard({
|
||||
</Button>
|
||||
<Button size="sm" className="h-11" disabled={busy || publishBlocked} onClick={handlePublish}>
|
||||
<Rocket className="mr-2 h-4 w-4" />
|
||||
{t('发布', 'Publish')}
|
||||
{draft.proposal_kind === 'revise_skill' ? t('发布修订', 'Publish revision') : t('发布', 'Publish')}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
@ -936,7 +934,7 @@ function DraftCard({
|
||||
<div className="mb-3 flex flex-wrap items-center justify-between gap-2">
|
||||
<div className="flex items-center gap-2 text-sm font-medium">
|
||||
<FileText className="h-4 w-4 text-muted-foreground" />
|
||||
{t('拟发布的技能正文', 'Proposed skill body')}
|
||||
{isRevision ? t('修改对比', 'Revision comparison') : t('拟发布的技能正文', 'Proposed skill body')}
|
||||
</div>
|
||||
{toolHints.length > 0 && (
|
||||
<div className="flex flex-wrap gap-1">
|
||||
@ -948,7 +946,14 @@ function DraftCard({
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
{draft.proposed_content.trim() ? (
|
||||
{isRevision && draft.base_skill ? (
|
||||
<RevisionComparison
|
||||
baseVersion={draft.base_version || draft.base_skill.version}
|
||||
targetVersion={draft.target_version || t('下一版本', 'Next version')}
|
||||
baseContent={draft.base_skill.content}
|
||||
proposedContent={draft.proposed_content}
|
||||
/>
|
||||
) : draft.proposed_content.trim() ? (
|
||||
<MarkdownPreview content={draft.proposed_content} />
|
||||
) : (
|
||||
<p className="text-sm text-muted-foreground">{t('草稿没有正文内容。', 'This draft has no body content.')}</p>
|
||||
@ -960,7 +965,7 @@ function DraftCard({
|
||||
title={t('发布门禁', 'Publish gates')}
|
||||
summary={canPublishLabel}
|
||||
items={[
|
||||
{ label: t('草稿已批准', 'Draft approved'), ok: draft.status === 'approved' },
|
||||
{ label: t('草稿已送审', 'Draft submitted'), ok: submittedForReview },
|
||||
{ label: t('安全报告通过', 'Safety passed'), ok: Boolean(safety?.passed) && safety?.risk_level !== 'critical' },
|
||||
{
|
||||
label: t('评估未回退', 'No eval regression'),
|
||||
@ -971,6 +976,7 @@ function DraftCard({
|
||||
<RawDetails
|
||||
title={t('原始草稿内容', 'Raw draft payload')}
|
||||
payload={{
|
||||
base_skill: draft.base_skill,
|
||||
proposed_frontmatter: draft.proposed_frontmatter,
|
||||
proposed_content: draft.proposed_content,
|
||||
evidence_refs: draft.evidence_refs,
|
||||
@ -1040,6 +1046,71 @@ function SafetyReportPanel({ report }: { report?: SkillDraftSafetyReport | null
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Side-by-side view of a skill revision: a header row with the base and target
 * version badges plus added/removed/changed line counts, followed by two panes
 * showing the current content and the proposed content.
 */
function RevisionComparison(props: {
  baseVersion: string;
  targetVersion: string;
  baseContent: string;
  proposedContent: string;
}) {
  const { baseVersion, targetVersion, baseContent, proposedContent } = props;
  const { locale } = useAppI18n();
  const t = (zh: string, en: string) => pickAppText(locale, zh, en);
  // Line counts shown next to the version badges.
  const { added, removed, changed } = lineDiffSummary(baseContent, proposedContent);

  return (
    <div className="space-y-3">
      <div className="flex flex-wrap gap-2 text-xs text-muted-foreground">
        <Badge variant="outline">{baseVersion}</Badge>
        <span>→</span>
        <Badge variant="default">{targetVersion}</Badge>
        <span>{t('新增', 'Added')}: {added}</span>
        <span>{t('删除', 'Removed')}: {removed}</span>
        <span>{t('修改', 'Changed')}: {changed}</span>
      </div>
      <div className="grid min-w-0 gap-3 lg:grid-cols-2">
        <DiffPane title={t('当前版本', 'Current version')} content={baseContent} />
        <DiffPane title={t('草稿修订', 'Draft revision')} content={proposedContent} />
      </div>
    </div>
  );
}
|
||||
|
||||
/**
 * Bordered pane with a title bar and a scrollable preformatted body.
 * Content that is empty after trimming is rendered as "-".
 */
function DiffPane({ title, content }: { title: string; content: string }) {
  const visibleText = content.trim() || '-';
  const bodyClassName = `max-h-[520px] overflow-auto p-3 text-xs leading-5 ${containedLongTextClass}`;
  return (
    <div className="min-w-0 rounded-md border border-border bg-white">
      <div className="border-b border-border px-3 py-2 text-xs font-medium text-muted-foreground">{title}</div>
      <pre className={bodyClassName}>{visibleText}</pre>
    </div>
  );
}
|
||||
|
||||
/**
 * Summarize the line-level difference between two text bodies.
 *
 * Lines are aligned with a longest-common-subsequence match rather than by
 * position, so inserting or deleting one line no longer makes every line
 * after it count as "changed". Within each run of unmatched lines, removed
 * and added lines are paired up as `changed`; the surplus on either side is
 * reported as `removed` or `added`.
 *
 * @param baseContent - the current text; split on `\n` or `\r\n`.
 * @param proposedContent - the revised text; split the same way.
 * @returns counts of added, removed and changed lines.
 */
function lineDiffSummary(baseContent: string, proposedContent: string): { added: number; removed: number; changed: number } {
  const baseLines = baseContent.split(/\r?\n/);
  const proposedLines = proposedContent.split(/\r?\n/);

  // Strip the shared prefix and suffix first: typical revisions touch a small
  // region, which keeps the quadratic table below small.
  const sharedLimit = Math.min(baseLines.length, proposedLines.length);
  let prefix = 0;
  while (prefix < sharedLimit && baseLines[prefix] === proposedLines[prefix]) {
    prefix += 1;
  }
  let suffix = 0;
  while (
    suffix < sharedLimit - prefix
    && baseLines[baseLines.length - 1 - suffix] === proposedLines[proposedLines.length - 1 - suffix]
  ) {
    suffix += 1;
  }
  const oldLines = baseLines.slice(prefix, baseLines.length - suffix);
  const newLines = proposedLines.slice(prefix, proposedLines.length - suffix);
  const rows = oldLines.length;
  const cols = newLines.length;

  let added = 0;
  let removed = 0;
  let changed = 0;
  // Fold one run of unmatched lines into the totals: removed/added lines are
  // paired as "changed", and whatever is left over stays removed or added.
  const settleHunk = (hunkRemoved: number, hunkAdded: number) => {
    const paired = Math.min(hunkRemoved, hunkAdded);
    changed += paired;
    removed += hunkRemoved - paired;
    added += hunkAdded - paired;
  };

  // Nothing to align on one side, or the table would be unreasonably large:
  // treat the whole differing region as a single hunk.
  const maxTableCells = 4000000;
  if (rows === 0 || cols === 0 || rows * cols > maxTableCells) {
    settleHunk(rows, cols);
    return { added, removed, changed };
  }

  // table[r * width + c] holds the LCS length of oldLines[r..] and newLines[c..].
  const width = cols + 1;
  const table = new Uint32Array((rows + 1) * width);
  const lcsAt = (r: number, c: number): number => table[r * width + c] ?? 0;
  for (let r = rows - 1; r >= 0; r -= 1) {
    for (let c = cols - 1; c >= 0; c -= 1) {
      table[r * width + c] = oldLines[r] === newLines[c]
        ? lcsAt(r + 1, c + 1) + 1
        : Math.max(lcsAt(r + 1, c), lcsAt(r, c + 1));
    }
  }

  // Walk the alignment, accumulating unmatched lines until the next match.
  let r = 0;
  let c = 0;
  let hunkRemoved = 0;
  let hunkAdded = 0;
  while (r < rows && c < cols) {
    if (oldLines[r] === newLines[c]) {
      settleHunk(hunkRemoved, hunkAdded);
      hunkRemoved = 0;
      hunkAdded = 0;
      r += 1;
      c += 1;
    } else if (lcsAt(r + 1, c) >= lcsAt(r, c + 1)) {
      hunkRemoved += 1;
      r += 1;
    } else {
      hunkAdded += 1;
      c += 1;
    }
  }
  // Whatever remains on either side belongs to the final hunk.
  settleHunk(hunkRemoved + (rows - r), hunkAdded + (cols - c));
  return { added, removed, changed };
}
|
||||
|
||||
function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
const { locale } = useAppI18n();
|
||||
const t = (zh: string, en: string) => pickAppText(locale, zh, en);
|
||||
@ -1066,6 +1137,15 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
const abilitySummary = report.ability_score_summary || {};
|
||||
const toolExecutionSummary = report.tool_execution_summary || report.tool_mode_summary || {};
|
||||
const caseSelectionSummary = report.case_selection_summary || {};
|
||||
const realScore = report.real_score_avg ?? abilitySummary.real_score_avg;
|
||||
const syntheticScore = report.synthetic_score_avg ?? abilitySummary.synthetic_score_avg;
|
||||
const overallScore = report.overall_score_avg ?? abilitySummary.overall_score_avg ?? report.candidate_score_avg;
|
||||
const realCaseCount = toNumber(abilitySummary.real_case_count);
|
||||
const syntheticCaseCount = toNumber(abilitySummary.synthetic_case_count);
|
||||
const excludedSynthetic = toNumber(caseSelectionSummary.excluded_synthetic_without_validator);
|
||||
return (
|
||||
<div className="min-w-0 rounded-md border border-border bg-muted/20 p-4">
|
||||
<div className="mb-3 flex flex-wrap items-center justify-between gap-2">
|
||||
@ -1079,8 +1159,8 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
</div>
|
||||
|
||||
<div className="grid gap-2 sm:grid-cols-3">
|
||||
<MetricTile label={t('基线均分', 'Baseline avg')} value={formatScore(report.baseline_score_avg)} />
|
||||
<MetricTile label={t('候选均分', 'Candidate avg')} value={formatScore(report.candidate_score_avg)} />
|
||||
<MetricTile label={t('基线能力均分', 'Baseline ability')} value={formatScore(report.baseline_score_avg)} />
|
||||
<MetricTile label={t('候选能力均分', 'Candidate ability')} value={formatScore(report.candidate_score_avg)} />
|
||||
<MetricTile
|
||||
label={t('变化', 'Delta')}
|
||||
value={`${report.score_delta >= 0 ? '+' : ''}${formatScore(report.score_delta)}`}
|
||||
@ -1089,8 +1169,14 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
</div>
|
||||
|
||||
<div className="mt-3 grid gap-2 sm:grid-cols-3">
|
||||
<MetricTile label={t('执行覆盖', 'Execution')} value={formatPercent(report.execution_coverage)} />
|
||||
<MetricTile label={t('替代评估', 'Surrogate')} value={formatPercent(report.surrogate_coverage)} />
|
||||
<MetricTile label={t('真实案例均分', 'Real avg')} value={formatOptionalScore(realScore)} />
|
||||
<MetricTile label={t('模拟案例均分', 'Synthetic avg')} value={formatOptionalScore(syntheticScore)} />
|
||||
<MetricTile label={t('总体能力分', 'Overall ability')} value={formatOptionalScore(overallScore)} />
|
||||
</div>
|
||||
|
||||
<div className="mt-3 grid gap-2 sm:grid-cols-3">
|
||||
<MetricTile label={t('工具执行覆盖', 'Tool execution')} value={formatPercent(toOptionalNumber(toolExecutionSummary.executed) ?? report.execution_coverage)} />
|
||||
<MetricTile label={t('替代工具评估', 'Tool surrogate')} value={formatPercent(toOptionalNumber(toolExecutionSummary.surrogate) ?? report.surrogate_coverage)} />
|
||||
<MetricTile label={t('置信度', 'Confidence')} value={report.confidence || 'low'} />
|
||||
</div>
|
||||
|
||||
@ -1100,6 +1186,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('不变', 'Unchanged')} value={String(report.unchanged_count)} />
|
||||
</div>
|
||||
|
||||
<div className="mt-3 grid gap-2 sm:grid-cols-3">
|
||||
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('真实案例', 'Real cases')} value={String(realCaseCount)} />
|
||||
<ReadableFact icon={<Info className="h-4 w-4" />} label={t('模拟案例', 'Synthetic cases')} value={String(syntheticCaseCount)} />
|
||||
<ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('无验证器已排除', 'No-validator excluded')} value={String(excludedSynthetic)} />
|
||||
</div>
|
||||
|
||||
{report.cases.length > 0 && (
|
||||
<div className="mt-3 overflow-hidden rounded-md border border-border bg-white">
|
||||
<div className="border-b border-border px-3 py-2 text-xs font-medium text-muted-foreground">
|
||||
@ -1114,6 +1206,10 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
<MetricTile label={t('候选', 'Candidate')} value={formatScore(toNumber(item.candidate_score))} />
|
||||
<MetricTile label={t('变化', 'Delta')} value={formatSignedScore(toNumber(item.delta))} />
|
||||
</div>
|
||||
<div className="mt-2 text-muted-foreground">
|
||||
{String(item.synthetic) === 'true' ? t('模拟案例', 'Synthetic case') : t('真实案例', 'Real case')}
|
||||
{item.tier ? ` · ${String(item.tier)}` : ''}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
@ -1122,6 +1218,7 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
<thead className="bg-muted/40 text-muted-foreground">
|
||||
<tr>
|
||||
<th className="px-3 py-2 font-medium">{t('运行', 'Run')}</th>
|
||||
<th className="px-3 py-2 font-medium">{t('来源', 'Source')}</th>
|
||||
<th className="px-3 py-2 font-medium">{t('基线', 'Baseline')}</th>
|
||||
<th className="px-3 py-2 font-medium">{t('候选', 'Candidate')}</th>
|
||||
<th className="px-3 py-2 font-medium">{t('变化', 'Delta')}</th>
|
||||
@ -1131,6 +1228,10 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
{report.cases.map((item, index) => (
|
||||
<tr key={`${String(item.run_id || index)}:${index}`} className="border-t border-border">
|
||||
<td className="max-w-[160px] truncate px-3 py-2 font-mono">{String(item.run_id || '-')}</td>
|
||||
<td className="px-3 py-2">
|
||||
{String(item.synthetic) === 'true' ? t('模拟', 'Synthetic') : t('真实', 'Real')}
|
||||
{item.tier ? ` · ${String(item.tier)}` : ''}
|
||||
</td>
|
||||
<td className="px-3 py-2">{formatScore(toNumber(item.baseline_score))}</td>
|
||||
<td className="px-3 py-2">{formatScore(toNumber(item.candidate_score))}</td>
|
||||
<td className="px-3 py-2">{formatSignedScore(toNumber(item.delta))}</td>
|
||||
@ -1144,6 +1245,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
{Array.isArray(report.case_reports) && report.case_reports.length > 0 ? (
|
||||
<RawDetails title={t('Replay case reports', 'Replay case reports')} payload={report.case_reports} />
|
||||
) : null}
|
||||
{Object.keys(abilitySummary).length > 0 ? (
|
||||
<RawDetails title={t('能力评分汇总', 'Ability score summary')} payload={abilitySummary} />
|
||||
) : null}
|
||||
{Object.keys(toolExecutionSummary).length > 0 ? (
|
||||
<RawDetails title={t('工具诊断汇总', 'Tool diagnostic summary')} payload={toolExecutionSummary} />
|
||||
) : null}
|
||||
{report.preservation_report ? (
|
||||
<RawDetails title={t('Preservation report', 'Preservation report')} payload={report.preservation_report} />
|
||||
) : null}
|
||||
@ -1366,7 +1473,9 @@ function triggerReasonLabel(reason: string, t: (zh: string, en: string) => strin
|
||||
}
|
||||
|
||||
function publishBlockReason(draft: SkillDraft, t: (zh: string, en: string) => string): string {
|
||||
if (draft.status !== 'approved') return t('草稿还没有批准,不能发布。', 'The draft is not approved yet.');
|
||||
if (draft.status !== 'in_review' && draft.status !== 'approved') {
|
||||
return t('草稿还没有送审,不能发布。', 'The draft has not been submitted yet.');
|
||||
}
|
||||
if (!draft.safety_report) return t('缺少安全报告,不能发布。', 'A safety report is required before publishing.');
|
||||
if (draft.safety_report.risk_level === 'critical' || !draft.safety_report.passed) {
|
||||
return t('安全报告存在阻断项,不能发布。', 'The safety report has blockers.');
|
||||
@ -1399,6 +1508,11 @@ function formatScore(value: number): string {
|
||||
return value.toFixed(2);
|
||||
}
|
||||
|
||||
/**
 * Format a possibly-missing score: values that `toOptionalNumber` cannot turn
 * into a number render as "-", everything else goes through `formatScore`.
 */
function formatOptionalScore(value: unknown): string {
  const score = toOptionalNumber(value);
  if (typeof score !== 'number') {
    return '-';
  }
  return formatScore(score);
}
|
||||
|
||||
function formatPercent(value?: number | null): string {
|
||||
if (typeof value !== 'number' || Number.isNaN(value)) return '0%';
|
||||
return `${Math.round(value * 100)}%`;
|
||||
@ -1414,6 +1528,12 @@ function toNumber(value: unknown): number {
|
||||
return Number.isFinite(parsed) ? parsed : 0;
|
||||
}
|
||||
|
||||
/**
 * Coerce `value` to a finite number, or return null when that is not possible.
 *
 * `null`, `undefined` and the empty string are treated as "no value" up front;
 * anything else is passed through `Number(...)` and rejected if the result is
 * not finite (NaN or ±Infinity).
 */
function toOptionalNumber(value: unknown): number | null {
  const isMissing = value === null || value === undefined || value === '';
  if (isMissing) {
    return null;
  }
  const numeric = Number(value);
  if (!Number.isFinite(numeric)) {
    return null;
  }
  return numeric;
}
|
||||
|
||||
function EmptyState({ icon, text }: { icon: React.ReactNode; text: string }) {
|
||||
return (
|
||||
<div className="py-12 text-center text-muted-foreground">
|
||||
@ -1475,7 +1595,7 @@ function UploadSkillForm({
|
||||
className="block w-full cursor-pointer text-sm text-muted-foreground file:mr-4 file:rounded-md file:border-0 file:bg-primary file:px-4 file:py-2 file:text-sm file:font-medium file:text-primary-foreground hover:file:bg-primary/90"
|
||||
/>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{pickAppText(locale, '上传后进入草稿评审,并自动运行 safety 和 eval。', 'After upload, the skill enters draft review and runs safety and eval automatically.')}
|
||||
{pickAppText(locale, '上传后生成草稿;送审后再运行 safety 和 eval。', 'After upload, a draft is created; safety and eval run after submission.')}
|
||||
</p>
|
||||
</div>
|
||||
<div className="flex justify-end gap-2">
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import { useEffect } from 'react';
|
||||
import { usePathname, useRouter, useSearchParams } from 'next/navigation';
|
||||
import { buildAuthPortalUrl } from '@/lib/auth-portal';
|
||||
import { clearTokens, getMe, isLoggedIn } from '@/lib/api';
|
||||
import { AUTH_CLEARED_EVENT, clearTokens, getMe, isLoggedIn } from '@/lib/api';
|
||||
import { pickAppText } from '@/lib/i18n/core';
|
||||
import { useAppI18n } from '@/lib/i18n/provider';
|
||||
import { useChatStore } from '@/lib/store';
|
||||
@ -66,6 +66,18 @@ export default function AuthGuard({
|
||||
};
|
||||
}, [setIsAuthLoading, setUser]);
|
||||
|
||||
useEffect(() => {
|
||||
const handleAuthCleared = () => {
|
||||
setUser(null);
|
||||
setIsAuthLoading(false);
|
||||
};
|
||||
|
||||
window.addEventListener(AUTH_CLEARED_EVENT, handleAuthCleared);
|
||||
return () => {
|
||||
window.removeEventListener(AUTH_CLEARED_EVENT, handleAuthCleared);
|
||||
};
|
||||
}, [setIsAuthLoading, setUser]);
|
||||
|
||||
useEffect(() => {
|
||||
if (isAuthLoading) {
|
||||
return;
|
||||
|
||||
@ -58,6 +58,7 @@ const WS_URL = process.env.NEXT_PUBLIC_WS_URL?.trim();
|
||||
const DEFAULT_API_URL = 'http://127.0.0.1:18080';
|
||||
const ACCESS_TOKEN_KEY = 'beaver_access_token';
|
||||
const REFRESH_TOKEN_KEY = 'beaver_refresh_token';
|
||||
export const AUTH_CLEARED_EVENT = 'beaver-auth-cleared';
|
||||
const REQUEST_TIMEOUT_MS = 8000;
|
||||
const OUTLOOK_REQUEST_TIMEOUT_MS = 45000;
|
||||
const SKILL_LEARNING_REQUEST_TIMEOUT_MS = 120000;
|
||||
@ -117,6 +118,34 @@ type FetchJsonOptions = RequestInit & {
|
||||
timeoutMs?: number;
|
||||
};
|
||||
|
||||
/**
 * Error describing a failed API response.
 *
 * In addition to the human-readable message it carries the HTTP status code
 * and the extracted error detail so callers can branch on them.
 */
export class ApiError extends Error {
  // HTTP status code of the failed response.
  status: number;
  // Error detail text associated with the failed response.
  detail: string;

  /**
   * @param message Human-readable error message.
   * @param options Status code and detail text to attach to the error.
   */
  constructor(message: string, options: { status: number; detail: string }) {
    super(message);
    const { status, detail } = options;
    this.name = 'ApiError';
    this.detail = detail;
    this.status = status;
  }
}
|
||||
|
||||
/**
 * Type guard for `ApiError`.
 *
 * @param error Any caught value.
 * @param status Optional HTTP status to match; when omitted, any `ApiError`
 *   is accepted.
 * @returns `true` when `error` is an `ApiError` and, if `status` was given,
 *   its `status` equals it.
 */
export function isApiError(error: unknown, status?: number): error is ApiError {
  if (!(error instanceof ApiError)) {
    return false;
  }
  if (status === undefined) {
    return true;
  }
  return error.status === status;
}
|
||||
|
||||
/**
 * Extract a readable error message from an API error response body.
 *
 * The body is parsed as JSON and its `detail` field is inspected:
 * - a string `detail` is returned as-is;
 * - an array `detail` (the shape used for request-validation errors, where
 *   each entry carries a `msg`) is flattened to its messages joined by `'; '`;
 * - anything else, including a non-JSON body, falls back to the raw text.
 *
 * @param text Raw response body.
 * @returns The extracted detail message, or `text` unchanged when no detail
 *   can be extracted.
 */
function parseErrorDetail(text: string): string {
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    // Not JSON: keep the raw text.
    return text;
  }

  if (parsed === null || typeof parsed !== 'object') {
    return text;
  }

  const detail = (parsed as { detail?: unknown }).detail;
  if (typeof detail === 'string') {
    return detail;
  }

  if (Array.isArray(detail)) {
    // Validation-style errors: collect each entry's message, skipping
    // entries that do not expose one.
    const messages: string[] = [];
    for (const entry of detail) {
      if (typeof entry === 'string') {
        if (entry !== '') messages.push(entry);
      } else if (entry !== null && typeof entry === 'object') {
        const msg = (entry as { msg?: unknown }).msg;
        if (typeof msg === 'string' && msg !== '') messages.push(msg);
      }
    }
    if (messages.length > 0) {
      return messages.join('; ');
    }
  }

  return text;
}
|
||||
|
||||
function withTimeout(
|
||||
signal?: AbortSignal,
|
||||
timeoutMs: number = REQUEST_TIMEOUT_MS
|
||||
@ -163,6 +192,7 @@ export function clearTokens(): void {
|
||||
if (!isBrowser()) return;
|
||||
localStorage.removeItem(ACCESS_TOKEN_KEY);
|
||||
localStorage.removeItem(REFRESH_TOKEN_KEY);
|
||||
window.dispatchEvent(new CustomEvent(AUTH_CLEARED_EVENT));
|
||||
}
|
||||
|
||||
export function isLoggedIn(): boolean {
|
||||
@ -215,16 +245,11 @@ async function fetchJSON<T>(path: string, options?: FetchJsonOptions): Promise<T
|
||||
if (res.status === 401) {
|
||||
clearTokens();
|
||||
}
|
||||
let detail = text;
|
||||
try {
|
||||
const parsed = JSON.parse(text);
|
||||
if (parsed && typeof parsed.detail === 'string') {
|
||||
detail = parsed.detail;
|
||||
}
|
||||
} catch {
|
||||
// keep raw text
|
||||
}
|
||||
throw new Error(`${pickAppText(locale, '接口错误', 'API error')} ${res.status}: ${detail}`);
|
||||
const detail = parseErrorDetail(text);
|
||||
throw new ApiError(`${pickAppText(locale, '接口错误', 'API error')} ${res.status}: ${detail}`, {
|
||||
status: res.status,
|
||||
detail,
|
||||
});
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
@ -1216,7 +1241,7 @@ export async function uploadSkill(file: File): Promise<Skill> {
|
||||
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
throw new Error(`接口错误 ${res.status}: ${text}`);
|
||||
throw new Error(`接口错误 ${res.status}: ${parseErrorDetail(text)}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
|
||||
8
app-instance/frontend/lib/user-file-paths.ts
Normal file
8
app-instance/frontend/lib/user-file-paths.ts
Normal file
@ -0,0 +1,8 @@
|
||||
// Top-level user-file directories under which mutating actions are allowed.
const USER_FILE_MUTABLE_ROOTS = new Set(['uploads', 'outputs', 'shared', 'tasks']);

/**
 * Decide whether mutating file actions may be offered for a user-files path.
 *
 * The path is trimmed and stripped of leading/trailing slashes; it qualifies
 * only when its first segment is one of the known mutable roots. An empty
 * path (or one consisting only of slashes) never qualifies.
 *
 * @param path User-files path, with or without surrounding slashes.
 * @returns `true` when the path's first segment is a mutable root.
 */
export function canMutateUserFilesPath(path: string): boolean {
  const normalized = path.trim().replace(/^\/+|\/+$/g, '');
  if (normalized.length === 0) {
    return false;
  }
  // The first segment is everything before the first separator, if any.
  const separatorAt = normalized.indexOf('/');
  const topLevel = separatorAt === -1 ? normalized : normalized.slice(0, separatorAt);
  return USER_FILE_MUTABLE_ROOTS.has(topLevel);
}
|
||||
@ -3,9 +3,23 @@ import { resolve } from 'node:path';
|
||||
|
||||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
import { canMutateUserFilesPath } from './user-file-paths';
|
||||
|
||||
const root = resolve(__dirname, '..');
|
||||
|
||||
describe('user file system frontend wiring', () => {
|
||||
it('only enables mutating file actions inside concrete user-file roots', () => {
|
||||
expect(canMutateUserFilesPath('')).toBe(false);
|
||||
expect(canMutateUserFilesPath('/')).toBe(false);
|
||||
expect(canMutateUserFilesPath('qa-folder')).toBe(false);
|
||||
|
||||
expect(canMutateUserFilesPath('uploads')).toBe(true);
|
||||
expect(canMutateUserFilesPath('uploads/qa-folder')).toBe(true);
|
||||
expect(canMutateUserFilesPath('outputs/report.md')).toBe(true);
|
||||
expect(canMutateUserFilesPath('shared')).toBe(true);
|
||||
expect(canMutateUserFilesPath('tasks/task-1')).toBe(true);
|
||||
});
|
||||
|
||||
it('routes API client helpers to user file endpoints', () => {
|
||||
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
|
||||
|
||||
@ -17,6 +31,13 @@ describe('user file system frontend wiring', () => {
|
||||
expect(apiSource).toContain('/api/user-files/mkdir');
|
||||
});
|
||||
|
||||
it('notifies the app shell when API auth is cleared', () => {
|
||||
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
|
||||
|
||||
expect(apiSource).toContain('AUTH_CLEARED_EVENT');
|
||||
expect(apiSource).toContain("window.dispatchEvent(new CustomEvent(AUTH_CLEARED_EVENT))");
|
||||
});
|
||||
|
||||
it('does not wire the Files page to workspace or MinIO management APIs', () => {
|
||||
const pageSource = readFileSync(resolve(root, 'app/(app)/files/page.tsx'), 'utf8');
|
||||
|
||||
@ -29,4 +50,18 @@ describe('user file system frontend wiring', () => {
|
||||
expect(pageSource).not.toContain('accessKey');
|
||||
expect(pageSource).not.toContain('secretKey');
|
||||
});
|
||||
|
||||
it('does not retry user-file loads after an auth failure', () => {
|
||||
const pageSource = readFileSync(resolve(root, 'app/(app)/files/page.tsx'), 'utf8');
|
||||
|
||||
expect(pageSource).toContain('isAuthError');
|
||||
expect(pageSource).toContain('if (isAuthError(err))');
|
||||
});
|
||||
|
||||
it('shows backend upload error details instead of raw JSON payloads', () => {
|
||||
const apiSource = readFileSync(resolve(root, 'lib/api.ts'), 'utf8');
|
||||
|
||||
expect(apiSource).toContain('function parseErrorDetail');
|
||||
expect(apiSource).toContain('throw new Error(`接口错误 ${res.status}: ${parseErrorDetail(text)}`)');
|
||||
});
|
||||
});
|
||||
|
||||
@ -993,6 +993,12 @@ export interface SkillDraftEvalReport {
|
||||
confidence?: 'low' | 'medium' | 'high' | string;
|
||||
case_reports?: Array<Record<string, unknown>>;
|
||||
tool_mode_summary?: Record<string, unknown>;
|
||||
ability_score_summary?: Record<string, unknown>;
|
||||
tool_execution_summary?: Record<string, unknown>;
|
||||
case_selection_summary?: Record<string, unknown>;
|
||||
real_score_avg?: number | null;
|
||||
synthetic_score_avg?: number | null;
|
||||
overall_score_avg?: number | null;
|
||||
preservation_report?: Record<string, unknown> | null;
|
||||
}
|
||||
|
||||
@ -1000,6 +1006,15 @@ export interface SkillDraft {
|
||||
draft_id: string;
|
||||
skill_name: string;
|
||||
base_version?: string | null;
|
||||
target_version?: string | null;
|
||||
base_skill?: {
|
||||
skill_name: string;
|
||||
version: string;
|
||||
frontmatter: Record<string, unknown>;
|
||||
content: string;
|
||||
summary?: string;
|
||||
tool_hints?: string[];
|
||||
} | null;
|
||||
proposed_content: string;
|
||||
proposed_frontmatter: Record<string, unknown>;
|
||||
created_at: string;
|
||||
|
||||
@ -47,6 +47,8 @@ http {
|
||||
|
||||
location /api/ {
|
||||
proxy_pass http://127.0.0.1:18080;
|
||||
proxy_read_timeout 3600;
|
||||
proxy_send_timeout 3600;
|
||||
}
|
||||
|
||||
location /docs {
|
||||
|
||||
Reference in New Issue
Block a user