feat(app): 移除内置agents并添加CORS支持和技能上传优化
移除了agents/registry.json中的所有内置agents配置,将agents数组清空。 为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。 重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。 新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。 更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。 修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
19
app-instance/backend/beaver/skills/authoring/__init__.py
Normal file
19
app-instance/backend/beaver/skills/authoring/__init__.py
Normal file
@ -0,0 +1,19 @@
|
||||
"""Skill authoring helpers."""
|
||||
|
||||
from .format import (
|
||||
CANONICAL_SKILL_SECTION_HEADINGS,
|
||||
canonical_skill_format_instructions,
|
||||
canonicalize_skill_body,
|
||||
ensure_canonical_skill_body,
|
||||
is_canonical_skill_body,
|
||||
normalize_skill_frontmatter,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CANONICAL_SKILL_SECTION_HEADINGS",
|
||||
"canonical_skill_format_instructions",
|
||||
"canonicalize_skill_body",
|
||||
"ensure_canonical_skill_body",
|
||||
"is_canonical_skill_body",
|
||||
"normalize_skill_frontmatter",
|
||||
]
|
||||
250
app-instance/backend/beaver/skills/authoring/format.py
Normal file
250
app-instance/backend/beaver/skills/authoring/format.py
Normal file
@ -0,0 +1,250 @@
|
||||
"""Canonical Beaver skill authoring format."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from beaver.skills.catalog.utils import extract_required_tool_names
|
||||
|
||||
|
||||
# H2 headings that make up a canonical skill body, in their required order.
# `is_canonical_skill_body` looks for them sequentially and
# `canonical_skill_format_instructions` lists them verbatim in the prompt text.
CANONICAL_SKILL_SECTION_HEADINGS: tuple[str, ...] = (
    "## Overview",
    "## When to Use",
    "## Required Tools",
    "## Workflow",
    "## Validation",
    "## Boundaries",
    "## Anti-Patterns",
)
|
||||
|
||||
|
||||
def canonical_skill_format_instructions() -> str:
    """Return the numbered authoring rules for a canonical SKILL.md.

    The text enumerates the frontmatter requirements and lists every entry of
    ``CANONICAL_SKILL_SECTION_HEADINGS`` as a bullet under rule 4. The result
    has no trailing newline.
    """
    section_bullets = "\n".join("- {}".format(heading) for heading in CANONICAL_SKILL_SECTION_HEADINGS)
    segments = (
        "Canonical Beaver SKILL.md format:\n",
        "1. Return a frontmatter object with `name`, `description`, and `tools`.\n",
        "2. `name` must be lowercase kebab-case. `description` must explain when the skill should be used.\n",
        "3. `tools` must be an explicit JSON array of exact runtime tool names. Use [] only if no tool is required.\n",
        "4. The Markdown content must start with one H1 title and include these H2 sections in this exact order:\n",
        section_bullets + "\n",
        "5. Write concrete operational guidance, not a story about a past task.\n",
        "6. Include validation steps and anti-patterns so future runs know how to avoid false completion.",
    )
    return "".join(segments)
|
||||
|
||||
|
||||
def normalize_skill_frontmatter(frontmatter: dict[str, Any] | None, *, skill_name: str) -> dict[str, Any]:
    """Return a cleaned copy of *frontmatter* with the three core keys first.

    Args:
        frontmatter: Raw frontmatter mapping; ``None`` is treated as empty.
        skill_name: Name used when the frontmatter has no truthy ``name``.

    Returns:
        A new dict whose first keys are ``name`` (slugified), ``description``
        (stripped, with a generated default) and ``tools`` (a de-duplicated
        list of strings), followed by every other original key in its
        original order. String values of ``always`` / ``internal`` are turned
        into booleans; everything else is passed through untouched.
    """
    source = dict(frontmatter or {})
    core_keys = {"name", "description", "tools"}
    flag_keys = {"always", "internal"}
    truthy_words = {"1", "true", "yes", "on"}

    slug = _slug(str(source.get("name") or skill_name))
    result: dict[str, Any] = {
        "name": slug,
        "description": str(source.get("description") or f"Use when {slug} guidance is needed.").strip(),
        "tools": _coerce_string_list(source.get("tools")),
    }
    for key, value in source.items():
        if key in core_keys:
            continue
        if key in flag_keys and isinstance(value, str):
            # Frontmatter flags may arrive as text such as "true" or "yes".
            value = value.strip().lower() in truthy_words
        result[key] = value
    return result
|
||||
|
||||
|
||||
def is_canonical_skill_body(body: str) -> bool:
    """Report whether *body* already follows the canonical section layout.

    A body is canonical when it contains at least one H1 line and every entry
    of ``CANONICAL_SKILL_SECTION_HEADINGS`` appears, in order, as a line of
    its own.

    Headings are matched as whole lines rather than as substrings, so a deeper
    heading such as ``### Overview``, a longer one such as ``## Overviews``,
    or a heading quoted in the middle of a sentence no longer counts as the
    canonical H2 section.
    """
    text = body.strip()
    if not re.search(r"^#\s+\S", text, flags=re.MULTILINE):
        return False
    position = 0
    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
        # Trailing blanks (and a stray "\r" from CRLF files) are tolerated.
        pattern = re.compile(rf"^{re.escape(heading)}[ \t\r]*$", flags=re.MULTILINE)
        # With a start offset, "^" still only matches at real line starts,
        # so resuming mid-line cannot fake a heading.
        found = pattern.search(text, position)
        if found is None:
            return False
        position = found.end()
    return True
|
||||
|
||||
|
||||
def ensure_canonical_skill_body(
    body: str,
    *,
    title: str,
    description: str = "",
    tools: list[str] | None = None,
) -> str:
    """Return *body* in canonical form, wrapping it in a template if needed.

    If the body is already canonical it is only stripped, its
    ``## Required Tools`` section is rewritten when *tools* is non-empty, and
    a single trailing newline is appended. Otherwise a generic canonical
    document is generated and the compacted original text is embedded as
    source guidance.

    NOTE(review): when *description* is empty the compacted source is used as
    the overview and is also passed as source guidance, so it appears twice
    in the generated document — confirm this is intended.
    """
    if not is_canonical_skill_body(body):
        guidance = _compact_source_guidance(body)
        return canonicalize_skill_body(
            title=title,
            overview=description or guidance or f"Use this skill for {title}.",
            tools=list(tools or []),
            workflow=[
                "Identify whether the user's request matches the skill's trigger conditions.",
                "Read the relevant source guidance below and apply only the steps that fit the current task.",
                "Use the required tools deliberately and keep tool output tied to the user's goal.",
            ],
            validation=[
                "Verify the requested outcome with the most direct available check.",
                "Report any skipped step, unavailable dependency, or remaining uncertainty explicitly.",
            ],
            boundaries=[
                "Do not broaden the task beyond the user's request.",
                "Do not use tools that are not listed or clearly available in the current runtime.",
            ],
            anti_patterns=[
                "Do not summarize the skill instead of applying it.",
                "Do not claim completion without validation evidence.",
            ],
            source_guidance=guidance,
        )

    trimmed = body.strip()
    if tools:
        trimmed = _replace_required_tools_section(trimmed, tools)
    return trimmed + "\n"
|
||||
|
||||
|
||||
def canonicalize_skill_body(
    *,
    title: str,
    overview: str,
    tools: list[str] | None = None,
    workflow: list[str] | None = None,
    validation: list[str] | None = None,
    boundaries: list[str] | None = None,
    anti_patterns: list[str] | None = None,
    when_to_use: list[str] | None = None,
    source_guidance: str = "",
) -> str:
    """Render a canonical skill body from its individual parts.

    The output is an H1 title followed by the seven canonical H2 sections,
    separated by blank lines and ending with one newline. A missing or empty
    list argument is replaced by a generic one-line default. Non-blank
    *source_guidance* is appended to the Workflow section under a
    ``### Source Guidance`` sub-heading.
    """
    heading_title = _title(title)

    guidance = source_guidance.strip()
    workflow_text = _bullet_lines(workflow or ["Follow the workflow described by the current task and evidence."])
    if guidance:
        workflow_text += f"\n\n### Source Guidance\n\n{guidance}"

    sections = [
        ("## Overview", overview.strip() or f"Use this skill for {heading_title}."),
        ("## When to Use", _bullet_lines(when_to_use or [f"Use when the task requires {heading_title} guidance."])),
        ("## Required Tools", _tool_lines(tools or [])),
        ("## Workflow", workflow_text),
        ("## Validation", _bullet_lines(validation or ["Validate the result before reporting completion."])),
        ("## Boundaries", _bullet_lines(boundaries or ["Stay within the current task and workspace boundaries."])),
        ("## Anti-Patterns", _bullet_lines(anti_patterns or ["Do not skip validation."])),
    ]
    chunks = [f"# {heading_title}"]
    chunks.extend(f"{heading}\n\n{content}" for heading, content in sections)
    return "\n\n".join(chunks) + "\n"
|
||||
|
||||
|
||||
def parse_skill_rewrite_json(content: str, *, skill_name: str) -> dict[str, Any] | None:
    """Parse a JSON skill rewrite and normalize it into canonical form.

    *content* must be a JSON object, optionally wrapped in a Markdown code
    fence, with a ``frontmatter`` object and a ``content`` string.

    Returns:
        ``None`` if the text is not valid JSON or lacks the expected shape.
        Otherwise a dict with the normalized ``frontmatter`` (whose ``tools``
        also include names found in the body's Required Tools section), the
        canonical ``content`` and a ``change_reason`` string.
    """
    text = content.strip()
    if text.startswith("```"):
        fence_lines = text.splitlines()
        is_fenced = (
            len(fence_lines) >= 3
            and fence_lines[0].startswith("```")
            and fence_lines[-1].startswith("```")
        )
        if is_fenced:
            # Drop the opening and closing fence lines, keep what is between.
            text = "\n".join(fence_lines[1:-1]).strip()

    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None

    raw_frontmatter = payload.get("frontmatter")
    raw_body = payload.get("content")
    if not (isinstance(raw_frontmatter, dict) and isinstance(raw_body, str)):
        return None

    meta = normalize_skill_frontmatter(raw_frontmatter, skill_name=skill_name)
    meta["tools"] = _merge_string_lists(meta.get("tools"), extract_required_tool_names(raw_body))
    canonical_body = ensure_canonical_skill_body(
        raw_body,
        title=meta["name"],
        description=meta["description"],
        tools=meta["tools"],
    )
    return {
        "frontmatter": meta,
        "content": canonical_body,
        "change_reason": str(payload.get("change_reason") or ""),
    }
|
||||
|
||||
|
||||
def _compact_source_guidance(body: str, *, max_chars: int = 20000) -> str:
|
||||
text = body.strip()
|
||||
if not text:
|
||||
return ""
|
||||
text = re.sub(r"^---\n.*?\n---\n?", "", text, flags=re.DOTALL).strip()
|
||||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||
text = re.sub(r"^(#{1,4})\s+", r"##\1 ", text, flags=re.MULTILINE)
|
||||
return text[:max_chars].rstrip()
|
||||
|
||||
|
||||
def _tool_lines(tools: list[str]) -> str:
|
||||
if not tools:
|
||||
return "- No dedicated tools are required."
|
||||
return "\n".join(f"- `{tool}`" for tool in tools)
|
||||
|
||||
|
||||
def _bullet_lines(items: list[str]) -> str:
|
||||
cleaned = [str(item).strip() for item in items if str(item).strip()]
|
||||
if not cleaned:
|
||||
return "- No additional guidance."
|
||||
return "\n".join(f"- {item}" for item in cleaned)
|
||||
|
||||
|
||||
def _coerce_string_list(value: Any) -> list[str]:
|
||||
if isinstance(value, list):
|
||||
raw_items = value
|
||||
elif isinstance(value, str):
|
||||
raw_items = value.split(",")
|
||||
else:
|
||||
raw_items = []
|
||||
result: list[str] = []
|
||||
for item in raw_items:
|
||||
cleaned = str(item).strip()
|
||||
if cleaned and cleaned not in result:
|
||||
result.append(cleaned)
|
||||
return result
|
||||
|
||||
|
||||
def _merge_string_lists(*values: Any) -> list[str]:
    """Merge several list-like values into one ordered, duplicate-free list.

    Every argument is first passed through ``_coerce_string_list``; items
    keep the position of their first appearance across all arguments.
    """
    merged: dict[str, None] = {}
    for value in values:
        # Updating with keys that already exist leaves their position alone.
        merged.update(dict.fromkeys(_coerce_string_list(value)))
    return list(merged)
|
||||
|
||||
|
||||
def _replace_required_tools_section(body: str, tools: list[str]) -> str:
    """Rewrite the first ``## Required Tools`` section of *body* to list *tools*.

    The section runs from its heading up to the next H2 heading or the end of
    the text. If no such section exists, the stripped body is returned
    unchanged.

    The replacement is supplied as a callable so the new text is inserted
    literally. As a template string, ``re.subn`` would interpret backslashes
    in a tool name (``\\1``, ``\\d``, ...) and either raise ``re.error`` or
    alter the name.
    """
    section = "## Required Tools\n\n" + _tool_lines(tools) + "\n\n"
    updated, count = re.subn(
        r"(?ms)^##\s+Required\s+Tools\s*\n.*?(?=^##\s+|\Z)",
        lambda _match: section,
        body.strip(),
        count=1,
    )
    return updated.strip() if count else body.strip()
|
||||
|
||||
|
||||
def _slug(value: str) -> str:
|
||||
text = value.strip().lower()
|
||||
text = re.sub(r"[^a-z0-9-]+", "-", text)
|
||||
text = re.sub(r"-{2,}", "-", text).strip("-")
|
||||
return text or "generated-skill"
|
||||
|
||||
|
||||
def _title(value: str) -> str:
|
||||
cleaned = str(value or "").strip().replace("-", " ")
|
||||
return cleaned.title() if cleaned else "Generated Skill"
|
||||
@ -28,12 +28,13 @@ Choose `new_task` when the user asks for anything that needs the main Task agent
|
||||
|
||||
The Intent Agent has no tools. If a request needs a tool, do not apologize and do not say you cannot access it. Route it to Task mode so the main agent can use tools.
|
||||
|
||||
When there is an active task, do not force every new user message into that task. Use the active task and recent conversation to decide:
|
||||
When there is an active task, do not force every new user message into that task. A Session is the durable conversation/device/group context; a Task is one unit of work inside that Session. Use the active task and recent conversation to decide:
|
||||
|
||||
- Choose `revise_task` when the user asks to change, correct, refine, expand, reformat, or redo the latest active task result.
|
||||
- Choose `continue_task` for neutral follow-up questions or additional next steps that still belong to the active task.
|
||||
- Choose `continue_task` for neutral follow-up questions or additional next steps that explicitly depend on or extend the active task's latest result.
|
||||
- Choose `simple_chat` for unrelated lightweight conversation. This starts a new topic and the previous task will be accepted automatically.
|
||||
- Choose `new_task` when the user asks for clearly unrelated work that needs Task capabilities. This starts a new topic and the previous task will be accepted automatically.
|
||||
- Choose `new_task` for a standalone tool-dependent request even when it resembles the active task. Repeating "珠海天气怎么样" later is a fresh task unless the user clearly says to continue or revise the old result.
|
||||
- Choose `close_task` when the user says the task is satisfactory or finished, such as "可以了", "就这样", or "that's good".
|
||||
- Choose `abandon_task` when the user says to stop, cancel, or no longer do the active task.
|
||||
|
||||
@ -46,6 +47,7 @@ Examples with an active weather task:
|
||||
- "再详细一点" -> `revise_task`
|
||||
- "加上明后天穿衣建议" -> `revise_task`
|
||||
- "顺便查一下深圳" -> `continue_task`
|
||||
- "珠海天气怎么样" -> `new_task` when asked as a standalone later request
|
||||
- "帮我写一个采购合同" -> `new_task`
|
||||
- "吃饭没" -> `simple_chat`
|
||||
- "我在冰岛" -> `simple_chat`
|
||||
|
||||
@ -27,6 +27,7 @@ from beaver.skills.specs.storage import SkillSpecStore
|
||||
from .utils import (
|
||||
check_requirements,
|
||||
escape_xml,
|
||||
extract_required_tool_names,
|
||||
get_missing_requirements,
|
||||
parse_frontmatter,
|
||||
parse_skill_metadata_blob,
|
||||
@ -111,13 +112,19 @@ class SkillsLoader:
|
||||
if not include_internal and _truthy(frontmatter.get("internal")):
|
||||
continue
|
||||
normalized_frontmatter = dict(frontmatter)
|
||||
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
|
||||
record = SkillRecord(
|
||||
name=name,
|
||||
path=skill_file,
|
||||
source=source,
|
||||
version="legacy",
|
||||
source_kind=source,
|
||||
tool_hints=self._coerce_tool_names(frontmatter.get("tools")),
|
||||
tool_hints=self._merge_tool_names(
|
||||
self._coerce_tool_names(frontmatter.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("required_tools")),
|
||||
extract_required_tool_names(body),
|
||||
),
|
||||
frontmatter=normalized_frontmatter,
|
||||
description=str(frontmatter.get("description") or summarize_body(body) or name),
|
||||
)
|
||||
@ -138,6 +145,7 @@ class SkillsLoader:
|
||||
path = self.workspace_skills / name / "SKILL.md"
|
||||
else:
|
||||
path = self.workspace_skills / name / "versions" / loaded.version.version / "SKILL.md"
|
||||
_frontmatter, body = parse_frontmatter(loaded.content)
|
||||
record = SkillRecord(
|
||||
name=name,
|
||||
path=path,
|
||||
@ -146,7 +154,10 @@ class SkillsLoader:
|
||||
content_hash=loaded.version.content_hash,
|
||||
source_kind=str(loaded.version.provenance.get("source_kind") or "workspace"),
|
||||
status=str(loaded.version.review_state or "published"),
|
||||
tool_hints=list(loaded.version.tool_hints),
|
||||
tool_hints=self._merge_tool_names(
|
||||
loaded.version.tool_hints,
|
||||
extract_required_tool_names(body),
|
||||
),
|
||||
frontmatter=dict(loaded.version.frontmatter),
|
||||
description=str(loaded.version.frontmatter.get("description") or loaded.version.summary or name),
|
||||
)
|
||||
@ -201,23 +212,32 @@ class SkillsLoader:
|
||||
- read_file
|
||||
- search_files
|
||||
- 兼容 metadata JSON blob 里的 `tools`
|
||||
- 兼容 canonical 正文 `## Required Tools` 段落
|
||||
"""
|
||||
|
||||
record = self._find_record(name)
|
||||
if record is not None and record.tool_hints:
|
||||
return list(record.tool_hints)
|
||||
|
||||
frontmatter = self.get_skill_metadata(name) or {}
|
||||
content = self.load_published_skill(name) or self.load_skill(name) or ""
|
||||
frontmatter, body = parse_frontmatter(content)
|
||||
frontmatter = frontmatter or self.get_skill_metadata(name) or {}
|
||||
meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", ""))
|
||||
names = [
|
||||
*self._coerce_tool_names(frontmatter.get("tools")),
|
||||
*self._coerce_tool_names(meta_blob.get("tools")),
|
||||
*self._coerce_tool_names(meta_blob.get("required_tools")),
|
||||
]
|
||||
names = self._merge_tool_names(
|
||||
self._coerce_tool_names(frontmatter.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("tools")),
|
||||
self._coerce_tool_names(meta_blob.get("required_tools")),
|
||||
extract_required_tool_names(body),
|
||||
)
|
||||
return names
|
||||
|
||||
@staticmethod
|
||||
def _merge_tool_names(*groups: Any) -> list[str]:
|
||||
result: list[str] = []
|
||||
for item in names:
|
||||
if item and item not in result:
|
||||
result.append(item)
|
||||
for group in groups:
|
||||
for item in SkillsLoader._coerce_tool_names(group):
|
||||
if item and item not in result:
|
||||
result.append(item)
|
||||
return result
|
||||
|
||||
def load_skills_for_context(self, skill_names: list[str]) -> str:
|
||||
|
||||
@ -84,6 +84,41 @@ def strip_frontmatter(content: str) -> str:
|
||||
return body
|
||||
|
||||
|
||||
def extract_required_tool_names(body: str) -> list[str]:
    """Extract tool names from the ``## Required Tools`` section of a skill body.

    This is a tolerant supplement to the frontmatter ``tools`` field. It does
    not guess tools from arbitrary text: only bullet lines (``-`` or ``*``)
    inside the explicitly named Required Tools section are read.

    For each bullet, names in backticks are preferred. Without backticks the
    bullet is split on ASCII or full-width commas and the first word of each
    part is taken. A name is kept only if it consists of letters, digits and
    ``_ . : -``.

    Prose bullets that start with a placeholder word ("No ...", "None",
    "N/A") are skipped. Otherwise the standard empty-section bullet
    ``- No dedicated tools are required.`` would be reported as a tool
    called ``No``.

    Returns:
        Unique tool names in order of first appearance; an empty list when
        the body is empty or has no Required Tools section.
    """

    if not body:
        return []

    match = re.search(
        r"(?ims)^##\s+Required\s+Tools\s*$\n(?P<section>.*?)(?=^##\s+|\Z)",
        body,
    )
    if match is None:
        return []

    names: list[str] = []
    for line in match.group("section").splitlines():
        stripped = line.strip()
        if not stripped or not stripped.startswith(("-", "*")):
            continue
        candidate = stripped[1:].strip()
        code_matches = re.findall(r"`([^`]+)`", candidate)
        if not code_matches and re.match(r"(?i)(?:none|no|n/a)(?=\s|[.。]?$)", candidate):
            # Placeholder sentence, not a tool name. Names such as "no_op"
            # or "notion_search" do not match because a word character follows.
            continue
        raw_items = code_matches or re.split(r"[,,]", candidate)
        for raw_item in raw_items:
            name = raw_item.strip().strip("`\"' ")
            if not name:
                continue
            # Keep the first word and drop a trailing separator like "name:".
            token = name.split()[0].strip("`\"' ::-")
            if re.fullmatch(r"[A-Za-z0-9_.:-]+", token) and token not in names:
                names.append(token)
    return names
|
||||
|
||||
|
||||
def parse_skill_metadata_blob(raw: str) -> dict[str, Any]:
|
||||
"""解析 metadata 字段里的 JSON 扩展配置。
|
||||
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from beaver.engine.context import SkillContext
|
||||
@ -39,7 +41,16 @@ class SkillDraftEvaluator:
|
||||
return self._skipped(candidate, draft)
|
||||
|
||||
runs = self.run_store.list_runs()
|
||||
replay_cases = select_replay_cases(candidate, runs)
|
||||
if replay_runner is not None:
|
||||
replay_cases, case_selection_meta = await _prepare_eval_cases(
|
||||
candidate=candidate,
|
||||
draft=draft,
|
||||
historical_cases=select_replay_cases(candidate, runs),
|
||||
provider_bundle=provider_bundle,
|
||||
)
|
||||
else:
|
||||
replay_cases = []
|
||||
case_selection_meta = {}
|
||||
if replay_runner is not None and replay_cases:
|
||||
return await self._evaluate_replay(
|
||||
candidate=candidate,
|
||||
@ -47,6 +58,7 @@ class SkillDraftEvaluator:
|
||||
replay_cases=replay_cases,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=replay_runner,
|
||||
case_selection_meta=case_selection_meta,
|
||||
)
|
||||
return self._evaluate_heuristic(candidate, draft, runs)
|
||||
|
||||
@ -58,7 +70,7 @@ class SkillDraftEvaluator:
|
||||
) -> SkillDraftEvalReport:
|
||||
runs_by_id = {record.run_id: record for record in runs}
|
||||
cases: list[dict] = []
|
||||
for run_id in candidate.source_run_ids[:8]:
|
||||
for run_id in candidate.source_run_ids[:10]:
|
||||
record = runs_by_id.get(run_id)
|
||||
if record is None:
|
||||
continue
|
||||
@ -116,6 +128,7 @@ class SkillDraftEvaluator:
|
||||
replay_cases: list[dict],
|
||||
provider_bundle: ProviderBundle,
|
||||
replay_runner: ReplayRunner,
|
||||
case_selection_meta: dict[str, Any] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
case_reports: list[dict] = []
|
||||
legacy_cases: list[dict] = []
|
||||
@ -147,17 +160,43 @@ class SkillDraftEvaluator:
|
||||
baseline=baseline,
|
||||
candidate=candidate_arm,
|
||||
)
|
||||
baseline_score = surrogate["baseline_score"]
|
||||
candidate_score = surrogate["candidate_score"]
|
||||
baseline_ability = _ability_score(
|
||||
case=case,
|
||||
arm=baseline,
|
||||
arm_name="baseline",
|
||||
)
|
||||
candidate_ability = _ability_score(
|
||||
case=case,
|
||||
arm=candidate_arm,
|
||||
arm_name="candidate",
|
||||
)
|
||||
baseline_score = baseline_ability["final_score"]
|
||||
candidate_score = candidate_ability["final_score"]
|
||||
tool_execution_score = {
|
||||
"baseline_score": surrogate["baseline_score"],
|
||||
"candidate_score": surrogate["candidate_score"],
|
||||
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
|
||||
"score_role": "diagnostic_only",
|
||||
}
|
||||
case_report = {
|
||||
"run_id": case["run_id"],
|
||||
"task_id": case.get("task_id"),
|
||||
"session_id": case.get("session_id"),
|
||||
"task_text": case.get("task_text"),
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
|
||||
"validator": case.get("validator"),
|
||||
"baseline": baseline,
|
||||
"candidate": candidate_arm,
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
"ability_score": {
|
||||
"baseline": baseline_ability,
|
||||
"candidate": candidate_ability,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
},
|
||||
"tool_execution_score": tool_execution_score,
|
||||
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
|
||||
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
|
||||
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
|
||||
@ -172,13 +211,23 @@ class SkillDraftEvaluator:
|
||||
{
|
||||
"run_id": case["run_id"],
|
||||
"session_id": case.get("session_id") or "",
|
||||
"task_text": case.get("task_text") or "",
|
||||
"synthetic": bool(case.get("synthetic")),
|
||||
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
}
|
||||
)
|
||||
preservation_report = _preservation_report(candidate, draft)
|
||||
return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report)
|
||||
return _report_from_case_reports(
|
||||
candidate,
|
||||
draft,
|
||||
case_reports,
|
||||
legacy_cases,
|
||||
preservation_report,
|
||||
case_selection_meta or {},
|
||||
)
|
||||
|
||||
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
|
||||
return SkillDraftEvalReport(
|
||||
@ -238,22 +287,400 @@ def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -
|
||||
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
|
||||
|
||||
|
||||
async def _prepare_eval_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
    target_count: int = 10,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Assemble the evaluation cases for a draft plus selection metadata.

    Explicit cases from the candidate's evidence come first, then the
    historical cases. The merged list is de-duplicated, and synthetic cases
    without a validator are dropped. If fewer than *target_count* cases
    remain, the gap is filled with generated synthetic cases and, should
    those still fall short, with deterministic fallback cases.

    Args:
        candidate: Learning candidate the draft belongs to.
        draft: Draft under evaluation.
        historical_cases: Replay cases selected from earlier runs.
        provider_bundle: Providers used to generate synthetic cases.
        target_count: Number of cases to aim for. The default of 10 matches
            the value that used to be hard-coded here.

    Returns:
        ``(cases, meta)``: at most *target_count* cases, and a dict with the
        requested, historical and explicit case counts, the number of
        synthetic cases, and the number of excluded synthetic cases.
    """
    wanted = max(0, target_count)
    explicit_cases = _explicit_eval_cases(candidate)
    merged = _dedupe_cases([*explicit_cases, *historical_cases])
    usable, excluded = _filter_unscorable_cases(merged)
    missing = max(0, wanted - len(usable))
    generated: list[dict[str, Any]] = []
    if missing:
        generated = await _generate_synthetic_cases(
            candidate=candidate,
            draft=draft,
            historical_cases=usable,
            provider_bundle=provider_bundle,
            count=missing,
        )
        generated, generated_excluded = _filter_unscorable_cases(generated)
        excluded["synthetic_without_validator"] += generated_excluded["synthetic_without_validator"]
        if len(generated) < missing:
            # Generation returned too few usable cases: top up with templates.
            generated.extend(
                _fallback_synthetic_cases(
                    candidate=candidate,
                    historical_cases=usable,
                    start_index=len(generated) + 1,
                    count=missing - len(generated),
                )
            )
    prepared = [*usable, *generated]
    return prepared[:wanted], {
        "requested_case_count": wanted,
        "historical_case_count": len(historical_cases),
        "explicit_case_count": len(explicit_cases),
        "generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
        "excluded_synthetic_without_validator": excluded["synthetic_without_validator"],
    }
|
||||
|
||||
|
||||
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
    """Build replay cases from the ``eval_cases`` list in the candidate's evidence.

    Entries that are not dicts or have no non-blank ``task_text`` are
    skipped. Missing identifiers are filled with ``explicit``-prefixed
    defaults numbered from 1. A ``validator`` is copied only when it is a
    dict.
    """
    evidence = candidate.evidence
    declared = evidence.get("eval_cases") if isinstance(evidence, dict) else None
    if not isinstance(declared, list):
        return []

    cases: list[dict[str, Any]] = []
    for position, entry in enumerate(declared, start=1):
        if not isinstance(entry, dict):
            continue
        text = str(entry.get("task_text") or "").strip()
        if not text:
            continue
        is_synthetic = bool(entry.get("synthetic"))
        built: dict[str, Any] = {
            "run_id": str(entry.get("run_id") or f"explicit:{candidate.candidate_id}:{position:02d}"),
            "task_id": entry.get("task_id") or f"explicit-{position:02d}",
            "session_id": entry.get("session_id") or "explicit-eval",
            "task_text": text,
            "baseline_skill_names": list(entry.get("baseline_skill_names") or _baseline_skill_names(candidate)),
            "candidate_skill_name": entry.get("candidate_skill_name") or candidate.draft_skill_name,
            "accepted_score": _bounded_score(entry.get("accepted_score"), default=0.75),
            "synthetic": is_synthetic,
            "tier": entry.get("tier") or ("bronze" if is_synthetic else "gold"),
        }
        validator = entry.get("validator")
        if isinstance(validator, dict):
            built["validator"] = dict(validator)
        cases.append(built)
    return cases
|
||||
|
||||
|
||||
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
result: list[dict[str, Any]] = []
|
||||
seen: set[str] = set()
|
||||
for case in cases:
|
||||
run_id = str(case.get("run_id") or "")
|
||||
task_text = str(case.get("task_text") or "")
|
||||
key = run_id or task_text
|
||||
if not key or key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
result.append(case)
|
||||
return result
|
||||
|
||||
|
||||
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
|
||||
result: list[dict[str, Any]] = []
|
||||
excluded = {"synthetic_without_validator": 0}
|
||||
for case in cases:
|
||||
if case.get("synthetic") and not isinstance(case.get("validator"), dict):
|
||||
excluded["synthetic_without_validator"] += 1
|
||||
continue
|
||||
result.append(case)
|
||||
return result, excluded
|
||||
|
||||
|
||||
async def _generate_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
    count: int,
) -> list[dict[str, Any]]:
    """Ask the language model for up to *count* synthetic evaluation cases.

    The auxiliary provider and runtime are used when present, otherwise the
    main ones. Generation is best-effort: a provider error or a reply
    without a ``cases`` list yields an empty list. The provider error is
    logged with its traceback instead of being discarded silently.

    Returns:
        Normalized synthetic case payloads, numbered from 1.
    """
    import logging

    provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
    runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
    model = getattr(runtime, "model", None)
    try:
        response = await provider.chat(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You generate validator-first Beaver skill evaluation cases. "
                        "Return only JSON with key cases. Each case must include task_text and validator. "
                        "Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
                    ),
                },
                {
                    "role": "user",
                    "content": _synthetic_case_prompt(
                        candidate=candidate,
                        draft=draft,
                        historical_cases=historical_cases,
                        count=count,
                    ),
                },
            ],
            model=model,
            max_tokens=2200,
            temperature=0.4,
        )
    except Exception:
        # Deliberately broad: generation is optional, so no provider failure
        # may abort the evaluation. It is still recorded for diagnosis.
        logging.getLogger(__name__).warning("Synthetic eval case generation failed.", exc_info=True)
        return []
    payload = _parse_json_payload(response.content or "")
    raw_cases = payload.get("cases") if isinstance(payload, dict) else None
    if not isinstance(raw_cases, list):
        return []
    return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count)
|
||||
|
||||
|
||||
def _synthetic_case_prompt(
|
||||
*,
|
||||
candidate: SkillLearningCandidate,
|
||||
draft: SkillDraft,
|
||||
historical_cases: list[dict[str, Any]],
|
||||
count: int,
|
||||
) -> str:
|
||||
historical = [
|
||||
{
|
||||
"run_id": item.get("run_id"),
|
||||
"task_text": item.get("task_text"),
|
||||
"validator": item.get("validator"),
|
||||
}
|
||||
for item in historical_cases
|
||||
]
|
||||
return (
|
||||
f"Generate {count} synthetic evaluation cases for this skill draft.\n\n"
|
||||
f"Candidate kind: {candidate.kind}\n"
|
||||
f"Candidate reason: {candidate.reason}\n"
|
||||
f"Draft skill name: {draft.skill_name}\n"
|
||||
f"Related skills: {candidate.related_skill_names}\n"
|
||||
f"Historical cases:\n{json.dumps(historical, ensure_ascii=False)}\n\n"
|
||||
"Every synthetic case must be validator-first. Return exactly:\n"
|
||||
'{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",'
|
||||
'"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}'
|
||||
)
|
||||
|
||||
|
||||
def _parse_json_payload(content: str) -> dict[str, Any]:
|
||||
cleaned = content.strip()
|
||||
if cleaned.startswith("```"):
|
||||
cleaned = cleaned.strip("`")
|
||||
if cleaned.startswith("json"):
|
||||
cleaned = cleaned[4:]
|
||||
try:
|
||||
payload = json.loads(cleaned)
|
||||
except json.JSONDecodeError:
|
||||
start = cleaned.find("{")
|
||||
end = cleaned.rfind("}")
|
||||
if start < 0 or end <= start:
|
||||
return {}
|
||||
try:
|
||||
payload = json.loads(cleaned[start : end + 1])
|
||||
except json.JSONDecodeError:
|
||||
return {}
|
||||
return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def _synthetic_case_payloads(
    candidate: SkillLearningCandidate,
    raw_cases: list[Any],
    *,
    start_index: int,
    limit: int,
) -> list[dict[str, Any]]:
    """Convert raw generated cases into normalized synthetic case payloads.

    Entries that are not dicts, have no non-blank ``task_text`` or no dict
    ``validator`` are skipped. Accepted cases are numbered consecutively from
    *start_index*. Processing stops once *limit* cases have been collected.
    """
    built: list[dict[str, Any]] = []
    for entry in raw_cases:
        if not isinstance(entry, dict):
            continue
        text = str(entry.get("task_text") or "").strip()
        validator = entry.get("validator")
        if not (text and isinstance(validator, dict)):
            continue
        payload = _synthetic_case_payload(
            candidate,
            text,
            start_index + len(built),
            validator=dict(validator),
            tier=str(entry.get("tier") or "bronze"),
        )
        built.append(payload)
        # The limit is checked after appending, as in the original loop.
        if len(built) >= limit:
            break
    return built
|
||||
|
||||
|
||||
def _fallback_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    historical_cases: list[dict[str, Any]],
    start_index: int,
    count: int,
) -> list[dict[str, Any]]:
    """Create *count* template-based synthetic cases without calling a model.

    Each case is seeded from the task text of a historical case. The seed is
    chosen per case index, so the historical cases are cycled through. The
    previous code computed the modulo index once from *start_index*, which
    gave every fallback case the same seed and the same validator. When no
    historical task text is available, the candidate's reason or draft skill
    name is used instead.

    The validator of each case requires the first two terms of its own seed
    text (or ``"done"`` if there are none).

    Returns:
        Bronze-tier synthetic case payloads numbered from *start_index*.
    """
    default_seed = candidate.reason or candidate.draft_skill_name or "the candidate skill"
    cases: list[dict[str, Any]] = []
    for index in range(start_index, start_index + count):
        seed_text = ""
        if historical_cases:
            seed_text = str(historical_cases[(index - 1) % len(historical_cases)].get("task_text") or "")
        if not seed_text:
            seed_text = default_seed
        required_terms = _terms(seed_text)[:2] or ["done"]
        cases.append(
            _synthetic_case_payload(
                candidate,
                f"Complete a realistic task related to {seed_text}. Scenario {index}.",
                index,
                validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []},
                tier="bronze",
            )
        )
    return cases
|
||||
|
||||
|
||||
def _synthetic_case_payload(
    candidate: SkillLearningCandidate,
    task_text: str,
    index: int,
    *,
    validator: dict[str, Any],
    tier: str,
) -> dict[str, Any]:
    """Build one synthetic replay case dict for *candidate*.

    The run and task ids embed the two-digit *index*. The case is marked
    synthetic, uses a fixed ``accepted_score`` of 0.75 and carries the given
    *validator* and *tier*.
    """
    suffix = f"{index:02d}"
    payload: dict[str, Any] = {}
    payload["run_id"] = f"synthetic:{candidate.candidate_id}:{suffix}"
    payload["task_id"] = f"synthetic-{suffix}"
    payload["session_id"] = "synthetic-eval"
    payload["task_text"] = task_text
    payload["baseline_skill_names"] = _baseline_skill_names(candidate)
    payload["candidate_skill_name"] = candidate.draft_skill_name
    payload["accepted_score"] = 0.75
    payload["synthetic"] = True
    payload["tier"] = tier
    payload["validator"] = validator
    return payload
|
||||
|
||||
|
||||
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
    """Return the skill names the baseline arm should use for ``candidate``.

    ``revise_skill`` yields at most the first related skill, ``merge_skills``
    yields all related skills, and every other kind yields an empty list.
    """
    kind = candidate.kind
    if kind not in ("revise_skill", "merge_skills"):
        return []
    related = candidate.related_skill_names
    return list(related[:1]) if kind == "revise_skill" else list(related)
|
||||
|
||||
|
||||
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
    """Score one arm of one case and return its ability breakdown.

    A dict-valued ``validator`` on the case always wins. Without one, a
    synthetic case is reported as unscored (0.0); a non-synthetic case uses
    the case's ``accepted_score`` for the ``"baseline"`` arm and the
    output-based score for any other arm.
    """
    validator = case.get("validator")
    if isinstance(validator, dict):
        return _ability_from_validator(validator, arm)

    if case.get("synthetic"):
        return _ability_breakdown(
            score=0.0,
            source="unscored",
            notes=["Synthetic cases require a validator."],
        )

    if arm_name == "baseline":
        accepted = _bounded_score(case.get("accepted_score"), default=0.75)
        return _ability_breakdown(score=accepted, source="user_feedback")

    judged = _ability_from_output(arm)["final_score"]
    return _ability_breakdown(score=judged, source="llm_judge")
|
||||
|
||||
|
||||
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
    """Score an arm against an automatic validator.

    Only the ``final_answer_contains`` validator type is handled here; any
    other type is delegated to ``_ability_from_output`` with a note naming
    the unsupported type.

    For ``final_answer_contains`` the lowercased final answer is searched for
    each required term (outcome = matched fraction) and each forbidden term
    (any hit drops safety to 0.0). The final score is a weighted sum:
    0.40 outcome + 0.25 artifact (same value as outcome) + 0.15 process
    validity + 0.10 safety + 0.10 path efficiency.

    Returns:
        The ability breakdown dict with every component set to its own
        computed value, plus ``validator_type``.
    """
    validator_type = str(validator.get("type") or "")
    if validator_type != "final_answer_contains":
        return _ability_from_output(
            arm,
            source="llm_judge",
            notes=[f"Unsupported validator type: {validator_type}"],
        )

    answer = str(arm.get("final_answer") or "").lower()
    required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
    forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]

    matched = sum(1 for term in required_terms if term in answer)
    # max(1, ...) avoids dividing by zero; with no required terms outcome is 0.
    outcome = matched / max(1, len(required_terms))
    unsafe = any(term in answer for term in forbidden_terms)
    safety = 0.0 if unsafe else 1.0
    process = _process_validity(arm)
    efficiency = _path_efficiency(arm, outcome)

    final_score = (
        0.40 * outcome
        + 0.25 * outcome
        + 0.15 * process
        + 0.10 * safety
        + 0.10 * efficiency
    )
    return {
        **_ability_breakdown(score=final_score, source="auto_validator"),
        "outcome_correctness": round(outcome, 4),
        "artifact_correctness": round(outcome, 4),
        # Report the actual components instead of leaving the breakdown's
        # placeholder (the final score) in these two slots.
        "process_validity": round(process, 4),
        "safety_no_regression": round(safety, 4),
        "path_efficiency": round(efficiency, 4),
        "validator_type": validator_type,
    }
|
||||
|
||||
|
||||
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
    """Score an arm from its raw output alone.

    The score is 0.7 when the arm has a non-blank ``final_answer`` and its
    ``finish_reason`` is not ``"error"``; otherwise it is 0.3. ``source`` and
    ``notes`` are passed through to the breakdown.
    """
    has_answer = bool(str(arm.get("final_answer") or "").strip())
    usable = has_answer and arm.get("finish_reason") != "error"
    return _ability_breakdown(score=0.7 if usable else 0.3, source=source, notes=notes)
|
||||
|
||||
|
||||
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    """Expand a single score into the full ability-breakdown dict.

    The score is clamped via ``_bounded_score`` (default 0.0) and copied into
    all five component slots; ``final_score`` is the same value rounded to
    four decimals. ``notes`` is always returned as a new list.
    """
    clamped = _bounded_score(score, default=0.0)
    component_keys = (
        "outcome_correctness",
        "artifact_correctness",
        "process_validity",
        "safety_no_regression",
        "path_efficiency",
    )
    breakdown: dict[str, Any] = dict.fromkeys(component_keys, clamped)
    breakdown["final_score"] = round(clamped, 4)
    breakdown["source"] = source
    breakdown["notes"] = list(notes or [])
    return breakdown
|
||||
|
||||
|
||||
def _process_validity(arm: dict[str, Any]) -> float:
    """Rate how valid the arm's process was.

    Returns 0.2 when ``finish_reason`` is ``"error"``, 0.8 when the arm has a
    truthy ``tool_calls`` value, and 0.6 otherwise.
    """
    errored = arm.get("finish_reason") == "error"
    if errored:
        return 0.2
    if arm.get("tool_calls"):
        return 0.8
    return 0.6
|
||||
|
||||
|
||||
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
    """Rate how economical the arm's tool usage was.

    An outcome below 0.5 always rates 0.3. Otherwise the rating depends on
    how many dict entries ``tool_calls`` holds: up to 3 rates 1.0, up to 6
    rates 0.7, and anything more rates 0.4.
    """
    if outcome < 0.5:
        return 0.3

    # Only dict-shaped entries count as tool calls.
    call_total = sum(1 for entry in (arm.get("tool_calls") or []) if isinstance(entry, dict))
    for ceiling, rating in ((3, 1.0), (6, 0.7)):
        if call_total <= ceiling:
            return rating
    return 0.4
|
||||
|
||||
|
||||
def _bounded_score(value: Any, *, default: float) -> float:
    """Coerce ``value`` to a float clamped to the range [0.0, 1.0].

    Args:
        value: Anything ``float()`` accepts (number or numeric string).
        default: Returned when ``value`` cannot be used as a score.

    Returns:
        The clamped score, or ``default`` when ``value`` is not convertible,
        overflows a float, or is NaN.
    """
    import math

    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large to represent as a float.
        return default
    # NaN compares false against everything, so max/min would silently turn
    # it into 1.0; treat it as "no usable score" instead.
    if math.isnan(number):
        return default
    return max(0.0, min(1.0, number))
|
||||
|
||||
|
||||
def _terms(text: str) -> list[str]:
    """Split ``text`` on whitespace into lowercase terms.

    Surrounding punctuation is trimmed from each token, and only tokens
    longer than three characters after trimming are kept. Order and
    duplicates are preserved.
    """
    punctuation = ".,:;!?()[]{}"
    trimmed = (token.strip(punctuation) for token in text.split())
    return [token.lower() for token in trimmed if len(token) > 3]
|
||||
|
||||
|
||||
def _report_from_case_reports(
|
||||
candidate: SkillLearningCandidate,
|
||||
draft: SkillDraft,
|
||||
case_reports: list[dict],
|
||||
legacy_cases: list[dict],
|
||||
preservation_report: dict | None,
|
||||
case_selection_meta: dict[str, Any] | None = None,
|
||||
) -> SkillDraftEvalReport:
|
||||
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
|
||||
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
|
||||
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
|
||||
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
|
||||
unchanged = len(legacy_cases) - len(regressions) - len(improved)
|
||||
real_cases = [item for item in legacy_cases if not item.get("synthetic")]
|
||||
synthetic_cases = [item for item in legacy_cases if item.get("synthetic")]
|
||||
execution, surrogate, blocked = _coverage(case_reports)
|
||||
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
|
||||
score_delta = candidate_avg - baseline_avg
|
||||
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
|
||||
selection_meta = dict(case_selection_meta or {})
|
||||
real_score_avg = _avg([item["candidate_score"] for item in real_cases])
|
||||
synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
|
||||
overall_score_avg = round(candidate_avg, 4)
|
||||
ability_summary = {
|
||||
"score_role": "primary",
|
||||
"real_case_count": len(real_cases),
|
||||
"synthetic_case_count": len(synthetic_cases),
|
||||
"real_score_avg": real_score_avg,
|
||||
"synthetic_score_avg": synthetic_score_avg,
|
||||
"overall_score_avg": overall_score_avg,
|
||||
}
|
||||
tool_execution_summary = {
|
||||
"score_role": "diagnostic_only",
|
||||
"executed": execution,
|
||||
"surrogate": surrogate,
|
||||
"blocked": blocked,
|
||||
}
|
||||
return SkillDraftEvalReport(
|
||||
report_id=uuid4().hex,
|
||||
skill_name=draft.skill_name,
|
||||
@ -276,11 +703,34 @@ def _report_from_case_reports(
|
||||
blocked_coverage=blocked,
|
||||
confidence=confidence,
|
||||
case_reports=case_reports,
|
||||
tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
|
||||
tool_mode_summary={
|
||||
"executed": execution,
|
||||
"surrogate": surrogate,
|
||||
"blocked": blocked,
|
||||
"score_role": "diagnostic_only",
|
||||
"real_case_count": len(real_cases),
|
||||
"synthetic_case_count": len(synthetic_cases),
|
||||
"real_score_avg": real_score_avg,
|
||||
"synthetic_score_avg": synthetic_score_avg,
|
||||
"overall_score_avg": overall_score_avg,
|
||||
**selection_meta,
|
||||
},
|
||||
ability_score_summary=ability_summary,
|
||||
tool_execution_summary=tool_execution_summary,
|
||||
case_selection_summary=selection_meta,
|
||||
real_score_avg=real_score_avg,
|
||||
synthetic_score_avg=synthetic_score_avg,
|
||||
overall_score_avg=overall_score_avg,
|
||||
preservation_report=preservation_report,
|
||||
)
|
||||
|
||||
|
||||
def _avg(values: list[float]) -> float | None:
    """Return the mean of ``values`` rounded to four decimals, or ``None`` when empty."""
    return round(sum(values) / len(values), 4) if values else None
|
||||
|
||||
|
||||
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
|
||||
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
|
||||
for report in case_reports:
|
||||
|
||||
@ -323,8 +323,8 @@ class SkillLearningPipelineService:
|
||||
|
||||
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
|
||||
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
|
||||
if not any(review.status == SkillReviewState.APPROVED.value for review in reviews):
|
||||
raise ValueError("Draft must have an approved review before publish")
|
||||
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
|
||||
raise ValueError("Draft must be submitted for review before publish")
|
||||
safety = self.get_safety_report(draft.skill_name, draft.draft_id)
|
||||
if safety is None:
|
||||
raise ValueError("Draft requires a passing safety report before publish")
|
||||
|
||||
@ -162,18 +162,23 @@ class ReplayRunner:
|
||||
registry=loaded.tool_registry,
|
||||
policy=self.policy,
|
||||
)
|
||||
result = await self.agent_loop.process_direct(
|
||||
request.task_text,
|
||||
provider_bundle=request.provider_bundle,
|
||||
include_skill_assembly=False,
|
||||
include_tools=True,
|
||||
pinned_skill_names=request.pinned_skill_names,
|
||||
pinned_skill_contexts=request.pinned_skill_contexts,
|
||||
max_tool_iterations=int(request.model_settings.get("max_tool_iterations") or 4),
|
||||
temperature=float(request.model_settings.get("temperature") or 0.0),
|
||||
source="skill_replay_eval",
|
||||
tool_executor_override=replay_executor,
|
||||
)
|
||||
direct_kwargs = {
|
||||
"provider_bundle": request.provider_bundle,
|
||||
"include_skill_assembly": False,
|
||||
"include_tools": True,
|
||||
"pinned_skill_names": request.pinned_skill_names,
|
||||
"pinned_skill_contexts": request.pinned_skill_contexts,
|
||||
"max_tool_iterations": int(request.model_settings.get("max_tool_iterations") or 4),
|
||||
"temperature": float(request.model_settings.get("temperature") or 0.0),
|
||||
"source": "skill_replay_eval",
|
||||
"tool_executor_override": replay_executor,
|
||||
}
|
||||
try:
|
||||
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
|
||||
except RuntimeError as exc:
|
||||
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
|
||||
raise
|
||||
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
|
||||
return {
|
||||
"case_id": request.case_id,
|
||||
"arm": request.arm,
|
||||
@ -188,6 +193,14 @@ class ReplayRunner:
|
||||
}
|
||||
|
||||
|
||||
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
    """Tell whether ``exc`` is the "process_direct() disabled while run() is active" error.

    The check is textual: both marker phrases must appear in ``str(exc)``.
    """
    text = str(exc)
    markers = (
        "AgentLoop.process_direct() is disabled while run() is active",
        "submit tasks via submit_direct() instead",
    )
    return all(marker in text for marker in markers)
|
||||
|
||||
|
||||
def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
effects: list[dict[str, Any]] = []
|
||||
for trace in traces:
|
||||
|
||||
@ -99,6 +99,7 @@ class SkillLearningService:
|
||||
]
|
||||
source_run_ids = [record.run_id for record in source_runs]
|
||||
source_session_ids = list(dict.fromkeys(record.session_id for record in source_runs))
|
||||
representative_task_text = self._representative_task_text(source_runs, fallback=final_run.task_text)
|
||||
|
||||
if not published_receipts:
|
||||
candidates.append(
|
||||
@ -113,7 +114,8 @@ class SkillLearningService:
|
||||
"task_id": task_id,
|
||||
"final_accepted_run_id": final_accepted_run_id,
|
||||
"source_run_ids": source_run_ids,
|
||||
"theme": self._task_theme(final_run.task_text),
|
||||
"task_text": representative_task_text,
|
||||
"theme": self._task_theme(representative_task_text),
|
||||
},
|
||||
status="open",
|
||||
priority=1,
|
||||
@ -329,8 +331,14 @@ class SkillLearningService:
|
||||
|
||||
def _build_new_skill_candidates(self) -> list[SkillLearningCandidate]:
|
||||
groups: dict[str, list[RunRecord]] = {}
|
||||
for record in self.run_store.list_runs():
|
||||
key = self._task_theme(record.task_text)
|
||||
all_runs = self.run_store.list_runs()
|
||||
runs_by_task: dict[str, list[RunRecord]] = {}
|
||||
for record in all_runs:
|
||||
if record.task_id:
|
||||
runs_by_task.setdefault(record.task_id, []).append(record)
|
||||
for record in all_runs:
|
||||
task_runs = runs_by_task.get(record.task_id, [record])
|
||||
key = self._task_theme(self._representative_task_text(task_runs, fallback=record.task_text))
|
||||
if not key:
|
||||
continue
|
||||
groups.setdefault(key, []).append(record)
|
||||
@ -443,12 +451,24 @@ class SkillLearningService:
|
||||
|
||||
@staticmethod
|
||||
def _task_theme(task_text: str) -> str:
|
||||
cleaned = re.sub(r"\s+", " ", task_text.strip().lower())
|
||||
cleaned = re.sub(r"\s+", " ", task_text.strip())
|
||||
if not cleaned:
|
||||
return ""
|
||||
words = cleaned.split(" ")
|
||||
first_sentence = re.split(r"[。!?.!?]", cleaned, maxsplit=1)[0].strip()
|
||||
if not first_sentence:
|
||||
first_sentence = cleaned
|
||||
words = first_sentence.split(" ")
|
||||
return " ".join(words[:8]).strip()
|
||||
|
||||
@staticmethod
|
||||
def _representative_task_text(runs: list[RunRecord], *, fallback: str = "") -> str:
|
||||
ordered = sorted(runs, key=lambda item: (item.attempt_index, item.started_at, item.run_id))
|
||||
for record in ordered:
|
||||
text = record.task_text.strip()
|
||||
if text:
|
||||
return text
|
||||
return fallback.strip()
|
||||
|
||||
@staticmethod
|
||||
def _suggest_skill_name(
|
||||
candidate: SkillLearningCandidate,
|
||||
|
||||
@ -15,12 +15,15 @@ class SurrogateToolEvaluator:
|
||||
return {
|
||||
"baseline_score": baseline_score,
|
||||
"candidate_score": candidate_score,
|
||||
"baseline_tool_execution_score": baseline_score,
|
||||
"candidate_tool_execution_score": candidate_score,
|
||||
"delta": round(candidate_score - baseline_score, 4),
|
||||
"surrogate_tool_count": surrogate_count,
|
||||
"blocked_tool_count": blocked_count,
|
||||
"score_role": "diagnostic_only",
|
||||
"confidence": confidence,
|
||||
"notes": [
|
||||
"Surrogate score is based on intended tool calls, schemas, arguments, and task relevance.",
|
||||
"Tool execution score is diagnostic only and is not the main task ability score.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ import json
|
||||
from typing import Any
|
||||
|
||||
from beaver.engine.providers.base import LLMProvider
|
||||
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
|
||||
from beaver.skills.learning.evidence import EvidencePacket
|
||||
from beaver.memory.skills.models import SkillLearningCandidate
|
||||
|
||||
@ -58,7 +59,8 @@ class SkillDraftSynthesizer:
|
||||
"content": (
|
||||
"You synthesize Beaver skill drafts from execution evidence. "
|
||||
"Return only JSON with keys: frontmatter, content, change_reason, "
|
||||
"preserved_sections, changed_sections, dropped_sections."
|
||||
"preserved_sections, changed_sections, dropped_sections. "
|
||||
"The content must follow the Canonical Beaver SKILL.md format."
|
||||
),
|
||||
},
|
||||
{"role": "user", "content": prompt},
|
||||
@ -113,6 +115,7 @@ class SkillDraftSynthesizer:
|
||||
+ "\n- tools: an explicit JSON array of exact tool names this skill needs. "
|
||||
+ "Prefer called tool names when the workflow depends on them; use run-selected tool names only when clearly required. "
|
||||
+ "Use [] only when no tool is required."
|
||||
+ "\n\n" + canonical_skill_format_instructions()
|
||||
+ "\nThe JSON may include preserved_sections, changed_sections, and dropped_sections arrays."
|
||||
)
|
||||
|
||||
@ -144,14 +147,23 @@ class SkillDraftSynthesizer:
|
||||
|
||||
@staticmethod
|
||||
def _normalize_payload(payload: dict[str, Any], evidence_packet: EvidencePacket) -> dict[str, Any]:
|
||||
frontmatter = dict(payload.get("frontmatter") or {})
|
||||
frontmatter = normalize_skill_frontmatter(
|
||||
dict(payload.get("frontmatter") or {}),
|
||||
skill_name=str((payload.get("frontmatter") or {}).get("name") or "generated-skill"),
|
||||
)
|
||||
tool_hints = _coerce_string_list(frontmatter.get("tools"))
|
||||
if not tool_hints:
|
||||
tool_hints = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
|
||||
frontmatter["tools"] = tool_hints
|
||||
content = ensure_canonical_skill_body(
|
||||
str(payload.get("content") or "").strip(),
|
||||
title=str(frontmatter.get("name") or "generated-skill"),
|
||||
description=str(frontmatter.get("description") or ""),
|
||||
tools=tool_hints,
|
||||
)
|
||||
return {
|
||||
"frontmatter": frontmatter,
|
||||
"content": str(payload.get("content") or "").strip(),
|
||||
"content": content,
|
||||
"change_reason": str(payload.get("change_reason") or ""),
|
||||
"preserved_sections": _coerce_string_list(payload.get("preserved_sections")),
|
||||
"changed_sections": _coerce_string_list(payload.get("changed_sections")),
|
||||
@ -162,13 +174,20 @@ class SkillDraftSynthesizer:
|
||||
def _fallback_payload(candidate: SkillLearningCandidate, evidence_packet: EvidencePacket, action: str) -> dict[str, Any]:
|
||||
related = candidate.related_skill_names[0] if candidate.related_skill_names else "generated-skill"
|
||||
title = related.replace("_", "-")
|
||||
content = "\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured."
|
||||
tools = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
|
||||
content = ensure_canonical_skill_body(
|
||||
"\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured.",
|
||||
title=title,
|
||||
description=candidate.reason or f"Auto-generated {action} draft for {title}.",
|
||||
tools=tools,
|
||||
)
|
||||
return {
|
||||
"frontmatter": {
|
||||
"name": title,
|
||||
"description": candidate.reason or f"Auto-generated {action} draft for {title}.",
|
||||
"tools": _coerce_string_list(evidence_packet.metadata.get("tool_names")),
|
||||
"tools": tools,
|
||||
},
|
||||
"content": f"# {title}\n\n## Evidence\n\n{content}\n",
|
||||
"content": content,
|
||||
"change_reason": candidate.reason or f"Fallback {action} synthesis.",
|
||||
"preserved_sections": [],
|
||||
"changed_sections": [],
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import Callable
|
||||
from beaver.engine.providers import ProviderBundle
|
||||
from beaver.memory.skills import SkillLearningCandidate
|
||||
from beaver.skills.learning.pipeline import SkillLearningPipelineService
|
||||
from beaver.skills.learning.replay import ReplayRunner
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@ -57,10 +58,12 @@ class SkillLearningWorker:
|
||||
*,
|
||||
pipeline: SkillLearningPipelineService,
|
||||
provider_bundle_factory: Callable[[], ProviderBundle],
|
||||
replay_runner_factory: Callable[[], ReplayRunner] | None = None,
|
||||
config: SkillLearningWorkerConfig | None = None,
|
||||
) -> None:
|
||||
self.pipeline = pipeline
|
||||
self.provider_bundle_factory = provider_bundle_factory
|
||||
self.replay_runner_factory = replay_runner_factory
|
||||
self.config = config or SkillLearningWorkerConfig.from_env()
|
||||
self._running = False
|
||||
self._lock = asyncio.Lock()
|
||||
@ -126,6 +129,7 @@ class SkillLearningWorker:
|
||||
draft.skill_name,
|
||||
draft.draft_id,
|
||||
provider_bundle=self.provider_bundle_factory(),
|
||||
replay_runner=self.replay_runner_factory() if self.replay_runner_factory is not None else None,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
@ -16,8 +16,8 @@ class SkillPublisher:
|
||||
|
||||
def publish(self, skill_name: str, draft_id: str, publisher: str, notes: str = "") -> SkillVersion:
|
||||
draft = self._require_draft(skill_name, draft_id)
|
||||
if draft.status != SkillReviewState.APPROVED.value:
|
||||
raise ValueError("Draft must be approved before publish")
|
||||
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
|
||||
raise ValueError("Draft must be submitted for review before publish")
|
||||
if draft.proposal_kind == "retire_skill":
|
||||
raise ValueError("Retire proposals must be applied through apply_retire_proposal")
|
||||
|
||||
@ -81,8 +81,8 @@ class SkillPublisher:
|
||||
|
||||
def apply_retire_proposal(self, skill_name: str, draft_id: str, actor: str, notes: str = "") -> SkillSpec:
|
||||
draft = self._require_draft(skill_name, draft_id)
|
||||
if draft.status != SkillReviewState.APPROVED.value:
|
||||
raise ValueError("Retire proposal must be approved before apply")
|
||||
if draft.status not in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value}:
|
||||
raise ValueError("Retire proposal must be submitted for review before apply")
|
||||
if draft.proposal_kind != "retire_skill":
|
||||
raise ValueError("Only retire_skill proposals can be applied as retire proposals")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user