feat(app): 移除内置agents并添加CORS支持和技能上传优化

移除了agents/registry.json中的所有内置agents配置,将agents数组清空。
为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。
重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。
新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。
更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。
修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
2026-06-12 13:25:20 +08:00
parent fc9fd93c36
commit 8aeb97a5fc
76 changed files with 3382 additions and 553 deletions

View File

@ -2,6 +2,8 @@
from __future__ import annotations
import json
from typing import Any
from uuid import uuid4
from beaver.engine.context import SkillContext
@ -39,7 +41,16 @@ class SkillDraftEvaluator:
return self._skipped(candidate, draft)
runs = self.run_store.list_runs()
replay_cases = select_replay_cases(candidate, runs)
if replay_runner is not None:
replay_cases, case_selection_meta = await _prepare_eval_cases(
candidate=candidate,
draft=draft,
historical_cases=select_replay_cases(candidate, runs),
provider_bundle=provider_bundle,
)
else:
replay_cases = []
case_selection_meta = {}
if replay_runner is not None and replay_cases:
return await self._evaluate_replay(
candidate=candidate,
@ -47,6 +58,7 @@ class SkillDraftEvaluator:
replay_cases=replay_cases,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
case_selection_meta=case_selection_meta,
)
return self._evaluate_heuristic(candidate, draft, runs)
@ -58,7 +70,7 @@ class SkillDraftEvaluator:
) -> SkillDraftEvalReport:
runs_by_id = {record.run_id: record for record in runs}
cases: list[dict] = []
for run_id in candidate.source_run_ids[:8]:
for run_id in candidate.source_run_ids[:10]:
record = runs_by_id.get(run_id)
if record is None:
continue
@ -116,6 +128,7 @@ class SkillDraftEvaluator:
replay_cases: list[dict],
provider_bundle: ProviderBundle,
replay_runner: ReplayRunner,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
case_reports: list[dict] = []
legacy_cases: list[dict] = []
@ -147,17 +160,43 @@ class SkillDraftEvaluator:
baseline=baseline,
candidate=candidate_arm,
)
baseline_score = surrogate["baseline_score"]
candidate_score = surrogate["candidate_score"]
baseline_ability = _ability_score(
case=case,
arm=baseline,
arm_name="baseline",
)
candidate_ability = _ability_score(
case=case,
arm=candidate_arm,
arm_name="candidate",
)
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tool_execution_score = {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
}
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": tool_execution_score,
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
@ -172,13 +211,23 @@ class SkillDraftEvaluator:
{
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": case.get("tier") or ("bronze" if case.get("synthetic") else "gold"),
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
)
preservation_report = _preservation_report(candidate, draft)
return _report_from_case_reports(candidate, draft, case_reports, legacy_cases, preservation_report)
return _report_from_case_reports(
candidate,
draft,
case_reports,
legacy_cases,
preservation_report,
case_selection_meta or {},
)
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
return SkillDraftEvalReport(
@ -238,22 +287,400 @@ def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
async def _prepare_eval_cases(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
explicit_cases = _explicit_eval_cases(candidate)
merged = _dedupe_cases([*explicit_cases, *historical_cases])
usable, excluded = _filter_unscorable_cases(merged)
missing = max(0, 10 - len(usable))
generated: list[dict[str, Any]] = []
if missing:
generated = await _generate_synthetic_cases(
candidate=candidate,
draft=draft,
historical_cases=usable,
provider_bundle=provider_bundle,
count=missing,
)
generated, generated_excluded = _filter_unscorable_cases(generated)
excluded["synthetic_without_validator"] += generated_excluded["synthetic_without_validator"]
if len(generated) < missing:
generated.extend(
_fallback_synthetic_cases(
candidate=candidate,
historical_cases=usable,
start_index=len(generated) + 1,
count=missing - len(generated),
)
)
prepared = [*usable, *generated]
return prepared[:10], {
"requested_case_count": 10,
"historical_case_count": len(historical_cases),
"explicit_case_count": len(explicit_cases),
"generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
"excluded_synthetic_without_validator": excluded["synthetic_without_validator"],
}
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
raw_cases = candidate.evidence.get("eval_cases") if isinstance(candidate.evidence, dict) else None
if not isinstance(raw_cases, list):
return []
result: list[dict[str, Any]] = []
for index, raw in enumerate(raw_cases, start=1):
if not isinstance(raw, dict):
continue
task_text = str(raw.get("task_text") or "").strip()
if not task_text:
continue
case = {
"run_id": str(raw.get("run_id") or f"explicit:{candidate.candidate_id}:{index:02d}"),
"task_id": raw.get("task_id") or f"explicit-{index:02d}",
"session_id": raw.get("session_id") or "explicit-eval",
"task_text": task_text,
"baseline_skill_names": list(raw.get("baseline_skill_names") or _baseline_skill_names(candidate)),
"candidate_skill_name": raw.get("candidate_skill_name") or candidate.draft_skill_name,
"accepted_score": _bounded_score(raw.get("accepted_score"), default=0.75),
"synthetic": bool(raw.get("synthetic")),
"tier": raw.get("tier") or ("bronze" if raw.get("synthetic") else "gold"),
}
if isinstance(raw.get("validator"), dict):
case["validator"] = dict(raw["validator"])
result.append(case)
return result
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
seen: set[str] = set()
for case in cases:
run_id = str(case.get("run_id") or "")
task_text = str(case.get("task_text") or "")
key = run_id or task_text
if not key or key in seen:
continue
seen.add(key)
result.append(case)
return result
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
result: list[dict[str, Any]] = []
excluded = {"synthetic_without_validator": 0}
for case in cases:
if case.get("synthetic") and not isinstance(case.get("validator"), dict):
excluded["synthetic_without_validator"] += 1
continue
result.append(case)
return result, excluded
async def _generate_synthetic_cases(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
provider_bundle: ProviderBundle,
count: int,
) -> list[dict[str, Any]]:
provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
model = getattr(runtime, "model", None)
try:
response = await provider.chat(
messages=[
{
"role": "system",
"content": (
"You generate validator-first Beaver skill evaluation cases. "
"Return only JSON with key cases. Each case must include task_text and validator. "
"Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
),
},
{
"role": "user",
"content": _synthetic_case_prompt(
candidate=candidate,
draft=draft,
historical_cases=historical_cases,
count=count,
),
},
],
model=model,
max_tokens=2200,
temperature=0.4,
)
except Exception:
return []
payload = _parse_json_payload(response.content or "")
raw_cases = payload.get("cases") if isinstance(payload, dict) else None
if not isinstance(raw_cases, list):
return []
return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count)
def _synthetic_case_prompt(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
count: int,
) -> str:
historical = [
{
"run_id": item.get("run_id"),
"task_text": item.get("task_text"),
"validator": item.get("validator"),
}
for item in historical_cases
]
return (
f"Generate {count} synthetic evaluation cases for this skill draft.\n\n"
f"Candidate kind: {candidate.kind}\n"
f"Candidate reason: {candidate.reason}\n"
f"Draft skill name: {draft.skill_name}\n"
f"Related skills: {candidate.related_skill_names}\n"
f"Historical cases:\n{json.dumps(historical, ensure_ascii=False)}\n\n"
"Every synthetic case must be validator-first. Return exactly:\n"
'{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",'
'"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}'
)
def _parse_json_payload(content: str) -> dict[str, Any]:
cleaned = content.strip()
if cleaned.startswith("```"):
cleaned = cleaned.strip("`")
if cleaned.startswith("json"):
cleaned = cleaned[4:]
try:
payload = json.loads(cleaned)
except json.JSONDecodeError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start < 0 or end <= start:
return {}
try:
payload = json.loads(cleaned[start : end + 1])
except json.JSONDecodeError:
return {}
return payload if isinstance(payload, dict) else {}
def _synthetic_case_payloads(
candidate: SkillLearningCandidate,
raw_cases: list[Any],
*,
start_index: int,
limit: int,
) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
for raw in raw_cases:
if not isinstance(raw, dict):
continue
task_text = str(raw.get("task_text") or "").strip()
validator = raw.get("validator")
if not task_text or not isinstance(validator, dict):
continue
result.append(
_synthetic_case_payload(
candidate,
task_text,
start_index + len(result),
validator=dict(validator),
tier=str(raw.get("tier") or "bronze"),
)
)
if len(result) >= limit:
break
return result
def _fallback_synthetic_cases(
*,
candidate: SkillLearningCandidate,
historical_cases: list[dict[str, Any]],
start_index: int,
count: int,
) -> list[dict[str, Any]]:
seed_text = ""
if historical_cases:
seed_text = str(historical_cases[(start_index - 1) % len(historical_cases)].get("task_text") or "")
if not seed_text:
seed_text = candidate.reason or candidate.draft_skill_name or "the candidate skill"
required_terms = _terms(seed_text)[:2] or ["done"]
return [
_synthetic_case_payload(
candidate,
f"Complete a realistic task related to {seed_text}. Scenario {index}.",
index,
validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []},
tier="bronze",
)
for index in range(start_index, start_index + count)
]
def _synthetic_case_payload(
candidate: SkillLearningCandidate,
task_text: str,
index: int,
*,
validator: dict[str, Any],
tier: str,
) -> dict[str, Any]:
return {
"run_id": f"synthetic:{candidate.candidate_id}:{index:02d}",
"task_id": f"synthetic-{index:02d}",
"session_id": "synthetic-eval",
"task_text": task_text,
"baseline_skill_names": _baseline_skill_names(candidate),
"candidate_skill_name": candidate.draft_skill_name,
"accepted_score": 0.75,
"synthetic": True,
"tier": tier,
"validator": validator,
}
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
if candidate.kind == "revise_skill":
return list(candidate.related_skill_names[:1])
if candidate.kind == "merge_skills":
return list(candidate.related_skill_names)
return []
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
validator = case.get("validator") if isinstance(case.get("validator"), dict) else None
if validator is not None:
return _ability_from_validator(validator, arm)
if not case.get("synthetic"):
score = _bounded_score(case.get("accepted_score"), default=0.75) if arm_name == "baseline" else _ability_from_output(arm)["final_score"]
return _ability_breakdown(score=score, source="user_feedback" if arm_name == "baseline" else "llm_judge")
return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
validator_type = str(validator.get("type") or "")
if validator_type != "final_answer_contains":
return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])
answer = str(arm.get("final_answer") or "").lower()
required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]
matched = sum(1 for term in required_terms if term in answer)
outcome = matched / max(1, len(required_terms))
unsafe = any(term in answer for term in forbidden_terms)
safety = 0.0 if unsafe else 1.0
final_score = (
0.40 * outcome
+ 0.25 * outcome
+ 0.15 * _process_validity(arm)
+ 0.10 * safety
+ 0.10 * _path_efficiency(arm, outcome)
)
return {
**_ability_breakdown(score=final_score, source="auto_validator"),
"outcome_correctness": round(outcome, 4),
"artifact_correctness": round(outcome, 4),
"safety_no_regression": round(safety, 4),
"validator_type": validator_type,
}
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
answer = str(arm.get("final_answer") or "").strip()
score = 0.7 if answer and arm.get("finish_reason") != "error" else 0.3
return _ability_breakdown(score=score, source=source, notes=notes)
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
bounded = _bounded_score(score, default=0.0)
return {
"outcome_correctness": bounded,
"artifact_correctness": bounded,
"process_validity": bounded,
"safety_no_regression": bounded,
"path_efficiency": bounded,
"final_score": round(bounded, 4),
"source": source,
"notes": list(notes or []),
}
def _process_validity(arm: dict[str, Any]) -> float:
if arm.get("finish_reason") == "error":
return 0.2
return 0.8 if arm.get("tool_calls") else 0.6
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
if outcome < 0.5:
return 0.3
call_count = len([item for item in arm.get("tool_calls") or [] if isinstance(item, dict)])
if call_count <= 3:
return 1.0
if call_count <= 6:
return 0.7
return 0.4
def _bounded_score(value: Any, *, default: float) -> float:
try:
return max(0.0, min(1.0, float(value)))
except (TypeError, ValueError):
return default
def _terms(text: str) -> list[str]:
return [part.strip(".,:;!?()[]{}").lower() for part in text.split() if len(part.strip(".,:;!?()[]{}")) > 3]
def _report_from_case_reports(
candidate: SkillLearningCandidate,
draft: SkillDraft,
case_reports: list[dict],
legacy_cases: list[dict],
preservation_report: dict | None,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
unchanged = len(legacy_cases) - len(regressions) - len(improved)
real_cases = [item for item in legacy_cases if not item.get("synthetic")]
synthetic_cases = [item for item in legacy_cases if item.get("synthetic")]
execution, surrogate, blocked = _coverage(case_reports)
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
score_delta = candidate_avg - baseline_avg
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
selection_meta = dict(case_selection_meta or {})
real_score_avg = _avg([item["candidate_score"] for item in real_cases])
synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
overall_score_avg = round(candidate_avg, 4)
ability_summary = {
"score_role": "primary",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
}
tool_execution_summary = {
"score_role": "diagnostic_only",
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
}
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
@ -276,11 +703,34 @@ def _report_from_case_reports(
blocked_coverage=blocked,
confidence=confidence,
case_reports=case_reports,
tool_mode_summary={"executed": execution, "surrogate": surrogate, "blocked": blocked},
tool_mode_summary={
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
"score_role": "diagnostic_only",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
**selection_meta,
},
ability_score_summary=ability_summary,
tool_execution_summary=tool_execution_summary,
case_selection_summary=selection_meta,
real_score_avg=real_score_avg,
synthetic_score_avg=synthetic_score_avg,
overall_score_avg=overall_score_avg,
preservation_report=preservation_report,
)
def _avg(values: list[float]) -> float | None:
if not values:
return None
return round(sum(values) / len(values), 4)
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
for report in case_reports:

View File

@ -323,8 +323,8 @@ class SkillLearningPipelineService:
def _validate_publish_gates(self, draft: SkillDraft, *, confirm_high_risk: bool) -> None:
reviews = self.reviews_for_draft(draft.skill_name, draft.draft_id)
if not any(review.status == SkillReviewState.APPROVED.value for review in reviews):
raise ValueError("Draft must have an approved review before publish")
if not any(review.status in {SkillReviewState.IN_REVIEW.value, SkillReviewState.APPROVED.value} for review in reviews):
raise ValueError("Draft must be submitted for review before publish")
safety = self.get_safety_report(draft.skill_name, draft.draft_id)
if safety is None:
raise ValueError("Draft requires a passing safety report before publish")

View File

@ -162,18 +162,23 @@ class ReplayRunner:
registry=loaded.tool_registry,
policy=self.policy,
)
result = await self.agent_loop.process_direct(
request.task_text,
provider_bundle=request.provider_bundle,
include_skill_assembly=False,
include_tools=True,
pinned_skill_names=request.pinned_skill_names,
pinned_skill_contexts=request.pinned_skill_contexts,
max_tool_iterations=int(request.model_settings.get("max_tool_iterations") or 4),
temperature=float(request.model_settings.get("temperature") or 0.0),
source="skill_replay_eval",
tool_executor_override=replay_executor,
)
direct_kwargs = {
"provider_bundle": request.provider_bundle,
"include_skill_assembly": False,
"include_tools": True,
"pinned_skill_names": request.pinned_skill_names,
"pinned_skill_contexts": request.pinned_skill_contexts,
"max_tool_iterations": int(request.model_settings.get("max_tool_iterations") or 4),
"temperature": float(request.model_settings.get("temperature") or 0.0),
"source": "skill_replay_eval",
"tool_executor_override": replay_executor,
}
try:
result = await self.agent_loop.process_direct(request.task_text, **direct_kwargs)
except RuntimeError as exc:
if not _is_process_direct_disabled_while_running(exc) or not hasattr(self.agent_loop, "submit_direct"):
raise
result = await self.agent_loop.submit_direct(request.task_text, **direct_kwargs)
return {
"case_id": request.case_id,
"arm": request.arm,
@ -188,6 +193,14 @@ class ReplayRunner:
}
def _is_process_direct_disabled_while_running(exc: RuntimeError) -> bool:
message = str(exc)
return (
"AgentLoop.process_direct() is disabled while run() is active" in message
and "submit tasks via submit_direct() instead" in message
)
def _side_effects_from_traces(traces: list[dict[str, Any]]) -> list[dict[str, Any]]:
effects: list[dict[str, Any]] = []
for trace in traces:

View File

@ -99,6 +99,7 @@ class SkillLearningService:
]
source_run_ids = [record.run_id for record in source_runs]
source_session_ids = list(dict.fromkeys(record.session_id for record in source_runs))
representative_task_text = self._representative_task_text(source_runs, fallback=final_run.task_text)
if not published_receipts:
candidates.append(
@ -113,7 +114,8 @@ class SkillLearningService:
"task_id": task_id,
"final_accepted_run_id": final_accepted_run_id,
"source_run_ids": source_run_ids,
"theme": self._task_theme(final_run.task_text),
"task_text": representative_task_text,
"theme": self._task_theme(representative_task_text),
},
status="open",
priority=1,
@ -329,8 +331,14 @@ class SkillLearningService:
def _build_new_skill_candidates(self) -> list[SkillLearningCandidate]:
groups: dict[str, list[RunRecord]] = {}
for record in self.run_store.list_runs():
key = self._task_theme(record.task_text)
all_runs = self.run_store.list_runs()
runs_by_task: dict[str, list[RunRecord]] = {}
for record in all_runs:
if record.task_id:
runs_by_task.setdefault(record.task_id, []).append(record)
for record in all_runs:
task_runs = runs_by_task.get(record.task_id, [record])
key = self._task_theme(self._representative_task_text(task_runs, fallback=record.task_text))
if not key:
continue
groups.setdefault(key, []).append(record)
@ -443,12 +451,24 @@ class SkillLearningService:
@staticmethod
def _task_theme(task_text: str) -> str:
cleaned = re.sub(r"\s+", " ", task_text.strip().lower())
cleaned = re.sub(r"\s+", " ", task_text.strip())
if not cleaned:
return ""
words = cleaned.split(" ")
first_sentence = re.split(r"[。!?.!?]", cleaned, maxsplit=1)[0].strip()
if not first_sentence:
first_sentence = cleaned
words = first_sentence.split(" ")
return " ".join(words[:8]).strip()
@staticmethod
def _representative_task_text(runs: list[RunRecord], *, fallback: str = "") -> str:
ordered = sorted(runs, key=lambda item: (item.attempt_index, item.started_at, item.run_id))
for record in ordered:
text = record.task_text.strip()
if text:
return text
return fallback.strip()
@staticmethod
def _suggest_skill_name(
candidate: SkillLearningCandidate,

View File

@ -15,12 +15,15 @@ class SurrogateToolEvaluator:
return {
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"baseline_tool_execution_score": baseline_score,
"candidate_tool_execution_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"surrogate_tool_count": surrogate_count,
"blocked_tool_count": blocked_count,
"score_role": "diagnostic_only",
"confidence": confidence,
"notes": [
"Surrogate score is based on intended tool calls, schemas, arguments, and task relevance.",
"Tool execution score is diagnostic only and is not the main task ability score.",
],
}

View File

@ -6,6 +6,7 @@ import json
from typing import Any
from beaver.engine.providers.base import LLMProvider
from beaver.skills.authoring import canonical_skill_format_instructions, ensure_canonical_skill_body, normalize_skill_frontmatter
from beaver.skills.learning.evidence import EvidencePacket
from beaver.memory.skills.models import SkillLearningCandidate
@ -58,7 +59,8 @@ class SkillDraftSynthesizer:
"content": (
"You synthesize Beaver skill drafts from execution evidence. "
"Return only JSON with keys: frontmatter, content, change_reason, "
"preserved_sections, changed_sections, dropped_sections."
"preserved_sections, changed_sections, dropped_sections. "
"The content must follow the Canonical Beaver SKILL.md format."
),
},
{"role": "user", "content": prompt},
@ -113,6 +115,7 @@ class SkillDraftSynthesizer:
+ "\n- tools: an explicit JSON array of exact tool names this skill needs. "
+ "Prefer called tool names when the workflow depends on them; use run-selected tool names only when clearly required. "
+ "Use [] only when no tool is required."
+ "\n\n" + canonical_skill_format_instructions()
+ "\nThe JSON may include preserved_sections, changed_sections, and dropped_sections arrays."
)
@ -144,14 +147,23 @@ class SkillDraftSynthesizer:
@staticmethod
def _normalize_payload(payload: dict[str, Any], evidence_packet: EvidencePacket) -> dict[str, Any]:
frontmatter = dict(payload.get("frontmatter") or {})
frontmatter = normalize_skill_frontmatter(
dict(payload.get("frontmatter") or {}),
skill_name=str((payload.get("frontmatter") or {}).get("name") or "generated-skill"),
)
tool_hints = _coerce_string_list(frontmatter.get("tools"))
if not tool_hints:
tool_hints = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
frontmatter["tools"] = tool_hints
content = ensure_canonical_skill_body(
str(payload.get("content") or "").strip(),
title=str(frontmatter.get("name") or "generated-skill"),
description=str(frontmatter.get("description") or ""),
tools=tool_hints,
)
return {
"frontmatter": frontmatter,
"content": str(payload.get("content") or "").strip(),
"content": content,
"change_reason": str(payload.get("change_reason") or ""),
"preserved_sections": _coerce_string_list(payload.get("preserved_sections")),
"changed_sections": _coerce_string_list(payload.get("changed_sections")),
@ -162,13 +174,20 @@ class SkillDraftSynthesizer:
def _fallback_payload(candidate: SkillLearningCandidate, evidence_packet: EvidencePacket, action: str) -> dict[str, Any]:
related = candidate.related_skill_names[0] if candidate.related_skill_names else "generated-skill"
title = related.replace("_", "-")
content = "\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured."
tools = _coerce_string_list(evidence_packet.metadata.get("tool_names"))
content = ensure_canonical_skill_body(
"\n".join(f"- {item}" for item in evidence_packet.task_summaries[:5]) or "- No evidence captured.",
title=title,
description=candidate.reason or f"Auto-generated {action} draft for {title}.",
tools=tools,
)
return {
"frontmatter": {
"name": title,
"description": candidate.reason or f"Auto-generated {action} draft for {title}.",
"tools": _coerce_string_list(evidence_packet.metadata.get("tool_names")),
"tools": tools,
},
"content": f"# {title}\n\n## Evidence\n\n{content}\n",
"content": content,
"change_reason": candidate.reason or f"Fallback {action} synthesis.",
"preserved_sections": [],
"changed_sections": [],

View File

@ -10,6 +10,7 @@ from typing import Callable
from beaver.engine.providers import ProviderBundle
from beaver.memory.skills import SkillLearningCandidate
from beaver.skills.learning.pipeline import SkillLearningPipelineService
from beaver.skills.learning.replay import ReplayRunner
@dataclass(slots=True)
@ -57,10 +58,12 @@ class SkillLearningWorker:
*,
pipeline: SkillLearningPipelineService,
provider_bundle_factory: Callable[[], ProviderBundle],
replay_runner_factory: Callable[[], ReplayRunner] | None = None,
config: SkillLearningWorkerConfig | None = None,
) -> None:
self.pipeline = pipeline
self.provider_bundle_factory = provider_bundle_factory
self.replay_runner_factory = replay_runner_factory
self.config = config or SkillLearningWorkerConfig.from_env()
self._running = False
self._lock = asyncio.Lock()
@ -126,6 +129,7 @@ class SkillLearningWorker:
draft.skill_name,
draft.draft_id,
provider_bundle=self.provider_bundle_factory(),
replay_runner=self.replay_runner_factory() if self.replay_runner_factory is not None else None,
)
return True