移除了agents/registry.json中的所有内置agents配置,将agents数组清空。 为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。 重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。 新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。 更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。 修改了意图路由技能的说明,改进任务状态管理逻辑。
776 lines
30 KiB
Python
776 lines
30 KiB
Python
"""Lightweight replay/eval reports for skill drafts."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any
|
|
from uuid import uuid4
|
|
|
|
from beaver.engine.context import SkillContext
|
|
from beaver.engine.providers import ProviderBundle
|
|
from beaver.memory.runs import RunMemoryStore
|
|
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
|
|
from beaver.skills.learning.case_selection import select_replay_cases
|
|
from beaver.skills.learning.preservation import check_preservation
|
|
from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
|
|
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
|
|
from beaver.skills.specs import SkillDraft
|
|
|
|
|
|
class SkillDraftEvaluator:
    """Builds a bounded eval report without writing user-visible sessions.

    ``evaluate`` takes one of three paths: a "skipped" report when no main
    provider is available, a baseline-vs-candidate replay when a replay
    runner is supplied and at least one case could be prepared, and a
    heuristic estimate from stored runs otherwise.
    """

    def __init__(
        self,
        run_store: RunMemoryStore,
        *,
        surrogate_evaluator: SurrogateToolEvaluator | None = None,
    ) -> None:
        """Keep the run store and the surrogate evaluator (a default one if omitted)."""
        self.run_store = run_store
        self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()

    async def evaluate(
        self,
        *,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        provider_bundle: ProviderBundle | None,
        replay_runner: ReplayRunner | None = None,
    ) -> SkillDraftEvalReport:
        """Evaluate ``draft`` for ``candidate`` and return the resulting report.

        Returns the skipped report when ``provider_bundle`` or its main
        provider is missing. With a ``replay_runner`` and a non-empty prepared
        case list the replay path is used; in every other situation the
        heuristic path is used.
        """
        if provider_bundle is None or provider_bundle.main_provider is None:
            return self._skipped(candidate, draft)

        runs = self.run_store.list_runs()
        if replay_runner is not None:
            replay_cases, case_selection_meta = await _prepare_eval_cases(
                candidate=candidate,
                draft=draft,
                historical_cases=select_replay_cases(candidate, runs),
                provider_bundle=provider_bundle,
            )
            if replay_cases:
                return await self._evaluate_replay(
                    candidate=candidate,
                    draft=draft,
                    replay_cases=replay_cases,
                    provider_bundle=provider_bundle,
                    replay_runner=replay_runner,
                    case_selection_meta=case_selection_meta,
                )
        return self._evaluate_heuristic(candidate, draft, runs)

    def _evaluate_heuristic(
        self,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        runs: list,
    ) -> SkillDraftEvalReport:
        """Estimate scores from stored runs without replaying anything.

        Looks at the first ten ``candidate.source_run_ids`` that exist in
        ``runs``. When none are found, a single placeholder case with a 0.75
        baseline is used so the averages are always defined.
        """
        runs_by_id = {record.run_id: record for record in runs}
        cases: list[dict] = []
        for run_id in candidate.source_run_ids[:10]:
            record = runs_by_id.get(run_id)
            if record is None:
                continue
            baseline = _score_from_validation(record.validation_result, record.success)
            candidate_score = _candidate_score(baseline, draft)
            cases.append(
                {
                    "run_id": run_id,
                    "session_id": record.session_id,
                    "baseline_score": baseline,
                    "candidate_score": candidate_score,
                    "delta": round(candidate_score - baseline, 4),
                }
            )
        if not cases:
            # No source run could be resolved: fall back to one placeholder case.
            placeholder_score = _candidate_score(0.75, draft)
            cases.append(
                {
                    "run_id": "",
                    "session_id": "",
                    "baseline_score": 0.75,
                    "candidate_score": placeholder_score,
                    "delta": round(placeholder_score - 0.75, 4),
                }
            )

        baseline_avg = sum(item["baseline_score"] for item in cases) / len(cases)
        candidate_avg = sum(item["candidate_score"] for item in cases) / len(cases)
        regressions = [item for item in cases if item["candidate_score"] < item["baseline_score"]]
        improved = [item for item in cases if item["candidate_score"] > item["baseline_score"]]
        unchanged = len(cases) - len(regressions) - len(improved)
        score_delta = candidate_avg - baseline_avg
        # Fail when any case regressed without a net gain, or the average is below 0.75.
        passed = not (len(regressions) > 0 and score_delta <= 0) and candidate_avg >= 0.75
        return SkillDraftEvalReport(
            report_id=uuid4().hex,
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=passed,
            baseline_score_avg=round(baseline_avg, 4),
            candidate_score_avg=round(candidate_avg, 4),
            score_delta=round(score_delta, 4),
            regression_count=len(regressions),
            improved_count=len(improved),
            unchanged_count=unchanged,
            cases=cases,
            status="completed",
            created_at=_utc_now(),
        )

    @staticmethod
    def _merged_arm_items(baseline: dict, candidate_arm: dict, key: str) -> list:
        """Concatenate the ``key`` lists of both arms, treating missing or None as empty."""
        # ``or []`` also covers an explicit None value, which ``.get(key, [])`` would
        # hand straight to the unpacking and crash on.
        return [*(baseline.get(key) or []), *(candidate_arm.get(key) or [])]

    async def _evaluate_replay(
        self,
        *,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        replay_cases: list[dict],
        provider_bundle: ProviderBundle,
        replay_runner: ReplayRunner,
        case_selection_meta: dict[str, Any] | None = None,
    ) -> SkillDraftEvalReport:
        """Run a baseline arm and a candidate arm for every case and aggregate them.

        For each case the baseline arm is run first (pinned to the case's
        baseline skill names), then the candidate arm (pinned to the draft's
        skill context), then the surrogate evaluator compares both. Ability
        scores drive the verdict; surrogate scores are recorded as
        ``diagnostic_only``.
        """
        case_reports: list[dict] = []
        legacy_cases: list[dict] = []
        for case in replay_cases:
            run_id = case["run_id"]
            task_text = str(case["task_text"])
            baseline = await replay_runner.run_arm(
                ReplayArmRequest(
                    case_id=f"{run_id}:baseline",
                    arm="baseline",
                    task_text=task_text,
                    pinned_skill_names=list(case.get("baseline_skill_names") or []),
                    pinned_skill_contexts=[],
                    provider_bundle=provider_bundle,
                    model_settings={"max_tool_iterations": 4, "temperature": 0.0},
                )
            )
            candidate_arm = await replay_runner.run_arm(
                ReplayArmRequest(
                    case_id=f"{run_id}:candidate",
                    arm="candidate",
                    task_text=task_text,
                    pinned_skill_names=[],
                    pinned_skill_contexts=[_draft_skill_context(draft)],
                    provider_bundle=provider_bundle,
                    model_settings={"max_tool_iterations": 4, "temperature": 0.0},
                )
            )
            surrogate = await self.surrogate_evaluator.evaluate(
                task_text=task_text,
                baseline=baseline,
                candidate=candidate_arm,
            )
            baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline")
            candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate")
            baseline_score = baseline_ability["final_score"]
            candidate_score = candidate_ability["final_score"]
            delta = round(candidate_score - baseline_score, 4)
            is_synthetic = bool(case.get("synthetic"))
            # Cases without an explicit tier default to bronze (synthetic) or gold (real).
            tier = case.get("tier") or ("bronze" if case.get("synthetic") else "gold")
            tool_execution_score = {
                "baseline_score": surrogate["baseline_score"],
                "candidate_score": surrogate["candidate_score"],
                "delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
                "score_role": "diagnostic_only",
            }
            case_reports.append(
                {
                    "run_id": run_id,
                    "task_id": case.get("task_id"),
                    "session_id": case.get("session_id"),
                    "task_text": case.get("task_text"),
                    "synthetic": is_synthetic,
                    "tier": tier,
                    "validator": case.get("validator"),
                    "baseline": baseline,
                    "candidate": candidate_arm,
                    "baseline_score": baseline_score,
                    "candidate_score": candidate_score,
                    "delta": delta,
                    "ability_score": {
                        "baseline": baseline_ability,
                        "candidate": candidate_ability,
                        "delta": delta,
                    },
                    "tool_execution_score": tool_execution_score,
                    "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
                    "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
                    "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
                    "confidence": surrogate["confidence"],
                    "tool_calls": self._merged_arm_items(baseline, candidate_arm, "tool_calls"),
                    "artifacts": self._merged_arm_items(baseline, candidate_arm, "artifacts"),
                    "side_effects": self._merged_arm_items(baseline, candidate_arm, "side_effects"),
                    "validator_notes": list(surrogate.get("notes") or []),
                }
            )
            legacy_cases.append(
                {
                    "run_id": run_id,
                    "session_id": case.get("session_id") or "",
                    "task_text": case.get("task_text") or "",
                    "synthetic": is_synthetic,
                    "tier": tier,
                    "baseline_score": baseline_score,
                    "candidate_score": candidate_score,
                    "delta": delta,
                }
            )
        preservation_report = _preservation_report(candidate, draft)
        return _report_from_case_reports(
            candidate,
            draft,
            case_reports,
            legacy_cases,
            preservation_report,
            case_selection_meta or {},
        )

    def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
        """Return a zero-score, case-less report with status ``skipped_provider_unavailable``.

        The report is marked ``passed=True``.
        """
        return SkillDraftEvalReport(
            report_id=uuid4().hex,
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=True,
            baseline_score_avg=0.0,
            candidate_score_avg=0.0,
            score_delta=0.0,
            regression_count=0,
            improved_count=0,
            unchanged_count=0,
            cases=[],
            status="skipped_provider_unavailable",
            created_at=_utc_now(),
        )
|
|
|
|
|
|
def _score_from_validation(validation: dict | None, success: bool) -> float:
    """Return a score in [0, 1] for a stored run.

    Uses ``validation["score"]`` when the key is present and holds a usable
    number (a falsy value such as ``None`` counts as 0.0). Otherwise falls
    back to 0.8 for a successful run and 0.4 for a failed one.

    Values that cannot be converted, overflow ``float``, or are NaN are
    treated as unusable: clamping NaN with ``min``/``max`` would otherwise
    silently yield a perfect 1.0.
    """
    import math

    if isinstance(validation, dict) and "score" in validation:
        try:
            score = float(validation.get("score") or 0.0)
        except (TypeError, ValueError, OverflowError):
            score = None
        if score is not None and not math.isnan(score):
            return max(0.0, min(1.0, score))
    return 0.8 if success else 0.4
|
|
|
|
|
|
def _candidate_score(baseline: float, draft: SkillDraft) -> float:
    """Heuristically score ``draft`` relative to ``baseline``.

    Returns 0.0 for an empty draft unless the proposal is ``retire_skill``.
    A draft whose text mentions "regression" (case-insensitive) is scored
    0.2 below the baseline, floored at 0.0. Any other draft is scored at
    ``baseline + 0.05``, kept within [0.75, 1.0].
    """
    text = draft.proposed_content.strip()
    if not text and draft.proposal_kind != "retire_skill":
        return 0.0

    if "regression" in text.lower():
        penalised = baseline - 0.2
        return max(0.0, penalised)

    boosted = max(0.75, baseline + 0.05)
    return min(1.0, boosted)
|
|
|
|
|
|
def _draft_skill_context(draft: SkillDraft) -> SkillContext:
    """Wrap ``draft`` in a ``SkillContext`` for the candidate replay arm.

    Tool hints come from the ``tools`` entry of the draft's frontmatter and
    are only used when that entry is a list; entries whose string form is
    blank are dropped.
    """
    raw_hints = draft.proposed_frontmatter.get("tools")
    hints: list[str] = []
    if isinstance(raw_hints, list):
        hints = [str(hint) for hint in raw_hints if str(hint).strip()]

    return SkillContext(
        name=f"draft:{draft.skill_name}",
        content=draft.proposed_content,
        version=draft.draft_id,
        content_hash="draft",
        activation_reason="skill_replay_eval_candidate",
        tool_hints=hints,
    )
|
|
|
|
|
|
def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
    """Run the preservation check for revise/merge candidates.

    Returns ``None`` for any other candidate kind, and also when the
    candidate's evidence carries no non-blank ``base_content``.
    """
    if candidate.kind not in {"revise_skill", "merge_skills"}:
        return None

    evidence = candidate.evidence
    base_content = ""
    if isinstance(evidence, dict):
        base_content = str(evidence.get("base_content") or "")
    if not base_content.strip():
        return None

    return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
|
|
|
|
|
|
async def _prepare_eval_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Assemble up to ten evaluation cases plus a summary of how they were chosen.

    Explicit cases from the candidate's evidence come first, then the
    historical ones; duplicates and synthetic cases without a validator are
    removed. Any shortfall below ten is filled with generated synthetic cases,
    and whatever is still missing after that with fallback synthetic cases.
    """
    target_count = 10
    explicit_cases = _explicit_eval_cases(candidate)
    scorable, excluded = _filter_unscorable_cases(_dedupe_cases([*explicit_cases, *historical_cases]))

    shortfall = max(0, target_count - len(scorable))
    synthetic: list[dict[str, Any]] = []
    if shortfall:
        llm_cases = await _generate_synthetic_cases(
            candidate=candidate,
            draft=draft,
            historical_cases=scorable,
            provider_bundle=provider_bundle,
            count=shortfall,
        )
        synthetic, dropped = _filter_unscorable_cases(llm_cases)
        excluded["synthetic_without_validator"] += dropped["synthetic_without_validator"]
        still_missing = shortfall - len(synthetic)
        if still_missing > 0:
            # Continue numbering after the generated cases so run ids do not collide.
            synthetic.extend(
                _fallback_synthetic_cases(
                    candidate=candidate,
                    historical_cases=scorable,
                    start_index=len(synthetic) + 1,
                    count=still_missing,
                )
            )

    prepared = [*scorable, *synthetic]
    summary: dict[str, Any] = {
        "requested_case_count": 10,
        "historical_case_count": len(historical_cases),
        "explicit_case_count": len(explicit_cases),
        "generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
        "excluded_synthetic_without_validator": excluded["synthetic_without_validator"],
    }
    return prepared[:target_count], summary
|
|
|
|
|
|
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
    """Normalise the ``eval_cases`` listed in the candidate's evidence.

    Returns an empty list when the evidence is not a dict or ``eval_cases``
    is not a list. Entries that are not dicts or have no task text are
    skipped. Missing fields are filled with defaults derived from the
    candidate and the entry's 1-based position in the raw list.
    """
    evidence = candidate.evidence
    raw_cases = evidence.get("eval_cases") if isinstance(evidence, dict) else None
    if not isinstance(raw_cases, list):
        return []

    cases: list[dict[str, Any]] = []
    for position, raw in enumerate(raw_cases, start=1):
        if not isinstance(raw, dict):
            continue
        task_text = str(raw.get("task_text") or "").strip()
        if not task_text:
            continue

        synthetic_flag = raw.get("synthetic")
        case: dict[str, Any] = {
            "run_id": str(raw.get("run_id") or f"explicit:{candidate.candidate_id}:{position:02d}"),
            "task_id": raw.get("task_id") or f"explicit-{position:02d}",
            "session_id": raw.get("session_id") or "explicit-eval",
            "task_text": task_text,
            "baseline_skill_names": list(raw.get("baseline_skill_names") or _baseline_skill_names(candidate)),
            "candidate_skill_name": raw.get("candidate_skill_name") or candidate.draft_skill_name,
            "accepted_score": _bounded_score(raw.get("accepted_score"), default=0.75),
            "synthetic": bool(synthetic_flag),
            "tier": raw.get("tier") or ("bronze" if synthetic_flag else "gold"),
        }
        validator = raw.get("validator")
        if isinstance(validator, dict):
            # Copy so later changes to the case do not touch the evidence.
            case["validator"] = dict(validator)
        cases.append(case)
    return cases
|
|
|
|
|
|
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop repeated cases, keeping the first occurrence in input order.

    A case is identified by its ``run_id``, or by its ``task_text`` when the
    run id is empty. Cases with neither are dropped.
    """
    unique: list[dict[str, Any]] = []
    seen_keys: set[str] = set()
    for case in cases:
        identity = str(case.get("run_id") or "") or str(case.get("task_text") or "")
        if identity and identity not in seen_keys:
            seen_keys.add(identity)
            unique.append(case)
    return unique
|
|
|
|
|
|
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Separate out synthetic cases that have no dict validator.

    Returns the kept cases in their original order, along with a counter
    dict whose ``synthetic_without_validator`` entry is the number removed.
    """
    kept: list[dict[str, Any]] = []
    dropped = 0
    for case in cases:
        lacks_validator = not isinstance(case.get("validator"), dict)
        if case.get("synthetic") and lacks_validator:
            dropped += 1
        else:
            kept.append(case)
    return kept, {"synthetic_without_validator": dropped}
|
|
|
|
|
|
async def _generate_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
    count: int,
) -> list[dict[str, Any]]:
    """Ask a chat provider for up to ``count`` validator-first synthetic cases.

    The auxiliary provider and runtime are preferred over the main ones.
    This is best-effort: any exception while building or sending the request,
    or a reply without a ``cases`` list, yields an empty list.
    """
    provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
    runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
    model = getattr(runtime, "model", None)

    system_prompt = (
        "You generate validator-first Beaver skill evaluation cases. "
        "Return only JSON with key cases. Each case must include task_text and validator. "
        "Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
    )
    try:
        user_prompt = _synthetic_case_prompt(
            candidate=candidate,
            draft=draft,
            historical_cases=historical_cases,
            count=count,
        )
        response = await provider.chat(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model=model,
            max_tokens=2200,
            temperature=0.4,
        )
    except Exception:
        # Deliberately swallowed: an empty result signals that generation failed.
        return []

    payload = _parse_json_payload(response.content or "")
    raw_cases = payload.get("cases") if isinstance(payload, dict) else None
    if isinstance(raw_cases, list):
        return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count)
    return []
|
|
|
|
|
|
def _synthetic_case_prompt(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    count: int,
) -> str:
    """Build the user prompt asking for ``count`` synthetic evaluation cases.

    The prompt names the candidate kind and reason, the draft skill name and
    the related skills. It embeds the historical cases as JSON, reduced to
    ``run_id``, ``task_text`` and ``validator``, and ends with the exact JSON
    shape the reply must follow.
    """
    summaries = []
    for item in historical_cases:
        summaries.append(
            {
                "run_id": item.get("run_id"),
                "task_text": item.get("task_text"),
                "validator": item.get("validator"),
            }
        )
    history_json = json.dumps(summaries, ensure_ascii=False)

    sections = [
        f"Generate {count} synthetic evaluation cases for this skill draft.\n\n",
        f"Candidate kind: {candidate.kind}\n",
        f"Candidate reason: {candidate.reason}\n",
        f"Draft skill name: {draft.skill_name}\n",
        f"Related skills: {candidate.related_skill_names}\n",
        f"Historical cases:\n{history_json}\n\n",
        "Every synthetic case must be validator-first. Return exactly:\n",
        '{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",',
        '"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}',
    ]
    return "".join(sections)
|
|
|
|
|
|
def _parse_json_payload(content: str) -> dict[str, Any]:
    """Extract a JSON object from a model reply, or ``{}`` when there is none.

    Leading and trailing backticks, and a leading ``json`` tag, are stripped
    from fenced replies. If the text as a whole is not valid JSON, the span
    from the first ``{`` to the last ``}`` is tried instead. Any decoded
    value that is not a dict yields ``{}``.
    """
    text = content.strip()
    if text.startswith("```"):
        text = text.strip("`")
        if text.startswith("json"):
            text = text[len("json"):]

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        first_brace = text.find("{")
        last_brace = text.rfind("}")
        if not (first_brace >= 0 and last_brace > first_brace):
            return {}
        try:
            decoded = json.loads(text[first_brace : last_brace + 1])
        except json.JSONDecodeError:
            return {}

    if isinstance(decoded, dict):
        return decoded
    return {}
|
|
|
|
|
|
def _synthetic_case_payloads(
    candidate: SkillLearningCandidate,
    raw_cases: list[Any],
    *,
    start_index: int,
    limit: int,
) -> list[dict[str, Any]]:
    """Convert raw generated cases into synthetic case payloads.

    Entries that are not dicts, have no task text, or have no dict validator
    are skipped. Accepted entries are numbered consecutively from
    ``start_index``. The limit is checked after each accepted entry, and
    conversion stops once ``limit`` payloads have been collected.
    """
    payloads: list[dict[str, Any]] = []
    for raw in raw_cases:
        if not isinstance(raw, dict):
            continue
        task_text = str(raw.get("task_text") or "").strip()
        validator = raw.get("validator")
        if task_text and isinstance(validator, dict):
            payload = _synthetic_case_payload(
                candidate,
                task_text,
                start_index + len(payloads),
                validator=dict(validator),
                tier=str(raw.get("tier") or "bronze"),
            )
            payloads.append(payload)
            if len(payloads) >= limit:
                break
    return payloads
|
|
|
|
|
|
def _fallback_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    historical_cases: list[dict[str, Any]],
    start_index: int,
    count: int,
) -> list[dict[str, Any]]:
    """Create ``count`` templated bronze-tier synthetic cases without a provider.

    The seed text is the task text of one historical case, selected by
    ``start_index``. When that is empty, the candidate's reason, its draft
    skill name, or a generic phrase is used. Every case requires the first
    two terms of the seed text, or ``"done"`` when the seed yields no terms.
    """
    seed_text = ""
    if historical_cases:
        seed_case = historical_cases[(start_index - 1) % len(historical_cases)]
        seed_text = str(seed_case.get("task_text") or "")
    if not seed_text:
        seed_text = candidate.reason or candidate.draft_skill_name or "the candidate skill"

    required_terms = _terms(seed_text)[:2] or ["done"]
    fallback: list[dict[str, Any]] = []
    for index in range(start_index, start_index + count):
        fallback.append(
            _synthetic_case_payload(
                candidate,
                f"Complete a realistic task related to {seed_text}. Scenario {index}.",
                index,
                validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []},
                tier="bronze",
            )
        )
    return fallback
|
|
|
|
|
|
def _synthetic_case_payload(
    candidate: SkillLearningCandidate,
    task_text: str,
    index: int,
    *,
    validator: dict[str, Any],
    tier: str,
) -> dict[str, Any]:
    """Build one synthetic case dict for ``candidate``.

    Run and task ids embed ``index`` zero-padded to two digits. The accepted
    score is fixed at 0.75 and the case is flagged ``synthetic``.
    """
    suffix = f"{index:02d}"
    payload: dict[str, Any] = {
        "run_id": f"synthetic:{candidate.candidate_id}:{suffix}",
        "task_id": f"synthetic-{suffix}",
        "session_id": "synthetic-eval",
        "task_text": task_text,
        "baseline_skill_names": _baseline_skill_names(candidate),
        "candidate_skill_name": candidate.draft_skill_name,
        "accepted_score": 0.75,
        "synthetic": True,
        "tier": tier,
        "validator": validator,
    }
    return payload
|
|
|
|
|
|
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
    """Return the skill names pinned for the baseline arm.

    A ``revise_skill`` candidate uses only its first related skill, a
    ``merge_skills`` candidate uses all of them, and any other kind uses none.
    The result is always a new list.
    """
    kind = candidate.kind
    if kind == "merge_skills":
        return list(candidate.related_skill_names)
    if kind == "revise_skill":
        first_only = candidate.related_skill_names[:1]
        return list(first_only)
    return []
|
|
|
|
|
|
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
    """Score one replay arm for ``case`` and return its ability breakdown.

    A dict validator on the case always decides the score. Without one, a
    synthetic case is reported as unscored (0.0). For a real case, the
    baseline arm takes the case's accepted score (default 0.75) and any
    other arm is scored from its output.
    """
    validator = case.get("validator")
    if isinstance(validator, dict):
        return _ability_from_validator(validator, arm)

    if case.get("synthetic"):
        return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])

    if arm_name == "baseline":
        accepted = _bounded_score(case.get("accepted_score"), default=0.75)
        return _ability_breakdown(score=accepted, source="user_feedback")

    judged = _ability_from_output(arm)["final_score"]
    return _ability_breakdown(score=judged, source="llm_judge")
|
|
|
|
|
|
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
    """Score ``arm`` against a ``final_answer_contains`` validator.

    Any other validator type falls back to output-based scoring, with a note
    naming the unsupported type. Matching is case-insensitive substring
    search on the arm's final answer:

    * outcome is the fraction of required terms found (0.0 if there are none);
    * safety is 0.0 if any forbidden term is found, else 1.0.

    The final score weights outcome (0.40 + 0.25, as it also stands in for
    artifact correctness), process validity (0.15), safety (0.10) and path
    efficiency (0.10). Each component in the returned breakdown carries the
    value that was used in that sum.
    """
    validator_type = str(validator.get("type") or "")
    if validator_type != "final_answer_contains":
        return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])

    answer = str(arm.get("final_answer") or "").lower()
    required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
    forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]
    matched = sum(1 for term in required_terms if term in answer)
    # max(1, ...) avoids dividing by zero when no required terms are given.
    outcome = matched / max(1, len(required_terms))
    unsafe = any(term in answer for term in forbidden_terms)
    safety = 0.0 if unsafe else 1.0
    process = _process_validity(arm)
    efficiency = _path_efficiency(arm, outcome)
    final_score = (
        0.40 * outcome
        + 0.25 * outcome
        + 0.15 * process
        + 0.10 * safety
        + 0.10 * efficiency
    )
    return {
        **_ability_breakdown(score=final_score, source="auto_validator"),
        "outcome_correctness": round(outcome, 4),
        "artifact_correctness": round(outcome, 4),
        # Report the real component values; the generic breakdown would otherwise
        # repeat the final score in these two fields.
        "process_validity": round(process, 4),
        "safety_no_regression": round(safety, 4),
        "path_efficiency": round(efficiency, 4),
        "validator_type": validator_type,
    }
|
|
|
|
|
|
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
    """Score an arm from its output alone.

    An arm with a non-blank final answer whose finish reason is not
    ``"error"`` scores 0.7; any other arm scores 0.3.
    """
    final_answer = str(arm.get("final_answer") or "").strip()
    if final_answer and arm.get("finish_reason") != "error":
        score = 0.7
    else:
        score = 0.3
    return _ability_breakdown(score=score, source=source, notes=notes)
|
|
|
|
|
|
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    """Return a breakdown in which every component equals the clamped ``score``.

    ``final_score`` is the clamped score rounded to four decimals, and
    ``notes`` is always a new list.
    """
    clamped = _bounded_score(score, default=0.0)
    component_names = (
        "outcome_correctness",
        "artifact_correctness",
        "process_validity",
        "safety_no_regression",
        "path_efficiency",
    )
    breakdown: dict[str, Any] = dict.fromkeys(component_names, clamped)
    breakdown["final_score"] = round(clamped, 4)
    breakdown["source"] = source
    breakdown["notes"] = list(notes or [])
    return breakdown
|
|
|
|
|
|
def _process_validity(arm: dict[str, Any]) -> float:
    """Rate the arm's process: 0.2 on error, 0.8 if it made tool calls, else 0.6."""
    if arm.get("finish_reason") == "error":
        return 0.2
    if arm.get("tool_calls"):
        return 0.8
    return 0.6
|
|
|
|
|
|
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
    """Rate how economically the arm reached its outcome.

    An outcome below 0.5 always scores 0.3. Otherwise the score depends on
    the number of dict-shaped tool calls: up to 3 scores 1.0, up to 6 scores
    0.7, and more scores 0.4.
    """
    if outcome < 0.5:
        return 0.3

    call_count = sum(1 for call in arm.get("tool_calls") or [] if isinstance(call, dict))
    for ceiling, rating in ((3, 1.0), (6, 0.7)):
        if call_count <= ceiling:
            return rating
    return 0.4
|
|
|
|
|
|
def _bounded_score(value: Any, *, default: float) -> float:
    """Convert ``value`` to a float clamped to [0, 1], or return ``default``.

    ``default`` is returned unchanged when ``value`` cannot be converted,
    overflows ``float``, or is NaN. NaN needs the explicit check because
    clamping it with ``min``/``max`` would silently produce 1.0.
    """
    import math

    try:
        score = float(value)
    except (TypeError, ValueError, OverflowError):
        return default
    if math.isnan(score):
        return default
    return max(0.0, min(1.0, score))
|
|
|
|
|
|
def _terms(text: str) -> list[str]:
    """Split ``text`` on whitespace and return the lower-cased significant words.

    Surrounding punctuation is stripped from each word. Words of three
    characters or fewer after stripping are discarded.
    """
    punctuation = ".,:;!?()[]{}"
    terms: list[str] = []
    for part in text.split():
        word = part.strip(punctuation)
        if len(word) > 3:
            terms.append(word.lower())
    return terms
|
|
|
|
|
|
def _report_from_case_reports(
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    case_reports: list[dict],
    legacy_cases: list[dict],
    preservation_report: dict | None,
    case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
    """Aggregate per-case replay results into a ``replay-v1`` eval report.

    Averages and regression counts come from ``legacy_cases``, which must be
    non-empty because the averages divide by its length. Tool-mode coverage
    and confidence come from ``case_reports``. The draft passes when the
    candidate average is at least 0.75, no regression goes uncompensated by
    a positive overall delta, and not every tool call was blocked.
    """
    case_count = len(legacy_cases)
    baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / case_count
    candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / case_count
    score_delta = candidate_avg - baseline_avg

    regression_count = sum(1 for item in legacy_cases if item["candidate_score"] < item["baseline_score"])
    improved_count = sum(1 for item in legacy_cases if item["candidate_score"] > item["baseline_score"])
    unchanged_count = case_count - regression_count - improved_count

    # Split candidate scores by whether the case was synthetic or real.
    real_scores: list[float] = []
    synthetic_scores: list[float] = []
    for item in legacy_cases:
        bucket = synthetic_scores if item.get("synthetic") else real_scores
        bucket.append(item["candidate_score"])

    execution, surrogate, blocked = _coverage(case_reports)
    confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])

    uncompensated_regression = regression_count > 0 and score_delta <= 0
    passed = candidate_avg >= 0.75 and not uncompensated_regression and blocked < 1.0

    selection_meta = dict(case_selection_meta or {})
    real_score_avg = _avg(real_scores)
    synthetic_score_avg = _avg(synthetic_scores)
    overall_score_avg = round(candidate_avg, 4)

    # Shared by the ability summary and the tool-mode summary.
    score_stats = {
        "real_case_count": len(real_scores),
        "synthetic_case_count": len(synthetic_scores),
        "real_score_avg": real_score_avg,
        "synthetic_score_avg": synthetic_score_avg,
        "overall_score_avg": overall_score_avg,
    }
    mode_shares = {"executed": execution, "surrogate": surrogate, "blocked": blocked}

    ability_summary = {"score_role": "primary", **score_stats}
    tool_execution_summary = {"score_role": "diagnostic_only", **mode_shares}
    tool_mode_summary = {
        **mode_shares,
        "score_role": "diagnostic_only",
        **score_stats,
        **selection_meta,
    }

    return SkillDraftEvalReport(
        report_id=uuid4().hex,
        skill_name=draft.skill_name,
        draft_id=draft.draft_id,
        candidate_id=candidate.candidate_id,
        passed=passed,
        baseline_score_avg=round(baseline_avg, 4),
        candidate_score_avg=round(candidate_avg, 4),
        score_delta=round(score_delta, 4),
        regression_count=regression_count,
        improved_count=improved_count,
        unchanged_count=unchanged_count,
        cases=legacy_cases,
        status="completed",
        created_at=_utc_now(),
        eval_version="replay-v1",
        mode="replay",
        execution_coverage=execution,
        surrogate_coverage=surrogate,
        blocked_coverage=blocked,
        confidence=confidence,
        case_reports=case_reports,
        tool_mode_summary=tool_mode_summary,
        ability_score_summary=ability_summary,
        tool_execution_summary=tool_execution_summary,
        case_selection_summary=selection_meta,
        real_score_avg=real_score_avg,
        synthetic_score_avg=synthetic_score_avg,
        overall_score_avg=overall_score_avg,
        preservation_report=preservation_report,
    )
|
|
|
|
|
|
def _avg(values: list[float]) -> float | None:
    """Return the mean of ``values`` rounded to four decimals, or ``None`` if empty."""
    if values:
        mean = sum(values) / len(values)
        return round(mean, 4)
    return None
|
|
|
|
|
|
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
    """Return the (executed, surrogate, blocked) shares of all recorded tool calls.

    Only dict-shaped calls whose ``mode`` is one of those three are counted.
    When nothing is counted the result is ``(1.0, 0.0, 0.0)``. Shares are
    rounded to four decimals.
    """
    tally = {"executed": 0, "surrogate": 0, "blocked": 0}
    for report in case_reports:
        calls = report.get("tool_calls") or []
        for call in calls:
            if not isinstance(call, dict):
                continue
            mode = call.get("mode")
            if mode in tally:
                tally[str(mode)] += 1

    counted = sum(tally.values())
    if counted == 0:
        return 1.0, 0.0, 0.0

    executed_share = round(tally["executed"] / counted, 4)
    surrogate_share = round(tally["surrogate"] / counted, 4)
    blocked_share = round(tally["blocked"] / counted, 4)
    return (executed_share, surrogate_share, blocked_share)
|
|
|
|
|
|
def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str:
    """Grade the evaluation as ``"high"``, ``"medium"`` or ``"low"``.

    Any blocked share forces ``"low"``. ``"high"`` needs an execution share
    of at least 0.75 and a surrogate share of at most 0.25. ``"medium"``
    needs an execution share of at least 0.25 or at least one case that
    reported ``"medium"``. Everything else is ``"low"``.
    """
    if blocked > 0.0:
        return "low"

    mostly_executed = execution >= 0.75 and surrogate <= 0.25
    if mostly_executed:
        return "high"

    partly_executed = execution >= 0.25
    if partly_executed or "medium" in case_confidences:
        return "medium"
    return "low"
|
|
|
|
|
|
def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float:
    """Return the share of both arms' tool calls that ran in ``mode``.

    The denominator is every recorded call, including entries that are not
    dicts. With no calls at all, the ``"executed"`` share is 1.0 and every
    other share is 0.0. A missing or ``None`` ``tool_calls`` entry is treated
    as an empty list.
    """
    # ``or []`` rather than a ``.get`` default: an explicit None value must not
    # reach the unpacking.
    calls = [*(baseline.get("tool_calls") or []), *(candidate.get("tool_calls") or [])]
    if not calls:
        return 1.0 if mode == "executed" else 0.0
    matching = sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
    return round(matching / len(calls), 4)
|
|
|
|
|
|
def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int:
    """Count the dict-shaped tool calls across both arms that ran in ``mode``.

    A missing or ``None`` ``tool_calls`` entry is treated as an empty list.
    """
    # ``or []`` rather than a ``.get`` default: an explicit None value must not
    # reach the unpacking.
    calls = [*(baseline.get("tool_calls") or []), *(candidate.get("tool_calls") or [])]
    return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
|
|
|
|
|
|
def _utc_now() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    from datetime import datetime, timezone

    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
|