Files
beaver_project/app-instance/backend/beaver/skills/learning/eval.py
steven_li 8aeb97a5fc feat(app): 移除内置agents并添加CORS支持和技能上传优化
移除了agents/registry.json中的所有内置agents配置,将agents数组清空。
为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。
重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。
新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。
更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。
修改了意图路由技能的说明,改进任务状态管理逻辑。
2026-06-12 13:25:20 +08:00

776 lines
30 KiB
Python

"""Lightweight replay/eval reports for skill drafts."""
from __future__ import annotations
import json
from typing import Any
from uuid import uuid4
from beaver.engine.context import SkillContext
from beaver.engine.providers import ProviderBundle
from beaver.memory.runs import RunMemoryStore
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.skills.learning.case_selection import select_replay_cases
from beaver.skills.learning.preservation import check_preservation
from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
from beaver.skills.specs import SkillDraft
class SkillDraftEvaluator:
    """Builds a bounded eval report without writing user-visible sessions.

    ``evaluate`` returns one of three reports: a skipped report when no main
    provider is available, a replay report when a replay runner is supplied
    and eval cases could be prepared, and a heuristic report otherwise.
    """

    # Both replay arms are requested with the same settings; copied per request
    # so a request can never mutate the shared template.
    _REPLAY_MODEL_SETTINGS = {"max_tool_iterations": 4, "temperature": 0.0}
    # Baseline used by the heuristic path when no source run can be looked up.
    _FALLBACK_BASELINE = 0.75

    def __init__(
        self,
        run_store: RunMemoryStore,
        *,
        surrogate_evaluator: SurrogateToolEvaluator | None = None,
    ) -> None:
        """Keep the run store and the surrogate evaluator.

        Args:
            run_store: Store whose ``list_runs()`` supplies recorded runs.
            surrogate_evaluator: Evaluator used for the diagnostic tool
                execution score; a default ``SurrogateToolEvaluator`` is
                created when omitted.
        """
        self.run_store = run_store
        self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()

    async def evaluate(
        self,
        *,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        provider_bundle: ProviderBundle | None,
        replay_runner: ReplayRunner | None = None,
    ) -> SkillDraftEvalReport:
        """Evaluate ``draft`` for ``candidate`` and return an eval report.

        Args:
            candidate: Learning candidate the draft was produced for.
            draft: Skill draft under evaluation.
            provider_bundle: Providers to use. When it is ``None`` or has no
                ``main_provider`` the evaluation is skipped.
            replay_runner: When given, eval cases are prepared and replayed;
                without it (or without any prepared case) the heuristic
                evaluation is used.

        Returns:
            The skipped, replay, or heuristic report.
        """
        if provider_bundle is None or provider_bundle.main_provider is None:
            return self._skipped(candidate, draft)
        runs = self.run_store.list_runs()
        if replay_runner is not None:
            replay_cases, case_selection_meta = await _prepare_eval_cases(
                candidate=candidate,
                draft=draft,
                historical_cases=select_replay_cases(candidate, runs),
                provider_bundle=provider_bundle,
            )
            if replay_cases:
                return await self._evaluate_replay(
                    candidate=candidate,
                    draft=draft,
                    replay_cases=replay_cases,
                    provider_bundle=provider_bundle,
                    replay_runner=replay_runner,
                    case_selection_meta=case_selection_meta,
                )
        return self._evaluate_heuristic(candidate, draft, runs)

    def _evaluate_heuristic(
        self,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        runs: list,
    ) -> SkillDraftEvalReport:
        """Score the draft against recorded runs without replaying anything.

        At most the first ten ``candidate.source_run_ids`` are looked up in
        ``runs``; ids without a matching record are ignored. When no run
        matches, a single placeholder case with a 0.75 baseline is scored so
        the averages are always defined.
        """
        runs_by_id = {record.run_id: record for record in runs}
        cases: list[dict] = []
        for run_id in candidate.source_run_ids[:10]:
            record = runs_by_id.get(run_id)
            if record is None:
                continue
            baseline = _score_from_validation(record.validation_result, record.success)
            cases.append(self._heuristic_case(run_id, record.session_id, baseline, draft))
        if not cases:
            cases.append(self._heuristic_case("", "", self._FALLBACK_BASELINE, draft))

        baseline_avg = sum(item["baseline_score"] for item in cases) / len(cases)
        candidate_avg = sum(item["candidate_score"] for item in cases) / len(cases)
        regressions = [item for item in cases if item["candidate_score"] < item["baseline_score"]]
        improved = [item for item in cases if item["candidate_score"] > item["baseline_score"]]
        unchanged = len(cases) - len(regressions) - len(improved)
        score_delta = candidate_avg - baseline_avg
        # A regression is tolerated only when the average still improved.
        regressed_without_gain = bool(regressions) and score_delta <= 0
        passed = not regressed_without_gain and candidate_avg >= 0.75
        return SkillDraftEvalReport(
            report_id=uuid4().hex,
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=passed,
            baseline_score_avg=round(baseline_avg, 4),
            candidate_score_avg=round(candidate_avg, 4),
            score_delta=round(score_delta, 4),
            regression_count=len(regressions),
            improved_count=len(improved),
            unchanged_count=unchanged,
            cases=cases,
            status="completed",
            created_at=_utc_now(),
        )

    @staticmethod
    def _heuristic_case(run_id: str, session_id: str, baseline: float, draft: SkillDraft) -> dict:
        """Build one heuristic case entry, scoring the draft once against ``baseline``."""
        candidate_score = _candidate_score(baseline, draft)
        return {
            "run_id": run_id,
            "session_id": session_id,
            "baseline_score": baseline,
            "candidate_score": candidate_score,
            "delta": round(candidate_score - baseline, 4),
        }

    async def _evaluate_replay(
        self,
        *,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        replay_cases: list[dict],
        provider_bundle: ProviderBundle,
        replay_runner: ReplayRunner,
        case_selection_meta: dict[str, Any] | None = None,
    ) -> SkillDraftEvalReport:
        """Replay every case with a baseline arm and a candidate arm.

        For each case the baseline arm pins the case's baseline skill names and
        the candidate arm pins the draft as a skill context. Ability scores
        drive the per-case scores; the surrogate evaluator's scores are
        recorded under ``tool_execution_score`` as ``diagnostic_only``.

        Args:
            candidate: Learning candidate the draft was produced for.
            draft: Skill draft under evaluation.
            replay_cases: Cases to replay; each needs ``run_id`` and ``task_text``.
            provider_bundle: Providers handed to each replay request.
            replay_runner: Runner whose ``run_arm`` executes one arm.
            case_selection_meta: Optional case-selection summary forwarded to
                the report.

        Returns:
            The report assembled by ``_report_from_case_reports``.
        """
        case_reports: list[dict] = []
        legacy_cases: list[dict] = []
        for case in replay_cases:
            run_id = case["run_id"]
            task_text = str(case["task_text"])
            is_synthetic = bool(case.get("synthetic"))
            tier = case.get("tier") or ("bronze" if is_synthetic else "gold")

            baseline = await replay_runner.run_arm(
                ReplayArmRequest(
                    case_id=f"{run_id}:baseline",
                    arm="baseline",
                    task_text=task_text,
                    pinned_skill_names=list(case.get("baseline_skill_names") or []),
                    pinned_skill_contexts=[],
                    provider_bundle=provider_bundle,
                    model_settings=dict(self._REPLAY_MODEL_SETTINGS),
                )
            )
            candidate_arm = await replay_runner.run_arm(
                ReplayArmRequest(
                    case_id=f"{run_id}:candidate",
                    arm="candidate",
                    task_text=task_text,
                    pinned_skill_names=[],
                    pinned_skill_contexts=[_draft_skill_context(draft)],
                    provider_bundle=provider_bundle,
                    model_settings=dict(self._REPLAY_MODEL_SETTINGS),
                )
            )
            surrogate = await self.surrogate_evaluator.evaluate(
                task_text=task_text,
                baseline=baseline,
                candidate=candidate_arm,
            )
            baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline")
            candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate")
            baseline_score = baseline_ability["final_score"]
            candidate_score = candidate_ability["final_score"]
            delta = round(candidate_score - baseline_score, 4)
            tool_execution_score = {
                "baseline_score": surrogate["baseline_score"],
                "candidate_score": surrogate["candidate_score"],
                "delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
                "score_role": "diagnostic_only",
            }
            case_reports.append(
                {
                    "run_id": run_id,
                    "task_id": case.get("task_id"),
                    "session_id": case.get("session_id"),
                    "task_text": case.get("task_text"),
                    "synthetic": is_synthetic,
                    "tier": tier,
                    "validator": case.get("validator"),
                    "baseline": baseline,
                    "candidate": candidate_arm,
                    "baseline_score": baseline_score,
                    "candidate_score": candidate_score,
                    "delta": delta,
                    "ability_score": {
                        "baseline": baseline_ability,
                        "candidate": candidate_ability,
                        "delta": delta,
                    },
                    "tool_execution_score": tool_execution_score,
                    "execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
                    "surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
                    "blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
                    "confidence": surrogate["confidence"],
                    "tool_calls": self._combined(baseline, candidate_arm, "tool_calls"),
                    "artifacts": self._combined(baseline, candidate_arm, "artifacts"),
                    "side_effects": self._combined(baseline, candidate_arm, "side_effects"),
                    "validator_notes": list(surrogate.get("notes") or []),
                }
            )
            legacy_cases.append(
                {
                    "run_id": run_id,
                    "session_id": case.get("session_id") or "",
                    "task_text": case.get("task_text") or "",
                    "synthetic": is_synthetic,
                    "tier": tier,
                    "baseline_score": baseline_score,
                    "candidate_score": candidate_score,
                    "delta": delta,
                }
            )
        preservation_report = _preservation_report(candidate, draft)
        return _report_from_case_reports(
            candidate,
            draft,
            case_reports,
            legacy_cases,
            preservation_report,
            case_selection_meta or {},
        )

    @staticmethod
    def _combined(baseline: dict, candidate_arm: dict, key: str) -> list:
        """Concatenate both arms' ``key`` lists, treating a missing or ``None`` value as empty."""
        return [*(baseline.get(key) or []), *(candidate_arm.get(key) or [])]

    def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
        """Return the report used when no provider is available.

        The report is marked as passed, carries zero scores and no cases, and
        has status ``skipped_provider_unavailable``.
        """
        return SkillDraftEvalReport(
            report_id=uuid4().hex,
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=True,
            baseline_score_avg=0.0,
            candidate_score_avg=0.0,
            score_delta=0.0,
            regression_count=0,
            improved_count=0,
            unchanged_count=0,
            cases=[],
            status="skipped_provider_unavailable",
            created_at=_utc_now(),
        )
def _score_from_validation(validation: dict | None, success: bool) -> float:
if isinstance(validation, dict) and "score" in validation:
try:
return max(0.0, min(1.0, float(validation.get("score") or 0.0)))
except (TypeError, ValueError):
pass
return 0.8 if success else 0.4
def _candidate_score(baseline: float, draft: SkillDraft) -> float:
content = draft.proposed_content.strip()
if not content and draft.proposal_kind != "retire_skill":
return 0.0
if "regression" in content.lower():
return max(0.0, baseline - 0.2)
return min(1.0, max(0.75, baseline + 0.05))
def _draft_skill_context(draft: SkillDraft) -> SkillContext:
    """Wrap ``draft`` in a ``SkillContext`` for the candidate replay arm.

    Tool hints come from the ``tools`` entry of the draft frontmatter when it
    is a list; entries whose string form is blank are dropped.
    """
    declared_tools = draft.proposed_frontmatter.get("tools")
    hints: list[str] = []
    if isinstance(declared_tools, list):
        for entry in declared_tools:
            text = str(entry)
            if text.strip():
                hints.append(text)
    return SkillContext(
        name=f"draft:{draft.skill_name}",
        content=draft.proposed_content,
        version=draft.draft_id,
        content_hash="draft",
        activation_reason="skill_replay_eval_candidate",
        tool_hints=hints,
    )
def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
if candidate.kind not in {"revise_skill", "merge_skills"}:
return None
base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else ""
if not base_content.strip():
return None
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
async def _prepare_eval_cases(
    *,
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    historical_cases: list[dict[str, Any]],
    provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Assemble up to ten eval cases plus a summary of how they were chosen.

    Explicit cases from the candidate evidence come first, then historical
    cases; duplicates and synthetic cases without a validator are removed.
    Any shortfall is filled with provider-generated synthetic cases and, if
    that is still not enough, with templated fallback cases.

    Returns:
        ``(cases, summary)`` where ``cases`` holds at most ten entries.
    """
    target_count = 10
    explicit_cases = _explicit_eval_cases(candidate)
    unique_cases = _dedupe_cases([*explicit_cases, *historical_cases])
    scorable, dropped = _filter_unscorable_cases(unique_cases)

    shortfall = max(0, target_count - len(scorable))
    synthesized: list[dict[str, Any]] = []
    if shortfall:
        provider_cases = await _generate_synthetic_cases(
            candidate=candidate,
            draft=draft,
            historical_cases=scorable,
            provider_bundle=provider_bundle,
            count=shortfall,
        )
        synthesized, dropped_generated = _filter_unscorable_cases(provider_cases)
        dropped["synthetic_without_validator"] += dropped_generated["synthetic_without_validator"]
        still_missing = shortfall - len(synthesized)
        if still_missing > 0:
            # Fallback numbering continues after the generated cases.
            synthesized.extend(
                _fallback_synthetic_cases(
                    candidate=candidate,
                    historical_cases=scorable,
                    start_index=len(synthesized) + 1,
                    count=still_missing,
                )
            )

    prepared = scorable + synthesized
    summary = {
        "requested_case_count": 10,
        "historical_case_count": len(historical_cases),
        "explicit_case_count": len(explicit_cases),
        "generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
        "excluded_synthetic_without_validator": dropped["synthetic_without_validator"],
    }
    return prepared[:target_count], summary
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
raw_cases = candidate.evidence.get("eval_cases") if isinstance(candidate.evidence, dict) else None
if not isinstance(raw_cases, list):
return []
result: list[dict[str, Any]] = []
for index, raw in enumerate(raw_cases, start=1):
if not isinstance(raw, dict):
continue
task_text = str(raw.get("task_text") or "").strip()
if not task_text:
continue
case = {
"run_id": str(raw.get("run_id") or f"explicit:{candidate.candidate_id}:{index:02d}"),
"task_id": raw.get("task_id") or f"explicit-{index:02d}",
"session_id": raw.get("session_id") or "explicit-eval",
"task_text": task_text,
"baseline_skill_names": list(raw.get("baseline_skill_names") or _baseline_skill_names(candidate)),
"candidate_skill_name": raw.get("candidate_skill_name") or candidate.draft_skill_name,
"accepted_score": _bounded_score(raw.get("accepted_score"), default=0.75),
"synthetic": bool(raw.get("synthetic")),
"tier": raw.get("tier") or ("bronze" if raw.get("synthetic") else "gold"),
}
if isinstance(raw.get("validator"), dict):
case["validator"] = dict(raw["validator"])
result.append(case)
return result
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
seen: set[str] = set()
for case in cases:
run_id = str(case.get("run_id") or "")
task_text = str(case.get("task_text") or "")
key = run_id or task_text
if not key or key in seen:
continue
seen.add(key)
result.append(case)
return result
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
result: list[dict[str, Any]] = []
excluded = {"synthetic_without_validator": 0}
for case in cases:
if case.get("synthetic") and not isinstance(case.get("validator"), dict):
excluded["synthetic_without_validator"] += 1
continue
result.append(case)
return result, excluded
async def _generate_synthetic_cases(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
provider_bundle: ProviderBundle,
count: int,
) -> list[dict[str, Any]]:
provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
model = getattr(runtime, "model", None)
try:
response = await provider.chat(
messages=[
{
"role": "system",
"content": (
"You generate validator-first Beaver skill evaluation cases. "
"Return only JSON with key cases. Each case must include task_text and validator. "
"Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
),
},
{
"role": "user",
"content": _synthetic_case_prompt(
candidate=candidate,
draft=draft,
historical_cases=historical_cases,
count=count,
),
},
],
model=model,
max_tokens=2200,
temperature=0.4,
)
except Exception:
return []
payload = _parse_json_payload(response.content or "")
raw_cases = payload.get("cases") if isinstance(payload, dict) else None
if not isinstance(raw_cases, list):
return []
return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count)
def _synthetic_case_prompt(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
count: int,
) -> str:
historical = [
{
"run_id": item.get("run_id"),
"task_text": item.get("task_text"),
"validator": item.get("validator"),
}
for item in historical_cases
]
return (
f"Generate {count} synthetic evaluation cases for this skill draft.\n\n"
f"Candidate kind: {candidate.kind}\n"
f"Candidate reason: {candidate.reason}\n"
f"Draft skill name: {draft.skill_name}\n"
f"Related skills: {candidate.related_skill_names}\n"
f"Historical cases:\n{json.dumps(historical, ensure_ascii=False)}\n\n"
"Every synthetic case must be validator-first. Return exactly:\n"
'{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",'
'"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}'
)
def _parse_json_payload(content: str) -> dict[str, Any]:
cleaned = content.strip()
if cleaned.startswith("```"):
cleaned = cleaned.strip("`")
if cleaned.startswith("json"):
cleaned = cleaned[4:]
try:
payload = json.loads(cleaned)
except json.JSONDecodeError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start < 0 or end <= start:
return {}
try:
payload = json.loads(cleaned[start : end + 1])
except json.JSONDecodeError:
return {}
return payload if isinstance(payload, dict) else {}
def _synthetic_case_payloads(
candidate: SkillLearningCandidate,
raw_cases: list[Any],
*,
start_index: int,
limit: int,
) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
for raw in raw_cases:
if not isinstance(raw, dict):
continue
task_text = str(raw.get("task_text") or "").strip()
validator = raw.get("validator")
if not task_text or not isinstance(validator, dict):
continue
result.append(
_synthetic_case_payload(
candidate,
task_text,
start_index + len(result),
validator=dict(validator),
tier=str(raw.get("tier") or "bronze"),
)
)
if len(result) >= limit:
break
return result
def _fallback_synthetic_cases(
    *,
    candidate: SkillLearningCandidate,
    historical_cases: list[dict[str, Any]],
    start_index: int,
    count: int,
) -> list[dict[str, Any]]:
    """Build ``count`` templated synthetic cases without calling a provider.

    The seed text is the task text of one historical case (picked by
    ``start_index``), else the candidate's reason or draft skill name. Every
    case requires the first two terms of the seed (or ``"done"``) in the final
    answer and is numbered from ``start_index``.
    """
    seed = ""
    if historical_cases:
        anchor = historical_cases[(start_index - 1) % len(historical_cases)]
        seed = str(anchor.get("task_text") or "")
    if not seed:
        seed = candidate.reason or candidate.draft_skill_name or "the candidate skill"
    required_terms = _terms(seed)[:2] or ["done"]

    fallback_cases: list[dict[str, Any]] = []
    for index in range(start_index, start_index + count):
        fallback_cases.append(
            _synthetic_case_payload(
                candidate,
                f"Complete a realistic task related to {seed}. Scenario {index}.",
                index,
                validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []},
                tier="bronze",
            )
        )
    return fallback_cases
def _synthetic_case_payload(
    candidate: SkillLearningCandidate,
    task_text: str,
    index: int,
    *,
    validator: dict[str, Any],
    tier: str,
) -> dict[str, Any]:
    """Build one synthetic eval case for ``candidate``.

    Run and task ids embed the zero-padded ``index``; the accepted score is
    fixed at 0.75 and the case is flagged as synthetic.
    """
    run_id = f"synthetic:{candidate.candidate_id}:{index:02d}"
    task_id = f"synthetic-{index:02d}"
    payload: dict[str, Any] = {
        "run_id": run_id,
        "task_id": task_id,
        "session_id": "synthetic-eval",
        "task_text": task_text,
    }
    payload["baseline_skill_names"] = _baseline_skill_names(candidate)
    payload["candidate_skill_name"] = candidate.draft_skill_name
    payload["accepted_score"] = 0.75
    payload["synthetic"] = True
    payload["tier"] = tier
    payload["validator"] = validator
    return payload
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
if candidate.kind == "revise_skill":
return list(candidate.related_skill_names[:1])
if candidate.kind == "merge_skills":
return list(candidate.related_skill_names)
return []
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
    """Score one replay arm for ``case`` and return its ability breakdown.

    A validator dict on the case takes precedence. Without one, synthetic
    cases are unscored (0.0); real cases use the case's accepted score for the
    baseline arm and an output-based score for any other arm.
    """
    validator = case.get("validator")
    if isinstance(validator, dict):
        return _ability_from_validator(validator, arm)
    if case.get("synthetic"):
        return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])
    if arm_name == "baseline":
        accepted = _bounded_score(case.get("accepted_score"), default=0.75)
        return _ability_breakdown(score=accepted, source="user_feedback")
    observed = _ability_from_output(arm)["final_score"]
    return _ability_breakdown(score=observed, source="llm_judge")
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
    """Score an arm's final answer against a ``final_answer_contains`` validator.

    Matching is case-insensitive substring search. Outcome is the fraction of
    required terms found (0.0 when none are listed); safety is 0.0 if any
    forbidden term appears, else 1.0. The final score weights outcome 0.40,
    artifact correctness (same value as outcome) 0.25, process validity 0.15,
    safety 0.10 and path efficiency 0.10.

    Any other validator type falls back to output-based scoring with a note
    naming the unsupported type.

    Returns:
        The ability breakdown with every component reported individually,
        plus ``validator_type``.
    """
    validator_type = str(validator.get("type") or "")
    if validator_type != "final_answer_contains":
        return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])
    answer = str(arm.get("final_answer") or "").lower()
    required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
    forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]
    matched = sum(1 for term in required_terms if term in answer)
    outcome = matched / max(1, len(required_terms))
    unsafe = any(term in answer for term in forbidden_terms)
    safety = 0.0 if unsafe else 1.0
    process = _process_validity(arm)
    efficiency = _path_efficiency(arm, outcome)
    final_score = (
        0.40 * outcome
        + 0.25 * outcome
        + 0.15 * process
        + 0.10 * safety
        + 0.10 * efficiency
    )
    return {
        **_ability_breakdown(score=final_score, source="auto_validator"),
        "outcome_correctness": round(outcome, 4),
        "artifact_correctness": round(outcome, 4),
        # Report the components that fed the weighted sum; the generic
        # breakdown would otherwise repeat the final score in these slots.
        "process_validity": round(process, 4),
        "safety_no_regression": round(safety, 4),
        "path_efficiency": round(efficiency, 4),
        "validator_type": validator_type,
    }
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
    """Score an arm from its output alone.

    An arm with a non-blank final answer that did not finish with ``error``
    scores 0.7; anything else scores 0.3.
    """
    has_answer = bool(str(arm.get("final_answer") or "").strip())
    usable = has_answer and arm.get("finish_reason") != "error"
    return _ability_breakdown(score=0.7 if usable else 0.3, source=source, notes=notes)
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
    """Build an ability breakdown in which every component equals ``score``.

    The score is bounded to ``[0.0, 1.0]`` (0.0 when it cannot be parsed);
    ``final_score`` is additionally rounded to four decimals and ``notes`` is
    always a fresh list.
    """
    bounded = _bounded_score(score, default=0.0)
    component_keys = (
        "outcome_correctness",
        "artifact_correctness",
        "process_validity",
        "safety_no_regression",
        "path_efficiency",
    )
    breakdown: dict[str, Any] = dict.fromkeys(component_keys, bounded)
    breakdown["final_score"] = round(bounded, 4)
    breakdown["source"] = source
    breakdown["notes"] = list(notes or [])
    return breakdown
def _process_validity(arm: dict[str, Any]) -> float:
if arm.get("finish_reason") == "error":
return 0.2
return 0.8 if arm.get("tool_calls") else 0.6
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
if outcome < 0.5:
return 0.3
call_count = len([item for item in arm.get("tool_calls") or [] if isinstance(item, dict)])
if call_count <= 3:
return 1.0
if call_count <= 6:
return 0.7
return 0.4
def _bounded_score(value: Any, *, default: float) -> float:
try:
return max(0.0, min(1.0, float(value)))
except (TypeError, ValueError):
return default
def _terms(text: str) -> list[str]:
return [part.strip(".,:;!?()[]{}").lower() for part in text.split() if len(part.strip(".,:;!?()[]{}")) > 3]
def _report_from_case_reports(
    candidate: SkillLearningCandidate,
    draft: SkillDraft,
    case_reports: list[dict],
    legacy_cases: list[dict],
    preservation_report: dict | None,
    case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
    """Aggregate replayed cases into a ``replay-v1`` eval report.

    Averages and regression counts come from ``legacy_cases``; tool-mode
    coverage and confidence come from ``case_reports``. The report passes when
    the average candidate score is at least 0.75, no regression goes
    uncompensated by a positive score delta, and not every tool call was
    blocked. ``legacy_cases`` must not be empty.
    """
    total = len(legacy_cases)
    baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / total
    candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / total

    regressions: list[dict] = []
    improved: list[dict] = []
    real_cases: list[dict] = []
    synthetic_cases: list[dict] = []
    for item in legacy_cases:
        if item["candidate_score"] < item["baseline_score"]:
            regressions.append(item)
        elif item["candidate_score"] > item["baseline_score"]:
            improved.append(item)
        (synthetic_cases if item.get("synthetic") else real_cases).append(item)
    unchanged = total - len(regressions) - len(improved)

    execution, surrogate, blocked = _coverage(case_reports)
    case_confidences = [item.get("confidence") for item in case_reports]
    confidence = _confidence(execution, surrogate, blocked, case_confidences)

    score_delta = candidate_avg - baseline_avg
    regressed_without_gain = bool(regressions) and score_delta <= 0
    passed = candidate_avg >= 0.75 and not regressed_without_gain and blocked < 1.0

    selection_meta = dict(case_selection_meta or {})
    real_score_avg = _avg([item["candidate_score"] for item in real_cases])
    synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
    overall_score_avg = round(candidate_avg, 4)

    # Score fields shared by the ability summary and the tool-mode summary.
    score_fields = {
        "real_case_count": len(real_cases),
        "synthetic_case_count": len(synthetic_cases),
        "real_score_avg": real_score_avg,
        "synthetic_score_avg": synthetic_score_avg,
        "overall_score_avg": overall_score_avg,
    }
    mode_fields = {
        "executed": execution,
        "surrogate": surrogate,
        "blocked": blocked,
    }
    ability_summary = {"score_role": "primary", **score_fields}
    tool_execution_summary = {"score_role": "diagnostic_only", **mode_fields}
    tool_mode_summary = {
        **mode_fields,
        "score_role": "diagnostic_only",
        **score_fields,
        **selection_meta,
    }

    return SkillDraftEvalReport(
        report_id=uuid4().hex,
        skill_name=draft.skill_name,
        draft_id=draft.draft_id,
        candidate_id=candidate.candidate_id,
        passed=passed,
        baseline_score_avg=round(baseline_avg, 4),
        candidate_score_avg=round(candidate_avg, 4),
        score_delta=round(score_delta, 4),
        regression_count=len(regressions),
        improved_count=len(improved),
        unchanged_count=unchanged,
        cases=legacy_cases,
        status="completed",
        created_at=_utc_now(),
        eval_version="replay-v1",
        mode="replay",
        execution_coverage=execution,
        surrogate_coverage=surrogate,
        blocked_coverage=blocked,
        confidence=confidence,
        case_reports=case_reports,
        tool_mode_summary=tool_mode_summary,
        ability_score_summary=ability_summary,
        tool_execution_summary=tool_execution_summary,
        case_selection_summary=selection_meta,
        real_score_avg=real_score_avg,
        synthetic_score_avg=synthetic_score_avg,
        overall_score_avg=overall_score_avg,
        preservation_report=preservation_report,
    )
def _avg(values: list[float]) -> float | None:
if not values:
return None
return round(sum(values) / len(values), 4)
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
for report in case_reports:
for call in report.get("tool_calls") or []:
if isinstance(call, dict) and call.get("mode") in counts:
counts[str(call["mode"])] += 1
total = sum(counts.values())
if total == 0:
return 1.0, 0.0, 0.0
return (
round(counts["executed"] / total, 4),
round(counts["surrogate"] / total, 4),
round(counts["blocked"] / total, 4),
)
def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str:
if blocked > 0.0:
return "low"
if execution >= 0.75 and surrogate <= 0.25:
return "high"
if execution >= 0.25 or "medium" in case_confidences:
return "medium"
return "low"
def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float:
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
if not calls:
return 1.0 if mode == "executed" else 0.0
return round(sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) / len(calls), 4)
def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int:
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
def _utc_now() -> str:
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()