Files
beaver_project/app-instance/backend/beaver/skills/learning/eval.py
steven_li 4b0bf65ace ```
feat(engine): 优化智能体循环中的助手消息处理逻辑

- 在没有工具调用时才添加助手消息到上下文
- 确保工具调用响应正确添加到消息上下文中
- 修复了消息构建的条件逻辑

fix(cron): 改进定时任务调度的时间解析功能

- 添加正则表达式导入用于时间显示解析
- 实现从显示文本中提取毫秒间隔的功能
- 增强整数转换的安全性,避免类型错误
- 优化定时任务配置的解析逻辑

feat(outlook): 增强Outlook集成的功能和稳定性

- 将默认超时时间从10秒增加到180秒
- 为状态检查函数添加可选的验证参数
- 串行执行邮件概览获取操作而非并行
- 改进连接状态验证逻辑

feat(channel): 添加设备名称作为会话标识的选项

- 为终端WebSocket适配器添加新的配置选项
- 实现基于设备名称生成会话对等ID的功能
- 记录原始对等ID和设备名称的元数据
- 支持从设备名称创建会话对等ID

feat(skills): 完善技能学习评估系统和进度跟踪

- 在应用启动时自动调度待评估的技能草稿
- 为技能评估工作创建独立的循环工厂
- 实现异步技能评估任务的取消和清理机制
- 添加技能评估进度报告和状态跟踪功能
- 扩展会话列表API以包含更多详细信息
- 防止对不存在的会话进行操作
- 优化技能草稿提交和评估的业务逻辑

perf(skills): 提升技能评估的并发性能

- 实现并行技能案例评估以提高效率
- 添加最大并行案例数的环境变量控制
- 实现实时评估进度更新和回调机制
- 优化评估过程中的资源管理和同步

refactor(services): 创建隔离的智能体循环实例

- 添加创建独立智能体循环的工厂方法
- 确保新循环继承运行时服务配置
- 支持技能评估等需要隔离环境的场景
```
2026-06-15 14:48:16 +08:00

841 lines
32 KiB
Python

"""Lightweight replay/eval reports for skill drafts."""
from __future__ import annotations
import asyncio
import json
import os
from typing import Any, Callable
from uuid import uuid4
from beaver.engine.context import SkillContext
from beaver.engine.providers import ProviderBundle
from beaver.memory.runs import RunMemoryStore
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.skills.learning.case_selection import select_replay_cases
from beaver.skills.learning.preservation import check_preservation
from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
from beaver.skills.specs import SkillDraft
class SkillDraftEvaluator:
"""Builds a bounded eval report without writing user-visible sessions."""
def __init__(
self,
run_store: RunMemoryStore,
*,
surrogate_evaluator: SurrogateToolEvaluator | None = None,
max_parallel_cases: int | None = None,
) -> None:
self.run_store = run_store
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
configured_parallelism = max_parallel_cases
if configured_parallelism is None:
try:
configured_parallelism = int(os.getenv("BEAVER_SKILL_EVAL_MAX_PARALLEL_CASES", "3") or "3")
except ValueError:
configured_parallelism = 3
self.max_parallel_cases = max(1, configured_parallelism)
async def evaluate(
self,
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
provider_bundle: ProviderBundle | None,
replay_runner: ReplayRunner | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
if provider_bundle is None or provider_bundle.main_provider is None:
return self._skipped(candidate, draft)
runs = self.run_store.list_runs()
if replay_runner is not None:
replay_cases, case_selection_meta = await _prepare_eval_cases(
candidate=candidate,
draft=draft,
historical_cases=select_replay_cases(candidate, runs),
provider_bundle=provider_bundle,
)
else:
replay_cases = []
case_selection_meta = {}
if replay_runner is not None and replay_cases:
return await self._evaluate_replay(
candidate=candidate,
draft=draft,
replay_cases=replay_cases,
provider_bundle=provider_bundle,
replay_runner=replay_runner,
case_selection_meta=case_selection_meta,
progress_callback=progress_callback,
)
return self._evaluate_heuristic(candidate, draft, runs)
def _evaluate_heuristic(
self,
candidate: SkillLearningCandidate,
draft: SkillDraft,
runs: list,
) -> SkillDraftEvalReport:
runs_by_id = {record.run_id: record for record in runs}
cases: list[dict] = []
for run_id in candidate.source_run_ids[:10]:
record = runs_by_id.get(run_id)
if record is None:
continue
baseline = _score_from_validation(record.validation_result, record.success)
candidate_score = _candidate_score(baseline, draft)
cases.append(
{
"run_id": run_id,
"session_id": record.session_id,
"baseline_score": baseline,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline, 4),
}
)
if not cases:
cases.append(
{
"run_id": "",
"session_id": "",
"baseline_score": 0.75,
"candidate_score": _candidate_score(0.75, draft),
"delta": round(_candidate_score(0.75, draft) - 0.75, 4),
}
)
baseline_avg = sum(item["baseline_score"] for item in cases) / len(cases)
candidate_avg = sum(item["candidate_score"] for item in cases) / len(cases)
regressions = [item for item in cases if item["candidate_score"] < item["baseline_score"]]
improved = [item for item in cases if item["candidate_score"] > item["baseline_score"]]
unchanged = len(cases) - len(regressions) - len(improved)
score_delta = candidate_avg - baseline_avg
passed = not (len(regressions) > 0 and score_delta <= 0) and candidate_avg >= 0.75
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
draft_id=draft.draft_id,
candidate_id=candidate.candidate_id,
passed=passed,
baseline_score_avg=round(baseline_avg, 4),
candidate_score_avg=round(candidate_avg, 4),
score_delta=round(score_delta, 4),
regression_count=len(regressions),
improved_count=len(improved),
unchanged_count=unchanged,
cases=cases,
status="completed",
created_at=_utc_now(),
)
async def _evaluate_replay(
self,
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
replay_cases: list[dict],
provider_bundle: ProviderBundle,
replay_runner: ReplayRunner,
case_selection_meta: dict[str, Any] | None = None,
progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> SkillDraftEvalReport:
total_cases = len(replay_cases)
total_arms = total_cases * 2
completed_arms = 0
completed_cases = 0
progress_lock = asyncio.Lock()
semaphore = asyncio.Semaphore(self.max_parallel_cases)
_report_progress(
progress_callback,
completed_arms=completed_arms,
total_arms=total_arms,
completed_cases=0,
total_cases=total_cases,
)
async def mark_progress(*, case_completed: bool) -> None:
nonlocal completed_arms, completed_cases
async with progress_lock:
completed_arms += 1
if case_completed:
completed_cases += 1
_report_progress(
progress_callback,
completed_arms=completed_arms,
total_arms=total_arms,
completed_cases=completed_cases,
total_cases=total_cases,
)
async def evaluate_case(case: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
async with semaphore:
baseline = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:baseline",
arm="baseline",
task_text=str(case["task_text"]),
pinned_skill_names=list(case.get("baseline_skill_names") or []),
pinned_skill_contexts=[],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
)
)
await mark_progress(case_completed=False)
candidate_arm = await replay_runner.run_arm(
ReplayArmRequest(
case_id=f"{case['run_id']}:candidate",
arm="candidate",
task_text=str(case["task_text"]),
pinned_skill_names=[],
pinned_skill_contexts=[_draft_skill_context(draft)],
provider_bundle=provider_bundle,
model_settings={"max_tool_iterations": 4, "temperature": 0.0},
)
)
await mark_progress(case_completed=True)
surrogate = await self.surrogate_evaluator.evaluate(
task_text=str(case["task_text"]),
baseline=baseline,
candidate=candidate_arm,
)
return _build_replay_case_reports(case, baseline, candidate_arm, surrogate)
results = await asyncio.gather(*(evaluate_case(case) for case in replay_cases))
case_reports = [case_report for case_report, _ in results]
legacy_cases = [legacy_case for _, legacy_case in results]
preservation_report = _preservation_report(candidate, draft)
return _report_from_case_reports(
candidate,
draft,
case_reports,
legacy_cases,
preservation_report,
case_selection_meta or {},
)
def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
draft_id=draft.draft_id,
candidate_id=candidate.candidate_id,
passed=True,
baseline_score_avg=0.0,
candidate_score_avg=0.0,
score_delta=0.0,
regression_count=0,
improved_count=0,
unchanged_count=0,
cases=[],
status="skipped_provider_unavailable",
created_at=_utc_now(),
)
def _build_replay_case_reports(
case: dict[str, Any],
baseline: dict[str, Any],
candidate_arm: dict[str, Any],
surrogate: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any]]:
baseline_ability = _ability_score(case=case, arm=baseline, arm_name="baseline")
candidate_ability = _ability_score(case=case, arm=candidate_arm, arm_name="candidate")
baseline_score = baseline_ability["final_score"]
candidate_score = candidate_ability["final_score"]
tier = case.get("tier") or ("bronze" if case.get("synthetic") else "gold")
case_report = {
"run_id": case["run_id"],
"task_id": case.get("task_id"),
"session_id": case.get("session_id"),
"task_text": case.get("task_text"),
"synthetic": bool(case.get("synthetic")),
"tier": tier,
"validator": case.get("validator"),
"baseline": baseline,
"candidate": candidate_arm,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
"ability_score": {
"baseline": baseline_ability,
"candidate": candidate_ability,
"delta": round(candidate_score - baseline_score, 4),
},
"tool_execution_score": {
"baseline_score": surrogate["baseline_score"],
"candidate_score": surrogate["candidate_score"],
"delta": round(surrogate["candidate_score"] - surrogate["baseline_score"], 4),
"score_role": "diagnostic_only",
},
"execution_coverage": _arm_mode_coverage(baseline, candidate_arm, "executed"),
"surrogate_coverage": _arm_mode_coverage(baseline, candidate_arm, "surrogate"),
"blocked_tool_count": _arm_mode_count(baseline, candidate_arm, "blocked"),
"confidence": surrogate["confidence"],
"tool_calls": [*baseline.get("tool_calls", []), *candidate_arm.get("tool_calls", [])],
"artifacts": [*baseline.get("artifacts", []), *candidate_arm.get("artifacts", [])],
"side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])],
"validator_notes": list(surrogate.get("notes") or []),
}
return case_report, {
"run_id": case["run_id"],
"session_id": case.get("session_id") or "",
"task_text": case.get("task_text") or "",
"synthetic": bool(case.get("synthetic")),
"tier": tier,
"baseline_score": baseline_score,
"candidate_score": candidate_score,
"delta": round(candidate_score - baseline_score, 4),
}
def _report_progress(
callback: Callable[[dict[str, Any]], None] | None,
*,
completed_arms: int,
total_arms: int,
completed_cases: int,
total_cases: int,
) -> None:
if callback is None:
return
callback(
{
"phase": "replaying",
"completed_arms": completed_arms,
"total_arms": total_arms,
"completed_cases": completed_cases,
"total_cases": total_cases,
}
)
def _score_from_validation(validation: dict | None, success: bool) -> float:
if isinstance(validation, dict) and "score" in validation:
try:
return max(0.0, min(1.0, float(validation.get("score") or 0.0)))
except (TypeError, ValueError):
pass
return 0.8 if success else 0.4
def _candidate_score(baseline: float, draft: SkillDraft) -> float:
content = draft.proposed_content.strip()
if not content and draft.proposal_kind != "retire_skill":
return 0.0
if "regression" in content.lower():
return max(0.0, baseline - 0.2)
return min(1.0, max(0.75, baseline + 0.05))
def _draft_skill_context(draft: SkillDraft) -> SkillContext:
tool_hints = draft.proposed_frontmatter.get("tools")
return SkillContext(
name=f"draft:{draft.skill_name}",
content=draft.proposed_content,
version=draft.draft_id,
content_hash="draft",
activation_reason="skill_replay_eval_candidate",
tool_hints=[str(item) for item in tool_hints if str(item).strip()] if isinstance(tool_hints, list) else [],
)
def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
if candidate.kind not in {"revise_skill", "merge_skills"}:
return None
base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else ""
if not base_content.strip():
return None
return check_preservation(base_content=base_content, draft_content=draft.proposed_content)
async def _prepare_eval_cases(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
provider_bundle: ProviderBundle,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
explicit_cases = _explicit_eval_cases(candidate)
merged = _dedupe_cases([*explicit_cases, *historical_cases])
usable, excluded = _filter_unscorable_cases(merged)
missing = max(0, 10 - len(usable))
generated: list[dict[str, Any]] = []
if missing:
generated = await _generate_synthetic_cases(
candidate=candidate,
draft=draft,
historical_cases=usable,
provider_bundle=provider_bundle,
count=missing,
)
generated, generated_excluded = _filter_unscorable_cases(generated)
excluded["synthetic_without_validator"] += generated_excluded["synthetic_without_validator"]
if len(generated) < missing:
generated.extend(
_fallback_synthetic_cases(
candidate=candidate,
historical_cases=usable,
start_index=len(generated) + 1,
count=missing - len(generated),
)
)
prepared = [*usable, *generated]
return prepared[:10], {
"requested_case_count": 10,
"historical_case_count": len(historical_cases),
"explicit_case_count": len(explicit_cases),
"generated_synthetic_count": sum(1 for item in prepared if item.get("synthetic")),
"excluded_synthetic_without_validator": excluded["synthetic_without_validator"],
}
def _explicit_eval_cases(candidate: SkillLearningCandidate) -> list[dict[str, Any]]:
raw_cases = candidate.evidence.get("eval_cases") if isinstance(candidate.evidence, dict) else None
if not isinstance(raw_cases, list):
return []
result: list[dict[str, Any]] = []
for index, raw in enumerate(raw_cases, start=1):
if not isinstance(raw, dict):
continue
task_text = str(raw.get("task_text") or "").strip()
if not task_text:
continue
case = {
"run_id": str(raw.get("run_id") or f"explicit:{candidate.candidate_id}:{index:02d}"),
"task_id": raw.get("task_id") or f"explicit-{index:02d}",
"session_id": raw.get("session_id") or "explicit-eval",
"task_text": task_text,
"baseline_skill_names": list(raw.get("baseline_skill_names") or _baseline_skill_names(candidate)),
"candidate_skill_name": raw.get("candidate_skill_name") or candidate.draft_skill_name,
"accepted_score": _bounded_score(raw.get("accepted_score"), default=0.75),
"synthetic": bool(raw.get("synthetic")),
"tier": raw.get("tier") or ("bronze" if raw.get("synthetic") else "gold"),
}
if isinstance(raw.get("validator"), dict):
case["validator"] = dict(raw["validator"])
result.append(case)
return result
def _dedupe_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
seen: set[str] = set()
for case in cases:
run_id = str(case.get("run_id") or "")
task_text = str(case.get("task_text") or "")
key = run_id or task_text
if not key or key in seen:
continue
seen.add(key)
result.append(case)
return result
def _filter_unscorable_cases(cases: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, int]]:
result: list[dict[str, Any]] = []
excluded = {"synthetic_without_validator": 0}
for case in cases:
if case.get("synthetic") and not isinstance(case.get("validator"), dict):
excluded["synthetic_without_validator"] += 1
continue
result.append(case)
return result, excluded
async def _generate_synthetic_cases(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
provider_bundle: ProviderBundle,
count: int,
) -> list[dict[str, Any]]:
provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider
runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime
model = getattr(runtime, "model", None)
try:
response = await provider.chat(
messages=[
{
"role": "system",
"content": (
"You generate validator-first Beaver skill evaluation cases. "
"Return only JSON with key cases. Each case must include task_text and validator. "
"Validator type should be final_answer_contains with required_terms and optional forbidden_terms."
),
},
{
"role": "user",
"content": _synthetic_case_prompt(
candidate=candidate,
draft=draft,
historical_cases=historical_cases,
count=count,
),
},
],
model=model,
max_tokens=2200,
temperature=0.4,
)
except Exception:
return []
payload = _parse_json_payload(response.content or "")
raw_cases = payload.get("cases") if isinstance(payload, dict) else None
if not isinstance(raw_cases, list):
return []
return _synthetic_case_payloads(candidate, raw_cases, start_index=1, limit=count)
def _synthetic_case_prompt(
*,
candidate: SkillLearningCandidate,
draft: SkillDraft,
historical_cases: list[dict[str, Any]],
count: int,
) -> str:
historical = [
{
"run_id": item.get("run_id"),
"task_text": item.get("task_text"),
"validator": item.get("validator"),
}
for item in historical_cases
]
return (
f"Generate {count} synthetic evaluation cases for this skill draft.\n\n"
f"Candidate kind: {candidate.kind}\n"
f"Candidate reason: {candidate.reason}\n"
f"Draft skill name: {draft.skill_name}\n"
f"Related skills: {candidate.related_skill_names}\n"
f"Historical cases:\n{json.dumps(historical, ensure_ascii=False)}\n\n"
"Every synthetic case must be validator-first. Return exactly:\n"
'{"cases":[{"task_text":"...","validator":{"type":"final_answer_contains",'
'"required_terms":["..."],"forbidden_terms":["..."]},"tier":"bronze"}]}'
)
def _parse_json_payload(content: str) -> dict[str, Any]:
cleaned = content.strip()
if cleaned.startswith("```"):
cleaned = cleaned.strip("`")
if cleaned.startswith("json"):
cleaned = cleaned[4:]
try:
payload = json.loads(cleaned)
except json.JSONDecodeError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start < 0 or end <= start:
return {}
try:
payload = json.loads(cleaned[start : end + 1])
except json.JSONDecodeError:
return {}
return payload if isinstance(payload, dict) else {}
def _synthetic_case_payloads(
candidate: SkillLearningCandidate,
raw_cases: list[Any],
*,
start_index: int,
limit: int,
) -> list[dict[str, Any]]:
result: list[dict[str, Any]] = []
for raw in raw_cases:
if not isinstance(raw, dict):
continue
task_text = str(raw.get("task_text") or "").strip()
validator = raw.get("validator")
if not task_text or not isinstance(validator, dict):
continue
result.append(
_synthetic_case_payload(
candidate,
task_text,
start_index + len(result),
validator=dict(validator),
tier=str(raw.get("tier") or "bronze"),
)
)
if len(result) >= limit:
break
return result
def _fallback_synthetic_cases(
*,
candidate: SkillLearningCandidate,
historical_cases: list[dict[str, Any]],
start_index: int,
count: int,
) -> list[dict[str, Any]]:
seed_text = ""
if historical_cases:
seed_text = str(historical_cases[(start_index - 1) % len(historical_cases)].get("task_text") or "")
if not seed_text:
seed_text = candidate.reason or candidate.draft_skill_name or "the candidate skill"
required_terms = _terms(seed_text)[:2] or ["done"]
return [
_synthetic_case_payload(
candidate,
f"Complete a realistic task related to {seed_text}. Scenario {index}.",
index,
validator={"type": "final_answer_contains", "required_terms": required_terms, "forbidden_terms": []},
tier="bronze",
)
for index in range(start_index, start_index + count)
]
def _synthetic_case_payload(
candidate: SkillLearningCandidate,
task_text: str,
index: int,
*,
validator: dict[str, Any],
tier: str,
) -> dict[str, Any]:
return {
"run_id": f"synthetic:{candidate.candidate_id}:{index:02d}",
"task_id": f"synthetic-{index:02d}",
"session_id": "synthetic-eval",
"task_text": task_text,
"baseline_skill_names": _baseline_skill_names(candidate),
"candidate_skill_name": candidate.draft_skill_name,
"accepted_score": 0.75,
"synthetic": True,
"tier": tier,
"validator": validator,
}
def _baseline_skill_names(candidate: SkillLearningCandidate) -> list[str]:
if candidate.kind == "revise_skill":
return list(candidate.related_skill_names[:1])
if candidate.kind == "merge_skills":
return list(candidate.related_skill_names)
return []
def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) -> dict[str, Any]:
validator = case.get("validator") if isinstance(case.get("validator"), dict) else None
if validator is not None:
return _ability_from_validator(validator, arm)
if not case.get("synthetic"):
score = _bounded_score(case.get("accepted_score"), default=0.75) if arm_name == "baseline" else _ability_from_output(arm)["final_score"]
return _ability_breakdown(score=score, source="user_feedback" if arm_name == "baseline" else "llm_judge")
return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."])
def _ability_from_validator(validator: dict[str, Any], arm: dict[str, Any]) -> dict[str, Any]:
validator_type = str(validator.get("type") or "")
if validator_type != "final_answer_contains":
return _ability_from_output(arm, source="llm_judge", notes=[f"Unsupported validator type: {validator_type}"])
answer = str(arm.get("final_answer") or "").lower()
required_terms = [str(item).lower() for item in validator.get("required_terms") or [] if str(item).strip()]
forbidden_terms = [str(item).lower() for item in validator.get("forbidden_terms") or [] if str(item).strip()]
matched = sum(1 for term in required_terms if term in answer)
outcome = matched / max(1, len(required_terms))
unsafe = any(term in answer for term in forbidden_terms)
safety = 0.0 if unsafe else 1.0
final_score = (
0.40 * outcome
+ 0.25 * outcome
+ 0.15 * _process_validity(arm)
+ 0.10 * safety
+ 0.10 * _path_efficiency(arm, outcome)
)
return {
**_ability_breakdown(score=final_score, source="auto_validator"),
"outcome_correctness": round(outcome, 4),
"artifact_correctness": round(outcome, 4),
"safety_no_regression": round(safety, 4),
"validator_type": validator_type,
}
def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", notes: list[str] | None = None) -> dict[str, Any]:
answer = str(arm.get("final_answer") or "").strip()
score = 0.7 if answer and arm.get("finish_reason") != "error" else 0.3
return _ability_breakdown(score=score, source=source, notes=notes)
def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]:
bounded = _bounded_score(score, default=0.0)
return {
"outcome_correctness": bounded,
"artifact_correctness": bounded,
"process_validity": bounded,
"safety_no_regression": bounded,
"path_efficiency": bounded,
"final_score": round(bounded, 4),
"source": source,
"notes": list(notes or []),
}
def _process_validity(arm: dict[str, Any]) -> float:
if arm.get("finish_reason") == "error":
return 0.2
return 0.8 if arm.get("tool_calls") else 0.6
def _path_efficiency(arm: dict[str, Any], outcome: float) -> float:
if outcome < 0.5:
return 0.3
call_count = len([item for item in arm.get("tool_calls") or [] if isinstance(item, dict)])
if call_count <= 3:
return 1.0
if call_count <= 6:
return 0.7
return 0.4
def _bounded_score(value: Any, *, default: float) -> float:
try:
return max(0.0, min(1.0, float(value)))
except (TypeError, ValueError):
return default
def _terms(text: str) -> list[str]:
return [part.strip(".,:;!?()[]{}").lower() for part in text.split() if len(part.strip(".,:;!?()[]{}")) > 3]
def _report_from_case_reports(
candidate: SkillLearningCandidate,
draft: SkillDraft,
case_reports: list[dict],
legacy_cases: list[dict],
preservation_report: dict | None,
case_selection_meta: dict[str, Any] | None = None,
) -> SkillDraftEvalReport:
baseline_avg = sum(item["baseline_score"] for item in legacy_cases) / len(legacy_cases)
candidate_avg = sum(item["candidate_score"] for item in legacy_cases) / len(legacy_cases)
regressions = [item for item in legacy_cases if item["candidate_score"] < item["baseline_score"]]
improved = [item for item in legacy_cases if item["candidate_score"] > item["baseline_score"]]
unchanged = len(legacy_cases) - len(regressions) - len(improved)
real_cases = [item for item in legacy_cases if not item.get("synthetic")]
synthetic_cases = [item for item in legacy_cases if item.get("synthetic")]
execution, surrogate, blocked = _coverage(case_reports)
confidence = _confidence(execution, surrogate, blocked, [item.get("confidence") for item in case_reports])
score_delta = candidate_avg - baseline_avg
passed = candidate_avg >= 0.75 and not (regressions and score_delta <= 0) and blocked < 1.0
selection_meta = dict(case_selection_meta or {})
real_score_avg = _avg([item["candidate_score"] for item in real_cases])
synthetic_score_avg = _avg([item["candidate_score"] for item in synthetic_cases])
overall_score_avg = round(candidate_avg, 4)
ability_summary = {
"score_role": "primary",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
}
tool_execution_summary = {
"score_role": "diagnostic_only",
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
}
return SkillDraftEvalReport(
report_id=uuid4().hex,
skill_name=draft.skill_name,
draft_id=draft.draft_id,
candidate_id=candidate.candidate_id,
passed=passed,
baseline_score_avg=round(baseline_avg, 4),
candidate_score_avg=round(candidate_avg, 4),
score_delta=round(score_delta, 4),
regression_count=len(regressions),
improved_count=len(improved),
unchanged_count=unchanged,
cases=legacy_cases,
status="completed",
created_at=_utc_now(),
eval_version="replay-v1",
mode="replay",
execution_coverage=execution,
surrogate_coverage=surrogate,
blocked_coverage=blocked,
confidence=confidence,
case_reports=case_reports,
tool_mode_summary={
"executed": execution,
"surrogate": surrogate,
"blocked": blocked,
"score_role": "diagnostic_only",
"real_case_count": len(real_cases),
"synthetic_case_count": len(synthetic_cases),
"real_score_avg": real_score_avg,
"synthetic_score_avg": synthetic_score_avg,
"overall_score_avg": overall_score_avg,
**selection_meta,
},
ability_score_summary=ability_summary,
tool_execution_summary=tool_execution_summary,
case_selection_summary=selection_meta,
real_score_avg=real_score_avg,
synthetic_score_avg=synthetic_score_avg,
overall_score_avg=overall_score_avg,
preservation_report=preservation_report,
)
def _avg(values: list[float]) -> float | None:
if not values:
return None
return round(sum(values) / len(values), 4)
def _coverage(case_reports: list[dict]) -> tuple[float, float, float]:
counts = {"executed": 0, "surrogate": 0, "blocked": 0}
for report in case_reports:
for call in report.get("tool_calls") or []:
if isinstance(call, dict) and call.get("mode") in counts:
counts[str(call["mode"])] += 1
total = sum(counts.values())
if total == 0:
return 1.0, 0.0, 0.0
return (
round(counts["executed"] / total, 4),
round(counts["surrogate"] / total, 4),
round(counts["blocked"] / total, 4),
)
def _confidence(execution: float, surrogate: float, blocked: float, case_confidences: list[object]) -> str:
if blocked > 0.0:
return "low"
if execution >= 0.75 and surrogate <= 0.25:
return "high"
if execution >= 0.25 or "medium" in case_confidences:
return "medium"
return "low"
def _arm_mode_coverage(baseline: dict, candidate: dict, mode: str) -> float:
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
if not calls:
return 1.0 if mode == "executed" else 0.0
return round(sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode) / len(calls), 4)
def _arm_mode_count(baseline: dict, candidate: dict, mode: str) -> int:
calls = [*baseline.get("tool_calls", []), *candidate.get("tool_calls", [])]
return sum(1 for call in calls if isinstance(call, dict) and call.get("mode") == mode)
def _utc_now() -> str:
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()