feat(beaver): 完成Task Team功能v1实现,重构后端架构支持统一内核
新增内部Task系统,包括验证、反馈门控机制,实现自动质量验证 (通过率>=0.75)和用户反馈闭环(satisfied/revise/abandon)。 实现Agent Team v1协调器,支持sequence/parallel/dag执行策略, sub-agent复用主AgentLoop,每个run使用独立memory snapshot。 建立Skill学习pipeline,包含draft/审核/发布/回滚完整生命周期, 通过Task验证通过且用户满意才生成学习候选。 重构目录结构,移除third_party依赖,建立统一engine内核, 所有agent共享运行时基础组件。 更新ContextBuilder清理provider消息字段,增强SkillContext版本管理, 集成TaskExecutionPlanner和TaskSkillResolver实现技能解析机制。
This commit is contained in:
121
app-instance/backend/beaver/skills/learning/eval.py
Normal file
121
app-instance/backend/beaver/skills/learning/eval.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""Lightweight replay/eval reports for skill drafts."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from uuid import uuid4
|
||||
|
||||
from beaver.engine.providers import ProviderBundle
|
||||
from beaver.memory.runs import RunMemoryStore
|
||||
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
|
||||
from beaver.skills.specs import SkillDraft
|
||||
|
||||
|
||||
class SkillDraftEvaluator:
    """Score a skill draft against previously recorded runs.

    Reports are derived from stored run records and the module-level scoring
    helpers. The provider bundle is only checked for availability; nothing in
    this class calls the provider.
    """

    def __init__(self, run_store: RunMemoryStore) -> None:
        # Store that supplies run records through ``list_runs()``.
        self.run_store = run_store

    @staticmethod
    def _build_case(run_id: str, session_id, baseline: float, draft: SkillDraft) -> dict:
        """Return one per-run case entry comparing baseline and candidate scores."""
        after = _candidate_score(baseline, draft)
        return {
            "run_id": run_id,
            "session_id": session_id,
            "baseline_score": baseline,
            "candidate_score": after,
            "delta": round(after - baseline, 4),
        }

    async def evaluate(
        self,
        *,
        candidate: SkillLearningCandidate,
        draft: SkillDraft,
        provider_bundle: ProviderBundle | None,
    ) -> SkillDraftEvalReport:
        """Build an eval report for ``draft`` from the candidate's source runs.

        Args:
            candidate: Learning candidate; its ``source_run_ids`` select the
                runs to score (only the first eight ids are considered).
            draft: Draft whose content drives the candidate score.
            provider_bundle: When it is ``None`` or has no ``main_provider``,
                a skipped report is returned instead of an evaluation.

        Returns:
            A ``SkillDraftEvalReport`` with status ``"completed"``, or the
            skipped report when no provider is available.
        """
        provider_ready = (
            provider_bundle is not None and provider_bundle.main_provider is not None
        )
        if not provider_ready:
            return self._skipped(candidate, draft)

        # Index stored runs by id; a later record with the same id wins.
        known_runs = {}
        for stored in self.run_store.list_runs():
            known_runs[stored.run_id] = stored

        cases: list[dict] = []
        for source_id in candidate.source_run_ids[:8]:
            stored = known_runs.get(source_id)
            if stored is not None:
                before = _score_from_validation(stored.validation_result, stored.success)
                cases.append(self._build_case(source_id, stored.session_id, before, draft))

        if not cases:
            # No source run could be found: fall back to one synthetic case
            # with a 0.75 baseline so the averages below are well defined.
            cases.append(self._build_case("", "", 0.75, draft))

        total = len(cases)
        baseline_avg = sum(case["baseline_score"] for case in cases) / total
        candidate_avg = sum(case["candidate_score"] for case in cases) / total
        regression_count = sum(
            1 for case in cases if case["candidate_score"] < case["baseline_score"]
        )
        improved_count = sum(
            1 for case in cases if case["candidate_score"] > case["baseline_score"]
        )
        score_delta = candidate_avg - baseline_avg

        # Fail when some case regressed without the average improving, or
        # when the candidate average is below 0.75.
        net_regression = regression_count > 0 and score_delta <= 0
        passed = candidate_avg >= 0.75 and not net_regression

        return SkillDraftEvalReport(
            report_id=uuid4().hex,
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=passed,
            baseline_score_avg=round(baseline_avg, 4),
            candidate_score_avg=round(candidate_avg, 4),
            score_delta=round(score_delta, 4),
            regression_count=regression_count,
            improved_count=improved_count,
            unchanged_count=total - regression_count - improved_count,
            cases=cases,
            status="completed",
            created_at=_utc_now(),
        )

    def _skipped(self, candidate: SkillLearningCandidate, draft: SkillDraft) -> SkillDraftEvalReport:
        """Return the report used when no provider is available.

        All scores and counts are zero, ``cases`` is empty, ``passed`` is
        ``True`` and the status is ``"skipped_provider_unavailable"``.
        """
        identity = {
            "report_id": uuid4().hex,
            "skill_name": draft.skill_name,
            "draft_id": draft.draft_id,
            "candidate_id": candidate.candidate_id,
        }
        return SkillDraftEvalReport(
            **identity,
            passed=True,
            baseline_score_avg=0.0,
            candidate_score_avg=0.0,
            score_delta=0.0,
            regression_count=0,
            improved_count=0,
            unchanged_count=0,
            cases=[],
            status="skipped_provider_unavailable",
            created_at=_utc_now(),
        )
|
||||
|
||||
|
||||
def _score_from_validation(validation: dict | None, success: bool) -> float:
    """Return a baseline score in ``[0.0, 1.0]`` for a recorded run.

    Args:
        validation: Validation result of the run. When it is a dict with a
            ``"score"`` key, that value is converted to ``float`` and clamped
            to ``[0.0, 1.0]``; a falsy score (``None``, ``0``, ``""``) counts
            as ``0.0``.
        success: Success flag of the run, used when no usable score exists.

    Returns:
        The clamped score, or ``0.8`` for a successful run / ``0.4`` for an
        unsuccessful one when the score is missing, not convertible to a
        float, or NaN.
    """
    import math

    fallback = 0.8 if success else 0.4
    if not isinstance(validation, dict) or "score" not in validation:
        return fallback
    try:
        score = float(validation["score"] or 0.0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for float() is treated as unusable.
        return fallback
    if math.isnan(score):
        # NaN would slip through min()/max() clamping and come out as 1.0.
        return fallback
    return max(0.0, min(1.0, score))
|
||||
|
||||
|
||||
def _candidate_score(baseline: float, draft: SkillDraft) -> float:
    """Return the heuristic score a draft would get relative to ``baseline``.

    Args:
        baseline: Baseline score of the run being compared against.
        draft: Draft under evaluation; only ``proposed_content`` and
            ``proposal_kind`` are read.

    Returns:
        ``0.0`` when the draft has no content (missing, empty or whitespace)
        and is not a ``"retire_skill"`` proposal; ``baseline - 0.2`` (floored
        at ``0.0``) when the content contains the word "regression" in any
        letter case; otherwise ``baseline + 0.05`` raised to at least ``0.75``
        and capped at ``1.0``.
    """
    # Treat missing content (None) like empty content instead of raising.
    content = (draft.proposed_content or "").strip()
    if not content and draft.proposal_kind != "retire_skill":
        return 0.0
    # NOTE(review): the "regression" marker looks like a placeholder/test
    # heuristic rather than a real replay result — confirm intent.
    if "regression" in content.lower():
        return max(0.0, baseline - 0.2)
    return min(1.0, max(0.75, baseline + 0.05))
|
||||
|
||||
|
||||
def _utc_now() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    import datetime as _dt

    current = _dt.datetime.now(tz=_dt.timezone.utc)
    return current.isoformat()
|
||||
Reference in New Issue
Block a user