feat(app): 移除内置agents并添加CORS支持和技能上传优化

移除了agents/registry.json中的所有内置agents配置,将agents数组清空。
为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。
重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。
新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。
更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。
修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
2026-06-12 13:25:20 +08:00
parent fc9fd93c36
commit 8aeb97a5fc
76 changed files with 3382 additions and 553 deletions

View File

@ -19,8 +19,22 @@ from beaver.skills.specs import SkillSpecStore
class StubProvider(LLMProvider):
    """Test double for ``LLMProvider`` that replies with fixed content.

    Every ``chat`` invocation is appended to ``calls`` so a test can inspect
    what was sent. The default content ``"ok"`` matches the reply of the
    earlier one-line stub, whose shadowed ``chat`` definition is removed here
    (a second ``def chat`` in the same class body silently replaced it).
    """

    def __init__(self, content: str = "ok") -> None:
        """Store the canned reply and start with an empty call log.

        Args:
            content: Text used as the ``content`` of every response.
        """
        super().__init__()
        self.content = content
        # One dict per chat() call, in invocation order.
        self.calls: list[dict] = []

    async def chat(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
        model: str | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
        thinking_enabled: bool | None = None,
    ) -> LLMResponse:
        """Record the request and return the canned response.

        Only ``messages``, ``model``, ``max_tokens`` and ``temperature`` are
        recorded; ``tools`` and ``thinking_enabled`` are accepted and ignored.

        Returns:
            An ``LLMResponse`` whose content is ``self.content``.
        """
        self.calls.append(
            {
                "messages": messages,
                "model": model,
                "max_tokens": max_tokens,
                "temperature": temperature,
            }
        )
        return LLMResponse(content=self.content)

    def get_default_model(self) -> str:
        """Return the fixed model name ``"stub"``."""
        return "stub"
@ -92,7 +106,6 @@ def test_eval_pass_allows_publish_after_safety_and_review(tmp_path: Path) -> Non
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
published = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert report.passed is True
@ -114,7 +127,6 @@ def test_eval_regression_blocks_publish(tmp_path: Path) -> None:
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is False
assert pipeline.get_candidate("candidate-1").status == "eval_failed"
@ -160,7 +172,14 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
class FakeReplayRunner:
def __init__(self, *, baseline_answer: str = "done", candidate_answer: str = "done") -> None:
self.baseline_answer = baseline_answer
self.candidate_answer = candidate_answer
self.requests = []
async def run_arm(self, request):
self.requests.append(request)
final_answer = self.candidate_answer if request.arm == "candidate" else self.baseline_answer
return {
"case_id": request.case_id,
"arm": request.arm,
@ -168,7 +187,7 @@ class FakeReplayRunner:
"run_id": f"{request.arm}-run",
"task_text": request.task_text,
"finish_reason": "stop",
"final_answer": "done",
"final_answer": final_answer,
"tool_calls": [
{
"tool_name": "write_file",
@ -213,3 +232,102 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
assert 0.0 <= report.execution_coverage <= 1.0
assert 0.0 <= report.surrogate_coverage <= 1.0
assert report.confidence in {"low", "medium", "high"}
assert "ability_score" in report.case_reports[0]
assert "tool_execution_score" in report.case_reports[0]
assert report.ability_score_summary["score_role"] == "primary"
assert report.tool_execution_summary["score_role"] == "diagnostic_only"
def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
    """The per-case score must follow the answer validator, not tool results.

    The baseline and candidate arms are given different final answers. The
    assertions require identical tool-execution scores for both arms while the
    candidate's case score is strictly higher than the baseline's.
    """
    pipeline = _pipeline(tmp_path)

    # Evidence: a single eval case carrying a final-answer validator.
    answer_validator = {
        "type": "final_answer_contains",
        "required_terms": ["ship"],
        "forbidden_terms": ["do not ship"],
    }
    validator_case = {
        "run_id": "validator-case",
        "task_id": "validator-case",
        "session_id": "eval",
        "task_text": "Write the release verdict.",
        "validator": answer_validator,
        "accepted_score": 0.5,
    }
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [validator_case]},
    )

    # Create the draft and link it to the candidate.
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    # Replay both arms with answers that differ only in the verdict text.
    bundle = _bundle()
    runner = FakeReplayRunner(
        baseline_answer="Do not ship. Tests are failing.",
        candidate_answer="Ship after smoke tests pass.",
    )
    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=bundle,
        replay_runner=runner,
    )
    report = asyncio.run(evaluation)

    case = report.case_reports[0]
    tool_scores = case["tool_execution_score"]
    assert tool_scores["baseline_score"] == 0.85
    assert tool_scores["candidate_score"] == 0.85
    assert case["baseline_score"] < case["candidate_score"]
    assert report.tool_mode_summary["score_role"] == "diagnostic_only"
    assert report.ability_score_summary["score_role"] == "primary"
    assert report.real_score_avg is not None
    assert report.synthetic_score_avg is not None
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
    """A synthetic eval case lacking a validator is left out of replay scoring.

    The assertions check that the case is absent from the report's case list,
    that no replay request was issued for it, and that the case-selection
    summary counts exactly one exclusion.
    """
    excluded_run_id = "synthetic:no-validator"
    pipeline = _pipeline(tmp_path)

    # Evidence: one synthetic case with no "validator" entry.
    synthetic_case = {
        "run_id": "synthetic:no-validator",
        "task_id": "synthetic-no-validator",
        "session_id": "synthetic-eval",
        "task_text": "Synthetic task without an oracle.",
        "synthetic": True,
        "accepted_score": 0.75,
    }
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [synthetic_case]},
    )

    # Create the draft and link it to the candidate.
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )

    replay_runner = FakeReplayRunner()
    evaluation = pipeline.evaluate_draft(
        "candidate-1",
        draft.skill_name,
        draft.draft_id,
        provider_bundle=_bundle(),
        replay_runner=replay_runner,
    )
    report = asyncio.run(evaluation)

    reported_run_ids = {case["run_id"] for case in report.case_reports}
    assert excluded_run_id not in reported_run_ids
    assert not any(excluded_run_id in request.case_id for request in replay_runner.requests)
    assert report.case_selection_summary["excluded_synthetic_without_validator"] == 1