feat(app): 移除内置agents并添加CORS支持和技能上传优化

移除了agents/registry.json中的所有内置agents配置,将agents数组清空。
为web应用添加了CORS中间件支持,允许指定的前端地址跨域访问。
重构了技能上传功能,增加了LLM重写机制,自动规范化上传的技能格式。
新增了工具名称提取逻辑,从技能正文中自动识别Required Tools段落。
更新了技能学习候选者和草稿的载荷结构,添加评估报告统计信息。
修改了意图路由技能的说明,改进任务状态管理逻辑。
This commit is contained in:
2026-06-12 13:25:20 +08:00
parent fc9fd93c36
commit 8aeb97a5fc
76 changed files with 3382 additions and 553 deletions

View File

@ -4,6 +4,7 @@ import json
from pathlib import Path
from beaver.engine import EngineLoader
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.catalog.utils import parse_frontmatter
@ -69,6 +70,16 @@ def test_skill_authoring_admin_is_seeded_but_not_initial() -> None:
assert version["tool_hints"] == expected_tools
def test_seeded_skill_bodies_use_canonical_format() -> None:
    """Every skill listed in the published/disabled indexes has a canonical v0001 body."""
    skills_root = REPO_ROOT / "skills"
    for index_name in ("published", "disabled"):
        index_path = skills_root / "_index" / f"{index_name}.json"
        index = json.loads(index_path.read_text(encoding="utf-8"))
        for skill_name in index["items"]:
            skill_file = skills_root / skill_name / "versions" / "v0001" / "SKILL.md"
            _frontmatter, body = parse_frontmatter(skill_file.read_text(encoding="utf-8"))
            # The skill name is the assertion message so a failure names the offender.
            assert is_canonical_skill_body(body), skill_name
def test_default_runtime_registers_skill_view_tool(tmp_path: Path) -> None:
loaded = EngineLoader(workspace=tmp_path).load()
try:

View File

@ -87,6 +87,14 @@ def _task() -> TaskRecord:
)
def _weather_task() -> TaskRecord:
    """Return the base task fixture re-labelled as a Zhuhai weather query."""
    weather_query = "珠海天气怎样"
    record = _task()
    record.description = weather_query
    record.goal = weather_query
    record.metadata["short_title"] = "查询珠海天气"
    return record
def test_router_continues_active_task_from_llm_decision() -> None:
provider = RouterProvider('{"action":"continue_task","reason":"related","short_title":"任务连续性"}')
decision = asyncio.run(
@ -103,6 +111,35 @@ def test_router_continues_active_task_from_llm_decision() -> None:
assert provider.calls[0]["max_tokens"] == 256
def test_router_keeps_same_session_but_starts_new_task_for_standalone_weather_repeat() -> None:
    """A repeated standalone weather question becomes a new task even when the LLM says continue."""
    llm_reply = '{"action":"continue_task","reason":"neutral follow-up","short_title":"查询珠海天气"}'
    router = MainAgentRouter()
    classification = router.classify(
        "珠海天气怎么样",
        active_task=_weather_task(),
        provider=RouterProvider(llm_reply),
    )
    decision = asyncio.run(classification)

    assert decision.is_task
    # The router's result differs from the LLM's "continue_task" answer.
    assert decision.action == "create_task"
    assert decision.starts_new_task is True
    assert "fresh standalone task request" in decision.reason
def test_router_allows_explicit_followup_to_continue_active_weather_task() -> None:
    """An explicit follow-up message keeps the LLM's continue_task decision."""
    llm_reply = '{"action":"continue_task","reason":"related follow-up","short_title":"查询珠海天气"}'
    router = MainAgentRouter()
    classification = router.classify(
        "顺便查一下深圳",
        active_task=_weather_task(),
        provider=RouterProvider(llm_reply),
    )
    decision = asyncio.run(classification)

    assert decision.is_task
    assert decision.action == "continue_task"
    assert decision.starts_new_task is False
def test_router_marks_revision_from_llm_decision() -> None:
decision = asyncio.run(
MainAgentRouter().classify(
@ -163,6 +200,8 @@ def test_router_prompt_treats_unrelated_lightweight_conversation_as_new_topic()
prompt = provider.calls[0]["messages"][1]["content"]
assert "unrelated lightweight conversation" in prompt
assert "must not be classified as revise_task merely because the active Task is awaiting acceptance" in prompt
assert "A Session is the durable conversation/device/group context" in prompt
assert "Repeating '珠海天气怎么样' later is a new Task" in prompt
def test_router_closes_active_task_from_llm_decision() -> None:

View File

@ -5,13 +5,40 @@ from types import SimpleNamespace
import pytest
from beaver.interfaces.web.app import _create_skill_upload_draft
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.interfaces.web.app import _create_skill_upload_draft, _rewrite_uploaded_skill_draft_with_llm
from beaver.services.skillhub_service import SkillHubService
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.catalog.utils import extract_required_tool_names
from beaver.skills.drafts import DraftService
from beaver.skills.specs import SkillSpecStore
from beaver.tools.mcp.wrapper import MCPToolWrapper
class RewriteProvider(LLMProvider):
    """Stub LLM provider that records the prompt and replies with a fixed skill-rewrite JSON payload."""

    def __init__(self) -> None:
        super().__init__()
        # Message list from the most recent chat() call; tests inspect it to check the prompt.
        self.messages = []

    async def chat(self, messages, tools=None, model=None, max_tokens=None, temperature=0.7, thinking_enabled=None):
        """Remember ``messages`` and return the canned rewrite response, echoing ``model``."""
        self.messages = messages
        return LLMResponse(
            content="""{
"frontmatter": {
"name": "skill",
"description": "Use when uploaded skill guidance needs QA formatting.",
"tools": ["read_file"]
},
"content": "# Skill\\n\\n## Overview\\n\\nLLM rewritten overview.\\n\\n## When to Use\\n\\n- Use when testing upload rewrite.\\n\\n## Required Tools\\n\\n- `read_file`\\n\\n## Workflow\\n\\n- Follow the rewritten workflow.\\n\\n## Validation\\n\\n- Verify the result.\\n\\n## Boundaries\\n\\n- Stay in scope.\\n\\n## Anti-Patterns\\n\\n- Do not skip rewrite validation.\\n",
"change_reason": "normalized upload"
}""",
            model=model,
        )

    def get_default_model(self):
        """Return the fixed model name reported by this stub."""
        return "rewrite-model"
class FakeSkillHubService(SkillHubService):
async def _get_json(self, path, *, params=None):
if path == "/skills":
@ -99,6 +126,106 @@ def test_upload_skill_zip_keeps_supporting_files_on_draft(tmp_path):
assert upload_dir.endswith(draft["draft_id"])
def test_upload_skill_zip_canonicalizes_uploaded_skill_body(tmp_path):
    """An uploaded SKILL.md with a free-form body yields a draft whose body is canonical."""
    raw_skill_md = "---\nname: skill\ndescription: raw upload\ntools:\n  - read_file\n---\nBody without our format.\n"
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as archive:
        archive.writestr("skill/SKILL.md", raw_skill_md)

    draft = _create_skill_upload_draft(loaded, "skill.zip", zip_buffer.getvalue())

    # Frontmatter values from the upload are kept; only the body is normalized.
    assert draft["proposed_frontmatter"]["name"] == "skill"
    assert draft["proposed_frontmatter"]["tools"] == ["read_file"]
    assert is_canonical_skill_body(draft["proposed_content"])
def test_upload_skill_zip_infers_weather_web_tools_from_content(tmp_path):
    """An upload without a tools list gets web tools in both frontmatter and body."""
    raw_skill_md = (
        "---\nname: weather-search\ndescription: weather lookup\n---\n"
        "Look up current weather and forecast for a city online.\n"
    )
    expected_tools = ["web_fetch", "web_search"]
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as archive:
        archive.writestr("weather_search/skills.md", raw_skill_md)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", zip_buffer.getvalue())

    assert draft["proposed_frontmatter"]["tools"] == expected_tools
    assert extract_required_tool_names(draft["proposed_content"]) == expected_tools
    assert is_canonical_skill_body(draft["proposed_content"])
def test_upload_skill_llm_rewrite_updates_draft(tmp_path):
    """The LLM rewrite step replaces the stored draft body and prompts with format and tool guidance."""
    spec_store = SkillSpecStore(tmp_path)
    drafts = DraftService(spec_store)
    fallback_draft = drafts.create_new_skill_draft(
        skill_name="skill",
        proposed_content="# Skill\n\n## Overview\n\nFallback.",
        proposed_frontmatter={"name": "skill", "description": "fallback", "tools": ["read_file"]},
        created_by="test",
        reason="upload",
    )
    provider = RewriteProvider()

    def _make_bundle(_loaded, _kwargs):
        # Minimal stand-in for the service's provider-bundle factory.
        return SimpleNamespace(
            main_provider=provider,
            main_runtime=SimpleNamespace(model="rewrite-model"),
        )

    agent_service = SimpleNamespace(_make_provider_bundle_for_task=_make_bundle)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=drafts)

    asyncio.run(
        _rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, fallback_draft, filename="skill.zip")
    )

    rewritten = drafts.get_draft("skill", fallback_draft.draft_id)
    assert rewritten is not None
    assert "LLM rewritten overview" in rewritten.proposed_content
    assert is_canonical_skill_body(rewritten.proposed_content)
    # The second message sent to the provider carries the rewrite instructions.
    assert "Canonical Beaver SKILL.md format" in provider.messages[1]["content"]
    assert "Available runtime tool names" in provider.messages[1]["content"]
def test_upload_skill_zip_accepts_nested_single_skill_directory(tmp_path):
    """A skill nested inside a larger package is found and only its own files are kept.

    The archive holds one skill under ``plugin/skills/nested-skill/`` plus an
    unrelated package-level README. The draft must take the nested skill's
    name, copy its supporting file into the draft upload directory, and leave
    the package README out.
    """
    store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=store, draft_service=DraftService(store))
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr(
            "plugin/skills/nested-skill/SKILL.md",
            "---\nname: nested-skill\ndescription: nested\n---\nBody\n",
        )
        archive.writestr("plugin/skills/nested-skill/references/a.txt", "context")
        archive.writestr("plugin/README.md", "ignore package file")

    draft = _create_skill_upload_draft(loaded, "plugin.zip", buffer.getvalue())

    assert draft["skill_name"] == "nested-skill"
    evidence = draft["evidence_refs"][0]
    upload_dir = evidence["supporting_upload_dir"]
    # Same check as the sibling supporting-files test: the upload dir is keyed by draft id.
    assert upload_dir.endswith(draft["draft_id"])
    copied_reference = (
        tmp_path / "skills" / "nested-skill" / "draft_uploads" / draft["draft_id"] / "references" / "a.txt"
    )
    # Explicit encoding keeps the read independent of the platform locale.
    assert copied_reference.read_text(encoding="utf-8") == "context"
    assert "README.md" not in evidence["files"]
def test_upload_skill_zip_accepts_common_skill_markdown_name_aliases(tmp_path):
    """A skill file named ``skills.md`` is accepted in place of ``SKILL.md``."""
    aliased_skill_md = "---\nname: weather-search\ndescription: weather lookup\n---\nBody\n"
    spec_store = SkillSpecStore(tmp_path)
    loaded = SimpleNamespace(skill_spec_store=spec_store, draft_service=DraftService(spec_store))

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as archive:
        archive.writestr("weather_search/skills.md", aliased_skill_md)

    draft = _create_skill_upload_draft(loaded, "weather_search.zip", zip_buffer.getvalue())

    # The skill name comes from the frontmatter ("weather-search"), not the folder ("weather_search").
    assert draft["skill_name"] == "weather-search"
    assert draft["proposed_frontmatter"]["name"] == "weather-search"
    assert is_canonical_skill_body(draft["proposed_content"])
def test_mcp_wrapper_metadata_preserves_server_id_with_underscores():
tool_def = SimpleNamespace(name="auth_status", description="Auth", inputSchema={"type": "object", "properties": {}})

View File

@ -184,7 +184,7 @@ def test_skill_lifecycle_publish_revision_and_rollback(tmp_path: Path) -> None:
assert published.version == "v0002"
assert store.get_current_version("release-checklist") == "v0002"
with pytest.raises(ValueError, match="approved"):
with pytest.raises(ValueError, match="submitted for review"):
publisher.publish("release-checklist", revision.draft_id, publisher="reviewer", notes="duplicate")
rolled_back = publisher.rollback("release-checklist", "v0001", actor="reviewer", reason="regression")
@ -529,6 +529,66 @@ def test_skill_learning_service_generates_new_skill_for_task_without_published_s
assert candidates[0].source_run_ids == ["task-run-1"]
def test_skill_learning_service_uses_original_task_text_for_new_skill_theme(tmp_path: Path) -> None:
    """The candidate theme comes from the first attempt's task text, not the revision comment."""
    spec_store = SkillSpecStore(tmp_path)
    run_store = RunMemoryStore(tmp_path / "memory" / "runs")
    learning_store = SkillLearningStore(tmp_path / "memory" / "skills")
    service = SkillLearningService(
        run_store=run_store,
        learning_store=learning_store,
        draft_service=DraftService(spec_store),
        evidence_selector=EvidenceSelector(run_store),
    )
    now = datetime.now(timezone.utc).isoformat()

    # Attempt 1: the original request, rejected with a revise comment.
    first_attempt = RunRecord(
        run_id="task-run-1",
        session_id="session-task",
        task_id="task-1",
        attempt_index=1,
        task_text="Compare direct production restart with staging rollout",
        started_at=now,
        ended_at=now,
        success=False,
        finish_reason="stop",
        feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
        activated_skills=[],
        validation_result=None,
    )
    run_store.append_run_record(first_attempt)

    # Attempt 2: its task text is the revise comment; this attempt was accepted.
    second_attempt = RunRecord(
        run_id="task-run-2",
        session_id="session-task",
        task_id="task-1",
        attempt_index=2,
        task_text="I do not see the docs",
        started_at=now,
        ended_at=now,
        success=True,
        finish_reason="stop",
        feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
        activated_skills=[],
        validation_result={"accepted": True, "score": 0.9},
    )
    run_store.append_run_record(second_attempt)

    candidates = service.build_learning_candidates_for_task("task-1", trigger_run_id="task-run-2")

    candidate_ids = [candidate.candidate_id for candidate in candidates]
    assert candidate_ids == ["new:task:task-1"]
    evidence = candidates[0].evidence
    assert evidence["theme"] == "Compare direct production restart with staging rollout"
    assert evidence["task_text"] == "Compare direct production restart with staging rollout"
def test_task_theme_uses_first_sentence_for_chinese_text() -> None:
    """``_task_theme`` returns only the text before the first Chinese full stop."""
    task_text = "帮我比较两种发布流程的风险A 是直接重启线上容器B 是先部署 staging 再切 production。请给出推荐方案、原因、验证步骤和回滚策略。"
    expected_theme = "帮我比较两种发布流程的风险A 是直接重启线上容器B 是先部署 staging 再切 production"

    theme = SkillLearningService._task_theme(task_text)

    assert theme == expected_theme
def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None:
skill = SkillContext(
name="docker-debug",

View File

@ -0,0 +1,54 @@
from __future__ import annotations
from beaver.skills.authoring.format import (
CANONICAL_SKILL_SECTION_HEADINGS,
canonical_skill_format_instructions,
canonicalize_skill_body,
is_canonical_skill_body,
parse_skill_rewrite_json,
)
def test_canonical_skill_body_contains_required_sections() -> None:
    """A body built by ``canonicalize_skill_body`` is canonical and carries every section heading."""
    section_inputs = {
        "title": "Filesystem Operation",
        "overview": "Read and update project files safely.",
        "tools": ["read_file", "write_file"],
        "workflow": ["Inspect the file before editing.", "Use the smallest safe edit."],
        "validation": ["Re-read changed files before reporting completion."],
        "boundaries": ["Do not edit files outside the workspace."],
        "anti_patterns": ["Do not overwrite files without reading them first."],
    }

    body = canonicalize_skill_body(**section_inputs)

    assert is_canonical_skill_body(body)
    assert all(heading in body for heading in CANONICAL_SKILL_SECTION_HEADINGS)
def test_canonical_skill_format_instructions_are_prompt_ready() -> None:
    """The format instructions name the format, the frontmatter keys, and each section heading."""
    instructions = canonical_skill_format_instructions()
    expected_fragments = [
        "Canonical Beaver SKILL.md format",
        "frontmatter",
        "name",
        "description",
        "tools",
        *CANONICAL_SKILL_SECTION_HEADINGS,
    ]
    for fragment in expected_fragments:
        assert fragment in instructions
def test_parse_skill_rewrite_json_backfills_frontmatter_tools_from_required_tools_section() -> None:
    """An empty frontmatter ``tools`` list is filled from the body's Required Tools section."""
    # The payload declares no tools in frontmatter but lists two under "## Required Tools".
    payload = parse_skill_rewrite_json(
        """{
"frontmatter": {
"name": "weather-search",
"description": "weather lookup",
"tools": []
},
"content": "# Weather Search\\n\\n## Overview\\n\\nLook up weather.\\n\\n## When to Use\\n\\n- Weather requests.\\n\\n## Required Tools\\n\\n- `web_fetch`\\n- `web_search`\\n\\n## Workflow\\n\\n- Fetch current weather.\\n\\n## Validation\\n\\n- Check source freshness.\\n\\n## Boundaries\\n\\n- Do not guess.\\n\\n## Anti-Patterns\\n\\n- Do not fabricate data.\\n"
}""",
        skill_name="weather-search",
    )
    assert payload is not None
    assert payload["frontmatter"]["tools"] == ["web_fetch", "web_search"]

View File

@ -19,8 +19,22 @@ from beaver.skills.specs import SkillSpecStore
class StubProvider(LLMProvider):
async def chat(self, messages: list[dict], tools: list[dict] | None = None, model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7) -> LLMResponse:
return LLMResponse(content="ok")
def __init__(self, content: str = "ok") -> None:
super().__init__()
self.content = content
self.calls: list[dict] = []
async def chat(
self,
messages: list[dict],
tools: list[dict] | None = None,
model: str | None = None,
max_tokens: int = 4096,
temperature: float = 0.7,
thinking_enabled: bool | None = None,
) -> LLMResponse:
self.calls.append({"messages": messages, "model": model, "max_tokens": max_tokens, "temperature": temperature})
return LLMResponse(content=self.content)
def get_default_model(self) -> str:
return "stub"
@ -92,7 +106,6 @@ def test_eval_pass_allows_publish_after_safety_and_review(tmp_path: Path) -> Non
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
published = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert report.passed is True
@ -114,7 +127,6 @@ def test_eval_regression_blocks_publish(tmp_path: Path) -> None:
report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is False
assert pipeline.get_candidate("candidate-1").status == "eval_failed"
@ -160,7 +172,14 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:
class FakeReplayRunner:
def __init__(self, *, baseline_answer: str = "done", candidate_answer: str = "done") -> None:
self.baseline_answer = baseline_answer
self.candidate_answer = candidate_answer
self.requests = []
async def run_arm(self, request):
self.requests.append(request)
final_answer = self.candidate_answer if request.arm == "candidate" else self.baseline_answer
return {
"case_id": request.case_id,
"arm": request.arm,
@ -168,7 +187,7 @@ class FakeReplayRunner:
"run_id": f"{request.arm}-run",
"task_text": request.task_text,
"finish_reason": "stop",
"final_answer": "done",
"final_answer": final_answer,
"tool_calls": [
{
"tool_name": "write_file",
@ -213,3 +232,102 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
assert 0.0 <= report.execution_coverage <= 1.0
assert 0.0 <= report.surrogate_coverage <= 1.0
assert report.confidence in {"low", "medium", "high"}
assert "ability_score" in report.case_reports[0]
assert "tool_execution_score" in report.case_reports[0]
assert report.ability_score_summary["score_role"] == "primary"
assert report.tool_execution_summary["score_role"] == "diagnostic_only"
def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
    """With equal tool-execution scores, the validator on the final answer separates the arms."""
    pipeline = _pipeline(tmp_path)
    validator_case = {
        "run_id": "validator-case",
        "task_id": "validator-case",
        "session_id": "eval",
        "task_text": "Write the release verdict.",
        "validator": {
            "type": "final_answer_contains",
            "required_terms": ["ship"],
            "forbidden_terms": ["do not ship"],
        },
        "accepted_score": 0.5,
    }
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [validator_case]},
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    # The baseline answer contains the forbidden phrase; the candidate answer does not.
    replay_runner = FakeReplayRunner(
        baseline_answer="Do not ship. Tests are failing.",
        candidate_answer="Ship after smoke tests pass.",
    )

    report = asyncio.run(
        pipeline.evaluate_draft(
            "candidate-1",
            draft.skill_name,
            draft.draft_id,
            provider_bundle=_bundle(),
            replay_runner=replay_runner,
        )
    )

    case = report.case_reports[0]
    tool_scores = case["tool_execution_score"]
    assert tool_scores["baseline_score"] == 0.85
    assert tool_scores["candidate_score"] == 0.85
    assert case["baseline_score"] < case["candidate_score"]
    assert report.tool_mode_summary["score_role"] == "diagnostic_only"
    assert report.ability_score_summary["score_role"] == "primary"
    assert report.real_score_avg is not None
    assert report.synthetic_score_avg is not None
def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
    """A synthetic case with no validator is left out of replay and counted as excluded."""
    excluded_run_id = "synthetic:no-validator"
    pipeline = _pipeline(tmp_path)
    synthetic_case = {
        "run_id": "synthetic:no-validator",
        "task_id": "synthetic-no-validator",
        "session_id": "synthetic-eval",
        "task_text": "Synthetic task without an oracle.",
        "synthetic": True,
        "accepted_score": 0.75,
    }
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        evidence={"eval_cases": [synthetic_case]},
    )
    draft = pipeline.draft_service.create_new_skill_draft(
        skill_name="release-checklist",
        proposed_content="# Release\n\nRun tests.",
        proposed_frontmatter={"description": "release", "tools": []},
        created_by="test",
        reason="test",
    )
    pipeline.learning_store.update_learning_candidate(
        "candidate-1",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    replay_runner = FakeReplayRunner()

    report = asyncio.run(
        pipeline.evaluate_draft(
            "candidate-1",
            draft.skill_name,
            draft.draft_id,
            provider_bundle=_bundle(),
            replay_runner=replay_runner,
        )
    )

    reported_run_ids = {case["run_id"] for case in report.case_reports}
    assert excluded_run_id not in reported_run_ids
    # No replay request was issued for the excluded case.
    assert all(excluded_run_id not in request.case_id for request in replay_runner.requests)
    assert report.case_selection_summary["excluded_synthetic_without_validator"] == 1

View File

@ -31,6 +31,12 @@ def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
assert payload["confidence"] == "low"
assert payload["case_reports"] == []
assert payload["tool_mode_summary"] == {}
assert payload["ability_score_summary"] == {}
assert payload["tool_execution_summary"] == {}
assert payload["case_selection_summary"] == {}
assert payload["real_score_avg"] is None
assert payload["synthetic_score_avg"] is None
assert payload["overall_score_avg"] is None
assert payload["preservation_report"] is None
assert payload["cases"] == [{"run_id": "run-1"}]
@ -59,3 +65,37 @@ def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
assert report.mode == "heuristic"
assert report.confidence == "low"
assert report.case_reports == []
def test_eval_report_persists_ability_and_case_split_fields() -> None:
    """The score-split and summary fields survive a ``to_dict``/``from_dict`` round trip."""
    original = SkillDraftEvalReport(
        report_id="eval-replay",
        skill_name="debug",
        draft_id="draft-1",
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.5,
        candidate_score_avg=0.8,
        score_delta=0.3,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        mode="replay",
        eval_version="replay-v2",
        real_score_avg=0.9,
        synthetic_score_avg=0.6,
        overall_score_avg=0.8,
        ability_score_summary={"score_role": "primary", "real_case_count": 1},
        tool_execution_summary={"score_role": "diagnostic_only", "executed": 1.0},
        case_selection_summary={"excluded_synthetic_without_validator": 2},
    )

    serialized = original.to_dict()
    round_tripped = SkillDraftEvalReport.from_dict(serialized)

    # Scalar averages are checked on the serialized payload.
    assert serialized["real_score_avg"] == 0.9
    assert serialized["synthetic_score_avg"] == 0.6
    assert serialized["overall_score_avg"] == 0.8
    # Summary dicts are checked on the restored report.
    assert round_tripped.ability_score_summary == {"score_role": "primary", "real_case_count": 1}
    assert round_tripped.tool_execution_summary == {"score_role": "diagnostic_only", "executed": 1.0}
    assert round_tripped.case_selection_summary == {"excluded_synthetic_without_validator": 2}

View File

@ -55,14 +55,12 @@ def test_pipeline_lists_candidates_and_moves_draft_through_review(tmp_path: Path
reason="test",
)
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
approved = pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
version = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert pipeline.list_candidates()[0].candidate_id == "candidate-1"
assert review.status == SkillReviewState.IN_REVIEW.value
assert approved.status == SkillReviewState.APPROVED.value
assert safety.passed is True
assert version.skill_name == "new-skill"
assert pipeline.get_draft(draft.skill_name, draft.draft_id).status == SkillReviewState.PUBLISHED.value
@ -93,7 +91,6 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
@ -165,7 +162,6 @@ def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
)
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
with pytest.raises(ValueError, match="low confidence"):
@ -201,7 +197,6 @@ def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
)
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
pipeline.check_safety(draft.skill_name, draft.draft_id)
with pytest.raises(ValueError, match="preservation"):

View File

@ -16,6 +16,25 @@ class FakeAgentLoop:
return SimpleNamespace(session_id="session-replay", run_id="run-replay", output_text="done", finish_reason="stop")
class FakeRunningAgentLoop(FakeAgentLoop):
    """Fake loop whose ``process_direct`` always raises and whose ``submit_direct`` records calls."""

    def __init__(self) -> None:
        self.process_direct_calls = 0
        self.submit_direct_calls: list[tuple[str, dict]] = []

    async def process_direct(self, task: str, **kwargs):
        """Count the call, then raise the 'disabled while run() is active' error."""
        self.process_direct_calls += 1
        message = (
            "AgentLoop.process_direct() is disabled while run() is active; "
            "submit tasks via submit_direct() instead."
        )
        raise RuntimeError(message)

    async def submit_direct(self, task: str, **kwargs):
        """Record the call, run one tool through the override executor, and return a fixed result."""
        self.submit_direct_calls.append((task, kwargs))
        tool_executor = kwargs["tool_executor_override"]
        await tool_executor.execute("mcp_outlook_send_email", {"to": "ada@example.com"})
        result = SimpleNamespace(
            session_id="session-queued",
            run_id="run-queued",
            output_text="queued done",
            finish_reason="stop",
        )
        return result
def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
runner = ReplayRunner(agent_loop=FakeAgentLoop())
request = ReplayArmRequest(
@ -34,3 +53,33 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
assert report["arm"] == "candidate"
assert report["finish_reason"] == "stop"
assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
    """When ``process_direct`` raises, the runner retries the arm once via ``submit_direct``."""
    agent_loop = FakeRunningAgentLoop()
    runner = ReplayRunner(agent_loop=agent_loop)
    arm_request = ReplayArmRequest(
        case_id="case-queued",
        arm="baseline",
        task_text="Send a status email to Ada.",
        pinned_skill_names=["filesystem-operation"],
        pinned_skill_contexts=[{"name": "filesystem-operation"}],
        provider_bundle=object(),
        model_settings={"max_tool_iterations": 3, "temperature": 0.1},
    )

    report = asyncio.run(runner.run_arm(arm_request))

    # Exactly one direct attempt and one queued submission.
    assert agent_loop.process_direct_calls == 1
    assert len(agent_loop.submit_direct_calls) == 1

    queued_task, queued_kwargs = agent_loop.submit_direct_calls[0]
    assert queued_task == "Send a status email to Ada."
    assert queued_kwargs["source"] == "skill_replay_eval"
    assert queued_kwargs["include_skill_assembly"] is False
    assert queued_kwargs["include_tools"] is True
    expected_values = {
        "pinned_skill_names": ["filesystem-operation"],
        "max_tool_iterations": 3,
        "temperature": 0.1,
    }
    for key, expected in expected_values.items():
        assert queued_kwargs[key] == expected

    # The report reflects the queued run, including its tool trace.
    assert report["session_id"] == "session-queued"
    assert report["run_id"] == "run-queued"
    assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"

View File

@ -74,7 +74,6 @@ def test_safety_marks_dangerous_tools_high_and_requires_confirm(tmp_path: Path)
report = pipeline.check_safety(draft.skill_name, draft.draft_id)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
assert report.passed is True
assert report.risk_level == "high"
@ -94,7 +93,6 @@ def test_publish_requires_safety_report(tmp_path: Path) -> None:
reason="test",
)
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
with pytest.raises(ValueError, match="safety report"):
pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from beaver.memory.skills import SkillLearningCandidate
from beaver.skills.authoring.format import CANONICAL_SKILL_SECTION_HEADINGS
from beaver.skills.learning.evidence import EvidencePacket
from beaver.skills.learning.synthesizer import SkillDraftSynthesizer
@ -39,3 +40,6 @@ def test_revision_prompt_includes_base_skill_snapshot() -> None:
assert "Do not delete files." in prompt
assert "preserved_sections" in prompt
assert "dropped_sections" in prompt
assert "Canonical Beaver SKILL.md format" in prompt
for heading in CANONICAL_SKILL_SECTION_HEADINGS:
assert heading in prompt

View File

@ -1,12 +1,37 @@
from __future__ import annotations
from pathlib import Path
from types import SimpleNamespace
from fastapi.testclient import TestClient
from beaver.memory.runs import RunRecord
from beaver.interfaces.web.app import create_app
from beaver.memory.skills import SkillLearningCandidate
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.services.agent_service import AgentService
from beaver.skills.specs import SkillVersion
class StubEvaluator:
    """Evaluator stand-in that counts calls and always returns a passing report."""

    def __init__(self) -> None:
        self.calls = 0

    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
        """Increment the call counter and return a fixed passing report for ``draft``."""
        self.calls += 1
        canned_report = SkillDraftEvalReport(
            report_id="eval-existing",
            skill_name=draft.skill_name,
            draft_id=draft.draft_id,
            candidate_id=candidate.candidate_id,
            passed=True,
            baseline_score_avg=0.5,
            candidate_score_avg=0.8,
            score_delta=0.3,
            regression_count=0,
            improved_count=1,
            unchanged_count=0,
            status="completed",
        )
        return canned_report
def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
@ -31,3 +56,191 @@ def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
assert candidates[0]["candidate_id"] == "candidate-1"
assert "risk_level" in candidates[0]
assert run_once["processed"] >= 0
def test_skill_learning_candidates_payload_prefers_original_task_text(tmp_path: Path) -> None:
    """The candidates API reports the first attempt's task text as theme, not the stored theme."""
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    now = "2026-06-11T00:00:00+00:00"
    run_store = loaded.skill_learning_service.run_store  # type: ignore[union-attr]

    # Attempt 1: the original request, rejected with a revise comment.
    original_run = RunRecord(
        run_id="run-original",
        session_id="session-task",
        task_id="task-1",
        attempt_index=1,
        task_text="Compare direct production restart with staging rollout",
        started_at=now,
        ended_at=now,
        success=False,
        finish_reason="stop",
        feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
        activated_skills=[],
        validation_result=None,
    )
    run_store.append_run_record(original_run)

    # Attempt 2: its task text is the revise comment; this attempt was accepted.
    final_run = RunRecord(
        run_id="run-final",
        session_id="session-task",
        task_id="task-1",
        attempt_index=2,
        task_text="I do not see the docs",
        started_at=now,
        ended_at=now,
        success=True,
        finish_reason="stop",
        feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
        activated_skills=[],
        validation_result={"accepted": True, "score": 0.9},
    )
    run_store.append_run_record(final_run)

    # The stored candidate carries a theme taken from the revise comment.
    stored_candidate = SkillLearningCandidate(
        candidate_id="new:task:task-1",
        kind="new_skill",
        source_run_ids=["run-original", "run-final"],
        source_session_ids=["session-task"],
        related_skill_names=[],
        reason="test",
        evidence={"task_id": "task-1", "theme": "i do not see the docs"},
    )
    loaded.skill_learning_store.record_learning_candidate(stored_candidate)  # type: ignore[union-attr]

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        candidates = client.get("/api/skills/candidates").json()

    matching = next(item for item in candidates if item["candidate_id"] == "new:task:task-1")
    assert matching["evidence"]["theme"] == "Compare direct production restart with staging rollout"
    assert matching["evidence"]["task_text"] == "Compare direct production restart with staging rollout"
def test_generate_draft_does_not_run_review_checks(tmp_path: Path, monkeypatch) -> None:
    """The draft endpoint returns the existing draft without running the evaluator or safety check."""
    service = AgentService(workspace=tmp_path)
    loaded = service.create_loop().boot()
    pipeline = loaded.skill_learning_pipeline
    draft = pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    # The candidate already points at the draft and is marked draft_ready.
    existing_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=draft.skill_name,
        draft_id=draft.draft_id,
    )
    loaded.skill_learning_store.record_learning_candidate(existing_candidate)  # type: ignore[union-attr]

    evaluator = StubEvaluator()
    pipeline.evaluator = evaluator  # type: ignore[union-attr]
    monkeypatch.setattr(
        service,
        "_make_provider_bundle_for_task",
        lambda loaded, kwargs: SimpleNamespace(main_provider=object()),
    )

    app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(app) as client:
        response = client.post("/api/skills/candidates/candidate-existing/draft")

    assert response.status_code == 200
    body = response.json()
    assert evaluator.calls == 0
    assert body["draft_id"] == draft.draft_id
    assert body["safety_report"] is None
    assert body["eval_report"] is None
    stored_report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)  # type: ignore[union-attr]
    assert stored_report is None
def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:
    """Check that submitting a draft runs the evaluator and attaches both reports.

    A draft is created and linked to a ``draft_ready`` candidate, a stub
    evaluator is installed on the pipeline, and the draft submit endpoint is
    called. The stub must be called exactly once and the response must show
    the draft ``in_review`` with a passing safety report and the eval report
    whose id is ``eval-existing``.
    """
    service = AgentService(workspace=tmp_path)
    runtime = service.create_loop().boot()

    seeded_draft = runtime.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        proposed_content="# Filesystem Operation\n\nUse files safely.",
        proposed_frontmatter={"description": "filesystem", "tools": []},
        created_by="test",
        reason="test",
    )
    # Candidate that already points at the draft created above.
    linked_candidate = SkillLearningCandidate(
        candidate_id="candidate-existing",
        kind="revise_skill",
        source_run_ids=["run-1"],
        source_session_ids=["session-1"],
        related_skill_names=["filesystem-operation"],
        reason="revise",
        status="draft_ready",
        draft_skill_name=seeded_draft.skill_name,
        draft_id=seeded_draft.draft_id,
    )
    runtime.skill_learning_store.record_learning_candidate(linked_candidate)  # type: ignore[union-attr]

    stub_evaluator = StubEvaluator()
    runtime.skill_learning_pipeline.evaluator = stub_evaluator  # type: ignore[union-attr]
    # Replace the provider-bundle factory with one that returns a placeholder provider.
    monkeypatch.setattr(
        service,
        "_make_provider_bundle_for_task",
        lambda loaded, kwargs: SimpleNamespace(main_provider=object()),
    )

    submit_url = f"/api/skills/{seeded_draft.skill_name}/drafts/{seeded_draft.draft_id}/submit"
    web_app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(web_app) as client:
        response = client.post(submit_url)

    assert response.status_code == 200
    body = response.json()
    assert stub_evaluator.calls == 1
    assert body["status"] == "in_review"
    assert body["safety_report"]["passed"] is True
    assert body["eval_report"]["report_id"] == "eval-existing"
def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
    """Check the drafts listing for a revision draft of a published skill.

    Version ``v0001`` of ``filesystem-operation`` is written and made current,
    a revision draft based on it is created, and ``/api/skills/drafts`` is
    queried. The matching entry must be a ``revise_skill`` proposal with base
    ``v0001``, target ``v0002``, and the base skill's version, content and
    frontmatter name embedded.
    """
    skill_name = "filesystem-operation"
    published_body = "# Filesystem Operation\n\nUse files."

    service = AgentService(workspace=tmp_path)
    runtime = service.create_loop().boot()

    published_version = SkillVersion(
        skill_name="filesystem-operation",
        version="v0001",
        content_hash="hash-v1",
        summary_hash="summary-v1",
        created_at="2026-06-01T00:00:00+00:00",
        created_by="test",
        change_reason="initial",
        parent_version=None,
        review_state="published",
        frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
        summary="filesystem",
        tool_hints=[],
    )
    runtime.skill_spec_store.write_skill_version(published_version, published_body)  # type: ignore[union-attr]
    runtime.skill_spec_store.set_current_version(skill_name, "v0001")  # type: ignore[union-attr]

    revision = runtime.skill_learning_pipeline.draft_service.create_revision_draft(  # type: ignore[union-attr]
        skill_name="filesystem-operation",
        base_version="v0001",
        proposed_content="# Filesystem Operation\n\nUse files better.",
        proposed_frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
        created_by="test",
        reason="revise",
    )

    web_app = create_app(service=service, manage_service_lifecycle=False)
    with TestClient(web_app) as client:
        response = client.get("/api/skills/drafts")

    assert response.status_code == 200
    # Pick the listing entry that belongs to the revision draft created above.
    entry = next(item for item in response.json() if item["draft_id"] == revision.draft_id)
    assert entry["proposal_kind"] == "revise_skill"
    assert entry["base_version"] == "v0001"
    assert entry["target_version"] == "v0002"
    base_skill = entry["base_skill"]
    assert base_skill["version"] == "v0001"
    assert base_skill["content"] == "# Filesystem Operation\n\nUse files."
    assert base_skill["frontmatter"]["name"] == "filesystem-operation"

View File

@ -10,6 +10,7 @@ from beaver.engine.providers.factory import ProviderBundle
from beaver.engine.session import SessionManager
from beaver.memory.runs import RunMemoryStore, RunRecord
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
from beaver.skills.authoring.format import is_canonical_skill_body
from beaver.skills.drafts import DraftService
from beaver.skills.learning import (
EvidenceSelector,
@ -48,6 +49,33 @@ def _bundle(provider: LLMProvider) -> ProviderBundle:
return ProviderBundle(main_runtime=runtime, main_provider=provider) # type: ignore[arg-type]
class FakeReplayRunner:
    """Replay-runner test double: records each request and returns a canned result."""

    def __init__(self) -> None:
        # Every request passed to run_arm, in call order.
        self.requests = []

    async def run_arm(self, request):
        """Record ``request`` and return a fixed result dict for that arm.

        The result echoes ``case_id``, ``arm`` and ``task_text`` from the
        request, derives ``run_id`` as ``"<arm>-run"``, and reports a single
        executed ``echo`` tool call with no artifacts or side effects.
        """
        self.requests.append(request)

        echo_call = {
            "tool_name": "echo",
            "mode": "executed",
            "arguments": {"text": "ok"},
            "result": {"success": True, "content": "ok"},
        }
        outcome = {
            "case_id": request.case_id,
            "arm": request.arm,
            "session_id": "session-replay",
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
            "final_answer": "debug deployment startup done",
        }
        outcome["tool_calls"] = [echo_call]
        outcome["artifacts"] = []
        outcome["side_effects"] = []
        return outcome
def _pipeline(tmp_path: Path) -> SkillLearningPipelineService:
spec_store = SkillSpecStore(tmp_path)
run_store = RunMemoryStore(tmp_path / "memory" / "runs")
@ -109,6 +137,28 @@ def test_worker_synthesizes_open_candidate_without_publish(tmp_path: Path) -> No
assert pipeline.list_drafts(candidate.draft_skill_name)[0].status == "draft"
def test_worker_evaluates_draft_with_replay_runner_when_available(tmp_path: Path) -> None:
    """Check that a worker given a replay runner produces a replay-mode eval report.

    The worker is built with a factory returning a ``FakeReplayRunner`` and is
    run once. The draft attached to ``candidate-1`` (presumably seeded by
    ``_pipeline`` -- confirm against that helper) must then have an eval
    report in ``replay`` mode with case reports, and the fake runner must have
    received at least one request.
    """
    pipeline = _pipeline(tmp_path)
    fake_runner = FakeReplayRunner()

    def make_bundle():
        return _bundle(JsonProvider())

    def make_replay_runner():
        return fake_runner

    worker_config = SkillLearningWorkerConfig(max_drafts_per_run=5, max_retries=3, interval_seconds=1)
    worker = SkillLearningWorker(
        pipeline=pipeline,
        provider_bundle_factory=make_bundle,
        replay_runner_factory=make_replay_runner,
        config=worker_config,
    )
    outcome = asyncio.run(worker.run_once())

    learned = pipeline.get_candidate("candidate-1")
    draft_skill_name = learned.draft_skill_name or ""
    draft_id = learned.draft_id or ""
    draft = pipeline.get_draft(draft_skill_name, draft_id)
    eval_report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)

    assert outcome.succeeded == 1
    assert eval_report is not None
    assert eval_report.mode == "replay"
    assert eval_report.case_reports
    assert fake_runner.requests
def test_worker_retries_and_marks_failed_after_limit(tmp_path: Path) -> None:
pipeline = _pipeline(tmp_path)
worker = SkillLearningWorker(
@ -147,6 +197,7 @@ def test_synthesizer_fills_missing_tools_from_evidence(tmp_path: Path) -> None:
)
assert payload["frontmatter"]["tools"] == ["web_fetch", "memory"]
assert is_canonical_skill_body(payload["content"])
def test_evidence_selector_records_run_tool_names(tmp_path: Path) -> None:

View File

@ -218,6 +218,45 @@ def test_unrelated_new_task_auto_accepts_previous_task(tmp_path: Path) -> None:
assert current.run_ids == [second.run_id]
def test_standalone_realtime_repeat_creates_new_task_in_same_session(tmp_path: Path) -> None:
    """Check that repeating a weather question opens a new task in the same session.

    Two near-identical weather questions are sent on one session id; the
    second bundle is built with ``route_action="continue_task"``. The test
    expects two distinct tasks on the same session: the first closed and
    owning only the first run, the second awaiting acceptance and owning only
    the second run.
    """
    engine_loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner(),
    )
    service = AgentService(loader=engine_loader)
    session_id = "feishu:group-weather"

    def ask(message, bundle):
        # Drive one direct turn to completion on the shared session.
        return asyncio.run(
            service.process_direct(
                message,
                session_id=session_id,
                provider_bundle=bundle,
            )
        )

    first = ask("珠海天气怎样", _bundle("Weather result"))
    second = ask("珠海天气怎么样", _bundle("Fresh weather result", route_action="continue_task"))

    task_service = service.create_loop().boot().task_service
    assert task_service is not None
    earlier_task = task_service.get_task(first.task_id or "")
    later_task = task_service.get_task(second.task_id or "")
    assert earlier_task is not None
    assert later_task is not None

    assert earlier_task.session_id == session_id
    assert later_task.session_id == session_id
    assert later_task.task_id != earlier_task.task_id
    assert earlier_task.status == "closed"
    assert earlier_task.run_ids == [first.run_id]
    assert later_task.status == "awaiting_acceptance"
    assert later_task.run_ids == [second.run_id]
def test_related_follow_up_continues_active_task_without_accepting_it(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(

View File

@ -102,6 +102,58 @@ tools:
assert [spec.name for spec in selected] == ["memory", "terminal", "search_files"]
def test_tool_assembler_uses_required_tools_section_when_frontmatter_omits_tools(tmp_path: Path) -> None:
    """Check that tool hints come from the body's Required Tools section.

    The skill's frontmatter declares no ``tools`` key, but its body lists
    ``terminal`` and ``search_files`` under ``## Required Tools``. The loaded
    record must expose those two names as ``tool_hints``, and assembling with
    ``top_k=1`` must select ``memory``, ``terminal``, ``search_files`` and
    ``echo`` in that order.
    """
    skill_markdown = """---
name: docker-debug
description: Debug Docker issues.
---
# Docker Debug
## Overview
Debug Docker issues.
## Required Tools
- `terminal`
- `search_files`
## Workflow
Inspect logs and search related files.
"""
    skill_dir = tmp_path / "skills" / "docker-debug"
    skill_dir.mkdir(parents=True)
    skill_file = skill_dir / "SKILL.md"
    skill_file.write_text(skill_markdown, encoding="utf-8")

    registry = ToolRegistry()
    for tool in (
        DummyTool("memory", toolset="memory", always_available=True),
        DummyTool("terminal", toolset="shell"),
        DummyTool("search_files", toolset="file"),
        DummyTool("echo", toolset="debug"),
    ):
        registry.register(tool)

    assembler = ToolAssembler(retriever=StaticRetriever())
    loader = SkillsLoader(tmp_path)
    record = loader.get_skill_record("docker-debug")
    assert record is not None
    assert record.tool_hints == ["terminal", "search_files"]

    # Activate the skill with the hints that were parsed from its body.
    active_skill = SkillContext(name="docker-debug", content="", tool_hints=record.tool_hints)
    selected = asyncio.run(
        assembler.assemble(
            task_description="排查 Docker 容器日志",
            registry=registry,
            skills_loader=loader,
            activated_skills=[active_skill],
            top_k=1,
        )
    )
    selected_names = [spec.name for spec in selected]
    assert selected_names == ["memory", "terminal", "search_files", "echo"]
def test_embedding_fallback_can_return_all_or_top_k() -> None:
candidates = [{"name": f"tool_{index}", "description": "", "input_schema": "{}"} for index in range(3)]
retriever = EmbeddingRetriever(api_key_env="MISSING_EMBEDDING_KEY", api_base_env="MISSING_EMBEDDING_BASE")

View File

@ -0,0 +1,21 @@
from fastapi.testclient import TestClient
from beaver.interfaces.web.app import create_app
def test_local_frontend_origin_can_preflight_api_requests() -> None:
    """Check that a CORS preflight from the local frontend origin is accepted.

    An ``OPTIONS`` request to ``/api/auth/me`` is sent with origin
    ``http://127.0.0.1:3080``, asking to use ``GET`` with the
    ``authorization`` header. The response must be 200, echo that origin as
    the allowed origin, and list ``authorization`` among the allowed headers.
    """
    frontend_origin = "http://127.0.0.1:3080"
    preflight_headers = {
        "Origin": frontend_origin,
        "Access-Control-Request-Method": "GET",
        "Access-Control-Request-Headers": "authorization",
    }

    client = TestClient(create_app(service=None, manage_service_lifecycle=False))
    response = client.options("/api/auth/me", headers=preflight_headers)

    assert response.status_code == 200
    assert response.headers["access-control-allow-origin"] == frontend_origin
    allowed_headers = response.headers["access-control-allow-headers"].lower()
    assert "authorization" in allowed_headers