feat(app): 移除内置 agents，添加 CORS 支持并优化技能上传

移除了agents/registry.json中的所有内置agents配置，将agents数组清空。为web应用添加了CORS中间件支持，允许指定的前端地址跨域访问。重构了技能上传功能，增加了LLM重写机制，自动规范化上传的技能格式。新增了工具名称提取逻辑，从技能正文中自动识别Required Tools段落。更新了技能学习候选者和草稿的载荷结构，添加评估报告统计信息。修改了意图路由技能的说明，改进任务状态管理逻辑。
2026-06-12 13:25:20 +08:00
parent fc9fd93c36
commit 8aeb97a5fc
76 changed files with 3382 additions and 553 deletions
--- a/app-instance/backend/tests/unit/test_initial_skill_tool_hints.py
+++ b/app-instance/backend/tests/unit/test_initial_skill_tool_hints.py
@ -4,6 +4,7 @@ import json
 from pathlib import Path

 from beaver.engine import EngineLoader
+from beaver.skills.authoring.format import is_canonical_skill_body
 from beaver.skills.catalog.utils import parse_frontmatter


@ -69,6 +70,16 @@ def test_skill_authoring_admin_is_seeded_but_not_initial() -> None:
        assert version["tool_hints"] == expected_tools


+def test_seeded_skill_bodies_use_canonical_format() -> None:
+    for index_name in ("published", "disabled"):
+        index = json.loads((REPO_ROOT / "skills" / "_index" / f"{index_name}.json").read_text(encoding="utf-8"))
+        for skill_name in index["items"]:
+            skill_dir = REPO_ROOT / "skills" / skill_name / "versions" / "v0001"
+            _frontmatter, body = parse_frontmatter((skill_dir / "SKILL.md").read_text(encoding="utf-8"))
+
+            assert is_canonical_skill_body(body), skill_name
+
+
 def test_default_runtime_registers_skill_view_tool(tmp_path: Path) -> None:
    loaded = EngineLoader(workspace=tmp_path).load()
    try:
--- a/app-instance/backend/tests/unit/test_main_agent_router.py
+++ b/app-instance/backend/tests/unit/test_main_agent_router.py
@ -87,6 +87,14 @@ def _task() -> TaskRecord:
    )


+def _weather_task() -> TaskRecord:
+    task = _task()
+    task.description = "珠海天气怎样"
+    task.goal = "珠海天气怎样"
+    task.metadata["short_title"] = "查询珠海天气"
+    return task
+
+
 def test_router_continues_active_task_from_llm_decision() -> None:
    provider = RouterProvider('{"action":"continue_task","reason":"related","short_title":"任务连续性"}')
    decision = asyncio.run(
@ -103,6 +111,35 @@ def test_router_continues_active_task_from_llm_decision() -> None:
    assert provider.calls[0]["max_tokens"] == 256


+def test_router_keeps_same_session_but_starts_new_task_for_standalone_weather_repeat() -> None:
+    decision = asyncio.run(
+        MainAgentRouter().classify(
+            "珠海天气怎么样",
+            active_task=_weather_task(),
+            provider=RouterProvider('{"action":"continue_task","reason":"neutral follow-up","short_title":"查询珠海天气"}'),
+        )
+    )
+
+    assert decision.is_task
+    assert decision.action == "create_task"
+    assert decision.starts_new_task is True
+    assert "fresh standalone task request" in decision.reason
+
+
+def test_router_allows_explicit_followup_to_continue_active_weather_task() -> None:
+    decision = asyncio.run(
+        MainAgentRouter().classify(
+            "顺便查一下深圳",
+            active_task=_weather_task(),
+            provider=RouterProvider('{"action":"continue_task","reason":"related follow-up","short_title":"查询珠海天气"}'),
+        )
+    )
+
+    assert decision.is_task
+    assert decision.action == "continue_task"
+    assert decision.starts_new_task is False
+
+
 def test_router_marks_revision_from_llm_decision() -> None:
    decision = asyncio.run(
        MainAgentRouter().classify(
@ -163,6 +200,8 @@ def test_router_prompt_treats_unrelated_lightweight_conversation_as_new_topic()
    prompt = provider.calls[0]["messages"][1]["content"]
    assert "unrelated lightweight conversation" in prompt
    assert "must not be classified as revise_task merely because the active Task is awaiting acceptance" in prompt
+    assert "A Session is the durable conversation/device/group context" in prompt
+    assert "Repeating '珠海天气怎么样' later is a new Task" in prompt


 def test_router_closes_active_task_from_llm_decision() -> None:
--- a/app-instance/backend/tests/unit/test_marketplace_and_mcp.py
+++ b/app-instance/backend/tests/unit/test_marketplace_and_mcp.py
@ -5,13 +5,40 @@ from types import SimpleNamespace

 import pytest

-from beaver.interfaces.web.app import _create_skill_upload_draft
+from beaver.engine.providers.base import LLMProvider, LLMResponse
+from beaver.interfaces.web.app import _create_skill_upload_draft, _rewrite_uploaded_skill_draft_with_llm
 from beaver.services.skillhub_service import SkillHubService
+from beaver.skills.authoring.format import is_canonical_skill_body
+from beaver.skills.catalog.utils import extract_required_tool_names
 from beaver.skills.drafts import DraftService
 from beaver.skills.specs import SkillSpecStore
 from beaver.tools.mcp.wrapper import MCPToolWrapper


+class RewriteProvider(LLMProvider):
+    def __init__(self) -> None:
+        super().__init__()
+        self.messages = []
+
+    async def chat(self, messages, tools=None, model=None, max_tokens=None, temperature=0.7, thinking_enabled=None):
+        self.messages = messages
+        return LLMResponse(
+            content="""{
+              "frontmatter": {
+                "name": "skill",
+                "description": "Use when uploaded skill guidance needs QA formatting.",
+                "tools": ["read_file"]
+              },
+              "content": "# Skill\\n\\n## Overview\\n\\nLLM rewritten overview.\\n\\n## When to Use\\n\\n- Use when testing upload rewrite.\\n\\n## Required Tools\\n\\n- `read_file`\\n\\n## Workflow\\n\\n- Follow the rewritten workflow.\\n\\n## Validation\\n\\n- Verify the result.\\n\\n## Boundaries\\n\\n- Stay in scope.\\n\\n## Anti-Patterns\\n\\n- Do not skip rewrite validation.\\n",
+              "change_reason": "normalized upload"
+            }""",
+            model=model,
+        )
+
+    def get_default_model(self):
+        return "rewrite-model"
+
+
 class FakeSkillHubService(SkillHubService):
    async def _get_json(self, path, *, params=None):
        if path == "/skills":
@ -99,6 +126,106 @@ def test_upload_skill_zip_keeps_supporting_files_on_draft(tmp_path):
    assert upload_dir.endswith(draft["draft_id"])


+def test_upload_skill_zip_canonicalizes_uploaded_skill_body(tmp_path):
+    store = SkillSpecStore(tmp_path)
+    loaded = SimpleNamespace(skill_spec_store=store, draft_service=DraftService(store))
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, "w") as archive:
+        archive.writestr(
+            "skill/SKILL.md",
+            "---\nname: skill\ndescription: raw upload\ntools:\n  - read_file\n---\nBody without our format.\n",
+        )
+
+    draft = _create_skill_upload_draft(loaded, "skill.zip", buffer.getvalue())
+
+    assert draft["proposed_frontmatter"]["name"] == "skill"
+    assert draft["proposed_frontmatter"]["tools"] == ["read_file"]
+    assert is_canonical_skill_body(draft["proposed_content"])
+
+
+def test_upload_skill_zip_infers_weather_web_tools_from_content(tmp_path):
+    store = SkillSpecStore(tmp_path)
+    loaded = SimpleNamespace(skill_spec_store=store, draft_service=DraftService(store))
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, "w") as archive:
+        archive.writestr(
+            "weather_search/skills.md",
+            "---\nname: weather-search\ndescription: weather lookup\n---\nLook up current weather and forecast for a city online.\n",
+        )
+
+    draft = _create_skill_upload_draft(loaded, "weather_search.zip", buffer.getvalue())
+
+    assert draft["proposed_frontmatter"]["tools"] == ["web_fetch", "web_search"]
+    assert extract_required_tool_names(draft["proposed_content"]) == ["web_fetch", "web_search"]
+    assert is_canonical_skill_body(draft["proposed_content"])
+
+
+def test_upload_skill_llm_rewrite_updates_draft(tmp_path):
+    store = SkillSpecStore(tmp_path)
+    draft_service = DraftService(store)
+    draft = draft_service.create_new_skill_draft(
+        skill_name="skill",
+        proposed_content="# Skill\n\n## Overview\n\nFallback.",
+        proposed_frontmatter={"name": "skill", "description": "fallback", "tools": ["read_file"]},
+        created_by="test",
+        reason="upload",
+    )
+    provider = RewriteProvider()
+    agent_service = SimpleNamespace(
+        _make_provider_bundle_for_task=lambda _loaded, _kwargs: SimpleNamespace(
+            main_provider=provider,
+            main_runtime=SimpleNamespace(model="rewrite-model"),
+        )
+    )
+    loaded = SimpleNamespace(skill_spec_store=store, draft_service=draft_service)
+
+    asyncio.run(_rewrite_uploaded_skill_draft_with_llm(agent_service, loaded, draft, filename="skill.zip"))
+    rewritten = draft_service.get_draft("skill", draft.draft_id)
+
+    assert rewritten is not None
+    assert "LLM rewritten overview" in rewritten.proposed_content
+    assert is_canonical_skill_body(rewritten.proposed_content)
+    assert "Canonical Beaver SKILL.md format" in provider.messages[1]["content"]
+    assert "Available runtime tool names" in provider.messages[1]["content"]
+
+
+def test_upload_skill_zip_accepts_nested_single_skill_directory(tmp_path):
+    store = SkillSpecStore(tmp_path)
+    loaded = SimpleNamespace(skill_spec_store=store, draft_service=DraftService(store))
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, "w") as archive:
+        archive.writestr(
+            "plugin/skills/nested-skill/SKILL.md",
+            "---\nname: nested-skill\ndescription: nested\n---\nBody\n",
+        )
+        archive.writestr("plugin/skills/nested-skill/references/a.txt", "context")
+        archive.writestr("plugin/README.md", "ignore package file")
+
+    draft = _create_skill_upload_draft(loaded, "plugin.zip", buffer.getvalue())
+
+    assert draft["skill_name"] == "nested-skill"
+    upload_dir = draft["evidence_refs"][0]["supporting_upload_dir"]
+    assert (tmp_path / "skills" / "nested-skill" / "draft_uploads" / draft["draft_id"] / "references" / "a.txt").read_text() == "context"
+    assert "README.md" not in draft["evidence_refs"][0]["files"]
+
+
+def test_upload_skill_zip_accepts_common_skill_markdown_name_aliases(tmp_path):
+    store = SkillSpecStore(tmp_path)
+    loaded = SimpleNamespace(skill_spec_store=store, draft_service=DraftService(store))
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, "w") as archive:
+        archive.writestr(
+            "weather_search/skills.md",
+            "---\nname: weather-search\ndescription: weather lookup\n---\nBody\n",
+        )
+
+    draft = _create_skill_upload_draft(loaded, "weather_search.zip", buffer.getvalue())
+
+    assert draft["skill_name"] == "weather-search"
+    assert draft["proposed_frontmatter"]["name"] == "weather-search"
+    assert is_canonical_skill_body(draft["proposed_content"])
+
+
 def test_mcp_wrapper_metadata_preserves_server_id_with_underscores():
    tool_def = SimpleNamespace(name="auth_status", description="Auth", inputSchema={"type": "object", "properties": {}})

--- a/app-instance/backend/tests/unit/test_phase5_skills_runtime.py
+++ b/app-instance/backend/tests/unit/test_phase5_skills_runtime.py
@ -184,7 +184,7 @@ def test_skill_lifecycle_publish_revision_and_rollback(tmp_path: Path) -> None:
    assert published.version == "v0002"
    assert store.get_current_version("release-checklist") == "v0002"

-    with pytest.raises(ValueError, match="approved"):
+    with pytest.raises(ValueError, match="submitted for review"):
        publisher.publish("release-checklist", revision.draft_id, publisher="reviewer", notes="duplicate")

    rolled_back = publisher.rollback("release-checklist", "v0001", actor="reviewer", reason="regression")
@ -529,6 +529,66 @@ def test_skill_learning_service_generates_new_skill_for_task_without_published_s
    assert candidates[0].source_run_ids == ["task-run-1"]


+def test_skill_learning_service_uses_original_task_text_for_new_skill_theme(tmp_path: Path) -> None:
+    store = SkillSpecStore(tmp_path)
+    run_store = RunMemoryStore(tmp_path / "memory" / "runs")
+    learning_store = SkillLearningStore(tmp_path / "memory" / "skills")
+    service = SkillLearningService(
+        run_store=run_store,
+        learning_store=learning_store,
+        draft_service=DraftService(store),
+        evidence_selector=EvidenceSelector(run_store),
+    )
+    now = datetime.now(timezone.utc).isoformat()
+    run_store.append_run_record(
+        RunRecord(
+            run_id="task-run-1",
+            session_id="session-task",
+            task_id="task-1",
+            attempt_index=1,
+            task_text="Compare direct production restart with staging rollout",
+            started_at=now,
+            ended_at=now,
+            success=False,
+            finish_reason="stop",
+            feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
+            activated_skills=[],
+            validation_result=None,
+        )
+    )
+    run_store.append_run_record(
+        RunRecord(
+            run_id="task-run-2",
+            session_id="session-task",
+            task_id="task-1",
+            attempt_index=2,
+            task_text="I do not see the docs",
+            started_at=now,
+            ended_at=now,
+            success=True,
+            finish_reason="stop",
+            feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
+            activated_skills=[],
+            validation_result={"accepted": True, "score": 0.9},
+        )
+    )
+
+    candidates = service.build_learning_candidates_for_task("task-1", trigger_run_id="task-run-2")
+
+    assert [candidate.candidate_id for candidate in candidates] == ["new:task:task-1"]
+    assert candidates[0].evidence["theme"] == "Compare direct production restart with staging rollout"
+    assert candidates[0].evidence["task_text"] == "Compare direct production restart with staging rollout"
+
+
+def test_task_theme_uses_first_sentence_for_chinese_text() -> None:
+    assert (
+        SkillLearningService._task_theme(
+            "帮我比较两种发布流程的风险：A 是直接重启线上容器，B 是先部署 staging 再切 production。请给出推荐方案、原因、验证步骤和回滚策略。"
+        )
+        == "帮我比较两种发布流程的风险：A 是直接重启线上容器，B 是先部署 staging 再切 production"
+    )
+
+
 def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None:
    skill = SkillContext(
        name="docker-debug",
--- a/app-instance/backend/tests/unit/test_skill_authoring_format.py
+++ b/app-instance/backend/tests/unit/test_skill_authoring_format.py
@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from beaver.skills.authoring.format import (
+    CANONICAL_SKILL_SECTION_HEADINGS,
+    canonical_skill_format_instructions,
+    canonicalize_skill_body,
+    is_canonical_skill_body,
+    parse_skill_rewrite_json,
+)
+
+
+def test_canonical_skill_body_contains_required_sections() -> None:
+    body = canonicalize_skill_body(
+        title="Filesystem Operation",
+        overview="Read and update project files safely.",
+        tools=["read_file", "write_file"],
+        workflow=["Inspect the file before editing.", "Use the smallest safe edit."],
+        validation=["Re-read changed files before reporting completion."],
+        boundaries=["Do not edit files outside the workspace."],
+        anti_patterns=["Do not overwrite files without reading them first."],
+    )
+
+    assert is_canonical_skill_body(body)
+    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
+        assert heading in body
+
+
+def test_canonical_skill_format_instructions_are_prompt_ready() -> None:
+    instructions = canonical_skill_format_instructions()
+
+    assert "Canonical Beaver SKILL.md format" in instructions
+    assert "frontmatter" in instructions
+    assert "name" in instructions
+    assert "description" in instructions
+    assert "tools" in instructions
+    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
+        assert heading in instructions
+
+
+def test_parse_skill_rewrite_json_backfills_frontmatter_tools_from_required_tools_section() -> None:
+    payload = parse_skill_rewrite_json(
+        """{
+          "frontmatter": {
+            "name": "weather-search",
+            "description": "weather lookup",
+            "tools": []
+          },
+          "content": "# Weather Search\\n\\n## Overview\\n\\nLook up weather.\\n\\n## When to Use\\n\\n- Weather requests.\\n\\n## Required Tools\\n\\n- `web_fetch`\\n- `web_search`\\n\\n## Workflow\\n\\n- Fetch current weather.\\n\\n## Validation\\n\\n- Check source freshness.\\n\\n## Boundaries\\n\\n- Do not guess.\\n\\n## Anti-Patterns\\n\\n- Do not fabricate data.\\n"
+        }""",
+        skill_name="weather-search",
+    )
+
+    assert payload is not None
+    assert payload["frontmatter"]["tools"] == ["web_fetch", "web_search"]
--- a/app-instance/backend/tests/unit/test_skill_learning_eval.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_eval.py
@ -19,8 +19,22 @@ from beaver.skills.specs import SkillSpecStore


 class StubProvider(LLMProvider):
-    async def chat(self, messages: list[dict], tools: list[dict] | None = None, model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7) -> LLMResponse:
-        return LLMResponse(content="ok")
+    def __init__(self, content: str = "ok") -> None:
+        super().__init__()
+        self.content = content
+        self.calls: list[dict] = []
+
+    async def chat(
+        self,
+        messages: list[dict],
+        tools: list[dict] | None = None,
+        model: str | None = None,
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+        thinking_enabled: bool | None = None,
+    ) -> LLMResponse:
+        self.calls.append({"messages": messages, "model": model, "max_tokens": max_tokens, "temperature": temperature})
+        return LLMResponse(content=self.content)

    def get_default_model(self) -> str:
        return "stub"
@ -92,7 +106,6 @@ def test_eval_pass_allows_publish_after_safety_and_review(tmp_path: Path) -> Non
    report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
    safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
    published = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")

    assert report.passed is True
@ -114,7 +127,6 @@ def test_eval_regression_blocks_publish(tmp_path: Path) -> None:
    report = asyncio.run(pipeline.evaluate_draft("candidate-1", draft.skill_name, draft.draft_id, provider_bundle=_bundle()))
    pipeline.check_safety(draft.skill_name, draft.draft_id)
    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")

    assert report.passed is False
    assert pipeline.get_candidate("candidate-1").status == "eval_failed"
@ -160,7 +172,14 @@ def test_eval_does_not_clear_safety_failed_status(tmp_path: Path) -> None:


 class FakeReplayRunner:
+    def __init__(self, *, baseline_answer: str = "done", candidate_answer: str = "done") -> None:
+        self.baseline_answer = baseline_answer
+        self.candidate_answer = candidate_answer
+        self.requests = []
+
    async def run_arm(self, request):
+        self.requests.append(request)
+        final_answer = self.candidate_answer if request.arm == "candidate" else self.baseline_answer
        return {
            "case_id": request.case_id,
            "arm": request.arm,
@ -168,7 +187,7 @@ class FakeReplayRunner:
            "run_id": f"{request.arm}-run",
            "task_text": request.task_text,
            "finish_reason": "stop",
-            "final_answer": "done",
+            "final_answer": final_answer,
            "tool_calls": [
                {
                    "tool_name": "write_file",
@ -213,3 +232,102 @@ def test_eval_report_includes_replay_case_and_coverage(tmp_path: Path) -> None:
    assert 0.0 <= report.execution_coverage <= 1.0
    assert 0.0 <= report.surrogate_coverage <= 1.0
    assert report.confidence in {"low", "medium", "high"}
+    assert "ability_score" in report.case_reports[0]
+    assert "tool_execution_score" in report.case_reports[0]
+    assert report.ability_score_summary["score_role"] == "primary"
+    assert report.tool_execution_summary["score_role"] == "diagnostic_only"
+
+
+def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    pipeline.learning_store.update_learning_candidate(
+        "candidate-1",
+        evidence={
+            "eval_cases": [
+                {
+                    "run_id": "validator-case",
+                    "task_id": "validator-case",
+                    "session_id": "eval",
+                    "task_text": "Write the release verdict.",
+                    "validator": {
+                        "type": "final_answer_contains",
+                        "required_terms": ["ship"],
+                        "forbidden_terms": ["do not ship"],
+                    },
+                    "accepted_score": 0.5,
+                }
+            ]
+        },
+    )
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="release-checklist",
+        proposed_content="# Release\n\nRun tests.",
+        proposed_frontmatter={"description": "release", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.update_learning_candidate("candidate-1", draft_skill_name=draft.skill_name, draft_id=draft.draft_id)
+
+    report = asyncio.run(
+        pipeline.evaluate_draft(
+            "candidate-1",
+            draft.skill_name,
+            draft.draft_id,
+            provider_bundle=_bundle(),
+            replay_runner=FakeReplayRunner(
+                baseline_answer="Do not ship. Tests are failing.",
+                candidate_answer="Ship after smoke tests pass.",
+            ),
+        )
+    )
+
+    case = report.case_reports[0]
+    assert case["tool_execution_score"]["baseline_score"] == 0.85
+    assert case["tool_execution_score"]["candidate_score"] == 0.85
+    assert case["baseline_score"] < case["candidate_score"]
+    assert report.tool_mode_summary["score_role"] == "diagnostic_only"
+    assert report.ability_score_summary["score_role"] == "primary"
+    assert report.real_score_avg is not None
+    assert report.synthetic_score_avg is not None
+
+
+def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    pipeline.learning_store.update_learning_candidate(
+        "candidate-1",
+        evidence={
+            "eval_cases": [
+                {
+                    "run_id": "synthetic:no-validator",
+                    "task_id": "synthetic-no-validator",
+                    "session_id": "synthetic-eval",
+                    "task_text": "Synthetic task without an oracle.",
+                    "synthetic": True,
+                    "accepted_score": 0.75,
+                }
+            ]
+        },
+    )
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="release-checklist",
+        proposed_content="# Release\n\nRun tests.",
+        proposed_frontmatter={"description": "release", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.update_learning_candidate("candidate-1", draft_skill_name=draft.skill_name, draft_id=draft.draft_id)
+    replay_runner = FakeReplayRunner()
+
+    report = asyncio.run(
+        pipeline.evaluate_draft(
+            "candidate-1",
+            draft.skill_name,
+            draft.draft_id,
+            provider_bundle=_bundle(),
+            replay_runner=replay_runner,
+        )
+    )
+
+    assert "synthetic:no-validator" not in {case["run_id"] for case in report.case_reports}
+    assert all("synthetic:no-validator" not in request.case_id for request in replay_runner.requests)
+    assert report.case_selection_summary["excluded_synthetic_without_validator"] == 1
--- a/app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_eval_report_model.py
@ -31,6 +31,12 @@ def test_eval_report_defaults_preserve_legacy_payload_shape() -> None:
    assert payload["confidence"] == "low"
    assert payload["case_reports"] == []
    assert payload["tool_mode_summary"] == {}
+    assert payload["ability_score_summary"] == {}
+    assert payload["tool_execution_summary"] == {}
+    assert payload["case_selection_summary"] == {}
+    assert payload["real_score_avg"] is None
+    assert payload["synthetic_score_avg"] is None
+    assert payload["overall_score_avg"] is None
    assert payload["preservation_report"] is None
    assert payload["cases"] == [{"run_id": "run-1"}]

@ -59,3 +65,37 @@ def test_eval_report_reads_legacy_payload_without_replay_fields() -> None:
    assert report.mode == "heuristic"
    assert report.confidence == "low"
    assert report.case_reports == []
+
+
+def test_eval_report_persists_ability_and_case_split_fields() -> None:
+    report = SkillDraftEvalReport(
+        report_id="eval-replay",
+        skill_name="debug",
+        draft_id="draft-1",
+        candidate_id="candidate-1",
+        passed=True,
+        baseline_score_avg=0.5,
+        candidate_score_avg=0.8,
+        score_delta=0.3,
+        regression_count=0,
+        improved_count=1,
+        unchanged_count=0,
+        mode="replay",
+        eval_version="replay-v2",
+        real_score_avg=0.9,
+        synthetic_score_avg=0.6,
+        overall_score_avg=0.8,
+        ability_score_summary={"score_role": "primary", "real_case_count": 1},
+        tool_execution_summary={"score_role": "diagnostic_only", "executed": 1.0},
+        case_selection_summary={"excluded_synthetic_without_validator": 2},
+    )
+
+    payload = report.to_dict()
+    restored = SkillDraftEvalReport.from_dict(payload)
+
+    assert payload["real_score_avg"] == 0.9
+    assert payload["synthetic_score_avg"] == 0.6
+    assert payload["overall_score_avg"] == 0.8
+    assert restored.ability_score_summary == {"score_role": "primary", "real_case_count": 1}
+    assert restored.tool_execution_summary == {"score_role": "diagnostic_only", "executed": 1.0}
+    assert restored.case_selection_summary == {"excluded_synthetic_without_validator": 2}
--- a/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
@ -55,14 +55,12 @@ def test_pipeline_lists_candidates_and_moves_draft_through_review(tmp_path: Path
        reason="test",
    )

-    review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    approved = pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
    safety = pipeline.check_safety(draft.skill_name, draft.draft_id)
+    review = pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
    version = pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")

    assert pipeline.list_candidates()[0].candidate_id == "candidate-1"
    assert review.status == SkillReviewState.IN_REVIEW.value
-    assert approved.status == SkillReviewState.APPROVED.value
    assert safety.passed is True
    assert version.skill_name == "new-skill"
    assert pipeline.get_draft(draft.skill_name, draft.draft_id).status == SkillReviewState.PUBLISHED.value
@ -93,7 +91,6 @@ def test_pipeline_does_not_resubmit_terminal_draft(tmp_path: Path) -> None:
    )

    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
    pipeline.check_safety(draft.skill_name, draft.draft_id)
    pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")

@ -165,7 +162,6 @@ def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
        )
    )
    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
    pipeline.check_safety(draft.skill_name, draft.draft_id)

    with pytest.raises(ValueError, match="low confidence"):
@ -201,7 +197,6 @@ def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
        )
    )
    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
    pipeline.check_safety(draft.skill_name, draft.draft_id)

    with pytest.raises(ValueError, match="preservation"):
--- a/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_replay_runner.py
@ -16,6 +16,25 @@ class FakeAgentLoop:
        return SimpleNamespace(session_id="session-replay", run_id="run-replay", output_text="done", finish_reason="stop")


+class FakeRunningAgentLoop(FakeAgentLoop):
+    def __init__(self) -> None:
+        self.process_direct_calls = 0
+        self.submit_direct_calls: list[tuple[str, dict]] = []
+
+    async def process_direct(self, task: str, **kwargs):
+        self.process_direct_calls += 1
+        raise RuntimeError(
+            "AgentLoop.process_direct() is disabled while run() is active; "
+            "submit tasks via submit_direct() instead."
+        )
+
+    async def submit_direct(self, task: str, **kwargs):
+        self.submit_direct_calls.append((task, kwargs))
+        executor = kwargs["tool_executor_override"]
+        await executor.execute("mcp_outlook_send_email", {"to": "ada@example.com"})
+        return SimpleNamespace(session_id="session-queued", run_id="run-queued", output_text="queued done", finish_reason="stop")
+
+
 def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
    runner = ReplayRunner(agent_loop=FakeAgentLoop())
    request = ReplayArmRequest(
@ -34,3 +53,33 @@ def test_replay_runner_returns_arm_report_with_tool_trace() -> None:
    assert report["arm"] == "candidate"
    assert report["finish_reason"] == "stop"
    assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
+
+
+def test_replay_runner_queues_arm_when_agent_loop_is_running() -> None:
+    agent_loop = FakeRunningAgentLoop()
+    runner = ReplayRunner(agent_loop=agent_loop)
+    request = ReplayArmRequest(
+        case_id="case-queued",
+        arm="baseline",
+        task_text="Send a status email to Ada.",
+        pinned_skill_names=["filesystem-operation"],
+        pinned_skill_contexts=[{"name": "filesystem-operation"}],
+        provider_bundle=object(),
+        model_settings={"max_tool_iterations": 3, "temperature": 0.1},
+    )
+
+    report = asyncio.run(runner.run_arm(request))
+
+    assert agent_loop.process_direct_calls == 1
+    assert len(agent_loop.submit_direct_calls) == 1
+    queued_task, queued_kwargs = agent_loop.submit_direct_calls[0]
+    assert queued_task == "Send a status email to Ada."
+    assert queued_kwargs["source"] == "skill_replay_eval"
+    assert queued_kwargs["include_skill_assembly"] is False
+    assert queued_kwargs["include_tools"] is True
+    assert queued_kwargs["pinned_skill_names"] == ["filesystem-operation"]
+    assert queued_kwargs["max_tool_iterations"] == 3
+    assert queued_kwargs["temperature"] == 0.1
+    assert report["session_id"] == "session-queued"
+    assert report["run_id"] == "run-queued"
+    assert report["tool_calls"][0]["tool_name"] == "mcp_outlook_send_email"
--- a/app-instance/backend/tests/unit/test_skill_learning_safety.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_safety.py
@ -74,7 +74,6 @@ def test_safety_marks_dangerous_tools_high_and_requires_confirm(tmp_path: Path)

    report = pipeline.check_safety(draft.skill_name, draft.draft_id)
    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")

    assert report.passed is True
    assert report.risk_level == "high"
@ -94,7 +93,6 @@ def test_publish_requires_safety_report(tmp_path: Path) -> None:
        reason="test",
    )
    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
-    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")

    with pytest.raises(ValueError, match="safety report"):
        pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
--- a/app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_synthesizer_preservation.py
@ -1,6 +1,7 @@
 from __future__ import annotations

 from beaver.memory.skills import SkillLearningCandidate
+from beaver.skills.authoring.format import CANONICAL_SKILL_SECTION_HEADINGS
 from beaver.skills.learning.evidence import EvidencePacket
 from beaver.skills.learning.synthesizer import SkillDraftSynthesizer

@ -39,3 +40,6 @@ def test_revision_prompt_includes_base_skill_snapshot() -> None:
    assert "Do not delete files." in prompt
    assert "preserved_sections" in prompt
    assert "dropped_sections" in prompt
+    assert "Canonical Beaver SKILL.md format" in prompt
+    for heading in CANONICAL_SKILL_SECTION_HEADINGS:
+        assert heading in prompt
--- a/app-instance/backend/tests/unit/test_skill_learning_web_api.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_web_api.py
@ -1,12 +1,37 @@
 from __future__ import annotations

 from pathlib import Path
+from types import SimpleNamespace

 from fastapi.testclient import TestClient

+from beaver.memory.runs import RunRecord
 from beaver.interfaces.web.app import create_app
-from beaver.memory.skills import SkillLearningCandidate
+from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
 from beaver.services.agent_service import AgentService
+from beaver.skills.specs import SkillVersion
+
+
+class StubEvaluator:
+    """Evaluator stand-in that counts calls and returns a fixed passing report."""
+
+    def __init__(self) -> None:
+        # Number of times evaluate() has been awaited; tests assert on it.
+        self.calls = 0
+
+    async def evaluate(self, *, candidate, draft, provider_bundle, replay_runner=None):
+        """Record one call and return a canned ``SkillDraftEvalReport``.
+
+        The report copies ``skill_name`` and ``draft_id`` from ``draft`` and
+        ``candidate_id`` from ``candidate``; every other field is a fixed
+        value. ``provider_bundle`` and ``replay_runner`` are accepted but
+        not used.
+        """
+        self.calls += 1
+        return SkillDraftEvalReport(
+            report_id="eval-existing",
+            skill_name=draft.skill_name,
+            draft_id=draft.draft_id,
+            candidate_id=candidate.candidate_id,
+            passed=True,
+            baseline_score_avg=0.5,
+            candidate_score_avg=0.8,
+            score_delta=0.3,
+            regression_count=0,
+            improved_count=1,
+            unchanged_count=0,
+            status="completed",
+        )


 def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
@ -31,3 +56,191 @@ def test_skill_learning_candidates_and_run_once_api(tmp_path: Path) -> None:
    assert candidates[0]["candidate_id"] == "candidate-1"
    assert "risk_level" in candidates[0]
    assert run_once["processed"] >= 0
+
+
+def test_skill_learning_candidates_payload_prefers_original_task_text(tmp_path: Path) -> None:
+    """Check that the candidates API reports the first attempt's task text.
+
+    Two runs are stored for the same task, and the candidate's stored theme
+    is the feedback text. The ``/api/skills/candidates`` payload is expected
+    to expose the first run's ``task_text`` as both ``evidence.theme`` and
+    ``evidence.task_text``.
+    """
+    service = AgentService(workspace=tmp_path)
+    loaded = service.create_loop().boot()
+    now = "2026-06-11T00:00:00+00:00"
+    # First attempt: carries the original task text and gets "revise" feedback.
+    loaded.skill_learning_service.run_store.append_run_record(  # type: ignore[union-attr]
+        RunRecord(
+            run_id="run-original",
+            session_id="session-task",
+            task_id="task-1",
+            attempt_index=1,
+            task_text="Compare direct production restart with staging rollout",
+            started_at=now,
+            ended_at=now,
+            success=False,
+            finish_reason="stop",
+            feedback={"feedback_type": "revise", "comment": "I do not see the docs"},
+            activated_skills=[],
+            validation_result=None,
+        )
+    )
+    # Second attempt: its task_text is the feedback comment from the first one.
+    loaded.skill_learning_service.run_store.append_run_record(  # type: ignore[union-attr]
+        RunRecord(
+            run_id="run-final",
+            session_id="session-task",
+            task_id="task-1",
+            attempt_index=2,
+            task_text="I do not see the docs",
+            started_at=now,
+            ended_at=now,
+            success=True,
+            finish_reason="stop",
+            feedback={"feedback_type": "satisfied", "acceptance_type": "accept"},
+            activated_skills=[],
+            validation_result={"accepted": True, "score": 0.9},
+        )
+    )
+    # The stored theme is the feedback text, not the original request.
+    loaded.skill_learning_store.record_learning_candidate(  # type: ignore[union-attr]
+        SkillLearningCandidate(
+            candidate_id="new:task:task-1",
+            kind="new_skill",
+            source_run_ids=["run-original", "run-final"],
+            source_session_ids=["session-task"],
+            related_skill_names=[],
+            reason="test",
+            evidence={"task_id": "task-1", "theme": "i do not see the docs"},
+        )
+    )
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        candidates = client.get("/api/skills/candidates").json()
+
+    payload = next(item for item in candidates if item["candidate_id"] == "new:task:task-1")
+    assert payload["evidence"]["theme"] == "Compare direct production restart with staging rollout"
+    assert payload["evidence"]["task_text"] == "Compare direct production restart with staging rollout"
+
+
+def test_generate_draft_does_not_run_review_checks(tmp_path: Path, monkeypatch) -> None:
+    """Check that the candidate draft endpoint skips safety and eval checks.
+
+    The candidate already points at an existing draft. Posting to
+    ``/api/skills/candidates/<id>/draft`` is expected to return that draft
+    with no safety or eval report and without calling the evaluator.
+    """
+    service = AgentService(workspace=tmp_path)
+    loaded = service.create_loop().boot()
+    draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
+        skill_name="filesystem-operation",
+        proposed_content="# Filesystem Operation\n\nUse files safely.",
+        proposed_frontmatter={"description": "filesystem", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    # Candidate is stored as "draft_ready" and linked to the draft created above.
+    loaded.skill_learning_store.record_learning_candidate(  # type: ignore[union-attr]
+        SkillLearningCandidate(
+            candidate_id="candidate-existing",
+            kind="revise_skill",
+            source_run_ids=["run-1"],
+            source_session_ids=["session-1"],
+            related_skill_names=["filesystem-operation"],
+            reason="revise",
+            status="draft_ready",
+            draft_skill_name=draft.skill_name,
+            draft_id=draft.draft_id,
+        )
+    )
+    evaluator = StubEvaluator()
+    loaded.skill_learning_pipeline.evaluator = evaluator  # type: ignore[union-attr]
+    # Stub the service's provider-bundle factory with a placeholder object.
+    monkeypatch.setattr(
+        service,
+        "_make_provider_bundle_for_task",
+        lambda loaded, kwargs: SimpleNamespace(main_provider=object()),
+    )
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        response = client.post("/api/skills/candidates/candidate-existing/draft")
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert evaluator.calls == 0
+    assert payload["draft_id"] == draft.draft_id
+    assert payload["safety_report"] is None
+    assert payload["eval_report"] is None
+    # The pipeline itself must not hold an eval report for the draft either.
+    assert loaded.skill_learning_pipeline.get_eval_report(draft.skill_name, draft.draft_id) is None  # type: ignore[union-attr]
+
+
+def test_submit_draft_runs_safety_and_eval(tmp_path: Path, monkeypatch) -> None:
+    """Check that submitting a draft runs the safety check and the evaluator.
+
+    Posting to ``/api/skills/<skill>/drafts/<draft>/submit`` is expected to
+    call the stub evaluator exactly once and return a payload with status
+    ``in_review``, a passing safety report, and the stub's eval report.
+    """
+    service = AgentService(workspace=tmp_path)
+    loaded = service.create_loop().boot()
+    draft = loaded.skill_learning_pipeline.draft_service.create_new_skill_draft(  # type: ignore[union-attr]
+        skill_name="filesystem-operation",
+        proposed_content="# Filesystem Operation\n\nUse files safely.",
+        proposed_frontmatter={"description": "filesystem", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    # Candidate is stored as "draft_ready" and linked to the draft created above.
+    loaded.skill_learning_store.record_learning_candidate(  # type: ignore[union-attr]
+        SkillLearningCandidate(
+            candidate_id="candidate-existing",
+            kind="revise_skill",
+            source_run_ids=["run-1"],
+            source_session_ids=["session-1"],
+            related_skill_names=["filesystem-operation"],
+            reason="revise",
+            status="draft_ready",
+            draft_skill_name=draft.skill_name,
+            draft_id=draft.draft_id,
+        )
+    )
+    evaluator = StubEvaluator()
+    loaded.skill_learning_pipeline.evaluator = evaluator  # type: ignore[union-attr]
+    # Stub the service's provider-bundle factory with a placeholder object.
+    monkeypatch.setattr(
+        service,
+        "_make_provider_bundle_for_task",
+        lambda loaded, kwargs: SimpleNamespace(main_provider=object()),
+    )
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        response = client.post(f"/api/skills/{draft.skill_name}/drafts/{draft.draft_id}/submit")
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert evaluator.calls == 1
+    assert payload["status"] == "in_review"
+    assert payload["safety_report"]["passed"] is True
+    # "eval-existing" is the report_id hard-coded in StubEvaluator.evaluate.
+    assert payload["eval_report"]["report_id"] == "eval-existing"
+
+
+def test_draft_payload_includes_target_version_for_revision(tmp_path: Path) -> None:
+    """Check the drafts listing payload for a revision draft.
+
+    With ``v0001`` stored as the current version and a revision draft based
+    on it, ``/api/skills/drafts`` is expected to report the draft as
+    ``revise_skill`` with base version ``v0001``, target version ``v0002``,
+    and a ``base_skill`` snapshot holding the ``v0001`` content and frontmatter.
+    """
+    service = AgentService(workspace=tmp_path)
+    loaded = service.create_loop().boot()
+    # Seed v0001 and mark it current before creating a revision draft on top of it.
+    loaded.skill_spec_store.write_skill_version(  # type: ignore[union-attr]
+        SkillVersion(
+            skill_name="filesystem-operation",
+            version="v0001",
+            content_hash="hash-v1",
+            summary_hash="summary-v1",
+            created_at="2026-06-01T00:00:00+00:00",
+            created_by="test",
+            change_reason="initial",
+            parent_version=None,
+            review_state="published",
+            frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
+            summary="filesystem",
+            tool_hints=[],
+        ),
+        "# Filesystem Operation\n\nUse files.",
+    )
+    loaded.skill_spec_store.set_current_version("filesystem-operation", "v0001")  # type: ignore[union-attr]
+    draft = loaded.skill_learning_pipeline.draft_service.create_revision_draft(  # type: ignore[union-attr]
+        skill_name="filesystem-operation",
+        base_version="v0001",
+        proposed_content="# Filesystem Operation\n\nUse files better.",
+        proposed_frontmatter={"description": "filesystem", "name": "filesystem-operation", "tools": []},
+        created_by="test",
+        reason="revise",
+    )
+    app = create_app(service=service, manage_service_lifecycle=False)
+
+    with TestClient(app) as client:
+        response = client.get("/api/skills/drafts")
+
+    assert response.status_code == 200
+    payload = next(item for item in response.json() if item["draft_id"] == draft.draft_id)
+    assert payload["proposal_kind"] == "revise_skill"
+    assert payload["base_version"] == "v0001"
+    assert payload["target_version"] == "v0002"
+    # The base snapshot must carry the published v0001 body, not the proposed one.
+    assert payload["base_skill"]["version"] == "v0001"
+    assert payload["base_skill"]["content"] == "# Filesystem Operation\n\nUse files."
+    assert payload["base_skill"]["frontmatter"]["name"] == "filesystem-operation"
--- a/app-instance/backend/tests/unit/test_skill_learning_worker.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_worker.py
@ -10,6 +10,7 @@ from beaver.engine.providers.factory import ProviderBundle
 from beaver.engine.session import SessionManager
 from beaver.memory.runs import RunMemoryStore, RunRecord
 from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
+from beaver.skills.authoring.format import is_canonical_skill_body
 from beaver.skills.drafts import DraftService
 from beaver.skills.learning import (
    EvidenceSelector,
@ -48,6 +49,33 @@ def _bundle(provider: LLMProvider) -> ProviderBundle:
    return ProviderBundle(main_runtime=runtime, main_provider=provider)  # type: ignore[arg-type]


+class FakeReplayRunner:
+    """Replay runner stand-in that records requests and returns a canned arm report."""
+
+    def __init__(self) -> None:
+        # Every request passed to run_arm, in call order.
+        self.requests = []
+
+    async def run_arm(self, request):
+        """Store ``request`` and return a fixed report dictionary.
+
+        ``case_id``, ``arm`` and ``task_text`` are copied from the request,
+        and ``run_id`` is derived from the arm name. The remaining fields,
+        including a single executed ``echo`` tool call, are constants.
+        """
+        self.requests.append(request)
+        return {
+            "case_id": request.case_id,
+            "arm": request.arm,
+            "session_id": "session-replay",
+            "run_id": f"{request.arm}-run",
+            "task_text": request.task_text,
+            "finish_reason": "stop",
+            "final_answer": "debug deployment startup done",
+            "tool_calls": [
+                {
+                    "tool_name": "echo",
+                    "mode": "executed",
+                    "arguments": {"text": "ok"},
+                    "result": {"success": True, "content": "ok"},
+                }
+            ],
+            "artifacts": [],
+            "side_effects": [],
+        }
+
+
 def _pipeline(tmp_path: Path) -> SkillLearningPipelineService:
    spec_store = SkillSpecStore(tmp_path)
    run_store = RunMemoryStore(tmp_path / "memory" / "runs")
@ -109,6 +137,28 @@ def test_worker_synthesizes_open_candidate_without_publish(tmp_path: Path) -> No
    assert pipeline.list_drafts(candidate.draft_skill_name)[0].status == "draft"


+def test_worker_evaluates_draft_with_replay_runner_when_available(tmp_path: Path) -> None:
+    """Check that a worker given a replay runner factory produces a replay eval report.
+
+    After one ``run_once`` pass, the draft linked to ``candidate-1`` is
+    expected to have an eval report with mode ``"replay"`` and non-empty
+    case reports, and the fake runner must have received at least one request.
+    """
+    # NOTE(review): "candidate-1" is presumably seeded by _pipeline(), whose
+    # body is outside this hunk - confirm.
+    pipeline = _pipeline(tmp_path)
+    replay_runner = FakeReplayRunner()
+    worker = SkillLearningWorker(
+        pipeline=pipeline,
+        provider_bundle_factory=lambda: _bundle(JsonProvider()),
+        replay_runner_factory=lambda: replay_runner,
+        config=SkillLearningWorkerConfig(max_drafts_per_run=5, max_retries=3, interval_seconds=1),
+    )
+
+    result = asyncio.run(worker.run_once())
+    candidate = pipeline.get_candidate("candidate-1")
+    # The `or ""` fallbacks substitute an empty string for unset candidate fields.
+    draft = pipeline.get_draft(candidate.draft_skill_name or "", candidate.draft_id or "")
+    report = pipeline.get_eval_report(draft.skill_name, draft.draft_id)
+
+    assert result.succeeded == 1
+    assert report is not None
+    assert report.mode == "replay"
+    assert report.case_reports
+    assert replay_runner.requests
+
+
 def test_worker_retries_and_marks_failed_after_limit(tmp_path: Path) -> None:
    pipeline = _pipeline(tmp_path)
    worker = SkillLearningWorker(
@ -147,6 +197,7 @@ def test_synthesizer_fills_missing_tools_from_evidence(tmp_path: Path) -> None:
    )

    assert payload["frontmatter"]["tools"] == ["web_fetch", "memory"]
+    assert is_canonical_skill_body(payload["content"])


 def test_evidence_selector_records_run_tool_names(tmp_path: Path) -> None:
--- a/app-instance/backend/tests/unit/test_task_mode_feedback.py
+++ b/app-instance/backend/tests/unit/test_task_mode_feedback.py
@ -218,6 +218,45 @@ def test_unrelated_new_task_auto_accepts_previous_task(tmp_path: Path) -> None:
    assert current.run_ids == [second.run_id]


+def test_standalone_realtime_repeat_creates_new_task_in_same_session(tmp_path: Path) -> None:
+    """Check that a repeated weather question opens a new task in the same session.
+
+    Two near-identical weather questions are sent on one session id. The
+    second call is expected to yield a different task: the first task ends
+    up ``closed`` with only the first run, and the second task is
+    ``awaiting_acceptance`` with only the second run.
+    """
+    service = AgentService(
+        loader=EngineLoader(
+            workspace=tmp_path,
+            task_execution_planner=StubTaskExecutionPlanner(),
+        )
+    )
+    session_id = "feishu:group-weather"
+    first = asyncio.run(
+        service.process_direct(
+            "珠海天气怎样",
+            session_id=session_id,
+            provider_bundle=_bundle("Weather result"),
+        )
+    )
+
+    # NOTE(review): route_action="continue_task" presumably makes the stubbed
+    # provider answer the routing step with "continue_task"; _bundle is
+    # defined outside this hunk - confirm.
+    second = asyncio.run(
+        service.process_direct(
+            "珠海天气怎么样",
+            session_id=session_id,
+            provider_bundle=_bundle("Fresh weather result", route_action="continue_task"),
+        )
+    )
+
+    task_service = service.create_loop().boot().task_service
+    assert task_service is not None
+    previous = task_service.get_task(first.task_id or "")
+    current = task_service.get_task(second.task_id or "")
+    assert previous is not None
+    assert current is not None
+    # Both tasks stay on the same session but must be distinct task records.
+    assert previous.session_id == session_id
+    assert current.session_id == session_id
+    assert current.task_id != previous.task_id
+    assert previous.status == "closed"
+    assert previous.run_ids == [first.run_id]
+    assert current.status == "awaiting_acceptance"
+    assert current.run_ids == [second.run_id]
+
+
 def test_related_follow_up_continues_active_task_without_accepting_it(tmp_path: Path) -> None:
    service = AgentService(
        loader=EngineLoader(
--- a/app-instance/backend/tests/unit/test_tool_assembler.py
+++ b/app-instance/backend/tests/unit/test_tool_assembler.py
@ -102,6 +102,58 @@ tools:
    assert [spec.name for spec in selected] == ["memory", "terminal", "search_files"]


+def test_tool_assembler_uses_required_tools_section_when_frontmatter_omits_tools(tmp_path: Path) -> None:
+    """Check that tool hints come from the body's ``## Required Tools`` section.
+
+    The SKILL.md written here has no ``tools`` key in its frontmatter and
+    lists ``terminal`` and ``search_files`` under ``## Required Tools``.
+    The loaded record's ``tool_hints`` and the assembled tool selection are
+    both expected to include those two tools.
+    """
+    skill_dir = tmp_path / "skills" / "docker-debug"
+    skill_dir.mkdir(parents=True)
+    (skill_dir / "SKILL.md").write_text(
+        """---
+name: docker-debug
+description: Debug Docker issues.
+---
+
+# Docker Debug
+
+## Overview
+
+Debug Docker issues.
+
+## Required Tools
+
+- `terminal`
+- `search_files`
+
+## Workflow
+
+Inspect logs and search related files.
+""",
+        encoding="utf-8",
+    )
+
+    registry = ToolRegistry()
+    registry.register(DummyTool("memory", toolset="memory", always_available=True))
+    registry.register(DummyTool("terminal", toolset="shell"))
+    registry.register(DummyTool("search_files", toolset="file"))
+    registry.register(DummyTool("echo", toolset="debug"))
+
+    assembler = ToolAssembler(retriever=StaticRetriever())
+    loader = SkillsLoader(tmp_path)
+    record = loader.get_skill_record("docker-debug")
+    assert record is not None
+    # Hints are expected in the order the tools are listed in the body.
+    assert record.tool_hints == ["terminal", "search_files"]
+
+    selected = asyncio.run(
+        assembler.assemble(
+            task_description="排查 Docker 容器日志",
+            registry=registry,
+            skills_loader=loader,
+            activated_skills=[SkillContext(name="docker-debug", content="", tool_hints=record.tool_hints)],
+            top_k=1,
+        )
+    )
+
+    # NOTE(review): "echo" is presumably the single top_k=1 pick from
+    # StaticRetriever, which is defined outside this hunk - confirm.
+    assert [spec.name for spec in selected] == ["memory", "terminal", "search_files", "echo"]
+
+
 def test_embedding_fallback_can_return_all_or_top_k() -> None:
    candidates = [{"name": f"tool_{index}", "description": "", "input_schema": "{}"} for index in range(3)]
    retriever = EmbeddingRetriever(api_key_env="MISSING_EMBEDDING_KEY", api_base_env="MISSING_EMBEDDING_BASE")
--- a/app-instance/backend/tests/unit/test_web_cors.py
+++ b/app-instance/backend/tests/unit/test_web_cors.py
@ -0,0 +1,21 @@
+from fastapi.testclient import TestClient
+
+from beaver.interfaces.web.app import create_app
+
+
+def test_local_frontend_origin_can_preflight_api_requests() -> None:
+    """Check that a CORS preflight from ``http://127.0.0.1:3080`` is accepted.
+
+    The response to ``OPTIONS /api/auth/me`` is expected to be 200, to echo
+    the origin in ``access-control-allow-origin``, and to list
+    ``authorization`` among the allowed headers.
+    """
+    app = create_app(service=None, manage_service_lifecycle=False)
+    client = TestClient(app)
+
+    # Preflight request: OPTIONS with Origin and Access-Control-Request-* headers.
+    response = client.options(
+        "/api/auth/me",
+        headers={
+            "Origin": "http://127.0.0.1:3080",
+            "Access-Control-Request-Method": "GET",
+            "Access-Control-Request-Headers": "authorization",
+        },
+    )
+
+    assert response.status_code == 200
+    assert response.headers["access-control-allow-origin"] == "http://127.0.0.1:3080"
+    # Lower-cased before the membership check, so header-name casing does not matter.
+    assert "authorization" in response.headers["access-control-allow-headers"].lower()