Files
beaver_project/app-instance/backend/tests/unit/test_phase5_skills_runtime.py
steven_li 8a12c30141 feat(beaver): 完成Task Team功能v1实现,重构后端架构支持统一内核
新增内部Task系统,包括验证、反馈门控机制,实现自动质量验证
(通过率>=0.75)和用户反馈闭环(satisfied/revise/abandon)。

实现Agent Team v1协调器,支持sequence/parallel/dag执行策略,
sub-agent复用主AgentLoop,每个run使用独立memory snapshot。

建立Skill学习pipeline,包含draft/审核/发布/回滚完整生命周期,
通过Task验证通过且用户满意才生成学习候选。

重构目录结构,移除third_party依赖,建立统一engine内核,
所有agent共享运行时基础组件。

更新ContextBuilder清理provider消息字段,增强SkillContext版本管理,
集成TaskExecutionPlanner和TaskSkillResolver实现技能解析机制。
2026-05-08 17:14:14 +08:00

507 lines
18 KiB
Python

from __future__ import annotations
import asyncio
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace
import pytest
from beaver.engine import AgentLoop, EngineLoader
from beaver.engine.context import SkillContext
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.engine.providers.factory import ProviderBundle
from beaver.memory.runs import RunMemoryStore, RunRecord, SkillEffectRecord
from beaver.memory.skills import SkillLearningStore
from beaver.services.memory_service import MemoryService
from beaver.skills.assembler import SkillAssemblyResult
from beaver.skills.catalog.loader import SkillsLoader
from beaver.skills.drafts import DraftService
from beaver.skills.learning import EvidenceSelector, SkillLearningService
from beaver.skills.publisher import SkillPublisher
from beaver.skills.reviews import ReviewService
from beaver.skills.specs import SkillActivationReceipt, SkillSpecStore
class StubProvider(LLMProvider):
    """Scripted ``LLMProvider`` test double that replays canned responses in order."""

    def __init__(self, responses: list[LLMResponse]) -> None:
        """Store a private copy of ``responses`` to be served one per ``chat`` call."""
        super().__init__()
        # Copy the script so consuming it never mutates the caller's list.
        self._responses = [*responses]

    async def chat(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
        model: str | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
    ) -> LLMResponse:
        """Return the next scripted response; every request argument is ignored.

        Raises:
            AssertionError: If more calls are made than responses were scripted.
        """
        if self._responses:
            # Serve responses first-in, first-out.
            return self._responses.pop(0)
        raise AssertionError("No stubbed provider responses left")

    def get_default_model(self) -> str:
        """Return the fixed model name this stub reports."""
        return "stub-model"
class StubSkillAssembler:
    """Skill assembler test double that always activates a fixed list of skills."""

    def __init__(self, activated_skills: list[SkillContext]) -> None:
        """Keep a reference to the skills every ``assemble`` call should report."""
        self.activated_skills = activated_skills

    async def assemble(self, **kwargs) -> SkillAssemblyResult:
        """Return an assembly result holding a fresh copy of the configured skills.

        All keyword arguments are accepted and ignored.
        """
        # Copy so the returned result does not share the stub's own list.
        snapshot = [*self.activated_skills]
        return SkillAssemblyResult(activated_skills=snapshot)
def _tool_call(*, name: str = "echo", arguments: dict | None = None, call_id: str = "call-1") -> SimpleNamespace:
    """Build a minimal stand-in for a provider tool-call object.

    Args:
        name: Tool name to expose as ``.name``.
        arguments: Tool arguments to expose as ``.arguments``. Only ``None``
            selects the default ``{"message": "again"}`` payload; an explicitly
            passed empty dict is kept as-is.
        call_id: Identifier to expose as ``.id``.

    Returns:
        A ``SimpleNamespace`` with ``id``, ``name`` and ``arguments`` attributes.
    """
    # Compare against None rather than relying on truthiness: ``arguments or ...``
    # would silently replace a deliberate ``{}`` with the default payload.
    if arguments is None:
        arguments = {"message": "again"}
    return SimpleNamespace(id=call_id, name=name, arguments=arguments)
def _publish_skill(
    store: SkillSpecStore,
    *,
    skill_name: str,
    body: str,
    description: str,
    actor: str = "tester",
) -> str:
    """Take a brand-new skill through draft, approval and publication.

    Args:
        store: Spec store shared by the draft, review and publish services.
        skill_name: Name of the skill to create.
        body: Proposed skill content.
        description: Value for the ``description`` frontmatter field.
        actor: Identity used as draft creator, reviewer and publisher.

    Returns:
        The ``version`` attribute of the object returned by the publisher.
    """
    draft_service = DraftService(store)
    review_service = ReviewService(store)
    skill_publisher = SkillPublisher(store)

    # Every helper-published skill declares the same single tool hint.
    frontmatter = {"description": description, "tools": ["terminal"]}
    new_draft = draft_service.create_new_skill_draft(
        skill_name=skill_name,
        proposed_content=body,
        proposed_frontmatter=frontmatter,
        created_by=actor,
        reason=f"create {skill_name}",
    )
    draft_id = new_draft.draft_id

    review_service.approve(skill_name, draft_id, reviewer=actor, notes="ok")
    published = skill_publisher.publish(skill_name, draft_id, publisher=actor, notes="publish")
    return published.version
def _receipt(
    *,
    run_id: str,
    session_id: str,
    skill_name: str,
    skill_version: str,
    activated_at: str,
) -> SkillActivationReceipt:
    """Create a ``SkillActivationReceipt`` with boilerplate fields filled in.

    The content hash is derived as ``"<skill_name>-<skill_version>"``, the
    activation reason is always ``"selected"`` and the tool hints are always
    ``["terminal"]``; the remaining fields are taken from the arguments.
    """
    receipt_fields = {
        "run_id": run_id,
        "session_id": session_id,
        "skill_name": skill_name,
        "skill_version": skill_version,
        "content_hash": f"{skill_name}-{skill_version}",
        "activated_at": activated_at,
        "activation_reason": "selected",
        "tool_hints": ["terminal"],
    }
    return SkillActivationReceipt(**receipt_fields)
def test_memory_service_snapshot_stays_frozen_until_reload(tmp_path: Path) -> None:
    """Check that a store write only reaches the snapshot after ``reload_for_new_run``."""
    memory_service = MemoryService(tmp_path / "memory")
    memory_service.initialize()

    # Freshly initialised service: the snapshot has no memory block.
    before_write = memory_service.get_snapshot()
    assert before_write.memory_block is None

    memory_store = memory_service.get_store()
    add_outcome = memory_store.add("memory", "Remember to inspect Docker container logs first.")
    assert add_outcome["success"] is True

    # The write succeeded, yet the snapshot must still show the old state.
    after_write = memory_service.get_snapshot()
    assert after_write.memory_block is None

    memory_service.reload_for_new_run()
    after_reload = memory_service.get_snapshot()
    reloaded_block = after_reload.memory_block or ""
    assert "Docker container logs" in reloaded_block
def test_skill_loader_only_uses_active_published_versions(tmp_path: Path) -> None:
    """Check that ``SkillsLoader`` exposes an active skill and hides a disabled one."""
    spec_store = SkillSpecStore(tmp_path)

    # One skill stays active ...
    live_version = _publish_skill(
        spec_store,
        skill_name="docker-debug",
        body="# Docker Debug\n\nUse `docker logs` before changing config.\n",
        description="Debug Docker containers.",
    )
    # ... and a second one is published, then disabled.
    _publish_skill(
        spec_store,
        skill_name="archived-debug",
        body="# Archived\n\nOld instructions.\n",
        description="Should be hidden from runtime.",
    )
    SkillPublisher(spec_store).disable("archived-debug", actor="tester", reason="superseded")

    skills_loader = SkillsLoader(tmp_path, skill_store=spec_store)
    only_active = {"docker-debug"}

    assert skills_loader.get_current_version("docker-debug") == live_version

    published_names = {entry.name for entry in skills_loader.list_published_skills()}
    assert published_names == only_active

    candidate_names = {entry["name"] for entry in skills_loader.build_selection_candidates()}
    assert candidate_names == only_active

    loaded_body = skills_loader.load_published_skill("docker-debug") or ""
    assert "docker logs" in loaded_body.lower()
def test_skill_lifecycle_publish_revision_and_rollback(tmp_path: Path) -> None:
    """Exercise publish -> revise -> republish -> rollback for a single skill.

    Also checks that publishing the same draft a second time raises a
    ``ValueError`` whose message matches ``"approved"``.
    """
    skill = "release-checklist"
    spec_store = SkillSpecStore(tmp_path)
    draft_service = DraftService(spec_store)
    review_service = ReviewService(spec_store)
    skill_publisher = SkillPublisher(spec_store)

    first_version = _publish_skill(
        spec_store,
        skill_name=skill,
        body="# Release Checklist\n\nRun tests.\n",
        description="Release workflow.",
    )
    assert first_version == "v0001"

    # Propose, approve and publish a revision on top of the first version.
    revision_draft = draft_service.create_revision_draft(
        skill_name=skill,
        base_version=first_version,
        proposed_content="# Release Checklist\n\nRun tests.\nShip artifacts.\n",
        proposed_frontmatter={"description": "Release workflow.", "tools": ["terminal"]},
        created_by="tester",
        reason="add artifact step",
    )
    revision_id = revision_draft.draft_id
    review_service.approve(skill, revision_id, reviewer="reviewer", notes="ship it")
    second_release = skill_publisher.publish(skill, revision_id, publisher="reviewer", notes="v2")
    assert second_release.version == "v0002"
    assert spec_store.get_current_version(skill) == "v0002"

    # The same draft must not be publishable twice.
    with pytest.raises(ValueError, match="approved"):
        skill_publisher.publish(skill, revision_id, publisher="reviewer", notes="duplicate")

    # Rolling back moves the current pointer but keeps both versions listed.
    after_rollback = skill_publisher.rollback(skill, "v0001", actor="reviewer", reason="regression")
    assert after_rollback.current_version == "v0001"
    assert spec_store.get_current_version(skill) == "v0001"
    known_versions = set(spec_store.list_versions(skill))
    assert known_versions == {"v0001", "v0002"}
def test_skill_lifecycle_retire_proposal_disables_without_new_version(tmp_path: Path) -> None:
    """Check that a retire proposal disables a skill without adding a version.

    An approved retire proposal must be rejected by ``publish`` (``ValueError``
    matching ``"Retire proposals"``) and only take effect through
    ``apply_retire_proposal``.
    """
    skill = "svn-migration"
    spec_store = SkillSpecStore(tmp_path)
    draft_service = DraftService(spec_store)
    review_service = ReviewService(spec_store)
    skill_publisher = SkillPublisher(spec_store)

    first_version = _publish_skill(
        spec_store,
        skill_name=skill,
        body="# SVN Migration\n\nUse the legacy checklist only for SVN repositories.\n",
        description="Legacy SVN migration workflow.",
    )

    proposal = draft_service.create_retire_proposal(
        skill_name=skill,
        base_version=first_version,
        created_by="tester",
        reason="unused legacy workflow",
    )
    proposal_id = proposal.draft_id
    review_service.approve(skill, proposal_id, reviewer="reviewer", notes="retire")

    # Publishing is the wrong path for a retire proposal and must change nothing.
    with pytest.raises(ValueError, match="Retire proposals"):
        skill_publisher.publish(skill, proposal_id, publisher="reviewer", notes="wrong path")
    assert spec_store.get_current_version(skill) == first_version
    assert spec_store.list_versions(skill) == [first_version]

    retired_spec = skill_publisher.apply_retire_proposal(
        skill,
        proposal_id,
        actor="reviewer",
        notes="retired after review",
    )
    assert retired_spec.status == "disabled"
    assert retired_spec.current_version == first_version

    # The version history is unchanged; only the status flips.
    assert spec_store.get_current_version(skill) == first_version
    assert spec_store.list_versions(skill) == [first_version]
    stored_proposal = spec_store.read_draft(skill, proposal_id)
    assert stored_proposal.status == "disabled"  # type: ignore[union-attr]
    assert skill not in spec_store.list_published_skill_names()
def test_skill_spec_store_lists_new_skill_drafts_before_publish(tmp_path: Path) -> None:
    """Check that ``list_drafts`` returns a new-skill draft that was never published."""
    spec_store = SkillSpecStore(tmp_path)
    draft_service = DraftService(spec_store)
    created = draft_service.create_new_skill_draft(
        skill_name="brand-new-skill",
        proposed_content="# Brand New Skill\n\nDraft body.\n",
        proposed_frontmatter={"description": "Draft only."},
        created_by="tester",
        reason="capture a repeated workflow",
    )

    listed = spec_store.list_drafts()
    listed_ids = [entry.draft_id for entry in listed]

    # Exactly the one draft created above, attributed to the right skill.
    assert listed_ids == [created.draft_id]
    assert listed[0].skill_name == "brand-new-skill"
def test_skill_learning_service_generates_candidates_and_retire_draft(tmp_path: Path) -> None:
    """Seed run history for four scenarios and check each candidate kind appears.

    Seeded history, in order:
      * two failed runs that activated ``deploy-debug`` v0002;
      * two successful runs that activated no skill;
      * two successful runs that activated both ``docker-debug`` and ``k8s-debug``;
      * one successful run, timestamped 45 days ago, that activated ``svn-migration``.

    After rescoring, the candidates must include ``revise_skill``, ``new_skill``,
    ``merge_skills`` and ``retire_skill``. The retire candidate is then
    synthesized into a draft that must be readable from the spec store.
    """
    store = SkillSpecStore(tmp_path)
    memory_root = tmp_path / "memory"
    run_store = RunMemoryStore(memory_root / "runs")
    learning_store = SkillLearningStore(memory_root / "skills")
    draft_service = DraftService(store)
    service = SkillLearningService(
        run_store=run_store,
        learning_store=learning_store,
        draft_service=draft_service,
        evidence_selector=EvidenceSelector(run_store),
    )

    now = datetime.now(timezone.utc)
    stale = (now - timedelta(days=45)).isoformat()
    recent = now.isoformat()

    def make_run(run_id, session_id, task_text, stamp, *, ok, finish_reason, receipts):
        # Runs here are instantaneous: start and end share one timestamp.
        return RunRecord(
            run_id=run_id,
            session_id=session_id,
            task_text=task_text,
            started_at=stamp,
            ended_at=stamp,
            success=ok,
            finish_reason=finish_reason,
            feedback={},
            activated_skills=receipts,
        )

    def make_effect(run_id, skill_name, skill_version, stamp, *, ok, notes):
        return SkillEffectRecord(
            run_id=run_id,
            skill_name=skill_name,
            skill_version=skill_version,
            success=ok,
            feedback_score=None,
            notes=notes,
            created_at=stamp,
        )

    # Scenario 1: failed runs with deploy-debug active.
    failing_runs = []
    for index in range(2):
        failing_id = f"revise-{index}"
        failing_receipt = _receipt(
            run_id=failing_id,
            session_id="session-revise",
            skill_name="deploy-debug",
            skill_version="v0002",
            activated_at=recent,
        )
        failing_runs.append(
            make_run(
                failing_id,
                "session-revise",
                "Fix the flaky deployment health check",
                recent,
                ok=False,
                finish_reason="error",
                receipts=[failing_receipt],
            )
        )
    for failed in failing_runs:
        run_store.append_run_record(failed)
        run_store.append_skill_effect(
            make_effect(failed.run_id, "deploy-debug", "v0002", recent, ok=False, notes="error")
        )

    # Scenario 2: successful runs with no skill active.
    for index in range(2):
        run_store.append_run_record(
            make_run(
                f"new-{index}",
                "session-new",
                "Generate a weekly metrics digest for stakeholders",
                recent,
                ok=True,
                finish_reason="stop",
                receipts=[],
            )
        )

    # Scenario 3: successful runs with two skills active together.
    merge_pair = (("docker-debug", "v0001"), ("k8s-debug", "v0003"))
    for index in range(2):
        merge_id = f"merge-{index}"
        receipts = [
            _receipt(
                run_id=merge_id,
                session_id="session-merge",
                skill_name=paired_name,
                skill_version=paired_version,
                activated_at=recent,
            )
            for paired_name, paired_version in merge_pair
        ]
        run_store.append_run_record(
            make_run(
                merge_id,
                "session-merge",
                "Investigate staging outage and compare container health checks",
                recent,
                ok=True,
                finish_reason="stop",
                receipts=receipts,
            )
        )
        for receipt in receipts:
            run_store.append_skill_effect(
                make_effect(
                    merge_id,
                    receipt.skill_name,
                    receipt.skill_version,
                    recent,
                    ok=True,
                    notes="stop",
                )
            )

    # Scenario 4: one old successful run with svn-migration active.
    retire_receipt = _receipt(
        run_id="retire-1",
        session_id="session-retire",
        skill_name="svn-migration",
        skill_version="v0001",
        activated_at=stale,
    )
    run_store.append_run_record(
        make_run(
            "retire-1",
            "session-retire",
            "Legacy SVN migration checklist",
            stale,
            ok=True,
            finish_reason="stop",
            receipts=[retire_receipt],
        )
    )
    run_store.append_skill_effect(
        make_effect("retire-1", "svn-migration", "v0001", stale, ok=True, notes="stop")
    )

    service.rescore_skill_versions()
    candidates = service.build_learning_candidates()

    kinds = {entry.kind for entry in candidates}
    expected_kinds = {"revise_skill", "new_skill", "merge_skills", "retire_skill"}
    assert expected_kinds.issubset(kinds)

    retire_candidate = next(entry for entry in candidates if entry.kind == "retire_skill")
    # The bundle carries no runtime and no provider for this synthesis call.
    empty_bundle = ProviderBundle(main_runtime=None, main_provider=None)
    retire_draft = asyncio.run(service.synthesize_draft(retire_candidate.candidate_id, empty_bundle))

    assert retire_draft.proposal_kind == "retire_skill"
    assert retire_draft.status == "draft"
    assert store.read_draft("svn-migration", retire_draft.draft_id) is not None
def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None:
    """Run one stubbed turn and check the recorded skill receipts and effects.

    Verifies the ``skill_activation_snapshotted`` event payload, the
    ``skill_effects_snapshotted`` event payload (including that learning
    candidates are disabled and empty), and that the run memory store's latest
    run and skill-effect records belong to this run.
    """
    active_skill = SkillContext(
        name="docker-debug",
        content="Use docker logs before editing config.",
        version="v0007",
        content_hash="hash-v7",
        activation_reason="llm_selected",
        tool_hints=["terminal"],
    )
    engine_loader = EngineLoader(
        workspace=tmp_path,
        skill_assembler=StubSkillAssembler([active_skill]),
    )
    loop = AgentLoop(loader=engine_loader)

    # A single plain-text answer, so the loop finishes after one provider call.
    final_answer = LLMResponse(
        content="Check the container logs first.",
        finish_reason="stop",
        provider_name="stub",
        model="stub-model",
    )
    bundle = ProviderBundle(
        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
        main_provider=StubProvider([final_answer]),
    )

    result = asyncio.run(loop.process_direct("Why is the Docker container crashing?", provider_bundle=bundle))
    loaded = loop.boot()
    events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id)

    activation_event = next(item for item in events if item.event_type == "skill_activation_snapshotted")
    receipts = activation_event.event_payload["receipts"]
    expected_receipt = {
        "run_id": result.run_id,
        "session_id": result.session_id,
        "skill_name": "docker-debug",
        "skill_version": "v0007",
        "content_hash": "hash-v7",
        # The timestamp is generated at runtime, so it is echoed back rather than pinned.
        "activated_at": receipts[0]["activated_at"],
        "activation_reason": "llm_selected",
        "tool_hints": ["terminal"],
    }
    assert receipts == [expected_receipt]

    effects_event = next(item for item in events if item.event_type == "skill_effects_snapshotted")
    effects_payload = effects_event.event_payload
    assert effects_payload["run_record"]["activated_skills"][0]["skill_version"] == "v0007"
    assert effects_payload["skill_effects"][0]["skill_name"] == "docker-debug"
    assert effects_payload["learning_candidate_enabled"] is False
    assert effects_payload["learning_candidates"] == []

    memory = loaded.run_memory_store
    run_records = memory.list_runs()
    effect_records = memory.list_skill_effects("docker-debug", version="v0007")
    assert run_records[-1].run_id == result.run_id
    assert effect_records[-1].run_id == result.run_id
def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path: Path) -> None:
    """Check that hitting the tool-iteration cap records a failed skill effect.

    The stub provider is scripted with two tool-call responses and the run is
    capped at one tool iteration. The run must finish with
    ``"max_tool_iterations"`` and the latest ``docker-debug`` effect record
    must belong to this run with ``success`` set to ``False``.
    """
    active_skill = SkillContext(
        name="docker-debug",
        content="Use docker logs before editing config.",
        version="v0007",
        content_hash="hash-v7",
        activation_reason="llm_selected",
        tool_hints=["echo"],
    )
    engine_loader = EngineLoader(
        workspace=tmp_path,
        skill_assembler=StubSkillAssembler([active_skill]),
    )
    loop = AgentLoop(loader=engine_loader)

    def tool_turn(text: str, call: SimpleNamespace) -> LLMResponse:
        # Every scripted turn asks for exactly one tool call.
        return LLMResponse(
            content=text,
            finish_reason="tool_calls",
            tool_calls=[call],
            provider_name="stub",
            model="stub-model",
        )

    scripted_turns = [
        tool_turn("Need a tool.", _tool_call()),
        tool_turn("Need another tool.", _tool_call(call_id="call-2")),
    ]
    bundle = ProviderBundle(
        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
        main_provider=StubProvider(scripted_turns),
    )

    result = asyncio.run(
        loop.process_direct(
            "Why is the Docker container crashing?",
            provider_bundle=bundle,
            max_tool_iterations=1,
        )
    )
    loaded = loop.boot()

    assert result.finish_reason == "max_tool_iterations"
    effect_records = loaded.run_memory_store.list_skill_effects("docker-debug", version="v0007")
    latest_effect = effect_records[-1]
    assert latest_effect.run_id == result.run_id
    assert latest_effect.success is False