test(plugins): cover skill mirror lifecycle

This commit is contained in:
2026-06-16 12:24:19 +08:00
parent a9b830d11e
commit a65e59fcb6
7 changed files with 516 additions and 5 deletions

View File

@ -271,6 +271,8 @@ class PluginManager:
current = self.skill_store.read_published_skill(declaration.name)
if current is None:
continue
if self._reconcile_published_update(binding, current.version, snapshot.skill_tree_hash):
continue
classification = classify_plugin_skill_update(
binding.accepted_upstream_tree_hash,
current.version.tree_hash,
@ -317,6 +319,33 @@ class PluginManager:
finally:
transaction.cleanup()
def _reconcile_published_update(
self,
binding: PluginSkillBinding,
current_version: SkillVersion,
observed_upstream_tree_hash: str,
) -> bool:
if not binding.pending_candidate_id:
return False
candidates = self.learning_store.list_learning_candidates()
candidate = next(
(item for item in candidates if item.candidate_id == binding.pending_candidate_id),
None,
)
if candidate is None or candidate.status != "published":
return False
candidate_hash = str(candidate.evidence.get("new_upstream_tree_hash") or "")
version_hash = str(current_version.provenance.get("new_upstream_tree_hash") or "")
if not candidate_hash or candidate_hash != observed_upstream_tree_hash or version_hash != candidate_hash:
return False
binding.accepted_upstream_tree_hash = candidate_hash
binding.observed_upstream_tree_hash = candidate_hash
binding.accepted_beaver_version = current_version.version
binding.current_beaver_version = current_version.version
binding.pending_candidate_id = None
binding.status = "synced"
return True
@staticmethod
def _create_update_candidate(
*,

View File

@ -12,11 +12,13 @@ from beaver.engine.context import SkillContext
from beaver.engine.providers import ProviderBundle
from beaver.memory.runs import RunMemoryStore
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate
from beaver.skills.catalog.utils import strip_frontmatter
from beaver.skills.learning.case_selection import select_replay_cases
from beaver.skills.learning.preservation import check_preservation
from beaver.skills.learning.preservation import check_plugin_merge_preservation, check_preservation
from beaver.skills.learning.replay import ReplayArmRequest, ReplayRunner
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
from beaver.skills.specs import SkillDraft
from beaver.skills.specs.storage import SkillSpecStore
class SkillDraftEvaluator:
@ -28,9 +30,11 @@ class SkillDraftEvaluator:
*,
surrogate_evaluator: SurrogateToolEvaluator | None = None,
max_parallel_cases: int | None = None,
skill_store: SkillSpecStore | None = None,
) -> None:
self.run_store = run_store
self.surrogate_evaluator = surrogate_evaluator or SurrogateToolEvaluator()
self.skill_store = skill_store
configured_parallelism = max_parallel_cases
if configured_parallelism is None:
try:
@ -207,7 +211,7 @@ class SkillDraftEvaluator:
results = await asyncio.gather(*(evaluate_case(case) for case in replay_cases))
case_reports = [case_report for case_report, _ in results]
legacy_cases = [legacy_case for _, legacy_case in results]
preservation_report = _preservation_report(candidate, draft)
preservation_report = _preservation_report(candidate, draft, skill_store=self.skill_store)
return _report_from_case_reports(
candidate,
draft,
@ -343,9 +347,35 @@ def _draft_skill_context(draft: SkillDraft) -> SkillContext:
)
def _preservation_report(candidate: SkillLearningCandidate, draft: SkillDraft) -> dict | None:
def _preservation_report(
candidate: SkillLearningCandidate,
draft: SkillDraft,
*,
skill_store: SkillSpecStore | None = None,
) -> dict | None:
if candidate.kind not in {"revise_skill", "merge_skills"}:
return None
if candidate.kind != "plugin_skill_update" or skill_store is None:
return None
plugin_id = str(draft.provenance.get("plugin_id") or candidate.evidence.get("plugin_id") or "")
skill_name = str(draft.provenance.get("skill_name") or candidate.evidence.get("skill_name") or draft.skill_name)
local_version = str(draft.base_version or draft.provenance.get("local_version") or candidate.evidence.get("local_version") or "")
upstream_hash = str(
draft.provenance.get("new_upstream_tree_hash")
or candidate.evidence.get("new_upstream_tree_hash")
or ""
)
if not plugin_id or not skill_name or not local_version or not upstream_hash:
return None
local = skill_store.read_published_skill(skill_name, local_version)
upstream = skill_store.read_upstream_snapshot(skill_name, plugin_id, upstream_hash)
if local is None or upstream is None:
return None
return check_plugin_merge_preservation(
local_content=strip_frontmatter(local.content),
upstream_content=strip_frontmatter(upstream.content),
draft_content=draft.proposed_content,
merge_decisions=draft.provenance,
)
base_content = str(candidate.evidence.get("base_content") or "") if isinstance(candidate.evidence, dict) else ""
if not base_content.strip():
return None

View File

@ -315,7 +315,10 @@ class SkillLearningPipelineService:
) -> SkillDraftEvalReport:
draft = self.get_draft(skill_name, draft_id)
candidate = self.get_candidate(candidate_id)
evaluator = self.evaluator or SkillDraftEvaluator(self.learning_service.run_store)
evaluator = self.evaluator or SkillDraftEvaluator(
self.learning_service.run_store,
skill_store=self.draft_service.store,
)
report = await evaluator.evaluate(
candidate=candidate,
draft=draft,

View File

@ -55,7 +55,11 @@ class SkillPublisher:
version.tree_hash = hash_plugin_skill_tree(version_dir).skill_tree_hash
self.store._write_json(version_dir / "version.json", version.to_dict())
else:
self._copy_base_supporting_files(draft, next_version)
self._copy_uploaded_supporting_files(draft, next_version)
version_dir = self.store.root / draft.skill_name / "versions" / next_version
version.tree_hash = hash_plugin_skill_tree(version_dir).skill_tree_hash
self.store._write_json(version_dir / "version.json", version.to_dict())
self.store.set_current_version(skill_name, next_version)
spec = self.store.get_skill_spec(skill_name)
@ -202,6 +206,23 @@ class SkillPublisher:
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(source, target)
def _copy_base_supporting_files(self, draft: SkillDraft, version: str) -> None:
if not draft.base_version:
return
source_root = self.store.root / draft.skill_name / "versions" / draft.base_version
if not source_root.exists() or not source_root.is_dir():
return
target_root = self.store.root / draft.skill_name / "versions" / version
for source in sorted(source_root.rglob("*"), key=lambda item: item.relative_to(source_root).as_posix()):
if not source.is_file() or source.is_symlink():
continue
relative = source.relative_to(source_root)
if relative.as_posix() in {"SKILL.md", "version.json", "upstream.json"}:
continue
target = target_root / relative
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(source, target)
def _copy_plugin_update_supporting_files(self, draft: SkillDraft, version: str) -> None:
plugin_id = str(draft.provenance.get("plugin_id") or "")
tree_hash = str(draft.provenance.get("new_upstream_tree_hash") or "")

View File

@ -0,0 +1,326 @@
from __future__ import annotations
import asyncio
import json
import shutil
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from types import SimpleNamespace
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.engine.providers.factory import ProviderBundle
from beaver.foundation.utils.file_lock import WorkspaceWriteLock
from beaver.memory.runs import RunMemoryStore
from beaver.memory.skills import SkillLearningStore
from beaver.plugins.discovery import discover_plugins
from beaver.plugins.skills import PluginManager
from beaver.plugins.state import PluginStateStore
from beaver.skills.drafts import DraftService
from beaver.skills.learning import EvidenceSelector, SkillDraftSynthesizer, SkillLearningPipelineService, SkillLearningService
from beaver.skills.learning.safety import SkillDraftSafetyChecker
from beaver.skills.publisher import SkillPublisher
from beaver.skills.reviews import ReviewService
from beaver.skills.specs import SkillSpecStore
class StubProvider(LLMProvider):
def __init__(self, content: str) -> None:
super().__init__()
self.content = content
self.calls: list[dict] = []
async def chat(
self,
messages: list[dict],
tools: list[dict] | None = None,
model: str | None = None,
max_tokens: int = 4096,
temperature: float = 0.7,
thinking_enabled: bool | None = None,
) -> LLMResponse:
self.calls.append({"messages": messages, "model": model})
return LLMResponse(content=self.content, provider_name="stub", model=model or "stub")
def get_default_model(self) -> str:
return "stub"
class StubReplayRunner:
def __init__(self) -> None:
self.requests: list[object] = []
async def run_arm(self, request):
self.requests.append(request)
return {
"case_id": request.case_id,
"arm": request.arm,
"session_id": "session-replay",
"run_id": f"{request.arm}-run",
"task_text": request.task_text,
"finish_reason": "stop",
"final_answer": "panel safety review complete",
"tool_calls": [
{
"tool_name": "write_file",
"mode": "executed",
"arguments": {"path": "storyboard.md"},
"result": {"success": True},
}
],
"artifacts": [],
"side_effects": [],
}
def test_plugin_skill_mirror_upgrade_and_recovery_lifecycle(tmp_path: Path) -> None:
workspace = tmp_path / "workspace"
plugin_root = _write_plugin(
workspace / "plugins",
version="1.0.0",
body="# Baoyu Comic\n\n## Workflow\n\nDraw panels.\n",
template="panel-v1",
)
manager, store, learning_store, pipeline = _services(workspace)
manager.enable("baoyu-comic")
initial = store.read_published_skill("baoyu-comic")
assert initial is not None
assert initial.version.version == "v0001"
local = pipeline.draft_service.create_revision_draft(
skill_name="baoyu-comic",
base_version="v0001",
proposed_content="# Baoyu Comic\n\n## Workflow\n\nDraw panels.\n\n## Local Review\n\nKeep user edits.\n",
proposed_frontmatter={"name": "baoyu-comic", "description": "Comic workflow", "tools": []},
created_by="tester",
reason="learned local revision",
)
pipeline.check_safety(local.skill_name, local.draft_id)
pipeline.submit_review(local.skill_name, local.draft_id, requested_by="tester")
pipeline.approve(local.skill_name, local.draft_id, reviewer="tester")
local_version = pipeline.publish(local.skill_name, local.draft_id, publisher="tester")
assert local_version.version == "v0002"
_rewrite_plugin(
plugin_root,
version="1.1.0",
body="# Baoyu Comic\n\n## Workflow\n\nDraw better panels.\n\n## Safety\n\nDo not leak secrets.\n",
template="panel-v2",
)
plugin_files_after_update = _plugin_file_bytes(plugin_root)
_services(workspace)[0].sync_enabled()
first_candidate = _only_open_candidate(learning_store)
assert first_candidate.evidence["merge_mode"] == "three_way"
merged_payload = {
"frontmatter": {"name": "baoyu-comic", "description": "Comic workflow", "tools": []},
"content": (
"# Baoyu Comic\n\n"
"## Workflow\n\nDraw better panels.\n\n"
"## Local Review\n\nKeep user edits.\n\n"
"## Safety\n\nDo not leak secrets.\n"
),
"change_reason": "Merge upstream safety guidance and preserve local review.",
"preserved_local_sections": ["Local Review"],
"adopted_upstream_sections": ["Workflow", "Safety"],
"resolved_conflicts": [],
"dropped_sections": [],
}
draft = asyncio.run(
pipeline.synthesize_draft(
first_candidate.candidate_id,
provider_bundle=_bundle(StubProvider(json.dumps(merged_payload))),
)
)
_add_eval_cases(learning_store, first_candidate.candidate_id)
pipeline.check_safety(draft.skill_name, draft.draft_id)
replay_runner = StubReplayRunner()
report = asyncio.run(
pipeline.evaluate_draft(
first_candidate.candidate_id,
draft.skill_name,
draft.draft_id,
provider_bundle=_bundle(StubProvider('{"cases": []}')),
replay_runner=replay_runner,
)
)
assert replay_runner.requests
assert report.mode == "replay"
assert report.preservation_report is not None
assert report.preservation_report["mode"] == "plugin_three_way"
assert report.preservation_report["passed"] is True
pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
_, _, _, failing_ack_pipeline = _services(
workspace,
publish_observer=lambda draft, result: (_ for _ in ()).throw(RuntimeError("observer failed")),
)
published = failing_ack_pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
assert published.version == "v0003"
pending_after_failed_observer = PluginStateStore(workspace).get_plugin("baoyu-comic")
assert pending_after_failed_observer is not None
assert pending_after_failed_observer.skills["baoyu-comic"].pending_candidate_id == first_candidate.candidate_id
_services(workspace)[0].sync_enabled()
state = PluginStateStore(workspace).get_plugin("baoyu-comic")
assert state is not None
binding = state.skills["baoyu-comic"]
assert binding.accepted_upstream_tree_hash == draft.provenance["new_upstream_tree_hash"]
published_loaded = store.read_published_skill("baoyu-comic")
assert published_loaded is not None
assert published_loaded.version.provenance["new_upstream_tree_hash"] == draft.provenance["new_upstream_tree_hash"]
pipeline.rollback("baoyu-comic", "v0002", actor="tester", reason="verify rollback")
assert store.read_published_skill("baoyu-comic").version.version == "v0002" # type: ignore[union-attr]
assert _plugin_file_bytes(plugin_root) == plugin_files_after_update
_rewrite_plugin(plugin_root, version="1.2.0", template="panel-v3")
_services(workspace)[0].sync_enabled()
second_candidate = _only_open_candidate(learning_store)
assert second_candidate.candidate_id != first_candidate.candidate_id
shutil.rmtree(plugin_root)
_services(workspace)[0].sync_enabled()
missing = PluginStateStore(workspace).get_plugin("baoyu-comic")
assert missing is not None and missing.status == "missing"
assert store.get_skill_spec("baoyu-comic").status == "active" # type: ignore[union-attr]
plugin_root = _write_plugin(
workspace / "plugins",
version="1.3.0",
body="# Baoyu Comic\n\n## Workflow\n\nDraw better panels.\n\n## Safety\n\nDo not leak secrets.\n",
template="panel-v4",
)
with ThreadPoolExecutor(max_workers=2) as executor:
list(executor.map(lambda _: _services(workspace)[0].sync_enabled(), range(2)))
candidates = [
item
for item in learning_store.list_learning_candidates()
if item.candidate_id != first_candidate.candidate_id
]
assert len([item for item in candidates if item.status == "open"]) == 1
versions = store.list_versions("baoyu-comic")
assert versions.count("v0003") == 1
assert (plugin_root / "skills" / "baoyu-comic" / "templates" / "panel.txt").read_text(encoding="utf-8") == "panel-v4"
def _services(
workspace: Path,
*,
publish_observer=None,
) -> tuple[PluginManager, SkillSpecStore, SkillLearningStore, SkillLearningPipelineService]:
discovery = discover_plugins(workspace, search_paths=[])
store = SkillSpecStore(workspace)
learning_store = SkillLearningStore(workspace / "memory" / "skills")
run_store = RunMemoryStore(workspace / "memory" / "runs")
publisher = SkillPublisher(store)
manager = PluginManager(
workspace=workspace,
manifests=discovery.manifests,
discovery_errors=discovery.errors,
state_store=PluginStateStore(workspace),
skill_store=store,
learning_store=learning_store,
publisher=publisher,
safety_checker=SkillDraftSafetyChecker(),
write_lock=WorkspaceWriteLock(workspace),
)
pipeline = SkillLearningPipelineService(
learning_store=learning_store,
learning_service=SkillLearningService(
run_store=run_store,
learning_store=learning_store,
draft_service=DraftService(store),
evidence_selector=EvidenceSelector(run_store),
synthesizer=SkillDraftSynthesizer(),
),
draft_service=DraftService(store),
review_service=ReviewService(store),
publisher=publisher,
publish_observer=publish_observer if publish_observer is not None else manager.on_skill_published,
)
return manager, store, learning_store, pipeline
def _write_plugin(root: Path, *, version: str, body: str, template: str) -> Path:
plugin_root = root / "baoyu-comic"
skill_root = plugin_root / "skills" / "baoyu-comic"
skill_root.mkdir(parents=True, exist_ok=True)
_write_skill(skill_root, body)
(skill_root / "templates").mkdir(exist_ok=True)
(skill_root / "templates" / "panel.txt").write_text(template, encoding="utf-8")
(plugin_root / "beaver.plugin.json").write_text(
json.dumps(
{
"schema_version": 1,
"id": "baoyu-comic",
"name": "Baoyu Comic",
"version": version,
"skills": [{"name": "baoyu-comic", "path": "skills/baoyu-comic"}],
}
),
encoding="utf-8",
)
return plugin_root
def _rewrite_plugin(plugin_root: Path, *, version: str, body: str | None = None, template: str | None = None) -> None:
manifest_path = plugin_root / "beaver.plugin.json"
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
manifest["version"] = version
manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
skill_root = plugin_root / "skills" / "baoyu-comic"
if body is not None:
_write_skill(skill_root, body)
if template is not None:
(skill_root / "templates" / "panel.txt").write_text(template, encoding="utf-8")
def _write_skill(skill_root: Path, body: str) -> None:
(skill_root / "SKILL.md").write_text(
"---\nname: baoyu-comic\ndescription: Comic workflow\ntools: []\n---\n\n" + body,
encoding="utf-8",
)
def _bundle(provider: StubProvider) -> ProviderBundle:
runtime = SimpleNamespace(model="stub", provider_name="stub")
return ProviderBundle(main_runtime=runtime, main_provider=provider) # type: ignore[arg-type]
def _only_open_candidate(learning_store: SkillLearningStore):
open_candidates = learning_store.list_learning_candidates(status="open")
assert len(open_candidates) == 1
return open_candidates[0]
def _add_eval_cases(learning_store: SkillLearningStore, candidate_id: str) -> None:
candidate = next(item for item in learning_store.list_learning_candidates() if item.candidate_id == candidate_id)
evidence = dict(candidate.evidence)
evidence["eval_cases"] = [
{
"run_id": f"explicit:{index}",
"task_text": f"Review comic panel safety case {index}",
"baseline_skill_names": ["baoyu-comic"],
"candidate_skill_name": "baoyu-comic",
"accepted_score": 0.8,
"validator": {
"type": "final_answer_contains",
"required_terms": ["panel", "safety"],
"forbidden_terms": ["secret"],
},
}
for index in range(10)
]
learning_store.update_learning_candidate(candidate_id, evidence=evidence)
def _plugin_file_bytes(plugin_root: Path) -> dict[str, bytes]:
return {
path.relative_to(plugin_root).as_posix(): path.read_bytes()
for path in sorted(plugin_root.rglob("*"))
if path.is_file()
}

View File

@ -0,0 +1,101 @@
# Beaver Skill Plugins
Declarative skill plugins let an operator mirror skills from a local plugin package into Beaver's managed skill lifecycle. V1 plugins are data packages only: Beaver reads manifests and skill files, but it does not execute plugin Python code, install dependencies, or run arbitrary hooks.
## Package Layout
A plugin package is a directory containing `beaver.plugin.json` and one or more skill directories:
```text
my-plugin/
beaver.plugin.json
skills/
my-skill/
SKILL.md
templates/
example.md
```
Manifest example:
```json
{
"schema_version": 1,
"id": "my-plugin",
"name": "My Plugin",
"version": "1.0.0",
"skills": [
{ "name": "my-skill", "path": "skills/my-skill" }
]
}
```
IDs and skill names use lowercase identifiers with letters, digits, `_`, and `-`. Skill paths must stay inside the plugin package, cannot use symlinks, and must contain a regular `SKILL.md`.
## Discovery
Beaver discovers plugin manifests from:
- the workspace `plugins/` directory;
- configured `plugins.search_paths` entries in Beaver config.
Discovery only records available packages. Operators must explicitly enable a plugin before its skills are mirrored.
## Mirroring
When a plugin is enabled, Beaver stages immutable upstream snapshots, safety-checks every declared skill, then publishes each mirrored skill as a normal workspace skill version. The first mirror becomes `v0001` and carries plugin provenance:
- `source_kind: plugin`;
- plugin id and plugin version;
- upstream content hash;
- upstream full-tree hash.
If a skill with the same name already exists and is not plugin-owned, enable fails without publishing any plugin skill.
## Hashing And Supporting Files
Beaver tracks two hashes:
- content hash: normalized `SKILL.md` content;
- tree hash: `SKILL.md` plus supporting files, relative paths, sizes, bytes, and executable-bit state.
Mtime, owner, group, and non-executable mode bits do not affect the tree hash. Beaver metadata files such as `version.json` and `upstream.json` are excluded.
Supporting files are copied into Beaver-managed skill versions. Local revisions inherit supporting files from their base version; uploaded supporting files can override inherited files. Plugin update drafts copy supporting files from the referenced upstream snapshot when published. Divergent supporting-file edits are blocked by the publish gate until resolved.
## Upgrade Flow
When an enabled plugin version changes, sync compares:
- accepted upstream tree;
- current Beaver skill tree;
- newly discovered upstream tree.
Possible outcomes:
- unchanged: no candidate;
- already applied: state is reconciled without a draft;
- fast forward: Beaver creates a `plugin_skill_update` candidate that can draft the exact upstream content without an LLM;
- three-way: Beaver creates a `plugin_skill_update` candidate using old upstream, current local, and new upstream inputs.
Plugin update candidates go through the same draft, safety, replay evaluation, review, publish, and rollback flow as learned skills. Three-way plugin updates require a plugin preservation report showing local and upstream sections were preserved and conflicts were resolved.
## Lifecycle Controls
Pause and resume affect updates only. Paused plugins keep current mirrored skills active and suppress new update candidates until resumed.
Disable requires explicit confirmation to disable linked skills. It disables the plugin and its linked Beaver skills, but keeps historical versions on disk.
Adopt detaches a mirrored skill from the plugin and keeps the skill active as a managed Beaver skill. Future plugin updates no longer apply to that skill.
## Recovery
If a previously enabled plugin package is removed or becomes undiscoverable, sync marks the plugin `missing`. Current Beaver skills remain active; updates are suspended until the package returns or the operator disables/adopts the skills.
If publication succeeds but the plugin state acknowledgement fails, the next sync reconciles state from the published draft provenance and clears the pending candidate.
Workspace writes are serialized by the shared workspace write lock. Boot-time auto-sync uses the same lock and defers safely if another writer is active.
## V1 Boundary
V1 does not execute plugin code. This keeps install and sync deterministic, avoids dependency side effects, and leaves tool execution to Beaver's existing MCP/tool runtime.

View File

@ -13,6 +13,7 @@ Beaver is an enterprise Agent sandbox and execution platform. It combines privat
- [PRD](./PRD-beaver-agent-sandbox.md): full-product PRD for the Beaver Agent Sandbox.
- [Validation Plan](./validation-plan.md): customer, product, technical, security, usability, and business validation plan.
- [Launch And Maintenance Runbook](./launch-maintenance-runbook.md): launch phases, readiness checks, monitoring, incident response, maintenance cadence, and rollback.
- [Skill Plugins Operator Guide](../../plugins/skill-plugins.md): declarative plugin package layout, skill mirroring, upgrade review flow, lifecycle controls, recovery, and V1 boundaries.
## Source Material