feat(skill-learning): gate publish on replay confidence

This commit is contained in:
2026-06-08 13:36:55 +08:00
parent 64d789a3d0
commit b9171998b9
2 changed files with 83 additions and 1 deletion

View File

@ -337,6 +337,14 @@ class SkillLearningPipelineService:
eval_report = self.get_eval_report(draft.skill_name, draft.draft_id)
if eval_report is not None and eval_report.status != "skipped_provider_unavailable" and not eval_report.passed:
raise ValueError("Draft eval report did not pass")
if eval_report is not None and eval_report.mode == "replay":
if eval_report.confidence == "low":
raise ValueError("Draft replay eval has low confidence and requires revision before publish")
if eval_report.blocked_coverage >= 1.0:
raise ValueError("Draft replay eval blocked all important tool calls")
preservation = eval_report.preservation_report or {}
if preservation.get("passed") is False:
raise ValueError("Draft preservation check did not pass")
def _mark_candidate_by_draft(
self,

View File

@ -5,7 +5,7 @@ from pathlib import Path
import pytest
from beaver.memory.runs import RunMemoryStore
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate, SkillLearningStore
from beaver.skills.drafts import DraftService
from beaver.skills.learning import EvidenceSelector, SkillDraftSynthesizer, SkillLearningPipelineService, SkillLearningService
from beaver.skills.publisher import SkillPublisher
@ -132,3 +132,77 @@ def test_pipeline_reject_removes_draft_from_review_list(tmp_path: Path) -> None:
assert review.status == SkillReviewState.REJECTED.value
assert pipeline.list_drafts() == []
def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
    """Publishing must raise when the stored replay eval report has low confidence.

    The report written here is marked ``passed=True`` with
    ``blocked_coverage=0.0``, so the only field expected to trip the publish
    gate is ``confidence="low"`` on a ``mode="replay"`` report.
    """
    service = _pipeline(tmp_path)
    new_draft = service.draft_service.create_new_skill_draft(
        skill_name="low-confidence",
        proposed_content="# Low\n\nDo it.",
        proposed_frontmatter={"description": "low", "tools": []},
        created_by="test",
        reason="test",
    )
    name, draft_id = new_draft.skill_name, new_draft.draft_id

    # Scores improve and nothing regresses; only the confidence level is poor.
    low_confidence_report = SkillDraftEvalReport(
        report_id="eval-low",
        skill_name=name,
        draft_id=draft_id,
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.7,
        candidate_score_avg=0.9,
        score_delta=0.2,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        confidence="low",
        mode="replay",
        eval_version="replay-v1",
        execution_coverage=0.0,
        surrogate_coverage=1.0,
        blocked_coverage=0.0,
    )
    service.learning_store.write_eval_report(low_confidence_report)

    # Walk the draft through review, approval and the safety check before
    # attempting to publish, in the same order as the other pipeline tests here.
    service.submit_review(name, draft_id, requested_by="tester")
    service.approve(name, draft_id, reviewer="tester")
    service.check_safety(name, draft_id)

    with pytest.raises(ValueError, match="low confidence"):
        service.publish(name, draft_id, publisher="tester")
def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
    """Publishing must raise when the replay report's preservation check failed.

    The report is ``passed=True`` with ``confidence="medium"``, so the
    low-confidence gate is not what this test targets; the
    ``preservation_report`` carries ``"passed": False``.
    """
    service = _pipeline(tmp_path)
    new_draft = service.draft_service.create_new_skill_draft(
        skill_name="dropped-section",
        proposed_content="# Skill\n\n## Workflow\n\nDo it.",
        proposed_frontmatter={"description": "dropped", "tools": []},
        created_by="test",
        reason="test",
    )
    name, draft_id = new_draft.skill_name, new_draft.draft_id

    # Coverage fields are left unset here; presumably SkillDraftEvalReport
    # supplies defaults for them - verify against its definition.
    failed_preservation = {
        "passed": False,
        "risk_level": "high",
        "dropped_sections": ["Safety"],
    }
    preservation_failure_report = SkillDraftEvalReport(
        report_id="eval-preservation",
        skill_name=name,
        draft_id=draft_id,
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.7,
        candidate_score_avg=0.9,
        score_delta=0.2,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        confidence="medium",
        mode="replay",
        eval_version="replay-v1",
        preservation_report=failed_preservation,
    )
    service.learning_store.write_eval_report(preservation_failure_report)

    # Review, approval and safety check all precede the publish attempt.
    service.submit_review(name, draft_id, requested_by="tester")
    service.approve(name, draft_id, reviewer="tester")
    service.check_safety(name, draft_id)

    with pytest.raises(ValueError, match="preservation"):
        service.publish(name, draft_id, publisher="tester")