From b9171998b9cd883cae2e95121214087b118edce5 Mon Sep 17 00:00:00 2001
From: steven_li
Date: Mon, 8 Jun 2026 13:36:55 +0800
Subject: [PATCH] feat(skill-learning): gate publish on replay confidence

---
NOTE(review): the line structure of this patch was reconstructed from the
hunk headers and the +/- markers after its newlines had been collapsed.
Indentation widths and the position of blank context lines are assumptions;
confirm them against the target tree before running `git am`.

 .../beaver/skills/learning/pipeline.py          |  8 ++
 .../unit/test_skill_learning_pipeline.py        | 76 ++++++++++++++++++-
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/app-instance/backend/beaver/skills/learning/pipeline.py b/app-instance/backend/beaver/skills/learning/pipeline.py
index 65f0cbc..7c16dd7 100644
--- a/app-instance/backend/beaver/skills/learning/pipeline.py
+++ b/app-instance/backend/beaver/skills/learning/pipeline.py
@@ -337,6 +337,14 @@ class SkillLearningPipelineService:
         eval_report = self.get_eval_report(draft.skill_name, draft.draft_id)
         if eval_report is not None and eval_report.status != "skipped_provider_unavailable" and not eval_report.passed:
             raise ValueError("Draft eval report did not pass")
+        if eval_report is not None and eval_report.mode == "replay":
+            if eval_report.confidence == "low":
+                raise ValueError("Draft replay eval has low confidence and requires revision before publish")
+            if eval_report.blocked_coverage >= 1.0:
+                raise ValueError("Draft replay eval blocked all important tool calls")
+            preservation = eval_report.preservation_report or {}
+            if preservation.get("passed") is False:
+                raise ValueError("Draft preservation check did not pass")
 
     def _mark_candidate_by_draft(
         self,
diff --git a/app-instance/backend/tests/unit/test_skill_learning_pipeline.py b/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
index 3513493..a7934ec 100644
--- a/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
+++ b/app-instance/backend/tests/unit/test_skill_learning_pipeline.py
@@ -5,7 +5,7 @@ from pathlib import Path
 import pytest
 
 from beaver.memory.runs import RunMemoryStore
-from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
+from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate, SkillLearningStore
 from beaver.skills.drafts import DraftService
 from beaver.skills.learning import EvidenceSelector, SkillDraftSynthesizer, SkillLearningPipelineService, SkillLearningService
 from beaver.skills.publisher import SkillPublisher
@@ -132,3 +132,77 @@ def test_pipeline_reject_removes_draft_from_review_list(tmp_path: Path) -> None:
 
     assert review.status == SkillReviewState.REJECTED.value
     assert pipeline.list_drafts() == []
+
+
+def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="low-confidence",
+        proposed_content="# Low\n\nDo it.",
+        proposed_frontmatter={"description": "low", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.write_eval_report(
+        SkillDraftEvalReport(
+            report_id="eval-low",
+            skill_name=draft.skill_name,
+            draft_id=draft.draft_id,
+            candidate_id="candidate-1",
+            passed=True,
+            baseline_score_avg=0.7,
+            candidate_score_avg=0.9,
+            score_delta=0.2,
+            regression_count=0,
+            improved_count=1,
+            unchanged_count=0,
+            confidence="low",
+            mode="replay",
+            eval_version="replay-v1",
+            execution_coverage=0.0,
+            surrogate_coverage=1.0,
+            blocked_coverage=0.0,
+        )
+    )
+    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
+    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
+    pipeline.check_safety(draft.skill_name, draft.draft_id)
+
+    with pytest.raises(ValueError, match="low confidence"):
+        pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")
+
+
+def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
+    pipeline = _pipeline(tmp_path)
+    draft = pipeline.draft_service.create_new_skill_draft(
+        skill_name="dropped-section",
+        proposed_content="# Skill\n\n## Workflow\n\nDo it.",
+        proposed_frontmatter={"description": "dropped", "tools": []},
+        created_by="test",
+        reason="test",
+    )
+    pipeline.learning_store.write_eval_report(
+        SkillDraftEvalReport(
+            report_id="eval-preservation",
+            skill_name=draft.skill_name,
+            draft_id=draft.draft_id,
+            candidate_id="candidate-1",
+            passed=True,
+            baseline_score_avg=0.7,
+            candidate_score_avg=0.9,
+            score_delta=0.2,
+            regression_count=0,
+            improved_count=1,
+            unchanged_count=0,
+            confidence="medium",
+            mode="replay",
+            eval_version="replay-v1",
+            preservation_report={"passed": False, "risk_level": "high", "dropped_sections": ["Safety"]},
+        )
+    )
+    pipeline.submit_review(draft.skill_name, draft.draft_id, requested_by="tester")
+    pipeline.approve(draft.skill_name, draft.draft_id, reviewer="tester")
+    pipeline.check_safety(draft.skill_name, draft.draft_id)
+
+    with pytest.raises(ValueError, match="preservation"):
+        pipeline.publish(draft.skill_name, draft.draft_id, publisher="tester")