feat(skill-learning): gate publish on replay confidence
This commit is contained in:
@ -337,6 +337,14 @@ class SkillLearningPipelineService:
|
||||
eval_report = self.get_eval_report(draft.skill_name, draft.draft_id)
|
||||
if eval_report is not None and eval_report.status != "skipped_provider_unavailable" and not eval_report.passed:
|
||||
raise ValueError("Draft eval report did not pass")
|
||||
if eval_report is not None and eval_report.mode == "replay":
|
||||
if eval_report.confidence == "low":
|
||||
raise ValueError("Draft replay eval has low confidence and requires revision before publish")
|
||||
if eval_report.blocked_coverage >= 1.0:
|
||||
raise ValueError("Draft replay eval blocked all important tool calls")
|
||||
preservation = eval_report.preservation_report or {}
|
||||
if preservation.get("passed") is False:
|
||||
raise ValueError("Draft preservation check did not pass")
|
||||
|
||||
def _mark_candidate_by_draft(
|
||||
self,
|
||||
|
||||
@ -5,7 +5,7 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from beaver.memory.runs import RunMemoryStore
|
||||
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
|
||||
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate, SkillLearningStore
|
||||
from beaver.skills.drafts import DraftService
|
||||
from beaver.skills.learning import EvidenceSelector, SkillDraftSynthesizer, SkillLearningPipelineService, SkillLearningService
|
||||
from beaver.skills.publisher import SkillPublisher
|
||||
@ -132,3 +132,77 @@ def test_pipeline_reject_removes_draft_from_review_list(tmp_path: Path) -> None:
|
||||
|
||||
assert review.status == SkillReviewState.REJECTED.value
|
||||
assert pipeline.list_drafts() == []
|
||||
|
||||
|
||||
def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
    """Publishing is refused when the replay eval report has low confidence.

    The stored report has ``passed=True`` and ``blocked_coverage=0.0``, so the
    field this test targets is ``confidence="low"`` on a ``mode="replay"``
    report. The publish call is expected to raise ``ValueError`` with a
    message matching "low confidence".
    """
    service = _pipeline(tmp_path)
    new_draft = service.draft_service.create_new_skill_draft(
        skill_name="low-confidence",
        proposed_content="# Low\n\nDo it.",
        proposed_frontmatter={"description": "low", "tools": []},
        created_by="test",
        reason="test",
    )

    # A replay report that passes on score but is flagged as low confidence.
    low_confidence_report = SkillDraftEvalReport(
        report_id="eval-low",
        skill_name=new_draft.skill_name,
        draft_id=new_draft.draft_id,
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.7,
        candidate_score_avg=0.9,
        score_delta=0.2,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        confidence="low",
        mode="replay",
        eval_version="replay-v1",
        execution_coverage=0.0,
        surrogate_coverage=1.0,
        blocked_coverage=0.0,
    )
    service.learning_store.write_eval_report(low_confidence_report)

    # Run the review, approval and safety steps before attempting to publish.
    service.submit_review(new_draft.skill_name, new_draft.draft_id, requested_by="tester")
    service.approve(new_draft.skill_name, new_draft.draft_id, reviewer="tester")
    service.check_safety(new_draft.skill_name, new_draft.draft_id)

    with pytest.raises(ValueError, match="low confidence"):
        service.publish(new_draft.skill_name, new_draft.draft_id, publisher="tester")
|
||||
|
||||
|
||||
def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
    """Publishing is refused when the preservation report did not pass.

    The stored replay report has ``passed=True`` and ``confidence="medium"``,
    but its ``preservation_report`` carries ``"passed": False``. The publish
    call is expected to raise ``ValueError`` with a message matching
    "preservation".
    """
    service = _pipeline(tmp_path)
    new_draft = service.draft_service.create_new_skill_draft(
        skill_name="dropped-section",
        proposed_content="# Skill\n\n## Workflow\n\nDo it.",
        proposed_frontmatter={"description": "dropped", "tools": []},
        created_by="test",
        reason="test",
    )

    # A replay report that passes on score but whose preservation check failed.
    failed_preservation_report = SkillDraftEvalReport(
        report_id="eval-preservation",
        skill_name=new_draft.skill_name,
        draft_id=new_draft.draft_id,
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.7,
        candidate_score_avg=0.9,
        score_delta=0.2,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        confidence="medium",
        mode="replay",
        eval_version="replay-v1",
        preservation_report={"passed": False, "risk_level": "high", "dropped_sections": ["Safety"]},
    )
    service.learning_store.write_eval_report(failed_preservation_report)

    # Run the review, approval and safety steps before attempting to publish.
    service.submit_review(new_draft.skill_name, new_draft.draft_id, requested_by="tester")
    service.approve(new_draft.skill_name, new_draft.draft_id, reviewer="tester")
    service.check_safety(new_draft.skill_name, new_draft.draft_id)

    with pytest.raises(ValueError, match="preservation"):
        service.publish(new_draft.skill_name, new_draft.draft_id, publisher="tester")
|
||||
|
||||
Reference in New Issue
Block a user