feat(skill-learning): gate publish on replay confidence
This commit is contained in:
@ -337,6 +337,14 @@ class SkillLearningPipelineService:
|
|||||||
eval_report = self.get_eval_report(draft.skill_name, draft.draft_id)
|
eval_report = self.get_eval_report(draft.skill_name, draft.draft_id)
|
||||||
if eval_report is not None and eval_report.status != "skipped_provider_unavailable" and not eval_report.passed:
|
if eval_report is not None and eval_report.status != "skipped_provider_unavailable" and not eval_report.passed:
|
||||||
raise ValueError("Draft eval report did not pass")
|
raise ValueError("Draft eval report did not pass")
|
||||||
|
if eval_report is not None and eval_report.mode == "replay":
|
||||||
|
if eval_report.confidence == "low":
|
||||||
|
raise ValueError("Draft replay eval has low confidence and requires revision before publish")
|
||||||
|
if eval_report.blocked_coverage >= 1.0:
|
||||||
|
raise ValueError("Draft replay eval blocked all important tool calls")
|
||||||
|
preservation = eval_report.preservation_report or {}
|
||||||
|
if preservation.get("passed") is False:
|
||||||
|
raise ValueError("Draft preservation check did not pass")
|
||||||
|
|
||||||
def _mark_candidate_by_draft(
|
def _mark_candidate_by_draft(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -5,7 +5,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from beaver.memory.runs import RunMemoryStore
|
from beaver.memory.runs import RunMemoryStore
|
||||||
from beaver.memory.skills import SkillLearningCandidate, SkillLearningStore
|
from beaver.memory.skills import SkillDraftEvalReport, SkillLearningCandidate, SkillLearningStore
|
||||||
from beaver.skills.drafts import DraftService
|
from beaver.skills.drafts import DraftService
|
||||||
from beaver.skills.learning import EvidenceSelector, SkillDraftSynthesizer, SkillLearningPipelineService, SkillLearningService
|
from beaver.skills.learning import EvidenceSelector, SkillDraftSynthesizer, SkillLearningPipelineService, SkillLearningService
|
||||||
from beaver.skills.publisher import SkillPublisher
|
from beaver.skills.publisher import SkillPublisher
|
||||||
@ -132,3 +132,77 @@ def test_pipeline_reject_removes_draft_from_review_list(tmp_path: Path) -> None:
|
|||||||
|
|
||||||
assert review.status == SkillReviewState.REJECTED.value
|
assert review.status == SkillReviewState.REJECTED.value
|
||||||
assert pipeline.list_drafts() == []
|
assert pipeline.list_drafts() == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_publish_blocks_low_confidence_replay_report(tmp_path: Path) -> None:
    """Publish must raise when the stored replay eval report has low confidence.

    The report written here is flagged ``passed=True`` with ``mode="replay"``
    and ``confidence="low"``; after the draft is submitted, approved and
    safety-checked, ``publish`` is expected to raise a ``ValueError`` whose
    message matches "low confidence".
    """
    svc = _pipeline(tmp_path)
    draft = svc.draft_service.create_new_skill_draft(
        skill_name="low-confidence",
        proposed_content="# Low\n\nDo it.",
        proposed_frontmatter={"description": "low", "tools": []},
        created_by="test",
        reason="test",
    )
    skill, draft_id = draft.skill_name, draft.draft_id

    # A passing replay report whose only disqualifying attribute is confidence.
    low_confidence_report = SkillDraftEvalReport(
        report_id="eval-low",
        skill_name=skill,
        draft_id=draft_id,
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.7,
        candidate_score_avg=0.9,
        score_delta=0.2,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        confidence="low",
        mode="replay",
        eval_version="replay-v1",
        execution_coverage=0.0,
        surrogate_coverage=1.0,
        blocked_coverage=0.0,
    )
    svc.learning_store.write_eval_report(low_confidence_report)

    # Walk the draft through the steps that precede publishing.
    svc.submit_review(skill, draft_id, requested_by="tester")
    svc.approve(skill, draft_id, reviewer="tester")
    svc.check_safety(skill, draft_id)

    with pytest.raises(ValueError, match="low confidence"):
        svc.publish(skill, draft_id, publisher="tester")
|
||||||
|
|
||||||
|
|
||||||
|
def test_publish_blocks_failed_preservation_report(tmp_path: Path) -> None:
    """Publish must raise when the eval report's preservation check failed.

    The report written here is flagged ``passed=True`` with medium confidence
    in replay mode, but carries a ``preservation_report`` whose ``"passed"``
    entry is ``False``; ``publish`` is expected to raise a ``ValueError``
    whose message matches "preservation".
    """
    svc = _pipeline(tmp_path)
    draft = svc.draft_service.create_new_skill_draft(
        skill_name="dropped-section",
        proposed_content="# Skill\n\n## Workflow\n\nDo it.",
        proposed_frontmatter={"description": "dropped", "tools": []},
        created_by="test",
        reason="test",
    )
    skill, draft_id = draft.skill_name, draft.draft_id

    # Preservation payload recording a failed check with a dropped section.
    failed_preservation = {
        "passed": False,
        "risk_level": "high",
        "dropped_sections": ["Safety"],
    }
    preservation_failure_report = SkillDraftEvalReport(
        report_id="eval-preservation",
        skill_name=skill,
        draft_id=draft_id,
        candidate_id="candidate-1",
        passed=True,
        baseline_score_avg=0.7,
        candidate_score_avg=0.9,
        score_delta=0.2,
        regression_count=0,
        improved_count=1,
        unchanged_count=0,
        confidence="medium",
        mode="replay",
        eval_version="replay-v1",
        preservation_report=failed_preservation,
    )
    svc.learning_store.write_eval_report(preservation_failure_report)

    # Walk the draft through the steps that precede publishing.
    svc.submit_review(skill, draft_id, requested_by="tester")
    svc.approve(skill, draft_id, reviewer="tester")
    svc.check_safety(skill, draft_id)

    with pytest.raises(ValueError, match="preservation"):
        svc.publish(skill, draft_id, publisher="tester")
|
||||||
|
|||||||
Reference in New Issue
Block a user