feat(skill-learning): select replay eval cases
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from beaver.memory.runs import RunRecord
|
||||
from beaver.memory.skills import SkillLearningCandidate
|
||||
from beaver.skills.learning.case_selection import select_replay_cases
|
||||
from beaver.skills.specs import SkillActivationReceipt
|
||||
|
||||
|
||||
def _run(
    run_id: str,
    *,
    task_id: str = "task",
    session_id: str = "session",
    task_text: str = "debug task",
    skill_name: str | None = None,
    skill_version: str = "v0001",
) -> RunRecord:
    """Build a successful, accepted ``RunRecord`` fixture.

    When ``skill_name`` is truthy, the record carries exactly one
    ``SkillActivationReceipt`` for that skill and ``skill_version``;
    otherwise ``activated_skills`` is an empty list.

    The last two characters of ``run_id`` are spliced into the seconds
    field of ``started_at``, so ids such as ``"run-00"`` .. ``"run-11"``
    yield distinct, increasing start timestamps.
    """
    activations = (
        [
            SkillActivationReceipt(
                run_id=run_id,
                session_id=session_id,
                skill_name=skill_name,
                skill_version=skill_version,
                content_hash="hash",
                activated_at="now",
                activation_reason="selected",
            )
        ]
        if skill_name
        else []
    )

    # Trailing two characters of the run id double as the seconds component.
    seconds = run_id[-2:]

    return RunRecord(
        run_id=run_id,
        session_id=session_id,
        task_id=task_id,
        attempt_index=1,
        task_text=task_text,
        started_at=f"2026-06-08T00:00:{seconds}+00:00",
        ended_at="end",
        success=True,
        finish_reason="stop",
        feedback={"acceptance_type": "accept"},
        activated_skills=activations,
    )
|
||||
|
||||
|
||||
def test_select_revise_cases_caps_at_ten_and_prefers_related_skill() -> None:
    """Check replay-case selection for a ``revise_skill`` candidate.

    Twelve runs that all activated skill ``"debug"`` at ``"v0001"`` are
    offered. The test expects ten cases back, each with
    ``baseline_skill_names == ["debug"]``, and the first case to be
    ``"run-11"`` (the run with the latest ``started_at`` among the fixtures).
    """
    history = []
    for index in range(12):
        history.append(
            _run(
                f"run-{index:02d}",
                task_id=f"task-{index}",
                skill_name="debug",
                skill_version="v0001",
            )
        )

    revise_candidate = SkillLearningCandidate(
        candidate_id="candidate-1",
        kind="revise_skill",
        source_run_ids=[],
        source_session_ids=[],
        related_skill_names=["debug"],
        reason="revise",
        evidence={"skill_version": "v0001"},
    )

    selected = select_replay_cases(revise_candidate, history)

    assert len(selected) == 10
    for entry in selected:
        assert entry["baseline_skill_names"] == ["debug"]
    assert selected[0]["run_id"] == "run-11"
|
||||
|
||||
|
||||
def test_select_new_skill_uses_all_available_source_runs_under_ten() -> None:
    """Check replay-case selection for a ``new_skill`` candidate.

    Three runs without any activated skill are offered, and all three are
    listed as the candidate's source runs. The test expects every one of
    them back, in the order ``run-02``, ``run-01``, ``run-00``, each with an
    empty ``baseline_skill_names`` list.
    """
    history = []
    for index in range(3):
        history.append(_run(f"run-{index:02d}", task_id=f"task-{index}"))

    new_skill_candidate = SkillLearningCandidate(
        candidate_id="candidate-1",
        kind="new_skill",
        source_run_ids=["run-00", "run-01", "run-02"],
        source_session_ids=["session"],
        related_skill_names=[],
        reason="new",
    )

    selected = select_replay_cases(new_skill_candidate, history)

    selected_ids = [entry["run_id"] for entry in selected]
    assert selected_ids == ["run-02", "run-01", "run-00"]
    for entry in selected:
        assert entry["baseline_skill_names"] == []
|
||||
Reference in New Issue
Block a user