feat(skill-learning): add surrogate tool evaluator
This commit is contained in:
@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
|
||||
|
||||
|
||||
def test_surrogate_scores_complete_candidate_higher_than_missing_baseline() -> None:
    """Check that a fully specified surrogate call outscores an empty one.

    The baseline arm issues one surrogate ``mcp_outlook_send_email`` call
    whose ``to`` and ``subject`` arguments are empty strings and which has
    no ``body``. The candidate arm issues the same tool call with all three
    arguments populated. The test then asserts that:

    * ``candidate_score`` is strictly greater than ``baseline_score``;
    * ``surrogate_tool_count`` is 2 (one surrogate call per arm);
    * ``confidence`` is reported as ``"low"`` or ``"medium"``.
    """

    def surrogate_email_call(**arguments: str) -> dict[str, object]:
        # Both arms use the same tool and mode; only the arguments differ.
        return {
            "tool_name": "mcp_outlook_send_email",
            "mode": "surrogate",
            "arguments": arguments,
        }

    scorer = SurrogateToolEvaluator()

    incomplete_arm = {
        "arm": "baseline",
        "tool_calls": [surrogate_email_call(to="", subject="")],
    }
    complete_arm = {
        "arm": "candidate",
        "tool_calls": [
            surrogate_email_call(to="ada@example.com", subject="Status", body="Done"),
        ],
    }

    # evaluate() is a coroutine function; drive it to completion synchronously.
    pending = scorer.evaluate(
        task_text="Send a status email to Ada.",
        baseline=incomplete_arm,
        candidate=complete_arm,
    )
    outcome = asyncio.run(pending)

    assert outcome["candidate_score"] > outcome["baseline_score"]
    assert outcome["surrogate_tool_count"] == 2
    assert outcome["confidence"] in {"low", "medium"}
|
||||
Reference in New Issue
Block a user