# Python source file (32 lines, 1.0 KiB)
from __future__ import annotations
|
|
|
|
import asyncio
|
|
|
|
from beaver.skills.learning.surrogate import SurrogateToolEvaluator
|
|
|
|
|
|
def test_surrogate_scores_complete_candidate_higher_than_missing_baseline() -> None:
    """Compare an email call with filled-in arguments against one with blanks.

    The baseline arm carries a single surrogate-mode ``mcp_outlook_send_email``
    call whose ``to`` and ``subject`` are empty strings and which has no
    ``body``. The candidate arm carries the same tool call with ``to``,
    ``subject`` and ``body`` all populated. The evaluator result is expected
    to rank the candidate strictly above the baseline.
    """
    tool_name = "mcp_outlook_send_email"

    # One call per arm; both are flagged as surrogate-mode calls.
    empty_call = {
        "tool_name": tool_name,
        "mode": "surrogate",
        "arguments": {"to": "", "subject": ""},
    }
    filled_call = {
        "tool_name": tool_name,
        "mode": "surrogate",
        "arguments": {"to": "ada@example.com", "subject": "Status", "body": "Done"},
    }

    baseline_arm = {"arm": "baseline", "tool_calls": [empty_call]}
    candidate_arm = {"arm": "candidate", "tool_calls": [filled_call]}

    scorer = SurrogateToolEvaluator()
    # evaluate() is awaited via asyncio.run, so the test itself stays synchronous.
    pending = scorer.evaluate(
        task_text="Send a status email to Ada.",
        baseline=baseline_arm,
        candidate=candidate_arm,
    )
    outcome = asyncio.run(pending)

    assert outcome["candidate_score"] > outcome["baseline_score"]
    # NOTE(review): presumably one surrogate call counted per arm (1 + 1) --
    # confirm against SurrogateToolEvaluator.
    assert outcome["surrogate_tool_count"] == 2
    assert outcome["confidence"] in {"low", "medium"}
|