from __future__ import annotations

import asyncio

from beaver.skills.learning.surrogate import SurrogateToolEvaluator


def _surrogate_email_arm(arm: str, arguments: dict[str, str]) -> dict[str, object]:
    """Build one evaluation arm holding a single surrogate-mode send-email call."""
    email_call = {
        "tool_name": "mcp_outlook_send_email",
        "mode": "surrogate",
        "arguments": arguments,
    }
    return {"arm": arm, "tool_calls": [email_call]}


def test_surrogate_scores_complete_candidate_higher_than_missing_baseline() -> None:
    """Check that a fully-populated email call outscores one with blank fields.

    The baseline arm sends an email whose ``to`` and ``subject`` are empty
    strings and which has no ``body``; the candidate arm fills in all three.
    """
    evaluator = SurrogateToolEvaluator()

    # Baseline: required-looking fields are present but empty.
    incomplete_arm = _surrogate_email_arm("baseline", {"to": "", "subject": ""})
    # Candidate: recipient, subject and body are all supplied.
    complete_arm = _surrogate_email_arm(
        "candidate",
        {"to": "ada@example.com", "subject": "Status", "body": "Done"},
    )

    evaluation = evaluator.evaluate(
        task_text="Send a status email to Ada.",
        baseline=incomplete_arm,
        candidate=complete_arm,
    )
    # ``evaluate`` is awaited via asyncio.run, so it is driven to completion here.
    result = asyncio.run(evaluation)

    assert result["candidate_score"] > result["baseline_score"]
    # Each arm supplies one surrogate-mode call; presumably the count sums both arms.
    assert result["surrogate_tool_count"] == 2
    assert result["confidence"] in {"low", "medium"}