150 lines
4.5 KiB
Python
150 lines
4.5 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from beaver.engine.session.manager import SessionManager
|
|
from beaver.tasks.evidence import (
|
|
EvidenceBuilder,
|
|
RunEvidence,
|
|
TaskEvidencePacket,
|
|
ToolEvidence,
|
|
evaluate_node_evidence,
|
|
render_task_evidence,
|
|
)
|
|
|
|
|
|
def _run_evidence(*, tool_results: list[ToolEvidence] | None = None) -> RunEvidence:
    """Build a minimal ``RunEvidence`` fixture for the tests in this module.

    The run has fixed ids ("run-1" / "session-1"), an empty output text and a
    "stop" finish reason.

    Args:
        tool_results: Tool evidence to attach. The entries are copied into a
            fresh list; ``None`` (or an empty list) yields an empty list.

    Returns:
        The constructed ``RunEvidence``.
    """
    # Always hand RunEvidence its own list so callers' lists are never shared.
    copied_results = list(tool_results) if tool_results else []
    return RunEvidence(
        run_id="run-1",
        session_id="session-1",
        output_text="",
        finish_reason="stop",
        tool_results=copied_results,
    )
|
|
|
|
|
|
def test_evaluate_node_evidence_requires_successful_tool_result() -> None:
    """A tool result whose payload says ``success: False`` leaves ``tool_result`` missing."""
    failed_fetch = ToolEvidence(
        tool_name="web_fetch",
        tool_call_id="call-1",
        content="failed",
        event_payload={"success": False},
    )
    evidence = _run_evidence(tool_results=[failed_fetch])

    problems = evaluate_node_evidence(evidence, ["tool_result"], "done")

    assert problems == ["missing required evidence: tool_result"]
|
|
|
|
|
|
def test_evaluate_node_evidence_accepts_url_in_successful_tool_content() -> None:
    """A successful tool result whose content holds a URL satisfies both requirements."""
    successful_fetch = ToolEvidence(
        tool_name="web_fetch",
        tool_call_id="call-1",
        content="Source: https://example.test/report",
        event_payload={"success": True},
    )
    evidence = _run_evidence(tool_results=[successful_fetch])

    problems = evaluate_node_evidence(evidence, ["tool_result", "url"], "done")

    # An empty list means no requirement was reported as missing or unsupported.
    assert problems == []
|
|
|
|
|
|
def test_evaluate_node_evidence_checks_output_and_unknown_requirements() -> None:
    """Blank output is reported missing and an unrecognised requirement is reported unsupported.

    The run evidence has no tool results and an empty output text; the third
    argument passed to ``evaluate_node_evidence`` is a single space.
    """
    expected_problems = [
        "missing required evidence: output",
        "unsupported evidence requirement: unknown_type",
    ]
    evidence = _run_evidence()

    problems = evaluate_node_evidence(evidence, ["output", "unknown_type"], " ")

    assert problems == expected_problems
|
|
|
|
|
|
def test_evidence_builder_preserves_full_tool_result(tmp_path: Path) -> None:
    """Long tool content survives ``EvidenceBuilder`` untruncated and shows up when rendered.

    A session is seeded with a user message, a tool result whose content is
    over 700 characters long and ends in a distinctive marker, and a
    run-completed message. The built evidence must carry the tool content
    verbatim, and the rendered packet must include both the trailing marker
    and the URL from the tool result's event payload.
    """
    session_id = "session-1"
    run_id = "run-1"
    final_answer = "Manchester United won 3-2."
    # The marker sits at the very end, so any tail truncation would drop it.
    long_content = f"prefix {'x' * 700} MAN 3 FT 2 NFO"

    manager = SessionManager(tmp_path)
    manager.ensure_session(session_id, source="test")
    manager.append_message(
        session_id,
        run_id=run_id,
        role="user",
        event_type="user_message_added",
        content="score?",
    )
    manager.append_message(
        session_id,
        run_id=run_id,
        role="tool",
        event_type="tool_result_recorded",
        event_payload={"success": True, "url": "https://example.test/match"},
        content=long_content,
        tool_name="web_fetch",
        tool_call_id="call-1",
    )
    manager.append_message(
        session_id,
        run_id=run_id,
        role="system",
        event_type="run_completed",
        event_payload={"finish_reason": "stop"},
        content=final_answer,
        finish_reason="stop",
        context_visible=False,
    )

    builder = EvidenceBuilder(manager)
    evidence = builder.build_run_evidence(session_id, run_id, final_answer, "stop")
    packet = TaskEvidencePacket(
        task_id="task-1",
        attempt_index=1,
        main_run=evidence,
        team_runs=[],
        team_node_results=[],
        final_output=final_answer,
    )
    rendered = render_task_evidence(packet)

    assert evidence.tool_results[0].content == long_content
    assert "MAN 3 FT 2 NFO" in rendered
    assert "https://example.test/match" in rendered
|
|
|
|
|
|
def test_render_task_evidence_includes_failed_team_run_tool_results() -> None:
    """Rendering a packet with only a team run still surfaces that run's details.

    The team run finished with ``max_tool_iterations`` but carries one
    successful tool result. The rendered text must contain the run's warning,
    the packet's final output, the tool content, and the tool result's
    ``created_at`` value.
    """
    recovered_fetch = ToolEvidence(
        tool_name="web_fetch",
        tool_call_id="call-team",
        content="Recovered partial source content.",
        event_payload={"success": True, "created_at": "2026-05-22T12:00:00Z"},
        created_at="2026-05-22T12:00:00Z",
    )
    team_run = RunEvidence(
        run_id="run-team",
        session_id="session-team",
        output_text="Tool loop stopped.",
        finish_reason="max_tool_iterations",
        transcript=[],
        tool_results=[recovered_fetch],
        warnings=["finish_reason=max_tool_iterations"],
    )

    rendered = render_task_evidence(
        TaskEvidencePacket(
            task_id="task-1",
            attempt_index=2,
            main_run=None,  # no main run: only the team run contributes evidence
            team_runs=[team_run],
            team_node_results=[],
            final_output="partial answer",
        )
    )

    expected_fragments = (
        "finish_reason=max_tool_iterations",
        "partial answer",
        "Recovered partial source content.",
        "created_at=2026-05-22T12:00:00Z",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|