# Files
# beaver_project/app-instance/backend/tests/unit/test_agent_team_v1.py
#
# 952 lines
# 34 KiB
# Python

from __future__ import annotations
import asyncio
from pathlib import Path
from types import SimpleNamespace
import pytest
from beaver.memory.curated.snapshot import MemorySnapshot
from beaver.services.memory_service import MemoryService
from beaver.coordinator import AgentDescriptor, DelegationEnvelope, ExecutionGraph, ExecutionNode, NodeRunResult
from beaver.coordinator.execution.scheduler import TeamGraphScheduler
from beaver.coordinator.local import LocalAgentRunner
from beaver.engine import AgentLoop, EngineLoader
from beaver.engine.context import SkillContext
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.engine.providers.factory import ProviderBundle
from beaver.services.team_service import TeamService
from beaver.skills.assembler import SkillAssemblyResult
from beaver.skills.drafts import DraftService
from beaver.skills.publisher import SkillPublisher
from beaver.skills.reviews import ReviewService
from beaver.skills.specs import SkillSpecStore
class RecordingProvider(LLMProvider):
    """Stub LLM provider that replays canned responses and records each call.

    Every ``chat`` call stores the messages it received in ``calls`` and
    returns the next queued response, in the order they were supplied.
    """

    def __init__(self, responses: list[LLMResponse]) -> None:
        super().__init__()
        # Copy so that popping responses never mutates the caller's list.
        self.responses = list(responses)
        # One entry per chat() call, in call order.
        self.calls: list[list[dict]] = []

    async def chat(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
        model: str | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
    ) -> LLMResponse:
        """Record ``messages`` and return the next stubbed response.

        Raises:
            AssertionError: if all stubbed responses were already consumed.
        """
        # Store a shallow copy: if the caller keeps appending to the same list
        # after this call, the recorded entry must still show what was sent.
        self.calls.append(list(messages))
        if not self.responses:
            raise AssertionError("No stubbed provider responses left")
        return self.responses.pop(0)

    def get_default_model(self) -> str:
        """Return the fixed model name used by this stub."""
        return "stub-model"
class BlockingProvider(RecordingProvider):
    """Recording provider whose ``chat`` sets ``started`` and then waits on ``release``."""

    def __init__(self, content: str, started: asyncio.Event, release: asyncio.Event) -> None:
        canned = [_response(content)]
        super().__init__(canned)
        self.started, self.release = started, release

    async def chat(self, *args, **kwargs) -> LLMResponse:
        # Announce that the call began, then park until the test lets it go.
        self.started.set()
        await self.release.wait()
        reply = await super().chat(*args, **kwargs)
        return reply
class StubSkillAssembler:
    """Skill assembler stub that always reports a fixed set of activated skills."""

    def __init__(self, activated_skills: list[SkillContext] | None = None) -> None:
        self.activated_skills = list(activated_skills) if activated_skills else []

    async def assemble(self, **kwargs) -> SkillAssemblyResult:
        # Each result gets its own list so callers cannot mutate the stub's state.
        skills_copy = list(self.activated_skills)
        return SkillAssemblyResult(activated_skills=skills_copy)
class BlockingSkillAssembler:
    """Skill assembler stub that pauses assembly for the "task first" description.

    ``first_started`` is set when that assembly begins; it resumes once
    ``release_first`` is set. Any other description returns immediately.
    """

    def __init__(self) -> None:
        self.first_started = asyncio.Event()
        self.release_first = asyncio.Event()

    async def assemble(self, **kwargs) -> SkillAssemblyResult:
        description = kwargs["task_description"]
        if "task first" not in description:
            return SkillAssemblyResult()
        self.first_started.set()
        await self.release_first.wait()
        return SkillAssemblyResult()
class PerRunSnapshotMemoryService(MemoryService):
    """Memory service stub that numbers each per-run snapshot it hands out.

    ``capture_snapshot_for_run`` yields ``snapshot-1``, ``snapshot-2``, ...
    while ``get_snapshot`` always yields the ``shared-snapshot`` block.
    """

    def __init__(self, root: Path) -> None:
        super().__init__(root)
        self.count = 0

    def capture_snapshot_for_run(self) -> MemorySnapshot:
        self.count += 1
        numbered_block = f"# Memory\n\nsnapshot-{self.count}"
        return MemorySnapshot(memory_block=numbered_block, user_block=None)

    def get_snapshot(self) -> MemorySnapshot:
        shared_block = "# Memory\n\nshared-snapshot"
        return MemorySnapshot(memory_block=shared_block, user_block=None)
class CapturingRunner:
    """Runner stub that stores every envelope it receives and reports success."""

    def __init__(self) -> None:
        self.envelopes: list[DelegationEnvelope] = []

    async def run(self, envelope: DelegationEnvelope, **kwargs) -> NodeRunResult:
        self.envelopes.append(envelope)
        # Fall back to a placeholder id when the envelope carries none.
        resolved_id = envelope.node_id or "node"
        return NodeRunResult(node_id=resolved_id, success=True, output_text="done")
def _bundle(provider: RecordingProvider) -> ProviderBundle:
    """Wrap ``provider`` in a ProviderBundle with a stub runtime description."""
    stub_runtime = SimpleNamespace(model="stub-model", provider_name="stub")
    return ProviderBundle(main_runtime=stub_runtime, main_provider=provider)
def _loop(tmp_path: Path) -> AgentLoop:
    """Build an AgentLoop rooted at ``tmp_path`` with a no-op skill assembler."""
    loader = EngineLoader(
        workspace=tmp_path,
        skill_assembler=StubSkillAssembler(),
    )
    return AgentLoop(loader=loader)
def _loop_with_services(
    tmp_path: Path,
    *,
    skill_assembler,
    memory_service: MemoryService | None = None,
) -> AgentLoop:
    """Build an AgentLoop rooted at ``tmp_path`` using the supplied services."""
    loader = EngineLoader(
        workspace=tmp_path,
        skill_assembler=skill_assembler,
        memory_service=memory_service,
    )
    return AgentLoop(loader=loader)
def _response(content: str, *, finish_reason: str = "stop") -> LLMResponse:
    """Create a stub LLMResponse carrying ``content`` and ``finish_reason``."""
    fields = {
        "content": content,
        "finish_reason": finish_reason,
        "provider_name": "stub",
        "model": "stub-model",
    }
    return LLMResponse(**fields)
def _publish_skill(workspace: Path, *, skill_name: str, body: str) -> None:
    """Create, approve and publish a skill draft named ``skill_name`` in ``workspace``."""
    spec_store = SkillSpecStore(workspace)
    drafts = DraftService(spec_store)
    new_draft = drafts.create_new_skill_draft(
        skill_name=skill_name,
        proposed_content=body,
        proposed_frontmatter={"description": f"{skill_name} test skill", "tools": []},
        created_by="tester",
        reason="test",
    )
    # Run the draft through review, then publication.
    reviews = ReviewService(spec_store)
    reviews.approve(skill_name, new_draft.draft_id, reviewer="tester", notes="ok")
    publisher = SkillPublisher(spec_store)
    publisher.publish(skill_name, new_draft.draft_id, publisher="tester", notes="publish")
def test_local_agent_runner_uses_shared_loop_and_records_parent_task(tmp_path: Path) -> None:
    """A delegated run succeeds, is recorded under the parent task, and links its session to the parent."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("sub-agent result")])
    researcher = AgentDescriptor(name="researcher", role="research")
    envelope = DelegationEnvelope(
        parent_task_id="task-parent",
        parent_session_id="session-root",
        parent_run_id="run-root",
        agent=researcher,
        task="research the requested topic",
        node_id="research",
    )

    runner = LocalAgentRunner(agent_loop)
    result = asyncio.run(runner.run(envelope, provider_bundle=_bundle(stub)))

    loaded = agent_loop.boot()
    latest_run = loaded.run_memory_store.list_runs()[-1]  # type: ignore[union-attr]
    child_session = loaded.session_manager.get_session(result.session_id)  # type: ignore[union-attr,arg-type]

    assert result.success is True
    assert result.completion_status == "succeeded"
    assert result.evidence_gaps == []
    assert latest_run.task_id == "task-parent"
    assert child_session["parent_session_id"] == "session-root"
def test_node_without_required_tool_result_is_partial(tmp_path: Path) -> None:
    """A node that requires ``tool_result`` evidence ends up partial when the stub only returns text."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("collected narrative")])
    envelope = DelegationEnvelope(
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id=None,
        agent=AgentDescriptor(name="collect"),
        task="collect",
        node_id="collect",
        required_evidence=["tool_result"],
    )

    runner = LocalAgentRunner(agent_loop)
    outcome = asyncio.run(runner.run(envelope, provider_bundle=_bundle(stub)))

    assert outcome.success is False
    assert outcome.completion_status == "partial"
    assert outcome.evidence_gaps == ["missing required evidence: tool_result"]
def test_node_with_required_nonempty_output_succeeds(tmp_path: Path) -> None:
    """A node that requires ``output`` evidence succeeds when the stub returns non-empty text."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("verified output")])
    envelope = DelegationEnvelope(
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id=None,
        agent=AgentDescriptor(name="verify"),
        task="verify",
        node_id="verify",
        required_evidence=["output"],
    )

    runner = LocalAgentRunner(agent_loop)
    outcome = asyncio.run(runner.run(envelope, provider_bundle=_bundle(stub)))

    assert outcome.success is True
    assert outcome.completion_status == "succeeded"
    assert outcome.evidence_gaps == []
def test_unknown_evidence_requirement_makes_node_partial(tmp_path: Path) -> None:
    """An unrecognised evidence requirement is reported as a gap and makes the node partial."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("output")])
    envelope = DelegationEnvelope(
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id=None,
        agent=AgentDescriptor(name="verify"),
        task="verify",
        node_id="verify",
        required_evidence=["unknown_type"],
    )

    runner = LocalAgentRunner(agent_loop)
    outcome = asyncio.run(runner.run(envelope, provider_bundle=_bundle(stub)))

    assert outcome.success is False
    assert outcome.completion_status == "partial"
    assert outcome.evidence_gaps == ["unsupported evidence requirement: unknown_type"]
def test_team_node_preserves_evidence_when_finish_reason_is_not_stop(tmp_path: Path) -> None:
    """A run ending with ``max_tool_iterations`` fails but still exposes its output as evidence."""
    agent_loop = _loop(tmp_path)
    truncated = _response("partial evidence", finish_reason="max_tool_iterations")
    stub = RecordingProvider([truncated])
    envelope = DelegationEnvelope(
        parent_task_id="task-parent",
        parent_session_id="session-root",
        parent_run_id="run-root",
        agent=AgentDescriptor(name="researcher", role="research"),
        task="research the requested topic",
        node_id="research",
    )

    runner = LocalAgentRunner(agent_loop)
    outcome = asyncio.run(runner.run(envelope, provider_bundle=_bundle(stub)))

    assert outcome.success is False
    assert outcome.evidence is not None
    assert outcome.evidence.output_text == "partial evidence"
    assert outcome.evidence.finish_reason == "max_tool_iterations"
def test_pinned_skill_is_injected_into_delegated_run(tmp_path: Path) -> None:
    """A published skill pinned on the envelope reaches the prompt and is recorded in a receipt."""
    # Publish before the loop is built, as the original ordering does.
    _publish_skill(
        tmp_path,
        skill_name="review-check",
        body="# Review Check\n\nAlways mention the pinned review checklist.\n",
    )
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("done")])
    envelope = DelegationEnvelope(
        parent_task_id="task-parent",
        parent_session_id="session-root",
        parent_run_id="run-root",
        agent=AgentDescriptor(name="reviewer"),
        task="review the work",
        inherited_pinned_skills=["review-check"],
        node_id="review",
    )

    runner = LocalAgentRunner(agent_loop)
    result = asyncio.run(runner.run(envelope, provider_bundle=_bundle(stub)))

    loaded = agent_loop.boot()
    events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id)  # type: ignore[union-attr,arg-type]
    skill_events = [record for record in events if record.event_type == "skill_activation_snapshotted"]

    second_message = stub.calls[0][1]
    assert "Always mention the pinned review checklist" in second_message["content"]
    assert skill_events
    first_receipt = skill_events[0].event_payload["receipts"][0]
    assert first_receipt["skill_name"] == "review-check"
    assert first_receipt["activation_reason"] == "pinned_delegation"
def test_ephemeral_pinned_skill_context_is_injected_into_delegated_run(tmp_path: Path) -> None:
    """An in-memory pinned SkillContext reaches the prompt and is recorded in a receipt.

    The skill is passed directly on the envelope rather than published to
    the workspace; the receipt must carry its name, version and reason.
    """
    loop = _loop(tmp_path)
    provider = RecordingProvider([_response("done")])
    envelope = DelegationEnvelope(
        parent_task_id="task-parent",
        parent_session_id="session-root",
        parent_run_id="run-root",
        agent=AgentDescriptor(name="api_review"),
        task="review the API",
        inherited_pinned_skill_contexts=[
            SkillContext(
                name="draft:api-review",
                content="Always mention schema compatibility.",
                version="draft:draft-1",
                content_hash="hash",
                activation_reason="generated_missing_skill",
            )
        ],
        node_id="api_review",
    )

    result = asyncio.run(LocalAgentRunner(loop).run(envelope, provider_bundle=_bundle(provider)))

    loaded = loop.boot()
    events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id)  # type: ignore[union-attr,arg-type]
    skill_events = [event for event in events if event.event_type == "skill_activation_snapshotted"]

    assert "Always mention schema compatibility" in provider.calls[0][1]["content"]
    # Guard before indexing so a missing event fails as an assertion, not an
    # IndexError; this matches the published-skill variant of this test.
    assert skill_events
    receipts = skill_events[0].event_payload["receipts"]
    assert receipts[0]["skill_name"] == "draft:api-review"
    assert receipts[0]["skill_version"] == "draft:draft-1"
    assert receipts[0]["activation_reason"] == "generated_missing_skill"
def test_team_sequence_passes_prior_outputs(tmp_path: Path) -> None:
    """In a sequence graph the second node is shown the first node's output."""
    agent_loop = _loop(tmp_path)
    providers = {
        "first": RecordingProvider([_response("first output")]),
        "second": RecordingProvider([_response("second output")]),
    }
    step_one = ExecutionNode("first", "step one", AgentDescriptor(name="a"))
    step_two = ExecutionNode("second", "step two", AgentDescriptor(name="b"))
    graph = ExecutionGraph(strategy="sequence", nodes=[step_one, step_two])

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id="run-root",
        provider_bundle_factory=lambda node: _bundle(providers[node.node_id]),
    )
    result = asyncio.run(team_run)

    assert result.success is True
    assert result.summary == "first output\n\nsecond output"
    second_first_message = providers["second"].calls[0][0]
    assert "Dependency first output:\nfirst output" in second_first_message["content"]
def test_partial_node_allows_downstream_by_default(tmp_path: Path) -> None:
    """A partial upstream node still lets the next node run and see its output."""
    agent_loop = _loop(tmp_path)
    providers = {
        "collect": RecordingProvider([_response("partial source notes")]),
        "extract": RecordingProvider([_response("extracted metrics")]),
    }
    collect_node = ExecutionNode(
        "collect",
        "collect",
        AgentDescriptor(name="collect"),
        required_evidence=["tool_result"],
    )
    extract_node = ExecutionNode("extract", "extract", AgentDescriptor(name="extract"))
    graph = ExecutionGraph(strategy="sequence", nodes=[collect_node, extract_node])

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        provider_bundle_factory=lambda node: _bundle(providers[node.node_id]),
    )
    result = asyncio.run(team_run)

    collected, extracted = result.node_results[0], result.node_results[1]
    assert collected.completion_status == "partial"
    assert extracted.completion_status == "succeeded"
    extract_first_message = providers["extract"].calls[0][0]
    assert "Dependency collect output:\npartial source notes" in extract_first_message["content"]
def test_partial_node_blocks_downstream_when_configured(tmp_path: Path) -> None:
    """With ``block_downstream_on_partial`` set, a partial node blocks the next node from running."""
    agent_loop = _loop(tmp_path)
    providers = {
        "collect": RecordingProvider([_response("partial source notes")]),
        "extract": RecordingProvider([_response("must not run")]),
    }
    collect_node = ExecutionNode(
        "collect",
        "collect",
        AgentDescriptor(name="collect"),
        required_evidence=["tool_result"],
        block_downstream_on_partial=True,
    )
    extract_node = ExecutionNode("extract", "extract", AgentDescriptor(name="extract"))
    graph = ExecutionGraph(strategy="sequence", nodes=[collect_node, extract_node])

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        provider_bundle_factory=lambda node: _bundle(providers[node.node_id]),
    )
    result = asyncio.run(team_run)

    collected, extracted = result.node_results[0], result.node_results[1]
    assert collected.completion_status == "partial"
    assert extracted.completion_status == "blocked"
    # The blocked node's provider must never have been called.
    assert providers["extract"].calls == []
def test_scheduler_copies_task_two_contract_fields_to_envelope() -> None:
    """Contract fields set on an ExecutionNode appear unchanged on the envelope given to the runner."""
    capture = CapturingRunner()
    collect_node = ExecutionNode(
        "collect",
        "collect",
        AgentDescriptor(name="collect"),
        input_contract={"query": "str"},
        output_contract={"sources": "list"},
        required_evidence=["tool_result"],
        evidence_contract={"entities": ["MGM"]},
        validation_rules=["official_sources_only"],
        required_for_completion=False,
        block_downstream_on_partial=True,
        max_tool_iterations=2,
    )
    graph = ExecutionGraph(strategy="sequence", nodes=[collect_node])

    scheduler = TeamGraphScheduler(capture)  # type: ignore[arg-type]
    asyncio.run(
        scheduler.run(
            graph,
            parent_task_id=None,
            parent_session_id="session-root",
        )
    )

    sent = capture.envelopes[0]
    assert sent.input_contract == {"query": "str"}
    assert sent.output_contract == {"sources": "list"}
    assert sent.required_evidence == ["tool_result"]
    assert sent.evidence_contract == {"entities": ["MGM"]}
    assert sent.validation_rules == ["official_sources_only"]
    assert sent.required_for_completion is False
    assert sent.block_downstream_on_partial is True
    assert sent.max_tool_iterations == 2
def test_team_parallel_runs_all_nodes(tmp_path: Path) -> None:
    """A parallel graph asks the factory for every node and reports results in node order."""
    agent_loop = _loop(tmp_path)
    providers = {
        "one": RecordingProvider([_response("one")]),
        "two": RecordingProvider([_response("two")]),
        "three": RecordingProvider([_response("three")]),
    }
    factory_calls: list[str] = []

    def bundle_for(node):
        # Note which node asked for a bundle, then hand out its stub provider.
        factory_calls.append(node.node_id)
        return _bundle(providers[node.node_id])

    graph = ExecutionGraph(
        strategy="parallel",
        nodes=[
            ExecutionNode("one", "task one", AgentDescriptor(name="one")),
            ExecutionNode("two", "task two", AgentDescriptor(name="two")),
            ExecutionNode("three", "task three", AgentDescriptor(name="three")),
        ],
    )

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id="run-root",
        provider_bundle_factory=bundle_for,
    )
    result = asyncio.run(team_run)

    assert result.success is True
    assert sorted(factory_calls) == ["one", "three", "two"]
    assert result.run_ids and len(result.run_ids) == 3
    outputs = [node_result.output_text for node_result in result.node_results]
    assert outputs == ["one", "two", "three"]
def test_team_parallel_starts_nodes_concurrently_with_isolated_loops(tmp_path: Path) -> None:
    """Both parallel nodes reach their provider call before either one is released."""
    agent_loop = _loop(tmp_path)
    first_started = asyncio.Event()
    second_started = asyncio.Event()
    release = asyncio.Event()
    providers = {
        "one": BlockingProvider("one", first_started, release),
        "two": BlockingProvider("two", second_started, release),
    }
    graph = ExecutionGraph(
        strategy="parallel",
        nodes=[
            ExecutionNode("one", "task one", AgentDescriptor(name="one")),
            ExecutionNode("two", "task two", AgentDescriptor(name="two")),
        ],
    )

    async def run_case():
        # Start the shared loop, yield once, then launch the team run.
        loop_task = asyncio.create_task(agent_loop.run())
        await asyncio.sleep(0)
        team_task = asyncio.create_task(
            TeamService(agent_loop).run_team(
                graph,
                parent_task_id=None,
                parent_session_id="session-root",
                parent_run_id="run-root",
                provider_bundle_factory=lambda node: _bundle(providers[node.node_id]),
            )
        )
        try:
            # Both providers must have started while still being held.
            for started in (first_started, second_started):
                await asyncio.wait_for(started.wait(), timeout=1)
            release.set()
            return await team_task
        finally:
            # Unblock the providers and tear everything down on every path.
            release.set()
            if not team_task.done():
                team_task.cancel()
                try:
                    await team_task
                except asyncio.CancelledError:
                    pass
            await agent_loop.stop()
            await loop_task

    result = asyncio.run(run_case())

    assert result.success is True
    node_ids = [node_result.node_id for node_result in result.node_results]
    assert node_ids == ["one", "two"]
def test_parallel_node_factory_error_is_normalized_and_keeps_completed_runs(tmp_path: Path) -> None:
    """A factory error for one parallel node becomes an error result while the other node's run is kept."""
    agent_loop = _loop(tmp_path)
    loaded = agent_loop.boot()
    parent = loaded.task_service.create_task(session_id="session-root", description="parent task")  # type: ignore[union-attr]
    providers = {
        "ok": RecordingProvider([_response("ok output")]),
    }
    graph = ExecutionGraph(
        strategy="parallel",
        nodes=[
            ExecutionNode("ok", "task ok", AgentDescriptor(name="ok")),
            ExecutionNode("bad", "task bad", AgentDescriptor(name="bad")),
        ],
    )

    def factory(node: ExecutionNode) -> ProviderBundle:
        if node.node_id != "bad":
            return _bundle(providers[node.node_id])
        raise RuntimeError("factory failed")

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=parent.task_id,
        parent_session_id=parent.session_id,
        parent_run_id="run-root",
        provider_bundle_factory=factory,
    )
    result = asyncio.run(team_run)

    bad_matches = [node_result for node_result in result.node_results if node_result.node_id == "bad"]
    bad = bad_matches[0]
    stored_task = loaded.task_service.get_task(parent.task_id)  # type: ignore[union-attr]

    assert result.success is False
    assert bad.finish_reason == "error"
    assert bad.error == "factory failed"
    assert result.run_ids and len(result.run_ids) == 1
    assert stored_task is not None
    assert stored_task.run_ids == result.run_ids
    assert "ok output" in result.summary
    assert "Failed nodes:\n- bad: factory failed" in result.summary
def test_team_dag_blocks_dependents_after_failure(tmp_path: Path) -> None:
    """When a DAG node fails, its dependent is reported as blocked and never gets a run id."""
    agent_loop = _loop(tmp_path)
    providers = {
        "prepare": RecordingProvider([_response("ok")]),
        "validate": RecordingProvider([_response("failed", finish_reason="error")]),
    }
    prepare_node = ExecutionNode("prepare", "prepare", AgentDescriptor(name="prep"))
    validate_node = ExecutionNode("validate", "validate", AgentDescriptor(name="validator"), depends_on=["prepare"])
    publish_node = ExecutionNode("publish", "publish", AgentDescriptor(name="publisher"), depends_on=["validate"])
    graph = ExecutionGraph(strategy="dag", nodes=[prepare_node, validate_node, publish_node])

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id="run-root",
        provider_bundle_factory=lambda node: _bundle(providers[node.node_id]),
    )
    result = asyncio.run(team_run)

    publish_matches = [node_result for node_result in result.node_results if node_result.node_id == "publish"]
    publish = publish_matches[0]
    validate_matches = [node_result for node_result in result.node_results if node_result.node_id == "validate"]
    validate = validate_matches[0]

    assert result.success is False
    assert validate.completion_status == "failed"
    assert publish.finish_reason == "blocked"
    assert publish.completion_status == "blocked"
    assert publish.run_id is None
    assert publish.error == "Blocked by failed dependency: validate"
    # The text before the failure listing must not mention the failed output.
    before_failures = result.summary.split("Failed nodes:")[0]
    assert "failed" not in before_failures
    assert "- validate: failed" in result.summary
    assert "- publish: Blocked by failed dependency: validate" in result.summary
def test_dag_node_factory_error_blocks_dependents(tmp_path: Path) -> None:
    """A factory error on a DAG node marks it failed and blocks the node depending on it."""
    agent_loop = _loop(tmp_path)
    providers = {
        "prepare": RecordingProvider([_response("prepared")]),
    }
    prepare_node = ExecutionNode("prepare", "prepare", AgentDescriptor(name="prep"))
    validate_node = ExecutionNode("validate", "validate", AgentDescriptor(name="validator"), depends_on=["prepare"])
    publish_node = ExecutionNode("publish", "publish", AgentDescriptor(name="publisher"), depends_on=["validate"])
    graph = ExecutionGraph(strategy="dag", nodes=[prepare_node, validate_node, publish_node])

    def factory(node: ExecutionNode) -> ProviderBundle:
        if node.node_id != "validate":
            return _bundle(providers[node.node_id])
        raise RuntimeError("validator unavailable")

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id="run-root",
        provider_bundle_factory=factory,
    )
    result = asyncio.run(team_run)

    validate_matches = [node_result for node_result in result.node_results if node_result.node_id == "validate"]
    validate = validate_matches[0]
    publish_matches = [node_result for node_result in result.node_results if node_result.node_id == "publish"]
    publish = publish_matches[0]

    assert result.success is False
    assert validate.finish_reason == "error"
    assert validate.completion_status == "failed"
    assert validate.error == "validator unavailable"
    assert publish.finish_reason == "blocked"
    assert publish.completion_status == "blocked"
    assert publish.error == "Blocked by failed dependency: validate"
def test_provider_bundle_with_node_model_override_is_normalized_by_team_service(tmp_path: Path) -> None:
    """TeamService reports a shared bundle plus node-level model as an error result instead of raising."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("unused")])
    specialist = ExecutionNode("specialist", "work", AgentDescriptor(name="specialist", model="special-model"))
    graph = ExecutionGraph(strategy="sequence", nodes=[specialist])

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        provider_bundle=_bundle(stub),
    )
    result = asyncio.run(team_run)

    only_result = result.node_results[0]
    assert result.success is False
    assert only_result.finish_reason == "error"
    assert "provider_bundle cannot be combined" in (result.node_results[0].error or "")
def test_team_summary_lists_only_failed_nodes_when_all_nodes_fail(tmp_path: Path) -> None:
    """When every node fails in the factory, the summary is exactly the failed-node listing."""
    agent_loop = _loop(tmp_path)
    graph = ExecutionGraph(
        strategy="parallel",
        nodes=[
            ExecutionNode("one", "task one", AgentDescriptor(name="one")),
            ExecutionNode("two", "task two", AgentDescriptor(name="two")),
        ],
    )

    def factory(node: ExecutionNode) -> ProviderBundle:
        raise RuntimeError(f"{node.node_id} down")

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=None,
        parent_session_id="session-root",
        provider_bundle_factory=factory,
    )
    result = asyncio.run(team_run)

    assert result.success is False
    assert result.summary == "Failed nodes:\n- one: one down evidence=no\n- two: two down evidence=no"
def test_graph_structure_errors_still_raise(tmp_path: Path) -> None:
    """Reserved strategies, unknown dependencies and dependency cycles raise from ``run_team``."""
    agent_loop = _loop(tmp_path)

    reserved = ExecutionGraph(
        strategy="moa",
        nodes=[ExecutionNode("node", "task", AgentDescriptor(name="node"))],
    )
    with pytest.raises(NotImplementedError, match="reserved"):
        asyncio.run(TeamService(agent_loop).run_team(reserved, parent_task_id=None, parent_session_id="session-root"))

    unknown_dependency = ExecutionGraph(
        strategy="dag",
        nodes=[ExecutionNode("node", "task", AgentDescriptor(name="node"), depends_on=["missing"])],
    )
    with pytest.raises(ValueError, match="unknown node"):
        asyncio.run(TeamService(agent_loop).run_team(unknown_dependency, parent_task_id=None, parent_session_id="session-root"))

    node_a = ExecutionNode("a", "task a", AgentDescriptor(name="a"), depends_on=["b"])
    node_b = ExecutionNode("b", "task b", AgentDescriptor(name="b"), depends_on=["a"])
    cyclic = ExecutionGraph(strategy="dag", nodes=[node_a, node_b])
    with pytest.raises(ValueError, match="cyclic or unresolved dependencies"):
        asyncio.run(TeamService(agent_loop).run_team(cyclic, parent_task_id=None, parent_session_id="session-root"))
def test_execution_node_contract_defaults_preserve_legacy_scope_behavior() -> None:
    """An ExecutionNode built from only id, task and agent has empty or unset contract fields."""
    bare = ExecutionNode("collect", "Collect sources", AgentDescriptor(name="collect"))

    # Mapping-valued contracts default to empty dicts.
    assert bare.input_contract == {}
    assert bare.output_contract == {}
    assert bare.evidence_contract == {}
    # List-valued contracts default to empty lists.
    assert bare.required_evidence == []
    assert bare.validation_rules == []
    # Optional limits stay unset.
    assert bare.allowed_tool_names is None
    assert bare.max_tool_iterations is None
    # Completion flags.
    assert bare.required_for_completion is True
    assert bare.block_downstream_on_partial is False
def test_execution_node_keeps_explicit_empty_tool_scope_distinct_from_unspecified_scope() -> None:
    """``allowed_tool_names=[]`` stays an empty list and is not collapsed to ``None``."""
    tool_free = ExecutionNode(
        "tool_free",
        "Synthesize",
        AgentDescriptor(name="tool_free"),
        allowed_tool_names=[],
    )
    unrestricted = ExecutionNode("unrestricted", "Collect", AgentDescriptor(name="unrestricted"))

    assert unrestricted.allowed_tool_names is None
    assert tool_free.allowed_tool_names == []
def test_delegation_envelope_and_node_result_preserve_new_contract_metadata() -> None:
    """Contract metadata survives on the envelope and in ``NodeRunResult.to_dict()``."""
    gaps = ["missing required evidence: Galaxy official source"]
    result = NodeRunResult(
        node_id="collect",
        success=False,
        output_text="MGM source only",
        completion_status="partial",
        evidence_gaps=gaps,
    )
    envelope = DelegationEnvelope(
        parent_task_id="task-parent",
        parent_session_id="session-root",
        parent_run_id="run-root",
        agent=AgentDescriptor(name="collect"),
        task="Collect sources",
        allowed_tool_names=["web_search"],
        required_evidence=["url"],
        evidence_contract={"entities": ["MGM", "Galaxy"]},
        validation_rules=["official_sources_only"],
        required_for_completion=True,
        block_downstream_on_partial=True,
        max_tool_iterations=2,
    )

    assert envelope.allowed_tool_names == ["web_search"]
    assert envelope.evidence_contract == {"entities": ["MGM", "Galaxy"]}
    assert result.to_dict()["completion_status"] == "partial"
    assert result.to_dict()["evidence_gaps"] == ["missing required evidence: Galaxy official source"]
def test_graph_rejects_depth_above_configured_limit() -> None:
    """A three-node dependency chain is rejected by ``validate(max_depth=2)``."""
    node_a = ExecutionNode("a", "A", AgentDescriptor(name="a"))
    node_b = ExecutionNode("b", "B", AgentDescriptor(name="b"), depends_on=["a"])
    node_c = ExecutionNode("c", "C", AgentDescriptor(name="c"), depends_on=["b"])
    chain = ExecutionGraph(strategy="dag", nodes=[node_a, node_b, node_c])

    with pytest.raises(ValueError, match="max depth"):
        chain.validate(max_depth=2)
def test_team_run_does_not_create_independent_team_task(tmp_path: Path) -> None:
    """A team run attaches its runs to the parent task and creates no additional task."""
    agent_loop = _loop(tmp_path)
    loaded = agent_loop.boot()
    parent = loaded.task_service.create_task(session_id="session-root", description="parent task")  # type: ignore[union-attr]
    stub = RecordingProvider([_response("child output")])
    child_node = ExecutionNode("child", "child task", AgentDescriptor(name="child"))
    graph = ExecutionGraph(strategy="sequence", nodes=[child_node])

    team_run = TeamService(agent_loop).run_team(
        graph,
        parent_task_id=parent.task_id,
        parent_session_id=parent.session_id,
        parent_run_id="run-root",
        provider_bundle=_bundle(stub),
    )
    result = asyncio.run(team_run)

    tasks = loaded.task_service.store.list_tasks()  # type: ignore[union-attr]
    latest_run = loaded.run_memory_store.list_runs()[-1]  # type: ignore[union-attr]

    assert result.task_id == parent.task_id
    task_ids = [stored.task_id for stored in tasks]
    assert task_ids == [parent.task_id]
    assert tasks[0].run_ids == result.run_ids
    assert latest_run.task_id == parent.task_id
def test_parallel_nodes_use_independent_memory_snapshots(tmp_path: Path) -> None:
    """Each parallel node gets its own per-run memory snapshot in its first message.

    The "first" node is held inside skill assembly until the test releases
    it. Both nodes must then show their own numbered snapshot, and neither
    may show the block returned by ``get_snapshot``.
    """
    skill_assembler = BlockingSkillAssembler()
    memory_service = PerRunSnapshotMemoryService(tmp_path / "memory" / "curated")
    memory_service.initialize()
    loop = _loop_with_services(tmp_path, skill_assembler=skill_assembler, memory_service=memory_service)
    providers = {
        "first": RecordingProvider([_response("first")]),
        "second": RecordingProvider([_response("second")]),
    }
    graph = ExecutionGraph(
        strategy="parallel",
        nodes=[
            ExecutionNode("first", "task first", AgentDescriptor(name="first")),
            ExecutionNode("second", "task second", AgentDescriptor(name="second")),
        ],
    )

    async def run_team() -> None:
        task = asyncio.create_task(
            TeamService(loop).run_team(
                graph,
                parent_task_id=None,
                parent_session_id="session-root",
                provider_bundle_factory=lambda node: _bundle(providers[node.node_id]),
            )
        )
        try:
            # Bounded wait: if the "first" node never reaches skill assembly
            # the test must fail promptly rather than hang the whole suite.
            await asyncio.wait_for(skill_assembler.first_started.wait(), timeout=5)
        finally:
            # Always unblock the held node so the team run can finish, or be
            # torn down cleanly by asyncio.run, even when the wait failed.
            skill_assembler.release_first.set()
        await task

    asyncio.run(run_team())

    first_system = providers["first"].calls[0][0]["content"]
    second_system = providers["second"].calls[0][0]["content"]
    assert "snapshot-1" in first_system
    assert "snapshot-2" in second_system
    assert "shared-snapshot" not in first_system
    assert "shared-snapshot" not in second_system
def test_provider_bundle_with_node_model_override_is_rejected(tmp_path: Path) -> None:
    """LocalAgentRunner raises when a bundle is passed for an agent that names its own model."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("unused")])
    specialist = AgentDescriptor(name="specialist", model="special-model")
    envelope = DelegationEnvelope(
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id=None,
        agent=specialist,
        task="work",
        node_id="specialist",
    )

    with pytest.raises(ValueError, match="provider_bundle cannot be combined"):
        asyncio.run(LocalAgentRunner(agent_loop).run(envelope, provider_bundle=_bundle(stub)))
def test_node_level_model_without_bundle_reaches_provider_resolution(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """Without an explicit bundle, the agent's model and provider name are passed to ``make_provider_bundle``."""
    captured: dict[str, str | None] = {}
    stub = RecordingProvider([_response("node model used")])

    def fake_make_provider_bundle(**kwargs):
        # Capture the two resolution inputs this test checks.
        for key in ("model", "provider_name"):
            captured[key] = kwargs.get(key)
        return _bundle(stub)

    monkeypatch.setattr("beaver.engine.loop.make_provider_bundle", fake_make_provider_bundle)
    agent_loop = _loop(tmp_path)
    envelope = DelegationEnvelope(
        parent_task_id=None,
        parent_session_id="session-root",
        parent_run_id=None,
        agent=AgentDescriptor(name="specialist", model="special-model", provider_name="custom"),
        task="work",
        node_id="specialist",
    )

    runner = LocalAgentRunner(agent_loop)
    result = asyncio.run(runner.run(envelope))

    assert result.success is True
    assert captured == {"model": "special-model", "provider_name": "custom"}
def test_unknown_parent_task_is_rejected_before_any_run(tmp_path: Path) -> None:
    """An unknown ``parent_task_id`` raises and leaves the run store empty."""
    agent_loop = _loop(tmp_path)
    stub = RecordingProvider([_response("unused")])
    child_node = ExecutionNode("child", "child task", AgentDescriptor(name="child"))
    graph = ExecutionGraph(strategy="sequence", nodes=[child_node])

    with pytest.raises(ValueError, match="Unknown parent_task_id"):
        asyncio.run(
            TeamService(agent_loop).run_team(
                graph,
                parent_task_id="missing-task",
                parent_session_id="session-root",
                provider_bundle=_bundle(stub),
            )
        )

    loaded = agent_loop.boot()
    recorded_runs = loaded.run_memory_store.list_runs()  # type: ignore[union-attr]
    assert recorded_runs == []
def test_parent_task_session_mismatch_is_rejected(tmp_path: Path) -> None:
    """A parent task created in one session cannot be used with a different ``parent_session_id``."""
    agent_loop = _loop(tmp_path)
    loaded = agent_loop.boot()
    parent = loaded.task_service.create_task(session_id="session-root", description="parent task")  # type: ignore[union-attr]
    stub = RecordingProvider([_response("unused")])
    child_node = ExecutionNode("child", "child task", AgentDescriptor(name="child"))
    graph = ExecutionGraph(strategy="sequence", nodes=[child_node])

    with pytest.raises(ValueError, match="belongs to session"):
        asyncio.run(
            TeamService(agent_loop).run_team(
                graph,
                parent_task_id=parent.task_id,
                parent_session_id="other-session",
                provider_bundle=_bundle(stub),
            )
        )