935 lines
34 KiB
Python
935 lines
34 KiB
Python
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
from beaver.coordinator import AgentDescriptor, ExecutionGraph, ExecutionNode
|
|
from beaver.engine import EngineLoader
|
|
from beaver.engine.context.builder import ContextBuilder, ContextBuildInput
|
|
from beaver.engine.providers.base import LLMProvider, LLMResponse
|
|
from beaver.engine.providers.factory import ProviderBundle
|
|
from beaver.services.agent_service import AgentService
|
|
from beaver.skills.assembler import SkillAssemblyResult
|
|
from beaver.tasks import TaskExecutionPlan, TaskRecord, TaskService, ValidationResult, ValidationService
|
|
|
|
|
|
class StubProvider(LLMProvider):
    """Provider double that replays canned responses and records every chat call."""

    def __init__(self, responses: list[LLMResponse]) -> None:
        super().__init__()
        # Work on a private copy so popping never consumes the caller's list.
        self._responses = [*responses]
        self.calls: list[dict[str, object]] = []

    async def chat(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
        model: str | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
    ) -> LLMResponse:
        """Record the call and hand back the next queued response.

        Only ``messages``, ``tools`` and ``model`` are recorded in ``calls``.

        Raises:
            AssertionError: when no stubbed responses remain.
        """
        recorded = {"messages": messages, "tools": tools, "model": model}
        self.calls.append(recorded)
        if self._responses:
            return self._responses.pop(0)
        raise AssertionError("No stubbed provider responses left")

    def get_default_model(self) -> str:
        """Return the fixed model name used by the stub."""
        return "stub-model"
|
|
|
|
|
|
class StubValidationService:
    """Validation double that returns queued results in order and logs its kwargs."""

    def __init__(self, results: list[ValidationResult]) -> None:
        # Copy so the caller's list is left intact while results are popped.
        self.results = [*results]
        self.calls: list[dict] = []

    async def validate_task_result(self, **kwargs) -> ValidationResult:
        """Record ``kwargs`` and return the next queued validation result.

        Raises:
            AssertionError: when the queue of stubbed results is empty.
        """
        self.calls.append(kwargs)
        if self.results:
            return self.results.pop(0)
        raise AssertionError("No stubbed validation results left")
|
|
|
|
|
|
class StubTaskExecutionPlanner:
    """Planner double that serves queued plans and keeps repeating the last one."""

    def __init__(self, plans: list[TaskExecutionPlan] | None = None) -> None:
        # A missing or empty ``plans`` falls back to one single-mode plan.
        self.plans = list(plans or [TaskExecutionPlan.single("test-single")])
        self.calls = []

    async def plan(self, **kwargs) -> TaskExecutionPlan:
        """Record ``kwargs`` and return the next plan.

        The final remaining plan is never popped, so it is returned for every
        later call.

        Raises:
            AssertionError: when ``plans`` is empty.
        """
        self.calls.append(kwargs)
        remaining = len(self.plans)
        if remaining == 0:
            raise AssertionError("No stubbed execution plans left")
        if remaining == 1:
            return self.plans[0]
        return self.plans.pop(0)
|
|
|
|
|
|
class FakeLearningCandidate:
    """Minimal learning-candidate stand-in exposing only ``to_dict``."""

    def to_dict(self) -> dict:
        """Return a fixed payload describing one open ``new_skill`` candidate."""
        payload = {"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"}
        return payload
|
|
|
|
|
|
class RecordingSkillAssembler:
    """Skill-assembler double that remembers each ``task_description`` it receives."""

    def __init__(self) -> None:
        self.task_descriptions: list[str] = []

    async def assemble(self, **kwargs) -> SkillAssemblyResult:
        """Store ``kwargs["task_description"]`` and return an empty assembly result."""
        description = kwargs["task_description"]
        self.task_descriptions.append(description)
        return SkillAssemblyResult()
|
|
|
|
|
|
def _route_response(action: str = "new_task", short_title: str = "Test task") -> LLMResponse:
    """Build a stub response whose content is a JSON routing decision."""
    # The JSON is hand-formatted so the payload text stays exactly as the tests expect.
    routing_json = f'{{"action":"{action}","reason":"test route","short_title":"{short_title}"}}'
    return LLMResponse(
        content=routing_json,
        finish_reason="stop",
        provider_name="stub",
        model="stub-model",
    )
|
|
|
|
|
|
def _bundle(*responses: str, route_action: str = "new_task") -> ProviderBundle:
    """Build a bundle with a scripted main provider and a one-shot routing provider.

    Each entry of ``responses`` becomes one main-provider reply, in order; the
    auxiliary provider holds a single routing reply for ``route_action``.
    """
    main_replies = []
    for text in responses:
        main_replies.append(
            LLMResponse(
                content=text,
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        )
    main_runtime = SimpleNamespace(model="stub-model", provider_name="stub")
    main_provider = StubProvider(main_replies)
    auxiliary_runtime = SimpleNamespace(model="stub-model", provider_name="stub")
    auxiliary_provider = StubProvider([_route_response(route_action)])
    return ProviderBundle(
        main_runtime=main_runtime,
        main_provider=main_provider,
        auxiliary_runtime=auxiliary_runtime,
        auxiliary_provider=auxiliary_provider,
    )
|
|
|
|
|
|
def _single_planner() -> StubTaskExecutionPlanner:
    """Return a stub planner holding one single-mode plan."""
    single_plan = TaskExecutionPlan.single("test-single")
    return StubTaskExecutionPlanner([single_plan])
|
|
|
|
|
|
def _team_plan(strategy: str = "sequence") -> TaskExecutionPlan:
    """Build a team-mode plan with one ``research`` node run by a researcher agent."""
    researcher = AgentDescriptor(name="researcher", role="research")
    research_node = ExecutionNode(
        node_id="research",
        task="research implementation options",
        agent=researcher,
    )
    graph = ExecutionGraph(
        strategy=strategy,  # type: ignore[arg-type]
        nodes=[research_node],
    )
    return TaskExecutionPlan(
        mode="team",
        reason="test-team",
        graph=graph,
        final_synthesis_instruction="Use the sub-agent result to produce the final answer.",
    )
|
|
|
|
|
|
def _provider_bundle(provider: StubProvider) -> ProviderBundle:
    """Wrap ``provider`` as the main provider, paired with a ``new_task`` routing stub."""
    main_runtime = SimpleNamespace(model="stub-model", provider_name="stub")
    auxiliary_runtime = SimpleNamespace(model="stub-model", provider_name="stub")
    router = StubProvider([_route_response("new_task")])
    return ProviderBundle(
        main_runtime=main_runtime,
        main_provider=provider,
        auxiliary_runtime=auxiliary_runtime,
        auxiliary_provider=router,
    )
|
|
|
|
|
|
def _main_only_bundle(*responses: str) -> ProviderBundle:
    """Build a bundle that sets only the main runtime and a scripted main provider."""
    scripted = []
    for text in responses:
        scripted.append(
            LLMResponse(
                content=text,
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        )
    return ProviderBundle(
        main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
        main_provider=StubProvider(scripted),
    )
|
|
|
|
|
|
def _task_record(status: str) -> TaskRecord:
    """Return a fixed ``TaskRecord`` fixture whose only varying field is ``status``."""
    timestamp = "2026-05-22T00:00:00+00:00"
    summary = "test task"
    return TaskRecord(
        task_id="task-1",
        session_id="session-1",
        description=summary,
        goal=summary,
        constraints=[],
        priority=0,
        status=status,
        creator="main-agent",
        created_at=timestamp,
        updated_at=timestamp,
    )
|
|
|
|
|
|
def test_simple_question_does_not_create_task(tmp_path: Path) -> None:
    """A message routed as ``simple_chat`` yields no task id and stores no task."""
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService([]),
    )
    service = AgentService(loader=loader)

    reply = asyncio.run(
        service.process_direct(
            "hello?",
            session_id="web:simple",
            provider_bundle=_bundle("hi", route_action="simple_chat"),
        )
    )
    runtime = service.create_loop().boot()

    assert reply.task_id is None
    assert runtime.task_service.store.list_tasks() == []
|
|
|
|
|
|
def test_complex_request_creates_task_and_records_validation(tmp_path: Path) -> None:
    """A ``new_task`` request creates a task and records its validation snapshot."""
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService(
            [ValidationResult(passed=True, score=0.9, validator="test")]
        ),
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "implement the new report workflow",
            session_id="web:task",
            provider_bundle=_bundle("implemented"),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task_by_run_id(result.run_id)
    events = runtime.session_manager.get_run_event_records(result.session_id, result.run_id)
    run_record = runtime.run_memory_store.list_runs()[-1]
    skill_effects = next(item for item in events if item.event_type == "skill_effects_snapshotted")
    effects_payload = skill_effects.event_payload

    assert result.task_id is not None
    assert task is not None
    assert task.status == "awaiting_feedback"
    assert any(item.event_type == "task_validation_snapshotted" for item in events)
    assert run_record.task_id == result.task_id
    assert run_record.validation_result["accepted"] is True
    assert effects_payload["candidate_generation_allowed"] is False
    assert effects_payload["learning_candidates"] == []
    assert task.metadata["short_title"] == "Test task"
|
|
|
|
|
|
def test_task_mode_uses_task_aware_skill_selection_context(tmp_path: Path) -> None:
    """In task mode the skill assembler receives a task-aware selection query."""
    recorder = RecordingSkillAssembler()
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService(
            [ValidationResult(passed=True, score=1.0, validator="test")]
        ),
        skill_assembler=recorder,
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "继续按刚才的方案改",
            session_id="web:task-skill-query",
            provider_bundle=_bundle("done", route_action="new_task"),
        )
    )

    assert result.task_id
    assert recorder.task_descriptions
    # Only the first recorded description is inspected.
    query = recorder.task_descriptions[0]
    assert "Task goal:" in query
    assert "Current user request:" in query
    assert "Previously activated skills:" in query
    assert "If no published skill matches, return []" in query
|
|
|
|
|
|
def test_active_task_continues_until_llm_closes_it(tmp_path: Path) -> None:
    """A ``continue_task`` turn reuses the open task; a ``close_task`` turn closes it."""
    session = "web:continue"
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService(
            [
                ValidationResult(passed=True, score=0.9, validator="test"),
                ValidationResult(passed=True, score=0.9, validator="test"),
            ]
        ),
    )
    service = AgentService(loader=loader)

    first = asyncio.run(
        service.process_direct(
            "implement the search workflow",
            session_id=session,
            provider_bundle=_bundle("first done", route_action="new_task"),
        )
    )
    second = asyncio.run(
        service.process_direct(
            "also add tests for it",
            session_id=session,
            provider_bundle=_bundle("tests added", route_action="continue_task"),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(first.task_id)

    assert task is not None
    assert second.task_id == first.task_id
    assert len(task.run_ids) == 2

    closed = asyncio.run(
        service.process_direct(
            "这个任务结束了",
            session_id=session,
            provider_bundle=_bundle("好的,已结束。", route_action="close_task"),
        )
    )
    # Re-read the task after the closing turn.
    task = runtime.task_service.get_task(first.task_id)

    assert closed.task_id is None
    assert task is not None
    assert task.status == "closed"
    assert runtime.task_service.active_task_view("web:continue") is None
|
|
|
|
|
|
def test_active_task_revision_input_records_feedback_and_reruns(tmp_path: Path) -> None:
    """A ``revise_task`` turn records revise feedback on the first run and reruns the task."""
    session = "web:revise-direct"
    revision_text = "再详细一点,并加上明后天穿衣建议"
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService(
            [
                ValidationResult(passed=True, score=0.9, validator="test"),
                ValidationResult(passed=True, score=0.95, validator="test"),
            ]
        ),
    )
    service = AgentService(loader=loader)

    first = asyncio.run(
        service.process_direct(
            "查询珠海天气",
            session_id=session,
            provider_bundle=_bundle("珠海天气概览", route_action="new_task"),
        )
    )
    second = asyncio.run(
        service.process_direct(
            revision_text,
            session_id=session,
            provider_bundle=_bundle("更新后的珠海天气和穿衣建议", route_action="revise_task"),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(first.task_id)
    messages = runtime.session_manager.get_messages_as_conversation(first.session_id)
    first_run_assistant_messages = [
        entry
        for entry in messages
        if entry.get("role") == "assistant" and entry.get("run_id") == first.run_id
    ]
    first_assistant = first_run_assistant_messages[-1]
    user_messages = [entry.get("content") for entry in messages if entry.get("role") == "user"]

    assert second.task_id == first.task_id
    assert task is not None
    assert task.status == "awaiting_feedback"
    assert len(task.run_ids) == 2
    # ``created_at`` is copied from the actual record, so only the other fields are pinned.
    expected_feedback = {
        "feedback_type": "revise",
        "comment": "再详细一点,并加上明后天穿衣建议",
        "run_id": first.run_id,
        "created_at": task.feedback[0]["created_at"],
    }
    assert task.feedback == [expected_feedback]
    assert first_assistant["feedback_state"] == "revise"
    assert "再详细一点,并加上明后天穿衣建议" in user_messages
|
|
|
|
|
|
def test_explicit_revision_feedback_then_input_reruns_without_duplicate_feedback(tmp_path: Path) -> None:
    """Explicit revise feedback followed by a revise turn leaves one feedback entry."""
    session = "web:explicit-revise"
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService(
            [
                ValidationResult(passed=True, score=0.9, validator="test"),
                ValidationResult(passed=True, score=0.95, validator="test"),
            ]
        ),
    )
    service = AgentService(loader=loader)

    first = asyncio.run(
        service.process_direct(
            "查询珠海天气",
            session_id=session,
            provider_bundle=_bundle("珠海天气概览", route_action="new_task"),
        )
    )
    feedback = asyncio.run(
        service.submit_feedback(
            session_id=first.session_id,
            run_id=first.run_id,
            feedback_type="revise",
            comment="准备补充穿衣建议",
        )
    )
    second = asyncio.run(
        service.process_direct(
            "加上明后天穿衣建议",
            session_id=session,
            provider_bundle=_bundle("更新后的珠海天气和穿衣建议", route_action="revise_task"),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(first.task_id)

    assert feedback["task_status"] == "needs_revision"
    assert second.task_id == first.task_id
    assert task is not None
    assert task.status == "awaiting_feedback"
    assert len(task.run_ids) == 2
    assert len(task.feedback) == 1
    only_feedback = task.feedback[0]
    assert only_feedback["feedback_type"] == "revise"
    assert only_feedback["comment"] == "准备补充穿衣建议"
|
|
|
|
|
|
def test_validation_result_status_drives_accepted_and_passed() -> None:
    """Only the ``accepted`` status yields ``passed`` and ``accepted`` being True."""
    accepted = ValidationResult(status="accepted", score=0.9, validator="test")
    insufficient = ValidationResult(status="insufficient_evidence", score=0.9, validator="test")
    rejected = ValidationResult(status="rejected", score=0.9, validator="test")

    assert accepted.passed is True
    assert accepted.accepted is True
    # Both non-accepted statuses must report False on both flags.
    for not_accepted in (insufficient, rejected):
        assert not_accepted.passed is False
        assert not_accepted.accepted is False
|
|
|
|
|
|
def test_validation_result_from_legacy_payload_maps_to_status() -> None:
    """Legacy ``passed``/``score`` payloads without a status are mapped onto one."""
    high_score_pass = ValidationResult.from_dict({"passed": True, "score": 0.9, "validator": "legacy"})
    low_score_pass = ValidationResult.from_dict({"passed": True, "score": 0.7, "validator": "legacy"})
    failed = ValidationResult.from_dict({"passed": False, "score": 0.2, "validator": "legacy"})

    assert high_score_pass is not None
    assert high_score_pass.status == "accepted"
    assert low_score_pass is not None
    assert low_score_pass.status == "rejected"
    assert failed is not None
    assert failed.status == "rejected"
|
|
|
|
|
|
def test_validation_result_rejects_unknown_status() -> None:
    """Constructing a result with an unrecognised status raises ``ValueError``."""
    expected_message = "unknown validation status"
    with pytest.raises(ValueError, match=expected_message):
        ValidationResult(status="pending", score=0.9, validator="test")  # type: ignore[arg-type]
|
|
|
|
|
|
def test_validation_result_from_dict_rejects_unknown_explicit_status() -> None:
    """``from_dict`` raises ``ValueError`` for an unrecognised explicit status."""
    payload = {"status": "pending", "passed": True, "score": 0.9}
    with pytest.raises(ValueError, match="unknown validation status"):
        ValidationResult.from_dict(payload)
|
|
|
|
|
|
def test_validation_result_evidence_gaps_round_trip() -> None:
    """``evidence_gaps`` and status survive a ``to_dict`` / ``from_dict`` round trip."""
    original = ValidationResult(
        status="insufficient_evidence",
        score=0.4,
        evidence_gaps=["missing command output", "missing file reference"],
        validator="test",
    )

    serialized = original.to_dict()
    restored = ValidationResult.from_dict(serialized)

    assert restored is not None
    assert restored.status == "insufficient_evidence"
    assert restored.evidence_gaps == ["missing command output", "missing file reference"]
    assert restored.to_dict()["evidence_gaps"] == ["missing command output", "missing file reference"]
|
|
|
|
|
|
def test_task_record_status_helpers_distinguish_review_and_failed() -> None:
    """``needs_review`` is open and awaits the user; ``failed`` is neither."""
    review_task = _task_record("needs_review")
    failed_task = _task_record("failed")

    assert review_task.is_open is True
    assert review_task.is_execution_active is False
    assert review_task.requires_user_action is True

    assert failed_task.is_open is False
    assert failed_task.is_execution_active is False
    assert failed_task.requires_user_action is False
|
|
|
|
|
|
def test_task_service_api_payload_emits_status_helpers(tmp_path: Path) -> None:
    """The API dict for a ``needs_review`` task carries the three status helper flags."""
    task_service = TaskService(tmp_path)
    record = _task_record("needs_review")

    api_payload = task_service.to_api_dict(record)

    assert api_payload["is_open"] is True
    assert api_payload["is_execution_active"] is False
    assert api_payload["requires_user_action"] is True
|
|
|
|
|
|
def test_validation_failure_retries_once(tmp_path: Path) -> None:
    """A failed validation triggers a second run, and only the revised output is visible."""
    failing = ValidationResult(
        passed=False,
        score=0.2,
        issues=["missing tests"],
        recommended_revision_prompt="Add tests before final response.",
        validator="test",
    )
    passing = ValidationResult(passed=True, score=0.88, validator="test")
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService([failing, passing]),
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "implement and validate the task",
            session_id="web:retry",
            provider_bundle=_bundle("first draft", "revised draft"),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(result.task_id)

    assert result.output_text == "revised draft"
    assert result.validation_result["accepted"] is True
    assert task is not None
    assert len(task.run_ids) == 2
    conversation = runtime.session_manager.get_messages_as_conversation(result.session_id)
    visible_contents = [entry.get("content") for entry in conversation]
    assert "first draft" not in visible_contents
    assert "revised draft" in visible_contents
|
|
|
|
|
|
def test_feedback_closes_or_abandons_internal_task(tmp_path: Path) -> None:
    """``satisfied`` feedback closes a task; ``abandon`` feedback abandons it.

    The satisfied path must return learning candidates from the learning
    service; the abandon path must return none, record exactly one failure
    evidence event and leave the memory store empty.
    """
    # --- satisfied feedback -------------------------------------------------
    service = AgentService(
        loader=EngineLoader(
            workspace=tmp_path,
            task_execution_planner=_single_planner(),
            validation_service=StubValidationService(
                [ValidationResult(passed=True, score=0.9, validator="test")]
            ),
        )
    )
    result = asyncio.run(
        service.process_direct(
            "implement feedback handling",
            session_id="web:feedback",
            provider_bundle=_bundle("done"),
        )
    )
    runtime = service.create_loop().boot()
    learning_calls = []

    def build_learning_candidates_for_task(task_id: str, *, trigger_run_id: str) -> list[FakeLearningCandidate]:
        # Stand-in for the learning service method; records how it was invoked.
        learning_calls.append((task_id, trigger_run_id))
        return [FakeLearningCandidate()]

    runtime.skill_learning_service.build_learning_candidates_for_task = build_learning_candidates_for_task

    feedback = asyncio.run(
        service.submit_feedback(
            session_id=result.session_id,
            run_id=result.run_id,
            feedback_type="satisfied",
        )
    )

    assert feedback["task_status"] == "closed"
    assert feedback["learning_candidates"] == [
        {"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"}
    ]
    assert learning_calls == [(result.task_id, result.run_id)]

    # --- abandon feedback, in a separate workspace --------------------------
    service2 = AgentService(
        loader=EngineLoader(
            workspace=tmp_path / "abandon",
            task_execution_planner=_single_planner(),
            validation_service=StubValidationService(
                [
                    ValidationResult(passed=False, score=0.3, validator="test"),
                    ValidationResult(passed=False, score=0.3, validator="test"),
                ]
            ),
        )
    )
    abandoned = asyncio.run(
        service2.process_direct(
            "implement another workflow",
            session_id="web:abandon",
            provider_bundle=_bundle("not enough", "still not enough"),
        )
    )
    abandon_feedback = asyncio.run(
        service2.submit_feedback(
            session_id=abandoned.session_id,
            run_id=abandoned.run_id,
            feedback_type="abandon",
            comment="too costly",
        )
    )

    assert abandon_feedback["task_status"] == "abandoned"
    assert abandon_feedback["learning_candidates"] == []
    runtime2 = service2.create_loop().boot()
    abandoned_run_events = runtime2.session_manager.get_run_event_records(
        abandoned.session_id, abandoned.run_id
    )
    failure_events = [
        item for item in abandoned_run_events if item.event_type == "task_failure_evidence_recorded"
    ]
    assert len(failure_events) == 1
    assert runtime2.memory_service.get_store().memory_entries == []
|
|
|
|
|
|
def test_feedback_is_idempotent_and_projected_to_assistant_message(tmp_path: Path) -> None:
    """Repeating the same feedback is a no-op; conflicting feedback is rejected.

    Also checks that feedback state, task status and validation status appear
    on the assistant message of the run.
    """
    service = AgentService(
        loader=EngineLoader(
            workspace=tmp_path,
            task_execution_planner=_single_planner(),
            validation_service=StubValidationService(
                [ValidationResult(passed=True, score=0.9, validator="test")]
            ),
        )
    )
    result = asyncio.run(
        service.process_direct(
            "implement feedback projection",
            session_id="web:feedback-projection",
            provider_bundle=_bundle("done"),
        )
    )
    runtime = service.create_loop().boot()

    first = asyncio.run(
        service.submit_feedback(
            session_id=result.session_id,
            run_id=result.run_id,
            feedback_type="satisfied",
        )
    )
    second = asyncio.run(
        service.submit_feedback(
            session_id=result.session_id,
            run_id=result.run_id,
            feedback_type="satisfied",
        )
    )

    run_events = runtime.session_manager.get_run_event_records(result.session_id, result.run_id)
    feedback_events = [item for item in run_events if item.event_type == "task_feedback_recorded"]
    conversation = runtime.session_manager.get_messages_as_conversation(result.session_id)
    assistant_messages = [
        entry
        for entry in conversation
        if entry.get("role") == "assistant" and entry.get("run_id") == result.run_id
    ]
    assistant = assistant_messages[-1]

    assert first["task_status"] == "closed"
    assert second["task_status"] == "closed"
    assert len(feedback_events) == 1
    assert assistant["feedback_state"] == "satisfied"
    assert assistant["task_status"] == "closed"
    assert assistant["validation_status"] == "passed"

    # A different feedback type on the same run must be refused.
    with pytest.raises(ValueError, match="already recorded"):
        asyncio.run(
            service.submit_feedback(
                session_id=result.session_id,
                run_id=result.run_id,
                feedback_type="abandon",
            )
        )

    task = runtime.task_service.get_task(result.task_id)
    assert task is not None
    assert task.status == "closed"
|
|
|
|
|
|
def test_task_mode_team_plan_runs_subagent_then_main_synthesis(tmp_path: Path) -> None:
    """With a team plan the sub-agent output feeds the main provider's synthesis."""
    main_provider = StubProvider(
        [
            LLMResponse(
                content="final synthesized answer",
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        ]
    )
    sub_provider = StubProvider(
        [
            LLMResponse(
                content="sub-agent evidence",
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        ]
    )
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner([_team_plan()]),
        validation_service=StubValidationService(
            [ValidationResult(passed=True, score=0.9, validator="test")]
        ),
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "implement team-backed workflow",
            session_id="web:team",
            provider_bundle=_provider_bundle(main_provider),
            team_provider_bundle_factory=lambda node: _provider_bundle(sub_provider),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(result.task_id)
    events = runtime.session_manager.get_event_records(result.session_id)

    assert result.output_text == "final synthesized answer"
    assert task is not None
    assert len(task.run_ids) == 2
    assert result.run_id == task.run_ids[-1]
    assert any(item.event_type == "task_execution_planned" for item in events)
    assert any(item.event_type == "task_team_run_completed" for item in events)
    first_main_message = main_provider.calls[0]["messages"][0]["content"]
    assert "sub-agent evidence" in first_main_message
    assert "sub-agent evidence" != result.output_text
|
|
|
|
|
|
def test_task_mode_team_synthesis_runs_without_tools_and_receives_evidence(tmp_path: Path) -> None:
    """The synthesis call gets no tools, and validation receives an evidence packet."""
    main_provider = StubProvider(
        [
            LLMResponse(
                content="final synthesized answer",
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        ]
    )
    sub_provider = StubProvider(
        [
            LLMResponse(
                content="sub-agent evidence",
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        ]
    )
    validation = StubValidationService(
        [ValidationResult(status="accepted", score=0.9, validator="test")]
    )
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner([_team_plan()]),
        validation_service=validation,
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "implement team-backed workflow",
            session_id="web:team-no-tools",
            provider_bundle=_provider_bundle(main_provider),
            team_provider_bundle_factory=lambda node: _provider_bundle(sub_provider),
        )
    )

    synthesis_call = main_provider.calls[0]
    assert result.output_text == "final synthesized answer"
    assert synthesis_call["tools"] is None
    assert "sub-agent evidence" in synthesis_call["messages"][0]["content"]
    assert "Task evidence packet" in validation.calls[0]["evidence_text"]
|
|
|
|
|
|
def test_task_mode_team_failure_still_uses_main_synthesis(tmp_path: Path) -> None:
    """When the team bundle factory raises, the main provider still answers."""
    main_provider = StubProvider(
        [
            LLMResponse(
                content="fallback synthesized answer",
                finish_reason="stop",
                provider_name="stub",
                model="stub-model",
            )
        ]
    )
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner([_team_plan()]),
        validation_service=StubValidationService(
            [ValidationResult(passed=True, score=0.9, validator="test")]
        ),
    )
    service = AgentService(loader=loader)

    def failing_bundle_factory(node):
        # Simulates a sub-agent whose provider bundle cannot be created.
        raise RuntimeError("sub-agent unavailable")

    result = asyncio.run(
        service.process_direct(
            "implement workflow despite team failure",
            session_id="web:team-failure",
            provider_bundle=_provider_bundle(main_provider),
            team_provider_bundle_factory=failing_bundle_factory,
        )
    )
    runtime = service.create_loop().boot()
    events = runtime.session_manager.get_event_records(result.session_id)

    assert result.output_text == "fallback synthesized answer"
    assert any(item.event_type == "task_team_run_failed" for item in events)
    synthesis_prompt = main_provider.calls[0]["messages"][0]["content"]
    assert "sub-agent unavailable" in synthesis_prompt
    assert "same class of tools fails repeatedly" in synthesis_prompt
    assert "user-visible fallback answer" in synthesis_prompt
|
|
|
|
|
|
def test_insufficient_evidence_moves_task_to_needs_review(tmp_path: Path) -> None:
    """An ``insufficient_evidence`` verdict parks the task in ``needs_review`` without retry."""
    verdict = ValidationResult(
        status="insufficient_evidence",
        score=0.4,
        evidence_gaps=["source missing"],
        validator="test",
    )
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=_single_planner(),
        validation_service=StubValidationService([verdict]),
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "answer with uncertain evidence",
            session_id="web:needs-review",
            provider_bundle=_bundle("possible answer"),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(result.task_id)
    events = runtime.session_manager.get_run_event_records(result.session_id, result.run_id)
    validation_event = next(item for item in events if item.event_type == "task_validation_snapshotted")
    snapshot = validation_event.event_payload

    assert task is not None
    assert task.status == "needs_review"
    assert task.requires_user_action is True
    assert task.is_execution_active is False
    assert snapshot["validation_result"]["status"] == "insufficient_evidence"
    assert snapshot["retry_scheduled"] is False
    assert snapshot["validation_debug"]["tool_result_count"] >= 0
|
|
|
|
|
|
def test_task_mode_team_retry_hides_first_synthesis_run(tmp_path: Path) -> None:
    """After a failed validation in team mode, only the revised synthesis stays visible."""

    def _reply(text: str) -> LLMResponse:
        # All scripted replies share the same stub metadata.
        return LLMResponse(content=text, finish_reason="stop", provider_name="stub", model="stub-model")

    main_provider = StubProvider(
        [
            _reply("first synthesized answer"),
            _reply("revised synthesized answer"),
        ]
    )
    # One sub-agent provider per team run; the factory below consumes them in order.
    sub_providers = [
        StubProvider([_reply("first evidence")]),
        StubProvider([_reply("second evidence")]),
    ]
    loader = EngineLoader(
        workspace=tmp_path,
        task_execution_planner=StubTaskExecutionPlanner([_team_plan(), _team_plan()]),
        validation_service=StubValidationService(
            [
                ValidationResult(passed=False, score=0.2, recommended_revision_prompt="revise", validator="test"),
                ValidationResult(passed=True, score=0.9, validator="test"),
            ]
        ),
    )
    service = AgentService(loader=loader)

    result = asyncio.run(
        service.process_direct(
            "implement and validate with team",
            session_id="web:team-retry",
            provider_bundle=_provider_bundle(main_provider),
            team_provider_bundle_factory=lambda node: _provider_bundle(sub_providers.pop(0)),
        )
    )
    runtime = service.create_loop().boot()
    task = runtime.task_service.get_task(result.task_id)
    conversation = runtime.session_manager.get_messages_as_conversation(result.session_id)
    visible_contents = [entry.get("content") for entry in conversation]
    run_records = {item.run_id: item for item in runtime.run_memory_store.list_runs()}

    assert result.output_text == "revised synthesized answer"
    assert task is not None
    assert len(task.run_ids) == 4
    assert "first synthesized answer" not in visible_contents
    assert "revised synthesized answer" in visible_contents
    # Every run of the task must have snapshotted skill effects with candidate generation off.
    for run_id in task.run_ids:
        record = run_records[run_id]
        run_events = runtime.session_manager.get_run_event_records(record.session_id, run_id)
        skill_effects = [item for item in run_events if item.event_type == "skill_effects_snapshotted"]
        assert skill_effects
        assert skill_effects[-1].event_payload["candidate_generation_allowed"] is False
|
|
|
|
|
|
def test_context_builder_strips_ui_projection_fields_from_provider_history() -> None:
    """Built provider messages keep only ``role`` and ``content`` for this history entry."""
    persisted_message = {
        "role": "assistant",
        "content": "done",
        "run_id": "run-1",
        "task_id": "task-1",
        "task_status": "closed",
        "validation_status": "passed",
        "feedback_state": "satisfied",
    }
    build_input = ContextBuildInput(history=[persisted_message])

    result = ContextBuilder().build_messages(build_input)

    assistant = result.messages[-1]
    assert assistant == {"role": "assistant", "content": "done"}
|
|
|
|
|
|
def test_context_builder_normalizes_persisted_tool_arguments() -> None:
    """Tool-call arguments persisted as a dict reach the provider as a JSON string."""
    persisted_call = {
        "id": "call-1",
        "type": "function",
        "function": {
            "name": "cron",
            "arguments": {"action": "add", "mode": "notification"},
        },
    }
    persisted_message = {
        "role": "assistant",
        "content": None,
        "tool_calls": [persisted_call],
    }

    result = ContextBuilder().build_messages(ContextBuildInput(history=[persisted_message]))

    tool_call = result.messages[-1]["tool_calls"][0]
    assert tool_call["function"]["arguments"] == '{"action": "add", "mode": "notification"}'
|
|
|
|
|
|
def test_llm_validator_parse_failure_is_not_accepted(tmp_path: Path) -> None:
    """A non-JSON validator reply yields a ``validator_error`` result, not an acceptance."""
    request_text = "implement validator handling"
    task_service = TaskService(tmp_path / "tasks")
    task = task_service.create_task(session_id="web:validator", description=request_text)

    validation = asyncio.run(
        ValidationService().validate_task_result(
            task=task,
            user_message=request_text,
            final_output="done",
            provider_bundle=_main_only_bundle("not json"),
        )
    )

    assert validation.accepted is False
    assert validation.status == "validator_error"
    assert validation.validator == "llm_error"
    assert validation.issues
|