feat(engine): 添加运行时上下文支持并重构工具迭代限制

添加 RuntimeContext 类用于捕获模型运行时的日期时间信息,
包括UTC时间、本地时间和时区信息,并在系统提示中显示这些信息。

同时增加最大上下文消息数和工具迭代次数的配置选项,
将验证服务从引擎加载器中移除,并更新相关的数据结构和接口。

BREAKING CHANGE: 移除了验证服务,相关字段被替换为证据状态和接受状态。

- 添加 RuntimeContext 类和相关渲染方法
- 增加 max_context_messages 和 max_tool_iterations 配置
- 移除 ValidationService 相关代码
- 更新消息记录中的验证状态字段
- 添加原始工具调用检测和回退处理
This commit is contained in:
2026-05-26 11:18:35 +08:00
parent 16347caf5e
commit 6e9e74d1ee
57 changed files with 5710 additions and 1582 deletions

View File

@ -4,23 +4,17 @@ import asyncio
from pathlib import Path
from types import SimpleNamespace
import pytest
from beaver.coordinator import AgentDescriptor, ExecutionGraph, ExecutionNode
from beaver.engine import EngineLoader
from beaver.engine.context.builder import ContextBuilder, ContextBuildInput
from beaver.engine.providers.base import LLMProvider, LLMResponse
from beaver.engine.providers.factory import ProviderBundle
from beaver.services.agent_service import AgentService
from beaver.skills.assembler import SkillAssemblyResult
from beaver.tasks import TaskExecutionPlan, TaskRecord, TaskService, ValidationResult, ValidationService
from beaver.tasks import TaskExecutionPlan, TaskService
class StubProvider(LLMProvider):
def __init__(self, responses: list[LLMResponse]) -> None:
super().__init__()
self._responses = list(responses)
self.calls: list[dict[str, object]] = []
async def chat(
self,
@ -30,7 +24,6 @@ class StubProvider(LLMProvider):
max_tokens: int = 4096,
temperature: float = 0.7,
) -> LLMResponse:
self.calls.append({"messages": messages, "tools": tools, "model": model})
if not self._responses:
raise AssertionError("No stubbed provider responses left")
return self._responses.pop(0)
@ -39,30 +32,9 @@ class StubProvider(LLMProvider):
return "stub-model"
class StubValidationService:
def __init__(self, results: list[ValidationResult]) -> None:
self.results = list(results)
self.calls: list[dict] = []
async def validate_task_result(self, **kwargs) -> ValidationResult:
self.calls.append(kwargs)
if not self.results:
raise AssertionError("No stubbed validation results left")
return self.results.pop(0)
class StubTaskExecutionPlanner:
def __init__(self, plans: list[TaskExecutionPlan] | None = None) -> None:
self.plans = list(plans or [TaskExecutionPlan.single("test-single")])
self.calls = []
async def plan(self, **kwargs) -> TaskExecutionPlan:
self.calls.append(kwargs)
if len(self.plans) == 1:
return self.plans[0]
if not self.plans:
raise AssertionError("No stubbed execution plans left")
return self.plans.pop(0)
return TaskExecutionPlan.single("test-single")
class FakeLearningCandidate:
@ -70,15 +42,6 @@ class FakeLearningCandidate:
return {"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"}
class RecordingSkillAssembler:
def __init__(self) -> None:
self.task_descriptions: list[str] = []
async def assemble(self, **kwargs) -> SkillAssemblyResult:
self.task_descriptions.append(kwargs["task_description"])
return SkillAssemblyResult()
def _route_response(action: str = "new_task", short_title: str = "Test task") -> LLMResponse:
return LLMResponse(
content=f'{{"action":"{action}","reason":"test route","short_title":"{short_title}"}}',
@ -107,828 +70,157 @@ def _bundle(*responses: str, route_action: str = "new_task") -> ProviderBundle:
)
def _single_planner() -> StubTaskExecutionPlanner:
return StubTaskExecutionPlanner([TaskExecutionPlan.single("test-single")])
def _team_plan(strategy: str = "sequence") -> TaskExecutionPlan:
return TaskExecutionPlan(
mode="team",
reason="test-team",
graph=ExecutionGraph(
strategy=strategy, # type: ignore[arg-type]
nodes=[
ExecutionNode(
node_id="research",
task="research implementation options",
agent=AgentDescriptor(name="researcher", role="research"),
)
],
),
final_synthesis_instruction="Use the sub-agent result to produce the final answer.",
)
def _provider_bundle(provider: StubProvider) -> ProviderBundle:
return ProviderBundle(
main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
main_provider=provider,
auxiliary_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
auxiliary_provider=StubProvider([_route_response("new_task")]),
)
def _main_only_bundle(*responses: str) -> ProviderBundle:
return ProviderBundle(
main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"),
main_provider=StubProvider(
[
LLMResponse(
content=response,
finish_reason="stop",
provider_name="stub",
model="stub-model",
)
for response in responses
]
),
)
def _task_record(status: str) -> TaskRecord:
return TaskRecord(
task_id="task-1",
session_id="session-1",
description="test task",
goal="test task",
constraints=[],
priority=0,
status=status,
creator="main-agent",
created_at="2026-05-22T00:00:00+00:00",
updated_at="2026-05-22T00:00:00+00:00",
)
def test_simple_question_does_not_create_task(tmp_path: Path) -> None:
def test_task_run_records_evidence_and_waits_for_acceptance(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService([]),
task_execution_planner=StubTaskExecutionPlanner(),
)
)
result = asyncio.run(
service.process_direct(
"hello?",
session_id="web:simple",
provider_bundle=_bundle("hi", route_action="simple_chat"),
)
)
loaded = service.create_loop().boot()
assert result.task_id is None
assert loaded.task_service.store.list_tasks() == []
def test_complex_request_creates_task_and_records_validation(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[ValidationResult(passed=True, score=0.9, validator="test")]
),
"draft release notes",
session_id="web:test",
provider_bundle=_bundle("Done"),
)
)
result = asyncio.run(
service.process_direct(
"implement the new report workflow",
session_id="web:task",
provider_bundle=_bundle("implemented"),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task_by_run_id(result.run_id)
events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id)
run_record = loaded.run_memory_store.list_runs()[-1]
skill_effects = next(event for event in events if event.event_type == "skill_effects_snapshotted")
assert result.task_id is not None
task_service = service.create_loop().boot().task_service
assert task_service is not None
task = task_service.get_task(result.task_id or "")
assert task is not None
assert task.status == "awaiting_feedback"
assert any(event.event_type == "task_validation_snapshotted" for event in events)
assert run_record.task_id == result.task_id
assert run_record.validation_result["accepted"] is True
assert skill_effects.event_payload["candidate_generation_allowed"] is False
assert skill_effects.event_payload["learning_candidates"] == []
assert task.metadata["short_title"] == "Test task"
assert task.status == "awaiting_acceptance"
assert task.validation_result is None
assert result.validation_result is None
event_types = [event.event_type for event in task_service.list_events(task.task_id)]
assert "evidence_recorded" in event_types
assert "validated" not in event_types
def test_task_mode_uses_task_aware_skill_selection_context(tmp_path: Path) -> None:
skill_assembler = RecordingSkillAssembler()
def test_acceptance_closes_task_and_triggers_learning(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[ValidationResult(passed=True, score=1.0, validator="test")]
),
skill_assembler=skill_assembler,
)
)
result = asyncio.run(
service.process_direct(
"继续按刚才的方案改",
session_id="web:task-skill-query",
provider_bundle=_bundle("done", route_action="new_task"),
)
)
assert result.task_id
assert skill_assembler.task_descriptions
query = skill_assembler.task_descriptions[0]
assert "Task goal:" in query
assert "Current user request:" in query
assert "Previously activated skills:" in query
assert "If no published skill matches, return []" in query
def test_active_task_continues_until_llm_closes_it(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[
ValidationResult(passed=True, score=0.9, validator="test"),
ValidationResult(passed=True, score=0.9, validator="test"),
]
),
)
)
first = asyncio.run(
service.process_direct(
"implement the search workflow",
session_id="web:continue",
provider_bundle=_bundle("first done", route_action="new_task"),
)
)
second = asyncio.run(
service.process_direct(
"also add tests for it",
session_id="web:continue",
provider_bundle=_bundle("tests added", route_action="continue_task"),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(first.task_id)
assert task is not None
assert second.task_id == first.task_id
assert len(task.run_ids) == 2
closed = asyncio.run(
service.process_direct(
"这个任务结束了",
session_id="web:continue",
provider_bundle=_bundle("好的,已结束。", route_action="close_task"),
)
)
task = loaded.task_service.get_task(first.task_id)
assert closed.task_id is None
assert task is not None
assert task.status == "closed"
assert loaded.task_service.active_task_view("web:continue") is None
def test_active_task_revision_input_records_feedback_and_reruns(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[
ValidationResult(passed=True, score=0.9, validator="test"),
ValidationResult(passed=True, score=0.95, validator="test"),
]
),
)
)
first = asyncio.run(
service.process_direct(
"查询珠海天气",
session_id="web:revise-direct",
provider_bundle=_bundle("珠海天气概览", route_action="new_task"),
)
)
second = asyncio.run(
service.process_direct(
"再详细一点,并加上明后天穿衣建议",
session_id="web:revise-direct",
provider_bundle=_bundle("更新后的珠海天气和穿衣建议", route_action="revise_task"),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(first.task_id)
messages = loaded.session_manager.get_messages_as_conversation(first.session_id)
first_assistant = [
message
for message in messages
if message.get("role") == "assistant" and message.get("run_id") == first.run_id
][-1]
user_messages = [message.get("content") for message in messages if message.get("role") == "user"]
assert second.task_id == first.task_id
assert task is not None
assert task.status == "awaiting_feedback"
assert len(task.run_ids) == 2
assert task.feedback == [
{
"feedback_type": "revise",
"comment": "再详细一点,并加上明后天穿衣建议",
"run_id": first.run_id,
"created_at": task.feedback[0]["created_at"],
}
]
assert first_assistant["feedback_state"] == "revise"
assert "再详细一点,并加上明后天穿衣建议" in user_messages
def test_explicit_revision_feedback_then_input_reruns_without_duplicate_feedback(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[
ValidationResult(passed=True, score=0.9, validator="test"),
ValidationResult(passed=True, score=0.95, validator="test"),
]
),
)
)
first = asyncio.run(
service.process_direct(
"查询珠海天气",
session_id="web:explicit-revise",
provider_bundle=_bundle("珠海天气概览", route_action="new_task"),
)
)
feedback = asyncio.run(
service.submit_feedback(
session_id=first.session_id,
run_id=first.run_id,
feedback_type="revise",
comment="准备补充穿衣建议",
)
)
second = asyncio.run(
service.process_direct(
"加上明后天穿衣建议",
session_id="web:explicit-revise",
provider_bundle=_bundle("更新后的珠海天气和穿衣建议", route_action="revise_task"),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(first.task_id)
assert feedback["task_status"] == "needs_revision"
assert second.task_id == first.task_id
assert task is not None
assert task.status == "awaiting_feedback"
assert len(task.run_ids) == 2
assert len(task.feedback) == 1
assert task.feedback[0]["feedback_type"] == "revise"
assert task.feedback[0]["comment"] == "准备补充穿衣建议"
def test_validation_result_status_drives_accepted_and_passed() -> None:
accepted = ValidationResult(status="accepted", score=0.9, validator="test")
insufficient = ValidationResult(status="insufficient_evidence", score=0.9, validator="test")
rejected = ValidationResult(status="rejected", score=0.9, validator="test")
assert accepted.passed is True
assert accepted.accepted is True
assert insufficient.passed is False
assert insufficient.accepted is False
assert rejected.passed is False
assert rejected.accepted is False
def test_validation_result_from_legacy_payload_maps_to_status() -> None:
accepted = ValidationResult.from_dict({"passed": True, "score": 0.9, "validator": "legacy"})
low_score = ValidationResult.from_dict({"passed": True, "score": 0.7, "validator": "legacy"})
rejected = ValidationResult.from_dict({"passed": False, "score": 0.2, "validator": "legacy"})
assert accepted is not None
assert accepted.status == "accepted"
assert low_score is not None
assert low_score.status == "rejected"
assert rejected is not None
assert rejected.status == "rejected"
def test_validation_result_rejects_unknown_status() -> None:
with pytest.raises(ValueError, match="unknown validation status"):
ValidationResult(status="pending", score=0.9, validator="test") # type: ignore[arg-type]
def test_validation_result_from_dict_rejects_unknown_explicit_status() -> None:
with pytest.raises(ValueError, match="unknown validation status"):
ValidationResult.from_dict({"status": "pending", "passed": True, "score": 0.9})
def test_validation_result_evidence_gaps_round_trip() -> None:
validation = ValidationResult(
status="insufficient_evidence",
score=0.4,
evidence_gaps=["missing command output", "missing file reference"],
validator="test",
)
restored = ValidationResult.from_dict(validation.to_dict())
assert restored is not None
assert restored.status == "insufficient_evidence"
assert restored.evidence_gaps == ["missing command output", "missing file reference"]
assert restored.to_dict()["evidence_gaps"] == ["missing command output", "missing file reference"]
def test_task_record_status_helpers_distinguish_review_and_failed() -> None:
needs_review = _task_record("needs_review")
failed = _task_record("failed")
assert needs_review.is_open is True
assert needs_review.is_execution_active is False
assert needs_review.requires_user_action is True
assert failed.is_open is False
assert failed.is_execution_active is False
assert failed.requires_user_action is False
def test_task_service_api_payload_emits_status_helpers(tmp_path: Path) -> None:
service = TaskService(tmp_path)
task = _task_record("needs_review")
payload = service.to_api_dict(task)
assert payload["is_open"] is True
assert payload["is_execution_active"] is False
assert payload["requires_user_action"] is True
def test_validation_failure_retries_once(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[
ValidationResult(
passed=False,
score=0.2,
issues=["missing tests"],
recommended_revision_prompt="Add tests before final response.",
validator="test",
),
ValidationResult(passed=True, score=0.88, validator="test"),
]
),
)
)
result = asyncio.run(
service.process_direct(
"implement and validate the task",
session_id="web:retry",
provider_bundle=_bundle("first draft", "revised draft"),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(result.task_id)
assert result.output_text == "revised draft"
assert result.validation_result["accepted"] is True
assert task is not None
assert len(task.run_ids) == 2
visible_messages = loaded.session_manager.get_messages_as_conversation(result.session_id)
visible_contents = [message.get("content") for message in visible_messages]
assert "first draft" not in visible_contents
assert "revised draft" in visible_contents
def test_feedback_closes_or_abandons_internal_task(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[ValidationResult(passed=True, score=0.9, validator="test")]
),
task_execution_planner=StubTaskExecutionPlanner(),
)
)
result = asyncio.run(
service.process_direct(
"implement feedback handling",
session_id="web:feedback",
provider_bundle=_bundle("done"),
"write implementation plan",
session_id="web:acceptance",
provider_bundle=_bundle("Plan"),
)
)
loaded = service.create_loop().boot()
learning_calls = []
def build_learning_candidates_for_task(task_id: str, *, trigger_run_id: str) -> list[FakeLearningCandidate]:
learning_calls.append((task_id, trigger_run_id))
loaded = service.create_loop().boot()
generated: list[tuple[str, str]] = []
def build_learning_candidates_for_task(
task_id: str,
*,
final_accepted_run_id: str | None = None,
trigger_run_id: str | None = None,
) -> list[FakeLearningCandidate]:
generated.append((task_id, final_accepted_run_id or trigger_run_id or ""))
return [FakeLearningCandidate()]
loaded.skill_learning_service.build_learning_candidates_for_task = build_learning_candidates_for_task
feedback = asyncio.run(
service.submit_feedback(
session_id=result.session_id,
response = asyncio.run(
service.submit_acceptance(
session_id="web:acceptance",
run_id=result.run_id,
feedback_type="satisfied",
acceptance_type="accept",
)
)
assert feedback["task_status"] == "closed"
assert feedback["learning_candidates"] == [
assert response["task_status"] == "closed"
assert response["acceptance_type"] == "accept"
assert response["learning_candidates"] == [
{"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"}
]
assert learning_calls == [(result.task_id, result.run_id)]
assert generated == [(result.task_id, result.run_id)]
service2 = AgentService(
loader=EngineLoader(
workspace=tmp_path / "abandon",
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[
ValidationResult(passed=False, score=0.3, validator="test"),
ValidationResult(passed=False, score=0.3, validator="test"),
]
),
)
)
abandoned = asyncio.run(
service2.process_direct(
"implement another workflow",
session_id="web:abandon",
provider_bundle=_bundle("not enough", "still not enough"),
)
)
abandon_feedback = asyncio.run(
service2.submit_feedback(
session_id=abandoned.session_id,
run_id=abandoned.run_id,
feedback_type="abandon",
comment="too costly",
)
)
assert abandon_feedback["task_status"] == "abandoned"
assert abandon_feedback["learning_candidates"] == []
loaded2 = service2.create_loop().boot()
failure_events = [
event
for event in loaded2.session_manager.get_run_event_records(abandoned.session_id, abandoned.run_id)
if event.event_type == "task_failure_evidence_recorded"
]
assert len(failure_events) == 1
assert loaded2.memory_service.get_store().memory_entries == []
task_service = loaded.task_service
assert task_service is not None
task = task_service.get_task(result.task_id or "")
assert task is not None
assert task.metadata["final_accepted_run_id"] == result.run_id
def test_feedback_is_idempotent_and_projected_to_assistant_message(tmp_path: Path) -> None:
def test_revise_and_abandon_do_not_trigger_learning(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[ValidationResult(passed=True, score=0.9, validator="test")]
),
task_execution_planner=StubTaskExecutionPlanner(),
)
)
result = asyncio.run(
service.process_direct(
"implement feedback projection",
session_id="web:feedback-projection",
provider_bundle=_bundle("done"),
"summarize notes",
session_id="web:revise",
provider_bundle=_bundle("Summary"),
)
)
loaded = service.create_loop().boot()
first = asyncio.run(
service.submit_feedback(
session_id=result.session_id,
response = asyncio.run(
service.submit_acceptance(
session_id="web:revise",
run_id=result.run_id,
feedback_type="satisfied",
acceptance_type="revise",
comment="Add decisions",
)
)
second = asyncio.run(
assert response["task_status"] == "needs_revision"
assert response["learning_candidates"] == []
task_service = service.create_loop().boot().task_service
assert task_service is not None
task = task_service.get_task(result.task_id or "")
assert task is not None
assert task.feedback[0]["acceptance_type"] == "revise"
def test_legacy_feedback_endpoint_maps_satisfied_to_accept(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=StubTaskExecutionPlanner(),
)
)
result = asyncio.run(
service.process_direct(
"prepare checklist",
session_id="web:legacy",
provider_bundle=_bundle("Checklist"),
)
)
response = asyncio.run(
service.submit_feedback(
session_id=result.session_id,
session_id="web:legacy",
run_id=result.run_id,
feedback_type="satisfied",
)
)
feedback_events = [
event
for event in loaded.session_manager.get_run_event_records(result.session_id, result.run_id)
if event.event_type == "task_feedback_recorded"
]
assistant = [
message
for message in loaded.session_manager.get_messages_as_conversation(result.session_id)
if message.get("role") == "assistant" and message.get("run_id") == result.run_id
][-1]
assert first["task_status"] == "closed"
assert second["task_status"] == "closed"
assert len(feedback_events) == 1
assert assistant["feedback_state"] == "satisfied"
assert assistant["task_status"] == "closed"
assert assistant["validation_status"] == "passed"
with pytest.raises(ValueError, match="already recorded"):
asyncio.run(
service.submit_feedback(
session_id=result.session_id,
run_id=result.run_id,
feedback_type="abandon",
)
)
task = loaded.task_service.get_task(result.task_id)
assert task is not None
assert task.status == "closed"
assert response["acceptance_type"] == "accept"
assert response["feedback_type"] == "satisfied"
assert response["task_status"] == "closed"
def test_task_mode_team_plan_runs_subagent_then_main_synthesis(tmp_path: Path) -> None:
main_provider = StubProvider(
[
LLMResponse(content="final synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model")
]
)
sub_provider = StubProvider(
[
LLMResponse(content="sub-agent evidence", finish_reason="stop", provider_name="stub", model="stub-model")
]
)
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=StubTaskExecutionPlanner([_team_plan()]),
validation_service=StubValidationService([ValidationResult(passed=True, score=0.9, validator="test")]),
)
)
def test_task_service_maps_legacy_status_and_feedback(tmp_path: Path) -> None:
service = TaskService(tmp_path)
task = service.create_task(session_id="s", description="legacy")
task.status = "awaiting_feedback"
task.feedback.append({"feedback_type": "satisfied", "run_id": "run-1"})
service.store.upsert_task(task)
result = asyncio.run(
service.process_direct(
"implement team-backed workflow",
session_id="web:team",
provider_bundle=_provider_bundle(main_provider),
team_provider_bundle_factory=lambda node: _provider_bundle(sub_provider),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(result.task_id)
events = loaded.session_manager.get_event_records(result.session_id)
loaded = service.get_task(task.task_id)
assert result.output_text == "final synthesized answer"
assert task is not None
assert len(task.run_ids) == 2
assert result.run_id == task.run_ids[-1]
assert any(event.event_type == "task_execution_planned" for event in events)
assert any(event.event_type == "task_team_run_completed" for event in events)
assert "sub-agent evidence" in main_provider.calls[0]["messages"][0]["content"]
assert "sub-agent evidence" != result.output_text
def test_task_mode_team_synthesis_runs_without_tools_and_receives_evidence(tmp_path: Path) -> None:
main_provider = StubProvider(
[
LLMResponse(content="final synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model")
]
)
sub_provider = StubProvider(
[
LLMResponse(content="sub-agent evidence", finish_reason="stop", provider_name="stub", model="stub-model")
]
)
validation = StubValidationService([ValidationResult(status="accepted", score=0.9, validator="test")])
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=StubTaskExecutionPlanner([_team_plan()]),
validation_service=validation,
)
)
result = asyncio.run(
service.process_direct(
"implement team-backed workflow",
session_id="web:team-no-tools",
provider_bundle=_provider_bundle(main_provider),
team_provider_bundle_factory=lambda node: _provider_bundle(sub_provider),
)
)
assert result.output_text == "final synthesized answer"
assert main_provider.calls[0]["tools"] is None
assert "sub-agent evidence" in main_provider.calls[0]["messages"][0]["content"]
assert "Task evidence packet" in validation.calls[0]["evidence_text"]
def test_task_mode_team_failure_still_uses_main_synthesis(tmp_path: Path) -> None:
main_provider = StubProvider(
[
LLMResponse(content="fallback synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model")
]
)
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=StubTaskExecutionPlanner([_team_plan()]),
validation_service=StubValidationService([ValidationResult(passed=True, score=0.9, validator="test")]),
)
)
result = asyncio.run(
service.process_direct(
"implement workflow despite team failure",
session_id="web:team-failure",
provider_bundle=_provider_bundle(main_provider),
team_provider_bundle_factory=lambda node: (_ for _ in ()).throw(RuntimeError("sub-agent unavailable")),
)
)
loaded = service.create_loop().boot()
events = loaded.session_manager.get_event_records(result.session_id)
assert result.output_text == "fallback synthesized answer"
assert any(event.event_type == "task_team_run_failed" for event in events)
assert "sub-agent unavailable" in main_provider.calls[0]["messages"][0]["content"]
assert "same class of tools fails repeatedly" in main_provider.calls[0]["messages"][0]["content"]
assert "user-visible fallback answer" in main_provider.calls[0]["messages"][0]["content"]
def test_insufficient_evidence_moves_task_to_needs_review(tmp_path: Path) -> None:
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=_single_planner(),
validation_service=StubValidationService(
[
ValidationResult(
status="insufficient_evidence",
score=0.4,
evidence_gaps=["source missing"],
validator="test",
)
]
),
)
)
result = asyncio.run(
service.process_direct(
"answer with uncertain evidence",
session_id="web:needs-review",
provider_bundle=_bundle("possible answer"),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(result.task_id)
events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id)
validation_event = next(event for event in events if event.event_type == "task_validation_snapshotted")
assert task is not None
assert task.status == "needs_review"
assert task.requires_user_action is True
assert task.is_execution_active is False
assert validation_event.event_payload["validation_result"]["status"] == "insufficient_evidence"
assert validation_event.event_payload["retry_scheduled"] is False
assert validation_event.event_payload["validation_debug"]["tool_result_count"] >= 0
def test_task_mode_team_retry_hides_first_synthesis_run(tmp_path: Path) -> None:
main_provider = StubProvider(
[
LLMResponse(content="first synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model"),
LLMResponse(content="revised synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model"),
]
)
sub_providers = [
StubProvider([LLMResponse(content="first evidence", finish_reason="stop", provider_name="stub", model="stub-model")]),
StubProvider([LLMResponse(content="second evidence", finish_reason="stop", provider_name="stub", model="stub-model")]),
]
service = AgentService(
loader=EngineLoader(
workspace=tmp_path,
task_execution_planner=StubTaskExecutionPlanner([_team_plan(), _team_plan()]),
validation_service=StubValidationService(
[
ValidationResult(passed=False, score=0.2, recommended_revision_prompt="revise", validator="test"),
ValidationResult(passed=True, score=0.9, validator="test"),
]
),
)
)
result = asyncio.run(
service.process_direct(
"implement and validate with team",
session_id="web:team-retry",
provider_bundle=_provider_bundle(main_provider),
team_provider_bundle_factory=lambda node: _provider_bundle(sub_providers.pop(0)),
)
)
loaded = service.create_loop().boot()
task = loaded.task_service.get_task(result.task_id)
visible = loaded.session_manager.get_messages_as_conversation(result.session_id)
visible_contents = [message.get("content") for message in visible]
run_records = {record.run_id: record for record in loaded.run_memory_store.list_runs()}
assert result.output_text == "revised synthesized answer"
assert task is not None
assert len(task.run_ids) == 4
assert "first synthesized answer" not in visible_contents
assert "revised synthesized answer" in visible_contents
for run_id in task.run_ids:
record = run_records[run_id]
events = loaded.session_manager.get_run_event_records(record.session_id, run_id)
skill_effects = [event for event in events if event.event_type == "skill_effects_snapshotted"]
assert skill_effects
assert skill_effects[-1].event_payload["candidate_generation_allowed"] is False
def test_context_builder_strips_ui_projection_fields_from_provider_history() -> None:
result = ContextBuilder().build_messages(
ContextBuildInput(
history=[
{
"role": "assistant",
"content": "done",
"run_id": "run-1",
"task_id": "task-1",
"task_status": "closed",
"validation_status": "passed",
"feedback_state": "satisfied",
}
],
)
)
assistant = result.messages[-1]
assert assistant == {"role": "assistant", "content": "done"}
def test_context_builder_normalizes_persisted_tool_arguments() -> None:
result = ContextBuilder().build_messages(
ContextBuildInput(
history=[
{
"role": "assistant",
"content": None,
"tool_calls": [
{
"id": "call-1",
"type": "function",
"function": {
"name": "cron",
"arguments": {"action": "add", "mode": "notification"},
},
}
],
}
],
)
)
tool_call = result.messages[-1]["tool_calls"][0]
assert tool_call["function"]["arguments"] == '{"action": "add", "mode": "notification"}'
def test_llm_validator_parse_failure_is_not_accepted(tmp_path: Path) -> None:
task_service = TaskService(tmp_path / "tasks")
task = task_service.create_task(session_id="web:validator", description="implement validator handling")
validation = asyncio.run(
ValidationService().validate_task_result(
task=task,
user_message="implement validator handling",
final_output="done",
provider_bundle=_main_only_bundle("not json"),
)
)
assert validation.accepted is False
assert validation.status == "validator_error"
assert validation.validator == "llm_error"
assert validation.issues
assert loaded is not None
assert loaded.status == "awaiting_acceptance"
assert loaded.feedback[0]["acceptance_type"] == "accept"