diff --git a/app-instance/backend/beaver/coordinator/execution/scheduler.py b/app-instance/backend/beaver/coordinator/execution/scheduler.py index 6027599..8516fec 100644 --- a/app-instance/backend/beaver/coordinator/execution/scheduler.py +++ b/app-instance/backend/beaver/coordinator/execution/scheduler.py @@ -84,11 +84,21 @@ class TeamGraphScheduler: **kwargs, ) -> list[NodeRunResult]: results: list[NodeRunResult] = [] + nodes_by_id = {node.node_id: node for node in nodes} for node in nodes: - if any(not item.success for item in results): - results.append(self._blocked(node, results)) + blocking = [ + item + for item in results + if self._blocks_downstream(item, nodes_by_id[item.node_id]) + ] + if blocking: + results.append(self._blocked(node, blocking)) continue - dependency_outputs = {item.node_id: item.output_text for item in results if item.success} + dependency_outputs = { + item.node_id: item.output_text + for item in results + if item.completion_status in {"succeeded", "partial"} + } results.append(await self._run_node(node, dependency_outputs=dependency_outputs, **kwargs)) return results @@ -116,6 +126,7 @@ class TeamGraphScheduler: **kwargs, ) -> list[NodeRunResult]: pending = {node.node_id: node for node in nodes} + nodes_by_id = {node.node_id: node for node in nodes} completed: dict[str, NodeRunResult] = {} ordered: list[NodeRunResult] = [] @@ -123,18 +134,28 @@ class TeamGraphScheduler: blocked_ids = { node_id for node_id, node in pending.items() - if any(dep in completed and not completed[dep].success for dep in node.depends_on) + if any( + dep in completed + and self._blocks_downstream(completed[dep], nodes_by_id[dep]) + for dep in node.depends_on + ) } for node_id in sorted(blocked_ids): node = pending.pop(node_id) result = self._blocked(node, list(completed.values())) completed[node_id] = result ordered.append(result) + if blocked_ids: + continue ready = [ node for node in pending.values() - if all(dep in completed and completed[dep].success for dep in node.depends_on) + if all( + dep in completed + and not self._blocks_downstream(completed[dep], nodes_by_id[dep]) + for dep in node.depends_on + ) ] if not ready: if pending: @@ -196,6 +217,17 @@ class TeamGraphScheduler: expected_output=node.expected_output, node_id=node.node_id, dependency_outputs=dict(dependency_outputs), + input_contract=dict(node.input_contract), + output_contract=dict(node.output_contract), + allowed_tool_names=( + None if node.allowed_tool_names is None else list(node.allowed_tool_names) + ), + required_evidence=list(node.required_evidence), + evidence_contract=dict(node.evidence_contract), + validation_rules=list(node.validation_rules), + required_for_completion=node.required_for_completion, + block_downstream_on_partial=node.block_downstream_on_partial, + max_tool_iterations=node.max_tool_iterations, ) node_provider_bundle = provider_bundle_factory(node) if provider_bundle_factory is not None else provider_bundle return await self.runner.run( @@ -213,8 +245,17 @@ class TeamGraphScheduler: output_text="", finish_reason="error", error=str(exc), + completion_status="failed", ) + @staticmethod + def _blocks_downstream(result: NodeRunResult, node: ExecutionNode) -> bool: + if result.completion_status in {"failed", "blocked"}: + return True + if result.completion_status == "partial": + return node.block_downstream_on_partial + return not result.success + @staticmethod def _merge_pinned(parent: list[str], local: list[str]) -> list[str]: result: list[str] = [] @@ -245,6 +286,7 @@ class TeamGraphScheduler: output_text="", finish_reason="blocked", error=f"Blocked by failed dependency: {detail}", + completion_status="blocked", ) @staticmethod diff --git a/app-instance/backend/beaver/coordinator/local.py b/app-instance/backend/beaver/coordinator/local.py index f225e0a..51583ee 100644 --- a/app-instance/backend/beaver/coordinator/local.py +++ b/app-instance/backend/beaver/coordinator/local.py @@ -6,7 +6,7 @@ from uuid import uuid4 from beaver.engine import AgentLoop from beaver.engine.providers import ProviderBundle -from beaver.tasks.evidence import EvidenceBuilder +from beaver.tasks.evidence import EvidenceBuilder, evaluate_node_evidence from .models import DelegationEnvelope, NodeRunResult @@ -54,6 +54,8 @@ class LocalAgentRunner: task_mode=bool(envelope.parent_task_id), pinned_skill_names=envelope.inherited_pinned_skills, pinned_skill_contexts=envelope.inherited_pinned_skill_contexts, + allowed_tool_names=envelope.allowed_tool_names, + max_tool_iterations=envelope.max_tool_iterations, allow_candidate_generation=allow_candidate_generation, ) loaded = target_loop.boot() @@ -63,7 +65,23 @@ class LocalAgentRunner: result.output_text, result.finish_reason, ) - success = result.finish_reason == "stop" + evidence_gaps = evaluate_node_evidence( + evidence, + envelope.required_evidence, + result.output_text, + ) + run_succeeded = result.finish_reason == "stop" + if not run_succeeded: + completion_status = "failed" + elif evidence_gaps: + completion_status = "partial" + else: + completion_status = "succeeded" + success = completion_status == "succeeded" + if completion_status == "partial": + error = "; ".join(evidence_gaps) + else: + error = None if success else (result.output_text or result.finish_reason) return NodeRunResult( node_id=envelope.node_id or envelope.agent.name, success=success, @@ -71,8 +89,10 @@ class LocalAgentRunner: run_id=result.run_id, session_id=result.session_id, finish_reason=result.finish_reason, - error=None if success else (result.output_text or result.finish_reason), + error=error, evidence=evidence, + completion_status=completion_status, + evidence_gaps=evidence_gaps, ) @staticmethod diff --git a/app-instance/backend/beaver/coordinator/models.py b/app-instance/backend/beaver/coordinator/models.py index f54f036..aa45ae6 100644 --- a/app-instance/backend/beaver/coordinator/models.py +++ b/app-instance/backend/beaver/coordinator/models.py @@ -51,6 +51,15 @@ class DelegationEnvelope: expected_output: str | None = None node_id: str | None = None dependency_outputs: dict[str, str] = field(default_factory=dict) + input_contract: dict[str, Any] = field(default_factory=dict) + output_contract: dict[str, Any] = field(default_factory=dict) + allowed_tool_names: list[str] | None = None + required_evidence: list[str] = field(default_factory=list) + evidence_contract: dict[str, Any] = field(default_factory=dict) + validation_rules: list[str] = field(default_factory=list) + required_for_completion: bool = True + block_downstream_on_partial: bool = False + max_tool_iterations: int | None = None @dataclass(slots=True) @@ -65,6 +74,15 @@ class ExecutionNode: inherited_pinned_skill_contexts: list["SkillContext"] = field(default_factory=list) constraints: list[str] = field(default_factory=list) expected_output: str | None = None + input_contract: dict[str, Any] = field(default_factory=dict) + output_contract: dict[str, Any] = field(default_factory=dict) + allowed_tool_names: list[str] | None = None + required_evidence: list[str] = field(default_factory=list) + evidence_contract: dict[str, Any] = field(default_factory=dict) + validation_rules: list[str] = field(default_factory=list) + required_for_completion: bool = True + block_downstream_on_partial: bool = False + max_tool_iterations: int | None = None @dataclass(slots=True) @@ -74,7 +92,7 @@ class ExecutionGraph: strategy: TeamStrategy nodes: list[ExecutionNode] - def validate(self) -> None: + def validate(self, *, max_depth: int | None = None) -> None: if self.strategy not in {"sequence", "parallel", "dag"}: raise NotImplementedError(f"Team strategy {self.strategy!r} is reserved but not implemented in v1") if not self.nodes: @@ -91,19 +109,25 @@ class ExecutionGraph: visited: set[str] = set() deps = {node.node_id: list(node.depends_on) for node in self.nodes} - def visit(node_id: str) -> None: + def visit(node_id: str) -> int: if node_id in visited: - return + return depths[node_id] if node_id in visiting: raise ValueError(f"ExecutionGraph has cyclic or unresolved dependencies involving {node_id!r}") visiting.add(node_id) + depth = 1 for dep in deps[node_id]: - visit(dep) + depth = max(depth, visit(dep) + 1) visiting.remove(node_id) visited.add(node_id) + depths[node_id] = depth + return depth + depths: dict[str, int] = {} for node_id in node_ids: - visit(node_id) + depth = visit(node_id) + if max_depth is not None and depth > max_depth: + raise ValueError(f"ExecutionGraph exceeds max depth {max_depth}") @dataclass(slots=True) @@ -118,6 +142,8 @@ class NodeRunResult: finish_reason: str = "stop" error: str | None = None evidence: "RunEvidence | None" = None + completion_status: str = "succeeded" + evidence_gaps: list[str] = field(default_factory=list) def to_dict(self) -> dict[str, Any]: return { @@ -129,6 +155,8 @@ class NodeRunResult: "finish_reason": self.finish_reason, "error": self.error, "evidence": self.evidence.to_dict() if self.evidence is not None else None, + "completion_status": self.completion_status, + "evidence_gaps": list(self.evidence_gaps), } diff --git a/app-instance/backend/beaver/engine/context/builder.py b/app-instance/backend/beaver/engine/context/builder.py index c229635..b7775c1 100644 --- a/app-instance/backend/beaver/engine/context/builder.py +++ b/app-instance/backend/beaver/engine/context/builder.py @@ -48,6 +48,8 @@ class SkillContext: content_hash: str = "" activation_reason: str = "selected" tool_hints: list[str] = field(default_factory=list) + team_template: dict[str, Any] | None = None + team_template_warnings: list[str] = field(default_factory=list) @dataclass(slots=True) diff --git a/app-instance/backend/beaver/engine/loader.py b/app-instance/backend/beaver/engine/loader.py index d51666b..dfd626b 100644 --- a/app-instance/backend/beaver/engine/loader.py +++ b/app-instance/backend/beaver/engine/loader.py @@ -317,7 +317,10 @@ class EngineLoader: draft_service=draft_service, ) task_service = self._task_service or TaskService(workspace / "tasks") - task_execution_planner = self._task_execution_planner or TaskExecutionPlanner(task_skill_resolver=task_skill_resolver) + task_execution_planner = self._task_execution_planner or TaskExecutionPlanner( + task_skill_resolver=task_skill_resolver, + tool_registry=tool_registry, + ) mcp_manager = MCPConnectionManager( self.config.tools.mcp_servers, authz_config=self.config.authz, diff --git a/app-instance/backend/beaver/engine/loop.py b/app-instance/backend/beaver/engine/loop.py index 588421c..e9a5b7f 100644 --- a/app-instance/backend/beaver/engine/loop.py +++ b/app-instance/backend/beaver/engine/loop.py @@ -8,6 +8,7 @@ import os import re from dataclasses import dataclass, field from datetime import datetime, timezone +from time import perf_counter from typing import Any from uuid import uuid4 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError @@ -81,6 +82,49 @@ class _DirectRunRequest: future: asyncio.Future[AgentRunResult] +@dataclass(slots=True) +class _WebSearchLoopGuard: + low_quality_limit: int = 3 + _low_quality_count: int = 0 + _last_query: str = "" + + def observe_result(self, tool_name: str, content: str) -> dict[str, str] | None: + if tool_name != "web_search": + self._reset() + return None + try: + payload = json.loads(content) + except Exception: + self._reset() + return None + + query = str(payload.get("query") or self._last_query or "").strip() + is_low_quality = payload.get("success") is False or payload.get("quality") == "low" + if not is_low_quality: + self._reset() + self._last_query = query + return None + + self._low_quality_count += 1 + self._last_query = query + if self._low_quality_count < self.low_quality_limit: + return None + + query_text = f" for query '{query}'" if query else "" + return { + "finish_reason": "web_search_low_quality_budget", + "message": ( + "Web search returned low-quality or failed results repeatedly" + f"{query_text}. Stop retrying query variants; use confirmed sources already found, " + "state uncertainty clearly, and mark missing fields as N/A." + ), + } + + def _reset(self) -> None: + self._low_quality_count = 0 + self._last_query = "" + + class AgentLoop: """Single execution kernel shared by root agents and delegated agents.""" @@ -240,6 +284,7 @@ class AgentLoop: thinking_enabled: bool | None = None, include_skill_assembly: bool = True, include_tools: bool = True, + allowed_tool_names: list[str] | None = None, max_tool_iterations: int | None = None, provider_bundle: ProviderBundle | None = None, parent_session_id: str | None = None, @@ -252,6 +297,7 @@ class AgentLoop: allow_candidate_generation: bool = False, intent_agent_decision: dict[str, Any] | None = None, channel_identity: ChannelIdentity | None = None, + pre_run_latency_ms: dict[str, float] | None = None, ) -> AgentRunResult: """跑通最小 direct run 主链。 @@ -292,6 +338,7 @@ class AgentLoop: thinking_enabled=thinking_enabled, include_skill_assembly=include_skill_assembly, include_tools=include_tools, + allowed_tool_names=allowed_tool_names, max_tool_iterations=max_tool_iterations, provider_bundle=provider_bundle, parent_session_id=parent_session_id, @@ -304,6 +351,7 @@ class AgentLoop: allow_candidate_generation=allow_candidate_generation, intent_agent_decision=intent_agent_decision, channel_identity=channel_identity, + pre_run_latency_ms=pre_run_latency_ms, ) async def _process_direct_impl( @@ -332,6 +380,7 @@ class AgentLoop: thinking_enabled: bool | None = None, include_skill_assembly: bool = True, include_tools: bool = True, + allowed_tool_names: list[str] | None = None, max_tool_iterations: int | None = None, provider_bundle: ProviderBundle | None = None, parent_session_id: str | None = None, @@ -344,6 +393,7 @@ class AgentLoop: allow_candidate_generation: bool = False, intent_agent_decision: dict[str, Any] | None = None, channel_identity: ChannelIdentity | None = None, + pre_run_latency_ms: dict[str, float] | None = None, ) -> AgentRunResult: """真正执行一轮 direct run 的内部实现。 @@ -353,8 +403,25 @@ class AgentLoop: - 这样才能保证 run 模式下外部不能绕过队列直接执行 """ + run_perf_started = perf_counter() + latency_ms = self._initial_latency_ms(pre_run_latency_ms) + + def add_latency(key: str, started_at: float) -> None: + latency_ms[key] = latency_ms.get(key, 0.0) + (perf_counter() - started_at) * 1000 + loaded = self.boot() session_manager = self._require_loaded("session_manager") + + def session_write(callable_obj: Any, *args: Any, **kwargs: Any) -> Any: + started_at = perf_counter() + try: + return callable_obj(*args, **kwargs) + finally: + add_latency("session_write_ms", started_at) + + def append_message(session_id_value: str, **kwargs: Any) -> int: + return session_write(session_manager.append_message, session_id_value, **kwargs) + memory_service = self._require_loaded("memory_service") context_builder = self._require_loaded("context_builder") tool_registry = self._require_loaded("tool_registry") @@ -365,9 +432,13 @@ class AgentLoop: skill_assembler = self._require_loaded("skill_assembler") skill_learning_service = self._require_loaded("skill_learning_service") mcp_manager = getattr(loaded, "mcp_manager", None) - if mcp_manager is not None: - loaded.mcp_report = await mcp_manager.connect_all(tool_registry) - loaded.tools = [spec.name for spec in tool_registry.list_specs()] + if include_tools and mcp_manager is not None: + started_at = perf_counter() + try: + loaded.mcp_report = await mcp_manager.connect_all(tool_registry) + loaded.tools = [spec.name for spec in tool_registry.list_specs()] + finally: + add_latency("mcp_ms", started_at) config = loaded.config configured_provider = config.resolve_provider_target(model=model, provider_name=provider_name) @@ -393,13 +464,15 @@ class AgentLoop: memory_snapshot = memory_service.capture_snapshot_for_run() if parent_session_id: - session_manager.ensure_session( + session_write( + session_manager.ensure_session, parent_session_id, source="unknown", model=resolved_model, user_id=user_id, ) - session_manager.ensure_session( + session_write( + session_manager.ensure_session, resolved_session_id, source=source, model=resolved_model, @@ -407,7 +480,7 @@ class AgentLoop: user_id=user_id, parent_session_id=parent_session_id, ) - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -435,7 +508,7 @@ class AgentLoop: user_id=user_id, ) if intent_agent_decision: - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -480,35 +553,39 @@ class AgentLoop: *(pinned_skill_contexts or []), *self._load_pinned_skill_contexts(skills_loader, pinned_skill_names or []), ] - if not include_skill_assembly: - activated_skills = self._merge_skill_contexts(pinned_skills, []) - else: - skill_query = skill_selection_context or task - assembled_skills = await skill_assembler.assemble( - task_description=skill_query, - provider=skill_selector_provider, - model=skill_selector_model, - embedding_runtime=bundle.embedding_runtime, - thinking_enabled=thinking_enabled, - ) - for interaction in getattr(assembled_skills, "llm_interactions", []) or []: - session_manager.append_message( - resolved_session_id, - run_id=resolved_run_id, - role="system", - event_type="skill_assembler_llm_interaction_snapshotted", - event_payload=interaction, - content=json.dumps(interaction, ensure_ascii=False, default=str), - context_visible=False, - source=source, - title=title, + started_at = perf_counter() + try: + if not include_skill_assembly: + activated_skills = self._merge_skill_contexts(pinned_skills, []) + else: + skill_query = skill_selection_context or task + assembled_skills = await skill_assembler.assemble( + task_description=skill_query, + provider=skill_selector_provider, model=skill_selector_model, - user_id=user_id, + embedding_runtime=bundle.embedding_runtime, + thinking_enabled=thinking_enabled, ) - activated_skills = self._merge_skill_contexts( - pinned_skills, - assembled_skills.activated_skills, - ) + for interaction in getattr(assembled_skills, "llm_interactions", []) or []: + append_message( + resolved_session_id, + run_id=resolved_run_id, + role="system", + event_type="skill_assembler_llm_interaction_snapshotted", + event_payload=interaction, + content=json.dumps(interaction, ensure_ascii=False, default=str), + context_visible=False, + source=source, + title=title, + model=skill_selector_model, + user_id=user_id, + ) + activated_skills = self._merge_skill_contexts( + pinned_skills, + assembled_skills.activated_skills, + ) + finally: + add_latency("skill_assembly_ms", started_at) skill_activation_messages = context_builder.build_skill_activation_messages( activated_skills ) @@ -527,7 +604,7 @@ class AgentLoop: ] if skill_activation_messages or activated_receipts: - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -544,19 +621,26 @@ class AgentLoop: user_id=user_id, ) - if not include_tools: - selected_tool_specs = [] - else: - selected_tool_specs = await tool_assembler.assemble( - task_description=task, - registry=tool_registry, - skills_loader=skills_loader, - activated_skills=activated_skills, - embedding_runtime=bundle.embedding_runtime, - top_k=10, - ) + started_at = perf_counter() + try: + if not include_tools: + selected_tool_specs = [] + else: + selected_tool_specs = await tool_assembler.assemble( + task_description=task, + registry=tool_registry, + skills_loader=skills_loader, + activated_skills=activated_skills, + embedding_runtime=bundle.embedding_runtime, + top_k=10, + ) + if allowed_tool_names is not None: + allowed = set(allowed_tool_names) + selected_tool_specs = [spec for spec in selected_tool_specs if spec.name in allowed] + finally: + add_latency("tool_assembly_ms", started_at) tool_schemas = tool_registry.export_selected_provider_schemas(selected_tool_specs) - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -573,37 +657,41 @@ class AgentLoop: user_id=user_id, ) - build_input = ContextBuildInput( - base_system_prompt=self.profile.system_prompt, - prompt_locale=prompt_locale, - history=session_manager.get_history( - resolved_session_id, - max_messages=max(1, self.profile.max_context_messages), - ), - current_user_input=task, - memory_snapshot=memory_snapshot, - activated_skills=activated_skills, - session_context=SessionContext( - session_id=resolved_session_id, - source=source, - model=resolved_model, - user_id=user_id, - channel=channel_identity.channel_id if channel_identity else None, - channel_kind=channel_identity.kind if channel_identity else None, - account_id=channel_identity.account_id if channel_identity else None, - peer_id=channel_identity.peer_id if channel_identity else None, - peer_type=channel_identity.peer_type if channel_identity else None, - chat_id=channel_identity.peer_id if channel_identity else None, - thread_id=channel_identity.thread_id if channel_identity else None, - parent_session_id=parent_session_id, - ), - runtime_context=self._current_runtime_context(), - execution_context=execution_context, - extra_sections=[TOOL_FAILURE_GUIDANCE_PROMPT], - ) - context_result = context_builder.build_messages(build_input) + started_at = perf_counter() + try: + build_input = ContextBuildInput( + base_system_prompt=self.profile.system_prompt, + prompt_locale=prompt_locale, + history=session_manager.get_history( + resolved_session_id, + max_messages=max(1, self.profile.max_context_messages), + ), + current_user_input=task, + memory_snapshot=memory_snapshot, + activated_skills=activated_skills, + session_context=SessionContext( + session_id=resolved_session_id, + source=source, + model=resolved_model, + user_id=user_id, + channel=channel_identity.channel_id if channel_identity else None, + channel_kind=channel_identity.kind if channel_identity else None, + account_id=channel_identity.account_id if channel_identity else None, + peer_id=channel_identity.peer_id if channel_identity else None, + peer_type=channel_identity.peer_type if channel_identity else None, + chat_id=channel_identity.peer_id if channel_identity else None, + thread_id=channel_identity.thread_id if channel_identity else None, + parent_session_id=parent_session_id, + ), + runtime_context=self._current_runtime_context(), + execution_context=execution_context, + extra_sections=[TOOL_FAILURE_GUIDANCE_PROMPT], + ) + context_result = context_builder.build_messages(build_input) + finally: + add_latency("context_build_ms", started_at) if skill_selection_context: - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -621,8 +709,8 @@ class AgentLoop: model=resolved_model, user_id=user_id, ) - session_manager.update_system_prompt(resolved_session_id, context_result.system_prompt) - session_manager.append_message( + session_write(session_manager.update_system_prompt, resolved_session_id, context_result.system_prompt) + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -639,7 +727,7 @@ class AgentLoop: model=resolved_model, user_id=user_id, ) - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="user", @@ -676,6 +764,9 @@ class AgentLoop: "session_id": resolved_session_id, "task_id": task_id, "run_id": resolved_run_id, + "allowed_tool_names": ( + None if allowed_tool_names is None else list(allowed_tool_names) + ), }, ) @@ -683,6 +774,7 @@ class AgentLoop: final_finish_reason = "stop" final_provider_name = bundle.main_runtime.provider_name final_model = bundle.main_runtime.model + web_search_loop_guard = _WebSearchLoopGuard() while True: chat_kwargs: dict[str, Any] = { @@ -713,7 +805,7 @@ class AgentLoop: "temperature": resolved_temperature, "thinking_enabled": thinking_enabled, } - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -726,14 +818,18 @@ class AgentLoop: model=final_model, user_id=user_id, ) - response = await provider.chat(**chat_kwargs) + started_at = perf_counter() + try: + response = await provider.chat(**chat_kwargs) + finally: + add_latency("llm_ms", started_at) final_provider_name = response.provider_name or final_provider_name final_model = response.model or final_model final_usage = self._merge_usage(final_usage, response.usage or {}) - self._record_usage(session_manager, resolved_session_id, response.usage or {}) + session_write(self._record_usage, session_manager, resolved_session_id, response.usage or {}) assistant_tool_calls = self._serialize_tool_calls(response.tool_calls) - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="assistant", @@ -764,17 +860,21 @@ class AgentLoop: break if iterations >= resolved_max_tool_iterations: - finalized = await self._finalize_after_tool_limit( - provider=provider, - messages=messages, - model=final_model, - max_tokens=resolved_max_tokens, - temperature=resolved_temperature, - thinking_enabled=thinking_enabled, - ) + started_at = perf_counter() + try: + finalized = await self._finalize_after_tool_limit( + provider=provider, + messages=messages, + model=final_model, + max_tokens=resolved_max_tokens, + temperature=resolved_temperature, + thinking_enabled=thinking_enabled, + ) + finally: + add_latency("llm_ms", started_at) final_text = finalized or RAW_TOOL_CALL_FALLBACK final_finish_reason = "max_tool_iterations_finalized" if finalized else "max_tool_iterations" - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="assistant", @@ -800,9 +900,26 @@ class AgentLoop: reasoning_content=response.reasoning_content, ) iterations += 1 - for tool_call in response.tool_calls: - result = await effective_tool_executor.execute_tool_call(tool_call, context=tool_context) - session_manager.append_message( + started_at = perf_counter() + try: + if self._can_run_tool_calls_concurrently(response.tool_calls, tool_registry): + tool_results = await asyncio.gather( + *( + effective_tool_executor.execute_tool_call(tool_call, context=tool_context) + for tool_call in response.tool_calls + ) + ) + else: + tool_results = [] + for tool_call in response.tool_calls: + tool_results.append( + await effective_tool_executor.execute_tool_call(tool_call, context=tool_context) + ) + finally: + add_latency("tool_ms", started_at) + web_guard_decision: dict[str, str] | None = None + for tool_call, result in zip(response.tool_calls, tool_results, strict=True): + append_message( resolved_session_id, run_id=resolved_run_id, role="tool", @@ -825,8 +942,30 @@ class AgentLoop: tool_name=result.tool_name, result=result.content, ) + if web_guard_decision is None: + web_guard_decision = web_search_loop_guard.observe_result(result.tool_name, result.content) + if web_guard_decision is not None: + final_text = web_guard_decision["message"] + final_finish_reason = web_guard_decision["finish_reason"] + append_message( + resolved_session_id, + run_id=resolved_run_id, + role="assistant", + event_type="assistant_message_added", + event_payload={"task_id": task_id} if task_id else None, + content=final_text, + finish_reason=final_finish_reason, + source=source, + title=title, + model=final_model, + user_id=user_id, + ) + context_builder.add_assistant_message(messages, content=final_text) + break - session_manager.append_message( + final_latency_ms = self._final_latency_ms(latency_ms, run_perf_started) + final_usage_with_latency = self._usage_with_latency(final_usage, final_latency_ms) + append_message( resolved_session_id, run_id=resolved_run_id, role="system", @@ -837,6 +976,7 @@ class AgentLoop: "task_id": task_id, "task_mode": task_mode, "attempt_index": attempt_index, + "latency_ms": final_latency_ms, }, content=final_text, finish_reason=final_finish_reason, @@ -869,12 +1009,12 @@ class AgentLoop: tool_iterations=iterations, provider_name=final_provider_name, model=final_model, - usage=final_usage, + usage=final_usage_with_latency, task_id=task_id, ) except Exception as exc: if not user_message_recorded: - session_manager.append_message( + append_message( resolved_session_id, run_id=resolved_run_id, role="user", @@ -885,6 +1025,7 @@ class AgentLoop: model=resolved_model, user_id=user_id, ) + final_latency_ms = self._final_latency_ms(latency_ms, run_perf_started) result = self._build_error_result( session_manager=session_manager, session_id=resolved_session_id, @@ -896,8 +1037,9 @@ class AgentLoop: message=f"Run failed before completion: {exc}", tool_iterations=iterations, provider_name=final_provider_name, - usage=final_usage, + usage=self._usage_with_latency(final_usage, final_latency_ms), task_id=task_id, + latency_ms=final_latency_ms, ) self._record_run_receipts( skill_learning_service=skill_learning_service, @@ -1032,6 +1174,80 @@ class AgentLoop: ) return payload + @staticmethod + def _can_run_tool_calls_concurrently(tool_calls: list[Any], tool_registry: Any) -> bool: + if len(tool_calls) < 2: + return False + return all(AgentLoop._is_read_only_tool_call(tool_call, tool_registry) for tool_call in tool_calls) + + @staticmethod + def _is_read_only_tool_call(tool_call: Any, tool_registry: Any) -> bool: + name = AgentLoop._tool_call_name(tool_call) + if not name: + return False + tool = tool_registry.get(name) if tool_registry is not None else None + if tool is None: + return False + spec = getattr(tool, "spec", None) + toolset = str(getattr(spec, "toolset", "") or "").lower() + metadata = getattr(spec, "metadata", {}) or {} + if metadata.get("read_only") is True: + return True + if metadata.get("mutates") or metadata.get("sensitive"): + return False + return name in { + "list_directory", + "read_file", + "search_files", + "session_search", + "skills_list", + "skill_view", + "user_files_list", + "user_files_read", + "web_fetch", + "web_search", + } and toolset in {"filesystem", "session", "skills", "user_files", "web"} + + @staticmethod + def _tool_call_name(tool_call: Any) -> str: + if not isinstance(tool_call, dict): + return str(getattr(tool_call, "name", "") or "") + function = tool_call.get("function") + if isinstance(function, dict): + return str(function.get("name") or "") + return str(tool_call.get("name") or "") + + @staticmethod + def _initial_latency_ms(pre_run_latency_ms: dict[str, float] | None) -> dict[str, float]: + latency = { + "router_ms": 0.0, + "mcp_ms": 0.0, + "skill_assembly_ms": 0.0, + "tool_assembly_ms": 0.0, + "context_build_ms": 0.0, + "llm_ms": 0.0, + "tool_ms": 0.0, + "session_write_ms": 0.0, + "total_ms": 0.0, + } + if pre_run_latency_ms: + for key, value in pre_run_latency_ms.items(): + if isinstance(value, (int, float)): + latency[str(key)] = latency.get(str(key), 0.0) + float(value) + return latency + + @staticmethod + def _final_latency_ms(latency_ms: dict[str, float], run_perf_started: float) -> dict[str, float]: + finalized = dict(latency_ms) + finalized["total_ms"] = finalized.get("total_ms", 0.0) + (perf_counter() - run_perf_started) * 1000 + return {key: round(max(0.0, float(value)), 3) for key, value in finalized.items()} + + @staticmethod + def _usage_with_latency(usage: dict[str, Any], latency_ms: dict[str, float]) -> dict[str, Any]: + payload = dict(usage) + payload["latency_ms"] = dict(latency_ms) + return payload + @staticmethod def _record_usage(session_manager: Any, session_id: str, usage: dict[str, Any]) -> None: """把 provider usage 映射到 session usage 字段。 @@ -1079,6 +1295,7 @@ class AgentLoop: provider_name: str | None, usage: dict[str, Any], task_id: str | None = None, + latency_ms: dict[str, float] | None = None, ) -> AgentRunResult: """把主链中的未处理异常收口成可追踪的 assistant error turn。""" @@ -1104,6 +1321,7 @@ class AgentLoop: "tool_iterations": tool_iterations, "provider_name": provider_name, "task_id": task_id, + "latency_ms": latency_ms or {}, }, content=message, finish_reason="error", diff --git a/app-instance/backend/beaver/interfaces/web/app.py b/app-instance/backend/beaver/interfaces/web/app.py index f1318f7..daffc23 100644 --- a/app-instance/backend/beaver/interfaces/web/app.py +++ b/app-instance/backend/beaver/interfaces/web/app.py @@ -43,6 +43,7 @@ from beaver.services.user_files import ( UserFileNotFoundError, UserFilePathError, UserFileSizeError, + UserFileStorageError, UserFileService, ) from beaver.services.user_file_resolver import ( @@ -644,6 +645,8 @@ def create_app( return HTTPException(status_code=400, detail=str(exc) or "Invalid path") if isinstance(exc, UserFileSizeError): return HTTPException(status_code=413, detail=str(exc) or "File too large") + if isinstance(exc, UserFileStorageError): + return HTTPException(status_code=503, detail=str(exc) or "User file storage is unavailable") if isinstance(exc, UserFileConfigurationError): return HTTPException(status_code=503, detail=str(exc) or "User file storage is not configured") return HTTPException(status_code=400, detail=str(exc) or "User file operation failed") @@ -1327,6 +1330,7 @@ def create_app( "runs": runs, } ) + sessions.sort(key=lambda item: item.get("updated_at") or item.get("created_at") or "", reverse=True) return {"sessions": sessions} @app.post("/api/sessions/{session_id:path}/archive") @@ -3166,6 +3170,11 @@ def _debug_runs_for_session(session_manager: Any, session_id: str) -> list[dict[ title = getattr(started, "title", None) if title is None: title = source or "run" + latency_ms = None + if completed is not None and isinstance(completed.event_payload, dict): + raw_latency = completed.event_payload.get("latency_ms") + latency_ms = raw_latency if isinstance(raw_latency, dict) else None + sorted_records = sorted(records, key=lambda item: item.timestamp or 0, reverse=True) runs.append( { "run_id": run_id, @@ -3181,10 +3190,15 @@ def _debug_runs_for_session(session_manager: Any, session_id: str) -> list[dict[ "started_at": _iso_from_timestamp(started.timestamp if started is not None else None), "ended_at": _iso_from_timestamp(completed.timestamp) if completed is not None else None, "finish_reason": completed.finish_reason if completed is not None else None, - "events": [_debug_event_to_dict(item) for item in records], + "latency_ms": latency_ms or {}, + "events": [_debug_event_to_dict(item) for item in sorted_records], } ) - return runs + return sorted( + runs, + key=lambda item: item.get("ended_at") or item.get("started_at") or "", + reverse=True, + ) def _debug_event_to_dict(record: Any) -> dict[str, Any]: diff --git a/app-instance/backend/beaver/services/agent_service.py b/app-instance/backend/beaver/services/agent_service.py index 7fcfc98..c671e38 100644 --- a/app-instance/backend/beaver/services/agent_service.py +++ b/app-instance/backend/beaver/services/agent_service.py @@ -14,24 +14,20 @@ from __future__ import annotations import asyncio from pathlib import Path +from time import perf_counter from typing import Any from uuid import uuid4 -from beaver.coordinator.models import ExecutionNode, TeamRunResult from beaver.engine import AgentLoop, AgentProfile, AgentRunResult, EngineLoader from beaver.engine.providers import make_provider_bundle from beaver.foundation.events import InboundMessage, OutboundMessage from beaver.foundation.models import CronJob, CronRunRecord from beaver.prompts.main_agent import normalize_main_agent_prompt_locale from beaver.tasks import ( - EvidenceBuilder, MainAgentRouter, - RunEvidence, - TaskEvidencePacket, - TaskExecutionPlan, TaskRecord, - render_task_evidence, ) +from beaver.tasks.attempt_orchestrator import TaskAttemptOrchestrator from beaver.tasks.service import normalize_acceptance_type @@ -594,15 +590,22 @@ class AgentService: router_provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider router_runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime active_task = task_service.get_latest_open_task(session_id) - decision = await self._main_agent_router.classify( - message, - active_task=active_task, - provider=router_provider, - model=getattr(router_runtime, "model", None), - recent_messages=session_manager.get_messages_as_conversation(session_id), - intent_skill=self._load_intent_agent_skill(loaded), - thinking_enabled=kwargs.get("thinking_enabled"), - ) + router_started = perf_counter() + try: + decision = await self._main_agent_router.classify( + message, + active_task=active_task, + provider=router_provider, + model=getattr(router_runtime, "model", None), + recent_messages=session_manager.get_messages_as_conversation(session_id), + intent_skill=self._load_intent_agent_skill(loaded), + thinking_enabled=kwargs.get("thinking_enabled"), + ) + finally: + kwargs["pre_run_latency_ms"] = self._merge_latency_ms( + kwargs.get("pre_run_latency_ms"), + {"router_ms": (perf_counter() - router_started) * 1000}, + ) kwargs["intent_agent_decision"] = self._intent_decision_payload( decision, active_task=active_task, @@ -751,216 +754,19 @@ class AgentService: task: TaskRecord, ) -> AgentRunResult: loaded = self.create_loop().boot() - task_service = self._require_loaded(loaded, "task_service") - task_execution_planner = self._require_loaded(loaded, "task_execution_planner") - session_manager = self._require_loaded(loaded, "session_manager") - - base_execution_context = kwargs.get("execution_context") - prompt_locale = kwargs.get("prompt_locale") or task.metadata.get("prompt_locale") - output_language_instruction = self._output_language_instruction(prompt_locale) - provider_bundle = kwargs.get("provider_bundle") or self._make_provider_bundle_for_task(loaded, kwargs) - kwargs = dict(kwargs) - team_provider_bundle_factory = kwargs.pop("team_provider_bundle_factory", None) - kwargs["provider_bundle"] = provider_bundle - - attempt_index = int(task.metadata.get("latest_attempt_index") or 0) + 1 - task_service.start_run(task.task_id, user_message=message, attempt_index=attempt_index) - plan = await task_execution_planner.plan( + return await self._build_task_attempt_orchestrator(loaded).run( + message=message, + runner=runner, + kwargs=kwargs, task=task, - user_message=message, - attempt_index=attempt_index, - provider_bundle=provider_bundle, - ) - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_execution_planned", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - **plan.to_event_payload(), - }, - ) - team_summaries: list[str] = [] - team_execution_context = "" - team_result: TeamRunResult | None = None - if plan.is_team: - team_result, team_error = await self._run_team_for_task( - plan, - task=task, - parent_session_id=kwargs["session_id"], - provider_bundle_factory=team_provider_bundle_factory - or self._build_team_provider_bundle_factory(loaded, kwargs), - ) - if team_result is not None: - team_summaries = [self._team_summary_for_validation(team_result)] - team_packet = TaskEvidencePacket( - task_id=task.task_id, - attempt_index=attempt_index, - main_run=None, - team_runs=self._team_run_evidence(team_result), - team_node_results=list(team_result.node_results), - final_output="", - ) - team_execution_context = self._join_context( - self._team_execution_context(plan, team_result), - "Rendered team evidence:\n" + render_task_evidence(team_packet), - ) - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_team_run_completed" if team_result.success else "task_team_run_failed", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "plan_mode": plan.mode, - "strategy": plan.graph.strategy if plan.graph else None, - "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], - "team_run_ids": team_result.run_ids, - "team_success": team_result.success, - "node_results": self._team_node_results_for_event(plan, team_result), - "reason": plan.reason, - "error": None if team_result.success else "one or more team nodes failed", - }, - ) - else: - team_summaries = [f"Team execution failed: {team_error}"] - team_execution_context = self._failed_team_execution_context(plan, team_error or "unknown error") - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_team_run_failed", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "plan_mode": plan.mode, - "strategy": plan.graph.strategy if plan.graph else None, - "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], - "team_run_ids": [], - "team_success": False, - "reason": plan.reason, - "error": team_error, - }, - ) - - attempt_kwargs = dict(kwargs) - attempt_kwargs.update( - { - "task_id": task.task_id, - "task_mode": True, - "attempt_index": attempt_index, - "allow_candidate_generation": False, - } - ) - attempt_kwargs["execution_context"] = self._join_context( - base_execution_context, - output_language_instruction, - team_execution_context, - ) - if plan.is_team and team_execution_context: - attempt_kwargs["include_tools"] = False - attempt_kwargs["max_tool_iterations"] = 0 - attempt_kwargs["skill_selection_context"] = self._build_skill_selection_context( - task=task, - user_message=message, - attempt_index=attempt_index, - plan=plan, - team_summaries=team_summaries, ) - result = await runner(message, **attempt_kwargs) - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_synthesis_completed", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "main_run_id": result.run_id, - "plan_mode": plan.mode, - "strategy": plan.graph.strategy if plan.graph else None, - }, + def _build_task_attempt_orchestrator(self, loaded: Any) -> TaskAttemptOrchestrator: + return TaskAttemptOrchestrator( + loaded=loaded, + create_loop=self.create_loop, + make_provider_bundle_for_task=self._make_provider_bundle_for_task, ) - task = task_service.append_run( - task.task_id, - result.run_id, - skill_names=self._skill_names_for_run(loaded, result.run_id), - ) - evidence_packet = self._build_task_evidence_packet( - session_manager=session_manager, - task=task, - attempt_index=attempt_index, - result=result, - team_result=team_result, - ) - evidence_text = render_task_evidence(evidence_packet) - evidence_debug = { - "evidence_run_ids": [ - item.run_id for item in [evidence_packet.main_run, *evidence_packet.team_runs] if item is not None - ], - "evidence_session_ids": [ - item.session_id - for item in [evidence_packet.main_run, *evidence_packet.team_runs] - if item is not None - ], - "tool_result_count": sum( - len(item.tool_results) - for item in [evidence_packet.main_run, *evidence_packet.team_runs] - if item is not None - ), - "evidence_length": len(evidence_text), - } - session_manager.update_latest_assistant_event_payload( - result.session_id, - result.run_id, - { - "task_id": task.task_id, - "task_status": task.status, - "evidence_status": "recorded", - }, - ) - session_manager.append_message( - result.session_id, - run_id=result.run_id, - role="system", - event_type="task_evidence_recorded", - event_payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "evidence_debug": evidence_debug, - }, - content=None, - context_visible=False, - ) - result.task_id = task.task_id - result.task_status = task.status - result.validation_result = None - return result - - async def _run_team_for_task( - self, - plan: TaskExecutionPlan, - *, - task: TaskRecord, - parent_session_id: str, - provider_bundle_factory: Any, - ) -> tuple[TeamRunResult | None, str | None]: - if plan.graph is None: - return None, "team plan did not include an execution graph" - try: - from beaver.services.team_service import TeamService - - result = await TeamService(self.create_loop()).run_team( - plan.graph, - parent_task_id=task.task_id, - parent_session_id=parent_session_id, - parent_run_id=None, - provider_bundle_factory=provider_bundle_factory, - allow_candidate_generation=False, - ) - return result, None - except Exception as exc: - return None, str(exc) @staticmethod def _require_loaded(loaded: Any, field_name: str) -> Any: @@ -992,32 +798,15 @@ class AgentService: } @staticmethod - def _output_language_instruction(prompt_locale: str | None) -> str: - locale = normalize_main_agent_prompt_locale(prompt_locale) - if locale == "en": - return ( - "Output language: English. Use English for user-facing task titles, summaries, plans, " - "and final answers unless the user explicitly requests another language." - ) - if locale == "zh-Hant": - return ( - "輸出語言:繁體中文。除非使用者明確要求其他語言,所有面向使用者的任務標題、摘要、" - "計劃與最終回答都使用繁體中文。" - ) - return ( - "输出语言:简体中文。除非用户明确要求其他语言,所有面向用户的任务标题、摘要、" - "计划与最终回答都使用简体中文。" - ) - - @staticmethod - def _skill_names_for_run(loaded: Any, run_id: str) -> list[str]: - store = getattr(loaded, "run_memory_store", None) - if store is None: - return [] - for record in store.list_runs(): - if record.run_id == run_id: - return [receipt.skill_name for receipt in record.activated_skills] - return [] + def _merge_latency_ms(current: Any, updates: dict[str, float]) -> dict[str, float]: + merged: dict[str, float] = {} + if isinstance(current, dict): + for key, value in current.items(): + if isinstance(value, (int, float)): + merged[str(key)] = float(value) + for key, value in updates.items(): + merged[key] = merged.get(key, 0.0) + float(value) + return merged @staticmethod def _acceptance_score_for_learning(acceptance_type: str) -> float: @@ -1027,237 +816,6 @@ class AgentService: return 0.5 return 0.0 - @staticmethod - def _build_skill_selection_context( - *, - task: TaskRecord, - user_message: str, - attempt_index: int, - plan: TaskExecutionPlan | None = None, - team_summaries: list[str] | None = None, - ) -> str: - phase = f"attempt_{attempt_index}" - if task.feedback and task.feedback[-1].get("acceptance_type") == "revise": - phase = f"revision_attempt_{attempt_index}" - elif plan is not None and plan.is_team: - phase = f"team_synthesis_attempt_{attempt_index}" - - sections = [ - f"Task goal:\n{task.goal or task.description}", - f"Task description:\n{task.description}", - f"Current user request:\n{user_message}", - f"Execution phase:\n{phase}", - f"Task status:\n{task.status}", - ] - if task.constraints: - sections.append("Known constraints:\n" + "\n".join(f"- {item}" for item in task.constraints)) - if task.skill_names: - sections.append( - "Previously activated skills (reuse bias, not pinned):\n" - + "\n".join(f"- {item}" for item in task.skill_names) - ) - else: - sections.append("Previously activated skills:\nNone") - if task.feedback: - history_lines = [] - for item in task.feedback[-5:]: - kind = item.get("acceptance_type") or item.get("feedback_type") - comment = item.get("comment") or "" - run_id = item.get("run_id") or "" - history_lines.append(f"- {kind} run={run_id}: {comment}".strip()) - sections.append("Task acceptance history:\n" + "\n".join(history_lines)) - if plan is not None: - plan_lines = [ - f"mode: {plan.mode}", - f"reason: {plan.reason}", - ] - if plan.final_synthesis_instruction: - plan_lines.append(f"final synthesis instruction: {plan.final_synthesis_instruction}") - if plan.graph is not None: - plan_lines.append(f"strategy: {plan.graph.strategy}") - plan_lines.append( - "nodes:\n" - + "\n".join( - f"- {node.node_id}: {node.task}" - for node in plan.graph.nodes - ) - ) - sections.append("Execution plan:\n" + "\n".join(plan_lines)) - if team_summaries: - sections.append("Team execution summaries:\n" + "\n\n".join(team_summaries)[:2400]) - sections.append( - "Skill selection instruction:\n" - "Prefer reusing previously activated skills when they still match the Task. " - "Select new skills only if the current request, revision, or execution plan needs a different capability. " - "If no published skill matches, return [] and let the run continue without skills." - ) - return "\n\n".join(section for section in sections if section.strip()) - - @staticmethod - def _run_excerpt(session_manager: Any, session_id: str, run_id: str) -> str: - lines = [] - for event in session_manager.get_run_event_records(session_id, run_id): - if event.context_visible and event.content: - lines.append(f"{event.role}: {event.content.strip()}") - return "\n".join(lines[:12])[:2400] - - @staticmethod - def _tool_summaries(session_manager: Any, session_id: str, run_id: str) -> list[str]: - summaries = [] - for event in session_manager.get_run_event_records(session_id, run_id): - if event.event_type != "tool_result_recorded": - continue - text = (event.content or "").strip() - if text: - summaries.append(f"{event.tool_name or 'tool'}: {text[:500]}") - return summaries[:12] - - @staticmethod - def _append_task_observation( - session_manager: Any, - session_id: str, - *, - event_type: str, - payload: dict[str, Any], - ) -> None: - session_manager.append_message( - session_id, - role="system", - event_type=event_type, - event_payload=payload, - content=payload.get("reason") or payload.get("error"), - context_visible=False, - ) - - @staticmethod - def _join_context(*parts: str | None) -> str: - return "\n\n".join(part.strip() for part in parts if part and part.strip()) - - @staticmethod - def _team_summary_for_validation(result: TeamRunResult) -> str: - lines = [ - f"success={result.success}", - f"task_id={result.task_id or ''}", - "summary:", - result.summary, - "nodes:", - ] - for node in result.node_results: - lines.append( - f"- {node.node_id}: success={node.success} finish_reason={node.finish_reason} " - f"error={node.error or ''} output={node.output_text[:500]}" - ) - return "\n".join(lines) - - @staticmethod - def _team_node_results_for_event(plan: TaskExecutionPlan, result: TeamRunResult) -> list[dict[str, Any]]: - nodes = {node.node_id: node for node in plan.graph.nodes} if plan.graph else {} - payloads: list[dict[str, Any]] = [] - for item in result.node_results: - payload = item.to_dict() - node = nodes.get(item.node_id) - if node is not None: - payload["selected_skill_names"] = list(node.inherited_pinned_skills) - payload["ephemeral_skill_names"] = [ - skill.name for skill in node.inherited_pinned_skill_contexts - ] - payload["skill_query"] = node.agent.metadata.get("skill_query") - payload["ephemeral_guidance_id"] = node.agent.metadata.get("ephemeral_guidance_id") - payload["ephemeral_guidance_name"] = node.agent.metadata.get("ephemeral_guidance_name") - payload["ephemeral_used"] = bool(node.inherited_pinned_skill_contexts) - payloads.append(payload) - return payloads - - @staticmethod - def _team_run_evidence(result: TeamRunResult | None) -> list[RunEvidence]: - if result is None: - return [] - return [node.evidence for node in result.node_results if node.evidence is not None] - - def _build_task_evidence_packet( - self, - *, - session_manager: Any, - task: TaskRecord, - attempt_index: int, - result: AgentRunResult, - team_result: TeamRunResult | None, - ) -> TaskEvidencePacket: - main_run = EvidenceBuilder(session_manager).build_run_evidence( - result.session_id, - result.run_id, - result.output_text, - result.finish_reason, - ) - return TaskEvidencePacket( - task_id=task.task_id, - attempt_index=attempt_index, - main_run=main_run, - team_runs=self._team_run_evidence(team_result), - team_node_results=list(team_result.node_results) if team_result is not None else [], - final_output=result.output_text, - ) - - @staticmethod - def _team_execution_context(plan: TaskExecutionPlan, result: TeamRunResult) -> str: - node_lines = [ - ( - f"- {node.node_id}: success={node.success}, finish_reason={node.finish_reason}, " - f"run_id={node.run_id or ''}, error={node.error or ''}\n{node.output_text}" - ) - for node in result.node_results - ] - return "\n\n".join( - item - for item in [ - "Task team execution result:", - f"Planner reason: {plan.reason}", - f"Strategy: {plan.graph.strategy if plan.graph else ''}", - f"Team success: {result.success}", - f"Team summary:\n{result.summary}", - "Node results:\n" + "\n\n".join(node_lines), - ( - "Final synthesis instruction:\n" + plan.final_synthesis_instruction - if plan.final_synthesis_instruction - else None - ), - ( - "Use successful team outputs as internal evidence. If one or more nodes failed, " - "do not blindly repeat failed tool calls. Produce a user-visible fallback answer " - "with available evidence and clearly state any missing or uncertain data." - ), - ] - if item - ) - - @staticmethod - def _failed_team_execution_context(plan: TaskExecutionPlan, error: str) -> str: - return "\n\n".join( - [ - "Task team execution failed before final synthesis.", - f"Planner reason: {plan.reason}", - f"Strategy: {plan.graph.strategy if plan.graph else ''}", - f"Error: {error}", - ( - "Proceed as the main agent. Do not blindly repeat failed tool calls; " - "produce a user-visible fallback answer with available evidence and clearly " - "state any missing or uncertain data." - ), - ] - ) - - def _build_team_provider_bundle_factory(self, loaded: Any, kwargs: dict[str, Any]) -> Any: - def factory(node: ExecutionNode) -> Any: - node_kwargs = dict(kwargs) - node_kwargs.pop("provider_bundle", None) - if node.agent.model: - node_kwargs["model"] = node.agent.model - if node.agent.provider_name: - node_kwargs["provider_name"] = node.agent.provider_name - return self._make_provider_bundle_for_task(loaded, node_kwargs) - - return factory - def _make_provider_bundle_for_task(self, loaded: Any, kwargs: dict[str, Any]) -> Any: config = loaded.config configured_provider = config.resolve_provider_target( diff --git a/app-instance/backend/beaver/services/user_files.py b/app-instance/backend/beaver/services/user_files.py index 9052fcc..a46b1ab 100644 --- a/app-instance/backend/beaver/services/user_files.py +++ b/app-instance/backend/beaver/services/user_files.py @@ -40,6 +40,10 @@ class UserFileSizeError(UserFileError): """Raised when a user file upload exceeds configured limits.""" +class UserFileStorageError(UserFileError): + """Raised when the backing user-file storage cannot complete an operation.""" + + @dataclass(frozen=True, slots=True) class AgentUserFilePolicy: task_id: str | None = None @@ -387,26 +391,34 @@ class MinIOUserFileStorage: async def list_dir(self, path: str) -> list[UserFileEntry]: prefix = self._object_prefix(path) - objects = self.client.list_objects(self.config.bucket, prefix=prefix, recursive=False) + try: + objects = self.client.list_objects(self.config.bucket, prefix=prefix, recursive=False) + except Exception as exc: + raise _minio_storage_error("list directory", exc) from exc entries: list[UserFileEntry] = [] - for obj in objects: - object_name = str(obj.object_name or "") - user_path = self._user_path(object_name) - if not user_path or user_path == path or user_path.endswith("/.keep"): - continue - trimmed = user_path.rstrip("/") - name = PurePosixPath(trimmed).name - is_dir = bool(getattr(obj, "is_dir", False)) or object_name.endswith("/") - entries.append( - UserFileEntry( - name=name, - path=trimmed, - type="directory" if is_dir else "file", - size=None if is_dir else getattr(obj, "size", None), - content_type=None if is_dir else "application/octet-stream", - modified=obj.last_modified.isoformat() if getattr(obj, "last_modified", None) else None, + try: + for obj in objects: + object_name = str(obj.object_name or "") + user_path = self._user_path(object_name) + if not user_path or user_path == path or user_path.endswith("/.keep"): + continue + trimmed = user_path.rstrip("/") + name = PurePosixPath(trimmed).name + is_dir = bool(getattr(obj, "is_dir", False)) or object_name.endswith("/") + entries.append( + UserFileEntry( + name=name, + path=trimmed, + type="directory" if is_dir else "file", + size=None if is_dir else getattr(obj, "size", None), + content_type=None if is_dir else "application/octet-stream", + modified=obj.last_modified.isoformat() if getattr(obj, "last_modified", None) else None, + ) ) - ) + except UserFileError: + raise + except Exception as exc: + raise _minio_storage_error("list directory", exc) from exc return sorted(entries, key=lambda item: (item.type != "directory", item.name.lower())) async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent: @@ -421,7 +433,9 @@ class MinIOUserFileStorage: response.close() response.release_conn() except Exception as exc: - raise UserFileNotFoundError("File not found") from exc + if _minio_error_code(exc) in {"NoSuchKey", "NoSuchObject"}: + raise UserFileNotFoundError("File not found") from exc + raise _minio_storage_error("read file", exc) from exc return UserFileContent( name=PurePosixPath(path).name, path=path, @@ -433,13 +447,16 @@ class MinIOUserFileStorage: async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry: object_name = self._object_name(path) - result = self.client.put_object( - self.config.bucket, - object_name, - BytesIO(content), - length=len(content), - content_type=content_type, - ) + try: + self.client.put_object( + self.config.bucket, + object_name, + BytesIO(content), + length=len(content), + content_type=content_type, + ) + except Exception as exc: + raise _minio_storage_error("write file", exc) from exc return UserFileEntry( name=PurePosixPath(path).name, path=path, @@ -475,6 +492,8 @@ class MinIOUserFileStorage: except Exception: pass raise + except Exception as exc: + raise _minio_storage_error("write file", exc) from exc return UserFileEntry( name=PurePosixPath(path).name, path=path, @@ -490,23 +509,30 @@ class MinIOUserFileStorage: try: self.client.remove_object(self.config.bucket, object_name) removed = True - except Exception: - pass + except Exception as exc: + if _minio_error_code(exc) != "NoSuchKey": + raise _minio_storage_error("delete path", exc) from exc prefix = f"{object_name.rstrip('/')}/" - for obj in self.client.list_objects(self.config.bucket, prefix=prefix, recursive=True): - self.client.remove_object(self.config.bucket, str(obj.object_name)) - removed = True + try: + for obj in self.client.list_objects(self.config.bucket, prefix=prefix, recursive=True): + self.client.remove_object(self.config.bucket, str(obj.object_name)) + removed = True + except Exception as exc: + raise _minio_storage_error("delete path", exc) from exc return removed async def mkdir(self, path: str) -> UserFileEntry: object_name = f"{self._object_name(path).rstrip('/')}/.keep" - self.client.put_object( - self.config.bucket, - object_name, - BytesIO(b""), - length=0, - content_type="application/x-directory", - ) + try: + self.client.put_object( + self.config.bucket, + object_name, + BytesIO(b""), + length=0, + content_type="application/x-directory", + ) + except Exception as exc: + raise _minio_storage_error("create directory", exc) from exc return UserFileEntry( name=PurePosixPath(path).name, path=path, @@ -600,6 +626,18 @@ def _safe_scope(value: str | None) -> str: return cleaned or "interactive" +def _minio_error_code(exc: Exception) -> str: + return str(getattr(exc, "code", "") or "") + + +def _minio_storage_error(operation: str, exc: Exception) -> UserFileStorageError: + code = _minio_error_code(exc) + message = f"User file storage {operation} failed" + if code: + message = f"{message}: {code}" + return UserFileStorageError(message) + + class _LimitedReadStream: def __init__(self, stream: object, *, max_bytes: int | None = None) -> None: self.stream = stream diff --git a/app-instance/backend/beaver/skills/assembler/task_assembler.py b/app-instance/backend/beaver/skills/assembler/task_assembler.py index e95ae23..ad0a0c0 100644 --- a/app-instance/backend/beaver/skills/assembler/task_assembler.py +++ b/app-instance/backend/beaver/skills/assembler/task_assembler.py @@ -83,6 +83,12 @@ class SkillAssembler: return SkillAssemblyResult() llm_interactions: list[dict[str, Any]] = [] + if len(candidates) == 1: + return SkillAssemblyResult( + activated_skills=self._activate_skill_contexts([candidates[0]["name"]]), + llm_interactions=llm_interactions, + ) + if len(candidates) <= self.max_detailed_candidates: shortlisted_names = [item["name"] for item in candidates] else: @@ -115,6 +121,10 @@ class SkillAssembler: if not selected_names: return SkillAssemblyResult(llm_interactions=llm_interactions) + activated_skills = self._activate_skill_contexts(selected_names) + return SkillAssemblyResult(activated_skills=activated_skills, llm_interactions=llm_interactions) + + def _activate_skill_contexts(self, selected_names: list[str]) -> list[SkillContext]: activated_skills: list[SkillContext] = [] for name in selected_names: record = self.loader.get_skill_record(name) @@ -130,10 +140,11 @@ class SkillAssembler: content_hash=record.content_hash or "" if record is not None else "", activation_reason="llm_selected", tool_hints=list(record.tool_hints) if record is not None else [], + team_template=getattr(record, "team_template", None) if record is not None else None, + team_template_warnings=list(getattr(record, "team_template_warnings", [])) if record is not None else [], ) ) - - return SkillAssemblyResult(activated_skills=activated_skills, llm_interactions=llm_interactions) + return activated_skills async def _select_skill_names( self, diff --git a/app-instance/backend/beaver/skills/catalog/loader.py b/app-instance/backend/beaver/skills/catalog/loader.py index d2d67ce..901c332 100644 --- a/app-instance/backend/beaver/skills/catalog/loader.py +++ b/app-instance/backend/beaver/skills/catalog/loader.py @@ -28,6 +28,7 @@ from .utils import ( check_requirements, escape_xml, extract_required_tool_names, + extract_skill_team_template, get_missing_requirements, parse_frontmatter, parse_skill_metadata_blob, @@ -49,6 +50,8 @@ class SkillRecord: tool_hints: list[str] = field(default_factory=list) frontmatter: dict[str, Any] = field(default_factory=dict) description: str = "" + team_template: dict[str, Any] | None = None + team_template_warnings: list[str] = field(default_factory=list) class SkillsLoader: @@ -113,6 +116,7 @@ class SkillsLoader: continue normalized_frontmatter = dict(frontmatter) meta_blob = parse_skill_metadata_blob(frontmatter.get("metadata", "")) + template_result = extract_skill_team_template(body) record = SkillRecord( name=name, path=skill_file, @@ -127,6 +131,8 @@ class SkillsLoader: ), frontmatter=normalized_frontmatter, description=str(frontmatter.get("description") or summarize_body(body) or name), + team_template=template_result.template, + team_template_warnings=template_result.warnings, ) if filter_unavailable and not self._record_available(record): continue @@ -146,6 +152,7 @@ class SkillsLoader: else: path = self.workspace_skills / name / "versions" / loaded.version.version / "SKILL.md" _frontmatter, body = parse_frontmatter(loaded.content) + template_result = extract_skill_team_template(body) record = SkillRecord( name=name, path=path, @@ -160,6 +167,8 @@ class SkillsLoader: ), frontmatter=dict(loaded.version.frontmatter), description=str(loaded.version.frontmatter.get("description") or loaded.version.summary or name), + team_template=template_result.template, + team_template_warnings=template_result.warnings, ) if filter_unavailable and not self._record_available(record): continue diff --git a/app-instance/backend/beaver/skills/catalog/utils.py b/app-instance/backend/beaver/skills/catalog/utils.py index c2f82ce..97d4cf3 100644 --- a/app-instance/backend/beaver/skills/catalog/utils.py +++ b/app-instance/backend/beaver/skills/catalog/utils.py @@ -17,6 +17,7 @@ import json import os import re import shutil +from dataclasses import dataclass, field from typing import Any @@ -84,6 +85,27 @@ def strip_frontmatter(content: str) -> str: return body +@dataclass(slots=True) +class SkillTeamTemplateParseResult: + template: dict[str, Any] | None = None + warnings: list[str] = field(default_factory=list) + + +def extract_skill_team_template(body: str) -> SkillTeamTemplateParseResult: + matches = re.findall(r"```beaver-team-template\s*\n(.*?)\n```", body, re.DOTALL) + if not matches: + return SkillTeamTemplateParseResult() + if len(matches) != 1: + return SkillTeamTemplateParseResult(warnings=["skill defines multiple team templates"]) + try: + template = json.loads(matches[0]) + except json.JSONDecodeError: + return SkillTeamTemplateParseResult(warnings=["team template JSON is invalid"]) + if not isinstance(template, dict) or not isinstance(template.get("nodes", []), list): + return SkillTeamTemplateParseResult(warnings=["team template must be an object with a nodes list"]) + return SkillTeamTemplateParseResult(template=template) + + def extract_required_tool_names(body: str) -> list[str]: """从 canonical skill 正文的 `## Required Tools` 段落提取工具名。 diff --git a/app-instance/backend/beaver/skills/learning/eval.py b/app-instance/backend/beaver/skills/learning/eval.py index 404642e..299b689 100644 --- a/app-instance/backend/beaver/skills/learning/eval.py +++ b/app-instance/backend/beaver/skills/learning/eval.py @@ -284,6 +284,9 @@ def _build_replay_case_reports( "side_effects": [*baseline.get("side_effects", []), *candidate_arm.get("side_effects", [])], "validator_notes": list(surrogate.get("notes") or []), } + historical_accepted_score = _historical_accepted_score(case) + if historical_accepted_score is not None: + case_report["historical_accepted_score"] = historical_accepted_score return case_report, { "run_id": case["run_id"], "session_id": case.get("session_id") or "", @@ -293,6 +296,7 @@ def _build_replay_case_reports( "baseline_score": baseline_score, "candidate_score": candidate_score, "delta": round(candidate_score - baseline_score, 4), + **({"historical_accepted_score": historical_accepted_score} if historical_accepted_score is not None else {}), } @@ -658,8 +662,11 @@ def _ability_score(*, case: dict[str, Any], arm: dict[str, Any], arm_name: str) if validator is not None: return _ability_from_validator(validator, arm) if not case.get("synthetic"): - score = _bounded_score(case.get("accepted_score"), default=0.75) if arm_name == "baseline" else _ability_from_output(arm)["final_score"] - return _ability_breakdown(score=score, source="user_feedback" if arm_name == "baseline" else "llm_judge") + result = _ability_from_output(arm, source="output_heuristic") + historical_accepted_score = _historical_accepted_score(case) + if historical_accepted_score is not None: + result["historical_accepted_score"] = historical_accepted_score + return result return _ability_breakdown(score=0.0, source="unscored", notes=["Synthetic cases require a validator."]) @@ -697,6 +704,12 @@ def _ability_from_output(arm: dict[str, Any], *, source: str = "llm_judge", note return _ability_breakdown(score=score, source=source, notes=notes) +def _historical_accepted_score(case: dict[str, Any]) -> float | None: + if case.get("synthetic") or isinstance(case.get("validator"), dict) or "accepted_score" not in case: + return None + return _bounded_score(case.get("accepted_score"), default=0.75) + + def _ability_breakdown(*, score: float, source: str, notes: list[str] | None = None) -> dict[str, Any]: bounded = _bounded_score(score, default=0.0) return { diff --git a/app-instance/backend/beaver/tasks/attempt_orchestrator.py b/app-instance/backend/beaver/tasks/attempt_orchestrator.py new file mode 100644 index 0000000..4fd58d3 --- /dev/null +++ b/app-instance/backend/beaver/tasks/attempt_orchestrator.py @@ -0,0 +1,695 @@ +"""Task attempt orchestration for Beaver Task mode.""" + +from __future__ import annotations + +from time import perf_counter +from typing import Any, Callable + +from beaver.coordinator.models import ExecutionNode, TeamRunResult +from beaver.engine import AgentRunResult +from beaver.engine.context import SkillContext +from beaver.prompts.main_agent import normalize_main_agent_prompt_locale + +from .evidence import EvidenceBuilder, RunEvidence, TaskEvidencePacket, render_task_evidence +from .models import TaskRecord +from .planner import TaskExecutionPlan + + +class TaskAttemptOrchestrator: + """Own the execution order inside one Task attempt.""" + + def __init__( + self, + *, + loaded: Any, + create_loop: Callable[[], Any], + make_provider_bundle_for_task: Callable[[Any, dict[str, Any]], Any], + ) -> None: + self.loaded = loaded + self.create_loop = create_loop + self.make_provider_bundle_for_task = make_provider_bundle_for_task + + async def run( + self, + *, + message: str, + runner: Any, + kwargs: dict[str, Any], + task: TaskRecord, + ) -> AgentRunResult: + task_service = self._require_loaded(self.loaded, "task_service") + task_execution_planner = self._require_loaded(self.loaded, "task_execution_planner") + session_manager = self._require_loaded(self.loaded, "session_manager") + + base_execution_context = kwargs.get("execution_context") + prompt_locale = kwargs.get("prompt_locale") or task.metadata.get("prompt_locale") + output_language_instruction = self._output_language_instruction(prompt_locale) + provider_bundle = kwargs.get("provider_bundle") or self.make_provider_bundle_for_task(self.loaded, kwargs) + kwargs = dict(kwargs) + team_provider_bundle_factory = kwargs.pop("team_provider_bundle_factory", None) + kwargs["provider_bundle"] = provider_bundle + + attempt_index = int(task.metadata.get("latest_attempt_index") or 0) + 1 + task_service.start_run(task.task_id, user_message=message, attempt_index=attempt_index) + pre_skill_context = self._build_skill_selection_context( + task=task, + user_message=message, + attempt_index=attempt_index, + ) + preselected_skills, pre_skill_latency_ms = await self._assemble_task_attempt_skills( + task_description=pre_skill_context, + provider_bundle=provider_bundle, + thinking_enabled=kwargs.get("thinking_enabled"), + include_skill_assembly=bool(kwargs.get("include_skill_assembly", True)), + pinned_skill_contexts=kwargs.get("pinned_skill_contexts"), + ) + if pre_skill_latency_ms: + kwargs["pre_run_latency_ms"] = self._merge_latency_ms( + kwargs.get("pre_run_latency_ms"), + {"pre_skill_assembly_ms": pre_skill_latency_ms}, + ) + plan = await task_execution_planner.plan( + task=task, + user_message=message, + attempt_index=attempt_index, + provider_bundle=provider_bundle, + skill_summaries=self._skill_summaries_for_planner(preselected_skills), + tool_hints=self._tool_hints_for_skills(preselected_skills), + activated_skills=preselected_skills, + ) + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_execution_planned", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + **plan.to_event_payload(), + }, + ) + team_summaries: list[str] = [] + team_execution_context = "" + team_result: TeamRunResult | None = None + if plan.is_team: + team_result, team_error = await self._run_team_for_task( + plan, + task=task, + parent_session_id=kwargs["session_id"], + provider_bundle_factory=team_provider_bundle_factory + or self._build_team_provider_bundle_factory(kwargs), + ) + if team_result is not None: + team_summaries = [self._team_summary_for_validation(team_result)] + team_packet = TaskEvidencePacket( + task_id=task.task_id, + attempt_index=attempt_index, + main_run=None, + team_runs=self._team_run_evidence(team_result), + team_node_results=list(team_result.node_results), + final_output="", + ) + team_execution_context = self._join_context( + self._team_execution_context(plan, team_result), + "Rendered team evidence:\n" + render_task_evidence(team_packet), + ) + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_team_run_completed" if team_result.success else "task_team_run_failed", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "plan_mode": plan.mode, + "strategy": plan.graph.strategy if plan.graph else None, + "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], + "team_run_ids": team_result.run_ids, + "team_success": team_result.success, + "node_results": self._team_node_results_for_event(plan, team_result), + "reason": plan.reason, + "error": None if team_result.success else "one or more team nodes failed", + }, + ) + else: + team_summaries = [f"Team execution failed: {team_error}"] + team_execution_context = self._failed_team_execution_context(plan, team_error or "unknown error") + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_team_run_failed", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "plan_mode": plan.mode, + "strategy": plan.graph.strategy if plan.graph else None, + "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], + "team_run_ids": [], + "team_success": False, + "reason": plan.reason, + "error": team_error, + }, + ) + + outcome_context, incomplete_prefix, outcome_metadata = self._team_synthesis_outcome( + plan, + team_result, + prompt_locale=prompt_locale, + ) + if plan.is_team: + team_execution_context = self._join_context(outcome_context, team_execution_context) + + attempt_kwargs = dict(kwargs) + attempt_kwargs.update( + { + "task_id": task.task_id, + "task_mode": True, + "attempt_index": attempt_index, + "allow_candidate_generation": False, + "pinned_skill_contexts": preselected_skills, + "include_skill_assembly": False, + } + ) + attempt_kwargs["execution_context"] = self._join_context( + base_execution_context, + output_language_instruction, + team_execution_context, + ) + if plan.is_team and team_execution_context: + attempt_kwargs["include_tools"] = False + attempt_kwargs["max_tool_iterations"] = 0 + attempt_kwargs["skill_selection_context"] = self._build_skill_selection_context( + task=task, + user_message=message, + attempt_index=attempt_index, + plan=plan, + team_summaries=team_summaries, + ) + + result = await runner(message, **attempt_kwargs) + if outcome_metadata["task_outcome"] == "incomplete": + result.output_text = self._apply_incomplete_prefix(result.output_text, incomplete_prefix) + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_synthesis_completed", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "main_run_id": result.run_id, + "plan_mode": plan.mode, + "strategy": plan.graph.strategy if plan.graph else None, + **outcome_metadata, + }, + ) + task = task_service.append_run( + task.task_id, + result.run_id, + skill_names=self._skill_names_for_run(result.run_id), + ) + evidence_packet = self._build_task_evidence_packet( + session_manager=session_manager, + task=task, + attempt_index=attempt_index, + result=result, + team_result=team_result, + ) + evidence_text = render_task_evidence(evidence_packet) + evidence_debug = { + "evidence_run_ids": [ + item.run_id for item in [evidence_packet.main_run, *evidence_packet.team_runs] if item is not None + ], + "evidence_session_ids": [ + item.session_id + for item in [evidence_packet.main_run, *evidence_packet.team_runs] + if item is not None + ], + "tool_result_count": sum( + len(item.tool_results) + for item in [evidence_packet.main_run, *evidence_packet.team_runs] + if item is not None + ), + "evidence_length": len(evidence_text), + } + session_manager.update_latest_assistant_event_payload( + result.session_id, + result.run_id, + { + "task_id": task.task_id, + "task_status": task.status, + "evidence_status": "recorded", + }, + ) + session_manager.append_message( + result.session_id, + run_id=result.run_id, + role="system", + event_type="task_evidence_recorded", + event_payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "evidence_debug": evidence_debug, + }, + content=None, + context_visible=False, + ) + result.task_id = task.task_id + result.task_status = task.status + result.validation_result = None + return result + + async def _run_team_for_task( + self, + plan: TaskExecutionPlan, + *, + task: TaskRecord, + parent_session_id: str, + provider_bundle_factory: Any, + ) -> tuple[TeamRunResult | None, str | None]: + if plan.graph is None: + return None, "team plan did not include an execution graph" + try: + from beaver.services.team_service import TeamService + + result = await TeamService(self.create_loop()).run_team( + plan.graph, + parent_task_id=task.task_id, + parent_session_id=parent_session_id, + parent_run_id=None, + provider_bundle_factory=provider_bundle_factory, + allow_candidate_generation=False, + ) + return result, None + except Exception as exc: + return None, str(exc) + + async def _assemble_task_attempt_skills( + self, + *, + task_description: str, + provider_bundle: Any, + thinking_enabled: bool | None, + include_skill_assembly: bool, + pinned_skill_contexts: Any, + ) -> tuple[list[SkillContext], float]: + started = perf_counter() + selected = self._coerce_skill_contexts(pinned_skill_contexts) + if include_skill_assembly: + skill_assembler = self._require_loaded(self.loaded, "skill_assembler") + runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime + assembled = await skill_assembler.assemble( + task_description=task_description, + provider=provider_bundle.auxiliary_provider or provider_bundle.main_provider, + model=getattr(runtime, "model", None), + embedding_runtime=getattr(provider_bundle, "embedding_runtime", None), + thinking_enabled=thinking_enabled, + ) + selected = self._merge_skill_contexts( + selected, + list(getattr(assembled, "activated_skills", []) or []), + ) + return selected, (perf_counter() - started) * 1000 + + @staticmethod + def _coerce_skill_contexts(value: Any) -> list[SkillContext]: + if not isinstance(value, list): + return [] + return [item for item in value if isinstance(item, SkillContext)] + + @staticmethod + def _merge_skill_contexts(left: list[SkillContext], right: list[SkillContext]) -> list[SkillContext]: + merged: list[SkillContext] = [] + seen: set[str] = set() + for skill in [*left, *right]: + if skill.name in seen: + continue + seen.add(skill.name) + merged.append(skill) + return merged + + @staticmethod + def _skill_summaries_for_planner(skills: list[SkillContext]) -> list[str]: + summaries: list[str] = [] + for skill in skills: + content = " ".join((skill.content or "").split()) + if len(content) > 240: + content = content[:237].rstrip() + "..." + summaries.append(f"{skill.name}: {content}" if content else skill.name) + return summaries + + @staticmethod + def _tool_hints_for_skills(skills: list[SkillContext]) -> list[str]: + result: list[str] = [] + for skill in skills: + for hint in skill.tool_hints: + if hint and hint not in result: + result.append(hint) + return result + + @staticmethod + def _require_loaded(loaded: Any, field_name: str) -> Any: + value = getattr(loaded, field_name) + if value is None: + raise RuntimeError(f"Engine loader did not provide required dependency {field_name!r}") + return value + + @staticmethod + def _merge_latency_ms(current: Any, updates: dict[str, float]) -> dict[str, float]: + merged: dict[str, float] = {} + if isinstance(current, dict): + for key, value in current.items(): + if isinstance(value, (int, float)): + merged[str(key)] = float(value) + for key, value in updates.items(): + merged[key] = merged.get(key, 0.0) + float(value) + return merged + + @staticmethod + def _output_language_instruction(prompt_locale: str | None) -> str: + locale = normalize_main_agent_prompt_locale(prompt_locale) + if locale == "en": + return ( + "Output language: English. Use English for user-facing task titles, summaries, plans, " + "and final answers unless the user explicitly requests another language." + ) + if locale == "zh-Hant": + return ( + "輸出語言:繁體中文。除非使用者明確要求其他語言,所有面向使用者的任務標題、摘要、" + "計劃與最終回答都使用繁體中文。" + ) + return ( + "输出语言:简体中文。除非用户明确要求其他语言,所有面向用户的任务标题、摘要、" + "计划与最终回答都使用简体中文。" + ) + + def _skill_names_for_run(self, run_id: str) -> list[str]: + store = getattr(self.loaded, "run_memory_store", None) + if store is None: + return [] + for record in store.list_runs(): + if record.run_id == run_id: + return [receipt.skill_name for receipt in record.activated_skills] + return [] + + @staticmethod + def _build_skill_selection_context( + *, + task: TaskRecord, + user_message: str, + attempt_index: int, + plan: TaskExecutionPlan | None = None, + team_summaries: list[str] | None = None, + ) -> str: + phase = f"attempt_{attempt_index}" + if task.feedback and task.feedback[-1].get("acceptance_type") == "revise": + phase = f"revision_attempt_{attempt_index}" + elif plan is not None and plan.is_team: + phase = f"team_synthesis_attempt_{attempt_index}" + + sections = [ + f"Task goal:\n{task.goal or task.description}", + f"Task description:\n{task.description}", + f"Current user request:\n{user_message}", + f"Execution phase:\n{phase}", + f"Task status:\n{task.status}", + ] + if task.constraints: + sections.append("Known constraints:\n" + "\n".join(f"- {item}" for item in task.constraints)) + if task.skill_names: + sections.append( + "Previously activated skills (reuse bias, not pinned):\n" + + "\n".join(f"- {item}" for item in task.skill_names) + ) + else: + sections.append("Previously activated skills:\nNone") + if task.feedback: + history_lines = [] + for item in task.feedback[-5:]: + kind = item.get("acceptance_type") or item.get("feedback_type") + comment = item.get("comment") or "" + run_id = item.get("run_id") or "" + history_lines.append(f"- {kind} run={run_id}: {comment}".strip()) + sections.append("Task acceptance history:\n" + "\n".join(history_lines)) + if plan is not None: + plan_lines = [ + f"mode: {plan.mode}", + f"reason: {plan.reason}", + ] + if plan.final_synthesis_instruction: + plan_lines.append(f"final synthesis instruction: {plan.final_synthesis_instruction}") + if plan.graph is not None: + plan_lines.append(f"strategy: {plan.graph.strategy}") + plan_lines.append( + "nodes:\n" + + "\n".join( + f"- {node.node_id}: {node.task}" + for node in plan.graph.nodes + ) + ) + sections.append("Execution plan:\n" + "\n".join(plan_lines)) + if team_summaries: + sections.append("Team execution summaries:\n" + "\n\n".join(team_summaries)[:2400]) + sections.append( + "Skill selection instruction:\n" + "Prefer reusing previously activated skills when they still match the Task. " + "Select new skills only if the current request, revision, or execution plan needs a different capability. " + "If no published skill matches, return [] and let the run continue without skills." + ) + return "\n\n".join(section for section in sections if section.strip()) + + @staticmethod + def _append_task_observation( + session_manager: Any, + session_id: str, + *, + event_type: str, + payload: dict[str, Any], + ) -> None: + session_manager.append_message( + session_id, + role="system", + event_type=event_type, + event_payload=payload, + content=payload.get("reason") or payload.get("error"), + context_visible=False, + ) + + @staticmethod + def _join_context(*parts: str | None) -> str: + return "\n\n".join(part.strip() for part in parts if part and part.strip()) + + @staticmethod + def _team_summary_for_validation(result: TeamRunResult) -> str: + lines = [ + f"success={result.success}", + f"task_id={result.task_id or ''}", + "summary:", + result.summary, + "nodes:", + ] + for node in result.node_results: + lines.append( + f"- {node.node_id}: success={node.success} finish_reason={node.finish_reason} " + f"error={node.error or ''} output={node.output_text[:500]}" + ) + return "\n".join(lines) + + @staticmethod + def _team_node_results_for_event(plan: TaskExecutionPlan, result: TeamRunResult) -> list[dict[str, Any]]: + nodes = {node.node_id: node for node in plan.graph.nodes} if plan.graph else {} + payloads: list[dict[str, Any]] = [] + for item in result.node_results: + payload = item.to_dict() + node = nodes.get(item.node_id) + if node is not None: + payload["selected_skill_names"] = list(node.inherited_pinned_skills) + payload["ephemeral_skill_names"] = [ + skill.name for skill in node.inherited_pinned_skill_contexts + ] + payload["skill_query"] = node.agent.metadata.get("skill_query") + payload["ephemeral_guidance_id"] = node.agent.metadata.get("ephemeral_guidance_id") + payload["ephemeral_guidance_name"] = node.agent.metadata.get("ephemeral_guidance_name") + payload["ephemeral_used"] = bool(node.inherited_pinned_skill_contexts) + payloads.append(payload) + return payloads + + @staticmethod + def _team_run_evidence(result: TeamRunResult | None) -> list[RunEvidence]: + if result is None: + return [] + return [node.evidence for node in result.node_results if node.evidence is not None] + + @staticmethod + def _team_synthesis_outcome( + plan: TaskExecutionPlan, + result: TeamRunResult | None, + *, + prompt_locale: str | None = None, + ) -> tuple[str, str, dict[str, Any]]: + if not plan.is_team or plan.graph is None: + metadata = { + "task_outcome": "single", + "incomplete_node_ids": [], + "node_statuses": {}, + "evidence_gaps": {}, + } + return "Task outcome: single", "", metadata + + result_by_node = { + item.node_id: item + for item in (result.node_results if result is not None else []) + } + node_statuses: dict[str, str] = {} + evidence_gaps: dict[str, list[str]] = {} + incomplete_node_ids: list[str] = [] + detail_lines: list[str] = [] + successful_lines: list[str] = [] + for node in plan.graph.nodes: + node_result = result_by_node.get(node.node_id) + status = node_result.completion_status if node_result is not None else "not_run" + node_statuses[node.node_id] = status + gaps = list(node_result.evidence_gaps) if node_result is not None else [] + if gaps: + evidence_gaps[node.node_id] = gaps + if node.required_for_completion and status != "succeeded": + incomplete_node_ids.append(node.node_id) + detail_lines.append( + f"- {node.node_id}: status={status}, " + f"finish_reason={node_result.finish_reason if node_result is not None else 'not_run'}, " + f"error={(node_result.error or '') if node_result is not None else 'node did not run'}, " + f"evidence_gaps={gaps}" + ) + elif node_result is not None and status == "succeeded": + successful_lines.append(f"- {node.node_id}: {node_result.output_text[:1000]}") + + task_outcome = "incomplete" if incomplete_node_ids else "complete" + metadata = { + "task_outcome": task_outcome, + "incomplete_node_ids": incomplete_node_ids, + "node_statuses": node_statuses, + "evidence_gaps": evidence_gaps, + } + context_parts = [ + f"Task outcome: {task_outcome}", + "Incomplete node IDs: " + (", ".join(incomplete_node_ids) or "none"), + ] + if detail_lines: + context_parts.append("Incomplete required node details:\n" + "\n".join(detail_lines)) + if successful_lines: + context_parts.append("Available successful node evidence:\n" + "\n".join(successful_lines)) + if task_outcome == "incomplete": + context_parts.append( + "Synthesis requirement: produce a partial report from available evidence and explicitly state " + "that the task is incomplete, partially completed, or missing required evidence." + ) + prefix = TaskAttemptOrchestrator._incomplete_prefix(prompt_locale) if incomplete_node_ids else "" + return "\n\n".join(context_parts), prefix, metadata + + @staticmethod + def _incomplete_prefix(prompt_locale: str | None) -> str: + locale = normalize_main_agent_prompt_locale(prompt_locale) + if locale == "en": + return "Task incomplete: some required steps failed or lack required evidence. The report below uses available results only.\n\n" + if locale == "zh-Hant": + return "任務未完成:部分必要步驟失敗或缺少必要證據。以下內容僅基於現有結果。\n\n" + return "任务未完成:部分必要步骤失败或缺少必要证据。以下内容仅基于现有结果。\n\n" + + @staticmethod + def _apply_incomplete_prefix(output_text: str, prefix: str) -> str: + normalized = output_text.lower() + notices = ( + "任务未完成", + "任務未完成", + "部分完成", + "缺少证据", + "缺少證據", + "task incomplete", + "incomplete task", + "partially complete", + "missing evidence", + ) + if any(notice in normalized for notice in notices): + return output_text + return prefix + output_text.lstrip() + + def _build_task_evidence_packet( + self, + *, + session_manager: Any, + task: TaskRecord, + attempt_index: int, + result: AgentRunResult, + team_result: TeamRunResult | None, + ) -> TaskEvidencePacket: + main_run = EvidenceBuilder(session_manager).build_run_evidence( + result.session_id, + result.run_id, + result.output_text, + result.finish_reason, + ) + return TaskEvidencePacket( + task_id=task.task_id, + attempt_index=attempt_index, + main_run=main_run, + team_runs=self._team_run_evidence(team_result), + team_node_results=list(team_result.node_results) if team_result is not None else [], + final_output=result.output_text, + ) + + @staticmethod + def _team_execution_context(plan: TaskExecutionPlan, result: TeamRunResult) -> str: + node_lines = [ + ( + f"- {node.node_id}: success={node.success}, finish_reason={node.finish_reason}, " + f"run_id={node.run_id or ''}, error={node.error or ''}\n{node.output_text}" + ) + for node in result.node_results + ] + return "\n\n".join( + item + for item in [ + "Task team execution result:", + f"Planner reason: {plan.reason}", + f"Strategy: {plan.graph.strategy if plan.graph else ''}", + f"Team success: {result.success}", + f"Team summary:\n{result.summary}", + "Node results:\n" + "\n\n".join(node_lines), + ( + "Final synthesis instruction:\n" + plan.final_synthesis_instruction + if plan.final_synthesis_instruction + else None + ), + ( + "Use successful team outputs as internal evidence. If one or more nodes failed, " + "do not blindly repeat failed tool calls. Produce a user-visible fallback answer " + "with available evidence and clearly state any missing or uncertain data." + ), + ] + if item + ) + + @staticmethod + def _failed_team_execution_context(plan: TaskExecutionPlan, error: str) -> str: + return "\n\n".join( + [ + "Task team execution failed before final synthesis.", + f"Planner reason: {plan.reason}", + f"Strategy: {plan.graph.strategy if plan.graph else ''}", + f"Error: {error}", + ( + "Proceed as the main agent. Do not blindly repeat failed tool calls; " + "produce a user-visible fallback answer with available evidence and clearly " + "state any missing or uncertain data." + ), + ] + ) + + def _build_team_provider_bundle_factory(self, kwargs: dict[str, Any]) -> Any: + def factory(node: ExecutionNode) -> Any: + node_kwargs = dict(kwargs) + node_kwargs.pop("provider_bundle", None) + if node.agent.model: + node_kwargs["model"] = node.agent.model + if node.agent.provider_name: + node_kwargs["provider_name"] = node.agent.provider_name + return self.make_provider_bundle_for_task(self.loaded, node_kwargs) + + return factory diff --git a/app-instance/backend/beaver/tasks/evidence.py b/app-instance/backend/beaver/tasks/evidence.py index 02ccb20..b328434 100644 --- a/app-instance/backend/beaver/tasks/evidence.py +++ b/app-instance/backend/beaver/tasks/evidence.py @@ -2,6 +2,8 @@ from __future__ import annotations +import json +import re from dataclasses import dataclass, field from typing import Any @@ -126,6 +128,37 @@ class EvidenceBuilder: ) +def evaluate_node_evidence( + evidence: RunEvidence, + required_evidence: list[str], + output_text: str, +) -> list[str]: + """Evaluate v1 coarse-grained node evidence requirements.""" + + gaps: list[str] = [] + successful_tools = [ + item + for item in evidence.tool_results + if item.event_payload.get("success") is True + ] + for raw_requirement in required_evidence: + requirement = str(raw_requirement).strip() + if not requirement: + continue + if requirement == "tool_result": + if not successful_tools: + _append_unique(gaps, "missing required evidence: tool_result") + elif requirement == "url": + if not any(_tool_evidence_contains_url(item) for item in successful_tools): + _append_unique(gaps, "missing required evidence: url") + elif requirement == "output": + if not output_text.strip(): + _append_unique(gaps, "missing required evidence: output") + else: + _append_unique(gaps, f"unsupported evidence requirement: {requirement}") + return gaps + + def render_task_evidence(packet: TaskEvidencePacket) -> str: sections = [ f"Task evidence packet: task_id={packet.task_id} attempt={packet.attempt_index}", @@ -181,3 +214,20 @@ def _render_tool_evidence(item: ToolEvidence) -> str: def _optional_str(value: Any) -> str | None: return str(value) if value is not None else None + + +_URL_RE = re.compile(r"https?://[^\s<>'\"]+", re.IGNORECASE) + + +def _tool_evidence_contains_url(item: ToolEvidence) -> bool: + values = [ + item.url or "", + item.content, + json.dumps(item.event_payload, ensure_ascii=False, default=str), + ] + return any(_URL_RE.search(value) is not None for value in values) + + +def _append_unique(values: list[str], value: str) -> None: + if value not in values: + values.append(value) diff --git a/app-instance/backend/beaver/tasks/planner.py b/app-instance/backend/beaver/tasks/planner.py index ec23ae3..7d76d2d 100644 --- a/app-instance/backend/beaver/tasks/planner.py +++ b/app-instance/backend/beaver/tasks/planner.py @@ -4,11 +4,14 @@ from __future__ import annotations import asyncio import json +import os from dataclasses import dataclass, field from typing import Any, Literal from beaver.coordinator.models import AgentDescriptor, ExecutionGraph, ExecutionNode +from beaver.engine.context import SkillContext from beaver.engine.providers import ProviderBundle +from beaver.tools.registry import ToolRegistry from .models import TaskRecord from .skill_resolver import SkillResolutionReport, TaskSkillResolver @@ -17,6 +20,24 @@ from .skill_resolver import SkillResolutionReport, TaskSkillResolver TaskExecutionMode = Literal["single", "team"] +# Temporary name-based denylist until high-risk tool approval is implemented. +# Keep this policy centralized so planner behavior cannot drift by call site. +HIGH_RISK_PLANNER_TOOL_NAMES = frozenset( + { + "delete_file", + "execute_command", + "external_send", + "send_email", + "terminal", + "write_file", + } +) + + +def _agent_team_enabled() -> bool: + return os.getenv("BEAVER_AGENT_TEAM_ENABLED", "1").strip().lower() not in {"0", "false", "no", "off"} + + @dataclass(slots=True) class TaskExecutionPlan: mode: TaskExecutionMode @@ -25,14 +46,26 @@ class TaskExecutionPlan: final_synthesis_instruction: str = "" fallback_error: str | None = None skill_resolution_report: list[SkillResolutionReport] = field(default_factory=list) + planner_adaptation: dict[str, Any] = field(default_factory=dict) @property def is_team(self) -> bool: return self.mode == "team" and self.graph is not None @classmethod - def single(cls, reason: str, *, fallback_error: str | None = None) -> "TaskExecutionPlan": - return cls(mode="single", reason=reason, fallback_error=fallback_error) + def single( + cls, + reason: str, + *, + fallback_error: str | None = None, + planner_adaptation: dict[str, Any] | None = None, + ) -> "TaskExecutionPlan": + return cls( + mode="single", + reason=reason, + fallback_error=fallback_error, + planner_adaptation=dict(planner_adaptation or {}), + ) def to_event_payload(self) -> dict[str, Any]: strategy = self.graph.strategy if self.graph is not None else None @@ -57,6 +90,7 @@ class TaskExecutionPlan: if item.ephemeral_guidance_id ], "skill_resolution_report": [item.to_dict() for item in self.skill_resolution_report], + "planner_adaptation": dict(self.planner_adaptation), "fallback_error": self.fallback_error, } @@ -65,10 +99,34 @@ class TaskExecutionPlanner: """Plan whether a Task attempt should run through a team first.""" _MAX_NODES = 6 + _MAX_DEPTH = 4 _SUPPORTED_STRATEGIES = {"sequence", "parallel", "dag"} + _ALLOWED_NODE_FIELDS = { + "node_id", + "task", + "use_skill", + "skill_query", + "depends_on", + "input_contract", + "output_contract", + "requested_tools", + "required_evidence", + "evidence_contract", + "validation_rules", + "required_for_completion", + "block_downstream_on_partial", + "max_tool_iterations", + "constraints", + } - def __init__(self, *, task_skill_resolver: TaskSkillResolver | None = None) -> None: + def __init__( + self, + *, + task_skill_resolver: TaskSkillResolver | None = None, + tool_registry: ToolRegistry | None = None, + ) -> None: self.task_skill_resolver = task_skill_resolver + self.tool_registry = tool_registry async def plan( self, @@ -78,7 +136,15 @@ class TaskExecutionPlanner: attempt_index: int, provider_bundle: ProviderBundle | None = None, timeout_seconds: float = 30.0, + skill_summaries: list[str] | None = None, + tool_hints: list[str] | None = None, + activated_skills: list[SkillContext] | None = None, ) -> TaskExecutionPlan: + if not _agent_team_enabled(): + return TaskExecutionPlan.single("planner_disabled_by_environment") + if not self._needs_team_planning(task=task, user_message=user_message): + return TaskExecutionPlan.single("planner_skipped_simple_task") + provider = None model = None if provider_bundle is not None: @@ -87,6 +153,7 @@ class TaskExecutionPlanner: model = getattr(runtime, "model", None) if provider is None: return TaskExecutionPlan.single("planner_provider_unavailable") + selected_template, base_adaptation = self._select_team_template(activated_skills or []) try: response = await asyncio.wait_for( provider.chat( @@ -104,6 +171,10 @@ class TaskExecutionPlanner: task=task, user_message=user_message, attempt_index=attempt_index, + skill_summaries=skill_summaries or [], + tool_hints=tool_hints or [], + activated_skills=activated_skills or [], + selected_template=selected_template, ), }, ], @@ -114,7 +185,40 @@ class TaskExecutionPlanner: ), timeout=timeout_seconds, ) - plan = self.from_json(response.content or "") + try: + plan = self._from_json_or_raise(response.content or "") + except Exception as first_error: + repair_response = await asyncio.wait_for( + provider.chat( + messages=[ + { + "role": "system", + "content": "Repair invalid Beaver task planner JSON. Return only one compact JSON object.", + }, + { + "role": "user", + "content": ( + "Repair the invalid planner JSON using the task-only schema from the original " + f"request. Validation error: {first_error}\nInvalid output:\n{response.content or ''}" + ), + }, + ], + tools=None, + model=model, + max_tokens=4096, + temperature=0.0, + ), + timeout=timeout_seconds, + ) + try: + plan = self._from_json_or_raise(repair_response.content or "") + except Exception as repair_error: + return TaskExecutionPlan.single( + "planner_fallback_single", + fallback_error=f"initial validation: {first_error}; repair validation: {repair_error}", + planner_adaptation=base_adaptation, + ) + self._merge_adaptation(plan, base_adaptation) return await self._resolve_plan( plan, task=task, @@ -152,30 +256,90 @@ class TaskExecutionPlanner: graph.validate() plan.graph = graph plan.skill_resolution_report = reports + self._merge_skill_resolution_adaptation(plan, reports) return plan except Exception as exc: return TaskExecutionPlan.single("planner_fallback_single", fallback_error=f"task_skill_resolver_failed: {exc}") + @staticmethod + def _needs_team_planning(*, task: TaskRecord, user_message: str) -> bool: + text = " ".join( + part + for part in ( + task.goal, + task.description, + user_message, + ) + if part + ).lower() + if not text.strip(): + return False + + complex_markers = ( + "agent team", + "sub-agent", + "multi-agent", + "parallel", + "dag", + "workflow", + "review", + "research", + "compare", + "comparison", + "architecture", + "refactor", + "multi-file", + "end-to-end", + "并行", + "团队", + "多智能体", + "子代理", + "工作流", + "评审", + "审查", + "调研", + "研究", + "对比", + "架构", + "重构", + "多文件", + "端到端", + ) + return any(marker in text for marker in complex_markers) + def from_json(self, text: str) -> TaskExecutionPlan: try: - payload = self._parse_json_object(text) - mode = str(payload.get("mode") or "single").strip().lower() - reason = str(payload.get("reason") or "") - if mode != "team": - return TaskExecutionPlan.single(reason or "planner_selected_single") - - graph = self._graph_from_payload(payload) - graph.validate() - return TaskExecutionPlan( - mode="team", - reason=reason or "planner_selected_team", - graph=graph, - final_synthesis_instruction=str(payload.get("final_synthesis_instruction") or ""), - ) + return self._from_json_or_raise(text) except Exception as exc: return TaskExecutionPlan.single("planner_fallback_single", fallback_error=str(exc)) - def _graph_from_payload(self, payload: dict[str, Any]) -> ExecutionGraph: + def _from_json_or_raise(self, text: str) -> TaskExecutionPlan: + payload = self._parse_json_object(text) + mode = str(payload.get("mode") or "single").strip().lower() + reason = str(payload.get("reason") or "") + adaptation = self._adaptation_from_payload(payload) + if mode != "team": + return TaskExecutionPlan.single( + reason or "planner_selected_single", + planner_adaptation=adaptation, + ) + + graph = self._graph_from_payload(payload, adaptation=adaptation) + graph.validate(max_depth=self._MAX_DEPTH) + return TaskExecutionPlan( + mode="team", + reason=reason or "planner_selected_team", + graph=graph, + final_synthesis_instruction=str(payload.get("final_synthesis_instruction") or ""), + planner_adaptation=adaptation, + ) + + def _graph_from_payload( + self, + payload: dict[str, Any], + *, + adaptation: dict[str, Any], + ) -> ExecutionGraph: strategy = str(payload.get("strategy") or "sequence").strip().lower() if strategy not in self._SUPPORTED_STRATEGIES: raise ValueError(f"Unsupported team strategy: {strategy}") @@ -189,16 +353,27 @@ class TaskExecutionPlanner: for index, item in enumerate(raw_nodes, start=1): if not isinstance(item, dict): raise ValueError("Each team node must be an object") - agent_payload = item.get("agent") if isinstance(item.get("agent"), dict) else {} - skill_query = str(item.get("skill_query") or agent_payload.get("skill_query") or item.get("task") or "").strip() - requested_capabilities = _string_list( - item.get("required_capabilities") or item.get("capabilities") or agent_payload.get("capabilities") - ) - requested_tags = _string_list(item.get("tags") or agent_payload.get("tags")) - node_id = str(item.get("node_id") or item.get("id") or agent_payload.get("name") or f"node_{index}").strip() + unsupported = sorted(set(item) - self._ALLOWED_NODE_FIELDS) + if unsupported: + raise ValueError(f"Unsupported team node field(s): {', '.join(unsupported)}") + node_id = str(item.get("node_id") or f"node_{index}").strip() task = str(item.get("task") or "").strip() if not node_id or not task: - raise ValueError("Each team node requires node_id/id and task") + raise ValueError("Each team node requires node_id and task") + allowed_tool_names = self._resolve_requested_tools( + item.get("requested_tools"), + warnings=adaptation["warnings"], + ) + use_skill = _optional_str(item.get("use_skill")) + skill_query = _optional_str(item.get("skill_query")) or task + if use_skill is not None or "skill_query" in item: + adaptation.setdefault("node_skill_bindings", []).append( + { + "node_id": node_id, + "use_skill": use_skill, + "skill_query": skill_query, + } + ) nodes.append( ExecutionNode( node_id=node_id, @@ -208,30 +383,147 @@ class TaskExecutionPlanner: role="", system_prompt="", metadata={ + "use_skill": use_skill, "skill_query": skill_query, - "required_capabilities": requested_capabilities, - "requested_tags": requested_tags, + "required_capabilities": [], + "requested_tags": [], "sub_agent_kind": "generic_skill_worker", }, ), depends_on=[str(dep) for dep in item.get("depends_on") or []], - inherited_pinned_skills=[str(name) for name in item.get("pinned_skills") or []], constraints=[str(value) for value in item.get("constraints") or []], - expected_output=str(item.get("expected_output") or "") or None, + input_contract=_dict_value(item.get("input_contract")), + output_contract=_dict_value(item.get("output_contract")), + allowed_tool_names=allowed_tool_names, + required_evidence=_string_list(item.get("required_evidence")), + evidence_contract=_dict_value(item.get("evidence_contract")), + validation_rules=_string_list(item.get("validation_rules")), + required_for_completion=bool(item.get("required_for_completion", True)), + block_downstream_on_partial=bool(item.get("block_downstream_on_partial", False)), + max_tool_iterations=_optional_int(item.get("max_tool_iterations")), ) ) return ExecutionGraph(strategy=strategy, nodes=nodes) # type: ignore[arg-type] + def _resolve_requested_tools(self, value: Any, *, warnings: list[str]) -> list[str] | None: + if value is None: + return None + result: list[str] = [] + for name in _string_list(value): + if name.lower() in HIGH_RISK_PLANNER_TOOL_NAMES: + _append_unique(warnings, f"requires_high_risk_review: {name}") + continue + if self.tool_registry is None or self.tool_registry.get(name) is None: + _append_unique(warnings, f"unknown tool removed: {name}") + continue + result.append(name) + return result + + @staticmethod + def _adaptation_from_payload(payload: dict[str, Any]) -> dict[str, Any]: + raw = payload.get("adaptation") + adaptation = dict(raw) if isinstance(raw, dict) else {} + adaptation["warnings"] = _string_list(adaptation.get("warnings")) + return adaptation + + @staticmethod + def _select_team_template( + activated_skills: list[SkillContext], + ) -> tuple[SkillContext | None, dict[str, Any]]: + candidates = [ + skill + for skill in activated_skills + if isinstance(skill.team_template, dict) and isinstance(skill.team_template.get("nodes"), list) + ] + selected = candidates[0] if candidates else None + warnings: list[str] = [] + for skill in activated_skills: + for warning in skill.team_template_warnings: + _append_unique(warnings, f"{skill.name}: {warning}") + return selected, { + "template_used": False, + "selected_template": selected.name if selected else None, + "selection_reason": ( + "first activated skill with a valid team template" + if selected + else "no activated skill has a valid team template" + ), + "ignored_templates": [skill.name for skill in candidates[1:]], + "warnings": warnings, + } + + @staticmethod + def _merge_adaptation(plan: TaskExecutionPlan, base: dict[str, Any]) -> None: + payload = dict(plan.planner_adaptation) + warnings: list[str] = [] + for warning in [*base.get("warnings", []), *payload.get("warnings", [])]: + _append_unique(warnings, str(warning)) + merged = { + "template_used": bool(payload.get("template_used", False)), + "selected_template": base.get("selected_template"), + "selection_reason": base.get("selection_reason"), + "ignored_templates": list(base.get("ignored_templates", [])), + "warnings": warnings, + } + if isinstance(payload.get("node_skill_bindings"), list): + merged["node_skill_bindings"] = [dict(item) for item in payload["node_skill_bindings"] if isinstance(item, dict)] + plan.planner_adaptation = merged + + @staticmethod + def _merge_skill_resolution_adaptation( + plan: TaskExecutionPlan, + reports: list[SkillResolutionReport], + ) -> None: + warnings = plan.planner_adaptation.setdefault("warnings", []) + bindings = plan.planner_adaptation.get("node_skill_bindings") + binding_by_node = { + str(item.get("node_id")): item + for item in bindings or [] + if isinstance(item, dict) + } + for report in reports: + for warning in report.warnings: + _append_unique(warnings, warning) + binding = binding_by_node.get(report.node_id) + if binding is not None and report.requested_skill_name and not report.exact_binding_used: + binding["fallback_reason"] = f"use_skill unresolved; {report.reason}" + @staticmethod def _prompt( *, task: TaskRecord, user_message: str, attempt_index: int, + skill_summaries: list[str] | None = None, + tool_hints: list[str] | None = None, + activated_skills: list[SkillContext] | None = None, + selected_template: SkillContext | None = None, ) -> str: history_note = "" if task.feedback: history_note = "\nRelevant task history:\n" + json.dumps(task.feedback[-5:], ensure_ascii=False) + skill_note = "" + if skill_summaries: + skill_note = "\nActivated skill summaries:\n" + "\n".join(f"- {item}" for item in skill_summaries) + guidance_note = "" + if activated_skills: + guidance_note = "\nActivated Skill guidance:\n" + "\n".join( + f"[{skill.name}]\n{skill.content}" for skill in activated_skills + ) + template_note = "" + if selected_template is not None: + template_note = "\nPrimary Skill team template:\n" + json.dumps( + { + "skill_name": selected_template.name, + "skill_version": selected_template.version, + "template": selected_template.team_template, + }, + ensure_ascii=False, + indent=2, + ) + tool_note = "" + if tool_hints: + tool_note = "\nActivated skill tool hints:\n" + "\n".join(f"- {item}" for item in tool_hints) return ( "Decide execution mode for this internal Task attempt.\n" "Use mode=team only when independent research, review, implementation slices, or staged checks " @@ -241,13 +533,24 @@ class TaskExecutionPlanner: ' "mode": "single" | "team",\n' ' "reason": "short reason",\n' ' "strategy": "sequence" | "parallel" | "dag",\n' - ' "nodes": [{"node_id": "api_review", "task": "...", "skill_query": "API contract review", ' - '"required_capabilities": ["schema compatibility"], "depends_on": []}],\n' + ' "nodes": [{"node_id": "collect", "task": "...", "use_skill": "optional exact skill", ' + '"skill_query": "optional dynamic skill query", "depends_on": [], ' + '"input_contract": {}, "output_contract": {}, "requested_tools": [], ' + '"required_evidence": [], "evidence_contract": {}, "validation_rules": [], ' + '"required_for_completion": true, "block_downstream_on_partial": false, ' + '"max_tool_iterations": 3, "constraints": []}],\n' + ' "adaptation": {"template_used": true, "warnings": []},\n' ' "final_synthesis_instruction": "how the main agent should synthesize team output"\n' "}\n\n" + "Node definitions are task-only. Never output agent or role fields. Use at most one primary " + "Skill template; treat all other activated Skills as guidance.\n\n" f"Task goal:\n{task.goal}\n\n" f"Current user request:\n{user_message}\n\n" f"Attempt index: {attempt_index}\n" + f"{skill_note}" + f"{guidance_note}" + f"{template_note}" + f"{tool_note}" f"{history_note}" ) @@ -275,6 +578,26 @@ def _optional_str(value: Any) -> str | None: return text or None +def _optional_int(value: Any) -> int | None: + if value in (None, ""): + return None + if isinstance(value, bool): + raise ValueError("max_tool_iterations must be an integer") + result = int(value) + if result < 0: + raise ValueError("max_tool_iterations must be non-negative") + return result + + +def _dict_value(value: Any) -> dict[str, Any]: + return dict(value) if isinstance(value, dict) else {} + + +def _append_unique(values: list[str], value: str) -> None: + if value and value not in values: + values.append(value) + + def _string_list(value: Any) -> list[str]: if not isinstance(value, list): if isinstance(value, str): diff --git a/app-instance/backend/beaver/tasks/router.py b/app-instance/backend/beaver/tasks/router.py index ff7ae7a..ae9b874 100644 --- a/app-instance/backend/beaver/tasks/router.py +++ b/app-instance/backend/beaver/tasks/router.py @@ -4,6 +4,7 @@ from __future__ import annotations import asyncio import json +import re from typing import Any from .models import MainAgentDecision, TaskRecord @@ -24,6 +25,15 @@ class MainAgentRouter: thinking_enabled: bool | None = None, timeout_seconds: float = 8.0, ) -> MainAgentDecision: + if active_task is None and _is_obvious_simple_chat(message): + return MainAgentDecision(mode="simple", reason="obvious_simple_chat", action="simple_chat") + if active_task is None and _is_obvious_task_request(message): + return MainAgentDecision( + mode="task", + reason="obvious_task", + starts_new_task=True, + action="create_task", + ) if provider is None: return self._apply_active_task_boundary( self._fallback(active_task=active_task, reason="router_provider_unavailable"), @@ -246,6 +256,64 @@ def _clean_short_title(value: Any) -> str | None: return title[:40] or None +def _is_obvious_simple_chat(message: str) -> bool: + text = _compact_text(message).lower().strip("!!??。.,,~~") + if not text: + return False + if _has_url_or_path(text) or _looks_like_fresh_task_request(text): + return False + if len(text) <= 24 and text in { + "hi", + "hello", + "hey", + "thanks", + "thankyou", + "thankyou!", + "谢谢", + "谢了", + "多谢", + "你好", + "您好", + "嗨", + "在吗", + "早上好", + "下午好", + "晚上好", + "辛苦了", + }: + return True + simple_prefixes = ( + "翻译", + "translate", + "润色", + "改写", + "校对", + "总结下面", + "总结这段", + "摘要下面", + "summarize this", + ) + return len(text) <= 1200 and text.startswith(simple_prefixes) + + +def _is_obvious_task_request(message: str) -> bool: + text = _compact_text(message) + if not text: + return False + if _looks_like_explicit_task_followup(text): + return False + if _has_url_or_path(text): + return True + return _looks_like_fresh_task_request(text) + + +def _has_url_or_path(text: str) -> bool: + return bool( + re.search(r"https?://|www\.", text) + or re.search(r"(^|[\s'\"`])(?:[./~]|[a-zA-Z]:[\\/])[^\s'\"`]+", text) + ) + + def _looks_like_explicit_task_followup(message: str) -> bool: text = _compact_text(message) if not text: @@ -307,6 +375,16 @@ def _looks_like_fresh_task_request(message: str) -> bool: "看看最新", "最新", "今天", + "昨天", + "昨日", + "昨晚", + "刚刚", + "最近", + "近期", + "本届", + "本场", + "这场", + "上一场", "明天", "上传", "下载", @@ -324,6 +402,12 @@ def _looks_like_fresh_task_request(message: str) -> bool: "look up", "latest", "today", + "yesterday", + "last night", + "recent", + "recently", + "this match", + "this game", "tomorrow", "upload", "download", diff --git a/app-instance/backend/beaver/tasks/skill_resolver.py b/app-instance/backend/beaver/tasks/skill_resolver.py index 8038998..9b65a78 100644 --- a/app-instance/backend/beaver/tasks/skill_resolver.py +++ b/app-instance/backend/beaver/tasks/skill_resolver.py @@ -7,9 +7,11 @@ from dataclasses import dataclass, field, replace from typing import Any from beaver.coordinator.models import AgentDescriptor, ExecutionGraph, ExecutionNode +from beaver.engine.context import SkillContext from beaver.engine.providers import ProviderBundle from beaver.skills.assembler.embedding_retriever import SkillEmbeddingRetriever from beaver.skills.catalog.loader import SkillsLoader +from beaver.skills.catalog.utils import strip_frontmatter from beaver.skills.drafts import DraftService from beaver.skills.learning import EphemeralGuidanceSynthesizer from beaver.tasks.models import TaskRecord @@ -24,6 +26,9 @@ class SkillResolutionReport: ephemeral_guidance_id: str | None = None ephemeral_guidance_name: str | None = None ephemeral_used: bool = False + requested_skill_name: str | None = None + exact_binding_used: bool = False + warnings: list[str] = field(default_factory=list) reason: str = "" def to_dict(self) -> dict[str, Any]: @@ -35,6 +40,9 @@ class SkillResolutionReport: "ephemeral_guidance_id": self.ephemeral_guidance_id, "ephemeral_guidance_name": self.ephemeral_guidance_name, "ephemeral_used": self.ephemeral_used, + "requested_skill_name": self.requested_skill_name, + "exact_binding_used": self.exact_binding_used, + "warnings": list(self.warnings), "reason": self.reason, } @@ -87,12 +95,45 @@ class TaskSkillResolver: attempt_index: int, provider_bundle: ProviderBundle, ) -> tuple[ExecutionNode, SkillResolutionReport]: + use_skill = str(node.agent.metadata.get("use_skill") or "").strip() skill_query = str(node.agent.metadata.get("skill_query") or node.task or node.node_id).strip() + warnings: list[str] = [] required_capabilities = [ str(item).strip() for item in node.agent.metadata.get("required_capabilities", []) if str(item).strip() ] + if use_skill: + exact_context = self._load_exact_skill_context(use_skill) + if exact_context is not None: + resolved = self._generic_node( + node, + pinned_skill_names=_merge_names(node.inherited_pinned_skills, [use_skill]), + pinned_skill_contexts=_merge_skill_contexts( + node.inherited_pinned_skill_contexts, + [exact_context], + ), + metadata={ + **node.agent.metadata, + "use_skill": use_skill, + "skill_query": skill_query, + "required_capabilities": required_capabilities, + "selected_skill_names": [use_skill], + "ephemeral_skill_names": [], + "exact_binding_used": True, + }, + ) + return resolved, SkillResolutionReport( + node_id=node.node_id, + skill_query=skill_query, + required_capabilities=required_capabilities, + selected_skill_names=[use_skill], + requested_skill_name=use_skill, + exact_binding_used=True, + reason="exact use_skill binding", + ) + warnings.append(f"use_skill unresolved: {use_skill}") + if self._is_summary_only_node(node, skill_query=skill_query, required_capabilities=required_capabilities): resolved = self._generic_node( node, @@ -104,6 +145,7 @@ class TaskSkillResolver: "required_capabilities": required_capabilities, "selected_skill_names": [], "ephemeral_skill_names": [], + "exact_binding_used": False, "summary_uses_dependency_outputs_only": True, }, ) @@ -113,6 +155,9 @@ class TaskSkillResolver: required_capabilities=required_capabilities, selected_skill_names=[], ephemeral_used=False, + requested_skill_name=use_skill or None, + exact_binding_used=False, + warnings=warnings, reason="summary node uses dependency outputs directly", ) @@ -141,6 +186,7 @@ class TaskSkillResolver: "required_capabilities": required_capabilities, "selected_skill_names": selected, "ephemeral_skill_names": [], + "exact_binding_used": False, }, ) return resolved, SkillResolutionReport( @@ -149,6 +195,9 @@ class TaskSkillResolver: required_capabilities=required_capabilities, selected_skill_names=selected, ephemeral_used=False, + requested_skill_name=use_skill or None, + exact_binding_used=False, + warnings=warnings, reason="matched published skill", ) @@ -174,6 +223,7 @@ class TaskSkillResolver: "ephemeral_guidance_id": missing.guidance_id, "ephemeral_guidance_name": missing.guidance_name, "ephemeral_skill_names": [missing.skill_context.name], + "exact_binding_used": False, }, ) return resolved, SkillResolutionReport( @@ -183,9 +233,27 @@ class TaskSkillResolver: ephemeral_guidance_id=missing.guidance_id, ephemeral_guidance_name=missing.guidance_name, ephemeral_used=True, + requested_skill_name=use_skill or None, + exact_binding_used=False, + warnings=warnings, reason="generated ephemeral guidance for missing sub-agent capability", ) + def _load_exact_skill_context(self, name: str) -> SkillContext | None: + record = self.skills_loader.get_skill_record(name) + raw_content = self.skills_loader.load_published_skill(name) + content = strip_frontmatter(raw_content).strip() if raw_content else "" + if record is None or not content: + return None + return SkillContext( + name=name, + content=content, + version=record.version, + content_hash=record.content_hash or "", + activation_reason="explicit_node_binding", + tool_hints=list(record.tool_hints), + ) + async def _select_published_skills(self, *, query: str, provider_bundle: ProviderBundle) -> list[str]: candidates = self.skills_loader.build_selection_candidates() if not candidates: @@ -336,3 +404,14 @@ def _merge_names(parent: list[str], selected: list[str]) -> list[str]: if name and name not in result: result.append(name) return result + + +def _merge_skill_contexts(parent: list[SkillContext], selected: list[SkillContext]) -> list[SkillContext]: + result: list[SkillContext] = [] + seen: set[str] = set() + for context in [*parent, *selected]: + if context.name in seen: + continue + seen.add(context.name) + result.append(context) + return result diff --git a/app-instance/backend/beaver/tools/builtins/web.py b/app-instance/backend/beaver/tools/builtins/web.py index 90e55b3..ae55d88 100644 --- a/app-instance/backend/beaver/tools/builtins/web.py +++ b/app-instance/backend/beaver/tools/builtins/web.py @@ -5,10 +5,11 @@ from __future__ import annotations import asyncio from dataclasses import dataclass, field from html import unescape +from html.parser import HTMLParser import json import re from typing import Any -from urllib.parse import quote_plus, urlparse +from urllib.parse import quote_plus, urljoin, urlparse import httpx @@ -24,6 +25,10 @@ def _strip_html(value: str) -> str: return re.sub(r"\s+", " ", text).strip() +def _compact_text(value: str) -> str: + return re.sub(r"\s+", " ", unescape(value)).strip() + + def _safe_url(url: str) -> str: parsed = urlparse(url) if parsed.scheme not in {"http", "https"} or not parsed.netloc: @@ -31,6 +36,77 @@ def _safe_url(url: str) -> str: return url +class _HtmlMetadataParser(HTMLParser): + def __init__(self, base_url: str) -> None: + super().__init__(convert_charrefs=True) + self.base_url = base_url + self.title = "" + self.links: list[dict[str, str]] = [] + self._in_title = False + self._current_href: str | None = None + self._current_text: list[str] = [] + self._skip_depth = 0 + self._seen_urls: set[str] = set() + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + lowered = tag.lower() + if lowered in {"script", "style"}: + self._skip_depth += 1 + return + if self._skip_depth: + return + if lowered == "title": + self._in_title = True + return + if lowered == "a": + href = dict(attrs).get("href") + if href: + self._current_href = urljoin(self.base_url, href) + self._current_text = [] + + def handle_endtag(self, tag: str) -> None: + lowered = tag.lower() + if lowered in {"script", "style"} and self._skip_depth: + self._skip_depth -= 1 + return + if self._skip_depth: + return + if lowered == "title": + self._in_title = False + self.title = _compact_text(self.title) + return + if lowered == "a" and self._current_href: + parsed = urlparse(self._current_href) + if parsed.scheme in {"http", "https"} and self._current_href not in self._seen_urls: + text = _compact_text(" ".join(self._current_text)) + self.links.append({"text": text, "url": self._current_href}) + self._seen_urls.add(self._current_href) + self._current_href = None + self._current_text = [] + + def handle_data(self, data: str) -> None: + if self._skip_depth: + return + if self._in_title: + self.title += data + if self._current_href: + self._current_text.append(data) + + +def _extract_html_metadata(html: str, base_url: str, *, max_links: int = 80) -> dict[str, Any]: + parser = _HtmlMetadataParser(base_url) + parser.feed(html) + links = parser.links[:max_links] + pdf_links = [ + link for link in links if urlparse(link["url"]).path.lower().endswith(".pdf") + ][:30] + return { + "title": parser.title, + "links": links, + "pdf_links": pdf_links, + } + + @dataclass(slots=True) class WebFetchTool: name: str = "web_fetch" @@ -61,13 +137,20 @@ class WebFetchTool: response.raise_for_status() content_type = response.headers.get("content-type", "") raw = response.text - text = _strip_html(raw) if "html" in content_type.lower() else raw + is_html = "html" in content_type.lower() + text = _strip_html(raw) if is_html else raw + metadata = _extract_html_metadata(raw, str(response.url)) if is_html else { + "title": "", + "links": [], + "pdf_links": [], + } truncated = len(text) > limit return _json_result( True, url=str(response.url), status_code=response.status_code, content_type=content_type, + **metadata, content=text[:limit], truncated=truncated, ) @@ -97,6 +180,15 @@ class WebSearchTool: if not str(query).strip(): raise ValueError("query is required") bounded = max(1, min(int(limit or 5), 10)) + errors: list[str] = [] + try: + ddgs_results = await asyncio.to_thread(_search_ddgs, query, bounded) + except Exception as exc: + ddgs_results = [] + errors.append(str(exc)) + if ddgs_results: + return _json_result(True, **_search_result_payload(query, "ddgs", ddgs_results)) + headers = {"User-Agent": "Mozilla/5.0 Beaver/1.0"} timeout = httpx.Timeout(connect=5, read=8, write=5, pool=5) async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client: @@ -118,7 +210,6 @@ class WebSearchTool: ) ), ] - errors: list[str] = [] try: for completed in asyncio.as_completed(tasks): try: @@ -127,7 +218,7 @@ class WebSearchTool: errors.append(str(exc)) continue if results: - return _json_result(True, query=query, engine=engine, results=results) + return _json_result(True, **_search_result_payload(query, engine, results)) detail = "; ".join(error for error in errors if error) or "no search results" return _json_result(False, query=query, error=detail) finally: @@ -182,6 +273,62 @@ def _parse_bing_results(html: str, limit: int) -> list[dict[str, str]]: return results +def _search_ddgs(query: str, limit: int) -> list[dict[str, str]]: + from ddgs import DDGS # type: ignore[import-not-found] + + rows = DDGS().text(query, max_results=limit) + results: list[dict[str, str]] = [] + for row in rows or []: + title = _compact_text(str(row.get("title") or "")) + result_url = str(row.get("href") or row.get("url") or "").strip() + snippet = _compact_text(str(row.get("body") or row.get("snippet") or "")) + if title and result_url: + results.append({"title": title, "url": result_url, "snippet": snippet}) + if len(results) >= limit: + break + return results + + +def _search_result_payload(query: str, engine: str, results: list[dict[str, str]]) -> dict[str, Any]: + quality, reason = _assess_search_quality(query, results) + payload: dict[str, Any] = { + "query": query, + "engine": engine, + "quality": quality, + "results": results, + } + if reason: + payload["low_relevance_reason"] = reason + return payload + + +def _search_terms(value: str) -> set[str]: + return { + term + for term in re.findall(r"[a-z0-9]+", value.lower()) + if len(term) > 2 + } + + +def _assess_search_quality(query: str, results: list[dict[str, str]]) -> tuple[str, str | None]: + terms = _search_terms(query) + if not terms: + return "high", None + required_overlap = min(2, len(terms)) + for result in results: + haystack = " ".join( + [ + result.get("title", ""), + result.get("snippet", ""), + urlparse(result.get("url", "")).netloc, + urlparse(result.get("url", "")).path, + ] + ) + if len(terms & _search_terms(haystack)) >= required_overlap: + return "high", None + return "low", "results do not overlap enough with query terms" + + def _parse_duckduckgo_results(html: str, limit: int) -> list[dict[str, str]]: results: list[dict[str, str]] = [] pattern = re.compile( diff --git a/app-instance/backend/beaver/tools/runtime/executor.py b/app-instance/backend/beaver/tools/runtime/executor.py index 2d842a3..b293911 100644 --- a/app-instance/backend/beaver/tools/runtime/executor.py +++ b/app-instance/backend/beaver/tools/runtime/executor.py @@ -37,6 +37,14 @@ class ToolExecutor: ) -> ToolResult: """按工具名执行一次调用。""" + allowed = context.metadata.get("allowed_tool_names") if context is not None else None + if isinstance(allowed, list) and tool_name not in allowed: + return ToolResult( + success=False, + content=f"Tool {tool_name} is not allowed for this node.", + tool_name=tool_name, + error="tool_not_allowed", + ) tool = self.registry.get(tool_name) if tool is None: return ToolResult( diff --git a/app-instance/backend/pyproject.toml b/app-instance/backend/pyproject.toml index d16c3da..4396e15 100644 --- a/app-instance/backend/pyproject.toml +++ b/app-instance/backend/pyproject.toml @@ -6,6 +6,7 @@ requires-python = ">=3.11" dependencies = [ "anthropic>=0.51.0,<1.0.0", "croniter>=6.0.0,<7.0.0", + "ddgs>=9.0.0,<10.0.0", "fastmcp>=3.0.0,<4.0.0", "fastapi>=0.115.0,<1.0.0", "httpx>=0.28.0,<1.0.0", diff --git a/app-instance/backend/tests/unit/test_agent_loop.py b/app-instance/backend/tests/unit/test_agent_loop.py index ab48736..6c8d8d4 100644 --- a/app-instance/backend/tests/unit/test_agent_loop.py +++ b/app-instance/backend/tests/unit/test_agent_loop.py @@ -1,8 +1,10 @@ import asyncio +import json from contextlib import suppress from typing import Any from beaver.engine import AgentLoop, AgentRunResult, EngineLoader +from beaver.engine import loop as loop_module def _run_result(run_id: str, output_text: str) -> AgentRunResult: @@ -45,3 +47,37 @@ def test_running_loop_handles_reentrant_submit_direct(tmp_path) -> None: assert calls == ["outer", "inner"] asyncio.run(run_case()) + + +def test_web_search_loop_guard_stops_after_repeated_low_quality_results() -> None: + guard = loop_module._WebSearchLoopGuard() + low_quality = json.dumps( + { + "success": True, + "query": "weather beijing", + "quality": "low", + "results": [{"title": "Example", "url": "https://example.com", "snippet": ""}], + } + ) + + assert guard.observe_result("web_search", low_quality) is None + assert guard.observe_result("web_search", low_quality) is None + + guidance = guard.observe_result("web_search", low_quality) + + assert guidance is not None + assert guidance["finish_reason"] == "web_search_low_quality_budget" + assert "weather beijing" in guidance["message"] + + +def test_web_search_loop_guard_resets_after_useful_result() -> None: + guard = loop_module._WebSearchLoopGuard() + low_quality = json.dumps({"success": True, "query": "weather", "quality": "low", "results": []}) + useful = json.dumps({"success": True, "query": "weather", "quality": "high", "results": []}) + + assert guard.observe_result("web_search", low_quality) is None + assert guard.observe_result("web_search", useful) is None + assert guard.observe_result("web_search", low_quality) is None + assert guard.observe_result("web_search", low_quality) is None + + assert guard.observe_result("web_search", low_quality) is not None diff --git a/app-instance/backend/tests/unit/test_agent_loop_replay_executor.py b/app-instance/backend/tests/unit/test_agent_loop_replay_executor.py index a171e8e..d404d40 100644 --- a/app-instance/backend/tests/unit/test_agent_loop_replay_executor.py +++ b/app-instance/backend/tests/unit/test_agent_loop_replay_executor.py @@ -1,7 +1,9 @@ from __future__ import annotations +import asyncio from pathlib import Path from types import SimpleNamespace +from typing import Any import pytest @@ -44,6 +46,49 @@ class ToolCallingProvider(LLMProvider): return "stub" +class ParallelToolProvider(LLMProvider): + def __init__(self) -> None: + super().__init__() + self.calls = 0 + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + model: str | None = None, + max_tokens: int | None = None, + temperature: float = 0.7, + thinking_enabled: bool | None = None, + ) -> LLMResponse: + self.calls += 1 + if self.calls == 1: + return LLMResponse( + content="", + tool_calls=[ + ToolCallRequest(id="call-1", name="read_file", arguments={"path": "README.md"}), + ToolCallRequest(id="call-2", name="search_files", arguments={"query": "Beaver"}), + ], + ) + return LLMResponse(content="done") + + def get_default_model(self) -> str: + return "stub" + + +class ConcurrentReadOnlyExecutor: + def __init__(self) -> None: + self.started: list[str] = [] + self._both_started = asyncio.Event() + + async def execute_tool_call(self, tool_call: ToolCallRequest | dict[str, Any], *, context=None): + name = getattr(tool_call, "name", "") + self.started.append(name) + if len(self.started) >= 2: + self._both_started.set() + await asyncio.wait_for(self._both_started.wait(), timeout=0.2) + return SimpleNamespace(success=True, error=None, content=f"{name} result", tool_name=name) + + @pytest.mark.asyncio async def test_process_direct_uses_replay_tool_executor(tmp_path: Path) -> None: loop = AgentLoop(loader=EngineLoader(workspace=tmp_path)) @@ -69,3 +114,63 @@ async def test_process_direct_uses_replay_tool_executor(tmp_path: Path) -> None: assert result.output_text == "done" assert replay_executor.traces assert replay_executor.traces[0]["tool_name"] == "read_file" + + +@pytest.mark.asyncio +async def test_process_direct_runs_read_only_tool_calls_concurrently(tmp_path: Path) -> None: + loop = AgentLoop(loader=EngineLoader(workspace=tmp_path)) + provider = ParallelToolProvider() + executor = ConcurrentReadOnlyExecutor() + runtime = SimpleNamespace(model="stub", provider_name="stub") + + result = await loop.process_direct( + "Read and search the workspace.", + provider_bundle=ProviderBundle(main_runtime=runtime, main_provider=provider), # type: ignore[arg-type] + include_skill_assembly=False, + pinned_skill_names=[], + tool_executor_override=executor, + max_tool_iterations=2, + ) + + assert result.output_text == "done" + assert executor.started == ["read_file", "search_files"] + + +@pytest.mark.asyncio +async def test_process_direct_records_latency_breakdown(tmp_path: Path) -> None: + loop = AgentLoop(loader=EngineLoader(workspace=tmp_path)) + provider = ParallelToolProvider() + executor = ConcurrentReadOnlyExecutor() + runtime = SimpleNamespace(model="stub", provider_name="stub") + + result = await loop.process_direct( + "Read and search the workspace.", + provider_bundle=ProviderBundle(main_runtime=runtime, main_provider=provider), # type: ignore[arg-type] + include_skill_assembly=False, + pinned_skill_names=[], + tool_executor_override=executor, + max_tool_iterations=2, + ) + + latency = result.usage["latency_ms"] + expected_keys = { + "router_ms", + "mcp_ms", + "skill_assembly_ms", + "tool_assembly_ms", + "context_build_ms", + "llm_ms", + "tool_ms", + "session_write_ms", + "total_ms", + } + assert expected_keys.issubset(latency) + assert all(isinstance(latency[key], (int, float)) and latency[key] >= 0 for key in expected_keys) + assert latency["llm_ms"] > 0 + assert latency["tool_ms"] > 0 + assert latency["total_ms"] >= latency["llm_ms"] + + loaded = loop.boot() + events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id) + completed = next(event for event in events if event.event_type == "run_completed") + assert completed.event_payload["latency_ms"] == latency diff --git a/app-instance/backend/tests/unit/test_agent_team_toggle.py b/app-instance/backend/tests/unit/test_agent_team_toggle.py new file mode 100644 index 0000000..97c64a0 --- /dev/null +++ b/app-instance/backend/tests/unit/test_agent_team_toggle.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import asyncio +from types import SimpleNamespace + +from beaver.engine.providers.base import LLMProvider, LLMResponse +from beaver.engine.providers.factory import ProviderBundle +from beaver.tasks import TaskExecutionPlanner, TaskRecord + + +class _TeamPlannerProvider(LLMProvider): + def __init__(self) -> None: + super().__init__() + self.calls = 0 + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + model: str | None = None, + max_tokens: int = 4096, + temperature: float = 0.7, + ) -> LLMResponse: + self.calls += 1 + return LLMResponse( + content='{"mode":"team","reason":"parallel research","strategy":"parallel","nodes":[{"node_id":"research","task":"research","agent":{"name":"researcher"}}]}', + finish_reason="stop", + provider_name="stub", + model="stub-model", + ) + + def get_default_model(self) -> str: + return "stub-model" + + +def test_agent_team_can_be_disabled_by_environment(monkeypatch) -> None: + monkeypatch.setenv("BEAVER_AGENT_TEAM_ENABLED", "0") + provider = _TeamPlannerProvider() + task = TaskRecord( + task_id="task-1", + session_id="session-1", + description="research and compare options", + goal="research and compare options", + constraints=[], + priority=0, + status="open", + creator="test", + created_at="now", + updated_at="now", + ) + bundle = ProviderBundle( + main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), + main_provider=provider, + ) + + plan = asyncio.run( + TaskExecutionPlanner().plan( + task=task, + user_message="research and compare options", + attempt_index=1, + provider_bundle=bundle, + ) + ) + + assert plan.mode == "single" + assert plan.reason == "planner_disabled_by_environment" + assert provider.calls == 0 diff --git a/app-instance/backend/tests/unit/test_agent_team_v1.py b/app-instance/backend/tests/unit/test_agent_team_v1.py index a098b81..2503a4e 100644 --- a/app-instance/backend/tests/unit/test_agent_team_v1.py +++ b/app-instance/backend/tests/unit/test_agent_team_v1.py @@ -8,7 +8,8 @@ import pytest from beaver.memory.curated.snapshot import MemorySnapshot from beaver.services.memory_service import MemoryService -from beaver.coordinator import AgentDescriptor, DelegationEnvelope, ExecutionGraph, ExecutionNode +from beaver.coordinator import AgentDescriptor, DelegationEnvelope, ExecutionGraph, ExecutionNode, NodeRunResult +from beaver.coordinator.execution.scheduler import TeamGraphScheduler from beaver.coordinator.local import LocalAgentRunner from beaver.engine import AgentLoop, EngineLoader from beaver.engine.context import SkillContext @@ -90,6 +91,15 @@ class PerRunSnapshotMemoryService(MemoryService): return MemorySnapshot(memory_block="# Memory\n\nshared-snapshot", user_block=None) +class CapturingRunner: + def __init__(self) -> None: + self.envelopes: list[DelegationEnvelope] = [] + + async def run(self, envelope: DelegationEnvelope, **kwargs) -> NodeRunResult: + self.envelopes.append(envelope) + return NodeRunResult(node_id=envelope.node_id or "node", success=True, output_text="done") + + def _bundle(provider: RecordingProvider) -> ProviderBundle: return ProviderBundle( main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), @@ -161,10 +171,72 @@ def test_local_agent_runner_uses_shared_loop_and_records_parent_task(tmp_path: P child_session = loaded.session_manager.get_session(result.session_id) # type: ignore[union-attr,arg-type] assert result.success is True + assert result.completion_status == "succeeded" + assert result.evidence_gaps == [] assert run_record.task_id == "task-parent" assert child_session["parent_session_id"] == "session-root" +def test_node_without_required_tool_result_is_partial(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider([_response("collected narrative")]) + envelope = DelegationEnvelope( + parent_task_id=None, + parent_session_id="session-root", + parent_run_id=None, + agent=AgentDescriptor(name="collect"), + task="collect", + node_id="collect", + required_evidence=["tool_result"], + ) + + result = asyncio.run(LocalAgentRunner(loop).run(envelope, provider_bundle=_bundle(provider))) + + assert result.success is False + assert result.completion_status == "partial" + assert result.evidence_gaps == ["missing required evidence: tool_result"] + + +def test_node_with_required_nonempty_output_succeeds(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider([_response("verified output")]) + envelope = DelegationEnvelope( + parent_task_id=None, + parent_session_id="session-root", + parent_run_id=None, + agent=AgentDescriptor(name="verify"), + task="verify", + node_id="verify", + required_evidence=["output"], + ) + + result = asyncio.run(LocalAgentRunner(loop).run(envelope, provider_bundle=_bundle(provider))) + + assert result.success is True + assert result.completion_status == "succeeded" + assert result.evidence_gaps == [] + + +def test_unknown_evidence_requirement_makes_node_partial(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider([_response("output")]) + envelope = DelegationEnvelope( + parent_task_id=None, + parent_session_id="session-root", + parent_run_id=None, + agent=AgentDescriptor(name="verify"), + task="verify", + node_id="verify", + required_evidence=["unknown_type"], + ) + + result = asyncio.run(LocalAgentRunner(loop).run(envelope, provider_bundle=_bundle(provider))) + + assert result.success is False + assert result.completion_status == "partial" + assert result.evidence_gaps == ["unsupported evidence requirement: unknown_type"] + + def test_team_node_preserves_evidence_when_finish_reason_is_not_stop(tmp_path: Path) -> None: loop = _loop(tmp_path) provider = RecordingProvider([_response("partial evidence", finish_reason="max_tool_iterations")]) @@ -277,6 +349,108 @@ def test_team_sequence_passes_prior_outputs(tmp_path: Path) -> None: assert "Dependency first output:\nfirst output" in providers["second"].calls[0][0]["content"] +def test_partial_node_allows_downstream_by_default(tmp_path: Path) -> None: + loop = _loop(tmp_path) + providers = { + "collect": RecordingProvider([_response("partial source notes")]), + "extract": RecordingProvider([_response("extracted metrics")]), + } + graph = ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode( + "collect", + "collect", + AgentDescriptor(name="collect"), + required_evidence=["tool_result"], + ), + ExecutionNode("extract", "extract", AgentDescriptor(name="extract")), + ], + ) + + result = asyncio.run( + TeamService(loop).run_team( + graph, + parent_task_id=None, + parent_session_id="session-root", + provider_bundle_factory=lambda node: _bundle(providers[node.node_id]), + ) + ) + + assert result.node_results[0].completion_status == "partial" + assert result.node_results[1].completion_status == "succeeded" + assert "Dependency collect output:\npartial source notes" in providers["extract"].calls[0][0]["content"] + + +def test_partial_node_blocks_downstream_when_configured(tmp_path: Path) -> None: + loop = _loop(tmp_path) + providers = { + "collect": RecordingProvider([_response("partial source notes")]), + "extract": RecordingProvider([_response("must not run")]), + } + graph = ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode( + "collect", + "collect", + AgentDescriptor(name="collect"), + required_evidence=["tool_result"], + block_downstream_on_partial=True, + ), + ExecutionNode("extract", "extract", AgentDescriptor(name="extract")), + ], + ) + + result = asyncio.run( + TeamService(loop).run_team( + graph, + parent_task_id=None, + parent_session_id="session-root", + provider_bundle_factory=lambda node: _bundle(providers[node.node_id]), + ) + ) + + assert result.node_results[0].completion_status == "partial" + assert result.node_results[1].completion_status == "blocked" + assert providers["extract"].calls == [] + + +def test_scheduler_copies_task_two_contract_fields_to_envelope() -> None: + runner = CapturingRunner() + node = ExecutionNode( + "collect", + "collect", + AgentDescriptor(name="collect"), + input_contract={"query": "str"}, + output_contract={"sources": "list"}, + required_evidence=["tool_result"], + evidence_contract={"entities": ["MGM"]}, + validation_rules=["official_sources_only"], + required_for_completion=False, + block_downstream_on_partial=True, + max_tool_iterations=2, + ) + + asyncio.run( + TeamGraphScheduler(runner).run( # type: ignore[arg-type] + ExecutionGraph(strategy="sequence", nodes=[node]), + parent_task_id=None, + parent_session_id="session-root", + ) + ) + + envelope = runner.envelopes[0] + assert envelope.input_contract == {"query": "str"} + assert envelope.output_contract == {"sources": "list"} + assert envelope.required_evidence == ["tool_result"] + assert envelope.evidence_contract == {"entities": ["MGM"]} + assert envelope.validation_rules == ["official_sources_only"] + assert envelope.required_for_completion is False + assert envelope.block_downstream_on_partial is True + assert envelope.max_tool_iterations == 2 + + def test_team_parallel_runs_all_nodes(tmp_path: Path) -> None: loop = _loop(tmp_path) providers = { @@ -428,9 +602,12 @@ def test_team_dag_blocks_dependents_after_failure(tmp_path: Path) -> None: ) ) publish = [item for item in result.node_results if item.node_id == "publish"][0] + validate = [item for item in result.node_results if item.node_id == "validate"][0] assert result.success is False + assert validate.completion_status == "failed" assert publish.finish_reason == "blocked" + assert publish.completion_status == "blocked" assert publish.run_id is None assert publish.error == "Blocked by failed dependency: validate" assert "failed" not in result.summary.split("Failed nodes:")[0] @@ -471,8 +648,10 @@ def test_dag_node_factory_error_blocks_dependents(tmp_path: Path) -> None: assert result.success is False assert validate.finish_reason == "error" + assert validate.completion_status == "failed" assert validate.error == "validator unavailable" assert publish.finish_reason == "blocked" + assert publish.completion_status == "blocked" assert publish.error == "Blocked by failed dependency: validate" @@ -550,6 +729,76 @@ def test_graph_structure_errors_still_raise(tmp_path: Path) -> None: asyncio.run(TeamService(loop).run_team(cyclic, parent_task_id=None, parent_session_id="session-root")) +def test_execution_node_contract_defaults_preserve_legacy_scope_behavior() -> None: + node = ExecutionNode("collect", "Collect sources", AgentDescriptor(name="collect")) + + assert node.input_contract == {} + assert node.output_contract == {} + assert node.allowed_tool_names is None + assert node.required_evidence == [] + assert node.evidence_contract == {} + assert node.validation_rules == [] + assert node.required_for_completion is True + assert node.block_downstream_on_partial is False + assert node.max_tool_iterations is None + + +def test_execution_node_keeps_explicit_empty_tool_scope_distinct_from_unspecified_scope() -> None: + unrestricted = ExecutionNode("unrestricted", "Collect", AgentDescriptor(name="unrestricted")) + tool_free = ExecutionNode( + "tool_free", + "Synthesize", + AgentDescriptor(name="tool_free"), + allowed_tool_names=[], + ) + + assert unrestricted.allowed_tool_names is None + assert tool_free.allowed_tool_names == [] + + +def test_delegation_envelope_and_node_result_preserve_new_contract_metadata() -> None: + envelope = DelegationEnvelope( + parent_task_id="task-parent", + parent_session_id="session-root", + parent_run_id="run-root", + agent=AgentDescriptor(name="collect"), + task="Collect sources", + allowed_tool_names=["web_search"], + required_evidence=["url"], + evidence_contract={"entities": ["MGM", "Galaxy"]}, + validation_rules=["official_sources_only"], + required_for_completion=True, + block_downstream_on_partial=True, + max_tool_iterations=2, + ) + result = NodeRunResult( + node_id="collect", + success=False, + output_text="MGM source only", + completion_status="partial", + evidence_gaps=["missing required evidence: Galaxy official source"], + ) + + assert envelope.allowed_tool_names == ["web_search"] + assert envelope.evidence_contract == {"entities": ["MGM", "Galaxy"]} + assert result.to_dict()["completion_status"] == "partial" + assert result.to_dict()["evidence_gaps"] == ["missing required evidence: Galaxy official source"] + + +def test_graph_rejects_depth_above_configured_limit() -> None: + graph = ExecutionGraph( + strategy="dag", + nodes=[ + ExecutionNode("a", "A", AgentDescriptor(name="a")), + ExecutionNode("b", "B", AgentDescriptor(name="b"), depends_on=["a"]), + ExecutionNode("c", "C", AgentDescriptor(name="c"), depends_on=["b"]), + ], + ) + + with pytest.raises(ValueError, match="max depth"): + graph.validate(max_depth=2) + + def test_team_run_does_not_create_independent_team_task(tmp_path: Path) -> None: loop = _loop(tmp_path) loaded = loop.boot() diff --git a/app-instance/backend/tests/unit/test_debug_chat_logs_api.py b/app-instance/backend/tests/unit/test_debug_chat_logs_api.py index 7521144..03eff75 100644 --- a/app-instance/backend/tests/unit/test_debug_chat_logs_api.py +++ b/app-instance/backend/tests/unit/test_debug_chat_logs_api.py @@ -1,6 +1,7 @@ from __future__ import annotations from pathlib import Path +from time import sleep from fastapi.testclient import TestClient @@ -74,10 +75,77 @@ def test_debug_chat_logs_group_events_by_run(tmp_path: Path) -> None: assert run["intent_agent_choice"] == "create_task" assert run["user_input"] == "hello" assert [event["event_type"] for event in run["events"]] == [ - "run_started", - "intent_agent_decision_snapshotted", - "llm_request_snapshotted", - "user_message_added", "assistant_message_added", + "user_message_added", + "llm_request_snapshotted", + "intent_agent_decision_snapshotted", + "run_started", ] assert run["events"][2]["event_payload"]["messages"][0]["content"] == "hello" + + +def test_debug_chat_logs_are_reverse_chronological_and_include_latency(tmp_path: Path) -> None: + service = AgentService(workspace=tmp_path) + loaded = service.create_loop().boot() + manager = loaded.session_manager + session_id = "web:debug-order" + manager.ensure_session(session_id, source="web", title="Debug order") + + manager.append_message( + session_id, + run_id="run-old", + role="system", + event_type="run_started", + content="old", + context_visible=False, + ) + manager.append_message( + session_id, + run_id="run-old", + role="system", + event_type="run_completed", + event_payload={"latency_ms": {"total_ms": 10.0, "llm_ms": 7.0}}, + finish_reason="stop", + context_visible=False, + ) + sleep(0.01) + manager.append_message( + session_id, + run_id="run-new", + role="system", + event_type="run_started", + content="new", + context_visible=False, + ) + manager.append_message( + session_id, + run_id="run-new", + role="system", + event_type="run_completed", + event_payload={ + "latency_ms": { + "router_ms": 1.0, + "mcp_ms": 2.0, + "skill_assembly_ms": 3.0, + "tool_assembly_ms": 4.0, + "context_build_ms": 5.0, + "llm_ms": 6.0, + "tool_ms": 7.0, + "session_write_ms": 8.0, + "total_ms": 36.0, + } + }, + finish_reason="stop", + context_visible=False, + ) + + app = create_app(service=service, manage_service_lifecycle=False) + with TestClient(app) as client: + response = client.get("/api/debug/chat-logs") + + assert response.status_code == 200 + runs = response.json()["sessions"][0]["runs"] + assert [run["run_id"] for run in runs] == ["run-new", "run-old"] + assert [event["event_type"] for event in runs[0]["events"]] == ["run_completed", "run_started"] + assert runs[0]["latency_ms"]["total_ms"] == 36.0 + assert runs[0]["latency_ms"]["router_ms"] == 1.0 diff --git a/app-instance/backend/tests/unit/test_main_agent_router.py b/app-instance/backend/tests/unit/test_main_agent_router.py index f0e62d7..1c77243 100644 --- a/app-instance/backend/tests/unit/test_main_agent_router.py +++ b/app-instance/backend/tests/unit/test_main_agent_router.py @@ -158,7 +158,7 @@ def test_router_receives_thinking_mode() -> None: provider = RouterProvider('{"action":"simple_chat","reason":"simple"}') decision = asyncio.run( MainAgentRouter().classify( - "你好", + "请判断一下这个概念是否合理", provider=provider, thinking_enabled=False, ) @@ -168,11 +168,84 @@ def test_router_receives_thinking_mode() -> None: assert provider.calls[0]["thinking_enabled"] is False +def test_router_fast_paths_obvious_simple_chat_without_provider_call() -> None: + provider = RouterProvider('{"action":"new_task","reason":"should not be used"}') + + decision = asyncio.run(MainAgentRouter().classify("你好", provider=provider)) + punctuated = asyncio.run(MainAgentRouter().classify("你好!", provider=provider)) + translation = asyncio.run(MainAgentRouter().classify("翻译这句话:hello world", provider=provider)) + + assert not decision.is_task + assert decision.action == "simple_chat" + assert decision.reason == "obvious_simple_chat" + assert not punctuated.is_task + assert punctuated.action == "simple_chat" + assert not translation.is_task + assert translation.action == "simple_chat" + assert provider.calls == [] + + +def test_router_sends_broad_explanations_to_intent_llm() -> None: + provider = RouterProvider('{"action":"simple_chat","reason":"intent decided concept explanation"}') + + explanation = asyncio.run(MainAgentRouter().classify("解释一下什么是 MCP", provider=provider)) + definition = asyncio.run(MainAgentRouter().classify("什么是 context engineering", provider=provider)) + + assert not explanation.is_task + assert explanation.reason == "intent decided concept explanation" + assert not definition.is_task + assert definition.reason == "intent decided concept explanation" + assert len(provider.calls) == 2 + + +def test_router_fast_paths_obvious_task_without_provider_call() -> None: + provider = RouterProvider('{"action":"simple_chat","reason":"should not be used"}') + + decision = asyncio.run(MainAgentRouter().classify("帮我查一下今天深圳天气", provider=provider)) + current_event = asyncio.run( + MainAgentRouter().classify("解释一下今天法国队在世界杯的表现为什么那么好", provider=provider) + ) + + assert decision.is_task + assert decision.action == "create_task" + assert decision.reason == "obvious_task" + assert current_event.is_task + assert current_event.action == "create_task" + assert provider.calls == [] + + +def test_router_does_not_simple_fast_path_current_event_explanations() -> None: + provider = RouterProvider('{"action":"simple_chat","reason":"llm fallback"}') + + decision = asyncio.run(MainAgentRouter().classify("解释一下昨晚法国队在世界杯的表现为什么那么好", provider=provider)) + + assert decision.is_task + assert decision.action == "create_task" + assert decision.reason == "obvious_task" + assert provider.calls == [] + + +def test_router_keeps_active_task_followups_in_llm_path() -> None: + provider = RouterProvider('{"action":"revise_task","reason":"needs revision","short_title":"任务连续性"}') + + decision = asyncio.run( + MainAgentRouter().classify( + "这个也加上", + active_task=_task(), + provider=provider, + ) + ) + + assert decision.is_task + assert decision.action == "revise_task" + assert len(provider.calls) == 1 + + def test_router_injects_intent_skill_guidance() -> None: provider = RouterProvider('{"action":"new_task","reason":"needs weather tool","short_title":"珠海天气"}') decision = asyncio.run( MainAgentRouter().classify( - "帮我查一下今天珠海天气", + "帮我判断这个需求要不要进入任务模式", provider=provider, intent_skill="Weather and current external data must be routed to new_task.", ) @@ -247,7 +320,7 @@ def test_router_retries_once_after_provider_failure() -> None: decision = asyncio.run( MainAgentRouter().classify( - "帮我看看昨天的中美会面都谈了什么?", + "帮我判断这次中美会面分析需求要不要进入任务模式", provider=provider, ) ) @@ -262,7 +335,7 @@ def test_router_fallback_after_two_provider_failures() -> None: decision = asyncio.run( MainAgentRouter().classify( - "帮我看看昨天的中美会面都谈了什么?", + "帮我判断这次中美会面分析需求要不要进入任务模式", provider=provider, ) ) diff --git a/app-instance/backend/tests/unit/test_skill_assembler.py b/app-instance/backend/tests/unit/test_skill_assembler.py index 8a92def..e75c319 100644 --- a/app-instance/backend/tests/unit/test_skill_assembler.py +++ b/app-instance/backend/tests/unit/test_skill_assembler.py @@ -103,7 +103,7 @@ def test_skill_selection_receives_thinking_mode() -> None: assert provider.thinking_enabled is False -def test_skill_assembler_loads_detail_directly_for_small_candidate_sets() -> None: +def test_skill_assembler_directly_activates_single_clear_candidate_without_llm() -> None: provider = SequencedProvider(['["docker-debug"]']) assembler = SkillAssembler(loader=LoaderWithFullSkill(), retriever=StaticRetriever()) @@ -117,10 +117,8 @@ def test_skill_assembler_loads_detail_directly_for_small_candidate_sets() -> Non assert [skill.name for skill in result.activated_skills] == ["docker-debug"] assert result.activated_skills[0].tool_hints == ["search_files"] - assert [item["stage"] for item in result.llm_interactions] == ["final"] - assert len(provider.messages) == 1 - first_user_prompt = provider.messages[0][1]["content"] - assert "Use this skill when doing Docker log triage" in first_user_prompt + assert result.llm_interactions == [] + assert provider.messages == [] def test_skill_assembler_shortlists_before_loading_detail_for_large_candidate_sets() -> None: diff --git a/app-instance/backend/tests/unit/test_skill_learning_eval.py b/app-instance/backend/tests/unit/test_skill_learning_eval.py index 2a16b77..d401e65 100644 --- a/app-instance/backend/tests/unit/test_skill_learning_eval.py +++ b/app-instance/backend/tests/unit/test_skill_learning_eval.py @@ -395,6 +395,52 @@ def test_replay_main_score_uses_validator_not_tool_success(tmp_path: Path) -> No assert report.synthetic_score_avg is not None +def test_replay_real_case_without_validator_uses_same_output_scoring_for_both_arms(tmp_path: Path) -> None: + pipeline = _pipeline(tmp_path, task_score=0.8) + pipeline.learning_store.update_learning_candidate( + "candidate-1", + evidence={ + "eval_cases": [ + { + "run_id": "real-no-validator", + "task_id": "real-no-validator", + "session_id": "eval", + "task_text": "Summarize the release checklist.", + "accepted_score": 0.8, + } + ] + }, + ) + draft = pipeline.draft_service.create_new_skill_draft( + skill_name="release-checklist", + proposed_content="# Release\n\nRun tests.", + proposed_frontmatter={"description": "release", "tools": []}, + created_by="test", + reason="test", + ) + pipeline.learning_store.update_learning_candidate("candidate-1", draft_skill_name=draft.skill_name, draft_id=draft.draft_id) + + report = asyncio.run( + pipeline.evaluate_draft( + "candidate-1", + draft.skill_name, + draft.draft_id, + provider_bundle=_bundle(), + replay_runner=FakeReplayRunner( + baseline_answer="Release checklist summarized.", + candidate_answer="Release checklist summarized.", + ), + ) + ) + + case = next(item for item in report.case_reports if item["run_id"] == "real-no-validator") + legacy_case = next(item for item in report.cases if item["run_id"] == "real-no-validator") + assert case["baseline_score"] == 0.7 + assert case["candidate_score"] == 0.7 + assert case["delta"] == 0.0 + assert legacy_case["delta"] == 0.0 + + def test_synthetic_cases_without_validator_are_not_replay_scored(tmp_path: Path) -> None: pipeline = _pipeline(tmp_path) pipeline.learning_store.update_learning_candidate( diff --git a/app-instance/backend/tests/unit/test_skill_team_template.py b/app-instance/backend/tests/unit/test_skill_team_template.py new file mode 100644 index 0000000..f0ca37d --- /dev/null +++ b/app-instance/backend/tests/unit/test_skill_team_template.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from beaver.skills.assembler.task_assembler import SkillAssembler +from beaver.skills.catalog.loader import SkillsLoader +from beaver.skills.catalog.utils import extract_skill_team_template + + +def test_extract_team_template_returns_none_when_block_is_absent() -> None: + result = extract_skill_team_template("# Ordinary Skill") + + assert result.template is None + assert result.warnings == [] + + +def test_extract_team_template_parses_valid_json_block() -> None: + result = extract_skill_team_template( + "```beaver-team-template\n" + '{"version": 1, "nodes": [{"node_id": "collect", "task": "Collect"}]}\n' + "```" + ) + + assert result.template == { + "version": 1, + "nodes": [{"node_id": "collect", "task": "Collect"}], + } + assert result.warnings == [] + + +def test_invalid_template_is_warning_not_skill_load_failure() -> None: + result = extract_skill_team_template("```beaver-team-template\nnot-json\n```") + + assert result.template is None + assert result.warnings == ["team template JSON is invalid"] + + +def test_loader_and_assembler_propagate_team_template_to_skill_context(tmp_path) -> None: + skill_dir = tmp_path / "plugin-skills" / "financial-comparison" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text( + "---\n" + "description: Compare financial disclosures.\n" + "---\n\n" + "# Financial Comparison\n\n" + "```beaver-team-template\n" + '{"version": 1, "nodes": [{"node_id": "collect", "task": "Collect official sources"}]}\n' + "```\n", + encoding="utf-8", + ) + loader = SkillsLoader( + tmp_path, + builtin_skills_dir=tmp_path / "no-builtins", + extra_dirs=[tmp_path / "plugin-skills"], + ) + + record = loader.get_skill_record("financial-comparison") + context = SkillAssembler(loader)._activate_skill_contexts(["financial-comparison"])[0] + + assert record is not None + assert record.team_template == { + "version": 1, + "nodes": [{"node_id": "collect", "task": "Collect official sources"}], + } + assert record.team_template_warnings == [] + assert context.team_template == record.team_template + assert context.team_template_warnings == [] diff --git a/app-instance/backend/tests/unit/test_task_evidence.py b/app-instance/backend/tests/unit/test_task_evidence.py index 6206642..5549fcb 100644 --- a/app-instance/backend/tests/unit/test_task_evidence.py +++ b/app-instance/backend/tests/unit/test_task_evidence.py @@ -3,7 +3,65 @@ from __future__ import annotations from pathlib import Path from beaver.engine.session.manager import SessionManager -from beaver.tasks.evidence import EvidenceBuilder, RunEvidence, TaskEvidencePacket, ToolEvidence, render_task_evidence +from beaver.tasks.evidence import ( + EvidenceBuilder, + RunEvidence, + TaskEvidencePacket, + ToolEvidence, + evaluate_node_evidence, + render_task_evidence, +) + + +def _run_evidence(*, tool_results: list[ToolEvidence] | None = None) -> RunEvidence: + return RunEvidence( + run_id="run-1", + session_id="session-1", + output_text="", + finish_reason="stop", + tool_results=list(tool_results or []), + ) + + +def test_evaluate_node_evidence_requires_successful_tool_result() -> None: + evidence = _run_evidence( + tool_results=[ + ToolEvidence( + tool_name="web_fetch", + tool_call_id="call-1", + content="failed", + event_payload={"success": False}, + ) + ] + ) + + assert evaluate_node_evidence(evidence, ["tool_result"], "done") == [ + "missing required evidence: tool_result" + ] + + +def test_evaluate_node_evidence_accepts_url_in_successful_tool_content() -> None: + evidence = _run_evidence( + tool_results=[ + ToolEvidence( + tool_name="web_fetch", + tool_call_id="call-1", + content="Source: https://example.test/report", + event_payload={"success": True}, + ) + ] + ) + + assert evaluate_node_evidence(evidence, ["tool_result", "url"], "done") == [] + + +def test_evaluate_node_evidence_checks_output_and_unknown_requirements() -> None: + evidence = _run_evidence() + + assert evaluate_node_evidence(evidence, ["output", "unknown_type"], " ") == [ + "missing required evidence: output", + "unsupported evidence requirement: unknown_type", + ] def test_evidence_builder_preserves_full_tool_result(tmp_path: Path) -> None: diff --git a/app-instance/backend/tests/unit/test_task_execution_planner.py b/app-instance/backend/tests/unit/test_task_execution_planner.py index e048d7c..be5ea78 100644 --- a/app-instance/backend/tests/unit/test_task_execution_planner.py +++ b/app-instance/backend/tests/unit/test_task_execution_planner.py @@ -3,15 +3,19 @@ from __future__ import annotations import asyncio from types import SimpleNamespace +from beaver.engine.context import SkillContext from beaver.engine.providers.base import LLMProvider, LLMResponse from beaver.engine.providers.factory import ProviderBundle -from beaver.tasks import TaskExecutionPlanner, TaskRecord +from beaver.tasks import SkillResolutionReport, TaskExecutionPlanner, TaskRecord +from beaver.tools.base import BaseTool, ToolContext, ToolResult, ToolSpec +from beaver.tools.registry import ToolRegistry class PlannerProvider(LLMProvider): def __init__(self, response: str) -> None: super().__init__() self.response = response + self.calls: list[dict] = [] async def chat( self, @@ -21,6 +25,15 @@ class PlannerProvider(LLMProvider): max_tokens: int = 4096, temperature: float = 0.7, ) -> LLMResponse: + self.calls.append( + { + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature, + "model": model, + "tools": tools, + } + ) return LLMResponse(content=self.response, finish_reason="stop", provider_name="stub", model="stub-model") def get_default_model(self) -> str: @@ -43,6 +56,28 @@ class HangingPlannerProvider(LLMProvider): return "stub-model" +class SequencedPlannerProvider(PlannerProvider): + def __init__(self, responses: list[str]) -> None: + super().__init__(responses[0]) + self.responses = list(responses) + + async def chat(self, *args, **kwargs) -> LLMResponse: + self.response = self.responses.pop(0) + return await super().chat(*args, **kwargs) + + +class StubTool(BaseTool): + def __init__(self, name: str) -> None: + self._spec = ToolSpec(name=name, description=name, input_schema={"type": "object"}) + + @property + def spec(self) -> ToolSpec: + return self._spec + + async def invoke(self, arguments: dict, context: ToolContext) -> ToolResult: + raise AssertionError("Planner tests do not execute tools") + + def _task() -> TaskRecord: return TaskRecord( task_id="task-1", @@ -59,12 +94,26 @@ def _task() -> TaskRecord: def _bundle(response: str) -> ProviderBundle: + provider = PlannerProvider(response) return ProviderBundle( main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), - main_provider=PlannerProvider(response), + main_provider=provider, ) +def _bundle_with_provider(provider: LLMProvider) -> ProviderBundle: + return ProviderBundle( + main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), + main_provider=provider, + ) + + +def _registry() -> ToolRegistry: + registry = ToolRegistry() + registry.register_many([StubTool("web_search"), StubTool("web_fetch"), StubTool("terminal")]) + return registry + + def _hanging_bundle() -> ProviderBundle: return ProviderBundle( main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), @@ -87,26 +136,55 @@ def test_planner_selects_single_mode() -> None: assert plan.reason == "main agent is enough" +def test_planner_skips_llm_for_simple_task() -> None: + provider = PlannerProvider('{"mode":"team","reason":"should not be used"}') + bundle = ProviderBundle( + main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), + main_provider=provider, + ) + task = _task() + task.description = "查询深圳天气" + task.goal = "查询深圳天气" + + plan = asyncio.run( + TaskExecutionPlanner().plan( + task=task, + user_message="帮我查一下今天深圳天气", + attempt_index=1, + provider_bundle=bundle, + ) + ) + + assert plan.mode == "single" + assert plan.graph is None + assert plan.reason == "planner_skipped_simple_task" + assert provider.calls == [] + + def test_planner_builds_team_graph() -> None: + bundle = _bundle( + """ + { + "mode": "team", + "reason": "needs parallel review", + "strategy": "dag", + "nodes": [ + {"node_id": "research", "task": "research options"}, + {"node_id": "review", "task": "review result", "depends_on": ["research"]} + ], + "final_synthesis_instruction": "merge the findings" + } + """ + ) + provider = bundle.main_provider plan = asyncio.run( TaskExecutionPlanner().plan( task=_task(), user_message="implement workflow", attempt_index=1, - provider_bundle=_bundle( - """ - { - "mode": "team", - "reason": "needs parallel review", - "strategy": "dag", - "nodes": [ - {"node_id": "research", "task": "research options", "agent": {"name": "researcher"}}, - {"node_id": "review", "task": "review result", "agent": {"name": "reviewer"}, "depends_on": ["research"]} - ], - "final_synthesis_instruction": "merge the findings" - } - """ - ), + provider_bundle=bundle, + skill_summaries=["docker-debug: Use docker logs before editing config."], + tool_hints=["terminal", "search_files"], ) ) @@ -116,6 +194,12 @@ def test_planner_builds_team_graph() -> None: assert [node.node_id for node in plan.graph.nodes] == ["research", "review"] assert plan.graph.nodes[1].depends_on == ["research"] assert plan.final_synthesis_instruction == "merge the findings" + assert isinstance(provider, PlannerProvider) + prompt = provider.calls[0]["messages"][1]["content"] + assert "Activated skill summaries" in prompt + assert "docker-debug: Use docker logs before editing config." in prompt + assert "terminal" in prompt + assert "search_files" in prompt def test_planner_timeout_falls_back_to_single() -> None: @@ -134,7 +218,7 @@ def test_planner_timeout_falls_back_to_single() -> None: assert "TimeoutError" in (plan.fallback_error or "") -def test_planner_team_nodes_can_target_skills_without_agent_roles() -> None: +def test_planner_team_nodes_use_task_as_internal_skill_query() -> None: plan = TaskExecutionPlanner().from_json( """ { @@ -144,9 +228,7 @@ def test_planner_team_nodes_can_target_skills_without_agent_roles() -> None: "nodes": [ { "node_id": "api_review", - "task": "review API compatibility", - "skill_query": "API contract compatibility review", - "required_capabilities": ["schema compatibility"] + "task": "review API compatibility" } ] } @@ -158,8 +240,77 @@ def test_planner_team_nodes_can_target_skills_without_agent_roles() -> None: node = plan.graph.nodes[0] assert node.agent.name == "api_review" assert node.agent.role == "" - assert node.agent.metadata["skill_query"] == "API contract compatibility review" - assert node.agent.metadata["required_capabilities"] == ["schema compatibility"] + assert node.agent.metadata["skill_query"] == "review API compatibility" + assert node.agent.metadata["required_capabilities"] == [] + + +def test_planner_accepts_use_skill_and_skill_query() -> None: + plan = TaskExecutionPlanner().from_json( + """ + { + "mode": "team", + "strategy": "sequence", + "nodes": [ + { + "node_id": "collect", + "task": "Collect official sources", + "use_skill": "official-source-research", + "skill_query": "official source verification" + } + ] + } + """ + ) + + assert plan.is_team + assert plan.graph is not None + node = plan.graph.nodes[0] + assert node.agent.metadata["use_skill"] == "official-source-research" + assert node.agent.metadata["skill_query"] == "official source verification" + assert node.inherited_pinned_skills == [] + assert node.allowed_tool_names is None + assert plan.planner_adaptation["node_skill_bindings"] == [ + { + "node_id": "collect", + "use_skill": "official-source-research", + "skill_query": "official source verification", + } + ] + + +def test_planner_defaults_skill_query_to_node_task_when_absent() -> None: + plan = TaskExecutionPlanner().from_json( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"extract","task":"Extract financial metrics","use_skill":"financial-extraction"}]}' + ) + + assert plan.is_team + assert plan.graph is not None + assert plan.graph.nodes[0].agent.metadata["skill_query"] == "Extract financial metrics" + + +def test_planner_adaptation_records_unresolved_use_skill_fallback() -> None: + planner = TaskExecutionPlanner() + plan = planner.from_json( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"extract","task":"Extract metrics","use_skill":"missing-skill",' + '"skill_query":"financial extraction"}]}' + ) + report = SkillResolutionReport( + node_id="extract", + skill_query="financial extraction", + requested_skill_name="missing-skill", + exact_binding_used=False, + warnings=["use_skill unresolved: missing-skill"], + reason="matched published skill", + ) + + planner._merge_skill_resolution_adaptation(plan, [report]) + + assert plan.planner_adaptation["warnings"] == ["use_skill unresolved: missing-skill"] + assert plan.planner_adaptation["node_skill_bindings"][0]["fallback_reason"] == ( + "use_skill unresolved; matched published skill" + ) def test_planner_invalid_outputs_fallback_to_single() -> None: @@ -193,3 +344,216 @@ def test_planner_invalid_outputs_fallback_to_single() -> None: assert unknown_strategy.mode == "single" assert too_many_nodes.mode == "single" assert cyclic.mode == "single" + + +def test_template_plan_builds_generic_worker_and_preserves_v1_contract_fields() -> None: + plan = TaskExecutionPlanner(tool_registry=_registry()).from_json( + """ + { + "mode": "team", + "strategy": "dag", + "nodes": [ + { + "node_id": "collect", + "task": "Collect official sources", + "requested_tools": ["web_search"], + "evidence_contract": {"entities": ["MGM", "Galaxy"]}, + "block_downstream_on_partial": true + } + ], + "adaptation": {"template_used": true} + } + """ + ) + + assert plan.is_team + assert plan.graph is not None + node = plan.graph.nodes[0] + assert node.agent.name == "collect" + assert node.agent.role == "" + assert node.agent.metadata["sub_agent_kind"] == "generic_skill_worker" + assert node.allowed_tool_names == ["web_search"] + assert node.evidence_contract == {"entities": ["MGM", "Galaxy"]} + assert node.block_downstream_on_partial is True + assert plan.planner_adaptation["template_used"] is True + + +def test_unknown_tool_is_removed_and_warned() -> None: + plan = TaskExecutionPlanner(tool_registry=_registry()).from_json( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"collect","task":"Collect","requested_tools":["web_search","not_real"]}]}' + ) + + assert plan.is_team + assert plan.graph is not None + assert plan.graph.nodes[0].allowed_tool_names == ["web_search"] + assert "unknown tool removed: not_real" in plan.planner_adaptation["warnings"] + + +def test_high_risk_tool_is_removed_without_failing_low_risk_plan() -> None: + plan = TaskExecutionPlanner(tool_registry=_registry()).from_json( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"collect","task":"Collect","requested_tools":["web_search","terminal"]}]}' + ) + + assert plan.is_team + assert plan.graph is not None + assert plan.graph.nodes[0].allowed_tool_names == ["web_search"] + assert "requires_high_risk_review: terminal" in plan.planner_adaptation["warnings"] + + +def test_planner_rejects_agent_and_role_node_fields() -> None: + planner = TaskExecutionPlanner(tool_registry=_registry()) + + agent_plan = planner.from_json( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"collect","task":"Collect","agent":{"name":"researcher"}}]}' + ) + role_plan = planner.from_json( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"collect","task":"Collect","role":"researcher"}]}' + ) + + assert agent_plan.mode == "single" + assert "agent" in (agent_plan.fallback_error or "") + assert role_plan.mode == "single" + assert "role" in (role_plan.fallback_error or "") + + +def test_planner_records_primary_template_selection_and_ignored_templates() -> None: + primary = SkillContext( + name="financial-comparison", + version="v1", + content="Compare official financial disclosures.", + team_template={"version": 1, "nodes": [{"node_id": "collect", "task": "Collect"}]}, + ) + secondary = SkillContext( + name="chart-reporting", + version="v2", + content="Render chart-ready Markdown.", + team_template={"version": 1, "nodes": [{"node_id": "report", "task": "Report"}]}, + ) + provider = PlannerProvider( + '{"mode":"team","strategy":"sequence","nodes":[' + '{"node_id":"collect","task":"Collect official sources"}],' + '"adaptation":{"template_used":true}}' + ) + + plan = asyncio.run( + TaskExecutionPlanner(tool_registry=_registry()).plan( + task=_task(), + user_message="compare financial workflow", + attempt_index=1, + provider_bundle=_bundle_with_provider(provider), + activated_skills=[primary, secondary], + ) + ) + + assert plan.planner_adaptation == { + "template_used": True, + "selected_template": "financial-comparison", + "selection_reason": "first activated skill with a valid team template", + "ignored_templates": ["chart-reporting"], + "warnings": [], + } + prompt = provider.calls[0]["messages"][1]["content"] + assert '"skill_name": "financial-comparison"' in prompt + assert "Compare official financial disclosures." in prompt + assert "Render chart-ready Markdown." in prompt + + +def test_malformed_planner_output_repairs_once_without_tools() -> None: + provider = SequencedPlannerProvider( + [ + "not json", + '{"mode":"team","strategy":"sequence","nodes":[{"node_id":"collect","task":"Collect"}]}', + ] + ) + + plan = asyncio.run( + TaskExecutionPlanner(tool_registry=_registry()).plan( + task=_task(), + user_message="implement workflow", + attempt_index=1, + provider_bundle=_bundle_with_provider(provider), + ) + ) + + assert plan.is_team + assert len(provider.calls) == 2 + assert provider.calls[1]["tools"] is None + assert "Repair the invalid planner JSON" in provider.calls[1]["messages"][1]["content"] + + +def test_failed_planner_repair_falls_back_to_single() -> None: + provider = SequencedPlannerProvider(["not json", "still not json"]) + + plan = asyncio.run( + TaskExecutionPlanner(tool_registry=_registry()).plan( + task=_task(), + user_message="implement workflow", + attempt_index=1, + provider_bundle=_bundle_with_provider(provider), + ) + ) + + assert plan.mode == "single" + assert plan.reason == "planner_fallback_single" + assert len(provider.calls) == 2 + + +def test_finance_template_adapts_to_task_oriented_read_only_graph() -> None: + plan = TaskExecutionPlanner(tool_registry=_registry()).from_json( + """ + { + "mode": "team", + "strategy": "dag", + "nodes": [ + { + "node_id": "collect_official_sources", + "task": "Collect MGM and Galaxy official financial disclosures", + "requested_tools": ["web_search", "web_fetch"], + "required_evidence": ["tool_result", "url"] + }, + { + "node_id": "extract_financial_metrics", + "task": "Extract comparable financial metrics from collected sources", + "depends_on": ["collect_official_sources"], + "requested_tools": ["web_fetch"], + "required_evidence": ["output"] + }, + { + "node_id": "validate_metrics", + "task": "Validate metric units, periods, and source consistency", + "depends_on": ["extract_financial_metrics"], + "required_evidence": ["output"] + }, + { + "node_id": "generate_chart_report", + "task": "Generate a Markdown comparison table and chart-ready data without claiming an image or file artifact", + "depends_on": ["validate_metrics"], + "requested_tools": [], + "required_evidence": ["output"] + } + ] + } + """ + ) + + assert plan.is_team + assert plan.graph is not None + assert [node.node_id for node in plan.graph.nodes] == [ + "collect_official_sources", + "extract_financial_metrics", + "validate_metrics", + "generate_chart_report", + ] + assert all(node.agent.role == "" for node in plan.graph.nodes) + assert not {"researcher", "writer", "reviewer", "analyst"}.intersection( + node.node_id for node in plan.graph.nodes + ) + assert plan.graph.nodes[0].allowed_tool_names == ["web_search", "web_fetch"] + assert plan.graph.nodes[-1].allowed_tool_names == [] + report_task = plan.graph.nodes[-1].task.lower() + assert "markdown" in report_task + assert "without claiming an image or file artifact" in report_task diff --git a/app-instance/backend/tests/unit/test_task_mode_feedback.py b/app-instance/backend/tests/unit/test_task_mode_feedback.py index 0675fa7..497e214 100644 --- a/app-instance/backend/tests/unit/test_task_mode_feedback.py +++ b/app-instance/backend/tests/unit/test_task_mode_feedback.py @@ -4,10 +4,12 @@ import asyncio from pathlib import Path from types import SimpleNamespace -from beaver.engine import EngineLoader +from beaver.engine import AgentRunResult, EngineLoader +from beaver.engine.context import SkillContext from beaver.engine.providers.base import LLMProvider, LLMResponse from beaver.engine.providers.factory import ProviderBundle from beaver.services.agent_service import AgentService +from beaver.skills.assembler import SkillAssemblyResult from beaver.tasks import TaskExecutionPlan, TaskService @@ -39,6 +41,44 @@ class StubTaskExecutionPlanner: return TaskExecutionPlan.single("test-single") +class RecordingTaskExecutionPlanner: + def __init__(self) -> None: + self.calls: list[dict] = [] + + async def plan(self, **kwargs) -> TaskExecutionPlan: + self.calls.append(dict(kwargs)) + return TaskExecutionPlan.single("test-single") + + +class RecordingSkillAssembler: + def __init__(self, skills: list[SkillContext]) -> None: + self.skills = list(skills) + self.calls: list[dict] = [] + + async def assemble(self, **kwargs) -> SkillAssemblyResult: + self.calls.append(dict(kwargs)) + return SkillAssemblyResult(activated_skills=list(self.skills)) + + +class RecordingTaskAttemptOrchestrator: + def __init__(self) -> None: + self.calls: list[dict] = [] + + async def run(self, **kwargs) -> AgentRunResult: + self.calls.append(dict(kwargs)) + task = kwargs["task"] + task.task_id = "task-from-orchestrator" + return AgentRunResult( + session_id=kwargs["kwargs"]["session_id"], + run_id="run-from-orchestrator", + output_text="orchestrated", + finish_reason="stop", + tool_iterations=0, + task_id=task.task_id, + task_status=task.status, + ) + + class FakeLearningCandidate: def to_dict(self) -> dict: return {"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"} @@ -101,6 +141,91 @@ def test_task_run_records_evidence_and_waits_for_acceptance(tmp_path: Path) -> N assert "validated" not in event_types +def test_agent_service_records_router_latency(tmp_path: Path) -> None: + service = AgentService( + loader=EngineLoader( + workspace=tmp_path, + task_execution_planner=StubTaskExecutionPlanner(), + ) + ) + + result = asyncio.run( + service.process_direct( + "draft release notes", + session_id="web:latency", + provider_bundle=_bundle("Done"), + ) + ) + + latency = result.usage["latency_ms"] + assert latency["router_ms"] > 0 + + +def test_task_mode_preselects_skills_for_planner_and_reuses_them_in_main_run(tmp_path: Path) -> None: + skill = SkillContext( + name="docker-debug", + content="Use docker logs before editing config.", + version="v1", + content_hash="hash-v1", + activation_reason="llm_selected", + tool_hints=["terminal"], + ) + skill_assembler = RecordingSkillAssembler([skill]) + planner = RecordingTaskExecutionPlanner() + service = AgentService( + loader=EngineLoader( + workspace=tmp_path, + skill_assembler=skill_assembler, + task_execution_planner=planner, + ) + ) + + result = asyncio.run( + service.process_direct( + "debug this workflow", + session_id="web:skill-aware-task", + provider_bundle=_bundle("Done"), + ) + ) + + assert result.task_id + assert len(skill_assembler.calls) == 1 + assert planner.calls + assert planner.calls[0]["skill_summaries"] == ["docker-debug: Use docker logs before editing config."] + assert planner.calls[0]["tool_hints"] == ["terminal"] + + task_service = service.create_loop().boot().task_service + assert task_service is not None + task = task_service.get_task(result.task_id) + assert task is not None + assert task.skill_names == ["docker-debug"] + + +def test_task_mode_delegates_attempt_execution_to_orchestrator(tmp_path: Path) -> None: + orchestrator = RecordingTaskAttemptOrchestrator() + service = AgentService( + loader=EngineLoader( + workspace=tmp_path, + task_execution_planner=StubTaskExecutionPlanner(), + ) + ) + service._build_task_attempt_orchestrator = lambda loaded: orchestrator # type: ignore[attr-defined] + + result = asyncio.run( + service.process_direct( + "draft release notes", + session_id="web:orchestrator", + provider_bundle=_bundle("main runner should not be used"), + ) + ) + + assert result.output_text == "orchestrated" + assert result.run_id == "run-from-orchestrator" + assert len(orchestrator.calls) == 1 + assert orchestrator.calls[0]["message"] == "draft release notes" + assert orchestrator.calls[0]["task"].description == "draft release notes" + + def test_task_mode_injects_prompt_locale_output_language(tmp_path: Path) -> None: service = AgentService( loader=EngineLoader( diff --git a/app-instance/backend/tests/unit/test_task_skill_resolver.py b/app-instance/backend/tests/unit/test_task_skill_resolver.py index fb5d07f..38079ba 100644 --- a/app-instance/backend/tests/unit/test_task_skill_resolver.py +++ b/app-instance/backend/tests/unit/test_task_skill_resolver.py @@ -222,3 +222,179 @@ def test_task_skill_resolver_keeps_summary_nodes_skillless(tmp_path: Path) -> No assert reports[0].ephemeral_used is False assert reports[0].reason == "summary node uses dependency outputs directly" assert provider.calls == [] + + +def test_resolver_exact_binds_use_skill_before_dynamic_lookup(tmp_path: Path) -> None: + _publish_skill(tmp_path, skill_name="official-source-research") + provider = RecordingProvider(['["wrong-dynamic-skill"]']) + resolver = TaskSkillResolver( + skills_loader=SkillsLoader(tmp_path), + draft_service=DraftService(SkillSpecStore(tmp_path)), + ) + graph = ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode( + "collect", + "Collect official sources", + AgentDescriptor( + name="collect", + metadata={ + "use_skill": "official-source-research", + "skill_query": "generic web research", + }, + ), + ) + ], + ) + + resolved, reports = asyncio.run( + resolver.resolve_graph( + graph, + task=_task(), + user_message="collect sources", + attempt_index=1, + provider_bundle=_bundle(provider), + ) + ) + + node = resolved.nodes[0] + assert node.inherited_pinned_skills == ["official-source-research"] + assert [context.name for context in node.inherited_pinned_skill_contexts] == ["official-source-research"] + assert node.agent.metadata["exact_binding_used"] is True + assert reports[0].selected_skill_names == ["official-source-research"] + assert reports[0].exact_binding_used is True + assert reports[0].warnings == [] + assert provider.calls == [] + + +def test_resolver_falls_back_to_skill_query_when_use_skill_missing(tmp_path: Path) -> None: + _publish_skill(tmp_path, skill_name="financial-metric-extraction") + provider = RecordingProvider(['["financial-metric-extraction"]']) + resolver = TaskSkillResolver( + skills_loader=SkillsLoader(tmp_path), + draft_service=DraftService(SkillSpecStore(tmp_path)), + ) + graph = ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode( + "extract", + "Extract metrics", + AgentDescriptor( + name="extract", + metadata={ + "use_skill": "missing-exact-skill", + "skill_query": "financial metric extraction", + }, + ), + ) + ], + ) + + resolved, reports = asyncio.run( + resolver.resolve_graph( + graph, + task=_task(), + user_message="extract financial metrics", + attempt_index=1, + provider_bundle=_bundle(provider), + ) + ) + + assert resolved.nodes[0].inherited_pinned_skills == ["financial-metric-extraction"] + assert reports[0].exact_binding_used is False + assert reports[0].selected_skill_names == ["financial-metric-extraction"] + assert reports[0].warnings == ["use_skill unresolved: missing-exact-skill"] + assert "financial metric extraction" in provider.calls[0][1]["content"] + + +def test_resolver_falls_back_to_ephemeral_when_exact_and_query_miss(tmp_path: Path) -> None: + _publish_skill(tmp_path, skill_name="unrelated-skill") + provider = RecordingProvider( + [ + "[]", + """ + { + "guidance_name": "financial-extraction-guidance", + "description": "Extract financial metrics", + "content": "# Financial Extraction\\n\\nExtract the requested metrics.", + "tags": ["finance"] + } + """, + ] + ) + resolver = TaskSkillResolver( + skills_loader=SkillsLoader(tmp_path), + draft_service=DraftService(SkillSpecStore(tmp_path)), + missing_skill_synthesizer=EphemeralGuidanceSynthesizer(), + ) + graph = ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode( + "extract", + "Extract metrics", + AgentDescriptor( + name="extract", + metadata={ + "use_skill": "missing-exact-skill", + "skill_query": "financial metric extraction", + }, + ), + ) + ], + ) + + resolved, reports = asyncio.run( + resolver.resolve_graph( + graph, + task=_task(), + user_message="extract financial metrics", + attempt_index=1, + provider_bundle=_bundle(provider), + ) + ) + + assert resolved.nodes[0].inherited_pinned_skills == [] + assert resolved.nodes[0].inherited_pinned_skill_contexts[0].name == "ephemeral:financial-extraction-guidance" + assert reports[0].ephemeral_used is True + assert reports[0].warnings == ["use_skill unresolved: missing-exact-skill"] + + +def test_explicit_use_skill_is_preserved_for_summary_without_nested_expansion(tmp_path: Path) -> None: + _publish_skill(tmp_path, skill_name="summary-formatting") + provider = RecordingProvider([]) + resolver = TaskSkillResolver( + skills_loader=SkillsLoader(tmp_path), + draft_service=DraftService(SkillSpecStore(tmp_path)), + ) + graph = ExecutionGraph( + strategy="dag", + nodes=[ + ExecutionNode( + "summarize", + "Compile a summary from dependency outputs", + AgentDescriptor( + name="summarize", + metadata={"use_skill": "summary-formatting", "skill_query": "Summarization"}, + ), + depends_on=["collect"], + ) + ], + ) + + resolved, reports = asyncio.run( + resolver.resolve_graph( + graph, + task=_task(), + user_message="summarize", + attempt_index=1, + provider_bundle=_bundle(provider), + ) + ) + + assert len(resolved.nodes) == 1 + assert resolved.nodes[0].inherited_pinned_skills == ["summary-formatting"] + assert reports[0].exact_binding_used is True + assert provider.calls == [] diff --git a/app-instance/backend/tests/unit/test_task_team_synthesis_outcome.py b/app-instance/backend/tests/unit/test_task_team_synthesis_outcome.py new file mode 100644 index 0000000..b8c0450 --- /dev/null +++ b/app-instance/backend/tests/unit/test_task_team_synthesis_outcome.py @@ -0,0 +1,233 @@ +from __future__ import annotations + +import asyncio +from types import SimpleNamespace +from typing import Any + +import pytest + +from beaver.coordinator import AgentDescriptor, ExecutionGraph, ExecutionNode, NodeRunResult, TeamRunResult +from beaver.engine import AgentRunResult +from beaver.tasks import TaskExecutionPlan, TaskRecord +from beaver.tasks.attempt_orchestrator import TaskAttemptOrchestrator + + +def _plan(*, optional_second: bool = False) -> TaskExecutionPlan: + return TaskExecutionPlan( + mode="team", + reason="test team", + graph=ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode("collect", "Collect", AgentDescriptor(name="collect")), + ExecutionNode( + "report", + "Report", + AgentDescriptor(name="report"), + required_for_completion=not optional_second, + ), + ], + ), + ) + + +def _team_result(*results: NodeRunResult) -> TeamRunResult: + return TeamRunResult( + success=all(result.success for result in results), + summary="team summary", + node_results=list(results), + ) + + +def _result(node_id: str, status: str, *, gaps: list[str] | None = None) -> NodeRunResult: + return NodeRunResult( + node_id=node_id, + success=status == "succeeded", + output_text=f"{node_id} output", + finish_reason="blocked" if status == "blocked" else "stop", + error=None if status == "succeeded" else f"{status} node", + completion_status=status, + evidence_gaps=list(gaps or []), + ) + + +def test_required_partial_node_marks_synthesis_incomplete() -> None: + context, prefix, metadata = TaskAttemptOrchestrator._team_synthesis_outcome( + _plan(), + _team_result( + _result("collect", "partial", gaps=["missing required evidence: url"]), + _result("report", "succeeded"), + ), + ) + + assert metadata["task_outcome"] == "incomplete" + assert metadata["incomplete_node_ids"] == ["collect"] + assert metadata["evidence_gaps"] == {"collect": ["missing required evidence: url"]} + assert "Task outcome: incomplete" in context + assert "missing required evidence: url" in context + assert prefix.startswith("任务未完成:") + + +@pytest.mark.parametrize("status", ["failed", "blocked"]) +def test_required_failed_or_blocked_node_marks_synthesis_incomplete(status: str) -> None: + _, prefix, metadata = TaskAttemptOrchestrator._team_synthesis_outcome( + _plan(), + _team_result(_result("collect", status), _result("report", "succeeded")), + ) + + assert metadata["task_outcome"] == "incomplete" + assert metadata["incomplete_node_ids"] == ["collect"] + assert metadata["node_statuses"]["collect"] == status + assert prefix + + +def test_optional_failed_node_does_not_force_incomplete() -> None: + context, prefix, metadata = TaskAttemptOrchestrator._team_synthesis_outcome( + _plan(optional_second=True), + _team_result(_result("collect", "succeeded"), _result("report", "failed")), + ) + + assert metadata["task_outcome"] == "complete" + assert metadata["incomplete_node_ids"] == [] + assert "Task outcome: complete" in context + assert prefix == "" + + +def test_all_required_nodes_succeeded_is_complete() -> None: + _, prefix, metadata = TaskAttemptOrchestrator._team_synthesis_outcome( + _plan(), + _team_result(_result("collect", "succeeded"), _result("report", "succeeded")), + ) + + assert metadata["task_outcome"] == "complete" + assert prefix == "" + + +def test_single_plan_outcome_does_not_add_prefix() -> None: + context, prefix, metadata = TaskAttemptOrchestrator._team_synthesis_outcome( + TaskExecutionPlan.single("single"), + None, + ) + + assert metadata["task_outcome"] == "single" + assert "Task outcome: single" in context + assert prefix == "" + + +class FakeTaskService: + def start_run(self, task_id: str, **_: Any) -> None: + return None + + def append_run(self, task_id: str, run_id: str, **_: Any) -> TaskRecord: + return self.task + + +class FakeSessionManager: + def __init__(self) -> None: + self.events: list[dict[str, Any]] = [] + + def append_message(self, session_id: str, **kwargs: Any) -> None: + self.events.append({"session_id": session_id, **kwargs}) + + def update_latest_assistant_event_payload(self, *args: Any, **kwargs: Any) -> None: + return None + + def get_run_event_records(self, session_id: str, run_id: str) -> list[Any]: + return [] + + +class FixedPlanner: + def __init__(self, plan: TaskExecutionPlan) -> None: + self.fixed_plan = plan + + async def plan(self, **_: Any) -> TaskExecutionPlan: + return self.fixed_plan + + +def _task() -> TaskRecord: + return TaskRecord( + task_id="task-1", + session_id="session-1", + description="finance comparison", + goal="finance comparison", + constraints=[], + priority=0, + status="open", + creator="test", + created_at="now", + updated_at="now", + ) + + +def test_incomplete_team_still_runs_tool_free_synthesis_and_prefixes_output() -> None: + plan = _plan() + team_result = _team_result( + _result("collect", "partial", gaps=["missing required evidence: url"]), + _result("report", "succeeded"), + ) + task = _task() + task_service = FakeTaskService() + task_service.task = task + session_manager = FakeSessionManager() + loaded = SimpleNamespace( + task_service=task_service, + task_execution_planner=FixedPlanner(plan), + session_manager=session_manager, + run_memory_store=None, + ) + orchestrator = TaskAttemptOrchestrator( + loaded=loaded, + create_loop=lambda: None, + make_provider_bundle_for_task=lambda *_: None, + ) + + async def fake_run_team(*args: Any, **kwargs: Any) -> tuple[TeamRunResult, None]: + return team_result, None + + runner_calls: list[dict[str, Any]] = [] + + async def runner(message: str, **kwargs: Any) -> AgentRunResult: + runner_calls.append(kwargs) + return AgentRunResult( + session_id="session-1", + run_id="main-run", + output_text="Available financial comparison.", + finish_reason="stop", + tool_iterations=0, + ) + + orchestrator._run_team_for_task = fake_run_team # type: ignore[method-assign] + result = asyncio.run( + orchestrator.run( + message="compare finance", + runner=runner, + kwargs={ + "session_id": "session-1", + "provider_bundle": SimpleNamespace(), + "include_skill_assembly": False, + }, + task=task, + ) + ) + + assert len(runner_calls) == 1 + assert runner_calls[0]["include_tools"] is False + assert runner_calls[0]["max_tool_iterations"] == 0 + assert "Task outcome: incomplete" in runner_calls[0]["execution_context"] + assert result.output_text.startswith("任务未完成:") + synthesis_event = [event for event in session_manager.events if event.get("event_type") == "task_synthesis_completed"][0] + assert synthesis_event["event_payload"]["task_outcome"] == "incomplete" + assert synthesis_event["event_payload"]["incomplete_node_ids"] == ["collect"] + assert synthesis_event["event_payload"]["node_statuses"] == { + "collect": "partial", + "report": "succeeded", + } + assert synthesis_event["event_payload"]["evidence_gaps"] == { + "collect": ["missing required evidence: url"] + } + + +def test_incomplete_notice_is_not_prefixed_twice() -> None: + text = "任务未完成:缺少官方来源。" + + assert TaskAttemptOrchestrator._apply_incomplete_prefix(text, "任务未完成:部分步骤缺少证据。\n\n") == text diff --git a/app-instance/backend/tests/unit/test_team_node_tool_policy.py b/app-instance/backend/tests/unit/test_team_node_tool_policy.py new file mode 100644 index 0000000..43326f2 --- /dev/null +++ b/app-instance/backend/tests/unit/test_team_node_tool_policy.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import asyncio +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +from beaver.coordinator import AgentDescriptor, DelegationEnvelope, ExecutionGraph, ExecutionNode, NodeRunResult +from beaver.coordinator.execution.scheduler import TeamGraphScheduler +from beaver.coordinator.local import LocalAgentRunner +from beaver.engine import AgentLoop, EngineLoader +from beaver.engine.providers.base import LLMProvider, LLMResponse +from beaver.engine.providers.factory import ProviderBundle +from beaver.tools import BaseTool, ToolContext, ToolExecutor, ToolRegistry, ToolResult, ToolSpec + + +class RecordingProvider(LLMProvider): + def __init__(self) -> None: + super().__init__() + self.calls: list[dict[str, Any]] = [] + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + model: str | None = None, + max_tokens: int | None = None, + temperature: float = 0.7, + thinking_enabled: bool | None = None, + ) -> LLMResponse: + self.calls.append({"messages": messages, "tools": tools}) + return LLMResponse(content="done", finish_reason="stop", provider_name="stub", model="stub") + + def get_default_model(self) -> str: + return "stub" + + +class StaticToolAssembler: + def __init__(self, specs: list[ToolSpec]) -> None: + self.specs = specs + + async def assemble(self, **_: Any) -> list[ToolSpec]: + return list(self.specs) + + +class StubTool(BaseTool): + def __init__(self, name: str) -> None: + self._spec = ToolSpec(name=name, description=name, input_schema={"type": "object"}) + self.calls = 0 + + @property + def spec(self) -> ToolSpec: + return self._spec + + async def invoke(self, arguments: dict[str, Any], context: ToolContext) -> ToolResult: + self.calls += 1 + return ToolResult(True, "called", self.spec.name) + + +class CapturingRunner: + def __init__(self) -> None: + self.envelopes: list[DelegationEnvelope] = [] + + async def run(self, envelope: DelegationEnvelope, **_: Any) -> NodeRunResult: + self.envelopes.append(envelope) + return NodeRunResult( + node_id=envelope.node_id or envelope.agent.name, + success=True, + output_text="done", + finish_reason="stop", + ) + + +def _bundle(provider: LLMProvider) -> ProviderBundle: + return ProviderBundle( + main_runtime=SimpleNamespace(model="stub", provider_name="stub"), + main_provider=provider, + ) + + +def _loop(tmp_path: Path) -> AgentLoop: + loop = AgentLoop(loader=EngineLoader(workspace=tmp_path)) + loaded = loop.boot() + specs = [loaded.tool_registry.get(name).spec for name in ("read_file", "web_search")] + loaded.tool_assembler = StaticToolAssembler(specs) # type: ignore[assignment] + return loop + + +def _tool_names(tools: list[dict] | None) -> list[str]: + return [str(tool["function"]["name"]) for tool in tools or []] + + +def _graph(allowed_tool_names: list[str] | None) -> ExecutionGraph: + return ExecutionGraph( + strategy="sequence", + nodes=[ + ExecutionNode( + node_id="collect", + task="collect", + agent=AgentDescriptor(name="collect"), + allowed_tool_names=allowed_tool_names, + ) + ], + ) + + +def test_none_tool_scope_preserves_legacy_selection(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider() + + asyncio.run( + loop.process_direct( + "collect", + allowed_tool_names=None, + include_skill_assembly=False, + provider_bundle=_bundle(provider), + ) + ) + + assert _tool_names(provider.calls[0]["tools"]) == ["read_file", "web_search"] + + +def test_empty_tool_scope_exposes_no_tools(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider() + + asyncio.run( + loop.process_direct( + "collect", + allowed_tool_names=[], + include_skill_assembly=False, + provider_bundle=_bundle(provider), + ) + ) + + assert _tool_names(provider.calls[0]["tools"]) == [] + + +def test_named_tool_scope_exposes_only_allowed_schema(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider() + + asyncio.run( + loop.process_direct( + "collect", + allowed_tool_names=["web_search"], + include_skill_assembly=False, + provider_bundle=_bundle(provider), + ) + ) + + assert _tool_names(provider.calls[0]["tools"]) == ["web_search"] + + +def test_executor_rejects_registered_tool_outside_node_allowlist() -> None: + registry = ToolRegistry() + write_file = StubTool("write_file") + registry.register(write_file) + executor = ToolExecutor(registry) + context = ToolContext(metadata={"allowed_tool_names": ["web_search"]}) + + result = asyncio.run(executor.execute("write_file", {"path": "x"}, context=context)) + + assert result.success is False + assert result.error == "tool_not_allowed" + assert write_file.calls == 0 + + +def test_local_agent_runner_passes_node_tool_scope(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider() + envelope = DelegationEnvelope( + parent_task_id="task-parent", + parent_session_id="session-root", + parent_run_id="run-root", + agent=AgentDescriptor(name="collect"), + task="collect", + node_id="collect", + allowed_tool_names=[], + ) + + result = asyncio.run(LocalAgentRunner(loop).run(envelope, provider_bundle=_bundle(provider))) + + assert result.success is True + assert _tool_names(provider.calls[0]["tools"]) == [] + + +def test_scheduler_copies_named_node_tool_scope_to_envelope() -> None: + runner = CapturingRunner() + + asyncio.run( + TeamGraphScheduler(runner).run( # type: ignore[arg-type] + _graph(["web_search"]), + parent_task_id="task-parent", + parent_session_id="session-root", + ) + ) + + assert runner.envelopes[0].allowed_tool_names == ["web_search"] + + +def test_empty_tool_scope_reaches_provider_through_real_team_path(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider() + + asyncio.run( + TeamGraphScheduler(LocalAgentRunner(loop)).run( + _graph([]), + parent_task_id="task-parent", + parent_session_id="session-root", + provider_bundle=_bundle(provider), + ) + ) + + assert _tool_names(provider.calls[0]["tools"]) == [] + + +def test_none_tool_scope_preserves_tools_through_real_team_path(tmp_path: Path) -> None: + loop = _loop(tmp_path) + provider = RecordingProvider() + + asyncio.run( + TeamGraphScheduler(LocalAgentRunner(loop)).run( + _graph(None), + parent_task_id="task-parent", + parent_session_id="session-root", + provider_bundle=_bundle(provider), + ) + ) + + assert _tool_names(provider.calls[0]["tools"]) == ["read_file", "web_search"] diff --git a/app-instance/backend/tests/unit/test_user_file_service.py b/app-instance/backend/tests/unit/test_user_file_service.py index a1fcf53..a7bbcbb 100644 --- a/app-instance/backend/tests/unit/test_user_file_service.py +++ b/app-instance/backend/tests/unit/test_user_file_service.py @@ -11,6 +11,7 @@ from beaver.services.user_files import ( UserFileNotFoundError, UserFilePathError, UserFileSizeError, + UserFileStorageError, UserFileService, normalize_user_path, ) @@ -151,3 +152,68 @@ def test_minio_storage_rejects_paths_that_escape_namespace() -> None: with pytest.raises(UserFilePathError): storage._user_path("users/bob/uploads/secret.txt") + + +@pytest.mark.asyncio +async def test_minio_storage_translates_s3_errors_to_user_file_errors() -> None: + from minio.error import S3Error + + class FakeMinioClient: + def list_objects(self, *args, **kwargs): + raise S3Error( + None, + "SignatureDoesNotMatch", + "The request signature we calculated does not match", + "/beaver-user-files", + "request-id", + "host-id", + bucket_name="beaver-user-files", + ) + + storage = object.__new__(MinIOUserFileStorage) + storage.config = MinIOStorageConfig( + endpoint="minio.local:9000", + access_key="alice-access", + secret_key="alice-secret", + bucket="beaver-user-files", + namespace="users/alice", + ) + storage.client = FakeMinioClient() + + with pytest.raises(UserFileStorageError) as exc_info: + await storage.list_dir("uploads") + + assert "SignatureDoesNotMatch" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_minio_storage_does_not_report_auth_errors_as_missing_files() -> None: + from minio.error import S3Error + + class FakeMinioClient: + def stat_object(self, *args, **kwargs): + raise S3Error( + None, + "SignatureDoesNotMatch", + "The request signature we calculated does not match", + "/beaver-user-files/uploads/input.txt", + "request-id", + "host-id", + bucket_name="beaver-user-files", + object_name="users/alice/uploads/input.txt", + ) + + storage = object.__new__(MinIOUserFileStorage) + storage.config = MinIOStorageConfig( + endpoint="minio.local:9000", + access_key="alice-access", + secret_key="alice-secret", + bucket="beaver-user-files", + namespace="users/alice", + ) + storage.client = FakeMinioClient() + + with pytest.raises(UserFileStorageError) as exc_info: + await storage.read_file("uploads/input.txt") + + assert "SignatureDoesNotMatch" in str(exc_info.value) diff --git a/app-instance/backend/tests/unit/test_web_files_api.py b/app-instance/backend/tests/unit/test_web_files_api.py index 32fd6f5..9ee7e48 100644 --- a/app-instance/backend/tests/unit/test_web_files_api.py +++ b/app-instance/backend/tests/unit/test_web_files_api.py @@ -7,7 +7,7 @@ from fastapi.testclient import TestClient from beaver.interfaces.web.app import create_app from beaver.services.agent_service import AgentService from beaver.services.user_file_resolver import UserFileStorageResolver -from beaver.services.user_files import LocalUserFileStorage, UserFileService +from beaver.services.user_files import LocalUserFileStorage, UserFileService, UserFileStorageError def _auth_headers(app, username: str = "alice") -> dict[str, str]: @@ -191,6 +191,26 @@ def test_user_files_api_authenticated_request_resolves_identity(tmp_path: Path, assert seen[0].storage_namespace == "users/alice" +def test_user_files_api_reports_storage_errors_as_unavailable(tmp_path: Path, monkeypatch) -> None: + service = AgentService(workspace=tmp_path) + app = create_app(service=service, manage_service_lifecycle=False) + + class BrokenStorage: + async def list_dir(self, path: str): + raise UserFileStorageError("User file storage list directory failed: SignatureDoesNotMatch") + + async def fake_service(self): + return UserFileService(BrokenStorage()) + + monkeypatch.setattr(UserFileStorageResolver, "service", fake_service) + + with TestClient(app) as client: + response = client.get("/api/user-files/browse", params={"path": "uploads"}, headers=_auth_headers(app)) + + assert response.status_code == 503 + assert "SignatureDoesNotMatch" in response.json()["detail"] + + def test_user_files_api_streams_upload_and_enforces_configured_limit(tmp_path: Path, monkeypatch) -> None: monkeypatch.setenv("BEAVER_USER_FILES_MAX_UPLOAD_BYTES", "5") service = AgentService(workspace=tmp_path) diff --git a/app-instance/backend/tests/unit/test_web_tools.py b/app-instance/backend/tests/unit/test_web_tools.py index de5f8a9..c88122f 100644 --- a/app-instance/backend/tests/unit/test_web_tools.py +++ b/app-instance/backend/tests/unit/test_web_tools.py @@ -2,23 +2,43 @@ from __future__ import annotations import asyncio import json +import sys +import types from beaver.tools.builtins import web +def _disable_ddgs(monkeypatch) -> None: + def _raise_unavailable(query: str, limit: int) -> list[dict[str, str]]: + raise ModuleNotFoundError("ddgs disabled for fallback test") + + monkeypatch.setattr(web, "_search_ddgs", _raise_unavailable) + + class _FakeResponse: headers = {"content-type": "text/html"} status_code = 200 + fetch_html = """ + +
Example result