diff --git a/app-instance/backend/beaver/engine/context/__init__.py b/app-instance/backend/beaver/engine/context/__init__.py index 090d3a3..a14f33a 100644 --- a/app-instance/backend/beaver/engine/context/__init__.py +++ b/app-instance/backend/beaver/engine/context/__init__.py @@ -4,6 +4,7 @@ from .builder import ( ContextBuildInput, ContextBuildResult, ContextBuilder, + RuntimeContext, SessionContext, SkillContext, ) @@ -12,6 +13,7 @@ __all__ = [ "ContextBuildInput", "ContextBuildResult", "ContextBuilder", + "RuntimeContext", "SessionContext", "SkillContext", ] diff --git a/app-instance/backend/beaver/engine/context/builder.py b/app-instance/backend/beaver/engine/context/builder.py index e365897..2ca040f 100644 --- a/app-instance/backend/beaver/engine/context/builder.py +++ b/app-instance/backend/beaver/engine/context/builder.py @@ -80,6 +80,16 @@ class SessionContext: parent_session_id: str | None = None +@dataclass(slots=True) +class RuntimeContext: + """Per-run runtime facts that should be visible to the model.""" + + utc_datetime: str + local_datetime: str + timezone: str | None = None + utc_offset: str | None = None + + @dataclass(slots=True) class ContextBuildInput: """一次上下文构建所需的全部输入。 @@ -103,6 +113,7 @@ class ContextBuildInput: memory_snapshot: MemorySnapshot | None = None activated_skills: list[SkillContext] = field(default_factory=list) session_context: SessionContext | None = None + runtime_context: RuntimeContext | None = None execution_context: str | None = None extra_sections: list[str] = field(default_factory=list) @@ -143,9 +154,10 @@ class ContextBuilder: 1. Beaver user-facing assistant identity 2. base system prompt 3. session metadata - 4. execution context - 5. frozen memory snapshot - 6. extra sections + 4. runtime date/time + 5. execution context + 6. frozen memory snapshot + 7. extra sections 这样设计的原因: - 身份与总规则要最靠前 @@ -164,6 +176,10 @@ class ContextBuilder: if session_section: sections.append(session_section) + runtime_section = self._render_runtime_section(build_input.runtime_context) + if runtime_section: + sections.append(runtime_section) + execution_context = (build_input.execution_context or "").strip() if execution_context: sections.append(f"# Execution Context\n\n{execution_context}") @@ -347,6 +363,31 @@ class ContextBuilder: return None return "# Current Session\n\n" + "\n".join(rows) + def _render_runtime_section(self, runtime_context: RuntimeContext | None) -> str | None: + """Render date/time facts captured for the current model run.""" + + if runtime_context is None: + return None + + rows: list[str] = [] + if runtime_context.utc_datetime: + rows.append(f"Current UTC time: {runtime_context.utc_datetime}") + if runtime_context.local_datetime: + rows.append(f"Current local time: {runtime_context.local_datetime}") + if runtime_context.timezone: + rows.append(f"Local timezone: {runtime_context.timezone}") + if runtime_context.utc_offset: + rows.append(f"Local UTC offset: {runtime_context.utc_offset}") + + if not rows: + return None + return ( + "# Current Date and Time\n\n" + + "\n".join(rows) + + "\n\nUse this section as authoritative for relative date/time references such as " + '"today", "tomorrow", "now", "this week", and "next month".' + ) + def build_skill_activation_messages(self, activated_skills: list[SkillContext]) -> list[dict[str, str]]: """把已激活 skill 转成显式消息。 diff --git a/app-instance/backend/beaver/engine/loader.py b/app-instance/backend/beaver/engine/loader.py index 86362f6..e12e14d 100644 --- a/app-instance/backend/beaver/engine/loader.py +++ b/app-instance/backend/beaver/engine/loader.py @@ -24,7 +24,7 @@ from beaver.skills.learning.eval import SkillDraftEvaluator from beaver.skills.publisher import SkillPublisher from beaver.skills.reviews import ReviewService from beaver.skills.specs import SkillSpecStore -from beaver.tasks import TaskExecutionPlanner, TaskService, ValidationService +from beaver.tasks import TaskExecutionPlanner, TaskService from beaver.tasks.skill_resolver import TaskSkillResolver from beaver.skills import SkillAssembler, SkillsLoader from beaver.tools import ObjectBackedTool, ToolAssembler, ToolExecutor, ToolRegistry @@ -91,7 +91,6 @@ class EngineLoadResult: task_skill_resolver: TaskSkillResolver | None = None task_service: TaskService | None = None task_execution_planner: TaskExecutionPlanner | None = None - validation_service: ValidationService | None = None mcp_manager: MCPConnectionManager | None = None mcp_report: dict[str, dict] = field(default_factory=dict) closeables: list[tuple[str, Callable[[], None]]] = field(default_factory=list, repr=False) @@ -166,7 +165,6 @@ class EngineLoader: task_skill_resolver: TaskSkillResolver | None = None, task_service: TaskService | None = None, task_execution_planner: TaskExecutionPlanner | None = None, - validation_service: ValidationService | None = None, ) -> None: self.config = config or load_config(workspace=workspace, config_path=config_path) configured_workspace = self.config.agents_defaults.workspace @@ -192,7 +190,6 @@ class EngineLoader: self._task_skill_resolver = task_skill_resolver self._task_service = task_service self._task_execution_planner = task_execution_planner - self._validation_service = validation_service def load(self) -> EngineLoadResult: """装配当前主链需要的最小 runtime 对象。""" @@ -276,7 +273,6 @@ class EngineLoader: ) task_service = self._task_service or TaskService(workspace / "tasks") task_execution_planner = self._task_execution_planner or TaskExecutionPlanner(task_skill_resolver=task_skill_resolver) - validation_service = self._validation_service or ValidationService() mcp_manager = MCPConnectionManager( self.config.tools.mcp_servers, authz_config=self.config.authz, @@ -311,7 +307,6 @@ class EngineLoader: task_skill_resolver=task_skill_resolver, task_service=task_service, task_execution_planner=task_execution_planner, - validation_service=validation_service, mcp_manager=mcp_manager, ) if self._session_manager is None: diff --git a/app-instance/backend/beaver/engine/loop.py b/app-instance/backend/beaver/engine/loop.py index 235e0fc..6749a0b 100644 --- a/app-instance/backend/beaver/engine/loop.py +++ b/app-instance/backend/beaver/engine/loop.py @@ -4,12 +4,15 @@ from __future__ import annotations import asyncio import json +import os +import re from dataclasses import dataclass, field from datetime import datetime, timezone from typing import Any from uuid import uuid4 +from zoneinfo import ZoneInfo, ZoneInfoNotFoundError -from beaver.engine.context import ContextBuildInput, SessionContext, SkillContext +from beaver.engine.context import ContextBuildInput, RuntimeContext, SessionContext, SkillContext from beaver.memory.runs import RunRecord, SkillEffectRecord from beaver.skills.learning import RunReceiptContext from beaver.skills.catalog.utils import strip_frontmatter @@ -26,6 +29,17 @@ TOOL_FAILURE_GUIDANCE_PROMPT = ( "Use available materials, state uncertainty clearly, and provide partial confirmed results." ) +RAW_TOOL_CALL_FALLBACK = ( + "The run reached the configured tool-call limit before producing a reliable final answer. " + "The model attempted another tool call instead of answering, so the raw tool call was suppressed. " + "Please request a revision to continue the task." +) + +_RAW_TOOL_CALL_RE = re.compile( + r"^\s*\s*$|^\s*]+>[\s\S]*?\s*$", + re.IGNORECASE, +) + @dataclass(slots=True) class AgentProfile: @@ -35,8 +49,9 @@ class AgentProfile: system_prompt: str = "" default_model: str = "gpt-4.1-mini" max_tokens: int = 4096 + max_context_messages: int = 1000 temperature: float = 0.2 - max_tool_iterations: int = 8 + max_tool_iterations: int = 30 @dataclass(slots=True) @@ -446,7 +461,7 @@ class AgentLoop: *(pinned_skill_contexts or []), *self._load_pinned_skill_contexts(skills_loader, pinned_skill_names or []), ] - if not include_skill_assembly or thinking_enabled is False: + if not include_skill_assembly: activated_skills = self._merge_skill_contexts(pinned_skills, []) else: skill_query = skill_selection_context or task @@ -512,8 +527,6 @@ class AgentLoop: if not include_tools: selected_tool_specs = [] - elif thinking_enabled is False: - selected_tool_specs = tool_registry.list_specs() else: selected_tool_specs = await tool_assembler.assemble( task_description=task, @@ -543,7 +556,10 @@ class AgentLoop: build_input = ContextBuildInput( base_system_prompt=self.profile.system_prompt, - history=session_manager.get_history(resolved_session_id), + history=session_manager.get_history( + resolved_session_id, + max_messages=max(1, self.profile.max_context_messages), + ), current_user_input=task, memory_snapshot=memory_snapshot, activated_skills=activated_skills, @@ -554,6 +570,7 @@ class AgentLoop: user_id=user_id, parent_session_id=parent_session_id, ), + runtime_context=self._current_runtime_context(), execution_context=execution_context, extra_sections=[TOOL_FAILURE_GUIDANCE_PROMPT], ) @@ -693,6 +710,7 @@ class AgentLoop: tool_calls=assistant_tool_calls or None, finish_reason=response.finish_reason, reasoning=response.reasoning_content, + context_visible=not bool(assistant_tool_calls), source=source, title=title, model=final_model, @@ -707,7 +725,11 @@ class AgentLoop: if not response.has_tool_calls: final_text = response.content or "" - final_finish_reason = response.finish_reason or "stop" + if self._looks_like_raw_tool_call(final_text): + final_text = RAW_TOOL_CALL_FALLBACK + final_finish_reason = "invalid_tool_call_text" + else: + final_finish_reason = response.finish_reason or "stop" break if iterations >= resolved_max_tool_iterations: @@ -719,10 +741,7 @@ class AgentLoop: temperature=resolved_temperature, thinking_enabled=thinking_enabled, ) - final_text = finalized or ( - "Tool loop stopped after reaching the configured iteration limit, " - "and no final answer was produced." - ) + final_text = finalized or RAW_TOOL_CALL_FALLBACK final_finish_reason = "max_tool_iterations_finalized" if finalized else "max_tool_iterations" session_manager.append_message( resolved_session_id, @@ -877,17 +896,14 @@ class AgentLoop: temperature: float, thinking_enabled: bool | None, ) -> str: - final_messages = [ - *messages, - { - "role": "system", - "content": ( - "The configured tool iteration budget is exhausted. Do not call tools. " - "Produce the best final answer from the existing conversation and tool results. " - "State uncertainty explicitly." - ), - }, - ] + final_messages = AgentLoop._with_system_guidance( + messages, + ( + "The configured tool iteration budget is exhausted. Do not call tools. " + "Produce the best final answer from the existing conversation and tool results. " + "State uncertainty explicitly." + ), + ) kwargs: dict[str, Any] = { "messages": final_messages, "tools": None, @@ -898,7 +914,27 @@ class AgentLoop: if thinking_enabled is not None: kwargs["thinking_enabled"] = thinking_enabled response = await provider.chat(**kwargs) - return (response.content or "").strip() + if response.has_tool_calls: + return "" + content = (response.content or "").strip() + if AgentLoop._looks_like_raw_tool_call(content): + return "" + return content + + @staticmethod + def _looks_like_raw_tool_call(content: str | None) -> bool: + if not content: + return False + return bool(_RAW_TOOL_CALL_RE.match(content)) + + @staticmethod + def _with_system_guidance(messages: list[dict[str, Any]], guidance: str) -> list[dict[str, Any]]: + copied = [dict(message) for message in messages] + if copied and copied[0].get("role") == "system": + existing = str(copied[0].get("content") or "").strip() + copied[0]["content"] = "\n\n".join(part for part in (existing, guidance.strip()) if part) + return copied + return [{"role": "system", "content": guidance.strip()}, *copied] @staticmethod def _load_pinned_skill_contexts(skills_loader: Any, skill_names: list[str]) -> list[SkillContext]: @@ -1133,3 +1169,49 @@ class AgentLoop: @staticmethod def _utc_now() -> str: return datetime.now(timezone.utc).isoformat() + + @staticmethod + def _current_runtime_context() -> RuntimeContext: + utc_now = datetime.now(timezone.utc) + timezone_name = AgentLoop._configured_timezone_name() + local_now = datetime.now().astimezone() + rendered_timezone = local_now.tzname() + + if timezone_name: + try: + local_now = utc_now.astimezone(ZoneInfo(timezone_name)) + rendered_timezone = timezone_name + except ZoneInfoNotFoundError: + rendered_timezone = local_now.tzname() or timezone_name + + return RuntimeContext( + utc_datetime=utc_now.isoformat(), + local_datetime=local_now.isoformat(), + timezone=rendered_timezone, + utc_offset=AgentLoop._format_utc_offset(local_now), + ) + + @staticmethod + def _configured_timezone_name() -> str | None: + for value in (os.getenv("BEAVER_RUNTIME_TIMEZONE"), os.getenv("TZ")): + cleaned = (value or "").strip() + if cleaned: + return cleaned + + try: + timezone_file = "/etc/timezone" + if os.path.exists(timezone_file): + with open(timezone_file, encoding="utf-8") as file: + cleaned = file.read().strip() + if cleaned: + return cleaned + except OSError: + return None + return None + + @staticmethod + def _format_utc_offset(value: datetime) -> str | None: + raw = value.strftime("%z") + if not raw: + return None + return f"{raw[:3]}:{raw[3:]}" diff --git a/app-instance/backend/beaver/engine/providers/litellm.py b/app-instance/backend/beaver/engine/providers/litellm.py index 53532f1..bcd8fde 100644 --- a/app-instance/backend/beaver/engine/providers/litellm.py +++ b/app-instance/backend/beaver/engine/providers/litellm.py @@ -119,13 +119,23 @@ class LiteLLMProvider(LLMProvider): @staticmethod def _sanitize_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: sanitized = [] + system_contents: list[str] = [] for message in messages: clean = {key: value for key, value in message.items() if key in _ALLOWED_MSG_KEYS} + if clean.get("role") == "system": + content = clean.get("content") + if isinstance(content, str) and content.strip(): + system_contents.append(content.strip()) + elif content is not None: + system_contents.append(str(content)) + continue if clean.get("role") == "assistant" and "content" not in clean: clean["content"] = None if isinstance(clean.get("tool_calls"), list): clean["tool_calls"] = LiteLLMProvider._sanitize_tool_calls(clean["tool_calls"]) sanitized.append(clean) + if system_contents: + sanitized.insert(0, {"role": "system", "content": "\n\n".join(system_contents)}) return sanitized @staticmethod diff --git a/app-instance/backend/beaver/engine/session/models.py b/app-instance/backend/beaver/engine/session/models.py index 73bfbaf..d2e26a8 100644 --- a/app-instance/backend/beaver/engine/session/models.py +++ b/app-instance/backend/beaver/engine/session/models.py @@ -84,8 +84,10 @@ class MessageRecord: payload["task_id"] = self.event_payload.get("task_id") if self.event_payload.get("task_status"): payload["task_status"] = self.event_payload.get("task_status") - if self.event_payload.get("validation_status"): - payload["validation_status"] = self.event_payload.get("validation_status") + if self.event_payload.get("evidence_status"): + payload["evidence_status"] = self.event_payload.get("evidence_status") + if self.event_payload.get("acceptance_state"): + payload["acceptance_state"] = self.event_payload.get("acceptance_state") if self.event_payload.get("feedback_state"): payload["feedback_state"] = self.event_payload.get("feedback_state") if self.event_payload.get("feedback_error"): diff --git a/app-instance/backend/beaver/foundation/config/loader.py b/app-instance/backend/beaver/foundation/config/loader.py index ce75a8b..19aa4a2 100644 --- a/app-instance/backend/beaver/foundation/config/loader.py +++ b/app-instance/backend/beaver/foundation/config/loader.py @@ -86,6 +86,18 @@ def _parse_agent_defaults(data: dict[str, Any]) -> AgentDefaultsConfig: model=_string(defaults.get("model") or data.get("model")), provider=_string(defaults.get("provider") or data.get("provider")), embedding_model=_string(defaults.get("embeddingModel") or defaults.get("embedding_model") or data.get("embeddingModel")), + max_context_messages=_int( + defaults.get("maxContextMessages") + or defaults.get("max_context_messages") + or data.get("maxContextMessages") + or data.get("max_context_messages") + ), + max_tool_iterations=_int( + defaults.get("maxToolIterations") + or defaults.get("max_tool_iterations") + or data.get("maxToolIterations") + or data.get("max_tool_iterations") + ), ) @@ -217,6 +229,13 @@ def _float(value: Any) -> float | None: return float(value) +def _int(value: Any) -> int | None: + parsed = _float(value) + if parsed is None: + return None + return int(parsed) + + def _bool(value: Any, *, default: bool) -> bool: if isinstance(value, bool): return value diff --git a/app-instance/backend/beaver/foundation/config/schema.py b/app-instance/backend/beaver/foundation/config/schema.py index 61f0d7f..438068d 100644 --- a/app-instance/backend/beaver/foundation/config/schema.py +++ b/app-instance/backend/beaver/foundation/config/schema.py @@ -25,6 +25,8 @@ class AgentDefaultsConfig: model: str | None = None provider: str | None = None embedding_model: str | None = None + max_context_messages: int | None = None + max_tool_iterations: int | None = None @dataclass(slots=True) diff --git a/app-instance/backend/beaver/interfaces/web/app.py b/app-instance/backend/beaver/interfaces/web/app.py index 30ac62c..1efafa8 100644 --- a/app-instance/backend/beaver/interfaces/web/app.py +++ b/app-instance/backend/beaver/interfaces/web/app.py @@ -44,6 +44,8 @@ from .files import ( workspace_file_path, ) from .schemas import ( + WebChatAcceptanceRequest, + WebChatAcceptanceResponse, WebChatFeedbackRequest, WebChatFeedbackResponse, WebChatRequest, @@ -155,6 +157,13 @@ except ModuleNotFoundError: # pragma: no cover - fallback for skeleton-only env return decorator +RAW_TOOL_CALL_DISPLAY_FALLBACK = ( + "The run reached the configured tool-call limit before producing a reliable final answer. " + "The model attempted another tool call instead of answering, so the raw tool call was suppressed. " + "Please request a revision to continue the task." +) + + @asynccontextmanager async def _app_lifespan( app: FastAPI, @@ -365,6 +374,7 @@ def create_app( "workspace_exists": loaded.workspace.exists(), "model": config.default_model or agent_service.profile.default_model, "max_tokens": agent_service.profile.max_tokens, + "max_context_messages": agent_service.profile.max_context_messages, "temperature": agent_service.profile.temperature, "max_tool_iterations": agent_service.profile.max_tool_iterations, "providers": providers_status, @@ -1719,7 +1729,8 @@ def create_app( usage=result.usage, task_id=result.task_id, task_status=result.task_status, - validation_result=result.validation_result, + evidence_status="recorded" if result.task_id else None, + validation_result=None, ) fallback_target = _model_dump(payload.fallback_target) @@ -1769,7 +1780,8 @@ def create_app( usage=result.usage, task_id=result.task_id, task_status=result.task_status, - validation_result=result.validation_result, + evidence_status="recorded" if result.task_id else None, + validation_result=None, ) @app.websocket("/ws/{session_id:path}") @@ -1882,6 +1894,30 @@ def create_app( } ) + @app.post( + "/api/chat/acceptance", + response_model=WebChatAcceptanceResponse, + responses={ + 400: {"model": WebErrorResponse}, + 404: {"model": WebErrorResponse}, + }, + ) + async def chat_acceptance(request: Request, payload: WebChatAcceptanceRequest) -> WebChatAcceptanceResponse: + agent_service = get_agent_service(request) + try: + result = await agent_service.submit_acceptance( + session_id=payload.session_id, + run_id=payload.run_id, + acceptance_type=payload.acceptance_type, + comment=payload.comment, + ) + except ValueError as exc: + detail = str(exc) + status_code = 404 if "No internal task" in detail else 400 + raise HTTPException(status_code=status_code, detail=detail) from exc + + return WebChatAcceptanceResponse(**result) + @app.post( "/api/chat/feedback", response_model=WebChatFeedbackResponse, @@ -1893,10 +1929,10 @@ def create_app( async def chat_feedback(request: Request, payload: WebChatFeedbackRequest) -> WebChatFeedbackResponse: agent_service = get_agent_service(request) try: - result = await agent_service.submit_feedback( + result = await agent_service.submit_acceptance( session_id=payload.session_id, run_id=payload.run_id, - feedback_type=payload.feedback_type, + acceptance_type=payload.feedback_type, comment=payload.comment, ) except ValueError as exc: @@ -1915,15 +1951,21 @@ def _session_detail(session_manager: Any, session_id: str, session: dict[str, An role = event.get("role") if role not in {"user", "assistant"}: continue + content = event.get("content") or "" + comparable_content = str(content).replace("\u200b", "").replace("\u200c", "").replace("\u200d", "").replace("\ufeff", "") + if role == "assistant" and not comparable_content.strip(): + continue + content = _sanitize_user_visible_assistant_content(role=role, content=content) messages.append( { "role": role, - "content": event.get("content") or "", + "content": content, "timestamp": _iso_from_timestamp(event.get("timestamp")), "run_id": event.get("run_id"), "task_id": event.get("task_id"), "task_status": event.get("task_status"), - "validation_status": event.get("validation_status"), + "evidence_status": event.get("evidence_status"), + "acceptance_state": event.get("acceptance_state"), "feedback_state": event.get("feedback_state"), "feedback_error": event.get("feedback_error"), "message_type": event.get("message_type"), @@ -2142,6 +2184,7 @@ def _task_run_views(task: Any, events: list[Any], session_manager: Any, run_memo content = (record.content or "").strip() if not content: continue + content = _sanitize_user_visible_assistant_content(role=record.role, content=content) messages.append( { "role": record.role, @@ -2150,7 +2193,6 @@ def _task_run_views(task: Any, events: list[Any], session_manager: Any, run_memo "tool_name": record.tool_name, } ) - validation = run_record.validation_result if run_record is not None else None views.append( { "run_id": run_id, @@ -2163,7 +2205,8 @@ def _task_run_views(task: Any, events: list[Any], session_manager: Any, run_memo "attempt_index": run_record.attempt_index if run_record is not None else None, "task_text": run_record.task_text if run_record is not None else "", "messages": messages, - "validation_result": validation, + "evidence_status": "recorded", + "validation_result": None, } ) return views @@ -2428,12 +2471,6 @@ def _model_dump(value: Any) -> dict[str, Any] | None: return dict(value) -def _validation_status(validation_result: dict[str, Any] | None) -> str: - if validation_result is None: - return "unknown" - return "passed" if validation_result.get("accepted") is True else "failed" - - def _websocket_input_metadata(payload: dict[str, Any]) -> dict[str, Any]: metadata = payload.get("metadata") if isinstance(payload.get("metadata"), dict) else {} result: dict[str, Any] = dict(metadata) @@ -2467,13 +2504,15 @@ def _int_or_none(value: Any) -> int | None: def _websocket_message_payload(result: Any, *, input_payload: dict[str, Any]) -> dict[str, Any]: - validation_result = getattr(result, "validation_result", None) task_id = getattr(result, "task_id", None) task_status = getattr(result, "task_status", None) return { "type": "message", "role": "assistant", - "content": getattr(result, "output_text", "") or "", + "content": _sanitize_user_visible_assistant_content( + role="assistant", + content=getattr(result, "output_text", "") or "", + ), "session_id": getattr(result, "session_id", None), "run_id": getattr(result, "run_id", None), "finish_reason": getattr(result, "finish_reason", None), @@ -2483,17 +2522,39 @@ def _websocket_message_payload(result: Any, *, input_payload: dict[str, Any]) -> "usage": dict(getattr(result, "usage", {}) or {}), "task_id": task_id, "task_status": task_status, - "validation_result": validation_result, - "validation_status": _validation_status(validation_result), + "evidence_status": "recorded" if task_id else None, + "validation_result": None, "metadata": { "task_id": task_id, "task_status": task_status, - "validation_result": validation_result, + "evidence_status": "recorded" if task_id else None, "input_metadata": _websocket_input_metadata(input_payload), }, } +def _sanitize_user_visible_assistant_content(*, role: str, content: str) -> str: + if role != "assistant": + return content + if _looks_like_raw_tool_call(content): + return RAW_TOOL_CALL_DISPLAY_FALLBACK + return content + + +def _looks_like_raw_tool_call(content: str | None) -> bool: + if not content: + return False + stripped = content.strip() + lowered = stripped.lower() + return ( + lowered.startswith("") + ) or ( + lowered.startswith("") + ) + + def _provider_enabled(provider_name: str, provider_cfg: Any) -> bool: if provider_cfg is None or provider_name == "custom": return False @@ -2980,6 +3041,7 @@ def _write_config_json(path: Path, data: dict[str, Any]) -> None: def _reload_agent_config(agent_service: AgentService, config_path: Path) -> None: config = load_config(config_path=config_path) agent_service.loader.config = config + agent_service._apply_configured_profile_defaults() # noqa: SLF001 loop = getattr(agent_service, "_loop", None) loaded = getattr(loop, "loaded", None) if loop is not None else None if loaded is not None: diff --git a/app-instance/backend/beaver/interfaces/web/schemas/__init__.py b/app-instance/backend/beaver/interfaces/web/schemas/__init__.py index a53fcb7..48d2d5b 100644 --- a/app-instance/backend/beaver/interfaces/web/schemas/__init__.py +++ b/app-instance/backend/beaver/interfaces/web/schemas/__init__.py @@ -1,6 +1,8 @@ """Web request and response schemas.""" from .chat import ( + WebChatAcceptanceRequest, + WebChatAcceptanceResponse, WebChatFeedbackRequest, WebChatFeedbackResponse, WebChatRequest, @@ -13,6 +15,8 @@ from .chat import ( ) __all__ = [ + "WebChatAcceptanceRequest", + "WebChatAcceptanceResponse", "WebChatFeedbackRequest", "WebChatFeedbackResponse", "WebChatRequest", diff --git a/app-instance/backend/beaver/interfaces/web/schemas/chat.py b/app-instance/backend/beaver/interfaces/web/schemas/chat.py index 7fc6ab7..c5127a4 100644 --- a/app-instance/backend/beaver/interfaces/web/schemas/chat.py +++ b/app-instance/backend/beaver/interfaces/web/schemas/chat.py @@ -82,11 +82,34 @@ class WebChatResponse(BaseModel): usage: dict[str, Any] = Field(default_factory=dict) task_id: str | None = None task_status: str | None = None + evidence_status: str | None = None + acceptance_state: str | None = None validation_result: dict[str, Any] | None = None +class WebChatAcceptanceRequest(BaseModel): + """User acceptance on the latest assistant result in chat.""" + + session_id: str + run_id: str + acceptance_type: str + comment: str | None = None + + +class WebChatAcceptanceResponse(BaseModel): + """Acceptance recording result.""" + + session_id: str + run_id: str + task_id: str + task_status: str + acceptance_type: str + feedback_type: str + learning_candidates: list[dict[str, Any]] = Field(default_factory=list) + + class WebChatFeedbackRequest(BaseModel): - """Feedback on the latest assistant result in chat.""" + """Backward-compatible feedback payload.""" session_id: str run_id: str @@ -94,15 +117,8 @@ class WebChatFeedbackRequest(BaseModel): comment: str | None = None -class WebChatFeedbackResponse(BaseModel): - """Feedback recording result.""" - - session_id: str - run_id: str - task_id: str - task_status: str - feedback_type: str - learning_candidates: list[dict[str, Any]] = Field(default_factory=list) +class WebChatFeedbackResponse(WebChatAcceptanceResponse): + """Backward-compatible feedback response.""" class WebProviderConfigRequest(BaseModel): diff --git a/app-instance/backend/beaver/services/agent_service.py b/app-instance/backend/beaver/services/agent_service.py index e489995..a8331bf 100644 --- a/app-instance/backend/beaver/services/agent_service.py +++ b/app-instance/backend/beaver/services/agent_service.py @@ -29,9 +29,9 @@ from beaver.tasks import ( TaskEvidencePacket, TaskExecutionPlan, TaskRecord, - ValidationResult, render_task_evidence, ) +from beaver.tasks.service import normalize_acceptance_type NOTIFICATION_SESSION_ID = "notify:default:scheduled" @@ -60,11 +60,19 @@ class AgentService: ) -> None: self.profile = profile or AgentProfile() self.loader = loader or EngineLoader(workspace=workspace, config_path=config_path) + self._apply_configured_profile_defaults() self._loop: AgentLoop | None = None self._run_task: asyncio.Task[None] | None = None self._main_agent_router = MainAgentRouter() self._runtime_services: dict[str, Any] = {} + def _apply_configured_profile_defaults(self) -> None: + defaults = self.loader.config.agents_defaults + if defaults.max_context_messages is not None: + self.profile.max_context_messages = max(1, defaults.max_context_messages) + if defaults.max_tool_iterations is not None: + self.profile.max_tool_iterations = max(0, defaults.max_tool_iterations) + def create_loop(self) -> AgentLoop: """创建并缓存当前 service 使用的 AgentLoop。""" @@ -232,7 +240,7 @@ class AgentService: Scheduled jobs are product-level Tasks, not hidden one-off agent turns. This entry bypasses the main-agent classifier and forces Task mode so - every trigger produces a TaskRecord, validation, feedback state, and a + every trigger produces a TaskRecord, evidence, acceptance state, and a run_id that the scheduled-task history can link to. """ @@ -280,9 +288,9 @@ class AgentService: result.run_id, { "message_type": "scheduled_reply", - "scheduled_job_id": job.id, - "scheduled_run_id": run.scheduled_run_id, - "cron_job_name": job.name, + "scheduled_job_id": cron_job_id, + "scheduled_run_id": scheduled_run_id, + "cron_job_name": cron_job_name, "mode": "notification", }, ) @@ -403,15 +411,15 @@ class AgentService: }, ) - async def submit_feedback( + async def submit_acceptance( self, *, session_id: str, run_id: str, - feedback_type: str, + acceptance_type: str, comment: str | None = None, ) -> dict[str, Any]: - """Record chat feedback for the internal task linked to a run.""" + """Record user acceptance for the internal task linked to a run.""" loaded = self.create_loop().boot() task_service = self._require_loaded(loaded, "task_service") @@ -419,32 +427,31 @@ class AgentService: if task is None or task.session_id != session_id: raise ValueError(f"No internal task found for run_id={run_id!r}") - normalized = feedback_type.strip().lower() - if normalized not in {"satisfied", "revise", "abandon"}: - raise ValueError("feedback_type must be one of: satisfied, revise, abandon") + normalized = normalize_acceptance_type(acceptance_type) + legacy_feedback_type = "satisfied" if normalized == "accept" else normalized already_recorded = any( - item.get("run_id") == run_id and item.get("feedback_type") == normalized + item.get("run_id") == run_id and item.get("acceptance_type") == normalized for item in task.feedback ) - conflicting_feedback = next( + conflicting_acceptance = next( ( item for item in task.feedback - if item.get("run_id") == run_id and item.get("feedback_type") != normalized + if item.get("run_id") == run_id and item.get("acceptance_type") != normalized ), None, ) - if conflicting_feedback is not None: + if conflicting_acceptance is not None: raise ValueError( - f"Feedback for run_id={run_id!r} was already recorded as " - f"{conflicting_feedback.get('feedback_type')!r}" + f"Acceptance for run_id={run_id!r} was already recorded as " + f"{conflicting_acceptance.get('acceptance_type')!r}" ) if task.status in {"closed", "abandoned"} and not already_recorded: raise ValueError(f"Task {task.task_id} is already finalized as {task.status!r}") - updated = task if already_recorded else task_service.add_feedback( + updated = task if already_recorded else task_service.add_acceptance( task.task_id, - feedback_type=normalized, + acceptance_type=normalized, comment=comment, run_id=run_id, ) @@ -455,7 +462,8 @@ class AgentService: { "task_id": updated.task_id, "task_status": updated.status, - "feedback_state": normalized, + "acceptance_state": normalized, + "feedback_state": legacy_feedback_type, }, ) if not already_recorded: @@ -463,10 +471,11 @@ class AgentService: session_id, run_id=run_id, role="system", - event_type="task_feedback_recorded", + event_type="task_acceptance_recorded", event_payload={ "task_id": task.task_id, - "feedback_type": normalized, + "acceptance_type": normalized, + "feedback_type": legacy_feedback_type, "comment": comment, "task_status": updated.status, }, @@ -475,35 +484,36 @@ class AgentService: ) generated_candidates = [] - validation = ValidationResult.from_dict(updated.validation_result) if not already_recorded: run_memory_store = self._require_loaded(loaded, "run_memory_store") - feedback_payload = { - "feedback_type": normalized, + acceptance_payload = { + "acceptance_type": normalized, + "feedback_type": legacy_feedback_type, "comment": comment or "", "task_status": updated.status, + "final_accepted_run_id": updated.metadata.get("final_accepted_run_id"), } run_memory_store.update_run_record( run_id, - success=normalized == "satisfied", - feedback=feedback_payload, + success=normalized == "accept", + feedback=acceptance_payload, ) run_memory_store.update_skill_effects_for_run( run_id, - success=normalized == "satisfied", - feedback_score=self._feedback_score_for_learning(normalized, validation), + success=normalized == "accept", + feedback_score=self._acceptance_score_for_learning(normalized), notes=(comment or normalized).strip(), ) skill_learning_service = self._require_loaded(loaded, "skill_learning_service") skill_learning_service.rescore_skill_versions() if already_recorded: generated_candidates = [] - elif normalized == "satisfied" and validation is not None and validation.accepted: + elif normalized == "accept": generated_candidates = [ item.to_dict() for item in skill_learning_service.build_learning_candidates_for_task( updated.task_id, - trigger_run_id=run_id, + final_accepted_run_id=run_id, ) ] elif normalized == "abandon": @@ -514,7 +524,8 @@ class AgentService: event_type="task_failure_evidence_recorded", event_payload={ "task_id": updated.task_id, - "feedback_type": normalized, + "acceptance_type": normalized, + "feedback_type": legacy_feedback_type, "comment": comment or "", "task_status": updated.status, "durable_memory_written": False, @@ -528,10 +539,28 @@ class AgentService: "run_id": run_id, "task_id": updated.task_id, "task_status": updated.status, - "feedback_type": normalized, + "acceptance_type": normalized, + "feedback_type": legacy_feedback_type, "learning_candidates": generated_candidates, } + async def submit_feedback( + self, + *, + session_id: str, + run_id: str, + feedback_type: str, + comment: str | None = None, + ) -> dict[str, Any]: + """Backward-compatible wrapper for older clients.""" + + return await self.submit_acceptance( + session_id=session_id, + run_id=run_id, + acceptance_type=feedback_type, + comment=comment, + ) + async def _process_with_main_agent( self, message: str, @@ -591,7 +620,7 @@ class AgentService: else active_task ) if active_task is not None and decision.action == "revise_task" and task.task_id == active_task.task_id: - task = self._record_revision_feedback_for_task( + task = self._record_revision_acceptance_for_task( loaded, task=task, session_id=session_id, @@ -599,7 +628,7 @@ class AgentService: ) return await self._run_task_mode(message, runner=runner, kwargs=kwargs, task=task) - def _record_revision_feedback_for_task( + def _record_revision_acceptance_for_task( self, loaded: Any, *, @@ -607,9 +636,9 @@ class AgentService: session_id: str, comment: str, ) -> TaskRecord: - """Mark the latest feedback-eligible run as revised before continuing a task.""" + """Mark the latest acceptance-eligible run as revised before continuing a task.""" - if task.status not in {"awaiting_feedback", "needs_revision"}: + if task.status not in {"awaiting_acceptance", "needs_revision"}: return task run_id = next((item for item in reversed(task.run_ids) if item), None) if not run_id: @@ -617,15 +646,15 @@ class AgentService: existing = next((item for item in task.feedback if item.get("run_id") == run_id), None) if existing is not None: - if existing.get("feedback_type") != "revise": + if existing.get("acceptance_type") != "revise": return task updated = task already_recorded = True else: task_service = self._require_loaded(loaded, "task_service") - updated = task_service.add_feedback( + updated = task_service.add_acceptance( task.task_id, - feedback_type="revise", + acceptance_type="revise", comment=comment, run_id=run_id, ) @@ -638,6 +667,7 @@ class AgentService: { "task_id": updated.task_id, "task_status": updated.status, + "acceptance_state": "revise", "feedback_state": "revise", }, ) @@ -648,9 +678,10 @@ class AgentService: session_id, run_id=run_id, role="system", - event_type="task_feedback_recorded", + event_type="task_acceptance_recorded", event_payload={ "task_id": updated.task_id, + "acceptance_type": "revise", "feedback_type": "revise", "comment": comment, "task_status": updated.status, @@ -659,12 +690,12 @@ class AgentService: content=comment, context_visible=False, ) - validation = ValidationResult.from_dict(updated.validation_result) run_memory_store = self._require_loaded(loaded, "run_memory_store") run_memory_store.update_run_record( run_id, success=False, feedback={ + "acceptance_type": "revise", "feedback_type": "revise", "comment": comment, "task_status": updated.status, @@ -673,7 +704,7 @@ class AgentService: run_memory_store.update_skill_effects_for_run( run_id, success=False, - feedback_score=self._feedback_score_for_learning("revise", validation), + feedback_score=self._acceptance_score_for_learning("revise"), notes=comment.strip() or "revise", ) skill_learning_service = self._require_loaded(loaded, "skill_learning_service") @@ -690,236 +721,185 @@ class AgentService: ) -> AgentRunResult: loaded = self.create_loop().boot() task_service = self._require_loaded(loaded, "task_service") - validation_service = self._require_loaded(loaded, "validation_service") task_execution_planner = self._require_loaded(loaded, "task_execution_planner") session_manager = self._require_loaded(loaded, "session_manager") - run_memory_store = self._require_loaded(loaded, "run_memory_store") - last_result: AgentRunResult | None = None - latest_validation: ValidationResult | None = None base_execution_context = kwargs.get("execution_context") provider_bundle = kwargs.get("provider_bundle") or self._make_provider_bundle_for_task(loaded, kwargs) kwargs = dict(kwargs) team_provider_bundle_factory = kwargs.pop("team_provider_bundle_factory", None) kwargs["provider_bundle"] = provider_bundle - for attempt_index in (1, 2): - task_service.start_run(task.task_id, user_message=message, attempt_index=attempt_index) - plan = await task_execution_planner.plan( + attempt_index = int(task.metadata.get("latest_attempt_index") or 0) + 1 + task_service.start_run(task.task_id, user_message=message, attempt_index=attempt_index) + plan = await task_execution_planner.plan( + task=task, + user_message=message, + attempt_index=attempt_index, + provider_bundle=provider_bundle, + ) + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_execution_planned", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + **plan.to_event_payload(), + }, + ) + team_summaries: list[str] = [] + team_execution_context = "" + team_result: TeamRunResult | None = None + if plan.is_team: + team_result, team_error = await self._run_team_for_task( + plan, task=task, - user_message=message, - attempt_index=attempt_index, - latest_validation=latest_validation, - provider_bundle=provider_bundle, + parent_session_id=kwargs["session_id"], + provider_bundle_factory=team_provider_bundle_factory + or self._build_team_provider_bundle_factory(loaded, kwargs), ) - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_execution_planned", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - **plan.to_event_payload(), - }, - ) - team_summaries: list[str] = [] - team_execution_context = "" - team_result: TeamRunResult | None = None - if plan.is_team: - team_result, team_error = await self._run_team_for_task( - plan, - task=task, - parent_session_id=kwargs["session_id"], - provider_bundle_factory=team_provider_bundle_factory - or self._build_team_provider_bundle_factory(loaded, kwargs), + if team_result is not None: + team_summaries = [self._team_summary_for_validation(team_result)] + team_packet = TaskEvidencePacket( + task_id=task.task_id, + attempt_index=attempt_index, + main_run=None, + team_runs=self._team_run_evidence(team_result), + team_node_results=list(team_result.node_results), + final_output="", + ) + team_execution_context = self._join_context( + self._team_execution_context(plan, team_result), + "Rendered team evidence:\n" + render_task_evidence(team_packet), + ) + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_team_run_completed" if team_result.success else "task_team_run_failed", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "plan_mode": plan.mode, + "strategy": plan.graph.strategy if plan.graph else None, + "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], + "team_run_ids": team_result.run_ids, + "team_success": team_result.success, + "node_results": self._team_node_results_for_event(plan, team_result), + "reason": plan.reason, + "error": None if team_result.success else "one or more team nodes failed", + }, + ) + else: + team_summaries = [f"Team execution failed: {team_error}"] + team_execution_context = self._failed_team_execution_context(plan, team_error or "unknown error") + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_team_run_failed", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "plan_mode": plan.mode, + "strategy": plan.graph.strategy if plan.graph else None, + "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], + "team_run_ids": [], + "team_success": False, + "reason": plan.reason, + "error": team_error, + }, ) - if team_result is not None: - team_summaries = [self._team_summary_for_validation(team_result)] - team_packet = TaskEvidencePacket( - task_id=task.task_id, - attempt_index=attempt_index, - main_run=None, - team_runs=self._team_run_evidence(team_result), - team_node_results=list(team_result.node_results), - final_output="", - ) - team_execution_context = self._join_context( - self._team_execution_context(plan, team_result), - "Rendered team evidence:\n" + render_task_evidence(team_packet), - ) - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_team_run_completed" if team_result.success else "task_team_run_failed", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "plan_mode": plan.mode, - "strategy": plan.graph.strategy if plan.graph else None, - "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], - "team_run_ids": team_result.run_ids, - "team_success": team_result.success, - "node_results": self._team_node_results_for_event(plan, team_result), - "reason": plan.reason, - "error": None if team_result.success else "one or more team nodes failed", - }, - ) - else: - team_summaries = [f"Team execution failed: {team_error}"] - team_execution_context = self._failed_team_execution_context(plan, team_error or "unknown error") - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_team_run_failed", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "plan_mode": plan.mode, - "strategy": plan.graph.strategy if plan.graph else None, - "node_ids": [node.node_id for node in plan.graph.nodes] if plan.graph else [], - "team_run_ids": [], - "team_success": False, - "reason": plan.reason, - "error": team_error, - }, - ) - attempt_kwargs = dict(kwargs) - attempt_kwargs.update( - { - "task_id": task.task_id, - "task_mode": True, - "attempt_index": attempt_index, - "allow_candidate_generation": False, - } - ) - if attempt_index == 2 and latest_validation is not None: - revision_context = latest_validation.recommended_revision_prompt.strip() - if revision_context: - attempt_kwargs["execution_context"] = self._join_context( - base_execution_context, - f"Task validation revision request:\n{revision_context}", - team_execution_context, - ) - elif team_execution_context: - attempt_kwargs["execution_context"] = self._join_context(base_execution_context, team_execution_context) - if plan.is_team and team_execution_context: - attempt_kwargs["include_tools"] = False - attempt_kwargs["max_tool_iterations"] = 0 - attempt_kwargs["skill_selection_context"] = self._build_skill_selection_context( - task=task, - user_message=message, - attempt_index=attempt_index, - latest_validation=latest_validation, - plan=plan, - team_summaries=team_summaries, - ) - - result = await runner(message, **attempt_kwargs) - last_result = result - self._append_task_observation( - session_manager, - task.session_id, - event_type="task_synthesis_completed", - payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "main_run_id": result.run_id, - "plan_mode": plan.mode, - "strategy": plan.graph.strategy if plan.graph else None, - }, - ) - task = task_service.append_run( - task.task_id, - result.run_id, - skill_names=self._skill_names_for_run(loaded, result.run_id), - ) - evidence_packet = self._build_task_evidence_packet( - session_manager=session_manager, - task=task, - attempt_index=attempt_index, - result=result, - team_result=team_result, - ) - evidence_text = render_task_evidence(evidence_packet) - validation = await validation_service.validate_task_result( - task=task, - user_message=message, - final_output=result.output_text, - evidence_packet=evidence_packet, - evidence_text=evidence_text, - transcript_excerpt=self._run_excerpt(session_manager, result.session_id, result.run_id), - tool_summaries=self._tool_summaries(session_manager, result.session_id, result.run_id), - team_summaries=team_summaries, - provider_bundle=provider_bundle, - ) - latest_validation = validation - has_usable_answer = bool(result.output_text.strip()) and ( - "Tool loop stopped after reaching the configured iteration limit." not in result.output_text - ) - task = task_service.record_validation( - task.task_id, - result.run_id, - validation, - final_attempt=( - attempt_index == 2 - or validation.status in {"accepted", "insufficient_evidence", "validator_error"} - ), - has_usable_answer=has_usable_answer, - ) - run_memory_store.update_run_record(result.run_id, validation_result=validation.to_dict()) - session_manager.update_latest_assistant_event_payload( - result.session_id, - result.run_id, - { - "task_id": task.task_id, - "task_status": task.status, - "validation_status": "passed" if validation.accepted else "failed", - }, - ) - validation_debug = { - "evidence_run_ids": [ - item.run_id for item in [evidence_packet.main_run, *evidence_packet.team_runs] if item is not None - ], - "evidence_session_ids": [ - item.session_id - for item in [evidence_packet.main_run, *evidence_packet.team_runs] - if item is not None - ], - "tool_result_count": sum( - len(item.tool_results) - for item in [evidence_packet.main_run, *evidence_packet.team_runs] - if item is not None - ), - "evidence_length": len(evidence_text), + attempt_kwargs = dict(kwargs) + attempt_kwargs.update( + { + "task_id": task.task_id, + "task_mode": True, + "attempt_index": attempt_index, + "allow_candidate_generation": False, } - retry_scheduled = validation.status == "rejected" and attempt_index == 1 - session_manager.append_message( - result.session_id, - run_id=result.run_id, - role="system", - event_type="task_validation_snapshotted", - event_payload={ - "task_id": task.task_id, - "attempt_index": attempt_index, - "validation_result": validation.to_dict(), - "validation_debug": validation_debug, - "retry_scheduled": retry_scheduled, - }, - content=validation.recommended_revision_prompt or None, - context_visible=False, - ) - if retry_scheduled: - session_manager.set_run_context_visible(result.session_id, result.run_id, False) - result.task_id = task.task_id - result.task_status = task.status - result.validation_result = validation.to_dict() - if not retry_scheduled: - return result + ) + if team_execution_context: + attempt_kwargs["execution_context"] = self._join_context(base_execution_context, team_execution_context) + if plan.is_team and team_execution_context: + attempt_kwargs["include_tools"] = False + attempt_kwargs["max_tool_iterations"] = 0 + attempt_kwargs["skill_selection_context"] = self._build_skill_selection_context( + task=task, + user_message=message, + attempt_index=attempt_index, + plan=plan, + team_summaries=team_summaries, + ) - if last_result is None: # pragma: no cover - defensive - raise RuntimeError("Task mode did not produce a run result") - return last_result + result = await runner(message, **attempt_kwargs) + self._append_task_observation( + session_manager, + task.session_id, + event_type="task_synthesis_completed", + payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "main_run_id": result.run_id, + "plan_mode": plan.mode, + "strategy": plan.graph.strategy if plan.graph else None, + }, + ) + task = task_service.append_run( + task.task_id, + result.run_id, + skill_names=self._skill_names_for_run(loaded, result.run_id), + ) + evidence_packet = self._build_task_evidence_packet( + session_manager=session_manager, + task=task, + attempt_index=attempt_index, + result=result, + team_result=team_result, + ) + evidence_text = render_task_evidence(evidence_packet) + evidence_debug = { + "evidence_run_ids": [ + item.run_id for item in [evidence_packet.main_run, *evidence_packet.team_runs] if item is not None + ], + "evidence_session_ids": [ + item.session_id + for item in [evidence_packet.main_run, *evidence_packet.team_runs] + if item is not None + ], + "tool_result_count": sum( + len(item.tool_results) + for item in [evidence_packet.main_run, *evidence_packet.team_runs] + if item is not None + ), + "evidence_length": len(evidence_text), + } + session_manager.update_latest_assistant_event_payload( + result.session_id, + result.run_id, + { + "task_id": task.task_id, + "task_status": task.status, + "evidence_status": "recorded", + }, + ) + session_manager.append_message( + result.session_id, + run_id=result.run_id, + role="system", + event_type="task_evidence_recorded", + event_payload={ + "task_id": task.task_id, + "attempt_index": attempt_index, + "evidence_debug": evidence_debug, + }, + content=None, + context_visible=False, + ) + result.task_id = task.task_id + result.task_status = task.status + result.validation_result = None + return result async def _run_team_for_task( self, @@ -986,12 +966,10 @@ class AgentService: return [] @staticmethod - def _feedback_score_for_learning(feedback_type: str, validation: ValidationResult | None) -> float: - if feedback_type == "satisfied": - if validation is not None: - return max(0.0, min(1.0, float(validation.score))) + def _acceptance_score_for_learning(acceptance_type: str) -> float: + if acceptance_type == "accept": return 1.0 - if feedback_type == "revise": + if acceptance_type == "revise": return 0.5 return 0.0 @@ -1001,12 +979,11 @@ class AgentService: task: TaskRecord, user_message: str, attempt_index: int, - latest_validation: ValidationResult | None = None, plan: TaskExecutionPlan | None = None, team_summaries: list[str] | None = None, ) -> str: phase = f"attempt_{attempt_index}" - if latest_validation is not None: + if task.feedback and task.feedback[-1].get("acceptance_type") == "revise": phase = f"revision_attempt_{attempt_index}" elif plan is not None and plan.is_team: phase = f"team_synthesis_attempt_{attempt_index}" @@ -1027,24 +1004,14 @@ class AgentService: ) else: sections.append("Previously activated skills:\nNone") - if latest_validation is not None: - validation_lines = [ - f"accepted: {latest_validation.accepted}", - f"score: {latest_validation.score}", - ] - if latest_validation.issues: - validation_lines.append("issues:\n" + "\n".join(f"- {item}" for item in latest_validation.issues)) - if latest_validation.missing_requirements: - validation_lines.append( - "missing requirements:\n" - + "\n".join(f"- {item}" for item in latest_validation.missing_requirements) - ) - if latest_validation.recommended_revision_prompt: - validation_lines.append( - "recommended revision:\n" - + latest_validation.recommended_revision_prompt - ) - sections.append("Validation feedback:\n" + "\n".join(validation_lines)) + if task.feedback: + history_lines = [] + for item in task.feedback[-5:]: + kind = item.get("acceptance_type") or item.get("feedback_type") + comment = item.get("comment") or "" + run_id = item.get("run_id") or "" + history_lines.append(f"- {kind} run={run_id}: {comment}".strip()) + sections.append("Task acceptance history:\n" + "\n".join(history_lines)) if plan is not None: plan_lines = [ f"mode: {plan.mode}", @@ -1313,7 +1280,8 @@ class AgentService: "inbound_metadata": dict(inbound.metadata), "task_id": getattr(result, "task_id", None), "task_status": getattr(result, "task_status", None), - "validation_result": getattr(result, "validation_result", None), + "evidence_status": "recorded" if getattr(result, "task_id", None) else None, + "validation_result": None, }, ) diff --git a/app-instance/backend/beaver/services/process_service.py b/app-instance/backend/beaver/services/process_service.py index 4a4cbfc..4b91a8a 100644 --- a/app-instance/backend/beaver/services/process_service.py +++ b/app-instance/backend/beaver/services/process_service.py @@ -235,26 +235,45 @@ class SessionProcessProjector: metadata=dict(payload), ) - elif record.event_type == "task_validation_snapshotted": - validation = payload.get("validation_result") if isinstance(payload.get("validation_result"), dict) else {} - accepted = bool(validation.get("accepted")) - root["status"] = "done" if accepted or attempt_index == 2 else "waiting" - root["finished_at"] = created_at if root["status"] == "done" else None + elif record.event_type == "task_evidence_recorded": + root["status"] = "waiting" + root["finished_at"] = None add_event( - event_id=_event_id(record, "validation"), + event_id=_event_id(record, "evidence"), run_id=record.run_id or root_run_id, parent_run_id=root_run_id if record.run_id else None, kind="run_status", actor_type="system", - actor_id="validator", - actor_name="Validator", - text=( - f"Validation {'passed' if accepted else 'failed'} " - f"(score={validation.get('score')})." - + (" Retry scheduled." if payload.get("retry_scheduled") else "") - ), + actor_id="evidence-recorder", + actor_name="Evidence", + text="Task evidence was recorded; waiting for user acceptance.", created_at=created_at, - status="done" if accepted else "error", + status="done", + metadata=dict(payload), + ) + + elif record.event_type == "task_acceptance_recorded": + acceptance_type = str(payload.get("acceptance_type") or payload.get("feedback_type") or "") + if acceptance_type == "accept": + root["status"] = "done" + root["finished_at"] = created_at + elif acceptance_type == "abandon": + root["status"] = "cancelled" + root["finished_at"] = created_at + else: + root["status"] = "waiting" + root["finished_at"] = None + add_event( + event_id=_event_id(record, "acceptance"), + run_id=record.run_id or root_run_id, + parent_run_id=root_run_id if record.run_id else None, + kind="run_status", + actor_type="user", + actor_id="user-acceptance", + actor_name="User Acceptance", + text=f"User acceptance recorded: {acceptance_type or 'unknown'}.", + created_at=created_at, + status="done", metadata=dict(payload), ) diff --git a/app-instance/backend/beaver/skills/learning/service.py b/app-instance/backend/beaver/skills/learning/service.py index c350672..3262280 100644 --- a/app-instance/backend/beaver/skills/learning/service.py +++ b/app-instance/backend/beaver/skills/learning/service.py @@ -69,15 +69,24 @@ class SkillLearningService: existing_ids.add(candidate.candidate_id) return candidates - def build_learning_candidates_for_task(self, task_id: str, *, trigger_run_id: str) -> list[SkillLearningCandidate]: - """Build candidates scoped to a single validated and satisfied Task run.""" + def build_learning_candidates_for_task( + self, + task_id: str, + *, + final_accepted_run_id: str | None = None, + trigger_run_id: str | None = None, + ) -> list[SkillLearningCandidate]: + """Build candidates from a user-accepted Task and all of its runs.""" + final_accepted_run_id = final_accepted_run_id or trigger_run_id + if not final_accepted_run_id: + return [] runs = [record for record in self.run_store.list_runs() if record.task_id == task_id] - trigger_run = next((record for record in runs if record.run_id == trigger_run_id), None) - if trigger_run is None or not self._is_confirmed_positive_run(trigger_run): + final_run = next((record for record in runs if record.run_id == final_accepted_run_id), None) + if final_run is None or not self._is_task_accepted_run(final_run): return [] - source_runs = [record for record in runs if self._is_confirmed_positive_run(record)] + source_runs = sorted(runs, key=lambda item: (item.started_at, item.run_id)) if not source_runs: return [] @@ -100,11 +109,16 @@ class SkillLearningService: source_session_ids=source_session_ids, related_skill_names=[], reason=f"Task {task_id} completed successfully without a published skill; consider extracting reusable guidance.", - evidence={"task_id": task_id, "trigger_run_id": trigger_run_id, "theme": self._task_theme(trigger_run.task_text)}, + evidence={ + "task_id": task_id, + "final_accepted_run_id": final_accepted_run_id, + "source_run_ids": source_run_ids, + "theme": self._task_theme(final_run.task_text), + }, status="open", priority=1, confidence=0.8, - trigger_reason="validation_accepted_and_user_satisfied", + trigger_reason="task_accepted", ) ) else: @@ -137,13 +151,14 @@ class SkillLearningService: ), evidence={ "task_id": task_id, - "trigger_run_id": trigger_run_id, + "final_accepted_run_id": final_accepted_run_id, + "source_run_ids": source_run_ids, "skill_version": receipt.skill_version, }, status="open", priority=1, confidence=0.7, - trigger_reason="validation_accepted_and_user_satisfied", + trigger_reason="task_accepted", ) ) @@ -269,7 +284,7 @@ class SkillLearningService: groups.setdefault(key, []).append(record) candidates: list[SkillLearningCandidate] = [] for theme, runs in groups.items(): - successful = [record for record in runs if self._is_confirmed_positive_run(record)] + successful = [record for record in runs if self._is_task_accepted_run(record)] if len(successful) < 2: continue if any(record.activated_skills for record in successful): @@ -290,7 +305,7 @@ class SkillLearningService: def _build_merge_candidates(self) -> list[SkillLearningCandidate]: pair_counts: dict[tuple[str, str], list[RunRecord]] = {} for record in self.run_store.list_runs(): - if not self._is_confirmed_positive_run(record): + if not self._is_task_accepted_run(record): continue unique = sorted({receipt.skill_name for receipt in record.activated_skills}) for pair in combinations(unique, 2): @@ -351,14 +366,15 @@ class SkillLearningService: return effects @staticmethod - def _is_confirmed_positive_run(record: RunRecord) -> bool: - validation = record.validation_result or {} + def _is_task_accepted_run(record: RunRecord) -> bool: feedback = record.feedback or {} + acceptance_type = feedback.get("acceptance_type") + if acceptance_type is None and feedback.get("feedback_type") == "satisfied": + acceptance_type = "accept" return ( bool(record.success) and bool(record.task_id) - and validation.get("accepted") is True - and feedback.get("feedback_type") == "satisfied" + and acceptance_type == "accept" ) @staticmethod diff --git a/app-instance/backend/beaver/tasks/__init__.py b/app-instance/backend/beaver/tasks/__init__.py index 73f4e0e..0f764bb 100644 --- a/app-instance/backend/beaver/tasks/__init__.py +++ b/app-instance/backend/beaver/tasks/__init__.py @@ -6,7 +6,6 @@ from .planner import TaskExecutionPlan, TaskExecutionPlanner from .router import MainAgentRouter from .service import TaskService from .skill_resolver import SkillResolutionReport, TaskSkillResolver -from .validation import ValidationService __all__ = [ "EvidenceBuilder", @@ -24,6 +23,5 @@ __all__ = [ "ToolEvidence", "ValidationResult", "ValidationStatus", - "ValidationService", "render_task_evidence", ] diff --git a/app-instance/backend/beaver/tasks/models.py b/app-instance/backend/beaver/tasks/models.py index 88182c3..9a45caf 100644 --- a/app-instance/backend/beaver/tasks/models.py +++ b/app-instance/backend/beaver/tasks/models.py @@ -1,4 +1,4 @@ -"""Models for internal task tracking and validation.""" +"""Models for internal task tracking and user acceptance.""" from __future__ import annotations @@ -9,7 +9,12 @@ from typing import Any, Literal ValidationStatus = Literal["accepted", "rejected", "insufficient_evidence", "validator_error"] VALIDATION_STATUSES = {"accepted", "rejected", "insufficient_evidence", "validator_error"} -TASK_OPEN_STATUSES = {"open", "running", "validating", "awaiting_feedback", "needs_review", "needs_revision"} +TASK_OPEN_STATUSES = {"open", "running", "awaiting_acceptance", "needs_revision"} +LEGACY_STATUS_MAP = { + "validating": "running", + "awaiting_feedback": "awaiting_acceptance", + "needs_review": "awaiting_acceptance", +} @dataclass(slots=True) @@ -113,11 +118,11 @@ class TaskRecord: @property def is_execution_active(self) -> bool: - return self.status in {"running", "validating"} + return self.status == "running" @property def requires_user_action(self) -> bool: - return self.status in {"awaiting_feedback", "needs_review", "needs_revision"} + return self.status in {"awaiting_acceptance", "needs_revision"} def to_dict(self) -> dict[str, Any]: return { @@ -137,6 +142,7 @@ class TaskRecord: "satisfaction": self.satisfaction, "run_ids": list(self.run_ids), "skill_names": list(self.skill_names), + "acceptance": list(self.feedback), "feedback": list(self.feedback), "validation_result": self.validation_result, "metadata": dict(self.metadata), @@ -152,7 +158,7 @@ class TaskRecord: goal=str(payload.get("goal") or payload.get("description") or ""), constraints=[str(item) for item in payload.get("constraints") or []], priority=int(payload.get("priority", 0) or 0), - status=str(payload.get("status") or "open"), + status=LEGACY_STATUS_MAP.get(str(payload.get("status") or "open"), str(payload.get("status") or "open")), creator=str(payload.get("creator") or "main-agent"), created_at=str(payload.get("created_at") or ""), updated_at=str(payload.get("updated_at") or ""), @@ -161,7 +167,11 @@ class TaskRecord: satisfaction=_optional_float(payload.get("satisfaction")), run_ids=[str(item) for item in payload.get("run_ids") or []], skill_names=[str(item) for item in payload.get("skill_names") or []], - feedback=[dict(item) for item in payload.get("feedback") or [] if isinstance(item, dict)], + feedback=[ + _normalize_acceptance_entry(dict(item)) + for item in (payload.get("acceptance") or payload.get("feedback") or []) + if isinstance(item, dict) + ], validation_result=dict(payload["validation_result"]) if isinstance(payload.get("validation_result"), dict) else None, metadata=dict(payload.get("metadata") or {}), ) @@ -226,3 +236,13 @@ def _optional_float(value: Any) -> float | None: if value in (None, ""): return None return float(value) + + +def _normalize_acceptance_entry(entry: dict[str, Any]) -> dict[str, Any]: + if entry.get("acceptance_type") is None and entry.get("feedback_type") is not None: + feedback_type = str(entry.get("feedback_type") or "") + entry["acceptance_type"] = "accept" if feedback_type == "satisfied" else feedback_type + if entry.get("feedback_type") is None and entry.get("acceptance_type") is not None: + acceptance_type = str(entry.get("acceptance_type") or "") + entry["feedback_type"] = "satisfied" if acceptance_type == "accept" else acceptance_type + return entry diff --git a/app-instance/backend/beaver/tasks/planner.py b/app-instance/backend/beaver/tasks/planner.py index 9d635d5..ec23ae3 100644 --- a/app-instance/backend/beaver/tasks/planner.py +++ b/app-instance/backend/beaver/tasks/planner.py @@ -10,7 +10,7 @@ from typing import Any, Literal from beaver.coordinator.models import AgentDescriptor, ExecutionGraph, ExecutionNode from beaver.engine.providers import ProviderBundle -from .models import TaskRecord, ValidationResult +from .models import TaskRecord from .skill_resolver import SkillResolutionReport, TaskSkillResolver @@ -76,7 +76,6 @@ class TaskExecutionPlanner: task: TaskRecord, user_message: str, attempt_index: int, - latest_validation: ValidationResult | None = None, provider_bundle: ProviderBundle | None = None, timeout_seconds: float = 30.0, ) -> TaskExecutionPlan: @@ -105,7 +104,6 @@ class TaskExecutionPlanner: task=task, user_message=user_message, attempt_index=attempt_index, - latest_validation=latest_validation, ), }, ], @@ -230,14 +228,10 @@ class TaskExecutionPlanner: task: TaskRecord, user_message: str, attempt_index: int, - latest_validation: ValidationResult | None, ) -> str: - validation_note = "" - if latest_validation is not None: - validation_note = ( - "\nPrevious validation issues:\n" - + json.dumps(latest_validation.to_dict(), ensure_ascii=False) - ) + history_note = "" + if task.feedback: + history_note = "\nRelevant task history:\n" + json.dumps(task.feedback[-5:], ensure_ascii=False) return ( "Decide execution mode for this internal Task attempt.\n" "Use mode=team only when independent research, review, implementation slices, or staged checks " @@ -254,7 +248,7 @@ class TaskExecutionPlanner: f"Task goal:\n{task.goal}\n\n" f"Current user request:\n{user_message}\n\n" f"Attempt index: {attempt_index}\n" - f"{validation_note}" + f"{history_note}" ) @staticmethod diff --git a/app-instance/backend/beaver/tasks/service.py b/app-instance/backend/beaver/tasks/service.py index 92701b1..542b9b0 100644 --- a/app-instance/backend/beaver/tasks/service.py +++ b/app-instance/backend/beaver/tasks/service.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any from uuid import uuid4 -from .models import TaskEvent, TaskRecord, ValidationResult +from .models import TaskEvent, TaskRecord from .store import TaskStore @@ -105,38 +105,70 @@ class TaskService: for name in skill_names or []: if name not in task.skill_names: task.skill_names.append(name) + task.status = "awaiting_acceptance" task.updated_at = self._now() self.store.upsert_task(task) self._event(task, "run_completed", run_id=run_id, payload={"skill_names": skill_names or []}) + self._event(task, "evidence_recorded", run_id=run_id, payload={"skill_names": skill_names or []}) return task - def record_validation( + def add_acceptance( self, task_id: str, - run_id: str, - validation: ValidationResult, *, - final_attempt: bool = True, - has_usable_answer: bool = True, + acceptance_type: str, + comment: str | None = None, + run_id: str | None = None, ) -> TaskRecord: task = self._require(task_id) now = self._now() - if validation.status == "accepted": - task.status = "awaiting_feedback" - elif validation.status in {"insufficient_evidence", "validator_error"}: - task.status = "needs_review" - elif validation.status == "rejected" and not final_attempt: + normalized = normalize_acceptance_type(acceptance_type) + matching_acceptance = any( + item.get("run_id") == run_id and item.get("acceptance_type") == normalized + for item in task.feedback + ) + conflicting_acceptance = next( + ( + item + for item in task.feedback + if item.get("run_id") == run_id and item.get("acceptance_type") != normalized + ), + None, + ) + if conflicting_acceptance is not None: + raise ValueError( + f"Acceptance for run_id={run_id!r} was already recorded as " + f"{conflicting_acceptance.get('acceptance_type')!r}" + ) + if task.status in {"closed", "abandoned"} and not matching_acceptance: + raise ValueError(f"Task {task.task_id} is already finalized as {task.status!r}") + if matching_acceptance: + return task + + entry = { + "acceptance_type": normalized, + "feedback_type": "satisfied" if normalized == "accept" else normalized, + "comment": comment or "", + "run_id": run_id, + "created_at": now, + } + task.feedback.append(entry) + if normalized == "revise": task.status = "needs_revision" - elif validation.status == "rejected" and has_usable_answer: - task.status = "needs_review" - else: - task.status = "failed" + elif normalized == "abandon": + task.status = "abandoned" task.closed_at = now - task.close_reason = "automatic validation rejected the final attempt" + task.close_reason = comment or "abandoned" + elif normalized == "accept": + task.status = "closed" + task.closed_at = now + task.close_reason = "accepted" + task.satisfaction = 1.0 + if run_id: + task.metadata["final_accepted_run_id"] = run_id task.updated_at = now - task.validation_result = validation.to_dict() self.store.upsert_task(task) - self._event(task, "validated", run_id=run_id, payload=validation.to_dict()) + self._event(task, f"acceptance_{normalized}", run_id=run_id, payload=entry) return task def add_feedback( @@ -147,52 +179,12 @@ class TaskService: comment: str | None = None, run_id: str | None = None, ) -> TaskRecord: - task = self._require(task_id) - now = self._now() - matching_feedback = any( - item.get("run_id") == run_id and item.get("feedback_type") == feedback_type - for item in task.feedback + return self.add_acceptance( + task_id, + acceptance_type=feedback_type, + comment=comment, + run_id=run_id, ) - conflicting_feedback = next( - ( - item - for item in task.feedback - if item.get("run_id") == run_id and item.get("feedback_type") != feedback_type - ), - None, - ) - if conflicting_feedback is not None: - raise ValueError( - f"Feedback for run_id={run_id!r} was already recorded as " - f"{conflicting_feedback.get('feedback_type')!r}" - ) - if task.status in {"closed", "abandoned"} and not matching_feedback: - raise ValueError(f"Task {task.task_id} is already finalized as {task.status!r}") - if matching_feedback: - return task - - entry = { - "feedback_type": feedback_type, - "comment": comment or "", - "run_id": run_id, - "created_at": now, - } - task.feedback.append(entry) - if feedback_type == "revise": - task.status = "needs_revision" - elif feedback_type == "abandon": - task.status = "abandoned" - task.closed_at = now - task.close_reason = comment or "abandoned" - elif feedback_type == "satisfied": - task.status = "closed" - task.closed_at = now - task.close_reason = "satisfied" - task.satisfaction = 1.0 - task.updated_at = now - self.store.upsert_task(task) - self._event(task, f"feedback_{feedback_type}", run_id=run_id, payload=entry) - return task def close_task(self, task_id: str, *, reason: str = "closed") -> TaskRecord: task = self._require(task_id) @@ -267,3 +259,12 @@ def short_task_title(text: str) -> str: if len(words) <= 4: return cleaned[:40] return " ".join(words[:4])[:40] + + +def normalize_acceptance_type(value: str) -> str: + normalized = (value or "").strip().lower() + if normalized == "satisfied": + return "accept" + if normalized not in {"accept", "revise", "abandon"}: + raise ValueError("acceptance_type must be one of: accept, revise, abandon") + return normalized diff --git a/app-instance/backend/beaver/tasks/validation.py b/app-instance/backend/beaver/tasks/validation.py deleted file mode 100644 index 28a3eaa..0000000 --- a/app-instance/backend/beaver/tasks/validation.py +++ /dev/null @@ -1,154 +0,0 @@ -"""Automatic validation for internal Task mode.""" - -from __future__ import annotations - -import json -from typing import Any - -from beaver.engine.providers import ProviderBundle - -from .models import TaskRecord, ValidationResult - - -class ValidationService: - async def validate_task_result( - self, - *, - task: TaskRecord, - user_message: str, - final_output: str, - evidence_packet: Any | None = None, - evidence_text: str = "", - transcript_excerpt: str = "", - tool_summaries: list[str] | None = None, - team_summaries: list[str] | None = None, - provider_bundle: ProviderBundle | None = None, - ) -> ValidationResult: - provider = None - model = None - if provider_bundle is not None: - provider = provider_bundle.auxiliary_provider or provider_bundle.main_provider - runtime = provider_bundle.auxiliary_runtime or provider_bundle.main_runtime - model = getattr(runtime, "model", None) - if provider is not None: - try: - return await self._validate_with_provider( - provider=provider, - model=model, - task=task, - user_message=user_message, - final_output=final_output, - evidence_text=evidence_text, - transcript_excerpt=transcript_excerpt, - tool_summaries=tool_summaries or [], - team_summaries=team_summaries or [], - ) - except Exception as exc: - return ValidationResult( - status="validator_error", - score=0.0, - issues=[f"Validator failed: {exc}"], - evidence_gaps=["Automatic validation failed before producing a reliable decision."], - missing_requirements=["User review is required because automatic validation failed."], - recommended_revision_prompt=( - "Review the answer and evidence, then decide whether to revise or accept it." - ), - validator="llm_error", - ) - return self._heuristic_validate(final_output) - - async def _validate_with_provider( - self, - *, - provider: Any, - model: str | None, - task: TaskRecord, - user_message: str, - final_output: str, - evidence_text: str, - transcript_excerpt: str, - tool_summaries: list[str], - team_summaries: list[str], - ) -> ValidationResult: - legacy_context = "" if evidence_text else ( - f"Transcript excerpt:\n{transcript_excerpt}\n\n" - f"Tool summaries:\n{json.dumps(tool_summaries, ensure_ascii=False)}\n\n" - f"Team summaries:\n{json.dumps(team_summaries, ensure_ascii=False)}\n\n" - ) - prompt = ( - "Validate whether the assistant output satisfies the task. " - "Return only compact JSON with keys: passed, score, issues, " - "missing_requirements, recommended_revision_prompt.\n\n" - f"Task goal:\n{task.goal}\n\n" - f"Current user request:\n{user_message}\n\n" - f"Evidence packet:\n{evidence_text}\n\n" - f"{legacy_context}" - f"Assistant final output:\n{final_output}" - ) - response = await provider.chat( - messages=[ - {"role": "system", "content": "You are a strict task result validator."}, - {"role": "user", "content": prompt}, - ], - tools=None, - model=model, - max_tokens=4096, - temperature=0.0, - ) - payload = self._parse_json_object(response.content or "") - status = payload.get("status") - if status not in {"accepted", "rejected", "insufficient_evidence", "validator_error"}: - status = ( - "accepted" - if payload.get("passed") and float(payload.get("score", 0.0) or 0.0) >= 0.75 - else "rejected" - ) - return ValidationResult( - status=status, - score=max(0.0, min(1.0, float(payload.get("score", 0.0) or 0.0))), - issues=[str(item) for item in payload.get("issues") or []], - missing_requirements=[str(item) for item in payload.get("missing_requirements") or []], - evidence_gaps=[str(item) for item in payload.get("evidence_gaps") or []], - recommended_revision_prompt=str(payload.get("recommended_revision_prompt") or ""), - validator="llm", - ) - - @staticmethod - def _heuristic_validate(final_output: str) -> ValidationResult: - text = final_output.strip() - if not text: - return ValidationResult( - passed=False, - score=0.0, - issues=["Assistant output is empty."], - missing_requirements=["A non-empty result is required."], - recommended_revision_prompt="Produce a complete, non-empty answer for the task.", - validator="heuristic", - ) - lowered = text.lower() - if "run failed before completion" in lowered or "tool loop stopped" in lowered: - return ValidationResult( - passed=False, - score=0.35, - issues=["The run did not complete cleanly."], - missing_requirements=["A successful final result is required."], - recommended_revision_prompt="Retry the task and address the failure before returning the final answer.", - validator="heuristic", - ) - return ValidationResult(passed=True, score=0.85, validator="heuristic") - - @staticmethod - def _parse_json_object(text: str) -> dict[str, Any]: - cleaned = text.strip() - if cleaned.startswith("```"): - cleaned = cleaned.strip("`") - if cleaned.lower().startswith("json"): - cleaned = cleaned[4:].strip() - start = cleaned.find("{") - end = cleaned.rfind("}") - if start >= 0 and end >= start: - cleaned = cleaned[start : end + 1] - payload = json.loads(cleaned) - if not isinstance(payload, dict): - raise ValueError("validator response must be a JSON object") - return payload diff --git a/app-instance/backend/tests/unit/test_context_builder.py b/app-instance/backend/tests/unit/test_context_builder.py new file mode 100644 index 0000000..42eb8ae --- /dev/null +++ b/app-instance/backend/tests/unit/test_context_builder.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from beaver.engine.context import ContextBuildInput, ContextBuilder, RuntimeContext, SessionContext + + +def test_context_builder_injects_current_date_and_time() -> None: + result = ContextBuilder().build_messages( + ContextBuildInput( + base_system_prompt="Follow user requests.", + current_user_input="今天几号?", + session_context=SessionContext(session_id="web:alpha", source="web", model="stub-model"), + runtime_context=RuntimeContext( + utc_datetime="2026-05-26T01:10:00+00:00", + local_datetime="2026-05-26T09:10:00+08:00", + timezone="Asia/Shanghai", + utc_offset="+08:00", + ), + ) + ) + + system_prompt = result.messages[0]["content"] + assert "# Current Date and Time" in system_prompt + assert "Current UTC time: 2026-05-26T01:10:00+00:00" in system_prompt + assert "Current local time: 2026-05-26T09:10:00+08:00" in system_prompt + assert "Local timezone: Asia/Shanghai" in system_prompt + assert "Local UTC offset: +08:00" in system_prompt + assert '"today", "tomorrow", "now", "this week", and "next month"' in system_prompt + assert result.messages[-1] == {"role": "user", "content": "今天几号?"} diff --git a/app-instance/backend/tests/unit/test_gateway_channels.py b/app-instance/backend/tests/unit/test_gateway_channels.py index 67cd739..2fbc1da 100644 --- a/app-instance/backend/tests/unit/test_gateway_channels.py +++ b/app-instance/backend/tests/unit/test_gateway_channels.py @@ -18,8 +18,8 @@ class FakeResult: model: str | None = "fake-model" usage: dict[str, Any] = field(default_factory=dict) task_id: str | None = "task-1" - task_status: str | None = "awaiting_feedback" - validation_result: dict[str, Any] | None = field(default_factory=lambda: {"accepted": True}) + task_status: str | None = "awaiting_acceptance" + validation_result: dict[str, Any] | None = None class FakeService: @@ -79,8 +79,9 @@ def test_gateway_routes_memory_channel_roundtrip() -> None: assert message.session_id == "s1" assert message.finish_reason == "stop" assert message.metadata["task_id"] == "task-1" - assert message.metadata["task_status"] == "awaiting_feedback" - assert message.metadata["validation_result"] == {"accepted": True} + assert message.metadata["task_status"] == "awaiting_acceptance" + assert message.metadata["evidence_status"] == "recorded" + assert message.metadata["validation_result"] is None stop_event.set() await asyncio.wait_for(task, timeout=2) diff --git a/app-instance/backend/tests/unit/test_litellm_thinking_mode.py b/app-instance/backend/tests/unit/test_litellm_thinking_mode.py index 97977ea..f3a547e 100644 --- a/app-instance/backend/tests/unit/test_litellm_thinking_mode.py +++ b/app-instance/backend/tests/unit/test_litellm_thinking_mode.py @@ -113,6 +113,19 @@ def test_litellm_provider_preserves_reasoning_content_for_tool_round_trip() -> N assert LiteLLMProvider._sanitize_messages(messages)[0]["reasoning_content"] == "must be passed back" +def test_litellm_provider_merges_late_system_messages_to_front() -> None: + messages = [ + {"role": "system", "content": "base"}, + {"role": "user", "content": "question"}, + {"role": "system", "content": "finalize without tools"}, + ] + + sanitized = LiteLLMProvider._sanitize_messages(messages) + + assert [message["role"] for message in sanitized] == ["system", "user"] + assert sanitized[0]["content"] == "base\n\nfinalize without tools" + + def test_thinking_mode_is_forced_disabled_even_when_requested_enabled(monkeypatch: pytest.MonkeyPatch) -> None: captured: dict = {} diff --git a/app-instance/backend/tests/unit/test_main_agent_router.py b/app-instance/backend/tests/unit/test_main_agent_router.py index 65627ec..7100a83 100644 --- a/app-instance/backend/tests/unit/test_main_agent_router.py +++ b/app-instance/backend/tests/unit/test_main_agent_router.py @@ -79,7 +79,7 @@ def _task() -> TaskRecord: goal="实现任务连续性", constraints=[], priority=0, - status="awaiting_feedback", + status="awaiting_acceptance", creator="test", created_at="now", updated_at="now", diff --git a/app-instance/backend/tests/unit/test_phase5_skills_runtime.py b/app-instance/backend/tests/unit/test_phase5_skills_runtime.py index 08a6242..a850473 100644 --- a/app-instance/backend/tests/unit/test_phase5_skills_runtime.py +++ b/app-instance/backend/tests/unit/test_phase5_skills_runtime.py @@ -35,6 +35,7 @@ class StubProvider(LLMProvider): model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7, + thinking_enabled: bool | None = None, ) -> LLMResponse: if not self._responses: raise AssertionError("No stubbed provider responses left") @@ -47,11 +48,22 @@ class StubProvider(LLMProvider): class StubSkillAssembler: def __init__(self, activated_skills: list[SkillContext]) -> None: self.activated_skills = activated_skills + self.calls: list[dict] = [] async def assemble(self, **kwargs) -> SkillAssemblyResult: + self.calls.append(kwargs) return SkillAssemblyResult(activated_skills=list(self.activated_skills)) +class RecordingToolAssembler: + def __init__(self) -> None: + self.calls: list[dict] = [] + + async def assemble(self, **kwargs): + self.calls.append(kwargs) + return kwargs["registry"].get_specs(["memory"]) + + def _tool_call(*, name: str = "echo", arguments: dict | None = None, call_id: str = "call-1") -> SimpleNamespace: return SimpleNamespace( id=call_id, @@ -576,6 +588,48 @@ def test_agent_loop_records_skill_receipts_and_effects(tmp_path: Path) -> None: assert effect_records[-1].run_id == result.run_id +def test_thinking_disabled_still_uses_skill_and_tool_assembly(tmp_path: Path) -> None: + skill = SkillContext( + name="docker-debug", + content="Use docker logs before editing config.", + version="v0007", + content_hash="hash-v7", + activation_reason="llm_selected", + tool_hints=["terminal"], + ) + skill_assembler = StubSkillAssembler([skill]) + tool_assembler = RecordingToolAssembler() + loader = EngineLoader( + workspace=tmp_path, + skill_assembler=skill_assembler, + tool_assembler=tool_assembler, + ) + loop = AgentLoop(loader=loader) + bundle = ProviderBundle( + main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), + main_provider=StubProvider( + [LLMResponse(content="Done", finish_reason="stop", provider_name="stub", model="stub-model")] + ), + ) + + result = asyncio.run( + loop.process_direct( + "Why is the Docker container crashing?", + provider_bundle=bundle, + thinking_enabled=False, + ) + ) + loaded = loop.boot() + events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id) + tool_selection = next(event for event in events if event.event_type == "tool_selection_snapshotted") + + assert skill_assembler.calls + assert skill_assembler.calls[0]["thinking_enabled"] is False + assert tool_assembler.calls + assert [skill.name for skill in tool_assembler.calls[0]["activated_skills"]] == ["docker-debug"] + assert tool_selection.event_payload["tool_names"] == ["memory"] + + def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path: Path) -> None: skill = SkillContext( name="docker-debug", @@ -635,6 +689,52 @@ def test_agent_loop_records_max_tool_iterations_as_failed_skill_effect(tmp_path: assert effect_records[-1].success is False +def test_agent_loop_suppresses_raw_tool_call_when_finalizing_after_tool_limit(tmp_path: Path) -> None: + loader = EngineLoader( + workspace=tmp_path, + skill_assembler=StubSkillAssembler([]), + ) + loop = AgentLoop(loader=loader) + bundle = ProviderBundle( + main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), + main_provider=StubProvider( + [ + LLMResponse( + content="Need a tool.", + finish_reason="tool_calls", + tool_calls=[_tool_call()], + provider_name="stub", + model="stub-model", + ), + LLMResponse( + content=( + "\n" + "\n" + "https://example.com\n" + "\n" + "" + ), + finish_reason="stop", + provider_name="stub", + model="stub-model", + ), + ] + ), + ) + + result = asyncio.run( + loop.process_direct( + "Fetch the latest result", + provider_bundle=bundle, + max_tool_iterations=0, + ) + ) + + assert result.finish_reason == "max_tool_iterations" + assert "" not in result.output_text + assert "raw tool call was suppressed" in result.output_text + + def test_llm_request_snapshot_defaults_to_compact_payload(tmp_path: Path) -> None: loop = AgentLoop(loader=EngineLoader(workspace=tmp_path, skill_assembler=StubSkillAssembler([]))) bundle = ProviderBundle( diff --git a/app-instance/backend/tests/unit/test_process_projection.py b/app-instance/backend/tests/unit/test_process_projection.py index c7e7faf..0b96175 100644 --- a/app-instance/backend/tests/unit/test_process_projection.py +++ b/app-instance/backend/tests/unit/test_process_projection.py @@ -101,12 +101,11 @@ def test_process_projection_maps_task_team_events(tmp_path: Path) -> None: "web:test", run_id="main-run", role="system", - event_type="task_validation_snapshotted", + event_type="task_evidence_recorded", event_payload={ "task_id": "task-1", "attempt_index": 1, - "validation_result": {"accepted": True, "score": 0.9}, - "retry_scheduled": False, + "evidence_status": "recorded", }, context_visible=False, ) @@ -121,7 +120,7 @@ def test_process_projection_maps_task_team_events(tmp_path: Path) -> None: assert sub_run["metadata"]["selected_skill_names"] == ["research-workflow"] assert sub_run["metadata"]["skill_query"] == "research workflow" assert sub_run["metadata"]["ephemeral_guidance_id"] is None - assert any(event["actor_name"] == "Validator" for event in projection["events"]) + assert any(event["actor_name"] == "Evidence" for event in projection["events"]) assert any(run["session_id"] == "web:test" for run in projection["runs"]) diff --git a/app-instance/backend/tests/unit/test_task_mode_feedback.py b/app-instance/backend/tests/unit/test_task_mode_feedback.py index 39273de..90b59a2 100644 --- a/app-instance/backend/tests/unit/test_task_mode_feedback.py +++ b/app-instance/backend/tests/unit/test_task_mode_feedback.py @@ -4,23 +4,17 @@ import asyncio from pathlib import Path from types import SimpleNamespace -import pytest - -from beaver.coordinator import AgentDescriptor, ExecutionGraph, ExecutionNode from beaver.engine import EngineLoader -from beaver.engine.context.builder import ContextBuilder, ContextBuildInput from beaver.engine.providers.base import LLMProvider, LLMResponse from beaver.engine.providers.factory import ProviderBundle from beaver.services.agent_service import AgentService -from beaver.skills.assembler import SkillAssemblyResult -from beaver.tasks import TaskExecutionPlan, TaskRecord, TaskService, ValidationResult, ValidationService +from beaver.tasks import TaskExecutionPlan, TaskService class StubProvider(LLMProvider): def __init__(self, responses: list[LLMResponse]) -> None: super().__init__() self._responses = list(responses) - self.calls: list[dict[str, object]] = [] async def chat( self, @@ -30,7 +24,6 @@ class StubProvider(LLMProvider): max_tokens: int = 4096, temperature: float = 0.7, ) -> LLMResponse: - self.calls.append({"messages": messages, "tools": tools, "model": model}) if not self._responses: raise AssertionError("No stubbed provider responses left") return self._responses.pop(0) @@ -39,30 +32,9 @@ class StubProvider(LLMProvider): return "stub-model" -class StubValidationService: - def __init__(self, results: list[ValidationResult]) -> None: - self.results = list(results) - self.calls: list[dict] = [] - - async def validate_task_result(self, **kwargs) -> ValidationResult: - self.calls.append(kwargs) - if not self.results: - raise AssertionError("No stubbed validation results left") - return self.results.pop(0) - - class StubTaskExecutionPlanner: - def __init__(self, plans: list[TaskExecutionPlan] | None = None) -> None: - self.plans = list(plans or [TaskExecutionPlan.single("test-single")]) - self.calls = [] - async def plan(self, **kwargs) -> TaskExecutionPlan: - self.calls.append(kwargs) - if len(self.plans) == 1: - return self.plans[0] - if not self.plans: - raise AssertionError("No stubbed execution plans left") - return self.plans.pop(0) + return TaskExecutionPlan.single("test-single") class FakeLearningCandidate: @@ -70,15 +42,6 @@ class FakeLearningCandidate: return {"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"} -class RecordingSkillAssembler: - def __init__(self) -> None: - self.task_descriptions: list[str] = [] - - async def assemble(self, **kwargs) -> SkillAssemblyResult: - self.task_descriptions.append(kwargs["task_description"]) - return SkillAssemblyResult() - - def _route_response(action: str = "new_task", short_title: str = "Test task") -> LLMResponse: return LLMResponse( content=f'{{"action":"{action}","reason":"test route","short_title":"{short_title}"}}', @@ -107,828 +70,157 @@ def _bundle(*responses: str, route_action: str = "new_task") -> ProviderBundle: ) -def _single_planner() -> StubTaskExecutionPlanner: - return StubTaskExecutionPlanner([TaskExecutionPlan.single("test-single")]) - - -def _team_plan(strategy: str = "sequence") -> TaskExecutionPlan: - return TaskExecutionPlan( - mode="team", - reason="test-team", - graph=ExecutionGraph( - strategy=strategy, # type: ignore[arg-type] - nodes=[ - ExecutionNode( - node_id="research", - task="research implementation options", - agent=AgentDescriptor(name="researcher", role="research"), - ) - ], - ), - final_synthesis_instruction="Use the sub-agent result to produce the final answer.", - ) - - -def _provider_bundle(provider: StubProvider) -> ProviderBundle: - return ProviderBundle( - main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), - main_provider=provider, - auxiliary_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), - auxiliary_provider=StubProvider([_route_response("new_task")]), - ) - - -def _main_only_bundle(*responses: str) -> ProviderBundle: - return ProviderBundle( - main_runtime=SimpleNamespace(model="stub-model", provider_name="stub"), - main_provider=StubProvider( - [ - LLMResponse( - content=response, - finish_reason="stop", - provider_name="stub", - model="stub-model", - ) - for response in responses - ] - ), - ) - - -def _task_record(status: str) -> TaskRecord: - return TaskRecord( - task_id="task-1", - session_id="session-1", - description="test task", - goal="test task", - constraints=[], - priority=0, - status=status, - creator="main-agent", - created_at="2026-05-22T00:00:00+00:00", - updated_at="2026-05-22T00:00:00+00:00", - ) - - -def test_simple_question_does_not_create_task(tmp_path: Path) -> None: +def test_task_run_records_evidence_and_waits_for_acceptance(tmp_path: Path) -> None: service = AgentService( loader=EngineLoader( workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService([]), + task_execution_planner=StubTaskExecutionPlanner(), ) ) result = asyncio.run( service.process_direct( - "hello?", - session_id="web:simple", - provider_bundle=_bundle("hi", route_action="simple_chat"), - ) - ) - loaded = service.create_loop().boot() - - assert result.task_id is None - assert loaded.task_service.store.list_tasks() == [] - - -def test_complex_request_creates_task_and_records_validation(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ValidationResult(passed=True, score=0.9, validator="test")] - ), + "draft release notes", + session_id="web:test", + provider_bundle=_bundle("Done"), ) ) - result = asyncio.run( - service.process_direct( - "implement the new report workflow", - session_id="web:task", - provider_bundle=_bundle("implemented"), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task_by_run_id(result.run_id) - events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id) - run_record = loaded.run_memory_store.list_runs()[-1] - skill_effects = next(event for event in events if event.event_type == "skill_effects_snapshotted") - - assert result.task_id is not None + task_service = service.create_loop().boot().task_service + assert task_service is not None + task = task_service.get_task(result.task_id or "") assert task is not None - assert task.status == "awaiting_feedback" - assert any(event.event_type == "task_validation_snapshotted" for event in events) - assert run_record.task_id == result.task_id - assert run_record.validation_result["accepted"] is True - assert skill_effects.event_payload["candidate_generation_allowed"] is False - assert skill_effects.event_payload["learning_candidates"] == [] - assert task.metadata["short_title"] == "Test task" + assert task.status == "awaiting_acceptance" + assert task.validation_result is None + assert result.validation_result is None + + event_types = [event.event_type for event in task_service.list_events(task.task_id)] + assert "evidence_recorded" in event_types + assert "validated" not in event_types -def test_task_mode_uses_task_aware_skill_selection_context(tmp_path: Path) -> None: - skill_assembler = RecordingSkillAssembler() +def test_acceptance_closes_task_and_triggers_learning(tmp_path: Path) -> None: service = AgentService( loader=EngineLoader( workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ValidationResult(passed=True, score=1.0, validator="test")] - ), - skill_assembler=skill_assembler, - ) - ) - - result = asyncio.run( - service.process_direct( - "继续按刚才的方案改", - session_id="web:task-skill-query", - provider_bundle=_bundle("done", route_action="new_task"), - ) - ) - - assert result.task_id - assert skill_assembler.task_descriptions - query = skill_assembler.task_descriptions[0] - assert "Task goal:" in query - assert "Current user request:" in query - assert "Previously activated skills:" in query - assert "If no published skill matches, return []" in query - - -def test_active_task_continues_until_llm_closes_it(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ - ValidationResult(passed=True, score=0.9, validator="test"), - ValidationResult(passed=True, score=0.9, validator="test"), - ] - ), - ) - ) - - first = asyncio.run( - service.process_direct( - "implement the search workflow", - session_id="web:continue", - provider_bundle=_bundle("first done", route_action="new_task"), - ) - ) - second = asyncio.run( - service.process_direct( - "also add tests for it", - session_id="web:continue", - provider_bundle=_bundle("tests added", route_action="continue_task"), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(first.task_id) - - assert task is not None - assert second.task_id == first.task_id - assert len(task.run_ids) == 2 - - closed = asyncio.run( - service.process_direct( - "这个任务结束了", - session_id="web:continue", - provider_bundle=_bundle("好的,已结束。", route_action="close_task"), - ) - ) - task = loaded.task_service.get_task(first.task_id) - - assert closed.task_id is None - assert task is not None - assert task.status == "closed" - assert loaded.task_service.active_task_view("web:continue") is None - - -def test_active_task_revision_input_records_feedback_and_reruns(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ - ValidationResult(passed=True, score=0.9, validator="test"), - ValidationResult(passed=True, score=0.95, validator="test"), - ] - ), - ) - ) - - first = asyncio.run( - service.process_direct( - "查询珠海天气", - session_id="web:revise-direct", - provider_bundle=_bundle("珠海天气概览", route_action="new_task"), - ) - ) - second = asyncio.run( - service.process_direct( - "再详细一点,并加上明后天穿衣建议", - session_id="web:revise-direct", - provider_bundle=_bundle("更新后的珠海天气和穿衣建议", route_action="revise_task"), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(first.task_id) - messages = loaded.session_manager.get_messages_as_conversation(first.session_id) - first_assistant = [ - message - for message in messages - if message.get("role") == "assistant" and message.get("run_id") == first.run_id - ][-1] - user_messages = [message.get("content") for message in messages if message.get("role") == "user"] - - assert second.task_id == first.task_id - assert task is not None - assert task.status == "awaiting_feedback" - assert len(task.run_ids) == 2 - assert task.feedback == [ - { - "feedback_type": "revise", - "comment": "再详细一点,并加上明后天穿衣建议", - "run_id": first.run_id, - "created_at": task.feedback[0]["created_at"], - } - ] - assert first_assistant["feedback_state"] == "revise" - assert "再详细一点,并加上明后天穿衣建议" in user_messages - - -def test_explicit_revision_feedback_then_input_reruns_without_duplicate_feedback(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ - ValidationResult(passed=True, score=0.9, validator="test"), - ValidationResult(passed=True, score=0.95, validator="test"), - ] - ), - ) - ) - - first = asyncio.run( - service.process_direct( - "查询珠海天气", - session_id="web:explicit-revise", - provider_bundle=_bundle("珠海天气概览", route_action="new_task"), - ) - ) - feedback = asyncio.run( - service.submit_feedback( - session_id=first.session_id, - run_id=first.run_id, - feedback_type="revise", - comment="准备补充穿衣建议", - ) - ) - second = asyncio.run( - service.process_direct( - "加上明后天穿衣建议", - session_id="web:explicit-revise", - provider_bundle=_bundle("更新后的珠海天气和穿衣建议", route_action="revise_task"), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(first.task_id) - - assert feedback["task_status"] == "needs_revision" - assert second.task_id == first.task_id - assert task is not None - assert task.status == "awaiting_feedback" - assert len(task.run_ids) == 2 - assert len(task.feedback) == 1 - assert task.feedback[0]["feedback_type"] == "revise" - assert task.feedback[0]["comment"] == "准备补充穿衣建议" - - -def test_validation_result_status_drives_accepted_and_passed() -> None: - accepted = ValidationResult(status="accepted", score=0.9, validator="test") - insufficient = ValidationResult(status="insufficient_evidence", score=0.9, validator="test") - rejected = ValidationResult(status="rejected", score=0.9, validator="test") - - assert accepted.passed is True - assert accepted.accepted is True - assert insufficient.passed is False - assert insufficient.accepted is False - assert rejected.passed is False - assert rejected.accepted is False - - -def test_validation_result_from_legacy_payload_maps_to_status() -> None: - accepted = ValidationResult.from_dict({"passed": True, "score": 0.9, "validator": "legacy"}) - low_score = ValidationResult.from_dict({"passed": True, "score": 0.7, "validator": "legacy"}) - rejected = ValidationResult.from_dict({"passed": False, "score": 0.2, "validator": "legacy"}) - - assert accepted is not None - assert accepted.status == "accepted" - assert low_score is not None - assert low_score.status == "rejected" - assert rejected is not None - assert rejected.status == "rejected" - - -def test_validation_result_rejects_unknown_status() -> None: - with pytest.raises(ValueError, match="unknown validation status"): - ValidationResult(status="pending", score=0.9, validator="test") # type: ignore[arg-type] - - -def test_validation_result_from_dict_rejects_unknown_explicit_status() -> None: - with pytest.raises(ValueError, match="unknown validation status"): - ValidationResult.from_dict({"status": "pending", "passed": True, "score": 0.9}) - - -def test_validation_result_evidence_gaps_round_trip() -> None: - validation = ValidationResult( - status="insufficient_evidence", - score=0.4, - evidence_gaps=["missing command output", "missing file reference"], - validator="test", - ) - - restored = ValidationResult.from_dict(validation.to_dict()) - - assert restored is not None - assert restored.status == "insufficient_evidence" - assert restored.evidence_gaps == ["missing command output", "missing file reference"] - assert restored.to_dict()["evidence_gaps"] == ["missing command output", "missing file reference"] - - -def test_task_record_status_helpers_distinguish_review_and_failed() -> None: - needs_review = _task_record("needs_review") - failed = _task_record("failed") - - assert needs_review.is_open is True - assert needs_review.is_execution_active is False - assert needs_review.requires_user_action is True - assert failed.is_open is False - assert failed.is_execution_active is False - assert failed.requires_user_action is False - - -def test_task_service_api_payload_emits_status_helpers(tmp_path: Path) -> None: - service = TaskService(tmp_path) - task = _task_record("needs_review") - - payload = service.to_api_dict(task) - - assert payload["is_open"] is True - assert payload["is_execution_active"] is False - assert payload["requires_user_action"] is True - - -def test_validation_failure_retries_once(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ - ValidationResult( - passed=False, - score=0.2, - issues=["missing tests"], - recommended_revision_prompt="Add tests before final response.", - validator="test", - ), - ValidationResult(passed=True, score=0.88, validator="test"), - ] - ), - ) - ) - - result = asyncio.run( - service.process_direct( - "implement and validate the task", - session_id="web:retry", - provider_bundle=_bundle("first draft", "revised draft"), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(result.task_id) - - assert result.output_text == "revised draft" - assert result.validation_result["accepted"] is True - assert task is not None - assert len(task.run_ids) == 2 - visible_messages = loaded.session_manager.get_messages_as_conversation(result.session_id) - visible_contents = [message.get("content") for message in visible_messages] - assert "first draft" not in visible_contents - assert "revised draft" in visible_contents - - -def test_feedback_closes_or_abandons_internal_task(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ValidationResult(passed=True, score=0.9, validator="test")] - ), + task_execution_planner=StubTaskExecutionPlanner(), ) ) result = asyncio.run( service.process_direct( - "implement feedback handling", - session_id="web:feedback", - provider_bundle=_bundle("done"), + "write implementation plan", + session_id="web:acceptance", + provider_bundle=_bundle("Plan"), ) ) - loaded = service.create_loop().boot() - learning_calls = [] - def build_learning_candidates_for_task(task_id: str, *, trigger_run_id: str) -> list[FakeLearningCandidate]: - learning_calls.append((task_id, trigger_run_id)) + loaded = service.create_loop().boot() + generated: list[tuple[str, str]] = [] + + def build_learning_candidates_for_task( + task_id: str, + *, + final_accepted_run_id: str | None = None, + trigger_run_id: str | None = None, + ) -> list[FakeLearningCandidate]: + generated.append((task_id, final_accepted_run_id or trigger_run_id or "")) return [FakeLearningCandidate()] loaded.skill_learning_service.build_learning_candidates_for_task = build_learning_candidates_for_task - feedback = asyncio.run( - service.submit_feedback( - session_id=result.session_id, + response = asyncio.run( + service.submit_acceptance( + session_id="web:acceptance", run_id=result.run_id, - feedback_type="satisfied", + acceptance_type="accept", ) ) - assert feedback["task_status"] == "closed" - assert feedback["learning_candidates"] == [ + assert response["task_status"] == "closed" + assert response["acceptance_type"] == "accept" + assert response["learning_candidates"] == [ {"candidate_id": "candidate-1", "kind": "new_skill", "status": "open"} ] - assert learning_calls == [(result.task_id, result.run_id)] + assert generated == [(result.task_id, result.run_id)] - service2 = AgentService( - loader=EngineLoader( - workspace=tmp_path / "abandon", - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ - ValidationResult(passed=False, score=0.3, validator="test"), - ValidationResult(passed=False, score=0.3, validator="test"), - ] - ), - ) - ) - abandoned = asyncio.run( - service2.process_direct( - "implement another workflow", - session_id="web:abandon", - provider_bundle=_bundle("not enough", "still not enough"), - ) - ) - abandon_feedback = asyncio.run( - service2.submit_feedback( - session_id=abandoned.session_id, - run_id=abandoned.run_id, - feedback_type="abandon", - comment="too costly", - ) - ) - - assert abandon_feedback["task_status"] == "abandoned" - assert abandon_feedback["learning_candidates"] == [] - loaded2 = service2.create_loop().boot() - failure_events = [ - event - for event in loaded2.session_manager.get_run_event_records(abandoned.session_id, abandoned.run_id) - if event.event_type == "task_failure_evidence_recorded" - ] - assert len(failure_events) == 1 - assert loaded2.memory_service.get_store().memory_entries == [] + task_service = loaded.task_service + assert task_service is not None + task = task_service.get_task(result.task_id or "") + assert task is not None + assert task.metadata["final_accepted_run_id"] == result.run_id -def test_feedback_is_idempotent_and_projected_to_assistant_message(tmp_path: Path) -> None: +def test_revise_and_abandon_do_not_trigger_learning(tmp_path: Path) -> None: service = AgentService( loader=EngineLoader( workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ValidationResult(passed=True, score=0.9, validator="test")] - ), + task_execution_planner=StubTaskExecutionPlanner(), ) ) result = asyncio.run( service.process_direct( - "implement feedback projection", - session_id="web:feedback-projection", - provider_bundle=_bundle("done"), + "summarize notes", + session_id="web:revise", + provider_bundle=_bundle("Summary"), ) ) - loaded = service.create_loop().boot() - first = asyncio.run( - service.submit_feedback( - session_id=result.session_id, + response = asyncio.run( + service.submit_acceptance( + session_id="web:revise", run_id=result.run_id, - feedback_type="satisfied", + acceptance_type="revise", + comment="Add decisions", ) ) - second = asyncio.run( + + assert response["task_status"] == "needs_revision" + assert response["learning_candidates"] == [] + + task_service = service.create_loop().boot().task_service + assert task_service is not None + task = task_service.get_task(result.task_id or "") + assert task is not None + assert task.feedback[0]["acceptance_type"] == "revise" + + +def test_legacy_feedback_endpoint_maps_satisfied_to_accept(tmp_path: Path) -> None: + service = AgentService( + loader=EngineLoader( + workspace=tmp_path, + task_execution_planner=StubTaskExecutionPlanner(), + ) + ) + result = asyncio.run( + service.process_direct( + "prepare checklist", + session_id="web:legacy", + provider_bundle=_bundle("Checklist"), + ) + ) + + response = asyncio.run( service.submit_feedback( - session_id=result.session_id, + session_id="web:legacy", run_id=result.run_id, feedback_type="satisfied", ) ) - feedback_events = [ - event - for event in loaded.session_manager.get_run_event_records(result.session_id, result.run_id) - if event.event_type == "task_feedback_recorded" - ] - assistant = [ - message - for message in loaded.session_manager.get_messages_as_conversation(result.session_id) - if message.get("role") == "assistant" and message.get("run_id") == result.run_id - ][-1] - - assert first["task_status"] == "closed" - assert second["task_status"] == "closed" - assert len(feedback_events) == 1 - assert assistant["feedback_state"] == "satisfied" - assert assistant["task_status"] == "closed" - assert assistant["validation_status"] == "passed" - - with pytest.raises(ValueError, match="already recorded"): - asyncio.run( - service.submit_feedback( - session_id=result.session_id, - run_id=result.run_id, - feedback_type="abandon", - ) - ) - - task = loaded.task_service.get_task(result.task_id) - assert task is not None - assert task.status == "closed" + assert response["acceptance_type"] == "accept" + assert response["feedback_type"] == "satisfied" + assert response["task_status"] == "closed" -def test_task_mode_team_plan_runs_subagent_then_main_synthesis(tmp_path: Path) -> None: - main_provider = StubProvider( - [ - LLMResponse(content="final synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model") - ] - ) - sub_provider = StubProvider( - [ - LLMResponse(content="sub-agent evidence", finish_reason="stop", provider_name="stub", model="stub-model") - ] - ) - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=StubTaskExecutionPlanner([_team_plan()]), - validation_service=StubValidationService([ValidationResult(passed=True, score=0.9, validator="test")]), - ) - ) +def test_task_service_maps_legacy_status_and_feedback(tmp_path: Path) -> None: + service = TaskService(tmp_path) + task = service.create_task(session_id="s", description="legacy") + task.status = "awaiting_feedback" + task.feedback.append({"feedback_type": "satisfied", "run_id": "run-1"}) + service.store.upsert_task(task) - result = asyncio.run( - service.process_direct( - "implement team-backed workflow", - session_id="web:team", - provider_bundle=_provider_bundle(main_provider), - team_provider_bundle_factory=lambda node: _provider_bundle(sub_provider), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(result.task_id) - events = loaded.session_manager.get_event_records(result.session_id) + loaded = service.get_task(task.task_id) - assert result.output_text == "final synthesized answer" - assert task is not None - assert len(task.run_ids) == 2 - assert result.run_id == task.run_ids[-1] - assert any(event.event_type == "task_execution_planned" for event in events) - assert any(event.event_type == "task_team_run_completed" for event in events) - assert "sub-agent evidence" in main_provider.calls[0]["messages"][0]["content"] - assert "sub-agent evidence" != result.output_text - - -def test_task_mode_team_synthesis_runs_without_tools_and_receives_evidence(tmp_path: Path) -> None: - main_provider = StubProvider( - [ - LLMResponse(content="final synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model") - ] - ) - sub_provider = StubProvider( - [ - LLMResponse(content="sub-agent evidence", finish_reason="stop", provider_name="stub", model="stub-model") - ] - ) - validation = StubValidationService([ValidationResult(status="accepted", score=0.9, validator="test")]) - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=StubTaskExecutionPlanner([_team_plan()]), - validation_service=validation, - ) - ) - - result = asyncio.run( - service.process_direct( - "implement team-backed workflow", - session_id="web:team-no-tools", - provider_bundle=_provider_bundle(main_provider), - team_provider_bundle_factory=lambda node: _provider_bundle(sub_provider), - ) - ) - - assert result.output_text == "final synthesized answer" - assert main_provider.calls[0]["tools"] is None - assert "sub-agent evidence" in main_provider.calls[0]["messages"][0]["content"] - assert "Task evidence packet" in validation.calls[0]["evidence_text"] - - -def test_task_mode_team_failure_still_uses_main_synthesis(tmp_path: Path) -> None: - main_provider = StubProvider( - [ - LLMResponse(content="fallback synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model") - ] - ) - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=StubTaskExecutionPlanner([_team_plan()]), - validation_service=StubValidationService([ValidationResult(passed=True, score=0.9, validator="test")]), - ) - ) - - result = asyncio.run( - service.process_direct( - "implement workflow despite team failure", - session_id="web:team-failure", - provider_bundle=_provider_bundle(main_provider), - team_provider_bundle_factory=lambda node: (_ for _ in ()).throw(RuntimeError("sub-agent unavailable")), - ) - ) - loaded = service.create_loop().boot() - events = loaded.session_manager.get_event_records(result.session_id) - - assert result.output_text == "fallback synthesized answer" - assert any(event.event_type == "task_team_run_failed" for event in events) - assert "sub-agent unavailable" in main_provider.calls[0]["messages"][0]["content"] - assert "same class of tools fails repeatedly" in main_provider.calls[0]["messages"][0]["content"] - assert "user-visible fallback answer" in main_provider.calls[0]["messages"][0]["content"] - - -def test_insufficient_evidence_moves_task_to_needs_review(tmp_path: Path) -> None: - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=_single_planner(), - validation_service=StubValidationService( - [ - ValidationResult( - status="insufficient_evidence", - score=0.4, - evidence_gaps=["source missing"], - validator="test", - ) - ] - ), - ) - ) - - result = asyncio.run( - service.process_direct( - "answer with uncertain evidence", - session_id="web:needs-review", - provider_bundle=_bundle("possible answer"), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(result.task_id) - events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id) - validation_event = next(event for event in events if event.event_type == "task_validation_snapshotted") - - assert task is not None - assert task.status == "needs_review" - assert task.requires_user_action is True - assert task.is_execution_active is False - assert validation_event.event_payload["validation_result"]["status"] == "insufficient_evidence" - assert validation_event.event_payload["retry_scheduled"] is False - assert validation_event.event_payload["validation_debug"]["tool_result_count"] >= 0 - - -def test_task_mode_team_retry_hides_first_synthesis_run(tmp_path: Path) -> None: - main_provider = StubProvider( - [ - LLMResponse(content="first synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model"), - LLMResponse(content="revised synthesized answer", finish_reason="stop", provider_name="stub", model="stub-model"), - ] - ) - sub_providers = [ - StubProvider([LLMResponse(content="first evidence", finish_reason="stop", provider_name="stub", model="stub-model")]), - StubProvider([LLMResponse(content="second evidence", finish_reason="stop", provider_name="stub", model="stub-model")]), - ] - service = AgentService( - loader=EngineLoader( - workspace=tmp_path, - task_execution_planner=StubTaskExecutionPlanner([_team_plan(), _team_plan()]), - validation_service=StubValidationService( - [ - ValidationResult(passed=False, score=0.2, recommended_revision_prompt="revise", validator="test"), - ValidationResult(passed=True, score=0.9, validator="test"), - ] - ), - ) - ) - - result = asyncio.run( - service.process_direct( - "implement and validate with team", - session_id="web:team-retry", - provider_bundle=_provider_bundle(main_provider), - team_provider_bundle_factory=lambda node: _provider_bundle(sub_providers.pop(0)), - ) - ) - loaded = service.create_loop().boot() - task = loaded.task_service.get_task(result.task_id) - visible = loaded.session_manager.get_messages_as_conversation(result.session_id) - visible_contents = [message.get("content") for message in visible] - run_records = {record.run_id: record for record in loaded.run_memory_store.list_runs()} - - assert result.output_text == "revised synthesized answer" - assert task is not None - assert len(task.run_ids) == 4 - assert "first synthesized answer" not in visible_contents - assert "revised synthesized answer" in visible_contents - for run_id in task.run_ids: - record = run_records[run_id] - events = loaded.session_manager.get_run_event_records(record.session_id, run_id) - skill_effects = [event for event in events if event.event_type == "skill_effects_snapshotted"] - assert skill_effects - assert skill_effects[-1].event_payload["candidate_generation_allowed"] is False - - -def test_context_builder_strips_ui_projection_fields_from_provider_history() -> None: - result = ContextBuilder().build_messages( - ContextBuildInput( - history=[ - { - "role": "assistant", - "content": "done", - "run_id": "run-1", - "task_id": "task-1", - "task_status": "closed", - "validation_status": "passed", - "feedback_state": "satisfied", - } - ], - ) - ) - - assistant = result.messages[-1] - assert assistant == {"role": "assistant", "content": "done"} - - -def test_context_builder_normalizes_persisted_tool_arguments() -> None: - result = ContextBuilder().build_messages( - ContextBuildInput( - history=[ - { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call-1", - "type": "function", - "function": { - "name": "cron", - "arguments": {"action": "add", "mode": "notification"}, - }, - } - ], - } - ], - ) - ) - - tool_call = result.messages[-1]["tool_calls"][0] - assert tool_call["function"]["arguments"] == '{"action": "add", "mode": "notification"}' - - -def test_llm_validator_parse_failure_is_not_accepted(tmp_path: Path) -> None: - task_service = TaskService(tmp_path / "tasks") - task = task_service.create_task(session_id="web:validator", description="implement validator handling") - validation = asyncio.run( - ValidationService().validate_task_result( - task=task, - user_message="implement validator handling", - final_output="done", - provider_bundle=_main_only_bundle("not json"), - ) - ) - - assert validation.accepted is False - assert validation.status == "validator_error" - assert validation.validator == "llm_error" - assert validation.issues + assert loaded is not None + assert loaded.status == "awaiting_acceptance" + assert loaded.feedback[0]["acceptance_type"] == "accept" diff --git a/app-instance/backend/tests/unit/test_websocket_chat.py b/app-instance/backend/tests/unit/test_websocket_chat.py index 4dabf5e..4bab30b 100644 --- a/app-instance/backend/tests/unit/test_websocket_chat.py +++ b/app-instance/backend/tests/unit/test_websocket_chat.py @@ -20,8 +20,8 @@ class StubRunResult: model: str | None = "stub-model" usage: dict[str, Any] = field(default_factory=lambda: {"total_tokens": 3}) task_id: str | None = "task-1" - task_status: str | None = "awaiting_feedback" - validation_result: dict[str, Any] | None = field(default_factory=lambda: {"accepted": True}) + task_status: str | None = "awaiting_acceptance" + validation_result: dict[str, Any] | None = None class StubAgentService(AgentService): @@ -101,9 +101,10 @@ def test_websocket_message_returns_chat_metadata_and_session_updated() -> None: assert message["session_id"] == "web:alpha" assert message["run_id"] == "run-1" assert message["task_id"] == "task-1" - assert message["task_status"] == "awaiting_feedback" - assert message["validation_result"] == {"accepted": True} - assert message["validation_status"] == "passed" + assert message["task_status"] == "awaiting_acceptance" + assert message["evidence_status"] == "recorded" + assert message["validation_result"] is None + assert "validation_status" not in message assert message["metadata"]["input_metadata"] == { "source": "test", "attachments": [{"file_id": "file-1", "name": "a.txt"}], diff --git a/app-instance/frontend/app/(app)/page.tsx b/app-instance/frontend/app/(app)/page.tsx index de05fc8..5fbda45 100644 --- a/app-instance/frontend/app/(app)/page.tsx +++ b/app-instance/frontend/app/(app)/page.tsx @@ -19,7 +19,7 @@ import { uploadFile, wsManager, } from '@/lib/api'; -import { mergeServerWithPendingUsers, shouldMergePendingUsers } from '@/lib/chat-messages'; +import { mergeServerWithPendingUsers, shouldDisplayChatMessage, shouldMergePendingUsers } from '@/lib/chat-messages'; import { pickAppText } from '@/lib/i18n/core'; import { useAppI18n } from '@/lib/i18n/provider'; import { buildSessionProgressView } from '@/lib/session-progress'; @@ -32,7 +32,7 @@ function isSessionUpdatedEvent(data: WsEvent | Record): data is function activeTaskStatusLabel(status: string, locale: 'zh-CN' | 'en-US') { if (status === 'needs_revision') return pickAppText(locale, '待修改', 'Needs revision'); - if (status === 'awaiting_feedback') return pickAppText(locale, '待反馈', 'Awaiting feedback'); + if (status === 'awaiting_acceptance') return pickAppText(locale, '待验收', 'Awaiting acceptance'); if (status === 'running') return pickAppText(locale, '进行中', 'Running'); return pickAppText(locale, '进行中', 'Active'); } @@ -157,10 +157,11 @@ export default function ChatPage() { setSessionProcess(key, process); } void loadActiveTask(key); - const shouldMergePending = shouldMergePendingUsers(detail.messages, localSnapshot, waitingForReply); + const displayMessages = detail.messages.filter(shouldDisplayChatMessage); + const shouldMergePending = shouldMergePendingUsers(displayMessages, localSnapshot, waitingForReply); const nextMessages = shouldMergePending - ? mergeServerWithPendingUsers(detail.messages, localSnapshot) - : detail.messages; + ? mergeServerWithPendingUsers(displayMessages, localSnapshot) + : displayMessages; setMessages(nextMessages); shouldSnapToLatestRef.current = true; const last = nextMessages[nextMessages.length - 1]; @@ -217,15 +218,11 @@ export default function ChatPage() { if (data.type === 'status' && data.status === 'thinking') { setIsThinking(true); } else if (data.type === 'message' && data.role === 'assistant') { - const validationResult = data.validation_result ?? data.metadata?.validation_result; - const validationStatus = data.validation_status - ? data.validation_status - : validationResult - ? ((validationResult as Record).accepted === true ? 'passed' : 'failed') - : 'unknown'; setIsThinking(false); setIsLoading(false); - addMessage({ + const rawEvidenceStatus = data.evidence_status ?? data.metadata?.evidence_status; + const evidenceStatus = rawEvidenceStatus === 'recorded' ? 'recorded' : undefined; + const assistantMessage = { role: 'assistant', content: typeof data.content === 'string' ? data.content : '', timestamp: new Date().toISOString(), @@ -233,8 +230,11 @@ export default function ChatPage() { run_id: typeof data.run_id === 'string' ? data.run_id : undefined, task_id: data.task_id ?? data.metadata?.task_id ?? null, task_status: data.task_status ?? data.metadata?.task_status ?? null, - validation_status: validationStatus, - }); + evidence_status: evidenceStatus, + } as const; + if (shouldDisplayChatMessage(assistantMessage)) { + addMessage(assistantMessage); + } void loadSessionMessages(typeof data.session_id === 'string' ? data.session_id : useChatStore.getState().sessionId); void loadActiveTask(typeof data.session_id === 'string' ? data.session_id : useChatStore.getState().sessionId); loadSessions(); @@ -359,17 +359,18 @@ export default function ChatPage() { await loadSessions(); return; } - addMessage({ + const assistantMessage = { role: 'assistant', content: result.response, timestamp: new Date().toISOString(), run_id: result.run_id, task_id: result.task_id, task_status: result.task_status, - validation_status: result.validation_result - ? (result.validation_result.accepted === true ? 'passed' : 'failed') - : 'unknown', - }); + evidence_status: result.evidence_status === 'recorded' ? 'recorded' : undefined, + } as const; + if (shouldDisplayChatMessage(assistantMessage)) { + addMessage(assistantMessage); + } void getSessionProcess(sessionId).then((process) => setSessionProcess(sessionId, process)).catch(() => null); void loadActiveTask(sessionId); loadSessions(); @@ -393,7 +394,7 @@ export default function ChatPage() { } }, [addMessage, clearInputDraft, input, isLoading, loadActiveTask, loadSessionMessages, loadSessions, locale, pendingFiles, revisionTargetRunId, sessionId, setIsLoading, setIsThinking, setSessionProcess, thinkingModeEnabled, updateMessageFeedback]); - const handleFeedback = useCallback(async (runId: string, feedbackType: 'satisfied' | 'revise' | 'abandon', comment?: string) => { + const handleFeedback = useCallback(async (runId: string, feedbackType: 'accept' | 'revise' | 'abandon', comment?: string) => { updateMessageFeedback(runId, feedbackType); try { await submitChatFeedback({ diff --git a/app-instance/frontend/app/(app)/skills/page.tsx b/app-instance/frontend/app/(app)/skills/page.tsx index 4928b53..ba9cb04 100644 --- a/app-instance/frontend/app/(app)/skills/page.tsx +++ b/app-instance/frontend/app/(app)/skills/page.tsx @@ -1238,7 +1238,7 @@ function riskLabel(risk: string, t: (zh: string, en: string) => string): string function triggerReasonLabel(reason: string, t: (zh: string, en: string) => string): string { const labels: Record = { - validation_accepted_and_user_satisfied: t('任务验证通过且用户满意', 'Validation accepted and user satisfied'), + task_accepted: t('任务已接受', 'Task accepted'), }; return labels[reason] || reason; } diff --git a/app-instance/frontend/app/(app)/tasks/[taskId]/page.tsx b/app-instance/frontend/app/(app)/tasks/[taskId]/page.tsx index c069dcc..f9b365f 100644 --- a/app-instance/frontend/app/(app)/tasks/[taskId]/page.tsx +++ b/app-instance/frontend/app/(app)/tasks/[taskId]/page.tsx @@ -3,7 +3,7 @@ import Link from 'next/link'; import { useParams, useRouter } from 'next/navigation'; import React, { useMemo, useState } from 'react'; -import { AlertCircle, ArrowLeft, Bot, CheckCircle2, Download, FileText, HelpCircle, Loader2, MessageSquare, RefreshCw, ThumbsUp, Trash2, User, XCircle } from 'lucide-react'; +import { AlertCircle, ArrowLeft, Bot, CheckCircle2, Download, FileText, Loader2, MessageSquare, RefreshCw, ThumbsUp, Trash2, User, XCircle } from 'lucide-react'; import { TaskRuntimeStatusBadge, formatTaskRuntimeDuration, formatTaskRuntimeTime, progressPercent } from '@/components/task-runtime/TaskRuntimeShared'; import { Badge } from '@/components/ui/badge'; @@ -17,8 +17,9 @@ import { buildTaskRuntimeView, type TaskRuntimeNodeView } from '@/lib/task-runti import { useChatStore } from '@/lib/store'; import type { BackendTask, BackendTaskRun, ProcessArtifact, ProcessEvent, ProcessRun } from '@/types'; -type TaskFeedbackType = 'satisfied' | 'revise' | 'abandon'; +type TaskFeedbackType = 'accept' | 'revise' | 'abandon'; type TaskFeedbackItem = { + acceptance_type?: unknown; feedback_type?: unknown; comment?: unknown; created_at?: unknown; @@ -151,12 +152,6 @@ export default function TaskDetailPage() { const backendFeedbackRunId = backendTask ? pickFeedbackRunId(backendTask) : null; if (!task && backendTask) { - const validation = backendTask.validation_result; - const accepted = Boolean(validation?.accepted); - const validationIssues = [ - ...arrayOfStrings(validation?.issues), - ...arrayOfStrings(validation?.missing_requirements), - ]; const feedbackItems = backendTask.feedback || []; return (
@@ -232,57 +227,6 @@ export default function TaskDetailPage() { - - - {pickAppText(locale, '验证和反馈', 'Validation and feedback')} - - -
-
- {validation ? ( - accepted ? : - ) : ( - - )} -
- {validation - ? accepted - ? pickAppText(locale, '验证通过', 'Validation passed') - : pickAppText(locale, '需要继续修改', 'Needs revision') - : pickAppText(locale, '尚未验证', 'Not validated yet')} -
-
- {validation ? ( -
- {pickAppText(locale, '评分', 'Score')}: {String(validation.score ?? '-')} · {pickAppText(locale, '验证器', 'Validator')}: {String(validation.validator ?? '-')} -
- ) : null} - {validationIssues.length > 0 && ( -
    - {validationIssues.map((item, index) =>
  • {item}
  • )} -
- )} - {typeof validation?.recommended_revision_prompt === 'string' && validation.recommended_revision_prompt && ( -

{validation.recommended_revision_prompt}

- )} -
- -
-
{pickAppText(locale, '用户反馈', 'User feedback')}
- {feedbackItems.length === 0 ? ( -

{pickAppText(locale, '还没有用户反馈。', 'No user feedback yet.')}

- ) : ( - feedbackItems.map((item, index) => ( -
-
{humanFeedback(String(item.feedback_type || ''), locale)}
- {item.comment ?

{String(item.comment)}

: null} - {item.created_at ?

{formatTaskRuntimeTime(String(item.created_at), locale)}

: null} -
- )) - )} -
-
-
); } @@ -476,6 +420,7 @@ export default function TaskDetailPage() { comment, }); setRuntimeFeedback({ + acceptance_type: feedbackType, feedback_type: feedbackType, comment: comment || '', created_at: new Date().toISOString(), @@ -660,14 +605,14 @@ function TaskFeedbackPanel({ return ( - {pickAppText(locale, '任务反馈', 'Task feedback')} + {pickAppText(locale, '任务验收', 'Task acceptance')} {recordedFeedback ? (
- {pickAppText(locale, '已提交反馈', 'Feedback submitted')}: {humanFeedback(String(recordedFeedback.feedback_type || ''), locale)} + {pickAppText(locale, '已提交验收', 'Acceptance submitted')}: {humanFeedback(String(recordedFeedback.acceptance_type || recordedFeedback.feedback_type || ''), locale)}
{recordedFeedback.comment ? (

{String(recordedFeedback.comment)}

@@ -678,22 +623,22 @@ function TaskFeedbackPanel({
) : isFinalized ? (
- {pickAppText(locale, '任务已结束,不能再提交新的反馈。', 'This task is finalized and cannot accept new feedback.')} + {pickAppText(locale, '任务已结束,不能再提交新的验收。', 'This task is finalized and cannot accept new acceptance.')}
) : !runId ? (
- {pickAppText(locale, '暂无可反馈的运行记录。', 'No run is available for feedback yet.')} + {pickAppText(locale, '暂无可验收的运行记录。', 'No run is available for acceptance yet.')}
) : null}
} - label={pickAppText(locale, '满意', 'Satisfied')} + label={pickAppText(locale, '接受', 'Accept')} actionBusy={actionBusy} disabled={!canSubmit} - onClick={() => submit('satisfied', comment.trim() || undefined)} + onClick={() => submit('accept', comment.trim() || undefined)} /> setComment(event.target.value)} disabled={Boolean(recordedFeedback) || isFinalized || Boolean(actionBusy)} - placeholder={pickAppText(locale, '需要修改时写下具体要求;满意或放弃可选填说明。', 'Describe requested changes; notes are optional for satisfied or abandon.')} + placeholder={pickAppText(locale, '需要修改时写下具体要求;接受或放弃可选填说明。', 'Describe requested changes; notes are optional for accept or abandon.')} />
- {pickAppText(locale, '反馈将记录到当前任务运行:', 'Feedback will be recorded on run: ')} + {pickAppText(locale, '验收将记录到当前任务运行:', 'Acceptance will be recorded on run: ')} {runId || '-'} · {pickAppText(locale, '会话:', 'Session: ')} @@ -807,8 +752,7 @@ function humanTaskStatus(status: string, locale: 'zh-CN' | 'en-US') { const map: Record = { open: ['已创建', 'Open'], running: ['执行中', 'Running'], - validating: ['验证中', 'Validating'], - awaiting_feedback: ['等待反馈', 'Awaiting feedback'], + awaiting_acceptance: ['等待验收', 'Awaiting acceptance'], needs_revision: ['需要修改', 'Needs revision'], closed: ['已完成', 'Closed'], abandoned: ['已放弃', 'Abandoned'], @@ -818,10 +762,10 @@ function humanTaskStatus(status: string, locale: 'zh-CN' | 'en-US') { } function humanFeedback(type: string, locale: 'zh-CN' | 'en-US') { - if (type === 'satisfied') return pickAppText(locale, '满意', 'Satisfied'); + if (type === 'accept' || type === 'satisfied') return pickAppText(locale, '接受', 'Accepted'); if (type === 'revise') return pickAppText(locale, '请求修改', 'Revision requested'); if (type === 'abandon') return pickAppText(locale, '放弃任务', 'Abandoned'); - return type || pickAppText(locale, '反馈', 'Feedback'); + return type || pickAppText(locale, '验收', 'Acceptance'); } function humanFinishReason(reason: string, locale: 'zh-CN' | 'en-US') { @@ -848,7 +792,3 @@ function feedbackForRun(items: TaskFeedbackItem[], runId: string | null): TaskFe function latestFeedback(items: TaskFeedbackItem[]): TaskFeedbackItem | null { return [...items].reverse()[0] ?? null; } - -function arrayOfStrings(value: unknown): string[] { - return Array.isArray(value) ? value.map((item) => String(item)).filter(Boolean) : []; -} diff --git a/app-instance/frontend/app/(app)/tasks/page.tsx b/app-instance/frontend/app/(app)/tasks/page.tsx index f62b462..09142bf 100644 --- a/app-instance/frontend/app/(app)/tasks/page.tsx +++ b/app-instance/frontend/app/(app)/tasks/page.tsx @@ -142,7 +142,7 @@ function OrdinaryTasks() {
- + {taskStatusLabel(task.status, locale)} @@ -185,8 +185,7 @@ function taskStatusLabel(status: string, locale: 'zh-CN' | 'en-US') { const labels: Record = { open: ['已创建', 'Open'], running: ['执行中', 'Running'], - validating: ['验证中', 'Validating'], - awaiting_feedback: ['等待反馈', 'Awaiting feedback'], + awaiting_acceptance: ['等待验收', 'Awaiting acceptance'], needs_revision: ['需要修改', 'Needs revision'], closed: ['已完成', 'Closed'], abandoned: ['已放弃', 'Abandoned'], diff --git a/app-instance/frontend/components/chat-workbench/ChatWorkbench.tsx b/app-instance/frontend/components/chat-workbench/ChatWorkbench.tsx index 647f454..057b914 100644 --- a/app-instance/frontend/components/chat-workbench/ChatWorkbench.tsx +++ b/app-instance/frontend/components/chat-workbench/ChatWorkbench.tsx @@ -27,7 +27,7 @@ export function ChatWorkbench({ processArtifacts: ProcessArtifact[]; selectedRunId: string | null; onSelectRun: (runId: string) => void; - onFeedback: (runId: string, feedbackType: 'satisfied' | 'revise' | 'abandon', comment?: string) => void; + onFeedback: (runId: string, feedbackType: 'accept' | 'revise' | 'abandon', comment?: string) => void; onRequestRevision: (runId: string) => void; }) { return ( diff --git a/app-instance/frontend/components/chat-workbench/MessageList.tsx b/app-instance/frontend/components/chat-workbench/MessageList.tsx index 157f817..02a5739 100644 --- a/app-instance/frontend/components/chat-workbench/MessageList.tsx +++ b/app-instance/frontend/components/chat-workbench/MessageList.tsx @@ -6,7 +6,7 @@ import { Bot, CheckCircle2, ChevronRight, Loader2, Paperclip, RefreshCcw, Thumbs import type { ChatMessage, ProcessArtifact, ProcessEvent, ProcessRun } from '@/types'; import { getAccessToken, getFileUrl } from '@/lib/api'; -import { getTaskCardMessageIndexes } from '@/lib/chat-messages'; +import { getTaskCardMessageIndexes, hasVisibleChatContent, normalizedMessageText, shouldDisplayChatMessage } from '@/lib/chat-messages'; import { AgentTeamBlock } from '@/components/chat-workbench/AgentTeamBlock'; import { MarkdownContent } from '@/components/chat-workbench/MarkdownContent'; import { ScrollArea } from '@/components/ui/scroll-area'; @@ -49,19 +49,14 @@ function MessageBubble({ message: ChatMessage; showTaskCard: boolean; canSendFeedback: boolean; - onFeedback: (runId: string, feedbackType: 'satisfied' | 'revise' | 'abandon', comment?: string) => void; + onFeedback: (runId: string, feedbackType: 'accept' | 'revise' | 'abandon', comment?: string) => void; onRequestRevision: (runId: string) => void; }) { const { locale } = useAppI18n(); const isUser = message.role === 'user'; - const textContent = typeof message.content === 'string' ? message.content : String(message.content || ''); - const [feedbackMode, setFeedbackMode] = React.useState<'satisfied' | null>(null); + const textContent = normalizedMessageText(message.content); + const [feedbackMode, setFeedbackMode] = React.useState<'accept' | null>(null); const [feedbackComment, setFeedbackComment] = React.useState(''); - const validationFailed = message.validation_status === 'failed'; - const validationDetails = - validationFailed - ? pickAppText(locale, '详细原因会在任务验证区展示;展开任务可查看验证报告。', 'Detailed reasons are shown in the task validation area. Open the task to inspect the validation report.') - : ''; return (
@@ -142,22 +137,14 @@ function MessageBubble({
)} - {!isUser && validationFailed && ( -
- - {pickAppText(locale, '验证失败', 'Validation failed')} - -

{validationDetails}

-
- )} {!isUser && (canSendFeedback || message.feedback_state) && message.run_id && (
{message.feedback_state ? (
- {message.feedback_state === 'satisfied' - ? pickAppText(locale, '已标记满意', 'Marked satisfied') + {message.feedback_state === 'accept' || message.feedback_state === 'satisfied' + ? pickAppText(locale, '已接受', 'Accepted') : message.feedback_state === 'revise' ? pickAppText(locale, '已请求修改', 'Revision requested') : pickAppText(locale, '已放弃任务', 'Task abandoned')} @@ -168,11 +155,11 @@ function MessageBubble({
+ ); +} + +export function TaskAcceptanceCard({ + sessionId, + runId, + taskStatus, + feedbackItems, + actionBusy, + revision, + onRevisionChange, + onSubmit, +}: Props) { + const { locale } = useAppI18n(); + const [localComment, setLocalComment] = React.useState(''); + const comment = revision ?? localComment; + const setComment = onRevisionChange ?? setLocalComment; + const isFinalized = taskStatus === 'closed' || taskStatus === 'abandoned'; + const recordedFeedback = feedbackForRun(feedbackItems, runId) ?? (isFinalized ? latestFeedback(feedbackItems) : null); + const canSubmit = Boolean(runId) && !recordedFeedback && !isFinalized && !actionBusy; + + const submit = (feedbackType: TaskFeedbackType, nextComment?: string) => { + if (!runId || !canSubmit) return; + void onSubmit(feedbackType, nextComment); + }; + + return ( + + + {pickAppText(locale, '任务验收', 'Task acceptance')} + + + {recordedFeedback ? ( +
+
+ + {pickAppText(locale, '已提交验收', 'Acceptance submitted')}: {humanFeedback(String(recordedFeedback.acceptance_type || recordedFeedback.feedback_type || ''), locale)} +
+ {recordedFeedback.comment ?

{String(recordedFeedback.comment)}

: null} + {recordedFeedback.created_at ?

{formatTaskRuntimeTime(String(recordedFeedback.created_at), locale)}

: null} +
+ ) : isFinalized ? ( +
+ {pickAppText(locale, '任务已结束,不能再提交新的验收。', 'This task is finalized and cannot accept new acceptance.')} +
+ ) : !runId ? ( +
+ {pickAppText(locale, '暂无可验收的运行记录。', 'No run is available for acceptance yet.')} +
+ ) : null} + +
+ } label={pickAppText(locale, '接受', 'Accept')} actionBusy={actionBusy} disabled={!canSubmit} onClick={() => submit('accept', comment.trim() || undefined)} /> + } label={pickAppText(locale, '需要修改', 'Needs revision')} actionBusy={actionBusy} disabled={!canSubmit || !comment.trim()} onClick={() => submit('revise', comment.trim())} /> + } label={pickAppText(locale, '放弃', 'Abandon')} actionBusy={actionBusy} disabled={!canSubmit} onClick={() => submit('abandon', comment.trim() || undefined)} /> +
+ +