From b808f5cbc26bc1cdfe37ed94f99725fcb8f31d63 Mon Sep 17 00:00:00 2001 From: steven_li Date: Fri, 22 May 2026 11:35:46 +0800 Subject: [PATCH] feat(task): route validation status to review states --- .../backend/beaver/services/agent_service.py | 38 ++++++++++++++++-- app-instance/backend/beaver/tasks/service.py | 26 ++++++++++-- .../backend/beaver/tasks/validation.py | 18 ++++++--- .../tests/unit/test_task_mode_feedback.py | 40 +++++++++++++++++++ 4 files changed, 110 insertions(+), 12 deletions(-) diff --git a/app-instance/backend/beaver/services/agent_service.py b/app-instance/backend/beaver/services/agent_service.py index ea6ea79..e489995 100644 --- a/app-instance/backend/beaver/services/agent_service.py +++ b/app-instance/backend/beaver/services/agent_service.py @@ -854,7 +854,19 @@ class AgentService: provider_bundle=provider_bundle, ) latest_validation = validation - task = task_service.record_validation(task.task_id, result.run_id, validation) + has_usable_answer = bool(result.output_text.strip()) and ( + "Tool loop stopped after reaching the configured iteration limit." not in result.output_text + ) + task = task_service.record_validation( + task.task_id, + result.run_id, + validation, + final_attempt=( + attempt_index == 2 + or validation.status in {"accepted", "insufficient_evidence", "validator_error"} + ), + has_usable_answer=has_usable_answer, + ) run_memory_store.update_run_record(result.run_id, validation_result=validation.to_dict()) session_manager.update_latest_assistant_event_payload( result.session_id, @@ -865,6 +877,23 @@ class AgentService: "validation_status": "passed" if validation.accepted else "failed", }, ) + validation_debug = { + "evidence_run_ids": [ + item.run_id for item in [evidence_packet.main_run, *evidence_packet.team_runs] if item is not None + ], + "evidence_session_ids": [ + item.session_id + for item in [evidence_packet.main_run, *evidence_packet.team_runs] + if item is not None + ], + "tool_result_count": sum( + len(item.tool_results) + for item in [evidence_packet.main_run, *evidence_packet.team_runs] + if item is not None + ), + "evidence_length": len(evidence_text), + } + retry_scheduled = validation.status == "rejected" and attempt_index == 1 session_manager.append_message( result.session_id, run_id=result.run_id, @@ -874,17 +903,18 @@ class AgentService: "task_id": task.task_id, "attempt_index": attempt_index, "validation_result": validation.to_dict(), - "retry_scheduled": not validation.accepted and attempt_index == 1, + "validation_debug": validation_debug, + "retry_scheduled": retry_scheduled, }, content=validation.recommended_revision_prompt or None, context_visible=False, ) - if not validation.accepted and attempt_index == 1: + if retry_scheduled: session_manager.set_run_context_visible(result.session_id, result.run_id, False) result.task_id = task.task_id result.task_status = task.status result.validation_result = validation.to_dict() - if validation.accepted or attempt_index == 2: + if not retry_scheduled: return result if last_result is None: # pragma: no cover - defensive diff --git a/app-instance/backend/beaver/tasks/service.py b/app-instance/backend/beaver/tasks/service.py index 6cabf6f..92701b1 100644 --- a/app-instance/backend/beaver/tasks/service.py +++ b/app-instance/backend/beaver/tasks/service.py @@ -110,10 +110,30 @@ class TaskService: self._event(task, "run_completed", run_id=run_id, payload={"skill_names": skill_names or []}) return task - def record_validation(self, task_id: str, run_id: str, validation: ValidationResult) -> TaskRecord: + def record_validation( + self, + task_id: str, + run_id: str, + validation: ValidationResult, + *, + final_attempt: bool = True, + has_usable_answer: bool = True, + ) -> TaskRecord: task = self._require(task_id) - task.status = "awaiting_feedback" - task.updated_at = self._now() + now = self._now() + if validation.status == "accepted": + task.status = "awaiting_feedback" + elif validation.status in {"insufficient_evidence", "validator_error"}: + task.status = "needs_review" + elif validation.status == "rejected" and not final_attempt: + task.status = "needs_revision" + elif validation.status == "rejected" and has_usable_answer: + task.status = "needs_review" + else: + task.status = "failed" + task.closed_at = now + task.close_reason = "automatic validation rejected the final attempt" + task.updated_at = now task.validation_result = validation.to_dict() self.store.upsert_task(task) self._event(task, "validated", run_id=run_id, payload=validation.to_dict()) diff --git a/app-instance/backend/beaver/tasks/validation.py b/app-instance/backend/beaver/tasks/validation.py index 0744145..28a3eaa 100644 --- a/app-instance/backend/beaver/tasks/validation.py +++ b/app-instance/backend/beaver/tasks/validation.py @@ -45,13 +45,13 @@ class ValidationService: ) except Exception as exc: return ValidationResult( - passed=False, + status="validator_error", score=0.0, issues=[f"Validator failed: {exc}"], - missing_requirements=["A valid automatic validation result is required before accepting the task."], + evidence_gaps=["Automatic validation failed before producing a reliable decision."], + missing_requirements=["User review is required because automatic validation failed."], recommended_revision_prompt=( - "Review the task result again because automatic validation failed, " - "then provide a corrected final answer that explicitly satisfies the task goal." + "Review the answer and evidence, then decide whether to revise or accept it." ), validator="llm_error", ) @@ -96,11 +96,19 @@ class ValidationService: temperature=0.0, ) payload = self._parse_json_object(response.content or "") + status = payload.get("status") + if status not in {"accepted", "rejected", "insufficient_evidence", "validator_error"}: + status = ( + "accepted" + if payload.get("passed") and float(payload.get("score", 0.0) or 0.0) >= 0.75 + else "rejected" + ) return ValidationResult( - passed=bool(payload.get("passed")), + status=status, score=max(0.0, min(1.0, float(payload.get("score", 0.0) or 0.0))), issues=[str(item) for item in payload.get("issues") or []], missing_requirements=[str(item) for item in payload.get("missing_requirements") or []], + evidence_gaps=[str(item) for item in payload.get("evidence_gaps") or []], recommended_revision_prompt=str(payload.get("recommended_revision_prompt") or ""), validator="llm", ) diff --git a/app-instance/backend/tests/unit/test_task_mode_feedback.py b/app-instance/backend/tests/unit/test_task_mode_feedback.py index a0cc7ea..39273de 100644 --- a/app-instance/backend/tests/unit/test_task_mode_feedback.py +++ b/app-instance/backend/tests/unit/test_task_mode_feedback.py @@ -779,6 +779,45 @@ def test_task_mode_team_failure_still_uses_main_synthesis(tmp_path: Path) -> Non assert "user-visible fallback answer" in main_provider.calls[0]["messages"][0]["content"] +def test_insufficient_evidence_moves_task_to_needs_review(tmp_path: Path) -> None: + service = AgentService( + loader=EngineLoader( + workspace=tmp_path, + task_execution_planner=_single_planner(), + validation_service=StubValidationService( + [ + ValidationResult( + status="insufficient_evidence", + score=0.4, + evidence_gaps=["source missing"], + validator="test", + ) + ] + ), + ) + ) + + result = asyncio.run( + service.process_direct( + "answer with uncertain evidence", + session_id="web:needs-review", + provider_bundle=_bundle("possible answer"), + ) + ) + loaded = service.create_loop().boot() + task = loaded.task_service.get_task(result.task_id) + events = loaded.session_manager.get_run_event_records(result.session_id, result.run_id) + validation_event = next(event for event in events if event.event_type == "task_validation_snapshotted") + + assert task is not None + assert task.status == "needs_review" + assert task.requires_user_action is True + assert task.is_execution_active is False + assert validation_event.event_payload["validation_result"]["status"] == "insufficient_evidence" + assert validation_event.event_payload["retry_scheduled"] is False + assert validation_event.event_payload["validation_debug"]["tool_result_count"] >= 0 + + def test_task_mode_team_retry_hides_first_synthesis_run(tmp_path: Path) -> None: main_provider = StubProvider( [ @@ -890,5 +929,6 @@ def test_llm_validator_parse_failure_is_not_accepted(tmp_path: Path) -> None: ) assert validation.accepted is False + assert validation.status == "validator_error" assert validation.validator == "llm_error" assert validation.issues