# beaver_project/docs/superpowers/examples/steven_team_demo_harness.py

from __future__ import annotations

import asyncio
import json
from dataclasses import asdict
from pathlib import Path
from typing import Any

from beaver.engine import AgentLoop, EngineLoader
from beaver.engine.context import SkillContext
from beaver.engine.providers.base import LLMProvider, LLMResponse, ToolCallRequest
from beaver.engine.providers.factory import ProviderBundle, build_provider_runtime
from beaver.services.team_service import TeamService
from beaver.skills.catalog.loader import SkillsLoader
from beaver.skills.catalog.utils import strip_frontmatter
from beaver.skills.drafts import DraftService
from beaver.skills.specs import SkillSpecStore
from beaver.tasks.attempt_orchestrator import TaskAttemptOrchestrator
from beaver.tasks.models import TaskRecord
from beaver.tasks.planner import TaskExecutionPlanner
from beaver.tasks.skill_resolver import TaskSkillResolver


# Hard-coded workspace root handed to the skills loader, the skill spec store,
# and the engine loader built in `_run_case`.
WORKSPACE = Path("/root/.beaver/workspace")
# Name of the published skill that `_finance_skill_context` looks up and pins.
SKILL_NAME = "mgm-galaxy-financial-chart-report-safe"


def _text_from_messages(messages: list[dict[str, Any]]) -> str:
    return "\n".join(str(message.get("content") or "") for message in messages)


def _tool_names(tools: list[dict[str, Any]] | None) -> list[str]:
    names: list[str] = []
    for tool in tools or []:
        if "function" in tool:
            names.append(str(tool["function"].get("name") or ""))
        else:
            names.append(str(tool.get("name") or ""))
    return [name for name in names if name]


class DemoProvider(LLMProvider):
    """Scripted provider that answers with canned responses instead of a model.

    The response is selected by looking for fixed marker substrings in the
    concatenated message contents. Every ``chat`` call is also logged to
    ``self.calls`` so the harness can report what the provider was asked.

    Args:
        collect_uses_tool: When true, the ``team:collect_official_sources``
            branch first requests a ``web_fetch`` tool call (if that tool is
            offered and no tool message is present yet) before returning its
            text answer. When false, it returns the text answer right away.
    """

    def __init__(self, *, collect_uses_tool: bool) -> None:
        super().__init__()
        self.collect_uses_tool = collect_uses_tool
        # One entry per chat() invocation, appended in call order.
        self.calls: list[dict[str, Any]] = []

    async def chat(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        model: str | None = None,
        max_tokens: int | None = None,
        temperature: float = 0.0,
        thinking_enabled: bool | None = None,
    ) -> LLMResponse:
        """Log the call and return the canned response matching the prompt.

        Only ``messages`` and ``tools`` influence the result; the remaining
        parameters are accepted but not read.
        """
        prompt_text = _text_from_messages(messages)
        offered_tools = _tool_names(tools)
        saw_tool_result = any(entry.get("role") == "tool" for entry in messages)

        call_record: dict[str, Any] = {
            "tool_names": offered_tools,
            "has_tool_result": saw_tool_result,
            "text_preview": prompt_text[:300],
        }
        self.calls.append(call_record)

        def reply(content: str) -> LLMResponse:
            # Plain text answer stamped with the demo provider/model labels.
            return LLMResponse(content=content, provider_name="demo", model="demo-model")

        # Planner and skill-selection prompts are recognised first.
        if "You choose whether an internal Beaver Task attempt" in prompt_text:
            return reply(json.dumps(_planner_json(), ensure_ascii=False))

        if "You select Beaver skills for a single run" in prompt_text:
            return reply("[]")

        # Node markers are tested from the last pipeline stage back to the
        # first. NOTE(review): presumably downstream prompts can also mention
        # upstream node ids, so this order should be kept — confirm.
        if "team:generate_chart_report" in prompt_text:
            return reply(
                "# MGM China vs Galaxy Entertainment Demo Report\n\n"
                "| Company | Metric | Value | Source |\n"
                "|---|---:|---:|---|\n"
                "| MGM China | Revenue | demo value | upstream source |\n"
                "| Galaxy Entertainment | Revenue | demo value | upstream source |\n\n"
                "Chart-ready data is provided as Markdown. No image or saved chart file was generated."
            )

        if "team:validate_metrics" in prompt_text:
            return reply(
                "Validation complete: periods and units are labeled; no generated chart artifact is claimed."
            )

        if "team:extract_financial_metrics" in prompt_text:
            return reply(
                "Extracted demo metric table: MGM China revenue: source-backed placeholder; "
                "Galaxy Entertainment revenue: source-backed placeholder. Currency, period, and source URL fields are labeled."
            )

        if "team:collect_official_sources" in prompt_text:
            wants_fetch = (
                self.collect_uses_tool
                and "web_fetch" in offered_tools
                and not saw_tool_result
            )
            if wants_fetch:
                fetch_call = ToolCallRequest(
                    id="call_collect_fetch",
                    name="web_fetch",
                    arguments={
                        "url": "https://www.bing.com/search?q=MGM+China+Galaxy+Entertainment+annual+report",
                        "max_chars": 1000,
                    },
                )
                return LLMResponse(
                    content=None,
                    tool_calls=[fetch_call],
                    finish_reason="tool_calls",
                    provider_name="demo",
                    model="demo-model",
                )
            return reply(
                "Collected official-source candidates for MGM China Holdings and Galaxy Entertainment. "
                "Demo evidence includes a successful web_fetch tool result with URL captured by Beaver."
            )

        # Anything unrecognised gets the generic final answer.
        return reply("Demo final synthesis.")

    def get_default_model(self) -> str:
        """Return the fixed model label used by this demo provider."""
        return "demo-model"


def _planner_json() -> dict[str, Any]:
    return {
        "mode": "team",
        "reason": "finance comparison benefits from staged source collection, extraction, validation, and reporting",
        "strategy": "dag",
        "nodes": [
            {
                "node_id": "collect_official_sources",
                "task": "Collect official MGM China Holdings and Galaxy Entertainment financial disclosure sources for the requested period. Prefer annual reports, interim reports, results announcements, investor relations pages, and exchange filings. Return source URLs with short notes about period coverage.",
                "use_skill": "web-operation",
                "skill_query": "official financial disclosure web research",
                "depends_on": [],
                "requested_tools": ["web_search", "web_fetch"],
                "required_evidence": ["tool_result", "url"],
                "evidence_contract": {"version": 1, "entities": ["MGM China Holdings", "Galaxy Entertainment Group"]},
                "required_for_completion": True,
                "block_downstream_on_partial": True,
                "max_tool_iterations": 2,
            },
            {
                "node_id": "extract_financial_metrics",
                "task": "Extract comparable financial metrics for MGM China Holdings and Galaxy Entertainment from the collected official sources. Include revenue or net revenue, adjusted EBITDA where available, net profit/loss where available, period, currency, unit, and source URL for each metric.",
                "use_skill": "web-operation",
                "skill_query": "financial metric extraction from official disclosures",
                "depends_on": ["collect_official_sources"],
                "requested_tools": ["web_fetch"],
                "required_evidence": ["output"],
                "evidence_contract": {"version": 1, "metrics": ["revenue", "adjusted_ebitda", "net_profit_or_loss"]},
                "required_for_completion": True,
                "block_downstream_on_partial": True,
                "max_tool_iterations": 1,
            },
            {
                "node_id": "validate_metrics",
                "task": "Validate extracted metrics for source consistency, period alignment, currency/unit consistency, and obvious transcription errors. Produce a concise validation note and list any evidence gaps.",
                "use_skill": "utility-tools",
                "skill_query": "finance metric validation",
                "depends_on": ["extract_financial_metrics"],
                "requested_tools": [],
                "required_evidence": ["output"],
                "evidence_contract": {"version": 1, "checks": ["source_consistency", "period_alignment"]},
                "required_for_completion": True,
                "block_downstream_on_partial": True,
                "max_tool_iterations": 0,
            },
            {
                "node_id": "generate_chart_report",
                "task": "Generate the final Markdown comparison report. Include an executive summary, source-backed comparison table, chart-ready data table, optional Mermaid or text bar chart section, and caveats. Do not claim that a chart image, chart file, or saved artifact was generated.",
                "use_skill": "utility-tools",
                "skill_query": "financial markdown report with chart-ready data",
                "depends_on": ["validate_metrics"],
                "requested_tools": [],
                "required_evidence": ["output"],
                "evidence_contract": {"version": 1, "outputs": ["comparison_table", "chart_ready_data", "markdown_report"]},
                "required_for_completion": True,
                "block_downstream_on_partial": False,
                "max_tool_iterations": 0,
            },
        ],
        "adaptation": {"template_used": True},
        "final_synthesis_instruction": "Synthesize node outputs into a concise Markdown finance report.",
    }


def _task() -> TaskRecord:
    """Create the fixed demo task record used by both harness cases.

    The description and goal carry the same sentence, and the creator and
    both timestamp fields are set to the literal placeholder ``"demo"``.
    """
    statement = "Compare MGM China and Galaxy Entertainment using official public financial disclosures."
    placeholder = "demo"
    record_fields: dict[str, Any] = {
        "task_id": "demo-task-mgm-galaxy",
        "session_id": "web:demo-mgm-galaxy-harness",
        "description": statement,
        "goal": statement,
        "constraints": [],
        "priority": 0,
        "status": "open",
        "creator": placeholder,
        "created_at": placeholder,
        "updated_at": placeholder,
    }
    return TaskRecord(**record_fields)


def _finance_skill_context(loader: SkillsLoader) -> SkillContext:
    """Build the pinned skill context for the published skill ``SKILL_NAME``.

    Args:
        loader: Skills loader used to fetch both the skill record and the
            published skill text.

    Returns:
        A ``SkillContext`` whose content is the published text with its
        frontmatter stripped and surrounding whitespace removed.

    Raises:
        RuntimeError: If either the record or the published text is ``None``.
    """
    skill_record = loader.get_skill_record(SKILL_NAME)
    published_text = loader.load_published_skill(SKILL_NAME)

    unavailable = skill_record is None or published_text is None
    if unavailable:
        raise RuntimeError(f"missing published skill: {SKILL_NAME}")

    body = strip_frontmatter(published_text).strip()
    # A missing/empty hash on the record is normalised to an empty string.
    digest = skill_record.content_hash or ""
    hints = list(skill_record.tool_hints)
    template_warnings = list(skill_record.team_template_warnings)

    return SkillContext(
        name=skill_record.name,
        version=skill_record.version,
        content=body,
        content_hash=digest,
        activation_reason="demo_exact_skill",
        tool_hints=hints,
        team_template=skill_record.team_template,
        team_template_warnings=template_warnings,
    )


async def _run_case(*, collect_uses_tool: bool) -> dict[str, Any]:
    """Run one planning + team-execution scenario and summarise the outcome.

    Args:
        collect_uses_tool: Forwarded to ``DemoProvider``; also decides the
            ``"case"`` label in the result (``"complete"`` when true,
            ``"incomplete"`` otherwise).

    Returns:
        A dictionary describing the plan, per-node bindings and tool scopes,
        team results (when a team run happened), synthesis metadata, and the
        provider calls recorded during the run.
    """
    # --- wiring: loaders, scripted provider, engine loop, planner ---------
    skills_loader = SkillsLoader(WORKSPACE)
    spec_store = SkillSpecStore(WORKSPACE)
    runtime = build_provider_runtime(
        model="demo-model",
        provider_name="custom",
        api_key="demo",
        api_base="http://demo.invalid/v1",
    )
    provider = DemoProvider(collect_uses_tool=collect_uses_tool)
    bundle = ProviderBundle(main_runtime=runtime, main_provider=provider)
    engine_loader = EngineLoader(workspace=WORKSPACE)
    loop = AgentLoop(loader=engine_loader)
    loaded = loop.boot()
    drafts = DraftService(spec_store)
    resolver = TaskSkillResolver(skills_loader=skills_loader, draft_service=drafts)
    planner = TaskExecutionPlanner(task_skill_resolver=resolver, tool_registry=loaded.tool_registry)

    # --- planning ----------------------------------------------------------
    task = _task()
    skill_context = _finance_skill_context(skills_loader)
    plan = await planner.plan(
        task=task,
        user_message=task.description,
        attempt_index=1,
        provider_bundle=bundle,
        activated_skills=[skill_context],
        timeout_seconds=5.0,
    )

    # --- team execution (only when the planner chose team mode) ------------
    def _same_bundle(node: Any) -> ProviderBundle:
        # Every node shares the single scripted provider bundle.
        return bundle

    team_result = None
    if plan.is_team:
        team_service = TeamService(loop)
        team_result = await team_service.run_team(
            plan.graph,
            parent_task_id=None,
            parent_session_id=task.session_id,
            provider_bundle_factory=_same_bundle,
            inherited_pinned_skill_contexts=[skill_context],
        )

    context, prefix, metadata = TaskAttemptOrchestrator._team_synthesis_outcome(
        plan, team_result, prompt_locale="en"
    )

    # --- report assembly ---------------------------------------------------
    graph_nodes = plan.graph.nodes if plan.graph else []

    node_ids = [entry.node_id for entry in graph_nodes]
    tool_scopes = {entry.node_id: entry.allowed_tool_names for entry in graph_nodes}
    skill_bindings = []
    for entry in graph_nodes:
        agent_meta = entry.agent.metadata
        skill_bindings.append(
            {
                "node_id": entry.node_id,
                "pinned_skill_names": entry.inherited_pinned_skills,
                "pinned_skill_contexts": [pinned.name for pinned in entry.inherited_pinned_skill_contexts],
                "role": entry.agent.role,
                "sub_agent_kind": agent_meta.get("sub_agent_kind"),
                "exact_binding_used": agent_meta.get("exact_binding_used"),
            }
        )

    if team_result:
        team_success = team_result.success
        team_summary = team_result.summary
        team_run_ids = team_result.run_ids
        raw_node_results = team_result.node_results
    else:
        team_success = None
        team_summary = None
        team_run_ids = []
        raw_node_results = []

    node_reports = []
    for outcome in raw_node_results:
        node_reports.append(
            {
                "node_id": outcome.node_id,
                "success": outcome.success,
                "completion_status": outcome.completion_status,
                "finish_reason": outcome.finish_reason,
                "evidence_gaps": outcome.evidence_gaps,
                "output_preview": outcome.output_text[:180],
            }
        )

    case_label = "complete" if collect_uses_tool else "incomplete"
    return {
        "case": case_label,
        "plan_mode": plan.mode,
        "plan_reason": plan.reason,
        "planner_adaptation": plan.planner_adaptation,
        "node_ids": node_ids,
        "node_tool_scopes": tool_scopes,
        "node_skill_bindings": skill_bindings,
        "team_success": team_success,
        "team_summary": team_summary,
        "team_run_ids": team_run_ids,
        "node_results": node_reports,
        "synthesis_metadata": metadata,
        "incomplete_prefix_present": bool(prefix),
        "outcome_context_preview": context[:600],
        "provider_calls": provider.calls,
    }


async def main() -> None:
    """Run both demo cases in sequence and print their results as JSON.

    The tool-using case runs first, then the case without the tool call.
    Values that are not JSON-serialisable are rendered with ``str``.
    """
    results: list[dict[str, Any]] = []
    for uses_tool in (True, False):
        outcome = await _run_case(collect_uses_tool=uses_tool)
        results.append(outcome)

    rendered = json.dumps(results, ensure_ascii=False, indent=2, default=str)
    print(rendered)


# Script entry point: run the async demo driver only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())