# beaver_project/app-instance/backend/beaver/tools/builtins/web.py

"""No-key web search and fetch tools."""

from __future__ import annotations

from dataclasses import dataclass, field
from html import unescape
import json
import re
from typing import Any
from urllib.parse import parse_qs, quote_plus, urlparse

import httpx


def _json_result(success: bool, **payload: Any) -> str:
    return json.dumps({"success": success, **payload}, ensure_ascii=False, indent=2)


def _strip_html(value: str) -> str:
    text = re.sub(r"(?is)<(script|style).*?>.*?</\1>", " ", value)
    text = re.sub(r"(?s)<[^>]+>", " ", text)
    text = unescape(text)
    return re.sub(r"\s+", " ", text).strip()


def _safe_url(url: str) -> str:
    parsed = urlparse(url)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        raise ValueError("url must be an http(s) URL")
    return url


@dataclass(slots=True)
class WebFetchTool:
    name: str = "web_fetch"
    description: str = "Fetch a public HTTP(S) page and return readable text. No API key required."
    toolset: str = "web"
    always_available: bool = False
    parameters: dict[str, Any] = field(
        default_factory=lambda: {
            "type": "object",
            "properties": {
                "url": {"type": "string", "description": "HTTP(S) URL to fetch."},
                "max_chars": {"type": "integer", "default": 12000, "minimum": 1000, "maximum": 50000},
            },
            "required": ["url"],
        }
    )

    async def execute(self, *, url: str, max_chars: int = 12000, **_: Any) -> str:
        try:
            safe_url = _safe_url(url)
            limit = max(1000, min(int(max_chars or 12000), 50000))
            async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=True) as client:
                response = await client.get(
                    safe_url,
                    headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"},
                )
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")
            raw = response.text
            text = _strip_html(raw) if "html" in content_type.lower() else raw
            truncated = len(text) > limit
            return _json_result(
                True,
                url=str(response.url),
                status_code=response.status_code,
                content_type=content_type,
                content=text[:limit],
                truncated=truncated,
            )
        except Exception as exc:
            return _json_result(False, url=url, error=str(exc))


@dataclass(slots=True)
class WebSearchTool:
    name: str = "web_search"
    description: str = "Search the web using DuckDuckGo HTML results. No API key required."
    toolset: str = "web"
    always_available: bool = False
    parameters: dict[str, Any] = field(
        default_factory=lambda: {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query."},
                "limit": {"type": "integer", "default": 5, "minimum": 1, "maximum": 10},
            },
            "required": ["query"],
        }
    )

    async def execute(self, *, query: str, limit: int = 5, **_: Any) -> str:
        try:
            if not str(query).strip():
                raise ValueError("query is required")
            bounded = max(1, min(int(limit or 5), 10))
            url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
            async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=True) as client:
                response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"})
            response.raise_for_status()
            html = response.text
            results: list[dict[str, str]] = []
            pattern = re.compile(
                r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
                re.I | re.S,
            )
            for match in pattern.finditer(html):
                title = _strip_html(match.group("title"))
                result_url = unescape(match.group("url"))
                if title and result_url:
                    results.append({"title": title, "url": result_url, "snippet": ""})
                if len(results) >= bounded:
                    break
            return _json_result(True, query=query, results=results)
        except Exception as exc:
            return _json_result(False, query=query, error=str(exc))