"""No-key web search and fetch tools.""" from __future__ import annotations from dataclasses import dataclass, field from html import unescape import json import re from typing import Any from urllib.parse import quote_plus, urlparse import httpx def _json_result(success: bool, **payload: Any) -> str: return json.dumps({"success": success, **payload}, ensure_ascii=False, indent=2) def _strip_html(value: str) -> str: text = re.sub(r"(?is)<(script|style).*?>.*?", " ", value) text = re.sub(r"(?s)<[^>]+>", " ", text) text = unescape(text) return re.sub(r"\s+", " ", text).strip() def _safe_url(url: str) -> str: parsed = urlparse(url) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError("url must be an http(s) URL") return url @dataclass(slots=True) class WebFetchTool: name: str = "web_fetch" description: str = "Fetch a public HTTP(S) page and return readable text. No API key required." toolset: str = "web" always_available: bool = False parameters: dict[str, Any] = field( default_factory=lambda: { "type": "object", "properties": { "url": {"type": "string", "description": "HTTP(S) URL to fetch."}, "max_chars": {"type": "integer", "default": 12000, "minimum": 1000, "maximum": 50000}, }, "required": ["url"], } ) async def execute(self, *, url: str, max_chars: int = 12000, **_: Any) -> str: try: safe_url = _safe_url(url) limit = max(1000, min(int(max_chars or 12000), 50000)) async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=False) as client: response = await client.get( safe_url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"}, ) response.raise_for_status() content_type = response.headers.get("content-type", "") raw = response.text text = _strip_html(raw) if "html" in content_type.lower() else raw truncated = len(text) > limit return _json_result( True, url=str(response.url), status_code=response.status_code, content_type=content_type, content=text[:limit], truncated=truncated, ) except Exception as exc: return _json_result(False, url=url, error=str(exc)) @dataclass(slots=True) class WebSearchTool: name: str = "web_search" description: str = "Search the web using DuckDuckGo HTML results. No API key required." toolset: str = "web" always_available: bool = False parameters: dict[str, Any] = field( default_factory=lambda: { "type": "object", "properties": { "query": {"type": "string", "description": "Search query."}, "limit": {"type": "integer", "default": 5, "minimum": 1, "maximum": 10}, }, "required": ["query"], } ) async def execute(self, *, query: str, limit: int = 5, **_: Any) -> str: try: if not str(query).strip(): raise ValueError("query is required") bounded = max(1, min(int(limit or 5), 10)) url = f"https://duckduckgo.com/html/?q={quote_plus(query)}" async with httpx.AsyncClient(timeout=20, follow_redirects=True, trust_env=False) as client: response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"}) response.raise_for_status() html = response.text results: list[dict[str, str]] = [] pattern = re.compile( r']+class="result__a"[^>]+href="(?P[^"]+)"[^>]*>(?P.*?)</a>', re.I | re.S, ) for match in pattern.finditer(html): title = _strip_html(match.group("title")) result_url = unescape(match.group("url")) if title and result_url: results.append({"title": title, "url": result_url, "snippet": ""}) if len(results) >= bounded: break return _json_result(True, query=query, results=results) except Exception as exc: return _json_result(False, query=query, error=str(exc))