"""No-key web search and fetch tools.""" from __future__ import annotations import asyncio from dataclasses import dataclass, field from html import unescape from html.parser import HTMLParser import json import re from typing import Any from urllib.parse import quote_plus, urljoin, urlparse import httpx def _json_result(success: bool, **payload: Any) -> str: return json.dumps({"success": success, **payload}, ensure_ascii=False, indent=2) def _strip_html(value: str) -> str: text = re.sub(r"(?is)<(script|style).*?>.*?\1>", " ", value) text = re.sub(r"(?s)<[^>]+>", " ", text) text = unescape(text) return re.sub(r"\s+", " ", text).strip() def _compact_text(value: str) -> str: return re.sub(r"\s+", " ", unescape(value)).strip() def _safe_url(url: str) -> str: parsed = urlparse(url) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError("url must be an http(s) URL") return url class _HtmlMetadataParser(HTMLParser): def __init__(self, base_url: str) -> None: super().__init__(convert_charrefs=True) self.base_url = base_url self.title = "" self.links: list[dict[str, str]] = [] self._in_title = False self._current_href: str | None = None self._current_text: list[str] = [] self._skip_depth = 0 self._seen_urls: set[str] = set() def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: lowered = tag.lower() if lowered in {"script", "style"}: self._skip_depth += 1 return if self._skip_depth: return if lowered == "title": self._in_title = True return if lowered == "a": href = dict(attrs).get("href") if href: self._current_href = urljoin(self.base_url, href) self._current_text = [] def handle_endtag(self, tag: str) -> None: lowered = tag.lower() if lowered in {"script", "style"} and self._skip_depth: self._skip_depth -= 1 return if self._skip_depth: return if lowered == "title": self._in_title = False self.title = _compact_text(self.title) return if lowered == "a" and self._current_href: parsed = urlparse(self._current_href) if parsed.scheme in {"http", "https"} and self._current_href not in self._seen_urls: text = _compact_text(" ".join(self._current_text)) self.links.append({"text": text, "url": self._current_href}) self._seen_urls.add(self._current_href) self._current_href = None self._current_text = [] def handle_data(self, data: str) -> None: if self._skip_depth: return if self._in_title: self.title += data if self._current_href: self._current_text.append(data) def _extract_html_metadata(html: str, base_url: str, *, max_links: int = 80) -> dict[str, Any]: parser = _HtmlMetadataParser(base_url) parser.feed(html) links = parser.links[:max_links] pdf_links = [ link for link in links if urlparse(link["url"]).path.lower().endswith(".pdf") ][:30] return { "title": parser.title, "links": links, "pdf_links": pdf_links, } @dataclass(slots=True) class WebFetchTool: name: str = "web_fetch" description: str = "Fetch a public HTTP(S) page and return readable text. No API key required." toolset: str = "web" always_available: bool = False parameters: dict[str, Any] = field( default_factory=lambda: { "type": "object", "properties": { "url": {"type": "string", "description": "HTTP(S) URL to fetch."}, "max_chars": {"type": "integer", "default": 12000, "minimum": 1000, "maximum": 50000}, }, "required": ["url"], } ) async def execute(self, *, url: str, max_chars: int = 12000, **_: Any) -> str: try: safe_url = _safe_url(url) limit = max(1000, min(int(max_chars or 12000), 50000)) timeout = httpx.Timeout(connect=5, read=12, write=5, pool=5) async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client: response = await client.get( safe_url, headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"}, ) response.raise_for_status() content_type = response.headers.get("content-type", "") raw = response.text is_html = "html" in content_type.lower() text = _strip_html(raw) if is_html else raw metadata = _extract_html_metadata(raw, str(response.url)) if is_html else { "title": "", "links": [], "pdf_links": [], } truncated = len(text) > limit return _json_result( True, url=str(response.url), status_code=response.status_code, content_type=content_type, **metadata, content=text[:limit], truncated=truncated, ) except Exception as exc: return _json_result(False, url=url, error=str(exc)) @dataclass(slots=True) class WebSearchTool: name: str = "web_search" description: str = "Search the public web using HTML results. No API key required." toolset: str = "web" always_available: bool = False parameters: dict[str, Any] = field( default_factory=lambda: { "type": "object", "properties": { "query": {"type": "string", "description": "Search query."}, "limit": {"type": "integer", "default": 5, "minimum": 1, "maximum": 10}, }, "required": ["query"], } ) async def execute(self, *, query: str, limit: int = 5, **_: Any) -> str: try: if not str(query).strip(): raise ValueError("query is required") bounded = max(1, min(int(limit or 5), 10)) errors: list[str] = [] try: ddgs_results = await asyncio.to_thread(_search_ddgs, query, bounded) except Exception as exc: ddgs_results = [] errors.append(str(exc)) if ddgs_results: return _json_result(True, **_search_result_payload(query, "ddgs", ddgs_results)) headers = {"User-Agent": "Mozilla/5.0 Beaver/1.0"} timeout = httpx.Timeout(connect=5, read=8, write=5, pool=5) async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client: tasks = [ asyncio.create_task( _search_bing( client, query=query, limit=bounded, headers=headers, ) ), asyncio.create_task( _search_duckduckgo( client, query=query, limit=bounded, headers=headers, ) ), ] try: for completed in asyncio.as_completed(tasks): try: engine, results = await completed except Exception as exc: errors.append(str(exc)) continue if results: return _json_result(True, **_search_result_payload(query, engine, results)) detail = "; ".join(error for error in errors if error) or "no search results" return _json_result(False, query=query, error=detail) finally: for task in tasks: if not task.done(): task.cancel() await asyncio.gather(*tasks, return_exceptions=True) except Exception as exc: return _json_result(False, query=query, error=str(exc)) async def _search_bing( client: httpx.AsyncClient, *, query: str, limit: int, headers: dict[str, str], ) -> tuple[str, list[dict[str, str]]]: response = await client.get(f"https://www.bing.com/search?q={quote_plus(query)}", headers=headers) response.raise_for_status() return "bing", _parse_bing_results(response.text, limit) async def _search_duckduckgo( client: httpx.AsyncClient, *, query: str, limit: int, headers: dict[str, str], ) -> tuple[str, list[dict[str, str]]]: response = await client.get(f"https://duckduckgo.com/html/?q={quote_plus(query)}", headers=headers) response.raise_for_status() return "duckduckgo", _parse_duckduckgo_results(response.text, limit) def _parse_bing_results(html: str, limit: int) -> list[dict[str, str]]: results: list[dict[str, str]] = [] pattern = re.compile( r'
]*>(?P