Files
beaver_project/app-instance/backend/beaver/tools/builtins/web.py
steven_li 4b0bf65ace ```
feat(engine): 优化智能体循环中的助手消息处理逻辑

- 在没有工具调用时才添加助手消息到上下文
- 确保工具调用响应正确添加到消息上下文中
- 修复了消息构建的条件逻辑

fix(cron): 改进定时任务调度的时间解析功能

- 添加正则表达式导入用于时间显示解析
- 实现从显示文本中提取毫秒间隔的功能
- 增强整数转换的安全性,避免类型错误
- 优化定时任务配置的解析逻辑

feat(outlook): 增强Outlook集成的功能和稳定性

- 将默认超时时间从10秒增加到180秒
- 为状态检查函数添加可选的验证参数
- 串行执行邮件概览获取操作而非并行
- 改进连接状态验证逻辑

feat(channel): 添加设备名称作为会话标识的选项

- 为终端WebSocket适配器添加新的配置选项
- 实现基于设备名称生成会话对等ID的功能
- 记录原始对等ID和设备名称的元数据
- 支持从设备名称创建会话对等ID

feat(skills): 完善技能学习评估系统和进度跟踪

- 在应用启动时自动调度待评估的技能草稿
- 为技能评估工作创建独立的循环工厂
- 实现异步技能评估任务的取消和清理机制
- 添加技能评估进度报告和状态跟踪功能
- 扩展会话列表API以包含更多详细信息
- 防止对不存在的会话进行操作
- 优化技能草稿提交和评估的业务逻辑

perf(skills): 提升技能评估的并发性能

- 实现并行技能案例评估以提高效率
- 添加最大并行案例数的环境变量控制
- 实现实时评估进度更新和回调机制
- 优化评估过程中的资源管理和同步

refactor(services): 创建隔离的智能体循环实例

- 添加创建独立智能体循环的工厂方法
- 确保新循环继承运行时服务配置
- 支持技能评估等需要隔离环境的场景
```
2026-06-15 14:48:16 +08:00

199 lines
7.2 KiB
Python

"""No-key web search and fetch tools."""
from __future__ import annotations
import asyncio
from dataclasses import dataclass, field
from html import unescape
import json
import re
from typing import Any
from urllib.parse import quote_plus, urlparse
import httpx
def _json_result(success: bool, **payload: Any) -> str:
    """Serialize a tool outcome as a pretty-printed JSON object.

    The ``success`` flag is always the first key; every keyword passed in
    ``payload`` follows in the order it was given. Non-ASCII characters are
    written verbatim rather than as ``\\uXXXX`` escapes.
    """
    envelope: dict[str, Any] = {"success": success}
    envelope.update(payload)
    return json.dumps(envelope, indent=2, ensure_ascii=False)
def _strip_html(value: str) -> str:
    """Reduce an HTML fragment to plain, whitespace-normalised text.

    Processing order:

    1. Drop HTML comments and whole ``<script>``/``<style>`` elements
       (including their contents) in a single pass, so that whichever
       construct starts first in the document wins.
    2. Replace every remaining tag with a space.
    3. Decode character references (``&amp;`` -> ``&``).
    4. Collapse runs of whitespace and trim the ends.

    Args:
        value: Raw HTML (or any text; text without markup passes through
            with only entity decoding and whitespace normalisation).

    Returns:
        The readable text content.
    """
    # Comments must be removed as a unit: the generic tag regex below stops at
    # the first '>' and would otherwise leak the rest of a comment such as
    # "<!-- a > b -->". The word boundary keeps tags that merely start with
    # "script"/"style" (e.g. <scripture>) from being treated as script blocks,
    # and "\s*" accepts closing tags written as "</style >".
    text = re.sub(
        r"(?is)<!--.*?-->|<(script|style)\b[^>]*>.*?</\1\s*>",
        " ",
        value,
    )
    text = re.sub(r"(?s)<[^>]+>", " ", text)
    text = unescape(text)
    return re.sub(r"\s+", " ", text).strip()
def _safe_url(url: str) -> str:
    """Return ``url`` unchanged if it is an absolute http or https URL.

    Raises:
        ValueError: If the scheme is not ``http``/``https`` or the URL has no
            network location (host) component.
    """
    parts = urlparse(url)
    has_web_scheme = parts.scheme in ("http", "https")
    if has_web_scheme and parts.netloc:
        return url
    raise ValueError("url must be an http(s) URL")
@dataclass(slots=True)
class WebFetchTool:
name: str = "web_fetch"
description: str = "Fetch a public HTTP(S) page and return readable text. No API key required."
toolset: str = "web"
always_available: bool = False
parameters: dict[str, Any] = field(
default_factory=lambda: {
"type": "object",
"properties": {
"url": {"type": "string", "description": "HTTP(S) URL to fetch."},
"max_chars": {"type": "integer", "default": 12000, "minimum": 1000, "maximum": 50000},
},
"required": ["url"],
}
)
async def execute(self, *, url: str, max_chars: int = 12000, **_: Any) -> str:
try:
safe_url = _safe_url(url)
limit = max(1000, min(int(max_chars or 12000), 50000))
timeout = httpx.Timeout(connect=5, read=12, write=5, pool=5)
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client:
response = await client.get(
safe_url,
headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"},
)
response.raise_for_status()
content_type = response.headers.get("content-type", "")
raw = response.text
text = _strip_html(raw) if "html" in content_type.lower() else raw
truncated = len(text) > limit
return _json_result(
True,
url=str(response.url),
status_code=response.status_code,
content_type=content_type,
content=text[:limit],
truncated=truncated,
)
except Exception as exc:
return _json_result(False, url=url, error=str(exc))
@dataclass(slots=True)
class WebSearchTool:
name: str = "web_search"
description: str = "Search the public web using HTML results. No API key required."
toolset: str = "web"
always_available: bool = False
parameters: dict[str, Any] = field(
default_factory=lambda: {
"type": "object",
"properties": {
"query": {"type": "string", "description": "Search query."},
"limit": {"type": "integer", "default": 5, "minimum": 1, "maximum": 10},
},
"required": ["query"],
}
)
async def execute(self, *, query: str, limit: int = 5, **_: Any) -> str:
try:
if not str(query).strip():
raise ValueError("query is required")
bounded = max(1, min(int(limit or 5), 10))
headers = {"User-Agent": "Mozilla/5.0 Beaver/1.0"}
timeout = httpx.Timeout(connect=5, read=8, write=5, pool=5)
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client:
tasks = [
asyncio.create_task(
_search_bing(
client,
query=query,
limit=bounded,
headers=headers,
)
),
asyncio.create_task(
_search_duckduckgo(
client,
query=query,
limit=bounded,
headers=headers,
)
),
]
errors: list[str] = []
try:
for completed in asyncio.as_completed(tasks):
try:
engine, results = await completed
except Exception as exc:
errors.append(str(exc))
continue
if results:
return _json_result(True, query=query, engine=engine, results=results)
detail = "; ".join(error for error in errors if error) or "no search results"
return _json_result(False, query=query, error=detail)
finally:
for task in tasks:
if not task.done():
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
except Exception as exc:
return _json_result(False, query=query, error=str(exc))
async def _search_bing(
    client: httpx.AsyncClient,
    *,
    query: str,
    limit: int,
    headers: dict[str, str],
) -> tuple[str, list[dict[str, str]]]:
    """Run ``query`` against Bing's HTML search page.

    Returns the engine label ``"bing"`` together with at most ``limit``
    parsed results. HTTP error statuses propagate as exceptions from
    ``raise_for_status``.
    """
    search_url = "https://www.bing.com/search?q=" + quote_plus(query)
    reply = await client.get(search_url, headers=headers)
    reply.raise_for_status()
    hits = _parse_bing_results(reply.text, limit)
    return "bing", hits
async def _search_duckduckgo(
    client: httpx.AsyncClient,
    *,
    query: str,
    limit: int,
    headers: dict[str, str],
) -> tuple[str, list[dict[str, str]]]:
    """Run ``query`` against DuckDuckGo's HTML search endpoint.

    Returns the engine label ``"duckduckgo"`` together with at most ``limit``
    parsed results. HTTP error statuses propagate as exceptions from
    ``raise_for_status``.
    """
    search_url = "https://duckduckgo.com/html/?q=" + quote_plus(query)
    reply = await client.get(search_url, headers=headers)
    reply.raise_for_status()
    hits = _parse_duckduckgo_results(reply.text, limit)
    return "duckduckgo", hits
def _parse_bing_results(html: str, limit: int) -> list[dict[str, str]]:
    """Extract organic results from a Bing HTML results page.

    Each result item is an ``<li>`` whose class list contains ``b_algo``. The
    title and URL come from the first ``<h2><a href=...>`` inside the item and
    the snippet from the first ``<p>`` that follows that link within the same
    item.

    Args:
        html: Raw HTML of the results page.
        limit: Stop once this many results have been collected.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts; ``snippet`` is an
        empty string when the item has no paragraph.
    """
    item_open = re.compile(r'<li[^>]+class="[^"]*\bb_algo\b[^"]*"[^>]*>', re.I)
    heading_link = re.compile(
        r'<h2[^>]*>\s*<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
        re.I | re.S,
    )
    paragraph = re.compile(r"<p\b[^>]*>(?P<snippet>.*?)</p>", re.I | re.S)

    results: list[dict[str, str]] = []
    openings = list(item_open.finditer(html))
    for index, opening in enumerate(openings):
        # An item is taken to run up to the next item's opening tag (or the end
        # of the page). Searching inside that slice keeps one result's snippet
        # from being attributed to another. A single regex with a trailing
        # optional "(?:<p>...)?" after a lazy ".*?" cannot do this: the lazy
        # part matches nothing and the optional group is skipped, so the
        # snippet would never be captured.
        stop = openings[index + 1].start() if index + 1 < len(openings) else len(html)
        item = html[opening.end():stop]

        link = heading_link.search(item)
        if link is None:
            continue
        title = _strip_html(link.group("title"))
        result_url = unescape(link.group("url"))
        if not (title and result_url):
            continue

        # Look for the snippet only after the title link.
        found = paragraph.search(item, link.end())
        snippet = _strip_html(found.group("snippet")) if found else ""

        results.append({"title": title, "url": result_url, "snippet": snippet})
        if len(results) >= limit:
            break
    return results
def _unwrap_duckduckgo_url(href: str) -> str:
    """Return the destination behind a DuckDuckGo ``/l/?uddg=`` redirect link.

    Protocol-relative links (``//host/...``) are given an ``https:`` scheme.
    Links that are not DuckDuckGo redirects, or that cannot be parsed, are
    returned as they are (apart from the scheme fix).
    """
    # Function-scope import: the module-level import list does not provide it.
    from urllib.parse import parse_qs

    if href.startswith("//"):
        href = "https:" + href
    try:
        parts = urlparse(href)
    except ValueError:
        return href
    host = parts.netloc.lower()
    on_duckduckgo = host in ("", "duckduckgo.com") or host.endswith(".duckduckgo.com")
    if on_duckduckgo and parts.path.rstrip("/") == "/l":
        # parse_qs percent-decodes the value, yielding the real target URL.
        targets = parse_qs(parts.query).get("uddg")
        if targets and targets[0]:
            return targets[0]
    return href


def _parse_duckduckgo_results(html: str, limit: int) -> list[dict[str, str]]:
    """Extract results from a DuckDuckGo HTML results page.

    Results are the anchors with ``class="result__a"``. Their ``href`` is
    normally a redirect of the form ``//duckduckgo.com/l/?uddg=<encoded url>``;
    the encoded destination is returned so that callers receive an absolute
    URL they can fetch directly.

    Args:
        html: Raw HTML of the results page.
        limit: Stop once this many results have been collected.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts; ``snippet`` is always
        an empty string because only the title anchors are parsed.
    """
    results: list[dict[str, str]] = []
    pattern = re.compile(
        r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
        re.I | re.S,
    )
    for match in pattern.finditer(html):
        title = _strip_html(match.group("title"))
        result_url = _unwrap_duckduckgo_url(unescape(match.group("url")))
        if title and result_url:
            results.append({"title": title, "url": result_url, "snippet": ""})
        if len(results) >= limit:
            break
    return results