feat(engine): 优化智能体循环中的助手消息处理逻辑 - 在没有工具调用时才添加助手消息到上下文 - 确保工具调用响应正确添加到消息上下文中 - 修复了消息构建的条件逻辑 fix(cron): 改进定时任务调度的时间解析功能 - 添加正则表达式导入用于时间显示解析 - 实现从显示文本中提取毫秒间隔的功能 - 增强整数转换的安全性,避免类型错误 - 优化定时任务配置的解析逻辑 feat(outlook): 增强Outlook集成的功能和稳定性 - 将默认超时时间从10秒增加到180秒 - 为状态检查函数添加可选的验证参数 - 串行执行邮件概览获取操作而非并行 - 改进连接状态验证逻辑 feat(channel): 添加设备名称作为会话标识的选项 - 为终端WebSocket适配器添加新的配置选项 - 实现基于设备名称生成会话对等ID的功能 - 记录原始对等ID和设备名称的元数据 - 支持从设备名称创建会话对等ID feat(skills): 完善技能学习评估系统和进度跟踪 - 在应用启动时自动调度待评估的技能草稿 - 为技能评估工作创建独立的循环工厂 - 实现异步技能评估任务的取消和清理机制 - 添加技能评估进度报告和状态跟踪功能 - 扩展会话列表API以包含更多详细信息 - 防止对不存在的会话进行操作 - 优化技能草稿提交和评估的业务逻辑 perf(skills): 提升技能评估的并发性能 - 实现并行技能案例评估以提高效率 - 添加最大并行案例数的环境变量控制 - 实现实时评估进度更新和回调机制 - 优化评估过程中的资源管理和同步 refactor(services): 创建隔离的智能体循环实例 - 添加创建独立智能体循环的工厂方法 - 确保新循环继承运行时服务配置 - 支持技能评估等需要隔离环境的场景 ```
199 lines
7.2 KiB
Python
199 lines
7.2 KiB
Python
"""No-key web search and fetch tools."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from dataclasses import dataclass, field
|
|
from html import unescape
|
|
import json
|
|
import re
|
|
from typing import Any
|
|
from urllib.parse import quote_plus, urlparse
|
|
|
|
import httpx
|
|
|
|
|
|
def _json_result(success: bool, **payload: Any) -> str:
|
|
return json.dumps({"success": success, **payload}, ensure_ascii=False, indent=2)
|
|
|
|
|
|
def _strip_html(value: str) -> str:
|
|
text = re.sub(r"(?is)<(script|style).*?>.*?</\1>", " ", value)
|
|
text = re.sub(r"(?s)<[^>]+>", " ", text)
|
|
text = unescape(text)
|
|
return re.sub(r"\s+", " ", text).strip()
|
|
|
|
|
|
def _safe_url(url: str) -> str:
|
|
parsed = urlparse(url)
|
|
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
|
|
raise ValueError("url must be an http(s) URL")
|
|
return url
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class WebFetchTool:
|
|
name: str = "web_fetch"
|
|
description: str = "Fetch a public HTTP(S) page and return readable text. No API key required."
|
|
toolset: str = "web"
|
|
always_available: bool = False
|
|
parameters: dict[str, Any] = field(
|
|
default_factory=lambda: {
|
|
"type": "object",
|
|
"properties": {
|
|
"url": {"type": "string", "description": "HTTP(S) URL to fetch."},
|
|
"max_chars": {"type": "integer", "default": 12000, "minimum": 1000, "maximum": 50000},
|
|
},
|
|
"required": ["url"],
|
|
}
|
|
)
|
|
|
|
async def execute(self, *, url: str, max_chars: int = 12000, **_: Any) -> str:
|
|
try:
|
|
safe_url = _safe_url(url)
|
|
limit = max(1000, min(int(max_chars or 12000), 50000))
|
|
timeout = httpx.Timeout(connect=5, read=12, write=5, pool=5)
|
|
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client:
|
|
response = await client.get(
|
|
safe_url,
|
|
headers={"User-Agent": "Mozilla/5.0 Beaver/1.0"},
|
|
)
|
|
response.raise_for_status()
|
|
content_type = response.headers.get("content-type", "")
|
|
raw = response.text
|
|
text = _strip_html(raw) if "html" in content_type.lower() else raw
|
|
truncated = len(text) > limit
|
|
return _json_result(
|
|
True,
|
|
url=str(response.url),
|
|
status_code=response.status_code,
|
|
content_type=content_type,
|
|
content=text[:limit],
|
|
truncated=truncated,
|
|
)
|
|
except Exception as exc:
|
|
return _json_result(False, url=url, error=str(exc))
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class WebSearchTool:
|
|
name: str = "web_search"
|
|
description: str = "Search the public web using HTML results. No API key required."
|
|
toolset: str = "web"
|
|
always_available: bool = False
|
|
parameters: dict[str, Any] = field(
|
|
default_factory=lambda: {
|
|
"type": "object",
|
|
"properties": {
|
|
"query": {"type": "string", "description": "Search query."},
|
|
"limit": {"type": "integer", "default": 5, "minimum": 1, "maximum": 10},
|
|
},
|
|
"required": ["query"],
|
|
}
|
|
)
|
|
|
|
async def execute(self, *, query: str, limit: int = 5, **_: Any) -> str:
|
|
try:
|
|
if not str(query).strip():
|
|
raise ValueError("query is required")
|
|
bounded = max(1, min(int(limit or 5), 10))
|
|
headers = {"User-Agent": "Mozilla/5.0 Beaver/1.0"}
|
|
timeout = httpx.Timeout(connect=5, read=8, write=5, pool=5)
|
|
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, trust_env=True) as client:
|
|
tasks = [
|
|
asyncio.create_task(
|
|
_search_bing(
|
|
client,
|
|
query=query,
|
|
limit=bounded,
|
|
headers=headers,
|
|
)
|
|
),
|
|
asyncio.create_task(
|
|
_search_duckduckgo(
|
|
client,
|
|
query=query,
|
|
limit=bounded,
|
|
headers=headers,
|
|
)
|
|
),
|
|
]
|
|
errors: list[str] = []
|
|
try:
|
|
for completed in asyncio.as_completed(tasks):
|
|
try:
|
|
engine, results = await completed
|
|
except Exception as exc:
|
|
errors.append(str(exc))
|
|
continue
|
|
if results:
|
|
return _json_result(True, query=query, engine=engine, results=results)
|
|
detail = "; ".join(error for error in errors if error) or "no search results"
|
|
return _json_result(False, query=query, error=detail)
|
|
finally:
|
|
for task in tasks:
|
|
if not task.done():
|
|
task.cancel()
|
|
await asyncio.gather(*tasks, return_exceptions=True)
|
|
except Exception as exc:
|
|
return _json_result(False, query=query, error=str(exc))
|
|
|
|
|
|
async def _search_bing(
    client: httpx.AsyncClient,
    *,
    query: str,
    limit: int,
    headers: dict[str, str],
) -> tuple[str, list[dict[str, str]]]:
    """Request the Bing results page for ``query`` and parse it.

    Returns:
        The pair ``("bing", results)`` where ``results`` comes from
        ``_parse_bing_results`` and holds at most ``limit`` entries.

    Any exception raised by the request or by ``raise_for_status`` propagates
    to the caller.
    """
    search_url = f"https://www.bing.com/search?q={quote_plus(query)}"
    page = await client.get(search_url, headers=headers)
    page.raise_for_status()
    hits = _parse_bing_results(page.text, limit)
    return "bing", hits
|
|
|
|
|
|
async def _search_duckduckgo(
    client: httpx.AsyncClient,
    *,
    query: str,
    limit: int,
    headers: dict[str, str],
) -> tuple[str, list[dict[str, str]]]:
    """Request the DuckDuckGo HTML results page for ``query`` and parse it.

    Returns:
        The pair ``("duckduckgo", results)`` where ``results`` comes from
        ``_parse_duckduckgo_results`` and holds at most ``limit`` entries.

    Any exception raised by the request or by ``raise_for_status`` propagates
    to the caller.
    """
    search_url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
    page = await client.get(search_url, headers=headers)
    page.raise_for_status()
    hits = _parse_duckduckgo_results(page.text, limit)
    return "duckduckgo", hits
|
|
|
|
|
|
def _parse_bing_results(html: str, limit: int) -> list[dict[str, str]]:
|
|
results: list[dict[str, str]] = []
|
|
pattern = re.compile(
|
|
r'<li[^>]+class="[^"]*\bb_algo\b[^"]*"[^>]*>.*?<h2[^>]*>\s*'
|
|
r'<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>.*?'
|
|
r'(?:<p[^>]*>(?P<snippet>.*?)</p>)?',
|
|
re.I | re.S,
|
|
)
|
|
for match in pattern.finditer(html):
|
|
title = _strip_html(match.group("title"))
|
|
result_url = unescape(match.group("url"))
|
|
snippet = _strip_html(match.group("snippet") or "")
|
|
if title and result_url:
|
|
results.append({"title": title, "url": result_url, "snippet": snippet})
|
|
if len(results) >= limit:
|
|
break
|
|
return results
|
|
|
|
|
|
def _parse_duckduckgo_results(html: str, limit: int) -> list[dict[str, str]]:
|
|
results: list[dict[str, str]] = []
|
|
pattern = re.compile(
|
|
r'<a[^>]+class="result__a"[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<title>.*?)</a>',
|
|
re.I | re.S,
|
|
)
|
|
for match in pattern.finditer(html):
|
|
title = _strip_html(match.group("title"))
|
|
result_url = unescape(match.group("url"))
|
|
if title and result_url:
|
|
results.append({"title": title, "url": result_url, "snippet": ""})
|
|
if len(results) >= limit:
|
|
break
|
|
return results
|