"""Workspace-scoped read-only filesystem tools. 这些工具是 Beaver 第一批真实本地工具,只做只读能力: - list_directory - read_file - search_files 安全边界先保持非常明确:所有用户传入路径都必须解析到当前 `ToolContext.workspace` 内部。即使 workspace 里有指向外部的符号链接, 读取时也会因为真实路径越界而被拒绝。 """ from __future__ import annotations from dataclasses import dataclass, field import json from pathlib import Path from typing import Any, Iterable MAX_LIST_ENTRIES = 1_000 MAX_READ_LINES = 1_000 MAX_READ_CHARS = 120_000 MAX_SEARCH_RESULTS = 200 MAX_SEARCH_FILE_BYTES = 2_000_000 MAX_SEARCH_FILES = 5_000 SKIP_DIR_NAMES = { ".git", ".hg", ".svn", ".venv", "venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache", "node_modules", "dist", "build", } LIST_DIRECTORY_PARAMETERS: dict[str, Any] = { "type": "object", "properties": { "path": { "type": "string", "default": ".", "description": "Directory path relative to the current workspace. Absolute paths are allowed only if they stay inside the workspace.", }, "recursive": { "type": "boolean", "default": False, "description": "Whether to recursively list child entries. Symlink directories are not followed.", }, "max_entries": { "type": "integer", "default": 200, "minimum": 1, "maximum": MAX_LIST_ENTRIES, "description": "Maximum number of entries to return.", }, }, "required": [], } READ_FILE_PARAMETERS: dict[str, Any] = { "type": "object", "properties": { "path": { "type": "string", "description": "File path relative to the current workspace. Absolute paths are allowed only if they stay inside the workspace.", }, "start_line": { "type": "integer", "default": 1, "minimum": 1, "description": "1-based line number to start reading from.", }, "max_lines": { "type": "integer", "default": 200, "minimum": 1, "maximum": MAX_READ_LINES, "description": "Maximum number of lines to read.", }, }, "required": ["path"], } SEARCH_FILES_PARAMETERS: dict[str, Any] = { "type": "object", "properties": { "query": { "type": "string", "description": "Plain text query to search in file paths and UTF-8 text files.", }, "path": { "type": "string", "default": ".", "description": "Directory or file path relative to the current workspace.", }, "max_results": { "type": "integer", "default": 50, "minimum": 1, "maximum": MAX_SEARCH_RESULTS, "description": "Maximum number of matches to return.", }, "case_sensitive": { "type": "boolean", "default": False, "description": "Whether search should be case-sensitive.", }, }, "required": ["query"], } class WorkspacePathError(ValueError): """Raised when a requested path escapes the configured workspace.""" def _json_result(success: bool, **payload: Any) -> str: return json.dumps({"success": success, **payload}, ensure_ascii=False, indent=2) def _clamp_int(value: Any, *, default: int, minimum: int, maximum: int) -> int: try: parsed = int(value) except (TypeError, ValueError): parsed = default return max(minimum, min(parsed, maximum)) def _workspace_root(workspace: str | None) -> Path: if not workspace: raise WorkspacePathError("workspace is not configured for filesystem tools") root = Path(workspace).expanduser().resolve(strict=True) if not root.is_dir(): raise WorkspacePathError(f"workspace is not a directory: {root}") return root def _resolve_existing_path(workspace: str | None, user_path: str | None) -> tuple[Path, Path]: """Resolve a user path and ensure the real target stays inside workspace.""" root = _workspace_root(workspace) raw_path = Path(user_path or ".").expanduser() candidate = raw_path if raw_path.is_absolute() else root / raw_path resolved = candidate.resolve(strict=True) try: resolved.relative_to(root) except ValueError as exc: raise WorkspacePathError( f"path escapes workspace: {user_path or '.'}" ) from exc return root, resolved def _relative_path(root: Path, path: Path) -> str: try: return str(path.relative_to(root)) or "." except ValueError: return str(path) def _entry_type(path: Path) -> str: if path.is_symlink(): return "symlink" if path.is_dir(): return "directory" if path.is_file(): return "file" return "other" def _entry_payload(root: Path, path: Path) -> dict[str, Any]: try: stat = path.lstat() if path.is_symlink() else path.stat() size = stat.st_size except OSError: size = None return { "name": path.name, "path": _relative_path(root, path), "type": _entry_type(path), "size": size, } def _iter_directory(root: Path, directory: Path, *, recursive: bool) -> Iterable[Path]: def sort_key(item: Path) -> tuple[bool, str]: is_real_directory = not item.is_symlink() and item.is_dir() return (not is_real_directory, item.name.lower()) entries = sorted(directory.iterdir(), key=sort_key) for entry in entries: yield entry if not recursive or entry.is_symlink() or not entry.is_dir(): continue yield from _iter_directory(root, entry, recursive=True) def _looks_binary(path: Path) -> bool: try: with path.open("rb") as handle: sample = handle.read(4096) except OSError: return True return b"\0" in sample def _read_text_file(path: Path) -> str: if _looks_binary(path): raise ValueError("binary files cannot be read by read_file/search_files") return path.read_text(encoding="utf-8") def _iter_search_files(root: Path, start: Path) -> Iterable[Path]: if start.is_file(): yield start return stack = [start] visited = 0 while stack and visited < MAX_SEARCH_FILES: current = stack.pop() try: children = sorted(current.iterdir(), key=lambda item: item.name.lower()) except OSError: continue for child in children: if child.is_symlink(): continue if child.is_dir(): if child.name in SKIP_DIR_NAMES: continue stack.append(child) continue if child.is_file(): visited += 1 yield child if visited >= MAX_SEARCH_FILES: break @dataclass(slots=True) class ListDirectoryTool: """List files and directories inside the current workspace.""" name: str = "list_directory" description: str = ( "List files and directories inside the current workspace. " "Use this before reading files when you need to inspect project structure. " "This tool never follows paths outside the workspace." ) toolset: str = "filesystem" always_available: bool = True workspace: str | None = None parameters: dict[str, Any] = field(default_factory=lambda: dict(LIST_DIRECTORY_PARAMETERS)) async def execute( self, *, path: str = ".", recursive: bool = False, max_entries: int = 200, workspace: str | None = None, ) -> str: try: root, resolved = _resolve_existing_path(workspace, path) if not resolved.is_dir(): return _json_result(False, error="not_a_directory", path=path) limit = _clamp_int(max_entries, default=200, minimum=1, maximum=MAX_LIST_ENTRIES) entries: list[dict[str, Any]] = [] truncated = False for entry in _iter_directory(root, resolved, recursive=bool(recursive)): entries.append(_entry_payload(root, entry)) if len(entries) >= limit: truncated = True break return _json_result( True, path=_relative_path(root, resolved), recursive=bool(recursive), entries=entries, truncated=truncated, ) except (OSError, WorkspacePathError, ValueError) as exc: return _json_result(False, error=str(exc), path=path) @dataclass(slots=True) class ReadFileTool: """Read a UTF-8 text file inside the current workspace.""" name: str = "read_file" description: str = ( "Read a UTF-8 text file inside the current workspace with line limits. " "Use this to inspect source code, docs, config, or logs. " "This tool rejects binary files and paths outside the workspace." ) toolset: str = "filesystem" always_available: bool = True workspace: str | None = None parameters: dict[str, Any] = field(default_factory=lambda: dict(READ_FILE_PARAMETERS)) async def execute( self, *, path: str, start_line: int = 1, max_lines: int = 200, workspace: str | None = None, ) -> str: try: root, resolved = _resolve_existing_path(workspace, path) if not resolved.is_file(): return _json_result(False, error="not_a_file", path=path) start = _clamp_int(start_line, default=1, minimum=1, maximum=10_000_000) limit = _clamp_int(max_lines, default=200, minimum=1, maximum=MAX_READ_LINES) content = _read_text_file(resolved) lines = content.splitlines() selected = lines[start - 1 : start - 1 + limit] selected_text = "\n".join(selected) char_truncated = False if len(selected_text) > MAX_READ_CHARS: selected_text = selected_text[:MAX_READ_CHARS] char_truncated = True end_line = start + len(selected) - 1 if selected else start - 1 return _json_result( True, path=_relative_path(root, resolved), start_line=start, end_line=end_line, total_lines=len(lines), truncated=end_line < len(lines) or char_truncated, content=selected_text, ) except UnicodeDecodeError: return _json_result(False, error="file is not valid UTF-8 text", path=path) except (OSError, WorkspacePathError, ValueError) as exc: return _json_result(False, error=str(exc), path=path) @dataclass(slots=True) class SearchFilesTool: """Search filenames and UTF-8 text file contents inside the workspace.""" name: str = "search_files" description: str = ( "Search file paths and UTF-8 text file contents inside the current workspace. " "Use this to find relevant source files, docs, config keys, or log lines. " "This tool skips large/binary files and never searches outside the workspace." ) toolset: str = "filesystem" always_available: bool = True workspace: str | None = None parameters: dict[str, Any] = field(default_factory=lambda: dict(SEARCH_FILES_PARAMETERS)) async def execute( self, *, query: str, path: str = ".", max_results: int = 50, case_sensitive: bool = False, workspace: str | None = None, ) -> str: try: if not isinstance(query, str) or not query.strip(): return _json_result(False, error="query must be a non-empty string") root, resolved = _resolve_existing_path(workspace, path) if not resolved.is_dir() and not resolved.is_file(): return _json_result(False, error="path must be a file or directory", path=path) limit = _clamp_int(max_results, default=50, minimum=1, maximum=MAX_SEARCH_RESULTS) needle = query if case_sensitive else query.lower() results: list[dict[str, Any]] = [] searched_files = 0 skipped_files = 0 for file_path in _iter_search_files(root, resolved): relative = _relative_path(root, file_path) haystack_path = relative if case_sensitive else relative.lower() if needle in haystack_path: results.append( { "path": relative, "line": None, "match_type": "path", "preview": relative, } ) if len(results) >= limit: break try: if file_path.stat().st_size > MAX_SEARCH_FILE_BYTES or _looks_binary(file_path): skipped_files += 1 continue text = file_path.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): skipped_files += 1 continue searched_files += 1 lines = text.splitlines() for index, line in enumerate(lines, start=1): haystack_line = line if case_sensitive else line.lower() if needle not in haystack_line: continue results.append( { "path": relative, "line": index, "match_type": "content", "preview": line[:500], } ) if len(results) >= limit: break if len(results) >= limit: break return _json_result( True, query=query, path=_relative_path(root, resolved), results=results, truncated=len(results) >= limit, searched_files=searched_files, skipped_files=skipped_files, ) except (OSError, WorkspacePathError, ValueError) as exc: return _json_result(False, error=str(exc), path=path)