# beaver_project/app-instance/backend/beaver/tools/builtins/filesystem.py

"""Workspace-scoped read-only filesystem tools.

这些工具是 Beaver 第一批真实本地工具，只做只读能力：
- list_directory
- read_file
- search_files

安全边界先保持非常明确：所有用户传入路径都必须解析到当前
`ToolContext.workspace` 内部。即使 workspace 里有指向外部的符号链接，
读取时也会因为真实路径越界而被拒绝。
"""

from __future__ import annotations

from dataclasses import dataclass, field
import json
from pathlib import Path
from typing import Any, Iterable


# Hard ceilings enforced by the tools in this module.
MAX_LIST_ENTRIES = 1_000  # list_directory: most entries returned by one call
MAX_READ_LINES = 1_000  # read_file: most lines returned by one call
MAX_READ_CHARS = 120_000  # read_file: most characters of content per call
MAX_SEARCH_RESULTS = 200  # search_files: most matches returned by one call
MAX_SEARCH_FILE_BYTES = 2_000_000  # search_files: larger files are skipped
MAX_SEARCH_FILES = 5_000  # search_files: most files visited by one call

# Directory names that search_files never descends into.
SKIP_DIR_NAMES = {
    # version-control metadata
    ".git",
    ".hg",
    ".svn",
    # virtual environments and tool caches
    ".venv",
    "venv",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    # installed dependencies and build output
    "node_modules",
    "dist",
    "build",
}


# JSON schema advertised for list_directory.
LIST_DIRECTORY_PARAMETERS: dict[str, Any] = {
    "type": "object",
    "properties": {
        "path": {
            "type": "string",
            "default": ".",
            "description": (
                "Directory path relative to the current workspace. "
                "Absolute paths are allowed only if they stay inside the workspace."
            ),
        },
        "recursive": {
            "type": "boolean",
            "default": False,
            "description": (
                "Whether to recursively list child entries. "
                "Symlink directories are not followed."
            ),
        },
        "max_entries": {
            "type": "integer",
            "default": 200,
            "minimum": 1,
            "maximum": MAX_LIST_ENTRIES,
            "description": "Maximum number of entries to return.",
        },
    },
    "required": [],
}

# JSON schema advertised for read_file.
READ_FILE_PARAMETERS: dict[str, Any] = {
    "type": "object",
    "properties": {
        "path": {
            "type": "string",
            "description": (
                "File path relative to the current workspace. "
                "Absolute paths are allowed only if they stay inside the workspace."
            ),
        },
        "start_line": {
            "type": "integer",
            "default": 1,
            "minimum": 1,
            "description": "1-based line number to start reading from.",
        },
        "max_lines": {
            "type": "integer",
            "default": 200,
            "minimum": 1,
            "maximum": MAX_READ_LINES,
            "description": "Maximum number of lines to read.",
        },
    },
    "required": ["path"],
}

# JSON schema advertised for search_files.
SEARCH_FILES_PARAMETERS: dict[str, Any] = {
    "type": "object",
    "properties": {
        "query": {
            "type": "string",
            "description": (
                "Plain text query to search in file paths "
                "and UTF-8 text files."
            ),
        },
        "path": {
            "type": "string",
            "default": ".",
            "description": (
                "Directory or file path relative to the current workspace."
            ),
        },
        "max_results": {
            "type": "integer",
            "default": 50,
            "minimum": 1,
            "maximum": MAX_SEARCH_RESULTS,
            "description": "Maximum number of matches to return.",
        },
        "case_sensitive": {
            "type": "boolean",
            "default": False,
            "description": "Whether search should be case-sensitive.",
        },
    },
    "required": ["query"],
}


class WorkspacePathError(ValueError):
    """Report a workspace or path that fails the workspace boundary checks.

    Being a ``ValueError`` subclass, it is also caught by handlers written
    for ``ValueError``.
    """


def _json_result(success: bool, **payload: Any) -> str:
    return json.dumps({"success": success, **payload}, ensure_ascii=False, indent=2)


def _clamp_int(value: Any, *, default: int, minimum: int, maximum: int) -> int:
    try:
        parsed = int(value)
    except (TypeError, ValueError):
        parsed = default
    return max(minimum, min(parsed, maximum))


def _workspace_root(workspace: str | None) -> Path:
    """Return the workspace as a fully resolved directory path.

    Raises:
        WorkspacePathError: No workspace was given, or it is not a directory.
        OSError: The workspace path does not exist (strict resolution).
    """
    if workspace:
        resolved_root = Path(workspace).expanduser().resolve(strict=True)
        if resolved_root.is_dir():
            return resolved_root
        raise WorkspacePathError(f"workspace is not a directory: {resolved_root}")
    raise WorkspacePathError("workspace is not configured for filesystem tools")


def _resolve_existing_path(workspace: str | None, user_path: str | None) -> tuple[Path, Path]:
    """Resolve a user path and ensure the real target stays inside workspace.

    Args:
        workspace: Workspace directory, validated by ``_workspace_root``.
        user_path: Path supplied by the caller. Empty or ``None`` means the
            workspace root; relative paths are joined onto the workspace root.

    Returns:
        ``(root, resolved)``: the resolved workspace root and the fully
        resolved target (symlinks followed).

    Raises:
        WorkspacePathError: The workspace is invalid, the path is not
            path-like, it cannot be resolved because of a symlink loop, or
            its real location is outside the workspace.
        OSError: The path does not exist or cannot be accessed.
    """
    root = _workspace_root(workspace)
    shown = user_path or "."
    try:
        raw_path = Path(shown)
    except TypeError as exc:
        raise WorkspacePathError(f"path must be a string: {shown!r}") from exc

    try:
        raw_path = raw_path.expanduser()
    except RuntimeError:
        # "~name" for an account without a known home directory cannot be
        # expanded. Keep the literal text so an entry that is really called
        # "~name" inside the workspace stays reachable.
        pass

    candidate = raw_path if raw_path.is_absolute() else root / raw_path
    try:
        resolved = candidate.resolve(strict=True)
    except RuntimeError as exc:
        # Before Python 3.13 pathlib reports symlink loops as RuntimeError,
        # which the tools' error handlers do not catch.
        raise WorkspacePathError(f"path cannot be resolved: {shown}") from exc

    try:
        resolved.relative_to(root)
    except ValueError as exc:
        raise WorkspacePathError(
            f"path escapes workspace: {shown}"
        ) from exc
    return root, resolved


def _relative_path(root: Path, path: Path) -> str:
    try:
        return str(path.relative_to(root)) or "."
    except ValueError:
        return str(path)


def _entry_type(path: Path) -> str:
    if path.is_symlink():
        return "symlink"
    if path.is_dir():
        return "directory"
    if path.is_file():
        return "file"
    return "other"


def _entry_payload(root: Path, path: Path) -> dict[str, Any]:
    """Describe one directory entry as a JSON-ready dict.

    Keys are ``name``, ``path`` (relative to ``root``), ``type`` and ``size``.
    For a symlink the size is that of the link itself, not of its target;
    ``size`` is ``None`` when the entry cannot be stat'ed.
    """
    try:
        info = path.lstat() if path.is_symlink() else path.stat()
    except OSError:
        byte_count = None
    else:
        byte_count = info.st_size

    payload: dict[str, Any] = {"name": path.name}
    payload["path"] = _relative_path(root, path)
    payload["type"] = _entry_type(path)
    payload["size"] = byte_count
    return payload


def _iter_directory(root: Path, directory: Path, *, recursive: bool) -> Iterable[Path]:
    def sort_key(item: Path) -> tuple[bool, str]:
        is_real_directory = not item.is_symlink() and item.is_dir()
        return (not is_real_directory, item.name.lower())

    entries = sorted(directory.iterdir(), key=sort_key)
    for entry in entries:
        yield entry
        if not recursive or entry.is_symlink() or not entry.is_dir():
            continue
        yield from _iter_directory(root, entry, recursive=True)


def _looks_binary(path: Path) -> bool:
    try:
        with path.open("rb") as handle:
            sample = handle.read(4096)
    except OSError:
        return True
    return b"\0" in sample


def _read_text_file(path: Path) -> str:
    if _looks_binary(path):
        raise ValueError("binary files cannot be read by read_file/search_files")
    return path.read_text(encoding="utf-8")


def _iter_search_files(root: Path, start: Path) -> Iterable[Path]:
    """Yield the regular files that a search starting at ``start`` should visit.

    A file ``start`` is yielded by itself. Otherwise the tree below ``start``
    is walked without following symlinks and without entering directories
    named in ``SKIP_DIR_NAMES``; directories that cannot be listed are passed
    over. At most ``MAX_SEARCH_FILES`` files are produced. ``root`` is not
    used by the walk.
    """
    if start.is_file():
        yield start
        return

    pending = [start]
    produced = 0
    while pending and produced < MAX_SEARCH_FILES:
        directory = pending.pop()
        try:
            listing = sorted(directory.iterdir(), key=lambda item: item.name.lower())
        except OSError:
            continue

        for item in listing:
            if item.is_symlink():
                continue
            if item.is_dir():
                if item.name not in SKIP_DIR_NAMES:
                    pending.append(item)
            elif item.is_file():
                produced += 1
                yield item
                if produced >= MAX_SEARCH_FILES:
                    break


@dataclass(slots=True)
class ListDirectoryTool:
    """List files and directories inside the current workspace.

    The fields carry the tool's metadata and JSON parameter schema;
    ``execute`` performs the listing and reports both results and anticipated
    failures as a JSON string.
    """

    name: str = "list_directory"
    description: str = (
        "List files and directories inside the current workspace. "
        "Use this before reading files when you need to inspect project structure. "
        "This tool never follows paths outside the workspace."
    )
    toolset: str = "filesystem"
    always_available: bool = True
    # Workspace used by `execute` when the call itself does not supply one.
    workspace: str | None = None
    # Shallow copy per instance: the nested dicts are still shared with
    # LIST_DIRECTORY_PARAMETERS.
    parameters: dict[str, Any] = field(default_factory=lambda: dict(LIST_DIRECTORY_PARAMETERS))

    async def execute(
        self,
        *,
        path: str = ".",
        recursive: bool = False,
        max_entries: int = 200,
        workspace: str | None = None,
    ) -> str:
        """List the entries of a workspace directory.

        Args:
            path: Directory to list; must resolve inside the workspace.
            recursive: Also list the contents of real sub-directories.
            max_entries: Entry limit, clamped to ``1..MAX_LIST_ENTRIES``.
            workspace: Workspace root; falls back to ``self.workspace`` when
                ``None``.

        Returns:
            A JSON string. On success it holds ``path``, ``recursive``,
            ``entries`` and ``truncated``; on failure ``error`` and ``path``.
        """
        try:
            effective_workspace = workspace if workspace is not None else self.workspace
            root, resolved = _resolve_existing_path(effective_workspace, path)
            if not resolved.is_dir():
                return _json_result(False, error="not_a_directory", path=path)

            limit = _clamp_int(max_entries, default=200, minimum=1, maximum=MAX_LIST_ENTRIES)
            entries: list[dict[str, Any]] = []
            truncated = False
            for entry in _iter_directory(root, resolved, recursive=bool(recursive)):
                if len(entries) >= limit:
                    # Another entry exists beyond the limit, so the listing is
                    # really cut short. A directory holding exactly `limit`
                    # entries is not reported as truncated.
                    truncated = True
                    break
                entries.append(_entry_payload(root, entry))

            return _json_result(
                True,
                path=_relative_path(root, resolved),
                recursive=bool(recursive),
                entries=entries,
                truncated=truncated,
            )
        except (OSError, WorkspacePathError, ValueError, RuntimeError) as exc:
            # RuntimeError: pathlib raises it for "~user" paths it cannot
            # expand and, before Python 3.13, for symlink loops.
            return _json_result(False, error=str(exc), path=path)


@dataclass(slots=True)
class ReadFileTool:
    """Read a UTF-8 text file inside the current workspace.

    The fields carry the tool's metadata and JSON parameter schema;
    ``execute`` reads a window of lines and reports both results and
    anticipated failures as a JSON string.
    """

    name: str = "read_file"
    description: str = (
        "Read a UTF-8 text file inside the current workspace with line limits. "
        "Use this to inspect source code, docs, config, or logs. "
        "This tool rejects binary files and paths outside the workspace."
    )
    toolset: str = "filesystem"
    always_available: bool = True
    # Workspace used by `execute` when the call itself does not supply one.
    workspace: str | None = None
    # Shallow copy per instance: the nested dicts are still shared with
    # READ_FILE_PARAMETERS.
    parameters: dict[str, Any] = field(default_factory=lambda: dict(READ_FILE_PARAMETERS))

    async def execute(
        self,
        *,
        path: str,
        start_line: int = 1,
        max_lines: int = 200,
        workspace: str | None = None,
    ) -> str:
        """Read up to ``max_lines`` lines of a text file, starting at ``start_line``.

        Args:
            path: File to read; must resolve inside the workspace.
            start_line: 1-based first line, clamped to ``1..10_000_000``.
            max_lines: Line limit, clamped to ``1..MAX_READ_LINES``.
            workspace: Workspace root; falls back to ``self.workspace`` when
                ``None``.

        Returns:
            A JSON string. On success it holds ``path``, ``start_line``,
            ``end_line``, ``total_lines``, ``truncated`` and ``content``; on
            failure ``error`` and ``path``. ``end_line`` is the last line
            present in ``content``, so reading on from ``end_line + 1`` never
            skips a line that was dropped by the character limit.
        """
        try:
            effective_workspace = workspace if workspace is not None else self.workspace
            root, resolved = _resolve_existing_path(effective_workspace, path)
            if not resolved.is_file():
                return _json_result(False, error="not_a_file", path=path)

            start = _clamp_int(start_line, default=1, minimum=1, maximum=10_000_000)
            limit = _clamp_int(max_lines, default=200, minimum=1, maximum=MAX_READ_LINES)
            lines = _read_text_file(resolved).splitlines()
            window = lines[start - 1 : start - 1 + limit]

            # Apply the character budget on whole lines so that `end_line`
            # matches what is actually returned.
            kept: list[str] = []
            budget = MAX_READ_CHARS
            char_truncated = False
            for line in window:
                cost = len(line) + (1 if kept else 0)  # +1 for the joining newline
                if cost > budget:
                    char_truncated = True
                    if not kept:
                        # One line larger than the whole budget: return its
                        # prefix rather than nothing at all.
                        kept.append(line[:budget])
                    break
                kept.append(line)
                budget -= cost
            selected_text = "\n".join(kept)

            end_line = start + len(kept) - 1 if kept else start - 1
            return _json_result(
                True,
                path=_relative_path(root, resolved),
                start_line=start,
                end_line=end_line,
                total_lines=len(lines),
                truncated=end_line < len(lines) or char_truncated,
                content=selected_text,
            )
        except UnicodeDecodeError:
            return _json_result(False, error="file is not valid UTF-8 text", path=path)
        except (OSError, WorkspacePathError, ValueError, RuntimeError) as exc:
            # RuntimeError: pathlib raises it for "~user" paths it cannot
            # expand and, before Python 3.13, for symlink loops.
            return _json_result(False, error=str(exc), path=path)


@dataclass(slots=True)
class SearchFilesTool:
    """Search filenames and UTF-8 text file contents inside the workspace.

    The fields carry the tool's metadata and JSON parameter schema;
    ``execute`` performs a plain substring search and reports both results
    and anticipated failures as a JSON string.
    """

    name: str = "search_files"
    description: str = (
        "Search file paths and UTF-8 text file contents inside the current workspace. "
        "Use this to find relevant source files, docs, config keys, or log lines. "
        "This tool skips large/binary files and never searches outside the workspace."
    )
    toolset: str = "filesystem"
    always_available: bool = True
    # Workspace used by `execute` when the call itself does not supply one.
    workspace: str | None = None
    # Shallow copy per instance: the nested dicts are still shared with
    # SEARCH_FILES_PARAMETERS.
    parameters: dict[str, Any] = field(default_factory=lambda: dict(SEARCH_FILES_PARAMETERS))

    async def execute(
        self,
        *,
        query: str,
        path: str = ".",
        max_results: int = 50,
        case_sensitive: bool = False,
        workspace: str | None = None,
    ) -> str:
        """Search file paths and file contents for ``query``.

        Args:
            query: Non-empty plain-text needle (substring match, no patterns).
            path: File or directory to search; must resolve inside the
                workspace.
            max_results: Match limit, clamped to ``1..MAX_SEARCH_RESULTS``.
            case_sensitive: Compare exactly instead of on lower-cased text.
            workspace: Workspace root; falls back to ``self.workspace`` when
                ``None``.

        Returns:
            A JSON string. On success it holds ``query``, ``path``,
            ``results``, ``truncated``, ``searched_files`` and
            ``skipped_files``; on failure ``error`` (plus ``path`` except for
            an invalid query). ``truncated`` is true when the match limit was
            reached or when the walk visited ``MAX_SEARCH_FILES`` files, since
            in both cases further matches may exist.
        """
        try:
            if not isinstance(query, str) or not query.strip():
                return _json_result(False, error="query must be a non-empty string")
            effective_workspace = workspace if workspace is not None else self.workspace
            root, resolved = _resolve_existing_path(effective_workspace, path)
            if not resolved.is_dir() and not resolved.is_file():
                return _json_result(False, error="path must be a file or directory", path=path)

            limit = _clamp_int(max_results, default=50, minimum=1, maximum=MAX_SEARCH_RESULTS)
            needle = query if case_sensitive else query.lower()
            results: list[dict[str, Any]] = []
            searched_files = 0
            skipped_files = 0
            files_seen = 0

            for file_path in _iter_search_files(root, resolved):
                files_seen += 1
                relative = _relative_path(root, file_path)
                haystack_path = relative if case_sensitive else relative.lower()
                if needle in haystack_path:
                    results.append(
                        {
                            "path": relative,
                            "line": None,
                            "match_type": "path",
                            "preview": relative,
                        }
                    )
                    if len(results) >= limit:
                        break

                try:
                    # Oversized and binary-looking files are matched by path only.
                    if file_path.stat().st_size > MAX_SEARCH_FILE_BYTES or _looks_binary(file_path):
                        skipped_files += 1
                        continue
                    text = file_path.read_text(encoding="utf-8")
                except (OSError, UnicodeDecodeError):
                    skipped_files += 1
                    continue

                searched_files += 1
                lines = text.splitlines()
                for index, line in enumerate(lines, start=1):
                    haystack_line = line if case_sensitive else line.lower()
                    if needle not in haystack_line:
                        continue
                    results.append(
                        {
                            "path": relative,
                            "line": index,
                            "match_type": "content",
                            "preview": line[:500],
                        }
                    )
                    if len(results) >= limit:
                        break
                if len(results) >= limit:
                    break

            # Reaching the file cap means part of the tree was never looked at,
            # so the result set has to be flagged as possibly incomplete.
            hit_file_cap = files_seen >= MAX_SEARCH_FILES
            return _json_result(
                True,
                query=query,
                path=_relative_path(root, resolved),
                results=results,
                truncated=len(results) >= limit or hit_file_cap,
                searched_files=searched_files,
                skipped_files=skipped_files,
            )
        except (OSError, WorkspacePathError, ValueError, RuntimeError) as exc:
            # RuntimeError: pathlib raises it for "~user" paths it cannot
            # expand and, before Python 3.13, for symlink loops.
            return _json_result(False, error=str(exc), path=path)