"""User-visible file system service. This module owns the personal file-system boundary exposed to users and agents. Storage backends can change, but callers see only virtual paths under fixed roots. """ from __future__ import annotations from contextlib import suppress from dataclasses import dataclass from datetime import datetime, timezone from io import BytesIO import mimetypes from pathlib import Path, PurePosixPath import shutil import tempfile from typing import Protocol USER_FILE_ROOTS = ("uploads", "outputs", "shared", "tasks") MAX_PREVIEW_BYTES = 1024 * 1024 AGENT_UPLOADS_ERROR = "uploads/ is user-provided input storage; agents may read it but must not write it" AGENT_DELETE_ERROR = "agents cannot delete user-visible files; use the Files page or user-side APIs" class UserFileError(ValueError): """Base error for user file operations.""" class UserFilePathError(UserFileError): """Raised when a user file path violates the virtual path policy.""" class UserFileNotFoundError(UserFileError): """Raised when a user file path does not exist.""" class UserFileSizeError(UserFileError): """Raised when a user file upload exceeds configured limits.""" @dataclass(frozen=True, slots=True) class AgentUserFilePolicy: task_id: str | None = None fallback_scope: str = "interactive" @property def task_namespace(self) -> str: if self.task_id: return f"tasks/{self.task_id}" scope = _safe_scope(self.fallback_scope) return f"tasks/interactive/{scope}" def validate_read(self, path: str) -> str: return normalize_user_path(path, allow_root=False) def validate_write(self, path: str) -> str: normalized = normalize_user_path(path, allow_root=False) root = normalized.split("/", 1)[0] if root == "uploads": raise UserFilePathError(AGENT_UPLOADS_ERROR) if root == "tasks": self._validate_task_namespace(normalized) return normalized def validate_mkdir(self, path: str) -> str: return self.validate_write(path) def validate_delete(self, path: str) -> str: normalize_user_path(path, 
allow_root=False) raise UserFilePathError(AGENT_DELETE_ERROR) def _validate_task_namespace(self, normalized: str) -> None: namespace = self.task_namespace if normalized == "tasks" or not normalized.startswith(f"{namespace}/"): raise UserFilePathError(f"Agent task files must be written under {namespace}/") @dataclass(slots=True) class UserFileEntry: name: str path: str type: str size: int | None = None content_type: str | None = None modified: str | None = None def to_dict(self) -> dict[str, object]: return { "name": self.name, "path": self.path, "type": self.type, "size": self.size, "content_type": self.content_type, "modified": self.modified, } @dataclass(slots=True) class UserFileContent: name: str path: str size: int content_type: str modified: str | None content: bytes @dataclass(slots=True) class UserFilePreview: name: str path: str size: int content_type: str modified: str | None is_binary: bool is_truncated: bool content: str | None def to_dict(self) -> dict[str, object]: return { "name": self.name, "path": self.path, "size": self.size, "content_type": self.content_type, "modified": self.modified, "is_binary": self.is_binary, "is_truncated": self.is_truncated, "content": self.content, } class UserFileStorage(Protocol): async def list_dir(self, path: str) -> list[UserFileEntry]: ... async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent: ... async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry: ... async def write_file_stream( self, path: str, stream: object, *, content_type: str, max_bytes: int | None = None, part_size: int = 10 * 1024 * 1024, ) -> UserFileEntry: ... async def delete_path(self, path: str) -> bool: ... async def mkdir(self, path: str) -> UserFileEntry: ... 
class UserFileService:
    """Validate user-facing file requests and delegate them to a storage backend.

    Every public method normalizes its path with ``normalize_user_path`` before
    calling storage, so the backend only receives virtual paths located under
    one of ``USER_FILE_ROOTS``.
    """

    def __init__(self, storage: UserFileStorage) -> None:
        self.storage = storage

    async def browse(self, path: str = "") -> dict[str, object]:
        """List a directory; the empty path lists the fixed virtual roots.

        Returns:
            ``{"path": <normalized path>, "items": [<entry dict>, ...]}``.

        Raises:
            UserFilePathError: the path violates the virtual path policy.
        """
        normalized = normalize_user_path(path, allow_root=True)
        if normalized == "":
            # The roots are synthesized here; storage is not consulted.
            roots = [
                UserFileEntry(name=root, path=root, type="directory").to_dict()
                for root in USER_FILE_ROOTS
            ]
            return {"path": "", "items": roots}
        entries = await self.storage.list_dir(normalized)
        return {"path": normalized, "items": [entry.to_dict() for entry in entries]}

    @staticmethod
    def _upload_target(directory: str, filename: str) -> str:
        """Return the normalized ``directory/filename`` path for an upload."""
        if not is_safe_filename(filename):
            raise UserFilePathError("Invalid filename")
        return normalize_user_path(_join_user_path(directory, filename), allow_root=False)

    async def upload(self, directory: str, filename: str, content: bytes, *, content_type: str) -> dict[str, object]:
        """Store ``content`` as ``filename`` inside ``directory``.

        Raises:
            UserFilePathError: the filename or directory is not acceptable.
        """
        target = self._upload_target(directory, filename)
        entry = await self.storage.write_file(target, content, content_type=content_type)
        return entry.to_dict()

    async def upload_stream(
        self,
        directory: str,
        filename: str,
        stream: object,
        *,
        content_type: str,
        max_bytes: int | None = None,
        part_size: int = 10 * 1024 * 1024,
    ) -> dict[str, object]:
        """Store the bytes read from ``stream`` as ``filename`` inside ``directory``.

        ``max_bytes`` and ``part_size`` are forwarded unchanged to the backend.

        Raises:
            UserFilePathError: the filename or directory is not acceptable.
        """
        target = self._upload_target(directory, filename)
        entry = await self.storage.write_file_stream(
            target,
            stream,
            content_type=content_type,
            max_bytes=max_bytes,
            part_size=part_size,
        )
        return entry.to_dict()

    async def write_file(self, path: str, content: bytes | str, *, content_type: str = "text/plain") -> dict[str, object]:
        """Write ``content`` to ``path``; ``str`` content is encoded as UTF-8.

        Raises:
            UserFilePathError: the path is invalid or names a virtual root.
        """
        normalized = normalize_user_path(path, allow_root=False)
        if normalized in USER_FILE_ROOTS:
            # A bare root ("outputs") is a folder; mkdir/delete already refuse
            # it, and a file written there would shadow the folder itself.
            raise UserFilePathError("Cannot write a file over a virtual root folder")
        raw = content.encode("utf-8") if isinstance(content, str) else bytes(content)
        entry = await self.storage.write_file(normalized, raw, content_type=content_type)
        return entry.to_dict()

    async def download(self, path: str) -> UserFileContent:
        """Return the full content and metadata of the file at ``path``."""
        return await self.storage.read_file(normalize_user_path(path, allow_root=False))

    async def preview(self, path: str, *, max_bytes: int = MAX_PREVIEW_BYTES) -> dict[str, object]:
        """Return a preview dict built from at most ``max_bytes`` of the file.

        ``content`` is ``None`` when the data looks binary; otherwise it is
        the fetched bytes decoded as UTF-8 with undecodable bytes replaced.
        ``is_truncated`` is true when the file is larger than what was fetched.
        """
        content = await self.storage.read_file(
            normalize_user_path(path, allow_root=False),
            max_bytes=max_bytes,
        )
        is_binary = _is_probably_binary(content.content, content.content_type)
        text = None if is_binary else content.content.decode("utf-8", errors="replace")
        return UserFilePreview(
            name=content.name,
            path=content.path,
            size=content.size,
            content_type=content.content_type,
            modified=content.modified,
            is_binary=is_binary,
            is_truncated=content.size > len(content.content),
            content=text,
        ).to_dict()

    async def delete(self, path: str) -> bool:
        """Delete ``path``; return whatever the backend reports.

        Raises:
            UserFilePathError: the path is invalid or names a virtual root.
        """
        normalized = normalize_user_path(path, allow_root=False)
        if normalized in USER_FILE_ROOTS:
            raise UserFilePathError("Cannot delete virtual root folders")
        return await self.storage.delete_path(normalized)

    async def mkdir(self, path: str) -> dict[str, object]:
        """Create the directory at ``path`` and return its entry dict.

        Raises:
            UserFilePathError: the path is invalid or names a virtual root.
        """
        normalized = normalize_user_path(path, allow_root=False)
        if normalized in USER_FILE_ROOTS:
            raise UserFilePathError("Virtual root folders already exist")
        return (await self.storage.mkdir(normalized)).to_dict()


class LocalUserFileStorage:
    """Filesystem-backed storage adapter for tests and local development."""

    def __init__(self, root: Path) -> None:
        """Resolve ``root`` and create it together with every virtual root folder."""
        self.root = Path(root).expanduser().resolve()
        self.root.mkdir(parents=True, exist_ok=True)
        for name in USER_FILE_ROOTS:
            (self.root / name).mkdir(parents=True, exist_ok=True)

    async def list_dir(self, path: str) -> list[UserFileEntry]:
        """List ``path``: directories first, then files, case-insensitive by name.

        A missing directory is created (and therefore listed as empty).
        Dot-prefixed children are hidden.

        Raises:
            UserFilePathError: ``path`` exists but is not a directory.
        """
        target = self._path(path)
        if not target.exists():
            target.mkdir(parents=True, exist_ok=True)
        if not target.is_dir():
            raise UserFilePathError("Path is not a directory")
        children = sorted(target.iterdir(), key=lambda item: (not item.is_dir(), item.name.lower()))
        return [self._entry(child) for child in children if not child.name.startswith(".")]

    async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
        """Read the file at ``path``, returning at most ``max_bytes`` bytes.

        ``size`` in the result is always the full on-disk size, so callers can
        detect truncation by comparing it with ``len(content)``.

        Raises:
            UserFileNotFoundError: ``path`` is not an existing regular file.
        """
        target = self._path(path)
        if not target.is_file():
            raise UserFileNotFoundError("File not found")
        with target.open("rb") as handle:
            if max_bytes is None:
                selected = handle.read()
            else:
                # Read only the requested prefix instead of loading the whole
                # file and slicing. A non-positive limit yields no bytes.
                selected = handle.read(max(max_bytes, 0))
        stat = target.stat()
        content_type, _ = mimetypes.guess_type(target.name)
        return UserFileContent(
            name=target.name,
            path=self._relative(target),
            size=stat.st_size,
            content_type=content_type or "application/octet-stream",
            modified=_iso_from_timestamp(stat.st_mtime),
            content=selected,
        )

    async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
        """Write ``content`` to ``path``, creating parent directories as needed."""
        target = self._path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(content)
        return self._entry(target, content_type=content_type)

    async def write_file_stream(
        self,
        path: str,
        stream: object,
        *,
        content_type: str,
        max_bytes: int | None = None,
        part_size: int = 10 * 1024 * 1024,
    ) -> UserFileEntry:
        """Copy ``stream`` to ``path`` through a temp file in the same directory.

        The data is read in ``part_size`` chunks via ``stream.read`` and the
        temp file is renamed over the target only after the copy completed,
        so a failed upload never leaves a partial file at ``path``.

        Raises:
            UserFileSizeError: more than ``max_bytes`` bytes were read.
        """
        target = self._path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # The leading dot keeps the temp file out of list_dir results.
        fd, tmp_name = tempfile.mkstemp(prefix=f".{target.name}.", suffix=".tmp", dir=target.parent)
        tmp_path = Path(tmp_name)
        total = 0
        try:
            with open(fd, "wb", closefd=True) as output:
                while chunk := stream.read(part_size):  # type: ignore[attr-defined]
                    total += len(chunk)
                    if max_bytes is not None and total > max_bytes:
                        raise UserFileSizeError(_size_error(max_bytes))
                    output.write(chunk)
            tmp_path.replace(target)
        except BaseException:
            # BaseException so cancellation / KeyboardInterrupt also clean up
            # the temp file; the original error is always re-raised.
            with suppress(FileNotFoundError):
                tmp_path.unlink()
            raise
        return self._entry(target, content_type=content_type)

    async def delete_path(self, path: str) -> bool:
        """Delete a file or a whole directory tree; ``False`` if nothing exists."""
        target = self._path(path)
        if not target.exists():
            return False
        if target.is_dir():
            shutil.rmtree(target)
        else:
            target.unlink()
        return True

    async def mkdir(self, path: str) -> UserFileEntry:
        """Create the directory (and missing parents); existing ones are fine."""
        target = self._path(path)
        target.mkdir(parents=True, exist_ok=True)
        return self._entry(target)

    def _path(self, path: str) -> Path:
        """Map a virtual path to a resolved filesystem path inside ``self.root``.

        Raises:
            UserFilePathError: the path is invalid or resolves outside the root.
        """
        normalized = normalize_user_path(path, allow_root=False)
        target = (self.root / normalized).resolve()
        try:
            target.relative_to(self.root)
        except ValueError as exc:
            raise UserFilePathError("Path escapes user file root") from exc
        return target

    def _relative(self, path: Path) -> str:
        """Return ``path`` relative to the storage root, POSIX-style."""
        return path.relative_to(self.root).as_posix()

    def _entry(self, path: Path, *, content_type: str | None = None) -> UserFileEntry:
        """Build the listing entry for an existing file or directory.

        For files the content type is the explicit ``content_type``, else the
        type guessed from the name, else ``application/octet-stream``.
        Directories carry neither size nor content type.
        """
        stat = path.stat()
        is_dir = path.is_dir()
        guessed_type, _ = mimetypes.guess_type(path.name)
        return UserFileEntry(
            name=path.name,
            path=self._relative(path),
            type="directory" if is_dir else "file",
            size=None if is_dir else stat.st_size,
            content_type=None if is_dir else (content_type or guessed_type or "application/octet-stream"),
            modified=_iso_from_timestamp(stat.st_mtime),
        )


@dataclass(slots=True)
class MinIOStorageConfig:
    """Connection settings for ``MinIOUserFileStorage``.

    ``namespace`` is an optional key prefix placed in front of every object
    name; surrounding slashes are ignored.
    """

    endpoint: str
    access_key: str
    secret_key: str
    bucket: str
    secure: bool = False
    region: str | None = None
    namespace: str = ""


class MinIOUserFileStorage:
    """MinIO-backed user file storage adapter."""

    def __init__(self, config: MinIOStorageConfig) -> None:
        """Create the MinIO client.

        Raises:
            ValueError: endpoint, access key, secret key or bucket is empty.
        """
        if not config.endpoint or not config.access_key or not config.secret_key or not config.bucket:
            raise ValueError("MinIO storage requires endpoint, access key, secret key, and bucket")
        # Imported lazily so the module can be used without minio installed
        # as long as this backend is never constructed.
        from minio import Minio

        self.config = config
        self.client = Minio(
            endpoint=config.endpoint,
            access_key=config.access_key,
            secret_key=config.secret_key,
            secure=config.secure,
            region=config.region,
        )

    async def list_dir(self, path: str) -> list[UserFileEntry]:
        """List the direct children under ``path``, directories first.

        Skipped: the directory's own marker object and every dot-prefixed
        name (this covers the ``.keep`` placeholders written by ``mkdir`` and
        matches the hidden-file rule of the local backend).
        """
        normalized = normalize_user_path(path, allow_root=False)
        prefix = self._object_prefix(normalized)
        entries: list[UserFileEntry] = []
        for obj in self.client.list_objects(self.config.bucket, prefix=prefix, recursive=False):
            object_name = str(obj.object_name or "")
            trimmed = self._user_path(object_name).rstrip("/")
            if not trimmed or trimmed == normalized:
                # "<dir>/" marker of the listed directory itself, not a child.
                continue
            name = PurePosixPath(trimmed).name
            if name.startswith("."):
                continue
            is_dir = bool(getattr(obj, "is_dir", False)) or object_name.endswith("/")
            last_modified = getattr(obj, "last_modified", None)
            entries.append(
                UserFileEntry(
                    name=name,
                    path=trimmed,
                    type="directory" if is_dir else "file",
                    size=None if is_dir else getattr(obj, "size", None),
                    content_type=None if is_dir else "application/octet-stream",
                    modified=last_modified.isoformat() if last_modified else None,
                )
            )
        return sorted(entries, key=lambda item: (item.type != "directory", item.name.lower()))

    async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
        """Fetch the object at ``path``, returning at most ``max_bytes`` bytes.

        Raises:
            UserFileNotFoundError: any failure while stat-ing or fetching the
                object (the underlying error is chained as ``__cause__``).
        """
        object_name = self._object_name(path)
        bucket = self.config.bucket
        try:
            stat = self.client.stat_object(bucket, object_name)
            if stat.size == 0 or (max_bytes is not None and max_bytes <= 0):
                # Nothing to fetch. This also avoids two pitfalls of
                # get_object: length=0 means "whole object", and a ranged GET
                # on a zero-byte object is expected to be rejected by the
                # server (NOTE(review): confirm against the deployed MinIO).
                raw = b""
            else:
                if max_bytes is None:
                    response = self.client.get_object(bucket, object_name)
                else:
                    response = self.client.get_object(bucket, object_name, length=max_bytes)
                try:
                    raw = response.read()
                finally:
                    # Always hand the connection back, even if read() fails.
                    response.close()
                    response.release_conn()
        except Exception as exc:
            # Kept broad on purpose: callers rely on UserFileNotFoundError
            # for every failed read.
            raise UserFileNotFoundError("File not found") from exc
        return UserFileContent(
            name=PurePosixPath(path).name,
            path=path,
            size=int(stat.size or len(raw)),
            content_type=stat.content_type or "application/octet-stream",
            modified=stat.last_modified.isoformat() if stat.last_modified else None,
            content=raw,
        )

    async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
        """Upload ``content`` as a single object at ``path``."""
        object_name = self._object_name(path)
        self.client.put_object(
            self.config.bucket,
            object_name,
            BytesIO(content),
            length=len(content),
            content_type=content_type,
        )
        return UserFileEntry(
            name=PurePosixPath(path).name,
            path=path,
            type="file",
            size=len(content),
            content_type=content_type,
            modified=datetime.now(timezone.utc).isoformat(),
        )

    async def write_file_stream(
        self,
        path: str,
        stream: object,
        *,
        content_type: str,
        max_bytes: int | None = None,
        part_size: int = 10 * 1024 * 1024,
    ) -> UserFileEntry:
        """Upload ``stream`` to ``path`` with unknown length (``length=-1``).

        ``part_size`` is raised to at least 5 MiB before being passed on.

        Raises:
            UserFileSizeError: more than ``max_bytes`` bytes were read; a
                best-effort removal of the object is attempted first.
        """
        object_name = self._object_name(path)
        reader = _LimitedReadStream(stream, max_bytes=max_bytes)
        try:
            self.client.put_object(
                self.config.bucket,
                object_name,
                reader,
                length=-1,
                part_size=max(5 * 1024 * 1024, part_size),
                content_type=content_type,
            )
        except UserFileSizeError:
            # Best-effort cleanup: the size error is what the caller must see,
            # so a failing removal is deliberately ignored.
            with suppress(Exception):
                self.client.remove_object(self.config.bucket, object_name)
            raise
        return UserFileEntry(
            name=PurePosixPath(path).name,
            path=path,
            type="file",
            size=reader.bytes_read,
            content_type=content_type,
            modified=datetime.now(timezone.utc).isoformat(),
        )

    async def delete_path(self, path: str) -> bool:
        """Delete the object at ``path`` and every object below ``path/``.

        Returns:
            ``True`` if at least one existing object was removed, else ``False``.
        """
        object_name = self._object_name(path)
        bucket = self.config.bucket
        removed = False
        # Probe with stat_object first: removing a missing key does not fail,
        # so without the probe every call would claim a deletion. Errors stay
        # best-effort here; the prefix sweep below still runs.
        with suppress(Exception):
            self.client.stat_object(bucket, object_name)
            self.client.remove_object(bucket, object_name)
            removed = True
        prefix = f"{object_name.rstrip('/')}/"
        # Materialize the listing before deleting so the sweep does not mutate
        # the key space it is still paging through.
        children = [
            str(obj.object_name)
            for obj in self.client.list_objects(bucket, prefix=prefix, recursive=True)
        ]
        for child in children:
            self.client.remove_object(bucket, child)
            removed = True
        return removed

    async def mkdir(self, path: str) -> UserFileEntry:
        """Represent a directory by writing an empty ``<path>/.keep`` object."""
        object_name = f"{self._object_name(path).rstrip('/')}/.keep"
        self.client.put_object(
            self.config.bucket,
            object_name,
            BytesIO(b""),
            length=0,
            content_type="application/x-directory",
        )
        return UserFileEntry(
            name=PurePosixPath(path).name,
            path=path,
            type="directory",
            size=None,
            modified=datetime.now(timezone.utc).isoformat(),
        )

    def _namespace(self) -> str:
        """Return the configured namespace without surrounding slashes."""
        return self.config.namespace.strip("/")

    def _object_name(self, path: str) -> str:
        """Map a virtual path to its object key (namespace-prefixed if set).

        Raises:
            UserFilePathError: the path is invalid or the resulting key is
                absolute or contains a ``..`` segment.
        """
        normalized = normalize_user_path(path, allow_root=False)
        namespace = self._namespace()
        object_name = f"{namespace}/{normalized}" if namespace else normalized
        if object_name.startswith("/") or "/../" in f"/{object_name}/":
            raise UserFilePathError("Object path escapes namespace")
        return object_name

    def _object_prefix(self, path: str) -> str:
        """Return the key prefix (with trailing slash) for listing ``path``."""
        return f"{self._object_name(path).rstrip('/')}/"

    def _user_path(self, object_name: str) -> str:
        """Strip the namespace prefix from an object key.

        Raises:
            UserFilePathError: a namespace is set and the key is outside it.
        """
        namespace = self._namespace()
        if not namespace:
            return object_name
        prefix = f"{namespace}/"
        if not object_name.startswith(prefix):
            raise UserFilePathError("Object path escapes namespace")
        return object_name[len(prefix):]


def normalize_user_path(path: str | None, *, allow_root: bool) -> str:
    """Validate a virtual path and return it in canonical ``a/b/c`` form.

    Backslashes are treated as separators, surrounding whitespace and slashes
    are dropped, and ``.`` segments are removed.

    Args:
        path: Raw path from a user or agent; ``None`` counts as empty.
        allow_root: Whether the empty path (the virtual root) is acceptable;
            if so it is returned as ``""``.

    Raises:
        UserFilePathError: the path contains a NUL byte, is absolute, is empty
            while ``allow_root`` is false, contains ``..``, contains a
            dot-prefixed segment, or does not start with one of
            ``USER_FILE_ROOTS``.
    """
    original = (path or "").replace("\\", "/").strip()
    if "\x00" in original:
        # Path.resolve() rejects NUL with a bare ValueError; report it as a
        # policy violation instead so callers see one error family.
        raise UserFilePathError("Path contains invalid characters")
    if original.startswith("/"):
        raise UserFilePathError("Absolute paths are not allowed")
    raw = original.strip("/")
    if raw == "":
        if allow_root:
            return ""
        raise UserFilePathError("Path is required")
    # raw has no leading slash any more, so it cannot be absolute here.
    parts = [part for part in PurePosixPath(raw).parts if part not in ("", ".")]
    if any(part == ".." for part in parts):
        raise UserFilePathError("Parent-directory traversal is not allowed")
    if any(part.startswith(".") for part in parts):
        raise UserFilePathError("Hidden implementation paths are not allowed")
    if not parts or parts[0] not in USER_FILE_ROOTS:
        raise UserFilePathError("Path must be under uploads, outputs, shared, or tasks")
    return "/".join(parts)


def is_safe_filename(filename: str) -> bool:
    """Return whether ``filename`` is non-empty, separator-free and not dot-prefixed."""
    return bool(filename) and "/" not in filename and "\\" not in filename and not filename.startswith(".")


def _join_user_path(directory: str, filename: str) -> str:
    """Append ``filename`` to the normalized ``directory``."""
    normalized_dir = normalize_user_path(directory, allow_root=False)
    return f"{normalized_dir.rstrip('/')}/{filename}"


def _is_probably_binary(raw: bytes, content_type: str) -> bool:
    """Guess whether ``raw`` is binary data.

    Textual content types are trusted outright. Otherwise the first 4096
    bytes are inspected: a NUL byte or invalid UTF-8 means binary. Empty
    data counts as text.
    """
    import codecs

    if content_type.startswith("text/") or content_type in {
        "application/json",
        "application/javascript",
        "application/xml",
        "application/x-yaml",
    }:
        return False
    sample = raw[:4096]
    if not sample:
        return False
    if b"\x00" in sample:
        return True
    # Decode incrementally with final=False so a multi-byte character that is
    # merely cut off at the end of the sample is not mistaken for invalid
    # UTF-8; genuinely malformed bytes still raise.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(sample, final=False)
    except UnicodeDecodeError:
        return True
    return False


def _iso_from_timestamp(value: float) -> str:
    """Format a POSIX timestamp as an ISO-8601 string in UTC."""
    return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()


def _safe_scope(value: str | None) -> str:
    """Reduce ``value`` to letters, digits, ``-`` and ``_``.

    Every other character becomes ``-``; leading and trailing ``-``/``_`` are
    trimmed. An empty result (or empty input) yields ``"interactive"``.
    """
    raw = (value or "interactive").strip()
    cleaned = "".join(char if char.isalnum() or char in ("-", "_") else "-" for char in raw)
    return cleaned.strip("-_") or "interactive"


class _LimitedReadStream:
    """Wrap a readable stream, counting bytes and enforcing an optional cap."""

    def __init__(self, stream: object, *, max_bytes: int | None = None) -> None:
        self.stream = stream
        self.max_bytes = max_bytes
        self.bytes_read = 0

    def read(self, size: int = -1) -> bytes:
        """Read from the wrapped stream; ``b""`` signals end of data.

        Raises:
            UserFileSizeError: the running total exceeded ``max_bytes``.
        """
        chunk = self.stream.read(size)  # type: ignore[attr-defined]
        if not chunk:
            return b""
        self.bytes_read += len(chunk)
        if self.max_bytes is not None and self.bytes_read > self.max_bytes:
            raise UserFileSizeError(_size_error(self.max_bytes))
        return chunk


def _size_error(max_bytes: int) -> str:
    """Build the user-facing message for an upload over ``max_bytes``."""
    return f"File too large (max {_human_size(max_bytes)})"


def _human_size(size: int) -> str:
    """Format a byte count with a 1024-based unit, capped at TB.

    Plain bytes are shown without decimals, larger units with one decimal.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(size)
    index = 0
    # Step up one unit at a time, stopping at the largest known unit.
    while value >= 1024 and index < len(units) - 1:
        value /= 1024
        index += 1
    unit = units[index]
    return f"{value:.0f}{unit}" if unit == "B" else f"{value:.1f}{unit}"