631 lines
22 KiB
Python
631 lines
22 KiB
Python
"""User-visible file system service.
|
|
|
|
This module owns the personal file-system boundary exposed to users and
|
|
agents. Storage backends can change, but callers see only virtual paths under
|
|
fixed roots.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from contextlib import suppress
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from io import BytesIO
|
|
import mimetypes
|
|
from pathlib import Path, PurePosixPath
|
|
import shutil
|
|
import tempfile
|
|
from typing import Protocol
|
|
|
|
|
|
# Virtual top-level folders; normalize_user_path rejects any path whose first
# segment is not one of these.
USER_FILE_ROOTS = ("uploads", "outputs", "shared", "tasks")

# Default cap on the number of bytes fetched for a preview (1 MiB).
MAX_PREVIEW_BYTES = 1 << 20

# Message raised when an agent tries to write under uploads/.
AGENT_UPLOADS_ERROR = (
    "uploads/ is user-provided input storage; "
    "agents may read it but must not write it"
)

# Message raised when an agent tries to delete anything.
AGENT_DELETE_ERROR = (
    "agents cannot delete user-visible files; "
    "use the Files page or user-side APIs"
)
|
|
|
|
|
|
class UserFileError(ValueError):
    """Common base class for every error raised by user file operations."""
|
|
|
|
|
|
class UserFilePathError(UserFileError):
    """Signals that a path breaks the virtual path policy."""
|
|
|
|
|
|
class UserFileNotFoundError(UserFileError):
    """Signals that the requested user file path does not exist."""
|
|
|
|
|
|
class UserFileSizeError(UserFileError):
    """Signals that an upload is larger than the configured limit."""
|
|
|
|
|
|
@dataclass(frozen=True, slots=True)
|
|
class AgentUserFilePolicy:
|
|
task_id: str | None = None
|
|
fallback_scope: str = "interactive"
|
|
|
|
@property
|
|
def task_namespace(self) -> str:
|
|
if self.task_id:
|
|
return f"tasks/{self.task_id}"
|
|
scope = _safe_scope(self.fallback_scope)
|
|
return f"tasks/interactive/{scope}"
|
|
|
|
def validate_read(self, path: str) -> str:
|
|
return normalize_user_path(path, allow_root=False)
|
|
|
|
def validate_write(self, path: str) -> str:
|
|
normalized = normalize_user_path(path, allow_root=False)
|
|
root = normalized.split("/", 1)[0]
|
|
if root == "uploads":
|
|
raise UserFilePathError(AGENT_UPLOADS_ERROR)
|
|
if root == "tasks":
|
|
self._validate_task_namespace(normalized)
|
|
return normalized
|
|
|
|
def validate_mkdir(self, path: str) -> str:
|
|
return self.validate_write(path)
|
|
|
|
def validate_delete(self, path: str) -> str:
|
|
normalize_user_path(path, allow_root=False)
|
|
raise UserFilePathError(AGENT_DELETE_ERROR)
|
|
|
|
def _validate_task_namespace(self, normalized: str) -> None:
|
|
namespace = self.task_namespace
|
|
if normalized == "tasks" or not normalized.startswith(f"{namespace}/"):
|
|
raise UserFilePathError(f"Agent task files must be written under {namespace}/")
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class UserFileEntry:
|
|
name: str
|
|
path: str
|
|
type: str
|
|
size: int | None = None
|
|
content_type: str | None = None
|
|
modified: str | None = None
|
|
|
|
def to_dict(self) -> dict[str, object]:
|
|
return {
|
|
"name": self.name,
|
|
"path": self.path,
|
|
"type": self.type,
|
|
"size": self.size,
|
|
"content_type": self.content_type,
|
|
"modified": self.modified,
|
|
}
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class UserFileContent:
|
|
name: str
|
|
path: str
|
|
size: int
|
|
content_type: str
|
|
modified: str | None
|
|
content: bytes
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class UserFilePreview:
|
|
name: str
|
|
path: str
|
|
size: int
|
|
content_type: str
|
|
modified: str | None
|
|
is_binary: bool
|
|
is_truncated: bool
|
|
content: str | None
|
|
|
|
def to_dict(self) -> dict[str, object]:
|
|
return {
|
|
"name": self.name,
|
|
"path": self.path,
|
|
"size": self.size,
|
|
"content_type": self.content_type,
|
|
"modified": self.modified,
|
|
"is_binary": self.is_binary,
|
|
"is_truncated": self.is_truncated,
|
|
"content": self.content,
|
|
}
|
|
|
|
|
|
class UserFileStorage(Protocol):
    """Structural interface every storage backend must provide."""

    async def list_dir(self, path: str) -> list[UserFileEntry]:
        """Return the entries found under the directory *path*."""

    async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
        """Return the file at *path*, with content capped at *max_bytes* if given."""

    async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
        """Store *content* at *path* and return the resulting entry."""

    async def write_file_stream(
        self,
        path: str,
        stream: object,
        *,
        content_type: str,
        max_bytes: int | None = None,
        part_size: int = 10 * 1024 * 1024,
    ) -> UserFileEntry:
        """Store the bytes read from *stream* at *path* and return the entry."""

    async def delete_path(self, path: str) -> bool:
        """Delete *path* and report whether anything was removed."""

    async def mkdir(self, path: str) -> UserFileEntry:
        """Create the directory *path* and return its entry."""
|
|
|
|
|
|
class UserFileService:
|
|
def __init__(self, storage: UserFileStorage) -> None:
|
|
self.storage = storage
|
|
|
|
async def browse(self, path: str = "") -> dict[str, object]:
|
|
normalized = normalize_user_path(path, allow_root=True)
|
|
if normalized == "":
|
|
return {
|
|
"path": "",
|
|
"items": [
|
|
UserFileEntry(name=root, path=root, type="directory").to_dict()
|
|
for root in USER_FILE_ROOTS
|
|
],
|
|
}
|
|
entries = await self.storage.list_dir(normalized)
|
|
return {"path": normalized, "items": [entry.to_dict() for entry in entries]}
|
|
|
|
async def upload(self, directory: str, filename: str, content: bytes, *, content_type: str) -> dict[str, object]:
|
|
if not is_safe_filename(filename):
|
|
raise UserFilePathError("Invalid filename")
|
|
target = normalize_user_path(_join_user_path(directory, filename), allow_root=False)
|
|
return (await self.storage.write_file(target, content, content_type=content_type)).to_dict()
|
|
|
|
async def upload_stream(
|
|
self,
|
|
directory: str,
|
|
filename: str,
|
|
stream: object,
|
|
*,
|
|
content_type: str,
|
|
max_bytes: int | None = None,
|
|
part_size: int = 10 * 1024 * 1024,
|
|
) -> dict[str, object]:
|
|
if not is_safe_filename(filename):
|
|
raise UserFilePathError("Invalid filename")
|
|
target = normalize_user_path(_join_user_path(directory, filename), allow_root=False)
|
|
return (
|
|
await self.storage.write_file_stream(
|
|
target,
|
|
stream,
|
|
content_type=content_type,
|
|
max_bytes=max_bytes,
|
|
part_size=part_size,
|
|
)
|
|
).to_dict()
|
|
|
|
async def write_file(self, path: str, content: bytes | str, *, content_type: str = "text/plain") -> dict[str, object]:
|
|
normalized = normalize_user_path(path, allow_root=False)
|
|
raw = content.encode("utf-8") if isinstance(content, str) else bytes(content)
|
|
return (await self.storage.write_file(normalized, raw, content_type=content_type)).to_dict()
|
|
|
|
async def download(self, path: str) -> UserFileContent:
|
|
return await self.storage.read_file(normalize_user_path(path, allow_root=False))
|
|
|
|
async def preview(self, path: str, *, max_bytes: int = MAX_PREVIEW_BYTES) -> dict[str, object]:
|
|
content = await self.storage.read_file(normalize_user_path(path, allow_root=False), max_bytes=max_bytes)
|
|
is_binary = _is_probably_binary(content.content, content.content_type)
|
|
text = None if is_binary else content.content.decode("utf-8", errors="replace")
|
|
return UserFilePreview(
|
|
name=content.name,
|
|
path=content.path,
|
|
size=content.size,
|
|
content_type=content.content_type,
|
|
modified=content.modified,
|
|
is_binary=is_binary,
|
|
is_truncated=content.size > len(content.content),
|
|
content=text,
|
|
).to_dict()
|
|
|
|
async def delete(self, path: str) -> bool:
|
|
normalized = normalize_user_path(path, allow_root=False)
|
|
if normalized in USER_FILE_ROOTS:
|
|
raise UserFilePathError("Cannot delete virtual root folders")
|
|
return await self.storage.delete_path(normalized)
|
|
|
|
async def mkdir(self, path: str) -> dict[str, object]:
|
|
normalized = normalize_user_path(path, allow_root=False)
|
|
if normalized in USER_FILE_ROOTS:
|
|
raise UserFilePathError("Virtual root folders already exist")
|
|
return (await self.storage.mkdir(normalized)).to_dict()
|
|
|
|
|
|
class LocalUserFileStorage:
    """Filesystem-backed storage adapter for tests and local development.

    Every virtual path is resolved beneath ``root``; paths that resolve
    outside it are rejected.
    """

    def __init__(self, root: Path) -> None:
        """Resolve *root* and make sure it and every virtual root folder exist."""
        self.root = Path(root).expanduser().resolve()
        self.root.mkdir(parents=True, exist_ok=True)
        for name in USER_FILE_ROOTS:
            (self.root / name).mkdir(parents=True, exist_ok=True)

    async def list_dir(self, path: str) -> list[UserFileEntry]:
        """Return the visible children of *path*, directories first, then by name.

        A missing directory is created instead of being reported as an error.
        Names starting with ``.`` are skipped.

        Raises:
            UserFilePathError: if *path* exists but is not a directory.
        """
        target = self._path(path)
        if not target.exists():
            target.mkdir(parents=True, exist_ok=True)
        if not target.is_dir():
            raise UserFilePathError("Path is not a directory")
        visible = (child for child in target.iterdir() if not child.name.startswith("."))
        ordered = sorted(visible, key=lambda item: (not item.is_dir(), item.name.lower()))
        return [self._entry(child) for child in ordered]

    async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
        """Read the file at *path*, returning at most *max_bytes* bytes of content.

        ``size`` in the result is always the full on-disk size. A non-positive
        *max_bytes* yields empty content.

        Raises:
            UserFileNotFoundError: if *path* is not an existing regular file.
        """
        target = self._path(path)
        if not target.is_file():
            raise UserFileNotFoundError("File not found")
        # Read only what was asked for, so a capped read of a very large file
        # does not pull the whole file into memory.
        with target.open("rb") as handle:
            if max_bytes is None:
                selected = handle.read()
            else:
                selected = handle.read(max(max_bytes, 0))
        stat = target.stat()
        content_type, _ = mimetypes.guess_type(target.name)
        return UserFileContent(
            name=target.name,
            path=self._relative(target),
            size=stat.st_size,
            content_type=content_type or "application/octet-stream",
            modified=_iso_from_timestamp(stat.st_mtime),
            content=selected,
        )

    async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
        """Write *content* to *path*, creating parent directories as needed."""
        target = self._path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(content)
        return self._entry(target, content_type=content_type)

    async def write_file_stream(
        self,
        path: str,
        stream: object,
        *,
        content_type: str,
        max_bytes: int | None = None,
        part_size: int = 10 * 1024 * 1024,
    ) -> UserFileEntry:
        """Copy *stream* to *path* through a temporary file in the same directory.

        The temporary file replaces the target only after the whole stream was
        written, so a failed or oversized upload leaves the old file untouched.

        Raises:
            UserFileSizeError: if more than *max_bytes* bytes are read.
        """
        target = self._path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # The leading dot keeps the temp file out of list_dir results.
        fd, tmp_name = tempfile.mkstemp(prefix=f".{target.name}.", suffix=".tmp", dir=target.parent)
        tmp_path = Path(tmp_name)
        total = 0
        try:
            with open(fd, "wb", closefd=True) as output:
                while chunk := stream.read(part_size):  # type: ignore[attr-defined]
                    total += len(chunk)
                    if max_bytes is not None and total > max_bytes:
                        raise UserFileSizeError(_size_error(max_bytes))
                    output.write(chunk)
            tmp_path.replace(target)
        except BaseException:
            # Clean up on interruption too, and never let a failing unlink
            # hide the error that brought us here.
            with suppress(OSError):
                tmp_path.unlink()
            raise
        return self._entry(target, content_type=content_type)

    async def delete_path(self, path: str) -> bool:
        """Delete the file or directory tree at *path*.

        Returns:
            False when nothing exists at *path*, True after a removal.
        """
        target = self._path(path)
        if not target.exists():
            return False
        if target.is_dir():
            shutil.rmtree(target)
        else:
            target.unlink()
        return True

    async def mkdir(self, path: str) -> UserFileEntry:
        """Create *path* (and missing parents) and return its entry."""
        target = self._path(path)
        target.mkdir(parents=True, exist_ok=True)
        return self._entry(target)

    def _path(self, path: str) -> Path:
        """Map a virtual path to a resolved filesystem path under ``root``.

        Raises:
            UserFilePathError: if the path is invalid or resolves outside ``root``.
        """
        normalized = normalize_user_path(path, allow_root=False)
        target = (self.root / normalized).resolve()
        try:
            target.relative_to(self.root)
        except ValueError as exc:
            raise UserFilePathError("Path escapes user file root") from exc
        return target

    def _relative(self, path: Path) -> str:
        """Return *path* relative to ``root`` using forward slashes."""
        return path.relative_to(self.root).as_posix()

    def _entry(self, path: Path, *, content_type: str | None = None) -> UserFileEntry:
        """Build the listing entry for an existing *path*.

        For files, an explicit *content_type* wins over the type guessed from
        the name; directories carry neither size nor content type.
        """
        stat = path.stat()
        modified = _iso_from_timestamp(stat.st_mtime)
        if path.is_dir():
            return UserFileEntry(
                name=path.name,
                path=self._relative(path),
                type="directory",
                size=None,
                content_type=None,
                modified=modified,
            )
        guessed_type, _ = mimetypes.guess_type(path.name)
        return UserFileEntry(
            name=path.name,
            path=self._relative(path),
            type="file",
            size=stat.st_size,
            content_type=content_type or guessed_type or "application/octet-stream",
            modified=modified,
        )
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class MinIOStorageConfig:
|
|
endpoint: str
|
|
access_key: str
|
|
secret_key: str
|
|
bucket: str
|
|
secure: bool = False
|
|
region: str | None = None
|
|
namespace: str = ""
|
|
|
|
|
|
class MinIOUserFileStorage:
    """MinIO-backed user file storage adapter.

    Virtual user paths become object names, optionally prefixed with
    ``config.namespace``. A directory is represented by a zero-byte
    ``<dir>/.keep`` object, which listings hide.
    """

    def __init__(self, config: MinIOStorageConfig) -> None:
        """Create the MinIO client from *config*.

        Raises:
            ValueError: if endpoint, access key, secret key or bucket is empty.
        """
        if not config.endpoint or not config.access_key or not config.secret_key or not config.bucket:
            raise ValueError("MinIO storage requires endpoint, access key, secret key, and bucket")
        # Imported lazily so the module can be loaded without minio installed.
        from minio import Minio

        self.config = config
        self.client = Minio(
            endpoint=config.endpoint,
            access_key=config.access_key,
            secret_key=config.secret_key,
            secure=config.secure,
            region=config.region,
        )

    async def list_dir(self, path: str) -> list[UserFileEntry]:
        """Return the direct children of *path*, directories first, then by name.

        File entries always report ``application/octet-stream`` because the
        listing does not carry a content type.
        """
        prefix = self._object_prefix(path)
        objects = self.client.list_objects(self.config.bucket, prefix=prefix, recursive=False)
        entries: list[UserFileEntry] = []
        for obj in objects:
            object_name = str(obj.object_name or "")
            user_path = self._user_path(object_name)
            # Skip the directory itself and its placeholder object.
            if not user_path or user_path == path or user_path.endswith("/.keep"):
                continue
            trimmed = user_path.rstrip("/")
            is_dir = bool(getattr(obj, "is_dir", False)) or object_name.endswith("/")
            last_modified = getattr(obj, "last_modified", None)
            entries.append(
                UserFileEntry(
                    name=PurePosixPath(trimmed).name,
                    path=trimmed,
                    type="directory" if is_dir else "file",
                    size=None if is_dir else getattr(obj, "size", None),
                    content_type=None if is_dir else "application/octet-stream",
                    modified=last_modified.isoformat() if last_modified else None,
                )
            )
        return sorted(entries, key=lambda item: (item.type != "directory", item.name.lower()))

    async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
        """Fetch the object for *path*, returning at most *max_bytes* bytes.

        A non-positive *max_bytes* yields empty content without a download.

        Raises:
            UserFileNotFoundError: if the stat or the download fails for any
                reason (the underlying error is chained).
        """
        object_name = self._object_name(path)
        bucket = self.config.bucket
        try:
            stat = self.client.stat_object(bucket, object_name)
            if max_bytes is not None and max_bytes <= 0:
                # NOTE(review): the client appears to treat length=0 as "no
                # range", which would download the whole object - confirm
                # against the installed minio version.
                raw = b""
            else:
                if max_bytes is None:
                    response = self.client.get_object(bucket, object_name)
                else:
                    response = self.client.get_object(bucket, object_name, length=max_bytes)
                try:
                    raw = response.read()
                finally:
                    # Release the connection even when the read fails.
                    response.close()
                    response.release_conn()
        except Exception as exc:
            raise UserFileNotFoundError("File not found") from exc
        return UserFileContent(
            name=PurePosixPath(path).name,
            path=path,
            size=int(stat.size or len(raw)),
            content_type=stat.content_type or "application/octet-stream",
            modified=stat.last_modified.isoformat() if stat.last_modified else None,
            content=raw,
        )

    async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
        """Upload *content* as the object for *path* and return its entry."""
        object_name = self._object_name(path)
        self.client.put_object(
            self.config.bucket,
            object_name,
            BytesIO(content),
            length=len(content),
            content_type=content_type,
        )
        return UserFileEntry(
            name=PurePosixPath(path).name,
            path=path,
            type="file",
            size=len(content),
            content_type=content_type,
            modified=datetime.now(timezone.utc).isoformat(),
        )

    async def write_file_stream(
        self,
        path: str,
        stream: object,
        *,
        content_type: str,
        max_bytes: int | None = None,
        part_size: int = 10 * 1024 * 1024,
    ) -> UserFileEntry:
        """Upload *stream* as the object for *path* without knowing its length.

        *part_size* is raised to at least 5 MiB before being passed on.

        Raises:
            UserFileSizeError: if more than *max_bytes* bytes are read.
        """
        object_name = self._object_name(path)
        reader = _LimitedReadStream(stream, max_bytes=max_bytes)
        try:
            self.client.put_object(
                self.config.bucket,
                object_name,
                reader,
                length=-1,
                part_size=max(5 * 1024 * 1024, part_size),
                content_type=content_type,
            )
        except UserFileSizeError:
            # Best-effort removal of whatever was stored under this name.
            # NOTE(review): this also removes an object that existed before
            # the failed upload - confirm that is intended.
            with suppress(Exception):
                self.client.remove_object(self.config.bucket, object_name)
            raise
        return UserFileEntry(
            name=PurePosixPath(path).name,
            path=path,
            type="file",
            size=reader.bytes_read,
            content_type=content_type,
            modified=datetime.now(timezone.utc).isoformat(),
        )

    async def delete_path(self, path: str) -> bool:
        """Delete the object for *path* and every object beneath it.

        Returns:
            True only if at least one existing object was removed.
        """
        from minio.error import S3Error

        object_name = self._object_name(path)
        bucket = self.config.bucket
        removed = False
        try:
            # Stat first: S3-style deletes succeed for keys that do not exist,
            # so the delete call alone cannot tell whether anything was removed.
            self.client.stat_object(bucket, object_name)
            self.client.remove_object(bucket, object_name)
            removed = True
        except S3Error:
            # No plain object under this name (or it could not be removed);
            # there may still be objects under the directory prefix.
            pass
        prefix = f"{object_name.rstrip('/')}/"
        for obj in self.client.list_objects(bucket, prefix=prefix, recursive=True):
            self.client.remove_object(bucket, str(obj.object_name))
            removed = True
        return removed

    async def mkdir(self, path: str) -> UserFileEntry:
        """Create the ``.keep`` placeholder that represents directory *path*."""
        object_name = f"{self._object_name(path).rstrip('/')}/.keep"
        self.client.put_object(
            self.config.bucket,
            object_name,
            BytesIO(b""),
            length=0,
            content_type="application/x-directory",
        )
        return UserFileEntry(
            name=PurePosixPath(path).name,
            path=path,
            type="directory",
            size=None,
            modified=datetime.now(timezone.utc).isoformat(),
        )

    def _namespace(self) -> str:
        """Return the configured namespace without leading or trailing slashes."""
        return self.config.namespace.strip("/")

    def _object_name(self, path: str) -> str:
        """Return the object name for the virtual *path*.

        Raises:
            UserFilePathError: if *path* is invalid or the resulting name
                would leave the namespace.
        """
        normalized = normalize_user_path(path, allow_root=False)
        namespace = self._namespace()
        object_name = f"{namespace}/{normalized}" if namespace else normalized
        # Guards against a namespace value that itself contains "..".
        if object_name.startswith("/") or "/../" in f"/{object_name}/":
            raise UserFilePathError("Object path escapes namespace")
        return object_name

    def _object_prefix(self, path: str) -> str:
        """Return the listing prefix (object name plus trailing slash) for *path*."""
        return f"{self._object_name(path).rstrip('/')}/"

    def _user_path(self, object_name: str) -> str:
        """Strip the namespace prefix from *object_name*.

        Raises:
            UserFilePathError: if a namespace is configured and *object_name*
                does not start with it.
        """
        namespace = self._namespace()
        if not namespace:
            return object_name
        prefix = f"{namespace}/"
        if not object_name.startswith(prefix):
            raise UserFilePathError("Object path escapes namespace")
        return object_name[len(prefix):]
|
|
|
|
|
|
def normalize_user_path(path: str | None, *, allow_root: bool) -> str:
    """Validate a user-supplied virtual path and return its canonical form.

    Backslashes are treated as separators, surrounding whitespace and
    trailing slashes are dropped, and ``.`` segments are removed.

    Args:
        path: Raw path; ``None`` is treated like the empty string.
        allow_root: Whether the empty path (the virtual root) is acceptable.

    Returns:
        The ``/``-joined normalized path, or ``""`` for the root when allowed.

    Raises:
        UserFilePathError: for absolute paths, an empty path when the root is
            not allowed, NUL characters, ``..`` segments, segments starting
            with ``.``, or a first segment outside ``USER_FILE_ROOTS``.
    """
    original = (path or "").replace("\\", "/").strip()
    if original.startswith("/"):
        raise UserFilePathError("Absolute paths are not allowed")
    raw = original.strip("/")
    if raw == "":
        if allow_root:
            return ""
        raise UserFilePathError("Path is required")
    # A NUL can never be part of a real file name; refuse it here so it is
    # reported as a policy error rather than a bare ValueError from the OS layer.
    if "\x00" in raw:
        raise UserFilePathError("Path contains invalid characters")
    # raw has no leading slash at this point, so it cannot be absolute.
    parts = [part for part in PurePosixPath(raw).parts if part not in ("", ".")]
    if ".." in parts:
        raise UserFilePathError("Parent-directory traversal is not allowed")
    if any(part.startswith(".") for part in parts):
        raise UserFilePathError("Hidden implementation paths are not allowed")
    if not parts or parts[0] not in USER_FILE_ROOTS:
        raise UserFilePathError("Path must be under uploads, outputs, shared, or tasks")
    return "/".join(parts)
|
|
|
|
|
|
def is_safe_filename(filename: str) -> bool:
    """Return True when *filename* is acceptable as a single path segment.

    Rejected: empty or whitespace-only names, names starting with ``.``, and
    names containing a slash, a backslash or a NUL character.
    """
    # A whitespace-only name would be stripped away during path normalization,
    # leaving the upload aimed at the directory itself.
    if not filename or not filename.strip():
        return False
    if filename.startswith("."):
        return False
    return not any(bad in filename for bad in ("/", "\\", "\x00"))
|
|
|
|
|
|
def _join_user_path(directory: str, filename: str) -> str:
    """Return *filename* appended to the normalized form of *directory*.

    The directory is validated; the filename is appended as given.
    """
    base = normalize_user_path(directory, allow_root=False).rstrip("/")
    return f"{base}/{filename}"
|
|
|
|
|
|
def _is_probably_binary(raw: bytes, content_type: str) -> bool:
|
|
if content_type.startswith("text/") or content_type in {
|
|
"application/json",
|
|
"application/javascript",
|
|
"application/xml",
|
|
"application/x-yaml",
|
|
}:
|
|
return False
|
|
if not raw:
|
|
return False
|
|
if b"\x00" in raw[:4096]:
|
|
return True
|
|
try:
|
|
raw[:4096].decode("utf-8")
|
|
except UnicodeDecodeError:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _iso_from_timestamp(value: float) -> str:
|
|
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
|
|
|
|
|
|
def _safe_scope(value: str | None) -> str:
|
|
raw = (value or "interactive").strip()
|
|
allowed = [char if char.isalnum() or char in ("-", "_") else "-" for char in raw]
|
|
cleaned = "".join(allowed).strip("-_")
|
|
return cleaned or "interactive"
|
|
|
|
|
|
class _LimitedReadStream:
|
|
def __init__(self, stream: object, *, max_bytes: int | None = None) -> None:
|
|
self.stream = stream
|
|
self.max_bytes = max_bytes
|
|
self.bytes_read = 0
|
|
|
|
def read(self, size: int = -1) -> bytes:
|
|
chunk = self.stream.read(size) # type: ignore[attr-defined]
|
|
if not chunk:
|
|
return b""
|
|
self.bytes_read += len(chunk)
|
|
if self.max_bytes is not None and self.bytes_read > self.max_bytes:
|
|
raise UserFileSizeError(_size_error(self.max_bytes))
|
|
return chunk
|
|
|
|
|
|
def _size_error(max_bytes: int) -> str:
    """Build the message used when an upload exceeds *max_bytes*."""
    limit = _human_size(max_bytes)
    return f"File too large (max {limit})"
|
|
|
|
|
|
def _human_size(size: int) -> str:
|
|
units = ("B", "KB", "MB", "GB", "TB")
|
|
value = float(size)
|
|
for unit in units:
|
|
if value < 1024 or unit == units[-1]:
|
|
return f"{value:.0f}{unit}" if unit == "B" else f"{value:.1f}{unit}"
|
|
value /= 1024
|
|
return f"{size}B"
|