merge: personal user filesystem minio integration

This commit is contained in:
2026-06-03 16:32:29 +08:00
56 changed files with 4780 additions and 115 deletions

View File

@ -0,0 +1,201 @@
"""Resolve the user-visible file system for web and agent callers."""
from __future__ import annotations
from dataclasses import dataclass, field
import os
from pathlib import Path
from typing import Any
import httpx
from beaver.foundation.config.schema import BeaverConfig
from .user_files import (
LocalUserFileStorage,
MinIOStorageConfig,
MinIOUserFileStorage,
USER_FILE_ROOTS,
UserFileError,
UserFileService,
)
class UserFileConfigurationError(UserFileError):
"""Raised when user file storage is not configured for this backend."""
@dataclass(slots=True)
class FileAuthContext:
"""Authenticated identity used by the personal file system boundary."""
username: str
backend_id: str
storage_namespace: str
user_id: str | None = None
scopes: tuple[str, ...] = field(default_factory=tuple)
auth_source: str = "beaver-web-token"
@dataclass(slots=True)
class UserFileStorageStatus:
configured: bool
storage_mode: str
roots: list[str]
workspace_visible: bool = False
detail: str | None = None
def to_dict(self) -> dict[str, Any]:
payload: dict[str, Any] = {
"configured": self.configured,
"storage_mode": self.storage_mode,
"roots": self.roots,
"workspace_visible": self.workspace_visible,
}
if self.detail:
payload["detail"] = self.detail
return payload
class UserFileStorageResolver:
"""Build `UserFileService` from the current Beaver identity and config."""
def __init__(
self,
*,
config: BeaverConfig,
workspace: Path,
auth_context: FileAuthContext,
) -> None:
self.config = config
self.workspace = Path(workspace)
self.auth_context = auth_context
async def service(self) -> UserFileService:
mode = _storage_mode(self.config)
if mode == "local":
return UserFileService(LocalUserFileStorage(self.workspace / "user_files"))
settings = await self._load_minio_settings()
return UserFileService(
MinIOUserFileStorage(
MinIOStorageConfig(
endpoint=str(settings.get("endpoint") or ""),
access_key=str(settings.get("access_key") or ""),
secret_key=str(settings.get("secret_key") or ""),
bucket=str(settings.get("bucket") or ""),
secure=bool(settings.get("secure", False)),
region=_clean_optional(settings.get("region")),
namespace=str(settings.get("namespace") or self.auth_context.storage_namespace),
)
)
)
async def status(self) -> UserFileStorageStatus:
mode = _storage_mode(self.config)
if mode == "local":
return UserFileStorageStatus(
configured=True,
storage_mode="local",
roots=list(USER_FILE_ROOTS),
workspace_visible=False,
)
try:
await self._load_minio_settings()
except UserFileConfigurationError as exc:
return UserFileStorageStatus(
configured=False,
storage_mode="object",
roots=list(USER_FILE_ROOTS),
workspace_visible=False,
detail=str(exc),
)
return UserFileStorageStatus(
configured=True,
storage_mode="object",
roots=list(USER_FILE_ROOTS),
workspace_visible=False,
)
async def _load_minio_settings(self) -> dict[str, Any]:
backend_id = self.auth_context.backend_id.strip()
if not backend_id:
raise UserFileConfigurationError("User file storage backend identity is not configured")
base_url = self.config.authz.base_url.strip()
if not (self.config.authz.enabled and base_url):
raise UserFileConfigurationError("AuthZ is required for deployed user file storage")
token = (
os.getenv("BEAVER_AUTHZ_INTERNAL_TOKEN", "").strip()
or os.getenv("AUTHZ_INTERNAL_TOKEN", "").strip()
)
if not token:
raise UserFileConfigurationError("AuthZ internal token is not configured for user file storage")
try:
async with httpx.AsyncClient(
timeout=self.config.authz.request_timeout_seconds,
follow_redirects=True,
trust_env=False,
) as client:
response = await client.get(
f"{base_url.rstrip('/')}/internal/backends/{backend_id}/settings/minio",
headers={"Authorization": f"Bearer {token}"},
)
except httpx.HTTPError as exc:
raise UserFileConfigurationError(f"Unable to load user file storage settings: {exc}") from exc
if response.status_code == 404:
raise UserFileConfigurationError("MinIO user file storage is not configured")
if response.is_error:
raise UserFileConfigurationError(
f"Unable to load user file storage settings: HTTP {response.status_code}"
)
payload = response.json()
if not isinstance(payload, dict):
raise UserFileConfigurationError("Invalid MinIO settings response")
if not all(str(payload.get(key) or "").strip() for key in ("endpoint", "access_key", "secret_key", "bucket")):
raise UserFileConfigurationError("MinIO user file storage settings are incomplete")
payload.setdefault("namespace", self.auth_context.storage_namespace)
return payload
def build_file_auth_context(
*,
username: str,
config: BeaverConfig,
user_id: str | None = None,
scopes: tuple[str, ...] = (),
auth_source: str = "beaver-web-token",
) -> FileAuthContext:
backend_id = (
config.backend_identity.backend_id.strip()
or os.getenv("BEAVER_BACKEND_IDENTITY__BACKEND_ID", "").strip()
or username.strip()
)
namespace = default_user_file_namespace(backend_id)
return FileAuthContext(
username=username.strip(),
backend_id=backend_id,
storage_namespace=namespace,
user_id=user_id,
scopes=scopes,
auth_source=auth_source,
)
def default_user_file_namespace(backend_id: str) -> str:
cleaned = backend_id.strip().strip("/")
return f"users/{cleaned}" if cleaned else "users/unconfigured"
def _storage_mode(config: BeaverConfig) -> str:
raw = os.getenv("BEAVER_USER_FILES_STORAGE_MODE", "").strip().lower()
if raw in {"local", "dev-local", "development"}:
return "local"
if raw in {"minio", "object", "object-storage"}:
return "minio"
if config.authz.enabled and config.authz.base_url.strip() and config.backend_identity.backend_id.strip():
return "minio"
return "local"
def _clean_optional(value: Any) -> str | None:
text = str(value or "").strip()
return text or None

View File

@ -0,0 +1,630 @@
"""User-visible file system service.
This module owns the personal file-system boundary exposed to users and
agents. Storage backends can change, but callers see only virtual paths under
fixed roots.
"""
from __future__ import annotations
from contextlib import suppress
from dataclasses import dataclass
from datetime import datetime, timezone
from io import BytesIO
import mimetypes
from pathlib import Path, PurePosixPath
import shutil
import tempfile
from typing import Protocol
USER_FILE_ROOTS = ("uploads", "outputs", "shared", "tasks")
MAX_PREVIEW_BYTES = 1024 * 1024
AGENT_UPLOADS_ERROR = "uploads/ is user-provided input storage; agents may read it but must not write it"
AGENT_DELETE_ERROR = "agents cannot delete user-visible files; use the Files page or user-side APIs"
class UserFileError(ValueError):
"""Base error for user file operations."""
class UserFilePathError(UserFileError):
"""Raised when a user file path violates the virtual path policy."""
class UserFileNotFoundError(UserFileError):
"""Raised when a user file path does not exist."""
class UserFileSizeError(UserFileError):
"""Raised when a user file upload exceeds configured limits."""
@dataclass(frozen=True, slots=True)
class AgentUserFilePolicy:
task_id: str | None = None
fallback_scope: str = "interactive"
@property
def task_namespace(self) -> str:
if self.task_id:
return f"tasks/{self.task_id}"
scope = _safe_scope(self.fallback_scope)
return f"tasks/interactive/{scope}"
def validate_read(self, path: str) -> str:
return normalize_user_path(path, allow_root=False)
def validate_write(self, path: str) -> str:
normalized = normalize_user_path(path, allow_root=False)
root = normalized.split("/", 1)[0]
if root == "uploads":
raise UserFilePathError(AGENT_UPLOADS_ERROR)
if root == "tasks":
self._validate_task_namespace(normalized)
return normalized
def validate_mkdir(self, path: str) -> str:
return self.validate_write(path)
def validate_delete(self, path: str) -> str:
normalize_user_path(path, allow_root=False)
raise UserFilePathError(AGENT_DELETE_ERROR)
def _validate_task_namespace(self, normalized: str) -> None:
namespace = self.task_namespace
if normalized == "tasks" or not normalized.startswith(f"{namespace}/"):
raise UserFilePathError(f"Agent task files must be written under {namespace}/")
@dataclass(slots=True)
class UserFileEntry:
name: str
path: str
type: str
size: int | None = None
content_type: str | None = None
modified: str | None = None
def to_dict(self) -> dict[str, object]:
return {
"name": self.name,
"path": self.path,
"type": self.type,
"size": self.size,
"content_type": self.content_type,
"modified": self.modified,
}
@dataclass(slots=True)
class UserFileContent:
name: str
path: str
size: int
content_type: str
modified: str | None
content: bytes
@dataclass(slots=True)
class UserFilePreview:
name: str
path: str
size: int
content_type: str
modified: str | None
is_binary: bool
is_truncated: bool
content: str | None
def to_dict(self) -> dict[str, object]:
return {
"name": self.name,
"path": self.path,
"size": self.size,
"content_type": self.content_type,
"modified": self.modified,
"is_binary": self.is_binary,
"is_truncated": self.is_truncated,
"content": self.content,
}
class UserFileStorage(Protocol):
async def list_dir(self, path: str) -> list[UserFileEntry]:
...
async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
...
async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
...
async def write_file_stream(
self,
path: str,
stream: object,
*,
content_type: str,
max_bytes: int | None = None,
part_size: int = 10 * 1024 * 1024,
) -> UserFileEntry:
...
async def delete_path(self, path: str) -> bool:
...
async def mkdir(self, path: str) -> UserFileEntry:
...
class UserFileService:
def __init__(self, storage: UserFileStorage) -> None:
self.storage = storage
async def browse(self, path: str = "") -> dict[str, object]:
normalized = normalize_user_path(path, allow_root=True)
if normalized == "":
return {
"path": "",
"items": [
UserFileEntry(name=root, path=root, type="directory").to_dict()
for root in USER_FILE_ROOTS
],
}
entries = await self.storage.list_dir(normalized)
return {"path": normalized, "items": [entry.to_dict() for entry in entries]}
async def upload(self, directory: str, filename: str, content: bytes, *, content_type: str) -> dict[str, object]:
if not is_safe_filename(filename):
raise UserFilePathError("Invalid filename")
target = normalize_user_path(_join_user_path(directory, filename), allow_root=False)
return (await self.storage.write_file(target, content, content_type=content_type)).to_dict()
async def upload_stream(
self,
directory: str,
filename: str,
stream: object,
*,
content_type: str,
max_bytes: int | None = None,
part_size: int = 10 * 1024 * 1024,
) -> dict[str, object]:
if not is_safe_filename(filename):
raise UserFilePathError("Invalid filename")
target = normalize_user_path(_join_user_path(directory, filename), allow_root=False)
return (
await self.storage.write_file_stream(
target,
stream,
content_type=content_type,
max_bytes=max_bytes,
part_size=part_size,
)
).to_dict()
async def write_file(self, path: str, content: bytes | str, *, content_type: str = "text/plain") -> dict[str, object]:
normalized = normalize_user_path(path, allow_root=False)
raw = content.encode("utf-8") if isinstance(content, str) else bytes(content)
return (await self.storage.write_file(normalized, raw, content_type=content_type)).to_dict()
async def download(self, path: str) -> UserFileContent:
return await self.storage.read_file(normalize_user_path(path, allow_root=False))
async def preview(self, path: str, *, max_bytes: int = MAX_PREVIEW_BYTES) -> dict[str, object]:
content = await self.storage.read_file(normalize_user_path(path, allow_root=False), max_bytes=max_bytes)
is_binary = _is_probably_binary(content.content, content.content_type)
text = None if is_binary else content.content.decode("utf-8", errors="replace")
return UserFilePreview(
name=content.name,
path=content.path,
size=content.size,
content_type=content.content_type,
modified=content.modified,
is_binary=is_binary,
is_truncated=content.size > len(content.content),
content=text,
).to_dict()
async def delete(self, path: str) -> bool:
normalized = normalize_user_path(path, allow_root=False)
if normalized in USER_FILE_ROOTS:
raise UserFilePathError("Cannot delete virtual root folders")
return await self.storage.delete_path(normalized)
async def mkdir(self, path: str) -> dict[str, object]:
normalized = normalize_user_path(path, allow_root=False)
if normalized in USER_FILE_ROOTS:
raise UserFilePathError("Virtual root folders already exist")
return (await self.storage.mkdir(normalized)).to_dict()
class LocalUserFileStorage:
"""Filesystem-backed storage adapter for tests and local development."""
def __init__(self, root: Path) -> None:
self.root = Path(root).expanduser().resolve()
self.root.mkdir(parents=True, exist_ok=True)
for name in USER_FILE_ROOTS:
(self.root / name).mkdir(parents=True, exist_ok=True)
async def list_dir(self, path: str) -> list[UserFileEntry]:
target = self._path(path)
if not target.exists():
target.mkdir(parents=True, exist_ok=True)
if not target.is_dir():
raise UserFilePathError("Path is not a directory")
entries: list[UserFileEntry] = []
for child in sorted(target.iterdir(), key=lambda item: (not item.is_dir(), item.name.lower())):
if child.name.startswith("."):
continue
entries.append(self._entry(child))
return entries
async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
target = self._path(path)
if not target.is_file():
raise UserFileNotFoundError("File not found")
raw = target.read_bytes()
selected = raw[:max_bytes] if max_bytes is not None else raw
stat = target.stat()
content_type, _ = mimetypes.guess_type(target.name)
return UserFileContent(
name=target.name,
path=self._relative(target),
size=stat.st_size,
content_type=content_type or "application/octet-stream",
modified=_iso_from_timestamp(stat.st_mtime),
content=selected,
)
async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
target = self._path(path)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_bytes(content)
return self._entry(target, content_type=content_type)
async def write_file_stream(
self,
path: str,
stream: object,
*,
content_type: str,
max_bytes: int | None = None,
part_size: int = 10 * 1024 * 1024,
) -> UserFileEntry:
target = self._path(path)
target.parent.mkdir(parents=True, exist_ok=True)
fd, tmp_name = tempfile.mkstemp(prefix=f".{target.name}.", suffix=".tmp", dir=target.parent)
tmp_path = Path(tmp_name)
total = 0
try:
with open(fd, "wb", closefd=True) as output:
while True:
chunk = stream.read(part_size) # type: ignore[attr-defined]
if not chunk:
break
total += len(chunk)
if max_bytes is not None and total > max_bytes:
raise UserFileSizeError(_size_error(max_bytes))
output.write(chunk)
tmp_path.replace(target)
except Exception:
with suppress(FileNotFoundError):
tmp_path.unlink()
raise
return self._entry(target, content_type=content_type)
async def delete_path(self, path: str) -> bool:
target = self._path(path)
if not target.exists():
return False
if target.is_dir():
shutil.rmtree(target)
else:
target.unlink()
return True
async def mkdir(self, path: str) -> UserFileEntry:
target = self._path(path)
target.mkdir(parents=True, exist_ok=True)
return self._entry(target)
def _path(self, path: str) -> Path:
normalized = normalize_user_path(path, allow_root=False)
target = (self.root / normalized).resolve()
try:
target.relative_to(self.root)
except ValueError as exc:
raise UserFilePathError("Path escapes user file root") from exc
return target
def _relative(self, path: Path) -> str:
return path.relative_to(self.root).as_posix()
def _entry(self, path: Path, *, content_type: str | None = None) -> UserFileEntry:
stat = path.stat()
guessed_type, _ = mimetypes.guess_type(path.name)
return UserFileEntry(
name=path.name,
path=self._relative(path),
type="directory" if path.is_dir() else "file",
size=None if path.is_dir() else stat.st_size,
content_type=None if path.is_dir() else (content_type or guessed_type or "application/octet-stream"),
modified=_iso_from_timestamp(stat.st_mtime),
)
@dataclass(slots=True)
class MinIOStorageConfig:
endpoint: str
access_key: str
secret_key: str
bucket: str
secure: bool = False
region: str | None = None
namespace: str = ""
class MinIOUserFileStorage:
"""MinIO-backed user file storage adapter."""
def __init__(self, config: MinIOStorageConfig) -> None:
if not config.endpoint or not config.access_key or not config.secret_key or not config.bucket:
raise ValueError("MinIO storage requires endpoint, access key, secret key, and bucket")
from minio import Minio
self.config = config
self.client = Minio(
endpoint=config.endpoint,
access_key=config.access_key,
secret_key=config.secret_key,
secure=config.secure,
region=config.region,
)
async def list_dir(self, path: str) -> list[UserFileEntry]:
prefix = self._object_prefix(path)
objects = self.client.list_objects(self.config.bucket, prefix=prefix, recursive=False)
entries: list[UserFileEntry] = []
for obj in objects:
object_name = str(obj.object_name or "")
user_path = self._user_path(object_name)
if not user_path or user_path == path or user_path.endswith("/.keep"):
continue
trimmed = user_path.rstrip("/")
name = PurePosixPath(trimmed).name
is_dir = bool(getattr(obj, "is_dir", False)) or object_name.endswith("/")
entries.append(
UserFileEntry(
name=name,
path=trimmed,
type="directory" if is_dir else "file",
size=None if is_dir else getattr(obj, "size", None),
content_type=None if is_dir else "application/octet-stream",
modified=obj.last_modified.isoformat() if getattr(obj, "last_modified", None) else None,
)
)
return sorted(entries, key=lambda item: (item.type != "directory", item.name.lower()))
async def read_file(self, path: str, *, max_bytes: int | None = None) -> UserFileContent:
object_name = self._object_name(path)
try:
stat = self.client.stat_object(self.config.bucket, object_name)
if max_bytes is None:
response = self.client.get_object(self.config.bucket, object_name)
else:
response = self.client.get_object(self.config.bucket, object_name, length=max_bytes)
raw = response.read()
response.close()
response.release_conn()
except Exception as exc:
raise UserFileNotFoundError("File not found") from exc
return UserFileContent(
name=PurePosixPath(path).name,
path=path,
size=int(stat.size or len(raw)),
content_type=stat.content_type or "application/octet-stream",
modified=stat.last_modified.isoformat() if stat.last_modified else None,
content=raw,
)
async def write_file(self, path: str, content: bytes, *, content_type: str) -> UserFileEntry:
object_name = self._object_name(path)
result = self.client.put_object(
self.config.bucket,
object_name,
BytesIO(content),
length=len(content),
content_type=content_type,
)
return UserFileEntry(
name=PurePosixPath(path).name,
path=path,
type="file",
size=len(content),
content_type=content_type,
modified=datetime.now(timezone.utc).isoformat(),
)
async def write_file_stream(
self,
path: str,
stream: object,
*,
content_type: str,
max_bytes: int | None = None,
part_size: int = 10 * 1024 * 1024,
) -> UserFileEntry:
object_name = self._object_name(path)
reader = _LimitedReadStream(stream, max_bytes=max_bytes)
try:
self.client.put_object(
self.config.bucket,
object_name,
reader,
length=-1,
part_size=max(5 * 1024 * 1024, part_size),
content_type=content_type,
)
except UserFileSizeError:
try:
self.client.remove_object(self.config.bucket, object_name)
except Exception:
pass
raise
return UserFileEntry(
name=PurePosixPath(path).name,
path=path,
type="file",
size=reader.bytes_read,
content_type=content_type,
modified=datetime.now(timezone.utc).isoformat(),
)
async def delete_path(self, path: str) -> bool:
object_name = self._object_name(path)
removed = False
try:
self.client.remove_object(self.config.bucket, object_name)
removed = True
except Exception:
pass
prefix = f"{object_name.rstrip('/')}/"
for obj in self.client.list_objects(self.config.bucket, prefix=prefix, recursive=True):
self.client.remove_object(self.config.bucket, str(obj.object_name))
removed = True
return removed
async def mkdir(self, path: str) -> UserFileEntry:
object_name = f"{self._object_name(path).rstrip('/')}/.keep"
self.client.put_object(
self.config.bucket,
object_name,
BytesIO(b""),
length=0,
content_type="application/x-directory",
)
return UserFileEntry(
name=PurePosixPath(path).name,
path=path,
type="directory",
size=None,
modified=datetime.now(timezone.utc).isoformat(),
)
def _namespace(self) -> str:
return self.config.namespace.strip("/")
def _object_name(self, path: str) -> str:
normalized = normalize_user_path(path, allow_root=False)
namespace = self._namespace()
object_name = f"{namespace}/{normalized}" if namespace else normalized
if object_name.startswith("/") or "/../" in f"/{object_name}/":
raise UserFilePathError("Object path escapes namespace")
return object_name
def _object_prefix(self, path: str) -> str:
return f"{self._object_name(path).rstrip('/')}/"
def _user_path(self, object_name: str) -> str:
namespace = self._namespace()
if namespace:
prefix = f"{namespace}/"
if not object_name.startswith(prefix):
raise UserFilePathError("Object path escapes namespace")
return object_name[len(prefix) :]
return object_name
def normalize_user_path(path: str | None, *, allow_root: bool) -> str:
original = (path or "").replace("\\", "/").strip()
if original.startswith("/"):
raise UserFilePathError("Absolute paths are not allowed")
raw = original.strip("/")
if raw == "":
if allow_root:
return ""
raise UserFilePathError("Path is required")
posix = PurePosixPath(raw)
if posix.is_absolute():
raise UserFilePathError("Absolute paths are not allowed")
parts = [part for part in posix.parts if part not in ("", ".")]
if any(part == ".." for part in parts):
raise UserFilePathError("Parent-directory traversal is not allowed")
if any(part.startswith(".") for part in parts):
raise UserFilePathError("Hidden implementation paths are not allowed")
if not parts or parts[0] not in USER_FILE_ROOTS:
raise UserFilePathError("Path must be under uploads, outputs, shared, or tasks")
return "/".join(parts)
def is_safe_filename(filename: str) -> bool:
return bool(filename) and "/" not in filename and "\\" not in filename and not filename.startswith(".")
def _join_user_path(directory: str, filename: str) -> str:
normalized_dir = normalize_user_path(directory, allow_root=False)
return f"{normalized_dir.rstrip('/')}/{filename}"
def _is_probably_binary(raw: bytes, content_type: str) -> bool:
if content_type.startswith("text/") or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-yaml",
}:
return False
if not raw:
return False
if b"\x00" in raw[:4096]:
return True
try:
raw[:4096].decode("utf-8")
except UnicodeDecodeError:
return True
return False
def _iso_from_timestamp(value: float) -> str:
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
def _safe_scope(value: str | None) -> str:
raw = (value or "interactive").strip()
allowed = [char if char.isalnum() or char in ("-", "_") else "-" for char in raw]
cleaned = "".join(allowed).strip("-_")
return cleaned or "interactive"
class _LimitedReadStream:
def __init__(self, stream: object, *, max_bytes: int | None = None) -> None:
self.stream = stream
self.max_bytes = max_bytes
self.bytes_read = 0
def read(self, size: int = -1) -> bytes:
chunk = self.stream.read(size) # type: ignore[attr-defined]
if not chunk:
return b""
self.bytes_read += len(chunk)
if self.max_bytes is not None and self.bytes_read > self.max_bytes:
raise UserFileSizeError(_size_error(self.max_bytes))
return chunk
def _size_error(max_bytes: int) -> str:
return f"File too large (max {_human_size(max_bytes)})"
def _human_size(size: int) -> str:
units = ("B", "KB", "MB", "GB", "TB")
value = float(size)
for unit in units:
if value < 1024 or unit == units[-1]:
return f"{value:.0f}{unit}" if unit == "B" else f"{value:.1f}{unit}"
value /= 1024
return f"{size}B"