"""Process-wide LLM client accessor. Lazy singleton — first call reads settings and builds the algo LLM client; subsequent calls return the cached instance. Raises :class:`LLMNotConfiguredError` when no credentials are present so misconfiguration surfaces at app startup (via the LLM lifespan provider) instead of silently failing per-request downstream. """ from __future__ import annotations import base64 import binascii from io import BytesIO from typing import Any from everalgo.llm import build_client from everalgo.llm.config import LLMConfig from everalgo.llm.protocols import LLMClient from everalgo.llm.types import ChatMessage, ChatResponse, ImageUrlPart, TextPart from pydantic import BaseModel from everos.config import load_settings from everos.core.observability.logging import get_logger logger = get_logger(__name__) class LLMNotConfiguredError(RuntimeError): """Raised when ``settings.llm`` is missing ``api_key`` or ``base_url``.""" _llm_client: LLMClient | None = None _multimodal_client: LLMClient | None = None _VLM_IMAGE_MIN_SIDE = 1024 _NO_THINKING_EXTRA_BODY_KEY = "chat_template_kwargs" _NO_THINKING_PARAM = {"enable_thinking": False} _IMAGE_VISUAL_MEMORY_PROMPT = """Describe this image for visual memory retrieval. Output final Markdown directly; do not include reasoning. Focus on: 1. Key visible objects and their names, brands, colors, labels, quantities. 2. Spatial relationships and relative positions: left/right/above/below/center, foreground/background, nearby objects, and supporting surfaces. 3. Location-query facts, e.g. "the milk carton is center-left, to the right of X and to the left of Y". 4. Important visible text, but extract only useful labels/interface text; do not exhaustively OCR every key or menu item if that would crowd out object locations. Do NOT describe the parser, assistant, or ChatGPT as processing the image. If "ChatGPT" is visible, list it only as visible interface text. """ class _NoThinkingRequestDefaultsClient: """Inject default no-thinking request params for OpenAI-compatible servers.""" def __init__(self, inner: LLMClient) -> None: self._inner = inner async def chat( self, messages: list[ChatMessage], *, model: str | None = None, temperature: float | None = None, max_tokens: int | None = None, response_format: type[BaseModel] | None = None, **extra: Any, ) -> ChatResponse: return await self._inner.chat( messages, model=model, temperature=temperature, max_tokens=max_tokens, response_format=response_format, **_with_no_thinking_defaults(extra), ) class _MultimodalImageDetailCompatClient: """Patch image parts for strict OpenAI-compatible gateways. everalgo-core 0.2.0 serialises ``image_url.detail`` as ``None`` when the field is unset. Some gateways reject that literal null and require one of OpenAI's enum values. EverOS only uses this wrapper for multimodal parsing. """ def __init__(self, inner: LLMClient, *, resize_images_for_vlm: bool) -> None: self._inner = inner self._resize_images_for_vlm = resize_images_for_vlm async def chat( self, messages: list[ChatMessage], *, model: str | None = None, temperature: float | None = None, max_tokens: int | None = None, response_format: type[BaseModel] | None = None, **extra: Any, ) -> ChatResponse: return await self._inner.chat( [ _with_multimodal_image_defaults( m, resize_images_for_vlm=self._resize_images_for_vlm, ) for m in messages ], model=model, temperature=temperature, max_tokens=max_tokens, response_format=response_format, **_with_no_thinking_defaults(extra), ) def _with_no_thinking_defaults(extra: dict[str, Any]) -> dict[str, Any]: """Return request kwargs with no-thinking enabled unless caller overrides.""" patched = dict(extra) extra_body = dict(patched.get("extra_body") or {}) chat_template_kwargs = dict(extra_body.get(_NO_THINKING_EXTRA_BODY_KEY) or {}) chat_template_kwargs.setdefault( "enable_thinking", _NO_THINKING_PARAM["enable_thinking"] ) extra_body[_NO_THINKING_EXTRA_BODY_KEY] = chat_template_kwargs patched["extra_body"] = extra_body return patched def _with_multimodal_image_defaults( message: ChatMessage, *, resize_images_for_vlm: bool = True ) -> ChatMessage: """Return a copy with stricter-gateway + visual-memory image defaults.""" content = message.content if not isinstance(content, list): return message has_image = any(_is_image_part(part) for part in content) instructions_added = False changed = False patched_parts: list[object] = [] for part in content: patched = part if isinstance(part, ImageUrlPart): image_url_updates: dict[str, object] = {} if part.image_url.detail is None: image_url_updates["detail"] = "auto" if resize_images_for_vlm: resized_url = _resize_image_data_url(part.image_url.url) if resized_url != part.image_url.url: image_url_updates["url"] = resized_url if image_url_updates: image_url = part.image_url.model_copy(update=image_url_updates) patched = part.model_copy(update={"image_url": image_url}) changed = True if ( has_image and not instructions_added and isinstance(patched, TextPart) and patched.text != _IMAGE_VISUAL_MEMORY_PROMPT ): patched = patched.model_copy( update={"text": _IMAGE_VISUAL_MEMORY_PROMPT} ) instructions_added = True changed = True patched_parts.append(patched) if not changed: return message return message.model_copy(update={"content": patched_parts}) def _is_image_part(part: object) -> bool: return ( isinstance(part, ImageUrlPart) and part.image_url.url.startswith("data:image/") ) def _resize_image_data_url(url: str) -> str: """Resize base64 data-url images so the shorter side is 64 pixels.""" if not url.startswith("data:image/"): return url try: header, encoded = url.split(",", 1) except ValueError: return url if ";base64" not in header.lower(): return url mime_type = header[5:].split(";", 1)[0].lower() image_format = { "image/jpeg": "JPEG", "image/jpg": "JPEG", "image/png": "PNG", "image/webp": "WEBP", }.get(mime_type) if image_format is None: return url try: from PIL import Image, ImageOps raw = base64.b64decode(encoded, validate=True) with Image.open(BytesIO(raw)) as image: image = ImageOps.exif_transpose(image) target_size = _image_size_with_min_side( image.size, _VLM_IMAGE_MIN_SIDE ) resized = image.resize(target_size, Image.Resampling.LANCZOS) if image_format == "JPEG" and resized.mode not in ("RGB", "L"): resized = resized.convert("RGB") buffer = BytesIO() save_kwargs: dict[str, object] = {"format": image_format} if image_format == "JPEG": save_kwargs["quality"] = 85 resized.save(buffer, **save_kwargs) except (ImportError, ValueError, OSError, binascii.Error): return url resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii") return f"{header},{resized_encoded}" def _image_size_with_min_side( size: tuple[int, int], min_side: int, ) -> tuple[int, int]: width, height = size shortest = min(width, height) if shortest <= 0: return (max(1, width), max(1, height)) scale = min_side / shortest return (max(1, round(width * scale)), max(1, round(height * scale))) def get_llm_client() -> LLMClient: """Return the singleton algo LLM client. Raises: LLMNotConfiguredError: When ``settings.llm.api_key`` or ``settings.llm.base_url`` is unset. """ global _llm_client if _llm_client is not None: return _llm_client llm_cfg = load_settings().llm api_key = ( llm_cfg.api_key.get_secret_value() if llm_cfg.api_key is not None else None ) if not api_key or not llm_cfg.base_url: raise LLMNotConfiguredError( "LLM is required; set EVEROS_LLM__API_KEY + EVEROS_LLM__BASE_URL" ) _llm_client = _NoThinkingRequestDefaultsClient( build_client( LLMConfig( model=llm_cfg.model, api_key=api_key, base_url=llm_cfg.base_url, timeout=llm_cfg.timeout_seconds, ) ) ) logger.info("llm_client_built", model=llm_cfg.model) return _llm_client def get_multimodal_llm_client() -> LLMClient: """Return the singleton multimodal LLM client (for everalgo.parser). Reads the flat ``[multimodal]`` config — kept separate from the main ``[llm]`` so parsing can target a vision/audio-capable endpoint. Raises: LLMNotConfiguredError: When ``settings.multimodal.api_key`` or ``settings.multimodal.base_url`` is unset. """ global _multimodal_client if _multimodal_client is not None: return _multimodal_client cfg = load_settings().multimodal api_key = cfg.api_key.get_secret_value() if cfg.api_key is not None else None if not api_key or not cfg.base_url: raise LLMNotConfiguredError( "Multimodal LLM is required for parsing; set " "EVEROS_MULTIMODAL__API_KEY + EVEROS_MULTIMODAL__BASE_URL" ) _multimodal_client = _MultimodalImageDetailCompatClient( build_client( LLMConfig( model=cfg.model, api_key=api_key, base_url=cfg.base_url, timeout=cfg.timeout_seconds, ) ), resize_images_for_vlm=cfg.resize_images_for_vlm, ) logger.info("multimodal_llm_client_built", model=cfg.model) return _multimodal_client