EverOS/src/everos/component/llm/client.py

"""Process-wide LLM client accessor.

Lazy singletons — the first call to each accessor reads settings and builds
the corresponding algo LLM client (main or multimodal); subsequent calls
return the cached instance. Each accessor raises
:class:`LLMNotConfiguredError` when its credentials are missing so
misconfiguration surfaces at app startup (via the LLM lifespan
provider) instead of silently failing per-request downstream.
"""

from __future__ import annotations

import base64
import binascii
from io import BytesIO
from typing import Any

from everalgo.llm import build_client
from everalgo.llm.config import LLMConfig
from everalgo.llm.protocols import LLMClient
from everalgo.llm.types import ChatMessage, ChatResponse, ImageUrlPart, TextPart
from pydantic import BaseModel

from everos.config import load_settings
from everos.core.observability.logging import get_logger

# Module-level logger; the accessors below emit one event each time a client
# is built, passing the model name as a keyword field.
logger = get_logger(__name__)


class LLMNotConfiguredError(RuntimeError):
    """Raised when an LLM settings section lacks ``api_key`` or ``base_url``.

    Both accessors in this module raise it: ``get_llm_client`` for
    ``settings.llm`` and ``get_multimodal_llm_client`` for
    ``settings.multimodal``.
    """


# Cached singletons, populated on first use by ``get_llm_client`` and
# ``get_multimodal_llm_client`` respectively.
_llm_client: LLMClient | None = None
_multimodal_client: LLMClient | None = None
# Shorter-side length (in pixels) that ``_resize_image_data_url`` scales
# data-URL images to.
_VLM_IMAGE_MIN_SIDE = 1024
# Key inside the request's ``extra_body`` that carries the thinking toggle,
# and the default value injected when the caller has not set it.
_NO_THINKING_EXTRA_BODY_KEY = "chat_template_kwargs"
_NO_THINKING_PARAM = {"enable_thinking": False}

# Prompt that ``_with_multimodal_image_defaults`` writes over the first text
# part of any message containing a ``data:image/`` part.
_IMAGE_VISUAL_MEMORY_PROMPT = """Describe this image for visual memory retrieval.

Output final Markdown directly; do not include reasoning.

Focus on:
1. Key visible objects and their names, brands, colors, labels, quantities.
2. Spatial relationships and relative positions: left/right/above/below/center,
   foreground/background, nearby objects, and supporting surfaces.
3. Location-query facts, e.g. "the milk carton is center-left, to the right of
   X and to the left of Y".
4. Important visible text, but extract only useful labels/interface text; do
   not exhaustively OCR every key or menu item if that would crowd out object
   locations.

Do NOT describe the parser, assistant, or ChatGPT as processing the image.
If "ChatGPT" is visible, list it only as visible interface text.
"""


class _NoThinkingRequestDefaultsClient:
    """Client wrapper that applies the no-thinking request defaults.

    Every ``chat`` call is forwarded to the wrapped client unchanged, except
    that the extra keyword arguments first pass through
    ``_with_no_thinking_defaults``.
    """

    def __init__(self, inner: LLMClient) -> None:
        # The wrapped client that actually performs the request.
        self._inner = inner

    async def chat(
        self,
        messages: list[ChatMessage],
        *,
        model: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        response_format: type[BaseModel] | None = None,
        **extra: Any,
    ) -> ChatResponse:
        """Forward the call to the wrapped client with patched extra kwargs.

        Args:
            messages: Conversation passed through untouched.
            model: Forwarded as-is.
            temperature: Forwarded as-is.
            max_tokens: Forwarded as-is.
            response_format: Forwarded as-is.
            **extra: Additional request kwargs; a no-thinking default is
                merged into ``extra_body`` unless the caller already set it.

        Returns:
            Whatever the wrapped client's ``chat`` returns.
        """
        request_kwargs = _with_no_thinking_defaults(extra)
        response = await self._inner.chat(
            messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            **request_kwargs,
        )
        return response


class _MultimodalImageDetailCompatClient:
    """Client wrapper that normalises multimodal requests before sending.

    everalgo-core 0.2.0 serialises ``image_url.detail`` as ``None`` when the
    field is unset, and some strict OpenAI-compatible gateways reject that
    literal null in favour of one of OpenAI's enum values. Each message is
    therefore run through ``_with_multimodal_image_defaults`` and the extra
    kwargs through ``_with_no_thinking_defaults`` before the wrapped client
    is called. EverOS only uses this wrapper for multimodal parsing.
    """

    def __init__(self, inner: LLMClient, *, resize_images_for_vlm: bool) -> None:
        # Wrapped client plus the flag handed to the per-message patcher.
        self._resize_images_for_vlm = resize_images_for_vlm
        self._inner = inner

    async def chat(
        self,
        messages: list[ChatMessage],
        *,
        model: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        response_format: type[BaseModel] | None = None,
        **extra: Any,
    ) -> ChatResponse:
        """Patch every message, then delegate to the wrapped client.

        Args:
            messages: Conversation; each entry is replaced by its patched
                counterpart (or kept as-is when nothing needs patching).
            model: Forwarded as-is.
            temperature: Forwarded as-is.
            max_tokens: Forwarded as-is.
            response_format: Forwarded as-is.
            **extra: Additional request kwargs, merged with the no-thinking
                default.

        Returns:
            Whatever the wrapped client's ``chat`` returns.
        """
        patched_messages = []
        for original in messages:
            patched_messages.append(
                _with_multimodal_image_defaults(
                    original,
                    resize_images_for_vlm=self._resize_images_for_vlm,
                )
            )
        request_kwargs = _with_no_thinking_defaults(extra)
        return await self._inner.chat(
            patched_messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            **request_kwargs,
        )


def _with_no_thinking_defaults(extra: dict[str, Any]) -> dict[str, Any]:
    """Build request kwargs that disable thinking unless the caller chose.

    The input mapping and any nested mappings are copied, never mutated. An
    ``enable_thinking`` value already present under
    ``extra_body[_NO_THINKING_EXTRA_BODY_KEY]`` is preserved.

    Args:
        extra: Extra keyword arguments destined for the client's ``chat``.

    Returns:
        A new dict equal to ``extra`` with ``extra_body`` replaced by a copy
        that carries the thinking toggle.
    """
    body = dict(extra.get("extra_body") or {})
    template_kwargs = dict(body.get(_NO_THINKING_EXTRA_BODY_KEY) or {})
    if "enable_thinking" not in template_kwargs:
        template_kwargs["enable_thinking"] = _NO_THINKING_PARAM["enable_thinking"]
    body[_NO_THINKING_EXTRA_BODY_KEY] = template_kwargs
    return {**extra, "extra_body": body}


def _with_multimodal_image_defaults(
    message: ChatMessage, *, resize_images_for_vlm: bool = True
) -> ChatMessage:
    """Return a copy with stricter-gateway + visual-memory image defaults.

    For messages whose content is a list of parts:

    * every ``ImageUrlPart`` with ``detail`` unset gets ``detail="auto"``;
    * when ``resize_images_for_vlm`` is true, image URLs are passed through
      ``_resize_image_data_url`` and replaced if that returns a new URL;
    * when at least one part is a ``data:image/`` image (see
      ``_is_image_part``), the text of the *first* ``TextPart`` is replaced
      by ``_IMAGE_VISUAL_MEMORY_PROMPT``. Later text parts are never touched,
      so running an already-patched message through again is a no-op.

    Args:
        message: Message to patch; never mutated.
        resize_images_for_vlm: Whether to rescale data-URL images.

    Returns:
        ``message`` itself when nothing needed patching (including when its
        content is not a list), otherwise a copy with the patched parts.
    """
    content = message.content
    if not isinstance(content, list):
        return message

    has_image = any(_is_image_part(part) for part in content)
    prompt_slot_claimed = False
    changed = False
    patched_parts: list[object] = []
    for part in content:
        patched = part
        if isinstance(part, ImageUrlPart):
            image_url_updates: dict[str, object] = {}
            if part.image_url.detail is None:
                # Strict gateways reject a literal null here.
                image_url_updates["detail"] = "auto"
            if resize_images_for_vlm:
                resized_url = _resize_image_data_url(part.image_url.url)
                if resized_url != part.image_url.url:
                    image_url_updates["url"] = resized_url
            if image_url_updates:
                image_url = part.image_url.model_copy(update=image_url_updates)
                patched = part.model_copy(update={"image_url": image_url})
                changed = True
        if has_image and not prompt_slot_claimed and isinstance(patched, TextPart):
            # The first text part is the prompt slot whether or not it already
            # holds the prompt; claiming it unconditionally keeps a second
            # pass from overwriting a later text part instead.
            prompt_slot_claimed = True
            if patched.text != _IMAGE_VISUAL_MEMORY_PROMPT:
                patched = patched.model_copy(
                    update={"text": _IMAGE_VISUAL_MEMORY_PROMPT}
                )
                changed = True
        patched_parts.append(patched)

    if not changed:
        return message
    return message.model_copy(update={"content": patched_parts})


def _is_image_part(part: object) -> bool:
    """Tell whether ``part`` is an ``ImageUrlPart`` holding an image data URL.

    Only URLs beginning with ``data:image/`` count; any other URL on an
    ``ImageUrlPart`` yields ``False``.
    """
    if not isinstance(part, ImageUrlPart):
        return False
    return part.image_url.url.startswith("data:image/")


def _resize_image_data_url(url: str) -> str:
    """Resize a base64 image data URL to the configured shorter-side length.

    The image is decoded, EXIF-transposed, scaled so that its shorter side
    equals ``_VLM_IMAGE_MIN_SIDE`` (aspect ratio kept) and re-encoded in the
    format named by the URL's MIME type.

    This is best-effort. The input ``url`` is returned unchanged when it is
    not a base64 ``data:image/`` URL, when the MIME type is not JPEG, PNG or
    WebP, when Pillow is unavailable, when the payload cannot be decoded or
    processed, or when the scaled image would exceed Pillow's
    ``MAX_IMAGE_PIXELS`` limit.

    Args:
        url: Image URL taken from a message part.

    Returns:
        A data URL with the original header and the re-encoded payload, or
        ``url`` itself when no resize was performed.
    """
    if not url.startswith("data:image/"):
        return url
    header, separator, encoded = url.partition(",")
    if not separator:
        return url
    if ";base64" not in header.lower():
        return url

    # ``header`` looks like ``data:<mime>[;param...]``; strip the scheme.
    mime_type = header[5:].split(";", 1)[0].lower()
    image_format = {
        "image/jpeg": "JPEG",
        "image/jpg": "JPEG",
        "image/png": "PNG",
        "image/webp": "WEBP",
    }.get(mime_type)
    if image_format is None:
        return url

    # Imported separately so the except clause below can safely reference
    # ``Image.DecompressionBombError``.
    try:
        from PIL import Image, ImageOps
    except ImportError:
        return url

    try:
        raw = base64.b64decode(encoded, validate=True)
        with Image.open(BytesIO(raw)) as opened:
            image = ImageOps.exif_transpose(opened)
            target_size = _image_size_with_min_side(
                image.size, _VLM_IMAGE_MIN_SIDE
            )
            # Extreme aspect ratios can blow the scaled image up to an
            # unreasonable pixel count; reuse Pillow's own safety limit.
            pixel_limit = Image.MAX_IMAGE_PIXELS
            if (
                pixel_limit is not None
                and target_size[0] * target_size[1] > pixel_limit
            ):
                return url
            resized = image.resize(target_size, Image.Resampling.LANCZOS)
            if image_format == "JPEG" and resized.mode not in ("RGB", "L"):
                # JPEG cannot store alpha/palette modes.
                resized = resized.convert("RGB")
            buffer = BytesIO()
            save_kwargs: dict[str, object] = {"format": image_format}
            if image_format == "JPEG":
                save_kwargs["quality"] = 85
            resized.save(buffer, **save_kwargs)
    except (
        ValueError,
        OSError,
        MemoryError,
        binascii.Error,
        Image.DecompressionBombError,
    ):
        # DecompressionBombError derives directly from Exception, so it has
        # to be listed explicitly to keep the fallback best-effort.
        return url

    resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return f"{header},{resized_encoded}"


def _image_size_with_min_side(
    size: tuple[int, int],
    min_side: int,
) -> tuple[int, int]:
    """Scale ``size`` uniformly so its shorter side becomes ``min_side``.

    Both dimensions are multiplied by ``min_side / shorter_side`` and
    rounded, then clamped to at least one pixel. A size whose shorter side
    is zero or negative is not scaled, only clamped.

    Args:
        size: ``(width, height)`` of the source image.
        min_side: Desired length of the shorter side.

    Returns:
        The ``(width, height)`` to resize to.
    """
    width, height = size
    shortest = height if height < width else width
    if shortest > 0:
        scale = min_side / shortest
        scaled = (round(width * scale), round(height * scale))
    else:
        # Degenerate input: leave the dimensions alone and just clamp them.
        scaled = (width, height)
    return (max(1, scaled[0]), max(1, scaled[1]))


def get_llm_client() -> LLMClient:
    """Return the process-wide algo LLM client, building it on first use.

    The client is created from ``settings.llm``, wrapped so that requests
    carry the no-thinking defaults, and cached for later calls.

    Raises:
        LLMNotConfiguredError: When ``settings.llm.api_key`` or
            ``settings.llm.base_url`` is unset.
    """
    global _llm_client
    if _llm_client is None:
        settings = load_settings().llm
        secret = settings.api_key
        api_key = None if secret is None else secret.get_secret_value()
        if not (api_key and settings.base_url):
            raise LLMNotConfiguredError(
                "LLM is required; set EVEROS_LLM__API_KEY + EVEROS_LLM__BASE_URL"
            )
        config = LLMConfig(
            model=settings.model,
            api_key=api_key,
            base_url=settings.base_url,
            timeout=settings.timeout_seconds,
        )
        _llm_client = _NoThinkingRequestDefaultsClient(build_client(config))
        logger.info("llm_client_built", model=settings.model)
    return _llm_client


def get_multimodal_llm_client() -> LLMClient:
    """Return the process-wide multimodal LLM client (for everalgo.parser).

    Built on first use from the flat ``[multimodal]`` config, which is kept
    apart from the main ``[llm]`` section so parsing can target a
    vision/audio-capable endpoint. The client is wrapped in the multimodal
    compatibility wrapper and cached for later calls.

    Raises:
        LLMNotConfiguredError: When ``settings.multimodal.api_key`` or
            ``settings.multimodal.base_url`` is unset.
    """
    global _multimodal_client
    if _multimodal_client is None:
        settings = load_settings().multimodal
        secret = settings.api_key
        api_key = None if secret is None else secret.get_secret_value()
        if not (api_key and settings.base_url):
            raise LLMNotConfiguredError(
                "Multimodal LLM is required for parsing; set "
                "EVEROS_MULTIMODAL__API_KEY + EVEROS_MULTIMODAL__BASE_URL"
            )
        config = LLMConfig(
            model=settings.model,
            api_key=api_key,
            base_url=settings.base_url,
            timeout=settings.timeout_seconds,
        )
        _multimodal_client = _MultimodalImageDetailCompatClient(
            build_client(config),
            resize_images_for_vlm=settings.resize_images_for_vlm,
        )
        logger.info("multimodal_llm_client_built", model=settings.model)
    return _multimodal_client