Files
EverOS/src/everos/component/llm/client.py
tomtan 0910affc78
Some checks failed
CI / lint (push) Has been cancelled
CI / unit tests (push) Has been cancelled
CI / integration tests (push) Has been cancelled
CI / package build (push) Has been cancelled
Commit lint / pull request title (push) Has been cancelled
Commit lint / commit messages (push) Has been cancelled
Save local modifications for syncing
2026-06-10 10:05:52 +08:00

310 lines
10 KiB
Python

"""Process-wide LLM client accessor.
Lazy singleton — first call reads settings and builds the algo LLM
client; subsequent calls return the cached instance. Raises
:class:`LLMNotConfiguredError` when no credentials are present so
misconfiguration surfaces at app startup (via the LLM lifespan
provider) instead of silently failing per-request downstream.
"""
from __future__ import annotations
import base64
import binascii
from io import BytesIO
from typing import Any
from everalgo.llm import build_client
from everalgo.llm.config import LLMConfig
from everalgo.llm.protocols import LLMClient
from everalgo.llm.types import ChatMessage, ChatResponse, ImageUrlPart, TextPart
from pydantic import BaseModel
from everos.config import load_settings
from everos.core.observability.logging import get_logger
logger = get_logger(__name__)
class LLMNotConfiguredError(RuntimeError):
    """Raised when a required LLM endpoint has no usable credentials.

    Both accessors in this module raise it: :func:`get_llm_client` when
    ``settings.llm`` is missing ``api_key`` or ``base_url``, and
    :func:`get_multimodal_llm_client` when ``settings.multimodal`` is
    missing either of them.
    """
# Cached instance returned by get_llm_client(); None until the first
# successful build.
_llm_client: LLMClient | None = None
# Cached instance returned by get_multimodal_llm_client(); None until the
# first successful build.
_multimodal_client: LLMClient | None = None
# Length in pixels that _resize_image_data_url() scales an image's shorter
# side to.
_VLM_IMAGE_MIN_SIDE = 1024
# Key inside the request's ``extra_body`` whose dict is patched by
# _with_no_thinking_defaults().
_NO_THINKING_EXTRA_BODY_KEY = "chat_template_kwargs"
# Source of the default ``enable_thinking`` value, applied only when the
# caller has not set that key.
_NO_THINKING_PARAM = {"enable_thinking": False}
# Instruction text that _with_multimodal_image_defaults() writes into a text
# part of any message that contains a ``data:image/`` part.
_IMAGE_VISUAL_MEMORY_PROMPT = """Describe this image for visual memory retrieval.
Output final Markdown directly; do not include reasoning.
Focus on:
1. Key visible objects and their names, brands, colors, labels, quantities.
2. Spatial relationships and relative positions: left/right/above/below/center,
foreground/background, nearby objects, and supporting surfaces.
3. Location-query facts, e.g. "the milk carton is center-left, to the right of
X and to the left of Y".
4. Important visible text, but extract only useful labels/interface text; do
not exhaustively OCR every key or menu item if that would crowd out object
locations.
Do NOT describe the parser, assistant, or ChatGPT as processing the image.
If "ChatGPT" is visible, list it only as visible interface text.
"""
class _NoThinkingRequestDefaultsClient:
"""Inject default no-thinking request params for OpenAI-compatible servers."""
def __init__(self, inner: LLMClient) -> None:
self._inner = inner
async def chat(
self,
messages: list[ChatMessage],
*,
model: str | None = None,
temperature: float | None = None,
max_tokens: int | None = None,
response_format: type[BaseModel] | None = None,
**extra: Any,
) -> ChatResponse:
return await self._inner.chat(
messages,
model=model,
temperature=temperature,
max_tokens=max_tokens,
response_format=response_format,
**_with_no_thinking_defaults(extra),
)
class _MultimodalImageDetailCompatClient:
"""Patch image parts for strict OpenAI-compatible gateways.
everalgo-core 0.2.0 serialises ``image_url.detail`` as ``None`` when the
field is unset. Some gateways reject that literal null and require one of
OpenAI's enum values. EverOS only uses this wrapper for multimodal parsing.
"""
def __init__(self, inner: LLMClient, *, resize_images_for_vlm: bool) -> None:
self._inner = inner
self._resize_images_for_vlm = resize_images_for_vlm
async def chat(
self,
messages: list[ChatMessage],
*,
model: str | None = None,
temperature: float | None = None,
max_tokens: int | None = None,
response_format: type[BaseModel] | None = None,
**extra: Any,
) -> ChatResponse:
return await self._inner.chat(
[
_with_multimodal_image_defaults(
m,
resize_images_for_vlm=self._resize_images_for_vlm,
)
for m in messages
],
model=model,
temperature=temperature,
max_tokens=max_tokens,
response_format=response_format,
**_with_no_thinking_defaults(extra),
)
def _with_no_thinking_defaults(extra: dict[str, Any]) -> dict[str, Any]:
    """Build request kwargs whose ``extra_body`` disables thinking by default.

    The input mapping and any nested dicts it holds are left untouched; the
    ``extra_body`` dict and its chat-template kwargs dict are shallow-copied
    before being changed. An ``enable_thinking`` value already supplied by
    the caller is preserved.

    Args:
        extra: Extra keyword arguments destined for the client's ``chat``.

    Returns:
        A new dict with ``extra_body`` carrying the patched template kwargs.
    """
    # ``or {}`` lets a missing or None value stand in as an empty mapping.
    body = dict(extra.get("extra_body") or {})
    template_kwargs = dict(body.get(_NO_THINKING_EXTRA_BODY_KEY) or {})
    if "enable_thinking" not in template_kwargs:
        template_kwargs["enable_thinking"] = _NO_THINKING_PARAM["enable_thinking"]
    body[_NO_THINKING_EXTRA_BODY_KEY] = template_kwargs
    return {**extra, "extra_body": body}
def _with_multimodal_image_defaults(
    message: ChatMessage, *, resize_images_for_vlm: bool = True
) -> ChatMessage:
    """Return a copy with stricter-gateway + visual-memory image defaults.

    For list-style content this applies three adjustments:

    * an image part whose ``detail`` is ``None`` gets ``"auto"``;
    * when ``resize_images_for_vlm`` is true, image URLs are passed through
      ``_resize_image_data_url``;
    * when the message contains at least one ``data:image/`` part, the first
      text part is made to carry ``_IMAGE_VISUAL_MEMORY_PROMPT``.

    Args:
        message: The chat message to patch.
        resize_images_for_vlm: Whether to rescale data-URL images.

    Returns:
        ``message`` itself when its content is not a list or nothing needed
        changing; otherwise a copy with the patched parts.
    """
    content = message.content
    if not isinstance(content, list):
        return message

    has_image = any(_is_image_part(part) for part in content)
    instructions_added = False
    changed = False
    patched_parts: list[object] = []
    for part in content:
        patched = part
        if isinstance(part, ImageUrlPart):
            image_url_updates: dict[str, object] = {}
            if part.image_url.detail is None:
                # Strict gateways reject a literal null for ``detail``.
                image_url_updates["detail"] = "auto"
            if resize_images_for_vlm:
                resized_url = _resize_image_data_url(part.image_url.url)
                if resized_url != part.image_url.url:
                    image_url_updates["url"] = resized_url
            if image_url_updates:
                image_url = part.image_url.model_copy(update=image_url_updates)
                patched = part.model_copy(update={"image_url": image_url})
                changed = True
        if has_image and not instructions_added and isinstance(patched, TextPart):
            if patched.text != _IMAGE_VISUAL_MEMORY_PROMPT:
                patched = patched.model_copy(
                    update={"text": _IMAGE_VISUAL_MEMORY_PROMPT}
                )
                changed = True
            # The first text part now carries the prompt, whether it was just
            # written or was already there. Marking it in both cases keeps
            # later text parts intact and makes re-patching a no-op.
            instructions_added = True
        patched_parts.append(patched)

    if not changed:
        return message
    return message.model_copy(update={"content": patched_parts})
def _is_image_part(part: object) -> bool:
    """Tell whether ``part`` is an image part carrying a ``data:image/`` URL."""
    if not isinstance(part, ImageUrlPart):
        return False
    return part.image_url.url.startswith("data:image/")
def _resize_image_data_url(url: str) -> str:
"""Resize base64 data-url images so the shorter side is 64 pixels."""
if not url.startswith("data:image/"):
return url
try:
header, encoded = url.split(",", 1)
except ValueError:
return url
if ";base64" not in header.lower():
return url
mime_type = header[5:].split(";", 1)[0].lower()
image_format = {
"image/jpeg": "JPEG",
"image/jpg": "JPEG",
"image/png": "PNG",
"image/webp": "WEBP",
}.get(mime_type)
if image_format is None:
return url
try:
from PIL import Image, ImageOps
raw = base64.b64decode(encoded, validate=True)
with Image.open(BytesIO(raw)) as image:
image = ImageOps.exif_transpose(image)
target_size = _image_size_with_min_side(
image.size, _VLM_IMAGE_MIN_SIDE
)
resized = image.resize(target_size, Image.Resampling.LANCZOS)
if image_format == "JPEG" and resized.mode not in ("RGB", "L"):
resized = resized.convert("RGB")
buffer = BytesIO()
save_kwargs: dict[str, object] = {"format": image_format}
if image_format == "JPEG":
save_kwargs["quality"] = 85
resized.save(buffer, **save_kwargs)
except (ImportError, ValueError, OSError, binascii.Error):
return url
resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
return f"{header},{resized_encoded}"
def _image_size_with_min_side(
size: tuple[int, int],
min_side: int,
) -> tuple[int, int]:
width, height = size
shortest = min(width, height)
if shortest <= 0:
return (max(1, width), max(1, height))
scale = min_side / shortest
return (max(1, round(width * scale)), max(1, round(height * scale)))
def get_llm_client() -> LLMClient:
    """Return the process-wide algo LLM client, building it on first use.

    The first successful call reads ``settings.llm``, builds the client,
    wraps it so requests default to no-thinking mode, and caches the result
    in the module; later calls return that cached instance.

    Raises:
        LLMNotConfiguredError: When ``settings.llm.api_key`` or
            ``settings.llm.base_url`` is unset.
    """
    global _llm_client
    if _llm_client is None:
        llm_cfg = load_settings().llm
        secret = llm_cfg.api_key
        api_key = None if secret is None else secret.get_secret_value()
        if not (api_key and llm_cfg.base_url):
            raise LLMNotConfiguredError(
                "LLM is required; set EVEROS_LLM__API_KEY + EVEROS_LLM__BASE_URL"
            )
        config = LLMConfig(
            model=llm_cfg.model,
            api_key=api_key,
            base_url=llm_cfg.base_url,
            timeout=llm_cfg.timeout_seconds,
        )
        _llm_client = _NoThinkingRequestDefaultsClient(build_client(config))
        logger.info("llm_client_built", model=llm_cfg.model)
    return _llm_client
def get_multimodal_llm_client() -> LLMClient:
    """Return the process-wide multimodal LLM client (for everalgo.parser).

    Configuration comes from the flat ``[multimodal]`` settings, which are
    kept apart from the main ``[llm]`` settings so parsing can target a
    vision/audio-capable endpoint. The client is built on the first
    successful call, wrapped in the image-compatibility adapter, and cached
    in the module for later calls.

    Raises:
        LLMNotConfiguredError: When ``settings.multimodal.api_key`` or
            ``settings.multimodal.base_url`` is unset.
    """
    global _multimodal_client
    if _multimodal_client is None:
        cfg = load_settings().multimodal
        secret = cfg.api_key
        api_key = None if secret is None else secret.get_secret_value()
        if not (api_key and cfg.base_url):
            raise LLMNotConfiguredError(
                "Multimodal LLM is required for parsing; set "
                "EVEROS_MULTIMODAL__API_KEY + EVEROS_MULTIMODAL__BASE_URL"
            )
        config = LLMConfig(
            model=cfg.model,
            api_key=api_key,
            base_url=cfg.base_url,
            timeout=cfg.timeout_seconds,
        )
        _multimodal_client = _MultimodalImageDetailCompatClient(
            build_client(config),
            resize_images_for_vlm=cfg.resize_images_for_vlm,
        )
        logger.info("multimodal_llm_client_built", model=cfg.model)
    return _multimodal_client