Some checks failed
CI / lint (push) Has been cancelled
CI / unit tests (push) Has been cancelled
CI / integration tests (push) Has been cancelled
CI / package build (push) Has been cancelled
Commit lint / pull request title (push) Has been cancelled
Commit lint / commit messages (push) Has been cancelled
310 lines
10 KiB
Python
310 lines
10 KiB
Python
"""Process-wide LLM client accessor.
|
|
|
|
Lazy singleton — first call reads settings and builds the algo LLM
|
|
client; subsequent calls return the cached instance. Raises
|
|
:class:`LLMNotConfiguredError` when no credentials are present so
|
|
misconfiguration surfaces at app startup (via the LLM lifespan
|
|
provider) instead of silently failing per-request downstream.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import binascii
|
|
from io import BytesIO
|
|
from typing import Any
|
|
|
|
from everalgo.llm import build_client
|
|
from everalgo.llm.config import LLMConfig
|
|
from everalgo.llm.protocols import LLMClient
|
|
from everalgo.llm.types import ChatMessage, ChatResponse, ImageUrlPart, TextPart
|
|
from pydantic import BaseModel
|
|
|
|
from everos.config import load_settings
|
|
from everos.core.observability.logging import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class LLMNotConfiguredError(RuntimeError):
    """Signal that an LLM client cannot be built from the current settings.

    Raised by the client accessors in this module when the relevant settings
    section (``settings.llm`` or ``settings.multimodal``) lacks an ``api_key``
    or a ``base_url``.
    """
|
|
|
|
|
|
# Cached client instances; ``None`` until the matching accessor builds them.
_llm_client: LLMClient | None = None
_multimodal_client: LLMClient | None = None
# Target length (pixels) of an image's shorter side when resizing for the VLM.
_VLM_IMAGE_MIN_SIDE = 1024
# Key inside ``extra_body`` that carries the no-thinking default.
_NO_THINKING_EXTRA_BODY_KEY = "chat_template_kwargs"
# Default applied under that key unless the caller already set a value.
_NO_THINKING_PARAM = {"enable_thinking": False}
|
|
|
|
_IMAGE_VISUAL_MEMORY_PROMPT = """Describe this image for visual memory retrieval.
|
|
|
|
Output final Markdown directly; do not include reasoning.
|
|
|
|
Focus on:
|
|
1. Key visible objects and their names, brands, colors, labels, quantities.
|
|
2. Spatial relationships and relative positions: left/right/above/below/center,
|
|
foreground/background, nearby objects, and supporting surfaces.
|
|
3. Location-query facts, e.g. "the milk carton is center-left, to the right of
|
|
X and to the left of Y".
|
|
4. Important visible text, but extract only useful labels/interface text; do
|
|
not exhaustively OCR every key or menu item if that would crowd out object
|
|
locations.
|
|
|
|
Do NOT describe the parser, assistant, or ChatGPT as processing the image.
|
|
If "ChatGPT" is visible, list it only as visible interface text.
|
|
"""
|
|
|
|
|
|
class _NoThinkingRequestDefaultsClient:
    """Client wrapper that adds no-thinking defaults to every chat request.

    Intended for OpenAI-compatible servers: the extra keyword arguments of
    each ``chat`` call pass through ``_with_no_thinking_defaults`` before the
    call is delegated to the wrapped client.
    """

    def __init__(self, inner: LLMClient) -> None:
        # The wrapped client that actually performs the request.
        self._inner = inner

    async def chat(
        self,
        messages: list[ChatMessage],
        *,
        model: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        response_format: type[BaseModel] | None = None,
        **extra: Any,
    ) -> ChatResponse:
        """Delegate to the wrapped client with no-thinking defaults applied.

        All named arguments are forwarded unchanged; only ``extra`` is
        augmented, and values the caller already supplied are kept.
        """
        forwarded_extra = _with_no_thinking_defaults(extra)
        return await self._inner.chat(
            messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            **forwarded_extra,
        )
|
|
|
|
|
|
class _MultimodalImageDetailCompatClient:
    """Client wrapper that normalises image parts for strict gateways.

    everalgo-core 0.2.0 serialises ``image_url.detail`` as ``None`` when the
    field is unset, and some OpenAI-compatible gateways reject that literal
    null instead of one of OpenAI's enum values. Each outgoing message is
    therefore passed through ``_with_multimodal_image_defaults`` (which also
    handles optional image resizing and the visual-memory prompt), and the
    request extras through ``_with_no_thinking_defaults``. EverOS only uses
    this wrapper for multimodal parsing.
    """

    def __init__(self, inner: LLMClient, *, resize_images_for_vlm: bool) -> None:
        # The wrapped client that actually performs the request.
        self._inner = inner
        # Whether inline images are resized before being sent.
        self._resize_images_for_vlm = resize_images_for_vlm

    async def chat(
        self,
        messages: list[ChatMessage],
        *,
        model: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        response_format: type[BaseModel] | None = None,
        **extra: Any,
    ) -> ChatResponse:
        """Patch every message, then delegate to the wrapped client."""
        patched_messages = [
            _with_multimodal_image_defaults(
                message,
                resize_images_for_vlm=self._resize_images_for_vlm,
            )
            for message in messages
        ]
        request_extra = _with_no_thinking_defaults(extra)
        return await self._inner.chat(
            patched_messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            **request_extra,
        )
|
|
|
|
|
|
def _with_no_thinking_defaults(extra: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *extra* whose ``extra_body`` disables thinking by default.

    The default is written under
    ``extra_body[_NO_THINKING_EXTRA_BODY_KEY]["enable_thinking"]`` only when
    the caller has not set that key. The input mapping and the two nested
    levels touched here are copied rather than mutated.
    """
    # ``or {}`` treats a missing or falsy (e.g. ``None``) value as empty.
    body = dict(extra.get("extra_body") or {})
    template_kwargs = dict(body.get(_NO_THINKING_EXTRA_BODY_KEY) or {})
    if "enable_thinking" not in template_kwargs:
        template_kwargs["enable_thinking"] = _NO_THINKING_PARAM["enable_thinking"]
    body[_NO_THINKING_EXTRA_BODY_KEY] = template_kwargs
    return {**extra, "extra_body": body}
|
|
|
|
|
|
def _with_multimodal_image_defaults(
|
|
message: ChatMessage, *, resize_images_for_vlm: bool = True
|
|
) -> ChatMessage:
|
|
"""Return a copy with stricter-gateway + visual-memory image defaults."""
|
|
content = message.content
|
|
if not isinstance(content, list):
|
|
return message
|
|
|
|
has_image = any(_is_image_part(part) for part in content)
|
|
instructions_added = False
|
|
changed = False
|
|
patched_parts: list[object] = []
|
|
for part in content:
|
|
patched = part
|
|
if isinstance(part, ImageUrlPart):
|
|
image_url_updates: dict[str, object] = {}
|
|
if part.image_url.detail is None:
|
|
image_url_updates["detail"] = "auto"
|
|
if resize_images_for_vlm:
|
|
resized_url = _resize_image_data_url(part.image_url.url)
|
|
if resized_url != part.image_url.url:
|
|
image_url_updates["url"] = resized_url
|
|
if image_url_updates:
|
|
image_url = part.image_url.model_copy(update=image_url_updates)
|
|
patched = part.model_copy(update={"image_url": image_url})
|
|
changed = True
|
|
if (
|
|
has_image
|
|
and not instructions_added
|
|
and isinstance(patched, TextPart)
|
|
and patched.text != _IMAGE_VISUAL_MEMORY_PROMPT
|
|
):
|
|
patched = patched.model_copy(
|
|
update={"text": _IMAGE_VISUAL_MEMORY_PROMPT}
|
|
)
|
|
instructions_added = True
|
|
changed = True
|
|
patched_parts.append(patched)
|
|
|
|
if not changed:
|
|
return message
|
|
return message.model_copy(update={"content": patched_parts})
|
|
|
|
|
|
def _is_image_part(part: object) -> bool:
    """Return True when *part* is an image part with a ``data:image/`` URL."""
    if not isinstance(part, ImageUrlPart):
        return False
    return part.image_url.url.startswith("data:image/")
|
|
|
|
|
|
def _resize_image_data_url(url: str) -> str:
|
|
"""Resize base64 data-url images so the shorter side is 64 pixels."""
|
|
if not url.startswith("data:image/"):
|
|
return url
|
|
try:
|
|
header, encoded = url.split(",", 1)
|
|
except ValueError:
|
|
return url
|
|
if ";base64" not in header.lower():
|
|
return url
|
|
|
|
mime_type = header[5:].split(";", 1)[0].lower()
|
|
image_format = {
|
|
"image/jpeg": "JPEG",
|
|
"image/jpg": "JPEG",
|
|
"image/png": "PNG",
|
|
"image/webp": "WEBP",
|
|
}.get(mime_type)
|
|
if image_format is None:
|
|
return url
|
|
|
|
try:
|
|
from PIL import Image, ImageOps
|
|
|
|
raw = base64.b64decode(encoded, validate=True)
|
|
with Image.open(BytesIO(raw)) as image:
|
|
image = ImageOps.exif_transpose(image)
|
|
target_size = _image_size_with_min_side(
|
|
image.size, _VLM_IMAGE_MIN_SIDE
|
|
)
|
|
resized = image.resize(target_size, Image.Resampling.LANCZOS)
|
|
if image_format == "JPEG" and resized.mode not in ("RGB", "L"):
|
|
resized = resized.convert("RGB")
|
|
buffer = BytesIO()
|
|
save_kwargs: dict[str, object] = {"format": image_format}
|
|
if image_format == "JPEG":
|
|
save_kwargs["quality"] = 85
|
|
resized.save(buffer, **save_kwargs)
|
|
except (ImportError, ValueError, OSError, binascii.Error):
|
|
return url
|
|
|
|
resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
|
|
return f"{header},{resized_encoded}"
|
|
|
|
|
|
def _image_size_with_min_side(
|
|
size: tuple[int, int],
|
|
min_side: int,
|
|
) -> tuple[int, int]:
|
|
width, height = size
|
|
shortest = min(width, height)
|
|
if shortest <= 0:
|
|
return (max(1, width), max(1, height))
|
|
scale = min_side / shortest
|
|
return (max(1, round(width * scale)), max(1, round(height * scale)))
|
|
|
|
|
|
def get_llm_client() -> LLMClient:
    """Return the process-wide algo LLM client, building it on first use.

    The first call loads settings, builds the client and wraps it in
    ``_NoThinkingRequestDefaultsClient``; later calls return the cached
    instance.

    Raises:
        LLMNotConfiguredError: When ``settings.llm.api_key`` or
            ``settings.llm.base_url`` is unset.
    """
    global _llm_client
    if _llm_client is None:
        settings = load_settings().llm
        secret = settings.api_key
        api_key = None if secret is None else secret.get_secret_value()
        if not (api_key and settings.base_url):
            raise LLMNotConfiguredError(
                "LLM is required; set EVEROS_LLM__API_KEY + EVEROS_LLM__BASE_URL"
            )
        config = LLMConfig(
            model=settings.model,
            api_key=api_key,
            base_url=settings.base_url,
            timeout=settings.timeout_seconds,
        )
        _llm_client = _NoThinkingRequestDefaultsClient(build_client(config))
        logger.info("llm_client_built", model=settings.model)
    return _llm_client
|
|
|
|
|
|
def get_multimodal_llm_client() -> LLMClient:
    """Return the process-wide multimodal LLM client (for everalgo.parser).

    Configuration comes from the flat ``[multimodal]`` section, which is kept
    apart from the main ``[llm]`` section so parsing can target a
    vision/audio-capable endpoint. The client is built on the first call,
    wrapped in ``_MultimodalImageDetailCompatClient`` and cached.

    Raises:
        LLMNotConfiguredError: When ``settings.multimodal.api_key`` or
            ``settings.multimodal.base_url`` is unset.
    """
    global _multimodal_client
    if _multimodal_client is None:
        settings = load_settings().multimodal
        secret = settings.api_key
        api_key = None if secret is None else secret.get_secret_value()
        if not (api_key and settings.base_url):
            raise LLMNotConfiguredError(
                "Multimodal LLM is required for parsing; set "
                "EVEROS_MULTIMODAL__API_KEY + EVEROS_MULTIMODAL__BASE_URL"
            )
        config = LLMConfig(
            model=settings.model,
            api_key=api_key,
            base_url=settings.base_url,
            timeout=settings.timeout_seconds,
        )
        _multimodal_client = _MultimodalImageDetailCompatClient(
            build_client(config),
            resize_images_for_vlm=settings.resize_images_for_vlm,
        )
        logger.info("multimodal_llm_client_built", model=settings.model)
    return _multimodal_client
|