Save local modifications for syncing
Some checks failed
CI / lint (push) Has been cancelled
CI / unit tests (push) Has been cancelled
CI / integration tests (push) Has been cancelled
CI / package build (push) Has been cancelled
Commit lint / pull request title (push) Has been cancelled
Commit lint / commit messages (push) Has been cancelled
Some checks failed
CI / lint (push) Has been cancelled
CI / unit tests (push) Has been cancelled
CI / integration tests (push) Has been cancelled
CI / package build (push) Has been cancelled
Commit lint / pull request title (push) Has been cancelled
Commit lint / commit messages (push) Has been cancelled
This commit is contained in:
@ -9,9 +9,16 @@ provider) instead of silently failing per-request downstream.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
from everalgo.llm import build_client
|
||||
from everalgo.llm.config import LLMConfig
|
||||
from everalgo.llm.protocols import LLMClient
|
||||
from everalgo.llm.types import ChatMessage, ChatResponse, ImageUrlPart, TextPart
|
||||
from pydantic import BaseModel
|
||||
|
||||
from everos.config import load_settings
|
||||
from everos.core.observability.logging import get_logger
|
||||
@ -25,6 +32,212 @@ class LLMNotConfiguredError(RuntimeError):
|
||||
|
||||
# Module-level slot for the general-purpose client; assigned inside
# get_llm_client(). NOTE(review): presumably reused across calls — the
# guard that decides when to build it is not visible in this view.
_llm_client: LLMClient | None = None
|
||||
# Module-level slot for the multimodal (parsing) client; assigned and
# returned by get_multimodal_llm_client().
_multimodal_client: LLMClient | None = None
|
||||
# Pixel length passed as ``min_side`` when _resize_image_data_url() computes
# the target size: the shorter image side is scaled to this value.
_VLM_IMAGE_MIN_SIDE = 1024
|
||||
# Key inside the request's ``extra_body`` dict under which the no-thinking
# flag is placed by _with_no_thinking_defaults().
_NO_THINKING_EXTRA_BODY_KEY = "chat_template_kwargs"
|
||||
# Default applied when the caller has not set ``enable_thinking`` itself.
_NO_THINKING_PARAM = {"enable_thinking": False}
|
||||
|
||||
_IMAGE_VISUAL_MEMORY_PROMPT = """Describe this image for visual memory retrieval.
|
||||
|
||||
Output final Markdown directly; do not include reasoning.
|
||||
|
||||
Focus on:
|
||||
1. Key visible objects and their names, brands, colors, labels, quantities.
|
||||
2. Spatial relationships and relative positions: left/right/above/below/center,
|
||||
foreground/background, nearby objects, and supporting surfaces.
|
||||
3. Location-query facts, e.g. "the milk carton is center-left, to the right of
|
||||
X and to the left of Y".
|
||||
4. Important visible text, but extract only useful labels/interface text; do
|
||||
not exhaustively OCR every key or menu item if that would crowd out object
|
||||
locations.
|
||||
|
||||
Do NOT describe the parser, assistant, or ChatGPT as processing the image.
|
||||
If "ChatGPT" is visible, list it only as visible interface text.
|
||||
"""
|
||||
|
||||
|
||||
class _NoThinkingRequestDefaultsClient:
    """Client wrapper that adds no-thinking defaults to every chat request.

    Calls are forwarded to the wrapped client unchanged, except that the
    extra keyword arguments are first passed through
    ``_with_no_thinking_defaults``.
    """

    def __init__(self, inner: LLMClient) -> None:
        # The wrapped client that actually performs the request.
        self._inner = inner

    async def chat(
        self,
        messages: list[ChatMessage],
        *,
        model: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        response_format: type[BaseModel] | None = None,
        **extra: Any,
    ) -> ChatResponse:
        """Forward the request to the wrapped client with patched extras.

        Args:
            messages: Conversation to send, passed through untouched.
            model: Optional model override, passed through.
            temperature: Optional sampling temperature, passed through.
            max_tokens: Optional completion-length cap, passed through.
            response_format: Optional structured-output model, passed through.
            **extra: Additional request kwargs; ``extra_body`` gains the
                no-thinking default unless the caller already set it.

        Returns:
            Whatever the wrapped client's ``chat`` returns.
        """
        request_extras = _with_no_thinking_defaults(extra)
        response = await self._inner.chat(
            messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            **request_extras,
        )
        return response
|
||||
|
||||
|
||||
class _MultimodalImageDetailCompatClient:
    """Client wrapper that normalises image parts for strict gateways.

    everalgo-core 0.2.0 serialises ``image_url.detail`` as ``None`` when the
    field is unset. Some OpenAI-compatible gateways reject that literal null
    and insist on one of OpenAI's enum values, so each outgoing message is
    passed through ``_with_multimodal_image_defaults`` first. The extra
    request kwargs also receive the no-thinking defaults. EverOS only uses
    this wrapper for multimodal parsing.
    """

    def __init__(self, inner: LLMClient, *, resize_images_for_vlm: bool) -> None:
        # The wrapped client that actually performs the request.
        self._inner = inner
        # Forwarded to the per-message patcher on every call.
        self._resize_images_for_vlm = resize_images_for_vlm

    async def chat(
        self,
        messages: list[ChatMessage],
        *,
        model: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        response_format: type[BaseModel] | None = None,
        **extra: Any,
    ) -> ChatResponse:
        """Patch every message, then forward the request to the wrapped client.

        Args:
            messages: Conversation to send; each entry is patched individually.
            model: Optional model override, passed through.
            temperature: Optional sampling temperature, passed through.
            max_tokens: Optional completion-length cap, passed through.
            response_format: Optional structured-output model, passed through.
            **extra: Additional request kwargs; ``extra_body`` gains the
                no-thinking default unless the caller already set it.

        Returns:
            Whatever the wrapped client's ``chat`` returns.
        """
        resize = self._resize_images_for_vlm
        outgoing = [
            _with_multimodal_image_defaults(msg, resize_images_for_vlm=resize)
            for msg in messages
        ]
        request_extras = _with_no_thinking_defaults(extra)
        response = await self._inner.chat(
            outgoing,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            **request_extras,
        )
        return response
|
||||
|
||||
|
||||
def _with_no_thinking_defaults(extra: dict[str, Any]) -> dict[str, Any]:
    """Build request kwargs that disable thinking unless the caller chose otherwise.

    Args:
        extra: Extra keyword arguments destined for the client's ``chat``.
            Neither this dict nor the nested dicts it holds are mutated.

    Returns:
        A new dict equal to ``extra`` whose ``extra_body`` entry contains the
        chat-template kwargs with ``enable_thinking`` filled in. A value the
        caller already supplied for ``enable_thinking`` is kept.
    """
    # Copy each nesting level so the caller's dictionaries stay untouched;
    # a missing or falsy level is treated as empty.
    body = dict(extra.get("extra_body") or {})
    template_kwargs = dict(body.get(_NO_THINKING_EXTRA_BODY_KEY) or {})

    if "enable_thinking" not in template_kwargs:
        template_kwargs["enable_thinking"] = _NO_THINKING_PARAM["enable_thinking"]

    body[_NO_THINKING_EXTRA_BODY_KEY] = template_kwargs
    return {**extra, "extra_body": body}
|
||||
|
||||
|
||||
def _with_multimodal_image_defaults(
    message: ChatMessage, *, resize_images_for_vlm: bool = True
) -> ChatMessage:
    """Return *message* with strict-gateway and visual-memory image defaults.

    For list-style content this:

    * sets ``image_url.detail`` to ``"auto"`` on image parts where it is
      ``None``;
    * optionally rewrites image URLs through ``_resize_image_data_url``;
    * when the message carries at least one inline ``data:image/`` part,
      makes the first text part hold ``_IMAGE_VISUAL_MEMORY_PROMPT``.

    The prompt is placed at most once. A first text part that already equals
    the prompt counts as placed, so running this function again on its own
    output leaves later text parts alone.

    Args:
        message: Chat message to patch. It is never mutated.
        resize_images_for_vlm: Whether image data URLs are passed through
            ``_resize_image_data_url``.

    Returns:
        The original ``message`` object when its content is not a list or
        nothing needed changing; otherwise a copy with the patched parts.
    """
    content = message.content
    if not isinstance(content, list):
        return message

    has_image = any(_is_image_part(part) for part in content)
    prompt_placed = False
    changed = False
    patched_parts: list[object] = []

    for part in content:
        patched = part

        if isinstance(part, ImageUrlPart):
            image_url_updates: dict[str, object] = {}
            if part.image_url.detail is None:
                # Strict gateways reject a literal null here.
                image_url_updates["detail"] = "auto"
            if resize_images_for_vlm:
                resized_url = _resize_image_data_url(part.image_url.url)
                if resized_url != part.image_url.url:
                    image_url_updates["url"] = resized_url
            if image_url_updates:
                image_url = part.image_url.model_copy(update=image_url_updates)
                patched = part.model_copy(update={"image_url": image_url})
                changed = True

        if has_image and not prompt_placed and isinstance(patched, TextPart):
            if patched.text != _IMAGE_VISUAL_MEMORY_PROMPT:
                patched = patched.model_copy(
                    update={"text": _IMAGE_VISUAL_MEMORY_PROMPT}
                )
                changed = True
            # The first text part is the prompt slot whether or not it had to
            # be rewritten; without this an already-patched message would get
            # its next text part overwritten on a second pass.
            prompt_placed = True

        patched_parts.append(patched)

    if not changed:
        return message
    return message.model_copy(update={"content": patched_parts})
|
||||
|
||||
|
||||
def _is_image_part(part: object) -> bool:
    """Tell whether *part* is an ``ImageUrlPart`` holding an inline image.

    Only URLs that start with ``data:image/`` count; any other object or
    URL yields ``False``.
    """
    if not isinstance(part, ImageUrlPart):
        return False
    return part.image_url.url.startswith("data:image/")
|
||||
|
||||
|
||||
def _resize_image_data_url(url: str) -> str:
    """Rescale a base64 image data URL so its shorter side is ``_VLM_IMAGE_MIN_SIDE``.

    The image is scaled up or down, keeping its aspect ratio, and re-encoded
    in its original format (JPEG at quality 85). EXIF orientation is applied
    before resizing.

    This is best-effort: the input is returned unchanged when it is not a
    ``data:image/`` URL, is not base64-encoded, uses a MIME type other than
    JPEG/PNG/WebP, when Pillow is not installed, or when the payload cannot
    be decoded or processed (including Pillow's decompression-bomb guard).

    Args:
        url: Image URL, possibly a ``data:image/...;base64,...`` URL.

    Returns:
        A data URL with the same header and the resized payload, or ``url``
        itself when no resize was performed.
    """
    if not url.startswith("data:image/"):
        return url

    header, separator, encoded = url.partition(",")
    if not separator or ";base64" not in header.lower():
        return url

    # header looks like "data:<mime>[;param...]"; strip the "data:" prefix.
    mime_type = header[5:].split(";", 1)[0].lower()
    image_format = {
        "image/jpeg": "JPEG",
        "image/jpg": "JPEG",
        "image/png": "PNG",
        "image/webp": "WEBP",
    }.get(mime_type)
    if image_format is None:
        return url

    # Imported lazily, and in its own try block so the except clause below
    # can reference Image safely.
    try:
        from PIL import Image, ImageOps
    except ImportError:
        return url

    try:
        raw = base64.b64decode(encoded, validate=True)
        with Image.open(BytesIO(raw)) as image:
            upright = ImageOps.exif_transpose(image)
            target_size = _image_size_with_min_side(
                upright.size, _VLM_IMAGE_MIN_SIDE
            )
            resized = upright.resize(target_size, Image.Resampling.LANCZOS)

        # JPEG cannot store alpha or palette modes.
        if image_format == "JPEG" and resized.mode not in ("RGB", "L"):
            resized = resized.convert("RGB")

        buffer = BytesIO()
        save_kwargs: dict[str, object] = {"format": image_format}
        if image_format == "JPEG":
            save_kwargs["quality"] = 85
        resized.save(buffer, **save_kwargs)
    except (
        ValueError,
        OSError,
        binascii.Error,
        # Derives directly from Exception, so it must be listed explicitly
        # to keep oversized images on the fallback path.
        Image.DecompressionBombError,
    ):
        return url

    resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return f"{header},{resized_encoded}"
|
||||
|
||||
|
||||
def _image_size_with_min_side(
    size: tuple[int, int],
    min_side: int,
) -> tuple[int, int]:
    """Scale ``(width, height)`` so the shorter side equals *min_side*.

    Both dimensions are multiplied by the same factor and rounded, and
    neither result drops below 1. When the shorter side is zero or negative
    no scaling is attempted; the input is only clamped to at least 1 per
    dimension.
    """
    width, height = size
    shorter = min(size)

    if shorter > 0:
        factor = min_side / shorter
        scaled_width = round(width * factor)
        scaled_height = round(height * factor)
        return (max(1, scaled_width), max(1, scaled_height))

    # Degenerate input: there is no meaningful ratio to scale by.
    return (max(1, width), max(1, height))
|
||||
|
||||
|
||||
def get_llm_client() -> LLMClient:
|
||||
@ -46,11 +259,14 @@ def get_llm_client() -> LLMClient:
|
||||
raise LLMNotConfiguredError(
|
||||
"LLM is required; set EVEROS_LLM__API_KEY + EVEROS_LLM__BASE_URL"
|
||||
)
|
||||
_llm_client = build_client(
|
||||
LLMConfig(
|
||||
model=llm_cfg.model,
|
||||
api_key=api_key,
|
||||
base_url=llm_cfg.base_url,
|
||||
_llm_client = _NoThinkingRequestDefaultsClient(
|
||||
build_client(
|
||||
LLMConfig(
|
||||
model=llm_cfg.model,
|
||||
api_key=api_key,
|
||||
base_url=llm_cfg.base_url,
|
||||
timeout=llm_cfg.timeout_seconds,
|
||||
)
|
||||
)
|
||||
)
|
||||
logger.info("llm_client_built", model=llm_cfg.model)
|
||||
@ -78,12 +294,16 @@ def get_multimodal_llm_client() -> LLMClient:
|
||||
"Multimodal LLM is required for parsing; set "
|
||||
"EVEROS_MULTIMODAL__API_KEY + EVEROS_MULTIMODAL__BASE_URL"
|
||||
)
|
||||
_multimodal_client = build_client(
|
||||
LLMConfig(
|
||||
model=cfg.model,
|
||||
api_key=api_key,
|
||||
base_url=cfg.base_url,
|
||||
)
|
||||
_multimodal_client = _MultimodalImageDetailCompatClient(
|
||||
build_client(
|
||||
LLMConfig(
|
||||
model=cfg.model,
|
||||
api_key=api_key,
|
||||
base_url=cfg.base_url,
|
||||
timeout=cfg.timeout_seconds,
|
||||
)
|
||||
),
|
||||
resize_images_for_vlm=cfg.resize_images_for_vlm,
|
||||
)
|
||||
logger.info("multimodal_llm_client_built", model=cfg.model)
|
||||
return _multimodal_client
|
||||
|
||||
Reference in New Issue
Block a user