Save local modifications for syncing

2026-06-10 10:05:52 +08:00
parent 9fc6ad20d2
commit 0910affc78
13 changed files with 738 additions and 24 deletions
--- a/src/everos/component/llm/client.py
+++ b/src/everos/component/llm/client.py
@ -9,9 +9,16 @@ provider) instead of silently failing per-request downstream.

 from __future__ import annotations

+import base64
+import binascii
+from io import BytesIO
+from typing import Any
+
 from everalgo.llm import build_client
 from everalgo.llm.config import LLMConfig
 from everalgo.llm.protocols import LLMClient
+from everalgo.llm.types import ChatMessage, ChatResponse, ImageUrlPart, TextPart
+from pydantic import BaseModel

 from everos.config import load_settings
 from everos.core.observability.logging import get_logger
@ -25,6 +32,212 @@ class LLMNotConfiguredError(RuntimeError):

 _llm_client: LLMClient | None = None
 _multimodal_client: LLMClient | None = None
+# Length, in pixels, that ``_resize_image_data_url`` asks for on the shorter
+# side of a resized image.
+_VLM_IMAGE_MIN_SIDE = 1024
+# Key inside a request's ``extra_body`` under which
+# ``_with_no_thinking_defaults`` merges the no-thinking flag.
+_NO_THINKING_EXTRA_BODY_KEY = "chat_template_kwargs"
+# Value used for ``enable_thinking`` when the caller has not set it.
+_NO_THINKING_PARAM = {"enable_thinking": False}
+
+# Instruction text that ``_with_multimodal_image_defaults`` substitutes for
+# the text of a text part in messages that carry a ``data:image/`` part.
+_IMAGE_VISUAL_MEMORY_PROMPT = """Describe this image for visual memory retrieval.
+
+Output final Markdown directly; do not include reasoning.
+
+Focus on:
+1. Key visible objects and their names, brands, colors, labels, quantities.
+2. Spatial relationships and relative positions: left/right/above/below/center,
+   foreground/background, nearby objects, and supporting surfaces.
+3. Location-query facts, e.g. "the milk carton is center-left, to the right of
+   X and to the left of Y".
+4. Important visible text, but extract only useful labels/interface text; do
+   not exhaustively OCR every key or menu item if that would crowd out object
+   locations.
+
+Do NOT describe the parser, assistant, or ChatGPT as processing the image.
+If "ChatGPT" is visible, list it only as visible interface text.
+"""
+
+
+class _NoThinkingRequestDefaultsClient:
+    """Wrap an ``LLMClient`` so each ``chat`` call defaults to no-thinking.
+
+    The caller's extra keyword arguments pass through
+    ``_with_no_thinking_defaults`` before being forwarded; every other
+    argument reaches the wrapped client exactly as given.
+    """
+
+    def __init__(self, inner: LLMClient) -> None:
+        self._inner = inner
+
+    async def chat(
+        self,
+        messages: list[ChatMessage],
+        *,
+        model: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        response_format: type[BaseModel] | None = None,
+        **extra: Any,
+    ) -> ChatResponse:
+        """Forward the request to the wrapped client with patched extras."""
+        request_extras = _with_no_thinking_defaults(extra)
+        response = await self._inner.chat(
+            messages,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            response_format=response_format,
+            **request_extras,
+        )
+        return response
+
+
+class _MultimodalImageDetailCompatClient:
+    """Adapt multimodal chat requests for strict OpenAI-compatible gateways.
+
+    everalgo-core 0.2.0 serialises ``image_url.detail`` as ``None`` when the
+    field is unset, and some gateways reject that literal null instead of one
+    of OpenAI's enum values. Each message is therefore rewritten by
+    ``_with_multimodal_image_defaults`` before it is sent, and the extra
+    keyword arguments go through ``_with_no_thinking_defaults``. EverOS only
+    uses this wrapper for multimodal parsing.
+    """
+
+    def __init__(self, inner: LLMClient, *, resize_images_for_vlm: bool) -> None:
+        self._inner = inner
+        self._resize_images_for_vlm = resize_images_for_vlm
+
+    async def chat(
+        self,
+        messages: list[ChatMessage],
+        *,
+        model: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        response_format: type[BaseModel] | None = None,
+        **extra: Any,
+    ) -> ChatResponse:
+        """Patch every message, then forward the request to the wrapped client."""
+        patched_messages = [
+            _with_multimodal_image_defaults(
+                message,
+                resize_images_for_vlm=self._resize_images_for_vlm,
+            )
+            for message in messages
+        ]
+        request_extras = _with_no_thinking_defaults(extra)
+        response = await self._inner.chat(
+            patched_messages,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            response_format=response_format,
+            **request_extras,
+        )
+        return response
+
+
+def _with_no_thinking_defaults(extra: dict[str, Any]) -> dict[str, Any]:
+    """Build request kwargs that disable thinking unless the caller chose.
+
+    ``extra`` and its nested ``extra_body`` / template-kwargs mappings are
+    copied rather than mutated. An ``enable_thinking`` value already supplied
+    by the caller is kept as is.
+    """
+    body = dict(extra.get("extra_body") or {})
+    template_kwargs = dict(body.get(_NO_THINKING_EXTRA_BODY_KEY) or {})
+    if "enable_thinking" not in template_kwargs:
+        template_kwargs["enable_thinking"] = _NO_THINKING_PARAM["enable_thinking"]
+    body[_NO_THINKING_EXTRA_BODY_KEY] = template_kwargs
+    return {**extra, "extra_body": body}
+
+
+def _with_multimodal_image_defaults(
+    message: ChatMessage, *, resize_images_for_vlm: bool = True
+) -> ChatMessage:
+    """Return ``message`` with gateway-safe, visual-memory image defaults.
+
+    Messages whose ``content`` is not a list are returned untouched. For list
+    content:
+
+    * every ``ImageUrlPart`` whose ``image_url.detail`` is ``None`` gets
+      ``detail="auto"``;
+    * when ``resize_images_for_vlm`` is true, each image URL is passed through
+      ``_resize_image_data_url`` and replaced if the result differs;
+    * if the content holds at least one ``data:image/`` part, the text of the
+      first ``TextPart`` is replaced by ``_IMAGE_VISUAL_MEMORY_PROMPT``.
+
+    Only the first ``TextPart`` is ever treated as the instruction slot, so
+    re-applying this function to its own output leaves later text parts
+    intact.
+
+    Args:
+        message: Chat message to patch; it is never mutated.
+        resize_images_for_vlm: Whether image URLs are run through
+            ``_resize_image_data_url``.
+
+    Returns:
+        The original ``message`` object when nothing needed patching,
+        otherwise a ``model_copy`` carrying the patched parts.
+    """
+    content = message.content
+    if not isinstance(content, list):
+        return message
+
+    has_image = any(_is_image_part(part) for part in content)
+    instruction_slot_seen = False
+    changed = False
+    patched_parts: list[object] = []
+    for part in content:
+        patched = part
+        if isinstance(part, ImageUrlPart):
+            image_url_updates: dict[str, object] = {}
+            if part.image_url.detail is None:
+                image_url_updates["detail"] = "auto"
+            if resize_images_for_vlm:
+                resized_url = _resize_image_data_url(part.image_url.url)
+                if resized_url != part.image_url.url:
+                    image_url_updates["url"] = resized_url
+            if image_url_updates:
+                image_url = part.image_url.model_copy(update=image_url_updates)
+                patched = part.model_copy(update={"image_url": image_url})
+                changed = True
+        if (
+            has_image
+            and not instruction_slot_seen
+            and isinstance(patched, TextPart)
+        ):
+            # Claim the slot even when the text already equals the prompt;
+            # otherwise the next text part would be overwritten instead.
+            instruction_slot_seen = True
+            if patched.text != _IMAGE_VISUAL_MEMORY_PROMPT:
+                patched = patched.model_copy(
+                    update={"text": _IMAGE_VISUAL_MEMORY_PROMPT}
+                )
+                changed = True
+        patched_parts.append(patched)
+
+    if not changed:
+        return message
+    return message.model_copy(update={"content": patched_parts})
+
+
+def _is_image_part(part: object) -> bool:
+    """Tell whether ``part`` is an ``ImageUrlPart`` with a ``data:image/`` URL."""
+    if not isinstance(part, ImageUrlPart):
+        return False
+    return part.image_url.url.startswith("data:image/")
+
+
+def _resize_image_data_url(url: str) -> str:
+    """Resize a base64 image data URL so its shorter side is the VLM target.
+
+    The target is ``_VLM_IMAGE_MIN_SIDE`` pixels. The image is decoded, its
+    EXIF orientation is applied, it is resampled with LANCZOS (up or down) and
+    re-encoded in its original format under the original data-URL header.
+    JPEG output uses quality 85 and is converted to RGB when its mode is
+    neither ``RGB`` nor ``L``.
+
+    This is best-effort: ``url`` is returned unchanged when it is not a
+    ``data:image/`` URL, has no ``,`` separator, is not base64-encoded, uses a
+    MIME type other than JPEG/PNG/WebP, when Pillow is not installed, or when
+    decoding or encoding fails.
+
+    Args:
+        url: Image URL; anything other than a base64 data URL passes through.
+
+    Returns:
+        A data URL holding the resized image, or ``url`` itself on fallback.
+    """
+    if not url.startswith("data:image/"):
+        return url
+    try:
+        header, encoded = url.split(",", 1)
+    except ValueError:
+        return url
+    if ";base64" not in header.lower():
+        return url
+
+    # ``header`` starts with "data:", so dropping 5 characters leaves the MIME.
+    mime_type = header[5:].split(";", 1)[0].lower()
+    image_format = {
+        "image/jpeg": "JPEG",
+        "image/jpg": "JPEG",
+        "image/png": "PNG",
+        "image/webp": "WEBP",
+    }.get(mime_type)
+    if image_format is None:
+        return url
+
+    # Imported on its own so the except clause below can name Pillow's
+    # exception type without touching an unbound ``Image``.
+    try:
+        from PIL import Image, ImageOps
+    except ImportError:
+        return url
+
+    try:
+        raw = base64.b64decode(encoded, validate=True)
+        with Image.open(BytesIO(raw)) as image:
+            image = ImageOps.exif_transpose(image)
+            # NOTE(review): the scale comes from the shorter side only, so a
+            # very elongated image yields a very large target size.
+            target_size = _image_size_with_min_side(
+                image.size, _VLM_IMAGE_MIN_SIDE
+            )
+            resized = image.resize(target_size, Image.Resampling.LANCZOS)
+            if image_format == "JPEG" and resized.mode not in ("RGB", "L"):
+                resized = resized.convert("RGB")
+            buffer = BytesIO()
+            save_kwargs: dict[str, object] = {"format": image_format}
+            if image_format == "JPEG":
+                save_kwargs["quality"] = 85
+            resized.save(buffer, **save_kwargs)
+    except (
+        ImportError,
+        ValueError,
+        OSError,
+        binascii.Error,
+        # Derives from Exception, so the other entries do not cover it.
+        Image.DecompressionBombError,
+    ):
+        return url
+
+    resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
+    return f"{header},{resized_encoded}"
+
+
+def _image_size_with_min_side(
+    size: tuple[int, int],
+    min_side: int,
+) -> tuple[int, int]:
+    """Scale ``size`` proportionally so its shorter side becomes ``min_side``.
+
+    Both dimensions are rounded and clamped to at least 1. When the shorter
+    side is zero or negative no scaling is attempted; the input dimensions
+    are only clamped to at least 1.
+    """
+    width, height = size
+    shorter = width if width < height else height
+    if shorter <= 0:
+        return (max(1, width), max(1, height))
+    factor = min_side / shorter
+    scaled_width = round(width * factor)
+    scaled_height = round(height * factor)
+    return (max(1, scaled_width), max(1, scaled_height))


 def get_llm_client() -> LLMClient:
@ -46,11 +259,14 @@ def get_llm_client() -> LLMClient:
        raise LLMNotConfiguredError(
            "LLM is required; set EVEROS_LLM__API_KEY + EVEROS_LLM__BASE_URL"
        )
-    _llm_client = build_client(
-        LLMConfig(
-            model=llm_cfg.model,
-            api_key=api_key,
-            base_url=llm_cfg.base_url,
+    _llm_client = _NoThinkingRequestDefaultsClient(
+        build_client(
+            LLMConfig(
+                model=llm_cfg.model,
+                api_key=api_key,
+                base_url=llm_cfg.base_url,
+                timeout=llm_cfg.timeout_seconds,
+            )
        )
    )
    logger.info("llm_client_built", model=llm_cfg.model)
@ -78,12 +294,16 @@ def get_multimodal_llm_client() -> LLMClient:
            "Multimodal LLM is required for parsing; set "
            "EVEROS_MULTIMODAL__API_KEY + EVEROS_MULTIMODAL__BASE_URL"
        )
-    _multimodal_client = build_client(
-        LLMConfig(
-            model=cfg.model,
-            api_key=api_key,
-            base_url=cfg.base_url,
-        )
+    _multimodal_client = _MultimodalImageDetailCompatClient(
+        build_client(
+            LLMConfig(
+                model=cfg.model,
+                api_key=api_key,
+                base_url=cfg.base_url,
+                timeout=cfg.timeout_seconds,
+            )
+        ),
+        resize_images_for_vlm=cfg.resize_images_for_vlm,
    )
    logger.info("multimodal_llm_client_built", model=cfg.model)
    return _multimodal_client