From b243018affb554e401636b0dc8389f263e9f7e77 Mon Sep 17 00:00:00 2001 From: tomtan Date: Tue, 16 Jun 2026 16:18:24 +0800 Subject: [PATCH] fix(service): enhance message filtering to drop empty chat messages while retaining tool requests --- QUICKSTART.md | 8 +- README.md | 7 +- README.zh-CN.md | 5 +- docs/api.md | 46 +++++----- docs/openapi.json | 1 + src/everos/entrypoints/api/routes/memorize.py | 2 +- .../memory/extract/ingest/multimodal.py | 48 +++++++++- src/everos/memory/extract/ingest/service.py | 3 +- .../memory/extract/parser/availability.py | 6 +- src/everos/service/_boundary.py | 14 ++- .../test_memorize_route_validation.py | 32 +++++++ .../test_ingest/test_multimodal.py | 89 +++++++++++++++++++ .../test_parser/test_availability.py | 5 ++ .../test_service/test_boundary_helpers.py | 20 +++++ 14 files changed, 248 insertions(+), 38 deletions(-) create mode 100644 tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py diff --git a/QUICKSTART.md b/QUICKSTART.md index d250f46..cc6e68f 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -285,10 +285,10 @@ LLM → metrics) before exiting. call them from your agent loop. - **App + project scope** — set `app_id` / `project_id` to anything other than `"default"` to partition memory spaces inside one server. -- **Multi-modal messages** — `messages[].content` accepts a list of - typed `ContentItem`s (`text` / `image` / `audio` / `doc` / `pdf` / - `html` / `email`) for non-text input. Install the optional extra - to enable parsing: +- **Mixed content messages** — `messages[].content` accepts a list of + typed `ContentItem`s (`text` / `md` / `image` / `audio` / `doc` / + `pdf` / `html` / `email`). Markdown (`md`) is read as UTF-8 text. + Install the optional extra to enable parsing for media/doc types: `uv pip install 'everos[multimodal]'`. Office documents (`doc` / `docx` / `xls` / `ppt` / `…`) additionally need **LibreOffice** on the host (`brew install --cask libreoffice` / diff --git a/README.md b/README.md index 249860b..d5301e0 100644 --- a/README.md +++ b/README.md @@ -168,9 +168,10 @@ read the markdown), see [QUICKSTART.md](QUICKSTART.md). ### Optional: Ingest Multimodal Files -To ingest non-text content (image / pdf / audio / office documents) -through `/api/v1/memory/add` `content` items, install the optional -extra: +Markdown files can be sent as `type: "md"` and are read as UTF-8 text +without the multimodal parser. To ingest non-text content (image / pdf / +audio / office documents) through `/api/v1/memory/add` `content` items, +install the optional extra: ```bash uv pip install 'everos[multimodal]' # or: pip install 'everos[multimodal]' diff --git a/README.zh-CN.md b/README.zh-CN.md index 1a382d5..4df30df 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -159,8 +159,9 @@ vLLM / Ollama / DeepInfra)。你可以覆盖生成的 `.env` 中的 `*__BASE_U ### 可选:摄取多模态文件 -如果要通过 `/api/v1/memory/add` 的 `content` items 摄取非文本内容 -(image / pdf / audio / office documents),安装可选 extra: +Markdown 文件可以用 `type: "md"` 发送,并会按 UTF-8 文本读取,不经过 +multimodal parser。如果要通过 `/api/v1/memory/add` 的 `content` items +摄取非文本内容(image / pdf / audio / office documents),安装可选 extra: ```bash uv pip install 'everos[multimodal]' # or: pip install 'everos[multimodal]' diff --git a/docs/api.md b/docs/api.md index d0f05c7..90f8eb1 100644 --- a/docs/api.md +++ b/docs/api.md @@ -239,10 +239,10 @@ file (`episode-.md` etc.). **`content`** — The message body. - A bare **string** is shorthand for a single text content item. -- An **array of `ContentItem`** is for mixed-modality input (text + - image / pdf / audio / ...); non-text items are parsed by the - multimodal LLM configured via `EVEROS_MULTIMODAL__*` env vars. See - [ContentItem](#contentitem). +- An **array of `ContentItem`** is for mixed input (`text` / `md` + + image / pdf / audio / ...). `md` items are read as UTF-8 text; + media/document items are parsed by the multimodal LLM configured via + `EVEROS_MULTIMODAL__*` env vars. See [ContentItem](#contentitem). **`tool_calls`** — When `role: "assistant"`, the tool calls the assistant emitted in this turn (OpenAI Chat Completions shape). @@ -252,34 +252,38 @@ message is the response to. ### ContentItem -Mixed-modality message-body element. Carry the payload in exactly one -of `text` / `uri` / `base64`; the others must be `null`. For -`type: "text"` use `text`; for every **non-text** type use `uri` -(`http(s)://`) or `base64` (with `ext`). Non-text items are routed -through the multimodal parser, which needs a fetchable or decodable -payload — a non-text item carrying only `text` returns `415`. +Mixed message-body element. Carry the payload in exactly one of `text` / +`uri` / `base64`; the others must be `null`. For `type: "text"` use +`text`. For `type: "md"` use `text`, a server-local `file://` URI, or +`base64` UTF-8 bytes. For every **non-text, non-md** type use `uri` +(`http(s)://`) or `base64` (with `ext`). Non-text, non-md items are +routed through the multimodal parser, which needs a fetchable or +decodable payload — passing only `text` returns `415`. | Field | Type | Required | Default | Notes | |---|---|---|---|---| -| `type` | `"text" \| "image" \| "audio" \| "doc" \| "pdf" \| "html" \| "email"` | yes | — | — | -| `text` | `string \| null` | no | `null` | Required when `type: "text"` | +| `type` | `"text" \| "md" \| "image" \| "audio" \| "doc" \| "pdf" \| "html" \| "email"` | yes | — | — | +| `text` | `string \| null` | no | `null` | Required when `type: "text"`; optional inline Markdown when `type: "md"` | | `uri` | `string \| null` | no | `null` | `http(s)://` (fetched server-side) or `file://` (read from the server's local fs, guardrailed) pointer | | `base64` | `string \| null` | no | `null` | Inline payload, plain base64 (no `data:` prefix) | | `ext` | `string \| null` | no | `null` | File-extension hint when `uri` lacks one | | `name` | `string \| null` | no | `null` | Display filename, used in logs | | `extras` | `object \| null` | no | `null` | Provider-specific metadata, opaque to EverOS | -**`type`** — The content kind. Each non-text type is dispatched to the -multimodal LLM. If the multimodal endpoint cannot handle the supplied -payload, `/add` returns `415 Unsupported Media Type`. +**`type`** — The content kind. `text` and `md` are treated as text. +Each other type is dispatched to the multimodal LLM. If the multimodal +endpoint cannot handle the supplied payload, `/add` returns +`415 Unsupported Media Type`. -**`text`** — The literal text payload; valid **only** for -`type: "text"`. A non-text type (including `"html"`) is always routed -to the parser and must carry `uri` or `base64`; passing only `text` on -a non-text item returns `415`. To inline HTML as plain text, send it -as `type: "text"`. +**`text`** — The literal text payload; valid for `type: "text"` and +inline `type: "md"`. A non-text, non-md type (including `"html"`) is +always routed to the parser and must carry `uri` or `base64`; passing +only `text` on those items returns `415`. To inline HTML as plain text, +send it as `type: "text"`. -**`uri`** — `http(s)://` or `file://` pointer to the asset. An +**`uri`** — `http(s)://` or `file://` pointer to the asset. For +`type: "md"`, only `file://` is supported and the file is decoded as +UTF-8 text. For parser-backed content, an `http(s)` uri is fetched by the server and dispatched by the response Content-Type (use it for assets hosted elsewhere — S3 / OSS presigned URL, http server). A `file://` uri is read from the **server's** local diff --git a/docs/openapi.json b/docs/openapi.json index 0ae6608..79c5d71 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -251,6 +251,7 @@ "type": "string", "enum": [ "text", + "md", "image", "audio", "doc", diff --git a/src/everos/entrypoints/api/routes/memorize.py b/src/everos/entrypoints/api/routes/memorize.py index 392fde9..0d84620 100644 --- a/src/everos/entrypoints/api/routes/memorize.py +++ b/src/everos/entrypoints/api/routes/memorize.py @@ -58,7 +58,7 @@ class ToolCallDTO(BaseModel): class ContentItemDTO(BaseModel): """Content piece (v1 API brief appendix A).""" - type: Literal["text", "image", "audio", "doc", "pdf", "html", "email"] + type: Literal["text", "md", "image", "audio", "doc", "pdf", "html", "email"] text: str | None = None uri: str | None = None base64: str | None = None diff --git a/src/everos/memory/extract/ingest/multimodal.py b/src/everos/memory/extract/ingest/multimodal.py index 30ca0bb..d1823f3 100644 --- a/src/everos/memory/extract/ingest/multimodal.py +++ b/src/everos/memory/extract/ingest/multimodal.py @@ -11,12 +11,19 @@ them via ``IngestResult.unparsed_non_text_count``. from __future__ import annotations +import base64 +import binascii from typing import Any +from urllib.parse import urlparse +from everos.core.errors import UnsupportedModalityError from everos.core.observability.logging import get_logger +from everos.memory.extract.parser.mapping import read_file_uri logger = get_logger(__name__) +_TEXTUAL_CONTENT_TYPES = frozenset({"text", "md"}) + _IMAGE_VISUAL_FACTS_NOTE = ( "Context: image visual facts extracted from an uploaded image; " "treat these as image content, not assistant actions." @@ -36,6 +43,34 @@ def coerce_items( return [_coerce_item(item) for item in content] +async def hydrate_md_items(items: list[dict[str, Any]]) -> None: + """Populate ``text`` for ``type="md"`` items before parser dispatch.""" + for item in items: + if item.get("type") != "md": + continue + if item.get("text") is not None: + item["text"] = str(item["text"]) + continue + uri = item.get("uri") + if uri: + if urlparse(str(uri)).scheme != "file": + raise UnsupportedModalityError("md uri must use file://") + raw, _ = await read_file_uri(str(uri), ext_hint=item.get("ext") or "md") + item["text"] = _decode_md(raw) + continue + encoded = item.get("base64") + if encoded: + try: + raw = base64.b64decode(str(encoded), validate=True) + except (binascii.Error, ValueError) as exc: + raise UnsupportedModalityError("invalid md base64 payload") from exc + item["text"] = _decode_md(raw) + continue + raise UnsupportedModalityError( + "md content item requires text, file:// uri, or base64" + ) + + def derive_text(items: list[dict[str, Any]]) -> tuple[str, int]: """Render items into the derived ``text`` + count still-unparsed non-text. @@ -49,7 +84,7 @@ def derive_text(items: list[dict[str, Any]]) -> tuple[str, int]: rendered = _render_item(item) if rendered: parts.append(rendered) - elif item.get("type") != "text": + elif item.get("type") not in _TEXTUAL_CONTENT_TYPES: non_text += 1 logger.warning( "multimodal_content_not_parsed", @@ -75,11 +110,11 @@ def normalise_content( def _render_item(item: dict[str, Any]) -> str | None: """Render one item to text, or ``None`` if it contributes nothing. - Text items yield their ``text``; non-text items yield + Text and md items yield their ``text``; non-text items yield ``[TYPE: name]\\n{parsed_content}`` once parsed; unparsed non-text yields ``None``. """ - if item.get("type") == "text": + if item.get("type") in _TEXTUAL_CONTENT_TYPES: text = item.get("text") return str(text) if text else None parsed = item.get("parsed_content") @@ -100,3 +135,10 @@ def _coerce_item(item: Any) -> dict[str, Any]: if isinstance(item, dict): return dict(item) return {"type": "unknown", "raw": repr(item)} + + +def _decode_md(raw: bytes) -> str: + try: + return raw.decode("utf-8") + except UnicodeDecodeError as exc: + raise UnsupportedModalityError("md payload must be UTF-8") from exc diff --git a/src/everos/memory/extract/ingest/service.py b/src/everos/memory/extract/ingest/service.py index c3f66a1..826187f 100644 --- a/src/everos/memory/extract/ingest/service.py +++ b/src/everos/memory/extract/ingest/service.py @@ -37,7 +37,7 @@ from everos.memory.extract.parser import ( ) from .id_gen import gen_message_id -from .multimodal import coerce_items, derive_text +from .multimodal import coerce_items, derive_text, hydrate_md_items async def process(payload: dict[str, Any]) -> IngestResult: @@ -55,6 +55,7 @@ async def process(payload: dict[str, Any]) -> IngestResult: non_text_total = 0 for idx, m in enumerate(raw_messages): content_items = coerce_items(m["content"]) + await hydrate_md_items(content_items) if has_unparsed_multimodal(content_items): require_multimodal() await enrich_content_items( diff --git a/src/everos/memory/extract/parser/availability.py b/src/everos/memory/extract/parser/availability.py index 544ed69..7efc55b 100644 --- a/src/everos/memory/extract/parser/availability.py +++ b/src/everos/memory/extract/parser/availability.py @@ -17,11 +17,15 @@ _INSTALL_HINT = ( "(or uv add 'everos[multimodal]')." ) +_TEXTUAL_CONTENT_TYPES = frozenset({"text", "md"}) + def has_unparsed_multimodal(items: list[dict[str, Any]]) -> bool: """True if any content item is non-text and not yet parsed.""" return any( - item.get("type") != "text" and "parsed_content" not in item for item in items + item.get("type") not in _TEXTUAL_CONTENT_TYPES + and "parsed_content" not in item + for item in items ) diff --git a/src/everos/service/_boundary.py b/src/everos/service/_boundary.py index eef37ad..5fa4567 100644 --- a/src/everos/service/_boundary.py +++ b/src/everos/service/_boundary.py @@ -206,8 +206,18 @@ def _filter_for_mode( ) -> list[CanonicalMessage]: """Chat mode drops tool rows; agent mode keeps everything.""" if mode == "chat": - return [m for m in msgs if m.role in ("user", "assistant") and not m.tool_calls] - return list(msgs) + return [ + m + for m in msgs + if m.role in ("user", "assistant") + and not m.tool_calls + and m.text.strip() + ] + return [m for m in msgs if _has_boundary_payload(m)] + + +def _has_boundary_payload(m: CanonicalMessage) -> bool: + return bool(m.text.strip()) or bool(m.tool_calls) or m.role == "tool" # ── Boundary dispatch ───────────────────────────────────────────────────── diff --git a/tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py b/tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py new file mode 100644 index 0000000..ce58ce2 --- /dev/null +++ b/tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py @@ -0,0 +1,32 @@ +"""Validation paths for ``POST /api/v1/memory/add`` request DTOs.""" + +from __future__ import annotations + +from everos.entrypoints.api.routes.memorize import ContentItemDTO, MemorizeAddRequest + + +def test_add_request_accepts_md_content_item() -> None: + req = MemorizeAddRequest.model_validate( + { + "session_id": "s_md", + "messages": [ + { + "sender_id": "u1", + "role": "user", + "timestamp": 1_700_000_000_000, + "content": [ + { + "type": "md", + "text": "# Deploy\nUse nginx.", + "name": "deploy.md", + } + ], + } + ], + } + ) + + content = req.messages[0].content + assert isinstance(content, list) + assert isinstance(content[0], ContentItemDTO) + assert content[0].type == "md" diff --git a/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py b/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py index ccdd751..e39dafa 100644 --- a/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py +++ b/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py @@ -2,11 +2,18 @@ from __future__ import annotations +import base64 +from pathlib import Path + +import pytest + +from everos.config import load_settings from everos.memory.extract.ingest.multimodal import ( coerce_items, derive_text, normalise_content, ) +from everos.memory.extract.ingest.service import process def test_coerce_str_to_text_item() -> None: @@ -46,3 +53,85 @@ def test_normalise_content_text_only_unchanged() -> None: assert items == [{"type": "text", "text": "hello"}] assert text == "hello" assert non_text == 0 + + +@pytest.fixture(autouse=True) +def _clear_settings_cache(): + load_settings.cache_clear() + yield + load_settings.cache_clear() + + +async def test_process_renders_md_text_without_multimodal_parser( + monkeypatch: pytest.MonkeyPatch, +) -> None: + import everos.memory.extract.ingest.service as ingest_service + + monkeypatch.setattr( + ingest_service, + "require_multimodal", + lambda: (_ for _ in ()).throw(AssertionError("parser should not run")), + ) + result = await process( + { + "session_id": "s_md_text", + "messages": [ + { + "sender_id": "u1", + "role": "user", + "timestamp": 1_700_000_000_000, + "content": [{"type": "md", "text": "# Deploy\nUse nginx."}], + } + ], + } + ) + + assert result.messages[0].text == "# Deploy\nUse nginx." + assert result.messages[0].content_items[0]["type"] == "md" + assert result.messages[0].content_items[0]["text"] == "# Deploy\nUse nginx." + assert result.unparsed_non_text_count == 0 + + +async def test_process_reads_md_file_uri_as_utf8_text(tmp_path: Path) -> None: + doc = tmp_path / "guide.md" + doc.write_text("# 部署\n配置域名。", encoding="utf-8") + + result = await process( + { + "session_id": "s_md_uri", + "messages": [ + { + "sender_id": "u1", + "role": "user", + "timestamp": 1_700_000_000_000, + "content": [ + {"type": "md", "uri": f"file://{doc}", "name": "guide.md"} + ], + } + ], + } + ) + + assert result.messages[0].text == "# 部署\n配置域名。" + assert result.messages[0].content_items[0]["text"] == "# 部署\n配置域名。" + + +async def test_process_decodes_md_base64_as_utf8_text() -> None: + encoded = base64.b64encode("## Notes\n记住配置。".encode()).decode("ascii") + + result = await process( + { + "session_id": "s_md_base64", + "messages": [ + { + "sender_id": "u1", + "role": "user", + "timestamp": 1_700_000_000_000, + "content": [{"type": "md", "base64": encoded, "ext": "md"}], + } + ], + } + ) + + assert result.messages[0].text == "## Notes\n记住配置。" + assert result.messages[0].content_items[0]["text"] == "## Notes\n记住配置。" diff --git a/tests/unit/test_memory/test_extract/test_parser/test_availability.py b/tests/unit/test_memory/test_extract/test_parser/test_availability.py index 2a0b839..537f089 100644 --- a/tests/unit/test_memory/test_extract/test_parser/test_availability.py +++ b/tests/unit/test_memory/test_extract/test_parser/test_availability.py @@ -18,6 +18,11 @@ def test_has_unparsed_multimodal_false_when_all_text() -> None: assert availability.has_unparsed_multimodal(items) is False +def test_has_unparsed_multimodal_false_for_md() -> None: + items = [{"type": "md", "text": "# hi"}] + assert availability.has_unparsed_multimodal(items) is False + + def test_has_unparsed_multimodal_false_when_already_parsed() -> None: items = [{"type": "image", "uri": "x", "parsed_content": "ocr"}] assert availability.has_unparsed_multimodal(items) is False diff --git a/tests/unit/test_service/test_boundary_helpers.py b/tests/unit/test_service/test_boundary_helpers.py index da11f13..405d893 100644 --- a/tests/unit/test_service/test_boundary_helpers.py +++ b/tests/unit/test_service/test_boundary_helpers.py @@ -74,6 +74,26 @@ def test_filter_agent_keeps_everything() -> None: assert [m.message_id for m in out] == ["m1", "m2"] +def test_filter_drops_empty_plain_chat_messages_but_keeps_tool_requests() -> None: + msgs = [ + _msg("m1", "user", text=""), + _msg("m2", "assistant", text=" "), + _msg( + "m3", + "assistant", + text="", + tool_calls=[ToolCall(id="tc1", function={"name": "f", "arguments": "{}"})], + ), + _msg("m4", "user", text="ok"), + ] + + chat_out = _filter_for_mode(msgs, "chat") + agent_out = _filter_for_mode(msgs, "agent") + + assert [m.message_id for m in chat_out] == ["m4"] + assert [m.message_id for m in agent_out] == ["m3", "m4"] + + # ── _to_conversation_item dispatch ────────────────────────────────────────