From b243018affb554e401636b0dc8389f263e9f7e77 Mon Sep 17 00:00:00 2001
From: tomtan <tom.tan@boardware.com>
Date: Tue, 16 Jun 2026 16:18:24 +0800
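Subject: [PATCH] fix(service): enhance message filtering to drop empty chat
 messages while retaining tool requests

Also add the `md` content item type: Markdown is accepted as inline text,
a server-local file:// URI, or base64, and is read as UTF-8 text without
going through the multimodal parser. Docs, OpenAPI schema, and tests are
updated accordingly.
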
---
 QUICKSTART.md                                 |  8 +-
 README.md                                     |  7 +-
 README.zh-CN.md                               |  5 +-
 docs/api.md                                   | 46 +++++-----
 docs/openapi.json                             |  1 +
 src/everos/entrypoints/api/routes/memorize.py |  2 +-
 .../memory/extract/ingest/multimodal.py       | 48 +++++++++-
 src/everos/memory/extract/ingest/service.py   |  3 +-
 .../memory/extract/parser/availability.py     |  6 +-
 src/everos/service/_boundary.py               | 14 ++-
 .../test_memorize_route_validation.py         | 32 +++++++
 .../test_ingest/test_multimodal.py            | 89 +++++++++++++++++++
 .../test_parser/test_availability.py          |  5 ++
 .../test_service/test_boundary_helpers.py     | 20 +++++
 14 files changed, 248 insertions(+), 38 deletions(-)
 create mode 100644 tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py
diff --git a/QUICKSTART.md b/QUICKSTART.md
index d250f46..cc6e68f 100644
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@@ -285,10 +285,10 @@ LLM → metrics) before exiting.
   call them from your agent loop.
 - **App + project scope** — set `app_id` / `project_id` to anything
   other than `"default"` to partition memory spaces inside one server.
-- **Multi-modal messages** — `messages[].content` accepts a list of
-  typed `ContentItem`s (`text` / `image` / `audio` / `doc` / `pdf` /
-  `html` / `email`) for non-text input. Install the optional extra
-  to enable parsing:
+- **Mixed content messages** — `messages[].content` accepts a list of
+  typed `ContentItem`s (`text` / `md` / `image` / `audio` / `doc` /
+  `pdf` / `html` / `email`). Markdown (`md`) is read as UTF-8 text.
+  Install the optional extra to enable parsing for media/doc types:
   `uv pip install 'everos[multimodal]'`. Office documents
   (`doc` / `docx` / `xls` / `ppt` / `…`) additionally need
   **LibreOffice** on the host (`brew install --cask libreoffice` /
diff --git a/README.md b/README.md
index 249860b..d5301e0 100644
--- a/README.md
+++ b/README.md
@@ -168,9 +168,10 @@ read the markdown), see [QUICKSTART.md](QUICKSTART.md).
 
 ### Optional: Ingest Multimodal Files
 
-To ingest non-text content (image / pdf / audio / office documents)
-through `/api/v1/memory/add` `content` items, install the optional
-extra:
+Markdown files can be sent as `type: "md"` and are read as UTF-8 text
+without the multimodal parser. To ingest non-text content (image / pdf /
+audio / office documents) through `/api/v1/memory/add` `content` items,
+install the optional extra:
 
 ```bash
 uv pip install 'everos[multimodal]'   # or: pip install 'everos[multimodal]'
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 1a382d5..4df30df 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -159,8 +159,9 @@ vLLM / Ollama / DeepInfra）。你可以覆盖生成的 `.env` 中的 `*__BASE_U
 
 ### 可选：摄取多模态文件
 
-如果要通过 `/api/v1/memory/add` 的 `content` items 摄取非文本内容
-（image / pdf / audio / office documents），安装可选 extra：
+Markdown 文件可以用 `type: "md"` 发送，并会按 UTF-8 文本读取，不经过
+multimodal parser。如果要通过 `/api/v1/memory/add` 的 `content` items
+摄取非文本内容（image / pdf / audio / office documents），安装可选 extra：
 
 ```bash
 uv pip install 'everos[multimodal]'   # or: pip install 'everos[multimodal]'
diff --git a/docs/api.md b/docs/api.md
index d0f05c7..90f8eb1 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -239,10 +239,10 @@ file (`episode-<YYYY-MM-DD>.md` etc.).
 
 **`content`** — The message body.
 - A bare **string** is shorthand for a single text content item.
-- An **array of `ContentItem`** is for mixed-modality input (text +
-  image / pdf / audio / ...); non-text items are parsed by the
-  multimodal LLM configured via `EVEROS_MULTIMODAL__*` env vars. See
-  [ContentItem](#contentitem).
+- An **array of `ContentItem`** is for mixed input (`text` / `md` +
+  image / pdf / audio / ...). `md` items are read as UTF-8 text;
+  media/document items are parsed by the multimodal LLM configured via
+  `EVEROS_MULTIMODAL__*` env vars. See [ContentItem](#contentitem).
 
 **`tool_calls`** — When `role: "assistant"`, the tool calls the
 assistant emitted in this turn (OpenAI Chat Completions shape).
@@ -252,34 +252,38 @@ message is the response to.
 
 ### ContentItem
 
-Mixed-modality message-body element. Carry the payload in exactly one
-of `text` / `uri` / `base64`; the others must be `null`. For
-`type: "text"` use `text`; for every **non-text** type use `uri`
-(`http(s)://`) or `base64` (with `ext`). Non-text items are routed
-through the multimodal parser, which needs a fetchable or decodable
-payload — a non-text item carrying only `text` returns `415`.
+Mixed message-body element. Carry the payload in exactly one of `text` /
+`uri` / `base64`; the others must be `null`. For `type: "text"` use
+`text`. For `type: "md"` use `text`, a server-local `file://` URI, or
+`base64`-encoded UTF-8 bytes. For every **non-text, non-md** type use `uri`
+(`http(s)://`) or `base64` (with `ext`). Non-text, non-md items are
+routed through the multimodal parser, which needs a fetchable or
+decodable payload — passing only `text` returns `415`.
 
 | Field | Type | Required | Default | Notes |
 |---|---|---|---|---|
-| `type` | `"text" \| "image" \| "audio" \| "doc" \| "pdf" \| "html" \| "email"` | yes | — | — |
-| `text` | `string \| null` | no | `null` | Required when `type: "text"` |
+| `type` | `"text" \| "md" \| "image" \| "audio" \| "doc" \| "pdf" \| "html" \| "email"` | yes | — | — |
+| `text` | `string \| null` | no | `null` | Required when `type: "text"`; optional inline Markdown when `type: "md"` |
 | `uri` | `string \| null` | no | `null` | `http(s)://` (fetched server-side) or `file://` (read from the server's local fs, guardrailed) pointer |
 | `base64` | `string \| null` | no | `null` | Inline payload, plain base64 (no `data:` prefix) |
 | `ext` | `string \| null` | no | `null` | File-extension hint when `uri` lacks one |
 | `name` | `string \| null` | no | `null` | Display filename, used in logs |
 | `extras` | `object \| null` | no | `null` | Provider-specific metadata, opaque to EverOS |
 
-**`type`** — The content kind. Each non-text type is dispatched to the
-multimodal LLM. If the multimodal endpoint cannot handle the supplied
-payload, `/add` returns `415 Unsupported Media Type`.
+**`type`** — The content kind. `text` and `md` are treated as text.
+Every other type is dispatched to the multimodal LLM. If the multimodal
+endpoint cannot handle the supplied payload, `/add` returns
+`415 Unsupported Media Type`.
 
-**`text`** — The literal text payload; valid **only** for
-`type: "text"`. A non-text type (including `"html"`) is always routed
-to the parser and must carry `uri` or `base64`; passing only `text` on
-a non-text item returns `415`. To inline HTML as plain text, send it
-as `type: "text"`.
+**`text`** — The literal text payload; valid for `type: "text"` and
+inline `type: "md"`. A non-text, non-md type (including `"html"`) is
+always routed to the parser and must carry `uri` or `base64`; passing
+only `text` on those items returns `415`. To inline HTML as plain text,
+send it as `type: "text"`.
 
-**`uri`** — `http(s)://` or `file://` pointer to the asset. An
+**`uri`** — `http(s)://` or `file://` pointer to the asset. For
+`type: "md"`, only `file://` is supported and the file is decoded as
+UTF-8 text. For parser-backed content, an
 `http(s)` uri is fetched by the server and dispatched by the response
 Content-Type (use it for assets hosted elsewhere — S3 / OSS presigned
 URL, http server). A `file://` uri is read from the **server's** local
diff --git a/docs/openapi.json b/docs/openapi.json
index 0ae6608..79c5d71 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -251,6 +251,7 @@
             "type": "string",
             "enum": [
               "text",
+              "md",
               "image",
               "audio",
               "doc",
diff --git a/src/everos/entrypoints/api/routes/memorize.py b/src/everos/entrypoints/api/routes/memorize.py
index 392fde9..0d84620 100644
--- a/src/everos/entrypoints/api/routes/memorize.py
+++ b/src/everos/entrypoints/api/routes/memorize.py
@@ -58,7 +58,7 @@ class ToolCallDTO(BaseModel):
 class ContentItemDTO(BaseModel):
     """Content piece (v1 API brief appendix A)."""
 
-    type: Literal["text", "image", "audio", "doc", "pdf", "html", "email"]
+    type: Literal["text", "md", "image", "audio", "doc", "pdf", "html", "email"]
     text: str | None = None
     uri: str | None = None
     base64: str | None = None
diff --git a/src/everos/memory/extract/ingest/multimodal.py b/src/everos/memory/extract/ingest/multimodal.py
index 30ca0bb..d1823f3 100644
--- a/src/everos/memory/extract/ingest/multimodal.py
+++ b/src/everos/memory/extract/ingest/multimodal.py
@@ -11,12 +11,19 @@ them via ``IngestResult.unparsed_non_text_count``.
 
 from __future__ import annotations
 
+import base64
+import binascii
 from typing import Any
+from urllib.parse import urlparse
 
+from everos.core.errors import UnsupportedModalityError
 from everos.core.observability.logging import get_logger
+from everos.memory.extract.parser.mapping import read_file_uri
 
 logger = get_logger(__name__)
 
+_TEXTUAL_CONTENT_TYPES = frozenset({"text", "md"})
+
 _IMAGE_VISUAL_FACTS_NOTE = (
     "Context: image visual facts extracted from an uploaded image; "
     "treat these as image content, not assistant actions."
@@ -36,6 +43,34 @@ def coerce_items(
     return [_coerce_item(item) for item in content]
 
 
+async def hydrate_md_items(items: list[dict[str, Any]]) -> None:
+    """Populate ``text`` for ``type="md"`` items before parser dispatch."""
+    for item in items:
+        if item.get("type") != "md":
+            continue
+        if item.get("text") is not None:
+            item["text"] = str(item["text"])
+            continue
+        uri = item.get("uri")
+        if uri:
+            if urlparse(str(uri)).scheme != "file":
+                raise UnsupportedModalityError("md uri must use file://")
+            raw, _ = await read_file_uri(str(uri), ext_hint=item.get("ext") or "md")
+            item["text"] = _decode_md(raw)
+            continue
+        encoded = item.get("base64")
+        if encoded:
+            try:
+                raw = base64.b64decode(str(encoded), validate=True)
+            except (binascii.Error, ValueError) as exc:
+                raise UnsupportedModalityError("invalid md base64 payload") from exc
+            item["text"] = _decode_md(raw)
+            continue
+        raise UnsupportedModalityError(
+            "md content item requires text, file:// uri, or base64"
+        )
+
+
 def derive_text(items: list[dict[str, Any]]) -> tuple[str, int]:
     """Render items into the derived ``text`` + count still-unparsed non-text.
 
@@ -49,7 +84,7 @@ def derive_text(items: list[dict[str, Any]]) -> tuple[str, int]:
         rendered = _render_item(item)
         if rendered:
             parts.append(rendered)
-        elif item.get("type") != "text":
+        elif item.get("type") not in _TEXTUAL_CONTENT_TYPES:
             non_text += 1
             logger.warning(
                 "multimodal_content_not_parsed",
@@ -75,11 +110,11 @@ def normalise_content(
 def _render_item(item: dict[str, Any]) -> str | None:
     """Render one item to text, or ``None`` if it contributes nothing.
 
-    Text items yield their ``text``; non-text items yield
+    Text and md items yield their ``text``; non-text items yield
     ``[TYPE: name]\\n{parsed_content}`` once parsed; unparsed non-text yields
     ``None``.
     """
-    if item.get("type") == "text":
+    if item.get("type") in _TEXTUAL_CONTENT_TYPES:
         text = item.get("text")
         return str(text) if text else None
     parsed = item.get("parsed_content")
@@ -100,3 +135,10 @@ def _coerce_item(item: Any) -> dict[str, Any]:
     if isinstance(item, dict):
         return dict(item)
     return {"type": "unknown", "raw": repr(item)}
+
+
+def _decode_md(raw: bytes) -> str:
+    try:
+        return raw.decode("utf-8")
+    except UnicodeDecodeError as exc:
+        raise UnsupportedModalityError("md payload must be UTF-8") from exc
diff --git a/src/everos/memory/extract/ingest/service.py b/src/everos/memory/extract/ingest/service.py
index c3f66a1..826187f 100644
--- a/src/everos/memory/extract/ingest/service.py
+++ b/src/everos/memory/extract/ingest/service.py
@@ -37,7 +37,7 @@ from everos.memory.extract.parser import (
 )
 
 from .id_gen import gen_message_id
-from .multimodal import coerce_items, derive_text
+from .multimodal import coerce_items, derive_text, hydrate_md_items
 
 
 async def process(payload: dict[str, Any]) -> IngestResult:
@@ -55,6 +55,7 @@ async def process(payload: dict[str, Any]) -> IngestResult:
     non_text_total = 0
     for idx, m in enumerate(raw_messages):
         content_items = coerce_items(m["content"])
+        await hydrate_md_items(content_items)
         if has_unparsed_multimodal(content_items):
             require_multimodal()
             await enrich_content_items(
diff --git a/src/everos/memory/extract/parser/availability.py b/src/everos/memory/extract/parser/availability.py
index 544ed69..7efc55b 100644
--- a/src/everos/memory/extract/parser/availability.py
+++ b/src/everos/memory/extract/parser/availability.py
@@ -17,11 +17,15 @@ _INSTALL_HINT = (
     "(or  uv add 'everos[multimodal]')."
 )
 
+_TEXTUAL_CONTENT_TYPES = frozenset({"text", "md"})
+
 
 def has_unparsed_multimodal(items: list[dict[str, Any]]) -> bool:
     """True if any content item is non-text and not yet parsed."""
     return any(
-        item.get("type") != "text" and "parsed_content" not in item for item in items
+        item.get("type") not in _TEXTUAL_CONTENT_TYPES
+        and "parsed_content" not in item
+        for item in items
     )
 
 
diff --git a/src/everos/service/_boundary.py b/src/everos/service/_boundary.py
index eef37ad..5fa4567 100644
--- a/src/everos/service/_boundary.py
+++ b/src/everos/service/_boundary.py
@@ -206,8 +206,18 @@ def _filter_for_mode(
 ) -> list[CanonicalMessage]:
-    """Chat mode drops tool rows; agent mode keeps everything."""
+    """Chat mode drops tool rows and blank text; agent mode drops only empty rows."""
     if mode == "chat":
-        return [m for m in msgs if m.role in ("user", "assistant") and not m.tool_calls]
-    return list(msgs)
+        return [
+            m
+            for m in msgs
+            if m.role in ("user", "assistant")
+            and not m.tool_calls
+            and m.text.strip()
+        ]
+    return [m for m in msgs if _has_boundary_payload(m)]
+
+
+def _has_boundary_payload(m: CanonicalMessage) -> bool:
+    return bool(m.text.strip()) or bool(m.tool_calls) or m.role == "tool"
 
 
 # ── Boundary dispatch ─────────────────────────────────────────────────────
diff --git a/tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py b/tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py
new file mode 100644
index 0000000..ce58ce2
--- /dev/null
+++ b/tests/unit/test_entrypoints/test_api/test_routes/test_memorize_route_validation.py
@@ -0,0 +1,32 @@
+"""Validation paths for ``POST /api/v1/memory/add`` request DTOs."""
+
+from __future__ import annotations
+
+from everos.entrypoints.api.routes.memorize import ContentItemDTO, MemorizeAddRequest
+
+
+def test_add_request_accepts_md_content_item() -> None:
+    req = MemorizeAddRequest.model_validate(
+        {
+            "session_id": "s_md",
+            "messages": [
+                {
+                    "sender_id": "u1",
+                    "role": "user",
+                    "timestamp": 1_700_000_000_000,
+                    "content": [
+                        {
+                            "type": "md",
+                            "text": "# Deploy\nUse nginx.",
+                            "name": "deploy.md",
+                        }
+                    ],
+                }
+            ],
+        }
+    )
+
+    content = req.messages[0].content
+    assert isinstance(content, list)
+    assert isinstance(content[0], ContentItemDTO)
+    assert content[0].type == "md"
diff --git a/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py b/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py
index ccdd751..e39dafa 100644
--- a/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py
+++ b/tests/unit/test_memory/test_extract/test_ingest/test_multimodal.py
@@ -2,11 +2,18 @@
 
 from __future__ import annotations
 
+import base64
+from pathlib import Path
+
+import pytest
+
+from everos.config import load_settings
 from everos.memory.extract.ingest.multimodal import (
     coerce_items,
     derive_text,
     normalise_content,
 )
+from everos.memory.extract.ingest.service import process
 
 
 def test_coerce_str_to_text_item() -> None:
@@ -46,3 +53,85 @@ def test_normalise_content_text_only_unchanged() -> None:
     assert items == [{"type": "text", "text": "hello"}]
     assert text == "hello"
     assert non_text == 0
+
+
+@pytest.fixture(autouse=True)
+def _clear_settings_cache():
+    load_settings.cache_clear()
+    yield
+    load_settings.cache_clear()
+
+
+async def test_process_renders_md_text_without_multimodal_parser(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    import everos.memory.extract.ingest.service as ingest_service
+
+    monkeypatch.setattr(
+        ingest_service,
+        "require_multimodal",
+        lambda: (_ for _ in ()).throw(AssertionError("parser should not run")),
+    )
+    result = await process(
+        {
+            "session_id": "s_md_text",
+            "messages": [
+                {
+                    "sender_id": "u1",
+                    "role": "user",
+                    "timestamp": 1_700_000_000_000,
+                    "content": [{"type": "md", "text": "# Deploy\nUse nginx."}],
+                }
+            ],
+        }
+    )
+
+    assert result.messages[0].text == "# Deploy\nUse nginx."
+    assert result.messages[0].content_items[0]["type"] == "md"
+    assert result.messages[0].content_items[0]["text"] == "# Deploy\nUse nginx."
+    assert result.unparsed_non_text_count == 0
+
+
+async def test_process_reads_md_file_uri_as_utf8_text(tmp_path: Path) -> None:
+    doc = tmp_path / "guide.md"
+    doc.write_text("# 部署\n配置域名。", encoding="utf-8")
+
+    result = await process(
+        {
+            "session_id": "s_md_uri",
+            "messages": [
+                {
+                    "sender_id": "u1",
+                    "role": "user",
+                    "timestamp": 1_700_000_000_000,
+                    "content": [
+                        {"type": "md", "uri": f"file://{doc}", "name": "guide.md"}
+                    ],
+                }
+            ],
+        }
+    )
+
+    assert result.messages[0].text == "# 部署\n配置域名。"
+    assert result.messages[0].content_items[0]["text"] == "# 部署\n配置域名。"
+
+
+async def test_process_decodes_md_base64_as_utf8_text() -> None:
+    encoded = base64.b64encode("## Notes\n记住配置。".encode()).decode("ascii")
+
+    result = await process(
+        {
+            "session_id": "s_md_base64",
+            "messages": [
+                {
+                    "sender_id": "u1",
+                    "role": "user",
+                    "timestamp": 1_700_000_000_000,
+                    "content": [{"type": "md", "base64": encoded, "ext": "md"}],
+                }
+            ],
+        }
+    )
+
+    assert result.messages[0].text == "## Notes\n记住配置。"
+    assert result.messages[0].content_items[0]["text"] == "## Notes\n记住配置。"
diff --git a/tests/unit/test_memory/test_extract/test_parser/test_availability.py b/tests/unit/test_memory/test_extract/test_parser/test_availability.py
index 2a0b839..537f089 100644
--- a/tests/unit/test_memory/test_extract/test_parser/test_availability.py
+++ b/tests/unit/test_memory/test_extract/test_parser/test_availability.py
@@ -18,6 +18,11 @@ def test_has_unparsed_multimodal_false_when_all_text() -> None:
     assert availability.has_unparsed_multimodal(items) is False
 
 
+def test_has_unparsed_multimodal_false_for_md() -> None:
+    items = [{"type": "md", "text": "# hi"}]
+    assert availability.has_unparsed_multimodal(items) is False
+
+
 def test_has_unparsed_multimodal_false_when_already_parsed() -> None:
     items = [{"type": "image", "uri": "x", "parsed_content": "ocr"}]
     assert availability.has_unparsed_multimodal(items) is False
diff --git a/tests/unit/test_service/test_boundary_helpers.py b/tests/unit/test_service/test_boundary_helpers.py
index da11f13..405d893 100644
--- a/tests/unit/test_service/test_boundary_helpers.py
+++ b/tests/unit/test_service/test_boundary_helpers.py
@@ -74,6 +74,26 @@ def test_filter_agent_keeps_everything() -> None:
     assert [m.message_id for m in out] == ["m1", "m2"]
 
 
+def test_filter_drops_empty_plain_chat_messages_but_keeps_tool_requests() -> None:
+    msgs = [
+        _msg("m1", "user", text=""),
+        _msg("m2", "assistant", text="   "),
+        _msg(
+            "m3",
+            "assistant",
+            text="",
+            tool_calls=[ToolCall(id="tc1", function={"name": "f", "arguments": "{}"})],
+        ),
+        _msg("m4", "user", text="ok"),
+    ]
+
+    chat_out = _filter_for_mode(msgs, "chat")
+    agent_out = _filter_for_mode(msgs, "agent")
+
+    assert [m.message_id for m in chat_out] == ["m4"]
+    assert [m.message_id for m in agent_out] == ["m3", "m4"]
+
+
 # ── _to_conversation_item dispatch ────────────────────────────────────────