"""Document ingestion helpers for Memory Gateway.""" from __future__ import annotations import re from datetime import datetime, timezone from pathlib import Path def slugify(value: str, fallback: str = "document") -> str: slug = re.sub(r"[^a-zA-Z0-9\u4e00-\u9fff_-]+", "-", value.lower()).strip("-") slug = re.sub(r"-+", "-", slug)[:100].strip("-") return slug or fallback def convert_file_to_markdown(file_path: str | Path) -> str: """Convert a local document to Markdown using Microsoft MarkItDown.""" try: from markitdown import MarkItDown except ModuleNotFoundError as exc: raise RuntimeError("markitdown is not installed. Install with: pip install 'markitdown[all]'") from exc file_path = Path(file_path) converter = MarkItDown(enable_plugins=False) if hasattr(converter, "convert_local"): result = converter.convert_local(str(file_path)) else: result = converter.convert(str(file_path)) markdown = getattr(result, "text_content", "") or "" if not markdown.strip(): raise RuntimeError("Document conversion produced empty Markdown") return markdown def build_markdown_note( *, title: str, markdown: str, source_filename: str, tags: list[str], knowledge_type: str, summary: str | None = None, ) -> str: tag_text = ", ".join(tags) frontmatter = [ "---", f"title: {title}", f"knowledge_type: {knowledge_type}", f"source_filename: {source_filename}", f"created_at: {datetime.now(timezone.utc).isoformat()}", f"tags: [{tag_text}]" if tag_text else "tags: []", ] if summary: escaped = summary.replace('"', '\\"') frontmatter.append(f'summary: "{escaped}"') frontmatter.extend(["---", "", f"# {title}", "", markdown.strip(), ""]) return "\n".join(frontmatter) def save_markdown_to_obsidian( *, vault_path: str | Path, relative_dir: str, title: str, markdown: str, source_filename: str, tags: list[str], knowledge_type: str, summary: str | None = None, ) -> Path: vault = Path(vault_path) target_dir = vault / relative_dir.strip("/") target_dir.mkdir(parents=True, exist_ok=True) digest = slugify(source_filename.rsplit(".", 1)[0] or title) note_name = f"{slugify(title, digest)}.md" target = target_dir / note_name target.write_text( build_markdown_note( title=title, markdown=markdown, source_filename=source_filename, tags=tags, knowledge_type=knowledge_type, summary=summary, ), encoding="utf-8", ) return target