88 lines
2.6 KiB
Python
88 lines
2.6 KiB
Python
"""Document ingestion helpers for Memory Gateway."""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
def slugify(value: str, fallback: str = "document") -> str:
    """Return a lowercase, dash-separated slug derived from *value*.

    Every run of characters other than ASCII letters, digits, ``_``, ``-``
    and the U+4E00..U+9FFF range is replaced by a single dash. The result
    is capped at 100 characters and never starts or ends with a dash.
    *fallback* is returned when nothing usable is left.
    """
    lowered = value.lower()
    dashed = re.sub(r"[^a-zA-Z0-9\u4e00-\u9fff_-]+", "-", lowered)
    trimmed = dashed.strip("-")
    # Dashes already present in the input can still form runs; collapse them.
    collapsed = re.sub(r"-+", "-", trimmed)
    # Cutting at 100 characters may leave a dash at the end, so strip again.
    capped = collapsed[:100].strip("-")
    if capped:
        return capped
    return fallback
|
|
|
|
|
|
def convert_file_to_markdown(file_path: str | Path) -> str:
    """Convert a local document to Markdown using Microsoft MarkItDown.

    Args:
        file_path: Location of the document to convert.

    Returns:
        The converted Markdown text.

    Raises:
        RuntimeError: If ``markitdown`` cannot be imported, or if the
            conversion yields no non-whitespace text.
    """
    # Imported lazily so the module can be loaded without markitdown installed.
    try:
        from markitdown import MarkItDown
    except ModuleNotFoundError as exc:
        raise RuntimeError("markitdown is not installed. Install with: pip install 'markitdown[all]'") from exc

    source = str(Path(file_path))
    engine = MarkItDown(enable_plugins=False)
    # Use convert_local when the converter object offers it, otherwise convert.
    run = engine.convert_local if hasattr(engine, "convert_local") else engine.convert
    outcome = run(source)

    # A missing or None text_content is treated the same as empty output.
    text = getattr(outcome, "text_content", "") or ""
    if text.strip():
        return text
    raise RuntimeError("Document conversion produced empty Markdown")
|
|
|
|
|
|
def _yaml_quoted(value: str) -> str:
    """Return *value* as a double-quoted, single-line YAML scalar."""
    # Function-scope import keeps this block self-contained.
    import json

    # JSON string escapes (\" \\ \n \t \uXXXX ...) are all valid inside a
    # YAML double-quoted scalar, so json.dumps does the escaping for us.
    return json.dumps(str(value), ensure_ascii=False)


def _yaml_scalar(value: str, *, in_flow: bool = False) -> str:
    """Render *value* as a one-line YAML scalar, quoting only when required.

    Ordinary text is returned unchanged so simple values keep their plain,
    unquoted form. Text that YAML could misread (empty, padded with
    whitespace, starting with an indicator character, containing ``": "``,
    ``" #"`` or control characters) is double-quoted. With ``in_flow=True``
    the flow-collection characters ``, [ ] { }`` also force quoting.
    """
    text = str(value)
    needs_quotes = (
        not text
        or text != text.strip()
        or text[0] in "!&*-?:|>'\"%@`#,[]{}"
        or text.endswith(":")
        or ": " in text
        or " #" in text
        or any(ord(ch) < 0x20 for ch in text)
        or (in_flow and any(ch in ",[]{}" for ch in text))
    )
    return _yaml_quoted(text) if needs_quotes else text


def build_markdown_note(
    *,
    title: str,
    markdown: str,
    source_filename: str,
    tags: list[str],
    knowledge_type: str,
    summary: str | None = None,
) -> str:
    """Assemble a Markdown note with a YAML frontmatter header.

    The frontmatter carries ``title``, ``knowledge_type``,
    ``source_filename``, ``created_at`` (current UTC time, ISO 8601),
    ``tags`` and, when given, ``summary``. It is followed by a ``# title``
    heading and the stripped *markdown* body.

    Frontmatter values are quoted only when plain YAML would misparse
    them, so simple values are emitted unquoted. ``summary`` is always
    double-quoted, with backslashes, quotes and line breaks escaped so it
    stays on one line.

    Args:
        title: Note title, used in the frontmatter and the heading.
        markdown: Note body; surrounding whitespace is removed.
        source_filename: Name of the file the note was derived from.
        tags: Tag names; blank entries are skipped.
        knowledge_type: Value for the ``knowledge_type`` field.
        summary: Optional summary; omitted from the output when falsy.

    Returns:
        The complete note text, ending with a newline.
    """
    tag_text = ", ".join(
        _yaml_scalar(tag, in_flow=True) for tag in tags if str(tag).strip()
    )
    lines = [
        "---",
        f"title: {_yaml_scalar(title)}",
        f"knowledge_type: {_yaml_scalar(knowledge_type)}",
        f"source_filename: {_yaml_scalar(source_filename)}",
        f"created_at: {datetime.now(timezone.utc).isoformat()}",
        f"tags: [{tag_text}]",
    ]
    if summary:
        lines.append(f"summary: {_yaml_quoted(summary)}")
    lines.extend(["---", "", f"# {title}", "", markdown.strip(), ""])
    return "\n".join(lines)
|
|
|
|
|
|
def save_markdown_to_obsidian(
    *,
    vault_path: str | Path,
    relative_dir: str,
    title: str,
    markdown: str,
    source_filename: str,
    tags: list[str],
    knowledge_type: str,
    summary: str | None = None,
) -> Path:
    """Write a note built by ``build_markdown_note`` under a vault directory.

    The note is stored at ``<vault_path>/<relative_dir>/<slug>.md``. The
    slug comes from *title*; if the title yields no slug, the slug of the
    source filename without its last extension is used instead (or of the
    title again when that stem is empty). Missing directories are created.

    NOTE(review): an existing file with the same name is overwritten, and
    *relative_dir* is not checked for ``..`` segments.

    Args:
        vault_path: Root directory of the vault.
        relative_dir: Sub-directory inside the vault; leading and trailing
            ``/`` are ignored.
        title: Note title.
        markdown: Note body.
        source_filename: Name of the file the note was derived from.
        tags: Tag names for the frontmatter.
        knowledge_type: Value for the ``knowledge_type`` field.
        summary: Optional summary for the frontmatter.

    Returns:
        Path of the written note.
    """
    folder = Path(vault_path) / relative_dir.strip("/")
    folder.mkdir(parents=True, exist_ok=True)

    # Filename stem (text before the last dot) backs up an unusable title.
    stem = source_filename.rsplit(".", 1)[0]
    backup_slug = slugify(stem or title)
    destination = folder / f"{slugify(title, backup_slug)}.md"

    content = build_markdown_note(
        title=title,
        markdown=markdown,
        source_filename=source_filename,
        tags=tags,
        knowledge_type=knowledge_type,
        summary=summary,
    )
    destination.write_text(content, encoding="utf-8")
    return destination
|