Initial commit
This commit is contained in:
238
email_dlp/converter.py
Normal file
238
email_dlp/converter.py
Normal file
@ -0,0 +1,238 @@
|
||||
"""Attachment → markdown text conversion routing."""
|
||||
|
||||
import base64
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
MAX_TEXT_CHARS = 20_000
|
||||
|
||||
# Sentinel prefix used to pass image data through the (text, status) interface.
|
||||
# Format: IMAGE_SENTINEL + "<mime_type>:<base64_data>"
|
||||
IMAGE_SENTINEL = "__IMAGE__:"
|
||||
|
||||
_IMAGE_MIME = {
|
||||
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
|
||||
".png": "image/png", ".gif": "image/gif",
|
||||
".bmp": "image/bmp", ".tiff": "image/tiff", ".webp": "image/webp",
|
||||
".img": "image/png", # fallback for generated inline names
|
||||
}
|
||||
|
||||
|
||||
def _convert_single_file(filepath: Path) -> tuple[str, str]:
    """Convert a single file to text. Returns (text, status).

    Routing is decided by the lower-cased suffix of ``filepath``:

    - Image suffixes (keys of ``_IMAGE_MIME``): text is
      IMAGE_SENTINEL + "<mime>:<base64>" and status is "ok:image".
      Callers must check for the sentinel.
    - Source-code / plain-text suffixes: the file is read directly and
      status is "ok".
    - Anything else is handed to MarkItDown (PDF, DOCX, XLSX, CSV, ...).
      If that raises, the file is read as plain text instead and status is
      "fallback: <markitdown error>".

    I/O failures are reported as ("", "failed: <error>") rather than raised.
    """
    suffix = filepath.suffix.lower()

    # Image — return base64 sentinel for VLM consumption
    if suffix in _IMAGE_MIME:
        try:
            raw = filepath.read_bytes()
        except OSError as exc:
            # Report unreadable images through the status, like every other branch.
            return "", f"failed: {exc}"
        b64 = base64.b64encode(raw).decode()
        return IMAGE_SENTINEL + f"{_IMAGE_MIME[suffix]}:{b64}", "ok:image"

    # Extensions that are already plain text and need no conversion step.
    plain_text_exts = {
        ".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
        ".go", ".rb", ".rs", ".sh", ".txt", ".md", ".sql", ".yaml", ".yml",
        ".json", ".xml", ".html", ".htm", ".css",
    }

    if suffix in plain_text_exts:
        # Plain text — read directly. Decode as UTF-8 explicitly so the result
        # does not depend on the host locale; undecodable bytes are replaced.
        try:
            return filepath.read_text(encoding="utf-8", errors="replace"), "ok"
        except Exception as exc:
            return "", f"failed: {exc}"

    # Use markitdown for PDF, DOCX, XLSX, CSV, etc.
    try:
        result = MarkItDown().convert(str(filepath))
        return result.text_content or "", "ok"
    except Exception as convert_exc:
        # Fallback to plain-text read for unknown types; the status keeps the
        # original conversion error so callers can see why the fallback ran.
        try:
            text = filepath.read_text(encoding="utf-8", errors="replace")
        except Exception as read_exc:
            return "", f"failed: {read_exc}"
        return text, f"fallback: {convert_exc}"
|
||||
|
||||
|
||||
# Office document suffix -> directory prefix inside the document's ZIP
# container under which embedded media is looked up.
_OFFICE_MEDIA_DIRS = {
    ".docx": "word/media/",
    ".pptx": "ppt/media/",
    ".xlsx": "xl/media/",
}

# Every suffix that has an image MIME mapping.
_IMAGE_EXTS = set(_IMAGE_MIME)
|
||||
|
||||
|
||||
def _extract_pdf_images(
    filepath: Path, filename: str
) -> list[tuple[str, str, str]]:
    """Extract embedded images from a PDF using PyMuPDF.

    Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image"), where
    display_name is "<filename>/image_<n>.<ext>" and n counts images across
    all pages in iteration order.

    Returns empty list if fitz is not installed or no images found. If
    extraction fails part-way, the images collected so far are returned.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        return []

    results: list[tuple[str, str, str]] = []
    try:
        # Open as a context manager so the document handle is released even
        # when extraction raises part-way through.
        with fitz.open(str(filepath)) as doc:
            for page in doc:
                for img in page.get_images():
                    img_data = doc.extract_image(img[0])  # img[0] is the xref
                    ext = img_data.get("ext", "png")
                    # Unknown extensions fall back to a synthesized image/<ext> type.
                    mime = _IMAGE_MIME.get(f".{ext}", f"image/{ext}")
                    b64 = base64.b64encode(img_data["image"]).decode()
                    # One result per image, so len(results) is the running index.
                    display_name = f"{filename}/image_{len(results)}.{ext}"
                    results.append(
                        (display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image")
                    )
    except Exception:
        # Deliberately best-effort: a broken PDF must not prevent the caller
        # from using the text it already converted.
        pass

    return results
|
||||
|
||||
|
||||
def _extract_office_images(
    filepath: Path, filename: str
) -> list[tuple[str, str, str]]:
    """Pull embedded images out of a DOCX/PPTX/XLSX by reading it as a ZIP.

    The media directory is chosen from the suffix of ``filename``; members
    under it whose suffix has an image MIME mapping are returned in sorted
    member-name order.

    Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image") with
    display_name "<filename>/<member basename>". Returns an empty list for
    unsupported suffixes; a file that is not a valid ZIP yields whatever was
    collected before the error (normally nothing).
    """
    media_prefix = _OFFICE_MEDIA_DIRS.get(Path(filename).suffix.lower())
    if not media_prefix:
        return []

    images: list[tuple[str, str, str]] = []
    try:
        with zipfile.ZipFile(str(filepath), "r") as package:
            for member in sorted(package.namelist()):
                if member.startswith(media_prefix):
                    member_path = Path(member)
                    ext = member_path.suffix.lower()
                    if ext in _IMAGE_EXTS:
                        payload = base64.b64encode(package.read(member)).decode()
                        images.append(
                            (
                                f"{filename}/{member_path.name}",
                                IMAGE_SENTINEL + f"{_IMAGE_MIME[ext]}:{payload}",
                                "ok:image",
                            )
                        )
    except Exception:
        # Best-effort: keep whatever was gathered before the failure.
        pass

    return images
|
||||
|
||||
|
||||
def _convert_7z(
    filepath: Path, archive_name: str
) -> list[tuple[str, str, str]]:
    """Extract a .7z archive and convert each member.

    Returns list of (display_name, text_or_sentinel, status), one entry per member.
    display_name uses "archive.7z/member.ext" format; members in
    sub-directories keep their path inside the archive
    ("archive.7z/dir/member.ext") so same-named files stay distinguishable.

    Non-image text is truncated to MAX_TEXT_CHARS. If py7zr is missing or
    extraction fails, a single (archive_name, "", "failed: ...") entry is
    returned; an archive with no regular files yields (archive_name, "", "skipped").
    """
    try:
        import py7zr
    except ImportError:
        return [(archive_name, "", "failed: py7zr not installed")]

    results: list[tuple[str, str, str]] = []
    with tempfile.TemporaryDirectory(prefix="email_dlp_7z_") as tmpdir:
        tmp = Path(tmpdir)
        # NOTE(review): extraction is not size-capped; an archive bomb could
        # fill the temp directory — consider enforcing a limit.
        try:
            with py7zr.SevenZipFile(str(filepath), mode="r") as archive:
                archive.extractall(path=str(tmp))
        except Exception as e:
            return [(archive_name, "", f"failed: 7z extraction error: {e}")]

        for member_path in sorted(tmp.rglob("*")):
            # is_file() follows symlinks, so skip links explicitly: a link
            # from an untrusted archive must not make us read a host file.
            if member_path.is_symlink() or not member_path.is_file():
                continue
            relative_name = member_path.relative_to(tmp).as_posix()
            display_name = f"{archive_name}/{relative_name}"
            text, status = _convert_single_file(member_path)
            # Image payloads are passed through whole; only text is capped.
            if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
                text = text[:MAX_TEXT_CHARS]
                status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
            results.append((display_name, text, status))

    return results if results else [(archive_name, "", "skipped")]
|
||||
|
||||
|
||||
def _convert_zip(
    filepath: Path, archive_name: str
) -> list[tuple[str, str, str]]:
    """Extract a .zip archive and convert each member.

    Returns list of (display_name, text_or_sentinel, status), one entry per member.
    display_name uses "archive.zip/member.ext" format; members in
    sub-directories keep their path inside the archive
    ("archive.zip/dir/member.ext") so same-named files stay distinguishable.

    Non-image text is truncated to MAX_TEXT_CHARS. If extraction fails, a
    single (archive_name, "", "failed: zip extraction error: ...") entry is
    returned; an archive with no regular files yields (archive_name, "", "skipped").
    """
    results: list[tuple[str, str, str]] = []
    with tempfile.TemporaryDirectory(prefix="email_dlp_zip_") as tmpdir:
        tmp = Path(tmpdir)
        # NOTE(review): extraction is not size-capped; a zip bomb could fill
        # the temp directory — consider enforcing a limit.
        try:
            with zipfile.ZipFile(str(filepath), mode="r") as archive:
                archive.extractall(path=str(tmp))
        except Exception as e:
            return [(archive_name, "", f"failed: zip extraction error: {e}")]

        for member_path in sorted(tmp.rglob("*")):
            # Defensive: never follow a symlink out of the extraction directory.
            if member_path.is_symlink() or not member_path.is_file():
                continue
            relative_name = member_path.relative_to(tmp).as_posix()
            display_name = f"{archive_name}/{relative_name}"
            text, status = _convert_single_file(member_path)
            # Image payloads are passed through whole; only text is capped.
            if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
                text = text[:MAX_TEXT_CHARS]
                status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
            results.append((display_name, text, status))

    return results if results else [(archive_name, "", "skipped")]
|
||||
|
||||
|
||||
def convert_attachment(
    filepath: Path, filename: str
) -> list[tuple[str, str, str]]:
    """Turn one attachment into entries ready for LLM analysis.

    Routing uses the lower-cased suffix of ``filename``; content is read
    from ``filepath``.

    Returns list of (display_name, text_or_sentinel, status).
    - .7z / .zip archives: whatever the archive converter returns (one
      element per member file inside the archive).
    - Other files: one element for the file itself, followed by one element
      per embedded image for .pdf and for the Office suffixes listed in
      _OFFICE_MEDIA_DIRS.

    text_or_sentinel is either plain text or IMAGE_SENTINEL + "<mime>:<base64>"
    for image files. Text is truncated to MAX_TEXT_CHARS (images are not truncated).
    """
    suffix = Path(filename).suffix.lower()

    # Archives are expanded member by member and returned as-is.
    if suffix == ".7z":
        return _convert_7z(filepath, filename)
    if suffix == ".zip":
        return _convert_zip(filepath, filename)

    text, status = _convert_single_file(filepath)
    is_image = text.startswith(IMAGE_SENTINEL)
    if not is_image and len(text) > MAX_TEXT_CHARS:
        text = text[:MAX_TEXT_CHARS]
        status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"

    entries = [(filename, text, status)]

    # PDF and Office containers may carry embedded images; append them too.
    if suffix == ".pdf":
        entries.extend(_extract_pdf_images(filepath, filename))
    elif suffix in _OFFICE_MEDIA_DIRS:
        entries.extend(_extract_office_images(filepath, filename))

    return entries
|
||||
Reference in New Issue
Block a user