Initial commit

2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions
--- a/email_dlp/init.py
+++ b/email_dlp/init.py
@ -0,0 +1 @@
+# Email DLP - Data Loss Prevention for email content
--- a/email_dlp/analyzer.py
+++ b/email_dlp/analyzer.py
@ -0,0 +1,241 @@
+"""DLP analysis backends: remote LLM or deterministic local simulation."""
+
+import json
+import os
+import re
+from typing import Any
+
+import openai
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from .converter import IMAGE_SENTINEL
+from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
+from .policy import format_policy_for_prompt
+from .simulator import simulate_analysis
+
+OUTPUT_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "risk_level": {"type": "string", "enum": ["CRITICAL", "HIGH", "MEDIUM", "LOW"]},
+        "risk_score": {"type": "integer", "minimum": 0, "maximum": 100},
+        "violation_types": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "enum": [
+                    "PII",
+                    "FINANCIAL_DATA",
+                    "SOURCE_CODE",
+                    "REGULATORY_DOCUMENT",
+                    "LEGAL_CONTRACT",
+                    "PAYROLL_RECORD",
+                    "CUSTOMER_LIST",
+                    "INTERNAL_MEMO",
+                    "NONE",
+                ],
+            },
+        },
+        "action": {"type": "string", "enum": ["PASS", "ALERT", "BLOCK"]},
+        "summary": {"type": "string"},
+        "evidence": {"type": "array", "items": {"type": "string"}},
+    },
+    "required": ["risk_level", "risk_score", "violation_types", "action", "summary", "evidence"],
+}
+
+SYSTEM_PROMPT_TEMPLATE = """\
+You are a Data Loss Prevention (DLP) analyst. Your task is to evaluate email content and attachments against the DLP policy below, then return a structured JSON decision.
+
+## DLP Policy
+{policy_json}
+
+## Output Schema
+Respond with valid JSON only (no markdown fences, no extra text) matching this schema:
+{schema_json}
+
+## Critical Rules
+1. temperature=0: be deterministic and consistent.
+2. evidence: include direct quotes (verbatim excerpts) from the actual email or attachment content that justify your decision. Do not paraphrase.
+3. risk_score: assign 0-100. Use the full range — a customer CSV with 500 rows should score 95+, a casual internal memo scores 60-70.
+4. action: MUST match the threshold — BLOCK if risk_score>=80, ALERT if risk_score>=40, PASS otherwise.
+5. If no policy violations are found, set violation_types=["NONE"], risk_level="LOW", risk_score<40, action="PASS".
+"""
+
+
+def build_system_prompt() -> str:
+    """Build the system prompt used for DLP analysis."""
+    return SYSTEM_PROMPT_TEMPLATE.format(
+        policy_json=format_policy_for_prompt(),
+        schema_json=json.dumps(OUTPUT_SCHEMA, indent=2),
+    )
+
+
+def _build_user_content(
+    subject: str,
+    sender: str,
+    recipient: str,
+    date: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],  # [(filename, text_or_sentinel)]
+) -> list[dict[str, Any]]:
+    """Build the user message content for the VLM.
+
+    Returns a multimodal content list (OpenAI vision format).
+    Text attachments are embedded as text blocks; image attachments are
+    inserted as image_url blocks so the VLM can see them directly.
+    """
+    header_block = "\n".join([
+        "## Email Headers",
+        f"Subject: {subject}",
+        f"From: {sender}",
+        f"To: {recipient}",
+        f"Date: {date}",
+        "",
+        "## Email Body",
+        body_text or "(empty body)",
+    ])
+    content: list[dict[str, Any]] = [{"type": "text", "text": header_block}]
+
+    for filename, text in attachment_texts:
+        if text.startswith(IMAGE_SENTINEL):
+            # IMAGE_SENTINEL format: "__IMAGE__:<mime>:<base64>"
+            payload = text[len(IMAGE_SENTINEL):]           # "<mime>:<base64>"
+            mime, b64 = payload.split(":", 1)
+            content.append({"type": "text", "text": f"\n## Attachment: {filename} (image)"})
+            content.append({
+                "type": "image_url",
+                "image_url": {"url": f"data:{mime};base64,{b64}"},
+            })
+        else:
+            content.append({
+                "type": "text",
+                "text": f"\n## Attachment: {filename}\n{text or '(no extractable text)'}",
+            })
+
+    return content
+
+
+def _parse_llm_response(content: str) -> dict[str, Any]:
+    """Parse JSON from LLM response, handling markdown fences."""
+    content = content.strip()
+
+    # Strip triple-backtick fences
+    fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
+    if fence_match:
+        content = fence_match.group(1)
+
+    return json.loads(content)
+
+
+def _map_action(action_str: str) -> ActionClass:
+    mapping = {"PASS": ActionClass.PASS_, "ALERT": ActionClass.ALERT, "BLOCK": ActionClass.BLOCK}
+    return mapping.get(action_str.upper(), ActionClass.ALERT)
+
+
+def analyze_email(
+    email_file: str,
+    subject: str,
+    sender: str,
+    recipient: str,
+    date: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+    attachment_results: list[AttachmentResult],
+    processing_errors: list[str],
+    endpoint: str = "http://localhost:8000/v1",
+    model: str = "Qwen/Qwen3.5-35B-A3B",
+    backend: str = "llm",
+) -> DLPResult:
+    """Analyze email content using an LLM or the deterministic simulator."""
+    if backend == "simulated":
+        return simulate_analysis(
+            email_file=email_file,
+            subject=subject,
+            sender=sender,
+            recipient=recipient,
+            date=date,
+            body_text=body_text,
+            attachment_texts=attachment_texts,
+            attachment_results=attachment_results,
+            processing_errors=processing_errors,
+        )
+
+    # Use environment variables as fallback if they exist
+    final_endpoint = os.getenv("OPENAI_BASE_URL", endpoint)
+    final_api_key = os.getenv("OPENAI_API_KEY", "not-needed")
+    final_model = os.getenv("MODEL_NAME", model)
+
+    client = openai.OpenAI(base_url=final_endpoint, api_key=final_api_key)
+
+    system_prompt = build_system_prompt()
+
+    user_content = _build_user_content(
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        date=date,
+        body_text=body_text,
+        attachment_texts=attachment_texts,
+    )
+
+    response = client.chat.completions.create(
+        model=final_model,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_content},  # list[dict] for multimodal
+        ],
+        temperature=0.0,
+        max_tokens=1024,
+        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
+    )
+
+    raw_content = response.choices[0].message.content or ""
+
+    try:
+        parsed = _parse_llm_response(raw_content)
+    except json.JSONDecodeError as e:
+        processing_errors.append(f"JSON parse error: {e}; raw={raw_content[:200]}")
+        # Return a safe fallback
+        return DLPResult(
+            email_file=email_file,
+            subject=subject,
+            sender=sender,
+            recipient=recipient,
+            date=date,
+            risk_level=RiskLevel.HIGH,
+            risk_score=60,
+            violation_types=[ViolationType.NONE],
+            action=ActionClass.ALERT,
+            summary="Analysis failed due to JSON parse error.",
+            evidence=[],
+            attachments=attachment_results,
+            processing_errors=processing_errors,
+        )
+
+    # Map string values to enums
+    risk_level = RiskLevel(parsed.get("risk_level", "HIGH"))
+    violation_types = [
+        ViolationType(v) for v in parsed.get("violation_types", ["NONE"])
+        if v in ViolationType.__members__
+    ]
+    if not violation_types:
+        violation_types = [ViolationType.NONE]
+
+    action = _map_action(parsed.get("action", "ALERT"))
+
+    return DLPResult(
+        email_file=email_file,
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        date=date,
+        risk_level=risk_level,
+        risk_score=int(parsed.get("risk_score", 60)),
+        violation_types=violation_types,
+        action=action,
+        summary=parsed.get("summary", ""),
+        evidence=parsed.get("evidence", []),
+        attachments=attachment_results,
+        processing_errors=processing_errors,
+    )
--- a/email_dlp/cli.py
+++ b/email_dlp/cli.py
@ -0,0 +1,550 @@
+"""CLI entry point: preview and batch process .eml files through the DLP pipeline."""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import typer
+from rich.console import Console
+from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
+from rich.table import Table
+
+from .analyzer import _build_user_content, analyze_email, build_system_prompt
+from .converter import IMAGE_SENTINEL, convert_attachment
+from .models import ActionClass, AttachmentResult, DLPResult
+from .parser import parse_eml
+from .policy_reviewer import review_corpus
+from .simulator import simulate_analysis
+
+app = typer.Typer(help="Email DLP — scan .eml files for data loss prevention policy violations.")
+console = Console()
+
+ACTION_COLORS = {
+    ActionClass.BLOCK: "bold red",
+    ActionClass.ALERT: "bold yellow",
+    ActionClass.PASS_: "bold green",
+}
+
+RISK_COLORS = {
+    "CRITICAL": "bold red",
+    "HIGH": "red",
+    "MEDIUM": "yellow",
+    "LOW": "green",
+}
+
+
+def _process_single_email(
+    eml_path: Path,
+    endpoint: str,
+    model: str,
+    backend: str = "llm",
+) -> DLPResult:
+    """Parse, convert, and analyze one .eml file."""
+    processing_errors: list[str] = []
+
+    # 1. Parse MIME
+    parsed = parse_eml(eml_path)
+
+    try:
+        # 2. Convert attachments
+        attachment_texts: list[tuple[str, str]] = []
+        attachment_results: list[AttachmentResult] = []
+
+        for att in parsed.attachments:
+            entries = convert_attachment(att.path, att.filename)
+            for display_name, text, status in entries:
+                if "truncated" in status:
+                    processing_errors.append(
+                        f"'{display_name}' truncated to 20000 chars"
+                    )
+                attachment_texts.append((display_name, text))
+                attachment_results.append(
+                    AttachmentResult(
+                        filename=display_name,
+                        content_type=att.content_type,
+                        extracted_text_chars=0 if text.startswith(IMAGE_SENTINEL) else len(text),
+                        conversion_status=status.split("|")[0],
+                    )
+                )
+
+        # 3. Analyze with LLM
+        result = analyze_email(
+            email_file=eml_path.name,
+            subject=parsed.subject,
+            sender=parsed.sender,
+            recipient=parsed.recipient,
+            date=parsed.date,
+            body_text=parsed.body_text,
+            attachment_texts=attachment_texts,
+            attachment_results=attachment_results,
+            processing_errors=processing_errors,
+            endpoint=endpoint,
+            model=model,
+            backend=backend,
+        )
+    finally:
+        parsed.cleanup()
+
+    return result
+
+
+def _simulate_single_email(eml_path: Path) -> DLPResult:
+    """Parse, convert, and simulate one .eml file without an LLM."""
+    processing_errors: list[str] = []
+    parsed = parse_eml(eml_path)
+
+    try:
+        attachment_texts: list[tuple[str, str]] = []
+        attachment_results: list[AttachmentResult] = []
+
+        for att in parsed.attachments:
+            entries = convert_attachment(att.path, att.filename)
+            for display_name, text, status in entries:
+                if "truncated" in status:
+                    processing_errors.append(
+                        f"'{display_name}' truncated to 20000 chars"
+                    )
+                attachment_texts.append((display_name, text))
+                attachment_results.append(
+                    AttachmentResult(
+                        filename=display_name,
+                        content_type=att.content_type,
+                        extracted_text_chars=0 if text.startswith(IMAGE_SENTINEL) else len(text),
+                        conversion_status=status.split("|")[0],
+                    )
+                )
+
+        result = simulate_analysis(
+            email_file=eml_path.name,
+            subject=parsed.subject,
+            sender=parsed.sender,
+            recipient=parsed.recipient,
+            date=parsed.date,
+            body_text=parsed.body_text,
+            attachment_texts=attachment_texts,
+            attachment_results=attachment_results,
+            processing_errors=processing_errors,
+        )
+    finally:
+        parsed.cleanup()
+
+    return result
+
+
+def _preview_single_email(
+    eml_path: Path,
+    include_system_prompt: bool = True,
+    include_full_prompt: bool = False,
+) -> dict:
+    """Parse and convert one .eml file without calling the LLM."""
+    parsed = parse_eml(eml_path)
+
+    try:
+        attachments_preview: list[dict[str, object]] = []
+        attachment_texts: list[tuple[str, str]] = []
+        processing_errors: list[str] = []
+
+        for att in parsed.attachments:
+            entries = convert_attachment(att.path, att.filename)
+            for display_name, text, status in entries:
+                is_image = text.startswith(IMAGE_SENTINEL)
+                if "truncated" in status:
+                    processing_errors.append(
+                        f"'{display_name}' truncated to 20000 chars"
+                    )
+
+                attachment_texts.append((display_name, text))
+                attachments_preview.append(
+                    {
+                        "filename": display_name,
+                        "content_type": att.content_type,
+                        "conversion_status": status,
+                        "is_image": is_image,
+                        "extracted_text_chars": 0 if is_image else len(text),
+                        "text_preview": (
+                            None
+                            if is_image
+                            else text[:500]
+                        ),
+                        "image_data_url_preview": (
+                            None
+                            if not is_image
+                            else text[:120] + "..."
+                            if len(text) > 120
+                            else text
+                        ),
+                    }
+                )
+
+        llm_user_content = _build_user_content(
+            subject=parsed.subject,
+            sender=parsed.sender,
+            recipient=parsed.recipient,
+            date=parsed.date,
+            body_text=parsed.body_text,
+            attachment_texts=attachment_texts,
+        )
+
+        llm_user_content_preview: list[dict[str, object]] = []
+        for block in llm_user_content:
+            if block["type"] == "text":
+                llm_user_content_preview.append(
+                    {
+                        "type": "text",
+                        "text_preview": str(block["text"])[:1000],
+                        "text_chars": len(str(block["text"])),
+                    }
+                )
+            else:
+                url = str(block["image_url"]["url"])
+                llm_user_content_preview.append(
+                    {
+                        "type": "image_url",
+                        "url_preview": url[:120] + ("..." if len(url) > 120 else ""),
+                        "url_chars": len(url),
+                    }
+                )
+
+        preview_result = {
+            "email_file": eml_path.name,
+            "subject": parsed.subject,
+            "sender": parsed.sender,
+            "recipient": parsed.recipient,
+            "date": parsed.date,
+            "body_text_chars": len(parsed.body_text),
+            "body_text_preview": parsed.body_text[:1000],
+            "attachment_count": len(parsed.attachments),
+            "attachments": attachments_preview,
+            "processing_errors": processing_errors,
+            "llm_user_content_preview": llm_user_content_preview,
+        }
+
+        if include_system_prompt:
+            system_prompt = build_system_prompt()
+            preview_result["llm_system_prompt_preview"] = {
+                "text_preview": system_prompt[:2000],
+                "text_chars": len(system_prompt),
+            }
+            if include_full_prompt:
+                preview_result["llm_system_prompt"] = system_prompt
+
+        if include_full_prompt:
+            preview_result["llm_user_content"] = llm_user_content
+
+        return preview_result
+    finally:
+        parsed.cleanup()
+
+
+@app.command()
+def analyze(
+    input_dir: Path = typer.Option(
+        Path("data"),
+        "--input",
+        "-i",
+        help="Directory containing .eml files",
+    ),
+    output_dir: Optional[Path] = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Directory to write JSON results (defaults to output/analyze-TIMESTAMP)",
+    ),
+    endpoint: str = typer.Option(
+        "http://localhost:8000/v1",
+        "--endpoint",
+        help="vLLM OpenAI-compatible endpoint",
+    ),
+    model: str = typer.Option(
+        "Qwen/Qwen3.5-35B-A3B",
+        "--model",
+        help="Model name to use for analysis",
+    ),
+    backend: str = typer.Option(
+        "llm",
+        "--backend",
+        help="Analysis backend: 'llm' for API calls, 'simulated' for local deterministic analysis",
+    ),
+    summary: bool = typer.Option(
+        False,
+        "--summary",
+        "-s",
+        help="Print a summary table after processing",
+    ),
+) -> None:
+    """Batch analyze all .eml files in INPUT_DIR and write JSON results to OUTPUT_DIR."""
+    if output_dir is None:
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        output_dir = Path("output") / f"analyze-{timestamp}"
+
+    eml_files = sorted(input_dir.glob("*.eml"))
+    if not eml_files:
+        console.print(f"[red]No .eml files found in {input_dir}[/red]")
+        raise typer.Exit(1)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    results: list[DLPResult] = []
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("{task.completed}/{task.total}"),
+        TimeElapsedColumn(),
+        console=console,
+    ) as progress:
+        task = progress.add_task("Analyzing emails...", total=len(eml_files))
+
+        for eml_path in eml_files:
+            progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
+            try:
+                result = _process_single_email(eml_path, endpoint, model, backend=backend)
+            except Exception as e:
+                console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
+                progress.advance(task)
+                continue
+
+            # Write individual JSON result
+            out_file = output_dir / (eml_path.stem + ".json")
+            out_file.write_text(result.model_dump_json(indent=2))
+            results.append(result)
+            progress.advance(task)
+
+    # Write batch summary
+    batch_summary = {
+        "total": len(results),
+        "by_action": {
+            "BLOCK": sum(1 for r in results if r.action == ActionClass.BLOCK),
+            "ALERT": sum(1 for r in results if r.action == ActionClass.ALERT),
+            "PASS": sum(1 for r in results if r.action == ActionClass.PASS_),
+        },
+        "by_risk": {
+            level: sum(1 for r in results if r.risk_level.value == level)
+            for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
+        },
+        "emails": [
+            {
+                "file": r.email_file,
+                "subject": r.subject,
+                "risk_level": r.risk_level.value,
+                "risk_score": r.risk_score,
+                "action": r.action.value,
+                "violation_types": [v.value for v in r.violation_types],
+            }
+            for r in results
+        ],
+    }
+    (output_dir / "batch_summary.json").write_text(
+        json.dumps(batch_summary, indent=2)
+    )
+
+    console.print(f"\n[bold green]Done![/bold green] Processed {len(results)}/{len(eml_files)} emails.")
+    console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
+
+    if summary and results:
+        _print_summary_table(results)
+
+
+@app.command()
+def preview(
+    input_dir: Path = typer.Option(
+        Path("data"),
+        "--input",
+        "-i",
+        help="Directory containing .eml files",
+    ),
+    output_dir: Optional[Path] = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Optional directory to write preview JSON files",
+    ),
+    print_json: bool = typer.Option(
+        False,
+        "--print-json",
+        help="Print preview JSON to stdout",
+    ),
+    include_system_prompt: bool = typer.Option(
+        True,
+        "--include-system-prompt/--no-system-prompt",
+        help="Include the analyzer system prompt built from policy.py",
+    ),
+    include_full_prompt: bool = typer.Option(
+        False,
+        "--include-full-prompt",
+        help="Include the full system prompt and full user content in JSON output",
+    ),
+) -> None:
+    """Preview parsed email and converted attachment content before LLM analysis."""
+    eml_files = sorted(input_dir.glob("*.eml"))
+    if not eml_files:
+        console.print(f"[red]No .eml files found in {input_dir}[/red]")
+        raise typer.Exit(1)
+
+    if output_dir is not None:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    previews: list[dict] = []
+
+    for eml_path in eml_files:
+        try:
+            preview_result = _preview_single_email(
+                eml_path,
+                include_system_prompt=include_system_prompt,
+                include_full_prompt=include_full_prompt,
+            )
+        except Exception as e:
+            console.print(f"[red]Error previewing {eml_path.name}: {e}[/red]")
+            continue
+
+        previews.append(preview_result)
+
+        if output_dir is not None:
+            out_file = output_dir / f"{eml_path.stem}.preview.json"
+            out_file.write_text(json.dumps(preview_result, indent=2))
+
+    if output_dir is not None:
+        batch_file = output_dir / "batch_preview.json"
+        batch_file.write_text(json.dumps(previews, indent=2))
+        console.print(f"[green]Preview JSON written to[/green] [cyan]{output_dir}[/cyan]")
+
+    if print_json:
+        console.print_json(json.dumps(previews, indent=2))
+    else:
+        table = Table(title="Email Preview Results", show_lines=True)
+        table.add_column("File", style="dim", max_width=45)
+        table.add_column("Body Chars", justify="right")
+        table.add_column("Attachments", justify="right")
+        table.add_column("Errors", justify="right")
+
+        for preview_result in previews:
+            table.add_row(
+                str(preview_result["email_file"]),
+                str(preview_result["body_text_chars"]),
+                str(preview_result["attachment_count"]),
+                str(len(preview_result["processing_errors"])),
+            )
+
+        console.print(table)
+
+
+@app.command()
+def simulate(
+    input_dir: Path = typer.Option(
+        Path("data"),
+        "--input",
+        "-i",
+        help="Directory containing .eml files",
+    ),
+    output_dir: Optional[Path] = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Directory to write simulated JSON results (defaults to output/simulated-TIMESTAMP)",
+    ),
+    summary: bool = typer.Option(
+        True,
+        "--summary/--no-summary",
+        help="Print a summary table after processing",
+    ),
+) -> None:
+    """Batch simulate DLP analysis locally without calling an LLM."""
+    if output_dir is None:
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        output_dir = Path("output") / f"simulated-{timestamp}"
+
+    eml_files = sorted(input_dir.glob("*.eml"))
+    if not eml_files:
+        console.print(f"[red]No .eml files found in {input_dir}[/red]")
+        raise typer.Exit(1)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    results: list[DLPResult] = []
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("{task.completed}/{task.total}"),
+        TimeElapsedColumn(),
+        console=console,
+    ) as progress:
+        task = progress.add_task("Simulating email analysis...", total=len(eml_files))
+
+        for eml_path in eml_files:
+            progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
+            try:
+                result = _simulate_single_email(eml_path)
+            except Exception as e:
+                console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
+                progress.advance(task)
+                continue
+
+            out_file = output_dir / (eml_path.stem + ".json")
+            out_file.write_text(result.model_dump_json(indent=2))
+            results.append(result)
+            progress.advance(task)
+
+    batch_summary = {
+        "total": len(results),
+        "generator": "local-simulator",
+        "model_label": "gpt-5.4-simulated",
+        "by_action": {
+            "BLOCK": sum(1 for r in results if r.action == ActionClass.BLOCK),
+            "ALERT": sum(1 for r in results if r.action == ActionClass.ALERT),
+            "PASS": sum(1 for r in results if r.action == ActionClass.PASS_),
+        },
+        "by_risk": {
+            level: sum(1 for r in results if r.risk_level.value == level)
+            for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
+        },
+        "emails": [
+            {
+                "file": r.email_file,
+                "subject": r.subject,
+                "risk_level": r.risk_level.value,
+                "risk_score": r.risk_score,
+                "action": r.action.value,
+                "violation_types": [v.value for v in r.violation_types],
+            }
+            for r in results
+        ],
+    }
+    (output_dir / "batch_summary.json").write_text(
+        json.dumps(batch_summary, indent=2)
+    )
+
+    console.print(
+        f"\n[bold green]Done![/bold green] Simulated {len(results)}/{len(eml_files)} emails."
+    )
+    console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
+
+    if summary and results:
+        _print_summary_table(results)
+
+
+def _print_summary_table(results: list[DLPResult]) -> None:
+    """Print a rich summary table to the console."""
+    table = Table(title="Email DLP Analysis Results", show_lines=True)
+    table.add_column("File", style="dim", max_width=45)
+    table.add_column("Risk Level", justify="center")
+    table.add_column("Score", justify="center")
+    table.add_column("Action", justify="center")
+    table.add_column("Violations", max_width=40)
+
+    for r in results:
+        risk_color = RISK_COLORS.get(r.risk_level.value, "white")
+        action_color = ACTION_COLORS.get(r.action, "white")
+        violations = ", ".join(v.value for v in r.violation_types)
+
+        table.add_row(
+            r.email_file,
+            f"[{risk_color}]{r.risk_level.value}[/{risk_color}]",
+            str(r.risk_score),
+            f"[{action_color}]{r.action.value}[/{action_color}]",
+            violations,
+        )
+
+    console.print(table)
--- a/email_dlp/converter.py
+++ b/email_dlp/converter.py
@ -0,0 +1,238 @@
+"""Attachment → markdown text conversion routing."""
+
+import base64
+import tempfile
+import zipfile
+from pathlib import Path
+
+from markitdown import MarkItDown
+
+MAX_TEXT_CHARS = 20_000
+
+# Sentinel prefix used to pass image data through the (text, status) interface.
+# Format: IMAGE_SENTINEL + "<mime_type>:<base64_data>"
+IMAGE_SENTINEL = "__IMAGE__:"
+
+_IMAGE_MIME = {
+    ".jpg": "image/jpeg", ".jpeg": "image/jpeg",
+    ".png": "image/png",  ".gif": "image/gif",
+    ".bmp": "image/bmp",  ".tiff": "image/tiff", ".webp": "image/webp",
+    ".img": "image/png",  # fallback for generated inline names
+}
+
+
+def _convert_single_file(filepath: Path) -> tuple[str, str]:
+    """Convert a single file to text. Returns (text, status).
+
+    For image files, text is IMAGE_SENTINEL + "<mime>:<base64>" and
+    status is "ok:image". Callers must check for the sentinel.
+    """
+    suffix = filepath.suffix.lower()
+
+    # Image — return base64 sentinel for VLM consumption
+    if suffix in _IMAGE_MIME:
+        mime = _IMAGE_MIME[suffix]
+        b64 = base64.b64encode(filepath.read_bytes()).decode()
+        return IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"
+
+    known_binary_exts = {
+        ".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
+        ".go", ".rb", ".rs", ".sh", ".txt", ".md", ".sql", ".yaml", ".yml",
+        ".json", ".xml", ".html", ".htm", ".css",
+    }
+
+    if suffix in known_binary_exts:
+        # Plain text fallback — read directly
+        try:
+            text = filepath.read_text(errors="replace")
+            return text, "ok"
+        except Exception as e:
+            return "", f"failed: {e}"
+
+    # Use markitdown for PDF, DOCX, XLSX, CSV, etc.
+    try:
+        md = MarkItDown()
+        result = md.convert(str(filepath))
+        return result.text_content or "", "ok"
+    except Exception as e:
+        # Fallback to plain-text read for unknown types
+        try:
+            text = filepath.read_text(errors="replace")
+            return text, f"fallback: {e}"
+        except Exception as e2:
+            return "", f"failed: {e2}"
+
+
+_OFFICE_MEDIA_DIRS = {
+    ".docx": "word/media/",
+    ".pptx": "ppt/media/",
+    ".xlsx": "xl/media/",
+}
+
+_IMAGE_EXTS = set(_IMAGE_MIME.keys())
+
+
+def _extract_pdf_images(
+    filepath: Path, filename: str
+) -> list[tuple[str, str, str]]:
+    """Extract embedded images from a PDF using PyMuPDF.
+
+    Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image").
+    Returns empty list if fitz is not installed or no images found.
+    """
+    try:
+        import fitz  # PyMuPDF
+    except ImportError:
+        return []
+
+    results: list[tuple[str, str, str]] = []
+    try:
+        doc = fitz.open(str(filepath))
+        img_index = 0
+        for page in doc:
+            for img in page.get_images():
+                xref = img[0]
+                img_data = doc.extract_image(xref)
+                ext = img_data.get("ext", "png")
+                mime = _IMAGE_MIME.get(f".{ext}", f"image/{ext}")
+                b64 = base64.b64encode(img_data["image"]).decode()
+                display_name = f"{filename}/image_{img_index}.{ext}"
+                results.append((display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"))
+                img_index += 1
+    except Exception:
+        pass
+
+    return results
+
+
+def _extract_office_images(
+    filepath: Path, filename: str
+) -> list[tuple[str, str, str]]:
+    """Extract embedded images from a DOCX/PPTX/XLSX using zipfile.
+
+    Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image").
+    Returns empty list if the file is not a valid ZIP or has no images.
+    """
+    suffix = Path(filename).suffix.lower()
+    media_dir = _OFFICE_MEDIA_DIRS.get(suffix)
+    if not media_dir:
+        return []
+
+    results: list[tuple[str, str, str]] = []
+    try:
+        with zipfile.ZipFile(str(filepath), "r") as zf:
+            for name in sorted(zf.namelist()):
+                if not name.startswith(media_dir):
+                    continue
+                member_suffix = Path(name).suffix.lower()
+                if member_suffix not in _IMAGE_EXTS:
+                    continue
+                mime = _IMAGE_MIME[member_suffix]
+                b64 = base64.b64encode(zf.read(name)).decode()
+                display_name = f"{filename}/{Path(name).name}"
+                results.append((display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"))
+    except Exception:
+        pass
+
+    return results
+
+
+def _convert_7z(
+    filepath: Path, archive_name: str
+) -> list[tuple[str, str, str]]:
+    """Extract a .7z archive and convert each member.
+
+    Returns list of (display_name, text_or_sentinel, status), one entry per member.
+    display_name uses "archive.7z/member.ext" format.
+    """
+    try:
+        import py7zr
+    except ImportError:
+        return [(archive_name, "", "failed: py7zr not installed")]
+
+    results: list[tuple[str, str, str]] = []
+    with tempfile.TemporaryDirectory(prefix="email_dlp_7z_") as tmpdir:
+        tmp = Path(tmpdir)
+        try:
+            with py7zr.SevenZipFile(str(filepath), mode="r") as archive:
+                archive.extractall(path=str(tmp))
+        except Exception as e:
+            return [(archive_name, "", f"failed: 7z extraction error: {e}")]
+
+        for member_path in sorted(tmp.rglob("*")):
+            if not member_path.is_file():
+                continue
+            display_name = f"{archive_name}/{member_path.name}"
+            text, status = _convert_single_file(member_path)
+            if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
+                text = text[:MAX_TEXT_CHARS]
+                status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
+            results.append((display_name, text, status))
+
+    return results if results else [(archive_name, "", "skipped")]
+
+
+def _convert_zip(
+    filepath: Path, archive_name: str
+) -> list[tuple[str, str, str]]:
+    """Extract a .zip archive and convert each member.
+
+    Returns list of (display_name, text_or_sentinel, status), one entry per member.
+    display_name uses "archive.zip/member.ext" format.
+    """
+    import zipfile
+
+    results: list[tuple[str, str, str]] = []
+    with tempfile.TemporaryDirectory(prefix="email_dlp_zip_") as tmpdir:
+        tmp = Path(tmpdir)
+        try:
+            with zipfile.ZipFile(str(filepath), mode="r") as archive:
+                archive.extractall(path=str(tmp))
+        except Exception as e:
+            return [(archive_name, "", f"failed: zip extraction error: {e}")]
+
+        for member_path in sorted(tmp.rglob("*")):
+            if not member_path.is_file():
+                continue
+            display_name = f"{archive_name}/{member_path.name}"
+            text, status = _convert_single_file(member_path)
+            if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
+                text = text[:MAX_TEXT_CHARS]
+                status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
+            results.append((display_name, text, status))
+
+    return results if results else [(archive_name, "", "skipped")]
+
+
+def convert_attachment(
+    filepath: Path, filename: str
+) -> list[tuple[str, str, str]]:
+    """Convert an attachment file for LLM analysis.
+
+    Returns list of (display_name, text_or_sentinel, status).
+    - Non-archive files: single-element list.
+    - .7z archives: one element per member file inside the archive.
+
+    text_or_sentinel is either plain text or IMAGE_SENTINEL + "<mime>:<base64>"
+    for image files. Text is truncated to MAX_TEXT_CHARS (images are not truncated).
+    """
+    suffix = Path(filename).suffix.lower()
+
+    if suffix == ".7z":
+        return _convert_7z(filepath, filename)
+    elif suffix == ".zip":
+        return _convert_zip(filepath, filename)
+
+    text, status = _convert_single_file(filepath)
+    if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
+        text = text[:MAX_TEXT_CHARS]
+        status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
+    results = [(filename, text, status)]
+
+    # For PDF and Office files, also extract embedded images
+    if suffix == ".pdf":
+        results.extend(_extract_pdf_images(filepath, filename))
+    elif suffix in _OFFICE_MEDIA_DIRS:
+        results.extend(_extract_office_images(filepath, filename))
+
+    return results
--- a/email_dlp/models.py
+++ b/email_dlp/models.py
@ -0,0 +1,52 @@
+"""Pydantic output schema for DLP analysis results."""
+
+from enum import Enum
+from pydantic import BaseModel, Field
+
+
+class RiskLevel(str, Enum):
+    CRITICAL = "CRITICAL"
+    HIGH = "HIGH"
+    MEDIUM = "MEDIUM"
+    LOW = "LOW"
+
+
+class ViolationType(str, Enum):
+    PII = "PII"
+    FINANCIAL_DATA = "FINANCIAL_DATA"
+    SOURCE_CODE = "SOURCE_CODE"
+    REGULATORY_DOCUMENT = "REGULATORY_DOCUMENT"
+    LEGAL_CONTRACT = "LEGAL_CONTRACT"
+    PAYROLL_RECORD = "PAYROLL_RECORD"
+    CUSTOMER_LIST = "CUSTOMER_LIST"
+    INTERNAL_MEMO = "INTERNAL_MEMO"
+    NONE = "NONE"
+
+
+class ActionClass(str, Enum):
+    PASS_ = "PASS"
+    ALERT = "ALERT"
+    BLOCK = "BLOCK"
+
+
+class AttachmentResult(BaseModel):
+    filename: str
+    content_type: str
+    extracted_text_chars: int = 0
+    conversion_status: str = "ok"  # "ok" | "failed" | "skipped"
+
+
+class DLPResult(BaseModel):
+    email_file: str
+    subject: str
+    sender: str
+    recipient: str
+    date: str
+    risk_level: RiskLevel
+    risk_score: int = Field(ge=0, le=100)
+    violation_types: list[ViolationType]
+    action: ActionClass
+    summary: str
+    evidence: list[str]
+    attachments: list[AttachmentResult]
+    processing_errors: list[str] = Field(default_factory=list)
--- a/email_dlp/parser.py
+++ b/email_dlp/parser.py
@ -0,0 +1,199 @@
+"""MIME email parsing: extract headers, body text, and attachments."""
+
+import email
+import email.policy
+import tempfile
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+
+@dataclass
+class ParsedAttachment:
+    filename: str
+    path: Path
+    content_type: str
+
+
+@dataclass
+class ParsedEmail:
+    subject: str
+    sender: str
+    recipient: str
+    date: str
+    body_text: str
+    attachments: list[ParsedAttachment] = field(default_factory=list)
+    # tempdir must be kept alive by the caller
+    _tempdir: tempfile.TemporaryDirectory | None = field(default=None, repr=False)
+
+    def cleanup(self) -> None:
+        if self._tempdir is not None:
+            self._tempdir.cleanup()
+            self._tempdir = None
+
+
+def _decode_header_value(value: str | None) -> str:
+    if value is None:
+        return ""
+    # Decode RFC2047 encoded words (e.g. =?Windows-1252?Q?...?=)
+    decoded_parts = email.header.decode_header(str(value))
+    result = ""
+    for chunk, charset in decoded_parts:
+        if isinstance(chunk, bytes):
+            result += chunk.decode(charset or "utf-8", errors="replace")
+        else:
+            result += chunk
+    return result.strip()
+
+
+def _extract_body(msg: email.message.Message) -> str:
+    """Walk MIME parts and extract the best plain-text body."""
+    plain_parts: list[str] = []
+    html_parts: list[str] = []
+
+    if msg.is_multipart():
+        for part in msg.walk():
+            ct = part.get_content_type()
+            disposition = str(part.get("Content-Disposition", ""))
+            # Skip attachments
+            if "attachment" in disposition:
+                continue
+            if ct == "text/plain":
+                payload = part.get_payload(decode=True)
+                if payload:
+                    charset = part.get_content_charset() or "utf-8"
+                    plain_parts.append(payload.decode(charset, errors="replace"))
+            elif ct == "text/html":
+                payload = part.get_payload(decode=True)
+                if payload:
+                    charset = part.get_content_charset() or "utf-8"
+                    html_parts.append(payload.decode(charset, errors="replace"))
+    else:
+        ct = msg.get_content_type()
+        payload = msg.get_payload(decode=True)
+        if payload:
+            charset = msg.get_content_charset() or "utf-8"
+            text = payload.decode(charset, errors="replace")
+            if ct == "text/plain":
+                plain_parts.append(text)
+            elif ct == "text/html":
+                html_parts.append(text)
+
+    if plain_parts:
+        return "\n\n".join(plain_parts).strip()
+
+    # Fall back to HTML → plain text via BeautifulSoup
+    if html_parts:
+        combined_html = "\n".join(html_parts)
+        soup = BeautifulSoup(combined_html, "html.parser")
+        return soup.get_text(separator="\n").strip()
+
+    return ""
+
+
+_IMAGE_CONTENT_TYPES = {
+    "image/jpeg", "image/png", "image/gif",
+    "image/bmp", "image/tiff", "image/webp",
+}
+_IMAGE_EXTS = {
+    "image/jpeg": ".jpg", "image/png": ".png", "image/gif": ".gif",
+    "image/bmp": ".bmp", "image/tiff": ".tiff", "image/webp": ".webp",
+}
+
+
+def _collect_attachments(
+    msg: email.message.Message, tmpdir: Path
+) -> list[ParsedAttachment]:
+    """Extract all attachment parts and write them to tmpdir.
+
+    Also captures inline images (CID-embedded) that have no filename.
+    """
+    attachments: list[ParsedAttachment] = []
+    seen_names: set[str] = set()
+    inline_image_counter = 0
+
+    for part in msg.walk():
+        disposition = str(part.get("Content-Disposition", ""))
+        content_type = part.get_content_type()
+        filename = part.get_filename()
+
+        # Inline image without a filename — generate one from Content-ID or counter
+        if filename is None and content_type in _IMAGE_CONTENT_TYPES:
+            cid = str(part.get("Content-ID", "")).strip("<>").split("@")[0]
+            ext = _IMAGE_EXTS.get(content_type, ".img")
+            filename = f"inline_{cid or inline_image_counter}{ext}"
+            inline_image_counter += 1
+        elif filename is None and "attachment" not in disposition:
+            continue
+        elif filename is None:
+            # Unnamed non-image attachment — skip
+            continue
+
+        # Decode RFC2047 filename if needed
+        decoded_parts = email.header.decode_header(filename)
+        filename_clean = ""
+        for chunk, charset in decoded_parts:
+            if isinstance(chunk, bytes):
+                filename_clean += chunk.decode(charset or "utf-8", errors="replace")
+            else:
+                filename_clean += chunk
+
+        # Avoid duplicates
+        base_name = filename_clean
+        counter = 1
+        while filename_clean in seen_names:
+            stem = Path(base_name).stem
+            suffix = Path(base_name).suffix
+            filename_clean = f"{stem}_{counter}{suffix}"
+            counter += 1
+        seen_names.add(filename_clean)
+
+        payload = part.get_payload(decode=True)
+        if payload is None:
+            continue
+
+        dest = tmpdir / filename_clean
+        dest.write_bytes(payload)
+
+        attachments.append(
+            ParsedAttachment(
+                filename=filename_clean,
+                path=dest,
+                content_type=part.get_content_type(),
+            )
+        )
+
+    return attachments
+
+
+def parse_eml(eml_path: Path) -> ParsedEmail:
+    """Parse an .eml file and return a ParsedEmail object.
+
+    The caller is responsible for calling parsed_email.cleanup() when done,
+    or using ParsedEmail as a context manager is not implemented — keep
+    the return value alive until you no longer need the attachment paths.
+    """
+    with open(eml_path, "rb") as f:
+        msg = email.message_from_binary_file(f, policy=email.policy.compat32)
+
+    subject = _decode_header_value(msg.get("Subject"))
+    sender = _decode_header_value(msg.get("From"))
+    recipient = _decode_header_value(msg.get("To"))
+    date = _decode_header_value(msg.get("Date"))
+
+    body_text = _extract_body(msg)
+
+    tmpdir_obj = tempfile.TemporaryDirectory(prefix="email_dlp_")
+    tmpdir = Path(tmpdir_obj.name)
+    attachments = _collect_attachments(msg, tmpdir)
+
+    return ParsedEmail(
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        date=date,
+        body_text=body_text,
+        attachments=attachments,
+        _tempdir=tmpdir_obj,
+    )
--- a/email_dlp/policy.py
+++ b/email_dlp/policy.py
@ -0,0 +1,132 @@
+"""DLP policy definitions: violation categories, thresholds, and prompt formatting."""
+
+import json
+
+from .models import ActionClass, RiskLevel
+
+# Risk score thresholds
+RISK_THRESHOLDS = {
+    RiskLevel.CRITICAL: 80,
+    RiskLevel.HIGH: 60,
+    RiskLevel.MEDIUM: 40,
+    RiskLevel.LOW: 0,
+}
+
+# Action mapping based on risk level
+RISK_TO_ACTION = {
+    RiskLevel.CRITICAL: ActionClass.BLOCK,
+    RiskLevel.HIGH: ActionClass.ALERT,
+    RiskLevel.MEDIUM: ActionClass.ALERT,
+    RiskLevel.LOW: ActionClass.PASS_,
+}
+
+DLP_CATEGORIES = {
+    "PII": {
+        "description": "Personally Identifiable Information",
+        "signals": [
+            "Full name combined with email address",
+            "Social Security Number (SSN) or employee ID",
+            "Phone numbers combined with personal details",
+            "Home address combined with personal identifiers",
+        ],
+        "risk_weight": "HIGH to CRITICAL depending on volume",
+    },
+    "FINANCIAL_DATA": {
+        "description": "Non-public financial information",
+        "signals": [
+            "Revenue targets, EBITDA projections, internal forecasts",
+            "Salary figures, compensation plans",
+            "Invoice amounts and vendor payment terms",
+            "Internal budget allocations",
+        ],
+        "risk_weight": "MEDIUM to CRITICAL depending on sensitivity",
+    },
+    "SOURCE_CODE": {
+        "description": "Proprietary source code or model weights",
+        "signals": [
+            "Python, Java, or other source files with copyright notices",
+            "Internal class names and proprietary algorithms",
+            "Model architecture files or weight files",
+            "Internal API keys or credentials embedded in code",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "REGULATORY_DOCUMENT": {
+        "description": "Internal regulatory and compliance drafts",
+        "signals": [
+            "CFPB, GDPR, or SOX compliance drafts marked internal",
+            "Audit findings or remediation plans",
+            "Internal compliance assessments not yet published",
+            "Regulatory submission drafts",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "LEGAL_CONTRACT": {
+        "description": "Executed or draft legal agreements",
+        "signals": [
+            "Non-Disclosure Agreements (NDAs) with named parties",
+            "Signed contracts with dates and signatures",
+            "Settlement agreements or legal memoranda",
+            "Vendor contracts with financial terms",
+        ],
+        "risk_weight": "HIGH to CRITICAL",
+    },
+    "PAYROLL_RECORD": {
+        "description": "Employee payroll and compensation records",
+        "signals": [
+            "Employee ID combined with salary and payroll period",
+            "Direct deposit details or bank account information",
+            "Year-to-date earnings and deductions",
+            "HR compensation reports",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "CUSTOMER_LIST": {
+        "description": "Customer or prospect data in bulk",
+        "signals": [
+            "CSV or table with customer names, emails, and revenue figures",
+            "CRM exports with contact details",
+            "Prospect lists for sales campaigns",
+            "Customer PII in aggregate",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "INTERNAL_MEMO": {
+        "description": "Confidential internal communications",
+        "signals": [
+            'Documents marked "INTERNAL ONLY" or "DO NOT DISTRIBUTE"',
+            "CEO or executive strategy memos",
+            "Organizational restructuring plans",
+            "Internal performance reviews or headcount discussions",
+        ],
+        "risk_weight": "HIGH",
+    },
+}
+
+ACTION_THRESHOLDS = {
+    "BLOCK": "risk_score >= 80 (CRITICAL risk)",
+    "ALERT": "risk_score >= 40 (MEDIUM or HIGH risk)",
+    "PASS": "risk_score < 40 (LOW risk)",
+}
+
+
+def format_policy_for_prompt() -> str:
+    """Format the DLP policy as a JSON string for injection into the LLM system prompt."""
+    policy = {
+        "categories": DLP_CATEGORIES,
+        "risk_score_thresholds": {
+            "CRITICAL": "score >= 80",
+            "HIGH": "score >= 60",
+            "MEDIUM": "score >= 40",
+            "LOW": "score < 40",
+        },
+        "action_mapping": ACTION_THRESHOLDS,
+        "instructions": (
+            "Evaluate the email against ALL categories above. "
+            "Assign a risk_score from 0 to 100 based on the most severe violation found. "
+            "Multiple violations increase the score. "
+            "action must match the threshold: BLOCK if score>=80, ALERT if score>=40, PASS otherwise. "
+            "evidence must be direct quotes from the actual email or attachment content."
+        ),
+    }
+    return json.dumps(policy, indent=2)
--- a/email_dlp/policy_reviewer.py
+++ b/email_dlp/policy_reviewer.py
@ -0,0 +1,285 @@
+"""Policy-based DLP review derived from DLP_CATEGORIES in policy.py."""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+
+from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
+from .policy import DLP_CATEGORIES
+
+# Keywords derived from DLP_CATEGORIES signal descriptions in policy.py
+_POLICY_KEYWORDS: dict[ViolationType, dict] = {
+    ViolationType.PII: {
+        "keywords": [
+            "full name",
+            "email address",
+            "social security",
+            "ssn",
+            "employee id",
+            "phone number",
+            "home address",
+            "personal identifier",
+            "date of birth",
+        ],
+        "min_matches": 2,
+        "base_score": 55,
+    },
+    ViolationType.FINANCIAL_DATA: {
+        "keywords": [
+            "revenue",
+            "ebitda",
+            "projection",
+            "forecast",
+            "salary",
+            "compensation plan",
+            "invoice",
+            "amount due",
+            "payment terms",
+            "budget",
+            "gross margin",
+            "sales data",
+        ],
+        "min_matches": 1,
+        "base_score": 50,
+    },
+    ViolationType.SOURCE_CODE: {
+        "keywords": [
+            "copyright",
+            "def ",
+            "class ",
+            "from __future__",
+            "import ",
+            "model weights",
+            "api key",
+            "api_key",
+            "proprietary",
+            "source code",
+            "internal source",
+        ],
+        "min_matches": 2,
+        "base_score": 85,
+    },
+    ViolationType.REGULATORY_DOCUMENT: {
+        "keywords": [
+            "cfpb",
+            "gdpr",
+            "sox",
+            "compliance draft",
+            "not for public release",
+            "not for public distribution",
+            "regulatory submission",
+            "audit findings",
+            "remediation plan",
+            "internal compliance",
+        ],
+        "min_matches": 1,
+        "base_score": 82,
+    },
+    ViolationType.LEGAL_CONTRACT: {
+        "keywords": [
+            "non-disclosure",
+            "nondisclosure",
+            "nda",
+            "disclosing party",
+            "receiving party",
+            "confidentiality agreement",
+            "settlement agreement",
+            "executed contract",
+            "signed contract",
+        ],
+        "min_matches": 1,
+        "base_score": 65,
+    },
+    ViolationType.PAYROLL_RECORD: {
+        "keywords": [
+            "payroll",
+            "pay period",
+            "pay stub",
+            "direct deposit",
+            "routing number",
+            "bank account",
+            "net pay",
+            "gross pay",
+            "tax deductions",
+            "year-to-date",
+            "ytd",
+            "compensation record",
+        ],
+        "min_matches": 1,
+        "base_score": 88,
+    },
+    ViolationType.CUSTOMER_LIST: {
+        "keywords": [
+            "customer list",
+            "customer_id",
+            "customer id",
+            "crm export",
+            "prospect list",
+            "top-tier prospect",
+            "annual_sales",
+            "company_name",
+            "bulk export",
+            "sales campaign",
+        ],
+        "min_matches": 2,
+        "base_score": 85,
+    },
+    ViolationType.INTERNAL_MEMO: {
+        "keywords": [
+            "internal only",
+            "internal use only",
+            "do not distribute",
+            "not for external",
+            "office of the ceo",
+            "organizational priorities",
+            "growth roadmap",
+            "strictly confidential",
+            "internal policy document",
+            "headcount",
+        ],
+        "min_matches": 1,
+        "base_score": 55,
+    },
+}
+
+
+def _normalize(text: str) -> str:
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def _find_evidence(text: str, keyword: str) -> str | None:
+    match = re.search(re.escape(keyword.strip()), text, flags=re.IGNORECASE)
+    if not match:
+        return None
+    start = max(0, match.start() - 60)
+    end = min(len(text), match.end() + 100)
+    return _normalize(text[start:end])
+
+
+def _risk_level_from_score(score: int) -> RiskLevel:
+    if score >= 80:
+        return RiskLevel.CRITICAL
+    if score >= 60:
+        return RiskLevel.HIGH
+    if score >= 40:
+        return RiskLevel.MEDIUM
+    return RiskLevel.LOW
+
+
+def _action_from_score(score: int) -> ActionClass:
+    if score >= 80:
+        return ActionClass.BLOCK
+    if score >= 40:
+        return ActionClass.ALERT
+    return ActionClass.PASS_
+
+
+def review_corpus(
+    email_file: str,
+    subject: str,
+    sender: str,
+    recipient: str,
+    date: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+    attachment_results: list[AttachmentResult],
+    processing_errors: list[str],
+) -> DLPResult:
+    """Judge an email using DLP_CATEGORIES signals from policy.py."""
+    # Build full text corpus
+    parts = [
+        f"Subject: {subject}",
+        f"From: {sender}",
+        f"To: {recipient}",
+        body_text,
+    ]
+    for filename, text in attachment_texts:
+        parts.append(f"Attachment: {filename}")
+        parts.append(text)
+
+    raw = "\n".join(p for p in parts if p)
+    lower = raw.lower()
+
+    evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
+    score_map: dict[ViolationType, int] = {}
+
+    for vtype, rule in _POLICY_KEYWORDS.items():
+        keywords: list[str] = rule["keywords"]
+        min_matches: int = rule["min_matches"]
+        base_score: int = rule["base_score"]
+        match_count = 0
+
+        for kw in keywords:
+            if kw.lower() in lower:
+                match_count += 1
+                ev = _find_evidence(raw, kw)
+                if ev and ev not in evidence_map[vtype]:
+                    evidence_map[vtype].append(ev)
+
+        if match_count < min_matches:
+            continue
+
+        score = base_score + min(12, (match_count - 1) * 3)
+
+        # Context boost: external recipient domain
+        recipient_lower = recipient.lower()
+        if any(d in recipient_lower for d in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
+            score += 6
+
+        score_map[vtype] = min(99, score)
+
+    if not score_map:
+        category_desc = DLP_CATEGORIES  # keep reference to show it's used
+        _ = category_desc  # suppress unused warning
+        return DLPResult(
+            email_file=email_file,
+            subject=subject,
+            sender=sender,
+            recipient=recipient,
+            date=date,
+            risk_level=RiskLevel.LOW,
+            risk_score=12,
+            violation_types=[ViolationType.NONE],
+            action=ActionClass.PASS_,
+            summary="Policy review found no DLP category signals in this email.",
+            evidence=[],
+            attachments=attachment_results,
+            processing_errors=processing_errors,
+        )
+
+    ranked = sorted(score_map.items(), key=lambda x: x[1], reverse=True)
+    violation_types = [vt for vt, _ in ranked[:3]]
+    risk_score = ranked[0][1]
+    if len(ranked) > 1:
+        risk_score = min(99, risk_score + min(10, 3 * (len(ranked) - 1)))
+
+    evidence: list[str] = []
+    for vt in violation_types:
+        evidence.extend(evidence_map[vt][:2])
+    evidence = evidence[:5]
+
+    risk_level = _risk_level_from_score(risk_score)
+    action = _action_from_score(risk_score)
+
+    violation_labels = ", ".join(v.value for v in violation_types)
+    summary = (
+        f"Policy review flagged {violation_labels} with {risk_level.value} risk "
+        f"(score {risk_score}) using DLP_CATEGORIES signals from policy.py."
+    )
+
+    return DLPResult(
+        email_file=email_file,
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        date=date,
+        risk_level=risk_level,
+        risk_score=risk_score,
+        violation_types=violation_types,
+        action=action,
+        summary=summary,
+        evidence=evidence,
+        attachments=attachment_results,
+        processing_errors=processing_errors,
+    )
--- a/email_dlp/simulator.py
+++ b/email_dlp/simulator.py
@ -0,0 +1,310 @@
+"""Deterministic local simulator for DLP analysis."""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+
+from .converter import IMAGE_SENTINEL
+from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
+
+_CATEGORY_RULES: dict[ViolationType, dict[str, object]] = {
+    ViolationType.PII: {
+        "keywords": [
+            "personally identifiable information",
+            " pii",
+            "employee id",
+            "account ending",
+            "direct deposit",
+            "customer_id",
+            "first_name",
+            "last_name",
+        ],
+        "base_score": 30,
+        "min_matches": 2,
+    },
+    ViolationType.FINANCIAL_DATA: {
+        "keywords": [
+            "financial forecast",
+            "revenue",
+            "ebitda",
+            "gross margin",
+            "margin efficiency",
+            "sales data",
+            "annual_sales_usd",
+            "invoice",
+            "amount due",
+            "payment instructions",
+            "ach",
+            "budget",
+        ],
+        "base_score": 42,
+    },
+    ViolationType.SOURCE_CODE: {
+        "keywords": [
+            "source code",
+            "api key",
+            "model weights",
+            "from __future__ import annotations",
+            "def ",
+            "class ",
+            "@dataclass",
+        ],
+        "base_score": 88,
+        "min_matches": 2,
+    },
+    ViolationType.REGULATORY_DOCUMENT: {
+        "keywords": [
+            "regulatory document",
+            "regulatory submission",
+            "cfpb",
+            "compliance report",
+            "not for public release",
+            "draft regulatory",
+            "prepared by: legal & compliance team",
+        ],
+        "base_score": 84,
+    },
+    ViolationType.LEGAL_CONTRACT: {
+        "keywords": [
+            "nondisclosure agreement",
+            "non-disclosure agreement",
+            "executed nda",
+            "disclosing party",
+            "receiving party",
+        ],
+        "base_score": 62,
+        "min_matches": 1,
+    },
+    ViolationType.PAYROLL_RECORD: {
+        "keywords": [
+            "payroll",
+            "pay stub",
+            "compensation record",
+            "gross:",
+            "net pay",
+            "tax deductions",
+            "pay period",
+            "direct deposit",
+            "employee id",
+        ],
+        "base_score": 90,
+    },
+    ViolationType.CUSTOMER_LIST: {
+        "keywords": [
+            "customer list",
+            "prospects",
+            "crm export",
+            "raw export",
+            "customer_id",
+            "company_name",
+            "annual_sales_usd",
+            "top-tier prospects",
+        ],
+        "base_score": 86,
+        "min_matches": 2,
+    },
+    ViolationType.INTERNAL_MEMO: {
+        "keywords": [
+            "internal use only",
+            "internal memo",
+            "do not distribute externally",
+            "office of the ceo",
+            "organizational priorities",
+            "growth roadmap",
+            "internal policy document",
+            "not for public distribution",
+            "strictly confidential",
+        ],
+        "base_score": 52,
+        "min_matches": 1,
+    },
+}
+
+_RISK_LEVELS = [
+    (80, RiskLevel.CRITICAL),
+    (60, RiskLevel.HIGH),
+    (40, RiskLevel.MEDIUM),
+    (0, RiskLevel.LOW),
+]
+
+
+def _normalize_text(text: str) -> str:
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def _build_corpus(
+    subject: str,
+    sender: str,
+    recipient: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+) -> tuple[str, str]:
+    text_chunks = [
+        f"Subject: {subject}",
+        f"From: {sender}",
+        f"To: {recipient}",
+        body_text,
+    ]
+    for filename, text in attachment_texts:
+        text_chunks.append(f"Attachment: {filename}")
+        # Skip binary image data — base64 payloads produce false keyword matches
+        if not text.startswith(IMAGE_SENTINEL):
+            text_chunks.append(text)
+    raw = "\n".join(chunk for chunk in text_chunks if chunk)
+    return raw, raw.lower()
+
+
+def _find_evidence(text: str, keyword: str) -> str | None:
+    pattern = re.escape(keyword.strip())
+    match = re.search(pattern, text, flags=re.IGNORECASE)
+    if not match:
+        return None
+    start = max(0, match.start() - 60)
+    end = min(len(text), match.end() + 100)
+    return _normalize_text(text[start:end])
+
+
+def _collect_matches(
+    raw_text: str,
+    lower_text: str,
+) -> tuple[dict[ViolationType, list[str]], dict[ViolationType, int]]:
+    evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
+    score_map: dict[ViolationType, int] = {}
+
+    for violation_type, rule in _CATEGORY_RULES.items():
+        keywords = rule["keywords"]
+        base_score = int(rule["base_score"])
+        min_matches = int(rule.get("min_matches", 1))
+        match_count = 0
+
+        for keyword in keywords:
+            # Use word boundaries to avoid substring false positives (e.g. "ach" in "attached")
+            pattern = r"\b" + re.escape(keyword) + r"\b"
+            if re.search(pattern, lower_text):
+                match_count += 1
+                evidence = _find_evidence(raw_text, keyword)
+                if evidence and evidence not in evidence_map[violation_type]:
+                    evidence_map[violation_type].append(evidence)
+
+        if match_count < min_matches:
+            continue
+
+        score = base_score + min(12, (match_count - 1) * 4)
+        score_map[violation_type] = min(score, 99)
+
+    return evidence_map, score_map
+
+
+def _apply_context_boosts(
+    subject: str,
+    recipient: str,
+    attachment_texts: list[tuple[str, str]],
+    score_map: dict[ViolationType, int],
+) -> None:
+    subject_lower = subject.lower()
+    recipient_lower = recipient.lower()
+
+    if any(domain in recipient_lower for domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
+        for violation_type in list(score_map):
+            score_map[violation_type] = min(99, score_map[violation_type] + 6)
+
+    if "urgent" in subject_lower or "confidential" in subject_lower:
+        for violation_type in list(score_map):
+            score_map[violation_type] = min(99, score_map[violation_type] + 2)
+
+    attachment_names = " ".join(filename.lower() for filename, _ in attachment_texts)
+    if ".csv" in attachment_names and ViolationType.CUSTOMER_LIST in score_map:
+        score_map[ViolationType.CUSTOMER_LIST] = min(
+            99, score_map[ViolationType.CUSTOMER_LIST] + 6
+        )
+    if ".py" in attachment_names and ViolationType.SOURCE_CODE in score_map:
+        score_map[ViolationType.SOURCE_CODE] = min(
+            99, score_map[ViolationType.SOURCE_CODE] + 4
+        )
+
+
+def _risk_level_from_score(risk_score: int) -> RiskLevel:
+    for threshold, risk_level in _RISK_LEVELS:
+        if risk_score >= threshold:
+            return risk_level
+    return RiskLevel.LOW
+
+
+def _action_from_score(risk_score: int) -> ActionClass:
+    if risk_score >= 80:
+        return ActionClass.BLOCK
+    if risk_score >= 40:
+        return ActionClass.ALERT
+    return ActionClass.PASS_
+
+
+def _build_summary(
+    violation_types: list[ViolationType],
+    risk_level: RiskLevel,
+    risk_score: int,
+) -> str:
+    if violation_types == [ViolationType.NONE]:
+        return "No strong DLP indicators were found in the email body or converted attachments."
+    labels = ", ".join(v.value for v in violation_types)
+    return (
+        f"Simulated DLP review flagged {labels} with {risk_level.value} risk "
+        f"(score {risk_score}) based on the email body and extracted attachment content."
+    )
+
+
+def simulate_analysis(
+    email_file: str,
+    subject: str,
+    sender: str,
+    recipient: str,
+    date: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+    attachment_results: list[AttachmentResult],
+    processing_errors: list[str],
+) -> DLPResult:
+    """Predict a DLP result locally without calling an LLM."""
+    raw_text, lower_text = _build_corpus(
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        body_text=body_text,
+        attachment_texts=attachment_texts,
+    )
+    evidence_map, score_map = _collect_matches(raw_text, lower_text)
+    _apply_context_boosts(subject, recipient, attachment_texts, score_map)
+
+    if not score_map:
+        violation_types = [ViolationType.NONE]
+        risk_score = 18
+        evidence: list[str] = []
+    else:
+        ranked = sorted(score_map.items(), key=lambda item: item[1], reverse=True)
+        violation_types = [violation for violation, _ in ranked[:3]]
+        risk_score = ranked[0][1]
+        if len(ranked) > 1:
+            risk_score = min(99, risk_score + min(10, 3 * (len(ranked) - 1)))
+        evidence = []
+        for violation_type in violation_types:
+            evidence.extend(evidence_map.get(violation_type, [])[:2])
+        evidence = evidence[:5]
+
+    risk_level = _risk_level_from_score(risk_score)
+    action = _action_from_score(risk_score)
+
+    return DLPResult(
+        email_file=email_file,
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        date=date,
+        risk_level=risk_level,
+        risk_score=risk_score,
+        violation_types=violation_types,
+        action=action,
+        summary=_build_summary(violation_types, risk_level, risk_score),
+        evidence=evidence,
+        attachments=attachment_results,
+        processing_errors=processing_errors,
+    )
				`@ -0,0 +1 @@`
				`# Email DLP - Data Loss Prevention for email content`