Initial commit

This commit is contained in:
2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions

550
email_dlp/cli.py Normal file
View File

@ -0,0 +1,550 @@
"""CLI entry point: preview and batch process .eml files through the DLP pipeline."""
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from rich.table import Table
from .analyzer import _build_user_content, analyze_email, build_system_prompt
from .converter import IMAGE_SENTINEL, convert_attachment
from .models import ActionClass, AttachmentResult, DLPResult
from .parser import parse_eml
from .policy_reviewer import review_corpus
from .simulator import simulate_analysis
# Typer application object: all CLI subcommands below register against it.
app = typer.Typer(help="Email DLP — scan .eml files for data loss prevention policy violations.")
# Shared rich console used for all terminal output (tables, progress, errors).
console = Console()
# Rich markup styles applied to each DLP action class in summary tables.
ACTION_COLORS = {
    ActionClass.BLOCK: "bold red",
    ActionClass.ALERT: "bold yellow",
    ActionClass.PASS_: "bold green",
}
# Rich markup styles keyed by risk-level name (string values of the risk enum).
RISK_COLORS = {
    "CRITICAL": "bold red",
    "HIGH": "red",
    "MEDIUM": "yellow",
    "LOW": "green",
}
def _process_single_email(
    eml_path: Path,
    endpoint: str,
    model: str,
    backend: str = "llm",
) -> DLPResult:
    """Run one .eml file through the full pipeline: parse, convert, analyze.

    Parses the MIME message, converts every attachment to text (recording a
    processing error for each truncated one), then sends everything to the
    analyzer backend. Temp files from parsing are cleaned up even on failure.
    """
    errors: list[str] = []
    parsed = parse_eml(eml_path)
    try:
        # Convert each attachment; one attachment may yield several entries
        # (e.g. an archive expanding to multiple files).
        texts: list[tuple[str, str]] = []
        att_results: list[AttachmentResult] = []
        for attachment in parsed.attachments:
            for name, content, status in convert_attachment(attachment.path, attachment.filename):
                if "truncated" in status:
                    errors.append(f"'{name}' truncated to 20000 chars")
                texts.append((name, content))
                is_image = content.startswith(IMAGE_SENTINEL)
                att_results.append(
                    AttachmentResult(
                        filename=name,
                        content_type=attachment.content_type,
                        # Image sentinels carry a data URL, not extracted text.
                        extracted_text_chars=0 if is_image else len(content),
                        # Keep only the primary status token, dropping detail after "|".
                        conversion_status=status.split("|")[0],
                    )
                )
        # Hand everything to the LLM (or simulated) analyzer.
        return analyze_email(
            email_file=eml_path.name,
            subject=parsed.subject,
            sender=parsed.sender,
            recipient=parsed.recipient,
            date=parsed.date,
            body_text=parsed.body_text,
            attachment_texts=texts,
            attachment_results=att_results,
            processing_errors=errors,
            endpoint=endpoint,
            model=model,
            backend=backend,
        )
    finally:
        # Always remove extracted attachment temp files.
        parsed.cleanup()
def _simulate_single_email(eml_path: Path) -> DLPResult:
    """Parse, convert, and simulate one .eml file without an LLM.

    Same pipeline as :func:`_process_single_email`, but the analysis step is
    the local deterministic simulator instead of an API call.
    """
    errors: list[str] = []
    parsed = parse_eml(eml_path)
    try:
        texts: list[tuple[str, str]] = []
        att_results: list[AttachmentResult] = []
        for attachment in parsed.attachments:
            for name, content, status in convert_attachment(attachment.path, attachment.filename):
                if "truncated" in status:
                    errors.append(f"'{name}' truncated to 20000 chars")
                texts.append((name, content))
                is_image = content.startswith(IMAGE_SENTINEL)
                att_results.append(
                    AttachmentResult(
                        filename=name,
                        content_type=attachment.content_type,
                        # Image sentinels carry a data URL, not extracted text.
                        extracted_text_chars=0 if is_image else len(content),
                        # Keep only the primary status token, dropping detail after "|".
                        conversion_status=status.split("|")[0],
                    )
                )
        return simulate_analysis(
            email_file=eml_path.name,
            subject=parsed.subject,
            sender=parsed.sender,
            recipient=parsed.recipient,
            date=parsed.date,
            body_text=parsed.body_text,
            attachment_texts=texts,
            attachment_results=att_results,
            processing_errors=errors,
        )
    finally:
        # Always remove extracted attachment temp files.
        parsed.cleanup()
def _preview_single_email(
    eml_path: Path,
    include_system_prompt: bool = True,
    include_full_prompt: bool = False,
) -> dict:
    """Parse and convert one .eml file without calling the LLM.

    Args:
        eml_path: Path to the .eml file to preview.
        include_system_prompt: Include a truncated preview of the analyzer
            system prompt built from policy.py.
        include_full_prompt: Include the complete system prompt and the full
            user content blocks in the returned dict.

    Returns:
        A JSON-serializable dict describing the parsed email, the converted
        attachments, and previews of the exact content that would be sent to
        the LLM.
    """
    parsed = parse_eml(eml_path)
    try:
        attachments_preview: list[dict[str, object]] = []
        attachment_texts: list[tuple[str, str]] = []
        processing_errors: list[str] = []
        for att in parsed.attachments:
            entries = convert_attachment(att.path, att.filename)
            for display_name, text, status in entries:
                # Image attachments are represented as a sentinel-prefixed
                # data URL rather than extracted text.
                is_image = text.startswith(IMAGE_SENTINEL)
                if "truncated" in status:
                    processing_errors.append(
                        f"'{display_name}' truncated to 20000 chars"
                    )
                attachment_texts.append((display_name, text))
                attachments_preview.append(
                    {
                        "filename": display_name,
                        "content_type": att.content_type,
                        "conversion_status": status,
                        "is_image": is_image,
                        "extracted_text_chars": 0 if is_image else len(text),
                        # Text gets a 500-char preview; images get a trimmed data URL.
                        "text_preview": None if is_image else text[:500],
                        "image_data_url_preview": (
                            None
                            if not is_image
                            else text[:120] + "..."
                            if len(text) > 120
                            else text
                        ),
                    }
                )
        # Build the same multimodal user content the analyzer would send.
        llm_user_content = _build_user_content(
            subject=parsed.subject,
            sender=parsed.sender,
            recipient=parsed.recipient,
            date=parsed.date,
            body_text=parsed.body_text,
            attachment_texts=attachment_texts,
        )
        llm_user_content_preview: list[dict[str, object]] = []
        for block in llm_user_content:
            if block["type"] == "text":
                llm_user_content_preview.append(
                    {
                        "type": "text",
                        "text_preview": str(block["text"])[:1000],
                        "text_chars": len(str(block["text"])),
                    }
                )
            else:
                url = str(block["image_url"]["url"])
                llm_user_content_preview.append(
                    {
                        "type": "image_url",
                        "url_preview": url[:120] + ("..." if len(url) > 120 else ""),
                        "url_chars": len(url),
                    }
                )
        preview_result = {
            "email_file": eml_path.name,
            "subject": parsed.subject,
            "sender": parsed.sender,
            "recipient": parsed.recipient,
            "date": parsed.date,
            "body_text_chars": len(parsed.body_text),
            "body_text_preview": parsed.body_text[:1000],
            "attachment_count": len(parsed.attachments),
            "attachments": attachments_preview,
            "processing_errors": processing_errors,
            "llm_user_content_preview": llm_user_content_preview,
        }
        # BUGFIX: the full system prompt must be emitted whenever
        # include_full_prompt is set, independently of include_system_prompt.
        # Previously it was nested under include_system_prompt, so
        # --include-full-prompt --no-system-prompt omitted it.
        if include_system_prompt or include_full_prompt:
            system_prompt = build_system_prompt()
            if include_system_prompt:
                preview_result["llm_system_prompt_preview"] = {
                    "text_preview": system_prompt[:2000],
                    "text_chars": len(system_prompt),
                }
            if include_full_prompt:
                preview_result["llm_system_prompt"] = system_prompt
        if include_full_prompt:
            preview_result["llm_user_content"] = llm_user_content
        return preview_result
    finally:
        parsed.cleanup()
@app.command()
def analyze(
    input_dir: Path = typer.Option(
        Path("data"),
        "--input",
        "-i",
        help="Directory containing .eml files",
    ),
    output_dir: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Directory to write JSON results (defaults to output/analyze-TIMESTAMP)",
    ),
    endpoint: str = typer.Option(
        "http://localhost:8000/v1",
        "--endpoint",
        help="vLLM OpenAI-compatible endpoint",
    ),
    model: str = typer.Option(
        "Qwen/Qwen3.5-35B-A3B",
        "--model",
        help="Model name to use for analysis",
    ),
    backend: str = typer.Option(
        "llm",
        "--backend",
        help="Analysis backend: 'llm' for API calls, 'simulated' for local deterministic analysis",
    ),
    summary: bool = typer.Option(
        False,
        "--summary",
        "-s",
        help="Print a summary table after processing",
    ),
) -> None:
    """Batch analyze all .eml files in INPUT_DIR and write JSON results to OUTPUT_DIR."""
    # Default output directory is timestamped so repeated runs never collide.
    if output_dir is None:
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        output_dir = Path("output") / f"analyze-{stamp}"
    eml_files = sorted(input_dir.glob("*.eml"))
    if not eml_files:
        console.print(f"[red]No .eml files found in {input_dir}[/red]")
        raise typer.Exit(1)
    output_dir.mkdir(parents=True, exist_ok=True)

    results: list[DLPResult] = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total}"),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Analyzing emails...", total=len(eml_files))
        for eml_path in eml_files:
            progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
            try:
                result = _process_single_email(eml_path, endpoint, model, backend=backend)
            except Exception as e:
                # A single bad email must not abort the batch.
                console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
            else:
                # One JSON file per successfully analyzed email.
                (output_dir / f"{eml_path.stem}.json").write_text(
                    result.model_dump_json(indent=2)
                )
                results.append(result)
            progress.advance(task)

    # Aggregate counts by action and by risk level for the batch report.
    action_labels = (
        ("BLOCK", ActionClass.BLOCK),
        ("ALERT", ActionClass.ALERT),
        ("PASS", ActionClass.PASS_),
    )
    batch_summary = {
        "total": len(results),
        "by_action": {
            label: sum(1 for r in results if r.action == action)
            for label, action in action_labels
        },
        "by_risk": {
            level: sum(1 for r in results if r.risk_level.value == level)
            for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
        },
        "emails": [
            {
                "file": r.email_file,
                "subject": r.subject,
                "risk_level": r.risk_level.value,
                "risk_score": r.risk_score,
                "action": r.action.value,
                "violation_types": [v.value for v in r.violation_types],
            }
            for r in results
        ],
    }
    (output_dir / "batch_summary.json").write_text(json.dumps(batch_summary, indent=2))
    console.print(f"\n[bold green]Done![/bold green] Processed {len(results)}/{len(eml_files)} emails.")
    console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
    if summary and results:
        _print_summary_table(results)
@app.command()
def preview(
    input_dir: Path = typer.Option(
        Path("data"),
        "--input",
        "-i",
        help="Directory containing .eml files",
    ),
    output_dir: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Optional directory to write preview JSON files",
    ),
    print_json: bool = typer.Option(
        False,
        "--print-json",
        help="Print preview JSON to stdout",
    ),
    include_system_prompt: bool = typer.Option(
        True,
        "--include-system-prompt/--no-system-prompt",
        help="Include the analyzer system prompt built from policy.py",
    ),
    include_full_prompt: bool = typer.Option(
        False,
        "--include-full-prompt",
        help="Include the full system prompt and full user content in JSON output",
    ),
) -> None:
    """Preview parsed email and converted attachment content before LLM analysis."""
    eml_files = sorted(input_dir.glob("*.eml"))
    if not eml_files:
        console.print(f"[red]No .eml files found in {input_dir}[/red]")
        raise typer.Exit(1)
    # Writing to disk is optional; the table/JSON output always happens.
    write_files = output_dir is not None
    if write_files:
        output_dir.mkdir(parents=True, exist_ok=True)

    previews: list[dict] = []
    for eml_path in eml_files:
        try:
            entry = _preview_single_email(
                eml_path,
                include_system_prompt=include_system_prompt,
                include_full_prompt=include_full_prompt,
            )
        except Exception as e:
            # Keep going: one malformed email must not abort the preview run.
            console.print(f"[red]Error previewing {eml_path.name}: {e}[/red]")
            continue
        previews.append(entry)
        if write_files:
            (output_dir / f"{eml_path.stem}.preview.json").write_text(
                json.dumps(entry, indent=2)
            )

    if write_files:
        (output_dir / "batch_preview.json").write_text(json.dumps(previews, indent=2))
        console.print(f"[green]Preview JSON written to[/green] [cyan]{output_dir}[/cyan]")

    if print_json:
        console.print_json(json.dumps(previews, indent=2))
    else:
        # Compact per-email overview instead of dumping full JSON.
        table = Table(title="Email Preview Results", show_lines=True)
        table.add_column("File", style="dim", max_width=45)
        table.add_column("Body Chars", justify="right")
        table.add_column("Attachments", justify="right")
        table.add_column("Errors", justify="right")
        for entry in previews:
            table.add_row(
                str(entry["email_file"]),
                str(entry["body_text_chars"]),
                str(entry["attachment_count"]),
                str(len(entry["processing_errors"])),
            )
        console.print(table)
@app.command()
def simulate(
    input_dir: Path = typer.Option(
        Path("data"),
        "--input",
        "-i",
        help="Directory containing .eml files",
    ),
    output_dir: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Directory to write simulated JSON results (defaults to output/simulated-TIMESTAMP)",
    ),
    summary: bool = typer.Option(
        True,
        "--summary/--no-summary",
        help="Print a summary table after processing",
    ),
) -> None:
    """Batch simulate DLP analysis locally without calling an LLM."""
    # Default output directory is timestamped so repeated runs never collide.
    if output_dir is None:
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        output_dir = Path("output") / f"simulated-{stamp}"
    eml_files = sorted(input_dir.glob("*.eml"))
    if not eml_files:
        console.print(f"[red]No .eml files found in {input_dir}[/red]")
        raise typer.Exit(1)
    output_dir.mkdir(parents=True, exist_ok=True)

    results: list[DLPResult] = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total}"),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Simulating email analysis...", total=len(eml_files))
        for eml_path in eml_files:
            progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
            try:
                result = _simulate_single_email(eml_path)
            except Exception as e:
                # A single bad email must not abort the batch.
                console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
            else:
                (output_dir / f"{eml_path.stem}.json").write_text(
                    result.model_dump_json(indent=2)
                )
                results.append(result)
            progress.advance(task)

    # Aggregate counts by action and by risk level for the batch report.
    action_labels = (
        ("BLOCK", ActionClass.BLOCK),
        ("ALERT", ActionClass.ALERT),
        ("PASS", ActionClass.PASS_),
    )
    batch_summary = {
        "total": len(results),
        "generator": "local-simulator",
        "model_label": "gpt-5.4-simulated",
        "by_action": {
            label: sum(1 for r in results if r.action == action)
            for label, action in action_labels
        },
        "by_risk": {
            level: sum(1 for r in results if r.risk_level.value == level)
            for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
        },
        "emails": [
            {
                "file": r.email_file,
                "subject": r.subject,
                "risk_level": r.risk_level.value,
                "risk_score": r.risk_score,
                "action": r.action.value,
                "violation_types": [v.value for v in r.violation_types],
            }
            for r in results
        ],
    }
    (output_dir / "batch_summary.json").write_text(json.dumps(batch_summary, indent=2))
    console.print(
        f"\n[bold green]Done![/bold green] Simulated {len(results)}/{len(eml_files)} emails."
    )
    console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
    if summary and results:
        _print_summary_table(results)
def _print_summary_table(results: list[DLPResult]) -> None:
    """Render per-email DLP results as a color-coded rich table on the console."""
    table = Table(title="Email DLP Analysis Results", show_lines=True)
    for header, opts in (
        ("File", {"style": "dim", "max_width": 45}),
        ("Risk Level", {"justify": "center"}),
        ("Score", {"justify": "center"}),
        ("Action", {"justify": "center"}),
        ("Violations", {"max_width": 40}),
    ):
        table.add_column(header, **opts)
    for item in results:
        risk = item.risk_level.value
        # Unknown levels/actions fall back to plain white.
        risk_style = RISK_COLORS.get(risk, "white")
        action_style = ACTION_COLORS.get(item.action, "white")
        table.add_row(
            item.email_file,
            f"[{risk_style}]{risk}[/{risk_style}]",
            str(item.risk_score),
            f"[{action_style}]{item.action.value}[/{action_style}]",
            ", ".join(v.value for v in item.violation_types),
        )
    console.print(table)