551 lines
18 KiB
Python
551 lines
18 KiB
Python
"""CLI entry point: preview and batch process .eml files through the DLP pipeline."""
|
|
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import typer
|
|
from rich.console import Console
|
|
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
from rich.table import Table
|
|
|
|
from .analyzer import _build_user_content, analyze_email, build_system_prompt
|
|
from .converter import IMAGE_SENTINEL, convert_attachment
|
|
from .models import ActionClass, AttachmentResult, DLPResult
|
|
from .parser import parse_eml
|
|
from .policy_reviewer import review_corpus
|
|
from .simulator import simulate_analysis
|
|
|
|
# Typer application object; commands are registered below via @app.command().
app = typer.Typer(help="Email DLP — scan .eml files for data loss prevention policy violations.")

# Shared rich console used by every command for output and progress bars.
console = Console()

# Rich markup styles keyed by action classification; used when rendering
# result tables (see _print_summary_table).
ACTION_COLORS = {
    ActionClass.BLOCK: "bold red",
    ActionClass.ALERT: "bold yellow",
    ActionClass.PASS_: "bold green",
}

# Rich markup styles keyed by risk-level string (DLPResult.risk_level.value).
RISK_COLORS = {
    "CRITICAL": "bold red",
    "HIGH": "red",
    "MEDIUM": "yellow",
    "LOW": "green",
}
|
|
|
|
|
|
def _process_single_email(
    eml_path: Path,
    endpoint: str,
    model: str,
    backend: str = "llm",
) -> DLPResult:
    """Run one .eml file through the full pipeline: parse, convert, analyze.

    Args:
        eml_path: Path to the .eml file to scan.
        endpoint: OpenAI-compatible endpoint passed through to analyze_email.
        model: Model name passed through to analyze_email.
        backend: Analysis backend selector forwarded to analyze_email
            (defaults to "llm").

    Returns:
        The DLPResult produced by analyze_email.

    Temporary files created while parsing are always cleaned up, even when
    conversion or analysis raises.
    """
    errors: list[str] = []

    # Step 1: MIME parsing (may extract attachments to temp files).
    parsed = parse_eml(eml_path)

    try:
        # Step 2: convert every attachment to text (or an image data URL).
        texts: list[tuple[str, str]] = []
        summaries: list[AttachmentResult] = []

        for att in parsed.attachments:
            # A single attachment can expand into several entries
            # (convert_attachment returns a list of (name, text, status)).
            for name, text, status in convert_attachment(att.path, att.filename):
                if "truncated" in status:
                    errors.append(f"'{name}' truncated to 20000 chars")
                is_image = text.startswith(IMAGE_SENTINEL)
                texts.append((name, text))
                summaries.append(
                    AttachmentResult(
                        filename=name,
                        content_type=att.content_type,
                        # Image payloads are data URLs, not extracted text.
                        extracted_text_chars=0 if is_image else len(text),
                        # Keep only the status code before any "|" detail.
                        conversion_status=status.split("|", 1)[0],
                    )
                )

        # Step 3: hand everything to the analyzer.
        verdict = analyze_email(
            email_file=eml_path.name,
            subject=parsed.subject,
            sender=parsed.sender,
            recipient=parsed.recipient,
            date=parsed.date,
            body_text=parsed.body_text,
            attachment_texts=texts,
            attachment_results=summaries,
            processing_errors=errors,
            endpoint=endpoint,
            model=model,
            backend=backend,
        )
    finally:
        parsed.cleanup()

    return verdict
|
|
|
|
|
|
def _simulate_single_email(eml_path: Path) -> DLPResult:
    """Parse and convert one .eml file, then score it without an LLM.

    Same pipeline shape as _process_single_email, but the analysis step is
    the deterministic local simulate_analysis backend. Temp files from
    parsing are always cleaned up.
    """
    parsed = parse_eml(eml_path)
    errors: list[str] = []

    try:
        texts: list[tuple[str, str]] = []
        summaries: list[AttachmentResult] = []

        for att in parsed.attachments:
            # convert_attachment may yield several entries per attachment.
            for name, text, status in convert_attachment(att.path, att.filename):
                if "truncated" in status:
                    errors.append(f"'{name}' truncated to 20000 chars")
                is_image = text.startswith(IMAGE_SENTINEL)
                texts.append((name, text))
                summaries.append(
                    AttachmentResult(
                        filename=name,
                        content_type=att.content_type,
                        # Image payloads are data URLs, not extracted text.
                        extracted_text_chars=0 if is_image else len(text),
                        # Status code only; drop any "|"-separated detail.
                        conversion_status=status.split("|", 1)[0],
                    )
                )

        verdict = simulate_analysis(
            email_file=eml_path.name,
            subject=parsed.subject,
            sender=parsed.sender,
            recipient=parsed.recipient,
            date=parsed.date,
            body_text=parsed.body_text,
            attachment_texts=texts,
            attachment_results=summaries,
            processing_errors=errors,
        )
    finally:
        parsed.cleanup()

    return verdict
|
|
|
|
|
|
def _preview_single_email(
    eml_path: Path,
    include_system_prompt: bool = True,
    include_full_prompt: bool = False,
) -> dict:
    """Parse and convert one .eml file without calling the LLM.

    Builds a JSON-serializable dict describing the parsed headers, body,
    converted attachments, and the exact LLM prompt content that would be
    sent by the ``analyze`` command.

    Args:
        eml_path: Path to the .eml file.
        include_system_prompt: Add a truncated preview of the analyzer
            system prompt (built from policy.py).
        include_full_prompt: Additionally embed the complete, untruncated
            system prompt and user content blocks.

    Returns:
        The preview dict. Temp files from parsing are always cleaned up.
    """
    parsed = parse_eml(eml_path)

    try:
        attachments_preview: list[dict[str, object]] = []
        attachment_texts: list[tuple[str, str]] = []
        processing_errors: list[str] = []

        for att in parsed.attachments:
            # One attachment may expand into several converted entries.
            for display_name, text, status in convert_attachment(att.path, att.filename):
                is_image = text.startswith(IMAGE_SENTINEL)
                if "truncated" in status:
                    processing_errors.append(
                        f"'{display_name}' truncated to 20000 chars"
                    )

                attachment_texts.append((display_name, text))
                attachments_preview.append(
                    {
                        "filename": display_name,
                        "content_type": att.content_type,
                        "conversion_status": status,
                        "is_image": is_image,
                        "extracted_text_chars": 0 if is_image else len(text),
                        "text_preview": None if is_image else text[:500],
                        # Images are carried as data URLs; show a stub only.
                        "image_data_url_preview": (
                            (text[:120] + "..." if len(text) > 120 else text)
                            if is_image
                            else None
                        ),
                    }
                )

        # Build the exact multimodal user-content blocks the analyzer sends.
        llm_user_content = _build_user_content(
            subject=parsed.subject,
            sender=parsed.sender,
            recipient=parsed.recipient,
            date=parsed.date,
            body_text=parsed.body_text,
            attachment_texts=attachment_texts,
        )

        # Summarize each content block (truncated text / stubbed image URL).
        llm_user_content_preview: list[dict[str, object]] = []
        for block in llm_user_content:
            if block["type"] == "text":
                llm_user_content_preview.append(
                    {
                        "type": "text",
                        "text_preview": str(block["text"])[:1000],
                        "text_chars": len(str(block["text"])),
                    }
                )
            else:
                url = str(block["image_url"]["url"])
                llm_user_content_preview.append(
                    {
                        "type": "image_url",
                        "url_preview": url[:120] + ("..." if len(url) > 120 else ""),
                        "url_chars": len(url),
                    }
                )

        preview_result = {
            "email_file": eml_path.name,
            "subject": parsed.subject,
            "sender": parsed.sender,
            "recipient": parsed.recipient,
            "date": parsed.date,
            "body_text_chars": len(parsed.body_text),
            "body_text_preview": parsed.body_text[:1000],
            "attachment_count": len(parsed.attachments),
            "attachments": attachments_preview,
            "processing_errors": processing_errors,
            "llm_user_content_preview": llm_user_content_preview,
        }

        # Build the system prompt once if either flag needs it.
        if include_system_prompt or include_full_prompt:
            system_prompt = build_system_prompt()
            if include_system_prompt:
                preview_result["llm_system_prompt_preview"] = {
                    "text_preview": system_prompt[:2000],
                    "text_chars": len(system_prompt),
                }
            if include_full_prompt:
                # Fix: the full system prompt was previously emitted only when
                # include_system_prompt was also True, contradicting the
                # --include-full-prompt help text ("Include the full system
                # prompt and full user content").
                preview_result["llm_system_prompt"] = system_prompt

        if include_full_prompt:
            preview_result["llm_user_content"] = llm_user_content

        return preview_result
    finally:
        parsed.cleanup()
|
|
|
|
|
|
@app.command()
def analyze(
    input_dir: Path = typer.Option(
        Path("data"),
        "--input",
        "-i",
        help="Directory containing .eml files",
    ),
    output_dir: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Directory to write JSON results (defaults to output/analyze-TIMESTAMP)",
    ),
    endpoint: str = typer.Option(
        "http://localhost:8000/v1",
        "--endpoint",
        help="vLLM OpenAI-compatible endpoint",
    ),
    model: str = typer.Option(
        "Qwen/Qwen3.5-35B-A3B",
        "--model",
        help="Model name to use for analysis",
    ),
    backend: str = typer.Option(
        "llm",
        "--backend",
        help="Analysis backend: 'llm' for API calls, 'simulated' for local deterministic analysis",
    ),
    summary: bool = typer.Option(
        False,
        "--summary",
        "-s",
        help="Print a summary table after processing",
    ),
) -> None:
    """Batch analyze all .eml files in INPUT_DIR and write JSON results to OUTPUT_DIR."""
    # Default the output directory to a timestamped folder under output/.
    if output_dir is None:
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        output_dir = Path("output") / f"analyze-{stamp}"

    eml_files = sorted(input_dir.glob("*.eml"))
    if not eml_files:
        console.print(f"[red]No .eml files found in {input_dir}[/red]")
        raise typer.Exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)
    results: list[DLPResult] = []

    progress_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total}"),
        TimeElapsedColumn(),
    )
    with Progress(*progress_columns, console=console) as progress:
        task = progress.add_task("Analyzing emails...", total=len(eml_files))

        for eml_path in eml_files:
            progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
            try:
                verdict = _process_single_email(eml_path, endpoint, model, backend=backend)
            except Exception as e:
                # A single bad email must not abort the whole batch.
                console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
            else:
                # One JSON result file per email, named after the .eml stem.
                (output_dir / f"{eml_path.stem}.json").write_text(
                    verdict.model_dump_json(indent=2)
                )
                results.append(verdict)
            finally:
                progress.advance(task)

    # Aggregate a batch-level summary next to the per-email results.
    def _count(predicate) -> int:
        """Number of results matching *predicate*."""
        return sum(1 for r in results if predicate(r))

    def _row(r: DLPResult) -> dict:
        """One summary entry for a single analyzed email."""
        return {
            "file": r.email_file,
            "subject": r.subject,
            "risk_level": r.risk_level.value,
            "risk_score": r.risk_score,
            "action": r.action.value,
            "violation_types": [v.value for v in r.violation_types],
        }

    batch_summary = {
        "total": len(results),
        "by_action": {
            "BLOCK": _count(lambda r: r.action == ActionClass.BLOCK),
            "ALERT": _count(lambda r: r.action == ActionClass.ALERT),
            "PASS": _count(lambda r: r.action == ActionClass.PASS_),
        },
        "by_risk": {
            level: _count(lambda r, lvl=level: r.risk_level.value == lvl)
            for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
        },
        "emails": [_row(r) for r in results],
    }
    (output_dir / "batch_summary.json").write_text(json.dumps(batch_summary, indent=2))

    console.print(f"\n[bold green]Done![/bold green] Processed {len(results)}/{len(eml_files)} emails.")
    console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")

    if summary and results:
        _print_summary_table(results)
|
|
|
|
|
|
@app.command()
def preview(
    input_dir: Path = typer.Option(
        Path("data"),
        "--input",
        "-i",
        help="Directory containing .eml files",
    ),
    output_dir: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Optional directory to write preview JSON files",
    ),
    print_json: bool = typer.Option(
        False,
        "--print-json",
        help="Print preview JSON to stdout",
    ),
    include_system_prompt: bool = typer.Option(
        True,
        "--include-system-prompt/--no-system-prompt",
        help="Include the analyzer system prompt built from policy.py",
    ),
    include_full_prompt: bool = typer.Option(
        False,
        "--include-full-prompt",
        help="Include the full system prompt and full user content in JSON output",
    ),
) -> None:
    """Preview parsed email and converted attachment content before LLM analysis."""
    eml_files = sorted(input_dir.glob("*.eml"))
    if not eml_files:
        console.print(f"[red]No .eml files found in {input_dir}[/red]")
        raise typer.Exit(1)

    # Writing files is optional; only create the directory when requested.
    write_files = output_dir is not None
    if write_files:
        output_dir.mkdir(parents=True, exist_ok=True)

    previews: list[dict] = []
    for eml_path in eml_files:
        try:
            item = _preview_single_email(
                eml_path,
                include_system_prompt=include_system_prompt,
                include_full_prompt=include_full_prompt,
            )
        except Exception as e:
            # Keep going: one unparseable email must not abort the batch.
            console.print(f"[red]Error previewing {eml_path.name}: {e}[/red]")
            continue

        previews.append(item)
        if write_files:
            (output_dir / f"{eml_path.stem}.preview.json").write_text(
                json.dumps(item, indent=2)
            )

    if write_files:
        (output_dir / "batch_preview.json").write_text(json.dumps(previews, indent=2))
        console.print(f"[green]Preview JSON written to[/green] [cyan]{output_dir}[/cyan]")

    if print_json:
        console.print_json(json.dumps(previews, indent=2))
        return

    # Default output: a compact per-file overview table.
    table = Table(title="Email Preview Results", show_lines=True)
    table.add_column("File", style="dim", max_width=45)
    table.add_column("Body Chars", justify="right")
    table.add_column("Attachments", justify="right")
    table.add_column("Errors", justify="right")

    for item in previews:
        table.add_row(
            str(item["email_file"]),
            str(item["body_text_chars"]),
            str(item["attachment_count"]),
            str(len(item["processing_errors"])),
        )

    console.print(table)
|
|
|
|
|
|
@app.command()
def simulate(
    input_dir: Path = typer.Option(
        Path("data"),
        "--input",
        "-i",
        help="Directory containing .eml files",
    ),
    output_dir: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Directory to write simulated JSON results (defaults to output/simulated-TIMESTAMP)",
    ),
    summary: bool = typer.Option(
        True,
        "--summary/--no-summary",
        help="Print a summary table after processing",
    ),
) -> None:
    """Batch simulate DLP analysis locally without calling an LLM."""
    # Default the output directory to a timestamped folder under output/.
    if output_dir is None:
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        output_dir = Path("output") / f"simulated-{stamp}"

    eml_files = sorted(input_dir.glob("*.eml"))
    if not eml_files:
        console.print(f"[red]No .eml files found in {input_dir}[/red]")
        raise typer.Exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)
    results: list[DLPResult] = []

    progress_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total}"),
        TimeElapsedColumn(),
    )
    with Progress(*progress_columns, console=console) as progress:
        task = progress.add_task("Simulating email analysis...", total=len(eml_files))

        for eml_path in eml_files:
            progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
            try:
                verdict = _simulate_single_email(eml_path)
            except Exception as e:
                # A single bad email must not abort the whole batch.
                console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
            else:
                (output_dir / f"{eml_path.stem}.json").write_text(
                    verdict.model_dump_json(indent=2)
                )
                results.append(verdict)
            finally:
                progress.advance(task)

    # Aggregate a batch-level summary next to the per-email results.
    def _count(predicate) -> int:
        """Number of results matching *predicate*."""
        return sum(1 for r in results if predicate(r))

    def _row(r: DLPResult) -> dict:
        """One summary entry for a single simulated email."""
        return {
            "file": r.email_file,
            "subject": r.subject,
            "risk_level": r.risk_level.value,
            "risk_score": r.risk_score,
            "action": r.action.value,
            "violation_types": [v.value for v in r.violation_types],
        }

    batch_summary = {
        "total": len(results),
        "generator": "local-simulator",
        "model_label": "gpt-5.4-simulated",
        "by_action": {
            "BLOCK": _count(lambda r: r.action == ActionClass.BLOCK),
            "ALERT": _count(lambda r: r.action == ActionClass.ALERT),
            "PASS": _count(lambda r: r.action == ActionClass.PASS_),
        },
        "by_risk": {
            level: _count(lambda r, lvl=level: r.risk_level.value == lvl)
            for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
        },
        "emails": [_row(r) for r in results],
    }
    (output_dir / "batch_summary.json").write_text(json.dumps(batch_summary, indent=2))

    console.print(
        f"\n[bold green]Done![/bold green] Simulated {len(results)}/{len(eml_files)} emails."
    )
    console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")

    if summary and results:
        _print_summary_table(results)
|
|
|
|
|
|
def _print_summary_table(results: list[DLPResult]) -> None:
    """Render an overview table of DLP results to the shared console."""
    table = Table(title="Email DLP Analysis Results", show_lines=True)

    # Column spec: (heading, add_column keyword arguments).
    for heading, opts in (
        ("File", {"style": "dim", "max_width": 45}),
        ("Risk Level", {"justify": "center"}),
        ("Score", {"justify": "center"}),
        ("Action", {"justify": "center"}),
        ("Violations", {"max_width": 40}),
    ):
        table.add_column(heading, **opts)

    for item in results:
        risk = item.risk_level.value
        # Fall back to plain white for any unmapped risk level / action.
        risk_style = RISK_COLORS.get(risk, "white")
        action_style = ACTION_COLORS.get(item.action, "white")

        table.add_row(
            item.email_file,
            f"[{risk_style}]{risk}[/{risk_style}]",
            str(item.risk_score),
            f"[{action_style}]{item.action.value}[/{action_style}]",
            ", ".join(v.value for v in item.violation_types),
        )

    console.print(table)
|