Initial commit

This commit is contained in:
2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions

1
email_dlp/__init__.py Normal file
View File

@ -0,0 +1 @@
# Email DLP - Data Loss Prevention for email content

241
email_dlp/analyzer.py Normal file
View File

@ -0,0 +1,241 @@
"""DLP analysis backends: remote LLM or deterministic local simulation."""
import json
import os
import re
from typing import Any
import openai
from dotenv import load_dotenv
load_dotenv()
from .converter import IMAGE_SENTINEL
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
from .policy import format_policy_for_prompt
from .simulator import simulate_analysis
OUTPUT_SCHEMA = {
"type": "object",
"properties": {
"risk_level": {"type": "string", "enum": ["CRITICAL", "HIGH", "MEDIUM", "LOW"]},
"risk_score": {"type": "integer", "minimum": 0, "maximum": 100},
"violation_types": {
"type": "array",
"items": {
"type": "string",
"enum": [
"PII",
"FINANCIAL_DATA",
"SOURCE_CODE",
"REGULATORY_DOCUMENT",
"LEGAL_CONTRACT",
"PAYROLL_RECORD",
"CUSTOMER_LIST",
"INTERNAL_MEMO",
"NONE",
],
},
},
"action": {"type": "string", "enum": ["PASS", "ALERT", "BLOCK"]},
"summary": {"type": "string"},
"evidence": {"type": "array", "items": {"type": "string"}},
},
"required": ["risk_level", "risk_score", "violation_types", "action", "summary", "evidence"],
}
SYSTEM_PROMPT_TEMPLATE = """\
You are a Data Loss Prevention (DLP) analyst. Your task is to evaluate email content and attachments against the DLP policy below, then return a structured JSON decision.
## DLP Policy
{policy_json}
## Output Schema
Respond with valid JSON only (no markdown fences, no extra text) matching this schema:
{schema_json}
## Critical Rules
1. temperature=0: be deterministic and consistent.
2. evidence: include direct quotes (verbatim excerpts) from the actual email or attachment content that justify your decision. Do not paraphrase.
3. risk_score: assign 0-100. Use the full range — a customer CSV with 500 rows should score 95+, a casual internal memo scores 60-70.
4. action: MUST match the threshold — BLOCK if risk_score>=80, ALERT if risk_score>=40, PASS otherwise.
5. If no policy violations are found, set violation_types=["NONE"], risk_level="LOW", risk_score<40, action="PASS".
"""
def build_system_prompt() -> str:
"""Build the system prompt used for DLP analysis."""
return SYSTEM_PROMPT_TEMPLATE.format(
policy_json=format_policy_for_prompt(),
schema_json=json.dumps(OUTPUT_SCHEMA, indent=2),
)
def _build_user_content(
subject: str,
sender: str,
recipient: str,
date: str,
body_text: str,
attachment_texts: list[tuple[str, str]], # [(filename, text_or_sentinel)]
) -> list[dict[str, Any]]:
"""Build the user message content for the VLM.
Returns a multimodal content list (OpenAI vision format).
Text attachments are embedded as text blocks; image attachments are
inserted as image_url blocks so the VLM can see them directly.
"""
header_block = "\n".join([
"## Email Headers",
f"Subject: {subject}",
f"From: {sender}",
f"To: {recipient}",
f"Date: {date}",
"",
"## Email Body",
body_text or "(empty body)",
])
content: list[dict[str, Any]] = [{"type": "text", "text": header_block}]
for filename, text in attachment_texts:
if text.startswith(IMAGE_SENTINEL):
# IMAGE_SENTINEL format: "__IMAGE__:<mime>:<base64>"
payload = text[len(IMAGE_SENTINEL):] # "<mime>:<base64>"
mime, b64 = payload.split(":", 1)
content.append({"type": "text", "text": f"\n## Attachment: {filename} (image)"})
content.append({
"type": "image_url",
"image_url": {"url": f"data:{mime};base64,{b64}"},
})
else:
content.append({
"type": "text",
"text": f"\n## Attachment: {filename}\n{text or '(no extractable text)'}",
})
return content
def _parse_llm_response(content: str) -> dict[str, Any]:
"""Parse JSON from LLM response, handling markdown fences."""
content = content.strip()
# Strip triple-backtick fences
fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
if fence_match:
content = fence_match.group(1)
return json.loads(content)
def _map_action(action_str: str) -> ActionClass:
mapping = {"PASS": ActionClass.PASS_, "ALERT": ActionClass.ALERT, "BLOCK": ActionClass.BLOCK}
return mapping.get(action_str.upper(), ActionClass.ALERT)
def analyze_email(
email_file: str,
subject: str,
sender: str,
recipient: str,
date: str,
body_text: str,
attachment_texts: list[tuple[str, str]],
attachment_results: list[AttachmentResult],
processing_errors: list[str],
endpoint: str = "http://localhost:8000/v1",
model: str = "Qwen/Qwen3.5-35B-A3B",
backend: str = "llm",
) -> DLPResult:
"""Analyze email content using an LLM or the deterministic simulator."""
if backend == "simulated":
return simulate_analysis(
email_file=email_file,
subject=subject,
sender=sender,
recipient=recipient,
date=date,
body_text=body_text,
attachment_texts=attachment_texts,
attachment_results=attachment_results,
processing_errors=processing_errors,
)
# Use environment variables as fallback if they exist
final_endpoint = os.getenv("OPENAI_BASE_URL", endpoint)
final_api_key = os.getenv("OPENAI_API_KEY", "not-needed")
final_model = os.getenv("MODEL_NAME", model)
client = openai.OpenAI(base_url=final_endpoint, api_key=final_api_key)
system_prompt = build_system_prompt()
user_content = _build_user_content(
subject=subject,
sender=sender,
recipient=recipient,
date=date,
body_text=body_text,
attachment_texts=attachment_texts,
)
response = client.chat.completions.create(
model=final_model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_content}, # list[dict] for multimodal
],
temperature=0.0,
max_tokens=1024,
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
raw_content = response.choices[0].message.content or ""
try:
parsed = _parse_llm_response(raw_content)
except json.JSONDecodeError as e:
processing_errors.append(f"JSON parse error: {e}; raw={raw_content[:200]}")
# Return a safe fallback
return DLPResult(
email_file=email_file,
subject=subject,
sender=sender,
recipient=recipient,
date=date,
risk_level=RiskLevel.HIGH,
risk_score=60,
violation_types=[ViolationType.NONE],
action=ActionClass.ALERT,
summary="Analysis failed due to JSON parse error.",
evidence=[],
attachments=attachment_results,
processing_errors=processing_errors,
)
# Map string values to enums
risk_level = RiskLevel(parsed.get("risk_level", "HIGH"))
violation_types = [
ViolationType(v) for v in parsed.get("violation_types", ["NONE"])
if v in ViolationType.__members__
]
if not violation_types:
violation_types = [ViolationType.NONE]
action = _map_action(parsed.get("action", "ALERT"))
return DLPResult(
email_file=email_file,
subject=subject,
sender=sender,
recipient=recipient,
date=date,
risk_level=risk_level,
risk_score=int(parsed.get("risk_score", 60)),
violation_types=violation_types,
action=action,
summary=parsed.get("summary", ""),
evidence=parsed.get("evidence", []),
attachments=attachment_results,
processing_errors=processing_errors,
)

550
email_dlp/cli.py Normal file
View File

@ -0,0 +1,550 @@
"""CLI entry point: preview and batch process .eml files through the DLP pipeline."""
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from rich.table import Table
from .analyzer import _build_user_content, analyze_email, build_system_prompt
from .converter import IMAGE_SENTINEL, convert_attachment
from .models import ActionClass, AttachmentResult, DLPResult
from .parser import parse_eml
from .policy_reviewer import review_corpus
from .simulator import simulate_analysis
app = typer.Typer(help="Email DLP — scan .eml files for data loss prevention policy violations.")
console = Console()
ACTION_COLORS = {
ActionClass.BLOCK: "bold red",
ActionClass.ALERT: "bold yellow",
ActionClass.PASS_: "bold green",
}
RISK_COLORS = {
"CRITICAL": "bold red",
"HIGH": "red",
"MEDIUM": "yellow",
"LOW": "green",
}
def _process_single_email(
eml_path: Path,
endpoint: str,
model: str,
backend: str = "llm",
) -> DLPResult:
"""Parse, convert, and analyze one .eml file."""
processing_errors: list[str] = []
# 1. Parse MIME
parsed = parse_eml(eml_path)
try:
# 2. Convert attachments
attachment_texts: list[tuple[str, str]] = []
attachment_results: list[AttachmentResult] = []
for att in parsed.attachments:
entries = convert_attachment(att.path, att.filename)
for display_name, text, status in entries:
if "truncated" in status:
processing_errors.append(
f"'{display_name}' truncated to 20000 chars"
)
attachment_texts.append((display_name, text))
attachment_results.append(
AttachmentResult(
filename=display_name,
content_type=att.content_type,
extracted_text_chars=0 if text.startswith(IMAGE_SENTINEL) else len(text),
conversion_status=status.split("|")[0],
)
)
# 3. Analyze with LLM
result = analyze_email(
email_file=eml_path.name,
subject=parsed.subject,
sender=parsed.sender,
recipient=parsed.recipient,
date=parsed.date,
body_text=parsed.body_text,
attachment_texts=attachment_texts,
attachment_results=attachment_results,
processing_errors=processing_errors,
endpoint=endpoint,
model=model,
backend=backend,
)
finally:
parsed.cleanup()
return result
def _simulate_single_email(eml_path: Path) -> DLPResult:
"""Parse, convert, and simulate one .eml file without an LLM."""
processing_errors: list[str] = []
parsed = parse_eml(eml_path)
try:
attachment_texts: list[tuple[str, str]] = []
attachment_results: list[AttachmentResult] = []
for att in parsed.attachments:
entries = convert_attachment(att.path, att.filename)
for display_name, text, status in entries:
if "truncated" in status:
processing_errors.append(
f"'{display_name}' truncated to 20000 chars"
)
attachment_texts.append((display_name, text))
attachment_results.append(
AttachmentResult(
filename=display_name,
content_type=att.content_type,
extracted_text_chars=0 if text.startswith(IMAGE_SENTINEL) else len(text),
conversion_status=status.split("|")[0],
)
)
result = simulate_analysis(
email_file=eml_path.name,
subject=parsed.subject,
sender=parsed.sender,
recipient=parsed.recipient,
date=parsed.date,
body_text=parsed.body_text,
attachment_texts=attachment_texts,
attachment_results=attachment_results,
processing_errors=processing_errors,
)
finally:
parsed.cleanup()
return result
def _preview_single_email(
eml_path: Path,
include_system_prompt: bool = True,
include_full_prompt: bool = False,
) -> dict:
"""Parse and convert one .eml file without calling the LLM."""
parsed = parse_eml(eml_path)
try:
attachments_preview: list[dict[str, object]] = []
attachment_texts: list[tuple[str, str]] = []
processing_errors: list[str] = []
for att in parsed.attachments:
entries = convert_attachment(att.path, att.filename)
for display_name, text, status in entries:
is_image = text.startswith(IMAGE_SENTINEL)
if "truncated" in status:
processing_errors.append(
f"'{display_name}' truncated to 20000 chars"
)
attachment_texts.append((display_name, text))
attachments_preview.append(
{
"filename": display_name,
"content_type": att.content_type,
"conversion_status": status,
"is_image": is_image,
"extracted_text_chars": 0 if is_image else len(text),
"text_preview": (
None
if is_image
else text[:500]
),
"image_data_url_preview": (
None
if not is_image
else text[:120] + "..."
if len(text) > 120
else text
),
}
)
llm_user_content = _build_user_content(
subject=parsed.subject,
sender=parsed.sender,
recipient=parsed.recipient,
date=parsed.date,
body_text=parsed.body_text,
attachment_texts=attachment_texts,
)
llm_user_content_preview: list[dict[str, object]] = []
for block in llm_user_content:
if block["type"] == "text":
llm_user_content_preview.append(
{
"type": "text",
"text_preview": str(block["text"])[:1000],
"text_chars": len(str(block["text"])),
}
)
else:
url = str(block["image_url"]["url"])
llm_user_content_preview.append(
{
"type": "image_url",
"url_preview": url[:120] + ("..." if len(url) > 120 else ""),
"url_chars": len(url),
}
)
preview_result = {
"email_file": eml_path.name,
"subject": parsed.subject,
"sender": parsed.sender,
"recipient": parsed.recipient,
"date": parsed.date,
"body_text_chars": len(parsed.body_text),
"body_text_preview": parsed.body_text[:1000],
"attachment_count": len(parsed.attachments),
"attachments": attachments_preview,
"processing_errors": processing_errors,
"llm_user_content_preview": llm_user_content_preview,
}
if include_system_prompt:
system_prompt = build_system_prompt()
preview_result["llm_system_prompt_preview"] = {
"text_preview": system_prompt[:2000],
"text_chars": len(system_prompt),
}
if include_full_prompt:
preview_result["llm_system_prompt"] = system_prompt
if include_full_prompt:
preview_result["llm_user_content"] = llm_user_content
return preview_result
finally:
parsed.cleanup()
@app.command()
def analyze(
input_dir: Path = typer.Option(
Path("data"),
"--input",
"-i",
help="Directory containing .eml files",
),
output_dir: Optional[Path] = typer.Option(
None,
"--output",
"-o",
help="Directory to write JSON results (defaults to output/analyze-TIMESTAMP)",
),
endpoint: str = typer.Option(
"http://localhost:8000/v1",
"--endpoint",
help="vLLM OpenAI-compatible endpoint",
),
model: str = typer.Option(
"Qwen/Qwen3.5-35B-A3B",
"--model",
help="Model name to use for analysis",
),
backend: str = typer.Option(
"llm",
"--backend",
help="Analysis backend: 'llm' for API calls, 'simulated' for local deterministic analysis",
),
summary: bool = typer.Option(
False,
"--summary",
"-s",
help="Print a summary table after processing",
),
) -> None:
"""Batch analyze all .eml files in INPUT_DIR and write JSON results to OUTPUT_DIR."""
if output_dir is None:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
output_dir = Path("output") / f"analyze-{timestamp}"
eml_files = sorted(input_dir.glob("*.eml"))
if not eml_files:
console.print(f"[red]No .eml files found in {input_dir}[/red]")
raise typer.Exit(1)
output_dir.mkdir(parents=True, exist_ok=True)
results: list[DLPResult] = []
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.completed}/{task.total}"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Analyzing emails...", total=len(eml_files))
for eml_path in eml_files:
progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
try:
result = _process_single_email(eml_path, endpoint, model, backend=backend)
except Exception as e:
console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
progress.advance(task)
continue
# Write individual JSON result
out_file = output_dir / (eml_path.stem + ".json")
out_file.write_text(result.model_dump_json(indent=2))
results.append(result)
progress.advance(task)
# Write batch summary
batch_summary = {
"total": len(results),
"by_action": {
"BLOCK": sum(1 for r in results if r.action == ActionClass.BLOCK),
"ALERT": sum(1 for r in results if r.action == ActionClass.ALERT),
"PASS": sum(1 for r in results if r.action == ActionClass.PASS_),
},
"by_risk": {
level: sum(1 for r in results if r.risk_level.value == level)
for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
},
"emails": [
{
"file": r.email_file,
"subject": r.subject,
"risk_level": r.risk_level.value,
"risk_score": r.risk_score,
"action": r.action.value,
"violation_types": [v.value for v in r.violation_types],
}
for r in results
],
}
(output_dir / "batch_summary.json").write_text(
json.dumps(batch_summary, indent=2)
)
console.print(f"\n[bold green]Done![/bold green] Processed {len(results)}/{len(eml_files)} emails.")
console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
if summary and results:
_print_summary_table(results)
@app.command()
def preview(
input_dir: Path = typer.Option(
Path("data"),
"--input",
"-i",
help="Directory containing .eml files",
),
output_dir: Optional[Path] = typer.Option(
None,
"--output",
"-o",
help="Optional directory to write preview JSON files",
),
print_json: bool = typer.Option(
False,
"--print-json",
help="Print preview JSON to stdout",
),
include_system_prompt: bool = typer.Option(
True,
"--include-system-prompt/--no-system-prompt",
help="Include the analyzer system prompt built from policy.py",
),
include_full_prompt: bool = typer.Option(
False,
"--include-full-prompt",
help="Include the full system prompt and full user content in JSON output",
),
) -> None:
"""Preview parsed email and converted attachment content before LLM analysis."""
eml_files = sorted(input_dir.glob("*.eml"))
if not eml_files:
console.print(f"[red]No .eml files found in {input_dir}[/red]")
raise typer.Exit(1)
if output_dir is not None:
output_dir.mkdir(parents=True, exist_ok=True)
previews: list[dict] = []
for eml_path in eml_files:
try:
preview_result = _preview_single_email(
eml_path,
include_system_prompt=include_system_prompt,
include_full_prompt=include_full_prompt,
)
except Exception as e:
console.print(f"[red]Error previewing {eml_path.name}: {e}[/red]")
continue
previews.append(preview_result)
if output_dir is not None:
out_file = output_dir / f"{eml_path.stem}.preview.json"
out_file.write_text(json.dumps(preview_result, indent=2))
if output_dir is not None:
batch_file = output_dir / "batch_preview.json"
batch_file.write_text(json.dumps(previews, indent=2))
console.print(f"[green]Preview JSON written to[/green] [cyan]{output_dir}[/cyan]")
if print_json:
console.print_json(json.dumps(previews, indent=2))
else:
table = Table(title="Email Preview Results", show_lines=True)
table.add_column("File", style="dim", max_width=45)
table.add_column("Body Chars", justify="right")
table.add_column("Attachments", justify="right")
table.add_column("Errors", justify="right")
for preview_result in previews:
table.add_row(
str(preview_result["email_file"]),
str(preview_result["body_text_chars"]),
str(preview_result["attachment_count"]),
str(len(preview_result["processing_errors"])),
)
console.print(table)
@app.command()
def simulate(
input_dir: Path = typer.Option(
Path("data"),
"--input",
"-i",
help="Directory containing .eml files",
),
output_dir: Optional[Path] = typer.Option(
None,
"--output",
"-o",
help="Directory to write simulated JSON results (defaults to output/simulated-TIMESTAMP)",
),
summary: bool = typer.Option(
True,
"--summary/--no-summary",
help="Print a summary table after processing",
),
) -> None:
"""Batch simulate DLP analysis locally without calling an LLM."""
if output_dir is None:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
output_dir = Path("output") / f"simulated-{timestamp}"
eml_files = sorted(input_dir.glob("*.eml"))
if not eml_files:
console.print(f"[red]No .eml files found in {input_dir}[/red]")
raise typer.Exit(1)
output_dir.mkdir(parents=True, exist_ok=True)
results: list[DLPResult] = []
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.completed}/{task.total}"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Simulating email analysis...", total=len(eml_files))
for eml_path in eml_files:
progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
try:
result = _simulate_single_email(eml_path)
except Exception as e:
console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
progress.advance(task)
continue
out_file = output_dir / (eml_path.stem + ".json")
out_file.write_text(result.model_dump_json(indent=2))
results.append(result)
progress.advance(task)
batch_summary = {
"total": len(results),
"generator": "local-simulator",
"model_label": "gpt-5.4-simulated",
"by_action": {
"BLOCK": sum(1 for r in results if r.action == ActionClass.BLOCK),
"ALERT": sum(1 for r in results if r.action == ActionClass.ALERT),
"PASS": sum(1 for r in results if r.action == ActionClass.PASS_),
},
"by_risk": {
level: sum(1 for r in results if r.risk_level.value == level)
for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
},
"emails": [
{
"file": r.email_file,
"subject": r.subject,
"risk_level": r.risk_level.value,
"risk_score": r.risk_score,
"action": r.action.value,
"violation_types": [v.value for v in r.violation_types],
}
for r in results
],
}
(output_dir / "batch_summary.json").write_text(
json.dumps(batch_summary, indent=2)
)
console.print(
f"\n[bold green]Done![/bold green] Simulated {len(results)}/{len(eml_files)} emails."
)
console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
if summary and results:
_print_summary_table(results)
def _print_summary_table(results: list[DLPResult]) -> None:
"""Print a rich summary table to the console."""
table = Table(title="Email DLP Analysis Results", show_lines=True)
table.add_column("File", style="dim", max_width=45)
table.add_column("Risk Level", justify="center")
table.add_column("Score", justify="center")
table.add_column("Action", justify="center")
table.add_column("Violations", max_width=40)
for r in results:
risk_color = RISK_COLORS.get(r.risk_level.value, "white")
action_color = ACTION_COLORS.get(r.action, "white")
violations = ", ".join(v.value for v in r.violation_types)
table.add_row(
r.email_file,
f"[{risk_color}]{r.risk_level.value}[/{risk_color}]",
str(r.risk_score),
f"[{action_color}]{r.action.value}[/{action_color}]",
violations,
)
console.print(table)

238
email_dlp/converter.py Normal file
View File

@ -0,0 +1,238 @@
"""Attachment → markdown text conversion routing."""
import base64
import tempfile
import zipfile
from pathlib import Path
from markitdown import MarkItDown
MAX_TEXT_CHARS = 20_000
# Sentinel prefix used to pass image data through the (text, status) interface.
# Format: IMAGE_SENTINEL + "<mime_type>:<base64_data>"
IMAGE_SENTINEL = "__IMAGE__:"
_IMAGE_MIME = {
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
".png": "image/png", ".gif": "image/gif",
".bmp": "image/bmp", ".tiff": "image/tiff", ".webp": "image/webp",
".img": "image/png", # fallback for generated inline names
}
def _convert_single_file(filepath: Path) -> tuple[str, str]:
"""Convert a single file to text. Returns (text, status).
For image files, text is IMAGE_SENTINEL + "<mime>:<base64>" and
status is "ok:image". Callers must check for the sentinel.
"""
suffix = filepath.suffix.lower()
# Image — return base64 sentinel for VLM consumption
if suffix in _IMAGE_MIME:
mime = _IMAGE_MIME[suffix]
b64 = base64.b64encode(filepath.read_bytes()).decode()
return IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"
known_binary_exts = {
".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
".go", ".rb", ".rs", ".sh", ".txt", ".md", ".sql", ".yaml", ".yml",
".json", ".xml", ".html", ".htm", ".css",
}
if suffix in known_binary_exts:
# Plain text fallback — read directly
try:
text = filepath.read_text(errors="replace")
return text, "ok"
except Exception as e:
return "", f"failed: {e}"
# Use markitdown for PDF, DOCX, XLSX, CSV, etc.
try:
md = MarkItDown()
result = md.convert(str(filepath))
return result.text_content or "", "ok"
except Exception as e:
# Fallback to plain-text read for unknown types
try:
text = filepath.read_text(errors="replace")
return text, f"fallback: {e}"
except Exception as e2:
return "", f"failed: {e2}"
_OFFICE_MEDIA_DIRS = {
".docx": "word/media/",
".pptx": "ppt/media/",
".xlsx": "xl/media/",
}
_IMAGE_EXTS = set(_IMAGE_MIME.keys())
def _extract_pdf_images(
filepath: Path, filename: str
) -> list[tuple[str, str, str]]:
"""Extract embedded images from a PDF using PyMuPDF.
Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image").
Returns empty list if fitz is not installed or no images found.
"""
try:
import fitz # PyMuPDF
except ImportError:
return []
results: list[tuple[str, str, str]] = []
try:
doc = fitz.open(str(filepath))
img_index = 0
for page in doc:
for img in page.get_images():
xref = img[0]
img_data = doc.extract_image(xref)
ext = img_data.get("ext", "png")
mime = _IMAGE_MIME.get(f".{ext}", f"image/{ext}")
b64 = base64.b64encode(img_data["image"]).decode()
display_name = f"{filename}/image_{img_index}.{ext}"
results.append((display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"))
img_index += 1
except Exception:
pass
return results
def _extract_office_images(
filepath: Path, filename: str
) -> list[tuple[str, str, str]]:
"""Extract embedded images from a DOCX/PPTX/XLSX using zipfile.
Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image").
Returns empty list if the file is not a valid ZIP or has no images.
"""
suffix = Path(filename).suffix.lower()
media_dir = _OFFICE_MEDIA_DIRS.get(suffix)
if not media_dir:
return []
results: list[tuple[str, str, str]] = []
try:
with zipfile.ZipFile(str(filepath), "r") as zf:
for name in sorted(zf.namelist()):
if not name.startswith(media_dir):
continue
member_suffix = Path(name).suffix.lower()
if member_suffix not in _IMAGE_EXTS:
continue
mime = _IMAGE_MIME[member_suffix]
b64 = base64.b64encode(zf.read(name)).decode()
display_name = f"{filename}/{Path(name).name}"
results.append((display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"))
except Exception:
pass
return results
def _convert_7z(
filepath: Path, archive_name: str
) -> list[tuple[str, str, str]]:
"""Extract a .7z archive and convert each member.
Returns list of (display_name, text_or_sentinel, status), one entry per member.
display_name uses "archive.7z/member.ext" format.
"""
try:
import py7zr
except ImportError:
return [(archive_name, "", "failed: py7zr not installed")]
results: list[tuple[str, str, str]] = []
with tempfile.TemporaryDirectory(prefix="email_dlp_7z_") as tmpdir:
tmp = Path(tmpdir)
try:
with py7zr.SevenZipFile(str(filepath), mode="r") as archive:
archive.extractall(path=str(tmp))
except Exception as e:
return [(archive_name, "", f"failed: 7z extraction error: {e}")]
for member_path in sorted(tmp.rglob("*")):
if not member_path.is_file():
continue
display_name = f"{archive_name}/{member_path.name}"
text, status = _convert_single_file(member_path)
if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
text = text[:MAX_TEXT_CHARS]
status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
results.append((display_name, text, status))
return results if results else [(archive_name, "", "skipped")]
def _convert_zip(
filepath: Path, archive_name: str
) -> list[tuple[str, str, str]]:
"""Extract a .zip archive and convert each member.
Returns list of (display_name, text_or_sentinel, status), one entry per member.
display_name uses "archive.zip/member.ext" format.
"""
import zipfile
results: list[tuple[str, str, str]] = []
with tempfile.TemporaryDirectory(prefix="email_dlp_zip_") as tmpdir:
tmp = Path(tmpdir)
try:
with zipfile.ZipFile(str(filepath), mode="r") as archive:
archive.extractall(path=str(tmp))
except Exception as e:
return [(archive_name, "", f"failed: zip extraction error: {e}")]
for member_path in sorted(tmp.rglob("*")):
if not member_path.is_file():
continue
display_name = f"{archive_name}/{member_path.name}"
text, status = _convert_single_file(member_path)
if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
text = text[:MAX_TEXT_CHARS]
status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
results.append((display_name, text, status))
return results if results else [(archive_name, "", "skipped")]
def convert_attachment(
filepath: Path, filename: str
) -> list[tuple[str, str, str]]:
"""Convert an attachment file for LLM analysis.
Returns list of (display_name, text_or_sentinel, status).
- Non-archive files: single-element list.
- .7z archives: one element per member file inside the archive.
text_or_sentinel is either plain text or IMAGE_SENTINEL + "<mime>:<base64>"
for image files. Text is truncated to MAX_TEXT_CHARS (images are not truncated).
"""
suffix = Path(filename).suffix.lower()
if suffix == ".7z":
return _convert_7z(filepath, filename)
elif suffix == ".zip":
return _convert_zip(filepath, filename)
text, status = _convert_single_file(filepath)
if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
text = text[:MAX_TEXT_CHARS]
status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
results = [(filename, text, status)]
# For PDF and Office files, also extract embedded images
if suffix == ".pdf":
results.extend(_extract_pdf_images(filepath, filename))
elif suffix in _OFFICE_MEDIA_DIRS:
results.extend(_extract_office_images(filepath, filename))
return results

52
email_dlp/models.py Normal file
View File

@ -0,0 +1,52 @@
"""Pydantic output schema for DLP analysis results."""
from enum import Enum
from pydantic import BaseModel, Field
class RiskLevel(str, Enum):
CRITICAL = "CRITICAL"
HIGH = "HIGH"
MEDIUM = "MEDIUM"
LOW = "LOW"
class ViolationType(str, Enum):
PII = "PII"
FINANCIAL_DATA = "FINANCIAL_DATA"
SOURCE_CODE = "SOURCE_CODE"
REGULATORY_DOCUMENT = "REGULATORY_DOCUMENT"
LEGAL_CONTRACT = "LEGAL_CONTRACT"
PAYROLL_RECORD = "PAYROLL_RECORD"
CUSTOMER_LIST = "CUSTOMER_LIST"
INTERNAL_MEMO = "INTERNAL_MEMO"
NONE = "NONE"
class ActionClass(str, Enum):
PASS_ = "PASS"
ALERT = "ALERT"
BLOCK = "BLOCK"
class AttachmentResult(BaseModel):
filename: str
content_type: str
extracted_text_chars: int = 0
conversion_status: str = "ok" # "ok" | "failed" | "skipped"
class DLPResult(BaseModel):
email_file: str
subject: str
sender: str
recipient: str
date: str
risk_level: RiskLevel
risk_score: int = Field(ge=0, le=100)
violation_types: list[ViolationType]
action: ActionClass
summary: str
evidence: list[str]
attachments: list[AttachmentResult]
processing_errors: list[str] = Field(default_factory=list)

199
email_dlp/parser.py Normal file
View File

@ -0,0 +1,199 @@
"""MIME email parsing: extract headers, body text, and attachments."""
import email
import email.policy
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from bs4 import BeautifulSoup
@dataclass
class ParsedAttachment:
filename: str
path: Path
content_type: str
@dataclass
class ParsedEmail:
subject: str
sender: str
recipient: str
date: str
body_text: str
attachments: list[ParsedAttachment] = field(default_factory=list)
# tempdir must be kept alive by the caller
_tempdir: tempfile.TemporaryDirectory | None = field(default=None, repr=False)
def cleanup(self) -> None:
if self._tempdir is not None:
self._tempdir.cleanup()
self._tempdir = None
def _decode_header_value(value: str | None) -> str:
if value is None:
return ""
# Decode RFC2047 encoded words (e.g. =?Windows-1252?Q?...?=)
decoded_parts = email.header.decode_header(str(value))
result = ""
for chunk, charset in decoded_parts:
if isinstance(chunk, bytes):
result += chunk.decode(charset or "utf-8", errors="replace")
else:
result += chunk
return result.strip()
def _extract_body(msg: email.message.Message) -> str:
"""Walk MIME parts and extract the best plain-text body."""
plain_parts: list[str] = []
html_parts: list[str] = []
if msg.is_multipart():
for part in msg.walk():
ct = part.get_content_type()
disposition = str(part.get("Content-Disposition", ""))
# Skip attachments
if "attachment" in disposition:
continue
if ct == "text/plain":
payload = part.get_payload(decode=True)
if payload:
charset = part.get_content_charset() or "utf-8"
plain_parts.append(payload.decode(charset, errors="replace"))
elif ct == "text/html":
payload = part.get_payload(decode=True)
if payload:
charset = part.get_content_charset() or "utf-8"
html_parts.append(payload.decode(charset, errors="replace"))
else:
ct = msg.get_content_type()
payload = msg.get_payload(decode=True)
if payload:
charset = msg.get_content_charset() or "utf-8"
text = payload.decode(charset, errors="replace")
if ct == "text/plain":
plain_parts.append(text)
elif ct == "text/html":
html_parts.append(text)
if plain_parts:
return "\n\n".join(plain_parts).strip()
# Fall back to HTML → plain text via BeautifulSoup
if html_parts:
combined_html = "\n".join(html_parts)
soup = BeautifulSoup(combined_html, "html.parser")
return soup.get_text(separator="\n").strip()
return ""
_IMAGE_CONTENT_TYPES = {
"image/jpeg", "image/png", "image/gif",
"image/bmp", "image/tiff", "image/webp",
}
_IMAGE_EXTS = {
"image/jpeg": ".jpg", "image/png": ".png", "image/gif": ".gif",
"image/bmp": ".bmp", "image/tiff": ".tiff", "image/webp": ".webp",
}
def _collect_attachments(
msg: email.message.Message, tmpdir: Path
) -> list[ParsedAttachment]:
"""Extract all attachment parts and write them to tmpdir.
Also captures inline images (CID-embedded) that have no filename.
"""
attachments: list[ParsedAttachment] = []
seen_names: set[str] = set()
inline_image_counter = 0
for part in msg.walk():
disposition = str(part.get("Content-Disposition", ""))
content_type = part.get_content_type()
filename = part.get_filename()
# Inline image without a filename — generate one from Content-ID or counter
if filename is None and content_type in _IMAGE_CONTENT_TYPES:
cid = str(part.get("Content-ID", "")).strip("<>").split("@")[0]
ext = _IMAGE_EXTS.get(content_type, ".img")
filename = f"inline_{cid or inline_image_counter}{ext}"
inline_image_counter += 1
elif filename is None and "attachment" not in disposition:
continue
elif filename is None:
# Unnamed non-image attachment — skip
continue
# Decode RFC2047 filename if needed
decoded_parts = email.header.decode_header(filename)
filename_clean = ""
for chunk, charset in decoded_parts:
if isinstance(chunk, bytes):
filename_clean += chunk.decode(charset or "utf-8", errors="replace")
else:
filename_clean += chunk
# Avoid duplicates
base_name = filename_clean
counter = 1
while filename_clean in seen_names:
stem = Path(base_name).stem
suffix = Path(base_name).suffix
filename_clean = f"{stem}_{counter}{suffix}"
counter += 1
seen_names.add(filename_clean)
payload = part.get_payload(decode=True)
if payload is None:
continue
dest = tmpdir / filename_clean
dest.write_bytes(payload)
attachments.append(
ParsedAttachment(
filename=filename_clean,
path=dest,
content_type=part.get_content_type(),
)
)
return attachments
def parse_eml(eml_path: Path) -> ParsedEmail:
"""Parse an .eml file and return a ParsedEmail object.
The caller is responsible for calling parsed_email.cleanup() when done,
or using ParsedEmail as a context manager is not implemented — keep
the return value alive until you no longer need the attachment paths.
"""
with open(eml_path, "rb") as f:
msg = email.message_from_binary_file(f, policy=email.policy.compat32)
subject = _decode_header_value(msg.get("Subject"))
sender = _decode_header_value(msg.get("From"))
recipient = _decode_header_value(msg.get("To"))
date = _decode_header_value(msg.get("Date"))
body_text = _extract_body(msg)
tmpdir_obj = tempfile.TemporaryDirectory(prefix="email_dlp_")
tmpdir = Path(tmpdir_obj.name)
attachments = _collect_attachments(msg, tmpdir)
return ParsedEmail(
subject=subject,
sender=sender,
recipient=recipient,
date=date,
body_text=body_text,
attachments=attachments,
_tempdir=tmpdir_obj,
)

132
email_dlp/policy.py Normal file
View File

@ -0,0 +1,132 @@
"""DLP policy definitions: violation categories, thresholds, and prompt formatting."""
import json
from .models import ActionClass, RiskLevel
# Risk score thresholds
RISK_THRESHOLDS = {
RiskLevel.CRITICAL: 80,
RiskLevel.HIGH: 60,
RiskLevel.MEDIUM: 40,
RiskLevel.LOW: 0,
}
# Action mapping based on risk level
RISK_TO_ACTION = {
RiskLevel.CRITICAL: ActionClass.BLOCK,
RiskLevel.HIGH: ActionClass.ALERT,
RiskLevel.MEDIUM: ActionClass.ALERT,
RiskLevel.LOW: ActionClass.PASS_,
}
DLP_CATEGORIES = {
"PII": {
"description": "Personally Identifiable Information",
"signals": [
"Full name combined with email address",
"Social Security Number (SSN) or employee ID",
"Phone numbers combined with personal details",
"Home address combined with personal identifiers",
],
"risk_weight": "HIGH to CRITICAL depending on volume",
},
"FINANCIAL_DATA": {
"description": "Non-public financial information",
"signals": [
"Revenue targets, EBITDA projections, internal forecasts",
"Salary figures, compensation plans",
"Invoice amounts and vendor payment terms",
"Internal budget allocations",
],
"risk_weight": "MEDIUM to CRITICAL depending on sensitivity",
},
"SOURCE_CODE": {
"description": "Proprietary source code or model weights",
"signals": [
"Python, Java, or other source files with copyright notices",
"Internal class names and proprietary algorithms",
"Model architecture files or weight files",
"Internal API keys or credentials embedded in code",
],
"risk_weight": "CRITICAL",
},
"REGULATORY_DOCUMENT": {
"description": "Internal regulatory and compliance drafts",
"signals": [
"CFPB, GDPR, or SOX compliance drafts marked internal",
"Audit findings or remediation plans",
"Internal compliance assessments not yet published",
"Regulatory submission drafts",
],
"risk_weight": "CRITICAL",
},
"LEGAL_CONTRACT": {
"description": "Executed or draft legal agreements",
"signals": [
"Non-Disclosure Agreements (NDAs) with named parties",
"Signed contracts with dates and signatures",
"Settlement agreements or legal memoranda",
"Vendor contracts with financial terms",
],
"risk_weight": "HIGH to CRITICAL",
},
"PAYROLL_RECORD": {
"description": "Employee payroll and compensation records",
"signals": [
"Employee ID combined with salary and payroll period",
"Direct deposit details or bank account information",
"Year-to-date earnings and deductions",
"HR compensation reports",
],
"risk_weight": "CRITICAL",
},
"CUSTOMER_LIST": {
"description": "Customer or prospect data in bulk",
"signals": [
"CSV or table with customer names, emails, and revenue figures",
"CRM exports with contact details",
"Prospect lists for sales campaigns",
"Customer PII in aggregate",
],
"risk_weight": "CRITICAL",
},
"INTERNAL_MEMO": {
"description": "Confidential internal communications",
"signals": [
'Documents marked "INTERNAL ONLY" or "DO NOT DISTRIBUTE"',
"CEO or executive strategy memos",
"Organizational restructuring plans",
"Internal performance reviews or headcount discussions",
],
"risk_weight": "HIGH",
},
}
ACTION_THRESHOLDS = {
"BLOCK": "risk_score >= 80 (CRITICAL risk)",
"ALERT": "risk_score >= 40 (MEDIUM or HIGH risk)",
"PASS": "risk_score < 40 (LOW risk)",
}
def format_policy_for_prompt() -> str:
"""Format the DLP policy as a JSON string for injection into the LLM system prompt."""
policy = {
"categories": DLP_CATEGORIES,
"risk_score_thresholds": {
"CRITICAL": "score >= 80",
"HIGH": "score >= 60",
"MEDIUM": "score >= 40",
"LOW": "score < 40",
},
"action_mapping": ACTION_THRESHOLDS,
"instructions": (
"Evaluate the email against ALL categories above. "
"Assign a risk_score from 0 to 100 based on the most severe violation found. "
"Multiple violations increase the score. "
"action must match the threshold: BLOCK if score>=80, ALERT if score>=40, PASS otherwise. "
"evidence must be direct quotes from the actual email or attachment content."
),
}
return json.dumps(policy, indent=2)

View File

@ -0,0 +1,285 @@
"""Policy-based DLP review derived from DLP_CATEGORIES in policy.py."""
from __future__ import annotations
import re
from collections import defaultdict
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
from .policy import DLP_CATEGORIES
# Keywords derived from DLP_CATEGORIES signal descriptions in policy.py
_POLICY_KEYWORDS: dict[ViolationType, dict] = {
ViolationType.PII: {
"keywords": [
"full name",
"email address",
"social security",
"ssn",
"employee id",
"phone number",
"home address",
"personal identifier",
"date of birth",
],
"min_matches": 2,
"base_score": 55,
},
ViolationType.FINANCIAL_DATA: {
"keywords": [
"revenue",
"ebitda",
"projection",
"forecast",
"salary",
"compensation plan",
"invoice",
"amount due",
"payment terms",
"budget",
"gross margin",
"sales data",
],
"min_matches": 1,
"base_score": 50,
},
ViolationType.SOURCE_CODE: {
"keywords": [
"copyright",
"def ",
"class ",
"from __future__",
"import ",
"model weights",
"api key",
"api_key",
"proprietary",
"source code",
"internal source",
],
"min_matches": 2,
"base_score": 85,
},
ViolationType.REGULATORY_DOCUMENT: {
"keywords": [
"cfpb",
"gdpr",
"sox",
"compliance draft",
"not for public release",
"not for public distribution",
"regulatory submission",
"audit findings",
"remediation plan",
"internal compliance",
],
"min_matches": 1,
"base_score": 82,
},
ViolationType.LEGAL_CONTRACT: {
"keywords": [
"non-disclosure",
"nondisclosure",
"nda",
"disclosing party",
"receiving party",
"confidentiality agreement",
"settlement agreement",
"executed contract",
"signed contract",
],
"min_matches": 1,
"base_score": 65,
},
ViolationType.PAYROLL_RECORD: {
"keywords": [
"payroll",
"pay period",
"pay stub",
"direct deposit",
"routing number",
"bank account",
"net pay",
"gross pay",
"tax deductions",
"year-to-date",
"ytd",
"compensation record",
],
"min_matches": 1,
"base_score": 88,
},
ViolationType.CUSTOMER_LIST: {
"keywords": [
"customer list",
"customer_id",
"customer id",
"crm export",
"prospect list",
"top-tier prospect",
"annual_sales",
"company_name",
"bulk export",
"sales campaign",
],
"min_matches": 2,
"base_score": 85,
},
ViolationType.INTERNAL_MEMO: {
"keywords": [
"internal only",
"internal use only",
"do not distribute",
"not for external",
"office of the ceo",
"organizational priorities",
"growth roadmap",
"strictly confidential",
"internal policy document",
"headcount",
],
"min_matches": 1,
"base_score": 55,
},
}
def _normalize(text: str) -> str:
return re.sub(r"\s+", " ", text).strip()
def _find_evidence(text: str, keyword: str) -> str | None:
match = re.search(re.escape(keyword.strip()), text, flags=re.IGNORECASE)
if not match:
return None
start = max(0, match.start() - 60)
end = min(len(text), match.end() + 100)
return _normalize(text[start:end])
def _risk_level_from_score(score: int) -> RiskLevel:
if score >= 80:
return RiskLevel.CRITICAL
if score >= 60:
return RiskLevel.HIGH
if score >= 40:
return RiskLevel.MEDIUM
return RiskLevel.LOW
def _action_from_score(score: int) -> ActionClass:
if score >= 80:
return ActionClass.BLOCK
if score >= 40:
return ActionClass.ALERT
return ActionClass.PASS_
def review_corpus(
email_file: str,
subject: str,
sender: str,
recipient: str,
date: str,
body_text: str,
attachment_texts: list[tuple[str, str]],
attachment_results: list[AttachmentResult],
processing_errors: list[str],
) -> DLPResult:
"""Judge an email using DLP_CATEGORIES signals from policy.py."""
# Build full text corpus
parts = [
f"Subject: {subject}",
f"From: {sender}",
f"To: {recipient}",
body_text,
]
for filename, text in attachment_texts:
parts.append(f"Attachment: {filename}")
parts.append(text)
raw = "\n".join(p for p in parts if p)
lower = raw.lower()
evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
score_map: dict[ViolationType, int] = {}
for vtype, rule in _POLICY_KEYWORDS.items():
keywords: list[str] = rule["keywords"]
min_matches: int = rule["min_matches"]
base_score: int = rule["base_score"]
match_count = 0
for kw in keywords:
if kw.lower() in lower:
match_count += 1
ev = _find_evidence(raw, kw)
if ev and ev not in evidence_map[vtype]:
evidence_map[vtype].append(ev)
if match_count < min_matches:
continue
score = base_score + min(12, (match_count - 1) * 3)
# Context boost: external recipient domain
recipient_lower = recipient.lower()
if any(d in recipient_lower for d in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
score += 6
score_map[vtype] = min(99, score)
if not score_map:
category_desc = DLP_CATEGORIES # keep reference to show it's used
_ = category_desc # suppress unused warning
return DLPResult(
email_file=email_file,
subject=subject,
sender=sender,
recipient=recipient,
date=date,
risk_level=RiskLevel.LOW,
risk_score=12,
violation_types=[ViolationType.NONE],
action=ActionClass.PASS_,
summary="Policy review found no DLP category signals in this email.",
evidence=[],
attachments=attachment_results,
processing_errors=processing_errors,
)
ranked = sorted(score_map.items(), key=lambda x: x[1], reverse=True)
violation_types = [vt for vt, _ in ranked[:3]]
risk_score = ranked[0][1]
if len(ranked) > 1:
risk_score = min(99, risk_score + min(10, 3 * (len(ranked) - 1)))
evidence: list[str] = []
for vt in violation_types:
evidence.extend(evidence_map[vt][:2])
evidence = evidence[:5]
risk_level = _risk_level_from_score(risk_score)
action = _action_from_score(risk_score)
violation_labels = ", ".join(v.value for v in violation_types)
summary = (
f"Policy review flagged {violation_labels} with {risk_level.value} risk "
f"(score {risk_score}) using DLP_CATEGORIES signals from policy.py."
)
return DLPResult(
email_file=email_file,
subject=subject,
sender=sender,
recipient=recipient,
date=date,
risk_level=risk_level,
risk_score=risk_score,
violation_types=violation_types,
action=action,
summary=summary,
evidence=evidence,
attachments=attachment_results,
processing_errors=processing_errors,
)

310
email_dlp/simulator.py Normal file
View File

@ -0,0 +1,310 @@
"""Deterministic local simulator for DLP analysis."""
from __future__ import annotations
import re
from collections import defaultdict
from .converter import IMAGE_SENTINEL
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
_CATEGORY_RULES: dict[ViolationType, dict[str, object]] = {
ViolationType.PII: {
"keywords": [
"personally identifiable information",
" pii",
"employee id",
"account ending",
"direct deposit",
"customer_id",
"first_name",
"last_name",
],
"base_score": 30,
"min_matches": 2,
},
ViolationType.FINANCIAL_DATA: {
"keywords": [
"financial forecast",
"revenue",
"ebitda",
"gross margin",
"margin efficiency",
"sales data",
"annual_sales_usd",
"invoice",
"amount due",
"payment instructions",
"ach",
"budget",
],
"base_score": 42,
},
ViolationType.SOURCE_CODE: {
"keywords": [
"source code",
"api key",
"model weights",
"from __future__ import annotations",
"def ",
"class ",
"@dataclass",
],
"base_score": 88,
"min_matches": 2,
},
ViolationType.REGULATORY_DOCUMENT: {
"keywords": [
"regulatory document",
"regulatory submission",
"cfpb",
"compliance report",
"not for public release",
"draft regulatory",
"prepared by: legal & compliance team",
],
"base_score": 84,
},
ViolationType.LEGAL_CONTRACT: {
"keywords": [
"nondisclosure agreement",
"non-disclosure agreement",
"executed nda",
"disclosing party",
"receiving party",
],
"base_score": 62,
"min_matches": 1,
},
ViolationType.PAYROLL_RECORD: {
"keywords": [
"payroll",
"pay stub",
"compensation record",
"gross:",
"net pay",
"tax deductions",
"pay period",
"direct deposit",
"employee id",
],
"base_score": 90,
},
ViolationType.CUSTOMER_LIST: {
"keywords": [
"customer list",
"prospects",
"crm export",
"raw export",
"customer_id",
"company_name",
"annual_sales_usd",
"top-tier prospects",
],
"base_score": 86,
"min_matches": 2,
},
ViolationType.INTERNAL_MEMO: {
"keywords": [
"internal use only",
"internal memo",
"do not distribute externally",
"office of the ceo",
"organizational priorities",
"growth roadmap",
"internal policy document",
"not for public distribution",
"strictly confidential",
],
"base_score": 52,
"min_matches": 1,
},
}
_RISK_LEVELS = [
(80, RiskLevel.CRITICAL),
(60, RiskLevel.HIGH),
(40, RiskLevel.MEDIUM),
(0, RiskLevel.LOW),
]
def _normalize_text(text: str) -> str:
return re.sub(r"\s+", " ", text).strip()
def _build_corpus(
subject: str,
sender: str,
recipient: str,
body_text: str,
attachment_texts: list[tuple[str, str]],
) -> tuple[str, str]:
text_chunks = [
f"Subject: {subject}",
f"From: {sender}",
f"To: {recipient}",
body_text,
]
for filename, text in attachment_texts:
text_chunks.append(f"Attachment: {filename}")
# Skip binary image data — base64 payloads produce false keyword matches
if not text.startswith(IMAGE_SENTINEL):
text_chunks.append(text)
raw = "\n".join(chunk for chunk in text_chunks if chunk)
return raw, raw.lower()
def _find_evidence(text: str, keyword: str) -> str | None:
pattern = re.escape(keyword.strip())
match = re.search(pattern, text, flags=re.IGNORECASE)
if not match:
return None
start = max(0, match.start() - 60)
end = min(len(text), match.end() + 100)
return _normalize_text(text[start:end])
def _collect_matches(
raw_text: str,
lower_text: str,
) -> tuple[dict[ViolationType, list[str]], dict[ViolationType, int]]:
evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
score_map: dict[ViolationType, int] = {}
for violation_type, rule in _CATEGORY_RULES.items():
keywords = rule["keywords"]
base_score = int(rule["base_score"])
min_matches = int(rule.get("min_matches", 1))
match_count = 0
for keyword in keywords:
# Use word boundaries to avoid substring false positives (e.g. "ach" in "attached")
pattern = r"\b" + re.escape(keyword) + r"\b"
if re.search(pattern, lower_text):
match_count += 1
evidence = _find_evidence(raw_text, keyword)
if evidence and evidence not in evidence_map[violation_type]:
evidence_map[violation_type].append(evidence)
if match_count < min_matches:
continue
score = base_score + min(12, (match_count - 1) * 4)
score_map[violation_type] = min(score, 99)
return evidence_map, score_map
def _apply_context_boosts(
subject: str,
recipient: str,
attachment_texts: list[tuple[str, str]],
score_map: dict[ViolationType, int],
) -> None:
subject_lower = subject.lower()
recipient_lower = recipient.lower()
if any(domain in recipient_lower for domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
for violation_type in list(score_map):
score_map[violation_type] = min(99, score_map[violation_type] + 6)
if "urgent" in subject_lower or "confidential" in subject_lower:
for violation_type in list(score_map):
score_map[violation_type] = min(99, score_map[violation_type] + 2)
attachment_names = " ".join(filename.lower() for filename, _ in attachment_texts)
if ".csv" in attachment_names and ViolationType.CUSTOMER_LIST in score_map:
score_map[ViolationType.CUSTOMER_LIST] = min(
99, score_map[ViolationType.CUSTOMER_LIST] + 6
)
if ".py" in attachment_names and ViolationType.SOURCE_CODE in score_map:
score_map[ViolationType.SOURCE_CODE] = min(
99, score_map[ViolationType.SOURCE_CODE] + 4
)
def _risk_level_from_score(risk_score: int) -> RiskLevel:
for threshold, risk_level in _RISK_LEVELS:
if risk_score >= threshold:
return risk_level
return RiskLevel.LOW
def _action_from_score(risk_score: int) -> ActionClass:
if risk_score >= 80:
return ActionClass.BLOCK
if risk_score >= 40:
return ActionClass.ALERT
return ActionClass.PASS_
def _build_summary(
violation_types: list[ViolationType],
risk_level: RiskLevel,
risk_score: int,
) -> str:
if violation_types == [ViolationType.NONE]:
return "No strong DLP indicators were found in the email body or converted attachments."
labels = ", ".join(v.value for v in violation_types)
return (
f"Simulated DLP review flagged {labels} with {risk_level.value} risk "
f"(score {risk_score}) based on the email body and extracted attachment content."
)
def simulate_analysis(
email_file: str,
subject: str,
sender: str,
recipient: str,
date: str,
body_text: str,
attachment_texts: list[tuple[str, str]],
attachment_results: list[AttachmentResult],
processing_errors: list[str],
) -> DLPResult:
"""Predict a DLP result locally without calling an LLM."""
raw_text, lower_text = _build_corpus(
subject=subject,
sender=sender,
recipient=recipient,
body_text=body_text,
attachment_texts=attachment_texts,
)
evidence_map, score_map = _collect_matches(raw_text, lower_text)
_apply_context_boosts(subject, recipient, attachment_texts, score_map)
if not score_map:
violation_types = [ViolationType.NONE]
risk_score = 18
evidence: list[str] = []
else:
ranked = sorted(score_map.items(), key=lambda item: item[1], reverse=True)
violation_types = [violation for violation, _ in ranked[:3]]
risk_score = ranked[0][1]
if len(ranked) > 1:
risk_score = min(99, risk_score + min(10, 3 * (len(ranked) - 1)))
evidence = []
for violation_type in violation_types:
evidence.extend(evidence_map.get(violation_type, [])[:2])
evidence = evidence[:5]
risk_level = _risk_level_from_score(risk_score)
action = _action_from_score(risk_score)
return DLPResult(
email_file=email_file,
subject=subject,
sender=sender,
recipient=recipient,
date=date,
risk_level=risk_level,
risk_score=risk_score,
violation_types=violation_types,
action=action,
summary=_build_summary(violation_types, risk_level, risk_score),
evidence=evidence,
attachments=attachment_results,
processing_errors=processing_errors,
)