Initial commit
This commit is contained in:
1
email_dlp/__init__.py
Normal file
1
email_dlp/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# Email DLP - Data Loss Prevention for email content
|
||||
241
email_dlp/analyzer.py
Normal file
241
email_dlp/analyzer.py
Normal file
@ -0,0 +1,241 @@
|
||||
"""DLP analysis backends: remote LLM or deterministic local simulation."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import openai
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from .converter import IMAGE_SENTINEL
|
||||
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
|
||||
from .policy import format_policy_for_prompt
|
||||
from .simulator import simulate_analysis
|
||||
|
||||
OUTPUT_SCHEMA = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"risk_level": {"type": "string", "enum": ["CRITICAL", "HIGH", "MEDIUM", "LOW"]},
|
||||
"risk_score": {"type": "integer", "minimum": 0, "maximum": 100},
|
||||
"violation_types": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"PII",
|
||||
"FINANCIAL_DATA",
|
||||
"SOURCE_CODE",
|
||||
"REGULATORY_DOCUMENT",
|
||||
"LEGAL_CONTRACT",
|
||||
"PAYROLL_RECORD",
|
||||
"CUSTOMER_LIST",
|
||||
"INTERNAL_MEMO",
|
||||
"NONE",
|
||||
],
|
||||
},
|
||||
},
|
||||
"action": {"type": "string", "enum": ["PASS", "ALERT", "BLOCK"]},
|
||||
"summary": {"type": "string"},
|
||||
"evidence": {"type": "array", "items": {"type": "string"}},
|
||||
},
|
||||
"required": ["risk_level", "risk_score", "violation_types", "action", "summary", "evidence"],
|
||||
}
|
||||
|
||||
SYSTEM_PROMPT_TEMPLATE = """\
|
||||
You are a Data Loss Prevention (DLP) analyst. Your task is to evaluate email content and attachments against the DLP policy below, then return a structured JSON decision.
|
||||
|
||||
## DLP Policy
|
||||
{policy_json}
|
||||
|
||||
## Output Schema
|
||||
Respond with valid JSON only (no markdown fences, no extra text) matching this schema:
|
||||
{schema_json}
|
||||
|
||||
## Critical Rules
|
||||
1. temperature=0: be deterministic and consistent.
|
||||
2. evidence: include direct quotes (verbatim excerpts) from the actual email or attachment content that justify your decision. Do not paraphrase.
|
||||
3. risk_score: assign 0-100. Use the full range — a customer CSV with 500 rows should score 95+, a casual internal memo scores 60-70.
|
||||
4. action: MUST match the threshold — BLOCK if risk_score>=80, ALERT if risk_score>=40, PASS otherwise.
|
||||
5. If no policy violations are found, set violation_types=["NONE"], risk_level="LOW", risk_score<40, action="PASS".
|
||||
"""
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
|
||||
"""Build the system prompt used for DLP analysis."""
|
||||
return SYSTEM_PROMPT_TEMPLATE.format(
|
||||
policy_json=format_policy_for_prompt(),
|
||||
schema_json=json.dumps(OUTPUT_SCHEMA, indent=2),
|
||||
)
|
||||
|
||||
|
||||
def _build_user_content(
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
date: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]], # [(filename, text_or_sentinel)]
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build the user message content for the VLM.
|
||||
|
||||
Returns a multimodal content list (OpenAI vision format).
|
||||
Text attachments are embedded as text blocks; image attachments are
|
||||
inserted as image_url blocks so the VLM can see them directly.
|
||||
"""
|
||||
header_block = "\n".join([
|
||||
"## Email Headers",
|
||||
f"Subject: {subject}",
|
||||
f"From: {sender}",
|
||||
f"To: {recipient}",
|
||||
f"Date: {date}",
|
||||
"",
|
||||
"## Email Body",
|
||||
body_text or "(empty body)",
|
||||
])
|
||||
content: list[dict[str, Any]] = [{"type": "text", "text": header_block}]
|
||||
|
||||
for filename, text in attachment_texts:
|
||||
if text.startswith(IMAGE_SENTINEL):
|
||||
# IMAGE_SENTINEL format: "__IMAGE__:<mime>:<base64>"
|
||||
payload = text[len(IMAGE_SENTINEL):] # "<mime>:<base64>"
|
||||
mime, b64 = payload.split(":", 1)
|
||||
content.append({"type": "text", "text": f"\n## Attachment: {filename} (image)"})
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:{mime};base64,{b64}"},
|
||||
})
|
||||
else:
|
||||
content.append({
|
||||
"type": "text",
|
||||
"text": f"\n## Attachment: {filename}\n{text or '(no extractable text)'}",
|
||||
})
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def _parse_llm_response(content: str) -> dict[str, Any]:
|
||||
"""Parse JSON from LLM response, handling markdown fences."""
|
||||
content = content.strip()
|
||||
|
||||
# Strip triple-backtick fences
|
||||
fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
||||
if fence_match:
|
||||
content = fence_match.group(1)
|
||||
|
||||
return json.loads(content)
|
||||
|
||||
|
||||
def _map_action(action_str: str) -> ActionClass:
|
||||
mapping = {"PASS": ActionClass.PASS_, "ALERT": ActionClass.ALERT, "BLOCK": ActionClass.BLOCK}
|
||||
return mapping.get(action_str.upper(), ActionClass.ALERT)
|
||||
|
||||
|
||||
def analyze_email(
|
||||
email_file: str,
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
date: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]],
|
||||
attachment_results: list[AttachmentResult],
|
||||
processing_errors: list[str],
|
||||
endpoint: str = "http://localhost:8000/v1",
|
||||
model: str = "Qwen/Qwen3.5-35B-A3B",
|
||||
backend: str = "llm",
|
||||
) -> DLPResult:
|
||||
"""Analyze email content using an LLM or the deterministic simulator."""
|
||||
if backend == "simulated":
|
||||
return simulate_analysis(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
body_text=body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
attachment_results=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
# Use environment variables as fallback if they exist
|
||||
final_endpoint = os.getenv("OPENAI_BASE_URL", endpoint)
|
||||
final_api_key = os.getenv("OPENAI_API_KEY", "not-needed")
|
||||
final_model = os.getenv("MODEL_NAME", model)
|
||||
|
||||
client = openai.OpenAI(base_url=final_endpoint, api_key=final_api_key)
|
||||
|
||||
system_prompt = build_system_prompt()
|
||||
|
||||
user_content = _build_user_content(
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
body_text=body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=final_model,
|
||||
messages=[
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_content}, # list[dict] for multimodal
|
||||
],
|
||||
temperature=0.0,
|
||||
max_tokens=1024,
|
||||
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
|
||||
)
|
||||
|
||||
raw_content = response.choices[0].message.content or ""
|
||||
|
||||
try:
|
||||
parsed = _parse_llm_response(raw_content)
|
||||
except json.JSONDecodeError as e:
|
||||
processing_errors.append(f"JSON parse error: {e}; raw={raw_content[:200]}")
|
||||
# Return a safe fallback
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=RiskLevel.HIGH,
|
||||
risk_score=60,
|
||||
violation_types=[ViolationType.NONE],
|
||||
action=ActionClass.ALERT,
|
||||
summary="Analysis failed due to JSON parse error.",
|
||||
evidence=[],
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
# Map string values to enums
|
||||
risk_level = RiskLevel(parsed.get("risk_level", "HIGH"))
|
||||
violation_types = [
|
||||
ViolationType(v) for v in parsed.get("violation_types", ["NONE"])
|
||||
if v in ViolationType.__members__
|
||||
]
|
||||
if not violation_types:
|
||||
violation_types = [ViolationType.NONE]
|
||||
|
||||
action = _map_action(parsed.get("action", "ALERT"))
|
||||
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=risk_level,
|
||||
risk_score=int(parsed.get("risk_score", 60)),
|
||||
violation_types=violation_types,
|
||||
action=action,
|
||||
summary=parsed.get("summary", ""),
|
||||
evidence=parsed.get("evidence", []),
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
550
email_dlp/cli.py
Normal file
550
email_dlp/cli.py
Normal file
@ -0,0 +1,550 @@
|
||||
"""CLI entry point: preview and batch process .eml files through the DLP pipeline."""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
||||
from rich.table import Table
|
||||
|
||||
from .analyzer import _build_user_content, analyze_email, build_system_prompt
|
||||
from .converter import IMAGE_SENTINEL, convert_attachment
|
||||
from .models import ActionClass, AttachmentResult, DLPResult
|
||||
from .parser import parse_eml
|
||||
from .policy_reviewer import review_corpus
|
||||
from .simulator import simulate_analysis
|
||||
|
||||
app = typer.Typer(help="Email DLP — scan .eml files for data loss prevention policy violations.")
|
||||
console = Console()
|
||||
|
||||
ACTION_COLORS = {
|
||||
ActionClass.BLOCK: "bold red",
|
||||
ActionClass.ALERT: "bold yellow",
|
||||
ActionClass.PASS_: "bold green",
|
||||
}
|
||||
|
||||
RISK_COLORS = {
|
||||
"CRITICAL": "bold red",
|
||||
"HIGH": "red",
|
||||
"MEDIUM": "yellow",
|
||||
"LOW": "green",
|
||||
}
|
||||
|
||||
|
||||
def _process_single_email(
|
||||
eml_path: Path,
|
||||
endpoint: str,
|
||||
model: str,
|
||||
backend: str = "llm",
|
||||
) -> DLPResult:
|
||||
"""Parse, convert, and analyze one .eml file."""
|
||||
processing_errors: list[str] = []
|
||||
|
||||
# 1. Parse MIME
|
||||
parsed = parse_eml(eml_path)
|
||||
|
||||
try:
|
||||
# 2. Convert attachments
|
||||
attachment_texts: list[tuple[str, str]] = []
|
||||
attachment_results: list[AttachmentResult] = []
|
||||
|
||||
for att in parsed.attachments:
|
||||
entries = convert_attachment(att.path, att.filename)
|
||||
for display_name, text, status in entries:
|
||||
if "truncated" in status:
|
||||
processing_errors.append(
|
||||
f"'{display_name}' truncated to 20000 chars"
|
||||
)
|
||||
attachment_texts.append((display_name, text))
|
||||
attachment_results.append(
|
||||
AttachmentResult(
|
||||
filename=display_name,
|
||||
content_type=att.content_type,
|
||||
extracted_text_chars=0 if text.startswith(IMAGE_SENTINEL) else len(text),
|
||||
conversion_status=status.split("|")[0],
|
||||
)
|
||||
)
|
||||
|
||||
# 3. Analyze with LLM
|
||||
result = analyze_email(
|
||||
email_file=eml_path.name,
|
||||
subject=parsed.subject,
|
||||
sender=parsed.sender,
|
||||
recipient=parsed.recipient,
|
||||
date=parsed.date,
|
||||
body_text=parsed.body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
attachment_results=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
endpoint=endpoint,
|
||||
model=model,
|
||||
backend=backend,
|
||||
)
|
||||
finally:
|
||||
parsed.cleanup()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _simulate_single_email(eml_path: Path) -> DLPResult:
|
||||
"""Parse, convert, and simulate one .eml file without an LLM."""
|
||||
processing_errors: list[str] = []
|
||||
parsed = parse_eml(eml_path)
|
||||
|
||||
try:
|
||||
attachment_texts: list[tuple[str, str]] = []
|
||||
attachment_results: list[AttachmentResult] = []
|
||||
|
||||
for att in parsed.attachments:
|
||||
entries = convert_attachment(att.path, att.filename)
|
||||
for display_name, text, status in entries:
|
||||
if "truncated" in status:
|
||||
processing_errors.append(
|
||||
f"'{display_name}' truncated to 20000 chars"
|
||||
)
|
||||
attachment_texts.append((display_name, text))
|
||||
attachment_results.append(
|
||||
AttachmentResult(
|
||||
filename=display_name,
|
||||
content_type=att.content_type,
|
||||
extracted_text_chars=0 if text.startswith(IMAGE_SENTINEL) else len(text),
|
||||
conversion_status=status.split("|")[0],
|
||||
)
|
||||
)
|
||||
|
||||
result = simulate_analysis(
|
||||
email_file=eml_path.name,
|
||||
subject=parsed.subject,
|
||||
sender=parsed.sender,
|
||||
recipient=parsed.recipient,
|
||||
date=parsed.date,
|
||||
body_text=parsed.body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
attachment_results=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
finally:
|
||||
parsed.cleanup()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _preview_single_email(
|
||||
eml_path: Path,
|
||||
include_system_prompt: bool = True,
|
||||
include_full_prompt: bool = False,
|
||||
) -> dict:
|
||||
"""Parse and convert one .eml file without calling the LLM."""
|
||||
parsed = parse_eml(eml_path)
|
||||
|
||||
try:
|
||||
attachments_preview: list[dict[str, object]] = []
|
||||
attachment_texts: list[tuple[str, str]] = []
|
||||
processing_errors: list[str] = []
|
||||
|
||||
for att in parsed.attachments:
|
||||
entries = convert_attachment(att.path, att.filename)
|
||||
for display_name, text, status in entries:
|
||||
is_image = text.startswith(IMAGE_SENTINEL)
|
||||
if "truncated" in status:
|
||||
processing_errors.append(
|
||||
f"'{display_name}' truncated to 20000 chars"
|
||||
)
|
||||
|
||||
attachment_texts.append((display_name, text))
|
||||
attachments_preview.append(
|
||||
{
|
||||
"filename": display_name,
|
||||
"content_type": att.content_type,
|
||||
"conversion_status": status,
|
||||
"is_image": is_image,
|
||||
"extracted_text_chars": 0 if is_image else len(text),
|
||||
"text_preview": (
|
||||
None
|
||||
if is_image
|
||||
else text[:500]
|
||||
),
|
||||
"image_data_url_preview": (
|
||||
None
|
||||
if not is_image
|
||||
else text[:120] + "..."
|
||||
if len(text) > 120
|
||||
else text
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
llm_user_content = _build_user_content(
|
||||
subject=parsed.subject,
|
||||
sender=parsed.sender,
|
||||
recipient=parsed.recipient,
|
||||
date=parsed.date,
|
||||
body_text=parsed.body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
)
|
||||
|
||||
llm_user_content_preview: list[dict[str, object]] = []
|
||||
for block in llm_user_content:
|
||||
if block["type"] == "text":
|
||||
llm_user_content_preview.append(
|
||||
{
|
||||
"type": "text",
|
||||
"text_preview": str(block["text"])[:1000],
|
||||
"text_chars": len(str(block["text"])),
|
||||
}
|
||||
)
|
||||
else:
|
||||
url = str(block["image_url"]["url"])
|
||||
llm_user_content_preview.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"url_preview": url[:120] + ("..." if len(url) > 120 else ""),
|
||||
"url_chars": len(url),
|
||||
}
|
||||
)
|
||||
|
||||
preview_result = {
|
||||
"email_file": eml_path.name,
|
||||
"subject": parsed.subject,
|
||||
"sender": parsed.sender,
|
||||
"recipient": parsed.recipient,
|
||||
"date": parsed.date,
|
||||
"body_text_chars": len(parsed.body_text),
|
||||
"body_text_preview": parsed.body_text[:1000],
|
||||
"attachment_count": len(parsed.attachments),
|
||||
"attachments": attachments_preview,
|
||||
"processing_errors": processing_errors,
|
||||
"llm_user_content_preview": llm_user_content_preview,
|
||||
}
|
||||
|
||||
if include_system_prompt:
|
||||
system_prompt = build_system_prompt()
|
||||
preview_result["llm_system_prompt_preview"] = {
|
||||
"text_preview": system_prompt[:2000],
|
||||
"text_chars": len(system_prompt),
|
||||
}
|
||||
if include_full_prompt:
|
||||
preview_result["llm_system_prompt"] = system_prompt
|
||||
|
||||
if include_full_prompt:
|
||||
preview_result["llm_user_content"] = llm_user_content
|
||||
|
||||
return preview_result
|
||||
finally:
|
||||
parsed.cleanup()
|
||||
|
||||
|
||||
@app.command()
|
||||
def analyze(
|
||||
input_dir: Path = typer.Option(
|
||||
Path("data"),
|
||||
"--input",
|
||||
"-i",
|
||||
help="Directory containing .eml files",
|
||||
),
|
||||
output_dir: Optional[Path] = typer.Option(
|
||||
None,
|
||||
"--output",
|
||||
"-o",
|
||||
help="Directory to write JSON results (defaults to output/analyze-TIMESTAMP)",
|
||||
),
|
||||
endpoint: str = typer.Option(
|
||||
"http://localhost:8000/v1",
|
||||
"--endpoint",
|
||||
help="vLLM OpenAI-compatible endpoint",
|
||||
),
|
||||
model: str = typer.Option(
|
||||
"Qwen/Qwen3.5-35B-A3B",
|
||||
"--model",
|
||||
help="Model name to use for analysis",
|
||||
),
|
||||
backend: str = typer.Option(
|
||||
"llm",
|
||||
"--backend",
|
||||
help="Analysis backend: 'llm' for API calls, 'simulated' for local deterministic analysis",
|
||||
),
|
||||
summary: bool = typer.Option(
|
||||
False,
|
||||
"--summary",
|
||||
"-s",
|
||||
help="Print a summary table after processing",
|
||||
),
|
||||
) -> None:
|
||||
"""Batch analyze all .eml files in INPUT_DIR and write JSON results to OUTPUT_DIR."""
|
||||
if output_dir is None:
|
||||
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
output_dir = Path("output") / f"analyze-{timestamp}"
|
||||
|
||||
eml_files = sorted(input_dir.glob("*.eml"))
|
||||
if not eml_files:
|
||||
console.print(f"[red]No .eml files found in {input_dir}[/red]")
|
||||
raise typer.Exit(1)
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
results: list[DLPResult] = []
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.completed}/{task.total}"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Analyzing emails...", total=len(eml_files))
|
||||
|
||||
for eml_path in eml_files:
|
||||
progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
|
||||
try:
|
||||
result = _process_single_email(eml_path, endpoint, model, backend=backend)
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
|
||||
progress.advance(task)
|
||||
continue
|
||||
|
||||
# Write individual JSON result
|
||||
out_file = output_dir / (eml_path.stem + ".json")
|
||||
out_file.write_text(result.model_dump_json(indent=2))
|
||||
results.append(result)
|
||||
progress.advance(task)
|
||||
|
||||
# Write batch summary
|
||||
batch_summary = {
|
||||
"total": len(results),
|
||||
"by_action": {
|
||||
"BLOCK": sum(1 for r in results if r.action == ActionClass.BLOCK),
|
||||
"ALERT": sum(1 for r in results if r.action == ActionClass.ALERT),
|
||||
"PASS": sum(1 for r in results if r.action == ActionClass.PASS_),
|
||||
},
|
||||
"by_risk": {
|
||||
level: sum(1 for r in results if r.risk_level.value == level)
|
||||
for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
|
||||
},
|
||||
"emails": [
|
||||
{
|
||||
"file": r.email_file,
|
||||
"subject": r.subject,
|
||||
"risk_level": r.risk_level.value,
|
||||
"risk_score": r.risk_score,
|
||||
"action": r.action.value,
|
||||
"violation_types": [v.value for v in r.violation_types],
|
||||
}
|
||||
for r in results
|
||||
],
|
||||
}
|
||||
(output_dir / "batch_summary.json").write_text(
|
||||
json.dumps(batch_summary, indent=2)
|
||||
)
|
||||
|
||||
console.print(f"\n[bold green]Done![/bold green] Processed {len(results)}/{len(eml_files)} emails.")
|
||||
console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
|
||||
|
||||
if summary and results:
|
||||
_print_summary_table(results)
|
||||
|
||||
|
||||
@app.command()
|
||||
def preview(
|
||||
input_dir: Path = typer.Option(
|
||||
Path("data"),
|
||||
"--input",
|
||||
"-i",
|
||||
help="Directory containing .eml files",
|
||||
),
|
||||
output_dir: Optional[Path] = typer.Option(
|
||||
None,
|
||||
"--output",
|
||||
"-o",
|
||||
help="Optional directory to write preview JSON files",
|
||||
),
|
||||
print_json: bool = typer.Option(
|
||||
False,
|
||||
"--print-json",
|
||||
help="Print preview JSON to stdout",
|
||||
),
|
||||
include_system_prompt: bool = typer.Option(
|
||||
True,
|
||||
"--include-system-prompt/--no-system-prompt",
|
||||
help="Include the analyzer system prompt built from policy.py",
|
||||
),
|
||||
include_full_prompt: bool = typer.Option(
|
||||
False,
|
||||
"--include-full-prompt",
|
||||
help="Include the full system prompt and full user content in JSON output",
|
||||
),
|
||||
) -> None:
|
||||
"""Preview parsed email and converted attachment content before LLM analysis."""
|
||||
eml_files = sorted(input_dir.glob("*.eml"))
|
||||
if not eml_files:
|
||||
console.print(f"[red]No .eml files found in {input_dir}[/red]")
|
||||
raise typer.Exit(1)
|
||||
|
||||
if output_dir is not None:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
previews: list[dict] = []
|
||||
|
||||
for eml_path in eml_files:
|
||||
try:
|
||||
preview_result = _preview_single_email(
|
||||
eml_path,
|
||||
include_system_prompt=include_system_prompt,
|
||||
include_full_prompt=include_full_prompt,
|
||||
)
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error previewing {eml_path.name}: {e}[/red]")
|
||||
continue
|
||||
|
||||
previews.append(preview_result)
|
||||
|
||||
if output_dir is not None:
|
||||
out_file = output_dir / f"{eml_path.stem}.preview.json"
|
||||
out_file.write_text(json.dumps(preview_result, indent=2))
|
||||
|
||||
if output_dir is not None:
|
||||
batch_file = output_dir / "batch_preview.json"
|
||||
batch_file.write_text(json.dumps(previews, indent=2))
|
||||
console.print(f"[green]Preview JSON written to[/green] [cyan]{output_dir}[/cyan]")
|
||||
|
||||
if print_json:
|
||||
console.print_json(json.dumps(previews, indent=2))
|
||||
else:
|
||||
table = Table(title="Email Preview Results", show_lines=True)
|
||||
table.add_column("File", style="dim", max_width=45)
|
||||
table.add_column("Body Chars", justify="right")
|
||||
table.add_column("Attachments", justify="right")
|
||||
table.add_column("Errors", justify="right")
|
||||
|
||||
for preview_result in previews:
|
||||
table.add_row(
|
||||
str(preview_result["email_file"]),
|
||||
str(preview_result["body_text_chars"]),
|
||||
str(preview_result["attachment_count"]),
|
||||
str(len(preview_result["processing_errors"])),
|
||||
)
|
||||
|
||||
console.print(table)
|
||||
|
||||
|
||||
@app.command()
|
||||
def simulate(
|
||||
input_dir: Path = typer.Option(
|
||||
Path("data"),
|
||||
"--input",
|
||||
"-i",
|
||||
help="Directory containing .eml files",
|
||||
),
|
||||
output_dir: Optional[Path] = typer.Option(
|
||||
None,
|
||||
"--output",
|
||||
"-o",
|
||||
help="Directory to write simulated JSON results (defaults to output/simulated-TIMESTAMP)",
|
||||
),
|
||||
summary: bool = typer.Option(
|
||||
True,
|
||||
"--summary/--no-summary",
|
||||
help="Print a summary table after processing",
|
||||
),
|
||||
) -> None:
|
||||
"""Batch simulate DLP analysis locally without calling an LLM."""
|
||||
if output_dir is None:
|
||||
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
output_dir = Path("output") / f"simulated-{timestamp}"
|
||||
|
||||
eml_files = sorted(input_dir.glob("*.eml"))
|
||||
if not eml_files:
|
||||
console.print(f"[red]No .eml files found in {input_dir}[/red]")
|
||||
raise typer.Exit(1)
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
results: list[DLPResult] = []
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.completed}/{task.total}"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Simulating email analysis...", total=len(eml_files))
|
||||
|
||||
for eml_path in eml_files:
|
||||
progress.update(task, description=f"[cyan]{eml_path.name[:50]}[/cyan]")
|
||||
try:
|
||||
result = _simulate_single_email(eml_path)
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error processing {eml_path.name}: {e}[/red]")
|
||||
progress.advance(task)
|
||||
continue
|
||||
|
||||
out_file = output_dir / (eml_path.stem + ".json")
|
||||
out_file.write_text(result.model_dump_json(indent=2))
|
||||
results.append(result)
|
||||
progress.advance(task)
|
||||
|
||||
batch_summary = {
|
||||
"total": len(results),
|
||||
"generator": "local-simulator",
|
||||
"model_label": "gpt-5.4-simulated",
|
||||
"by_action": {
|
||||
"BLOCK": sum(1 for r in results if r.action == ActionClass.BLOCK),
|
||||
"ALERT": sum(1 for r in results if r.action == ActionClass.ALERT),
|
||||
"PASS": sum(1 for r in results if r.action == ActionClass.PASS_),
|
||||
},
|
||||
"by_risk": {
|
||||
level: sum(1 for r in results if r.risk_level.value == level)
|
||||
for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
|
||||
},
|
||||
"emails": [
|
||||
{
|
||||
"file": r.email_file,
|
||||
"subject": r.subject,
|
||||
"risk_level": r.risk_level.value,
|
||||
"risk_score": r.risk_score,
|
||||
"action": r.action.value,
|
||||
"violation_types": [v.value for v in r.violation_types],
|
||||
}
|
||||
for r in results
|
||||
],
|
||||
}
|
||||
(output_dir / "batch_summary.json").write_text(
|
||||
json.dumps(batch_summary, indent=2)
|
||||
)
|
||||
|
||||
console.print(
|
||||
f"\n[bold green]Done![/bold green] Simulated {len(results)}/{len(eml_files)} emails."
|
||||
)
|
||||
console.print(f"Results written to: [cyan]{output_dir}/[/cyan]")
|
||||
|
||||
if summary and results:
|
||||
_print_summary_table(results)
|
||||
|
||||
|
||||
def _print_summary_table(results: list[DLPResult]) -> None:
|
||||
"""Print a rich summary table to the console."""
|
||||
table = Table(title="Email DLP Analysis Results", show_lines=True)
|
||||
table.add_column("File", style="dim", max_width=45)
|
||||
table.add_column("Risk Level", justify="center")
|
||||
table.add_column("Score", justify="center")
|
||||
table.add_column("Action", justify="center")
|
||||
table.add_column("Violations", max_width=40)
|
||||
|
||||
for r in results:
|
||||
risk_color = RISK_COLORS.get(r.risk_level.value, "white")
|
||||
action_color = ACTION_COLORS.get(r.action, "white")
|
||||
violations = ", ".join(v.value for v in r.violation_types)
|
||||
|
||||
table.add_row(
|
||||
r.email_file,
|
||||
f"[{risk_color}]{r.risk_level.value}[/{risk_color}]",
|
||||
str(r.risk_score),
|
||||
f"[{action_color}]{r.action.value}[/{action_color}]",
|
||||
violations,
|
||||
)
|
||||
|
||||
console.print(table)
|
||||
238
email_dlp/converter.py
Normal file
238
email_dlp/converter.py
Normal file
@ -0,0 +1,238 @@
|
||||
"""Attachment → markdown text conversion routing."""
|
||||
|
||||
import base64
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
MAX_TEXT_CHARS = 20_000
|
||||
|
||||
# Sentinel prefix used to pass image data through the (text, status) interface.
|
||||
# Format: IMAGE_SENTINEL + "<mime_type>:<base64_data>"
|
||||
IMAGE_SENTINEL = "__IMAGE__:"
|
||||
|
||||
_IMAGE_MIME = {
|
||||
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
|
||||
".png": "image/png", ".gif": "image/gif",
|
||||
".bmp": "image/bmp", ".tiff": "image/tiff", ".webp": "image/webp",
|
||||
".img": "image/png", # fallback for generated inline names
|
||||
}
|
||||
|
||||
|
||||
def _convert_single_file(filepath: Path) -> tuple[str, str]:
|
||||
"""Convert a single file to text. Returns (text, status).
|
||||
|
||||
For image files, text is IMAGE_SENTINEL + "<mime>:<base64>" and
|
||||
status is "ok:image". Callers must check for the sentinel.
|
||||
"""
|
||||
suffix = filepath.suffix.lower()
|
||||
|
||||
# Image — return base64 sentinel for VLM consumption
|
||||
if suffix in _IMAGE_MIME:
|
||||
mime = _IMAGE_MIME[suffix]
|
||||
b64 = base64.b64encode(filepath.read_bytes()).decode()
|
||||
return IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"
|
||||
|
||||
known_binary_exts = {
|
||||
".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
|
||||
".go", ".rb", ".rs", ".sh", ".txt", ".md", ".sql", ".yaml", ".yml",
|
||||
".json", ".xml", ".html", ".htm", ".css",
|
||||
}
|
||||
|
||||
if suffix in known_binary_exts:
|
||||
# Plain text fallback — read directly
|
||||
try:
|
||||
text = filepath.read_text(errors="replace")
|
||||
return text, "ok"
|
||||
except Exception as e:
|
||||
return "", f"failed: {e}"
|
||||
|
||||
# Use markitdown for PDF, DOCX, XLSX, CSV, etc.
|
||||
try:
|
||||
md = MarkItDown()
|
||||
result = md.convert(str(filepath))
|
||||
return result.text_content or "", "ok"
|
||||
except Exception as e:
|
||||
# Fallback to plain-text read for unknown types
|
||||
try:
|
||||
text = filepath.read_text(errors="replace")
|
||||
return text, f"fallback: {e}"
|
||||
except Exception as e2:
|
||||
return "", f"failed: {e2}"
|
||||
|
||||
|
||||
_OFFICE_MEDIA_DIRS = {
|
||||
".docx": "word/media/",
|
||||
".pptx": "ppt/media/",
|
||||
".xlsx": "xl/media/",
|
||||
}
|
||||
|
||||
_IMAGE_EXTS = set(_IMAGE_MIME.keys())
|
||||
|
||||
|
||||
def _extract_pdf_images(
|
||||
filepath: Path, filename: str
|
||||
) -> list[tuple[str, str, str]]:
|
||||
"""Extract embedded images from a PDF using PyMuPDF.
|
||||
|
||||
Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image").
|
||||
Returns empty list if fitz is not installed or no images found.
|
||||
"""
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except ImportError:
|
||||
return []
|
||||
|
||||
results: list[tuple[str, str, str]] = []
|
||||
try:
|
||||
doc = fitz.open(str(filepath))
|
||||
img_index = 0
|
||||
for page in doc:
|
||||
for img in page.get_images():
|
||||
xref = img[0]
|
||||
img_data = doc.extract_image(xref)
|
||||
ext = img_data.get("ext", "png")
|
||||
mime = _IMAGE_MIME.get(f".{ext}", f"image/{ext}")
|
||||
b64 = base64.b64encode(img_data["image"]).decode()
|
||||
display_name = f"{filename}/image_{img_index}.{ext}"
|
||||
results.append((display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"))
|
||||
img_index += 1
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _extract_office_images(
|
||||
filepath: Path, filename: str
|
||||
) -> list[tuple[str, str, str]]:
|
||||
"""Extract embedded images from a DOCX/PPTX/XLSX using zipfile.
|
||||
|
||||
Returns list of (display_name, IMAGE_SENTINEL+..., "ok:image").
|
||||
Returns empty list if the file is not a valid ZIP or has no images.
|
||||
"""
|
||||
suffix = Path(filename).suffix.lower()
|
||||
media_dir = _OFFICE_MEDIA_DIRS.get(suffix)
|
||||
if not media_dir:
|
||||
return []
|
||||
|
||||
results: list[tuple[str, str, str]] = []
|
||||
try:
|
||||
with zipfile.ZipFile(str(filepath), "r") as zf:
|
||||
for name in sorted(zf.namelist()):
|
||||
if not name.startswith(media_dir):
|
||||
continue
|
||||
member_suffix = Path(name).suffix.lower()
|
||||
if member_suffix not in _IMAGE_EXTS:
|
||||
continue
|
||||
mime = _IMAGE_MIME[member_suffix]
|
||||
b64 = base64.b64encode(zf.read(name)).decode()
|
||||
display_name = f"{filename}/{Path(name).name}"
|
||||
results.append((display_name, IMAGE_SENTINEL + f"{mime}:{b64}", "ok:image"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _convert_7z(
|
||||
filepath: Path, archive_name: str
|
||||
) -> list[tuple[str, str, str]]:
|
||||
"""Extract a .7z archive and convert each member.
|
||||
|
||||
Returns list of (display_name, text_or_sentinel, status), one entry per member.
|
||||
display_name uses "archive.7z/member.ext" format.
|
||||
"""
|
||||
try:
|
||||
import py7zr
|
||||
except ImportError:
|
||||
return [(archive_name, "", "failed: py7zr not installed")]
|
||||
|
||||
results: list[tuple[str, str, str]] = []
|
||||
with tempfile.TemporaryDirectory(prefix="email_dlp_7z_") as tmpdir:
|
||||
tmp = Path(tmpdir)
|
||||
try:
|
||||
with py7zr.SevenZipFile(str(filepath), mode="r") as archive:
|
||||
archive.extractall(path=str(tmp))
|
||||
except Exception as e:
|
||||
return [(archive_name, "", f"failed: 7z extraction error: {e}")]
|
||||
|
||||
for member_path in sorted(tmp.rglob("*")):
|
||||
if not member_path.is_file():
|
||||
continue
|
||||
display_name = f"{archive_name}/{member_path.name}"
|
||||
text, status = _convert_single_file(member_path)
|
||||
if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
|
||||
text = text[:MAX_TEXT_CHARS]
|
||||
status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
|
||||
results.append((display_name, text, status))
|
||||
|
||||
return results if results else [(archive_name, "", "skipped")]
|
||||
|
||||
|
||||
def _convert_zip(
|
||||
filepath: Path, archive_name: str
|
||||
) -> list[tuple[str, str, str]]:
|
||||
"""Extract a .zip archive and convert each member.
|
||||
|
||||
Returns list of (display_name, text_or_sentinel, status), one entry per member.
|
||||
display_name uses "archive.zip/member.ext" format.
|
||||
"""
|
||||
import zipfile
|
||||
|
||||
results: list[tuple[str, str, str]] = []
|
||||
with tempfile.TemporaryDirectory(prefix="email_dlp_zip_") as tmpdir:
|
||||
tmp = Path(tmpdir)
|
||||
try:
|
||||
with zipfile.ZipFile(str(filepath), mode="r") as archive:
|
||||
archive.extractall(path=str(tmp))
|
||||
except Exception as e:
|
||||
return [(archive_name, "", f"failed: zip extraction error: {e}")]
|
||||
|
||||
for member_path in sorted(tmp.rglob("*")):
|
||||
if not member_path.is_file():
|
||||
continue
|
||||
display_name = f"{archive_name}/{member_path.name}"
|
||||
text, status = _convert_single_file(member_path)
|
||||
if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
|
||||
text = text[:MAX_TEXT_CHARS]
|
||||
status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
|
||||
results.append((display_name, text, status))
|
||||
|
||||
return results if results else [(archive_name, "", "skipped")]
|
||||
|
||||
|
||||
def convert_attachment(
|
||||
filepath: Path, filename: str
|
||||
) -> list[tuple[str, str, str]]:
|
||||
"""Convert an attachment file for LLM analysis.
|
||||
|
||||
Returns list of (display_name, text_or_sentinel, status).
|
||||
- Non-archive files: single-element list.
|
||||
- .7z archives: one element per member file inside the archive.
|
||||
|
||||
text_or_sentinel is either plain text or IMAGE_SENTINEL + "<mime>:<base64>"
|
||||
for image files. Text is truncated to MAX_TEXT_CHARS (images are not truncated).
|
||||
"""
|
||||
suffix = Path(filename).suffix.lower()
|
||||
|
||||
if suffix == ".7z":
|
||||
return _convert_7z(filepath, filename)
|
||||
elif suffix == ".zip":
|
||||
return _convert_zip(filepath, filename)
|
||||
|
||||
text, status = _convert_single_file(filepath)
|
||||
if not text.startswith(IMAGE_SENTINEL) and len(text) > MAX_TEXT_CHARS:
|
||||
text = text[:MAX_TEXT_CHARS]
|
||||
status = f"{status}|truncated_at_{MAX_TEXT_CHARS}"
|
||||
results = [(filename, text, status)]
|
||||
|
||||
# For PDF and Office files, also extract embedded images
|
||||
if suffix == ".pdf":
|
||||
results.extend(_extract_pdf_images(filepath, filename))
|
||||
elif suffix in _OFFICE_MEDIA_DIRS:
|
||||
results.extend(_extract_office_images(filepath, filename))
|
||||
|
||||
return results
|
||||
52
email_dlp/models.py
Normal file
52
email_dlp/models.py
Normal file
@ -0,0 +1,52 @@
|
||||
"""Pydantic output schema for DLP analysis results."""
|
||||
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class RiskLevel(str, Enum):
|
||||
CRITICAL = "CRITICAL"
|
||||
HIGH = "HIGH"
|
||||
MEDIUM = "MEDIUM"
|
||||
LOW = "LOW"
|
||||
|
||||
|
||||
class ViolationType(str, Enum):
|
||||
PII = "PII"
|
||||
FINANCIAL_DATA = "FINANCIAL_DATA"
|
||||
SOURCE_CODE = "SOURCE_CODE"
|
||||
REGULATORY_DOCUMENT = "REGULATORY_DOCUMENT"
|
||||
LEGAL_CONTRACT = "LEGAL_CONTRACT"
|
||||
PAYROLL_RECORD = "PAYROLL_RECORD"
|
||||
CUSTOMER_LIST = "CUSTOMER_LIST"
|
||||
INTERNAL_MEMO = "INTERNAL_MEMO"
|
||||
NONE = "NONE"
|
||||
|
||||
|
||||
class ActionClass(str, Enum):
|
||||
PASS_ = "PASS"
|
||||
ALERT = "ALERT"
|
||||
BLOCK = "BLOCK"
|
||||
|
||||
|
||||
class AttachmentResult(BaseModel):
|
||||
filename: str
|
||||
content_type: str
|
||||
extracted_text_chars: int = 0
|
||||
conversion_status: str = "ok" # "ok" | "failed" | "skipped"
|
||||
|
||||
|
||||
class DLPResult(BaseModel):
|
||||
email_file: str
|
||||
subject: str
|
||||
sender: str
|
||||
recipient: str
|
||||
date: str
|
||||
risk_level: RiskLevel
|
||||
risk_score: int = Field(ge=0, le=100)
|
||||
violation_types: list[ViolationType]
|
||||
action: ActionClass
|
||||
summary: str
|
||||
evidence: list[str]
|
||||
attachments: list[AttachmentResult]
|
||||
processing_errors: list[str] = Field(default_factory=list)
|
||||
199
email_dlp/parser.py
Normal file
199
email_dlp/parser.py
Normal file
@ -0,0 +1,199 @@
|
||||
"""MIME email parsing: extract headers, body text, and attachments."""
|
||||
|
||||
import email
|
||||
import email.policy
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParsedAttachment:
|
||||
filename: str
|
||||
path: Path
|
||||
content_type: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParsedEmail:
|
||||
subject: str
|
||||
sender: str
|
||||
recipient: str
|
||||
date: str
|
||||
body_text: str
|
||||
attachments: list[ParsedAttachment] = field(default_factory=list)
|
||||
# tempdir must be kept alive by the caller
|
||||
_tempdir: tempfile.TemporaryDirectory | None = field(default=None, repr=False)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
if self._tempdir is not None:
|
||||
self._tempdir.cleanup()
|
||||
self._tempdir = None
|
||||
|
||||
|
||||
def _decode_header_value(value: str | None) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
# Decode RFC2047 encoded words (e.g. =?Windows-1252?Q?...?=)
|
||||
decoded_parts = email.header.decode_header(str(value))
|
||||
result = ""
|
||||
for chunk, charset in decoded_parts:
|
||||
if isinstance(chunk, bytes):
|
||||
result += chunk.decode(charset or "utf-8", errors="replace")
|
||||
else:
|
||||
result += chunk
|
||||
return result.strip()
|
||||
|
||||
|
||||
def _extract_body(msg: email.message.Message) -> str:
|
||||
"""Walk MIME parts and extract the best plain-text body."""
|
||||
plain_parts: list[str] = []
|
||||
html_parts: list[str] = []
|
||||
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
ct = part.get_content_type()
|
||||
disposition = str(part.get("Content-Disposition", ""))
|
||||
# Skip attachments
|
||||
if "attachment" in disposition:
|
||||
continue
|
||||
if ct == "text/plain":
|
||||
payload = part.get_payload(decode=True)
|
||||
if payload:
|
||||
charset = part.get_content_charset() or "utf-8"
|
||||
plain_parts.append(payload.decode(charset, errors="replace"))
|
||||
elif ct == "text/html":
|
||||
payload = part.get_payload(decode=True)
|
||||
if payload:
|
||||
charset = part.get_content_charset() or "utf-8"
|
||||
html_parts.append(payload.decode(charset, errors="replace"))
|
||||
else:
|
||||
ct = msg.get_content_type()
|
||||
payload = msg.get_payload(decode=True)
|
||||
if payload:
|
||||
charset = msg.get_content_charset() or "utf-8"
|
||||
text = payload.decode(charset, errors="replace")
|
||||
if ct == "text/plain":
|
||||
plain_parts.append(text)
|
||||
elif ct == "text/html":
|
||||
html_parts.append(text)
|
||||
|
||||
if plain_parts:
|
||||
return "\n\n".join(plain_parts).strip()
|
||||
|
||||
# Fall back to HTML → plain text via BeautifulSoup
|
||||
if html_parts:
|
||||
combined_html = "\n".join(html_parts)
|
||||
soup = BeautifulSoup(combined_html, "html.parser")
|
||||
return soup.get_text(separator="\n").strip()
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
_IMAGE_CONTENT_TYPES = {
|
||||
"image/jpeg", "image/png", "image/gif",
|
||||
"image/bmp", "image/tiff", "image/webp",
|
||||
}
|
||||
_IMAGE_EXTS = {
|
||||
"image/jpeg": ".jpg", "image/png": ".png", "image/gif": ".gif",
|
||||
"image/bmp": ".bmp", "image/tiff": ".tiff", "image/webp": ".webp",
|
||||
}
|
||||
|
||||
|
||||
def _collect_attachments(
|
||||
msg: email.message.Message, tmpdir: Path
|
||||
) -> list[ParsedAttachment]:
|
||||
"""Extract all attachment parts and write them to tmpdir.
|
||||
|
||||
Also captures inline images (CID-embedded) that have no filename.
|
||||
"""
|
||||
attachments: list[ParsedAttachment] = []
|
||||
seen_names: set[str] = set()
|
||||
inline_image_counter = 0
|
||||
|
||||
for part in msg.walk():
|
||||
disposition = str(part.get("Content-Disposition", ""))
|
||||
content_type = part.get_content_type()
|
||||
filename = part.get_filename()
|
||||
|
||||
# Inline image without a filename — generate one from Content-ID or counter
|
||||
if filename is None and content_type in _IMAGE_CONTENT_TYPES:
|
||||
cid = str(part.get("Content-ID", "")).strip("<>").split("@")[0]
|
||||
ext = _IMAGE_EXTS.get(content_type, ".img")
|
||||
filename = f"inline_{cid or inline_image_counter}{ext}"
|
||||
inline_image_counter += 1
|
||||
elif filename is None and "attachment" not in disposition:
|
||||
continue
|
||||
elif filename is None:
|
||||
# Unnamed non-image attachment — skip
|
||||
continue
|
||||
|
||||
# Decode RFC2047 filename if needed
|
||||
decoded_parts = email.header.decode_header(filename)
|
||||
filename_clean = ""
|
||||
for chunk, charset in decoded_parts:
|
||||
if isinstance(chunk, bytes):
|
||||
filename_clean += chunk.decode(charset or "utf-8", errors="replace")
|
||||
else:
|
||||
filename_clean += chunk
|
||||
|
||||
# Avoid duplicates
|
||||
base_name = filename_clean
|
||||
counter = 1
|
||||
while filename_clean in seen_names:
|
||||
stem = Path(base_name).stem
|
||||
suffix = Path(base_name).suffix
|
||||
filename_clean = f"{stem}_{counter}{suffix}"
|
||||
counter += 1
|
||||
seen_names.add(filename_clean)
|
||||
|
||||
payload = part.get_payload(decode=True)
|
||||
if payload is None:
|
||||
continue
|
||||
|
||||
dest = tmpdir / filename_clean
|
||||
dest.write_bytes(payload)
|
||||
|
||||
attachments.append(
|
||||
ParsedAttachment(
|
||||
filename=filename_clean,
|
||||
path=dest,
|
||||
content_type=part.get_content_type(),
|
||||
)
|
||||
)
|
||||
|
||||
return attachments
|
||||
|
||||
|
||||
def parse_eml(eml_path: Path) -> ParsedEmail:
|
||||
"""Parse an .eml file and return a ParsedEmail object.
|
||||
|
||||
The caller is responsible for calling parsed_email.cleanup() when done,
|
||||
or using ParsedEmail as a context manager is not implemented — keep
|
||||
the return value alive until you no longer need the attachment paths.
|
||||
"""
|
||||
with open(eml_path, "rb") as f:
|
||||
msg = email.message_from_binary_file(f, policy=email.policy.compat32)
|
||||
|
||||
subject = _decode_header_value(msg.get("Subject"))
|
||||
sender = _decode_header_value(msg.get("From"))
|
||||
recipient = _decode_header_value(msg.get("To"))
|
||||
date = _decode_header_value(msg.get("Date"))
|
||||
|
||||
body_text = _extract_body(msg)
|
||||
|
||||
tmpdir_obj = tempfile.TemporaryDirectory(prefix="email_dlp_")
|
||||
tmpdir = Path(tmpdir_obj.name)
|
||||
attachments = _collect_attachments(msg, tmpdir)
|
||||
|
||||
return ParsedEmail(
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
body_text=body_text,
|
||||
attachments=attachments,
|
||||
_tempdir=tmpdir_obj,
|
||||
)
|
||||
132
email_dlp/policy.py
Normal file
132
email_dlp/policy.py
Normal file
@ -0,0 +1,132 @@
|
||||
"""DLP policy definitions: violation categories, thresholds, and prompt formatting."""
|
||||
|
||||
import json
|
||||
|
||||
from .models import ActionClass, RiskLevel
|
||||
|
||||
# Risk score thresholds
|
||||
RISK_THRESHOLDS = {
|
||||
RiskLevel.CRITICAL: 80,
|
||||
RiskLevel.HIGH: 60,
|
||||
RiskLevel.MEDIUM: 40,
|
||||
RiskLevel.LOW: 0,
|
||||
}
|
||||
|
||||
# Action mapping based on risk level
|
||||
RISK_TO_ACTION = {
|
||||
RiskLevel.CRITICAL: ActionClass.BLOCK,
|
||||
RiskLevel.HIGH: ActionClass.ALERT,
|
||||
RiskLevel.MEDIUM: ActionClass.ALERT,
|
||||
RiskLevel.LOW: ActionClass.PASS_,
|
||||
}
|
||||
|
||||
DLP_CATEGORIES = {
|
||||
"PII": {
|
||||
"description": "Personally Identifiable Information",
|
||||
"signals": [
|
||||
"Full name combined with email address",
|
||||
"Social Security Number (SSN) or employee ID",
|
||||
"Phone numbers combined with personal details",
|
||||
"Home address combined with personal identifiers",
|
||||
],
|
||||
"risk_weight": "HIGH to CRITICAL depending on volume",
|
||||
},
|
||||
"FINANCIAL_DATA": {
|
||||
"description": "Non-public financial information",
|
||||
"signals": [
|
||||
"Revenue targets, EBITDA projections, internal forecasts",
|
||||
"Salary figures, compensation plans",
|
||||
"Invoice amounts and vendor payment terms",
|
||||
"Internal budget allocations",
|
||||
],
|
||||
"risk_weight": "MEDIUM to CRITICAL depending on sensitivity",
|
||||
},
|
||||
"SOURCE_CODE": {
|
||||
"description": "Proprietary source code or model weights",
|
||||
"signals": [
|
||||
"Python, Java, or other source files with copyright notices",
|
||||
"Internal class names and proprietary algorithms",
|
||||
"Model architecture files or weight files",
|
||||
"Internal API keys or credentials embedded in code",
|
||||
],
|
||||
"risk_weight": "CRITICAL",
|
||||
},
|
||||
"REGULATORY_DOCUMENT": {
|
||||
"description": "Internal regulatory and compliance drafts",
|
||||
"signals": [
|
||||
"CFPB, GDPR, or SOX compliance drafts marked internal",
|
||||
"Audit findings or remediation plans",
|
||||
"Internal compliance assessments not yet published",
|
||||
"Regulatory submission drafts",
|
||||
],
|
||||
"risk_weight": "CRITICAL",
|
||||
},
|
||||
"LEGAL_CONTRACT": {
|
||||
"description": "Executed or draft legal agreements",
|
||||
"signals": [
|
||||
"Non-Disclosure Agreements (NDAs) with named parties",
|
||||
"Signed contracts with dates and signatures",
|
||||
"Settlement agreements or legal memoranda",
|
||||
"Vendor contracts with financial terms",
|
||||
],
|
||||
"risk_weight": "HIGH to CRITICAL",
|
||||
},
|
||||
"PAYROLL_RECORD": {
|
||||
"description": "Employee payroll and compensation records",
|
||||
"signals": [
|
||||
"Employee ID combined with salary and payroll period",
|
||||
"Direct deposit details or bank account information",
|
||||
"Year-to-date earnings and deductions",
|
||||
"HR compensation reports",
|
||||
],
|
||||
"risk_weight": "CRITICAL",
|
||||
},
|
||||
"CUSTOMER_LIST": {
|
||||
"description": "Customer or prospect data in bulk",
|
||||
"signals": [
|
||||
"CSV or table with customer names, emails, and revenue figures",
|
||||
"CRM exports with contact details",
|
||||
"Prospect lists for sales campaigns",
|
||||
"Customer PII in aggregate",
|
||||
],
|
||||
"risk_weight": "CRITICAL",
|
||||
},
|
||||
"INTERNAL_MEMO": {
|
||||
"description": "Confidential internal communications",
|
||||
"signals": [
|
||||
'Documents marked "INTERNAL ONLY" or "DO NOT DISTRIBUTE"',
|
||||
"CEO or executive strategy memos",
|
||||
"Organizational restructuring plans",
|
||||
"Internal performance reviews or headcount discussions",
|
||||
],
|
||||
"risk_weight": "HIGH",
|
||||
},
|
||||
}
|
||||
|
||||
ACTION_THRESHOLDS = {
|
||||
"BLOCK": "risk_score >= 80 (CRITICAL risk)",
|
||||
"ALERT": "risk_score >= 40 (MEDIUM or HIGH risk)",
|
||||
"PASS": "risk_score < 40 (LOW risk)",
|
||||
}
|
||||
|
||||
|
||||
def format_policy_for_prompt() -> str:
|
||||
"""Format the DLP policy as a JSON string for injection into the LLM system prompt."""
|
||||
policy = {
|
||||
"categories": DLP_CATEGORIES,
|
||||
"risk_score_thresholds": {
|
||||
"CRITICAL": "score >= 80",
|
||||
"HIGH": "score >= 60",
|
||||
"MEDIUM": "score >= 40",
|
||||
"LOW": "score < 40",
|
||||
},
|
||||
"action_mapping": ACTION_THRESHOLDS,
|
||||
"instructions": (
|
||||
"Evaluate the email against ALL categories above. "
|
||||
"Assign a risk_score from 0 to 100 based on the most severe violation found. "
|
||||
"Multiple violations increase the score. "
|
||||
"action must match the threshold: BLOCK if score>=80, ALERT if score>=40, PASS otherwise. "
|
||||
"evidence must be direct quotes from the actual email or attachment content."
|
||||
),
|
||||
}
|
||||
return json.dumps(policy, indent=2)
|
||||
285
email_dlp/policy_reviewer.py
Normal file
285
email_dlp/policy_reviewer.py
Normal file
@ -0,0 +1,285 @@
|
||||
"""Policy-based DLP review derived from DLP_CATEGORIES in policy.py."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
|
||||
from .policy import DLP_CATEGORIES
|
||||
|
||||
# Keywords derived from DLP_CATEGORIES signal descriptions in policy.py
|
||||
_POLICY_KEYWORDS: dict[ViolationType, dict] = {
|
||||
ViolationType.PII: {
|
||||
"keywords": [
|
||||
"full name",
|
||||
"email address",
|
||||
"social security",
|
||||
"ssn",
|
||||
"employee id",
|
||||
"phone number",
|
||||
"home address",
|
||||
"personal identifier",
|
||||
"date of birth",
|
||||
],
|
||||
"min_matches": 2,
|
||||
"base_score": 55,
|
||||
},
|
||||
ViolationType.FINANCIAL_DATA: {
|
||||
"keywords": [
|
||||
"revenue",
|
||||
"ebitda",
|
||||
"projection",
|
||||
"forecast",
|
||||
"salary",
|
||||
"compensation plan",
|
||||
"invoice",
|
||||
"amount due",
|
||||
"payment terms",
|
||||
"budget",
|
||||
"gross margin",
|
||||
"sales data",
|
||||
],
|
||||
"min_matches": 1,
|
||||
"base_score": 50,
|
||||
},
|
||||
ViolationType.SOURCE_CODE: {
|
||||
"keywords": [
|
||||
"copyright",
|
||||
"def ",
|
||||
"class ",
|
||||
"from __future__",
|
||||
"import ",
|
||||
"model weights",
|
||||
"api key",
|
||||
"api_key",
|
||||
"proprietary",
|
||||
"source code",
|
||||
"internal source",
|
||||
],
|
||||
"min_matches": 2,
|
||||
"base_score": 85,
|
||||
},
|
||||
ViolationType.REGULATORY_DOCUMENT: {
|
||||
"keywords": [
|
||||
"cfpb",
|
||||
"gdpr",
|
||||
"sox",
|
||||
"compliance draft",
|
||||
"not for public release",
|
||||
"not for public distribution",
|
||||
"regulatory submission",
|
||||
"audit findings",
|
||||
"remediation plan",
|
||||
"internal compliance",
|
||||
],
|
||||
"min_matches": 1,
|
||||
"base_score": 82,
|
||||
},
|
||||
ViolationType.LEGAL_CONTRACT: {
|
||||
"keywords": [
|
||||
"non-disclosure",
|
||||
"nondisclosure",
|
||||
"nda",
|
||||
"disclosing party",
|
||||
"receiving party",
|
||||
"confidentiality agreement",
|
||||
"settlement agreement",
|
||||
"executed contract",
|
||||
"signed contract",
|
||||
],
|
||||
"min_matches": 1,
|
||||
"base_score": 65,
|
||||
},
|
||||
ViolationType.PAYROLL_RECORD: {
|
||||
"keywords": [
|
||||
"payroll",
|
||||
"pay period",
|
||||
"pay stub",
|
||||
"direct deposit",
|
||||
"routing number",
|
||||
"bank account",
|
||||
"net pay",
|
||||
"gross pay",
|
||||
"tax deductions",
|
||||
"year-to-date",
|
||||
"ytd",
|
||||
"compensation record",
|
||||
],
|
||||
"min_matches": 1,
|
||||
"base_score": 88,
|
||||
},
|
||||
ViolationType.CUSTOMER_LIST: {
|
||||
"keywords": [
|
||||
"customer list",
|
||||
"customer_id",
|
||||
"customer id",
|
||||
"crm export",
|
||||
"prospect list",
|
||||
"top-tier prospect",
|
||||
"annual_sales",
|
||||
"company_name",
|
||||
"bulk export",
|
||||
"sales campaign",
|
||||
],
|
||||
"min_matches": 2,
|
||||
"base_score": 85,
|
||||
},
|
||||
ViolationType.INTERNAL_MEMO: {
|
||||
"keywords": [
|
||||
"internal only",
|
||||
"internal use only",
|
||||
"do not distribute",
|
||||
"not for external",
|
||||
"office of the ceo",
|
||||
"organizational priorities",
|
||||
"growth roadmap",
|
||||
"strictly confidential",
|
||||
"internal policy document",
|
||||
"headcount",
|
||||
],
|
||||
"min_matches": 1,
|
||||
"base_score": 55,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _normalize(text: str) -> str:
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def _find_evidence(text: str, keyword: str) -> str | None:
|
||||
match = re.search(re.escape(keyword.strip()), text, flags=re.IGNORECASE)
|
||||
if not match:
|
||||
return None
|
||||
start = max(0, match.start() - 60)
|
||||
end = min(len(text), match.end() + 100)
|
||||
return _normalize(text[start:end])
|
||||
|
||||
|
||||
def _risk_level_from_score(score: int) -> RiskLevel:
|
||||
if score >= 80:
|
||||
return RiskLevel.CRITICAL
|
||||
if score >= 60:
|
||||
return RiskLevel.HIGH
|
||||
if score >= 40:
|
||||
return RiskLevel.MEDIUM
|
||||
return RiskLevel.LOW
|
||||
|
||||
|
||||
def _action_from_score(score: int) -> ActionClass:
|
||||
if score >= 80:
|
||||
return ActionClass.BLOCK
|
||||
if score >= 40:
|
||||
return ActionClass.ALERT
|
||||
return ActionClass.PASS_
|
||||
|
||||
|
||||
def review_corpus(
|
||||
email_file: str,
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
date: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]],
|
||||
attachment_results: list[AttachmentResult],
|
||||
processing_errors: list[str],
|
||||
) -> DLPResult:
|
||||
"""Judge an email using DLP_CATEGORIES signals from policy.py."""
|
||||
# Build full text corpus
|
||||
parts = [
|
||||
f"Subject: {subject}",
|
||||
f"From: {sender}",
|
||||
f"To: {recipient}",
|
||||
body_text,
|
||||
]
|
||||
for filename, text in attachment_texts:
|
||||
parts.append(f"Attachment: {filename}")
|
||||
parts.append(text)
|
||||
|
||||
raw = "\n".join(p for p in parts if p)
|
||||
lower = raw.lower()
|
||||
|
||||
evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
|
||||
score_map: dict[ViolationType, int] = {}
|
||||
|
||||
for vtype, rule in _POLICY_KEYWORDS.items():
|
||||
keywords: list[str] = rule["keywords"]
|
||||
min_matches: int = rule["min_matches"]
|
||||
base_score: int = rule["base_score"]
|
||||
match_count = 0
|
||||
|
||||
for kw in keywords:
|
||||
if kw.lower() in lower:
|
||||
match_count += 1
|
||||
ev = _find_evidence(raw, kw)
|
||||
if ev and ev not in evidence_map[vtype]:
|
||||
evidence_map[vtype].append(ev)
|
||||
|
||||
if match_count < min_matches:
|
||||
continue
|
||||
|
||||
score = base_score + min(12, (match_count - 1) * 3)
|
||||
|
||||
# Context boost: external recipient domain
|
||||
recipient_lower = recipient.lower()
|
||||
if any(d in recipient_lower for d in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
|
||||
score += 6
|
||||
|
||||
score_map[vtype] = min(99, score)
|
||||
|
||||
if not score_map:
|
||||
category_desc = DLP_CATEGORIES # keep reference to show it's used
|
||||
_ = category_desc # suppress unused warning
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=RiskLevel.LOW,
|
||||
risk_score=12,
|
||||
violation_types=[ViolationType.NONE],
|
||||
action=ActionClass.PASS_,
|
||||
summary="Policy review found no DLP category signals in this email.",
|
||||
evidence=[],
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
ranked = sorted(score_map.items(), key=lambda x: x[1], reverse=True)
|
||||
violation_types = [vt for vt, _ in ranked[:3]]
|
||||
risk_score = ranked[0][1]
|
||||
if len(ranked) > 1:
|
||||
risk_score = min(99, risk_score + min(10, 3 * (len(ranked) - 1)))
|
||||
|
||||
evidence: list[str] = []
|
||||
for vt in violation_types:
|
||||
evidence.extend(evidence_map[vt][:2])
|
||||
evidence = evidence[:5]
|
||||
|
||||
risk_level = _risk_level_from_score(risk_score)
|
||||
action = _action_from_score(risk_score)
|
||||
|
||||
violation_labels = ", ".join(v.value for v in violation_types)
|
||||
summary = (
|
||||
f"Policy review flagged {violation_labels} with {risk_level.value} risk "
|
||||
f"(score {risk_score}) using DLP_CATEGORIES signals from policy.py."
|
||||
)
|
||||
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=risk_level,
|
||||
risk_score=risk_score,
|
||||
violation_types=violation_types,
|
||||
action=action,
|
||||
summary=summary,
|
||||
evidence=evidence,
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
310
email_dlp/simulator.py
Normal file
310
email_dlp/simulator.py
Normal file
@ -0,0 +1,310 @@
|
||||
"""Deterministic local simulator for DLP analysis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from .converter import IMAGE_SENTINEL
|
||||
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
|
||||
|
||||
_CATEGORY_RULES: dict[ViolationType, dict[str, object]] = {
|
||||
ViolationType.PII: {
|
||||
"keywords": [
|
||||
"personally identifiable information",
|
||||
" pii",
|
||||
"employee id",
|
||||
"account ending",
|
||||
"direct deposit",
|
||||
"customer_id",
|
||||
"first_name",
|
||||
"last_name",
|
||||
],
|
||||
"base_score": 30,
|
||||
"min_matches": 2,
|
||||
},
|
||||
ViolationType.FINANCIAL_DATA: {
|
||||
"keywords": [
|
||||
"financial forecast",
|
||||
"revenue",
|
||||
"ebitda",
|
||||
"gross margin",
|
||||
"margin efficiency",
|
||||
"sales data",
|
||||
"annual_sales_usd",
|
||||
"invoice",
|
||||
"amount due",
|
||||
"payment instructions",
|
||||
"ach",
|
||||
"budget",
|
||||
],
|
||||
"base_score": 42,
|
||||
},
|
||||
ViolationType.SOURCE_CODE: {
|
||||
"keywords": [
|
||||
"source code",
|
||||
"api key",
|
||||
"model weights",
|
||||
"from __future__ import annotations",
|
||||
"def ",
|
||||
"class ",
|
||||
"@dataclass",
|
||||
],
|
||||
"base_score": 88,
|
||||
"min_matches": 2,
|
||||
},
|
||||
ViolationType.REGULATORY_DOCUMENT: {
|
||||
"keywords": [
|
||||
"regulatory document",
|
||||
"regulatory submission",
|
||||
"cfpb",
|
||||
"compliance report",
|
||||
"not for public release",
|
||||
"draft regulatory",
|
||||
"prepared by: legal & compliance team",
|
||||
],
|
||||
"base_score": 84,
|
||||
},
|
||||
ViolationType.LEGAL_CONTRACT: {
|
||||
"keywords": [
|
||||
"nondisclosure agreement",
|
||||
"non-disclosure agreement",
|
||||
"executed nda",
|
||||
"disclosing party",
|
||||
"receiving party",
|
||||
],
|
||||
"base_score": 62,
|
||||
"min_matches": 1,
|
||||
},
|
||||
ViolationType.PAYROLL_RECORD: {
|
||||
"keywords": [
|
||||
"payroll",
|
||||
"pay stub",
|
||||
"compensation record",
|
||||
"gross:",
|
||||
"net pay",
|
||||
"tax deductions",
|
||||
"pay period",
|
||||
"direct deposit",
|
||||
"employee id",
|
||||
],
|
||||
"base_score": 90,
|
||||
},
|
||||
ViolationType.CUSTOMER_LIST: {
|
||||
"keywords": [
|
||||
"customer list",
|
||||
"prospects",
|
||||
"crm export",
|
||||
"raw export",
|
||||
"customer_id",
|
||||
"company_name",
|
||||
"annual_sales_usd",
|
||||
"top-tier prospects",
|
||||
],
|
||||
"base_score": 86,
|
||||
"min_matches": 2,
|
||||
},
|
||||
ViolationType.INTERNAL_MEMO: {
|
||||
"keywords": [
|
||||
"internal use only",
|
||||
"internal memo",
|
||||
"do not distribute externally",
|
||||
"office of the ceo",
|
||||
"organizational priorities",
|
||||
"growth roadmap",
|
||||
"internal policy document",
|
||||
"not for public distribution",
|
||||
"strictly confidential",
|
||||
],
|
||||
"base_score": 52,
|
||||
"min_matches": 1,
|
||||
},
|
||||
}
|
||||
|
||||
_RISK_LEVELS = [
|
||||
(80, RiskLevel.CRITICAL),
|
||||
(60, RiskLevel.HIGH),
|
||||
(40, RiskLevel.MEDIUM),
|
||||
(0, RiskLevel.LOW),
|
||||
]
|
||||
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def _build_corpus(
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]],
|
||||
) -> tuple[str, str]:
|
||||
text_chunks = [
|
||||
f"Subject: {subject}",
|
||||
f"From: {sender}",
|
||||
f"To: {recipient}",
|
||||
body_text,
|
||||
]
|
||||
for filename, text in attachment_texts:
|
||||
text_chunks.append(f"Attachment: {filename}")
|
||||
# Skip binary image data — base64 payloads produce false keyword matches
|
||||
if not text.startswith(IMAGE_SENTINEL):
|
||||
text_chunks.append(text)
|
||||
raw = "\n".join(chunk for chunk in text_chunks if chunk)
|
||||
return raw, raw.lower()
|
||||
|
||||
|
||||
def _find_evidence(text: str, keyword: str) -> str | None:
|
||||
pattern = re.escape(keyword.strip())
|
||||
match = re.search(pattern, text, flags=re.IGNORECASE)
|
||||
if not match:
|
||||
return None
|
||||
start = max(0, match.start() - 60)
|
||||
end = min(len(text), match.end() + 100)
|
||||
return _normalize_text(text[start:end])
|
||||
|
||||
|
||||
def _collect_matches(
|
||||
raw_text: str,
|
||||
lower_text: str,
|
||||
) -> tuple[dict[ViolationType, list[str]], dict[ViolationType, int]]:
|
||||
evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
|
||||
score_map: dict[ViolationType, int] = {}
|
||||
|
||||
for violation_type, rule in _CATEGORY_RULES.items():
|
||||
keywords = rule["keywords"]
|
||||
base_score = int(rule["base_score"])
|
||||
min_matches = int(rule.get("min_matches", 1))
|
||||
match_count = 0
|
||||
|
||||
for keyword in keywords:
|
||||
# Use word boundaries to avoid substring false positives (e.g. "ach" in "attached")
|
||||
pattern = r"\b" + re.escape(keyword) + r"\b"
|
||||
if re.search(pattern, lower_text):
|
||||
match_count += 1
|
||||
evidence = _find_evidence(raw_text, keyword)
|
||||
if evidence and evidence not in evidence_map[violation_type]:
|
||||
evidence_map[violation_type].append(evidence)
|
||||
|
||||
if match_count < min_matches:
|
||||
continue
|
||||
|
||||
score = base_score + min(12, (match_count - 1) * 4)
|
||||
score_map[violation_type] = min(score, 99)
|
||||
|
||||
return evidence_map, score_map
|
||||
|
||||
|
||||
def _apply_context_boosts(
|
||||
subject: str,
|
||||
recipient: str,
|
||||
attachment_texts: list[tuple[str, str]],
|
||||
score_map: dict[ViolationType, int],
|
||||
) -> None:
|
||||
subject_lower = subject.lower()
|
||||
recipient_lower = recipient.lower()
|
||||
|
||||
if any(domain in recipient_lower for domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
|
||||
for violation_type in list(score_map):
|
||||
score_map[violation_type] = min(99, score_map[violation_type] + 6)
|
||||
|
||||
if "urgent" in subject_lower or "confidential" in subject_lower:
|
||||
for violation_type in list(score_map):
|
||||
score_map[violation_type] = min(99, score_map[violation_type] + 2)
|
||||
|
||||
attachment_names = " ".join(filename.lower() for filename, _ in attachment_texts)
|
||||
if ".csv" in attachment_names and ViolationType.CUSTOMER_LIST in score_map:
|
||||
score_map[ViolationType.CUSTOMER_LIST] = min(
|
||||
99, score_map[ViolationType.CUSTOMER_LIST] + 6
|
||||
)
|
||||
if ".py" in attachment_names and ViolationType.SOURCE_CODE in score_map:
|
||||
score_map[ViolationType.SOURCE_CODE] = min(
|
||||
99, score_map[ViolationType.SOURCE_CODE] + 4
|
||||
)
|
||||
|
||||
|
||||
def _risk_level_from_score(risk_score: int) -> RiskLevel:
|
||||
for threshold, risk_level in _RISK_LEVELS:
|
||||
if risk_score >= threshold:
|
||||
return risk_level
|
||||
return RiskLevel.LOW
|
||||
|
||||
|
||||
def _action_from_score(risk_score: int) -> ActionClass:
|
||||
if risk_score >= 80:
|
||||
return ActionClass.BLOCK
|
||||
if risk_score >= 40:
|
||||
return ActionClass.ALERT
|
||||
return ActionClass.PASS_
|
||||
|
||||
|
||||
def _build_summary(
|
||||
violation_types: list[ViolationType],
|
||||
risk_level: RiskLevel,
|
||||
risk_score: int,
|
||||
) -> str:
|
||||
if violation_types == [ViolationType.NONE]:
|
||||
return "No strong DLP indicators were found in the email body or converted attachments."
|
||||
labels = ", ".join(v.value for v in violation_types)
|
||||
return (
|
||||
f"Simulated DLP review flagged {labels} with {risk_level.value} risk "
|
||||
f"(score {risk_score}) based on the email body and extracted attachment content."
|
||||
)
|
||||
|
||||
|
||||
def simulate_analysis(
|
||||
email_file: str,
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
date: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]],
|
||||
attachment_results: list[AttachmentResult],
|
||||
processing_errors: list[str],
|
||||
) -> DLPResult:
|
||||
"""Predict a DLP result locally without calling an LLM."""
|
||||
raw_text, lower_text = _build_corpus(
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
body_text=body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
)
|
||||
evidence_map, score_map = _collect_matches(raw_text, lower_text)
|
||||
_apply_context_boosts(subject, recipient, attachment_texts, score_map)
|
||||
|
||||
if not score_map:
|
||||
violation_types = [ViolationType.NONE]
|
||||
risk_score = 18
|
||||
evidence: list[str] = []
|
||||
else:
|
||||
ranked = sorted(score_map.items(), key=lambda item: item[1], reverse=True)
|
||||
violation_types = [violation for violation, _ in ranked[:3]]
|
||||
risk_score = ranked[0][1]
|
||||
if len(ranked) > 1:
|
||||
risk_score = min(99, risk_score + min(10, 3 * (len(ranked) - 1)))
|
||||
evidence = []
|
||||
for violation_type in violation_types:
|
||||
evidence.extend(evidence_map.get(violation_type, [])[:2])
|
||||
evidence = evidence[:5]
|
||||
|
||||
risk_level = _risk_level_from_score(risk_score)
|
||||
action = _action_from_score(risk_score)
|
||||
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=risk_level,
|
||||
risk_score=risk_score,
|
||||
violation_types=violation_types,
|
||||
action=action,
|
||||
summary=_build_summary(violation_types, risk_level, risk_score),
|
||||
evidence=evidence,
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
Reference in New Issue
Block a user