Initial commit

This commit is contained in:
2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions

310
email_dlp/simulator.py Normal file
View File

@ -0,0 +1,310 @@
"""Deterministic local simulator for DLP analysis."""
from __future__ import annotations
import re
from collections import defaultdict
from .converter import IMAGE_SENTINEL
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
# Keyword rules for each violation category, consumed by _collect_matches.
#
# Every rule is a dict with:
#   "keywords"    - lowercase phrases searched for in the lowercased corpus.
#   "base_score"  - score the category starts from once it triggers; extra
#                   keyword hits add a small capped bonus on top.
#   "min_matches" - optional number of distinct keywords that must hit before
#                   the category triggers; treated as 1 when the key is absent.
#
# NOTE: dict insertion order matters. simulate_analysis ranks categories with
# a stable sort, so categories with equal scores keep the order used here.
_CATEGORY_RULES: dict[ViolationType, dict[str, object]] = {
    ViolationType.PII: {
        "keywords": [
            "personally identifiable information",
            # NOTE(review): the leading space looks deliberate (presumably to
            # avoid hitting "pii" inside a longer token) - confirm before
            # normalizing it.
            " pii",
            "employee id",
            "account ending",
            "direct deposit",
            "customer_id",
            "first_name",
            "last_name",
        ],
        "base_score": 30,
        "min_matches": 2,
    },
    ViolationType.FINANCIAL_DATA: {
        "keywords": [
            "financial forecast",
            "revenue",
            "ebitda",
            "gross margin",
            "margin efficiency",
            "sales data",
            "annual_sales_usd",
            "invoice",
            "amount due",
            "payment instructions",
            "ach",
            "budget",
        ],
        "base_score": 42,
    },
    ViolationType.SOURCE_CODE: {
        "keywords": [
            "source code",
            "api key",
            "model weights",
            "from __future__ import annotations",
            # Trailing spaces on the next two entries are part of the keyword.
            "def ",
            "class ",
            "@dataclass",
        ],
        "base_score": 88,
        "min_matches": 2,
    },
    ViolationType.REGULATORY_DOCUMENT: {
        "keywords": [
            "regulatory document",
            "regulatory submission",
            "cfpb",
            "compliance report",
            "not for public release",
            "draft regulatory",
            "prepared by: legal & compliance team",
        ],
        "base_score": 84,
    },
    ViolationType.LEGAL_CONTRACT: {
        "keywords": [
            "nondisclosure agreement",
            "non-disclosure agreement",
            "executed nda",
            "disclosing party",
            "receiving party",
        ],
        "base_score": 62,
        "min_matches": 1,
    },
    ViolationType.PAYROLL_RECORD: {
        "keywords": [
            "payroll",
            "pay stub",
            "compensation record",
            "gross:",
            "net pay",
            "tax deductions",
            "pay period",
            # "direct deposit" and "employee id" are also listed under PII.
            "direct deposit",
            "employee id",
        ],
        "base_score": 90,
    },
    ViolationType.CUSTOMER_LIST: {
        "keywords": [
            "customer list",
            "prospects",
            "crm export",
            "raw export",
            "customer_id",
            "company_name",
            "annual_sales_usd",
            "top-tier prospects",
        ],
        "base_score": 86,
        "min_matches": 2,
    },
    ViolationType.INTERNAL_MEMO: {
        "keywords": [
            "internal use only",
            "internal memo",
            "do not distribute externally",
            "office of the ceo",
            "organizational priorities",
            "growth roadmap",
            "internal policy document",
            "not for public distribution",
            "strictly confidential",
        ],
        "base_score": 52,
        "min_matches": 1,
    },
}
# (minimum score, risk level) pairs, listed from the highest threshold down.
# _risk_level_from_score walks this list in order and returns the level of the
# first pair whose threshold the score meets, so the descending order is
# required for correct results.
_RISK_LEVELS = [
    (80, RiskLevel.CRITICAL),
    (60, RiskLevel.HIGH),
    (40, RiskLevel.MEDIUM),
    (0, RiskLevel.LOW),
]
def _normalize_text(text: str) -> str:
return re.sub(r"\s+", " ", text).strip()
def _build_corpus(
    subject: str,
    sender: str,
    recipient: str,
    body_text: str,
    attachment_texts: list[tuple[str, str]],
) -> tuple[str, str]:
    """Assemble the searchable text for one email.

    The corpus is the header lines, the body, and for every attachment an
    ``Attachment: <filename>`` line followed by its extracted text. Empty
    pieces are dropped and the rest are joined with newlines.

    Returns:
        A ``(raw, lowered)`` pair: the joined corpus and its lowercase copy.
    """
    parts = [
        f"Subject: {subject}",
        f"From: {sender}",
        f"To: {recipient}",
        body_text,
    ]
    for name, content in attachment_texts:
        parts.append(f"Attachment: {name}")
        # Image attachments carry binary/base64 payloads that would produce
        # false keyword matches, so only their filename line is kept.
        if content.startswith(IMAGE_SENTINEL):
            continue
        parts.append(content)

    combined = "\n".join(filter(None, parts))
    return combined, combined.lower()
def _find_evidence(text: str, keyword: str) -> str | None:
pattern = re.escape(keyword.strip())
match = re.search(pattern, text, flags=re.IGNORECASE)
if not match:
return None
start = max(0, match.start() - 60)
end = min(len(text), match.end() + 100)
return _normalize_text(text[start:end])
def _collect_matches(
    raw_text: str,
    lower_text: str,
) -> tuple[dict[ViolationType, list[str]], dict[ViolationType, int]]:
    """Scan the corpus against every category rule.

    Args:
        raw_text: Corpus in its original casing; used for evidence excerpts.
        lower_text: Lowercased corpus; used for keyword detection.

    Returns:
        ``(evidence_map, score_map)``. ``evidence_map`` holds the distinct
        excerpts found per category and may contain categories that did not
        reach their ``min_matches`` threshold. ``score_map`` holds a score
        only for categories that triggered: the rule's base score plus 4 per
        additional keyword hit (bonus capped at 12), capped at 99 overall.
    """
    evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
    score_map: dict[ViolationType, int] = {}

    for violation_type, rule in _CATEGORY_RULES.items():
        keywords = rule["keywords"]
        base_score = int(rule["base_score"])
        min_matches = int(rule.get("min_matches", 1))
        match_count = 0

        for keyword in keywords:
            # Whole-word matching avoids substring false positives (e.g.
            # "ach" in "attached"). Only a word-character edge is anchored:
            # a blanket \b next to punctuation would require an adjacent
            # letter, so "@dataclass" at the start of a line or "gross:"
            # followed by a space could never match.
            leading = r"(?<!\w)" if re.match(r"\w", keyword) else ""
            trailing = r"(?!\w)" if re.search(r"\w\Z", keyword) else ""
            pattern = leading + re.escape(keyword) + trailing
            if not re.search(pattern, lower_text):
                continue

            match_count += 1
            evidence = _find_evidence(raw_text, keyword)
            if evidence and evidence not in evidence_map[violation_type]:
                evidence_map[violation_type].append(evidence)

        if match_count < min_matches:
            continue

        score = base_score + min(12, (match_count - 1) * 4)
        score_map[violation_type] = min(score, 99)

    return evidence_map, score_map
def _apply_context_boosts(
subject: str,
recipient: str,
attachment_texts: list[tuple[str, str]],
score_map: dict[ViolationType, int],
) -> None:
subject_lower = subject.lower()
recipient_lower = recipient.lower()
if any(domain in recipient_lower for domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
for violation_type in list(score_map):
score_map[violation_type] = min(99, score_map[violation_type] + 6)
if "urgent" in subject_lower or "confidential" in subject_lower:
for violation_type in list(score_map):
score_map[violation_type] = min(99, score_map[violation_type] + 2)
attachment_names = " ".join(filename.lower() for filename, _ in attachment_texts)
if ".csv" in attachment_names and ViolationType.CUSTOMER_LIST in score_map:
score_map[ViolationType.CUSTOMER_LIST] = min(
99, score_map[ViolationType.CUSTOMER_LIST] + 6
)
if ".py" in attachment_names and ViolationType.SOURCE_CODE in score_map:
score_map[ViolationType.SOURCE_CODE] = min(
99, score_map[ViolationType.SOURCE_CODE] + 4
)
def _risk_level_from_score(risk_score: int) -> RiskLevel:
    """Map a numeric score to the first risk level whose threshold it meets.

    Thresholds are checked in the order listed in ``_RISK_LEVELS``; a score
    below every threshold yields ``RiskLevel.LOW``.
    """
    return next(
        (level for floor, level in _RISK_LEVELS if risk_score >= floor),
        RiskLevel.LOW,
    )
def _action_from_score(risk_score: int) -> ActionClass:
    """Choose the enforcement action for a score.

    Scores of 80 and above are blocked, scores from 40 up to 79 raise an
    alert, and anything lower passes.
    """
    return (
        ActionClass.BLOCK
        if risk_score >= 80
        else (ActionClass.ALERT if risk_score >= 40 else ActionClass.PASS_)
    )
def _build_summary(
    violation_types: list[ViolationType],
    risk_level: RiskLevel,
    risk_score: int,
) -> str:
    """Compose the one-sentence human-readable summary of the result.

    When the only violation type is ``ViolationType.NONE`` a fixed "nothing
    found" sentence is returned; otherwise the sentence names the flagged
    categories, the risk level, and the score.
    """
    nothing_found = violation_types == [ViolationType.NONE]
    if not nothing_found:
        labels = ", ".join(v.value for v in violation_types)
        return (
            f"Simulated DLP review flagged {labels} with {risk_level.value} risk "
            f"(score {risk_score}) based on the email body and extracted attachment content."
        )
    return "No strong DLP indicators were found in the email body or converted attachments."
def simulate_analysis(
    email_file: str,
    subject: str,
    sender: str,
    recipient: str,
    date: str,
    body_text: str,
    attachment_texts: list[tuple[str, str]],
    attachment_results: list[AttachmentResult],
    processing_errors: list[str],
) -> DLPResult:
    """Predict a DLP result locally without calling an LLM.

    Pipeline: build the text corpus, score every category by keyword hits,
    apply context boosts, then derive the overall score, risk level, action,
    summary, and evidence.

    The overall score is the highest category score plus 3 per additional
    triggered category (bonus capped at 10, total capped at 99). Up to three
    top-ranked categories are reported, with at most two excerpts each and
    five excerpts in total. When nothing triggers, the result is
    ``ViolationType.NONE`` with a fixed score of 18 and no evidence.
    """
    raw_text, lower_text = _build_corpus(
        subject=subject,
        sender=sender,
        recipient=recipient,
        body_text=body_text,
        attachment_texts=attachment_texts,
    )
    evidence_map, score_map = _collect_matches(raw_text, lower_text)
    _apply_context_boosts(subject, recipient, attachment_texts, score_map)

    # Defaults describe the "nothing triggered" outcome.
    violation_types = [ViolationType.NONE]
    risk_score = 18
    evidence: list[str] = []

    if score_map:
        ranked = sorted(score_map.items(), key=lambda item: item[1], reverse=True)
        violation_types = [violation for violation, _ in ranked[:3]]

        risk_score = ranked[0][1]
        additional = len(ranked) - 1
        if additional > 0:
            # Several categories firing together is riskier than one alone.
            risk_score = min(99, risk_score + min(10, 3 * additional))

        collected = [
            snippet
            for violation_type in violation_types
            for snippet in evidence_map.get(violation_type, [])[:2]
        ]
        evidence = collected[:5]

    risk_level = _risk_level_from_score(risk_score)
    action = _action_from_score(risk_score)
    summary = _build_summary(violation_types, risk_level, risk_score)

    return DLPResult(
        email_file=email_file,
        subject=subject,
        sender=sender,
        recipient=recipient,
        date=date,
        risk_level=risk_level,
        risk_score=risk_score,
        violation_types=violation_types,
        action=action,
        summary=summary,
        evidence=evidence,
        attachments=attachment_results,
        processing_errors=processing_errors,
    )