311 lines
9.3 KiB
Python
311 lines
9.3 KiB
Python
"""Deterministic local simulator for DLP analysis."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
from .converter import IMAGE_SENTINEL
|
|
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
|
|
|
|
# Keyword rules for the simulator, one entry per violation category.
#
# Fields of each entry:
#   "keywords"    - phrases searched for in the lowercased corpus
#   "base_score"  - score given to the category once it qualifies
#   "min_matches" - optional number of distinct keywords that must hit before
#                   the category qualifies (_collect_matches uses 1 if absent)
#
# Entry order matters: categories are scored in this order and the later
# ranking uses a stable sort, so equal scores keep this order.
_CATEGORY_RULES: dict[ViolationType, dict[str, object]] = {
    ViolationType.PII: {
        "keywords": [
            "personally identifiable information", " pii", "employee id",
            "account ending", "direct deposit", "customer_id",
            "first_name", "last_name",
        ],
        "base_score": 30,
        "min_matches": 2,
    },
    ViolationType.FINANCIAL_DATA: {
        "keywords": [
            "financial forecast", "revenue", "ebitda", "gross margin",
            "margin efficiency", "sales data", "annual_sales_usd", "invoice",
            "amount due", "payment instructions", "ach", "budget",
        ],
        "base_score": 42,
    },
    ViolationType.SOURCE_CODE: {
        "keywords": [
            "source code", "api key", "model weights",
            "from __future__ import annotations",
            "def ", "class ", "@dataclass",
        ],
        "base_score": 88,
        "min_matches": 2,
    },
    ViolationType.REGULATORY_DOCUMENT: {
        "keywords": [
            "regulatory document", "regulatory submission", "cfpb",
            "compliance report", "not for public release", "draft regulatory",
            "prepared by: legal & compliance team",
        ],
        "base_score": 84,
    },
    ViolationType.LEGAL_CONTRACT: {
        "keywords": [
            "nondisclosure agreement", "non-disclosure agreement",
            "executed nda", "disclosing party", "receiving party",
        ],
        "base_score": 62,
        "min_matches": 1,
    },
    ViolationType.PAYROLL_RECORD: {
        "keywords": [
            "payroll", "pay stub", "compensation record", "gross:", "net pay",
            "tax deductions", "pay period", "direct deposit", "employee id",
        ],
        "base_score": 90,
    },
    ViolationType.CUSTOMER_LIST: {
        "keywords": [
            "customer list", "prospects", "crm export", "raw export",
            "customer_id", "company_name", "annual_sales_usd",
            "top-tier prospects",
        ],
        "base_score": 86,
        "min_matches": 2,
    },
    ViolationType.INTERNAL_MEMO: {
        "keywords": [
            "internal use only", "internal memo",
            "do not distribute externally", "office of the ceo",
            "organizational priorities", "growth roadmap",
            "internal policy document", "not for public distribution",
            "strictly confidential",
        ],
        "base_score": 52,
        "min_matches": 1,
    },
}
|
|
|
|
# (score floor, risk level) pairs listed from the highest floor down;
# _risk_level_from_score returns the level of the first floor a score reaches.
_RISK_LEVELS: list[tuple[int, RiskLevel]] = [
    (80, RiskLevel.CRITICAL), (60, RiskLevel.HIGH),
    (40, RiskLevel.MEDIUM), (0, RiskLevel.LOW),
]
|
|
|
|
|
|
def _normalize_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim the ends."""
    # str.split() without a separator splits on whitespace runs and drops
    # leading/trailing whitespace, so re-joining with single spaces suffices.
    return " ".join(text.split())
|
|
|
|
|
|
def _build_corpus(
    subject: str,
    sender: str,
    recipient: str,
    body_text: str,
    attachment_texts: list[tuple[str, str]],
) -> tuple[str, str]:
    """Assemble the searchable text for one email.

    The corpus consists of ``Subject:``/``From:``/``To:`` lines, the body, and
    for each ``(filename, text)`` attachment an ``Attachment: <filename>`` line
    followed by its text. Attachment text starting with ``IMAGE_SENTINEL`` is
    left out (its filename line is still kept). Empty pieces are dropped and
    the remainder is joined with newlines.

    Returns:
        ``(raw, lowered)``: the joined corpus and its ``str.lower()`` copy.
    """
    pieces = [f"Subject: {subject}", f"From: {sender}", f"To: {recipient}", body_text]
    for name, content in attachment_texts:
        pieces.append(f"Attachment: {name}")
        if content.startswith(IMAGE_SENTINEL):
            # Binary image data: base64 payloads produce false keyword matches.
            continue
        pieces.append(content)
    corpus = "\n".join(filter(None, pieces))
    return corpus, corpus.lower()
|
|
|
|
|
|
def _find_evidence(text: str, keyword: str) -> str | None:
    """Return a whitespace-normalized excerpt of *text* around *keyword*.

    The search is case-insensitive and ignores whitespace at either end of
    *keyword*. A whole-word occurrence is preferred, so the excerpt shows the
    kind of hit that ``_collect_matches`` counts (``ACH`` rather than the
    ``ach`` inside ``attached``). If no whole-word occurrence exists, the
    first plain substring occurrence is used instead.

    Args:
        text: Corpus to cut the excerpt from (original casing).
        keyword: Phrase to locate.

    Returns:
        Up to 60 characters before and 100 characters after the match, passed
        through ``_normalize_text``; ``None`` when the keyword is not found or
        is empty after stripping.
    """
    needle = keyword.strip()
    if not needle:
        # An empty pattern matches at offset 0 and would yield bogus evidence.
        return None

    escaped = re.escape(needle)
    # Only guard edges that are word characters; a boundary assertion next to
    # punctuation (e.g. "gross:", "@dataclass") would wrongly demand a word
    # character on the other side.
    lead = r"(?<!\w)" if re.match(r"\w", needle[0]) else ""
    trail = r"(?!\w)" if re.match(r"\w", needle[-1]) else ""

    match = re.search(lead + escaped + trail, text, flags=re.IGNORECASE)
    if match is None:
        # Fall back to the plain substring search.
        match = re.search(escaped, text, flags=re.IGNORECASE)
    if match is None:
        return None

    start = max(0, match.start() - 60)
    end = min(len(text), match.end() + 100)
    return _normalize_text(text[start:end])
|
|
|
|
|
|
def _keyword_pattern(keyword: str) -> str:
    """Return a regex that matches *keyword* only as a whole token."""
    # r"\b" placed next to a non-word character demands a word character on
    # its other side: r"\bgross:\b" misses "gross: 100" and r"\b@dataclass\b"
    # misses a decorator at the start of a line. Assert "no adjacent word
    # character" only on the edges where the keyword has a word character.
    lead = r"(?<!\w)" if re.match(r"\w", keyword[:1]) else ""
    trail = r"(?!\w)" if re.match(r"\w", keyword[-1:]) else ""
    return lead + re.escape(keyword) + trail


def _collect_matches(
    raw_text: str,
    lower_text: str,
) -> tuple[dict[ViolationType, list[str]], dict[ViolationType, int]]:
    """Scan the corpus against every rule in ``_CATEGORY_RULES``.

    Args:
        raw_text: Corpus in its original casing; evidence excerpts come from it.
        lower_text: Lowercased corpus; keywords are matched against it.

    Returns:
        ``(evidence_map, score_map)``. ``evidence_map`` lists the distinct
        excerpts per category for each keyword that hit, including categories
        that did not reach their ``min_matches``. ``score_map`` contains only
        qualifying categories, scored as ``base_score`` plus 4 for each matched
        keyword beyond the first (bonus capped at 12, total capped at 99).
    """
    evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
    score_map: dict[ViolationType, int] = {}

    for violation_type, rule in _CATEGORY_RULES.items():
        base_score = int(rule["base_score"])
        min_matches = int(rule.get("min_matches", 1))
        match_count = 0

        for keyword in rule["keywords"]:
            # Whole-token match avoids substring hits such as "ach" in "attached".
            if not re.search(_keyword_pattern(keyword), lower_text):
                continue
            match_count += 1
            evidence = _find_evidence(raw_text, keyword)
            if evidence and evidence not in evidence_map[violation_type]:
                evidence_map[violation_type].append(evidence)

        if match_count < min_matches:
            continue

        bonus = min(12, (match_count - 1) * 4)
        score_map[violation_type] = min(base_score + bonus, 99)

    return evidence_map, score_map
|
|
|
|
|
|
def _apply_context_boosts(
    subject: str,
    recipient: str,
    attachment_texts: list[tuple[str, str]],
    score_map: dict[ViolationType, int],
) -> None:
    """Raise scores in *score_map* in place according to message context.

    Only categories already present in *score_map* are touched, and every
    boost is capped at 99:

    * +6 on all categories when the recipient contains ``gmail.com``,
      ``yahoo.com``, ``outlook.com`` or ``hotmail.com``;
    * +2 on all categories when the subject contains ``urgent`` or
      ``confidential``;
    * +6 on ``CUSTOMER_LIST`` when an attachment filename contains ``.csv``;
    * +4 on ``SOURCE_CODE`` when an attachment filename contains ``.py``.

    All checks are case-insensitive substring tests.
    """

    def bump(category, amount: int) -> None:
        # Add *amount* to one category without exceeding the 99 ceiling.
        score_map[category] = min(99, score_map[category] + amount)

    subject_text = subject.lower()
    to_address = recipient.lower()

    personal_domains = ("gmail.com", "yahoo.com", "outlook.com", "hotmail.com")
    if any(domain in to_address for domain in personal_domains):
        for category in tuple(score_map):
            bump(category, 6)

    if "urgent" in subject_text or "confidential" in subject_text:
        for category in tuple(score_map):
            bump(category, 2)

    names = " ".join(name.lower() for name, _ in attachment_texts)
    if ".csv" in names and ViolationType.CUSTOMER_LIST in score_map:
        bump(ViolationType.CUSTOMER_LIST, 6)
    if ".py" in names and ViolationType.SOURCE_CODE in score_map:
        bump(ViolationType.SOURCE_CODE, 4)
|
|
|
|
|
|
def _risk_level_from_score(risk_score: int) -> RiskLevel:
    """Return the level of the first ``_RISK_LEVELS`` floor that *risk_score* reaches.

    ``_RISK_LEVELS`` is scanned in its listed order; ``RiskLevel.LOW`` is the
    result when the score is below every floor.
    """
    reached = (level for floor, level in _RISK_LEVELS if risk_score >= floor)
    return next(reached, RiskLevel.LOW)
|
|
|
|
|
|
def _action_from_score(risk_score: int) -> ActionClass:
    """Choose the action for *risk_score*.

    A score of 80 or more blocks, a score of 40 or more alerts, and anything
    lower passes.
    """
    if risk_score >= 80:
        action = ActionClass.BLOCK
    elif risk_score >= 40:
        action = ActionClass.ALERT
    else:
        action = ActionClass.PASS_
    return action
|
|
|
|
|
|
def _build_summary(
    violation_types: list[ViolationType],
    risk_level: RiskLevel,
    risk_score: int,
) -> str:
    """Compose the one-sentence summary for a result.

    When *violation_types* is exactly ``[ViolationType.NONE]`` a fixed
    "nothing found" sentence is returned. Otherwise the sentence lists the
    ``.value`` of each violation type (comma separated), the risk level's
    ``.value`` and the numeric score.
    """
    nothing_found = violation_types == [ViolationType.NONE]
    if nothing_found:
        return "No strong DLP indicators were found in the email body or converted attachments."

    flagged = ", ".join(violation.value for violation in violation_types)
    opening = f"Simulated DLP review flagged {flagged} with {risk_level.value} risk "
    closing = f"(score {risk_score}) based on the email body and extracted attachment content."
    return opening + closing
|
|
|
|
|
|
def simulate_analysis(
    email_file: str,
    subject: str,
    sender: str,
    recipient: str,
    date: str,
    body_text: str,
    attachment_texts: list[tuple[str, str]],
    attachment_results: list[AttachmentResult],
    processing_errors: list[str],
) -> DLPResult:
    """Predict a DLP result locally without calling an LLM.

    The headers, body and attachment text are merged into one corpus, matched
    against the keyword rules, and the resulting category scores are adjusted
    by the context boosts.

    * No qualifying category: the result carries ``ViolationType.NONE``, a
      score of 18 and no evidence.
    * Otherwise: the three highest-scoring categories are reported. The risk
      score is the top category score plus 3 for each further qualifying
      category (bonus capped at 10, total capped at 99). Evidence is at most
      two excerpts per reported category, five in total.

    Risk level and action are derived from the final score. The remaining
    arguments (``email_file``, ``date``, ``attachment_results``,
    ``processing_errors``) are passed through to the result unchanged.
    """
    corpus, corpus_lower = _build_corpus(
        subject=subject,
        sender=sender,
        recipient=recipient,
        body_text=body_text,
        attachment_texts=attachment_texts,
    )
    evidence_by_type, scores = _collect_matches(corpus, corpus_lower)
    _apply_context_boosts(subject, recipient, attachment_texts, scores)

    if scores:
        # Stable sort: categories with equal scores keep their rule order.
        by_score = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        violation_types = [category for category, _ in by_score[:3]]
        risk_score = by_score[0][1]
        runners_up = len(by_score) - 1
        if runners_up > 0:
            risk_score = min(99, risk_score + min(10, 3 * runners_up))
        evidence = [
            snippet
            for category in violation_types
            for snippet in evidence_by_type.get(category, [])[:2]
        ][:5]
    else:
        violation_types = [ViolationType.NONE]
        risk_score = 18
        evidence = []

    risk_level = _risk_level_from_score(risk_score)
    action = _action_from_score(risk_score)
    summary = _build_summary(violation_types, risk_level, risk_score)

    return DLPResult(
        email_file=email_file,
        subject=subject,
        sender=sender,
        recipient=recipient,
        date=date,
        risk_level=risk_level,
        risk_score=risk_score,
        violation_types=violation_types,
        action=action,
        summary=summary,
        evidence=evidence,
        attachments=attachment_results,
        processing_errors=processing_errors,
    )
|