Initial commit

2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions
--- a/email_dlp/simulator.py
+++ b/email_dlp/simulator.py
@ -0,0 +1,310 @@
+"""Deterministic local simulator for DLP analysis."""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+
+from .converter import IMAGE_SENTINEL
+from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
+
+# Keyword rules that drive the simulator, keyed by violation category.
+# Each rule holds:
+#   "keywords":    lowercase phrases searched for in the lowercased corpus.
+#   "base_score":  score given once the category qualifies; every additional
+#                  keyword hit raises it (see _collect_matches).
+#   "min_matches": optional number of distinct keywords that must hit before
+#                  the category is scored; _collect_matches treats a missing
+#                  value as 1.
+# _collect_matches walks the categories in the order they are listed here.
+_CATEGORY_RULES: dict[ViolationType, dict[str, object]] = {
+    ViolationType.PII: {
+        "keywords": [
+            "personally identifiable information",
+            " pii",  # the leading space is part of the keyword
+            "employee id",  # also listed under PAYROLL_RECORD
+            "account ending",
+            "direct deposit",  # also listed under PAYROLL_RECORD
+            "customer_id",  # also listed under CUSTOMER_LIST
+            "first_name",
+            "last_name",
+        ],
+        "base_score": 30,
+        "min_matches": 2,
+    },
+    ViolationType.FINANCIAL_DATA: {
+        "keywords": [
+            "financial forecast",
+            "revenue",
+            "ebitda",
+            "gross margin",
+            "margin efficiency",
+            "sales data",
+            "annual_sales_usd",  # also listed under CUSTOMER_LIST
+            "invoice",
+            "amount due",
+            "payment instructions",
+            "ach",  # short token; _collect_matches matches keywords on word boundaries
+            "budget",
+        ],
+        "base_score": 42,
+    },
+    ViolationType.SOURCE_CODE: {
+        "keywords": [
+            "source code",
+            "api key",
+            "model weights",
+            "from __future__ import annotations",
+            "def ",  # the trailing space is part of the keyword
+            "class ",  # the trailing space is part of the keyword
+            "@dataclass",
+        ],
+        "base_score": 88,
+        "min_matches": 2,
+    },
+    ViolationType.REGULATORY_DOCUMENT: {
+        "keywords": [
+            "regulatory document",
+            "regulatory submission",
+            "cfpb",
+            "compliance report",
+            "not for public release",
+            "draft regulatory",
+            "prepared by: legal & compliance team",
+        ],
+        "base_score": 84,
+    },
+    ViolationType.LEGAL_CONTRACT: {
+        "keywords": [
+            "nondisclosure agreement",
+            "non-disclosure agreement",
+            "executed nda",
+            "disclosing party",
+            "receiving party",
+        ],
+        "base_score": 62,
+        "min_matches": 1,
+    },
+    ViolationType.PAYROLL_RECORD: {
+        "keywords": [
+            "payroll",
+            "pay stub",
+            "compensation record",
+            "gross:",  # the trailing colon is part of the keyword
+            "net pay",
+            "tax deductions",
+            "pay period",
+            "direct deposit",
+            "employee id",
+        ],
+        "base_score": 90,
+    },
+    ViolationType.CUSTOMER_LIST: {
+        "keywords": [
+            "customer list",
+            "prospects",
+            "crm export",
+            "raw export",
+            "customer_id",
+            "company_name",
+            "annual_sales_usd",
+            "top-tier prospects",
+        ],
+        "base_score": 86,
+        "min_matches": 2,
+    },
+    ViolationType.INTERNAL_MEMO: {
+        "keywords": [
+            "internal use only",
+            "internal memo",
+            "do not distribute externally",
+            "office of the ceo",
+            "organizational priorities",
+            "growth roadmap",
+            "internal policy document",
+            "not for public distribution",
+            "strictly confidential",
+        ],
+        "base_score": 52,
+        "min_matches": 1,
+    },
+}
+
+# (minimum score, risk level) pairs listed from the highest threshold down;
+# _risk_level_from_score returns the level of the first pair whose threshold
+# the score reaches, so the descending order is significant.
+_RISK_LEVELS = [
+    (80, RiskLevel.CRITICAL),
+    (60, RiskLevel.HIGH),
+    (40, RiskLevel.MEDIUM),
+    (0, RiskLevel.LOW),
+]
+
+
+def _normalize_text(text: str) -> str:
+    """Collapse each run of whitespace into one space and trim both ends."""
+    # Splitting with no separator breaks on any whitespace run and discards
+    # leading/trailing empties, so re-joining with a single space collapses
+    # and strips in one step.
+    return " ".join(text.split())
+
+
+def _build_corpus(
+    subject: str,
+    sender: str,
+    recipient: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+) -> tuple[str, str]:
+    """Assemble the searchable text for one email.
+
+    The corpus is the subject, sender and recipient header lines, the body,
+    and, for every attachment, an ``Attachment: <filename>`` line followed by
+    its extracted text. Empty pieces are dropped and the rest are joined with
+    newlines.
+
+    Returns:
+        ``(raw, lowered)``: the joined corpus and its lowercased copy.
+    """
+    header_lines = [
+        f"Subject: {subject}",
+        f"From: {sender}",
+        f"To: {recipient}",
+    ]
+    pieces = [*header_lines, body_text]
+    for attachment_name, attachment_body in attachment_texts:
+        pieces.append(f"Attachment: {attachment_name}")
+        if attachment_body.startswith(IMAGE_SENTINEL):
+            # Image payloads are base64 data; scanning them would produce
+            # false keyword matches, so only the filename line is kept.
+            continue
+        pieces.append(attachment_body)
+    combined = "\n".join(filter(None, pieces))
+    return combined, combined.lower()
+
+
+def _find_evidence(text: str, keyword: str) -> str | None:
+    """Return a whitespace-normalized snippet around the first hit of *keyword*.
+
+    The keyword is stripped of surrounding whitespace and searched for
+    case-insensitively as a literal. The snippet spans up to 60 characters
+    before the hit and up to 100 after it. Returns ``None`` when *text* does
+    not contain the keyword.
+    """
+    needle = re.compile(re.escape(keyword.strip()), re.IGNORECASE)
+    hit = needle.search(text)
+    if hit is None:
+        return None
+    window_start = max(hit.start() - 60, 0)
+    window_end = min(hit.end() + 100, len(text))
+    snippet = text[window_start:window_end]
+    return _normalize_text(snippet)
+
+
+def _keyword_pattern(keyword: str) -> str:
+    """Return a regex that matches *keyword* as a whole token.
+
+    An edge of the keyword that is a word character must not touch another
+    word character in the text (so "ach" does not hit inside "attached").
+    An edge that is already a space or punctuation (" pii", "def ", "gross:",
+    "@dataclass") is left unconstrained.
+    """
+    # A blanket \b on both sides demands a word/non-word transition even next
+    # to a non-word edge, so "gross:" could never match "Gross: 4,200" and
+    # "@dataclass" could never match at the start of a line.
+    leading = r"(?<!\w)" if re.match(r"\w", keyword) else ""
+    trailing = r"(?!\w)" if re.search(r"\w\Z", keyword) else ""
+    return leading + re.escape(keyword) + trailing
+
+
+def _collect_matches(
+    raw_text: str,
+    lower_text: str,
+) -> tuple[dict[ViolationType, list[str]], dict[ViolationType, int]]:
+    """Scan the corpus for every category's keywords.
+
+    Args:
+        raw_text: The corpus in its original casing; evidence snippets are
+            cut from it.
+        lower_text: The lowercased corpus; keyword detection runs against it.
+
+    Returns:
+        ``(evidence_map, score_map)``. ``evidence_map`` lists the distinct
+        snippets found per category. ``score_map`` holds a score only for
+        categories that reached their ``min_matches`` threshold: the
+        category's base score plus 4 per additional keyword hit (at most
+        +12), capped at 99.
+    """
+    evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
+    score_map: dict[ViolationType, int] = {}
+
+    for violation_type, rule in _CATEGORY_RULES.items():
+        keywords = rule["keywords"]
+        base_score = int(rule["base_score"])
+        min_matches = int(rule.get("min_matches", 1))
+        match_count = 0
+
+        for keyword in keywords:
+            pattern = _keyword_pattern(keyword)
+            if not re.search(pattern, lower_text):
+                continue
+            match_count += 1
+
+            # Cut the evidence around the same bounded match that was counted;
+            # a plain substring lookup could instead quote an unrelated spot
+            # (e.g. "ach" inside "Attachment:").
+            hit = re.search(pattern, raw_text, flags=re.IGNORECASE)
+            if hit is not None:
+                window = raw_text[max(0, hit.start() - 60):hit.end() + 100]
+                evidence = _normalize_text(window)
+            else:
+                # The lowercased and case-insensitive views can disagree on
+                # unusual characters; fall back to the substring lookup.
+                evidence = _find_evidence(raw_text, keyword)
+            if evidence and evidence not in evidence_map[violation_type]:
+                evidence_map[violation_type].append(evidence)
+
+        if match_count < min_matches:
+            continue
+
+        score = base_score + min(12, (match_count - 1) * 4)
+        score_map[violation_type] = min(score, 99)
+
+    return evidence_map, score_map
+
+
+def _apply_context_boosts(
+    subject: str,
+    recipient: str,
+    attachment_texts: list[tuple[str, str]],
+    score_map: dict[ViolationType, int],
+) -> None:
+    """Raise the scores in *score_map* in place based on message context.
+
+    Boosts, each capped at 99 and applied in this order:
+      * +6 to every category when the recipient contains a personal webmail
+        domain (gmail, yahoo, outlook, hotmail);
+      * +2 to every category when the subject contains "urgent" or
+        "confidential";
+      * +6 to CUSTOMER_LIST when an attachment name contains ".csv";
+      * +4 to SOURCE_CODE when an attachment name contains ".py".
+    The last two apply only if that category is already in *score_map*.
+    """
+    ceiling = 99
+
+    def _raise_every_score(points: int) -> None:
+        bumped = {key: min(ceiling, value + points) for key, value in score_map.items()}
+        score_map.update(bumped)
+
+    def _raise_one_score(key: ViolationType, points: int) -> None:
+        score_map[key] = min(ceiling, score_map[key] + points)
+
+    personal_domains = ("gmail.com", "yahoo.com", "outlook.com", "hotmail.com")
+    recipient_folded = recipient.lower()
+    for domain in personal_domains:
+        if domain in recipient_folded:
+            _raise_every_score(6)
+            break
+
+    subject_folded = subject.lower()
+    if "urgent" in subject_folded or "confidential" in subject_folded:
+        _raise_every_score(2)
+
+    lowered_names = [name.lower() for name, _ in attachment_texts]
+    has_csv = any(".csv" in name for name in lowered_names)
+    has_py = any(".py" in name for name in lowered_names)
+    if has_csv and ViolationType.CUSTOMER_LIST in score_map:
+        _raise_one_score(ViolationType.CUSTOMER_LIST, 6)
+    if has_py and ViolationType.SOURCE_CODE in score_map:
+        _raise_one_score(ViolationType.SOURCE_CODE, 4)
+
+
+def _risk_level_from_score(risk_score: int) -> RiskLevel:
+    """Map a numeric score to the first risk level whose threshold it reaches.
+
+    Thresholds come from ``_RISK_LEVELS`` and are checked in listed order;
+    a score below every threshold yields ``RiskLevel.LOW``.
+    """
+    reached = (level for floor, level in _RISK_LEVELS if risk_score >= floor)
+    return next(reached, RiskLevel.LOW)
+
+
+def _action_from_score(risk_score: int) -> ActionClass:
+    """Choose the enforcement action for a score.
+
+    80 and above blocks, 40 to 79 alerts, and anything lower passes.
+    """
+    tiers = (
+        (80, ActionClass.BLOCK),
+        (40, ActionClass.ALERT),
+    )
+    for floor, action in tiers:
+        if risk_score >= floor:
+            return action
+    return ActionClass.PASS_
+
+
+def _build_summary(
+    violation_types: list[ViolationType],
+    risk_level: RiskLevel,
+    risk_score: int,
+) -> str:
+    """Render the one-sentence summary for a result.
+
+    When *violation_types* is exactly ``[ViolationType.NONE]`` a fixed
+    "nothing found" sentence is returned; otherwise the sentence names the
+    flagged categories, the risk level and the score.
+    """
+    nothing_flagged = violation_types == [ViolationType.NONE]
+    if not nothing_flagged:
+        flagged = ", ".join(item.value for item in violation_types)
+        return (
+            f"Simulated DLP review flagged {flagged} with {risk_level.value} risk "
+            f"(score {risk_score}) based on the email body and extracted attachment content."
+        )
+    return "No strong DLP indicators were found in the email body or converted attachments."
+
+
+def simulate_analysis(
+    email_file: str,
+    subject: str,
+    sender: str,
+    recipient: str,
+    date: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+    attachment_results: list[AttachmentResult],
+    processing_errors: list[str],
+) -> DLPResult:
+    """Produce a DLP verdict from keyword rules alone, with no LLM call.
+
+    The headers, body and attachment text are combined into one corpus,
+    scanned against the category rules, and the resulting scores are adjusted
+    for context. When nothing qualifies the result is ``ViolationType.NONE``
+    with a fixed score of 18. Otherwise:
+
+    * the three highest-scoring categories are reported;
+    * the risk score is the top category's score plus 3 for each further
+      qualifying category (at most +10), capped at 99;
+    * evidence holds up to two snippets per reported category, five in total.
+
+    *email_file*, *date*, *attachment_results* and *processing_errors* are
+    passed through to the returned ``DLPResult`` unchanged.
+    """
+    raw_text, lower_text = _build_corpus(
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        body_text=body_text,
+        attachment_texts=attachment_texts,
+    )
+    evidence_map, score_map = _collect_matches(raw_text, lower_text)
+    _apply_context_boosts(subject, recipient, attachment_texts, score_map)
+
+    evidence: list[str] = []
+    if score_map:
+        # Stable sort: categories with equal scores keep their rule order.
+        by_score = sorted(score_map, key=score_map.__getitem__, reverse=True)
+        violation_types = by_score[:3]
+        risk_score = score_map[by_score[0]]
+        additional_categories = len(by_score) - 1
+        if additional_categories:
+            risk_score = min(99, risk_score + min(10, 3 * additional_categories))
+        for reported_type in violation_types:
+            evidence += evidence_map.get(reported_type, [])[:2]
+        del evidence[5:]
+    else:
+        violation_types = [ViolationType.NONE]
+        risk_score = 18
+
+    risk_level = _risk_level_from_score(risk_score)
+    action = _action_from_score(risk_score)
+    summary = _build_summary(violation_types, risk_level, risk_score)
+
+    return DLPResult(
+        email_file=email_file,
+        subject=subject,
+        sender=sender,
+        recipient=recipient,
+        date=date,
+        risk_level=risk_level,
+        risk_score=risk_score,
+        violation_types=violation_types,
+        action=action,
+        summary=summary,
+        evidence=evidence,
+        attachments=attachment_results,
+        processing_errors=processing_errors,
+    )