Initial commit

2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions
--- a/email_dlp/policy_reviewer.py
+++ b/email_dlp/policy_reviewer.py
@ -0,0 +1,285 @@
+"""Policy-based DLP review derived from DLP_CATEGORIES in policy.py."""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+
+from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
+from .policy import DLP_CATEGORIES
+
+# Keywords derived from DLP_CATEGORIES signal descriptions in policy.py
+#
+# Each ViolationType maps to one rule dict with three fields, all read by
+# review_corpus():
+#   "keywords"    - phrases looked up case-insensitively in the combined
+#                   subject / sender / recipient / body / attachment text.
+#   "min_matches" - how many distinct keywords of the category must be found
+#                   before the category is scored at all.
+#   "base_score"  - starting score once min_matches is met; review_corpus()
+#                   adds per-match and context bonuses and caps the result
+#                   at 99.
+#
+# A base_score of 80 or more means a category that qualifies is already in the
+# CRITICAL / BLOCK band of _risk_level_from_score() / _action_from_score().
+#
+# Insertion order is significant: review_corpus() ranks categories with a
+# stable sort on score, so categories with equal scores keep the order in
+# which they are listed here.
+_POLICY_KEYWORDS: dict[ViolationType, dict] = {
+    ViolationType.PII: {
+        "keywords": [
+            "full name",
+            "email address",
+            "social security",
+            "ssn",
+            "employee id",
+            "phone number",
+            "home address",
+            "personal identifier",
+            "date of birth",
+        ],
+        "min_matches": 2,
+        "base_score": 55,
+    },
+    ViolationType.FINANCIAL_DATA: {
+        "keywords": [
+            "revenue",
+            "ebitda",
+            "projection",
+            "forecast",
+            "salary",
+            "compensation plan",
+            "invoice",
+            "amount due",
+            "payment terms",
+            "budget",
+            "gross margin",
+            "sales data",
+        ],
+        "min_matches": 1,
+        "base_score": 50,
+    },
+    ViolationType.SOURCE_CODE: {
+        # NOTE(review): the trailing space in "def ", "class " and "import "
+        # looks deliberate (it keeps "def " from matching inside words such as
+        # "default") - confirm before trimming these entries.
+        "keywords": [
+            "copyright",
+            "def ",
+            "class ",
+            "from __future__",
+            "import ",
+            "model weights",
+            "api key",
+            "api_key",
+            "proprietary",
+            "source code",
+            "internal source",
+        ],
+        "min_matches": 2,
+        "base_score": 85,
+    },
+    ViolationType.REGULATORY_DOCUMENT: {
+        "keywords": [
+            "cfpb",
+            "gdpr",
+            "sox",
+            "compliance draft",
+            "not for public release",
+            "not for public distribution",
+            "regulatory submission",
+            "audit findings",
+            "remediation plan",
+            "internal compliance",
+        ],
+        "min_matches": 1,
+        "base_score": 82,
+    },
+    ViolationType.LEGAL_CONTRACT: {
+        "keywords": [
+            "non-disclosure",
+            "nondisclosure",
+            "nda",
+            "disclosing party",
+            "receiving party",
+            "confidentiality agreement",
+            "settlement agreement",
+            "executed contract",
+            "signed contract",
+        ],
+        "min_matches": 1,
+        "base_score": 65,
+    },
+    ViolationType.PAYROLL_RECORD: {
+        "keywords": [
+            "payroll",
+            "pay period",
+            "pay stub",
+            "direct deposit",
+            "routing number",
+            "bank account",
+            "net pay",
+            "gross pay",
+            "tax deductions",
+            "year-to-date",
+            "ytd",
+            "compensation record",
+        ],
+        "min_matches": 1,
+        "base_score": 88,
+    },
+    ViolationType.CUSTOMER_LIST: {
+        "keywords": [
+            "customer list",
+            "customer_id",
+            "customer id",
+            "crm export",
+            "prospect list",
+            "top-tier prospect",
+            "annual_sales",
+            "company_name",
+            "bulk export",
+            "sales campaign",
+        ],
+        "min_matches": 2,
+        "base_score": 85,
+    },
+    ViolationType.INTERNAL_MEMO: {
+        "keywords": [
+            "internal only",
+            "internal use only",
+            "do not distribute",
+            "not for external",
+            "office of the ceo",
+            "organizational priorities",
+            "growth roadmap",
+            "strictly confidential",
+            "internal policy document",
+            "headcount",
+        ],
+        "min_matches": 1,
+        "base_score": 55,
+    },
+}
+
+
+def _normalize(text: str) -> str:
+    """Collapse each run of whitespace in *text* to one space and trim both ends."""
+    # split() with no argument drops leading/trailing whitespace and treats any
+    # whitespace run as a single separator, so re-joining with " " is enough.
+    return " ".join(text.split())
+
+
+def _find_evidence(text: str, keyword: str) -> str | None:
+    """Return a snippet of *text* around the first occurrence of *keyword*.
+
+    The keyword is stripped of surrounding whitespace and matched literally,
+    ignoring case.  The snippet spans up to 60 characters before the hit and
+    up to 100 characters after it, with whitespace normalized by
+    ``_normalize``.  Returns ``None`` when the keyword does not occur.
+    """
+    needle = re.compile(re.escape(keyword.strip()), re.IGNORECASE)
+    hit = needle.search(text)
+    if hit is None:
+        return None
+    window_start = max(0, hit.start() - 60)
+    window_end = min(len(text), hit.end() + 100)
+    snippet = text[window_start:window_end]
+    return _normalize(snippet)
+
+
+def _risk_level_from_score(score: int) -> RiskLevel:
+    """Translate a numeric risk score into its RiskLevel band.
+
+    80 and above is CRITICAL, 60-79 is HIGH, 40-59 is MEDIUM, and anything
+    lower is LOW.
+    """
+    # Ordered from the highest floor down; the first floor the score reaches wins.
+    bands = (
+        (80, RiskLevel.CRITICAL),
+        (60, RiskLevel.HIGH),
+        (40, RiskLevel.MEDIUM),
+    )
+    return next((level for floor, level in bands if score >= floor), RiskLevel.LOW)
+
+
+def _action_from_score(score: int) -> ActionClass:
+    """Pick the ActionClass for a numeric risk score.
+
+    Scores of 80 or more map to BLOCK, scores from 40 to 79 map to ALERT, and
+    everything below 40 maps to PASS_.
+    """
+    # Checked from the highest floor down so the strictest action wins.
+    for floor, action in ((80, ActionClass.BLOCK), (40, ActionClass.ALERT)):
+        if score >= floor:
+            return action
+    return ActionClass.PASS_
+
+
+# Personal webmail domains; a recipient on one of them raises every category score.
+_WEBMAIL_DOMAINS = ("gmail.com", "yahoo.com", "outlook.com", "hotmail.com")
+
+
+def _search_keyword(text: str, keyword: str) -> re.Match[str] | None:
+    """Return the first case-insensitive hit of *keyword* that starts a token.
+
+    A hit is rejected when the character right before it is a letter or a
+    digit, so "nda" does not fire inside "Monday" or "agenda" and "ssn" does
+    not fire inside "classname".  An underscore or punctuation in front is
+    still accepted ("OPENAI_API_KEY" contains "api_key"), and nothing is
+    required after the keyword, so suffixed forms such as "invoices" match.
+    """
+    # [^\W_] is "word character except underscore", i.e. a letter or a digit.
+    return re.search(r"(?<![^\W_])" + re.escape(keyword), text, flags=re.IGNORECASE)
+
+
+def review_corpus(
+    email_file: str,
+    subject: str,
+    sender: str,
+    recipient: str,
+    date: str,
+    body_text: str,
+    attachment_texts: list[tuple[str, str]],
+    attachment_results: list[AttachmentResult],
+    processing_errors: list[str],
+) -> DLPResult:
+    """Judge an email using DLP_CATEGORIES signals from policy.py.
+
+    The subject, sender, recipient, body and every attachment (file name and
+    text) are joined into one corpus, which is scanned for the phrases listed
+    in ``_POLICY_KEYWORDS``.
+
+    Scoring:
+        * A category counts only when at least ``min_matches`` of its
+          keywords are found.
+        * Category score = ``base_score`` + 3 per keyword beyond the first
+          (at most +12) + 6 when the recipient address contains a personal
+          webmail domain, capped at 99.
+        * Email score = best category score + 3 per additional flagged
+          category (at most +10), capped at 99.
+
+    Args:
+        email_file: Passed through to the result unchanged.
+        subject: Email subject; scanned and passed through.
+        sender: Sender address; scanned and passed through.
+        recipient: Recipient address; scanned, used for the webmail boost,
+            and passed through.
+        date: Passed through to the result unchanged.
+        body_text: Email body to scan.
+        attachment_texts: ``(filename, text)`` pairs to scan.
+        attachment_results: Passed through as the result's ``attachments``.
+        processing_errors: Passed through to the result unchanged.
+
+    Returns:
+        A DLPResult.  When no category is flagged it is LOW / PASS_ with score
+        12 and ``ViolationType.NONE``.  Otherwise it lists up to three
+        categories (highest score first) and up to five evidence snippets.
+    """
+    # Build full text corpus
+    parts = [
+        f"Subject: {subject}",
+        f"From: {sender}",
+        f"To: {recipient}",
+        body_text,
+    ]
+    for filename, text in attachment_texts:
+        parts.extend((f"Attachment: {filename}", text))
+    raw = "\n".join(p for p in parts if p)
+
+    # Context boost: the recipient check does not depend on the category, so
+    # evaluate it once instead of once per rule.
+    recipient_lower = recipient.lower()
+    webmail_boost = 6 if any(d in recipient_lower for d in _WEBMAIL_DOMAINS) else 0
+
+    evidence_map: dict[ViolationType, list[str]] = defaultdict(list)
+    score_map: dict[ViolationType, int] = {}
+
+    for vtype, rule in _POLICY_KEYWORDS.items():
+        keywords: list[str] = rule["keywords"]
+        min_matches: int = rule["min_matches"]
+        base_score: int = rule["base_score"]
+        match_count = 0
+
+        for kw in keywords:
+            hit = _search_keyword(raw, kw)
+            if hit is None:
+                continue
+            match_count += 1
+            # Cut the snippet around the very match that triggered the count
+            # (60 chars before, 100 after) so evidence always shows the signal.
+            snippet = _normalize(raw[max(0, hit.start() - 60):hit.end() + 100])
+            if snippet not in evidence_map[vtype]:
+                evidence_map[vtype].append(snippet)
+
+        if match_count < min_matches:
+            continue
+
+        score = base_score + min(12, (match_count - 1) * 3) + webmail_boost
+        score_map[vtype] = min(99, score)
+
+    # Fields copied verbatim into the result on both the clean and flagged path.
+    passthrough = {
+        "email_file": email_file,
+        "subject": subject,
+        "sender": sender,
+        "recipient": recipient,
+        "date": date,
+        "attachments": attachment_results,
+        "processing_errors": processing_errors,
+    }
+
+    if not score_map:
+        return DLPResult(
+            **passthrough,
+            risk_level=RiskLevel.LOW,
+            risk_score=12,
+            violation_types=[ViolationType.NONE],
+            action=ActionClass.PASS_,
+            summary="Policy review found no DLP category signals in this email.",
+            evidence=[],
+        )
+
+    # sorted() is stable, so equal scores keep their _POLICY_KEYWORDS order.
+    ranked = sorted(score_map.items(), key=lambda item: item[1], reverse=True)
+    violation_types = [vt for vt, _ in ranked[:3]]
+    # Each additional flagged category adds 3 points, up to +10 in total.
+    risk_score = min(99, ranked[0][1] + min(10, 3 * (len(ranked) - 1)))
+
+    # At most two snippets per reported category, five overall.
+    evidence: list[str] = []
+    for vt in violation_types:
+        evidence.extend(evidence_map[vt][:2])
+    evidence = evidence[:5]
+
+    risk_level = _risk_level_from_score(risk_score)
+    action = _action_from_score(risk_score)
+
+    violation_labels = ", ".join(v.value for v in violation_types)
+    summary = (
+        f"Policy review flagged {violation_labels} with {risk_level.value} risk "
+        f"(score {risk_score}) using DLP_CATEGORIES signals from policy.py."
+    )
+
+    return DLPResult(
+        **passthrough,
+        risk_level=risk_level,
+        risk_score=risk_score,
+        violation_types=violation_types,
+        action=action,
+        summary=summary,
+        evidence=evidence,
+    )