Initial commit

2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions
--- a/email_dlp/policy.py
+++ b/email_dlp/policy.py
@@ -0,0 +1,132 @@
+"""DLP policy definitions: violation categories, thresholds, and prompt formatting."""
+
+import json
+
+from .models import ActionClass, RiskLevel
+
+# Risk level -> lowest risk_score (0-100) in that level's band; mirrors the cut-offs quoted in the prompt.
+RISK_THRESHOLDS = {
+    RiskLevel.CRITICAL: 80,
+    RiskLevel.HIGH: 60,
+    RiskLevel.MEDIUM: 40,
+    RiskLevel.LOW: 0,
+}
+
+# Action class assigned to each risk level: CRITICAL blocks, HIGH and MEDIUM alert, LOW passes.
+RISK_TO_ACTION = {
+    RiskLevel.CRITICAL: ActionClass.BLOCK,
+    RiskLevel.HIGH: ActionClass.ALERT,
+    RiskLevel.MEDIUM: ActionClass.ALERT,
+    RiskLevel.LOW: ActionClass.PASS_,
+}
+
+DLP_CATEGORIES = {  # category name -> description, example signals, free-text risk guidance
+    "PII": {
+        "description": "Personally Identifiable Information",
+        "signals": [
+            "Full name combined with email address",
+            "Social Security Number (SSN) or employee ID",
+            "Phone numbers combined with personal details",
+            "Home address combined with personal identifiers",
+        ],
+        "risk_weight": "HIGH to CRITICAL depending on volume",
+    },
+    "FINANCIAL_DATA": {
+        "description": "Non-public financial information",
+        "signals": [
+            "Revenue targets, EBITDA projections, internal forecasts",
+            "Salary figures, compensation plans",
+            "Invoice amounts and vendor payment terms",
+            "Internal budget allocations",
+        ],
+        "risk_weight": "MEDIUM to CRITICAL depending on sensitivity",
+    },
+    "SOURCE_CODE": {
+        "description": "Proprietary source code or model weights",
+        "signals": [
+            "Python, Java, or other source files with copyright notices",
+            "Internal class names and proprietary algorithms",
+            "Model architecture files or weight files",
+            "Internal API keys or credentials embedded in code",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "REGULATORY_DOCUMENT": {
+        "description": "Internal regulatory and compliance drafts",
+        "signals": [
+            "CFPB, GDPR, or SOX compliance drafts marked internal",
+            "Audit findings or remediation plans",
+            "Internal compliance assessments not yet published",
+            "Regulatory submission drafts",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "LEGAL_CONTRACT": {
+        "description": "Executed or draft legal agreements",
+        "signals": [
+            "Non-Disclosure Agreements (NDAs) with named parties",
+            "Signed contracts with dates and signatures",
+            "Settlement agreements or legal memoranda",
+            "Vendor contracts with financial terms",
+        ],
+        "risk_weight": "HIGH to CRITICAL",
+    },
+    "PAYROLL_RECORD": {
+        "description": "Employee payroll and compensation records",
+        "signals": [
+            "Employee ID combined with salary and payroll period",
+            "Direct deposit details or bank account information",
+            "Year-to-date earnings and deductions",
+            "HR compensation reports",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "CUSTOMER_LIST": {
+        "description": "Customer or prospect data in bulk",
+        "signals": [
+            "CSV or table with customer names, emails, and revenue figures",
+            "CRM exports with contact details",
+            "Prospect lists for sales campaigns",
+            "Customer PII in aggregate",
+        ],
+        "risk_weight": "CRITICAL",
+    },
+    "INTERNAL_MEMO": {
+        "description": "Confidential internal communications",
+        "signals": [
+            'Documents marked "INTERNAL ONLY" or "DO NOT DISTRIBUTE"',
+            "CEO or executive strategy memos",
+            "Organizational restructuring plans",
+            "Internal performance reviews or headcount discussions",
+        ],
+        "risk_weight": "HIGH",
+    },
+}
+
+ACTION_THRESHOLDS = {  # human-readable action rules, emitted as "action_mapping" in the prompt
+    "BLOCK": "risk_score >= 80 (CRITICAL risk)",
+    "ALERT": "risk_score >= 40 (MEDIUM or HIGH risk)",
+    "PASS": "risk_score < 40 (LOW risk)",
+}
+
+
+def format_policy_for_prompt() -> str:
+    """Build the JSON text (2-space indent) of the DLP policy for the LLM system prompt."""
+    # Band wording is prose for the model to read; it restates the numeric cut-offs as text.
+    score_bands = {
+        "CRITICAL": "score >= 80",
+        "HIGH": "score >= 60",
+        "MEDIUM": "score >= 40",
+        "LOW": "score < 40",
+    }
+    guidance_parts = (
+        "Evaluate the email against ALL categories above. ",
+        "Assign a risk_score from 0 to 100 based on the most severe violation found. ",
+        "Multiple violations increase the score. ",
+        "action must match the threshold: BLOCK if score>=80, ALERT if score>=40, PASS otherwise. ",
+        "evidence must be direct quotes from the actual email or attachment content.",
+    )
+    payload = {"categories": DLP_CATEGORIES, "risk_score_thresholds": score_bands}
+    payload["action_mapping"] = ACTION_THRESHOLDS
+    payload["instructions"] = "".join(guidance_parts)
+    return json.dumps(payload, indent=2)