# File: email-dlp/email_dlp/policy.py
# Last modified: 2026-03-20 10:28:28 +08:00
# 133 lines, 4.7 KiB, Python

"""DLP policy definitions: violation categories, thresholds, and prompt formatting."""
import json
from .models import ActionClass, RiskLevel
# Inclusive lower bound of risk_score for each risk level.
# A score is classified as the highest level whose threshold it meets.
RISK_THRESHOLDS = {
    RiskLevel.CRITICAL: 80,  # score >= 80
    RiskLevel.HIGH: 60,      # 60 <= score < 80
    RiskLevel.MEDIUM: 40,    # 40 <= score < 60
    RiskLevel.LOW: 0,        # score < 40
}
# Enforcement action taken for each risk level.
# CRITICAL is hard-blocked; HIGH and MEDIUM raise an alert; LOW passes through.
RISK_TO_ACTION = {
    RiskLevel.CRITICAL: ActionClass.BLOCK,
    RiskLevel.HIGH: ActionClass.ALERT,
    RiskLevel.MEDIUM: ActionClass.ALERT,
    RiskLevel.LOW: ActionClass.PASS_,
}
# Violation categories the LLM is asked to screen for. Each entry carries a
# human-readable description, example detection signals, and a qualitative
# risk weight. Insertion order is preserved in the serialized policy JSON.
DLP_CATEGORIES = {
    "PII": dict(
        description="Personally Identifiable Information",
        signals=[
            "Full name combined with email address",
            "Social Security Number (SSN) or employee ID",
            "Phone numbers combined with personal details",
            "Home address combined with personal identifiers",
        ],
        risk_weight="HIGH to CRITICAL depending on volume",
    ),
    "FINANCIAL_DATA": dict(
        description="Non-public financial information",
        signals=[
            "Revenue targets, EBITDA projections, internal forecasts",
            "Salary figures, compensation plans",
            "Invoice amounts and vendor payment terms",
            "Internal budget allocations",
        ],
        risk_weight="MEDIUM to CRITICAL depending on sensitivity",
    ),
    "SOURCE_CODE": dict(
        description="Proprietary source code or model weights",
        signals=[
            "Python, Java, or other source files with copyright notices",
            "Internal class names and proprietary algorithms",
            "Model architecture files or weight files",
            "Internal API keys or credentials embedded in code",
        ],
        risk_weight="CRITICAL",
    ),
    "REGULATORY_DOCUMENT": dict(
        description="Internal regulatory and compliance drafts",
        signals=[
            "CFPB, GDPR, or SOX compliance drafts marked internal",
            "Audit findings or remediation plans",
            "Internal compliance assessments not yet published",
            "Regulatory submission drafts",
        ],
        risk_weight="CRITICAL",
    ),
    "LEGAL_CONTRACT": dict(
        description="Executed or draft legal agreements",
        signals=[
            "Non-Disclosure Agreements (NDAs) with named parties",
            "Signed contracts with dates and signatures",
            "Settlement agreements or legal memoranda",
            "Vendor contracts with financial terms",
        ],
        risk_weight="HIGH to CRITICAL",
    ),
    "PAYROLL_RECORD": dict(
        description="Employee payroll and compensation records",
        signals=[
            "Employee ID combined with salary and payroll period",
            "Direct deposit details or bank account information",
            "Year-to-date earnings and deductions",
            "HR compensation reports",
        ],
        risk_weight="CRITICAL",
    ),
    "CUSTOMER_LIST": dict(
        description="Customer or prospect data in bulk",
        signals=[
            "CSV or table with customer names, emails, and revenue figures",
            "CRM exports with contact details",
            "Prospect lists for sales campaigns",
            "Customer PII in aggregate",
        ],
        risk_weight="CRITICAL",
    ),
    "INTERNAL_MEMO": dict(
        description="Confidential internal communications",
        signals=[
            'Documents marked "INTERNAL ONLY" or "DO NOT DISTRIBUTE"',
            "CEO or executive strategy memos",
            "Organizational restructuring plans",
            "Internal performance reviews or headcount discussions",
        ],
        risk_weight="HIGH",
    ),
}
# Human-readable mapping from action name to the score condition that
# triggers it; injected verbatim into the LLM prompt as "action_mapping".
ACTION_THRESHOLDS = {
    "BLOCK": "risk_score >= 80 (CRITICAL risk)",
    "ALERT": "risk_score >= 40 (MEDIUM or HIGH risk)",
    "PASS": "risk_score < 40 (LOW risk)",
}
def format_policy_for_prompt() -> str:
    """Serialize the full DLP policy as pretty-printed JSON for the LLM system prompt.

    Bundles the category catalog, the numeric score-to-level thresholds,
    the level-to-action mapping, and the evaluation instructions into one
    JSON document (2-space indent) that the caller embeds in the prompt.
    """
    # Textual form of the score bands; must stay in sync with RISK_THRESHOLDS.
    score_bands = {
        "CRITICAL": "score >= 80",
        "HIGH": "score >= 60",
        "MEDIUM": "score >= 40",
        "LOW": "score < 40",
    }
    # Evaluation rules the model must follow when scoring an email.
    guidance = (
        "Evaluate the email against ALL categories above. "
        "Assign a risk_score from 0 to 100 based on the most severe violation found. "
        "Multiple violations increase the score. "
        "action must match the threshold: BLOCK if score>=80, ALERT if score>=40, PASS otherwise. "
        "evidence must be direct quotes from the actual email or attachment content."
    )
    policy = {
        "categories": DLP_CATEGORIES,
        "risk_score_thresholds": score_bands,
        "action_mapping": ACTION_THRESHOLDS,
        "instructions": guidance,
    }
    return json.dumps(policy, indent=2)