Initial commit

This commit is contained in:
2026-03-20 10:28:28 +08:00
commit 1b4d5a277f
30 changed files with 14869 additions and 0 deletions

241
email_dlp/analyzer.py Normal file
View File

@@ -0,0 +1,241 @@
"""DLP analysis backends: remote LLM or deterministic local simulation."""
import json
import os
import re
from typing import Any
import openai
from dotenv import load_dotenv
load_dotenv()
from .converter import IMAGE_SENTINEL
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
from .policy import format_policy_for_prompt
from .simulator import simulate_analysis
# JSON Schema the LLM must emit for its DLP verdict. The enum strings here
# must stay in sync with the RiskLevel / ViolationType / ActionClass enums in
# .models — the response is mapped onto those enums in analyze_email().
OUTPUT_SCHEMA = {
    "type": "object",
    "properties": {
        "risk_level": {"type": "string", "enum": ["CRITICAL", "HIGH", "MEDIUM", "LOW"]},
        "risk_score": {"type": "integer", "minimum": 0, "maximum": 100},
        "violation_types": {
            "type": "array",
            "items": {
                "type": "string",
                "enum": [
                    "PII",
                    "FINANCIAL_DATA",
                    "SOURCE_CODE",
                    "REGULATORY_DOCUMENT",
                    "LEGAL_CONTRACT",
                    "PAYROLL_RECORD",
                    "CUSTOMER_LIST",
                    "INTERNAL_MEMO",
                    "NONE",
                ],
            },
        },
        # Action thresholds are enforced by prompt rule 4: BLOCK>=80, ALERT>=40.
        "action": {"type": "string", "enum": ["PASS", "ALERT", "BLOCK"]},
        "summary": {"type": "string"},
        # evidence: verbatim quotes from the email/attachments (prompt rule 2).
        "evidence": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["risk_level", "risk_score", "violation_types", "action", "summary", "evidence"],
}
SYSTEM_PROMPT_TEMPLATE = """\
You are a Data Loss Prevention (DLP) analyst. Your task is to evaluate email content and attachments against the DLP policy below, then return a structured JSON decision.
## DLP Policy
{policy_json}
## Output Schema
Respond with valid JSON only (no markdown fences, no extra text) matching this schema:
{schema_json}
## Critical Rules
1. temperature=0: be deterministic and consistent.
2. evidence: include direct quotes (verbatim excerpts) from the actual email or attachment content that justify your decision. Do not paraphrase.
3. risk_score: assign 0-100. Use the full range — a customer CSV with 500 rows should score 95+, a casual internal memo scores 60-70.
4. action: MUST match the threshold — BLOCK if risk_score>=80, ALERT if risk_score>=40, PASS otherwise.
5. If no policy violations are found, set violation_types=["NONE"], risk_level="LOW", risk_score<40, action="PASS".
"""
def build_system_prompt() -> str:
    """Render the DLP system prompt from the current policy and output schema."""
    policy_json = format_policy_for_prompt()
    schema_json = json.dumps(OUTPUT_SCHEMA, indent=2)
    return SYSTEM_PROMPT_TEMPLATE.format(policy_json=policy_json, schema_json=schema_json)
def _build_user_content(
    subject: str,
    sender: str,
    recipient: str,
    date: str,
    body_text: str,
    attachment_texts: list[tuple[str, str]],  # [(filename, text_or_sentinel)]
) -> list[dict[str, Any]]:
    """Build the user message content for the VLM.

    Returns a multimodal content list (OpenAI vision format).
    Text attachments are embedded as text blocks; image attachments are
    inserted as image_url blocks so the VLM can see them directly.
    """
    header_block = "\n".join([
        "## Email Headers",
        f"Subject: {subject}",
        f"From: {sender}",
        f"To: {recipient}",
        f"Date: {date}",
        "",
        "## Email Body",
        body_text or "(empty body)",
    ])
    content: list[dict[str, Any]] = [{"type": "text", "text": header_block}]
    for filename, text in attachment_texts:
        if text.startswith(IMAGE_SENTINEL):
            # IMAGE_SENTINEL format: "__IMAGE__:<mime>:<base64>"
            payload = text[len(IMAGE_SENTINEL):]  # "<mime>:<base64>"
            mime, b64 = payload.split(":", 1)
            # BUG FIX: both labels previously hard-coded the literal
            # "(unknown)" in placeholder-free f-strings, discarding the
            # unpacked filename — the model never saw attachment names.
            content.append({"type": "text", "text": f"\n## Attachment: {filename} (image)"})
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{b64}"},
            })
        else:
            content.append({
                "type": "text",
                "text": f"\n## Attachment: {filename}\n{text or '(no extractable text)'}",
            })
    return content
def _parse_llm_response(content: str) -> dict[str, Any]:
"""Parse JSON from LLM response, handling markdown fences."""
content = content.strip()
# Strip triple-backtick fences
fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
if fence_match:
content = fence_match.group(1)
return json.loads(content)
def _map_action(action_str: str) -> ActionClass:
    """Map an LLM action string onto ActionClass; unknown values become ALERT."""
    normalized = action_str.upper()
    if normalized == "PASS":
        return ActionClass.PASS_
    if normalized == "BLOCK":
        return ActionClass.BLOCK
    # "ALERT" and anything unrecognized fall back to the cautious middle ground.
    return ActionClass.ALERT
def analyze_email(
    email_file: str,
    subject: str,
    sender: str,
    recipient: str,
    date: str,
    body_text: str,
    attachment_texts: list[tuple[str, str]],
    attachment_results: list[AttachmentResult],
    processing_errors: list[str],
    endpoint: str = "http://localhost:8000/v1",
    model: str = "Qwen/Qwen3.5-35B-A3B",
    backend: str = "llm",
) -> DLPResult:
    """Analyze email content using an LLM or the deterministic simulator.

    With backend="simulated" the call is delegated to simulate_analysis and no
    network request is made. Otherwise the email (headers, body, attachments)
    is sent to an OpenAI-compatible chat endpoint and the JSON verdict is
    mapped onto the project enums. Any failure to interpret the response is
    recorded in processing_errors and degrades to a conservative ALERT result
    rather than raising.
    """
    if backend == "simulated":
        return simulate_analysis(
            email_file=email_file,
            subject=subject,
            sender=sender,
            recipient=recipient,
            date=date,
            body_text=body_text,
            attachment_texts=attachment_texts,
            attachment_results=attachment_results,
            processing_errors=processing_errors,
        )
    # Environment variables override the call-site defaults when present.
    final_endpoint = os.getenv("OPENAI_BASE_URL", endpoint)
    final_api_key = os.getenv("OPENAI_API_KEY", "not-needed")
    final_model = os.getenv("MODEL_NAME", model)
    client = openai.OpenAI(base_url=final_endpoint, api_key=final_api_key)
    system_prompt = build_system_prompt()
    user_content = _build_user_content(
        subject=subject,
        sender=sender,
        recipient=recipient,
        date=date,
        body_text=body_text,
        attachment_texts=attachment_texts,
    )
    response = client.chat.completions.create(
        model=final_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},  # list[dict] for multimodal
        ],
        temperature=0.0,
        max_tokens=1024,
        # vLLM-specific: disable the model's thinking mode for plain JSON output.
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    raw_content = response.choices[0].message.content or ""
    try:
        parsed = _parse_llm_response(raw_content)
    except json.JSONDecodeError as e:
        processing_errors.append(f"JSON parse error: {e}; raw={raw_content[:200]}")
        # Return a safe fallback
        return DLPResult(
            email_file=email_file,
            subject=subject,
            sender=sender,
            recipient=recipient,
            date=date,
            risk_level=RiskLevel.HIGH,
            risk_score=60,
            violation_types=[ViolationType.NONE],
            action=ActionClass.ALERT,
            summary="Analysis failed due to JSON parse error.",
            evidence=[],
            attachments=attachment_results,
            processing_errors=processing_errors,
        )
    # Map string values to enums. Each conversion is guarded: a syntactically
    # valid JSON response can still carry out-of-enum strings or a non-numeric
    # risk_score, and the previous bare RiskLevel(...) / int(...) calls raised
    # ValueError here — defeating the safe-fallback design used above.
    try:
        risk_level = RiskLevel(parsed.get("risk_level", "HIGH"))
    except ValueError:
        processing_errors.append(f"Unknown risk_level: {parsed.get('risk_level')!r}")
        risk_level = RiskLevel.HIGH
    violation_types = []
    for v in parsed.get("violation_types", ["NONE"]):
        # NOTE(review): the original filtered on ViolationType.__members__
        # (member names) but constructed by value; this assumes names == values
        # in .models. Constructing by value directly and skipping bad entries
        # is safe either way.
        try:
            violation_types.append(ViolationType(v))
        except ValueError:
            processing_errors.append(f"Unknown violation_type: {v!r}")
    if not violation_types:
        violation_types = [ViolationType.NONE]
    try:
        risk_score = int(parsed.get("risk_score", 60))
    except (TypeError, ValueError):
        processing_errors.append(f"Invalid risk_score: {parsed.get('risk_score')!r}")
        risk_score = 60
    action = _map_action(parsed.get("action", "ALERT"))
    return DLPResult(
        email_file=email_file,
        subject=subject,
        sender=sender,
        recipient=recipient,
        date=date,
        risk_level=risk_level,
        risk_score=risk_score,
        violation_types=violation_types,
        action=action,
        summary=parsed.get("summary", ""),
        evidence=parsed.get("evidence", []),
        attachments=attachment_results,
        processing_errors=processing_errors,
    )