Initial commit
This commit is contained in:
241
email_dlp/analyzer.py
Normal file
241
email_dlp/analyzer.py
Normal file
@ -0,0 +1,241 @@
|
||||
"""DLP analysis backends: remote LLM or deterministic local simulation."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import openai
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from .converter import IMAGE_SENTINEL
|
||||
from .models import ActionClass, AttachmentResult, DLPResult, RiskLevel, ViolationType
|
||||
from .policy import format_policy_for_prompt
|
||||
from .simulator import simulate_analysis
|
||||
|
||||
OUTPUT_SCHEMA = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"risk_level": {"type": "string", "enum": ["CRITICAL", "HIGH", "MEDIUM", "LOW"]},
|
||||
"risk_score": {"type": "integer", "minimum": 0, "maximum": 100},
|
||||
"violation_types": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"PII",
|
||||
"FINANCIAL_DATA",
|
||||
"SOURCE_CODE",
|
||||
"REGULATORY_DOCUMENT",
|
||||
"LEGAL_CONTRACT",
|
||||
"PAYROLL_RECORD",
|
||||
"CUSTOMER_LIST",
|
||||
"INTERNAL_MEMO",
|
||||
"NONE",
|
||||
],
|
||||
},
|
||||
},
|
||||
"action": {"type": "string", "enum": ["PASS", "ALERT", "BLOCK"]},
|
||||
"summary": {"type": "string"},
|
||||
"evidence": {"type": "array", "items": {"type": "string"}},
|
||||
},
|
||||
"required": ["risk_level", "risk_score", "violation_types", "action", "summary", "evidence"],
|
||||
}
|
||||
|
||||
SYSTEM_PROMPT_TEMPLATE = """\
|
||||
You are a Data Loss Prevention (DLP) analyst. Your task is to evaluate email content and attachments against the DLP policy below, then return a structured JSON decision.
|
||||
|
||||
## DLP Policy
|
||||
{policy_json}
|
||||
|
||||
## Output Schema
|
||||
Respond with valid JSON only (no markdown fences, no extra text) matching this schema:
|
||||
{schema_json}
|
||||
|
||||
## Critical Rules
|
||||
1. temperature=0: be deterministic and consistent.
|
||||
2. evidence: include direct quotes (verbatim excerpts) from the actual email or attachment content that justify your decision. Do not paraphrase.
|
||||
3. risk_score: assign 0-100. Use the full range — a customer CSV with 500 rows should score 95+, a casual internal memo scores 60-70.
|
||||
4. action: MUST match the threshold — BLOCK if risk_score>=80, ALERT if risk_score>=40, PASS otherwise.
|
||||
5. If no policy violations are found, set violation_types=["NONE"], risk_level="LOW", risk_score<40, action="PASS".
|
||||
"""
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
|
||||
"""Build the system prompt used for DLP analysis."""
|
||||
return SYSTEM_PROMPT_TEMPLATE.format(
|
||||
policy_json=format_policy_for_prompt(),
|
||||
schema_json=json.dumps(OUTPUT_SCHEMA, indent=2),
|
||||
)
|
||||
|
||||
|
||||
def _build_user_content(
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
date: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]], # [(filename, text_or_sentinel)]
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build the user message content for the VLM.
|
||||
|
||||
Returns a multimodal content list (OpenAI vision format).
|
||||
Text attachments are embedded as text blocks; image attachments are
|
||||
inserted as image_url blocks so the VLM can see them directly.
|
||||
"""
|
||||
header_block = "\n".join([
|
||||
"## Email Headers",
|
||||
f"Subject: {subject}",
|
||||
f"From: {sender}",
|
||||
f"To: {recipient}",
|
||||
f"Date: {date}",
|
||||
"",
|
||||
"## Email Body",
|
||||
body_text or "(empty body)",
|
||||
])
|
||||
content: list[dict[str, Any]] = [{"type": "text", "text": header_block}]
|
||||
|
||||
for filename, text in attachment_texts:
|
||||
if text.startswith(IMAGE_SENTINEL):
|
||||
# IMAGE_SENTINEL format: "__IMAGE__:<mime>:<base64>"
|
||||
payload = text[len(IMAGE_SENTINEL):] # "<mime>:<base64>"
|
||||
mime, b64 = payload.split(":", 1)
|
||||
content.append({"type": "text", "text": f"\n## Attachment: {filename} (image)"})
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:{mime};base64,{b64}"},
|
||||
})
|
||||
else:
|
||||
content.append({
|
||||
"type": "text",
|
||||
"text": f"\n## Attachment: {filename}\n{text or '(no extractable text)'}",
|
||||
})
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def _parse_llm_response(content: str) -> dict[str, Any]:
|
||||
"""Parse JSON from LLM response, handling markdown fences."""
|
||||
content = content.strip()
|
||||
|
||||
# Strip triple-backtick fences
|
||||
fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
||||
if fence_match:
|
||||
content = fence_match.group(1)
|
||||
|
||||
return json.loads(content)
|
||||
|
||||
|
||||
def _map_action(action_str: str) -> ActionClass:
|
||||
mapping = {"PASS": ActionClass.PASS_, "ALERT": ActionClass.ALERT, "BLOCK": ActionClass.BLOCK}
|
||||
return mapping.get(action_str.upper(), ActionClass.ALERT)
|
||||
|
||||
|
||||
def analyze_email(
|
||||
email_file: str,
|
||||
subject: str,
|
||||
sender: str,
|
||||
recipient: str,
|
||||
date: str,
|
||||
body_text: str,
|
||||
attachment_texts: list[tuple[str, str]],
|
||||
attachment_results: list[AttachmentResult],
|
||||
processing_errors: list[str],
|
||||
endpoint: str = "http://localhost:8000/v1",
|
||||
model: str = "Qwen/Qwen3.5-35B-A3B",
|
||||
backend: str = "llm",
|
||||
) -> DLPResult:
|
||||
"""Analyze email content using an LLM or the deterministic simulator."""
|
||||
if backend == "simulated":
|
||||
return simulate_analysis(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
body_text=body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
attachment_results=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
# Use environment variables as fallback if they exist
|
||||
final_endpoint = os.getenv("OPENAI_BASE_URL", endpoint)
|
||||
final_api_key = os.getenv("OPENAI_API_KEY", "not-needed")
|
||||
final_model = os.getenv("MODEL_NAME", model)
|
||||
|
||||
client = openai.OpenAI(base_url=final_endpoint, api_key=final_api_key)
|
||||
|
||||
system_prompt = build_system_prompt()
|
||||
|
||||
user_content = _build_user_content(
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
body_text=body_text,
|
||||
attachment_texts=attachment_texts,
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=final_model,
|
||||
messages=[
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_content}, # list[dict] for multimodal
|
||||
],
|
||||
temperature=0.0,
|
||||
max_tokens=1024,
|
||||
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
|
||||
)
|
||||
|
||||
raw_content = response.choices[0].message.content or ""
|
||||
|
||||
try:
|
||||
parsed = _parse_llm_response(raw_content)
|
||||
except json.JSONDecodeError as e:
|
||||
processing_errors.append(f"JSON parse error: {e}; raw={raw_content[:200]}")
|
||||
# Return a safe fallback
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=RiskLevel.HIGH,
|
||||
risk_score=60,
|
||||
violation_types=[ViolationType.NONE],
|
||||
action=ActionClass.ALERT,
|
||||
summary="Analysis failed due to JSON parse error.",
|
||||
evidence=[],
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
# Map string values to enums
|
||||
risk_level = RiskLevel(parsed.get("risk_level", "HIGH"))
|
||||
violation_types = [
|
||||
ViolationType(v) for v in parsed.get("violation_types", ["NONE"])
|
||||
if v in ViolationType.__members__
|
||||
]
|
||||
if not violation_types:
|
||||
violation_types = [ViolationType.NONE]
|
||||
|
||||
action = _map_action(parsed.get("action", "ALERT"))
|
||||
|
||||
return DLPResult(
|
||||
email_file=email_file,
|
||||
subject=subject,
|
||||
sender=sender,
|
||||
recipient=recipient,
|
||||
date=date,
|
||||
risk_level=risk_level,
|
||||
risk_score=int(parsed.get("risk_score", 60)),
|
||||
violation_types=violation_types,
|
||||
action=action,
|
||||
summary=parsed.get("summary", ""),
|
||||
evidence=parsed.get("evidence", []),
|
||||
attachments=attachment_results,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
Reference in New Issue
Block a user