Initial SOC memory POC implementation

2026-04-27 17:13:06 +08:00
parent fc68581198
commit e6b1520bce
89 changed files with 7610 additions and 1 deletions
--- a/integrations/hermes/soc-memory-poc/scripts/triage_email.py
+++ b/integrations/hermes/soc-memory-poc/scripts/triage_email.py
@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+TRIAGE_ALERT = SCRIPT_DIR / "triage_alert.py"
+
+EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
+URL_RE = re.compile(r"https?://[^\s<>\"]+")
+IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
+HOST_RE = re.compile(r"\b[A-Z]{2,}(?:-[A-Z0-9]+)+\b")
+ATTACHMENT_RE = re.compile(r"\b[\w.-]+\.(?:html|htm|pdf|zip|docx|xlsx|eml)\b", re.IGNORECASE)
+HEADER_RE = re.compile(
+    r"^(From|To|Subject|Attachment|URL|IP|Host|User|Alert type|Scenario)\s*:\s*(.+)$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+
+def first_nonempty(*values: str) -> str:
+    for value in values:
+        if value and value.strip():
+            return value.strip()
+    return ""
+
+
+def load_text(args: argparse.Namespace) -> str:
+    if args.file:
+        return Path(args.file).read_text(encoding="utf-8")
+    if args.text:
+        return args.text
+    data = sys.stdin.read()
+    if data.strip():
+        return data
+    return ""
+
+
+def find_header(text: str, name: str) -> str:
+    for key, value in HEADER_RE.findall(text):
+        if key.lower() == name.lower():
+            return value.strip()
+    return ""
+
+
+def unique_matches(pattern: re.Pattern[str], text: str) -> list[str]:
+    seen: list[str] = []
+    for match in pattern.findall(text):
+        if match not in seen:
+            seen.append(match)
+    return seen
+
+
+def infer_scenario(text: str, explicit_scenario: str = "", explicit_alert_type: str = "") -> tuple[str, str]:
+    if explicit_scenario:
+        return explicit_scenario, explicit_alert_type
+
+    lowered = text.lower()
+    if any(token in lowered for token in ["impossible travel", "mfa fatigue", "oauth consent", "inbox rule", "entra", "azuread", "sign-in", "signin"]):
+        alert_type = explicit_alert_type or ("azuread_impossible_travel" if "impossible travel" in lowered else "o365_suspicious_login")
+        return "o365_suspicious_login", alert_type
+
+    if any(token in lowered for token in ["phishing", "invoice", "attachment", "credential harvest", "fake microsoft 365", "dmarc", "mail_suspicious", "wire transfer"]):
+        if explicit_alert_type:
+            return "phishing", explicit_alert_type
+        if "wire transfer" in lowered or "executive impersonation" in lowered or "bec" in lowered:
+            return "phishing", "mail_bec_impersonation"
+        if "link" in lowered and "attachment" not in lowered:
+            return "phishing", "mail_suspicious_link"
+        return "phishing", "mail_suspicious_attachment"
+
+    return "phishing", explicit_alert_type
+
+
+def collect_facts(text: str, provided: list[str]) -> list[str]:
+    facts: list[str] = []
+    for fact in provided:
+        if fact and fact not in facts:
+            facts.append(fact)
+
+    lowered = text.lower()
+    fact_patterns = [
+        ("DMARC failed", ["dmarc failed"]),
+        ("SPF failed", ["spf failed"]),
+        ("User may have clicked the link", ["clicked", "user clicked"]),
+        ("Credential submission suspected", ["submitted credentials", "credential submission", "entered credentials"]),
+        ("Impossible travel observed", ["impossible travel"]),
+        ("MFA fatigue observed", ["mfa fatigue", "repeated mfa"]),
+        ("Inbox rule creation observed", ["inbox rule"]),
+        ("OAuth consent activity observed", ["oauth consent"]),
+    ]
+    for label, needles in fact_patterns:
+        if any(needle in lowered for needle in needles) and label not in facts:
+            facts.append(label)
+
+    for line in text.splitlines():
+        stripped = line.strip("-* \t")
+        if not stripped or len(stripped) > 160:
+            continue
+        lower = stripped.lower()
+        if any(word in lower for word in ["dmarc", "spf", "clicked", "credential", "impossible travel", "mfa", "inbox rule", "oauth"]):
+            if stripped not in facts:
+                facts.append(stripped)
+    return facts[:8]
+
+
+def build_summary(text: str, subject: str, provided_summary: str = "") -> str:
+    if provided_summary:
+        return provided_summary[:240]
+    if subject:
+        return subject[:180]
+    for line in text.splitlines():
+        stripped = line.strip()
+        if len(stripped) >= 20 and ":" not in stripped[:20]:
+            return stripped[:240]
+    return text.strip()[:240]
+
+
+def parse_input(args: argparse.Namespace) -> dict[str, str | list[str]]:
+    text = load_text(args)
+    scenario, alert_type = infer_scenario(text, args.scenario, args.alert_type)
+    emails = unique_matches(EMAIL_RE, text)
+    urls = unique_matches(URL_RE, text)
+    ips = unique_matches(IP_RE, text)
+    hosts = unique_matches(HOST_RE, text)
+    attachments = unique_matches(ATTACHMENT_RE, text)
+
+    sender = first_nonempty(args.sender, find_header(text, "From"), emails[0] if emails else "")
+    user = first_nonempty(args.user, find_header(text, "User"), find_header(text, "To"), emails[1] if len(emails) > 1 else "")
+    subject = first_nonempty(args.subject, find_header(text, "Subject"))
+    attachment = first_nonempty(args.attachment, find_header(text, "Attachment"), attachments[0] if attachments else "")
+    url = first_nonempty(args.url, find_header(text, "URL"), urls[0] if urls else "")
+    ip = first_nonempty(args.ip, find_header(text, "IP"), ips[0] if ips else "")
+    host = first_nonempty(args.host, find_header(text, "Host"), hosts[0] if hosts else "")
+    summary = build_summary(text, subject, args.summary)
+    facts = collect_facts(text, args.fact)
+
+    return {
+        "scenario": scenario,
+        "alert_type": alert_type,
+        "user": user,
+        "host": host,
+        "sender": sender,
+        "subject": subject,
+        "attachment": attachment,
+        "url": url,
+        "ip": ip,
+        "summary": summary,
+        "facts": facts,
+    }
+
+
+def run_triage(parsed: dict[str, str | list[str]], limit: int) -> None:
+    cmd = [
+        sys.executable,
+        str(TRIAGE_ALERT),
+        "--scenario", str(parsed["scenario"]),
+        "--alert-type", str(parsed["alert_type"]),
+        "--user", str(parsed["user"]),
+        "--host", str(parsed["host"]),
+        "--sender", str(parsed["sender"]),
+        "--subject", str(parsed["subject"]),
+        "--attachment", str(parsed["attachment"]),
+        "--url", str(parsed["url"]),
+        "--ip", str(parsed["ip"]),
+        "--summary", str(parsed["summary"]),
+        "--limit", str(limit),
+    ]
+    for fact in parsed["facts"]:
+        cmd.extend(["--fact", str(fact)])
+    subprocess.run(cmd, check=True, env=os.environ.copy())
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Unified SOC alert/email triage entrypoint with memory and Obsidian retrieval.")
+    parser.add_argument("--text", help="Raw email, ticket text, or freeform alert text")
+    parser.add_argument("--file", help="Path to a raw email/ticket/alert text file")
+    parser.add_argument("--scenario", default="", help="Optional scenario override")
+    parser.add_argument("--alert-type", default="", help="Optional alert type override")
+    parser.add_argument("--user", default="", help="Optional user override")
+    parser.add_argument("--host", default="", help="Optional host override")
+    parser.add_argument("--sender", default="", help="Optional sender override")
+    parser.add_argument("--subject", default="", help="Optional subject override")
+    parser.add_argument("--attachment", default="", help="Optional attachment override")
+    parser.add_argument("--url", default="", help="Optional URL override")
+    parser.add_argument("--ip", default="", help="Optional IP override")
+    parser.add_argument("--summary", default="", help="Optional summary override")
+    parser.add_argument("--fact", action="append", default=[], help="Additional known fact; repeatable")
+    parser.add_argument("--limit", type=int, default=5, help="Search limit")
+    args = parser.parse_args()
+
+    parsed = parse_input(args)
+    run_triage(parsed, args.limit)
+
+
+if __name__ == "__main__":
+    main()