Initial SOC memory POC implementation
14  pipeline/README.md  Normal file
@@ -0,0 +1,14 @@
# Pipeline

This directory holds the knowledge-source ingestion and data-cleaning flows.

Suggested sources to ingest first:

- Historical cases
- KB / playbooks

Later, gradually expand to:

- Ticket system
- Intel system
- Monthly reports / other reports
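For a quick end-to-end smoke test, the two jobs below can also be driven from Python instead of their CLIs. A minimal sketch, assuming the repo root is on `PYTHONPATH` and the default mock dataset directories exist:

```python
# Minimal driver sketch. The directory paths are the CLI defaults from the
# jobs in this commit; adjust them to your layout.
from pipeline.jobs.ingest_case import ingest_cases
from pipeline.jobs.ingest_kb import ingest_kb

cases = ingest_cases("evaluation/datasets/mock_cases", "evaluation/datasets/normalized_cases")
kb = ingest_kb("evaluation/datasets/mock_kb", "evaluation/datasets/normalized_kb")
print(f"cases={len(cases)} kb={len(kb)}")
```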
41  pipeline/jobs/ingest_case.py  Normal file
@@ -0,0 +1,41 @@
"""Batch-ingest mock case files and emit normalized case JSON documents."""
from __future__ import annotations

import json
from dataclasses import asdict
from pathlib import Path

from pipeline.transforms.normalize_case import load_and_normalize_case


def ingest_cases(input_dir: str | Path, output_dir: str | Path) -> list[Path]:
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for src in sorted(input_dir.rglob("*.json")):
        normalized = load_and_normalize_case(src)
        dest = output_dir / f"{normalized.id}.json"
        with dest.open("w", encoding="utf-8") as f:
            json.dump(asdict(normalized), f, ensure_ascii=False, indent=2)
        written.append(dest)
    return written


def main() -> None:
    import argparse

    parser = argparse.ArgumentParser(description="Normalize a directory of mock case JSON files.")
    parser.add_argument("--input-dir", default="evaluation/datasets/mock_cases", help="Directory containing raw mock case files")
    parser.add_argument("--output-dir", default="evaluation/datasets/normalized_cases", help="Directory to write normalized case files")
    args = parser.parse_args()

    written = ingest_cases(args.input_dir, args.output_dir)
    print(f"normalized_cases={len(written)}")
    for path in written:
        print(path)


if __name__ == "__main__":
    main()
41  pipeline/jobs/ingest_kb.py  Normal file
@@ -0,0 +1,41 @@
"""Batch-ingest mock KB/playbook files and emit normalized knowledge JSON documents."""
from __future__ import annotations

import json
from dataclasses import asdict
from pathlib import Path

from pipeline.transforms.normalize_kb import load_and_normalize_kb


def ingest_kb(input_dir: str | Path, output_dir: str | Path) -> list[Path]:
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for src in sorted(input_dir.rglob("*.json")):
        normalized = load_and_normalize_kb(src)
        dest = output_dir / f"{normalized.id}.json"
        with dest.open("w", encoding="utf-8") as f:
            json.dump(asdict(normalized), f, ensure_ascii=False, indent=2)
        written.append(dest)
    return written


def main() -> None:
    import argparse

    parser = argparse.ArgumentParser(description="Normalize a directory of mock KB/playbook JSON files.")
    parser.add_argument("--input-dir", default="evaluation/datasets/mock_kb", help="Directory containing raw mock KB/playbook files")
    parser.add_argument("--output-dir", default="evaluation/datasets/normalized_kb", help="Directory to write normalized KB/playbook files")
    args = parser.parse_args()

    written = ingest_kb(args.input_dir, args.output_dir)
    print(f"normalized_kb={len(written)}")
    for path in written:
        print(path)


if __name__ == "__main__":
    main()
91  pipeline/transforms/normalize_case.py  Normal file
@@ -0,0 +1,91 @@
"""Normalize raw mock SOC cases into a retrieval-friendly structure.

This module is intentionally small and deterministic so it can be used with
mock data before real connectors are available.
"""
from __future__ import annotations

import json
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any


@dataclass
class NormalizedCase:
    id: str
    memory_type: str
    scenario: str
    title: str
    abstract: str
    verdict: str
    severity: str
    entities: dict[str, list[str]]
    observables: dict[str, list[str]]
    evidence: list[str]
    patterns: list[str]
    related_refs: dict[str, list[str]]
    source_path: str
    tags: list[str]


def _derive_patterns(raw_case: dict[str, Any]) -> list[str]:
    """Derive a small set of reusable patterns from the case payload."""
    patterns: list[str] = []

    verdict = raw_case.get("conclusion", {}).get("verdict")
    if verdict:
        patterns.append(f"verdict:{verdict}")

    scenario = raw_case.get("scenario")
    if scenario:
        patterns.append(f"scenario:{scenario}")

    alert_type = raw_case.get("alert_type")
    if alert_type:
        patterns.append(f"alert_type:{alert_type}")

    return patterns


def normalize_case(raw_case: dict[str, Any], source_path: str = "") -> NormalizedCase:
    """Convert a raw case document into the internal normalized case model."""
    conclusion = raw_case.get("conclusion", {})
    return NormalizedCase(
        id=raw_case["case_id"],
        memory_type="case",
        scenario=raw_case["scenario"],
        title=raw_case["title"],
        abstract=raw_case.get("summary", ""),
        verdict=conclusion.get("verdict", raw_case.get("status", "unknown")),
        severity=raw_case.get("severity", "unknown"),
        entities=raw_case.get("entities", {}),
        observables=raw_case.get("observables", {}),
        evidence=raw_case.get("evidence", []),
        patterns=_derive_patterns(raw_case),
        related_refs=raw_case.get("related_refs", {}),
        source_path=source_path,
        tags=raw_case.get("tags", []),
    )


def load_and_normalize_case(path: str | Path) -> NormalizedCase:
    path = Path(path)
    with path.open("r", encoding="utf-8") as f:
        raw_case = json.load(f)
    return normalize_case(raw_case, source_path=str(path))


def main() -> None:
    import argparse

    parser = argparse.ArgumentParser(description="Normalize a mock SOC case JSON file.")
    parser.add_argument("path", help="Path to a raw case JSON file")
    args = parser.parse_args()

    normalized = load_and_normalize_case(args.path)
    print(json.dumps(asdict(normalized), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
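To make the field mapping concrete, here is a hypothetical raw case document run through `normalize_case`. The keys (`case_id`, `conclusion.verdict`, `alert_type`, ...) are the ones the module actually reads; the values are invented for illustration:

```python
from pipeline.transforms.normalize_case import normalize_case

raw = {
    "case_id": "case-0001",            # required: becomes NormalizedCase.id
    "scenario": "phishing",            # required: also emitted as a "scenario:..." pattern
    "title": "Suspicious credential-harvesting mail",
    "summary": "User reported a phishing mail linking to a fake login page.",
    "alert_type": "email",             # optional: emitted as an "alert_type:..." pattern
    "conclusion": {"verdict": "true_positive"},
    "severity": "medium",
}

case = normalize_case(raw, source_path="mock_cases/case-0001.json")
print(case.patterns)
# ['verdict:true_positive', 'scenario:phishing', 'alert_type:email']
```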
63  pipeline/transforms/normalize_kb.py  Normal file
@@ -0,0 +1,63 @@
"""Normalize raw mock KB/playbook documents into a retrieval-friendly structure."""
from __future__ import annotations

import json
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any


@dataclass
class NormalizedKnowledge:
    id: str
    memory_type: str
    doc_type: str
    scenario: str
    title: str
    abstract: str
    key_points: list[str]
    investigation_guidance: list[str]
    decision_points: list[str]
    related_refs: dict[str, list[str]]
    source_path: str
    tags: list[str]


def normalize_kb(raw_doc: dict[str, Any], source_path: str = "") -> NormalizedKnowledge:
    """Convert a raw KB or playbook document into the normalized knowledge model."""
    return NormalizedKnowledge(
        id=raw_doc["doc_id"],
        memory_type="knowledge",
        doc_type=raw_doc["doc_type"],
        scenario=raw_doc["scenario"],
        title=raw_doc["title"],
        abstract=raw_doc.get("summary", ""),
        key_points=raw_doc.get("key_points", []),
        investigation_guidance=raw_doc.get("investigation_guidance", []),
        decision_points=raw_doc.get("decision_points", []),
        related_refs=raw_doc.get("related_refs", {}),
        source_path=source_path,
        tags=raw_doc.get("tags", []),
    )


def load_and_normalize_kb(path: str | Path) -> NormalizedKnowledge:
    path = Path(path)
    with path.open("r", encoding="utf-8") as f:
        raw_doc = json.load(f)
    return normalize_kb(raw_doc, source_path=str(path))


def main() -> None:
    import argparse

    parser = argparse.ArgumentParser(description="Normalize a mock KB or playbook JSON file.")
    parser.add_argument("path", help="Path to a raw KB/playbook JSON file")
    args = parser.parse_args()

    normalized = load_and_normalize_kb(args.path)
    print(json.dumps(asdict(normalized), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
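And the knowledge-side equivalent, again with invented values; only `doc_id`, `doc_type`, `scenario`, and `title` are mandatory, everything else falls back to empty defaults:

```python
from pipeline.transforms.normalize_kb import normalize_kb

doc = normalize_kb(
    {
        "doc_id": "kb-0001",
        "doc_type": "playbook",
        "scenario": "phishing",
        "title": "Phishing triage playbook",
        "key_points": ["Check sender domain age", "Detonate URLs in a sandbox"],
    },
    source_path="mock_kb/kb-0001.json",
)
print(doc.memory_type, doc.abstract == "")
# knowledge True
```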