#!/usr/bin/env python3
# memory-gateway/integrations/hermes/soc-memory-poc/scripts/search_obsidian_docs.py
from __future__ import annotations
import argparse
import json
import os
import re
from pathlib import Path
from typing import Any
DEFAULT_POC_ROOT = os.environ.get("SOC_MEMORY_POC_ROOT", "/home/tom/soc_memory_poc")
DEFAULT_VAULT_ROOT = str(Path(DEFAULT_POC_ROOT) / "obsidian-vault")
TOKEN_RE = re.compile(r"[A-Za-z0-9_./:-]+")
SKIP_DIRS = {"05_Templates"}
SKIP_FILES = {"README.md"}
def tokenize(text: str) -> list[str]:
lowered = (text or "").lower()
tokens = TOKEN_RE.findall(lowered)
return [token for token in tokens if len(token) >= 3]
def parse_frontmatter(text: str) -> tuple[dict[str, str], str]:
if not text.startswith("---\n"):
return {}, text
parts = text.split("\n---\n", 1)
if len(parts) != 2:
return {}, text
raw_frontmatter = parts[0].splitlines()[1:]
body = parts[1]
data: dict[str, str] = {}
for line in raw_frontmatter:
if ":" not in line:
continue
key, value = line.split(":", 1)
data[key.strip()] = value.strip()
return data, body
def extract_title(body: str, fallback: str) -> str:
for line in body.splitlines():
if line.startswith("# "):
return line[2:].strip()
return fallback
def extract_section_text(body: str, heading: str) -> str:
lines = body.splitlines()
marker = f"## {heading}"
collecting = False
collected: list[str] = []
for line in lines:
if line.strip() == marker:
collecting = True
continue
if collecting and line.startswith("## "):
break
if collecting:
stripped = line.strip()
if stripped:
collected.append(stripped)
return " ".join(collected[:4]).strip()
def extract_tags(body: str) -> list[str]:
tags: list[str] = []
in_tag_section = False
for line in body.splitlines():
if line.strip() == "## 标签":
in_tag_section = True
continue
if in_tag_section and line.startswith("## "):
break
if in_tag_section:
for token in re.findall(r"#[^\s,]+", line):
tags.append(token)
return tags
def score_doc(query: str, tokens: list[str], doc: dict[str, Any]) -> tuple[int, list[str]]:
score = 0
matched: list[str] = []
path_text = f"{doc['relative_path']} {doc['file_name']}".lower()
title_text = doc["title"].lower()
summary_text = doc.get("summary", "").lower()
body_text = doc.get("body", "").lower()
frontmatter_text = " ".join(f"{k}:{v}" for k, v in doc.get("frontmatter", {}).items()).lower()
tags_text = " ".join(doc.get("tags", [])).lower()
if query and query.lower() in body_text:
score += 8
matched.append(query.lower())
case_id = doc.get("frontmatter", {}).get("case_id", "")
if case_id and case_id.lower() in query.lower():
score += 80
matched.append(case_id.lower())
scenario = doc.get("frontmatter", {}).get("scenario", "")
if scenario and scenario.lower() in query.lower():
score += 20
matched.append(scenario.lower())
for token in tokens:
token_hit = False
if token in title_text:
score += 12
token_hit = True
elif token in summary_text:
score += 7
token_hit = True
elif token in path_text:
score += 6
token_hit = True
elif token in frontmatter_text:
score += 5
token_hit = True
elif token in tags_text:
score += 4
token_hit = True
elif token in body_text:
score += 1
token_hit = True
if token_hit and token not in matched:
matched.append(token)
return score, matched[:8]
def load_docs(vault_root: str | Path) -> list[dict[str, Any]]:
vault_root = Path(vault_root)
docs: list[dict[str, Any]] = []
for path in sorted(vault_root.rglob("*.md")):
rel = path.relative_to(vault_root)
if any(part in SKIP_DIRS for part in rel.parts):
continue
if path.name in SKIP_FILES:
continue
text = path.read_text(encoding="utf-8")
frontmatter, body = parse_frontmatter(text)
docs.append(
{
"file_name": path.name,
"relative_path": str(rel),
"absolute_path": str(path),
"category": rel.parts[0] if rel.parts else "",
"directory": str(rel.parent),
"frontmatter": frontmatter,
"title": extract_title(body, path.stem),
"summary": extract_section_text(body, "告警摘要") or extract_section_text(body, "Summary"),
"tags": extract_tags(body),
"body": body,
}
)
return docs
def main() -> None:
parser = argparse.ArgumentParser(description="Search Obsidian SOC notes and return matching document references.")
parser.add_argument("--query", required=True, help="Search query")
parser.add_argument("--vault-root", default=DEFAULT_VAULT_ROOT, help="Obsidian vault root")
parser.add_argument("--limit", type=int, default=5, help="Maximum results")
parser.add_argument("--scenario", default="", help="Optional scenario filter")
args = parser.parse_args()
docs = load_docs(args.vault_root)
tokens = tokenize(args.query)
results: list[dict[str, Any]] = []
for doc in docs:
scenario = doc.get("frontmatter", {}).get("scenario", "")
if args.scenario and scenario != args.scenario:
continue
score, matched_terms = score_doc(args.query, tokens, doc)
if score <= 0:
continue
results.append(
{
"score": score,
"title": doc["title"],
"file_name": doc["file_name"],
"relative_path": doc["relative_path"],
"directory": doc["directory"],
"category": doc["category"],
"scenario": scenario,
"summary": doc.get("summary", ""),
"tags": doc.get("tags", []),
"matched_terms": matched_terms,
}
)
results.sort(key=lambda item: item["score"], reverse=True)
payload = {
"query": args.query,
"vault_root": str(Path(args.vault_root)),
"matched_docs": results[: args.limit],
}
print(json.dumps(payload, ensure_ascii=False, indent=2))
if __name__ == "__main__":
main()