Add generic memory gateway v1

This commit is contained in:
2026-05-05 16:18:31 +08:00
parent ba84b1ddb3
commit e65731a273
54 changed files with 4082 additions and 49 deletions

View File

@ -0,0 +1,2 @@
"""Background worker skeletons."""

View File

@ -0,0 +1,186 @@
"""Minimal EverMemOS-style consolidation worker.
This worker is deliberately deterministic for the POC. It extracts stable
candidate memories from session episodes, deduplicates them against existing
records, promotes eligible records, and sends high-risk/high-value candidates
to Obsidian review rather than blindly polluting long-term memory.
"""
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass, field
from memory_gateway.namespace import default_namespace_for_context
from memory_gateway.obsidian_review import write_review_draft
from memory_gateway.repositories import MetadataRepository
from memory_gateway.schemas import (
AccessContext,
EpisodeRecord,
MemoryRecord,
MemoryType,
SourceType,
Visibility,
)
# Sentence splitter: breaks after CJK/ASCII terminal punctuation followed by
# whitespace, or on any run of newlines.
_SENTENCE_RE = re.compile(r"(?<=[。!?.!?])\s+|\n+")
# Whitespace runs, collapsed to a single space during normalization.
_NOISE_RE = re.compile(r"\s+")
@dataclass
class ConsolidationResult:
    """Outcome of one consolidation pass over a single session's episodes."""

    # Session that was consolidated and how many episodes it contained.
    session_id: str
    episodes: int
    # Every candidate extracted, before dedup/conflict/promotion filtering.
    candidates: list[MemoryRecord] = field(default_factory=list)
    # Candidates upserted into long-term storage during this run.
    promoted: list[MemoryRecord] = field(default_factory=list)
    # {"candidate_id": ..., "existing_id": ...} pairs for exact-fingerprint duplicates.
    duplicates: list[dict] = field(default_factory=list)
    # str() of each value returned by write_review_draft — presumably draft
    # file paths; confirm against obsidian_review.
    review_drafts: list[str] = field(default_factory=list)
    # {"candidate_id": ..., "conflict_ids": [...]} entries for polarity clashes.
    conflicts: list[dict] = field(default_factory=list)
class EverMemOSWorker:
    """Deterministic EverMemOS-style consolidation worker.

    Walks a session's episodes, extracts stable candidate memories,
    deduplicates them against the existing store, routes conflicting or
    very high-value candidates to Obsidian review, and promotes the rest.
    """

    def __init__(self, repo: MetadataRepository) -> None:
        self.repo = repo

    def consolidate_session(
        self,
        session_id: str,
        ctx: AccessContext,
        min_importance: float = 0.6,
        target_namespace: str | None = None,
    ) -> ConsolidationResult:
        """Consolidate one session's episodes into long-term memories.

        Each candidate is handled in priority order: dropped as an exact
        duplicate, sent to review on conflict, sent to review when very
        high value (importance >= 0.85), or promoted when it clears both
        the importance and confidence thresholds.
        """
        episode_records = self.repo.list_session_episodes(session_id)
        outcome = ConsolidationResult(session_id=session_id, episodes=len(episode_records))
        known_memories = list(self.repo.list_memories())
        # Fingerprint index of everything already stored (plus promotions made
        # during this run) for cheap exact-duplicate detection.
        fingerprint_index = {
            self._fingerprint(record.content): record for record in known_memories
        }
        for episode in episode_records:
            for candidate in self._extract_candidates(episode, ctx, min_importance, target_namespace):
                outcome.candidates.append(candidate)
                digest = self._fingerprint(candidate.content)
                prior = fingerprint_index.get(digest)
                if prior:
                    outcome.duplicates.append(
                        {"candidate_id": candidate.id, "existing_id": prior.id}
                    )
                    continue
                clashing_ids = self._find_conflicts(candidate, known_memories)
                if clashing_ids:
                    # Conflicting guidance goes to a human reviewer rather than
                    # silently overwriting long-term memory.
                    draft = write_review_draft(candidate, reason="conflict", conflict_ids=clashing_ids)
                    outcome.review_drafts.append(str(draft))
                    outcome.conflicts.append(
                        {"candidate_id": candidate.id, "conflict_ids": clashing_ids}
                    )
                    continue
                if candidate.importance >= 0.85:
                    # Very high-value memories also require human sign-off.
                    draft = write_review_draft(candidate, reason="high_value")
                    outcome.review_drafts.append(str(draft))
                    continue
                if candidate.importance >= min_importance and candidate.confidence >= 0.55:
                    self.repo.upsert_memory(candidate)
                    outcome.promoted.append(candidate)
                    fingerprint_index[digest] = candidate
                    known_memories.append(candidate)
        return outcome

    def _extract_candidates(
        self,
        episode: EpisodeRecord,
        ctx: AccessContext,
        min_importance: float,
        target_namespace: str | None,
    ) -> list[MemoryRecord]:
        """Split one episode into candidate memory records."""
        source_text = episode.summary or episode.content
        sentences: list[str] = []
        for raw_part in _SENTENCE_RE.split(source_text):
            cleaned = self._normalize(raw_part)
            if cleaned:
                sentences.append(cleaned)
        records: list[MemoryRecord] = []
        for sentence in sentences:
            # Very short fragments carry too little signal to keep.
            if len(sentence) < 20:
                continue
            kind = self._classify_type(sentence, episode.tags)
            weight = self._estimate_importance(sentence, episode.tags, min_importance)
            # Summaries are curated, so trust them slightly more than raw content.
            trust = 0.65 if episode.summary else 0.58
            if "workspace" in episode.tags and ctx.workspace_id:
                scope = Visibility.WORKSPACE_SHARED
            else:
                scope = Visibility.PRIVATE
            record_ctx = AccessContext(
                user_id=ctx.user_id,
                agent_id=ctx.agent_id,
                workspace_id=ctx.workspace_id,
                session_id=ctx.session_id,
            )
            records.append(
                MemoryRecord(
                    user_id=ctx.user_id,
                    agent_id=ctx.agent_id,
                    workspace_id=ctx.workspace_id,
                    session_id=episode.session_id,
                    namespace=target_namespace or default_namespace_for_context(record_ctx, scope),
                    memory_type=kind,
                    content=sentence,
                    summary=sentence[:180],
                    tags=list(set(episode.tags + ["promoted-from-session", "evermemos-candidate"])),
                    importance=weight,
                    confidence=trust,
                    visibility=scope,
                    source=SourceType.EVERMEMOS,
                    source_ref=episode.id,
                )
            )
        return records

    def _classify_type(self, text: str, tags: list[str]) -> MemoryType:
        """Heuristically bucket a sentence into a memory type (rules in priority order)."""
        lowered = text.lower()
        rules = (
            ("preference" in tags or "偏好" in text, MemoryType.PREFERENCE),
            ("decision" in tags or "决定" in text or "决策" in text, MemoryType.DECISION),
            ("procedure" in tags or "步骤" in text or "流程" in text, MemoryType.PROCEDURE),
            ("经验" in text or "worked" in lowered or "failed" in lowered, MemoryType.EXPERIENCE),
        )
        for matched, memory_type in rules:
            if matched:
                return memory_type
        return MemoryType.SUMMARY

    def _estimate_importance(self, text: str, tags: list[str], min_importance: float) -> float:
        """Score importance, bumping for directive language and review-worthy tags."""
        score = max(min_importance, 0.6)
        signal_words = ["必须", "不要", "偏好", "长期", "决策", "结论", "重要", "preference", "decision", "must"]
        lowered = text.lower()
        if any(word in lowered for word in signal_words):
            score += 0.15
        if "review" in tags or "high-value" in tags:
            score += 0.2
        # Clamp to a 1.0 ceiling.
        return min(1.0, score)

    def _find_conflicts(self, candidate: MemoryRecord, existing: list[MemoryRecord]) -> list[str]:
        """Return ids of stored memories whose polarity disagrees with *candidate*."""
        lowered_candidate = candidate.content.lower()
        negation_signals = ["不要", "不再", "禁止", "not ", "never", "disable"]
        positive_signals = ["需要", "必须", "启用", "prefer", "always", "enable"]
        candidate_negative = any(signal in lowered_candidate for signal in negation_signals)
        candidate_positive = any(signal in lowered_candidate for signal in positive_signals)
        # Sentences with no directional language cannot conflict.
        if not candidate_negative and not candidate_positive:
            return []
        candidate_tokens = self._tokens(candidate.content)
        clashes: list[str] = []
        for memory in existing:
            # Only compare memories of the same user and type.
            if memory.user_id != candidate.user_id or memory.memory_type != candidate.memory_type:
                continue
            # Require at least two shared tokens before comparing polarity.
            if len(candidate_tokens & self._tokens(memory.content)) < 2:
                continue
            lowered_memory = memory.content.lower()
            memory_negative = any(signal in lowered_memory for signal in negation_signals)
            memory_positive = any(signal in lowered_memory for signal in positive_signals)
            if candidate_negative != memory_negative or candidate_positive != memory_positive:
                clashes.append(memory.id)
        return clashes

    def _tokens(self, text: str) -> set[str]:
        """Lowercased alphanumeric/CJK tokens of length >= 2."""
        pieces = re.split(r"[^a-zA-Z0-9\u4e00-\u9fff]+", text.lower())
        return {piece for piece in pieces if len(piece) >= 2}

    def _normalize(self, text: str) -> str:
        """Collapse whitespace runs and trim list/markdown punctuation."""
        collapsed = _NOISE_RE.sub(" ", text)
        return collapsed.strip(" -_*#\t")

    def _fingerprint(self, text: str) -> str:
        """Stable content digest used for exact-duplicate detection."""
        canonical = self._normalize(text).lower()
        return hashlib.sha1(canonical.encode("utf-8")).hexdigest()