Add generic memory gateway v1

This commit is contained in:
2026-05-05 16:18:31 +08:00
parent ba84b1ddb3
commit e65731a273
54 changed files with 4082 additions and 49 deletions

View File

@ -0,0 +1,2 @@
"""Background worker skeletons."""

View File

@ -0,0 +1,186 @@
"""Minimal EverMemOS-style consolidation worker.
This worker is deliberately deterministic for the POC. It extracts stable
candidate memories from session episodes, deduplicates them against existing
records, promotes eligible records, and sends high-risk/high-value candidates
to Obsidian review rather than blindly polluting long-term memory.
"""
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass, field
from memory_gateway.namespace import default_namespace_for_context
from memory_gateway.obsidian_review import write_review_draft
from memory_gateway.repositories import MetadataRepository
from memory_gateway.schemas import (
AccessContext,
EpisodeRecord,
MemoryRecord,
MemoryType,
SourceType,
Visibility,
)
# Sentence splitter: breaks after CJK/ASCII terminal punctuation followed by
# whitespace, or on any run of newlines.
_SENTENCE_RE = re.compile(r"(?<=[。!?.!?])\s+|\n+")
# Whitespace runs, collapsed to a single space during normalization.
_NOISE_RE = re.compile(r"\s+")
@dataclass
class ConsolidationResult:
    """Outcome of one consolidation pass over a single session's episodes."""

    # Session that was consolidated and how many episodes it contained.
    session_id: str
    episodes: int
    # Every candidate extracted, before dedup/conflict/promotion filtering.
    candidates: list[MemoryRecord] = field(default_factory=list)
    # Candidates upserted into long-term storage during this run.
    promoted: list[MemoryRecord] = field(default_factory=list)
    # {"candidate_id": ..., "existing_id": ...} pairs for exact-fingerprint duplicates.
    duplicates: list[dict] = field(default_factory=list)
    # str() of each value returned by write_review_draft — presumably draft
    # file paths; confirm against obsidian_review.
    review_drafts: list[str] = field(default_factory=list)
    # {"candidate_id": ..., "conflict_ids": [...]} entries for polarity clashes.
    conflicts: list[dict] = field(default_factory=list)
class EverMemOSWorker:
    """Deterministic EverMemOS-style consolidation worker.

    Walks a session's episodes, extracts stable candidate memories,
    deduplicates them against the existing store, routes conflicting or
    very high-value candidates to Obsidian review, and promotes the rest.
    """

    def __init__(self, repo: MetadataRepository) -> None:
        self.repo = repo

    def consolidate_session(
        self,
        session_id: str,
        ctx: AccessContext,
        min_importance: float = 0.6,
        target_namespace: str | None = None,
    ) -> ConsolidationResult:
        """Consolidate one session's episodes into long-term memories.

        Each candidate is handled in priority order: dropped as an exact
        duplicate, sent to review on conflict, sent to review when very
        high value (importance >= 0.85), or promoted when it clears both
        the importance and confidence thresholds.
        """
        episode_records = self.repo.list_session_episodes(session_id)
        outcome = ConsolidationResult(session_id=session_id, episodes=len(episode_records))
        known_memories = list(self.repo.list_memories())
        # Fingerprint index of everything already stored (plus promotions made
        # during this run) for cheap exact-duplicate detection.
        fingerprint_index = {
            self._fingerprint(record.content): record for record in known_memories
        }
        for episode in episode_records:
            for candidate in self._extract_candidates(episode, ctx, min_importance, target_namespace):
                outcome.candidates.append(candidate)
                digest = self._fingerprint(candidate.content)
                prior = fingerprint_index.get(digest)
                if prior:
                    outcome.duplicates.append(
                        {"candidate_id": candidate.id, "existing_id": prior.id}
                    )
                    continue
                clashing_ids = self._find_conflicts(candidate, known_memories)
                if clashing_ids:
                    # Conflicting guidance goes to a human reviewer rather than
                    # silently overwriting long-term memory.
                    draft = write_review_draft(candidate, reason="conflict", conflict_ids=clashing_ids)
                    outcome.review_drafts.append(str(draft))
                    outcome.conflicts.append(
                        {"candidate_id": candidate.id, "conflict_ids": clashing_ids}
                    )
                    continue
                if candidate.importance >= 0.85:
                    # Very high-value memories also require human sign-off.
                    draft = write_review_draft(candidate, reason="high_value")
                    outcome.review_drafts.append(str(draft))
                    continue
                if candidate.importance >= min_importance and candidate.confidence >= 0.55:
                    self.repo.upsert_memory(candidate)
                    outcome.promoted.append(candidate)
                    fingerprint_index[digest] = candidate
                    known_memories.append(candidate)
        return outcome

    def _extract_candidates(
        self,
        episode: EpisodeRecord,
        ctx: AccessContext,
        min_importance: float,
        target_namespace: str | None,
    ) -> list[MemoryRecord]:
        """Split one episode into candidate memory records."""
        source_text = episode.summary or episode.content
        sentences: list[str] = []
        for raw_part in _SENTENCE_RE.split(source_text):
            cleaned = self._normalize(raw_part)
            if cleaned:
                sentences.append(cleaned)
        records: list[MemoryRecord] = []
        for sentence in sentences:
            # Very short fragments carry too little signal to keep.
            if len(sentence) < 20:
                continue
            kind = self._classify_type(sentence, episode.tags)
            weight = self._estimate_importance(sentence, episode.tags, min_importance)
            # Summaries are curated, so trust them slightly more than raw content.
            trust = 0.65 if episode.summary else 0.58
            if "workspace" in episode.tags and ctx.workspace_id:
                scope = Visibility.WORKSPACE_SHARED
            else:
                scope = Visibility.PRIVATE
            record_ctx = AccessContext(
                user_id=ctx.user_id,
                agent_id=ctx.agent_id,
                workspace_id=ctx.workspace_id,
                session_id=ctx.session_id,
            )
            records.append(
                MemoryRecord(
                    user_id=ctx.user_id,
                    agent_id=ctx.agent_id,
                    workspace_id=ctx.workspace_id,
                    session_id=episode.session_id,
                    namespace=target_namespace or default_namespace_for_context(record_ctx, scope),
                    memory_type=kind,
                    content=sentence,
                    summary=sentence[:180],
                    tags=list(set(episode.tags + ["promoted-from-session", "evermemos-candidate"])),
                    importance=weight,
                    confidence=trust,
                    visibility=scope,
                    source=SourceType.EVERMEMOS,
                    source_ref=episode.id,
                )
            )
        return records

    def _classify_type(self, text: str, tags: list[str]) -> MemoryType:
        """Heuristically bucket a sentence into a memory type (rules in priority order)."""
        lowered = text.lower()
        rules = (
            ("preference" in tags or "偏好" in text, MemoryType.PREFERENCE),
            ("decision" in tags or "决定" in text or "决策" in text, MemoryType.DECISION),
            ("procedure" in tags or "步骤" in text or "流程" in text, MemoryType.PROCEDURE),
            ("经验" in text or "worked" in lowered or "failed" in lowered, MemoryType.EXPERIENCE),
        )
        for matched, memory_type in rules:
            if matched:
                return memory_type
        return MemoryType.SUMMARY

    def _estimate_importance(self, text: str, tags: list[str], min_importance: float) -> float:
        """Score importance, bumping for directive language and review-worthy tags."""
        score = max(min_importance, 0.6)
        signal_words = ["必须", "不要", "偏好", "长期", "决策", "结论", "重要", "preference", "decision", "must"]
        lowered = text.lower()
        if any(word in lowered for word in signal_words):
            score += 0.15
        if "review" in tags or "high-value" in tags:
            score += 0.2
        # Clamp to a 1.0 ceiling.
        return min(1.0, score)

    def _find_conflicts(self, candidate: MemoryRecord, existing: list[MemoryRecord]) -> list[str]:
        """Return ids of stored memories whose polarity disagrees with *candidate*."""
        lowered_candidate = candidate.content.lower()
        negation_signals = ["不要", "不再", "禁止", "not ", "never", "disable"]
        positive_signals = ["需要", "必须", "启用", "prefer", "always", "enable"]
        candidate_negative = any(signal in lowered_candidate for signal in negation_signals)
        candidate_positive = any(signal in lowered_candidate for signal in positive_signals)
        # Sentences with no directional language cannot conflict.
        if not candidate_negative and not candidate_positive:
            return []
        candidate_tokens = self._tokens(candidate.content)
        clashes: list[str] = []
        for memory in existing:
            # Only compare memories of the same user and type.
            if memory.user_id != candidate.user_id or memory.memory_type != candidate.memory_type:
                continue
            # Require at least two shared tokens before comparing polarity.
            if len(candidate_tokens & self._tokens(memory.content)) < 2:
                continue
            lowered_memory = memory.content.lower()
            memory_negative = any(signal in lowered_memory for signal in negation_signals)
            memory_positive = any(signal in lowered_memory for signal in positive_signals)
            if candidate_negative != memory_negative or candidate_positive != memory_positive:
                clashes.append(memory.id)
        return clashes

    def _tokens(self, text: str) -> set[str]:
        """Lowercased alphanumeric/CJK tokens of length >= 2."""
        pieces = re.split(r"[^a-zA-Z0-9\u4e00-\u9fff]+", text.lower())
        return {piece for piece in pieces if len(piece) >= 2}

    def _normalize(self, text: str) -> str:
        """Collapse whitespace runs and trim list/markdown punctuation."""
        collapsed = _NOISE_RE.sub(" ", text)
        return collapsed.strip(" -_*#\t")

    def _fingerprint(self, text: str) -> str:
        """Stable content digest used for exact-duplicate detection."""
        canonical = self._normalize(text).lower()
        return hashlib.sha1(canonical.encode("utf-8")).hexdigest()