# beaver_project/app-instance/backend/beaver/services/skill_migration.py

"""Import legacy and staged skills into the Beaver SkillSpecStore."""

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
import io
import json
import re
import zipfile
from pathlib import Path
from typing import Any

from beaver.skills.catalog.utils import parse_frontmatter, strip_frontmatter
from beaver.skills.specs import SkillSpec, SkillSpecStore, SkillVersion
from beaver.skills.specs.serialization import canonical_hash, normalize_frontmatter, summarize_skill_content


@dataclass(slots=True)
class SkillMigrationService:
    store: SkillSpecStore
    repo_root: Path | None = None

    def migrate_all(self) -> dict[str, Any]:
        included: list[dict[str, Any]] = []
        skipped: list[dict[str, Any]] = []
        for path in self._backend_old_skills():
            self._migrate_skill_file(path, "backend-old", included, skipped)
        for path in self._staged_skills():
            self._migrate_skill_file(path, "stevenli-staged", included, skipped)
        for path in self._skill_zips():
            self._migrate_zip(path, included, skipped)
        manifest = {
            "generated_at": _now(),
            "workspace": str(self.store.workspace),
            "included": included,
            "skipped": skipped,
        }
        manifest_path = self.store.workspace / "skill_migration_manifest.json"
        manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
        return manifest

    def _backend_old_skills(self) -> list[Path]:
        root = self._repo_root() / "app-instance" / "backend-old" / "nanobot" / "skills"
        if not root.exists():
            return []
        return sorted(root.glob("*/SKILL.md"))

    def _staged_skills(self) -> list[Path]:
        root = self.store.workspace / "state" / "skill-reviews"
        if not root.exists():
            return []
        return sorted(root.glob("*/staged/*/SKILL.md"))

    def _skill_zips(self) -> list[Path]:
        root = self.store.workspace / "skills"
        if not root.exists():
            return []
        return sorted(root.glob("*.zip"))

    def _repo_root(self) -> Path:
        if self.repo_root is not None:
            return self.repo_root
        return Path(__file__).resolve().parents[4]

    def _migrate_skill_file(self, path: Path, source: str, included: list[dict[str, Any]], skipped: list[dict[str, Any]]) -> None:
        try:
            content = path.read_text(encoding="utf-8")
            result = self._publish_content(content, source=source, source_path=str(path))
            included.append(result)
        except Exception as exc:
            skipped.append({"source": source, "source_path": str(path), "reason": str(exc)})

    def _migrate_zip(self, path: Path, included: list[dict[str, Any]], skipped: list[dict[str, Any]]) -> None:
        try:
            with zipfile.ZipFile(io.BytesIO(path.read_bytes()), "r") as archive:
                entries = [info for info in archive.infolist() if not info.is_dir()]
                skill_entry = _find_skill_entry(entries)
                content = archive.read(skill_entry).decode("utf-8", errors="replace")
                result = self._publish_content(content, source="stevenli-zip", source_path=str(path))
                skill_name = result["skill_name"]
                version = result["version"]
                top = Path(skill_entry).parts[0] if len(Path(skill_entry).parts) == 2 else ""
                for info in entries:
                    raw = info.filename.replace("\\", "/")
                    if raw == skill_entry or raw.startswith("/") or "__MACOSX" in Path(raw).parts:
                        continue
                    parts = Path(raw).parts
                    rel_parts = parts[1:] if top and parts and parts[0] == top else parts
                    if not rel_parts or any(part in {"", ".", ".."} for part in rel_parts):
                        continue
                    target = self.store.root / skill_name / "versions" / version / "/".join(rel_parts)
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_bytes(archive.read(info))
                included.append(result)
        except Exception as exc:
            skipped.append({"source": "stevenli-zip", "source_path": str(path), "reason": str(exc)})

    def _publish_content(self, content: str, *, source: str, source_path: str) -> dict[str, Any]:
        frontmatter, body = parse_frontmatter(content)
        skill_name = _safe_name(str(frontmatter.get("name") or Path(source_path).parent.name))
        if not skill_name:
            raise ValueError("unsafe or missing skill name")
        normalized = normalize_frontmatter(
            {
                **frontmatter,
                "name": skill_name,
                "description": frontmatter.get("description") or skill_name,
            }
        )
        rendered = _render_skill_content(normalized, body)
        content_hash = canonical_hash(rendered)
        existing = self.store.read_published_skill(skill_name)
        if existing is not None and existing.version.content_hash == content_hash:
            return {
                "status": "unchanged",
                "skill_name": skill_name,
                "version": existing.version.version,
                "source": source,
                "source_path": source_path,
            }
        version_id = self._next_version(skill_name)
        now = _now()
        skill_version = SkillVersion(
            skill_name=skill_name,
            version=version_id,
            content_hash=content_hash,
            summary_hash=canonical_hash(strip_frontmatter(rendered).strip()),
            created_at=now,
            created_by="migration",
            change_reason=f"Import skill from {source}",
            parent_version=existing.version.version if existing is not None else None,
            review_state="published",
            frontmatter=normalized,
            summary=summarize_skill_content(body),
            tool_hints=self.store._extract_tool_hints(normalized),
            provenance={"source": source, "source_path": source_path, "imported_at": now},
        )
        self.store.write_skill_version(skill_version, rendered)
        spec = self.store.get_skill_spec(skill_name) or SkillSpec(
            name=skill_name,
            display_name=skill_name,
            description=str(normalized.get("description") or skill_name),
            created_at=now,
            updated_at=now,
            current_version=version_id,
            status="active",
            tags=[],
            owners=["migration"],
            source_kind=source,
            lineage=[],
        )
        spec.current_version = version_id
        spec.updated_at = now
        spec.status = "active"
        spec.source_kind = source
        if "migration" not in spec.owners:
            spec.owners.append("migration")
        self.store.write_skill_spec(spec)
        self.store.set_current_version(skill_name, version_id)
        published = self.store.read_index("published")
        if skill_name not in published:
            published.append(skill_name)
            self.store.update_index("published", published)
        return {"status": "included", "skill_name": skill_name, "version": version_id, "source": source, "source_path": source_path}

    def _next_version(self, skill_name: str) -> str:
        versions = [item for item in self.store.list_versions(skill_name) if item.startswith("v")]
        numbers = [int(item[1:]) for item in versions if item[1:].isdigit()]
        return f"v{(max(numbers) if numbers else 0) + 1:04d}"


def _find_skill_entry(entries: list[zipfile.ZipInfo]) -> str:
    """Locate the archive's ``SKILL.md`` and vet every entry name on the way.

    Entry names are compared with backslashes turned into forward slashes.
    A ``SKILL.md`` qualifies when it sits at the archive root or exactly one
    folder deep; the first qualifying entry in archive order wins.

    Returns:
        The slash-normalized name of the chosen ``SKILL.md`` entry.

    Raises:
        ValueError: If any entry is absolute or has an empty, ``.`` or ``..``
            path component, or if no qualifying ``SKILL.md`` exists.
    """
    forbidden = {"", ".", ".."}
    chosen: str | None = None
    for entry in entries:
        normalized = entry.filename.replace("\\", "/")
        segments = Path(normalized).parts
        if normalized.startswith("/") or not forbidden.isdisjoint(segments):
            raise ValueError(f"unsafe archive entry: {entry.filename}")
        # Keep scanning after a match so later entries are still validated.
        if chosen is None and len(segments) in (1, 2) and segments[-1] == "SKILL.md":
            chosen = normalized
    if chosen is None:
        raise ValueError("zip has no root SKILL.md")
    return chosen


def _safe_name(value: str) -> str:
    """Return a sanitized skill name, or ``""`` when *value* is not usable.

    Surrounding whitespace is trimmed and spaces become hyphens.  The result
    must be non-empty, not ``.`` or ``..``, free of path separators, and made
    up solely of ASCII letters, digits, ``_``, ``.`` and ``-``.
    """
    candidate = value.strip().replace(" ", "-")
    if candidate in {"", ".", ".."}:
        return ""
    if "/" in candidate or "\\" in candidate:
        return ""
    if re.fullmatch(r"[A-Za-z0-9_.-]+", candidate) is None:
        return ""
    return candidate


def _render_skill_content(frontmatter: dict[str, Any], body: str) -> str:
    """Serialize frontmatter and body into ``SKILL.md`` text.

    The frontmatter is passed through ``normalize_frontmatter`` and emitted
    between ``---`` fences: list values become a ``key:`` line followed by
    ``  - item`` lines, every other value becomes ``key: value``.  The stripped
    body follows after a blank line, and the result ends with exactly one
    newline.
    """
    rendered = ["---"]
    for field, payload in normalize_frontmatter(frontmatter).items():
        if not isinstance(payload, list):
            rendered.append(f"{field}: {payload}")
            continue
        rendered.append(f"{field}:")
        rendered.extend(f"  - {element}" for element in payload)
    rendered += ["---", "", body.strip()]
    document = "\n".join(rendered)
    return document.rstrip() + "\n"


def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()