# memoria/scripts/utils.py

"""Shared utilities for the personal knowledge base."""

import copy
import hashlib
import json
import re
from pathlib import Path

from config import (
    CONCEPTS_DIR,
    CONNECTIONS_DIR,
    DAILY_DIR,
    INDEX_FILE,
    KNOWLEDGE_DIR,
    LOG_FILE,
    QA_DIR,
    STATE_FILE,
)
from fs_utils import (
    atomic_write_text,
    extract_wikilinks as _extract_wikilinks,
    load_json_with_recovery,
    parse_wikilink,
    safe_article_path,
)


# Baseline state handed to `load_json_with_recovery` by `load_state` as the
# fallback value. Note that "ingested" is itself a mutable dict.
_DEFAULT_STATE: dict = {
    "ingested": {},
    "query_count": 0,
    "last_lint": None,
    "total_cost": 0.0,
}


# ── State management ──────────────────────────────────────────────────

def load_state() -> dict:
    """Load persistent state from ``STATE_FILE``.

    Reading and corruption handling are delegated to
    ``load_json_with_recovery`` (imported from ``fs_utils``), which receives
    a private copy of ``_DEFAULT_STATE`` as its fallback value.

    NOTE(review): the helper is expected to move a corrupt file aside to a
    timestamped backup, log a warning, and return the fallback instead of
    raising ``JSONDecodeError`` (avoiding a silent "full recompile") —
    confirm against ``fs_utils``.

    Returns:
        Whatever ``load_json_with_recovery`` returns for ``STATE_FILE``.
    """
    # Deep copy, not dict(...): `_DEFAULT_STATE["ingested"]` is a nested dict,
    # so a shallow copy would still share it with the module-level default
    # and a caller doing `state["ingested"][key] = ...` would pollute every
    # default handed out afterwards.
    return load_json_with_recovery(STATE_FILE, copy.deepcopy(_DEFAULT_STATE))


def save_state(state: dict) -> None:
    """Persist ``state`` to ``STATE_FILE`` as 2-space-indented JSON.

    The write itself is delegated to ``atomic_write_text``.

    NOTE(review): that helper is expected to write via tmp + fsync + rename
    so an interrupted write leaves the previous state.json intact and the
    partial tmp file is cleaned up — confirm against ``fs_utils``.
    """
    serialized = json.dumps(state, indent=2)
    atomic_write_text(STATE_FILE, serialized)


# ── File hashing ──────────────────────────────────────────────────────

def file_hash(path: Path) -> str:
    """Return the first 16 hex characters of a file's SHA-256 digest.

    The file is hashed incrementally in fixed-size chunks, so memory use
    does not grow with the size of the file.

    Args:
        path: File to hash; it is opened in binary mode.

    Returns:
        The 16-character hex prefix of the SHA-256 digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        # 64 KiB per read keeps the loop cheap without holding the whole
        # file in memory.
        while chunk := handle.read(65536):
            digest.update(chunk)
    return digest.hexdigest()[:16]


# ── Slug / naming ─────────────────────────────────────────────────────

def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    Steps, in order: lowercase and trim; drop every character that is not a
    word character, whitespace, or hyphen; turn runs of whitespace and
    underscores into a single hyphen; collapse repeated hyphens; trim
    leading and trailing hyphens. May return an empty string.
    """
    kept = re.sub(r"[^\w\s-]", "", text.lower().strip())
    hyphenated = re.sub(r"[\s_]+", "-", kept)
    return re.sub(r"-+", "-", hyphenated).strip("-")


# ── Wikilink helpers ──────────────────────────────────────────────────

def extract_wikilinks(content: str) -> list[str]:
    """Return the [[wikilinks]] found in markdown ``content``.

    Thin public wrapper around ``fs_utils.extract_wikilinks``; the result is
    passed through unchanged.

    NOTE(review): the documented contract is that pipe-aliased links
    (``[[target|display]]``) come back as the bare ``target`` slug with the
    display text stripped, so callers should treat the values as
    filesystem-relative slugs rather than raw link text — confirm against
    ``fs_utils``.
    """
    links = _extract_wikilinks(content)
    return links


def wiki_article_exists(link: str) -> bool:
    """Report whether the article a wikilink points to exists on disk.

    The link is resolved with ``safe_article_path(link, KNOWLEDGE_DIR)``.
    When that helper returns ``None`` the answer is ``False`` and the
    filesystem is not queried here; otherwise the resolved path's existence
    is returned.

    NOTE(review): ``safe_article_path`` is expected to strip pipe aliases
    and return ``None`` for links that would escape ``KNOWLEDGE_DIR`` (e.g.
    ``../../etc/passwd``) — confirm against ``fs_utils``.
    """
    resolved = safe_article_path(link, KNOWLEDGE_DIR)
    if resolved is None:
        return False
    return resolved.exists()


# ── Wiki content helpers ──────────────────────────────────────────────

def read_wiki_index() -> str:
    """Return the text of the index file, or an empty index table.

    When ``INDEX_FILE`` does not exist, a heading plus a table header with
    the columns Article / Summary / Compiled From / Updated is returned
    instead of reading from disk.
    """
    if not INDEX_FILE.exists():
        # No index yet: hand back the bare table skeleton.
        return "# Knowledge Base Index\n\n| Article | Summary | Compiled From | Updated |\n|---------|---------|---------------|---------|"
    return INDEX_FILE.read_text(encoding="utf-8")


def read_all_wiki_content() -> str:
    """Concatenate the index and every wiki article into one string.

    The first section is the index (via ``read_wiki_index``). It is followed
    by one section per ``*.md`` file in the concepts, connections, and Q&A
    directories, in that directory order and sorted within each directory.
    Each article section is headed by its path relative to
    ``KNOWLEDGE_DIR``; sections are separated by a ``---`` rule.
    Directories that do not exist are skipped.
    """
    sections = [f"## INDEX\n\n{read_wiki_index()}"]

    for folder in (CONCEPTS_DIR, CONNECTIONS_DIR, QA_DIR):
        if folder.exists():
            for article in sorted(folder.glob("*.md")):
                rel = article.relative_to(KNOWLEDGE_DIR)
                content = article.read_text(encoding="utf-8")
                sections.append(f"## {rel}\n\n{content}")

    separator = "\n\n---\n\n"
    return separator.join(sections)


def list_wiki_articles() -> list[Path]:
    """Return every ``*.md`` file in the concepts, connections, and Q&A dirs.

    Directories are visited in that order and files are sorted within each
    directory; a directory that does not exist contributes nothing.
    """
    return [
        md_path
        for folder in (CONCEPTS_DIR, CONNECTIONS_DIR, QA_DIR)
        if folder.exists()
        for md_path in sorted(folder.glob("*.md"))
    ]


def list_raw_files() -> list[Path]:
    """Return the sorted ``*.md`` files in ``DAILY_DIR`` (the daily logs).

    An empty list is returned when the directory does not exist.
    """
    return sorted(DAILY_DIR.glob("*.md")) if DAILY_DIR.exists() else []


# ── Index helpers ─────────────────────────────────────────────────────

def count_inbound_links(target: str, exclude_file: Path | None = None) -> int:
    """Count the wiki articles that contain at least one link to ``target``.

    Both ``target`` and every link found in an article are normalised with
    ``parse_wikilink`` before comparison, so an article containing
    ``[[concepts/foo|Display]]`` counts as linking to ``concepts/foo``.
    Each article counts at most once, however many matching links it has.

    Args:
        target: The link (or slug) to look for.
        exclude_file: An article path to skip entirely, e.g. the target's
            own file; it is compared by equality against the paths from
            ``list_wiki_articles``.

    Returns:
        The number of linking articles.
    """
    wanted = parse_wikilink(target)

    def _mentions_target(page: Path) -> bool:
        # True as soon as one link in the page normalises to the target.
        text = page.read_text(encoding="utf-8")
        for raw_link in _extract_wikilinks(text):
            if parse_wikilink(raw_link) == wanted:
                return True
        return False

    # The exclusion test comes first so the skipped file is never read.
    return sum(
        1
        for page in list_wiki_articles()
        if page != exclude_file and _mentions_target(page)
    )


def get_article_word_count(path: Path) -> int:
    """Count whitespace-separated words in an article, minus YAML frontmatter.

    Frontmatter is recognised only when the very first line is ``---`` and a
    later line consisting solely of ``---`` closes it (trailing spaces or
    tabs on either delimiter line are tolerated). A ``---`` that merely
    appears inside a frontmatter value does not end the block. When no
    closing delimiter line exists, nothing is stripped and the whole file
    is counted.

    Args:
        path: Article file, read as UTF-8.

    Returns:
        The number of whitespace-separated tokens in the body.
    """
    content = path.read_text(encoding="utf-8")
    # Anchor both delimiters to whole lines; a plain substring search for
    # "---" would stop at e.g. `title: a --- b` and count the rest of the
    # frontmatter as body text.
    frontmatter = re.match(
        r"\A---[ \t]*\n.*?^---[ \t]*$",
        content,
        re.DOTALL | re.MULTILINE,
    )
    if frontmatter is not None:
        content = content[frontmatter.end():]
    return len(content.split())


def build_index_entry(rel_path: str, summary: str, sources: str, updated: str) -> str:
    """Build one Markdown table row for the knowledge base index.

    The cells are, in order: a wikilink to the article, the summary, the
    sources, and the updated value.

    Args:
        rel_path: Article path such as ``concepts/foo.md``; a trailing
            ``.md`` is dropped to form the wikilink target.
        summary: Text for the summary cell, inserted verbatim.
        sources: Text for the sources cell, inserted verbatim.
        updated: Text for the updated cell, inserted verbatim.

    Returns:
        The row as a single line, without a trailing newline.
    """
    # Strip only the extension: str.replace(".md", "") would also delete
    # ".md" from the middle of a name such as "notes.md-format.md".
    link = rel_path.removesuffix(".md")
    return f"| [[{link}]] | {summary} | {sources} | {updated} |"