"""Shared utilities for the personal knowledge base.""" import hashlib import json import re from pathlib import Path from config import ( CONCEPTS_DIR, CONNECTIONS_DIR, DAILY_DIR, INDEX_FILE, KNOWLEDGE_DIR, LOG_FILE, QA_DIR, STATE_FILE, ) from fs_utils import ( atomic_write_text, extract_wikilinks as _extract_wikilinks, load_json_with_recovery, parse_wikilink, safe_article_path, ) _DEFAULT_STATE: dict = {"ingested": {}, "query_count": 0, "last_lint": None, "total_cost": 0.0} # ── State management ────────────────────────────────────────────────── def load_state() -> dict: """Load persistent state from state.json. On file corruption, moves the bad file aside with a timestamped backup, logs a warning, and returns a fresh default state. This avoids the silent "full recompile" failure mode that would otherwise follow a JSONDecodeError. """ # Return a *copy* of defaults so mutations by the caller don't pollute # the module-level default. return load_json_with_recovery(STATE_FILE, dict(_DEFAULT_STATE)) def save_state(state: dict) -> None: """Save state atomically (tmp + fsync + rename). Interruptions during write leave state.json in its previous good state. The partial tmp file is cleaned up on exception. """ atomic_write_text(STATE_FILE, json.dumps(state, indent=2)) # ── File hashing ────────────────────────────────────────────────────── def file_hash(path: Path) -> str: """SHA-256 hash of a file (first 16 hex chars).""" return hashlib.sha256(path.read_bytes()).hexdigest()[:16] # ── Slug / naming ───────────────────────────────────────────────────── def slugify(text: str) -> str: """Convert text to a filename-safe slug.""" text = text.lower().strip() text = re.sub(r"[^\w\s-]", "", text) text = re.sub(r"[\s_]+", "-", text) text = re.sub(r"-+", "-", text) return text.strip("-") # ── Wikilink helpers ────────────────────────────────────────────────── def extract_wikilinks(content: str) -> list[str]: """Extract all [[wikilinks]] from markdown content. Pipe-aliased links ([[target|display]]) are returned as the bare `target` slug — display text is stripped. Callers should treat the return value as filesystem-relative slugs, not raw link text. """ return _extract_wikilinks(content) def wiki_article_exists(link: str) -> bool: """Check if a wikilinked article exists on disk. Resolves via `safe_article_path`: pipe-alias is stripped, the final path is asserted to remain inside `KNOWLEDGE_DIR`, and traversal attempts (`../../etc/passwd`) return False without touching the filesystem outside the knowledge tree. """ path = safe_article_path(link, KNOWLEDGE_DIR) return path is not None and path.exists() # ── Wiki content helpers ────────────────────────────────────────────── def read_wiki_index() -> str: """Read the knowledge base index file.""" if INDEX_FILE.exists(): return INDEX_FILE.read_text(encoding="utf-8") return "# Knowledge Base Index\n\n| Article | Summary | Compiled From | Updated |\n|---------|---------|---------------|---------|" def read_all_wiki_content() -> str: """Read index + all wiki articles into a single string for context.""" parts = [f"## INDEX\n\n{read_wiki_index()}"] for subdir in [CONCEPTS_DIR, CONNECTIONS_DIR, QA_DIR]: if not subdir.exists(): continue for md_file in sorted(subdir.glob("*.md")): rel = md_file.relative_to(KNOWLEDGE_DIR) content = md_file.read_text(encoding="utf-8") parts.append(f"## {rel}\n\n{content}") return "\n\n---\n\n".join(parts) def list_wiki_articles() -> list[Path]: """List all wiki article files.""" articles = [] for subdir in [CONCEPTS_DIR, CONNECTIONS_DIR, QA_DIR]: if subdir.exists(): articles.extend(sorted(subdir.glob("*.md"))) return articles def list_raw_files() -> list[Path]: """List all daily log files.""" if not DAILY_DIR.exists(): return [] return sorted(DAILY_DIR.glob("*.md")) # ── Index helpers ───────────────────────────────────────────────────── def count_inbound_links(target: str, exclude_file: Path | None = None) -> int: """Count how many wiki articles link to a given target. Correctly handles pipe-aliased links — an article containing `[[concepts/foo|Display]]` counts as an inbound link to `concepts/foo`. """ target_slug = parse_wikilink(target) count = 0 for article in list_wiki_articles(): if article == exclude_file: continue content = article.read_text(encoding="utf-8") if any(parse_wikilink(raw) == target_slug for raw in _extract_wikilinks(content)): count += 1 return count def get_article_word_count(path: Path) -> int: """Count words in an article, excluding YAML frontmatter.""" content = path.read_text(encoding="utf-8") # Strip frontmatter if content.startswith("---"): end = content.find("---", 3) if end != -1: content = content[end + 3:] return len(content.split()) def build_index_entry(rel_path: str, summary: str, sources: str, updated: str) -> str: """Build a single index table row.""" link = rel_path.replace(".md", "") return f"| [[{link}]] | {summary} | {sources} | {updated} |"