# memoria/scripts/compile.py

"""
Compile daily conversation logs into structured knowledge articles.

This is the "LLM compiler" - it reads daily logs (source code) and produces
organized knowledge articles (the executable).

Usage:
    uv run python compile.py                    # compile new/changed logs only
    uv run python compile.py --all              # force recompile everything
    uv run python compile.py --file daily/2026-04-01.md  # compile a specific log
    uv run python compile.py --dry-run          # show what would be compiled
"""

from __future__ import annotations

import argparse
import asyncio
import os
import re
import sys
from pathlib import Path

from config import AGENTS_FILE, CONCEPTS_DIR, CONNECTIONS_DIR, DAILY_DIR, KNOWLEDGE_DIR, now_iso
from utils import (
    file_hash,
    list_raw_files,
    list_wiki_articles,
    load_state,
    read_wiki_index,
    save_state,
)

# ── Project root ──────────────────────────────────────────────────────
# Two levels above this script; used as the working directory for every
# LLM invocation.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Model used for compilation. Defaults to Sonnet because knowledge
# extraction benefits from strong reasoning; set MEMORIA_COMPILE_MODEL to
# experiment with a different one.
COMPILE_MODEL = os.getenv("MEMORIA_COMPILE_MODEL", "sonnet")

# Size limit (in characters) for the log text sent in a single LLM call.
# Daily logs larger than this are cut along `### ` section boundaries and
# each piece is compiled by its own fresh Claude invocation. The pieces
# merge naturally because every write goes through Edit on shared files
# (index.md, existing concept articles).
#
# The default of 100K chars is roughly 25K tokens, which stays well under
# Claude's context window even after the schema, index, instructions and
# headroom are added. Override with MEMORIA_MAX_LOG_CHARS.
MAX_LOG_CHARS_PER_CHUNK = int(os.getenv("MEMORIA_MAX_LOG_CHARS", "100000"))


def _split_log_into_chunks(log_content: str, max_chars: int) -> list[str]:
    """Split a daily log by ### section headers if it exceeds max_chars.

    Returns a list of chunk strings where each chunk is <= max_chars (unless
    a single section itself exceeds max_chars, in which case the section is
    emitted as its own oversized chunk — preferable to splitting mid-thought).

    If the whole log is <= max_chars, returns a single-element list.
    """
    if len(log_content) <= max_chars:
        return [log_content]

    # Split at ### boundaries, keeping the header attached to its body.
    parts = re.split(r"(?m)(?=^### )", log_content)

    chunks: list[str] = []
    current = ""
    for part in parts:
        if not part:
            continue
        # If this part alone exceeds max_chars, emit it as its own chunk.
        if len(part) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(part)
            continue
        # If appending would overflow, close out current and start new.
        if current and len(current) + len(part) > max_chars:
            chunks.append(current)
            current = part
        else:
            current += part

    if current:
        chunks.append(current)

    return chunks


def _build_prompt(log_name: str, chunk_body: str, *, chunk_info: str = "") -> str:
    """Render the compile prompt for one daily log (or one chunk of it).

    The prompt deliberately does NOT embed the text of existing articles.
    Unlike upstream, which inlined the whole wiki on every call (exploding
    cost and hitting context limits past ~50 articles — upstream issues
    #3/#5/#9), only three inputs are inlined:

      * the schema, read from AGENTS_FILE as UTF-8 — the structural rules
      * the current wiki index, as returned by read_wiki_index()
      * ``chunk_body`` — the log material to compile

    The prompt instructs the compiler to use its Read tool to pull in only
    the existing articles it judges relevant, so prompt size does not grow
    with the knowledge base.

    ``log_name`` is shown as the source file name and used in the log.md
    entry template. ``chunk_info`` is appended verbatim to the opening
    paragraph; pass an empty string when the log was not split.
    """
    schema = AGENTS_FILE.read_text(encoding="utf-8")
    wiki_index = read_wiki_index()
    timestamp = now_iso()

    # Values interpolated into the instructions below.
    entry_date = timestamp[:10]  # first 10 characters of the timestamp
    index_path = KNOWLEDGE_DIR / 'index.md'
    changelog_path = KNOWLEDGE_DIR / 'log.md'

    return f"""You are a knowledge compiler. Your job is to read a daily conversation log
and extract knowledge into structured wiki articles.{chunk_info}

## Schema (AGENTS.md)

{schema}

## Current Wiki Index

The index below lists every existing wiki article with a one-line summary.
When extracting concepts, check this index first. If a concept already
exists, use the Read tool to fetch its current content and update it
rather than duplicating. Only fetch articles you actually need — do not
read the entire wiki.

{wiki_index}

## Daily Log to Compile

**File:** {log_name}

{chunk_body}

## Your Task

Read the daily log above and compile it into wiki articles following the schema exactly.

### Rules:

1. **Consult the index first.** Identify which concepts in the daily log
   already have articles (use the Read tool to fetch them) and which are
   new. Do not list or read the whole wiki — only what's relevant.
2. **Extract key concepts** - Identify 3-7 distinct concepts worth their own article
3. **Create concept articles** in `knowledge/concepts/` - One .md file per concept
   - Use the exact article format from AGENTS.md (YAML frontmatter + sections)
   - Include `sources:` in frontmatter pointing to the daily log file
   - Use `[[concepts/slug]]` wikilinks to link to related concepts
   - Write in encyclopedia style - neutral, comprehensive
4. **Create connection articles** in `knowledge/connections/` if this log reveals non-obvious
   relationships between 2+ existing concepts
5. **Update existing articles** if this log adds new information to concepts already in the wiki
   - Read the existing article, add the new information, add the source to frontmatter
6. **Update knowledge/index.md** - Add new entries to the table
   - Each entry: `| [[path/slug]] | One-line summary | source-file | {entry_date} |`
7. **Append to knowledge/log.md** - Add a timestamped entry:
   ```
   ## [{timestamp}] compile | {log_name}
   - Source: daily/{log_name}
   - Articles created: [[concepts/x]], [[concepts/y]]
   - Articles updated: [[concepts/z]] (if any)
   ```

### File paths:
- Write concept articles to: {CONCEPTS_DIR}
- Write connection articles to: {CONNECTIONS_DIR}
- Update index at: {index_path}
- Append log at: {changelog_path}

### Quality standards:
- Every article must have complete YAML frontmatter
- Every article must link to at least 2 other articles via [[wikilinks]]
- Key Points section should have 3-5 bullet points
- Details section should have 2+ paragraphs
- Related Concepts section should have 2+ entries
- Sources section should cite the daily log with specific claims extracted
"""


async def _invoke_llm(prompt: str) -> tuple[float, bool]:
    """Run one LLM compile pass. Returns (cost_usd, success).

    The agent runs with ROOT_DIR as its working directory and writes the
    wiki files itself through its file tools, so the only thing consumed
    from the message stream is the final ResultMessage (for the cost and
    the error flag). Assistant text is intentionally ignored.

    success is False when either
      * the SDK raised an exception, or
      * the run finished but its ResultMessage is flagged ``is_error``
        (the run ended abnormally and the wiki may be only partly updated).

    In both cases the caller must NOT mark the daily log as compiled in
    state.json, so the log is retried on the next run rather than silently
    dropped.

    cost_usd is the total reported by the last ResultMessage seen, or 0.0
    if none arrived (e.g. the SDK failed before producing a result).
    """
    # Imported lazily so the module (and --dry-run) loads without the SDK.
    from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query

    cost = 0.0
    succeeded = True
    try:
        async for message in query(
            prompt=prompt,
            options=ClaudeAgentOptions(
                cwd=str(ROOT_DIR),
                model=COMPILE_MODEL,
                system_prompt={"type": "preset", "preset": "claude_code"},
                allowed_tools=["Read", "Write", "Edit", "Glob", "Grep"],
                permission_mode="acceptEdits",
                max_turns=30,
            ),
        ):
            if not isinstance(message, ResultMessage):
                continue  # LLM writes files directly via tools
            cost = message.total_cost_usd or 0.0
            print(f"    Cost: ${cost:.4f}")
            # A result can be delivered without any exception and still
            # describe a failed run; treat that as a failure too.
            # getattr keeps this tolerant of SDK versions lacking the field.
            if getattr(message, "is_error", False):
                succeeded = False
                reason = getattr(message, "subtype", None) or "unknown"
                print(f"    LLM run ended with an error result: {reason}")
    except Exception as e:
        # Top-level boundary for one compile pass: report the failure and
        # let the caller decide what to do; the cost seen so far is kept.
        print(f"    SDK error: {e}")
        return cost, False
    return cost, succeeded


async def compile_daily_log(log_path: Path, state: dict) -> float:
    """Compile a single daily log into knowledge articles.

    Splits large logs into `### `-bounded chunks before invoking the LLM,
    so a single call never receives an oversized daily log. Chunks are
    compiled sequentially; the first failing chunk aborts the remainder.

    State is only updated when ALL chunks succeed — partial failure leaves
    the log flagged as uncompiled so the next run retries it. On success
    the log's entry under ``state["ingested"]`` (keyed by file name) is
    replaced, ``state["total_cost"]`` is increased, and the state is
    persisted with save_state().

    The hash stored for the log is taken BEFORE the log is read, not after
    compilation. Daily logs can still be appended to while the LLM is
    running; hashing afterwards would record the fingerprint of content
    that was never compiled, and later runs would skip it as up to date.

    Args:
        log_path: Path of the daily log file to compile.
        state: Mutable compile state; modified in place on success.

    Returns:
        Total API cost of the compilation (sum across the chunks that
        were attempted, including a failed one).
    """
    # Hash first, read second: if the file changes in between (or at any
    # point during compilation) the stored hash is stale in the safe
    # direction and simply triggers a recompile on the next run.
    # NOTE(review): assumes file_hash() fingerprints the file's current
    # on-disk content — confirm against utils.file_hash.
    source_hash = file_hash(log_path)
    log_content = log_path.read_text(encoding="utf-8")
    chunks = _split_log_into_chunks(log_content, MAX_LOG_CHARS_PER_CHUNK)
    chunk_count = len(chunks)

    total_cost = 0.0

    for i, chunk in enumerate(chunks, 1):
        # Only tell the model about chunking when the log was actually split.
        chunk_info = (
            f"\n\n(Chunk {i} of {chunk_count} — compile the sections in this chunk; "
            "remaining chunks of the same log follow in subsequent calls.)"
            if chunk_count > 1
            else ""
        )
        prompt = _build_prompt(log_path.name, chunk, chunk_info=chunk_info)
        print(f"  Chunk {i}/{chunk_count} ({len(chunk):,} chars)...")
        cost, ok = await _invoke_llm(prompt)
        total_cost += cost
        if not ok:
            # Leave state untouched so the whole log is retried later.
            print("  FAILED: log not marked compiled; will retry on next run.")
            return total_cost

    # All chunks succeeded — record the log as compiled and persist state.
    state.setdefault("ingested", {})[log_path.name] = {
        "hash": source_hash,
        "compiled_at": now_iso(),
        "cost_usd": total_cost,
        "chunks": chunk_count,
    }
    state["total_cost"] = state.get("total_cost", 0.0) + total_cost
    save_state(state)

    return total_cost


def main() -> None:
    """Command-line entry point: pick the logs to compile and compile them.

    Selection:
      * ``--file PATH``: compile exactly that log. A relative path is first
        looked up by file name inside DAILY_DIR, then relative to ROOT_DIR;
        the process exits with status 1 if neither exists.
      * ``--all``: compile every log returned by list_raw_files().
      * default: compile logs that have no entry in ``state["ingested"]``
        or whose stored hash differs from the file's current hash.

    ``--dry-run`` prints the selection and returns without compiling.

    All logs are compiled sequentially inside a single event loop. A
    summary with the total cost and article count is printed at the end.
    If any log failed to compile, the failures are listed and the process
    exits with status 1 so calling scripts can detect the problem.
    """
    parser = argparse.ArgumentParser(description="Compile daily logs into knowledge articles")
    parser.add_argument("--all", action="store_true", help="Force recompile all logs")
    parser.add_argument("--file", type=str, help="Compile a specific daily log file")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be compiled")
    args = parser.parse_args()

    state = load_state()

    # Determine which files to compile
    if args.file:
        target = Path(args.file)
        if not target.is_absolute():
            target = DAILY_DIR / target.name
        if not target.exists():
            # Try resolving relative to project root
            target = ROOT_DIR / args.file
        if not target.exists():
            print(f"Error: {args.file} not found")
            sys.exit(1)
        to_compile = [target]
    elif args.all:
        to_compile = list_raw_files()
    else:
        ingested = state.get("ingested", {})
        to_compile = []
        for log_path in list_raw_files():
            prev = ingested.get(log_path.name, {})
            # New log, or content changed since it was last compiled.
            if not prev or prev.get("hash") != file_hash(log_path):
                to_compile.append(log_path)

    if not to_compile:
        print("Nothing to compile - all daily logs are up to date.")
        return

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Files to compile ({len(to_compile)}):")
    for f in to_compile:
        print(f"  - {f.name}")

    if args.dry_run:
        return

    failed: list[str] = []

    async def _compile_all() -> float:
        total = 0.0
        for i, log_path in enumerate(to_compile, 1):
            print(f"\n[{i}/{len(to_compile)}] Compiling {log_path.name}...")
            # compile_daily_log only returns the cost. It installs a NEW
            # entry object under state["ingested"] when (and only when) the
            # whole log compiled, so an unchanged entry means failure.
            entry_before = state.get("ingested", {}).get(log_path.name)
            total += await compile_daily_log(log_path, state)
            entry_after = state.get("ingested", {}).get(log_path.name)
            if entry_after is not None and entry_after is not entry_before:
                print("  Done.")
            else:
                # compile_daily_log has already printed the failure notice.
                failed.append(log_path.name)
        return total

    # Single event-loop lifecycle for the whole batch — avoids reinit overhead
    # and lets any async resources in the SDK settle predictably.
    total_cost = asyncio.run(_compile_all())

    articles = list_wiki_articles()
    print(f"\nCompilation complete. Total cost: ${total_cost:.2f}")
    print(f"Knowledge base: {len(articles)} articles")

    if failed:
        print(f"{len(failed)} log(s) failed to compile: {', '.join(failed)}")
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()