"""
Compile daily conversation logs into structured knowledge articles.

This is the "LLM compiler" - it reads daily logs (source code) and produces
organized knowledge articles (the executable).

Usage:
    uv run python compile.py                             # compile new/changed logs only
    uv run python compile.py --all                       # force recompile everything
    uv run python compile.py --file daily/2026-04-01.md  # compile a specific log
    uv run python compile.py --dry-run                   # show what would be compiled
"""

from __future__ import annotations

import argparse
import asyncio
import os
import re
import sys
from pathlib import Path

from config import AGENTS_FILE, CONCEPTS_DIR, CONNECTIONS_DIR, DAILY_DIR, KNOWLEDGE_DIR, now_iso
from utils import (
    file_hash,
    list_raw_files,
    list_wiki_articles,
    load_state,
    read_wiki_index,
    save_state,
)

# ── Paths for the LLM to use ──────────────────────────────────────────
ROOT_DIR = Path(__file__).resolve().parent.parent

# Compilation model (Sonnet by default — knowledge extraction benefits from
# strong reasoning; override via MEMORIA_COMPILE_MODEL for experiments).
COMPILE_MODEL = os.environ.get("MEMORIA_COMPILE_MODEL", "sonnet")

# Chunk threshold for large daily logs. Anything above ~100K chars gets
# split along `### ` section boundaries so a single LLM call never
# receives the whole log when it's oversized. Each chunk compiles via a
# fresh Claude invocation; they merge naturally because all writes go
# through Edit on shared files (index.md, existing concept articles).
#
# 100K chars ≈ 25K tokens — well under Claude's context window even
# after schema + index + instructions + headroom.
MAX_LOG_CHARS_PER_CHUNK = int(os.environ.get("MEMORIA_MAX_LOG_CHARS", "100000"))


def _split_log_into_chunks(log_content: str, max_chars: int) -> list[str]:
    """Split a daily log by ### section headers if it exceeds max_chars.

    Args:
        log_content: Full text of the daily log.
        max_chars: Soft upper bound on the length of each returned chunk.

    Returns:
        A list of chunk strings where each chunk is <= max_chars (unless a
        single section itself exceeds max_chars, in which case the section is
        emitted as its own oversized chunk — preferable to splitting
        mid-thought). If the whole log is <= max_chars, returns a
        single-element list. Concatenating the chunks reproduces the input.
    """
    if len(log_content) <= max_chars:
        return [log_content]

    # Zero-width split at every line starting with "### ", so each header
    # stays attached to the body that follows it.
    sections = re.split(r"(?m)(?=^### )", log_content)

    chunks: list[str] = []
    pending = ""
    for section in sections:
        if not section:
            continue

        # A section that is oversized on its own is never merged with its
        # neighbours: flush whatever is pending, then emit it alone.
        if len(section) > max_chars:
            if pending:
                chunks.append(pending)
                pending = ""
            chunks.append(section)
            continue

        # If appending would overflow, close out the pending chunk first.
        if pending and len(pending) + len(section) > max_chars:
            chunks.append(pending)
            pending = section
        else:
            pending += section

    if pending:
        chunks.append(pending)
    return chunks


def _build_prompt(log_name: str, chunk_body: str, *, chunk_info: str = "") -> str:
    """Assemble the compile prompt.

    Unlike upstream, we do NOT inline every existing article into the prompt —
    that would send the whole wiki on every call, exploding cost and hitting
    context limits past ~50 articles (upstream issues #3/#5/#9).

    Instead, we provide:
      * the schema (AGENTS.md) — stable structural rules
      * the current index — lets the compiler identify which concepts exist
      * the daily log — the new material to compile

    The compiler uses its Read tool to fetch specific existing articles it
    deems relevant (index has paths + summaries), keeping prompt size bounded
    regardless of knowledge-base size.

    Args:
        log_name: File name of the daily log, used in the prompt text and in
            the log.md entry template.
        chunk_body: The log text (whole log or one chunk) to compile.
        chunk_info: Optional note appended to the opening paragraph, used to
            tell the model which chunk of a multi-chunk log this is.

    Returns:
        The complete prompt string.
    """
    schema = AGENTS_FILE.read_text(encoding="utf-8")
    wiki_index = read_wiki_index()
    timestamp = now_iso()

    return f"""You are a knowledge compiler. Your job is to read a daily conversation log and
extract knowledge into structured wiki articles.{chunk_info}

## Schema (AGENTS.md)

{schema}

## Current Wiki Index

The index below lists every existing wiki article with a one-line summary.
When extracting concepts, check this index first. If a concept already exists,
use the Read tool to fetch its current content and update it rather than
duplicating. Only fetch articles you actually need — do not read the entire wiki.

{wiki_index}

## Daily Log to Compile

**File:** {log_name}

{chunk_body}

## Your Task

Read the daily log above and compile it into wiki articles following the schema exactly.

### Rules:

1. **Consult the index first.** Identify which concepts in the daily log already
   have articles (use the Read tool to fetch them) and which are new. Do not list
   or read the whole wiki — only what's relevant.
2. **Extract key concepts** - Identify 3-7 distinct concepts worth their own article
3. **Create concept articles** in `knowledge/concepts/`
   - One .md file per concept
   - Use the exact article format from AGENTS.md (YAML frontmatter + sections)
   - Include `sources:` in frontmatter pointing to the daily log file
   - Use `[[concepts/slug]]` wikilinks to link to related concepts
   - Write in encyclopedia style - neutral, comprehensive
4. **Create connection articles** in `knowledge/connections/` if this log reveals
   non-obvious relationships between 2+ existing concepts
5. **Update existing articles** if this log adds new information to concepts already in the wiki
   - Read the existing article, add the new information, add the source to frontmatter
6. **Update knowledge/index.md** - Add new entries to the table
   - Each entry: `| [[path/slug]] | One-line summary | source-file | {timestamp[:10]} |`
7. **Append to knowledge/log.md** - Add a timestamped entry:
   ```
   ## [{timestamp}] compile | {log_name}
   - Source: daily/{log_name}
   - Articles created: [[concepts/x]], [[concepts/y]]
   - Articles updated: [[concepts/z]] (if any)
   ```

### File paths:
- Write concept articles to: {CONCEPTS_DIR}
- Write connection articles to: {CONNECTIONS_DIR}
- Update index at: {KNOWLEDGE_DIR / 'index.md'}
- Append log at: {KNOWLEDGE_DIR / 'log.md'}

### Quality standards:
- Every article must have complete YAML frontmatter
- Every article must link to at least 2 other articles via [[wikilinks]]
- Key Points section should have 3-5 bullet points
- Details section should have 2+ paragraphs
- Related Concepts section should have 2+ entries
- Sources section should cite the daily log with specific claims extracted
"""


async def _invoke_llm(prompt: str) -> tuple[float, bool]:
    """Run one LLM compile pass.

    Args:
        prompt: The full compile prompt for one log (or one chunk of a log).

    Returns:
        (cost_usd, success). success=False means either the SDK raised an
        exception or the final result message reported a failed run — in both
        cases the caller must NOT mark the daily log as compiled in
        state.json, so the log is retried on the next run rather than
        silently dropped.
    """
    # Imported lazily so --dry-run and "nothing to compile" runs do not need
    # the SDK to be importable.
    from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query

    cost = 0.0
    run_failed = False

    try:
        async for message in query(
            prompt=prompt,
            options=ClaudeAgentOptions(
                cwd=str(ROOT_DIR),
                model=COMPILE_MODEL,
                system_prompt={"type": "preset", "preset": "claude_code"},
                allowed_tools=["Read", "Write", "Edit", "Glob", "Grep"],
                permission_mode="acceptEdits",
                max_turns=30,
            ),
        ):
            # The LLM writes files directly via tools, so intermediate
            # assistant messages carry nothing we need; only the final
            # result message (cost + outcome) matters here.
            if not isinstance(message, ResultMessage):
                continue

            cost = message.total_cost_usd or 0.0
            print(f" Cost: ${cost:.4f}")

            # A run can end without raising and still have failed (e.g. it
            # hit max_turns before finishing). Read defensively via getattr.
            # NOTE(review): assumes failed runs set `is_error` or use an
            # "error..." subtype — confirm against the installed SDK version.
            subtype = getattr(message, "subtype", "") or ""
            if getattr(message, "is_error", False) or subtype.startswith("error"):
                run_failed = True
                print(f" SDK reported failure: {subtype or 'is_error'}")
    except Exception as e:
        # Deliberate top-level boundary: any SDK failure is reported and
        # turned into success=False so the batch can continue.
        print(f" SDK error: {e}")
        return cost, False

    return cost, not run_failed


async def compile_daily_log(log_path: Path, state: dict) -> float:
    """Compile a single daily log into knowledge articles.

    Splits large logs into `### `-bounded chunks before invoking the LLM, so a
    single call never receives an oversized daily log. State is only updated
    when ALL chunks succeed — partial failure leaves the log flagged as
    uncompiled so the next run retries it.

    Args:
        log_path: Path of the daily log to compile.
        state: Mutable compile state; on success its "ingested" entry for this
            log and its "total_cost" are updated and the state is saved.

    Returns:
        Total API cost of the compilation (sum across chunks), including the
        cost of chunks that ran before a failure.
    """
    # Hash BEFORE reading. If the log is appended to while the LLM runs, the
    # stored hash must describe content no newer than what was compiled;
    # otherwise the next run would see a matching hash and skip the new text.
    source_hash = file_hash(log_path)
    log_content = log_path.read_text(encoding="utf-8")

    chunks = _split_log_into_chunks(log_content, MAX_LOG_CHARS_PER_CHUNK)
    chunk_count = len(chunks)

    total_cost = 0.0
    all_succeeded = True

    for i, chunk in enumerate(chunks, 1):
        chunk_info = (
            f"\n\n(Chunk {i} of {chunk_count} — compile the sections in this chunk; "
            "remaining chunks of the same log follow in subsequent calls.)"
            if chunk_count > 1
            else ""
        )
        prompt = _build_prompt(log_path.name, chunk, chunk_info=chunk_info)
        print(f" Chunk {i}/{chunk_count} ({len(chunk):,} chars)...")

        cost, ok = await _invoke_llm(prompt)
        total_cost += cost
        if not ok:
            # Stop at the first failed chunk; later chunks would be redone on
            # the retry anyway.
            all_succeeded = False
            break

    if not all_succeeded:
        print(" FAILED: log not marked compiled; will retry on next run.")
        return total_cost

    # All chunks succeeded — record the log as compiled and persist state.
    rel_path = log_path.name
    state.setdefault("ingested", {})[rel_path] = {
        "hash": source_hash,
        "compiled_at": now_iso(),
        "cost_usd": total_cost,
        "chunks": chunk_count,
    }
    state["total_cost"] = state.get("total_cost", 0.0) + total_cost
    save_state(state)

    return total_cost


def main() -> None:
    """Command-line entry point: choose which logs to compile and compile them.

    With --file, compiles that one log; with --all, every log returned by
    list_raw_files(); otherwise only logs that have no state entry or whose
    current hash differs from the recorded one. --dry-run lists the selection
    and exits without calling the LLM.
    """
    parser = argparse.ArgumentParser(description="Compile daily logs into knowledge articles")
    parser.add_argument("--all", action="store_true", help="Force recompile all logs")
    parser.add_argument("--file", type=str, help="Compile a specific daily log file")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be compiled")
    args = parser.parse_args()

    state = load_state()

    # Determine which files to compile
    if args.file:
        target = Path(args.file)
        if not target.is_absolute():
            # Relative names are looked up in the daily-log directory first.
            target = DAILY_DIR / target.name
        if not target.exists():
            # Try resolving relative to project root
            target = ROOT_DIR / args.file
        if not target.exists():
            print(f"Error: {args.file} not found")
            sys.exit(1)
        to_compile = [target]
    else:
        all_logs = list_raw_files()
        if args.all:
            to_compile = all_logs
        else:
            to_compile = []
            for log_path in all_logs:
                prev = state.get("ingested", {}).get(log_path.name, {})
                # New log (no entry) or changed log (hash mismatch).
                if not prev or prev.get("hash") != file_hash(log_path):
                    to_compile.append(log_path)

    if not to_compile:
        print("Nothing to compile - all daily logs are up to date.")
        return

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Files to compile ({len(to_compile)}):")
    for pending_log in to_compile:
        print(f" - {pending_log.name}")

    if args.dry_run:
        return

    async def _compile_all() -> float:
        """Compile the selected logs one after another; return the summed cost."""
        total = 0.0
        for i, log_path in enumerate(to_compile, 1):
            print(f"\n[{i}/{len(to_compile)}] Compiling {log_path.name}...")
            cost = await compile_daily_log(log_path, state)
            total += cost
            print(" Done.")
        return total

    # Single event-loop lifecycle for the whole batch — avoids reinit overhead
    # and lets any async resources in the SDK settle predictably.
    total_cost = asyncio.run(_compile_all())

    articles = list_wiki_articles()
    print(f"\nCompilation complete. Total cost: ${total_cost:.2f}")
    print(f"Knowledge base: {len(articles)} articles")


if __name__ == "__main__":
    main()