""" Lint the knowledge base for structural and semantic health. Runs 7 checks: broken links, orphan pages, orphan sources, stale articles, contradictions (LLM), missing backlinks, and sparse articles. Usage: uv run python lint.py # all checks uv run python lint.py --structural-only # skip LLM checks (faster, cheaper) """ from __future__ import annotations import os # Recursion guard — set before any SDK import so global SessionEnd/PreCompact # hooks see CLAUDE_INVOKED_BY in the nested Claude subprocess's env and exit # cleanly. See compile.py for full rationale. os.environ["CLAUDE_INVOKED_BY"] = "memoria_lint" import argparse import asyncio from pathlib import Path from config import KNOWLEDGE_DIR, REPORTS_DIR, now_iso, today_iso # Contradiction-check model. Kept as Sonnet for reasoning quality; override # via MEMORIA_LINT_MODEL (e.g. to use a cheaper model for structural runs # that happen to include the LLM check). LINT_MODEL = os.environ.get("MEMORIA_LINT_MODEL", "sonnet") from utils import ( count_inbound_links, extract_wikilinks, file_hash, get_article_word_count, list_raw_files, list_wiki_articles, load_state, read_all_wiki_content, save_state, wiki_article_exists, ) ROOT_DIR = Path(__file__).resolve().parent.parent def check_broken_links() -> list[dict]: """Check for [[wikilinks]] that point to non-existent articles.""" issues = [] for article in list_wiki_articles(): content = article.read_text(encoding="utf-8") rel = article.relative_to(KNOWLEDGE_DIR) for link in extract_wikilinks(content): if link.startswith("daily/"): continue # daily log references are valid if not wiki_article_exists(link): issues.append({ "severity": "error", "check": "broken_link", "file": str(rel), "detail": f"Broken link: [[{link}]] - target does not exist", }) return issues def check_orphan_pages() -> list[dict]: """Check for articles with zero inbound links.""" issues = [] for article in list_wiki_articles(): rel = article.relative_to(KNOWLEDGE_DIR) link_target = str(rel).replace(".md", "").replace("\\", "/") inbound = count_inbound_links(link_target) if inbound == 0: issues.append({ "severity": "warning", "check": "orphan_page", "file": str(rel), "detail": f"Orphan page: no other articles link to [[{link_target}]]", }) return issues def check_orphan_sources() -> list[dict]: """Check for daily logs that haven't been compiled yet.""" state = load_state() ingested = state.get("ingested", {}) issues = [] for log_path in list_raw_files(): if log_path.name not in ingested: issues.append({ "severity": "warning", "check": "orphan_source", "file": f"daily/{log_path.name}", "detail": f"Uncompiled daily log: {log_path.name} has not been ingested", }) return issues def check_stale_articles() -> list[dict]: """Check if source daily logs have changed since compilation.""" state = load_state() ingested = state.get("ingested", {}) issues = [] for log_path in list_raw_files(): rel = log_path.name if rel in ingested: stored_hash = ingested[rel].get("hash", "") current_hash = file_hash(log_path) if stored_hash != current_hash: issues.append({ "severity": "warning", "check": "stale_article", "file": f"daily/{rel}", "detail": f"Stale: {rel} has changed since last compilation", }) return issues def check_missing_backlinks() -> list[dict]: """Check for asymmetric links: A links to B but B doesn't link to A. Skips any source or target under `qa/` or `sources/`: Q&A articles intentionally reference concepts without requiring a reciprocal link (concepts would otherwise accumulate a backlink per question, which drowns real relationships). Also handles pipe-aliased wikilinks via the alias-aware extract_wikilinks helper. """ issues = [] # Aliased/nested variants of the source link that should count as a # valid backlink on the target side: bare slug, and pipe-aliased form. for article in list_wiki_articles(): content = article.read_text(encoding="utf-8") rel = article.relative_to(KNOWLEDGE_DIR) rel_str = str(rel).replace("\\", "/") # Skip one-way source categories. if rel_str.startswith("qa/") or rel_str.startswith("sources/"): continue source_link = rel_str.replace(".md", "") for link in extract_wikilinks(content): if link.startswith("daily/") or link.startswith("qa/") or link.startswith("sources/"): continue target_path = KNOWLEDGE_DIR / f"{link}.md" if target_path.exists(): target_content = target_path.read_text(encoding="utf-8") target_backlinks = extract_wikilinks(target_content) if source_link not in target_backlinks: issues.append({ "severity": "suggestion", "check": "missing_backlink", "file": str(rel), "detail": f"[[{source_link}]] links to [[{link}]] but not vice versa", "auto_fixable": True, }) return issues def check_sparse_articles() -> list[dict]: """Check for articles with fewer than 200 words.""" issues = [] for article in list_wiki_articles(): word_count = get_article_word_count(article) if word_count < 200: rel = article.relative_to(KNOWLEDGE_DIR) issues.append({ "severity": "suggestion", "check": "sparse_article", "file": str(rel), "detail": f"Sparse article: {word_count} words (minimum recommended: 200)", }) return issues async def check_contradictions() -> list[dict]: """Use LLM to detect contradictions across articles.""" from claude_agent_sdk import ( AssistantMessage, ClaudeAgentOptions, ResultMessage, TextBlock, query, ) wiki_content = read_all_wiki_content() prompt = f"""Review this knowledge base for contradictions, inconsistencies, or conflicting claims across articles. ## Knowledge Base {wiki_content} ## Instructions Look for: - Direct contradictions (article A says X, article B says not-X) - Inconsistent recommendations (different articles recommend conflicting approaches) - Outdated information that conflicts with newer entries For each issue found, output EXACTLY one line in this format: CONTRADICTION: [file1] vs [file2] - description of the conflict INCONSISTENCY: [file] - description of the inconsistency If no issues found, output exactly: NO_ISSUES Do NOT output anything else - no preamble, no explanation, just the formatted lines.""" response = "" try: async for message in query( prompt=prompt, options=ClaudeAgentOptions( cwd=str(ROOT_DIR), model=LINT_MODEL, allowed_tools=[], max_turns=2, ), ): if isinstance(message, AssistantMessage): for block in message.content: if isinstance(block, TextBlock): response += block.text except Exception as e: return [{"severity": "error", "check": "contradiction", "file": "(system)", "detail": f"LLM check failed: {e}"}] issues = [] if "NO_ISSUES" not in response: for line in response.strip().split("\n"): line = line.strip() if line.startswith("CONTRADICTION:") or line.startswith("INCONSISTENCY:"): issues.append({ "severity": "warning", "check": "contradiction", "file": "(cross-article)", "detail": line, }) return issues def generate_report(all_issues: list[dict]) -> str: """Generate a markdown lint report.""" errors = [i for i in all_issues if i["severity"] == "error"] warnings = [i for i in all_issues if i["severity"] == "warning"] suggestions = [i for i in all_issues if i["severity"] == "suggestion"] lines = [ f"# Lint Report - {today_iso()}", "", f"**Total issues:** {len(all_issues)}", f"- Errors: {len(errors)}", f"- Warnings: {len(warnings)}", f"- Suggestions: {len(suggestions)}", "", ] for severity, issues, marker in [ ("Errors", errors, "x"), ("Warnings", warnings, "!"), ("Suggestions", suggestions, "?"), ]: if issues: lines.append(f"## {severity}") lines.append("") for issue in issues: fixable = " (auto-fixable)" if issue.get("auto_fixable") else "" lines.append(f"- **[{marker}]** `{issue['file']}` - {issue['detail']}{fixable}") lines.append("") if not all_issues: lines.append("All checks passed. Knowledge base is healthy.") lines.append("") return "\n".join(lines) def main(): parser = argparse.ArgumentParser(description="Lint the knowledge base") parser.add_argument( "--structural-only", action="store_true", help="Skip LLM-based checks (contradictions) - faster and free", ) args = parser.parse_args() print("Running knowledge base lint checks...") all_issues: list[dict] = [] # Structural checks (free, instant) checks = [ ("Broken links", check_broken_links), ("Orphan pages", check_orphan_pages), ("Orphan sources", check_orphan_sources), ("Stale articles", check_stale_articles), ("Missing backlinks", check_missing_backlinks), ("Sparse articles", check_sparse_articles), ] for name, check_fn in checks: print(f" Checking: {name}...") issues = check_fn() all_issues.extend(issues) print(f" Found {len(issues)} issue(s)") # LLM check (costs money) if not args.structural_only: print(" Checking: Contradictions (LLM)...") issues = asyncio.run(check_contradictions()) all_issues.extend(issues) print(f" Found {len(issues)} issue(s)") else: print(" Skipping: Contradictions (--structural-only)") # Generate and save report report = generate_report(all_issues) REPORTS_DIR.mkdir(parents=True, exist_ok=True) report_path = REPORTS_DIR / f"lint-{today_iso()}.md" report_path.write_text(report, encoding="utf-8") print(f"\nReport saved to: {report_path}") # Update state state = load_state() state["last_lint"] = now_iso() save_state(state) # Summary errors = sum(1 for i in all_issues if i["severity"] == "error") warnings = sum(1 for i in all_issues if i["severity"] == "warning") suggestions = sum(1 for i in all_issues if i["severity"] == "suggestion") print(f"\nResults: {errors} errors, {warnings} warnings, {suggestions} suggestions") if errors > 0: print("\nErrors found - knowledge base needs attention!") return 1 return 0 if __name__ == "__main__": exit(main())