Fixes upstream issues #3/#5/#9 (whole-wiki in every prompt) and adds large-log chunking. Addresses the audit's P1 scaling findings (C1), the chunking requirement the operator added on top, C8 explicit model wiring across all LLM call sites, and the D3 single-event-loop refactor.

## compile.py

- **Index-only context.** The `existing_articles_context` concatenation of every wiki article has been removed from the prompt. Instead the LLM receives only the index + schema + daily log and uses the Read tool (already in allowed_tools) to fetch specific articles it decides are relevant. Prompt size stays bounded regardless of KB growth — upstream's 250K-token prompts past ~100 articles are gone.
- **Chunking.** `_split_log_into_chunks()` splits oversized daily logs along `### ` section boundaries. Threshold MAX_LOG_CHARS_PER_CHUNK (default 100K chars ≈ 25K tokens, configurable via MEMORIA_MAX_LOG_CHARS). Chunks compile via separate LLM calls that naturally merge through Edit on shared files. Oversized single sections emit as their own chunks rather than splitting mid-thought.
- **Atomic state on chunked compile.** State is only written after ALL chunks succeed — partial-failure leaves the log flagged as uncompiled in state.json so the next run retries it cleanly. Was already correct for single-chunk logs (early return on SDK error) and now correct for multi-chunk too.
- **Explicit model.** `model=COMPILE_MODEL` passed to ClaudeAgentOptions. Default "sonnet"; override via MEMORIA_COMPILE_MODEL env var.
- **D3: single asyncio.run.** The per-file `asyncio.run()` in the compile loop is replaced with one outer call wrapping `_compile_all`. Avoids repeated event-loop setup/teardown and matches the pattern used for async resources in the SDK.

## query.py

- **Index-only context.** `read_all_wiki_content()` replaced with `read_wiki_index()`. The LLM reads the index and uses its Read tool to fetch specific articles. Same rationale as compile.py — keeps prompt size bounded and cost predictable.
- **Explicit model.** `model=QUERY_MODEL`, default "sonnet", override via MEMORIA_QUERY_MODEL.

## lint.py

- **C9: skip qa/sources in missing-backlink check.** Articles under qa/ or sources/ no longer trigger a suggestion that every referenced concept should backlink to them. Concepts aren't expected to link back to every Q&A that mentions them — doing so would drown real relationships.
- **Alias-aware backlink detection.** Uses `extract_wikilinks()` to parse the target's link list so `[[concepts/foo|Display]]` forms count as valid backlinks (previously required exact `[[foo]]` match, causing false positives on aliased forms).
- **Explicit model.** `model=LINT_MODEL` in check_contradictions call, default "sonnet", override via MEMORIA_LINT_MODEL.

## Verified

- Chunking: 120K-char 3-section log splits into 80K + 40K, reconstructs byte-exact. Oversized single section (150K) emits as its own chunk. Small log (<100K) returns as single chunk.
- All patched modules import cleanly with expected config values.
- compile_daily_log / query.run_query / flush.maybe_trigger_compilation / lint.check_missing_backlinks all callable post-patch.
335 lines
11 KiB
Python
335 lines
11 KiB
Python
"""
|
|
Lint the knowledge base for structural and semantic health.
|
|
|
|
Runs 7 checks: broken links, orphan pages, orphan sources, stale articles,
|
|
contradictions (LLM), missing backlinks, and sparse articles.
|
|
|
|
Usage:
|
|
uv run python lint.py # all checks
|
|
uv run python lint.py --structural-only # skip LLM checks (faster, cheaper)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from config import KNOWLEDGE_DIR, REPORTS_DIR, now_iso, today_iso
|
|
|
|
# Model used for the LLM contradiction check. Defaults to "sonnet" (kept for
# reasoning quality); set the MEMORIA_LINT_MODEL environment variable to
# override it, e.g. with a cheaper model.
LINT_MODEL = os.environ.get("MEMORIA_LINT_MODEL", "sonnet")
|
|
from utils import (
|
|
count_inbound_links,
|
|
extract_wikilinks,
|
|
file_hash,
|
|
get_article_word_count,
|
|
list_raw_files,
|
|
list_wiki_articles,
|
|
load_state,
|
|
read_all_wiki_content,
|
|
save_state,
|
|
wiki_article_exists,
|
|
)
|
|
|
|
# Parent of the directory that contains this file; passed as the working
# directory (`cwd`) of the LLM agent call in check_contradictions().
ROOT_DIR = Path(__file__).resolve().parent.parent
|
|
|
|
|
|
def check_broken_links() -> list[dict]:
    """Report every [[wikilink]] whose target article does not exist.

    Links that start with ``daily/`` are treated as valid and never reported.

    Returns:
        One "error"-severity issue dict per broken link occurrence, with the
        linking article's path relative to KNOWLEDGE_DIR in ``file``.
    """
    found: list[dict] = []
    for page in list_wiki_articles():
        text = page.read_text(encoding="utf-8")
        page_rel = page.relative_to(KNOWLEDGE_DIR)
        dangling = [
            target
            for target in extract_wikilinks(text)
            # Daily-log references are exempt from the existence check.
            if not target.startswith("daily/") and not wiki_article_exists(target)
        ]
        found.extend(
            {
                "severity": "error",
                "check": "broken_link",
                "file": str(page_rel),
                "detail": f"Broken link: [[{target}]] - target does not exist",
            }
            for target in dangling
        )
    return found
|
|
|
|
|
|
def check_orphan_pages() -> list[dict]:
    """Check for articles with zero inbound links.

    Each article's link target is its path relative to KNOWLEDGE_DIR, with
    forward slashes and without the trailing ``.md`` extension. An article is
    reported when ``count_inbound_links`` returns 0 for that target.

    Returns:
        One "warning"-severity issue dict per orphaned article.
    """
    issues = []
    for article in list_wiki_articles():
        rel = article.relative_to(KNOWLEDGE_DIR)
        rel_str = str(rel).replace("\\", "/")
        # Strip only the trailing extension. A blanket replace(".md", "")
        # would also eat ".md" in the middle of a name (e.g. "foo.mdx.md")
        # or in a directory name, producing a target that matches nothing.
        link_target = rel_str[:-3] if rel_str.endswith(".md") else rel_str
        if count_inbound_links(link_target) == 0:
            issues.append({
                "severity": "warning",
                "check": "orphan_page",
                "file": str(rel),
                "detail": f"Orphan page: no other articles link to [[{link_target}]]",
            })
    return issues
|
|
|
|
|
|
def check_orphan_sources() -> list[dict]:
    """Report raw daily logs that have no entry in the ingestion state.

    A log counts as compiled when its file name is a key of the ``ingested``
    mapping in the saved state.

    Returns:
        One "warning"-severity issue dict per uncompiled log.
    """
    already_ingested = load_state().get("ingested", {})
    pending = [raw for raw in list_raw_files() if raw.name not in already_ingested]
    return [
        {
            "severity": "warning",
            "check": "orphan_source",
            "file": f"daily/{raw.name}",
            "detail": f"Uncompiled daily log: {raw.name} has not been ingested",
        }
        for raw in pending
    ]
|
|
|
|
|
|
def check_stale_articles() -> list[dict]:
    """Report ingested daily logs whose content hash no longer matches.

    Only logs already present in the state's ``ingested`` mapping are
    examined. A log is stale when its recorded ``hash`` (empty string if
    absent) differs from the hash of the file as it is now.

    Returns:
        One "warning"-severity issue dict per changed log.
    """
    compiled = load_state().get("ingested", {})
    stale: list[dict] = []
    for raw in list_raw_files():
        name = raw.name
        if name not in compiled:
            continue  # never compiled, so it cannot be stale
        if compiled[name].get("hash", "") == file_hash(raw):
            continue  # unchanged since compilation
        stale.append({
            "severity": "warning",
            "check": "stale_article",
            "file": f"daily/{name}",
            "detail": f"Stale: {name} has changed since last compilation",
        })
    return stale
|
|
|
|
|
|
def check_missing_backlinks() -> list[dict]:
    """Check for asymmetric links: A links to B but B doesn't link to A.

    Skips any source or target under `qa/` or `sources/`: Q&A articles
    intentionally reference concepts without requiring a reciprocal link
    (concepts would otherwise accumulate a backlink per question, which
    drowns real relationships). Links into `daily/` are skipped as well.

    Both sides are parsed with ``extract_wikilinks``, so whatever link forms
    that helper normalises (e.g. pipe-aliased links) count as backlinks.

    Each (source, target) pair is reported at most once, even when the
    source links to the same target several times.

    Returns:
        One "suggestion"-severity, auto-fixable issue dict per missing
        backlink.
    """
    one_way_prefixes = ("qa/", "sources/")
    skipped_targets = ("daily/",) + one_way_prefixes

    issues = []
    for article in list_wiki_articles():
        rel = article.relative_to(KNOWLEDGE_DIR)
        rel_str = str(rel).replace("\\", "/")

        # Skip one-way source categories before paying for the file read.
        if rel_str.startswith(one_way_prefixes):
            continue

        # Strip only the trailing extension; replace(".md", "") would also
        # remove ".md" occurring elsewhere in the path.
        source_link = rel_str[:-3] if rel_str.endswith(".md") else rel_str

        content = article.read_text(encoding="utf-8")
        seen = set()
        for link in extract_wikilinks(content):
            # A repeated link to the same target is still one relationship.
            if link in seen:
                continue
            seen.add(link)

            if link.startswith(skipped_targets):
                continue

            target_path = KNOWLEDGE_DIR / f"{link}.md"
            if not target_path.exists():
                continue  # nonexistent targets are ignored by this check

            target_backlinks = extract_wikilinks(
                target_path.read_text(encoding="utf-8")
            )
            if source_link not in target_backlinks:
                issues.append({
                    "severity": "suggestion",
                    "check": "missing_backlink",
                    "file": str(rel),
                    "detail": f"[[{source_link}]] links to [[{link}]] but not vice versa",
                    "auto_fixable": True,
                })
    return issues
|
|
|
|
|
|
def check_sparse_articles() -> list[dict]:
    """Report articles whose word count is below 200.

    Returns:
        One "suggestion"-severity issue dict per sparse article, with the
        measured word count in ``detail``.
    """
    sparse: list[dict] = []
    for page in list_wiki_articles():
        words = get_article_word_count(page)
        if words >= 200:
            continue  # long enough
        sparse.append({
            "severity": "suggestion",
            "check": "sparse_article",
            "file": str(page.relative_to(KNOWLEDGE_DIR)),
            "detail": f"Sparse article: {words} words (minimum recommended: 200)",
        })
    return sparse
|
|
|
|
|
|
async def check_contradictions() -> list[dict]:
    """Use LLM to detect contradictions across articles.

    Sends the full wiki content (from ``read_all_wiki_content``) to the model
    named by LINT_MODEL with no tools enabled, then parses the reply line by
    line. Only lines starting with ``CONTRADICTION:`` or ``INCONSISTENCY:``
    become issues; everything else, including the ``NO_ISSUES`` sentinel, is
    ignored.

    Returns:
        One "warning"-severity issue dict per reported line. If the LLM call
        raises, a single "error"-severity issue describing the failure is
        returned instead of propagating the exception.
    """
    # Imported lazily so --structural-only runs do not need the SDK loaded.
    from claude_agent_sdk import (
        AssistantMessage,
        ClaudeAgentOptions,
        TextBlock,
        query,
    )

    wiki_content = read_all_wiki_content()

    prompt = f"""Review this knowledge base for contradictions, inconsistencies, or
conflicting claims across articles.

## Knowledge Base

{wiki_content}

## Instructions

Look for:
- Direct contradictions (article A says X, article B says not-X)
- Inconsistent recommendations (different articles recommend conflicting approaches)
- Outdated information that conflicts with newer entries

For each issue found, output EXACTLY one line in this format:
CONTRADICTION: [file1] vs [file2] - description of the conflict
INCONSISTENCY: [file] - description of the inconsistency

If no issues found, output exactly: NO_ISSUES

Do NOT output anything else - no preamble, no explanation, just the formatted lines."""

    parts: list[str] = []
    try:
        async for message in query(
            prompt=prompt,
            options=ClaudeAgentOptions(
                cwd=str(ROOT_DIR),
                model=LINT_MODEL,
                allowed_tools=[],
                max_turns=2,
            ),
        ):
            if isinstance(message, AssistantMessage):
                parts.extend(
                    block.text
                    for block in message.content
                    if isinstance(block, TextBlock)
                )
    except Exception as e:
        # Best-effort check: surface the failure in the report rather than
        # aborting the whole lint run.
        return [{"severity": "error", "check": "contradiction", "file": "(system)", "detail": f"LLM check failed: {e}"}]

    response = "".join(parts)

    # No special case for NO_ISSUES: that reply contains no line with a
    # recognised prefix, so it yields an empty list anyway. Gating on the
    # substring would throw away real findings whenever the token appears
    # anywhere in the reply (e.g. quoted inside a description).
    issues = []
    for line in response.strip().split("\n"):
        line = line.strip()
        if line.startswith(("CONTRADICTION:", "INCONSISTENCY:")):
            issues.append({
                "severity": "warning",
                "check": "contradiction",
                "file": "(cross-article)",
                "detail": line,
            })

    return issues
|
|
|
|
|
|
def generate_report(all_issues: list[dict]) -> str:
    """Render the collected issues as a markdown lint report.

    The report has a dated title, a totals summary, and one section per
    severity ("error", "warning", "suggestion") that has at least one issue.
    Issues flagged ``auto_fixable`` get an "(auto-fixable)" suffix. When
    there are no issues at all, a single "all checks passed" line is emitted.

    Args:
        all_issues: Issue dicts carrying at least ``severity``, ``file`` and
            ``detail`` keys.

    Returns:
        The report as a newline-joined string.
    """
    def with_severity(level: str) -> list[dict]:
        return [entry for entry in all_issues if entry["severity"] == level]

    # (section title, bullet marker, issues in that section)
    sections = (
        ("Errors", "x", with_severity("error")),
        ("Warnings", "!", with_severity("warning")),
        ("Suggestions", "?", with_severity("suggestion")),
    )

    out = [
        f"# Lint Report - {today_iso()}",
        "",
        f"**Total issues:** {len(all_issues)}",
    ]
    out.extend(f"- {title}: {len(group)}" for title, _, group in sections)
    out.append("")

    for title, marker, group in sections:
        if not group:
            continue
        out.extend([f"## {title}", ""])
        for entry in group:
            if entry.get("auto_fixable"):
                suffix = " (auto-fixable)"
            else:
                suffix = ""
            out.append(f"- **[{marker}]** `{entry['file']}` - {entry['detail']}{suffix}")
        out.append("")

    if not all_issues:
        out.extend(["All checks passed. Knowledge base is healthy.", ""])

    return "\n".join(out)
|
|
|
|
|
|
def main():
    """Run the lint checks, write the report, and return an exit status.

    Runs the six structural checks in order, then the LLM contradiction check
    unless ``--structural-only`` was given. The markdown report is written to
    ``REPORTS_DIR / lint-<today>.md`` and ``last_lint`` is recorded in the
    saved state.

    Returns:
        1 if any issue has severity "error", otherwise 0.
    """
    cli = argparse.ArgumentParser(description="Lint the knowledge base")
    cli.add_argument(
        "--structural-only",
        action="store_true",
        help="Skip LLM-based checks (contradictions) - faster and free",
    )
    opts = cli.parse_args()

    print("Running knowledge base lint checks...")
    collected: list[dict] = []

    # Structural checks: no LLM involved.
    structural = (
        ("Broken links", check_broken_links),
        ("Orphan pages", check_orphan_pages),
        ("Orphan sources", check_orphan_sources),
        ("Stale articles", check_stale_articles),
        ("Missing backlinks", check_missing_backlinks),
        ("Sparse articles", check_sparse_articles),
    )
    for label, run_check in structural:
        print(f" Checking: {label}...")
        found = run_check()
        collected.extend(found)
        print(f" Found {len(found)} issue(s)")

    # LLM check: the only one that costs money.
    if args_skip_llm := opts.structural_only:
        print(" Skipping: Contradictions (--structural-only)")
    else:
        print(" Checking: Contradictions (LLM)...")
        found = asyncio.run(check_contradictions())
        collected.extend(found)
        print(f" Found {len(found)} issue(s)")

    # Render first, then make sure the destination exists and write.
    rendered = generate_report(collected)
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    destination = REPORTS_DIR / f"lint-{today_iso()}.md"
    destination.write_text(rendered, encoding="utf-8")
    print(f"\nReport saved to: {destination}")

    # Record when lint last ran.
    saved = load_state()
    saved["last_lint"] = now_iso()
    save_state(saved)

    # Per-severity totals for the console summary.
    errors = len([entry for entry in collected if entry["severity"] == "error"])
    warnings = len([entry for entry in collected if entry["severity"] == "warning"])
    suggestions = len([entry for entry in collected if entry["severity"] == "suggestion"])
    print(f"\nResults: {errors} errors, {warnings} warnings, {suggestions} suggestions")

    if errors == 0:
        return 0

    print("\nErrors found - knowledge base needs attention!")
    return 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. The bare
    # `exit` builtin is injected by the `site` module for interactive use and
    # is missing under `python -S` or in frozen builds; raising SystemExit
    # directly always works and needs no import.
    raise SystemExit(main())
|