This is the initial fork commit for agent-admin/memoria, a production
hardening of coleam00/claude-memory-compiler. It addresses all four P0
findings from the bug audit (atomic state writes, file locking on
daily log appends, subprocess detachment, path-traversal guard) plus
several P1s (aliased wikilinks, timezone wiring, staleness-based
compile trigger, SDK retry with backoff, file-handle context manager).
File-level changes:
- LICENSE — MIT (fork is self-declared FOSS; upstream has no LICENSE
file but author has stated FOSS intent).
- pyproject.toml — renamed project to `memoria`, removed unused
python-dotenv dependency, added optional `test` dep group.
- scripts/fs_utils.py — NEW module containing the primitives that
the other patches rely on:
* atomic_write_text(path, content): tmp + fsync + os.replace;
interrupted writes leave the target unchanged.
* locked_append_text(path, content): fcntl.flock (POSIX) /
msvcrt.locking (Windows) exclusive lock around the write so
concurrent callers never interleave.
* extract_wikilinks / parse_wikilink: strip [[target|display]]
aliases correctly (fixes upstream issues #7 and #8).
* safe_article_path(link, base): resolves a wikilink slug inside
a base dir or returns None (path traversal guard).
* load_json_with_recovery(path, default): on corruption, moves
the bad file aside with a timestamped .bak-YYYYMMDDTHHMMSSZ
suffix, logs a warning, and returns the default. Replaces the
silent `{}` return that would otherwise cause a full recompile.
- scripts/utils.py — save_state/load_state now use atomic writes and
corruption recovery; wiki_article_exists and count_inbound_links are
now alias-aware via fs_utils helpers.
- scripts/config.py — TIMEZONE is now wired via zoneinfo.ZoneInfo
and used by now_iso/today_iso (previously defined but ignored).
Overridable via MEMORIA_TZ env var. Unknown zones log a warning
and fall back to system local time rather than crashing.
- scripts/flush.py —
* save_flush_state / load_flush_state use atomic + recovery.
* append_to_daily_log uses locked_append_text; concurrent flush
+ pre-compact calls can no longer interleave log entries.
* run_flush retries SDK failures up to MAX_SDK_ATTEMPTS=3 with
exponential backoff (2s, 4s) before returning FLUSH_ERROR.
* On FLUSH_ERROR, main() preserves the context file and does NOT
update dedup state — the next flush retries cleanly instead of
the failure being silently swallowed.
* Explicit model="haiku" for flush (short summarization task).
* maybe_trigger_compilation replaced: 6 PM wall-clock gate is
gone; trigger is now staleness-based (hash changed AND
COMPILE_INTERVAL_MIN elapsed since last compile). Configurable
via MEMORIA_COMPILE_INTERVAL_MIN. Uses _now_local() from
config so the clock respects the configured timezone.
* compile.log handle uses a `with open()` context manager so the
fd is always cleaned up, even if Popen throws.
- hooks/session-end.py, hooks/pre-compact.py — subprocess.Popen now
passes start_new_session=True on POSIX, detaching flush.py from
the hook's process group so it survives post-hook SIGHUP. Fixes
the intermittent-data-loss failure mode where flush subprocess
was killed mid-LLM-call.
Tests (formal acceptance suite still to come in this phase): each
helper was verified by exercising it in scratch directories — atomic
write roundtrip, corruption recovery with .bak creation, alias parsing,
and path-traversal rejection.
Upstream issue mapping: #3/#5/#9 addressed by the next commit
(compile.py + query.py scaling fix). #7/#8 addressed here via
alias-aware helpers. License (#11) resolved via MIT LICENSE.
170 lines
6 KiB
Python
170 lines
6 KiB
Python
"""Shared utilities for the personal knowledge base."""
|
|
|
|
import hashlib
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
from config import (
|
|
CONCEPTS_DIR,
|
|
CONNECTIONS_DIR,
|
|
DAILY_DIR,
|
|
INDEX_FILE,
|
|
KNOWLEDGE_DIR,
|
|
LOG_FILE,
|
|
QA_DIR,
|
|
STATE_FILE,
|
|
)
|
|
from fs_utils import (
|
|
atomic_write_text,
|
|
extract_wikilinks as _extract_wikilinks,
|
|
load_json_with_recovery,
|
|
parse_wikilink,
|
|
safe_article_path,
|
|
)
|
|
|
|
|
|
# Template for a fresh state; load_state() passes a copy of it to
# load_json_with_recovery as the fallback value. Note that "ingested" is a
# nested mutable dict, so a shallow copy still shares it with this template.
_DEFAULT_STATE: dict = {"ingested": {}, "query_count": 0, "last_lint": None, "total_cost": 0.0}
|
|
|
|
|
|
# ── State management ──────────────────────────────────────────────────
|
|
|
|
def load_state() -> dict:
    """Load persistent state from state.json.

    Delegates to ``load_json_with_recovery`` with ``STATE_FILE`` and a fresh
    copy of ``_DEFAULT_STATE`` as the fallback. Per the fs_utils contract,
    a corrupt file is moved aside with a timestamped backup, a warning is
    logged, and the fallback is returned, which avoids the silent
    "full recompile" failure mode that would otherwise follow a
    JSONDecodeError.

    Returns:
        The state dictionary read from disk, or an independent default state.
    """
    import copy

    # The default must be a *deep* copy: _DEFAULT_STATE holds a nested
    # "ingested" dict, and a shallow dict(...) copy would share that inner
    # dict, so a caller recording an ingested file would silently mutate the
    # module-level template used for every later fallback.
    fallback = copy.deepcopy(_DEFAULT_STATE)
    return load_json_with_recovery(STATE_FILE, fallback)
|
|
|
|
|
|
def save_state(state: dict) -> None:
    """Persist ``state`` to ``STATE_FILE`` as indented JSON.

    The write goes through ``atomic_write_text`` (described in fs_utils as
    tmp + fsync + rename), so an interrupted write is expected to leave the
    previous state.json intact.
    """
    serialized = json.dumps(state, indent=2)
    atomic_write_text(STATE_FILE, serialized)
|
|
|
|
|
|
# ── File hashing ──────────────────────────────────────────────────────
|
|
|
|
def file_hash(path: Path) -> str:
    """Return the first 16 hex characters of the file's SHA-256 digest.

    The whole file is read into memory as bytes before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(path.read_bytes())
    full_digest = hasher.hexdigest()
    return full_digest[:16]
|
|
|
|
|
|
# ── Slug / naming ─────────────────────────────────────────────────────
|
|
|
|
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    Steps, applied in order: lowercase and trim; drop every character that
    is not a word character, whitespace, or hyphen; turn runs of whitespace
    and underscores into a single hyphen; collapse repeated hyphens; and
    finally strip hyphens from both ends.
    """
    # (pattern, replacement) pairs; order matters because each step works
    # on the output of the previous one.
    substitutions = (
        (r"[^\w\s-]", ""),
        (r"[\s_]+", "-"),
        (r"-+", "-"),
    )
    slug = text.lower().strip()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip("-")
|
|
|
|
|
|
# ── Wikilink helpers ──────────────────────────────────────────────────
|
|
|
|
def extract_wikilinks(content: str) -> list[str]:
    """Return the [[wikilinks]] found in markdown ``content``.

    This is a thin public wrapper around ``fs_utils.extract_wikilinks``.
    According to that helper's contract, pipe-aliased links
    (``[[target|display]]``) come back as the bare ``target`` slug with the
    display text stripped, so callers should treat the result as
    filesystem-relative slugs rather than raw link text.
    """
    slugs = _extract_wikilinks(content)
    return slugs
|
|
|
|
|
|
def wiki_article_exists(link: str) -> bool:
    """Report whether the article a wikilink points at exists on disk.

    The link is resolved with ``safe_article_path`` against
    ``KNOWLEDGE_DIR``. When that helper returns ``None`` (per its contract:
    the link cannot be resolved inside the knowledge tree, e.g. a traversal
    attempt such as ``../../etc/passwd``), the answer is ``False`` and no
    existence check is performed.
    """
    resolved = safe_article_path(link, KNOWLEDGE_DIR)
    if resolved is None:
        return False
    return resolved.exists()
|
|
|
|
|
|
# ── Wiki content helpers ──────────────────────────────────────────────
|
|
|
|
def read_wiki_index() -> str:
    """Return the text of the knowledge base index.

    If ``INDEX_FILE`` does not exist yet, an empty index (title plus the
    table header row) is returned instead so callers always get a usable
    markdown table.
    """
    if not INDEX_FILE.exists():
        return "# Knowledge Base Index\n\n| Article | Summary | Compiled From | Updated |\n|---------|---------|---------------|---------|"
    return INDEX_FILE.read_text(encoding="utf-8")
|
|
|
|
|
|
def read_all_wiki_content() -> str:
    """Concatenate the index and every wiki article into one context string.

    The index (from ``read_wiki_index``) comes first under an ``## INDEX``
    heading. Then, for each of the concepts, connections and Q&A directories
    that exists, every ``*.md`` file is appended in sorted order under a
    heading that holds its path relative to ``KNOWLEDGE_DIR``. Sections are
    joined with a markdown horizontal rule.
    """

    def _article_sections():
        # Yield one "## <relative path>" section per article, directory by
        # directory, skipping directories that have not been created yet.
        for directory in (CONCEPTS_DIR, CONNECTIONS_DIR, QA_DIR):
            if not directory.exists():
                continue
            for article in sorted(directory.glob("*.md")):
                heading = article.relative_to(KNOWLEDGE_DIR)
                body = article.read_text(encoding="utf-8")
                yield f"## {heading}\n\n{body}"

    sections = [f"## INDEX\n\n{read_wiki_index()}", *_article_sections()]
    return "\n\n---\n\n".join(sections)
|
|
|
|
|
|
def list_wiki_articles() -> list[Path]:
    """Return every ``*.md`` article under the wiki directories.

    Directories are visited in the order concepts, connections, Q&A; files
    are sorted within each directory, and missing directories are skipped.
    """
    return [
        article
        for directory in (CONCEPTS_DIR, CONNECTIONS_DIR, QA_DIR)
        if directory.exists()
        for article in sorted(directory.glob("*.md"))
    ]
|
|
|
|
|
|
def list_raw_files() -> list[Path]:
    """Return the sorted ``*.md`` files in ``DAILY_DIR``.

    An empty list is returned when the daily directory does not exist.
    """
    return sorted(DAILY_DIR.glob("*.md")) if DAILY_DIR.exists() else []
|
|
|
|
|
|
# ── Index helpers ─────────────────────────────────────────────────────
|
|
|
|
def count_inbound_links(target: str, exclude_file: Path | None = None) -> int:
    """Count the wiki articles that contain at least one link to ``target``.

    Both the target and every link found in an article are normalised with
    ``parse_wikilink`` before comparison, so an article containing
    ``[[concepts/foo|Display]]`` counts as an inbound link to
    ``concepts/foo``. Each article contributes at most 1 to the total, no
    matter how many times it links to the target.

    Args:
        target: The wikilink (optionally pipe-aliased) to look for.
        exclude_file: An article path to leave out of the count, typically
            the target article itself; ``None`` excludes nothing.

    Returns:
        The number of linking articles.
    """
    wanted = parse_wikilink(target)

    def _links_to_target(article: Path) -> bool:
        # True as soon as one normalised link in the article matches.
        text = article.read_text(encoding="utf-8")
        for raw in _extract_wikilinks(text):
            if parse_wikilink(raw) == wanted:
                return True
        return False

    return sum(
        1
        for article in list_wiki_articles()
        if article != exclude_file and _links_to_target(article)
    )
|
|
|
|
|
|
def get_article_word_count(path: Path) -> int:
    """Count whitespace-separated words in an article, excluding YAML frontmatter.

    Frontmatter is recognised only when the first line is exactly ``---``
    (trailing whitespace allowed) and a later line is also exactly ``---``.
    Matching whole delimiter lines, rather than the first ``---`` substring,
    keeps a ``---`` inside a frontmatter value (for example
    ``title: Foo --- Bar``) from ending the frontmatter early and leaking the
    remaining YAML into the count. If the frontmatter is never closed, the
    whole file is counted.

    Args:
        path: Article file to read as UTF-8.

    Returns:
        The number of words in the article body.
    """
    content = path.read_text(encoding="utf-8")
    lines = content.splitlines(keepends=True)

    if lines and lines[0].rstrip() == "---":
        # Look for the closing delimiter line; everything after it is body.
        for index in range(1, len(lines)):
            if lines[index].rstrip() == "---":
                content = "".join(lines[index + 1:])
                break

    return len(content.split())
|
|
|
|
|
|
def build_index_entry(rel_path: str, summary: str, sources: str, updated: str) -> str:
    """Build a single markdown table row for the knowledge base index.

    Args:
        rel_path: Article path such as ``concepts/foo.md``; a trailing
            ``.md`` extension is dropped to form the wikilink target.
        summary: Text for the Summary column.
        sources: Text for the Compiled From column.
        updated: Text for the Updated column.

    Returns:
        A row of the form ``| [[link]] | summary | sources | updated |``.
    """
    # Strip only the trailing extension. str.replace would also remove any
    # ".md" occurring inside the name (e.g. "a.mdx-notes.md") and so produce
    # a wikilink that points at the wrong article.
    link = rel_path.removesuffix(".md")
    return f"| [[{link}]] | {summary} | {sources} | {updated} |"
|