"""Tests for scripts/fs_utils.py primitives. Covers audit assertions: concurrent flush safety (acceptance #1), crash/corruption recovery (#2, #9), path-traversal rejection (#8), aliased wikilinks (#6 — the parsing side). """ from __future__ import annotations import json import multiprocessing as mp import os import re import time from pathlib import Path import pytest from fs_utils import ( atomic_write_text, extract_wikilinks, load_json_with_recovery, locked_append_text, parse_wikilink, safe_article_path, ) # ── Atomic writes ─────────────────────────────────────────────────────── def test_atomic_write_roundtrip(tmp_path: Path) -> None: target = tmp_path / "state.json" atomic_write_text(target, '{"a": 1}') assert target.read_text() == '{"a": 1}' def test_atomic_write_overwrites(tmp_path: Path) -> None: target = tmp_path / "state.json" target.write_text("old content") atomic_write_text(target, "new content") assert target.read_text() == "new content" def test_atomic_write_leaves_original_on_exception(tmp_path: Path, monkeypatch) -> None: """Simulate a write that fails mid-way. Target must retain old content.""" target = tmp_path / "state.json" target.write_text("old content") # Patch os.replace to raise — simulates a filesystem error at the # critical moment between tmp-write and atomic rename. import fs_utils as fsu original_replace = os.replace def boom(*args, **kwargs): raise OSError("simulated filesystem failure") monkeypatch.setattr(os, "replace", boom) with pytest.raises(OSError, match="simulated"): atomic_write_text(target, "new content") # Original is intact. assert target.read_text() == "old content" # No leftover tmp files. monkeypatch.setattr(os, "replace", original_replace) tmp_leftovers = list(tmp_path.glob(".state.json.*.tmp")) assert tmp_leftovers == [] def test_atomic_write_creates_parent_dirs(tmp_path: Path) -> None: target = tmp_path / "nested" / "deep" / "state.json" atomic_write_text(target, "ok") assert target.read_text() == "ok" # ── Locked append (acceptance #1) ─────────────────────────────────────── def _appender_worker(path_str: str, marker: str, count: int) -> None: """Worker: append `count` well-bounded entries to `path_str`. Each entry is a full 'header + body + blank line' unit; if writes interleave, the stream will contain a partial entry. """ from fs_utils import locked_append_text # re-import in worker for i in range(count): entry = ( f"### {marker} entry {i}\n" f"body line 1 of {marker}-{i}\n" f"body line 2 of {marker}-{i}\n" f"\n" ) locked_append_text(Path(path_str), entry) def test_locked_append_no_interleaving(tmp_path: Path) -> None: """Run many concurrent appenders; verify no entry is split or interleaved.""" target = tmp_path / "daily.md" n_workers = 4 entries_per_worker = 25 markers = [f"W{i}" for i in range(n_workers)] ctx = mp.get_context("fork") procs = [ ctx.Process(target=_appender_worker, args=(str(target), m, entries_per_worker)) for m in markers ] for p in procs: p.start() for p in procs: p.join(timeout=30) assert p.exitcode == 0, f"worker {p} failed" content = target.read_text() # Every expected entry must appear exactly once and each entry's body # lines must be contiguous (no interleaving). for marker in markers: for i in range(entries_per_worker): header = f"### {marker} entry {i}\n" body1 = f"body line 1 of {marker}-{i}" body2 = f"body line 2 of {marker}-{i}" # Header appears exactly once. assert content.count(header) == 1, f"missing or duplicate header: {header}" # Body line immediately follows header (no interleaving). idx = content.index(header) tail = content[idx + len(header):] assert tail.startswith(body1 + "\n" + body2 + "\n"), ( f"entry {marker}-{i} body was interleaved or split" ) # ── JSON recovery (acceptance #9) ─────────────────────────────────────── def test_json_recovery_roundtrip_clean(tmp_path: Path) -> None: target = tmp_path / "state.json" target.write_text(json.dumps({"a": 1})) assert load_json_with_recovery(target, {}) == {"a": 1} def test_json_recovery_missing_returns_default(tmp_path: Path) -> None: target = tmp_path / "does-not-exist.json" assert load_json_with_recovery(target, {"fresh": True}) == {"fresh": True} def test_json_recovery_corruption_creates_backup(tmp_path: Path) -> None: target = tmp_path / "state.json" target.write_text("{ this is not valid json") default = {"ingested": {}} result = load_json_with_recovery(target, default) # Returned the default. assert result == default # Corrupted file moved aside with .bak-YYYYMMDDTHHMMSSZ suffix. bak_files = list(tmp_path.glob("state.json.bak-*")) assert len(bak_files) == 1, f"expected one .bak file, found {bak_files}" assert re.search(r"bak-\d{8}T\d{6}Z$", bak_files[0].name) # The corrupt content is preserved in the backup. assert bak_files[0].read_text() == "{ this is not valid json" # The main target no longer exists (was moved, not deleted). assert not target.exists() # ── Wikilink parsing (acceptance #6) ──────────────────────────────────── def test_extract_wikilinks_bare() -> None: assert extract_wikilinks("see [[concepts/foo]] and [[concepts/bar]]") == [ "concepts/foo", "concepts/bar", ] def test_extract_wikilinks_aliased() -> None: assert extract_wikilinks( "ref [[concepts/foo|Foo Display]] and [[concepts/bar|Bar]]" ) == ["concepts/foo", "concepts/bar"] def test_extract_wikilinks_mixed() -> None: assert extract_wikilinks( "[[concepts/a]] and [[concepts/b|B]] and [[concepts/c]]" ) == ["concepts/a", "concepts/b", "concepts/c"] def test_parse_wikilink() -> None: assert parse_wikilink("concepts/foo") == "concepts/foo" assert parse_wikilink("concepts/foo|Display") == "concepts/foo" assert parse_wikilink(" spaces ") == "spaces" # ── Path traversal (acceptance #8) ────────────────────────────────────── def test_safe_path_clean(tmp_path: Path) -> None: base = tmp_path / "kb" base.mkdir() result = safe_article_path("concepts/foo", base) assert result is not None assert result == (base / "concepts" / "foo.md").resolve() def test_safe_path_traversal_rejected(tmp_path: Path) -> None: base = tmp_path / "kb" base.mkdir() assert safe_article_path("../../etc/passwd", base) is None assert safe_article_path("../outside", base) is None def test_safe_path_absolute_rejected(tmp_path: Path) -> None: base = tmp_path / "kb" base.mkdir() assert safe_article_path("/etc/passwd", base) is None def test_safe_path_empty_and_invalid(tmp_path: Path) -> None: base = tmp_path / "kb" base.mkdir() assert safe_article_path("", base) is None assert safe_article_path("foo\0bar", base) is None # null byte def test_safe_path_strips_alias(tmp_path: Path) -> None: base = tmp_path / "kb" base.mkdir() result = safe_article_path("concepts/foo|Display", base) assert result == (base / "concepts" / "foo.md").resolve()