refactor: extract atomic_json_write helper, add 24 checkpoint tests

Extract the duplicated temp-file + fsync + os.replace pattern from
batch_runner.py (1 instance) and process_registry.py (2 instances) into
a shared utils.atomic_json_write() function.

Add 12 tests for atomic_json_write covering: valid JSON, parent
directory creation, overwrite, crash safety (original preserved on
error), no temp file leaks, string paths, Unicode, custom indent, and
concurrent writes.

Add 12 tests for batch_runner checkpoint behavior covering:
_save_checkpoint (valid JSON, last_updated, overwrite, with and without
a lock, parent dirs, no temp file leaks), _load_checkpoint (missing
file, existing data, corrupt JSON), and resume logic (preserves prior
progress; a different run_name starts fresh).
This commit is contained in:
teknium1
2026-03-06 05:50:12 -08:00
parent c05c60665e
commit d63b363cde
5 changed files with 340 additions and 64 deletions

View File

@@ -37,7 +37,6 @@ import shlex
import shutil
import signal
import subprocess
import tempfile
import threading
import time
import uuid
@@ -707,25 +706,9 @@ class ProcessRegistry:
"session_key": s.session_key,
})
# Atomic write: temp file + os.replace to avoid corruption on crash
CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
fd, tmp_path = tempfile.mkstemp(
dir=str(CHECKPOINT_PATH.parent),
prefix='.checkpoint_',
suffix='.tmp',
)
try:
with os.fdopen(fd, 'w', encoding='utf-8') as f:
json.dump(entries, f, indent=2, ensure_ascii=False)
f.flush()
os.fsync(f.fileno())
os.replace(tmp_path, CHECKPOINT_PATH)
except BaseException:
try:
os.unlink(tmp_path)
except OSError:
pass
raise
# Atomic write to avoid corruption on crash
from utils import atomic_json_write
atomic_json_write(CHECKPOINT_PATH, entries)
except Exception as e:
logger.debug("Failed to write checkpoint file: %s", e, exc_info=True)
@@ -774,26 +757,9 @@ class ProcessRegistry:
logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid)
# Clear the checkpoint (will be rewritten as processes finish)
# Use atomic write to avoid corruption
try:
CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
fd, tmp_path = tempfile.mkstemp(
dir=str(CHECKPOINT_PATH.parent),
prefix='.checkpoint_',
suffix='.tmp',
)
try:
with os.fdopen(fd, 'w', encoding='utf-8') as f:
f.write("[]")
f.flush()
os.fsync(f.fileno())
os.replace(tmp_path, CHECKPOINT_PATH)
except BaseException:
try:
os.unlink(tmp_path)
except OSError:
pass
raise
from utils import atomic_json_write
atomic_json_write(CHECKPOINT_PATH, [])
except Exception as e:
logger.debug("Could not clear checkpoint file: %s", e, exc_info=True)