Fix timestamp normalization to prevent false change detection after unzip

commit 19e4eb63aa
parent 0287cbd7ef
Author: Nathan
Date:   2025-12-10 12:06:02 -07:00

2 changed files with 4813 additions and 37 deletions
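The fix rests on one invariant: an mtime written with os.utime() must compare equal to the value later read back via stat(). NTFS stores timestamps as FILETIME with 100-nanosecond resolution, so any finer-grained value recorded in a state file can never round-trip. A minimal, self-contained sketch of the truncation this commit applies on both the compare and restore paths (file and sample value are invented; assumes a filesystem with at least 100 ns timestamp resolution, e.g. NTFS or ext4):

    import os
    import platform
    import tempfile

    def normalize_mtime_ns(mtime_ns: int) -> int:
        # NTFS FILETIME has 100 ns resolution; truncate so a stored value
        # and the value read back after os.utime() land on the same boundary.
        if platform.system() == "Windows":
            return (mtime_ns // 100) * 100
        return mtime_ns

    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        path = tmp.name
    target = normalize_mtime_ns(1_733_860_562_123_456_789)  # arbitrary sample
    os.utime(path, ns=(target, target))
    assert normalize_mtime_ns(os.stat(path).st_mtime_ns) == target
    os.unlink(path)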


@@ -530,13 +530,23 @@ def compute_state(seq_dir: Path) -> dict:
         iter_sequence_files(seq_dir),
         key=lambda p: p.relative_to(seq_dir).as_posix(),
     )
+    # On Windows, normalize timestamps to 100-nanosecond boundaries (NTFS precision)
+    # to match what we do when restoring timestamps
+    is_windows = platform.system() == "Windows"
     for file_path in files:
         stat = file_path.stat()
+        mtime_ns = stat.st_mtime_ns
+        # Normalize to filesystem precision (Windows NTFS uses 100-nanosecond intervals)
+        if is_windows:
+            mtime_ns = (mtime_ns // 100) * 100
         entries.append(
             {
                 "path": file_path.relative_to(seq_dir).as_posix(),
                 "size": stat.st_size,
-                "mtime_ns": stat.st_mtime_ns,
+                "mtime_ns": mtime_ns,
             }
         )
     return {"files": entries}
@@ -552,15 +562,81 @@ def load_state(state_path: Path) -> dict | None:
     if not state_path.exists():
         return None
     try:
-        return json.loads(state_path.read_text())
+        state = json.loads(state_path.read_text())
+        # Normalize timestamps in loaded state to ensure consistency
+        # This handles state files created before normalization was added
+        is_windows = platform.system() == "Windows"
+        if is_windows and "files" in state:
+            for entry in state.get("files", []):
+                if "mtime_ns" in entry:
+                    # Normalize to 100-nanosecond boundaries (NTFS precision)
+                    entry["mtime_ns"] = (entry["mtime_ns"] // 100) * 100
+        return state
     except json.JSONDecodeError:
         return None
 
 
-def state_changed(seq_state: dict, stored_state: dict | None) -> bool:
+def state_changed(seq_state: dict, stored_state: dict | None, *, verbose: bool = False) -> bool:
     if stored_state is None:
+        if verbose:
+            log("scan", "State changed: no stored state found", verbose_only=True, verbose=verbose)
         return True
-    return seq_state != stored_state
+
+    # Normalize timestamps in both states for comparison
+    # On Windows, normalize to 100-nanosecond boundaries (NTFS precision)
+    is_windows = platform.system() == "Windows"
+
+    def normalize_state(state: dict) -> dict:
+        """Normalize timestamps in state to filesystem precision."""
+        normalized = {"files": []}
+        for entry in state.get("files", []):
+            mtime_ns = entry["mtime_ns"]
+            if is_windows:
+                mtime_ns = (mtime_ns // 100) * 100
+            normalized["files"].append({
+                "path": entry["path"],
+                "size": entry["size"],
+                "mtime_ns": mtime_ns,
+            })
+        return normalized
+
+    # Compare normalized states
+    normalized_seq = normalize_state(seq_state)
+    normalized_stored = normalize_state(stored_state)
+    if normalized_seq == normalized_stored:
+        return False
+
+    # States differ - log diagnostic info if verbose
+    if verbose:
+        seq_files = {f["path"]: f for f in normalized_seq["files"]}
+        stored_files = {f["path"]: f for f in normalized_stored["files"]}
+        seq_paths = set(seq_files.keys())
+        stored_paths = set(stored_files.keys())
+        if seq_paths != stored_paths:
+            missing = stored_paths - seq_paths
+            extra = seq_paths - stored_paths
+            if missing:
+                log("scan", f"State diff: files in stored but not current: {sorted(missing)[:5]}", verbose_only=True, verbose=verbose)
+            if extra:
+                log("scan", f"State diff: files in current but not stored: {sorted(extra)[:5]}", verbose_only=True, verbose=verbose)
+        else:
+            # Same files, so inspect the first few paths for field-level differences
+            diffs = []
+            for path in sorted(seq_paths)[:5]:
+                seq_file = seq_files[path]
+                stored_file = stored_files[path]
+                if seq_file != stored_file:
+                    if seq_file["size"] != stored_file["size"]:
+                        diffs.append(f"{path}: size {seq_file['size']} vs {stored_file['size']}")
+                    elif seq_file["mtime_ns"] != stored_file["mtime_ns"]:
+                        diffs.append(f"{path}: mtime {seq_file['mtime_ns']} vs {stored_file['mtime_ns']} (diff: {abs(seq_file['mtime_ns'] - stored_file['mtime_ns'])})")
+            if diffs:
+                log("scan", f"State diff: {diffs[0]}", verbose_only=True, verbose=verbose)
+    return True
 
 
 def archive_path_for(seq_dir: Path) -> Path:
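The normalization makes state_changed() insensitive to sub-100 ns noise on Windows. A quick sanity check of that contract (hypothetical standalone snippet; the two entries differ only below the 100 ns boundary):

    a = {"files": [{"path": "f.exr", "size": 1, "mtime_ns": 1_000_000_123}]}
    b = {"files": [{"path": "f.exr", "size": 1, "mtime_ns": 1_000_000_199}]}
    # Both truncate to 1_000_000_100 on Windows, so no change is reported;
    # on other platforms the raw values differ and a change is reported.
    print(state_changed(a, b))  # False on Windows, True elsewhere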
@@ -606,10 +682,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
         old_state_path.unlink(missing_ok=True)
 
     # Build list of files to archive with relative paths
-    file_list = []
-    for file_path in iter_sequence_files(seq_dir):
-        rel_path = file_path.relative_to(seq_dir).as_posix()
-        file_list.append(rel_path)
+    # Sort files to ensure consistent archive ordering (matches compute_state)
+    files = sorted(
+        iter_sequence_files(seq_dir),
+        key=lambda p: p.relative_to(seq_dir).as_posix(),
+    )
+    file_list = [file_path.relative_to(seq_dir).as_posix() for file_path in files]
 
     if not file_list:
         raise RuntimeError(f"No files found to archive in {seq_dir}")
@@ -755,7 +833,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
             zip_kwargs = {"compresslevel": COMPRESSION_LEVEL}
         with ZipFile(zip_path, "w", compression=compression, **zip_kwargs) as archive:
-            for file_path in iter_sequence_files(seq_dir):
+            # Sort files to ensure consistent archive ordering (matches compute_state)
+            files = sorted(
+                iter_sequence_files(seq_dir),
+                key=lambda p: p.relative_to(seq_dir).as_posix(),
+            )
+            for file_path in files:
                 archive.write(file_path, arcname=file_path.relative_to(seq_dir).as_posix())
         return
@@ -767,7 +850,7 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     )
 
 
-def expand_sequence(zip_path: Path, seq_state: dict) -> None:
+def expand_sequence(zip_path: Path, seq_state: dict, *, verbose: bool = False) -> None:
     target_dir = sequence_dir_for(zip_path)
     if target_dir.exists():
         shutil.rmtree(target_dir)
@@ -783,6 +866,7 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
         SEVEN_Z_EXE,
         "x",
         "-y",
+        "-mtc=on",  # Preserve timestamps during extraction
         str(zip_path),
         f"-o{target_dir}",
     ]
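A standalone sketch of the extraction call assembled above, assuming SEVEN_Z_EXE resolves to a working 7z binary (the -mtc=on switch is taken verbatim from this diff; everything else is standard 7z usage):

    import subprocess
    from pathlib import Path

    def extract_archive(seven_z_exe: str, zip_path: Path, target_dir: Path) -> None:
        # "-y" answers prompts automatically; "-o<dir>" sets the output
        # directory and takes no space before the path.
        cmd = [seven_z_exe, "x", "-y", "-mtc=on", str(zip_path), f"-o{target_dir}"]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"7z extraction failed: {result.stderr.strip()}")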
@@ -810,10 +894,68 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
             f"Config zipper value: {CONFIG.get('zipper', 'not set')!r}"
         )
 
-    for entry in seq_state.get("files", []):
+    # Restore timestamps from stored state
+    # Windows NTFS supports 100-nanosecond precision, so we need to handle rounding
+    is_windows = platform.system() == "Windows"
+    restored_count = 0
+    failed_count = 0
+    mismatch_count = 0
+
+    # Process files in sorted order to match state ordering
+    files_to_restore = sorted(
+        seq_state.get("files", []),
+        key=lambda e: e["path"]
+    )
+    for entry in files_to_restore:
         file_path = target_dir / entry["path"]
-        if file_path.exists():
-            os.utime(file_path, ns=(entry["mtime_ns"], entry["mtime_ns"]))
+        if not file_path.exists():
+            if verbose:
+                log("expand", f"Warning: File not found after extraction: {entry['path']}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+            continue
+
+        stored_mtime_ns = entry["mtime_ns"]
+        # On Windows, round to 100-nanosecond boundaries (NTFS precision)
+        if is_windows:
+            # NTFS FileTime uses 100-nanosecond intervals
+            stored_mtime_ns = (stored_mtime_ns // 100) * 100
+
+        try:
+            # Restore timestamp
+            os.utime(file_path, ns=(stored_mtime_ns, stored_mtime_ns))
+            # Verify the timestamp was set correctly (within filesystem precision tolerance)
+            actual_stat = file_path.stat()
+            actual_mtime_ns = actual_stat.st_mtime_ns
+            # On Windows, compare at 100-nanosecond precision
+            if is_windows:
+                actual_mtime_ns = (actual_mtime_ns // 100) * 100
+                stored_mtime_ns_rounded = (entry["mtime_ns"] // 100) * 100
+                if abs(actual_mtime_ns - stored_mtime_ns_rounded) > 100:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+            else:
+                # On other systems, allow small tolerance for filesystem precision
+                tolerance_ns = 1000  # 1 microsecond tolerance
+                if abs(actual_mtime_ns - entry["mtime_ns"]) > tolerance_ns:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+        except OSError as e:
+            if verbose:
+                log("expand", f"Error restoring timestamp for {entry['path']}: {e}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+
+    if verbose and (failed_count > 0 or mismatch_count > 0):
+        log("expand", f"Timestamp restoration: {restored_count} restored, {failed_count} failed, {mismatch_count} mismatched", verbose_only=True, verbose=verbose)
 
 
 def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict, per_job_memory_limit: int | None, worker_count: int, *, verbose: bool) -> Sequence[Path]:
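The restore-and-verify loop above boils down to a small reusable pattern. A condensed, hypothetical helper (not part of this commit) that captures it:

    import os
    import platform
    from pathlib import Path

    def restore_and_verify_mtime(path: Path, mtime_ns: int) -> bool:
        # Truncate to NTFS precision before writing, then read back and
        # compare within the platform's effective tolerance.
        if platform.system() == "Windows":
            mtime_ns = (mtime_ns // 100) * 100
            tolerance_ns = 100
        else:
            tolerance_ns = 1000  # matches the 1 microsecond tolerance above
        os.utime(path, ns=(mtime_ns, mtime_ns))
        return abs(os.stat(path).st_mtime_ns - mtime_ns) <= tolerance_ns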
@@ -825,7 +967,7 @@ def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict
 def process_expand(zip_path: Path, state: dict, *, verbose: bool) -> None:
     log("expand", f"{zip_path} -> {sequence_dir_for(zip_path)}", verbose_only=True, verbose=verbose)
-    expand_sequence(zip_path, state)
+    expand_sequence(zip_path, state, verbose=verbose)
 
 
 def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
@@ -851,6 +993,15 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
         # Quick check: if archive exists, load stored state first (fast)
         stored_state = load_state(state_path)
 
+        # Debug: log if stored state is missing
+        if stored_state is None:
+            if zip_path.exists():
+                if verbose:
+                    log("scan", f"Warning: {rel} archive exists but no stored state file found at {state_path}", verbose_only=True, verbose=verbose)
+            else:
+                if verbose:
+                    log("scan", f"Info: {rel} no archive or state file (will create new)", verbose_only=True, verbose=verbose)
+
         # Check if we need to upgrade from .zip to .7z
         old_zip_path = None
         old_stored_state = None
@@ -894,7 +1045,17 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             continue
 
         # Check if state changed
-        if stored_state is not None and not state_changed(seq_state, stored_state):
+        changed = True  # assume changed until a stored state proves otherwise
+        if stored_state is None:
+            # No stored state - will need to create archive
+            if verbose:
+                log("scan", f"{rel} no stored state found - will create archive", verbose_only=True, verbose=verbose)
+        else:
+            # Check if state changed and log first few mismatches
+            changed = state_changed(seq_state, stored_state, verbose=verbose)
+            if changed and verbose and queued < 3:
+                # Show detailed diff for first few sequences
+                log("scan", f"{rel} state changed - checking details...", verbose_only=False, verbose=verbose)
+        if not changed:
             # Metadata matches stored state
             state_skipped += 1
             if state_skipped <= 5:
@@ -913,6 +1074,7 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             else:
                 # No archive exists, but state matches (shouldn't happen, but be safe)
                 continue
+            continue
 
         work_items.append((seq_dir, zip_path, state_path, seq_state))
         queued += 1
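Taken together, the commit enforces one round-trip property: archiving a sequence, expanding it, and recomputing state must report no change. A pytest-style sketch of that invariant (hypothetical; zip_sequence's full parameter list is truncated in the hunk headers above, so the call here is abbreviated and may need the extra worker/memory arguments):

    def test_roundtrip_reports_no_change(tmp_path):
        seq_dir = tmp_path / "SEQ_0010"            # hypothetical layout
        seq_dir.mkdir()
        (seq_dir / "frame_0001.exr").write_bytes(b"\x00" * 64)
        state = compute_state(seq_dir)
        zip_path = archive_path_for(seq_dir)
        zip_sequence(seq_dir, zip_path, None)      # abbreviated arguments
        expand_sequence(zip_path, state)
        assert state_changed(compute_state(seq_dir), state) is False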