Fix timestamp normalization to prevent false change detection after unzip
zip_sequences.py (192 changed lines)
@@ -530,13 +530,23 @@ def compute_state(seq_dir: Path) -> dict:
         iter_sequence_files(seq_dir),
         key=lambda p: p.relative_to(seq_dir).as_posix(),
     )
+    # On Windows, normalize timestamps to 100-nanosecond boundaries (NTFS precision)
+    # to match what we do when restoring timestamps
+    is_windows = platform.system() == "Windows"
+
     for file_path in files:
         stat = file_path.stat()
+        mtime_ns = stat.st_mtime_ns
+
+        # Normalize to filesystem precision (Windows NTFS uses 100-nanosecond intervals)
+        if is_windows:
+            mtime_ns = (mtime_ns // 100) * 100
+
         entries.append(
             {
                 "path": file_path.relative_to(seq_dir).as_posix(),
                 "size": stat.st_size,
-                "mtime_ns": stat.st_mtime_ns,
+                "mtime_ns": mtime_ns,
             }
         )
     return {"files": entries}
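
The false positives this commit removes stem from NTFS storing file times as 100-nanosecond FILETIME ticks: a nanosecond mtime recorded in the state file is read back truncated after extraction, so an unnormalized comparison never settles. A minimal sketch of the effect and the fix (the timestamp value is made up):

    # Hypothetical mtime recorded in the state file before zipping.
    recorded_ns = 1_700_000_000_123_456_789

    # What NTFS can actually store: the value truncated to a 100 ns tick.
    ntfs_ns = (recorded_ns // 100) * 100          # ...123_456_700

    assert recorded_ns != ntfs_ns                 # naive compare: false "change"
    assert (recorded_ns // 100) * 100 == (ntfs_ns // 100) * 100  # normalized: stable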
@@ -552,15 +562,81 @@ def load_state(state_path: Path) -> dict | None:
     if not state_path.exists():
         return None
     try:
-        return json.loads(state_path.read_text())
+        state = json.loads(state_path.read_text())
+        # Normalize timestamps in loaded state to ensure consistency
+        # This handles state files created before normalization was added
+        is_windows = platform.system() == "Windows"
+        if is_windows and "files" in state:
+            for entry in state.get("files", []):
+                if "mtime_ns" in entry:
+                    # Normalize to 100-nanosecond boundaries (NTFS precision)
+                    entry["mtime_ns"] = (entry["mtime_ns"] // 100) * 100
+        return state
     except json.JSONDecodeError:
         return None
 
 
-def state_changed(seq_state: dict, stored_state: dict | None) -> bool:
+def state_changed(seq_state: dict, stored_state: dict | None, *, verbose: bool = False) -> bool:
     if stored_state is None:
+        if verbose:
+            log("scan", "State changed: no stored state found", verbose_only=True, verbose=verbose)
         return True
+
+    # Normalize timestamps in both states for comparison
+    # On Windows, normalize to 100-nanosecond boundaries (NTFS precision)
+    is_windows = platform.system() == "Windows"
+
+    def normalize_state(state: dict) -> dict:
+        """Normalize timestamps in state to filesystem precision."""
+        normalized = {"files": []}
+        for entry in state.get("files", []):
+            mtime_ns = entry["mtime_ns"]
+            if is_windows:
+                mtime_ns = (mtime_ns // 100) * 100
+            normalized["files"].append({
+                "path": entry["path"],
+                "size": entry["size"],
+                "mtime_ns": mtime_ns,
+            })
+        return normalized
+
+    # Compare normalized states
+    normalized_seq = normalize_state(seq_state)
+    normalized_stored = normalize_state(stored_state)
+
+    if normalized_seq == normalized_stored:
+        return False
+
+    # States differ - log diagnostic info if verbose
+    if verbose:
+        seq_files = {f["path"]: f for f in normalized_seq["files"]}
+        stored_files = {f["path"]: f for f in normalized_stored["files"]}
+
+        seq_paths = set(seq_files.keys())
+        stored_paths = set(stored_files.keys())
+
+        if seq_paths != stored_paths:
+            missing = stored_paths - seq_paths
+            extra = seq_paths - stored_paths
+            if missing:
+                log("scan", f"State diff: files in stored but not current: {sorted(missing)[:5]}", verbose_only=True, verbose=verbose)
+            if extra:
+                log("scan", f"State diff: files in current but not stored: {sorted(extra)[:5]}", verbose_only=True, verbose=verbose)
+        else:
+            # Same files, check for differences
+            diffs = []
+            for path in sorted(seq_paths)[:5]:  # Check first 5 differences
+                seq_file = seq_files[path]
+                stored_file = stored_files[path]
+                if seq_file != stored_file:
+                    if seq_file["size"] != stored_file["size"]:
+                        diffs.append(f"{path}: size {seq_file['size']} vs {stored_file['size']}")
+                    elif seq_file["mtime_ns"] != stored_file["mtime_ns"]:
+                        diffs.append(f"{path}: mtime {seq_file['mtime_ns']} vs {stored_file['mtime_ns']} (diff: {abs(seq_file['mtime_ns'] - stored_file['mtime_ns'])})")
+            if diffs:
+                log("scan", f"State diff: {diffs[0]}", verbose_only=True, verbose=verbose)
+
+    return True
-    return seq_state != stored_state
 
 
 def archive_path_for(seq_dir: Path) -> Path:
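
To see what the normalized comparison buys, consider two states that differ only below NTFS precision (paths, sizes, and timestamps here are hypothetical):

    current = {"files": [{"path": "frame_0001.exr", "size": 2048,
                          "mtime_ns": 1_700_000_000_000_000_042}]}
    stored = {"files": [{"path": "frame_0001.exr", "size": 2048,
                         "mtime_ns": 1_700_000_000_000_000_000}]}

    # On Windows both mtimes truncate to ...000_000, so
    # state_changed(current, stored) returns False and no re-zip is queued;
    # a genuine size, path, or coarser mtime difference still returns True.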
@@ -606,10 +682,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     old_state_path.unlink(missing_ok=True)
 
-    # Build list of files to archive with relative paths
-    file_list = []
-    for file_path in iter_sequence_files(seq_dir):
-        rel_path = file_path.relative_to(seq_dir).as_posix()
-        file_list.append(rel_path)
+    # Sort files to ensure consistent archive ordering (matches compute_state)
+    files = sorted(
+        iter_sequence_files(seq_dir),
+        key=lambda p: p.relative_to(seq_dir).as_posix(),
+    )
+    file_list = [file_path.relative_to(seq_dir).as_posix() for file_path in files]
 
     if not file_list:
         raise RuntimeError(f"No files found to archive in {seq_dir}")
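
Sorting by the POSIX-style relative path compares plain strings rather than OS-specific path objects, so the archive order is identical on every platform. A small illustration (example paths are made up):

    from pathlib import Path

    seq_dir = Path("seq")
    files = [seq_dir / "b" / "0002.exr", seq_dir / "a" / "0010.exr", seq_dir / "a" / "0002.exr"]

    ordered = sorted(files, key=lambda p: p.relative_to(seq_dir).as_posix())
    print([p.relative_to(seq_dir).as_posix() for p in ordered])
    # ['a/0002.exr', 'a/0010.exr', 'b/0002.exr'] -- same order on Windows and POSIX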
@@ -755,7 +833,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
         zip_kwargs = {"compresslevel": COMPRESSION_LEVEL}
 
         with ZipFile(zip_path, "w", compression=compression, **zip_kwargs) as archive:
-            for file_path in iter_sequence_files(seq_dir):
+            # Sort files to ensure consistent archive ordering (matches compute_state)
+            files = sorted(
+                iter_sequence_files(seq_dir),
+                key=lambda p: p.relative_to(seq_dir).as_posix(),
+            )
+            for file_path in files:
                 archive.write(file_path, arcname=file_path.relative_to(seq_dir).as_posix())
         return
 
@@ -767,7 +850,7 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     )
 
 
-def expand_sequence(zip_path: Path, seq_state: dict) -> None:
+def expand_sequence(zip_path: Path, seq_state: dict, *, verbose: bool = False) -> None:
     target_dir = sequence_dir_for(zip_path)
     if target_dir.exists():
         shutil.rmtree(target_dir)
@@ -783,6 +866,7 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
         SEVEN_Z_EXE,
         "x",
         "-y",
+        "-mtc=on",  # Preserve timestamps during extraction
         str(zip_path),
         f"-o{target_dir}",
     ]
@@ -810,10 +894,68 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
             f"Config zipper value: {CONFIG.get('zipper', 'not set')!r}"
         )
 
-    for entry in seq_state.get("files", []):
+    # Restore timestamps from stored state
+    # Windows NTFS supports 100-nanosecond precision, so we need to handle rounding
+    is_windows = platform.system() == "Windows"
+    restored_count = 0
+    failed_count = 0
+    mismatch_count = 0
+
+    # Process files in sorted order to match state ordering
+    files_to_restore = sorted(
+        seq_state.get("files", []),
+        key=lambda e: e["path"]
+    )
+
+    for entry in files_to_restore:
         file_path = target_dir / entry["path"]
-        if file_path.exists():
-            os.utime(file_path, ns=(entry["mtime_ns"], entry["mtime_ns"]))
+        if not file_path.exists():
+            if verbose:
+                log("expand", f"Warning: File not found after extraction: {entry['path']}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+            continue
+
+        stored_mtime_ns = entry["mtime_ns"]
+
+        # On Windows, round to 100-nanosecond boundaries (NTFS precision)
+        if is_windows:
+            # NTFS FileTime uses 100-nanosecond intervals
+            stored_mtime_ns = (stored_mtime_ns // 100) * 100
+
+        try:
+            # Restore timestamp
+            os.utime(file_path, ns=(stored_mtime_ns, stored_mtime_ns))
+
+            # Verify the timestamp was set correctly (within filesystem precision tolerance)
+            actual_stat = file_path.stat()
+            actual_mtime_ns = actual_stat.st_mtime_ns
+
+            # On Windows, compare at 100-nanosecond precision
+            if is_windows:
+                actual_mtime_ns = (actual_mtime_ns // 100) * 100
+                stored_mtime_ns_rounded = (entry["mtime_ns"] // 100) * 100
+                if abs(actual_mtime_ns - stored_mtime_ns_rounded) > 100:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+            else:
+                # On other systems, allow small tolerance for filesystem precision
+                tolerance_ns = 1000  # 1 microsecond tolerance
+                if abs(actual_mtime_ns - entry["mtime_ns"]) > tolerance_ns:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+        except OSError as e:
+            if verbose:
+                log("expand", f"Error restoring timestamp for {entry['path']}: {e}", verbose_only=True, verbose=verbose)
+            failed_count += 1
 
+    if verbose and (failed_count > 0 or mismatch_count > 0):
+        log("expand", f"Timestamp restoration: {restored_count} restored, {failed_count} failed, {mismatch_count} mismatched", verbose_only=True, verbose=verbose)
 
 
 def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict, per_job_memory_limit: int | None, worker_count: int, *, verbose: bool) -> Sequence[Path]:
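
The restore-and-verify round trip above can be exercised in isolation. A self-contained sketch with the same tolerances (100 ns on Windows, 1 µs elsewhere); it assumes a filesystem with at least microsecond timestamp precision, so coarser filesystems such as FAT would need a larger tolerance:

    import os
    import platform
    import tempfile

    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        path = tmp.name  # keep the file around after closing it

    target_ns = 1_700_000_000_123_456_789
    if platform.system() == "Windows":
        target_ns = (target_ns // 100) * 100  # truncate to an NTFS tick

    os.utime(path, ns=(target_ns, target_ns))  # set atime and mtime
    actual_ns = os.stat(path).st_mtime_ns

    tolerance = 100 if platform.system() == "Windows" else 1000
    assert abs(actual_ns - target_ns) <= tolerance
    os.unlink(path)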
@@ -825,7 +967,7 @@ def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict
 
 def process_expand(zip_path: Path, state: dict, *, verbose: bool) -> None:
     log("expand", f"{zip_path} -> {sequence_dir_for(zip_path)}", verbose_only=True, verbose=verbose)
-    expand_sequence(zip_path, state)
+    expand_sequence(zip_path, state, verbose=verbose)
 
 
 def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
@@ -851,6 +993,15 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
         # Quick check: if archive exists, load stored state first (fast)
         stored_state = load_state(state_path)
 
+        # Debug: log if stored state is missing
+        if stored_state is None:
+            if zip_path.exists():
+                if verbose:
+                    log("scan", f"Warning: {rel} archive exists but no stored state file found at {state_path}", verbose_only=True, verbose=verbose)
+            else:
+                if verbose:
+                    log("scan", f"Info: {rel} no archive or state file (will create new)", verbose_only=True, verbose=verbose)
+
         # Check if we need to upgrade from .zip to .7z
         old_zip_path = None
         old_stored_state = None
@@ -894,7 +1045,17 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             continue
 
-        # Check if state changed
-        if stored_state is not None and not state_changed(seq_state, stored_state):
+        if stored_state is None:
+            # No stored state - will need to create archive
+            if verbose:
+                log("scan", f"{rel} no stored state found - will create archive", verbose_only=True, verbose=verbose)
+        else:
+            # Check if state changed and log first few mismatches
+            changed = state_changed(seq_state, stored_state, verbose=verbose)
+            if changed and verbose and queued < 3:
+                # Show detailed diff for first few sequences
+                log("scan", f"{rel} state changed - checking details...", verbose_only=False, verbose=verbose)
+            if not changed:
+                # Metadata matches stored state
                 state_skipped += 1
                 if state_skipped <= 5:
@@ -913,6 +1074,7 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
                 else:
                     # No archive exists, but state matches (shouldn't happen, but be safe)
                     continue
+                continue
 
         work_items.append((seq_dir, zip_path, state_path, seq_state))
         queued += 1