Fix timestamp normalization to prevent false change detection after unzip
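The sequence state records each file's size and st_mtime_ns. NTFS stores file times in 100-nanosecond ticks, so timestamps restored with os.utime() during expansion and timestamps read back with stat() can disagree below that tick, and the next scan then flags every extracted file as changed and re-archives it. This commit truncates mtimes to the 100-nanosecond boundary wherever states are computed, loaded, or compared, restores timestamps from the stored state after extraction, and adds verbose logging of the first few mismatches.

A minimal sketch of the normalization idea (illustration only, not code from the diff below; the helper name is made up):

    NTFS_TICK_NS = 100  # NTFS FILETIME resolution: 100-nanosecond intervals

    def normalize_mtime_ns(mtime_ns: int, tick_ns: int = NTFS_TICK_NS) -> int:
        """Truncate a nanosecond timestamp to the filesystem tick size."""
        return (mtime_ns // tick_ns) * tick_ns

    # A timestamp captured before zipping...
    stored = 1_700_000_000_123_456_789
    # ...can come back truncated to the NTFS tick after os.utime() + stat():
    restored = normalize_mtime_ns(stored)

    assert stored != restored                                          # raw compare: spurious "change"
    assert normalize_mtime_ns(stored) == normalize_mtime_ns(restored)  # normalized compare: stable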
zip_sequences.py (192)
@@ -530,13 +530,23 @@ def compute_state(seq_dir: Path) -> dict:
         iter_sequence_files(seq_dir),
         key=lambda p: p.relative_to(seq_dir).as_posix(),
     )
+    # On Windows, normalize timestamps to 100-nanosecond boundaries (NTFS precision)
+    # to match what we do when restoring timestamps
+    is_windows = platform.system() == "Windows"
+
     for file_path in files:
         stat = file_path.stat()
+        mtime_ns = stat.st_mtime_ns
+
+        # Normalize to filesystem precision (Windows NTFS uses 100-nanosecond intervals)
+        if is_windows:
+            mtime_ns = (mtime_ns // 100) * 100
+
         entries.append(
             {
                 "path": file_path.relative_to(seq_dir).as_posix(),
                 "size": stat.st_size,
-                "mtime_ns": stat.st_mtime_ns,
+                "mtime_ns": mtime_ns,
             }
         )
     return {"files": entries}
@@ -552,15 +562,81 @@ def load_state(state_path: Path) -> dict | None:
     if not state_path.exists():
         return None
     try:
-        return json.loads(state_path.read_text())
+        state = json.loads(state_path.read_text())
+        # Normalize timestamps in loaded state to ensure consistency
+        # This handles state files created before normalization was added
+        is_windows = platform.system() == "Windows"
+        if is_windows and "files" in state:
+            for entry in state.get("files", []):
+                if "mtime_ns" in entry:
+                    # Normalize to 100-nanosecond boundaries (NTFS precision)
+                    entry["mtime_ns"] = (entry["mtime_ns"] // 100) * 100
+        return state
     except json.JSONDecodeError:
         return None


-def state_changed(seq_state: dict, stored_state: dict | None) -> bool:
+def state_changed(seq_state: dict, stored_state: dict | None, *, verbose: bool = False) -> bool:
     if stored_state is None:
-        return True
-    return seq_state != stored_state
+        if verbose:
+            log("scan", "State changed: no stored state found", verbose_only=True, verbose=verbose)
+        return True
+
+    # Normalize timestamps in both states for comparison
+    # On Windows, normalize to 100-nanosecond boundaries (NTFS precision)
+    is_windows = platform.system() == "Windows"
+
+    def normalize_state(state: dict) -> dict:
+        """Normalize timestamps in state to filesystem precision."""
+        normalized = {"files": []}
+        for entry in state.get("files", []):
+            mtime_ns = entry["mtime_ns"]
+            if is_windows:
+                mtime_ns = (mtime_ns // 100) * 100
+            normalized["files"].append({
+                "path": entry["path"],
+                "size": entry["size"],
+                "mtime_ns": mtime_ns,
+            })
+        return normalized
+
+    # Compare normalized states
+    normalized_seq = normalize_state(seq_state)
+    normalized_stored = normalize_state(stored_state)
+
+    if normalized_seq == normalized_stored:
+        return False
+
+    # States differ - log diagnostic info if verbose
+    if verbose:
+        seq_files = {f["path"]: f for f in normalized_seq["files"]}
+        stored_files = {f["path"]: f for f in normalized_stored["files"]}
+
+        seq_paths = set(seq_files.keys())
+        stored_paths = set(stored_files.keys())
+
+        if seq_paths != stored_paths:
+            missing = stored_paths - seq_paths
+            extra = seq_paths - stored_paths
+            if missing:
+                log("scan", f"State diff: files in stored but not current: {sorted(missing)[:5]}", verbose_only=True, verbose=verbose)
+            if extra:
+                log("scan", f"State diff: files in current but not stored: {sorted(extra)[:5]}", verbose_only=True, verbose=verbose)
+        else:
+            # Same files, check for differences
+            diffs = []
+            for path in sorted(seq_paths)[:5]:  # Check first 5 differences
+                seq_file = seq_files[path]
+                stored_file = stored_files[path]
+                if seq_file != stored_file:
+                    if seq_file["size"] != stored_file["size"]:
+                        diffs.append(f"{path}: size {seq_file['size']} vs {stored_file['size']}")
+                    elif seq_file["mtime_ns"] != stored_file["mtime_ns"]:
+                        diffs.append(f"{path}: mtime {seq_file['mtime_ns']} vs {stored_file['mtime_ns']} (diff: {abs(seq_file['mtime_ns'] - stored_file['mtime_ns'])})")
+            if diffs:
+                log("scan", f"State diff: {diffs[0]}", verbose_only=True, verbose=verbose)
+
+    return True


 def archive_path_for(seq_dir: Path) -> Path:
@@ -606,10 +682,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     old_state_path.unlink(missing_ok=True)

     # Build list of files to archive with relative paths
-    file_list = []
-    for file_path in iter_sequence_files(seq_dir):
-        rel_path = file_path.relative_to(seq_dir).as_posix()
-        file_list.append(rel_path)
+    # Sort files to ensure consistent archive ordering (matches compute_state)
+    files = sorted(
+        iter_sequence_files(seq_dir),
+        key=lambda p: p.relative_to(seq_dir).as_posix(),
+    )
+    file_list = [file_path.relative_to(seq_dir).as_posix() for file_path in files]

     if not file_list:
         raise RuntimeError(f"No files found to archive in {seq_dir}")
@@ -755,7 +833,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
         zip_kwargs = {"compresslevel": COMPRESSION_LEVEL}

         with ZipFile(zip_path, "w", compression=compression, **zip_kwargs) as archive:
-            for file_path in iter_sequence_files(seq_dir):
+            # Sort files to ensure consistent archive ordering (matches compute_state)
+            files = sorted(
+                iter_sequence_files(seq_dir),
+                key=lambda p: p.relative_to(seq_dir).as_posix(),
+            )
+            for file_path in files:
                 archive.write(file_path, arcname=file_path.relative_to(seq_dir).as_posix())
         return

@@ -767,7 +850,7 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     )


-def expand_sequence(zip_path: Path, seq_state: dict) -> None:
+def expand_sequence(zip_path: Path, seq_state: dict, *, verbose: bool = False) -> None:
     target_dir = sequence_dir_for(zip_path)
     if target_dir.exists():
         shutil.rmtree(target_dir)
@@ -783,6 +866,7 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
         SEVEN_Z_EXE,
         "x",
         "-y",
+        "-mtc=on",  # Preserve timestamps during extraction
         str(zip_path),
         f"-o{target_dir}",
     ]
@@ -810,10 +894,68 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
             f"Config zipper value: {CONFIG.get('zipper', 'not set')!r}"
         )

-    for entry in seq_state.get("files", []):
+    # Restore timestamps from stored state
+    # Windows NTFS supports 100-nanosecond precision, so we need to handle rounding
+    is_windows = platform.system() == "Windows"
+    restored_count = 0
+    failed_count = 0
+    mismatch_count = 0
+
+    # Process files in sorted order to match state ordering
+    files_to_restore = sorted(
+        seq_state.get("files", []),
+        key=lambda e: e["path"]
+    )
+
+    for entry in files_to_restore:
         file_path = target_dir / entry["path"]
-        if file_path.exists():
-            os.utime(file_path, ns=(entry["mtime_ns"], entry["mtime_ns"]))
+        if not file_path.exists():
+            if verbose:
+                log("expand", f"Warning: File not found after extraction: {entry['path']}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+            continue
+
+        stored_mtime_ns = entry["mtime_ns"]
+
+        # On Windows, round to 100-nanosecond boundaries (NTFS precision)
+        if is_windows:
+            # NTFS FileTime uses 100-nanosecond intervals
+            stored_mtime_ns = (stored_mtime_ns // 100) * 100
+
+        try:
+            # Restore timestamp
+            os.utime(file_path, ns=(stored_mtime_ns, stored_mtime_ns))
+
+            # Verify the timestamp was set correctly (within filesystem precision tolerance)
+            actual_stat = file_path.stat()
+            actual_mtime_ns = actual_stat.st_mtime_ns
+
+            # On Windows, compare at 100-nanosecond precision
+            if is_windows:
+                actual_mtime_ns = (actual_mtime_ns // 100) * 100
+                stored_mtime_ns_rounded = (entry["mtime_ns"] // 100) * 100
+                if abs(actual_mtime_ns - stored_mtime_ns_rounded) > 100:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+            else:
+                # On other systems, allow small tolerance for filesystem precision
+                tolerance_ns = 1000  # 1 microsecond tolerance
+                if abs(actual_mtime_ns - entry["mtime_ns"]) > tolerance_ns:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+        except OSError as e:
+            if verbose:
+                log("expand", f"Error restoring timestamp for {entry['path']}: {e}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+
+    if verbose and (failed_count > 0 or mismatch_count > 0):
+        log("expand", f"Timestamp restoration: {restored_count} restored, {failed_count} failed, {mismatch_count} mismatched", verbose_only=True, verbose=verbose)


 def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict, per_job_memory_limit: int | None, worker_count: int, *, verbose: bool) -> Sequence[Path]:
@@ -825,7 +967,7 @@ def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict

 def process_expand(zip_path: Path, state: dict, *, verbose: bool) -> None:
     log("expand", f"{zip_path} -> {sequence_dir_for(zip_path)}", verbose_only=True, verbose=verbose)
-    expand_sequence(zip_path, state)
+    expand_sequence(zip_path, state, verbose=verbose)


 def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
@@ -851,6 +993,15 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
         # Quick check: if archive exists, load stored state first (fast)
         stored_state = load_state(state_path)

+        # Debug: log if stored state is missing
+        if stored_state is None:
+            if zip_path.exists():
+                if verbose:
+                    log("scan", f"Warning: {rel} archive exists but no stored state file found at {state_path}", verbose_only=True, verbose=verbose)
+            else:
+                if verbose:
+                    log("scan", f"Info: {rel} no archive or state file (will create new)", verbose_only=True, verbose=verbose)
+
         # Check if we need to upgrade from .zip to .7z
         old_zip_path = None
         old_stored_state = None
@@ -894,7 +1045,17 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             continue

         # Check if state changed
-        if stored_state is not None and not state_changed(seq_state, stored_state):
-            # Metadata matches stored state
-            state_skipped += 1
-            if state_skipped <= 5:
+        if stored_state is None:
+            # No stored state - will need to create archive
+            if verbose:
+                log("scan", f"{rel} no stored state found - will create archive", verbose_only=True, verbose=verbose)
+        else:
+            # Check if state changed and log first few mismatches
+            changed = state_changed(seq_state, stored_state, verbose=verbose)
+            if changed and verbose and queued < 3:
+                # Show detailed diff for first few sequences
+                log("scan", f"{rel} state changed - checking details...", verbose_only=False, verbose=verbose)
+            if not changed:
+                # Metadata matches stored state
+                state_skipped += 1
+                if state_skipped <= 5:
@@ -913,6 +1074,7 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
-            else:
-                # No archive exists, but state matches (shouldn't happen, but be safe)
-                continue
+                else:
+                    # No archive exists, but state matches (shouldn't happen, but be safe)
+                    continue
+                continue

         work_items.append((seq_dir, zip_path, state_path, seq_state))
         queued += 1