Fix timestamp normalization to prevent false change detection after unzip
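The sequence state records each file's size and st_mtime_ns. NTFS stores file times in 100-nanosecond ticks, so timestamps restored with os.utime() during expansion and timestamps read back with stat() can disagree below that tick, and the next scan then flags every extracted file as changed and re-archives it. This commit truncates mtimes to the 100-nanosecond boundary wherever states are computed, loaded, or compared, restores timestamps from the stored state after extraction, and adds verbose logging of the first few mismatches.

A minimal sketch of the normalization idea (illustration only, not code from the diff below; the helper name is made up):

    NTFS_TICK_NS = 100  # NTFS FILETIME resolution: 100-nanosecond intervals

    def normalize_mtime_ns(mtime_ns: int, tick_ns: int = NTFS_TICK_NS) -> int:
        """Truncate a nanosecond timestamp to the filesystem tick size."""
        return (mtime_ns // tick_ns) * tick_ns

    # A timestamp captured before zipping...
    stored = 1_700_000_000_123_456_789
    # ...can come back truncated to the NTFS tick after os.utime() + stat():
    restored = normalize_mtime_ns(stored)

    assert stored != restored                                          # raw compare: spurious "change"
    assert normalize_mtime_ns(stored) == normalize_mtime_ns(restored)  # normalized compare: stable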
zip_sequences.py (192)
@@ -530,13 +530,23 @@ def compute_state(seq_dir: Path) -> dict:
         iter_sequence_files(seq_dir),
         key=lambda p: p.relative_to(seq_dir).as_posix(),
     )
+    # On Windows, normalize timestamps to 100-nanosecond boundaries (NTFS precision)
+    # to match what we do when restoring timestamps
+    is_windows = platform.system() == "Windows"
+
     for file_path in files:
         stat = file_path.stat()
+        mtime_ns = stat.st_mtime_ns
+
+        # Normalize to filesystem precision (Windows NTFS uses 100-nanosecond intervals)
+        if is_windows:
+            mtime_ns = (mtime_ns // 100) * 100
+
         entries.append(
             {
                 "path": file_path.relative_to(seq_dir).as_posix(),
                 "size": stat.st_size,
-                "mtime_ns": stat.st_mtime_ns,
+                "mtime_ns": mtime_ns,
             }
         )
     return {"files": entries}
@@ -552,15 +562,81 @@ def load_state(state_path: Path) -> dict | None:
     if not state_path.exists():
         return None
     try:
-        return json.loads(state_path.read_text())
+        state = json.loads(state_path.read_text())
+        # Normalize timestamps in loaded state to ensure consistency
+        # This handles state files created before normalization was added
+        is_windows = platform.system() == "Windows"
+        if is_windows and "files" in state:
+            for entry in state.get("files", []):
+                if "mtime_ns" in entry:
+                    # Normalize to 100-nanosecond boundaries (NTFS precision)
+                    entry["mtime_ns"] = (entry["mtime_ns"] // 100) * 100
+        return state
     except json.JSONDecodeError:
         return None


-def state_changed(seq_state: dict, stored_state: dict | None) -> bool:
+def state_changed(seq_state: dict, stored_state: dict | None, *, verbose: bool = False) -> bool:
     if stored_state is None:
-        return True
-    return seq_state != stored_state
+        if verbose:
+            log("scan", "State changed: no stored state found", verbose_only=True, verbose=verbose)
+        return True
+
+    # Normalize timestamps in both states for comparison
+    # On Windows, normalize to 100-nanosecond boundaries (NTFS precision)
+    is_windows = platform.system() == "Windows"
+
+    def normalize_state(state: dict) -> dict:
+        """Normalize timestamps in state to filesystem precision."""
+        normalized = {"files": []}
+        for entry in state.get("files", []):
+            mtime_ns = entry["mtime_ns"]
+            if is_windows:
+                mtime_ns = (mtime_ns // 100) * 100
+            normalized["files"].append({
+                "path": entry["path"],
+                "size": entry["size"],
+                "mtime_ns": mtime_ns,
+            })
+        return normalized
+
+    # Compare normalized states
+    normalized_seq = normalize_state(seq_state)
+    normalized_stored = normalize_state(stored_state)
+
+    if normalized_seq == normalized_stored:
+        return False
+
+    # States differ - log diagnostic info if verbose
+    if verbose:
+        seq_files = {f["path"]: f for f in normalized_seq["files"]}
+        stored_files = {f["path"]: f for f in normalized_stored["files"]}
+
+        seq_paths = set(seq_files.keys())
+        stored_paths = set(stored_files.keys())
+
+        if seq_paths != stored_paths:
+            missing = stored_paths - seq_paths
+            extra = seq_paths - stored_paths
+            if missing:
+                log("scan", f"State diff: files in stored but not current: {sorted(missing)[:5]}", verbose_only=True, verbose=verbose)
+            if extra:
+                log("scan", f"State diff: files in current but not stored: {sorted(extra)[:5]}", verbose_only=True, verbose=verbose)
+        else:
+            # Same files, check for differences
+            diffs = []
+            for path in sorted(seq_paths)[:5]:  # Check first 5 differences
+                seq_file = seq_files[path]
+                stored_file = stored_files[path]
+                if seq_file != stored_file:
+                    if seq_file["size"] != stored_file["size"]:
+                        diffs.append(f"{path}: size {seq_file['size']} vs {stored_file['size']}")
+                    elif seq_file["mtime_ns"] != stored_file["mtime_ns"]:
+                        diffs.append(f"{path}: mtime {seq_file['mtime_ns']} vs {stored_file['mtime_ns']} (diff: {abs(seq_file['mtime_ns'] - stored_file['mtime_ns'])})")
+            if diffs:
+                log("scan", f"State diff: {diffs[0]}", verbose_only=True, verbose=verbose)
+
+    return True


 def archive_path_for(seq_dir: Path) -> Path:
@@ -606,10 +682,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     old_state_path.unlink(missing_ok=True)

     # Build list of files to archive with relative paths
-    file_list = []
-    for file_path in iter_sequence_files(seq_dir):
-        rel_path = file_path.relative_to(seq_dir).as_posix()
-        file_list.append(rel_path)
+    # Sort files to ensure consistent archive ordering (matches compute_state)
+    files = sorted(
+        iter_sequence_files(seq_dir),
+        key=lambda p: p.relative_to(seq_dir).as_posix(),
+    )
+    file_list = [file_path.relative_to(seq_dir).as_posix() for file_path in files]

     if not file_list:
         raise RuntimeError(f"No files found to archive in {seq_dir}")
@@ -755,7 +833,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
         zip_kwargs = {"compresslevel": COMPRESSION_LEVEL}

         with ZipFile(zip_path, "w", compression=compression, **zip_kwargs) as archive:
-            for file_path in iter_sequence_files(seq_dir):
+            # Sort files to ensure consistent archive ordering (matches compute_state)
+            files = sorted(
+                iter_sequence_files(seq_dir),
+                key=lambda p: p.relative_to(seq_dir).as_posix(),
+            )
+            for file_path in files:
                 archive.write(file_path, arcname=file_path.relative_to(seq_dir).as_posix())
         return

@@ -767,7 +850,7 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     )


-def expand_sequence(zip_path: Path, seq_state: dict) -> None:
+def expand_sequence(zip_path: Path, seq_state: dict, *, verbose: bool = False) -> None:
     target_dir = sequence_dir_for(zip_path)
     if target_dir.exists():
         shutil.rmtree(target_dir)
@@ -783,6 +866,7 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
         SEVEN_Z_EXE,
         "x",
         "-y",
+        "-mtc=on",  # Preserve timestamps during extraction
         str(zip_path),
         f"-o{target_dir}",
     ]
@@ -810,10 +894,68 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
             f"Config zipper value: {CONFIG.get('zipper', 'not set')!r}"
         )

-    for entry in seq_state.get("files", []):
+    # Restore timestamps from stored state
+    # Windows NTFS supports 100-nanosecond precision, so we need to handle rounding
+    is_windows = platform.system() == "Windows"
+    restored_count = 0
+    failed_count = 0
+    mismatch_count = 0
+
+    # Process files in sorted order to match state ordering
+    files_to_restore = sorted(
+        seq_state.get("files", []),
+        key=lambda e: e["path"]
+    )
+
+    for entry in files_to_restore:
         file_path = target_dir / entry["path"]
-        if file_path.exists():
-            os.utime(file_path, ns=(entry["mtime_ns"], entry["mtime_ns"]))
+        if not file_path.exists():
+            if verbose:
+                log("expand", f"Warning: File not found after extraction: {entry['path']}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+            continue
+
+        stored_mtime_ns = entry["mtime_ns"]
+
+        # On Windows, round to 100-nanosecond boundaries (NTFS precision)
+        if is_windows:
+            # NTFS FileTime uses 100-nanosecond intervals
+            stored_mtime_ns = (stored_mtime_ns // 100) * 100
+
+        try:
+            # Restore timestamp
+            os.utime(file_path, ns=(stored_mtime_ns, stored_mtime_ns))
+
+            # Verify the timestamp was set correctly (within filesystem precision tolerance)
+            actual_stat = file_path.stat()
+            actual_mtime_ns = actual_stat.st_mtime_ns
+
+            # On Windows, compare at 100-nanosecond precision
+            if is_windows:
+                actual_mtime_ns = (actual_mtime_ns // 100) * 100
+                stored_mtime_ns_rounded = (entry["mtime_ns"] // 100) * 100
+                if abs(actual_mtime_ns - stored_mtime_ns_rounded) > 100:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+            else:
+                # On other systems, allow small tolerance for filesystem precision
+                tolerance_ns = 1000  # 1 microsecond tolerance
+                if abs(actual_mtime_ns - entry["mtime_ns"]) > tolerance_ns:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+        except OSError as e:
+            if verbose:
+                log("expand", f"Error restoring timestamp for {entry['path']}: {e}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+
+    if verbose and (failed_count > 0 or mismatch_count > 0):
+        log("expand", f"Timestamp restoration: {restored_count} restored, {failed_count} failed, {mismatch_count} mismatched", verbose_only=True, verbose=verbose)


 def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict, per_job_memory_limit: int | None, worker_count: int, *, verbose: bool) -> Sequence[Path]:
@@ -825,7 +967,7 @@ def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict

 def process_expand(zip_path: Path, state: dict, *, verbose: bool) -> None:
     log("expand", f"{zip_path} -> {sequence_dir_for(zip_path)}", verbose_only=True, verbose=verbose)
-    expand_sequence(zip_path, state)
+    expand_sequence(zip_path, state, verbose=verbose)


 def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
@@ -851,6 +993,15 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
         # Quick check: if archive exists, load stored state first (fast)
         stored_state = load_state(state_path)

+        # Debug: log if stored state is missing
+        if stored_state is None:
+            if zip_path.exists():
+                if verbose:
+                    log("scan", f"Warning: {rel} archive exists but no stored state file found at {state_path}", verbose_only=True, verbose=verbose)
+            else:
+                if verbose:
+                    log("scan", f"Info: {rel} no archive or state file (will create new)", verbose_only=True, verbose=verbose)
+
         # Check if we need to upgrade from .zip to .7z
         old_zip_path = None
         old_stored_state = None
@@ -894,7 +1045,17 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             continue

         # Check if state changed
-        if stored_state is not None and not state_changed(seq_state, stored_state):
-            # Metadata matches stored state
-            state_skipped += 1
-            if state_skipped <= 5:
+        if stored_state is None:
+            # No stored state - will need to create archive
+            if verbose:
+                log("scan", f"{rel} no stored state found - will create archive", verbose_only=True, verbose=verbose)
+        else:
+            # Check if state changed and log first few mismatches
+            changed = state_changed(seq_state, stored_state, verbose=verbose)
+            if changed and verbose and queued < 3:
+                # Show detailed diff for first few sequences
+                log("scan", f"{rel} state changed - checking details...", verbose_only=False, verbose=verbose)
+            if not changed:
+                # Metadata matches stored state
+                state_skipped += 1
+                if state_skipped <= 5:
@@ -913,6 +1074,7 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
-            else:
-                # No archive exists, but state matches (shouldn't happen, but be safe)
-                continue
+                else:
+                    # No archive exists, but state matches (shouldn't happen, but be safe)
+                    continue
+                continue

         work_items.append((seq_dir, zip_path, state_path, seq_state))
         queued += 1