Fix timestamp normalization to prevent false change detection after unzip

commit 19e4eb63aa
parent 0287cbd7ef
Author: Nathan
Date:   2025-12-10 12:06:02 -07:00

2 changed files with 4813 additions and 37 deletions
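The fix rests on one invariant: an mtime written with os.utime() must compare equal to the value later read back via stat(). NTFS stores timestamps as FILETIME with 100-nanosecond resolution, so any finer-grained value recorded in a state file can never round-trip. A minimal, self-contained sketch of the truncation this commit applies on both the compare and restore paths (file and sample value are invented; assumes a filesystem with at least 100 ns timestamp resolution, e.g. NTFS or ext4):

    import os
    import platform
    import tempfile

    def normalize_mtime_ns(mtime_ns: int) -> int:
        # NTFS FILETIME has 100 ns resolution; truncate so a stored value
        # and the value read back after os.utime() land on the same boundary.
        if platform.system() == "Windows":
            return (mtime_ns // 100) * 100
        return mtime_ns

    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        path = tmp.name
    target = normalize_mtime_ns(1_733_860_562_123_456_789)  # arbitrary sample
    os.utime(path, ns=(target, target))
    assert normalize_mtime_ns(os.stat(path).st_mtime_ns) == target
    os.unlink(path)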


@@ -530,13 +530,23 @@ def compute_state(seq_dir: Path) -> dict:
         iter_sequence_files(seq_dir),
         key=lambda p: p.relative_to(seq_dir).as_posix(),
     )
+    # On Windows, normalize timestamps to 100-nanosecond boundaries (NTFS precision)
+    # to match what we do when restoring timestamps
+    is_windows = platform.system() == "Windows"
     for file_path in files:
         stat = file_path.stat()
+        mtime_ns = stat.st_mtime_ns
+        # Normalize to filesystem precision (Windows NTFS uses 100-nanosecond intervals)
+        if is_windows:
+            mtime_ns = (mtime_ns // 100) * 100
         entries.append(
             {
                 "path": file_path.relative_to(seq_dir).as_posix(),
                 "size": stat.st_size,
-                "mtime_ns": stat.st_mtime_ns,
+                "mtime_ns": mtime_ns,
             }
         )
     return {"files": entries}
@@ -552,15 +562,81 @@ def load_state(state_path: Path) -> dict | None:
     if not state_path.exists():
         return None
     try:
-        return json.loads(state_path.read_text())
+        state = json.loads(state_path.read_text())
+        # Normalize timestamps in loaded state to ensure consistency
+        # This handles state files created before normalization was added
+        is_windows = platform.system() == "Windows"
+        if is_windows and "files" in state:
+            for entry in state.get("files", []):
+                if "mtime_ns" in entry:
+                    # Normalize to 100-nanosecond boundaries (NTFS precision)
+                    entry["mtime_ns"] = (entry["mtime_ns"] // 100) * 100
+        return state
     except json.JSONDecodeError:
         return None
 
 
-def state_changed(seq_state: dict, stored_state: dict | None) -> bool:
+def state_changed(seq_state: dict, stored_state: dict | None, *, verbose: bool = False) -> bool:
     if stored_state is None:
+        if verbose:
+            log("scan", "State changed: no stored state found", verbose_only=True, verbose=verbose)
         return True
-    return seq_state != stored_state
+
+    # Normalize timestamps in both states for comparison
+    # On Windows, normalize to 100-nanosecond boundaries (NTFS precision)
+    is_windows = platform.system() == "Windows"
+
+    def normalize_state(state: dict) -> dict:
+        """Normalize timestamps in state to filesystem precision."""
+        normalized = {"files": []}
+        for entry in state.get("files", []):
+            mtime_ns = entry["mtime_ns"]
+            if is_windows:
+                mtime_ns = (mtime_ns // 100) * 100
+            normalized["files"].append({
+                "path": entry["path"],
+                "size": entry["size"],
+                "mtime_ns": mtime_ns,
+            })
+        return normalized
+
+    # Compare normalized states
+    normalized_seq = normalize_state(seq_state)
+    normalized_stored = normalize_state(stored_state)
+    if normalized_seq == normalized_stored:
+        return False
+
+    # States differ - log diagnostic info if verbose
+    if verbose:
+        seq_files = {f["path"]: f for f in normalized_seq["files"]}
+        stored_files = {f["path"]: f for f in normalized_stored["files"]}
+        seq_paths = set(seq_files.keys())
+        stored_paths = set(stored_files.keys())
+        if seq_paths != stored_paths:
+            missing = stored_paths - seq_paths
+            extra = seq_paths - stored_paths
+            if missing:
+                log("scan", f"State diff: files in stored but not current: {sorted(missing)[:5]}", verbose_only=True, verbose=verbose)
+            if extra:
+                log("scan", f"State diff: files in current but not stored: {sorted(extra)[:5]}", verbose_only=True, verbose=verbose)
+        else:
+            # Same files, so inspect the first few paths for field-level differences
+            diffs = []
+            for path in sorted(seq_paths)[:5]:
+                seq_file = seq_files[path]
+                stored_file = stored_files[path]
+                if seq_file != stored_file:
+                    if seq_file["size"] != stored_file["size"]:
+                        diffs.append(f"{path}: size {seq_file['size']} vs {stored_file['size']}")
+                    elif seq_file["mtime_ns"] != stored_file["mtime_ns"]:
+                        diffs.append(f"{path}: mtime {seq_file['mtime_ns']} vs {stored_file['mtime_ns']} (diff: {abs(seq_file['mtime_ns'] - stored_file['mtime_ns'])})")
+            if diffs:
+                log("scan", f"State diff: {diffs[0]}", verbose_only=True, verbose=verbose)
+    return True
 
 
 def archive_path_for(seq_dir: Path) -> Path:
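The normalization makes state_changed() insensitive to sub-100 ns noise on Windows. A quick sanity check of that contract (hypothetical standalone snippet; the two entries differ only below the 100 ns boundary):

    a = {"files": [{"path": "f.exr", "size": 1, "mtime_ns": 1_000_000_123}]}
    b = {"files": [{"path": "f.exr", "size": 1, "mtime_ns": 1_000_000_199}]}
    # Both truncate to 1_000_000_100 on Windows, so no change is reported;
    # on other platforms the raw values differ and a change is reported.
    print(state_changed(a, b))  # False on Windows, True elsewhere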
@@ -606,10 +682,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
         old_state_path.unlink(missing_ok=True)
 
     # Build list of files to archive with relative paths
-    file_list = []
-    for file_path in iter_sequence_files(seq_dir):
-        rel_path = file_path.relative_to(seq_dir).as_posix()
-        file_list.append(rel_path)
+    # Sort files to ensure consistent archive ordering (matches compute_state)
+    files = sorted(
+        iter_sequence_files(seq_dir),
+        key=lambda p: p.relative_to(seq_dir).as_posix(),
+    )
+    file_list = [file_path.relative_to(seq_dir).as_posix() for file_path in files]
 
     if not file_list:
         raise RuntimeError(f"No files found to archive in {seq_dir}")
@@ -755,7 +833,12 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
             zip_kwargs = {"compresslevel": COMPRESSION_LEVEL}
         with ZipFile(zip_path, "w", compression=compression, **zip_kwargs) as archive:
-            for file_path in iter_sequence_files(seq_dir):
+            # Sort files to ensure consistent archive ordering (matches compute_state)
+            files = sorted(
+                iter_sequence_files(seq_dir),
+                key=lambda p: p.relative_to(seq_dir).as_posix(),
+            )
+            for file_path in files:
                 archive.write(file_path, arcname=file_path.relative_to(seq_dir).as_posix())
         return
@@ -767,7 +850,7 @@ def zip_sequence(seq_dir: Path, zip_path: Path, per_job_memory_limit: int | None
     )
 
 
-def expand_sequence(zip_path: Path, seq_state: dict) -> None:
+def expand_sequence(zip_path: Path, seq_state: dict, *, verbose: bool = False) -> None:
     target_dir = sequence_dir_for(zip_path)
     if target_dir.exists():
         shutil.rmtree(target_dir)
@@ -783,6 +866,7 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
         SEVEN_Z_EXE,
         "x",
         "-y",
+        "-mtc=on",  # Preserve timestamps during extraction
         str(zip_path),
         f"-o{target_dir}",
     ]
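A standalone sketch of the extraction call assembled above, assuming SEVEN_Z_EXE resolves to a working 7z binary (the -mtc=on switch is taken verbatim from this diff; everything else is standard 7z usage):

    import subprocess
    from pathlib import Path

    def extract_archive(seven_z_exe: str, zip_path: Path, target_dir: Path) -> None:
        # "-y" answers prompts automatically; "-o<dir>" sets the output
        # directory and takes no space before the path.
        cmd = [seven_z_exe, "x", "-y", "-mtc=on", str(zip_path), f"-o{target_dir}"]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"7z extraction failed: {result.stderr.strip()}")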
@@ -810,10 +894,68 @@ def expand_sequence(zip_path: Path, seq_state: dict) -> None:
             f"Config zipper value: {CONFIG.get('zipper', 'not set')!r}"
         )
 
-    for entry in seq_state.get("files", []):
+    # Restore timestamps from stored state
+    # Windows NTFS supports 100-nanosecond precision, so we need to handle rounding
+    is_windows = platform.system() == "Windows"
+    restored_count = 0
+    failed_count = 0
+    mismatch_count = 0
+
+    # Process files in sorted order to match state ordering
+    files_to_restore = sorted(
+        seq_state.get("files", []),
+        key=lambda e: e["path"]
+    )
+    for entry in files_to_restore:
         file_path = target_dir / entry["path"]
-        if file_path.exists():
-            os.utime(file_path, ns=(entry["mtime_ns"], entry["mtime_ns"]))
+        if not file_path.exists():
+            if verbose:
+                log("expand", f"Warning: File not found after extraction: {entry['path']}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+            continue
+
+        stored_mtime_ns = entry["mtime_ns"]
+        # On Windows, round to 100-nanosecond boundaries (NTFS precision)
+        if is_windows:
+            # NTFS FileTime uses 100-nanosecond intervals
+            stored_mtime_ns = (stored_mtime_ns // 100) * 100
+
+        try:
+            # Restore timestamp
+            os.utime(file_path, ns=(stored_mtime_ns, stored_mtime_ns))
+            # Verify the timestamp was set correctly (within filesystem precision tolerance)
+            actual_stat = file_path.stat()
+            actual_mtime_ns = actual_stat.st_mtime_ns
+            # On Windows, compare at 100-nanosecond precision
+            if is_windows:
+                actual_mtime_ns = (actual_mtime_ns // 100) * 100
+                stored_mtime_ns_rounded = (entry["mtime_ns"] // 100) * 100
+                if abs(actual_mtime_ns - stored_mtime_ns_rounded) > 100:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+            else:
+                # On other systems, allow small tolerance for filesystem precision
+                tolerance_ns = 1000  # 1 microsecond tolerance
+                if abs(actual_mtime_ns - entry["mtime_ns"]) > tolerance_ns:
+                    if verbose:
+                        log("expand", f"Warning: Timestamp mismatch for {entry['path']}: stored={entry['mtime_ns']}, actual={actual_stat.st_mtime_ns}", verbose_only=True, verbose=verbose)
+                    mismatch_count += 1
+                else:
+                    restored_count += 1
+        except OSError as e:
+            if verbose:
+                log("expand", f"Error restoring timestamp for {entry['path']}: {e}", verbose_only=True, verbose=verbose)
+            failed_count += 1
+
+    if verbose and (failed_count > 0 or mismatch_count > 0):
+        log("expand", f"Timestamp restoration: {restored_count} restored, {failed_count} failed, {mismatch_count} mismatched", verbose_only=True, verbose=verbose)
 
 
 def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict, per_job_memory_limit: int | None, worker_count: int, *, verbose: bool) -> Sequence[Path]:
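The restore-and-verify loop above boils down to a small reusable pattern. A condensed, hypothetical helper (not part of this commit) that captures it:

    import os
    import platform
    from pathlib import Path

    def restore_and_verify_mtime(path: Path, mtime_ns: int) -> bool:
        # Truncate to NTFS precision before writing, then read back and
        # compare within the platform's effective tolerance.
        if platform.system() == "Windows":
            mtime_ns = (mtime_ns // 100) * 100
            tolerance_ns = 100
        else:
            tolerance_ns = 1000  # matches the 1 microsecond tolerance above
        os.utime(path, ns=(mtime_ns, mtime_ns))
        return abs(os.stat(path).st_mtime_ns - mtime_ns) <= tolerance_ns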
@@ -825,7 +967,7 @@ def process_zip(seq_dir: Path, zip_path: Path, state_path: Path, seq_state: dict
 def process_expand(zip_path: Path, state: dict, *, verbose: bool) -> None:
     log("expand", f"{zip_path} -> {sequence_dir_for(zip_path)}", verbose_only=True, verbose=verbose)
-    expand_sequence(zip_path, state)
+    expand_sequence(zip_path, state, verbose=verbose)
 
 
 def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
@@ -851,6 +993,15 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
         # Quick check: if archive exists, load stored state first (fast)
         stored_state = load_state(state_path)
 
+        # Debug: log if stored state is missing
+        if stored_state is None:
+            if zip_path.exists():
+                if verbose:
+                    log("scan", f"Warning: {rel} archive exists but no stored state file found at {state_path}", verbose_only=True, verbose=verbose)
+            else:
+                if verbose:
+                    log("scan", f"Info: {rel} no archive or state file (will create new)", verbose_only=True, verbose=verbose)
+
         # Check if we need to upgrade from .zip to .7z
         old_zip_path = None
         old_stored_state = None
@@ -894,7 +1045,17 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             continue
 
         # Check if state changed
-        if stored_state is not None and not state_changed(seq_state, stored_state):
+        changed = True  # assume changed until a stored state proves otherwise
+        if stored_state is None:
+            # No stored state - will need to create archive
+            if verbose:
+                log("scan", f"{rel} no stored state found - will create archive", verbose_only=True, verbose=verbose)
+        else:
+            # Check if state changed and log first few mismatches
+            changed = state_changed(seq_state, stored_state, verbose=verbose)
+            if changed and verbose and queued < 3:
+                # Show detailed diff for first few sequences
+                log("scan", f"{rel} state changed - checking details...", verbose_only=False, verbose=verbose)
+        if not changed:
             # Metadata matches stored state
             state_skipped += 1
             if state_skipped <= 5:
@@ -913,6 +1074,7 @@ def run_zip(requested_workers: int | None, *, verbose: bool) -> int:
             else:
                 # No archive exists, but state matches (shouldn't happen, but be safe)
                 continue
+            continue
 
         work_items.append((seq_dir, zip_path, state_path, seq_state))
         queued += 1
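Taken together, the commit enforces one round-trip property: archiving a sequence, expanding it, and recomputing state must report no change. A pytest-style sketch of that invariant (hypothetical; zip_sequence's full parameter list is truncated in the hunk headers above, so the call here is abbreviated and may need the extra worker/memory arguments):

    def test_roundtrip_reports_no_change(tmp_path):
        seq_dir = tmp_path / "SEQ_0010"            # hypothetical layout
        seq_dir.mkdir()
        (seq_dir / "frame_0001.exr").write_bytes(b"\x00" * 64)
        state = compute_state(seq_dir)
        zip_path = archive_path_for(seq_dir)
        zip_sequence(seq_dir, zip_path, None)      # abbreviated arguments
        expand_sequence(zip_path, state)
        assert state_changed(compute_state(seq_dir), state) is False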