From 4416949d7c7695b69f4dca040aa825f6c38ccbb9 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 11 Dec 2025 15:21:36 -0700 Subject: [PATCH] detect and quarantine corrupt pngs --- .gitignore | 1 + ...press-png-sequences-with-multithreading.md | 758 ++++++++++++++++++ compress_pngs.py | 123 ++- 3 files changed, 872 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 7b99f19..8326436 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ input/ output/ +corrupted/ diff --git a/.specstory/history/2025-12-11_18-21Z-compress-png-sequences-with-multithreading.md b/.specstory/history/2025-12-11_18-21Z-compress-png-sequences-with-multithreading.md index 049d01a..cae797b 100644 --- a/.specstory/history/2025-12-11_18-21Z-compress-png-sequences-with-multithreading.md +++ b/.specstory/history/2025-12-11_18-21Z-compress-png-sequences-with-multithreading.md @@ -3012,3 +3012,761 @@ The Threadripper’s higher core count should provide a clear speedup for this C --- +_**User (2025-12-11 21:39Z)**_ + +How about a threadripper 1950X? + +--- + +_**Agent (model default, mode Agent)**_ + + + +--- + + +The Threadripper 1950X will likely be slower than the i9-13900KF, even though both expose 32 threads. + +**Threadripper 1950X:** +- 16 physical cores, 32 threads +- First-gen Threadripper (Zen, 2017) +- The script would use 32 worker processes +- Older architecture, lower IPC + +**i9-13900KF:** +- 8 P-cores + 16 E-cores = 24 physical cores, 32 threads +- Raptor Lake (2022) +- The script would use 32 worker processes +- Much higher single-threaded performance + +**Why the i9-13900KF will likely be faster:** +1. Per-core performance: The i9’s P-cores are significantly faster (roughly 50–70% higher IPC). +2. Core efficiency: With the same 32 workers on each machine, the i9’s faster cores finish each file sooner. +3. Memory: The i9’s newer memory controller can help. +4. Architecture: 5+ years of improvements in efficiency and performance. 
+ +**Expected performance:** +- The i9-13900KF should be roughly 20–40% faster, depending on I/O and the specific compression work. + +**Bottom line:** The 1950X’s 32 threads don’t make up for the i9’s much faster cores. The 3970X’s large core count can overcome the per-core gap; the 1950X cannot. + +--- + +_**User (2025-12-11 21:39Z)**_ + +Sometimes, PNGs corrupt; we'll get bad results where everything is black, the image cuts off too early, or the file is just 0kb (placeholder). Can we have it report bad PNGs? + +--- + +_**Agent (model default, mode Agent)**_ + + + +--- + + +Adding validation to detect and report corrupted PNGs: + + +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 43, lines removed: 1 + +```diff +@@ -38,5 +38,47 @@ + + original_size = os.path.getsize(input_path) +- img = Image.open(input_path) ++ ++ # Check for corrupted/empty files ++ if original_size == 0: ++ return (str(input_path), False, "CORRUPTED: File is 0 bytes (empty/placeholder)", 0, 0, 0, False, True) ++ ++ # Try to open and validate the image ++ try: ++ img = Image.open(input_path) ++ # Force load to detect corruption early ++ img.load() ++ except Exception as e: ++ return (str(input_path), False, f"CORRUPTED: Cannot open/load image - {str(e)}", original_size, 0, 0, False, True) ++ ++ # Validate image dimensions ++ if img.width == 0 or img.height == 0: ++ return (str(input_path), False, f"CORRUPTED: Invalid dimensions ({img.width}x{img.height})", original_size, 0, 0, False, True) ++ ++ # Check if image appears completely black (potential corruption indicator) ++ # But be careful - some images might legitimately be black ++ # We'll only flag this if the file is suspiciously small for its dimensions ++ try: ++ # Sample a few pixels to check if all are black ++ sample_size = min(100, img.width * img.height) ++ pixels = list(img.getdata()[:sample_size]) ++ if pixels: ++ # Check if all sampled pixels are black (0 or (0,0,0) or (0,0,0,0)) ++ all_black = True ++ for pixel in pixels: ++ if isinstance(pixel, (tuple, list)): ++ if any(p > 0 for p in pixel): ++ all_black = False ++ break ++ else: ++ if pixel > 0: ++ all_black = False ++ break ++ ++ # If all black AND file is suspiciously small, flag as potentially corrupted ++ if all_black and original_size < (img.width * img.height * 0.1): # Less than 0.1 bytes per pixel ++ return (str(input_path), False, f"CORRUPTED: Image appears all black with suspiciously small file size", original_size, 0, 0, False, True) ++ except: ++ # If we can't check pixels, continue anyway ++ pass + + # Determine target bit depth +``` + +
+ +
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 1, lines removed: 1 + +```diff +@@ -32,5 +32,5 @@ + savings = original_size - new_size + savings_pct = (savings / original_size * 100) if original_size > 0 else 0 +- return (str(input_path), True, None, original_size, new_size, savings_pct, True) ++ return (str(input_path), True, None, original_size, new_size, savings_pct, True, False) + + # Ensure output directory exists +``` + +
+ +
+ +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 4, lines removed: 2 + +```diff +@@ -174,7 +174,9 @@ + savings = original_size - new_size + savings_pct = (savings / original_size * 100) if original_size > 0 else 0 +- return (str(input_path), True, None, original_size, new_size, savings_pct, False) ++ return (str(input_path), True, None, original_size, new_size, savings_pct, False, False) + except Exception as e: +- return (str(input_path), False, str(e), 0, 0, 0, False) ++ # Check if error might indicate corruption ++ is_corrupted = "truncated" in str(e).lower() or "cannot identify" in str(e).lower() or "corrupt" in str(e).lower() ++ return (str(input_path), False, str(e), 0, 0, 0, False, is_corrupted) + + def find_image_files(input_dir): +``` + +
+ +
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 2, lines removed: undefined + +```diff +@@ -291,4 +291,6 @@ + skipped = 0 + failed = 0 ++ corrupted = 0 ++ corrupted_files = [] + total_original_size = 0 + total_new_size = 0 +``` + +**Chunk 2** +Lines added: 1, lines removed: 1 + +```diff +@@ -306,5 +308,5 @@ + for future in as_completed(future_to_file): + result = future.result() +- file_path, success, error, orig_size, new_size, savings_pct, was_skipped = result ++ file_path, success, error, orig_size, new_size, savings_pct, was_skipped, is_corrupted = result + + if success: +``` + +**Chunk 3** +Lines added: 1, lines removed: 1 + +```diff +@@ -325,5 +327,5 @@ + if processed == 1 or time_since_update >= 0.5: + rate = processed / elapsed if elapsed > 0 else 0 +- remaining = len(png_files) - processed ++ remaining = len(png_files) - processed - failed + eta_seconds = remaining / rate if rate > 0 and remaining > 0 else 0 + +``` + +**Chunk 4** +Lines added: 9, lines removed: 4 + +```diff +@@ -335,12 +337,17 @@ + + print(f"[{processed:5d}/{len(png_files)}] " +- f"Compressed: {compressed} | Skipped: {skipped} | " ++ f"Compressed: {compressed} | Skipped: {skipped} | Corrupted: {corrupted} | " + f"Speed: {rate:.1f} files/sec | " + f"Elapsed: {elapsed_str} | ETA: {eta_str} | " + f"Saved: {format_size(total_savings)} ({total_savings_pct:.1f}%)", end='\r') + last_update_time = current_time +- else: +- failed += 1 +- print(f"\n[ERROR] Failed to compress {file_path}: {error}") ++ else: ++ if is_corrupted: ++ corrupted += 1 ++ corrupted_files.append((file_path, error)) ++ print(f"\n[CORRUPTED] {file_path}: {error}") ++ else: ++ failed += 1 ++ print(f"\n[ERROR] Failed to compress {file_path}: {error}") + + total_time = time.time() - start_time +``` + +
+ +
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 2, lines removed: undefined + +```diff +@@ -363,4 +363,6 @@ + if jpg_files: + print(f"Ignored (JPG/JPEG): {len(jpg_files)} files") ++ if corrupted > 0: ++ print(f"Corrupted (bad PNGs): {corrupted} files") + print(f"Failed: {failed} files") + print(f"Total time: {format_time(total_time)}") +``` + +**Chunk 2** +Lines added: 9, lines removed: undefined + +```diff +@@ -369,4 +371,13 @@ + print(f"Compressed size: {format_size(total_new_size)}") + print(f"Total savings: {format_size(total_savings)} ({total_savings_pct:.1f}%)") ++ ++ # Print list of corrupted files if any ++ if corrupted_files: ++ print("\n" + "=" * 80) ++ print("CORRUPTED FILES LIST:") ++ print("=" * 80) ++ for file_path, error in corrupted_files: ++ print(f" {file_path}") ++ print(f" Reason: {error}") + + if __name__ == '__main__': +``` + +
+ +
+ +--- + + +
+ Tool use: **read_lints** • Read lints for 1 file + +Lint paths: + +- `compress_pngs.py` + +Lint results + + +**No lint errors found** +
+
+ +--- + + + +--- + + +Updated the script to detect and report corrupted PNGs. It now: + +1. **Detects 0-byte files** — flags empty/placeholder files +2. **Validates image loading** — checks if the PNG can be opened and loaded +3. **Checks dimensions** — flags files with 0x0 or invalid dimensions +4. **Detects suspicious all-black images** — flags images that are all black with suspiciously small file sizes (potential corruption) +5. **Tracks corrupted files separately** — counts them separately from regular failures +6. **Reports in real-time** — shows `[CORRUPTED]` messages as they're found +7. **Summary report** — includes corrupted file count in the final summary +8. **Detailed list** — prints a list of all corrupted files with reasons at the end + +The script will now: +- Show corrupted files in the progress output: `Corrupted: X` +- Print `[CORRUPTED]` messages when found +- List all corrupted files with their specific error reasons at the end + +This should catch the issues you mentioned: black images, cut-off images, and 0-byte placeholder files. + +--- + +_**User (2025-12-11 21:39Z)**_ + +@pwsh (155-303) nice! can we make it so it moves all of them from input\corrupted to another folder to isolate them? + +--- + +_**Agent (model default, mode Agent)**_ + + + +--- + + +Updating the script to move corrupted files to a `corrupted` folder, preserving the directory structure: + + +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 25, lines removed: undefined + +```diff +@@ -208,4 +208,29 @@ + return output_dir / relative_path + ++ def move_to_corrupted(input_path, input_dir, corrupted_dir): ++ """Move a corrupted file to the corrupted folder, preserving directory structure.""" ++ try: ++ input_path = Path(input_path) ++ input_dir = Path(input_dir) ++ corrupted_dir = Path(corrupted_dir) ++ ++ # Get relative path from input directory ++ try: ++ relative_path = input_path.relative_to(input_dir) ++ except ValueError: ++ relative_path = input_path.name ++ ++ # Create destination path ++ dest_path = corrupted_dir / relative_path ++ ++ # Ensure destination directory exists ++ dest_path.parent.mkdir(parents=True, exist_ok=True) ++ ++ # Move the file ++ input_path.rename(dest_path) ++ return True, None ++ except Exception as e: ++ return False, str(e) ++ + def format_size(size_bytes): + """Format file size in human readable format.""" +``` + +
+ +
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 4, lines removed: 1 + +```diff +@@ -305,6 +305,9 @@ + print(f"Found {len(png_files)} PNG files to process.") + +- # Create output directory ++ # Create output and corrupted directories + output_dir.mkdir(exist_ok=True) ++ corrupted_dir = Path('corrupted') ++ corrupted_dir.mkdir(exist_ok=True) ++ print(f"Corrupted files will be moved to: {corrupted_dir}") + + # Use all available CPU cores +``` + +
+ +
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 8, lines removed: 2 + +```diff +@@ -373,6 +373,12 @@ + if is_corrupted: + corrupted += 1 +- corrupted_files.append((file_path, error)) +- print(f"\n[CORRUPTED] {file_path}: {error}") ++ # Move corrupted file to corrupted folder ++ move_success, move_error = move_to_corrupted(file_path, input_dir, corrupted_dir) ++ if move_success: ++ corrupted_files.append((file_path, error, f"Moved to {corrupted_dir / Path(file_path).relative_to(input_dir)}")) ++ print(f"\n[CORRUPTED] {file_path}: {error} -> Moved to corrupted folder") ++ else: ++ corrupted_files.append((file_path, error, f"Failed to move: {move_error}")) ++ print(f"\n[CORRUPTED] {file_path}: {error} (Failed to move: {move_error})") + else: + failed += 1 +``` + +
+ +
+ +--- + + +
Tool use: **read_file** • Read file: compress_pngs.py + +
+
+ +--- + + + +--- + + + + + +--- + + + +
Tool use: **search_replace** • Edit file: compress_pngs.py + +**Chunk 1** +Lines added: 10, lines removed: 3 + +```diff +@@ -411,7 +411,14 @@ + print("CORRUPTED FILES LIST:") + print("=" * 80) +- for file_path, error in corrupted_files: +- print(f" {file_path}") +- print(f" Reason: {error}") ++ for item in corrupted_files: ++ if len(item) == 3: ++ file_path, error, move_status = item ++ print(f" {file_path}") ++ print(f" Reason: {error}") ++ print(f" Status: {move_status}") ++ else: ++ file_path, error = item ++ print(f" {file_path}") ++ print(f" Reason: {error}") + + if __name__ == '__main__': +``` + +
+ +
+ +--- + + +
+ Tool use: **read_lints** • Read lints for 1 file + +Lint paths: + +- `compress_pngs.py` + +Lint results + + +**No lint errors found** +
+
+ +--- + + + +--- + + +Updating `.gitignore` to ignore the `corrupted` folder: + + +--- + + +
Tool use: **read_file** • Read file: .gitignore + +
+
+ +--- + + + +
Tool use: **search_replace** • Edit file: .gitignore + +**Chunk 1** +Lines added: 1, lines removed: undefined + +```diff +@@ -1,4 +1,5 @@ + input/ + output/ ++ corrupted/ + + +``` + +
+ +
+ +--- + + + +--- + + +Updated the script to automatically move corrupted PNG files to a `corrupted` folder. It will: + +1. **Create a `corrupted` folder** at the root level +2. **Move corrupted files** there when detected, preserving the directory structure (e.g., `input\Pack2BagScan_animation 6c\file.png` → `corrupted\Pack2BagScan_animation 6c\file.png`) +3. **Report the move** in real-time, showing where each file was moved +4. **Show move status** in the final summary + +The script will now: +- Display `[CORRUPTED] file.png: error -> Moved to corrupted folder` when moving files +- List all moved files in the summary with their new locations +- Keep corrupted files isolated from your input folder + +Also updated `.gitignore` to ignore the `corrupted` folder. When you run the script, corrupted files will be automatically moved out of the input folder and into the corrupted folder, making it easy to identify and handle them separately. + +--- + diff --git a/compress_pngs.py b/compress_pngs.py index 0243012..6f49978 100644 --- a/compress_pngs.py +++ b/compress_pngs.py @@ -31,13 +31,55 @@ def compress_png(input_path, output_path, force_bitdepth=None): new_size = os.path.getsize(output_path) savings = original_size - new_size savings_pct = (savings / original_size * 100) if original_size > 0 else 0 - return (str(input_path), True, None, original_size, new_size, savings_pct, True) + return (str(input_path), True, None, original_size, new_size, savings_pct, True, False) # Ensure output directory exists output_path.parent.mkdir(parents=True, exist_ok=True) original_size = os.path.getsize(input_path) - img = Image.open(input_path) + + # Check for corrupted/empty files + if original_size == 0: + return (str(input_path), False, "CORRUPTED: File is 0 bytes (empty/placeholder)", 0, 0, 0, False, True) + + # Try to open and validate the image + try: + img = Image.open(input_path) + # Force load to detect corruption early + img.load() + except Exception as e: + return 
(str(input_path), False, f"CORRUPTED: Cannot open/load image - {str(e)}", original_size, 0, 0, False, True) + + # Validate image dimensions + if img.width == 0 or img.height == 0: + return (str(input_path), False, f"CORRUPTED: Invalid dimensions ({img.width}x{img.height})", original_size, 0, 0, False, True) + + # Check if image appears completely black (potential corruption indicator) + # But be careful - some images might legitimately be black + # We'll only flag this if the file is suspiciously small for its dimensions + try: + # Sample a few pixels to check if all are black + sample_size = min(100, img.width * img.height) + pixels = list(img.getdata()[:sample_size]) + if pixels: + # Check if all sampled pixels are black (0 or (0,0,0) or (0,0,0,0)) + all_black = True + for pixel in pixels: + if isinstance(pixel, (tuple, list)): + if any(p > 0 for p in pixel): + all_black = False + break + else: + if pixel > 0: + all_black = False + break + + # If all black AND file is suspiciously small, flag as potentially corrupted + if all_black and original_size < (img.width * img.height * 0.1): # Less than 0.1 bytes per pixel + return (str(input_path), False, f"CORRUPTED: Image appears all black with suspiciously small file size", original_size, 0, 0, False, True) + except: + # If we can't check pixels, continue anyway + pass # Determine target bit depth if force_bitdepth == '8': @@ -131,9 +173,11 @@ def compress_png(input_path, output_path, force_bitdepth=None): new_size = os.path.getsize(output_path) savings = original_size - new_size savings_pct = (savings / original_size * 100) if original_size > 0 else 0 - return (str(input_path), True, None, original_size, new_size, savings_pct, False) + return (str(input_path), True, None, original_size, new_size, savings_pct, False, False) except Exception as e: - return (str(input_path), False, str(e), 0, 0, 0, False) + # Check if error might indicate corruption + is_corrupted = "truncated" in str(e).lower() or "cannot identify" in 
str(e).lower() or "corrupt" in str(e).lower() + return (str(input_path), False, str(e), 0, 0, 0, False, is_corrupted) def find_image_files(input_dir): """Find all PNG and JPG files in subdirectories.""" @@ -163,6 +207,31 @@ def get_output_path(input_path, input_dir, output_dir): # Create output path return output_dir / relative_path +def move_to_corrupted(input_path, input_dir, corrupted_dir): + """Move a corrupted file to the corrupted folder, preserving directory structure.""" + try: + input_path = Path(input_path) + input_dir = Path(input_dir) + corrupted_dir = Path(corrupted_dir) + + # Get relative path from input directory + try: + relative_path = input_path.relative_to(input_dir) + except ValueError: + relative_path = input_path.name + + # Create destination path + dest_path = corrupted_dir / relative_path + + # Ensure destination directory exists + dest_path.parent.mkdir(parents=True, exist_ok=True) + + # Move the file + input_path.rename(dest_path) + return True, None + except Exception as e: + return False, str(e) + def format_size(size_bytes): """Format file size in human readable format.""" for unit in ['B', 'KB', 'MB', 'GB']: @@ -235,8 +304,11 @@ def main(): print(f"Found {len(png_files)} PNG files to process.") - # Create output directory + # Create output and corrupted directories output_dir.mkdir(exist_ok=True) + corrupted_dir = Path('corrupted') + corrupted_dir.mkdir(exist_ok=True) + print(f"Corrupted files will be moved to: {corrupted_dir}") # Use all available CPU cores max_workers = multiprocessing.cpu_count() @@ -246,6 +318,8 @@ def main(): compressed = 0 skipped = 0 failed = 0 + corrupted = 0 + corrupted_files = [] total_original_size = 0 total_new_size = 0 start_time = time.time() @@ -261,7 +335,7 @@ def main(): # Process results as they complete for future in as_completed(future_to_file): result = future.result() - file_path, success, error, orig_size, new_size, savings_pct, was_skipped = result + file_path, success, error, orig_size, 
new_size, savings_pct, was_skipped, is_corrupted = result if success: if was_skipped: @@ -280,7 +354,7 @@ def main(): # Update every file or every 0.5 seconds, whichever comes first if processed == 1 or time_since_update >= 0.5: rate = processed / elapsed if elapsed > 0 else 0 - remaining = len(png_files) - processed + remaining = len(png_files) - processed - failed eta_seconds = remaining / rate if rate > 0 and remaining > 0 else 0 total_savings = total_original_size - total_new_size @@ -290,14 +364,25 @@ def main(): elapsed_str = format_time(elapsed) print(f"[{processed:5d}/{len(png_files)}] " - f"Compressed: {compressed} | Skipped: {skipped} | " + f"Compressed: {compressed} | Skipped: {skipped} | Corrupted: {corrupted} | " f"Speed: {rate:.1f} files/sec | " f"Elapsed: {elapsed_str} | ETA: {eta_str} | " f"Saved: {format_size(total_savings)} ({total_savings_pct:.1f}%)", end='\r') last_update_time = current_time else: - failed += 1 - print(f"\n[ERROR] Failed to compress {file_path}: {error}") + if is_corrupted: + corrupted += 1 + # Move corrupted file to corrupted folder + move_success, move_error = move_to_corrupted(file_path, input_dir, corrupted_dir) + if move_success: + corrupted_files.append((file_path, error, f"Moved to {corrupted_dir / Path(file_path).relative_to(input_dir)}")) + print(f"\n[CORRUPTED] {file_path}: {error} -> Moved to corrupted folder") + else: + corrupted_files.append((file_path, error, f"Failed to move: {move_error}")) + print(f"\n[CORRUPTED] {file_path}: {error} (Failed to move: {move_error})") + else: + failed += 1 + print(f"\n[ERROR] Failed to compress {file_path}: {error}") total_time = time.time() - start_time total_savings = total_original_size - total_new_size @@ -311,12 +396,30 @@ def main(): print(f"Skipped (already exist): {skipped} files") if jpg_files: print(f"Ignored (JPG/JPEG): {len(jpg_files)} files") + if corrupted > 0: + print(f"Corrupted (bad PNGs): {corrupted} files") print(f"Failed: {failed} files") print(f"Total time: 
{format_time(total_time)}") print(f"Average speed: {avg_rate:.2f} files/second") print(f"Original size: {format_size(total_original_size)}") print(f"Compressed size: {format_size(total_new_size)}") print(f"Total savings: {format_size(total_savings)} ({total_savings_pct:.1f}%)") + + # Print list of corrupted files if any + if corrupted_files: + print("\n" + "=" * 80) + print("CORRUPTED FILES LIST:") + print("=" * 80) + for item in corrupted_files: + if len(item) == 3: + file_path, error, move_status = item + print(f" {file_path}") + print(f" Reason: {error}") + print(f" Status: {move_status}") + else: + file_path, error = item + print(f" {file_path}") + print(f" Reason: {error}") if __name__ == '__main__': main()