Compare commits

...

4 Commits

Author SHA1 Message Date
Nathan
fd5c8a9ab4 use over 61 workers 2025-12-23 17:01:00 -07:00
Nathan
f1cf096069 specst 2025-12-17 15:36:54 -07:00
Nathan
1885b1d55d include jpgs skipped in skipped count 2025-12-11 15:32:18 -07:00
Nathan
4416949d7c detect and quarantine corrupt pngs 2025-12-11 15:21:36 -07:00
4 changed files with 1135 additions and 14 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
input/ input/
output/ output/
corrupted/

View File

@@ -3012,3 +3012,959 @@ The Threadrippers higher core count should provide a clear speedup for this C
--- ---
_**User (2025-12-11 21:39Z)**_
How about a threadripper 1950X?
---
_**Agent (model default, mode Agent)**_
---
The Threadripper 1950X will likely be slower than the i9-13900KF, despite having more threads.
**Threadripper 1950X:**
- 16 physical cores, 32 threads
- First-gen Threadripper (Zen, 2017)
- The script would use 32 worker processes
- Older architecture, lower IPC
**i9-13900KF:**
- 8 P-cores + 8 E-cores = 16 physical cores, 24 threads
- Raptor Lake (2022)
- The script would use 24 worker processes
- Much higher single-threaded performance
**Why the i9-13900KF will likely be faster:**
1. Per-core performance: The i9's P-cores are significantly faster (roughly 50–70% higher IPC).
2. Core efficiency: 24 faster workers can outperform 32 slower ones for this workload.
3. Memory: The i9's newer memory controller can help.
4. Architecture: 5+ years of improvements in efficiency and performance.
**Expected performance:**
- The i9-13900KF should be roughly 20–40% faster, depending on I/O and the specific compression work.
**Bottom line:** The 1950X's extra threads don't make up for the i9's much faster cores. The 3970X's large core count can overcome the per-core gap; the 1950X cannot.
---
_**User (2025-12-11 21:39Z)**_
Sometimes, PNGs corrupt; we'll get bad results where everything is black, the image cuts off too early, or the file is just 0kb (placeholder). Can we have it report bad PNGs?
---
_**Agent (model default, mode Agent)**_
---
Adding validation to detect and report corrupted PNGs:
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 43, lines removed: 1
```diff
@@ -38,5 +38,47 @@
original_size = os.path.getsize(input_path)
- img = Image.open(input_path)
+
+ # Check for corrupted/empty files
+ if original_size == 0:
+ return (str(input_path), False, "CORRUPTED: File is 0 bytes (empty/placeholder)", 0, 0, 0, False, True)
+
+ # Try to open and validate the image
+ try:
+ img = Image.open(input_path)
+ # Force load to detect corruption early
+ img.load()
+ except Exception as e:
+ return (str(input_path), False, f"CORRUPTED: Cannot open/load image - {str(e)}", original_size, 0, 0, False, True)
+
+ # Validate image dimensions
+ if img.width == 0 or img.height == 0:
+ return (str(input_path), False, f"CORRUPTED: Invalid dimensions ({img.width}x{img.height})", original_size, 0, 0, False, True)
+
+ # Check if image appears completely black (potential corruption indicator)
+ # But be careful - some images might legitimately be black
+ # We'll only flag this if the file is suspiciously small for its dimensions
+ try:
+ # Sample a few pixels to check if all are black
+ sample_size = min(100, img.width * img.height)
+ pixels = list(img.getdata()[:sample_size])
+ if pixels:
+ # Check if all sampled pixels are black (0 or (0,0,0) or (0,0,0,0))
+ all_black = True
+ for pixel in pixels:
+ if isinstance(pixel, (tuple, list)):
+ if any(p &gt; 0 for p in pixel):
+ all_black = False
+ break
+ else:
+ if pixel &gt; 0:
+ all_black = False
+ break
+
+ # If all black AND file is suspiciously small, flag as potentially corrupted
+ if all_black and original_size &lt; (img.width * img.height * 0.1): # Less than 0.1 bytes per pixel
+ return (str(input_path), False, f"CORRUPTED: Image appears all black with suspiciously small file size", original_size, 0, 0, False, True)
+ except:
+ # If we can't check pixels, continue anyway
+ pass
# Determine target bit depth
```
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 1, lines removed: 1
```diff
@@ -32,5 +32,5 @@
savings = original_size - new_size
savings_pct = (savings / original_size * 100) if original_size &gt; 0 else 0
- return (str(input_path), True, None, original_size, new_size, savings_pct, True)
+ return (str(input_path), True, None, original_size, new_size, savings_pct, True, False)
# Ensure output directory exists
```
</details>
</tool-use>
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 4, lines removed: 2
```diff
@@ -174,7 +174,9 @@
savings = original_size - new_size
savings_pct = (savings / original_size * 100) if original_size &gt; 0 else 0
- return (str(input_path), True, None, original_size, new_size, savings_pct, False)
+ return (str(input_path), True, None, original_size, new_size, savings_pct, False, False)
except Exception as e:
- return (str(input_path), False, str(e), 0, 0, 0, False)
+ # Check if error might indicate corruption
+ is_corrupted = "truncated" in str(e).lower() or "cannot identify" in str(e).lower() or "corrupt" in str(e).lower()
+ return (str(input_path), False, str(e), 0, 0, 0, False, is_corrupted)
def find_image_files(input_dir):
```
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 2, lines removed: undefined
```diff
@@ -291,4 +291,6 @@
skipped = 0
failed = 0
+ corrupted = 0
+ corrupted_files = []
total_original_size = 0
total_new_size = 0
```
**Chunk 2**
Lines added: 1, lines removed: 1
```diff
@@ -306,5 +308,5 @@
for future in as_completed(future_to_file):
result = future.result()
- file_path, success, error, orig_size, new_size, savings_pct, was_skipped = result
+ file_path, success, error, orig_size, new_size, savings_pct, was_skipped, is_corrupted = result
if success:
```
**Chunk 3**
Lines added: 1, lines removed: 1
```diff
@@ -325,5 +327,5 @@
if processed == 1 or time_since_update &gt;= 0.5:
rate = processed / elapsed if elapsed &gt; 0 else 0
- remaining = len(png_files) - processed
+ remaining = len(png_files) - processed - failed
eta_seconds = remaining / rate if rate &gt; 0 and remaining &gt; 0 else 0
```
**Chunk 4**
Lines added: 9, lines removed: 4
```diff
@@ -335,12 +337,17 @@
print(f"[{processed:5d}/{len(png_files)}] "
- f"Compressed: {compressed} | Skipped: {skipped} | "
+ f"Compressed: {compressed} | Skipped: {skipped} | Corrupted: {corrupted} | "
f"Speed: {rate:.1f} files/sec | "
f"Elapsed: {elapsed_str} | ETA: {eta_str} | "
f"Saved: {format_size(total_savings)} ({total_savings_pct:.1f}%)", end='\r')
last_update_time = current_time
- else:
- failed += 1
- print(f"\n[ERROR] Failed to compress {file_path}: {error}")
+ else:
+ if is_corrupted:
+ corrupted += 1
+ corrupted_files.append((file_path, error))
+ print(f"\n[CORRUPTED] {file_path}: {error}")
+ else:
+ failed += 1
+ print(f"\n[ERROR] Failed to compress {file_path}: {error}")
total_time = time.time() - start_time
```
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 2, lines removed: undefined
```diff
@@ -363,4 +363,6 @@
if jpg_files:
print(f"Ignored (JPG/JPEG): {len(jpg_files)} files")
+ if corrupted &gt; 0:
+ print(f"Corrupted (bad PNGs): {corrupted} files")
print(f"Failed: {failed} files")
print(f"Total time: {format_time(total_time)}")
```
**Chunk 2**
Lines added: 9, lines removed: undefined
```diff
@@ -369,4 +371,13 @@
print(f"Compressed size: {format_size(total_new_size)}")
print(f"Total savings: {format_size(total_savings)} ({total_savings_pct:.1f}%)")
+
+ # Print list of corrupted files if any
+ if corrupted_files:
+ print("\n" + "=" * 80)
+ print("CORRUPTED FILES LIST:")
+ print("=" * 80)
+ for file_path, error in corrupted_files:
+ print(f" {file_path}")
+ print(f" Reason: {error}")
if __name__ == '__main__':
```
</details>
</tool-use>
---
<tool-use data-tool-type="generic" data-tool-name="read_lints">
<details>
<summary>Tool use: **read_lints** • Read lints for 1 file</summary>
Lint paths:
- `compress_pngs.py`
Lint results
**No lint errors found**
</details>
</tool-use>
---
---
Updated the script to detect and report corrupted PNGs. It now:
1. **Detects 0-byte files** — flags empty/placeholder files
2. **Validates image loading** — checks if the PNG can be opened and loaded
3. **Checks dimensions** — flags files with 0x0 or invalid dimensions
4. **Detects suspicious all-black images** — flags images that are all black with suspiciously small file sizes (potential corruption)
5. **Tracks corrupted files separately** — counts them separately from regular failures
6. **Reports in real-time** — shows `[CORRUPTED]` messages as they're found
7. **Summary report** — includes corrupted file count in the final summary
8. **Detailed list** — prints a list of all corrupted files with reasons at the end
The script will now:
- Show corrupted files in the progress output: `Corrupted: X`
- Print `[CORRUPTED]` messages when found
- List all corrupted files with their specific error reasons at the end
This should catch the issues you mentioned: black images, cut-off images, and 0-byte placeholder files.
---
_**User (2025-12-11 21:39Z)**_
@pwsh (155-303) nice! can we make it so it moves all of them from input\corrupted to another folder to isolate them?
---
_**Agent (model default, mode Agent)**_
---
Updating the script to move corrupted files to a `corrupted` folder, preserving the directory structure:
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 25, lines removed: undefined
```diff
@@ -208,4 +208,29 @@
return output_dir / relative_path
+ def move_to_corrupted(input_path, input_dir, corrupted_dir):
+ """Move a corrupted file to the corrupted folder, preserving directory structure."""
+ try:
+ input_path = Path(input_path)
+ input_dir = Path(input_dir)
+ corrupted_dir = Path(corrupted_dir)
+
+ # Get relative path from input directory
+ try:
+ relative_path = input_path.relative_to(input_dir)
+ except ValueError:
+ relative_path = input_path.name
+
+ # Create destination path
+ dest_path = corrupted_dir / relative_path
+
+ # Ensure destination directory exists
+ dest_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Move the file
+ input_path.rename(dest_path)
+ return True, None
+ except Exception as e:
+ return False, str(e)
+
def format_size(size_bytes):
"""Format file size in human readable format."""
```
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 4, lines removed: 1
```diff
@@ -305,6 +305,9 @@
print(f"Found {len(png_files)} PNG files to process.")
- # Create output directory
+ # Create output and corrupted directories
output_dir.mkdir(exist_ok=True)
+ corrupted_dir = Path('corrupted')
+ corrupted_dir.mkdir(exist_ok=True)
+ print(f"Corrupted files will be moved to: {corrupted_dir}")
# Use all available CPU cores
```
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 8, lines removed: 2
```diff
@@ -373,6 +373,12 @@
if is_corrupted:
corrupted += 1
- corrupted_files.append((file_path, error))
- print(f"\n[CORRUPTED] {file_path}: {error}")
+ # Move corrupted file to corrupted folder
+ move_success, move_error = move_to_corrupted(file_path, input_dir, corrupted_dir)
+ if move_success:
+ corrupted_files.append((file_path, error, f"Moved to {corrupted_dir / Path(file_path).relative_to(input_dir)}"))
+ print(f"\n[CORRUPTED] {file_path}: {error} -&gt; Moved to corrupted folder")
+ else:
+ corrupted_files.append((file_path, error, f"Failed to move: {move_error}"))
+ print(f"\n[CORRUPTED] {file_path}: {error} (Failed to move: {move_error})")
else:
failed += 1
```
</details>
</tool-use>
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 10, lines removed: 3
```diff
@@ -411,7 +411,14 @@
print("CORRUPTED FILES LIST:")
print("=" * 80)
- for file_path, error in corrupted_files:
- print(f" {file_path}")
- print(f" Reason: {error}")
+ for item in corrupted_files:
+ if len(item) == 3:
+ file_path, error, move_status = item
+ print(f" {file_path}")
+ print(f" Reason: {error}")
+ print(f" Status: {move_status}")
+ else:
+ file_path, error = item
+ print(f" {file_path}")
+ print(f" Reason: {error}")
if __name__ == '__main__':
```
</details>
</tool-use>
---
<tool-use data-tool-type="generic" data-tool-name="read_lints">
<details>
<summary>Tool use: **read_lints** • Read lints for 1 file</summary>
Lint paths:
- `compress_pngs.py`
Lint results
**No lint errors found**
</details>
</tool-use>
---
---
Updating `.gitignore` to ignore the `corrupted` folder:
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: .gitignore</summary>
</details>
</tool-use>
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: .gitignore</summary>
**Chunk 1**
Lines added: 1, lines removed: undefined
```diff
@@ -1,4 +1,5 @@
input/
output/
+ corrupted/
```
</details>
</tool-use>
---
---
Updated the script to automatically move corrupted PNG files to a `corrupted` folder. It will:
1. **Create a `corrupted` folder** at the root level
2. **Move corrupted files** there when detected, preserving the directory structure (e.g., `input\Pack2BagScan_animation 6c\file.png` → `corrupted\Pack2BagScan_animation 6c\file.png`)
3. **Report the move** in real-time, showing where each file was moved
4. **Show move status** in the final summary
The script will now:
- Display `[CORRUPTED] file.png: error -> Moved to corrupted folder` when moving files
- List all moved files in the summary with their new locations
- Keep corrupted files isolated from your input folder
Also updated `.gitignore` to ignore the `corrupted` folder. When you run the script, corrupted files will be automatically moved out of the input folder and into the corrupted folder, making it easy to identify and handle them separately.
---
_**User (2025-12-11 21:39Z)**_
@pwsh (506-528) jpegs should be included in the skipped count (Skipped: 0)
---
_**Agent (model default, mode Agent)**_
---
Updating the script to count JPG/JPEG files as skipped:
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 1, lines removed: 1
```diff
@@ -317,5 +317,5 @@
compressed = 0
- skipped = 0
+ skipped = len(jpg_files) # Start with JPG files counted as skipped
failed = 0
corrupted = 0
```
</details>
</tool-use>
---
<tool-use data-tool-type="write" data-tool-name="search_replace">
<details><summary>Tool use: **search_replace** • Edit file: compress_pngs.py</summary>
**Chunk 1**
Lines added: 4, lines removed: 2
```diff
@@ -394,7 +394,9 @@
print(f"Compression complete!")
print(f"Successfully compressed: {compressed} files")
- print(f"Skipped (already exist): {skipped} files")
+ skipped_existing = skipped - len(jpg_files) if skipped &gt;= len(jpg_files) else 0
if jpg_files:
- print(f"Ignored (JPG/JPEG): {len(jpg_files)} files")
+ print(f"Skipped: {skipped} files ({skipped_existing} already exist, {len(jpg_files)} JPG/JPEG)")
+ else:
+ print(f"Skipped (already exist): {skipped} files")
if corrupted &gt; 0:
print(f"Corrupted (bad PNGs): {corrupted} files")
```
</details>
</tool-use>
---
<tool-use data-tool-type="generic" data-tool-name="read_lints">
<details>
<summary>Tool use: **read_lints** • Read lints for 1 file</summary>
Lint paths:
- `compress_pngs.py`
Lint results
**No lint errors found**
</details>
</tool-use>
---
---
Updating the progress display to show the correct total, including JPG files:
---
<tool-use data-tool-type="read" data-tool-name="read_file">
<details><summary>Tool use: **read_file** • Read file: compress_pngs.py</summary>
</details>
</tool-use>
---
---
Updated the script to count JPG/JPEG files as skipped. Changes:
1. **Initialize skipped count** with the number of JPG files found
2. **Progress display** shows the total skipped count (including JPGs)
3. **Final summary** shows skipped files with a breakdown: `Skipped: X files (Y already exist, Z JPG/JPEG)`
When you run the script, JPG files are included in the skipped count from the start, so you'll see something like:
- `Skipped: 803` (if all 803 are JPGs)
- Or `Skipped: 805` (if 803 are JPGs and 2 PNGs were already processed)
The summary will show the breakdown of what was skipped.
---
_**User (2025-12-11 21:39Z)**_
for this script, which wins, a 7800x3d, or a threadripper 1950X?
---
_**Agent (model default, mode Agent)**_
---
The **7800X3D should win** for this script.
**7800X3D:**
- 8 cores, 16 threads → script uses 16 worker processes
- Zen 4 (2023)
- Strong single-threaded performance
- Higher IPC
**Threadripper 1950X:**
- 16 cores, 32 threads → script uses 32 worker processes
- Zen 1 (2017)
- Lower single-threaded performance
- Lower IPC
**Why the 7800X3D wins:**
1. Per-core performance: Each worker finishes faster, so 16 fast workers can outperform 32 slower ones.
2. Architecture gap: Zen 4 vs Zen 1 is a large IPC improvement.
3. PNG compression: PIL operations benefit from faster cores; the 7800X3D's cores are roughly 2–3x faster per core.
**Expected performance:**
- The 7800X3D should be roughly 30–50% faster, depending on I/O.
**Bottom line:** The 1950X's extra threads don't offset the 7800X3D's much faster cores. The 7800X3D is the better choice for this workload.
---

View File

@@ -0,0 +1,40 @@
<!-- Generated by SpecStory, Markdown v2.1.0 -->
<!-- cursor Session c6b9ac3c-667a-4529-8923-79fe43eb7838 (2025-12-23 23:54Z) -->
# Untitled (2025-12-23 23:54Z)
_**User (2025-12-23 23:54Z)**_
it didn't work on the threadripper. additional context: @.specstory/history/2025-12-11_18-21Z-compress-png-sequences-with-multithreading.md
Mode: Auto-detect bit depth (preserve 16-bit if present)
Input directory: input
Output directory: output
Scanning for image files...
Found 646 JPG/JPEG files - ignoring (skipping)
Found 6371 PNG files to process.
Corrupted files will be moved to: corrupted
Using 64 worker processes for compression...
--------------------------------------------------------------------------------
Traceback (most recent call last):
File "E:\SequenceCompressor\compress_pngs.py", line 427, in <module>
main()
~~~~^^
File "E:\SequenceCompressor\compress_pngs.py", line 328, in main
with ProcessPoolExecutor(max_workers=max_workers) as executor:
~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python314\Lib\concurrent\futures\process.py", line 671, in __init__
raise ValueError(
f"max_workers must be <= {_MAX_WINDOWS_WORKERS}")
ValueError: max_workers must be <= 61
Press any key to continue . . .
---
_**Agent (model default, mode Agent)**_
---

View File

@@ -7,12 +7,21 @@ Compresses all PNG files in subdirectories with maximum parallelism.
import os import os
import sys import sys
import argparse import argparse
import platform
from pathlib import Path from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, as_completed
from PIL import Image from PIL import Image
import multiprocessing import multiprocessing
import time import time
# Try to unlock ProcessPoolExecutor on Windows to bypass 61-worker limit
try:
import unlock_processpool
unlock_processpool.please()
UNLOCKED = True
except ImportError:
UNLOCKED = False
def compress_png(input_path, output_path, force_bitdepth=None): def compress_png(input_path, output_path, force_bitdepth=None):
"""Compress a single PNG file. """Compress a single PNG file.
@@ -31,13 +40,55 @@ def compress_png(input_path, output_path, force_bitdepth=None):
new_size = os.path.getsize(output_path) new_size = os.path.getsize(output_path)
savings = original_size - new_size savings = original_size - new_size
savings_pct = (savings / original_size * 100) if original_size > 0 else 0 savings_pct = (savings / original_size * 100) if original_size > 0 else 0
return (str(input_path), True, None, original_size, new_size, savings_pct, True) return (str(input_path), True, None, original_size, new_size, savings_pct, True, False)
# Ensure output directory exists # Ensure output directory exists
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
original_size = os.path.getsize(input_path) original_size = os.path.getsize(input_path)
# Check for corrupted/empty files
if original_size == 0:
return (str(input_path), False, "CORRUPTED: File is 0 bytes (empty/placeholder)", 0, 0, 0, False, True)
# Try to open and validate the image
try:
img = Image.open(input_path) img = Image.open(input_path)
# Force load to detect corruption early
img.load()
except Exception as e:
return (str(input_path), False, f"CORRUPTED: Cannot open/load image - {str(e)}", original_size, 0, 0, False, True)
# Validate image dimensions
if img.width == 0 or img.height == 0:
return (str(input_path), False, f"CORRUPTED: Invalid dimensions ({img.width}x{img.height})", original_size, 0, 0, False, True)
# Check if image appears completely black (potential corruption indicator)
# But be careful - some images might legitimately be black
# We'll only flag this if the file is suspiciously small for its dimensions
try:
# Sample a few pixels to check if all are black
sample_size = min(100, img.width * img.height)
pixels = list(img.getdata()[:sample_size])
if pixels:
# Check if all sampled pixels are black (0 or (0,0,0) or (0,0,0,0))
all_black = True
for pixel in pixels:
if isinstance(pixel, (tuple, list)):
if any(p > 0 for p in pixel):
all_black = False
break
else:
if pixel > 0:
all_black = False
break
# If all black AND file is suspiciously small, flag as potentially corrupted
if all_black and original_size < (img.width * img.height * 0.1): # Less than 0.1 bytes per pixel
return (str(input_path), False, f"CORRUPTED: Image appears all black with suspiciously small file size", original_size, 0, 0, False, True)
except:
# If we can't check pixels, continue anyway
pass
# Determine target bit depth # Determine target bit depth
if force_bitdepth == '8': if force_bitdepth == '8':
@@ -131,9 +182,11 @@ def compress_png(input_path, output_path, force_bitdepth=None):
new_size = os.path.getsize(output_path) new_size = os.path.getsize(output_path)
savings = original_size - new_size savings = original_size - new_size
savings_pct = (savings / original_size * 100) if original_size > 0 else 0 savings_pct = (savings / original_size * 100) if original_size > 0 else 0
return (str(input_path), True, None, original_size, new_size, savings_pct, False) return (str(input_path), True, None, original_size, new_size, savings_pct, False, False)
except Exception as e: except Exception as e:
return (str(input_path), False, str(e), 0, 0, 0, False) # Check if error might indicate corruption
is_corrupted = "truncated" in str(e).lower() or "cannot identify" in str(e).lower() or "corrupt" in str(e).lower()
return (str(input_path), False, str(e), 0, 0, 0, False, is_corrupted)
def find_image_files(input_dir): def find_image_files(input_dir):
"""Find all PNG and JPG files in subdirectories.""" """Find all PNG and JPG files in subdirectories."""
@@ -163,6 +216,31 @@ def get_output_path(input_path, input_dir, output_dir):
# Create output path # Create output path
return output_dir / relative_path return output_dir / relative_path
def move_to_corrupted(input_path, input_dir, corrupted_dir):
"""Move a corrupted file to the corrupted folder, preserving directory structure."""
try:
input_path = Path(input_path)
input_dir = Path(input_dir)
corrupted_dir = Path(corrupted_dir)
# Get relative path from input directory
try:
relative_path = input_path.relative_to(input_dir)
except ValueError:
relative_path = input_path.name
# Create destination path
dest_path = corrupted_dir / relative_path
# Ensure destination directory exists
dest_path.parent.mkdir(parents=True, exist_ok=True)
# Move the file
input_path.rename(dest_path)
return True, None
except Exception as e:
return False, str(e)
def format_size(size_bytes): def format_size(size_bytes):
"""Format file size in human readable format.""" """Format file size in human readable format."""
for unit in ['B', 'KB', 'MB', 'GB']: for unit in ['B', 'KB', 'MB', 'GB']:
@@ -235,17 +313,32 @@ def main():
print(f"Found {len(png_files)} PNG files to process.") print(f"Found {len(png_files)} PNG files to process.")
# Create output directory # Create output and corrupted directories
output_dir.mkdir(exist_ok=True) output_dir.mkdir(exist_ok=True)
corrupted_dir = Path('corrupted')
corrupted_dir.mkdir(exist_ok=True)
print(f"Corrupted files will be moved to: {corrupted_dir}")
# Use all available CPU cores # Use all available CPU cores
max_workers = multiprocessing.cpu_count() cpu_count = multiprocessing.cpu_count()
if platform.system() == 'Windows' and not UNLOCKED:
# Windows ProcessPoolExecutor has a maximum of 61 workers (unless unlocked)
max_workers = min(cpu_count, 61)
if cpu_count > 61:
print(f"Detected {cpu_count} CPU threads, but Windows limits ProcessPoolExecutor to 61 workers.")
print("Install 'unlock-processpool-win' package to use all cores: pip install unlock-processpool-win")
else:
max_workers = cpu_count
if UNLOCKED:
print(f"Using unlock-processpool-win to bypass Windows 61-worker limit")
print(f"Using {max_workers} worker processes for compression...") print(f"Using {max_workers} worker processes for compression...")
print("-" * 80) print("-" * 80)
compressed = 0 compressed = 0
skipped = 0 skipped = len(jpg_files) # Start with JPG files counted as skipped
failed = 0 failed = 0
corrupted = 0
corrupted_files = []
total_original_size = 0 total_original_size = 0
total_new_size = 0 total_new_size = 0
start_time = time.time() start_time = time.time()
@@ -261,7 +354,7 @@ def main():
# Process results as they complete # Process results as they complete
for future in as_completed(future_to_file): for future in as_completed(future_to_file):
result = future.result() result = future.result()
file_path, success, error, orig_size, new_size, savings_pct, was_skipped = result file_path, success, error, orig_size, new_size, savings_pct, was_skipped, is_corrupted = result
if success: if success:
if was_skipped: if was_skipped:
@@ -280,7 +373,7 @@ def main():
# Update every file or every 0.5 seconds, whichever comes first # Update every file or every 0.5 seconds, whichever comes first
if processed == 1 or time_since_update >= 0.5: if processed == 1 or time_since_update >= 0.5:
rate = processed / elapsed if elapsed > 0 else 0 rate = processed / elapsed if elapsed > 0 else 0
remaining = len(png_files) - processed remaining = len(png_files) - processed - failed
eta_seconds = remaining / rate if rate > 0 and remaining > 0 else 0 eta_seconds = remaining / rate if rate > 0 and remaining > 0 else 0
total_savings = total_original_size - total_new_size total_savings = total_original_size - total_new_size
@@ -290,11 +383,22 @@ def main():
elapsed_str = format_time(elapsed) elapsed_str = format_time(elapsed)
print(f"[{processed:5d}/{len(png_files)}] " print(f"[{processed:5d}/{len(png_files)}] "
f"Compressed: {compressed} | Skipped: {skipped} | " f"Compressed: {compressed} | Skipped: {skipped} | Corrupted: {corrupted} | "
f"Speed: {rate:.1f} files/sec | " f"Speed: {rate:.1f} files/sec | "
f"Elapsed: {elapsed_str} | ETA: {eta_str} | " f"Elapsed: {elapsed_str} | ETA: {eta_str} | "
f"Saved: {format_size(total_savings)} ({total_savings_pct:.1f}%)", end='\r') f"Saved: {format_size(total_savings)} ({total_savings_pct:.1f}%)", end='\r')
last_update_time = current_time last_update_time = current_time
else:
if is_corrupted:
corrupted += 1
# Move corrupted file to corrupted folder
move_success, move_error = move_to_corrupted(file_path, input_dir, corrupted_dir)
if move_success:
corrupted_files.append((file_path, error, f"Moved to {corrupted_dir / Path(file_path).relative_to(input_dir)}"))
print(f"\n[CORRUPTED] {file_path}: {error} -> Moved to corrupted folder")
else:
corrupted_files.append((file_path, error, f"Failed to move: {move_error}"))
print(f"\n[CORRUPTED] {file_path}: {error} (Failed to move: {move_error})")
else: else:
failed += 1 failed += 1
print(f"\n[ERROR] Failed to compress {file_path}: {error}") print(f"\n[ERROR] Failed to compress {file_path}: {error}")
@@ -308,9 +412,13 @@ def main():
print("\n" + "=" * 80) print("\n" + "=" * 80)
print(f"Compression complete!") print(f"Compression complete!")
print(f"Successfully compressed: {compressed} files") print(f"Successfully compressed: {compressed} files")
print(f"Skipped (already exist): {skipped} files") skipped_existing = skipped - len(jpg_files) if skipped >= len(jpg_files) else 0
if jpg_files: if jpg_files:
print(f"Ignored (JPG/JPEG): {len(jpg_files)} files") print(f"Skipped: {skipped} files ({skipped_existing} already exist, {len(jpg_files)} JPG/JPEG)")
else:
print(f"Skipped (already exist): {skipped} files")
if corrupted > 0:
print(f"Corrupted (bad PNGs): {corrupted} files")
print(f"Failed: {failed} files") print(f"Failed: {failed} files")
print(f"Total time: {format_time(total_time)}") print(f"Total time: {format_time(total_time)}")
print(f"Average speed: {avg_rate:.2f} files/second") print(f"Average speed: {avg_rate:.2f} files/second")
@@ -318,6 +426,22 @@ def main():
print(f"Compressed size: {format_size(total_new_size)}") print(f"Compressed size: {format_size(total_new_size)}")
print(f"Total savings: {format_size(total_savings)} ({total_savings_pct:.1f}%)") print(f"Total savings: {format_size(total_savings)} ({total_savings_pct:.1f}%)")
# Print list of corrupted files if any
if corrupted_files:
print("\n" + "=" * 80)
print("CORRUPTED FILES LIST:")
print("=" * 80)
for item in corrupted_files:
if len(item) == 3:
file_path, error, move_status = item
print(f" {file_path}")
print(f" Reason: {error}")
print(f" Status: {move_status}")
else:
file_path, error = item
print(f" {file_path}")
print(f" Reason: {error}")
if __name__ == '__main__': if __name__ == '__main__':
main() main()