| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- #!/usr/bin/env python3
- """
- Cleanup script for failed OCR processing.
- Finds files marked as processed but with no valid JSON output, and optionally removes them from the index.
- """
- import os
- import json
- from pathlib import Path
- import argparse
- from typing import Set, List, Dict
class FailureCleanup:
    """Find and clean up failed OCR processing attempts.

    The processing index is a JSON document holding two lists:
    ``processed_files`` (path strings) and ``failed_files`` (plain strings
    or dicts carrying a ``filename`` key).  The result for a processed file
    is looked up under ``results_dir`` at the same relative path with its
    suffix replaced by ``.json``.
    """

    # Maximum number of entries listed per category in the report.
    REPORT_LIMIT = 10

    def __init__(self, index_file: str = "processing_index.json", downloads_dir: str = "./downloads", results_dir: str = "./results"):
        """Store the index file, downloads directory and results directory as paths."""
        self.index_file = Path(index_file)
        self.downloads_dir = Path(downloads_dir)
        self.results_dir = Path(results_dir)

    def load_index(self) -> Dict:
        """Load the processing index.

        Returns:
            The parsed index, or an empty skeleton (after printing a
            warning) when the index file does not exist.

        Raises:
            json.JSONDecodeError: If the index file is not valid JSON.
        """
        if not self.index_file.exists():
            print(f"❌ Index file not found: {self.index_file}")
            return {"processed_files": [], "failed_files": []}
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(self.index_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_relative_path(self, file_path: Path) -> str:
        """Return ``file_path`` relative to the downloads directory.

        Falls back to the unchanged path (as a string) when ``file_path``
        is not located under the downloads directory.
        """
        try:
            return str(file_path.relative_to(self.downloads_dir))
        except ValueError:
            return str(file_path)

    def _json_path(self, relative_path: str) -> Path:
        """Map a processed file's relative path to its expected JSON result path."""
        return self.results_dir / Path(relative_path).with_suffix('.json')

    def check_json_exists(self, relative_path: str) -> bool:
        """Return True if a JSON output file exists for this file."""
        return self._json_path(relative_path).exists()

    def check_json_valid(self, relative_path: str) -> bool:
        """Return True if the JSON output exists and parses successfully."""
        try:
            with open(self._json_path(relative_path), 'r', encoding='utf-8') as f:
                json.load(f)
        except (OSError, ValueError, RecursionError):
            # OSError: missing/unreadable file.  ValueError: covers both
            # json.JSONDecodeError and UnicodeDecodeError.  RecursionError:
            # pathologically nested documents.  All mean "not usable".
            return False
        return True

    def find_failures(self) -> Dict[str, List[str]]:
        """Classify every failure found in the index and the results directory.

        Returns:
            A dict with four lists:

            * ``no_json`` - marked processed, but no JSON result exists.
            * ``invalid_json`` - JSON result exists but cannot be parsed.
            * ``explicit_failed`` - entries of ``failed_files`` in the index.
            * ``orphaned_json`` - JSON results (relative to the results
              directory) that no processed file maps to.
        """
        index_data = self.load_index()
        # ``or []`` tolerates an explicit null in the index.
        processed_files = set(index_data.get('processed_files') or [])
        explicit_failures = index_data.get('failed_files') or []

        failures: Dict[str, List[str]] = {
            'no_json': [],          # Marked processed but no JSON exists
            'invalid_json': [],     # JSON exists but is invalid/corrupt
            'explicit_failed': [],  # Listed in failed_files
            'orphaned_json': [],    # JSON exists but not in processed list (shouldn't happen)
        }
        print("🔍 Scanning for failures...\n")

        # Sorted so the report is stable from run to run (set order is not).
        for relative_path in sorted(processed_files):
            if not self.check_json_exists(relative_path):
                failures['no_json'].append(relative_path)
            elif not self.check_json_valid(relative_path):
                failures['invalid_json'].append(relative_path)

        # Entries may be dicts with a 'filename' key or bare strings.
        for failure in explicit_failures:
            filename = failure.get('filename') if isinstance(failure, dict) else failure
            failures['explicit_failed'].append(filename)

        if self.results_dir.exists():
            # Invert the processed-file -> JSON mapping once.  Comparing
            # against the exact expected result paths works for any source
            # extension (including upper-case ones such as ".JPG") instead
            # of guessing from a fixed list of image suffixes.
            expected = {Path(p).with_suffix('.json') for p in processed_files}
            for json_file in sorted(self.results_dir.glob("**/*.json")):
                relative_json = json_file.relative_to(self.results_dir)
                if relative_json not in expected:
                    failures['orphaned_json'].append(str(relative_json))
        return failures

    def _print_section(self, title: str, description: str, entries: List[str]) -> None:
        """Print one report category, listing at most REPORT_LIMIT entries."""
        if not entries:
            return
        print(f"\n{title} ({len(entries)} files)")
        print(description)
        for entry in entries[:self.REPORT_LIMIT]:
            print(f" - {entry}")
        if len(entries) > self.REPORT_LIMIT:
            print(f" ... and {len(entries) - self.REPORT_LIMIT} more")

    def show_report(self, failures: Dict[str, List[str]]):
        """Print a human-readable failure report.

        Orphaned JSON files are listed but are not counted in the total,
        because they do not correspond to a file that needs retrying.
        """
        print("=" * 70)
        print("FAILURE REPORT")
        print("=" * 70)
        total_failures = sum(len(v) for k, v in failures.items() if k != 'orphaned_json')

        self._print_section(
            "❌ NO JSON OUTPUT",
            " Files marked as processed but no JSON result exists:",
            failures['no_json'],
        )
        self._print_section(
            "⚠️ INVALID JSON",
            " JSON file exists but is corrupt/invalid:",
            failures['invalid_json'],
        )
        self._print_section(
            "📋 EXPLICITLY FAILED",
            " Listed in failed_files in the index:",
            failures['explicit_failed'],
        )
        self._print_section(
            "👻 ORPHANED JSON",
            " JSON files exist but not marked as processed (shouldn't happen):",
            failures['orphaned_json'],
        )

        print("\n" + "=" * 70)
        print(f"TOTAL FAILURES: {total_failures}")
        print("=" * 70)

    def cleanup(self, failures: Dict[str, List[str]], delete_invalid_json: bool = False):
        """Remove failed files from the processed list and clear ``failed_files``.

        Args:
            failures: Result of :meth:`find_failures`.
            delete_invalid_json: Also delete the unparsable JSON result
                files named in ``failures['invalid_json']``.
        """
        index_data = self.load_index()
        processed_files = set(index_data.get('processed_files') or [])

        # Everything in these categories is dropped so it can be retried.
        files_to_remove = set()
        for category in ('no_json', 'invalid_json', 'explicit_failed'):
            files_to_remove.update(failures[category])

        remaining = processed_files - files_to_remove
        removed_count = len(processed_files) - len(remaining)

        index_data['processed_files'] = sorted(remaining)
        index_data['failed_files'] = []  # Clear failed files list

        # Write to a sibling temp file and swap it in, so an interrupted
        # write can never leave a truncated index behind.
        tmp_file = self.index_file.with_name(self.index_file.name + '.tmp')
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f, indent=2)
        tmp_file.replace(self.index_file)

        print(f"\n✅ Removed {removed_count} files from processed list")
        print(" These files will be retried on next run")

        if delete_invalid_json and failures['invalid_json']:
            deleted = 0
            for relative_path in failures['invalid_json']:
                try:
                    self._json_path(relative_path).unlink()
                except FileNotFoundError:
                    # Already gone (e.g. removed since the scan); nothing to do.
                    continue
                deleted += 1
            print(f"🗑️ Deleted {deleted} invalid JSON files")
def main():
    """Command-line entry point: scan, report and optionally clean up.

    Without ``--doit`` this is a dry run that only prints the report.
    With ``--doit`` the user must additionally confirm by typing ``yes``
    before the index is modified.
    """
    parser = argparse.ArgumentParser(description="Clean up failed OCR processing attempts")
    parser.add_argument("--doit", action="store_true", help="Actually perform cleanup (default: dry run)")
    parser.add_argument("--delete-invalid-json", action="store_true", help="Also delete invalid JSON files")
    parser.add_argument("--index", default="processing_index.json", help="Index file path")
    parser.add_argument("--downloads-dir", default="./downloads", help="Downloads directory")
    parser.add_argument("--results-dir", default="./results", help="Results directory")
    args = parser.parse_args()

    cleanup = FailureCleanup(
        index_file=args.index,
        downloads_dir=args.downloads_dir,
        results_dir=args.results_dir,
    )

    failures = cleanup.find_failures()
    cleanup.show_report(failures)

    # Orphaned JSON files are reported but never trigger a cleanup.
    total_failures = sum(len(v) for k, v in failures.items() if k != 'orphaned_json')
    if total_failures == 0:
        print("\n✨ No failures found - everything looks good!")
        return

    if not args.doit:
        print("\n💡 This was a DRY RUN - no changes made")
        print(" Run with --doit to actually remove failed files from the processed list")
        print(" Add --delete-invalid-json to also delete corrupt JSON files")
        return

    print("\n🚨 PERFORMING CLEANUP...")
    try:
        response = input("Are you sure? This will remove failed files from the processed list. (yes/no): ")
    except EOFError:
        # stdin is closed / non-interactive: never modify the index
        # without an explicit confirmation.
        print()
        response = ''

    # Tolerate surrounding whitespace and any letter case in the answer.
    if response.strip().lower() == 'yes':
        cleanup.cleanup(failures, delete_invalid_json=args.delete_invalid_json)
        print("\n✅ Cleanup complete!")
    else:
        print("❌ Cleanup cancelled")


if __name__ == "__main__":
    main()
|