cleanup_failed.py

#!/usr/bin/env python3
"""
Cleanup script for failed OCR processing.

Finds files marked as processed but with no valid JSON output, and
optionally removes them from the index.
"""

import argparse
import json
from pathlib import Path
from typing import Dict, List


class FailureCleanup:
    """Clean up failed processing attempts."""

    def __init__(
        self,
        index_file: str = "processing_index.json",
        downloads_dir: str = "./downloads",
        results_dir: str = "./results",
    ):
        self.index_file = Path(index_file)
        self.downloads_dir = Path(downloads_dir)
        self.results_dir = Path(results_dir)

    def load_index(self) -> Dict:
        """Load the processing index."""
        if not self.index_file.exists():
            print(f"❌ Index file not found: {self.index_file}")
            return {"processed_files": [], "failed_files": []}
        with open(self.index_file, 'r') as f:
            return json.load(f)
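
    # The index file is assumed to look roughly like this (hypothetical
    # example; 'failed_files' entries may be plain strings or dicts with a
    # 'filename' key, and find_failures() below handles both forms):
    #
    #   {
    #     "processed_files": ["batch1/page_001.jpg", "batch1/page_002.jpg"],
    #     "failed_files": [{"filename": "batch1/page_003.jpg", "error": "..."}]
    #   }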

    def get_relative_path(self, file_path: Path) -> str:
        """Get the path relative to the downloads directory."""
        try:
            return str(file_path.relative_to(self.downloads_dir))
        except ValueError:
            return str(file_path)

    def check_json_exists(self, relative_path: str) -> bool:
        """Check if JSON output exists for this file."""
        # Convert image path to JSON path
        json_path = self.results_dir / Path(relative_path).with_suffix('.json')
        return json_path.exists()
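
    # Illustrative example of the image-to-JSON mapping above (hypothetical
    # paths): downloads/batch1/page_001.jpg -> results/batch1/page_001.json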

    def check_json_valid(self, relative_path: str) -> bool:
        """Check if the JSON output exists and parses successfully."""
        json_path = self.results_dir / Path(relative_path).with_suffix('.json')
        if not json_path.exists():
            return False
        try:
            with open(json_path, 'r') as f:
                json.load(f)
            return True
        except (json.JSONDecodeError, UnicodeDecodeError, OSError):
            # Treat parse, encoding, and I/O errors all as invalid output.
            return False

    def find_failures(self) -> Dict[str, List[str]]:
        """Find all types of failures."""
        index_data = self.load_index()
        processed_files = set(index_data.get('processed_files', []))
        explicit_failures = index_data.get('failed_files', [])

        failures = {
            'no_json': [],          # Marked processed but no JSON exists
            'invalid_json': [],     # JSON exists but is invalid/corrupt
            'explicit_failed': [],  # Listed in failed_files
            'orphaned_json': [],    # JSON exists but not in processed list (shouldn't happen)
        }

        print("🔍 Scanning for failures...\n")

        # Check each processed file
        for relative_path in processed_files:
            if not self.check_json_exists(relative_path):
                failures['no_json'].append(relative_path)
            elif not self.check_json_valid(relative_path):
                failures['invalid_json'].append(relative_path)

        # Add explicit failures; entries may be dicts or plain strings
        for failure in explicit_failures:
            filename = failure.get('filename') if isinstance(failure, dict) else failure
            if filename:  # Skip malformed entries without a filename
                failures['explicit_failed'].append(filename)

        # Find orphaned JSON files (exist but not marked as processed)
        if self.results_dir.exists():
            for json_file in self.results_dir.glob("**/*.json"):
                relative_path = str(json_file.relative_to(self.results_dir).with_suffix(''))
                # Add back the original extension (assuming .jpg, could be others)
                for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']:
                    if relative_path + ext in processed_files:
                        break
                else:
                    # Not found with any extension
                    failures['orphaned_json'].append(str(json_file.relative_to(self.results_dir)))

        return failures

    def show_report(self, failures: Dict[str, List[str]]):
        """Display failure report."""
        print("=" * 70)
        print("FAILURE REPORT")
        print("=" * 70)

        total_failures = sum(len(v) for k, v in failures.items() if k != 'orphaned_json')

        sections = [
            ('no_json', "❌ NO JSON OUTPUT",
             "Files marked as processed but no JSON result exists:"),
            ('invalid_json', "⚠️ INVALID JSON",
             "JSON file exists but is corrupt/invalid:"),
            ('explicit_failed', "📋 EXPLICITLY FAILED",
             "Listed in failed_files in the index:"),
            ('orphaned_json', "👻 ORPHANED JSON",
             "JSON files exist but not marked as processed (shouldn't happen):"),
        ]
        for key, title, description in sections:
            files = failures[key]
            if not files:
                continue
            print(f"\n{title} ({len(files)} files)")
            print(f"   {description}")
            for f in files[:10]:
                print(f"   - {f}")
            if len(files) > 10:
                print(f"   ... and {len(files) - 10} more")

        print("\n" + "=" * 70)
        print(f"TOTAL FAILURES: {total_failures}")
        print("=" * 70)

    def cleanup(self, failures: Dict[str, List[str]], delete_invalid_json: bool = False):
        """Remove failed files from the processed list so they can be retried."""
        index_data = self.load_index()
        processed_files = set(index_data.get('processed_files', []))

        # Files to remove from processed list (so they can be retried)
        files_to_remove = set()
        files_to_remove.update(failures['no_json'])
        files_to_remove.update(failures['invalid_json'])
        files_to_remove.update(failures['explicit_failed'])

        # Remove from processed list
        original_count = len(processed_files)
        processed_files -= files_to_remove
        removed_count = original_count - len(processed_files)

        # Update and save the index
        index_data['processed_files'] = sorted(processed_files)
        index_data['failed_files'] = []  # Clear failed files list
        with open(self.index_file, 'w') as f:
            json.dump(index_data, f, indent=2)

        print(f"\n✅ Removed {removed_count} files from processed list")
        print("   These files will be retried on next run")

        # Optionally delete invalid JSON files
        if delete_invalid_json and failures['invalid_json']:
            deleted = 0
            for relative_path in failures['invalid_json']:
                json_path = self.results_dir / Path(relative_path).with_suffix('.json')
                if json_path.exists():
                    json_path.unlink()
                    deleted += 1
            print(f"🗑️ Deleted {deleted} invalid JSON files")


def main():
    parser = argparse.ArgumentParser(description="Clean up failed OCR processing attempts")
    parser.add_argument("--doit", action="store_true",
                        help="Actually perform cleanup (default: dry run)")
    parser.add_argument("--delete-invalid-json", action="store_true",
                        help="Also delete invalid JSON files")
    parser.add_argument("--index", default="processing_index.json", help="Index file path")
    parser.add_argument("--downloads-dir", default="./downloads", help="Downloads directory")
    parser.add_argument("--results-dir", default="./results", help="Results directory")
    args = parser.parse_args()

    cleanup = FailureCleanup(
        index_file=args.index,
        downloads_dir=args.downloads_dir,
        results_dir=args.results_dir,
    )

    # Find failures and show the report
    failures = cleanup.find_failures()
    cleanup.show_report(failures)

    # Check if there's anything to clean
    total_failures = sum(len(v) for k, v in failures.items() if k != 'orphaned_json')
    if total_failures == 0:
        print("\n✨ No failures found - everything looks good!")
        return

    # Perform cleanup if requested
    if args.doit:
        print("\n🚨 PERFORMING CLEANUP...")
        response = input("Are you sure? This will remove failed files from the processed list. (yes/no): ")
        if response.lower() == 'yes':
            cleanup.cleanup(failures, delete_invalid_json=args.delete_invalid_json)
            print("\n✅ Cleanup complete!")
        else:
            print("❌ Cleanup cancelled")
    else:
        print("\n💡 This was a DRY RUN - no changes made")
        print("   Run with --doit to actually remove failed files from the processed list")
        print("   Add --delete-invalid-json to also delete corrupt JSON files")


if __name__ == "__main__":
    main()