translator.py 4.5 KB

"""
Translation and sanity checking logic.
"""
  4. from typing import Tuple, Optional
  5. from vtt_utils import VTTFile, has_japanese_characters
  6. from ollama_client import OllamaClient
  7. class TranslationProcessor:
  8. """Handles translation and sanity checking of VTT chunks."""
  9. def __init__(self, ollama_client: OllamaClient):
  10. """
  11. Initialize translation processor.
  12. Args:
  13. ollama_client: OllamaClient instance for making translation requests
  14. """
  15. self.ollama_client = ollama_client
  16. def translate_chunk(self, vtt_chunk: VTTFile) -> Optional[VTTFile]:
  17. """
  18. Translate a VTT chunk by translating each subtitle individually.
  19. Args:
  20. vtt_chunk: VTTFile chunk to translate
  21. Returns:
  22. Translated VTTFile or None if translation fails
  23. """
  24. if not vtt_chunk.subtitles:
  25. return None
  26. from vtt_utils import Subtitle
  27. translated_subtitles = []
  28. failed_count = 0
  29. for original_sub in vtt_chunk.subtitles:
  30. # Translate each subtitle individually for maximum accuracy
  31. translated_text = self.ollama_client.translate(original_sub.text)
  32. if translated_text is None or not translated_text.strip():
  33. failed_count += 1
  34. translated_text = ""
  35. translated_subtitles.append(Subtitle(
  36. start_time=original_sub.start_time,
  37. end_time=original_sub.end_time,
  38. text=translated_text
  39. ))
  40. # If more than 50% failed, something is very wrong
  41. if failed_count > len(vtt_chunk.subtitles) * 0.5:
  42. return None
  43. # Create new VTT file with translated subtitles
  44. translated_chunk = VTTFile.__new__(VTTFile)
  45. translated_chunk.filepath = vtt_chunk.filepath
  46. translated_chunk.subtitles = translated_subtitles
  47. return translated_chunk
  48. def sanity_check(self, vtt_chunk: VTTFile) -> Tuple[bool, str]:
  49. """
  50. Perform sanity checks on translated chunk.
  51. Args:
  52. vtt_chunk: VTTFile to check
  53. Returns:
  54. Tuple of (is_valid, reason_for_failure)
  55. is_valid is True if all checks pass
  56. """
  57. if not vtt_chunk.subtitles:
  58. return False, "Chunk is empty"
  59. empty_count = 0
  60. japanese_count = 0
  61. for i, subtitle in enumerate(vtt_chunk.subtitles):
  62. # Check for empty text
  63. if not subtitle.text.strip():
  64. empty_count += 1
  65. continue
  66. # Check for Japanese characters
  67. if has_japanese_characters(subtitle.text):
  68. japanese_count += 1
  69. # Allow some empties (up to 10% of subtitles)
  70. if empty_count > len(vtt_chunk.subtitles) * 0.1:
  71. return False, f"Too many empty subtitles ({empty_count})"
  72. # Allow some Japanese (up to 10% - probably proper nouns/names)
  73. # This is more lenient since some names and special terms might not translate
  74. japanese_percent = (japanese_count / len(vtt_chunk.subtitles)) * 100
  75. if japanese_percent > 10:
  76. return False, f"{japanese_count} subtitles ({japanese_percent:.1f}%) contain Japanese characters"
  77. return True, ""
  78. def process_chunk_with_retry(self, vtt_chunk: VTTFile,
  79. chunk_id: int) -> Optional[VTTFile]:
  80. """
  81. Process a chunk with one retry on sanity check failure.
  82. Args:
  83. vtt_chunk: VTT chunk to process
  84. chunk_id: Identifier for logging purposes
  85. Returns:
  86. Processed and verified VTT chunk or None if processing fails
  87. """
  88. # First attempt
  89. translated = self.translate_chunk(vtt_chunk)
  90. if translated is None:
  91. print(f" ✗ Chunk {chunk_id} translation returned None")
  92. return None
  93. # Check sanity
  94. is_valid, reason = self.sanity_check(translated)
  95. if is_valid:
  96. return translated
  97. # One retry
  98. print(f" ⚠ Chunk {chunk_id} sanity check failed: {reason}. Retrying...")
  99. translated = self.translate_chunk(vtt_chunk)
  100. if translated is None:
  101. print(f" ✗ Chunk {chunk_id} retry translation returned None")
  102. return None
  103. # Check again
  104. is_valid, reason = self.sanity_check(translated)
  105. if is_valid:
  106. return translated
  107. # Failed after retry
  108. print(f" ✗ Chunk {chunk_id} failed after retry: {reason}")
  109. return None