| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- """
- Translation and sanity checking logic.
- """
- from typing import Tuple, Optional
- from vtt_utils import VTTFile, has_japanese_characters
- from ollama_client import OllamaClient
class TranslationProcessor:
    """Handles translation and sanity checking of VTT chunks.

    A chunk is translated one subtitle at a time through the supplied
    Ollama client, then validated by counting blank lines and lines that
    still contain Japanese characters. A chunk that fails validation is
    translated once more before it is given up on.
    """

    # Fraction of a chunk's subtitles that may come back untranslated
    # (None or blank) before translate_chunk() rejects the whole chunk.
    MAX_FAILED_TRANSLATION_RATIO = 0.5

    # Fraction of a chunk's subtitles that may be blank before
    # sanity_check() rejects the chunk.
    MAX_EMPTY_RATIO = 0.1

    # Percentage of a chunk's subtitles that may still contain Japanese
    # characters (probably proper nouns/names) before sanity_check()
    # rejects the chunk.
    MAX_JAPANESE_PERCENT = 10

    def __init__(self, ollama_client: OllamaClient):
        """
        Initialize translation processor.

        Args:
            ollama_client: OllamaClient instance for making translation requests
        """
        self.ollama_client = ollama_client

    def translate_chunk(self, vtt_chunk: VTTFile) -> Optional[VTTFile]:
        """
        Translate a VTT chunk by translating each subtitle individually.

        A subtitle whose translation comes back as None or whitespace-only
        is kept with empty text and counted as a failure. Timings are copied
        unchanged from the source subtitles.

        Args:
            vtt_chunk: VTTFile chunk to translate

        Returns:
            Translated VTTFile, or None if the chunk has no subtitles or more
            than MAX_FAILED_TRANSLATION_RATIO of its subtitles failed
        """
        if not vtt_chunk.subtitles:
            return None

        # Function-scope import, kept from the original implementation.
        from vtt_utils import Subtitle

        max_failures = len(vtt_chunk.subtitles) * self.MAX_FAILED_TRANSLATION_RATIO
        translated_subtitles = []
        failed_count = 0

        for original_sub in vtt_chunk.subtitles:
            # Translate each subtitle individually for maximum accuracy
            translated_text = self.ollama_client.translate(original_sub.text)

            if translated_text is None or not translated_text.strip():
                failed_count += 1
                # The failure count only ever grows, so once it passes the
                # budget the chunk is certain to be rejected; stop here
                # instead of issuing further translation requests.
                if failed_count > max_failures:
                    return None
                translated_text = ""

            translated_subtitles.append(Subtitle(
                start_time=original_sub.start_time,
                end_time=original_sub.end_time,
                text=translated_text,
            ))

        # Build the result without running VTTFile.__init__ and fill in the
        # two attributes by hand.
        # NOTE(review): presumably __init__ parses a file from disk - confirm.
        translated_chunk = VTTFile.__new__(VTTFile)
        translated_chunk.filepath = vtt_chunk.filepath
        translated_chunk.subtitles = translated_subtitles
        return translated_chunk

    def sanity_check(self, vtt_chunk: VTTFile) -> Tuple[bool, str]:
        """
        Perform sanity checks on translated chunk.

        Blank subtitles are counted separately and are not examined for
        Japanese characters. The blank-subtitle limit is evaluated first, so
        its message wins when both limits are exceeded.

        Args:
            vtt_chunk: VTTFile to check

        Returns:
            Tuple of (is_valid, reason_for_failure)
            is_valid is True if all checks pass, in which case the reason
            is an empty string
        """
        subtitles = vtt_chunk.subtitles
        if not subtitles:
            return False, "Chunk is empty"

        total = len(subtitles)
        empty_count = 0
        japanese_count = 0

        for subtitle in subtitles:
            # Check for empty text
            if not subtitle.text.strip():
                empty_count += 1
                continue

            # Check for Japanese characters
            if has_japanese_characters(subtitle.text):
                japanese_count += 1

        # Allow some empties (up to MAX_EMPTY_RATIO of subtitles)
        if empty_count > total * self.MAX_EMPTY_RATIO:
            return False, f"Too many empty subtitles ({empty_count})"

        # Allow some Japanese (probably proper nouns/names); this is lenient
        # because some names and special terms might not translate. The
        # percentage is taken over all subtitles, blank ones included.
        japanese_percent = (japanese_count / total) * 100
        if japanese_percent > self.MAX_JAPANESE_PERCENT:
            return False, f"{japanese_count} subtitles ({japanese_percent:.1f}%) contain Japanese characters"

        return True, ""

    def process_chunk_with_retry(self, vtt_chunk: VTTFile,
                                 chunk_id: int) -> Optional[VTTFile]:
        """
        Process a chunk with one retry on sanity check failure.

        Only a failed sanity check triggers the retry. If translate_chunk()
        returns None, on either attempt, processing stops immediately.
        Progress and failure messages are written with print().

        Args:
            vtt_chunk: VTT chunk to process
            chunk_id: Identifier for logging purposes

        Returns:
            Processed and verified VTT chunk or None if processing fails
        """
        reason = ""

        # Two passes: the first attempt, then the single retry.
        for is_retry in (False, True):
            translated = self.translate_chunk(vtt_chunk)
            if translated is None:
                stage = "retry translation" if is_retry else "translation"
                print(f" ✗ Chunk {chunk_id} {stage} returned None")
                return None

            is_valid, reason = self.sanity_check(translated)
            if is_valid:
                return translated

            if not is_retry:
                print(f" ⚠ Chunk {chunk_id} sanity check failed: {reason}. Retrying...")

        # Failed after retry
        print(f" ✗ Chunk {chunk_id} failed after retry: {reason}")
        return None
|