mitch
/
Japanese-VTT-Translator


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
							#!/usr/bin/env python3
"""
Main script for translating Japanese VTT files to English using Ollama.

This script orchestrates the entire translation pipeline:
1. Prompts user for input VTT file
2. Analyzes and chunks the file
3. Translates each chunk via Ollama
4. Validates translations
5. Reassembles into final output
"""

import os
import sys
import tempfile
import shutil
from pathlib import Path

from vtt_utils import VTTFile, Subtitle
from chunker import VTTChunker
from ollama_client import OllamaClient
from translator import TranslationProcessor
from reassembler import VTTReassembler
from tui import ProgressDisplay


# Configuration from environment / hardcoded defaults
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://ai-house:11434/')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'translategemma:12b')
TEMP_DIR = '/tmp/'


def get_input_file() -> str:
    """
    Prompt user for input VTT file path.

    Returns:
        Absolute path to the VTT file
    """
    display = ProgressDisplay()
    display.print_banner("Japanese VTT Translator")

    while True:
        display.print_info("Enter the path to your Japanese VTT file:")
        file_path = input("  > ").strip()

        if not file_path:
            display.print_warning("Please enter a valid path.")
            continue

        # Expand user home directory
        expanded_path = os.path.expanduser(file_path)

        # Convert to absolute path
        if not os.path.isabs(expanded_path):
            expanded_path = os.path.abspath(expanded_path)

        if not os.path.exists(expanded_path):
            display.print_error(f"File not found: {expanded_path}")
            continue

        if not expanded_path.lower().endswith('.vtt'):
            display.print_warning("File must be a .vtt file.")
            continue

        return expanded_path

    return ""


def validate_ollama_connection() -> bool:
    """
    Validate that Ollama server is available and has the required model.

    Returns:
        True if connection is valid, False otherwise
    """
    display = ProgressDisplay()

    display.print_section("Validating Ollama Connection")
    display.print_info(f"Server URL: {OLLAMA_BASE_URL}")
    display.print_info(f"Model: {OLLAMA_MODEL}")

    client = OllamaClient(OLLAMA_BASE_URL, OLLAMA_MODEL)

    if not client.is_available():
        display.print_error("Cannot connect to Ollama server.")
        display.print_info(f"Make sure Ollama is running at {OLLAMA_BASE_URL}")
        return False

    display.print_success("✓ Connected to Ollama")

    # Try to get model info
    model_info = client.get_model_info()
    if model_info:
        display.print_success(f"✓ Model '{OLLAMA_MODEL}' is available")
    else:
        display.print_warning(f"Could not verify model '{OLLAMA_MODEL}' availability")
        display.print_info("Proceeding anyway - may fail during translation")

    return True


def main():
    """Main execution flow."""
    display = ProgressDisplay()

    try:
        # Step 1: Get input file
        display.print_step(1, 6, "Select Input File")
        input_file = get_input_file()
        if not input_file:
            display.print_error("No valid file selected. Exiting.")
            return

        display.print_success(f"✓ Selected: {input_file}")

        # Step 2: Validate Ollama connection
        display.print_step(2, 6, "Validate Ollama Connection")
        if not validate_ollama_connection():
            display.print_error("Cannot proceed without Ollama connection. Exiting.")
            return

        # Step 3: Load and analyze input file
        display.print_step(3, 6, "Load and Analyze VTT File")
        display.print_info("Loading VTT file...")

        try:
            vtt_file = VTTFile(input_file)
        except Exception as e:
            display.print_error(f"Failed to parse VTT file: {e}")
            return

        display.print_success(f"✓ Loaded {len(vtt_file.subtitles)} subtitles")

        total_minutes, total_hours = vtt_file.get_duration()
        display.print_file_info(
            os.path.basename(input_file),
            total_minutes,
            total_hours,
            0  # Will update after chunking
        )

        # Step 4: Chunk the file
        display.print_step(4, 6, "Chunk VTT File")
        display.print_info("Chunking file respecting token limits...")

        chunker = VTTChunker(vtt_file)
        chunks = chunker.chunk()

        display.print_success(f"✓ Created {len(chunks)} chunks")

        token_estimates = chunker.get_chunk_token_estimates()
        display.print_info(f"Average tokens per chunk: {sum(token_estimates) // len(token_estimates)}")

        # Step 5: Translate chunks
        display.print_step(5, 6, "Translate Chunks")
        display.print_info(
            f"Translating {len(chunks)} chunks via Ollama (this may take several minutes)..."
        )

        client = OllamaClient(OLLAMA_BASE_URL, OLLAMA_MODEL)
        processor = TranslationProcessor(client)
        translated_chunks = []
        failed_chunks = []

        for i, chunk in enumerate(chunks, 1):
            display.print_chunk_status(
                i, len(chunks), "⏳ Processing...",
                f"{len(chunk.subtitles)} subtitles"
            )

            # Create a custom processor that shows progress
            client = OllamaClient(OLLAMA_BASE_URL, OLLAMA_MODEL)

            # Translate subtitles with progress
            translated_subs = []
            for j, sub in enumerate(chunk.subtitles, 1):
                # Show progress bar for every subtitle (or every 10 if there are many)
                progress_interval = max(1, len(chunk.subtitles) // 20) if len(chunk.subtitles) > 50 else 1
                if j % progress_interval == 0 or j == 1 or j == len(chunk.subtitles):
                    display.print_progress_bar(
                        j, len(chunk.subtitles),
                        label=f"Chunk {i}"
                    )

                # Translate with feedback
                print(f"    Translating subtitle {j}/{len(chunk.subtitles)}...", end="\r", flush=True)
                translated_text = client.translate(sub.text)
                if translated_text is None:
                    translated_text = ""
                translated_subs.append(Subtitle(
                    start_time=sub.start_time,
                    end_time=sub.end_time,
                    text=translated_text
                ))
            print()  # Clear the progress line

            # Create translated chunk
            processed_chunk = VTTFile.__new__(VTTFile)
            processed_chunk.filepath = chunk.filepath
            processed_chunk.subtitles = translated_subs

            # Sanity check
            is_valid, reason = processor.sanity_check(processed_chunk)
            if is_valid:
                translated_chunks.append(processed_chunk)
                display.print_chunk_status(i, len(chunks), "✓ Translated")
            else:
                # Try once more
                display.print_warning(f"  Sanity check failed: {reason}. Retrying chunk {i}...")
                translated_subs = []
                for j, sub in enumerate(chunk.subtitles, 1):
                    print(f"    Retrying subtitle {j}/{len(chunk.subtitles)}...", end="\r", flush=True)
                    translated_text = client.translate(sub.text)
                    if translated_text is None:
                        translated_text = ""
                    translated_subs.append(Subtitle(
                        start_time=sub.start_time,
                        end_time=sub.end_time,
                        text=translated_text
                    ))
                print()  # Clear the progress line

                processed_chunk = VTTFile.__new__(VTTFile)
                processed_chunk.filepath = chunk.filepath
                processed_chunk.subtitles = translated_subs

                is_valid, reason = processor.sanity_check(processed_chunk)
                if is_valid:
                    translated_chunks.append(processed_chunk)
                    display.print_chunk_status(i, len(chunks), "✓ Translated (retry)")
                else:
                    failed_chunks.append(i)
                    display.print_chunk_status(i, len(chunks), f"✗ Failed: {reason}")

        if failed_chunks:
            display.print_warning(
                f"Failed to translate {len(failed_chunks)} chunk(s): {failed_chunks}"
            )
            if len(failed_chunks) == len(chunks):
                display.print_error("All chunks failed. Cannot proceed. Exiting.")
                return
        else:
            display.print_success(f"✓ All {len(chunks)} chunks translated successfully")

        # Step 6: Reassemble and finalize
        display.print_step(6, 6, "Reassemble and Finalize")
        display.print_info("Reassembling translated chunks...")

        if not translated_chunks:
            display.print_error("No translated chunks available. Exiting.")
            return

        output_dir = os.path.dirname(input_file)
        output_path = VTTReassembler.reassemble(
            translated_chunks,
            os.path.basename(input_file),
            output_dir
        )

        display.print_success(f"✓ Reassembled into single file")

        # Final summary
        display.print_banner("Translation Complete!")
        display.print_info(f"Output file: {output_path}")

        if failed_chunks:
            display.print_warning(
                f"Note: {len(failed_chunks)} chunk(s) could not be translated. "
                f"Output is incomplete."
            )

        display.print_success("Translation pipeline completed successfully!")

    except KeyboardInterrupt:
        display.print_warning("\nInterrupted by user.")
        sys.exit(1)
    except Exception as e:
        display.print_error(f"Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()