vtt_utils.py

"""
VTT (WebVTT) file parsing and manipulation utilities.
"""
import re
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class Subtitle:
    """Represents a single subtitle entry."""
    start_time: str
    end_time: str
    text: str

    @property
    def start_seconds(self) -> float:
        """Convert start time to seconds."""
        return self._time_to_seconds(self.start_time)

    @property
    def end_seconds(self) -> float:
        """Convert end time to seconds."""
        return self._time_to_seconds(self.end_time)

    @staticmethod
    def _time_to_seconds(time_str: str) -> float:
        """Convert an HH:MM:SS.mmm (or MM:SS.mmm) timestamp to seconds."""
        parts = time_str.replace(',', '.').split(':')
        # WebVTT allows the hours component to be omitted (MM:SS.mmm)
        if len(parts) == 2:
            parts.insert(0, '0')
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds = float(parts[2])
        return hours * 3600 + minutes * 60 + seconds

    def __str__(self) -> str:
        """Format as VTT subtitle entry."""
        return f"{self.start_time} --> {self.end_time}\n{self.text}"
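
# A quick illustration of Subtitle (the timestamps and text below are made up
# for the example):
#
#     sub = Subtitle(start_time='00:01:05.000', end_time='00:01:07.500',
#                    text='こんにちは')
#     sub.start_seconds  -> 65.0
#     sub.end_seconds    -> 67.5
#     str(sub)           -> '00:01:05.000 --> 00:01:07.500\nこんにちは'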


class VTTFile:
    """Handles VTT file parsing and manipulation."""

    def __init__(self, filepath: str):
        """Initialize VTT file handler."""
        self.filepath = filepath
        self.subtitles: List[Subtitle] = []
        self._parse()

    def _parse(self) -> None:
        """Parse VTT file into subtitle objects."""
        with open(self.filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        # Normalize line endings, then split by blank lines to get cue blocks
        content = content.replace('\r\n', '\n')
        blocks = content.strip().split('\n\n')
        for block in blocks:
            # Skip the WEBVTT header and empty blocks
            if block.startswith('WEBVTT') or not block.strip():
                continue
            lines = block.strip().split('\n')
            if len(lines) < 2:
                continue
            # Find the timestamp line; cues may start with an optional identifier
            timestamp_idx = next(
                (i for i, line in enumerate(lines) if '-->' in line), None
            )
            if timestamp_idx is None:
                continue
            # Parse timestamps
            parts = lines[timestamp_idx].split('-->')
            if len(parts) != 2:
                continue
            start_time = parts[0].strip()
            # Drop any cue settings that follow the end timestamp
            end_time = parts[1].strip().split(' ')[0]
            # Join the remaining lines as the cue text
            text = '\n'.join(lines[timestamp_idx + 1:])
            self.subtitles.append(Subtitle(
                start_time=start_time,
                end_time=end_time,
                text=text
            ))

    def get_duration(self) -> Tuple[int, int]:
        """
        Get the total duration of the VTT file.

        Returns a tuple of (total_minutes, total_hours), both rounded down
        to whole numbers.
        """
        if not self.subtitles:
            return 0, 0
        last_subtitle = self.subtitles[-1]
        total_seconds = last_subtitle.end_seconds
        total_minutes = int(total_seconds / 60)
        total_hours = total_minutes // 60
        return total_minutes, total_hours

    def to_string(self) -> str:
        """Convert VTT file back to string format."""
        lines = ['WEBVTT', '']
        for subtitle in self.subtitles:
            lines.append(str(subtitle))
            lines.append('')
        return '\n'.join(lines)

    def save(self, filepath: str) -> None:
        """Save VTT file to disk."""
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(self.to_string())

    def get_subtitle_range(self, start_idx: int, end_idx: int) -> 'VTTFile':
        """
        Create a new VTTFile with subtitles in the specified range.

        Returns a new VTTFile object containing subtitles[start_idx:end_idx].
        """
        # Bypass __init__ so the copy is not re-parsed from disk
        new_file = VTTFile.__new__(VTTFile)
        new_file.filepath = self.filepath
        new_file.subtitles = self.subtitles[start_idx:end_idx]
        return new_file


def estimate_token_count(text: str, avg_tokens_per_word: float = 1.3) -> int:
    """
    Rough estimate of token count using word count.

    Japanese typically has 1.3 tokens per word with most tokenizers.
    """
    words = len(text.split())
    return int(words * avg_tokens_per_word)
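
# Worked example (illustrative): '字幕のテスト です' splits into two whitespace-
# delimited "words", so estimate_token_count returns int(2 * 1.3) = 2. Because
# unsegmented Japanese contains no spaces, this estimate skews low for such text.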


def has_japanese_characters(text: str) -> bool:
    """Check if text contains Japanese characters (Hiragana, Katakana, Kanji)."""
    # Japanese Unicode ranges:
    #   Hiragana: U+3040-U+309F
    #   Katakana: U+30A0-U+30FF
    #   Kanji (CJK Unified Ideographs): U+4E00-U+9FFF
    japanese_pattern = r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]'
    return bool(re.search(japanese_pattern, text))
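

if __name__ == '__main__':
    # Minimal usage sketch, not part of the module API above: it writes a tiny,
    # made-up VTT file to a temporary directory and runs it through the helpers
    # defined in this module. The cue timestamps and text are illustrative
    # assumptions only.
    import os
    import tempfile

    sample = (
        'WEBVTT\n'
        '\n'
        '00:00:01.000 --> 00:00:03.000\n'
        'こんにちは、世界\n'
        '\n'
        '01:04:58.500 --> 01:05:00.000\n'
        'This is a test subtitle.\n'
    )
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'sample.vtt')
        with open(path, 'w', encoding='utf-8') as f:
            f.write(sample)

        vtt = VTTFile(path)
        minutes, hours = vtt.get_duration()
        print(f'Parsed {len(vtt.subtitles)} cues, duration {minutes} min ({hours} h)')

        # Slice out the first cue and re-serialize it
        first_chunk = vtt.get_subtitle_range(0, 1)
        print(first_chunk.to_string())

        for sub in vtt.subtitles:
            print(
                f'{sub.start_seconds:.1f}s-{sub.end_seconds:.1f}s | '
                f'~{estimate_token_count(sub.text)} tokens | '
                f'Japanese: {has_japanese_characters(sub.text)}'
            )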