"""
VTT (WebVTT) file parsing and manipulation utilities.
"""

import re
from dataclasses import dataclass
from typing import List, Tuple
from datetime import timedelta


# Hiragana (U+3040-309F), Katakana (U+30A0-30FF) and the CJK Unified
# Ideographs block used for Kanji (U+4E00-9FFF). Compiled once at import.
_JAPANESE_PATTERN = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]')


@dataclass
class Subtitle:
    """Represents a single subtitle entry.

    ``start_time`` and ``end_time`` hold the timestamp text exactly as it
    appeared on either side of ``-->`` in the cue timing line (whitespace
    stripped). ``end_time`` may therefore carry trailing cue settings such
    as ``align:start``; they are preserved so the cue round-trips through
    ``__str__`` unchanged.
    """

    start_time: str
    end_time: str
    text: str

    @property
    def start_seconds(self) -> float:
        """Convert start time to seconds."""
        return self._time_to_seconds(self.start_time)

    @property
    def end_seconds(self) -> float:
        """Convert end time to seconds."""
        return self._time_to_seconds(self.end_time)

    @staticmethod
    def _time_to_seconds(time_str: str) -> float:
        """Convert ``HH:MM:SS.mmm`` or ``MM:SS.mmm`` to seconds.

        Only the first whitespace-delimited token is parsed, so cue
        settings following the timestamp are ignored. A comma is accepted
        as the decimal separator (SRT style).

        Raises:
            ValueError: if the string is empty or is not a two- or
                three-component timestamp with numeric fields.
        """
        tokens = time_str.split()
        if not tokens:
            raise ValueError(f"Empty timestamp: {time_str!r}")

        parts = tokens[0].replace(',', '.').split(':')
        if len(parts) == 3:
            hours, minutes, seconds = parts
        elif len(parts) == 2:
            # WebVTT allows the hours component to be omitted.
            hours = '0'
            minutes, seconds = parts
        else:
            raise ValueError(f"Invalid timestamp: {time_str!r}")

        return int(hours) * 3600 + int(minutes) * 60 + float(seconds)

    def __str__(self) -> str:
        """Format as VTT subtitle entry."""
        return f"{self.start_time} --> {self.end_time}\n{self.text}"


class VTTFile:
    """Handles VTT file parsing and manipulation."""

    def __init__(self, filepath: str):
        """Initialize VTT file handler and parse ``filepath`` immediately."""
        self.filepath = filepath
        self.subtitles: List[Subtitle] = []
        self._parse()

    def _parse(self) -> None:
        """Parse the VTT file into ``Subtitle`` objects.

        Blocks are separated by blank lines. The ``WEBVTT`` header block
        and any block without a cue timing line (for example ``NOTE`` or
        ``STYLE`` blocks) are skipped, as are cues with no text. An
        optional cue identifier line before the timing line is accepted
        but not stored, so it is not reproduced by ``to_string``.
        """
        # 'utf-8-sig' transparently drops a leading byte order mark.
        with open(self.filepath, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        # Split by double newlines to get subtitle blocks
        for block in content.strip().split('\n\n'):
            # Skip the WEBVTT header and empty blocks
            if block.startswith('WEBVTT') or not block.strip():
                continue

            lines = block.strip().split('\n')

            # The timing line is either the first line or, when the cue
            # has an identifier, the second one.
            timing_idx = next(
                (i for i, line in enumerate(lines[:2]) if '-->' in line),
                None,
            )
            if timing_idx is None:
                continue

            text_lines = lines[timing_idx + 1:]
            if not text_lines:
                continue

            parts = lines[timing_idx].split('-->')
            if len(parts) != 2:
                continue

            self.subtitles.append(Subtitle(
                start_time=parts[0].strip(),
                end_time=parts[1].strip(),
                text='\n'.join(text_lines),
            ))

    def get_duration(self) -> Tuple[int, float]:
        """
        Get the total duration of the VTT file.

        The duration is taken from the end time of the last subtitle.

        Returns:
            Tuple of (total_minutes, total_hours). ``total_minutes`` is the
            duration truncated to whole minutes; ``total_hours`` is
            ``total_minutes / 60`` as a float. ``(0, 0.0)`` when the file
            has no subtitles.
        """
        if not self.subtitles:
            return 0, 0.0

        total_seconds = self.subtitles[-1].end_seconds
        total_minutes = int(total_seconds / 60)
        total_hours = total_minutes / 60
        return total_minutes, total_hours

    def to_string(self) -> str:
        """Convert VTT file back to string format."""
        lines = ['WEBVTT', '']
        for subtitle in self.subtitles:
            lines.append(str(subtitle))
            lines.append('')
        return '\n'.join(lines)

    def save(self, filepath: str) -> None:
        """Save VTT file to disk."""
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(self.to_string())

    def get_subtitle_range(self, start_idx: int, end_idx: int) -> 'VTTFile':
        """
        Create a new VTTFile with subtitles in the specified range.

        Returns a new VTTFile object containing subtitles[start_idx:end_idx].
        The new object shares this object's ``filepath`` and is built
        without re-reading the file.
        """
        # __new__ bypasses __init__, which would otherwise re-parse the file.
        new_file = VTTFile.__new__(VTTFile)
        new_file.filepath = self.filepath
        new_file.subtitles = self.subtitles[start_idx:end_idx]
        return new_file


def estimate_token_count(text: str, avg_tokens_per_word: float = 1.3) -> int:
    """
    Rough estimate of token count using word count.

    Words are whitespace-delimited, and the result is the word count times
    ``avg_tokens_per_word``, truncated to an int. Text written without
    spaces between words (such as Japanese) yields very few "words" and is
    therefore underestimated by this heuristic.
    """
    words = len(text.split())
    return int(words * avg_tokens_per_word)


def has_japanese_characters(text: str) -> bool:
    """Check if text contains Japanese characters (Hiragana, Katakana, Kanji)."""
    return bool(_JAPANESE_PATTERN.search(text))