| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- """
- VTT (WebVTT) file parsing and manipulation utilities.
- """
- import re
- from dataclasses import dataclass
- from typing import List, Tuple
- from datetime import timedelta
@dataclass
class Subtitle:
    """A single subtitle cue: start/end timestamps plus the cue text.

    The timestamps are stored as the strings they were created with, so
    ``str(subtitle)`` reproduces them verbatim; ``start_seconds`` and
    ``end_seconds`` expose them as numbers.
    """

    start_time: str
    end_time: str
    text: str

    @property
    def start_seconds(self) -> float:
        """Start time of the cue, in seconds."""
        return self._time_to_seconds(self.start_time)

    @property
    def end_seconds(self) -> float:
        """End time of the cue, in seconds."""
        return self._time_to_seconds(self.end_time)

    @staticmethod
    def _time_to_seconds(time_str: str) -> float:
        """Convert a timestamp string to seconds.

        Accepted forms are ``HH:MM:SS.mmm`` and the hour-less
        ``MM:SS.mmm``; a comma is accepted as the decimal separator.
        Only the first whitespace-separated token is interpreted, so
        trailing cue settings (e.g. ``"00:00:05.000 align:start"``) do
        not break the conversion.

        Raises:
            ValueError: if the string is empty or is not made of two or
                three colon-separated numeric fields.
        """
        tokens = time_str.split()
        if not tokens:
            raise ValueError(f"Empty timestamp: {time_str!r}")

        fields = tokens[0].replace(',', '.').split(':')
        if len(fields) == 3:
            hours, minutes, seconds = fields
        elif len(fields) == 2:
            # Hours are optional in WebVTT timestamps.
            hours = '0'
            minutes, seconds = fields
        else:
            raise ValueError(f"Invalid timestamp: {time_str!r}")

        return int(hours) * 3600 + int(minutes) * 60 + float(seconds)

    def __str__(self) -> str:
        """Format as a VTT cue: timing line followed by the text."""
        return f"{self.start_time} --> {self.end_time}\n{self.text}"
class VTTFile:
    """Loads a WebVTT file into ``Subtitle`` objects and writes it back."""

    def __init__(self, filepath: str):
        """Remember ``filepath`` and immediately parse it into ``subtitles``."""
        self.filepath = filepath
        self.subtitles: List[Subtitle] = []
        self._parse()

    def _parse(self) -> None:
        """Read ``self.filepath`` and append one ``Subtitle`` per cue block.

        Blocks are separated by blank lines. The ``WEBVTT`` header block
        and any block without a usable timing line or without text are
        skipped. A cue identifier line preceding the timing line is
        tolerated; the identifier itself is not stored.
        """
        # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise hide the 'WEBVTT' header from the startswith check.
        with open(self.filepath, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        for block in content.strip().split('\n\n'):
            if block.startswith('WEBVTT') or not block.strip():
                continue

            lines = block.strip().split('\n')
            timing_idx = self._timing_line_index(lines)
            if timing_idx < 0:
                continue

            # A cue needs at least one text line after its timing line.
            text_lines = lines[timing_idx + 1:]
            if not text_lines:
                continue

            parts = lines[timing_idx].split('-->')
            if len(parts) != 2:
                continue

            self.subtitles.append(Subtitle(
                start_time=parts[0].strip(),
                end_time=parts[1].strip(),
                text='\n'.join(text_lines),
            ))

    @staticmethod
    def _timing_line_index(lines: List[str]) -> int:
        """Return the index of the cue timing line within a block, or -1.

        The timing line (the one containing ``-->``) is either the first
        line of the block or, when the cue has an identifier, the second.
        """
        for idx, line in enumerate(lines[:2]):
            if '-->' in line:
                return idx
        return -1

    def get_duration(self) -> Tuple[int, float]:
        """
        Get the total duration of the VTT file.

        The duration is taken from the end time of the last subtitle in
        the list.

        Returns a tuple ``(total_minutes, total_hours)``: whole minutes
        (truncated) and that minute count expressed in hours as a float.
        An empty file yields ``(0, 0.0)``.
        """
        if not self.subtitles:
            return 0, 0.0

        total_seconds = self.subtitles[-1].end_seconds
        total_minutes = int(total_seconds / 60)
        total_hours = total_minutes / 60
        return total_minutes, total_hours

    def to_string(self) -> str:
        """Serialize to VTT text: header, then each cue followed by a blank line."""
        lines = ['WEBVTT', '']
        for subtitle in self.subtitles:
            lines.append(str(subtitle))
            lines.append('')
        return '\n'.join(lines)

    def save(self, filepath: str) -> None:
        """Write the serialized VTT text to ``filepath`` as UTF-8."""
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(self.to_string())

    def get_subtitle_range(self, start_idx: int, end_idx: int) -> 'VTTFile':
        """
        Create a new VTTFile holding ``subtitles[start_idx:end_idx]``.

        The new object keeps this object's ``filepath`` but is built
        without re-reading the file.
        """
        # __new__ bypasses __init__, which would otherwise re-parse the file.
        new_file = VTTFile.__new__(VTTFile)
        new_file.filepath = self.filepath
        new_file.subtitles = self.subtitles[start_idx:end_idx]
        return new_file
# Hiragana, Katakana and Kanji; Japanese is written without spaces between words,
# so these characters have to be counted individually rather than per "word".
_CJK_CHAR_RE = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]')


def estimate_token_count(
    text: str,
    avg_tokens_per_word: float = 1.3,
    tokens_per_cjk_char: float = 1.0,
) -> int:
    """
    Rough estimate of the token count of ``text``.

    Whitespace-separated words contribute ``avg_tokens_per_word`` tokens
    each. Hiragana/Katakana/Kanji characters are counted separately at
    ``tokens_per_cjk_char`` tokens each, because unspaced Japanese text
    would otherwise count as a single word. For text without such
    characters the result equals ``int(word_count * avg_tokens_per_word)``.

    The per-character weight is a heuristic, not a tokenizer measurement.
    """
    # Replace each Japanese character by a space so it neither counts as a
    # word nor glues neighbouring non-Japanese words together.
    remainder, cjk_chars = _CJK_CHAR_RE.subn(' ', text)
    words = len(remainder.split())
    return int(words * avg_tokens_per_word + cjk_chars * tokens_per_cjk_char)
def has_japanese_characters(text: str) -> bool:
    """Return True when ``text`` holds at least one Hiragana, Katakana or Kanji character."""
    # Character class covers three Unicode blocks:
    #   U+3040-U+309F  Hiragana
    #   U+30A0-U+30FF  Katakana
    #   U+4E00-U+9FFF  Kanji
    first_hit = re.search(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]', text)
    return first_hit is not None
|