"""
VTT file chunking logic respecting token limits.
"""

from typing import List, Optional, Tuple

from vtt_utils import VTTFile, Subtitle, estimate_token_count


class VTTChunker:
    """Chunks VTT files respecting token limits.

    Subtitles are packed greedily, in their original order, into
    consecutive chunks whose estimated token totals stay at or below the
    active limit. A subtitle is never split: one whose own estimate
    exceeds the limit is emitted as a chunk by itself.

    Only ``subtitle.text`` is passed to ``estimate_token_count``; other
    cue data is not counted toward the limit.
    """

    # Reserve tokens for:
    # - Master prompt: ~100 tokens
    # - Instructions and overhead: ~200 tokens
    # - Translated output (approximately same as input): input size
    # Total budget: 32000
    # So input + output + overhead = 32000
    # If we assume output ≈ input, then: 2*input + 300 = 32000
    # So max input: ~15850 tokens
    MAX_TOKENS_PER_CHUNK = 15000  # Conservative limit

    # Per-instance override of MAX_TOKENS_PER_CHUNK. The class-level default
    # keeps chunk() working for subclasses that do not call this __init__.
    _max_tokens_override: Optional[int] = None

    def __init__(
        self,
        vtt_file: VTTFile,
        max_tokens_per_chunk: Optional[int] = None,
    ):
        """Initialize chunker with a VTT file.

        Args:
            vtt_file: The parsed VTT file whose ``subtitles`` will be split.
            max_tokens_per_chunk: Optional per-instance token limit. When
                omitted (``None``), ``MAX_TOKENS_PER_CHUNK`` is used.

        Raises:
            ValueError: If ``max_tokens_per_chunk`` is given but not positive.
        """
        if max_tokens_per_chunk is not None and max_tokens_per_chunk <= 0:
            raise ValueError(
                "max_tokens_per_chunk must be a positive number, "
                f"got {max_tokens_per_chunk!r}"
            )
        self.vtt_file = vtt_file
        self._max_tokens_override = max_tokens_per_chunk
        self.chunks: List[VTTFile] = []

    @property
    def max_tokens_per_chunk(self) -> int:
        """Return the token limit currently in effect for this chunker."""
        if self._max_tokens_override is not None:
            return self._max_tokens_override
        # Resolved at call time so that overriding the class constant (in a
        # subclass or on the class itself) is still honoured.
        return self.MAX_TOKENS_PER_CHUNK

    def chunk(self) -> List[VTTFile]:
        """
        Split VTT file into chunks respecting token limits.

        Any result of a previous call is discarded; ``self.chunks`` is
        rebuilt from scratch on every call.

        Returns list of VTTFile objects (empty when the file has no
        subtitles).
        """
        limit = self.max_tokens_per_chunk
        self.chunks = []
        pending: List[Subtitle] = []
        pending_tokens = 0

        for subtitle in self.vtt_file.subtitles:
            subtitle_tokens = estimate_token_count(subtitle.text)

            # Flush before adding when this subtitle would push the chunk
            # over the limit. The `pending` guard means an oversized single
            # subtitle still gets a chunk instead of producing an empty one.
            if pending and pending_tokens + subtitle_tokens > limit:
                self._create_chunk(pending)
                pending = []
                pending_tokens = 0

            pending.append(subtitle)
            pending_tokens += subtitle_tokens

        # Don't forget the last chunk
        if pending:
            self._create_chunk(pending)

        return self.chunks

    def _create_chunk(self, subtitles: List[Subtitle]) -> None:
        """Create a VTT chunk from a list of subtitles.

        The chunk is allocated with ``VTTFile.__new__`` so that
        ``VTTFile.__init__`` is not run; ``filepath`` is copied from the
        source file and ``subtitles`` is set to the given list (not copied).
        The new chunk is appended to ``self.chunks``.
        """
        # NOTE(review): __init__ is presumably bypassed to avoid re-reading
        # the file from disk - confirm against VTTFile's constructor.
        chunk = VTTFile.__new__(VTTFile)
        chunk.filepath = self.vtt_file.filepath
        chunk.subtitles = subtitles
        self.chunks.append(chunk)

    def get_chunk_count(self) -> int:
        """Get the number of chunks after chunking."""
        return len(self.chunks)

    def get_chunk_token_estimates(self) -> List[int]:
        """Get estimated token count for each chunk.

        Estimates are recomputed from each chunk's subtitles on every call.
        """
        return [
            sum(estimate_token_count(s.text) for s in chunk.subtitles)
            for chunk in self.chunks
        ]