| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- """
- VTT file chunking logic respecting token limits.
- """
from typing import List, Optional, Tuple

from vtt_utils import VTTFile, Subtitle, estimate_token_count
class VTTChunker:
    """Chunk a VTT file so each chunk's estimated token count fits a limit.

    Derivation of the default limit (``MAX_TOKENS_PER_CHUNK``):

    - Master prompt: ~100 tokens
    - Instructions and overhead: ~200 tokens
    - Translated output: approximately the same size as the input
    - Total budget: 32000 tokens

    Assuming output ≈ input, ``2 * input + 300 = 32000``, so the maximum
    input is ~15850 tokens. The default rounds that down to stay
    conservative.
    """

    MAX_TOKENS_PER_CHUNK = 15000  # Conservative limit

    def __init__(self, vtt_file: VTTFile, max_tokens: Optional[int] = None):
        """Initialize the chunker with a VTT file.

        Args:
            vtt_file: The file whose ``subtitles`` will be split.
            max_tokens: Optional per-chunk token limit. When ``None``
                (the default) ``MAX_TOKENS_PER_CHUNK`` is used.

        Raises:
            ValueError: If ``max_tokens`` is given and is not positive.
        """
        if max_tokens is not None and max_tokens <= 0:
            raise ValueError(
                f"max_tokens must be a positive integer, got {max_tokens!r}"
            )
        self.vtt_file = vtt_file
        self.chunks: List[VTTFile] = []
        # Keep the override separate from the class constant so that the
        # constant is still looked up at chunk() time when no override is set.
        self._max_tokens: Optional[int] = max_tokens

    def chunk(self) -> List[VTTFile]:
        """Split the VTT file into chunks respecting the token limit.

        Subtitles are packed greedily in their original order. A new chunk
        is started whenever adding the next subtitle would push the running
        estimate over the limit. A single subtitle whose own estimate
        exceeds the limit is still emitted, alone in its own chunk, so no
        subtitle is ever dropped or split.

        Any chunks from a previous call are discarded.

        Returns:
            The list of chunk ``VTTFile`` objects (also stored on
            ``self.chunks``).
        """
        limit = (
            self._max_tokens
            if self._max_tokens is not None
            else self.MAX_TOKENS_PER_CHUNK
        )

        self.chunks = []
        pending: List[Subtitle] = []
        pending_tokens = 0

        for subtitle in self.vtt_file.subtitles:
            cost = estimate_token_count(subtitle.text)

            # Only flush when there is something to flush; otherwise an
            # oversized subtitle would produce an empty chunk before it.
            if pending and pending_tokens + cost > limit:
                self._create_chunk(pending)
                pending = []
                pending_tokens = 0

            pending.append(subtitle)
            pending_tokens += cost

        # Flush whatever remains after the last subtitle.
        if pending:
            self._create_chunk(pending)

        return self.chunks

    def _create_chunk(self, subtitles: List[Subtitle]) -> None:
        """Build a chunk from ``subtitles`` and append it to ``self.chunks``.

        The chunk is allocated with ``VTTFile.__new__`` so that
        ``VTTFile.__init__`` is not run; only ``filepath`` (copied from the
        source file) and ``subtitles`` are set on it. The list is stored
        as-is, not copied.
        """
        chunk = VTTFile.__new__(VTTFile)
        chunk.filepath = self.vtt_file.filepath
        chunk.subtitles = subtitles
        self.chunks.append(chunk)

    def get_chunk_count(self) -> int:
        """Return the number of chunks produced by the last ``chunk()`` call."""
        return len(self.chunks)

    def get_chunk_token_estimates(self) -> List[int]:
        """Return the estimated token count of each chunk, in chunk order."""
        return [
            sum(estimate_token_count(s.text) for s in chunk.subtitles)
            for chunk in self.chunks
        ]
|