chunker.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. """
  2. VTT file chunking logic respecting token limits.
  3. """
from typing import List, Optional, Tuple

from vtt_utils import VTTFile, Subtitle, estimate_token_count
  6. class VTTChunker:
  7. """Chunks VTT files respecting token limits."""
  8. # Reserve tokens for:
  9. # - Master prompt: ~100 tokens
  10. # - Instructions and overhead: ~200 tokens
  11. # - Translated output (approximately same as input): input size
  12. # Total budget: 32000
  13. # So input + output + overhead = 32000
  14. # If we assume output ≈ input, then: 2*input + 300 = 32000
  15. # So max input: ~15850 tokens
  16. MAX_TOKENS_PER_CHUNK = 15000 # Conservative limit
  17. def __init__(self, vtt_file: VTTFile):
  18. """Initialize chunker with a VTT file."""
  19. self.vtt_file = vtt_file
  20. self.chunks: List[VTTFile] = []
  21. def chunk(self) -> List[VTTFile]:
  22. """
  23. Split VTT file into chunks respecting token limits.
  24. Returns list of VTTFile objects.
  25. """
  26. self.chunks = []
  27. current_chunk_subtitles: List[Subtitle] = []
  28. current_token_count = 0
  29. for subtitle in self.vtt_file.subtitles:
  30. subtitle_tokens = estimate_token_count(subtitle.text)
  31. # If adding this subtitle would exceed limit and we have content,
  32. # create a new chunk
  33. if (current_token_count + subtitle_tokens > self.MAX_TOKENS_PER_CHUNK
  34. and current_chunk_subtitles):
  35. self._create_chunk(current_chunk_subtitles)
  36. current_chunk_subtitles = []
  37. current_token_count = 0
  38. current_chunk_subtitles.append(subtitle)
  39. current_token_count += subtitle_tokens
  40. # Don't forget the last chunk
  41. if current_chunk_subtitles:
  42. self._create_chunk(current_chunk_subtitles)
  43. return self.chunks
  44. def _create_chunk(self, subtitles: List[Subtitle]) -> None:
  45. """Create a VTT chunk from a list of subtitles."""
  46. chunk = VTTFile.__new__(VTTFile)
  47. chunk.filepath = self.vtt_file.filepath
  48. chunk.subtitles = subtitles
  49. self.chunks.append(chunk)
  50. def get_chunk_count(self) -> int:
  51. """Get the number of chunks after chunking."""
  52. return len(self.chunks)
  53. def get_chunk_token_estimates(self) -> List[int]:
  54. """Get estimated token count for each chunk."""
  55. return [
  56. sum(estimate_token_count(s.text) for s in chunk.subtitles)
  57. for chunk in self.chunks
  58. ]