# ollama_captionizer.py
#!/usr/bin/env python3
import base64
import copy
import fnmatch
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any, Dict, List, Optional

from rich.console import Console
from rich.live import Live
from rich.markup import escape
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table
  16. # --- EMOJIS ---
  17. EMOJI_CONFIG = "⚙️"
  18. EMOJI_IMAGE = "🖼️"
  19. EMOJI_PROMPT = "💬"
  20. EMOJI_START = "🚀"
  21. EMOJI_EXIT = "🚪"
  22. EMOJI_SUCCESS = "✅"
  23. EMOJI_FAIL = "❌"
  24. EMOJI_SKIP = "⏭️"
  25. EMOJI_LOG = "📝"
  26. # --- CONFIGURATION ---
  27. CONFIG_FILE = "config.json"
  28. DEFAULT_CONFIG = {
  29. "image_source": "directory", # 'directory' or 'specific_files'
  30. "image_dir": ".",
  31. "specific_files": [],
  32. "image_ext": ".jpg",
  33. "model": "moondream",
  34. "api_url": "http://localhost:11434/api/generate",
  35. "prompt": "Describe this image in a single, descriptive sentence.",
  36. }
# Module-wide Rich console; the functions below print their styled output
# through it and the captioning log's live table is bound to it.
console = Console()
  38. def run_gum_command(command: List[str]) -> Optional[str]:
  39. """Runs a gum command and returns its stripped output."""
  40. try:
  41. result = subprocess.run(
  42. ["gum"] + command,
  43. capture_output=True,
  44. text=True,
  45. check=True,
  46. )
  47. return result.stdout.strip()
  48. except (subprocess.CalledProcessError, FileNotFoundError) as e:
  49. console.print(
  50. f"[bold red]Error running 'gum'. Is it installed and in your PATH? ({e})[/bold red]"
  51. )
  52. return None
  53. def load_config() -> Dict[str, Any]:
  54. """Loads configuration from JSON file or returns default."""
  55. if Path(CONFIG_FILE).exists():
  56. with open(CONFIG_FILE, "r") as f:
  57. try:
  58. config = json.load(f)
  59. # Ensure all keys from default are present
  60. for key, value in DEFAULT_CONFIG.items():
  61. config.setdefault(key, value)
  62. return config
  63. except json.JSONDecodeError:
  64. return DEFAULT_CONFIG
  65. return DEFAULT_CONFIG
  66. def save_config(config: Dict[str, Any]):
  67. """Saves configuration to JSON file."""
  68. with open(CONFIG_FILE, "w") as f:
  69. json.dump(config, f, indent=4)
  70. def get_image_files(config: Dict[str, Any]) -> List[Path]:
  71. """Gets a list of image files based on the configuration."""
  72. if config["image_source"] == "directory":
  73. source_dir = Path(config["image_dir"])
  74. if not source_dir.is_dir():
  75. console.print(
  76. f"[bold red]Error: Image directory '{source_dir}' not found.[/bold red]"
  77. )
  78. return []
  79. return sorted(list(source_dir.glob(f"*{config['image_ext']}")))
  80. elif config["image_source"] == "specific_files":
  81. return [Path(f) for f in config["specific_files"]]
  82. return []
  83. def get_caption(
  84. image_path: Path,
  85. config: Dict[str, Any]
  86. ) -> Optional[str]:
  87. """Sends image to Ollama and returns the caption."""
  88. try:
  89. with open(image_path, "rb") as img_f:
  90. image_data = base64.b64encode(img_f.read()).decode("utf-8")
  91. payload = {
  92. "model": config["model"],
  93. "prompt": config["prompt"],
  94. "images": [image_data],
  95. "stream": False,
  96. }
  97. req = urllib.request.Request(
  98. config["api_url"],
  99. data=json.dumps(payload).encode("utf-8"),
  100. headers={"Content-Type": "application/json"},
  101. )
  102. with urllib.request.urlopen(req) as response:
  103. result = json.loads(response.read().decode("utf-8"))
  104. return result.get("response", "").strip()
  105. except Exception as e:
  106. return f"ERROR: {e}"
  107. def show_main_menu(config: Dict[str, Any]):
  108. """Displays the main interactive menu."""
  109. while True:
  110. console.print(
  111. Panel(
  112. f"[bold cyan]Ollama Image Captionizer[/bold cyan]\n\n"
  113. f"{EMOJI_CONFIG} [bold]Current Settings:[/bold]\n"
  114. f" - [yellow]Model[/yellow]: {config['model']}\n"
  115. f" - [yellow]Image Source[/yellow]: {config['image_source']}\n"
  116. f" - [yellow]Image Ext[/yellow]: {config['image_ext']}\n"
  117. f" - [yellow]Image Dir/Files[/yellow]: {config['image_dir'] if config['image_source'] == 'directory' else f'{len(config['specific_files'])} files'}\n\n"
  118. f"{EMOJI_PROMPT} [bold]Prompt:[/bold] \"{config['prompt']}"",
  119. title="Main Menu",
  120. border_style="green",
  121. expand=False,
  122. )
  123. )
  124. choice = run_gum_command(
  125. [
  126. "choose",
  127. f"{EMOJI_IMAGE} Set Image Source",
  128. f"{EMOJI_PROMPT} Edit Prompt",
  129. f"{EMOJI_START} Start Captioning",
  130. f"{EMOJI_EXIT} Exit",
  131. ]
  132. )
  133. if choice and choice.startswith(EMOJI_IMAGE):
  134. set_image_source(config)
  135. elif choice and choice.startswith(EMOJI_PROMPT):
  136. new_prompt = run_gum_command(
  137. [
  138. "input",
  139. "--value",
  140. config["prompt"],
  141. "--header",
  142. "Enter the new prompt",
  143. ]
  144. )
  145. if new_prompt is not None:
  146. config["prompt"] = new_prompt
  147. save_config(config)
  148. elif choice and choice.startswith(EMOJI_START):
  149. files_to_process = get_image_files(config)
  150. if files_to_process:
  151. process_images(files_to_process, config)
  152. else:
  153. console.print("[bold yellow]No image files found to process.[/bold yellow]")
  154. time.sleep(2)
  155. elif choice and choice.startswith(EMOJI_EXIT):
  156. console.print("[bold magenta]Goodbye![/bold magenta]")
  157. sys.exit(0)
  158. elif choice is None: # Gum was cancelled
  159. console.print("[bold magenta]Goodbye![/bold magenta]")
  160. sys.exit(0)
  161. def set_image_source(config: Dict[str, Any]):
  162. """Menu to set the image source."""
  163. choice = run_gum_command(
  164. [
  165. "choose",
  166. "Process a directory of images",
  167. "Select specific image files",
  168. ]
  169. )
  170. if choice and choice.startswith("Process"):
  171. new_dir = run_gum_command(
  172. [
  173. "input",
  174. "--value",
  175. config["image_dir"],
  176. "--header",
  177. "Enter the directory path",
  178. ]
  179. )
  180. if new_dir is not None:
  181. config["image_source"] = "directory"
  182. config["image_dir"] = new_dir
  183. save_config(config)
  184. elif choice and choice.startswith("Select"):
  185. files_str = run_gum_command(["file", "--multiple", "--file", config["image_dir"]])
  186. if files_str:
  187. files = files_str.split("\n")
  188. config["image_source"] = "specific_files"
  189. config["specific_files"] = files
  190. save_config(config)
  191. def process_images(image_files: List[Path], config: Dict[str, Any]):
  192. """Processes the list of images and displays progress."""
  193. log_table = Table(
  194. title=f"{EMOJI_LOG} Captioning Log",
  195. expand=True,
  196. border_style="blue",
  197. )
  198. log_table.add_column("File", style="cyan", no_wrap=True)
  199. log_table.add_column("Status", style="magenta")
  200. log_table.add_column("Caption/Error", style="green")
  201. with Live(log_table, refresh_per_second=4, console=console) as live:
  202. for image_path in image_files:
  203. output_file = image_path.with_suffix(".txt")
  204. if not image_path.exists():
  205. log_table.add_row(
  206. str(image_path.name),
  207. f"{EMOJI_SKIP} Skipped",
  208. "[yellow]Image file not found.",
  209. )
  210. continue
  211. with Progress(
  212. SpinnerColumn(),
  213. TextColumn("[progress.description]{task.description}"),
  214. transient=True,
  215. ) as progress:
  216. progress.add_task(f"Processing {image_path.name}", total=None)
  217. caption = get_caption(image_path, config)
  218. if caption and not caption.startswith("ERROR:"):
  219. if len(caption.split()) <= 1:
  220. status = f"{EMOJI_FAIL} Warning"
  221. details = f"[yellow]Single-word caption: '{caption}'"
  222. else:
  223. status = f"{EMOJI_SUCCESS} Success"
  224. details = f'"{caption[:60].replace(os.linesep, " ")}"...'
  225. with open(output_file, "w") as out_f:
  226. out_f.write(caption)
  227. log_table.add_row(str(image_path.name), status, details)
  228. elif not caption:
  229. log_table.add_row(
  230. str(image_path.name),
  231. f"{EMOJI_FAIL} Failed",
  232. "[red]Model returned an empty string.",
  233. )
  234. else: # Error case
  235. log_table.add_row(
  236. str(image_path.name),
  237. f"{EMOJI_FAIL} Error",
  238. f"[bold red]{caption}",
  239. )
  240. live.update(log_table)
  241. time.sleep(0.5) #- Rate limit
  242. console.print("[bold green]\nAll processing complete.[/bold green]")
  243. console.print("Press Enter to return to the main menu.")
  244. input()
  245. def main():
  246. """Main function to run the captionizer."""
  247. try:
  248. config = load_config()
  249. show_main_menu(config)
  250. except KeyboardInterrupt:
  251. console.print("\n[bold magenta]Exiting gracefully. Goodbye![/bold magenta]")
  252. except Exception as e:
  253. console.print(f"[bold red]An unexpected error occurred: {e}[/bold red]")
  254. if __name__ == "__main__":
  255. main()