1 lună în urmă · 3e2d7f2a12
--- a/README.md
+++ b/README.md
@@ -0,0 +1,64 @@
 
				+# 🖼️ Ollama Image Captionizer
			
 
				+
			
 
				+A Python script that uses a local [Ollama](https://ollama.com/) multimodal model to generate captions for your images. It features a rich, interactive terminal user interface (TUI) for easy operation, configuration, and live progress tracking. It is mainly intended for preparing image datasets for training with FLUX. The output is full-sentence captions rather than keyword tags because, unlike Stable Diffusion, FLUX relies on natural-language descriptions.
			
 
				+
			
 
				+![Screenshot of Ollama Image Captionizer](https://i.imgur.com/example.png)  
			
 
				+*(Note: Replace with an actual screenshot of the script in action)*
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## ✨ Features
			
 
				+
			
 
				+*   **Interactive TUI:** A user-friendly, menu-driven interface built with `rich` and `gum`. No need to edit the script to change settings!
			
 
				+*   **Flexible Image Selection:** Process an entire directory of images or use the file picker to select specific images.
			
 
				+*   **Live Progress Logging:** A beautiful, real-time table shows you which files are being processed, their status, and a preview of the generated caption.
			
 
				+*   **Smart Feedback:** Uses emojis and colors to clearly indicate successes, skips, failures, and warnings for low-quality (e.g., single-word) captions.
			
 
				+*   **Persistent Configuration:** Your last-used settings (model, prompt, image source) are automatically saved to a `config.json` file for your next session.
			
 
				+*   **Cross-Platform:** Built with Python, it's designed to be compatible with macOS, Linux, and Windows.
			
 
				+
			
 
				+## ⚙️ Requirements
			
 
				+
			
 
				+Before you begin, ensure you have the following installed and running:
			
 
				+
			
 
				+1.  **Python 3.x**
			
 
				+2.  **Ollama:** The script requires a running Ollama instance.
			
 
				+3.  **A Multimodal Ollama Model:** You need a model capable of processing images, such as `moondream`.
			
 
				+    ```bash
			
 
				+    ollama pull moondream
			
 
				+    ```
			
 
				+4.  **Rich:** A Python library for rich text and beautiful formatting in the terminal.
			
 
				+    ```bash
			
 
				+    pip install rich
			
 
				+    ```
			
 
				+5.  **Gum:** A tool for glamorous shell scripts, used for the interactive menus.
			
 
				+    *   **macOS:** `brew install gum`
			
 
				+    *   **Other Systems:** See the official [Gum installation guide](https://github.com/charmbracelet/gum#installation).
			
 
				+
			
 
				+## 🚀 Quick Start
			
 
				+
			
 
				+1.  **Install Dependencies:**
			
 
				+    Make sure you have installed Python, Rich, and Gum as listed in the requirements section.
			
 
				+
			
 
				+2.  **Start Ollama:**
			
 
				+    Ensure the Ollama application is running and the server is active.
			
 
				+
			
 
				+3.  **Run the Script:**
			
 
				+    Save the code as `ollama_captionizer.py` and run it from your terminal:
			
 
				+    ```bash
			
 
				+    python3 ollama_captionizer.py
			
 
				+    ```
			
 
				+4.  **Use the Menu:**
			
 
				+    You will be greeted by the main menu, where you can:
			
 
				+    *   **Set Image Source:** Choose a directory or select specific image files.
			
 
				+    *   **Edit Prompt:** Customize the prompt sent to the model.
			
 
				+    *   **Start Captioning:** Begin the process.
			
 
				+
			
 
				+Captions will be saved as `.txt` files with the same name as the original image (e.g., `my_photo.jpg` -> `my_photo.txt`).
			
 
				+
			
 
				+## 🖥️ Cross-Platform Compatibility
			
 
				+
			
 
				+This script is written in Python and is designed to be cross-platform. It should work on **macOS, Linux, and Windows** provided the dependencies are met.
			
 
				+
			
 
				+A key feature is that it communicates with the Ollama server over its network API (e.g., `http://localhost:11434`). This means **you do not need to modify the script to handle different executable names** like `ollama.exe` on Windows.
			
 
				+
			
 
				+The primary consideration for cross-platform use is ensuring that the `gum` command-line tool is properly installed and accessible in your system's `PATH`.
			
--- a/ollama_captionizer.py
+++ b/ollama_captionizer.py
@@ -0,0 +1,289 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+import json
			
 
				+import base64
			
 
				+import time
			
 
				+import urllib.request
			
 
				+import os
			
 
				+import sys
			
 
				+import subprocess
			
 
				+from pathlib import Path
			
 
				+from typing import List, Dict, Any, Optional
			
 
				+
			
 
				+from rich.console import Console
			
 
				+from rich.panel import Panel
			
 
				+from rich.live import Live
			
 
				+from rich.table import Table
			
 
				+from rich.progress import Progress, SpinnerColumn, TextColumn
			
 
				+
			
 
# --- EMOJIS ---
# Icons shown in the menu, the settings panel and the captioning log.
# The main-menu labels start with these icons, and show_main_menu() dispatches
# on that prefix via str.startswith(), so each menu icon must stay distinct.
EMOJI_CONFIG = "⚙️"
EMOJI_IMAGE = "🖼️"
EMOJI_PROMPT = "💬"
EMOJI_START = "🚀"
EMOJI_EXIT = "🚪"
EMOJI_SUCCESS = "✅"
EMOJI_FAIL = "❌"
EMOJI_SKIP = "⏭️"
EMOJI_LOG = "📝"

# --- CONFIGURATION ---
# Relative path, so the settings file is resolved against the current working
# directory of the process.
CONFIG_FILE = "config.json"
# Fallback settings; load_config() also uses these to fill in keys that are
# missing from a saved config file.
DEFAULT_CONFIG: Dict[str, Any] = {
    "image_source": "directory",  # 'directory' or 'specific_files'
    "image_dir": ".",  # directory scanned when image_source is 'directory'
    "specific_files": [],  # paths used when image_source is 'specific_files'
    "image_ext": ".jpg",  # filename suffix used to select files in image_dir
    "model": "moondream",  # sent as the "model" field of the API request
    "api_url": "http://localhost:11434/api/generate",  # endpoint the request is posted to
    "prompt": "Describe this image in a single, descriptive sentence.",
}

# Shared Rich console used by the functions below for terminal output.
console = Console()
			
 
				+
			
 
				+
			
 
				+def run_gum_command(command: List[str]) -> Optional[str]:
			
 
				+    """Runs a gum command and returns its stripped output."""
			
 
				+    try:
			
 
				+        result = subprocess.run(
			
 
				+            ["gum"] + command,
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            check=True,
			
 
				+        )
			
 
				+        return result.stdout.strip()
			
 
				+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
			
 
				+        console.print(
			
 
				+            f"[bold red]Error running 'gum'. Is it installed and in your PATH? ({e})[/bold red]"
			
 
				+        )
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def load_config() -> Dict[str, Any]:
			
 
				+    """Loads configuration from JSON file or returns default."""
			
 
				+    if Path(CONFIG_FILE).exists():
			
 
				+        with open(CONFIG_FILE, "r") as f:
			
 
				+            try:
			
 
				+                config = json.load(f)
			
 
				+                # Ensure all keys from default are present
			
 
				+                for key, value in DEFAULT_CONFIG.items():
			
 
				+                    config.setdefault(key, value)
			
 
				+                return config
			
 
				+            except json.JSONDecodeError:
			
 
				+                return DEFAULT_CONFIG
			
 
				+    return DEFAULT_CONFIG
			
 
				+
			
 
				+def save_config(config: Dict[str, Any]):
			
 
				+    """Saves configuration to JSON file."""
			
 
				+    with open(CONFIG_FILE, "w") as f:
			
 
				+        json.dump(config, f, indent=4)
			
 
				+
			
 
				+def get_image_files(config: Dict[str, Any]) -> List[Path]:
			
 
				+    """Gets a list of image files based on the configuration."""
			
 
				+    if config["image_source"] == "directory":
			
 
				+        source_dir = Path(config["image_dir"])
			
 
				+        if not source_dir.is_dir():
			
 
				+            console.print(
			
 
				+                f"[bold red]Error: Image directory '{source_dir}' not found.[/bold red]"
			
 
				+            )
			
 
				+            return []
			
 
				+        return sorted(list(source_dir.glob(f"*{config['image_ext']}")))
			
 
				+    elif config["image_source"] == "specific_files":
			
 
				+        return [Path(f) for f in config["specific_files"]]
			
 
				+    return []
			
 
				+
			
 
				+def get_caption(
			
 
				+    image_path: Path,
			
 
				+    config: Dict[str, Any]
			
 
				+) -> Optional[str]:
			
 
				+    """Sends image to Ollama and returns the caption."""
			
 
				+    try:
			
 
				+        with open(image_path, "rb") as img_f:
			
 
				+            image_data = base64.b64encode(img_f.read()).decode("utf-8")
			
 
				+
			
 
				+        payload = {
			
 
				+            "model": config["model"],
			
 
				+            "prompt": config["prompt"],
			
 
				+            "images": [image_data],
			
 
				+            "stream": False,
			
 
				+        }
			
 
				+
			
 
				+        req = urllib.request.Request(
			
 
				+            config["api_url"],
			
 
				+            data=json.dumps(payload).encode("utf-8"),
			
 
				+            headers={"Content-Type": "application/json"},
			
 
				+        )
			
 
				+
			
 
				+        with urllib.request.urlopen(req) as response:
			
 
				+            result = json.loads(response.read().decode("utf-8"))
			
 
				+            return result.get("response", "").strip()
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        return f"ERROR: {e}"
			
 
				+
			
 
				+def show_main_menu(config: Dict[str, Any]):
			
 
				+    """Displays the main interactive menu."""
			
 
				+    while True:
			
 
				+        console.print(
			
 
				+            Panel(
			
 
				+                f"[bold cyan]Ollama Image Captionizer[/bold cyan]\n\n"
			
 
				+                f"{EMOJI_CONFIG} [bold]Current Settings:[/bold]\n"
			
 
				+                f"  - [yellow]Model[/yellow]: {config['model']}\n"
			
 
				+                f"  - [yellow]Image Source[/yellow]: {config['image_source']}\n"
			
 
				+                f"  - [yellow]Image Ext[/yellow]: {config['image_ext']}\n"
			
 
				+                f"  - [yellow]Image Dir/Files[/yellow]: {config['image_dir'] if config['image_source'] == 'directory' else f'{len(config['specific_files'])} files'}\n\n"
			
 
				+                f"{EMOJI_PROMPT} [bold]Prompt:[/bold] \"{config['prompt']}"",
			
 
				+                title="Main Menu",
			
 
				+                border_style="green",
			
 
				+                expand=False,
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        choice = run_gum_command(
			
 
				+            [
			
 
				+                "choose",
			
 
				+                f"{EMOJI_IMAGE} Set Image Source",
			
 
				+                f"{EMOJI_PROMPT} Edit Prompt",
			
 
				+                f"{EMOJI_START} Start Captioning",
			
 
				+                f"{EMOJI_EXIT} Exit",
			
 
				+            ]
			
 
				+        )
			
 
				+
			
 
				+        if choice and choice.startswith(EMOJI_IMAGE):
			
 
				+            set_image_source(config)
			
 
				+        elif choice and choice.startswith(EMOJI_PROMPT):
			
 
				+            new_prompt = run_gum_command(
			
 
				+                [
			
 
				+                    "input",
			
 
				+                    "--value",
			
 
				+                    config["prompt"],
			
 
				+                    "--header",
			
 
				+                    "Enter the new prompt",
			
 
				+                ]
			
 
				+            )
			
 
				+            if new_prompt is not None:
			
 
				+                config["prompt"] = new_prompt
			
 
				+                save_config(config)
			
 
				+        elif choice and choice.startswith(EMOJI_START):
			
 
				+            files_to_process = get_image_files(config)
			
 
				+            if files_to_process:
			
 
				+                process_images(files_to_process, config)
			
 
				+            else:
			
 
				+                console.print("[bold yellow]No image files found to process.[/bold yellow]")
			
 
				+                time.sleep(2)
			
 
				+
			
 
				+        elif choice and choice.startswith(EMOJI_EXIT):
			
 
				+            console.print("[bold magenta]Goodbye![/bold magenta]")
			
 
				+            sys.exit(0)
			
 
				+        elif choice is None: # Gum was cancelled
			
 
				+            console.print("[bold magenta]Goodbye![/bold magenta]")
			
 
				+            sys.exit(0)
			
 
				+
			
 
				+def set_image_source(config: Dict[str, Any]):
			
 
				+    """Menu to set the image source."""
			
 
				+    choice = run_gum_command(
			
 
				+        [
			
 
				+            "choose",
			
 
				+            "Process a directory of images",
			
 
				+            "Select specific image files",
			
 
				+        ]
			
 
				+    )
			
 
				+
			
 
				+    if choice and choice.startswith("Process"):
			
 
				+        new_dir = run_gum_command(
			
 
				+            [
			
 
				+                "input",
			
 
				+                "--value",
			
 
				+                config["image_dir"],
			
 
				+                "--header",
			
 
				+                "Enter the directory path",
			
 
				+            ]
			
 
				+        )
			
 
				+        if new_dir is not None:
			
 
				+            config["image_source"] = "directory"
			
 
				+            config["image_dir"] = new_dir
			
 
				+            save_config(config)
			
 
				+
			
 
				+    elif choice and choice.startswith("Select"):
			
 
				+        files_str = run_gum_command(["file", "--multiple", "--file", config["image_dir"]])
			
 
				+        if files_str:
			
 
				+            files = files_str.split("\n")
			
 
				+            config["image_source"] = "specific_files"
			
 
				+            config["specific_files"] = files
			
 
				+            save_config(config)
			
 
				+
			
 
				+def process_images(image_files: List[Path], config: Dict[str, Any]):
			
 
				+    """Processes the list of images and displays progress."""
			
 
				+    log_table = Table(
			
 
				+        title=f"{EMOJI_LOG} Captioning Log",
			
 
				+        expand=True,
			
 
				+        border_style="blue",
			
 
				+    )
			
 
				+    log_table.add_column("File", style="cyan", no_wrap=True)
			
 
				+    log_table.add_column("Status", style="magenta")
			
 
				+    log_table.add_column("Caption/Error", style="green")
			
 
				+
			
 
				+    with Live(log_table, refresh_per_second=4, console=console) as live:
			
 
				+        for image_path in image_files:
			
 
				+            output_file = image_path.with_suffix(".txt")
			
 
				+
			
 
				+            if not image_path.exists():
			
 
				+                log_table.add_row(
			
 
				+                    str(image_path.name),
			
 
				+                    f"{EMOJI_SKIP} Skipped",
			
 
				+                    "[yellow]Image file not found.",
			
 
				+                )
			
 
				+                continue
			
 
				+            
			
 
				+            with Progress(
			
 
				+                SpinnerColumn(),
			
 
				+                TextColumn("[progress.description]{task.description}"),
			
 
				+                transient=True,
			
 
				+            ) as progress:
			
 
				+                progress.add_task(f"Processing {image_path.name}", total=None)
			
 
				+                caption = get_caption(image_path, config)
			
 
				+            
			
 
				+            if caption and not caption.startswith("ERROR:"):
			
 
				+                if len(caption.split()) <= 1:
			
 
				+                     status = f"{EMOJI_FAIL} Warning"
			
 
				+                     details = f"[yellow]Single-word caption: '{caption}'"
			
 
				+                else:
			
 
				+                    status = f"{EMOJI_SUCCESS} Success"
			
 
				+                    details = f'"{caption[:60].replace(os.linesep, " ")}"...'
			
 
				+                
			
 
				+                with open(output_file, "w") as out_f:
			
 
				+                    out_f.write(caption)
			
 
				+
			
 
				+                log_table.add_row(str(image_path.name), status, details)
			
 
				+            elif not caption:
			
 
				+                 log_table.add_row(
			
 
				+                    str(image_path.name),
			
 
				+                    f"{EMOJI_FAIL} Failed",
			
 
				+                    "[red]Model returned an empty string.",
			
 
				+                )
			
 
				+            else: # Error case
			
 
				+                log_table.add_row(
			
 
				+                    str(image_path.name),
			
 
				+                    f"{EMOJI_FAIL} Error",
			
 
				+                    f"[bold red]{caption}",
			
 
				+                )
			
 
				+            live.update(log_table)
			
 
				+            time.sleep(0.5) #- Rate limit
			
 
				+
			
 
				+    console.print("[bold green]\nAll processing complete.[/bold green]")
			
 
				+    console.print("Press Enter to return to the main menu.")
			
 
				+    input()
			
 
				+
			
 
				+def main():
			
 
				+    """Main function to run the captionizer."""
			
 
				+    try:
			
 
				+        config = load_config()
			
 
				+        show_main_menu(config)
			
 
				+    except KeyboardInterrupt:
			
 
				+        console.print("\n[bold magenta]Exiting gracefully. Goodbye![/bold magenta]")
			
 
				+    except Exception as e:
			
 
				+        console.print(f"[bold red]An unexpected error occurred: {e}[/bold red]")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()