#!/usr/bin/env python3 """ Batch process multiple media files using Gemini API. Supports all Gemini modalities: - Audio: Transcription, analysis, summarization - Image: Captioning, detection, OCR, analysis - Video: Summarization, Q&A, scene detection - Document: PDF extraction, structured output - Generation: Image creation via Imagen 4 or Nano Banana (Gemini native) - Nano Banana 2 (gemini-3.1-flash-image-preview): Fastest, 95% Pro quality (default) - Nano Banana Pro (gemini-3-pro-image-preview): Quality/4K text/reasoning - Imagen 4 (imagen-4.0-*): Production-grade generation """ import argparse import json import os import sys import time from pathlib import Path from typing import List, Dict, Any, Optional import csv import shutil # Import centralized environment resolver (works for both local and global installs) CLAUDE_ROOT = Path(__file__).parent.parent.parent.parent sys.path.insert(0, str(CLAUDE_ROOT / 'scripts')) try: from resolve_env import resolve_env CENTRALIZED_RESOLVER_AVAILABLE = True except ImportError: # Fallback if centralized resolver not available CENTRALIZED_RESOLVER_AVAILABLE = False try: from dotenv import load_dotenv except ImportError: load_dotenv = None # Import key rotation support sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'common')) try: from api_key_rotator import KeyRotator, is_rate_limit_error, is_server_error from api_key_helper import find_all_api_keys KEY_ROTATION_AVAILABLE = True except ImportError: KEY_ROTATION_AVAILABLE = False KeyRotator = None is_rate_limit_error = None is_server_error = None find_all_api_keys = None try: from google import genai from google.genai import types except ImportError: print("Error: google-genai package not installed") print("Install with: pip install google-genai") sys.exit(1) # Image generation model configuration # Default: gemini-3.1-flash-image-preview (Nano Banana 2 - 3-5x faster, 95% Pro quality) # Alternative: imagen-4.0-generate-001 (production quality) # All image generation requires billing - no completely free option exists IMAGE_MODEL_DEFAULT = 'gemini-3.1-flash-image-preview' # Nano Banana 2 (fastest, near-Pro quality) IMAGE_MODEL_FALLBACK = 'gemini-2.5-flash-image' # Fallback if Nano Banana 2 fails IMAGEN_MODELS = { 'imagen-4.0-generate-001', 'imagen-4.0-ultra-generate-001', 'imagen-4.0-fast-generate-001', } # Video models have no fallback - Veo always requires billing def find_api_key() -> Optional[str]: """Find Gemini API key using centralized resolver or fallback. Uses ~/.opencode/scripts/resolve_env.py for consistent resolution across all skills. Falls back to local resolution if centralized resolver not available. Priority order (highest to lowest): 1. process.env (runtime environment variables) 2. PROJECT/.opencode/skills/ai-multimodal/.env (skill-specific) 3. PROJECT/.opencode/skills/.env (shared skills) 4. PROJECT/.opencode/.env (project global) 5. ~/.opencode/skills/ai-multimodal/.env (user skill-specific) 6. ~/.opencode/skills/.env (user shared) 7. ~/.opencode/.env (user global) """ if CENTRALIZED_RESOLVER_AVAILABLE: # Use centralized resolver (recommended) return resolve_env('GEMINI_API_KEY', skill='ai-multimodal') # Fallback: Local resolution (legacy) api_key = os.getenv('GEMINI_API_KEY') if api_key: return api_key if load_dotenv: script_dir = Path(__file__).parent skill_dir = script_dir.parent skills_dir = skill_dir.parent claude_dir = skills_dir.parent env_files = [ claude_dir / '.env', skills_dir / '.env', skill_dir / '.env', ] for env_file in env_files: if env_file.exists(): load_dotenv(env_file, override=True) api_key = os.getenv('GEMINI_API_KEY') if api_key: return api_key return None def get_default_model(task: str) -> str: """Get default model for task from environment or fallback. Priority: 1. Environment variable for specific capability 2. Legacy GEMINI_MODEL variable 3. Hard-coded defaults """ if task == 'generate': # Image generation model = os.getenv('IMAGE_GEN_MODEL') if model: return model # Fallback to legacy model = os.getenv('GEMINI_IMAGE_GEN_MODEL') if model: return model # Default to Nano Banana 2 (fastest, near-Pro quality) # Alternative: imagen-4.0-generate-001 for production quality return 'gemini-3.1-flash-image-preview' elif task == 'generate-video': model = os.getenv('VIDEO_GEN_MODEL') if model: return model return 'veo-3.1-generate-preview' # New default elif task in ['analyze', 'transcribe', 'extract']: model = os.getenv('MULTIMODAL_MODEL') if model: return model # Fallback to legacy model = os.getenv('GEMINI_MODEL') if model: return model return 'gemini-2.5-flash' # Existing default return 'gemini-2.5-flash' def validate_model_task_combination(model: str, task: str) -> None: """Validate model is compatible with task. Raises: ValueError: If combination is invalid """ # Video generation requires Veo if task == 'generate-video': if not model.startswith('veo-'): raise ValueError( f"Video generation requires Veo model, got '{model}'\n" f"Valid models: veo-3.1-generate-preview, veo-3.1-fast-generate-preview, " f"veo-3.0-generate-001, veo-3.0-fast-generate-001" ) # Image generation models if task == 'generate': valid_image_models = [ 'imagen-4.0-generate-001', 'imagen-4.0-ultra-generate-001', 'imagen-4.0-fast-generate-001', 'gemini-3.1-flash-image-preview', 'gemini-3-pro-image-preview', 'gemini-2.5-flash-image', 'gemini-2.5-flash-image-preview', ] if model not in valid_image_models: # Allow gemini models for analysis-based generation (backward compat) if not model.startswith('gemini-'): raise ValueError( f"Image generation requires Imagen/Gemini image model, got '{model}'\n" f"Valid models: {', '.join(valid_image_models)}" ) def infer_task_from_file(file_path: str) -> str: """Infer task type from file extension. Returns: 'transcribe' for audio files 'analyze' for image/video/document files """ ext = Path(file_path).suffix.lower() audio_extensions = {'.mp3', '.wav', '.aac', '.flac', '.ogg', '.aiff', '.m4a'} image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.heic', '.heif', '.gif', '.bmp'} video_extensions = {'.mp4', '.mpeg', '.mov', '.avi', '.flv', '.mpg', '.webm', '.wmv', '.3gpp', '.mkv'} document_extensions = {'.pdf', '.txt', '.html', '.md', '.doc', '.docx'} if ext in audio_extensions: return 'transcribe' elif ext in image_extensions: return 'analyze' elif ext in video_extensions: return 'analyze' elif ext in document_extensions: return 'extract' # Default to analyze for unknown types return 'analyze' def get_mime_type(file_path: str) -> str: """Determine MIME type from file extension.""" ext = Path(file_path).suffix.lower() mime_types = { # Audio '.mp3': 'audio/mp3', '.wav': 'audio/wav', '.aac': 'audio/aac', '.flac': 'audio/flac', '.ogg': 'audio/ogg', '.aiff': 'audio/aiff', # Image '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png', '.webp': 'image/webp', '.heic': 'image/heic', '.heif': 'image/heif', # Video '.mp4': 'video/mp4', '.mpeg': 'video/mpeg', '.mov': 'video/quicktime', '.avi': 'video/x-msvideo', '.flv': 'video/x-flv', '.mpg': 'video/mpeg', '.webm': 'video/webm', '.wmv': 'video/x-ms-wmv', '.3gpp': 'video/3gpp', # Document '.pdf': 'application/pdf', '.txt': 'text/plain', '.html': 'text/html', '.md': 'text/markdown', } return mime_types.get(ext, 'application/octet-stream') def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any: """Upload file to Gemini File API.""" if verbose: print(f"Uploading {file_path}...") myfile = client.files.upload(file=file_path) # Wait for processing (video/audio files need processing) mime_type = get_mime_type(file_path) if mime_type.startswith('video/') or mime_type.startswith('audio/'): max_wait = 300 # 5 minutes elapsed = 0 while myfile.state.name == 'PROCESSING' and elapsed < max_wait: time.sleep(2) myfile = client.files.get(name=myfile.name) elapsed += 2 if verbose and elapsed % 10 == 0: print(f" Processing... {elapsed}s") if myfile.state.name == 'FAILED': raise ValueError(f"File processing failed: {file_path}") if myfile.state.name == 'PROCESSING': raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}") if verbose: print(f" Uploaded: {myfile.name}") return myfile def _is_billing_error(error: Exception) -> bool: """Check if error is due to billing/access restrictions.""" error_str = str(error).lower() billing_indicators = [ 'billing', 'billed users', 'payment', 'access denied', 'not authorized', 'permission denied', ] return any(indicator in error_str for indicator in billing_indicators) def _is_free_tier_quota_error(error: Exception) -> bool: """Check if error indicates free tier has zero quota for this model. Free tier users have NO access to image/video generation models. The API returns 'limit: 0' or 'RESOURCE_EXHAUSTED' with quota details. """ error_str = str(error) # Check for zero quota indicators return ( 'RESOURCE_EXHAUSTED' in error_str and ('limit: 0' in error_str or 'free_tier' in error_str.lower()) ) FREE_TIER_NO_ACCESS_MSG = """ [FREE TIER LIMITATION] Image/Video generation is NOT available on free tier. Free tier users have zero quota (limit: 0) for: - All Imagen models (imagen-4.0-*) - All Veo models (veo-*) - Gemini image models (gemini-*-image, gemini-*-image-preview) To use image/video generation: 1. Enable billing: https://aistudio.google.com/apikey 2. Or use Google Cloud $300 free credits: https://cloud.google.com/free STOP: Do not retry image/video generation on free tier - it will always fail. """.strip() def generate_image_imagen4( client, prompt: str, model: str, num_images: int = 1, aspect_ratio: str = '1:1', size: str = '1K', verbose: bool = False ) -> Dict[str, Any]: """Generate image using Imagen 4 models. Returns special status 'billing_required' if model needs billing, allowing caller to fallback to free-tier generate_content API. """ try: # Build config based on model (Fast doesn't support imageSize) config_params = { 'numberOfImages': num_images, 'aspectRatio': aspect_ratio } # Only Standard and Ultra support imageSize parameter if 'fast' not in model.lower() and model.startswith('imagen-'): config_params['imageSize'] = size gen_config = types.GenerateImagesConfig(**config_params) if verbose: print(f" Generating with: {model}") print(f" Config: {num_images} images, {aspect_ratio}", end='') if 'fast' not in model.lower() and model.startswith('imagen-'): print(f", {size}") else: print() response = client.models.generate_images( model=model, prompt=prompt, config=gen_config ) # Save images generated_files = [] for i, generated_image in enumerate(response.generated_images): # Find project root script_dir = Path(__file__).parent project_root = script_dir for parent in [script_dir] + list(script_dir.parents): if (parent / '.git').exists() or (parent / '.claude').exists(): project_root = parent break output_dir = project_root / 'docs' / 'assets' output_dir.mkdir(parents=True, exist_ok=True) output_file = output_dir / f"imagen4_generated_{int(time.time())}_{i}.png" with open(output_file, 'wb') as f: f.write(generated_image.image.image_bytes) generated_files.append(str(output_file)) if verbose: print(f" Saved: {output_file}") return { 'status': 'success', 'generated_images': generated_files, 'model': model } except Exception as e: # Return special status for billing errors so caller can fallback if _is_billing_error(e) and model in IMAGEN_MODELS: return { 'status': 'billing_required', 'original_model': model, 'error': str(e) } if verbose: print(f" Error: {str(e)}") import traceback traceback.print_exc() return { 'status': 'error', 'error': str(e) } def generate_video_veo( client, prompt: str, model: str, resolution: str = '1080p', aspect_ratio: str = '16:9', reference_images: Optional[List[str]] = None, verbose: bool = False ) -> Dict[str, Any]: """Generate video using Veo models. For image-to-video with first/last frames (Veo 3.1): - First reference image becomes the opening frame (image parameter) - Second reference image becomes the closing frame (last_frame config) - Model interpolates between them to create smooth video """ try: # Build config with snake_case for Python SDK config_params = { 'aspect_ratio': aspect_ratio, 'resolution': resolution } # Prepare first frame and last frame images first_frame = None last_frame = None if reference_images: import mimetypes def load_image(img_path_str: str) -> types.Image: """Load image file as types.Image with bytes and mime type.""" img_path = Path(img_path_str) image_bytes = img_path.read_bytes() mime_type, _ = mimetypes.guess_type(str(img_path)) if not mime_type: mime_type = 'image/png' return types.Image( image_bytes=image_bytes, mime_type=mime_type ) # First image = opening frame if len(reference_images) >= 1: first_frame = load_image(reference_images[0]) # Second image = closing frame (last_frame in config) if len(reference_images) >= 2: last_frame = load_image(reference_images[1]) config_params['last_frame'] = last_frame gen_config = types.GenerateVideosConfig(**config_params) if verbose: print(f" Generating video with Veo: {model}") print(f" Config: {resolution}, {aspect_ratio}") if first_frame: print(f" First frame: provided") if last_frame: print(f" Last frame: provided (interpolation mode)") start = time.time() if verbose: print(f" Starting video generation (this may take 11s-6min)...") # Call generate_videos with image parameter for first frame operation = client.models.generate_videos( model=model, prompt=prompt, image=first_frame, # First frame as opening image config=gen_config ) # Poll operation until complete poll_count = 0 while not operation.done: poll_count += 1 if verbose and poll_count % 3 == 0: # Update every 30s elapsed = time.time() - start print(f" Still generating... ({elapsed:.0f}s elapsed)") time.sleep(10) operation = client.operations.get(operation) duration = time.time() - start # Access generated video from operation response generated_video = operation.response.generated_videos[0] # Download the video file first client.files.download(file=generated_video.video) # Save video script_dir = Path(__file__).parent project_root = script_dir for parent in [script_dir] + list(script_dir.parents): if (parent / '.git').exists() or (parent / '.claude').exists(): project_root = parent break output_dir = project_root / 'docs' / 'assets' output_dir.mkdir(parents=True, exist_ok=True) output_file = output_dir / f"veo_generated_{int(time.time())}.mp4" # Now save to file generated_video.video.save(str(output_file)) file_size = output_file.stat().st_size / (1024 * 1024) # MB if verbose: print(f" Generated in {duration:.1f}s") print(f" File size: {file_size:.2f} MB") print(f" Saved: {output_file}") return { 'status': 'success', 'generated_video': str(output_file), 'generation_time': duration, 'file_size_mb': file_size, 'model': model } except Exception as e: if verbose: print(f" Error: {str(e)}") import traceback traceback.print_exc() return { 'status': 'error', 'error': str(e) } def process_file( client: genai.Client, file_path: Optional[str], prompt: str, model: str, task: str, format_output: str, aspect_ratio: Optional[str] = None, image_size: Optional[str] = None, verbose: bool = False, max_retries: int = 3 ) -> Dict[str, Any]: """Process a single file with retry logic. Args: image_size: Image size for Nano Banana models (1K, 2K, 4K). Must be uppercase K. Note: Not all models support image_size - only pass when explicitly needed. """ for attempt in range(max_retries): try: # For generation tasks without input files if task == 'generate' and not file_path: content = [prompt] else: # Process input file file_path = Path(file_path) # Determine if we need File API file_size = file_path.stat().st_size use_file_api = file_size > 20 * 1024 * 1024 # >20MB if use_file_api: # Upload to File API myfile = upload_file(client, str(file_path), verbose) content = [prompt, myfile] else: # Inline data with open(file_path, 'rb') as f: file_bytes = f.read() mime_type = get_mime_type(str(file_path)) content = [ prompt, types.Part.from_bytes(data=file_bytes, mime_type=mime_type) ] # Configure request config_args = {} if task == 'generate': # Nano Banana requires fully uppercase 'IMAGE' per API spec config_args['response_modalities'] = ['IMAGE'] # Build image_config with aspect_ratio and/or image_size image_config_args = {} if aspect_ratio: image_config_args['aspect_ratio'] = aspect_ratio if image_size: # image_size must be uppercase K (1K, 2K, 4K) image_config_args['image_size'] = image_size if image_config_args: config_args['image_config'] = types.ImageConfig(**image_config_args) if format_output == 'json': config_args['response_mime_type'] = 'application/json' config = types.GenerateContentConfig(**config_args) if config_args else None # Generate content response = client.models.generate_content( model=model, contents=content, config=config ) # Extract response result = { 'file': str(file_path) if file_path else 'generated', 'status': 'success', 'response': response.text if hasattr(response, 'text') else None } # Handle image output if task == 'generate' and hasattr(response, 'candidates'): for i, part in enumerate(response.candidates[0].content.parts): if part.inline_data: # Determine output directory - use project root docs/assets if file_path: output_dir = Path(file_path).parent base_name = Path(file_path).stem else: # Find project root (look for .git or .claude directory) script_dir = Path(__file__).parent project_root = script_dir for parent in [script_dir] + list(script_dir.parents): if (parent / '.git').exists() or (parent / '.claude').exists(): project_root = parent break output_dir = project_root / 'docs' / 'assets' output_dir.mkdir(parents=True, exist_ok=True) base_name = "generated" output_file = output_dir / f"{base_name}_generated_{i}.png" with open(output_file, 'wb') as f: f.write(part.inline_data.data) result['generated_image'] = str(output_file) if verbose: print(f" Saved image to: {output_file}") return result except Exception as e: # Don't retry on billing/free tier errors - they won't resolve if _is_billing_error(e) or _is_free_tier_quota_error(e): return { 'file': str(file_path) if file_path else 'generated', 'status': 'error', 'error': str(e) } # Check if this is a rate limit error (candidate for key rotation) is_rate_limited = ( KEY_ROTATION_AVAILABLE and is_rate_limit_error and is_rate_limit_error(e) ) # Check if this is a transient server error (503, 500, etc.) is_5xx = ( KEY_ROTATION_AVAILABLE and is_server_error and is_server_error(e) ) # Use more retries for transient 5xx errors (up to 5 attempts) effective_max = max(max_retries, 5) if is_5xx else max_retries if attempt == effective_max - 1: return { 'file': str(file_path) if file_path else 'generated', 'status': 'error', 'error': str(e), 'rate_limited': is_rate_limited # Flag for caller to handle rotation } # Longer backoff for 5xx (4s, 8s, 16s, 32s) vs default (1s, 2s, 4s) if is_5xx: wait_time = 4 * (2 ** attempt) # 4, 8, 16, 32, 64 else: wait_time = 2 ** attempt # 1, 2, 4 if verbose: error_type = "5xx server error" if is_5xx else "error" print(f" Retry {attempt + 1}/{effective_max - 1} after {wait_time}s ({error_type}): {e}") time.sleep(wait_time) def batch_process( files: List[str], prompt: str, model: str, task: str, format_output: str, aspect_ratio: Optional[str] = None, num_images: int = 1, size: str = '1K', resolution: str = '1080p', reference_images: Optional[List[str]] = None, output_file: Optional[str] = None, verbose: bool = False, dry_run: bool = False ) -> List[Dict[str, Any]]: """Batch process multiple files with automatic key rotation.""" # Initialize key rotator or fall back to single key rotator = None api_key = None if KEY_ROTATION_AVAILABLE and find_all_api_keys: all_keys = find_all_api_keys() if all_keys: if len(all_keys) > 1: rotator = KeyRotator(keys=all_keys, verbose=verbose) api_key = rotator.get_key() if verbose: print(f"✓ Key rotation enabled with {len(all_keys)} keys", file=sys.stderr) else: api_key = all_keys[0] if verbose: print(f"✓ Using single API key: {api_key[:8]}...", file=sys.stderr) # Fallback to original single-key lookup if not api_key: api_key = find_api_key() if not api_key: print("Error: GEMINI_API_KEY not found in any location") print("\nSearched locations (highest to lowest priority):") print(" 1. OS environment (process.env)") if CENTRALIZED_RESOLVER_AVAILABLE: from resolve_env import get_env_file_paths for i, (desc, path) in enumerate(get_env_file_paths('ai-multimodal'), 2): exists = "[OK]" if path.exists() else "[ ]" print(f" {i}. {exists} {path}") else: print(" 2-7. .env files (centralized resolver unavailable)") print("\nQuick fix — add your key to any .env file above:") print(" echo 'GEMINI_API_KEY=your-key' >> ~/.opencode/.env") print("\nOther options:") print(" - Run setup checker: python scripts/check_setup.py") print(" - Show full hierarchy: python ~/.opencode/scripts/resolve_env.py --show-hierarchy --skill ai-multimodal -v") print("\nFor key rotation, add multiple keys to any .env:") print(" GEMINI_API_KEY=key1") print(" GEMINI_API_KEY_2=key2") print(" GEMINI_API_KEY_3=key3") sys.exit(1) if dry_run: print("DRY RUN MODE - No API calls will be made") print(f"Files to process: {len(files)}") print(f"Model: {model}") print(f"Task: {task}") print(f"Prompt: {prompt}") if rotator: print(f"API keys available: {rotator.key_count}") return [] # Create client with current key client = genai.Client(api_key=api_key) results = [] def get_client_with_rotation(error: Optional[Exception] = None) -> Optional[genai.Client]: """Get client, rotating key if rate limited.""" nonlocal client, api_key if error and rotator and is_rate_limit_error and is_rate_limit_error(error): # Try to rotate to next key if rotator.mark_rate_limited(str(error)): new_key = rotator.get_key() if new_key: api_key = new_key client = genai.Client(api_key=api_key) return client # All keys exhausted return None return client # For generation tasks without input files, process once if task == 'generate' and not files: if verbose: print(f"\nGenerating image from prompt...") # Use Imagen 4 API for imagen models if model.startswith('imagen-') or model in IMAGEN_MODELS: result = generate_image_imagen4( client=client, prompt=prompt, model=model, num_images=num_images, aspect_ratio=aspect_ratio or '1:1', size=size or '1K', # Default to 1K for Imagen models verbose=verbose ) # Silent fallback to cheaper model if Imagen billing required if result.get('status') == 'billing_required': if verbose: print(f" Falling back to: {IMAGE_MODEL_FALLBACK}") result = process_file( client=client, file_path=None, prompt=prompt, model=IMAGE_MODEL_FALLBACK, task=task, format_output=format_output, aspect_ratio=aspect_ratio, image_size=size, verbose=verbose ) # Check if free tier (zero quota) - stop immediately with clear message error_str = result.get('error', '') if result.get('status') == 'error': if _is_free_tier_quota_error(Exception(error_str)): result['error'] = FREE_TIER_NO_ACCESS_MSG elif _is_billing_error(Exception(error_str)): result['error'] = ( "Image generation requires billing. Enable billing at: " "https://aistudio.google.com/apikey or use Google Cloud credits." ) else: # Nano Banana (Flash/Pro) or other models via generate_content API result = process_file( client=client, file_path=None, prompt=prompt, model=model, task=task, format_output=format_output, aspect_ratio=aspect_ratio, image_size=size, verbose=verbose ) # Check for free tier error if result.get('status') == 'error': error_str = result.get('error', '') if _is_free_tier_quota_error(Exception(error_str)): result['error'] = FREE_TIER_NO_ACCESS_MSG results.append(result) if verbose: status = result.get('status', 'unknown') print(f" Status: {status}") elif task == 'generate-video' and not files: if verbose: print(f"\nGenerating video from prompt...") result = generate_video_veo( client=client, prompt=prompt, model=model, resolution=resolution, aspect_ratio=aspect_ratio or '16:9', reference_images=reference_images, verbose=verbose ) # Check for free tier error - video gen has NO free tier access if result.get('status') == 'error': error_str = result.get('error', '') if _is_free_tier_quota_error(Exception(error_str)) or _is_billing_error(Exception(error_str)): result['error'] = FREE_TIER_NO_ACCESS_MSG results.append(result) if verbose: status = result.get('status', 'unknown') print(f" Status: {status}") else: # Process input files with key rotation support for i, file_path in enumerate(files, 1): if verbose: print(f"\n[{i}/{len(files)}] Processing: {file_path}") # Try processing with key rotation on rate limit max_rotation_attempts = rotator.key_count if rotator else 1 result = None for rotation_attempt in range(max_rotation_attempts): result = process_file( client=client, file_path=file_path, prompt=prompt, model=model, task=task, format_output=format_output, aspect_ratio=aspect_ratio, image_size=size, verbose=verbose ) # Check if rate limited and can rotate if (result.get('rate_limited') and rotator and rotation_attempt < max_rotation_attempts - 1): new_client = get_client_with_rotation(Exception(result.get('error', ''))) if new_client: client = new_client if verbose: print(f" Retrying with rotated key...") continue else: # All keys exhausted - mark result with clear error if verbose: print(f" ⚠ All API keys exhausted (on cooldown)", file=sys.stderr) result['error'] = "All API keys exhausted (rate limited). Try again later." break results.append(result) if verbose: status = result.get('status', 'unknown') print(f" Status: {status}") # Save results if output_file: save_results(results, output_file, format_output) return results def print_results(results: List[Dict[str, Any]], task: str) -> None: """Print results to stdout for LLM workflows. Always prints actual results (not just success/fail counts) so LLMs can continue processing based on the output. """ if not results: return print("\n=== RESULTS ===\n") for result in results: file_name = result.get('file', 'generated') status = result.get('status', 'unknown') print(f"[{file_name}]") print(f"Status: {status}") if status == 'success': # Print task-specific output if task in ['analyze', 'transcribe', 'extract']: response = result.get('response') if response: print(f"Result:\n{response}") elif task == 'generate': # Image generation generated_images = result.get('generated_images', []) if generated_images: print(f"Generated images: {len(generated_images)}") for img in generated_images: print(f" - {img}") else: generated_image = result.get('generated_image') if generated_image: print(f"Generated image: {generated_image}") elif task == 'generate-video': generated_video = result.get('generated_video') if generated_video: print(f"Generated video: {generated_video}") gen_time = result.get('generation_time') if gen_time: print(f"Generation time: {gen_time:.1f}s") file_size = result.get('file_size_mb') if file_size: print(f"File size: {file_size:.2f} MB") elif status == 'error': error = result.get('error', 'Unknown error') print(f"Error: {error}") print() # Blank line between results def save_results(results: List[Dict[str, Any]], output_file: str, format_output: str): """Save results to file.""" output_path = Path(output_file) # Special handling for image generation - if output has image extension, copy the generated image image_extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'} video_extensions = {'.mp4', '.mov', '.avi', '.webm'} if output_path.suffix.lower() in image_extensions and len(results) == 1: # Ensure output directory exists output_path.parent.mkdir(parents=True, exist_ok=True) # Check for multiple generated images generated_images = results[0].get('generated_images') if generated_images: # Copy first image to the specified output location shutil.copy2(generated_images[0], output_path) return # Legacy single image field generated_image = results[0].get('generated_image') if generated_image: shutil.copy2(generated_image, output_path) return else: # Don't write text reports to image files - save error as .txt instead output_path = output_path.with_suffix('.error.txt') output_path.parent.mkdir(parents=True, exist_ok=True) # Ensure directory exists print(f"Warning: Generation failed, saving error report to: {output_path}") if output_path.suffix.lower() in video_extensions and len(results) == 1: # Ensure output directory exists output_path.parent.mkdir(parents=True, exist_ok=True) generated_video = results[0].get('generated_video') if generated_video: shutil.copy2(generated_video, output_path) return else: output_path = output_path.with_suffix('.error.txt') output_path.parent.mkdir(parents=True, exist_ok=True) print(f"Warning: Video generation failed, saving error report to: {output_path}") if format_output == 'json': with open(output_path, 'w', encoding='utf-8') as f: json.dump(results, f, indent=2) elif format_output == 'csv': with open(output_path, 'w', newline='', encoding='utf-8') as f: fieldnames = ['file', 'status', 'response', 'error'] writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() for result in results: writer.writerow({ 'file': result.get('file', ''), 'status': result.get('status', ''), 'response': result.get('response', ''), 'error': result.get('error', '') }) else: # markdown with open(output_path, 'w', encoding='utf-8') as f: f.write("# Batch Processing Results\n\n") for i, result in enumerate(results, 1): f.write(f"## {i}. {result.get('file', 'Unknown')}\n\n") f.write(f"**Status**: {result.get('status', 'unknown')}\n\n") if result.get('response'): f.write(f"**Response**:\n\n{result['response']}\n\n") if result.get('error'): f.write(f"**Error**: {result['error']}\n\n") def main(): parser = argparse.ArgumentParser( description='Batch process media files with Gemini API', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Transcribe multiple audio files %(prog)s --files *.mp3 --task transcribe --model gemini-2.5-flash # Analyze images %(prog)s --files *.jpg --task analyze --prompt "Describe this image" \\ --model gemini-2.5-flash # Process PDFs to JSON %(prog)s --files *.pdf --task extract --prompt "Extract data as JSON" \\ --format json --output results.json # Generate images with Nano Banana Flash (fast) %(prog)s --task generate --prompt "A mountain landscape at sunset" \\ --model gemini-2.5-flash-image --aspect-ratio 16:9 --size 2K # Generate images with Nano Banana Pro (4K text, reasoning) %(prog)s --task generate --prompt "Travel poster with text 'EXPLORE'" \\ --model gemini-3-pro-image-preview --aspect-ratio 3:4 --size 4K # Generate images with Imagen 4 (production quality) %(prog)s --task generate --prompt "Product photo of coffee mug" \\ --model imagen-4.0-ultra-generate-001 --aspect-ratio 1:1 --size 2K """ ) parser.add_argument('--files', nargs='*', help='Input files to process') parser.add_argument('--task', choices=['transcribe', 'analyze', 'extract', 'generate', 'generate-video'], help='Task to perform (auto-detected from file type if not specified)') parser.add_argument('--prompt', help='Prompt for analysis/generation') parser.add_argument('--model', help='Model to use (default: auto-detected from task and env vars)') parser.add_argument('--format', dest='format_output', default='text', choices=['text', 'json', 'csv', 'markdown'], help='Output format (default: text)') # Image generation options # All 10 aspect ratios supported by Nano Banana / Imagen 4 parser.add_argument('--aspect-ratio', choices=['1:1', '2:3', '3:2', '3:4', '4:3', '4:5', '5:4', '9:16', '16:9', '21:9'], help='Aspect ratio for image/video generation') parser.add_argument('--num-images', type=int, default=1, help='Number of images to generate (1-4, default: 1)') # 4K available for Nano Banana Pro (gemini-3-pro-image-preview) # Note: Not all models support --size, only use when needed parser.add_argument('--size', choices=['1K', '2K', '4K'], default=None, help='Image size - 1K/2K for Imagen 4, 1K/2K/4K for Nano Banana (optional)') # Video generation options parser.add_argument('--resolution', choices=['720p', '1080p'], default='1080p', help='Video resolution (default: 1080p)') parser.add_argument('--reference-images', nargs='+', help='Reference images for video generation (max 3)') parser.add_argument('--output', help='Output file for results') parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output') parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making API calls') args = parser.parse_args() # Auto-detect task from file type if not specified if not args.task: if args.files and len(args.files) > 0: args.task = infer_task_from_file(args.files[0]) if args.verbose: print(f"Auto-detected task: {args.task} (from file extension)") else: parser.error("--task required when no input files provided") # Auto-detect model if not specified if not args.model: args.model = get_default_model(args.task) if args.verbose: print(f"Auto-detected model: {args.model}") # Validate model/task combination try: validate_model_task_combination(args.model, args.task) except ValueError as e: parser.error(str(e)) # Validate arguments if args.task not in ['generate', 'generate-video'] and not args.files: parser.error("--files required for non-generation tasks") if args.task in ['generate', 'generate-video'] and not args.prompt: parser.error("--prompt required for generation tasks") if args.task not in ['generate', 'generate-video'] and not args.prompt: # Set default prompts if args.task == 'transcribe': args.prompt = 'Generate a transcript with timestamps' elif args.task == 'analyze': args.prompt = 'Analyze this content' elif args.task == 'extract': args.prompt = 'Extract key information' # Process files files = args.files or [] results = batch_process( files=files, prompt=args.prompt, model=args.model, task=args.task, format_output=args.format_output, aspect_ratio=args.aspect_ratio, num_images=args.num_images, size=args.size, resolution=args.resolution, reference_images=args.reference_images, output_file=args.output, verbose=args.verbose, dry_run=args.dry_run ) # Print results and summary if not args.dry_run and results: # Always print actual results for LLM workflows print_results(results, args.task) # Print summary success = sum(1 for r in results if r.get('status') == 'success') failed = len(results) - success print(f"{'='*50}") print(f"Summary: {len(results)} processed, {success} success, {failed} failed") if args.output: print(f"Results saved to: {args.output}") if __name__ == '__main__': main()