#!/usr/bin/env python3
"""
Batch process multiple media files using Gemini API.

Supports all Gemini modalities:
- Audio: Transcription, analysis, summarization
- Image: Captioning, detection, OCR, analysis
- Video: Summarization, Q&A, scene detection
- Document: PDF extraction, structured output
- Generation: Image creation via Imagen 4 or Nano Banana (Gemini native)
  - Nano Banana 2 (gemini-3.1-flash-image-preview): Fastest, 95% Pro quality (default)
  - Nano Banana Pro (gemini-3-pro-image-preview): Quality/4K text/reasoning
  - Imagen 4 (imagen-4.0-*): Production-grade generation
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional
|
|
import csv
|
|
import shutil
|
|
|
|
# Import centralized environment resolver (works for both local and global installs).
# Path layout assumed: <claude_root>/skills/<skill>/scripts/<this file>, so four
# .parent hops reach the root that contains the shared 'scripts' directory.
CLAUDE_ROOT = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(CLAUDE_ROOT / 'scripts'))
try:
    from resolve_env import resolve_env
    CENTRALIZED_RESOLVER_AVAILABLE = True
except ImportError:
    # Fallback if centralized resolver not available: try python-dotenv so
    # find_api_key() can still load .env files manually.
    CENTRALIZED_RESOLVER_AVAILABLE = False
    try:
        from dotenv import load_dotenv
    except ImportError:
        # Neither resolver nor dotenv present; find_api_key() falls back to
        # plain os.getenv lookups only.
        load_dotenv = None
|
|
|
|
# Import key rotation support from the shared 'common' directory.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'common'))
try:
    from api_key_rotator import KeyRotator, is_rate_limit_error, is_server_error
    from api_key_helper import find_all_api_keys
    KEY_ROTATION_AVAILABLE = True
except ImportError:
    # Rotation is optional: stub the names out so call sites can guard on
    # KEY_ROTATION_AVAILABLE and on the truthiness of each helper.
    KEY_ROTATION_AVAILABLE = False
    KeyRotator = None
    is_rate_limit_error = None
    is_server_error = None
    find_all_api_keys = None
|
|
|
|
try:
    from google import genai
    from google.genai import types
except ImportError:
    # google-genai is a hard requirement; fail fast with install instructions.
    print("Error: google-genai package not installed")
    print("Install with: pip install google-genai")
    sys.exit(1)
|
|
|
|
|
|
# Image generation model configuration
# Default: gemini-3.1-flash-image-preview (Nano Banana 2 - 3-5x faster, 95% Pro quality)
# Alternative: imagen-4.0-generate-001 (production quality)
# All image generation requires billing - no completely free option exists
IMAGE_MODEL_DEFAULT = 'gemini-3.1-flash-image-preview'  # Nano Banana 2 (fastest, near-Pro quality)
IMAGE_MODEL_FALLBACK = 'gemini-2.5-flash-image'  # Fallback if Nano Banana 2 fails
# Models served by the dedicated generate_images API (see generate_image_imagen4);
# everything else goes through generate_content.
IMAGEN_MODELS = {
    'imagen-4.0-generate-001',
    'imagen-4.0-ultra-generate-001',
    'imagen-4.0-fast-generate-001',
}
# Video models have no fallback - Veo always requires billing
|
|
|
|
|
|
def find_api_key() -> Optional[str]:
    """Locate the Gemini API key.

    Delegates to the centralized resolver (~/.opencode/scripts/resolve_env.py)
    when it is importable; otherwise falls back to a local lookup that checks
    the process environment first, then loads .env files from the Claude root
    down to the skill directory (most specific file wins via override=True).

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. PROJECT/.opencode/skills/ai-multimodal/.env (skill-specific)
    3. PROJECT/.opencode/skills/.env (shared skills)
    4. PROJECT/.opencode/.env (project global)
    5. ~/.opencode/skills/ai-multimodal/.env (user skill-specific)
    6. ~/.opencode/skills/.env (user shared)
    7. ~/.opencode/.env (user global)

    Returns:
        The API key string, or None if it cannot be found anywhere.
    """
    if CENTRALIZED_RESOLVER_AVAILABLE:
        # Centralized resolver (recommended path).
        return resolve_env('GEMINI_API_KEY', skill='ai-multimodal')

    # Legacy fallback: direct environment lookup first.
    key = os.getenv('GEMINI_API_KEY')
    if key:
        return key

    if load_dotenv:
        here = Path(__file__).parent          # .../skills/ai-multimodal/scripts
        skill_root = here.parent              # .../skills/ai-multimodal
        # Least specific first: override=True means later (more specific)
        # files win over earlier ones.
        candidates = (
            skill_root.parent.parent / '.env',  # claude root
            skill_root.parent / '.env',         # skills dir
            skill_root / '.env',                # this skill
        )
        for candidate in candidates:
            if candidate.exists():
                load_dotenv(candidate, override=True)

        key = os.getenv('GEMINI_API_KEY')
        if key:
            return key

    return None
|
|
|
|
|
|
def get_default_model(task: str) -> str:
    """Resolve the default model name for a task.

    Resolution order within each task family:
    1. Task-specific environment variable (IMAGE_GEN_MODEL, VIDEO_GEN_MODEL,
       MULTIMODAL_MODEL)
    2. Legacy environment variable (GEMINI_IMAGE_GEN_MODEL, GEMINI_MODEL)
    3. Hard-coded default
    """
    if task == 'generate':  # Image generation
        # New env var first, then legacy, then Nano Banana 2 (fastest,
        # near-Pro quality). imagen-4.0-generate-001 is the production
        # quality alternative.
        for var in ('IMAGE_GEN_MODEL', 'GEMINI_IMAGE_GEN_MODEL'):
            configured = os.getenv(var)
            if configured:
                return configured
        return 'gemini-3.1-flash-image-preview'

    if task == 'generate-video':
        return os.getenv('VIDEO_GEN_MODEL') or 'veo-3.1-generate-preview'

    if task in ('analyze', 'transcribe', 'extract'):
        for var in ('MULTIMODAL_MODEL', 'GEMINI_MODEL'):
            configured = os.getenv(var)
            if configured:
                return configured
        return 'gemini-2.5-flash'

    # Unknown task: generic multimodal default.
    return 'gemini-2.5-flash'
|
|
|
|
|
|
def validate_model_task_combination(model: str, task: str) -> None:
    """Check that *model* is able to service *task*.

    Raises:
        ValueError: if the model family cannot perform the requested task.
    """
    # Video generation requires a Veo model.
    if task == 'generate-video' and not model.startswith('veo-'):
        raise ValueError(
            f"Video generation requires Veo model, got '{model}'\n"
            f"Valid models: veo-3.1-generate-preview, veo-3.1-fast-generate-preview, "
            f"veo-3.0-generate-001, veo-3.0-fast-generate-001"
        )

    # Image generation: whitelist of known-good models.
    if task == 'generate':
        valid_image_models = [
            'imagen-4.0-generate-001',
            'imagen-4.0-ultra-generate-001',
            'imagen-4.0-fast-generate-001',
            'gemini-3.1-flash-image-preview',
            'gemini-3-pro-image-preview',
            'gemini-2.5-flash-image',
            'gemini-2.5-flash-image-preview',
        ]
        # Any gemini-* model is tolerated for analysis-based generation
        # (backward compatibility); everything else must be whitelisted.
        if model not in valid_image_models and not model.startswith('gemini-'):
            raise ValueError(
                f"Image generation requires Imagen/Gemini image model, got '{model}'\n"
                f"Valid models: {', '.join(valid_image_models)}"
            )
|
|
|
|
|
|
def infer_task_from_file(file_path: str) -> str:
    """Map a file's extension (case-insensitive) to its default task.

    Returns:
        'transcribe' for audio files, 'extract' for document files, and
        'analyze' for images, video, and anything unrecognized.
    """
    suffix = Path(file_path).suffix.lower()

    task_by_extension = {}
    for ext in ('.mp3', '.wav', '.aac', '.flac', '.ogg', '.aiff', '.m4a'):
        task_by_extension[ext] = 'transcribe'
    for ext in ('.pdf', '.txt', '.html', '.md', '.doc', '.docx'):
        task_by_extension[ext] = 'extract'
    # Image and video extensions both map to 'analyze', which is also the
    # fallback for unknown types, so they need no explicit entries.
    return task_by_extension.get(suffix, 'analyze')
|
|
|
|
|
|
def get_mime_type(file_path: str) -> str:
    """Determine MIME type from file extension.

    Covers every extension accepted by infer_task_from_file(); unknown
    extensions fall back to 'application/octet-stream'.
    """
    ext = Path(file_path).suffix.lower()

    mime_types = {
        # Audio
        '.mp3': 'audio/mp3',
        '.wav': 'audio/wav',
        '.aac': 'audio/aac',
        '.flac': 'audio/flac',
        '.ogg': 'audio/ogg',
        '.aiff': 'audio/aiff',
        '.m4a': 'audio/mp4',  # added: accepted as audio by infer_task_from_file
        # Image
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        '.gif': 'image/gif',  # added: accepted as image elsewhere
        '.bmp': 'image/bmp',  # added: accepted as image elsewhere
        # Video
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.avi': 'video/x-msvideo',
        '.flv': 'video/x-flv',
        '.mpg': 'video/mpeg',
        '.webm': 'video/webm',
        '.wmv': 'video/x-ms-wmv',
        '.3gpp': 'video/3gpp',
        '.mkv': 'video/x-matroska',  # added: accepted as video elsewhere
        # Document
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.md': 'text/markdown',
        '.doc': 'application/msword',  # added: accepted as document elsewhere
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # added
    }

    return mime_types.get(ext, 'application/octet-stream')
|
|
|
|
|
|
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a local file via the Gemini File API and wait until usable.

    Audio and video uploads go through a server-side processing stage; this
    polls every 2 seconds (up to 5 minutes) until the file leaves the
    PROCESSING state. Other media types are usable immediately.

    Raises:
        ValueError: if server-side processing fails.
        TimeoutError: if processing does not finish within the time budget.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    uploaded = client.files.upload(file=file_path)

    # Only audio/video need the processing wait.
    if get_mime_type(file_path).split('/')[0] in ('video', 'audio'):
        max_wait = 300  # 5 minutes
        poll_interval = 2
        waited = 0
        while uploaded.state.name == 'PROCESSING' and waited < max_wait:
            time.sleep(poll_interval)
            uploaded = client.files.get(name=uploaded.name)
            waited += poll_interval
            if verbose and waited % 10 == 0:
                print(f"  Processing... {waited}s")

        if uploaded.state.name == 'FAILED':
            raise ValueError(f"File processing failed: {file_path}")

        if uploaded.state.name == 'PROCESSING':
            raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f"  Uploaded: {uploaded.name}")

    return uploaded
|
|
|
|
|
|
def _is_billing_error(error: Exception) -> bool:
|
|
"""Check if error is due to billing/access restrictions."""
|
|
error_str = str(error).lower()
|
|
billing_indicators = [
|
|
'billing',
|
|
'billed users',
|
|
'payment',
|
|
'access denied',
|
|
'not authorized',
|
|
'permission denied',
|
|
]
|
|
return any(indicator in error_str for indicator in billing_indicators)
|
|
|
|
|
|
def _is_free_tier_quota_error(error: Exception) -> bool:
|
|
"""Check if error indicates free tier has zero quota for this model.
|
|
|
|
Free tier users have NO access to image/video generation models.
|
|
The API returns 'limit: 0' or 'RESOURCE_EXHAUSTED' with quota details.
|
|
"""
|
|
error_str = str(error)
|
|
# Check for zero quota indicators
|
|
return (
|
|
'RESOURCE_EXHAUSTED' in error_str and
|
|
('limit: 0' in error_str or 'free_tier' in error_str.lower())
|
|
)
|
|
|
|
|
|
FREE_TIER_NO_ACCESS_MSG = """
|
|
[FREE TIER LIMITATION] Image/Video generation is NOT available on free tier.
|
|
|
|
Free tier users have zero quota (limit: 0) for:
|
|
- All Imagen models (imagen-4.0-*)
|
|
- All Veo models (veo-*)
|
|
- Gemini image models (gemini-*-image, gemini-*-image-preview)
|
|
|
|
To use image/video generation:
|
|
1. Enable billing: https://aistudio.google.com/apikey
|
|
2. Or use Google Cloud $300 free credits: https://cloud.google.com/free
|
|
|
|
STOP: Do not retry image/video generation on free tier - it will always fail.
|
|
""".strip()
|
|
|
|
|
|
def generate_image_imagen4(
    client,
    prompt: str,
    model: str,
    num_images: int = 1,
    aspect_ratio: str = '1:1',
    size: str = '1K',
    verbose: bool = False
) -> Dict[str, Any]:
    """Generate image(s) using Imagen 4 models via generate_images.

    Args:
        client: genai.Client instance.
        prompt: Text prompt describing the desired image.
        model: Imagen model name (imagen-4.0-*).
        num_images: Number of images to request.
        aspect_ratio: e.g. '1:1', '16:9'.
        size: Image size ('1K'/'2K'); ignored by the Fast model, which does
            not support the imageSize parameter.
        verbose: Print progress to stdout.

    Returns:
        Dict with status 'success' and generated file paths; or special
        status 'billing_required' if the model needs billing (so the caller
        can fall back to the free-tier generate_content API); or 'error'.
    """
    try:
        # Only Standard and Ultra support the imageSize parameter
        # (hoisted: this predicate was previously evaluated twice).
        supports_size = 'fast' not in model.lower() and model.startswith('imagen-')

        config_params = {
            'numberOfImages': num_images,
            'aspectRatio': aspect_ratio
        }
        if supports_size:
            config_params['imageSize'] = size

        gen_config = types.GenerateImagesConfig(**config_params)

        if verbose:
            print(f" Generating with: {model}")
            print(f" Config: {num_images} images, {aspect_ratio}", end='')
            if supports_size:
                print(f", {size}")
            else:
                print()

        response = client.models.generate_images(
            model=model,
            prompt=prompt,
            config=gen_config
        )

        # Resolve the output directory once (it is identical for every image;
        # previously this search + mkdir ran inside the per-image loop).
        # Project root = nearest ancestor containing .git or .claude.
        script_dir = Path(__file__).parent
        project_root = script_dir
        for parent in [script_dir] + list(script_dir.parents):
            if (parent / '.git').exists() or (parent / '.claude').exists():
                project_root = parent
                break

        output_dir = project_root / 'docs' / 'assets'
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save each returned image with a timestamped name.
        generated_files = []
        for i, generated_image in enumerate(response.generated_images):
            output_file = output_dir / f"imagen4_generated_{int(time.time())}_{i}.png"

            with open(output_file, 'wb') as f:
                f.write(generated_image.image.image_bytes)
            generated_files.append(str(output_file))

            if verbose:
                print(f" Saved: {output_file}")

        return {
            'status': 'success',
            'generated_images': generated_files,
            'model': model
        }

    except Exception as e:
        # Billing errors on Imagen models get a special status so the caller
        # can silently fall back to a Gemini image model.
        if _is_billing_error(e) and model in IMAGEN_MODELS:
            return {
                'status': 'billing_required',
                'original_model': model,
                'error': str(e)
            }

        if verbose:
            print(f" Error: {str(e)}")
            import traceback
            traceback.print_exc()
        return {
            'status': 'error',
            'error': str(e)
        }
|
|
|
|
|
|
def generate_video_veo(
    client,
    prompt: str,
    model: str,
    resolution: str = '1080p',
    aspect_ratio: str = '16:9',
    reference_images: Optional[List[str]] = None,
    verbose: bool = False
) -> Dict[str, Any]:
    """Generate video using Veo models.

    For image-to-video with first/last frames (Veo 3.1):
    - First reference image becomes the opening frame (image parameter)
    - Second reference image becomes the closing frame (last_frame config)
    - Model interpolates between them to create smooth video

    Args:
        client: genai.Client instance.
        prompt: Text prompt describing the desired video.
        model: Veo model name (veo-*).
        resolution: Output resolution, e.g. '1080p'.
        aspect_ratio: Output aspect ratio, e.g. '16:9'.
        reference_images: Optional list of image paths (see above).
        verbose: Print progress to stdout.

    Returns:
        Dict with status 'success' plus output path/timing/size metadata,
        or status 'error' with the exception text.
    """
    try:
        # Build config with snake_case for Python SDK
        config_params = {
            'aspect_ratio': aspect_ratio,
            'resolution': resolution
        }

        # Prepare first frame and last frame images
        first_frame = None
        last_frame = None

        if reference_images:
            import mimetypes

            def load_image(img_path_str: str) -> types.Image:
                """Load image file as types.Image with bytes and mime type."""
                img_path = Path(img_path_str)
                image_bytes = img_path.read_bytes()
                mime_type, _ = mimetypes.guess_type(str(img_path))
                if not mime_type:
                    # Default when the extension is unrecognized.
                    mime_type = 'image/png'
                return types.Image(
                    image_bytes=image_bytes,
                    mime_type=mime_type
                )

            # First image = opening frame
            if len(reference_images) >= 1:
                first_frame = load_image(reference_images[0])

            # Second image = closing frame (last_frame in config)
            if len(reference_images) >= 2:
                last_frame = load_image(reference_images[1])
                config_params['last_frame'] = last_frame

        gen_config = types.GenerateVideosConfig(**config_params)

        if verbose:
            print(f" Generating video with Veo: {model}")
            print(f" Config: {resolution}, {aspect_ratio}")
            if first_frame:
                print(f" First frame: provided")
            if last_frame:
                print(f" Last frame: provided (interpolation mode)")

        start = time.time()

        if verbose:
            print(f" Starting video generation (this may take 11s-6min)...")

        # Call generate_videos with image parameter for first frame
        operation = client.models.generate_videos(
            model=model,
            prompt=prompt,
            image=first_frame,  # First frame as opening image
            config=gen_config
        )

        # Poll the long-running operation every 10s until complete.
        poll_count = 0
        while not operation.done:
            poll_count += 1
            if verbose and poll_count % 3 == 0:  # Update every 30s
                elapsed = time.time() - start
                print(f" Still generating... ({elapsed:.0f}s elapsed)")
            time.sleep(10)
            operation = client.operations.get(operation)

        duration = time.time() - start

        # Access generated video from operation response
        generated_video = operation.response.generated_videos[0]

        # Download the video file first (required before .save() below)
        client.files.download(file=generated_video.video)

        # Save video under <project root>/docs/assets; the project root is
        # the nearest ancestor containing .git or .claude.
        script_dir = Path(__file__).parent
        project_root = script_dir
        for parent in [script_dir] + list(script_dir.parents):
            if (parent / '.git').exists() or (parent / '.claude').exists():
                project_root = parent
                break

        output_dir = project_root / 'docs' / 'assets'
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"veo_generated_{int(time.time())}.mp4"

        # Now save to file
        generated_video.video.save(str(output_file))

        file_size = output_file.stat().st_size / (1024 * 1024)  # MB

        if verbose:
            print(f" Generated in {duration:.1f}s")
            print(f" File size: {file_size:.2f} MB")
            print(f" Saved: {output_file}")

        return {
            'status': 'success',
            'generated_video': str(output_file),
            'generation_time': duration,
            'file_size_mb': file_size,
            'model': model
        }

    except Exception as e:
        if verbose:
            print(f" Error: {str(e)}")
            import traceback
            traceback.print_exc()
        return {
            'status': 'error',
            'error': str(e)
        }
|
|
|
|
|
|
def process_file(
    client: genai.Client,
    file_path: Optional[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    image_size: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Process a single file (or a file-less generation prompt) with retries.

    Args:
        client: genai.Client instance.
        file_path: Input media path, or None for pure generation tasks.
        prompt: Instruction/prompt text.
        model: Model name passed to generate_content.
        task: One of 'generate', 'analyze', 'transcribe', 'extract'.
        format_output: 'json' requests an application/json response MIME type.
        aspect_ratio: Optional aspect ratio for image generation.
        image_size: Image size for Nano Banana models (1K, 2K, 4K). Must be uppercase K.
            Note: Not all models support image_size - only pass when explicitly needed.
        verbose: Print progress/retry information.
        max_retries: Base retry budget; transient 5xx errors get up to 5 attempts.

    Returns:
        Result dict with 'status' of 'success' or 'error'. Rate-limited
        failures carry a 'rate_limited' flag so the caller can rotate keys.
    """
    # BUGFIX: the previous `for attempt in range(max_retries)` loop could fall
    # off the end and implicitly return None when a 5xx error raised the
    # effective budget (max(max_retries, 5)) above max_retries, crashing
    # callers that do result.get(...). A manual counter honors the larger
    # budget and always returns a result dict.
    attempt = 0
    while True:
        try:
            # For generation tasks without input files
            if task == 'generate' and not file_path:
                content = [prompt]
            else:
                # Process input file
                file_path = Path(file_path)
                # Files >20MB must go through the File API; smaller files are
                # sent inline.
                file_size = file_path.stat().st_size
                use_file_api = file_size > 20 * 1024 * 1024  # >20MB

                if use_file_api:
                    # Upload to File API
                    myfile = upload_file(client, str(file_path), verbose)
                    content = [prompt, myfile]
                else:
                    # Inline data
                    with open(file_path, 'rb') as f:
                        file_bytes = f.read()

                    mime_type = get_mime_type(str(file_path))
                    content = [
                        prompt,
                        types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                    ]

            # Configure request
            config_args = {}
            if task == 'generate':
                # Nano Banana requires fully uppercase 'IMAGE' per API spec
                config_args['response_modalities'] = ['IMAGE']
                # Build image_config with aspect_ratio and/or image_size
                image_config_args = {}
                if aspect_ratio:
                    image_config_args['aspect_ratio'] = aspect_ratio
                if image_size:
                    # image_size must be uppercase K (1K, 2K, 4K)
                    image_config_args['image_size'] = image_size
                if image_config_args:
                    config_args['image_config'] = types.ImageConfig(**image_config_args)

            if format_output == 'json':
                config_args['response_mime_type'] = 'application/json'

            config = types.GenerateContentConfig(**config_args) if config_args else None

            # Generate content
            response = client.models.generate_content(
                model=model,
                contents=content,
                config=config
            )

            # Extract response
            result = {
                'file': str(file_path) if file_path else 'generated',
                'status': 'success',
                'response': response.text if hasattr(response, 'text') else None
            }

            # Handle image output: write any inline image parts to disk.
            if task == 'generate' and hasattr(response, 'candidates'):
                for i, part in enumerate(response.candidates[0].content.parts):
                    if part.inline_data:
                        # Determine output directory - use project root docs/assets
                        if file_path:
                            output_dir = Path(file_path).parent
                            base_name = Path(file_path).stem
                        else:
                            # Find project root (look for .git or .claude directory)
                            script_dir = Path(__file__).parent
                            project_root = script_dir
                            for parent in [script_dir] + list(script_dir.parents):
                                if (parent / '.git').exists() or (parent / '.claude').exists():
                                    project_root = parent
                                    break

                            output_dir = project_root / 'docs' / 'assets'
                            output_dir.mkdir(parents=True, exist_ok=True)
                            base_name = "generated"

                        output_file = output_dir / f"{base_name}_generated_{i}.png"
                        with open(output_file, 'wb') as f:
                            f.write(part.inline_data.data)
                        result['generated_image'] = str(output_file)
                        if verbose:
                            print(f" Saved image to: {output_file}")

            return result

        except Exception as e:
            # Don't retry on billing/free tier errors - they won't resolve
            if _is_billing_error(e) or _is_free_tier_quota_error(e):
                return {
                    'file': str(file_path) if file_path else 'generated',
                    'status': 'error',
                    'error': str(e)
                }

            # Check if this is a rate limit error (candidate for key rotation)
            is_rate_limited = (
                KEY_ROTATION_AVAILABLE and
                is_rate_limit_error and
                is_rate_limit_error(e)
            )

            # Check if this is a transient server error (503, 500, etc.)
            is_5xx = (
                KEY_ROTATION_AVAILABLE and
                is_server_error and
                is_server_error(e)
            )

            # Use more retries for transient 5xx errors (up to 5 attempts)
            effective_max = max(max_retries, 5) if is_5xx else max_retries

            if attempt >= effective_max - 1:
                return {
                    'file': str(file_path) if file_path else 'generated',
                    'status': 'error',
                    'error': str(e),
                    'rate_limited': is_rate_limited  # Flag for caller to handle rotation
                }

            # Longer backoff for 5xx (4s, 8s, 16s, 32s) vs default (1s, 2s, 4s)
            if is_5xx:
                wait_time = 4 * (2 ** attempt)  # 4, 8, 16, 32, 64
            else:
                wait_time = 2 ** attempt  # 1, 2, 4
            if verbose:
                error_type = "5xx server error" if is_5xx else "error"
                print(f" Retry {attempt + 1}/{effective_max - 1} after {wait_time}s ({error_type}): {e}")
            time.sleep(wait_time)
            attempt += 1
|
|
|
|
|
|
def batch_process(
    files: List[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    num_images: int = 1,
    size: str = '1K',
    resolution: str = '1080p',
    reference_images: Optional[List[str]] = None,
    output_file: Optional[str] = None,
    verbose: bool = False,
    dry_run: bool = False
) -> List[Dict[str, Any]]:
    """Batch process multiple files with automatic key rotation.

    Args:
        files: Input media paths; may be empty for generation tasks.
        prompt: Instruction text applied per file (or alone for generation).
        model: Model name (Gemini / Imagen / Veo depending on task).
        task: 'generate', 'generate-video', 'analyze', 'transcribe', 'extract'.
        format_output: Forwarded to process_file and save_results.
        aspect_ratio: Optional aspect ratio for generation tasks.
        num_images: Number of images for Imagen generation.
        size: Image size hint (e.g. '1K').
        resolution: Video resolution for Veo generation.
        reference_images: Optional first/last frame images for Veo.
        output_file: Optional path to persist results via save_results.
        verbose: Print progress to stdout/stderr.
        dry_run: Print the planned run and return [] without API calls.

    Returns:
        List of per-item result dicts (empty for dry runs).

    Exits:
        sys.exit(1) when no API key can be found anywhere.
    """

    # Initialize key rotator or fall back to single key
    rotator = None
    api_key = None

    if KEY_ROTATION_AVAILABLE and find_all_api_keys:
        all_keys = find_all_api_keys()
        if all_keys:
            if len(all_keys) > 1:
                # Multiple keys found: rotate between them on rate limits.
                rotator = KeyRotator(keys=all_keys, verbose=verbose)
                api_key = rotator.get_key()
                if verbose:
                    print(f"✓ Key rotation enabled with {len(all_keys)} keys", file=sys.stderr)
            else:
                api_key = all_keys[0]
                if verbose:
                    print(f"✓ Using single API key: {api_key[:8]}...", file=sys.stderr)

    # Fallback to original single-key lookup
    if not api_key:
        api_key = find_api_key()

    if not api_key:
        # No key anywhere: print the full search hierarchy and bail out.
        print("Error: GEMINI_API_KEY not found in any location")
        print("\nSearched locations (highest to lowest priority):")
        print(" 1. OS environment (process.env)")
        if CENTRALIZED_RESOLVER_AVAILABLE:
            from resolve_env import get_env_file_paths
            for i, (desc, path) in enumerate(get_env_file_paths('ai-multimodal'), 2):
                exists = "[OK]" if path.exists() else "[ ]"
                print(f" {i}. {exists} {path}")
        else:
            print(" 2-7. .env files (centralized resolver unavailable)")
        print("\nQuick fix — add your key to any .env file above:")
        print(" echo 'GEMINI_API_KEY=your-key' >> ~/.opencode/.env")
        print("\nOther options:")
        print(" - Run setup checker: python scripts/check_setup.py")
        print(" - Show full hierarchy: python ~/.opencode/scripts/resolve_env.py --show-hierarchy --skill ai-multimodal -v")
        print("\nFor key rotation, add multiple keys to any .env:")
        print(" GEMINI_API_KEY=key1")
        print(" GEMINI_API_KEY_2=key2")
        print(" GEMINI_API_KEY_3=key3")
        sys.exit(1)

    if dry_run:
        print("DRY RUN MODE - No API calls will be made")
        print(f"Files to process: {len(files)}")
        print(f"Model: {model}")
        print(f"Task: {task}")
        print(f"Prompt: {prompt}")
        if rotator:
            print(f"API keys available: {rotator.key_count}")
        return []

    # Create client with current key
    client = genai.Client(api_key=api_key)
    results = []

    def get_client_with_rotation(error: Optional[Exception] = None) -> Optional[genai.Client]:
        """Get client, rotating key if rate limited.

        Returns the (possibly new) client, or None when every key is on
        cooldown.
        """
        nonlocal client, api_key

        if error and rotator and is_rate_limit_error and is_rate_limit_error(error):
            # Try to rotate to next key
            if rotator.mark_rate_limited(str(error)):
                new_key = rotator.get_key()
                if new_key:
                    api_key = new_key
                    client = genai.Client(api_key=api_key)
                    return client
            # All keys exhausted
            return None
        return client

    # For generation tasks without input files, process once
    if task == 'generate' and not files:
        if verbose:
            print(f"\nGenerating image from prompt...")

        # Use Imagen 4 API for imagen models
        if model.startswith('imagen-') or model in IMAGEN_MODELS:
            result = generate_image_imagen4(
                client=client,
                prompt=prompt,
                model=model,
                num_images=num_images,
                aspect_ratio=aspect_ratio or '1:1',
                size=size or '1K',  # Default to 1K for Imagen models
                verbose=verbose
            )

            # Silent fallback to cheaper model if Imagen billing required
            if result.get('status') == 'billing_required':
                if verbose:
                    print(f" Falling back to: {IMAGE_MODEL_FALLBACK}")
                result = process_file(
                    client=client,
                    file_path=None,
                    prompt=prompt,
                    model=IMAGE_MODEL_FALLBACK,
                    task=task,
                    format_output=format_output,
                    aspect_ratio=aspect_ratio,
                    image_size=size,
                    verbose=verbose
                )
            # Check if free tier (zero quota) - stop immediately with clear message
            error_str = result.get('error', '')
            if result.get('status') == 'error':
                if _is_free_tier_quota_error(Exception(error_str)):
                    result['error'] = FREE_TIER_NO_ACCESS_MSG
                elif _is_billing_error(Exception(error_str)):
                    result['error'] = (
                        "Image generation requires billing. Enable billing at: "
                        "https://aistudio.google.com/apikey or use Google Cloud credits."
                    )
        else:
            # Nano Banana (Flash/Pro) or other models via generate_content API
            result = process_file(
                client=client,
                file_path=None,
                prompt=prompt,
                model=model,
                task=task,
                format_output=format_output,
                aspect_ratio=aspect_ratio,
                image_size=size,
                verbose=verbose
            )
            # Check for free tier error
            if result.get('status') == 'error':
                error_str = result.get('error', '')
                if _is_free_tier_quota_error(Exception(error_str)):
                    result['error'] = FREE_TIER_NO_ACCESS_MSG

        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")

    elif task == 'generate-video' and not files:
        if verbose:
            print(f"\nGenerating video from prompt...")

        result = generate_video_veo(
            client=client,
            prompt=prompt,
            model=model,
            resolution=resolution,
            aspect_ratio=aspect_ratio or '16:9',
            reference_images=reference_images,
            verbose=verbose
        )

        # Check for free tier error - video gen has NO free tier access
        if result.get('status') == 'error':
            error_str = result.get('error', '')
            if _is_free_tier_quota_error(Exception(error_str)) or _is_billing_error(Exception(error_str)):
                result['error'] = FREE_TIER_NO_ACCESS_MSG

        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")
    else:
        # Process input files with key rotation support
        for i, file_path in enumerate(files, 1):
            if verbose:
                print(f"\n[{i}/{len(files)}] Processing: {file_path}")

            # Try processing with key rotation on rate limit
            max_rotation_attempts = rotator.key_count if rotator else 1
            result = None

            for rotation_attempt in range(max_rotation_attempts):
                result = process_file(
                    client=client,
                    file_path=file_path,
                    prompt=prompt,
                    model=model,
                    task=task,
                    format_output=format_output,
                    aspect_ratio=aspect_ratio,
                    image_size=size,
                    verbose=verbose
                )

                # Check if rate limited and can rotate
                if (result.get('rate_limited') and rotator and
                        rotation_attempt < max_rotation_attempts - 1):
                    new_client = get_client_with_rotation(Exception(result.get('error', '')))
                    if new_client:
                        client = new_client
                        if verbose:
                            print(f" Retrying with rotated key...")
                        continue
                    else:
                        # All keys exhausted - mark result with clear error
                        if verbose:
                            print(f" ⚠ All API keys exhausted (on cooldown)", file=sys.stderr)
                        result['error'] = "All API keys exhausted (rate limited). Try again later."
                # Success, non-rate-limit failure, or no keys left: stop retrying.
                break

            results.append(result)

            if verbose:
                status = result.get('status', 'unknown')
                print(f" Status: {status}")

    # Save results
    if output_file:
        save_results(results, output_file, format_output)

    return results
|
|
|
|
|
|
def print_results(results: List[Dict[str, Any]], task: str) -> None:
|
|
"""Print results to stdout for LLM workflows.
|
|
|
|
Always prints actual results (not just success/fail counts) so LLMs
|
|
can continue processing based on the output.
|
|
"""
|
|
if not results:
|
|
return
|
|
|
|
print("\n=== RESULTS ===\n")
|
|
|
|
for result in results:
|
|
file_name = result.get('file', 'generated')
|
|
status = result.get('status', 'unknown')
|
|
|
|
print(f"[{file_name}]")
|
|
print(f"Status: {status}")
|
|
|
|
if status == 'success':
|
|
# Print task-specific output
|
|
if task in ['analyze', 'transcribe', 'extract']:
|
|
response = result.get('response')
|
|
if response:
|
|
print(f"Result:\n{response}")
|
|
|
|
elif task == 'generate':
|
|
# Image generation
|
|
generated_images = result.get('generated_images', [])
|
|
if generated_images:
|
|
print(f"Generated images: {len(generated_images)}")
|
|
for img in generated_images:
|
|
print(f" - {img}")
|
|
else:
|
|
generated_image = result.get('generated_image')
|
|
if generated_image:
|
|
print(f"Generated image: {generated_image}")
|
|
|
|
elif task == 'generate-video':
|
|
generated_video = result.get('generated_video')
|
|
if generated_video:
|
|
print(f"Generated video: {generated_video}")
|
|
gen_time = result.get('generation_time')
|
|
if gen_time:
|
|
print(f"Generation time: {gen_time:.1f}s")
|
|
file_size = result.get('file_size_mb')
|
|
if file_size:
|
|
print(f"File size: {file_size:.2f} MB")
|
|
|
|
elif status == 'error':
|
|
error = result.get('error', 'Unknown error')
|
|
print(f"Error: {error}")
|
|
|
|
print() # Blank line between results
|
|
|
|
|
|
def save_results(results: List[Dict[str, Any]], output_file: str, format_output: str):
    """Persist batch results to *output_file*.

    When the destination carries an image/video extension and exactly one
    result exists, the generated media file itself is copied there; a failed
    generation is redirected to a ``.error.txt`` report so a binary path never
    receives a text report. Otherwise results are serialized as JSON, CSV, or
    markdown (markdown is also the fallback for 'text').

    Args:
        results: Per-file result dicts from the batch run.
        output_file: Destination path for the report or media copy.
        format_output: One of 'text', 'json', 'csv', 'markdown'.
    """
    dest = Path(output_file)

    image_suffixes = {'.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'}
    video_suffixes = {'.mp4', '.mov', '.avi', '.webm'}

    if dest.suffix.lower() in image_suffixes and len(results) == 1:
        dest.parent.mkdir(parents=True, exist_ok=True)

        first = results[0]

        # Multi-image field: the first image lands at the requested path.
        multi = first.get('generated_images')
        if multi:
            shutil.copy2(multi[0], dest)
            return

        # Legacy single-image field.
        single = first.get('generated_image')
        if single:
            shutil.copy2(single, dest)
            return

        # Generation failed: never write a text report under an image name.
        dest = dest.with_suffix('.error.txt')
        dest.parent.mkdir(parents=True, exist_ok=True)
        print(f"Warning: Generation failed, saving error report to: {dest}")

    if dest.suffix.lower() in video_suffixes and len(results) == 1:
        dest.parent.mkdir(parents=True, exist_ok=True)

        video = results[0].get('generated_video')
        if video:
            shutil.copy2(video, dest)
            return

        dest = dest.with_suffix('.error.txt')
        dest.parent.mkdir(parents=True, exist_ok=True)
        print(f"Warning: Video generation failed, saving error report to: {dest}")

    if format_output == 'json':
        with open(dest, 'w', encoding='utf-8') as fh:
            json.dump(results, fh, indent=2)
    elif format_output == 'csv':
        columns = ('file', 'status', 'response', 'error')
        with open(dest, 'w', newline='', encoding='utf-8') as fh:
            writer = csv.DictWriter(fh, fieldnames=list(columns))
            writer.writeheader()
            for row in results:
                writer.writerow({key: row.get(key, '') for key in columns})
    else:  # markdown (also the fallback for 'text')
        with open(dest, 'w', encoding='utf-8') as fh:
            fh.write("# Batch Processing Results\n\n")
            for idx, row in enumerate(results, 1):
                fh.write(f"## {idx}. {row.get('file', 'Unknown')}\n\n")
                fh.write(f"**Status**: {row.get('status', 'unknown')}\n\n")
                if row.get('response'):
                    fh.write(f"**Response**:\n\n{row['response']}\n\n")
                if row.get('error'):
                    fh.write(f"**Error**: {row['error']}\n\n")
|
|
|
|
|
|
def main():
    """CLI entry point: parse args, infer task/model defaults, run the batch.

    Exits via ``parser.error`` (status 2) on invalid argument combinations.
    Side effects: prints results/summary to stdout and, with ``--output``,
    writes a report or copies generated media via ``batch_process``.
    """
    parser = argparse.ArgumentParser(
        description='Batch process media files with Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Transcribe multiple audio files
  %(prog)s --files *.mp3 --task transcribe --model gemini-2.5-flash

  # Analyze images
  %(prog)s --files *.jpg --task analyze --prompt "Describe this image" \\
      --model gemini-2.5-flash

  # Process PDFs to JSON
  %(prog)s --files *.pdf --task extract --prompt "Extract data as JSON" \\
      --format json --output results.json

  # Generate images with Nano Banana Flash (fast)
  %(prog)s --task generate --prompt "A mountain landscape at sunset" \\
      --model gemini-2.5-flash-image --aspect-ratio 16:9 --size 2K

  # Generate images with Nano Banana Pro (4K text, reasoning)
  %(prog)s --task generate --prompt "Travel poster with text 'EXPLORE'" \\
      --model gemini-3-pro-image-preview --aspect-ratio 3:4 --size 4K

  # Generate images with Imagen 4 (production quality)
  %(prog)s --task generate --prompt "Product photo of coffee mug" \\
      --model imagen-4.0-ultra-generate-001 --aspect-ratio 1:1 --size 2K
"""
    )

    parser.add_argument('--files', nargs='*', help='Input files to process')
    parser.add_argument('--task',
                        choices=['transcribe', 'analyze', 'extract', 'generate', 'generate-video'],
                        help='Task to perform (auto-detected from file type if not specified)')
    parser.add_argument('--prompt', help='Prompt for analysis/generation')
    parser.add_argument('--model',
                        help='Model to use (default: auto-detected from task and env vars)')
    parser.add_argument('--format', dest='format_output', default='text',
                        choices=['text', 'json', 'csv', 'markdown'],
                        help='Output format (default: text)')

    # Image generation options.
    # All 10 aspect ratios supported by Nano Banana / Imagen 4.
    parser.add_argument('--aspect-ratio',
                        choices=['1:1', '2:3', '3:2', '3:4', '4:3', '4:5', '5:4', '9:16', '16:9', '21:9'],
                        help='Aspect ratio for image/video generation')
    parser.add_argument('--num-images', type=int, default=1,
                        help='Number of images to generate (1-4, default: 1)')
    # 4K available for Nano Banana Pro (gemini-3-pro-image-preview).
    # Note: not all models support --size, only use when needed.
    parser.add_argument('--size', choices=['1K', '2K', '4K'], default=None,
                        help='Image size - 1K/2K for Imagen 4, 1K/2K/4K for Nano Banana (optional)')

    # Video generation options.
    parser.add_argument('--resolution', choices=['720p', '1080p'], default='1080p',
                        help='Video resolution (default: 1080p)')
    parser.add_argument('--reference-images', nargs='+',
                        help='Reference images for video generation (max 3)')

    parser.add_argument('--output', help='Output file for results')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making API calls')

    args = parser.parse_args()

    # Fix: the help text documents a 1-4 range but it was never enforced, so
    # an invalid count flowed straight to the API. Fail fast with a clear error.
    if not 1 <= args.num_images <= 4:
        parser.error("--num-images must be between 1 and 4")

    # Auto-detect task from the first file's extension if not specified.
    if not args.task:
        if args.files:
            args.task = infer_task_from_file(args.files[0])
            if args.verbose:
                print(f"Auto-detected task: {args.task} (from file extension)")
        else:
            parser.error("--task required when no input files provided")

    # Auto-detect model if not specified.
    if not args.model:
        args.model = get_default_model(args.task)
        if args.verbose:
            print(f"Auto-detected model: {args.model}")

    # Validate model/task combination.
    try:
        validate_model_task_combination(args.model, args.task)
    except ValueError as e:
        parser.error(str(e))

    generation_task = args.task in ('generate', 'generate-video')

    # Validate argument combinations.
    if not generation_task and not args.files:
        parser.error("--files required for non-generation tasks")

    if generation_task and not args.prompt:
        parser.error("--prompt required for generation tasks")

    if not generation_task and not args.prompt:
        # Default prompts for analysis-style tasks.
        args.prompt = {
            'transcribe': 'Generate a transcript with timestamps',
            'analyze': 'Analyze this content',
            'extract': 'Extract key information',
        }.get(args.task)

    # Process files.
    results = batch_process(
        files=args.files or [],
        prompt=args.prompt,
        model=args.model,
        task=args.task,
        format_output=args.format_output,
        aspect_ratio=args.aspect_ratio,
        num_images=args.num_images,
        size=args.size,
        resolution=args.resolution,
        reference_images=args.reference_images,
        output_file=args.output,
        verbose=args.verbose,
        dry_run=args.dry_run
    )

    # Print results and summary.
    if not args.dry_run and results:
        # Always print actual results for LLM workflows.
        print_results(results, args.task)

        success = sum(1 for r in results if r.get('status') == 'success')
        failed = len(results) - success
        print('=' * 50)
        print(f"Summary: {len(results)} processed, {success} success, {failed} failed")
        if args.output:
            print(f"Results saved to: {args.output}")
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
|