This commit is contained in:
2026-04-12 01:06:31 +07:00
commit 10d660cbcb
1066 changed files with 228596 additions and 0 deletions

View File

@@ -0,0 +1,230 @@
# Google Gemini API Configuration
# ============================================================================
# OPTION 1: Google AI Studio (Default - Recommended for most users)
# ============================================================================
# Get your API key: https://aistudio.google.com/apikey
GEMINI_API_KEY=your_api_key_here
# ============================================================================
# API Key Rotation (Optional - For high-volume usage)
# ============================================================================
# Add multiple API keys for automatic rotation on rate limit errors.
# Free tier accounts are heavily rate-limited; rotation helps distribute load.
#
# Format: GEMINI_API_KEY_N where N is 2, 3, 4, etc.
# The primary GEMINI_API_KEY is always used first.
#
# GEMINI_API_KEY_2=your_second_api_key
# GEMINI_API_KEY_3=your_third_api_key
# GEMINI_API_KEY_4=your_fourth_api_key
#
# Features:
# - Auto-rotates on RESOURCE_EXHAUSTED / 429 errors
# - 60-second cooldown per key after rate limit
# - Logs rotation events with --verbose flag
# - Backward compatible: single key still works
# ============================================================================
# OPTION 2: Vertex AI (Google Cloud Platform)
# ============================================================================
# Uncomment these lines to use Vertex AI instead of Google AI Studio
# GEMINI_USE_VERTEX=true
# VERTEX_PROJECT_ID=your-gcp-project-id
# VERTEX_LOCATION=us-central1
# ============================================================================
# Model Selection (Optional)
# ============================================================================
# Override default models for specific capabilities
# If not set, intelligent defaults are used based on task type
# --- Image Generation ---
# Used by: --task generate (image)
# Default: gemini-2.5-flash-image (Nano Banana Flash - fast, cost-effective)
# Alternative: imagen-4.0-generate-001 (production quality)
# NOTE: All image generation requires billing - no free tier available (limit: 0)
# Options:
# gemini-2.5-flash-image - Nano Banana Flash: fast, ~$1/1M tokens (DEFAULT)
# gemini-3-pro-image-preview - Nano Banana Pro: 4K text, reasoning (requires billing)
# imagen-4.0-generate-001 - Imagen 4 Standard: production quality (~$0.02/image)
# imagen-4.0-ultra-generate-001 - Imagen 4 Ultra: maximum quality (~$0.04/image)
# imagen-4.0-fast-generate-001 - Imagen 4 Fast: speed-optimized (~$0.01/image)
# IMAGE_GEN_MODEL=gemini-2.5-flash-image
# --- Video Generation ---
# Used by: --task generate-video (new capability)
# Default: veo-3.1-generate-preview
# NOTE: Video generation requires billing - no free tier fallback available
# Options:
# veo-3.1-generate-preview - Latest, native audio, frame control (requires billing)
# veo-3.1-fast-generate-preview - Speed-optimized for business (requires billing)
# veo-3.0-generate-001 - Stable, native audio, 8s videos (requires billing)
# veo-3.0-fast-generate-001 - Stable fast variant (requires billing)
# VIDEO_GEN_MODEL=veo-3.1-generate-preview
# --- Multimodal Analysis ---
# Used by: --task analyze, transcribe, extract
# Default: gemini-2.5-flash
# Options:
# gemini-3-pro-preview - Latest, agentic workflows, 1M context
# gemini-2.5-flash - Best price/performance (recommended)
# gemini-2.5-pro - Highest quality
# MULTIMODAL_MODEL=gemini-2.5-flash
# --- Legacy Compatibility ---
# Generic model override (use specific variables above instead)
# GEMINI_MODEL=gemini-2.5-flash
# GEMINI_IMAGE_GEN_MODEL=gemini-2.5-flash-image
# ============================================================================
# MiniMax API Configuration (Optional - for image/video/speech/music generation)
# ============================================================================
# Get your API key: https://platform.minimax.io/user-center/basic-information/interface-key
# MINIMAX_API_KEY=your_minimax_api_key_here
# --- MiniMax Image Generation ---
# Models: image-01 (standard), image-01-live (enhanced)
# Cost: ~$0.03/image | Rate: 10 RPM
# MINIMAX_IMAGE_MODEL=image-01
# --- MiniMax Video Generation (Hailuo) ---
# Models: MiniMax-Hailuo-2.3, MiniMax-Hailuo-2.3-Fast, MiniMax-Hailuo-02, S2V-01
# Cost: $0.25-0.52/video | Rate: 5 RPM
# MINIMAX_VIDEO_MODEL=MiniMax-Hailuo-2.3
# --- MiniMax Speech/TTS ---
# Models: speech-2.8-hd (best), speech-2.8-turbo (fast)
# Cost: $30-50/1M chars | Rate: 60 RPM | 300+ voices, 40+ languages
# MINIMAX_SPEECH_MODEL=speech-2.8-hd
# --- MiniMax Music Generation ---
# Models: music-2.5 (4-minute songs with vocals)
# Cost: $0.03-0.075/gen | Rate: 120 RPM
# MINIMAX_MUSIC_MODEL=music-2.5
# ============================================================================
# Rate Limiting Configuration (Optional)
# ============================================================================
# Requests per minute limit (adjust based on your tier)
# GEMINI_RPM_LIMIT=15
# Tokens per minute limit
# GEMINI_TPM_LIMIT=4000000
# Requests per day limit
# GEMINI_RPD_LIMIT=1500
# ============================================================================
# Video Generation Options (Optional)
# ============================================================================
# Video duration in seconds (8s only for now)
# VEO_DURATION=8
# Video resolution: 720p or 1080p
# VEO_RESOLUTION=1080p
# Aspect ratio: 16:9, 9:16, 1:1 (16:9 is default)
# VEO_ASPECT_RATIO=16:9
# Frame rate: 24fps (fixed for now)
# VEO_FPS=24
# Enable native audio generation
# VEO_AUDIO=true
# ============================================================================
# Image Generation Options (Optional)
# ============================================================================
# Number of images to generate (1-4)
# IMAGEN_NUM_IMAGES=1
# Image size: 1K or 2K (Ultra/Standard only)
# IMAGEN_SIZE=1K
# Aspect ratio: 1:1, 16:9, 9:16, 4:3, 3:4
# IMAGEN_ASPECT_RATIO=1:1
# Enable person generation (restricted in EEA, CH, UK)
# IMAGEN_PERSON_GENERATION=true
# Add SynthID watermark (always enabled by default)
# IMAGEN_WATERMARK=true
# ============================================================================
# Processing Options (Optional)
# ============================================================================
# Video resolution mode: default or low-res
# low-res uses ~100 tokens/second vs ~300 for default
# GEMINI_VIDEO_RESOLUTION=default
# Audio quality: default (16 Kbps mono, auto-downsampled)
# GEMINI_AUDIO_QUALITY=default
# PDF processing mode: inline (<20MB) or file-api (>20MB, automatic)
# GEMINI_PDF_MODE=auto
# ============================================================================
# Retry Configuration (Optional)
# ============================================================================
# Maximum retry attempts for failed requests
# GEMINI_MAX_RETRIES=3
# Initial retry delay in seconds (uses exponential backoff)
# GEMINI_RETRY_DELAY=1
# ============================================================================
# Output Configuration (Optional)
# ============================================================================
# Default output directory for generated images
# OUTPUT_DIR=./output
# Image output format (png or jpeg)
# IMAGE_FORMAT=png
# Image quality for JPEG (1-100)
# IMAGE_QUALITY=95
# ============================================================================
# Context Caching (Optional)
# ============================================================================
# Enable context caching for repeated queries on same file
# GEMINI_ENABLE_CACHING=true
# Cache TTL in seconds (default: 1800 = 30 minutes)
# GEMINI_CACHE_TTL=1800
# ============================================================================
# Logging (Optional)
# ============================================================================
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
# LOG_LEVEL=INFO
# Log file path
# LOG_FILE=./logs/gemini.log
# ============================================================================
# Pricing Reference (as of 2025-11)
# ============================================================================
# Gemini 2.5 Flash: $1.00/1M input, $0.10/1M output
# Gemini 2.5 Pro: $3.00/1M input, $12.00/1M output
# Gemini 3 Pro: $2.00/1M input (<200k), $4.00 (>200k), $12/$18 output
# Imagen 4: ~$0.01-$0.04 per image (varies by variant)
# Veo 3: TBD (preview pricing)
# Monitor: https://ai.google.dev/pricing
# ============================================================================
# Notes
# ============================================================================
# 1. Never commit API keys to version control
# 2. Add .env to .gitignore
# 3. API keys can be restricted in Google Cloud Console
# 4. Monitor usage at: https://aistudio.google.com/apikey
# 5. Free tier limits: 15 RPM, 1M-4M TPM, 1,500 RPD
# 6. Vertex AI requires GCP authentication via gcloud CLI
# 7. Model defaults (Dec 2025):
# - Image gen: gemini-2.5-flash-image (Nano Banana Flash - default)
# - Image gen: imagen-4.0-generate-001 (alternative for production)
# - Video gen: veo-3.1-generate-preview
# - Analysis: gemini-2.5-flash
# 8. Preview models (veo-3.1, gemini-3) may have API changes

View File

@@ -0,0 +1,112 @@
---
name: ck:ai-multimodal
description: Analyze images/audio/video with Gemini API (better vision than Claude). Generate images (Imagen 4, Nano Banana 2, MiniMax), videos (Veo 3, Hailuo), speech (MiniMax TTS), music (MiniMax). Use for vision analysis, transcription, OCR, design extraction, multimodal AI.
license: MIT
allowed-tools:
- Bash
- Read
- Write
- Edit
argument-hint: "[file-path] [prompt]"
---
# AI Multimodal
Process audio, images, videos, documents using Gemini. Generate images, videos, speech, music via Gemini + MiniMax.
## Setup
```bash
# Google Gemini (analysis + image/video gen)
export GEMINI_API_KEY="your-key" # https://aistudio.google.com/apikey
# MiniMax (image/video/speech/music gen)
export MINIMAX_API_KEY="your-key" # https://platform.minimax.io/user-center/basic-information/interface-key
pip install google-genai python-dotenv pillow requests
```
### API Key Rotation (Optional)
For high-volume Gemini usage, configure multiple keys:
```bash
export GEMINI_API_KEY="key1"
export GEMINI_API_KEY_2="key2" # auto-rotates on rate limit
```
## Quick Start
**Verify setup**: `python scripts/check_setup.py`
**Analyze media**: `python scripts/gemini_batch_process.py --files <file> --task <analyze|transcribe|extract>`
- TIP: When you're asked to analyze an image, check if `gemini` command is available, then use `echo "<prompt to analyze image>" | gemini -y -m <gemini.model>` command (read model from `$HOME/.opencode/.ck.json`: `gemini.model`). If `gemini` command is not available, use `python scripts/gemini_batch_process.py --files <file> --task analyze` command.
**Generate (Gemini)**: `python scripts/gemini_batch_process.py --task <generate|generate-video> --prompt "desc"`
**Generate (MiniMax)**: `python scripts/minimax_cli.py --task <generate|generate-video|generate-speech|generate-music> --prompt "desc"`
> **Stdin support**: Pipe files via stdin for Gemini analysis (auto-detects PNG/JPG/PDF/WAV/MP3).
## Models
### Google Gemini / Imagen
- **Image gen**: `gemini-3.1-flash-image-preview` (Nano Banana 2 - DEFAULT), `gemini-2.5-flash-image` (Flash), `gemini-3-pro-image-preview` (Pro 4K), `imagen-4.0-generate-001` (standard), `imagen-4.0-ultra-generate-001` (quality), `imagen-4.0-fast-generate-001` (speed)
- **Video gen**: `veo-3.1-generate-preview` (8s clips with audio)
- **Analysis**: `gemini-2.5-flash` (recommended), `gemini-2.5-pro` (advanced)
### MiniMax (NEW)
- **Image gen**: `image-01` (standard), `image-01-live` (enhanced) - $0.03/image, 1-9 batch
- **Video gen (Hailuo)**: `MiniMax-Hailuo-2.3` (1080p), `MiniMax-Hailuo-2.3-Fast` (50% cheaper), `MiniMax-Hailuo-02` (first+last frame), `S2V-01` (subject ref)
- **Speech/TTS**: `speech-2.8-hd` (best), `speech-2.8-turbo` (fast) - 300+ voices, 40+ languages, emotion control
- **Music**: `music-2.5` - 4-minute songs with vocals, synchronized lyrics
## Scripts
- **`gemini_batch_process.py`**: Gemini CLI for `transcribe|analyze|extract|generate|generate-video`. Auto-resolves API keys, Imagen 4 + Veo + Nano Banana workflows.
- **`minimax_cli.py`**: MiniMax CLI for `generate|generate-video|generate-speech|generate-music`. Supports all MiniMax models.
- **`minimax_generate.py`**: MiniMax generation functions (image, video, speech, music). Library for programmatic use.
- **`minimax_api_client.py`**: MiniMax HTTP client, auth, async polling, file download utilities.
- **`media_optimizer.py`**: ffmpeg/Pillow preflight: compress/resize/convert media to stay within API limits.
- **`document_converter.py`**: Gemini-powered PDF/image/Office → markdown converter.
- **`check_setup.py`**: Setup checker for API keys and dependencies.
Use `--help` for options.
## References
Load for detailed guidance:
| Topic | File | Description |
|-------|------|-------------|
| Music | `references/music-generation.md` | Lyria RealTime API for background music generation, style prompts, real-time control, integration with video production. |
| Audio | `references/audio-processing.md` | Audio formats and limits, transcription (timestamps, speakers, segments), non-speech analysis, File API vs inline input, TTS models, best practices, cost and token math, and concrete meeting/podcast/interview recipes. |
| Images | `references/vision-understanding.md` | Vision capabilities overview, supported formats and models, captioning/classification/VQA, detection and segmentation, OCR and document reading, multi-image workflows, structured JSON output, token costs, best practices, and common product/screenshot/chart/scene use cases. |
| Image Gen | `references/image-generation.md` | Imagen 4 and Gemini image model overview, generate_images vs generate_content APIs, aspect ratios and costs, text/image/both modalities, editing and composition, style and quality control, safety settings, best practices, troubleshooting, and common marketing/concept-art/UI scenarios. |
| Video | `references/video-analysis.md` | Video analysis capabilities and supported formats, model/context choices, local/inline/YouTube inputs, clipping and FPS control, multi-video comparison, temporal Q&A and scene detection, transcription with visual context, token and cost guidance, and optimization/best-practice patterns. |
| Video Gen | `references/video-generation.md` | Veo model matrix, text-to-video and image-to-video quick start, multi-reference and extension flows, camera and timing control, configuration (resolution, aspect, audio, safety), prompt design patterns, performance tips, limitations, troubleshooting, and cost estimates. |
| MiniMax | `references/minimax-generation.md` | MiniMax image (image-01), video (Hailuo 2.3), speech (TTS 2.8), and music (2.5) generation APIs. Endpoints, models, parameters, async workflows, pricing, rate limits, voice library, and examples. |
## Limits
**Formats**: Audio (WAV/MP3/AAC, 9.5h), Images (PNG/JPEG/WEBP, up to 3,600 files/request), Video (MP4/MOV, 6h), PDF (1k pages)
**Size**: 20MB inline, 2GB File API
**Important:**
- For audio longer than 15 minutes, the transcript often gets truncated by the output token limit of the Gemini API response. To get a complete transcript, split the audio into chunks of at most 15 minutes and transcribe each chunk separately.
- For videos longer than 15 minutes, use ffmpeg to extract the audio, split it into chunks of at most 15 minutes, transcribe each segment, and then combine the segment transcripts into a single transcript.
**Transcription Output Requirements:**
- Format: Markdown
- Metadata: Duration, file size, generated date, description, file name, topics covered, etc.
- Parts: from-to (e.g., 00:00-00:15), audio chunk name, transcript, status, etc.
- Transcript format:
```
[HH:MM:SS -> HH:MM:SS] transcript content
[HH:MM:SS -> HH:MM:SS] transcript content
...
```
## Outputs
**IMPORTANT:** Invoke "/ck:project-organization" skill to organize the outputs.
## Resources
- [Gemini API Docs](https://ai.google.dev/gemini-api/docs/)
- [Gemini Pricing](https://ai.google.dev/pricing)
- [MiniMax API Docs](https://platform.minimax.io/docs/api-reference/api-overview)
- [MiniMax Pricing](https://platform.minimax.io/pricing)

View File

@@ -0,0 +1,387 @@
# Audio Processing Reference
Comprehensive guide for audio analysis and speech generation using Gemini API.
## Audio Understanding
### Supported Formats
| Format | MIME Type | Best Use |
|--------|-----------|----------|
| WAV | `audio/wav` | Uncompressed, highest quality |
| MP3 | `audio/mp3` | Compressed, widely compatible |
| AAC | `audio/aac` | Compressed, good quality |
| FLAC | `audio/flac` | Lossless compression |
| OGG Vorbis | `audio/ogg` | Open format |
| AIFF | `audio/aiff` | Apple format |
### Specifications
- **Maximum length**: 9.5 hours per request
- **Multiple files**: Unlimited count, combined max 9.5 hours
- **Token rate**: 32 tokens/second (1 minute = 1,920 tokens)
- **Processing**: Auto-downsampled to 16 Kbps mono
- **File size limits**:
- Inline: 20 MB max total request
- File API: 2 GB per file, 20 GB project quota
- Retention: 48 hours auto-delete
- **Important:** transcripts of audio longer than 15 minutes often get truncated by the output token limit of the Gemini API response. For a complete transcript, split the audio into chunks of at most 15 minutes and transcribe each segment separately.
## Transcription
### Basic Transcription
```python
from google import genai
import os
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
# Upload audio
myfile = client.files.upload(file='meeting.mp3')
# Transcribe
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Generate a transcript of the speech.', myfile]
)
print(response.text)
```
### With Timestamps
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Generate transcript with timestamps in MM:SS format.', myfile]
)
```
### Multi-Speaker Identification
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Transcribe with speaker labels. Format: [Speaker 1], [Speaker 2], etc.', myfile]
)
```
### Segment-Specific Transcription
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Transcribe only the segment from 02:30 to 05:15.', myfile]
)
```
## Audio Analysis
### Summarization
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Summarize key points in 5 bullets with timestamps.', myfile]
)
```
### Non-Speech Audio Analysis
```python
# Music analysis
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Identify the musical instruments and genre.', myfile]
)
# Environmental sounds
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Identify all sounds: voices, music, ambient noise.', myfile]
)
# Birdsong identification
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Identify bird species based on their calls.', myfile]
)
```
### Timestamp-Based Analysis
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['What is discussed from 10:30 to 15:45? Provide key points.', myfile]
)
```
## Input Methods
### File Upload (>20MB or Reuse)
```python
# Upload once, use multiple times
myfile = client.files.upload(file='large-audio.mp3')
# First query
response1 = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Transcribe this', myfile]
)
# Second query (reuses same file)
response2 = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Summarize this', myfile]
)
```
### Inline Data (<20MB)
```python
from google.genai import types
with open('small-audio.mp3', 'rb') as f:
audio_bytes = f.read()
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Describe this audio',
types.Part.from_bytes(data=audio_bytes, mime_type='audio/mp3')
]
)
```
## Speech Generation (TTS)
### Available Models
| Model | Quality | Speed | Cost/1M tokens |
|-------|---------|-------|----------------|
| `gemini-2.5-flash-native-audio-preview-09-2025` | High | Fast | $10 |
| `gemini-2.5-pro` TTS mode | Premium | Slower | $20 |
### Basic TTS
```python
response = client.models.generate_content(
model='gemini-2.5-flash-native-audio-preview-09-2025',
contents='Generate audio: Welcome to today\'s episode.'
)
# Save audio
with open('output.wav', 'wb') as f:
f.write(response.audio_data)
```
### Controllable Voice Style
```python
# Professional tone
response = client.models.generate_content(
model='gemini-2.5-flash-native-audio-preview-09-2025',
contents='Generate audio in a professional, clear tone: Welcome to our quarterly earnings call.'
)
# Casual and friendly
response = client.models.generate_content(
model='gemini-2.5-flash-native-audio-preview-09-2025',
contents='Generate audio in a friendly, conversational tone: Hey there! Let\'s dive into today\'s topic.'
)
# Narrative style
response = client.models.generate_content(
model='gemini-2.5-flash-native-audio-preview-09-2025',
contents='Generate audio in a narrative, storytelling tone: Once upon a time, in a land far away...'
)
```
### Voice Control Parameters
- **Style**: Professional, casual, narrative, conversational
- **Pace**: Slow, normal, fast
- **Tone**: Friendly, serious, enthusiastic
- **Accent**: Natural language control (e.g., "British accent", "Southern drawl")
## Best Practices
### File Management
1. Use File API for files >20MB
2. Use File API for repeated queries (saves tokens)
3. Files auto-delete after 48 hours
4. Clean up manually when done:
```python
client.files.delete(name=myfile.name)
```
### Prompt Engineering
**Effective prompts**:
- "Transcribe from 02:30 to 03:29 in MM:SS format"
- "Identify speakers and extract dialogue with timestamps"
- "Summarize key points with relevant timestamps"
- "Transcribe and analyze sentiment for each speaker"
**Context improves accuracy**:
- "This is a medical interview - use appropriate terminology"
- "Transcribe this legal deposition with precise terminology"
- "This is a technical podcast about machine learning"
**Combined tasks**:
- "Transcribe and summarize in bullet points"
- "Extract key quotes with timestamps and speaker labels"
- "Transcribe and identify action items with timestamps"
### Cost Optimization
**Token calculation**:
- 1 minute audio = 1,920 tokens
- 1 hour audio = 115,200 tokens
- 9.5 hours = 1,094,400 tokens
**Model selection**:
- Use `gemini-2.5-flash` ($1/1M tokens) for most tasks
- Upgrade to `gemini-2.5-pro` ($3/1M tokens) for complex analysis
- For high-volume: `gemini-1.5-flash` ($0.70/1M tokens)
**Reduce costs**:
- Process only relevant segments using timestamps
- Use lower-quality audio when possible
- Batch multiple short files in one request
- Cache context for repeated queries
### Error Handling
```python
import time
def transcribe_with_retry(file_path, max_retries=3):
    """Upload an audio file and transcribe it, retrying on failure.

    Re-uploads the file on every attempt and backs off exponentially
    (1s, 2s, 4s, ...) between failures. The last exception is re-raised
    once all retries are exhausted.
    """
    attempt = 0
    while True:
        try:
            uploaded = client.files.upload(file=file_path)
            result = client.models.generate_content(
                model='gemini-2.5-flash',
                contents=['Transcribe with timestamps', uploaded]
            )
            return result.text
        except Exception:
            attempt += 1
            if attempt >= max_retries:
                raise
            delay = 2 ** (attempt - 1)
            print(f"Retry {attempt} after {delay}s")
            time.sleep(delay)
```
## Common Use Cases
### 1. Meeting Transcription
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Transcribe this meeting with:
1. Speaker labels
2. Timestamps for topic changes
3. Action items highlighted
''',
myfile
]
)
```
### 2. Podcast Summary
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Create podcast summary with:
1. Main topics with timestamps
2. Key quotes from each speaker
3. Recommended episode highlights
''',
myfile
]
)
```
### 3. Interview Analysis
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Analyze interview:
1. Questions asked with timestamps
2. Key responses from interviewee
3. Overall sentiment and tone
''',
myfile
]
)
```
### 4. Content Verification
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Verify audio content:
1. Check for specific keywords or phrases
2. Identify any compliance issues
3. Note any concerning statements with timestamps
''',
myfile
]
)
```
### 5. Multilingual Transcription
```python
# Gemini auto-detects language
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Transcribe this audio and translate to English if needed.', myfile]
)
```
## Token Costs
**Audio Input** (32 tokens/second):
- 1 minute = 1,920 tokens
- 10 minutes = 19,200 tokens
- 1 hour = 115,200 tokens
- 9.5 hours = 1,094,400 tokens
**Example costs** (Gemini 2.5 Flash at $1/1M):
- 1 hour audio: 115,200 tokens = $0.12
- Full day podcast (8 hours): 921,600 tokens = $0.92
## Limitations
- Maximum 9.5 hours per request
- Auto-downsampled to 16 Kbps mono (quality loss)
- Files expire after 48 hours
- No real-time streaming support
- Non-speech audio analysis is less accurate than speech transcription
---
## Related References
**Current**: Audio Processing
**Related Capabilities**:
- [Video Analysis](./video-analysis.md) - Extract audio from videos
- [Video Generation](./video-generation.md) - Generate videos with native audio
- [Image Understanding](./vision-understanding.md) - Analyze audio with visual context
**Back to**: [AI Multimodal Skill](../SKILL.md)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,141 @@
# MiniMax Generation Reference
## Overview
MiniMax provides image, video (Hailuo), speech (TTS), and music generation APIs.
Base URL: `https://api.minimax.io/v1` | Auth: `Bearer {MINIMAX_API_KEY}`
## Image Generation
**Endpoint**: `POST /image_generation`
**Models**: `image-01` (standard), `image-01-live` (enhanced)
**Rate**: 10 RPM | **Cost**: ~$0.03/image
```json
{
"model": "image-01",
"prompt": "A girl looking into the distance",
"aspect_ratio": "16:9",
"n": 2,
"response_format": "url",
"prompt_optimizer": true,
"subject_reference": [{"type": "character", "image_file": "url", "weight": 0.8}]
}
```
**Aspect ratios**: 1:1, 16:9, 4:3, 3:2, 2:3, 3:4, 9:16, 21:9
**Custom dims**: 512-2048px (divisible by 8)
**Batch**: 1-9 images per request
## Video Generation (Hailuo)
**Endpoints**: POST `/video_generation` → GET `/query/video_generation` → GET `/files/retrieve`
**Async workflow**: Submit task → poll every 10s → download file (URL valid 9h)
### Models
| Model | Features | Resolution |
|-------|----------|-----------|
| `MiniMax-Hailuo-2.3` | Text/image-to-video | 720p/1080p |
| `MiniMax-Hailuo-2.3-Fast` | Same, 50% faster+cheaper | 720p/1080p |
| `MiniMax-Hailuo-02` | First+last frame mode | 720p |
| `S2V-01` | Subject reference | 720p |
**Rate**: 5 RPM | **Cost**: $0.25 (6s/768p), $0.52 (10s/768p)
```json
// Text-to-video
{"prompt": "A dancer", "model": "MiniMax-Hailuo-2.3", "duration": 6, "resolution": "1080P"}
// Image-to-video
{"prompt": "Scene desc", "first_frame_image": "url", "model": "MiniMax-Hailuo-2.3", "duration": 6}
// First+last frame
{"prompt": "Transition", "first_frame_image": "url", "last_frame_image": "url", "model": "MiniMax-Hailuo-02"}
// Subject reference
{"prompt": "Scene with character", "subject_reference": [{"type": "character", "image": ["url"]}], "model": "S2V-01"}
```
## Speech/TTS
**Endpoint**: `POST /speech/speech_t2a_input`
**Models**: `speech-2.8-hd` (best), `speech-2.8-turbo` (fast), `speech-2.6-hd/turbo`, `speech-02-hd/turbo`
**Rate**: 60 RPM | **Cost**: $30-50/1M chars
```json
{
"model": "speech-2.8-hd",
"text": "Your text here",
"voice": "English_Warm_Bestie",
"emotion": "happy",
"rate": 1.0,
"volume": 1.0,
"pitch": 1.0,
"output_format": "mp3"
}
```
**Voices**: 300+ system voices, 40+ languages
**Emotions**: happy, sad, angry, fearful, disgusted, surprised, neutral
**Formats**: mp3, wav, pcm, flac
**Text limit**: 10,000 chars
### Voice Cloning
```json
POST /voice_clone
{"audio_url": "https://sample.wav", "clone_name": "my_voice"}
```
Requires 10+ seconds of reference audio. Rate: 60 RPM.
## Music Generation
**Endpoint**: `POST /music_generation`
**Models**: `music-2.5` (latest, vocals+accompaniment, 4min songs)
**Rate**: 120 RPM | **Cost**: $0.03-0.075/generation
```json
{
"model": "music-2.5",
"lyrics": "Verse 1\nLine one\n\n[Chorus]\nChorus line",
"prompt": "Upbeat pop with electronic elements",
"output_format": "url",
"audio_setting": {"sample_rate": 44100, "bitrate": 128000, "format": "mp3"}
}
```
**Lyrics**: 1-3500 chars, supports structure tags ([Verse], [Chorus], etc.)
**Prompt**: 0-2000 chars, style/mood description
**Sample rates**: 16000, 24000, 32000, 44100 Hz
**Bitrates**: 32000, 64000, 128000, 256000 bps
## Error Codes
| Code | Meaning |
|------|---------|
| 0 | Success |
| 1002 | Rate limit exceeded |
| 1008 | Insufficient balance |
| 2013 | Invalid parameters |
## CLI Examples
```bash
# Image
python minimax_cli.py --task generate --prompt "A cyberpunk city" --model image-01 --aspect-ratio 16:9
# Video
python minimax_cli.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3 --duration 6
# Speech
python minimax_cli.py --task generate-speech --text "Hello world" --model speech-2.8-hd --voice English_Warm_Bestie --emotion happy
# Music
python minimax_cli.py --task generate-music --lyrics "La la la\nOh yeah" --prompt "upbeat pop" --model music-2.5
```
## References
- [API Overview](https://platform.minimax.io/docs/api-reference/api-overview)
- [Video Guide](https://platform.minimax.io/docs/guides/video-generation)
- [Speech API](https://platform.minimax.io/docs/api-reference/speech-t2a-intro)
- [Music API](https://platform.minimax.io/docs/api-reference/music-generation)

View File

@@ -0,0 +1,311 @@
# Music Generation Reference
Real-time music generation using Lyria RealTime via WebSocket API.
## Core Capabilities
- **Real-time streaming**: Bidirectional WebSocket for continuous generation
- **Dynamic control**: Modify music in real-time during generation
- **Style steering**: Genre, mood, instrumentation guidance
- **Audio output**: 48kHz stereo 16-bit PCM
## Model
**Lyria RealTime** (Experimental)
- WebSocket-based streaming
- Real-time parameter adjustment
- Instrumental only (no vocals)
- Watermarked output
## Quick Start
### Python
```python
from google import genai
import asyncio
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
async def generate_music():
    """Stream a generated track from Lyria RealTime and return raw PCM bytes."""
    async with client.aio.live.music.connect() as session:
        # Weighted prompts steer the style; weights range 0.0-1.0.
        await session.set_weighted_prompts([
            {"prompt": "Upbeat corporate background music", "weight": 0.8},
            {"prompt": "Modern electronic elements", "weight": 0.5}
        ])
        # Generation parameters: guidance 0.0-6.0, bpm 60-200,
        # density/brightness 0.0-1.0 (see the parameter table below).
        await session.set_music_generation_config(
            guidance=4.0,
            bpm=120,
            density=0.6,
            brightness=0.5
        )
        # Begin playback, then drain the audio stream into one buffer.
        await session.play()
        buffers = [chunk.audio_data async for chunk in session]
        return b''.join(buffers)
```
### JavaScript
```javascript
const client = new GenaiClient({ apiKey: process.env.GEMINI_API_KEY });
async function generateMusic() {
    // Open a Lyria RealTime streaming session.
    const session = await client.live.music.connect();
    // Weighted prompts steer the style; weights range 0.0-1.0.
    await session.setWeightedPrompts([
        { prompt: "Calm ambient background", weight: 0.9 },
        { prompt: "Nature sounds influence", weight: 0.3 }
    ]);
    // Generation parameters: guidance 0.0-6.0, bpm 60-200,
    // density/brightness 0.0-1.0 (see the parameter table below).
    await session.setMusicGenerationConfig({
        guidance: 3.5,
        bpm: 80,
        density: 0.4,
        brightness: 0.6
    });
    session.onAudio((audioChunk) => {
        // Process 48kHz stereo PCM audio
        // NOTE(review): `audioBuffer` is not declared in this example —
        // the caller must define it (e.g. `const audioBuffer = [];`) first.
        audioBuffer.push(audioChunk);
    });
    // Start generation; chunks arrive via the onAudio callback above.
    await session.play();
}
```
## Configuration Parameters
| Parameter | Range | Default | Description |
|-----------|-------|---------|-------------|
| `guidance` | 0.0-6.0 | 4.0 | Prompt adherence (higher = stricter) |
| `bpm` | 60-200 | 120 | Tempo in beats per minute |
| `density` | 0.0-1.0 | 0.5 | Note/sound density |
| `brightness` | 0.0-1.0 | 0.5 | Tonal quality (higher = brighter) |
| `scale` | 12 keys | C Major | Musical key |
| `mute_bass` | bool | false | Remove bass elements |
| `mute_drums` | bool | false | Remove drum elements |
| `mode` | enum | QUALITY | QUALITY, DIVERSITY, VOCALIZATION |
| `temperature` | 0.0-2.0 | 1.0 | Sampling randomness |
| `top_k` | int | 40 | Sampling top-k |
| `seed` | int | random | Reproducibility seed |
## Weighted Prompts
Control generation direction with weighted prompts:
```python
await session.set_weighted_prompts([
{"prompt": "Main style description", "weight": 1.0}, # Primary
{"prompt": "Secondary influence", "weight": 0.5}, # Supporting
{"prompt": "Subtle element", "weight": 0.2} # Accent
])
```
**Weight guidelines**:
- 0.8-1.0: Dominant influence
- 0.5-0.7: Secondary contribution
- 0.2-0.4: Subtle accent
- 0.0-0.1: Minimal effect
## Style Prompts by Use Case
### Corporate/Marketing
```python
prompts = [
{"prompt": "Professional corporate background music, modern", "weight": 0.9},
{"prompt": "Uplifting, optimistic mood", "weight": 0.6},
{"prompt": "Clean production, minimal complexity", "weight": 0.5}
]
config = {"bpm": 100, "brightness": 0.6, "density": 0.5}
```
### Social Media/Short-form
```python
prompts = [
{"prompt": "Trending pop electronic beat", "weight": 0.9},
{"prompt": "Energetic, catchy rhythm", "weight": 0.7},
{"prompt": "Bass-heavy, punchy", "weight": 0.5}
]
config = {"bpm": 128, "brightness": 0.7, "density": 0.7}
```
### Emotional/Cinematic
```python
prompts = [
{"prompt": "Cinematic orchestral underscore", "weight": 0.9},
{"prompt": "Emotional, inspiring", "weight": 0.7},
{"prompt": "Building tension and release", "weight": 0.5}
]
config = {"bpm": 70, "brightness": 0.4, "density": 0.4}
```
### Ambient/Background
```python
prompts = [
{"prompt": "Calm ambient soundscape", "weight": 0.9},
{"prompt": "Minimal, atmospheric", "weight": 0.6},
{"prompt": "Lo-fi textures", "weight": 0.4}
]
config = {"bpm": 80, "brightness": 0.4, "density": 0.3}
```
## Real-time Transitions
Smoothly transition between styles during generation:
```python
async def dynamic_music_generation():
async with client.aio.live.music.connect() as session:
# Start with intro style
await session.set_weighted_prompts([
{"prompt": "Soft ambient intro", "weight": 0.9}
])
await session.play()
# Collect intro (4 seconds)
intro_chunks = []
for _ in range(192): # ~4 seconds at 48kHz
chunk = await session.__anext__()
intro_chunks.append(chunk.audio_data)
# Transition to main section
await session.set_weighted_prompts([
{"prompt": "Building energy", "weight": 0.7},
{"prompt": "Full beat drop", "weight": 0.5}
])
# Continue with new style...
```
## Output Specifications
- **Format**: Raw 16-bit PCM
- **Sample Rate**: 48,000 Hz
- **Channels**: 2 (stereo)
- **Bit Depth**: 16 bits
- **Watermarking**: Always enabled (SynthID)
### Save to WAV
```python
import wave
def save_pcm_to_wav(pcm_data, filename):
with wave.open(filename, 'wb') as wav_file:
wav_file.setnchannels(2) # Stereo
wav_file.setsampwidth(2) # 16-bit
wav_file.setframerate(48000) # 48kHz
wav_file.writeframes(pcm_data)
```
### Convert to MP3
```bash
# Using FFmpeg
ffmpeg -f s16le -ar 48000 -ac 2 -i input.pcm output.mp3
```
## Integration with Video Production
### Generate Background Music for Video
```python
async def generate_video_background(duration_seconds, mood):
"""Generate background music matching video length"""
# Configure for video background
prompts = [
{"prompt": f"{mood} background music for video", "weight": 0.9},
{"prompt": "Non-distracting, supportive underscore", "weight": 0.6}
]
async with client.aio.live.music.connect() as session:
await session.set_weighted_prompts(prompts)
await session.set_music_generation_config(
guidance=4.0,
density=0.4, # Keep sparse for background
brightness=0.5
)
await session.play()
# Calculate chunks needed (48kHz stereo = 192000 bytes/second)
total_chunks = duration_seconds * 48000 // 512 # Chunk size estimate
audio_data = []
async for chunk in session:
audio_data.append(chunk.audio_data)
if len(audio_data) >= total_chunks:
break
return b''.join(audio_data)
```
### Sync with Storyboard Timing
```python
async def generate_scene_music(scenes):
"""Generate music with transitions matching scene changes"""
all_audio = []
async with client.aio.live.music.connect() as session:
for scene in scenes:
# Update style for each scene
await session.set_weighted_prompts([
{"prompt": scene['mood'], "weight": 0.9},
{"prompt": scene['style'], "weight": 0.5}
])
if scene['index'] == 0:
await session.play()
# Collect audio for scene duration
chunks = int(scene['duration'] * 48000 / 512)
for _ in range(chunks):
chunk = await session.__anext__()
all_audio.append(chunk.audio_data)
return b''.join(all_audio)
```
## Limitations
- **Instrumental only**: No vocal/singing generation
- **WebSocket required**: Real-time streaming connection
- **Safety filtering**: Prompts undergo safety review
- **Watermarking**: All output contains SynthID watermark
- **Experimental**: API may change
## Best Practices
1. **Buffer audio**: Implement robust buffering for smooth playback
2. **Gradual transitions**: Avoid drastic prompt changes mid-stream
3. **Sparse for backgrounds**: Lower density for video backgrounds
4. **Test prompts**: Iterate on prompt combinations
5. **Cross-fade transitions**: Blend audio at style changes
6. **Match video mood**: Align music tempo/energy with visuals
## Resources
- [Lyria RealTime Docs](https://ai.google.dev/gemini-api/docs/music-generation)
- [Audio Processing Guide](./audio-processing.md)
- [Video Generation](./video-generation.md)
---
**Related**: [Audio Processing](./audio-processing.md) | [Video Generation](./video-generation.md)
**Back to**: [AI Multimodal Skill](../SKILL.md)

View File

@@ -0,0 +1,515 @@
# Video Analysis Reference
Comprehensive guide for video understanding, temporal analysis, and YouTube processing using Gemini API.
> **Note**: This guide covers video *analysis* (understanding existing videos). For video *generation* (creating new videos), see [Video Generation Reference](./video-generation.md).
## Core Capabilities
- **Video Summarization**: Create concise summaries
- **Question Answering**: Answer specific questions about content
- **Transcription**: Audio transcription with visual descriptions
- **Timestamp References**: Query specific moments (MM:SS format)
- **Video Clipping**: Process specific segments
- **Scene Detection**: Identify scene changes and transitions
- **Multiple Videos**: Compare up to 10 videos (2.5+)
- **YouTube Support**: Analyze YouTube videos directly
- **Custom Frame Rate**: Adjust FPS sampling
## Supported Formats
- MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV, 3GPP
## Model Selection
### Gemini 3 Series (Latest)
- **gemini-3-pro-preview**: Latest, agentic workflows, 1M context, dynamic thinking
### Gemini 2.5 Series (Recommended)
- **gemini-2.5-pro**: Best quality, 1M-2M context
- **gemini-2.5-flash**: Balanced, 1M-2M context (recommended)
### Context Windows
- **2M token models**: ~2 hours (default) or ~6 hours (low-res)
- **1M token models**: ~1 hour (default) or ~3 hours (low-res)
## Basic Video Analysis
### Local Video
```python
from google import genai
import os
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
# Upload video (File API for >20MB)
myfile = client.files.upload(file='video.mp4')
# Wait for processing
import time
while myfile.state.name == 'PROCESSING':
time.sleep(1)
myfile = client.files.get(name=myfile.name)
if myfile.state.name == 'FAILED':
raise ValueError('Video processing failed')
# Analyze
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Summarize this video in 3 key points', myfile]
)
print(response.text)
```
### YouTube Video
```python
from google.genai import types
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Summarize the main topics discussed',
types.Part.from_uri(
uri='https://www.youtube.com/watch?v=VIDEO_ID',
mime_type='video/mp4'
)
]
)
```
### Inline Video (<20MB)
```python
with open('short-clip.mp4', 'rb') as f:
video_bytes = f.read()
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'What happens in this video?',
types.Part.from_bytes(data=video_bytes, mime_type='video/mp4')
]
)
```
## Advanced Features
### Video Clipping
```python
# Analyze specific time range
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Summarize this segment',
types.Part.from_video_metadata(
file_uri=myfile.uri,
start_offset='40s',
end_offset='80s'
)
]
)
```
### Custom Frame Rate
```python
# Lower FPS for static content (saves tokens)
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Analyze this presentation',
types.Part.from_video_metadata(
file_uri=myfile.uri,
fps=0.5 # Sample every 2 seconds
)
]
)
# Higher FPS for fast-moving content
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Analyze rapid movements in this sports video',
types.Part.from_video_metadata(
file_uri=myfile.uri,
fps=5 # Sample 5 times per second
)
]
)
```
### Multiple Videos (2.5+)
```python
video1 = client.files.upload(file='demo1.mp4')
video2 = client.files.upload(file='demo2.mp4')
# Wait for processing
for video in [video1, video2]:
while video.state.name == 'PROCESSING':
time.sleep(1)
video = client.files.get(name=video.name)
response = client.models.generate_content(
model='gemini-2.5-pro',
contents=[
'Compare these two product demos. Which explains features better?',
video1,
video2
]
)
```
## Temporal Understanding
### Timestamp-Based Questions
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'What happens at 01:15 and how does it relate to 02:30?',
myfile
]
)
```
### Timeline Creation
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Create a timeline with timestamps:
- Key events
- Scene changes
- Important moments
Format: MM:SS - Description
''',
myfile
]
)
```
### Scene Detection
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Identify all scene changes with timestamps and describe each scene',
myfile
]
)
```
## Transcription
### Basic Transcription
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Transcribe the audio from this video',
myfile
]
)
```
### With Visual Descriptions
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Transcribe with visual context:
- Audio transcription
- Visual descriptions of important moments
- Timestamps for salient events
''',
myfile
]
)
```
### Speaker Identification
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Transcribe with speaker labels and timestamps',
myfile
]
)
```
## Common Use Cases
### 1. Video Summarization
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Summarize this video:
1. Main topic and purpose
2. Key points with timestamps
3. Conclusion or call-to-action
''',
myfile
]
)
```
### 2. Educational Content
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Create educational materials:
1. List key concepts taught
2. Create 5 quiz questions with answers
3. Provide timestamp for each concept
''',
myfile
]
)
```
### 3. Action Detection
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'List all actions performed in this tutorial with timestamps',
myfile
]
)
```
### 4. Content Moderation
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Review video content:
1. Identify any problematic content
2. Note timestamps of concerns
3. Provide content rating recommendation
''',
myfile
]
)
```
### 5. Interview Analysis
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Analyze interview:
1. Questions asked (timestamps)
2. Key responses
3. Candidate body language and demeanor
4. Overall assessment
''',
myfile
]
)
```
### 6. Sports Analysis
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Analyze sports video:
1. Key plays with timestamps
2. Player movements and positioning
3. Game strategy observations
''',
types.Part.from_video_metadata(
file_uri=myfile.uri,
fps=5 # Higher FPS for fast action
)
]
)
```
## YouTube Specific Features
### Public Video Requirements
- Video must be public (not private or unlisted)
- No age-restricted content
- Valid video ID required
### Usage Example
```python
# YouTube URL
youtube_uri = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Create chapter markers with timestamps',
types.Part.from_uri(uri=youtube_uri, mime_type='video/mp4')
]
)
```
### Rate Limits
- **Free tier**: 8 hours of YouTube video per day
- **Paid tier**: No length-based limits
- Public videos only
## Token Calculation
Video tokens depend on resolution and FPS:
**Default resolution** (~300 tokens/second):
- 1 minute = 18,000 tokens
- 10 minutes = 180,000 tokens
- 1 hour = 1,080,000 tokens
**Low resolution** (~100 tokens/second):
- 1 minute = 6,000 tokens
- 10 minutes = 60,000 tokens
- 1 hour = 360,000 tokens
**Context windows**:
- 2M tokens ≈ 2 hours (default) or 6 hours (low-res)
- 1M tokens ≈ 1 hour (default) or 3 hours (low-res)
## Best Practices
### File Management
1. Use File API for videos >20MB (most videos)
2. Wait for ACTIVE state before analysis
3. Files auto-delete after 48 hours
4. Clean up manually:
```python
client.files.delete(name=myfile.name)
```
### Optimization Strategies
**Reduce token usage**:
- Process specific segments using start/end offsets
- Use lower FPS for static content
- Use low-resolution mode for long videos
- Split very long videos into chunks
**Improve accuracy**:
- Provide context in prompts
- Use higher FPS for fast-moving content
- Use Pro model for complex analysis
- Be specific about what to extract
### Prompt Engineering
**Effective prompts**:
- "Summarize key points with timestamps in MM:SS format"
- "Identify all scene changes and describe each scene"
- "Extract action items mentioned with timestamps"
- "Compare these two videos on: X, Y, Z criteria"
**Structured output**:
```python
from pydantic import BaseModel
from typing import List
class VideoEvent(BaseModel):
timestamp: str # MM:SS format
description: str
category: str
class VideoAnalysis(BaseModel):
summary: str
events: List[VideoEvent]
duration: str
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Analyze this video', myfile],
config=genai.types.GenerateContentConfig(
response_mime_type='application/json',
response_schema=VideoAnalysis
)
)
```
### Error Handling
```python
import time
def upload_and_process_video(file_path, max_wait=300):
"""Upload video and wait for processing"""
myfile = client.files.upload(file=file_path)
elapsed = 0
while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
time.sleep(5)
myfile = client.files.get(name=myfile.name)
elapsed += 5
if myfile.state.name == 'FAILED':
raise ValueError(f'Video processing failed: {myfile.state.name}')
if myfile.state.name == 'PROCESSING':
raise TimeoutError(f'Processing timeout after {max_wait}s')
return myfile
```
## Cost Optimization
**Token costs** (Gemini 2.5 Flash at $1/1M):
- 1 minute video (default): 18,000 tokens = $0.018
- 10 minute video: 180,000 tokens = $0.18
- 1 hour video: 1,080,000 tokens = $1.08
**Strategies**:
- Use video clipping for specific segments
- Lower FPS for static content
- Use low-resolution mode for long videos
- Batch related queries on same video
- Use context caching for repeated queries
## Limitations
- Maximum 6 hours (low-res) or 2 hours (default)
- YouTube videos must be public
- No live streaming analysis
- Files expire after 48 hours
- Processing time varies by video length
- No real-time processing
- Limited to 10 videos per request (2.5+)
---
## Related References
**Current**: Video Analysis
**Related Capabilities**:
- [Video Generation](./video-generation.md) - Creating videos from text/images
- [Audio Processing](./audio-processing.md) - Extract and analyze audio tracks
- [Image Understanding](./vision-understanding.md) - Analyze individual frames
**Back to**: [AI Multimodal Skill](../SKILL.md)

View File

@@ -0,0 +1,457 @@
# Video Generation Reference
Comprehensive guide for video creation using Veo models via Gemini API.
## Core Capabilities
- **Text-to-Video**: Generate 8-second videos from text prompts
- **Image-to-Video**: Animate images with text direction
- **Video Extension**: Continue previously generated videos
- **Frame Control**: Precise camera movements and effects
- **Native Audio**: Synchronized audio generation
- **Multiple Resolutions**: 720p and 1080p output
- **Aspect Ratios**: 16:9, 9:16, 1:1
## Models
### Veo 3.1 Preview (Latest)
**veo-3.1-generate-preview** - Latest with advanced controls
- Frame-specific generation
- Up to 3 reference images for image-to-video
- Video extension capability
- Native audio generation
- Resolution: 720p, 1080p
- Duration: 8 seconds at 24fps
- Status: Preview (API may change)
- Updated: September 2025
**veo-3.1-fast-generate-preview** - Speed-optimized
- Optimized for business use cases
- Programmatic ad creation
- Social media content
- Same features as standard but faster
- Status: Preview
- Updated: September 2025
### Veo 3.0 Stable
**veo-3.0-generate-001** - Production-ready
- Native audio generation
- Text-to-video and image-to-video
- 720p and 1080p (16:9 only)
- 8 seconds at 24fps
- Status: Stable
- Updated: July 2025
**veo-3.0-fast-generate-001** - Stable fast variant
- Speed-optimized stable version
- Same reliability as 3.0
- Status: Stable
- Updated: July 2025
## Model Comparison
| Model | Speed | Features | Audio | Status | Best For |
|-------|-------|----------|-------|--------|----------|
| veo-3.1-preview | Medium | All | ✓ | Preview | Latest features |
| veo-3.1-fast | Fast | All | ✓ | Preview | Business/speed |
| veo-3.0-001 | Medium | Standard | ✓ | Stable | Production |
| veo-3.0-fast | Fast | Standard | ✓ | Stable | Production/speed |
## Quick Start
### Text-to-Video
```python
from google import genai
from google.genai import types
import os
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
# Basic generation
response = client.models.generate_video(
model='veo-3.1-generate-preview',
prompt='A serene beach at sunset with gentle waves rolling onto the shore',
config=types.VideoGenerationConfig(
resolution='1080p',
aspect_ratio='16:9'
)
)
# Save video
with open('output.mp4', 'wb') as f:
f.write(response.video.data)
```
### Image-to-Video
```python
import PIL.Image
# Load reference image
ref_image = PIL.Image.open('beach.jpg')
# Animate the image
response = client.models.generate_video(
model='veo-3.1-generate-preview',
prompt='Camera slowly pans across the scene from left to right',
reference_images=[ref_image],
config=types.VideoGenerationConfig(
resolution='1080p'
)
)
```
### Multiple Reference Images
```python
# Use up to 3 reference images for complex scenes
img1 = PIL.Image.open('foreground.jpg')
img2 = PIL.Image.open('background.jpg')
img3 = PIL.Image.open('subject.jpg')
response = client.models.generate_video(
model='veo-3.1-generate-preview',
prompt='Combine these elements into a cohesive animated scene',
reference_images=[img1, img2, img3],
config=types.VideoGenerationConfig(
resolution='1080p',
aspect_ratio='16:9'
)
)
```
## Advanced Features
### Video Extension
```python
# Continue from previously generated video
previous_video = open('part1.mp4', 'rb').read()
response = client.models.extend_video(
model='veo-3.1-generate-preview',
video=previous_video,
prompt='The scene transitions to nighttime with stars appearing'
)
```
### Frame Control
```python
# Precise camera movements
response = client.models.generate_video(
model='veo-3.1-generate-preview',
prompt='A mountain landscape',
config=types.VideoGenerationConfig(
resolution='1080p',
camera_motion='zoom_in', # Options: zoom_in, zoom_out, pan_left, pan_right, tilt_up, tilt_down, static
motion_speed='slow' # Options: slow, medium, fast
)
)
```
## Prompt Engineering
### Effective Video Prompts
**Structure**:
1. **Subject**: What's in the scene
2. **Action**: What's happening
3. **Camera**: How it's filmed
4. **Style**: Visual treatment
5. **Timing**: Pacing details
**Example**:
```
"A hummingbird [subject] hovers near a red flower, then flies away [action].
Slow-motion close-up shot [camera] with vibrant colors and soft focus background [style].
Gentle, peaceful pacing [timing]."
```
### Action Verbs
**Movement**:
- "walks", "runs", "flies", "swims", "dances"
- "rotates", "spins", "rolls", "bounces"
- "emerges", "disappears", "transforms"
**Camera**:
- "zoom in on", "pull back from", "follow"
- "orbit around", "track alongside"
- "tilt up to reveal", "pan across"
**Transitions**:
- "gradually changes from... to..."
- "morphs into", "dissolves into"
- "cuts to", "fades to"
### Timing Control
```python
# Explicit timing in prompt
prompt = '''
0-2s: Close-up of a seed in soil
2-4s: Time-lapse of sprout emerging
4-6s: Growing into a small plant
6-8s: Zoom out to show garden context
'''
```
## Configuration Options
### Resolution
```python
config = types.VideoGenerationConfig(
resolution='1080p' # Options: 720p, 1080p
)
```
**Considerations**:
- 1080p: Higher quality, longer generation time, larger file
- 720p: Faster generation, smaller file, good for drafts
### Aspect Ratios
```python
config = types.VideoGenerationConfig(
aspect_ratio='16:9' # Options: 16:9, 9:16, 1:1
)
```
**Use Cases**:
- 16:9: Landscape, YouTube, traditional video
- 9:16: Mobile, TikTok, Instagram Stories
- 1:1: Square, Instagram feed, versatile
### Audio Control
```python
config = types.VideoGenerationConfig(
include_audio=True # Default: True
)
```
Native audio is generated automatically and synchronized with video content.
## Best Practices
### 1. Prompt Quality
**Be specific**:
- ❌ "A person walking"
- ✅ "A young woman in a red coat walking through a park in autumn"
**Include motion**:
- ❌ "A city street"
- ✅ "A busy city street with cars passing and people crossing"
**Specify camera**:
- ❌ "A mountain"
- ✅ "Aerial drone shot slowly ascending over a snow-capped mountain"
### 2. Reference Images
**Quality**:
- Use high-resolution images (1080p+)
- Clear, well-lit subjects
- Minimal motion blur
**Composition**:
- Match desired final aspect ratio
- Leave room for motion/movement
- Consider camera angle in prompt
### 3. Performance Optimization
**Generation Time**:
- 720p: ~30-60 seconds
- 1080p: ~60-120 seconds
- Fast models: 30-50% faster
**Strategies**:
- Use 720p for iteration/drafts
- Use fast models for rapid feedback
- Batch multiple requests
- Use async processing for UI responsiveness
## Common Use Cases
### 1. Product Demos
```python
response = client.models.generate_video(
model='veo-3.0-fast-generate-001',
prompt='''
Professional product video:
- Sleek smartphone rotating on a pedestal
- Clean white background with soft shadows
- Slow 360-degree rotation
- Spotlight highlighting premium design
- Modern, minimalist aesthetic
''',
config=types.VideoGenerationConfig(
resolution='1080p',
aspect_ratio='1:1'
)
)
```
### 2. Social Media Content
```python
response = client.models.generate_video(
model='veo-3.1-fast-generate-preview',
prompt='''
Trendy social media clip:
- Text overlay "NEW ARRIVAL" appears
- Fashion product showcase
- Quick cuts and dynamic camera
- Vibrant colors, high energy
- Upbeat pacing
''',
config=types.VideoGenerationConfig(
resolution='1080p',
aspect_ratio='9:16' # Mobile
)
)
```
### 3. Explainer Animations
```python
response = client.models.generate_video(
model='veo-3.1-generate-preview',
prompt='''
Educational animation:
- Simple diagram illustrating data flow
- Arrows and icons animating in sequence
- Clean, clear visual hierarchy
- Smooth transitions between steps
- Professional corporate style
''',
config=types.VideoGenerationConfig(
resolution='720p',
aspect_ratio='16:9'
)
)
```
## Safety & Content Policy
### Safety Settings
```python
config = types.VideoGenerationConfig(
safety_settings=[
types.SafetySetting(
category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
)
]
)
```
### Prohibited Content
- Violence, gore, harm
- Sexually explicit content
- Hate speech, harassment
- Copyrighted characters/brands
- Real people (without consent)
- Misleading/deceptive content
## Limitations
- **Duration**: Fixed 8 seconds (as of Sept 2025)
- **Frame Rate**: 24fps only
- **File Size**: ~5-20MB per video
- **Generation Time**: 30s-2min depending on resolution
- **Reference Images**: Max 3 images
- **Preview Status**: API may change (3.1 models)
- **Audio**: Cannot upload custom audio (native only)
- **No real-time**: Pre-generation required
## Troubleshooting
### Long Generation Times
```python
import time
# Track generation progress
start = time.time()
response = client.models.generate_video(...)
duration = time.time() - start
print(f"Generated in {duration:.1f}s")
```
**Expected times**:
- Fast models + 720p: 30-45s
- Standard models + 720p: 45-90s
- Fast models + 1080p: 45-60s
- Standard models + 1080p: 60-120s
### Safety Filter Blocking
```python
try:
response = client.models.generate_video(...)
except Exception as e:
if 'safety' in str(e).lower():
print("Video blocked by safety filters")
# Modify prompt and retry
```
### Quota Exceeded
```python
# Implement exponential backoff
import time
def generate_with_retry(model, prompt, max_retries=3):
for attempt in range(max_retries):
try:
return client.models.generate_video(model=model, prompt=prompt)
except Exception as e:
if '429' in str(e): # Rate limit
wait = 2 ** attempt
print(f"Rate limited, waiting {wait}s...")
time.sleep(wait)
else:
raise
raise Exception("Max retries exceeded")
```
## Cost Estimation
**Pricing**: TBD (preview models)
**Estimated based on compute**:
- Fast + 720p: ~$0.05-$0.10 per video
- Standard + 1080p: ~$0.15-$0.25 per video
**Monitor**: https://ai.google.dev/pricing
## Resources
- [Veo API Docs](https://ai.google.dev/gemini-api/docs/video)
- [Video Generation Guide](https://ai.google.dev/gemini-api/docs/video#model-versions)
- [Content Policy](https://ai.google.dev/gemini-api/docs/safety)
- [Get API Key](https://aistudio.google.com/apikey)
---
## Related References
**Current**: Video Generation
**Related Capabilities**:
- [Video Analysis](./video-analysis.md) - Understanding existing videos
- [Image Generation](./image-generation.md) - Creating static images
- [Image Understanding](./vision-understanding.md) - Analyzing reference images
**Back to**: [AI Multimodal Skill](../SKILL.md)

View File

@@ -0,0 +1,492 @@
# Vision Understanding Reference
Comprehensive guide for image analysis, object detection, and visual understanding using Gemini API.
## Core Capabilities
- **Captioning**: Generate descriptive text for images
- **Classification**: Categorize and identify content
- **Visual Q&A**: Answer questions about images
- **Object Detection**: Locate objects with bounding boxes (2.0+)
- **Segmentation**: Create pixel-level masks (2.5+)
- **Multi-image**: Compare up to 3,600 images
- **OCR**: Extract text from images
- **Document Understanding**: Process PDFs with vision
## Supported Formats
- **Images**: PNG, JPEG, WEBP, HEIC, HEIF
- **Documents**: PDF (up to 1,000 pages)
- **Size Limits**:
- Inline: 20MB max total request
- File API: 2GB per file
- Max images: 3,600 per request
## Model Selection
### Gemini 2.5 Series
- **gemini-2.5-pro**: Best quality, segmentation + detection
- **gemini-2.5-flash**: Fast, efficient, all features
- **gemini-2.5-flash-lite**: Lightweight, all features
### Feature Requirements
- **Segmentation**: Requires 2.5+ models
- **Object Detection**: Requires 2.0+ models
- **Multi-image**: All models (up to 3,600 images)
## Basic Image Analysis
### Image Captioning
```python
from google import genai
import os
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
# Local file
with open('image.jpg', 'rb') as f:
img_bytes = f.read()
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Describe this image in detail',
genai.types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
]
)
print(response.text)
```
### Image Classification
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Classify this image. Provide category and confidence level.',
img_part
]
)
```
### Visual Question Answering
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'How many people are in this image and what are they doing?',
img_part
]
)
```
## Advanced Features
### Object Detection (2.0+)
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Detect all objects in this image and provide bounding boxes',
img_part
]
)
# Returns bounding box coordinates: [ymin, xmin, ymax, xmax]
# Normalized to [0, 1000] range
```
### Segmentation (2.5+)
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Create a segmentation mask for all people in this image',
img_part
]
)
# Returns pixel-level masks for requested objects
```
### Multi-Image Comparison
```python
import PIL.Image
img1 = PIL.Image.open('photo1.jpg')
img2 = PIL.Image.open('photo2.jpg')
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Compare these two images. What are the differences?',
img1,
img2
]
)
```
### OCR and Text Extraction
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Extract all visible text from this image',
img_part
]
)
```
## Input Methods
### Inline Data (<20MB)
```python
from google.genai import types
# From file
with open('image.jpg', 'rb') as f:
img_bytes = f.read()
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Analyze this image',
types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
]
)
```
### PIL Image
```python
import PIL.Image
img = PIL.Image.open('photo.jpg')
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['What is in this image?', img]
)
```
### File API (>20MB or Reuse)
```python
# Upload once
myfile = client.files.upload(file='large-image.jpg')
# Use multiple times
response1 = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Describe this image', myfile]
)
response2 = client.models.generate_content(
model='gemini-2.5-flash',
contents=['What colors dominate this image?', myfile]
)
```
### URL (Public Images)
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Analyze this image',
types.Part.from_uri(
uri='https://example.com/image.jpg',
mime_type='image/jpeg'
)
]
)
```
## Token Calculation
Images consume tokens based on size:
**Small images** (≤384px both dimensions): 258 tokens
**Large images**: Tiled into 768×768 chunks, 258 tokens each
**Formula**:
```
crop_unit = floor(min(width, height) / 1.5)
tiles = (width / crop_unit) × (height / crop_unit)
total_tokens = tiles × 258
```
**Examples**:
- 256×256: 258 tokens (small)
- 512×512: 4 tiles = 1,032 tokens (exceeds the 384px small-image threshold, so it is tiled)
- 960×540: 6 tiles = 1,548 tokens
- 1920×1080: 6 tiles = 1,548 tokens
- 3840×2160 (4K): 24 tiles = 6,192 tokens
## Structured Output
### JSON Schema Output
```python
from pydantic import BaseModel
from typing import List
class ObjectDetection(BaseModel):
object_name: str
confidence: float
bounding_box: List[int] # [ymin, xmin, ymax, xmax]
class ImageAnalysis(BaseModel):
description: str
objects: List[ObjectDetection]
scene_type: str
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Analyze this image', img_part],
config=genai.types.GenerateContentConfig(
response_mime_type='application/json',
response_schema=ImageAnalysis
)
)
result = ImageAnalysis.model_validate_json(response.text)
```
## Multi-Image Analysis
### Batch Processing
```python
images = [
PIL.Image.open(f'image{i}.jpg')
for i in range(10)
]
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=['Analyze these images and find common themes'] + images
)
```
### Image Comparison
```python
before = PIL.Image.open('before.jpg')
after = PIL.Image.open('after.jpg')
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Compare before and after. List all visible changes.',
before,
after
]
)
```
### Visual Search
```python
reference = PIL.Image.open('target.jpg')
candidates = [PIL.Image.open(f'option{i}.jpg') for i in range(5)]
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Find which candidate images contain objects similar to the reference',
reference
] + candidates
)
```
## Best Practices
### Image Quality
1. **Resolution**: Use clear, non-blurry images
2. **Rotation**: Verify correct orientation
3. **Lighting**: Ensure good contrast and lighting
4. **Size optimization**: Balance quality vs token cost
5. **Format**: JPEG for photos, PNG for graphics
### Prompt Engineering
**Specific instructions**:
- "Identify all vehicles with their colors and positions"
- "Count people wearing blue shirts"
- "Extract text from the sign in the top-left corner"
**Output format**:
- "Return results as JSON with fields: category, count, description"
- "Format as markdown table"
- "List findings as numbered items"
**Few-shot examples**:
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Example: For an image of a cat on a sofa, respond: "Object: cat, Location: sofa"',
'Now analyze this image:',
img_part
]
)
```
### File Management
1. Use File API for images >20MB
2. Use File API for repeated queries (saves tokens)
3. Files auto-delete after 48 hours
4. Clean up manually:
```python
client.files.delete(name=myfile.name)
```
### Cost Optimization
**Token-efficient strategies**:
- Resize large images before upload
- Use File API for repeated queries
- Batch multiple images when related
- Use appropriate model (Flash vs Pro)
**Token costs** (Gemini 2.5 Flash at $1/1M):
- Small image (258 tokens): $0.000258
- HD image (1,548 tokens): $0.001548
- 4K image (6,192 tokens): $0.006192
## Common Use Cases
### 1. Product Analysis
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Analyze this product image:
1. Identify the product
2. List visible features
3. Assess condition
4. Estimate value range
''',
img_part
]
)
```
### 2. Screenshot Analysis
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Extract all text and UI elements from this screenshot',
img_part
]
)
```
### 3. Medical Imaging (Informational Only)
```python
response = client.models.generate_content(
model='gemini-2.5-pro',
contents=[
'Describe visible features in this medical image. Note: This is for informational purposes only.',
img_part
]
)
```
### 4. Chart/Graph Reading
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'Extract data from this chart and format as JSON',
img_part
]
)
```
### 5. Scene Understanding
```python
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
'''Analyze this scene:
1. Location type
2. Time of day
3. Weather conditions
4. Activities happening
5. Mood/atmosphere
''',
img_part
]
)
```
## Error Handling
```python
import time
def analyze_image_with_retry(image_path, prompt, max_retries=3):
"""Analyze image with exponential backoff retry"""
for attempt in range(max_retries):
try:
with open(image_path, 'rb') as f:
img_bytes = f.read()
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
prompt,
genai.types.Part.from_bytes(
data=img_bytes,
mime_type='image/jpeg'
)
]
)
return response.text
except Exception as e:
if attempt == max_retries - 1:
raise
wait_time = 2 ** attempt
print(f"Retry {attempt + 1} after {wait_time}s: {e}")
time.sleep(wait_time)
```
## Limitations
- Maximum 3,600 images per request
- OCR accuracy varies with text quality
- Object detection requires 2.0+ models
- Segmentation requires 2.5+ models
- No video frame extraction (use video API)
- Regional restrictions on child images (EEA, CH, UK)
---
## Related References
**Current**: Image Understanding
**Related Capabilities**:
- [Image Generation](./image-generation.md) - Create and edit images
- [Video Analysis](./video-analysis.md) - Analyze video frames
- [Video Generation](./video-generation.md) - Reference images for video generation
**Back to**: [AI Multimodal Skill](../SKILL.md)

Binary file not shown.

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Validate ai-multimodal skill setup and configuration.
Checks:
- API key presence and format
- Python dependencies
- Centralized resolver availability
- Directory structure
"""
import os
import sys
from pathlib import Path
# Fix Windows cp1252 encoding: Unicode symbols (✓, ⚠, ✗) can't encode on Windows.
# Reconfigure stdout to UTF-8 with replacement (Python 3.7+).
if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
    # hasattr guard: reconfigure() only exists on TextIOWrapper-like streams;
    # stdout may be replaced by a pipe wrapper or test harness without it.
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    if hasattr(sys.stderr, 'reconfigure'):
        sys.stderr.reconfigure(encoding="utf-8", errors="replace")
# ANSI escape codes for colored terminal output
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BLUE = '\033[94m'
RESET = '\033[0m'
BOLD = '\033[1m'
def print_header(text):
    """Print a section header framed above and below by 60-char '=' rules."""
    rule = "=" * 60
    print(f"\n{BOLD}{BLUE}{rule}{RESET}")
    print(f"{BOLD}{BLUE}{text}{RESET}")
    print(f"{BOLD}{BLUE}{rule}{RESET}\n")
def print_success(text):
    """Print *text* in green to mark a passing check."""
    print(GREEN + str(text) + RESET)
def print_warning(text):
    """Print *text* in yellow to mark a non-fatal issue."""
    print(YELLOW + str(text) + RESET)
def print_error(text):
    """Print *text* in red to mark a failing check."""
    print(RED + str(text) + RESET)
def print_info(text):
    """Print *text* in blue (indented by one space) as informational output."""
    print(BLUE + " " + str(text) + RESET)
def check_dependencies():
    """Verify the required third-party packages can be imported.

    Prints one line per dependency. Returns True when all are
    importable; otherwise prints pip install instructions for the
    missing ones and returns False.
    """
    print_header("Checking Python Dependencies")
    # Maps importable module name -> pip package name.
    required = {
        'google.genai': 'google-genai',
        'dotenv': 'python-dotenv',
        'PIL': 'pillow',
    }
    missing = []
    for module_name, package_name in required.items():
        try:
            __import__(module_name)
            print_success(f"{package_name} is installed")
        except ImportError:
            print_error(f"{package_name} is NOT installed")
            missing.append(package_name)
    if not missing:
        return True
    print_error("\nMissing dependencies detected!")
    print_info(f"Install with: pip install {' '.join(missing)}")
    return False
def check_centralized_resolver():
    """Check whether the shared resolve_env.py resolver is importable.

    Looks four directories above this script (the config root) for
    scripts/resolve_env.py. Returns False only when the file exists but
    cannot be imported; a missing resolver is treated as non-fatal
    because the skill has its own fallback resolution logic.
    """
    print_header("Checking Centralized Resolver")
    # This script lives at <root>/skills/<skill>/scripts/, so four
    # .parent hops reach the config root.
    claude_root = Path(__file__).parent.parent.parent.parent
    resolver_path = claude_root / 'scripts' / 'resolve_env.py'
    if resolver_path.exists():
        print_success(f"Centralized resolver found: {resolver_path}")
        # Try to import it (sys.path tweak makes the module reachable)
        sys.path.insert(0, str(resolver_path.parent))
        try:
            from resolve_env import resolve_env
            print_success("Centralized resolver can be imported")
            return True
        except ImportError as e:
            print_error(f"Centralized resolver exists but cannot be imported: {e}")
            return False
    else:
        print_warning(f"Centralized resolver not found: {resolver_path}")
        print_info("Skill will use fallback resolution logic")
        return True  # Not critical, fallback works
def find_api_key():
    """Find and validate the Gemini API key.

    Prefers the centralized resolve_env resolver (which searches the
    full configuration hierarchy); falls back to a plain os.environ
    lookup when the resolver cannot be imported.

    Returns:
        The API key string, or None when no key is found.
    """
    print_header("Checking API Key Configuration")
    # Try to use centralized resolver
    claude_root = Path(__file__).parent.parent.parent.parent
    sys.path.insert(0, str(claude_root / 'scripts'))
    try:
        from resolve_env import resolve_env
        print_info("Using centralized resolver...")
        api_key = resolve_env('GEMINI_API_KEY', skill='ai-multimodal')
        if api_key:
            print_success("API key found via centralized resolver")
            # Only a preview is printed so the full key never lands in logs.
            print_info(f"Key preview: {api_key[:20]}...{api_key[-4:]}")
            # Show hierarchy
            print_info("\nTo see where the key was found, run:")
            print_info("python ~/.opencode/scripts/resolve_env.py GEMINI_API_KEY --skill ai-multimodal --verbose")
            return api_key
        else:
            print_error("API key not found in any location")
            return None
    except ImportError:
        print_warning("Centralized resolver not available, using fallback")
        # Fallback: check environment
        api_key = os.getenv('GEMINI_API_KEY')
        if api_key:
            print_success("API key found in process.env")
            print_info(f"Key preview: {api_key[:20]}...{api_key[-4:]}")
            return api_key
        else:
            print_error("API key not found")
            return None
def validate_api_key_format(api_key):
    """Heuristically validate an API key's shape.

    Accepts Google AI Studio keys (prefix 'AIza') outright, accepts any
    other key longer than 20 characters with a warning (may be Vertex AI
    or custom), and rejects empty/short values.
    """
    if not api_key:
        return False
    # Google AI Studio keys typically start with 'AIza'
    if api_key.startswith('AIza'):
        print_success("API key format looks valid (Google AI Studio)")
        return True
    if len(api_key) > 20:
        print_warning("API key format not recognized (may be Vertex AI or custom)")
        return True
    print_error("API key format looks invalid (too short)")
    return False
def test_api_connection(api_key):
    """Verify the API key works by listing available Gemini models.

    Performs a real network request. Returns True on success; False
    when google-genai is missing or the request fails for any reason
    (bad key, network error, quota, ...).
    """
    print_header("Testing API Connection")
    try:
        from google import genai
        print_info("Initializing Gemini client...")
        client = genai.Client(api_key=api_key)
        print_info("Fetching available models...")
        # List models to verify API key works
        models = list(client.models.list())
        print_success(f"API connection successful! Found {len(models)} available models")
        # Show some available models
        print_info("\nSample available models:")
        for model in models[:5]:
            print(f" - {model.name}")
        return True
    except ImportError:
        print_error("google-genai package not installed")
        return False
    except Exception as e:
        # Broad catch is deliberate: any SDK/transport error means "not usable".
        print_error(f"API connection failed: {str(e)}")
        return False
def check_directory_structure():
    """Verify the skill's expected files exist next to this script.

    Checks SKILL.md and .env.example in the skill directory and the
    batch-processing script alongside this one. Returns True only when
    every required file is present.
    """
    print_header("Checking Directory Structure")
    script_dir = Path(__file__).parent
    skill_dir = script_dir.parent
    checks = [
        ('SKILL.md', skill_dir / 'SKILL.md'),
        ('.env.example', skill_dir / '.env.example'),
        ('gemini_batch_process.py', script_dir / 'gemini_batch_process.py'),
    ]
    missing = []
    for name, path in checks:
        if path.exists():
            print_success(f"{name} exists")
        else:
            print_error(f"{name} NOT found at {path}")
            missing.append(name)
    return not missing
def provide_setup_instructions():
    """Print step-by-step setup instructions for configuring the skill.

    Shown whenever a blocking check fails (missing dependency or API
    key). Purely informational; returns None.
    """
    print_header("Setup Instructions")
    print_info("To configure the ai-multimodal skill:")
    print("\n1. Get a Gemini API key:")
    print(" → Visit: https://aistudio.google.com/apikey")
    print("\n2. Configure the API key (choose one method):")
    print(f"\n Option A: User global config (recommended)")
    print(f" $ echo 'GEMINI_API_KEY=your-api-key-here' >> ~/.opencode/.env")
    # Resolve the skill directory so Option B shows a concrete path.
    script_dir = Path(__file__).parent
    skill_dir = script_dir.parent
    print(f"\n Option B: Skill-specific config")
    print(f" $ cd {skill_dir}")
    print(f" $ cp .env.example .env")
    print(f" $ # Edit .env and add your API key")
    print(f"\n Option C: Runtime environment (temporary)")
    print(f" $ export GEMINI_API_KEY='your-api-key-here'")
    print("\n3. Verify setup:")
    print(f" $ python {Path(__file__)}")
    print("\n4. Debug if needed:")
    print(f" $ python ~/.opencode/scripts/resolve_env.py --show-hierarchy --skill ai-multimodal")
    print(f" $ python ~/.opencode/scripts/resolve_env.py GEMINI_API_KEY --skill ai-multimodal --verbose")
def main():
    """Run all setup checks and exit non-zero on failure.

    Order: directory structure -> resolver availability (informational)
    -> dependencies -> API key discovery -> key format -> live API
    connection. Exits with status 1 as soon as a blocking check fails;
    non-blocking issues only flip the summary flag.
    """
    print(f"\n{BOLD}AI Multimodal Skill - Setup Checker{RESET}")
    all_passed = True
    # Check directory structure
    if not check_directory_structure():
        all_passed = False
    # Check centralized resolver (result intentionally unused: a missing
    # resolver is non-fatal because fallback resolution exists)
    check_centralized_resolver()
    # Check dependencies (blocking: scripts cannot run without them)
    if not check_dependencies():
        all_passed = False
        provide_setup_instructions()
        sys.exit(1)
    # Check API key (blocking)
    api_key = find_api_key()
    if not api_key:
        print_error("\n❌ GEMINI_API_KEY not found in any location")
        all_passed = False
        provide_setup_instructions()
        sys.exit(1)
    # Validate API key format
    if not validate_api_key_format(api_key):
        all_passed = False
    # Test API connection
    if not test_api_connection(api_key):
        all_passed = False
    # Final summary
    print_header("Setup Summary")
    if all_passed:
        print_success("✅ All checks passed! The ai-multimodal skill is ready to use.")
        print_info("\nNext steps:")
        print(" • Read SKILL.md for usage examples")
        print(" • Try: python scripts/gemini_batch_process.py --help")
        print("\nImage generation models:")
        print(" • gemini-2.5-flash-image - Nano Banana Flash (DEFAULT - fast)")
        print(" • imagen-4.0-generate-001 - Imagen 4 (alternative - production)")
        print(" • gemini-3-pro-image-preview - Nano Banana Pro (4K text, reasoning)")
        print("\nExample (uses default model):")
        print(" python scripts/gemini_batch_process.py --task generate \\")
        print(" --prompt 'A sunset over mountains' --aspect-ratio 16:9 --size 2K")
    else:
        print_error("❌ Some checks failed. Please fix the issues above.")
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Convert documents to Markdown using Gemini API.
Supports all document types:
- PDF documents (native vision processing)
- Images (JPEG, PNG, WEBP, HEIC)
- Office documents (DOCX, XLSX, PPTX)
- HTML, TXT, and other text formats
Features:
- Converts to clean markdown format
- Preserves structure, tables, and formatting
- Extracts text from images and scanned documents
- Batch conversion support
- Saves to docs/assets/document-extraction.md by default
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Any
try:
from google import genai
from google.genai import types
except ImportError:
print("Error: google-genai package not installed")
print("Install with: pip install google-genai")
sys.exit(1)
try:
from dotenv import load_dotenv
except ImportError:
load_dotenv = None
def find_api_key() -> Optional[str]:
    """Find Gemini API key using correct priority order.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .opencode/skills/ai-multimodal/.env (skill-specific config)
    3. .opencode/skills/.env (shared skills config)
    4. .opencode/.env (Claude global config)

    Returns the key string or None. load_dotenv never overrides
    variables already present in the environment, so earlier sources win.
    """
    # Priority 1: already set in the process environment.
    key = os.getenv('GEMINI_API_KEY')
    if key:
        return key
    # Priorities 2-4: walk outward from the skill directory, loading
    # each existing .env and re-checking after every load.
    if load_dotenv:
        skill_dir = Path(__file__).parent.parent      # .../skills/ai-multimodal
        search_dirs = (skill_dir, skill_dir.parent, skill_dir.parent.parent)
        for directory in search_dirs:
            env_file = directory / '.env'
            if not env_file.exists():
                continue
            load_dotenv(env_file)
            key = os.getenv('GEMINI_API_KEY')
            if key:
                return key
    return None
def find_project_root() -> Path:
    """Locate the project root by walking up from this script's directory.

    The first ancestor (including the script directory itself) that
    contains a .git or .claude directory wins; falls back to the
    script directory when no marker is found.
    """
    here = Path(__file__).parent
    for candidate in (here, *here.parents):
        if (candidate / '.git').exists() or (candidate / '.claude').exists():
            return candidate
    return here
def get_mime_type(file_path: str) -> str:
    """Map a file's extension (case-insensitive) to its MIME type.

    Unknown extensions fall back to 'application/octet-stream'.
    """
    office = 'application/vnd.openxmlformats-officedocument.'
    table = {
        # Documents
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.md': 'text/markdown',
        '.csv': 'text/csv',
        # Images
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        # Office formats (uploaded as binary)
        '.docx': office + 'wordprocessingml.document',
        '.xlsx': office + 'spreadsheetml.sheet',
        '.pptx': office + 'presentationml.presentation',
    }
    return table.get(Path(file_path).suffix.lower(), 'application/octet-stream')
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file via the Gemini File API and wait until it is ready.

    Polls every 2 seconds for up to 5 minutes while the server processes
    the upload.

    Raises:
        ValueError: server reports the file as FAILED.
        TimeoutError: still PROCESSING after the 5-minute window.

    Returns:
        The uploaded file handle, ready to pass to generate_content.
    """
    if verbose:
        print(f"Uploading {file_path}...")
    myfile = client.files.upload(file=file_path)
    # Wait for processing if needed
    max_wait = 300  # 5 minutes
    elapsed = 0
    while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
        time.sleep(2)
        myfile = client.files.get(name=myfile.name)
        elapsed += 2
        # Progress line every 10 seconds only, to avoid log spam.
        if verbose and elapsed % 10 == 0:
            print(f" Processing... {elapsed}s")
    if myfile.state.name == 'FAILED':
        raise ValueError(f"File processing failed: {file_path}")
    if myfile.state.name == 'PROCESSING':
        raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")
    if verbose:
        print(f" Uploaded: {myfile.name}")
    return myfile
def convert_to_markdown(
    client: genai.Client,
    file_path: str,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Convert a single document to markdown using Gemini.

    Files over 20MB are uploaded via the File API; smaller files are
    sent inline as bytes. Retries with exponential backoff (1s, 2s,
    4s, ...) up to *max_retries* attempts; errors never propagate so
    batch runs can continue.

    Returns:
        dict with keys 'file', 'status' ('success' or 'error'),
        'markdown' (None on error) and 'error' (message, errors only).
    """
    for attempt in range(max_retries):
        try:
            file_path_obj = Path(file_path)
            file_size = file_path_obj.stat().st_size
            use_file_api = file_size > 20 * 1024 * 1024  # >20MB
            # Default prompt for markdown conversion
            if custom_prompt:
                prompt = custom_prompt
            else:
                prompt = """Convert this document to clean, well-formatted Markdown.
Requirements:
- Preserve all content, structure, and formatting
- Convert tables to markdown table format
- Maintain heading hierarchy (# ## ### etc)
- Preserve lists, code blocks, and quotes
- Extract text from images if present
- Keep formatting consistent and readable
Output only the markdown content without any preamble or explanation."""
            # Upload or inline the file
            if use_file_api:
                myfile = upload_file(client, str(file_path), verbose)
                content = [prompt, myfile]
            else:
                with open(file_path, 'rb') as f:
                    file_bytes = f.read()
                mime_type = get_mime_type(str(file_path))
                content = [
                    prompt,
                    types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                ]
            # Generate markdown
            response = client.models.generate_content(
                model=model,
                contents=content
            )
            markdown_content = response.text if hasattr(response, 'text') else ''
            return {
                'file': str(file_path),
                'status': 'success',
                'markdown': markdown_content
            }
        except Exception as e:
            # Last attempt: report the failure in the result rather than raising.
            if attempt == max_retries - 1:
                return {
                    'file': str(file_path),
                    'status': 'error',
                    'error': str(e),
                    'markdown': None
                }
            wait_time = 2 ** attempt
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)
def batch_convert(
    files: List[str],
    output_file: Optional[str] = None,
    auto_name: bool = False,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """Batch convert multiple files to markdown and write one combined file.

    Args:
        files: Paths of the documents to convert.
        output_file: Destination markdown path; when None it defaults to
            <project-root>/docs/assets/document-extraction.md, or a
            name derived from the input when auto_name is set and there
            is exactly one input file.
        auto_name: Derive the output filename from the single input.
        model: Gemini model id to use.
        custom_prompt: Override the default conversion prompt.
        verbose: Print per-file progress.

    Returns:
        One result dict per input file (see convert_to_markdown).

    Exits the process with status 1 when no API key can be resolved.
    """
    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)
    client = genai.Client(api_key=api_key)
    results = []
    # Determine output path
    if not output_file:
        project_root = find_project_root()
        output_dir = project_root / 'docs' / 'assets'
        if auto_name and len(files) == 1:
            # Auto-generate meaningful filename from input
            input_path = Path(files[0])
            base_name = input_path.stem
            output_file = str(output_dir / f"{base_name}-extraction.md")
        else:
            output_file = str(output_dir / 'document-extraction.md')
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Process each file
    for i, file_path in enumerate(files, 1):
        if verbose:
            print(f"\n[{i}/{len(files)}] Converting: {file_path}")
        result = convert_to_markdown(
            client=client,
            file_path=file_path,
            model=model,
            custom_prompt=custom_prompt,
            verbose=verbose
        )
        results.append(result)
        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")
    # Save combined markdown
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Document Extraction Results\n\n")
        f.write(f"Converted {len(files)} document(s) to markdown.\n\n")
        f.write("---\n\n")
        for result in results:
            f.write(f"## {Path(result['file']).name}\n\n")
            if result['status'] == 'success' and result.get('markdown'):
                f.write(result['markdown'])
                f.write("\n\n")
            elif result['status'] == 'success':
                f.write("**Note**: Conversion succeeded but no content was returned.\n\n")
            else:
                f.write(f"**Error**: {result.get('error', 'Unknown error')}\n\n")
            f.write("---\n\n")
    # Always show the output location. (Was `if verbose or True:` — a
    # vacuous condition; the summary was unconditional by design.)
    print(f"\n{'='*50}")
    print(f"Converted: {len(results)} file(s)")
    print(f"Success: {sum(1 for r in results if r['status'] == 'success')}")
    print(f"Failed: {sum(1 for r in results if r['status'] == 'error')}")
    print(f"Output saved to: {output_path}")
    return results
def main():
    """CLI entry point: parse arguments, expand input patterns, convert."""
    parser = argparse.ArgumentParser(
        description='Convert documents to Markdown using Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Convert single PDF to markdown (default name)
%(prog)s --input document.pdf
# Auto-generate meaningful filename
%(prog)s --input testpdf.pdf --auto-name
# Output: docs/assets/testpdf-extraction.md
# Convert multiple files
%(prog)s --input doc1.pdf doc2.docx image.png
# Specify custom output location
%(prog)s --input document.pdf --output ./output.md
# Use custom prompt
%(prog)s --input document.pdf --prompt "Extract only the tables as markdown"
# Batch convert directory
%(prog)s --input ./documents/*.pdf --verbose
Supported formats:
- PDF documents (up to 1,000 pages)
- Images (JPEG, PNG, WEBP, HEIC)
- Office documents (DOCX, XLSX, PPTX)
- Text formats (TXT, HTML, Markdown, CSV)
Default output: <project-root>/docs/assets/document-extraction.md
"""
    )
    parser.add_argument('--input', '-i', nargs='+', required=True,
                        help='Input file(s) to convert')
    parser.add_argument('--output', '-o',
                        help='Output markdown file (default: docs/assets/document-extraction.md)')
    parser.add_argument('--auto-name', '-a', action='store_true',
                        help='Auto-generate meaningful output filename from input (e.g., document.pdf -> document-extraction.md)')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--prompt', '-p',
                        help='Custom prompt for conversion')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    args = parser.parse_args()
    # Validate input files, expanding glob patterns the shell did not.
    # (The import is hoisted here; the old code re-ran `import glob`
    # inside the loop on every non-existent path.)
    import glob
    files = []
    for file_pattern in args.input:
        file_path = Path(file_pattern)
        if file_path.exists() and file_path.is_file():
            files.append(str(file_path))
        else:
            # Try glob pattern
            matched = glob.glob(file_pattern)
            files.extend([f for f in matched if Path(f).is_file()])
    if not files:
        print("Error: No valid input files found")
        sys.exit(1)
    # Convert files
    batch_convert(
        files=files,
        output_file=args.output,
        auto_name=args.auto_name,
        model=args.model,
        custom_prompt=args.prompt,
        verbose=args.verbose
    )
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,506 @@
#!/usr/bin/env python3
"""
Optimize media files for Gemini API processing.
Features:
- Compress videos/audio for size limits
- Resize images appropriately
- Split long videos into chunks
- Format conversion
- Quality vs size optimization
- Validation before upload
"""
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional, Dict, Any, List
try:
from dotenv import load_dotenv
except ImportError:
load_dotenv = None
def load_env_files():
    """Load .env files in correct priority order.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .opencode/skills/ai-multimodal/.env (skill-specific config)
    3. .opencode/skills/.env (shared skills config)
    4. .opencode/.env (Claude global config)

    load_dotenv never overrides variables already in the environment,
    so loading in this order preserves the priority. No-op when
    python-dotenv is not installed.
    """
    if not load_dotenv:
        return
    skill_dir = Path(__file__).parent.parent      # .opencode/skills/ai-multimodal
    # Skill dir, shared skills dir, then the global config dir.
    for directory in (skill_dir, skill_dir.parent, skill_dir.parent.parent):
        env_path = directory / '.env'
        if env_path.exists():
            load_dotenv(env_path)
# Load environment variables at module level
load_env_files()
def check_ffmpeg() -> bool:
    """Return True when an ffmpeg executable is runnable on PATH."""
    try:
        subprocess.run(['ffmpeg', '-version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
        return True
    except Exception:
        # FileNotFoundError when ffmpeg is absent, CalledProcessError on a
        # non-zero exit; anything else also means ffmpeg is unusable.
        # (The old tuple listed both alongside Exception, which already
        # subsumes them.)
        return False
def get_media_info(file_path: str) -> Dict[str, Any]:
    """Probe a media file with ffprobe and return a summary dict.

    Keys: 'size', 'duration', 'bit_rate'; plus 'width'/'height'/'fps'
    for video streams and 'sample_rate'/'channels' for audio streams.
    Returns {} when ffmpeg/ffprobe is unavailable or probing fails.
    """
    if not check_ffmpeg():
        return {}
    try:
        cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            file_path
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
        info = {
            'size': int(data['format'].get('size', 0)),
            'duration': float(data['format'].get('duration', 0)),
            'bit_rate': int(data['format'].get('bit_rate', 0)),
        }
        # Get video/audio specific info
        for stream in data.get('streams', []):
            if stream['codec_type'] == 'video':
                info['width'] = stream.get('width', 0)
                info['height'] = stream.get('height', 0)
                # ffprobe reports a rational like "30000/1001". Parse it
                # arithmetically instead of eval()-ing external tool output
                # (the old eval() was unsafe and crashed on a 0 denominator).
                rate = stream.get('r_frame_rate', '0/1')
                num, sep, den = rate.partition('/')
                try:
                    info['fps'] = float(num) / float(den) if sep else float(num)
                except (ValueError, ZeroDivisionError):
                    info['fps'] = 0.0
            elif stream['codec_type'] == 'audio':
                info['sample_rate'] = int(stream.get('sample_rate', 0))
                info['channels'] = stream.get('channels', 0)
        return info
    except Exception:
        # Covers CalledProcessError, JSONDecodeError, missing 'format' keys.
        return {}
def optimize_video(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    max_duration: Optional[int] = None,
    quality: int = 23,
    resolution: Optional[str] = None,
    verbose: bool = False
) -> bool:
    """Re-encode a video to H.264/AAC for Gemini API upload.

    Args:
        input_path: Source video file.
        output_path: Destination file (overwritten if present, -y).
        target_size_mb: Approximate size budget; derives a video bitrate.
        max_duration: Trim output to this many seconds.
        quality: x264 CRF value (0-51, lower = higher quality).
        resolution: Explicit ffmpeg scale value (e.g. '1280:720');
            otherwise anything wider than 1920px is capped at 1080p.
        verbose: Print before/after stats and the ffmpeg command line.

    Returns:
        True on success, False when ffmpeg is missing, probing fails,
        or encoding fails.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        print("Install: apt-get install ffmpeg (Linux) or brew install ffmpeg (Mac)")
        return False
    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False
    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f" Size: {info['size'] / (1024*1024):.2f} MB")
        print(f" Duration: {info['duration']:.2f}s")
        if 'width' in info:
            print(f" Resolution: {info['width']}x{info['height']}")
        print(f" Bit rate: {info['bit_rate'] / 1000:.0f} kbps")
    # Build ffmpeg command (-y overwrites the output without prompting)
    cmd = ['ffmpeg', '-i', input_path, '-y']
    # Video codec
    cmd.extend(['-c:v', 'libx264', '-crf', str(quality)])
    # Resolution
    if resolution:
        cmd.extend(['-vf', f'scale={resolution}'])
    elif 'width' in info and info['width'] > 1920:
        cmd.extend(['-vf', 'scale=1920:-2'])  # Max 1080p
    # Audio codec
    cmd.extend(['-c:a', 'aac', '-b:a', '128k', '-ac', '2'])
    # Duration limit
    if max_duration and info['duration'] > max_duration:
        cmd.extend(['-t', str(max_duration)])
    # Target size (rough estimate using bitrate)
    # NOTE(review): this passes -b:v alongside -crf; x264 treats CRF as the
    # rate-control mode, so the bitrate may be ignored without a two-pass or
    # maxrate/bufsize setup — confirm intent.
    if target_size_mb:
        target_bits = target_size_mb * 8 * 1024 * 1024
        duration = min(info['duration'], max_duration) if max_duration else info['duration']
        target_bitrate = int(target_bits / duration)
        # Reserve some for audio (128kbps)
        video_bitrate = max(target_bitrate - 128000, 500000)
        cmd.extend(['-b:v', str(video_bitrate)])
    cmd.append(output_path)
    if verbose:
        print(f"\nOptimizing...")
        print(f" Command: {' '.join(cmd)}")
    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)
        # Check output
        output_info = get_media_info(output_path)
        if output_info and verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f" Size: {output_info['size'] / (1024*1024):.2f} MB")
            print(f" Duration: {output_info['duration']:.2f}s")
            if 'width' in output_info:
                print(f" Resolution: {output_info['width']}x{output_info['height']}")
            compression = (1 - output_info['size'] / info['size']) * 100
            print(f" Compression: {compression:.1f}%")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error optimizing video: {e}")
        return False
def optimize_audio(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    bitrate: str = '64k',
    sample_rate: int = 16000,
    verbose: bool = False
) -> bool:
    """Re-encode audio to mono AAC at a low bitrate for Gemini upload.

    Args:
        input_path: Source audio file.
        output_path: Destination file (overwritten if present, -y).
        target_size_mb: Accepted for symmetry with optimize_video but
            currently unused by the implementation.
        bitrate: Target audio bitrate in ffmpeg syntax (e.g. '64k').
        sample_rate: Output sample rate in Hz.
        verbose: Print before/after stats.

    Returns:
        True on success, False when ffmpeg is missing or encoding fails.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return False
    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False
    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f" Size: {info['size'] / (1024*1024):.2f} MB")
        print(f" Duration: {info['duration']:.2f}s")
    # Build command
    cmd = [
        'ffmpeg', '-i', input_path, '-y',
        '-c:a', 'aac',
        '-b:a', bitrate,
        '-ar', str(sample_rate),
        '-ac', '1',  # Mono (Gemini uses mono anyway)
        output_path
    ]
    if verbose:
        print(f"\nOptimizing...")
    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)
        output_info = get_media_info(output_path)
        if output_info and verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f" Size: {output_info['size'] / (1024*1024):.2f} MB")
            compression = (1 - output_info['size'] / info['size']) * 100
            print(f" Compression: {compression:.1f}%")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error optimizing audio: {e}")
        return False
def optimize_image(
    input_path: str,
    output_path: str,
    max_width: int = 1920,
    quality: int = 85,
    verbose: bool = False
) -> bool:
    """Resize and recompress an image for Gemini API upload.

    Downscales to at most *max_width* pixels wide (preserving aspect
    ratio), flattens transparency onto white when the destination is a
    JPEG, and saves with the given *quality*.

    Returns:
        True on success, False when Pillow is missing or processing fails.
    """
    try:
        from PIL import Image
    except ImportError:
        print("Error: Pillow not installed")
        print("Install with: pip install pillow")
        return False
    try:
        # Context manager closes the file handle even on error
        # (the old code leaked the open image file).
        with Image.open(input_path) as src:
            img = src
            if verbose:
                print(f"Input: {Path(input_path).name}")
                print(f" Size: {Path(input_path).stat().st_size / 1024:.2f} KB")
                print(f" Resolution: {img.width}x{img.height}")
            # Resize if needed
            if img.width > max_width:
                ratio = max_width / img.width
                new_height = int(img.height * ratio)
                img = img.resize((max_width, new_height), Image.Resampling.LANCZOS)
                if verbose:
                    print(f" Resized to: {img.width}x{img.height}")
            # JPEG has no alpha channel: composite RGBA onto white, and
            # convert other non-JPEG-compatible modes (P, LA, CMYK, ...)
            # that previously made img.save() raise.
            if output_path.lower().endswith(('.jpg', '.jpeg')):
                if img.mode == 'RGBA':
                    rgb_img = Image.new('RGB', img.size, (255, 255, 255))
                    rgb_img.paste(img, mask=img.split()[3])
                    img = rgb_img
                elif img.mode not in ('RGB', 'L'):
                    img = img.convert('RGB')
            # Save
            img.save(output_path, quality=quality, optimize=True)
        if verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f" Size: {Path(output_path).stat().st_size / 1024:.2f} KB")
            compression = (1 - Path(output_path).stat().st_size / Path(input_path).stat().st_size) * 100
            print(f" Compression: {compression:.1f}%")
        return True
    except Exception as e:
        print(f"Error optimizing image: {e}")
        return False
def split_video(
    input_path: str,
    output_dir: str,
    chunk_duration: int = 3600,
    verbose: bool = False
) -> List[str]:
    """Split a long video into fixed-duration chunks via stream copy.

    Uses `ffmpeg -c copy`, so splitting is fast and lossless but cuts
    land on keyframe boundaries.

    Returns:
        The chunk paths; [input_path] unchanged when the video already
        fits in one chunk; [] when ffmpeg is missing or probing fails.
    """
    import math  # local import: only needed for the chunk-count ceiling
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return []
    info = get_media_info(input_path)
    if not info:
        return []
    total_duration = info['duration']
    # BUGFIX: the old `int(total / chunk) + 1` produced a spurious empty
    # final chunk whenever the duration was an exact multiple of
    # chunk_duration (e.g. a 3600s video with 3600s chunks gave 2 chunks).
    num_chunks = max(1, math.ceil(total_duration / chunk_duration))
    if num_chunks == 1:
        if verbose:
            print("Video is short enough, no splitting needed")
        return [input_path]
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_files = []
    for i in range(num_chunks):
        start_time = i * chunk_duration
        output_file = Path(output_dir) / f"{Path(input_path).stem}_chunk_{i+1}.mp4"
        cmd = [
            'ffmpeg', '-i', input_path, '-y',
            '-ss', str(start_time),
            '-t', str(chunk_duration),
            '-c', 'copy',  # stream copy: no re-encode
            str(output_file)
        ]
        if verbose:
            print(f"Creating chunk {i+1}/{num_chunks}...")
        try:
            subprocess.run(cmd, check=True, capture_output=not verbose)
            output_files.append(str(output_file))
        except subprocess.CalledProcessError as e:
            print(f"Error creating chunk {i+1}: {e}")
    return output_files
def main():
    """CLI entry point: optimize one media file, split a video, or batch a dir.

    Dispatches by file extension to optimize_video / optimize_audio /
    optimize_image, or to split_video when --split is given.
    """
    parser = argparse.ArgumentParser(
        description='Optimize media files for Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Optimize video to 100MB
  %(prog)s --input video.mp4 --output optimized.mp4 --target-size 100
  # Optimize audio
  %(prog)s --input audio.mp3 --output optimized.m4a --bitrate 64k
  # Resize image
  %(prog)s --input image.jpg --output resized.jpg --max-width 1920
  # Split long video
  %(prog)s --input long-video.mp4 --split --chunk-duration 3600 --output-dir ./chunks
  # Batch optimize directory
  %(prog)s --input-dir ./videos --output-dir ./optimized --quality 85
    """
    )
    parser.add_argument('--input', help='Input file')
    parser.add_argument('--output', help='Output file')
    parser.add_argument('--input-dir', help='Input directory for batch processing')
    parser.add_argument('--output-dir', help='Output directory for batch processing')
    parser.add_argument('--target-size', type=int, help='Target size in MB')
    parser.add_argument('--quality', type=int, default=85,
                       help='Quality (video: 0-51 CRF, image: 1-100) (default: 85)')
    parser.add_argument('--max-width', type=int, default=1920,
                       help='Max image width (default: 1920)')
    parser.add_argument('--bitrate', default='64k',
                       help='Audio bitrate (default: 64k)')
    parser.add_argument('--resolution', help='Video resolution (e.g., 1920x1080)')
    parser.add_argument('--split', action='store_true', help='Split long video into chunks')
    parser.add_argument('--chunk-duration', type=int, default=3600,
                       help='Chunk duration in seconds (default: 3600 = 1 hour)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    args = parser.parse_args()
    # Validate arguments
    if not args.input and not args.input_dir:
        parser.error("Either --input or --input-dir required")
    # Single file processing
    if args.input:
        input_path = Path(args.input)
        if not input_path.exists():
            print(f"Error: Input file not found: {input_path}")
            sys.exit(1)
        if args.split:
            output_dir = args.output_dir or './chunks'
            chunks = split_video(str(input_path), output_dir, args.chunk_duration, args.verbose)
            print(f"\nCreated {len(chunks)} chunks in {output_dir}")
            sys.exit(0)
        if not args.output:
            parser.error("--output required for single file processing")
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Route by extension; these lists are mirrored by the batch glob below.
        ext = input_path.suffix.lower()
        if ext in ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv']:
            success = optimize_video(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                quality=args.quality,
                resolution=args.resolution,
                verbose=args.verbose
            )
        elif ext in ['.mp3', '.wav', '.m4a', '.flac', '.aac']:
            success = optimize_audio(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                bitrate=args.bitrate,
                verbose=args.verbose
            )
        elif ext in ['.jpg', '.jpeg', '.png', '.webp']:
            success = optimize_image(
                str(input_path),
                str(output_path),
                max_width=args.max_width,
                quality=args.quality,
                verbose=args.verbose
            )
        else:
            print(f"Error: Unsupported file type: {ext}")
            sys.exit(1)
        sys.exit(0 if success else 1)
    # Batch processing
    if args.input_dir:
        if not args.output_dir:
            parser.error("--output-dir required for batch processing")
        input_dir = Path(args.input_dir)
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Find all media files (kept in sync with the dispatch lists above).
        # Fix: '*.flv' and '*.aac' were accepted by the single-file dispatch
        # but missing here, so batch runs silently skipped those files.
        patterns = ['*.mp4', '*.mov', '*.avi', '*.mkv', '*.webm', '*.flv',
                    '*.mp3', '*.wav', '*.m4a', '*.flac', '*.aac',
                    '*.jpg', '*.jpeg', '*.png', '*.webp']
        files = []
        for pattern in patterns:
            files.extend(input_dir.glob(pattern))
        # Deterministic processing order; dedupe defensively.
        files = sorted(set(files))
        if not files:
            print(f"No media files found in {input_dir}")
            sys.exit(1)
        print(f"Found {len(files)} files to process")
        success_count = 0
        for input_file in files:
            output_file = output_dir / input_file.name
            ext = input_file.suffix.lower()
            success = False
            if ext in ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv']:
                success = optimize_video(str(input_file), str(output_file),
                                        quality=args.quality, verbose=args.verbose)
            elif ext in ['.mp3', '.wav', '.m4a', '.flac', '.aac']:
                success = optimize_audio(str(input_file), str(output_file),
                                        bitrate=args.bitrate, verbose=args.verbose)
            elif ext in ['.jpg', '.jpeg', '.png', '.webp']:
                success = optimize_image(str(input_file), str(output_file),
                                        max_width=args.max_width, quality=args.quality,
                                        verbose=args.verbose)
            if success:
                success_count += 1
        print(f"\nProcessed: {success_count}/{len(files)} files")
# Script entry point: run the optimizer CLI when executed directly.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
MiniMax API client - shared HTTP utilities for all MiniMax generation tasks.
Handles authentication, API calls, async task polling, and file downloads.
Base URL: https://api.minimax.io/v1
Auth: Bearer token via MINIMAX_API_KEY environment variable.
"""
import json
import os
import sys
import time
from pathlib import Path
from typing import Dict, Any, Optional
try:
import requests
except ImportError:
print("Error: requests package not installed")
print("Install with: pip install requests")
sys.exit(1)
# Import centralized environment resolver
CLAUDE_ROOT = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(CLAUDE_ROOT / 'scripts'))
try:
from resolve_env import resolve_env
CENTRALIZED_RESOLVER_AVAILABLE = True
except ImportError:
CENTRALIZED_RESOLVER_AVAILABLE = False
BASE_URL = "https://api.minimax.io/v1"
def find_minimax_api_key() -> Optional[str]:
    """Locate MINIMAX_API_KEY via the central resolver, env, or .env files."""
    # Prefer the project's centralized resolver when it was importable.
    if CENTRALIZED_RESOLVER_AVAILABLE:
        return resolve_env('MINIMAX_API_KEY', skill='ai-multimodal')
    # Plain environment variable next.
    key = os.getenv('MINIMAX_API_KEY')
    if key:
        return key
    # Last resort: load .env files near the skill directory (needs dotenv).
    try:
        from dotenv import load_dotenv
    except ImportError:
        return None
    skill_dir = Path(__file__).parent.parent
    for candidate in (skill_dir / '.env', skill_dir.parent / '.env'):
        if candidate.exists():
            load_dotenv(candidate, override=True)
            key = os.getenv('MINIMAX_API_KEY')
            if key:
                return key
    return None
def get_headers(api_key: str) -> Dict[str, str]:
    """Return JSON content-type plus Bearer-auth headers for MiniMax calls."""
    headers = {"Content-Type": "application/json"}
    headers["Authorization"] = f"Bearer {api_key}"
    return headers
def api_post(endpoint: str, payload: Dict[str, Any], api_key: str,
             verbose: bool = False, timeout: int = 120) -> Dict[str, Any]:
    """POST `payload` to a MiniMax endpoint and return the parsed JSON.

    Raises Exception on a non-200 HTTP status, or when MiniMax reports an
    API-level failure via base_resp.status_code != 0 (even under HTTP 200).
    """
    url = f"{BASE_URL}/{endpoint}"
    if verbose:
        print(f" POST {url}", file=sys.stderr)
    response = requests.post(url, headers=get_headers(api_key),
                             json=payload, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"MiniMax API error (HTTP {response.status_code}): {response.text}"
        )
    data = response.json()
    # MiniMax embeds its own status in base_resp alongside the payload.
    base_resp = data.get("base_resp", {})
    code = base_resp.get("status_code", 0)
    if code != 0:
        raise Exception(
            f"MiniMax API error (code {code}): "
            f"{base_resp.get('status_msg', 'Unknown error')}"
        )
    return data
def api_get(endpoint: str, params: Dict[str, str], api_key: str,
            verbose: bool = False) -> Dict[str, Any]:
    """GET a MiniMax endpoint with query params and return the parsed JSON."""
    url = f"{BASE_URL}/{endpoint}"
    if verbose:
        print(f" GET {url}", file=sys.stderr)
    response = requests.get(url, headers=get_headers(api_key),
                            params=params, timeout=60)
    if response.status_code == 200:
        return response.json()
    raise Exception(
        f"MiniMax API error (HTTP {response.status_code}): {response.text}"
    )
def poll_async_task(task_id: str, task_type: str, api_key: str,
                    poll_interval: int = 10, max_wait: int = 600,
                    verbose: bool = False) -> Dict[str, Any]:
    """Poll an async generation task until it succeeds, fails, or times out.

    Args:
        task_id: Task ID returned by the creation endpoint.
        task_type: 'video_generation' or 'music_generation'.
        api_key: MiniMax API key.
        poll_interval: Seconds between polls (default 10).
        max_wait: Give up after this many seconds (default 600).

    Returns:
        The final query response once status is "Success".

    Raises:
        Exception: when the task reports Failed/Error status.
        TimeoutError: when max_wait elapses without success.
    """
    waited = 0
    while waited < max_wait:
        result = api_get(f"query/{task_type}", {"task_id": task_id},
                         api_key, verbose=False)
        status = result.get("status", "Unknown")
        # Progress line only when waited is a positive multiple of 30s.
        if verbose and waited > 0 and waited % 30 == 0:
            print(f" Polling... {waited}s elapsed, status: {status}",
                  file=sys.stderr)
        if status == "Success":
            return result
        if status in ("Failed", "Error"):
            raise Exception(f"Task failed: {json.dumps(result)}")
        time.sleep(poll_interval)
        waited += poll_interval
    raise TimeoutError(f"Task {task_id} timed out after {max_wait}s")
def download_file(file_id: str, api_key: str, output_path: str,
                  verbose: bool = False) -> str:
    """Resolve a MiniMax file_id to its download URL and stream it to disk."""
    meta = api_get("files/retrieve", {"file_id": file_id}, api_key, verbose)
    download_url = meta.get("file", {}).get("download_url")
    if not download_url:
        raise Exception(f"No download URL in response: {json.dumps(meta)}")
    if verbose:
        print(f" Downloading to: {output_path}", file=sys.stderr)
    response = requests.get(download_url, stream=True, timeout=300)
    response.raise_for_status()
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # Stream in 8 KiB chunks to keep memory flat for large media files.
    with open(output_path, 'wb') as fh:
        for chunk in response.iter_content(chunk_size=8192):
            fh.write(chunk)
    return output_path
def get_output_dir() -> Path:
    """Return (and create) the project's docs/assets dir, or a local fallback."""
    here = Path(__file__).parent
    # Walk upward to the first directory that looks like a project root.
    for candidate in (here, *here.parents):
        if (candidate / '.git').exists() or (candidate / '.claude').exists():
            target = candidate / 'docs' / 'assets'
            break
    else:
        # No repo marker found anywhere above: fall back beside the skill dir.
        target = here.parent / 'assets'
    target.mkdir(parents=True, exist_ok=True)
    return target

View File

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
MiniMax CLI entry point - standalone CLI for MiniMax generation tasks.
Can be called directly or delegated to from gemini_batch_process.py
when MiniMax models are detected.
Usage:
python minimax_cli.py --task generate --prompt "A cat" --model image-01
python minimax_cli.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3
python minimax_cli.py --task generate-speech --text "Hello" --model speech-2.8-hd --voice English_Warm_Bestie
python minimax_cli.py --task generate-music --lyrics "La la la" --prompt "pop song" --model music-2.5
"""
import argparse
import json
import shutil
import sys
from pathlib import Path
from minimax_api_client import find_minimax_api_key
from minimax_generate import (
generate_image, generate_video, generate_speech, generate_music
)
# Default model per CLI task, used when --model is omitted on the command line.
TASK_DEFAULTS = {
    'generate': 'image-01',
    'generate-video': 'MiniMax-Hailuo-2.3',
    'generate-speech': 'speech-2.8-hd',
    'generate-music': 'music-2.5'
}
def main():
    """CLI entry point: parse args, resolve the API key, dispatch the task.

    Exits non-zero when the API key is missing or generation raises.
    """
    parser = argparse.ArgumentParser(
        description='MiniMax AI generation CLI (image/video/speech/music)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate image
  %(prog)s --task generate --prompt "A cyberpunk city at night" --model image-01 --aspect-ratio 16:9
  # Generate video (async, ~30-60s)
  %(prog)s --task generate-video --prompt "A dancer performing" --model MiniMax-Hailuo-2.3
  # Generate speech
  %(prog)s --task generate-speech --text "Welcome to the show" --model speech-2.8-hd --voice English_Warm_Bestie
  # Generate music with lyrics
  %(prog)s --task generate-music --lyrics "Verse 1\\nHello world" --prompt "upbeat pop" --model music-2.5
    """
    )
    parser.add_argument('--task', required=True,
                       choices=['generate', 'generate-video',
                                'generate-speech', 'generate-music'],
                       help='Generation task type')
    parser.add_argument('--prompt', help='Text prompt for generation')
    parser.add_argument('--text', help='Text for speech generation')
    parser.add_argument('--lyrics', help='Lyrics for music generation')
    parser.add_argument('--model', help='Model name (auto-detected from task)')
    parser.add_argument('--aspect-ratio', default='1:1',
                       choices=['1:1', '16:9', '4:3', '3:2', '2:3',
                                '3:4', '9:16', '21:9'],
                       help='Aspect ratio for image generation')
    parser.add_argument('--num-images', type=int, default=1,
                       help='Number of images (1-9, default: 1)')
    parser.add_argument('--duration', type=int, default=6,
                       choices=[6, 10],
                       help='Video duration in seconds (6 or 10)')
    parser.add_argument('--resolution', default='1080P',
                       choices=['720P', '1080P'],
                       help='Video resolution')
    parser.add_argument('--voice', default='English_expressive_narrator',
                       help='Voice ID for speech (default: English_expressive_narrator)')
    parser.add_argument('--emotion', default='neutral',
                       choices=['happy', 'sad', 'angry', 'fearful',
                                'disgusted', 'surprised', 'neutral'],
                       help='Emotion for speech')
    parser.add_argument('--output-format', default='mp3',
                       choices=['mp3', 'wav', 'flac', 'pcm'],
                       help='Audio output format')
    parser.add_argument('--first-frame', help='Image URL for video first frame')
    parser.add_argument('--output', '-o', help='Output file path')
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args()
    # Auto-detect model from task when --model was not supplied.
    if not args.model:
        args.model = TASK_DEFAULTS.get(args.task, 'image-01')
        if args.verbose:
            print(f"Auto-detected model: {args.model}")
    # Find API key
    api_key = find_minimax_api_key()
    if not api_key:
        print("Error: MINIMAX_API_KEY not found")
        print("\nSetup:")
        print("1. export MINIMAX_API_KEY='your-key'")
        print("2. Or add to .env: MINIMAX_API_KEY=your-key")
        print("\nGet key at: https://platform.minimax.io/user-center/basic-information/interface-key")
        sys.exit(1)
    # Dispatch to task handler
    try:
        if args.task == 'generate':
            if not args.prompt:
                parser.error("--prompt required for image generation")
            result = generate_image(
                api_key, args.prompt, args.model,
                args.aspect_ratio, args.num_images,
                args.output, args.verbose
            )
        elif args.task == 'generate-video':
            if not args.prompt:
                parser.error("--prompt required for video generation")
            result = generate_video(
                api_key, args.prompt, args.model,
                args.duration, args.resolution,
                args.first_frame, args.output, args.verbose
            )
        elif args.task == 'generate-speech':
            text = args.text or args.prompt
            if not text:
                parser.error("--text or --prompt required for speech")
            result = generate_speech(
                api_key, text, args.model,
                args.voice, args.emotion, args.output_format,
                output=args.output, verbose=args.verbose
            )
        elif args.task == 'generate-music':
            if not args.lyrics and not args.prompt:
                parser.error("--lyrics or --prompt required for music")
            result = generate_music(
                api_key, args.lyrics or '', args.prompt or '',
                args.model, args.output_format,
                args.output, args.verbose
            )
        else:
            # Defensive only: argparse `choices` already rejects unknown tasks.
            # parser.error() raises SystemExit, so the old trailing `return`
            # here was unreachable dead code and has been removed.
            parser.error(f"Unknown task: {args.task}")
        # Print results
        print_result(result, args.task)
    except Exception as e:
        print(f"\nError: {e}", file=sys.stderr)
        sys.exit(1)
def print_result(result: dict, task: str):
    """Print a generation result dict in an LLM-friendly plain-text format.

    Args:
        result: Result dict from a minimax_generate.* function. Expected keys
            include 'status' and 'model', plus per-task payload keys such as
            'generated_images', 'generated_video', 'generated_audio',
            'generation_time', 'duration_ms', and 'error' on failure.
        task: CLI task name, echoed in the output header.
    """
    # Fix: these header literals carried f-prefixes with no placeholders (F541).
    print("\n=== RESULTS ===\n")
    print(f"[{task}]")
    print(f"Status: {result.get('status', 'unknown')}")
    if result.get('status') == 'success':
        if 'generated_images' in result:
            for img in result['generated_images']:
                print(f"Generated image: {img}")
        if 'generated_video' in result:
            print(f"Generated video: {result['generated_video']}")
        if 'generation_time' in result:
            print(f"Generation time: {result['generation_time']:.1f}s")
        if 'generated_audio' in result:
            print(f"Generated audio: {result['generated_audio']}")
        if 'duration_ms' in result:
            dur = result['duration_ms'] / 1000
            print(f"Duration: {dur:.1f}s")
    elif result.get('error'):
        print(f"Error: {result['error']}")
    print(f"\nModel: {result.get('model', 'unknown')}")
# Script entry point: run the MiniMax CLI when executed directly.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,278 @@
#!/usr/bin/env python3
"""
MiniMax generation CLI - image, video, speech, and music generation.
Models:
- Image: image-01, image-01-live
- Video: MiniMax-Hailuo-2.3, MiniMax-Hailuo-2.3-Fast, MiniMax-Hailuo-02, S2V-01
- Speech: speech-2.8-hd, speech-2.8-turbo, speech-2.6-hd, speech-2.6-turbo
- Music: music-2.5
Usage:
python minimax_generate.py --task generate --prompt "A cat in space" --model image-01
python minimax_generate.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3
python minimax_generate.py --task generate-speech --text "Hello world" --model speech-2.8-hd
python minimax_generate.py --task generate-music --lyrics "Verse 1..." --model music-2.5
"""
import argparse
import base64
import json
import shutil
import sys
import time
from pathlib import Path
from minimax_api_client import (
find_minimax_api_key, api_post, poll_async_task,
download_file, get_output_dir
)
# Model registries
# Known MiniMax model IDs grouped by capability. Consumed by
# is_minimax_model() to route requests to the MiniMax backend.
MINIMAX_IMAGE_MODELS = {'image-01', 'image-01-live'}
# Video models use the async submit-then-poll flow (see generate_video).
MINIMAX_VIDEO_MODELS = {
    'MiniMax-Hailuo-2.3', 'MiniMax-Hailuo-2.3-Fast',
    'MiniMax-Hailuo-02', 'S2V-01'
}
# TTS models for the t2a_v2 endpoint (see generate_speech).
MINIMAX_SPEECH_MODELS = {
    'speech-2.8-hd', 'speech-2.8-turbo',
    'speech-2.6-hd', 'speech-2.6-turbo',
    'speech-02-hd', 'speech-02-turbo'
}
MINIMAX_MUSIC_MODELS = {'music-2.5', 'music-2.0'}
# Union of every known MiniMax model ID across all capabilities.
ALL_MINIMAX_MODELS = (
    MINIMAX_IMAGE_MODELS | MINIMAX_VIDEO_MODELS |
    MINIMAX_SPEECH_MODELS | MINIMAX_MUSIC_MODELS
)
def is_minimax_model(model: str) -> bool:
    """Check if model is a MiniMax model (known ID or recognized prefix)."""
    if model in ALL_MINIMAX_MODELS:
        return True
    # Catch versioned variants not yet listed in the registries.
    known_prefixes = ('MiniMax-', 'image-01', 'speech-', 'music-', 'S2V-')
    return model.startswith(known_prefixes)
def generate_image(api_key: str, prompt: str, model: str = 'image-01',
                   aspect_ratio: str = '1:1', num_images: int = 1,
                   output: str = None, verbose: bool = False) -> dict:
    """Generate image(s) with a MiniMax image model and save them locally.

    Returns a dict with status, the saved file paths, and the model name.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "aspect_ratio": aspect_ratio,
        "n": min(num_images, 9),  # API caps a single batch at 9 images
        "response_format": "url",
        "prompt_optimizer": True
    }
    if verbose:
        print(f"Generating {num_images} image(s) with {model}...")
    api_result = api_post("image_generation", request_body, api_key, verbose)
    urls = api_result.get("data", {}).get("image_urls", [])
    if not urls:
        return {"status": "error", "error": "No images in response"}
    import requests as req
    dest_dir = get_output_dir()
    saved_files = []
    for idx, url in enumerate(urls):
        target = dest_dir / f"minimax_image_{int(time.time())}_{idx}.png"
        resp = req.get(url, timeout=60)
        resp.raise_for_status()
        target.write_bytes(resp.content)
        saved_files.append(str(target))
        if verbose:
            print(f" Saved: {target}")
    # When an explicit output path is given, copy the first image there too.
    if output and saved_files:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(saved_files[0], output)
    return {"status": "success", "generated_images": saved_files, "model": model}
def generate_video(api_key: str, prompt: str, model: str = 'MiniMax-Hailuo-2.3',
                   duration: int = 6, resolution: str = '1080P',
                   first_frame: str = None, output: str = None,
                   verbose: bool = False) -> dict:
    """Generate a video with a MiniMax Hailuo model (async submit + poll)."""
    request_body = {
        "prompt": prompt,
        "model": model,
        "duration": duration,
        "resolution": resolution
    }
    if first_frame:
        request_body["first_frame_image"] = first_frame
    if verbose:
        print(f"Submitting video generation with {model}...")
    submit = api_post("video_generation", request_body, api_key, verbose)
    task_id = submit.get("task_id")
    if not task_id:
        return {"status": "error", "error": f"No task_id: {json.dumps(submit)}"}
    if verbose:
        print(f" Task ID: {task_id}, polling...")
    started = time.time()
    done = poll_async_task(task_id, "video_generation", api_key,
                           poll_interval=10, verbose=verbose)
    file_id = done.get("file_id")
    if not file_id:
        return {"status": "error", "error": f"No file_id: {json.dumps(done)}"}
    out_path = str(get_output_dir() / f"minimax_video_{int(time.time())}.mp4")
    download_file(file_id, api_key, out_path, verbose)
    elapsed = time.time() - started
    size_mb = Path(out_path).stat().st_size / (1024 * 1024)
    # Optionally mirror the download to a caller-specified path.
    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(out_path, output)
    if verbose:
        print(f" Generated in {elapsed:.1f}s, size: {size_mb:.2f} MB")
    return {
        "status": "success", "generated_video": out_path,
        "generation_time": elapsed, "file_size_mb": size_mb, "model": model
    }
def generate_speech(api_key: str, text: str, model: str = 'speech-2.8-hd',
                    voice: str = 'English_expressive_narrator',
                    emotion: str = 'neutral', output_format: str = 'mp3',
                    rate: float = 1.0, output: str = None,
                    verbose: bool = False) -> dict:
    """Generate speech using MiniMax TTS v2 API.

    Args:
        api_key: MiniMax API key.
        text: Text to synthesize; silently truncated to 10,000 characters.
        model: TTS model name (speech-2.8-*, speech-2.6-*, speech-02-*).
        voice: Voice ID for the synthesis voice.
        emotion: Emotion label (happy/sad/angry/fearful/disgusted/surprised/neutral).
        output_format: mp3 | wav | flac | pcm.
        rate: Speech speed multiplier.
        output: Optional path to copy the generated file to.
        verbose: Print progress messages.

    Returns:
        {"status": "success", "generated_audio": path, "model": model}, or an
        error dict when the API returns no audio.
    """
    payload = {
        "model": model,
        "text": text[:10000],  # API input limit; truncate defensively
        "stream": False,
        "language_boost": "auto",
        "output_format": "hex",
        "voice_setting": {
            "voice_id": voice,
            "speed": rate,
            "vol": 1.0,
            "pitch": 0,
            # Bug fix: `emotion` was accepted here (and wired from the CLI's
            # --emotion flag) but never sent to the API, so it was silently
            # ignored. T2A v2 accepts it inside voice_setting.
            "emotion": emotion
        },
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": output_format,
            "channel": 1
        }
    }
    if verbose:
        print(f"Generating speech with {model}, voice: {voice}...")
    result = api_post("t2a_v2", payload, api_key, verbose)
    audio_data = result.get("data", {}).get("audio")
    if not audio_data:
        return {"status": "error", "error": "No audio in response"}
    output_dir = get_output_dir()
    ts = int(time.time())
    # NOTE(review): 'pcm' output falls back to a .mp3 extension even though the
    # payload is raw PCM — presumably deliberate, but verify with callers.
    ext = output_format if output_format in ('mp3', 'wav', 'flac') else 'mp3'
    output_path = str(output_dir / f"minimax_speech_{ts}.{ext}")
    # Audio returned as hex-encoded string from t2a_v2
    audio_bytes = bytes.fromhex(audio_data)
    with open(output_path, 'wb') as f:
        f.write(audio_bytes)
    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)
    if verbose:
        size_kb = len(audio_bytes) / 1024
        print(f" Saved: {output_path} ({size_kb:.1f} KB)")
    return {"status": "success", "generated_audio": output_path, "model": model}
def generate_music(api_key: str, lyrics: str = '', prompt: str = '',
                   model: str = 'music-2.5', output_format: str = 'mp3',
                   output: str = None, verbose: bool = False) -> dict:
    """Generate music with a MiniMax music model and save the audio locally."""
    request_body = {
        "model": model,
        "output_format": "url",
        "audio_setting": {
            "sample_rate": 44100,
            "bitrate": 128000,
            "format": output_format
        }
    }
    # Both fields are optional; truncate to the API's documented limits.
    if lyrics:
        request_body["lyrics"] = lyrics[:3500]
    if prompt:
        request_body["prompt"] = prompt[:2000]
    if verbose:
        print(f"Generating music with {model}...")
    api_result = api_post("music_generation", request_body, api_key, verbose,
                          timeout=300)
    audio_data = api_result.get("data", {}).get("audio")
    duration_ms = api_result.get("extra_info", {}).get("music_duration", 0)
    if not audio_data:
        return {"status": "error", "error": "No audio in response"}
    out_path = str(get_output_dir() / f"minimax_music_{int(time.time())}.{output_format}")
    if audio_data.startswith("http"):
        # URL response: fetch the rendered track.
        import requests as req
        resp = req.get(audio_data, timeout=120)
        resp.raise_for_status()
        audio_bytes = resp.content
    else:
        # Inline response: hex-encoded audio payload.
        audio_bytes = bytes.fromhex(audio_data)
    with open(out_path, 'wb') as fh:
        fh.write(audio_bytes)
    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(out_path, output)
    if verbose:
        dur_s = duration_ms / 1000 if duration_ms else 0
        print(f" Saved: {out_path} ({dur_s:.1f}s)")
    return {
        "status": "success", "generated_audio": out_path,
        "duration_ms": duration_ms, "model": model
    }

View File

@@ -0,0 +1,26 @@
# AI Multimodal Skill Dependencies
# Python 3.10+ required
# Google Gemini API
google-genai>=0.1.0
# PDF processing
pypdf>=4.0.0
# Document conversion
python-docx>=1.0.0
docx2pdf>=0.1.8 # Requires Microsoft Word (Windows/macOS); not available on Linux
# Markdown processing
markdown>=3.5.0
# Image processing
Pillow>=10.0.0
# Environment variable management
python-dotenv>=1.0.0
# Testing dependencies (dev)
pytest>=8.0.0
pytest-cov>=4.1.0
pytest-mock>=3.12.0

Binary file not shown.

View File

@@ -0,0 +1,20 @@
# Core dependencies
google-genai>=0.2.0
python-dotenv>=1.0.0
# Image processing
pillow>=10.0.0
# PDF processing
pypdf>=3.0.0
# Document conversion
markdown>=3.5
# Testing
pytest>=7.4.0
pytest-cov>=4.1.0
pytest-mock>=3.12.0
# Optional dependencies for full functionality
# ffmpeg-python>=0.2.0 # For media optimization (requires ffmpeg installed)

View File

@@ -0,0 +1,74 @@
"""
Tests for document_converter.py
"""
import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock, mock_open
sys.path.insert(0, str(Path(__file__).parent.parent))
import document_converter as dc
class TestAPIKeyFinder:
    """Test API key finding logic."""
    # patch.dict injects the key and restores os.environ after the test.
    @patch.dict('os.environ', {'GEMINI_API_KEY': 'test-key-from-env'})
    def test_find_api_key_from_env(self):
        """Test finding API key from environment."""
        api_key = dc.find_api_key()
        assert api_key == 'test-key-from-env'
    # clear=True empties os.environ; replacing load_dotenv with None stops the
    # fallback path from reading real .env files on the test machine.
    @patch.dict('os.environ', {}, clear=True)
    @patch('document_converter.load_dotenv', None)
    def test_find_api_key_no_key(self):
        """Test when no API key is available."""
        api_key = dc.find_api_key()
        assert api_key is None
class TestProjectRoot:
    """Test project root finding."""
    # NOTE(review): Path.exists is patched but the mock's return value is never
    # configured (a bare Mock is truthy), so this only verifies the return
    # type — consider tightening with explicit side effects.
    @patch('pathlib.Path.exists')
    def test_find_project_root_with_git(self, mock_exists):
        """Test finding project root with .git directory."""
        root = dc.find_project_root()
        assert isinstance(root, Path)
class TestMimeType:
    """Test MIME type detection."""
    def test_pdf_mime_type(self):
        """PDF files map to application/pdf."""
        assert dc.get_mime_type('document.pdf') == 'application/pdf'
    def test_image_mime_types(self):
        """Common image extensions map to their image/* MIME types."""
        expectations = {
            'image.jpg': 'image/jpeg',
            'image.png': 'image/png',
        }
        for filename, expected in expectations.items():
            assert dc.get_mime_type(filename) == expected
    def test_unknown_mime_type(self):
        """Unrecognized extensions fall back to octet-stream."""
        fallback = 'application/octet-stream'
        assert dc.get_mime_type('file.unknown') == fallback
class TestIntegration:
    """Integration tests."""
    def test_mime_type_integration(self):
        """MIME type detection across representative extensions."""
        for file_path, expected_mime in (
            ('document.pdf', 'application/pdf'),
            ('image.jpg', 'image/jpeg'),
            ('unknown.xyz', 'application/octet-stream'),
        ):
            assert dc.get_mime_type(file_path) == expected_mime
# Allow running this test module directly with coverage reporting.
if __name__ == '__main__':
    pytest.main([__file__, '-v', '--cov=document_converter', '--cov-report=term-missing'])

View File

@@ -0,0 +1,362 @@
"""
Tests for gemini_batch_process.py
"""
import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
import gemini_batch_process as gbp
class TestAPIKeyFinder:
    """Test API key detection."""
    def test_find_api_key_from_env(self, monkeypatch):
        """Test finding API key from environment variable."""
        # monkeypatch restores the environment automatically after the test.
        monkeypatch.setenv('GEMINI_API_KEY', 'test_key_123')
        assert gbp.find_api_key() == 'test_key_123'
    @patch('gemini_batch_process.load_dotenv')
    def test_find_api_key_not_found(self, mock_load_dotenv, monkeypatch):
        """Test when API key is not found."""
        monkeypatch.delenv('GEMINI_API_KEY', raising=False)
        # Mock load_dotenv to not actually load any files
        mock_load_dotenv.return_value = None
        assert gbp.find_api_key() is None
class TestMimeTypeDetection:
    """Test MIME type detection."""
    def _check(self, mapping):
        # Helper: assert every filename maps to its expected MIME type.
        for filename, mime in mapping.items():
            assert gbp.get_mime_type(filename) == mime
    def test_audio_mime_types(self):
        """Audio extensions resolve to audio/* types."""
        self._check({
            'test.mp3': 'audio/mp3',
            'test.wav': 'audio/wav',
            'test.aac': 'audio/aac',
            'test.flac': 'audio/flac',
        })
    def test_image_mime_types(self):
        """Image extensions resolve to image/* types."""
        self._check({
            'test.jpg': 'image/jpeg',
            'test.jpeg': 'image/jpeg',
            'test.png': 'image/png',
            'test.webp': 'image/webp',
        })
    def test_video_mime_types(self):
        """Video extensions resolve to video/* types."""
        self._check({
            'test.mp4': 'video/mp4',
            'test.mov': 'video/quicktime',
            'test.avi': 'video/x-msvideo',
        })
    def test_document_mime_types(self):
        """Document extensions resolve to their MIME types."""
        self._check({
            'test.pdf': 'application/pdf',
            'test.txt': 'text/plain',
        })
    def test_unknown_mime_type(self):
        """Unknown extensions fall back to octet-stream."""
        self._check({'test.xyz': 'application/octet-stream'})
    def test_case_insensitive(self):
        """Extension matching ignores case."""
        self._check({
            'TEST.MP3': 'audio/mp3',
            'Test.JPG': 'image/jpeg',
        })
class TestFileUpload:
    """Test file upload functionality."""
    # NOTE: @patch decorators apply bottom-up, so mock arguments arrive in
    # reverse decorator order on each test method.
    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_success(self, mock_client_class):
        """Test successful file upload."""
        # Mock client and file
        mock_client = Mock()
        mock_file = Mock()
        mock_file.state.name = 'ACTIVE'  # already processed: no polling needed
        mock_file.name = 'test_file'
        mock_client.files.upload.return_value = mock_file
        result = gbp.upload_file(mock_client, 'test.jpg', verbose=False)
        assert result == mock_file
        mock_client.files.upload.assert_called_once_with(file='test.jpg')
    # time.sleep is patched so the processing-wait loop runs instantly.
    @patch('gemini_batch_process.genai.Client')
    @patch('gemini_batch_process.time.sleep')
    def test_upload_video_with_processing(self, mock_sleep, mock_client_class):
        """Test video upload with processing wait."""
        mock_client = Mock()
        # First call: PROCESSING, second call: ACTIVE
        mock_file_processing = Mock()
        mock_file_processing.state.name = 'PROCESSING'
        mock_file_processing.name = 'test_video'
        mock_file_active = Mock()
        mock_file_active.state.name = 'ACTIVE'
        mock_file_active.name = 'test_video'
        mock_client.files.upload.return_value = mock_file_processing
        mock_client.files.get.return_value = mock_file_active
        result = gbp.upload_file(mock_client, 'test.mp4', verbose=False)
        assert result.state.name == 'ACTIVE'
    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_failed(self, mock_client_class):
        """Test failed file upload."""
        mock_client = Mock()
        mock_file = Mock()
        mock_file.state.name = 'FAILED'
        mock_client.files.upload.return_value = mock_file
        mock_client.files.get.return_value = mock_file
        # FAILED terminal state is expected to raise inside upload_file.
        with pytest.raises(ValueError, match="File processing failed"):
            gbp.upload_file(mock_client, 'test.mp4', verbose=False)
class TestProcessFile:
"""Test file processing functionality."""
@patch('gemini_batch_process.genai.Client')
@patch('builtins.open', create=True)
@patch('pathlib.Path.stat')
def test_process_small_file_inline(self, mock_stat, mock_open, mock_client_class):
"""Test processing small file with inline data."""
# Mock small file
mock_stat.return_value.st_size = 10 * 1024 * 1024 # 10MB
# Mock file content
mock_open.return_value.__enter__.return_value.read.return_value = b'test_data'
# Mock client and response
mock_client = Mock()
mock_response = Mock()
mock_response.text = 'Test response'
mock_client.models.generate_content.return_value = mock_response
result = gbp.process_file(
client=mock_client,
file_path='test.jpg',
prompt='Describe this image',
model='gemini-2.5-flash',
task='analyze',
format_output='text',
verbose=False
)
assert result['status'] == 'success'
assert result['response'] == 'Test response'
@patch('gemini_batch_process.upload_file')
@patch('gemini_batch_process.genai.Client')
@patch('pathlib.Path.stat')
def test_process_large_file_api(self, mock_stat, mock_client_class, mock_upload):
"""Test processing large file with File API."""
# Mock large file
mock_stat.return_value.st_size = 50 * 1024 * 1024 # 50MB
# Mock upload and response
mock_file = Mock()
mock_upload.return_value = mock_file
mock_client = Mock()
mock_response = Mock()
mock_response.text = 'Test response'
mock_client.models.generate_content.return_value = mock_response
result = gbp.process_file(
client=mock_client,
file_path='test.mp4',
prompt='Summarize this video',
model='gemini-2.5-flash',
task='analyze',
format_output='text',
verbose=False
)
assert result['status'] == 'success'
mock_upload.assert_called_once()
@patch('gemini_batch_process.genai.Client')
@patch('builtins.open', create=True)
@patch('pathlib.Path.stat')
def test_process_file_error_handling(self, mock_stat, mock_open, mock_client_class):
"""Test error handling in file processing."""
mock_stat.return_value.st_size = 1024
# Mock file read
mock_file = MagicMock()
mock_file.__enter__.return_value.read.return_value = b'test_data'
mock_open.return_value = mock_file
mock_client = Mock()
mock_client.models.generate_content.side_effect = Exception("API Error")
result = gbp.process_file(
client=mock_client,
file_path='test.jpg',
prompt='Test',
model='gemini-2.5-flash',
task='analyze',
format_output='text',
verbose=False,
max_retries=1
)
assert result['status'] == 'error'
assert 'API Error' in result['error']
    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_image_generation_with_aspect_ratio(self, mock_stat, mock_open, mock_client_class):
        """Test image generation with aspect ratio config.

        When aspect_ratio is supplied for a generate task, a config object
        must be forwarded to generate_content, and the inline image bytes
        from the response must surface as 'generated_image' in the result.
        """
        mock_stat.return_value.st_size = 1024
        # Mock file read
        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = b'test'
        mock_open.return_value = mock_file
        # Response carries image bytes as inline_data on the first candidate part.
        mock_client = Mock()
        mock_response = Mock()
        mock_response.candidates = [Mock()]
        mock_response.candidates[0].content.parts = [
            Mock(inline_data=Mock(data=b'fake_image_data'))
        ]
        mock_client.models.generate_content.return_value = mock_response
        result = gbp.process_file(
            client=mock_client,
            file_path='test.txt',
            prompt='Generate mountain landscape',
            model='gemini-2.5-flash-image',
            task='generate',
            format_output='text',
            aspect_ratio='16:9',
            verbose=False
        )
        # Verify config was called with correct structure
        call_args = mock_client.models.generate_content.call_args
        config = call_args.kwargs.get('config')
        assert config is not None
        assert result['status'] == 'success'
        assert 'generated_image' in result
class TestBatchProcessing:
    """Test batch processing functionality.

    batch_process resolves an API key, then fans out over the file list via
    process_file; these tests stub both collaborators.
    """
    @patch('gemini_batch_process.find_api_key')
    @patch('gemini_batch_process.process_file')
    @patch('gemini_batch_process.genai.Client')
    def test_batch_process_success(self, mock_client_class, mock_process, mock_find_key):
        """Test successful batch processing."""
        mock_find_key.return_value = 'test_key'
        # Every per-file call succeeds with the same canned result.
        mock_process.return_value = {'status': 'success', 'response': 'Test'}
        results = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Analyze',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=False
        )
        # One result per input file, all successful.
        assert len(results) == 2
        assert all(r['status'] == 'success' for r in results)
    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_no_api_key(self, mock_find_key):
        """Test batch processing without API key."""
        # A missing key is fatal: batch_process is expected to sys.exit.
        mock_find_key.return_value = None
        with pytest.raises(SystemExit):
            gbp.batch_process(
                files=['test.jpg'],
                prompt='Test',
                model='gemini-2.5-flash',
                task='analyze',
                format_output='text',
                verbose=False,
                dry_run=False
            )
    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_dry_run(self, mock_find_key):
        """Test dry run mode."""
        # API key not needed for dry run, but we mock it to avoid sys.exit
        mock_find_key.return_value = 'test_key'
        results = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=True
        )
        # Dry run processes nothing and returns an empty result list.
        assert results == []
class TestResultsSaving:
    """Test results saving functionality.

    save_results dispatches on the format argument ('json', 'csv',
    'markdown'); each test verifies the matching serializer is exercised.
    """
    @patch('builtins.open', create=True)
    @patch('json.dump')
    def test_save_results_json(self, mock_json_dump, mock_open):
        """Test saving results as JSON."""
        results = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'}
        ]
        gbp.save_results(results, 'output.json', 'json')
        # The whole result list is serialized in a single json.dump call.
        mock_json_dump.assert_called_once()
    @patch('builtins.open', create=True)
    @patch('csv.DictWriter')
    def test_save_results_csv(self, mock_csv_writer, mock_open):
        """Test saving results as CSV."""
        results = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'}
        ]
        gbp.save_results(results, 'output.csv', 'csv')
        # Verify CSV writer was used
        mock_csv_writer.assert_called_once()
    @patch('builtins.open', create=True)
    def test_save_results_markdown(self, mock_open):
        """Test saving results as Markdown."""
        # Wire the context manager so `with open(...) as f` yields mock_file.
        mock_file = MagicMock()
        mock_open.return_value.__enter__.return_value = mock_file
        # Mixed statuses: markdown output must handle both success and error rows.
        results = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'error', 'error': 'Failed'}
        ]
        gbp.save_results(results, 'output.md', 'markdown')
        # Verify write was called
        assert mock_file.write.call_count > 0
# Allow running this suite directly with coverage reporting.
if __name__ == '__main__':
    pytest.main([__file__, '-v', '--cov=gemini_batch_process', '--cov-report=term-missing'])

View File

@@ -0,0 +1,373 @@
"""
Tests for media_optimizer.py
"""
import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import json
sys.path.insert(0, str(Path(__file__).parent.parent))
import media_optimizer as mo
class TestEnvLoading:
    """Test environment variable loading."""
    @patch('media_optimizer.load_dotenv')
    @patch('pathlib.Path.exists')
    def test_load_env_files_success(self, mock_exists, mock_load_dotenv):
        """Test successful .env file loading."""
        # Every candidate .env path reports as existing, so each triggers a load.
        mock_exists.return_value = True
        mo.load_env_files()
        # Should be called for skill, skills, and claude dirs
        assert mock_load_dotenv.call_count >= 1
    @patch('media_optimizer.load_dotenv', None)
    def test_load_env_files_no_dotenv(self):
        """Test when dotenv is not available."""
        # Patching load_dotenv to None simulates python-dotenv being uninstalled.
        # Should not raise an error
        mo.load_env_files()
class TestFFmpegCheck:
    """Verify detection of an ffmpeg binary on the current system."""

    @patch('subprocess.run')
    def test_ffmpeg_installed(self, run_mock):
        """A successful subprocess invocation means ffmpeg is present."""
        run_mock.return_value = Mock()
        available = mo.check_ffmpeg()
        assert available is True

    @patch('subprocess.run')
    def test_ffmpeg_not_installed(self, run_mock):
        """A missing binary (FileNotFoundError) reports ffmpeg as absent."""
        run_mock.side_effect = FileNotFoundError()
        available = mo.check_ffmpeg()
        assert available is False

    @patch('subprocess.run')
    def test_ffmpeg_error(self, run_mock):
        """Any other subprocess failure also reports ffmpeg as absent."""
        run_mock.side_effect = Exception("Error")
        available = mo.check_ffmpeg()
        assert available is False
class TestMediaInfo:
    """Test media information extraction.

    get_media_info shells out to ffprobe and flattens its JSON output
    (format section plus video/audio streams) into a single dict.
    """
    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_video_info(self, mock_run, mock_check):
        """Test extracting video information."""
        mock_check.return_value = True
        # Canned ffprobe JSON: one format block, one video and one audio stream.
        mock_result = Mock()
        mock_result.stdout = json.dumps({
            'format': {
                'size': '10485760',
                'duration': '120.5',
                'bit_rate': '691200'
            },
            'streams': [
                {
                    'codec_type': 'video',
                    'width': 1920,
                    'height': 1080,
                    'r_frame_rate': '30/1'
                },
                {
                    'codec_type': 'audio',
                    'sample_rate': '48000',
                    'channels': 2
                }
            ]
        })
        mock_run.return_value = mock_result
        info = mo.get_media_info('test.mp4')
        # Numeric fields are converted from the ffprobe string values.
        assert info['size'] == 10485760
        assert info['duration'] == 120.5
        assert info['width'] == 1920
        assert info['height'] == 1080
        assert info['sample_rate'] == 48000
    @patch('media_optimizer.check_ffmpeg')
    def test_get_media_info_no_ffmpeg(self, mock_check):
        """Test when ffmpeg is not available."""
        mock_check.return_value = False
        info = mo.get_media_info('test.mp4')
        # No ffmpeg -> empty dict, not an exception.
        assert info == {}
    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_media_info_error(self, mock_run, mock_check):
        """Test error handling in media info extraction."""
        mock_check.return_value = True
        mock_run.side_effect = Exception("Error")
        info = mo.get_media_info('test.mp4')
        # Probe failure is swallowed and reported as an empty dict.
        assert info == {}
class TestVideoOptimization:
    """Test video optimization functionality.

    optimize_video probes the input, runs one ffmpeg pass, then probes the
    output; tests therefore feed get_media_info a two-element side_effect
    (input info first, output info second).
    """
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_success(self, mock_run, mock_info, mock_check):
        """Test successful video optimization."""
        mock_check.return_value = True
        mock_info.side_effect = [
            # Input info
            {
                'size': 50 * 1024 * 1024,
                'duration': 120.0,
                'bit_rate': 3500000,
                'width': 1920,
                'height': 1080
            },
            # Output info
            {
                'size': 25 * 1024 * 1024,
                'duration': 120.0,
                'width': 1920,
                'height': 1080
            }
        ]
        result = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            quality=23,
            verbose=False
        )
        assert result is True
        # Exactly one ffmpeg invocation for a single-pass encode.
        mock_run.assert_called_once()
    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_video_no_ffmpeg(self, mock_check):
        """Test video optimization without ffmpeg."""
        mock_check.return_value = False
        result = mo.optimize_video('input.mp4', 'output.mp4')
        assert result is False
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_optimize_video_no_info(self, mock_info, mock_check):
        """Test video optimization when info cannot be read."""
        mock_check.return_value = True
        # An unreadable input probe ({}) must abort before encoding.
        mock_info.return_value = {}
        result = mo.optimize_video('input.mp4', 'output.mp4')
        assert result is False
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_target_size(self, mock_run, mock_info, mock_check):
        """Test video optimization with target size."""
        mock_check.return_value = True
        # 100MB in, 50MB target, 50MB out.
        mock_info.side_effect = [
            {'size': 100 * 1024 * 1024, 'duration': 60.0, 'bit_rate': 3500000},
            {'size': 50 * 1024 * 1024, 'duration': 60.0}
        ]
        result = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            target_size_mb=50,
            verbose=False
        )
        assert result is True
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_resolution(self, mock_run, mock_info, mock_check):
        """Test video optimization with custom resolution."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 50 * 1024 * 1024, 'duration': 120.0, 'bit_rate': 3500000},
            {'size': 25 * 1024 * 1024, 'duration': 120.0}
        ]
        result = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            resolution='1280x720',
            verbose=False
        )
        assert result is True
class TestAudioOptimization:
    """Test audio optimization functionality."""
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_audio_success(self, mock_run, mock_info, mock_check):
        """Test successful audio optimization."""
        mock_check.return_value = True
        # side_effect order: input probe first, output probe second.
        mock_info.side_effect = [
            {'size': 10 * 1024 * 1024, 'duration': 300.0},
            {'size': 5 * 1024 * 1024, 'duration': 300.0}
        ]
        result = mo.optimize_audio(
            'input.mp3',
            'output.m4a',
            bitrate='64k',
            verbose=False
        )
        assert result is True
        # Single ffmpeg invocation for the transcode.
        mock_run.assert_called_once()
    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_audio_no_ffmpeg(self, mock_check):
        """Test audio optimization without ffmpeg."""
        mock_check.return_value = False
        result = mo.optimize_audio('input.mp3', 'output.m4a')
        assert result is False
class TestImageOptimization:
    """Test image optimization functionality.

    optimize_image uses Pillow: it resizes when the source exceeds
    max_width, converts RGBA to RGB for JPEG output, and saves the result.
    """
    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_success(self, mock_stat, mock_image_open):
        """Test successful image optimization."""
        # Mock image
        mock_resized = Mock()
        mock_resized.mode = 'RGB'
        # 4K source so the 1920 max_width forces a resize.
        mock_img = Mock()
        mock_img.width = 3840
        mock_img.height = 2160
        mock_img.mode = 'RGB'
        mock_img.resize.return_value = mock_resized
        mock_image_open.return_value = mock_img
        # Mock file sizes
        mock_stat.return_value.st_size = 5 * 1024 * 1024
        result = mo.optimize_image(
            'input.jpg',
            'output.jpg',
            max_width=1920,
            quality=85,
            verbose=False
        )
        assert result is True
        # Since image is resized, save is called on the resized image
        mock_resized.save.assert_called_once()
    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_resize(self, mock_stat, mock_image_open):
        """Test image resizing during optimization."""
        mock_img = Mock()
        mock_img.width = 3840
        mock_img.height = 2160
        mock_img.mode = 'RGB'
        mock_resized = Mock()
        mock_img.resize.return_value = mock_resized
        mock_image_open.return_value = mock_img
        mock_stat.return_value.st_size = 5 * 1024 * 1024
        mo.optimize_image('input.jpg', 'output.jpg', max_width=1920, verbose=False)
        # Source is wider than max_width, so resize must fire exactly once.
        mock_img.resize.assert_called_once()
    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_rgba_to_jpg(self, mock_stat, mock_image_open):
        """Test converting RGBA to RGB for JPEG."""
        mock_img = Mock()
        mock_img.width = 1920
        mock_img.height = 1080
        mock_img.mode = 'RGBA'
        # split() yields the four R/G/B/A channels used for compositing.
        mock_img.split.return_value = [Mock(), Mock(), Mock(), Mock()]
        mock_image_open.return_value = mock_img
        mock_stat.return_value.st_size = 1024 * 1024
        with patch('PIL.Image.new') as mock_new:
            mock_rgb = Mock()
            mock_new.return_value = mock_rgb
            mo.optimize_image('input.png', 'output.jpg', verbose=False)
            # Image.new creates the RGB canvas the RGBA image is flattened onto.
            mock_new.assert_called_once()
    def test_optimize_image_no_pillow(self):
        """Test image optimization without Pillow."""
        # Mapping 'PIL' to None in sys.modules makes `import PIL` raise,
        # simulating an environment where Pillow is not installed.
        with patch.dict('sys.modules', {'PIL': None}):
            result = mo.optimize_image('input.jpg', 'output.jpg')
            # Will fail to import but function handles it
            assert result is False
class TestVideoSplitting:
    """Test video splitting functionality."""
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    @patch('pathlib.Path.mkdir')
    def test_split_video_success(self, mock_mkdir, mock_run, mock_info, mock_check):
        """Test successful video splitting."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 7200.0}  # 2 hours
        result = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1 hour chunks
            verbose=False
        )
        # Duration 7200s / 3600s = 2, +1 for safety = 3 chunks
        assert len(result) == 3
        # One ffmpeg invocation per chunk.
        assert mock_run.call_count == 3
    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_split_video_short_duration(self, mock_info, mock_check):
        """Test splitting video shorter than chunk duration."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 1800.0}  # 30 minutes
        result = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1 hour
            verbose=False
        )
        # Nothing to split: the original path is returned unchanged.
        assert result == ['input.mp4']
    @patch('media_optimizer.check_ffmpeg')
    def test_split_video_no_ffmpeg(self, mock_check):
        """Test video splitting without ffmpeg."""
        mock_check.return_value = False
        result = mo.split_video('input.mp4', './chunks')
        assert result == []
# Allow running this suite directly with coverage reporting.
if __name__ == '__main__':
    pytest.main([__file__, '-v', '--cov=media_optimizer', '--cov-report=term-missing'])

View File

@@ -0,0 +1,232 @@
"""
Tests for minimax_api_client.py - HTTP utilities, auth, polling, downloads.
"""
import json
import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
sys.path.insert(0, str(Path(__file__).parent.parent))
import minimax_api_client as mac
class TestFindMinimaxApiKey:
    """Test API key discovery.

    find_minimax_api_key prefers the centralized resolver when available
    and otherwise falls back to the MINIMAX_API_KEY environment variable.
    """
    def test_find_key_from_env(self, monkeypatch):
        """Environment variable is used when the resolver is unavailable."""
        monkeypatch.setenv('MINIMAX_API_KEY', 'test-minimax-key')
        # Force the env-var fallback path.
        with patch.object(mac, 'CENTRALIZED_RESOLVER_AVAILABLE', False):
            assert mac.find_minimax_api_key() == 'test-minimax-key'
    def test_find_key_not_found(self, monkeypatch):
        """No resolver and no env var yields None, not an error."""
        monkeypatch.delenv('MINIMAX_API_KEY', raising=False)
        with patch.object(mac, 'CENTRALIZED_RESOLVER_AVAILABLE', False):
            result = mac.find_minimax_api_key()
            assert result is None
    def test_find_key_via_centralized_resolver(self, monkeypatch):
        """Resolver path: resolve_env is queried with the skill namespace."""
        mock_resolve = Mock(return_value='resolved-key')
        # create=True because resolve_env may not exist on the module when
        # the centralized resolver is not importable.
        with patch.object(mac, 'CENTRALIZED_RESOLVER_AVAILABLE', True), \
             patch.object(mac, 'resolve_env', mock_resolve, create=True):
            result = mac.find_minimax_api_key()
            assert result == 'resolved-key'
            mock_resolve.assert_called_once_with(
                'MINIMAX_API_KEY', skill='ai-multimodal'
            )
class TestGetHeaders:
    """Verify construction of the HTTP request headers."""

    def test_headers_contain_bearer_token(self):
        """The key is wrapped in a standard Bearer auth header with JSON content type."""
        built = mac.get_headers('my-api-key')
        assert built['Authorization'] == 'Bearer my-api-key'
        assert built['Content-Type'] == 'application/json'

    def test_headers_with_different_key(self):
        """Whichever key is supplied ends up inside the auth header."""
        built = mac.get_headers('another-key-123')
        assert 'another-key-123' in built['Authorization']
class TestApiPost:
    """Test POST request handling.

    api_post must raise on non-2xx HTTP status and on a non-zero MiniMax
    base_resp status_code, and must pass a timeout through to requests.
    """
    @patch('minimax_api_client.requests.post')
    def test_successful_post(self, mock_post):
        """A 200 response with base_resp code 0 returns the parsed body."""
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {
            "base_resp": {"status_code": 0},
            "data": {"result": "ok"}
        }
        mock_post.return_value = mock_resp
        result = mac.api_post("test_endpoint", {"key": "val"}, "api-key")
        assert result["data"]["result"] == "ok"
        mock_post.assert_called_once()
    @patch('minimax_api_client.requests.post')
    def test_http_error_raises(self, mock_post):
        """Non-2xx HTTP status surfaces as an exception naming the code."""
        mock_resp = Mock()
        mock_resp.status_code = 401
        mock_resp.text = "Unauthorized"
        mock_post.return_value = mock_resp
        with pytest.raises(Exception, match="HTTP 401"):
            mac.api_post("endpoint", {}, "bad-key")
    @patch('minimax_api_client.requests.post')
    def test_minimax_error_code_raises(self, mock_post):
        """HTTP 200 but a non-zero MiniMax status_code still raises."""
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {
            "base_resp": {"status_code": 1002, "status_msg": "Rate limit"}
        }
        mock_post.return_value = mock_resp
        with pytest.raises(Exception, match="code 1002.*Rate limit"):
            mac.api_post("endpoint", {}, "api-key")
    @patch('minimax_api_client.requests.post')
    def test_custom_timeout(self, mock_post):
        """An explicit timeout argument is forwarded to requests.post."""
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"base_resp": {"status_code": 0}}
        mock_post.return_value = mock_resp
        mac.api_post("endpoint", {}, "key", timeout=300)
        _, kwargs = mock_post.call_args
        assert kwargs['timeout'] == 300
    @patch('minimax_api_client.requests.post')
    def test_default_timeout_is_120(self, mock_post):
        """Without an explicit timeout, 120 seconds is used."""
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"base_resp": {"status_code": 0}}
        mock_post.return_value = mock_resp
        mac.api_post("endpoint", {}, "key")
        _, kwargs = mock_post.call_args
        assert kwargs['timeout'] == 120
    @patch('minimax_api_client.requests.post')
    def test_verbose_prints_url(self, mock_post, capsys):
        """Verbose mode logs the endpoint to stderr."""
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"base_resp": {"status_code": 0}}
        mock_post.return_value = mock_resp
        mac.api_post("image_generation", {}, "key", verbose=True)
        captured = capsys.readouterr()
        # Diagnostics go to stderr so stdout stays machine-parseable.
        assert "image_generation" in captured.err
class TestApiGet:
    """Verify GET request handling."""

    @patch('minimax_api_client.requests.get')
    def test_successful_get(self, get_mock):
        """A 200 response is decoded from JSON and returned as a dict."""
        fake_response = Mock()
        fake_response.status_code = 200
        fake_response.json.return_value = {"status": "Success", "file_id": "abc"}
        get_mock.return_value = fake_response
        payload = mac.api_get("query/video_generation", {"task_id": "t1"}, "key")
        assert payload["status"] == "Success"

    @patch('minimax_api_client.requests.get')
    def test_get_http_error(self, get_mock):
        """A non-2xx status surfaces as an exception naming the HTTP code."""
        fake_response = Mock()
        fake_response.status_code = 500
        fake_response.text = "Server Error"
        get_mock.return_value = fake_response
        with pytest.raises(Exception, match="HTTP 500"):
            mac.api_get("endpoint", {}, "key")
class TestPollAsyncTask:
    """Test async task polling.

    time.sleep is patched out in every test so polling loops run instantly;
    api_get side_effect sequences drive the observed task states.
    """
    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_success_first_try(self, mock_get, mock_sleep):
        """Immediate Success: result returned without any sleep."""
        mock_get.return_value = {"status": "Success", "file_id": "f123"}
        result = mac.poll_async_task("task1", "video_generation", "key")
        assert result["file_id"] == "f123"
        mock_sleep.assert_not_called()
    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_success_after_processing(self, mock_get, mock_sleep):
        """Two Processing polls then Success: sleeps between polls only."""
        mock_get.side_effect = [
            {"status": "Processing"},
            {"status": "Processing"},
            {"status": "Success", "file_id": "f456"}
        ]
        result = mac.poll_async_task("task2", "video_generation", "key",
                                     poll_interval=1)
        assert result["file_id"] == "f456"
        # One sleep after each Processing response, none after Success.
        assert mock_sleep.call_count == 2
    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_task_failed(self, mock_get, mock_sleep):
        """A Failed status raises instead of returning."""
        mock_get.return_value = {"status": "Failed", "error": "bad input"}
        with pytest.raises(Exception, match="Task failed"):
            mac.poll_async_task("task3", "video_generation", "key")
    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_timeout(self, mock_get, mock_sleep):
        """A task stuck in Processing past max_wait raises TimeoutError."""
        mock_get.return_value = {"status": "Processing"}
        with pytest.raises(TimeoutError, match="timed out"):
            mac.poll_async_task("task4", "video_generation", "key",
                                poll_interval=1, max_wait=3)
class TestDownloadFile:
    """Test file download.

    download_file first resolves a download URL via api_get, then streams
    the content to disk with requests.get.
    """
    @patch('minimax_api_client.requests.get')
    @patch('minimax_api_client.api_get')
    def test_download_success(self, mock_api_get, mock_req_get, tmp_path):
        """Happy path: URL resolved, chunks streamed, file written to disk."""
        mock_api_get.return_value = {
            "file": {"download_url": "https://cdn.minimax.io/video.mp4"}
        }
        mock_resp = Mock()
        mock_resp.raise_for_status = Mock()
        # One streamed chunk is enough to prove the write path works.
        mock_resp.iter_content.return_value = [b"video_data"]
        mock_req_get.return_value = mock_resp
        output = str(tmp_path / "test.mp4")
        result = mac.download_file("file123", "key", output)
        assert result == output
        assert Path(output).exists()
    @patch('minimax_api_client.api_get')
    def test_download_no_url_raises(self, mock_api_get):
        """A file record with no download_url must raise, not write."""
        mock_api_get.return_value = {"file": {}}
        with pytest.raises(Exception, match="No download URL"):
            mac.download_file("file123", "key", "/tmp/test.mp4")
class TestGetOutputDir:
    """Verify output directory resolution."""

    def test_returns_path_object(self):
        """The resolver hands back a pathlib.Path, not a plain string."""
        out_dir = mac.get_output_dir()
        assert isinstance(out_dir, Path)

    def test_directory_exists(self):
        """The resolved location is present on disk and is a directory."""
        out_dir = mac.get_output_dir()
        assert out_dir.exists()
        assert out_dir.is_dir()

View File

@@ -0,0 +1,185 @@
"""
Tests for minimax_cli.py - CLI argument parsing and task dispatch.
"""
import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
sys.path.insert(0, str(Path(__file__).parent.parent))
import minimax_cli as cli
class TestTaskDefaults:
    """Verify the task-name to default-model mapping."""

    def test_generate_defaults_to_image_01(self):
        """Image generation falls back to image-01."""
        default_model = cli.TASK_DEFAULTS['generate']
        assert default_model == 'image-01'

    def test_generate_video_defaults_to_hailuo(self):
        """Video generation falls back to the Hailuo 2.3 model."""
        default_model = cli.TASK_DEFAULTS['generate-video']
        assert default_model == 'MiniMax-Hailuo-2.3'

    def test_generate_speech_defaults_to_speech_28_hd(self):
        """Speech synthesis falls back to the HD speech model."""
        default_model = cli.TASK_DEFAULTS['generate-speech']
        assert default_model == 'speech-2.8-hd'

    def test_generate_music_defaults_to_music_25(self):
        """Music generation falls back to music-2.5."""
        default_model = cli.TASK_DEFAULTS['generate-music']
        assert default_model == 'music-2.5'
class TestPrintResult:
    """Verify human-readable formatting of task results."""

    def test_success_image(self, capsys):
        """Image success output includes status, file path, and model name."""
        payload = {
            "status": "success",
            "generated_images": ["/path/to/img.png"],
            "model": "image-01"
        }
        cli.print_result(payload, "generate")
        printed = capsys.readouterr().out
        assert "success" in printed.lower()
        assert "/path/to/img.png" in printed
        assert "image-01" in printed

    def test_success_video(self, capsys):
        """Video success output includes the path and generation time."""
        payload = {
            "status": "success",
            "generated_video": "/path/to/vid.mp4",
            "generation_time": 45.2,
            "model": "MiniMax-Hailuo-2.3"
        }
        cli.print_result(payload, "generate-video")
        printed = capsys.readouterr().out
        assert "/path/to/vid.mp4" in printed
        assert "45.2s" in printed

    def test_success_audio(self, capsys):
        """Audio success output converts duration_ms to seconds."""
        payload = {
            "status": "success",
            "generated_audio": "/path/to/audio.mp3",
            "duration_ms": 140000,
            "model": "music-2.5"
        }
        cli.print_result(payload, "generate-music")
        printed = capsys.readouterr().out
        assert "/path/to/audio.mp3" in printed
        assert "140.0s" in printed

    def test_error_result(self, capsys):
        """Error results echo the error message."""
        payload = {"status": "error", "error": "Rate limit exceeded"}
        cli.print_result(payload, "generate")
        printed = capsys.readouterr().out
        assert "Rate limit exceeded" in printed

    def test_unknown_status(self, capsys):
        """A result with no status field is reported as unknown."""
        payload = {"model": "image-01"}
        cli.print_result(payload, "generate")
        printed = capsys.readouterr().out
        assert "unknown" in printed.lower()
class TestMainCLI:
    """Test CLI main() argument parsing and dispatch.

    Each test patches sys.argv and the relevant generate_* function, then
    checks that main() routes the parsed arguments to the right generator.
    """
    @patch('minimax_cli.find_minimax_api_key', return_value=None)
    def test_no_api_key_exits(self, mock_key, capsys):
        """A missing API key aborts with exit code 1."""
        with patch('sys.argv', ['cli', '--task', 'generate', '--prompt', 'x']):
            with pytest.raises(SystemExit) as exc_info:
                cli.main()
            assert exc_info.value.code == 1
    @patch('minimax_cli.generate_image')
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_generate_image_dispatch(self, mock_key, mock_gen):
        """--task generate routes to generate_image(key, prompt, ...)."""
        mock_gen.return_value = {"status": "success", "generated_images": [],
                                 "model": "image-01"}
        with patch('sys.argv', ['cli', '--task', 'generate',
                                '--prompt', 'A cat']):
            cli.main()
        mock_gen.assert_called_once()
        args = mock_gen.call_args
        # Positional order: api key first, prompt second.
        assert args[0][0] == 'test-key'
        assert args[0][1] == 'A cat'
    @patch('minimax_cli.generate_speech')
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_generate_speech_dispatch(self, mock_key, mock_gen):
        """--task generate-speech routes to generate_speech."""
        mock_gen.return_value = {"status": "success",
                                 "generated_audio": "/x.mp3",
                                 "model": "speech-2.8-hd"}
        with patch('sys.argv', ['cli', '--task', 'generate-speech',
                                '--text', 'Hello world']):
            cli.main()
        mock_gen.assert_called_once()
    @patch('minimax_cli.generate_speech')
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_speech_uses_text_or_prompt(self, mock_key, mock_gen):
        """Speech accepts --prompt as a fallback when --text is absent."""
        mock_gen.return_value = {"status": "success",
                                 "generated_audio": "/x.mp3",
                                 "model": "speech-2.8-hd"}
        # --prompt should work as fallback for --text
        with patch('sys.argv', ['cli', '--task', 'generate-speech',
                                '--prompt', 'Fallback text']):
            cli.main()
        call_args = mock_gen.call_args
        assert call_args[0][1] == 'Fallback text'
    @patch('minimax_cli.generate_music')
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_generate_music_dispatch(self, mock_key, mock_gen):
        """--task generate-music routes to generate_music."""
        mock_gen.return_value = {"status": "success",
                                 "generated_audio": "/x.mp3",
                                 "duration_ms": 60000,
                                 "model": "music-2.5"}
        with patch('sys.argv', ['cli', '--task', 'generate-music',
                                '--lyrics', 'La la la']):
            cli.main()
        mock_gen.assert_called_once()
    @patch('minimax_cli.generate_video')
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_generate_video_dispatch(self, mock_key, mock_gen):
        """--task generate-video routes to generate_video."""
        mock_gen.return_value = {"status": "success",
                                 "generated_video": "/x.mp4",
                                 "generation_time": 30.0,
                                 "model": "MiniMax-Hailuo-2.3"}
        with patch('sys.argv', ['cli', '--task', 'generate-video',
                                '--prompt', 'A dancer']):
            cli.main()
        mock_gen.assert_called_once()
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_auto_model_detection(self, mock_key):
        """Without --model, the task's default model is selected."""
        with patch('sys.argv', ['cli', '--task', 'generate-speech',
                                '--text', 'hi']):
            with patch('minimax_cli.generate_speech') as mock_gen:
                mock_gen.return_value = {"status": "success",
                                         "generated_audio": "/x.mp3",
                                         "model": "speech-2.8-hd"}
                cli.main()
                # Model should be auto-detected
                assert mock_gen.call_args[0][2] == 'speech-2.8-hd'
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_explicit_model_override(self, mock_key):
        """An explicit --model beats the task default."""
        with patch('sys.argv', ['cli', '--task', 'generate-speech',
                                '--text', 'hi', '--model', 'speech-2.8-turbo']):
            with patch('minimax_cli.generate_speech') as mock_gen:
                mock_gen.return_value = {"status": "success",
                                         "generated_audio": "/x.mp3",
                                         "model": "speech-2.8-turbo"}
                cli.main()
                assert mock_gen.call_args[0][2] == 'speech-2.8-turbo'
    @patch('minimax_cli.generate_image')
    @patch('minimax_cli.find_minimax_api_key', return_value='test-key')
    def test_exception_exits_with_1(self, mock_key, mock_gen):
        """A generator exception is converted to exit code 1."""
        mock_gen.side_effect = Exception("API timeout")
        with patch('sys.argv', ['cli', '--task', 'generate',
                                '--prompt', 'test']):
            with pytest.raises(SystemExit) as exc_info:
                cli.main()
            assert exc_info.value.code == 1

View File

@@ -0,0 +1,393 @@
"""
Tests for minimax_generate.py - generation functions for image, video, speech, music.
"""
import json
import pytest
import sys
import time
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock, call
sys.path.insert(0, str(Path(__file__).parent.parent))
import minimax_generate as mg
class TestModelRegistries:
    """Verify the model-set definitions exported by the module."""

    def test_image_models(self):
        """Both image-01 variants are registered."""
        image_models = mg.MINIMAX_IMAGE_MODELS
        assert 'image-01' in image_models
        assert 'image-01-live' in image_models

    def test_video_models(self):
        """Hailuo (standard + fast) and S2V models are registered."""
        video_models = mg.MINIMAX_VIDEO_MODELS
        assert 'MiniMax-Hailuo-2.3' in video_models
        assert 'MiniMax-Hailuo-2.3-Fast' in video_models
        assert 'S2V-01' in video_models

    def test_speech_models(self):
        """HD and turbo speech models are registered."""
        speech_models = mg.MINIMAX_SPEECH_MODELS
        assert 'speech-2.8-hd' in speech_models
        assert 'speech-2.8-turbo' in speech_models

    def test_music_models(self):
        """The music model is registered."""
        assert 'music-2.5' in mg.MINIMAX_MUSIC_MODELS

    def test_all_models_is_union(self):
        """ALL_MINIMAX_MODELS is exactly the union of the four registries."""
        combined = (mg.MINIMAX_IMAGE_MODELS
                    | mg.MINIMAX_VIDEO_MODELS
                    | mg.MINIMAX_SPEECH_MODELS
                    | mg.MINIMAX_MUSIC_MODELS)
        assert mg.ALL_MINIMAX_MODELS == combined
class TestIsMinimaxModel:
    """Verify classification of model names as MiniMax or not."""

    def test_known_image_model(self):
        """An exact image-registry name is recognized."""
        verdict = mg.is_minimax_model('image-01')
        assert verdict is True

    def test_known_video_model(self):
        """An exact video-registry name is recognized."""
        verdict = mg.is_minimax_model('MiniMax-Hailuo-2.3')
        assert verdict is True

    def test_known_speech_model(self):
        """An exact speech-registry name is recognized."""
        verdict = mg.is_minimax_model('speech-2.8-hd')
        assert verdict is True

    def test_known_music_model(self):
        """An exact music-registry name is recognized."""
        verdict = mg.is_minimax_model('music-2.5')
        assert verdict is True

    def test_prefix_minimax(self):
        """Unregistered names with a MiniMax- prefix still match."""
        verdict = mg.is_minimax_model('MiniMax-Future-Model')
        assert verdict is True

    def test_prefix_speech(self):
        """Unregistered names with a speech- prefix still match."""
        verdict = mg.is_minimax_model('speech-3.0-ultra')
        assert verdict is True

    def test_prefix_s2v(self):
        """Unregistered names with an S2V- prefix still match."""
        verdict = mg.is_minimax_model('S2V-02')
        assert verdict is True

    def test_non_minimax_model(self):
        """A Gemini model name is rejected."""
        verdict = mg.is_minimax_model('gemini-2.5-flash')
        assert verdict is False

    def test_non_minimax_imagen(self):
        """An Imagen model name is rejected."""
        verdict = mg.is_minimax_model('imagen-4.0-generate-001')
        assert verdict is False
class TestGenerateImage:
    """Test image generation.

    generate_image POSTs a payload, downloads each returned image URL,
    and writes the bytes into the resolved output directory.
    """
    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success(self, mock_post, mock_dir, tmp_path):
        """One URL returned -> one saved image and a success result."""
        mock_dir.return_value = tmp_path
        mock_post.return_value = {
            "data": {"image_urls": ["https://cdn.minimax.io/img1.png"]}
        }
        with patch('requests.get') as mock_req_get:
            mock_resp = Mock()
            # Minimal valid PNG header as the downloaded bytes.
            mock_resp.content = b'\x89PNG\r\n\x1a\n'
            mock_resp.raise_for_status = Mock()
            mock_req_get.return_value = mock_resp
            result = mg.generate_image("key", "A cat", "image-01")
        assert result["status"] == "success"
        assert len(result["generated_images"]) == 1
        assert result["model"] == "image-01"
    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_no_images_returns_error(self, mock_post, mock_dir, tmp_path):
        """An empty image_urls list produces an error result."""
        mock_dir.return_value = tmp_path
        mock_post.return_value = {"data": {"image_urls": []}}
        result = mg.generate_image("key", "A cat", "image-01")
        assert result["status"] == "error"
    @patch('minimax_generate.api_post')
    def test_payload_structure(self, mock_post):
        """The POST payload carries model, prompt, ratio, count, and flags."""
        mock_post.return_value = {"data": {"image_urls": []}}
        mg.generate_image("key", "A dog", "image-01", "16:9", 3)
        # Second positional arg of api_post is the JSON payload.
        payload = mock_post.call_args[0][1]
        assert payload["model"] == "image-01"
        assert payload["prompt"] == "A dog"
        assert payload["aspect_ratio"] == "16:9"
        assert payload["n"] == 3
        assert payload["response_format"] == "url"
        assert payload["prompt_optimizer"] is True
    @patch('minimax_generate.api_post')
    def test_num_images_capped_at_9(self, mock_post):
        """Requests above the API maximum are clamped to n=9."""
        mock_post.return_value = {"data": {"image_urls": []}}
        mg.generate_image("key", "test", "image-01", num_images=15)
        payload = mock_post.call_args[0][1]
        assert payload["n"] == 9
    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_output_copy(self, mock_post, mock_dir, tmp_path):
        """An explicit output path receives a copy of the image bytes."""
        mock_dir.return_value = tmp_path
        mock_post.return_value = {
            "data": {"image_urls": ["https://cdn.minimax.io/img.png"]}
        }
        with patch('requests.get') as mock_req_get:
            mock_resp = Mock()
            mock_resp.content = b'image_bytes'
            mock_resp.raise_for_status = Mock()
            mock_req_get.return_value = mock_resp
            output_path = str(tmp_path / "custom_output.png")
            result = mg.generate_image("key", "test", output=output_path)
            assert Path(output_path).exists()
class TestGenerateVideo:
    """Test video generation (async workflow).

    generate_video submits a task (api_post), polls for completion
    (poll_async_task), then downloads the result file (download_file).
    """
    @patch('minimax_generate.download_file')
    @patch('minimax_generate.poll_async_task')
    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success(self, mock_post, mock_dir, mock_poll, mock_dl, tmp_path):
        """Happy path: submit -> poll -> download -> success result."""
        mock_dir.return_value = tmp_path
        mock_post.return_value = {"task_id": "vid-task-123"}
        mock_poll.return_value = {"file_id": "file-456"}

        # BUG FIX: the previous stub used
        #     lambda fid, key, path, v: Path(path).write_bytes(b'fake_video') or path
        # but Path.write_bytes() returns the number of bytes written (truthy),
        # so the lambda returned that integer instead of the path — unlike the
        # real download_file, which returns the output path. Write the fake
        # file AND return the path explicitly.
        def fake_download(fid, key, path, v):
            Path(path).write_bytes(b'fake_video')
            return path
        mock_dl.side_effect = fake_download

        result = mg.generate_video("key", "A dancer")
        assert result["status"] == "success"
        assert "generated_video" in result
        assert result["model"] == "MiniMax-Hailuo-2.3"
        mock_poll.assert_called_once()
    @patch('minimax_generate.api_post')
    def test_no_task_id_error(self, mock_post):
        """A submit response without task_id yields an error result."""
        mock_post.return_value = {"error": "bad request"}
        result = mg.generate_video("key", "test")
        assert result["status"] == "error"
        assert "No task_id" in result["error"]
    @patch('minimax_generate.poll_async_task')
    @patch('minimax_generate.api_post')
    def test_no_file_id_error(self, mock_post, mock_poll):
        """A completed poll without file_id yields an error result."""
        mock_post.return_value = {"task_id": "t1"}
        mock_poll.return_value = {"status": "Success"}
        result = mg.generate_video("key", "test")
        assert result["status"] == "error"
        assert "No file_id" in result["error"]
    @patch('minimax_generate.api_post')
    def test_payload_with_first_frame(self, mock_post):
        """A first_frame argument is forwarded as first_frame_image."""
        mock_post.return_value = {"task_id": None}
        mg.generate_video("key", "test", first_frame="https://img.url/frame.png")
        payload = mock_post.call_args[0][1]
        assert payload["first_frame_image"] == "https://img.url/frame.png"
    @patch('minimax_generate.api_post')
    def test_payload_duration_resolution(self, mock_post):
        """duration and resolution arguments are forwarded in the payload."""
        mock_post.return_value = {"task_id": None}
        mg.generate_video("key", "test", duration=10, resolution="720P")
        payload = mock_post.call_args[0][1]
        assert payload["duration"] == 10
        assert payload["resolution"] == "720P"
class TestGenerateSpeech:
    """Tests for speech/TTS generation."""

    def test_success(self, tmp_path):
        """Hex-encoded audio is decoded and written into the output dir."""
        with patch('minimax_generate.api_post') as mock_post, \
             patch('minimax_generate.get_output_dir') as mock_dir:
            mock_dir.return_value = tmp_path
            # The API returns the audio as a hex string ("Hello" here).
            mock_post.return_value = {"data": {"audio": "48656c6c6f"}}
            result = mg.generate_speech("key", "Hello world")
        assert result["status"] == "success"
        assert "generated_audio" in result
        assert result["model"] == "speech-2.8-hd"
        written = Path(result["generated_audio"])
        assert written.exists()
        assert written.read_bytes() == bytes.fromhex("48656c6c6f")

    def test_no_audio_returns_error(self):
        """A response without audio data yields an error result."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}}
            result = mg.generate_speech("key", "test")
        assert result["status"] == "error"

    def test_payload_structure(self):
        """Every TTS option is mapped into the request payload."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}}
            mg.generate_speech("key", "Test text", "speech-2.8-turbo",
                              voice="English_Warm_Bestie", emotion="happy",
                              output_format="wav", rate=1.5)
            payload = mock_post.call_args[0][1]
        assert payload["model"] == "speech-2.8-turbo"
        assert payload["text"] == "Test text"
        assert payload["stream"] is False
        assert payload["output_format"] == "hex"
        assert payload["voice_setting"]["voice_id"] == "English_Warm_Bestie"
        assert payload["voice_setting"]["speed"] == 1.5
        assert payload["audio_setting"]["format"] == "wav"
        assert payload["audio_setting"]["sample_rate"] == 32000

    def test_text_truncated_at_10000(self):
        """Input text beyond 10k characters is clipped before sending."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}}
            mg.generate_speech("key", "x" * 15000)
            payload = mock_post.call_args[0][1]
        assert len(payload["text"]) == 10000

    def test_uses_t2a_v2_endpoint(self):
        """Speech generation posts to the t2a_v2 endpoint."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}}
            mg.generate_speech("key", "test")
            assert mock_post.call_args[0][0] == "t2a_v2"

    def test_wav_extension(self, tmp_path):
        """Requesting wav output produces a .wav filename."""
        with patch('minimax_generate.api_post') as mock_post, \
             patch('minimax_generate.get_output_dir') as mock_dir:
            mock_dir.return_value = tmp_path
            mock_post.return_value = {"data": {"audio": "aabb"}}
            result = mg.generate_speech("key", "test", output_format="wav")
        assert result["generated_audio"].endswith(".wav")

    def test_pcm_defaults_to_mp3_ext(self, tmp_path):
        """pcm output falls back to a .mp3 filename."""
        with patch('minimax_generate.api_post') as mock_post, \
             patch('minimax_generate.get_output_dir') as mock_dir:
            mock_dir.return_value = tmp_path
            mock_post.return_value = {"data": {"audio": "aabb"}}
            result = mg.generate_speech("key", "test", output_format="pcm")
        assert result["generated_audio"].endswith(".mp3")
class TestGenerateMusic:
    """Tests for music generation."""

    def test_success_with_url(self, tmp_path):
        """URL-form audio is downloaded and duration metadata surfaced."""
        with patch('minimax_generate.api_post') as mock_post, \
             patch('minimax_generate.get_output_dir') as mock_dir, \
             patch('requests.get') as mock_req_get:
            mock_dir.return_value = tmp_path
            mock_post.return_value = {
                "data": {"audio": "https://cdn.minimax.io/music.mp3"},
                "extra_info": {"music_duration": 120000}
            }
            fake_resp = Mock()
            fake_resp.content = b'music_data'
            fake_resp.raise_for_status = Mock()
            mock_req_get.return_value = fake_resp
            result = mg.generate_music("key", lyrics="La la la", prompt="pop")
        assert result["status"] == "success"
        assert result["duration_ms"] == 120000
        assert result["model"] == "music-2.5"

    def test_success_with_hex(self, tmp_path):
        """Hex-form audio is decoded and written verbatim to disk."""
        with patch('minimax_generate.api_post') as mock_post, \
             patch('minimax_generate.get_output_dir') as mock_dir:
            mock_dir.return_value = tmp_path
            mock_post.return_value = {
                "data": {"audio": "deadbeef"},
                "extra_info": {"music_duration": 60000}
            }
            result = mg.generate_music("key", lyrics="test")
        assert result["status"] == "success"
        written = Path(result["generated_audio"])
        assert written.read_bytes() == bytes.fromhex("deadbeef")

    def test_no_audio_returns_error(self):
        """A response without audio data yields an error result."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}, "extra_info": {}}
            result = mg.generate_music("key", lyrics="test")
        assert result["status"] == "error"

    def test_payload_structure(self):
        """Lyrics, prompt, model and audio options all reach the payload."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}, "extra_info": {}}
            mg.generate_music("key", lyrics="Verse 1\nHello",
                              prompt="upbeat pop", model="music-2.5",
                              output_format="wav")
            payload = mock_post.call_args[0][1]
        assert payload["model"] == "music-2.5"
        assert payload["lyrics"] == "Verse 1\nHello"
        assert payload["prompt"] == "upbeat pop"
        assert payload["output_format"] == "url"
        assert payload["audio_setting"]["format"] == "wav"
        assert payload["audio_setting"]["sample_rate"] == 44100

    def test_lyrics_truncated_at_3500(self):
        """Lyrics longer than 3500 characters are clipped."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}, "extra_info": {}}
            mg.generate_music("key", lyrics="x" * 5000)
            payload = mock_post.call_args[0][1]
        assert len(payload["lyrics"]) == 3500

    def test_prompt_truncated_at_2000(self):
        """Prompts longer than 2000 characters are clipped."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}, "extra_info": {}}
            mg.generate_music("key", prompt="y" * 3000)
            payload = mock_post.call_args[0][1]
        assert len(payload["prompt"]) == 2000

    def test_uses_300s_timeout(self):
        """Music generation passes a 300-second timeout to api_post."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}, "extra_info": {}}
            mg.generate_music("key", lyrics="test")
            _, kwargs = mock_post.call_args
        assert kwargs.get('timeout') == 300

    def test_empty_lyrics_omitted(self):
        """An empty lyrics string is dropped from the payload entirely."""
        with patch('minimax_generate.api_post') as mock_post:
            mock_post.return_value = {"data": {}, "extra_info": {}}
            mg.generate_music("key", lyrics="", prompt="jazz")
            payload = mock_post.call_args[0][1]
        assert "lyrics" not in payload
        assert payload["prompt"] == "jazz"