init
This commit is contained in:
230
.opencode/skills/ai-multimodal/.env.example
Normal file
230
.opencode/skills/ai-multimodal/.env.example
Normal file
@@ -0,0 +1,230 @@
|
||||
# Google Gemini API Configuration
|
||||
|
||||
# ============================================================================
|
||||
# OPTION 1: Google AI Studio (Default - Recommended for most users)
|
||||
# ============================================================================
|
||||
# Get your API key: https://aistudio.google.com/apikey
|
||||
GEMINI_API_KEY=your_api_key_here
|
||||
|
||||
# ============================================================================
|
||||
# API Key Rotation (Optional - For high-volume usage)
|
||||
# ============================================================================
|
||||
# Add multiple API keys for automatic rotation on rate limit errors.
|
||||
# Free tier accounts are heavily rate-limited; rotation helps distribute load.
|
||||
#
|
||||
# Format: GEMINI_API_KEY_N where N is 2, 3, 4, etc.
|
||||
# The primary GEMINI_API_KEY is always used first.
|
||||
#
|
||||
# GEMINI_API_KEY_2=your_second_api_key
|
||||
# GEMINI_API_KEY_3=your_third_api_key
|
||||
# GEMINI_API_KEY_4=your_fourth_api_key
|
||||
#
|
||||
# Features:
|
||||
# - Auto-rotates on RESOURCE_EXHAUSTED / 429 errors
|
||||
# - 60-second cooldown per key after rate limit
|
||||
# - Logs rotation events with --verbose flag
|
||||
# - Backward compatible: single key still works
|
||||
|
||||
# ============================================================================
|
||||
# OPTION 2: Vertex AI (Google Cloud Platform)
|
||||
# ============================================================================
|
||||
# Uncomment these lines to use Vertex AI instead of Google AI Studio
|
||||
# GEMINI_USE_VERTEX=true
|
||||
# VERTEX_PROJECT_ID=your-gcp-project-id
|
||||
# VERTEX_LOCATION=us-central1
|
||||
|
||||
# ============================================================================
|
||||
# Model Selection (Optional)
|
||||
# ============================================================================
|
||||
# Override default models for specific capabilities
|
||||
# If not set, intelligent defaults are used based on task type
|
||||
|
||||
# --- Image Generation ---
|
||||
# Used by: --task generate (image)
|
||||
# Default: gemini-2.5-flash-image (Nano Banana Flash - fast, cost-effective)
|
||||
# Alternative: imagen-4.0-generate-001 (production quality)
|
||||
# NOTE: All image generation requires billing - no free tier available (limit: 0)
|
||||
# Options:
|
||||
# gemini-2.5-flash-image - Nano Banana Flash: fast, ~$1/1M tokens (DEFAULT)
|
||||
# gemini-3-pro-image-preview - Nano Banana Pro: 4K text, reasoning (requires billing)
|
||||
# imagen-4.0-generate-001 - Imagen 4 Standard: production quality (~$0.02/image)
|
||||
# imagen-4.0-ultra-generate-001 - Imagen 4 Ultra: maximum quality (~$0.04/image)
|
||||
# imagen-4.0-fast-generate-001 - Imagen 4 Fast: speed-optimized (~$0.01/image)
|
||||
# IMAGE_GEN_MODEL=gemini-2.5-flash-image
|
||||
|
||||
# --- Video Generation ---
|
||||
# Used by: --task generate-video (new capability)
|
||||
# Default: veo-3.1-generate-preview
|
||||
# NOTE: Video generation requires billing - no free tier fallback available
|
||||
# Options:
|
||||
# veo-3.1-generate-preview - Latest, native audio, frame control (requires billing)
|
||||
# veo-3.1-fast-generate-preview - Speed-optimized for business (requires billing)
|
||||
# veo-3.0-generate-001 - Stable, native audio, 8s videos (requires billing)
|
||||
# veo-3.0-fast-generate-001 - Stable fast variant (requires billing)
|
||||
# VIDEO_GEN_MODEL=veo-3.1-generate-preview
|
||||
|
||||
# --- Multimodal Analysis ---
|
||||
# Used by: --task analyze, transcribe, extract
|
||||
# Default: gemini-2.5-flash
|
||||
# Options:
|
||||
# gemini-3-pro-preview - Latest, agentic workflows, 1M context
|
||||
# gemini-2.5-flash - Best price/performance (recommended)
|
||||
# gemini-2.5-pro - Highest quality
|
||||
# MULTIMODAL_MODEL=gemini-2.5-flash
|
||||
|
||||
# --- Legacy Compatibility ---
|
||||
# Generic model override (use specific variables above instead)
|
||||
# GEMINI_MODEL=gemini-2.5-flash
|
||||
# GEMINI_IMAGE_GEN_MODEL=gemini-2.5-flash-image
|
||||
|
||||
# ============================================================================
|
||||
# MiniMax API Configuration (Optional - for image/video/speech/music generation)
|
||||
# ============================================================================
|
||||
# Get your API key: https://platform.minimax.io/user-center/basic-information/interface-key
|
||||
# MINIMAX_API_KEY=your_minimax_api_key_here
|
||||
|
||||
# --- MiniMax Image Generation ---
|
||||
# Models: image-01 (standard), image-01-live (enhanced)
|
||||
# Cost: ~$0.03/image | Rate: 10 RPM
|
||||
# MINIMAX_IMAGE_MODEL=image-01
|
||||
|
||||
# --- MiniMax Video Generation (Hailuo) ---
|
||||
# Models: MiniMax-Hailuo-2.3, MiniMax-Hailuo-2.3-Fast, MiniMax-Hailuo-02, S2V-01
|
||||
# Cost: $0.25-0.52/video | Rate: 5 RPM
|
||||
# MINIMAX_VIDEO_MODEL=MiniMax-Hailuo-2.3
|
||||
|
||||
# --- MiniMax Speech/TTS ---
|
||||
# Models: speech-2.8-hd (best), speech-2.8-turbo (fast)
|
||||
# Cost: $30-50/1M chars | Rate: 60 RPM | 300+ voices, 40+ languages
|
||||
# MINIMAX_SPEECH_MODEL=speech-2.8-hd
|
||||
|
||||
# --- MiniMax Music Generation ---
|
||||
# Models: music-2.5 (4-minute songs with vocals)
|
||||
# Cost: $0.03-0.075/gen | Rate: 120 RPM
|
||||
# MINIMAX_MUSIC_MODEL=music-2.5
|
||||
|
||||
# ============================================================================
|
||||
# Rate Limiting Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Requests per minute limit (adjust based on your tier)
|
||||
# GEMINI_RPM_LIMIT=15
|
||||
|
||||
# Tokens per minute limit
|
||||
# GEMINI_TPM_LIMIT=4000000
|
||||
|
||||
# Requests per day limit
|
||||
# GEMINI_RPD_LIMIT=1500
|
||||
|
||||
# ============================================================================
|
||||
# Video Generation Options (Optional)
|
||||
# ============================================================================
|
||||
# Video duration in seconds (8s only for now)
|
||||
# VEO_DURATION=8
|
||||
|
||||
# Video resolution: 720p or 1080p
|
||||
# VEO_RESOLUTION=1080p
|
||||
|
||||
# Aspect ratio: 16:9, 9:16, 1:1 (16:9 is default)
|
||||
# VEO_ASPECT_RATIO=16:9
|
||||
|
||||
# Frame rate: 24fps (fixed for now)
|
||||
# VEO_FPS=24
|
||||
|
||||
# Enable native audio generation
|
||||
# VEO_AUDIO=true
|
||||
|
||||
# ============================================================================
|
||||
# Image Generation Options (Optional)
|
||||
# ============================================================================
|
||||
# Number of images to generate (1-4)
|
||||
# IMAGEN_NUM_IMAGES=1
|
||||
|
||||
# Image size: 1K or 2K (Ultra/Standard only)
|
||||
# IMAGEN_SIZE=1K
|
||||
|
||||
# Aspect ratio: 1:1, 16:9, 9:16, 4:3, 3:4
|
||||
# IMAGEN_ASPECT_RATIO=1:1
|
||||
|
||||
# Enable person generation (restricted in EEA, CH, UK)
|
||||
# IMAGEN_PERSON_GENERATION=true
|
||||
|
||||
# Add SynthID watermark (always enabled by default)
|
||||
# IMAGEN_WATERMARK=true
|
||||
|
||||
# ============================================================================
|
||||
# Processing Options (Optional)
|
||||
# ============================================================================
|
||||
# Video resolution mode: default or low-res
|
||||
# low-res uses ~100 tokens/second vs ~300 for default
|
||||
# GEMINI_VIDEO_RESOLUTION=default
|
||||
|
||||
# Audio quality: default (16 Kbps mono, auto-downsampled)
|
||||
# GEMINI_AUDIO_QUALITY=default
|
||||
|
||||
# PDF processing mode: inline (<20MB) or file-api (>20MB, automatic)
|
||||
# GEMINI_PDF_MODE=auto
|
||||
|
||||
# ============================================================================
|
||||
# Retry Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Maximum retry attempts for failed requests
|
||||
# GEMINI_MAX_RETRIES=3
|
||||
|
||||
# Initial retry delay in seconds (uses exponential backoff)
|
||||
# GEMINI_RETRY_DELAY=1
|
||||
|
||||
# ============================================================================
|
||||
# Output Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Default output directory for generated images
|
||||
# OUTPUT_DIR=./output
|
||||
|
||||
# Image output format (png or jpeg)
|
||||
# IMAGE_FORMAT=png
|
||||
|
||||
# Image quality for JPEG (1-100)
|
||||
# IMAGE_QUALITY=95
|
||||
|
||||
# ============================================================================
|
||||
# Context Caching (Optional)
|
||||
# ============================================================================
|
||||
# Enable context caching for repeated queries on same file
|
||||
# GEMINI_ENABLE_CACHING=true
|
||||
|
||||
# Cache TTL in seconds (default: 1800 = 30 minutes)
|
||||
# GEMINI_CACHE_TTL=1800
|
||||
|
||||
# ============================================================================
|
||||
# Logging (Optional)
|
||||
# ============================================================================
|
||||
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
# LOG_LEVEL=INFO
|
||||
|
||||
# Log file path
|
||||
# LOG_FILE=./logs/gemini.log
|
||||
|
||||
# ============================================================================
|
||||
# Pricing Reference (as of 2025-11)
|
||||
# ============================================================================
|
||||
# Gemini 2.5 Flash: $1.00/1M input, $0.10/1M output
|
||||
# Gemini 2.5 Pro: $3.00/1M input, $12.00/1M output
|
||||
# Gemini 3 Pro: $2.00/1M input (<200k), $4.00 (>200k), $12/$18 output
|
||||
# Imagen 4: ~$0.01-$0.04 per image (varies by variant)
|
||||
# Veo 3: TBD (preview pricing)
|
||||
# Monitor: https://ai.google.dev/pricing
|
||||
|
||||
# ============================================================================
|
||||
# Notes
|
||||
# ============================================================================
|
||||
# 1. Never commit API keys to version control
|
||||
# 2. Add .env to .gitignore
|
||||
# 3. API keys can be restricted in Google Cloud Console
|
||||
# 4. Monitor usage at: https://aistudio.google.com/apikey
|
||||
# 5. Free tier limits: 15 RPM, 1M-4M TPM, 1,500 RPD
|
||||
# 6. Vertex AI requires GCP authentication via gcloud CLI
|
||||
# 7. Model defaults (Dec 2025):
|
||||
# - Image gen: gemini-2.5-flash-image (Nano Banana Flash - default)
|
||||
# - Image gen: imagen-4.0-generate-001 (alternative for production)
|
||||
# - Video gen: veo-3.1-generate-preview
|
||||
# - Analysis: gemini-2.5-flash
|
||||
# 8. Preview models (veo-3.1, gemini-3) may have API changes
|
||||
112
.opencode/skills/ai-multimodal/SKILL.md
Normal file
112
.opencode/skills/ai-multimodal/SKILL.md
Normal file
@@ -0,0 +1,112 @@
|
||||
---
|
||||
name: ck:ai-multimodal
|
||||
description: Analyze images/audio/video with Gemini API (better vision than Claude). Generate images (Imagen 4, Nano Banana 2, MiniMax), videos (Veo 3, Hailuo), speech (MiniMax TTS), music (MiniMax). Use for vision analysis, transcription, OCR, design extraction, multimodal AI.
|
||||
license: MIT
|
||||
allowed-tools:
|
||||
- Bash
|
||||
- Read
|
||||
- Write
|
||||
- Edit
|
||||
argument-hint: "[file-path] [prompt]"
|
||||
---
|
||||
|
||||
# AI Multimodal
|
||||
|
||||
Process audio, images, videos, documents using Gemini. Generate images, videos, speech, music via Gemini + MiniMax.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
# Google Gemini (analysis + image/video gen)
|
||||
export GEMINI_API_KEY="your-key" # https://aistudio.google.com/apikey
|
||||
# MiniMax (image/video/speech/music gen)
|
||||
export MINIMAX_API_KEY="your-key" # https://platform.minimax.io/user-center/basic-information/interface-key
|
||||
pip install google-genai python-dotenv pillow requests
|
||||
```
|
||||
|
||||
### API Key Rotation (Optional)
|
||||
|
||||
For high-volume Gemini usage, configure multiple keys:
|
||||
|
||||
```bash
|
||||
export GEMINI_API_KEY="key1"
|
||||
export GEMINI_API_KEY_2="key2" # auto-rotates on rate limit
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
**Verify setup**: `python scripts/check_setup.py`
|
||||
**Analyze media**: `python scripts/gemini_batch_process.py --files <file> --task <analyze|transcribe|extract>`
|
||||
- TIP: When you're asked to analyze an image, check if `gemini` command is available, then use `echo "<prompt to analyze image>" | gemini -y -m <gemini.model>` command (read model from `$HOME/.opencode/.ck.json`: `gemini.model`). If `gemini` command is not available, use `python scripts/gemini_batch_process.py --files <file> --task analyze` command.
|
||||
**Generate (Gemini)**: `python scripts/gemini_batch_process.py --task <generate|generate-video> --prompt "desc"`
|
||||
**Generate (MiniMax)**: `python scripts/minimax_cli.py --task <generate|generate-video|generate-speech|generate-music> --prompt "desc"`
|
||||
|
||||
> **Stdin support**: Pipe files via stdin for Gemini analysis (auto-detects PNG/JPG/PDF/WAV/MP3).
|
||||
|
||||
## Models
|
||||
|
||||
### Google Gemini / Imagen
|
||||
- **Image gen**: `gemini-3.1-flash-image-preview` (Nano Banana 2 - DEFAULT), `gemini-2.5-flash-image` (Flash), `gemini-3-pro-image-preview` (Pro 4K), `imagen-4.0-generate-001` (standard), `imagen-4.0-ultra-generate-001` (quality), `imagen-4.0-fast-generate-001` (speed)
|
||||
- **Video gen**: `veo-3.1-generate-preview` (8s clips with audio)
|
||||
- **Analysis**: `gemini-2.5-flash` (recommended), `gemini-2.5-pro` (advanced)
|
||||
|
||||
### MiniMax (NEW)
|
||||
- **Image gen**: `image-01` (standard), `image-01-live` (enhanced) - $0.03/image, 1-9 batch
|
||||
- **Video gen (Hailuo)**: `MiniMax-Hailuo-2.3` (1080p), `MiniMax-Hailuo-2.3-Fast` (50% cheaper), `MiniMax-Hailuo-02` (first+last frame), `S2V-01` (subject ref)
|
||||
- **Speech/TTS**: `speech-2.8-hd` (best), `speech-2.8-turbo` (fast) - 300+ voices, 40+ languages, emotion control
|
||||
- **Music**: `music-2.5` - 4-minute songs with vocals, synchronized lyrics
|
||||
|
||||
## Scripts
|
||||
|
||||
- **`gemini_batch_process.py`**: Gemini CLI for `transcribe|analyze|extract|generate|generate-video`. Auto-resolves API keys, Imagen 4 + Veo + Nano Banana workflows.
|
||||
- **`minimax_cli.py`**: MiniMax CLI for `generate|generate-video|generate-speech|generate-music`. Supports all MiniMax models.
|
||||
- **`minimax_generate.py`**: MiniMax generation functions (image, video, speech, music). Library for programmatic use.
|
||||
- **`minimax_api_client.py`**: MiniMax HTTP client, auth, async polling, file download utilities.
|
||||
- **`media_optimizer.py`**: ffmpeg/Pillow preflight: compress/resize/convert media to stay within API limits.
|
||||
- **`document_converter.py`**: Gemini-powered PDF/image/Office → markdown converter.
|
||||
- **`check_setup.py`**: Setup checker for API keys and dependencies.
|
||||
|
||||
Use `--help` for options.
|
||||
|
||||
## References
|
||||
|
||||
Load for detailed guidance:
|
||||
|
||||
| Topic | File | Description |
|
||||
|-------|------|-------------|
|
||||
| Music | `references/music-generation.md` | Lyria RealTime API for background music generation, style prompts, real-time control, integration with video production. |
|
||||
| Audio | `references/audio-processing.md` | Audio formats and limits, transcription (timestamps, speakers, segments), non-speech analysis, File API vs inline input, TTS models, best practices, cost and token math, and concrete meeting/podcast/interview recipes. |
|
||||
| Images | `references/vision-understanding.md` | Vision capabilities overview, supported formats and models, captioning/classification/VQA, detection and segmentation, OCR and document reading, multi-image workflows, structured JSON output, token costs, best practices, and common product/screenshot/chart/scene use cases. |
|
||||
| Image Gen | `references/image-generation.md` | Imagen 4 and Gemini image model overview, generate_images vs generate_content APIs, aspect ratios and costs, text/image/both modalities, editing and composition, style and quality control, safety settings, best practices, troubleshooting, and common marketing/concept-art/UI scenarios. |
|
||||
| Video | `references/video-analysis.md` | Video analysis capabilities and supported formats, model/context choices, local/inline/YouTube inputs, clipping and FPS control, multi-video comparison, temporal Q&A and scene detection, transcription with visual context, token and cost guidance, and optimization/best-practice patterns. |
|
||||
| Video Gen | `references/video-generation.md` | Veo model matrix, text-to-video and image-to-video quick start, multi-reference and extension flows, camera and timing control, configuration (resolution, aspect, audio, safety), prompt design patterns, performance tips, limitations, troubleshooting, and cost estimates. |
|
||||
| MiniMax | `references/minimax-generation.md` | MiniMax image (image-01), video (Hailuo 2.3), speech (TTS 2.8), and music (2.5) generation APIs. Endpoints, models, parameters, async workflows, pricing, rate limits, voice library, and examples. |
|
||||
|
||||
## Limits
|
||||
|
||||
**Formats**: Audio (WAV/MP3/AAC, 9.5h), Images (PNG/JPEG/WEBP, 3,600 images max), Video (MP4/MOV, 6h), PDF (1k pages)
|
||||
**Size**: 20MB inline, 2GB File API
|
||||
**Important:**
|
||||
- If you are going to generate a transcript of the audio, and the audio length is longer than 15 minutes, the transcript often gets truncated due to output token limits in the Gemini API response. To get the full transcript, you need to split the audio into smaller chunks (max 15 minutes per chunk) and transcribe each segment for a complete transcript.
|
||||
- If you are going to generate a transcript of the video and the video length is longer than 15 minutes, use ffmpeg to extract the audio from the video, split the audio into segments of at most 15 minutes each, transcribe each segment, and then combine the transcripts into a single transcript.
|
||||
**Transcription Output Requirements:**
|
||||
- Format: Markdown
|
||||
- Metadata: Duration, file size, generated date, description, file name, topics covered, etc.
|
||||
- Parts: from-to (e.g., 00:00-00:15), audio chunk name, transcript, status, etc.
|
||||
- Transcript format:
|
||||
```
|
||||
[HH:MM:SS -> HH:MM:SS] transcript content
|
||||
[HH:MM:SS -> HH:MM:SS] transcript content
|
||||
...
|
||||
```
|
||||
|
||||
## Outputs
|
||||
|
||||
**IMPORTANT:** Invoke "/ck:project-organization" skill to organize the outputs.
|
||||
|
||||
## Resources
|
||||
|
||||
- [Gemini API Docs](https://ai.google.dev/gemini-api/docs/)
|
||||
- [Gemini Pricing](https://ai.google.dev/pricing)
|
||||
- [MiniMax API Docs](https://platform.minimax.io/docs/api-reference/api-overview)
|
||||
- [MiniMax Pricing](https://platform.minimax.io/pricing)
|
||||
387
.opencode/skills/ai-multimodal/references/audio-processing.md
Normal file
387
.opencode/skills/ai-multimodal/references/audio-processing.md
Normal file
@@ -0,0 +1,387 @@
|
||||
# Audio Processing Reference
|
||||
|
||||
Comprehensive guide for audio analysis and speech generation using Gemini API.
|
||||
|
||||
## Audio Understanding
|
||||
|
||||
### Supported Formats
|
||||
|
||||
| Format | MIME Type | Best Use |
|
||||
|--------|-----------|----------|
|
||||
| WAV | `audio/wav` | Uncompressed, highest quality |
|
||||
| MP3 | `audio/mp3` | Compressed, widely compatible |
|
||||
| AAC | `audio/aac` | Compressed, good quality |
|
||||
| FLAC | `audio/flac` | Lossless compression |
|
||||
| OGG Vorbis | `audio/ogg` | Open format |
|
||||
| AIFF | `audio/aiff` | Apple format |
|
||||
|
||||
### Specifications
|
||||
|
||||
- **Maximum length**: 9.5 hours per request
|
||||
- **Multiple files**: Unlimited count, combined max 9.5 hours
|
||||
- **Token rate**: 32 tokens/second (1 minute = 1,920 tokens)
|
||||
- **Processing**: Auto-downsampled to 16 Kbps mono
|
||||
- **File size limits**:
|
||||
- Inline: 20 MB max total request
|
||||
- File API: 2 GB per file, 20 GB project quota
|
||||
- Retention: 48 hours auto-delete
|
||||
- **Important:** If you are going to generate a transcript of the audio, and the audio length is longer than 15 minutes, the transcript often gets truncated due to output token limits in the Gemini API response. To get the full transcript, you need to split the audio into smaller chunks (max 15 minutes per chunk) and transcribe each segment for a complete transcript.
|
||||
|
||||
## Transcription
|
||||
|
||||
### Basic Transcription
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Upload audio
|
||||
myfile = client.files.upload(file='meeting.mp3')
|
||||
|
||||
# Transcribe
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Generate a transcript of the speech.', myfile]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### With Timestamps
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Generate transcript with timestamps in MM:SS format.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-Speaker Identification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe with speaker labels. Format: [Speaker 1], [Speaker 2], etc.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Segment-Specific Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe only the segment from 02:30 to 05:15.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Audio Analysis
|
||||
|
||||
### Summarization
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize key points in 5 bullets with timestamps.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Non-Speech Audio Analysis
|
||||
|
||||
```python
|
||||
# Music analysis
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify the musical instruments and genre.', myfile]
|
||||
)
|
||||
|
||||
# Environmental sounds
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify all sounds: voices, music, ambient noise.', myfile]
|
||||
)
|
||||
|
||||
# Birdsong identification
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify bird species based on their calls.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Timestamp-Based Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What is discussed from 10:30 to 15:45? Provide key points.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Input Methods
|
||||
|
||||
### File Upload (>20MB or Reuse)
|
||||
|
||||
```python
|
||||
# Upload once, use multiple times
|
||||
myfile = client.files.upload(file='large-audio.mp3')
|
||||
|
||||
# First query
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe this', myfile]
|
||||
)
|
||||
|
||||
# Second query (reuses same file)
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize this', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Inline Data (<20MB)
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
with open('small-audio.mp3', 'rb') as f:
|
||||
audio_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Describe this audio',
|
||||
types.Part.from_bytes(data=audio_bytes, mime_type='audio/mp3')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Speech Generation (TTS)
|
||||
|
||||
### Available Models
|
||||
|
||||
| Model | Quality | Speed | Cost/1M tokens |
|
||||
|-------|---------|-------|----------------|
|
||||
| `gemini-2.5-flash-native-audio-preview-09-2025` | High | Fast | $10 |
|
||||
| `gemini-2.5-pro` TTS mode | Premium | Slower | $20 |
|
||||
|
||||
### Basic TTS
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio: Welcome to today\'s episode.'
|
||||
)
|
||||
|
||||
# Save audio
|
||||
with open('output.wav', 'wb') as f:
|
||||
f.write(response.audio_data)
|
||||
```
|
||||
|
||||
### Controllable Voice Style
|
||||
|
||||
```python
|
||||
# Professional tone
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a professional, clear tone: Welcome to our quarterly earnings call.'
|
||||
)
|
||||
|
||||
# Casual and friendly
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a friendly, conversational tone: Hey there! Let\'s dive into today\'s topic.'
|
||||
)
|
||||
|
||||
# Narrative style
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a narrative, storytelling tone: Once upon a time, in a land far away...'
|
||||
)
|
||||
```
|
||||
|
||||
### Voice Control Parameters
|
||||
|
||||
- **Style**: Professional, casual, narrative, conversational
|
||||
- **Pace**: Slow, normal, fast
|
||||
- **Tone**: Friendly, serious, enthusiastic
|
||||
- **Accent**: Natural language control (e.g., "British accent", "Southern drawl")
|
||||
|
||||
## Best Practices
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for files >20MB
|
||||
2. Use File API for repeated queries (saves tokens)
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually when done:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Effective prompts**:
|
||||
- "Transcribe from 02:30 to 03:29 in MM:SS format"
|
||||
- "Identify speakers and extract dialogue with timestamps"
|
||||
- "Summarize key points with relevant timestamps"
|
||||
- "Transcribe and analyze sentiment for each speaker"
|
||||
|
||||
**Context improves accuracy**:
|
||||
- "This is a medical interview - use appropriate terminology"
|
||||
- "Transcribe this legal deposition with precise terminology"
|
||||
- "This is a technical podcast about machine learning"
|
||||
|
||||
**Combined tasks**:
|
||||
- "Transcribe and summarize in bullet points"
|
||||
- "Extract key quotes with timestamps and speaker labels"
|
||||
- "Transcribe and identify action items with timestamps"
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token calculation**:
|
||||
- 1 minute audio = 1,920 tokens
|
||||
- 1 hour audio = 115,200 tokens
|
||||
- 9.5 hours = 1,094,400 tokens
|
||||
|
||||
**Model selection**:
|
||||
- Use `gemini-2.5-flash` ($1/1M tokens) for most tasks
|
||||
- Upgrade to `gemini-2.5-pro` ($3/1M tokens) for complex analysis
|
||||
- For high-volume: `gemini-1.5-flash` ($0.70/1M tokens)
|
||||
|
||||
**Reduce costs**:
|
||||
- Process only relevant segments using timestamps
|
||||
- Use lower-quality audio when possible
|
||||
- Batch multiple short files in one request
|
||||
- Cache context for repeated queries
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def transcribe_with_retry(file_path, max_retries=3):
|
||||
"""Transcribe audio with exponential backoff retry"""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
myfile = client.files.upload(file=file_path)
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe with timestamps', myfile]
|
||||
)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
wait_time = 2 ** attempt
|
||||
print(f"Retry {attempt + 1} after {wait_time}s")
|
||||
time.sleep(wait_time)
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Meeting Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Transcribe this meeting with:
|
||||
1. Speaker labels
|
||||
2. Timestamps for topic changes
|
||||
3. Action items highlighted
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Podcast Summary
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create podcast summary with:
|
||||
1. Main topics with timestamps
|
||||
2. Key quotes from each speaker
|
||||
3. Recommended episode highlights
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Interview Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze interview:
|
||||
1. Questions asked with timestamps
|
||||
2. Key responses from interviewee
|
||||
3. Overall sentiment and tone
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Content Verification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Verify audio content:
|
||||
1. Check for specific keywords or phrases
|
||||
2. Identify any compliance issues
|
||||
3. Note any concerning statements with timestamps
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Multilingual Transcription
|
||||
|
||||
```python
|
||||
# Gemini auto-detects language
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe this audio and translate to English if needed.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Token Costs
|
||||
|
||||
**Audio Input** (32 tokens/second):
|
||||
- 1 minute = 1,920 tokens
|
||||
- 10 minutes = 19,200 tokens
|
||||
- 1 hour = 115,200 tokens
|
||||
- 9.5 hours = 1,094,400 tokens
|
||||
|
||||
**Example costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- 1 hour audio: 115,200 tokens = $0.12
|
||||
- Full day podcast (8 hours): 921,600 tokens = $0.92
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 9.5 hours per request
|
||||
- Auto-downsampled to 16 Kbps mono (quality loss)
|
||||
- Files expire after 48 hours
|
||||
- No real-time streaming support
|
||||
- Non-speech audio less accurate than speech
|
||||
|
||||
---
|
||||
|
||||
## Related References
|
||||
|
||||
**Current**: Audio Processing
|
||||
|
||||
**Related Capabilities**:
|
||||
- [Video Analysis](./video-analysis.md) - Extract audio from videos
|
||||
- [Video Generation](./video-generation.md) - Generate videos with native audio
|
||||
- [Image Understanding](./vision-understanding.md) - Analyze audio with visual context
|
||||
|
||||
**Back to**: [AI Multimodal Skill](../SKILL.md)
|
||||
1002
.opencode/skills/ai-multimodal/references/image-generation.md
Normal file
1002
.opencode/skills/ai-multimodal/references/image-generation.md
Normal file
File diff suppressed because it is too large
Load Diff
141
.opencode/skills/ai-multimodal/references/minimax-generation.md
Normal file
141
.opencode/skills/ai-multimodal/references/minimax-generation.md
Normal file
@@ -0,0 +1,141 @@
|
||||
# MiniMax Generation Reference
|
||||
|
||||
## Overview
|
||||
|
||||
MiniMax provides image, video (Hailuo), speech (TTS), and music generation APIs.
|
||||
Base URL: `https://api.minimax.io/v1` | Auth: `Bearer {MINIMAX_API_KEY}`
|
||||
|
||||
## Image Generation
|
||||
|
||||
**Endpoint**: `POST /image_generation`
|
||||
**Models**: `image-01` (standard), `image-01-live` (enhanced)
|
||||
**Rate**: 10 RPM | **Cost**: ~$0.03/image
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "image-01",
|
||||
"prompt": "A girl looking into the distance",
|
||||
"aspect_ratio": "16:9",
|
||||
"n": 2,
|
||||
"response_format": "url",
|
||||
"prompt_optimizer": true,
|
||||
"subject_reference": [{"type": "character", "image_file": "url", "weight": 0.8}]
|
||||
}
|
||||
```
|
||||
|
||||
**Aspect ratios**: 1:1, 16:9, 4:3, 3:2, 2:3, 3:4, 9:16, 21:9
|
||||
**Custom dims**: 512-2048px (divisible by 8)
|
||||
**Batch**: 1-9 images per request
|
||||
|
||||
## Video Generation (Hailuo)
|
||||
|
||||
**Endpoints**: POST `/video_generation` → GET `/query/video_generation` → GET `/files/retrieve`
|
||||
**Async workflow**: Submit task → poll every 10s → download file (URL valid 9h)
|
||||
|
||||
### Models
|
||||
| Model | Features | Resolution |
|
||||
|-------|----------|-----------|
|
||||
| `MiniMax-Hailuo-2.3` | Text/image-to-video | 720p/1080p |
|
||||
| `MiniMax-Hailuo-2.3-Fast` | Same, 50% faster+cheaper | 720p/1080p |
|
||||
| `MiniMax-Hailuo-02` | First+last frame mode | 720p |
|
||||
| `S2V-01` | Subject reference | 720p |
|
||||
|
||||
**Rate**: 5 RPM | **Cost**: $0.25 (6s/768p), $0.52 (10s/768p)
|
||||
|
||||
```json
|
||||
// Text-to-video
|
||||
{"prompt": "A dancer", "model": "MiniMax-Hailuo-2.3", "duration": 6, "resolution": "1080P"}
|
||||
|
||||
// Image-to-video
|
||||
{"prompt": "Scene desc", "first_frame_image": "url", "model": "MiniMax-Hailuo-2.3", "duration": 6}
|
||||
|
||||
// First+last frame
|
||||
{"prompt": "Transition", "first_frame_image": "url", "last_frame_image": "url", "model": "MiniMax-Hailuo-02"}
|
||||
|
||||
// Subject reference
|
||||
{"prompt": "Scene with character", "subject_reference": [{"type": "character", "image": ["url"]}], "model": "S2V-01"}
|
||||
```
|
||||
|
||||
## Speech/TTS
|
||||
|
||||
**Endpoint**: `POST /speech/speech_t2a_input`
|
||||
**Models**: `speech-2.8-hd` (best), `speech-2.8-turbo` (fast), `speech-2.6-hd/turbo`, `speech-02-hd/turbo`
|
||||
**Rate**: 60 RPM | **Cost**: $30-50/1M chars
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "speech-2.8-hd",
|
||||
"text": "Your text here",
|
||||
"voice": "English_Warm_Bestie",
|
||||
"emotion": "happy",
|
||||
"rate": 1.0,
|
||||
"volume": 1.0,
|
||||
"pitch": 1.0,
|
||||
"output_format": "mp3"
|
||||
}
|
||||
```
|
||||
|
||||
**Voices**: 300+ system voices, 40+ languages
|
||||
**Emotions**: happy, sad, angry, fearful, disgusted, surprised, neutral
|
||||
**Formats**: mp3, wav, pcm, flac
|
||||
**Text limit**: 10,000 chars
|
||||
|
||||
### Voice Cloning
|
||||
```json
|
||||
POST /voice_clone
|
||||
{"audio_url": "https://sample.wav", "clone_name": "my_voice"}
|
||||
```
|
||||
Requires 10+ seconds of reference audio. Rate: 60 RPM.
|
||||
|
||||
## Music Generation
|
||||
|
||||
**Endpoint**: `POST /music_generation`
|
||||
**Models**: `music-2.5` (latest, vocals+accompaniment, 4min songs)
|
||||
**Rate**: 120 RPM | **Cost**: $0.03-0.075/generation
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "music-2.5",
|
||||
"lyrics": "Verse 1\nLine one\n\n[Chorus]\nChorus line",
|
||||
"prompt": "Upbeat pop with electronic elements",
|
||||
"output_format": "url",
|
||||
"audio_setting": {"sample_rate": 44100, "bitrate": 128000, "format": "mp3"}
|
||||
}
|
||||
```
|
||||
|
||||
**Lyrics**: 1-3500 chars, supports structure tags ([Verse], [Chorus], etc.)
|
||||
**Prompt**: 0-2000 chars, style/mood description
|
||||
**Sample rates**: 16000, 24000, 32000, 44100 Hz
|
||||
**Bitrates**: 32000, 64000, 128000, 256000 bps
|
||||
|
||||
## Error Codes
|
||||
|
||||
| Code | Meaning |
|
||||
|------|---------|
|
||||
| 0 | Success |
|
||||
| 1002 | Rate limit exceeded |
|
||||
| 1008 | Insufficient balance |
|
||||
| 2013 | Invalid parameters |
|
||||
|
||||
## CLI Examples
|
||||
|
||||
```bash
|
||||
# Image
|
||||
python minimax_cli.py --task generate --prompt "A cyberpunk city" --model image-01 --aspect-ratio 16:9
|
||||
|
||||
# Video
|
||||
python minimax_cli.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3 --duration 6
|
||||
|
||||
# Speech
|
||||
python minimax_cli.py --task generate-speech --text "Hello world" --model speech-2.8-hd --voice English_Warm_Bestie --emotion happy
|
||||
|
||||
# Music
|
||||
python minimax_cli.py --task generate-music --lyrics "La la la\nOh yeah" --prompt "upbeat pop" --model music-2.5
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- [API Overview](https://platform.minimax.io/docs/api-reference/api-overview)
|
||||
- [Video Guide](https://platform.minimax.io/docs/guides/video-generation)
|
||||
- [Speech API](https://platform.minimax.io/docs/api-reference/speech-t2a-intro)
|
||||
- [Music API](https://platform.minimax.io/docs/api-reference/music-generation)
|
||||
311
.opencode/skills/ai-multimodal/references/music-generation.md
Normal file
311
.opencode/skills/ai-multimodal/references/music-generation.md
Normal file
@@ -0,0 +1,311 @@
|
||||
# Music Generation Reference
|
||||
|
||||
Real-time music generation using Lyria RealTime via WebSocket API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Real-time streaming**: Bidirectional WebSocket for continuous generation
|
||||
- **Dynamic control**: Modify music in real-time during generation
|
||||
- **Style steering**: Genre, mood, instrumentation guidance
|
||||
- **Audio output**: 48kHz stereo 16-bit PCM
|
||||
|
||||
## Model
|
||||
|
||||
**Lyria RealTime** (Experimental)
|
||||
- WebSocket-based streaming
|
||||
- Real-time parameter adjustment
|
||||
- Instrumental only (no vocals)
|
||||
- Watermarked output
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Python
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import asyncio
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
async def generate_music():
|
||||
async with client.aio.live.music.connect() as session:
|
||||
# Set style prompts with weights (0.0-1.0)
|
||||
await session.set_weighted_prompts([
|
||||
{"prompt": "Upbeat corporate background music", "weight": 0.8},
|
||||
{"prompt": "Modern electronic elements", "weight": 0.5}
|
||||
])
|
||||
|
||||
# Configure generation parameters
|
||||
await session.set_music_generation_config(
|
||||
guidance=4.0, # Prompt adherence (0.0-6.0)
|
||||
bpm=120, # Tempo (60-200)
|
||||
density=0.6, # Note density (0.0-1.0)
|
||||
brightness=0.5 # Tonal quality (0.0-1.0)
|
||||
)
|
||||
|
||||
# Start playback and collect audio
|
||||
await session.play()
|
||||
|
||||
audio_chunks = []
|
||||
async for chunk in session:
|
||||
audio_chunks.append(chunk.audio_data)
|
||||
|
||||
return b''.join(audio_chunks)
|
||||
```
|
||||
|
||||
### JavaScript
|
||||
|
||||
```javascript
|
||||
const client = new GenaiClient({ apiKey: process.env.GEMINI_API_KEY });
|
||||
|
||||
async function generateMusic() {
|
||||
const session = await client.live.music.connect();
|
||||
|
||||
await session.setWeightedPrompts([
|
||||
{ prompt: "Calm ambient background", weight: 0.9 },
|
||||
{ prompt: "Nature sounds influence", weight: 0.3 }
|
||||
]);
|
||||
|
||||
await session.setMusicGenerationConfig({
|
||||
guidance: 3.5,
|
||||
bpm: 80,
|
||||
density: 0.4,
|
||||
brightness: 0.6
|
||||
});
|
||||
|
||||
session.onAudio((audioChunk) => {
|
||||
// Process 48kHz stereo PCM audio
|
||||
audioBuffer.push(audioChunk);
|
||||
});
|
||||
|
||||
await session.play();
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration Parameters
|
||||
|
||||
| Parameter | Range | Default | Description |
|
||||
|-----------|-------|---------|-------------|
|
||||
| `guidance` | 0.0-6.0 | 4.0 | Prompt adherence (higher = stricter) |
|
||||
| `bpm` | 60-200 | 120 | Tempo in beats per minute |
|
||||
| `density` | 0.0-1.0 | 0.5 | Note/sound density |
|
||||
| `brightness` | 0.0-1.0 | 0.5 | Tonal quality (higher = brighter) |
|
||||
| `scale` | 12 keys | C Major | Musical key |
|
||||
| `mute_bass` | bool | false | Remove bass elements |
|
||||
| `mute_drums` | bool | false | Remove drum elements |
|
||||
| `mode` | enum | QUALITY | QUALITY, DIVERSITY, VOCALIZATION |
|
||||
| `temperature` | 0.0-2.0 | 1.0 | Sampling randomness |
|
||||
| `top_k` | int | 40 | Sampling top-k |
|
||||
| `seed` | int | random | Reproducibility seed |
|
||||
|
||||
## Weighted Prompts
|
||||
|
||||
Control generation direction with weighted prompts:
|
||||
|
||||
```python
|
||||
await session.set_weighted_prompts([
|
||||
{"prompt": "Main style description", "weight": 1.0}, # Primary
|
||||
{"prompt": "Secondary influence", "weight": 0.5}, # Supporting
|
||||
{"prompt": "Subtle element", "weight": 0.2} # Accent
|
||||
])
|
||||
```
|
||||
|
||||
**Weight guidelines**:
|
||||
- 0.8-1.0: Dominant influence
|
||||
- 0.5-0.7: Secondary contribution
|
||||
- 0.2-0.4: Subtle accent
|
||||
- 0.0-0.1: Minimal effect
|
||||
|
||||
## Style Prompts by Use Case
|
||||
|
||||
### Corporate/Marketing
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
{"prompt": "Professional corporate background music, modern", "weight": 0.9},
|
||||
{"prompt": "Uplifting, optimistic mood", "weight": 0.6},
|
||||
{"prompt": "Clean production, minimal complexity", "weight": 0.5}
|
||||
]
|
||||
config = {"bpm": 100, "brightness": 0.6, "density": 0.5}
|
||||
```
|
||||
|
||||
### Social Media/Short-form
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
{"prompt": "Trending pop electronic beat", "weight": 0.9},
|
||||
{"prompt": "Energetic, catchy rhythm", "weight": 0.7},
|
||||
{"prompt": "Bass-heavy, punchy", "weight": 0.5}
|
||||
]
|
||||
config = {"bpm": 128, "brightness": 0.7, "density": 0.7}
|
||||
```
|
||||
|
||||
### Emotional/Cinematic
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
{"prompt": "Cinematic orchestral underscore", "weight": 0.9},
|
||||
{"prompt": "Emotional, inspiring", "weight": 0.7},
|
||||
{"prompt": "Building tension and release", "weight": 0.5}
|
||||
]
|
||||
config = {"bpm": 70, "brightness": 0.4, "density": 0.4}
|
||||
```
|
||||
|
||||
### Ambient/Background
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
{"prompt": "Calm ambient soundscape", "weight": 0.9},
|
||||
{"prompt": "Minimal, atmospheric", "weight": 0.6},
|
||||
{"prompt": "Lo-fi textures", "weight": 0.4}
|
||||
]
|
||||
config = {"bpm": 80, "brightness": 0.4, "density": 0.3}
|
||||
```
|
||||
|
||||
## Real-time Transitions
|
||||
|
||||
Smoothly transition between styles during generation:
|
||||
|
||||
```python
|
||||
async def dynamic_music_generation():
|
||||
async with client.aio.live.music.connect() as session:
|
||||
# Start with intro style
|
||||
await session.set_weighted_prompts([
|
||||
{"prompt": "Soft ambient intro", "weight": 0.9}
|
||||
])
|
||||
await session.play()
|
||||
|
||||
# Collect intro (4 seconds)
|
||||
intro_chunks = []
|
||||
        for _ in range(192):  # approximate chunk count; actual duration depends on per-chunk size
|
||||
chunk = await session.__anext__()
|
||||
intro_chunks.append(chunk.audio_data)
|
||||
|
||||
# Transition to main section
|
||||
await session.set_weighted_prompts([
|
||||
{"prompt": "Building energy", "weight": 0.7},
|
||||
{"prompt": "Full beat drop", "weight": 0.5}
|
||||
])
|
||||
|
||||
# Continue with new style...
|
||||
```
|
||||
|
||||
## Output Specifications
|
||||
|
||||
- **Format**: Raw 16-bit PCM
|
||||
- **Sample Rate**: 48,000 Hz
|
||||
- **Channels**: 2 (stereo)
|
||||
- **Bit Depth**: 16 bits
|
||||
- **Watermarking**: Always enabled (SynthID)
|
||||
|
||||
### Save to WAV
|
||||
|
||||
```python
|
||||
import wave
|
||||
|
||||
def save_pcm_to_wav(pcm_data, filename):
|
||||
with wave.open(filename, 'wb') as wav_file:
|
||||
wav_file.setnchannels(2) # Stereo
|
||||
wav_file.setsampwidth(2) # 16-bit
|
||||
wav_file.setframerate(48000) # 48kHz
|
||||
wav_file.writeframes(pcm_data)
|
||||
```
|
||||
|
||||
### Convert to MP3
|
||||
|
||||
```bash
|
||||
# Using FFmpeg
|
||||
ffmpeg -f s16le -ar 48000 -ac 2 -i input.pcm output.mp3
|
||||
```
|
||||
|
||||
## Integration with Video Production
|
||||
|
||||
### Generate Background Music for Video
|
||||
|
||||
```python
|
||||
async def generate_video_background(duration_seconds, mood):
|
||||
"""Generate background music matching video length"""
|
||||
|
||||
# Configure for video background
|
||||
prompts = [
|
||||
{"prompt": f"{mood} background music for video", "weight": 0.9},
|
||||
{"prompt": "Non-distracting, supportive underscore", "weight": 0.6}
|
||||
]
|
||||
|
||||
async with client.aio.live.music.connect() as session:
|
||||
await session.set_weighted_prompts(prompts)
|
||||
await session.set_music_generation_config(
|
||||
guidance=4.0,
|
||||
density=0.4, # Keep sparse for background
|
||||
brightness=0.5
|
||||
)
|
||||
await session.play()
|
||||
|
||||
# Calculate chunks needed (48kHz stereo = 192000 bytes/second)
|
||||
total_chunks = duration_seconds * 48000 // 512 # Chunk size estimate
|
||||
|
||||
        audio_data = []
        chunk_index = 0
        async for chunk in session:  # builtin enumerate() does not support async iterators
            audio_data.append(chunk.audio_data)
            chunk_index += 1
            if chunk_index >= total_chunks:
                break
|
||||
|
||||
return b''.join(audio_data)
|
||||
```
|
||||
|
||||
### Sync with Storyboard Timing
|
||||
|
||||
```python
|
||||
async def generate_scene_music(scenes):
|
||||
"""Generate music with transitions matching scene changes"""
|
||||
|
||||
all_audio = []
|
||||
|
||||
async with client.aio.live.music.connect() as session:
|
||||
for scene in scenes:
|
||||
# Update style for each scene
|
||||
await session.set_weighted_prompts([
|
||||
{"prompt": scene['mood'], "weight": 0.9},
|
||||
{"prompt": scene['style'], "weight": 0.5}
|
||||
])
|
||||
|
||||
if scene['index'] == 0:
|
||||
await session.play()
|
||||
|
||||
# Collect audio for scene duration
|
||||
chunks = int(scene['duration'] * 48000 / 512)
|
||||
for _ in range(chunks):
|
||||
chunk = await session.__anext__()
|
||||
all_audio.append(chunk.audio_data)
|
||||
|
||||
return b''.join(all_audio)
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Instrumental only**: No vocal/singing generation
|
||||
- **WebSocket required**: Real-time streaming connection
|
||||
- **Safety filtering**: Prompts undergo safety review
|
||||
- **Watermarking**: All output contains SynthID watermark
|
||||
- **Experimental**: API may change
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Buffer audio**: Implement robust buffering for smooth playback
|
||||
2. **Gradual transitions**: Avoid drastic prompt changes mid-stream
|
||||
3. **Sparse for backgrounds**: Lower density for video backgrounds
|
||||
4. **Test prompts**: Iterate on prompt combinations
|
||||
5. **Cross-fade transitions**: Blend audio at style changes
|
||||
6. **Match video mood**: Align music tempo/energy with visuals
|
||||
|
||||
## Resources
|
||||
|
||||
- [Lyria RealTime Docs](https://ai.google.dev/gemini-api/docs/music-generation)
|
||||
- [Audio Processing Guide](./audio-processing.md)
|
||||
- [Video Generation](./video-generation.md)
|
||||
|
||||
---
|
||||
|
||||
**Related**: [Audio Processing](./audio-processing.md) | [Video Generation](./video-generation.md)
|
||||
|
||||
**Back to**: [AI Multimodal Skill](../SKILL.md)
|
||||
515
.opencode/skills/ai-multimodal/references/video-analysis.md
Normal file
515
.opencode/skills/ai-multimodal/references/video-analysis.md
Normal file
@@ -0,0 +1,515 @@
|
||||
# Video Analysis Reference
|
||||
|
||||
Comprehensive guide for video understanding, temporal analysis, and YouTube processing using Gemini API.
|
||||
|
||||
> **Note**: This guide covers video *analysis* (understanding existing videos). For video *generation* (creating new videos), see [Video Generation Reference](./video-generation.md).
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Video Summarization**: Create concise summaries
|
||||
- **Question Answering**: Answer specific questions about content
|
||||
- **Transcription**: Audio transcription with visual descriptions
|
||||
- **Timestamp References**: Query specific moments (MM:SS format)
|
||||
- **Video Clipping**: Process specific segments
|
||||
- **Scene Detection**: Identify scene changes and transitions
|
||||
- **Multiple Videos**: Compare up to 10 videos (2.5+)
|
||||
- **YouTube Support**: Analyze YouTube videos directly
|
||||
- **Custom Frame Rate**: Adjust FPS sampling
|
||||
|
||||
## Supported Formats
|
||||
|
||||
- MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV, 3GPP
|
||||
|
||||
## Model Selection
|
||||
|
||||
### Gemini 3 Series (Latest)
|
||||
- **gemini-3-pro-preview**: Latest, agentic workflows, 1M context, dynamic thinking
|
||||
|
||||
### Gemini 2.5 Series (Recommended)
|
||||
- **gemini-2.5-pro**: Best quality, 1M-2M context
|
||||
- **gemini-2.5-flash**: Balanced, 1M-2M context (recommended)
|
||||
|
||||
### Context Windows
|
||||
- **2M token models**: ~2 hours (default) or ~6 hours (low-res)
|
||||
- **1M token models**: ~1 hour (default) or ~3 hours (low-res)
|
||||
|
||||
## Basic Video Analysis
|
||||
|
||||
### Local Video
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Upload video (File API for >20MB)
|
||||
myfile = client.files.upload(file='video.mp4')
|
||||
|
||||
# Wait for processing
|
||||
import time
|
||||
while myfile.state.name == 'PROCESSING':
|
||||
time.sleep(1)
|
||||
myfile = client.files.get(name=myfile.name)
|
||||
|
||||
if myfile.state.name == 'FAILED':
|
||||
raise ValueError('Video processing failed')
|
||||
|
||||
# Analyze
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize this video in 3 key points', myfile]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### YouTube Video
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Summarize the main topics discussed',
|
||||
types.Part.from_uri(
|
||||
uri='https://www.youtube.com/watch?v=VIDEO_ID',
|
||||
mime_type='video/mp4'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Inline Video (<20MB)
|
||||
|
||||
```python
|
||||
with open('short-clip.mp4', 'rb') as f:
|
||||
video_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'What happens in this video?',
|
||||
types.Part.from_bytes(data=video_bytes, mime_type='video/mp4')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Video Clipping
|
||||
|
||||
```python
|
||||
# Analyze specific time range
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Summarize this segment',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
start_offset='40s',
|
||||
end_offset='80s'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Frame Rate
|
||||
|
||||
```python
|
||||
# Lower FPS for static content (saves tokens)
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this presentation',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=0.5 # Sample every 2 seconds
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
# Higher FPS for fast-moving content
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze rapid movements in this sports video',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=5 # Sample 5 times per second
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Multiple Videos (2.5+)
|
||||
|
||||
```python
|
||||
video1 = client.files.upload(file='demo1.mp4')
|
||||
video2 = client.files.upload(file='demo2.mp4')
|
||||
|
||||
# Wait for processing
|
||||
videos = [video1, video2]
for i, video in enumerate(videos):
    while video.state.name == 'PROCESSING':
        time.sleep(1)
        video = client.files.get(name=video.name)
    videos[i] = video  # keep the refreshed handle; the loop variable alone would be discarded
video1, video2 = videos
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-pro',
|
||||
contents=[
|
||||
'Compare these two product demos. Which explains features better?',
|
||||
video1,
|
||||
video2
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Temporal Understanding
|
||||
|
||||
### Timestamp-Based Questions
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'What happens at 01:15 and how does it relate to 02:30?',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Timeline Creation
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create a timeline with timestamps:
|
||||
- Key events
|
||||
- Scene changes
|
||||
- Important moments
|
||||
Format: MM:SS - Description
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Scene Detection
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Identify all scene changes with timestamps and describe each scene',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Transcription
|
||||
|
||||
### Basic Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Transcribe the audio from this video',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### With Visual Descriptions
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Transcribe with visual context:
|
||||
- Audio transcription
|
||||
- Visual descriptions of important moments
|
||||
- Timestamps for salient events
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Speaker Identification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Transcribe with speaker labels and timestamps',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Video Summarization
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Summarize this video:
|
||||
1. Main topic and purpose
|
||||
2. Key points with timestamps
|
||||
3. Conclusion or call-to-action
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Educational Content
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create educational materials:
|
||||
1. List key concepts taught
|
||||
2. Create 5 quiz questions with answers
|
||||
3. Provide timestamp for each concept
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Action Detection
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'List all actions performed in this tutorial with timestamps',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Content Moderation
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Review video content:
|
||||
1. Identify any problematic content
|
||||
2. Note timestamps of concerns
|
||||
3. Provide content rating recommendation
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Interview Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze interview:
|
||||
1. Questions asked (timestamps)
|
||||
2. Key responses
|
||||
3. Candidate body language and demeanor
|
||||
4. Overall assessment
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 6. Sports Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze sports video:
|
||||
1. Key plays with timestamps
|
||||
2. Player movements and positioning
|
||||
3. Game strategy observations
|
||||
''',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=5 # Higher FPS for fast action
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## YouTube Specific Features
|
||||
|
||||
### Public Video Requirements
|
||||
|
||||
- Video must be public (not private or unlisted)
|
||||
- No age-restricted content
|
||||
- Valid video ID required
|
||||
|
||||
### Usage Example
|
||||
|
||||
```python
|
||||
# YouTube URL
|
||||
youtube_uri = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Create chapter markers with timestamps',
|
||||
types.Part.from_uri(uri=youtube_uri, mime_type='video/mp4')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Rate Limits
|
||||
|
||||
- **Free tier**: 8 hours of YouTube video per day
|
||||
- **Paid tier**: No length-based limits
|
||||
- Public videos only
|
||||
|
||||
## Token Calculation
|
||||
|
||||
Video tokens depend on resolution and FPS:
|
||||
|
||||
**Default resolution** (~300 tokens/second):
|
||||
- 1 minute = 18,000 tokens
|
||||
- 10 minutes = 180,000 tokens
|
||||
- 1 hour = 1,080,000 tokens
|
||||
|
||||
**Low resolution** (~100 tokens/second):
|
||||
- 1 minute = 6,000 tokens
|
||||
- 10 minutes = 60,000 tokens
|
||||
- 1 hour = 360,000 tokens
|
||||
|
||||
**Context windows**:
|
||||
- 2M tokens ≈ 2 hours (default) or 6 hours (low-res)
|
||||
- 1M tokens ≈ 1 hour (default) or 3 hours (low-res)
|
||||
|
||||
## Best Practices
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for videos >20MB (most videos)
|
||||
2. Wait for ACTIVE state before analysis
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Optimization Strategies
|
||||
|
||||
**Reduce token usage**:
|
||||
- Process specific segments using start/end offsets
|
||||
- Use lower FPS for static content
|
||||
- Use low-resolution mode for long videos
|
||||
- Split very long videos into chunks
|
||||
|
||||
**Improve accuracy**:
|
||||
- Provide context in prompts
|
||||
- Use higher FPS for fast-moving content
|
||||
- Use Pro model for complex analysis
|
||||
- Be specific about what to extract
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Effective prompts**:
|
||||
- "Summarize key points with timestamps in MM:SS format"
|
||||
- "Identify all scene changes and describe each scene"
|
||||
- "Extract action items mentioned with timestamps"
|
||||
- "Compare these two videos on: X, Y, Z criteria"
|
||||
|
||||
**Structured output**:
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class VideoEvent(BaseModel):
|
||||
timestamp: str # MM:SS format
|
||||
description: str
|
||||
category: str
|
||||
|
||||
class VideoAnalysis(BaseModel):
|
||||
summary: str
|
||||
events: List[VideoEvent]
|
||||
duration: str
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze this video', myfile],
|
||||
config=genai.types.GenerateContentConfig(
|
||||
response_mime_type='application/json',
|
||||
response_schema=VideoAnalysis
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def upload_and_process_video(file_path, max_wait=300):
|
||||
"""Upload video and wait for processing"""
|
||||
myfile = client.files.upload(file=file_path)
|
||||
|
||||
elapsed = 0
|
||||
while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
|
||||
time.sleep(5)
|
||||
myfile = client.files.get(name=myfile.name)
|
||||
elapsed += 5
|
||||
|
||||
if myfile.state.name == 'FAILED':
|
||||
raise ValueError(f'Video processing failed: {myfile.state.name}')
|
||||
|
||||
if myfile.state.name == 'PROCESSING':
|
||||
raise TimeoutError(f'Processing timeout after {max_wait}s')
|
||||
|
||||
return myfile
|
||||
```
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
**Token costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- 1 minute video (default): 18,000 tokens = $0.018
|
||||
- 10 minute video: 180,000 tokens = $0.18
|
||||
- 1 hour video: 1,080,000 tokens = $1.08
|
||||
|
||||
**Strategies**:
|
||||
- Use video clipping for specific segments
|
||||
- Lower FPS for static content
|
||||
- Use low-resolution mode for long videos
|
||||
- Batch related queries on same video
|
||||
- Use context caching for repeated queries
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 6 hours (low-res) or 2 hours (default)
|
||||
- YouTube videos must be public
|
||||
- No live streaming analysis
|
||||
- Files expire after 48 hours
|
||||
- Processing time varies by video length
|
||||
- No real-time processing
|
||||
- Limited to 10 videos per request (2.5+)
|
||||
|
||||
---
|
||||
|
||||
## Related References
|
||||
|
||||
**Current**: Video Analysis
|
||||
|
||||
**Related Capabilities**:
|
||||
- [Video Generation](./video-generation.md) - Creating videos from text/images
|
||||
- [Audio Processing](./audio-processing.md) - Extract and analyze audio tracks
|
||||
- [Image Understanding](./vision-understanding.md) - Analyze individual frames
|
||||
|
||||
**Back to**: [AI Multimodal Skill](../SKILL.md)
|
||||
457
.opencode/skills/ai-multimodal/references/video-generation.md
Normal file
457
.opencode/skills/ai-multimodal/references/video-generation.md
Normal file
@@ -0,0 +1,457 @@
|
||||
# Video Generation Reference
|
||||
|
||||
Comprehensive guide for video creation using Veo models via Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Text-to-Video**: Generate 8-second videos from text prompts
|
||||
- **Image-to-Video**: Animate images with text direction
|
||||
- **Video Extension**: Continue previously generated videos
|
||||
- **Frame Control**: Precise camera movements and effects
|
||||
- **Native Audio**: Synchronized audio generation
|
||||
- **Multiple Resolutions**: 720p and 1080p output
|
||||
- **Aspect Ratios**: 16:9, 9:16, 1:1
|
||||
|
||||
## Models
|
||||
|
||||
### Veo 3.1 Preview (Latest)
|
||||
|
||||
**veo-3.1-generate-preview** - Latest with advanced controls
|
||||
- Frame-specific generation
|
||||
- Up to 3 reference images for image-to-video
|
||||
- Video extension capability
|
||||
- Native audio generation
|
||||
- Resolution: 720p, 1080p
|
||||
- Duration: 8 seconds at 24fps
|
||||
- Status: Preview (API may change)
|
||||
- Updated: September 2025
|
||||
|
||||
**veo-3.1-fast-generate-preview** - Speed-optimized
|
||||
- Optimized for business use cases
|
||||
- Programmatic ad creation
|
||||
- Social media content
|
||||
- Same features as standard but faster
|
||||
- Status: Preview
|
||||
- Updated: September 2025
|
||||
|
||||
### Veo 3.0 Stable
|
||||
|
||||
**veo-3.0-generate-001** - Production-ready
|
||||
- Native audio generation
|
||||
- Text-to-video and image-to-video
|
||||
- 720p and 1080p (16:9 only)
|
||||
- 8 seconds at 24fps
|
||||
- Status: Stable
|
||||
- Updated: July 2025
|
||||
|
||||
**veo-3.0-fast-generate-001** - Stable fast variant
|
||||
- Speed-optimized stable version
|
||||
- Same reliability as 3.0
|
||||
- Status: Stable
|
||||
- Updated: July 2025
|
||||
|
||||
## Model Comparison
|
||||
|
||||
| Model | Speed | Features | Audio | Status | Best For |
|
||||
|-------|-------|----------|-------|--------|----------|
|
||||
| veo-3.1-preview | Medium | All | ✓ | Preview | Latest features |
|
||||
| veo-3.1-fast | Fast | All | ✓ | Preview | Business/speed |
|
||||
| veo-3.0-001 | Medium | Standard | ✓ | Stable | Production |
|
||||
| veo-3.0-fast | Fast | Standard | ✓ | Stable | Production/speed |
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Text-to-Video
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Basic generation
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.1-generate-preview',
|
||||
prompt='A serene beach at sunset with gentle waves rolling onto the shore',
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='1080p',
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
|
||||
# Save video
|
||||
with open('output.mp4', 'wb') as f:
|
||||
f.write(response.video.data)
|
||||
```
|
||||
|
||||
### Image-to-Video
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
# Load reference image
|
||||
ref_image = PIL.Image.open('beach.jpg')
|
||||
|
||||
# Animate the image
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.1-generate-preview',
|
||||
prompt='Camera slowly pans across the scene from left to right',
|
||||
reference_images=[ref_image],
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='1080p'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Multiple Reference Images
|
||||
|
||||
```python
|
||||
# Use up to 3 reference images for complex scenes
|
||||
img1 = PIL.Image.open('foreground.jpg')
|
||||
img2 = PIL.Image.open('background.jpg')
|
||||
img3 = PIL.Image.open('subject.jpg')
|
||||
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.1-generate-preview',
|
||||
prompt='Combine these elements into a cohesive animated scene',
|
||||
reference_images=[img1, img2, img3],
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='1080p',
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Video Extension
|
||||
|
||||
```python
|
||||
# Continue from previously generated video
|
||||
previous_video = open('part1.mp4', 'rb').read()
|
||||
|
||||
response = client.models.extend_video(
|
||||
model='veo-3.1-generate-preview',
|
||||
video=previous_video,
|
||||
prompt='The scene transitions to nighttime with stars appearing'
|
||||
)
|
||||
```
|
||||
|
||||
### Frame Control
|
||||
|
||||
```python
|
||||
# Precise camera movements
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.1-generate-preview',
|
||||
prompt='A mountain landscape',
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='1080p',
|
||||
camera_motion='zoom_in', # Options: zoom_in, zoom_out, pan_left, pan_right, tilt_up, tilt_down, static
|
||||
motion_speed='slow' # Options: slow, medium, fast
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## Prompt Engineering
|
||||
|
||||
### Effective Video Prompts
|
||||
|
||||
**Structure**:
|
||||
1. **Subject**: What's in the scene
|
||||
2. **Action**: What's happening
|
||||
3. **Camera**: How it's filmed
|
||||
4. **Style**: Visual treatment
|
||||
5. **Timing**: Pacing details
|
||||
|
||||
**Example**:
|
||||
```
|
||||
"A hummingbird [subject] hovers near a red flower, then flies away [action].
|
||||
Slow-motion close-up shot [camera] with vibrant colors and soft focus background [style].
|
||||
Gentle, peaceful pacing [timing]."
|
||||
```
|
||||
|
||||
### Action Verbs
|
||||
|
||||
**Movement**:
|
||||
- "walks", "runs", "flies", "swims", "dances"
|
||||
- "rotates", "spins", "rolls", "bounces"
|
||||
- "emerges", "disappears", "transforms"
|
||||
|
||||
**Camera**:
|
||||
- "zoom in on", "pull back from", "follow"
|
||||
- "orbit around", "track alongside"
|
||||
- "tilt up to reveal", "pan across"
|
||||
|
||||
**Transitions**:
|
||||
- "gradually changes from... to..."
|
||||
- "morphs into", "dissolves into"
|
||||
- "cuts to", "fades to"
|
||||
|
||||
### Timing Control
|
||||
|
||||
```python
|
||||
# Explicit timing in prompt
|
||||
prompt = '''
|
||||
0-2s: Close-up of a seed in soil
|
||||
2-4s: Time-lapse of sprout emerging
|
||||
4-6s: Growing into a small plant
|
||||
6-8s: Zoom out to show garden context
|
||||
'''
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### Resolution
|
||||
|
||||
```python
|
||||
config = types.VideoGenerationConfig(
|
||||
resolution='1080p' # Options: 720p, 1080p
|
||||
)
|
||||
```
|
||||
|
||||
**Considerations**:
|
||||
- 1080p: Higher quality, longer generation time, larger file
|
||||
- 720p: Faster generation, smaller file, good for drafts
|
||||
|
||||
### Aspect Ratios
|
||||
|
||||
```python
|
||||
config = types.VideoGenerationConfig(
|
||||
aspect_ratio='16:9' # Options: 16:9, 9:16, 1:1
|
||||
)
|
||||
```
|
||||
|
||||
**Use Cases**:
|
||||
- 16:9: Landscape, YouTube, traditional video
|
||||
- 9:16: Mobile, TikTok, Instagram Stories
|
||||
- 1:1: Square, Instagram feed, versatile
|
||||
|
||||
### Audio Control
|
||||
|
||||
```python
|
||||
config = types.VideoGenerationConfig(
|
||||
include_audio=True # Default: True
|
||||
)
|
||||
```
|
||||
|
||||
Native audio is generated automatically and synchronized with video content.
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Prompt Quality
|
||||
|
||||
**Be specific**:
|
||||
- ❌ "A person walking"
|
||||
- ✅ "A young woman in a red coat walking through a park in autumn"
|
||||
|
||||
**Include motion**:
|
||||
- ❌ "A city street"
|
||||
- ✅ "A busy city street with cars passing and people crossing"
|
||||
|
||||
**Specify camera**:
|
||||
- ❌ "A mountain"
|
||||
- ✅ "Aerial drone shot slowly ascending over a snow-capped mountain"
|
||||
|
||||
### 2. Reference Images
|
||||
|
||||
**Quality**:
|
||||
- Use high-resolution images (1080p+)
|
||||
- Clear, well-lit subjects
|
||||
- Minimal motion blur
|
||||
|
||||
**Composition**:
|
||||
- Match desired final aspect ratio
|
||||
- Leave room for motion/movement
|
||||
- Consider camera angle in prompt
|
||||
|
||||
### 3. Performance Optimization
|
||||
|
||||
**Generation Time**:
|
||||
- 720p: ~30-60 seconds
|
||||
- 1080p: ~60-120 seconds
|
||||
- Fast models: 30-50% faster
|
||||
|
||||
**Strategies**:
|
||||
- Use 720p for iteration/drafts
|
||||
- Use fast models for rapid feedback
|
||||
- Batch multiple requests
|
||||
- Use async processing for UI responsiveness
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Product Demos
|
||||
|
||||
```python
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.0-fast-generate-001',
|
||||
prompt='''
|
||||
Professional product video:
|
||||
- Sleek smartphone rotating on a pedestal
|
||||
- Clean white background with soft shadows
|
||||
- Slow 360-degree rotation
|
||||
- Spotlight highlighting premium design
|
||||
- Modern, minimalist aesthetic
|
||||
''',
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='1080p',
|
||||
aspect_ratio='1:1'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Social Media Content
|
||||
|
||||
```python
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.1-fast-generate-preview',
|
||||
prompt='''
|
||||
Trendy social media clip:
|
||||
- Text overlay "NEW ARRIVAL" appears
|
||||
- Fashion product showcase
|
||||
- Quick cuts and dynamic camera
|
||||
- Vibrant colors, high energy
|
||||
- Upbeat pacing
|
||||
''',
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='1080p',
|
||||
aspect_ratio='9:16' # Mobile
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Explainer Animations
|
||||
|
||||
```python
|
||||
response = client.models.generate_video(
|
||||
model='veo-3.1-generate-preview',
|
||||
prompt='''
|
||||
Educational animation:
|
||||
- Simple diagram illustrating data flow
|
||||
- Arrows and icons animating in sequence
|
||||
- Clean, clear visual hierarchy
|
||||
- Smooth transitions between steps
|
||||
- Professional corporate style
|
||||
''',
|
||||
config=types.VideoGenerationConfig(
|
||||
resolution='720p',
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## Safety & Content Policy
|
||||
|
||||
### Safety Settings
|
||||
|
||||
```python
|
||||
config = types.VideoGenerationConfig(
|
||||
safety_settings=[
|
||||
types.SafetySetting(
|
||||
category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
|
||||
threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Prohibited Content
|
||||
|
||||
- Violence, gore, harm
|
||||
- Sexually explicit content
|
||||
- Hate speech, harassment
|
||||
- Copyrighted characters/brands
|
||||
- Real people (without consent)
|
||||
- Misleading/deceptive content
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Duration**: Fixed 8 seconds (as of Sept 2025)
|
||||
- **Frame Rate**: 24fps only
|
||||
- **File Size**: ~5-20MB per video
|
||||
- **Generation Time**: 30s-2min depending on resolution
|
||||
- **Reference Images**: Max 3 images
|
||||
- **Preview Status**: API may change (3.1 models)
|
||||
- **Audio**: Cannot upload custom audio (native only)
|
||||
- **No real-time**: Pre-generation required
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Long Generation Times
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
# Track generation progress
|
||||
start = time.time()
|
||||
response = client.models.generate_video(...)
|
||||
duration = time.time() - start
|
||||
print(f"Generated in {duration:.1f}s")
|
||||
```
|
||||
|
||||
**Expected times**:
|
||||
- Fast models + 720p: 30-45s
|
||||
- Standard models + 720p: 45-90s
|
||||
- Fast models + 1080p: 45-60s
|
||||
- Standard models + 1080p: 60-120s
|
||||
|
||||
### Safety Filter Blocking
|
||||
|
||||
```python
|
||||
try:
|
||||
response = client.models.generate_video(...)
|
||||
except Exception as e:
|
||||
if 'safety' in str(e).lower():
|
||||
print("Video blocked by safety filters")
|
||||
# Modify prompt and retry
|
||||
```
|
||||
|
||||
### Quota Exceeded
|
||||
|
||||
```python
|
||||
# Implement exponential backoff
|
||||
import time
|
||||
|
||||
def generate_with_retry(model, prompt, max_retries=3):
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
return client.models.generate_video(model=model, prompt=prompt)
|
||||
except Exception as e:
|
||||
if '429' in str(e): # Rate limit
|
||||
wait = 2 ** attempt
|
||||
print(f"Rate limited, waiting {wait}s...")
|
||||
time.sleep(wait)
|
||||
else:
|
||||
raise
|
||||
raise Exception("Max retries exceeded")
|
||||
```
|
||||
|
||||
## Cost Estimation
|
||||
|
||||
**Pricing**: TBD (preview models)
|
||||
|
||||
**Estimated based on compute**:
|
||||
- Fast + 720p: ~$0.05-$0.10 per video
|
||||
- Standard + 1080p: ~$0.15-$0.25 per video
|
||||
|
||||
**Monitor**: https://ai.google.dev/pricing
|
||||
|
||||
## Resources
|
||||
|
||||
- [Veo API Docs](https://ai.google.dev/gemini-api/docs/video)
|
||||
- [Video Generation Guide](https://ai.google.dev/gemini-api/docs/video#model-versions)
|
||||
- [Content Policy](https://ai.google.dev/gemini-api/docs/safety)
|
||||
- [Get API Key](https://aistudio.google.com/apikey)
|
||||
|
||||
---
|
||||
|
||||
## Related References
|
||||
|
||||
**Current**: Video Generation
|
||||
|
||||
**Related Capabilities**:
|
||||
- [Video Analysis](./video-analysis.md) - Understanding existing videos
|
||||
- [Image Generation](./image-generation.md) - Creating static images
|
||||
- [Image Understanding](./vision-understanding.md) - Analyzing reference images
|
||||
|
||||
**Back to**: [AI Multimodal Skill](../SKILL.md)
|
||||
@@ -0,0 +1,492 @@
|
||||
# Vision Understanding Reference
|
||||
|
||||
Comprehensive guide for image analysis, object detection, and visual understanding using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Captioning**: Generate descriptive text for images
|
||||
- **Classification**: Categorize and identify content
|
||||
- **Visual Q&A**: Answer questions about images
|
||||
- **Object Detection**: Locate objects with bounding boxes (2.0+)
|
||||
- **Segmentation**: Create pixel-level masks (2.5+)
|
||||
- **Multi-image**: Compare up to 3,600 images
|
||||
- **OCR**: Extract text from images
|
||||
- **Document Understanding**: Process PDFs with vision
|
||||
|
||||
## Supported Formats
|
||||
|
||||
- **Images**: PNG, JPEG, WEBP, HEIC, HEIF
|
||||
- **Documents**: PDF (up to 1,000 pages)
|
||||
- **Size Limits**:
|
||||
- Inline: 20MB max total request
|
||||
- File API: 2GB per file
|
||||
- Max images: 3,600 per request
|
||||
|
||||
## Model Selection
|
||||
|
||||
### Gemini 2.5 Series
|
||||
- **gemini-2.5-pro**: Best quality, segmentation + detection
|
||||
- **gemini-2.5-flash**: Fast, efficient, all features
|
||||
- **gemini-2.5-flash-lite**: Lightweight, all features
|
||||
|
||||
### Feature Requirements
|
||||
- **Segmentation**: Requires 2.5+ models
|
||||
- **Object Detection**: Requires 2.0+ models
|
||||
- **Multi-image**: All models (up to 3,600 images)
|
||||
|
||||
## Basic Image Analysis
|
||||
|
||||
### Image Captioning
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Local file
|
||||
with open('image.jpg', 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Describe this image in detail',
|
||||
genai.types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
|
||||
]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### Image Classification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Classify this image. Provide category and confidence level.',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Visual Question Answering
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'How many people are in this image and what are they doing?',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Object Detection (2.0+)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Detect all objects in this image and provide bounding boxes',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
|
||||
# Returns bounding box coordinates: [ymin, xmin, ymax, xmax]
|
||||
# Normalized to [0, 1000] range
|
||||
```
|
||||
|
||||
### Segmentation (2.5+)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Create a segmentation mask for all people in this image',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
|
||||
# Returns pixel-level masks for requested objects
|
||||
```
|
||||
|
||||
### Multi-Image Comparison
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
img1 = PIL.Image.open('photo1.jpg')
|
||||
img2 = PIL.Image.open('photo2.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Compare these two images. What are the differences?',
|
||||
img1,
|
||||
img2
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### OCR and Text Extraction
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract all visible text from this image',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Input Methods
|
||||
|
||||
### Inline Data (<20MB)
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
# From file
|
||||
with open('image.jpg', 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this image',
|
||||
types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### PIL Image
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
img = PIL.Image.open('photo.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What is in this image?', img]
|
||||
)
|
||||
```
|
||||
|
||||
### File API (>20MB or Reuse)
|
||||
|
||||
```python
|
||||
# Upload once
|
||||
myfile = client.files.upload(file='large-image.jpg')
|
||||
|
||||
# Use multiple times
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Describe this image', myfile]
|
||||
)
|
||||
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What colors dominate this image?', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### URL (Public Images)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this image',
|
||||
types.Part.from_uri(
|
||||
uri='https://example.com/image.jpg',
|
||||
mime_type='image/jpeg'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Token Calculation
|
||||
|
||||
Images consume tokens based on size:
|
||||
|
||||
**Small images** (≤384px both dimensions): 258 tokens
|
||||
|
||||
**Large images**: Tiled into 768×768 chunks, 258 tokens each
|
||||
|
||||
**Formula**:
|
||||
```
|
||||
crop_unit = floor(min(width, height) / 1.5)
|
||||
tiles = (width / crop_unit) × (height / crop_unit)
|
||||
total_tokens = tiles × 258
|
||||
```
|
||||
|
||||
**Examples**:
|
||||
- 256×256: 258 tokens (small)
|
||||
- 512×512: 258 tokens (small)
|
||||
- 960×540: 6 tiles = 1,548 tokens
|
||||
- 1920×1080: 6 tiles = 1,548 tokens
|
||||
- 3840×2160 (4K): 24 tiles = 6,192 tokens
|
||||
|
||||
## Structured Output
|
||||
|
||||
### JSON Schema Output
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class ObjectDetection(BaseModel):
|
||||
object_name: str
|
||||
confidence: float
|
||||
bounding_box: List[int] # [ymin, xmin, ymax, xmax]
|
||||
|
||||
class ImageAnalysis(BaseModel):
|
||||
description: str
|
||||
objects: List[ObjectDetection]
|
||||
scene_type: str
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze this image', img_part],
|
||||
config=genai.types.GenerateContentConfig(
|
||||
response_mime_type='application/json',
|
||||
response_schema=ImageAnalysis
|
||||
)
|
||||
)
|
||||
|
||||
result = ImageAnalysis.model_validate_json(response.text)
|
||||
```
|
||||
|
||||
## Multi-Image Analysis
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```python
|
||||
images = [
|
||||
PIL.Image.open(f'image{i}.jpg')
|
||||
for i in range(10)
|
||||
]
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze these images and find common themes'] + images
|
||||
)
|
||||
```
|
||||
|
||||
### Image Comparison
|
||||
|
||||
```python
|
||||
before = PIL.Image.open('before.jpg')
|
||||
after = PIL.Image.open('after.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Compare before and after. List all visible changes.',
|
||||
before,
|
||||
after
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Visual Search
|
||||
|
||||
```python
|
||||
reference = PIL.Image.open('target.jpg')
|
||||
candidates = [PIL.Image.open(f'option{i}.jpg') for i in range(5)]
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Find which candidate images contain objects similar to the reference',
|
||||
reference
|
||||
] + candidates
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Image Quality
|
||||
|
||||
1. **Resolution**: Use clear, non-blurry images
|
||||
2. **Rotation**: Verify correct orientation
|
||||
3. **Lighting**: Ensure good contrast and lighting
|
||||
4. **Size optimization**: Balance quality vs token cost
|
||||
5. **Format**: JPEG for photos, PNG for graphics
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Specific instructions**:
|
||||
- "Identify all vehicles with their colors and positions"
|
||||
- "Count people wearing blue shirts"
|
||||
- "Extract text from the sign in the top-left corner"
|
||||
|
||||
**Output format**:
|
||||
- "Return results as JSON with fields: category, count, description"
|
||||
- "Format as markdown table"
|
||||
- "List findings as numbered items"
|
||||
|
||||
**Few-shot examples**:
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Example: For an image of a cat on a sofa, respond: "Object: cat, Location: sofa"',
|
||||
'Now analyze this image:',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for images >20MB
|
||||
2. Use File API for repeated queries (saves tokens)
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token-efficient strategies**:
|
||||
- Resize large images before upload
|
||||
- Use File API for repeated queries
|
||||
- Batch multiple images when related
|
||||
- Use appropriate model (Flash vs Pro)
|
||||
|
||||
**Token costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- Small image (258 tokens): $0.000258
|
||||
- HD image (1,548 tokens): $0.001548
|
||||
- 4K image (6,192 tokens): $0.006192
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Product Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze this product image:
|
||||
1. Identify the product
|
||||
2. List visible features
|
||||
3. Assess condition
|
||||
4. Estimate value range
|
||||
''',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Screenshot Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract all text and UI elements from this screenshot',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Medical Imaging (Informational Only)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-pro',
|
||||
contents=[
|
||||
'Describe visible features in this medical image. Note: This is for informational purposes only.',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Chart/Graph Reading
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract data from this chart and format as JSON',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Scene Understanding
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze this scene:
|
||||
1. Location type
|
||||
2. Time of day
|
||||
3. Weather conditions
|
||||
4. Activities happening
|
||||
5. Mood/atmosphere
|
||||
''',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def analyze_image_with_retry(image_path, prompt, max_retries=3):
|
||||
"""Analyze image with exponential backoff retry"""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
with open(image_path, 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
prompt,
|
||||
genai.types.Part.from_bytes(
|
||||
data=img_bytes,
|
||||
mime_type='image/jpeg'
|
||||
)
|
||||
]
|
||||
)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
wait_time = 2 ** attempt
|
||||
print(f"Retry {attempt + 1} after {wait_time}s: {e}")
|
||||
time.sleep(wait_time)
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 3,600 images per request
|
||||
- OCR accuracy varies with text quality
|
||||
- Object detection requires 2.0+ models
|
||||
- Segmentation requires 2.5+ models
|
||||
- No video frame extraction (use video API)
|
||||
- Regional restrictions on child images (EEA, CH, UK)
|
||||
|
||||
---
|
||||
|
||||
## Related References
|
||||
|
||||
**Current**: Image Understanding
|
||||
|
||||
**Related Capabilities**:
|
||||
- [Image Generation](./image-generation.md) - Create and edit images
|
||||
- [Video Analysis](./video-analysis.md) - Analyze video frames
|
||||
- [Video Generation](./video-generation.md) - Reference images for video generation
|
||||
|
||||
**Back to**: [AI Multimodal Skill](../SKILL.md)
|
||||
BIN
.opencode/skills/ai-multimodal/scripts/.coverage
Normal file
BIN
.opencode/skills/ai-multimodal/scripts/.coverage
Normal file
Binary file not shown.
315
.opencode/skills/ai-multimodal/scripts/check_setup.py
Executable file
315
.opencode/skills/ai-multimodal/scripts/check_setup.py
Executable file
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate ai-multimodal skill setup and configuration.
|
||||
|
||||
Checks:
|
||||
- API key presence and format
|
||||
- Python dependencies
|
||||
- Centralized resolver availability
|
||||
- Directory structure
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Windows consoles often default to cp1252, which cannot encode the Unicode
# status symbols (✓, ⚠, ✗) printed below.  When the active stdout encoding is
# anything other than UTF-8, switch both standard streams to UTF-8 with
# replacement characters (stream.reconfigure() exists on Python 3.7+).
if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
    for _stream in (sys.stdout, sys.stderr):
        if hasattr(_stream, 'reconfigure'):
            _stream.reconfigure(encoding="utf-8", errors="replace")

# ANSI escape sequences for colored terminal output.
GREEN = '\033[92m'   # success messages
YELLOW = '\033[93m'  # warnings
RED = '\033[91m'     # errors
BLUE = '\033[94m'    # informational text / headers
RESET = '\033[0m'    # restore default terminal attributes
BOLD = '\033[1m'
|
||||
|
||||
|
||||
def print_header(text):
    """Print *text* as a banner framed by bold blue '=' rules."""
    rule = f"{BOLD}{BLUE}{'=' * 60}{RESET}"
    print(f"\n{rule}")
    print(f"{BOLD}{BLUE}{text}{RESET}")
    print(f"{rule}\n")
|
||||
|
||||
|
||||
def print_success(text):
    """Print *text* in green, prefixed with a check mark."""
    print(GREEN + "✓ " + text + RESET)
|
||||
|
||||
|
||||
def print_warning(text):
    """Print *text* in yellow, prefixed with a warning sign."""
    print(YELLOW + "⚠ " + text + RESET)
|
||||
|
||||
|
||||
def print_error(text):
    """Print *text* in red, prefixed with a cross mark."""
    print(RED + "✗ " + text + RESET)
|
||||
|
||||
|
||||
def print_info(text):
    """Print *text* in blue, prefixed with an info symbol."""
    print(BLUE + "ℹ " + text + RESET)
|
||||
|
||||
|
||||
def check_dependencies():
    """Verify that every third-party package the skill needs is importable.

    Returns True when all dependencies are present; otherwise prints a
    pip install hint listing the missing packages and returns False.
    """
    print_header("Checking Python Dependencies")

    # (importable module name, pip distribution name) pairs.
    required = (
        ('google.genai', 'google-genai'),
        ('dotenv', 'python-dotenv'),
        ('PIL', 'pillow'),
    )

    not_installed = []
    for module, pip_name in required:
        try:
            __import__(module)
        except ImportError:
            print_error(f"{pip_name} is NOT installed")
            not_installed.append(pip_name)
        else:
            print_success(f"{pip_name} is installed")

    if not not_installed:
        return True

    print_error("\nMissing dependencies detected!")
    print_info(f"Install with: pip install {' '.join(not_installed)}")
    return False
|
||||
|
||||
|
||||
def check_centralized_resolver():
    """Report whether the shared ``resolve_env`` helper is present and importable.

    Returns False only when the resolver file exists but fails to import.
    A missing resolver file is treated as non-fatal (returns True) because
    the skill falls back to plain environment-variable resolution.
    """
    print_header("Checking Centralized Resolver")

    repo_root = Path(__file__).parent.parent.parent.parent
    resolver = repo_root / 'scripts' / 'resolve_env.py'

    if not resolver.exists():
        print_warning(f"Centralized resolver not found: {resolver}")
        print_info("Skill will use fallback resolution logic")
        return True  # Not critical: fallback resolution still works.

    print_success(f"Centralized resolver found: {resolver}")

    # Make the resolver importable, then try the import itself.
    sys.path.insert(0, str(resolver.parent))
    try:
        from resolve_env import resolve_env  # noqa: F401 -- import check only
    except ImportError as exc:
        print_error(f"Centralized resolver exists but cannot be imported: {exc}")
        return False
    print_success("Centralized resolver can be imported")
    return True
|
||||
|
||||
|
||||
def mask_api_key(api_key):
    """Return a redacted preview of *api_key* that never reveals the full key.

    For long keys this shows the first 20 and last 4 characters; for keys of
    24 characters or fewer (where head and tail would overlap and expose the
    whole secret) the key is fully masked.
    """
    if len(api_key) > 24:
        return f"{api_key[:20]}...{api_key[-4:]}"
    return '*' * len(api_key)


def find_api_key():
    """Locate GEMINI_API_KEY via the centralized resolver, with env fallback.

    Returns the key string when found, or None when no key is configured.
    """
    print_header("Checking API Key Configuration")

    # Prefer the repo-wide resolver so all skills share one lookup hierarchy.
    claude_root = Path(__file__).parent.parent.parent.parent
    resolver_dir = str(claude_root / 'scripts')
    if resolver_dir not in sys.path:  # avoid growing sys.path on repeat calls
        sys.path.insert(0, resolver_dir)
    try:
        from resolve_env import resolve_env

        print_info("Using centralized resolver...")
        api_key = resolve_env('GEMINI_API_KEY', skill='ai-multimodal')

        if api_key:
            print_success("API key found via centralized resolver")
            # Never echo the full key: short keys would otherwise be leaked.
            print_info(f"Key preview: {mask_api_key(api_key)}")

            # Point the user at the verbose resolver for provenance details.
            print_info("\nTo see where the key was found, run:")
            print_info("python ~/.opencode/scripts/resolve_env.py GEMINI_API_KEY --skill ai-multimodal --verbose")

            return api_key
        print_error("API key not found in any location")
        return None

    except ImportError:
        print_warning("Centralized resolver not available, using fallback")

        # Fallback: plain process-environment lookup.
        api_key = os.getenv('GEMINI_API_KEY')
        if api_key:
            print_success("API key found in process.env")
            print_info(f"Key preview: {mask_api_key(api_key)}")
            return api_key
        print_error("API key not found")
        return None
|
||||
|
||||
|
||||
def validate_api_key_format(api_key):
    """Sanity-check the shape of *api_key* without contacting the API.

    Returns True when the key looks plausible, False for empty or
    obviously-too-short values.
    """
    if not api_key:
        return False

    # Google AI Studio keys start with 'AIza'; any other sufficiently long
    # value may be a Vertex AI or custom credential, so accept with a warning.
    if api_key.startswith('AIza'):
        print_success("API key format looks valid (Google AI Studio)")
        return True
    if len(api_key) > 20:
        print_warning("API key format not recognized (may be Vertex AI or custom)")
        return True
    print_error("API key format looks invalid (too short)")
    return False
|
||||
|
||||
|
||||
def test_api_connection(api_key):
    """Prove *api_key* works by making the cheapest authenticated call.

    Lists the available models and prints a small sample of them.
    Returns True on success, False on import failure or any API error.
    """
    print_header("Testing API Connection")

    try:
        from google import genai

        print_info("Initializing Gemini client...")
        client = genai.Client(api_key=api_key)

        print_info("Fetching available models...")
        # Listing models verifies the key without consuming generation quota.
        available = list(client.models.list())

        print_success(f"API connection successful! Found {len(available)} available models")

        print_info("\nSample available models:")
        for entry in available[:5]:
            print(f"  - {entry.name}")

        return True
    except ImportError:
        print_error("google-genai package not installed")
        return False
    except Exception as exc:
        print_error(f"API connection failed: {str(exc)}")
        return False
|
||||
|
||||
|
||||
def check_directory_structure():
    """Confirm the skill's expected files exist relative to this script.

    Returns True when every required file is present, False otherwise.
    """
    print_header("Checking Directory Structure")

    script_dir = Path(__file__).parent
    skill_dir = script_dir.parent

    expected = (
        ('SKILL.md', skill_dir / 'SKILL.md'),
        ('.env.example', skill_dir / '.env.example'),
        ('gemini_batch_process.py', script_dir / 'gemini_batch_process.py'),
    )

    ok = True
    for label, location in expected:
        if location.exists():
            print_success(f"{label} exists")
        else:
            print_error(f"{label} NOT found at {location}")
            ok = False

    return ok
|
||||
|
||||
|
||||
def provide_setup_instructions():
    """Print step-by-step guidance for configuring GEMINI_API_KEY."""
    print_header("Setup Instructions")

    print_info("To configure the ai-multimodal skill:")
    print("\n1. Get a Gemini API key:")
    print(" → Visit: https://aistudio.google.com/apikey")

    print("\n2. Configure the API key (choose one method):")

    # Option A: one global .env shared by every skill.
    print("\n Option A: User global config (recommended)")
    print(" $ echo 'GEMINI_API_KEY=your-api-key-here' >> ~/.opencode/.env")

    here = Path(__file__).parent
    skill_root = here.parent

    # Option B: a .env scoped to this skill only.
    print("\n Option B: Skill-specific config")
    print(f" $ cd {skill_root}")
    print(" $ cp .env.example .env")
    print(" $ # Edit .env and add your API key")

    # Option C: per-shell export, lost when the shell exits.
    print("\n Option C: Runtime environment (temporary)")
    print(" $ export GEMINI_API_KEY='your-api-key-here'")

    print("\n3. Verify setup:")
    print(f" $ python {Path(__file__)}")

    print("\n4. Debug if needed:")
    print(" $ python ~/.opencode/scripts/resolve_env.py --show-hierarchy --skill ai-multimodal")
    print(" $ python ~/.opencode/scripts/resolve_env.py GEMINI_API_KEY --skill ai-multimodal --verbose")
|
||||
|
||||
|
||||
def main():
    """Run every setup check in order; exit non-zero on any failure."""
    print(f"\n{BOLD}AI Multimodal Skill - Setup Checker{RESET}")

    healthy = True

    # The directory layout must be intact before anything else makes sense.
    if not check_directory_structure():
        healthy = False

    # Informational only: result does not affect the pass/fail status.
    check_centralized_resolver()

    # Missing dependencies are fatal — stop and show instructions.
    if not check_dependencies():
        healthy = False
        provide_setup_instructions()
        sys.exit(1)

    api_key = find_api_key()
    if not api_key:
        print_error("\n❌ GEMINI_API_KEY not found in any location")
        healthy = False
        provide_setup_instructions()
        sys.exit(1)

    # Format and connectivity problems are reported but checked together.
    if not validate_api_key_format(api_key):
        healthy = False

    if not test_api_connection(api_key):
        healthy = False

    print_header("Setup Summary")

    if healthy:
        print_success("✅ All checks passed! The ai-multimodal skill is ready to use.")
        print_info("\nNext steps:")
        print(" • Read SKILL.md for usage examples")
        print(" • Try: python scripts/gemini_batch_process.py --help")
        print("\nImage generation models:")
        print(" • gemini-2.5-flash-image - Nano Banana Flash (DEFAULT - fast)")
        print(" • imagen-4.0-generate-001 - Imagen 4 (alternative - production)")
        print(" • gemini-3-pro-image-preview - Nano Banana Pro (4K text, reasoning)")
        print("\nExample (uses default model):")
        print(" python scripts/gemini_batch_process.py --task generate \\")
        print(" --prompt 'A sunset over mountains' --aspect-ratio 16:9 --size 2K")
    else:
        print_error("❌ Some checks failed. Please fix the issues above.")
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
395
.opencode/skills/ai-multimodal/scripts/document_converter.py
Executable file
395
.opencode/skills/ai-multimodal/scripts/document_converter.py
Executable file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert documents to Markdown using Gemini API.
|
||||
|
||||
Supports all document types:
|
||||
- PDF documents (native vision processing)
|
||||
- Images (JPEG, PNG, WEBP, HEIC)
|
||||
- Office documents (DOCX, XLSX, PPTX)
|
||||
- HTML, TXT, and other text formats
|
||||
|
||||
Features:
|
||||
- Converts to clean markdown format
|
||||
- Preserves structure, tables, and formatting
|
||||
- Extracts text from images and scanned documents
|
||||
- Batch conversion support
|
||||
- Saves to docs/assets/document-extraction.md by default
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
except ImportError:
|
||||
print("Error: google-genai package not installed")
|
||||
print("Install with: pip install google-genai")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def find_api_key() -> Optional[str]:
    """Locate GEMINI_API_KEY in the environment or layered .env files.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .opencode/skills/ai-multimodal/.env (skill-specific config)
    3. .opencode/skills/.env (shared skills config)
    4. .opencode/.env (Claude global config)
    """
    # Runtime environment always wins.
    key = os.getenv('GEMINI_API_KEY')
    if key:
        return key

    # Without python-dotenv only the process environment can be consulted.
    if not load_dotenv:
        return None

    script_dir = Path(__file__).parent
    skill_dir = script_dir.parent      # .opencode/skills/ai-multimodal
    skills_dir = skill_dir.parent      # .opencode/skills
    claude_dir = skills_dir.parent     # .claude

    # Walk from the most specific config file to the most general one,
    # returning as soon as the key appears.
    for candidate in (skill_dir / '.env', skills_dir / '.env', claude_dir / '.env'):
        if candidate.exists():
            load_dotenv(candidate)
            key = os.getenv('GEMINI_API_KEY')
            if key:
                return key

    return None
|
||||
|
||||
|
||||
def find_project_root() -> Path:
    """Return the nearest ancestor containing .git or .claude.

    Falls back to the script's own directory when no marker is found.
    """
    start = Path(__file__).parent

    # Check the script directory itself, then each ancestor in turn.
    for candidate in [start, *start.parents]:
        if (candidate / '.git').exists() or (candidate / '.claude').exists():
            return candidate

    return start
|
||||
|
||||
|
||||
def get_mime_type(file_path: str) -> str:
    """Map a file extension to its MIME type.

    Unknown extensions fall back to 'application/octet-stream'.
    """
    known = {
        # Documents
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.md': 'text/markdown',
        '.csv': 'text/csv',
        # Images
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        # Office (need to be uploaded as binary)
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    }
    return known.get(Path(file_path).suffix.lower(), 'application/octet-stream')
|
||||
|
||||
|
||||
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file through the Gemini File API and wait until it is usable.

    Polls every 2 seconds for up to 5 minutes. Raises ValueError when the
    server reports a failed upload and TimeoutError when processing does
    not finish in time.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    uploaded = client.files.upload(file=file_path)

    # Poll until the server finishes processing or we run out of patience.
    max_wait = 300  # 5 minutes
    waited = 0
    while uploaded.state.name == 'PROCESSING' and waited < max_wait:
        time.sleep(2)
        uploaded = client.files.get(name=uploaded.name)
        waited += 2
        if verbose and waited % 10 == 0:
            print(f" Processing... {waited}s")

    if uploaded.state.name == 'FAILED':
        raise ValueError(f"File processing failed: {file_path}")

    if uploaded.state.name == 'PROCESSING':
        raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f" Uploaded: {uploaded.name}")

    return uploaded
|
||||
|
||||
|
||||
def convert_to_markdown(
    client: genai.Client,
    file_path: str,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Convert one document to markdown via Gemini with exponential backoff.

    Returns a dict with 'file', 'status' ('success'/'error') and 'markdown'
    (plus 'error' text when the final attempt fails).
    """
    default_prompt = """Convert this document to clean, well-formatted Markdown.

Requirements:
- Preserve all content, structure, and formatting
- Convert tables to markdown table format
- Maintain heading hierarchy (# ## ### etc)
- Preserve lists, code blocks, and quotes
- Extract text from images if present
- Keep formatting consistent and readable

Output only the markdown content without any preamble or explanation."""

    prompt = custom_prompt if custom_prompt else default_prompt

    for attempt in range(max_retries):
        try:
            source = Path(file_path)

            # Files over 20MB must go through the File API; smaller ones
            # can be sent inline as raw bytes.
            if source.stat().st_size > 20 * 1024 * 1024:
                uploaded = upload_file(client, str(file_path), verbose)
                contents = [prompt, uploaded]
            else:
                payload = source.read_bytes()
                contents = [
                    prompt,
                    types.Part.from_bytes(
                        data=payload,
                        mime_type=get_mime_type(str(file_path))
                    )
                ]

            response = client.models.generate_content(
                model=model,
                contents=contents
            )

            text = response.text if hasattr(response, 'text') else ''

            return {
                'file': str(file_path),
                'status': 'success',
                'markdown': text
            }

        except Exception as exc:
            # Last attempt: surface the error instead of retrying again.
            if attempt == max_retries - 1:
                return {
                    'file': str(file_path),
                    'status': 'error',
                    'error': str(exc),
                    'markdown': None
                }

            backoff = 2 ** attempt
            if verbose:
                print(f" Retry {attempt + 1} after {backoff}s: {exc}")
            time.sleep(backoff)
|
||||
|
||||
|
||||
def batch_convert(
    files: List[str],
    output_file: Optional[str] = None,
    auto_name: bool = False,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """Convert multiple documents to markdown and write one combined report.

    Args:
        files: Paths of the input documents.
        output_file: Destination markdown file; defaults to
            <project-root>/docs/assets/document-extraction.md.
        auto_name: With a single input, derive the output name from it
            (e.g. report.pdf -> report-extraction.md).
        model: Gemini model name.
        custom_prompt: Optional prompt overriding the default conversion prompt.
        verbose: Print per-file progress.

    Returns:
        The per-file result dicts from convert_to_markdown().

    Exits the process when no API key can be found.
    """
    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    client = genai.Client(api_key=api_key)
    results = []

    # Default output path lives under the project's docs/assets directory.
    if not output_file:
        output_dir = find_project_root() / 'docs' / 'assets'
        if auto_name and len(files) == 1:
            # Auto-generate meaningful filename from input
            base_name = Path(files[0]).stem
            output_file = str(output_dir / f"{base_name}-extraction.md")
        else:
            output_file = str(output_dir / 'document-extraction.md')

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Convert each input in turn; failures are recorded, not fatal.
    for i, file_path in enumerate(files, 1):
        if verbose:
            print(f"\n[{i}/{len(files)}] Converting: {file_path}")

        result = convert_to_markdown(
            client=client,
            file_path=file_path,
            model=model,
            custom_prompt=custom_prompt,
            verbose=verbose
        )
        results.append(result)

        if verbose:
            print(f" Status: {result.get('status', 'unknown')}")

    # Write all conversions into one combined markdown document.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Document Extraction Results\n\n")
        f.write(f"Converted {len(files)} document(s) to markdown.\n\n")
        f.write("---\n\n")

        for result in results:
            f.write(f"## {Path(result['file']).name}\n\n")

            if result['status'] == 'success' and result.get('markdown'):
                f.write(result['markdown'])
                f.write("\n\n")
            elif result['status'] == 'success':
                f.write("**Note**: Conversion succeeded but no content was returned.\n\n")
            else:
                f.write(f"**Error**: {result.get('error', 'Unknown error')}\n\n")

            f.write("---\n\n")

    # Fix: the original guarded this summary with `if verbose or True:`,
    # a dead condition — the summary is unconditionally printed.
    print(f"\n{'='*50}")
    print(f"Converted: {len(results)} file(s)")
    print(f"Success: {sum(1 for r in results if r['status'] == 'success')}")
    print(f"Failed: {sum(1 for r in results if r['status'] == 'error')}")
    print(f"Output saved to: {output_path}")

    return results
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, expand inputs, run batch conversion."""
    parser = argparse.ArgumentParser(
        description='Convert documents to Markdown using Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
 # Convert single PDF to markdown (default name)
 %(prog)s --input document.pdf

 # Auto-generate meaningful filename
 %(prog)s --input testpdf.pdf --auto-name
 # Output: docs/assets/testpdf-extraction.md

 # Convert multiple files
 %(prog)s --input doc1.pdf doc2.docx image.png

 # Specify custom output location
 %(prog)s --input document.pdf --output ./output.md

 # Use custom prompt
 %(prog)s --input document.pdf --prompt "Extract only the tables as markdown"

 # Batch convert directory
 %(prog)s --input ./documents/*.pdf --verbose

Supported formats:
 - PDF documents (up to 1,000 pages)
 - Images (JPEG, PNG, WEBP, HEIC)
 - Office documents (DOCX, XLSX, PPTX)
 - Text formats (TXT, HTML, Markdown, CSV)

Default output: <project-root>/docs/assets/document-extraction.md
"""
    )

    parser.add_argument('--input', '-i', nargs='+', required=True,
                        help='Input file(s) to convert')
    parser.add_argument('--output', '-o',
                        help='Output markdown file (default: docs/assets/document-extraction.md)')
    parser.add_argument('--auto-name', '-a', action='store_true',
                        help='Auto-generate meaningful output filename from input (e.g., document.pdf -> document-extraction.md)')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--prompt', '-p',
                        help='Custom prompt for conversion')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')

    args = parser.parse_args()

    # Fix: the original re-imported glob inside the per-pattern loop;
    # import once, up front.
    import glob

    # Validate inputs: literal paths first, then fall back to glob patterns.
    files = []
    for file_pattern in args.input:
        file_path = Path(file_pattern)
        if file_path.exists() and file_path.is_file():
            files.append(str(file_path))
        else:
            matched = glob.glob(file_pattern)
            files.extend(f for f in matched if Path(f).is_file())

    if not files:
        print("Error: No valid input files found")
        sys.exit(1)

    batch_convert(
        files=files,
        output_file=args.output,
        auto_name=args.auto_name,
        model=args.model,
        custom_prompt=args.prompt,
        verbose=args.verbose
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
1211
.opencode/skills/ai-multimodal/scripts/gemini_batch_process.py
Executable file
1211
.opencode/skills/ai-multimodal/scripts/gemini_batch_process.py
Executable file
File diff suppressed because it is too large
Load Diff
506
.opencode/skills/ai-multimodal/scripts/media_optimizer.py
Executable file
506
.opencode/skills/ai-multimodal/scripts/media_optimizer.py
Executable file
@@ -0,0 +1,506 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Optimize media files for Gemini API processing.
|
||||
|
||||
Features:
|
||||
- Compress videos/audio for size limits
|
||||
- Resize images appropriately
|
||||
- Split long videos into chunks
|
||||
- Format conversion
|
||||
- Quality vs size optimization
|
||||
- Validation before upload
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def load_env_files():
    """Load layered .env files when python-dotenv is available.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .opencode/skills/ai-multimodal/.env (skill-specific config)
    3. .opencode/skills/.env (shared skills config)
    4. .opencode/.env (Claude global config)
    """
    if not load_dotenv:
        return

    script_dir = Path(__file__).parent
    skill_dir = script_dir.parent      # .opencode/skills/ai-multimodal
    skills_dir = skill_dir.parent      # .opencode/skills
    claude_dir = skills_dir.parent     # .claude

    # load_dotenv does not override variables that are already set, so
    # loading the most specific file first preserves the priority order.
    for env_path in (skill_dir / '.env', skills_dir / '.env', claude_dir / '.env'):
        if env_path.exists():
            load_dotenv(env_path)
|
||||
|
||||
|
||||
# Load environment variables at module level
|
||||
load_env_files()
|
||||
|
||||
|
||||
def check_ffmpeg() -> bool:
    """Return True when an `ffmpeg` binary is runnable on PATH."""
    try:
        subprocess.run(['ffmpeg', '-version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
        return True
    # Fix: the original caught (CalledProcessError, FileNotFoundError,
    # Exception) — the bare Exception made the tuple redundant and would
    # silently swallow unrelated bugs. OSError covers FileNotFoundError
    # and other launch failures.
    except (subprocess.CalledProcessError, OSError):
        return False
|
||||
|
||||
|
||||
def get_media_info(file_path: str) -> Dict[str, Any]:
    """Probe a media file with ffprobe and return its basic properties.

    Returns a dict with 'size', 'duration', 'bit_rate' plus video
    ('width', 'height', 'fps') or audio ('sample_rate', 'channels')
    stream fields, or an empty dict when ffmpeg is unavailable or the
    probe fails.
    """
    from fractions import Fraction  # safe "num/den" frame-rate parsing

    if not check_ffmpeg():
        return {}

    try:
        cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            file_path
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)

        info = {
            'size': int(data['format'].get('size', 0)),
            'duration': float(data['format'].get('duration', 0)),
            'bit_rate': int(data['format'].get('bit_rate', 0)),
        }

        # Get video/audio specific info
        for stream in data.get('streams', []):
            if stream['codec_type'] == 'video':
                info['width'] = stream.get('width', 0)
                info['height'] = stream.get('height', 0)
                # Fix: the original used eval() on the r_frame_rate string,
                # executing ffprobe output as Python code. Fraction parses
                # the same "num/den" form safely.
                try:
                    info['fps'] = float(Fraction(stream.get('r_frame_rate', '0/1')))
                except (ValueError, ZeroDivisionError):
                    info['fps'] = 0.0
            elif stream['codec_type'] == 'audio':
                info['sample_rate'] = int(stream.get('sample_rate', 0))
                info['channels'] = stream.get('channels', 0)

        return info

    # Fix: the original except tuple included a bare Exception, which made
    # the specific entries redundant; catch the failures ffprobe parsing
    # can actually produce.
    except (subprocess.CalledProcessError, json.JSONDecodeError,
            KeyError, ValueError, OSError):
        return {}
|
||||
|
||||
|
||||
def optimize_video(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    max_duration: Optional[int] = None,
    quality: int = 23,
    resolution: Optional[str] = None,
    verbose: bool = False
) -> bool:
    """Re-encode a video (H.264/AAC) so it fits Gemini API limits.

    Returns True on success, False when ffmpeg is missing, the input
    cannot be probed, or encoding fails.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        print("Install: apt-get install ffmpeg (Linux) or brew install ffmpeg (Mac)")
        return False

    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False

    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f" Size: {info['size'] / (1024*1024):.2f} MB")
        print(f" Duration: {info['duration']:.2f}s")
        if 'width' in info:
            print(f" Resolution: {info['width']}x{info['height']}")
        print(f" Bit rate: {info['bit_rate'] / 1000:.0f} kbps")

    # Assemble the ffmpeg command piece by piece.
    cmd = ['ffmpeg', '-i', input_path, '-y']
    cmd += ['-c:v', 'libx264', '-crf', str(quality)]

    # Cap resolution at 1080p unless an explicit scale was requested.
    if resolution:
        cmd += ['-vf', f'scale={resolution}']
    elif 'width' in info and info['width'] > 1920:
        cmd += ['-vf', 'scale=1920:-2']

    cmd += ['-c:a', 'aac', '-b:a', '128k', '-ac', '2']

    # Trim the output when a duration cap applies.
    if max_duration and info['duration'] > max_duration:
        cmd += ['-t', str(max_duration)]

    # Rough bitrate budget: total bits over the effective duration,
    # minus 128 kbps reserved for audio, floored at 500 kbps.
    if target_size_mb:
        total_bits = target_size_mb * 8 * 1024 * 1024
        effective = min(info['duration'], max_duration) if max_duration else info['duration']
        video_bitrate = max(int(total_bits / effective) - 128000, 500000)
        cmd += ['-b:v', str(video_bitrate)]

    cmd.append(output_path)

    if verbose:
        print(f"\nOptimizing...")
        print(f" Command: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)
    except subprocess.CalledProcessError as exc:
        print(f"Error optimizing video: {exc}")
        return False

    result_info = get_media_info(output_path)
    if result_info and verbose:
        print(f"\nOutput: {Path(output_path).name}")
        print(f" Size: {result_info['size'] / (1024*1024):.2f} MB")
        print(f" Duration: {result_info['duration']:.2f}s")
        if 'width' in result_info:
            print(f" Resolution: {result_info['width']}x{result_info['height']}")
        saved = (1 - result_info['size'] / info['size']) * 100
        print(f" Compression: {saved:.1f}%")

    return True
|
||||
|
||||
|
||||
def optimize_audio(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    bitrate: str = '64k',
    sample_rate: int = 16000,
    verbose: bool = False
) -> bool:
    """Transcode audio to mono AAC at a low bitrate for the Gemini API.

    Returns True on success, False when ffmpeg is missing, probing
    fails, or encoding fails.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return False

    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False

    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f" Size: {info['size'] / (1024*1024):.2f} MB")
        print(f" Duration: {info['duration']:.2f}s")

    cmd = [
        'ffmpeg', '-i', input_path, '-y',
        '-c:a', 'aac',
        '-b:a', bitrate,
        '-ar', str(sample_rate),
        '-ac', '1',  # Mono (Gemini uses mono anyway)
        output_path
    ]

    if verbose:
        print(f"\nOptimizing...")

    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)
    except subprocess.CalledProcessError as exc:
        print(f"Error optimizing audio: {exc}")
        return False

    result_info = get_media_info(output_path)
    if result_info and verbose:
        print(f"\nOutput: {Path(output_path).name}")
        print(f" Size: {result_info['size'] / (1024*1024):.2f} MB")
        saved = (1 - result_info['size'] / info['size']) * 100
        print(f" Compression: {saved:.1f}%")

    return True
|
||||
|
||||
|
||||
def optimize_image(
    input_path: str,
    output_path: str,
    max_width: int = 1920,
    quality: int = 85,
    verbose: bool = False
) -> bool:
    """Downscale and recompress an image for the Gemini API.

    Returns True on success, False when Pillow is missing or
    processing fails.
    """
    try:
        from PIL import Image
    except ImportError:
        print("Error: Pillow not installed")
        print("Install with: pip install pillow")
        return False

    try:
        picture = Image.open(input_path)

        if verbose:
            print(f"Input: {Path(input_path).name}")
            print(f" Size: {Path(input_path).stat().st_size / 1024:.2f} KB")
            print(f" Resolution: {picture.width}x{picture.height}")

        # Shrink proportionally when the image is wider than the cap.
        if picture.width > max_width:
            scale = max_width / picture.width
            picture = picture.resize(
                (max_width, int(picture.height * scale)),
                Image.Resampling.LANCZOS
            )
            if verbose:
                print(f" Resized to: {picture.width}x{picture.height}")

        # JPEG has no alpha channel: composite RGBA onto white first.
        if output_path.lower().endswith('.jpg') or output_path.lower().endswith('.jpeg'):
            if picture.mode == 'RGBA':
                flattened = Image.new('RGB', picture.size, (255, 255, 255))
                flattened.paste(picture, mask=picture.split()[3])
                picture = flattened

        picture.save(output_path, quality=quality, optimize=True)

        if verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f" Size: {Path(output_path).stat().st_size / 1024:.2f} KB")
            saved = (1 - Path(output_path).stat().st_size / Path(input_path).stat().st_size) * 100
            print(f" Compression: {saved:.1f}%")

        return True

    except Exception as exc:
        print(f"Error optimizing image: {exc}")
        return False
|
||||
|
||||
|
||||
def split_video(
    input_path: str,
    output_dir: str,
    chunk_duration: int = 3600,
    verbose: bool = False
) -> List[str]:
    """Split a long video into chunk_duration-second pieces via stream copy.

    Args:
        input_path: Source video file.
        output_dir: Directory to create and write chunks into.
        chunk_duration: Maximum seconds per chunk (default 1 hour).
        verbose: Print per-chunk progress.

    Returns:
        Chunk file paths; [input_path] when the video fits in one chunk;
        [] when ffmpeg is unavailable or the input cannot be probed.
    """
    import math

    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return []

    info = get_media_info(input_path)
    if not info:
        return []

    total_duration = info['duration']
    # Fix: the original used int(total / chunk) + 1, which produced a
    # spurious extra (empty) chunk whenever the duration was an exact
    # multiple of chunk_duration. Ceiling division gives the right count.
    num_chunks = max(1, math.ceil(total_duration / chunk_duration))

    if num_chunks == 1:
        if verbose:
            print("Video is short enough, no splitting needed")
        return [input_path]

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_files = []

    for i in range(num_chunks):
        start_time = i * chunk_duration
        output_file = Path(output_dir) / f"{Path(input_path).stem}_chunk_{i+1}.mp4"

        cmd = [
            'ffmpeg', '-i', input_path, '-y',
            '-ss', str(start_time),
            '-t', str(chunk_duration),
            '-c', 'copy',  # stream copy: fast, no re-encode
            str(output_file)
        ]

        if verbose:
            print(f"Creating chunk {i+1}/{num_chunks}...")

        try:
            subprocess.run(cmd, check=True, capture_output=not verbose)
            output_files.append(str(output_file))
        except subprocess.CalledProcessError as e:
            # Keep going: one failed chunk should not abort the rest.
            print(f"Error creating chunk {i+1}: {e}")

    return output_files
|
||||
|
||||
|
||||
def main():
    """CLI entry point: optimize a single file, split a video, or batch-process a directory."""
    parser = argparse.ArgumentParser(
        description='Optimize media files for Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
 # Optimize video to 100MB
 %(prog)s --input video.mp4 --output optimized.mp4 --target-size 100

 # Optimize audio
 %(prog)s --input audio.mp3 --output optimized.m4a --bitrate 64k

 # Resize image
 %(prog)s --input image.jpg --output resized.jpg --max-width 1920

 # Split long video
 %(prog)s --input long-video.mp4 --split --chunk-duration 3600 --output-dir ./chunks

 # Batch optimize directory
 %(prog)s --input-dir ./videos --output-dir ./optimized --quality 85
"""
    )

    parser.add_argument('--input', help='Input file')
    parser.add_argument('--output', help='Output file')
    parser.add_argument('--input-dir', help='Input directory for batch processing')
    parser.add_argument('--output-dir', help='Output directory for batch processing')
    parser.add_argument('--target-size', type=int, help='Target size in MB')
    parser.add_argument('--quality', type=int, default=85,
                        help='Quality (video: 0-51 CRF, image: 1-100) (default: 85)')
    parser.add_argument('--max-width', type=int, default=1920,
                        help='Max image width (default: 1920)')
    parser.add_argument('--bitrate', default='64k',
                        help='Audio bitrate (default: 64k)')
    parser.add_argument('--resolution', help='Video resolution (e.g., 1920x1080)')
    parser.add_argument('--split', action='store_true', help='Split long video into chunks')
    parser.add_argument('--chunk-duration', type=int, default=3600,
                        help='Chunk duration in seconds (default: 3600 = 1 hour)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    if not args.input and not args.input_dir:
        parser.error("Either --input or --input-dir required")

    # Extension groups shared by the single-file and batch paths.
    video_exts = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv']
    audio_exts = ['.mp3', '.wav', '.m4a', '.flac', '.aac']
    image_exts = ['.jpg', '.jpeg', '.png', '.webp']

    # --- Single file processing ---
    if args.input:
        input_path = Path(args.input)
        if not input_path.exists():
            print(f"Error: Input file not found: {input_path}")
            sys.exit(1)

        if args.split:
            chunk_dir = args.output_dir or './chunks'
            chunks = split_video(str(input_path), chunk_dir, args.chunk_duration, args.verbose)
            print(f"\nCreated {len(chunks)} chunks in {chunk_dir}")
            sys.exit(0)

        if not args.output:
            parser.error("--output required for single file processing")

        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Dispatch on file extension.
        ext = input_path.suffix.lower()
        if ext in video_exts:
            ok = optimize_video(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                quality=args.quality,
                resolution=args.resolution,
                verbose=args.verbose
            )
        elif ext in audio_exts:
            ok = optimize_audio(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                bitrate=args.bitrate,
                verbose=args.verbose
            )
        elif ext in image_exts:
            ok = optimize_image(
                str(input_path),
                str(output_path),
                max_width=args.max_width,
                quality=args.quality,
                verbose=args.verbose
            )
        else:
            print(f"Error: Unsupported file type: {ext}")
            sys.exit(1)

        sys.exit(0 if ok else 1)

    # --- Batch processing ---
    if args.input_dir:
        if not args.output_dir:
            parser.error("--output-dir required for batch processing")

        input_dir = Path(args.input_dir)
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Collect every supported media file in the input directory.
        patterns = ['*.mp4', '*.mov', '*.avi', '*.mkv', '*.webm',
                    '*.mp3', '*.wav', '*.m4a', '*.flac',
                    '*.jpg', '*.jpeg', '*.png', '*.webp']
        media_files = []
        for pattern in patterns:
            media_files.extend(input_dir.glob(pattern))

        if not media_files:
            print(f"No media files found in {input_dir}")
            sys.exit(1)

        print(f"Found {len(media_files)} files to process")

        done = 0
        for source in media_files:
            target = output_dir / source.name
            ext = source.suffix.lower()
            ok = False

            if ext in video_exts:
                ok = optimize_video(str(source), str(target),
                                    quality=args.quality, verbose=args.verbose)
            elif ext in audio_exts:
                ok = optimize_audio(str(source), str(target),
                                    bitrate=args.bitrate, verbose=args.verbose)
            elif ext in image_exts:
                ok = optimize_image(str(source), str(target),
                                    max_width=args.max_width, quality=args.quality,
                                    verbose=args.verbose)

            if ok:
                done += 1

        print(f"\nProcessed: {done}/{len(media_files)} files")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
189
.opencode/skills/ai-multimodal/scripts/minimax_api_client.py
Normal file
189
.opencode/skills/ai-multimodal/scripts/minimax_api_client.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MiniMax API client - shared HTTP utilities for all MiniMax generation tasks.
|
||||
|
||||
Handles authentication, API calls, async task polling, and file downloads.
|
||||
Base URL: https://api.minimax.io/v1
|
||||
Auth: Bearer token via MINIMAX_API_KEY environment variable.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
print("Error: requests package not installed")
|
||||
print("Install with: pip install requests")
|
||||
sys.exit(1)
|
||||
|
||||
# Import centralized environment resolver
|
||||
CLAUDE_ROOT = Path(__file__).parent.parent.parent.parent
|
||||
sys.path.insert(0, str(CLAUDE_ROOT / 'scripts'))
|
||||
try:
|
||||
from resolve_env import resolve_env
|
||||
CENTRALIZED_RESOLVER_AVAILABLE = True
|
||||
except ImportError:
|
||||
CENTRALIZED_RESOLVER_AVAILABLE = False
|
||||
|
||||
BASE_URL = "https://api.minimax.io/v1"
|
||||
|
||||
|
||||
def find_minimax_api_key() -> Optional[str]:
    """Locate the MiniMax API key.

    Resolution order:
      1. The centralized ``resolve_env`` helper, when importable.
      2. The ``MINIMAX_API_KEY`` environment variable.
      3. ``.env`` files in the skill directory and its parent
         (requires ``python-dotenv``).

    Returns:
        The key string, or ``None`` when nothing is configured.
    """
    if CENTRALIZED_RESOLVER_AVAILABLE:
        return resolve_env('MINIMAX_API_KEY', skill='ai-multimodal')

    # The process environment takes precedence over any .env file.
    key = os.getenv('MINIMAX_API_KEY')
    if key:
        return key

    # dotenv is an optional dependency; without it we cannot read .env files.
    try:
        from dotenv import load_dotenv
    except ImportError:
        return None

    base = Path(__file__).parent.parent
    for candidate in (base / '.env', base.parent / '.env'):
        if not candidate.exists():
            continue
        load_dotenv(candidate, override=True)
        key = os.getenv('MINIMAX_API_KEY')
        if key:
            return key

    return None
|
||||
|
||||
|
||||
def get_headers(api_key: str) -> Dict[str, str]:
    """Build the HTTP headers required by every MiniMax endpoint."""
    auth_value = f"Bearer {api_key}"
    return {
        "Authorization": auth_value,
        "Content-Type": "application/json",
    }
|
||||
|
||||
|
||||
def api_post(endpoint: str, payload: Dict[str, Any], api_key: str,
             verbose: bool = False, timeout: int = 120) -> Dict[str, Any]:
    """POST *payload* to ``BASE_URL/endpoint`` and return the parsed JSON.

    Raises:
        Exception: on a non-200 HTTP status, or when the body carries a
            non-zero MiniMax ``base_resp.status_code``.
    """
    url = "/".join((BASE_URL, endpoint))

    if verbose:
        print(f" POST {url}", file=sys.stderr)

    response = requests.post(
        url,
        headers=get_headers(api_key),
        json=payload,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(
            f"MiniMax API error (HTTP {response.status_code}): {response.text}"
        )

    data = response.json()

    # MiniMax reports application-level failures inside the JSON body,
    # not via the HTTP status, so both must be checked.
    base_resp = data.get("base_resp", {})
    code = base_resp.get("status_code", 0)
    if code != 0:
        msg = base_resp.get('status_msg', 'Unknown error')
        raise Exception(f"MiniMax API error (code {code}): {msg}")

    return data
|
||||
|
||||
|
||||
def api_get(endpoint: str, params: Dict[str, str], api_key: str,
            verbose: bool = False) -> Dict[str, Any]:
    """GET ``BASE_URL/endpoint`` with *params* and return the parsed JSON.

    Raises:
        Exception: when the HTTP status is anything other than 200.
    """
    url = "/".join((BASE_URL, endpoint))

    if verbose:
        print(f" GET {url}", file=sys.stderr)

    response = requests.get(url, headers=get_headers(api_key),
                            params=params, timeout=60)
    if response.status_code == 200:
        return response.json()

    raise Exception(
        f"MiniMax API error (HTTP {response.status_code}): {response.text}"
    )
|
||||
|
||||
|
||||
def poll_async_task(task_id: str, task_type: str, api_key: str,
                    poll_interval: int = 10, max_wait: int = 600,
                    verbose: bool = False) -> Dict[str, Any]:
    """Poll an async MiniMax task (video/music) until it completes.

    Args:
        task_id: Task ID returned from the creation endpoint.
        task_type: 'video_generation' or 'music_generation'.
        poll_interval: Seconds between status checks (default 10).
        max_wait: Give up after this many seconds (default 600).

    Returns:
        The final query response once the status is "Success".

    Raises:
        Exception: when the task reports "Failed" or "Error".
        TimeoutError: when *max_wait* elapses without completion.
    """
    for waited in range(0, max_wait, poll_interval):
        result = api_get(f"query/{task_type}", {"task_id": task_id},
                         api_key, verbose=False)
        status = result.get("status", "Unknown")

        # Heartbeat roughly every 30 seconds (skip the very first check).
        if verbose and waited > 0 and waited % 30 == 0:
            print(f" Polling... {waited}s elapsed, status: {status}",
                  file=sys.stderr)

        if status == "Success":
            return result
        if status in ("Failed", "Error"):
            raise Exception(f"Task failed: {json.dumps(result)}")

        time.sleep(poll_interval)

    raise TimeoutError(f"Task {task_id} timed out after {max_wait}s")
|
||||
|
||||
|
||||
def download_file(file_id: str, api_key: str, output_path: str,
                  verbose: bool = False) -> str:
    """Download a generated file from the MiniMax file service.

    Returns:
        The *output_path* that was written.

    Raises:
        Exception: when the retrieve response carries no download URL.
    """
    result = api_get("files/retrieve", {"file_id": file_id}, api_key, verbose)

    download_url = result.get("file", {}).get("download_url")
    if not download_url:
        raise Exception(f"No download URL in response: {json.dumps(result)}")

    if verbose:
        print(f" Downloading to: {output_path}", file=sys.stderr)

    # Stream in 8 KiB chunks so large videos never sit fully in memory.
    response = requests.get(download_url, stream=True, timeout=300)
    response.raise_for_status()

    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('wb') as handle:
        for chunk in response.iter_content(chunk_size=8192):
            handle.write(chunk)

    return output_path
|
||||
|
||||
|
||||
def get_output_dir() -> Path:
    """Return (and create) the directory for generated assets.

    Walks upward from this script looking for a repository marker
    (``.git`` or ``.claude``); uses ``<root>/docs/assets`` when found,
    otherwise falls back to an ``assets`` directory beside the scripts
    folder.
    """
    here = Path(__file__).parent

    root = None
    for candidate in [here] + list(here.parents):
        if (candidate / '.git').exists() or (candidate / '.claude').exists():
            root = candidate
            break

    target = (root / 'docs' / 'assets') if root is not None else (here.parent / 'assets')
    target.mkdir(parents=True, exist_ok=True)
    return target
|
||||
178
.opencode/skills/ai-multimodal/scripts/minimax_cli.py
Normal file
178
.opencode/skills/ai-multimodal/scripts/minimax_cli.py
Normal file
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MiniMax CLI entry point - standalone CLI for MiniMax generation tasks.
|
||||
|
||||
Can be called directly or delegated to from gemini_batch_process.py
|
||||
when MiniMax models are detected.
|
||||
|
||||
Usage:
|
||||
python minimax_cli.py --task generate --prompt "A cat" --model image-01
|
||||
python minimax_cli.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3
|
||||
python minimax_cli.py --task generate-speech --text "Hello" --model speech-2.8-hd --voice English_Warm_Bestie
|
||||
python minimax_cli.py --task generate-music --lyrics "La la la" --prompt "pop song" --model music-2.5
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from minimax_api_client import find_minimax_api_key
|
||||
from minimax_generate import (
|
||||
generate_image, generate_video, generate_speech, generate_music
|
||||
)
|
||||
|
||||
TASK_DEFAULTS = {
|
||||
'generate': 'image-01',
|
||||
'generate-video': 'MiniMax-Hailuo-2.3',
|
||||
'generate-speech': 'speech-2.8-hd',
|
||||
'generate-music': 'music-2.5'
|
||||
}
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, resolve the MiniMax API key,
    dispatch to the matching generation helper, and print a summary.

    Exits with status 1 on missing API key or any generation failure.
    """
    parser = argparse.ArgumentParser(
        description='MiniMax AI generation CLI (image/video/speech/music)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate image
  %(prog)s --task generate --prompt "A cyberpunk city at night" --model image-01 --aspect-ratio 16:9

  # Generate video (async, ~30-60s)
  %(prog)s --task generate-video --prompt "A dancer performing" --model MiniMax-Hailuo-2.3

  # Generate speech
  %(prog)s --task generate-speech --text "Welcome to the show" --model speech-2.8-hd --voice English_Warm_Bestie

  # Generate music with lyrics
  %(prog)s --task generate-music --lyrics "Verse 1\\nHello world" --prompt "upbeat pop" --model music-2.5
"""
    )

    # --- Argument definitions -------------------------------------------
    parser.add_argument('--task', required=True,
                        choices=['generate', 'generate-video',
                                 'generate-speech', 'generate-music'],
                        help='Generation task type')
    parser.add_argument('--prompt', help='Text prompt for generation')
    parser.add_argument('--text', help='Text for speech generation')
    parser.add_argument('--lyrics', help='Lyrics for music generation')
    parser.add_argument('--model', help='Model name (auto-detected from task)')
    parser.add_argument('--aspect-ratio', default='1:1',
                        choices=['1:1', '16:9', '4:3', '3:2', '2:3',
                                 '3:4', '9:16', '21:9'],
                        help='Aspect ratio for image generation')
    parser.add_argument('--num-images', type=int, default=1,
                        help='Number of images (1-9, default: 1)')
    parser.add_argument('--duration', type=int, default=6,
                        choices=[6, 10],
                        help='Video duration in seconds (6 or 10)')
    parser.add_argument('--resolution', default='1080P',
                        choices=['720P', '1080P'],
                        help='Video resolution')
    parser.add_argument('--voice', default='English_expressive_narrator',
                        help='Voice ID for speech (default: English_expressive_narrator)')
    parser.add_argument('--emotion', default='neutral',
                        choices=['happy', 'sad', 'angry', 'fearful',
                                 'disgusted', 'surprised', 'neutral'],
                        help='Emotion for speech')
    parser.add_argument('--output-format', default='mp3',
                        choices=['mp3', 'wav', 'flac', 'pcm'],
                        help='Audio output format')
    parser.add_argument('--first-frame', help='Image URL for video first frame')
    parser.add_argument('--output', '-o', help='Output file path')
    parser.add_argument('--verbose', '-v', action='store_true')

    args = parser.parse_args()

    # Auto-detect model from task
    if not args.model:
        args.model = TASK_DEFAULTS.get(args.task, 'image-01')
        if args.verbose:
            print(f"Auto-detected model: {args.model}")

    # Find API key (env var, .env files, or centralized resolver).
    api_key = find_minimax_api_key()
    if not api_key:
        print("Error: MINIMAX_API_KEY not found")
        print("\nSetup:")
        print("1. export MINIMAX_API_KEY='your-key'")
        print("2. Or add to .env: MINIMAX_API_KEY=your-key")
        print("\nGet key at: https://platform.minimax.io/user-center/basic-information/interface-key")
        sys.exit(1)

    # Dispatch to task handler
    try:
        if args.task == 'generate':
            if not args.prompt:
                parser.error("--prompt required for image generation")
            result = generate_image(
                api_key, args.prompt, args.model,
                args.aspect_ratio, args.num_images,
                args.output, args.verbose
            )
        elif args.task == 'generate-video':
            if not args.prompt:
                parser.error("--prompt required for video generation")
            result = generate_video(
                api_key, args.prompt, args.model,
                args.duration, args.resolution,
                args.first_frame, args.output, args.verbose
            )
        elif args.task == 'generate-speech':
            # Speech accepts --prompt as a fallback for --text.
            text = args.text or args.prompt
            if not text:
                parser.error("--text or --prompt required for speech")
            result = generate_speech(
                api_key, text, args.model,
                args.voice, args.emotion, args.output_format,
                output=args.output, verbose=args.verbose
            )
        elif args.task == 'generate-music':
            if not args.lyrics and not args.prompt:
                parser.error("--lyrics or --prompt required for music")
            result = generate_music(
                api_key, args.lyrics or '', args.prompt or '',
                args.model, args.output_format,
                args.output, args.verbose
            )
        else:
            # NOTE(review): argparse's `choices` already rejects unknown tasks,
            # and parser.error() raises SystemExit, so this `return` is
            # unreachable defensive code — confirm before removing.
            parser.error(f"Unknown task: {args.task}")
            return

        # Print results
        print_result(result, args.task)

    except Exception as e:
        print(f"\nError: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def print_result(result: dict, task: str):
    """Print a generation result in an LLM-friendly plain-text format."""
    print("\n=== RESULTS ===\n")
    print(f"[{task}]")
    print(f"Status: {result.get('status', 'unknown')}")

    if result.get('status') == 'success':
        # Report whichever artifact keys the task handler produced.
        for img in result.get('generated_images', []):
            print(f"Generated image: {img}")
        if 'generated_video' in result:
            print(f"Generated video: {result['generated_video']}")
        if 'generation_time' in result:
            print(f"Generation time: {result['generation_time']:.1f}s")
        if 'generated_audio' in result:
            print(f"Generated audio: {result['generated_audio']}")
        if 'duration_ms' in result:
            seconds = result['duration_ms'] / 1000
            print(f"Duration: {seconds:.1f}s")
    elif result.get('error'):
        print(f"Error: {result['error']}")

    print(f"\nModel: {result.get('model', 'unknown')}")


if __name__ == '__main__':
    main()
|
||||
278
.opencode/skills/ai-multimodal/scripts/minimax_generate.py
Normal file
278
.opencode/skills/ai-multimodal/scripts/minimax_generate.py
Normal file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MiniMax generation CLI - image, video, speech, and music generation.
|
||||
|
||||
Models:
|
||||
- Image: image-01, image-01-live
|
||||
- Video: MiniMax-Hailuo-2.3, MiniMax-Hailuo-2.3-Fast, MiniMax-Hailuo-02, S2V-01
|
||||
- Speech: speech-2.8-hd, speech-2.8-turbo, speech-2.6-hd, speech-2.6-turbo
|
||||
- Music: music-2.5
|
||||
|
||||
Usage:
|
||||
python minimax_generate.py --task generate --prompt "A cat in space" --model image-01
|
||||
python minimax_generate.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3
|
||||
python minimax_generate.py --task generate-speech --text "Hello world" --model speech-2.8-hd
|
||||
python minimax_generate.py --task generate-music --lyrics "Verse 1..." --model music-2.5
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from minimax_api_client import (
|
||||
find_minimax_api_key, api_post, poll_async_task,
|
||||
download_file, get_output_dir
|
||||
)
|
||||
|
||||
# Model registries -- known MiniMax model identifiers, grouped by modality.
MINIMAX_IMAGE_MODELS = {'image-01', 'image-01-live'}
MINIMAX_VIDEO_MODELS = {
    'MiniMax-Hailuo-2.3', 'MiniMax-Hailuo-2.3-Fast',
    'MiniMax-Hailuo-02', 'S2V-01'
}
MINIMAX_SPEECH_MODELS = {
    'speech-2.8-hd', 'speech-2.8-turbo',
    'speech-2.6-hd', 'speech-2.6-turbo',
    'speech-02-hd', 'speech-02-turbo'
}
MINIMAX_MUSIC_MODELS = {'music-2.5', 'music-2.0'}

# Union of every registry above.
ALL_MINIMAX_MODELS = set().union(
    MINIMAX_IMAGE_MODELS, MINIMAX_VIDEO_MODELS,
    MINIMAX_SPEECH_MODELS, MINIMAX_MUSIC_MODELS,
)


def is_minimax_model(model: str) -> bool:
    """Return True when *model* names a MiniMax model.

    Matches both the explicit registries and MiniMax-style name prefixes,
    so unreleased model variants are still routed correctly.
    """
    if model in ALL_MINIMAX_MODELS:
        return True
    minimax_prefixes = ('MiniMax-', 'image-01', 'speech-', 'music-', 'S2V-')
    return model.startswith(minimax_prefixes)
|
||||
|
||||
|
||||
def generate_image(api_key: str, prompt: str, model: str = 'image-01',
                   aspect_ratio: str = '1:1', num_images: int = 1,
                   output: str = None, verbose: bool = False) -> dict:
    """Generate images via the MiniMax image_generation endpoint.

    Downloads every returned image into the project asset directory; when
    *output* is given, the first image is additionally copied there.

    Returns:
        dict with ``status`` plus ``generated_images`` paths on success,
        or ``status == 'error'`` when the API returns no image URLs.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "aspect_ratio": aspect_ratio,
        "n": min(num_images, 9),  # the API caps a single request at 9 images
        "response_format": "url",
        "prompt_optimizer": True
    }

    if verbose:
        print(f"Generating {num_images} image(s) with {model}...")

    api_reply = api_post("image_generation", request_body, api_key, verbose)

    urls = api_reply.get("data", {}).get("image_urls", [])
    if not urls:
        return {"status": "error", "error": "No images in response"}

    import requests as req

    asset_dir = get_output_dir()
    saved = []
    for index, url in enumerate(urls):
        stamp = int(time.time())
        destination = asset_dir / f"minimax_image_{stamp}_{index}.png"

        download = req.get(url, timeout=60)
        download.raise_for_status()
        destination.write_bytes(download.content)
        saved.append(str(destination))

        if verbose:
            print(f" Saved: {destination}")

    # Copy the first image to the caller-specified path, if any.
    if output and saved:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(saved[0], output)

    return {"status": "success", "generated_images": saved, "model": model}
|
||||
|
||||
|
||||
def generate_video(api_key: str, prompt: str, model: str = 'MiniMax-Hailuo-2.3',
                   duration: int = 6, resolution: str = '1080P',
                   first_frame: str = None, output: str = None,
                   verbose: bool = False) -> dict:
    """Generate video using MiniMax Hailuo models (async).

    Submits a video_generation task, polls until completion, downloads the
    result into the project asset directory, and optionally copies it to
    *output*.

    Args:
        api_key: MiniMax API key.
        prompt: Text description of the desired video.
        model: Hailuo model name.
        duration: Clip length in seconds.
        resolution: '720P' or '1080P'.
        first_frame: Optional image URL used as the video's first frame.
        output: Optional extra path to copy the finished file to.
        verbose: Print progress messages.

    Returns:
        dict with status, path, timing, and size on success; a dict with
        status 'error' when the API omits the task or file ID.

    Raises:
        Exception / TimeoutError: propagated from polling or download.
    """
    payload = {
        "prompt": prompt,
        "model": model,
        "duration": duration,
        "resolution": resolution
    }
    if first_frame:
        payload["first_frame_image"] = first_frame

    if verbose:
        print(f"Submitting video generation with {model}...")

    result = api_post("video_generation", payload, api_key, verbose)
    task_id = result.get("task_id")
    if not task_id:
        return {"status": "error", "error": f"No task_id: {json.dumps(result)}"}

    if verbose:
        print(f" Task ID: {task_id}, polling...")

    start = time.time()
    poll_result = poll_async_task(task_id, "video_generation", api_key,
                                  poll_interval=10, verbose=verbose)

    file_id = poll_result.get("file_id")
    if not file_id:
        return {"status": "error", "error": f"No file_id: {json.dumps(poll_result)}"}

    output_dir = get_output_dir()
    ts = int(time.time())
    output_path = str(output_dir / f"minimax_video_{ts}.mp4")
    download_file(file_id, api_key, output_path, verbose)

    elapsed = time.time() - start
    # Convert bytes -> megabytes for reporting.
    file_size = Path(output_path).stat().st_size / (1024 * 1024)

    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)

    if verbose:
        print(f" Generated in {elapsed:.1f}s, size: {file_size:.2f} MB")

    return {
        "status": "success", "generated_video": output_path,
        "generation_time": elapsed, "file_size_mb": file_size, "model": model
    }
|
||||
|
||||
|
||||
def generate_speech(api_key: str, text: str, model: str = 'speech-2.8-hd',
                    voice: str = 'English_expressive_narrator',
                    emotion: str = 'neutral', output_format: str = 'mp3',
                    rate: float = 1.0, output: str = None,
                    verbose: bool = False) -> dict:
    """Generate speech using the MiniMax TTS v2 (t2a_v2) API.

    Args:
        api_key: MiniMax API key.
        text: Text to synthesize (truncated to the API's 10k-char limit).
        model: TTS model name (e.g. 'speech-2.8-hd').
        voice: Voice ID.
        emotion: Emotion preset ('happy', 'sad', ..., 'neutral').
        output_format: 'mp3', 'wav', 'flac' or 'pcm'.
        rate: Speech speed multiplier.
        output: Optional extra path to copy the audio to.
        verbose: Print progress messages.

    Returns:
        dict with status and the saved audio path on success.
    """
    payload = {
        "model": model,
        "text": text[:10000],  # defensive truncation at the documented limit
        "stream": False,
        "language_boost": "auto",
        "output_format": "hex",  # audio arrives hex-encoded in the JSON body
        "voice_setting": {
            "voice_id": voice,
            "speed": rate,
            "vol": 1.0,
            "pitch": 0,
            # Bug fix: `emotion` was accepted by this function (and exposed
            # via the CLI's --emotion flag) but never forwarded to the API,
            # so the flag silently had no effect. T2A v2 supports an
            # `emotion` field inside voice_setting.
            "emotion": emotion
        },
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": output_format,
            "channel": 1
        }
    }

    if verbose:
        print(f"Generating speech with {model}, voice: {voice}...")

    result = api_post("t2a_v2", payload, api_key, verbose)

    audio_data = result.get("data", {}).get("audio")
    if not audio_data:
        return {"status": "error", "error": "No audio in response"}

    output_dir = get_output_dir()
    ts = int(time.time())
    # 'pcm' has no natural container extension; fall back to .mp3 naming.
    ext = output_format if output_format in ('mp3', 'wav', 'flac') else 'mp3'
    output_path = str(output_dir / f"minimax_speech_{ts}.{ext}")

    # Audio returned as hex-encoded string from t2a_v2
    audio_bytes = bytes.fromhex(audio_data)
    with open(output_path, 'wb') as f:
        f.write(audio_bytes)

    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)

    if verbose:
        size_kb = len(audio_bytes) / 1024
        print(f" Saved: {output_path} ({size_kb:.1f} KB)")

    return {"status": "success", "generated_audio": output_path, "model": model}
|
||||
|
||||
|
||||
def generate_music(api_key: str, lyrics: str = '', prompt: str = '',
                   model: str = 'music-2.5', output_format: str = 'mp3',
                   output: str = None, verbose: bool = False) -> dict:
    """Generate music using MiniMax music models.

    Args:
        api_key: MiniMax API key.
        lyrics: Song lyrics (truncated to 3500 chars; optional).
        prompt: Style description (truncated to 2000 chars; optional).
        model: Music model name.
        output_format: Audio container format ('mp3', 'wav', ...).
        output: Optional extra path to copy the audio to.
        verbose: Print progress messages.

    Returns:
        dict with status, saved audio path, and duration on success.
    """
    payload = {
        "model": model,
        "output_format": "url",
        "audio_setting": {
            "sample_rate": 44100,
            "bitrate": 128000,
            "format": output_format
        }
    }
    # Both fields are optional; the API limits each, so truncate defensively.
    if lyrics:
        payload["lyrics"] = lyrics[:3500]
    if prompt:
        payload["prompt"] = prompt[:2000]

    if verbose:
        print(f"Generating music with {model}...")

    # Music generation is synchronous but slow; allow up to 5 minutes.
    result = api_post("music_generation", payload, api_key, verbose, timeout=300)

    audio_data = result.get("data", {}).get("audio")
    extra = result.get("extra_info", {})
    duration_ms = extra.get("music_duration", 0)

    if not audio_data:
        return {"status": "error", "error": "No audio in response"}

    output_dir = get_output_dir()
    ts = int(time.time())
    output_path = str(output_dir / f"minimax_music_{ts}.{output_format}")

    # Download from URL or decode hex -- the API may return either,
    # despite output_format being set to "url" above.
    if audio_data.startswith("http"):
        import requests as req
        resp = req.get(audio_data, timeout=120)
        resp.raise_for_status()
        with open(output_path, 'wb') as f:
            f.write(resp.content)
    else:
        audio_bytes = bytes.fromhex(audio_data)
        with open(output_path, 'wb') as f:
            f.write(audio_bytes)

    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)

    if verbose:
        dur_s = duration_ms / 1000 if duration_ms else 0
        print(f" Saved: {output_path} ({dur_s:.1f}s)")

    return {
        "status": "success", "generated_audio": output_path,
        "duration_ms": duration_ms, "model": model
    }
|
||||
26
.opencode/skills/ai-multimodal/scripts/requirements.txt
Normal file
26
.opencode/skills/ai-multimodal/scripts/requirements.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
# AI Multimodal Skill Dependencies
|
||||
# Python 3.10+ required
|
||||
|
||||
# Google Gemini API
|
||||
google-genai>=0.1.0
|
||||
|
||||
# PDF processing
|
||||
pypdf>=4.0.0
|
||||
|
||||
# Document conversion
|
||||
python-docx>=1.0.0
|
||||
docx2pdf>=0.1.8 # Windows only, optional on Linux/macOS
|
||||
|
||||
# Markdown processing
|
||||
markdown>=3.5.0
|
||||
|
||||
# Image processing
|
||||
Pillow>=10.0.0
|
||||
|
||||
# Environment variable management
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Testing dependencies (dev)
|
||||
pytest>=8.0.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.12.0
|
||||
BIN
.opencode/skills/ai-multimodal/scripts/tests/.coverage
Normal file
BIN
.opencode/skills/ai-multimodal/scripts/tests/.coverage
Normal file
Binary file not shown.
@@ -0,0 +1,20 @@
|
||||
# Core dependencies
|
||||
google-genai>=0.2.0
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Image processing
|
||||
pillow>=10.0.0
|
||||
|
||||
# PDF processing
|
||||
pypdf>=3.0.0
|
||||
|
||||
# Document conversion
|
||||
markdown>=3.5
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.12.0
|
||||
|
||||
# Optional dependencies for full functionality
|
||||
# ffmpeg-python>=0.2.0 # For media optimization (requires ffmpeg installed)
|
||||
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Tests for document_converter.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock, mock_open
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import document_converter as dc
|
||||
|
||||
|
||||
class TestAPIKeyFinder:
    """Test API key finding logic for document_converter."""

    @patch.dict('os.environ', {'GEMINI_API_KEY': 'test-key-from-env'})
    def test_find_api_key_from_env(self):
        """Test finding API key from environment."""
        api_key = dc.find_api_key()
        assert api_key == 'test-key-from-env'

    @patch.dict('os.environ', {}, clear=True)
    @patch('document_converter.load_dotenv', None)
    def test_find_api_key_no_key(self):
        """Test when no API key is available."""
        # With the environment cleared and load_dotenv patched out, no
        # source can supply a key, so the lookup must return None.
        api_key = dc.find_api_key()
        assert api_key is None
|
||||
|
||||
|
||||
class TestProjectRoot:
    """Test project root finding."""

    @patch('pathlib.Path.exists')
    def test_find_project_root_with_git(self, mock_exists):
        """Test finding project root with .git directory."""
        # NOTE(review): mock_exists is never configured, so Path.exists
        # returns a truthy Mock for every candidate — this only verifies
        # the return type, not the marker-detection logic. Consider
        # setting mock_exists.return_value explicitly.
        root = dc.find_project_root()
        assert isinstance(root, Path)
|
||||
|
||||
|
||||
class TestMimeType:
    """MIME type detection for the document converter."""

    def test_pdf_mime_type(self):
        """PDF files map to application/pdf."""
        assert dc.get_mime_type('document.pdf') == 'application/pdf'

    def test_image_mime_types(self):
        """Common image extensions map to their image/* types."""
        expected = {
            'image.jpg': 'image/jpeg',
            'image.png': 'image/png',
        }
        for path, mime in expected.items():
            assert dc.get_mime_type(path) == mime

    def test_unknown_mime_type(self):
        """Unrecognized extensions fall back to the generic binary type."""
        assert dc.get_mime_type('file.unknown') == 'application/octet-stream'
|
||||
|
||||
|
||||
class TestIntegration:
    """Integration-style checks across a mix of file extensions."""

    def test_mime_type_integration(self):
        """get_mime_type handles a representative mix of extensions."""
        cases = {
            'document.pdf': 'application/pdf',
            'image.jpg': 'image/jpeg',
            'unknown.xyz': 'application/octet-stream',
        }
        for file_path, expected_mime in cases.items():
            assert dc.get_mime_type(file_path) == expected_mime


if __name__ == '__main__':
    pytest.main([__file__, '-v', '--cov=document_converter', '--cov-report=term-missing'])
|
||||
@@ -0,0 +1,362 @@
|
||||
"""
|
||||
Tests for gemini_batch_process.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import gemini_batch_process as gbp
|
||||
|
||||
|
||||
class TestAPIKeyFinder:
    """Test API key detection for gemini_batch_process."""

    def test_find_api_key_from_env(self, monkeypatch):
        """Test finding API key from environment variable."""
        monkeypatch.setenv('GEMINI_API_KEY', 'test_key_123')
        assert gbp.find_api_key() == 'test_key_123'

    @patch('gemini_batch_process.load_dotenv')
    def test_find_api_key_not_found(self, mock_load_dotenv, monkeypatch):
        """Test when API key is not found."""
        # Remove the env var and neutralize .env loading so no key source
        # remains; find_api_key must then report None.
        monkeypatch.delenv('GEMINI_API_KEY', raising=False)
        # Mock load_dotenv to not actually load any files
        mock_load_dotenv.return_value = None
        assert gbp.find_api_key() is None
|
||||
class TestMimeTypeDetection:
    """Test MIME type detection across every media category gbp supports."""

    def test_audio_mime_types(self):
        """Test audio file MIME types."""
        assert gbp.get_mime_type('test.mp3') == 'audio/mp3'
        assert gbp.get_mime_type('test.wav') == 'audio/wav'
        assert gbp.get_mime_type('test.aac') == 'audio/aac'
        assert gbp.get_mime_type('test.flac') == 'audio/flac'

    def test_image_mime_types(self):
        """Test image file MIME types."""
        # .jpg and .jpeg must both normalize to image/jpeg.
        assert gbp.get_mime_type('test.jpg') == 'image/jpeg'
        assert gbp.get_mime_type('test.jpeg') == 'image/jpeg'
        assert gbp.get_mime_type('test.png') == 'image/png'
        assert gbp.get_mime_type('test.webp') == 'image/webp'

    def test_video_mime_types(self):
        """Test video file MIME types."""
        assert gbp.get_mime_type('test.mp4') == 'video/mp4'
        assert gbp.get_mime_type('test.mov') == 'video/quicktime'
        assert gbp.get_mime_type('test.avi') == 'video/x-msvideo'

    def test_document_mime_types(self):
        """Test document file MIME types."""
        assert gbp.get_mime_type('test.pdf') == 'application/pdf'
        assert gbp.get_mime_type('test.txt') == 'text/plain'

    def test_unknown_mime_type(self):
        """Test unknown file extension."""
        assert gbp.get_mime_type('test.xyz') == 'application/octet-stream'

    def test_case_insensitive(self):
        """Test case-insensitive extension matching."""
        assert gbp.get_mime_type('TEST.MP3') == 'audio/mp3'
        assert gbp.get_mime_type('Test.JPG') == 'image/jpeg'
|
||||
|
||||
|
||||
class TestFileUpload:
    """Test file upload functionality (upload_file state machine)."""

    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_success(self, mock_client_class):
        """Test successful file upload."""
        # Mock client and file: an immediately-ACTIVE file needs no polling.
        mock_client = Mock()
        mock_file = Mock()
        mock_file.state.name = 'ACTIVE'
        mock_file.name = 'test_file'
        mock_client.files.upload.return_value = mock_file

        result = gbp.upload_file(mock_client, 'test.jpg', verbose=False)

        assert result == mock_file
        mock_client.files.upload.assert_called_once_with(file='test.jpg')

    @patch('gemini_batch_process.genai.Client')
    @patch('gemini_batch_process.time.sleep')
    def test_upload_video_with_processing(self, mock_sleep, mock_client_class):
        """Test video upload with processing wait."""
        mock_client = Mock()

        # First call: PROCESSING, second call: ACTIVE — exercises the
        # poll-until-ready loop (time.sleep is patched so the test is fast).
        mock_file_processing = Mock()
        mock_file_processing.state.name = 'PROCESSING'
        mock_file_processing.name = 'test_video'

        mock_file_active = Mock()
        mock_file_active.state.name = 'ACTIVE'
        mock_file_active.name = 'test_video'

        mock_client.files.upload.return_value = mock_file_processing
        mock_client.files.get.return_value = mock_file_active

        result = gbp.upload_file(mock_client, 'test.mp4', verbose=False)

        assert result.state.name == 'ACTIVE'

    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_failed(self, mock_client_class):
        """Test failed file upload."""
        # files.get also returns the FAILED mock, so any polling loop
        # terminates and upload_file must raise.
        mock_client = Mock()
        mock_file = Mock()
        mock_file.state.name = 'FAILED'
        mock_client.files.upload.return_value = mock_file
        mock_client.files.get.return_value = mock_file

        with pytest.raises(ValueError, match="File processing failed"):
            gbp.upload_file(mock_client, 'test.mp4', verbose=False)
|
||||
|
||||
|
||||
class TestProcessFile:
    """Test file processing functionality.

    Covers the inline-data path (small files), the File API path (large
    files), error handling with retries, and image-generation config.
    """

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_process_small_file_inline(self, mock_stat, mock_open, mock_client_class):
        """Test processing small file with inline data."""
        # Mock small file (10MB — presumably below the inline-data size
        # threshold used by process_file; confirm against implementation).
        mock_stat.return_value.st_size = 10 * 1024 * 1024  # 10MB

        # Mock file content read via the context-manager protocol.
        mock_open.return_value.__enter__.return_value.read.return_value = b'test_data'

        # Mock client and response
        mock_client = Mock()
        mock_response = Mock()
        mock_response.text = 'Test response'
        mock_client.models.generate_content.return_value = mock_response

        result = gbp.process_file(
            client=mock_client,
            file_path='test.jpg',
            prompt='Describe this image',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False
        )

        assert result['status'] == 'success'
        assert result['response'] == 'Test response'

    @patch('gemini_batch_process.upload_file')
    @patch('gemini_batch_process.genai.Client')
    @patch('pathlib.Path.stat')
    def test_process_large_file_api(self, mock_stat, mock_client_class, mock_upload):
        """Test processing large file with File API."""
        # Mock large file (50MB) — expected to trigger the upload path
        # instead of inline data.
        mock_stat.return_value.st_size = 50 * 1024 * 1024  # 50MB

        # Mock upload and response
        mock_file = Mock()
        mock_upload.return_value = mock_file

        mock_client = Mock()
        mock_response = Mock()
        mock_response.text = 'Test response'
        mock_client.models.generate_content.return_value = mock_response

        result = gbp.process_file(
            client=mock_client,
            file_path='test.mp4',
            prompt='Summarize this video',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False
        )

        assert result['status'] == 'success'
        # Large files must go through the File API upload helper.
        mock_upload.assert_called_once()

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_process_file_error_handling(self, mock_stat, mock_open, mock_client_class):
        """Test error handling in file processing."""
        mock_stat.return_value.st_size = 1024

        # Mock file read
        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = b'test_data'
        mock_open.return_value = mock_file

        # API call always fails; max_retries=1 keeps the test fast.
        mock_client = Mock()
        mock_client.models.generate_content.side_effect = Exception("API Error")

        result = gbp.process_file(
            client=mock_client,
            file_path='test.jpg',
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            max_retries=1
        )

        # Errors are reported in the result dict, not raised.
        assert result['status'] == 'error'
        assert 'API Error' in result['error']

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_image_generation_with_aspect_ratio(self, mock_stat, mock_open, mock_client_class):
        """Test image generation with aspect ratio config."""
        mock_stat.return_value.st_size = 1024

        # Mock file read
        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = b'test'
        mock_open.return_value = mock_file

        # The generate task returns image bytes via candidates[0].content.parts.
        mock_client = Mock()
        mock_response = Mock()
        mock_response.candidates = [Mock()]
        mock_response.candidates[0].content.parts = [
            Mock(inline_data=Mock(data=b'fake_image_data'))
        ]
        mock_client.models.generate_content.return_value = mock_response

        result = gbp.process_file(
            client=mock_client,
            file_path='test.txt',
            prompt='Generate mountain landscape',
            model='gemini-2.5-flash-image',
            task='generate',
            format_output='text',
            aspect_ratio='16:9',
            verbose=False
        )

        # Verify config was called with correct structure
        call_args = mock_client.models.generate_content.call_args
        config = call_args.kwargs.get('config')
        assert config is not None
        assert result['status'] == 'success'
        assert 'generated_image' in result
|
||||
|
||||
|
||||
class TestBatchProcessing:
    """Test batch processing functionality.

    Covers the success path, missing-API-key exit, and dry-run mode.
    """

    @patch('gemini_batch_process.find_api_key')
    @patch('gemini_batch_process.process_file')
    @patch('gemini_batch_process.genai.Client')
    def test_batch_process_success(self, mock_client_class, mock_process, mock_find_key):
        """Test successful batch processing: one result per input file."""
        mock_find_key.return_value = 'test_key'
        mock_process.return_value = {'status': 'success', 'response': 'Test'}

        results = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Analyze',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=False
        )

        assert len(results) == 2
        assert all(r['status'] == 'success' for r in results)

    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_no_api_key(self, mock_find_key):
        """Test batch processing without API key: expected to sys.exit."""
        mock_find_key.return_value = None

        with pytest.raises(SystemExit):
            gbp.batch_process(
                files=['test.jpg'],
                prompt='Test',
                model='gemini-2.5-flash',
                task='analyze',
                format_output='text',
                verbose=False,
                dry_run=False
            )

    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_dry_run(self, mock_find_key):
        """Test dry run mode: no files are processed, empty result list."""
        # API key not needed for dry run, but we mock it to avoid sys.exit
        mock_find_key.return_value = 'test_key'

        results = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=True
        )

        assert results == []
|
||||
|
||||
|
||||
class TestResultsSaving:
    """Test results saving functionality for the JSON, CSV, and Markdown formats."""

    @patch('builtins.open', create=True)
    @patch('json.dump')
    def test_save_results_json(self, mock_json_dump, mock_open):
        """Test saving results as JSON."""
        results = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'}
        ]

        gbp.save_results(results, 'output.json', 'json')

        # The whole result list is serialized in a single json.dump call.
        mock_json_dump.assert_called_once()

    @patch('builtins.open', create=True)
    @patch('csv.DictWriter')
    def test_save_results_csv(self, mock_csv_writer, mock_open):
        """Test saving results as CSV."""
        results = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'}
        ]

        gbp.save_results(results, 'output.csv', 'csv')

        # Verify CSV writer was used
        mock_csv_writer.assert_called_once()

    @patch('builtins.open', create=True)
    def test_save_results_markdown(self, mock_open):
        """Test saving results as Markdown."""
        mock_file = MagicMock()
        mock_open.return_value.__enter__.return_value = mock_file

        # Mixed success/error results exercise both markdown branches.
        results = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'error', 'error': 'Failed'}
        ]

        gbp.save_results(results, 'output.md', 'markdown')

        # Verify write was called
        assert mock_file.write.call_count > 0
|
||||
|
||||
|
||||
# Allow running this test module directly, with coverage reporting enabled.
if __name__ == '__main__':
    pytest.main([__file__, '-v', '--cov=gemini_batch_process', '--cov-report=term-missing'])
|
||||
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Tests for media_optimizer.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
import json
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import media_optimizer as mo
|
||||
|
||||
|
||||
class TestEnvLoading:
    """Test environment variable loading."""

    @patch('media_optimizer.load_dotenv')
    @patch('pathlib.Path.exists')
    def test_load_env_files_success(self, mock_exists, mock_load_dotenv):
        """Test successful .env file loading."""
        # Every candidate .env path "exists", so load_dotenv should fire.
        mock_exists.return_value = True
        mo.load_env_files()
        # Should be called for skill, skills, and claude dirs
        assert mock_load_dotenv.call_count >= 1

    @patch('media_optimizer.load_dotenv', None)
    def test_load_env_files_no_dotenv(self):
        """Test when dotenv is not available (load_dotenv patched to None)."""
        # Should not raise an error
        mo.load_env_files()
|
||||
|
||||
|
||||
class TestFFmpegCheck:
    """Test ffmpeg availability checking."""

    @patch('subprocess.run')
    def test_ffmpeg_installed(self, mock_run):
        """Test when ffmpeg is installed: subprocess.run succeeds."""
        mock_run.return_value = Mock()
        assert mo.check_ffmpeg() is True

    @patch('subprocess.run')
    def test_ffmpeg_not_installed(self, mock_run):
        """Test when ffmpeg is not installed: binary missing on PATH."""
        mock_run.side_effect = FileNotFoundError()
        assert mo.check_ffmpeg() is False

    @patch('subprocess.run')
    def test_ffmpeg_error(self, mock_run):
        """Test ffmpeg command error: any exception maps to False."""
        mock_run.side_effect = Exception("Error")
        assert mo.check_ffmpeg() is False
|
||||
|
||||
|
||||
class TestMediaInfo:
    """Test media information extraction (ffprobe JSON parsing)."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_video_info(self, mock_run, mock_check):
        """Test extracting video information from ffprobe-style JSON."""
        mock_check.return_value = True

        # Simulated ffprobe output: format section plus one video and one
        # audio stream. Note the numeric fields arrive as strings.
        mock_result = Mock()
        mock_result.stdout = json.dumps({
            'format': {
                'size': '10485760',
                'duration': '120.5',
                'bit_rate': '691200'
            },
            'streams': [
                {
                    'codec_type': 'video',
                    'width': 1920,
                    'height': 1080,
                    'r_frame_rate': '30/1'
                },
                {
                    'codec_type': 'audio',
                    'sample_rate': '48000',
                    'channels': 2
                }
            ]
        })
        mock_run.return_value = mock_result

        info = mo.get_media_info('test.mp4')

        # String fields from ffprobe must be converted to numbers.
        assert info['size'] == 10485760
        assert info['duration'] == 120.5
        assert info['width'] == 1920
        assert info['height'] == 1080
        assert info['sample_rate'] == 48000

    @patch('media_optimizer.check_ffmpeg')
    def test_get_media_info_no_ffmpeg(self, mock_check):
        """Test when ffmpeg is not available: empty dict, no crash."""
        mock_check.return_value = False
        info = mo.get_media_info('test.mp4')
        assert info == {}

    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_media_info_error(self, mock_run, mock_check):
        """Test error handling in media info extraction."""
        mock_check.return_value = True
        mock_run.side_effect = Exception("Error")

        info = mo.get_media_info('test.mp4')
        assert info == {}
|
||||
|
||||
|
||||
class TestVideoOptimization:
    """Test video optimization functionality.

    get_media_info is mocked with side_effect pairs: the first dict is the
    input file's info, the second the (smaller) output file's info.
    """

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_success(self, mock_run, mock_info, mock_check):
        """Test successful video optimization."""
        mock_check.return_value = True
        mock_info.side_effect = [
            # Input info
            {
                'size': 50 * 1024 * 1024,
                'duration': 120.0,
                'bit_rate': 3500000,
                'width': 1920,
                'height': 1080
            },
            # Output info
            {
                'size': 25 * 1024 * 1024,
                'duration': 120.0,
                'width': 1920,
                'height': 1080
            }
        ]

        result = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            quality=23,
            verbose=False
        )

        assert result is True
        # Exactly one ffmpeg invocation for the transcode.
        mock_run.assert_called_once()

    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_video_no_ffmpeg(self, mock_check):
        """Test video optimization without ffmpeg: fail fast with False."""
        mock_check.return_value = False

        result = mo.optimize_video('input.mp4', 'output.mp4')
        assert result is False

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_optimize_video_no_info(self, mock_info, mock_check):
        """Test video optimization when info cannot be read."""
        mock_check.return_value = True
        mock_info.return_value = {}

        result = mo.optimize_video('input.mp4', 'output.mp4')
        assert result is False

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_target_size(self, mock_run, mock_info, mock_check):
        """Test video optimization with target size (bitrate computed from duration)."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 100 * 1024 * 1024, 'duration': 60.0, 'bit_rate': 3500000},
            {'size': 50 * 1024 * 1024, 'duration': 60.0}
        ]

        result = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            target_size_mb=50,
            verbose=False
        )

        assert result is True

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_resolution(self, mock_run, mock_info, mock_check):
        """Test video optimization with custom resolution."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 50 * 1024 * 1024, 'duration': 120.0, 'bit_rate': 3500000},
            {'size': 25 * 1024 * 1024, 'duration': 120.0}
        ]

        result = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            resolution='1280x720',
            verbose=False
        )

        assert result is True
|
||||
|
||||
|
||||
class TestAudioOptimization:
    """Test audio optimization functionality."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_audio_success(self, mock_run, mock_info, mock_check):
        """Test successful audio optimization."""
        mock_check.return_value = True
        # First dict: input file info; second: output file info.
        mock_info.side_effect = [
            {'size': 10 * 1024 * 1024, 'duration': 300.0},
            {'size': 5 * 1024 * 1024, 'duration': 300.0}
        ]

        result = mo.optimize_audio(
            'input.mp3',
            'output.m4a',
            bitrate='64k',
            verbose=False
        )

        assert result is True
        mock_run.assert_called_once()

    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_audio_no_ffmpeg(self, mock_check):
        """Test audio optimization without ffmpeg: returns False."""
        mock_check.return_value = False

        result = mo.optimize_audio('input.mp3', 'output.m4a')
        assert result is False
|
||||
|
||||
|
||||
class TestImageOptimization:
    """Test image optimization functionality (Pillow-based)."""

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_success(self, mock_stat, mock_image_open):
        """Test successful image optimization."""
        # Mock image: 4K input that will be resized, yielding mock_resized.
        mock_resized = Mock()
        mock_resized.mode = 'RGB'

        mock_img = Mock()
        mock_img.width = 3840
        mock_img.height = 2160
        mock_img.mode = 'RGB'
        mock_img.resize.return_value = mock_resized
        mock_image_open.return_value = mock_img

        # Mock file sizes
        mock_stat.return_value.st_size = 5 * 1024 * 1024

        result = mo.optimize_image(
            'input.jpg',
            'output.jpg',
            max_width=1920,
            quality=85,
            verbose=False
        )

        assert result is True
        # Since image is resized, save is called on the resized image
        mock_resized.save.assert_called_once()

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_resize(self, mock_stat, mock_image_open):
        """Test image resizing during optimization (width > max_width)."""
        mock_img = Mock()
        mock_img.width = 3840
        mock_img.height = 2160
        mock_img.mode = 'RGB'
        mock_resized = Mock()
        mock_img.resize.return_value = mock_resized
        mock_image_open.return_value = mock_img

        mock_stat.return_value.st_size = 5 * 1024 * 1024

        mo.optimize_image('input.jpg', 'output.jpg', max_width=1920, verbose=False)

        mock_img.resize.assert_called_once()

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_rgba_to_jpg(self, mock_stat, mock_image_open):
        """Test converting RGBA to RGB for JPEG output."""
        mock_img = Mock()
        mock_img.width = 1920
        mock_img.height = 1080
        mock_img.mode = 'RGBA'
        # split() yields the R, G, B, A channel images.
        mock_img.split.return_value = [Mock(), Mock(), Mock(), Mock()]
        mock_image_open.return_value = mock_img

        mock_stat.return_value.st_size = 1024 * 1024

        with patch('PIL.Image.new') as mock_new:
            mock_rgb = Mock()
            mock_new.return_value = mock_rgb

            mo.optimize_image('input.png', 'output.jpg', verbose=False)

            # A new RGB canvas is created to flatten the alpha channel.
            mock_new.assert_called_once()

    def test_optimize_image_no_pillow(self):
        """Test image optimization without Pillow."""
        # Setting sys.modules['PIL'] to None makes `import PIL` raise.
        with patch.dict('sys.modules', {'PIL': None}):
            result = mo.optimize_image('input.jpg', 'output.jpg')
            # Will fail to import but function handles it
            assert result is False
|
||||
|
||||
|
||||
class TestVideoSplitting:
    """Test video splitting functionality."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    @patch('pathlib.Path.mkdir')
    def test_split_video_success(self, mock_mkdir, mock_run, mock_info, mock_check):
        """Test successful video splitting into fixed-duration chunks."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 7200.0}  # 2 hours

        result = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1 hour chunks
            verbose=False
        )

        # Duration 7200s / 3600s = 2, +1 for safety = 3 chunks
        assert len(result) == 3
        # One ffmpeg invocation per chunk.
        assert mock_run.call_count == 3

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_split_video_short_duration(self, mock_info, mock_check):
        """Test splitting video shorter than chunk duration: no split, original returned."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 1800.0}  # 30 minutes

        result = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1 hour
            verbose=False
        )

        assert result == ['input.mp4']

    @patch('media_optimizer.check_ffmpeg')
    def test_split_video_no_ffmpeg(self, mock_check):
        """Test video splitting without ffmpeg: empty list returned."""
        mock_check.return_value = False

        result = mo.split_video('input.mp4', './chunks')
        assert result == []
|
||||
|
||||
|
||||
# Allow running this test module directly, with coverage reporting enabled.
if __name__ == '__main__':
    pytest.main([__file__, '-v', '--cov=media_optimizer', '--cov-report=term-missing'])
|
||||
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Tests for minimax_api_client.py - HTTP utilities, auth, polling, downloads.
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import minimax_api_client as mac
|
||||
|
||||
|
||||
class TestFindMinimaxApiKey:
    """Test API key discovery (env var vs. centralized resolver)."""

    def test_find_key_from_env(self, monkeypatch):
        # With the centralized resolver disabled, the plain env var wins.
        monkeypatch.setenv('MINIMAX_API_KEY', 'test-minimax-key')
        with patch.object(mac, 'CENTRALIZED_RESOLVER_AVAILABLE', False):
            assert mac.find_minimax_api_key() == 'test-minimax-key'

    def test_find_key_not_found(self, monkeypatch):
        # No env var and no resolver -> None, not an exception.
        monkeypatch.delenv('MINIMAX_API_KEY', raising=False)
        with patch.object(mac, 'CENTRALIZED_RESOLVER_AVAILABLE', False):
            result = mac.find_minimax_api_key()
            assert result is None

    def test_find_key_via_centralized_resolver(self, monkeypatch):
        # create=True because resolve_env may not exist when the resolver
        # module is absent in this environment.
        mock_resolve = Mock(return_value='resolved-key')
        with patch.object(mac, 'CENTRALIZED_RESOLVER_AVAILABLE', True), \
             patch.object(mac, 'resolve_env', mock_resolve, create=True):
            result = mac.find_minimax_api_key()
            assert result == 'resolved-key'
            mock_resolve.assert_called_once_with(
                'MINIMAX_API_KEY', skill='ai-multimodal'
            )
|
||||
|
||||
|
||||
class TestGetHeaders:
    """Test header generation for MiniMax API requests."""

    def test_headers_contain_bearer_token(self):
        result = mac.get_headers('my-api-key')
        expected_auth = 'Bearer my-api-key'
        assert result['Authorization'] == expected_auth
        assert result['Content-Type'] == 'application/json'

    def test_headers_with_different_key(self):
        auth_value = mac.get_headers('another-key-123')['Authorization']
        assert 'another-key-123' in auth_value
|
||||
|
||||
|
||||
class TestApiPost:
    """Test POST request handling (HTTP errors, MiniMax error codes, timeouts, verbose output)."""

    @patch('minimax_api_client.requests.post')
    def test_successful_post(self, mock_post):
        # base_resp.status_code == 0 is MiniMax's "success" marker.
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {
            "base_resp": {"status_code": 0},
            "data": {"result": "ok"}
        }
        mock_post.return_value = mock_resp

        result = mac.api_post("test_endpoint", {"key": "val"}, "api-key")
        assert result["data"]["result"] == "ok"
        mock_post.assert_called_once()

    @patch('minimax_api_client.requests.post')
    def test_http_error_raises(self, mock_post):
        # Non-2xx transport status must raise, surfacing the HTTP code.
        mock_resp = Mock()
        mock_resp.status_code = 401
        mock_resp.text = "Unauthorized"
        mock_post.return_value = mock_resp

        with pytest.raises(Exception, match="HTTP 401"):
            mac.api_post("endpoint", {}, "bad-key")

    @patch('minimax_api_client.requests.post')
    def test_minimax_error_code_raises(self, mock_post):
        # HTTP 200 but a non-zero base_resp status is still an API error.
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {
            "base_resp": {"status_code": 1002, "status_msg": "Rate limit"}
        }
        mock_post.return_value = mock_resp

        with pytest.raises(Exception, match="code 1002.*Rate limit"):
            mac.api_post("endpoint", {}, "api-key")

    @patch('minimax_api_client.requests.post')
    def test_custom_timeout(self, mock_post):
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"base_resp": {"status_code": 0}}
        mock_post.return_value = mock_resp

        mac.api_post("endpoint", {}, "key", timeout=300)
        # The timeout kwarg must be forwarded to requests.post verbatim.
        _, kwargs = mock_post.call_args
        assert kwargs['timeout'] == 300

    @patch('minimax_api_client.requests.post')
    def test_default_timeout_is_120(self, mock_post):
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"base_resp": {"status_code": 0}}
        mock_post.return_value = mock_resp

        mac.api_post("endpoint", {}, "key")
        _, kwargs = mock_post.call_args
        assert kwargs['timeout'] == 120

    @patch('minimax_api_client.requests.post')
    def test_verbose_prints_url(self, mock_post, capsys):
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"base_resp": {"status_code": 0}}
        mock_post.return_value = mock_resp

        mac.api_post("image_generation", {}, "key", verbose=True)
        # Verbose diagnostics go to stderr, not stdout.
        captured = capsys.readouterr()
        assert "image_generation" in captured.err
|
||||
|
||||
|
||||
class TestApiGet:
    """Test GET request handling."""

    @patch('minimax_api_client.requests.get')
    def test_successful_get(self, mock_get):
        mock_resp = Mock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"status": "Success", "file_id": "abc"}
        mock_get.return_value = mock_resp

        # Query params are passed as the second positional argument.
        result = mac.api_get("query/video_generation", {"task_id": "t1"}, "key")
        assert result["status"] == "Success"

    @patch('minimax_api_client.requests.get')
    def test_get_http_error(self, mock_get):
        # Server-side failures must raise with the HTTP status in the message.
        mock_resp = Mock()
        mock_resp.status_code = 500
        mock_resp.text = "Server Error"
        mock_get.return_value = mock_resp

        with pytest.raises(Exception, match="HTTP 500"):
            mac.api_get("endpoint", {}, "key")
|
||||
|
||||
|
||||
class TestPollAsyncTask:
    """Test async task polling (success, retry loop, failure, timeout).

    time.sleep is patched everywhere so the polling loop runs instantly.
    """

    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_success_first_try(self, mock_get, mock_sleep):
        mock_get.return_value = {"status": "Success", "file_id": "f123"}

        result = mac.poll_async_task("task1", "video_generation", "key")
        assert result["file_id"] == "f123"
        # Immediate success means no waiting at all.
        mock_sleep.assert_not_called()

    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_success_after_processing(self, mock_get, mock_sleep):
        # Two Processing responses before the final Success.
        mock_get.side_effect = [
            {"status": "Processing"},
            {"status": "Processing"},
            {"status": "Success", "file_id": "f456"}
        ]

        result = mac.poll_async_task("task2", "video_generation", "key",
                                     poll_interval=1)
        assert result["file_id"] == "f456"
        # One sleep per Processing response.
        assert mock_sleep.call_count == 2

    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_task_failed(self, mock_get, mock_sleep):
        mock_get.return_value = {"status": "Failed", "error": "bad input"}

        with pytest.raises(Exception, match="Task failed"):
            mac.poll_async_task("task3", "video_generation", "key")

    @patch('minimax_api_client.time.sleep')
    @patch('minimax_api_client.api_get')
    def test_poll_timeout(self, mock_get, mock_sleep):
        # Task never leaves Processing; max_wait=3 with interval=1 forces
        # the timeout path quickly.
        mock_get.return_value = {"status": "Processing"}

        with pytest.raises(TimeoutError, match="timed out"):
            mac.poll_async_task("task4", "video_generation", "key",
                                poll_interval=1, max_wait=3)
|
||||
|
||||
|
||||
class TestDownloadFile:
    """Test file download (retrieve URL via API, then stream to disk)."""

    @patch('minimax_api_client.requests.get')
    @patch('minimax_api_client.api_get')
    def test_download_success(self, mock_api_get, mock_req_get, tmp_path):
        # api_get resolves the file_id to a CDN download URL.
        mock_api_get.return_value = {
            "file": {"download_url": "https://cdn.minimax.io/video.mp4"}
        }
        # requests.get streams the content in chunks.
        mock_resp = Mock()
        mock_resp.raise_for_status = Mock()
        mock_resp.iter_content.return_value = [b"video_data"]
        mock_req_get.return_value = mock_resp

        output = str(tmp_path / "test.mp4")
        result = mac.download_file("file123", "key", output)
        # Returns the output path and actually writes the file.
        assert result == output
        assert Path(output).exists()

    @patch('minimax_api_client.api_get')
    def test_download_no_url_raises(self, mock_api_get):
        # Missing download_url in the API response must raise, not write.
        mock_api_get.return_value = {"file": {}}

        with pytest.raises(Exception, match="No download URL"):
            mac.download_file("file123", "key", "/tmp/test.mp4")
|
||||
|
||||
|
||||
class TestGetOutputDir:
    """Test output directory resolution.

    NOTE(review): these tests touch the real filesystem — get_output_dir
    presumably creates the directory as a side effect; confirm against the
    implementation.
    """

    def test_returns_path_object(self):
        result = mac.get_output_dir()
        assert isinstance(result, Path)

    def test_directory_exists(self):
        result = mac.get_output_dir()
        assert result.exists()
        assert result.is_dir()
|
||||
185
.opencode/skills/ai-multimodal/scripts/tests/test_minimax_cli.py
Normal file
185
.opencode/skills/ai-multimodal/scripts/tests/test_minimax_cli.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""
|
||||
Tests for minimax_cli.py - CLI argument parsing and task dispatch.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import minimax_cli as cli
|
||||
|
||||
|
||||
class TestTaskDefaults:
    """Test task-to-model default mapping."""

    def test_generate_defaults_to_image_01(self):
        default_model = cli.TASK_DEFAULTS['generate']
        assert default_model == 'image-01'

    def test_generate_video_defaults_to_hailuo(self):
        default_model = cli.TASK_DEFAULTS['generate-video']
        assert default_model == 'MiniMax-Hailuo-2.3'

    def test_generate_speech_defaults_to_speech_28_hd(self):
        default_model = cli.TASK_DEFAULTS['generate-speech']
        assert default_model == 'speech-2.8-hd'

    def test_generate_music_defaults_to_music_25(self):
        default_model = cli.TASK_DEFAULTS['generate-music']
        assert default_model == 'music-2.5'
|
||||
|
||||
|
||||
class TestPrintResult:
    """Test result formatting printed by the CLI for each task type."""

    def test_success_image(self, capsys):
        result = {
            "status": "success",
            "generated_images": ["/path/to/img.png"],
            "model": "image-01"
        }
        cli.print_result(result, "generate")
        output = capsys.readouterr().out
        assert "success" in output.lower()
        assert "/path/to/img.png" in output
        assert "image-01" in output

    def test_success_video(self, capsys):
        result = {
            "status": "success",
            "generated_video": "/path/to/vid.mp4",
            "generation_time": 45.2,
            "model": "MiniMax-Hailuo-2.3"
        }
        cli.print_result(result, "generate-video")
        output = capsys.readouterr().out
        assert "/path/to/vid.mp4" in output
        # Generation time is rendered with an "s" suffix.
        assert "45.2s" in output

    def test_success_audio(self, capsys):
        result = {
            "status": "success",
            "generated_audio": "/path/to/audio.mp3",
            "duration_ms": 140000,
            "model": "music-2.5"
        }
        cli.print_result(result, "generate-music")
        output = capsys.readouterr().out
        assert "/path/to/audio.mp3" in output
        # duration_ms is converted to seconds for display (140000 -> 140.0s).
        assert "140.0s" in output

    def test_error_result(self, capsys):
        result = {"status": "error", "error": "Rate limit exceeded"}
        cli.print_result(result, "generate")
        output = capsys.readouterr().out
        assert "Rate limit exceeded" in output

    def test_unknown_status(self, capsys):
        # Missing "status" key falls back to an "unknown" label.
        result = {"model": "image-01"}
        cli.print_result(result, "generate")
        output = capsys.readouterr().out
        assert "unknown" in output.lower()
|
||||
|
||||
|
||||
class TestMainCLI:
|
||||
"""Test CLI main() argument parsing and dispatch."""
|
||||
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value=None)
def test_no_api_key_exits(self, mock_key, capsys):
    """Without an API key the CLI must exit with status 1."""
    with patch('sys.argv', ['cli', '--task', 'generate', '--prompt', 'x']):
        with pytest.raises(SystemExit) as exc_info:
            cli.main()
        assert exc_info.value.code == 1
|
||||
|
||||
@patch('minimax_cli.generate_image')
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_generate_image_dispatch(self, mock_key, mock_gen):
|
||||
mock_gen.return_value = {"status": "success", "generated_images": [],
|
||||
"model": "image-01"}
|
||||
with patch('sys.argv', ['cli', '--task', 'generate',
|
||||
'--prompt', 'A cat']):
|
||||
cli.main()
|
||||
mock_gen.assert_called_once()
|
||||
args = mock_gen.call_args
|
||||
assert args[0][0] == 'test-key'
|
||||
assert args[0][1] == 'A cat'
|
||||
|
||||
@patch('minimax_cli.generate_speech')
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_generate_speech_dispatch(self, mock_key, mock_gen):
|
||||
mock_gen.return_value = {"status": "success",
|
||||
"generated_audio": "/x.mp3",
|
||||
"model": "speech-2.8-hd"}
|
||||
with patch('sys.argv', ['cli', '--task', 'generate-speech',
|
||||
'--text', 'Hello world']):
|
||||
cli.main()
|
||||
mock_gen.assert_called_once()
|
||||
|
||||
@patch('minimax_cli.generate_speech')
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_speech_uses_text_or_prompt(self, mock_key, mock_gen):
|
||||
mock_gen.return_value = {"status": "success",
|
||||
"generated_audio": "/x.mp3",
|
||||
"model": "speech-2.8-hd"}
|
||||
# --prompt should work as fallback for --text
|
||||
with patch('sys.argv', ['cli', '--task', 'generate-speech',
|
||||
'--prompt', 'Fallback text']):
|
||||
cli.main()
|
||||
call_args = mock_gen.call_args
|
||||
assert call_args[0][1] == 'Fallback text'
|
||||
|
||||
@patch('minimax_cli.generate_music')
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_generate_music_dispatch(self, mock_key, mock_gen):
|
||||
mock_gen.return_value = {"status": "success",
|
||||
"generated_audio": "/x.mp3",
|
||||
"duration_ms": 60000,
|
||||
"model": "music-2.5"}
|
||||
with patch('sys.argv', ['cli', '--task', 'generate-music',
|
||||
'--lyrics', 'La la la']):
|
||||
cli.main()
|
||||
mock_gen.assert_called_once()
|
||||
|
||||
@patch('minimax_cli.generate_video')
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_generate_video_dispatch(self, mock_key, mock_gen):
|
||||
mock_gen.return_value = {"status": "success",
|
||||
"generated_video": "/x.mp4",
|
||||
"generation_time": 30.0,
|
||||
"model": "MiniMax-Hailuo-2.3"}
|
||||
with patch('sys.argv', ['cli', '--task', 'generate-video',
|
||||
'--prompt', 'A dancer']):
|
||||
cli.main()
|
||||
mock_gen.assert_called_once()
|
||||
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_auto_model_detection(self, mock_key):
|
||||
with patch('sys.argv', ['cli', '--task', 'generate-speech',
|
||||
'--text', 'hi']):
|
||||
with patch('minimax_cli.generate_speech') as mock_gen:
|
||||
mock_gen.return_value = {"status": "success",
|
||||
"generated_audio": "/x.mp3",
|
||||
"model": "speech-2.8-hd"}
|
||||
cli.main()
|
||||
# Model should be auto-detected
|
||||
assert mock_gen.call_args[0][2] == 'speech-2.8-hd'
|
||||
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_explicit_model_override(self, mock_key):
|
||||
with patch('sys.argv', ['cli', '--task', 'generate-speech',
|
||||
'--text', 'hi', '--model', 'speech-2.8-turbo']):
|
||||
with patch('minimax_cli.generate_speech') as mock_gen:
|
||||
mock_gen.return_value = {"status": "success",
|
||||
"generated_audio": "/x.mp3",
|
||||
"model": "speech-2.8-turbo"}
|
||||
cli.main()
|
||||
assert mock_gen.call_args[0][2] == 'speech-2.8-turbo'
|
||||
|
||||
@patch('minimax_cli.generate_image')
|
||||
@patch('minimax_cli.find_minimax_api_key', return_value='test-key')
|
||||
def test_exception_exits_with_1(self, mock_key, mock_gen):
|
||||
mock_gen.side_effect = Exception("API timeout")
|
||||
with patch('sys.argv', ['cli', '--task', 'generate',
|
||||
'--prompt', 'test']):
|
||||
with pytest.raises(SystemExit) as exc_info:
|
||||
cli.main()
|
||||
assert exc_info.value.code == 1
|
||||
# --- new file: tests for minimax_generate.py (diff hunk @@ -0,0 +1,393 @@) ---
|
||||
"""
|
||||
Tests for minimax_generate.py - generation functions for image, video, speech, music.
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock, call
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import minimax_generate as mg
|
||||
|
||||
|
||||
class TestModelRegistries:
    """Verify the per-modality model registries and their combined union."""

    def test_image_models(self):
        for model_id in ('image-01', 'image-01-live'):
            assert model_id in mg.MINIMAX_IMAGE_MODELS

    def test_video_models(self):
        for model_id in ('MiniMax-Hailuo-2.3',
                         'MiniMax-Hailuo-2.3-Fast',
                         'S2V-01'):
            assert model_id in mg.MINIMAX_VIDEO_MODELS

    def test_speech_models(self):
        for model_id in ('speech-2.8-hd', 'speech-2.8-turbo'):
            assert model_id in mg.MINIMAX_SPEECH_MODELS

    def test_music_models(self):
        assert 'music-2.5' in mg.MINIMAX_MUSIC_MODELS

    def test_all_models_is_union(self):
        # ALL_MINIMAX_MODELS must be exactly the union of the four registries.
        combined = set().union(
            mg.MINIMAX_IMAGE_MODELS,
            mg.MINIMAX_VIDEO_MODELS,
            mg.MINIMAX_SPEECH_MODELS,
            mg.MINIMAX_MUSIC_MODELS,
        )
        assert mg.ALL_MINIMAX_MODELS == combined
|
||||
|
||||
|
||||
class TestIsMinimaxModel:
    """Verify model-name detection: exact IDs, vendor prefixes, foreign models."""

    def test_known_image_model(self):
        detected = mg.is_minimax_model('image-01')
        assert detected is True

    def test_known_video_model(self):
        detected = mg.is_minimax_model('MiniMax-Hailuo-2.3')
        assert detected is True

    def test_known_speech_model(self):
        detected = mg.is_minimax_model('speech-2.8-hd')
        assert detected is True

    def test_known_music_model(self):
        detected = mg.is_minimax_model('music-2.5')
        assert detected is True

    def test_prefix_minimax(self):
        # Unknown-but-prefixed names should still be recognized.
        detected = mg.is_minimax_model('MiniMax-Future-Model')
        assert detected is True

    def test_prefix_speech(self):
        detected = mg.is_minimax_model('speech-3.0-ultra')
        assert detected is True

    def test_prefix_s2v(self):
        detected = mg.is_minimax_model('S2V-02')
        assert detected is True

    def test_non_minimax_model(self):
        detected = mg.is_minimax_model('gemini-2.5-flash')
        assert detected is False

    def test_non_minimax_imagen(self):
        detected = mg.is_minimax_model('imagen-4.0-generate-001')
        assert detected is False
|
||||
|
||||
|
||||
class TestGenerateImage:
    """Test image generation.

    api_post is mocked to return the MiniMax image API envelope
    ({"data": {"image_urls": [...]}}); requests.get is mocked where the
    code is expected to download the returned URLs.  @patch decorators
    stack bottom-up: api_post is the first injected mock, get_output_dir
    the second.
    """

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success(self, mock_post, mock_dir, tmp_path):
        # Route output into pytest's tmp dir; API reports one image URL.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {
            "data": {"image_urls": ["https://cdn.minimax.io/img1.png"]}
        }

        with patch('requests.get') as mock_req_get:
            mock_resp = Mock()
            # PNG magic bytes stand in for real image content.
            mock_resp.content = b'\x89PNG\r\n\x1a\n'
            mock_resp.raise_for_status = Mock()
            mock_req_get.return_value = mock_resp

            result = mg.generate_image("key", "A cat", "image-01")

        assert result["status"] == "success"
        assert len(result["generated_images"]) == 1
        assert result["model"] == "image-01"

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_no_images_returns_error(self, mock_post, mock_dir, tmp_path):
        # An empty image_urls list must produce an error result, not a crash.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {"data": {"image_urls": []}}

        result = mg.generate_image("key", "A cat", "image-01")
        assert result["status"] == "error"

    @patch('minimax_generate.api_post')
    def test_payload_structure(self, mock_post):
        # Pin the exact request payload sent to the image endpoint.
        mock_post.return_value = {"data": {"image_urls": []}}

        mg.generate_image("key", "A dog", "image-01", "16:9", 3)

        # call_args[0][1] is the second positional argument: the payload dict.
        payload = mock_post.call_args[0][1]
        assert payload["model"] == "image-01"
        assert payload["prompt"] == "A dog"
        assert payload["aspect_ratio"] == "16:9"
        assert payload["n"] == 3
        assert payload["response_format"] == "url"
        assert payload["prompt_optimizer"] is True

    @patch('minimax_generate.api_post')
    def test_num_images_capped_at_9(self, mock_post):
        # Requests above the API maximum must be clamped to n=9.
        mock_post.return_value = {"data": {"image_urls": []}}

        mg.generate_image("key", "test", "image-01", num_images=15)

        payload = mock_post.call_args[0][1]
        assert payload["n"] == 9

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_output_copy(self, mock_post, mock_dir, tmp_path):
        # When an explicit output= path is given, the downloaded image must
        # be copied there.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {
            "data": {"image_urls": ["https://cdn.minimax.io/img.png"]}
        }

        with patch('requests.get') as mock_req_get:
            mock_resp = Mock()
            mock_resp.content = b'image_bytes'
            mock_resp.raise_for_status = Mock()
            mock_req_get.return_value = mock_resp

            output_path = str(tmp_path / "custom_output.png")
            # Return value intentionally unused: only the side effect
            # (file landing at output_path) is under test.
            result = mg.generate_image("key", "test", output=output_path)

        assert Path(output_path).exists()
|
||||
|
||||
|
||||
class TestGenerateVideo:
    """Test video generation (async workflow).

    The happy path is submit (api_post → task_id), poll (poll_async_task →
    file_id), then download (download_file).  @patch decorators stack
    bottom-up, so the mock order is api_post, get_output_dir,
    poll_async_task, download_file.
    """

    @patch('minimax_generate.download_file')
    @patch('minimax_generate.poll_async_task')
    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success(self, mock_post, mock_dir, mock_poll, mock_dl, tmp_path):
        mock_dir.return_value = tmp_path
        mock_post.return_value = {"task_id": "vid-task-123"}
        mock_poll.return_value = {"file_id": "file-456"}
        # Create a fake video file so stat() works
        # (write_bytes returns the byte count, so `or path` yields path).
        mock_dl.side_effect = lambda fid, key, path, v: (
            Path(path).write_bytes(b'fake_video') or path
        )

        result = mg.generate_video("key", "A dancer")

        assert result["status"] == "success"
        assert "generated_video" in result
        # Default model when none is specified.
        assert result["model"] == "MiniMax-Hailuo-2.3"
        mock_poll.assert_called_once()

    @patch('minimax_generate.api_post')
    def test_no_task_id_error(self, mock_post):
        # A submit response without task_id must yield an explanatory error.
        mock_post.return_value = {"error": "bad request"}

        result = mg.generate_video("key", "test")
        assert result["status"] == "error"
        assert "No task_id" in result["error"]

    @patch('minimax_generate.poll_async_task')
    @patch('minimax_generate.api_post')
    def test_no_file_id_error(self, mock_post, mock_poll):
        # Polling succeeding without a file_id must also yield an error.
        mock_post.return_value = {"task_id": "t1"}
        mock_poll.return_value = {"status": "Success"}

        result = mg.generate_video("key", "test")
        assert result["status"] == "error"
        assert "No file_id" in result["error"]

    @patch('minimax_generate.api_post')
    def test_payload_with_first_frame(self, mock_post):
        # task_id None short-circuits the workflow; only the payload matters.
        mock_post.return_value = {"task_id": None}

        mg.generate_video("key", "test", first_frame="https://img.url/frame.png")

        payload = mock_post.call_args[0][1]
        assert payload["first_frame_image"] == "https://img.url/frame.png"

    @patch('minimax_generate.api_post')
    def test_payload_duration_resolution(self, mock_post):
        mock_post.return_value = {"task_id": None}

        mg.generate_video("key", "test", duration=10, resolution="720P")

        payload = mock_post.call_args[0][1]
        assert payload["duration"] == 10
        assert payload["resolution"] == "720P"
|
||||
|
||||
|
||||
class TestGenerateSpeech:
    """Test speech/TTS generation.

    The t2a_v2 endpoint returns audio hex-encoded in data["audio"];
    generate_speech is expected to decode it and write the bytes to a file
    under get_output_dir().
    """

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success(self, mock_post, mock_dir, tmp_path):
        mock_dir.return_value = tmp_path
        # hex-encoded audio bytes
        mock_post.return_value = {
            "data": {"audio": "48656c6c6f"}  # "Hello" in hex
        }

        result = mg.generate_speech("key", "Hello world")

        assert result["status"] == "success"
        assert "generated_audio" in result
        # Default speech model.
        assert result["model"] == "speech-2.8-hd"
        # Verify file was written
        audio_path = Path(result["generated_audio"])
        assert audio_path.exists()
        assert audio_path.read_bytes() == bytes.fromhex("48656c6c6f")

    @patch('minimax_generate.api_post')
    def test_no_audio_returns_error(self, mock_post):
        # Missing "audio" in the response must produce an error result.
        mock_post.return_value = {"data": {}}

        result = mg.generate_speech("key", "test")
        assert result["status"] == "error"

    @patch('minimax_generate.api_post')
    def test_payload_structure(self, mock_post):
        # Pin the nested voice_setting/audio_setting payload shape.
        mock_post.return_value = {"data": {}}

        mg.generate_speech("key", "Test text", "speech-2.8-turbo",
                           voice="English_Warm_Bestie", emotion="happy",
                           output_format="wav", rate=1.5)

        payload = mock_post.call_args[0][1]
        assert payload["model"] == "speech-2.8-turbo"
        assert payload["text"] == "Test text"
        assert payload["stream"] is False
        # NOTE: the wire format stays "hex" even when the caller asks for
        # wav — output_format controls the written file, not the API field.
        assert payload["output_format"] == "hex"
        assert payload["voice_setting"]["voice_id"] == "English_Warm_Bestie"
        assert payload["voice_setting"]["speed"] == 1.5
        assert payload["audio_setting"]["format"] == "wav"
        assert payload["audio_setting"]["sample_rate"] == 32000

    @patch('minimax_generate.api_post')
    def test_text_truncated_at_10000(self, mock_post):
        # Over-long input text must be truncated to the 10000-char limit.
        mock_post.return_value = {"data": {}}
        long_text = "x" * 15000

        mg.generate_speech("key", long_text)

        payload = mock_post.call_args[0][1]
        assert len(payload["text"]) == 10000

    @patch('minimax_generate.api_post')
    def test_uses_t2a_v2_endpoint(self, mock_post):
        mock_post.return_value = {"data": {}}

        mg.generate_speech("key", "test")

        # First positional argument to api_post is the endpoint name.
        endpoint = mock_post.call_args[0][0]
        assert endpoint == "t2a_v2"

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_wav_extension(self, mock_post, mock_dir, tmp_path):
        # output_format="wav" must be reflected in the file extension.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {"data": {"audio": "aabb"}}

        result = mg.generate_speech("key", "test", output_format="wav")
        assert result["generated_audio"].endswith(".wav")

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_pcm_defaults_to_mp3_ext(self, mock_post, mock_dir, tmp_path):
        # Formats without a dedicated extension fall back to .mp3.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {"data": {"audio": "aabb"}}

        result = mg.generate_speech("key", "test", output_format="pcm")
        assert result["generated_audio"].endswith(".mp3")
|
||||
|
||||
|
||||
class TestGenerateMusic:
    """Test music generation.

    The music endpoint may return data["audio"] either as a download URL
    or as hex-encoded bytes; both branches are covered.  Duration comes
    from extra_info["music_duration"] in milliseconds.
    """

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success_with_url(self, mock_post, mock_dir, tmp_path):
        # URL branch: the audio must be downloaded via requests.get.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {
            "data": {"audio": "https://cdn.minimax.io/music.mp3"},
            "extra_info": {"music_duration": 120000}
        }

        with patch('requests.get') as mock_req_get:
            mock_resp = Mock()
            mock_resp.content = b'music_data'
            mock_resp.raise_for_status = Mock()
            mock_req_get.return_value = mock_resp

            result = mg.generate_music("key", lyrics="La la la",
                                       prompt="pop")

        assert result["status"] == "success"
        assert result["duration_ms"] == 120000
        assert result["model"] == "music-2.5"

    @patch('minimax_generate.get_output_dir')
    @patch('minimax_generate.api_post')
    def test_success_with_hex(self, mock_post, mock_dir, tmp_path):
        # Hex branch: bytes are decoded and written directly, no download.
        mock_dir.return_value = tmp_path
        mock_post.return_value = {
            "data": {"audio": "deadbeef"},
            "extra_info": {"music_duration": 60000}
        }

        result = mg.generate_music("key", lyrics="test")

        assert result["status"] == "success"
        audio_path = Path(result["generated_audio"])
        assert audio_path.read_bytes() == bytes.fromhex("deadbeef")

    @patch('minimax_generate.api_post')
    def test_no_audio_returns_error(self, mock_post):
        mock_post.return_value = {"data": {}, "extra_info": {}}

        result = mg.generate_music("key", lyrics="test")
        assert result["status"] == "error"

    @patch('minimax_generate.api_post')
    def test_payload_structure(self, mock_post):
        # Pin the music request payload shape and audio settings.
        mock_post.return_value = {"data": {}, "extra_info": {}}

        mg.generate_music("key", lyrics="Verse 1\nHello",
                          prompt="upbeat pop", model="music-2.5",
                          output_format="wav")

        payload = mock_post.call_args[0][1]
        assert payload["model"] == "music-2.5"
        assert payload["lyrics"] == "Verse 1\nHello"
        assert payload["prompt"] == "upbeat pop"
        # NOTE: the API-side output_format stays "url" regardless of the
        # caller's output_format, which only shapes the local file.
        assert payload["output_format"] == "url"
        assert payload["audio_setting"]["format"] == "wav"
        assert payload["audio_setting"]["sample_rate"] == 44100

    @patch('minimax_generate.api_post')
    def test_lyrics_truncated_at_3500(self, mock_post):
        # Lyrics over the API limit must be truncated to 3500 chars.
        mock_post.return_value = {"data": {}, "extra_info": {}}

        mg.generate_music("key", lyrics="x" * 5000)

        payload = mock_post.call_args[0][1]
        assert len(payload["lyrics"]) == 3500

    @patch('minimax_generate.api_post')
    def test_prompt_truncated_at_2000(self, mock_post):
        # Prompts over the API limit must be truncated to 2000 chars.
        mock_post.return_value = {"data": {}, "extra_info": {}}

        mg.generate_music("key", prompt="y" * 3000)

        payload = mock_post.call_args[0][1]
        assert len(payload["prompt"]) == 2000

    @patch('minimax_generate.api_post')
    def test_uses_300s_timeout(self, mock_post):
        mock_post.return_value = {"data": {}, "extra_info": {}}

        mg.generate_music("key", lyrics="test")

        # Check timeout kwarg passed to api_post
        _, kwargs = mock_post.call_args
        assert kwargs.get('timeout') == 300

    @patch('minimax_generate.api_post')
    def test_empty_lyrics_omitted(self, mock_post):
        # An empty lyrics string must be dropped from the payload entirely.
        mock_post.return_value = {"data": {}, "extra_info": {}}

        mg.generate_music("key", lyrics="", prompt="jazz")

        payload = mock_post.call_args[0][1]
        assert "lyrics" not in payload
        assert payload["prompt"] == "jazz"
|
||||
# --- end of diff dump (web-UI footer "Reference in New Issue / Block a user" removed) ---