Files
english/.opencode/skills/ai-multimodal/scripts/minimax_generate.py
2026-04-12 01:06:31 +07:00

279 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
MiniMax generation CLI - image, video, speech, and music generation.
Models:
- Image: image-01, image-01-live
- Video: MiniMax-Hailuo-2.3, MiniMax-Hailuo-2.3-Fast, MiniMax-Hailuo-02, S2V-01
- Speech: speech-2.8-hd, speech-2.8-turbo, speech-2.6-hd, speech-2.6-turbo, speech-02-hd, speech-02-turbo
- Music: music-2.5, music-2.0
Usage:
python minimax_generate.py --task generate --prompt "A cat in space" --model image-01
python minimax_generate.py --task generate-video --prompt "A dancer" --model MiniMax-Hailuo-2.3
python minimax_generate.py --task generate-speech --text "Hello world" --model speech-2.8-hd
python minimax_generate.py --task generate-music --lyrics "Verse 1..." --model music-2.5
"""
import argparse
import base64
import json
import shutil
import sys
import time
from pathlib import Path
from minimax_api_client import (
find_minimax_api_key, api_post, poll_async_task,
download_file, get_output_dir
)
# Known MiniMax model identifiers, grouped by generation modality.
MINIMAX_IMAGE_MODELS = {
    'image-01',
    'image-01-live',
}

MINIMAX_VIDEO_MODELS = {
    'MiniMax-Hailuo-2.3',
    'MiniMax-Hailuo-2.3-Fast',
    'MiniMax-Hailuo-02',
    'S2V-01',
}

MINIMAX_SPEECH_MODELS = {
    'speech-2.8-hd',
    'speech-2.8-turbo',
    'speech-2.6-hd',
    'speech-2.6-turbo',
    'speech-02-hd',
    'speech-02-turbo',
}

MINIMAX_MUSIC_MODELS = {
    'music-2.5',
    'music-2.0',
}

# Union of every per-modality registry above.
ALL_MINIMAX_MODELS = set().union(
    MINIMAX_IMAGE_MODELS,
    MINIMAX_VIDEO_MODELS,
    MINIMAX_SPEECH_MODELS,
    MINIMAX_MUSIC_MODELS,
)
def is_minimax_model(model: str) -> bool:
    """Return True when *model* names a MiniMax model.

    A model is recognised when it starts with one of the MiniMax family
    prefixes (``MiniMax-``, ``image-01``, ``speech-``, ``music-``, ``S2V-``)
    or is listed in ``ALL_MINIMAX_MODELS``.

    Args:
        model: Model identifier to classify.

    Returns:
        True for a recognised MiniMax model, False otherwise. Non-string
        values (for example ``None``) yield False instead of raising.
    """
    if not isinstance(model, str):
        return False
    # str.startswith accepts a tuple, so a single call tests every prefix.
    if model.startswith(('MiniMax-', 'image-01', 'speech-', 'music-', 'S2V-')):
        return True
    return model in ALL_MINIMAX_MODELS
def generate_image(api_key: str, prompt: str, model: str = 'image-01',
                   aspect_ratio: str = '1:1', num_images: int = 1,
                   output: str = None, verbose: bool = False) -> dict:
    """Generate images with a MiniMax image model and save them locally.

    Posts a request to the ``image_generation`` endpoint through
    ``api_post``, downloads every returned image URL into the directory
    given by ``get_output_dir()``, and optionally copies the first image to
    ``output``.

    Args:
        api_key: API key forwarded to ``api_post``.
        prompt: Text prompt describing the image.
        model: Image model name sent in the request.
        aspect_ratio: Aspect ratio string sent in the request.
        num_images: Requested number of images; clamped to the range 1-9
            before it is sent.
        output: Optional destination path; when given, the first saved
            image is copied there (parent directories are created).
        verbose: Print progress messages when True.

    Returns:
        ``{"status": "success", "generated_images": [...], "model": model}``
        on success, or ``{"status": "error", "error": ...}`` when the
        response contains no image URLs.

    Raises:
        requests.HTTPError: If downloading an image returns an error status.
    """
    # Imported up front so a missing dependency fails before the API call.
    import requests as req

    # Keep the count inside 1..9; the original only enforced the upper bound.
    count = max(1, min(num_images, 9))
    payload = {
        "model": model,
        "prompt": prompt,
        "aspect_ratio": aspect_ratio,
        "n": count,
        "response_format": "url",
        "prompt_optimizer": True
    }
    if verbose:
        print(f"Generating {count} image(s) with {model}...")

    result = api_post("image_generation", payload, api_key, verbose)

    # "data" may be present but null, so fall back to empty containers.
    data = result.get("data") or {}
    image_urls = data.get("image_urls") or []
    if not image_urls:
        return {"status": "error", "error": "No images in response"}

    output_dir = get_output_dir()
    # One timestamp per batch; the index keeps file names unique within it.
    ts = int(time.time())
    saved_files = []
    for i, url in enumerate(image_urls):
        fpath = output_dir / f"minimax_image_{ts}_{i}.png"
        resp = req.get(url, timeout=60)
        resp.raise_for_status()
        with open(fpath, 'wb') as f:
            f.write(resp.content)
        saved_files.append(str(fpath))
        if verbose:
            print(f" Saved: {fpath}")

    # Copy first image to output if specified
    if output and saved_files:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(saved_files[0], output)

    return {"status": "success", "generated_images": saved_files, "model": model}
def generate_video(api_key: str, prompt: str, model: str = 'MiniMax-Hailuo-2.3',
                   duration: int = 6, resolution: str = '1080P',
                   first_frame: str = None, output: str = None,
                   verbose: bool = False) -> dict:
    """Submit an asynchronous MiniMax video job and save the finished file.

    The request goes to the ``video_generation`` endpoint via ``api_post``;
    the returned task is polled with ``poll_async_task`` and the resulting
    file is fetched with ``download_file`` into ``get_output_dir()``.

    Args:
        api_key: API key forwarded to the client helpers.
        prompt: Text prompt for the video.
        model: Video model name sent in the request.
        duration: Value sent as ``duration`` in the request.
        resolution: Value sent as ``resolution`` in the request.
        first_frame: Optional value sent as ``first_frame_image``.
        output: Optional destination path the video is copied to.
        verbose: Print progress messages when True.

    Returns:
        A dict with ``status`` ``"success"`` plus ``generated_video``,
        ``generation_time``, ``file_size_mb`` and ``model``; or a dict with
        ``status`` ``"error"`` when no ``task_id`` or ``file_id`` comes back.
    """
    payload = dict(prompt=prompt, model=model,
                   duration=duration, resolution=resolution)
    if first_frame:
        payload["first_frame_image"] = first_frame

    if verbose:
        print(f"Submitting video generation with {model}...")
    result = api_post("video_generation", payload, api_key, verbose)

    task_id = result.get("task_id")
    if not task_id:
        return {"status": "error", "error": f"No task_id: {json.dumps(result)}"}
    if verbose:
        print(f" Task ID: {task_id}, polling...")

    # Timing covers polling and download, not the initial submission.
    start = time.time()
    poll_result = poll_async_task(
        task_id, "video_generation", api_key,
        poll_interval=10, verbose=verbose,
    )

    file_id = poll_result.get("file_id")
    if not file_id:
        return {"status": "error", "error": f"No file_id: {json.dumps(poll_result)}"}

    target_dir = get_output_dir()
    stamp = int(time.time())
    output_path = str(target_dir / f"minimax_video_{stamp}.mp4")
    download_file(file_id, api_key, output_path, verbose)

    elapsed = time.time() - start
    size_bytes = Path(output_path).stat().st_size
    file_size = size_bytes / (1024 * 1024)

    if output:
        destination = Path(output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)

    if verbose:
        print(f" Generated in {elapsed:.1f}s, size: {file_size:.2f} MB")

    return {
        "status": "success",
        "generated_video": output_path,
        "generation_time": elapsed,
        "file_size_mb": file_size,
        "model": model,
    }
def generate_speech(api_key: str, text: str, model: str = 'speech-2.8-hd',
                    voice: str = 'English_expressive_narrator',
                    emotion: str = 'neutral', output_format: str = 'mp3',
                    rate: float = 1.0, output: str = None,
                    verbose: bool = False) -> dict:
    """Generate speech through the MiniMax ``t2a_v2`` endpoint.

    The audio is requested hex-encoded, decoded, and written into the
    directory given by ``get_output_dir()``; it is optionally copied to
    ``output`` as well.

    Args:
        api_key: API key forwarded to ``api_post``.
        text: Text to synthesise; only the first 10000 characters are sent.
        model: Speech model name sent in the request.
        voice: Value sent as ``voice_setting.voice_id``.
        emotion: Forwarded as ``voice_setting.emotion`` when set to anything
            other than the default ``'neutral'``. The default is omitted so
            the default request stays as it was before this parameter was
            wired in.
        output_format: Value sent as ``audio_setting.format``. It is also
            the file extension when it is one of mp3, wav, flac or pcm;
            any other value is saved with ``.mp3``.
        rate: Value sent as ``voice_setting.speed``.
        output: Optional destination path the audio is copied to.
        verbose: Print progress messages when True.

    Returns:
        ``{"status": "success", "generated_audio": path, "model": model}``
        or ``{"status": "error", "error": ...}`` when the response has no
        audio.

    Raises:
        ValueError: If the returned audio is not valid hexadecimal.
    """
    voice_setting = {
        "voice_id": voice,
        "speed": rate,
        "vol": 1.0,
        "pitch": 0
    }
    # Previously this argument was accepted but silently dropped.
    if emotion and emotion != 'neutral':
        voice_setting["emotion"] = emotion

    payload = {
        "model": model,
        "text": text[:10000],
        "stream": False,
        "language_boost": "auto",
        "output_format": "hex",
        "voice_setting": voice_setting,
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": output_format,
            "channel": 1
        }
    }
    if verbose:
        print(f"Generating speech with {model}, voice: {voice}...")

    result = api_post("t2a_v2", payload, api_key, verbose)

    # "data" may be present but null, so fall back to an empty dict.
    audio_data = (result.get("data") or {}).get("audio")
    if not audio_data:
        return {"status": "error", "error": "No audio in response"}

    output_dir = get_output_dir()
    ts = int(time.time())
    # pcm is included so raw PCM output is not saved under a .mp3 name.
    ext = output_format if output_format in ('mp3', 'wav', 'flac', 'pcm') else 'mp3'
    output_path = str(output_dir / f"minimax_speech_{ts}.{ext}")

    # Audio returned as hex-encoded string from t2a_v2
    audio_bytes = bytes.fromhex(audio_data)
    with open(output_path, 'wb') as f:
        f.write(audio_bytes)

    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)

    if verbose:
        size_kb = len(audio_bytes) / 1024
        print(f" Saved: {output_path} ({size_kb:.1f} KB)")

    return {"status": "success", "generated_audio": output_path, "model": model}
def generate_music(api_key: str, lyrics: str = '', prompt: str = '',
                   model: str = 'music-2.5', output_format: str = 'mp3',
                   output: str = None, verbose: bool = False) -> dict:
    """Generate music through the MiniMax ``music_generation`` endpoint.

    The audio is requested as a URL. If the response holds a URL it is
    downloaded; otherwise the value is decoded as a hex string. The file is
    written into the directory given by ``get_output_dir()`` and optionally
    copied to ``output``.

    Args:
        api_key: API key forwarded to ``api_post``.
        lyrics: Lyrics text; only the first 3500 characters are sent.
        prompt: Style prompt; only the first 2000 characters are sent.
        model: Music model name sent in the request.
        output_format: Value sent as ``audio_setting.format`` and used as
            the saved file's extension.
        output: Optional destination path the audio is copied to.
        verbose: Print progress messages when True.

    Returns:
        A dict with ``status`` ``"success"`` plus ``generated_audio``,
        ``duration_ms`` and ``model``; or ``{"status": "error", ...}`` when
        neither lyrics nor prompt is given or the response has no audio.

    Raises:
        requests.HTTPError: If downloading the audio URL returns an error
            status.
        ValueError: If non-URL audio data is not valid hexadecimal.
    """
    # Without lyrics or a prompt the request carries nothing to generate
    # from, so fail before making the API call.
    if not lyrics and not prompt:
        return {"status": "error", "error": "Either lyrics or prompt is required"}

    payload = {
        "model": model,
        "output_format": "url",
        "audio_setting": {
            "sample_rate": 44100,
            "bitrate": 128000,
            "format": output_format
        }
    }
    if lyrics:
        payload["lyrics"] = lyrics[:3500]
    if prompt:
        payload["prompt"] = prompt[:2000]

    if verbose:
        print(f"Generating music with {model}...")

    result = api_post("music_generation", payload, api_key, verbose, timeout=300)

    # Either key may be present but null, so fall back to empty containers.
    audio_data = (result.get("data") or {}).get("audio")
    duration_ms = (result.get("extra_info") or {}).get("music_duration") or 0
    if not audio_data:
        return {"status": "error", "error": "No audio in response"}

    output_dir = get_output_dir()
    ts = int(time.time())
    output_path = str(output_dir / f"minimax_music_{ts}.{output_format}")

    # Download from URL or decode hex
    if audio_data.startswith("http"):
        import requests as req
        resp = req.get(audio_data, timeout=120)
        resp.raise_for_status()
        audio_bytes = resp.content
    else:
        audio_bytes = bytes.fromhex(audio_data)
    with open(output_path, 'wb') as f:
        f.write(audio_bytes)

    if output:
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(output_path, output)

    if verbose:
        dur_s = duration_ms / 1000 if duration_ms else 0
        print(f" Saved: {output_path} ({dur_s:.1f}s)")

    return {
        "status": "success", "generated_audio": output_path,
        "duration_ms": duration_ms, "model": model
    }