#!/usr/bin/env python3
"""
Extract Writing Styles from assets/writing-styles/ directory.

Supports multiple file types:
- Text: .md, .txt
- Documents: .pdf, .docx, .xlsx, .pptx (via document_converter.py)
- Media: .jpg, .jpeg, .png, .webp, .heic, .mp4, .mov, .avi, .mkv
  (via gemini_batch_process.py)

Usage:
    python extract-writing-styles.py --list              # List available style files
    python extract-writing-styles.py --style <name>      # Extract specific style
    python extract-writing-styles.py --all               # Extract all styles
    python extract-writing-styles.py --all --json        # Output as JSON
"""

import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

# File type categories
TEXT_FORMATS = {'.md', '.txt'}
DOC_FORMATS = {'.pdf', '.docx', '.xlsx', '.pptx'}
IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.webp', '.heic'}
VIDEO_FORMATS = {'.mp4', '.mov', '.avi', '.mkv'}
ALL_FORMATS = TEXT_FORMATS | DOC_FORMATS | IMAGE_FORMATS | VIDEO_FORMATS

# Prefix of the scratch file the document converter writes into STYLES_DIR.
TEMP_PREFIX = '.temp_'

# Seconds to wait for an external extraction script before giving up.
SUBPROCESS_TIMEOUT = 300

# Maximum number of style entries printed in text mode.
MAX_DISPLAYED_STYLES = 30

# Heading patterns. `[ \t]+` (not `\s+`) keeps a match on a single line, so an
# empty "#" line cannot pick up the following paragraph as its title.
_H1_RE = re.compile(r'^#[ \t]+(.+)$', re.MULTILINE)
_H2_RE = re.compile(r'^##[ \t]+(.+)$', re.MULTILINE)


def find_project_root(start_dir: Path) -> Path:
    """Find project root by looking for a .claude directory.

    Walks from ``start_dir`` up through its parents and returns the first
    directory that contains ``.claude``. Falls back to ``start_dir`` itself
    when no ancestor contains it.
    """
    for candidate in (start_dir, *start_dir.parents):
        if (candidate / '.claude').exists():
            return candidate
    return start_dir


# Resolve first: a relative __file__ would have no parents to search.
PROJECT_ROOT = find_project_root(Path(__file__).resolve().parent)
STYLES_DIR = PROJECT_ROOT / 'assets' / 'writing-styles'
AI_MULTIMODAL_SCRIPTS = PROJECT_ROOT / '.claude' / 'skills' / 'ai-multimodal' / 'scripts'


def get_style_files() -> Dict[str, Any]:
    """List all style files in the writing-styles directory.

    Returns:
        ``{'files': [...], 'directory': str}`` on success, where each file
        entry has ``name`` (stem), ``path``, ``type`` and ``size`` (bytes) and
        the list is sorted by name; or ``{'error': str, 'files': []}`` when
        the directory does not exist.
    """
    if not STYLES_DIR.exists():
        return {'error': f'Directory not found: {STYLES_DIR}', 'files': []}

    files = []
    for entry in STYLES_DIR.iterdir():
        if not entry.is_file() or entry.suffix.lower() not in ALL_FORMATS:
            continue
        # A scratch file left behind by an interrupted document conversion
        # has a supported suffix but is not a style.
        if entry.name.startswith(TEMP_PREFIX):
            continue
        files.append({
            'name': entry.stem,
            'path': str(entry),
            'type': get_file_type(entry),
            'size': entry.stat().st_size,
        })

    files.sort(key=lambda item: item['name'])
    return {'files': files, 'directory': str(STYLES_DIR)}


def get_file_type(file_path: Path) -> str:
    """Categorize a file by its (case-insensitive) extension.

    Returns one of ``'text'``, ``'document'``, ``'image'``, ``'video'`` or
    ``'unknown'``.
    """
    ext = file_path.suffix.lower()
    categories = (
        ('text', TEXT_FORMATS),
        ('document', DOC_FORMATS),
        ('image', IMAGE_FORMATS),
        ('video', VIDEO_FORMATS),
    )
    for label, extensions in categories:
        if ext in extensions:
            return label
    return 'unknown'


def extract_text_content(file_path: Path) -> str:
    """Extract content from text files (.md, .txt).

    Returns the file's UTF-8 text, or an ``'Error reading file: ...'`` string
    when the file cannot be read or decoded.
    """
    try:
        return file_path.read_text(encoding='utf-8')
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        return f'Error reading file: {e}'


def _remove_quietly(path: Path) -> None:
    """Delete ``path`` if present; cleanup is best-effort and never raises."""
    try:
        path.unlink()
    except OSError:
        # Missing file or a permissions problem: nothing useful to do here,
        # and raising from a `finally` would mask the real result.
        pass


def extract_document_content(file_path: Path, verbose: bool = False) -> str:
    """Extract content from documents using document_converter.py.

    Runs the converter as a subprocess that writes its result to a scratch
    markdown file inside STYLES_DIR, then returns that file's content.

    Args:
        file_path: Document to analyse.
        verbose: When true, ``--verbose`` is appended to the converter command.

    Returns:
        The converter's markdown output, or a string starting with
        ``'Error'`` / ``'Conversion failed:'`` describing what went wrong.
    """
    converter = AI_MULTIMODAL_SCRIPTS / 'document_converter.py'
    if not converter.exists():
        return f'Error: document_converter.py not found at {converter}'

    output_file = STYLES_DIR / f'{TEMP_PREFIX}{file_path.stem}_extraction.md'
    cmd = [
        sys.executable, str(converter),
        '--input', str(file_path),
        '--output', str(output_file),
        '--prompt', '''Extract the writing style characteristics from this document.
Identify: tone, vocabulary, sentence structure, rhetorical devices, formatting patterns.
Output as structured markdown with clear sections.'''
    ]
    if verbose:
        cmd.append('--verbose')

    try:
        # A leftover file from an earlier run must not be mistaken for the
        # output of this one if the converter fails.
        _remove_quietly(output_file)
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=SUBPROCESS_TIMEOUT
        )
        if output_file.exists():
            return output_file.read_text(encoding='utf-8')
        detail = (
            result.stderr.strip()
            or result.stdout.strip()
            or f'exit code {result.returncode}'
        )
        return f'Conversion failed: {detail}'
    except subprocess.TimeoutExpired:
        return 'Error: Document conversion timed out'
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        return f'Error: {e}'
    finally:
        # Runs on success, failure and timeout alike so no scratch file leaks.
        _remove_quietly(output_file)


def extract_media_content(file_path: Path, verbose: bool = False) -> str:
    """Extract writing style from media using gemini_batch_process.py.

    Args:
        file_path: Image or video to analyse.
        verbose: Accepted for signature parity with
            ``extract_document_content``; it is not forwarded to the
            processor command.

    Returns:
        The processor's stdout when non-empty, otherwise its stderr, or an
        ``'Error: ...'`` string when the script is missing, times out or
        cannot be launched.
    """
    processor = AI_MULTIMODAL_SCRIPTS / 'gemini_batch_process.py'
    if not processor.exists():
        return f'Error: gemini_batch_process.py not found at {processor}'

    prompt = '''Analyze this content and identify any writing style characteristics visible.
Look for: text overlays, captions, typography choices, messaging tone, branding voice.
Describe the writing style in terms of: tone, vocabulary level, sentence structure, key phrases.
Output as structured analysis.'''
    cmd = [
        sys.executable, str(processor),
        '--files', str(file_path),
        '--task', 'analyze',
        '--prompt', prompt
    ]

    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=SUBPROCESS_TIMEOUT
        )
    except subprocess.TimeoutExpired:
        return 'Error: Media analysis timed out'
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        return f'Error: {e}'
    return result.stdout if result.stdout else result.stderr


def _parse_table_styles(content: str) -> List[Dict[str, str]]:
    """Collect style entries from pipe-delimited table rows in ``content``.

    A row is any line with at least four ``|`` characters (three cells); the
    row spans from the first to the last pipe on that line. Rows containing
    ``---`` (separator) or ``Style`` (header) are skipped. Cells keep their
    position, so an empty keywords cell does not shift the description left.
    """
    styles = []
    for line in content.split('\n'):
        if line.count('|') < 4:
            continue
        row = line[line.index('|'):line.rindex('|') + 1]
        if '---' in row or 'Style' in row:
            continue

        cells = [cell.strip() for cell in row[1:-1].split('|')]
        # Drop markdown emphasis markers around the name.
        name = re.sub(r'\*+', '', cells[0]).strip()
        if not name or not any(cells[1:]):
            continue

        styles.append({
            'name': name,
            'keywords': cells[1],
            'description': ' | '.join(cell for cell in cells[2:] if cell),
        })
    return styles


def extract_style_content(file_path: Path, verbose: bool = False) -> Dict[str, Any]:
    """Extract writing style content from any supported file type.

    Args:
        file_path: Style file to process.
        verbose: Passed through to the document/media extractors.

    Returns:
        ``{'error': str}`` when the file is missing or its extension is not
        supported. Otherwise a dict with ``file``, ``type``, ``title`` (first
        H1, or ``''``), ``sections`` (H2 titles with 1-based ``lineNumber``),
        ``styles`` (entries parsed from table rows) and ``rawContent`` (the
        extractor's full output, which may itself be an error message).
    """
    if not file_path.exists():
        return {'error': f'File not found: {file_path}'}

    file_type = get_file_type(file_path)
    if file_type == 'text':
        content = extract_text_content(file_path)
    elif file_type == 'document':
        content = extract_document_content(file_path, verbose)
    elif file_type in ('image', 'video'):
        content = extract_media_content(file_path, verbose)
    else:
        return {'error': f'Unsupported file type: {file_path.suffix}'}

    # Title comes from the first H1.
    title_match = _H1_RE.search(content)
    title = title_match.group(1).strip() if title_match else ''

    # Sections are the H2 headers; the line number is derived from the count
    # of newlines preceding the match.
    sections = [
        {
            'title': match.group(1).strip(),
            'lineNumber': content[:match.start()].count('\n') + 1,
        }
        for match in _H2_RE.finditer(content)
    ]

    return {
        'file': str(file_path),
        'type': file_type,
        'title': title,
        'sections': sections,
        'styles': _parse_table_styles(content),
        'rawContent': content,
    }


def _format_style(data: Dict[str, Any]) -> List[str]:
    """Render one extracted style (as returned by extract_style_content)."""
    lines = []
    title = data.get('title')
    styles = data.get('styles')
    sections = data.get('sections')

    if title:
        lines.append(f"# {title}\n")
    lines.append(f"**File Type:** {data.get('type', 'unknown')}\n")

    if styles:
        lines.append(f"\n## Extracted Styles ({len(styles)})\n")
        for style in styles[:MAX_DISPLAYED_STYLES]:
            lines.append(f"### {style['name']}")
            lines.append(f"**Keywords:** {style['keywords']}\n")

    if sections:
        lines.append('\n## Sections\n')
        for section in sections:
            lines.append(f"- {section['title']} (line {section['lineNumber']})")

    # Nothing structured was found (free-form analysis, or an extractor error
    # message): show the raw text rather than printing an empty report.
    raw = (data.get('rawContent') or '').strip()
    if raw and not (title or styles or sections):
        lines.append('\n## Content\n')
        lines.append(raw)

    return lines


def format_output(data: Dict[str, Any], as_json: bool = False) -> str:
    """Format output for display.

    Args:
        data: One of: a listing from ``get_style_files``; a single result from
            ``extract_style_content``; or the ``--all`` aggregate
            (``{'title': ..., 'files': [extracted, ...]}``).
        as_json: When true, return ``data`` serialized as indented JSON.

    Returns:
        The JSON or markdown text. A top-level ``'error'`` key is rendered as
        a single ``Error: ...`` line.
    """
    if as_json:
        return json.dumps(data, indent=2, ensure_ascii=False)

    if 'error' in data:
        return f"Error: {data['error']}"

    output = []

    if 'files' in data and 'directory' in data:
        # List mode
        output.append('# Available Writing Styles\n')
        output.append(f"Directory: {data['directory']}\n")

        if not data['files']:
            output.append('\nNo style files found. Add files to assets/writing-styles/')
        else:
            output.append('\n| Style | Type | Size |')
            output.append('|---|---|---|')
            for f in data['files']:
                size_kb = f['size'] / 1024
                output.append(f"| {f['name']} | {f['type']} | {size_kb:.1f}KB |")

    elif 'files' in data:
        # Aggregate (--all) mode: entries are extraction results, which carry
        # no 'size' and have no 'directory' alongside them.
        output.append(f"# {data.get('title') or 'All Writing Styles'}\n")
        if not data['files']:
            output.append('No style files found. Add files to assets/writing-styles/')
        for entry in data['files']:
            output.append(f"\n---\n\n**Style File:** {entry.get('name', 'unknown')}\n")
            if 'error' in entry:
                output.append(f"Error: {entry['error']}")
            else:
                output.extend(_format_style(entry))

    elif 'title' in data:
        # Single style extraction
        output.extend(_format_style(data))

    return '\n'.join(output)


def main():
    """Parse command-line arguments, run the requested mode, print the result."""
    parser = argparse.ArgumentParser(
        description='Extract writing styles from assets/writing-styles/ directory',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Supported formats:
  Text:      .md, .txt
  Documents: .pdf, .docx, .xlsx, .pptx (requires GEMINI_API_KEY)
  Images:    .jpg, .jpeg, .png, .webp (requires GEMINI_API_KEY)
  Videos:    .mp4, .mov (requires GEMINI_API_KEY)

Examples:
  python extract-writing-styles.py --list
  python extract-writing-styles.py --style default
  python extract-writing-styles.py --all --json
'''
    )
    parser.add_argument('--list', action='store_true', help='List available style files')
    parser.add_argument('--style', type=str, help='Extract specific style by name')
    parser.add_argument('--all', action='store_true', help='Extract all styles')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # --list wins over the other modes and is also the default.
    if args.list or not (args.style or args.all):
        result = get_style_files()
    else:
        style_files = get_style_files()
        if 'error' in style_files:
            result = style_files
        elif args.style:
            # Match on the stem, or on the full filename to disambiguate
            # files that share a stem (e.g. default.md and default.pdf).
            matching = [
                f for f in style_files['files']
                if args.style in (f['name'], Path(f['path']).name)
            ]
            if matching:
                result = extract_style_content(Path(matching[0]['path']), args.verbose)
            else:
                result = {'error': f"Style '{args.style}' not found"}
        else:
            result = {'title': 'All Writing Styles', 'files': []}
            for f in style_files['files']:
                extracted = extract_style_content(Path(f['path']), args.verbose)
                result['files'].append({'name': f['name'], **extracted})

    print(format_output(result, args.json))


if __name__ == '__main__':
    main()