#!/usr/bin/env python3
"""Generate llms.txt from a docs directory following the llmstxt.org specification.

Usage:
    python3 generate-llms-txt.py --source <docs-dir> [--output <dir>] [--base-url <url>]
        [--full] [--project-name <name>] [--project-description <description>]

Examples:
    python3 generate-llms-txt.py --source ./docs --base-url https://example.com/docs
    python3 generate-llms-txt.py --source ./docs --output ./public --full --project-name "My Project"
"""

import argparse
import json
import re
import sys
from pathlib import Path


def extract_title(content: str, filepath: Path) -> str:
    """Extract the H1 title from markdown content, falling back to the filename."""
    match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
    if match:
        return match.group(1).strip()
    return filepath.stem.replace("-", " ").replace("_", " ").title()


def extract_description(content: str) -> str:
    """Extract the first meaningful paragraph after the H1 as a description."""
    lines = content.split("\n")
    found_h1 = False
    paragraph_lines = []
    for line in lines:
        stripped = line.strip()
        if not found_h1:
            if stripped.startswith("# "):
                found_h1 = True
            continue
        # Skip empty lines, frontmatter, and other headings
        if not stripped:
            if paragraph_lines:
                break
            continue
        if stripped.startswith("#") or stripped.startswith("---"):
            if paragraph_lines:
                break
            continue
        if stripped.startswith(">"):
            # Use blockquote content as the description
            paragraph_lines.append(stripped.lstrip("> ").strip())
            continue
        if stripped.startswith("- ") or stripped.startswith("* "):
            if paragraph_lines:
                break
            continue
        paragraph_lines.append(stripped)
    desc = " ".join(paragraph_lines)
    # Truncate to ~150 characters on a word boundary
    if len(desc) > 150:
        desc = desc[:147].rsplit(" ", 1)[0] + "..."
    return desc


def categorize_file(filepath: Path) -> str:
    """Categorize a doc file into a section based on path/name heuristics."""
    parts = [p.lower() for p in filepath.parts]
    name = filepath.stem.lower()
    category_map = {
        "api": "API Reference",
        "api-reference": "API Reference",
        "reference": "API Reference",
        "guide": "Guides",
        "guides": "Guides",
        "tutorial": "Guides",
        "tutorials": "Guides",
        "getting-started": "Getting Started",
        "quickstart": "Getting Started",
        "quick-start": "Getting Started",
        "setup": "Getting Started",
        "installation": "Getting Started",
        "install": "Getting Started",
        "config": "Configuration",
        "configuration": "Configuration",
        "settings": "Configuration",
        "deploy": "Deployment",
        "deployment": "Deployment",
        "hosting": "Deployment",
        "architecture": "Architecture",
        "design": "Architecture",
        "faq": "Optional",
        "changelog": "Optional",
        "contributing": "Optional",
        "migration": "Optional",
        "troubleshoot": "Optional",
        "troubleshooting": "Optional",
    }
    # Check path parts and the filename itself
    for part in parts + [name]:
        if part in category_map:
            return category_map[part]
    return "Documentation"


def scan_docs(source: Path) -> list[dict]:
    """Scan a directory for markdown files and extract their metadata."""
    docs = []
    extensions = {".md", ".mdx"}
    for filepath in sorted(source.rglob("*")):
        if filepath.suffix not in extensions:
            continue
        if filepath.name.startswith("."):
            continue
        # Skip node_modules and hidden directories
        if any(p.startswith(".") or p == "node_modules" for p in filepath.parts):
            continue
        try:
            content = filepath.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        title = extract_title(content, filepath)
        description = extract_description(content)
        rel_path = filepath.relative_to(source)
        category = categorize_file(rel_path)
        docs.append({
            "title": title,
            "description": description,
            "category": category,
            "rel_path": str(rel_path),
            "abs_path": str(filepath),
            "content": content,
        })
    return docs


def build_url(rel_path: str, base_url: str) -> str:
    """Build a full URL from a relative path and a base URL."""
    if not base_url:
        return rel_path
    base = base_url.rstrip("/")
    # Remove the .md/.mdx extension for web URLs
    clean_path = re.sub(r"\.(md|mdx)$", "", rel_path)
    return f"{base}/{clean_path}"
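
# A rough sketch of the llms.txt shape that generate_llms_txt below produces,
# assuming a hypothetical project called "My Project" served from
# https://example.com/docs, with one quickstart page and one API page:
#
#   # My Project
#
#   > Short project description.
#
#   ## Getting Started
#
#   - [Quickstart](https://example.com/docs/quickstart): Install and run in minutes.
#
#   ## API Reference
#
#   - [REST API](https://example.com/docs/api/rest): Endpoints and authentication.
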
def generate_llms_txt(
    docs: list[dict],
    project_name: str,
    project_desc: str,
    base_url: str,
) -> str:
    """Generate llms.txt content from the scanned docs."""
    lines = [f"# {project_name}", ""]
    if project_desc:
        lines.append(f"> {project_desc}")
        lines.append("")

    # Group docs by category
    categories: dict[str, list[dict]] = {}
    for doc in docs:
        cat = doc["category"]
        categories.setdefault(cat, []).append(doc)

    # Sort categories: Getting Started first, Optional last, the rest alphabetical
    priority = {"Getting Started": 0, "Documentation": 5, "Optional": 99}
    sorted_cats = sorted(
        categories.keys(),
        key=lambda c: (priority.get(c, 10), c),
    )

    for cat in sorted_cats:
        cat_docs = categories[cat]
        lines.append(f"## {cat}")
        lines.append("")
        for doc in cat_docs:
            url = build_url(doc["rel_path"], base_url)
            desc_part = f": {doc['description']}" if doc["description"] else ""
            lines.append(f"- [{doc['title']}]({url}){desc_part}")
        lines.append("")

    return "\n".join(lines).rstrip() + "\n"


def generate_llms_full_txt(
    docs: list[dict],
    project_name: str,
    project_desc: str,
) -> str:
    """Generate llms-full.txt with the full document content inlined."""
    lines = [f"# {project_name}", ""]
    if project_desc:
        lines.append(f"> {project_desc}")
        lines.append("")

    # Group docs by category
    categories: dict[str, list[dict]] = {}
    for doc in docs:
        cat = doc["category"]
        categories.setdefault(cat, []).append(doc)

    priority = {"Getting Started": 0, "Documentation": 5, "Optional": 99}
    sorted_cats = sorted(
        categories.keys(),
        key=lambda c: (priority.get(c, 10), c),
    )

    for cat in sorted_cats:
        cat_docs = categories[cat]
        lines.append(f"## {cat}")
        lines.append("")
        for doc in cat_docs:
            lines.append(f"### {doc['title']}")
            lines.append("")
            # Include the full content minus frontmatter and the H1
            content = doc["content"]
            # Strip frontmatter
            content = re.sub(
                r"^---\s*\n.*?\n---\s*\n", "", content, flags=re.DOTALL
            )
            # Strip the H1
            content = re.sub(r"^#\s+.+\n*", "", content)
            lines.append(content.strip())
            lines.append("")

    return "\n".join(lines).rstrip() + "\n"


def detect_project_info(source: Path) -> tuple[str, str]:
    """Try to detect the project name and description from common files."""
    name = source.resolve().name
    desc = ""

    # Check package.json in or above the docs directory
    pkg = source / "package.json"
    if not pkg.exists():
        pkg = source.parent / "package.json"
    if pkg.exists():
        try:
            data = json.loads(pkg.read_text(encoding="utf-8"))
            name = data.get("name", name)
            desc = data.get("description", desc)
        except (OSError, json.JSONDecodeError):
            pass

    # Check the README for an H1 title and a first paragraph
    for readme_name in ["README.md", "readme.md", "Readme.md"]:
        readme = source / readme_name
        if not readme.exists():
            readme = source.parent / readme_name
        if readme.exists():
            try:
                content = readme.read_text(encoding="utf-8")
                h1_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
                if h1_match:
                    name = h1_match.group(1).strip()
                if not desc:
                    desc = extract_description(content)
            except OSError:
                pass
            break

    return name, desc


def main():
    parser = argparse.ArgumentParser(
        description="Generate llms.txt from a documentation directory"
    )
    parser.add_argument(
        "--source",
        required=True,
        help="Path to the docs directory",
    )
    parser.add_argument(
        "--output",
        default=".",
        help="Output directory (default: current directory)",
    )
    parser.add_argument(
        "--base-url",
        default="",
        help="Base URL prefix for doc links",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Also generate llms-full.txt with inline content",
    )
    parser.add_argument(
        "--project-name",
        default="",
        help="Project name (auto-detected if not provided)",
    )
    parser.add_argument(
        "--project-description",
        default="",
        help="Project description (auto-detected if not provided)",
    )
    args = parser.parse_args()

    source = Path(args.source).resolve()
    if not source.is_dir():
        print(f"Error: Source path '{source}' is not a directory", file=sys.stderr)
        sys.exit(1)

    output_dir = Path(args.output).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Detect or use the provided project info
    auto_name, auto_desc = detect_project_info(source)
    project_name = args.project_name or auto_name
    project_desc = args.project_description or auto_desc

    # Scan the docs
    docs = scan_docs(source)
    if not docs:
        print(f"Warning: No markdown files found in '{source}'", file=sys.stderr)
        sys.exit(1)
    print(f"Found {len(docs)} documentation files")

    # Generate llms.txt
    llms_txt = generate_llms_txt(docs, project_name, project_desc, args.base_url)
    llms_path = output_dir / "llms.txt"
    llms_path.write_text(llms_txt, encoding="utf-8")
    print(f"Generated: {llms_path}")

    # Generate llms-full.txt if requested
    if args.full:
        llms_full = generate_llms_full_txt(docs, project_name, project_desc)
        full_path = output_dir / "llms-full.txt"
        full_path.write_text(llms_full, encoding="utf-8")
        print(f"Generated: {full_path}")

    print("Done!")


if __name__ == "__main__":
    main()