init
This commit is contained in:
350
.opencode/skills/llms/scripts/generate-llms-txt.py
Executable file
350
.opencode/skills/llms/scripts/generate-llms-txt.py
Executable file
@@ -0,0 +1,350 @@
|
||||
#!/usr/bin/env python3
"""Generate llms.txt from a docs directory following the llmstxt.org specification.

Usage:
    python3 generate-llms-txt.py --source <path> [--output <path>] [--base-url <url>] [--full] [--project-name <name>] [--project-description <desc>]

Examples:
    python3 generate-llms-txt.py --source ./docs --base-url https://example.com/docs
    python3 generate-llms-txt.py --source ./docs --output ./public --full --project-name "My Project"
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_title(content: str, filepath: Path) -> str:
    """Return the document title: the first Markdown H1, else a name-based fallback.

    Args:
        content: Full text of the Markdown/MDX document.
        filepath: Path of the document; only its stem is used for the fallback.

    Returns:
        The stripped text of the first ``# Heading`` line found outside a
        leading ``---`` frontmatter block and outside fenced code blocks.
        When there is no such heading, the file stem with ``-`` and ``_``
        replaced by spaces and title-cased.
    """
    lines = content.split("\n")

    # Skip a leading frontmatter block so a "# comment" line inside it is not
    # mistaken for the document heading. An unterminated block is not skipped.
    start = 0
    if lines and lines[0].strip() == "---":
        for idx in range(1, len(lines)):
            if lines[idx].strip() == "---":
                start = idx + 1
                break

    in_fence = False
    for line in lines[start:]:
        # Simple fence tracking: any ``` or ~~~ line toggles the state, so
        # shell/Python comments inside code samples are ignored.
        if line.lstrip().startswith(("```", "~~~")):
            in_fence = not in_fence
            continue
        if in_fence:
            continue
        # [ \t]+ instead of \s+: \s also matches "\n", which let an empty
        # "#" line capture the following line as the title.
        match = re.match(r"#[ \t]+(.+)$", line)
        if match:
            title = match.group(1).strip()
            if title:
                return title

    return filepath.stem.replace("-", " ").replace("_", " ").title()
|
||||
|
||||
|
||||
def extract_description(content: str) -> str:
    """Return a short description taken from the first paragraph after the H1.

    Scanning starts after the first line beginning with ``# ``. Blank lines,
    headings, ``---`` rules, list items and fenced code blocks are skipped
    while no text has been collected, and end the paragraph once text has
    been collected. Blockquote lines contribute their text without the
    leading ``>`` markers.

    Args:
        content: Full text of the Markdown/MDX document.

    Returns:
        The paragraph joined into one line, truncated at a word boundary with
        a trailing ``...`` when longer than 150 characters. An empty string
        when there is no H1 or no paragraph follows it.
    """
    found_h1 = False
    in_fence = False
    paragraph_lines: list[str] = []

    for line in content.split("\n"):
        stripped = line.strip()

        # Fenced code must never leak into the description: a fence ends an
        # already-started paragraph, otherwise the whole block is skipped.
        if stripped.startswith(("```", "~~~")):
            if paragraph_lines:
                break
            in_fence = not in_fence
            continue
        if in_fence:
            continue

        if not found_h1:
            found_h1 = stripped.startswith("# ")
            continue

        # Structural lines: skip them before the paragraph, stop on them after.
        if not stripped or stripped.startswith(("#", "---", "- ", "* ")):
            if paragraph_lines:
                break
            continue

        if stripped.startswith(">"):
            # Use blockquote content as description; an empty ">" line adds
            # nothing instead of producing a doubled space in the result.
            quoted = stripped.lstrip("> ").strip()
            if quoted:
                paragraph_lines.append(quoted)
            continue

        paragraph_lines.append(stripped)

    desc = " ".join(paragraph_lines)
    # Truncate to ~150 chars, cutting at the last space before the limit.
    if len(desc) > 150:
        desc = desc[:147].rsplit(" ", 1)[0] + "..."
    return desc
|
||||
|
||||
|
||||
def categorize_file(filepath: Path) -> str:
    """Map a doc path to an llms.txt section name using keyword heuristics.

    Every path component (lower-cased, in order) and finally the lower-cased
    file stem is compared against a fixed keyword table; the first exact
    match decides the section.

    Args:
        filepath: Path of the document, normally relative to the docs root.

    Returns:
        The matching section title, or ``"Documentation"`` when no component
        matches a keyword.
    """
    # Section title -> keywords that select it.
    sections = {
        "API Reference": ("api", "api-reference", "reference"),
        "Guides": ("guide", "guides", "tutorial", "tutorials"),
        "Getting Started": (
            "getting-started",
            "quickstart",
            "quick-start",
            "setup",
            "installation",
            "install",
        ),
        "Configuration": ("config", "configuration", "settings"),
        "Deployment": ("deploy", "deployment", "hosting"),
        "Architecture": ("architecture", "design"),
        "Optional": (
            "faq",
            "changelog",
            "contributing",
            "migration",
            "troubleshoot",
            "troubleshooting",
        ),
    }
    keyword_to_section = {
        keyword: section
        for section, keywords in sections.items()
        for keyword in keywords
    }

    # Directory components take precedence over the bare file stem.
    candidates = [segment.lower() for segment in filepath.parts]
    candidates.append(filepath.stem.lower())

    return next(
        (keyword_to_section[c] for c in candidates if c in keyword_to_section),
        "Documentation",
    )
|
||||
|
||||
|
||||
def scan_docs(source: Path) -> list[dict]:
    """Scan a directory tree for Markdown/MDX files and collect their metadata.

    Files are visited in sorted path order. A file is skipped when its
    suffix is not ``.md``/``.mdx``, when any component of its path *below*
    ``source`` starts with ``.`` or equals ``node_modules``, or when it
    cannot be read as UTF-8.

    Args:
        source: Root directory of the documentation tree.

    Returns:
        One dict per document with the keys ``title``, ``description``,
        ``category``, ``rel_path`` (relative to ``source``, ``/``-separated),
        ``abs_path`` and ``content``.
    """
    docs = []
    extensions = {".md", ".mdx"}

    for filepath in sorted(source.rglob("*")):
        if filepath.suffix not in extensions:
            continue

        rel_path = filepath.relative_to(source)
        # Filter on the path relative to the docs root. Testing the absolute
        # path would drop every file whenever the root itself lives under a
        # hidden directory (e.g. ~/.config/project/docs).
        if any(
            part.startswith(".") or part == "node_modules"
            for part in rel_path.parts
        ):
            continue

        try:
            content = filepath.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Unreadable or non-UTF-8 entries (including directories whose
            # name ends in .md) are skipped rather than aborting the scan.
            continue

        docs.append({
            "title": extract_title(content, filepath),
            "description": extract_description(content),
            "category": categorize_file(rel_path),
            # Forward slashes on every platform: this value ends up in links.
            "rel_path": rel_path.as_posix(),
            "abs_path": str(filepath),
            "content": content,
        })

    return docs
|
||||
|
||||
|
||||
def build_url(rel_path: str, base_url: str) -> str:
    """Turn a doc's relative path into the link target used in llms.txt.

    Args:
        rel_path: Path of the document relative to the docs root.
        base_url: URL prefix; may be empty.

    Returns:
        ``rel_path`` unchanged when ``base_url`` is empty. Otherwise the base
        URL (without trailing slashes) joined by ``/`` to ``rel_path`` with a
        trailing ``.md``/``.mdx`` extension removed.
    """
    if base_url:
        # Web URLs are served without the Markdown extension.
        page = re.sub(r"\.(md|mdx)$", "", rel_path)
        return "/".join((base_url.rstrip("/"), page))
    return rel_path
|
||||
|
||||
|
||||
def generate_llms_txt(
    docs: list[dict],
    project_name: str,
    project_desc: str,
    base_url: str,
) -> str:
    """Render the llms.txt index: a title, optional summary and linked sections.

    Args:
        docs: Document records providing ``category``, ``rel_path``,
            ``title`` and ``description``.
        project_name: Text of the top-level ``#`` heading.
        project_desc: Summary emitted as a blockquote; omitted when empty.
        base_url: URL prefix passed to ``build_url`` for every link.

    Returns:
        The file content, ending with exactly one newline. Sections are
        ordered "Getting Started", "Documentation", all other sections
        alphabetically, then "Optional"; documents keep their input order.
    """
    out = [f"# {project_name}", ""]
    if project_desc:
        out += [f"> {project_desc}", ""]

    # Bucket the documents by section, preserving their relative order.
    grouped: dict[str, list[dict]] = {}
    for entry in docs:
        grouped.setdefault(entry["category"], []).append(entry)

    rank = {"Getting Started": 0, "Documentation": 5, "Optional": 99}

    def section_order(section: str) -> tuple[int, str]:
        # Unranked sections share rank 10 and fall back to alphabetical order.
        return rank.get(section, 10), section

    for section in sorted(grouped, key=section_order):
        out += [f"## {section}", ""]
        for entry in grouped[section]:
            link = build_url(entry["rel_path"], base_url)
            summary = entry["description"]
            suffix = f": {summary}" if summary else ""
            out.append(f"- [{entry['title']}]({link}){suffix}")
        out.append("")

    return "\n".join(out).rstrip() + "\n"
|
||||
|
||||
|
||||
def generate_llms_full_txt(
    docs: list[dict],
    project_name: str,
    project_desc: str,
) -> str:
    """Render llms-full.txt: the same sections as llms.txt with inlined bodies.

    Each document becomes a ``###`` heading carrying its title, followed by
    its content with a leading frontmatter block and a leading H1 removed so
    the title is not repeated.

    Args:
        docs: Document records providing ``category``, ``title`` and
            ``content``.
        project_name: Text of the top-level ``#`` heading.
        project_desc: Summary emitted as a blockquote; omitted when empty.

    Returns:
        The file content, ending with exactly one newline. Sections are
        ordered "Getting Started", "Documentation", all other sections
        alphabetically, then "Optional"; documents keep their input order.
    """
    lines = [f"# {project_name}", ""]

    if project_desc:
        lines.append(f"> {project_desc}")
        lines.append("")

    # Group by category
    categories: dict[str, list[dict]] = {}
    for doc in docs:
        categories.setdefault(doc["category"], []).append(doc)

    priority = {"Getting Started": 0, "Documentation": 5, "Optional": 99}
    sorted_cats = sorted(
        categories,
        key=lambda c: (priority.get(c, 10), c),
    )

    for cat in sorted_cats:
        lines.append(f"## {cat}")
        lines.append("")
        for doc in categories[cat]:
            lines.append(f"### {doc['title']}")
            lines.append("")

            # A BOM would stop both anchored patterns below from matching.
            content = doc["content"].lstrip("\ufeff")
            # Strip frontmatter
            content = re.sub(
                r"^---\s*\n.*?\n---\s*\n", "", content, flags=re.DOTALL
            )
            # The H1 pattern is anchored at the start of the string, so drop
            # leading blank lines first; otherwise the heading survives and
            # the title appears twice.
            content = content.lstrip()
            # Strip H1. [ \t]+ (not \s+) keeps the match on the heading line;
            # \s+ could run past an empty "# " and delete the next line.
            content = re.sub(r"^#[ \t]+[^\n]*\n*", "", content)

            lines.append(content.strip())
            lines.append("")

    return "\n".join(lines).rstrip() + "\n"
|
||||
|
||||
|
||||
def detect_project_info(source: Path) -> tuple[str, str]:
    """Best-effort detection of the project name and description.

    Sources, in order:

    1. The resolved directory name of ``source`` (initial name).
    2. ``package.json`` in ``source`` or, failing that, its parent: the
       ``name`` and ``description`` fields when they are non-empty strings.
    3. The first of ``README.md``/``readme.md``/``Readme.md`` found in
       ``source`` or its parent: its H1 overrides the name, and its first
       paragraph is used only when no description was found yet.

    Unreadable, non-UTF-8 or malformed files are ignored rather than raised.

    Args:
        source: Root directory of the documentation tree.

    Returns:
        A ``(name, description)`` tuple; the description may be empty.
    """
    # Local import: json is not among the module's top-level imports.
    import json

    name = source.resolve().name
    desc = ""

    # Check package.json
    pkg = source / "package.json"
    if not pkg.exists():
        pkg = source.parent / "package.json"
    if pkg.exists():
        try:
            data = json.loads(pkg.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for a non-UTF-8 file.
            data = None
        # Valid JSON is not necessarily an object, and its fields are not
        # necessarily strings; only use values of the expected shape.
        if isinstance(data, dict):
            pkg_name = data.get("name")
            if isinstance(pkg_name, str) and pkg_name.strip():
                name = pkg_name
            pkg_desc = data.get("description")
            if isinstance(pkg_desc, str) and pkg_desc.strip():
                desc = pkg_desc

    # Check README for H1 + first paragraph
    for readme_name in ["README.md", "readme.md", "Readme.md"]:
        readme = source / readme_name
        if not readme.exists():
            readme = source.parent / readme_name
        if not readme.exists():
            continue

        try:
            content = readme.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Only the first README found is considered, readable or not.
            break

        # [ \t]+ keeps the match on the heading line (\s would also match
        # a newline and capture the following line for an empty "#").
        h1_match = re.search(r"^#[ \t]+(.+)$", content, re.MULTILINE)
        if h1_match and h1_match.group(1).strip():
            name = h1_match.group(1).strip()
        if not desc:
            desc = extract_description(content)
        break

    return name, desc
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse options, scan the docs, write the files.

    Writes ``llms.txt`` into the output directory and, with ``--full``, also
    ``llms-full.txt``. Exits with status 1 when the source path is not a
    directory or when no Markdown files are found.
    """
    parser = argparse.ArgumentParser(
        description="Generate llms.txt from documentation directory"
    )
    # Declared as data and registered in order, so --help lists the options
    # in this sequence.
    cli_options = (
        ("--source", {"required": True, "help": "Path to docs directory"}),
        (
            "--output",
            {"default": ".", "help": "Output directory (default: current directory)"},
        ),
        ("--base-url", {"default": "", "help": "Base URL prefix for doc links"}),
        (
            "--full",
            {
                "action": "store_true",
                "help": "Also generate llms-full.txt with inline content",
            },
        ),
        (
            "--project-name",
            {"default": "", "help": "Project name (auto-detected if not provided)"},
        ),
        (
            "--project-description",
            {
                "default": "",
                "help": "Project description (auto-detected if not provided)",
            },
        ),
    )
    for flag, settings in cli_options:
        parser.add_argument(flag, **settings)

    opts = parser.parse_args()

    docs_root = Path(opts.source).resolve()
    if not docs_root.is_dir():
        print(f"Error: Source path '{docs_root}' is not a directory", file=sys.stderr)
        sys.exit(1)

    out_dir = Path(opts.output).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    # Explicit command-line values win over auto-detected ones.
    detected_name, detected_desc = detect_project_info(docs_root)
    project_name = opts.project_name or detected_name
    project_desc = opts.project_description or detected_desc

    docs = scan_docs(docs_root)
    if not docs:
        print(f"Warning: No markdown files found in '{docs_root}'", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(docs)} documentation files")

    # The index file is always produced.
    index_path = out_dir / "llms.txt"
    index_path.write_text(
        generate_llms_txt(docs, project_name, project_desc, opts.base_url),
        encoding="utf-8",
    )
    print(f"Generated: {index_path}")

    # The full-content file is produced only on request.
    if opts.full:
        full_path = out_dir / "llms-full.txt"
        full_path.write_text(
            generate_llms_full_txt(docs, project_name, project_desc),
            encoding="utf-8",
        )
        print(f"Generated: {full_path}")

    print("Done!")
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user