212 lines
5.1 KiB
JavaScript
Executable File
212 lines
5.1 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
/**
|
|
* llms.txt Analyzer Script
|
|
* Parses llms.txt content and categorizes URLs for optimal agent distribution
|
|
*/
|
|
|
|
const { loadEnv } = require('./utils/env-loader');
|
|
|
|
// Load environment
|
|
const env = loadEnv();
|
|
const DEBUG = env.DEBUG === 'true';
|
|
|
|
/**
|
|
* URL priority categories
|
|
*/
|
|
const PRIORITY_KEYWORDS = {
|
|
critical: [
|
|
'getting-started', 'quick-start', 'quickstart', 'introduction', 'intro', 'overview',
|
|
'installation', 'install', 'setup', 'basics', 'core-concepts', 'fundamentals',
|
|
],
|
|
supplementary: [
|
|
'advanced', 'internals', 'migration', 'migrate', 'troubleshooting', 'troubleshoot',
|
|
'faq', 'frequently-asked', 'changelog', 'contributing', 'contribute',
|
|
],
|
|
important: [
|
|
'guide', 'tutorial', 'example', 'api-reference', 'api', 'reference',
|
|
'configuration', 'config', 'routing', 'route', 'data-fetching', 'authentication', 'auth',
|
|
],
|
|
};
|
|
|
|
/**
|
|
* Categorize URL by priority
|
|
* @param {string} url - Documentation URL
|
|
* @returns {string} Priority level (critical/important/supplementary)
|
|
*/
|
|
function categorizeUrl(url) {
|
|
const urlLower = url.toLowerCase();
|
|
|
|
// Check in priority order: critical first, then supplementary, then important
|
|
// This ensures specific keywords (advanced, internals) are caught before generic ones
|
|
const priorities = ['critical', 'supplementary', 'important'];
|
|
|
|
for (const priority of priorities) {
|
|
const keywords = PRIORITY_KEYWORDS[priority];
|
|
for (const keyword of keywords) {
|
|
if (urlLower.includes(keyword)) {
|
|
return priority;
|
|
}
|
|
}
|
|
}
|
|
|
|
return 'important'; // Default
|
|
}
|
|
|
|
/**
|
|
* Parse llms.txt content to extract URLs
|
|
* @param {string} content - llms.txt content
|
|
* @returns {Array<string>} Array of URLs
|
|
*/
|
|
function parseUrls(content) {
|
|
if (!content || typeof content !== 'string') {
|
|
return [];
|
|
}
|
|
|
|
const urls = [];
|
|
const lines = content.split('\n');
|
|
|
|
for (const line of lines) {
|
|
const trimmed = line.trim();
|
|
|
|
// Skip comments and empty lines
|
|
if (!trimmed || trimmed.startsWith('#')) continue;
|
|
|
|
// Extract URLs (look for http/https)
|
|
const urlMatch = trimmed.match(/https?:\/\/[^\s<>"]+/i);
|
|
if (urlMatch) {
|
|
urls.push(urlMatch[0]);
|
|
}
|
|
}
|
|
|
|
return urls;
|
|
}
|
|
|
|
/**
|
|
* Group URLs by priority
|
|
* @param {Array<string>} urls - Array of URLs
|
|
* @returns {Object} URLs grouped by priority
|
|
*/
|
|
function groupByPriority(urls) {
|
|
const groups = {
|
|
critical: [],
|
|
important: [],
|
|
supplementary: [],
|
|
};
|
|
|
|
for (const url of urls) {
|
|
const priority = categorizeUrl(url);
|
|
groups[priority].push(url);
|
|
}
|
|
|
|
return groups;
|
|
}
|
|
|
|
/**
|
|
* Suggest optimal agent distribution
|
|
* @param {number} urlCount - Total number of URLs
|
|
* @returns {Object} Agent distribution suggestion
|
|
*/
|
|
function suggestAgentDistribution(urlCount) {
|
|
if (urlCount <= 3) {
|
|
return {
|
|
agentCount: 1,
|
|
strategy: 'single',
|
|
urlsPerAgent: urlCount,
|
|
description: 'Single agent can handle all URLs',
|
|
};
|
|
} else if (urlCount <= 10) {
|
|
const agents = Math.min(Math.ceil(urlCount / 2), 5);
|
|
return {
|
|
agentCount: agents,
|
|
strategy: 'parallel',
|
|
urlsPerAgent: Math.ceil(urlCount / agents),
|
|
description: `Deploy ${agents} agents in parallel`,
|
|
};
|
|
} else if (urlCount <= 20) {
|
|
return {
|
|
agentCount: 7,
|
|
strategy: 'parallel',
|
|
urlsPerAgent: Math.ceil(urlCount / 7),
|
|
description: 'Deploy 7 agents with balanced workload',
|
|
};
|
|
} else {
|
|
return {
|
|
agentCount: 7,
|
|
strategy: 'phased',
|
|
urlsPerAgent: Math.ceil(urlCount / 7),
|
|
phases: 2,
|
|
description: 'Use two-phase approach: critical first, then important',
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Analyze llms.txt content
|
|
* @param {string} content - llms.txt content
|
|
* @returns {Object} Analysis result
|
|
*/
|
|
function analyzeLlmsTxt(content) {
|
|
const urls = parseUrls(content);
|
|
const grouped = groupByPriority(urls);
|
|
const distribution = suggestAgentDistribution(urls.length);
|
|
|
|
return {
|
|
totalUrls: urls.length,
|
|
urls,
|
|
grouped,
|
|
distribution,
|
|
summary: {
|
|
critical: grouped.critical.length,
|
|
important: grouped.important.length,
|
|
supplementary: grouped.supplementary.length,
|
|
},
|
|
};
|
|
}
|
|
|
|
/**
|
|
* CLI entry point
|
|
*/
|
|
function main() {
|
|
const args = process.argv.slice(2);
|
|
|
|
if (args.length === 0) {
|
|
console.error('Usage: node analyze-llms-txt.js <content-file-or-stdin>');
|
|
console.error('Or pipe content: cat llms.txt | node analyze-llms-txt.js');
|
|
process.exit(1);
|
|
}
|
|
|
|
const fs = require('fs');
|
|
let content;
|
|
|
|
if (args[0] === '-') {
|
|
// Read from stdin
|
|
content = fs.readFileSync(0, 'utf8');
|
|
} else {
|
|
// Read from file
|
|
const filePath = args[0];
|
|
if (!fs.existsSync(filePath)) {
|
|
console.error(`Error: File not found: ${filePath}`);
|
|
process.exit(1);
|
|
}
|
|
content = fs.readFileSync(filePath, 'utf8');
|
|
}
|
|
|
|
const result = analyzeLlmsTxt(content);
|
|
console.log(JSON.stringify(result, null, 2));
|
|
process.exit(0);
|
|
}
|
|
|
|
// Run if called directly
|
|
if (require.main === module) {
|
|
main();
|
|
}
|
|
|
|
module.exports = {
|
|
analyzeLlmsTxt,
|
|
parseUrls,
|
|
groupByPriority,
|
|
categorizeUrl,
|
|
suggestAgentDistribution,
|
|
};
|