diff --git a/docs/lib/ai-agent-detection.ts b/docs/lib/ai-agent-detection.ts deleted file mode 100644 index be0a02a2e4..0000000000 --- a/docs/lib/ai-agent-detection.ts +++ /dev/null @@ -1,168 +0,0 @@ -/** - * AI Agent Detection Utility - * - * Multi-signal detection for AI agents/bots. Used to serve markdown - * responses when agents request docs pages. - * - * Three detection layers: - * 1. Known UA patterns (definitive) — curated from https://bots.fyi/?tags=ai_assistant - * 2. Signature-Agent header (definitive) — catches ChatGPT agent (RFC 9421) - * 3. Missing browser fingerprint heuristic — catches unknown bots - * - * Optimizes for recall over precision: serving markdown to a non-AI bot - * is low-harm; missing an AI agent means a worse experience. - * - * Last reviewed: 2026-03-20 against bots.fyi + official vendor docs - */ - -// Layer 1: Known AI agent UA substrings (lowercase). -const AI_AGENT_UA_PATTERNS = [ - // Anthropic — https://support.claude.com/en/articles/8896518 - "claudebot", - "claude-searchbot", - "claude-user", - "anthropic-ai", - "claude-web", - - // OpenAI — https://platform.openai.com/docs/bots - "chatgpt", - "gptbot", - "oai-searchbot", - "openai", - - // Google AI - "gemini", - "bard", - "google-cloudvertexbot", - "google-extended", - - // Meta - "meta-externalagent", - "meta-externalfetcher", - "meta-webindexer", - - // Search/Research AI - "perplexity", - "youbot", - "you.com", - "deepseekbot", - - // Coding assistants - "cursor", - "github-copilot", - "codeium", - "tabnine", - "sourcegraph", - - // Other AI agents / data scrapers (low-harm to serve markdown) - "cohere-ai", - "bytespider", - "amazonbot", - "ai2bot", - "diffbot", - "omgili", - "omgilibot", -]; - -// Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421). -const SIGNATURE_AGENT_DOMAINS = ["chatgpt.com"]; - -// Layer 3: Traditional bot exclusion list — bots that should NOT trigger -// the heuristic layer (they're search engine crawlers, social previews, or -// monitoring tools, not AI agents). -const TRADITIONAL_BOT_PATTERNS = [ - "googlebot", - "bingbot", - "yandexbot", - "baiduspider", - "duckduckbot", - "slurp", - "msnbot", - "facebot", - "twitterbot", - "linkedinbot", - "whatsapp", - "telegrambot", - "pingdom", - "uptimerobot", - "newrelic", - "datadog", - "statuspage", - "site24x7", - "applebot", -]; - -// Broad regex for bot-like UA strings (used only in Layer 3 heuristic). -const BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i; - -export type DetectionMethod = "ua-match" | "signature-agent" | "heuristic"; - -export interface DetectionResult { - detected: boolean; - method: DetectionMethod | null; -} - -/** - * Detects AI agents from HTTP request headers. - * - * Returns both whether the agent was detected and which signal triggered, - * so callers can log the detection method for accuracy tracking. - */ -export function isAIAgent(request: { - headers: { get(name: string): string | null }; -}): DetectionResult { - const userAgent = request.headers.get("user-agent"); - - // Layer 1: Known UA pattern match - if (userAgent) { - const lowerUA = userAgent.toLowerCase(); - if (AI_AGENT_UA_PATTERNS.some((pattern) => lowerUA.includes(pattern))) { - return { detected: true, method: "ua-match" }; - } - } - - // Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent) - const signatureAgent = request.headers.get("signature-agent"); - if (signatureAgent) { - const lowerSig = signatureAgent.toLowerCase(); - if (SIGNATURE_AGENT_DOMAINS.some((domain) => lowerSig.includes(domain))) { - return { detected: true, method: "signature-agent" }; - } - } - - // Layer 3: Missing browser fingerprint heuristic - // Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode - // on navigation requests. Its absence signals a programmatic client. - const secFetchMode = request.headers.get("sec-fetch-mode"); - if (!secFetchMode && userAgent && BOT_LIKE_REGEX.test(userAgent)) { - const lowerUA = userAgent.toLowerCase(); - const isTraditionalBot = TRADITIONAL_BOT_PATTERNS.some((pattern) => - lowerUA.includes(pattern) - ); - if (!isTraditionalBot) { - return { detected: true, method: "heuristic" }; - } - } - - return { detected: false, method: null }; -} - -/** - * Generates a markdown response for AI agents that hit non-existent URLs. - */ -export function generateAgentNotFoundResponse(requestedPath: string): string { - return `# Page Not Found - -The URL \`${requestedPath}\` does not exist in the documentation. - -## How to find the correct page - -1. **Browse the sitemap**: [/sitemap.md](/sitemap.md) — A structured index of all pages with URLs, content types, and descriptions -2. **Browse the full index**: [/llms.txt](/llms.txt) — Complete documentation index - -## Tips for requesting documentation - -- For markdown responses, append \`.md\` to URLs (e.g., \`/docs/getting-started.md\`) -- Use \`Accept: text/markdown\` header for content negotiation -`; -} diff --git a/docs/package.json b/docs/package.json index 3879797b6e..68ef3dbef0 100644 --- a/docs/package.json +++ b/docs/package.json @@ -31,6 +31,7 @@ "@types/node": "catalog:", "@types/react": "^19.1.12", "@types/react-dom": "^19.1.9", + "@vercel/agent-readability": "^0.2.1", "@vercel/analytics": "^1.6.1", "@vercel/edge-config": "^1.4.0", "@vercel/speed-insights": "1.3.1", @@ -97,4 +98,4 @@ "tw-animate-css": "^1.4.0", "typescript": "catalog:" } -} +} \ No newline at end of file diff --git a/docs/proxy.ts b/docs/proxy.ts index 683a1f307c..4f0d6f118a 100644 --- a/docs/proxy.ts +++ b/docs/proxy.ts @@ -1,3 +1,4 @@ +import { generateNotFoundMarkdown, isAIAgent } from '@vercel/agent-readability'; import { createI18nMiddleware } from 'fumadocs-core/i18n/middleware'; import { isMarkdownPreferred, rewritePath } from 'fumadocs-core/negotiation'; import { @@ -7,7 +8,6 @@ import { } from 'next/server'; import { i18n } from '@/lib/geistdocs/i18n'; import { trackMdRequest } from '@/lib/md-tracking'; -import { isAIAgent } from '@/lib/ai-agent-detection'; const { rewrite: rewriteLLM } = rewritePath( '/docs/*path', @@ -57,15 +57,14 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { } // AI agent detection — rewrite docs pages to markdown for agents - // so they always get structured content without needing .md URLs or Accept headers if ( - (pathname === "/docs" || pathname.startsWith("/docs/")) && - !pathname.includes("/llms.mdx/") + (pathname === '/docs' || pathname.startsWith('/docs/')) && + !pathname.includes('/llms.mdx/') ) { const agentResult = isAIAgent(request); if (agentResult.detected && !isMarkdownPreferred(request)) { const result = - pathname === "/docs" + pathname === '/docs' ? `/${i18n.defaultLanguage}/llms.mdx` : rewriteLLM(pathname); @@ -73,15 +72,19 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { context.waitUntil( trackMdRequest({ path: pathname, - userAgent: request.headers.get("user-agent"), - referer: request.headers.get("referer"), - acceptHeader: request.headers.get("accept"), - requestType: "agent-rewrite", + userAgent: request.headers.get('user-agent'), + referer: request.headers.get('referer'), + acceptHeader: request.headers.get('accept'), + requestType: 'agent-rewrite', detectionMethod: agentResult.method, }) ); return NextResponse.rewrite(new URL(result, request.nextUrl)); } + // Agent requested a non-existent docs URL — return helpful markdown + return new NextResponse(generateNotFoundMarkdown(pathname), { + headers: { 'Content-Type': 'text/markdown; charset=utf-8' }, + }); } } @@ -107,10 +110,9 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { }; export const config = { - // Matcher ignoring `/_next/`, `/api/`, static assets, favicon, sitemap, robots, etc. matcher: [ '/((?!api|_next/static|_next/image|favicon.ico|sitemap.xml|robots.txt|og|.*\\.tgz$|.*\\.svg$|.*\\.zip$).*)', ], }; -export default proxy; +export default proxy; \ No newline at end of file