From e0a3c0a3995ecf8693108b8fc77dfa7dc264d652 Mon Sep 17 00:00:00 2001
From: molebox
Date: Fri, 27 Mar 2026 14:31:01 +0100
Subject: [PATCH] Add AI agent detection and automatic markdown rewrites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When AI agents (Claude, ChatGPT, Cursor, etc.) request docs pages, the proxy
now detects them and transparently rewrites to the markdown route — matching
the geistdocs template default.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Review notes (this section is ignored by git am):
- Slackbot and Discordbot are added to the Layer 3 exclusion list. Their
  user agents contain "bot", so the heuristic would otherwise rewrite link
  unfurl requests for docs pages to markdown.
- isAIAgent lowercases the user agent once instead of once per layer.
- Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; verify context-line indentation against the tree before
  applying. The blob id on the ai-agent-detection.ts index line predates
  these review edits.

 apps/docs/lib/ai-agent-detection.ts    | 171 +++++++++++++++++++++++++
 apps/docs/lib/geistdocs/md-tracking.ts |   6 +-
 apps/docs/proxy.ts                     |  30 +++++
 3 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 apps/docs/lib/ai-agent-detection.ts

diff --git a/apps/docs/lib/ai-agent-detection.ts b/apps/docs/lib/ai-agent-detection.ts
new file mode 100644
index 00000000..be0a02a2
--- /dev/null
+++ b/apps/docs/lib/ai-agent-detection.ts
@@ -0,0 +1,171 @@
+/**
+ * AI Agent Detection Utility
+ *
+ * Multi-signal detection for AI agents/bots. Used to serve markdown
+ * responses when agents request docs pages.
+ *
+ * Three detection layers:
+ * 1. Known UA patterns (definitive) — curated from https://bots.fyi/?tags=ai_assistant
+ * 2. Signature-Agent header (definitive) — catches ChatGPT agent (RFC 9421)
+ * 3. Missing browser fingerprint heuristic — catches unknown bots
+ *
+ * Optimizes for recall over precision: serving markdown to a non-AI bot
+ * is low-harm; missing an AI agent means a worse experience.
+ *
+ * Last reviewed: 2026-03-20 against bots.fyi + official vendor docs
+ */
+
+// Layer 1: Known AI agent UA substrings (lowercase).
+const AI_AGENT_UA_PATTERNS: readonly string[] = [
+  // Anthropic — https://support.claude.com/en/articles/8896518
+  "claudebot",
+  "claude-searchbot",
+  "claude-user",
+  "anthropic-ai",
+  "claude-web",
+
+  // OpenAI — https://platform.openai.com/docs/bots
+  "chatgpt",
+  "gptbot",
+  "oai-searchbot",
+  "openai",
+
+  // Google AI
+  "gemini",
+  "bard",
+  "google-cloudvertexbot",
+  "google-extended",
+
+  // Meta
+  "meta-externalagent",
+  "meta-externalfetcher",
+  "meta-webindexer",
+
+  // Search/Research AI
+  "perplexity",
+  "youbot",
+  "you.com",
+  "deepseekbot",
+
+  // Coding assistants
+  "cursor",
+  "github-copilot",
+  "codeium",
+  "tabnine",
+  "sourcegraph",
+
+  // Other AI agents / data scrapers (low-harm to serve markdown)
+  "cohere-ai",
+  "bytespider",
+  "amazonbot",
+  "ai2bot",
+  "diffbot",
+  "omgili",
+  "omgilibot",
+];
+
+// Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421).
+const SIGNATURE_AGENT_DOMAINS: readonly string[] = ["chatgpt.com"];
+
+// Layer 3: Traditional bot exclusion list — bots that should NOT trigger
+// the heuristic layer (they're search engine crawlers, social previews, or
+// monitoring tools, not AI agents).
+const TRADITIONAL_BOT_PATTERNS: readonly string[] = [
+  "googlebot",
+  "bingbot",
+  "yandexbot",
+  "baiduspider",
+  "duckduckbot",
+  "slurp",
+  "msnbot",
+  "facebot",
+  "twitterbot",
+  "linkedinbot",
+  "whatsapp",
+  "telegrambot",
+  // Link unfurlers (UA contains "bot", would otherwise hit the heuristic)
+  "slackbot",
+  "discordbot",
+  "pingdom",
+  "uptimerobot",
+  "newrelic",
+  "datadog",
+  "statuspage",
+  "site24x7",
+  "applebot",
+];
+
+// Broad regex for bot-like UA strings (used only in Layer 3 heuristic).
+const BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i;
+
+export type DetectionMethod = "ua-match" | "signature-agent" | "heuristic";
+
+export interface DetectionResult {
+  detected: boolean;
+  method: DetectionMethod | null;
+}
+
+/**
+ * Detects AI agents from HTTP request headers.
+ *
+ * Layers are checked in order (UA match, Signature-Agent, heuristic) and the
+ * first hit wins. Returns both whether the agent was detected and which
+ * signal triggered, so callers can log the detection method for accuracy
+ * tracking.
+ *
+ * @param request - Any object exposing a `headers.get(name)` lookup.
+ * @returns The detection flag plus the layer that matched, or
+ *   `{ detected: false, method: null }` when no layer matches.
+ */
+export function isAIAgent(request: {
+  headers: { get(name: string): string | null };
+}): DetectionResult {
+  // Lowercase once; a missing UA becomes "" so no pattern can match it.
+  const lowerUA = request.headers.get("user-agent")?.toLowerCase() ?? "";
+
+  // Layer 1: Known UA pattern match
+  if (AI_AGENT_UA_PATTERNS.some((pattern) => lowerUA.includes(pattern))) {
+    return { detected: true, method: "ua-match" };
+  }
+
+  // Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent)
+  const lowerSig = request.headers.get("signature-agent")?.toLowerCase() ?? "";
+  if (SIGNATURE_AGENT_DOMAINS.some((domain) => lowerSig.includes(domain))) {
+    return { detected: true, method: "signature-agent" };
+  }
+
+  // Layer 3: Missing browser fingerprint heuristic
+  // Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode
+  // on navigation requests. Its absence signals a programmatic client.
+  const secFetchMode = request.headers.get("sec-fetch-mode");
+  if (!secFetchMode && BOT_LIKE_REGEX.test(lowerUA)) {
+    const isTraditionalBot = TRADITIONAL_BOT_PATTERNS.some((pattern) =>
+      lowerUA.includes(pattern)
+    );
+    if (!isTraditionalBot) {
+      return { detected: true, method: "heuristic" };
+    }
+  }
+
+  return { detected: false, method: null };
+}
+
+/**
+ * Generates a markdown response for AI agents that hit non-existent URLs.
+ */
+export function generateAgentNotFoundResponse(requestedPath: string): string {
+  return `# Page Not Found
+
+The URL \`${requestedPath}\` does not exist in the documentation.
+
+## How to find the correct page
+
+1. **Browse the sitemap**: [/sitemap.md](/sitemap.md) — A structured index of all pages with URLs, content types, and descriptions
+2. **Browse the full index**: [/llms.txt](/llms.txt) — Complete documentation index
+
+## Tips for requesting documentation
+
+- For markdown responses, append \`.md\` to URLs (e.g., \`/docs/getting-started.md\`)
+- Use \`Accept: text/markdown\` header for content negotiation
+`;
+}
diff --git a/apps/docs/lib/geistdocs/md-tracking.ts b/apps/docs/lib/geistdocs/md-tracking.ts
index 4035976f..8dcdf550 100644
--- a/apps/docs/lib/geistdocs/md-tracking.ts
+++ b/apps/docs/lib/geistdocs/md-tracking.ts
@@ -3,11 +3,13 @@ import { siteId } from "@/geistdocs";
 const PLATFORM_URL = "https://geistdocs.com/md-tracking";
 
 interface TrackMdRequestParams {
+  /** Detection method used to identify the agent (only for agent-rewrite requests) */
+  detectionMethod?: "ua-match" | "signature-agent" | "heuristic" | null;
   acceptHeader: string | null;
   path: string;
   referer: string | null;
   /** How the markdown was requested: 'md-url' for direct .md URLs, 'header-negotiated' for Accept header */
-  requestType?: "md-url" | "header-negotiated";
+  requestType?: "md-url" | "header-negotiated" | "agent-rewrite";
   userAgent: string | null;
 }
 
@@ -21,6 +23,7 @@ export async function trackMdRequest({
   referer,
   acceptHeader,
   requestType,
+  detectionMethod,
 }: TrackMdRequestParams): Promise<void> {
   try {
     const response = await fetch(PLATFORM_URL, {
@@ -35,6 +38,7 @@ export async function trackMdRequest({
         referer,
         acceptHeader,
         requestType,
+        detectionMethod,
       }),
     });
 
diff --git a/apps/docs/proxy.ts b/apps/docs/proxy.ts
index baafb833..a141b164 100644
--- a/apps/docs/proxy.ts
+++ b/apps/docs/proxy.ts
@@ -6,6 +6,7 @@ import {
   NextResponse,
 } from "next/server";
 import { i18n } from "@/lib/geistdocs/i18n";
+import { isAIAgent } from "@/lib/ai-agent-detection";
 import { trackMdRequest } from "@/lib/geistdocs/md-tracking";
 
 const { rewrite: rewriteLLM } = rewritePath(
@@ -57,6 +58,35 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => {
     }
   }
 
+  // AI agent detection — rewrite docs pages to markdown for agents
+  // so they always get structured content without needing .md URLs or Accept headers
+  if (
+    (pathname === "/docs" || pathname.startsWith("/docs/")) &&
+    !pathname.includes("/llms.mdx/")
+  ) {
+    const agentResult = isAIAgent(request);
+    if (agentResult.detected && !isMarkdownPreferred(request)) {
+      const result =
+        pathname === "/docs"
+          ? `/${i18n.defaultLanguage}/llms.mdx`
+          : rewriteLLM(pathname);
+
+      if (result) {
+        context.waitUntil(
+          trackMdRequest({
+            path: pathname,
+            userAgent: request.headers.get("user-agent"),
+            referer: request.headers.get("referer"),
+            acceptHeader: request.headers.get("accept"),
+            requestType: "agent-rewrite",
+            detectionMethod: agentResult.method,
+          })
+        );
+        return NextResponse.rewrite(new URL(result, request.nextUrl));
+      }
+    }
+  }
+
   // Handle Accept header content negotiation and track the request
   if (isMarkdownPreferred(request)) {
     const result = rewriteLLM(pathname);