diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index ce168d2a..65c8b999 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -69,7 +69,32 @@ ** xref:security:secrets.adoc[Secrets] ** xref:security:cloud-safety-reliability.adoc[Safety and Reliability] -* xref:ai-agents:index.adoc[AI Agents] +* xref:ai-agents:index.adoc[Agentic AI] +** xref:ai-agents:ai-gateway/index.adoc[AI Gateway] +*** xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[Overview] +*** xref:ai-agents:ai-gateway/ai-gateway.adoc[Quickstart] +**** xref:ai-agents:ai-gateway/quickstart-enhanced.adoc[enhanced quickstart] +*** xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[MCP Aggregation Guide] +*** xref:ai-agents:ai-gateway/observability-logs.adoc[] +*** xref:ai-agents:ai-gateway/observability-metrics.adoc[] +*** xref:ai-agents:ai-gateway/migration-guide.adoc[Migrate] +*** xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[] +*** xref:ai-agents:ai-gateway/integrations/index.adoc[Integrations] +**** Claude Code +***** xref:ai-agents:ai-gateway/integrations/claude-code-admin.adoc[Admin Guide] +***** xref:ai-agents:ai-gateway/integrations/claude-code-user.adoc[User Guide] +**** Cline +***** xref:ai-agents:ai-gateway/integrations/cline-admin.adoc[Admin Guide] +***** xref:ai-agents:ai-gateway/integrations/cline-user.adoc[User Guide] +**** Continue.dev +***** xref:ai-agents:ai-gateway/integrations/continue-admin.adoc[Admin Guide] +***** xref:ai-agents:ai-gateway/integrations/continue-user.adoc[User Guide] +**** Cursor IDE +***** xref:ai-agents:ai-gateway/integrations/cursor-admin.adoc[Admin Guide] +***** xref:ai-agents:ai-gateway/integrations/cursor-user.adoc[User Guide] +**** GitHub Copilot +***** xref:ai-agents:ai-gateway/integrations/github-copilot-admin.adoc[Admin Guide] +***** xref:ai-agents:ai-gateway/integrations/github-copilot-user.adoc[User Guide] ** xref:ai-agents:mcp/overview.adoc[MCP Overview] ** xref:ai-agents:mcp/local/index.adoc[Redpanda Cloud Management MCP Server] *** xref:ai-agents:mcp/local/overview.adoc[Overview] @@ -87,6 +112,7 @@ **** xref:ai-agents:mcp/remote/manage-servers.adoc[Manage Servers] **** xref:ai-agents:mcp/remote/scale-resources.adoc[Scale Resources] **** xref:ai-agents:mcp/remote/monitor-activity.adoc[Monitor Activity] +*** xref:ai-agents:mcp/remote/pipeline-patterns.adoc[MCP Server Patterns] * xref:develop:connect/about.adoc[Redpanda Connect] ** xref:develop:connect/connect-quickstart.adoc[Quickstart] diff --git a/modules/ai-agents/pages/ai-gateway/ai-gateway-overview.adoc b/modules/ai-agents/pages/ai-gateway/ai-gateway-overview.adoc new file mode 100644 index 00000000..193d0fd2 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/ai-gateway-overview.adoc @@ -0,0 +1,407 @@ += AI Gateway Overview +:description: Overview of Redpanda AI Gateway, its features, benefits, architecture, supported providers, deployment models, and common usage patterns. +:page-personas: app_developer, platform_admin + +Redpanda AI Gateway is a unified access layer for LLM providers and AI tools that sits between your applications and the AI services they use. It provides centralized routing, policy enforcement, cost management, and observability for all your AI traffic. + +After reading this page, you will be able to: + +* Explain how AI Gateway centralizes LLM provider management and reduces operational complexity. +* Identify key features (routing, observability, cost controls) that address common LLM integration challenges. 
+* Determine whether AI Gateway fits your use case based on traffic volume and provider diversity. + +== The problem + +Modern AI applications face several critical challenges: + +Provider fragmentation + +* Applications hardcode provider-specific SDKs (OpenAI, Anthropic, Google, etc.) +* Switching providers requires code changes and redeployment +* Testing across providers is time-consuming and error-prone +* Provider outages directly impact your application + +Cost spirals without visibility + +* No centralized view of token usage across teams and applications +* Difficult to attribute costs to specific customers, features, or environments +* Testing and debugging can rack up unexpected bills +* No way to enforce budgets or rate limits per team/customer + +Tool coordination complexity + +* Agents need access to multiple MCP (Model Context Protocol) servers +* Managing tool discovery and execution is repetitive across projects +* High token costs from loading all available tools upfront +* No centralized governance over which tools agents can access + +Observability gaps + +* Requests scattered across multiple provider dashboards +* Can't reconstruct user sessions that span multiple models +* No unified view of latency, errors, and costs +* Debugging "the AI gave the wrong answer" requires manual log diving + +== What AI Gateway solves + +Redpanda AI Gateway addresses these challenges through four core capabilities: + +=== 1. Unified LLM access (single endpoint for all providers) + +// PLACEHOLDER: Add architecture diagram showing: +// - Application → AI Gateway → Multiple LLM Providers (OpenAI, Anthropic, etc.) +// - Single baseURL configuration +// - Model routing via vendor/model_id format + +Before (direct integration) + +[source,python] +---- +# OpenAI +from openai import OpenAI +client = OpenAI(api_key="sk-...") +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}] +) + +# Anthropic (different SDK, different patterns) +from anthropic import Anthropic +client = Anthropic(api_key="sk-ant-...") +response = client.messages.create( + model="claude-sonnet-3.5", + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}] +) +---- + +After (AI Gateway - OpenAI-compatible) + +[source,python] +---- +from openai import OpenAI + +# Single configuration, multiple providers +client = OpenAI( + base_url="https://{GATEWAY_ENDPOINT}", + api_key="your-redpanda-token", + default_headers={"rp-aigw-id": "{GATEWAY_ID}"} +) + +# Route to OpenAI +response = client.chat.completions.create( + model="openai/gpt-4o", + messages=[{"role": "user", "content": "Hello"}] +) + +# Route to Anthropic (same code, different model string) +response = client.chat.completions.create( + model="anthropic/claude-sonnet-3.5", + messages=[{"role": "user", "content": "Hello"}] +) +---- + +Result: Change `model` parameter to switch providers. No code redeployment needed. + +=== 2. Policy-based routing and cost control + +Define routing rules, rate limits, and budgets once; enforce them automatically: + +Example: Tier-based routing + +[source,cel] +---- +// Route premium users to best model, free users to cost-effective model +request.headers["x-user-tier"] == "premium" + ? 
"anthropic/claude-opus-4" + : "anthropic/claude-sonnet-3.5" +---- + +Example: Environment-based budget + +// PLACEHOLDER: Confirm exact policy configuration format + +[source,yaml] +---- +rate_limits: + staging: 100 requests/minute + production: 10000 requests/minute + +spend_limits: + staging: $500/month + production: $50000/month +---- + +Example: Automatic failover + +// PLACEHOLDER: Add details on pool configuration and failback behavior + +* Primary: OpenAI GPT-4 +* Fallback: Anthropic Claude Opus on rate limits or timeouts +* Result: 99.9% uptime even during provider outages + +=== 3. MCP aggregation and orchestration + +Agent tool access without the overhead + +Before: Agent loads all tools from multiple MCP servers upfront + +* Sends 50+ tool definitions with every request +* High token costs (thousands of tokens per request) +* Slow agent startup +* No centralized governance + +After: AI Gateway aggregates MCP servers + +* Deferred tool loading: Only search + orchestrator tools loaded initially +* 80-90% token reduction, depending on configuration +* Agent queries for specific tools only when needed +* Centralized approval of MCP servers + +Orchestrator for complex workflows + +* Single JavaScript-based orchestrator tool +* Reduces multi-step workflows from multiple round trips to one call +* Example: "Search vector DB → if results insufficient → fallback to web search" + +// PLACEHOLDER: Add link to MCP aggregation guide when ready + +=== 4. Unified observability and cost tracking + +Single dashboard for all LLM traffic + +// PLACEHOLDER: Add screenshots of: +// - Request logs view +// - Cost breakdown by model/provider +// - Latency histogram +// - Error rate tracking + +Track across all requests: + +* Volume (requests per gateway, model, provider) +* Token usage (prompt + completion tokens) +* Estimated spend (per model, with cross-provider comparison) +* Latency (p50, p95, p99) +* Errors (by type, provider, model) + +Use cases: + +* "Which model is the most cost-effective for our use case?" +* "Why did this specific user request fail?" +* "How much does our staging environment cost us per week?" +* "What's the latency difference between OpenAI and Anthropic for our workload?" 
+ +== Cost comparison example + +// PLACEHOLDER: Insert real customer data or anonymized case study + +Scenario: SaaS chatbot with 1M requests/month, averaging 500 prompt + 300 completion tokens + +[cols="1,1,2"] +|=== +|Configuration |Monthly Cost |Notes + +|Direct integration (no gateway) +|// PLACEHOLDER: $X,XXX +|No caching, no routing optimization + +|AI Gateway (basic routing) +|// PLACEHOLDER: $X,XXX +|Provider failover, unified observability + +|Caching +|// PLACEHOLDER: $X,XXX +|// PLACEHOLDER: X% reduction from cache hits + +|Deferred tool loading +|// PLACEHOLDER: $X,XXX +|80-90% token reduction for agent workloads + +|Tier-based routing +|// PLACEHOLDER: $X,XXX +|Premium users → better model, free → cost-effective +|=== + +Total savings: // PLACEHOLDER: $X,XXX/month (XX% reduction) + +Hidden savings: + +* Developer time: No more managing multiple provider SDKs +* Incident response: Automatic failover reduces downtime costs +* Experimentation: Safe A/B testing without risking production + +== Common gateway patterns + +=== Pattern 1: Team isolation + +Use case: Multiple teams sharing infrastructure, need separate budgets and policies + +Setup: Create one gateway per team + +* Team A Gateway: $5K/month budget, staging + production environments +* Team B Gateway: $10K/month budget, different rate limits +* Each team sees only their traffic in observability dashboards + +// PLACEHOLDER: Link to multi-tenancy guide + +=== Pattern 2: Environment separation + +Use case: Prevent staging traffic from affecting production metrics + +Setup: Separate gateways for staging vs production + +* Staging Gateway: Lower rate limits, restricted model access, aggressive cost controls +* Production Gateway: High rate limits, all models enabled, alerting on anomalies + +=== Pattern 3: Primary and fallback for reliability + +Use case: Ensure uptime during provider outages + +Setup: Configure provider pools with automatic failover + +* Primary: OpenAI (preferred for quality) +* Fallback: Anthropic (activates on OpenAI rate limits or timeouts) +* Monitor fallback rate to detect primary provider issues early + +=== Pattern 4: A/B testing models + +Use case: Compare model quality/cost without dual integration + +Setup: Route percentage of traffic to different models + +// PLACEHOLDER: Confirm if percentage-based routing is supported, or if it's header-based only + +* 80% traffic → claude-sonnet-3.5 +* 20% traffic → claude-opus-4 +* Compare quality metrics and costs, then adjust + +=== Pattern 5: Customer-based routing + +Use case: SaaS product with tiered pricing (free, pro, enterprise) + +Setup: CEL routing based on request headers + +[source,cel] +---- +request.headers["x-customer-tier"] == "enterprise" ? "anthropic/claude-opus-4" : +request.headers["x-customer-tier"] == "pro" ? "anthropic/claude-sonnet-3.5" : +"anthropic/claude-haiku" +---- + +== Deployment model + +// PLACEHOLDER: Verify BYOC availability and any managed offering plans + +BYOC (Bring Your Own Cloud) + +* Currently available: BYOC version for // PLACEHOLDER: specific Redpanda version +* Deployment: Within your Redpanda Cloud cluster +* Data residency: All traffic stays in your cloud account +* Supported clouds: // PLACEHOLDER: AWS, GCP, Azure? 
+ +// PLACEHOLDER: If managed offering is planned, add: +// *Managed (Redpanda Cloud)* +// - Coming soon: Fully managed AI Gateway +// - No infrastructure management +// - Global deployment regions +// - Uptime SLA + +== What's supported today + +LLM providers + +// PLACEHOLDER: Confirm currently supported providers + +* OpenAI +* Anthropic +* // PLACEHOLDER: Google, AWS Bedrock, Azure OpenAI, others? + +API compatibility + +* OpenAI-compatible `/v1/chat/completions` endpoint +* // PLACEHOLDER: Streaming support? +* // PLACEHOLDER: Embeddings support? +* // PLACEHOLDER: Other endpoints? + +Policy features + +* CEL-based routing expressions +* Rate limiting (// PLACEHOLDER: per-gateway, per-header, per-tenant?) +* Monthly spend limits (// PLACEHOLDER: per-gateway, per-workspace?) +* Provider pools with automatic failover +* // PLACEHOLDER: Caching support? + +MCP support + +* MCP server aggregation +* Deferred tool loading (80-90% token reduction) +* JavaScript orchestrator for multi-step workflows +* // PLACEHOLDER: Tool execution sandboxing? + +Observability + +* Request logs with full prompt/response history +* Token usage tracking +* Estimated cost per request +* Latency metrics +* // PLACEHOLDER: Metrics export? OpenTelemetry support? + +== What's not supported yet + +// PLACEHOLDER: List current limitations, for example: +// - Custom model deployments (Azure OpenAI BYOK, AWS Bedrock custom models) +// - Response caching +// - Prompt templates/versioning +// - Guardrails (PII detection, content moderation) +// - Multi-region active-active deployment +// - Metrics export to external systems +// - Budget alerts/notifications + +== Architecture + +// PLACEHOLDER: Add architecture diagram showing: +// 1. Control Plane: +// - Workspace management +// - Provider/model configuration +// - Gateway creation and policy definition +// - Admin console +// +// 2. Data Plane: +// - Request ingestion +// - Policy evaluation (rate limits → spend limits → routing → execution) +// - Provider pool selection and failover +// - MCP aggregation layer +// - Response logging and metrics +// +// 3. Observability Plane: +// - Request logs storage +// - Metrics aggregation +// - Dashboard UI + +Request lifecycle: + +. Application sends request to gateway endpoint with `rp-aigw-id` header +. Gateway authenticates request +. Rate limit policy evaluates (allow/deny) +. Spend limit policy evaluates (allow/deny) +. Routing policy evaluates (which model/provider to use) +. Provider pool selects backend (primary/fallback) +. Request forwarded to LLM provider +. Response returned to application +. Request logged with tokens, cost, latency, status + +MCP request lifecycle: + +. Application discovers tools via `/mcp` endpoint +. Gateway aggregates tools from approved MCP servers +. Application receives search + orchestrator tools (deferred loading) +. Application invokes specific tool +. Gateway routes to appropriate MCP server +. Tool execution result returned +. Request logged with execution time, status + +== Next steps + +* xref:ai-agents:ai-gateway/ai-gateway.adoc[]: Route your first request through AI Gateway. +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Configure MCP server aggregation for AI agents. +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor request logs, token usage, and costs. 
diff --git a/modules/ai-agents/pages/ai-gateway/ai-gateway.adoc b/modules/ai-agents/pages/ai-gateway/ai-gateway.adoc new file mode 100644 index 00000000..194e48bc --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/ai-gateway.adoc @@ -0,0 +1,351 @@ += AI Gateway Quickstart +:description: Quickstart to configure the AI Gateway for unified access to multiple LLM providers and MCP servers through a single endpoint. + + +NOTE: AI Gateway is supported on BYOC clusters running Redpanda version 25.3 and later. + +The Redpanda AI Gateway is a production-grade proxy that provides unified access to multiple Large Language Model (LLM) providers and Model Context Protocol (MCP) servers through a single endpoint. MCP servers expose tools that agents can discover and call. An AI Gateway maintains centralized control over routing, rate limiting, cost optimization, security, and observability. + +After completing this quickstart, you will be able to: + +* Route your first LLM request through AI Gateway using the Cloud Console and verify it in the observability dashboard. +* Configure a provider and gateway with correct authentication and routing policies. +* Test failover behavior and CEL routing rules in a development environment. + +== Prerequisites + +* Access to the AI Gateway UI (provided by your administrator) +* API key for at least one LLM provider: OpenAI or Anthropic +* Optional: MCP server endpoints if you plan to use tool aggregation + +== Get started + +Before you can create a gateway, an administrator must enable LLM providers and models. + +=== Step 1: Enable a provider + +Providers represent upstream services (Anthropic, OpenAI) and associated credentials. Providers are disabled by default. An administrator must enable them explicitly by adding credentials. + +. In AI Gateways, navigate to *Providers*. +. Select a provider (for example, Anthropic). +. On the *Configuration* tab for the provider, click *Add configuration* and enter your API Key. + +=== Step 2: Enable models + +The model catalog is the set of models made available through the gateway. Models are disabled by default. After enabling a provider, an administrator can enable its models. + +The infrastructure that is serving the model is different based on the provider you select. For example, OpenAI has different reliability and availability metrics than Anthropic. When you consider all the metrics, you can design your gateway to use different providers for different use cases. + +. Navigate to *Models*. +. Enable the models you want exposed through gateways. + +==== Model naming convention + +Model provider requests must use the `vendor/model_id` format in the model property of the request body, and include the `rp-aigw-id` header with the gateway ID the request is being sent to. The following example routes OpenAI API calls through Redpanda's AI Gateway for centralized control. + +[source,python] +---- +# Example: Using the OpenAI Python SDK with AI Gateway +from openai import OpenAI + +client = OpenAI( + base_url="https://gw.ai.panda.com", <1> + api_key="your-api-key", +) + +# Add header per request +response = client.chat.completions.create( + model="openai/gpt-5", <2> + messages=[{"role": "user", "content": "Hello!"}], + extra_headers={ + "rp-aigw-id": "gateway-abc" # Override for this request + } <3> +) +---- +<1> This redirects the OpenAI client to the AI Gateway endpoint. +<2> The `model` property uses the `vendor/model_id` format as required by the AI Gateway. 
+<3> Includes the `rp-aigw-id` header to specify which gateway configuration to use. + +=== Step 3: Create a gateway + +A gateway is a logical configuration boundary (policies + routing + observability) on top of a single deployment. It's a "virtual gateway" that you can create per team, environment (staging/production), product, or customer. + +. Navigate to *Gateways*. +. Click *Create Gateway*. +. Choose a name, workspace, and optional metadata. ++ +TIP: A _workspace_ is conceptually similar to a _resource group_ in Redpanda streaming. + +. After creation, copy the *Gateway Endpoint* from the gateway detail page. + +=== Step 4: Configure LLM routing + +On the Gateways page, select the *LLM* tab to configure rate limits, spend limits, routing, and provider pools with fallback options. + +The LLM routing pipeline visually represents the request lifecycle: + +. Rate Limit: For example, global rate limit of 100 requests/second. +. Spend Limit / Monthly Budget: For example, $15K/month with blocking enforcement, so it blocks requests after that budget is exceeded. +. Routing to a primary provider pool with optional fallback provider pools: For example, primary route to Anthropic backend pool, and if that fails, it will fallback to OpenAI pool. + +*Load balancing / multi-provider distribution:* +If a provider pool contains multiple providers, you can distribute traffic (for example, balancing across Anthropic and OpenAI). + +TIP: Provider pool (UI) = Backend pool (API) + +=== Step 5: Configure MCP tools + +On the Gateways page, select the *MCP* tab to configure your MCP tool discovery and tool execution. This MCP proxy is an aggregator of MCP servers, allowing multiple MCP servers behind a single endpoint. Agents can then find tools and call them through the gateway. To configure the MCP proxy, add the following: + +* Display name: When you drag a provider pool, you give it a name. +* Model dropdown: Choose a model from the available models in the catalog. +* Load Balancing options: If you have multiple providers, you can load balance requests between them; for example, round robin. + +MCP tools include a data catalog API, the memory store, a vector search service, and an MCP orchestrator. The *MCP orchestrator* is a built-in MCP server that enables programmatic tool calling. Agents can generate code to call multiple tools in a single orchestrated step, which reduces the number of round trips. For example, a workflow requiring 47 file reads can be reduced from 49 round trips to just 1. To add other tools, (for example, Slack), add the Slack MCP server endpoint. + +When many tools are aggregated, listing all tools can consume significant tokens. With *deferred tool loading*, instead of returning all tools, the MCP gateway initially returns a tool search capability and the MCP orchestrator. The agent then searches for the specific tool it needs and retrieves only that subset. That way, the exchange of messages between the MCP gateway and the agent is small. This can reduce token usage significantly when you have many tools configured. + +*REVIEWERS: When/how exactly do you use the orchestrator? Also what happens after they create a gateway? 
Please provide an example of how to validate end-to-end routing against the gateway endpoint!* + +*REVIEWERS: How do users connect to the ADP catalog + MCP servers exposed through RPCN?* + +== Observability + +After traffic flows through a gateway, you can inspect: + +* Request volume +* Token usage +* Estimated spend +* Latency +* Per-model breakdown + +This is central to governance: You can see and control usage by gateway boundary (for example, by team, environment, customer, or product). + +*REVIEWERS: Where do those metrics appear in the UI, or how does a user validate observability after setup?* + +== CEL routing + +The AI Gateway uses Common Expression Language (CEL) for flexible routing and policy application. CEL expressions let you create sophisticated routing rules based on request properties without code changes. Use CEL to: + +* Route requests to specific providers based on model family +* Apply different rate limits based on user tiers +* Enforce policies based on request content + +The editor in the UI helps you discover available request fields (headers, path, body, and so on). + +=== CEL examples + +Route based on model family: + +[,cel] +---- +request.body.model.startsWith("anthropic/") +---- + +Apply a rule to all requests: + +[,cel] +---- +true +---- + +Route based on a header (for example, product tier): + +[,cel] +---- +request.headers['tier'][0] == "premium" +---- + +Guard for field existence: + +[,cel] +---- +has(request.body.max_tokens) && request.body.max_tokens > 1000 +---- + +== Integrate with AI agents and tools + +The AI Gateway provides standardized endpoints that work with various AI development tools and agents. This section shows how to configure popular tools to use your AI Gateway endpoints. + +=== MCP server endpoint + +If you've configured MCP tools in your gateway, AI agents can connect to the aggregated MCP endpoint: + +* MCP endpoint URL: `https://gw.ai.panda.com/mcp` + +* Headers required: +** `Authorization: Bearer your-api-key` +** `rp-aigw-id: your-gateway-id` + +This endpoint aggregates all MCP servers configured in your gateway, providing a unified interface for tool discovery and execution. + +=== Environment variables + +For consistent configuration across tools, set these environment variables: + +[source,bash] +---- +export REDPANDA_GATEWAY_URL="https://gw.ai.panda.com" +export REDPANDA_GATEWAY_ID="your-gateway-id" +export REDPANDA_API_KEY="your-api-key" +---- + +Many tools and SDKs can automatically use these environment variables when configured appropriately. + +=== Claude Code + +Configure Claude Code to use AI Gateway endpoints using HTTP transport for the MCP connection. 
+ +*For Claude Code CLI:* + +Use the `claude mcp add` command to configure the HTTP transport: + +[source,bash] +---- +claude mcp add --transport http redpanda-aigateway https://gw.ai.panda.com/mcp \ + --header "Authorization: Bearer YOUR_API_KEY" \ + --header "rp-aigw-id: GATEWAY_ID" +---- + +*Alternative configuration via config file:* + +Create or edit `~/.claude/config.json`: + +[source,json] +---- +{ + "mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_KEY", + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + "apiProviders": { + "redpanda": { + "baseURL": "https://gw.ai.panda.com", + "headers": { + "rp-aigw-id": "your-gateway-id" + } + } + } +} +---- + +=== VS Code extensions + +Configure VS Code extensions that support OpenAI-compatible APIs: + +*Continue extension:* + +Edit your Continue config file (`~/.continue/config.json`): + +[source,json] +---- +{ + "models": [ + { + "title": "Redpanda AI Gateway - GPT-4", + "provider": "openai", + "model": "openai/gpt-4", + "apiBase": "https://gw.ai.panda.com", + "apiKey": "your-api-key", + "requestOptions": { + "headers": { + "rp-aigw-id": "your-gateway-id" + } + } + }, + { + "title": "Redpanda AI Gateway - Claude", + "provider": "anthropic", + "model": "anthropic/claude-3-5-sonnet-20241022", + "apiBase": "https://gw.ai.panda.com", + "apiKey": "your-api-key", + "requestOptions": { + "headers": { + "rp-aigw-id": "your-gateway-id" + } + } + } + ] +} +---- + +=== Cursor IDE + +Configure Cursor to route requests through the AI Gateway: + +. Open Cursor Settings (*Cursor* → *Settings* or `Cmd+,`) +. Navigate to *AI* settings +. Add a custom OpenAI-compatible provider: + +[source,json] +---- +{ + "cursor.ai.providers.openai.apiBase": "https://gw.ai.panda.com", + "cursor.ai.providers.openai.defaultHeaders": { + "rp-aigw-id": "your-gateway-id" + } +} +---- + +=== Custom applications + +For custom applications using OpenAI or Anthropic SDKs: + +*OpenAI SDK (Python):* + +[source,python] +---- +from openai import OpenAI + +client = OpenAI( + base_url="https://gw.ai.panda.com", + api_key="your-api-key", + default_headers={ + "rp-aigw-id": "your-gateway-id" + } +) +---- + +*Anthropic SDK (Python):* + +[source,python] +---- +from anthropic import Anthropic + +client = Anthropic( + base_url="https://gw.ai.panda.com", + api_key="your-api-key", + default_headers={ + "rp-aigw-id": "your-gateway-id" + } +) +---- + +*Node.js with OpenAI SDK:* + +[source,javascript] +---- +import OpenAI from 'openai'; + +const openai = new OpenAI({ + baseURL: 'https://gw.ai.panda.com', + apiKey: process.env.OPENAI_API_KEY, + defaultHeaders: { + 'rp-aigw-id': 'your-gateway-id' + } +}); +---- + +== Next steps + +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[]: Learn about AI Gateway architecture, deployment models, and common usage patterns. +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Explore advanced CEL routing patterns for traffic distribution, cost optimization, and failover. +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Configure MCP server aggregation and deferred tool loading for AI agents. +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor request logs, token usage, and costs through the observability dashboard. +* xref:ai-agents:ai-gateway/integrations/index.adoc[]: Connect AI development tools like Claude Code, Cursor, and Continue to your gateway. 
\ No newline at end of file diff --git a/modules/ai-agents/pages/ai-gateway/cel-routing-cookbook.adoc b/modules/ai-agents/pages/ai-gateway/cel-routing-cookbook.adoc new file mode 100644 index 00000000..9ca04c05 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/cel-routing-cookbook.adoc @@ -0,0 +1,955 @@ += CEL Routing Cookbook +:description: CEL routing cookbook for Redpanda AI Gateway with common patterns, examples, and best practices. +:page-personas: app_developer, platform_admin + +Redpanda AI Gateway uses CEL (Common Expression Language) for dynamic request routing. CEL expressions evaluate request properties (headers, body, context) and determine which model or provider should handle each request. + +After reading this page, you will be able to: + +* Write CEL expressions to route requests based on user tier, environment, content complexity, or custom headers. +* Test CEL routing logic using the UI editor or test requests to verify expected model selection. +* Troubleshoot common CEL errors (type mismatches, missing fields, index out of bounds) using safe patterns. + +CEL enables: + +* User-based routing (free vs premium tiers) +* Content-based routing (by prompt topic, length, complexity) +* Environment-based routing (staging vs production models) +* Cost controls (reject expensive requests in test environments) +* A/B testing (route percentage of traffic to new models) +* Geographic routing (by region header) +* Custom business logic (any condition you can express) + +== CEL basics + +=== What is CEL? + +CEL (Common Expression Language) is a non-Turing-complete expression language designed for fast, safe evaluation. It's used by Google (Firebase, Cloud IAM), Kubernetes, Envoy, and other systems. + +Key properties: + +* Safe: Cannot loop infinitely or access system resources +* Fast: Evaluates in microseconds +* Readable: Similar to Python/JavaScript expressions +* Type-safe: Errors caught at configuration time, not runtime + +=== CEL syntax primer + +Comparison operators: + +[source,cel] +---- +== // equal +!= // Not equal +< // Less than +> // Greater than +<= // Less than or equal +>= // Greater than or equal +---- + + +Logical operators: + +[source,cel] +---- +&& // AND +|| // OR +! // NOT +---- + + +Ternary operator (most common pattern): + +[source,cel] +---- +condition ? value_if_true : value_if_false +---- + + +Functions: + +[source,cel] +---- +.size() // Length of string or array +.contains("text") // String contains substring +.startsWith("x") // String starts with +.endsWith("x") // String ends with +.matches("regex") // Regex match +has(field) // Check if field exists +---- + + +Examples: + +[source,cel] +---- +// Simple comparison +request.headers["tier"] == "premium" + +// Ternary (if-then-else) +request.headers["tier"] == "premium" ? "openai/gpt-4o" : "openai/gpt-4o-mini" + +// Logical AND +request.headers["tier"] == "premium" && request.headers["region"] == "us" + +// String contains +request.body.messages[0].content.contains("urgent") + +// Size check +request.body.messages.size() > 10 +---- + + +== Request object schema + +CEL expressions evaluate against the `request` object, which contains: + +// PLACEHOLDER: Confirm exact schema + +=== `request.headers` (map) + +All HTTP headers (lowercase keys). 
+ +[source,cel] +---- +request.headers["x-user-tier"] // Custom header +request.headers["x-customer-id"] // Custom header +request.headers["user-agent"] // Standard header +request.headers["x-request-id"] // Standard header +---- + + +NOTE: Header names are case-insensitive in HTTP, but CEL requires lowercase keys. + +=== `request.body` (object) + +The JSON request body (for `/chat/completions`). + +[source,cel] +---- +request.body.model // String: Requested model +request.body.messages // Array: Conversation messages +request.body.messages[0].role // String: "system", "user", "assistant" +request.body.messages[0].content // String: Message content +request.body.messages.size() // Int: Number of messages +request.body.max_tokens // Int: Max completion tokens (if set) +request.body.temperature // Float: Temperature (if set) +request.body.stream // Bool: Streaming enabled (if set) +---- + + +NOTE: Fields are optional. Use `has()` to check existence: + +[source,cel] +---- +has(request.body.max_tokens) ? request.body.max_tokens : 1000 +---- + + +=== `request.path` (string) + +The request path. + +[source,cel] +---- +request.path == "/v1/chat/completions" +request.path.startsWith("/v1/") +---- + + +=== `request.method` (string) + +The HTTP method. + +[source,cel] +---- +request.method == "POST" +---- + + +// PLACEHOLDER: Are there other fields? User context? Gateway context? Timestamp? + +== CEL routing patterns + +Each pattern follows this structure: + +* When to use: Scenario description +* Expression: CEL code +* What happens: Routing behavior +* Verify: How to test +* Cost/performance impact: Implications + +=== Pattern 1: Tier-based routing + +When to use: Different user tiers (free, pro, enterprise) should get different model quality + +Expression: + +[source,cel] +---- +request.headers["x-user-tier"] == "enterprise" ? "openai/gpt-4o" : +request.headers["x-user-tier"] == "pro" ? "anthropic/claude-sonnet-3.5" : +"openai/gpt-4o-mini" +---- + + +What happens: + +* Enterprise users → GPT-4o (best quality) +* Pro users → Claude Sonnet 3.5 (balanced) +* Free users → GPT-4o-mini (cost-effective) + +Verify: + +[source,python] +---- +# Test enterprise +response = client.chat.completions.create( + model="auto", # PLACEHOLDER: How to trigger CEL routing? + messages=[{"role": "user", "content": "Test"}], + extra_headers={"x-user-tier": "enterprise"} +) +# Check logs: Should route to openai/gpt-4o + +# Test free +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Test"}], + extra_headers={"x-user-tier": "free"} +) +# Check logs: Should route to openai/gpt-4o-mini +---- + + +Cost impact: + +* Enterprise: ~$5.00 per 1K requests +* Pro: ~$3.50 per 1K requests +* Free: ~$0.50 per 1K requests + +Use case: SaaS product with tiered pricing where model quality is a differentiator + +=== Pattern 2: Environment-based routing + +When to use: Prevent staging from using expensive models + +Expression: + +[source,cel] +---- +request.headers["x-environment"] == "production" + ? 
"openai/gpt-4o" + : "openai/gpt-4o-mini" +---- + + +What happens: + +* Production → GPT-4o (best quality) +* Staging/dev → GPT-4o-mini (10x cheaper) + +Verify: + +[source,python] +---- +# Set environment header +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Test"}], + extra_headers={"x-environment": "staging"} +) +# Check logs: Should route to gpt-4o-mini +---- + + +Cost impact: + +* Prevents staging from inflating costs +* Example: Staging with 100K test requests/day + * GPT-4o: $500/day ($15K/month) + * GPT-4o-mini: $50/day ($1.5K/month) + * *Savings: $13.5K/month* + +Use case: Protect against runaway staging costs + +''' + +=== Pattern 3: Content-length guard rails + +When to use: Block or downgrade long prompts to prevent cost spikes + +Expression (Block): + +[source,cel] +---- +request.body.messages.size() > 10 || request.body.max_tokens > 4000 + ? "reject" + : "openai/gpt-4o" +---- + + +What happens: +* Requests with >10 messages or >4000 max_tokens → Rejected with 400 error +* Normal requests → GPT-4o + +Expression (Downgrade): + +[source,cel] +---- +request.body.messages.size() > 10 || request.body.max_tokens > 4000 + ? "openai/gpt-4o-mini" // Cheaper model + : "openai/gpt-4o" // Normal model +---- + + +What happens: + +* Long conversations → Downgraded to cheaper model +* Short conversations → Premium model + +Verify: + +[source,python] +---- +# Test rejection +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": f"Message {i}"} for i in range(15)], + max_tokens=5000 +) +# Should return 400 error (rejected) + +# Test normal +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Short message"}], + max_tokens=100 +) +# Should route to gpt-4o +---- + + +Cost impact: + +* Prevents unexpected bills from verbose prompts +* Example: Block requests >10K tokens (would cost $0.15 each) + +Use case: Staging cost controls, prevent prompt injection attacks that inflate token usage + +=== Pattern 4: Topic-based routing + +When to use: Route different question types to specialized models + +Expression: + +[source,cel] +---- +request.body.messages[0].content.contains("code") || +request.body.messages[0].content.contains("debug") || +request.body.messages[0].content.contains("programming") + ? "openai/gpt-4o" // Better at code + : "anthropic/claude-sonnet-3.5" // Better at general writing +---- + + +What happens: + +* Coding questions → GPT-4o (optimized for code) +* General questions → Claude Sonnet (better prose) + +Verify: + +[source,python] +---- +# Test code question +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Debug this Python code: ..."}] +) +# Check logs: Should route to gpt-4o + +# Test general question +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Write a blog post about AI"}] +) +# Check logs: Should route to claude-sonnet-3.5 +---- + + +Cost impact: + +* Optimize model selection for task type +* Could improve quality without increasing costs + +Use case: Multi-purpose chatbot with both coding and general queries + + +=== Pattern 5: Geographic/regional routing + +When to use: Route by user region for compliance or latency optimization + +Expression: + +[source,cel] +---- +request.headers["x-user-region"] == "eu" + ? 
"openai/gpt-4o-eu" // PLACEHOLDER: If regional models exist + : "openai/gpt-4o" +---- + + +What happens: + +* EU users → EU-region model (GDPR compliance) +* Other users → Default region + +Verify: + +[source,python] +---- +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Test"}], + extra_headers={"x-user-region": "eu"} +) +# Check logs: Should route to EU model +---- + + +Cost impact: Neutral (same model, different region) + +Use case: GDPR compliance, data residency requirements + + +=== Pattern 6: Customer-specific routing + +When to use: Different customers have different model access (enterprise features) + +Expression: + +[source,cel] +---- +request.headers["x-customer-id"] == "customer_vip_123" + ? "anthropic/claude-opus-4" // Most expensive, best quality + : "anthropic/claude-sonnet-3.5" // Standard +---- + + +What happens: + +* VIP customer → Best model +* Standard customers → Normal model + +Verify: + +[source,python] +---- +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Test"}], + extra_headers={"x-customer-id": "customer_vip_123"} +) +# Check logs: Should route to claude-opus-4 +---- + + +Cost impact: + +* VIP: ~$7.50 per 1K requests +* Standard: ~$3.50 per 1K requests + +Use case: Enterprise contracts with premium model access + + +=== Pattern 7: a/b testing (percentage-based routing) + +When to use: Test new models with a percentage of traffic + +// PLACEHOLDER: Confirm if CEL can access random functions or if A/B testing requires different mechanism + +Expression (if random is available): + +[source,cel] +---- +// PLACEHOLDER: Verify CEL random function availability +random() < 0.10 + ? "anthropic/claude-opus-4" // 10% traffic to new model + : "openai/gpt-4o" // 90% traffic to existing model +---- + + +Alternative (hash-based): + +[source,cel] +---- +// Use customer ID hash for stable routing +hash(request.headers["x-customer-id"]) % 100 < 10 + ? "anthropic/claude-opus-4" + : "openai/gpt-4o" +---- + + +What happens: + +* 10% of requests → New model (Opus 4) +* 90% of requests → Existing model (GPT-4o) + +Verify: + +[source,python] +---- +# Send 100 requests, count which model was used +for i in range(100): + response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": f"Test {i}"}], + extra_headers={"x-customer-id": f"customer_{i}"} + ) +# Check logs: ~10 should use opus-4, ~90 should use gpt-4o +---- + + +Cost impact: + +* Allows safe, incremental rollout of new models +* Monitor quality/cost for new model before full adoption + +Use case: Evaluate new models in production with real traffic + +=== Pattern 8: Complexity-based routing + +When to use: Route simple queries to cheap models, complex queries to expensive models + +Expression: + +[source,cel] +---- +request.body.messages.size() == 1 && +request.body.messages[0].content.size() < 100 + ? "openai/gpt-4o-mini" // Simple, short question + : "openai/gpt-4o" // Complex or long conversation +---- + + +What happens: + +* Single short message (<100 chars) → Cheap model +* Multi-turn or long messages → Premium model + +Verify: + +[source,python] +---- +# Test simple +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Hi"}] # 2 chars +) +# Check logs: Should route to gpt-4o-mini + +# Test complex +response = client.chat.completions.create( + model="auto", + messages=[ + {"role": "user", "content": "Long question here..." 
* 10}, + {"role": "assistant", "content": "Response"}, + {"role": "user", "content": "Follow-up"} + ] +) +# Check logs: Should route to gpt-4o +---- + + +Cost impact: + +* Can reduce costs significantly if simple queries are common +* Example: 50% of queries are simple, save 90% on those = 45% total savings + +Use case: FAQ chatbot with mix of simple lookups and complex questions + +=== Pattern 9: Time-based routing + +When to use: Use cheaper models during off-peak hours + +// PLACEHOLDER: Confirm if CEL has access to current timestamp + +Expression (if time functions available): + +[source,cel] +---- +// PLACEHOLDER: Verify CEL time function availability +now().hour >= 22 || now().hour < 6 // 10pm - 6am + ? "openai/gpt-4o-mini" // Off-peak: cheaper model + : "openai/gpt-4o" // Peak hours: best model +---- + + +What happens: + +* Off-peak hours (10pm-6am) → Cheap model +* Peak hours (6am-10pm) → Premium model + +Cost impact: + +* Optimize for user experience during peak usage +* Save costs during low-traffic hours + +Use case: Consumer apps with time-zone-specific usage patterns + + +=== Pattern 10: Fallback chain (multi-level) + +When to use: Complex fallback logic beyond simple primary/secondary + +Expression: + +[source,cel] +---- +request.headers["x-priority"] == "critical" + ? "openai/gpt-4o" // First choice for critical + : request.headers["x-user-tier"] == "premium" + ? "anthropic/claude-sonnet-3.5" // Second choice for premium + : "openai/gpt-4o-mini" // Default for everyone else +---- + + +What happens: + +* Critical requests → Always GPT-4o +* Premium non-critical → Claude Sonnet +* Everyone else → GPT-4o-mini + +Verify: Test with different header combinations + +Cost impact: Ensures SLA for critical requests while optimizing costs elsewhere + +Use case: Production systems with SLA requirements + + +== Advanced CEL patterns + +=== Pattern: Default values with `has()` + +Problem: Field might not exist in request + +Expression: + +[source,cel] +---- +has(request.body.max_tokens) && request.body.max_tokens > 2000 + ? "openai/gpt-4o" // Long response expected + : "openai/gpt-4o-mini" // Short response +---- + + +What happens: Safely checks if `max_tokens` exists before comparing + +=== Pattern: Multiple conditions with parentheses + +Expression: + +[source,cel] +---- +(request.headers["x-user-tier"] == "premium" || + request.headers["x-customer-id"] == "vip_123") && +request.headers["x-environment"] == "production" + ? "openai/gpt-4o" + : "openai/gpt-4o-mini" +---- + + +What happens: Premium users OR VIP customer, AND production → GPT-4o + +=== Pattern: Regex matching + +Expression: + +[source,cel] +---- +request.body.messages[0].content.matches("(?i)(urgent|asap|emergency)") + ? "openai/gpt-4o" // Route urgent requests to best model + : "openai/gpt-4o-mini" +---- + + +What happens: Messages containing "urgent", "ASAP", or "emergency" (case-insensitive) → GPT-4o + +=== Pattern: String array contains + +Expression: + +[source,cel] +---- +["customer_1", "customer_2", "customer_3"].exists(c, c == request.headers["x-customer-id"]) + ? "openai/gpt-4o" // Whitelist of customers + : "openai/gpt-4o-mini" +---- + + +What happens: Only specific customers get premium model + +=== Pattern: Reject invalid requests + +Expression: + +[source,cel] +---- +!has(request.body.messages) || request.body.messages.size() == 0 + ? 
"reject" // PLACEHOLDER: Confirm "reject" is supported + : "openai/gpt-4o" +---- + + +What happens: Requests without messages are rejected (400 error) + +== Test CEL expressions + +=== Option 1: CEL editor in UI (if available) + +// PLACEHOLDER: Add screenshot if UI has CEL editor with test mode + +1. Navigate to Gateways → Routing Rules +2. Enter CEL expression +3. Click "Test" +4. Input test headers/body +5. View evaluated result + +=== Option 2: Send test requests + +[source,python] +---- +def test_cel_routing(headers, messages): + """Test CEL routing with specific headers and messages""" + response = client.chat.completions.create( + model="auto", # PLACEHOLDER: Confirm trigger for CEL routing + messages=messages, + extra_headers=headers, + max_tokens=10 # Keep it cheap + ) + + # Check logs to see which model was used + print(f"Headers: {headers}") + print(f"Routed to: {response.model}") # PLACEHOLDER: Does response include actual model? + +# Test tier-based routing +test_cel_routing( + {"x-user-tier": "premium"}, + [{"role": "user", "content": "Test"}] +) +test_cel_routing( + {"x-user-tier": "free"}, + [{"role": "user", "content": "Test"}] +) +---- + + +=== Option 3: CLI test (if available) + +[source,bash] +---- +# PLACEHOLDER: If CLI tool exists for testing CEL +rpk cloud ai-gateway test-cel \ + --gateway-id gw_abc123 \ + --expression 'request.headers["tier"] == "premium" ? "openai/gpt-4o" : "openai/gpt-4o-mini"' \ + --header 'tier: premium' \ + --body '{"messages": [{"role": "user", "content": "Test"}]}' + +# Expected output: openai/gpt-4o +---- + + +== Common CEL errors + +=== Error: "unknown field" + +Symptom: + +[source,text] +---- +Error: Unknown field 'request.headers.x-user-tier' +---- + + +Cause: Wrong syntax (dot notation instead of bracket notation for headers) + +Fix: + +[source,cel] +---- +// Wrong +request.headers.x-user-tier + +// Correct +request.headers["x-user-tier"] +---- + + +=== Error: "type mismatch" + +Symptom: + +[source,text] +---- +Error: Type mismatch: expected bool, got string +---- + + +Cause: Forgot comparison operator + +Fix: + +[source,cel] +---- +// Wrong (returns string) +request.headers["tier"] + +// Correct (returns bool) +request.headers["tier"] == "premium" +---- + + +=== Error: "field does not exist" + +Symptom: + +[source,text] +---- +Error: No such key: max_tokens +---- + + +Cause: Accessing field that doesn't exist in request + +Fix: +[source,cel] +---- +// Wrong (crashes if max_tokens not in request) +request.body.max_tokens > 1000 + +// Correct (checks existence first) +has(request.body.max_tokens) && request.body.max_tokens > 1000 +---- + + +=== Error: "index out of bounds" + +Symptom: + +[source,text] +---- +Error: Index 0 out of bounds for array of size 0 +---- + + +Cause: Accessing array element that doesn't exist + +Fix: + +[source,cel] +---- +// Wrong (crashes if messages empty) +request.body.messages[0].content.contains("test") + +// Correct (checks size first) +request.body.messages.size() > 0 && request.body.messages[0].content.contains("test") +---- + + +== CEL performance considerations + +=== Expression complexity + +Fast (<1ms evaluation): + +[source,cel] +---- +request.headers["tier"] == "premium" ? "openai/gpt-4o" : "openai/gpt-4o-mini" +---- + + +Slower (~5-10ms evaluation): + +[source,cel] +---- +request.body.messages[0].content.matches("complex.*regex.*pattern") +---- + + +Recommendation: Keep expressions simple. Complex regex can add latency. + +=== Number of evaluations + +Each request evaluates CEL expression once. 
Total latency impact: +* Simple expression: <1ms +* Complex expression: ~5-10ms + +*Acceptable for most use cases.* + +== CEL function reference + +// PLACEHOLDER: Comprehensive list of available CEL functions in AI Gateway + +=== String functions + +[cols="2,3,3"] +|=== +| Function | Description | Example + +| `size()` +| String length +| `"hello".size() == 5` + +| `contains(s)` +| String contains +| `"hello".contains("ell")` + +| `startsWith(s)` +| String starts with +| `"hello".startsWith("he")` + +| `endsWith(s)` +| String ends with +| `"hello".endsWith("lo")` + +| `matches(regex)` +| Regex match +| `"hello".matches("h.*o")` +|=== + +=== Array functions + +[cols="2,3,3"] +|=== +| Function | Description | Example + +| `size()` +| Array length +| `[1,2,3].size() == 3` + +| `exists(x, cond)` +| Any element matches +| `[1,2,3].exists(x, x > 2)` + +| `all(x, cond)` +| All elements match +| `[1,2,3].all(x, x > 0)` +|=== + +=== Utility functions + +[cols="2,3,3"] +|=== +| Function | Description | Example + +| `has(field)` +| Field exists +| `has(request.body.max_tokens)` +|=== + +// PLACEHOLDER: Other functions like hash(), random(), now()? + +== Next steps + +* *Apply CEL routing* → [Gateway Configuration Guide](// PLACEHOLDER: link) +* *Monitor routing decisions* → [Observability: Logs](// PLACEHOLDER: link) diff --git a/modules/ai-agents/pages/ai-gateway/index.adoc b/modules/ai-agents/pages/ai-gateway/index.adoc new file mode 100644 index 00000000..a84ffbf2 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/index.adoc @@ -0,0 +1,3 @@ += AI Gateway +:description: Learn how to configure the AI Gateway for unified access to multiple LLM providers and MCP servers through a single endpoint. +:page-layout: index diff --git a/modules/ai-agents/pages/ai-gateway/integrations/claude-code-admin.adoc b/modules/ai-agents/pages/ai-gateway/integrations/claude-code-admin.adoc new file mode 100644 index 00000000..e8a9b962 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/claude-code-admin.adoc @@ -0,0 +1,511 @@ += Configure AI Gateway for Claude Code +:description: Configure Redpanda AI Gateway to support Claude Code clients. +:page-topic-type: how-to +:page-personas: platform_admin +:learning-objective-1: Configure AI Gateway endpoints for Claude Code connectivity +:learning-objective-2: Set up authentication and access control for Claude Code clients +:learning-objective-3: Deploy MCP tool aggregation for Claude Code tool discovery + +Configure Redpanda AI Gateway to support Claude Code clients accessing LLM providers and MCP tools through a unified endpoint. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +* AI Gateway deployed on a BYOC cluster running Redpanda version 25.3 or later +* Administrator access to the AI Gateway UI +* At least one LLM provider API key (OpenAI or Anthropic) +* Understanding of xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[AI Gateway concepts] + +== Architecture overview + +Claude Code connects to AI Gateway through two primary endpoints: + +* LLM endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1` for chat completions +* MCP endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp` for tool discovery and execution + +The gateway handles: + +. Authentication via bearer tokens in the `Authorization` header +. Gateway selection via the `rp-aigw-id` header +. Model routing using the `vendor/model_id` format +. 
MCP server aggregation for multi-tool workflows +. Request logging and cost tracking per gateway + +== Enable LLM providers + +Claude Code requires access to LLM providers through the gateway. Enable at least one provider. + +=== Configure Anthropic + +Claude Code uses Anthropic models by default. To enable Anthropic: + +. Navigate to *AI Gateway* > *Providers* in the Redpanda Cloud console +. Select *Anthropic* from the provider list +. Click *Add configuration* +. Enter your Anthropic API key +. Click *Save* + +The gateway can now route requests to Anthropic models. + +=== Configure OpenAI + +To enable OpenAI as a provider: + +. Navigate to *AI Gateway* > *Providers* +. Select *OpenAI* from the provider list +. Click *Add configuration* +. Enter your OpenAI API key +. Click *Save* + +=== Enable models in the catalog + +After enabling providers, enable specific models: + +. Navigate to *AI Gateway* > *Models* +. Enable the models you want Claude Code clients to access ++ +Common models for Claude Code: ++ +* `anthropic/claude-opus-4-5` +* `anthropic/claude-sonnet-4-5` +* `openai/gpt-4o` +* `openai/o1-mini` + +. Click *Save* + +Models appear in the catalog with the `vendor/model_id` format that Claude Code uses in requests. + +== Create a gateway for Claude Code clients + +Create a dedicated gateway to isolate Claude Code traffic and apply specific policies. + +=== Gateway configuration + +. Navigate to *AI Gateway* > *Gateways* +. Click *Create Gateway* +. Enter gateway details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`claude-code-gateway` (or your preferred name) + +|Workspace +|Select the workspace for access control grouping + +|Description +|Gateway for Claude Code IDE clients +|=== + +. Click *Create* +. Copy the gateway ID from the gateway details page + +The gateway ID is required in the `rp-aigw-id` header for all requests. + +=== Configure LLM routing + +Set up routing policies for Claude Code requests. + +==== Basic routing with failover + +Configure a primary provider with automatic failover: + +. Navigate to the gateway's *LLM* tab +. Under *Routing*, click *Add route* +. Configure the route: ++ +[source,cel] +---- +true # Matches all requests +---- + +. Add a *Primary provider pool*: ++ +* Provider: Anthropic +* Model: All enabled Anthropic models +* Load balancing: Round robin (if multiple Anthropic configurations exist) + +. Add a *Fallback provider pool*: ++ +* Provider: OpenAI +* Model: All enabled OpenAI models +* Failover conditions: Rate limits, timeouts, 5xx errors + +. Click *Save* + +Claude Code requests route to Anthropic by default and fail over to OpenAI if Anthropic is unavailable. + +==== User-based routing + +Route requests based on user identity (if Claude Code passes user identifiers): + +[source,cel] +---- +request.headers["x-user-tier"][0] == "premium" +---- + +Create separate routes: + +* Premium route: Claude Opus 4.5 (highest quality) +* Standard route: Claude Sonnet 4.5 (balanced cost and quality) + +=== Apply rate limits + +Prevent runaway usage from Claude Code clients: + +. Navigate to the gateway's *LLM* tab +. Under *Rate Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Recommended Value + +|Global rate limit +|100 requests per minute + +|Per-user rate limit +|10 requests per minute (if using user headers) +|=== + +. Click *Save* + +The gateway blocks requests exceeding these limits and returns HTTP 429 errors. + +=== Set spending limits + +Control LLM costs: + +. 
Under *Spend Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Monthly budget +|$5,000 (adjust based on expected usage) + +|Enforcement +|Block requests after budget exceeded +|=== + +. Click *Save* + +The gateway tracks estimated costs per request and blocks traffic when the monthly budget is exhausted. + +== Configure MCP tool aggregation + +Enable Claude Code to discover and use tools from multiple MCP servers through a single endpoint. + +=== Add MCP servers + +. Navigate to the gateway's *MCP* tab +. Click *Add MCP Server* +. Enter server details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Display name +|Descriptive name (for example, `redpanda-data-catalog`) + +|Endpoint URL +|MCP server endpoint (for example, xref:ai-agents:mcp/remote/overview.adoc[Remote MCP server] URL) + +|Authentication +|Bearer token or other authentication mechanism +|=== + +. Click *Save* + +Repeat for each MCP server you want to aggregate. + +=== Enable deferred tool loading + +Reduce token costs by deferring tool discovery: + +. Under *MCP Settings*, enable *Deferred tool loading* +. Click *Save* + +When enabled: + +* Claude Code initially receives only a search tool and orchestrator tool +* Claude Code queries for specific tools by name when needed +* Token usage decreases by 80-90% for agents with many tools configured + +=== Add the MCP orchestrator + +The MCP orchestrator reduces multi-step workflows to single calls: + +. Under *MCP Settings*, enable *MCP Orchestrator* +. Configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Orchestrator model +|Select a model with strong code generation capabilities (for example, `anthropic/claude-sonnet-4-5`) + +|Execution timeout +|30 seconds +|=== + +. Click *Save* + +Claude Code can now invoke the orchestrator tool to execute complex, multi-step operations in a single request. + +== Configure authentication + +Claude Code clients authenticate using bearer tokens. + +=== Generate API tokens + +. Navigate to *Security* > *API Tokens* in the Redpanda Cloud console +. Click *Create Token* +. Enter token details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`claude-code-access` + +|Scopes +|`ai-gateway:read`, `ai-gateway:write` + +|Expiration +|Set appropriate expiration based on security policies +|=== + +. Click *Create* +. Copy the token (it appears only once) + +Distribute this token to Claude Code users through secure channels. + +=== Token rotation + +Implement token rotation for security: + +. Create a new token before the existing token expires +. Distribute the new token to users +. Monitor usage of the old token in xref:ai-agents:ai-gateway/observability-logs.adoc[request logs] +. Revoke the old token after all users have migrated + +== Configure Claude Code clients + +Provide these instructions to users configuring Claude Code. 
+ +=== CLI configuration + +Users can configure Claude Code using the CLI: + +[source,bash] +---- +claude mcp add \ + --transport http \ + redpanda-aigateway \ + https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp \ + --header "Authorization: Bearer YOUR_API_TOKEN" \ + --header "rp-aigw-id: GATEWAY_ID" +---- + +Replace: + +* `{CLUSTER_ID}`: Your Redpanda cluster ID +* `YOUR_API_TOKEN`: The API token generated earlier +* `GATEWAY_ID`: The gateway ID from gateway creation + +=== Configuration file + +Alternatively, users can edit `~/.claude/config.json`: + +[source,json] +---- +{ + "mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_TOKEN", + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + "apiProviders": { + "redpanda": { + "baseURL": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1", + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } +} +---- + +This configuration: + +* Connects Claude Code to the aggregated MCP endpoint +* Routes LLM requests through the AI Gateway +* Includes authentication and gateway identification headers + +== Monitor Claude Code usage + +Track Claude Code activity through gateway observability features. + +=== View request logs + +. Navigate to *AI Gateway* > *Observability* > *Logs* +. Filter by gateway ID: `claude-code-gateway` +. Review: ++ +* Request timestamps and duration +* Model used per request +* Token usage (prompt and completion tokens) +* Estimated cost per request +* HTTP status codes and errors + +=== Analyze metrics + +. Navigate to *AI Gateway* > *Observability* > *Metrics* +. Select the Claude Code gateway +. Review: ++ +[cols="1,2"] +|=== +|Metric |Purpose + +|Request volume +|Identify usage patterns and peak times + +|Token usage +|Track consumption trends + +|Estimated spend +|Monitor costs against budget + +|Latency (p50, p95, p99) +|Detect performance issues + +|Error rate +|Identify failing requests or misconfigured clients +|=== + +For detailed metrics configuration, see xref:ai-agents:ai-gateway/observability-metrics.adoc[]. + +=== Query logs via API + +Programmatically access logs for integration with monitoring systems: + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/logs \ + -H "Authorization: Bearer YOUR_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "gateway_id": "GATEWAY_ID", + "start_time": "2026-01-01T00:00:00Z", + "end_time": "2026-01-14T23:59:59Z", + "limit": 100 + }' +---- + +== Security considerations + +Apply these security best practices for Claude Code deployments. + +=== Limit token scope + +Create tokens with minimal required scopes: + +* `ai-gateway:read`: Required for MCP tool discovery +* `ai-gateway:write`: Required for LLM requests and tool execution + +Avoid granting broader scopes like `admin` or `cluster:write`. + +=== Implement network restrictions + +If Claude Code clients connect from known IP ranges, configure network policies: + +. Use cloud provider security groups to restrict access to AI Gateway endpoints +. Allowlist only the IP ranges where Claude Code clients operate +. Monitor for unauthorized access attempts in request logs + +=== Enforce token expiration + +Set short token lifetimes for high-security environments: + +* Development environments: 90 days +* Production environments: 30 days + +Automate token rotation to reduce manual overhead. 
+ +=== Audit tool access + +Review which MCP tools Claude Code clients can access: + +. Periodically audit the MCP servers configured in the gateway +. Remove unused or deprecated MCP servers +. Monitor tool execution logs for unexpected behavior + +== Troubleshooting + +Common issues and solutions when configuring AI Gateway for Claude Code. + +=== Claude Code cannot connect to gateway + +Symptom: Connection errors when Claude Code tries to discover tools or send LLM requests. + +Causes and solutions: + +* **Invalid gateway ID**: Verify the `rp-aigw-id` header matches the gateway ID from the console +* **Expired token**: Generate a new API token and update the Claude Code configuration +* **Network connectivity**: Verify the cluster endpoint is accessible from the client network +* **Provider not enabled**: Ensure at least one LLM provider is enabled and has models in the catalog + +=== Tools not appearing in Claude Code + +Symptom: Claude Code does not discover MCP tools. + +Causes and solutions: + +* **MCP servers not configured**: Add MCP server endpoints in the gateway's MCP tab +* **Deferred loading enabled but search failing**: Check that the search tool is correctly configured +* **MCP server authentication failing**: Verify MCP server authentication credentials in the gateway configuration + +=== High costs or token usage + +Symptom: Token usage and costs exceed expectations. + +Causes and solutions: + +* **Deferred tool loading disabled**: Enable deferred tool loading to reduce tokens by 80-90% +* **No rate limits**: Apply per-minute rate limits to prevent runaway usage +* **Missing spending limits**: Set monthly budget limits with blocking enforcement +* **Expensive models**: Route to cost-effective models (for example, Claude Sonnet instead of Opus) for non-critical requests + +=== Requests failing with 429 errors + +Symptom: Claude Code receives HTTP 429 Too Many Requests errors. + +Causes and solutions: + +* **Rate limit exceeded**: Review and increase rate limits if usage is legitimate +* **Upstream provider rate limits**: Check if the upstream LLM provider is rate-limiting; configure failover pools +* **Budget exhausted**: Verify monthly spending limit has not been reached + +== Next steps + +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Analyze detailed request logs +* xref:ai-agents:ai-gateway/observability-metrics.adoc[]: Set up metrics dashboards +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Implement advanced routing rules +* xref:ai-agents:mcp/remote/overview.adoc[]: Deploy Remote MCP servers for custom tools diff --git a/modules/ai-agents/pages/ai-gateway/integrations/claude-code-user.adoc b/modules/ai-agents/pages/ai-gateway/integrations/claude-code-user.adoc new file mode 100644 index 00000000..79146771 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/claude-code-user.adoc @@ -0,0 +1,484 @@ += Configure Claude Code with AI Gateway +:description: Configure Claude Code to use Redpanda AI Gateway for unified LLM access and MCP tool aggregation. 
+:page-topic-type: how-to +:page-personas: ai_agent_developer, app_developer +:learning-objective-1: Configure Claude Code to connect to AI Gateway endpoints +:learning-objective-2: Set up MCP server integration through AI Gateway +:learning-objective-3: Verify Claude Code is routing requests through the gateway + +After xref:ai-agents:ai-gateway/ai-gateway.adoc[configuring your AI Gateway], set up Claude Code to route LLM requests and access MCP tools through the gateway's unified endpoints. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +Before configuring Claude Code, ensure you have: + +* Claude Code CLI installed (download from https://github.com/anthropics/claude-code[Anthropic's GitHub^]) +* An active Redpanda AI Gateway with: +** At least one LLM provider enabled (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-1-enable-a-provider[Enable a provider]) +** A gateway created and configured (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-3-create-a-gateway[Create a gateway]) +* Your AI Gateway credentials: +** Gateway endpoint URL (for example, `https://gw.ai.panda.com`) +** Gateway ID (for example, `gateway-abc123`) +** API key with access to the gateway + +== Configuration methods + +Claude Code supports two configuration approaches for connecting to AI Gateway: + +[cols="1,2,2"] +|=== +|Method |Best for |Trade-offs + +|CLI command +|Quick setup, single gateway +|Must re-run if configuration changes + +|Configuration file +|Multiple gateways, complex setups, version control +|Manual file editing required +|=== + +Choose the method that matches your workflow. The CLI command is faster for getting started, while the configuration file provides more flexibility for production use. + +== Configure using CLI + +The `claude mcp add` command configures Claude Code to connect to your AI Gateway's MCP endpoint. + +=== Add MCP server connection + +[,bash] +---- +claude mcp add \ + --transport http \ + redpanda-aigateway \ + https://gw.ai.panda.com/mcp \ + --header "Authorization: Bearer YOUR_API_KEY" \ + --header "rp-aigw-id: GATEWAY_ID" +---- + +Replace the following values: + +* `https://gw.ai.panda.com/mcp` - Your gateway's MCP endpoint +* `YOUR_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID from the AI Gateway UI + +This command configures the HTTP transport for MCP, which allows Claude Code to discover and invoke tools from all MCP servers configured in your gateway. + +=== Configure LLM routing through gateway + +To route Claude Code's LLM requests through the gateway instead of directly to Anthropic: + +[,bash] +---- +claude config set \ + --api-provider redpanda \ + --base-url https://gw.ai.panda.com \ + --header "rp-aigw-id: GATEWAY_ID" +---- + +This routes all Claude model requests through your gateway, giving you centralized observability and policy enforcement. + +== Configure using configuration file + +For more complex configurations or when managing multiple gateways, edit the Claude Code configuration file directly. 
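+
+Whichever method you choose, you can confirm the endpoint, API key, and gateway ID independently of Claude Code. This is a minimal check, assuming the gateway exposes an OpenAI-compatible `/v1/models` listing; substitute your gateway URL, API key, and gateway ID:
+
+[,bash]
+----
+# Hypothetical credential check: list the models available through the gateway.
+curl https://gw.ai.panda.com/v1/models \
+  -H "Authorization: Bearer YOUR_API_KEY" \
+  -H "rp-aigw-id: GATEWAY_ID"
+----
+
+A JSON list of enabled models means the gateway accepts your credentials; a `401 Unauthorized` response usually means the API key or gateway ID is wrong.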
+ +=== Locate configuration file + +Claude Code stores configuration in: + +* macOS/Linux: `~/.claude/config.json` +* Windows: `%USERPROFILE%\.claude\config.json` + +Create the directory if it doesn't exist: + +[,bash] +---- +mkdir -p ~/.claude +---- + +=== Basic configuration + +Create or edit `~/.claude/config.json` with the following structure: + +[,json] +---- +{ + "mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_KEY", + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + "apiProviders": { + "redpanda": { + "baseURL": "https://gw.ai.panda.com", + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } +} +---- + +Replace placeholder values: + +* `YOUR_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID + +=== Multiple gateway configuration + +To configure different gateways for development and production: + +[,json] +---- +{ + "mcpServers": { + "redpanda-staging": { + "transport": "http", + "url": "https://gw.staging.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer STAGING_API_KEY", + "rp-aigw-id": "staging-gateway-123" + } + }, + "redpanda-production": { + "transport": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer PROD_API_KEY", + "rp-aigw-id": "prod-gateway-456" + } + } + }, + "apiProviders": { + "redpanda-staging": { + "baseURL": "https://gw.staging.ai.panda.com", + "headers": { + "rp-aigw-id": "staging-gateway-123" + } + }, + "redpanda-production": { + "baseURL": "https://gw.ai.panda.com", + "headers": { + "rp-aigw-id": "prod-gateway-456" + } + } + } +} +---- + +Switch between gateways by selecting the appropriate MCP server or API provider when using Claude Code. + +=== Configuration with environment variables + +For sensitive credentials, use environment variables instead of hardcoding values: + +[,json] +---- +{ + "mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "${REDPANDA_GATEWAY_URL}/mcp", + "headers": { + "Authorization": "Bearer ${REDPANDA_API_KEY}", + "rp-aigw-id": "${REDPANDA_GATEWAY_ID}" + } + } + }, + "apiProviders": { + "redpanda": { + "baseURL": "${REDPANDA_GATEWAY_URL}", + "headers": { + "rp-aigw-id": "${REDPANDA_GATEWAY_ID}" + } + } + } +} +---- + +Set environment variables before launching Claude Code: + +[,bash] +---- +export REDPANDA_GATEWAY_URL="https://gw.ai.panda.com" +export REDPANDA_GATEWAY_ID="gateway-abc123" +export REDPANDA_API_KEY="your-api-key" +---- + +On Windows (PowerShell): + +[,powershell] +---- +$env:REDPANDA_GATEWAY_URL = "https://gw.ai.panda.com" +$env:REDPANDA_GATEWAY_ID = "gateway-abc123" +$env:REDPANDA_API_KEY = "your-api-key" +---- + +== Verify configuration + +After configuring Claude Code, verify it connects correctly to your AI Gateway. + +=== Test MCP tool discovery + +List available MCP tools to confirm Claude Code can reach your gateway's MCP endpoint: + +[,bash] +---- +claude mcp list +---- + +Expected output should show: + +* The `redpanda-ai-gateway` server connection +* Status: Connected +* Available tools from your configured MCP servers + +If deferred tool loading is enabled in your gateway, you'll see a search tool and the MCP orchestrator tool instead of all tools upfront. + +=== Verify gateway routing + +Check that requests route through the gateway by monitoring the AI Gateway dashboard: + +. Open the Redpanda Cloud Console +. Navigate to your gateway's observability dashboard +. 
Send a test request from Claude Code: ++ +[,bash] +---- +echo "Write a simple Python hello world function" | claude +---- + +. Refresh the dashboard and verify: +** Request appears in the logs +** Model shows as `anthropic/claude-sonnet-4-5` (or your configured model) +** Request succeeded (status 200) +** Token usage and estimated cost are recorded + +If the request doesn't appear in the dashboard, see <>. + +== Advanced configuration + +=== Custom request timeout + +Configure timeout for MCP requests in the configuration file: + +[,json] +---- +{ + "mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_KEY", + "rp-aigw-id": "GATEWAY_ID" + }, + "timeout": 30000 + } + } +} +---- + +The `timeout` value is in milliseconds. Default is 10000 (10 seconds). Increase this for MCP tools that perform long-running operations. + +=== Request retry configuration + +Configure retry behavior for transient failures: + +[,json] +---- +{ + "apiProviders": { + "redpanda": { + "baseURL": "https://gw.ai.panda.com", + "headers": { + "rp-aigw-id": "GATEWAY_ID" + }, + "retry": { + "maxRetries": 3, + "retryDelay": 1000, + "retryCondition": ["5xx", "timeout"] + } + } + } +} +---- + +This configuration retries requests up to 3 times on server errors (5xx status codes) or timeouts, with a 1-second delay between retries. + +=== Debug mode + +Enable debug logging to troubleshoot connection issues: + +[,bash] +---- +export CLAUDE_DEBUG=1 +claude +---- + +Debug mode shows: + +* HTTP request and response headers +* MCP tool discovery messages +* Gateway routing decisions (if exposed in response headers) +* Error details + +[[troubleshooting]] +== Troubleshooting + +=== MCP server not connecting + +**Symptom**: `claude mcp list` shows "Connection failed" or no tools available. + +**Causes and solutions**: + +. **Incorrect endpoint URL** ++ +Verify your MCP endpoint is correct. It should be `{gateway-url}/mcp`, not just `{gateway-url}`. ++ +[,bash] +---- +# Correct +https://gw.ai.panda.com/mcp + +# Incorrect +https://gw.ai.panda.com +---- + +. **Authentication failure** ++ +Check that your API key is valid and has access to the gateway: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/mcp +---- ++ +You should receive a valid MCP protocol response. If you get `401 Unauthorized`, regenerate your API key in the Redpanda Cloud Console. + +. **Gateway ID mismatch** ++ +Verify your gateway ID matches exactly (case-sensitive). Copy it directly from the AI Gateway UI rather than typing it manually. + +. **Network connectivity issues** ++ +Test basic connectivity to the gateway endpoint: ++ +[,bash] +---- +curl -I https://gw.ai.panda.com/mcp +---- ++ +If this times out, check your network configuration, firewall rules, or VPN connection. + +=== Requests not appearing in gateway dashboard + +**Symptom**: Claude Code works, but you don't see requests in the AI Gateway observability dashboard. + +**Causes and solutions**: + +. **Wrong gateway configured** ++ +Verify that the `rp-aigw-id` header in your configuration matches the gateway you're viewing in the dashboard. + +. **Using direct Anthropic API** ++ +If you didn't configure the `apiProviders` section, Claude Code may be routing directly to Anthropic instead of through your gateway. Verify the `apiProviders` section exists in your config file. + +. 
**Log ingestion delay** ++ +Gateway logs can take 5-10 seconds to appear in the dashboard. Wait briefly and refresh. + +. **Model name format error** ++ +Ensure requests use the `vendor/model_id` format (for example, `anthropic/claude-sonnet-4-5`), not just the model name (for example, `claude-sonnet-4-5`). + +=== High latency after gateway integration + +**Symptom**: Requests are slower after routing through the gateway. + +**Causes and solutions**: + +. **Gateway geographic distance** ++ +If your gateway is in a different region than you or the upstream provider, this adds network latency. Check gateway region in the Redpanda Cloud Console. + +. **Provider pool failover** ++ +If your gateway is configured with fallback providers, check the logs to see if requests are failing over. Failover adds latency. + +. **MCP tool aggregation overhead** ++ +Aggregating tools from multiple MCP servers adds processing time. Use deferred tool loading to reduce this overhead (see xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]). + +. **Rate limiting** ++ +If you're hitting rate limits, the gateway may be queuing requests. Check the observability dashboard for rate limit metrics. + +=== Configuration file not loading + +**Symptom**: Changes to `config.json` don't take effect. + +**Solutions**: + +. **Restart Claude Code** ++ +Configuration changes require restarting Claude Code: ++ +[,bash] +---- +# Kill any running Claude Code processes +pkill claude + +# Start Claude Code again +claude +---- + +. **Validate JSON syntax** ++ +Ensure your `config.json` is valid JSON. Use a JSON validator: ++ +[,bash] +---- +python3 -m json.tool ~/.claude/config.json +---- + +. **Check file permissions** ++ +Verify Claude Code can read the configuration file: ++ +[,bash] +---- +ls -la ~/.claude/config.json +---- ++ +The file should be readable by your user. If not, fix permissions: ++ +[,bash] +---- +chmod 600 ~/.claude/config.json +---- + +== Next steps + +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Configure deferred tool loading to reduce token costs +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor Claude Code requests in the gateway dashboard +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Use CEL expressions to route Claude Code requests based on context + +== Related pages + +* xref:ai-agents:ai-gateway/ai-gateway.adoc[]: Create and configure your AI Gateway +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[]: Learn about AI Gateway architecture and benefits diff --git a/modules/ai-agents/pages/ai-gateway/integrations/cline-admin.adoc b/modules/ai-agents/pages/ai-gateway/integrations/cline-admin.adoc new file mode 100644 index 00000000..3131ed80 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/cline-admin.adoc @@ -0,0 +1,590 @@ += Configure AI Gateway for Cline +:description: Configure Redpanda AI Gateway to support Cline clients. +:page-topic-type: how-to +:page-personas: platform_admin +:learning-objective-1: Configure AI Gateway endpoints for Cline connectivity +:learning-objective-2: Set up authentication and access control for Cline clients +:learning-objective-3: Deploy MCP tool aggregation for Cline tool discovery + +Configure Redpanda AI Gateway to support Cline (formerly Claude Dev) clients accessing LLM providers and MCP tools through a unified endpoint. 
+ +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +* AI Gateway deployed on a BYOC cluster running Redpanda version 25.3 or later +* Administrator access to the AI Gateway UI +* At least one LLM provider API key (Anthropic or OpenAI) +* Understanding of xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[AI Gateway concepts] + +== About Cline + +Cline is a VS Code extension designed for autonomous AI development workflows. It connects to Claude models through the native Anthropic API format, sending requests to `/v1/messages` endpoints. Cline supports long-running tasks, browser integration, and autonomous operations, with full MCP support for tool discovery and execution. + +Key characteristics: + +* Uses native Anthropic format (compatible with OpenAI-compatible endpoints) +* Designed for autonomous, multi-step workflows +* Supports MCP protocol for external tool integration +* Operates as a VS Code extension with persistent context +* Requires configuration similar to Claude Code + +== Architecture overview + +Cline connects to AI Gateway through two primary endpoints: + +* LLM endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1` for chat completions +* MCP endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp` for tool discovery and execution + +The gateway handles: + +. Authentication via bearer tokens in the `Authorization` header +. Gateway selection via the `rp-aigw-id` header +. Model routing using the `vendor/model_id` format +. MCP server aggregation for multi-tool workflows +. Request logging and cost tracking per gateway + +== Enable LLM providers + +Cline requires access to LLM providers through the gateway. Enable at least one provider. + +=== Configure Anthropic + +Cline uses Anthropic models by default. To enable Anthropic: + +. Navigate to *AI Gateway* > *Providers* in the Redpanda Cloud console +. Select *Anthropic* from the provider list +. Click *Add configuration* +. Enter your Anthropic API key +. Click *Save* + +The gateway can now route requests to Anthropic models. + +=== Configure OpenAI + +To enable OpenAI as a provider: + +. Navigate to *AI Gateway* > *Providers* +. Select *OpenAI* from the provider list +. Click *Add configuration* +. Enter your OpenAI API key +. Click *Save* + +=== Enable models in the catalog + +After enabling providers, enable specific models: + +. Navigate to *AI Gateway* > *Models* +. Enable the models you want Cline clients to access ++ +Common models for Cline: ++ +* `anthropic/claude-opus-4-5` +* `anthropic/claude-sonnet-4-5` +* `openai/gpt-4o` +* `openai/o1-mini` + +. Click *Save* + +Models appear in the catalog with the `vendor/model_id` format that Cline uses in requests. + +== Create a gateway for Cline clients + +Create a dedicated gateway to isolate Cline traffic and apply specific policies. + +=== Gateway configuration + +. Navigate to *AI Gateway* > *Gateways* +. Click *Create Gateway* +. Enter gateway details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`cline-gateway` (or your preferred name) + +|Workspace +|Select the workspace for access control grouping + +|Description +|Gateway for Cline VS Code extension clients +|=== + +. Click *Create* +. Copy the gateway ID from the gateway details page + +The gateway ID is required in the `rp-aigw-id` header for all requests. + +=== Configure LLM routing + +Set up routing policies for Cline requests. 
+ +==== Basic routing with failover + +Configure a primary provider with automatic failover: + +. Navigate to the gateway's *LLM* tab +. Under *Routing*, click *Add route* +. Configure the route: ++ +[source,cel] +---- +true # Matches all requests +---- + +. Add a *Primary provider pool*: ++ +* Provider: Anthropic +* Model: All enabled Anthropic models +* Load balancing: Round robin (if multiple Anthropic configurations exist) + +. Add a *Fallback provider pool*: ++ +* Provider: OpenAI +* Model: All enabled OpenAI models +* Failover conditions: Rate limits, timeouts, 5xx errors + +. Click *Save* + +Cline requests route to Anthropic by default and fail over to OpenAI if Anthropic is unavailable. + +==== Workspace-based routing + +Route requests based on VS Code workspace or project context (if Cline passes workspace identifiers): + +[source,cel] +---- +request.headers["x-workspace-type"][0] == "production" +---- + +Create separate routes: + +* Production route: Claude Opus 4.5 (highest quality, critical code) +* Development route: Claude Sonnet 4.5 (balanced cost and quality) +* Experimental route: OpenAI GPT-4o (cost-effective testing) + +=== Apply rate limits + +Prevent runaway usage from autonomous Cline sessions: + +. Navigate to the gateway's *LLM* tab +. Under *Rate Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Recommended Value + +|Global rate limit +|120 requests per minute + +|Per-user rate limit +|15 requests per minute (if using user headers) +|=== ++ +Cline can generate multiple requests during autonomous operations. Higher limits than typical interactive clients may be necessary. + +. Click *Save* + +The gateway blocks requests exceeding these limits and returns HTTP 429 errors. + +=== Set spending limits + +Control LLM costs during autonomous operations: + +. Under *Spend Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Monthly budget +|$8,000 (adjust based on expected autonomous usage) + +|Enforcement +|Block requests after budget exceeded +|=== ++ +Autonomous operations can consume significant tokens. Monitor spending patterns after deployment. + +. Click *Save* + +The gateway tracks estimated costs per request and blocks traffic when the monthly budget is exhausted. + +== Configure MCP tool aggregation + +Enable Cline to discover and use tools from multiple MCP servers through a single endpoint. + +=== Add MCP servers + +. Navigate to the gateway's *MCP* tab +. Click *Add MCP Server* +. Enter server details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Display name +|Descriptive name (for example, `filesystem-tools`, `code-analysis-tools`) + +|Endpoint URL +|MCP server endpoint (for example, xref:ai-agents:mcp/remote/overview.adoc[Remote MCP server] URL) + +|Authentication +|Bearer token or other authentication mechanism +|=== + +. Click *Save* + +Repeat for each MCP server you want to aggregate. + +=== Enable deferred tool loading + +Reduce token costs for Cline sessions with many available tools: + +. Under *MCP Settings*, enable *Deferred tool loading* +. Click *Save* + +When enabled: + +* Cline initially receives only a search tool and orchestrator tool +* Cline queries for specific tools by name when needed +* Token usage decreases by 80-90% for configurations with many tools + +This is particularly important for Cline because autonomous operations can make many tool discovery calls. + +=== Add the MCP orchestrator + +The MCP orchestrator reduces multi-step autonomous workflows to single calls: + +. Under *MCP Settings*, enable *MCP Orchestrator* +. 
Configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Orchestrator model +|Select a model with strong code generation capabilities (for example, `anthropic/claude-sonnet-4-5`) + +|Execution timeout +|45 seconds +|=== ++ +Longer timeout than typical interactive clients allows complex autonomous operations to complete. + +. Click *Save* + +Cline can now invoke the orchestrator tool to execute complex, multi-step operations in a single request, which is ideal for autonomous development workflows. + +== Configure authentication + +Cline clients authenticate using bearer tokens. + +=== Generate API tokens + +. Navigate to *Security* > *API Tokens* in the Redpanda Cloud console +. Click *Create Token* +. Enter token details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`cline-access` + +|Scopes +|`ai-gateway:read`, `ai-gateway:write` + +|Expiration +|Set appropriate expiration based on security policies +|=== + +. Click *Create* +. Copy the token (it appears only once) + +Distribute this token to Cline users through secure channels. + +=== Token rotation + +Implement token rotation for security: + +. Create a new token before the existing token expires +. Distribute the new token to users +. Monitor usage of the old token in xref:ai-agents:ai-gateway/observability-logs.adoc[request logs] +. Revoke the old token after all users have migrated + +== Configure Cline clients + +Provide these instructions to users configuring Cline in VS Code. + +=== VS Code settings configuration + +Users configure Cline through VS Code settings (either UI or `settings.json`). + +==== Using VS Code Settings UI + +. Open VS Code Settings (Cmd/Ctrl + ,) +. Search for "Cline" +. Configure the following settings: ++ +* *Cline: API Provider*: Select "Custom" or "Anthropic" +* *Cline: API Base URL*: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1` +* *Cline: API Key*: The API token generated earlier + +==== Using settings.json + +Alternatively, users can edit `.vscode/settings.json` in their workspace: + +[source,json] +---- +{ + "cline.apiProvider": "custom", + "cline.apiBaseUrl": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1", + "cline.apiKey": "YOUR_API_TOKEN", + "cline.customHeaders": { + "rp-aigw-id": "GATEWAY_ID" + } +} +---- + +Replace: + +* `{CLUSTER_ID}`: Your Redpanda cluster ID +* `YOUR_API_TOKEN`: The API token generated earlier +* `GATEWAY_ID`: The gateway ID from gateway creation + +=== MCP server configuration + +Configure Cline to connect to the aggregated MCP endpoint: + +[source,json] +---- +{ + "cline.mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_TOKEN", + "rp-aigw-id": "GATEWAY_ID" + } + } + } +} +---- + +This configuration: + +* Connects Cline to the aggregated MCP endpoint +* Routes LLM requests through the AI Gateway +* Includes authentication and gateway identification headers + +=== User settings vs workspace settings + +Cline settings can be configured at two levels: + +[cols="1,2,2"] +|=== +|Level |Location |Use Case + +|User settings +|`~/.vscode/settings.json` +|Personal API token, default gateway for all projects + +|Workspace settings +|`.vscode/settings.json` in project +|Project-specific gateway, shared team configuration +|=== + +Use workspace settings when different projects require different gateways (for example, development vs production environments). 
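+
+For example, a team might keep the personal API token in user settings and override only the gateway in a project's workspace settings. The following sketch reuses the setting keys shown above; key names can vary between Cline versions, so treat it as illustrative:
+
+.User settings (personal token, default gateway)
+[source,json]
+----
+{
+  "cline.apiKey": "YOUR_API_TOKEN",
+  "cline.apiBaseUrl": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1",
+  "cline.customHeaders": { "rp-aigw-id": "DEV_GATEWAY_ID" }
+}
+----
+
+.Workspace settings (production project overrides the gateway only)
+[source,json]
+----
+{
+  "cline.customHeaders": { "rp-aigw-id": "PROD_GATEWAY_ID" }
+}
+----
+
+Because workspace settings take precedence over user settings, the production project routes through its own gateway while all other projects keep the default.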
+ +== Monitor Cline usage + +Track Cline activity through gateway observability features. + +=== View request logs + +. Navigate to *AI Gateway* > *Observability* > *Logs* +. Filter by gateway ID: `cline-gateway` +. Review: ++ +* Request timestamps and duration +* Model used per request +* Token usage (prompt and completion tokens) +* Estimated cost per request +* HTTP status codes and errors + +Cline autonomous operations may generate request sequences. Look for patterns to identify long-running sessions. + +=== Analyze metrics + +. Navigate to *AI Gateway* > *Observability* > *Metrics* +. Select the Cline gateway +. Review: ++ +[cols="1,2"] +|=== +|Metric |Purpose + +|Request volume +|Identify autonomous session patterns and peak times + +|Token usage +|Track consumption trends from multi-step operations + +|Estimated spend +|Monitor costs against budget (autonomous operations can be expensive) + +|Latency (p50, p95, p99) +|Detect performance issues in autonomous workflows + +|Error rate +|Identify failing requests or misconfigured clients +|=== + +For detailed metrics configuration, see xref:ai-agents:ai-gateway/observability-metrics.adoc[]. + +=== Query logs via API + +Programmatically access logs for integration with monitoring systems: + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/logs \ + -H "Authorization: Bearer YOUR_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "gateway_id": "GATEWAY_ID", + "start_time": "2026-01-01T00:00:00Z", + "end_time": "2026-01-14T23:59:59Z", + "limit": 100 + }' +---- + +== Security considerations + +Apply these security best practices for Cline deployments. + +=== Limit token scope + +Create tokens with minimal required scopes: + +* `ai-gateway:read`: Required for MCP tool discovery +* `ai-gateway:write`: Required for LLM requests and tool execution + +Avoid granting broader scopes like `admin` or `cluster:write`. + +Because Cline performs autonomous operations, limit what tools it can access through MCP server selection. + +=== Implement network restrictions + +If Cline clients connect from known networks (corporate VPN, office IP ranges), configure network policies: + +. Use cloud provider security groups to restrict access to AI Gateway endpoints +. Allowlist only the IP ranges where Cline clients operate +. Monitor for unauthorized access attempts in request logs + +=== Enforce token expiration + +Set short token lifetimes for high-security environments: + +* Development environments: 90 days +* Production environments: 30 days + +Automate token rotation to reduce manual overhead. + +=== Audit tool access + +Review which MCP tools Cline clients can access: + +. Periodically audit the MCP servers configured in the gateway +. Remove unused or deprecated MCP servers +. Monitor tool execution logs for unexpected autonomous behavior +. Consider creating separate gateways for different trust levels + +Because Cline operates autonomously, carefully control which tools it can invoke. + +=== Monitor autonomous operations + +Set up alerts for unusual patterns: + +* Request rate spikes (may indicate runaway autonomous loops) +* High error rates (may indicate tool compatibility issues) +* Unexpected tool invocations (may indicate misconfigured autonomous behavior) +* Budget consumption spikes (autonomous operations can be expensive) + +== Troubleshooting + +Common issues and solutions when configuring AI Gateway for Cline. 
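+
+Before working through the specific symptoms below, two quick checks from a client machine often narrow down the problem. These sketches assume the gateway exposes an OpenAI-compatible model listing alongside the aggregated MCP endpoint described earlier; substitute your cluster ID, API token, and gateway ID:
+
+[source,bash]
+----
+# Check LLM routing: a model list confirms authentication and gateway selection.
+curl https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/models \
+  -H "Authorization: Bearer YOUR_API_TOKEN" \
+  -H "rp-aigw-id: GATEWAY_ID"
+
+# Check MCP aggregation: a valid MCP protocol response confirms tool discovery works.
+curl https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp \
+  -H "Authorization: Bearer YOUR_API_TOKEN" \
+  -H "rp-aigw-id: GATEWAY_ID"
+----
+
+A `401` response usually points to token problems, a connection error points to the endpoint URL or network path, and an empty tool list points to the gateway's MCP server configuration.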
+ +=== Cline cannot connect to gateway + +Symptom: Connection errors when Cline tries to discover tools or send LLM requests. + +Causes and solutions: + +* **Invalid gateway ID**: Verify the `rp-aigw-id` header matches the gateway ID from the console +* **Expired token**: Generate a new API token and update the Cline settings +* **Network connectivity**: Verify the cluster endpoint is accessible from the client network +* **Provider not enabled**: Ensure at least one LLM provider is enabled and has models in the catalog +* **VS Code settings not applied**: Reload VS Code window after changing settings (Cmd/Ctrl + Shift + P > "Reload Window") + +=== Tools not appearing in Cline + +Symptom: Cline does not discover MCP tools. + +Causes and solutions: + +* **MCP servers not configured**: Add MCP server endpoints in the gateway's MCP tab +* **Deferred loading enabled but search failing**: Check that the search tool is correctly configured +* **MCP server authentication failing**: Verify MCP server authentication credentials in the gateway configuration +* **Cline MCP configuration missing**: Ensure `cline.mcpServers` is configured in settings + +=== High costs or token usage + +Symptom: Token usage and costs exceed expectations. + +Causes and solutions: + +* **Deferred tool loading disabled**: Enable deferred tool loading to reduce tokens by 80-90% +* **Autonomous loops**: Monitor for repeated similar requests (may indicate autonomous operation stuck in a loop) +* **No rate limits**: Apply per-minute rate limits to prevent runaway autonomous usage +* **Missing spending limits**: Set monthly budget limits with blocking enforcement +* **Expensive models for autonomous work**: Route autonomous operations to cost-effective models (for example, Claude Sonnet instead of Opus) +* **Too many tools in context**: Reduce the number of aggregated MCP servers or enable deferred loading + +=== Requests failing with 429 errors + +Symptom: Cline receives HTTP 429 Too Many Requests errors. + +Causes and solutions: + +* **Rate limit exceeded**: Review and increase rate limits if autonomous usage is legitimate +* **Upstream provider rate limits**: Check if the upstream LLM provider is rate-limiting; configure failover pools +* **Budget exhausted**: Verify monthly spending limit has not been reached +* **Autonomous operation too aggressive**: Configure Cline to slow down request rate + +=== Autonomous operations timing out + +Symptom: Cline operations fail with timeout errors. 
+ +Causes and solutions: + +* **MCP orchestrator timeout too short**: Increase orchestrator execution timeout to 60 seconds +* **Complex multi-step operations**: Break down tasks or use the orchestrator tool for better efficiency +* **Slow MCP server responses**: Check MCP server performance and consider caching + +== Next steps + +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Analyze detailed request logs +* xref:ai-agents:ai-gateway/observability-metrics.adoc[]: Set up metrics dashboards +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Implement advanced routing rules +* xref:ai-agents:mcp/remote/overview.adoc[]: Deploy Remote MCP servers for custom tools diff --git a/modules/ai-agents/pages/ai-gateway/integrations/cline-user.adoc b/modules/ai-agents/pages/ai-gateway/integrations/cline-user.adoc new file mode 100644 index 00000000..682110a2 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/cline-user.adoc @@ -0,0 +1,758 @@ += Configure Cline with AI Gateway +:description: Configure Cline to use Redpanda AI Gateway for unified LLM access, MCP tool integration, and autonomous coding workflows. +:page-topic-type: how-to +:page-personas: ai_agent_developer, app_developer +:learning-objective-1: Configure Cline to connect to AI Gateway for LLM requests and MCP tools +:learning-objective-2: Set up autonomous mode with custom instructions and browser integration +:learning-objective-3: Verify Cline routes requests through the gateway and optimize for cost + +After xref:ai-agents:ai-gateway/ai-gateway.adoc[configuring your AI Gateway], set up Cline (formerly Claude Dev) to route LLM requests and access MCP tools through the gateway's unified endpoints. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +Before configuring Cline, ensure you have: + +* Cline VS Code extension installed (search for "Cline" in VS Code Extensions) +* An active Redpanda AI Gateway with: +** At least one LLM provider enabled (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-1-enable-a-provider[Enable a provider]) +** A gateway created and configured (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-3-create-a-gateway[Create a gateway]) +* Your AI Gateway credentials: +** Gateway endpoint URL (for example, `https://gw.ai.panda.com`) +** Gateway ID (for example, `gateway-abc123`) +** API key with access to the gateway + +== About Cline + +Cline is an autonomous AI coding agent for VS Code that can: + +* Read and edit files in your workspace +* Execute terminal commands +* Browse the web for documentation and research +* Create and manage complex multi-file changes +* Work autonomously with approval checkpoints + +By routing Cline through AI Gateway, you gain centralized observability, cost controls, and the ability to aggregate multiple MCP servers into a single interface. + +== Configuration overview + +Cline supports two connection types for AI Gateway: + +[cols="1,2,2"] +|=== +|Connection type |Use for |Configuration location + +|OpenAI-compatible API +|LLM requests (chat, code generation) +|Cline Settings → API Configuration + +|MCP servers +|Tool discovery and execution +|Cline Settings → MCP Servers +|=== + +Both can route through AI Gateway independently or together, depending on your needs. + +== Configure LLM routing through gateway + +Set up Cline to route all LLM requests through your AI Gateway instead of directly to providers. + +=== Open Cline settings + +. 
Open VS Code +. Open Command Palette (Cmd+Shift+P or Ctrl+Shift+P) +. Search for `Cline: Open Settings` +. Select `Cline: Open Settings` + +Alternatively, click the gear icon in the Cline sidebar panel. + +=== Configure API provider + +In the Cline settings interface: + +. Navigate to *API Configuration* section +. Select *API Provider*: `OpenAI Compatible` +. Set *Base URL*: `https://gw.ai.panda.com` +. Set *API Key*: Your Redpanda API key +. Expand *Advanced Settings* +. Add custom headers: ++ +[,json] +---- +{ + "rp-aigw-id": "gateway-abc123" +} +---- + +Replace the following values: + +* `https://gw.ai.panda.com` - Your gateway's base URL (without `/v1` suffix) +* `gateway-abc123` - Your gateway ID from the AI Gateway UI + +=== Select model + +In the *Model* dropdown, enter the model using the `vendor/model_id` format: + +* For Anthropic Claude: `anthropic/claude-sonnet-4-5` +* For OpenAI: `openai/gpt-4o` +* For other providers: `{provider}/{model-name}` + +The gateway routes the request based on this format. If you use a non-prefixed model name (for example, `claude-sonnet-4-5`), the gateway may not route correctly. + +=== Verify configuration + +. Click *Test Connection* in Cline settings +. Verify status shows "Connected" +. Send a test message in the Cline chat panel + +If the connection fails, see <>. + +== Configure MCP server integration + +Connect Cline to your AI Gateway's MCP endpoint to aggregate tools from multiple MCP servers. + +=== Add MCP server connection + +In the Cline settings interface: + +. Navigate to *MCP Servers* section +. Click *Add MCP Server* +. Configure the connection: ++ +[,json] +---- +{ + "name": "redpanda-ai-gateway", + "transport": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_KEY", + "rp-aigw-id": "GATEWAY_ID" + } +} +---- + +Replace placeholder values: + +* `YOUR_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID + +=== Enable tool discovery + +After adding the MCP server: + +. Click *Refresh Tools* to discover available tools +. Verify that tools from your configured MCP servers appear in the tool list +. If using deferred tool loading, you'll see a search tool and MCP orchestrator tool instead of all tools upfront + +Tools are now available for Cline to use autonomously during coding sessions. + +=== Alternative: Manual configuration file + +For more control, edit the VS Code settings directly: + +. Open VS Code settings (Cmd+, or Ctrl+,) +. Search for `cline.mcpServers` +. Click *Edit in settings.json* +. Add the MCP server configuration: ++ +[,json] +---- +{ + "cline.mcpServers": [ + { + "name": "redpanda-ai-gateway", + "transport": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_KEY", + "rp-aigw-id": "GATEWAY_ID" + } + } + ] +} +---- + +Restart VS Code for changes to take effect. + +== Configure autonomous mode settings + +Optimize Cline's autonomous behavior when using AI Gateway. + +=== Set approval mode + +Control how often Cline requires your approval during autonomous tasks: + +[cols="1,2,2"] +|=== +|Mode |Behavior |Best for + +|*Always ask* +|Request approval for every action +|Testing, sensitive codebases, cost control + +|*Ask before terminal commands* +|Auto-approve file edits, ask for commands +|Trusted environments, faster iteration + +|*Autonomous* +|Complete tasks without interruption +|Well-scoped tasks, batch processing +|=== + +To set approval mode: + +. Open Cline settings +. Navigate to *Autonomous Mode* +. 
Select your preferred mode + +When using AI Gateway with spend limits, autonomous mode is safer because the gateway enforces budget controls even if Cline makes many requests. + +=== Configure custom instructions + +Add custom instructions to guide Cline's behavior and reduce token costs: + +. Open Cline settings +. Navigate to *Custom Instructions* +. Add instructions that reduce unnecessary requests: ++ +[,text] +---- +- Before making changes, analyze the codebase structure first +- Use existing code patterns instead of creating new ones +- Ask for clarification before large refactors +- Prefer small, incremental changes over complete rewrites +- Use MCP tools for research instead of multiple LLM calls +---- + +These instructions help Cline work more efficiently and reduce token usage. + +=== Enable browser integration + +Cline can use a browser to research documentation, which reduces the need for large context windows: + +. Open Cline settings +. Navigate to *Browser Integration* +. Enable *Allow Browser Access* +. Configure browser mode: +** *Headless* - Faster, lower resource usage +** *Visible* - See what Cline is browsing (useful for debugging) + +Browser integration is particularly useful with AI Gateway because: + +* Cline can look up current documentation instead of relying on outdated training data +* Reduces prompt token costs from pasting documentation into context +* Works with MCP tools that fetch web content + +== Verify configuration + +After configuring Cline, verify it connects correctly to your AI Gateway. + +=== Test LLM routing + +Send a test message in the Cline chat panel: + +. Open the Cline sidebar in VS Code +. Type a simple request: "Explain this file" (with a file open) +. Wait for response + +Then verify in the AI Gateway dashboard: + +. Open the Redpanda Cloud Console +. Navigate to your gateway's observability dashboard +. Filter by gateway ID +. Verify: +** Request appears in logs +** Model shows correct format (for example, `anthropic/claude-sonnet-4-5`) +** Token usage and cost are recorded + +If the request doesn't appear, see <>. + +=== Test MCP tool usage + +Verify Cline can discover and invoke MCP tools: + +. In the Cline chat, request a task that requires a tool +. For example: "Use the weather tool to check the forecast" +. Cline should: +** Discover the tool from the MCP server +** Invoke it with correct parameters +** Return the result + +Check the gateway dashboard for MCP tool invocation logs. + +=== Monitor token costs + +Track Cline's token usage to identify optimization opportunities: + +. Open the AI Gateway observability dashboard +. Filter by your gateway +. View metrics: +** Requests per hour +** Token usage per request (prompt + completion) +** Estimated cost per request + +High token costs may indicate: + +* Context windows that are too large (Cline includes many files unnecessarily) +* Repeated requests for the same information (use custom instructions to prevent this) +* Missing MCP tools that could replace multi-turn conversations + +== Advanced configuration + +=== Model selection strategies + +Different models have different cost and performance characteristics. Configure Cline to use the right model for each task: + +==== Strategy 1: Single high-quality model + +Use one premium model for all tasks. 
+ +Configuration: + +* Model: `anthropic/claude-sonnet-4-5` +* Best for: Complex codebases, high-quality output requirements +* Cost: Higher, but consistent + +==== Strategy 2: Task-based model switching + +Use the gateway's CEL routing to automatically select models based on task complexity. + +Gateway configuration (set in AI Gateway UI): + +[,cel] +---- +// Route simple edits to cost-effective model +request.messages[0].content.contains("fix typo") || +request.messages[0].content.contains("rename") ? + "anthropic/claude-haiku" : + "anthropic/claude-sonnet-4-5" +---- + +This approach requires no changes to Cline configuration. The gateway makes routing decisions transparently. + +==== Strategy 3: Multiple Cline profiles + +Create separate VS Code workspace settings for different projects: + +.Project A (high complexity) +[,json] +---- +{ + "cline.apiProvider": "OpenAI Compatible", + "cline.baseURL": "https://gw.ai.panda.com", + "cline.model": "anthropic/claude-opus-4-5" +} +---- + +.Project B (simple tasks) +[,json] +---- +{ + "cline.apiProvider": "OpenAI Compatible", + "cline.baseURL": "https://gw.ai.panda.com", + "cline.model": "anthropic/claude-haiku" +} +---- + +=== Request timeout configuration + +For long-running tool executions or complex code generation: + +. Open VS Code settings +. Search for `cline.requestTimeout` +. Set timeout in milliseconds (default: 60000) ++ +[,json] +---- +{ + "cline.requestTimeout": 120000 +} +---- + +Increase this value if Cline times out during large refactoring tasks or when using slow MCP tools. + +=== Debug mode + +Enable debug logging to troubleshoot connection issues: + +. Open VS Code settings +. Search for `cline.debug` +. Enable debug mode: ++ +[,json] +---- +{ + "cline.debug": true +} +---- + +Debug logs appear in the VS Code Output panel: + +. Open Output panel (View → Output) +. Select "Cline" from the dropdown +. View HTTP request and response details + +Debug mode shows: + +* Full request and response payloads +* Gateway routing headers +* MCP tool discovery messages +* Error details + +=== Environment-based configuration + +Use different gateways for different environments without changing settings manually. + +Create workspace-specific configurations: + +.Development workspace (.vscode/settings.json) +[,json] +---- +{ + "cline.apiProvider": "OpenAI Compatible", + "cline.baseURL": "${GATEWAY_DEV_URL}", + "cline.customHeaders": { + "rp-aigw-id": "${GATEWAY_DEV_ID}" + } +} +---- + +.Production workspace (.vscode/settings.json) +[,json] +---- +{ + "cline.apiProvider": "OpenAI Compatible", + "cline.baseURL": "${GATEWAY_PROD_URL}", + "cline.customHeaders": { + "rp-aigw-id": "${GATEWAY_PROD_ID}" + } +} +---- + +Set environment variables before launching VS Code: + +[,bash] +---- +export GATEWAY_DEV_URL="https://gw.staging.ai.panda.com" +export GATEWAY_DEV_ID="staging-gateway-123" +export GATEWAY_PROD_URL="https://gw.ai.panda.com" +export GATEWAY_PROD_ID="prod-gateway-456" +---- + +On Windows (PowerShell): + +[,powershell] +---- +$env:GATEWAY_DEV_URL = "https://gw.staging.ai.panda.com" +$env:GATEWAY_DEV_ID = "staging-gateway-123" +---- + +[[troubleshooting]] +== Troubleshooting + +=== Cline shows "Connection failed" + +**Symptom**: Cline settings show connection failed, or requests return errors. + +**Causes and solutions**: + +. 
**Incorrect base URL** ++ +Verify your base URL does NOT include `/v1` or `/chat/completions`: ++ +[,text] +---- +# Correct +https://gw.ai.panda.com + +# Incorrect +https://gw.ai.panda.com/v1 +https://gw.ai.panda.com/chat/completions +---- ++ +Cline appends the correct path automatically. + +. **Authentication failure** ++ +Verify your API key is valid: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/v1/models +---- ++ +You should receive a list of available models. If you get `401 Unauthorized`, regenerate your API key in the Redpanda Cloud Console. + +. **Gateway ID mismatch** ++ +Check that the `rp-aigw-id` header matches your gateway ID exactly (case-sensitive). Copy it directly from the AI Gateway UI. + +. **Network connectivity issues** ++ +Test basic connectivity: ++ +[,bash] +---- +curl -I https://gw.ai.panda.com +---- ++ +If this times out, check your network configuration, firewall rules, or VPN connection. + +=== MCP tools not appearing + +**Symptom**: Cline doesn't see tools from the MCP server, or tool discovery fails. + +**Causes and solutions**: + +. **MCP endpoint incorrect** ++ +Verify the MCP endpoint is correct. It should be `{gateway-url}/mcp`, not just `{gateway-url}`: ++ +[,text] +---- +# Correct +https://gw.ai.panda.com/mcp + +# Incorrect +https://gw.ai.panda.com +---- + +. **No MCP servers configured in gateway** ++ +Verify your gateway has at least one MCP server enabled in the AI Gateway UI. + +. **Deferred tool loading enabled** ++ +If deferred tool loading is enabled, you'll see only a search tool initially. This is expected behavior. Tools load on-demand when Cline needs them. + +. **MCP server unreachable** ++ +Test the MCP endpoint directly: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/mcp +---- ++ +You should receive a valid MCP protocol response listing available tools. + +=== Requests not appearing in gateway dashboard + +**Symptom**: Cline works, but you don't see requests in the AI Gateway observability dashboard. + +**Causes and solutions**: + +. **Wrong gateway configured** ++ +Verify that the `rp-aigw-id` header in your Cline configuration matches the gateway you're viewing in the dashboard. + +. **Using direct provider connection** ++ +If you configured Cline with a provider's API directly (not the gateway URL), requests won't route through the gateway. Verify the base URL is your gateway endpoint. + +. **Log ingestion delay** ++ +Gateway logs can take 5-10 seconds to appear in the dashboard. Wait briefly and refresh. + +. **Model name format error** ++ +Ensure requests use the `vendor/model_id` format (for example, `anthropic/claude-sonnet-4-5`), not just the model name (for example, `claude-sonnet-4-5`). Check the model field in Cline settings. + +=== High token costs + +**Symptom**: Cline uses more tokens than expected, resulting in high costs. + +**Causes and solutions**: + +. **Large context windows** ++ +Cline may be including too many files in the context. Solutions: ++ +* Use custom instructions to limit file inclusion +* Create a `.clineignore` file to exclude unnecessary files +* Break large tasks into smaller, focused subtasks + +. **Repeated requests** ++ +Cline may be making redundant requests for the same information. 
Solutions: ++ +* Add custom instructions to prevent repeated analysis +* Use MCP tools to fetch external information instead of asking the LLM +* Enable caching in the gateway (if available) + +. **Wrong model selected** ++ +You may be using a premium model for simple tasks. Solutions: ++ +* Switch to a cost-effective model (for example, `anthropic/claude-haiku`) ++ +* Use gateway CEL routing to automatically select models based on task complexity + +. **MCP tool overhead** ++ +If not using deferred tool loading, all tools load with every request. Solution: ++ +* Enable deferred tool loading in your AI Gateway configuration (see xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]) + +=== Cline hangs or times out + +**Symptom**: Cline stops responding or shows timeout errors. + +**Causes and solutions**: + +. **Request timeout too low** ++ +Increase the timeout in VS Code settings: ++ +[,json] +---- +{ + "cline.requestTimeout": 120000 +} +---- + +. **Long-running MCP tool** ++ +Some MCP tools take time to execute. Check the gateway observability dashboard to see if tool execution is slow. + +. **Gateway rate limiting** ++ +You may be hitting rate limits. Check the dashboard for rate limit metrics and increase limits if needed. + +. **Provider outage** ++ +Check the AI Gateway dashboard for provider status. If the primary provider is down, configure failover (see xref:ai-agents:ai-gateway/quickstart-enhanced.adoc#step-6-configure-provider-pool-with-fallback[Configure failover]). + +=== Settings changes not taking effect + +**Symptom**: Changes to Cline settings or VS Code configuration don't apply. + +**Solutions**: + +. **Reload VS Code** ++ +Some settings require reloading: ++ +* Open Command Palette (Cmd+Shift+P or Ctrl+Shift+P) +* Search for `Developer: Reload Window` +* Select and confirm + +. **Workspace settings override** ++ +Check if workspace settings (`.vscode/settings.json`) override user settings. Workspace settings take precedence. + +. **Invalid JSON syntax** ++ +If editing `settings.json` manually, validate JSON syntax. VS Code shows syntax errors in the editor. + +== Cost optimization tips + +=== Use the right model for each task + +Match model selection to task complexity: + +[cols="1,2,1"] +|=== +|Task type |Recommended model |Reason + +|Simple edits (typos, renames) +|`anthropic/claude-haiku` +|Low cost, fast + +|Code review, analysis +|`anthropic/claude-sonnet-3.5` +|Balanced quality and cost + +|Complex refactors, architecture +|`anthropic/claude-sonnet-4-5` or `anthropic/claude-opus-4-5` +|High quality for critical work +|=== + +Configure CEL routing in the gateway to automate model selection. + +=== Reduce context window size + +Limit the number of files Cline includes in requests: + +. Create a `.clineignore` file in your workspace root: ++ +[,text] +---- +# Exclude build artifacts +dist/ +build/ +node_modules/ + +# Exclude test files when not testing +**/*.test.js +**/*.spec.ts + +# Exclude documentation +docs/ +*.md +---- + +. 
Use custom instructions to guide file selection: ++ +[,text] +---- +- Only include files directly related to the task +- Ask which files to include if unsure +- Exclude test files unless specifically working on tests +---- + +=== Use MCP tools instead of large prompts + +Replace long documentation pastes with MCP tools: + +Before (high token cost): + +* User pastes API documentation into Cline chat +* Cline uses documentation to write integration code +* Thousands of tokens used for documentation + +After (low token cost): + +* Configure an MCP tool that searches API documentation +* Cline queries the tool for specific information as needed +* Only relevant sections included in context + +See xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[] for MCP tool configuration. + +=== Enable deferred tool loading + +If using multiple MCP servers, enable deferred tool loading in your gateway configuration to reduce token costs by 80-90%. + +This loads only essential tools initially. Cline queries for additional tools on-demand. + +=== Monitor and set spend limits + +Use AI Gateway spend limits to prevent runaway costs: + +. Navigate to your gateway in the Redpanda Cloud Console +. Set monthly spend limit (for example, $500/month) +. Configure alerts before reaching limit + +The gateway automatically blocks requests that would exceed the limit. + +== Next steps + +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Configure deferred tool loading to reduce token costs +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor Cline requests in the gateway dashboard +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Use CEL expressions to route Cline requests based on task complexity + +== Related pages + +* xref:ai-agents:ai-gateway/ai-gateway.adoc[]: Create and configure your AI Gateway +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[]: Learn about AI Gateway architecture and benefits +* xref:ai-agents:ai-gateway/integrations/claude-code-user.adoc[]: Configure Claude Code with AI Gateway diff --git a/modules/ai-agents/pages/ai-gateway/integrations/continue-admin.adoc b/modules/ai-agents/pages/ai-gateway/integrations/continue-admin.adoc new file mode 100644 index 00000000..4ed0230e --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/continue-admin.adoc @@ -0,0 +1,756 @@ += Configure AI Gateway for Continue.dev +:description: Configure Redpanda AI Gateway to support Continue.dev clients. +:page-topic-type: how-to +:page-personas: platform_admin +:learning-objective-1: Configure AI Gateway endpoints for Continue.dev connectivity +:learning-objective-2: Set up multi-provider backends with native format routing +:learning-objective-3: Deploy MCP tool aggregation for Continue.dev tool discovery + +Configure Redpanda AI Gateway to support Continue.dev clients accessing multiple LLM providers and MCP tools through flexible, native-format endpoints. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +* AI Gateway deployed on a BYOC cluster running Redpanda version 25.3 or later +* Administrator access to the AI Gateway UI +* API keys for at least one LLM provider (Anthropic, OpenAI, or others) +* Understanding of xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[AI Gateway concepts] + +== About Continue.dev + +Continue.dev is a highly configurable open-source AI coding assistant that integrates with VS Code and JetBrains IDEs. 
Unlike other AI assistants, Continue.dev uses native provider API formats rather than requiring transforms to a unified format. This architectural choice provides maximum flexibility but requires specific gateway configuration. + +Key characteristics: + +* Uses native provider formats (Anthropic format for Anthropic, OpenAI format for OpenAI) +* Supports multiple LLM providers simultaneously with per-provider configuration +* Custom API endpoints via `apiBase` configuration +* Custom headers via `requestOptions.headers` +* Built-in MCP support for tool discovery and execution +* Autocomplete, chat, and inline edit modes + +== Architecture overview + +Continue.dev connects to AI Gateway differently than unified-format clients: + +* Each provider requires a separate backend configured without format transforms +* LLM endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/{provider}` (provider-specific paths) +* MCP endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp` for tool discovery and execution + +The gateway handles: + +. Authentication via bearer tokens in the `Authorization` header +. Gateway selection via the `rp-aigw-id` header +. Provider-specific request formats without transformation +. Model routing using provider-native model identifiers +. MCP server aggregation for multi-tool workflows +. Request logging and cost tracking per gateway + +== Enable LLM providers + +Continue.dev works with multiple providers. Enable the providers your users will access. + +=== Configure Anthropic + +To enable Anthropic with native format support: + +. Navigate to *AI Gateway* > *Providers* in the Redpanda Cloud console +. Select *Anthropic* from the provider list +. Click *Add configuration* +. Enter your Anthropic API key +. Under *Format*, select *Native Anthropic* (not OpenAI-compatible) +. Click *Save* + +The gateway now accepts Anthropic's native `/v1/messages` format. + +=== Configure OpenAI + +To enable OpenAI: + +. Navigate to *AI Gateway* > *Providers* +. Select *OpenAI* from the provider list +. Click *Add configuration* +. Enter your OpenAI API key +. Under *Format*, select *Native OpenAI* +. Click *Save* + +=== Configure additional providers + +Continue.dev supports many providers. For each provider: + +. Add the provider configuration in the gateway +. Ensure the format is set to the provider's native format +. Do not enable format transforms (Continue.dev handles format differences in its client code) + +Common additional providers: + +* Google Gemini (native Google format) +* Mistral AI (OpenAI-compatible format) +* Together AI (OpenAI-compatible format) +* Ollama (OpenAI-compatible format for local models) + +=== Enable models in the catalog + +After enabling providers, enable specific models: + +. Navigate to *AI Gateway* > *Models* +. Enable the models you want Continue.dev clients to access ++ +Common models for Continue.dev: ++ +* `claude-opus-4-5` (Anthropic, high quality) +* `claude-sonnet-4-5` (Anthropic, balanced) +* `gpt-4o` (OpenAI, high quality) +* `gpt-4o-mini` (OpenAI, fast autocomplete) +* `o1-mini` (OpenAI, reasoning) + +. Click *Save* + +Continue.dev uses provider-native model identifiers (for example, `claude-sonnet-4-5` not `anthropic/claude-sonnet-4-5`). + +== Create a gateway for Continue.dev clients + +Create a dedicated gateway to isolate Continue.dev traffic and apply specific policies. + +=== Gateway configuration + +. Navigate to *AI Gateway* > *Gateways* +. Click *Create Gateway* +. 
Enter gateway details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`continue-gateway` (or your preferred name) + +|Workspace +|Select the workspace for access control grouping + +|Description +|Gateway for Continue.dev IDE clients +|=== + +. Click *Create* +. Copy the gateway ID from the gateway details page + +The gateway ID is required in the `rp-aigw-id` header for all requests. + +=== Configure provider-specific backends + +Continue.dev requires separate backend configurations for each provider because it uses native formats. + +==== Anthropic backend + +. Navigate to the gateway's *Backends* tab +. Click *Add Backend* +. Configure: ++ +[cols="1,2"] +|=== +|Field |Value + +|Backend name +|`anthropic-native` + +|Provider +|Anthropic + +|Format +|Native Anthropic (no transform) + +|Path +|`/v1/anthropic` + +|Enabled models +|All Anthropic models you enabled in the catalog +|=== + +. Click *Save* + +Continue.dev will send requests to `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/anthropic` using Anthropic's native format. + +==== OpenAI backend + +. Click *Add Backend* +. Configure: ++ +[cols="1,2"] +|=== +|Field |Value + +|Backend name +|`openai-native` + +|Provider +|OpenAI + +|Format +|Native OpenAI (no transform) + +|Path +|`/v1/openai` + +|Enabled models +|All OpenAI models you enabled in the catalog +|=== + +. Click *Save* + +Continue.dev will send requests to `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/openai` using OpenAI's native format. + +==== Additional provider backends + +Repeat the backend configuration process for each provider: + +* Google Gemini: `/v1/google`, native Google format +* Mistral: `/v1/mistral`, OpenAI-compatible format +* Ollama (if proxying local models): `/v1/ollama`, OpenAI-compatible format + +=== Configure LLM routing + +Set up routing policies for Continue.dev requests. + +==== Per-provider routing + +Configure routing rules that apply to each backend: + +. Navigate to the gateway's *Routing* tab +. For each backend, click *Add Route* +. Configure basic routing: ++ +[source,cel] +---- +true # Matches all requests to this backend +---- + +. Add a primary provider configuration with your Anthropic API key +. (Optional) Add a fallback configuration for redundancy if you have multiple API keys +. Click *Save* + +==== Provider failover + +For providers with multiple API keys, configure failover: + +. In the backend's routing configuration, add multiple provider configurations +. Set failover conditions: ++ +* Rate limits (HTTP 429) +* Timeouts (no response within 30 seconds) +* 5xx errors (provider unavailable) + +. Configure load balancing: Round robin across available keys +. Click *Save* + +Continue.dev requests automatically fail over to healthy API keys when the primary key experiences issues. + +=== Apply rate limits + +Prevent runaway usage from Continue.dev clients: + +. Navigate to the gateway's *Rate Limits* tab +. Configure global limits: ++ +[cols="1,2"] +|=== +|Setting |Recommended Value + +|Global rate limit +|200 requests per minute (Continue.dev autocomplete can generate many requests) + +|Per-user rate limit +|20 requests per minute (if using user identification headers) + +|Per-backend limits +|Vary by provider (autocomplete backends need higher limits) +|=== + +. Click *Save* + +The gateway blocks requests exceeding these limits and returns HTTP 429 errors. + +==== Rate limit considerations for autocomplete + +Continue.dev's autocomplete feature generates frequent, short requests. 
Configure higher rate limits for autocomplete-specific backends: + +* Autocomplete models (for example, `gpt-4o-mini`): 100 requests per minute per user +* Chat models (for example, `claude-sonnet-4-5`): 20 requests per minute per user + +=== Set spending limits + +Control LLM costs across all providers: + +. Navigate to the gateway's *Spend Limits* tab +. Configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Monthly budget +|$10,000 (adjust based on expected usage) + +|Enforcement +|Block requests after budget exceeded + +|Alert threshold +|80% of budget (sends notification) +|=== + +. Click *Save* + +The gateway tracks estimated costs per request across all providers and blocks traffic when the monthly budget is exhausted. + +== Configure MCP tool aggregation + +Enable Continue.dev to discover and use tools from multiple MCP servers through a single endpoint. + +=== Add MCP servers + +. Navigate to the gateway's *MCP* tab +. Click *Add MCP Server* +. Enter server details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Display name +|Descriptive name (for example, `redpanda-data-catalog`, `code-search-tools`) + +|Endpoint URL +|MCP server endpoint (for example, xref:ai-agents:mcp/remote/overview.adoc[Remote MCP server] URL) + +|Authentication +|Bearer token or other authentication mechanism +|=== + +. Click *Save* + +Repeat for each MCP server you want to aggregate. + +=== Enable deferred tool loading + +Reduce token costs for Continue.dev sessions with many available tools: + +. Under *MCP Settings*, enable *Deferred tool loading* +. Click *Save* + +When enabled: + +* Continue.dev initially receives only a search tool and orchestrator tool +* Continue.dev queries for specific tools by name when needed +* Token usage decreases by 80-90% for configurations with many tools + +This is particularly important for Continue.dev because autocomplete and chat modes both use tool discovery. + +=== Add the MCP orchestrator + +The MCP orchestrator reduces multi-step workflows to single calls: + +. Under *MCP Settings*, enable *MCP Orchestrator* +. Configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Orchestrator model +|Select a model with strong code generation capabilities (for example, `claude-sonnet-4-5`) + +|Execution timeout +|30 seconds + +|Backend +|Select the Anthropic backend (orchestrator works best with Claude models) +|=== + +. Click *Save* + +Continue.dev can now invoke the orchestrator tool to execute complex, multi-step operations in a single request. + +== Configure authentication + +Continue.dev clients authenticate using bearer tokens. + +=== Generate API tokens + +. Navigate to *Security* > *API Tokens* in the Redpanda Cloud console +. Click *Create Token* +. Enter token details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`continue-access` + +|Scopes +|`ai-gateway:read`, `ai-gateway:write` + +|Expiration +|Set appropriate expiration based on security policies +|=== + +. Click *Create* +. Copy the token (it appears only once) + +Distribute this token to Continue.dev users through secure channels. + +=== Token rotation + +Implement token rotation for security: + +. Create a new token before the existing token expires +. Distribute the new token to users +. Monitor usage of the old token in xref:ai-agents:ai-gateway/observability-logs.adoc[request logs] +. Revoke the old token after all users have migrated + +== Configure Continue.dev clients + +Provide these instructions to users configuring Continue.dev in their IDE. 
+ +=== Configuration file location + +Continue.dev uses a JSON configuration file: + +* VS Code: `~/.continue/config.json` +* JetBrains: `~/.continue/config.json` + +=== Multi-provider configuration + +Users configure Continue.dev with separate provider entries for each backend: + +[source,json] +---- +{ + "models": [ + { + "title": "Claude Sonnet (Redpanda)", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiBase": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/anthropic", + "apiKey": "YOUR_API_TOKEN", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + { + "title": "GPT-4o (Redpanda)", + "provider": "openai", + "model": "gpt-4o", + "apiBase": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/openai", + "apiKey": "YOUR_API_TOKEN", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + { + "title": "GPT-4o-mini (Autocomplete)", + "provider": "openai", + "model": "gpt-4o-mini", + "apiBase": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/openai", + "apiKey": "YOUR_API_TOKEN", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ], + "tabAutocompleteModel": { + "title": "GPT-4o-mini (Autocomplete)", + "provider": "openai", + "model": "gpt-4o-mini", + "apiBase": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/openai", + "apiKey": "YOUR_API_TOKEN", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } +} +---- + +Replace: + +* `{CLUSTER_ID}`: Your Redpanda cluster ID +* `YOUR_API_TOKEN`: The API token generated earlier +* `GATEWAY_ID`: The gateway ID from gateway creation + +=== MCP server configuration + +Configure Continue.dev to connect to the aggregated MCP endpoint: + +[source,json] +---- +{ + "experimental": { + "modelContextProtocolServers": [ + { + "transport": { + "type": "http", + "url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_TOKEN", + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ] + } +} +---- + +This configuration: + +* Connects Continue.dev to the aggregated MCP endpoint +* Routes LLM requests through provider-specific backends +* Includes authentication and gateway identification headers + +=== Model selection strategy + +Configure different models for different Continue.dev modes: + +[cols="1,2,1"] +|=== +|Mode |Recommended Model |Reason + +|Chat +|`claude-sonnet-4-5` or `gpt-4o` +|High quality for complex questions + +|Autocomplete +|`gpt-4o-mini` +|Fast, cost-effective for frequent requests + +|Inline edit +|`claude-sonnet-4-5` +|Balanced quality and speed for code modifications + +|Embeddings +|`text-embedding-3-small` +|Cost-effective for code search +|=== + +== Monitor Continue.dev usage + +Track Continue.dev activity through gateway observability features. + +=== View request logs + +. Navigate to *AI Gateway* > *Observability* > *Logs* +. Filter by gateway ID: `continue-gateway` +. Review: ++ +* Request timestamps and duration +* Backend and model used per request +* Token usage (prompt and completion tokens) +* Estimated cost per request +* HTTP status codes and errors + +Continue.dev generates different request patterns: + +* Autocomplete: Many short requests with low token counts +* Chat: Longer requests with context and multi-turn conversations +* Inline edit: Medium-length requests with code context + +=== Analyze metrics + +. Navigate to *AI Gateway* > *Observability* > *Metrics* +. Select the Continue.dev gateway +. 
Review: ++ +[cols="1,2"] +|=== +|Metric |Purpose + +|Request volume by backend +|Identify which providers are most used + +|Token usage by model +|Track consumption patterns (autocomplete vs chat) + +|Estimated spend by backend +|Monitor costs across providers + +|Latency (p50, p95, p99) by backend +|Detect provider-specific performance issues + +|Error rate by backend +|Identify failing providers or misconfigured backends +|=== + +For detailed metrics configuration, see xref:ai-agents:ai-gateway/observability-metrics.adoc[]. + +=== Query logs via API + +Programmatically access logs for integration with monitoring systems: + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/logs \ + -H "Authorization: Bearer YOUR_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "gateway_id": "GATEWAY_ID", + "start_time": "2026-01-01T00:00:00Z", + "end_time": "2026-01-14T23:59:59Z", + "limit": 100 + }' +---- + +== Security considerations + +Apply these security best practices for Continue.dev deployments. + +=== Limit token scope + +Create tokens with minimal required scopes: + +* `ai-gateway:read`: Required for MCP tool discovery +* `ai-gateway:write`: Required for LLM requests and tool execution + +Avoid granting broader scopes like `admin` or `cluster:write`. + +=== Implement network restrictions + +If Continue.dev clients connect from known networks, configure network policies: + +. Use cloud provider security groups to restrict access to AI Gateway endpoints +. Allowlist only the IP ranges where Continue.dev clients operate +. Monitor for unauthorized access attempts in request logs + +=== Enforce token expiration + +Set short token lifetimes for high-security environments: + +* Development environments: 90 days +* Production environments: 30 days + +Automate token rotation to reduce manual overhead. + +=== Audit tool access + +Review which MCP tools Continue.dev clients can access: + +. Periodically audit the MCP servers configured in the gateway +. Remove unused or deprecated MCP servers +. Monitor tool execution logs for unexpected behavior + +=== Protect API keys in configuration + +Continue.dev stores the API token in plain text in `config.json`. Remind users to: + +* Never commit `config.json` to version control +* Use file system permissions to restrict access (for example, `chmod 600 ~/.continue/config.json`) +* Rotate tokens if they suspect compromise + +== Troubleshooting + +Common issues and solutions when configuring AI Gateway for Continue.dev. + +=== Continue.dev cannot connect to gateway + +Symptom: Connection errors when Continue.dev tries to discover tools or send LLM requests. + +Causes and solutions: + +* **Invalid gateway ID**: Verify the `rp-aigw-id` header matches the gateway ID from the console +* **Expired token**: Generate a new API token and update the Continue.dev configuration +* **Wrong backend path**: Verify `apiBase` matches the backend path (for example, `/v1/anthropic` not `/v1`) +* **Network connectivity**: Verify the cluster endpoint is accessible from the client network +* **Provider not enabled**: Ensure at least one backend is configured with models enabled + +=== Model not found errors + +Symptom: Continue.dev shows "model not found" or similar errors. 
+ +Causes and solutions: + +* **Model not enabled in catalog**: Enable the model in the gateway's model catalog +* **Model identifier mismatch**: Use provider-native names (for example, `claude-sonnet-4-5` not `anthropic/claude-sonnet-4-5`) +* **Wrong backend for model**: Verify the model is associated with the correct backend (Anthropic models with Anthropic backend) + +=== Format errors or unexpected responses + +Symptom: Responses are malformed or Continue.dev reports format errors. + +Causes and solutions: + +* **Transform enabled on backend**: Ensure backend format is set to native (no OpenAI-compatible transform for Anthropic) +* **Wrong provider for apiBase**: Verify Continue.dev's `provider` field matches the backend's provider +* **Headers not passed**: Confirm `requestOptions.headers` includes `rp-aigw-id` + +=== Autocomplete not working or slow + +Symptom: Autocomplete suggestions don't appear or are delayed. + +Causes and solutions: + +* **Wrong model for autocomplete**: Use a fast model like `gpt-4o-mini` in `tabAutocompleteModel` +* **Rate limits too restrictive**: Increase rate limits for autocomplete backend +* **High backend latency**: Check backend metrics and consider provider failover +* **Token exhaustion**: Verify spending limits haven't been reached + +=== Tools not appearing in Continue.dev + +Symptom: Continue.dev does not discover MCP tools. + +Causes and solutions: + +* **MCP configuration missing**: Ensure `experimental.modelContextProtocolServers` is configured +* **MCP servers not configured in gateway**: Add MCP server endpoints in the gateway's MCP tab +* **Deferred loading enabled but search failing**: Check that the search tool is correctly configured +* **MCP server authentication failing**: Verify MCP server authentication credentials in the gateway configuration + +=== High costs or token usage + +Symptom: Token usage and costs exceed expectations. + +Causes and solutions: + +* **Autocomplete using expensive model**: Configure `tabAutocompleteModel` to use `gpt-4o-mini` instead of larger models +* **Deferred tool loading disabled**: Enable deferred tool loading to reduce tokens by 80-90% +* **No rate limits**: Apply per-minute rate limits to prevent runaway usage +* **Missing spending limits**: Set monthly budget limits with blocking enforcement +* **Chat using wrong model**: Route chat requests to cost-effective models (for example, `claude-sonnet-4-5` instead of `claude-opus-4-5`) + +=== Requests failing with 429 errors + +Symptom: Continue.dev receives HTTP 429 Too Many Requests errors. + +Causes and solutions: + +* **Rate limit exceeded**: Review and increase rate limits if usage is legitimate (autocomplete needs higher limits) +* **Upstream provider rate limits**: Check if the upstream LLM provider is rate-limiting; configure failover to alternate API keys +* **Budget exhausted**: Verify monthly spending limit has not been reached + +=== Different results from different providers + +Symptom: Same prompt produces different results when switching providers. 
+ +This is expected behavior, not a configuration issue: + +* Different models have different capabilities and response styles +* Continue.dev uses native formats, which may include provider-specific parameters +* Users should select the appropriate model for their task (quality vs speed vs cost) + +== Next steps + +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Analyze detailed request logs +* xref:ai-agents:ai-gateway/observability-metrics.adoc[]: Set up metrics dashboards +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Implement advanced routing rules +* xref:ai-agents:mcp/remote/overview.adoc[]: Deploy Remote MCP servers for custom tools diff --git a/modules/ai-agents/pages/ai-gateway/integrations/continue-user.adoc b/modules/ai-agents/pages/ai-gateway/integrations/continue-user.adoc new file mode 100644 index 00000000..3a7efbfa --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/continue-user.adoc @@ -0,0 +1,946 @@ += Configure Continue.dev with AI Gateway +:description: Configure Continue.dev to use Redpanda AI Gateway for unified LLM access, MCP tool integration, and AI-assisted coding. +:page-topic-type: how-to +:page-personas: ai_agent_developer, app_developer +:learning-objective-1: Configure Continue.dev to connect to AI Gateway for chat and autocomplete +:learning-objective-2: Set up MCP server integration through AI Gateway +:learning-objective-3: Optimize Continue.dev settings for cost and performance + +After xref:ai-agents:ai-gateway/ai-gateway.adoc[configuring your AI Gateway], set up Continue.dev to route LLM requests and access MCP tools through the gateway's unified endpoints. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +Before configuring Continue.dev, ensure you have: + +* Continue.dev extension installed in your code editor: +** VS Code: Search for "Continue" in Extensions +** JetBrains IDEs: Install from the JetBrains Marketplace +* An active Redpanda AI Gateway with: +** At least one LLM provider enabled (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-1-enable-a-provider[Enable a provider]) +** A gateway created and configured (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-3-create-a-gateway[Create a gateway]) +* Your AI Gateway credentials: +** Gateway endpoint URL (for example, `https://gw.ai.panda.com`) +** Gateway ID (for example, `gateway-abc123`) +** API key with access to the gateway + +== About Continue.dev + +Continue.dev is an open-source AI coding assistant that integrates with VS Code and JetBrains IDEs. It provides: + +* Chat interface for code questions and generation +* Tab autocomplete powered by LLMs +* Codebase indexing for context-aware suggestions +* Slash commands for common workflows +* Extensible architecture with custom context providers + +By routing Continue.dev through AI Gateway, you gain centralized observability, cost controls, and the ability to aggregate multiple MCP servers into a single interface. 
+ +== Configuration file location + +Continue.dev stores configuration in `config.json`: + +* VS Code: `~/.continue/config.json` +* JetBrains: `~/.continue/config.json` (same location) + +Create the directory if it doesn't exist: + +[,bash] +---- +mkdir -p ~/.continue +---- + +== Basic configuration + +Create or edit `~/.continue/config.json` with the following structure to connect to AI Gateway: + +[,json] +---- +{ + "models": [ + { + "title": "Redpanda AI Gateway - Claude", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ] +} +---- + +Replace placeholder values: + +* `YOUR_REDPANDA_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID from the AI Gateway UI + +The `provider` field tells Continue.dev which SDK to use (Anthropic format), while `apiBase` routes the request through your gateway. The gateway then forwards the request to the appropriate provider based on the model name. + +== Configure multiple models + +Continue.dev can switch between different models for different tasks. Configure multiple models to optimize for quality and cost: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet (default)", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + { + "title": "Gateway - Claude Opus (complex tasks)", + "provider": "anthropic", + "model": "claude-opus-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + { + "title": "Gateway - GPT-4o", + "provider": "openai", + "model": "gpt-4o", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ] +} +---- + +Switch between models in Continue.dev's chat interface by clicking the model selector dropdown. + +== Configure tab autocomplete + +Continue.dev supports a separate model for tab autocomplete, which generates code suggestions as you type. Use a faster, cost-effective model for autocomplete: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ], + "tabAutocompleteModel": { + "title": "Gateway - Claude Haiku (autocomplete)", + "provider": "anthropic", + "model": "claude-haiku", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } +} +---- + +This configuration uses Claude Sonnet for chat interactions and Claude Haiku for autocomplete. Haiku provides faster responses at lower cost, which is ideal for autocomplete where speed matters more than reasoning depth. + +== Configure with OpenAI provider format + +AI Gateway supports both native provider formats and OpenAI-compatible format. 
If you prefer using the OpenAI format for all models, configure Continue.dev with the `openai` provider: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet (OpenAI format)", + "provider": "openai", + "model": "anthropic/claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com/v1", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + }, + { + "title": "Gateway - GPT-4o (OpenAI format)", + "provider": "openai", + "model": "openai/gpt-4o", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com/v1", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ] +} +---- + +When using OpenAI provider format: + +* Set `provider` to `"openai"` +* Add `/v1` to the `apiBase` URL +* Use the `vendor/model_id` format for model names (for example, `anthropic/claude-sonnet-4-5`) + +== Configure MCP server integration + +Connect Continue.dev to your AI Gateway's MCP endpoint to aggregate tools from multiple MCP servers. + +Add the `experimental` section to `config.json`: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ], + "experimental": { + "modelContextProtocolServers": [ + { + "transport": { + "type": "http", + "url": "https://gw.ai.panda.com/mcp", + "headers": { + "Authorization": "Bearer YOUR_REDPANDA_API_KEY", + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ] + } +} +---- + +After adding this configuration: + +. Restart Continue.dev (reload your editor window) +. Click the tools icon in the Continue.dev sidebar +. Verify that tools from your configured MCP servers appear + +If using deferred tool loading in your gateway, you'll see a search tool and MCP orchestrator tool instead of all tools upfront. 
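If the tools icon stays empty after a reload, it can help to rule out the gateway before digging into editor settings. The following is an illustrative spot-check of the aggregated MCP endpoint using a raw JSON-RPC `tools/list` call; depending on the transport, the gateway may require an MCP `initialize` handshake first, so treat a protocol-level error as inconclusive and use this only as a quick connectivity and authentication check:

[,bash]
----
# Illustrative MCP spot-check (JSON-RPC over HTTP).
# A 401 or 403 points to token or gateway ID problems;
# a protocol error may simply mean an initialize handshake is required first.
curl https://gw.ai.panda.com/mcp \
  -H "Authorization: Bearer YOUR_REDPANDA_API_KEY" \
  -H "rp-aigw-id: GATEWAY_ID" \
  -H "Content-Type: application/json" \
  -H "Accept: application/json, text/event-stream" \
  -d '{"jsonrpc": "2.0", "id": 1, "method": "tools/list"}'
----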
+ +== Configure with environment variables + +For sensitive credentials or multi-environment setups, use environment variables: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "${REDPANDA_API_KEY}", + "apiBase": "${REDPANDA_GATEWAY_URL}", + "requestOptions": { + "headers": { + "rp-aigw-id": "${REDPANDA_GATEWAY_ID}" + } + } + } + ], + "experimental": { + "modelContextProtocolServers": [ + { + "transport": { + "type": "http", + "url": "${REDPANDA_GATEWAY_URL}/mcp", + "headers": { + "Authorization": "Bearer ${REDPANDA_API_KEY}", + "rp-aigw-id": "${REDPANDA_GATEWAY_ID}" + } + } + } + ] + } +} +---- + +Set environment variables before launching your editor: + +[,bash] +---- +export REDPANDA_GATEWAY_URL="https://gw.ai.panda.com" +export REDPANDA_GATEWAY_ID="gateway-abc123" +export REDPANDA_API_KEY="your-api-key" +---- + +On Windows (PowerShell): + +[,powershell] +---- +$env:REDPANDA_GATEWAY_URL = "https://gw.ai.panda.com" +$env:REDPANDA_GATEWAY_ID = "gateway-abc123" +$env:REDPANDA_API_KEY = "your-api-key" +---- + +== Project-level configuration + +Override global settings for specific projects by creating `.continuerc.json` in your project root: + +[,json] +---- +{ + "models": [ + { + "title": "Project Gateway - Claude Haiku", + "provider": "anthropic", + "model": "claude-haiku", + "apiKey": "${PROJECT_API_KEY}", + "apiBase": "https://gw.project.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "${PROJECT_GATEWAY_ID}" + } + } + } + ] +} +---- + +Project-level configuration takes precedence over global configuration. Use this to: + +* Route different projects through different gateways +* Use cost-effective models for internal projects +* Use premium models for customer-facing projects +* Separate billing between projects + +== Verify configuration + +After configuring Continue.dev, verify it connects correctly to your AI Gateway. + +=== Test chat interface + +. Open Continue.dev sidebar in your editor +. Type a simple question: "What does this function do?" (with a file open) +. Wait for response + +Then verify in the AI Gateway dashboard: + +. Open the Redpanda Cloud Console +. Navigate to your gateway's observability dashboard +. Filter by gateway ID +. Verify: +** Request appears in logs +** Model shows correct format (for example, `claude-sonnet-4-5` for Anthropic native or `anthropic/claude-sonnet-4-5` for OpenAI format) +** Token usage and cost are recorded + +If the request doesn't appear, see <>. + +=== Test tab autocomplete + +. Open a code file in your editor +. Start typing a function or class definition +. Wait for autocomplete suggestions to appear + +Autocomplete requests also appear in the gateway dashboard, typically with: + +* Lower token counts than chat requests +* Higher request frequency +* The autocomplete model you configured + +=== Test MCP tool integration + +If you configured MCP servers: + +. Open Continue.dev chat +. Ask a question that requires a tool: "What's the weather forecast?" +. Continue.dev should: +** Discover the tool from the MCP server +** Invoke it with correct parameters +** Return the result + +Check the gateway dashboard for MCP tool invocation logs. 
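If a request sent from the editor never shows up in the dashboard, sending the same request outside the IDE helps separate gateway problems from Continue.dev configuration problems. This is a minimal sketch using the gateway's OpenAI-compatible endpoint, with the same example URL and placeholder values used elsewhere on this page:

[,bash]
----
# Illustrative request through the OpenAI-compatible endpoint.
# If this request appears in the gateway logs but editor requests do not,
# the issue is in the Continue.dev configuration rather than the gateway.
curl https://gw.ai.panda.com/v1/chat/completions \
  -H "Authorization: Bearer YOUR_REDPANDA_API_KEY" \
  -H "rp-aigw-id: GATEWAY_ID" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4-5",
    "messages": [{"role": "user", "content": "Reply with OK"}]
  }'
----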
+ +== Advanced configuration + +=== Custom request headers + +Add custom headers for request tracking or routing: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID", + "x-user-id": "developer-123", + "x-project": "main-app" + } + } + } + ] +} +---- + +Use these headers with gateway CEL routing to: + +* Track costs per developer +* Route based on project type +* Apply different rate limits per user + +=== Temperature and max tokens + +Configure model parameters for different behaviors: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Precise (low temperature)", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + }, + "completionOptions": { + "temperature": 0.2, + "maxTokens": 2048 + } + }, + { + "title": "Gateway - Creative (high temperature)", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + }, + "completionOptions": { + "temperature": 0.8, + "maxTokens": 4096 + } + } + ] +} +---- + +* Lower temperature (0.0-0.3): More deterministic, better for code generation +* Higher temperature (0.7-1.0): More creative, better for brainstorming +* `maxTokens`: Limit response length to control costs + +=== Context providers + +Configure which code context Continue.dev includes in requests: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ], + "contextProviders": [ + { + "name": "code", + "params": { + "maxFiles": 5 + } + }, + { + "name": "diff" + }, + { + "name": "terminal" + } + ] +} +---- + +Available context providers: + +* `code`: Includes open files and highlighted code +* `diff`: Includes git diff of current changes +* `terminal`: Includes recent terminal output +* `problems`: Includes editor warnings and errors +* `folder`: Includes file tree structure + +Limiting context providers reduces token usage and costs. + +=== Slash commands + +Configure custom slash commands for common workflows: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "apiKey": "YOUR_REDPANDA_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } + ], + "slashCommands": [ + { + "name": "review", + "description": "Review code for bugs and improvements", + "prompt": "Review this code for potential bugs, performance issues, and suggest improvements. Focus on:\n- Error handling\n- Edge cases\n- Code clarity\n\n{{{ input }}}" + }, + { + "name": "test", + "description": "Generate unit tests", + "prompt": "Generate comprehensive unit tests for this code. 
Include:\n- Happy path tests\n- Edge case tests\n- Error handling tests\n\n{{{ input }}}" + } + ] +} +---- + +Use slash commands in Continue.dev chat: + +* `/review` - Triggers code review prompt +* `/test` - Generates tests + +Custom commands help standardize prompts across teams and reduce token costs by avoiding repetitive instruction typing. + +[[troubleshooting]] +== Troubleshooting + +=== Continue.dev shows connection error + +**Symptom**: Continue.dev displays "Failed to connect" or requests return errors. + +**Causes and solutions**: + +. **Incorrect apiBase URL** ++ +Verify the URL format matches your provider choice: ++ +[,text] +---- +# Anthropic/native format (no /v1) +"apiBase": "https://gw.ai.panda.com" + +# OpenAI format (with /v1) +"apiBase": "https://gw.ai.panda.com/v1" +---- + +. **Provider mismatch** ++ +Ensure the `provider` field matches the API format you're using: ++ +* Native Anthropic: `"provider": "anthropic"` with no `/v1` in URL +* Native OpenAI: `"provider": "openai"` with `/v1` in URL +* OpenAI-compatible: `"provider": "openai"` with `/v1` in URL + +. **Authentication failure** ++ +Verify your API key is valid: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/v1/models +---- ++ +You should receive a list of available models. If you get `401 Unauthorized`, regenerate your API key in the Redpanda Cloud Console. + +. **Gateway ID mismatch** ++ +Check that the `rp-aigw-id` header matches your gateway ID exactly (case-sensitive). Copy it directly from the AI Gateway UI. + +. **Invalid JSON syntax** ++ +Validate your `config.json` file: ++ +[,bash] +---- +python3 -m json.tool ~/.continue/config.json +---- ++ +Fix any syntax errors reported. + +=== Autocomplete not working + +**Symptom**: Tab autocomplete suggestions don't appear or are very slow. + +**Causes and solutions**: + +. **No autocomplete model configured** ++ +Verify `tabAutocompleteModel` is set in `config.json`. If missing, Continue.dev may fall back to chat model, which is slower and more expensive. + +. **Model too slow** ++ +Use a faster model for autocomplete: ++ +[,json] +---- +{ + "tabAutocompleteModel": { + "title": "Gateway - Claude Haiku", + "provider": "anthropic", + "model": "claude-haiku", + "apiKey": "YOUR_API_KEY", + "apiBase": "https://gw.ai.panda.com", + "requestOptions": { + "headers": { + "rp-aigw-id": "GATEWAY_ID" + } + } + } +} +---- + +. **Network latency** ++ +Check gateway latency in the observability dashboard. If p95 latency is over 500ms, autocomplete will feel slow. Consider: ++ +* Using a gateway in a closer geographic region +* Switching to a faster model (Haiku over Sonnet) + +. **Autocomplete disabled** ++ +Check Continue.dev settings in your editor: ++ +* VS Code: Settings → Continue → Enable Tab Autocomplete +* JetBrains: Settings → Tools → Continue → Enable Autocomplete + +=== MCP tools not appearing + +**Symptom**: Continue.dev doesn't show tools from the MCP server. + +**Causes and solutions**: + +. **MCP configuration missing** ++ +Verify the `experimental.modelContextProtocolServers` section exists in `config.json`. + +. **Incorrect MCP endpoint** ++ +The MCP URL should be `{gateway-url}/mcp`: ++ +[,text] +---- +# Correct +"url": "https://gw.ai.panda.com/mcp" + +# Incorrect +"url": "https://gw.ai.panda.com" +---- + +. **No MCP servers in gateway** ++ +Verify your gateway has at least one MCP server configured in the AI Gateway UI. + +. 
**Deferred tool loading enabled** ++ +If deferred tool loading is enabled, you'll see only a search tool initially. This is expected behavior. + +. **Editor restart needed** ++ +MCP configuration changes require reloading the editor window: ++ +* VS Code: Command Palette → Developer: Reload Window +* JetBrains: File → Invalidate Caches / Restart + +=== Requests not appearing in gateway dashboard + +**Symptom**: Continue.dev works, but requests don't appear in the AI Gateway observability dashboard. + +**Causes and solutions**: + +. **Wrong gateway ID** ++ +Verify that the `rp-aigw-id` header matches the gateway you're viewing in the dashboard. + +. **Missing header** ++ +Ensure the `rp-aigw-id` header is in the `requestOptions.headers` section, not at the top level. + +. **Using direct provider connection** ++ +If `apiBase` points directly to a provider (for example, `https://api.anthropic.com`), requests won't route through the gateway. Verify it points to your gateway endpoint. + +. **Log ingestion delay** ++ +Gateway logs can take 5-10 seconds to appear in the dashboard. Wait briefly and refresh. + +=== High token costs + +**Symptom**: Continue.dev uses more tokens than expected, resulting in high costs. + +**Causes and solutions**: + +. **Too much context included** ++ +Continue.dev may be including too many files. Solutions: ++ +* Limit `maxFiles` in context providers +* Use `.continueignore` file to exclude unnecessary directories +* Close unused editor tabs before using Continue.dev + +. **Autocomplete using expensive model** ++ +Verify you're using a cost-effective model for autocomplete: ++ +[,json] +---- +{ + "tabAutocompleteModel": { + "provider": "anthropic", + "model": "claude-haiku" + } +} +---- + +. **Model parameters too high** ++ +Reduce `maxTokens` in `completionOptions` to limit response length: ++ +[,json] +---- +{ + "completionOptions": { + "maxTokens": 2048 + } +} +---- + +. **MCP overhead** ++ +If not using deferred tool loading, all tools load with every request. Enable deferred tool loading in your AI Gateway configuration (see xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]). + +=== Configuration changes not taking effect + +**Symptom**: Changes to `config.json` don't apply. + +**Solutions**: + +. **Reload editor window** ++ +Configuration changes require reloading: ++ +* VS Code: Command Palette → Developer: Reload Window +* JetBrains: File → Invalidate Caches / Restart + +. **Invalid JSON syntax** ++ +Validate JSON syntax: ++ +[,bash] +---- +python3 -m json.tool ~/.continue/config.json +---- + +. **Project config overriding** ++ +Check if `.continuerc.json` in your project root overrides global settings. + +. **File permissions** ++ +Verify Continue.dev can read the config file: ++ +[,bash] +---- +ls -la ~/.continue/config.json +---- ++ +Fix permissions if needed: ++ +[,bash] +---- +chmod 600 ~/.continue/config.json +---- + +== Cost optimization tips + +=== Use different models for chat and autocomplete + +Chat interactions benefit from reasoning depth, while autocomplete needs speed: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5" + } + ], + "tabAutocompleteModel": { + "title": "Gateway - Claude Haiku", + "provider": "anthropic", + "model": "claude-haiku" + } +} +---- + +This can reduce costs by 5-10x for autocomplete while maintaining quality for chat. 
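The snippet above is trimmed for readability; each entry still needs the same connection fields as the earlier examples. For instance, a complete autocomplete entry looks like this (same placeholder values as before):

[,json]
----
{
  "tabAutocompleteModel": {
    "title": "Gateway - Claude Haiku",
    "provider": "anthropic",
    "model": "claude-haiku",
    "apiKey": "YOUR_REDPANDA_API_KEY",
    "apiBase": "https://gw.ai.panda.com",
    "requestOptions": {
      "headers": {
        "rp-aigw-id": "GATEWAY_ID"
      }
    }
  }
}
----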
+ +=== Limit context window size + +Reduce the amount of code included in requests: + +Create `.continueignore` in your project root: + +[,text] +---- +# Exclude build artifacts +dist/ +build/ +node_modules/ + +# Exclude tests when not working on tests +**/*.test.* +**/*.spec.* + +# Exclude documentation +docs/ +*.md + +# Exclude large data files +*.json +*.csv +---- + +Then limit files in `config.json`: + +[,json] +---- +{ + "contextProviders": [ + { + "name": "code", + "params": { + "maxFiles": 3 + } + } + ] +} +---- + +=== Use MCP tools for documentation + +Instead of pasting documentation into chat, create MCP tools that fetch relevant sections on-demand. This reduces token costs by including only needed information. + +=== Monitor usage patterns + +Use the AI Gateway dashboard to identify optimization opportunities: + +. Navigate to your gateway's observability dashboard +. Filter by Continue.dev requests (use custom header if configured) +. Analyze: +** Token usage per request type (chat vs autocomplete) +** Most expensive queries +** High-frequency low-value requests + +=== Set model-specific limits + +Prevent runaway costs by configuring `maxTokens`: + +[,json] +---- +{ + "models": [ + { + "title": "Gateway - Claude Sonnet", + "provider": "anthropic", + "model": "claude-sonnet-4-5", + "completionOptions": { + "maxTokens": 2048 + } + } + ], + "tabAutocompleteModel": { + "completionOptions": { + "maxTokens": 256 + } + } +} +---- + +Autocomplete rarely needs more than 256 tokens, while chat responses can vary. + +== Next steps + +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Configure deferred tool loading to reduce token costs +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor Continue.dev requests in the gateway dashboard +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Use CEL expressions to route Continue.dev requests based on context + +== Related pages + +* xref:ai-agents:ai-gateway/ai-gateway.adoc[]: Create and configure your AI Gateway +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[]: Learn about AI Gateway architecture and benefits +* xref:ai-agents:ai-gateway/integrations/claude-code-user.adoc[]: Configure Claude Code with AI Gateway +* xref:ai-agents:ai-gateway/integrations/cline-user.adoc[]: Configure Cline with AI Gateway diff --git a/modules/ai-agents/pages/ai-gateway/integrations/cursor-admin.adoc b/modules/ai-agents/pages/ai-gateway/integrations/cursor-admin.adoc new file mode 100644 index 00000000..272218be --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/cursor-admin.adoc @@ -0,0 +1,815 @@ += Configure AI Gateway for Cursor IDE +:description: Configure Redpanda AI Gateway to support Cursor IDE clients. +:page-topic-type: how-to +:page-personas: platform_admin +:learning-objective-1: Configure AI Gateway endpoints for Cursor IDE connectivity +:learning-objective-2: Set up OpenAI-compatible transforms for multi-provider routing +:learning-objective-3: Deploy multi-tenant authentication strategies for Cursor clients + +Configure Redpanda AI Gateway to support Cursor IDE clients accessing multiple LLM providers and MCP tools through OpenAI-compatible endpoints. 
+ +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +* AI Gateway deployed on a BYOC cluster running Redpanda version 25.3 or later +* Administrator access to the AI Gateway UI +* API keys for at least one LLM provider (Anthropic, OpenAI, or others) +* Understanding of xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[AI Gateway concepts] + +== About Cursor IDE + +Cursor is an AI-powered code editor built on VS Code that integrates multiple LLM providers for code completion, chat, and inline editing. Unlike other AI assistants, Cursor uses OpenAI's API format for all providers and routes to different models using a `vendor/model` prefix notation. + +Key characteristics: + +* Sends all requests in OpenAI-compatible format to `/v1/chat/completions` +* Routes using model prefixes (for example, `openai/gpt-4o`, `anthropic/claude-sonnet-4-5`) +* Limited support for custom headers (makes multi-tenant deployments challenging) +* Supports MCP protocol with a 40-tool limit +* Built-in code completion and chat modes +* Configuration via settings file (`~/.cursor/config.json`) + +== Architecture overview + +Cursor IDE connects to AI Gateway through standardized endpoints: + +* LLM endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/chat/completions` for all providers +* MCP endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp` for tool discovery and execution + +The gateway handles: + +. Authentication via bearer tokens in the `Authorization` header +. Gateway selection via the `rp-aigw-id` header (requires workarounds for multi-tenant scenarios) +. Model routing using vendor prefixes (for example, `anthropic/claude-sonnet-4-5`) +. Format transforms from OpenAI format to provider-native formats (for Anthropic, Google, etc.) +. MCP server aggregation for multi-tool workflows +. Request logging and cost tracking per gateway + +== Enable LLM providers + +Cursor IDE works with multiple providers through OpenAI-compatible transforms. Enable the providers your users will access. + +=== Configure Anthropic with OpenAI-compatible format + +Cursor sends OpenAI-formatted requests but can route to Anthropic models. Configure the gateway to transform these requests: + +. Navigate to *AI Gateway* > *Providers* in the Redpanda Cloud console +. Select *Anthropic* from the provider list +. Click *Add configuration* +. Enter your Anthropic API key +. Under *Format*, select *OpenAI-compatible* (enables automatic transform) +. Click *Save* + +The gateway now transforms OpenAI-format requests to Anthropic's native `/v1/messages` format. + +=== Configure OpenAI + +To enable OpenAI as a provider: + +. Navigate to *AI Gateway* > *Providers* +. Select *OpenAI* from the provider list +. Click *Add configuration* +. Enter your OpenAI API key +. Under *Format*, select *Native OpenAI* +. Click *Save* + +=== Configure additional providers + +Cursor supports many providers through OpenAI-compatible transforms. For each provider: + +. Add the provider configuration in the gateway +. Set the format to *OpenAI-compatible* (the gateway handles format transformation) +. 
Enable the transform layer to convert OpenAI request format to the provider's native format + +Common additional providers: + +* Google Gemini (requires OpenAI-compatible transform) +* Mistral AI (already OpenAI-compatible format) +* Together AI (already OpenAI-compatible format) + +=== Enable models in the catalog + +After enabling providers, enable specific models: + +. Navigate to *AI Gateway* > *Models* +. Enable the models you want Cursor clients to access ++ +Common models for Cursor: ++ +* `anthropic/claude-opus-4-5` +* `anthropic/claude-sonnet-4-5` +* `openai/gpt-4o` +* `openai/gpt-4o-mini` +* `openai/o1-mini` + +. Click *Save* + +Cursor uses the `vendor/model_id` format in requests. The gateway maps these to provider endpoints and applies the appropriate format transforms. + +== Create a gateway for Cursor clients + +Create a dedicated gateway to isolate Cursor traffic and apply specific policies. + +=== Gateway configuration + +. Navigate to *AI Gateway* > *Gateways* +. Click *Create Gateway* +. Enter gateway details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`cursor-gateway` (or your preferred name) + +|Workspace +|Select the workspace for access control grouping + +|Description +|Gateway for Cursor IDE clients +|=== + +. Click *Create* +. Copy the gateway ID from the gateway details page + +The gateway ID is required in the `rp-aigw-id` header for all requests. + +=== Configure unified LLM routing + +Cursor sends all requests to a single endpoint (`/v1/chat/completions`) and uses model prefixes for routing. Configure the gateway to route based on the requested model prefix. + +==== Model prefix routing + +Configure routing that inspects the model field to determine the target provider: + +. Navigate to the gateway's *LLM* tab +. Under *Routing*, click *Add route* +. Configure Anthropic routing: ++ +[source,cel] +---- +request.body.model.startsWith("anthropic/") +---- + +. Add a *Primary provider pool*: ++ +* Provider: Anthropic +* Model: All enabled Anthropic models +* Transform: OpenAI to Anthropic +* Load balancing: Round robin (if multiple Anthropic configurations exist) + +. Click *Save* +. Add another route for OpenAI: ++ +[source,cel] +---- +request.body.model.startsWith("openai/") +---- + +. Add a *Primary provider pool*: ++ +* Provider: OpenAI +* Model: All enabled OpenAI models +* Transform: None (already OpenAI format) + +. Click *Save* + +Cursor requests route to the appropriate provider based on the model prefix. + +==== Default routing with fallback + +Configure a catch-all route for requests without vendor prefixes: + +[source,cel] +---- +true # Matches all requests not matched by previous routes +---- + +Add a primary provider (for example, OpenAI) with fallback to Anthropic: + +* Primary: OpenAI (for requests with no prefix) +* Fallback: Anthropic (if OpenAI is unavailable) +* Failover conditions: Rate limits, timeouts, 5xx errors + +=== Apply rate limits + +Prevent runaway usage from Cursor clients: + +. Navigate to the gateway's *LLM* tab +. Under *Rate Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Recommended Value + +|Global rate limit +|150 requests per minute + +|Per-user rate limit +|15 requests per minute (if using user identification workarounds) +|=== + +. Click *Save* + +The gateway blocks requests exceeding these limits and returns HTTP 429 errors. + +==== Rate limit considerations for code completion + +Cursor's code completion feature generates frequent requests. 
Consider separate rate limits for completion vs chat: + +* Completion models (for example, `openai/gpt-4o-mini`): Higher rate limits +* Chat models (for example, `anthropic/claude-sonnet-4-5`): Standard rate limits + +Configure routing rules that apply different rate limits based on model selection. + +=== Set spending limits + +Control LLM costs across all providers: + +. Under *Spend Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Monthly budget +|$7,000 (adjust based on expected usage) + +|Enforcement +|Block requests after budget exceeded + +|Alert threshold +|80% of budget (sends notification) +|=== + +. Click *Save* + +The gateway tracks estimated costs per request across all providers and blocks traffic when the monthly budget is exhausted. + +== Configure MCP tool aggregation + +Enable Cursor to discover and use tools from multiple MCP servers through a single endpoint. Note that Cursor has a 40-tool limit, so carefully select which MCP servers to aggregate. + +=== Add MCP servers + +. Navigate to the gateway's *MCP* tab +. Click *Add MCP Server* +. Enter server details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Display name +|Descriptive name (for example, `redpanda-data-tools`, `code-search-tools`) + +|Endpoint URL +|MCP server endpoint (for example, xref:ai-agents:mcp/remote/overview.adoc[Remote MCP server] URL) + +|Authentication +|Bearer token or other authentication mechanism +|=== + +. Click *Save* + +Repeat for each MCP server you want to aggregate, keeping in mind the 40-tool limit. + +=== Work within the 40-tool limit + +Cursor imposes a 40-tool limit on MCP integrations. To stay within this limit: + +* Aggregate only essential MCP servers +* Use deferred tool loading (see next section) +* Prioritize high-value tools over comprehensive tool sets +* Consider creating multiple gateways with different tool sets for different use cases + +Monitor the total tool count across all aggregated MCP servers: + +. Navigate to the gateway's *MCP* tab +. Review the *Total Tools* count displayed at the top +. If the count exceeds 40, remove low-priority MCP servers + +=== Enable deferred tool loading + +Reduce the effective tool count by deferring tool discovery: + +. Under *MCP Settings*, enable *Deferred tool loading* +. Click *Save* + +When enabled: + +* Cursor initially receives only a search tool and orchestrator tool (2 tools total) +* Cursor queries for specific tools by name when needed +* The underlying MCP servers can provide more than 40 tools, but only the search and orchestrator tools count against the limit +* Token usage decreases by 80-90% for configurations with many tools + +Deferred tool loading is the recommended approach for Cursor deployments with multiple MCP servers. + +=== Add the MCP orchestrator + +The MCP orchestrator reduces multi-step workflows to single calls: + +. Under *MCP Settings*, enable *MCP Orchestrator* +. Configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Orchestrator model +|Select a model with strong code generation capabilities (for example, `anthropic/claude-sonnet-4-5`) + +|Execution timeout +|30 seconds + +|Backend +|Select the Anthropic backend (orchestrator works best with Claude models) +|=== + +. Click *Save* + +Cursor can now invoke the orchestrator tool to execute complex, multi-step operations in a single request. + +== Configure authentication + +Cursor clients authenticate using bearer tokens in the `Authorization` header. + +=== Generate API tokens + +. 
Navigate to *Security* > *API Tokens* in the Redpanda Cloud console +. Click *Create Token* +. Enter token details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`cursor-access` + +|Scopes +|`ai-gateway:read`, `ai-gateway:write` + +|Expiration +|Set appropriate expiration based on security policies +|=== + +. Click *Create* +. Copy the token (it appears only once) + +Distribute this token to Cursor users through secure channels. + +=== Token rotation + +Implement token rotation for security: + +. Create a new token before the existing token expires +. Distribute the new token to users +. Monitor usage of the old token in xref:ai-agents:ai-gateway/observability-logs.adoc[request logs] +. Revoke the old token after all users have migrated + +== Multi-tenant deployment strategies + +Cursor has limited support for custom headers, making traditional `rp-aigw-id` header-based multi-tenancy challenging. Use one of these alternative strategies. + +=== Strategy 1: Tenant-specific subdomains + +Configure different subdomains for each tenant or team: + +. Set up DNS records pointing to your AI Gateway cluster: ++ +* `team-alpha.aigateway.example.com` → Gateway ID: `alpha-cursor-gateway` +* `team-beta.aigateway.example.com` → Gateway ID: `beta-cursor-gateway` + +. Configure the gateway to extract tenant identity from the `Host` header: ++ +[source,cel] +---- +request.headers["host"][0].startsWith("team-alpha") +---- + +. Distribute tenant-specific URLs to each team +. Each team configures Cursor with their specific subdomain + +This approach works with standard Cursor configuration without requiring custom headers. + +**Configuration example for Team Alpha:** + +[source,json] +---- +{ + "apiProvider": "openai", + "apiBaseUrl": "https://team-alpha.aigateway.example.com/ai-gateway/v1", + "apiKey": "TEAM_ALPHA_TOKEN" +} +---- + +=== Strategy 2: Path-based routing + +Use URL path prefixes to identify tenants: + +. Configure gateway routing to extract tenant from the request path: ++ +[source,cel] +---- +request.path.startsWith("/ai-gateway/alpha/") +---- + +. Create routing rules that map path prefixes to specific gateways or policies +. Distribute tenant-specific base URLs + +**Configuration example for Team Alpha:** + +[source,json] +---- +{ + "apiProvider": "openai", + "apiBaseUrl": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/alpha/v1", + "apiKey": "TEAM_ALPHA_TOKEN" +} +---- + +This approach requires gateway-level path rewriting to remove the tenant prefix before forwarding to LLM providers. + +=== Strategy 3: Query parameter routing + +Embed tenant identity in query parameters: + +. Configure Cursor to append query parameters to the base URL: ++ +[source,json] +---- +{ + "apiProvider": "openai", + "apiBaseUrl": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1?tenant=alpha", + "apiKey": "TEAM_ALPHA_TOKEN" +} +---- + +. Configure gateway routing to extract tenant from query parameters: ++ +[source,cel] +---- +request.url.query["tenant"][0] == "alpha" +---- + +. Create routing rules and rate limits based on the tenant parameter + +This approach works with standard Cursor configuration but exposes tenant identity in URLs. + +=== Strategy 4: API token-based routing + +Use different API tokens to identify tenants: + +. Generate separate API tokens for each tenant +. Tag tokens with metadata indicating the tenant +. Configure gateway routing based on token identity: ++ +[source,cel] +---- +request.auth.metadata["tenant"] == "alpha" +---- + +. 
Apply tenant-specific routing, rate limits, and spending limits + +This approach is most transparent to users but requires gateway support for token metadata inspection. + +=== Choosing a multi-tenant strategy + +[cols="1,2,2,1"] +|=== +|Strategy |Pros |Cons |Best For + +|Subdomains +|Clean, standards-based, no URL modifications +|Requires DNS configuration, certificate management +|Organizations with infrastructure control + +|Path-based +|No DNS required, flexible routing +|Requires path rewriting, tenant exposed in logs +|Simpler deployments, testing environments + +|Query parameters +|No infrastructure changes +|Tenant exposed in URLs and logs, less clean +|Quick deployments, temporary solutions + +|Token-based +|Transparent to users, centralized control +|Requires advanced gateway features +|Large deployments, strong security requirements +|=== + +== Configure Cursor IDE clients + +Provide these instructions to users configuring Cursor IDE. + +=== Configuration file location + +Cursor uses a JSON configuration file: + +* macOS: `~/.cursor/config.json` +* Linux: `~/.cursor/config.json` +* Windows: `%USERPROFILE%\.cursor\config.json` + +=== Basic configuration + +Users configure Cursor with the AI Gateway endpoint: + +[source,json] +---- +{ + "apiProvider": "openai", + "apiBaseUrl": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1", + "apiKey": "YOUR_API_TOKEN", + "models": { + "chat": "anthropic/claude-sonnet-4-5", + "completion": "openai/gpt-4o-mini" + } +} +---- + +Replace: + +* `{CLUSTER_ID}`: Your Redpanda cluster ID +* `YOUR_API_TOKEN`: The API token generated earlier + +If using a multi-tenant strategy, adjust the `apiBaseUrl` according to your chosen approach (subdomain, path prefix, or query parameter). + +=== Model selection + +Configure different models for different Cursor modes: + +[cols="1,2,1"] +|=== +|Mode |Recommended Model |Reason + +|Chat +|`anthropic/claude-sonnet-4-5` or `openai/gpt-4o` +|High quality for complex questions + +|Code completion +|`openai/gpt-4o-mini` +|Fast, cost-effective for frequent requests + +|Inline edit +|`anthropic/claude-sonnet-4-5` +|Balanced quality and speed for code modifications +|=== + +=== MCP server configuration + +Configure Cursor to connect to the aggregated MCP endpoint: + +[source,json] +---- +{ + "experimental": { + "mcpServers": { + "redpanda-ai-gateway": { + "transport": "http", + "url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp", + "headers": { + "Authorization": "Bearer YOUR_API_TOKEN" + } + } + } + } +} +---- + +If using a multi-tenant strategy, ensure the MCP URL matches the tenant configuration. + +This configuration: + +* Connects Cursor to the aggregated MCP endpoint +* Routes LLM requests through the AI Gateway with OpenAI-compatible transforms +* Includes authentication headers + +== Monitor Cursor usage + +Track Cursor activity through gateway observability features. + +=== View request logs + +. Navigate to *AI Gateway* > *Observability* > *Logs* +. Filter by gateway ID: `cursor-gateway` +. 
Review: ++ +* Request timestamps and duration +* Model used per request (with vendor prefix) +* Token usage (prompt and completion tokens) +* Estimated cost per request +* HTTP status codes and errors +* Transform operations (OpenAI to provider-native format) + +Cursor generates different request patterns: + +* Code completion: Many short requests with low token counts +* Chat: Longer requests with context and multi-turn conversations +* Inline edit: Medium-length requests with code context + +=== Analyze metrics + +. Navigate to *AI Gateway* > *Observability* > *Metrics* +. Select the Cursor gateway +. Review: ++ +[cols="1,2"] +|=== +|Metric |Purpose + +|Request volume by provider +|Identify which providers are most used via model prefix routing + +|Token usage by model +|Track consumption patterns (completion vs chat) + +|Estimated spend by provider +|Monitor costs across providers with transforms + +|Latency (p50, p95, p99) +|Detect transform overhead and provider-specific performance issues + +|Error rate by provider +|Identify failing providers or transform issues + +|Transform success rate +|Monitor OpenAI-to-provider format conversion success +|=== + +For detailed metrics configuration, see xref:ai-agents:ai-gateway/observability-metrics.adoc[]. + +=== Query logs via API + +Programmatically access logs for integration with monitoring systems: + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/logs \ + -H "Authorization: Bearer YOUR_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "gateway_id": "GATEWAY_ID", + "start_time": "2026-01-01T00:00:00Z", + "end_time": "2026-01-14T23:59:59Z", + "limit": 100 + }' +---- + +== Security considerations + +Apply these security best practices for Cursor deployments. + +=== Limit token scope + +Create tokens with minimal required scopes: + +* `ai-gateway:read`: Required for MCP tool discovery +* `ai-gateway:write`: Required for LLM requests and tool execution + +Avoid granting broader scopes like `admin` or `cluster:write`. + +=== Implement network restrictions + +If Cursor clients connect from known networks, configure network policies: + +. Use cloud provider security groups to restrict access to AI Gateway endpoints +. Allowlist only the IP ranges where Cursor clients operate +. Monitor for unauthorized access attempts in request logs + +=== Enforce token expiration + +Set short token lifetimes for high-security environments: + +* Development environments: 90 days +* Production environments: 30 days + +Automate token rotation to reduce manual overhead. + +=== Audit tool access + +Review which MCP tools Cursor clients can access: + +. Periodically audit the MCP servers configured in the gateway +. Remove unused or deprecated MCP servers +. Monitor tool execution logs for unexpected behavior +. Ensure total tool count stays within Cursor's 40-tool limit + +=== Protect API keys in configuration + +Cursor stores the API token in plain text in `config.json`. Remind users to: + +* Never commit `config.json` to version control +* Use file system permissions to restrict access (for example, `chmod 600 ~/.cursor/config.json` on Unix-like systems) +* Rotate tokens if they suspect compromise +* Consider using environment variables for API keys (if Cursor supports this) + +=== Monitor transform operations + +Because Cursor requires OpenAI-compatible transforms for non-OpenAI providers: + +. Review transform success rates in metrics +. Monitor for transform failures that may leak request details +. 
Test transforms thoroughly before production deployment +. Keep transform logic updated as provider APIs evolve + +== Troubleshooting + +Common issues and solutions when configuring AI Gateway for Cursor. + +=== Cursor cannot connect to gateway + +Symptom: Connection errors when Cursor tries to discover tools or send LLM requests. + +Causes and solutions: + +* **Invalid base URL**: Verify `apiBaseUrl` matches the gateway endpoint (including multi-tenant prefix if applicable) +* **Expired token**: Generate a new API token and update the Cursor configuration +* **Network connectivity**: Verify the cluster endpoint is accessible from the client network +* **Provider not enabled**: Ensure at least one provider is enabled and has models in the catalog +* **Missing gateway ID**: If using header-based routing, verify the `rp-aigw-id` header is configured (or use alternative multi-tenant strategy) + +=== Model not found errors + +Symptom: Cursor shows "model not found" or similar errors. + +Causes and solutions: + +* **Model not enabled in catalog**: Enable the model in the gateway's model catalog +* **Incorrect model prefix**: Use the correct vendor prefix (for example, `anthropic/claude-sonnet-4-5` not just `claude-sonnet-4-5`) +* **Transform not configured**: Verify OpenAI-compatible transform is enabled for non-OpenAI providers +* **Routing rule mismatch**: Check that routing rules correctly match the model prefix + +=== Transform errors or unexpected responses + +Symptom: Responses are malformed or Cursor reports format errors. + +Causes and solutions: + +* **Transform disabled**: Ensure OpenAI-compatible transform is enabled for Anthropic and other non-OpenAI providers +* **Transform version mismatch**: Verify the transform is compatible with the current provider API version +* **Model-specific transform issues**: Some models may require specific transform configurations +* **Check transform logs**: Review logs for transform errors and stack traces + +=== Tools not appearing in Cursor + +Symptom: Cursor does not discover MCP tools. + +Causes and solutions: + +* **MCP configuration missing**: Ensure `experimental.mcpServers` is configured in Cursor settings +* **MCP servers not configured in gateway**: Add MCP server endpoints in the gateway's MCP tab +* **Exceeds 40-tool limit**: Reduce the number of aggregated tools or enable deferred tool loading +* **Deferred loading enabled but search failing**: Check that the search tool is correctly configured +* **MCP server authentication failing**: Verify MCP server authentication credentials in the gateway configuration + +=== High costs or token usage + +Symptom: Token usage and costs exceed expectations. + +Causes and solutions: + +* **Code completion using expensive model**: Configure completion mode to use `openai/gpt-4o-mini` instead of larger models +* **Deferred tool loading disabled**: Enable deferred tool loading to reduce tokens by 80-90% +* **No rate limits**: Apply per-minute rate limits to prevent runaway usage +* **Missing spending limits**: Set monthly budget limits with blocking enforcement +* **Chat using wrong model**: Route chat requests to cost-effective models (for example, `anthropic/claude-sonnet-4-5` instead of `anthropic/claude-opus-4-5`) +* **Transform overhead**: Monitor if transforms add significant token overhead + +=== Requests failing with 429 errors + +Symptom: Cursor receives HTTP 429 Too Many Requests errors. 
+ +Causes and solutions: + +* **Rate limit exceeded**: Review and increase rate limits if usage is legitimate (code completion needs higher limits) +* **Upstream provider rate limits**: Check if the upstream LLM provider is rate-limiting; configure failover to alternate providers +* **Budget exhausted**: Verify monthly spending limit has not been reached +* **Per-user limits too restrictive**: Adjust per-user rate limits if using multi-tenant strategies + +=== Multi-tenant routing failures + +Symptom: Requests route to wrong gateway or fail authorization. + +Causes and solutions: + +* **Subdomain not configured**: Verify DNS records and SSL certificates for tenant-specific subdomains +* **Path prefix mismatch**: Check that path-based routing rules correctly extract tenant identity +* **Query parameter missing**: Ensure query parameter is appended to all requests +* **Token metadata incorrect**: Verify token is tagged with correct tenant metadata +* **Routing rule conflicts**: Check for overlapping routing rules that may cause unexpected routing + +== Next steps + +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Analyze detailed request logs and transform operations +* xref:ai-agents:ai-gateway/observability-metrics.adoc[]: Set up metrics dashboards +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Implement advanced routing rules for model prefix routing +* xref:ai-agents:mcp/remote/overview.adoc[]: Deploy Remote MCP servers for custom tools diff --git a/modules/ai-agents/pages/ai-gateway/integrations/cursor-user.adoc b/modules/ai-agents/pages/ai-gateway/integrations/cursor-user.adoc new file mode 100644 index 00000000..c0357e08 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/cursor-user.adoc @@ -0,0 +1,837 @@ += Configure Cursor IDE with AI Gateway +:description: Configure Cursor IDE to use Redpanda AI Gateway for unified LLM access, MCP tool integration, and AI-assisted coding. +:page-topic-type: how-to +:page-personas: ai_agent_developer, app_developer +:learning-objective-1: Configure Cursor IDE to route LLM requests through AI Gateway +:learning-objective-2: Set up MCP server integration for tool access through the gateway +:learning-objective-3: Optimize Cursor settings for multi-tenancy and cost control + +After xref:ai-agents:ai-gateway/ai-gateway.adoc[configuring your AI Gateway], set up Cursor IDE to route LLM requests and access MCP tools through the gateway's unified endpoints. 
+ +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +Before configuring Cursor IDE, ensure you have: + +* Cursor IDE installed (download from https://cursor.sh[cursor.sh^]) +* An active Redpanda AI Gateway with: +** At least one LLM provider enabled (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-1-enable-a-provider[Enable a provider]) +** A gateway created and configured (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-3-create-a-gateway[Create a gateway]) +* Your AI Gateway credentials: +** Gateway endpoint URL (for example, `https://gw.ai.panda.com`) +** Gateway ID (for example, `gateway-abc123`) +** API key with access to the gateway + +== About Cursor IDE + +Cursor IDE is an AI-powered code editor built on VS Code that provides: + +* Chat interface for code questions and generation +* AI-powered autocomplete with context awareness +* Codebase indexing for semantic search +* Inline code editing with AI assistance +* Terminal integration for command suggestions +* Native integration with multiple LLM providers + +By routing Cursor through AI Gateway, you gain centralized observability, cost controls, provider flexibility, and the ability to aggregate multiple MCP servers into a single interface. + +== Configuration methods + +Cursor IDE supports two configuration approaches for connecting to AI Gateway: + +[cols="1,2,2"] +|=== +|Method |Best for |Trade-offs + +|Settings UI +|Visual configuration, quick setup +|Limited to single provider configuration + +|Configuration file +|Multiple providers, environment-specific settings, version control +|Manual file editing required +|=== + +Choose the method that matches your workflow. The Settings UI is faster for getting started, while the configuration file provides more flexibility for production use. + +== Configure using Settings UI + +The Settings UI provides a visual interface for configuring Cursor's AI providers. + +=== Configure AI provider + +. Open Cursor Settings: +** macOS: *Cursor* > *Settings* or `Cmd+,` +** Windows/Linux: *File* > *Preferences* > *Settings* or `Ctrl+,` +. Navigate to *Features* > *AI* +. Under *OpenAI API*, configure the base URL and API key: + +[source,text] +---- +Override OpenAI Base URL: https://gw.ai.panda.com/v1 +Override OpenAI API Key: YOUR_REDPANDA_API_KEY +---- + +[start=4] +. Scroll to *Advanced Settings* and add custom headers: + +[source,json] +---- +{ + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID" + } +} +---- + +Replace placeholder values: + +* `YOUR_REDPANDA_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID from the AI Gateway UI + +=== Select models + +In the AI settings, configure which models to use: + +. Under *Model Selection*, choose your preferred model from the dropdown +. Cursor will automatically use the gateway endpoint configured above +. Models available depend on what you've enabled in your AI Gateway + +Model selection options: + +* `gpt-4o` - Routes to OpenAI GPT-4o through your gateway +* `gpt-4o-mini` - Routes to OpenAI GPT-4o-mini (cost-effective) +* `claude-sonnet-4-5` - Routes to Anthropic Claude Sonnet (if enabled in gateway) +* `claude-opus-4-5` - Routes to Anthropic Claude Opus (if enabled in gateway) + +Note: When routing through AI Gateway, Cursor uses the OpenAI SDK format. The gateway automatically translates requests to the appropriate provider based on the model name. 
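
Before testing inside Cursor, you can confirm that the gateway accepts requests from the command line. The following is a minimal sketch that reuses the example endpoint `https://gw.ai.panda.com` and the header names shown above; substitute your own gateway URL, API key, and gateway ID, and use a model that is enabled in your gateway.

[,bash]
----
# List the models the gateway exposes through its OpenAI-compatible API
curl -s https://gw.ai.panda.com/v1/models \
  -H "Authorization: Bearer YOUR_REDPANDA_API_KEY" \
  -H "rp-aigw-id: GATEWAY_ID"

# Send a minimal chat completion; the gateway selects the provider from the model name
curl -s https://gw.ai.panda.com/v1/chat/completions \
  -H "Authorization: Bearer YOUR_REDPANDA_API_KEY" \
  -H "rp-aigw-id: GATEWAY_ID" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": "Say hello"}]
  }'
----

If both calls return successful responses, any remaining issues are likely in the Cursor settings rather than in the gateway configuration.
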
+ +== Configure using configuration file + +For more control over provider settings, multi-environment configurations, or version control, edit Cursor's configuration file directly. + +=== Locate configuration file + +Cursor stores configuration in `settings.json`: + +* macOS: `~/Library/Application Support/Cursor/User/settings.json` +* Windows: `%APPDATA%\Cursor\User\settings.json` +* Linux: `~/.config/Cursor/User/settings.json` + +Create the directory structure if it doesn't exist: + +[,bash] +---- +# macOS +mkdir -p ~/Library/Application\ Support/Cursor/User + +# Linux +mkdir -p ~/.config/Cursor/User +---- + +=== Basic configuration + +Create or edit `settings.json` with the following structure: + +[,json] +---- +{ + "cursor.overrideOpenAIBaseUrl": "https://gw.ai.panda.com/v1", + "cursor.overrideOpenAIApiKey": "YOUR_REDPANDA_API_KEY", + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID" + }, + "cursor.cpp.defaultModel": "gpt-4o", + "cursor.chat.defaultModel": "gpt-4o" +} +---- + +Replace placeholder values: + +* `YOUR_REDPANDA_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID + +Configuration fields: + +* `cursor.overrideOpenAIBaseUrl` - Gateway endpoint (always ends with `/v1` for OpenAI compatibility) +* `cursor.overrideOpenAIApiKey` - Your Redpanda API key (used for authentication) +* `openai.additionalHeaders` - Custom headers sent with every request +* `cursor.cpp.defaultModel` - Model for autocomplete (c++ refers to copilot++) +* `cursor.chat.defaultModel` - Model for chat interactions + +=== Multiple environment configuration + +To switch between development and production gateways, use workspace-specific settings. + +Create `.vscode/settings.json` in your project root: + +[,json] +---- +{ + "cursor.overrideOpenAIBaseUrl": "https://gw.staging.ai.panda.com/v1", + "openai.additionalHeaders": { + "rp-aigw-id": "staging-gateway-123", + "x-environment": "staging" + } +} +---- + +Workspace settings override global settings. Use this to: + +* Route different projects through different gateways +* Use cost-effective models for internal projects +* Use premium models for customer-facing projects +* Add project-specific tracking headers + +=== Configuration with environment variables + +For sensitive credentials, use environment variables instead of hardcoding values. + +In `settings.json`: + +[,json] +---- +{ + "cursor.overrideOpenAIBaseUrl": "${REDPANDA_GATEWAY_URL}/v1", + "cursor.overrideOpenAIApiKey": "${REDPANDA_API_KEY}", + "openai.additionalHeaders": { + "rp-aigw-id": "${REDPANDA_GATEWAY_ID}" + } +} +---- + +Set environment variables before launching Cursor: + +[,bash] +---- +export REDPANDA_GATEWAY_URL="https://gw.ai.panda.com" +export REDPANDA_GATEWAY_ID="gateway-abc123" +export REDPANDA_API_KEY="your-api-key" + +# Launch Cursor from terminal to inherit environment +cursor . +---- + +On Windows (PowerShell): + +[,powershell] +---- +$env:REDPANDA_GATEWAY_URL = "https://gw.ai.panda.com" +$env:REDPANDA_GATEWAY_ID = "gateway-abc123" +$env:REDPANDA_API_KEY = "your-api-key" + +# Launch Cursor +& "C:\Users\YourName\AppData\Local\Programs\cursor\Cursor.exe" +---- + +== Configure MCP server integration + +Cursor IDE supports MCP (Model Context Protocol) integration to access tools from MCP servers through AI Gateway. + +=== Understanding MCP tool limits + +Cursor IDE has a 40-tool limit for MCP integration. When you have many MCP servers with numerous tools, this limit is quickly exceeded. 
+ +AI Gateway solves this through deferred tool loading: + +* Without deferred loading: All tools from all MCP servers load upfront (often 50+ tools) +* With deferred loading: Only 2 tools load initially (search + orchestrator) +* Agent queries for specific tools only when needed +* 80-90% token reduction, depending on configuration + +See xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[] for details on configuring deferred tool loading. + +=== Add MCP server connection + +Edit `settings.json` to add the MCP configuration: + +[,json] +---- +{ + "cursor.overrideOpenAIBaseUrl": "https://gw.ai.panda.com/v1", + "cursor.overrideOpenAIApiKey": "YOUR_REDPANDA_API_KEY", + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID" + }, + "cursor.mcp": { + "servers": { + "redpanda-ai-gateway": { + "command": "node", + "args": [ + "-e", + "require('http').request({hostname:'gw.ai.panda.com',path:'/mcp',method:'GET',headers:{'Authorization':'Bearer YOUR_REDPANDA_API_KEY','rp-aigw-id':'GATEWAY_ID'}}).end()" + ] + } + } + } +} +---- + +This configuration uses Node.js to make HTTP requests to the gateway's MCP endpoint. The gateway returns tool definitions that Cursor can use. + +Replace placeholder values: + +* `YOUR_REDPANDA_API_KEY` - Your Redpanda API key +* `GATEWAY_ID` - Your gateway ID + +=== Enable deferred tool loading + +To work within Cursor's 40-tool limit, configure deferred tool loading in your AI Gateway: + +. Navigate to your gateway configuration in the AI Gateway UI +. Under *MCP Settings*, enable *Deferred Tool Loading* +. Save the gateway configuration + +When deferred loading is enabled: + +* Cursor receives only the search tool and orchestrator tool initially (2 tools total) +* When you ask Cursor to perform a task requiring a specific tool, it queries the gateway +* The gateway returns only the relevant tool definitions +* Total tool count stays well under the 40-tool limit + +== Verify configuration + +After configuring Cursor IDE, verify it connects correctly to your AI Gateway. + +=== Test chat interface + +. Open Cursor IDE +. Press `Cmd+L` (macOS) or `Ctrl+L` (Windows/Linux) to open the chat panel +. Type a simple question: "What does this function do?" (with a file open) +. Wait for response + +Then verify in the AI Gateway dashboard: + +. Open the Redpanda Cloud Console +. Navigate to your gateway's observability dashboard +. Filter by gateway ID +. Verify: +** Request appears in logs +** Model shows correct format (for example, `gpt-4o`) +** Token usage and cost are recorded +** Request succeeded (status 200) + +If the request doesn't appear, see <>. + +=== Test inline code completion + +. Open a code file in Cursor +. Start typing a function definition +. Wait for inline suggestions to appear + +Autocomplete requests appear in the gateway dashboard with: + +* Lower token counts than chat requests +* Higher request frequency +* The autocomplete model you configured + +=== Test MCP tool integration + +If you configured MCP servers: + +. Open Cursor chat (`Cmd+L` or `Ctrl+L`) +. Ask a question that requires a tool: "What's the current date?" +. Cursor should: +** Discover available tools from the gateway +** Invoke the appropriate tool +** Return the result + +Check the gateway dashboard for MCP tool invocation logs. 
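
You can also exercise the aggregated MCP endpoint outside of Cursor. The first request below mirrors the connectivity check from the troubleshooting section; the second is a sketch of a standard MCP JSON-RPC `tools/list` call and assumes the gateway's `/mcp` endpoint accepts a stateless request over HTTP. If the gateway requires an `initialize` handshake first, use an MCP-aware client instead.

[,bash]
----
# Confirm the MCP endpoint is reachable and the token is accepted
curl -H "Authorization: Bearer YOUR_REDPANDA_API_KEY" \
  -H "rp-aigw-id: GATEWAY_ID" \
  https://gw.ai.panda.com/mcp

# Attempt to list tools with an MCP JSON-RPC request (assumes stateless access is allowed)
curl -X POST https://gw.ai.panda.com/mcp \
  -H "Authorization: Bearer YOUR_REDPANDA_API_KEY" \
  -H "rp-aigw-id: GATEWAY_ID" \
  -H "Content-Type: application/json" \
  -H "Accept: application/json, text/event-stream" \
  -d '{"jsonrpc": "2.0", "id": 1, "method": "tools/list"}'
----

With deferred tool loading enabled, the listing should contain only the search and orchestrator tools rather than the full tool set of every aggregated MCP server.
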
+ +== Advanced configuration + +=== Custom request tracking headers + +Add custom headers for request tracking, user attribution, or routing policies: + +[,json] +---- +{ + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID", + "x-user-id": "developer-123", + "x-team": "backend", + "x-project": "api-service" + } +} +---- + +Use these headers with gateway CEL routing to: + +* Track costs per developer or team +* Route based on project type +* Apply different rate limits per user +* Generate team-specific usage reports + +=== Model-specific settings + +Configure different settings for chat vs autocomplete: + +[,json] +---- +{ + "cursor.chat.defaultModel": "claude-sonnet-4-5", + "cursor.cpp.defaultModel": "gpt-4o-mini", + "cursor.chat.temperature": 0.7, + "cursor.cpp.temperature": 0.2, + "cursor.chat.maxTokens": 4096, + "cursor.cpp.maxTokens": 512 +} +---- + +Settings explained: + +* Chat uses Claude Sonnet for reasoning depth +* Autocomplete uses GPT-4o-mini for speed and cost efficiency +* Chat temperature (0.7) allows creative responses +* Autocomplete temperature (0.2) produces deterministic code +* Chat allows longer responses (4096 tokens) +* Autocomplete limits responses (512 tokens) for speed + +=== Multi-tenancy with team-specific gateways + +For organizations with multiple teams sharing Cursor but requiring separate cost tracking and policies: + +[,json] +---- +{ + "cursor.overrideOpenAIBaseUrl": "https://gw.ai.panda.com/v1", + "cursor.overrideOpenAIApiKey": "${TEAM_API_KEY}", + "openai.additionalHeaders": { + "rp-aigw-id": "${TEAM_GATEWAY_ID}", + "x-team": "${TEAM_NAME}" + } +} +---- + +Each team configures their own: + +* `TEAM_API_KEY` - Team-specific API key +* `TEAM_GATEWAY_ID` - Gateway with team budget and rate limits +* `TEAM_NAME` - Identifier for usage reports + +This approach enables: + +* Per-team cost attribution +* Separate budgets and rate limits +* Team-specific model access policies +* Independent observability dashboards + +=== Request timeout configuration + +Configure timeout for LLM and MCP requests: + +[,json] +---- +{ + "cursor.requestTimeout": 30000, + "cursor.mcp.requestTimeout": 15000 +} +---- + +Timeout values are in milliseconds. Defaults: + +* LLM requests: 30000ms (30 seconds) +* MCP requests: 15000ms (15 seconds) + +Increase timeouts for: + +* Long-running MCP tools (database queries, web searches) +* High-latency network environments +* Complex reasoning tasks requiring extended processing + +=== Debug mode + +Enable debug logging to troubleshoot connection issues: + +[,json] +---- +{ + "cursor.debug": true, + "cursor.logLevel": "debug" +} +---- + +Debug mode shows: + +* HTTP request and response headers +* Model selection decisions +* Token usage calculations +* Error details with stack traces + +View debug logs: + +. Open Command Palette (`Cmd+Shift+P` or `Ctrl+Shift+P`) +. Type "Developer: Show Logs" +. Select "Extension Host" +. Filter by "cursor" + +[[troubleshooting]] +== Troubleshooting + +=== Cursor shows connection error + +**Symptom**: Cursor displays "Failed to connect to AI provider" or requests return errors. + +**Causes and solutions**: + +. **Incorrect base URL format** ++ +Verify the URL includes `/v1` at the end: ++ +[,text] +---- +# Correct +"cursor.overrideOpenAIBaseUrl": "https://gw.ai.panda.com/v1" + +# Incorrect +"cursor.overrideOpenAIBaseUrl": "https://gw.ai.panda.com" +---- + +. 
**Authentication failure** ++ +Verify your API key is valid: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/v1/models +---- ++ +You should receive a list of available models. If you get `401 Unauthorized`, regenerate your API key in the Redpanda Cloud Console. + +. **Gateway ID mismatch** ++ +Check that the `rp-aigw-id` header matches your gateway ID exactly (case-sensitive). Copy it directly from the AI Gateway UI. + +. **Missing headers** ++ +Ensure `openai.additionalHeaders` is configured in settings: ++ +[,json] +---- +{ + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID" + } +} +---- + +. **Invalid JSON syntax** ++ +Validate your `settings.json` file: ++ +[,bash] +---- +# macOS/Linux +python3 -m json.tool ~/Library/Application\ Support/Cursor/User/settings.json + +# Or use jq +jq . ~/Library/Application\ Support/Cursor/User/settings.json +---- ++ +Fix any syntax errors reported. + +=== Autocomplete not working + +**Symptom**: Inline autocomplete suggestions don't appear or are very slow. + +**Causes and solutions**: + +. **No autocomplete model configured** ++ +Verify `cursor.cpp.defaultModel` is set in `settings.json`: ++ +[,json] +---- +{ + "cursor.cpp.defaultModel": "gpt-4o-mini" +} +---- + +. **Model too slow** ++ +Use a faster, cost-effective model for autocomplete: ++ +[,json] +---- +{ + "cursor.cpp.defaultModel": "gpt-4o-mini", + "cursor.cpp.maxTokens": 256 +} +---- ++ +Smaller models like GPT-4o-mini or Claude Haiku provide faster responses ideal for autocomplete. + +. **Network latency** ++ +Check gateway latency in the observability dashboard. If p95 latency is over 500ms, autocomplete will feel slow. Consider: ++ +* Using a gateway in a closer geographic region +* Switching to a faster model +* Reducing `cursor.cpp.maxTokens` to 256 or lower + +. **Autocomplete disabled in settings** ++ +Verify autocomplete is enabled: ++ +. Open Settings (`Cmd+,` or `Ctrl+,`) +. Search for "cursor autocomplete" +. Ensure "Enable Autocomplete" is checked + +=== MCP tools not appearing + +**Symptom**: Cursor doesn't show tools from MCP servers, or shows error "Too many tools". + +**Causes and solutions**: + +. **40-tool limit exceeded** ++ +Cursor has a hard limit of 40 MCP tools. If your MCP servers expose more than 40 tools combined, enable deferred tool loading in your AI Gateway configuration. ++ +With deferred loading, only 2 tools (search + orchestrator) are sent to Cursor initially, staying well under the limit. + +. **MCP configuration missing** ++ +Verify the `cursor.mcp.servers` section exists in `settings.json`: ++ +[,json] +---- +{ + "cursor.mcp": { + "servers": { + "redpanda-ai-gateway": { + "command": "node", + "args": [/* ... */] + } + } + } +} +---- + +. **No MCP servers in gateway** ++ +Verify your gateway has at least one MCP server configured in the AI Gateway UI. + +. **MCP endpoint unreachable** ++ +Test connectivity to the MCP endpoint: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/mcp +---- ++ +You should receive a valid MCP protocol response. + +. **Cursor restart needed** ++ +MCP configuration changes require restarting Cursor: ++ +. Close all Cursor windows +. Relaunch Cursor +. Wait for MCP servers to initialize (may take 5-10 seconds) + +=== Requests not appearing in gateway dashboard + +**Symptom**: Cursor works, but requests don't appear in the AI Gateway observability dashboard. 
+ +**Causes and solutions**: + +. **Wrong gateway ID** ++ +Verify that the `rp-aigw-id` header in your configuration matches the gateway you're viewing in the dashboard. + +. **Missing header** ++ +Ensure the `rp-aigw-id` header is in the `openai.additionalHeaders` section: ++ +[,json] +---- +{ + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID" + } +} +---- + +. **Using direct provider connection** ++ +If `cursor.overrideOpenAIBaseUrl` points directly to a provider (for example, `https://api.openai.com`), requests won't route through the gateway. Verify it points to your gateway endpoint. + +. **Log ingestion delay** ++ +Gateway logs can take 5-10 seconds to appear in the dashboard. Wait briefly and refresh. + +. **Workspace settings override** ++ +Check if `.vscode/settings.json` in your project root overrides global settings with different gateway configuration. + +=== High latency after gateway integration + +**Symptom**: Requests are slower after routing through the gateway. + +**Causes and solutions**: + +. **Gateway geographic distance** ++ +If your gateway is in a different region than you or the upstream provider, this adds network latency. Check gateway region in the Redpanda Cloud Console. + +. **Provider pool failover** ++ +If your gateway is configured with fallback providers, check the logs to see if requests are failing over. Failover adds latency. + +. **Model mismatch** ++ +Verify you're using fast models for autocomplete: ++ +[,json] +---- +{ + "cursor.cpp.defaultModel": "gpt-4o-mini" // Fast model +} +---- + +. **MCP tool aggregation overhead** ++ +Aggregating tools from multiple MCP servers adds processing time. Use deferred tool loading to reduce this overhead (see xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]). + +=== Configuration changes not taking effect + +**Symptom**: Changes to `settings.json` don't apply. + +**Solutions**: + +. **Restart Cursor** ++ +Configuration changes require restarting Cursor: ++ +. Close all Cursor windows +. Relaunch Cursor + +. **Invalid JSON syntax** ++ +Validate JSON syntax: ++ +[,bash] +---- +python3 -m json.tool ~/Library/Application\ Support/Cursor/User/settings.json +---- + +. **Workspace settings overriding** ++ +Check if `.vscode/settings.json` in your project root overrides global settings. + +. **File permissions** ++ +Verify Cursor can read the configuration file: ++ +[,bash] +---- +# macOS +ls -la ~/Library/Application\ Support/Cursor/User/settings.json + +# Linux +ls -la ~/.config/Cursor/User/settings.json +---- ++ +Fix permissions if needed: ++ +[,bash] +---- +chmod 600 ~/Library/Application\ Support/Cursor/User/settings.json +---- + +== Cost optimization tips + +=== Use different models for chat and autocomplete + +Chat interactions benefit from reasoning depth, while autocomplete needs speed: + +[,json] +---- +{ + "cursor.chat.defaultModel": "claude-sonnet-4-5", + "cursor.cpp.defaultModel": "gpt-4o-mini" +} +---- + +This can reduce costs by 5-10x for autocomplete while maintaining quality for chat. + +=== Limit token usage + +Reduce the maximum tokens for autocomplete to prevent runaway costs: + +[,json] +---- +{ + "cursor.cpp.maxTokens": 256, + "cursor.chat.maxTokens": 2048 +} +---- + +Autocomplete rarely needs more than 256 tokens, while chat responses can vary. + +=== Use MCP tools for documentation + +Instead of pasting large documentation into chat, create MCP tools that fetch relevant sections on-demand. This reduces token costs by including only needed information. 
+ +=== Monitor usage patterns + +Use the AI Gateway dashboard to identify optimization opportunities: + +. Navigate to your gateway's observability dashboard +. Filter by Cursor requests (use custom header if configured) +. Analyze: +** Token usage per request type (chat vs autocomplete) +** Most expensive queries +** High-frequency low-value requests + +=== Team-based cost attribution + +Use custom headers to track costs per developer or team: + +[,json] +---- +{ + "openai.additionalHeaders": { + "rp-aigw-id": "GATEWAY_ID", + "x-user-id": "${USER_EMAIL}", + "x-team": "backend" + } +} +---- + +Generate team-specific cost reports from the gateway dashboard. + +=== Enable deferred MCP tool loading + +Configure deferred tool loading to reduce token costs by 80-90%: + +. Navigate to your gateway configuration +. Enable *Deferred Tool Loading* under MCP Settings +. Save configuration + +This sends only search + orchestrator tools initially, reducing token usage significantly. + +== Next steps + +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Configure deferred tool loading to work within Cursor's 40-tool limit +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor Cursor requests in the gateway dashboard +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Use CEL expressions to route Cursor requests based on context + +== Related pages + +* xref:ai-agents:ai-gateway/ai-gateway.adoc[]: Create and configure your AI Gateway +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[]: Learn about AI Gateway architecture and benefits +* xref:ai-agents:ai-gateway/integrations/claude-code-user.adoc[]: Configure Claude Code with AI Gateway +* xref:ai-agents:ai-gateway/integrations/continue-user.adoc[]: Configure Continue.dev with AI Gateway +* xref:ai-agents:ai-gateway/integrations/cline-user.adoc[]: Configure Cline with AI Gateway diff --git a/modules/ai-agents/pages/ai-gateway/integrations/github-copilot-admin.adoc b/modules/ai-agents/pages/ai-gateway/integrations/github-copilot-admin.adoc new file mode 100644 index 00000000..325d9dce --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/github-copilot-admin.adoc @@ -0,0 +1,828 @@ += Configure AI Gateway for GitHub Copilot +:description: Configure Redpanda AI Gateway to support GitHub Copilot clients. +:page-topic-type: how-to +:page-personas: platform_admin +:learning-objective-1: Configure AI Gateway endpoints for GitHub Copilot connectivity +:learning-objective-2: Deploy multi-tenant authentication strategies for Copilot clients +:learning-objective-3: Set up model aliasing and BYOK routing for GitHub Copilot + +Configure Redpanda AI Gateway to support GitHub Copilot clients accessing multiple LLM providers through OpenAI-compatible endpoints with bring-your-own-key (BYOK) support. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +* AI Gateway deployed on a BYOC cluster running Redpanda version 25.3 or later +* Administrator access to the AI Gateway UI +* API keys for at least one LLM provider (OpenAI, Anthropic, or others) +* Understanding of xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[AI Gateway concepts] +* GitHub Copilot Business or Enterprise subscription (for BYOK and custom endpoints) + +== About GitHub Copilot + +GitHub Copilot is an AI-powered code completion tool that integrates with popular IDEs including VS Code, Visual Studio, JetBrains IDEs, and Neovim. 
GitHub Copilot uses OpenAI models by default but supports BYOK (bring your own key) configurations for Business and Enterprise customers. + +Key characteristics: + +* Sends all requests in OpenAI-compatible format to `/v1/chat/completions` +* Limited support for custom headers (similar to Cursor IDE) +* Supports BYOK for Business/Enterprise subscriptions +* Built-in code completion, chat, and inline editing modes +* Configuration via IDE settings or organization policies +* High request volume from code completion features + +== Architecture overview + +GitHub Copilot connects to AI Gateway through standardized endpoints: + +* LLM endpoint: `https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/chat/completions` for all providers +* MCP endpoint support: Limited (GitHub Copilot does not natively support MCP protocol) + +The gateway handles: + +. Authentication via bearer tokens in the `Authorization` header +. Gateway selection via query parameters or custom headers (requires workarounds) +. Model routing and aliasing for friendly names +. Format transforms from OpenAI format to provider-native formats +. Request logging and cost tracking per gateway +. BYOK routing for different teams or users + +== Enable LLM providers + +GitHub Copilot works with multiple providers through OpenAI-compatible transforms. Enable the providers your users will access. + +=== Configure OpenAI (default provider) + +GitHub Copilot uses OpenAI by default. To enable OpenAI through the gateway: + +. Navigate to *AI Gateway* > *Providers* in the Redpanda Cloud console +. Select *OpenAI* from the provider list +. Click *Add configuration* +. Enter your OpenAI API key +. Under *Format*, select *Native OpenAI* +. Click *Save* + +=== Configure Anthropic with OpenAI-compatible format + +For BYOK deployments, you can route GitHub Copilot to Anthropic models. Configure the gateway to transform requests: + +. Navigate to *AI Gateway* > *Providers* +. Select *Anthropic* from the provider list +. Click *Add configuration* +. Enter your Anthropic API key +. Under *Format*, select *OpenAI-compatible* (enables automatic transform) +. Click *Save* + +The gateway now transforms OpenAI-format requests to Anthropic's native `/v1/messages` format. + +=== Configure additional providers + +GitHub Copilot supports multiple providers through OpenAI-compatible transforms. For each provider: + +. Add the provider configuration in the gateway +. Set the format to *OpenAI-compatible* (the gateway handles format transformation) +. Enable the transform layer to convert OpenAI request format to the provider's native format + +Common additional providers: + +* Google Gemini (requires OpenAI-compatible transform) +* Mistral AI (already OpenAI-compatible format) +* Azure OpenAI (already OpenAI-compatible format) + +=== Enable models in the catalog + +After enabling providers, enable specific models: + +. Navigate to *AI Gateway* > *Models* +. Enable the models you want GitHub Copilot clients to access ++ +Common models for GitHub Copilot: ++ +* `gpt-4o` (OpenAI) +* `gpt-4o-mini` (OpenAI) +* `o1-mini` (OpenAI) +* `claude-sonnet-4-5` (Anthropic, requires alias) + +. Click *Save* + +GitHub Copilot typically uses model names without vendor prefixes. You'll configure model aliasing in the next section to map friendly names to provider-specific models. + +== Create a gateway for GitHub Copilot clients + +Create a dedicated gateway to isolate GitHub Copilot traffic and apply specific policies. + +=== Gateway configuration + +. 
Navigate to *AI Gateway* > *Gateways* +. Click *Create Gateway* +. Enter gateway details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`github-copilot-gateway` (or your preferred name) + +|Workspace +|Select the workspace for access control grouping + +|Description +|Gateway for GitHub Copilot clients +|=== + +. Click *Create* +. Copy the gateway ID from the gateway details page + +The gateway ID is required for routing requests to this gateway. + +=== Configure model aliasing + +GitHub Copilot expects model names like `gpt-4o` without vendor prefixes. Configure aliases to map these to provider-specific models: + +. Navigate to the gateway's *Models* tab +. Click *Add Model Alias* +. Configure aliases: ++ +[cols="1,2,1"] +|=== +|Alias Name |Target Model |Provider + +|`gpt-4o` +|`openai/gpt-4o` +|OpenAI + +|`gpt-4o-mini` +|`openai/gpt-4o-mini` +|OpenAI + +|`claude-sonnet` +|`anthropic/claude-sonnet-4-5` +|Anthropic + +|`o1-mini` +|`openai/o1-mini` +|OpenAI +|=== + +. Click *Save* + +When GitHub Copilot requests `gpt-4o`, the gateway routes to OpenAI's `gpt-4o` model. Users can optionally request `claude-sonnet` for Anthropic models if the IDE configuration supports model selection. + +=== Configure unified LLM routing + +GitHub Copilot sends all requests to a single endpoint (`/v1/chat/completions`). Configure the gateway to route based on the requested model name. + +==== Model-based routing + +Configure routing that inspects the model field to determine the target provider: + +. Navigate to the gateway's *LLM* tab +. Under *Routing*, click *Add route* +. Configure OpenAI routing: ++ +[source,cel] +---- +request.body.model.startsWith("gpt-") || request.body.model.startsWith("o1-") +---- + +. Add a *Primary provider pool*: ++ +* Provider: OpenAI +* Model: All enabled OpenAI models +* Transform: None (already OpenAI format) +* Load balancing: Round robin (if multiple OpenAI configurations exist) + +. Click *Save* +. Add another route for Anthropic models: ++ +[source,cel] +---- +request.body.model.startsWith("claude-") +---- + +. Add a *Primary provider pool*: ++ +* Provider: Anthropic +* Model: All enabled Anthropic models +* Transform: OpenAI to Anthropic + +. Click *Save* + +GitHub Copilot requests route to the appropriate provider based on the model alias. + +==== Default routing with fallback + +Configure a catch-all route for requests without specific model prefixes: + +[source,cel] +---- +true # Matches all requests not matched by previous routes +---- + +Add a primary provider (for example, OpenAI) with fallback to Anthropic: + +* Primary: OpenAI (for requests with no specific model) +* Fallback: Anthropic (if OpenAI is unavailable) +* Failover conditions: Rate limits, timeouts, 5xx errors + +=== Apply rate limits + +Prevent runaway usage from GitHub Copilot clients. Code completion features generate very high request volumes. + +. Navigate to the gateway's *LLM* tab +. Under *Rate Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Recommended Value + +|Global rate limit +|300 requests per minute + +|Per-user rate limit +|30 requests per minute (if using user identification) +|=== + +. Click *Save* + +The gateway blocks requests exceeding these limits and returns HTTP 429 errors. + +==== Rate limit considerations for code completion + +GitHub Copilot's code completion feature generates extremely frequent requests (potentially dozens per minute per user). 
Consider: + +* Higher global rate limits than other AI coding assistants +* Separate rate limits for different request types if the gateway supports request classification +* Monitoring initial usage patterns to adjust limits appropriately + +=== Set spending limits + +Control LLM costs across all providers: + +. Under *Spend Limit*, configure: ++ +[cols="1,2"] +|=== +|Setting |Value + +|Monthly budget +|$10,000 (adjust based on expected usage) + +|Enforcement +|Block requests after budget exceeded + +|Alert threshold +|80% of budget (sends notification) +|=== + +. Click *Save* + +The gateway tracks estimated costs per request across all providers and blocks traffic when the monthly budget is exhausted. + +== Configure authentication + +GitHub Copilot clients authenticate using bearer tokens in the `Authorization` header. + +=== Generate API tokens + +. Navigate to *Security* > *API Tokens* in the Redpanda Cloud console +. Click *Create Token* +. Enter token details: ++ +[cols="1,2"] +|=== +|Field |Value + +|Name +|`copilot-access` + +|Scopes +|`ai-gateway:read`, `ai-gateway:write` + +|Expiration +|Set appropriate expiration based on security policies +|=== + +. Click *Create* +. Copy the token (it appears only once) + +Distribute this token to GitHub Copilot administrators through secure channels for organization-level configuration. + +=== Token rotation + +Implement token rotation for security: + +. Create a new token before the existing token expires +. Update organization-level GitHub Copilot configuration with the new token +. Monitor usage of the old token in xref:ai-agents:ai-gateway/observability-logs.adoc[request logs] +. Revoke the old token after the configuration update propagates + +== Multi-tenant deployment strategies + +GitHub Copilot has limited support for custom headers, making traditional `rp-aigw-id` header-based multi-tenancy challenging. Use one of these alternative strategies for BYOK deployments. + +=== Strategy 1: OAI Compatible Provider extension (recommended) + +For organizations using VS Code with GitHub Copilot, the OAI Compatible Provider extension enables custom headers including `rp-aigw-id`. + +==== Install the extension + +. Navigate to VS Code Extensions Marketplace +. Search for "OAI Compatible Provider" +. Install the extension +. Restart VS Code + +==== Configure the extension + +. Open VS Code settings (JSON) +. Add gateway configuration: ++ +[source,json] +---- +{ + "oai-compatible-provider.providers": [ + { + "name": "Redpanda AI Gateway", + "baseUrl": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1", + "headers": { + "Authorization": "Bearer YOUR_API_TOKEN", + "rp-aigw-id": "GATEWAY_ID" + }, + "models": [ + "gpt-4o", + "gpt-4o-mini", + "claude-sonnet" + ] + } + ] +} +---- + +. Replace: ++ +* `{CLUSTER_ID}`: Your Redpanda cluster ID +* `YOUR_API_TOKEN`: Team-specific API token +* `GATEWAY_ID`: Team-specific gateway ID + +This approach allows true multi-tenancy with proper gateway isolation per team. + +**Benefits:** + +* Full support for `rp-aigw-id` header +* Clean separation between tenants +* Standard authentication flow +* Works with any IDE supported by the extension + +**Limitations:** + +* Requires VS Code and extension installation +* Not available for all GitHub Copilot-supported IDEs +* Users must configure extension in addition to GitHub Copilot + +=== Strategy 2: Query parameter routing + +Embed tenant identity in query parameters for multi-tenant routing without custom headers. + +. 
Configure gateway routing to extract tenant from query parameters: ++ +[source,cel] +---- +request.url.query["tenant"][0] == "team-alpha" +---- + +. Distribute tenant-specific endpoints to each team +. Configure GitHub Copilot organization settings with the tenant-specific base URL + +**Configuration example for Team Alpha:** + +Organization-level GitHub Copilot settings: + +[source,json] +---- +{ + "copilot": { + "api_base_url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1?tenant=team-alpha", + "api_key": "TEAM_ALPHA_TOKEN" + } +} +---- + +**Benefits:** + +* Works with standard GitHub Copilot configuration +* No additional extensions required +* Simple to implement + +**Limitations:** + +* Tenant identity exposed in URLs and logs +* Less clean than header-based routing +* URL parameters may be logged by intermediate proxies + +=== Strategy 3: Token-based gateway mapping + +Use different API tokens to identify which gateway to route to: + +. Generate separate API tokens for each tenant or team +. Tag tokens with metadata indicating the target gateway +. Configure gateway routing based on token identity: ++ +[source,cel] +---- +request.auth.metadata["gateway_id"] == "team-alpha-gateway" +---- + +. Apply tenant-specific routing, rate limits, and spending limits based on the token + +**Benefits:** + +* Transparent to users +* No URL modifications needed +* Centralized control through token management + +**Limitations:** + +* Requires gateway support for token metadata inspection +* Token management overhead increases with number of tenants +* All tenants use the same base URL + +=== Strategy 4: Single-tenant mode + +For simpler deployments, configure a single gateway with shared access: + +. Create one gateway for all GitHub Copilot users +. Generate a shared API token +. Configure GitHub Copilot at the organization level +. Use rate limits and spending limits to control overall usage + +**Benefits:** + +* Simplest configuration +* No tenant routing complexity +* Easy to manage + +**Limitations:** + +* No per-team cost tracking or limits +* Shared rate limits may impact individual teams +* All users have the same model access + +=== Choosing a multi-tenant strategy + +[cols="1,2,2,1"] +|=== +|Strategy |Pros |Cons |Best For + +|OAI Compatible Provider +|Full `rp-aigw-id` support, clean separation +|Requires extension, VS Code only +|Organizations standardized on VS Code + +|Query parameters +|No extensions needed, simple setup +|Tenant exposed in URLs, less clean +|Quick deployments, small teams + +|Token-based +|Transparent to users, centralized control +|Requires advanced gateway features +|Large organizations with many teams + +|Single-tenant +|Simplest configuration and management +|No per-team isolation or limits +|Small organizations, proof of concept +|=== + +== Configure GitHub Copilot clients + +Provide these instructions based on your chosen multi-tenant strategy. + +=== Organization-level configuration (GitHub Enterprise) + +For GitHub Enterprise customers, configure Copilot at the organization level: + +. Navigate to your organization settings on GitHub +. Go to *Copilot* > *Policies* +. Enable *Allow use of Copilot with custom models* +. Configure the custom endpoint: ++ +[source,json] +---- +{ + "api_base_url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1", + "api_key": "YOUR_API_TOKEN" +} +---- + +. 
If using query parameter routing, append the tenant identifier: ++ +[source,json] +---- +{ + "api_base_url": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1?tenant=YOUR_TEAM", + "api_key": "YOUR_API_TOKEN" +} +---- + +This configuration applies to all users in the organization. + +=== IDE-specific configuration (individual users) + +For individual users or when organization-level configuration is not available: + +==== VS Code configuration + +. Open VS Code settings +. Search for "GitHub Copilot" +. Configure custom endpoint (if using OAI Compatible Provider): ++ +[source,json] +---- +{ + "github.copilot.advanced": { + "endpoint": "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1" + } +} +---- + +==== JetBrains IDEs + +. Open IDE Settings +. Navigate to *Tools* > *GitHub Copilot* +. Configure custom endpoint (support varies by IDE and Copilot version) + +==== Neovim + +. Edit Copilot configuration +. Add custom endpoint in the Copilot.vim or Copilot.lua configuration +. Refer to the Copilot.vim documentation for exact syntax + +=== Model selection + +Configure model preferences based on use case: + +[cols="1,2,1"] +|=== +|Use Case |Recommended Model |Reason + +|Code completion +|`gpt-4o-mini` +|Fast, cost-effective for frequent requests + +|Code explanation +|`gpt-4o` or `claude-sonnet` +|Higher quality for complex explanations + +|Code generation +|`gpt-4o` or `claude-sonnet` +|Better at generating complete functions + +|Documentation +|`gpt-4o-mini` +|Sufficient quality for docstrings and comments +|=== + +Model selection is typically configured at the organization level or through IDE settings. + +== Monitor GitHub Copilot usage + +Track GitHub Copilot activity through gateway observability features. + +=== View request logs + +. Navigate to *AI Gateway* > *Observability* > *Logs* +. Filter by gateway ID: `github-copilot-gateway` +. Review: ++ +* Request timestamps and duration +* Model used per request (including aliases) +* Token usage (prompt and completion tokens) +* Estimated cost per request +* HTTP status codes and errors +* Transform operations (OpenAI to provider-native format) + +GitHub Copilot generates distinct request patterns: + +* Code completion: Very high volume, short requests with low token counts +* Chat/explain: Medium volume, longer requests with code context +* Code generation: Lower volume, variable length requests + +=== Analyze metrics + +. Navigate to *AI Gateway* > *Observability* > *Metrics* +. Select the GitHub Copilot gateway +. Review: ++ +[cols="1,2"] +|=== +|Metric |Purpose + +|Request volume by model +|Identify most-used models via aliases + +|Token usage by model +|Track consumption patterns (completion vs chat) + +|Estimated spend by provider +|Monitor costs across providers with transforms + +|Latency (p50, p95, p99) +|Detect transform overhead and performance issues + +|Error rate by provider +|Identify failing providers or transform issues + +|Transform success rate +|Monitor OpenAI-to-provider format conversion success + +|Requests per user/tenant +|Track usage by team (if using multi-tenant strategies) +|=== + +For detailed metrics configuration, see xref:ai-agents:ai-gateway/observability-metrics.adoc[]. 
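
To confirm that alias routing and transforms show up correctly in logs and metrics, you can generate test traffic directly against the gateway. This sketch reuses the endpoint, headers, and example aliases described earlier on this page; adjust the alias names to match your own catalog, and note that the `rp-aigw-id` header works from the command line even if your GitHub Copilot clients rely on one of the multi-tenant workarounds above.

[source,bash]
----
# Request through the gpt-4o-mini alias (routes to OpenAI, no transform)
curl https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/chat/completions \
  -H "Authorization: Bearer YOUR_API_TOKEN" \
  -H "rp-aigw-id: GATEWAY_ID" \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "ping"}]}'

# Request through the claude-sonnet alias (routes to Anthropic via the OpenAI-compatible transform)
curl https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/chat/completions \
  -H "Authorization: Bearer YOUR_API_TOKEN" \
  -H "rp-aigw-id: GATEWAY_ID" \
  -H "Content-Type: application/json" \
  -d '{"model": "claude-sonnet", "messages": [{"role": "user", "content": "ping"}]}'
----

Both requests should then appear in the request logs with the resolved provider, token counts, and, for the Anthropic route, a transform operation.
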

=== Query logs via API

Programmatically access logs for integration with monitoring systems:

[source,bash]
----
curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/logs \
  -H "Authorization: Bearer YOUR_API_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "gateway_id": "GATEWAY_ID",
    "start_time": "2026-01-01T00:00:00Z",
    "end_time": "2026-01-14T23:59:59Z",
    "limit": 100
  }'
----

== Security considerations

Apply these security best practices for GitHub Copilot deployments.

=== Limit token scope

Create tokens with minimal required scopes:

* `ai-gateway:read`: Required for model discovery
* `ai-gateway:write`: Required for LLM requests

Avoid granting broader scopes like `admin` or `cluster:write`.

=== Implement network restrictions

If GitHub Copilot clients connect from known networks, configure network policies:

. Use cloud provider security groups to restrict access to AI Gateway endpoints
. Allowlist only the IP ranges where GitHub Copilot clients operate
. Monitor for unauthorized access attempts in request logs

=== Enforce token expiration

Set short token lifetimes for high-security environments:

* Development environments: 90 days
* Production environments: 30 days

Automate token rotation to reduce manual overhead. Coordinate with GitHub organization administrators when rotating tokens.

=== Monitor transform operations

Because GitHub Copilot may route to non-OpenAI providers through transforms:

. Review transform success rates in metrics
. Monitor for transform failures that may leak request details
. Test transforms thoroughly before production deployment
. Keep transform logic updated as provider APIs evolve

=== Audit model access

Review which models GitHub Copilot clients can access:

. Periodically audit enabled models and aliases
. Remove deprecated or unused model configurations
. Monitor model usage logs for unexpected patterns
. Ensure cost-effective models are used for high-volume completion requests

=== Code completion security

GitHub Copilot sends code context to LLM providers, and prompts may include proprietary code. To reduce exposure:

* Make sure users understand what code context is sent with requests
* Configure organization policies to limit code sharing if needed
* Review provider data retention policies
* Monitor logs for sensitive information in prompts (if logging includes prompt content)

=== Organization-level controls

For GitHub Enterprise customers:

. Use organization-level policies to enforce custom endpoint usage
. Restrict which users can configure custom endpoints
. Monitor organization audit logs for configuration changes
. Implement approval workflows for endpoint changes

== Troubleshooting

Common issues and solutions when configuring AI Gateway for GitHub Copilot.

=== GitHub Copilot cannot connect to gateway

Symptom: Connection errors when GitHub Copilot tries to send requests.
+ +Causes and solutions: + +* **Invalid base URL**: Verify the configured endpoint matches the gateway URL (including query parameters if using query-based routing) +* **Expired token**: Generate a new API token and update the GitHub Copilot configuration +* **Network connectivity**: Verify the cluster endpoint is accessible from the client network +* **Provider not enabled**: Ensure at least one provider is enabled and has models in the catalog +* **SSL/TLS issues**: Verify the cluster has valid SSL certificates +* **Organization policy blocking custom endpoints**: Check GitHub organization settings + +=== Model not found errors + +Symptom: GitHub Copilot shows "model not found" or similar errors. + +Causes and solutions: + +* **Model not enabled in catalog**: Enable the model in the gateway's model catalog +* **Model alias missing**: Create an alias for the model name GitHub Copilot expects (for example, `gpt-4o`) +* **Incorrect model name**: Verify GitHub Copilot is requesting a model name that exists in your aliases +* **Routing rule mismatch**: Check that routing rules correctly match the requested model name + +=== Transform errors or unexpected responses + +Symptom: Responses are malformed or GitHub Copilot reports format errors. + +Causes and solutions: + +* **Transform disabled**: Ensure OpenAI-compatible transform is enabled for non-OpenAI providers (for example, Anthropic) +* **Transform version mismatch**: Verify the transform is compatible with the current provider API version +* **Model-specific transform issues**: Some models may require specific transform configurations +* **Check transform logs**: Review logs for transform errors and stack traces +* **Response format incompatibility**: Verify the provider's response can be transformed to OpenAI format + +=== High costs or token usage + +Symptom: Token usage and costs exceed expectations. + +Causes and solutions: + +* **Code completion using expensive model**: Configure completion to use `gpt-4o-mini` instead of larger models +* **No rate limits**: Apply per-minute rate limits to prevent runaway usage +* **Missing spending limits**: Set monthly budget limits with blocking enforcement +* **Chat using wrong model**: Ensure chat/explanation features use cost-effective models +* **Transform overhead**: Monitor if transforms add significant token overhead +* **High completion request volume**: Expected behavior, adjust budgets or implement stricter rate limits + +=== Requests failing with 429 errors + +Symptom: GitHub Copilot receives HTTP 429 Too Many Requests errors. + +Causes and solutions: + +* **Rate limit exceeded**: Review and increase rate limits if usage is legitimate (code completion needs very high limits) +* **Upstream provider rate limits**: Check if the upstream LLM provider is rate-limiting; configure failover to alternate providers +* **Budget exhausted**: Verify monthly spending limit has not been reached +* **Per-user limits too restrictive**: Adjust per-user rate limits if using multi-tenant strategies +* **Spike in usage**: Code completion can generate sudden usage spikes, consider burstable rate limits + +=== Multi-tenant routing failures + +Symptom: Requests route to wrong gateway or fail authorization. 
+ +Causes and solutions: + +* **Query parameter missing**: Ensure query parameter is appended to all requests if using query-based routing +* **Token metadata incorrect**: Verify token is tagged with correct gateway metadata +* **Routing rule conflicts**: Check for overlapping routing rules that may cause unexpected routing +* **Organization policy override**: Verify GitHub organization settings aren't overriding user configurations +* **Extension not configured**: If using OAI Compatible Provider extension, verify proper installation and configuration + +=== Performance issues + +Symptom: Slow response times from GitHub Copilot. + +Causes and solutions: + +* **Transform latency**: Monitor metrics for transform processing time overhead +* **Provider latency**: Check latency metrics by provider to identify slow backends +* **Network latency**: Verify cluster is in a region with good connectivity to users +* **Cold start delays**: Some providers may have cold start latency on first request +* **Rate limiting overhead**: Check if rate limit enforcement is adding latency + +== Next steps + +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Analyze detailed request logs and transform operations +* xref:ai-agents:ai-gateway/observability-metrics.adoc[]: Set up metrics dashboards +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Implement advanced routing rules for model aliasing +* xref:ai-agents:ai-gateway/migration-guide.adoc[]: Migrate from direct provider access to AI Gateway diff --git a/modules/ai-agents/pages/ai-gateway/integrations/github-copilot-user.adoc b/modules/ai-agents/pages/ai-gateway/integrations/github-copilot-user.adoc new file mode 100644 index 00000000..d0ead4b5 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/github-copilot-user.adoc @@ -0,0 +1,1010 @@ += Configure GitHub Copilot with AI Gateway +:description: Configure GitHub Copilot to use Redpanda AI Gateway for unified LLM access and custom provider management. +:page-topic-type: how-to +:page-personas: ai_agent_developer, app_developer +:learning-objective-1: Configure GitHub Copilot in VS Code and JetBrains IDEs to route requests through AI Gateway +:learning-objective-2: Set up multi-tenancy with gateway ID headers for cost tracking +:learning-objective-3: Configure enterprise BYOK deployments for team-wide Copilot access + +After xref:ai-agents:ai-gateway/ai-gateway.adoc[configuring your AI Gateway], set up GitHub Copilot to route LLM requests through the gateway for centralized observability, cost management, and provider flexibility. + +After reading this page, you will be able to: + +* [ ] {learning-objective-1} +* [ ] {learning-objective-2} +* [ ] {learning-objective-3} + +== Prerequisites + +Before configuring GitHub Copilot, ensure you have: + +* GitHub Copilot subscription (Individual, Business, or Enterprise) +* An active Redpanda AI Gateway with: +** At least one LLM provider enabled (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-1-enable-a-provider[Enable a provider]) +** A gateway created and configured (see xref:ai-agents:ai-gateway/ai-gateway.adoc#step-3-create-a-gateway[Create a gateway]) +* Your AI Gateway credentials: +** Gateway endpoint URL (for example, `https://gw.ai.panda.com`) +** Gateway ID (for example, `gateway-abc123`) +** API key with access to the gateway +* Your IDE: +** VS Code with GitHub Copilot extension installed +** Or JetBrains IDE (IntelliJ IDEA, PyCharm, etc.) 
with GitHub Copilot plugin + +== About GitHub Copilot and AI Gateway + +GitHub Copilot provides AI-powered code completion and chat within your IDE. By default, Copilot routes requests directly to GitHub's infrastructure, which uses OpenAI and other LLM providers. + +When you route Copilot through AI Gateway, you gain: + +* Centralized observability across all Copilot usage +* Cost attribution per developer, team, or project +* Provider flexibility (use your own API keys or alternative models) +* Policy enforcement (rate limits, spend controls) +* Multi-tenancy support for enterprise deployments + +== Configuration approaches + +GitHub Copilot supports different configuration approaches depending on your IDE and subscription tier: + +[cols="1,2,2,1"] +|=== +|IDE |Method |Subscription Tier |Complexity + +|VS Code +|Custom OpenAI models +|Individual, Business, Enterprise +|Medium + +|VS Code +|OAI Compatible Provider extension +|Individual, Business, Enterprise +|Low + +|JetBrains +|Enterprise BYOK +|Enterprise +|Low +|=== + +Choose the approach that matches your environment. VS Code users have multiple options, while JetBrains users need GitHub Copilot Enterprise with BYOK support. + +== Configure in VS Code + +VS Code offers two approaches for routing Copilot through AI Gateway: + +. Custom OpenAI models (manual configuration) +. OAI Compatible Provider extension (simplified) + +=== Option 1: Custom OpenAI models + +This approach configures VS Code to recognize your AI Gateway as a custom OpenAI-compatible provider. + +==== Configure custom models + +. Open VS Code Settings: +** macOS: `Cmd+,` +** Windows/Linux: `Ctrl+,` +. Search for `github.copilot.advanced` +. Click *Edit in settings.json* +. Add the following configuration: + +[,json] +---- +{ + "github.copilot.advanced": { + "customModels": { + "redpanda-gateway": { + "endpoint": "https://gw.ai.panda.com/v1", + "apiKey": "${env:REDPANDA_API_KEY}", + "models": [ + { + "id": "anthropic/claude-sonnet-4-5", + "name": "Claude Sonnet 4.5 (Gateway)", + "type": "chat" + }, + { + "id": "openai/gpt-4o", + "name": "GPT-4o (Gateway)", + "type": "chat" + }, + { + "id": "openai/gpt-4o-mini", + "name": "GPT-4o Mini (Gateway)", + "type": "completion" + } + ] + } + } + } +} +---- + +Replace `https://gw.ai.panda.com/v1` with your gateway endpoint. + +==== Add gateway ID header + +The custom models configuration doesn't support custom headers directly. To add the `rp-aigw-id` header, use one of these approaches: + +**Approach A: Use OAI Compatible Provider extension** (recommended, see Option 2 below) + +**Approach B: Configure gateway to use API key for routing** (if your gateway supports this) + +Check your AI Gateway documentation to see if you can embed the gateway ID in the API key or use a different authentication method that doesn't require custom headers. + +==== Set environment variable + +Set the API key as an environment variable before launching VS Code: + +[,bash] +---- +export REDPANDA_API_KEY="your-api-key" +code . +---- + +On Windows (PowerShell): + +[,powershell] +---- +$env:REDPANDA_API_KEY = "your-api-key" +code . +---- + +==== Select model + +. Open a file in VS Code +. Open Copilot chat with `Cmd+I` (macOS) or `Ctrl+I` (Windows/Linux) +. Click the model selector dropdown +. Choose a model from the "redpanda-gateway" provider + +=== Option 2: OAI Compatible Provider extension + +The OAI Compatible Provider extension simplifies custom provider configuration and supports custom headers. + +==== Install extension + +. 
Open VS Code Extensions (`Cmd+Shift+X` or `Ctrl+Shift+X`) +. Search for "OAI Compatible Provider" +. Click *Install* + +==== Configure provider + +. Open VS Code Settings (`Cmd+,` or `Ctrl+,`) +. Search for `oai.provider` +. Configure the following settings: + +[,json] +---- +{ + "oai.provider.endpoint": "https://gw.ai.panda.com/v1", + "oai.provider.apiKey": "${env:REDPANDA_API_KEY}", + "oai.provider.headers": { + "rp-aigw-id": "gateway-abc123" + }, + "oai.provider.models": [ + { + "id": "anthropic/claude-sonnet-4-5", + "name": "Claude Sonnet 4.5", + "type": "chat" + }, + { + "id": "openai/gpt-4o-mini", + "name": "GPT-4o Mini", + "type": "completion" + } + ] +} +---- + +Replace placeholder values: + +* `https://gw.ai.panda.com/v1` - Your gateway endpoint +* `gateway-abc123` - Your gateway ID + +==== Enable for Copilot + +. Search for `github.copilot.advanced` in settings +. Add the following: + +[,json] +---- +{ + "github.copilot.advanced": { + "useOAIProvider": true + } +} +---- + +. Reload VS Code window: +** Command Palette (`Cmd+Shift+P` or `Ctrl+Shift+P`) +** Type "Developer: Reload Window" + +==== Set environment variable + +[,bash] +---- +export REDPANDA_API_KEY="your-api-key" +code . +---- + +On Windows (PowerShell): + +[,powershell] +---- +$env:REDPANDA_API_KEY = "your-api-key" +code . +---- + +== Configure in JetBrains IDEs + +JetBrains IDE integration requires GitHub Copilot Enterprise with Bring Your Own Key (BYOK) support. + +=== Prerequisites + +* GitHub Copilot Enterprise subscription +* BYOK enabled for your organization +* JetBrains IDE 2024.1 or later +* GitHub Copilot plugin version 1.4.0 or later + +=== Configure BYOK with AI Gateway + +. Open your JetBrains IDE (IntelliJ IDEA, PyCharm, etc.) +. Navigate to *Settings/Preferences*: +** macOS: `Cmd+,` +** Windows/Linux: `Ctrl+Alt+S` +. Go to *Tools* > *GitHub Copilot* +. Under *Advanced Settings*, find *Custom Model Configuration* +. Configure the OpenAI-compatible endpoint: + +[,text] +---- +Base URL: https://gw.ai.panda.com/v1 +API Key: your-redpanda-api-key +---- + +[start=6] +. Click *Advanced Headers* +. Add custom header: + +[,text] +---- +Header Name: rp-aigw-id +Header Value: gateway-abc123 +---- + +Replace placeholder values: + +* `https://gw.ai.panda.com/v1` - Your gateway endpoint +* `your-redpanda-api-key` - Your Redpanda API key +* `gateway-abc123` - Your gateway ID + +=== Configure model selection + +In the GitHub Copilot settings: + +. Expand *Model Selection* +. Choose your preferred models from the AI Gateway: +** Chat model: `anthropic/claude-sonnet-4-5` or `openai/gpt-4o` +** Code completion model: `openai/gpt-4o-mini` (faster, cost-effective) + +Model format uses `vendor/model_id` pattern to route through the gateway to the appropriate provider. + +=== Test configuration + +. Open a code file +. Trigger code completion (start typing) +. Or open Copilot chat: +** Right-click > *Copilot* > *Open Chat* +** Or use shortcut: `Cmd+Shift+C` (macOS) or `Ctrl+Shift+C` (Windows/Linux) +. Verify suggestions appear + +Check the AI Gateway dashboard to confirm requests are logged. + +== Multi-tenancy configuration + +For organizations with multiple teams or projects sharing AI Gateway, use gateway ID headers to track usage per team. 
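
Whichever approach you choose, attribution works the same way: each request carries the gateway ID in the `rp-aigw-id` header, and the observability dashboard groups usage by that ID. As a quick check before rolling a configuration out to a team, send one tagged request and confirm it appears under the expected gateway. The sketch below assumes the standard OpenAI-compatible `/v1/chat/completions` path and reuses the example endpoint, API key variable, and gateway ID from earlier in this guide; substitute your own values:

[,bash]
----
# Send one test completion tagged with a specific gateway ID,
# then confirm it appears in that gateway's dashboard.
curl https://gw.ai.panda.com/v1/chat/completions \
  -H "Authorization: Bearer $REDPANDA_API_KEY" \
  -H "rp-aigw-id: gateway-abc123" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4o-mini",
    "messages": [{"role": "user", "content": "ping"}],
    "max_tokens": 5
  }'
----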
+ +=== Approach 1: One gateway per team + +Create separate gateways for each team: + +* Team A Gateway: ID `team-a-gateway-123` +* Team B Gateway: ID `team-b-gateway-456` + +Each team configures their IDE with their team's gateway ID: + +[,json] +---- +{ + "oai.provider.headers": { + "rp-aigw-id": "team-a-gateway-123" + } +} +---- + +Benefits: + +* Isolated cost tracking per team +* Team-specific rate limits and budgets +* Separate observability dashboards + +=== Approach 2: Shared gateway with custom headers + +Use a single gateway with custom headers for attribution: + +[,json] +---- +{ + "oai.provider.headers": { + "rp-aigw-id": "shared-gateway-789", + "x-team": "backend-team", + "x-project": "api-service" + } +} +---- + +Configure gateway CEL routing to read these headers: + +[,cel] +---- +request.headers["x-team"] == "backend-team" ? "openai/gpt-4o" : "openai/gpt-4o-mini" +---- + +Benefits: + +* Single gateway to manage +* Flexible cost attribution +* Header-based routing policies + +Filter observability dashboard by `x-team` or `x-project` headers to generate team-specific reports. + +=== Approach 3: Environment-based gateways + +Separate development, staging, and production environments: + +[,json] +---- +{ + "oai.provider.headers": { + "rp-aigw-id": "${env:GATEWAY_ID}", + "x-environment": "${env:ENVIRONMENT}" + } +} +---- + +Set environment variables per workspace: + +[,bash] +---- +# Development workspace +export GATEWAY_ID="dev-gateway-123" +export ENVIRONMENT="development" + +# Production workspace +export GATEWAY_ID="prod-gateway-456" +export ENVIRONMENT="production" +---- + +Benefits: + +* Prevent development usage from affecting production metrics +* Different rate limits and budgets per environment +* Environment-specific model access policies + +== Enterprise BYOK at scale + +For large organizations deploying GitHub Copilot Enterprise with AI Gateway across hundreds or thousands of developers. + +=== Centralized configuration management + +Distribute IDE configuration files via: + +* **Git repository**: Store `settings.json` or IDE configuration in a shared repository +* **Configuration management tools**: Puppet, Chef, Ansible +* **Group Policy** (Windows environments) +* **MDM solutions** (macOS environments) + +Example centralized configuration: + +[,json] +---- +{ + "oai.provider.endpoint": "https://gw.company.com/v1", + "oai.provider.apiKey": "${env:COPILOT_GATEWAY_KEY}", + "oai.provider.headers": { + "rp-aigw-id": "${env:COPILOT_GATEWAY_ID}", + "x-user-email": "${env:USER_EMAIL}", + "x-department": "${env:DEPARTMENT}" + } +} +---- + +Developers set environment variables locally or receive them from identity management systems. + +=== API key management + +**Option 1: Individual API keys** + +Each developer gets their own Redpanda API key: + +* Tied to their identity (email, employee ID) +* Revocable when they leave the organization +* Enables per-developer cost attribution + +**Option 2: Team API keys** + +Teams share API keys: + +* Simpler key management +* Cost attribution by team, not individual +* Use custom headers for finer-grained tracking + +**Option 3: Service account keys** + +Single key for all developers: + +* Simplest to deploy +* No per-developer tracking +* Use custom headers for all attribution + +=== Automated provisioning workflow + +. Developer joins organization +. Identity system (Okta, Azure AD, etc.) triggers provisioning: +.. Create Redpanda API key +.. Assign to appropriate gateway +.. Generate IDE configuration file with embedded keys +.. 
Distribute to developer workstation
. Developer installs IDE and GitHub Copilot
. Configuration auto-applies (via MDM or configuration management)
. Developer starts using Copilot immediately

=== Observability and governance

Track usage across the organization:

. Navigate to AI Gateway dashboard
. Filter by custom headers:
** `x-department`: View costs per department
** `x-user-email`: Track usage per developer
** `x-project`: Attribute costs to specific projects
. Generate reports:
** Top 10 users by token usage
** Departments exceeding budget
** Projects using deprecated models
. Set alerts:
** Individual developer exceeds threshold (potential misuse)
** Department budget approaching limit
** Unusual request patterns (security concern)

=== Policy enforcement

Use gateway CEL routing to enforce policies:

[,cel]
----
// Limit junior developers to cost-effective models
request.headers["x-user-level"] == "junior"
  ? "openai/gpt-4o-mini"
  : "anthropic/claude-sonnet-4-5"

// Block access for contractors to expensive models
request.headers["x-user-type"] == "contractor" &&
request.model.contains("opus")
  ? error("Contractors cannot use Opus models")
  : request.model
----

== Verify configuration

After configuring GitHub Copilot, verify it routes requests through your AI Gateway.

=== Test code completion

. Open a code file in your IDE
. Start typing a function definition
. Wait for code completion suggestions to appear

Completion requests appear in the gateway dashboard with:

* Low token counts (typically 50-200 tokens)
* High request frequency (as you type)
* The completion model you configured

=== Test chat interface

. Open Copilot chat:
** VS Code: `Cmd+I` (macOS) or `Ctrl+I` (Windows/Linux)
** JetBrains: Right-click > *Copilot* > *Open Chat*
. Ask a simple question: "Explain this function"
. Wait for response

Chat requests appear in the gateway dashboard with:

* Higher token counts (500-2000 tokens typical)
* The chat model you configured
* Response status (200 for success)

=== Verify in dashboard

. Open the Redpanda Cloud Console
. Navigate to your gateway's observability dashboard
. Filter by gateway ID
. Verify:
** Requests appear in logs
** Models show correct format (for example, `anthropic/claude-sonnet-4-5`)
** Token usage and cost are recorded
** Custom headers appear (if configured)

If requests don't appear, see <<troubleshooting>>.

== Advanced configuration

=== Model-specific settings

Configure different models for different tasks:

[,json]
----
{
  "oai.provider.models": [
    {
      "id": "anthropic/claude-sonnet-4-5",
      "name": "Claude Sonnet (chat)",
      "type": "chat",
      "temperature": 0.7,
      "maxTokens": 4096
    },
    {
      "id": "openai/gpt-4o-mini",
      "name": "GPT-4o Mini (completion)",
      "type": "completion",
      "temperature": 0.2,
      "maxTokens": 512
    }
  ]
}
----

Settings explained:

* Chat uses Claude Sonnet with higher temperature for creative responses
* Completion uses GPT-4o Mini with lower temperature for deterministic code
* Chat allows longer responses, while completion limits tokens for speed

=== Workspace-specific configuration

Override global settings for specific projects using workspace settings.
+ +In VS Code, create `.vscode/settings.json` in your project root: + +[,json] +---- +{ + "oai.provider.headers": { + "rp-aigw-id": "project-gateway-123", + "x-project": "customer-portal" + } +} +---- + +Benefits: + +* Route different projects through different gateways +* Track costs per project +* Use different models per project (cost-effective for internal, premium for customer-facing) + +=== Custom request timeouts + +Configure timeout for AI Gateway requests: + +[,json] +---- +{ + "oai.provider.timeout": 30000 +} +---- + +Timeout is in milliseconds. Default is typically 30000 (30 seconds). + +Increase timeouts for: + +* High-latency network environments +* Complex code generation tasks +* Large file context + +=== Debug mode + +Enable debug logging to troubleshoot issues: + +[,json] +---- +{ + "oai.provider.debug": true, + "github.copilot.advanced": { + "debug": true + } +} +---- + +View debug logs: + +* VS Code: Developer Console (`Help` > `Toggle Developer Tools` > `Console` tab) +* JetBrains: `Help` > `Diagnostic Tools` > `Debug Log Settings` > Add `github.copilot` + +Debug mode shows: + +* HTTP request and response headers +* Model selection decisions +* Token usage calculations +* Error details + +[[troubleshooting]] +== Troubleshooting + +=== Copilot shows no suggestions + +**Symptom**: Code completion doesn't work or Copilot shows "No suggestions available". + +**Causes and solutions**: + +. **Configuration not loaded** ++ +Reload your IDE to apply configuration changes: ++ +* VS Code: Command Palette > "Developer: Reload Window" +* JetBrains: File > Invalidate Caches / Restart + +. **Incorrect endpoint URL** ++ +Verify the URL format includes `/v1` at the end: ++ +[,text] +---- +# Correct +https://gw.ai.panda.com/v1 + +# Incorrect +https://gw.ai.panda.com +---- + +. **Authentication failure** ++ +Verify your API key is valid: ++ +[,bash] +---- +curl -H "Authorization: Bearer YOUR_API_KEY" \ + -H "rp-aigw-id: GATEWAY_ID" \ + https://gw.ai.panda.com/v1/models +---- ++ +You should receive a list of available models. If you get `401 Unauthorized`, regenerate your API key in the Redpanda Cloud Console. + +. **Extension/plugin disabled** ++ +Verify GitHub Copilot is enabled: ++ +* VS Code: Extensions view > GitHub Copilot > Ensure "Enabled" +* JetBrains: Settings > Plugins > GitHub Copilot > Check "Enabled" + +. **Network connectivity issues** ++ +Test connectivity to the gateway: ++ +[,bash] +---- +curl -I https://gw.ai.panda.com/v1 +---- ++ +If this times out, check your network configuration, firewall rules, or VPN connection. + +=== Requests not appearing in gateway dashboard + +**Symptom**: Copilot works, but requests don't appear in the AI Gateway observability dashboard. + +**Causes and solutions**: + +. **Wrong gateway ID** ++ +Verify the `rp-aigw-id` header matches the gateway you're viewing in the dashboard (case-sensitive). + +. **Missing header configuration** ++ +Ensure `rp-aigw-id` header is configured: ++ +[,json] +---- +{ + "oai.provider.headers": { + "rp-aigw-id": "GATEWAY_ID" + } +} +---- ++ +For VS Code custom models without extension support, you may need to use Option 2 (OAI Compatible Provider extension). + +. **Using direct GitHub connection** ++ +If the endpoint configuration is missing or incorrect, Copilot may route directly to GitHub instead of your gateway. Verify endpoint configuration. + +. **Log ingestion delay** ++ +Gateway logs can take 5-10 seconds to appear in the dashboard. Wait briefly and refresh. + +. 
**Environment variable not set** ++ +If using environment variables like `${env:REDPANDA_API_KEY}`, verify they're set before launching the IDE: ++ +[,bash] +---- +echo $REDPANDA_API_KEY # Should print your API key +---- + +=== High latency or slow suggestions + +**Symptom**: Code completion is slow or chat responses take a long time. + +**Causes and solutions**: + +. **Gateway geographic distance** ++ +If your gateway is in a different region than you or the upstream provider, this adds network latency. Check gateway region in the Redpanda Cloud Console. + +. **Slow model for completion** ++ +Use a faster model for code completion: ++ +[,json] +---- +{ + "oai.provider.models": [ + { + "id": "openai/gpt-4o-mini", + "type": "completion" + } + ] +} +---- ++ +Models like GPT-4o Mini or Claude Haiku provide faster responses ideal for code completion. + +. **Provider pool failover** ++ +If your gateway is configured with fallback providers, check the logs to see if requests are failing over. Failover adds latency. + +. **Rate limiting** ++ +If you're hitting rate limits, the gateway may be queuing requests. Check the observability dashboard for rate limit metrics. + +. **Token limit too high** ++ +Reduce `maxTokens` for completion models to improve speed: ++ +[,json] +---- +{ + "oai.provider.models": [ + { + "id": "openai/gpt-4o-mini", + "type": "completion", + "maxTokens": 256 + } + ] +} +---- + +=== Custom headers not being sent + +**Symptom**: Custom headers (like `x-team` or `x-project`) don't appear in gateway logs. + +**Causes and solutions**: + +. **Extension not installed (VS Code)** ++ +Custom headers require the OAI Compatible Provider extension in VS Code. Install it from the Extensions marketplace. + +. **Header configuration location** ++ +Ensure headers are in the correct configuration section: ++ +[,json] +---- +{ + "oai.provider.headers": { + "rp-aigw-id": "GATEWAY_ID", + "x-custom": "value" + } +} +---- ++ +Not: ++ +[,json] +---- +{ + "github.copilot.advanced": { + "headers": { // Wrong location + "x-custom": "value" + } + } +} +---- + +. **Environment variable not expanded** ++ +If using `${env:VAR_NAME}` syntax, verify the environment variable is set before launching the IDE. + +=== Model not recognized + +**Symptom**: Error message "Model not found" or "Invalid model ID". + +**Causes and solutions**: + +. **Incorrect model format** ++ +Ensure model names use the `vendor/model_id` format: ++ +[,text] +---- +# Correct +anthropic/claude-sonnet-4-5 +openai/gpt-4o + +# Incorrect +claude-sonnet-4-5 +gpt-4o +---- + +. **Model not enabled in gateway** ++ +Verify the model is enabled in your AI Gateway configuration: ++ +.. Open Redpanda Cloud Console +.. Navigate to your gateway +.. Check enabled providers and models + +. **Typo in model ID** ++ +Double-check the model ID matches exactly (case-sensitive). Copy from the AI Gateway UI rather than typing manually. + +=== Configuration changes not taking effect + +**Symptom**: Changes to settings don't apply. + +**Solutions**: + +. **Reload IDE** ++ +Configuration changes require reloading: ++ +* VS Code: Command Palette > "Developer: Reload Window" +* JetBrains: File > Invalidate Caches / Restart + +. **Invalid JSON syntax** ++ +Validate your `settings.json` file: ++ +[,bash] +---- +python3 -m json.tool ~/.config/Code/User/settings.json +---- ++ +Fix any syntax errors reported. + +. **Workspace settings override** ++ +Check if `.vscode/settings.json` in your project root overrides global settings. 
Workspace settings take precedence over global settings. + +. **File permissions** ++ +Verify the IDE can read the configuration file: ++ +[,bash] +---- +ls -la ~/.config/Code/User/settings.json +---- ++ +Fix permissions if needed: ++ +[,bash] +---- +chmod 600 ~/.config/Code/User/settings.json +---- + +== Cost optimization tips + +=== Use different models for chat and completion + +Code completion needs speed, while chat benefits from reasoning depth: + +[,json] +---- +{ + "oai.provider.models": [ + { + "id": "anthropic/claude-sonnet-4-5", + "type": "chat" + }, + { + "id": "openai/gpt-4o-mini", + "type": "completion" + } + ] +} +---- + +This can reduce costs by 5-10x for code completion while maintaining chat quality. + +=== Limit token usage + +Reduce maximum tokens for completion to prevent runaway costs: + +[,json] +---- +{ + "oai.provider.models": [ + { + "id": "openai/gpt-4o-mini", + "type": "completion", + "maxTokens": 256 + } + ] +} +---- + +Code completion rarely needs more than 256 tokens. + +=== Monitor usage patterns + +Use the AI Gateway dashboard to identify optimization opportunities: + +. Navigate to your gateway's observability dashboard +. Filter by custom headers (for example, `x-team`, `x-user-email`) +. Analyze: +** Token usage per developer or team +** Most expensive queries +** High-frequency low-value requests + +=== Set team-based budgets + +Use separate gateways or CEL routing to enforce team budgets: + +[,cel] +---- +// Limit team to 1 million tokens per month +request.headers["x-team"] == "frontend" && +monthly_tokens > 1000000 + ? error("Team budget exceeded") + : request.model +---- + +Configure alerts in the dashboard when teams approach their limits. + +=== Track costs per project + +Use custom headers to attribute costs: + +[,json] +---- +{ + "oai.provider.headers": { + "x-project": "mobile-app" + } +} +---- + +Generate project-specific cost reports from the gateway dashboard. + +== Next steps + +* xref:ai-agents:ai-gateway/observability-logs.adoc[]: Monitor GitHub Copilot requests in the gateway dashboard +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[]: Use CEL expressions to route Copilot requests based on context +* xref:ai-agents:ai-gateway/mcp-aggregation-guide.adoc[]: Learn about MCP tool integration (if using Copilot Workspace) + +== Related pages + +* xref:ai-agents:ai-gateway/ai-gateway.adoc[]: Create and configure your AI Gateway +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[]: Learn about AI Gateway architecture and benefits +* xref:ai-agents:ai-gateway/integrations/claude-code-user.adoc[]: Configure Claude Code with AI Gateway +* xref:ai-agents:ai-gateway/integrations/continue-user.adoc[]: Configure Continue.dev with AI Gateway +* xref:ai-agents:ai-gateway/integrations/cursor-user.adoc[]: Configure Cursor IDE with AI Gateway diff --git a/modules/ai-agents/pages/ai-gateway/integrations/index.adoc b/modules/ai-agents/pages/ai-gateway/integrations/index.adoc new file mode 100644 index 00000000..f899d2ac --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/integrations/index.adoc @@ -0,0 +1,3 @@ += AI Gateway Integrations +:description: Configure AI development tools and IDEs to connect to Redpanda AI Gateway for centralized LLM routing and MCP tool aggregation. 
+:page-layout: index diff --git a/modules/ai-agents/pages/ai-gateway/mcp-aggregation-guide.adoc b/modules/ai-agents/pages/ai-gateway/mcp-aggregation-guide.adoc new file mode 100644 index 00000000..94e014c4 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/mcp-aggregation-guide.adoc @@ -0,0 +1,1031 @@ += MCP Aggregation and Orchestration Guide +:description: Guide to MCP aggregation and orchestration in Redpanda AI Gateway, including architecture, deferred tool loading, orchestrator workflows, administration, observability, security, and integration examples. +:page-personas: app_developer + +AI Gateway provides MCP (Model Context Protocol) aggregation, allowing AI agents to access tools from multiple MCP servers through a single unified endpoint. This eliminates the need for agents to manage multiple MCP connections and significantly reduces token costs through deferred tool loading. + +After reading this page, you will be able to: + +* Configure MCP aggregation with deferred tool loading to reduce token costs by 80-90%. +* Write orchestrator workflows in JavaScript to reduce multi-step interactions from multiple round trips to a single request. +* Add and manage approved MCP servers with appropriate security controls and audit trails. + +MCP aggregation benefits: + +* Single endpoint: One MCP endpoint aggregates all approved MCP servers +* Token reduction: 80-90% fewer tokens through deferred tool loading +* Centralized governance: Admin-approved MCP servers only +* Orchestration: JavaScript-based orchestrator reduces multi-step round trips +* Security: Controlled tool execution environment + +== What is MCP? + +*Model Context Protocol (MCP)* is a standard for exposing tools (functions) that AI agents can discover and invoke. MCP servers provide tools like: + +* Database queries +* File system operations +* API integrations (CRM, payment, analytics) +* Search (web, vector, enterprise) +* Code execution +* Workflow automation + +[cols="1,1"] +|=== +| Without AI Gateway | With AI Gateway + +| Agent connects to each MCP server individually +| Agent connects to gateway's unified `/mcp` endpoint + +| Agent loads ALL tools from ALL servers upfront (high token cost) +| Gateway aggregates tools from approved MCP servers + +| No centralized governance or security +| Deferred loading: Only search + orchestrator tools sent initially + +| Complex configuration +| Agent queries for specific tools when needed (token savings) + +| +| Centralized governance and observability +|=== + +== Architecture + +[source,text] +---- +┌─────────────────┐ +│ AI Agent │ +│ (Claude, GPT) │ +└────────┬────────┘ + │ + │ 1. Discover tools via /mcp endpoint + │ 2. 
Invoke specific tool + │ +┌────────▼────────────────────────────────┐ +│ AI Gateway (MCP Aggregator) │ +│ │ +│ ┌─────────────────────────────────┐ │ +│ │ Deferred Tool Loading │ │ +│ │ (Send search + orchestrator │ │ +│ │ initially, defer others) │ │ +│ └─────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────┐ │ +│ │ Orchestrator (JavaScript) │ │ +│ │ (Reduce round trips for │ │ +│ │ multi-step workflows) │ │ +│ └─────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────┐ │ +│ │ Approved MCP Server Registry │ │ +│ │ (Admin-controlled) │ │ +│ └─────────────────────────────────┘ │ +└────────┬────────────────────────────────┘ + │ + │ Routes to appropriate MCP server + │ + ┌────▼─────┬──────────┬─────────┐ + │ │ │ │ +┌───▼────┐ ┌──▼─────┐ ┌──▼──────┐ ┌▼──────┐ +│ MCP │ │ MCP │ │ MCP │ │ MCP │ +│Database│ │Filesystem│ │ Slack │ │Search │ +│Server │ │ Server │ │ Server │ │Server │ +└────────┘ └────────┘ └─────────┘ └───────┘ +---- + + +== MCP request lifecycle + +=== 1. Tool discovery (initial connection) + +Agent request: + +[source,http] +---- +GET /mcp/tools +Headers: + Authorization: Bearer {TOKEN} + rp-aigw-id: {GATEWAY_ID} + rp-aigw-mcp-deferred: true # Enable deferred loading +---- + + +Gateway response (with deferred loading): + +[source,json] +---- +{ + "tools": [ + { + "name": "search_tools", + "description": "Query available tools by keyword or category", + "input_schema": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "category": {"type": "string"} + } + } + }, + { + "name": "orchestrator", + "description": "Execute multi-step workflows with JavaScript logic", + "input_schema": { + "type": "object", + "properties": { + "workflow": {"type": "string"}, + "context": {"type": "object"} + } + } + } + ] +} +---- + + +Note: Only 2 tools returned initially (search + orchestrator), not all 50+ tools from all MCP servers. + +Token savings: + +* Without deferred loading: ~5,000-10,000 tokens (all tool definitions) +* With deferred loading: ~500-1,000 tokens (2 tool definitions) +* 80-90% reduction + +=== 2. Tool query (when agent needs specific tool) + +Agent request: + +[source,http] +---- +POST /mcp/tools/search_tools +Headers: + Authorization: Bearer {TOKEN} + rp-aigw-id: {GATEWAY_ID} +Body: +{ + "query": "database query" +} +---- + + +Gateway response: + +[source,json] +---- +{ + "tools": [ + { + "name": "execute_sql", + "description": "Execute SQL query against the database", + "mcp_server": "database-server", + "input_schema": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "database": {"type": "string"} + }, + "required": ["query"] + } + }, + { + "name": "list_tables", + "description": "List all tables in the database", + "mcp_server": "database-server", + "input_schema": { + "type": "object", + "properties": { + "database": {"type": "string"} + } + } + } + ] +} +---- + + +Agent receives only relevant tools based on query. + +=== 3. Tool execution + +Agent request: + +[source,http] +---- +POST /mcp/tools/execute_sql +Headers: + Authorization: Bearer {TOKEN} + rp-aigw-id: {GATEWAY_ID} +Body: +{ + "query": "SELECT * FROM users WHERE tier = 'premium' LIMIT 10", + "database": "prod" +} +---- + + +Gateway: + +1. Routes to appropriate MCP server (database-server) +2. Executes tool +3. Returns result + +Gateway response: + +[source,json] +---- +{ + "result": [ + {"id": 1, "name": "Alice", "tier": "premium"}, + {"id": 2, "name": "Bob", "tier": "premium"}, + ... 
+ ] +} +---- + + +Agent receives result and can continue reasoning. + +== Deferred tool loading + +=== How it works + +Traditional MCP (No deferred loading): + +1. Agent connects to MCP endpoint +2. Gateway sends ALL tools from ALL MCP servers (50+ tools) +3. Agent includes ALL tool definitions in EVERY LLM request +4. High token cost: ~5,000-10,000 tokens per request + +Deferred loading (AI Gateway): + +1. Agent connects to MCP endpoint with `rp-aigw-mcp-deferred: true` header +2. Gateway sends only 2 tools: `search_tools` + `orchestrator` +3. Agent includes only 2 tool definitions in LLM request (~500-1,000 tokens) +4. When agent needs specific tool: + * Agent calls `search_tools` with query (e.g., "database") + * Gateway returns matching tools + * Agent calls specific tool (e.g., `execute_sql`) +5. Total token cost: Initial 500-1,000 + per-query ~200-500 + * Still 80-90% lower than loading all tools + +=== When to use deferred loading + +Use deferred loading when: + +* You have 10+ tools across multiple MCP servers +* Agents don't need all tools for every request +* Token costs are a concern +* Agents can handle multi-step workflows (search → execute) + +Don't use deferred loading when: + +* You have <5 tools total (overhead not worth it) +* Agents need all tools for every request (rare) +* Latency is more important than token costs (deferred adds 1 round trip) + +=== Configure deferred loading + +// PLACEHOLDER: Add UI path or configuration method + +Option 1: Enable at gateway level (recommended) + +[source,yaml] +---- +# PLACEHOLDER: Actual configuration format +mcp: + deferred_loading: true # Default for all agents using this gateway +---- + + +Option 2: Enable per-request (agent-controlled) + +[source,python] +---- +# Agent includes header +headers = { + "rp-aigw-id": "gw_abc123", + "rp-aigw-mcp-deferred": "true" # Enable for this request +} +---- + + +=== Measure token savings + +Compare token usage before/after deferred loading: + +1. Check logs without deferred loading: + + * Filter: Gateway = your-gateway, Model = your-model, Date = before enabling + * Average tokens per request: // PLACEHOLDER: measure + +2. Enable deferred loading + +3. Check logs after deferred loading: + + * Filter: Same gateway/model, Date = after enabling + * Average tokens per request: // PLACEHOLDER: measure + +4. Calculate savings: ++ +[source,text] +---- +Savings % = ((Before - After) / Before) × 100 +---- + +Expected Results: 80-90% reduction in average tokens per request + +== Orchestrator: multi-step workflows + +=== What is the orchestrator? + +The *orchestrator* is a special tool that executes JavaScript workflows, reducing multi-step interactions from multiple round trips to a single request. + +Without Orchestrator: + +1. Agent: "Search vector database for relevant docs" → Round trip 1 +2. Agent receives results, evaluates: "Results insufficient" +3. Agent: "Fallback to web search" → Round trip 2 +4. Agent receives results, processes → Round trip 3 +5. *Total: 3 round trips* (high latency, 3× token cost) + +With Orchestrator: + +1. Agent: "Execute workflow: Search vector DB → if insufficient, fallback to web search" +2. Gateway executes entire workflow in JavaScript +3. 
Agent receives final result → *1 round trip* + +Benefits: + +* *Latency Reduction*: 1 round trip vs 3+ +* *Token Reduction*: No intermediate LLM calls needed +* *Reliability*: Workflow logic executes deterministically +* *Cost*: Single LLM call instead of multiple + +=== When to use orchestrator + +Use orchestrator when: + +* Multi-step workflows with conditional logic (if/else) +* Fallback patterns (try A, if fails, try B) +* Sequential tool calls with dependencies +* Loop-based operations (iterate, aggregate) + +Don't use orchestrator when: + +* Single tool call (no benefit) +* Agent needs to reason between steps (orchestrator is deterministic) +* Workflow requires LLM judgment at each step + +=== Orchestrator example: search with fallback + +Scenario: Search vector database; if results insufficient, fallback to web search. + +Without Orchestrator (3 round trips): + +[source,python] +---- +# Agent's internal reasoning (3 separate LLM calls) + +# Round trip 1: Search vector DB +vector_results = call_tool("vector_search", {"query": "Redpanda pricing"}) + +# Round trip 2: Agent evaluates results +if len(vector_results) < 3: + # Round trip 3: Fallback to web search + web_results = call_tool("web_search", {"query": "Redpanda pricing"}) + results = web_results +else: + results = vector_results + +# Agent processes final results +---- + + +With Orchestrator (1 round trip): + +[source,python] +---- +# Agent invokes orchestrator once +results = call_tool("orchestrator", { + "workflow": """ + // JavaScript workflow + const vectorResults = await tools.vector_search({ + query: context.query + }); + + if (vectorResults.length < 3) { + // Fallback to web search + const webResults = await tools.web_search({ + query: context.query + }); + return webResults; + } + + return vectorResults; + """, + "context": { + "query": "Redpanda pricing" + } +}) + +# Agent receives final results directly +---- + + +Savings: + +* Latency: ~3-5 seconds (3 round trips) → ~1-2 seconds (1 round trip) +* Tokens: ~1,500 tokens (3 LLM calls) → ~500 tokens (1 LLM call) +* Cost: ~$0.0075 → ~$0.0025 (67% reduction) + +=== Orchestrator API + +// PLACEHOLDER: Confirm orchestrator API details + +Tool name: `orchestrator` + +Input schema: + +[source,json] +---- +{ + "workflow": "string (JavaScript code)", + "context": "object (variables available to workflow)" +} +---- + + +Available in workflow: + +* `tools.{tool_name}(params)`: Call any tool from approved MCP servers +* `context.{variable}`: Access context variables +* Standard JavaScript: `if`, `for`, `while`, `try/catch`, `async/await` + +Security: + +* Sandboxed execution (no file system, network, or system access) +* Timeout: // PLACEHOLDER: e.g., 30 seconds +* Memory limit: // PLACEHOLDER: e.g., 128MB + +Limitations: + +* Cannot call external APIs directly (must use MCP tools) +* Cannot import npm packages (built-in JS only) +* // PLACEHOLDER: Other limitations? + +=== Orchestrator example: data aggregation + +Scenario: Fetch user data from database, calculate summary statistics. 
+ +[source,python] +---- +results = call_tool("orchestrator", { + "workflow": """ + // Fetch all premium users + const users = await tools.execute_sql({ + query: "SELECT * FROM users WHERE tier = 'premium'", + database: "prod" + }); + + // Calculate statistics + const stats = { + total: users.length, + by_region: {}, + avg_spend: 0 + }; + + let totalSpend = 0; + for (const user of users) { + // Count by region + if (!stats.by_region[user.region]) { + stats.by_region[user.region] = 0; + } + stats.by_region[user.region]++; + + // Sum spend + totalSpend += user.monthly_spend; + } + + stats.avg_spend = totalSpend / users.length; + + return stats; + """, + "context": {} +}) +---- + + +Output: + +[source,json] +---- +{ + "total": 1250, + "by_region": { + "us-east": 600, + "us-west": 400, + "eu": 250 + }, + "avg_spend": 149.50 +} +---- + + +vs Without Orchestrator: + +* Would require fetching all users to agent → agent processes → 2 round trips +* Orchestrator: All processing in gateway → 1 round trip + +=== Orchestrator best practices + +DO: + +* Use for deterministic workflows (same input → same output) +* Use for sequential operations with dependencies +* Use for fallback patterns +* Handle errors with `try/catch` +* Keep workflows readable (add comments) + +DON'T: + +* Use for workflows requiring LLM reasoning at each step (let agent handle that) +* Execute long-running operations (timeout will hit) +* Access external resources (use MCP tools instead) +* Execute untrusted user input (security risk) + +== MCP server administration + +=== Add MCP servers + +// PLACEHOLDER: Add UI path for MCP server management + +Prerequisites: + +* MCP server URL +* Authentication method (if required) +* List of tools to enable + +Steps: + +1. Navigate to MCP servers: + + * Console → AI Gateway → MCP Servers → Add Server + +2. Configure server: ++ +[source,yaml] +---- +# PLACEHOLDER: Actual configuration format +name: database-server +url: https://mcp-database.example.com +authentication: + type: bearer_token + token: ${SECRET_REF} # Reference to secret +enabled_tools: + * execute_sql + * list_tables + * describe_table +---- + +3. Test connection: + + * Gateway attempts connection to MCP server + * Verifies authentication + * Retrieves tool list + +4. Enable server: + + * Server status: Active + * Tools available to agents + +Common MCP servers: + +* Database: PostgreSQL, MySQL, MongoDB query tools +* Filesystem: Read/write/search files +* API Integrations: Slack, GitHub, Salesforce, Stripe +* Search: Web search, vector search, enterprise search +* Code Execution: Python, JavaScript sandboxes +* Workflow: Zapier, n8n integrations + +=== MCP server approval workflow + +Why approval is required: + +* Security: Prevent agents from accessing unauthorized systems +* Governance: Control which tools are available +* Cost: Some tools are expensive (API calls, compute) +* Compliance: Audit trail of approved tools + +Approval process: + +// PLACEHOLDER: Confirm if there's an approval workflow or if admins directly enable servers + +1. Request: User/team requests MCP server +2. Review: Admin reviews security, cost, necessity +3. Approval/Rejection: Admin decision +4. 
Configuration: If approved, admin adds server to gateway + +Rejected server behavior: + +* Server not listed in tool discovery +* Agent cannot query or invoke tools from this server +* Requests return `403 Forbidden` + +=== Restrict MCP server access + +Per-gateway restrictions: + +[source,yaml] +---- +# PLACEHOLDER: Actual configuration format +gateways: + - name: production-gateway + mcp_servers: + allowed: + - database-server # Only this server allowed + denied: + - filesystem-server # Explicitly denied + + - name: staging-gateway + mcp_servers: + allowed: + - "*" # All approved servers allowed +---- + + +Use cases: + +* Production gateway: Only production-safe tools +* Staging gateway: All tools for testing +* Customer-specific gateway: Only tools relevant to customer + +=== MCP server versioning + +// PLACEHOLDER: How is MCP server versioning handled? + +Challenge: MCP server updates may change tool schemas + +Recommendations: + +1. Pin versions (if supported): ++ +[source,yaml] +---- +mcp_servers: + * name: database-server + version: "1.2.3" # Pin to specific version +---- + +2. Test in staging first: + + * Update MCP server in staging gateway + * Test agent workflows + * Promote to production when validated + +3. Monitor breaking changes: + + * Subscribe to MCP server changelogs + * Set up alerts for schema changes + +== MCP observability + +=== Logs + +MCP tool invocations appear in request logs with: + +* Tool name +* MCP server +* Input parameters +* Output result +* Execution time +* Errors (if any) + +Filter logs by MCP: + +[source,text] +---- +Filter: request.path.startsWith("/mcp") +---- + + +Common log fields: + +[cols="1,2,2"] +|=== +| Field | Description | Example + +| Tool +| Tool invoked +| `execute_sql` + +| MCP Server +| Which server handled it +| `database-server` + +| Input +| Parameters sent +| `{"query": "SELECT ..."}` + +| Output +| Result returned +| `[{"id": 1, ...}]` + +| Latency +| Tool execution time +| `250ms` + +| Status +| Success/failure +| `200`, `500` +|=== + +=== Metrics + +// PLACEHOLDER: Confirm if MCP-specific metrics exist + +MCP-specific metrics (if available): + +* MCP requests per second +* Tool invocation count (by tool, by MCP server) +* MCP latency (p50, p95, p99) +* MCP error rate (by server, by tool) +* Orchestrator execution count +* Orchestrator execution time + +Dashboard: MCP Analytics + +* Top tools by usage +* Top MCP servers by latency +* Error rate by MCP server +* Token savings from deferred loading + +=== Debug MCP issues + +Issue: "Tool not found" + +Possible causes: + +1. MCP server not added to gateway +2. Tool not enabled in MCP server configuration +3. Deferred loading enabled but agent didn't query for tool first + +Solution: + +1. Verify MCP server is active: // PLACEHOLDER: UI path +2. Verify tool is in enabled_tools list +3. If deferred loading: Agent must call `search_tools` first + +Issue: "MCP server timeout" + +Possible causes: + +1. MCP server is down/unreachable +2. Tool execution is slow (e.g., expensive database query) +3. Gateway timeout too short + +Solution: + +1. Check MCP server health +2. Optimize tool (e.g., add database index) +3. Increase timeout: // PLACEHOLDER: How to configure? + +Issue: "Orchestrator workflow failed" + +Possible causes: + +1. JavaScript syntax error +2. Tool invocation failed inside workflow +3. Timeout exceeded +4. Memory limit exceeded + +Solution: + +1. Test workflow syntax in JavaScript playground +2. Check logs for tool error inside orchestrator +3. 
Simplify workflow or increase timeout
4. Reduce data processing in workflow

== Security considerations

=== Tool execution sandboxing

// PLACEHOLDER: Confirm sandboxing implementation

Orchestrator sandbox:

* No file system access
* No network access (except via MCP tools)
* No system calls
* Memory limit: // PLACEHOLDER: e.g., 128MB
* Execution timeout: // PLACEHOLDER: e.g., 30s

MCP tool execution:

* Tools execute in MCP server's environment (not gateway)
* Gateway does not execute tool code (only proxies requests)
* Security is MCP server's responsibility

=== Authentication

Gateway → MCP server:

* Bearer token (most common)
* API key
* mTLS (for high-security environments)

Agent → Gateway:

* Standard gateway authentication (Redpanda Cloud token)
* `rp-aigw-id` header identifies gateway (and its approved MCP servers)

=== Audit trail

All MCP operations logged:

* Who (agent/user) invoked tool
* When (timestamp)
* What tool was invoked
* What parameters were sent
* What result was returned
* Whether it succeeded or failed

Use case: Compliance, security investigation, debugging

=== Restrict dangerous tools

Recommendation: Don't enable destructive tools in production gateways

Examples of dangerous tools:

* File deletion (`delete_file`)
* Database writes without safeguards (`execute_sql` with UPDATE/DELETE)
* Payment operations (`charge_customer`)
* System commands (`execute_bash`)

Best practice:

* Read-only tools in production gateway
* Write tools only in staging gateway (with approval workflows)
* Wrap dangerous operations in MCP server with safeguards (e.g., "require confirmation token")

== MCP + LLM routing

=== Combine MCP with CEL routing

Use case: Route agents to different MCP servers based on customer tier

CEL expression:

[source,cel]
----
request.headers["x-customer-tier"] == "enterprise"
  ? "gateway-with-premium-mcp-servers"
  : "gateway-with-basic-mcp-servers"
----

Result:

* Enterprise customers: Access to proprietary data, expensive APIs
* Basic customers: Access to public data, free APIs

=== MCP with provider pools

Scenario: Different agents use different models + different tools

Configuration:

* Gateway A: GPT-4o + database + CRM MCP servers
* Gateway B: Claude Sonnet + web search + analytics MCP servers

Use case: Optimize model-tool pairing (some models better at certain tools)

== Integration examples

[tabs]
====
Python (OpenAI SDK)::
+
--
[source,python]
----
import json
import os

import requests
from openai import OpenAI

# Initialize client with MCP endpoint
client = OpenAI(
    base_url="https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1",
    api_key=os.getenv("REDPANDA_CLOUD_TOKEN"),
    default_headers={
        "rp-aigw-id": os.getenv("GATEWAY_ID"),
        "rp-aigw-mcp-deferred": "true"  # Enable deferred loading
    }
)

# Discover tools
tools_response = requests.get(
    "https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp/tools",
    headers={
        "Authorization": f"Bearer {os.getenv('REDPANDA_CLOUD_TOKEN')}",
        "rp-aigw-id": os.getenv("GATEWAY_ID"),
        "rp-aigw-mcp-deferred": "true"
    }
)
tools = tools_response.json()["tools"]

# Agent uses tools
response = client.chat.completions.create(
    model="anthropic/claude-sonnet-3.5",
    messages=[
        {"role": "user", "content": "Query the database for premium users"}
    ],
    tools=tools,  # Pass MCP tools to agent
    tool_choice="auto"
)

# Handle tool calls
if response.choices[0].message.tool_calls:
    for tool_call in response.choices[0].message.tool_calls:
        # Execute tool via gateway
        tool_result = requests.post(
            f"https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/mcp/tools/{tool_call.function.name}",
            headers={
                "Authorization": f"Bearer {os.getenv('REDPANDA_CLOUD_TOKEN')}",
                "rp-aigw-id": os.getenv("GATEWAY_ID")
            },
            json=json.loads(tool_call.function.arguments)
        )

        # Continue conversation with tool result
        response = client.chat.completions.create(
            model="anthropic/claude-sonnet-3.5",
            messages=[
                {"role": "user", "content": "Query the database for premium users"},
                response.choices[0].message,
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": json.dumps(tool_result.json())
                }
            ]
        )
----
--

Claude Code CLI::
+
--
[source,bash]
----
# Configure gateway with MCP
export CLAUDE_API_BASE="https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1"
export ANTHROPIC_API_KEY="your-redpanda-token"

# Claude Code automatically discovers MCP tools from gateway
claude code

# Agent can now use aggregated MCP tools
----
--

LangChain::
+
--
[source,python]
----
import os

from langchain_openai import ChatOpenAI
from langchain.agents import initialize_agent, Tool

# Initialize LLM with gateway
llm = ChatOpenAI(
    base_url="https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1",
    api_key=os.getenv("REDPANDA_CLOUD_TOKEN"),
    default_headers={
        "rp-aigw-id": os.getenv("GATEWAY_ID")
    }
)

# Fetch MCP tools from gateway
# PLACEHOLDER: LangChain-specific integration code

# Create agent with MCP tools
agent = initialize_agent(
    tools=mcp_tools,
    llm=llm,
    agent="openai-tools",
    verbose=True
)

# Agent can now use MCP tools
response = agent.run("Find all premium users in the database")
----
--
====

== Next steps

* Configure MCP servers → [MCP Server Administration Guide](// PLACEHOLDER: link)
* Write Orchestrator workflows → [Orchestrator Examples](//
PLACEHOLDER: link) +* Monitor MCP usage → [Observability: MCP Metrics](// PLACEHOLDER: link) +* Optimize token costs → [Cost Optimization Guide](// PLACEHOLDER: link) +* Build agentic workflows → [Agent Patterns Guide](// PLACEHOLDER: link) \ No newline at end of file diff --git a/modules/ai-agents/pages/ai-gateway/migration-guide.adoc b/modules/ai-agents/pages/ai-gateway/migration-guide.adoc new file mode 100644 index 00000000..fdeea2ba --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/migration-guide.adoc @@ -0,0 +1,942 @@ += Migrate to AI Gateway +:description: Step-by-step migration guide to transition existing applications from direct LLM provider integrations to Redpanda AI Gateway with minimal disruption. +:page-personas: app_developer + +This guide helps you migrate existing applications from direct LLM provider integrations (OpenAI, Anthropic, and others) to Redpanda AI Gateway. Design the migration to be incremental and reversible, allowing you to test thoroughly before fully committing. + +**Downtime required:** None (supports parallel operation) + +**Rollback difficulty:** Easy (feature flag or environment variable) + +After completing this migration, you will be able to: + +* Migrate existing LLM integrations to AI Gateway with zero downtime using feature flags and parallel operation. +* Verify gateway connectivity and compare performance metrics between direct and gateway-routed requests. +* Roll back to direct integration immediately if issues arise during migration. + +== Prerequisites + +Before migrating, ensure you have: + +* AI Gateway configured in your Redpanda Cloud account +* Enabled providers and models (see [Admin Guide: Providers](// PLACEHOLDER: link)) +* Created gateway with appropriate policies (see [Gateway Creation Guide](// PLACEHOLDER: link)) +* Your gateway ID (`rp-aigw-id` header value) +* Your gateway endpoint URL + + + +== Migration strategy + +=== Recommended approach: Parallel operation + +Run both direct and gateway-routed requests simultaneously to validate behavior before full cutover. + +[source,text] +---- +┌─────────────────┐ +│ Application │ +└────────┬────────┘ + │ + ┌────▼─────┐ + │ Feature │ + │ Flag │ + └────┬─────┘ + │ + ┌────▼──────────────┐ + │ │ +┌───▼─────┐ ┌─────▼─────┐ +│ Direct │ │ Gateway │ +│Provider │ │ Route │ +└─────────┘ └───────────┘ +---- + + +Benefits: + +* No downtime +* Easy rollback +* Compare results side-by-side +* Gradual traffic shift + +== Step-by-step migration + +=== Step 1: Add environment variables + +Add gateway configuration to your environment without removing existing provider keys (yet). + +*.env (or equivalent)* +[source,bash] +---- +# Existing (keep these for now) +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... 
+ +# New gateway configuration +REDPANDA_AI_GATEWAY_URL=https://{GATEWAY_ENDPOINT} +REDPANDA_AI_GATEWAY_ID={GATEWAY_ID} +REDPANDA_AI_GATEWAY_TOKEN={YOUR_TOKEN} + +# Feature flag (start with gateway disabled) +USE_AI_GATEWAY=false +---- + + +=== Step 2: Update your code + +==== Option A: OpenAI SDK (recommended for most use cases) + +Before (Direct OpenAI) + +[source,python] +---- +from openai import OpenAI + +client = OpenAI( + api_key=os.getenv("OPENAI_API_KEY") +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}] +) +---- + + +After (Gateway-routed with feature flag) + +[source,python] +---- +from openai import OpenAI +import os + +# Feature flag determines which client to use +use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + +if use_gateway: + client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} + ) + model = "openai/gpt-4o" # Add vendor prefix +else: + client = OpenAI( + api_key=os.getenv("OPENAI_API_KEY") + ) + model = "gpt-4o" # Original model name + +response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "Hello"}] +) +---- + + +Better: Abstraction function + +[source,python] +---- +from openai import OpenAI +import os + +def get_llm_client(): + """Returns configured OpenAI client (direct or gateway-routed)""" + use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + + if use_gateway: + return OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} + ) + else: + return OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def get_model_name(base_model: str) -> str: + """Returns model name with vendor prefix if using gateway""" + use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + return f"openai/{base_model}" if use_gateway else base_model + +# Usage +client = get_llm_client() +response = client.chat.completions.create( + model=get_model_name("gpt-4o"), + messages=[{"role": "user", "content": "Hello"}] +) +---- + + +==== Option B: Anthropic SDK + +Before (Direct Anthropic) + +[source,python] +---- +from anthropic import Anthropic + +client = Anthropic( + api_key=os.getenv("ANTHROPIC_API_KEY") +) + +response = client.messages.create( + model="claude-sonnet-3.5", + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}] +) +---- + + +After (Gateway via OpenAI-compatible wrapper) + +Because AI Gateway provides an OpenAI-compatible endpoint, we recommend migrating Anthropic SDK usage to OpenAI SDK for consistency: + +[source,python] +---- +from openai import OpenAI +import os + +use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + +if use_gateway: + # Use OpenAI SDK with gateway + client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} + ) + + response = client.chat.completions.create( + model="anthropic/claude-sonnet-3.5", + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}] + ) +else: + # Keep existing Anthropic SDK + from anthropic import Anthropic + client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + + response = client.messages.create( + model="claude-sonnet-3.5", + max_tokens=1024, + 
messages=[{"role": "user", "content": "Hello"}] + ) +---- + + +Alternative: Keep Anthropic SDK with base_url override + +// PLACEHOLDER: Verify if Anthropic SDK supports base_url override for OpenAI-compatible endpoints + +[source,python] +---- +from anthropic import Anthropic + +use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + +if use_gateway: + client = Anthropic( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), # If supported + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} + ) +else: + client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) +---- + + +==== Option C: Multiple providers + +Before (Separate SDKs) + +[source,python] +---- +from openai import OpenAI +from anthropic import Anthropic + +openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) +anthropic_client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + +# Different code paths +if use_openai: + response = openai_client.chat.completions.create(...) +else: + response = anthropic_client.messages.create(...) +---- + + +After (Unified via Gateway) + +[source,python] +---- +from openai import OpenAI + +# Single client for all providers +client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} +) + +# Same code, different models +if use_openai: + response = client.chat.completions.create( + model="openai/gpt-4o", + messages=[...] + ) +else: + response = client.chat.completions.create( + model="anthropic/claude-sonnet-3.5", + messages=[...] + ) +---- + + +=== Step 3: Test gateway connection + +Before changing the feature flag, verify gateway connectivity: + +Python Test Script + +[source,python] +---- +from openai import OpenAI +import os + +def test_gateway_connection(): + client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} + ) + + try: + response = client.chat.completions.create( + model="openai/gpt-4o-mini", # Use cheap model for testing + messages=[{"role": "user", "content": "Test"}], + max_tokens=10 + ) + print("✅ Gateway connection successful") + print(f"Response: {response.choices[0].message.content}") + return True + except Exception as e: + print(f"❌ Gateway connection failed: {e}") + return False + +if __name__ == "__main__": + test_gateway_connection() +---- + + +Expected output: + +[source,text] +---- +Gateway connection successful +Response: Hello +---- + + +Common issues: + +* `401 Unauthorized` → Check `REDPANDA_AI_GATEWAY_TOKEN` +* `404 Not Found` → Check `REDPANDA_AI_GATEWAY_URL` (should end with `/v1/chat/completions` or base path) +* `Model not found` → Ensure model is enabled in gateway configuration +* No `rp-aigw-id` header → Verify header is set in `default_headers` + +See [Troubleshooting Guide](// PLACEHOLDER: link) for more details. + +=== Step 4: Verify in observability dashboard + +After successful test: + +1. Open AI Gateway observability dashboard +2. Navigate to // PLACEHOLDER: specific UI path, for example, "Gateways → {GATEWAY_NAME} → Logs" +3. Verify your test request appears +4. 
Check fields: + * Model: `openai/gpt-4o-mini` + * Provider: OpenAI + * Status: 200 + * Token count: ~10 prompt + ~10 completion + * Cost: // PLACEHOLDER: expected cost + +*If request doesn't appear*: Check [End-to-End Validation Guide](// PLACEHOLDER: link) + +=== Step 5: Enable gateway for subset of traffic + +Gradually roll out gateway usage: + +Staged rollout strategy: + +1. *Week 1*: Internal testing only (dev team accounts) +2. *Week 2*: 10% of production traffic +3. *Week 3*: 50% of production traffic +4. *Week 4*: 100% of production traffic + +Implementation options: + +Option A: Environment-based + +[source,python] +---- +# Enable gateway in staging first +use_gateway = os.getenv("ENVIRONMENT") in ["staging", "production"] +---- + + +Option B: Percentage-based + +[source,python] +---- +import random + +# Route 10% of traffic through gateway +use_gateway = random.random() < 0.10 +---- + + +Option C: User-based + +[source,python] +---- +# Enable for internal users first +use_gateway = user.email.endswith("@yourcompany.com") +---- + + +Option D: Feature flag service (recommended) + +[source,python] +---- +# LaunchDarkly, Split.io, etc. +use_gateway = feature_flags.is_enabled("ai-gateway", user_context) +---- + + +=== Step 6: Monitor and compare + +During parallel operation, compare metrics: + +Metrics to monitor: + +[cols="2,1,1,3"] +|=== +| Metric | Direct | Gateway | Notes + +| Success rate +| // track +| // track +| Should be identical + +| Latency p50 +| // track +| // track +| Gateway adds ~// PLACEHOLDER: Xms + +| Latency p99 +| // track +| // track +| Watch for outliers + +| Error rate +| // track +| // track +| Should be identical + +| Cost per 1K requests +| // track +| // track +| Compare estimated costs +|=== + +Monitoring code example: + +[source,python] +---- +import time + +def call_llm_with_metrics(use_gateway: bool, model: str, messages: list): + start_time = time.time() + + try: + client = get_llm_client(use_gateway) + response = client.chat.completions.create( + model=model, + messages=messages + ) + + latency = time.time() - start_time + + # Log metrics + metrics.record("llm.request.success", 1, tags={ + "routing": "gateway" if use_gateway else "direct", + "model": model + }) + metrics.record("llm.request.latency", latency, tags={ + "routing": "gateway" if use_gateway else "direct" + }) + + return response + + except Exception as e: + metrics.record("llm.request.error", 1, tags={ + "routing": "gateway" if use_gateway else "direct", + "error": str(e) + }) + raise +---- + + +=== Step 7: Full cutover + +Once metrics confirm gateway reliability: + +1. Set feature flag to 100%: ++ +[source,bash] +---- +USE_AI_GATEWAY=true +---- + +2. Deploy updated configuration + +3. Monitor for 24-48 hours + +4. Remove direct provider credentials (optional, for security): ++ +[source,bash] +---- +# .env +# OPENAI_API_KEY=sk-... # Remove after confirming gateway stability +# ANTHROPIC_API_KEY=sk-ant-... # Remove after confirming gateway stability + +REDPANDA_AI_GATEWAY_URL=https://{GATEWAY_ENDPOINT} +REDPANDA_AI_GATEWAY_ID={GATEWAY_ID} +REDPANDA_AI_GATEWAY_TOKEN={YOUR_TOKEN} +---- + +5. 
Remove direct integration code (optional, for cleanup): ++ +[source,python] +---- +# Remove feature flag logic, keep only gateway path +client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} +) +---- + +== Rollback procedure + +If issues arise, rollback is simple: + +Emergency rollback (< 1 minute): + +[source,bash] +---- +# Set feature flag back to false +USE_AI_GATEWAY=false + +# Restart application (if needed) +---- + + +Gradual rollback: + +[source,python] +---- +# Reduce gateway traffic percentage +use_gateway = random.random() < 0.50 # Back to 50% +use_gateway = random.random() < 0.10 # Back to 10% +use_gateway = False # Back to 0% +---- + + +*Keep direct provider credentials until you're confident in gateway stability.* + +== Framework-specific migration + +[tabs] +====== +LangChain:: ++ +-- +Before + +[source,python] +---- +from langchain_openai import ChatOpenAI + +llm = ChatOpenAI( + model="gpt-4o", + api_key=os.getenv("OPENAI_API_KEY") +) +---- + +After + +[source,python] +---- +from langchain_openai import ChatOpenAI + +use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + +if use_gateway: + llm = ChatOpenAI( + model="openai/gpt-4o", + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} + ) +else: + llm = ChatOpenAI( + model="gpt-4o", + api_key=os.getenv("OPENAI_API_KEY") + ) +---- +-- + +LlamaIndex:: ++ +-- +Before + +[source,python] +---- +from llama_index.llms.openai import OpenAI + +llm = OpenAI(model="gpt-4o") +---- + +After + +[source,python] +---- +from llama_index.llms.openai import OpenAI + +use_gateway = os.getenv("USE_AI_GATEWAY", "false").lower() == "true" + +if use_gateway: + llm = OpenAI( + model="openai/gpt-4o", + api_base=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + additional_kwargs={"headers": {"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")}} + ) +else: + llm = OpenAI(model="gpt-4o") +---- + +// PLACEHOLDER: Verify LlamaIndex syntax for custom headers +-- + +Vercel AI SDK:: ++ +-- +Before + +[source,typescript] +---- +import { openai } from '@ai-sdk/openai'; + +const model = openai('gpt-4o'); +---- + +After + +[source,typescript] +---- +import { openai } from '@ai-sdk/openai'; + +const useGateway = process.env.USE_AI_GATEWAY === 'true'; + +const model = useGateway + ? 
openai('openai/gpt-4o', { + baseURL: process.env.REDPANDA_AI_GATEWAY_URL, + apiKey: process.env.REDPANDA_AI_GATEWAY_TOKEN, + headers: { + 'rp-aigw-id': process.env.REDPANDA_AI_GATEWAY_ID, + }, + }) + : openai('gpt-4o'); +---- + +// PLACEHOLDER: Verify Vercel AI SDK syntax +-- +====== + +== Migration checklist + +Use this checklist to track your migration: + +*Prerequisites* + + * [ ] Gateway configured and tested + * [ ] Providers enabled + * [ ] Models enabled + * [ ] Gateway ID and endpoint URL obtained + +*Code Changes* + + * [ ] Environment variables added + * [ ] Feature flag implemented + * [ ] Client initialization updated + * [ ] Model name prefix added (vendor/model_id) + * [ ] Headers added (rp-aigw-id) + +*Testing* + + * [ ] Gateway connection test passes + * [ ] Test request visible in observability dashboard + * [ ] Integration tests pass with gateway + * [ ] End-to-end tests pass with gateway + +*Staged rollout* + + * [ ] Week 1: Internal testing (dev team only) + * [ ] Week 2: 10% production traffic + * [ ] Week 3: 50% production traffic + * [ ] Week 4: 100% production traffic + +*Monitoring* + + * [ ] Success rate comparison (direct vs gateway) + * [ ] Latency comparison (direct vs gateway) + * [ ] Error rate comparison (direct vs gateway) + * [ ] Cost comparison (direct vs gateway) + +*Cleanup* (optional, after 30 days stable) + + * [ ] Remove direct provider credentials + * [ ] Remove feature flag logic + * [ ] Update documentation + * [ ] Archive direct integration code + +== Common migration issues + +=== Issue: "Model not found" error + +Symptom: +[source,text] +---- +Error: Model 'openai/gpt-4o' not found +---- + + +Causes: + +1. Model not enabled in gateway configuration +2. Wrong model name format (missing vendor prefix) +3. Typo in model name + +Solution: + +1. Verify model is enabled: // PLACEHOLDER: UI path or CLI command +2. Confirm format: `vendor/model_id` (for example, `openai/gpt-4o`, not `gpt-4o`) +3. Check supported models: // PLACEHOLDER: link to model catalog + +=== Issue: Missing `rp-aigw-id` header + +Symptom: + +[source,text] +---- +Error: Missing required header 'rp-aigw-id' +---- + + +Solution: + +[source,python] +---- +# Ensure header is set in default_headers +client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")} # ← Required +) +---- + + +=== Issue: Higher latency than expected + +Expected gateway overhead: // PLACEHOLDER: Xms p50, Yms p99 + +If latency is significantly higher: + +1. Check geographic routing (gateway → provider region) +2. Verify provider pool configuration (no unnecessary fallbacks) +3. Review CEL routing complexity +4. Check for rate limiting (adds retry latency) + +Solution: See [Performance Optimization Guide](// PLACEHOLDER: link) + +=== Issue: Requests not appearing in dashboard + +Causes: + +1. Wrong gateway ID +2. Request failed before reaching gateway +3. 
UI delay (logs may take // PLACEHOLDER: Xs to appear) + +Solution: See [End-to-End Validation Guide](// PLACEHOLDER: link) + +=== Issue: Different response format + +Symptom: Response structure differs between direct and gateway + +// PLACEHOLDER: Confirm if response format is identical to OpenAI API or if there are differences + +Solution: + +* AI Gateway should return OpenAI-compatible responses +* If differences exist, file a support ticket with request ID from logs + +== Advanced migration scenarios + +=== Scenario: Custom request timeouts + +Before + +[source,python] +---- +client = OpenAI(api_key=..., timeout=30.0) +---- + + +After + +[source,python] +---- +client = OpenAI( + base_url=os.getenv("REDPANDA_AI_GATEWAY_URL"), + api_key=os.getenv("REDPANDA_AI_GATEWAY_TOKEN"), + default_headers={"rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID")}, + timeout=30.0 # Still supported +) +---- + + +=== Scenario: Streaming responses + +// PLACEHOLDER: Verify streaming support + +Before + +[source,python] +---- +stream = client.chat.completions.create( + model="gpt-4o", + messages=[...], + stream=True +) + +for chunk in stream: + print(chunk.choices[0].delta.content, end="") +---- + + +After + +[source,python] +---- +stream = client.chat.completions.create( + model="openai/gpt-4o", # Add vendor prefix + messages=[...], + stream=True +) + +for chunk in stream: + print(chunk.choices[0].delta.content, end="") +---- + + +=== Scenario: Custom headers (for example, user tracking) + +Before + +[source,python] +---- +response = client.chat.completions.create( + model="gpt-4o", + messages=[...], + extra_headers={"X-User-ID": user.id} +) +---- + + +After + +[source,python] +---- +response = client.chat.completions.create( + model="openai/gpt-4o", + messages=[...], + extra_headers={ + "X-User-ID": user.id, # Custom headers still supported + "rp-aigw-id": os.getenv("REDPANDA_AI_GATEWAY_ID") # Required gateway header + } +) +---- + + +NOTE: Gateway may use custom headers for routing (for example, CEL expressions can reference `request.headers["X-User-ID"]`) + +== Post-migration benefits + +After successful migration, you gain: + +=== 1. Simplified provider management + +[source,python] +---- +# Switch providers with one config change (no code changes) +model = "anthropic/claude-sonnet-3.5" # Was openai/gpt-4o +---- + + +=== 2. Unified observability + +* All requests in one dashboard +* Cross-provider cost comparison +* Session reconstruction across models + +=== 3. Automatic failover + +* Configure once, benefit everywhere +* No application-level retry logic needed + +=== 4. Cost controls + +* Enforce budgets centrally +* Rate limit per team/customer +* No surprises in cloud bills + +=== 5. 
A/B testing + +* Test new models without code changes +* Compare quality/cost/latency +* Gradual rollout via routing policies + +== Next steps + +* Configure routing policies → [CEL Routing Guide](// PLACEHOLDER: link) +* Explore MCP → [MCP Aggregation Guide](// PLACEHOLDER: link) + +== Related pages + +* [Quickstart](// PLACEHOLDER: link) +* [OpenAI Integration](// PLACEHOLDER: link) +* [Anthropic Integration](// PLACEHOLDER: link) +* [LangChain Integration](// PLACEHOLDER: link) diff --git a/modules/ai-agents/pages/ai-gateway/observability-logs.adoc b/modules/ai-agents/pages/ai-gateway/observability-logs.adoc new file mode 100644 index 00000000..0203cb86 --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/observability-logs.adoc @@ -0,0 +1,775 @@ += Observability: Logs +:description: Guide to AI Gateway request logs, including where to find logs, log fields, filtering, searching, inspecting requests, common analysis tasks, log retention, export options, privacy/security, and troubleshooting. +:page-personas: platform_admin, app_developer + +AI Gateway logs every LLM request that passes through it, capturing the full request/response history, token usage, cost, latency, and routing decisions. This page explains how to find, filter, and interpret request logs. + +After reading this page, you will be able to: + +* Locate and filter request logs to debug specific failed requests or reconstruct user conversations. +* Interpret log fields (status codes, token usage, routing decisions) to diagnose performance and cost issues. +* Export logs for compliance auditing or long-term analysis. + +== Before you begin + +* You have an active AI Gateway with at least one request processed. +* You have access to the Redpanda Cloud Console. +* You have the appropriate permissions to view gateway logs. + +Use logs for: + +* Debugging specific failed requests +* Reconstructing user conversation sessions +* Auditing what prompts were sent and responses received +* Understanding which provider handled a request +* Investigating latency spikes or errors for specific users + +Use metrics for: Aggregate analytics, trends, cost tracking across time → See [Observability: Metrics](// PLACEHOLDER: link) + +== Where to find logs + +// PLACEHOLDER: Add exact UI navigation path + +1. Navigate to logs view: + * Console → AI Gateway → // PLACEHOLDER: exact path + * Or: Gateway detail page → Logs tab + +2. Select gateway: + * Filter by specific gateway, or view all gateways + * // PLACEHOLDER: screenshot of gateway selector + +3. 
Set time range: + * Default: Last 1 hour + * Options: Last 5 minutes, 1 hour, 24 hours, 7 days, 30 days, Custom + * // PLACEHOLDER: screenshot of time range picker + +== Request log fields + +Each log entry contains: + +=== Core request info + +[cols="1,2,2"] +|=== +| Field | Description | Example + +| *Request ID* +| Unique identifier for this request +| `req_abc123...` + +| *Timestamp* +| When request was received (UTC) +| `2025-01-11T14:32:10.123Z` + +| *Gateway ID* +| Which gateway handled this request +| `gw_abc123...` + +| *Gateway Name* +| Human-readable gateway name +| `production-gateway` + +| *Status* +| HTTP status code +| `200`, `400`, `429`, `500` + +| *Latency* +| Total request duration (ms) +| `1250ms` +|=== + +=== Model and provider info + +[cols="1,2,2"] +|=== +| Field | Description | Example + +| *Requested Model* +| Model specified in request +| `openai/gpt-4o` + +| *Actual Model* +| Model that handled request (may differ due to routing) +| `anthropic/claude-sonnet-3.5` + +| *Provider* +| Which provider handled the request +| `OpenAI`, `Anthropic` + +| *Provider Pool* +| Pool used (primary/fallback) +| `primary`, `fallback` + +| *Fallback Triggered* +| Whether fallback was used +| `true`/`false` + +| *Fallback Reason* +| Why fallback occurred +| `rate_limit`, `timeout`, `5xx_error` +|=== + +=== Token and cost info + +[cols="1,2,2"] +|=== +| Field | Description | Example + +| *Prompt Tokens* +| Input tokens consumed +| `523` + +| *Completion Tokens* +| Output tokens generated +| `187` + +| *Total Tokens* +| Prompt + completion +| `710` + +| *Estimated Cost* +| Calculated cost for this request +| `$0.0142` + +| *Cost Breakdown* +| Per-token costs +| `Prompt: $0.005, Completion: $0.0092` +|=== + +=== Request content (expandable) + +[cols="1,2,2"] +|=== +| Field | Description | Notes + +| *Request Headers* +| All headers sent +| Includes `rp-aigw-id`, custom headers + +| *Request Body* +| Full request payload +| Includes messages, parameters + +| *Response Headers* +| Headers returned +| // PLACEHOLDER: Any gateway-specific headers? + +| *Response Body* +| Full response payload +| Includes message content, metadata +|=== + +=== Routing and policy info + +[cols="1,2,2"] +|=== +| Field | Description | Example + +| *CEL Expression* +| Routing rule applied (if any) +| `request.headers["tier"] == "premium" ? ...` + +| *CEL Result* +| Model selected by CEL +| `openai/gpt-4o` + +| *Rate Limit Status* +| Whether rate limited +| `allowed`, `throttled`, `blocked` + +| *Spend Limit Status* +| Whether budget exceeded +| `allowed`, `blocked` + +| *Policy Stage* +| Where request was processed/blocked +| `rate_limit`, `routing`, `execution` +|=== + +=== Error info (if applicable) + +[cols="1,2,2"] +|=== +| Field | Description | Example + +| *Error Code* +| Gateway or provider error code +| `RATE_LIMIT_EXCEEDED`, `MODEL_NOT_FOUND` + +| *Error Message* +| Human-readable error +| `Request rate limit exceeded for gateway` + +| *Provider Error* +| Upstream provider error +| `OpenAI API returned 429: Rate limit exceeded` +|=== + +== Filter logs + +=== By gateway + +// PLACEHOLDER: Screenshot of gateway filter dropdown + +[source,text] +---- +Filter: Gateway = "production-gateway" +---- + + +Shows only requests for the selected gateway. + +Use case: Isolate production traffic from staging + +=== By model + +// PLACEHOLDER: Screenshot of model filter + +[source,text] +---- +Filter: Model = "openai/gpt-4o" +---- + + +Shows only requests for specific model. 
+ +Use case: Compare quality/cost between models + +=== By provider + +[source,text] +---- +Filter: Provider = "OpenAI" +---- + + +Shows only requests handled by specific provider. + +Use case: Investigate provider-specific issues + +=== By status + +[source,text] +---- +Filter: Status = "429" +---- + + +Shows only requests with specific HTTP status. + +Common filters: + +* `200`: Successful requests +* `400`: Bad requests (client errors) +* `401`: Authentication errors +* `429`: Rate limited requests +* `500`: Server errors +* `5xx`: All server errors + +Use case: Find all failed requests + +=== By time range + +[source,text] +---- +Filter: Timestamp >= "2025-01-11T14:00:00Z" AND Timestamp <= "2025-01-11T15:00:00Z" +---- + + +Use case: Investigate incident during specific time window + +=== By custom header + +[source,text] +---- +Filter: request.headers["x-user-id"] = "user_123" +---- + + +Shows only requests for specific user. + +Use case: Debug user-reported issue + +=== By token range + +[source,text] +---- +Filter: Total Tokens > 10000 +---- + + +Shows only high-token requests. + +Use case: Find expensive requests + +=== By latency + +[source,text] +---- +Filter: Latency > 5000ms +---- + + +Shows only slow requests. + +Use case: Investigate performance issues + +=== Combined filters + +[source,text] +---- +Gateway = "production-gateway" +AND Status >= 500 +AND Timestamp >= "last 24 hours" +---- + + +Shows production server errors in last 24 hours. + +// PLACEHOLDER: Screenshot of multiple filters applied + +== Search logs + +=== Full-text search (if supported) + +// PLACEHOLDER: Confirm if full-text search is available + +[source,text] +---- +Search: "specific error message" +---- + + +Searches across all text fields (error messages, request/response content). + +=== Search by request content + +[source,text] +---- +Search in Request Body: "user's actual question" +---- + + +Find requests containing specific prompt text. + +Use case: "A user said the AI gave a wrong answer about X" → Search for "X" in prompts + +=== Search by response content + +[source,text] +---- +Search in Response Body: "specific AI response phrase" +---- + + +Find responses containing specific text. + +Use case: Find all requests where AI mentioned a competitor name + +== Inspect individual requests + +Click any log entry to expand full details. + +// PLACEHOLDER: Screenshot of expanded log entry + +=== Request details tab + +Shows: + +* Full request headers +* Full request body (formatted JSON) +* All parameters (temperature, max_tokens, etc.) +* Custom headers used for routing + +Example: + +[source,json] +---- +{ + "model": "openai/gpt-4o", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What is Redpanda?" + } + ], + "temperature": 0.7, + "max_tokens": 500 +} +---- + + +=== Response details tab + +Shows: + +* Full response headers +* Full response body (formatted JSON) +* Finish reason (`stop`, `length`, `content_filter`) +* Response metadata + +Example: + +[source,json] +---- +{ + "id": "chatcmpl-...", + "choices": [ + { + "message": { + "role": "assistant", + "content": "Redpanda is a streaming data platform..." 
+ }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 24, + "completion_tokens": 87, + "total_tokens": 111 + } +} +---- + + +=== Routing details tab + +Shows: + +* CEL expression evaluated (if any) +* CEL result (which model was selected) +* Provider pool used (primary/fallback) +* Fallback trigger reason (if applicable) +* Rate limit evaluation (allowed/blocked) +* Spend limit evaluation (allowed/blocked) + +Example: + +[source,yaml] +---- +CEL Expression: | + request.headers["x-user-tier"] == "premium" + ? "openai/gpt-4o" + : "openai/gpt-4o-mini" + +CEL Result: "openai/gpt-4o" + +Provider Pool: primary +Fallback Triggered: false + +Rate Limit: allowed (45/100 requests used) +Spend Limit: allowed ($1,234 / $50,000 budget used) +---- + + +=== Performance details tab + +Shows: + +* Total latency breakdown + * Gateway processing time: // PLACEHOLDER: Xms + * Provider API call time: // PLACEHOLDER: Xms + * Network time: // PLACEHOLDER: Xms +* Token generation rate (tokens/second) +* Time to first token (for streaming, if supported) + +Example: + +[source,text] +---- +Total Latency: 1,250ms +├─ Gateway Processing: 12ms +├─ Provider API Call: 1,215ms +└─ Network Overhead: 23ms + +Token Generation Rate: 71 tokens/second +---- + + +== Common log analysis tasks + +=== Task 1: "Why did this request fail?" + +1. Find the request: + + * Filter by timestamp (when user reported issue) + * Or search by request content + * Or filter by custom header (user ID) + +2. Check status: + + * `400` → Client error (bad request format, invalid parameters) + * `401` → Authentication issue + * `404` → Model not found + * `429` → Rate limited + * `500`/`5xx` → Provider or gateway error + +3. Check error message: + + * Gateway error: Issue with configuration, rate limits, etc. + * Provider error: Issue with upstream API (OpenAI, Anthropic, etc.) + +4. Check routing: + * Was fallback triggered? (May indicate primary provider issue) + * Was CEL rule applied correctly? + +Common causes: + +* Model not enabled in gateway +* Rate limit exceeded +* Monthly budget exceeded +* Invalid API key for provider +* Provider outage/rate limit +* Malformed request + +=== Task 2: "Reconstruct a user's conversation" + +1. *Filter by user*: ++ +[source,text] +---- +Filter: request.headers["x-user-id"] = "user_123" +---- + +2. *Sort by timestamp* (ascending) + +3. *Review conversation flow*: + + * Each request shows prompt + * Each response shows AI reply + * Reconstruct full conversation thread + +Use case: User says "the AI contradicted itself" → View full conversation history + +=== Task 3: "Why is latency high for this user?" + +1. *Find user's requests*: ++ +[source,text] +---- +Filter: request.headers["x-user-id"] = "user_123" +AND Latency > 3000ms +---- + +2. *Check Performance Details*: + + * Is gateway processing slow? (Likely CEL complexity) + * Is provider API slow? (Upstream latency) + * Is token generation rate normal? (Tokens/second) + +3. *Compare to other requests*: + + * Filter for same model + * Compare latency percentiles + * Identify if issue is user-specific or model-wide + +Common causes: + +* Complex CEL routing rules +* Provider performance degradation +* Large context windows (high token count) +* Network issues + +=== Task 4: "Which requests used the fallback provider?" + +1. *Filter by fallback*: ++ +[source,text] +---- +Filter: Fallback Triggered = true +---- + +2. 
*Group by Fallback Reason*: + + * Rate limit exceeded (primary provider throttled) + * Timeout (primary provider slow) + * 5xx error (primary provider error) + +3. *Analyze pattern*: + + * Is fallback happening frequently? (May indicate primary provider issue) + * Is fallback successful? (Check status of fallback requests) + +Use case: Verify failover is working as expected + +=== Task 5: "What did we spend on this customer today?" + +1. *Filter by customer*: ++ +[source,text] +---- +Filter: request.headers["x-customer-id"] = "customer_abc" +AND Timestamp >= "today" +---- + +2. *Sum estimated costs* (if UI supports): + + // PLACEHOLDER: Does UI have cost aggregation for filtered results? + * Total: $X.XX + * Breakdown by model + +3. *Export to CSV* (if supported): + + // PLACEHOLDER: Is CSV export available? + * For detailed billing analysis + +Use case: Chargeback/showback to customers + +== Log retention + +// PLACEHOLDER: Confirm log retention policy + +Retention period: // PLACEHOLDER: e.g., 30 days, 90 days, configurable + +After retention period: + +* Logs are deleted automatically +* Aggregate metrics retained longer (see [Metrics](// PLACEHOLDER: link)) + +Export logs (if needed for longer retention): + +// PLACEHOLDER: Is log export available? Via API? CSV? + +== Log export + +// PLACEHOLDER: Confirm export capabilities + +=== Export to CSV + +// PLACEHOLDER: Add UI path for export, or indicate not available + +1. Apply filters for desired logs +2. Click "Export to CSV" +3. Download includes all filtered logs with full fields + +=== Export via API + +// PLACEHOLDER: If API is available for log export + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/logs \ + -H "Authorization: Bearer ${REDPANDA_CLOUD_TOKEN}" \ + -G \ + --data-urlencode "gateway_id=gw_abc123" \ + --data-urlencode "start_time=2025-01-11T00:00:00Z" \ + --data-urlencode "end_time=2025-01-11T23:59:59Z" +---- + + +=== Integration with observability platforms + +// PLACEHOLDER: Are there integrations with external platforms? + +Supported integrations (if any): + +* OpenTelemetry export → Send logs to Jaeger, Datadog, New Relic +* CloudWatch Logs → For AWS deployments +* // PLACEHOLDER: Others? + +See [Observability Integrations](// PLACEHOLDER: link) for setup guides. + +== Privacy and security + +=== What is logged + +// PLACEHOLDER: Confirm what is logged by default + +AI Gateway logs by default: + +* Request headers (including custom headers) +* Request body (full prompt content) +* Response body (full AI response) +* Token usage, cost, latency +* Routing decisions, policy evaluations + +AI Gateway does not log (if applicable): + +* // PLACEHOLDER: Anything redacted? API keys? Specific headers? + +=== Redaction options + +// PLACEHOLDER: Are there options to redact PII or sensitive data? + +If redaction is supported: + +* Configure redaction rules for specific fields +* Mask PII (email addresses, phone numbers, etc.) +* Redact custom header values + +Example: + +[source,yaml] +---- +# PLACEHOLDER: Actual configuration format +redaction: + - field: request.headers.x-api-key + action: mask + - field: request.body.messages[].content + pattern: "\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b" # Email regex + action: replace + replacement: "[REDACTED_EMAIL]" +---- + + +=== Access control + +// PLACEHOLDER: Who can view logs? RBAC? 
+ +Permissions required: + +* View logs: // PLACEHOLDER: role/permission name +* Export logs: // PLACEHOLDER: role/permission name + +Audit trail: + +* Log access is audited (who viewed which logs, when) +* // PLACEHOLDER: Where to find audit trail? + +== Troubleshoot log issues + +=== Issue: "Logs not appearing for my request" + +Possible causes: + +1. Log ingestion delay (wait // PLACEHOLDER: Xs) +2. Wrong gateway ID filter +3. Request failed before reaching gateway (authentication error) +4. Time range filter too narrow + +Solution: + +1. Wait a moment and refresh +2. Remove all filters, search by timestamp +3. Check client-side error logs +4. Expand time range to "Last 1 hour" + +=== Issue: "Missing request/response content" + +Possible causes: + +1. Payload too large (// PLACEHOLDER: size limit?) +2. Redaction rules applied +3. // PLACEHOLDER: Other reasons? + +Solution: + +// PLACEHOLDER: How to retrieve full content if truncated? + +=== Issue: "Cost estimate incorrect" + +Possible causes: + +1. Cost estimate based on public pricing (may differ from your contract) +2. Provider changed pricing +3. // PLACEHOLDER: Other reasons? + +Note: Cost estimates are approximate. Use provider invoices for billing. + +== Next steps + +* Aggregate analytics → [Observability: Metrics](// PLACEHOLDER: link) \ No newline at end of file diff --git a/modules/ai-agents/pages/ai-gateway/observability-metrics.adoc b/modules/ai-agents/pages/ai-gateway/observability-metrics.adoc new file mode 100644 index 00000000..bd8ea95e --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/observability-metrics.adoc @@ -0,0 +1,872 @@ += Observability: Metrics and Analytics +:description: Guide to AI Gateway metrics and analytics, including where to find metrics, key metrics explained, dashboard views, filtering/grouping, alerting, exporting, common analysis tasks, retention, API access, best practices, and troubleshooting. +:page-personas: platform_admin + +AI Gateway provides aggregate metrics and analytics dashboards to help you understand usage patterns, costs, performance, and errors across all your LLM traffic. + +After reading this page, you will be able to: + +* Monitor aggregate metrics (request volume, token usage, estimated spend) to track usage patterns and budget adherence. +* Compare model and provider performance using latency, error rate, and cost metrics. +* Configure alerts for budget thresholds and performance degradation. + +== Before you begin + +* You have an active AI Gateway with at least one request processed. +* You have access to the Redpanda Cloud Console. +* You have the appropriate permissions to view gateway metrics. + +Use metrics for: + +* Cost tracking and budget management +* Usage trends over time +* Performance monitoring (latency, error rates) +* Capacity planning +* Model/provider comparison + +Use logs for: Debugging specific requests, viewing full prompts/responses. See [Observability: Logs](// PLACEHOLDER: link) + +== Where to find metrics + +// PLACEHOLDER: Add exact UI navigation path + +1. Navigate to analytics dashboard: + * Console → AI Gateway → // PLACEHOLDER: exact path + * Or: Gateway detail page → Analytics tab + +2. Select gateway (optional): + * View all gateways (org-wide metrics) + * Or filter to specific gateway + +3. 
Set time range: + * Default: Last 7 days + * Options: Last 24 hours, 7 days, 30 days, 90 days, Custom + * // PLACEHOLDER: screenshot of time range picker + +== Key metrics + +=== Request volume + +What it shows: Total number of requests over time + +// PLACEHOLDER: Screenshot of request volume graph + +Graph type: Time series line chart + +Filters: + +* By gateway +* By model +* By provider +* By status (success/error) + +Use cases: + +* Identify usage patterns (peak hours, days of week) +* Detect traffic spikes or drops +* Capacity planning + +Example insights: + +* "Traffic doubles every Monday morning at 9am" → Scale infrastructure +* "Staging gateway has more traffic than prod" → Investigate runaway testing + +=== Token usage + +What it shows: Prompt, completion, and total tokens consumed + +// PLACEHOLDER: Screenshot of token usage graph + +Graph type: Stacked area chart (prompt vs completion tokens) + +Metrics: + +* Total tokens +* Prompt tokens (input) +* Completion tokens (output) +* Tokens per request (average) + +Breakdowns: + +* By gateway +* By model +* By provider + +Use cases: + +* Understand cost drivers (prompt vs completion tokens) +* Identify verbose prompts or responses +* Optimize token usage + +Example insights: + +* "90% of tokens are completion tokens" → Responses are verbose, optimize max_tokens +* "Staging uses 10x more tokens than prod" → Investigate test suite + +=== Estimated spend + +What it shows: Calculated cost based on token usage and public pricing + +// PLACEHOLDER: Screenshot of cost tracking dashboard + +Graph type: Time series line chart with cost breakdown + +Metrics: + +* Total estimated spend +* Spend by model +* Spend by provider +* Spend by gateway +* Cost per 1K requests +* Cost per 1M tokens + +Breakdowns: + +* By gateway (for chargeback/showback) +* By model (for cost optimization) +* By provider (for negotiation leverage) +* By custom header (if configured, e.g., `x-customer-id`) + +Use cases: + +* Budget tracking ("Are we staying under $50K/month?") +* Cost attribution ("Which team spent the most?") +* Model comparison ("Is Claude cheaper than GPT-4 for our use case?") +* Forecasting ("At this rate, we'll spend $X next month") + +Important notes: + +* *Estimates based on public pricing* (may differ from your contract) +* *Not a substitute for provider invoices* (use for approximation only) +* Update frequency: // PLACEHOLDER: Real-time? Hourly? Daily? + +Example insights: + +* "Customer A accounts for 60% of spend" → Consider rate limits or tiered pricing +* "GPT-4o is 3x more expensive than Claude Sonnet for similar quality" → Optimize routing + +=== Latency + +What it shows: Request duration from gateway to provider and back + +// PLACEHOLDER: Screenshot of latency histogram + +Metrics: + +* p50 (median) latency +* p95 latency +* p99 latency +* Min/max latency +* Average latency + +Breakdowns: + +* By gateway +* By model +* By provider +* By token range (longer responses = higher latency) + +Use cases: + +* Identify slow models or providers +* Set SLO targets (e.g., "p95 < 2 seconds") +* Detect performance regressions + +Example insights: + +* "GPT-4o p99 latency spiked to 10 seconds yesterday" → Investigate provider issue +* "Claude Sonnet is 30% faster than GPT-4o for same prompts" → Optimize for latency + +Latency components (if available): + +// PLACEHOLDER: Does gateway show latency breakdown? 
+* Gateway processing time +* Provider API time +* Network time + +=== Error rate + +What it shows: Percentage of failed requests over time + +// PLACEHOLDER: Screenshot of error rate graph + +Metrics: + +* Total error rate (%) +* Errors by status code (400, 401, 429, 500, etc.) +* Errors by model +* Errors by provider + +Graph type: Time series line chart with error percentage + +Breakdowns: + +* By error type: + * Client errors (4xx) + * Rate limits (429) + * Server errors (5xx) + * Provider errors + * Gateway errors + +Use cases: + +* Detect provider outages +* Identify configuration issues (e.g., model not enabled) +* Monitor rate limit breaches + +Example insights: + +* "Error rate spiked to 15% at 2pm" → OpenAI outage, fallback to Anthropic worked +* "10% of requests fail with 'model not found'" → Model not enabled in gateway + +=== Success rate + +What it shows: Percentage of successful (200) requests over time + +Metric: `Success Rate = (Successful Requests / Total Requests) × 100` + +Target: Typically 99%+ for production workloads + +Use cases: + +* Monitor overall health +* Set up alerts (e.g., "Alert if success rate < 95%") + +=== Fallback rate + +What it shows: Percentage of requests that used fallback provider + +// PLACEHOLDER: Screenshot of fallback rate graph + +Metric: `Fallback Rate = (Fallback Requests / Total Requests) × 100` + +Breakdowns: + +* By fallback reason: + * Rate limit exceeded + * Timeout + * 5xx error + +Use cases: + +* Monitor primary provider reliability +* Verify fallback is working +* Identify when to renegotiate rate limits + +Example insights: + +* "Fallback rate increased to 20% yesterday" → OpenAI hit rate limits, time to increase quota +* "Zero fallbacks in 30 days" → Fallback config may not be working, or primary provider is very reliable + +== Dashboard views + +=== Overview dashboard + +Shows: High-level metrics across all gateways + +// PLACEHOLDER: Screenshot of overview dashboard + +Widgets: + +* Total requests (last 24h, 7d, 30d) +* Total spend (last 24h, 7d, 30d) +* Success rate (current) +* Average latency (current) +* Top 5 models by request volume +* Top 5 gateways by spend + +Use case: Executive view, health at a glance + +=== Gateway dashboard + +Shows: Metrics for a specific gateway + +// PLACEHOLDER: Screenshot of gateway dashboard + +Widgets: + +* Request volume (time series) +* Token usage (time series) +* Estimated spend (time series) +* Latency percentiles (histogram) +* Error rate (time series) +* Model breakdown (pie chart) +* Provider breakdown (pie chart) + +Use case: Team-specific monitoring, gateway optimization + +=== Model comparison dashboard + +Shows: Side-by-side comparison of models + +// PLACEHOLDER: Screenshot of model comparison + +Metrics per model: + +* Request count +* Total tokens +* Estimated cost +* Cost per 1K requests +* Average latency +* Error rate + +Use case: Evaluate whether to switch models (cost vs performance) + +Example: + +[cols="2,1,1,1,1"] +|=== +| Model | Requests | Avg Latency | Cost per 1K | Error Rate + +| openai/gpt-4o +| 10,000 +| 1.2s +| $5.00 +| 0.5% + +| anthropic/claude-sonnet-3.5 +| 5,000 +| 0.9s +| $3.50 +| 0.3% + +| openai/gpt-4o-mini +| 20,000 +| 0.7s +| $0.50 +| 1.0% +|=== + +Insight: Claude Sonnet is 25% faster and 30% cheaper than GPT-4o with better reliability + +=== Provider comparison dashboard + +Shows: Side-by-side comparison of providers + +Metrics per provider: + +* Request count +* Total spend +* Average latency +* Error rate +* Fallback trigger rate + +Use case: 
Evaluate provider reliability, negotiate contracts + +=== Cost breakdown dashboard + +Shows: Detailed cost analysis + +// PLACEHOLDER: Screenshot of cost breakdown + +Widgets: + +* Spend by gateway (stacked bar chart) +* Spend by model (pie chart) +* Spend by provider (pie chart) +* Spend by custom dimension (if configured, e.g., customer ID) +* Spend trend (time series with forecast) +* Budget utilization (progress bar: $X / $Y monthly limit) + +Use case: FinOps, budget management, chargeback/showback + +== Filter and group + +=== Filter by gateway + +[source,text] +---- +Filter: Gateway = "production-gateway" +---- + + +Shows metrics for specific gateway only. + +Use case: Isolate prod from staging metrics + +=== Filter by model + +[source,text] +---- +Filter: Model = "openai/gpt-4o" +---- + + +Shows metrics for specific model only. + +Use case: Evaluate model performance in isolation + +=== Filter by provider + +[source,text] +---- +Filter: Provider = "OpenAI" +---- + + +Shows metrics for specific provider only. + +Use case: Evaluate provider reliability + +=== Filter by status + +[source,text] +---- +Filter: Status = "200" // Only successful requests +Filter: Status >= "500" // Only server errors +---- + + +Use case: Focus on errors, or calculate success rate + +=== Filter by custom dimension + +// PLACEHOLDER: Confirm if custom dimensions are supported for filtering + +[source,text] +---- +Filter: request.headers["x-customer-id"] = "customer_abc" +---- + + +Shows metrics for specific customer. + +Use case: Customer-specific cost tracking for chargeback + +=== Group by dimension + +Common groupings: + +* Group by Gateway +* Group by Model +* Group by Provider +* Group by Status +* Group by Hour/Day/Week/Month (time aggregation) + +Example: "Show me spend grouped by model, for production gateway, over last 30 days" + +== Alerting + +// PLACEHOLDER: Confirm if alerting is supported + +If alerting is supported: + +=== Alert types + +Budget alerts: + +* Alert when spend exceeds X% of monthly budget +* Alert when spend grows Y% week-over-week + +Performance alerts: + +* Alert when error rate > X% +* Alert when p99 latency > Xms +* Alert when success rate < X% + +Usage alerts: + +* Alert when request volume drops (potential outage) +* Alert when fallback rate > X% (primary provider issue) + +=== Alert channels + +// PLACEHOLDER: Supported notification channels +* Email +* Slack +* PagerDuty +* Webhook +* // PLACEHOLDER: Others? + +=== Example alert configuration + +[source,yaml] +---- +# PLACEHOLDER: Actual alert configuration format +alerts: + - name: "High Error Rate" + condition: error_rate > 5% + duration: 5 minutes + channels: [slack, email] + + - name: "Budget Threshold" + condition: monthly_spend > 80% of budget + channels: [email] + + - name: "Latency Spike" + condition: p99_latency > 5000ms + duration: 10 minutes + channels: [pagerduty] +---- + + +See [Alerting Guide](// PLACEHOLDER: link) for detailed setup. + +== Export metrics + +// PLACEHOLDER: Confirm export capabilities + +=== Export to CSV + +1. Apply filters for desired metrics +2. Click "Export to CSV" +3. 
Download includes time series data + +Use case: Import into spreadsheet for analysis, reporting + +=== Export via API + +// PLACEHOLDER: If API is available for metrics + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/metrics \ + -H "Authorization: Bearer ${REDPANDA_CLOUD_TOKEN}" \ + -G \ + --data-urlencode "gateway_id=gw_abc123" \ + --data-urlencode "start_time=2025-01-01T00:00:00Z" \ + --data-urlencode "end_time=2025-01-31T23:59:59Z" \ + --data-urlencode "metric=requests,tokens,cost" +---- + + +Response: + +[source,json] +---- +{ + "gateway_id": "gw_abc123", + "start_time": "2025-01-01T00:00:00Z", + "end_time": "2025-01-31T23:59:59Z", + "metrics": { + "requests": 1000000, + "tokens": 500000000, + "estimated_cost": 2500.00 + } +} +---- + + +=== Integration with observability platforms + +// PLACEHOLDER: OpenTelemetry support? Other integrations? + +Supported integrations (if any): + +* Prometheus: Metrics endpoint for scraping +* OpenTelemetry: Export metrics to OTel collector +* Datadog: Direct integration +* Grafana: Pre-built dashboards +* // PLACEHOLDER: Others? + +See [Observability Integrations](// PLACEHOLDER: link) for setup guides. + +== Common analysis tasks + +=== Task 1: "Are we staying within budget?" + +1. View cost breakdown dashboard +2. Check budget utilization widget: + * Current spend: $X + * Monthly budget: $Y + * Utilization: X% + * Days remaining in month: Z +3. Forecast: + * At current rate: $X × (30 / days_elapsed) + * On track to exceed budget? Yes/No + +Action: + +* If approaching limit: Adjust rate limits, optimize models, pause non-prod usage +* If well under budget: Opportunity to test more expensive models + +=== Task 2: "Which team is using the most resources?" + +1. Filter by gateway (assuming one gateway per team) +2. *Sort by Spend* (descending) +3. View table: + +[cols="2,1,1,1,1"] +|=== +| Gateway | Requests | Tokens | Spend | % of Total + +| team-ml +| 500K +| 250M +| $1,250 +| 50% + +| team-product +| 300K +| 150M +| $750 +| 30% + +| team-eng +| 200K +| 100M +| $500 +| 20% +|=== + +Action: Chargeback costs to teams, or investigate high-usage teams + +=== Task 3: "Is this model worth the extra cost?" + +1. *Open Model Comparison Dashboard* +2. Select models to compare: + * Expensive model: `openai/gpt-4o` + * Cheap model: `openai/gpt-4o-mini` +3. Compare metrics: + +[cols="2,1,1,2"] +|=== +| Metric | GPT-4o | GPT-4o-mini | Difference + +| Cost per 1K requests +| $5.00 +| $0.50 +| *10x* + +| Avg Latency +| 1.2s +| 0.7s +| 58% *faster* (mini) + +| Error Rate +| 0.5% +| 1.0% +| 2x errors (mini) +|=== + +Decision: If mini's error rate is acceptable, save 10x on costs + +=== Task 4: "Why did costs spike yesterday?" + +1. View cost trend graph +2. Identify spike (e.g., Jan 10th: $500 vs usual $100) +3. Drill down: + * By gateway: Which gateway caused the spike? + * By model: Did someone switch to expensive model? + * By hour: What time did spike occur? +4. Cross-reference with logs: + * Filter logs to spike timeframe + * Check for unusual request patterns + * Identify custom header (user ID, customer ID) if present + +Common causes: + +* Test suite running against prod gateway +* A/B test routing all traffic to expensive model +* User error (wrong model in config) +* Runaway loop in application code + +=== Task 5: "Is provider X more reliable than provider Y?" + +1. Open provider comparison dashboard +2. 
Compare error rates: + +[cols="2,1,1,2"] +|=== +| Provider | Requests | Error Rate | Fallback Triggers + +| OpenAI +| 500K +| 0.8% +| 50 (rate limits) + +| Anthropic +| 300K +| 0.3% +| 5 (timeouts) +|=== + +Insight: Anthropic has 62% lower error rate + +3. Compare latencies: + +[cols="2,1,1"] +|=== +| Provider | p50 Latency | p99 Latency + +| OpenAI +| 1.0s +| 3.5s + +| Anthropic +| 0.8s +| 2.5s +|=== + +Insight: Anthropic is 20% faster at p50, 28% faster at p99 + +Decision: Prioritize Anthropic in routing pools + +== Metrics retention + +// PLACEHOLDER: Confirm metrics retention policy + +Retention period: + +* *High-resolution* (1-minute granularity): // PLACEHOLDER: for example, 7 days +* *Medium-resolution* (1-hour granularity): // PLACEHOLDER: for example, 30 days +* *Low-resolution* (1-day granularity): // PLACEHOLDER: for example, 1 year + +Note: Aggregate metrics retained longer than individual request logs + +== API access to metrics + +// PLACEHOLDER: Document metrics API if available + +=== List available metrics + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/metrics/list \ + -H "Authorization: Bearer ${REDPANDA_CLOUD_TOKEN}" +---- + + +Response: + +[source,json] +---- +{ + "metrics": [ + "requests", + "tokens.prompt", + "tokens.completion", + "tokens.total", + "cost.estimated", + "latency.p50", + "latency.p95", + "latency.p99", + "errors.rate", + "success.rate", + "fallback.rate" + ] +} +---- + + +=== Query specific metric + +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/api/ai-gateway/metrics/query \ + -H "Authorization: Bearer ${REDPANDA_CLOUD_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "metric": "requests", + "gateway_id": "gw_abc123", + "start_time": "2025-01-01T00:00:00Z", + "end_time": "2025-01-31T23:59:59Z", + "granularity": "1d", + "group_by": ["model"] + }' +---- + + +Response: + +[source,json] +---- +{ + "metric": "requests", + "granularity": "1d", + "data": [ + { + "timestamp": "2025-01-01T00:00:00Z", + "model": "openai/gpt-4o", + "value": 10000 + }, + { + "timestamp": "2025-01-01T00:00:00Z", + "model": "anthropic/claude-sonnet-3.5", + "value": 5000 + }, + ... + ] +} +---- + + +== Best practices + +Set up budget alerts early + +* Don't wait for surprise bills +* Alert at 50%, 80%, 90% of budget +* Include multiple stakeholders (eng, finance) + +Create team dashboards + +* One dashboard per team showing their gateway(s) +* Empowers teams to self-optimize +* Reduces central ops burden + +Monitor fallback rate + +* Low fallback rate (0-5%): Normal, failover working +* High fallback rate (>20%): Investigate primary provider issues +* Zero fallback rate: Verify fallback config is correct + +Compare models regularly + +* Run A/B tests with metrics +* Reassess as pricing and models change +* Don't assume expensive = better quality for your use case + +Track trends, not point-in-time + +* Day-to-day variance is normal +* Look for week-over-week and month-over-month trends +* Seasonal patterns (e.g., more usage on weekdays) + +== Troubleshoot metrics issues + +=== Issue: "Metrics don't match my provider invoice" + +Possible causes: + +1. Metrics are estimates based on public pricing +2. Your contract has custom pricing +3. Provider changed pricing mid-month + +Solution: + +* Use metrics for trends and optimization decisions +* Use provider invoices for actual billing +* // PLACEHOLDER: Can users configure custom pricing in gateway? + +=== Issue: "Metrics are delayed or missing" + +Possible causes: + +1. 
Metrics aggregation has delay (// PLACEHOLDER: typical delay?) +2. Time range outside retention period +3. No requests in selected time range (empty data) + +Solution: + +1. Wait and refresh (// PLACEHOLDER: Xminutes typical delay) +2. Check retention policy +3. Verify requests were sent (check logs) + +=== Issue: "Dashboard shows 'no data'" + +Possible causes: + +1. Filters too restrictive (no matching requests) +2. Gateway has no traffic yet +3. Permissions issue (can't access this gateway's metrics) + +Solution: + +1. Remove filters, widen time range +2. Send test request (see [Quickstart](// PLACEHOLDER: link)) +3. Check permissions with admin + +== Next steps + +* View individual requests → [Observability: Logs](// PLACEHOLDER: link) diff --git a/modules/ai-agents/pages/ai-gateway/quickstart-enhanced.adoc b/modules/ai-agents/pages/ai-gateway/quickstart-enhanced.adoc new file mode 100644 index 00000000..1123acbc --- /dev/null +++ b/modules/ai-agents/pages/ai-gateway/quickstart-enhanced.adoc @@ -0,0 +1,491 @@ += DRAFT Quickstart enhanced +:description: Get started with AI Gateway by routing your first request, viewing observability data, testing failover and CEL routing. +:page-personas: app_developer + +Get your first request routed through Redpanda AI Gateway. + +After completing this quickstart, you will be able to: + +* Route your first LLM request through AI Gateway using the Cloud UI and verify it in the observability dashboard. +* Configure a provider and gateway with correct authentication and routing policies. +* Test failover behavior and CEL routing rules in a development environment. + +== Prerequisites + +Before starting, ensure you have: + +* Redpanda Cloud account with BYOC +* Admin access to configure providers and gateways +* API keys for at least one LLM provider (OpenAI, Anthropic, etc.) +* Python 3.8+ or Node.js 18+ (for examples) + +== Step 1: Configure a provider + +Providers must be configured before they can be used in gateways. + +// PLACEHOLDER: Add UI navigation path, e.g., "Console → AI Gateway → Providers → Add Provider" + +1. Navigate to *Providers*: + * Open Redpanda Cloud Console + * Go to // PLACEHOLDER: exact menu path + +2. Add provider: + ``` + Provider: OpenAI + API Key: sk-... + Enabled Models: gpt-4o, gpt-4o-mini + ``` + + // PLACEHOLDER: Add screenshot of provider configuration form + +3. Verify: + + * Provider status shows "Active" + * Models appear in model catalog + +Alternative: CLI (if available) + +[source,bash] +---- +# PLACEHOLDER: CLI command for adding provider +rpk cloud ai-gateway provider create \ + --provider openai \ + --api-key sk-... \ + --models gpt-4o,gpt-4o-mini +---- + + +Supported providers: + +// PLACEHOLDER: List currently supported providers +* OpenAI +* Anthropic +* // PLACEHOLDER: Others? + +See link:// PLACEHOLDER: link[Admin Guide: Providers] for detailed configuration options. + +== Step 2: Create a gateway + +Gateways define routing policies, rate limits, and observability scope. + +// PLACEHOLDER: Add UI navigation path + +1. Navigate to *Gateways*: + * Go to // PLACEHOLDER: exact menu path + +2. Create gateway: + + ``` + Name: my-first-gateway + Workspace: default + Description: Quickstart gateway for testing + ``` + + // PLACEHOLDER: Add screenshot of gateway creation form + +3. Save gateway ID: + + After creation, copy your gateway ID (required for requests): + ``` + Gateway ID: gw_abc123... 
+ Gateway Endpoint: https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1 + ``` + + // PLACEHOLDER: Confirm exact endpoint format + +Recommended gateway patterns: + +* One gateway per environment (staging, production) +* One gateway per team (for budget isolation) +* One gateway per customer (for multi-tenant SaaS) + +See link:// PLACEHOLDER: link[Gateway Creation Guide] for best practices. + +== Step 3: Send your first request + +Route a request through your gateway. + +[tabs] +==== +Python:: ++ +-- +[source,python] +---- +from openai import OpenAI +import os + +# Configure client to use AI Gateway +client = OpenAI( + base_url="https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1", # Gateway endpoint + api_key=os.getenv("REDPANDA_CLOUD_TOKEN"), # Your Redpanda Cloud token + default_headers={ + "rp-aigw-id": "gw_abc123..." # Your gateway ID from Step 2 + } +) + +# Make a request (note the vendor/model_id format) +response = client.chat.completions.create( + model="openai/gpt-4o-mini", # Format: {provider}/{model} + messages=[ + {"role": "user", "content": "Say 'Hello from AI Gateway!'"} + ], + max_tokens=20 +) + +print(response.choices[0].message.content) +# Output: Hello from AI Gateway! +---- +-- + +TypeScript/JavaScript:: ++ +-- +[source,typescript] +---- +import OpenAI from 'openai'; + +const client = new OpenAI({ + baseURL: 'https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1', + apiKey: process.env.REDPANDA_CLOUD_TOKEN, + defaultHeaders: { + 'rp-aigw-id': 'gw_abc123...' + } +}); + +const response = await client.chat.completions.create({ + model: 'openai/gpt-4o-mini', + messages: [ + { role: 'user', content: 'Say "Hello from AI Gateway!"' } + ], + max_tokens: 20 +}); + +console.log(response.choices[0].message.content); +// Output: Hello from AI Gateway! +---- +-- + +cURL:: ++ +-- +[source,bash] +---- +curl https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${REDPANDA_CLOUD_TOKEN}" \ + -H "rp-aigw-id: gw_abc123..." \ + -d '{ + "model": "openai/gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Say \"Hello from AI Gateway!\""} + ], + "max_tokens": 20 + }' +---- + +Expected response: + +[source,json] +---- +{ + "id": "chatcmpl-...", + "object": "chat.completion", + "created": 1704844800, + "model": "openai/gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Hello from AI Gateway!" + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 8, + "completion_tokens": 5, + "total_tokens": 13 + } +} +---- +-- +==== + + +Troubleshooting: + +* `401 Unauthorized` → Check `REDPANDA_CLOUD_TOKEN` +* `404 Not Found` → Verify `base_url` is correct +* `Model not found` → Ensure model is enabled in Step 1 +* `Missing rp-aigw-id` → Verify header is set + +See link:// PLACEHOLDER: link[Troubleshooting Guide] for more help. + +== Step 4: Verify in observability dashboard + +Confirm your request appears in the AI Gateway dashboard. + +// PLACEHOLDER: Add UI navigation path and screenshots + +1. *Navigate to Logs*: + * Go to // PLACEHOLDER: Console → AI Gateway → {Gateway Name} → Logs + +2. *Find your request*: + * Filter by Gateway: `my-first-gateway` + * Filter by Model: `openai/gpt-4o-mini` + * Time range: Last 5 minutes + +3. *Verify fields*: + * Model: `openai/gpt-4o-mini` + * Provider: OpenAI + * Status: 200 + * Prompt tokens: ~8 + * Completion tokens: ~5 + * Estimated cost: // PLACEHOLDER: $X.XXXX + * Latency: // PLACEHOLDER: ~XXXms + +4. 
*Click the request to expand it*:
+   * View the full prompt and response
+   * See request headers
+   * Check the routing decision (which provider pool was used)
+
+If the request doesn't appear:
+
+* Wait // PLACEHOLDER: Xs (logs may be delayed)
+* Check that the gateway ID matches
+* Verify the request succeeded (no error in the client)
+* See link:// PLACEHOLDER: link[End-to-End Validation Guide]
+
+== Next steps: Add failover (optional)
+
+Add automatic failover to a backup provider for reliability.
+
+=== Step 5: Add a second provider
+
+Add Anthropic as a fallback option:
+
+// PLACEHOLDER: Add UI path
+
+1. *Navigate to Providers* → *Add Provider*:
+
+   ```
+   Provider: Anthropic
+   API Key: sk-ant-...
+   Enabled Models: claude-sonnet-3.5
+   ```
+
+2. *Verify*:
+
+   * Anthropic provider status: Active
+   * Models appear in the catalog
+
+=== Step 6: Configure a provider pool with fallback
+
+Update your gateway to use OpenAI as the primary provider and Anthropic as the fallback.
+
+// PLACEHOLDER: Add UI path and configuration format
+
+1. *Navigate to Gateway Settings*:
+
+   * Go to // PLACEHOLDER: AI Gateway → {Gateway Name} → Routing
+
+2. *Configure the provider pool*:
+
+   ```yaml
+   # PLACEHOLDER: Confirm actual configuration format
+   routing:
+     primary_pool:
+       - provider: openai
+         models: [gpt-4o, gpt-4o-mini]
+     fallback_pool:
+       - provider: anthropic
+         models: [claude-sonnet-3.5]
+
+     fallback_triggers:
+       - rate_limit_exceeded
+       - timeout
+       - 5xx_errors
+   ```
+
+   // PLACEHOLDER: Add screenshot of routing configuration
+
+3. *Save the configuration*
+
+=== Step 7: Test failover
+
+Simulate a provider failure to see fallback in action.
+
+// PLACEHOLDER: Add method to test failover, or skip if not easily testable
+
+*Option A: Disable the primary provider temporarily*
+
+1. Disable the OpenAI provider in settings
+2. Send a request with the `openai/gpt-4o` model
+3. The gateway should automatically route to the Anthropic fallback
+4. Check the logs to confirm the fallback was used
+
+*Option B: Trigger a rate limit*
+
+1. Send many requests rapidly to hit the rate limit
+2. The gateway should fall back to Anthropic
+3. Check the logs for the "fallback_triggered" indicator
+
+Verify fallback:
+
+[source,python]
+----
+response = client.chat.completions.create(
+    model="openai/gpt-4o",  # Request an OpenAI model
+    messages=[{"role": "user", "content": "Test fallback"}]
+)
+
+# Check which provider actually handled it
+# PLACEHOLDER: How to verify this - response header? Log metadata?
+----
+
+Check the dashboard:
+
+* The request should show:
+  * Requested model: `openai/gpt-4o`
+  * Actual provider: Anthropic (fallback)
+  * Fallback reason: // PLACEHOLDER: rate_limit / timeout / error
+
+== Next steps: Add a routing rule (optional)
+
+Use CEL expressions to route requests based on headers or content.
+
+=== Step 8: Create a CEL routing rule
+
+Route premium users to better models automatically.
+
+// PLACEHOLDER: Add UI path for CEL configuration
+
+1. Navigate to *Gateway Settings*:
+
+   * Go to // PLACEHOLDER: AI Gateway → {Gateway Name} → Routing Rules
+
+2. Add the CEL rule:
+
+   ```cel
+   // Route based on the user tier header
+   request.headers["x-user-tier"] == "premium"
+     ? "openai/gpt-4o"
+     : "openai/gpt-4o-mini"
+   ```
+
+   // PLACEHOLDER: Add screenshot of CEL editor with syntax highlighting
+
+3. Test the rule (if the UI supports testing):
+
+   * Input test header: `x-user-tier: premium`
+   * Verify output: `openai/gpt-4o`
+   * Input test header: `x-user-tier: free`
+   * Verify output: `openai/gpt-4o-mini`
+
+4. Save the rule
+
+=== Step 9: Test the routing rule
+
+Send requests with different headers and verify the routing decision.
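+
+If you prefer to test from a terminal, the cURL sketch below mirrors the Python examples that follow. It reuses the endpoint, token, and gateway ID placeholders from Steps 2 and 3 and adds the `x-user-tier` header that the CEL rule from Step 8 evaluates. The `"model": "auto"` value is the same placeholder used in the Python examples; confirm how CEL-selected models are requested in your deployment.
+
+[source,bash]
+----
+# Premium-tier request (sketch: endpoint, token, and gateway ID are the
+# placeholders from Steps 2-3; "auto" is a stand-in for CEL-driven model selection)
+curl https://{CLUSTER_ID}.cloud.redpanda.com/ai-gateway/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer ${REDPANDA_CLOUD_TOKEN}" \
+  -H "rp-aigw-id: gw_abc123..." \
+  -H "x-user-tier: premium" \
+  -d '{
+    "model": "auto",
+    "messages": [{"role": "user", "content": "Hello"}],
+    "max_tokens": 20
+  }'
+
+# Repeat with "x-user-tier: free" and compare the model recorded in the logs.
+----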
+ +*Premium user request*: + +[source,python] +---- +response = client.chat.completions.create( + model="auto", # PLACEHOLDER: or how to trigger CEL routing + messages=[{"role": "user", "content": "Hello"}], + extra_headers={"x-user-tier": "premium"} +) + +# Should route to gpt-4o (premium model) +---- + + +*Free user request*: + +[source,python] +---- +response = client.chat.completions.create( + model="auto", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={"x-user-tier": "free"} +) + +# Should route to gpt-4o-mini (cost-effective model) +---- + + +*Verify in dashboard*: + +* Check request logs +* Confirm correct model was selected based on header +* View routing decision explanation + +== What's next? + +=== Immediate next steps + +1. *Set rate limits* → link:// PLACEHOLDER: link[Rate Limiting Guide] + * Protect against runaway costs + * Prevent abuse + +2. *Add spend limits* → link:// PLACEHOLDER: link[Budget Controls Guide] + * Set monthly budgets per gateway + * Get alerts before limits are hit + +3. *Configure MCP aggregation* → link:// PLACEHOLDER: link[MCP Guide] + * Give agents access to tools + * Reduce token costs with deferred loading + +=== Explore advanced features + +* *A/B testing models* → link:// PLACEHOLDER: link[A/B Testing Guide] +* *Multi-tenancy patterns* → link:// PLACEHOLDER: link[Multi-Tenancy Guide] +* *Cost optimization* → link:// PLACEHOLDER: link[Cost Optimization Guide] +* *Performance tuning* → link:// PLACEHOLDER: link[Performance Guide] + +=== Integration guides + +* link:// PLACEHOLDER: link[OpenAI SDK Integration] +* link:// PLACEHOLDER: link[Anthropic SDK Integration] +* link:// PLACEHOLDER: link[LangChain Integration] +* link:// PLACEHOLDER: link[LlamaIndex Integration] +* link:// PLACEHOLDER: link[Claude Code CLI] +* link:// PLACEHOLDER: link[VS Code Extension] +* link:// PLACEHOLDER: link[Cursor IDE] + +=== Migrate existing applications + +* link:// PLACEHOLDER: link[Migration Guide: From Direct Integration to Gateway] + +== Common questions + +*Q: How do I switch between providers without code changes?* +A: Change the model string in your gateway routing rules. No code deployment needed. + +*Q: How much latency does the gateway add?* +A: Typically // PLACEHOLDER: Xms overhead. See link:// PLACEHOLDER: link[Performance Benchmarks]. + +*Q: Can I use the same gateway for multiple applications?* +A: Yes, but we recommend separate gateways per environment or team for better cost tracking. + +*Q: How do I attribute costs to specific customers?* +A: Use CEL routing with custom headers, then filter logs by header value. See link:// PLACEHOLDER: link[Cost Attribution Guide]. + +*Q: Does the gateway work with streaming responses?* +A: // PLACEHOLDER: Yes/No, with any limitations + +*Q: What happens if the gateway goes down?* +A: // PLACEHOLDER: Describe high availability setup, or recommend keeping fallback to direct integration + + +== Related pages + +* xref:ai-agents:ai-gateway/ai-gateway-overview.adoc[] +* xref:ai-agents:ai-gateway/cel-routing-cookbook.adoc[] +* xref:ai-agents:ai-gateway/observability-logs.adoc[] diff --git a/modules/ai-agents/pages/index.adoc b/modules/ai-agents/pages/index.adoc index 9ac867a9..591ad65c 100644 --- a/modules/ai-agents/pages/index.adoc +++ b/modules/ai-agents/pages/index.adoc @@ -1,8 +1,4 @@ -= AI Agents in Redpanda Cloud -:description: Learn about AI agents and the tools Redpanda Cloud provides for building them. 
+= Agentic AI +:description: Learn about the Redpanda Agentic Data Plane, including the AI Gateway, AI agents, and MCP servers. :page-layout: index :page-aliases: develop:agents/about.adoc, develop:ai-agents/about.adoc - -AI agents are configurable assistants that autonomously perform specialist tasks by leveraging large language models (LLMs) and connecting to external data sources and tools. - -Redpanda Cloud provides two complementary Model Context Protocol (MCP) options to help you build AI agents. diff --git a/modules/shared/partials/ai-gateway.png b/modules/shared/partials/ai-gateway.png new file mode 100644 index 00000000..0754146d Binary files /dev/null and b/modules/shared/partials/ai-gateway.png differ