diff --git a/.env.example b/.env.example index f9ce306a7..d11ecefce 100644 --- a/.env.example +++ b/.env.example @@ -209,6 +209,55 @@ OAUTH_MAX_RETRIES=3 # tokens are stored per-user to prevent cross-user token access. # Users must individually authorize each OAuth-protected gateway. +##################################### +# OAuth Dynamic Client Registration (DCR) and PKCE +##################################### + +# Enable Dynamic Client Registration (RFC 7591) +# When enabled, MCP Gateway can automatically register as an OAuth client with Authorization Servers +# that support DCR, eliminating the need for manual client credential configuration. +MCPGATEWAY_DCR_ENABLED=true + +# Auto-register when gateway has issuer but no client_id +# When true, gateway automatically registers with the Authorization Server when configured +# with an issuer URL but no client credentials. +MCPGATEWAY_DCR_AUTO_REGISTER_ON_MISSING_CREDENTIALS=true + +# Default scopes to request during DCR +# Comma-separated list of OAuth scopes to request when auto-registering +MCPGATEWAY_DCR_DEFAULT_SCOPES=mcp:read + +# Optional allowlist of issuer URLs for DCR (empty = allow any) +# Comma-separated list of trusted Authorization Server issuer URLs +# Example: https://auth.example.com,https://auth2.example.com +# Leave empty to allow DCR with any issuer (not recommended for production) +MCPGATEWAY_DCR_ALLOWED_ISSUERS= + +# Token endpoint authentication method for DCR +# Options: client_secret_basic (default), client_secret_post, none +# - client_secret_basic: Send credentials via HTTP Basic Auth header +# - client_secret_post: Send credentials in POST body +# - none: Public client (no client secret, PKCE-only) +MCPGATEWAY_DCR_TOKEN_ENDPOINT_AUTH_METHOD=client_secret_basic + +# AS metadata cache TTL in seconds (RFC 8414 discovery) +# How long to cache Authorization Server metadata after discovery +MCPGATEWAY_DCR_METADATA_CACHE_TTL=3600 + +# Template for client_name in DCR requests +# {gateway_name} 
will be replaced with the actual gateway name +MCPGATEWAY_DCR_CLIENT_NAME_TEMPLATE=MCP Gateway ({gateway_name}) + +# Enable OAuth AS metadata discovery (RFC 8414) +# When enabled, gateway automatically discovers Authorization Server endpoints +# from the issuer URL using well-known metadata endpoints +MCPGATEWAY_OAUTH_DISCOVERY_ENABLED=true + +# Preferred PKCE code challenge method +# Options: S256 (SHA-256, recommended), plain (not recommended) +# PKCE (Proof Key for Code Exchange) is always enabled for Authorization Code flows +MCPGATEWAY_OAUTH_PREFERRED_CODE_CHALLENGE_METHOD=S256 + # ============================================================================== # SSO (Single Sign-On) Configuration # ============================================================================== @@ -322,6 +371,34 @@ MCPGATEWAY_A2A_MAX_RETRIES=3 # Enable A2A agent metrics collection (true/false) MCPGATEWAY_A2A_METRICS_ENABLED=true +##################################### +# MCP Server Catalog Configuration +##################################### + +# Enable MCP server catalog feature (NEW in v0.7.0) +# Allows defining a catalog of pre-configured MCP servers in a YAML file +# for easy discovery and management via the Admin UI +# Options: true (default), false +MCPGATEWAY_CATALOG_ENABLED=true + +# Path to the catalog configuration file +# YAML file containing MCP server definitions +# Default: mcp-catalog.yml +MCPGATEWAY_CATALOG_FILE=mcp-catalog.yml + +# Automatically health check catalog servers on startup and periodically +# Options: true (default), false +MCPGATEWAY_CATALOG_AUTO_HEALTH_CHECK=true + +# Catalog cache TTL in seconds +# How long to cache catalog data before refreshing +# Default: 3600 (1 hour) +MCPGATEWAY_CATALOG_CACHE_TTL=3600 + +# Number of catalog servers to display per page +# Default: 100 +MCPGATEWAY_CATALOG_PAGE_SIZE=100 + ##################################### # Header Passthrough Configuration ##################################### @@ -493,6 +570,20 @@ 
SSE_KEEPALIVE_INTERVAL=30 # false: Stateless mode (better for scaling) USE_STATEFUL_SESSIONS=false +#################################### +# Session Pooling (SSE/WS) +#################################### +# Enable session pooling for SSE/WS (default: false) +SESSION_POOLING_ENABLED=false +# Comma-separated list of server IDs for which pooling is enabled (overrides global) +SESSION_POOLING_SERVERS=[] +# Max idle time (seconds) before pooled session is evicted (default: 600) +SESSION_POOL_MAX_IDLE=600 +# Max pooled sessions per user (default: 10) +SESSION_POOL_USER_LIMIT=10 + + + # Enable JSON response format for streaming HTTP # Options: true (default), false # true: Return JSON responses, false: Return SSE stream @@ -513,7 +604,8 @@ FEDERATION_DISCOVERY=false FEDERATION_PEERS=[] # Timeout for federation requests in seconds -FEDERATION_TIMEOUT=30 +# Default: 120 seconds (matches config.py) +FEDERATION_TIMEOUT=120 # Interval between federation sync operations in seconds FEDERATION_SYNC_INTERVAL=300 @@ -544,9 +636,11 @@ PROMPT_RENDER_TIMEOUT=10 # Health Check Configuration HEALTH_CHECK_INTERVAL=60 -HEALTH_CHECK_TIMEOUT=15 +# Health check timeout in seconds (default: 10, matches config.py) +HEALTH_CHECK_TIMEOUT=10 UNHEALTHY_THRESHOLD=5 -GATEWAY_VALIDATION_TIMEOUT=10 +# Gateway URL validation timeout in seconds (default: 5, matches config.py) +GATEWAY_VALIDATION_TIMEOUT=5 # File lock name for gateway service leader election # Used to coordinate multiple gateway instances when running in cluster mode diff --git a/MANIFEST.in b/MANIFEST.in index 27b81c041..9812b408f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -169,7 +169,7 @@ exclude plugins/external/opa/.env.template exclude plugins/external/opa/.ruff.toml exclude plugins/external/opa/Containerfile exclude plugins/external/opa/MANIFEST.in -exclude plugins/external/opa/opaserver/rego/example.rego +exclude plugins/external/opa/opaserver/rego/policy.rego exclude plugins/external/opa/pyproject.toml exclude 
plugins/external/opa/run-server.sh @@ -179,6 +179,5 @@ exclude plugins/external/llmguard/.env.template exclude plugins/external/llmguard/.ruff.toml exclude plugins/external/llmguard/Containerfile exclude plugins/external/llmguard/MANIFEST.in -exclude plugins/external/llmguard/opaserver/rego/example.rego exclude plugins/external/llmguard/pyproject.toml exclude plugins/external/llmguard/run-server.sh diff --git a/README.md b/README.md index 3374a871e..cf9664202 100644 --- a/README.md +++ b/README.md @@ -379,7 +379,7 @@ python3 -m mcpgateway.translate \ # 2️⃣ Register it with the gateway curl -s -X POST -H "Authorization: Bearer $MCPGATEWAY_BEARER_TOKEN" \ -H "Content-Type: application/json" \ - -d '{"name":"fast_time","url":"http://localhost:9000/sse"}' \ + -d '{"name":"fast_time","url":"http://localhost:8003/sse"}' \ http://localhost:4444/gateways # 3️⃣ Verify tool catalog @@ -462,6 +462,7 @@ When using a MCP Client such as Claude with stdio: ## Quick Start - Containers Use the official OCI image from GHCR with **Docker** *or* **Podman**. +Please note: Currently, arm64 is not supported. If you are e.g. running on MacOS, install via PyPi. --- @@ -1255,20 +1256,35 @@ You can get started by copying the provided [.env.example](https://github.com/IB | ------------------------------ | ------------------------------------------------ | --------------------- | ------- | | `SSO_AUTO_ADMIN_DOMAINS` | Email domains that automatically get admin privileges | `[]` | JSON array | -### Dynamic Client Registration & Virtual MCP Server Authentication - -ContextForge supports OAuth2 with Dynamic Client Registration (DCR) -for streamable HTTP servers through integration with an upstream API gateway, -such as HyperMCP gateway, enabling automatic OAuth2 client provisioning for MCP servers -without manual configuration. 
- -| Setting | Description | Default | Options | -|-----------------------------|--------------------------------------------------------|---------|---------| -| `JWT_AUDIENCE_VERIFICATION` | JWT audience verification needs to be disabled for DCR | `true` | bool | - -You can find an example for using dynamic client registration (DCR) with [HyprMCP Gateway (`hyprmcp/mcp-gateway`)](https://github.com/hyprmcp/mcp-gateway). - -Follow the tutorial at https://ibm.github.io/mcp-context-forge/tutorials/dcr-hyprmcp/ to get started. +### OAuth 2.0 Dynamic Client Registration (DCR) & PKCE + +ContextForge implements **OAuth 2.0 Dynamic Client Registration (RFC 7591)** and **PKCE (RFC 7636)** for seamless integration with OAuth-protected MCP servers and upstream API gateways like HyperMCP. + +**Key Features:** +- ✅ Automatic client registration with Authorization Servers (no manual credential configuration) +- ✅ Authorization Server metadata discovery (RFC 8414) +- ✅ PKCE (Proof Key for Code Exchange) enabled for all Authorization Code flows +- ✅ Support for public clients (PKCE-only, no client secret) +- ✅ Encrypted credential storage with Fernet encryption +- ✅ Configurable issuer allowlist for security + +| Setting | Description | Default | Options | +|-------------------------------------------------------|----------------------------------------------------------------|--------------------------------|---------------| +| `MCPGATEWAY_DCR_ENABLED` | Enable Dynamic Client Registration (RFC 7591) | `true` | bool | +| `MCPGATEWAY_DCR_AUTO_REGISTER_ON_MISSING_CREDENTIALS` | Auto-register when gateway has issuer but no client_id | `true` | bool | +| `MCPGATEWAY_DCR_DEFAULT_SCOPES` | Default OAuth scopes to request during DCR | `mcp:read` | string | +| `MCPGATEWAY_DCR_ALLOWED_ISSUERS` | Allowlist of trusted issuer URLs (empty = allow any) | `[]` | JSON array | +| `MCPGATEWAY_DCR_TOKEN_ENDPOINT_AUTH_METHOD` | Token endpoint auth method | `client_secret_basic` | 
`client_secret_basic`, `client_secret_post`, `none` | +| `MCPGATEWAY_DCR_METADATA_CACHE_TTL` | AS metadata cache TTL in seconds | `3600` | int | +| `MCPGATEWAY_DCR_CLIENT_NAME_TEMPLATE` | Template for client_name in DCR requests | `MCP Gateway ({gateway_name})` | string | +| `MCPGATEWAY_OAUTH_DISCOVERY_ENABLED` | Enable AS metadata discovery (RFC 8414) | `true` | bool | +| `MCPGATEWAY_OAUTH_PREFERRED_CODE_CHALLENGE_METHOD` | PKCE code challenge method | `S256` | `S256`, `plain` | +| `JWT_AUDIENCE_VERIFICATION` | JWT audience verification (disable for DCR) | `true` | bool | + +**Documentation:** +- [DCR Configuration Guide](https://ibm.github.io/mcp-context-forge/manage/dcr/) - Complete DCR setup and troubleshooting +- [OAuth 2.0 Integration](https://ibm.github.io/mcp-context-forge/manage/oauth/) - OAuth configuration and PKCE details +- [HyperMCP Tutorial](https://ibm.github.io/mcp-context-forge/tutorials/dcr-hyprmcp/) - End-to-end DCR setup with HyperMCP gateway ### Personal Teams Configuration @@ -1281,6 +1297,29 @@ Follow the tutorial at https://ibm.github.io/mcp-context-forge/tutorials/dcr-hyp | `INVITATION_EXPIRY_DAYS` | Number of days before team invitations expire | `7` | int > 0 | | `REQUIRE_EMAIL_VERIFICATION_FOR_INVITES` | Require email verification for team invitations | `true` | bool | +### MCP Server Catalog + +> 🆕 **New in v0.7.0**: The MCP Server Catalog allows you to define a catalog of pre-configured MCP servers in a YAML file for easy discovery and management via the Admin UI. 
+ +| Setting | Description | Default | Options | +| ------------------------------------ | ------------------------------------------------ | ------------------ | ------- | +| `MCPGATEWAY_CATALOG_ENABLED` | Enable MCP server catalog feature | `true` | bool | +| `MCPGATEWAY_CATALOG_FILE` | Path to catalog configuration file | `mcp-catalog.yml` | string | +| `MCPGATEWAY_CATALOG_AUTO_HEALTH_CHECK` | Automatically health check catalog servers | `true` | bool | +| `MCPGATEWAY_CATALOG_CACHE_TTL` | Catalog cache TTL in seconds | `3600` | int > 0 | +| `MCPGATEWAY_CATALOG_PAGE_SIZE` | Number of catalog servers per page | `12` | int > 0 | + +**Key Features:** +- 🔄 Refresh Button - Manually refresh catalog without page reload +- 🔍 Debounced Search - Optimized search with 300ms debounce +- 📝 Custom Server Names - Specify custom names when registering +- 🔌 Transport Detection - Auto-detect SSE, WebSocket, or HTTP transports +- 🔐 OAuth Support - Register OAuth servers and configure later +- ⚡ Better Error Messages - User-friendly errors for common issues + +**Documentation:** +- [MCP Server Catalog Guide](https://ibm.github.io/mcp-context-forge/manage/catalog/) - Complete catalog setup and configuration + ### Security | Setting | Description | Default | Options | diff --git a/docs/docs/architecture/adr/.pages b/docs/docs/architecture/adr/.pages index c6aad17cb..bd05a21f9 100644 --- a/docs/docs/architecture/adr/.pages +++ b/docs/docs/architecture/adr/.pages @@ -1,5 +1,6 @@ title: Decision Records nav: + - Overview: index.md - 1 Adopt FastAPI + Pydantic: 001-adopt-fastapi-pydantic.md - 2 Use Async SQLAlchemy ORM: 002-use-async-sqlalchemy-orm.md - 3 Expose Multi-Transport Endpoints: 003-expose-multi-transport-endpoints.md @@ -8,7 +9,7 @@ nav: - 6 Gateway & Tool-Level Rate Limiting: 006-gateway-tool-rate-limiting.md - 7 Pluggable Cache Backend: 007-pluggable-cache-backend.md - 8 Federation & Auto-Discovery via DNS-SD: 008-federation-discovery.md - - 9 Built-in Health Checks: 
000-built-in-health-checks.md + - 9 Built-in Health Checks: 009-built-in-health-checks.md - 10 Observability via Prometheus: 010-observability-prometheus.md - 11 Namespaced Tool Federation: 011-tool-federation.md - 12 Drop-down tool selection: 012-dropdown-ui-tool-selection.md diff --git a/docs/docs/architecture/plugins.md b/docs/docs/architecture/plugins.md index 309b9bf50..397895023 100644 --- a/docs/docs/architecture/plugins.md +++ b/docs/docs/architecture/plugins.md @@ -638,7 +638,7 @@ class PluginMode(str, Enum): DISABLED = "disabled" # Plugin loaded but not executed class HookType(str, Enum): - """Available hook points in MCP request lifecycle""" + """Available hook points in MCP request lifecycle""" PROMPT_PRE_FETCH = "prompt_pre_fetch" # Before prompt retrieval PROMPT_POST_FETCH = "prompt_post_fetch" # After prompt rendering TOOL_PRE_INVOKE = "tool_pre_invoke" # Before tool execution diff --git a/docs/docs/manage/.pages b/docs/docs/manage/.pages index f9bf97450..0b77fc115 100644 --- a/docs/docs/manage/.pages +++ b/docs/docs/manage/.pages @@ -1,7 +1,10 @@ nav: - index.md - configuration.md + - scale.md + - tuning.md - backup.md + - catalog.md - bulk-import.md - metadata-tracking.md - export-import.md @@ -20,7 +23,8 @@ nav: - sso-google-tutorial.md - sso-ibm-tutorial.md - sso-okta-tutorial.md - - tuning.md + - rbac.md + - teams.md - ui-customization.md - upgrade.md - well-known-uris.md diff --git a/docs/docs/manage/catalog.md b/docs/docs/manage/catalog.md new file mode 100644 index 000000000..4e9459d58 --- /dev/null +++ b/docs/docs/manage/catalog.md @@ -0,0 +1,529 @@ +# MCP Server Catalog + +> 🆕 **New in v0.7.0**: The MCP Server Catalog feature allows you to define a catalog of pre-configured MCP servers in a YAML file for easy discovery, registration, and management via the Admin UI and API. 
+ +## Overview + +The MCP Server Catalog provides a declarative way to define and manage MCP servers, reducing manual configuration and enabling automated server registration and health monitoring. + +**Key Features:** + +- 📝 **Declarative Configuration**: Define servers in YAML format +- 🔍 **Automatic Discovery**: Servers are automatically registered on startup +- 💚 **Health Monitoring**: Automatic health checks for catalog servers +- 🗂️ **Categorization**: Organize servers with tags and descriptions +- ⚡ **Fast Onboarding**: Quickly add new MCP servers without API calls + +--- + +## Configuration + +### Environment Variables + +Configure the catalog feature using these environment variables: + +```bash +# Enable MCP server catalog feature (default: true) +MCPGATEWAY_CATALOG_ENABLED=true + +# Path to catalog configuration file (default: mcp-catalog.yml) +MCPGATEWAY_CATALOG_FILE=mcp-catalog.yml + +# Automatically health check catalog servers (default: true) +MCPGATEWAY_CATALOG_AUTO_HEALTH_CHECK=true + +# Catalog cache TTL in seconds (default: 3600) +MCPGATEWAY_CATALOG_CACHE_TTL=3600 + +# Number of catalog servers to display per page (default: 12) +MCPGATEWAY_CATALOG_PAGE_SIZE=12 +``` + +--- + +## Catalog File Format + +The catalog file uses YAML format with the following structure: + +### Basic Example + +```yaml +# mcp-catalog.yml +catalog_servers: + - id: "time-server" + name: "Time Server" + category: "Utilities" + url: "http://localhost:9000/sse" + auth_type: "Open" + provider: "Local" + description: "Fast time server providing current time utilities" + requires_api_key: false + tags: + - "utilities" + - "time" + - "development" + + - id: "git-server" + name: "Git Server" + category: "Software Development" + url: "http://localhost:9001/sse" + auth_type: "Open" + provider: "Local" + description: "Git repository MCP server" + requires_api_key: false + tags: + - "git" + - "version-control" + - "development" + +# Optional: Categories for UI filtering +categories: + 
- Utilities + - Software Development + +# Optional: Auth types for UI filtering +auth_types: + - Open + - OAuth2.1 + - API Key +``` + +### Full Example with All Fields + +```yaml +# Production MCP Server Catalog +catalog_servers: + - id: "production-time-server" + name: "Production Time Server" + category: "Utilities" + url: "https://time.api.example.com/sse" + transport: "SSE" # Optional: Explicitly specify transport type + auth_type: "OAuth2.1" + provider: "Internal Platform" + description: "Production time server with geo-replication" + requires_api_key: false + secure: true + tags: + - "production" + - "utilities" + - "time" + - "geo-replicated" + logo_url: "https://static.example.com/time-server-logo.png" + documentation_url: "https://docs.example.com/time-server" + + - id: "websocket-server" + name: "WebSocket MCP Server" + category: "Development Tools" + url: "wss://api.example.com/mcp" + transport: "WEBSOCKET" # Specify WebSocket transport + auth_type: "API Key" + provider: "Internal Platform" + description: "Real-time MCP server using WebSocket protocol" + requires_api_key: true + secure: true + tags: + - "production" + - "websocket" + - "real-time" + + - id: "database-server" + name: "Database Server" + category: "Database" + url: "https://db.api.example.com/sse" + auth_type: "OAuth2.1" + provider: "Internal Platform" + description: "Database query and management MCP server" + requires_api_key: false + secure: true + tags: + - "production" + - "database" + - "postgresql" + documentation_url: "https://docs.example.com/db-server" + + - id: "github-api" + name: "GitHub" + category: "Software Development" + url: "https://api.githubcopilot.com/mcp" + auth_type: "OAuth2.1" + provider: "GitHub" + description: "Version control and collaborative software development" + requires_api_key: false + secure: true + tags: + - "development" + - "git" + - "version-control" + logo_url: "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" + 
documentation_url: "https://docs.github.com" + +categories: + - Utilities + - Database + - Software Development + +auth_types: + - OAuth2.1 + - API Key + - Open +``` + +--- + +## Field Reference + +### Root Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `catalog_servers` | array | Yes | List of MCP server definitions | +| `categories` | array | No | List of available categories for UI filtering | +| `auth_types` | array | No | List of available auth types for UI filtering | + +### Catalog Server Fields + +Based on the `CatalogServer` schema (schemas.py:5371-5387): + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `id` | string | Yes | Unique identifier for the catalog server | +| `name` | string | Yes | Display name of the server | +| `category` | string | Yes | Server category (e.g., "Project Management", "Software Development") | +| `url` | string | Yes | Server endpoint URL | +| `auth_type` | string | Yes | Authentication type (e.g., "OAuth2.1", "API Key", "Open") | +| `provider` | string | Yes | Provider/vendor name | +| `description` | string | Yes | Server description | +| `requires_api_key` | boolean | No | Whether API key is required (default: `false`) | +| `secure` | boolean | No | Whether additional security is required (default: `false`) | +| `tags` | array | No | Tags for categorization (default: `[]`) | +| `transport` | string | No | Transport type: `SSE`, `STREAMABLEHTTP`, or `WEBSOCKET` (auto-detected if not specified) | +| `logo_url` | string | No | URL to server logo/icon | +| `documentation_url` | string | No | URL to server documentation | +| `is_registered` | boolean | No | Whether server is already registered (set by system) | +| `is_available` | boolean | No | Whether server is currently available (default: `true`) | + +--- + +## Usage + +### Automatic Registration on Startup + +When `MCPGATEWAY_CATALOG_ENABLED=true`, the gateway automatically: + +1. 
Reads the catalog file on startup +2. Registers all enabled servers +3. Starts health checks (if enabled) +4. Makes servers available via the Admin UI and API + +### Manual Catalog Reload + +Reload the catalog without restarting the gateway: + +```bash +# Using the CLI +mcpgateway catalog reload + +# Or via API +curl -X POST -H "Authorization: Bearer $TOKEN" \ + http://localhost:4444/admin/catalog/reload +``` + +### Listing Catalog Servers + +```bash +# Via CLI +mcpgateway catalog list + +# Via API +curl -H "Authorization: Bearer $TOKEN" \ + http://localhost:4444/admin/catalog/servers +``` + +### Filtering by Tags + +```bash +# List all production servers +curl -H "Authorization: Bearer $TOKEN" \ + "http://localhost:4444/admin/catalog/servers?tag=production" + +# List all database servers +curl -H "Authorization: Bearer $TOKEN" \ + "http://localhost:4444/admin/catalog/servers?tag=database" +``` + +--- + +## Best Practices + +### 1. Use Consistent Naming + +Use clear, descriptive IDs and names: + +```yaml +catalog_servers: + - id: "github-production" # ✅ Good: Clear and unique + name: "GitHub Production" + # ... other fields +``` + +### 2. Organize with Tags + +Use consistent tagging for easier filtering and management: + +```yaml +catalog_servers: + - id: "prod-db-server" + name: "Production Database" + category: "Database" + tags: + - "production" # Environment + - "postgresql" # Technology + - "critical" # Priority +``` + +### 3. Categorize Clearly + +Use standard categories that match your organization: + +```yaml +categories: + - "Software Development" + - "Database" + - "Productivity" + - "Project Management" +``` + +### 4. 
Document Server Metadata + +Include logo and documentation URLs for better UX: + +```yaml +catalog_servers: + - id: "time-server" + name: "Time Server" + description: "Provides current time utilities with geo-replication" + documentation_url: "https://docs.example.com/time-server" + logo_url: "https://static.example.com/logos/time-server.png" +``` + +--- + +## Examples + +### Development Environment + +```yaml +# mcp-catalog.dev.yml +catalog_servers: + - id: "local-time" + name: "Local Time Server" + category: "Utilities" + url: "http://localhost:9000/sse" + auth_type: "Open" + provider: "Local" + description: "Local development time server" + requires_api_key: false + tags: ["dev", "utilities", "time"] + + - id: "local-git" + name: "Local Git Server" + category: "Software Development" + url: "http://localhost:9001/sse" + auth_type: "Open" + provider: "Local" + description: "Local Git MCP server" + requires_api_key: false + tags: ["dev", "git", "version-control"] + +categories: + - "Utilities" + - "Software Development" + +auth_types: + - "Open" +``` + +### Production Environment + +```yaml +# mcp-catalog.prod.yml +catalog_servers: + - id: "prod-time-api" + name: "Production Time API" + category: "Utilities" + url: "https://time.api.example.com/sse" + auth_type: "OAuth2.1" + provider: "Platform Engineering" + description: "Production time API with geo-replication and high availability" + requires_api_key: false + secure: true + tags: ["production", "critical", "utilities"] + documentation_url: "https://docs.example.com/time-api" + + - id: "prod-database-api" + name: "Production Database API" + category: "Database" + url: "https://db.api.example.com/sse" + auth_type: "OAuth2.1" + provider: "Platform Engineering" + description: "Production PostgreSQL database API with RBAC" + requires_api_key: false + secure: true + tags: ["production", "critical", "database", "postgresql"] + documentation_url: "https://docs.example.com/db-api" + + - id: "stripe-payments" + name: 
"Stripe Payments" + category: "Payments" + url: "https://mcp.stripe.com/" + auth_type: "API Key" + provider: "Stripe" + description: "Payment processing and financial infrastructure" + requires_api_key: true + secure: true + tags: ["production", "payments", "finance"] + logo_url: "https://stripe.com/img/v3/home/social.png" + documentation_url: "https://stripe.com/docs" + +categories: + - "Utilities" + - "Database" + - "Payments" + +auth_types: + - "OAuth2.1" + - "API Key" +``` + +--- + +## Troubleshooting + +### Catalog File Not Loading + +**Symptoms:** Servers from catalog don't appear in the Admin UI + +**Solutions:** + +1. Check that catalog is enabled: + ```bash + echo $MCPGATEWAY_CATALOG_ENABLED # Should be "true" + ``` + +2. Verify catalog file path: + ```bash + ls -la mcp-catalog.yml # Or your configured path + ``` + +3. Check gateway logs for parsing errors: + ```bash + docker logs mcpgateway | grep -i catalog + ``` + +4. Validate YAML syntax: + ```bash + python3 -c "import yaml; yaml.safe_load(open('mcp-catalog.yml'))" + ``` + +### Servers Not Appearing in Catalog + +**Symptoms:** Catalog servers don't appear in the Admin UI + +**Solutions:** + +1. Verify server URLs are accessible: + ```bash + curl -v http://localhost:9000/sse + ``` + +2. Check server entry has all required fields: + ```yaml + catalog_servers: + - id: "my-server" # Required + name: "My Server" # Required + category: "Utilities" # Required + url: "http://..." # Required + auth_type: "Open" # Required + provider: "MyProvider" # Required + description: "..." # Required + ``` + +3. Validate YAML syntax: + ```bash + python3 -c "import yaml; print(yaml.safe_load(open('mcp-catalog.yml')))" + ``` + +### Authentication Errors + +**Symptoms:** 401/403 errors when accessing catalog servers after registration + +**Solutions:** + +1. 
Verify the `auth_type` matches the server's requirements: + - `"Open"` - No authentication required + - `"API Key"` - Requires API key (set `requires_api_key: true`) + - `"OAuth2.1"` - Requires OAuth configuration + +2. For OAuth servers, ensure you complete the OAuth flow after registration via the Admin UI + +3. For API Key servers, provide the API key during registration + +### Transport Type Issues + +**Symptoms:** WebSocket servers fail to connect after registration + +**Solutions:** + +1. Explicitly specify the `transport` field in your catalog YAML: + ```yaml + catalog_servers: + - id: "websocket-server" + url: "wss://api.example.com/mcp" + transport: "WEBSOCKET" # Explicitly set transport + ``` + +2. Verify URL scheme matches transport type: + - WebSocket: `ws://` or `wss://` + - SSE: `http://` or `https://` with `/sse` path + - HTTP: `http://` or `https://` with `/mcp` path + +--- + +## Recent Improvements (v0.7.0) + +### Enhanced UI Features + +The catalog UI now includes several UX improvements: + +- **🔄 Refresh Button**: Manually refresh the catalog without page reload +- **🔍 Debounced Search**: 300ms debounce on search input for better performance +- **📝 Custom Server Names**: Ability to specify custom names when registering servers +- **📄 Pagination with Filters**: Filter parameters preserved when navigating pages +- **⚡ Better Error Messages**: User-friendly error messages for common issues (connection, auth, SSL, etc.) 
+- **🔐 OAuth Support**: OAuth servers can be registered without credentials and configured later + +### Transport Type Detection + +The catalog now supports: + +- **Explicit Transport**: Specify `transport` field in catalog YAML (`SSE`, `WEBSOCKET`, `STREAMABLEHTTP`) +- **Auto-Detection**: Automatically detects transport from URL if not specified + - `ws://` or `wss://` → `WEBSOCKET` + - URLs ending in `/sse` → `SSE` + - URLs with `/mcp` path → `STREAMABLEHTTP` + - Default fallback → `SSE` + +### Authentication Improvements + +- **Custom Auth Headers**: Properly mapped as list of header key-value pairs +- **OAuth Registration**: OAuth servers can be registered in "disabled" state until OAuth flow is completed +- **API Key Modal**: Enhanced modal with custom name field and proper authorization headers + +--- + +## See Also + +- [Configuration Reference](./index.md) - Complete configuration guide +- [SSO Configuration](./sso.md) - Authentication and SSO setup +- [Export/Import](./export-import.md) - Bulk operations and data migration +- [Observability](./observability.md) - Monitoring and tracing diff --git a/docs/docs/manage/dcr.md b/docs/docs/manage/dcr.md index 4a2d5ff71..59af68079 100644 --- a/docs/docs/manage/dcr.md +++ b/docs/docs/manage/dcr.md @@ -8,13 +8,113 @@ DCR solves a common authentication challenge in distributed MCP deployments: - **Traditional OAuth2** requires pre-configured client credentials for each MCP server - **With DCR**, MCP clients can automatically register themselves as OAuth2 clients +**Key Benefits:** +- ✅ Zero-touch OAuth configuration - no manual credential management +- ✅ Automatic discovery via RFC 8414 (Authorization Server Metadata) +- ✅ Support for public clients (PKCE-only, no client secret) +- ✅ Credential encryption and secure storage +- ✅ Issuer allowlist for security + +--- + ## What is Dynamic Client Registration? 
Dynamic Client Registration is defined in [RFC 7591](https://tools.ietf.org/html/rfc7591) as a protocol that allows OAuth 2.0 clients to register with authorization servers dynamically. In the context of MCP Gateway: +1. **Discovery Phase**: Gateway discovers Authorization Server (AS) metadata via [RFC 8414](https://tools.ietf.org/html/rfc8414) +2. **Registration Phase**: Gateway registers itself as an OAuth client with the AS +3. **Token Phase**: Gateway uses registered credentials for standard OAuth flows + +--- + +## Environment Variables + +```bash +# DCR Feature Control +MCPGATEWAY_DCR_ENABLED=true # Enable/disable DCR (default: true) +MCPGATEWAY_DCR_AUTO_REGISTER_ON_MISSING_CREDENTIALS=true # Auto-register when gateway has issuer but no client_id (default: true) + +# DCR Configuration +MCPGATEWAY_DCR_DEFAULT_SCOPES="mcp:read" # Default scopes to request (comma-separated, default: mcp:read) +MCPGATEWAY_DCR_ALLOWED_ISSUERS="" # Optional allowlist of issuer URLs (empty = allow any) +MCPGATEWAY_DCR_TOKEN_ENDPOINT_AUTH_METHOD="client_secret_basic" # Auth method: client_secret_basic or client_secret_post +MCPGATEWAY_DCR_METADATA_CACHE_TTL=3600 # AS metadata cache TTL in seconds (default: 1 hour) +MCPGATEWAY_DCR_CLIENT_NAME_TEMPLATE="MCP Gateway ({gateway_name})" # Client name template for registration + +# OAuth Settings (used by DCR) +OAUTH_REQUEST_TIMEOUT=30 # HTTP request timeout in seconds +AUTH_ENCRYPTION_SECRET= # Required for encrypting client secrets +``` + +!!! important "Security Note" + Always set `AUTH_ENCRYPTION_SECRET` to a strong random value. DCR-registered client secrets and refresh tokens are encrypted using this key. + +--- + +## How It Works + +### 1. 
AS Metadata Discovery (RFC 8414) + +When you configure a gateway with an `issuer` URL, the gateway automatically discovers the AS configuration: + +``` +GET https://auth.example.com/.well-known/oauth-authorization-server +``` + +If that fails, tries OIDC discovery: + +``` +GET https://auth.example.com/.well-known/openid-configuration +``` + +The metadata response includes: +- `registration_endpoint` - Where to register clients +- `authorization_endpoint` - OAuth authorization URL +- `token_endpoint` - Token exchange URL +- Supported grant types, scopes, and auth methods + +**Caching:** Metadata is cached for `MCPGATEWAY_DCR_METADATA_CACHE_TTL` seconds (default: 1 hour). + +### 2. Client Registration (RFC 7591) + +If no `client_id` is configured and `MCPGATEWAY_DCR_AUTO_REGISTER_ON_MISSING_CREDENTIALS=true`, the gateway automatically registers: + +```json +POST https://auth.example.com/register +Content-Type: application/json + +{ + "client_name": "MCP Gateway (GitHub MCP)", + "redirect_uris": ["https://gateway.example.com/oauth/callback"], + "grant_types": ["authorization_code"], + "response_types": ["code"], + "token_endpoint_auth_method": "client_secret_basic", + "scope": "mcp:read mcp:write" +} +``` + +The AS responds with registered credentials: + +```json +{ + "client_id": "auto-generated-id", + "client_secret": "auto-generated-secret", + "registration_access_token": "...", + "registration_client_uri": "https://auth.example.com/register/auto-generated-id" +} +``` + +Gateway stores these in the `registered_oauth_clients` table (encrypted). + +### 3. OAuth Flow with Registered Credentials + +Once registered, the gateway uses the client credentials for standard OAuth flows with automatic PKCE. + +--- + ## Architecture -A simplified architecture. Please view the following guide for an in depth swimlane chart: +A simplified architecture. 
Please view the following guide for an in-depth swimlane chart: 📖 **[Dynamic Client Registration with HyperMCP Tutorial](../tutorials/dcr-hyprmcp.md)** @@ -32,9 +132,230 @@ graph LR HyprMCP --> ContextForge ``` +--- + +## Configuration Examples + +### Automatic Registration (Recommended) + +Configure a gateway with just the issuer URL - credentials auto-register: + +**Admin UI:** +1. Create/Edit Gateway → Authentication Type: OAuth +2. Set Grant Type: `authorization_code` +3. Set Issuer: `https://auth.example.com` +4. Set Redirect URI: `https://gateway.example.com/oauth/callback` +5. Leave Client ID and Client Secret empty +6. Save → Gateway auto-registers and stores credentials + +**JSON/API:** + +```json +{ + "name": "Auto-DCR Gateway", + "url": "https://mcp.example.com/sse", + "auth_type": "oauth", + "oauth_config": { + "grant_type": "authorization_code", + "issuer": "https://auth.example.com", + "redirect_uri": "https://gateway.example.com/oauth/callback", + "scopes": ["mcp:read", "mcp:write"] + } +} +``` + +### Manual Client Credentials (Fallback) + +If DCR is disabled or you have pre-registered credentials: + +```json +{ + "name": "Manual OAuth Gateway", + "url": "https://mcp.example.com/sse", + "auth_type": "oauth", + "oauth_config": { + "grant_type": "authorization_code", + "client_id": "pre-registered-id", + "client_secret": "pre-registered-secret", + "authorization_url": "https://auth.example.com/authorize", + "token_url": "https://auth.example.com/token", + "redirect_uri": "https://gateway.example.com/oauth/callback", + "scopes": ["mcp:read"] + } +} +``` + +### Public Client with PKCE Only + +For public clients that don't support client secrets: + +```json +{ + "name": "Public Client Gateway", + "url": "https://mcp.example.com/sse", + "auth_type": "oauth", + "oauth_config": { + "grant_type": "authorization_code", + "issuer": "https://auth.example.com", + "client_id": "public-client-id", + "redirect_uri": "https://gateway.example.com/oauth/callback", + 
"scopes": ["mcp:read"] + } +} +``` + +Note: No `client_secret` - PKCE provides security. + +--- + +## Database Schema + +DCR uses two new tables: + +### `registered_oauth_clients` + +Stores auto-registered OAuth clients: + +```sql +CREATE TABLE registered_oauth_clients ( + id VARCHAR(36) PRIMARY KEY, + gateway_id VARCHAR(255) NOT NULL, + issuer VARCHAR(500) NOT NULL, + client_id VARCHAR(255) NOT NULL, + client_secret_encrypted TEXT, + redirect_uris TEXT NOT NULL, + grant_types TEXT NOT NULL, + scope VARCHAR(1000), + registration_client_uri VARCHAR(500), + registration_access_token_encrypted TEXT, + created_at TIMESTAMP, + expires_at TIMESTAMP, + is_active BOOLEAN, + UNIQUE (gateway_id, issuer) +); +``` + +### `oauth_states` + +Stores OAuth authorization state with PKCE verifiers: + +```sql +CREATE TABLE oauth_states ( + id VARCHAR(36) PRIMARY KEY, + gateway_id VARCHAR(255) NOT NULL, + state VARCHAR(255) NOT NULL UNIQUE, + code_verifier VARCHAR(255), -- PKCE code verifier + app_user_email VARCHAR(255), + created_at TIMESTAMP, + expires_at TIMESTAMP +); +``` + +--- + +## Security Features + +### 1. Issuer Allowlist + +Restrict which Authorization Servers can be used: + +```bash +MCPGATEWAY_DCR_ALLOWED_ISSUERS='["https://trusted-as1.com", "https://trusted-as2.com"]' +``` + +If set, gateway will reject DCR for any issuer not in the list. + +### 2. Encrypted Credentials + +All sensitive data is encrypted at rest using `AUTH_ENCRYPTION_SECRET`: +- Client secrets +- Registration access tokens +- Refresh tokens + +### 3. PKCE Integration + +All Authorization Code flows automatically use PKCE (RFC 7636) for enhanced security: +- Prevents authorization code interception attacks +- Supports public clients without client secrets +- No configuration needed - always enabled + +### 4. 
Metadata Validation + +Gateway validates AS metadata responses: +- Issuer URL must match discovery URL +- Registration endpoint must be present +- Proper HTTP status codes (200/201 for registration) + +--- + +## Troubleshooting + +### DCR Registration Fails + +**Error: "AS does not support Dynamic Client Registration"** +- The Authorization Server doesn't expose a `registration_endpoint` in its metadata +- Solution: Use manual client credentials instead + +**Error: "Issuer not in allowed issuers list"** +- The issuer URL is not in `MCPGATEWAY_DCR_ALLOWED_ISSUERS` +- Solution: Add the issuer to the allowlist or clear the allowlist to allow any + +**Error: "Failed to discover AS metadata"** +- The issuer URL is incorrect or unreachable +- Solution: Verify the issuer URL and network connectivity + +### Client Registration Rejected + +**HTTP 400/401 from AS** +- Check AS logs for specific error messages +- Verify redirect URI matches AS requirements +- Ensure scopes are supported by the AS + +**HTTP 200 instead of 201** +- Some ASs return 200 instead of RFC 7591's 201 - this is accepted +- If registration still fails, check response body for errors + +### Token Exchange Issues + +**Invalid client_id after DCR** +- Verify registration was successful in `registered_oauth_clients` table +- Check that `client_secret_encrypted` was properly stored +- Try re-registering by deleting the record and recreating the gateway + +--- + +## Monitoring + +Check DCR activity in logs: + +```bash +# Search for DCR operations +grep "Discovered AS metadata" logs/mcpgateway.log +grep "Successfully registered client" logs/mcpgateway.log +grep "Found existing registered client" logs/mcpgateway.log + +# Check for errors +grep "DcrError" logs/mcpgateway.log +``` + +Query registered clients: + +```sql +SELECT + gateway_id, + issuer, + client_id, + created_at, + is_active +FROM registered_oauth_clients +WHERE is_active = 1; +``` + +--- + ## Related Documentation -- [OAuth 2.0 
Integration](oauth.md) - General OAuth2 configuration +- [OAuth 2.0 Integration](oauth.md) - General OAuth2 configuration and PKCE details - [Proxy Authentication](../deployment/proxy-auth.md) - Using authentication proxies - [SSO Configuration](sso.md) - Single Sign-On setup - [Security Best Practices](securing.md) - Security guidelines diff --git a/docs/docs/manage/index.md b/docs/docs/manage/index.md index 3fc890cae..1a02c889d 100644 --- a/docs/docs/manage/index.md +++ b/docs/docs/manage/index.md @@ -20,6 +20,8 @@ Whether you're self-hosting, running in the cloud, or deploying to Kubernetes, t | Page | Description | |------|-------------| | [Configuration](configuration.md) | **Complete configuration reference** - databases, environment variables, and deployment settings | +| [Scaling Guide](scale.md) | 📈 **Production Scaling** - Horizontal/vertical scaling, Kubernetes HPA, connection pooling, and performance tuning | +| [Performance Tuning](tuning.md) | Optimize Gunicorn workers, database connections, and container resources | | [Dynamic Client Registration](dcr.md) | 🔐 **OAuth2 DCR** - Automatic client provisioning for streamable HTTP servers | | [Backups](backup.md) | How to persist and restore your database, configs, and resource state | | [Export & Import](export-import.md) | Complete configuration management with CLI, API, and Admin UI | diff --git a/docs/docs/manage/oauth.md b/docs/docs/manage/oauth.md index f087ce645..9579126da 100644 --- a/docs/docs/manage/oauth.md +++ b/docs/docs/manage/oauth.md @@ -187,8 +187,28 @@ See also: [securing.md](./securing.md) for general hardening guidance and [proxy --- +## PKCE Support + +MCP Gateway implements **PKCE (Proof Key for Code Exchange)** as defined in [RFC 7636](https://tools.ietf.org/html/rfc7636) for all Authorization Code flows. 
This provides enhanced security, especially for: + +- Public clients (mobile apps, SPAs, desktop apps) +- Environments where client secrets cannot be securely stored +- Protection against authorization code interception attacks + +**How it works:** + +1. Gateway generates a random `code_verifier` (43-128 characters) +2. Computes `code_challenge` = BASE64URL(SHA256(code_verifier)) +3. Sends `code_challenge` and `code_challenge_method=S256` in authorization request +4. Stores `code_verifier` in OAuth state (encrypted at rest) +5. Includes `code_verifier` when exchanging authorization code for token + +PKCE is **automatically enabled** for all Authorization Code flows - no configuration needed. + +--- + ## FAQ -- Can I use PKCE? Not yet; planned as a future enhancement. -- Can I configure per-tool OAuth? Roadmap considers multiple OAuth configs per tool; current design is per-gateway. -- Do you cache tokens? Default is no caching; tokens are fetched per operation. Optional storage/refresh is planned per the UI design. +- **Can I use PKCE?** Yes! PKCE is automatically enabled for all Authorization Code flows (RFC 7636). +- **Can I configure per-tool OAuth?** Roadmap considers multiple OAuth configs per tool; current design is per-gateway. +- **Do you cache tokens?** Default is no caching; tokens are fetched per operation. Optional storage/refresh is available for Authorization Code flows. diff --git a/docs/docs/manage/scale.md b/docs/docs/manage/scale.md new file mode 100644 index 000000000..da09ce92c --- /dev/null +++ b/docs/docs/manage/scale.md @@ -0,0 +1,1445 @@ +# Scaling MCP Gateway + +> Comprehensive guide to scaling MCP Gateway from development to production, covering vertical scaling, horizontal scaling, connection pooling, performance tuning, and Kubernetes deployment strategies. + +## Overview + +MCP Gateway is designed to scale from single-container development environments to distributed multi-node production deployments. 
This guide covers: + +- **Vertical Scaling**: Optimizing single-instance performance with Gunicorn workers +- **Horizontal Scaling**: Multi-container deployments with shared state +- **Database Optimization**: PostgreSQL connection pooling and settings +- **Cache Architecture**: Redis for distributed caching +- **Performance Tuning**: Configuration and benchmarking +- **Kubernetes Deployment**: HPA, resource limits, and best practices + +--- + +## Table of Contents + +1. [Understanding the GIL and Worker Architecture](#1-understanding-the-gil-and-worker-architecture) +2. [Vertical Scaling with Gunicorn](#2-vertical-scaling-with-gunicorn) +3. [Future: Python 3.14 and PostgreSQL 18](#3-future-python-314-and-postgresql-18) +4. [Horizontal Scaling with Kubernetes](#4-horizontal-scaling-with-kubernetes) +5. [Database Connection Pooling](#5-database-connection-pooling) +6. [Redis for Distributed Caching](#6-redis-for-distributed-caching) +7. [Performance Tuning](#7-performance-tuning) +8. [Benchmarking and Load Testing](#8-benchmarking-and-load-testing) +9. [Health Checks and Readiness](#9-health-checks-and-readiness) +10. [Stateless Architecture and Long-Running Connections](#10-stateless-architecture-and-long-running-connections) +11. [Kubernetes Production Deployment](#11-kubernetes-production-deployment) +12. [Monitoring and Observability](#12-monitoring-and-observability) + +--- + +## 1. Understanding the GIL and Worker Architecture + +### The Python Global Interpreter Lock (GIL) + +Python's Global Interpreter Lock (GIL) prevents multiple native threads from executing Python bytecode simultaneously. 
This means: + +- **Single worker** = Single CPU core usage (even on multi-core systems) +- **I/O-bound workloads** (API calls, database queries) benefit from async/await +- **CPU-bound workloads** (JSON parsing, encryption) require multiple processes + +### Pydantic v2: Rust-Powered Performance + +MCP Gateway leverages **Pydantic v2.11+** for all request/response validation and schema definitions. Unlike pure Python libraries, Pydantic v2 includes a **Rust-based core** (`pydantic-core`) that significantly improves performance: + +**Performance benefits:** +- **5-50x faster validation** compared to Pydantic v1 +- **JSON parsing** in Rust (bypasses GIL for serialization/deserialization) +- **Schema validation** runs in compiled Rust code +- **Reduced CPU overhead** for request processing + +**Impact on scaling:** +- 5,463 lines of Pydantic schemas (`mcpgateway/schemas.py`) +- Every API request validated through Rust-optimized code +- Lower CPU usage per request = higher throughput per worker +- Rust components release the GIL during execution + +This means that even within a single worker process, Pydantic's Rust core can run concurrently with Python code for validation-heavy workloads. + +### MCP Gateway's Solution: Gunicorn with Multiple Workers + +MCP Gateway uses **Gunicorn with UvicornWorker** to spawn multiple worker processes: + +```python +# gunicorn.config.py +workers = 8 # Multiple processes bypass the GIL +worker_class = "uvicorn.workers.UvicornWorker" # Async support +timeout = 600 # 10-minute timeout for long-running operations +preload_app = True # Load app once, then fork (memory efficient) +``` + +**Key benefits:** + +- Each worker is a separate process with its own GIL +- 8 workers = ability to use 8 CPU cores +- UvicornWorker enables async I/O within each worker +- Preloading reduces memory footprint (shared code segments) + +The trade-off is that you are running multiple Python interpreter instances, and each consumes additional memory. 
This also requires shared state across processes (e.g., Redis or a shared database).
+
+---
Future: Python 3.14 and PostgreSQL 18 + +### Python 3.14 (Free-Threaded Mode) + +**Status**: Beta (as of July 2025) - [PEP 703](https://peps.python.org/pep-0703/) + +Python 3.14 introduces **optional free-threading** (GIL removal), a groundbreaking change that enables true parallel multi-threading: + +```bash +# Enable free-threading mode +python3.14 -X gil=0 -m gunicorn ... + +# Or use PYTHON_GIL environment variable +PYTHON_GIL=0 python3.14 -m gunicorn ... +``` + +**Performance characteristics:** + +| Workload Type | Expected Impact | +|---------------|----------------| +| Single-threaded | **3-15% slower** (overhead from thread-safety mechanisms) | +| Multi-threaded (I/O-bound) | **Minimal impact** (already benefits from async/await) | +| Multi-threaded (CPU-bound) | **Near-linear scaling** with CPU cores | +| Multi-process (current) | **No change** (already bypasses GIL) | + +**Benefits when available:** +- **True parallel threads**: Multiple threads execute Python code simultaneously +- **Lower memory overhead**: Threads share memory (vs. separate processes) +- **Faster inter-thread communication**: Shared memory, no IPC overhead +- **Better resource efficiency**: One interpreter instance instead of multiple processes + +**Trade-offs:** +- **Single-threaded penalty**: 3-15% slower due to fine-grained locking +- **Library compatibility**: Some C extensions need updates (most popular libraries already compatible) +- **Different scaling model**: Move from `workers=16` to `workers=2 --threads=32` + +**Migration strategy:** + +1. **Now (Python 3.11-3.13)**: Continue using multi-process Gunicorn + ```python + workers = 16 # Multiple processes + worker_class = "uvicorn.workers.UvicornWorker" + ``` + +2. **Python 3.14 beta**: Test in staging environment + ```bash + # Build free-threaded Python + ./configure --enable-experimental-jit --with-pydebug + make + + # Test with free-threading + PYTHON_GIL=0 python3.14 -m pytest tests/ + ``` + +3. 
**Python 3.14 stable**: Evaluate hybrid approach + ```python + workers = 4 # Fewer processes + threads = 8 # More threads per process + worker_class = "uvicorn.workers.UvicornWorker" + ``` + +4. **Post-migration**: Thread-based scaling + ```python + workers = 2 # Minimal processes + threads = 32 # Scale with threads + preload_app = True # Single app load + ``` + +**Current recommendation**: +- **Production**: Use Python 3.11-3.13 with multi-process Gunicorn (proven, stable) +- **Testing**: Experiment with Python 3.14 beta in non-production environments +- **Monitoring**: Watch for library compatibility announcements + +**Why MCP Gateway is well-positioned for free-threading:** + +MCP Gateway's architecture already benefits from components that will perform even better with Python 3.14: + +1. **Pydantic v2 Rust core**: Already bypasses GIL for validation - will work seamlessly with free-threading +2. **FastAPI/Uvicorn**: Built for async I/O - natural fit for thread-based concurrency +3. **SQLAlchemy async**: Database operations already non-blocking +4. **Stateless design**: No shared mutable state between requests + +**Resources:** +- [Python 3.14 Free-Threading Guide](https://www.pythoncheatsheet.org/blog/python-3-14-breaking-free-from-gil) +- [PEP 703: Making the GIL Optional](https://peps.python.org/pep-0703/) +- [Python 3.14 Release Schedule](https://peps.python.org/pep-0745/) +- [Pydantic v2 Performance](https://docs.pydantic.dev/latest/blog/pydantic-v2/) + +### PostgreSQL 18 (Async I/O) + +**Status**: Development (expected 2025) + +PostgreSQL 18 introduces native async I/O: + +- **Improved connection handling**: Better async query performance +- **Reduced latency**: Non-blocking I/O operations +- **Better scalability**: Efficient connection multiplexing + +**Current recommendation**: PostgreSQL 16+ (stable async support via asyncpg) + +```bash +# Production-ready now +DATABASE_URL=postgresql+asyncpg://user:pass@postgres:5432/mcp +``` + +--- + +## 4. 
Horizontal Scaling with Kubernetes + +### Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Load Balancer │ +│ (Kubernetes Service) │ +└────────────┬────────────────────────────────┬───────────────┘ + │ │ + ┌────────▼─────────┐ ┌────────▼─────────┐ + │ Gateway Pod 1 │ │ Gateway Pod 2 │ + │ (8 workers) │ │ (8 workers) │ + └────────┬─────────┘ └────────┬─────────┘ + │ │ + └────────────┬───────────────────┘ + │ + ┌───────────────▼───────────────────────┐ + │ │ + ┌─────▼──────┐ ┌──────────▼─────┐ + │ PostgreSQL │ │ Redis │ + │ (shared) │ │ (shared) │ + └────────────┘ └────────────────┘ +``` + +### Shared State Requirements + +For multi-pod deployments: + +1. **Shared PostgreSQL**: All data (servers, tools, users, teams) +2. **Shared Redis**: Distributed caching and session management +3. **Stateless pods**: No local state, can be killed/restarted anytime + +### Kubernetes Deployment + +#### Helm Chart Configuration + +```yaml +# charts/mcp-stack/values.yaml +mcpContextForge: + replicaCount: 3 # Start with 3 pods + + # Horizontal Pod Autoscaler + hpa: + enabled: true + minReplicas: 3 # Never scale below 3 + maxReplicas: 20 # Scale up to 20 pods + targetCPUUtilizationPercentage: 70 # Scale at 70% CPU + targetMemoryUtilizationPercentage: 80 # Scale at 80% memory + + # Pod resources + resources: + limits: + cpu: 2000m # 2 cores per pod + memory: 4Gi + requests: + cpu: 1000m # 1 core per pod + memory: 2Gi + + # Environment configuration + config: + GUNICORN_WORKERS: "8" # 8 workers per pod + CACHE_TYPE: redis # Shared cache + DB_POOL_SIZE: "50" # Per-pod pool size + +# Shared PostgreSQL +postgres: + enabled: true + resources: + limits: + cpu: 4000m # 4 cores + memory: 8Gi + requests: + cpu: 2000m + memory: 4Gi + + # Important: Set max_connections + # Formula: (num_pods × DB_POOL_SIZE × 1.2) + 20 + # Example: (20 pods × 50 pool × 1.2) + 20 = 1220 + config: + max_connections: 1500 # Adjust based on scale + +# Shared Redis +redis: 
+ enabled: true + resources: + limits: + cpu: 2000m + memory: 4Gi + requests: + cpu: 1000m + memory: 2Gi +``` + +#### Deploy with Helm + +```bash +# Install/upgrade with custom values +helm upgrade --install mcp-stack ./charts/mcp-stack \ + --namespace mcp-gateway \ + --create-namespace \ + --values production-values.yaml + +# Verify HPA +kubectl get hpa -n mcp-gateway +``` + +### Horizontal Scaling Calculation + +**Total capacity** = `pods × workers × requests_per_second` + +**Example:** +- 10 pods × 8 workers × 100 RPS = **8,000 RPS** + +**Database connections needed:** +- 10 pods × 50 pool size = **500 connections** +- Add 20% overhead = **600 connections** +- Set `max_connections=1000` (buffer for maintenance) + +--- + +## 5. Database Connection Pooling + +### Connection Pool Architecture + +SQLAlchemy manages a connection pool per process: + +``` +Pod 1 (8 workers) → 8 connection pools → PostgreSQL +Pod 2 (8 workers) → 8 connection pools → PostgreSQL +Pod N (8 workers) → 8 connection pools → PostgreSQL +``` + +### Pool Configuration + +#### Environment Variables + +```bash +# Connection pool settings +DB_POOL_SIZE=50 # Persistent connections per worker +DB_MAX_OVERFLOW=10 # Additional connections allowed +DB_POOL_TIMEOUT=60 # Wait time before timeout (seconds) +DB_POOL_RECYCLE=3600 # Recycle connections after 1 hour +DB_MAX_RETRIES=5 # Retry attempts on failure +DB_RETRY_INTERVAL_MS=2000 # Retry interval +``` + +#### Configuration in Code + +```python +# mcpgateway/config.py +@property +def database_settings(self) -> dict: + return { + "pool_size": self.db_pool_size, # 50 + "max_overflow": self.db_max_overflow, # 10 + "pool_timeout": self.db_pool_timeout, # 60s + "pool_recycle": self.db_pool_recycle, # 3600s + } +``` + +### PostgreSQL Configuration + +#### Calculate max_connections + +```bash +# Formula +max_connections = (num_pods × num_workers × pool_size × 1.2) + buffer + +# Example: 10 pods, 8 workers, 50 pool size +max_connections = (10 × 8 × 50 × 1.2) + 
200 = 5000 connections +``` + +#### PostgreSQL Configuration File + +```ini +# postgresql.conf +max_connections = 5000 +shared_buffers = 16GB # 25% of RAM +effective_cache_size = 48GB # 75% of RAM +work_mem = 16MB # Per operation +maintenance_work_mem = 2GB +``` + +#### Managed Services + +**IBM Cloud Databases for PostgreSQL:** +```bash +# Increase max_connections via CLI +ibmcloud cdb deployment-configuration postgres \ + --configuration max_connections=5000 +``` + +**AWS RDS:** +```bash +# Via parameter group +max_connections = {DBInstanceClassMemory/9531392} +``` + +**Google Cloud SQL:** +```bash +# Auto-scales based on instance size +# 4 vCPU = 400 connections +# 8 vCPU = 800 connections +``` + +### Connection Pool Monitoring + +```python +# Health endpoint checks pool status +@app.get("/health") +async def healthcheck(db: Session = Depends(get_db)): + try: + db.execute(text("SELECT 1")) + return {"status": "healthy"} + except Exception as e: + return {"status": "unhealthy", "error": str(e)} +``` + +```bash +# Check PostgreSQL connections +kubectl exec -it postgres-pod -- psql -U admin -d postgresdb \ + -c "SELECT count(*) FROM pg_stat_activity;" +``` + +--- + +## 6. 
Redis for Distributed Caching + +### Architecture + +Redis provides shared state across all Gateway pods: + +- **Session storage**: User sessions (TTL: 3600s) +- **Message cache**: Ephemeral data (TTL: 600s) +- **Federation cache**: Gateway peer discovery + +### Configuration + +#### Enable Redis Caching + +```bash +# .env or Kubernetes ConfigMap +CACHE_TYPE=redis +REDIS_URL=redis://redis-service:6379/0 +CACHE_PREFIX=mcpgw: +SESSION_TTL=3600 +MESSAGE_TTL=600 +REDIS_MAX_RETRIES=3 +REDIS_RETRY_INTERVAL_MS=2000 +``` + +#### Kubernetes Deployment + +```yaml +# charts/mcp-stack/values.yaml +redis: + enabled: true + + resources: + limits: + cpu: 2000m + memory: 4Gi + requests: + cpu: 1000m + memory: 2Gi + + # Enable persistence + persistence: + enabled: true + size: 10Gi +``` + +### Redis Sizing + +**Memory calculation:** +- Sessions: `concurrent_users × 50KB` +- Messages: `messages_per_minute × 100KB × (TTL/60)` + +**Example:** +- 10,000 users × 50KB = 500MB +- 1,000 msg/min × 100KB × 10min = 1GB +- **Total: 1.5GB + 50% overhead = 2.5GB** + +### High Availability + +**Redis Sentinel** (3+ nodes): +```yaml +redis: + sentinel: + enabled: true + quorum: 2 + + replicas: 3 # 1 primary + 2 replicas +``` + +**Redis Cluster** (6+ nodes): +```bash +REDIS_URL=redis://redis-cluster:6379/0?cluster=true +``` + +--- + +## 7. 
Performance Tuning + +### Application Architecture Performance + +MCP Gateway's technology stack is optimized for high performance: + +**Rust-Powered Components:** +- **Pydantic v2** (5-50x faster validation via Rust core) +- **Uvicorn** (ASGI server with Rust-based httptools) + +**Async-First Design:** +- **FastAPI** (async request handling) +- **SQLAlchemy 2.0** (async database operations) +- **asyncio** event loop per worker + +**Performance characteristics:** +- Request validation: **< 1ms** (Pydantic v2 Rust core) +- JSON serialization: **3-5x faster** than pure Python +- Database queries: Non-blocking async I/O +- Concurrent requests per worker: **1000+** (async event loop) + +### System-Level Optimization + +#### Kernel Parameters + +```bash +# /etc/sysctl.conf +net.core.somaxconn=4096 +net.ipv4.tcp_max_syn_backlog=4096 +net.ipv4.ip_local_port_range=1024 65535 +net.ipv4.tcp_tw_reuse=1 +fs.file-max=2097152 + +# Apply changes +sysctl -p +``` + +#### File Descriptors + +```bash +# /etc/security/limits.conf +* soft nofile 1048576 +* hard nofile 1048576 + +# Verify +ulimit -n +``` + +### Gunicorn Tuning + +#### Optimal Settings + +```python +# gunicorn.config.py +workers = (CPU_cores × 2) + 1 +timeout = 600 # Long enough for LLM calls +max_requests = 100000 # Prevent memory leaks +max_requests_jitter = 100 # Randomize restart +preload_app = True # Reduce memory +reuse_port = True # Load balance across workers +``` + +#### Worker Class Selection + +**UvicornWorker** (default - best for async): +```python +worker_class = "uvicorn.workers.UvicornWorker" +``` + +**Gevent** (alternative for I/O-heavy): +```bash +pip install gunicorn[gevent] +worker_class = "gevent" +worker_connections = 1000 +``` + +### Application Tuning + +```bash +# Resource limits +TOOL_TIMEOUT=60 +TOOL_CONCURRENT_LIMIT=10 +RESOURCE_CACHE_SIZE=1000 +RESOURCE_CACHE_TTL=3600 + +# Retry configuration +RETRY_MAX_ATTEMPTS=3 +RETRY_BASE_DELAY=1.0 +RETRY_MAX_DELAY=60 + +# Health check intervals 
+HEALTH_CHECK_INTERVAL=60 +HEALTH_CHECK_TIMEOUT=10 +UNHEALTHY_THRESHOLD=3 +``` + +--- + +## 8. Benchmarking and Load Testing + +### Tools + +**hey** - HTTP load generator +```bash +# Install +brew install hey # macOS +sudo apt install hey # Ubuntu + +# Or from source +go install github.com/rakyll/hey@latest +``` + +**k6** - Modern load testing +```bash +brew install k6 # macOS +``` + +### Baseline Test + +#### Prepare Environment + +```bash +# Get JWT token +export MCPGATEWAY_BEARER_TOKEN=$(python3 -m mcpgateway.utils.create_jwt_token \ + --username admin@example.com --exp 0 --secret my-test-key) + +# Create test payload +cat > payload.json <1000 RPS per pod) +- **P99 latency**: 99th percentile (target: <500ms) +- **Error rate**: 5xx responses (target: <0.1%) + +### Kubernetes Load Test + +```bash +# Deploy test pod +kubectl run load-test --image=williamyeh/hey:latest \ + --rm -it --restart=Never -- \ + -n 100000 -c 500 \ + -H "Authorization: Bearer $TOKEN" \ + http://mcp-gateway-service/ +``` + +### Advanced: k6 Script + +```javascript +// load-test.k6.js +import http from 'k6/http'; +import { check } from 'k6'; + +export let options = { + stages: [ + { duration: '2m', target: 100 }, // Ramp up + { duration: '5m', target: 100 }, // Sustained + { duration: '2m', target: 500 }, // Spike + { duration: '5m', target: 500 }, // High load + { duration: '2m', target: 0 }, // Ramp down + ], + thresholds: { + http_req_duration: ['p(99)<500'], // 99% < 500ms + http_req_failed: ['rate<0.01'], // <1% errors + }, +}; + +export default function () { + const payload = JSON.stringify({ + jsonrpc: '2.0', + id: 1, + method: 'tools/list', + params: {}, + }); + + const res = http.post('http://localhost:4444/', payload, { + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${__ENV.TOKEN}`, + }, + }); + + check(res, { + 'status is 200': (r) => r.status === 200, + 'response time < 500ms': (r) => r.timings.duration < 500, + }); +} +``` + +```bash +# Run k6 test 
+TOKEN=$MCPGATEWAY_BEARER_TOKEN k6 run load-test.k6.js +``` + +--- + +## 9. Health Checks and Readiness + +### Health Check Endpoints + +MCP Gateway provides two health endpoints: + +#### Liveness Probe: `/health` + +**Purpose**: Is the application alive? + +```python +@app.get("/health") +async def healthcheck(db: Session = Depends(get_db)): + """Check database connectivity""" + try: + db.execute(text("SELECT 1")) + return {"status": "healthy"} + except Exception as e: + return {"status": "unhealthy", "error": str(e)} +``` + +**Response:** +```json +{ + "status": "healthy" +} +``` + +#### Readiness Probe: `/ready` + +**Purpose**: Is the application ready to receive traffic? + +```python +@app.get("/ready") +async def readiness_check(db: Session = Depends(get_db)): + """Check if ready to serve traffic""" + try: + await asyncio.to_thread(db.execute, text("SELECT 1")) + return JSONResponse({"status": "ready"}, status_code=200) + except Exception as e: + return JSONResponse( + {"status": "not ready", "error": str(e)}, + status_code=503 + ) +``` + +### Kubernetes Probe Configuration + +```yaml +# charts/mcp-stack/templates/deployment-mcpgateway.yaml +containers: + - name: mcp-context-forge + + # Startup probe (initial readiness) + startupProbe: + exec: + command: + - python3 + - /app/mcpgateway/utils/db_isready.py + - --max-tries=1 + - --timeout=2 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 60 # 5 minutes max + + # Readiness probe (traffic routing) + readinessProbe: + httpGet: + path: /ready + port: 4444 + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 2 + successThreshold: 1 + failureThreshold: 3 + + # Liveness probe (restart if unhealthy) + livenessProbe: + httpGet: + path: /health + port: 4444 + initialDelaySeconds: 10 + periodSeconds: 15 + timeoutSeconds: 2 + successThreshold: 1 + failureThreshold: 3 +``` + +### Probe Tuning Guidelines + +**Startup Probe:** +- Use for slow initialization (database migrations, model loading) +- 
`failureThreshold × periodSeconds` = max startup time +- Example: 60 × 5s = 5 minutes + +**Readiness Probe:** +- Aggressive: Remove pod from load balancer quickly +- `failureThreshold` = 3 (fail fast) +- `periodSeconds` = 10 (frequent checks) + +**Liveness Probe:** +- Conservative: Avoid unnecessary restarts +- `failureThreshold` = 5 (tolerate transient issues) +- `periodSeconds` = 15 (less frequent) + +### Monitoring Health + +```bash +# Check pod health +kubectl get pods -n mcp-gateway + +# Detailed status +kubectl describe pod -n mcp-gateway + +# Check readiness +kubectl get pods -n mcp-gateway \ + -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' + +# Test health endpoint +kubectl exec -it -n mcp-gateway -- \ + curl http://localhost:4444/health + +# View probe failures +kubectl get events -n mcp-gateway \ + --field-selector involvedObject.name= +``` + +--- + +## 10. Stateless Architecture and Long-Running Connections + +### Stateless Design Principles + +MCP Gateway is designed to be **stateless**, enabling horizontal scaling: + +1. **No local session storage**: All sessions in Redis +2. **No in-memory caching** (in production): Use Redis +3. **Database-backed state**: All data in PostgreSQL +4. 
**Shared configuration**: Environment variables via ConfigMap + +### Session Management + +#### Stateful Sessions (Not Recommended for Scale) + +```bash +USE_STATEFUL_SESSIONS=true # Event store in database +``` + +**Limitations:** +- Sessions tied to specific pods +- Requires sticky sessions (session affinity) +- Doesn't scale horizontally + +#### Stateless Sessions (Recommended) + +```bash +USE_STATEFUL_SESSIONS=false +JSON_RESPONSE_ENABLED=true +CACHE_TYPE=redis +``` + +**Benefits:** +- Any pod can handle any request +- True horizontal scaling +- Automatic failover + +### Long-Running Connections + +MCP Gateway supports long-running connections for streaming: + +#### Server-Sent Events (SSE) + +```python +# Endpoint: /servers/{id}/sse +@app.get("/servers/{server_id}/sse") +async def sse_endpoint(server_id: int): + """Stream events to client""" + # Connection can last minutes/hours +``` + +#### WebSocket + +```python +# Endpoint: /servers/{id}/ws +@app.websocket("/servers/{server_id}/ws") +async def websocket_endpoint(server_id: int): + """Bidirectional streaming""" +``` + +### Load Balancer Configuration + +**Kubernetes Service** (default): +```yaml +# Distributes connections across pods +apiVersion: v1 +kind: Service +metadata: + name: mcp-gateway-service +spec: + type: ClusterIP + sessionAffinity: None # No sticky sessions + ports: + - port: 80 + targetPort: 4444 +``` + +**NGINX Ingress** (for WebSocket): +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + nginx.ingress.kubernetes.io/websocket-services: "mcp-gateway-service" +spec: + rules: + - host: gateway.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: mcp-gateway-service + port: + number: 80 +``` + +### Connection Lifecycle + +``` +Client → Load Balancer → Pod A (SSE stream) + ↓ + (Pod A dies) + ↓ +Client ← Load 
Balancer → Pod B (reconnect) +``` + +**Best practices:** +1. Client implements reconnection logic +2. Server sets `SSE_KEEPALIVE_INTERVAL=30` (keepalive events) +3. Load balancer timeout > keepalive interval + +--- + +## 11. Kubernetes Production Deployment + +### Reference Architecture + +```yaml +# production-values.yaml +mcpContextForge: + # --- Scaling --- + replicaCount: 5 + + hpa: + enabled: true + minReplicas: 5 + maxReplicas: 50 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + + # --- Resources --- + resources: + limits: + cpu: 4000m # 4 cores per pod + memory: 8Gi + requests: + cpu: 2000m # 2 cores per pod + memory: 4Gi + + # --- Configuration --- + config: + # Gunicorn + GUNICORN_WORKERS: "16" + GUNICORN_TIMEOUT: "600" + GUNICORN_MAX_REQUESTS: "100000" + GUNICORN_PRELOAD_APP: "true" + + # Database + DB_POOL_SIZE: "50" + DB_MAX_OVERFLOW: "10" + DB_POOL_TIMEOUT: "60" + DB_POOL_RECYCLE: "3600" + + # Cache + CACHE_TYPE: redis + CACHE_PREFIX: mcpgw: + SESSION_TTL: "3600" + MESSAGE_TTL: "600" + + # Performance + TOOL_CONCURRENT_LIMIT: "20" + RESOURCE_CACHE_SIZE: "2000" + + # --- Health Checks --- + probes: + startup: + type: exec + command: ["python3", "/app/mcpgateway/utils/db_isready.py"] + periodSeconds: 5 + failureThreshold: 60 + + readiness: + type: http + path: /ready + port: 4444 + periodSeconds: 10 + failureThreshold: 3 + + liveness: + type: http + path: /health + port: 4444 + periodSeconds: 15 + failureThreshold: 5 + +# --- PostgreSQL --- +postgres: + enabled: true + + resources: + limits: + cpu: 8000m # 8 cores + memory: 32Gi + requests: + cpu: 4000m + memory: 16Gi + + persistence: + enabled: true + size: 100Gi + storageClassName: fast-ssd + + # Connection limits + # max_connections = (50 pods × 16 workers × 50 pool × 1.2) + 200 + config: + max_connections: 50000 + shared_buffers: 8GB + effective_cache_size: 24GB + work_mem: 32MB + +# --- Redis --- +redis: + enabled: true + + resources: + limits: + cpu: 4000m + memory: 
16Gi + requests: + cpu: 2000m + memory: 8Gi + + persistence: + enabled: true + size: 50Gi +``` + +### Deployment Steps + +```bash +# 1. Create namespace +kubectl create namespace mcp-gateway + +# 2. Create secrets +kubectl create secret generic mcp-secrets \ + -n mcp-gateway \ + --from-literal=JWT_SECRET_KEY=$(openssl rand -hex 32) \ + --from-literal=AUTH_ENCRYPTION_SECRET=$(openssl rand -hex 32) \ + --from-literal=POSTGRES_PASSWORD=$(openssl rand -base64 32) + +# 3. Install with Helm +helm upgrade --install mcp-stack ./charts/mcp-stack \ + -n mcp-gateway \ + -f production-values.yaml \ + --wait \ + --timeout 10m + +# 4. Verify deployment +kubectl get pods -n mcp-gateway +kubectl get hpa -n mcp-gateway +kubectl get svc -n mcp-gateway + +# 5. Run migration job +kubectl get jobs -n mcp-gateway + +# 6. Test scaling +kubectl top pods -n mcp-gateway +``` + +### Pod Disruption Budget + +```yaml +# pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: mcp-gateway-pdb + namespace: mcp-gateway +spec: + minAvailable: 3 # Keep 3 pods always running + selector: + matchLabels: + app: mcp-gateway +``` + +### Network Policies + +```yaml +# network-policy.yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: mcp-gateway-policy + namespace: mcp-gateway +spec: + podSelector: + matchLabels: + app: mcp-gateway + policyTypes: + - Ingress + - Egress + ingress: + - from: + - podSelector: + matchLabels: + app: ingress-nginx + ports: + - protocol: TCP + port: 4444 + egress: + - to: + - podSelector: + matchLabels: + app: postgres + ports: + - protocol: TCP + port: 5432 + - to: + - podSelector: + matchLabels: + app: redis + ports: + - protocol: TCP + port: 6379 +``` + +--- + +## 12. 
Monitoring and Observability + +### OpenTelemetry Integration + +MCP Gateway includes built-in OpenTelemetry support: + +```bash +# Enable observability +OTEL_ENABLE_OBSERVABILITY=true +OTEL_TRACES_EXPORTER=otlp +OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4317 +OTEL_SERVICE_NAME=mcp-gateway +``` + +### Prometheus Metrics + +Deploy Prometheus stack: + +```bash +# Add Prometheus Helm repo +helm repo add prometheus-community \ + https://prometheus-community.github.io/helm-charts + +# Install kube-prometheus-stack +helm install prometheus prometheus-community/kube-prometheus-stack \ + -n monitoring \ + --create-namespace +``` + +### Key Metrics to Monitor + +**Application Metrics:** +- Request rate: `rate(http_requests_total[1m])` +- Latency: `histogram_quantile(0.99, http_request_duration_seconds)` +- Error rate: `rate(http_requests_total{status=~"5.."}[1m])` + +**System Metrics:** +- CPU usage: `container_cpu_usage_seconds_total` +- Memory usage: `container_memory_working_set_bytes` +- Network I/O: `container_network_receive_bytes_total` + +**Database Metrics:** +- Connection pool usage: `db_pool_size` / `db_pool_connections_active` +- Query latency: `db_query_duration_seconds` +- Deadlocks: `pg_stat_database_deadlocks` + +**HPA Metrics:** +```bash +kubectl get hpa -n mcp-gateway -w +``` + +### Grafana Dashboards + +Import dashboards: +1. **Kubernetes Cluster Monitoring** (ID: 7249) +2. **PostgreSQL** (ID: 9628) +3. **Redis** (ID: 11835) +4. 
**NGINX Ingress** (ID: 9614) + +### Alerting Rules + +```yaml +# prometheus-rules.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: mcp-gateway-alerts + namespace: monitoring +spec: + groups: + - name: mcp-gateway + interval: 30s + rules: + - alert: HighErrorRate + expr: | + rate(http_requests_total{status=~"5..", namespace="mcp-gateway"}[5m]) > 0.05 + for: 5m + annotations: + summary: "High error rate detected" + + - alert: HighLatency + expr: | + histogram_quantile(0.99, + rate(http_request_duration_seconds_bucket[5m])) > 1 + for: 5m + annotations: + summary: "P99 latency exceeds 1s" + + - alert: DatabaseConnectionPoolExhausted + expr: | + db_pool_connections_active / db_pool_size > 0.9 + for: 2m + annotations: + summary: "Database connection pool >90% utilized" +``` + +--- + +## Summary and Checklist + +### Performance Technology Stack + +MCP Gateway is built on a high-performance foundation: + +✅ **Pydantic v2.11+** - Rust-powered validation (5-50x faster than v1) +✅ **FastAPI** - Modern async framework with OpenAPI support +✅ **Uvicorn** - ASGI server with Rust-based HTTP parsing +✅ **SQLAlchemy 2.0** - Async database operations +✅ **Python 3.11+** - Current stable with excellent performance +🔮 **Python 3.14** - Future free-threading support (beta) + +### Scaling Checklist + +- [ ] **Vertical Scaling** + - [ ] Configure Gunicorn workers: `(2 × CPU) + 1` + - [ ] Allocate CPU: 1 core per 2 workers + - [ ] Allocate memory: 256MB + (workers × 200MB) + +- [ ] **Horizontal Scaling** + - [ ] Deploy to Kubernetes with HPA enabled + - [ ] Set `minReplicas` ≥ 3 for high availability + - [ ] Configure shared PostgreSQL and Redis + +- [ ] **Database Optimization** + - [ ] Calculate `max_connections`: `(pods × workers × pool) × 1.2` + - [ ] Set `DB_POOL_SIZE` per worker (recommended: 50) + - [ ] Configure `DB_POOL_RECYCLE=3600` to prevent stale connections + +- [ ] **Caching** + - [ ] Enable Redis: `CACHE_TYPE=redis` + - [ ] Set 
`REDIS_URL` to shared Redis instance + - [ ] Configure TTLs: `SESSION_TTL=3600`, `MESSAGE_TTL=600` + +- [ ] **Performance** + - [ ] Tune Gunicorn: `GUNICORN_PRELOAD_APP=true` + - [ ] Set timeouts: `GUNICORN_TIMEOUT=600` + - [ ] Configure retries: `RETRY_MAX_ATTEMPTS=3` + +- [ ] **Health Checks** + - [ ] Configure `/health` liveness probe + - [ ] Configure `/ready` readiness probe + - [ ] Set appropriate thresholds and timeouts + +- [ ] **Monitoring** + - [ ] Enable OpenTelemetry: `OTEL_ENABLE_OBSERVABILITY=true` + - [ ] Deploy Prometheus and Grafana + - [ ] Configure alerts for errors, latency, and resources + +- [ ] **Load Testing** + - [ ] Benchmark with `hey` or `k6` + - [ ] Target: >1000 RPS per pod, P99 <500ms + - [ ] Test failover scenarios + +### Reference Documentation + +- [Gunicorn Configuration](../deployment/local.md) +- [Kubernetes Deployment](../deployment/kubernetes.md) +- [Helm Charts](../deployment/helm.md) +- [Performance Testing](../testing/performance.md) +- [Observability](observability.md) +- [Configuration Guide](configuration.md) +- [Database Tuning](tuning.md) + +--- + +## Additional Resources + +### External Links + +- [Gunicorn Documentation](https://docs.gunicorn.org/) +- [Kubernetes HPA](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) +- [PostgreSQL Connection Pooling](https://www.postgresql.org/docs/current/runtime-config-connection.html) +- [Redis Cluster](https://redis.io/docs/reference/cluster-spec/) +- [OpenTelemetry Python](https://opentelemetry.io/docs/instrumentation/python/) + +### Community + +- [GitHub Discussions](https://github.com/ibm/mcp-context-forge/discussions) +- [Issue Tracker](https://github.com/ibm/mcp-context-forge/issues) + +--- + +*Last updated: 2025-10-02* diff --git a/docs/docs/overview/.pages b/docs/docs/overview/.pages index 31eef9d2f..816318c6b 100644 --- a/docs/docs/overview/.pages +++ b/docs/docs/overview/.pages @@ -6,3 +6,4 @@ nav: - ui-concepts.md - passthrough.md - tags.md + 
- session_pooling.md diff --git a/docs/docs/overview/features.md b/docs/docs/overview/features.md index 8be56910c..964aa0ce9 100644 --- a/docs/docs/overview/features.md +++ b/docs/docs/overview/features.md @@ -27,6 +27,20 @@ adding auth, caching, federation, and an HTMX-powered Admin UI. --- +## ♻️ Session Pooling & Stateful Sessions + +MCP Gateway supports **session pooling** for SSE and WebSocket transports. When enabled, a single session is reused for each (user, server) pair, improving performance and enabling stateful workflows. + +**Key benefits:** +- Faster response times (fewer round-trips) +- Lower memory and connection usage +- Maintains conversational or tool state across calls +- Fully compatible with multi-worker deployments (with Redis backend) + +See [Session Pooling](session_pooling.md) for configuration and details. + +--- + ## 🌍 Federation & Discovery ??? summary "Features" diff --git a/docs/docs/overview/session_pooling.md b/docs/docs/overview/session_pooling.md new file mode 100644 index 000000000..a2c790d6e --- /dev/null +++ b/docs/docs/overview/session_pooling.md @@ -0,0 +1,74 @@ +# Session Pooling & Stateful Sessions in MCP Gateway + +## Overview + +MCP Gateway supports **session pooling** and **stateful session reuse** for SSE and WebSocket transports. This feature dramatically improves performance and enables stateful workflows by reusing a single MCP session across multiple tool calls for the same user and server. + +--- + +## Why Session Pooling? + +- **Performance**: Reduces network round-trips and memory usage by reusing sessions. +- **State Continuity**: Maintains conversational or tool state across calls. +- **Resource Efficiency**: Fewer open connections and lower backend load. +- **Protocol Compliance**: Aligns with MCP spec expectations for stateful sessions. + +--- + +## How It Works + +- When enabled, the gateway will reuse a session for the same `(user, server)` pair across requests. 
+- Sessions are pooled and tracked with idle timeouts and per-user limits. +- Pooling can be enabled globally or per-server. +- Idle sessions are evicted automatically. +- Auth context and user isolation are always enforced. + +--- + +## Configuration + +Add these to your `.env` or config: + +```env +SESSION_POOLING_ENABLED=true +SESSION_POOLING_SERVERS=server1,server2 # (optional, comma-separated) +SESSION_POOL_MAX_IDLE=600 # (seconds, default 600) +SESSION_POOL_USER_LIMIT=10 # (default 10) +``` + +--- + +## Observability + +- Pool hit/miss/evict metrics are tracked and can be exposed for monitoring. +- Idle session cleanup runs automatically. + +--- + +## Compatibility & Recommendations + +- **Redis backend** (`CACHE_TYPE=redis`): Best for multi-worker deployments. +- **Database backend**: Works, but may increase DB load. +- **Memory backend**: Only for single-process/dev. + +--- + +## Example: Enabling Session Pooling + +1. Set `SESSION_POOLING_ENABLED=true` in your `.env`. +2. (Optional) List specific servers in `SESSION_POOLING_SERVERS`. +3. Restart the gateway. + +--- + +## Backward Compatibility + +- With pooling disabled, the gateway behaves as before (per-request sessions). +- All isolation and security boundaries are preserved. 
+ +--- + +## See Also +- [Features](features.md) +- [Quick Start](quick_start.md) +- [FAQ](../faq/index.md) diff --git a/docs/docs/using/agents/.pages b/docs/docs/using/agents/.pages index 78218d648..8735c0e4b 100644 --- a/docs/docs/using/agents/.pages +++ b/docs/docs/using/agents/.pages @@ -1,6 +1,7 @@ title: Agents nav: - index.md + - a2a.md - langchain.md - langgraph.md - crewai.md diff --git a/docs/docs/using/mcpgateway-translate.md b/docs/docs/using/mcpgateway-translate.md index 99600239a..644c3a4db 100644 --- a/docs/docs/using/mcpgateway-translate.md +++ b/docs/docs/using/mcpgateway-translate.md @@ -106,6 +106,7 @@ python3 -m mcpgateway.translate \ | **Bidirectional communication** | Full duplex message flow in all modes | | **Session management** | Stateful sessions with event replay (streamable HTTP) | | **Flexible response modes** | Choose between SSE streams or JSON responses | +| **Dynamic environment injection** | Extract HTTP headers and inject as environment variables for multi-tenant support | | **Keep-alive support** | Automatic keepalive frames prevent connection timeouts | | **CORS configuration** | Enable cross-origin requests for web applications | | **Authentication** | OAuth2 Bearer token support for secure connections | @@ -185,6 +186,42 @@ Connect to a remote streamable HTTP endpoint. | `--messagePath ` | Message POST endpoint path | /message | | `--keepAlive ` | Keepalive interval | 30 | +### Dynamic Environment Variable Injection + +| Option | Description | Default | +|--------|-------------|---------| +| `--enable-dynamic-env` | Enable dynamic environment variable injection from HTTP headers | False | +| `--header-to-env ` | Map HTTP header to environment variable (can be specified multiple times) | None | + +**Use case**: Multi-tenant deployments where different users need different credentials passed to the MCP server. 
+ +**Example - GitHub Enterprise with per-user tokens**: +```bash +python3 -m mcpgateway.translate \ + --stdio "uvx mcp-server-github" \ + --expose-sse \ + --port 9000 \ + --enable-dynamic-env \ + --header-to-env "Authorization=GITHUB_TOKEN" \ + --header-to-env "X-GitHub-Enterprise-Host=GITHUB_HOST" +``` + +**Client request with headers**: +```bash +curl -X POST http://localhost:9000/message \ + -H "Authorization: Bearer ghp_user123token" \ + -H "X-GitHub-Enterprise-Host: github.company.com" \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' +``` + +**Security features**: +- Header names validated (alphanumeric + hyphens only) +- Environment variable names validated (standard naming rules) +- Values sanitized (dangerous characters removed, length limits enforced) +- Case-insensitive header matching +- Headers not provided in mappings are ignored + ## API Documentation ### SSE Mode Endpoints @@ -320,6 +357,41 @@ curl -X POST http://localhost:9001/message \ curl -N http://localhost:9001/sse ``` +### Multi-Tenant GitHub Enterprise + +Enable per-user GitHub tokens for enterprise deployments: + +```bash +# Start the bridge with dynamic environment injection +python3 -m mcpgateway.translate \ + --stdio "uvx mcp-server-github" \ + --expose-sse \ + --port 9000 \ + --enable-dynamic-env \ + --header-to-env "Authorization=GITHUB_TOKEN" \ + --header-to-env "X-GitHub-Enterprise-Host=GITHUB_HOST" + +# User A's request (uses their personal access token) +curl -X POST http://localhost:9000/message \ + -H "Authorization: Bearer ghp_userA_token123" \ + -H "X-GitHub-Enterprise-Host: github.company.com" \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"get_repositories"}}' + +# User B's request (uses their own token) +curl -X POST http://localhost:9000/message \ + -H "Authorization: Bearer ghp_userB_token456" \ + -H "X-GitHub-Enterprise-Host: github.company.com" \ + -H 
"Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"get_repositories"}}' +``` + +**Benefits**: +- Each user's credentials are isolated per request +- No shared token security risks +- Supports different enterprise hosts per user +- MCP server process restarts with new credentials for each request + ### Container Deployment ```dockerfile diff --git a/docs/docs/using/plugins/index.md b/docs/docs/using/plugins/index.md index 07aa84dff..d7284d4e4 100644 --- a/docs/docs/using/plugins/index.md +++ b/docs/docs/using/plugins/index.md @@ -20,7 +20,7 @@ The MCP Context Forge Plugin Framework provides a comprehensive, production-grad !!! details "Plugin Framework Specification" Check the [specification](https://ibm.github.io/mcp-context-forge/architecture/plugins/) docs for a detailed design of the plugin system. - + The plugin framework implements a **hybrid architecture** supporting both native and external service integrations: ### Native Plugins @@ -92,9 +92,9 @@ class MyPlugin(Plugin): super().__init__(config) async def prompt_pre_fetch(self, payload: PromptPrehookPayload, context: PluginContext) -> PromptPrehookResult: - # modify + # modify return PromptPrehookResult(modified_payload=payload) - + # or block # return PromptPrehookResult( # continue_processing=False, @@ -118,7 +118,7 @@ plugins: priority: 120 ``` -**External plugin quickstart:** +**External plugin quickstart:** !!! details "Plugins Lifecycle Guide" See the [plugin lifecycle guide](https://ibm.github.io/mcp-context-forge/using/plugins/lifecycle/) for building, testing, and serving extenal plugins. @@ -278,7 +278,7 @@ Available hook values for the `hooks` field: #### Condition Fields -Users may only want plugins to be invoked on specific servers, tools, and prompts. To address this, a set of conditionals can be applied to a plugin. 
The attributes in a conditional combine together in as a set of `and` operations, while each attribute list item is `or`ed with other items in the list. +Users may only want plugins to be invoked on specific servers, tools, and prompts. To address this, a set of conditionals can be applied to a plugin. The attributes in a conditional combine together in as a set of `and` operations, while each attribute list item is `or`ed with other items in the list. The `conditions` array contains objects that specify when plugins should execute: diff --git a/docs/docs/using/plugins/plugins.md b/docs/docs/using/plugins/plugins.md index c08c91fae..d1bc6eca6 100644 --- a/docs/docs/using/plugins/plugins.md +++ b/docs/docs/using/plugins/plugins.md @@ -4,9 +4,9 @@ MCP Context Forge provides a comprehensive collection of production-ready plugin ## Plugin Categories -- [Security & Safety](#security-safety) -- [Reliability & Performance](#reliability-performance) -- [Content Transformation & Formatting](#content-transformation-formatting) +- [Security & Safety](#security-safety) +- [Reliability & Performance](#reliability-performance) +- [Content Transformation & Formatting](#content-transformation-formatting) - [Content Filtering & Validation](#content-filtering-validation) - [Compliance & Governance](#compliance-governance) - [Network & Integration](#network-integration) diff --git a/docs/docs/using/servers/.pages b/docs/docs/using/servers/.pages index 5321c98df..dbc87bc93 100644 --- a/docs/docs/using/servers/.pages +++ b/docs/docs/using/servers/.pages @@ -5,3 +5,4 @@ nav: - Python Servers: python - External Servers: external - IBM Servers: ibm + - Hashicorp Servers: hashicorp diff --git a/mcpgateway/admin.py b/mcpgateway/admin.py index 537708e57..3c3b2ac3f 100644 --- a/mcpgateway/admin.py +++ b/mcpgateway/admin.py @@ -5746,9 +5746,14 @@ async def admin_add_gateway(request: Request, db: Session = Depends(get_db), use except (json.JSONDecodeError, ValueError): auth_headers = [] - # 
Parse OAuth configuration if present + # Parse OAuth configuration - support both JSON string and individual form fields oauth_config_json = str(form.get("oauth_config")) oauth_config: Optional[dict[str, Any]] = None + + LOGGER.info(f"DEBUG: oauth_config_json from form = '{oauth_config_json}'") + LOGGER.info(f"DEBUG: Individual OAuth fields - grant_type='{form.get('oauth_grant_type')}', issuer='{form.get('oauth_issuer')}'") + + # Option 1: Pre-assembled oauth_config JSON (from API calls) if oauth_config_json and oauth_config_json != "None": try: oauth_config = json.loads(oauth_config_json) @@ -5760,6 +5765,55 @@ async def admin_add_gateway(request: Request, db: Session = Depends(get_db), use LOGGER.error(f"Failed to parse OAuth config: {e}") oauth_config = None + # Option 2: Assemble from individual UI form fields + if not oauth_config: + oauth_grant_type = str(form.get("oauth_grant_type", "")) + oauth_issuer = str(form.get("oauth_issuer", "")) + oauth_token_url = str(form.get("oauth_token_url", "")) + oauth_authorization_url = str(form.get("oauth_authorization_url", "")) + oauth_redirect_uri = str(form.get("oauth_redirect_uri", "")) + oauth_client_id = str(form.get("oauth_client_id", "")) + oauth_client_secret = str(form.get("oauth_client_secret", "")) + oauth_username = str(form.get("oauth_username", "")) + oauth_password = str(form.get("oauth_password", "")) + oauth_scopes_str = str(form.get("oauth_scopes", "")) + + # If any OAuth field is provided, assemble oauth_config + if any([oauth_grant_type, oauth_issuer, oauth_token_url, oauth_authorization_url, oauth_client_id]): + oauth_config = {} + + if oauth_grant_type: + oauth_config["grant_type"] = oauth_grant_type + if oauth_issuer: + oauth_config["issuer"] = oauth_issuer + if oauth_token_url: + oauth_config["token_url"] = oauth_token_url # OAuthManager expects 'token_url', not 'token_endpoint' + if oauth_authorization_url: + oauth_config["authorization_url"] = oauth_authorization_url # OAuthManager expects 
'authorization_url', not 'authorization_endpoint' + if oauth_redirect_uri: + oauth_config["redirect_uri"] = oauth_redirect_uri + if oauth_client_id: + oauth_config["client_id"] = oauth_client_id + if oauth_client_secret: + # Encrypt the client secret + encryption = get_oauth_encryption(settings.auth_encryption_secret) + oauth_config["client_secret"] = encryption.encrypt_secret(oauth_client_secret) + + # Add username and password for password grant type + if oauth_username: + oauth_config["username"] = oauth_username + if oauth_password: + oauth_config["password"] = oauth_password + + # Parse scopes (comma or space separated) + if oauth_scopes_str: + scopes = [s.strip() for s in oauth_scopes_str.replace(",", " ").split() if s.strip()] + if scopes: + oauth_config["scopes"] = scopes + + LOGGER.info(f"✅ Assembled OAuth config from UI form fields: grant_type={oauth_grant_type}, issuer={oauth_issuer}") + LOGGER.info(f"DEBUG: Complete oauth_config = {oauth_config}") + visibility = str(form.get("visibility", "private")) # Handle passthrough_headers @@ -5773,13 +5827,22 @@ async def admin_add_gateway(request: Request, db: Session = Depends(get_db), use else: passthrough_headers = None + # Auto-detect OAuth: if oauth_config is present and auth_type not explicitly set, use "oauth" + auth_type_from_form = str(form.get("auth_type", "")) + LOGGER.info(f"DEBUG: auth_type from form: '{auth_type_from_form}', oauth_config present: {oauth_config is not None}") + if oauth_config and not auth_type_from_form: + auth_type_from_form = "oauth" + LOGGER.info("✅ Auto-detected OAuth configuration, setting auth_type='oauth'") + elif oauth_config and auth_type_from_form: + LOGGER.info(f"✅ OAuth config present with explicit auth_type='{auth_type_from_form}'") + gateway = GatewayCreate( name=str(form["name"]), url=str(form["url"]), description=str(form.get("description")), tags=tags, transport=str(form.get("transport", "SSE")), - auth_type=str(form.get("auth_type", "")), + 
auth_type=auth_type_from_form, auth_username=str(form.get("auth_username", "")), auth_password=str(form.get("auth_password", "")), auth_token=str(form.get("auth_token", "")), @@ -5997,9 +6060,11 @@ async def admin_edit_gateway( else: passthrough_headers = None - # Parse OAuth configuration if present + # Parse OAuth configuration - support both JSON string and individual form fields oauth_config_json = str(form.get("oauth_config")) oauth_config: Optional[dict[str, Any]] = None + + # Option 1: Pre-assembled oauth_config JSON (from API calls) if oauth_config_json and oauth_config_json != "None": try: oauth_config = json.loads(oauth_config_json) @@ -6011,6 +6076,54 @@ async def admin_edit_gateway( LOGGER.error(f"Failed to parse OAuth config: {e}") oauth_config = None + # Option 2: Assemble from individual UI form fields + if not oauth_config: + oauth_grant_type = str(form.get("oauth_grant_type", "")) + oauth_issuer = str(form.get("oauth_issuer", "")) + oauth_token_url = str(form.get("oauth_token_url", "")) + oauth_authorization_url = str(form.get("oauth_authorization_url", "")) + oauth_redirect_uri = str(form.get("oauth_redirect_uri", "")) + oauth_client_id = str(form.get("oauth_client_id", "")) + oauth_client_secret = str(form.get("oauth_client_secret", "")) + oauth_username = str(form.get("oauth_username", "")) + oauth_password = str(form.get("oauth_password", "")) + oauth_scopes_str = str(form.get("oauth_scopes", "")) + + # If any OAuth field is provided, assemble oauth_config + if any([oauth_grant_type, oauth_issuer, oauth_token_url, oauth_authorization_url, oauth_client_id]): + oauth_config = {} + + if oauth_grant_type: + oauth_config["grant_type"] = oauth_grant_type + if oauth_issuer: + oauth_config["issuer"] = oauth_issuer + if oauth_token_url: + oauth_config["token_url"] = oauth_token_url # OAuthManager expects 'token_url', not 'token_endpoint' + if oauth_authorization_url: + oauth_config["authorization_url"] = oauth_authorization_url # OAuthManager expects 
'authorization_url', not 'authorization_endpoint' + if oauth_redirect_uri: + oauth_config["redirect_uri"] = oauth_redirect_uri + if oauth_client_id: + oauth_config["client_id"] = oauth_client_id + if oauth_client_secret: + # Encrypt the client secret + encryption = get_oauth_encryption(settings.auth_encryption_secret) + oauth_config["client_secret"] = encryption.encrypt_secret(oauth_client_secret) + + # Add username and password for password grant type + if oauth_username: + oauth_config["username"] = oauth_username + if oauth_password: + oauth_config["password"] = oauth_password + + # Parse scopes (comma or space separated) + if oauth_scopes_str: + scopes = [s.strip() for s in oauth_scopes_str.replace(",", " ").split() if s.strip()] + if scopes: + oauth_config["scopes"] = scopes + + LOGGER.info(f"✅ Assembled OAuth config from UI form fields (edit): grant_type={oauth_grant_type}, issuer={oauth_issuer}") + user_email = get_user_email(user) # Determine personal team for default assignment team_id_raw = form.get("team_id", None) @@ -6019,13 +6132,19 @@ async def admin_edit_gateway( team_service = TeamManagementService(db) team_id = await team_service.verify_team_for_user(user_email, team_id) + # Auto-detect OAuth: if oauth_config is present and auth_type not explicitly set, use "oauth" + auth_type_from_form = str(form.get("auth_type", "")) + if oauth_config and not auth_type_from_form: + auth_type_from_form = "oauth" + LOGGER.info("Auto-detected OAuth configuration in edit, setting auth_type='oauth'") + gateway = GatewayUpdate( # Pydantic validation happens here name=str(form.get("name")), url=str(form["url"]), description=str(form.get("description")), transport=str(form.get("transport", "SSE")), tags=tags, - auth_type=str(form.get("auth_type", "")), + auth_type=auth_type_from_form, auth_username=str(form.get("auth_username", "")), auth_password=str(form.get("auth_password", "")), auth_token=str(form.get("auth_token", "")), @@ -9682,34 +9801,45 @@ async def 
catalog_partial( root_path = request.scope.get("root_path", "") # Calculate pagination - page_size = 100 + page_size = settings.mcpgateway_catalog_page_size offset = (page - 1) * page_size catalog_request = CatalogListRequest(category=category, auth_type=auth_type, search=search, show_available_only=False, limit=page_size, offset=offset) response = await catalog_service.get_catalog_servers(catalog_request, db) + # Get ALL servers (no filters, no pagination) for counting statistics + all_servers_request = CatalogListRequest(show_available_only=False, limit=1000, offset=0) + all_servers_response = await catalog_service.get_catalog_servers(all_servers_request, db) + + # Pass filter parameters to template for pagination links + filter_params = { + "category": category, + "auth_type": auth_type, + "search": search, + } + # Calculate statistics and pagination info total_servers = response.total registered_count = sum(1 for s in response.servers if s.is_registered) total_pages = (total_servers + page_size - 1) // page_size # Ceiling division - # Count servers by category, auth type, and provider + # Count ALL servers by category, auth type, and provider (not just current page) servers_by_category = {} servers_by_auth_type = {} servers_by_provider = {} - for server in response.servers: + for server in all_servers_response.servers: servers_by_category[server.category] = servers_by_category.get(server.category, 0) + 1 servers_by_auth_type[server.auth_type] = servers_by_auth_type.get(server.auth_type, 0) + 1 servers_by_provider[server.provider] = servers_by_provider.get(server.provider, 0) + 1 stats = { - "total_servers": total_servers, + "total_servers": all_servers_response.total, # Use total from all servers "registered_servers": registered_count, - "categories": response.categories, - "auth_types": response.auth_types, - "providers": response.providers, + "categories": all_servers_response.categories, + "auth_types": all_servers_response.auth_types, + "providers": 
all_servers_response.providers, "servers_by_category": servers_by_category, "servers_by_auth_type": servers_by_auth_type, "servers_by_provider": servers_by_provider, @@ -9723,6 +9853,7 @@ async def catalog_partial( "page": page, "total_pages": total_pages, "page_size": page_size, + "filter_params": filter_params, } return request.app.state.templates.TemplateResponse("mcp_registry_partial.html", context) diff --git a/mcpgateway/alembic/versions/2f67b12600b4_add_registered_oauth_clients_table_for_.py b/mcpgateway/alembic/versions/2f67b12600b4_add_registered_oauth_clients_table_for_.py new file mode 100644 index 000000000..76f4d4c0b --- /dev/null +++ b/mcpgateway/alembic/versions/2f67b12600b4_add_registered_oauth_clients_table_for_.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +"""Add registered_oauth_clients table for DCR + +Revision ID: 2f67b12600b4 +Revises: 61ee11c482d6 +Create Date: 2025-09-30 15:51:10.600647 + +""" +# Standard +from typing import Sequence, Union + +# Third-Party +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. 
+revision: str = "2f67b12600b4" +down_revision: Union[str, Sequence[str], None] = "61ee11c482d6" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # Create registered_oauth_clients table for DCR (RFC 7591) + op.create_table( + "registered_oauth_clients", + sa.Column("id", sa.String(36), primary_key=True), + sa.Column("gateway_id", sa.String(36), sa.ForeignKey("gateways.id", ondelete="CASCADE"), nullable=False, index=True), + sa.Column("issuer", sa.String(500), nullable=False), + sa.Column("client_id", sa.String(500), nullable=False), + sa.Column("client_secret_encrypted", sa.Text, nullable=True), + sa.Column("redirect_uris", sa.Text, nullable=False), + sa.Column("grant_types", sa.Text, nullable=False), + sa.Column("response_types", sa.Text, nullable=True), + sa.Column("scope", sa.String(1000), nullable=True), + sa.Column("token_endpoint_auth_method", sa.String(50), server_default="client_secret_basic"), + sa.Column("registration_client_uri", sa.String(500), nullable=True), + sa.Column("registration_access_token_encrypted", sa.Text, nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("expires_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("is_active", sa.Boolean, server_default=sa.true()), + ) + + # Create unique index on (gateway_id, issuer) + op.create_index("idx_gateway_issuer", "registered_oauth_clients", ["gateway_id", "issuer"], unique=True) + + +def downgrade() -> None: + """Downgrade schema.""" + # Drop index and table + op.drop_index("idx_gateway_issuer", table_name="registered_oauth_clients") + op.drop_table("registered_oauth_clients") diff --git a/mcpgateway/alembic/versions/61ee11c482d6_add_code_verifier_to_oauth_states_for_.py b/mcpgateway/alembic/versions/61ee11c482d6_add_code_verifier_to_oauth_states_for_.py new file mode 100644 index 000000000..ac5e814b0 --- /dev/null +++ 
b/mcpgateway/alembic/versions/61ee11c482d6_add_code_verifier_to_oauth_states_for_.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +"""Add code_verifier to oauth_states for PKCE support + +Revision ID: 61ee11c482d6 +Revises: 0f81d4a5efe0 +Create Date: 2025-09-30 15:45:43.895080 + +""" +# Standard +from typing import Sequence, Union + +# Third-Party +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision: str = "61ee11c482d6" +down_revision: Union[str, Sequence[str], None] = "0f81d4a5efe0" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # Check if oauth_states table exists before adding column + conn = op.get_bind() + inspector = sa.inspect(conn) + + if "oauth_states" not in inspector.get_table_names(): + print("oauth_states table not found. Skipping PKCE code_verifier migration.") + return + + # Add code_verifier column to oauth_states for PKCE support (RFC 7636) + op.add_column("oauth_states", sa.Column("code_verifier", sa.String(128), nullable=True)) + + +def downgrade() -> None: + """Downgrade schema.""" + # Check if oauth_states table exists before dropping column + conn = op.get_bind() + inspector = sa.inspect(conn) + + if "oauth_states" not in inspector.get_table_names(): + print("oauth_states table not found. 
Skipping PKCE code_verifier downgrade.") + return + + # Remove code_verifier column from oauth_states + op.drop_column("oauth_states", "code_verifier") diff --git a/mcpgateway/cache/session_registry.py b/mcpgateway/cache/session_registry.py index 3679f4267..7d8ae4ff0 100644 --- a/mcpgateway/cache/session_registry.py +++ b/mcpgateway/cache/session_registry.py @@ -297,6 +297,121 @@ def __init__( self._lock = asyncio.Lock() self._cleanup_task = None + # --- Session Pooling for SSE/WS --- + # Key: (user, server_id) -> session_id + + _pooled_sessions: Dict[tuple[str, str], dict] = {} + _pool_metrics: Dict[str, int] = {"hit": 0, "miss": 0, "evict": 0} + _cleanup_task: Optional[asyncio.Task] = None + + async def start_pool_cleanup(self, interval: int = 60): + """Start periodic cleanup of idle pooled sessions. + + Args: + interval: Cleanup interval in seconds. Default is 60 seconds. + """ + if self._cleanup_task: + return + + async def _run(): + while True: + await asyncio.sleep(interval) + await self.cleanup_idle_sessions() + self._cleanup_task = asyncio.create_task(_run()) + + async def stop_pool_cleanup(self): + """Stop periodic cleanup of idle pooled sessions. + """ + if self._cleanup_task: + self._cleanup_task.cancel() + self._cleanup_task = None + + async def get_pooled_session(self, user: str, server_id: str) -> Optional[str]: + """Return session_id for (user, server_id) if pooled and valid, else None. Increments hit/miss metrics. 
+ + Args: + user: User identifier + server_id: Server identifier + + Returns: + Session_id if found and valid, else None + """ + key = (user, server_id) + entry = self._pooled_sessions.get(key) + if not entry: + self._pool_metrics["miss"] += 1 + return None + # Check idle timeout + max_idle = getattr(settings, "session_pool_max_idle", 600) + if time.time() - entry["last_used"] > max_idle: + await self.remove_session(entry["session_id"]) + self._pooled_sessions.pop(key, None) + self._pool_metrics["evict"] += 1 + return None + self._pool_metrics["hit"] += 1 + return entry["session_id"] + + async def pool_session(self, user: str, server_id: str, session_id: str): + """Add or update pooled session for (user, server_id). + + Args: + user: User identifier + server_id: Server identifier + session_id: Session identifier to pool + """ + key = (user, server_id) + self._pooled_sessions[key] = {"session_id": session_id, "last_used": time.time()} + + async def touch_pooled_session(self, user: str, server_id: str): + """Update last_used timestamp for pooled session. + + Args: + user: User identifier + server_id: Server identifier + """ + key = (user, server_id) + if key in self._pooled_sessions: + self._pooled_sessions[key]["last_used"] = time.time() + + async def evict_pooled_session(self, user: str, server_id: str): + """Evict pooled session for (user, server_id). + + Args: + user: User identifier + server_id: Server identifier + """ + key = (user, server_id) + entry = self._pooled_sessions.pop(key, None) + if entry: + await self.remove_session(entry["session_id"]) + + async def cleanup_idle_sessions(self): + """Evict idle pooled sessions (called periodically). Increments evict metric.
+ """ + now = time.time() + max_idle = getattr(settings, "session_pool_max_idle", 600) + for key, entry in list(self._pooled_sessions.items()): + if now - entry["last_used"] > max_idle: + await self.remove_session(entry["session_id"]) + self._pooled_sessions.pop(key, None) + self._pool_metrics["evict"] += 1 + + def get_pool_metrics(self) -> dict: + """Return current pool hit/miss/evict counts. + + Returns: +<<<<<<< HEAD + A dict with hit/miss/evict counts +======= + dict: A dict with hit/miss/evict counts +>>>>>>> b4739a5 (fix lint issues) + """ + return dict(self._pool_metrics) + async def initialize(self) -> None: """Initialize the registry with async setup. diff --git a/mcpgateway/config.py b/mcpgateway/config.py index 17f6191b0..f24d1ed4f 100644 --- a/mcpgateway/config.py +++ b/mcpgateway/config.py @@ -29,6 +29,19 @@ - PROMPT_CACHE_SIZE: Max cached prompts (default: 100) - HEALTH_CHECK_INTERVAL: Gateway health check interval (default: 60) + +Session Pooling (SSE/WS): +- SESSION_POOLING_ENABLED: Enable session pooling for SSE/WS (default: false) +- SESSION_POOLING_SERVERS: Comma-separated list of server IDs for which pooling is enabled (overrides global) +- SESSION_POOL_MAX_IDLE: Max idle time (seconds) before pooled session is evicted (default: 600) +- SESSION_POOL_USER_LIMIT: Max pooled sessions per user (default: 10) + +Example .env: +# SESSION_POOLING_ENABLED=true +# SESSION_POOLING_SERVERS=server1,server2 +# SESSION_POOL_MAX_IDLE=600 +# SESSION_POOL_USER_LIMIT=10 + Examples: >>> from mcpgateway.config import Settings >>> s = Settings(basic_auth_user='admin', basic_auth_password='secret') @@ -226,6 +239,39 @@ class Settings(BaseSettings): oauth_request_timeout: int = Field(default=30, description="OAuth request timeout in seconds") oauth_max_retries: int = Field(default=3, description="Maximum retries for OAuth token requests") + # =================================== + # Dynamic Client Registration (DCR) - Client Mode + # =================================== 
+ + # Enable DCR client functionality + dcr_enabled: bool = Field(default=True, description="Enable Dynamic Client Registration (RFC 7591) - gateway acts as DCR client") + + # Auto-register when missing credentials + dcr_auto_register_on_missing_credentials: bool = Field(default=True, description="Automatically register with AS when gateway has issuer but no client_id") + + # Default scopes for DCR + dcr_default_scopes: List[str] = Field(default=["mcp:read"], description="Default MCP scopes to request during DCR") + + # Issuer allowlist (empty = allow any) + dcr_allowed_issuers: List[str] = Field(default_factory=list, description="Optional allowlist of issuer URLs for DCR (empty = allow any)") + + # Token endpoint auth method + dcr_token_endpoint_auth_method: str = Field(default="client_secret_basic", description="Token endpoint auth method for DCR (client_secret_basic or client_secret_post)") + + # Metadata cache TTL + dcr_metadata_cache_ttl: int = Field(default=3600, description="AS metadata cache TTL in seconds (RFC 8414 discovery)") + + # Client name template + dcr_client_name_template: str = Field(default="MCP Gateway ({gateway_name})", description="Template for client_name in DCR requests") + + # =================================== + # OAuth Discovery (RFC 8414) + # =================================== + + oauth_discovery_enabled: bool = Field(default=True, description="Enable OAuth AS metadata discovery (RFC 8414)") + + oauth_preferred_code_challenge_method: str = Field(default="S256", description="Preferred PKCE code challenge method (S256 or plain)") + # Email-Based Authentication email_auth_enabled: bool = Field(default=True, description="Enable email-based authentication") platform_admin_email: str = Field(default="admin@example.com", description="Platform administrator email address") @@ -278,6 +324,7 @@ class Settings(BaseSettings): mcpgateway_catalog_file: str = Field(default="mcp-catalog.yml", description="Path to catalog configuration file") 
mcpgateway_catalog_auto_health_check: bool = Field(default=True, description="Automatically health check catalog servers") mcpgateway_catalog_cache_ttl: int = Field(default=3600, description="Catalog cache TTL in seconds") + mcpgateway_catalog_page_size: int = Field(default=100, description="Number of catalog servers per page") # Security skip_ssl_verify: bool = False @@ -792,6 +839,23 @@ def parse_issuers(cls, v): use_stateful_sessions: bool = False # Set to False to use stateless sessions without event store json_response_enabled: bool = True # Enable JSON responses instead of SSE streams + # Session Pooling (SSE/WS) + session_pooling_enabled: bool = False # Enable session pooling for SSE/WS transports + session_pooling_servers: list[str] = Field(default_factory=list, description="Server IDs for which pooling is enabled (overrides global)") + session_pool_max_idle: int = 600 # Max idle time (seconds) before session is evicted + session_pool_user_limit: int = 10 # Max pooled sessions per user + + # # Observability: session pool metrics will be exposed if enabled + # session_pool_metrics_enabled: bool = True + # session_pool_eviction_interval: int = 60 # Interval (seconds) to check for idle sessions to evict + # session_pool_cleanup_interval: int = 300 # Interval (seconds) to clean up closed sessions + # session_pool_max_size: int = 1000 # Max number of pooled sessions in memory + # session_pool_check_interval: int = 30 # Interval (seconds) to check session health + # session_pool_eviction_batch_size: int = 50 # Number of sessions to evict in each eviction run + # session_pool_warning_threshold: int = 800 # Threshold to log warnings about high pool usage + # session_pool_error_threshold: int = 900 # Threshold to log errors about critical pool usage + # session_pool_stats_interval: int = 60 # Interval (seconds) to log session pool statistics + # Core plugin settings plugins_enabled: bool = Field(default=False, description="Enable the plugin framework") 
plugin_config_file: str = Field(default="plugins/config.yaml", description="Path to main plugin configuration file") diff --git a/mcpgateway/db.py b/mcpgateway/db.py index abac1120c..7799c48d4 100644 --- a/mcpgateway/db.py +++ b/mcpgateway/db.py @@ -2476,6 +2476,9 @@ class Gateway(Base): # Relationship with OAuth tokens oauth_tokens: Mapped[List["OAuthToken"]] = relationship("OAuthToken", back_populates="gateway", cascade="all, delete-orphan") + # Relationship with registered OAuth clients (DCR) + registered_oauth_clients: Mapped[List["RegisteredOAuthClient"]] = relationship("RegisteredOAuthClient", back_populates="gateway", cascade="all, delete-orphan") + __table_args__ = ( UniqueConstraint("team_id", "owner_email", "slug", name="uq_team_owner_slug_gateway"), UniqueConstraint("team_id", "owner_email", "url", name="uq_team_owner_url_gateway"), @@ -2714,6 +2717,7 @@ class OAuthState(Base): id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: uuid.uuid4().hex) gateway_id: Mapped[str] = mapped_column(String(36), ForeignKey("gateways.id", ondelete="CASCADE"), nullable=False) state: Mapped[str] = mapped_column(String(500), nullable=False, unique=True) # The state parameter + code_verifier: Mapped[Optional[str]] = mapped_column(String(128), nullable=True) # PKCE code verifier (RFC 7636) expires_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False) used: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utc_now) @@ -2725,6 +2729,46 @@ class OAuthState(Base): __table_args__ = (Index("idx_oauth_state_lookup", "gateway_id", "state"),) +class RegisteredOAuthClient(Base): + """Stores dynamically registered OAuth clients (RFC 7591 client mode). + + This model maintains client credentials obtained through Dynamic Client + Registration with upstream Authorization Servers. 
+ """ + + __tablename__ = "registered_oauth_clients" + + id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid.uuid4())) + gateway_id: Mapped[str] = mapped_column(String(36), ForeignKey("gateways.id", ondelete="CASCADE"), nullable=False, index=True) + + # Registration details + issuer: Mapped[str] = mapped_column(String(500), nullable=False) # AS issuer URL + client_id: Mapped[str] = mapped_column(String(500), nullable=False) + client_secret_encrypted: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Encrypted + + # RFC 7591 fields + redirect_uris: Mapped[str] = mapped_column(Text, nullable=False) # JSON array + grant_types: Mapped[str] = mapped_column(Text, nullable=False) # JSON array + response_types: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # JSON array + scope: Mapped[Optional[str]] = mapped_column(String(1000), nullable=True) + token_endpoint_auth_method: Mapped[str] = mapped_column(String(50), default="client_secret_basic") + + # Registration management (RFC 7591 section 4) + registration_client_uri: Mapped[Optional[str]] = mapped_column(String(500), nullable=True) + registration_access_token_encrypted: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + + # Metadata + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utc_now) + expires_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + is_active: Mapped[bool] = mapped_column(Boolean, default=True) + + # Relationships + gateway: Mapped["Gateway"] = relationship("Gateway", back_populates="registered_oauth_clients") + + # Unique constraint: one registration per gateway+issuer + __table_args__ = (Index("idx_gateway_issuer", "gateway_id", "issuer", unique=True),) + + class EmailApiToken(Base): """Email user API token model for token catalog management. 
diff --git a/mcpgateway/main.py b/mcpgateway/main.py index c1006b519..af8d62de2 100644 --- a/mcpgateway/main.py +++ b/mcpgateway/main.py @@ -186,10 +186,20 @@ message_ttl=settings.message_ttl, ) +# Start periodic cleanup for pooled sessions if enabled +if getattr(settings, "session_pooling_enabled", False): + try: + interval = max(10, int(getattr(settings, "session_pool_max_idle", 60) // 2)) + asyncio.get_event_loop().create_task(session_registry.start_pool_cleanup(interval=interval)) + except Exception as e: + logger.warning(f"Could not start session pool cleanup: {e}") # Helper function for authentication compatibility + + def get_user_email(user): - """Extract email from user object, handling both string and dict formats. + """ + Extract email from user object, handling both string and dict formats. Args: user: User object, can be either a dict (new RBAC format) or string (legacy format) @@ -1228,11 +1238,12 @@ async def ping(request: Request, user=Depends(get_current_user)) -> JSONResponse Raises: HTTPException: If the request method is not "ping". """ + req_id: Optional[str] = None try: body: dict = await request.json() if body.get("method") != "ping": raise HTTPException(status_code=400, detail="Invalid method") - req_id: Optional[str] = body.get("id") + req_id = body.get("id") logger.debug(f"Authenticated user {user} sent ping request.") # Return an empty result per the MCP ping specification. 
response: dict = {"jsonrpc": "2.0", "id": req_id, "result": {}} @@ -1240,7 +1251,7 @@ async def ping(request: Request, user=Depends(get_current_user)) -> JSONResponse except Exception as e: error_response: dict = { "jsonrpc": "2.0", - "id": None, # req_id not available in this scope + "id": req_id, # Now req_id is always defined "error": {"code": -32603, "message": "Internal error", "data": str(e)}, } return JSONResponse(status_code=500, content=error_response) @@ -1589,15 +1600,34 @@ async def sse_endpoint(request: Request, server_id: str, user=Depends(get_curren base_url = update_url_protocol(request) server_sse_url = f"{base_url}/servers/{server_id}" - transport = SSETransport(base_url=server_sse_url) - await transport.connect() - await session_registry.add_session(transport.session_id, transport) - response = await transport.create_sse_response(request) + # --- Session Pooling Logic --- + pooling_enabled = getattr(settings, "session_pooling_enabled", False) + pooling_servers = set(getattr(settings, "session_pooling_servers", [])) + user_id = get_user_email(user) + use_pool = pooling_enabled or server_id in pooling_servers + session_id = None + transport = None + if use_pool: + session_id = await session_registry.get_pooled_session(user_id, server_id) + if session_id: + transport = await session_registry.get_session(session_id) + if transport: + await session_registry.touch_pooled_session(user_id, server_id) + if not transport: + transport = SSETransport(base_url=server_sse_url) + await transport.connect() + await session_registry.add_session(transport.session_id, transport) + if use_pool: + await session_registry.pool_session(user_id, server_id, transport.session_id) + session_id = transport.session_id - asyncio.create_task(session_registry.respond(server_id, user, session_id=transport.session_id, base_url=base_url)) + response = await transport.create_sse_response(request) + asyncio.create_task(session_registry.respond(server_id, user, session_id=session_id, 
base_url=base_url)) tasks = BackgroundTasks() - tasks.add_task(session_registry.remove_session, transport.session_id) + # Only remove session if not pooled, else rely on idle cleanup + if not use_pool: + tasks.add_task(session_registry.remove_session, transport.session_id) response.background = tasks logger.info(f"SSE connection established: {transport.session_id}") return response @@ -3354,6 +3384,7 @@ async def handle_rpc(request: Request, db: Session = Depends(get_db), user=Depen PluginError: If encounters issue with plugin PluginViolationError: If plugin violated the request. Example - In case of OPA plugin, if the request is denied by policy. """ + req_id = None try: # Extract user identifier from either RBAC user object or JWT payload if hasattr(user, "email"): @@ -3546,6 +3577,30 @@ async def websocket_endpoint(websocket: WebSocket): await websocket.close(code=1008, reason="Invalid authentication") return + # --- WebSocket Session Pooling Logic --- + pooling_enabled = getattr(settings, "session_pooling_enabled", False) + pooling_servers = set(getattr(settings, "session_pooling_servers", [])) + # For demo, use 'default' as server_id; in real use, extract from path/query + server_id = websocket.query_params.get("server_id", "default") + user_id = websocket.query_params.get("user", "unknown") + use_pool = pooling_enabled or server_id in pooling_servers + session_id = None + transport = None + if use_pool: + session_id = await session_registry.get_pooled_session(user_id, server_id) + if session_id: + transport = await session_registry.get_session(session_id) + if transport: + await session_registry.touch_pooled_session(user_id, server_id) + if not transport: + from mcpgateway.transports.websocket_transport import WebSocketTransport + transport = WebSocketTransport(websocket) + await transport.connect() + await session_registry.add_session(id(websocket), transport) # Use id(websocket) as session_id for now + if use_pool: + await 
session_registry.pool_session(user_id, server_id, id(websocket)) + session_id = id(websocket) + await websocket.accept() while True: try: diff --git a/mcpgateway/routers/oauth_router.py b/mcpgateway/routers/oauth_router.py index 9d84089dc..d97f18090 100644 --- a/mcpgateway/routers/oauth_router.py +++ b/mcpgateway/routers/oauth_router.py @@ -23,9 +23,11 @@ from sqlalchemy.orm import Session # First-Party +from mcpgateway.config import settings from mcpgateway.db import Gateway, get_db from mcpgateway.middleware.rbac import get_current_user_with_permissions from mcpgateway.schemas import EmailUserResponse +from mcpgateway.services.dcr_service import DcrError, DcrService from mcpgateway.services.oauth_manager import OAuthError, OAuthManager from mcpgateway.services.token_storage_service import TokenStorageService @@ -35,13 +37,20 @@ @oauth_router.get("/authorize/{gateway_id}") -async def initiate_oauth_flow(gateway_id: str, request: Request, current_user: EmailUserResponse = Depends(get_current_user_with_permissions), db: Session = Depends(get_db)) -> RedirectResponse: +async def initiate_oauth_flow( + gateway_id: str, request: Request, current_user: EmailUserResponse = Depends(get_current_user_with_permissions), db: Session = Depends(get_db) +) -> RedirectResponse: # noqa: ARG001 """Initiates the OAuth 2.0 Authorization Code flow for a specified gateway. This endpoint retrieves the OAuth configuration for the given gateway, validates that the gateway supports the Authorization Code flow, and redirects the user to the OAuth provider's authorization URL to begin the OAuth process. + **Phase 1.4: DCR Integration** + If the gateway has an issuer but no client_id, and DCR is enabled, this endpoint will + automatically register the gateway as an OAuth client with the Authorization Server + using Dynamic Client Registration (RFC 7591). + Args: gateway_id: The unique identifier of the gateway to authorize. request: The FastAPI request object. 
@@ -73,9 +82,86 @@ async def initiate_oauth_flow(gateway_id: str, request: Request, current_user: E if gateway.oauth_config.get("grant_type") != "authorization_code": raise HTTPException(status_code=400, detail="Gateway is not configured for Authorization Code flow") - # Initiate OAuth flow with user context + oauth_config = gateway.oauth_config.copy() # Work with a copy to avoid mutating the original + + # Phase 1.4: Auto-trigger DCR if credentials are missing + # Check if gateway has issuer but no client_id (DCR scenario) + issuer = oauth_config.get("issuer") + client_id = oauth_config.get("client_id") + + if issuer and not client_id: + if settings.dcr_enabled and settings.dcr_auto_register_on_missing_credentials: + logger.info(f"Gateway {gateway_id} has issuer but no client_id. Attempting DCR...") + + try: + # Initialize DCR service + dcr_service = DcrService() + + # Check if client is already registered in database + registered_client = await dcr_service.get_or_register_client( + gateway_id=gateway_id, + gateway_name=gateway.name, + issuer=issuer, + redirect_uri=oauth_config.get("redirect_uri"), + scopes=oauth_config.get("scopes", settings.dcr_default_scopes), + db=db, + ) + + logger.info(f"✅ DCR successful for gateway {gateway_id}: client_id={registered_client.client_id}") + + # Decrypt the client secret for use in OAuth flow (if present - public clients may not have secrets) + decrypted_secret = None + if registered_client.client_secret_encrypted: + # First-Party + from mcpgateway.utils.oauth_encryption import get_oauth_encryption + + encryption = get_oauth_encryption(settings.auth_encryption_secret) + decrypted_secret = encryption.decrypt_secret(registered_client.client_secret_encrypted) + + # Update oauth_config with registered credentials + oauth_config["client_id"] = registered_client.client_id + if decrypted_secret: + oauth_config["client_secret"] = decrypted_secret + + # Discover AS metadata to get authorization/token endpoints if not already set + # 
Note: OAuthManager expects 'authorization_url' and 'token_url', not 'authorization_endpoint'/'token_endpoint' + if not oauth_config.get("authorization_url") or not oauth_config.get("token_url"): + metadata = await dcr_service.discover_as_metadata(issuer) + oauth_config["authorization_url"] = metadata.get("authorization_endpoint") + oauth_config["token_url"] = metadata.get("token_endpoint") + logger.info(f"Discovered OAuth endpoints for {issuer}") + + # Update gateway's oauth_config and auth_type in database for future use + gateway.oauth_config = oauth_config + gateway.auth_type = "oauth" # Ensure auth_type is set for OAuth-protected servers + db.commit() + + logger.info(f"Updated gateway {gateway_id} with DCR credentials and auth_type=oauth") + + except DcrError as dcr_err: + logger.error(f"DCR failed for gateway {gateway_id}: {dcr_err}") + raise HTTPException( + status_code=500, + detail=f"Dynamic Client Registration failed: {str(dcr_err)}. Please configure client_id and client_secret manually or check your OAuth server supports RFC 7591.", + ) + except Exception as dcr_ex: + logger.error(f"Unexpected error during DCR for gateway {gateway_id}: {dcr_ex}") + raise HTTPException(status_code=500, detail=f"Failed to register OAuth client: {str(dcr_ex)}") + else: + # DCR is disabled or auto-register is off + logger.warning(f"Gateway {gateway_id} has issuer but no client_id, and DCR auto-registration is disabled") + raise HTTPException( + status_code=400, + detail="Gateway OAuth configuration is incomplete. 
Please provide client_id and client_secret, or enable DCR (Dynamic Client Registration) by setting MCPGATEWAY_DCR_ENABLED=true and MCPGATEWAY_DCR_AUTO_REGISTER_ON_MISSING_CREDENTIALS=true", + ) + + # Validate required fields for OAuth flow + if not oauth_config.get("client_id"): + raise HTTPException(status_code=400, detail="OAuth configuration missing client_id") + + # Initiate OAuth flow with user context (now includes PKCE from existing implementation) oauth_manager = OAuthManager(token_storage=TokenStorageService(db)) - auth_data = await oauth_manager.initiate_authorization_code_flow(gateway_id, gateway.oauth_config, app_user_email=current_user.get("email")) + auth_data = await oauth_manager.initiate_authorization_code_flow(gateway_id, oauth_config, app_user_email=current_user.get("email")) logger.info(f"Initiated OAuth flow for gateway {gateway_id} by user {current_user.get('email')}") @@ -444,3 +530,155 @@ async def fetch_tools_after_oauth(gateway_id: str, current_user: EmailUserRespon except Exception as e: logger.error(f"Failed to fetch tools after OAuth for gateway {gateway_id}: {e}") raise HTTPException(status_code=500, detail=f"Failed to fetch tools: {str(e)}") + + +# ============================================================================ +# Admin Endpoints for DCR Management +# ============================================================================ + + +@oauth_router.get("/registered-clients") +async def list_registered_oauth_clients(current_user: EmailUserResponse = Depends(get_current_user_with_permissions), db: Session = Depends(get_db)) -> Dict[str, Any]: # noqa: ARG001 + """List all registered OAuth clients (created via DCR). + + This endpoint shows OAuth clients that were dynamically registered with external + Authorization Servers using RFC 7591 Dynamic Client Registration. 
+ + Args: + current_user: The authenticated user (admin access required) + db: Database session + + Returns: + Dict containing list of registered OAuth clients with metadata + + Raises: + HTTPException: If user lacks permissions or database error occurs + """ + try: + # First-Party + from mcpgateway.db import RegisteredOAuthClient + + # Query all registered clients + clients = db.execute(select(RegisteredOAuthClient)).scalars().all() + + # Build response + clients_data = [] + for client in clients: + clients_data.append( + { + "id": client.id, + "gateway_id": client.gateway_id, + "issuer": client.issuer, + "client_id": client.client_id, + "redirect_uris": client.redirect_uris.split(",") if isinstance(client.redirect_uris, str) else client.redirect_uris, + "grant_types": client.grant_types.split(",") if isinstance(client.grant_types, str) else client.grant_types, + "scope": client.scope, + "token_endpoint_auth_method": client.token_endpoint_auth_method, + "created_at": client.created_at.isoformat() if client.created_at else None, + "expires_at": client.expires_at.isoformat() if client.expires_at else None, + "is_active": client.is_active, + } + ) + + return {"total": len(clients_data), "clients": clients_data} + + except Exception as e: + logger.error(f"Failed to list registered OAuth clients: {e}") + raise HTTPException(status_code=500, detail=f"Failed to list registered clients: {str(e)}") + + +@oauth_router.get("/registered-clients/{gateway_id}") +async def get_registered_client_for_gateway( + gateway_id: str, current_user: EmailUserResponse = Depends(get_current_user_with_permissions), db: Session = Depends(get_db) # noqa: ARG001 +) -> Dict[str, Any]: + """Get the registered OAuth client for a specific gateway. 
+ + Args: + gateway_id: The gateway ID to lookup + current_user: The authenticated user + db: Database session + + Returns: + Dict containing registered client information + + Raises: + HTTPException: If gateway or registered client not found + """ + try: + # First-Party + from mcpgateway.db import RegisteredOAuthClient + + # Query registered client for this gateway + client = db.execute(select(RegisteredOAuthClient).where(RegisteredOAuthClient.gateway_id == gateway_id)).scalar_one_or_none() + + if not client: + raise HTTPException(status_code=404, detail=f"No registered OAuth client found for gateway {gateway_id}") + + return { + "id": client.id, + "gateway_id": client.gateway_id, + "issuer": client.issuer, + "client_id": client.client_id, + "redirect_uris": client.redirect_uris.split(",") if isinstance(client.redirect_uris, str) else client.redirect_uris, + "grant_types": client.grant_types.split(",") if isinstance(client.grant_types, str) else client.grant_types, + "scope": client.scope, + "token_endpoint_auth_method": client.token_endpoint_auth_method, + "registration_client_uri": client.registration_client_uri, + "created_at": client.created_at.isoformat() if client.created_at else None, + "expires_at": client.expires_at.isoformat() if client.expires_at else None, + "is_active": client.is_active, + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Failed to get registered client for gateway {gateway_id}: {e}") + raise HTTPException(status_code=500, detail=f"Failed to get registered client: {str(e)}") + + +@oauth_router.delete("/registered-clients/{client_id}") +async def delete_registered_client(client_id: str, current_user: EmailUserResponse = Depends(get_current_user_with_permissions), db: Session = Depends(get_db)) -> Dict[str, Any]: # noqa: ARG001 + """Delete a registered OAuth client. + + This will revoke the client registration locally. Note: This does not automatically + revoke the client at the Authorization Server. 
You may need to manually revoke the + client using the registration_client_uri if available. + + Args: + client_id: The registered client ID to delete + current_user: The authenticated user (admin access required) + db: Database session + + Returns: + Dict containing success message + + Raises: + HTTPException: If client not found or deletion fails + """ + try: + # First-Party + from mcpgateway.db import RegisteredOAuthClient + + # Find the client + client = db.execute(select(RegisteredOAuthClient).where(RegisteredOAuthClient.id == client_id)).scalar_one_or_none() + + if not client: + raise HTTPException(status_code=404, detail=f"Registered client {client_id} not found") + + issuer = client.issuer + gateway_id = client.gateway_id + + # Delete the client + db.delete(client) + db.commit() + + logger.info(f"Deleted registered OAuth client {client_id} for gateway {gateway_id} (issuer: {issuer})") + + return {"success": True, "message": f"Registered OAuth client {client_id} deleted successfully", "gateway_id": gateway_id, "issuer": issuer} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Failed to delete registered client {client_id}: {e}") + db.rollback() + raise HTTPException(status_code=500, detail=f"Failed to delete registered client: {str(e)}") diff --git a/mcpgateway/schemas.py b/mcpgateway/schemas.py index 036c0cc6d..c13f35f1f 100644 --- a/mcpgateway/schemas.py +++ b/mcpgateway/schemas.py @@ -5381,6 +5381,7 @@ class CatalogServer(BaseModel): requires_api_key: bool = Field(default=False, description="Whether API key is required") secure: bool = Field(default=False, description="Whether additional security is required") tags: List[str] = Field(default_factory=list, description="Tags for categorization") + transport: Optional[str] = Field(None, description="Transport type: SSE, STREAMABLEHTTP, or WEBSOCKET") logo_url: Optional[str] = Field(None, description="URL to server logo/icon") documentation_url: Optional[str] = Field(None, 
description="URL to server documentation") is_registered: bool = Field(default=False, description="Whether server is already registered") diff --git a/mcpgateway/services/catalog_service.py b/mcpgateway/services/catalog_service.py index 45edcc3ab..6bdde238a 100644 --- a/mcpgateway/services/catalog_service.py +++ b/mcpgateway/services/catalog_service.py @@ -31,6 +31,7 @@ CatalogServerStatusResponse, ) from mcpgateway.services.gateway_service import GatewayService +from mcpgateway.utils.create_slug import slugify logger = logging.getLogger(__name__) @@ -120,6 +121,9 @@ async def get_catalog_servers(self, request: CatalogListRequest, db) -> CatalogL for server_data in servers: server = CatalogServer(**server_data) server.is_registered = server.url in registered_urls + # Set availability based on registration status (registered servers are assumed available) + # Individual health checks can be done via the /status endpoint + server.is_available = server.is_registered or server_data.get("is_available", True) catalog_servers.append(server) # Apply filters @@ -206,18 +210,22 @@ async def register_catalog_server(self, catalog_id: str, request: Optional[Catal # First-Party from mcpgateway.schemas import GatewayCreate # pylint: disable=import-outside-toplevel - # Detect transport type from URL or use SSE as default - url = server_data["url"].lower() - # Check for SSE patterns (highest priority) - if url.endswith("/sse") or "/sse/" in url: - transport = "SSE" # SSE endpoints or paths containing /sse/ - elif url.startswith("ws://") or url.startswith("wss://"): - transport = "SSE" # WebSocket URLs typically use SSE transport - # Then check for HTTP patterns - elif "/mcp" in url or url.endswith("/"): - transport = "STREAMABLEHTTP" # Generic MCP endpoints typically use HTTP - else: - transport = "SSE" # Default to SSE for most catalog servers + # Use explicit transport if provided, otherwise auto-detect from URL + transport = server_data.get("transport") + if not transport: + # 
Detect transport type from URL or use SSE as default + url = server_data["url"].lower() + # Check for WebSocket patterns (highest priority) + if url.startswith("ws://") or url.startswith("wss://"): + transport = "WEBSOCKET" # WebSocket transport for ws:// and wss:// URLs + # Check for SSE patterns + elif url.endswith("/sse") or "/sse/" in url: + transport = "SSE" # SSE endpoints or paths containing /sse/ + # Then check for HTTP patterns + elif "/mcp" in url or url.endswith("/"): + transport = "STREAMABLEHTTP" # Generic MCP endpoints typically use HTTP + else: + transport = "SSE" # Default to SSE for most catalog servers # Check for IPv6 URLs early to provide a clear error message url = server_data["url"] @@ -237,6 +245,8 @@ async def register_catalog_server(self, catalog_id: str, request: Optional[Catal # Set authentication based on server requirements auth_type = server_data.get("auth_type", "Open") + skip_initialization = False # Flag to skip connection test for OAuth servers without creds + if request and request.api_key and auth_type != "Open": # Handle all possible auth types from the catalog if auth_type in ["API Key", "API"]: @@ -248,10 +258,54 @@ async def register_catalog_server(self, catalog_id: str, request: Optional[Catal gateway_data["auth_type"] = "bearer" gateway_data["auth_token"] = request.api_key else: - # For any other auth types, use custom headers + # For any other auth types, use custom headers (as list of dicts) gateway_data["auth_type"] = "authheaders" - gateway_data["auth_header_key"] = "X-API-Key" - gateway_data["auth_header_value"] = request.api_key + gateway_data["auth_headers"] = [{"key": "X-API-Key", "value": request.api_key}] + elif auth_type in ["OAuth2.1", "OAuth"]: + # OAuth server without credentials - register but skip initialization + # User will need to complete OAuth flow later + skip_initialization = True + logger.info(f"Registering OAuth server {server_data['name']} without credentials - OAuth flow required later") + + # For 
OAuth servers without credentials, register directly without connection test + if skip_initialization: + # Create minimal gateway entry without tool discovery + # First-Party + from mcpgateway.db import Gateway as DbGateway # pylint: disable=import-outside-toplevel + + gateway_create = GatewayCreate(**gateway_data) + slug_name = slugify(gateway_data["name"]) + + db_gateway = DbGateway( + name=gateway_data["name"], + slug=slug_name, + url=gateway_data["url"], + description=gateway_data["description"], + tags=gateway_data.get("tags", []), + transport=gateway_data["transport"], + capabilities={}, + auth_type=None, # Will be set during OAuth configuration + enabled=False, # Disabled until OAuth is configured + created_via="catalog", + visibility="public", + version=1, + ) + + db.add(db_gateway) + db.commit() + db.refresh(db_gateway) + + # First-Party + from mcpgateway.schemas import GatewayRead # pylint: disable=import-outside-toplevel + + gateway_read = GatewayRead.model_validate(db_gateway) + + return CatalogServerRegisterResponse( + success=True, + server_id=str(gateway_read.id), + message=f"Successfully registered {gateway_read.name} - OAuth configuration required before activation", + error=None, + ) gateway_create = GatewayCreate(**gateway_data) @@ -284,9 +338,31 @@ async def register_catalog_server(self, catalog_id: str, request: Optional[Catal except Exception as e: logger.error(f"Failed to register catalog server {catalog_id}: {e}") + + # Map common exceptions to user-friendly messages + error_str = str(e) + user_message = "Registration failed" + + if "Connection refused" in error_str or "connect" in error_str.lower(): + user_message = "Server is offline or unreachable" + elif "SSL" in error_str or "certificate" in error_str.lower(): + user_message = "SSL certificate verification failed - check server security settings" + elif "timeout" in error_str.lower() or "timed out" in error_str.lower(): + user_message = "Server took too long to respond - it may be slow 
or unavailable" + elif "401" in error_str or "Unauthorized" in error_str: + user_message = "Authentication failed - check API key or OAuth credentials" + elif "403" in error_str or "Forbidden" in error_str: + user_message = "Access forbidden - check permissions and API key" + elif "404" in error_str or "Not Found" in error_str: + user_message = "Server endpoint not found - check URL is correct" + elif "500" in error_str or "Internal Server Error" in error_str: + user_message = "Remote server error - the MCP server is experiencing issues" + elif "IPv6" in error_str: + user_message = "IPv6 URLs are not supported - please use IPv4 or domain names" + # Don't rollback here - let FastAPI handle it # db.rollback() - return CatalogServerRegisterResponse(success=False, server_id="", message="Registration failed", error=str(e)) + return CatalogServerRegisterResponse(success=False, server_id="", message=user_message, error=error_str) async def check_server_availability(self, catalog_id: str) -> CatalogServerStatusResponse: """Check if a catalog server is available. diff --git a/mcpgateway/services/dcr_service.py b/mcpgateway/services/dcr_service.py new file mode 100644 index 000000000..0f86babf5 --- /dev/null +++ b/mcpgateway/services/dcr_service.py @@ -0,0 +1,336 @@ +# -*- coding: utf-8 -*- +"""Location: ./mcpgateway/services/dcr_service.py +Copyright 2025 +SPDX-License-Identifier: Apache-2.0 +Authors: Manav Gupta + +OAuth 2.0 Dynamic Client Registration Service. 
+ +This module handles OAuth 2.0 Dynamic Client Registration (DCR) including: +- AS metadata discovery (RFC 8414) +- Client registration (RFC 7591) +- Client management (update, delete) +""" + +# Standard +from datetime import datetime, timezone +import json +import logging +from typing import Any, Dict, List + +# Third-Party +import aiohttp +from sqlalchemy.orm import Session + +# First-Party +from mcpgateway.config import get_settings +from mcpgateway.db import RegisteredOAuthClient +from mcpgateway.utils.oauth_encryption import get_oauth_encryption + +logger = logging.getLogger(__name__) + +# In-memory cache for AS metadata +# Format: {issuer: {"metadata": dict, "cached_at": datetime}} +_metadata_cache: Dict[str, Dict[str, Any]] = {} + + +class DcrService: + """Service for OAuth 2.0 Dynamic Client Registration (RFC 7591 client).""" + + def __init__(self): + """Initialize DCR service.""" + self.settings = get_settings() + + async def discover_as_metadata(self, issuer: str) -> Dict[str, Any]: + """Discover AS metadata via RFC 8414. + + Tries: + 1. {issuer}/.well-known/oauth-authorization-server (RFC 8414) + 2. 
{issuer}/.well-known/openid-configuration (OIDC fallback) + + Args: + issuer: The AS issuer URL + + Returns: + Dict containing AS metadata + + Raises: + DcrError: If metadata cannot be discovered + """ + # Check cache first + if issuer in _metadata_cache: + cached_entry = _metadata_cache[issuer] + cached_at = cached_entry["cached_at"] + cache_age = (datetime.now(timezone.utc) - cached_at).total_seconds() + + if cache_age < self.settings.dcr_metadata_cache_ttl: + logger.debug(f"Using cached AS metadata for {issuer}") + return cached_entry["metadata"] + + # Try RFC 8414 path first + rfc8414_url = f"{issuer}/.well-known/oauth-authorization-server" + + try: + async with aiohttp.ClientSession() as session: + async with session.get(rfc8414_url, timeout=aiohttp.ClientTimeout(total=self.settings.oauth_request_timeout)) as response: + if response.status == 200: + metadata = await response.json() + + # Validate issuer matches + if metadata.get("issuer") != issuer: + raise DcrError(f"AS metadata issuer mismatch: expected {issuer}, got {metadata.get('issuer')}") + + # Cache the metadata + _metadata_cache[issuer] = {"metadata": metadata, "cached_at": datetime.now(timezone.utc)} + + logger.info(f"Discovered AS metadata for {issuer} via RFC 8414") + return metadata + except aiohttp.ClientError as e: + logger.debug(f"RFC 8414 discovery failed for {issuer}: {e}, trying OIDC fallback") + + # Try OIDC discovery fallback + oidc_url = f"{issuer}/.well-known/openid-configuration" + + try: + async with aiohttp.ClientSession() as session: + async with session.get(oidc_url, timeout=aiohttp.ClientTimeout(total=self.settings.oauth_request_timeout)) as response: + if response.status == 200: + metadata = await response.json() + + # Validate issuer matches + if metadata.get("issuer") != issuer: + raise DcrError(f"AS metadata issuer mismatch: expected {issuer}, got {metadata.get('issuer')}") + + # Cache the metadata + _metadata_cache[issuer] = {"metadata": metadata, "cached_at": 
datetime.now(timezone.utc)} + + logger.info(f"Discovered AS metadata for {issuer} via OIDC discovery") + return metadata + + raise DcrError(f"AS metadata not found for {issuer} (status: {response.status})") + except aiohttp.ClientError as e: + raise DcrError(f"Failed to discover AS metadata for {issuer}: {e}") + + async def register_client(self, gateway_id: str, gateway_name: str, issuer: str, redirect_uri: str, scopes: List[str], db: Session) -> RegisteredOAuthClient: + """Register as OAuth client with upstream AS (RFC 7591). + + Args: + gateway_id: Gateway ID + gateway_name: Gateway name + issuer: AS issuer URL + redirect_uri: OAuth redirect URI + scopes: List of OAuth scopes + db: Database session + + Returns: + RegisteredOAuthClient record + + Raises: + DcrError: If registration fails + """ + # Validate issuer if allowlist is configured + if self.settings.dcr_allowed_issuers: + if issuer not in self.settings.dcr_allowed_issuers: + raise DcrError(f"Issuer {issuer} is not in allowed issuers list") + + # Discover AS metadata + metadata = await self.discover_as_metadata(issuer) + + registration_endpoint = metadata.get("registration_endpoint") + if not registration_endpoint: + raise DcrError(f"AS {issuer} does not support Dynamic Client Registration (no registration_endpoint)") + + # Build registration request (RFC 7591) + client_name = self.settings.dcr_client_name_template.replace("{gateway_name}", gateway_name) + + registration_request = { + "client_name": client_name, + "redirect_uris": [redirect_uri], + "grant_types": ["authorization_code"], + "response_types": ["code"], + "token_endpoint_auth_method": self.settings.dcr_token_endpoint_auth_method, + "scope": " ".join(scopes), + } + + # Send registration request + try: + async with aiohttp.ClientSession() as session: + async with session.post(registration_endpoint, json=registration_request, timeout=aiohttp.ClientTimeout(total=self.settings.oauth_request_timeout)) as response: + # Accept both 200 OK and 201 
Created (some servers don't follow RFC 7591 strictly) + if response.status in (200, 201): + registration_response = await response.json() + else: + error_data = await response.json() + error_msg = error_data.get("error", "unknown_error") + error_desc = error_data.get("error_description", str(error_data)) + raise DcrError(f"Client registration failed: {error_msg} - {error_desc}") + except aiohttp.ClientError as e: + raise DcrError(f"Failed to register client with {issuer}: {e}") + + # Encrypt secrets + encryption = get_oauth_encryption(self.settings.auth_encryption_secret) + + client_secret = registration_response.get("client_secret") + client_secret_encrypted = encryption.encrypt_secret(client_secret) if client_secret else None + + registration_access_token = registration_response.get("registration_access_token") + registration_access_token_encrypted = encryption.encrypt_secret(registration_access_token) if registration_access_token else None + + # Create database record + registered_client = RegisteredOAuthClient( + gateway_id=gateway_id, + issuer=issuer, + client_id=registration_response["client_id"], + client_secret_encrypted=client_secret_encrypted, + redirect_uris=json.dumps(registration_response.get("redirect_uris", [redirect_uri])), + grant_types=json.dumps(registration_response.get("grant_types", ["authorization_code"])), + response_types=json.dumps(registration_response.get("response_types", ["code"])), + scope=registration_response.get("scope", " ".join(scopes)), + token_endpoint_auth_method=registration_response.get("token_endpoint_auth_method", self.settings.dcr_token_endpoint_auth_method), + registration_client_uri=registration_response.get("registration_client_uri"), + registration_access_token_encrypted=registration_access_token_encrypted, + created_at=datetime.now(timezone.utc), + expires_at=None, # TODO: Calculate from client_id_issued_at + client_secret_expires_at # pylint: disable=fixme + is_active=True, + ) + + db.add(registered_client) + 
db.commit() + db.refresh(registered_client) + + logger.info(f"Successfully registered client {registered_client.client_id} with {issuer} for gateway {gateway_id}") + + return registered_client + + async def get_or_register_client(self, gateway_id: str, gateway_name: str, issuer: str, redirect_uri: str, scopes: List[str], db: Session) -> RegisteredOAuthClient: + """Get existing registered client or register new one. + + Args: + gateway_id: Gateway ID + gateway_name: Gateway name + issuer: AS issuer URL + redirect_uri: OAuth redirect URI + scopes: List of OAuth scopes + db: Database session + + Returns: + RegisteredOAuthClient record + + Raises: + DcrError: If client not found and auto-register is disabled + """ + # Try to find existing client + existing_client = ( + db.query(RegisteredOAuthClient) + .filter(RegisteredOAuthClient.gateway_id == gateway_id, RegisteredOAuthClient.issuer == issuer, RegisteredOAuthClient.is_active.is_(True)) # pylint: disable=singleton-comparison + .first() + ) + + if existing_client: + logger.debug(f"Found existing registered client for gateway {gateway_id} and issuer {issuer}") + return existing_client + + # No existing client, check if auto-register is enabled + if not self.settings.dcr_auto_register_on_missing_credentials: + raise DcrError( + f"No registered client found for gateway {gateway_id} and issuer {issuer}. " "Auto-register is disabled. Set MCPGATEWAY_DCR_AUTO_REGISTER_ON_MISSING_CREDENTIALS=true to enable." + ) + + # Auto-register + logger.info(f"No existing client found for gateway {gateway_id}, registering new client with {issuer}") + return await self.register_client(gateway_id, gateway_name, issuer, redirect_uri, scopes, db) + + async def update_client_registration(self, client_record: RegisteredOAuthClient, db: Session) -> RegisteredOAuthClient: + """Update existing client registration (RFC 7591 section 4.2). 
+ + Args: + client_record: Existing RegisteredOAuthClient record + db: Database session + + Returns: + Updated RegisteredOAuthClient record + + Raises: + DcrError: If update fails + """ + if not client_record.registration_client_uri: + raise DcrError("Cannot update client: no registration_client_uri available") + + if not client_record.registration_access_token_encrypted: + raise DcrError("Cannot update client: no registration_access_token available") + + # Decrypt registration access token + encryption = get_oauth_encryption(self.settings.auth_encryption_secret) + registration_access_token = encryption.decrypt_secret(client_record.registration_access_token_encrypted) + + # Build update request + update_request = {"client_id": client_record.client_id, "redirect_uris": json.loads(client_record.redirect_uris), "grant_types": json.loads(client_record.grant_types)} + + # Send update request + try: + async with aiohttp.ClientSession() as session: + headers = {"Authorization": f"Bearer {registration_access_token}"} + async with session.put( + client_record.registration_client_uri, json=update_request, headers=headers, timeout=aiohttp.ClientTimeout(total=self.settings.oauth_request_timeout) + ) as response: + if response.status == 200: + updated_response = await response.json() + + # Update encrypted secret if changed + if "client_secret" in updated_response: + client_record.client_secret_encrypted = encryption.encrypt_secret(updated_response["client_secret"]) + + db.commit() + db.refresh(client_record) + + logger.info(f"Successfully updated client registration for {client_record.client_id}") + return client_record + + error_data = await response.json() + raise DcrError(f"Failed to update client: {error_data}") + except aiohttp.ClientError as e: + raise DcrError(f"Failed to update client registration: {e}") + + async def delete_client_registration(self, client_record: RegisteredOAuthClient, db: Session) -> bool: # pylint: disable=unused-argument + """Delete/revoke client 
registration (RFC 7591 section 4.3).
+
+        Args:
+            client_record: RegisteredOAuthClient record to delete
+            db: Database session
+
+        Returns:
+            True if deletion succeeded
+
+        Note:
+            Deletion at the AS is best-effort; all failures are logged and True is still returned
+        """
+        if not client_record.registration_client_uri:
+            logger.warning("Cannot delete client at AS: no registration_client_uri")
+            return True  # Consider it deleted locally
+
+        if not client_record.registration_access_token_encrypted:
+            logger.warning("Cannot delete client at AS: no registration_access_token")
+            return True  # Consider it deleted locally
+
+        # Decrypt registration access token
+        encryption = get_oauth_encryption(self.settings.auth_encryption_secret)
+        registration_access_token = encryption.decrypt_secret(client_record.registration_access_token_encrypted)
+
+        # Send delete request
+        try:
+            async with aiohttp.ClientSession() as session:
+                headers = {"Authorization": f"Bearer {registration_access_token}"}
+                async with session.delete(client_record.registration_client_uri, headers=headers, timeout=aiohttp.ClientTimeout(total=self.settings.oauth_request_timeout)) as response:
+                    if response.status in [204, 404]:  # 204 = deleted, 404 = already gone
+                        logger.info(f"Successfully deleted client registration for {client_record.client_id}")
+                        return True
+
+                    logger.warning(f"Unexpected status when deleting client: {response.status}")
+                    return True  # Consider it best-effort
+        except aiohttp.ClientError as e:
+            logger.warning(f"Failed to delete client at AS: {e}")
+            return True  # Best-effort, don't fail if AS is unreachable
+
+
+class DcrError(Exception):
+    """DCR-related errors."""
diff --git a/mcpgateway/services/gateway_service.py b/mcpgateway/services/gateway_service.py
index 58e767dd8..e067d4787 100644
--- a/mcpgateway/services/gateway_service.py
+++ b/mcpgateway/services/gateway_service.py
@@ -783,12 +783,20 @@ async def fetch_tools_after_oauth(self, db: Session, gateway_id: str, app_user_e
                 raise GatewayConnectionError(
                     f"No OAuth tokens
found for user {app_user_email} on gateway {gateway.name}. Please complete the OAuth authorization flow first at /oauth/authorize/{gateway.id}" ) + + # Debug: Check if token was decrypted + if access_token.startswith("Z0FBQUFBQm"): # Encrypted tokens start with this + logger.error(f"Token appears to be encrypted! Encryption service may have failed. Token length: {len(access_token)}") + else: + logger.info(f"Using decrypted OAuth token for {gateway.name} (length: {len(access_token)})") + # Now connect to MCP server with the access token authentication = {"Authorization": f"Bearer {access_token}"} # Use the existing connection logic + # Note: For OAuth servers, skip validation since we already validated via OAuth flow if gateway.transport.upper() == "SSE": - capabilities, tools, resources, prompts = await self.connect_to_sse_server(gateway.url, authentication) + capabilities, tools, resources, prompts = await self._connect_to_sse_server_without_validation(gateway.url, authentication) elif gateway.transport.upper() == "STREAMABLEHTTP": capabilities, tools, resources, prompts = await self.connect_to_streamablehttp_server(gateway.url, authentication) else: @@ -2799,6 +2807,103 @@ async def _publish_event(self, event: Dict[str, Any]) -> None: for queue in self._event_subscribers: await queue.put(event) + async def _connect_to_sse_server_without_validation(self, server_url: str, authentication: Optional[Dict[str, str]] = None): + """Connect to an MCP server running with SSE transport, skipping URL validation. + + This is used for OAuth-protected servers where we've already validated the token works. + + Args: + server_url: The URL of the SSE MCP server to connect to. + authentication: Optional dictionary containing authentication headers. + + Returns: + Tuple containing (capabilities, tools, resources, prompts) from the MCP server. 
+ """ + if authentication is None: + authentication = {} + + # Skip validation for OAuth servers - we already validated via OAuth flow + # Use async with for both sse_client and ClientSession + try: + async with sse_client(url=server_url, headers=authentication) as streams: + async with ClientSession(*streams) as session: + # Initialize the session + response = await session.initialize() + capabilities = response.capabilities.model_dump(by_alias=True, exclude_none=True) + logger.debug(f"Server capabilities: {capabilities}") + + response = await session.list_tools() + tools = response.tools + tools = [tool.model_dump(by_alias=True, exclude_none=True) for tool in tools] + + tools = [ToolCreate.model_validate(tool) for tool in tools] + if tools: + logger.info(f"Fetched {len(tools)} tools from gateway") + # Fetch resources if supported + resources = [] + logger.debug(f"Checking for resources support: {capabilities.get('resources')}") + if capabilities.get("resources"): + try: + response = await session.list_resources() + raw_resources = response.resources + for resource in raw_resources: + resource_data = resource.model_dump(by_alias=True, exclude_none=True) + # Convert AnyUrl to string if present + if "uri" in resource_data and hasattr(resource_data["uri"], "unicode_string"): + resource_data["uri"] = str(resource_data["uri"]) + # Add default content if not present (will be fetched on demand) + if "content" not in resource_data: + resource_data["content"] = "" + try: + resources.append(ResourceCreate.model_validate(resource_data)) + except Exception: + # If validation fails, create minimal resource + resources.append( + ResourceCreate( + uri=str(resource_data.get("uri", "")), + name=resource_data.get("name", ""), + description=resource_data.get("description"), + mime_type=resource_data.get("mime_type"), + template=resource_data.get("template"), + content="", + ) + ) + logger.info(f"Fetched {len(resources)} resources from gateway") + except Exception as e: + 
logger.warning(f"Failed to fetch resources: {e}") + + # Fetch prompts if supported + prompts = [] + logger.debug(f"Checking for prompts support: {capabilities.get('prompts')}") + if capabilities.get("prompts"): + try: + response = await session.list_prompts() + raw_prompts = response.prompts + for prompt in raw_prompts: + prompt_data = prompt.model_dump(by_alias=True, exclude_none=True) + # Add default template if not present + if "template" not in prompt_data: + prompt_data["template"] = "" + try: + prompts.append(PromptCreate.model_validate(prompt_data)) + except Exception: + # If validation fails, create minimal prompt + prompts.append( + PromptCreate( + name=prompt_data.get("name", ""), + description=prompt_data.get("description"), + template=prompt_data.get("template", ""), + ) + ) + logger.info(f"Fetched {len(prompts)} prompts from gateway") + except Exception as e: + logger.warning(f"Failed to fetch prompts: {e}") + + return capabilities, tools, resources, prompts + except Exception as e: + logger.error(f"SSE connection error details: {type(e).__name__}: {str(e)}", exc_info=True) + raise GatewayConnectionError(f"Failed to connect to SSE server at {server_url}: {str(e)}") + async def connect_to_sse_server(self, server_url: str, authentication: Optional[Dict[str, str]] = None): """Connect to an MCP server running with SSE transport. diff --git a/mcpgateway/services/oauth_manager.py b/mcpgateway/services/oauth_manager.py index 37f30ba21..0678dad93 100644 --- a/mcpgateway/services/oauth_manager.py +++ b/mcpgateway/services/oauth_manager.py @@ -127,6 +127,20 @@ def __init__(self, request_timeout: int = 30, max_retries: int = 3, token_storag self.token_storage = token_storage self.settings = get_settings() + def _generate_pkce_params(self) -> Dict[str, str]: + """Generate PKCE parameters for OAuth Authorization Code flow (RFC 7636). 
+ + Returns: + Dict containing code_verifier, code_challenge, and code_challenge_method + """ + # Generate code_verifier: 43-128 character random string + code_verifier = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8").rstrip("=") + + # Generate code_challenge: base64url(SHA256(code_verifier)) + code_challenge = base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("utf-8")).digest()).decode("utf-8").rstrip("=") + + return {"code_verifier": code_verifier, "code_challenge": code_challenge, "code_challenge_method": "S256"} + async def get_access_token(self, credentials: Dict[str, Any]) -> str: """Get access token based on grant type. @@ -168,6 +182,8 @@ async def get_access_token(self, credentials: Dict[str, Any]) -> str: if grant_type == "client_credentials": return await self._client_credentials_flow(credentials) + if grant_type == "password": + return await self._password_flow(credentials) if grant_type == "authorization_code": # For authorization code flow in gateway initialization, we need to handle this differently # Since this is called during gateway setup, we'll try to use client credentials as fallback @@ -268,6 +284,105 @@ async def _client_credentials_flow(self, credentials: Dict[str, Any]) -> str: # This should never be reached due to the exception above, but needed for type safety raise OAuthError("Failed to obtain access token after all retry attempts") + async def _password_flow(self, credentials: Dict[str, Any]) -> str: + """Resource Owner Password Credentials flow (RFC 6749 Section 4.3). + + This flow is used when the application can directly handle the user's credentials, + such as with trusted first-party applications or legacy integrations like Keycloak. 
+ + Args: + credentials: OAuth configuration with client_id, optional client_secret, token_url, username, password + + Returns: + Access token string + + Raises: + OAuthError: If token acquisition fails after all retries + """ + client_id = credentials.get("client_id") + client_secret = credentials.get("client_secret") + token_url = credentials["token_url"] + username = credentials.get("username") + password = credentials.get("password") + scopes = credentials.get("scopes", []) + + if not username or not password: + raise OAuthError("Username and password are required for password grant type") + + # Decrypt client secret if it's encrypted and present + if client_secret and len(client_secret) > 50: # Simple heuristic: encrypted secrets are longer + try: + settings = get_settings() + encryption = get_oauth_encryption(settings.auth_encryption_secret) + decrypted_secret = encryption.decrypt_secret(client_secret) + if decrypted_secret: + client_secret = decrypted_secret + logger.debug("Successfully decrypted client secret") + else: + logger.warning("Failed to decrypt client secret, using encrypted version") + except Exception as e: + logger.warning(f"Failed to decrypt client secret: {e}, using encrypted version") + + # Prepare token request data + token_data = { + "grant_type": "password", + "username": username, + "password": password, + } + + # Add client_id (required by most providers including Keycloak) + if client_id: + token_data["client_id"] = client_id + + # Add client_secret if present (some providers require it, others don't) + if client_secret: + token_data["client_secret"] = client_secret + + if scopes: + token_data["scope"] = " ".join(scopes) if isinstance(scopes, list) else scopes + + # Fetch token with retries + for attempt in range(self.max_retries): + try: + async with aiohttp.ClientSession() as session: + async with session.post(token_url, data=token_data, timeout=aiohttp.ClientTimeout(total=self.request_timeout)) as response: + 
response.raise_for_status() + + # Handle both JSON and form-encoded responses + content_type = response.headers.get("content-type", "") + if "application/x-www-form-urlencoded" in content_type: + # Parse form-encoded response + text_response = await response.text() + token_response = {} + for pair in text_response.split("&"): + if "=" in pair: + key, value = pair.split("=", 1) + token_response[key] = value + else: + # Try JSON response + try: + token_response = await response.json() + except Exception as e: + logger.warning(f"Failed to parse JSON response: {e}") + # Fallback to text parsing + text_response = await response.text() + token_response = {"raw_response": text_response} + + if "access_token" not in token_response: + raise OAuthError(f"No access_token in response: {token_response}") + + logger.info("Successfully obtained access token via password grant") + return token_response["access_token"] + + except aiohttp.ClientError as e: + logger.warning(f"Token request attempt {attempt + 1} failed: {str(e)}") + if attempt == self.max_retries - 1: + raise OAuthError(f"Failed to obtain access token after {self.max_retries} attempts: {str(e)}") + await asyncio.sleep(2**attempt) # Exponential backoff + + # This should never be reached due to the exception above, but needed for type safety + raise OAuthError("Failed to obtain access token after all retry attempts") + async def get_authorization_url(self, credentials: Dict[str, Any]) -> Dict[str, str]: """Get authorization URL for user delegation flow. 
@@ -307,12 +422,12 @@ async def exchange_code_for_token(self, credentials: Dict[str, Any], code: str, OAuthError: If token exchange fails """ client_id = credentials["client_id"] - client_secret = credentials["client_secret"] + client_secret = credentials.get("client_secret") # Optional for public clients (PKCE-only) token_url = credentials["token_url"] redirect_uri = credentials["redirect_uri"] - # Decrypt client secret if it's encrypted - if len(client_secret) > 50: # Simple heuristic: encrypted secrets are longer + # Decrypt client secret if it's encrypted and present + if client_secret and len(client_secret) > 50: # Simple heuristic: encrypted secrets are longer try: settings = get_settings() encryption = get_oauth_encryption(settings.auth_encryption_secret) @@ -331,9 +446,12 @@ async def exchange_code_for_token(self, credentials: Dict[str, Any], code: str, "code": code, "redirect_uri": redirect_uri, "client_id": client_id, - "client_secret": client_secret, } + # Only include client_secret if present (public clients don't have secrets) + if client_secret: + token_data["client_secret"] = client_secret + # Exchange code for token with retries for attempt in range(self.max_retries): try: @@ -377,7 +495,7 @@ async def exchange_code_for_token(self, credentials: Dict[str, Any], code: str, raise OAuthError("Failed to exchange code for token after all retry attempts") async def initiate_authorization_code_flow(self, gateway_id: str, credentials: Dict[str, Any], app_user_email: str = None) -> Dict[str, str]: - """Initiate Authorization Code flow and return authorization URL. + """Initiate Authorization Code flow with PKCE and return authorization URL. 
Args: gateway_id: ID of the gateway being configured @@ -388,22 +506,25 @@ async def initiate_authorization_code_flow(self, gateway_id: str, credentials: D Dict containing authorization_url and state """ + # Generate PKCE parameters (RFC 7636) + pkce_params = self._generate_pkce_params() + # Generate state parameter with user context for CSRF protection state = self._generate_state(gateway_id, app_user_email) - # Store state in session/cache for validation + # Store state with code_verifier in session/cache for validation if self.token_storage: - await self._store_authorization_state(gateway_id, state) + await self._store_authorization_state(gateway_id, state, code_verifier=pkce_params["code_verifier"]) - # Generate authorization URL - auth_url, _ = self._create_authorization_url(credentials, state) + # Generate authorization URL with PKCE + auth_url = self._create_authorization_url_with_pkce(credentials, state, pkce_params["code_challenge"], pkce_params["code_challenge_method"]) - logger.info(f"Generated authorization URL for gateway {gateway_id}") + logger.info(f"Generated authorization URL with PKCE for gateway {gateway_id}") return {"authorization_url": auth_url, "state": state, "gateway_id": gateway_id} async def complete_authorization_code_flow(self, gateway_id: str, code: str, state: str, credentials: Dict[str, Any]) -> Dict[str, Any]: - """Complete Authorization Code flow and store tokens. + """Complete Authorization Code flow with PKCE and store tokens. 
Args: gateway_id: ID of the gateway @@ -417,10 +538,13 @@ async def complete_authorization_code_flow(self, gateway_id: str, code: str, sta Raises: OAuthError: If state validation fails or token exchange fails """ - # First, validate state to prevent replay attacks - if not await self._validate_authorization_state(gateway_id, state): + # Validate state and retrieve code_verifier + state_data = await self._validate_and_retrieve_state(gateway_id, state) + if not state_data: raise OAuthError("Invalid or expired state parameter - possible replay attack") + code_verifier = state_data.get("code_verifier") + # Decode state to extract user context and verify HMAC try: # Decode base64 @@ -439,9 +563,9 @@ async def complete_authorization_code_flow(self, gateway_id: str, code: str, sta # Parse state data state_json = state_bytes.decode() - state_data = json.loads(state_json) - app_user_email = state_data.get("app_user_email") - state_gateway_id = state_data.get("gateway_id") + state_payload = json.loads(state_json) + app_user_email = state_payload.get("app_user_email") + state_gateway_id = state_payload.get("gateway_id") # Validate gateway ID matches if state_gateway_id != gateway_id: @@ -451,8 +575,8 @@ async def complete_authorization_code_flow(self, gateway_id: str, code: str, sta logger.warning(f"Failed to decode state JSON, trying legacy format: {e}") app_user_email = None - # Exchange code for tokens - token_response = await self._exchange_code_for_tokens(credentials, code) + # Exchange code for tokens with PKCE code_verifier + token_response = await self._exchange_code_for_tokens(credentials, code, code_verifier=code_verifier) # Extract user information from token response user_id = self._extract_user_id(token_response, credentials) @@ -516,12 +640,13 @@ def _generate_state(self, gateway_id: str, app_user_email: str = None) -> str: return state_encoded - async def _store_authorization_state(self, gateway_id: str, state: str) -> None: + async def 
_store_authorization_state(self, gateway_id: str, state: str, code_verifier: str = None) -> None: """Store authorization state for validation with TTL. Args: gateway_id: ID of the gateway state: State parameter to store + code_verifier: Optional PKCE code verifier (RFC 7636) """ expires_at = datetime.now(timezone.utc) + timedelta(seconds=STATE_TTL_SECONDS) settings = get_settings() @@ -532,7 +657,7 @@ async def _store_authorization_state(self, gateway_id: str, state: str) -> None: if redis: try: state_key = f"oauth:state:{gateway_id}:{state}" - state_data = {"state": state, "gateway_id": gateway_id, "expires_at": expires_at.isoformat(), "used": False} + state_data = {"state": state, "gateway_id": gateway_id, "code_verifier": code_verifier, "expires_at": expires_at.isoformat(), "used": False} # Store in Redis with TTL await redis.setex(state_key, STATE_TTL_SECONDS, json.dumps(state_data)) logger.debug(f"Stored OAuth state in Redis for gateway {gateway_id}") @@ -552,8 +677,8 @@ async def _store_authorization_state(self, gateway_id: str, state: str) -> None: # Clean up expired states first db.query(OAuthState).filter(OAuthState.expires_at < datetime.now(timezone.utc)).delete() - # Store new state - oauth_state = OAuthState(gateway_id=gateway_id, state=state, expires_at=expires_at, used=False) + # Store new state with code_verifier + oauth_state = OAuthState(gateway_id=gateway_id, state=state, code_verifier=code_verifier, expires_at=expires_at, used=False) db.add(oauth_state) db.commit() logger.debug(f"Stored OAuth state in database for gateway {gateway_id}") @@ -568,7 +693,7 @@ async def _store_authorization_state(self, gateway_id: str, state: str) -> None: # Clean up expired states first now = datetime.now(timezone.utc) state_key = f"oauth:state:{gateway_id}:{state}" - state_data = {"state": state, "gateway_id": gateway_id, "expires_at": expires_at.isoformat(), "used": False} + state_data = {"state": state, "gateway_id": gateway_id, "code_verifier": code_verifier, 
"expires_at": expires_at.isoformat(), "used": False} expired_states = [key for key, data in _oauth_states.items() if datetime.fromisoformat(data["expires_at"]) < now] for key in expired_states: del _oauth_states[key] @@ -704,6 +829,107 @@ async def _validate_authorization_state(self, gateway_id: str, state: str) -> bo logger.debug(f"Successfully validated OAuth state from memory for gateway {gateway_id}") return True + async def _validate_and_retrieve_state(self, gateway_id: str, state: str) -> Optional[Dict[str, Any]]: + """Validate state and return full state data including code_verifier. + + Args: + gateway_id: ID of the gateway + state: State parameter to validate + + Returns: + Dict with state data including code_verifier, or None if invalid/expired + """ + settings = get_settings() + + # Try Redis first + if settings.cache_type == "redis": + redis = await _get_redis_client() + if redis: + try: + state_key = f"oauth:state:{gateway_id}:{state}" + state_json = await redis.getdel(state_key) # Atomic get+delete + if not state_json: + return None + + state_data = json.loads(state_json) + + # Check expiration + try: + expires_at = datetime.fromisoformat(state_data["expires_at"]) + except Exception: + expires_at = datetime.strptime(state_data["expires_at"], "%Y-%m-%dT%H:%M:%S") + + if expires_at.tzinfo is None: + expires_at = expires_at.replace(tzinfo=timezone.utc) + + if expires_at < datetime.now(timezone.utc): + return None + + return state_data + except Exception as e: + logger.warning(f"Failed to validate state in Redis: {e}, falling back") + + # Try database + if settings.cache_type == "database": + try: + # First-Party + from mcpgateway.db import get_db, OAuthState # pylint: disable=import-outside-toplevel + + db_gen = get_db() + db = next(db_gen) + try: + oauth_state = db.query(OAuthState).filter(OAuthState.gateway_id == gateway_id, OAuthState.state == state).first() + + if not oauth_state: + return None + + # Check expiration + expires_at = 
oauth_state.expires_at + if expires_at.tzinfo is None: + expires_at = expires_at.replace(tzinfo=timezone.utc) + + if expires_at < datetime.now(timezone.utc): + db.delete(oauth_state) + db.commit() + return None + + # Check if already used + if oauth_state.used: + return None + + # Build state data + state_data = {"state": oauth_state.state, "gateway_id": oauth_state.gateway_id, "code_verifier": oauth_state.code_verifier, "expires_at": oauth_state.expires_at.isoformat()} + + # Mark as used and delete + db.delete(oauth_state) + db.commit() + + return state_data + finally: + db_gen.close() + except Exception as e: + logger.warning(f"Failed to validate state in database: {e}") + + # Fallback to in-memory + state_key = f"oauth:state:{gateway_id}:{state}" + async with _state_lock: + state_data = _oauth_states.get(state_key) + if not state_data: + return None + + # Check expiration + expires_at = datetime.fromisoformat(state_data["expires_at"]) + if expires_at.tzinfo is None: + expires_at = expires_at.replace(tzinfo=timezone.utc) + + if expires_at < datetime.now(timezone.utc): + del _oauth_states[state_key] + return None + + # Remove from memory (single-use) + del _oauth_states[state_key] + return state_data + def _create_authorization_url(self, credentials: Dict[str, Any], state: str) -> tuple[str, str]: """Create authorization URL with state parameter. @@ -727,12 +953,44 @@ def _create_authorization_url(self, credentials: Dict[str, Any], state: str) -> return auth_url, state - async def _exchange_code_for_tokens(self, credentials: Dict[str, Any], code: str) -> Dict[str, Any]: - """Exchange authorization code for tokens. + def _create_authorization_url_with_pkce(self, credentials: Dict[str, Any], state: str, code_challenge: str, code_challenge_method: str) -> str: + """Create authorization URL with PKCE parameters (RFC 7636). 
+ + Args: + credentials: OAuth configuration + state: State parameter for CSRF protection + code_challenge: PKCE code challenge + code_challenge_method: PKCE method (S256) + + Returns: + Authorization URL string with PKCE parameters + """ + # Standard + from urllib.parse import urlencode # pylint: disable=import-outside-toplevel + + client_id = credentials["client_id"] + redirect_uri = credentials["redirect_uri"] + authorization_url = credentials["authorization_url"] + scopes = credentials.get("scopes", []) + + # Build authorization parameters + params = {"response_type": "code", "client_id": client_id, "redirect_uri": redirect_uri, "state": state, "code_challenge": code_challenge, "code_challenge_method": code_challenge_method} + + # Add scopes if present + if scopes: + params["scope"] = " ".join(scopes) if isinstance(scopes, list) else scopes + + # Build full URL + query_string = urlencode(params) + return f"{authorization_url}?{query_string}" + + async def _exchange_code_for_tokens(self, credentials: Dict[str, Any], code: str, code_verifier: str = None) -> Dict[str, Any]: + """Exchange authorization code for tokens with PKCE support. 
Args: credentials: OAuth configuration code: Authorization code from callback + code_verifier: Optional PKCE code verifier (RFC 7636) Returns: Token response dictionary @@ -741,12 +999,12 @@ async def _exchange_code_for_tokens(self, credentials: Dict[str, Any], code: str OAuthError: If token exchange fails """ client_id = credentials["client_id"] - client_secret = credentials["client_secret"] + client_secret = credentials.get("client_secret") # Optional for public clients (PKCE-only) token_url = credentials["token_url"] redirect_uri = credentials["redirect_uri"] - # Decrypt client secret if it's encrypted - if len(client_secret) > 50: # Simple heuristic: encrypted secrets are longer + # Decrypt client secret if it's encrypted and present + if client_secret and len(client_secret) > 50: # Simple heuristic: encrypted secrets are longer try: settings = get_settings() encryption = get_oauth_encryption(settings.auth_encryption_secret) @@ -765,9 +1023,16 @@ async def _exchange_code_for_tokens(self, credentials: Dict[str, Any], code: str "code": code, "redirect_uri": redirect_uri, "client_id": client_id, - "client_secret": client_secret, } + # Only include client_secret if present (public clients don't have secrets) + if client_secret: + token_data["client_secret"] = client_secret + + # Add PKCE code_verifier if present (RFC 7636) + if code_verifier: + token_data["code_verifier"] = code_verifier + # Exchange code for token with retries for attempt in range(self.max_retries): try: diff --git a/mcpgateway/services/tool_service.py b/mcpgateway/services/tool_service.py index d84fc4890..f6287bde2 100644 --- a/mcpgateway/services/tool_service.py +++ b/mcpgateway/services/tool_service.py @@ -1533,10 +1533,9 @@ async def create_tool_from_a2a_agent( input_schema={ "type": "object", "properties": { - "parameters": {"type": "object", "description": "Parameters to pass to the A2A agent"}, - "interaction_type": {"type": "string", "description": "Type of interaction", "default": 
"query"}, + "query": {"type": "string", "description": "User query", "default": "Hello from MCP Gateway Admin UI test!"}, }, - "required": ["parameters"], + "required": ["query"], }, allow_auto=True, annotations={ @@ -1626,11 +1625,11 @@ async def _call_a2a_agent(self, agent: DbA2AAgent, parameters: Dict[str, Any]): # Patch: Build correct JSON-RPC params structure from flat UI input params = None # If UI sends flat fields, convert to nested message structure - if isinstance(parameters, dict) and "parameters" in parameters and "interaction_type" in parameters and isinstance(parameters["interaction_type"], str): + if isinstance(parameters, dict) and "query" in parameters and isinstance(parameters["query"], str): # Build the nested message object message_id = f"admin-test-{int(time.time())}" - params = {"message": {"messageId": message_id, "role": "user", "parts": [{"type": "text", "text": parameters["interaction_type"]}]}} - method = parameters.get("parameters", "message/send") + params = {"message": {"messageId": message_id, "role": "user", "parts": [{"type": "text", "text": parameters["query"]}]}} + method = parameters.get("method", "message/send") else: # Already in correct format or unknown, pass through params = parameters.get("params", parameters) diff --git a/mcpgateway/static/admin.js b/mcpgateway/static/admin.js index b660cbd9c..a58ff19b6 100644 --- a/mcpgateway/static/admin.js +++ b/mcpgateway/static/admin.js @@ -4798,7 +4798,7 @@ function showTab(tabName) { if (tabName === "mcp-registry") { // Load MCP Registry content const registryContent = safeGetElement( - "mcp-registry-content", + "mcp-registry-servers", ); if (registryContent) { // Always load on first visit or if showing loading message @@ -4820,7 +4820,7 @@ function showTab(tabName) { "GET", `${rootPath}/admin/mcp-registry/partial`, { - target: "#mcp-registry-content", + target: "#mcp-registry-servers", swap: "innerHTML", }, ) @@ -7683,49 +7683,15 @@ async function handleGatewayFormSubmit(e) { } // 
Handle OAuth configuration - const authType = formData.get("auth_type"); - if (authType === "oauth") { - const oauthConfig = { - grant_type: formData.get("oauth_grant_type"), - client_id: formData.get("oauth_client_id"), - client_secret: formData.get("oauth_client_secret"), - token_url: formData.get("oauth_token_url"), - scopes: formData.get("oauth_scopes") - ? formData - .get("oauth_scopes") - .split(" ") - .filter((s) => s.trim()) - : [], - }; - - // Add authorization code specific fields - if (oauthConfig.grant_type === "authorization_code") { - oauthConfig.authorization_url = formData.get( - "oauth_authorization_url", - ); - oauthConfig.redirect_uri = formData.get("oauth_redirect_uri"); - - // Add token management options - oauthConfig.token_management = { - store_tokens: formData.get("oauth_store_tokens") === "on", - auto_refresh: formData.get("oauth_auto_refresh") === "on", - refresh_threshold_seconds: 300, - }; - } - - // Remove individual OAuth fields and add as oauth_config - formData.delete("oauth_grant_type"); - formData.delete("oauth_client_id"); - formData.delete("oauth_client_secret"); - formData.delete("oauth_token_url"); - formData.delete("oauth_scopes"); - formData.delete("oauth_authorization_url"); - formData.delete("oauth_redirect_uri"); - formData.delete("oauth_store_tokens"); - formData.delete("oauth_auto_refresh"); - - formData.append("oauth_config", JSON.stringify(oauthConfig)); - } + // NOTE: OAuth config assembly is now handled by the backend (mcpgateway/admin.py) + // The backend assembles individual form fields into oauth_config with proper field names + // and supports DCR (Dynamic Client Registration) when client_id/client_secret are empty + // + // Leaving this commented for reference: + // const authType = formData.get("auth_type"); + // if (authType === "oauth") { + // ... backend handles this now ... 
+ // } formData.append("visibility", formData.get("visibility")); @@ -8307,40 +8273,15 @@ async function handleEditGatewayFormSubmit(e) { ); // Handle OAuth configuration - const authType = formData.get("auth_type"); - if (authType === "oauth") { - const oauthConfig = { - grant_type: formData.get("oauth_grant_type"), - client_id: formData.get("oauth_client_id"), - client_secret: formData.get("oauth_client_secret"), - token_url: formData.get("oauth_token_url"), - scopes: formData.get("oauth_scopes") - ? formData - .get("oauth_scopes") - .split(" ") - .filter((s) => s.trim()) - : [], - }; - - // Add authorization code specific fields - if (oauthConfig.grant_type === "authorization_code") { - oauthConfig.authorization_url = formData.get( - "oauth_authorization_url", - ); - oauthConfig.redirect_uri = formData.get("oauth_redirect_uri"); - } - - // Remove individual OAuth fields and add as oauth_config - formData.delete("oauth_grant_type"); - formData.delete("oauth_client_id"); - formData.delete("oauth_client_secret"); - formData.delete("oauth_token_url"); - formData.delete("oauth_scopes"); - formData.delete("oauth_authorization_url"); - formData.delete("oauth_redirect_uri"); - - formData.append("oauth_config", JSON.stringify(oauthConfig)); - } + // NOTE: OAuth config assembly is now handled by the backend (mcpgateway/admin.py) + // The backend assembles individual form fields into oauth_config with proper field names + // and supports DCR (Dynamic Client Registration) when client_id/client_secret are empty + // + // Leaving this commented for reference: + // const authType = formData.get("auth_type"); + // if (authType === "oauth") { + // ... backend handles this now ... 
+ // } const isInactiveCheckedBool = isInactiveChecked("gateways"); formData.append("is_inactive_checked", isInactiveCheckedBool); @@ -9365,6 +9306,8 @@ function handleAuthTypeChange() { function handleOAuthGrantTypeChange() { const grantType = this.value; const authCodeFields = safeGetElement("oauth-auth-code-fields-gw"); + const usernameField = safeGetElement("oauth-username-field-gw"); + const passwordField = safeGetElement("oauth-password-field-gw"); if (authCodeFields) { if (grantType === "authorization_code") { @@ -9392,11 +9335,48 @@ function handleOAuthGrantTypeChange() { }); } } + + // Handle password grant type fields + if (usernameField && passwordField) { + if (grantType === "password") { + usernameField.style.display = "block"; + passwordField.style.display = "block"; + + // Make username and password required for password grant + const usernameInput = safeGetElement("oauth-username-gw"); + const passwordInput = safeGetElement("oauth-password-gw"); + if (usernameInput) { + usernameInput.required = true; + } + if (passwordInput) { + passwordInput.required = true; + } + + console.log( + "Password grant flow selected - username and password are now required", + ); + } else { + usernameField.style.display = "none"; + passwordField.style.display = "none"; + + // Remove required validation for hidden fields + const usernameInput = safeGetElement("oauth-username-gw"); + const passwordInput = safeGetElement("oauth-password-gw"); + if (usernameInput) { + usernameInput.required = false; + } + if (passwordInput) { + passwordInput.required = false; + } + } + } } function handleEditOAuthGrantTypeChange() { const grantType = this.value; const authCodeFields = safeGetElement("oauth-auth-code-fields-gw-edit"); + const usernameField = safeGetElement("oauth-username-field-edit"); + const passwordField = safeGetElement("oauth-password-field-edit"); if (authCodeFields) { if (grantType === "authorization_code") { @@ -9424,6 +9404,41 @@ function 
handleEditOAuthGrantTypeChange() { }); } } + + // Handle password grant type fields + if (usernameField && passwordField) { + if (grantType === "password") { + usernameField.style.display = "block"; + passwordField.style.display = "block"; + + // Make username and password required for password grant + const usernameInput = safeGetElement("oauth-username-gw-edit"); + const passwordInput = safeGetElement("oauth-password-gw-edit"); + if (usernameInput) { + usernameInput.required = true; + } + if (passwordInput) { + passwordInput.required = true; + } + + console.log( + "Password grant flow selected - username and password are now required", + ); + } else { + usernameField.style.display = "none"; + passwordField.style.display = "none"; + + // Remove required validation for hidden fields + const usernameInput = safeGetElement("oauth-username-gw-edit"); + const passwordInput = safeGetElement("oauth-password-gw-edit"); + if (usernameInput) { + usernameInput.required = false; + } + if (passwordInput) { + passwordInput.required = false; + } + } + } } function setupSchemaModeHandlers() { diff --git a/mcpgateway/templates/admin.html b/mcpgateway/templates/admin.html index 2f9fdb8e0..51ddb937f 100644 --- a/mcpgateway/templates/admin.html +++ b/mcpgateway/templates/admin.html @@ -472,15 +472,6 @@ > 📦 MCP Registry - - ⚙️ Configuration - {% if email_auth_enabled %}