Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmd/vmcp/app/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,14 @@ func runServe(cmd *cobra.Command, _ []string) error {
logger.Info("Health monitoring configured from operational settings")
}

// Extract partial failure mode from config (with default)
partialFailureMode := vmcp.PartialFailureModeStrict // default to strict mode
if cfg.Operational != nil &&
cfg.Operational.FailureHandling != nil &&
cfg.Operational.FailureHandling.PartialFailureMode != "" {
partialFailureMode = cfg.Operational.FailureHandling.PartialFailureMode
}

serverCfg := &vmcpserver.Config{
Name: cfg.Name,
Version: getVersion(),
Expand All @@ -388,6 +396,7 @@ func runServe(cmd *cobra.Command, _ []string) error {
TelemetryProvider: telemetryProvider,
AuditConfig: cfg.Audit,
HealthMonitorConfig: healthMonitorConfig,
PartialFailureMode: partialFailureMode,
}

// Convert composite tool configurations to workflow definitions
Expand Down
6 changes: 4 additions & 2 deletions pkg/vmcp/config/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"time"

"dario.cat/mergo"

"github.com/stacklok/toolhive/pkg/vmcp"
)

// Default constants for operational configuration.
Expand All @@ -18,8 +20,8 @@ const (
defaultUnhealthyThreshold = 3

// defaultPartialFailureMode defines the default behavior when some backends fail.
// "fail" means the entire request fails if any backend is unavailable.
defaultPartialFailureMode = "fail"
// Strict mode excludes degraded, unhealthy, and unauthenticated backends.
defaultPartialFailureMode = vmcp.PartialFailureModeStrict

// defaultTimeoutDefault is the default timeout for backend requests.
defaultTimeoutDefault = 30 * time.Second
Expand Down
2 changes: 1 addition & 1 deletion pkg/vmcp/config/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ func (*DefaultValidator) validateFailureHandling(fh *FailureHandlingConfig) erro
return fmt.Errorf("unhealthy_threshold must be positive")
}

validModes := []string{"fail", "best_effort"}
validModes := []string{vmcp.PartialFailureModeStrict, vmcp.PartialFailureModeLenient}
if !contains(validModes, fh.PartialFailureMode) {
return fmt.Errorf("partial_failure_mode must be one of: %s", strings.Join(validModes, ", "))
}
Expand Down
34 changes: 23 additions & 11 deletions pkg/vmcp/discovery/health_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,33 @@ import (
"github.com/stacklok/toolhive/pkg/vmcp/health"
)

// FilterHealthyBackends filters backends to only include those that can handle requests.
// FilterHealthyBackends filters backends based on health status and partial failure mode.
// Backends that are unhealthy or unauthenticated are excluded from the returned list.
//
// When healthProvider is nil (health monitoring disabled), all backends are returned.
//
// Filtering logic (uses health.IsBackendUsable):
// - Include: Healthy, Degraded (slow but functional), Unknown (not yet determined)
// - Exclude: Unhealthy, Unauthenticated (cannot be used)
// Filtering logic depends on the partial failure mode:
// - "fail" mode (strict): Include Healthy, Unknown; Exclude Degraded, Unhealthy, Unauthenticated
// - "best_effort" mode (lenient): Include Healthy, Degraded, Unknown; Exclude Unhealthy, Unauthenticated
//
// Returns the filtered backend list and logs excluded backends.
func FilterHealthyBackends(backends []vmcp.Backend, healthProvider health.StatusProvider) []vmcp.Backend {
func FilterHealthyBackends(
backends []vmcp.Backend,
healthProvider health.StatusProvider,
mode string,
) []vmcp.Backend {
// If health monitoring is disabled, return all backends
// Note: Check both interface nil and typed nil (Go interface semantics)
if !health.IsProviderInitialized(healthProvider) {
return backends
}

// Default mode if not specified
if mode == "" {
mode = vmcp.PartialFailureModeStrict // Default to strict mode for safety
logger.Debugf("No partial failure mode specified, using default: %s", mode)
}

filtered := make([]vmcp.Backend, 0, len(backends))
excludedCount := 0

Expand All @@ -33,26 +43,28 @@ func FilterHealthyBackends(backends []vmcp.Backend, healthProvider health.Status

// Include backend if:
// 1. Health status cannot be determined (err != nil) - assume healthy during transitions
// 2. Status indicates backend can handle requests (uses health.IsBackendUsable)
// 2. Status indicates backend can handle requests in the given mode
if err != nil {
// Backend not found in health monitor yet (new backend) - include it
logger.Debugf("Backend %s not found in health monitor, including in capabilities", backend.Name)
filtered = append(filtered, *backend)
continue
}

if health.IsBackendUsable(status) {
// Include usable backends (Healthy, Degraded, Unknown)
if health.IsBackendUsableInMode(status, mode) {
// Include usable backends based on mode
filtered = append(filtered, *backend)
} else {
// Exclude unusable backends (Unhealthy, Unauthenticated)
logger.Infof("Excluding backend %s from capabilities (status: %s)", backend.Name, status)
// Exclude unusable backends
logger.Infof("Excluding backend %s from capabilities (status: %s, mode: %s)",
backend.Name, status, mode)
excludedCount++
}
}

if excludedCount > 0 {
logger.Infof("Health filtering: %d backends included, %d backends excluded", len(filtered), excludedCount)
logger.Infof("Health filtering (%s mode): %d backends included, %d backends excluded",
mode, len(filtered), excludedCount)
}

return filtered
Expand Down
160 changes: 147 additions & 13 deletions pkg/vmcp/discovery/health_filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func TestFilterHealthyBackends_NoHealthMonitoring(t *testing.T) {
}

// When health monitoring is disabled (nil provider), all backends should be returned
filtered := FilterHealthyBackends(backends, nil)
filtered := FilterHealthyBackends(backends, nil, "fail")

assert.Len(t, filtered, 3, "all backends should be included when health monitoring is disabled")
assert.Equal(t, backends, filtered, "backends should be unchanged")
Expand All @@ -82,7 +82,7 @@ func TestFilterHealthyBackends_AllHealthy(t *testing.T) {
healthProvider.setStatus("backend2", vmcp.BackendHealthy)
healthProvider.setStatus("backend3", vmcp.BackendHealthy)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

assert.Len(t, filtered, 3, "all healthy backends should be included")
assert.Equal(t, backends, filtered, "all backends should be present")
Expand All @@ -104,7 +104,7 @@ func TestFilterHealthyBackends_MixedHealthStatus(t *testing.T) {
healthProvider.setStatus("backend3", vmcp.BackendDegraded)
healthProvider.setStatus("backend4", vmcp.BackendUnauthenticated)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

// Should include: healthy (backend1) and degraded (backend3)
// Should exclude: unhealthy (backend2) and unauthenticated (backend4)
Expand All @@ -127,7 +127,7 @@ func TestFilterHealthyBackends_ExcludesUnhealthy(t *testing.T) {
healthProvider.setStatus("backend2", vmcp.BackendUnhealthy)
healthProvider.setStatus("backend3", vmcp.BackendHealthy)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

require.Len(t, filtered, 2, "unhealthy backend should be excluded")
assert.Equal(t, "backend1", filtered[0].ID)
Expand All @@ -146,7 +146,7 @@ func TestFilterHealthyBackends_ExcludesUnauthenticated(t *testing.T) {
healthProvider.setStatus("backend1", vmcp.BackendHealthy)
healthProvider.setStatus("backend2", vmcp.BackendUnauthenticated)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

require.Len(t, filtered, 1, "unauthenticated backend should be excluded")
assert.Equal(t, "backend1", filtered[0].ID)
Expand All @@ -164,7 +164,7 @@ func TestFilterHealthyBackends_IncludesDegraded(t *testing.T) {
healthProvider.setStatus("backend1", vmcp.BackendHealthy)
healthProvider.setStatus("backend2", vmcp.BackendDegraded)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

// Degraded backends are still functional (just slow), so they should be included
require.Len(t, filtered, 2, "degraded backends should be included")
Expand All @@ -184,7 +184,7 @@ func TestFilterHealthyBackends_IncludesUnknown(t *testing.T) {
healthProvider.setStatus("backend1", vmcp.BackendHealthy)
healthProvider.setStatus("backend2", vmcp.BackendUnknown)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

// Unknown backends should be included (health not yet determined, give them a chance)
require.Len(t, filtered, 2, "unknown backends should be included")
Expand All @@ -204,7 +204,7 @@ func TestFilterHealthyBackends_BackendNotFoundInMonitor(t *testing.T) {
healthProvider.setStatus("backend1", vmcp.BackendHealthy)
// backend2 not in health monitor (GetBackendStatus will return error)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

// Backend not found in monitor should be included (new backend during transitions)
require.Len(t, filtered, 2, "backends not found in monitor should be included")
Expand All @@ -224,7 +224,7 @@ func TestFilterHealthyBackends_AllUnhealthy(t *testing.T) {
healthProvider.setStatus("backend1", vmcp.BackendUnhealthy)
healthProvider.setStatus("backend2", vmcp.BackendUnhealthy)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

assert.Len(t, filtered, 0, "all unhealthy backends should be excluded")
}
Expand All @@ -235,7 +235,7 @@ func TestFilterHealthyBackends_EmptyBackendList(t *testing.T) {
backends := []vmcp.Backend{}
healthProvider := newMockHealthProvider()

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

assert.Len(t, filtered, 0, "empty input should return empty output")
assert.NotNil(t, filtered, "result should not be nil")
Expand All @@ -258,7 +258,7 @@ func TestFilterHealthyBackends_PreservesBackendData(t *testing.T) {
healthProvider := newMockHealthProvider()
healthProvider.setStatus("backend1", vmcp.BackendHealthy)

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

require.Len(t, filtered, 1)
// Verify all backend data is preserved
Expand All @@ -282,7 +282,7 @@ func TestFilterHealthyBackends_ErrorRetrievingStatus(t *testing.T) {
healthProvider.setStatus("backend1", vmcp.BackendHealthy)
healthProvider.setError("backend2", errors.New("health monitor error"))

filtered := FilterHealthyBackends(backends, healthProvider)
filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

// Backend with error should be included (assume healthy during error conditions)
require.Len(t, filtered, 2, "backends with health monitor errors should be included")
Expand All @@ -304,7 +304,7 @@ func TestFilterHealthyBackends_NilTypedPointer(t *testing.T) {
var provider health.StatusProvider = nilMonitor

// Should not panic and should return all backends (health monitoring disabled)
filtered := FilterHealthyBackends(backends, provider)
filtered := FilterHealthyBackends(backends, provider, "fail")

assert.Len(t, filtered, 1, "Should return all backends when provider is nil pointer")
assert.Equal(t, "backend1", filtered[0].ID)
Expand Down Expand Up @@ -346,3 +346,137 @@ func TestIsProviderInitialized(t *testing.T) {
})
}
}

func TestFilterHealthyBackends_FailMode(t *testing.T) {
t.Parallel()

backends := []vmcp.Backend{
{ID: "b1", Name: "Backend 1"},
{ID: "b2", Name: "Backend 2"},
{ID: "b3", Name: "Backend 3"},
{ID: "b4", Name: "Backend 4"},
}

healthProvider := newMockHealthProvider()
healthProvider.setStatus("b1", vmcp.BackendHealthy)
healthProvider.setStatus("b2", vmcp.BackendDegraded)
healthProvider.setStatus("b3", vmcp.BackendUnhealthy)
healthProvider.setStatus("b4", vmcp.BackendUnauthenticated)

filtered := FilterHealthyBackends(backends, healthProvider, "fail")

// In "fail" mode: only healthy and unknown backends included
// Degraded backends are excluded in strict mode
require.Len(t, filtered, 1, "only healthy backends should be included in fail mode")
assert.Equal(t, "b1", filtered[0].ID)
}

func TestFilterHealthyBackends_BestEffortMode(t *testing.T) {
t.Parallel()

backends := []vmcp.Backend{
{ID: "b1", Name: "Backend 1"},
{ID: "b2", Name: "Backend 2"},
{ID: "b3", Name: "Backend 3"},
{ID: "b4", Name: "Backend 4"},
}

healthProvider := newMockHealthProvider()
healthProvider.setStatus("b1", vmcp.BackendHealthy)
healthProvider.setStatus("b2", vmcp.BackendDegraded)
healthProvider.setStatus("b3", vmcp.BackendUnhealthy)
healthProvider.setStatus("b4", vmcp.BackendUnauthenticated)

filtered := FilterHealthyBackends(backends, healthProvider, "best_effort")

// In "best_effort" mode: healthy, degraded, and unknown backends included
require.Len(t, filtered, 2, "healthy and degraded backends should be included in best_effort mode")
assert.Equal(t, "b1", filtered[0].ID)
assert.Equal(t, "b2", filtered[1].ID)
}

func TestFilterHealthyBackends_DefaultMode(t *testing.T) {
t.Parallel()

backends := []vmcp.Backend{
{ID: "b1", Name: "Backend 1"},
{ID: "b2", Name: "Backend 2"},
}

healthProvider := newMockHealthProvider()
healthProvider.setStatus("b1", vmcp.BackendHealthy)
healthProvider.setStatus("b2", vmcp.BackendDegraded)

// Empty mode string should default to "fail"
filtered := FilterHealthyBackends(backends, healthProvider, "")

// Default mode is "fail" (strict), so degraded is excluded
require.Len(t, filtered, 1, "empty mode should default to fail mode")
assert.Equal(t, "b1", filtered[0].ID)
}

func TestFilterHealthyBackends_UnknownMode(t *testing.T) {
t.Parallel()

backends := []vmcp.Backend{
{ID: "b1", Name: "Backend 1"},
{ID: "b2", Name: "Backend 2"},
{ID: "b3", Name: "Backend 3"},
}

healthProvider := newMockHealthProvider()
healthProvider.setStatus("b1", vmcp.BackendHealthy)
healthProvider.setStatus("b2", vmcp.BackendDegraded)
healthProvider.setStatus("b3", vmcp.BackendUnhealthy)

// Unknown mode should default to strict (fail) behavior for fail-safe operation
filtered := FilterHealthyBackends(backends, healthProvider, "unknown_mode")

// Should behave like "fail" mode (excludes degraded, includes only healthy)
require.Len(t, filtered, 1, "unknown mode should default to strict (fail) behavior")
assert.Equal(t, "b1", filtered[0].ID) // Only healthy backend
}

func TestFilterHealthyBackends_ModeWithUnknownStatus(t *testing.T) {
t.Parallel()

backends := []vmcp.Backend{
{ID: "b1", Name: "Backend 1"},
{ID: "b2", Name: "Backend 2"},
{ID: "b3", Name: "Backend 3"},
}

healthProvider := newMockHealthProvider()
healthProvider.setStatus("b1", vmcp.BackendHealthy)
healthProvider.setStatus("b2", vmcp.BackendUnknown)
healthProvider.setStatus("b3", vmcp.BackendDegraded)

tests := []struct {
name string
mode string
expected []string
}{
{
name: "fail mode includes unknown",
mode: "fail",
expected: []string{"b1", "b2"}, // Healthy and Unknown
},
{
name: "best_effort mode includes unknown",
mode: "best_effort",
expected: []string{"b1", "b2", "b3"}, // Healthy, Unknown, and Degraded
},
}

for _, tt := range tests {
tt := tt // capture range variable
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
filtered := FilterHealthyBackends(backends, healthProvider, tt.mode)
require.Len(t, filtered, len(tt.expected))
for i, expectedID := range tt.expected {
assert.Equal(t, expectedID, filtered[i].ID)
}
})
}
}
Loading
Loading