Skip to content

Commit

Permalink
review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
k-anshul committed Oct 15, 2024
1 parent 7dd7ad5 commit f046a64
Show file tree
Hide file tree
Showing 4 changed files with 413 additions and 144 deletions.
14 changes: 7 additions & 7 deletions admin/worker/deployments_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,26 +183,26 @@ func (w *Worker) deploymentHealthCheck(ctx context.Context, d *database.Deployme
// log metrics view errors
logger := w.logger.WithLazy(f...)
if health.ParseErrorCount > 0 || health.ReconcileErrorCount > 0 {
logger.Error("deployment health check: project has parse/reconcile errors", zap.Int32("parse_errors", health.ParseErrorCount), zap.Int32("reconcile_errors", health.ReconcileErrorCount))
logger.Warn("deployment health check: project has parse/reconcile errors", zap.Int32("parse_errors", health.ParseErrorCount), zap.Int32("reconcile_errors", health.ReconcileErrorCount))
}
for d, err := range health.MetricsViewErrors {
logger.Error("deployment health check: dashboard error", zap.String("dashboard", d), zap.String("error", err))
logger.Warn("deployment health check: metrics view error", zap.String("metrics_view", d), zap.String("error", err))
}

onlyUnhealthyDash := true
logError := false
if health.OlapError != "" {
onlyUnhealthyDash = false
logError = true
f = append(f, zap.String("olap_error", health.OlapError))
}
if health.ControllerError != "" {
onlyUnhealthyDash = false
logError = true
f = append(f, zap.String("controller_error", health.ControllerError))
}
if health.RepoError != "" {
onlyUnhealthyDash = false
logError = true
f = append(f, zap.String("repo_error", health.RepoError))
}
if onlyUnhealthyDash {
if !logError {
continue
}
w.logger.Error("deployment health check: runtime instance is unhealthy", f...)
Expand Down
71 changes: 38 additions & 33 deletions runtime/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,26 @@ type Health struct {
InstancesHealth map[string]*InstanceHealth
}

// InstanceHealth contains health information for a single instance.
// The information about OLAP and metrics views is cached in the catalog.
// We want to avoid hitting the underlying OLAP engine when OLAP engine can scale to zero when no queries are generated within TTL.
// We do not want to keep it running just to check health. In such cases, we use the cached health information.
type InstanceHealth struct {
// always recomputed
Controller string
Repo string
ParseErrCount int
ReconcileErrCount int

// cached
OLAP string
MetricsViews map[string]metricsViewHealth
// always recomputed on every health check
Controller string `json:"-"`
Repo string `json:"-"`
ParseErrCount int `json:"-"`
ReconcileErrCount int `json:"-"`

// cached health check information can be used if controller version is same and metrics view spec is same
OLAP string `json:"olap"`
MetricsViews map[string]InstanceHealthMetricsViewError `json:"metrics_views"`
ControllerVersion int64 `json:"controller_version"`
}

ControllerVersion int64
type InstanceHealthMetricsViewError struct {
Err string `json:"err"`
Version int64 `json:"version"`
}

func (r *Runtime) Health(ctx context.Context) (*Health, error) {
Expand Down Expand Up @@ -91,22 +99,26 @@ func (r *Runtime) InstanceHealth(ctx context.Context, instanceID string) (*Insta
release()
}

// run queries against metrics views
resources, err := ctrl.List(ctx, ResourceKindMetricsView, "", false)
// check resources with reconcile errors
resources, err := ctrl.List(ctx, "", "", false)
if err != nil {
return nil, err
}
res.MetricsViews = make(map[string]metricsViewHealth, len(resources))
for _, r := range resources {
if r.Meta.ReconcileError != "" {
res.ReconcileErrCount++
}
}

// run queries against metrics views
res.MetricsViews = make(map[string]InstanceHealthMetricsViewError, len(resources))
for _, mv := range resources {
if mv.GetMetricsView().State.ValidSpec == nil {
if mv.Meta.ReconcileError != "" {
res.ReconcileErrCount++
}
if mv.GetMetricsView() == nil || mv.GetMetricsView().State.ValidSpec == nil {
continue
}
olap, release, err := r.OLAP(ctx, instanceID, mv.GetMetricsView().State.ValidSpec.Connector)
if err != nil {
res.MetricsViews[mv.Meta.Name.Name] = metricsViewHealth{err: err.Error()}
res.MetricsViews[mv.Meta.Name.Name] = InstanceHealthMetricsViewError{Err: err.Error()}
release()
continue
}
Expand All @@ -130,17 +142,15 @@ func (r *Runtime) InstanceHealth(ctx context.Context, instanceID string) (*Insta
Claims: &SecurityClaims{SkipChecks: true},
})

mvHealth := metricsViewHealth{Version: mv.Meta.StateVersion}
mvHealth := InstanceHealthMetricsViewError{
Version: mv.Meta.StateVersion,
}
if err != nil {
mvHealth.err = err.Error()
mvHealth.Err = err.Error()
}
res.MetricsViews[mv.Meta.Name.Name] = mvHealth
}

if !canScaleToZero {
return res, nil
}

// save to cache
res.ControllerVersion = ctrl.catalog.version
bytes, err := json.Marshal(res)
Expand All @@ -156,7 +166,7 @@ func (r *Runtime) InstanceHealth(ctx context.Context, instanceID string) (*Insta

err = catalog.UpsertInstanceHealth(ctx, &drivers.InstanceHealth{
InstanceID: instanceID,
Health: bytes,
HealthJSON: bytes,
})
if err != nil {
return nil, err
Expand All @@ -178,7 +188,7 @@ func (r *Runtime) cachedInstanceHealth(ctx context.Context, instanceID string, c
}

c := &InstanceHealth{}
err = json.Unmarshal(cached.Health, c)
err = json.Unmarshal(cached.HealthJSON, c)
if err != nil || ctrlVersion != c.ControllerVersion {
return nil, false
}
Expand Down Expand Up @@ -219,14 +229,9 @@ func (h *InstanceHealth) To() *runtimev1.InstanceHealth {
}
r.MetricsViewErrors = make(map[string]string, len(h.MetricsViews))
for k, v := range h.MetricsViews {
if v.err != "" {
r.MetricsViewErrors[k] = v.err
if v.Err != "" {
r.MetricsViewErrors[k] = v.Err
}
}
return r
}

type metricsViewHealth struct {
err string
Version int64
}
Loading

0 comments on commit f046a64

Please sign in to comment.