-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Automatic switch to emergency mode when metrics unavailable #424
base: main
Are you sure you want to change the base?
Changes from 1 commit
438f653
df7b7b8
52780ef
9c153b5
f4454f4
bba0747
a968c7a
691c4f4
e28d881
192fecf
4343ea9
93556df
5cda06c
0fbfff5
ae2334a
28080e3
ca36611
5677abc
a6b0318
e8454e9
3725788
ba4351e
11d8cb8
c056d82
b47578c
7d07efd
12c7b05
ed22342
bddb139
0dc2749
dd7c10a
2e261c4
7c0e997
5be0c1a
d8ae58d
46d71a9
e41aab4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -286,19 +286,23 @@ func (c *Service) GetHPAOnTortoiseSpec(ctx context.Context, tortoise *autoscalin | |||||
return hpa, nil | ||||||
} | ||||||
|
||||||
func (c *Service) GetHPAOnTortoise(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise) (*v2.HorizontalPodAutoscaler, error) { | ||||||
func (c *Service) GetHPAOnTortoise(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise) (*v2.HorizontalPodAutoscaler, bool, error) { | ||||||
if !HasHorizontal(tortoise) { | ||||||
// there should be no HPA | ||||||
return nil, nil | ||||||
return nil, true, nil | ||||||
} | ||||||
hpa := &v2.HorizontalPodAutoscaler{} | ||||||
if err := c.c.Get(ctx, types.NamespacedName{Namespace: tortoise.Namespace, Name: tortoise.Status.Targets.HorizontalPodAutoscaler}, hpa); err != nil { | ||||||
return nil, fmt.Errorf("failed to get hpa on tortoise: %w", err) | ||||||
return nil, false, fmt.Errorf("failed to get hpa on tortoise: %w", err) | ||||||
} | ||||||
if reflect.DeepEqual(hpa.Status, v2.HorizontalPodAutoscalerStatus{}) || hpa.Status.Conditions == nil || hpa.Status.CurrentMetrics == nil { | ||||||
// Most likely, HPA is just created and not yet handled by HPA controller. | ||||||
return nil, false, nil | ||||||
} | ||||||
|
||||||
recordHPAMetric(ctx, tortoise, hpa) | ||||||
|
||||||
return hpa, nil | ||||||
return hpa, true, nil | ||||||
} | ||||||
|
||||||
func (s *Service) UpdatingHPATargetUtilizationAllowed(tortoise *autoscalingv1beta3.Tortoise, now time.Time) (*autoscalingv1beta3.Tortoise, bool) { | ||||||
|
@@ -498,33 +502,32 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy( | |||||
givenHPA *v2.HorizontalPodAutoscaler, | ||||||
replicaNum int32, | ||||||
now time.Time, | ||||||
) (*autoscalingv1beta3.Tortoise, bool, error) { | ||||||
hpaCreated := false | ||||||
) (*autoscalingv1beta3.Tortoise, error) { | ||||||
if tortoise.Spec.UpdateMode == autoscalingv1beta3.UpdateModeOff { | ||||||
// When UpdateMode is Off, we don't update HPA. | ||||||
return tortoise, hpaCreated, nil | ||||||
return tortoise, nil | ||||||
} | ||||||
|
||||||
if !HasHorizontal(tortoise) { | ||||||
if tortoise.Spec.TargetRefs.HorizontalPodAutoscalerName == nil { | ||||||
// HPA should be created by Tortoise, which can be deleted. | ||||||
err := c.DeleteHPACreatedByTortoise(ctx, tortoise) | ||||||
if err != nil && !apierrors.IsNotFound(err) { | ||||||
return tortoise, hpaCreated, fmt.Errorf("delete hpa created by tortoise: %w", err) | ||||||
return tortoise, fmt.Errorf("delete hpa created by tortoise: %w", err) | ||||||
} | ||||||
c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPADeleted, fmt.Sprintf("Deleted a HPA %s/%s because tortoise has no resource to scale horizontally", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler)) | ||||||
} else { | ||||||
// We cannot delete the HPA because it's specified by the user. | ||||||
err := c.disableHPA(ctx, tortoise, replicaNum) | ||||||
if err != nil { | ||||||
return tortoise, hpaCreated, fmt.Errorf("disable hpa: %w", err) | ||||||
return tortoise, fmt.Errorf("disable hpa: %w", err) | ||||||
} | ||||||
c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPADisabled, fmt.Sprintf("Disabled a HPA %s/%s because tortoise has no resource to scale horizontally", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler)) | ||||||
} | ||||||
|
||||||
// No need to edit container resource phase. | ||||||
|
||||||
return tortoise, hpaCreated, nil | ||||||
return tortoise, nil | ||||||
} | ||||||
|
||||||
hpa := &v2.HorizontalPodAutoscaler{} | ||||||
|
@@ -536,23 +539,22 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy( | |||||
// - In that case, we need to create an initial HPA or give an annotation to existing HPA. | ||||||
tortoise, err = c.InitializeHPA(ctx, tortoise, replicaNum, now) | ||||||
if err != nil { | ||||||
return tortoise, hpaCreated, fmt.Errorf("initialize hpa: %w", err) | ||||||
return tortoise, fmt.Errorf("initialize hpa: %w", err) | ||||||
} | ||||||
hpaCreated = true | ||||||
|
||||||
c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPACreated, fmt.Sprintf("Initialized a HPA %s/%s because tortoise has resource to scale horizontally", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler)) | ||||||
return tortoise, hpaCreated, nil | ||||||
return tortoise, nil | ||||||
} | ||||||
|
||||||
return tortoise, hpaCreated, fmt.Errorf("failed to get hpa on tortoise: %w", err) | ||||||
return tortoise, fmt.Errorf("failed to get hpa on tortoise: %w", err) | ||||||
} | ||||||
|
||||||
var newhpa *v2.HorizontalPodAutoscaler | ||||||
var isHpaEdited bool | ||||||
newhpa, tortoise, isHpaEdited = c.syncHPAMetricsWithTortoiseAutoscalingPolicy(ctx, tortoise, hpa, now) | ||||||
if !isHpaEdited { | ||||||
// User didn't change anything. | ||||||
return tortoise, hpaCreated, nil | ||||||
return tortoise, nil | ||||||
} | ||||||
|
||||||
retryNumber := -1 | ||||||
|
@@ -571,12 +573,12 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy( | |||||
} | ||||||
|
||||||
if err := retry.RetryOnConflict(retry.DefaultRetry, updateFn); err != nil { | ||||||
return tortoise, hpaCreated, fmt.Errorf("update hpa: %w (%v times retried)", err, replicaNum) | ||||||
return tortoise, fmt.Errorf("update hpa: %w (%v times retried)", err, replicaNum) | ||||||
} | ||||||
|
||||||
c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPAUpdated, fmt.Sprintf("Updated a HPA %s/%s because the autoscaling policy is changed in the tortoise", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler)) | ||||||
|
||||||
return tortoise, hpaCreated, nil | ||||||
return tortoise, nil | ||||||
} | ||||||
|
||||||
func HasHorizontal(tortoise *autoscalingv1beta3.Tortoise) bool { | ||||||
|
@@ -769,25 +771,18 @@ func (c *Service) excludeExternalMetric(ctx context.Context, hpa *v2.HorizontalP | |||||
return newHPA | ||||||
} | ||||||
|
||||||
func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.HorizontalPodAutoscaler) (bool, error) { | ||||||
func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.HorizontalPodAutoscaler) bool { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, can we do like this instead? It would eliminate the logic from the controller layer. func (c *Service) UpdateTortoisePhaseIfHPAIsUnhealthy(ctx context.Context, currenthpa *v2.HorizontalPodAutoscaler, tortoise *v1beta1.Tortoise) error {
...
if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" {
//switch to Emergency mode since no metrics
logger.Info("HPA failed to get resource metrics, switch to emergency mode")
tortoise.Status.TortoisePhase = v1beta3.TortoisePhaseEmergency
return nil
}
} Then, probably it'd make more sense to move inside the tortoise service, instead of hpa service. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wont this add extra responsibilities to tortoise service if the service has to manage hpa conditions as well? Would it be better to add a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Hmm, that's understandable. So,
this direction looks good.
randytqwjp marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
//currenthpa = currenthpa.DeepCopy() | ||||||
randytqwjp marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
logger := log.FromContext(ctx) | ||||||
if currenthpa == nil { | ||||||
logger.Info("empty HPA passed into status check, ignore") | ||||||
return true, nil | ||||||
} | ||||||
|
||||||
if reflect.DeepEqual(currenthpa.Status, v2.HorizontalPodAutoscalerStatus{}) { | ||||||
return false, fmt.Errorf("HPA empty status, switch to emergency mode") | ||||||
return true | ||||||
} | ||||||
|
||||||
if currenthpa.Status.Conditions == nil { | ||||||
return false, fmt.Errorf("HPA empty conditions, switch to emergency mode") | ||||||
if reflect.DeepEqual(currenthpa.Status, v2.HorizontalPodAutoscalerStatus{}) || currenthpa.Status.Conditions == nil || currenthpa.Status.CurrentMetrics == nil { | ||||||
return true | ||||||
} | ||||||
|
||||||
if currenthpa.Status.CurrentMetrics == nil { | ||||||
return false, fmt.Errorf("HPA no metrics, switch to emergency mode") | ||||||
} | ||||||
conditions := currenthpa.Status.Conditions | ||||||
currentMetrics := currenthpa.Status.CurrentMetrics | ||||||
|
||||||
|
@@ -796,7 +791,7 @@ func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.Horiz | |||||
if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" { | ||||||
//switch to Emergency mode since no metrics | ||||||
logger.Info("HPA failed to get resource metrics, switch to emergency mode") | ||||||
return false, nil | ||||||
return false | ||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -805,14 +800,14 @@ func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.Horiz | |||||
for _, currentMetric := range currentMetrics { | ||||||
if !currentMetric.ContainerResource.Current.Value.IsZero() { | ||||||
//Can still get metrics for some containers, scale based on those | ||||||
randytqwjp marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
return true, nil | ||||||
return true | ||||||
} | ||||||
} | ||||||
logger.Info("HPA all metrics return 0, switch to emergency mode") | ||||||
randytqwjp marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
return false, nil | ||||||
return false | ||||||
} | ||||||
|
||||||
logger.Info("HPA status check passed") | ||||||
|
||||||
return true, nil | ||||||
return true | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: we don't need to define this here?