Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Calculate cpu and mem savings in tortoise #427

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
8 changes: 4 additions & 4 deletions internal/controller/tortoise_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_
return ctrl.Result{RequeueAfter: r.Interval}, nil
}

_, tortoise, err = r.HpaService.UpdateHPAFromTortoiseRecommendation(ctx, tortoise, now)
tortoise, err = r.TortoiseService.UpdateResourceRequest(ctx, tortoise, currentDesiredReplicaNum, now)
if err != nil {
logger.Error(err, "update HPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
logger.Error(err, "update VPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
return ctrl.Result{}, err
}

tortoise, err = r.TortoiseService.UpdateResourceRequest(ctx, tortoise, currentDesiredReplicaNum, now)
_, tortoise, err = r.HpaService.UpdateHPAFromTortoiseRecommendation(ctx, tortoise, now)
if err != nil {
logger.Error(err, "update VPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
logger.Error(err, "update HPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
return ctrl.Result{}, err
}

Expand Down
33 changes: 24 additions & 9 deletions pkg/hpa/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet
recommendMax = c.maximumMaxReplica
}

oldMax := hpa.Spec.MaxReplicas
hpa.Spec.MaxReplicas = recommendMax

recommendMin, err := GetReplicasRecommendation(tortoise.Status.Recommendations.Horizontal.MinReplicas, now)
Expand Down Expand Up @@ -443,19 +444,33 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet
minToActuallyApply = recommendMin
}

oldMin := *hpa.Spec.MinReplicas
hpa.Spec.MinReplicas = &minToActuallyApply
if tortoise.Spec.UpdateMode != autoscalingv1beta3.UpdateModeOff && recordMetrics {
// We don't want to record applied* metric when UpdateMode is Off.
netChangeMaxReplicas := float64(hpa.Spec.MaxReplicas - recommendMax)
netChangeMinReplicas := float64(*hpa.Spec.MinReplicas) - float64(recommendMin)
if netChangeMaxReplicas > 0 || netChangeMinReplicas < 0 {
metrics.IncreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
}
if netChangeMaxReplicas < 0 || netChangeMinReplicas > 0 {
metrics.DecreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
netChangeMaxReplicas := float64(recommendMax - oldMax)
netChangeMinReplicas := float64(recommendMin - oldMin)
cpu := float64(0)
mem := float64(0)
for _, r := range tortoise.Status.Conditions.ContainerResourceRequests {
for resourcename, value := range r.Resource {
if resourcename == corev1.ResourceCPU {
cpu += value.AsApproximateFloat64()
}
if resourcename == corev1.ResourceMemory {
mem += value.AsApproximateFloat64()
}
}
}
metrics.NetHPAMinReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Name).Set(netChangeMinReplicas)
metrics.NetHPAMaxReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Name).Set(netChangeMaxReplicas)
netChangeMaxReplicasCpu := netChangeMaxReplicas * cpu
netChangeMinReplicasCpu := netChangeMinReplicas * cpu
netChangeMinReplicasMem := netChangeMinReplicas * mem
netChangeMaxReplicasMem := netChangeMaxReplicas * mem

metrics.NetHPAMinReplicasCPUCores.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMinReplicasCpu)
metrics.NetHPAMaxReplicasCPUCores.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMaxReplicasCpu)
metrics.NetHPAMinReplicasMemory.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMinReplicasMem)
metrics.NetHPAMaxReplicasMemory.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMaxReplicasMem)
metrics.AppliedHPAMinReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(float64(*hpa.Spec.MinReplicas))
metrics.AppliedHPAMaxReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(float64(hpa.Spec.MaxReplicas))
}
Expand Down
50 changes: 25 additions & 25 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,35 +46,35 @@ var (
Help: "memory request (byte) that tortoises actually applys",
}, []string{"tortoise_name", "namespace", "container_name", "controller_name", "controller_kind"})

DecreaseApplyCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "decrease_apply_counter",
Help: "counter for number of resource decreases applied by tortoise",
}, []string{"tortoise_name", "namespace"})

IncreaseApplyCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "increase_apply_counter",
Help: "counter for number of resource increases applied by tortoise",
}, []string{"tortoise_name", "namespace"})

NetHPAMinReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_minreplicas",
Help: "net hpa minReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name", "kube_deployment"})

NetHPAMaxReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_maxreplicas",
Help: "net hpa maxReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name", "kube_deployment"})
NetHPAMinReplicasCPUCores = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_minreplicas_cpu_cores",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "net" mean?

Help: "net cpu cores changed by minReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetHPAMinReplicasMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_minreplicas_memory",
Help: "net memory changed by minReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetHPAMaxReplicasCPUCores = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_maxreplicas_cpu_cores",
Help: "net cpu cores changed by maxReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetHPAMaxReplicasMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_maxreplicas_memory",
Help: "net memory changed by maxReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetCPURequest = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_cpu_request",
Help: "net cpu request (millicore) that tortoises actually applys",
}, []string{"tortoise_name", "namespace", "container_name", "controller_name", "controller_kind"})
}, []string{"tortoise_name", "namespace", "container_name", "kube_deployment", "controller_kind"})
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we have to change it? I do prefer controller_name, which allows us to support other kinds of resources (ReplicaSet, etc.) in the future, and is also consistent with GKE metrics.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, but which controller is it referring to? In the code it actually refers to the deployment name, so I just thought controller_name was a little confusing. But since you mentioned the breaking change, we could leave it as is, because there's no specific need to change it.

Copy link
Collaborator

@sanposhiho sanposhiho Jan 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I remember that I initially wanted to use owner_name, but on second thought I changed it to controller_name, because controller_name is used in GKE metrics. Generally speaking, it's much easier to use the same label names as other metrics as much as possible when you build a dashboard etc. in Datadog (you can aggregate by the common labels).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, make a complaint to GKE 😅


NetMemoryRequest = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_memory_request",
Help: "net memory request (byte) that tortoises actually applys",
}, []string{"tortoise_name", "namespace", "container_name", "controller_name", "controller_kind"})
}, []string{"tortoise_name", "namespace", "container_name", "kube_deployment", "controller_kind"})

ProposedHPATargetUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "proposed_hpa_utilization_target",
Expand Down Expand Up @@ -117,10 +117,10 @@ func init() {
AppliedHPAMinReplicas,
AppliedCPURequest,
AppliedMemoryRequest,
IncreaseApplyCounter,
DecreaseApplyCounter,
NetHPAMaxReplicas,
NetHPAMinReplicas,
NetHPAMinReplicasCPUCores,
NetHPAMinReplicasMemory,
NetHPAMaxReplicasCPUCores,
NetHPAMaxReplicasMemory,
NetCPURequest,
NetMemoryRequest,
ProposedHPATargetUtilization,
Expand Down
10 changes: 2 additions & 8 deletions pkg/tortoise/tortoise.go
Original file line number Diff line number Diff line change
Expand Up @@ -768,21 +768,15 @@ func (c *Service) UpdateResourceRequest(ctx context.Context, tortoise *v1beta3.T
// only record metrics once in every reconcile loop.
for resourcename, value := range r.Resource {
oldRequest := oldRequestMap[r.ContainerName][resourcename]
netChange := float64(oldRequest.MilliValue() - value.MilliValue())
netChange := float64(oldRequest.MilliValue()-value.MilliValue()) * float64(replica)
if resourcename == corev1.ResourceCPU {
// We don't want to record applied* metric when UpdateMode is Off.
metrics.AppliedCPURequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(float64(value.MilliValue()))
metrics.NetCPURequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(netChange)
}
if resourcename == corev1.ResourceMemory {
metrics.AppliedMemoryRequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(float64(value.Value()))
metrics.NetMemoryRequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(float64(netChange))
}
if netChange > 0 {
metrics.IncreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
}
if netChange < 0 {
metrics.DecreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
metrics.NetMemoryRequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(netChange / float64(1000))
}
}
}
Expand Down
Loading