From abbc3e593092ce092dcd886d44810d08b29370cc Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Mon, 2 Oct 2023 21:38:17 +0900 Subject: [PATCH] feature: move configuration from flags to the config file (#125) --- README.md | 2 +- docs/configuration.md | 231 ++++++++++++++++++ docs/flag-configuration.md | 167 ------------- docs/horizontal.md | 76 +++--- docs/vertical.md | 4 +- go.mod | 7 +- go.sum | 14 +- main.go | 50 +--- pkg/config/config.go | 88 +++++++ pkg/config/config_test.go | 102 ++++++++ .../testdata/config-partly-override.yaml | 1 + pkg/config/testdata/config.yaml | 13 + 12 files changed, 505 insertions(+), 250 deletions(-) create mode 100644 docs/configuration.md delete mode 100644 docs/flag-configuration.md create mode 100644 pkg/config/config.go create mode 100644 pkg/config/config_test.go create mode 100644 pkg/config/testdata/config-partly-override.yaml create mode 100644 pkg/config/testdata/config.yaml diff --git a/README.md b/README.md index 8686bc15..d1214d7f 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Tortoise, then they'll prepare/keep adjusting HPA and VPA to achieve efficient a - [Horizontal scaling](./docs/horizontal.md): describes how the Tortoise does the horizontal autoscaling. - [Vertical scaling](./docs/vertical.md): describes how the Tortoise does the vertical autoscaling. - [Emergency mode](./docs/emergency.md): describes the emergency mode. -- [Flag configurations for admin](./docs/flag-configuration.md): describes how the cluster admin can configure the global behavior via flags +- [Configurations for admin](./docs/configuration.md): describes how the cluster admin can configure the global behavior via the configuration file. - [Technically details](./docs/internal.md): describes the technically details of Tortoise. (mostly for the contributors) - [Contributor guide](./docs/contributor-guide.md): describes other stuff for the contributor. 
(testing etc) diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..a4c6eb5f --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,231 @@ +## Configuration for admin + +Tortoise + +The cluster admin can set the global configurations via the configuration file. + +``` +RangeOfMinMaxReplicasRecommendationHours: The time (hours) range of minReplicas and maxReplicas recommendation (default: 1) +MinMaxReplicasRecommendationType: The type of minReplicas and maxReplicas recommendation. The valid values are "daily" and "weekly" (default: weekly) +TTLHoursOfMinMaxReplicasRecommendation: The TTL of minReplicas and maxReplicas recommendation (default: 720 (=30 days)) +MaxReplicasFactor: The factor to calculate the maxReplicas recommendation from the current replica number (default: 2.0) +MinReplicasFactor: The factor to calculate the minReplicas recommendation from the current replica number (default: 0.5) +ReplicaReductionFactor: The factor to reduce the minReplicas gradually after turning off Emergency mode (default: 0.95) +UpperTargetResourceUtilization: The max target utilization that tortoise can give to the HPA (default: 90) +MinimumMinReplicas: The minimum minReplicas that tortoise can give to the HPA (default: 3) +PreferredReplicaNumUpperLimit: The replica number which the tortoise tries to keep the replica number less than. As it says "preferred", the tortoise **tries** to keep the replicas number less than this, but the replica number may be more than this when other "required" rule will be violated by this limit. 
(default: 30) +MaximumCPUCores: The maximum CPU cores that the tortoise can give to the container (default: 10) +MaximumMemoryBytes: The maximum memory bytes that the tortoise can give to the container (default: 10Gi) +Timezone: The timezone used to record time in tortoise objects (default: Asia/Tokyo) +TortoiseUpdateInterval: The interval of updating each tortoise (default: 15s) +``` + +### RangeOfMinMaxReplicasRecommendationHours + +The time (hours) range of minReplicas and maxReplicas recommendation (default: 1) + +```yaml +kind: Tortoise +#... +status: + recommendations: + horizontal: + minReplicas: + - from: 0 + to: 1 + weekday: Sunday + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z + - from: 1 + to: 2 + weekday: Sunday + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z +``` + +### MinMaxReplicasRecommendationType + +The type of minReplicas and maxReplicas recommendation. The valid values are "daily" and "weekly" (default: weekly) + +#### "daily" + +```yaml +kind: Tortoise +#... +status: + recommendations: + horizontal: + minReplicas: + # This recommendation is from 0am to 1am on all days of week. + - from: 0 + to: 1 + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z + - from: 1 + to: 2 + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z + # ... + - from: 23 + to: 24 + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z +``` + +#### "weekly" + +```yaml +kind: Tortoise +#... +status: + recommendations: + horizontal: + minReplicas: + # This recommendation is from 0am to 1am on Sundays. + - from: 0 + to: 1 + weekday: Sunday # Recommendation is generated for each day of week. + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z + - from: 1 + to: 2 + weekday: Sunday + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z + # ... 
+ - from: 23 + to: 24 + weekday: Saturday + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z +``` + +### TTLHoursOfMinMaxReplicasRecommendation + +The TTL of minReplicas and maxReplicas recommendation (default: 720 (=30 days)) + +```yaml +kind: Tortoise +#... +status: + recommendations: + horizontal: + minReplicas: + - from: 0 + to: 1 + weekday: Sunday + timezone: Asia/Tokyo + value: 3 + updatedAt: 2023-01-01T00:00:00Z # this recommendation will expire on 2023-01-31. +``` + +### MaxReplicasFactor + +The factor to calculate the maxReplicas recommendation from the current replica number (default: 2.0) + +If the current replica number is 15 and `MaxReplicasFactor` is 2.0, +the maxReplicas recommendation from the current situation will be 30 (15 * 2.0). + +```yaml +kind: Tortoise +#... +status: + recommendations: + horizontal: + maxReplicas: + - from: 0 + to: 1 + weekday: Sunday + timezone: Asia/Tokyo + value: 30 + updatedAt: 2023-01-01T00:00:00Z +``` + +### MinReplicasFactor + +The factor to calculate the minReplicas recommendation from the current replica number (default: 0.5) + +If the current replica number is 10 and `MinReplicasFactor` is 0.5, +the minReplicas recommendation from the current situation will be 5 (10 * 0.5). + +```yaml +kind: Tortoise +#... +status: + recommendations: + horizontal: + minReplicas: + - from: 0 + to: 1 + weekday: Sunday + timezone: Asia/Tokyo + value: 5 + updatedAt: 2023-01-01T00:00:00Z +``` + +### ReplicaReductionFactor + +The factor to reduce the minReplicas gradually after turning off Emergency mode (default: 0.95) + +Let's say `ReplicaReductionFactor` is 0.95, +the minReplicas was increased to 100 due to the emergency mode, +and a user just turned off the emergency mode now. + +Then, the `minReplicas` is going to change like: + +100 --(*0.95)--> 95 --(*0.95)--> 91 -- ... + +It's reduced every time tortoise is evaluated by the controller. 
(= once a `TortoiseUpdateInterval`) + +### UpperTargetResourceUtilization + +The max target utilization that tortoise can give to the HPA (default: 90) +So, HPA target utilization managed by tortoise won't be higher than this value. + +### MinimumMinReplicas + +The minimum minReplicas that tortoise can give to the HPA (default: 3) +So, HPA minReplicas managed by tortoise won't be smaller than this value. + +### PreferredReplicaNumUpperLimit + +The replica number which the tortoise tries to keep the replica number less than. (default: 30) + +As it says "preferred", the tortoise **tries** to keep the replicas number less than this, +but the replica number may be more than this when other "required" rules (`MaximumCPUCores` and `MaximumMemoryBytes`) will be violated by this limit. + +So, when the number of replicas reaches `PreferredReplicaNumUpperLimit`, +a tortoise will increase the Pod's resource request instead of increasing the number of replicas. + +But, when the resource request reaches `MaximumCPUCores` or `MaximumMemoryBytes`, +a tortoise will ignore `PreferredReplicaNumUpperLimit`, and increase the number of replicas. + +### MaximumCPUCores + +The maximum CPU cores that the tortoise can give to the container (default: 10) + +Note that it's the upper limit for the container, not for the Pod. + +### MaximumMemoryBytes + +The maximum memory bytes that the tortoise can give to the container (default: 10Gi) + +Note that it's the upper limit for the container, not for the Pod. + +### Timezone + +The timezone which is used to record time in tortoise objects (default: Asia/Tokyo) + +### TortoiseUpdateInterval + +The interval of updating each tortoise (default: 15s) + +But, it may be delayed if there are many tortoise objects in the cluster. 
diff --git a/docs/flag-configuration.md b/docs/flag-configuration.md deleted file mode 100644 index ef77f3a6..00000000 --- a/docs/flag-configuration.md +++ /dev/null @@ -1,167 +0,0 @@ -## Flag configurations for admin - -Tortoise - -The cluster admin can set the global configurations via flags. - -``` ---range-of-min-max-replicas-recommendation-hours: The time (hours) range of minReplicas and maxReplicas recommendation (default: 1) ---ttl-hours-of-min-max-replicas-recommendation: The TTL of minReplicas and maxReplicas recommendation (default: 720 (=30 days)) ---max-replicas-factor: The factor to calculate the maxReplicas recommendation from the current replica number (default: 2.0) ---min-replicas-factor: The factor to calculate the minReplicas recommendation from the current replica number (default: 0.5) ---replica-reduction-factor: The factor to reduce the minReplicas gradually after turning off Emergency mode (default: 0.95) ---upper-target-resource-utilization: The max target utilization that tortoise can give to the HPA (default: 90) ---minimum-min-replicas: The minimum minReplicas that tortoise can give to the HPA (default: 3) ---preferred-replicas-number-upper-limit: The replica number which the tortoise tries to keep the replica number less than. As it says "preferred", the tortoise **tries** to keep the replicas number less than this, but the replica number may be more than this when other "required" rule will be violated by this limit. 
(default: 30) ---maximum-cpu-cores: The maximum CPU cores that the tortoise can give to the container (default: 10) ---maximum-memory-bytes: The maximum memory bytes that the tortoise can give to the container (default: 10Gi) ---timezone: The timezone used to record time in tortoise objects (default: Asia/Tokyo) ---tortoise-update-interval: The interval of updating each tortoise (default: 15s) -``` - -### range-of-min-max-replicas-recommendation-hours - -The time (hours) range of minReplicas and maxReplicas recommendation (default: 1) - -```yaml -kind: Tortoise -#... -status: - recommendations: - horizontal: - minReplicas: - - from: 0 - to: 1 - weekday: Sunday - timezone: Asia/Tokyo - value: 3 - updatedAt: 2023-01-01T00:00:00Z - - from: 1 - to: 2 - weekday: Sunday - timezone: Asia/Tokyo - value: 3 - updatedAt: 2023-01-01T00:00:00Z -``` - -### ttl-hours-of-min-max-replicas-recommendation - -The TTL of minReplicas and maxReplicas recommendation (default: 720 (=30 days)) - -```yaml -kind: Tortoise -#... -status: - recommendations: - horizontal: - minReplicas: - - from: 0 - to: 1 - weekday: Sunday - timezone: Asia/Tokyo - value: 3 - updatedAt: 2023-01-01T00:00:00Z # this recommendation will be expired on 2023-01-31. -``` - -### max-replicas-factor - -The factor to calculate the maxReplicas recommendation from the current replica number (default: 2.0) - -If the current replica number is 15 and `max-replicas-factor` is 2.0, -the maxReplicas recommendation from the current situation will be 30 (15 * 2.0). - -```yaml -kind: Tortoise -#... 
-status: - recommendations: - horizontal: - maxReplicas: - - from: 0 - to: 1 - weekday: Sunday - timezone: Asia/Tokyo - value: 30 - updatedAt: 2023-01-01T00:00:00Z -``` - -### min-replicas-factor - -The factor to calculate the minReplicas recommendation from the current replica number (default: 0.5) - -If the current replica number is 10 and `max-replicas-factor` is 0.5, -the minReplicas recommendation from the current situation will be 5 (10 * 0.5). - -```yaml -kind: Tortoise -#... -status: - recommendations: - horizontal: - minReplicas: - - from: 0 - to: 1 - weekday: Sunday - timezone: Asia/Tokyo - value: 5 - updatedAt: 2023-01-01T00:00:00Z -``` - -### replica-reduction-factor - -The factor to reduce the minReplicas gradually after turning off Emergency mode (default: 0.95) - -Let's say `replica-reduction-factor` is 0.95, -the minReplicas was increased to 100 due to the emergency mode, -and a user just turned off the emergency mode now. - -Then, the `minReplicas` is going to change like: - -100 --(*0.95)--> 95 --(*0.95)--> 91 -- ... - -It's reduced every time tortoise is evaluated by the controller. (= once a `tortoise-update-interval`) - -### upper-target-resource-utilization - -The max target utilization that tortoise can give to the HPA (default: 90) -So, HPA target utilization managed by tortoise won't be higher than this value. - -### minimum-min-replicas - -The minimum minReplicas that tortoise can give to the HPA (default: 3) -So, HPA minReplicas managed by tortoise won't be smaller than this value. - -### preferred-replicas-number-upper-limit - -The replica number which the tortoise tries to keep the replica number less than. (default: 30) - -As it says "preferred", the tortoise **tries** to keep the replicas number less than this, -but the replica number may be more than this when other "required" rules (`maximum-cpu-cores` and `maximum-memory-bytes`) will be violated by this limit. 
- -So, when the number of replicas reaches `preferred-replicas-number-upper-limit`, -a tortoise will increase the Pod's resource request instead of increasing the number of replicas. - -But, when the resource request reaches `maximum-cpu-cores` or `maximum-memory-bytes`, -a tortoise will ignore `preferred-replicas-number-upper-limit`, and increase the number of replicas. - -### maximum-cpu-cores - -The maximum CPU cores that the tortoise can give to the container (default: 10) - -Note that it's the upper limit for the container, not for the Pod. - -### maximum-memory-bytes - -The maximum memory bytes that the tortoise can give to the container (default: 10Gi) - -Note that it's the upper limit for the container, not for the Pod. - -### timezone - -The timezone which used to record time in tortoise objects (default: Asia/Tokyo) - -### tortoise-update-interval - -The interval of updating each tortoise (default: 15s) - -But, it may delay if there are many tortoise objects in the cluster. diff --git a/docs/horizontal.md b/docs/horizontal.md index 79238b35..479def57 100644 --- a/docs/horizontal.md +++ b/docs/horizontal.md @@ -14,9 +14,15 @@ Let's get into detail how each field gets changed. MaxReplicas is calculated by: ``` -max{replica numbers at the same time on the same weekday} * 2 +# If MinMaxReplicasRecommendationType = weekly +max{replica numbers at the same time on the same day of week} * MaxReplicasFactor + +# If MinMaxReplicasRecommendationType = daily +max{replica numbers at the same time} * MaxReplicasFactor ``` +(refer to [configuration.md](./configuration.md) about each parameter) + It only takes the num of replicas of the last 4 weeks into consideration. ### MinReplicas @@ -24,9 +30,15 @@ It only takes the num of replicas of the last 4 weeks into consideration. 
MinReplicas is calculated by: ``` -max{replica numbers at the same time on the same weekday} * 1/2 +# If MinMaxReplicasRecommendationType = weekly +max{replica numbers at the same time on the same day of week} * MinReplicasFactor + +# If MinMaxReplicasRecommendationType = daily +max{replica numbers at the same time} * MinReplicasFactor ``` +(refer to [configuration.md](./configuration.md) about each parameter) + It only takes the num of replicas of the last 4 weeks into consideration. ### Target utilization @@ -72,24 +84,24 @@ But, if a Pod has only one container but a corresponding HPA doesn't have `type: Although it says "Horizontal", a tortoise possibly changes the container's size if it's too large or too small. -- if the number of replicas equals `minimum-min-replicas`, make each container's size smaller instead of reducing the number of replicas. -- if the number of replicas equals `preferred-replicas-number-upper-limit`, make each container's size bigger instead of increasing the number of replicas. - - But, when the resource request reaches `maximum-cpu-cores` or `maximum-memory-bytes`, tortoise will ignore `preferred-replicas-number-upper-limit`, and increase the number of replicas. +- if the number of replicas equals `MinimumMinReplicas`, make each container's size smaller instead of reducing the number of replicas. +- if the number of replicas equals `PreferredReplicaNumUpperLimit`, make each container's size bigger instead of increasing the number of replicas. + - But, when the resource request reaches `MaximumCPUCores` or `MaximumMemoryBytes`, tortoise will ignore `PreferredReplicaNumUpperLimit`, and increase the number of replicas. I know it's complicated, describe specifically in the following examples. 
-#### Example1: reach `preferred-replicas-number-upper-limit` +#### Example1: reach `PreferredReplicaNumUpperLimit` Let's say the global configurations are: -- `minimum-min-replicas` 3 -- `preferred-replicas-number-upper-limit`: 10 -- `maximum-cpu-cores`: 5 cores +- `MinimumMinReplicas` 3 +- `PreferredReplicaNumUpperLimit`: 10 +- `MaximumCPUCores`: 5 cores And, the target workload currently looks like: - `.spec.ResourcePolicy[*].AutoscalingPolicy.CPU`: Horizontal - `.spec.ResourcePolicy[*].MinAllocatedResources`: 1 cores - resource request: CPU 2 cores -- the num of replicas: 10 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 10 (the same as `PreferredReplicaNumUpperLimit`) - the resource consumption is increasing. This case, Tortoise prefers not to increase the replica number more. @@ -97,7 +109,7 @@ Instead, Tortoise temporary switch the scaling way to Vertical to make each cont After a while, the target workload looks like: - resource request: 4 cores. -- the num of replicas: 10 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 10 (the same as `PreferredReplicaNumUpperLimit`) - the resource consumption starts to be decreasing. Given the resource consumption starts to be decreasing, the Tortoise switch the scaling way back to Horizontal. @@ -105,51 +117,51 @@ So, this workload will continue to work with 4 cores after this time. If the traffic next day is very similar to this day, then Tortoise no longer needs to switch the scaling way to Vertical during peak time. 
-#### Example2: reach `preferred-replicas-number-upper-limit` and `maximum-cpu-cores` and `maximum-memory-bytes` +#### Example2: reach `PreferredReplicaNumUpperLimit` and `MaximumCPUCores` and `MaximumMemoryBytes` Let's say the global configurations are (the same as Example1): -- `minimum-min-replicas` 3 -- `preferred-replicas-number-upper-limit`: 10 -- `maximum-cpu-cores`: 5 cores +- `MinimumMinReplicas` 3 +- `PreferredReplicaNumUpperLimit`: 10 +- `MaximumCPUCores`: 5 cores And, the target workload currently looks like (the same as Example1): - `.spec.ResourcePolicy[*].AutoscalingPolicy.CPU`: Horizontal - `.spec.ResourcePolicy[*].MinAllocatedResources`: CPU 1 cores - resource request: CPU 2 cores -- the num of replicas: 10 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 10 (the same as `PreferredReplicaNumUpperLimit`) - the resource consumption is increasing. As described in Example1, Tortoise temporary switches the scaling way to Vertical. After a while, the target workload looks like: -- resource request: CPU 5 cores (the same value as `maximum-cpu-cores` and `maximum-memory-bytes` given to each resource") +- resource request: CPU 5 cores (the same value as `MaximumCPUCores` and `MaximumMemoryBytes` given to each resource") - the num of replicas: 10 (the same as "preferred maximum number of replicas") - still the resource consumption is increasing. -The resource request reaches the `maximum-cpu-cores` and `maximum-memory-bytes` given to each resource" now. +The resource request reaches the `MaximumCPUCores` and `MaximumMemoryBytes` given to each resource" now. So, Tortoise switch the scaling way back to the Horizontal. 
After a while, the target workload looks like: -- resource request: CPU 5 cores (CPU request is the same value as `maximum-cpu-cores` and `maximum-memory-bytes`) -- the num of replicas: 15 (more than `preferred-replicas-number-upper-limit`) +- resource request: CPU 5 cores (CPU request is the same value as `MaximumCPUCores` and `MaximumMemoryBytes`) +- the num of replicas: 15 (more than `PreferredReplicaNumUpperLimit`) - the resource consumption starts to be decreasing. If the traffic next day is very similar to this day, Tortoise no longer needs to switch the scaling way to Vertical during peak time -because it's already reached `maximum-cpu-cores` and `maximum-memory-bytes`. +because it's already reached `MaximumCPUCores` and `MaximumMemoryBytes`. #### Example 3: reach the minimum number of replicas Let's say the global configurations are (the same as Example1): -- `minimum-min-replicas` 3 -- `preferred-replicas-number-upper-limit`: 10 -- `maximum-cpu-cores`: CPU 5 cores +- `MinimumMinReplicas` 3 +- `PreferredReplicaNumUpperLimit`: 10 +- `MaximumCPUCores`: CPU 5 cores And, the target workload currently looks like (the same as Example1): - `.spec.ResourcePolicy[*].AutoscalingPolicy.CPU`: Horizontal - `.spec.ResourcePolicy[*].MinAllocatedResources`: CPU 1 cores - resource request: CPU 2 cores -- the num of replicas: 3 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 3 (the same as `MinimumMinReplicas`) - the resource consumption is decreasing. This case, Tortoise prefers not to decrease the replica number more. @@ -157,7 +169,7 @@ Instead, Tortoise temporary switch the scaling way to Vertical to make each cont After a while, the target workload looks like: - resource request: CPU 1.5 cores -- the num of replicas: 3 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 3 (the same as `MinimumMinReplicas`) - the resource consumption starts to be increasing. 
Given the resource consumption starts to be increasing, the Tortoise switches the scaling way back to Horizontal. @@ -169,15 +181,15 @@ Tortoise no longer needs to switch the scaling way to Vertical during off-peak t #### Example 4: reach the minimum number of replicas and `.spec.ResourcePolicy[*].MinAllocatedResources` Let's say the global configurations are (the same as Example1): -- `minimum-min-replicas` 3 -- `preferred-replicas-number-upper-limit`: 10 -- `maximum-cpu-cores`: CPU 5 cores +- `MinimumMinReplicas` 3 +- `PreferredReplicaNumUpperLimit`: 10 +- `MaximumCPUCores`: CPU 5 cores And, the target workload currently looks like (the same as Example1): - `.spec.ResourcePolicy[*].AutoscalingPolicy.CPU`: Horizontal - `.spec.ResourcePolicy[*].MinAllocatedResources`: CPU 1 cores - resource request: CPU 2 cores -- the num of replicas: 3 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 3 (the same as `MinimumMinReplicas`) - the resource consumption is decreasing. This case, Tortoise prefers not to decrease the replica number more. @@ -185,11 +197,11 @@ Instead, Tortoise temporary switch the scaling way to Vertical to make each cont After a while, the target workload looks like: - resource request: CPU 1 cores (the same as `.spec.ResourcePolicy[*].MinAllocatedResources.CPU`) -- the num of replicas: 3 (the same as `preferred-replicas-number-upper-limit`) +- the num of replicas: 3 (the same as `MinimumMinReplicas`) - still the resource consumption is decreasing. The resource request reaches the `.spec.ResourcePolicy[*].MinAllocatedResources.CPU`. -But, the num of replicas has already reached `preferred-replicas-number-upper-limit`. +But, the num of replicas has already reached `MinimumMinReplicas`. So, Tortoise won't change anything further. 
diff --git a/docs/vertical.md b/docs/vertical.md index b60b8933..ef5a2842 100644 --- a/docs/vertical.md +++ b/docs/vertical.md @@ -11,5 +11,5 @@ that resource is basically updated based on the recommendation value from the VP ### exceptional case; behave like Horizontal Rarely the number of replicas get increased/decreased instead of increasing the resource request. -- When the resource request reaches `maximum-memory-bytes` or `maximum-cpu-cores`. -- When the resource usage gets increased unusually and the resource utilization is more than `upper-target-resource-utilization`. +- When the resource request reaches `MaximumMemoryBytes` or `MaximumCPUCores`. +- When the resource usage gets increased unusually and the resource utilization is more than `UpperTargetResourceUtilization`. diff --git a/go.mod b/go.mod index 59834b7b..43853bcb 100644 --- a/go.mod +++ b/go.mod @@ -50,11 +50,12 @@ require ( github.com/prometheus/common v0.42.0 // indirect github.com/prometheus/procfs v0.9.0 // indirect github.com/spf13/pflag v1.0.5 // indirect - go.uber.org/atomic v1.7.0 // indirect - go.uber.org/multierr v1.6.0 // indirect + github.com/stretchr/testify v1.8.3 // indirect + go.uber.org/atomic v1.9.0 // indirect + go.uber.org/multierr v1.8.0 // indirect go.uber.org/zap v1.24.0 // indirect golang.org/x/net v0.10.0 // indirect - golang.org/x/oauth2 v0.5.0 // indirect + golang.org/x/oauth2 v0.7.0 // indirect golang.org/x/sys v0.8.0 // indirect golang.org/x/term v0.8.0 // indirect golang.org/x/text v0.9.0 // indirect diff --git a/go.sum b/go.sum index 0a07f3c3..e0707e0c 100644 --- a/go.sum +++ b/go.sum @@ -134,17 +134,20 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 
-github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= +go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= -go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= +go.uber.org/multierr v1.8.0 h1:dg6GjLku4EH+249NNmoIciG9N/jURbDG+pFlTkhzIC8= +go.uber.org/multierr v1.8.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -172,8 +175,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.5.0 h1:HuArIo48skDwlrvM3sEdHXElYslAMsf3KwRkkW4MC4s= -golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= +golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= +golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -255,6 +258,7 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/main.go b/main.go index 5dd85b0c..dac957a0 100644 --- a/main.go +++ b/main.go @@ -27,9 +27,7 @@ package main import ( "flag" - "fmt" "os" - "time" v2 "k8s.io/api/autoscaling/v2" "k8s.io/apimachinery/pkg/runtime" @@ -42,6 +40,7 @@ import ( autoscalingv2 "github.com/mercari/tortoise/api/autoscaling/v2" autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" "github.com/mercari/tortoise/controllers" + "github.com/mercari/tortoise/pkg/config" "github.com/mercari/tortoise/pkg/deployment" "github.com/mercari/tortoise/pkg/hpa" 
"github.com/mercari/tortoise/pkg/recommender" @@ -76,40 +75,11 @@ func main() { "Enabling this will ensure there is only one active controller manager.") // Tortoise specific flags - var rangeOfMinMaxReplicasRecommendationHours int - var minMaxReplicasRoutine string - var tTLHoursOfMinMaxReplicasRecommendation int - var maxReplicasFactor float64 - var minReplicasFactor float64 - var replicaReductionFactor float64 - var upperTargetResourceUtilization int - var minimumMinReplicas int - var preferredReplicaNumUpperLimit int - var maxCPUPerContainer string - var maxMemoryPerContainer string - var timeZone string - var tortoiseUpdateInterval time.Duration - flag.IntVar(&rangeOfMinMaxReplicasRecommendationHours, "range-of-min-max-replicas-recommendation-hours", 1, "the time (hours) range of minReplicas and maxReplicas recommendation (default: 1)") - flag.StringVar(&minMaxReplicasRoutine, "min-max-replicas-routine", "weekly", "the routine of minReplicas and maxReplicas recommendation (default: weekly)") - flag.IntVar(&tTLHoursOfMinMaxReplicasRecommendation, "ttl-hours-of-min-max-replicas-recommendation", 24*30, "the TTL (hours) of minReplicas and maxReplicas recommendation (default: 720 (=30 days))") - flag.Float64Var(&maxReplicasFactor, "max-replicas-factor", 2.0, "the factor to calculate the maxReplicas recommendation from the current replica number (default: 2.0)") - flag.Float64Var(&minReplicasFactor, "min-replicas-factor", 0.5, "the factor to calculate the minReplicas recommendation from the current replica number (default: 0.5)") - flag.Float64Var(&replicaReductionFactor, "replica-reduction-factor", 0.95, "the factor to reduce the minReplicas gradually after turning off Emergency mode (default: 0.95)") - flag.IntVar(&upperTargetResourceUtilization, "upper-target-resource-utilization", 90, "the max target utilization that tortoise can give to the HPA (default: 90)") - flag.IntVar(&minimumMinReplicas, "minimum-min-replicas", 3, "the minimum minReplicas that tortoise 
can give to the HPA (default: 3)") - flag.IntVar(&preferredReplicaNumUpperLimit, "preferred-replicas-number-upper-limit", 30, "The replica number which the tortoise tries to keep the replica number less than. As it says \"preferred\", the tortoise **tries** to keep the replicas number less than this, but the replica number may be more than this when other \"required\" rule will be violated by this limit. (default: 30)") - flag.StringVar(&maxCPUPerContainer, "maximum-cpu-cores", "10", "the maximum CPU cores that the tortoise can give to the container (default: 10)") - flag.StringVar(&maxMemoryPerContainer, "maximum-memory-bytes", "10Gi", "the maximum memory bytes that the tortoise can give to the container (default: 10Gi)") - flag.StringVar(&timeZone, "timezone", "Asia/Tokyo", "The timezone used to record time in tortoise objects (default: Asia/Tokyo)") - flag.DurationVar(&tortoiseUpdateInterval, "tortoise-update-interval", 15*time.Second, "The interval of updating each tortoise (default: 15s)") - - if rangeOfMinMaxReplicasRecommendationHours > 24 || rangeOfMinMaxReplicasRecommendationHours < 1 { - setupLog.Error(fmt.Errorf("range-of-min-max-replicas-recommendation-hours should be between 1 and 24"), "invalid value") - os.Exit(1) - } - - if minMaxReplicasRoutine != "daily" && minMaxReplicasRoutine != "weekly" { - setupLog.Error(fmt.Errorf("min-max-replicas-routine should be either \"daily\" or \"weekly\""), "invalid value") + var configPath string + flag.StringVar(&configPath, "config", "", "The path to the config file.") + config, err := config.ParseConfig(configPath) + if err != nil { + setupLog.Error(err, "failed to load config") os.Exit(1) } @@ -145,7 +115,7 @@ func main() { os.Exit(1) } eventRecorder := mgr.GetEventRecorderFor("tortoise-controller") - tortoiseService, err := tortoise.New(mgr.GetClient(), eventRecorder, rangeOfMinMaxReplicasRecommendationHours, timeZone, tortoiseUpdateInterval, minMaxReplicasRoutine) + tortoiseService, err := 
tortoise.New(mgr.GetClient(), eventRecorder, config.RangeOfMinMaxReplicasRecommendationHours, config.TimeZone, config.TortoiseUpdateInterval, config.MinMaxReplicasRecommendationType) if err != nil { setupLog.Error(err, "unable to start tortoise service") os.Exit(1) @@ -157,16 +127,16 @@ func main() { os.Exit(1) } - hpaService := hpa.New(mgr.GetClient(), eventRecorder, replicaReductionFactor, upperTargetResourceUtilization) + hpaService := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.UpperTargetResourceUtilization) if err = (&controllers.TortoiseReconciler{ Scheme: mgr.GetScheme(), HpaService: hpaService, VpaService: vpaClient, DeploymentService: deployment.New(mgr.GetClient()), - RecommenderService: recommender.New(tTLHoursOfMinMaxReplicasRecommendation, maxReplicasFactor, minReplicasFactor, upperTargetResourceUtilization, minimumMinReplicas, preferredReplicaNumUpperLimit, maxCPUPerContainer, maxMemoryPerContainer), + RecommenderService: recommender.New(config.TTLHoursOfMinMaxReplicasRecommendation, config.MaxReplicasFactor, config.MinReplicasFactor, config.UpperTargetResourceUtilization, config.MinimumMinReplicas, config.PreferredReplicaNumUpperLimit, config.MaximumCPUCores, config.MaximumMemoryBytes), TortoiseService: tortoiseService, - Interval: tortoiseUpdateInterval, + Interval: config.TortoiseUpdateInterval, EventRecorder: eventRecorder, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Tortoise") diff --git a/pkg/config/config.go b/pkg/config/config.go new file mode 100644 index 00000000..00ed78c6 --- /dev/null +++ b/pkg/config/config.go @@ -0,0 +1,88 @@ +package config + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +type Config struct { + // RangeOfMinMaxReplicasRecommendationHours is the time (hours) range of minReplicas and maxReplicas recommendation (default: 1) + RangeOfMinMaxReplicasRecommendationHours int 
`yaml:"RangeOfMinMaxReplicasRecommendationHours"` + // MinMaxReplicasRecommendationType is the routine of minReplicas and maxReplicas recommendation (default: weekly) + MinMaxReplicasRecommendationType string `yaml:"MinMaxReplicasRecommendationType"` + // TTLHoursOfMinMaxReplicasRecommendation is the TTL (hours) of minReplicas and maxReplicas recommendation (default: 720 (=30 days)) + TTLHoursOfMinMaxReplicasRecommendation int `yaml:"TTLHoursOfMinMaxReplicasRecommendation"` + // MaxReplicasFactor is the factor to calculate the maxReplicas recommendation from the current replica number (default: 2.0) + MaxReplicasFactor float64 `yaml:"MaxReplicasFactor"` + // MinReplicasFactor is the factor to calculate the minReplicas recommendation from the current replica number (default: 0.5) + MinReplicasFactor float64 `yaml:"MinReplicasFactor"` + // ReplicaReductionFactor is the factor to reduce the minReplicas gradually after turning off Emergency mode (default: 0.95) + ReplicaReductionFactor float64 `yaml:"ReplicaReductionFactor"` + // UpperTargetResourceUtilization is the max target utilization that tortoise can give to the HPA (default: 90) + UpperTargetResourceUtilization int `yaml:"UpperTargetResourceUtilization"` + // MinimumMinReplicas is the minimum minReplicas that tortoise can give to the HPA (default: 3) + MinimumMinReplicas int `yaml:"MinimumMinReplicas"` + // PreferredReplicaNumUpperLimit is the replica number which the tortoise tries to keep the replica number less than. As it says "preferred", the tortoise **tries** to keep the replicas number less than this, but the replica number may be more than this when other "required" rule will be violated by this limit. 
(default: 30) + PreferredReplicaNumUpperLimit int `yaml:"PreferredReplicaNumUpperLimit"` + // MaximumCPUCores is the maximum CPU cores that the tortoise can give to the container (default: 10) + MaximumCPUCores string `yaml:"MaximumCPUCores"` + // MaximumMemoryBytes is the maximum memory bytes that the tortoise can give to the container (default: 10Gi) + MaximumMemoryBytes string `yaml:"MaximumMemoryBytes"` + // TimeZone is the timezone used to record time in tortoise objects (default: Asia/Tokyo) + TimeZone string `yaml:"TimeZone"` + // TortoiseUpdateInterval is the interval of updating each tortoise (default: 15s) + TortoiseUpdateInterval time.Duration `yaml:"TortoiseUpdateInterval"` +} + +// ParseConfig parses the config file (yaml) and returns Config. +func ParseConfig(path string) (*Config, error) { + config := &Config{ + RangeOfMinMaxReplicasRecommendationHours: 1, + MinMaxReplicasRecommendationType: "weekly", + TTLHoursOfMinMaxReplicasRecommendation: 24 * 30, + MaxReplicasFactor: 2.0, + MinReplicasFactor: 0.5, + ReplicaReductionFactor: 0.95, + UpperTargetResourceUtilization: 90, + MinimumMinReplicas: 3, + PreferredReplicaNumUpperLimit: 30, + MaximumCPUCores: "10", + MaximumMemoryBytes: "10Gi", + TimeZone: "Asia/Tokyo", + TortoiseUpdateInterval: 15 * time.Second, + } + if path == "" { + return config, nil + } + + // read file from path + b, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + if err := yaml.Unmarshal(b, config); err != nil { + return nil, fmt.Errorf("failed to unmarshal config file: %w", err) + } + + if err := validate(config); err != nil { + return nil, fmt.Errorf("invalid config: %w", err) + } + + return config, nil +} + +func validate(config *Config) error { + if config.RangeOfMinMaxReplicasRecommendationHours > 24 || config.RangeOfMinMaxReplicasRecommendationHours < 1 { + return fmt.Errorf("RangeOfMinMaxReplicasRecommendationHours should be between 1 and 24") + } + + if 
config.MinMaxReplicasRecommendationType != "daily" && config.MinMaxReplicasRecommendationType != "weekly" { + return fmt.Errorf("MinMaxReplicasRecommendationType should be either \"daily\" or \"weekly\"") + } + + return nil +} diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go new file mode 100644 index 00000000..3541f3c5 --- /dev/null +++ b/pkg/config/config_test.go @@ -0,0 +1,102 @@ +package config + +import ( + "reflect" + "testing" + "time" +) + +func TestParseConfig(t *testing.T) { + type args struct { + path string + } + tests := []struct { + name string + args args + want *Config + wantErr bool + }{ + { + name: "config file", + args: args{ + path: "./testdata/config.yaml", + }, + want: &Config{ + RangeOfMinMaxReplicasRecommendationHours: 2, + MinMaxReplicasRecommendationType: "daily", + TTLHoursOfMinMaxReplicasRecommendation: 24 * 30, + MaxReplicasFactor: 2.0, + MinReplicasFactor: 0.5, + ReplicaReductionFactor: 0.95, + UpperTargetResourceUtilization: 90, + MinimumMinReplicas: 3, + PreferredReplicaNumUpperLimit: 30, + MaximumCPUCores: "10", + MaximumMemoryBytes: "10Gi", + TimeZone: "Asia/Tokyo", + TortoiseUpdateInterval: 1 * time.Hour, + }, + }, + { + name: "config file which has only one field", + args: args{ + path: "./testdata/config-partly-override.yaml", + }, + want: &Config{ + RangeOfMinMaxReplicasRecommendationHours: 6, + MinMaxReplicasRecommendationType: "weekly", + TTLHoursOfMinMaxReplicasRecommendation: 24 * 30, + MaxReplicasFactor: 2.0, + MinReplicasFactor: 0.5, + ReplicaReductionFactor: 0.95, + UpperTargetResourceUtilization: 90, + MinimumMinReplicas: 3, + PreferredReplicaNumUpperLimit: 30, + MaximumCPUCores: "10", + MaximumMemoryBytes: "10Gi", + TimeZone: "Asia/Tokyo", + TortoiseUpdateInterval: 15 * time.Second, + }, + }, + { + name: "config file not found", + args: args{ + path: "./testdata/not-found.yaml", + }, + wantErr: true, + }, + { + name: "config file is empty", + args: args{ + path: "", + }, + want: &Config{ + 
RangeOfMinMaxReplicasRecommendationHours: 1, + MinMaxReplicasRecommendationType: "weekly", + TTLHoursOfMinMaxReplicasRecommendation: 24 * 30, + MaxReplicasFactor: 2.0, + MinReplicasFactor: 0.5, + ReplicaReductionFactor: 0.95, + UpperTargetResourceUtilization: 90, + MinimumMinReplicas: 3, + PreferredReplicaNumUpperLimit: 30, + MaximumCPUCores: "10", + MaximumMemoryBytes: "10Gi", + TimeZone: "Asia/Tokyo", + TortoiseUpdateInterval: 15 * time.Second, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseConfig(tt.args.path) + if (err != nil) != tt.wantErr { + t.Errorf("ParseConfig() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("ParseConfig() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/config/testdata/config-partly-override.yaml b/pkg/config/testdata/config-partly-override.yaml new file mode 100644 index 00000000..52ac635e --- /dev/null +++ b/pkg/config/testdata/config-partly-override.yaml @@ -0,0 +1 @@ +RangeOfMinMaxReplicasRecommendationHours: 6 \ No newline at end of file diff --git a/pkg/config/testdata/config.yaml b/pkg/config/testdata/config.yaml new file mode 100644 index 00000000..0b5d6dcf --- /dev/null +++ b/pkg/config/testdata/config.yaml @@ -0,0 +1,13 @@ +RangeOfMinMaxReplicasRecommendationHours: 2 +MinMaxReplicasRecommendationType: "daily" +TTLHoursOfMinMaxReplicasRecommendation: 720 +MaxReplicasFactor: 2.0 +MinReplicasFactor: 0.5 +ReplicaReductionFactor: 0.95 +UpperTargetResourceUtilization: 90 +MinimumMinReplicas: 3 +PreferredReplicaNumUpperLimit: 30 +MaximumCPUCores: "10" +MaximumMemoryBytes: "10Gi" +TimeZone: "Asia/Tokyo" +TortoiseUpdateInterval: "1h" \ No newline at end of file