Merge pull request #15 from deepgram/brent-george/autoscaling
Autoscaling
bd-g committed Jun 20, 2024
2 parents eadf990 + 170586b commit 3163288
Showing 33 changed files with 830 additions and 228 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
fetch-depth: 0

- name: Set up Helm
uses: azure/setup-helm@v3
uses: azure/setup-helm@v4

- name: Run helm lint
run: helm lint ./charts/deepgram-self-hosted
42 changes: 42 additions & 0 deletions charts/deepgram-self-hosted/CHANGELOG.md
@@ -0,0 +1,42 @@
# Changelog

All notable changes to this Helm chart will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

*Nothing at this time*

## [0.2.0-beta] - 2024-06-20

### Added
- Support for managing node autoscaling with [cluster-autoscaler](https://github.com/kubernetes/autoscaler).
- Support for pod autoscaling of Deepgram components.
- Support for keeping the upstream Deepgram License server as a backup even when the License Proxy is deployed. See `licenseProxy.keepUpstreamServerAsBackup` for details.

### Changed

- Initial installation replica count values moved from `scaling.static.{api,engine}.replicas` to `scaling.replicas.{api,engine}`.
- License Proxy is no longer manually scaled. Instead, scaling can be indirectly controlled via `licenseProxy.{enabled,deploySecondReplica}`.
- Labels for Deepgram dedicated nodes in the sample `cluster-config.yaml` for AWS, and the `nodeAffinity` sections of the sample `values.yaml` files. The key has been renamed from `deepgram/nodeType` to `k8s.deepgram.com/node-type`, and the values are no longer prepended with `deepgram`.
- AWS EFS model download job hook delete policy changed to `before-hook-creation`.
- Concurrency limit moved from API (`api.concurrencyLimit.activeRequests`) to Engine level (`engine.concurrencyLimit.activeRequests`).

## [0.1.1-alpha] - 2024-06-03

### Added

- Various documentation improvements

## [0.1.0-alpha] - 2024-05-31

### Added

- Initial implementation of the Helm chart.


[unreleased]: https://github.com/deepgram/self-hosted-resources/compare/deepgram-self-hosted-0.2.0-beta...HEAD
[0.2.0-beta]: https://github.com/deepgram/self-hosted-resources/compare/deepgram-self-hosted-0.1.1-alpha...deepgram-self-hosted-0.2.0-beta
[0.1.1-alpha]: https://github.com/deepgram/self-hosted-resources/compare/deepgram-self-hosted-0.1.0-alpha...deepgram-self-hosted-0.1.1-alpha
[0.1.0-alpha]: https://github.com/deepgram/self-hosted-resources/releases/tag/deepgram-self-hosted-0.1.0-alpha
13 changes: 11 additions & 2 deletions charts/deepgram-self-hosted/Chart.lock
@@ -2,5 +2,14 @@ dependencies:
- name: gpu-operator
repository: https://helm.ngc.nvidia.com/nvidia
version: v24.3.0
digest: sha256:bb948d8bad3f9ca1255d70a998d28aea2e3ed2f20fb2eab4bf8c676485b747db
generated: "2024-05-16T10:44:38.204991308-04:00"
- name: cluster-autoscaler
repository: https://kubernetes.github.io/autoscaler
version: 9.37.0
- name: kube-prometheus-stack
repository: https://prometheus-community.github.io/helm-charts
version: 60.2.0
- name: prometheus-adapter
repository: https://prometheus-community.github.io/helm-charts
version: 4.10.0
digest: sha256:e68498ec97b15d90257562a3d25c33f2bde71447b963c893e49bfa97c3352773
generated: "2024-06-18T21:48:47.225872139-04:00"
20 changes: 16 additions & 4 deletions charts/deepgram-self-hosted/Chart.yaml
@@ -1,12 +1,12 @@
apiVersion: v2
name: deepgram-self-hosted
type: application
version: 0.1.1-alpha
version: 0.2.0-beta
appVersion: "release-240528"
description: A Helm chart for running Deepgram services in a self-hosted environment
home: "https://developers.deepgram.com/docs/self-hosted-introduction"
sources: ["https://github.com/deepgram/self-hosted-resources"]
kubeVersion: ">=1.27.0-0"
kubeVersion: ">=1.28.0-0"
maintainers:
- name: Deepgram Self-Hosted
email: [email protected]
@@ -18,13 +18,25 @@ keywords:
- aura
- speech-to-text
- stt
- asr
- nova
- speech-to-speech
- sts
- voice agent
- self-hosted

dependencies:
- name: gpu-operator
version: "^24.3.0"
repository: "https://helm.ngc.nvidia.com/nvidia"
condition: gpu-operator.enabled
- name: cluster-autoscaler
version: "^9.37.0"
repository: "https://kubernetes.github.io/autoscaler"
condition: cluster-autoscaler.enabled
- name: kube-prometheus-stack
version: "^60.2.0"
repository: "https://prometheus-community.github.io/helm-charts"
condition: kube-prometheus-stack.includeDependency,scaling.auto.enabled
- name: prometheus-adapter
version: "^4.10.0"
repository: "https://prometheus-community.github.io/helm-charts"
condition: prometheus-adapter.includeDependency,scaling.auto.enabled
119 changes: 92 additions & 27 deletions charts/deepgram-self-hosted/README.md

Large diffs are not rendered by default.

59 changes: 50 additions & 9 deletions charts/deepgram-self-hosted/README.md.gotmpl
@@ -13,14 +13,16 @@

{{ template "chart.requirementsSection" . }}

## Get Repository Info
## Using the Chart

### Get Repository Info

```bash
helm repo add deepgram https://deepgram.github.io/self-hosted-resources
helm repo update
```

## Installing the Chart
### Installing the Chart

The Deepgram self-hosted chart requires Helm 3.7+ to install successfully. Please check your Helm version before installation.

@@ -29,10 +31,10 @@ You will need to provide your [self-service Deepgram licensing and credentials](
You may also override any default configuration values. See [the Values section](#values) for a list of available options, and the [samples directory](./samples) for examples of a standard installation.

```bash
helm install -f my-values.yaml [RELEASE_NAME] deepgram/deepgram-self-hosted --atomic --timeout 20m
helm install -f my-values.yaml [RELEASE_NAME] deepgram/deepgram-self-hosted --atomic --timeout 45m
```

## Upgrade and Rollback Strategies
### Upgrade and Rollback Strategies

To upgrade the Deepgram components to a new version, follow these steps:

@@ -41,7 +43,7 @@ To upgrade the Deepgram components to a new version, follow these steps:
2. Run the Helm upgrade command:

```bash
helm upgrade -f my-values.yaml [RELEASE_NAME] deepgram/deepgram-self-hosted --atomic --timeout 30m
helm upgrade -f my-values.yaml [RELEASE_NAME] deepgram/deepgram-self-hosted --atomic --timeout 60m
```

If you encounter any issues during the upgrade process, you can perform a rollback to the previous version:
@@ -52,15 +54,23 @@ helm rollback deepgram

Before upgrading, ensure that you have reviewed the release notes and any migration guides provided by Deepgram for the specific version you are upgrading to.

## Uninstalling the Chart
### Uninstalling the Chart

```bash
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes components associated with the chart and deletes the release.

## Persistent Storage Options
## Changelog

See the [chart CHANGELOG](./CHANGELOG.md) for a list of relevant changes for each version of the Helm chart.

For more details on changes to the underlying Deepgram resources, such as the container images or available models, see the [official Deepgram changelog](https://deepgram.com/changelog) ([RSS feed](https://deepgram.com/changelog.xml)).

## Chart Configuration

### Persistent Storage Options

The Deepgram Helm chart supports different persistent storage options for storing Deepgram models and data. The available options include:

@@ -72,7 +82,38 @@ To configure a specific storage option, see the `engine.modelManager.volumes` [c

For detailed instructions on setting up and configuring each storage option, refer to the [Deepgram self-hosted guides](https://developers.deepgram.com/docs/kubernetes) and the respective cloud provider's documentation.
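As an illustration only, an AWS EFS configuration might be shaped like the fragment below. The nested keys under `volumes` are assumptions made for this sketch; consult the chart's values reference for the exact schema:

```yaml
engine:
  modelManager:
    volumes:
      aws:
        efs:
          enabled: true
          fileSystemId: fs-0123456789abcdef0   # placeholder EFS filesystem ID
```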

## RBAC Configuration
### Autoscaling

Autoscaling your cluster's capacity to meet incoming traffic demands involves both node autoscaling and pod autoscaling. Node autoscaling for supported cloud providers is set up by default when using this Helm chart and creating your cluster with the [Deepgram self-hosted guides](https://developers.deepgram.com/docs/kubernetes). Pod autoscaling can be enabled via the `scaling.auto.enabled` configuration option in this chart.

#### Engine

The Engine component is the core of the Deepgram self-hosted platform, responsible for performing inference using your deployed models. Autoscaling increases the number of Engine replicas to maintain consistent performance for incoming traffic.

There are currently two primary ways to scale the Engine component: scaling with a hard request limit per Engine Pod, or scaling with a soft request limit per Engine pod.

To set a hard limit on which to scale, configure `engine.concurrencyLimit.activeRequests` and `scaling.auto.engine.metrics.requestCapacityRatio`. The `activeRequests` parameter sets a hard limit on how many requests any given Engine pod will accept, and the `requestCapacityRatio` governs scaling the Engine deployment when a certain percentage of "available request slots" is filled. For example, a `requestCapacityRatio` of `0.8` will scale the Engine deployment when the current number of active requests is >=80% of the active request concurrency limit. If the cluster is not able to scale in time and current active requests hit 100% of the preset limit, additional client requests to the API will receive a `429 Too Many Requests` HTTP response. This hard limit means that if a request is accepted for inference, it will have consistent performance, because the cluster refuses surplus requests that could overload it and degrade performance, at the expense of possibly rejecting some incoming requests if capacity does not scale in time.
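As a sketch, a hard-limit configuration in a values override file might look like the following. The key layout is inferred from the option names above; treat it as an illustration rather than the chart's exact schema, and the numeric values are placeholders to tune for your workload:

```yaml
# Hard-limit scaling sketch: requests beyond the cap get a 429,
# and the Engine deployment scales out at 80% of capacity.
engine:
  concurrencyLimit:
    activeRequests: 30        # illustrative per-pod request cap
scaling:
  auto:
    enabled: true
    engine:
      metrics:
        requestCapacityRatio: 0.8   # scale when >=80% of slots are in use
```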

To set a soft limit on which to scale, configure `scaling.auto.engine.metrics.{speechToText,textToSpeech}.{batch,streaming}.requestsPerPod`, depending on the primary traffic source for your environment. The cluster will attempt to scale to meet this target number of requests per Engine pod, but will not reject extra requests with a `429 Too Many Requests` HTTP response as the hard limit will. If the number of extra requests grows faster than the cluster can add capacity, all incoming requests will still be accepted, but the performance of individual requests may degrade.
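For comparison, a soft-limit sketch for a streaming speech-to-text environment might look like the following (again an illustration of the option path above, with a placeholder target value):

```yaml
# Soft-limit scaling sketch: the cluster targets this many requests
# per Engine pod, but surplus requests are still accepted; individual
# request performance may degrade until new capacity comes online.
scaling:
  auto:
    enabled: true
    engine:
      metrics:
        speechToText:
          streaming:
            requestsPerPod: 25   # illustrative target, tune per workload
```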

> [!NOTE]
> Deepgram recommends provisioning separate environments for batch speech-to-text, streaming speech-to-text, and text-to-speech workloads because typical latency and throughput tradeoffs are different for each of those use cases.

There is also a `scaling.auto.engine.metrics.custom` configuration value available to define your own custom scaling metric, if needed.

#### API

The API component is responsible for accepting incoming requests and forming responses, delegating inference work to the Deepgram Engine as needed. A single API pod can typically handle delegating requests to multiple Engine pods, so it is more compute-efficient to deploy fewer API pods relative to the number of Engine pods. The `scaling.auto.api.metrics.engineToApiRatio` configuration value defines the ratio of Engine pods to API pods. The default value is appropriate for most deployments.
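A minimal sketch of overriding that ratio (the value shown is a placeholder, not the chart default):

```yaml
# Sketch: keep roughly one API pod per four Engine pods.
scaling:
  auto:
    enabled: true
    api:
      metrics:
        engineToApiRatio: 4   # illustrative ratio of Engine pods to API pods
```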

There is also a `scaling.auto.api.metrics.custom` configuration value available to define your own custom scaling metric, if needed.

#### License Proxy

The [License Proxy](https://developers.deepgram.com/docs/license-proxy) is intended to be deployed as a fixed-scale deployment that proxies all licensing requests from your environment. It should not be scaled up with the traffic demands of your environment.

This chart deploys one License Proxy Pod per environment by default. If you wish to deploy a second License Proxy Pod for redundancy, set `licenseProxy.deploySecondReplica` to `true`.
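Putting the License Proxy options mentioned in this release together, a redundant deployment might be configured as follows (a sketch using the option names from this chart's changelog):

```yaml
licenseProxy:
  enabled: true
  deploySecondReplica: true         # second pod for redundancy
  keepUpstreamServerAsBackup: true  # keep the upstream Deepgram License server as a fallback
```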

### RBAC Configuration

Role-Based Access Control (RBAC) is used to control access to Kubernetes resources based on the roles and permissions assigned to users or service accounts. The Deepgram Helm chart includes default RBAC roles and bindings for the API, Engine, and License Proxy components.

@@ -85,7 +126,7 @@ To use custom RBAC roles and bindings based on your specific security requiremen

Make sure to review and adjust the RBAC configuration according to the principle of least privilege, granting only the necessary permissions for each component.
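As a least-privilege illustration, a custom Role and RoleBinding for one component could look like the pair below. The names, namespace, and rules are hypothetical placeholders, not the chart's defaults:

```yaml
# Hypothetical minimal Role/RoleBinding pair for a Deepgram component.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: deepgram-engine-minimal    # placeholder role name
  namespace: dg-self-hosted
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]  # read-only access, nothing more
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: deepgram-engine-minimal
  namespace: dg-self-hosted
subjects:
  - kind: ServiceAccount
    name: engine-sa                # placeholder service account name
    namespace: dg-self-hosted
roleRef:
  kind: Role
  name: deepgram-engine-minimal
  apiGroup: rbac.authorization.k8s.io
```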

## Secret Management
### Secret Management

The Deepgram Helm chart takes references to two existing secrets - one containing your distribution credentials to pull container images from Deepgram's image repository, and one containing your Deepgram self-hosted API key.
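As an illustration, the self-hosted API key secret could be created ahead of time with a manifest like the one below and then referenced from your values file. The secret name and data key are placeholders; the distribution-credentials secret is a standard `kubernetes.io/dockerconfigjson` image pull secret created the same way:

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: dg-self-hosted-api-key   # placeholder name to reference from the chart values
  namespace: dg-self-hosted
type: Opaque
stringData:
  DEEPGRAM_API_KEY: "<your Deepgram self-hosted API key>"
```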

Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,79 @@
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
name: deepgram-self-hosted-cluster
region: us-west-2
version: "1.30"

iam:
withOIDC: true
serviceAccounts:
- metadata:
name: cluster-autoscaler-sa
namespace: dg-self-hosted
wellKnownPolicies:
autoScaler: true
roleName: cluster-autoscaler-role
roleOnly: true
- metadata:
name: efs-csi-controller-sa
namespace: kube-system
wellKnownPolicies:
efsCSIController: true
roleName: efs-csi-driver-role
roleOnly: true

managedNodeGroups:
- name: control-plane-node-group
minSize: 1
desiredCapacity: 1
maxSize: 3
instanceType: t3.large
amiFamily: Ubuntu2204
iam:
withAddonPolicies:
autoScaler: true
propagateASGTags: true
- name: engine-node-group
minSize: 0
desiredCapacity: 0
maxSize: 8
instanceType: g6.2xlarge
amiFamily: Ubuntu2204
labels:
k8s.deepgram.com/node-type: engine
k8s.amazonaws.com/accelerator: nvidia-l4
iam:
withAddonPolicies:
efs: true
autoScaler: true
taints:
- key: efs.csi.aws.com/agent-not-ready
value: "true"
effect: NoExecute
propagateASGTags: true
- name: api-node-group
minSize: 0
desiredCapacity: 0
maxSize: 2
instanceType: c5n.xlarge
amiFamily: Ubuntu2204
labels:
k8s.deepgram.com/node-type: api
iam:
withAddonPolicies:
autoScaler: true
propagateASGTags: true
- name: license-proxy-node-group
minSize: 0
desiredCapacity: 0
maxSize: 2
instanceType: t3.large
amiFamily: Ubuntu2204
labels:
k8s.deepgram.com/node-type: license-proxy
iam:
withAddonPolicies:
autoScaler: true
propagateASGTags: true
