From 0a364d891712777233e9556023561bf08d89a0ba Mon Sep 17 00:00:00 2001
From: Yunfeng Bai <83252681+yunfeng-scale@users.noreply.github.com>
Date: Mon, 20 May 2024 16:21:50 -0700
Subject: [PATCH] Image cache and balloon on H100s, also temporarily stop
 people from using A100 (#523)

* Cache H100

* Stop people from using A100

* no cover

* no cover

* update client version
---
 .../templates/balloon_h100_deployment.yaml         | 50 +++++++++++++++++++
 charts/model-engine/values_circleci.yaml           |  1 +
 charts/model-engine/values_sample.yaml             | 23 +++++++++
 clients/python/llmengine/__init__.py               |  2 +-
 clients/python/llmengine/model.py                  |  4 ++
 clients/python/pyproject.toml                      |  2 +-
 clients/python/setup.py                            |  2 +-
 .../use_cases/llm_model_endpoint_use_cases.py      |  4 ++
 8 files changed, 85 insertions(+), 3 deletions(-)
 create mode 100644 charts/model-engine/templates/balloon_h100_deployment.yaml

diff --git a/charts/model-engine/templates/balloon_h100_deployment.yaml b/charts/model-engine/templates/balloon_h100_deployment.yaml
new file mode 100644
index 00000000..03bce9aa
--- /dev/null
+++ b/charts/model-engine/templates/balloon_h100_deployment.yaml
@@ -0,0 +1,50 @@
+{{- if not .Values.serviceIdentifier }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Chart.Name }}-balloon-h100
+  labels:
+    team: infra
+    product: common-warm-nodes
+spec:
+  replicas: {{ .Values.replicaCount.balloonH100 }}
+  selector:
+    matchLabels:
+      app: {{ .Chart.Name }}-balloon-h100
+      version: v1
+  template:
+    metadata:
+      labels:
+        app: {{ .Chart.Name }}-balloon-h100
+        product: common-warm-nodes
+        team: infra
+        env: {{ .Values.context }}
+        version: v1
+      annotations:
+        sidecar.istio.io/inject: "false"
+    spec:
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
+        {{- with .Values.balloonNodeSelector }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      containers:
+        - image: public.ecr.aws/ubuntu/ubuntu:latest
+          imagePullPolicy: IfNotPresent
+          name: main
+          resources:
+            limits:
+              memory: 28Gi
+              nvidia.com/gpu: 1
+              cpu: 4
+          command:
+            - /bin/bash
+            - -c
+            - "while true; do sleep 30; done"
+      terminationGracePeriodSeconds: 0
+      priorityClassName: {{ .Chart.Name }}-low-priority
+{{- end }}
diff --git a/charts/model-engine/values_circleci.yaml b/charts/model-engine/values_circleci.yaml
index 0f9d9337..d4e7718b 100644
--- a/charts/model-engine/values_circleci.yaml
+++ b/charts/model-engine/values_circleci.yaml
@@ -8,6 +8,7 @@ replicaCount:
   balloonA100: 0
   balloonCpu: 0
   balloonT4: 0
+  balloonH100: 0

 # tag needs to be set dynamically every time. Usually it is set to the SHA1 hash of the git
 # commit from which the image was built.
diff --git a/charts/model-engine/values_sample.yaml b/charts/model-engine/values_sample.yaml
index 2d002c00..97f68532 100644
--- a/charts/model-engine/values_sample.yaml
+++ b/charts/model-engine/values_sample.yaml
@@ -81,6 +81,8 @@ replicaCount:
   balloonCpu: 0
   # balloonT4 is a low priority pod deployment for T4 GPU nodes
   balloonT4: 0
+  # balloonH100 is a low priority pod deployment for H100 GPU nodes
+  balloonH100: 0

 # autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g gateway, cache, and builder deployments)
 autoscaling:
@@ -254,6 +256,27 @@ imageCache:
         - key: "nvidia.com/gpu"
           operator: "Exists"
           effect: "NoSchedule"
+    - name: h100
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+    - name: h100-mig-1g-20gb
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-1g-20gb
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+    - name: h100-mig-3g-40gb
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-3g-40gb
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"

 # celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
 celeryBrokerType: sqs
diff --git a/clients/python/llmengine/__init__.py b/clients/python/llmengine/__init__.py
index 110ed4cc..15b836da 100644
--- a/clients/python/llmengine/__init__.py
+++ b/clients/python/llmengine/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.0.0b32"
+__version__ = "0.0.0b33"

 import os
 from typing import Sequence
diff --git a/clients/python/llmengine/model.py b/clients/python/llmengine/model.py
index 77a0c1d8..1e18d3bf 100644
--- a/clients/python/llmengine/model.py
+++ b/clients/python/llmengine/model.py
@@ -149,6 +149,8 @@ def create(
                 - ``nvidia-ampere-a100``
                 - ``nvidia-ampere-a100e``
                 - ``nvidia-hopper-h100``
+                - ``nvidia-hopper-h100-1g20gb``
+                - ``nvidia-hopper-h100-3g40gb``

             high_priority (`Optional[bool]`):
                 Either ``True`` or ``False``. Enabling this will allow the created
@@ -533,6 +535,8 @@ def update(
                 - ``nvidia-ampere-a100``
                 - ``nvidia-ampere-a100e``
                 - ``nvidia-hopper-h100``
+                - ``nvidia-hopper-h100-1g20gb``
+                - ``nvidia-hopper-h100-3g40gb``

             high_priority (`Optional[bool]`):
                 Either ``True`` or ``False``. Enabling this will allow the created
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index b7459272..7d645b53 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scale-llm-engine"
-version = "0.0.0.beta32"
+version = "0.0.0.beta33"
 description = "Scale LLM Engine Python client"
 license = "Apache-2.0"
 authors = ["Phil Chen "]
diff --git a/clients/python/setup.py b/clients/python/setup.py
index 489e428a..c11111cf 100644
--- a/clients/python/setup.py
+++ b/clients/python/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="scale-llm-engine",
     python_requires=">=3.7",
-    version="0.0.0.beta32",
+    version="0.0.0.beta33",
     packages=find_packages(),
     package_data={"llmengine": ["py.typed"]},
 )
diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index e8549ad9..894096e8 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -879,6 +879,10 @@ async def execute(
             max_workers=request.max_workers,
             endpoint_type=request.endpoint_type,
         )
+        if request.gpu_type == GpuType.NVIDIA_AMPERE_A100E:  # pragma: no cover
+            raise ObjectHasInvalidValueException(
+                "We have migrated A100 usage to H100. Please request an H100 instead!"
+            )
        if request.labels is None:
             raise EndpointLabelsException("Endpoint labels cannot be None!")
         validate_labels(request.labels)
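
Taken together, the client-visible effect of this patch is twofold: the Python client now documents two H100 MIG gpu_type values ("nvidia-hopper-h100-1g20gb" and "nvidia-hopper-h100-3g40gb"), and the server rejects A100e requests at endpoint creation. Below is a minimal sketch of the new behavior from the client side; the endpoint name, model, and worker settings are hypothetical placeholders, and only the gpu_type strings and the server-side exception come from the diff above.

    from llmengine import Model

    # Sketch (assumptions noted inline): create an endpoint on one of the
    # new H100 MIG slices. Only gpu_type reflects this patch; the other
    # argument values are illustrative placeholders.
    endpoint = Model.create(
        name="llama-2-7b-h100-mig",            # hypothetical endpoint name
        model="llama-2-7b",                    # hypothetical base model
        gpus=1,
        gpu_type="nvidia-hopper-h100-1g20gb",  # new MIG value from this patch
        min_workers=0,
        max_workers=1,
    )

    # Requesting gpu_type "nvidia-ampere-a100e" now fails server-side: the
    # create-endpoint use case raises ObjectHasInvalidValueException and
    # tells callers to request an H100 instead.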