vllm-project · Jeffwan · Feb 12, 2025 · Feb 12, 2025
diff --git a/development/tools/setup.sh b/development/tools/setup.sh
diff --git a/hack/lambda-cloud/README.md b/hack/lambda-cloud/README.md
@@ -0,0 +1,3 @@
+# AIBrix Single-Node Deployment on Lambda Instances
+
+Please refer to the documentation here for more details.
diff --git a/development/tools/install.sh → hack/lambda-cloud/install.sh b/development/tools/install.sh → hack/lambda-cloud/install.sh
@@ -61,21 +61,9 @@ echo "*********************************************************************"
 
 echo "Configure the nvidia container toolkits"
 
-# Add the Nvidia Container Toolkit production repository
-curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
-    &&  curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
-        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
-        sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
-
-# Configure the Nvidia repository to use experimental packages (uncommenting the experimental line if commented)
-sudo sed -i '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
-
-# Disable the Lambda Labs repository by commenting out all lines
-sudo sed -i 's/^/#/' /etc/apt/sources.list.d/lambda-repository.list
-
 # Remove older versions of the Nvidia Container Toolkit
 sudo apt-get remove -y nvidia-container-toolkit
-echo "Lagacy nvidia-container-toolkit has been removed successfully."
+echo "Legacy nvidia-container-toolkit has been removed successfully."
 
 # Update package lists and install the latest version of Nvidia Container Toolkit
 sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit

diff --git a/hack/lambda-cloud/nvkind-cluster.yaml b/hack/lambda-cloud/nvkind-cluster.yaml
@@ -0,0 +1,27 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+- role: worker
+  extraMounts:
+    # We inject all NVIDIA GPUs using the nvidia-container-runtime.
+    # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
+    # in `/etc/nvidia-container-runtime/config.toml`
+    - hostPath: /dev/null
+      containerPath: /var/run/nvidia-container-devices/all
+    - hostPath: /tmp/models/
+      containerPath: /data/models/
+    - hostPath: /root/.cache/huggingface
+      containerPath: /root/.cache/huggingface
+    # Docker-style equivalent of the mount above: ~/.cache/huggingface:/root/.cache/huggingface (when the host user is root)
+  # These port mappings are for the gateway service, so that it is exposed publicly.
+  extraPortMappings:
+  # Reserved for Prometheus and Grafana. To debug the metrics, update the corresponding service to use NodePort 30090 or 30030.
+  - containerPort: 30090
+    hostPort: 9090
+  - containerPort: 30030
+    hostPort: 3000
+  - containerPort: 38265
+    hostPort: 8265
+  - containerPort: 38000
+    hostPort: 8000
diff --git a/hack/lambda-cloud/setup.sh b/hack/lambda-cloud/setup.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e # Exit on any error
+
+# Step 1: Install the NVIDIA GPU Operator
+echo "Adding NVIDIA Helm repository and Installing NVIDIA GPU Operator..."
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+helm repo update
+helm install gpu-operator nvidia/gpu-operator --namespace kube-system
+if [ $? -ne 0 ]; then
+    echo "Failed to add NVIDIA Helm repository or failed to install NVIDIA GPU Operator."
+    exit 1
+fi
+
+# Step 2: Install the  Cloud Provider Kind
+echo "Installing Cloud Provider Kind..."
+KIND_CLOUD_PROVIDER_VERSION="0.5.0"
+KIND_CLOUD_PROVIDER_URL="https://github.com/kubernetes-sigs/cloud-provider-kind/releases/download/v${KIND_CLOUD_PROVIDER_VERSION}/cloud-provider-kind_0.5.0_linux_amd64.tar.gz"
+
+# Download and extract
+curl -L ${KIND_CLOUD_PROVIDER_URL} -o cloud-provider-kind.tar.gz
+tar -xvzf cloud-provider-kind.tar.gz
+chmod +x cloud-provider-kind
+sudo mv cloud-provider-kind /usr/local/bin/
+
+# Verify installation
+if ! command -v cloud-provider-kind &> /dev/null; then
+    echo "Failed to install Cloud Provider Kind."
+    exit 1
+fi
+
+# Step 3: Run cloud-provider-kind in the background and forward logs
+echo "Starting cloud-provider-kind in the background..."
+LOG_FILE="/tmp/cloud-provider-kind.log"
+
+nohup cloud-provider-kind > ${LOG_FILE} 2>&1 &
+
+# Save the process ID
+echo $! > /var/run/cloud-provider-kind.pid
+echo "Cloud Provider Kind is running in the background. Logs are being written to ${LOG_FILE}."
+
+echo "Setup complete. All components have been installed successfully."
diff --git a/development/tools/verify.sh → hack/lambda-cloud/verify.sh b/development/tools/verify.sh → hack/lambda-cloud/verify.sh
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# AIBrix Single-Node Deployment on Lambda Instances

		Please refer to doc here for more details.