Merge pull request #94 from kusumachalasani/aiworkloads

Include accelerate benchmarks
kruize · Oct 21, 2024 · f0b089e · f0b089e
2 parents 31264f6 + 7e0f3d5
commit f0b089e
Show file tree

Hide file tree

Showing 10 changed files with 421 additions and 192 deletions.
diff --git a/common/common_helper.sh b/common/common_helper.sh
@@ -292,21 +292,56 @@ function prometheus_install() {
 ###########################################
 function benchmarks_install() {
   	NAMESPACE="${1:-default}"
-	MANIFESTS="${2:-default_manifests}"
+	BENCHMARK="${2:-tfb}"
+	MANIFESTS="${3:-default_manifests}"
+
 	echo
 	echo "#######################################"
 	pushd benchmarks >/dev/null
-		echo "5. Installing TechEmpower (Quarkus REST EASY) benchmark into cluster"
-		pushd techempower >/dev/null
-		# Reduce the requests to 1core-512Mi to accomodate the benchmark in resourcehub
-		sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\10.5/}' ./manifests/${MANIFESTS}/postgres.yaml
-		sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/postgres.yaml
-		sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\11.5/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
-		sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
-
-		kubectl apply -f manifests/${MANIFESTS} -n ${NAMESPACE}
-		check_err "ERROR: TechEmpower app failed to start, exiting"
-		popd >/dev/null
+		if [ ${BENCHMARK} == "tfb" ]; then
+			echo "5. Installing TechEmpower (Quarkus REST EASY) benchmark into cluster"
+			pushd techempower >/dev/null
+			# Reduce the requests to 1core-512Mi to accomodate the benchmark in resourcehub
+			sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\10.5/}' ./manifests/${MANIFESTS}/postgres.yaml
+			sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/postgres.yaml
+			sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\11.5/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
+			sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
+			kubectl apply -f manifests/${MANIFESTS} -n ${NAMESPACE}
+			check_err "ERROR: TechEmpower app failed to start, exiting"
+			popd >/dev/null
+		fi
+		if [ ${BENCHMARK} == "human-eval" ]; then
+			echo "#######################################"
+			echo "Running HumanEval benchmark job in background"
+			echo
+			pushd human-eval-benchmark/manifests >/dev/null
+				sed -i 's/namespace: kruize-hackathon/namespace: "'"${NAMESPACE}"'"/' pvc.yaml
+				sed -i 's/namespace: kruize-hackathon/namespace: "'"${NAMESPACE}"'"/' job.yaml
+				# Update num_prompts value to 150 to run the benchmark for atleast 15 mins
+				sed -i "s/value: '10'/value: '150'/" job.yaml
+				oc apply -f pvc.yaml -n ${NAMESPACE}
+				oc apply -f job.yaml -n ${NAMESPACE}
+				check_err "ERROR: Human eval job failed to start, exiting"
+			popd >/dev/null
+		fi
+		if [ ${BENCHMARK} == "ttm" ]; then
+			echo "#######################################"
+			echo "Running Training TTM benchmark job in background"
+			pushd AI-MLbenchmarks/ttm >/dev/null
+				echo ""
+                                ./run_ttm.sh ${NAMESPACE} >> ${LOG_FILE} &
+                                check_err "ERROR: Training ttm jobs failed to start, exiting"
+			popd >/dev/null
+		fi
+		if [ ${BENCHMARK} == "llm-rag" ]; then
+			echo "#######################################"
+			echo "Installing LLM-RAG benchmark into cluster"
+			pushd AI-MLbenchmarks/llm-rag >/dev/null
+				./deploy.sh ${NAMESPACE}
+				check_err "ERROR: llm-rag benchmark failed to start, exiting"
+			popd >/dev/null
+		fi
+
 	popd >/dev/null
 	echo "#######################################"
 	echo
@@ -317,15 +352,35 @@ function benchmarks_install() {
 ###########################################
 function benchmarks_uninstall() {
         NAMESPACE="${1:-default}"
+	BENCHMARK="${1:-tfb}"
         MANIFESTS="${2:-default_manifests}"
         echo
         echo "#######################################"
         pushd benchmarks >/dev/null
-                echo "Uninstalling TechEmpower (Quarkus REST EASY) benchmark in cluster"
-                pushd techempower >/dev/null
-                kubectl delete -f manifests/${MANIFESTS} -n ${NAMESPACE}
-                check_err "ERROR: TechEmpower app failed to delete, exiting"
-                popd >/dev/null
+		if [ ${BENCHMARK} == "tfb" ]; then
+			echo "Uninstalling TechEmpower (Quarkus REST EASY) benchmark in cluster"
+			pushd techempower >/dev/null
+				kubectl delete -f manifests/${MANIFESTS} -n ${NAMESPACE}
+				check_err "ERROR: TechEmpower app failed to delete, exiting"
+			popd >/dev/null
+		fi
+		if [ ${BENCHMARK} == "human-eval" ]; then
+			echo "Uninstalling humanEval benchmark job in cluster"
+			pushd human-eval-benchmark >/dev/null
+				oc delete -f job.yaml
+				oc delete -f pvc.yaml
+				check_err "ERROR: human-eval benchmark failed to delete, exiting"
+			popd >/dev/null
+		fi
+		if [ ${BENCHMARK} == "ttm" ] || [${BENCHMARK} == "llm-rag"]; then
+
+			echo "Uninstalling ${BENCHMARK} benchmark in cluster"
+			pushd AI-MLbenchmarks/ttm >/dev/null
+				./cleanup.sh ${NAMESPACE}
+				check_err "ERROR: ${BENCHMARK} benchmark failed to delete, exiting"
+			popd >/dev/null
+                fi
+
         popd >/dev/null
         echo "#######################################"
         echo
@@ -337,23 +392,39 @@ function benchmarks_uninstall() {
 function apply_benchmark_load() {
 	TECHEMPOWER_LOAD_IMAGE="quay.io/kruizehub/tfb_hyperfoil_load:0.25.2"
 	APP_NAMESPACE="${1:-default}"
-	LOAD_DURATION="${2:-1200}"
+	BENCHMARK="${2:-tfb}"
+	LOAD_DURATION="${3:-1200}"
 
-	echo
-	echo "################################################################################################################"
-	echo " Starting ${LOAD_DURATION} secs background load against the techempower benchmark in ${APP_NAMESPACE} namespace "
-	echo "################################################################################################################"
-	echo
+	if [ ${BENCHMARK} == "tfb" ]; then
+		if kubectl get pods --namespace ${APP_NAMESPACE} -o jsonpath='{.items[*].metadata.name}' | grep -q "tfb"; then
+			echo
+			echo "################################################################################################################"
+			echo " Starting ${LOAD_DURATION} secs background load against the techempower benchmark in ${APP_NAMESPACE} namespace "
+			echo "################################################################################################################"
+			echo
+			if [ ${CLUSTER_TYPE} == "kind" ] || [ ${CLUSTER_TYPE} == "minikube" ]; then
+				TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
+			elif [ ${CLUSTER_TYPE} == "aks" ]; then
+				TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
+			elif [ ${CLUSTER_TYPE} == "openshift" ]; then
+				TECHEMPOWER_ROUTE=$(oc get route -n ${APP_NAMESPACE} --template='{{range .items}}{{.spec.host}}{{"\n"}}{{end}}')
+			fi
+			# docker run -d --rm --network="host"  ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} <END_POINT> <DURATION> <THREADS> <CONNECTIONS>
+			docker run -d --rm --network="host"  ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} queries?queries=20 ${LOAD_DURATION} 512 4096 #1024 8096
+		fi
+	fi
 
-	if [ ${CLUSTER_TYPE} == "kind" ] || [ ${CLUSTER_TYPE} == "minikube" ]; then
-		TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
-	elif [ ${CLUSTER_TYPE} == "aks" ]; then
-		TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
-	elif [ ${CLUSTER_TYPE} == "openshift" ]; then
-		TECHEMPOWER_ROUTE=$(oc get route -n ${APP_NAMESPACE} --template='{{range .items}}{{.spec.host}}{{"\n"}}{{end}}')
+	if [ ${BENCHMARK} == "llm-rag" ]; then
+		if kubectl get pods --namespace ${APP_NAMESPACE} -o jsonpath='{.items[*].metadata.name}' | grep -q "llm"; then
+			pushd benchmarks/AI-MLbenchmarks/llm-rag >/dev/null
+			echo
+			echo "################################################################################################################"
+			echo " Starting background load against the llm-rag benchmark in ${APP_NAMESPACE} namespace "
+			echo "################################################################################################################"
+			./run_load.sh ${APP_NAMESPACE} >> ${LOG_FILE} &
+			popd >/dev/null
+		fi
 	fi
-	# docker run -d --rm --network="host"  ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} <END_POINT> <DURATION> <THREADS> <CONNECTIONS>
-	docker run -d --rm --network="host"  ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} queries?queries=20 ${LOAD_DURATION} 512 4096 #1024 8096
 
 }
 
@@ -379,10 +450,10 @@ function check_minikube() {
 }
 
 ###########################################
-#   Deploy TFB Benchmarks - multiple import
+#   Create Namespace
 ###########################################
 function create_namespace() {
-	CAPP_NAMESPACE="${1:-test-multiple-import}"
+	CAPP_NAMESPACE=$1
 	echo
 	echo "#########################################"
 	if kubectl get namespace "${CAPP_NAMESPACE}" &> /dev/null; then
@@ -613,12 +684,14 @@ function get_urls() {
 ###########################################
 function show_urls() {
 	if [ ${demo} == "local" ]; then
+		{
 		echo
 		echo "#######################################"
 		echo "#             Quarkus App             #"
 		echo "#######################################"
 		echo "Info: Access techempower app at http://${TECHEMPOWER_URL}/db"
 		echo "Info: Access techempower app metrics at http://${TECHEMPOWER_URL}/q/metrics"
+		} >> "${LOG_FILE}" 2>&1
 	fi
 
 	echo
@@ -659,6 +732,7 @@ function setup_workload() {
 #
 #
 function kruize_local_demo_setup() {
+	bench=$1
 	# Start all the installs
 	start_time=$(get_date)
 	echo
@@ -667,6 +741,8 @@ function kruize_local_demo_setup() {
 	echo "#######################################"
 	echo
 
+	{
+
 	if [ ${kruize_restart} -eq 0 ]; then
 		clone_repos autotune
 		clone_repos benchmarks
@@ -685,7 +761,13 @@ function kruize_local_demo_setup() {
 			prometheus_install
 		fi
 		if [ ${demo} == "local" ]; then
-			benchmarks_install
+			create_namespace ${APP_NAMESPACE}
+			if [ ${#EXPERIMENTS[@]} -ne 0 ]; then
+				benchmarks_install ${APP_NAMESPACE} ${bench}
+			fi
+			echo ""
+		elif [ ${demo} == "bulk" ]; then
+			setup_workload
 		fi
 	fi
 	kruize_local_patch
@@ -698,9 +780,13 @@ function kruize_local_demo_setup() {
 
 	get_urls
 
+	} >> "${LOG_FILE}" 2>&1
+
 	if [ ${demo} == "local" ]; then
-		# Run the Kruize Local experiments
 		kruize_local
+		if [ ${#EXPERIMENTS[@]} -ne 0 ]; then
+			kruize_local_experiments
+		fi
 		show_urls
 	elif [ ${demo} == "bulk" ]; then
 		kruize_bulk
@@ -718,26 +804,18 @@ function kruize_local_demo_setup() {
 function kruize_local_demo_update() {
         # Start all the installs
         start_time=$(get_date)
-
+	bench=$1
 	if [ ${demo} == "local" ]; then
 		if [ ${benchmark} -eq 1 ]; then
-			echo
-	                echo "############################################"
-        	        echo "#     Deploy TFB on ${APP_NAMESPACE}        "
-                	echo "############################################"
 	                echo
 			create_namespace ${APP_NAMESPACE}
-			benchmarks_install ${APP_NAMESPACE} "resource_provisioning_manifests"
+			benchmarks_install ${APP_NAMESPACE} ${bench} "resource_provisioning_manifests"
 	                echo "Success! Running the benchmark in ${APP_NAMESPACE}"
         	        echo
 		fi
 		if [ ${benchmark_load} -eq 1 ]; then
 			echo
-			echo "#######################################"
-			echo "#     Apply the benchmark load        #"
-			echo "#######################################"
-			echo
-			apply_benchmark_load ${APP_NAMESPACE} ${LOAD_DURATION}
+			apply_benchmark_load ${APP_NAMESPACE} ${bench} ${LOAD_DURATION}
 			echo "Success! Running the benchmark load for ${LOAD_DURATION} seconds"
 			echo
 		fi
@@ -759,6 +837,7 @@ function kruize_local_demo_terminate() {
 	echo "#       Kruize Demo Terminate       #"
 	echo "#######################################"
 	echo
+	{
 	if [ ${CLUSTER_TYPE} == "minikube" ]; then
 		minikube_delete
 	elif [ ${CLUSTER_TYPE} == "kind" ]; then
@@ -767,7 +846,7 @@ function kruize_local_demo_terminate() {
 		kruize_uninstall
 	fi
 	if [ ${demo} == "local" ]; then
-		delete_namespace "test-multiple-import"
+		delete_namespace ${APP_NAMESPACE}
 	elif [ ${demo} == "bulk" ]; then
 		ns_name="tfb"
 		count=3
@@ -781,6 +860,7 @@ function kruize_local_demo_terminate() {
 	fi
 	delete_repos autotune
 	delete_repos "benchmarks"
+	} >> "${LOG_FILE}" 2>&1
 	end_time=$(get_date)
 	elapsed_time=$(time_diff "${start_time}" "${end_time}")
 	echo "Success! Kruize demo cleanup took ${elapsed_time} seconds"

diff --git a/monitoring/local_monitoring/ReadMe.md b/monitoring/local_monitoring/ReadMe.md
@@ -28,8 +28,9 @@ By default, it runs on the `Kind` cluster.
 ```
 
 ```
-Usage: ./local_monitoring_demo.sh [-s|-t] [-c cluster-type] [-l] [-p] [-r] [-i kruize-image] [-u kruize-ui-image] [-b] [-n namespace] [-d load-duration] [-m benchmark-manifests]
+Usage: ./local_monitoring_demo.sh [-s|-t] [-c cluster-type] [-e recommendation_experiment] [-l] [-p] [-r] [-i kruize-image] [-u kruize-ui-image] [-b] [-n namespace] [-d load-duration] [-m benchmark-manifests]
 c = supports minikube, kind and openshift cluster-type
+e = supports container, namespace and gpu. Default - none.
 i = kruize image. Default - quay.io/kruize/autotune_operator:<version as in pom.xml>
 l = Run a load against the benchmark
 p = expose prometheus port
@@ -44,13 +45,23 @@ m = manifests of the benchmark
 
 ## Understanding the Demo
 
-This demo focuses on using the TFB (TechEmpower Framework Benchmarks) benchmark to simulate different load conditions and observe how Kruize-Autotune reacts with its recommendations. Here’s a breakdown of what happens during the demo:
-
-- TFB deployment in default Namespace
-    - The TFB benchmark is initially deployed in the default namespace, comprising two key deployments
-      - tfb-qrh: Serving as the application server.
-      - tfb-database: Database to the server.
-    - Load is applied to the server for 20 mins within this namespace to simulate real-world usage scenarios
+This demo focuses on installing kruize and also install the benchmarks if asked for through `-e` parameter.
+- By default, it installs kruize and provides the URL to access the kruize UI service where the user can create experiments and generate recommendations.
+- To use demo benchmarks to create and generate recommendations through a script, pass -e for container, namespace and gpu benchmarks.
+    - For container and namespace type, benchmark 'TFB' is deployed in a namespace.
+    - For gpu type, benchmark 'human-eval' is deployed.
+
+Here’s a breakdown of what happens during the demo:
+
+- Deploys benchmarks in a namespace (if -e is passed)
+    - If -e is container/namespace
+        - The TFB benchmark is initially deployed in the namespace, comprising two key deployments
+          - tfb-qrh: Serving as the application server.
+          - tfb-database: Database to the server.
+        - Load is applied to the server for 20 mins within this namespace to simulate real-world usage scenarios
+    - If -e is gpu
+        - The human-eval benchmark is deployed as job in the namespace.
+        - The job is set to run for atleast 20 mins to generate the recommendations.
 - Install Kruize
   - Installs kruize under openshift-tuning name.
 - Metadata Collection and Experiment Creation
@@ -60,6 +71,9 @@ This demo focuses on using the TFB (TechEmpower Framework Benchmarks) benchmark
   - Generates Recommendations for all the experiments created.
 
 ## Recommendations for different load Simulations observed on Openshift
+
+TFB (TechEmpower Framework Benchmarks) benchmark is simulated in different load conditions and below are the different recommendations observed from Kruize-Autotune.
+
 ### IDLE 
 - Experiment: `monitor_tfb-db_benchmark`
   - Shows an IDLE scenario where CPU recommendations are not generated due to minimal CPU usage (less than a millicore).