Skip to content

Commit

Permalink
Merge pull request #94 from kusumachalasani/aiworkloads
Browse files Browse the repository at this point in the history
Include accelerate benchmarks
  • Loading branch information
dinogun authored Oct 21, 2024
2 parents 31264f6 + 7e0f3d5 commit f0b089e
Show file tree
Hide file tree
Showing 10 changed files with 421 additions and 192 deletions.
174 changes: 127 additions & 47 deletions common/common_helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -292,21 +292,56 @@ function prometheus_install() {
###########################################
function benchmarks_install() {
NAMESPACE="${1:-default}"
MANIFESTS="${2:-default_manifests}"
BENCHMARK="${2:-tfb}"
MANIFESTS="${3:-default_manifests}"

echo
echo "#######################################"
pushd benchmarks >/dev/null
echo "5. Installing TechEmpower (Quarkus REST EASY) benchmark into cluster"
pushd techempower >/dev/null
# Reduce the requests to 1core-512Mi to accomodate the benchmark in resourcehub
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\10.5/}' ./manifests/${MANIFESTS}/postgres.yaml
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/postgres.yaml
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\11.5/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml

kubectl apply -f manifests/${MANIFESTS} -n ${NAMESPACE}
check_err "ERROR: TechEmpower app failed to start, exiting"
popd >/dev/null
if [ ${BENCHMARK} == "tfb" ]; then
echo "5. Installing TechEmpower (Quarkus REST EASY) benchmark into cluster"
pushd techempower >/dev/null
# Reduce the requests to 1core-512Mi to accommodate the benchmark in resourcehub
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\10.5/}' ./manifests/${MANIFESTS}/postgres.yaml
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/postgres.yaml
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\11.5/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
kubectl apply -f manifests/${MANIFESTS} -n ${NAMESPACE}
check_err "ERROR: TechEmpower app failed to start, exiting"
popd >/dev/null
fi
if [ ${BENCHMARK} == "human-eval" ]; then
echo "#######################################"
echo "Running HumanEval benchmark job in background"
echo
pushd human-eval-benchmark/manifests >/dev/null
sed -i 's/namespace: kruize-hackathon/namespace: "'"${NAMESPACE}"'"/' pvc.yaml
sed -i 's/namespace: kruize-hackathon/namespace: "'"${NAMESPACE}"'"/' job.yaml
# Update num_prompts value to 150 to run the benchmark for at least 15 mins
sed -i "s/value: '10'/value: '150'/" job.yaml
oc apply -f pvc.yaml -n ${NAMESPACE}
oc apply -f job.yaml -n ${NAMESPACE}
check_err "ERROR: Human eval job failed to start, exiting"
popd >/dev/null
fi
if [ ${BENCHMARK} == "ttm" ]; then
echo "#######################################"
echo "Running Training TTM benchmark job in background"
pushd AI-MLbenchmarks/ttm >/dev/null
echo ""
./run_ttm.sh ${NAMESPACE} >> ${LOG_FILE} &
check_err "ERROR: Training ttm jobs failed to start, exiting"
popd >/dev/null
fi
if [ ${BENCHMARK} == "llm-rag" ]; then
echo "#######################################"
echo "Installing LLM-RAG benchmark into cluster"
pushd AI-MLbenchmarks/llm-rag >/dev/null
./deploy.sh ${NAMESPACE}
check_err "ERROR: llm-rag benchmark failed to start, exiting"
popd >/dev/null
fi

popd >/dev/null
echo "#######################################"
echo
Expand All @@ -317,15 +352,35 @@ function benchmarks_install() {
###########################################
function benchmarks_uninstall() {
NAMESPACE="${1:-default}"
BENCHMARK="${1:-tfb}"
MANIFESTS="${2:-default_manifests}"
echo
echo "#######################################"
pushd benchmarks >/dev/null
echo "Uninstalling TechEmpower (Quarkus REST EASY) benchmark in cluster"
pushd techempower >/dev/null
kubectl delete -f manifests/${MANIFESTS} -n ${NAMESPACE}
check_err "ERROR: TechEmpower app failed to delete, exiting"
popd >/dev/null
if [ ${BENCHMARK} == "tfb" ]; then
echo "Uninstalling TechEmpower (Quarkus REST EASY) benchmark in cluster"
pushd techempower >/dev/null
kubectl delete -f manifests/${MANIFESTS} -n ${NAMESPACE}
check_err "ERROR: TechEmpower app failed to delete, exiting"
popd >/dev/null
fi
if [ ${BENCHMARK} == "human-eval" ]; then
echo "Uninstalling humanEval benchmark job in cluster"
pushd human-eval-benchmark >/dev/null
oc delete -f job.yaml
oc delete -f pvc.yaml
check_err "ERROR: human-eval benchmark failed to delete, exiting"
popd >/dev/null
fi
# Uninstall the ttm or llm-rag benchmark by running cleanup.sh from the
# AI-MLbenchmarks/ttm directory with the target namespace as its argument.
# The test brackets need surrounding whitespace: "[${BENCHMARK}" would be
# parsed as a command name (e.g. "[llm-rag") and the branch would never run
# for llm-rag.
# NOTE(review): llm-rag is installed from AI-MLbenchmarks/llm-rag; confirm that
# ttm/cleanup.sh is really the intended cleanup script for it as well.
if [ "${BENCHMARK}" == "ttm" ] || [ "${BENCHMARK}" == "llm-rag" ]; then
	echo "Uninstalling ${BENCHMARK} benchmark in cluster"
	pushd AI-MLbenchmarks/ttm >/dev/null
	./cleanup.sh "${NAMESPACE}"
	check_err "ERROR: ${BENCHMARK} benchmark failed to delete, exiting"
	popd >/dev/null
fi

popd >/dev/null
echo "#######################################"
echo
Expand All @@ -337,23 +392,39 @@ function benchmarks_uninstall() {
function apply_benchmark_load() {
TECHEMPOWER_LOAD_IMAGE="quay.io/kruizehub/tfb_hyperfoil_load:0.25.2"
APP_NAMESPACE="${1:-default}"
LOAD_DURATION="${2:-1200}"
BENCHMARK="${2:-tfb}"
LOAD_DURATION="${3:-1200}"

echo
echo "################################################################################################################"
echo " Starting ${LOAD_DURATION} secs background load against the techempower benchmark in ${APP_NAMESPACE} namespace "
echo "################################################################################################################"
echo
if [ ${BENCHMARK} == "tfb" ]; then
if kubectl get pods --namespace ${APP_NAMESPACE} -o jsonpath='{.items[*].metadata.name}' | grep -q "tfb"; then
echo
echo "################################################################################################################"
echo " Starting ${LOAD_DURATION} secs background load against the techempower benchmark in ${APP_NAMESPACE} namespace "
echo "################################################################################################################"
echo
if [ ${CLUSTER_TYPE} == "kind" ] || [ ${CLUSTER_TYPE} == "minikube" ]; then
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
elif [ ${CLUSTER_TYPE} == "aks" ]; then
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
elif [ ${CLUSTER_TYPE} == "openshift" ]; then
TECHEMPOWER_ROUTE=$(oc get route -n ${APP_NAMESPACE} --template='{{range .items}}{{.spec.host}}{{"\n"}}{{end}}')
fi
# docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} <END_POINT> <DURATION> <THREADS> <CONNECTIONS>
docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} queries?queries=20 ${LOAD_DURATION} 512 4096 #1024 8096
fi
fi

if [ ${CLUSTER_TYPE} == "kind" ] || [ ${CLUSTER_TYPE} == "minikube" ]; then
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
elif [ ${CLUSTER_TYPE} == "aks" ]; then
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
elif [ ${CLUSTER_TYPE} == "openshift" ]; then
TECHEMPOWER_ROUTE=$(oc get route -n ${APP_NAMESPACE} --template='{{range .items}}{{.spec.host}}{{"\n"}}{{end}}')
if [ ${BENCHMARK} == "llm-rag" ]; then
if kubectl get pods --namespace ${APP_NAMESPACE} -o jsonpath='{.items[*].metadata.name}' | grep -q "llm"; then
pushd benchmarks/AI-MLbenchmarks/llm-rag >/dev/null
echo
echo "################################################################################################################"
echo " Starting background load against the llm-rag benchmark in ${APP_NAMESPACE} namespace "
echo "################################################################################################################"
./run_load.sh ${APP_NAMESPACE} >> ${LOG_FILE} &
popd >/dev/null
fi
fi
# docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} <END_POINT> <DURATION> <THREADS> <CONNECTIONS>
docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} queries?queries=20 ${LOAD_DURATION} 512 4096 #1024 8096

}

Expand All @@ -379,10 +450,10 @@ function check_minikube() {
}

###########################################
# Deploy TFB Benchmarks - multiple import
# Create Namespace
###########################################
function create_namespace() {
CAPP_NAMESPACE="${1:-test-multiple-import}"
CAPP_NAMESPACE=$1
echo
echo "#########################################"
if kubectl get namespace "${CAPP_NAMESPACE}" &> /dev/null; then
Expand Down Expand Up @@ -613,12 +684,14 @@ function get_urls() {
###########################################
function show_urls() {
if [ ${demo} == "local" ]; then
{
echo
echo "#######################################"
echo "# Quarkus App #"
echo "#######################################"
echo "Info: Access techempower app at http://${TECHEMPOWER_URL}/db"
echo "Info: Access techempower app metrics at http://${TECHEMPOWER_URL}/q/metrics"
} >> "${LOG_FILE}" 2>&1
fi

echo
Expand Down Expand Up @@ -659,6 +732,7 @@ function setup_workload() {
#
#
function kruize_local_demo_setup() {
bench=$1
# Start all the installs
start_time=$(get_date)
echo
Expand All @@ -667,6 +741,8 @@ function kruize_local_demo_setup() {
echo "#######################################"
echo

{

if [ ${kruize_restart} -eq 0 ]; then
clone_repos autotune
clone_repos benchmarks
Expand All @@ -685,7 +761,13 @@ function kruize_local_demo_setup() {
prometheus_install
fi
if [ ${demo} == "local" ]; then
benchmarks_install
create_namespace ${APP_NAMESPACE}
if [ ${#EXPERIMENTS[@]} -ne 0 ]; then
benchmarks_install ${APP_NAMESPACE} ${bench}
fi
echo ""
elif [ ${demo} == "bulk" ]; then
setup_workload
fi
fi
kruize_local_patch
Expand All @@ -698,9 +780,13 @@ function kruize_local_demo_setup() {

get_urls

} >> "${LOG_FILE}" 2>&1

if [ ${demo} == "local" ]; then
# Run the Kruize Local experiments
kruize_local
if [ ${#EXPERIMENTS[@]} -ne 0 ]; then
kruize_local_experiments
fi
show_urls
elif [ ${demo} == "bulk" ]; then
kruize_bulk
Expand All @@ -718,26 +804,18 @@ function kruize_local_demo_setup() {
function kruize_local_demo_update() {
# Start all the installs
start_time=$(get_date)

bench=$1
if [ ${demo} == "local" ]; then
if [ ${benchmark} -eq 1 ]; then
echo
echo "############################################"
echo "# Deploy TFB on ${APP_NAMESPACE} "
echo "############################################"
echo
create_namespace ${APP_NAMESPACE}
benchmarks_install ${APP_NAMESPACE} "resource_provisioning_manifests"
benchmarks_install ${APP_NAMESPACE} ${bench} "resource_provisioning_manifests"
echo "Success! Running the benchmark in ${APP_NAMESPACE}"
echo
fi
if [ ${benchmark_load} -eq 1 ]; then
echo
echo "#######################################"
echo "# Apply the benchmark load #"
echo "#######################################"
echo
apply_benchmark_load ${APP_NAMESPACE} ${LOAD_DURATION}
apply_benchmark_load ${APP_NAMESPACE} ${bench} ${LOAD_DURATION}
echo "Success! Running the benchmark load for ${LOAD_DURATION} seconds"
echo
fi
Expand All @@ -759,6 +837,7 @@ function kruize_local_demo_terminate() {
echo "# Kruize Demo Terminate #"
echo "#######################################"
echo
{
if [ ${CLUSTER_TYPE} == "minikube" ]; then
minikube_delete
elif [ ${CLUSTER_TYPE} == "kind" ]; then
Expand All @@ -767,7 +846,7 @@ function kruize_local_demo_terminate() {
kruize_uninstall
fi
if [ ${demo} == "local" ]; then
delete_namespace "test-multiple-import"
delete_namespace ${APP_NAMESPACE}
elif [ ${demo} == "bulk" ]; then
ns_name="tfb"
count=3
Expand All @@ -781,6 +860,7 @@ function kruize_local_demo_terminate() {
fi
delete_repos autotune
delete_repos "benchmarks"
} >> "${LOG_FILE}" 2>&1
end_time=$(get_date)
elapsed_time=$(time_diff "${start_time}" "${end_time}")
echo "Success! Kruize demo cleanup took ${elapsed_time} seconds"
Expand Down
30 changes: 22 additions & 8 deletions monitoring/local_monitoring/ReadMe.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ By default, it runs on the `Kind` cluster.
```

```
Usage: ./local_monitoring_demo.sh [-s|-t] [-c cluster-type] [-l] [-p] [-r] [-i kruize-image] [-u kruize-ui-image] [-b] [-n namespace] [-d load-duration] [-m benchmark-manifests]
Usage: ./local_monitoring_demo.sh [-s|-t] [-c cluster-type] [-e recommendation_experiment] [-l] [-p] [-r] [-i kruize-image] [-u kruize-ui-image] [-b] [-n namespace] [-d load-duration] [-m benchmark-manifests]
c = supports minikube, kind and openshift cluster-type
e = supports container, namespace and gpu. Default - none.
i = kruize image. Default - quay.io/kruize/autotune_operator:<version as in pom.xml>
l = Run a load against the benchmark
p = expose prometheus port
Expand All @@ -44,13 +45,23 @@ m = manifests of the benchmark

## Understanding the Demo

This demo focuses on using the TFB (TechEmpower Framework Benchmarks) benchmark to simulate different load conditions and observe how Kruize-Autotune reacts with its recommendations. Here’s a breakdown of what happens during the demo:

- TFB deployment in default Namespace
- The TFB benchmark is initially deployed in the default namespace, comprising two key deployments
- tfb-qrh: Serving as the application server.
- tfb-database: Database to the server.
- Load is applied to the server for 20 mins within this namespace to simulate real-world usage scenarios
This demo installs Kruize and, when requested through the `-e` parameter, also installs the demo benchmarks.
- By default, it installs kruize and provides the URL to access the kruize UI service where the user can create experiments and generate recommendations.
- To have the script deploy a demo benchmark, create experiments and generate recommendations for it, pass `-e` with one of `container`, `namespace` or `gpu`.
- For container and namespace type, benchmark 'TFB' is deployed in a namespace.
- For gpu type, benchmark 'human-eval' is deployed.

Here’s a breakdown of what happens during the demo:

- Deploys benchmarks in a namespace (if -e is passed)
- If -e is container/namespace
- The TFB benchmark is initially deployed in the namespace, comprising two key deployments
- tfb-qrh: Serving as the application server.
- tfb-database: Database to the server.
- Load is applied to the server for 20 mins within this namespace to simulate real-world usage scenarios
- If -e is gpu
- The human-eval benchmark is deployed as a job in the namespace.
- The job is set to run for at least 20 mins to generate the recommendations.
- Install Kruize
- Installs Kruize under the openshift-tuning namespace.
- Metadata Collection and Experiment Creation
Expand All @@ -60,6 +71,9 @@ This demo focuses on using the TFB (TechEmpower Framework Benchmarks) benchmark
- Generates Recommendations for all the experiments created.

## Recommendations for different load Simulations observed on Openshift

The TFB (TechEmpower Framework Benchmarks) benchmark was run under different load conditions; the recommendations observed from Kruize-Autotune for each condition are listed below.

### IDLE
- Experiment: `monitor_tfb-db_benchmark`
- Shows an IDLE scenario where CPU recommendations are not generated due to minimal CPU usage (less than a millicore).
Expand Down
Loading

0 comments on commit f0b089e

Please sign in to comment.