From 565572b61c8c05538c237f7b59268a40e01939b1 Mon Sep 17 00:00:00 2001
From: Partho Sarthi
Date: Tue, 10 Oct 2023 19:07:34 -0700
Subject: [PATCH] Update docs and pylints

Signed-off-by: Partho Sarthi
---
 user_tools/docs/index.md                              |   4 +
 user_tools/docs/user-tools-dataproc-gke.md            | 180 ++++++++++++++++++
 .../cloud_api/dataproc_gke.py                         |   2 +-
 .../cloud_api/sp_types.py                             |   1 +
 .../src/spark_rapids_tools/cmdli/tools_cli.py         |   4 +-
 user_tools/tests/mock_cluster.py                      |   3 +-
 6 files changed, 190 insertions(+), 4 deletions(-)
 create mode 100644 user_tools/docs/user-tools-dataproc-gke.md

diff --git a/user_tools/docs/index.md b/user_tools/docs/index.md
index 48481b004..e47e83078 100644
--- a/user_tools/docs/index.md
+++ b/user_tools/docs/index.md
@@ -95,6 +95,9 @@ The following table summarizes the commands supported for each cloud platform:
 |                  | diagnostic    | spark_rapids_user_tools \               | 23.06+   |
 |                  |               |   dataproc diagnostic [ARGS]            |          |
 +------------------+---------------+-----------------------------------------+----------+
+| Dataproc_GKE     | qualification | spark_rapids_user_tools \               | 23.08.2+ |
+|                  |               |   dataproc-gke qualification [ARGS]     |          |
++------------------+---------------+-----------------------------------------+----------+
 | Databricks_AWS   | qualification | spark_rapids_user_tools \               | 23.04+   |
 |                  |               |   databricks-aws qualification [ARGS]   |          |
 |                  +---------------+-----------------------------------------+----------+
@@ -131,6 +134,7 @@ platform:
 
 - [AWS EMR](user-tools-aws-emr.md)
 - [Google Cloud Dataproc](user-tools-dataproc.md)
+- [Google Cloud Dataproc GKE](user-tools-dataproc-gke.md)
 - [Databricks_AWS](user-tools-databricks-aws.md)
 - [Databricks_Azure](user-tools-databricks-azure.md)
 - [OnPrem](user-tools-onprem.md)

diff --git a/user_tools/docs/user-tools-dataproc-gke.md b/user_tools/docs/user-tools-dataproc-gke.md
new file mode 100644
index 000000000..6fe2c5955
--- /dev/null
+++ b/user_tools/docs/user-tools-dataproc-gke.md
@@ -0,0 +1,180 @@
# RAPIDS User Tools on Dataproc GKE

This is a guide for the RAPIDS tools for Apache Spark on [Google Cloud Dataproc GKE](https://cloud.google.com/dataproc/docs/guides/dpgke/dataproc-gke-overview).
By the end of this guide, the user will be able to run the RAPIDS tools to analyze clusters and
applications running on _Google Cloud Dataproc GKE_.


## Prerequisites

### 1. gcloud CLI

- Install the gcloud CLI. Follow the instructions on [gcloud-sdk-install](https://cloud.google.com/sdk/docs/install)
- Set the configuration settings and credentials of the gcloud CLI:
  - Initialize the gcloud CLI by following [these instructions](https://cloud.google.com/sdk/docs/initializing#initialize_the)
  - Grant authorization to the gcloud CLI [with a user account](https://cloud.google.com/sdk/docs/authorizing#authorize_with_a_user_account)
  - Set up application default credentials for the gcloud CLI [by logging in](https://cloud.google.com/sdk/docs/authorizing#set_up_application_default_credentials)
  - Manage gcloud CLI configurations. For more details, visit [gcloud-sdk-configurations](https://cloud.google.com/sdk/docs/configurations)
  - Verify that the following [gcloud CLI properties](https://cloud.google.com/sdk/docs/properties) are properly defined:
    - `dataproc/region`
    - `compute/zone`
    - `compute/region`
    - `core/project`

### 2. RAPIDS tools

- Spark event logs:
  - The RAPIDS tools can process Apache Spark CPU event logs from Spark 2.0 or higher (raw, .lz4, .lzf, .snappy, .zstd)
  - For `qualification` commands, the event logs need to be archived to an accessible gs folder.

### 3. Install the package

- Install `spark-rapids-user-tools` with Python [3.8, 3.10] using:
  - pip: `pip install spark-rapids-user-tools`
  - wheel-file: `pip install <wheel-file>`
  - from source: `pip install -e .`
- Verify that the command is installed correctly by running:
  ```bash
  spark_rapids_user_tools dataproc-gke -- --help
  ```

### 4. Environment variables

Before running any command, you can set environment variables to specify configurations.
RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`:
- `RAPIDS_USER_TOOLS_CACHE_FOLDER`: specifies the location of a local directory that the RAPIDS-cli uses to
  store and cache the downloaded resources. The default is `/var/tmp/spark_rapids_user_tools_cache`.
  Note that caching the resources locally has an impact on the total execution time of the command.
- `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY`: specifies the location of a local directory that the RAPIDS-cli uses to
  generate the output. The wrapper CLI arguments override that environment variable
  (`--output_folder` and `local_folder` for Bootstrap and Qualification respectively).

## Qualification command

### Local deployment

```
spark_rapids_user_tools dataproc-gke qualification [options]
spark_rapids_user_tools dataproc-gke qualification -- --help
```

The local deployment runs on the local development machine. It requires:
1. Installing and configuring the gcloud CLI (`gsutil` and `gcloud` commands)
2. A Java 1.8+ development environment
3. Internet access to download JAR dependencies from mvn: `spark-*.jar` and `gcs-connector-hadoop-*.jar`
4. Local disk space; dependencies are cached on the local disk to reduce the overhead of the download


#### Command options

| Option | Description | Default | Required |
|--------|-------------|---------|:--------:|
| **cpu_cluster** | The virtual Dataproc cluster on which the Apache Spark applications were executed. Accepted values are a virtual Dataproc cluster name, or a valid path to the cluster properties file (json format) generated by the gcloud CLI command `gcloud dataproc clusters describe`. This should not be confused with the GKE cluster name. | N/A | N |
| **eventlogs** | A comma-separated list of gs urls pointing to event logs or a gs directory | Reads the Spark property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included in the output of `dataproc clusters describe`. Note that the wrapper will raise an exception if the property is not set. | N |
| **remote_folder** | The gs folder to which the wrapper's output is copied. If missing, the output will be available only on the local disk | N/A | N |
| **gpu_cluster** | The virtual Dataproc cluster to which the Spark applications are planned to be migrated. The argument can be a virtual Dataproc cluster name or a valid path to the cluster's properties file (json format) generated by the gcloud CLI command `gcloud dataproc clusters describe` | The wrapper maps the machine instances of the original cluster into GPU-supported instances | N |
| **local_folder** | Local work-directory path to store the output and to be used as the root directory for temporary folders/files. The final output will go into a subdirectory named `qual-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N |
| **jvm_heap_size** | The maximum heap size of the JVM in gigabytes | 24 | N |
| **tools_jar** | Path to a bundled jar including the RAPIDS tool. The path can be a local filesystem path or a remote gs url | Downloads the latest `rapids-4-spark-tools_*.jar` from the mvn repo | N |
| **credentials_file** | The local path of the JSON file that contains the application credentials | If missing, loads the env variable `GOOGLE_APPLICATION_CREDENTIALS` if any. Otherwise, it uses the default path "$HOME/.config/gcloud/application_default_credentials.json" | N |
| **filter_apps** | Filtering criteria for the applications listed in the final STDOUT table. Accepts one of the following (`ALL`, `SPEEDUPS`, `SAVINGS`). "`ALL`" means no filter applied. "`SPEEDUPS`" lists all the apps that are either '_Recommended_' or '_Strongly Recommended_' based on speedups. "`SAVINGS`" lists all the apps that have positive estimated GPU savings except for the apps that are '_Not Applicable_'. | `SAVINGS` | N |
| **gpu_cluster_recommendation** | The type of GPU cluster recommendation to generate. Accepts one of the following (`CLUSTER`, `JOB`, `MATCH`). `MATCH`: keep the GPU cluster with the same number of nodes as the CPU cluster; `CLUSTER`: recommend the optimal GPU cluster by cost for the entire cluster; `JOB`: recommend the optimal GPU cluster by cost per job | `MATCH` | N |
| **cpu_discount** | A percent discount for the cpu cluster cost in the form of an integer value (e.g. 30 for a 30% discount) | N/A | N |
| **gpu_discount** | A percent discount for the gpu cluster cost in the form of an integer value (e.g. 30 for a 30% discount) | N/A | N |
| **global_discount** | A percent discount for both the cpu and gpu cluster costs in the form of an integer value (e.g. 30 for a 30% discount) | N/A | N |
| **verbose** | True or False to enable verbosity of the wrapper script | False if `RAPIDS_USER_TOOLS_LOG_DEBUG` is not set | N |
| **rapids_options** | A list of valid [Qualification tool options](../../core/docs/spark-qualification-tool.md#qualification-tool-options). Note that the (`output-directory`, `platform`) flags are ignored, and that multiple "spark-property" arguments are not supported. | N/A | N |

#### Use case scenario

A typical workflow to successfully run the `qualification` command in local mode is described as follows:

1. Store the Apache Spark event logs in a gs folder.
2. The user sets up their development machine:
   1. configures Java
   2. installs the gcloud CLI and configures the profile and the credentials to make sure the gcloud CLI
      commands can access the gs resources `LOGS_BUCKET`.
   3. installs `spark_rapids_user_tools`
3. If the results of the wrapper need to be stored on gs, then another gs uri is required: `REMOTE_FOLDER=gs://OUT_BUCKET/`
4. The user defines the virtual Dataproc cluster on which the Spark applications were running. Note that the cluster needs to be
   active; Dataproc on GKE cannot be in a `STOPPED` state. It has to be visible to the gcloud CLI (i.e., the user can run `gcloud dataproc clusters describe
   cluster_name`).
5. The following script runs qualification by passing a gs remote directory to store the output:

   ```
   # define the wrapper cache directory if necessary
   export RAPIDS_USER_TOOLS_CACHE_FOLDER=my_cache_folder
   export EVENTLOGS=gs://LOGS_BUCKET/eventlogs/
   export CLUSTER_NAME=my-virtual-dataproc-cpu-cluster
   export REMOTE_FOLDER=gs://OUT_BUCKET/wrapper_output

   spark_rapids_user_tools dataproc-gke qualification \
      --eventlogs $EVENTLOGS \
      --cpu_cluster $CLUSTER_NAME \
      --remote_folder $REMOTE_FOLDER
   ```
   The wrapper generates a unique ID for each execution in the format of `qual_<YYYYmmddHHmmss>_<0x%08X>`.
   The above command will generate a directory containing `qualification_summary.csv` in addition to
   the actual folder of the RAPIDS Qualification tool. The directory will be mirrored to the gs path (`REMOTE_FOLDER`).

   ```
   ./qual_<YYYYmmddHHmmss>_<0x%08X>/qualification_summary.csv
   ./qual_<YYYYmmddHHmmss>_<0x%08X>/rapids_4_spark_qualification_output/
   ```

### Qualification output

For each app, the command output lists the following fields:

- `App ID`: An application is referenced by its application ID, '_app-id_'. When running on YARN,
  each application may have multiple attempts, but there are attempt IDs only for applications
  in cluster mode, not applications in client mode. Applications in YARN cluster mode can be
  identified by their attempt-id.
- `App Name`: Name of the application
- `Speedup Based Recommendation`: Recommendation based on '_Estimated Speed-up Factor_'. Note that an
  application that has job or stage failures will be labeled '_Not Applicable_'
- `Savings Based Recommendation`: Recommendation based on '_Estimated GPU Savings_'.
  - '_Strongly Recommended_': An app with savings GEQ 40%
  - '_Recommended_': An app with savings between (1, 40) %
  - '_Not Recommended_': An app with no savings
  - '_Not Applicable_': An app that has job or stage failures.
- `Estimated GPU Speedup`: Speed-up factor estimated for the app. Calculated as the ratio
  between '_App Duration_' and '_Estimated GPU Duration_'.
- `Estimated GPU Duration`: Predicted runtime of the app if it were run on GPU
- `App Duration`: Wall-clock time measured from when the application starts until it completes.
  If an app is not completed, an estimated completion time is computed.
- `Estimated GPU Savings(%)`: Percentage of cost savings of the app if it migrates to an
  accelerated cluster. It is calculated as shown below:
  ```
  estimated_saving = 100 - ((100 * gpu_cost) / cpu_cost)
  ```
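To make the arithmetic concrete, here is a minimal Python sketch of the savings formula above. The function name and the cost values are hypothetical, purely for illustration; they are not produced by or part of the wrapper:

```python
# A minimal sketch of the savings formula above; illustrative only.
def estimated_gpu_savings(cpu_cost: float, gpu_cost: float) -> float:
    """Return the estimated GPU savings as a percentage of the CPU cost."""
    return 100 - ((100 * gpu_cost) / cpu_cost)

# Example with hypothetical costs: a run costing 200 on CPU and 120 on GPU
# saves an estimated 40%, right at the 'Strongly Recommended' threshold.
print(estimated_gpu_savings(cpu_cost=200.0, gpu_cost=120.0))  # 40.0
```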
The command creates a directory with a UUID that contains the following:
- The directory generated by the RAPIDS qualification tool, `rapids_4_spark_qualification_output`
- A CSV file that contains the summary of all the applications along with estimated absolute costs
- Sample directory structure:
  ```
  qual_20230314145334_d2CaFA34
  ├── qualification_summary.csv
  └── rapids_4_spark_qualification_output
      ├── ui
      │   └── html
      │       ├── sql-recommendation.html
      │       ├── index.html
      │       ├── application.html
      │       └── raw.html
      ├── rapids_4_spark_qualification_output_stages.csv
      ├── rapids_4_spark_qualification_output.csv
      ├── rapids_4_spark_qualification_output_execs.csv
      └── rapids_4_spark_qualification_output.log
  3 directories, 9 files
  ```

#### TCO calculator

In the `qualification_summary.csv` output file, you will see two additional columns appended:
`Estimated Job Frequency (monthly)` and `Annual Cost Savings`.
These new columns are to be used as part of a TCO calculator to see the long-term benefit of using
Spark RAPIDS with your applications.
A GSheet template with instructions can be found here: [link](https://docs.google.com/spreadsheets/d/1CslQHTwxHEDTlAP4lcrOzbSrmucvn8z4iFlJo6EAhxs/edit#gid=1607726286).
Make a copy of the GSheet template and then follow the instructions listed in the `Instructions` tab.

diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc_gke.py b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc_gke.py
index a84ec7d29..06a71d342 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc_gke.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc_gke.py
@@ -135,7 +135,7 @@ class DataprocGkeCluster(DataprocCluster):
     """
     Represents an instance of running cluster on DataprocGke.
     """
-    node_pools: list[GkeNodePool] = field(default=None, init=False)
+    node_pools: list = field(default=None, init=False)
 
     @staticmethod
     def __extract_info_from_value(conf_val: str):

diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
index 52522ce3f..b8f4f6a29 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
@@ -28,6 +28,7 @@
 from spark_rapids_pytools.common.sys_storage import StorageDriver, FSUtil
 from spark_rapids_pytools.common.utilities import ToolLogging, SysCmd, Utils, TemplateGenerator
 
+
 class DeployMode(EnumeratedType):
     """List of tools deployment methods"""
     # The rapids job is running on local node

diff --git a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
index 2f971dcbe..50cd28d01 100644
--- a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -64,8 +64,8 @@ def qualification(self,
                Skipping this argument requires that the cluster argument points to a valid
                cluster name on the CSP.
         :param cluster: Name of cluster or path to cluster-properties.
-        :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
-               and "databricks-azure".
+        :param platform: defines one of the following "onprem", "emr", "dataproc", "dataproc-gke",
+               "databricks-aws", and "databricks-azure".
         :param target_platform: Cost savings and speedup recommendation for comparable cluster in
                target_platform based on on-premises cluster configuration.
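A note on the `node_pools` annotation change in `dataproc_gke.py` above: the subscripted built-in generic `list[GkeNodePool]` is only valid at runtime on Python 3.9+, while the package supports Python 3.8 (per the install section), which is one plausible reason the annotation is relaxed to plain `list`. Below is a minimal, hypothetical sketch of a `typing.List` alternative that also works on 3.8; the class bodies are stand-ins, not the project's real definitions:

```python
# Minimal, hypothetical sketch (not the project's actual classes) showing
# why the annotation matters: subscripting a built-in, e.g. list[GkeNodePool],
# raises "TypeError: 'type' object is not subscriptable" on Python 3.8.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class GkeNodePool:
    """Hypothetical stand-in for the real GkeNodePool class."""
    name: str


@dataclass
class DataprocGkeCluster:
    """Hypothetical stand-in for the real DataprocGkeCluster class."""
    # Plain `list` (as the patch uses) or typing.List[GkeNodePool] both work
    # on Python 3.8; the subscripted built-in list[GkeNodePool] would not.
    node_pools: Optional[List[GkeNodePool]] = field(default=None, init=False)
```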
diff --git a/user_tools/tests/mock_cluster.py b/user_tools/tests/mock_cluster.py
index 547de368b..65b74155f 100644
--- a/user_tools/tests/mock_cluster.py
+++ b/user_tools/tests/mock_cluster.py
@@ -34,7 +34,8 @@
     "workerConfig": {
         "accelerators": [{
             "acceleratorTypeUri": "https://www.googleapis.com/compute/beta/projects/project-id/zones/"\
-                                  "us-central1-a/acceleratorTypes/nvidia-tesla-t4"
+                                  "us-central1-a/acceleratorTypes/nvidia-tesla-t4",
+            "acceleratorCount": 1,
         }],
         "instanceNames": [
             "test-worker-0",
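With the added `acceleratorCount`, the mock worker config above carries both the GPU type and the number of GPUs per node. As a hedged illustration only, a sketch of how such cluster properties could be read; the helper function and the trimmed-down dict are hypothetical, not part of the test suite:

```python
# Illustrative only: parsing accelerator settings from a cluster-properties
# dict shaped like the mock above; helper and dict are hypothetical.
from typing import Tuple

mock_worker_config = {
    "accelerators": [{
        "acceleratorTypeUri": (
            "https://www.googleapis.com/compute/beta/projects/project-id/zones/"
            "us-central1-a/acceleratorTypes/nvidia-tesla-t4"
        ),
        "acceleratorCount": 1,
    }],
}


def worker_gpu_info(worker_config: dict) -> Tuple[str, int]:
    """Return (accelerator type, count) for the first worker accelerator."""
    acc = worker_config["accelerators"][0]
    # The accelerator type is the last segment of the type URI.
    gpu_type = acc["acceleratorTypeUri"].rsplit("/", 1)[-1]
    return gpu_type, acc.get("acceleratorCount", 1)


print(worker_gpu_info(mock_worker_config))  # ('nvidia-tesla-t4', 1)
```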