From fb6d20fa8d191571a81bea4a331315b4005c6df2 Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Mon, 9 Oct 2023 15:35:20 -0700 Subject: [PATCH 1/4] add unit test for onprem no eventlogs (#605) Signed-off-by: cindyyuanjiang --- .../spark_rapids_tools_ut/test_tool_argprocessor.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py index 04ce37aa3..f1c719421 100644 --- a/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py +++ b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py @@ -157,6 +157,18 @@ def test_cluster_props_no_eventlogs(self, get_ut_data_dir, tool_name, csp, prop_ assert tool_args['runtimePlatform'] == CspEnv(csp) self.validate_args_w_savings_enabled(tool_name, tool_args) + @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) + @register_triplet_test([ArgValueCase.IGNORE, ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED]) + def test_cluster_props_no_eventlogs_on_prem(self, capsys, tool_name): + # Missing eventlogs is not accepted for onPrem + with pytest.raises(SystemExit) as pytest_wrapped_e: + AbsToolUserArgModel.create_tool_args(tool_name, + platform='onprem') + assert pytest_wrapped_e.type == SystemExit + captured = capsys.readouterr() + # Verify there is no URL in error message + assert 'https://' not in captured.err + @pytest.mark.skip(reason='Unit tests are not completed yet') def test_arg_cases_coverage(self): args_keys = [ From 4dec452e2ba5102d2b4deb51b0abb5b1bc61e7c6 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 11 Oct 2023 11:02:06 -0700 Subject: [PATCH 2/4] Add support in core tools for running qualification on Dataproc GKE (#613) * Add support for Dataproc GKE in core tools Signed-off-by: Partho Sarthi * Update scores for Dataproc GKE Signed-off-by: Partho Sarthi * Update docs Signed-off-by: Partho Sarthi * Update tests with Dataproc GKE operator scores Signed-off-by: Partho Sarthi * Remove Spark-ML operators Signed-off-by: Partho Sarthi --------- Signed-off-by: Partho Sarthi --- core/docs/spark-qualification-tool.md | 5 +- .../operatorsScore-dataproc-gke-t4.csv | 256 ++++++++++++++++++ .../qualification/PluginTypeChecker.scala | 2 + .../qualification/QualificationArgs.scala | 5 +- .../PluginTypeCheckerSuite.scala | 7 + .../qualification/QualificationSuite.scala | 24 ++ 6 files changed, 294 insertions(+), 5 deletions(-) create mode 100644 core/src/main/resources/operatorsScore-dataproc-gke-t4.csv diff --git a/core/docs/spark-qualification-tool.md b/core/docs/spark-qualification-tool.md index 023f4318b..64ce149f0 100644 --- a/core/docs/spark-qualification-tool.md +++ b/core/docs/spark-qualification-tool.md @@ -29,6 +29,7 @@ applicable environments. Here are the cluster information for the ETL benchmark | Dataproc (T4) | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB | | Dataproc (L4) | 8x n1-standard-16 | 8x g2-standard-16 | | Dataproc Serverless (L4) | 8x 16 cores | 8x 16 cores + 8x L4 24GB | +| Dataproc GKE (T4) | 8x n1-standard-32 | 8x n1-standard-32 + 8x T4 16GB | | EMR (T4) | 8x m5d.8xlarge | 4x g4dn.12xlarge | | EMR (A10) | 8x m5d.8xlarge | 8x g5.8xlarge | | Databricks AWS | 8x m6gd.8xlage | 8x g5.8xlarge | @@ -248,8 +249,8 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* -p, --per-sql Report at the individual SQL query level. --platform Cluster platform where Spark CPU workloads were executed. 
Options include onprem, dataproc-t4, - dataproc-l4, dataproc-serverless-l4, emr-t4, - emr-a10, databricks-aws, and databricks-azure. + dataproc-l4, dataproc-serverless-l4, dataproc-gke-t4, + emr-t4, emr-a10, databricks-aws, and databricks-azure. Default is onprem. -r, --report-read-schema Whether to output the read formats and datatypes to the CSV file. This can be very diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv new file mode 100644 index 000000000..3083cbe8b --- /dev/null +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -0,0 +1,256 @@ +CPUOperator,Score +CoalesceExec,3.65 +CollectLimitExec,3.65 +ExpandExec,3.76 +FileSourceScanExec,2.84 +FilterExec,3.79 +GenerateExec,3.65 +GlobalLimitExec,3.65 +LocalLimitExec,3.65 +ProjectExec,3.65 +RangeExec,3.65 +SampleExec,3.65 +SortExec,3.65 +TakeOrderedAndProjectExec,3.65 +HashAggregateExec,4.1 +ObjectHashAggregateExec,4.1 +SortAggregateExec,4.1 +DataWritingCommandExec,3.65 +ExecutedCommandExec,3.65 +BatchScanExec,2.84 +ShuffleExchangeExec,3.69 +BroadcastHashJoinExec,3.72 +BroadcastNestedLoopJoinExec,1.66 +CartesianProductExec,3.65 +ShuffledHashJoinExec,3.65 +SortMergeJoinExec,5.64 +WindowExec,3.65 +Abs,3.65 +Acos,3.65 +Acosh,3.65 +Add,3.65 +AggregateExpression,3.65 +Alias,3.65 +And,3.65 +ApproximatePercentile,3.65 +ArrayContains,3.65 +ArrayExcept,3.65 +ArrayExists,3.65 +ArrayIntersect,3.65 +ArrayMax,3.65 +ArrayMin,3.65 +ArrayRemove,3.65 +ArrayRepeat,3.65 +ArrayTransform,3.65 +ArrayUnion,3.65 +ArraysOverlap,3.65 +ArraysZip,3.65 +Asin,3.65 +Asinh,3.65 +AtLeastNNonNulls,3.65 +Atan,3.65 +Atanh,3.65 +AttributeReference,3.65 +Average,3.65 +BRound,3.65 +BitLength,3.65 +BitwiseAnd,3.65 +BitwiseNot,3.65 +BitwiseOr,3.65 +BitwiseXor,3.65 +CaseWhen,3.65 +Cbrt,3.65 +Ceil,3.65 +CheckOverflow,3.65 +Coalesce,3.65 +CollectList,3.65 +CollectSet,3.65 +Concat,3.65 +ConcatWs,3.65 +Contains,3.65 +Conv,3.65 +Cos,3.65 +Cosh,3.65 +Cot,3.65 +Count,3.65 +CreateArray,3.65 +CreateMap,3.65 +CreateNamedStruct,3.65 +CurrentRow$,3.65 +DateAdd,3.65 +DateAddInterval,3.65 +DateDiff,3.65 +DateFormatClass,3.65 +DateSub,3.65 +DayOfMonth,3.65 +DayOfWeek,3.65 +DayOfYear,3.65 +DenseRank,3.65 +Divide,3.65 +DynamicPruningExpression,3.65 +ElementAt,3.65 +EndsWith,3.65 +EqualNullSafe,3.65 +EqualTo,3.65 +Exp,3.65 +Explode,3.65 +Expm1,3.65 +First,3.65 +Flatten,3.65 +Floor,3.65 +FromUTCTimestamp,3.65 +FromUnixTime,3.65 +GetArrayItem,3.65 +GetArrayStructFields,3.65 +GetJsonObject,3.65 +GetMapValue,3.65 +GetStructField,3.65 +GetTimestamp,3.65 +GreaterThan,3.65 +GreaterThanOrEqual,3.65 +Greatest,3.65 +HiveGenericUDF,3.65 +HiveSimpleUDF,3.65 +Hour,3.65 +Hypot,3.65 +If,3.65 +In,3.65 +InSet,3.65 +InitCap,3.65 +InputFileBlockLength,3.65 +InputFileBlockStart,3.65 +InputFileName,3.65 +IntegralDivide,3.65 +IsNaN,3.65 +IsNotNull,3.65 +IsNull,3.65 +JsonToStructs,3.65 +JsonTuple,3.65 +KnownFloatingPointNormalized,3.65 +KnownNotNull,3.65 +Lag,3.65 +LambdaFunction,3.65 +Last,3.65 +LastDay,3.65 +Lead,3.65 +Least,3.65 +Length,3.65 +LessThan,3.65 +LessThanOrEqual,3.65 +Like,3.65 +Literal,3.65 +Log,3.65 +Log10,3.65 +Log1p,3.65 +Log2,3.65 +Logarithm,3.65 +Lower,3.65 +MakeDecimal,3.65 +MapConcat,3.65 +MapEntries,3.65 +MapFilter,3.65 +MapKeys,3.65 +MapValues,3.65 +Max,3.65 +Md5,3.65 +MicrosToTimestamp,3.65 +MillisToTimestamp,3.65 +Min,3.65 +Minute,3.65 +MonotonicallyIncreasingID,3.65 +Month,3.65 +Multiply,3.65 +Murmur3Hash,3.65 +NaNvl,3.65 +NamedLambdaVariable,3.65 +NormalizeNaNAndZero,3.65 +Not,3.65 
+NthValue,3.65 +OctetLength,3.65 +Or,3.65 +PercentRank,3.65 +PivotFirst,3.65 +Pmod,3.65 +PosExplode,3.65 +Pow,3.65 +PreciseTimestampConversion,3.65 +PromotePrecision,3.65 +PythonUDF,3.65 +Quarter,3.65 +RLike,3.65 +RaiseError,3.65 +Rand,3.65 +Rank,3.65 +RegExpExtract,3.65 +RegExpExtractAll,3.65 +RegExpReplace,3.65 +Remainder,3.65 +ReplicateRows,3.65 +Reverse,3.65 +Rint,3.65 +Round,3.65 +RowNumber,3.65 +ScalaUDF,3.65 +ScalarSubquery,3.65 +Second,3.65 +SecondsToTimestamp,3.65 +Sequence,3.65 +ShiftLeft,3.65 +ShiftRight,3.65 +ShiftRightUnsigned,3.65 +Signum,3.65 +Sin,3.65 +Sinh,3.65 +Size,3.65 +SortArray,3.65 +SortOrder,3.65 +SparkPartitionID,3.65 +SpecifiedWindowFrame,3.65 +Sqrt,3.65 +StartsWith,3.65 +StddevPop,3.65 +StddevSamp,3.65 +StringInstr,3.65 +StringLPad,3.65 +StringLocate,3.65 +StringRPad,3.65 +StringRepeat,3.65 +StringReplace,3.65 +StringSplit,3.65 +StringToMap,3.65 +StringTranslate,3.65 +StringTrim,3.65 +StringTrimLeft,3.65 +StringTrimRight,3.65 +Substring,3.65 +SubstringIndex,3.65 +Subtract,3.65 +Sum,3.65 +Tan,3.65 +Tanh,3.65 +TimeAdd,3.65 +ToDegrees,3.65 +ToRadians,3.65 +ToUnixTimestamp,3.65 +TransformKeys,3.65 +TransformValues,3.65 +UnaryMinus,3.65 +UnaryPositive,3.65 +UnboundedFollowing$,3.65 +UnboundedPreceding$,3.65 +UnixTimestamp,3.65 +UnscaledValue,3.65 +Upper,3.65 +VariancePop,3.65 +VarianceSamp,3.65 +WeekDay,3.65 +WindowExpression,3.65 +WindowSpecDefinition,3.65 +XxHash64,3.65 +Year,3.65 +AggregateInPandasExec,1.2 +ArrowEvalPythonExec,1.2 +FlatMapGroupsInPandasExec,1.2 +FlatMapCoGroupsInPandasExec,1.2 +MapInPandasExec,1.2 +WindowInPandasExec,1.2 diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala index bd5641363..d14336618 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala @@ -48,6 +48,7 @@ class PluginTypeChecker(platform: String = "onprem", private val OPERATORS_SCORE_FILE_DATAPROC_T4 = "operatorsScore-dataproc-t4.csv" private val OPERATORS_SCORE_FILE_DATAPROC_L4 = "operatorsScore-dataproc-l4.csv" private val OPERATORS_SCORE_FILE_DATAPROC_SL_L4 = "operatorsScore-dataproc-serverless-l4.csv" + private val OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 = "operatorsScore-dataproc-gke-t4.csv" private val OPERATORS_SCORE_FILE_EMR_T4 = "operatorsScore-emr-t4.csv" private val OPERATORS_SCORE_FILE_EMR_A10 = "operatorsScore-emr-a10.csv" private val OPERATORS_SCORE_FILE_DATABRICKS_AWS = "operatorsScore-databricks-aws.csv" @@ -104,6 +105,7 @@ class PluginTypeChecker(platform: String = "onprem", case "dataproc-t4" | "dataproc" => OPERATORS_SCORE_FILE_DATAPROC_T4 case "dataproc-l4" => OPERATORS_SCORE_FILE_DATAPROC_L4 case "dataproc-serverless-l4" => OPERATORS_SCORE_FILE_DATAPROC_SL_L4 + case "dataproc-gke-t4" => OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 // if no GPU specified, then default to emr-t4 for backward compatibility case "emr-t4" | "emr" => OPERATORS_SCORE_FILE_EMR_T4 case "emr-a10" => OPERATORS_SCORE_FILE_EMR_A10 diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala index 075c3512f..76a2fba8e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala 
@@ -148,9 +148,8 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* val platform: ScallopOption[String] = opt[String](required = false, descr = "Cluster platform where Spark CPU workloads were executed. Options include " + - "onprem, dataproc-t4, dataproc-l4, dataproc-serverless-l4, emr-t4, emr-a10, " + - "databricks-aws, and databricks-azure. " + - "Default is onprem.", + "onprem, dataproc-t4, dataproc-l4, dataproc-serverless-l4, dataproc-gke-t4, emr-t4, " + + "emr-a10, databricks-aws, and databricks-azure. Default is onprem.", default = Some("onprem")) val speedupFactorFile: ScallopOption[String] = opt[String](required = false, diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala index e7720fd28..3ba8badc9 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala @@ -188,12 +188,19 @@ class PluginTypeCheckerSuite extends FunSuite with Logging { assert(checker.getSpeedupFactor("WindowExec") == 4.25) assert(checker.getSpeedupFactor("Ceil") == 4.25) } + test("supported operator score from dataproc-l4") { val checker = new PluginTypeChecker("dataproc-l4") assert(checker.getSpeedupFactor("UnionExec") == 4.16) assert(checker.getSpeedupFactor("Ceil") == 4.16) } + test("supported operator score from dataproc-gke-t4") { + val checker = new PluginTypeChecker("dataproc-gke-t4") + assert(checker.getSpeedupFactor("WindowExec") == 3.65) + assert(checker.getSpeedupFactor("Ceil") == 3.65) + } + test("supported operator score from emr-a10") { val checker = new PluginTypeChecker("emr-a10") assert(checker.getSpeedupFactor("UnionExec") == 2.59) diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala index 502ddffc1..982a65ec8 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala @@ -1529,6 +1529,30 @@ class QualificationSuite extends BaseTestSuite { assert(outputActual.collect().size == 1) } + // run the qualification tool for dataproc-gke-t4 + TrampolineUtil.withTempDir { outpath => + val appArgs = new QualificationArgs(Array( + "--output-directory", + outpath.getAbsolutePath, + "--platform", + "dataproc-gke-t4", + eventLog)) + + val (exit, _) = + QualificationMain.mainInternal(appArgs) + assert(exit == 0) + + // the code above that runs the Spark query stops the Sparksession + // so create a new one to read in the csv file + createSparkSession() + + // validate that the SQL description in the csv file escapes commas properly + val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + + s"rapids_4_spark_qualification_output.csv" + val outputActual = readExpectedFile(new File(outputResults)) + assert(outputActual.collect().size == 1) + } + // run the qualification tool for databricks-aws TrampolineUtil.withTempDir { outpath => val appArgs = new QualificationArgs(Array( From 73ca0f0be0e3a90ab6da39b3e41a061516b50094 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 13:53:07 -0500 Subject: [PATCH 3/4] Bump urllib3 from 1.26.14 to 1.26.17 in 
/data_validation (#606) Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.14 to 1.26.17. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.14...1.26.17) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- data_validation/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_validation/pyproject.toml b/data_validation/pyproject.toml index 3cbdef404..21ef66ab3 100644 --- a/data_validation/pyproject.toml +++ b/data_validation/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "packaging==23.0", "certifi==2023.7.22", "idna==3.4", - "urllib3==1.26.14", + "urllib3==1.26.17", "beautifulsoup4==4.11.2" ] dynamic=["entry-points", "version"] From eb614494117eb5652b0b4d816b32cde25fcf9caf Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Fri, 13 Oct 2023 10:59:37 -0700 Subject: [PATCH 4/4] [BUG] Update user tools to use latest Databricks CLI version 0.200+ (#614) * updated implementation and docs for databricks platforms for change in new version of databricks cli Signed-off-by: cindyyuanjiang * use ID consistently in comments Signed-off-by: cindyyuanjiang --------- Signed-off-by: cindyyuanjiang --- user_tools/docs/user-tools-databricks-aws.md | 69 +++++++++---------- .../docs/user-tools-databricks-azure.md | 63 +++++++++-------- .../cloud_api/databricks_aws.py | 24 ++++--- .../cloud_api/databricks_azure.py | 20 ++++-- .../src/spark_rapids_pytools/cloud_api/emr.py | 2 +- .../wrappers/databricks_aws_wrapper.py | 12 ++-- .../wrappers/databricks_azure_wrapper.py | 12 ++-- .../src/spark_rapids_tools/cmdli/tools_cli.py | 9 +-- 8 files changed, 115 insertions(+), 96 deletions(-) diff --git a/user_tools/docs/user-tools-databricks-aws.md b/user_tools/docs/user-tools-databricks-aws.md index 86644bf06..766080460 100644 --- a/user_tools/docs/user-tools-databricks-aws.md +++ b/user_tools/docs/user-tools-databricks-aws.md @@ -10,10 +10,9 @@ The tool currently only supports event logs stored on S3 (no DBFS paths). The re ### 1.Databricks CLI -- Install the Databricks CLI. Follow the instructions on [Install the CLI](https://docs.databricks.com/dev-tools/cli/index.html#install-the-cli). +- Install the Databricks CLI version 0.200+. Follow the instructions on [Install the CLI](https://docs.databricks.com/en/dev-tools/cli/install.html). - Set the configuration settings and credentials of the Databricks CLI: - - Set up authentication by following these [instructions](https://docs.databricks.com/dev-tools/cli/index.html#set-up-authentication) - - Test the authentication setup by following these [instructions](https://docs.databricks.com/dev-tools/cli/index.html#test-your-authentication-setup) + - Set up authentication by following these [instructions](https://docs.databricks.com/en/dev-tools/cli/authentication.html) - Verify that the access credentials are stored in the file `~/.databrickscfg` on Unix, Linux, or macOS, or in another file defined by environment variable `DATABRICKS_CONFIG_FILE`. 
### 2.AWS CLI @@ -44,7 +43,7 @@ Before running any command, you can set environment variables to specify configu - RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`: - `RAPIDS_USER_TOOLS_CACHE_FOLDER`: specifies the location of a local directory that the RAPIDS-cli uses to store and cache the downloaded resources. The default is `/var/tmp/spark_rapids_user_tools_cache`. Note that caching the resources locally has an impact on the total execution time of the command. - `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY`: specifies the location of a local directory that the RAPIDS-cli uses to generate the output. The wrapper CLI arguments override that environment variable (`--local_folder` for Qualification). -- For Databricks CLI, some environment variables can be set and picked by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/dev-tools/auth.html#environment-variables). +- For Databricks CLI, some environment variables can be set and picked by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/en/dev-tools/auth.html#environment-variables-and-fields-for-client-unified-authentication). - For AWS CLI, some environment variables can be set and picked by the RAPIDS-user tools such as: `AWS_SHARED_CREDENTIALS_FILE`, `AWS_CONFIG_FILE`, `AWS_REGION`, `AWS_DEFAULT_REGION`, `AWS_PROFILE` and `AWS_DEFAULT_OUTPUT`. See the full list of variables in [aws-cli-configure-envvars](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html). ## Qualification command @@ -64,25 +63,25 @@ The local deployment runs on the local development machine. It requires: #### Command options -| Option | Description | Default | Required | -|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:| -| **cpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster name, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get --cluster-name` | N/A | N | -| **eventlogs** | A comma seperated list of S3 urls pointing to event logs or S3 directory | Reads the Spark's property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included in the output of `databricks clusters get --cluster-name`. Note that the wrapper will raise an exception if the property is not set. | N | -| **remote_folder** | The S3 folder where the output of the wrapper's output is copied. If missing, the output will be available only on local disk | N/A | N | -| **gpu_cluster** | The Databricks-cluster on which the Spark applications is planned to be migrated. 
The argument can be an Databricks-cluster or a valid path to the cluster's properties file (json format) generated by the Databricks CLI `databricks clusters get --cluster-name` command | The wrapper maps the AWS EC2 machine instances of the original cluster into AWS EC2 instances that support GPU acceleration. | N | -| **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `qual-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N | -| **jvm_heap_size** | The maximum heap size of the JVM in gigabytes | 24 | N | -| **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N | -| **aws_profile** | A named AWS profile that you can specify to get the settings/credentials of the AWS account | "default" if the env-variable `AWS_PROFILE` is not set | N | -| **tools_jar** | Path to a bundled jar including RAPIDS tool. The path is a local filesystem, or remote S3 url | Downloads the latest `rapids-4-spark-tools_*.jar` from mvn repo | N | -| **filter_apps** | Filtering criteria of the applications listed in the final STDOUT table is one of the following (`ALL`, `SPEEDUPS`, `SAVINGS`). "`ALL`" means no filter applied. "`SPEEDUPS`" lists all the apps that are either '_Recommended_', or '_Strongly Recommended_' based on speedups. "`SAVINGS`" lists all the apps that have positive estimated GPU savings except for the apps that are '_Not Applicable_'. | `SAVINGS` | N | -| **gpu_cluster_recommendation** | The type of GPU cluster recommendation to generate. It accepts one of the following (`CLUSTER`, `JOB`, `MATCH`). `MATCH`: keep GPU cluster same number of nodes as CPU cluster; `CLUSTER`: recommend optimal GPU cluster by cost for entire cluster. `JOB`: recommend optimal GPU cluster by cost per job | `MATCH` | N | -| **credentials_file** | The local path of JSON file that contains the application credentials | If missing, loads the env variable `DATABRICKS_CONFIG_FILE` if any. Otherwise, it uses the default path `~/.databrickscfg` on Unix, Linux, or macOS | N | -| **cpu_discount** | A percent discount for the cpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | -| **gpu_discount** | A percent discount for the gpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | -| **global_discount** | A percent discount for both the cpu and gpu cluster costs in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | -| **verbose** | True or False to enable verbosity to the wrapper script | False if `RAPIDS_USER_TOOLS_LOG_DEBUG` is not set | N | -| **rapids_options**** | A list of valid [Qualification tool options](../../core/docs/spark-qualification-tool.md#qualification-tool-options). Note that (`output-directory`, `platform`) flags are ignored, and that multiple "spark-property" is not supported. 
| N/A | N | +| Option | Description | Default | Required | +|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:| +| **cpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster id, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get CLUSTER_ID [flags]` | N/A | N | +| **eventlogs** | A comma seperated list of S3 urls pointing to event logs or S3 directory | Reads the Spark's property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included in the output of `databricks clusters get CLUSTER_ID [flags]`. Note that the wrapper will raise an exception if the property is not set. | N | +| **remote_folder** | The S3 folder where the output of the wrapper's output is copied. If missing, the output will be available only on local disk | N/A | N | +| **gpu_cluster** | The Databricks-cluster on which the Spark applications is planned to be migrated. The argument can be an Databricks-cluster id or a valid path to the cluster's properties file (json format) generated by the Databricks CLI `databricks clusters get CLUSTER_ID [flags]` command | The wrapper maps the AWS EC2 machine instances of the original cluster into AWS EC2 instances that support GPU acceleration. | N | +| **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `qual-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N | +| **jvm_heap_size** | The maximum heap size of the JVM in gigabytes | 24 | N | +| **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N | +| **aws_profile** | A named AWS profile that you can specify to get the settings/credentials of the AWS account | "default" if the env-variable `AWS_PROFILE` is not set | N | +| **tools_jar** | Path to a bundled jar including RAPIDS tool. The path is a local filesystem, or remote S3 url | Downloads the latest `rapids-4-spark-tools_*.jar` from mvn repo | N | +| **filter_apps** | Filtering criteria of the applications listed in the final STDOUT table is one of the following (`ALL`, `SPEEDUPS`, `SAVINGS`). "`ALL`" means no filter applied. "`SPEEDUPS`" lists all the apps that are either '_Recommended_', or '_Strongly Recommended_' based on speedups. "`SAVINGS`" lists all the apps that have positive estimated GPU savings except for the apps that are '_Not Applicable_'. | `SAVINGS` | N | +| **gpu_cluster_recommendation** | The type of GPU cluster recommendation to generate. 
It accepts one of the following (`CLUSTER`, `JOB`, `MATCH`). `MATCH`: keep GPU cluster same number of nodes as CPU cluster; `CLUSTER`: recommend optimal GPU cluster by cost for entire cluster. `JOB`: recommend optimal GPU cluster by cost per job | `MATCH` | N | +| **credentials_file** | The local path of JSON file that contains the application credentials | If missing, loads the env variable `DATABRICKS_CONFIG_FILE` if any. Otherwise, it uses the default path `~/.databrickscfg` on Unix, Linux, or macOS | N | +| **cpu_discount** | A percent discount for the cpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | +| **gpu_discount** | A percent discount for the gpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | +| **global_discount** | A percent discount for both the cpu and gpu cluster costs in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | +| **verbose** | True or False to enable verbosity to the wrapper script | False if `RAPIDS_USER_TOOLS_LOG_DEBUG` is not set | N | +| **rapids_options**** | A list of valid [Qualification tool options](../../core/docs/spark-qualification-tool.md#qualification-tool-options). Note that (`output-directory`, `platform`) flags are ignored, and that multiple "spark-property" is not supported. | N/A | N | #### Use case scenario @@ -98,19 +97,19 @@ A typical workflow to successfully run the `qualification` command in local mode commands can access the S3 resources `LOGS_BUCKET`. 4. installs `spark_rapids_user_tools` 3. If the results of the wrapper need to be stored on S3, then another S3 uri is required `REMOTE_FOLDER=s3://OUT_BUCKET/` -4. User defines the Databricks-cluster on which the Spark application were running. Note that the cluster does not have to be active; but it has to be visible by the Databricks CLI (i.e., can run `databricks clusters get --cluster-name`). +4. User defines the Databricks-cluster on which the Spark application were running. Note that the cluster does not have to be active; but it has to be visible by the Databricks CLI (i.e., can run `databricks clusters get CLUSTER_ID [flags]`). 5. The following script runs qualification by passing an S3 remote directory to store the output: ``` # define the wrapper cache directory if necessary export RAPIDS_USER_TOOLS_CACHE_FOLDER=my_cache_folder export EVENTLOGS=s3://LOGS_BUCKET/eventlogs/ - export CLUSTER_NAME=my-databricks-cpu-cluster + export CLUSTER_ID=my-databricks-cpu-cluster-id export REMOTE_FOLDER=s3://OUT_BUCKET/wrapper_output spark_rapids_user_tools databricks-aws qualification \ --eventlogs $EVENTLOGS \ - --cpu_cluster $CLUSTER_NAME \ + --cpu_cluster $CLUSTER_ID \ --remote_folder $REMOTE_FOLDER ``` The wrapper generates a unique-Id for each execution in the format of `qual__<0x%08X>` @@ -183,9 +182,9 @@ The local deployment runs on the local development machine. 
It requires: | Option | Description | Default | Required | |----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| **gpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster name, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get --cluster-name` | If missing, then the argument `worker_info` has to be provided. | N | +| **gpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster id, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get CLUSTER_ID [flags]` | If missing, then the argument `worker_info` has to be provided. | N | | **worker_info** | A path pointing to a yaml file containing the system information of a worker node. It is assumed that all workers are homogenous. The format of the file is described in the following section. | None | N | -| **eventlogs** | A comma seperated list of S3 urls pointing to event logs or S3 directory | Reads the Spark's property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included in the output of `databricks clusters get --cluster-name`. Note that the wrapper will raise an exception if the property is not set. | N | +| **eventlogs** | A comma seperated list of S3 urls pointing to event logs or S3 directory | Reads the Spark's property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included in the output of `databricks clusters get CLUSTER_ID [flags]`. Note that the wrapper will raise an exception if the property is not set. | N | | **remote_folder** | The S3 folder where the output of the wrapper's output is copied. If missing, the output will be available only on local disk | N/A | N | | **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `prof-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N | | **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N | @@ -261,26 +260,26 @@ argument `--remote_folder` was a valid S3 path. A cluster property is still accessible if one of the following conditions applies: -1. The cluster is listed by the `databricks clusters get --cluster-name $CLUSTER_NAME` cmd. In this case, the CLI will be triggered by providing - `--gpu_cluster $CLUSTER_NAME` +1. The cluster is listed by the `databricks clusters get CLUSTER_ID [flags]` cmd. 
In this case, the CLI will be triggered by providing + `--gpu_cluster $CLUSTER_ID` ``` # run the command using the GPU cluster name export RAPIDS_USER_TOOLS_CACHE_FOLDER=my_cache_folder export EVENTLOGS=s3://LOGS_BUCKET/eventlogs/ - export CLUSTER_NAME=my-databricks-gpu-cluster + export CLUSTER_ID=my-databricks-gpu-cluster-id export REMOTE_FOLDER=s3://OUT_BUCKET/wrapper_output spark_rapids_user_tools databricks-aws profiling \ --eventlogs $EVENTLOGS \ - --gpu_cluster $CLUSTER_NAME \ + --gpu_cluster $CLUSTER_ID \ --remote_folder $REMOTE_FOLDER ``` 2. The cluster properties file is accessible on local disk or a valid S3 path. ``` $> export CLUSTER_PROPS_FILE=cluster-props.json - $> databricks clusters get --cluster-name $CLUSTER_NAME > $CLUSTER_PROPS_FILE + $> databricks clusters get $CLUSTER_ID > $CLUSTER_PROPS_FILE ``` Trigger the CLI by providing the path to the properties file `--gpu_cluster $CLUSTER_PROPS_FILE` @@ -326,7 +325,7 @@ user must have SSH access. | Option | Description | Default | Required | |-------------------|-----------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|:--------:| -| **cluster** | Name of the Databricks cluster running an accelerated computing instance | N/A | Y | +| **cluster** | Id of the Databricks cluster running an accelerated computing instance | N/A | Y | | **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N | | **aws_profile** | A named AWS profile that you can specify to get the settings/credentials of the AWS account | "default" if the the env-variable `AWS_PROFILE` is not set | N | | **output_folder** | Path to local directory where the final recommendations is logged | env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N | @@ -348,7 +347,7 @@ The steps to run the command: ```bash spark_rapids_user_tools databricks-aws diagnostic \ - --cluster my-cluster-name + --cluster my-cluster-id ``` If the connection to Databricks instances cannot be established through SSH, the command will raise error. diff --git a/user_tools/docs/user-tools-databricks-azure.md b/user_tools/docs/user-tools-databricks-azure.md index cf504d609..435a861b2 100644 --- a/user_tools/docs/user-tools-databricks-azure.md +++ b/user_tools/docs/user-tools-databricks-azure.md @@ -10,10 +10,9 @@ The tool currently only supports event logs stored on ABFS ([Azure Blob File Sys ### 1.Databricks CLI -- Install the Databricks CLI. Follow the instructions on [Install the CLI](https://docs.databricks.com/dev-tools/cli/index.html#install-the-cli). +- Install the Databricks CLI version 0.200+. Follow the instructions on [Install the CLI](https://docs.databricks.com/en/dev-tools/cli/install.html). - Set the configuration settings and credentials of the Databricks CLI: - - Set up authentication using a Databricks personal access token by following these [instructions](https://docs.databricks.com/dev-tools/cli/index.html#set-up-authentication-using-a-databricks-personal-access-token). - - Test the authentication setup by following these [instructions](https://docs.databricks.com/dev-tools/cli/index.html#test-your-authentication-setup). 
+ - Set up authentication by following these [instructions](https://docs.databricks.com/en/dev-tools/cli/authentication.html) - Verify that the access credentials are stored in the file `~/.databrickscfg` on Unix, Linux, or macOS, or in another file defined by environment variable `DATABRICKS_CONFIG_FILE`. ### 2.Azure CLI @@ -48,7 +47,7 @@ Before running any command, you can set environment variables to specify configu - RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`: - `RAPIDS_USER_TOOLS_CACHE_FOLDER`: specifies the location of a local directory that the RAPIDS-cli uses to store and cache the downloaded resources. The default is `/var/tmp/spark_rapids_user_tools_cache`. Note that caching the resources locally has an impact on the total execution time of the command. - `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY`: specifies the location of a local directory that the RAPIDS-cli uses to generate the output. The wrapper CLI arguments override that environment variable (`--local_folder` for Qualification). -- For Databricks CLI, some environment variables can be set and picked up by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/dev-tools/auth.html#environment-variables). +- For Databricks CLI, some environment variables can be set and picked up by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/en/dev-tools/auth.html#environment-variables-and-fields-for-client-unified-authentication). - For Azure CLI, some environment variables can be set and picked up by the RAPIDS-user tools such as: `AZURE_CONFIG_FILE` and `AZURE_DEFAULTS_LOCATION`. ## Qualification command @@ -68,23 +67,23 @@ The local deployment runs on the local development machine. It requires: #### Command options -| Option | Description | Default | Required | -|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:| -| **cpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster name, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get --cluster-name` | N/A | N | -| **eventlogs** | A comma seperated list of ABFS urls pointing to event logs or ABFS directory, or local event log filenames or directory | Reads the Spark's property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included in the output of `databricks clusters get --cluster-name`. Note that the wrapper will raise an exception if the property is not set. | N | -| **remote_folder** | The ABFS folder where the output of the wrapper's output is copied. 
If missing, the output will be available only on local disk | N/A | N | -| **gpu_cluster** | The Databricks-cluster on which the Spark applications is planned to be migrated. The argument can be an Databricks-cluster or a valid path to the cluster's properties file (json format) generated by the Databricks CLI `databricks clusters get --cluster-name` command | The wrapper maps the Azure machine instances of the original cluster into Azure instances that support GPU acceleration. | N | -| **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `qual-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N | -| **jvm_heap_size** | The maximum heap size of the JVM in gigabytes | 24 | N | -| **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N | -| **tools_jar** | Path to a bundled jar including RAPIDS tool. The path is a local filesystem, or remote ABFS url | Downloads the latest `rapids-4-spark-tools_*.jar` from mvn repo | N | -| **filter_apps** | Filtering criteria of the applications listed in the final STDOUT table is one of the following (`ALL`, `SPEEDUPS`, `SAVINGS`). "`ALL`" means no filter applied. "`SPEEDUPS`" lists all the apps that are either '_Recommended_', or '_Strongly Recommended_' based on speedups. "`SAVINGS`" lists all the apps that have positive estimated GPU savings except for the apps that are '_Not Applicable_'. | `SAVINGS` | N | -| **gpu_cluster_recommendation** | The type of GPU cluster recommendation to generate. It accepts one of the following (`CLUSTER`, `JOB`, `MATCH`). `MATCH`: keep GPU cluster same number of nodes as CPU cluster; `CLUSTER`: recommend optimal GPU cluster by cost for entire cluster. `JOB`: recommend optimal GPU cluster by cost per job | `MATCH` | N | -| **cpu_discount** | A percent discount for the cpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | -| **gpu_discount** | A percent discount for the gpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | -| **global_discount** | A percent discount for both the cpu and gpu cluster costs in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | -| **verbose** | True or False to enable verbosity to the wrapper script | False if `RAPIDS_USER_TOOLS_LOG_DEBUG` is not set | N | -| **rapids_options**** | A list of valid [Qualification tool options](../../core/docs/spark-qualification-tool.md#qualification-tool-options). Note that (`output-directory`, `platform`) flags are ignored, and that multiple "spark-property" is not supported. 
| N/A | N | +| Option | Description | Default | Required | +|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:| +| **cpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster id, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get CLUSTER_ID [flags]` | N/A | N | +| **eventlogs** | A comma seperated list of ABFS urls pointing to event logs or ABFS directory, or local event log filenames or directory | Reads the Spark's property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included in the output of `databricks clusters get CLUSTER_ID [flags]`. Note that the wrapper will raise an exception if the property is not set. | N | +| **remote_folder** | The ABFS folder where the output of the wrapper's output is copied. If missing, the output will be available only on local disk | N/A | N | +| **gpu_cluster** | The Databricks-cluster on which the Spark applications is planned to be migrated. The argument can be an Databricks-cluster id or a valid path to the cluster's properties file (json format) generated by the Databricks CLI `databricks clusters get CLUSTER_ID [flags]` command | The wrapper maps the Azure machine instances of the original cluster into Azure instances that support GPU acceleration. | N | +| **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `qual-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N | +| **jvm_heap_size** | The maximum heap size of the JVM in gigabytes | 24 | N | +| **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N | +| **tools_jar** | Path to a bundled jar including RAPIDS tool. The path is a local filesystem, or remote ABFS url | Downloads the latest `rapids-4-spark-tools_*.jar` from mvn repo | N | +| **filter_apps** | Filtering criteria of the applications listed in the final STDOUT table is one of the following (`ALL`, `SPEEDUPS`, `SAVINGS`). "`ALL`" means no filter applied. "`SPEEDUPS`" lists all the apps that are either '_Recommended_', or '_Strongly Recommended_' based on speedups. "`SAVINGS`" lists all the apps that have positive estimated GPU savings except for the apps that are '_Not Applicable_'. | `SAVINGS` | N | +| **gpu_cluster_recommendation** | The type of GPU cluster recommendation to generate. It accepts one of the following (`CLUSTER`, `JOB`, `MATCH`). 
`MATCH`: keep GPU cluster same number of nodes as CPU cluster; `CLUSTER`: recommend optimal GPU cluster by cost for entire cluster. `JOB`: recommend optimal GPU cluster by cost per job | `MATCH` | N | +| **cpu_discount** | A percent discount for the cpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | +| **gpu_discount** | A percent discount for the gpu cluster cost in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | +| **global_discount** | A percent discount for both the cpu and gpu cluster costs in the form of an integer value (e.g. 30 for 30% discount) | N/A | N | +| **verbose** | True or False to enable verbosity to the wrapper script | False if `RAPIDS_USER_TOOLS_LOG_DEBUG` is not set | N | +| **rapids_options**** | A list of valid [Qualification tool options](../../core/docs/spark-qualification-tool.md#qualification-tool-options). Note that (`output-directory`, `platform`) flags are ignored, and that multiple "spark-property" is not supported. | N/A | N | #### Use case scenario @@ -107,12 +106,12 @@ A typical workflow to successfully run the `qualification` command in local mode # define the wrapper cache directory if necessary export RAPIDS_USER_TOOLS_CACHE_FOLDER=my_cache_folder export EVENTLOGS=abfss://LOGS_CONTAINER/eventlogs/ - export CLUSTER_NAME=my-databricks-cpu-cluster + export CLUSTER_ID=my-databricks-cpu-cluster-id export REMOTE_FOLDER=abfss://OUT_BUCKET/wrapper_output spark_rapids_user_tools databricks-azure qualification \ --eventlogs $EVENTLOGS \ - --cpu_cluster $CLUSTER_NAME \ + --cpu_cluster $CLUSTER_ID \ --remote_folder $REMOTE_FOLDER ``` The wrapper generates a unique-Id for each execution in the format of `qual__<0x%08X>` @@ -185,9 +184,9 @@ The local deployment runs on the local development machine. It requires: | Option | Description | Default | Required | |----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| **gpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster name, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get --cluster-name` | If missing, then the argument `worker_info` has to be provided. | N | +| **gpu_cluster** | The Databricks-cluster on which the Apache Spark applications were executed. Accepted values are an Databricks-cluster id, or a valid path to the cluster properties file (json format) generated by Databricks CLI command `databricks clusters get CLUSTER_ID [flags]` | If missing, then the argument `worker_info` has to be provided. | N | | **worker_info** | A path pointing to a yaml file containing the system information of a worker node. It is assumed that all workers are homogenous. The format of the file is described in the following section. | None | N | -| **eventlogs** | A comma seperated list of ABFS urls pointing to event logs or ABFS directory | Reads the Spark's property `spark.eventLog.dir` defined in `gpu_cluster`. 
This property should be included in the output of `databricks clusters get --cluster-name`. Note that the wrapper will raise an exception if the property is not set. | N |
+| **eventlogs** | A comma-separated list of ABFS URLs pointing to event logs or an ABFS directory | Reads the Spark's property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included in the output of `databricks clusters get CLUSTER_ID [flags]`. Note that the wrapper will raise an exception if the property is not set. | N |
 | **remote_folder** | The ABFS folder where the output of the wrapper's output is copied. If missing, the output will be available only on local disk | N/A | N |
 | **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `prof-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N |
 | **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N |
@@ -262,26 +261,26 @@ argument `--remote_folder` was a valid ABFS path.
 A cluster property is still accessible if one of the following conditions applies:
-1. The cluster is listed by the `databricks clusters get --cluster-name $CLUSTER_NAME` cmd. In this case, the CLI will be triggered by providing
-   `--gpu_cluster $CLUSTER_NAME`
+1. The cluster is listed by the `databricks clusters get CLUSTER_ID [flags]` cmd. In this case, the CLI will be triggered by providing
+   `--gpu_cluster $CLUSTER_ID`
    ```
    # run the command using the GPU cluster name
    export RAPIDS_USER_TOOLS_CACHE_FOLDER=my_cache_folder
    export EVENTLOGS=abfss://LOGS_CONTAINER/eventlogs/
-   export CLUSTER_NAME=my-databricks-gpu-cluster
+   export CLUSTER_ID=my-databricks-gpu-cluster-id
    export REMOTE_FOLDER=abfss://OUT_BUCKET/wrapper_output
    spark_rapids_user_tools databricks-azure profiling \
    --eventlogs $EVENTLOGS \
-   --gpu_cluster $CLUSTER_NAME \
+   --gpu_cluster $CLUSTER_ID \
    --remote_folder $REMOTE_FOLDER
    ```
 2. The cluster properties file is accessible on local disk or a valid ABFS path.
    ```
    $> export CLUSTER_PROPS_FILE=cluster-props.json
-   $> databricks clusters get --cluster-name $CLUSTER_NAME > $CLUSTER_PROPS_FILE
+   $> databricks clusters get $CLUSTER_ID > $CLUSTER_PROPS_FILE
    ```
    Trigger the CLI by providing the path to the properties file `--gpu_cluster $CLUSTER_PROPS_FILE`
@@ -328,7 +327,7 @@ user must have SSH access.
 | Option | Description | Default | Required |
 |-------------------|-----------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|:--------:|
-| **cluster** | Name of the Databricks cluster running an accelerated computing instance | N/A | Y |
+| **cluster** | ID of the Databricks cluster running an accelerated computing instance | N/A | Y |
 | **profile** | A named Databricks profile that you can specify to get the settings/credentials of the Databricks account | "DEFAULT" | N |
 | **output_folder** | Path to local directory where the final recommendations is logged | env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N |
 | **port** | Port number to be used for the ssh connections | 2200 | N |
@@ -349,7 +348,7 @@ The steps to run the command:
 ```bash
 spark_rapids_user_tools databricks-azure diagnostic \
-  --cluster my-cluster-name
+  --cluster my-cluster-id
 ```
 
 If the connection to Databricks instances cannot be established through SSH, the command will raise error.
\ No newline at end of file
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
index 91268cf6b..77584aa81 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
@@ -115,16 +115,24 @@ def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list
     def pull_cluster_props_by_args(self, args: dict) -> str:
         get_cluster_cmd = ['databricks', 'clusters', 'get']
         if 'Id' in args:
-            get_cluster_cmd.extend(['--cluster-id', args.get('Id')])
+            get_cluster_cmd.extend([args.get('Id')])
         elif 'cluster' in args:
-            get_cluster_cmd.extend(['--cluster-name', args.get('cluster')])
+            # TODO: currently, arguments '--cpu_cluster' or '--gpu_cluster' are processed and stored as
+            # 'cluster' (as cluster names), while they are actually cluster ids for databricks platforms
+            get_cluster_cmd.extend([args.get('cluster')])
         else:
-            self.logger.error('Invalid arguments to pull the cluster properties')
-        cluster_described = self.run_sys_cmd(get_cluster_cmd)
-        if cluster_described is not None:
-            raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
-            return json.dumps(raw_prop_container.props)
-        return cluster_described
+            self.logger.error('Unable to pull cluster id or cluster name information')
+
+        try:
+            cluster_described = self.run_sys_cmd(get_cluster_cmd)
+            if cluster_described is not None:
+                raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
+                return json.dumps(raw_prop_container.props)
+        except Exception as ex:
+            self.logger.error('Invalid arguments to pull the cluster properties: %s', ex)
+            raise ex
+
+        return None
 
     def _build_cmd_ssh_prefix_for_node(self, node: ClusterNode) -> str:
         port = self.env_vars.get('sshPort')
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
index 6deb9c369..9b9d4a490 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
@@ -122,12 +122,24 @@ def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list
     def pull_cluster_props_by_args(self, args: dict) -> str:
         get_cluster_cmd = ['databricks', 'clusters', 'get']
         if 'Id' in args:
-            get_cluster_cmd.extend(['--cluster-id', args.get('Id')])
+            get_cluster_cmd.extend([args.get('Id')])
         elif 'cluster' in args:
-            get_cluster_cmd.extend(['--cluster-name', args.get('cluster')])
+            # TODO: currently, arguments '--cpu_cluster' or '--gpu_cluster' are processed and stored as
+            # 'cluster' (as cluster names), while they are actually cluster ids for databricks platforms
+            get_cluster_cmd.extend([args.get('cluster')])
         else:
-            self.logger.error('Invalid arguments to pull the cluster properties')
-        return self.run_sys_cmd(get_cluster_cmd)
+            self.logger.error('Unable to pull cluster id or cluster name information')
+
+        try:
+            cluster_described = self.run_sys_cmd(get_cluster_cmd)
+            if cluster_described is not None:
+                raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
+                return json.dumps(raw_prop_container.props)
+        except Exception as ex:
+            self.logger.error('Invalid arguments to pull the cluster properties: %s', ex)
+            raise ex
+
+        return None
 
     def _build_cmd_ssh_prefix_for_node(self, node: ClusterNode) -> str:
         port = self.env_vars.get('sshPort')
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
index c0e04c8f5..579f59d8c 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
@@ -141,7 +141,7 @@ def _list_inconsistent_configurations(self) -> list:
     def pull_cluster_props_by_args(self, args: dict) -> str:
         aws_cluster_id = args.get('Id')
         cluster_name = args.get('cluster')
-        if args.get('Id') is None:
+        if aws_cluster_id is None:
             # use cluster name to get the cluster values
             # we need to get the cluster_id from the list command first.
             list_cmd_res = self.exec_platform_list_cluster_by_name(cluster_name)
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py
index fc2ff55bd..ce1f6ca70 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py
@@ -52,12 +52,12 @@ def qualification(cpu_cluster: str = None,
         or query to GPU. The wrapper downloads dependencies and executes the analysis on the local dev machine.
         :param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-               can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+               can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
                generated by the databricks-CLI.
         :param eventlogs: Event log filenames or S3 storage directories containing event logs (comma separated).
                If missing, the wrapper reads the Spark's property `spark.eventLog.dir` defined in `cpu_cluster`.
                This property should be included
-               in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+               in the output of `databricks clusters get CLUSTER_ID [flags]`.
                Note that the wrapper will raise an exception if the property is not set.
         :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
         :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
@@ -69,7 +69,7 @@ def qualification(cpu_cluster: str = None,
         :param remote_folder: An S3 folder where the output is uploaded at the end of execution. If no value is
                provided, the output will be only available on local disk.
         :param gpu_cluster: The Databricks-cluster on which the Spark applications is planned to be migrated.
-               The argument can be a Databricks-cluster or a valid path to the cluster's properties file
+               The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
                (json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
                instances of the original cluster into databricks instances that support GPU acceleration.
         :param tools_jar: Path to a bundled jar including Rapids tool. The path is a local filesystem,
@@ -158,7 +158,7 @@ def profiling(gpu_cluster: str = None,
         which can be used for debugging and profiling Apache Spark applications.
         :param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-               can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+               can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
                generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
         :param worker_info: A path pointing to a yaml file containing the system information of a worker node.
                It is assumed that all workers are homogenous.
@@ -166,7 +166,7 @@ def profiling(gpu_cluster: str = None,
         :param eventlogs: Event log filenames or S3 storage directories containing event logs (comma separated).
                If missing, the wrapper reads the Spark's property `spark.eventLog.dir` defined in `gpu_cluster`.
                This property should be included
-               in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+               in the output of `databricks clusters get CLUSTER_ID [flags]`.
                Note that the wrapper will raise an exception if the property is not set.
         :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
         :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
@@ -236,7 +236,7 @@ def diagnostic(cluster: str,
         Diagnostic tool to collect information from Databricks cluster, such as OS version, # of worker nodes,
         Yarn configuration, Spark version and error logs etc. Please note, some sensitive information might be
         collected by this tool, e.g. access secret configured in configuration files or dumped to log files.
-        :param cluster: Name of the Databricks cluster running an accelerated computing instance.
+        :param cluster: ID of the Databricks cluster running an accelerated computing instance.
         :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
         :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
         :param output_folder: Local path where the archived result will be saved.
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py
index ef16ad299..cff76e536 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py
@@ -51,12 +51,12 @@ def qualification(cpu_cluster: str = None,
         or query to GPU. The wrapper downloads dependencies and executes the analysis on the local dev machine.
         :param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-               can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+               can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
                generated by the databricks-CLI.
         :param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories containing
                event logs (comma separated). If missing, the wrapper reads the Spark's property
                `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included
-               in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+               in the output of `databricks clusters get CLUSTER_ID [flags]`.
                Note that the wrapper will raise an exception if the property is not set.
         :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
         :param local_folder: Local work-directory path to store the output and to be used as root
@@ -67,7 +67,7 @@ def qualification(cpu_cluster: str = None,
         :param remote_folder: An ABFS (Azure Blob File System) folder where the output is uploaded at the end
                of execution. If no value is provided, the output will be only available on local disk.
         :param gpu_cluster: The Databricks-cluster on which the Spark applications are planned to be migrated.
-               The argument can be a Databricks-cluster or a valid path to the cluster's properties file
+               The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
                (json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
                instances of the original cluster into databricks instances that support GPU acceleration.
         :param tools_jar: Path to a bundled jar including Rapids tool. The path is a local filesystem,
@@ -153,7 +153,7 @@ def profiling(gpu_cluster: str = None,
         The Profiling tool analyzes both CPU or GPU generated event logs and generates information
         which can be used for debugging and profiling Apache Spark applications.
         :param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-               can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+               can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
                generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
         :param worker_info: A path pointing to a yaml file containing the system information of a worker node.
                It is assumed that all workers are homogenous.
@@ -161,7 +161,7 @@ def profiling(gpu_cluster: str = None,
         :param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories containing
                event logs (comma separated). If missing, the wrapper reads the Spark's property
                `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
-               in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+               in the output of `databricks clusters get CLUSTER_ID [flags]`.
                Note that the wrapper will raise an exception if the property is not set.
         :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
         :param local_folder: Local work-directory path to store the output and to be used as root
@@ -228,7 +228,7 @@ def diagnostic(cluster: str,
         Diagnostic tool to collect information from Databricks cluster, such as OS version, # of worker nodes,
         Yarn configuration, Spark version and error logs etc. Please note, some sensitive information might be
         collected by this tool, e.g. access secret configured in configuration files or dumped to log files.
-        :param cluster: Name of the Databricks cluster running an accelerated computing instance.
+        :param cluster: ID of the Databricks cluster running an accelerated computing instance.
         :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
         :param output_folder: Local path where the archived result will be saved. Note that this argument
                only accepts local filesystem.
                If the argument is NONE,
diff --git a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
index 2f971dcbe..e61be3d84 100644
--- a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -63,7 +63,7 @@ def qualification(self,
                Skipping this argument requires that the cluster argument points to a valid cluster name
                on the CSP.
-        :param cluster: Name of cluster or path to cluster-properties.
+        :param cluster: Name or ID (for databricks platforms) of cluster or path to cluster-properties.
         :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
                and "databricks-azure".
         :param target_platform: Cost savings and speedup recommendation for comparable cluster in
@@ -139,8 +139,8 @@ def profiling(self,
                containing event logs (comma separated). If missing, the wrapper reads the Spark's property
                `spark.eventLog.dir` defined in the `cluster`.
         :param cluster: The cluster on which the Spark applications were executed. The argument
-               can be a cluster name or a valid path to the cluster's properties file (json format)
-               generated by the CSP SDK.
+               can be a cluster name or ID (for databricks platforms) or a valid path to the cluster's
+               properties file (json format) generated by the CSP SDK.
         :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
                and "databricks-azure".
         :param output_folder: path to store the output.
@@ -173,7 +173,8 @@ def bootstrap(self,
         The tool will apply settings for the cluster assuming that jobs will run serially so that
         each job can use up all the cluster resources (CPU and GPU) when it is running.
-        :param cluster: Name of the cluster running an accelerated computing instance class
+        :param cluster: Name or ID (for databricks platforms) of the cluster running an accelerated
+               computing instance class
         :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
                and "databricks-azure".
         :param output_folder: path where the final recommendations will be saved.
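For reference, a minimal standalone sketch (not part of this patch) of the cluster-properties lookup that the updated `pull_cluster_props_by_args` implementations rely on: the Databricks cluster ID is passed to `databricks clusters get` as a positional argument rather than via `--cluster-id`/`--cluster-name`, and the JSON printed on stdout is parsed. The helper name `fetch_cluster_props` is hypothetical, and the snippet assumes a locally installed and authenticated Databricks CLI.

```python
import json
import subprocess


def fetch_cluster_props(cluster_id: str) -> dict:
    """Hypothetical helper: pull a cluster definition by its ID (positional argument)."""
    # Equivalent shell form: databricks clusters get <CLUSTER_ID>
    get_cluster_cmd = ['databricks', 'clusters', 'get', cluster_id]
    res = subprocess.run(get_cluster_cmd, capture_output=True, text=True, check=True)
    # The CLI prints the cluster definition as JSON on stdout.
    return json.loads(res.stdout)


if __name__ == '__main__':
    props = fetch_cluster_props('my-databricks-gpu-cluster-id')
    print(props.get('cluster_name'))
```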