From 6a368edbc19b9073abdf2967a6cf7a67ecdba3fd Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Mon, 9 Oct 2023 16:14:37 -0700 Subject: [PATCH 1/5] Add support for Dataproc GKE in core tools Signed-off-by: Partho Sarthi --- core/docs/spark-qualification-tool.md | 5 +- .../operatorsScore-dataproc-gke-t4.csv | 256 ++++++++++++++++++ .../qualification/PluginTypeChecker.scala | 3 + .../qualification/QualificationArgs.scala | 5 +- .../PluginTypeCheckerSuite.scala | 7 + .../qualification/QualificationSuite.scala | 24 ++ 6 files changed, 295 insertions(+), 5 deletions(-) create mode 100644 core/src/main/resources/operatorsScore-dataproc-gke-t4.csv diff --git a/core/docs/spark-qualification-tool.md b/core/docs/spark-qualification-tool.md index 023f4318b..25dd3cb5f 100644 --- a/core/docs/spark-qualification-tool.md +++ b/core/docs/spark-qualification-tool.md @@ -29,6 +29,7 @@ applicable environments. Here are the cluster information for the ETL benchmark | Dataproc (T4) | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB | | Dataproc (L4) | 8x n1-standard-16 | 8x g2-standard-16 | | Dataproc Serverless (L4) | 8x 16 cores | 8x 16 cores + 8x L4 24GB | +| Dataproc GKE (T4) | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB | | EMR (T4) | 8x m5d.8xlarge | 4x g4dn.12xlarge | | EMR (A10) | 8x m5d.8xlarge | 8x g5.8xlarge | | Databricks AWS | 8x m6gd.8xlage | 8x g5.8xlarge | @@ -248,8 +249,8 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* -p, --per-sql Report at the individual SQL query level. --platform Cluster platform where Spark CPU workloads were executed. Options include onprem, dataproc-t4, - dataproc-l4, dataproc-serverless-l4, emr-t4, - emr-a10, databricks-aws, and databricks-azure. + dataproc-l4, dataproc-serverless-l4, dataproc-gke-t4, + emr-t4, emr-a10, databricks-aws, and databricks-azure. Default is onprem. -r, --report-read-schema Whether to output the read formats and datatypes to the CSV file. This can be very diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv new file mode 100644 index 000000000..493f1154c --- /dev/null +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -0,0 +1,256 @@ +CPUOperator,Score +CoalesceExec,4.25 +CollectLimitExec,4.25 +ExpandExec,7.76 +FileSourceScanExec,3.64 +FilterExec,4.47 +GenerateExec,4.25 +GlobalLimitExec,4.25 +LocalLimitExec,4.25 +ProjectExec,4.25 +RangeExec,4.25 +SampleExec,4.25 +SortExec,4.25 +TakeOrderedAndProjectExec,20.96 +HashAggregateExec,5.54 +ObjectHashAggregateExec,5.54 +SortAggregateExec,5.54 +DataWritingCommandExec,4.25 +ExecutedCommandExec,4.25 +BatchScanExec,3.64 +ShuffleExchangeExec,5.21 +BroadcastHashJoinExec,6.42 +BroadcastNestedLoopJoinExec,17.46 +CartesianProductExec,4.25 +ShuffledHashJoinExec,4.25 +SortMergeJoinExec,7.4 +WindowExec,4.25 +Abs,4.25 +Acos,4.25 +Acosh,4.25 +Add,4.25 +AggregateExpression,4.25 +Alias,4.25 +And,4.25 +ApproximatePercentile,4.25 +ArrayContains,4.25 +ArrayExcept,4.25 +ArrayExists,4.25 +ArrayIntersect,4.25 +ArrayMax,4.25 +ArrayMin,4.25 +ArrayRemove,4.25 +ArrayRepeat,4.25 +ArrayTransform,4.25 +ArrayUnion,4.25 +ArraysOverlap,4.25 +ArraysZip,4.25 +Asin,4.25 +Asinh,4.25 +AtLeastNNonNulls,4.25 +Atan,4.25 +Atanh,4.25 +AttributeReference,4.25 +Average,4.25 +BRound,4.25 +BitLength,4.25 +BitwiseAnd,4.25 +BitwiseNot,4.25 +BitwiseOr,4.25 +BitwiseXor,4.25 +CaseWhen,4.25 +Cbrt,4.25 +Ceil,4.25 +CheckOverflow,4.25 +Coalesce,4.25 +CollectList,4.25 +CollectSet,4.25 +Concat,4.25 +ConcatWs,4.25 +Contains,4.25 +Conv,4.25 +Cos,4.25 +Cosh,4.25 +Cot,4.25 +Count,4.25 +CreateArray,4.25 +CreateMap,4.25 +CreateNamedStruct,4.25 +CurrentRow$,4.25 +DateAdd,4.25 +DateAddInterval,4.25 +DateDiff,4.25 +DateFormatClass,4.25 +DateSub,4.25 +DayOfMonth,4.25 +DayOfWeek,4.25 +DayOfYear,4.25 +DenseRank,4.25 +Divide,4.25 +DynamicPruningExpression,4.25 +ElementAt,4.25 +EndsWith,4.25 +EqualNullSafe,4.25 +EqualTo,4.25 +Exp,4.25 +Explode,4.25 +Expm1,4.25 +First,4.25 +Flatten,4.25 +Floor,4.25 +FromUTCTimestamp,4.25 +FromUnixTime,4.25 +GetArrayItem,4.25 +GetArrayStructFields,4.25 +GetJsonObject,4.25 +GetMapValue,4.25 +GetStructField,4.25 +GetTimestamp,4.25 +GreaterThan,4.25 +GreaterThanOrEqual,4.25 +Greatest,4.25 +HiveGenericUDF,4.25 +HiveSimpleUDF,4.25 +Hour,4.25 +Hypot,4.25 +If,4.25 +In,4.25 +InSet,4.25 +InitCap,4.25 +InputFileBlockLength,4.25 +InputFileBlockStart,4.25 +InputFileName,4.25 +IntegralDivide,4.25 +IsNaN,4.25 +IsNotNull,4.25 +IsNull,4.25 +JsonToStructs,4.25 +JsonTuple,4.25 +KnownFloatingPointNormalized,4.25 +KnownNotNull,4.25 +Lag,4.25 +LambdaFunction,4.25 +Last,4.25 +LastDay,4.25 +Lead,4.25 +Least,4.25 +Length,4.25 +LessThan,4.25 +LessThanOrEqual,4.25 +Like,4.25 +Literal,4.25 +Log,4.25 +Log10,4.25 +Log1p,4.25 +Log2,4.25 +Logarithm,4.25 +Lower,4.25 +MakeDecimal,4.25 +MapConcat,4.25 +MapEntries,4.25 +MapFilter,4.25 +MapKeys,4.25 +MapValues,4.25 +Max,4.25 +Md5,4.25 +MicrosToTimestamp,4.25 +MillisToTimestamp,4.25 +Min,4.25 +Minute,4.25 +MonotonicallyIncreasingID,4.25 +Month,4.25 +Multiply,4.25 +Murmur3Hash,4.25 +NaNvl,4.25 +NamedLambdaVariable,4.25 +NormalizeNaNAndZero,4.25 +Not,4.25 +NthValue,4.25 +OctetLength,4.25 +Or,4.25 +PercentRank,4.25 +PivotFirst,4.25 +Pmod,4.25 +PosExplode,4.25 +Pow,4.25 +PreciseTimestampConversion,4.25 +PromotePrecision,4.25 +PythonUDF,4.25 +Quarter,4.25 +RLike,4.25 +RaiseError,4.25 +Rand,4.25 +Rank,4.25 +RegExpExtract,4.25 +RegExpExtractAll,4.25 +RegExpReplace,4.25 +Remainder,4.25 +ReplicateRows,4.25 +Reverse,4.25 +Rint,4.25 +Round,4.25 +RowNumber,4.25 +ScalaUDF,4.25 +ScalarSubquery,4.25 +Second,4.25 +SecondsToTimestamp,4.25 +Sequence,4.25 +ShiftLeft,4.25 +ShiftRight,4.25 +ShiftRightUnsigned,4.25 +Signum,4.25 +Sin,4.25 +Sinh,4.25 +Size,4.25 +SortArray,4.25 +SortOrder,4.25 +SparkPartitionID,4.25 +SpecifiedWindowFrame,4.25 +Sqrt,4.25 +StartsWith,4.25 +StddevPop,4.25 +StddevSamp,4.25 +StringInstr,4.25 +StringLPad,4.25 +StringLocate,4.25 +StringRPad,4.25 +StringRepeat,4.25 +StringReplace,4.25 +StringSplit,4.25 +StringToMap,4.25 +StringTranslate,4.25 +StringTrim,4.25 +StringTrimLeft,4.25 +StringTrimRight,4.25 +Substring,4.25 +SubstringIndex,4.25 +Subtract,4.25 +Sum,4.25 +Tan,4.25 +Tanh,4.25 +TimeAdd,4.25 +ToDegrees,4.25 +ToRadians,4.25 +ToUnixTimestamp,4.25 +TransformKeys,4.25 +TransformValues,4.25 +UnaryMinus,4.25 +UnaryPositive,4.25 +UnboundedFollowing$,4.25 +UnboundedPreceding$,4.25 +UnixTimestamp,4.25 +UnscaledValue,4.25 +Upper,4.25 +VariancePop,4.25 +VarianceSamp,4.25 +WeekDay,4.25 +WindowExpression,4.25 +WindowSpecDefinition,4.25 +XxHash64,4.25 +Year,4.25 +AggregateInPandasExec,1.2 +ArrowEvalPythonExec,1.2 +FlatMapGroupsInPandasExec,1.2 +FlatMapCoGroupsInPandasExec,1.2 +MapInPandasExec,1.2 +WindowInPandasExec,1.2 diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala index bd5641363..c01586159 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala @@ -48,6 +48,8 @@ class PluginTypeChecker(platform: String = "onprem", private val OPERATORS_SCORE_FILE_DATAPROC_T4 = "operatorsScore-dataproc-t4.csv" private val OPERATORS_SCORE_FILE_DATAPROC_L4 = "operatorsScore-dataproc-l4.csv" private val OPERATORS_SCORE_FILE_DATAPROC_SL_L4 = "operatorsScore-dataproc-serverless-l4.csv" + // TODO: Replace this with GKE T4 speedup scores + private val OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 = "operatorsScore.csv" private val OPERATORS_SCORE_FILE_EMR_T4 = "operatorsScore-emr-t4.csv" private val OPERATORS_SCORE_FILE_EMR_A10 = "operatorsScore-emr-a10.csv" private val OPERATORS_SCORE_FILE_DATABRICKS_AWS = "operatorsScore-databricks-aws.csv" @@ -104,6 +106,7 @@ class PluginTypeChecker(platform: String = "onprem", case "dataproc-t4" | "dataproc" => OPERATORS_SCORE_FILE_DATAPROC_T4 case "dataproc-l4" => OPERATORS_SCORE_FILE_DATAPROC_L4 case "dataproc-serverless-l4" => OPERATORS_SCORE_FILE_DATAPROC_SL_L4 + case "dataproc-gke-t4" => OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 // if no GPU specified, then default to emr-t4 for backward compatibility case "emr-t4" | "emr" => OPERATORS_SCORE_FILE_EMR_T4 case "emr-a10" => OPERATORS_SCORE_FILE_EMR_A10 diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala index 075c3512f..76a2fba8e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala @@ -148,9 +148,8 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* val platform: ScallopOption[String] = opt[String](required = false, descr = "Cluster platform where Spark CPU workloads were executed. Options include " + - "onprem, dataproc-t4, dataproc-l4, dataproc-serverless-l4, emr-t4, emr-a10, " + - "databricks-aws, and databricks-azure. " + - "Default is onprem.", + "onprem, dataproc-t4, dataproc-l4, dataproc-serverless-l4, dataproc-gke-t4, emr-t4, " + + "emr-a10, databricks-aws, and databricks-azure. Default is onprem.", default = Some("onprem")) val speedupFactorFile: ScallopOption[String] = opt[String](required = false, diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala index e7720fd28..6a183dd3e 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala @@ -188,12 +188,19 @@ class PluginTypeCheckerSuite extends FunSuite with Logging { assert(checker.getSpeedupFactor("WindowExec") == 4.25) assert(checker.getSpeedupFactor("Ceil") == 4.25) } + test("supported operator score from dataproc-l4") { val checker = new PluginTypeChecker("dataproc-l4") assert(checker.getSpeedupFactor("UnionExec") == 4.16) assert(checker.getSpeedupFactor("Ceil") == 4.16) } + test("supported operator score from dataproc-gke-t4") { + val checker = new PluginTypeChecker("dataproc-gke-t4") + assert(checker.getSpeedupFactor("WindowExec") == 4.25) + assert(checker.getSpeedupFactor("Ceil") == 4.25) + } + test("supported operator score from emr-a10") { val checker = new PluginTypeChecker("emr-a10") assert(checker.getSpeedupFactor("UnionExec") == 2.59) diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala index 502ddffc1..982a65ec8 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala @@ -1529,6 +1529,30 @@ class QualificationSuite extends BaseTestSuite { assert(outputActual.collect().size == 1) } + // run the qualification tool for dataproc-gke-t4 + TrampolineUtil.withTempDir { outpath => + val appArgs = new QualificationArgs(Array( + "--output-directory", + outpath.getAbsolutePath, + "--platform", + "dataproc-gke-t4", + eventLog)) + + val (exit, _) = + QualificationMain.mainInternal(appArgs) + assert(exit == 0) + + // the code above that runs the Spark query stops the Sparksession + // so create a new one to read in the csv file + createSparkSession() + + // validate that the SQL description in the csv file escapes commas properly + val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + + s"rapids_4_spark_qualification_output.csv" + val outputActual = readExpectedFile(new File(outputResults)) + assert(outputActual.collect().size == 1) + } + // run the qualification tool for databricks-aws TrampolineUtil.withTempDir { outpath => val appArgs = new QualificationArgs(Array( From 49a0f21ef435da98547b466294842e3c5a0e0544 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Tue, 10 Oct 2023 10:19:09 -0700 Subject: [PATCH 2/5] Update scores for Dataproc GKE Signed-off-by: Partho Sarthi --- .../operatorsScore-dataproc-gke-t4.csv | 510 +++++++++--------- .../qualification/PluginTypeChecker.scala | 3 +- 2 files changed, 262 insertions(+), 251 deletions(-) diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv index 493f1154c..e5b3f9525 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -1,256 +1,268 @@ CPUOperator,Score -CoalesceExec,4.25 -CollectLimitExec,4.25 -ExpandExec,7.76 -FileSourceScanExec,3.64 -FilterExec,4.47 -GenerateExec,4.25 -GlobalLimitExec,4.25 -LocalLimitExec,4.25 -ProjectExec,4.25 -RangeExec,4.25 -SampleExec,4.25 -SortExec,4.25 -TakeOrderedAndProjectExec,20.96 -HashAggregateExec,5.54 -ObjectHashAggregateExec,5.54 -SortAggregateExec,5.54 -DataWritingCommandExec,4.25 -ExecutedCommandExec,4.25 -BatchScanExec,3.64 -ShuffleExchangeExec,5.21 -BroadcastHashJoinExec,6.42 -BroadcastNestedLoopJoinExec,17.46 -CartesianProductExec,4.25 -ShuffledHashJoinExec,4.25 -SortMergeJoinExec,7.4 -WindowExec,4.25 -Abs,4.25 -Acos,4.25 -Acosh,4.25 -Add,4.25 -AggregateExpression,4.25 -Alias,4.25 -And,4.25 -ApproximatePercentile,4.25 -ArrayContains,4.25 -ArrayExcept,4.25 -ArrayExists,4.25 -ArrayIntersect,4.25 -ArrayMax,4.25 -ArrayMin,4.25 -ArrayRemove,4.25 -ArrayRepeat,4.25 -ArrayTransform,4.25 -ArrayUnion,4.25 -ArraysOverlap,4.25 -ArraysZip,4.25 -Asin,4.25 -Asinh,4.25 -AtLeastNNonNulls,4.25 -Atan,4.25 -Atanh,4.25 -AttributeReference,4.25 -Average,4.25 -BRound,4.25 -BitLength,4.25 -BitwiseAnd,4.25 -BitwiseNot,4.25 -BitwiseOr,4.25 -BitwiseXor,4.25 -CaseWhen,4.25 -Cbrt,4.25 -Ceil,4.25 -CheckOverflow,4.25 -Coalesce,4.25 -CollectList,4.25 -CollectSet,4.25 -Concat,4.25 -ConcatWs,4.25 -Contains,4.25 -Conv,4.25 -Cos,4.25 -Cosh,4.25 -Cot,4.25 -Count,4.25 -CreateArray,4.25 -CreateMap,4.25 -CreateNamedStruct,4.25 -CurrentRow$,4.25 -DateAdd,4.25 -DateAddInterval,4.25 -DateDiff,4.25 -DateFormatClass,4.25 -DateSub,4.25 -DayOfMonth,4.25 -DayOfWeek,4.25 -DayOfYear,4.25 -DenseRank,4.25 -Divide,4.25 -DynamicPruningExpression,4.25 -ElementAt,4.25 -EndsWith,4.25 -EqualNullSafe,4.25 -EqualTo,4.25 -Exp,4.25 -Explode,4.25 -Expm1,4.25 -First,4.25 -Flatten,4.25 -Floor,4.25 -FromUTCTimestamp,4.25 -FromUnixTime,4.25 -GetArrayItem,4.25 -GetArrayStructFields,4.25 -GetJsonObject,4.25 -GetMapValue,4.25 -GetStructField,4.25 -GetTimestamp,4.25 -GreaterThan,4.25 -GreaterThanOrEqual,4.25 -Greatest,4.25 -HiveGenericUDF,4.25 -HiveSimpleUDF,4.25 -Hour,4.25 -Hypot,4.25 -If,4.25 -In,4.25 -InSet,4.25 -InitCap,4.25 -InputFileBlockLength,4.25 -InputFileBlockStart,4.25 -InputFileName,4.25 -IntegralDivide,4.25 -IsNaN,4.25 -IsNotNull,4.25 -IsNull,4.25 -JsonToStructs,4.25 -JsonTuple,4.25 -KnownFloatingPointNormalized,4.25 -KnownNotNull,4.25 -Lag,4.25 -LambdaFunction,4.25 -Last,4.25 -LastDay,4.25 -Lead,4.25 -Least,4.25 -Length,4.25 -LessThan,4.25 -LessThanOrEqual,4.25 -Like,4.25 -Literal,4.25 -Log,4.25 -Log10,4.25 -Log1p,4.25 -Log2,4.25 -Logarithm,4.25 -Lower,4.25 -MakeDecimal,4.25 -MapConcat,4.25 -MapEntries,4.25 -MapFilter,4.25 -MapKeys,4.25 -MapValues,4.25 -Max,4.25 -Md5,4.25 -MicrosToTimestamp,4.25 -MillisToTimestamp,4.25 -Min,4.25 -Minute,4.25 -MonotonicallyIncreasingID,4.25 -Month,4.25 -Multiply,4.25 -Murmur3Hash,4.25 -NaNvl,4.25 -NamedLambdaVariable,4.25 -NormalizeNaNAndZero,4.25 -Not,4.25 -NthValue,4.25 -OctetLength,4.25 -Or,4.25 -PercentRank,4.25 -PivotFirst,4.25 -Pmod,4.25 -PosExplode,4.25 -Pow,4.25 -PreciseTimestampConversion,4.25 -PromotePrecision,4.25 -PythonUDF,4.25 -Quarter,4.25 -RLike,4.25 -RaiseError,4.25 -Rand,4.25 -Rank,4.25 -RegExpExtract,4.25 -RegExpExtractAll,4.25 -RegExpReplace,4.25 -Remainder,4.25 -ReplicateRows,4.25 -Reverse,4.25 -Rint,4.25 -Round,4.25 -RowNumber,4.25 -ScalaUDF,4.25 -ScalarSubquery,4.25 -Second,4.25 -SecondsToTimestamp,4.25 -Sequence,4.25 -ShiftLeft,4.25 -ShiftRight,4.25 -ShiftRightUnsigned,4.25 -Signum,4.25 -Sin,4.25 -Sinh,4.25 -Size,4.25 -SortArray,4.25 -SortOrder,4.25 -SparkPartitionID,4.25 -SpecifiedWindowFrame,4.25 -Sqrt,4.25 -StartsWith,4.25 -StddevPop,4.25 -StddevSamp,4.25 -StringInstr,4.25 -StringLPad,4.25 -StringLocate,4.25 -StringRPad,4.25 -StringRepeat,4.25 -StringReplace,4.25 -StringSplit,4.25 -StringToMap,4.25 -StringTranslate,4.25 -StringTrim,4.25 -StringTrimLeft,4.25 -StringTrimRight,4.25 -Substring,4.25 -SubstringIndex,4.25 -Subtract,4.25 -Sum,4.25 -Tan,4.25 -Tanh,4.25 -TimeAdd,4.25 -ToDegrees,4.25 -ToRadians,4.25 -ToUnixTimestamp,4.25 -TransformKeys,4.25 -TransformValues,4.25 -UnaryMinus,4.25 -UnaryPositive,4.25 -UnboundedFollowing$,4.25 -UnboundedPreceding$,4.25 -UnixTimestamp,4.25 -UnscaledValue,4.25 -Upper,4.25 -VariancePop,4.25 -VarianceSamp,4.25 -WeekDay,4.25 -WindowExpression,4.25 -WindowSpecDefinition,4.25 -XxHash64,4.25 -Year,4.25 +CoalesceExec,3.65 +CollectLimitExec,3.65 +ExpandExec,3.76 +FileSourceScanExec,2.84 +FilterExec,3.79 +GenerateExec,3.65 +GlobalLimitExec,3.65 +LocalLimitExec,3.65 +ProjectExec,3.65 +RangeExec,3.65 +SampleExec,3.65 +SortExec,3.65 +TakeOrderedAndProjectExec,3.65 +HashAggregateExec,4.1 +ObjectHashAggregateExec,4.1 +SortAggregateExec,4.1 +DataWritingCommandExec,3.65 +ExecutedCommandExec,3.65 +BatchScanExec,2.84 +ShuffleExchangeExec,3.69 +BroadcastHashJoinExec,3.72 +BroadcastNestedLoopJoinExec,1.66 +CartesianProductExec,3.65 +ShuffledHashJoinExec,3.65 +SortMergeJoinExec,5.64 +WindowExec,3.65 +Abs,3.65 +Acos,3.65 +Acosh,3.65 +Add,3.65 +AggregateExpression,3.65 +Alias,3.65 +And,3.65 +ApproximatePercentile,3.65 +ArrayContains,3.65 +ArrayExcept,3.65 +ArrayExists,3.65 +ArrayIntersect,3.65 +ArrayMax,3.65 +ArrayMin,3.65 +ArrayRemove,3.65 +ArrayRepeat,3.65 +ArrayTransform,3.65 +ArrayUnion,3.65 +ArraysOverlap,3.65 +ArraysZip,3.65 +Asin,3.65 +Asinh,3.65 +AtLeastNNonNulls,3.65 +Atan,3.65 +Atanh,3.65 +AttributeReference,3.65 +Average,3.65 +BRound,3.65 +BitLength,3.65 +BitwiseAnd,3.65 +BitwiseNot,3.65 +BitwiseOr,3.65 +BitwiseXor,3.65 +CaseWhen,3.65 +Cbrt,3.65 +Ceil,3.65 +CheckOverflow,3.65 +Coalesce,3.65 +CollectList,3.65 +CollectSet,3.65 +Concat,3.65 +ConcatWs,3.65 +Contains,3.65 +Conv,3.65 +Cos,3.65 +Cosh,3.65 +Cot,3.65 +Count,3.65 +CreateArray,3.65 +CreateMap,3.65 +CreateNamedStruct,3.65 +CurrentRow$,3.65 +DateAdd,3.65 +DateAddInterval,3.65 +DateDiff,3.65 +DateFormatClass,3.65 +DateSub,3.65 +DayOfMonth,3.65 +DayOfWeek,3.65 +DayOfYear,3.65 +DenseRank,3.65 +Divide,3.65 +DynamicPruningExpression,3.65 +ElementAt,3.65 +EndsWith,3.65 +EqualNullSafe,3.65 +EqualTo,3.65 +Exp,3.65 +Explode,3.65 +Expm1,3.65 +First,3.65 +Flatten,3.65 +Floor,3.65 +FromUTCTimestamp,3.65 +FromUnixTime,3.65 +GetArrayItem,3.65 +GetArrayStructFields,3.65 +GetJsonObject,3.65 +GetMapValue,3.65 +GetStructField,3.65 +GetTimestamp,3.65 +GreaterThan,3.65 +GreaterThanOrEqual,3.65 +Greatest,3.65 +HiveGenericUDF,3.65 +HiveSimpleUDF,3.65 +Hour,3.65 +Hypot,3.65 +If,3.65 +In,3.65 +InSet,3.65 +InitCap,3.65 +InputFileBlockLength,3.65 +InputFileBlockStart,3.65 +InputFileName,3.65 +IntegralDivide,3.65 +IsNaN,3.65 +IsNotNull,3.65 +IsNull,3.65 +JsonToStructs,3.65 +JsonTuple,3.65 +KnownFloatingPointNormalized,3.65 +KnownNotNull,3.65 +Lag,3.65 +LambdaFunction,3.65 +Last,3.65 +LastDay,3.65 +Lead,3.65 +Least,3.65 +Length,3.65 +LessThan,3.65 +LessThanOrEqual,3.65 +Like,3.65 +Literal,3.65 +Log,3.65 +Log10,3.65 +Log1p,3.65 +Log2,3.65 +Logarithm,3.65 +Lower,3.65 +MakeDecimal,3.65 +MapConcat,3.65 +MapEntries,3.65 +MapFilter,3.65 +MapKeys,3.65 +MapValues,3.65 +Max,3.65 +Md5,3.65 +MicrosToTimestamp,3.65 +MillisToTimestamp,3.65 +Min,3.65 +Minute,3.65 +MonotonicallyIncreasingID,3.65 +Month,3.65 +Multiply,3.65 +Murmur3Hash,3.65 +NaNvl,3.65 +NamedLambdaVariable,3.65 +NormalizeNaNAndZero,3.65 +Not,3.65 +NthValue,3.65 +OctetLength,3.65 +Or,3.65 +PercentRank,3.65 +PivotFirst,3.65 +Pmod,3.65 +PosExplode,3.65 +Pow,3.65 +PreciseTimestampConversion,3.65 +PromotePrecision,3.65 +PythonUDF,3.65 +Quarter,3.65 +RLike,3.65 +RaiseError,3.65 +Rand,3.65 +Rank,3.65 +RegExpExtract,3.65 +RegExpExtractAll,3.65 +RegExpReplace,3.65 +Remainder,3.65 +ReplicateRows,3.65 +Reverse,3.65 +Rint,3.65 +Round,3.65 +RowNumber,3.65 +ScalaUDF,3.65 +ScalarSubquery,3.65 +Second,3.65 +SecondsToTimestamp,3.65 +Sequence,3.65 +ShiftLeft,3.65 +ShiftRight,3.65 +ShiftRightUnsigned,3.65 +Signum,3.65 +Sin,3.65 +Sinh,3.65 +Size,3.65 +SortArray,3.65 +SortOrder,3.65 +SparkPartitionID,3.65 +SpecifiedWindowFrame,3.65 +Sqrt,3.65 +StartsWith,3.65 +StddevPop,3.65 +StddevSamp,3.65 +StringInstr,3.65 +StringLPad,3.65 +StringLocate,3.65 +StringRPad,3.65 +StringRepeat,3.65 +StringReplace,3.65 +StringSplit,3.65 +StringToMap,3.65 +StringTranslate,3.65 +StringTrim,3.65 +StringTrimLeft,3.65 +StringTrimRight,3.65 +Substring,3.65 +SubstringIndex,3.65 +Subtract,3.65 +Sum,3.65 +Tan,3.65 +Tanh,3.65 +TimeAdd,3.65 +ToDegrees,3.65 +ToRadians,3.65 +ToUnixTimestamp,3.65 +TransformKeys,3.65 +TransformValues,3.65 +UnaryMinus,3.65 +UnaryPositive,3.65 +UnboundedFollowing$,3.65 +UnboundedPreceding$,3.65 +UnixTimestamp,3.65 +UnscaledValue,3.65 +Upper,3.65 +VariancePop,3.65 +VarianceSamp,3.65 +WeekDay,3.65 +WindowExpression,3.65 +WindowSpecDefinition,3.65 +XxHash64,3.65 +Year,3.65 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 FlatMapCoGroupsInPandasExec,1.2 MapInPandasExec,1.2 WindowInPandasExec,1.2 +KMeans-pyspark,8.86 +KMeans-scala,1.0 +PCA-pyspark,2.24 +PCA-scala,2.69 +LinearRegression-pyspark,2.0 +LinearRegression-scala,1.0 +RandomForestClassifier-pyspark,6.31 +RandomForestClassifier-scala,1.0 +RandomForestRegressor-pyspark,3.66 +RandomForestRegressor-scala,1.0 +XGBoost-pyspark,1.0 +XGBoost-scala,3.31 diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala index c01586159..d14336618 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala @@ -48,8 +48,7 @@ class PluginTypeChecker(platform: String = "onprem", private val OPERATORS_SCORE_FILE_DATAPROC_T4 = "operatorsScore-dataproc-t4.csv" private val OPERATORS_SCORE_FILE_DATAPROC_L4 = "operatorsScore-dataproc-l4.csv" private val OPERATORS_SCORE_FILE_DATAPROC_SL_L4 = "operatorsScore-dataproc-serverless-l4.csv" - // TODO: Replace this with GKE T4 speedup scores - private val OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 = "operatorsScore.csv" + private val OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 = "operatorsScore-dataproc-gke-t4.csv" private val OPERATORS_SCORE_FILE_EMR_T4 = "operatorsScore-emr-t4.csv" private val OPERATORS_SCORE_FILE_EMR_A10 = "operatorsScore-emr-a10.csv" private val OPERATORS_SCORE_FILE_DATABRICKS_AWS = "operatorsScore-databricks-aws.csv" From f71a72e835961967b4f7d189dfd6e14123577256 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Tue, 10 Oct 2023 12:46:53 -0700 Subject: [PATCH 3/5] Update docs Signed-off-by: Partho Sarthi --- core/docs/spark-qualification-tool.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/docs/spark-qualification-tool.md b/core/docs/spark-qualification-tool.md index 25dd3cb5f..64ce149f0 100644 --- a/core/docs/spark-qualification-tool.md +++ b/core/docs/spark-qualification-tool.md @@ -29,7 +29,7 @@ applicable environments. Here are the cluster information for the ETL benchmark | Dataproc (T4) | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB | | Dataproc (L4) | 8x n1-standard-16 | 8x g2-standard-16 | | Dataproc Serverless (L4) | 8x 16 cores | 8x 16 cores + 8x L4 24GB | -| Dataproc GKE (T4) | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB | +| Dataproc GKE (T4) | 8x n1-standard-32 | 8x n1-standard-32 + 8x T4 16GB | | EMR (T4) | 8x m5d.8xlarge | 4x g4dn.12xlarge | | EMR (A10) | 8x m5d.8xlarge | 8x g5.8xlarge | | Databricks AWS | 8x m6gd.8xlage | 8x g5.8xlarge | From 5c1fda4d8c89b4089fe296e81fe25c5f780390c2 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Tue, 10 Oct 2023 14:13:01 -0700 Subject: [PATCH 4/5] Update tests with Dataproc GKE operator scores Signed-off-by: Partho Sarthi --- .../rapids/tool/qualification/PluginTypeCheckerSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala index 6a183dd3e..3ba8badc9 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala @@ -197,8 +197,8 @@ class PluginTypeCheckerSuite extends FunSuite with Logging { test("supported operator score from dataproc-gke-t4") { val checker = new PluginTypeChecker("dataproc-gke-t4") - assert(checker.getSpeedupFactor("WindowExec") == 4.25) - assert(checker.getSpeedupFactor("Ceil") == 4.25) + assert(checker.getSpeedupFactor("WindowExec") == 3.65) + assert(checker.getSpeedupFactor("Ceil") == 3.65) } test("supported operator score from emr-a10") { From deb5e9add446d14003294e64add1b3781397da46 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Tue, 10 Oct 2023 17:59:57 -0700 Subject: [PATCH 5/5] Remove Spark-ML operators Signed-off-by: Partho Sarthi --- .../resources/operatorsScore-dataproc-gke-t4.csv | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv index e5b3f9525..3083cbe8b 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -254,15 +254,3 @@ FlatMapGroupsInPandasExec,1.2 FlatMapCoGroupsInPandasExec,1.2 MapInPandasExec,1.2 WindowInPandasExec,1.2 -KMeans-pyspark,8.86 -KMeans-scala,1.0 -PCA-pyspark,2.24 -PCA-scala,2.69 -LinearRegression-pyspark,2.0 -LinearRegression-scala,1.0 -RandomForestClassifier-pyspark,6.31 -RandomForestClassifier-scala,1.0 -RandomForestRegressor-pyspark,3.66 -RandomForestRegressor-scala,1.0 -XGBoost-pyspark,1.0 -XGBoost-scala,3.31