diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml
index 12bcb9d79..227dbdd0b 100644
--- a/.github/workflows/mvn-verify-check.yml
+++ b/.github/workflows/mvn-verify-check.yml
@@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- spark-version: ['311', '320', '330', '341']
+ spark-version: ['311', '320', '333', '341']
steps:
- uses: actions/checkout@v3
diff --git a/core/README.md b/core/README.md
index a3b8359f8..8000a5460 100644
--- a/core/README.md
+++ b/core/README.md
@@ -20,14 +20,14 @@ mvn clean package
```
After a successful build, the jar of 'rapids-4-spark-tools_2.12-*-SNAPSHOT.jar' will be in 'target/' directory.
-This will build the plugin for a single version of Spark. By default, this is Apache Spark 3.1.1.
+This will build the plugin for a single version of Spark. By default, this is Apache Spark 3.3.3.
For development purpose, you may need to run the tests against different spark versions.
To run the tests against a specific Spark version, you can use the `-Dbuildver=XXX` command line option.
-For instance to build Spark 3.3.0 you would use:
+For instance, to build against Spark 3.4.1 you would use:
```shell script
-mvn -Dbuildver=330 clean package
+mvn -Dbuildver=341 clean package
```
Run `mvn help:all-profiles` to list supported Spark versions.
diff --git a/core/pom.xml b/core/pom.xml
index ae3902d40..07f4958f8 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -23,7 +23,7 @@
rapids-4-spark-tools_2.12
RAPIDS Accelerator for Apache Spark tools
RAPIDS Accelerator for Apache Spark tools
- 23.08.1-SNAPSHOT
+ 23.08.2-SNAPSHOT
jar
http://github.com/NVIDIA/spark-rapids-tools
@@ -73,7 +73,6 @@
release311
- true
buildver
311
@@ -83,7 +82,7 @@
311
${spark311.version}
${delta10x.version}
- 3.2.0
+ 3.3.6
@@ -98,7 +97,7 @@
312
${spark312.version}
${delta10x.version}
- 3.2.0
+ 3.3.6
@@ -113,7 +112,22 @@
313
${spark313.version}
${delta10x.version}
- 3.2.0
+ 3.3.6
+
+
+
+ release314
+
+
+ buildver
+ 314
+
+
+
+ 314
+ ${spark314.version}
+ ${delta10x.version}
+ 3.3.6
@@ -128,7 +142,7 @@
320
${spark320.version}
${delta20x.version}
- 3.3.1
+ 3.3.6
@@ -143,7 +157,7 @@
321
${spark321.version}
${delta20x.version}
- 3.3.1
+ 3.3.6
@@ -158,7 +172,7 @@
322
${spark322.version}
${delta20x.version}
- 3.3.1
+ 3.3.6
@@ -173,7 +187,7 @@
323
${spark323.version}
${delta20x.version}
- 3.3.1
+ 3.3.6
@@ -188,7 +202,7 @@
324
${spark324.version}
${delta20x.version}
- 3.3.1
+ 3.3.6
@@ -203,7 +217,7 @@
325
${spark325.version}
${delta20x.version}
- 3.3.5
+ 3.3.6
@@ -218,7 +232,7 @@
330
${spark330.version}
${delta23x.version}
- 3.3.2
+ 3.3.6
@@ -233,7 +247,7 @@
331
${spark331.version}
${delta23x.version}
- 3.3.2
+ 3.3.6
@@ -248,12 +262,13 @@
332
${spark332.version}
${delta23x.version}
- 3.3.2
+ 3.3.6
release333
+ true
buildver
333
@@ -263,7 +278,22 @@
333
${spark333.version}
${delta23x.version}
- 3.3.4
+ 3.3.6
+
+
+
+ release334
+
+
+ buildver
+ 334
+
+
+
+ 334
+ ${spark334.version}
+ ${delta23x.version}
+ 3.3.6
@@ -278,7 +308,7 @@
340
${spark340.version}
${delta24x.version}
- 3.3.4
+ 3.3.6
@@ -293,7 +323,22 @@
341
${spark341.version}
${delta24x.version}
- 3.3.4
+ 3.3.6
+
+
+
+ release342
+
+
+ buildver
+ 342
+
+
+
+ 342
+ ${spark342.version}
+ ${delta24x.version}
+ 3.3.6
@@ -308,7 +353,37 @@
350
${spark350.version}
${delta24x.version}
- 3.3.5
+ 3.3.6
+
+
+
+ release351
+
+
+ buildver
+ 351
+
+
+
+ 351
+ ${spark351.version}
+ ${delta24x.version}
+ 3.3.6
+
+
+
+ release400
+
+
+ buildver
+ 400
+
+
+
+ 400
+ ${spark400.version}
+ ${delta24x.version}
+ 3.3.6
@@ -316,6 +391,7 @@
3.1.1
3.1.2
3.1.3
+ 3.1.4-SNAPSHOT
3.2.0
3.2.1
3.2.2
@@ -325,16 +401,20 @@
3.3.0
3.3.1
3.3.2
- 3.3.3-SNAPSHOT
+ 3.3.3
+ 3.3.4-SNAPSHOT
3.4.0
3.4.1
+ 3.4.2-SNAPSHOT
3.5.0-SNAPSHOT
+ 3.5.1-SNAPSHOT
+ 4.0.0-SNAPSHOT
2.12
4.3.0
2.0.2
3.2.0
2.12.15
- ${spark311.version}
+ ${spark333.version}
2.0
3.5.1
3.0.5
diff --git a/core/src/main/resources/operatorsScore-databricks-aws.csv b/core/src/main/resources/operatorsScore-databricks-aws.csv
index eabc0078a..b80514227 100644
--- a/core/src/main/resources/operatorsScore-databricks-aws.csv
+++ b/core/src/main/resources/operatorsScore-databricks-aws.csv
@@ -155,6 +155,8 @@ MapKeys,2.45
MapValues,2.45
Max,2.45
Md5,2.45
+MicrosToTimestamp,2.45
+MillisToTimestamp,2.45
Min,2.45
Minute,2.45
MonotonicallyIncreasingID,2.45
@@ -193,6 +195,7 @@ RowNumber,2.45
ScalaUDF,2.45
ScalarSubquery,2.45
Second,2.45
+SecondsToTimestamp,2.45
Sequence,2.45
ShiftLeft,2.45
ShiftRight,2.45
@@ -217,6 +220,7 @@ StringRepeat,2.45
StringReplace,2.45
StringSplit,2.45
StringToMap,2.45
+StringTranslate,2.45
StringTrim,2.45
StringTrimLeft,2.45
StringTrimRight,2.45
diff --git a/core/src/main/resources/operatorsScore-databricks-azure.csv b/core/src/main/resources/operatorsScore-databricks-azure.csv
index dbf7f9da3..efc7b7215 100644
--- a/core/src/main/resources/operatorsScore-databricks-azure.csv
+++ b/core/src/main/resources/operatorsScore-databricks-azure.csv
@@ -155,6 +155,8 @@ MapKeys,2.73
MapValues,2.73
Max,2.73
Md5,2.73
+MicrosToTimestamp,2.73
+MillisToTimestamp,2.73
Min,2.73
Minute,2.73
MonotonicallyIncreasingID,2.73
@@ -193,6 +195,7 @@ RowNumber,2.73
ScalaUDF,2.73
ScalarSubquery,2.73
Second,2.73
+SecondsToTimestamp,2.73
Sequence,2.73
ShiftLeft,2.73
ShiftRight,2.73
@@ -217,6 +220,7 @@ StringRepeat,2.73
StringReplace,2.73
StringSplit,2.73
StringToMap,2.73
+StringTranslate,2.73
StringTrim,2.73
StringTrimLeft,2.73
StringTrimRight,2.73
diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv
index 66ff092de..0fdaea9d2 100644
--- a/core/src/main/resources/operatorsScore-dataproc-l4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv
@@ -155,6 +155,8 @@ MapKeys,4.16
MapValues,4.16
Max,4.16
Md5,4.16
+MicrosToTimestamp,4.16
+MillisToTimestamp,4.16
Min,4.16
Minute,4.16
MonotonicallyIncreasingID,4.16
@@ -193,6 +195,7 @@ RowNumber,4.16
ScalaUDF,4.16
ScalarSubquery,4.16
Second,4.16
+SecondsToTimestamp,4.16
Sequence,4.16
ShiftLeft,4.16
ShiftRight,4.16
@@ -217,6 +220,7 @@ StringRepeat,4.16
StringReplace,4.16
StringSplit,4.16
StringToMap,4.16
+StringTranslate,4.16
StringTrim,4.16
StringTrimLeft,4.16
StringTrimRight,4.16
diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv
index 2ea664b85..48ab44cab 100644
--- a/core/src/main/resources/operatorsScore-dataproc-t4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv
@@ -155,6 +155,8 @@ MapKeys,4.88
MapValues,4.88
Max,4.88
Md5,4.88
+MicrosToTimestamp,4.88
+MillisToTimestamp,4.88
Min,4.88
Minute,4.88
MonotonicallyIncreasingID,4.88
@@ -193,6 +195,7 @@ RowNumber,4.88
ScalaUDF,4.88
ScalarSubquery,4.88
Second,4.88
+SecondsToTimestamp,4.88
Sequence,4.88
ShiftLeft,4.88
ShiftRight,4.88
@@ -217,6 +220,7 @@ StringRepeat,4.88
StringReplace,4.88
StringSplit,4.88
StringToMap,4.88
+StringTranslate,4.88
StringTrim,4.88
StringTrimLeft,4.88
StringTrimRight,4.88
diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv
index b233887ec..ba697fb92 100644
--- a/core/src/main/resources/operatorsScore-emr-a10.csv
+++ b/core/src/main/resources/operatorsScore-emr-a10.csv
@@ -155,6 +155,8 @@ MapKeys,2.59
MapValues,2.59
Max,2.59
Md5,2.59
+MicrosToTimestamp,2.59
+MillisToTimestamp,2.59
Min,2.59
Minute,2.59
MonotonicallyIncreasingID,2.59
@@ -193,6 +195,7 @@ RowNumber,2.59
ScalaUDF,2.59
ScalarSubquery,2.59
Second,2.59
+SecondsToTimestamp,2.59
Sequence,2.59
ShiftLeft,2.59
ShiftRight,2.59
@@ -217,6 +220,7 @@ StringRepeat,2.59
StringReplace,2.59
StringSplit,2.59
StringToMap,2.59
+StringTranslate,2.59
StringTrim,2.59
StringTrimLeft,2.59
StringTrimRight,2.59
diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv
index 57f54922d..1662703c8 100644
--- a/core/src/main/resources/operatorsScore-emr-t4.csv
+++ b/core/src/main/resources/operatorsScore-emr-t4.csv
@@ -155,6 +155,8 @@ MapKeys,2.07
MapValues,2.07
Max,2.07
Md5,2.07
+MicrosToTimestamp,2.07
+MillisToTimestamp,2.07
Min,2.07
Minute,2.07
MonotonicallyIncreasingID,2.07
@@ -193,6 +195,7 @@ RowNumber,2.07
ScalaUDF,2.07
ScalarSubquery,2.07
Second,2.07
+SecondsToTimestamp,2.07
Sequence,2.07
ShiftLeft,2.07
ShiftRight,2.07
@@ -217,6 +220,7 @@ StringRepeat,2.07
StringReplace,2.07
StringSplit,2.07
StringToMap,2.07
+StringTranslate,2.07
StringTrim,2.07
StringTrimLeft,2.07
StringTrimRight,2.07
diff --git a/core/src/main/resources/operatorsScore.csv b/core/src/main/resources/operatorsScore.csv
index 4ffb0d902..7c00bb79b 100644
--- a/core/src/main/resources/operatorsScore.csv
+++ b/core/src/main/resources/operatorsScore.csv
@@ -160,6 +160,8 @@ MapKeys,4
MapValues,4
Max,4
Md5,4
+MicrosToTimestamp,4
+MillisToTimestamp,4
Min,4
Minute,4
MonotonicallyIncreasingID,4
@@ -198,6 +200,7 @@ RowNumber,4
ScalaUDF,4
ScalarSubquery,4
Second,4
+SecondsToTimestamp,4
Sequence,4
ShiftLeft,4
ShiftRight,4
@@ -222,6 +225,7 @@ StringRepeat,4
StringReplace,4
StringSplit,4
StringToMap,4
+StringTranslate,4
StringTrim,4
StringTrimLeft,4
StringTrimRight,4
diff --git a/core/src/main/resources/supportedExprs.csv b/core/src/main/resources/supportedExprs.csv
index c1ff8979a..7358bb832 100644
--- a/core/src/main/resources/supportedExprs.csv
+++ b/core/src/main/resources/supportedExprs.csv
@@ -330,6 +330,10 @@ MapValues,S,`map_values`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
MapValues,S,`map_values`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA
Md5,S,`md5`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA
Md5,S,`md5`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
+MicrosToTimestamp,S,`timestamp_micros`,None,project,input,NA,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+MicrosToTimestamp,S,`timestamp_micros`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
+MillisToTimestamp,S,`timestamp_millis`,None,project,input,NA,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+MillisToTimestamp,S,`timestamp_millis`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
Minute,S,`minute`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
Minute,S,`minute`,None,project,result,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
MonotonicallyIncreasingID,S,`monotonically_increasing_id`,None,project,result,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
@@ -377,8 +381,8 @@ Pow,S,`pow`; `power`,None,AST,rhs,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA
Pow,S,`pow`; `power`,None,AST,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
PreciseTimestampConversion,S, ,None,project,input,NA,NA,NA,NA,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
PreciseTimestampConversion,S, ,None,project,result,NA,NA,NA,NA,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
-PromotePrecision,S, ,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA
-PromotePrecision,S, ,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA
+PromotePrecision,S,`promote_precision`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA
+PromotePrecision,S,`promote_precision`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA
PythonUDF,S, ,None,aggregation,param,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS
PythonUDF,S, ,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NA,PS,NS,PS,NA
PythonUDF,S, ,None,reduction,param,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS
@@ -430,6 +434,8 @@ ScalaUDF,S, ,None,project,param,S,S,S,S,S,S,S,S,PS,S,S,S,S,S,PS,PS,PS,NS
ScalaUDF,S, ,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,S,PS,PS,PS,NS
Second,S,`second`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
Second,S,`second`,None,project,result,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+SecondsToTimestamp,S,`timestamp_seconds`,None,project,input,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA
+SecondsToTimestamp,S,`timestamp_seconds`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA
Sequence,S,`sequence`,None,project,start,NA,S,S,S,S,NA,NA,NS,NS,NA,NA,NA,NA,NA,NA,NA,NA,NA
Sequence,S,`sequence`,None,project,stop,NA,S,S,S,S,NA,NA,NS,NS,NA,NA,NA,NA,NA,NA,NA,NA,NA
Sequence,S,`sequence`,None,project,step,NA,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NS,NA,NA,NA,NA
@@ -501,6 +507,10 @@ StringToMap,S,`str_to_map`,None,project,str,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,N
StringToMap,S,`str_to_map`,None,project,pairDelim,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
StringToMap,S,`str_to_map`,None,project,keyValueDelim,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
StringToMap,S,`str_to_map`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA
+StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
+StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,from,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
+StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,to,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
+StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
StringTrim,S,`trim`,None,project,src,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
StringTrim,S,`trim`,None,project,trimStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
StringTrim,S,`trim`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala
index 8b7b05599..e9f24b8b4 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala
@@ -67,11 +67,20 @@ object EventLogPathProcessor extends Logging {
val SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER =
SPARK_SHORT_COMPRESSION_CODEC_NAMES ++ Set("gz")
+ // Show a special message if the eventlog is one of the following formats
+ // https://github.com/NVIDIA/spark-rapids-tools/issues/506
+ val EVENTLOGS_IN_PLAIN_TEXT_CODEC_NAMES = Set("log", "txt")
+
def eventLogNameFilter(logFile: Path): Boolean = {
EventLogFileWriter.codecName(logFile)
.forall(suffix => SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.contains(suffix))
}
+ def isPlainTxtFileName(logFile: Path): Boolean = {
+ EventLogFileWriter.codecName(logFile)
+ .forall(suffix => EVENTLOGS_IN_PLAIN_TEXT_CODEC_NAMES.contains(suffix))
+ }
+
// Databricks has the latest events in file named eventlog and then any rolled in format
// eventlog-2021-06-14--20-00.gz, here we assume that if any files start with eventlog
// then the directory is a Databricks event log directory.
@@ -126,10 +135,17 @@ object EventLogPathProcessor extends Logging {
val fileName = filePath.getName()
if (fileStatus.isFile() && !eventLogNameFilter(filePath)) {
- logWarning(s"File: $fileName it not a supported file type. " +
- "Supported compression types are: " +
- s"${SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.mkString(", ")}. " +
- "Skipping this file.")
+ val msg = if (isPlainTxtFileName(filePath)) {
+ // if the file is plain text, we want to show that the filePath without an extension
+ // could be supported.
+ s"File: $fileName. Detected a plain text file. No extension is expected. Skipping this file."
+ } else {
+ s"File: $fileName is not a supported file type. " +
+ "Supported compression types are: " +
+ s"${SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.mkString(", ")}. " +
+ "Skipping this file."
+ }
+ logWarning(msg)
Map.empty[EventLogInfo, Long]
} else if (fileStatus.isDirectory && isEventLogDir(fileStatus)) {
// either event logDir v2 directory or regular event log
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
index 18ea53d7a..7fc8ef864 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
@@ -76,7 +76,12 @@ object SQLPlanParser extends Logging {
val windowFunctionPattern = """(\w+)\(""".r
- val ignoreExpressions = Array("any", "cast", "decimal", "decimaltype", "every", "some", "list")
+ val ignoreExpressions = Array("any", "cast", "decimal", "decimaltype", "every", "some",
+ "list",
+ // current_database does not cause any CPU fallbacks
+ "current_database",
+ // ArrayBuffer is a Scala class and may appear in some of the JavaRDDs/UDAFs
+ "arraybuffer")
/**
* This function is used to create a set of nodes that should be skipped while parsing the Execs
@@ -404,25 +409,33 @@ object SQLPlanParser extends Logging {
// This parser is used for SortAggregateExec, HashAggregateExec and ObjectHashAggregateExec
def parseAggregateExpressions(exprStr: String): Array[String] = {
val parsedExpressions = ArrayBuffer[String]()
- // (key=[num#83], functions=[partial_collect_list(letter#84, 0, 0), partial_count(letter#84)])
- val pattern = """functions=\[([\w#, +*\\\-\.<>=\`\(\)]+\])""".r
- val aggregatesString = pattern.findFirstMatchIn(exprStr)
- // This is to split multiple column names in AggregateExec. Each column will be aggregating
- // based on the aggregate function. Here "partial_" and "merge_" is removed and
- // only function name is preserved. Below regex will first remove the
- // "functions=" from the string followed by removing "partial_" and "merge_". That string is
- // split which produces an array containing column names. Finally we remove the parentheses
- // from the beginning and end to get only the expressions. Result will be as below.
- // paranRemoved = Array(collect_list(letter#84, 0, 0),, count(letter#84))
- if (aggregatesString.isDefined) {
- val paranRemoved = aggregatesString.get.toString.replaceAll("functions=", "").
- replaceAll("partial_", "").replaceAll("merge_", "").split("(?<=\\),)").map(_.trim).
- map(_.replaceAll("""^\[+""", "").replaceAll("""\]+$""", ""))
- paranRemoved.foreach { case expr =>
- val functionName = getFunctionName(functionPattern, expr)
- functionName match {
- case Some(func) => parsedExpressions += func
- case _ => // NO OP
+ // (keys=[num#83], functions=[partial_collect_list(letter#84, 0, 0), partial_count(letter#84)])
+ // Currently we only parse the function expressions.
+ // "Keys" parsing is disabled for now because we won't be able to detect the types
+
+ // A map (clause name -> parseEnabled) that controls which clauses get parsed
+ val patternMap = Map(
+ "functions" -> true,
+ "keys" -> false
+ )
+ // It won't hurt to define a pattern that is neutral to the order of the functions/keys.
+ // This can avoid mismatches when exprStr comes in the form of (functions=[], keys=[]).
+ val pattern = """^\((keys|functions)=\[(.*)\]\s*,\s*(keys|functions)=\[(.*)\]\s*\)$""".r
+ // Iterate through the matches and exclude disabled clauses
+ pattern.findAllMatchIn(exprStr).foreach { m =>
+ // The matching groups are:
+ // 0 -> entire expression
+ // 1 -> "keys"; 2 -> keys' expression
+ // 3 -> "functions"; 4 -> functions' expression
+ Array(1, 3).foreach { group_ind =>
+ val group_value = m.group(group_ind)
+ if (patternMap.getOrElse(group_value, false)) {
+ val clauseExpr = m.group(group_ind + 1)
+ // Here "partial_" and "merge_" is removed and only function name is preserved.
+ val processedExpr = clauseExpr.replaceAll("partial_", "").replaceAll("merge_", "")
+ // No need to split the expr any further because we are only interested in function names
+ val used_functions = getAllFunctionNames(functionPrefixPattern, processedExpr)
+ parsedExpressions ++= used_functions
}
}
}
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala
index 6a974cf21..86909d98d 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala
@@ -546,6 +546,7 @@ class AutoTuner(
/**
* Flow:
+ * if "spark.master" is standalone => Do Nothing
* if "spark.rapids.memory.pinnedPool.size" is set
* if yarn -> recommend "spark.executor.memoryOverhead"
* if using k8s ->
@@ -553,13 +554,15 @@ class AutoTuner(
* else recommend "spark.kubernetes.memoryOverheadFactor" and add comment if missing
*/
def addRecommendationForMemoryOverhead(recomValue: String): Unit = {
- val memOverheadLookup = memoryOverheadLabel
- appendRecommendationForMemoryMB(memOverheadLookup, recomValue)
- getPropertyValue("spark.rapids.memory.pinnedPool.size").foreach { lookup =>
- if (lookup != "spark.executor.memoryOverhead") {
- if (getPropertyValue(memOverheadLookup).isEmpty) {
- appendComment(s"'$memOverheadLookup' must be set if using " +
- s"'spark.rapids.memory.pinnedPool.size")
+ if (enableMemoryOverheadRecommendation(getPropertyValue("spark.master"))) {
+ val memOverheadLookup = memoryOverheadLabel
+ appendRecommendationForMemoryMB(memOverheadLookup, recomValue)
+ getPropertyValue("spark.rapids.memory.pinnedPool.size").foreach { lookup =>
+ if (lookup != "spark.executor.memoryOverhead") {
+ if (getPropertyValue(memOverheadLookup).isEmpty) {
+ appendComment(s"'$memOverheadLookup' must be set if using " +
+ s"'spark.rapids.memory.pinnedPool.size")
+ }
}
}
}
@@ -706,7 +709,14 @@ class AutoTuner(
s" $jarURL\n" +
s" Version used in application is $jarVer.")
}
- case None => logError("Could not pull the latest release of plugin jar.")
+ case None =>
+ logError("Could not pull the latest release of RAPIDS-plugin jar.")
+ val pluginRepoUrl = WebCrawlerUtil.getMVNArtifactURL("rapids.plugin")
+ appendComment(
+ "Failed to validate the latest release of Apache Spark plugin.\n" +
+ s" Verify that the version used in application ($jarVer) is the latest on:\n" +
+ s" $pluginRepoUrl")
+
}
}
}
@@ -1131,4 +1141,18 @@ object AutoTuner extends Logging {
f"$sizeNum%.2f$sizeUnit"
}
}
+
+ /**
+ * Given the spark property "spark.master", it checks whether memoryOverhead should be
+ * enabled/disabled. For Spark Standalone Mode, memoryOverhead property is skipped.
+ * @param confValue the value of property "spark.master"
+ * @return False if the value is a spark standalone. True if the value is not defined or
+ * set for yarn/Mesos
+ */
+ def enableMemoryOverheadRecommendation(confValue: Option[String]): Boolean = {
+ confValue match {
+ case Some(sparkMaster) if sparkMaster.startsWith("spark:") => false
+ case _ => true
+ }
+ }
}
diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala
index 9d0074ee5..afa00e7bf 100644
--- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala
+++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala
@@ -41,7 +41,10 @@ object WebCrawlerUtil extends Logging {
private val ARTIFACT_VERSION_REGEX = "\\d{2}\\.\\d{2}\\.\\d+/"
// given an artifactID returns the full mvn url that lists all the
// releases
- private def getMVNArtifactURL(artifactID: String) : String = s"$NV_MVN_BASE_URL/$artifactID"
+ def getMVNArtifactURL(artifactID: String) : String = {
+ val artifactUrlPart = NV_ARTIFACTS_LOOKUP.getOrElse(artifactID, artifactID)
+ s"$NV_MVN_BASE_URL/$artifactUrlPart"
+ }
/**
* Given a valid URL, this method recursively picks all the hrefs defined in the HTML doc.
@@ -72,7 +75,7 @@ object WebCrawlerUtil extends Logging {
}
} catch {
case x: IOException =>
- logError(s"Exception while visiting webURL $webURL", x)
+ logWarning(s"Exception while visiting webURL $webURL: ${x.toString}")
}
}
}
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala
index e080a0e66..6f18946f3 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala
@@ -68,6 +68,11 @@ class BaseTestSuite extends FunSuite with BeforeAndAfterEach with Logging {
"Spark340 does not parse the eventlog correctly")
}
+ protected def ignoreExprForSparkGTE340(): (Boolean, String) = {
+ (!ToolUtils.isSpark340OrLater(),
+ "Spark340+ does not support the expression")
+ }
+
def runConditionalTest(testName: String, assumeCondition: () => (Boolean, String))
(fun: => Unit): Unit = {
val (isAllowed, ignoreMessage) = assumeCondition()
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala
index ca9dfe573..a5a7c8262 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala
@@ -30,11 +30,10 @@ import org.scalatest.exceptions.TestFailedException
import org.apache.spark.sql.TrampolineUtil
import org.apache.spark.sql.expressions.Window
-import org.apache.spark.sql.functions.{ceil, col, collect_list, count, explode, floor, hex, json_tuple, round, row_number, sum}
+import org.apache.spark.sql.functions.{ceil, col, collect_list, count, explode, floor, hex, json_tuple, round, row_number, sum, translate}
import org.apache.spark.sql.rapids.tool.ToolUtils
import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo
import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil
-import org.apache.spark.sql.types.StringType
class SQLPlanParserSuite extends BaseTestSuite {
@@ -864,6 +863,7 @@ class SQLPlanParserSuite extends BaseTestSuite {
val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir,
"ProjectExprsSupported") { spark =>
import spark.implicits._
+ import org.apache.spark.sql.types.StringType
val df1 = Seq(9.9, 10.2, 11.6, 12.5).toDF("value")
df1.write.parquet(s"$parquetoutputLoc/testtext")
val df2 = spark.read.parquet(s"$parquetoutputLoc/testtext")
@@ -916,6 +916,66 @@ class SQLPlanParserSuite extends BaseTestSuite {
}
}
+ test("translate is supported in ProjectExec") {
+ TrampolineUtil.withTempDir { parquetoutputLoc =>
+ TrampolineUtil.withTempDir { eventLogDir =>
+ val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir,
+ "ProjectExprsSupported") { spark =>
+ import spark.implicits._
+ val df1 = Seq("", "abc", "ABC", "AaBbCc").toDF("value")
+ // write df1 to parquet to transform LocalTableScan to ProjectExec
+ df1.write.parquet(s"$parquetoutputLoc/testtext")
+ val df2 = spark.read.parquet(s"$parquetoutputLoc/testtext")
+ // translate should be part of ProjectExec
+ df2.select(translate(df2("value"), "ABC", "123"))
+ }
+ val pluginTypeChecker = new PluginTypeChecker()
+ val app = createAppFromEventlog(eventLog)
+ assert(app.sqlPlans.size == 2)
+ val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
+ SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
+ }
+ val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq)
+ val wholeStages = allExecInfo.filter(_.exec.contains("WholeStageCodegen"))
+ assert(wholeStages.size == 1)
+ assert(wholeStages.forall(_.duration.nonEmpty))
+ val allChildren = wholeStages.flatMap(_.children).flatten
+ val projects = allChildren.filter(_.exec == "Project")
+ assertSizeAndSupported(1, projects)
+ }
+ }
+ }
+
+ test("Timestamp functions supported in ProjectExec") {
+ TrampolineUtil.withTempDir { parquetoutputLoc =>
+ TrampolineUtil.withTempDir { eventLogDir =>
+ val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir,
+ "ProjectExprsSupported") { spark =>
+ import spark.implicits._
+ val init_df = Seq((1230219000123123L, 1230219000123L, 1230219000.123))
+ val df1 = init_df.toDF("micro", "millis", "seconds")
+ df1.write.parquet(s"$parquetoutputLoc/testtext")
+ val df2 = spark.read.parquet(s"$parquetoutputLoc/testtext")
+ df2.selectExpr("timestamp_micros(micro)", "timestamp_millis(millis)",
+ "timestamp_seconds(seconds)")
+ }
+ val pluginTypeChecker = new PluginTypeChecker()
+ val app = createAppFromEventlog(eventLog)
+ assert(app.sqlPlans.size == 2)
+ val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
+ SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
+ }
+ val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq)
+ val wholeStages = allExecInfo.filter(_.exec.contains("WholeStageCodegen"))
+ assert(wholeStages.size == 1)
+ assert(wholeStages.forall(_.duration.nonEmpty))
+ val allChildren = wholeStages.flatMap(_.children).flatten
+ val projects = allChildren.filter(_.exec == "Project")
+ assertSizeAndSupported(1, projects)
+ }
+ }
+ }
+
test("Parse SQL function Name in HashAggregateExec") {
TrampolineUtil.withTempDir { eventLogDir =>
val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "sqlmetric") { spark =>
@@ -1022,4 +1082,119 @@ class SQLPlanParserSuite extends BaseTestSuite {
val expressions = SQLPlanParser.parseFilterExpressions(exprString)
expressions should ===(expected)
}
+
+
+ test("Parse aggregate expressions") {
+ val exprString = "(keys=[], functions=[split(split(split(replace(replace(replace(replace(" +
+ "trim(replace(cast(unbase64(content#192) as string), , ), Some( )), *., ), *, ), " +
+ "https://, ), http://, ), /, -1)[0], :, -1)[0], \\?, -1)[0]#199, " +
+ "CASE WHEN (instr(replace(cast(unbase64(content#192) as string), , ), *) = 0) " +
+ "THEN concat(replace(cast(unbase64(content#192) as string), , ), %) " +
+ "ELSE replace(replace(replace(cast(unbase64(content#192) as string), , ), %, " +
+ "\\%), *, %) END#200])"
+ val expected = Array("replace", "concat", "instr", "split", "trim", "unbase64")
+ val expressions = SQLPlanParser.parseAggregateExpressions(exprString)
+ expressions should ===(expected)
+ }
+
+ runConditionalTest("promote_precision is supported for Spark LT 3.4.0: issue-517",
+ ignoreExprForSparkGTE340) {
+ // Spark-3.4.0 removed the promote_precision SQL function
+ // the SQL generates the following physical plan
+ // (1) Project [CheckOverflow((promote_precision(cast(dec1#24 as decimal(13,2)))
+ // + promote_precision(cast(dec2#25 as decimal(13,2)))), DecimalType(13,2))
+ // AS (dec1 + dec2)#30]
+ // For Spark 3.4.0, promote_precision is removed from the plan:
+ // (1) Project [(dec1#24 + dec2#25) AS (dec1 + dec2)#30]
+ TrampolineUtil.withTempDir { parquetoutputLoc =>
+ TrampolineUtil.withTempDir { eventLogDir =>
+ val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir,
+ "projectPromotePrecision") { spark =>
+ import spark.implicits._
+ import org.apache.spark.sql.types.DecimalType
+ val df = Seq(("12347.21", "1234154"), ("92233.08", "1")).toDF
+ .withColumn("dec1", col("_1").cast(DecimalType(7, 2)))
+ .withColumn("dec2", col("_2").cast(DecimalType(10, 0)))
+ // write the df to parquet to transform localTableScan to projectExec
+ df.write.parquet(s"$parquetoutputLoc/testPromotePrecision")
+ val df2 = spark.read.parquet(s"$parquetoutputLoc/testPromotePrecision")
+ df2.selectExpr("dec1+dec2")
+ }
+ val pluginTypeChecker = new PluginTypeChecker()
+ val app = createAppFromEventlog(eventLog)
+ val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
+ SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
+ }
+ // The promote_precision should be part of the project exec.
+ val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq)
+ val projExecs = allExecInfo.filter(_.exec.contains("Project"))
+ assertSizeAndSupported(1, projExecs)
+ }
+ }
+ }
+
+ test("current_database is not listed as unsupported: issue-547") {
+ // current_database is a scalar value that does not cause any CPU fallbacks.
+ // We should not list it as an unsupported expression if it appears in the SQL.
+ TrampolineUtil.withTempDir { parquetoutputLoc =>
+ TrampolineUtil.withTempDir { eventLogDir =>
+ val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir,
+ "ignoreCurrentDatabase") { spark =>
+ import spark.implicits._
+ val df1 = spark.sparkContext.parallelize(List(10, 20, 30, 40)).toDF
+ df1.write.parquet(s"$parquetoutputLoc/ignore_current_database")
+ val df2 = spark.read.parquet(s"$parquetoutputLoc/ignore_current_database")
+ // Note that current_database will show up in the Project only if it is used as a column
+ // name. For example:
+ // > SELECT current_database(), current_database() as my_db;
+ // +------------------+-------+
+ // |current_database()| my_db|
+ // +------------------+-------+
+ // | default|default|
+ // | default|default|
+ // | default|default|
+ // | default|default|
+ // +------------------+-------+
+ // == Physical Plan ==
+ // *(1) Project [default AS current_database()#10, default AS my_db#9]
+ // +- *(1) ColumnarToRow
+ // +- FileScan parquet [] Batched: true, DataFilters: [], Format: Parquet, Location:
+ // InMemoryFileIndex(1 paths)[file:/tmp_folder/T/toolTest..., PartitionFilters: []
+ // PushedFilters: [], ReadSchema: struct<>
+ df2.selectExpr("current_database()","current_database() as my_db")
+ }
+ val pluginTypeChecker = new PluginTypeChecker()
+ val app = createAppFromEventlog(eventLog)
+ val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
+ SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
+ }
+ // The current_database should be part of the project-exec and the parser should ignore it.
+ val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq)
+ val projExecs = allExecInfo.filter(_.exec.contains("Project"))
+ assertSizeAndSupported(1, projExecs)
+ }
+ }
+ }
+
+ test("ArrayBuffer should be ignored in Expand: issue-554") {
+ // ArrayBuffer appears in some SQL queries.
+ // It is a non-Spark expression and should not be considered a SQL function.
+ val exprString = "[" +
+ "ArrayBuffer(cast((cast(ds#1241L as double) / 100.0) as bigint), null, null, 0," +
+ " cast(if ((ret_type#1236 = 2)) 1 else 0 as bigint))," +
+ "ArrayBuffer(cast((cast(ds#1241L as double) / 100.0) as bigint), wuid#1234, null, 1, null)," +
+ "ArrayBuffer(cast((cast(ds#1241L as double) / 100.0) as bigint), null," +
+ " if ((if ((ret_type#1236 = 2)) 1 else 0 = 1)) wuid#1234 else null, 2, null)," +
+ "[" +
+ "CAST((CAST(supersql_t12.`ds` AS DOUBLE) / 100.0D) AS BIGINT)#1297L," +
+ "supersql_t12.`wuid`#1298," +
+ "(IF(((IF((supersql_t12.`ret_type` = 2), 1, 0)) = 1), supersql_t12.`wuid`," +
+ "CAST(NULL AS STRING)))#1299," +
+ "gid#1296," +
+ "CAST((IF((supersql_t12.`ret_type` = 2), 1, 0)) AS BIGINT)#1300L]]"
+ // Only "IF" should be picked up as a function name
+ val expected = Array("IF")
+ val expressions = SQLPlanParser.parseExpandExpressions(exprString)
+ expressions should ===(expected)
+ }
}
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala
index 86f0fad3e..cbcf439d9 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala
@@ -1230,4 +1230,67 @@ class AutoTunerSuite extends FunSuite with BeforeAndAfterEach with Logging {
// Assert recommendations are skipped in comments
assert(comments.map(_.comment).forall(autoTuner.selectedPlatform.isValidComment))
}
+
+ // When Spark is running in standalone mode, the memoryOverhead should not be listed as a
+ // recommendation: issue-553.
+ test("memoryOverhead should not be recommended for Spark Standalone") {
+ // This UT sets a custom spark-property "spark.master" pointing to a spark-standalone value
+ // The AutoTuner should detect that the spark-master is standalone and refrain from
+ // recommending a memoryOverhead value.
+ val customProps = mutable.LinkedHashMap(
+ "spark.executor.cores" -> "8",
+ "spark.executor.memory" -> "47222m",
+ "spark.rapids.shuffle.multiThreaded.reader.threads" -> "8",
+ "spark.rapids.shuffle.multiThreaded.writer.threads" -> "8",
+ "spark.rapids.sql.concurrentGpuTasks" -> "2",
+ "spark.rapids.sql.multiThreadedRead.numThreads" -> "20",
+ "spark.shuffle.manager" -> "com.nvidia.spark.rapids.spark311.RapidsShuffleManager",
+ "spark.task.resource.gpu.amount" -> "0.0625")
+ // mock the properties loaded from eventLog
+ val logEventsProps: mutable.Map[String, String] =
+ mutable.LinkedHashMap[String, String](
+ "spark.master" -> "spark://HOST_NAME:7077",
+ "spark.executor.cores" -> "16",
+ "spark.executor.instances" -> "1",
+ "spark.executor.memory" -> "80g",
+ "spark.executor.resource.gpu.amount" -> "1",
+ "spark.executor.instances" -> "1",
+ "spark.sql.shuffle.partitions" -> "200",
+ "spark.sql.files.maxPartitionBytes" -> "1g",
+ "spark.task.resource.gpu.amount" -> "0.0625",
+ "spark.rapids.memory.pinnedPool.size" -> "5g",
+ "spark.rapids.sql.enabled" -> "true",
+ "spark.plugins" -> "com.nvidia.spark.SQLPlugin",
+ "spark.rapids.sql.concurrentGpuTasks" -> "4")
+ val dataprocWorkerInfo = buildWorkerInfoAsString(Some(customProps), Some(32),
+ Some("212992MiB"), Some(5), Some(4), Some("15109MiB"), Some("Tesla T4"))
+ val infoProvider = getMockInfoProvider(8126464.0, Seq(0), Seq(0.004), logEventsProps,
+ Some(defaultSparkVersion))
+ val autoTuner: AutoTuner = AutoTuner.buildAutoTunerFromProps(dataprocWorkerInfo, infoProvider)
+ val (properties, comments) = autoTuner.getRecommendedProperties()
+ val autoTunerOutput = Profiler.getAutoTunerResultsAsString(properties, comments)
+ // scalastyle:off line.size.limit
+ val expectedResults =
+ s"""|
+ |Spark Properties:
+ |--conf spark.executor.cores=8
+ |--conf spark.executor.instances=20
+ |--conf spark.executor.memory=16384m
+ |--conf spark.rapids.memory.pinnedPool.size=4096m
+ |--conf spark.rapids.sql.concurrentGpuTasks=2
+ |--conf spark.sql.adaptive.advisoryPartitionSizeInBytes=128m
+ |--conf spark.sql.adaptive.coalescePartitions.minPartitionNum=160
+ |--conf spark.sql.files.maxPartitionBytes=4096m
+ |--conf spark.task.resource.gpu.amount=0.125
+ |
+ |Comments:
+ |- 'spark.sql.adaptive.advisoryPartitionSizeInBytes' was not set.
+ |- 'spark.sql.adaptive.coalescePartitions.minPartitionNum' was not set.
+ |- 'spark.sql.adaptive.enabled' should be enabled for better performance.
+ |- ${AutoTuner.classPathComments("rapids.jars.missing")}
+ |- ${AutoTuner.classPathComments("rapids.shuffle.jars")}
+ |""".stripMargin
+ // scalastyle:on line.size.limit
+ assert(expectedResults == autoTunerOutput)
+ }
}
diff --git a/user_tools/custom_speedup_factors/generate_speedup_factors.py b/user_tools/custom_speedup_factors/generate_speedup_factors.py
index edf6b538b..c09afddbd 100644
--- a/user_tools/custom_speedup_factors/generate_speedup_factors.py
+++ b/user_tools/custom_speedup_factors/generate_speedup_factors.py
@@ -66,11 +66,11 @@
cpu_sql_combined = cpu_sql_times.set_index('nodeName').join(mapping_info.set_index('SQL Node'), how='left')
# - parse WholeStageCodegen durations with child node mapping
- cpu_sql_times_df = cpu_sql_combined[['Child Node', 'max_value']]
+ cpu_sql_times_df = cpu_sql_combined[['Child Node', 'total']]
for index, row in cpu_sql_times_df.iterrows():
operators = str(row['Child Node']).split(',')
- duration = row['max_value']/len(operators)/1000.0
+ duration = row['total']/len(operators)/1000.0
for operator in operators:
if operator in cpu_stage_log[app_name]:
cpu_stage_log[app_name][operator] = cpu_stage_log[app_name][operator] + duration
@@ -155,8 +155,6 @@
scores_dict["FileSourceScanExec"] = str(round(cpu_stage_totals['Scan orc '] / gpu_stage_totals['GpuScan orc '], 2))
# Other operators
-if 'Project' in cpu_stage_totals and 'GpuProject' in gpu_stage_totals:
- scores_dict["ProjectExec"] = str(round(cpu_stage_totals['Project'] / gpu_stage_totals['GpuProject'], 2))
if 'Expand' in cpu_stage_totals and 'GpuExpand' in gpu_stage_totals:
scores_dict["ExpandExec"] = str(round(cpu_stage_totals['Expand'] / gpu_stage_totals['GpuExpand'], 2))
if 'CartesianProduct' in cpu_stage_totals and 'GpuCartesianProduct' in gpu_stage_totals:
@@ -173,6 +171,10 @@
scores_dict["HashAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2))
scores_dict["ObjectHashAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2))
scores_dict["SortAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2))
+if 'TakeOrderedAndProject' in cpu_stage_totals and 'GpuTopN' in gpu_stage_totals:
+ scores_dict["TakeOrderedAndProjectExec"] = str(round(cpu_stage_totals['TakeOrderedAndProject'] / gpu_stage_totals['GpuTopN'], 2))
+if 'BroadcastNestedLoopJoin' in cpu_stage_totals and 'GpuBroadcastNestedLoopJoin' in gpu_stage_totals:
+ scores_dict["BroadcastNestedLoopJoinExec"] = str(round(cpu_stage_totals['BroadcastNestedLoopJoin'] / gpu_stage_totals['GpuBroadcastNestedLoopJoin'], 2))
# Set minimum to 1.0 for speedup factors
for key in scores_dict:
diff --git a/user_tools/custom_speedup_factors/operatorsList.csv b/user_tools/custom_speedup_factors/operatorsList.csv
index bb60ebf1c..97665b164 100644
--- a/user_tools/custom_speedup_factors/operatorsList.csv
+++ b/user_tools/custom_speedup_factors/operatorsList.csv
@@ -11,7 +11,6 @@ ProjectExec
RangeExec
SampleExec
SortExec
-SubqueryBroadcastExec
TakeOrderedAndProjectExec
HashAggregateExec
ObjectHashAggregateExec
@@ -19,7 +18,6 @@ SortAggregateExec
DataWritingCommandExec
ExecutedCommandExec
BatchScanExec
-BroadcastExchangeExec
ShuffleExchangeExec
BroadcastHashJoinExec
BroadcastNestedLoopJoinExec
diff --git a/user_tools/custom_speedup_factors/validate_qualification_estimates.py b/user_tools/custom_speedup_factors/validate_qualification_estimates.py
index 036b75245..26e6b8ae2 100644
--- a/user_tools/custom_speedup_factors/validate_qualification_estimates.py
+++ b/user_tools/custom_speedup_factors/validate_qualification_estimates.py
@@ -67,7 +67,7 @@
gpu_profile_dir = gpu_profile
else:
gpu_profile_dir = f"{output}/gpu_profile"
- subprocess.run(f"spark_rapids_user_tools onprem profiling --csv --local_folder {gpu_profile_dir} --eventlogs {gpu_log}", shell=True)
+ subprocess.run(f"spark_rapids_user_tools onprem profiling --csv {jar_arg} --local_folder {gpu_profile_dir} --eventlogs {gpu_log}", shell=True)
if speedups is None:
### run CPU profiler if needed
@@ -76,7 +76,7 @@
cpu_profile_dir = cpu_profile
else:
cpu_profile_dir = f"{output}/cpu_profile"
- subprocess.run(f"spark_rapids_user_tools onprem profiling --csv --local_folder {cpu_profile_dir} --eventlogs {cpu_log}", shell=True)
+ subprocess.run(f"spark_rapids_user_tools onprem profiling --csv {jar_arg} --local_folder {cpu_profile_dir} --eventlogs {cpu_log}", shell=True)
### run speedup factor generation
subprocess.run(f"python generate_speedup_factors.py --cpu {cpu_profile_dir}/*/rapids_4_spark_profile --gpu {gpu_profile_dir}/*/rapids_4_spark_profile --output {output}/generatedScores.csv", shell=True)
diff --git a/user_tools/docs/user-tools-aws-emr.md b/user_tools/docs/user-tools-aws-emr.md
index 3faad8ce9..24f945f5e 100644
--- a/user_tools/docs/user-tools-aws-emr.md
+++ b/user_tools/docs/user-tools-aws-emr.md
@@ -32,7 +32,7 @@ the applications running on AWS EMR.
- from source: `pip install -e .`
- verify the command is installed correctly by running
```bash
- spark_rapids_user_tools emr --help
+ spark_rapids_user_tools emr -- --help
```
### 4.Environment variables
@@ -55,7 +55,7 @@ Before running any command, you can set environment variables to specify configu
```
spark_rapids_user_tools emr qualification [options]
-spark_rapids_user_tools emr qualification --help
+spark_rapids_user_tools emr qualification -- --help
```
The local deployment runs on the local development machine. It requires:
@@ -307,7 +307,7 @@ The CLI is triggered by providing the location where the yaml file is stored `--
```
spark_rapids_user_tools emr bootstrap [options]
-spark_rapids_user_tools emr bootstrap --help
+spark_rapids_user_tools emr bootstrap -- --help
```
The command generates an output with a list of properties to be applied to Spark configurations.
@@ -370,7 +370,7 @@ generate an output while displaying warning that the remote changes failed.
```
spark_rapids_user_tools emr diagnostic [options]
-spark_rapids_user_tools emr diagnostic --help
+spark_rapids_user_tools emr diagnostic -- --help
```
Run diagnostic command to collects information from EMR cluster, such as OS version, # of worker
diff --git a/user_tools/docs/user-tools-databricks-aws.md b/user_tools/docs/user-tools-databricks-aws.md
index 514715e49..f3e90121c 100644
--- a/user_tools/docs/user-tools-databricks-aws.md
+++ b/user_tools/docs/user-tools-databricks-aws.md
@@ -35,7 +35,7 @@ The tool currently only supports event logs stored on S3 (no DBFS paths). The re
- from source: `pip install -e .`
- verify the command is installed correctly by running
```bash
- spark_rapids_user_tools databricks-aws --help
+ spark_rapids_user_tools databricks-aws -- --help
```
### 5.Environment variables
@@ -53,7 +53,7 @@ Before running any command, you can set environment variables to specify configu
```
spark_rapids_user_tools databricks-aws qualification [options]
-spark_rapids_user_tools databricks-aws qualification --help
+spark_rapids_user_tools databricks-aws qualification -- --help
```
The local deployment runs on the local development machine. It requires:
diff --git a/user_tools/docs/user-tools-databricks-azure.md b/user_tools/docs/user-tools-databricks-azure.md
index cdc47151d..654c67049 100644
--- a/user_tools/docs/user-tools-databricks-azure.md
+++ b/user_tools/docs/user-tools-databricks-azure.md
@@ -39,7 +39,7 @@ The tool currently only supports event logs stored on ABFS ([Azure Blob File Sys
- from source: `pip install -e .`
- Verify the command is installed correctly by running
```bash
- spark_rapids_user_tools databricks-azure --help
+ spark_rapids_user_tools databricks-azure -- --help
```
### 5.Environment variables
@@ -57,7 +57,7 @@ Before running any command, you can set environment variables to specify configu
```
spark_rapids_user_tools databricks-azure qualification [options]
-spark_rapids_user_tools databricks-azure qualification --help
+spark_rapids_user_tools databricks-azure qualification -- --help
```
The local deployment runs on the local development machine. It requires:
diff --git a/user_tools/docs/user-tools-dataproc.md b/user_tools/docs/user-tools-dataproc.md
index 12ff51b8b..e6da32b78 100644
--- a/user_tools/docs/user-tools-dataproc.md
+++ b/user_tools/docs/user-tools-dataproc.md
@@ -35,7 +35,7 @@ the applications running on _Google Cloud Dataproc_.
- from source: `pip install -e .`
- verify the command is installed correctly by running
```bash
- spark_rapids_user_tools dataproc --help
+ spark_rapids_user_tools dataproc -- --help
```
### 4.Environment variables
@@ -55,7 +55,7 @@ RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`:
```
spark_rapids_user_tools dataproc qualification [options]
-spark_rapids_user_tools dataproc qualification --help
+spark_rapids_user_tools dataproc qualification -- --help
```
The local deployment runs on the local development machine. It requires:
@@ -380,7 +380,7 @@ generate an output while displaying warning that the remote changes failed.
```
spark_rapids_user_tools dataproc diagnostic [options]
-spark_rapids_user_tools dataproc diagnostic --help
+spark_rapids_user_tools dataproc diagnostic -- --help
```
Run diagnostic command to collects information from Dataproc cluster, such as OS version, # of worker
diff --git a/user_tools/docs/user-tools-onprem.md b/user_tools/docs/user-tools-onprem.md
index 6126150c7..565fb17f8 100644
--- a/user_tools/docs/user-tools-onprem.md
+++ b/user_tools/docs/user-tools-onprem.md
@@ -23,7 +23,7 @@ The tool currently only supports event logs stored on local path. The remote out
- from source: `pip install -e .`
- verify the command is installed correctly by running
```bash
- spark_rapids_user_tools onprem --help
+ spark_rapids_user_tools onprem -- --help
```
### 3.Environment variables
@@ -39,7 +39,7 @@ Before running any command, you can set environment variables to specify configu
```
spark_rapids_user_tools onprem qualification [options]
-spark_rapids_user_tools onprem qualification --help
+spark_rapids_user_tools onprem qualification -- --help
```
The local deployment runs on the local development machine. It requires:
diff --git a/user_tools/pyproject.toml b/user_tools/pyproject.toml
index 6bde6fe28..0ea4a163c 100644
--- a/user_tools/pyproject.toml
+++ b/user_tools/pyproject.toml
@@ -37,13 +37,15 @@ dependencies = [
# this will include numpy
"pyarrow==12.0.1",
# used for ADLS filesystem implementation
+ # Issue-568: use 12.17.0 as the newer 12.18.0 causes an error at runtime
+ "azure-storage-blob==12.17.0",
"adlfs==2023.4.0"
]
dynamic=["entry-points", "version"]
[project.scripts]
spark_rapids_user_tools = "spark_rapids_pytools.wrapper:main"
-ascli = "pyrapids.cmdli.pyrapids_cli:main"
+ascli = "spark_rapids_tools.cmdli.tools_cli:main"
[tool.setuptools]
package-dir = {"" = "src"}
diff --git a/user_tools/src/spark_rapids_pytools/__init__.py b/user_tools/src/spark_rapids_pytools/__init__.py
index ebaf6f580..1c1792bef 100644
--- a/user_tools/src/spark_rapids_pytools/__init__.py
+++ b/user_tools/src/spark_rapids_pytools/__init__.py
@@ -16,5 +16,5 @@
from spark_rapids_pytools.build import get_version
-VERSION = '23.08.1'
+VERSION = '23.08.2'
__version__ = get_version(VERSION)
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
index b5ea8696c..ceee37858 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
@@ -18,7 +18,7 @@
from dataclasses import dataclass, field
from typing import Any, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.databricks_aws_job import DBAWSLocalRapidsJob
from spark_rapids_pytools.cloud_api.emr import EMRNode, EMRPlatform
from spark_rapids_pytools.cloud_api.s3storage import S3StorageDriver
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
index f61edbe0f..f0c863e47 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
@@ -20,7 +20,7 @@
from dataclasses import dataclass, field
from typing import Any, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.azurestorage import AzureStorageDriver
from spark_rapids_pytools.cloud_api.databricks_azure_job import DBAzureLocalRapidsJob
from spark_rapids_pytools.cloud_api.sp_types import CMDDriverBase, ClusterBase, ClusterNode, \
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
index b4a256963..f2865dd6d 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
@@ -19,7 +19,7 @@
from dataclasses import dataclass, field
from typing import Any, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.dataproc_job import DataprocLocalRapidsJob
from spark_rapids_pytools.cloud_api.gstorage import GStorageDriver
from spark_rapids_pytools.cloud_api.sp_types import PlatformBase, CMDDriverBase, \
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
index 817dc015f..a938e2f3e 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
@@ -19,7 +19,7 @@
from dataclasses import field, dataclass
from typing import Any, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.emr_job import EmrLocalRapidsJob
from spark_rapids_pytools.cloud_api.s3storage import S3StorageDriver
from spark_rapids_pytools.cloud_api.sp_types import PlatformBase, ClusterBase, CMDDriverBase, \
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py b/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py
index b6656c171..ef96c6420 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py
@@ -18,7 +18,7 @@
from dataclasses import dataclass
from typing import Any, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.rapids.rapids_job import RapidsLocalJob
from spark_rapids_pytools.cloud_api.sp_types import PlatformBase, ClusterBase, ClusterNode, \
CMDDriverBase, ClusterGetAccessor, GpuDevice, \
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
index dd0c1bdb7..f8dded82d 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
@@ -22,7 +22,7 @@
from logging import Logger
from typing import Type, Any, List, Callable
-from pyrapids import EnumeratedType, CspEnv
+from spark_rapids_tools import EnumeratedType, CspEnv
from spark_rapids_pytools.common.prop_manager import AbstractPropertiesContainer, JSONPropertiesContainer, \
get_elem_non_safe
from spark_rapids_pytools.common.sys_storage import StorageDriver, FSUtil
diff --git a/user_tools/src/spark_rapids_pytools/common/prop_manager.py b/user_tools/src/spark_rapids_pytools/common/prop_manager.py
index 35864c0fe..fdafcc71f 100644
--- a/user_tools/src/spark_rapids_pytools/common/prop_manager.py
+++ b/user_tools/src/spark_rapids_pytools/common/prop_manager.py
@@ -22,7 +22,7 @@
import yaml
-from pyrapids import get_elem_from_dict, get_elem_non_safe
+from spark_rapids_tools import get_elem_from_dict, get_elem_non_safe
def convert_dict_to_camel_case(dic: dict):
diff --git a/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py b/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py
index 73a52844c..6670e6698 100644
--- a/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py
+++ b/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py
@@ -16,7 +16,7 @@
from dataclasses import dataclass, field
-from pyrapids import get_elem_from_dict, get_elem_non_safe
+from spark_rapids_tools import get_elem_from_dict, get_elem_non_safe
from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer
from spark_rapids_pytools.common.sys_storage import FSUtil
from spark_rapids_pytools.pricing.price_provider import PriceProvider
diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py
index b10b06c13..a8861c6a1 100644
--- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py
+++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py
@@ -22,7 +22,7 @@
import pandas as pd
from tabulate import tabulate
-from pyrapids.enums import QualFilterApp, QualGpuClusterReshapeType
+from spark_rapids_tools.enums import QualFilterApp, QualGpuClusterReshapeType
from spark_rapids_pytools.cloud_api.sp_types import ClusterReshape, NodeHWInfo
from spark_rapids_pytools.common.sys_storage import FSUtil
from spark_rapids_pytools.common.utilities import Utils, TemplateGenerator
diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
index c88867376..4b116c0d7 100644
--- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
+++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
@@ -26,7 +26,7 @@
from logging import Logger
from typing import Any, Callable, Dict, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import get_platform, \
ClusterBase, DeployMode, NodeHWInfo
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer
diff --git a/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py b/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py
index c4851a0b1..503547450 100644
--- a/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py
+++ b/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py
@@ -21,7 +21,7 @@
from logging import Logger
from typing import Type, Any, ClassVar, List
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import PlatformBase
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer
from spark_rapids_pytools.common.sys_storage import FSUtil
diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json
index b8bce125f..be286e1c0 100644
--- a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json
@@ -3,15 +3,15 @@
"deployMode": {
"LOCAL": [
{
- "name": "Apache Spark",
- "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz",
+ "name": "Apache Spark",
+ "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
- "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff",
- "size": 299350810
+ "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
+ "size": 299426263
},
{
- "name": "Hadoop AWS",
+ "name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
@@ -19,7 +19,7 @@
"size": 962685
},
{
- "name": "AWS Java SDK Bundled",
+ "name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json
index ec6ad1158..ad894149f 100644
--- a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json
@@ -3,15 +3,15 @@
"deployMode": {
"LOCAL": [
{
- "name": "Apache Spark",
- "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz",
+ "name": "Apache Spark",
+ "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
- "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff",
- "size": 299350810
+ "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
+ "size": 299426263
},
{
- "name": "Hadoop Azure",
+ "name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
index 3e9234f7a..7bb53c2e2 100644
--- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
@@ -4,19 +4,19 @@
"LOCAL": [
{
"name": "Apache Spark",
- "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz",
+ "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
- "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff",
- "size": 299350810
+ "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
+ "size": 299426263
},
{
"name": "GCS Connector Hadoop3",
- "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.11/gcs-connector-hadoop3-2.2.11-shaded.jar",
+ "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar",
"type": "jar",
- "md5": "7639d38fff9f88fe80d7e4d9d47fb946",
- "sha1": "519e60640b7ffdbb00dbed20b852c2a406ddaff9",
- "size": 36497606
+ "md5": "41aea3add826dfbf3384a2c638148709",
+ "sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66",
+ "size": 38413466
}
]
}
diff --git a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py
index 6a8a90057..6d3108ff3 100644
--- a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py
+++ b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py
@@ -25,7 +25,7 @@
import fire
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer
from spark_rapids_pytools.common.sys_storage import FSUtil
from spark_rapids_pytools.common.utilities import Utils
diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
index c4bfd3d98..b585ebc8f 100644
--- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
@@ -3,15 +3,15 @@
"deployMode": {
"LOCAL": [
{
- "name": "Apache Spark",
- "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz",
+ "name": "Apache Spark",
+ "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
- "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff",
- "size": 299350810
+ "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
+ "size": 299426263
},
{
- "name": "Hadoop AWS",
+ "name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
@@ -19,7 +19,7 @@
"size": 962685
},
{
- "name": "AWS Java SDK Bundled",
+ "name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
diff --git a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json
index 4e993671a..2b6421093 100644
--- a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json
@@ -4,11 +4,11 @@
"LOCAL": [
{
"name": "Apache Spark",
- "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz",
+ "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
- "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff",
- "size": 299350810
+ "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
+ "size": 299426263
}
]
}
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py
index c3f074393..6b83a04ce 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py
@@ -14,7 +14,7 @@
"""Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AWS."""
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.diagnostic import Diagnostic
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py
index aacd37227..1f4de26c6 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py
@@ -14,7 +14,7 @@
"""Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AZURE."""
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.diagnostic import Diagnostic
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py
index 4fd0ca4fb..05ae9eb60 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py
@@ -14,7 +14,7 @@
"""Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on Dataproc."""
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.bootstrap import Bootstrap
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py
index 63069291d..65de11341 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py
@@ -14,7 +14,7 @@
"""Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on AWS-EMR."""
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.bootstrap import Bootstrap
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py
index ba4971399..1e4ea0c56 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py
@@ -14,7 +14,7 @@
"""Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on On-Prem cluster."""
-from pyrapids import CspEnv
+from spark_rapids_tools import CspEnv
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal
diff --git a/user_tools/src/pyrapids/__init__.py b/user_tools/src/spark_rapids_tools/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/__init__.py
rename to user_tools/src/spark_rapids_tools/__init__.py
diff --git a/user_tools/src/pyrapids/cloud/__init__.py b/user_tools/src/spark_rapids_tools/cloud/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/cloud/__init__.py
rename to user_tools/src/spark_rapids_tools/cloud/__init__.py
diff --git a/user_tools/src/pyrapids/cloud/cluster.py b/user_tools/src/spark_rapids_tools/cloud/cluster.py
similarity index 100%
rename from user_tools/src/pyrapids/cloud/cluster.py
rename to user_tools/src/spark_rapids_tools/cloud/cluster.py
diff --git a/user_tools/src/pyrapids/cloud/databricks/__init__.py b/user_tools/src/spark_rapids_tools/cloud/databricks/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/cloud/databricks/__init__.py
rename to user_tools/src/spark_rapids_tools/cloud/databricks/__init__.py
diff --git a/user_tools/src/pyrapids/cloud/databricks/dbcluster.py b/user_tools/src/spark_rapids_tools/cloud/databricks/dbcluster.py
similarity index 89%
rename from user_tools/src/pyrapids/cloud/databricks/dbcluster.py
rename to user_tools/src/spark_rapids_tools/cloud/databricks/dbcluster.py
index 58c950889..4b1c0ffc2 100644
--- a/user_tools/src/pyrapids/cloud/databricks/dbcluster.py
+++ b/user_tools/src/spark_rapids_tools/cloud/databricks/dbcluster.py
@@ -18,8 +18,8 @@
from typing import ClassVar, Type, Optional
-from pyrapids.cloud.cluster import register_client_cluster, register_cluster_prop_mgr, ClusterPropMgr, ClientCluster
-from pyrapids.utils.propmanager import PropValidatorSchema
+from spark_rapids_tools.cloud.cluster import register_client_cluster, register_cluster_prop_mgr, ClusterPropMgr, ClientCluster
+from spark_rapids_tools.utils.propmanager import PropValidatorSchema
class DBAwsClusterSchema(PropValidatorSchema):
diff --git a/user_tools/src/pyrapids/cloud/dataproc/__init__.py b/user_tools/src/spark_rapids_tools/cloud/dataproc/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/cloud/dataproc/__init__.py
rename to user_tools/src/spark_rapids_tools/cloud/dataproc/__init__.py
diff --git a/user_tools/src/pyrapids/cloud/dataproc/dataproccluster.py b/user_tools/src/spark_rapids_tools/cloud/dataproc/dataproccluster.py
similarity index 83%
rename from user_tools/src/pyrapids/cloud/dataproc/dataproccluster.py
rename to user_tools/src/spark_rapids_tools/cloud/dataproc/dataproccluster.py
index 482d18e50..20ebf526a 100644
--- a/user_tools/src/pyrapids/cloud/dataproc/dataproccluster.py
+++ b/user_tools/src/spark_rapids_tools/cloud/dataproc/dataproccluster.py
@@ -18,8 +18,8 @@
from typing import ClassVar, Type
-from pyrapids.cloud.cluster import ClientCluster, register_client_cluster, ClusterPropMgr, register_cluster_prop_mgr
-from pyrapids.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema
+from spark_rapids_tools.cloud.cluster import ClientCluster, register_client_cluster, ClusterPropMgr, register_cluster_prop_mgr
+from spark_rapids_tools.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema
class DataprocClusterSchema(PropValidatorSchemaCamel):
diff --git a/user_tools/src/pyrapids/cloud/emr/__init__.py b/user_tools/src/spark_rapids_tools/cloud/emr/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/cloud/emr/__init__.py
rename to user_tools/src/spark_rapids_tools/cloud/emr/__init__.py
diff --git a/user_tools/src/pyrapids/cloud/emr/emrcluster.py b/user_tools/src/spark_rapids_tools/cloud/emr/emrcluster.py
similarity index 82%
rename from user_tools/src/pyrapids/cloud/emr/emrcluster.py
rename to user_tools/src/spark_rapids_tools/cloud/emr/emrcluster.py
index 7c676d074..d94830ca6 100644
--- a/user_tools/src/pyrapids/cloud/emr/emrcluster.py
+++ b/user_tools/src/spark_rapids_tools/cloud/emr/emrcluster.py
@@ -18,8 +18,8 @@
from typing import ClassVar, Type
-from pyrapids.cloud.cluster import register_cluster_prop_mgr, register_client_cluster, ClusterPropMgr, ClientCluster
-from pyrapids.utils.propmanager import PropValidatorSchemaUpper, PropValidatorSchema
+from spark_rapids_tools.cloud.cluster import register_cluster_prop_mgr, register_client_cluster, ClusterPropMgr, ClientCluster
+from spark_rapids_tools.utils.propmanager import PropValidatorSchemaUpper, PropValidatorSchema
class EmrClusterSchema(PropValidatorSchemaUpper):
diff --git a/user_tools/src/pyrapids/cloud/onprem/__init__.py b/user_tools/src/spark_rapids_tools/cloud/onprem/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/cloud/onprem/__init__.py
rename to user_tools/src/spark_rapids_tools/cloud/onprem/__init__.py
diff --git a/user_tools/src/pyrapids/cloud/onprem/onpremcluster.py b/user_tools/src/spark_rapids_tools/cloud/onprem/onpremcluster.py
similarity index 85%
rename from user_tools/src/pyrapids/cloud/onprem/onpremcluster.py
rename to user_tools/src/spark_rapids_tools/cloud/onprem/onpremcluster.py
index 8c6a87214..85e8972d5 100644
--- a/user_tools/src/pyrapids/cloud/onprem/onpremcluster.py
+++ b/user_tools/src/spark_rapids_tools/cloud/onprem/onpremcluster.py
@@ -19,9 +19,9 @@
from typing import ClassVar, Type
from typing_extensions import TypedDict
from pydantic import ConfigDict
-from pyrapids.cloud.cluster import ClientCluster, ClusterPropMgr, register_cluster_prop_mgr, register_client_cluster
-from pyrapids.utils.propmanager import PropValidatorSchema
-from pyrapids.utils.util import to_camel_case
+from spark_rapids_tools.cloud.cluster import ClientCluster, ClusterPropMgr, register_cluster_prop_mgr, register_client_cluster
+from spark_rapids_tools.utils.propmanager import PropValidatorSchema
+from spark_rapids_tools.utils.util import to_camel_case
class OnPremDriverConfigSchema(TypedDict):
diff --git a/user_tools/src/pyrapids/cmdli/__init__.py b/user_tools/src/spark_rapids_tools/cmdli/__init__.py
similarity index 92%
rename from user_tools/src/pyrapids/cmdli/__init__.py
rename to user_tools/src/spark_rapids_tools/cmdli/__init__.py
index ed252dbfc..1344d5a37 100644
--- a/user_tools/src/pyrapids/cmdli/__init__.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/__init__.py
@@ -14,8 +14,8 @@
"""init file of the user CLI used to run the tools"""
-from .pyrapids_cli import PyRapids
+from .tools_cli import ToolsCLI
__all__ = [
- 'PyRapids'
+ 'ToolsCLI'
]
diff --git a/user_tools/src/pyrapids/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
similarity index 95%
rename from user_tools/src/pyrapids/cmdli/argprocessor.py
rename to user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
index 7a332461a..9a0617a89 100644
--- a/user_tools/src/pyrapids/cmdli/argprocessor.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
@@ -25,9 +25,8 @@
from pydantic.dataclasses import dataclass
from pydantic_core import PydanticCustomError
-from pyrapids.cloud import ClientCluster
-from pyrapids.exceptions import IllegalArgumentError
-from pyrapids.utils import AbstractPropContainer, is_http_file
+from spark_rapids_tools.cloud import ClientCluster
+from spark_rapids_tools.utils import AbstractPropContainer, is_http_file
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.qualification import QualGpuClusterReshapeType
@@ -105,7 +104,7 @@ class AbsToolUserArgModel:
'meta': {},
'toolArgs': {}
})
- logger: ClassVar[Logger] = ToolLogging.get_and_setup_logger('pyrapids.argparser')
+ logger: ClassVar[Logger] = ToolLogging.get_and_setup_logger('spark_rapids_tools.argparser')
tool_name: ClassVar[str] = None
@classmethod
@@ -115,7 +114,7 @@ def create_tool_args(cls, tool_name: str, *args: Any, **kwargs: Any) -> Optional
impl_class = impl_entry.validator_class
new_obj = impl_class(*args, **kwargs)
return new_obj.build_tools_args()
- except (ValidationError, IllegalArgumentError) as e:
+ except (ValidationError, PydanticCustomError) as e:
impl_class.logger.error('Validation err: %s\n', e)
dump_tool_usage(impl_class.tool_name)
return None
@@ -126,8 +125,9 @@ def get_eventlogs(self) -> Optional[str]:
return None
def raise_validation_exception(self, validation_err: str):
- raise IllegalArgumentError(
- f'Invalid arguments: {validation_err}')
+ raise PydanticCustomError(
+ 'invalid_argument',
+ f'{validation_err}\n Error:')
def determine_cluster_arg_type(self) -> ArgValueCase:
# self.cluster is provided. then we need to verify that the expected files are there
@@ -137,13 +137,14 @@ def determine_cluster_arg_type(self) -> ArgValueCase:
# the file cannot be a http_url
if is_http_file(self.cluster):
# we do not accept http://urls
- raise IllegalArgumentError(
- f'Cluster properties cannot be a web URL path: {self.cluster}')
+ raise PydanticCustomError(
+ 'invalid_argument',
+ f'Cluster properties cannot be a web URL path: {self.cluster}\n Error:')
cluster_case = ArgValueCase.VALUE_B
else:
raise PydanticCustomError(
'file_path',
- 'Cluster property file is not in valid format {.json, .yaml, or .yml}')
+ 'Cluster property file is not in valid format {.json, .yaml, or .yml}\n Error:')
else:
cluster_case = ArgValueCase.VALUE_A
return cluster_case
@@ -167,8 +168,9 @@ def detect_platform_from_eventlogs_prefix(self):
def validate_onprem_with_cluster_name(self):
if self.platform == CspEnv.ONPREM:
- raise IllegalArgumentError(
- f'Invalid arguments: Cannot run cluster by name with platform [{CspEnv.ONPREM}]')
+ raise PydanticCustomError(
+ 'invalid_argument',
+ f'Cannot run cluster by name with platform [{CspEnv.ONPREM}]\n Error:')
def init_extra_arg_cases(self) -> list:
return []
@@ -229,8 +231,9 @@ def post_platform_assignment_validation(self, assigned_platform):
if self.argv_cases[1] == ArgValueCase.VALUE_A:
if assigned_platform == CspEnv.ONPREM:
# it is not allowed to run cluster_by_name on an OnPrem platform
- raise IllegalArgumentError(
- f'Invalid arguments: Cannot run cluster by name with platform [{CspEnv.ONPREM}]')
+ raise PydanticCustomError(
+ 'invalid_argument',
+ f'Cannot run cluster by name with platform [{CspEnv.ONPREM}]\n Error:')
@dataclass
@@ -356,10 +359,10 @@ def build_tools_args(self) -> dict:
self.p_args['toolArgs']['targetPlatform'] = None
else:
if not self.p_args['toolArgs']['targetPlatform'] in equivalent_pricing_list:
- raise IllegalArgumentError(
- 'Invalid arguments: '
+ raise PydanticCustomError(
+ 'invalid_argument',
f'The platform [{self.p_args["toolArgs"]["targetPlatform"]}] is currently '
- f'not supported to calculate savings from [{runtime_platform}] cluster')
+ f'not supported to calculate savings from [{runtime_platform}] cluster\n Error:')
else:
# target platform is not set, then we disable cost savings if the runtime platform if
# onprem
diff --git a/user_tools/src/pyrapids/cmdli/pyrapids_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
similarity index 97%
rename from user_tools/src/pyrapids/cmdli/pyrapids_cli.py
rename to user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
index 45d6ff799..001472e36 100644
--- a/user_tools/src/pyrapids/cmdli/pyrapids_cli.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -17,15 +17,15 @@
import fire
-from pyrapids.enums import QualGpuClusterReshapeType
-from pyrapids.utils.util import gen_app_banner
+from spark_rapids_tools.enums import QualGpuClusterReshapeType
+from spark_rapids_tools.utils.util import gen_app_banner
from spark_rapids_pytools.rapids.bootstrap import Bootstrap
from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal
from spark_rapids_pytools.rapids.qualification import QualificationAsLocal
from .argprocessor import AbsToolUserArgModel
-class PyRapids(object): # pylint: disable=too-few-public-methods
+class ToolsCLI(object): # pylint: disable=too-few-public-methods
"""CLI that provides a runtime environment that simplifies running cost and performance analysis
using the RAPIDS Accelerator for Apache Spark.
@@ -167,7 +167,7 @@ def main():
# Make Python Fire not use a pager when it prints a help text
fire.core.Display = lambda lines, out: out.write('\n'.join(lines) + '\n')
print(gen_app_banner())
- fire.Fire(PyRapids())
+ fire.Fire(ToolsCLI())
if __name__ == '__main__':
diff --git a/user_tools/src/pyrapids/enums.py b/user_tools/src/spark_rapids_tools/enums.py
similarity index 100%
rename from user_tools/src/pyrapids/enums.py
rename to user_tools/src/spark_rapids_tools/enums.py
diff --git a/user_tools/src/pyrapids/exceptions.py b/user_tools/src/spark_rapids_tools/exceptions.py
similarity index 96%
rename from user_tools/src/pyrapids/exceptions.py
rename to user_tools/src/spark_rapids_tools/exceptions.py
index 893fc0f84..d9acbb156 100644
--- a/user_tools/src/pyrapids/exceptions.py
+++ b/user_tools/src/spark_rapids_tools/exceptions.py
@@ -70,7 +70,3 @@ def __init__(self, msg: str, pydantic_err: Optional[ValidationError] = None):
content.append(str.join('. ', single_err))
self.message = str.join('\n', content)
super().__init__(self.message)
-
-
-class IllegalArgumentError(CspPathException, ValueError):
- pass
diff --git a/user_tools/src/pyrapids/storagelib/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/__init__.py
rename to user_tools/src/spark_rapids_tools/storagelib/__init__.py
diff --git a/user_tools/src/pyrapids/storagelib/adls/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/adls/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/adls/__init__.py
rename to user_tools/src/spark_rapids_tools/storagelib/adls/__init__.py
diff --git a/user_tools/src/pyrapids/storagelib/adls/adlsfs.py b/user_tools/src/spark_rapids_tools/storagelib/adls/adlsfs.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/adls/adlsfs.py
rename to user_tools/src/spark_rapids_tools/storagelib/adls/adlsfs.py
diff --git a/user_tools/src/pyrapids/storagelib/adls/adlspath.py b/user_tools/src/spark_rapids_tools/storagelib/adls/adlspath.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/adls/adlspath.py
rename to user_tools/src/spark_rapids_tools/storagelib/adls/adlspath.py
diff --git a/user_tools/src/pyrapids/storagelib/cspfs.py b/user_tools/src/spark_rapids_tools/storagelib/cspfs.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/cspfs.py
rename to user_tools/src/spark_rapids_tools/storagelib/cspfs.py
diff --git a/user_tools/src/pyrapids/storagelib/csppath.py b/user_tools/src/spark_rapids_tools/storagelib/csppath.py
similarity index 97%
rename from user_tools/src/pyrapids/storagelib/csppath.py
rename to user_tools/src/spark_rapids_tools/storagelib/csppath.py
index 923899b0c..50d1b8c2d 100644
--- a/user_tools/src/pyrapids/storagelib/csppath.py
+++ b/user_tools/src/spark_rapids_tools/storagelib/csppath.py
@@ -190,18 +190,18 @@ class CspPath(metaclass=CspPathMeta):
Create a new path subclass from a gcs URI:
>>> gs_path = CspPath('gs://bucket-name/folder_00/subfolder_01')
-
+
or from S3 URI:
>>> s3_path = CspPath('s3://bucket-name/folder_00/subfolder_01')
-
+
or from local file URI:
>>> local_path1, local_path2 = (CspPath('~/my_folder'), CspPath('file:///my_folder'))
-
+
Print the data from the file with `open_input_file()`:
diff --git a/user_tools/src/pyrapids/storagelib/gcs/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/gcs/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/gcs/__init__.py
rename to user_tools/src/spark_rapids_tools/storagelib/gcs/__init__.py
diff --git a/user_tools/src/pyrapids/storagelib/gcs/gcsfs.py b/user_tools/src/spark_rapids_tools/storagelib/gcs/gcsfs.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/gcs/gcsfs.py
rename to user_tools/src/spark_rapids_tools/storagelib/gcs/gcsfs.py
diff --git a/user_tools/src/pyrapids/storagelib/gcs/gcspath.py b/user_tools/src/spark_rapids_tools/storagelib/gcs/gcspath.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/gcs/gcspath.py
rename to user_tools/src/spark_rapids_tools/storagelib/gcs/gcspath.py
diff --git a/user_tools/src/pyrapids/storagelib/hdfs/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/hdfs/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/hdfs/__init__.py
rename to user_tools/src/spark_rapids_tools/storagelib/hdfs/__init__.py
diff --git a/user_tools/src/pyrapids/storagelib/hdfs/hdfsfs.py b/user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfsfs.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/hdfs/hdfsfs.py
rename to user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfsfs.py
diff --git a/user_tools/src/pyrapids/storagelib/hdfs/hdfspath.py b/user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfspath.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/hdfs/hdfspath.py
rename to user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfspath.py
diff --git a/user_tools/src/pyrapids/storagelib/local/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/local/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/local/__init__.py
rename to user_tools/src/spark_rapids_tools/storagelib/local/__init__.py
diff --git a/user_tools/src/pyrapids/storagelib/local/localfs.py b/user_tools/src/spark_rapids_tools/storagelib/local/localfs.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/local/localfs.py
rename to user_tools/src/spark_rapids_tools/storagelib/local/localfs.py
diff --git a/user_tools/src/pyrapids/storagelib/local/localpath.py b/user_tools/src/spark_rapids_tools/storagelib/local/localpath.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/local/localpath.py
rename to user_tools/src/spark_rapids_tools/storagelib/local/localpath.py
diff --git a/user_tools/src/pyrapids/storagelib/s3/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/s3/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/s3/__init__.py
rename to user_tools/src/spark_rapids_tools/storagelib/s3/__init__.py
diff --git a/user_tools/src/pyrapids/storagelib/s3/s3fs.py b/user_tools/src/spark_rapids_tools/storagelib/s3/s3fs.py
similarity index 95%
rename from user_tools/src/pyrapids/storagelib/s3/s3fs.py
rename to user_tools/src/spark_rapids_tools/storagelib/s3/s3fs.py
index e64922acc..b4054cf0a 100644
--- a/user_tools/src/pyrapids/storagelib/s3/s3fs.py
+++ b/user_tools/src/spark_rapids_tools/storagelib/s3/s3fs.py
@@ -15,7 +15,7 @@
"""Wrapper for the S3 File system"""
-from pyrapids.storagelib.cspfs import register_fs_class, CspFs
+from spark_rapids_tools.storagelib.cspfs import register_fs_class, CspFs
@register_fs_class('s3', 'S3FileSystem')
diff --git a/user_tools/src/pyrapids/storagelib/s3/s3path.py b/user_tools/src/spark_rapids_tools/storagelib/s3/s3path.py
similarity index 100%
rename from user_tools/src/pyrapids/storagelib/s3/s3path.py
rename to user_tools/src/spark_rapids_tools/storagelib/s3/s3path.py
diff --git a/user_tools/src/pyrapids/tools/__init__.py b/user_tools/src/spark_rapids_tools/tools/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/tools/__init__.py
rename to user_tools/src/spark_rapids_tools/tools/__init__.py
diff --git a/user_tools/src/pyrapids/tools/autotuner.py b/user_tools/src/spark_rapids_tools/tools/autotuner.py
similarity index 89%
rename from user_tools/src/pyrapids/tools/autotuner.py
rename to user_tools/src/spark_rapids_tools/tools/autotuner.py
index b65ff094a..a390fefad 100644
--- a/user_tools/src/pyrapids/tools/autotuner.py
+++ b/user_tools/src/spark_rapids_tools/tools/autotuner.py
@@ -16,7 +16,7 @@
from typing import Optional, ClassVar, Type
-from pyrapids.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema, AbstractPropContainer
+from spark_rapids_tools.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema, AbstractPropContainer
class AutoTunerInputSchema(PropValidatorSchemaCamel):
diff --git a/user_tools/src/pyrapids/utils/__init__.py b/user_tools/src/spark_rapids_tools/utils/__init__.py
similarity index 100%
rename from user_tools/src/pyrapids/utils/__init__.py
rename to user_tools/src/spark_rapids_tools/utils/__init__.py
diff --git a/user_tools/src/pyrapids/utils/propmanager.py b/user_tools/src/spark_rapids_tools/utils/propmanager.py
similarity index 94%
rename from user_tools/src/pyrapids/utils/propmanager.py
rename to user_tools/src/spark_rapids_tools/utils/propmanager.py
index fbe55d0b5..c7dd6d822 100644
--- a/user_tools/src/pyrapids/utils/propmanager.py
+++ b/user_tools/src/spark_rapids_tools/utils/propmanager.py
@@ -23,9 +23,9 @@
import yaml
from pydantic import BaseModel, ConfigDict, model_validator, ValidationError
-from pyrapids.exceptions import JsonLoadException, YamlLoadException, InvalidPropertiesSchema
-from pyrapids.storagelib.csppath import CspPath, CspPathT
-from pyrapids.utils.util import to_camel_case, to_camel_capital_case, get_elem_from_dict, get_elem_non_safe
+from spark_rapids_tools.exceptions import JsonLoadException, YamlLoadException, InvalidPropertiesSchema
+from spark_rapids_tools.storagelib.csppath import CspPath, CspPathT
+from spark_rapids_tools.utils.util import to_camel_case, to_camel_capital_case, get_elem_from_dict, get_elem_non_safe
def load_json(file_path: Union[str, CspPathT]) -> Any:
diff --git a/user_tools/src/pyrapids/utils/util.py b/user_tools/src/spark_rapids_tools/utils/util.py
similarity index 95%
rename from user_tools/src/pyrapids/utils/util.py
rename to user_tools/src/spark_rapids_tools/utils/util.py
index 07bf2439b..8d1676979 100644
--- a/user_tools/src/pyrapids/utils/util.py
+++ b/user_tools/src/spark_rapids_tools/utils/util.py
@@ -26,7 +26,7 @@
from pydantic import ValidationError, AnyHttpUrl, TypeAdapter
import spark_rapids_pytools
-from pyrapids.exceptions import CspPathAttributeError
+from spark_rapids_tools.exceptions import CspPathAttributeError
def get_elem_from_dict(data, keys):
@@ -87,8 +87,8 @@ def to_snake_case(word: str) -> str:
def dump_tool_usage(tool_name: Optional[str], raise_sys_exit: Optional[bool] = True):
- imported_module = __import__('pyrapids.cmdli', globals(), locals(), ['PyRapids'])
- wrapper_clzz = getattr(imported_module, 'PyRapids')
+ imported_module = __import__('spark_rapids_tools.cmdli', globals(), locals(), ['ToolsCLI'])
+ wrapper_clzz = getattr(imported_module, 'ToolsCLI')
help_name = 'ascli'
usage_cmd = f'{tool_name} --help'
try:
diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/databricks/test-azure-instances-catalog.json b/user_tools/tests/pyrapids_unit/resources/cluster/databricks/test-azure-instances-catalog.json
deleted file mode 100644
index cca387f06..000000000
--- a/user_tools/tests/pyrapids_unit/resources/cluster/databricks/test-azure-instances-catalog.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
- "Standard_NC4as_T4_v3": {
- "VCpuInfo": {
- "DefaultVCpus": 4
- },
- "MemoryInfo": {
- "SizeInMiB": 0
- },
- "GpuInfo": {
- "GPUs": [
- {
- "Name": "",
- "Manufacturer": "",
- "Count": 1,
- "MemoryInfo": {
- "SizeInMiB": 0
- }
- }
- ]
- }
- }
-}
diff --git a/user_tools/tests/pyrapids_unit/__init__.py b/user_tools/tests/spark_rapids_tools_ut/__init__.py
similarity index 100%
rename from user_tools/tests/pyrapids_unit/__init__.py
rename to user_tools/tests/spark_rapids_tools_ut/__init__.py
diff --git a/user_tools/tests/pyrapids_unit/conftest.py b/user_tools/tests/spark_rapids_tools_ut/conftest.py
similarity index 92%
rename from user_tools/tests/pyrapids_unit/conftest.py
rename to user_tools/tests/spark_rapids_tools_ut/conftest.py
index 0b886a042..dbb5fb383 100644
--- a/user_tools/tests/pyrapids_unit/conftest.py
+++ b/user_tools/tests/spark_rapids_tools_ut/conftest.py
@@ -25,7 +25,7 @@ def get_test_resources_path():
import importlib_resources
else:
import importlib.resources as importlib_resources
- pkg = importlib_resources.files('tests.pyrapids_unit')
+ pkg = importlib_resources.files('tests.spark_rapids_tools_ut')
return pkg / 'resources'
@@ -47,7 +47,7 @@ def gen_cpu_cluster_props():
all_csps = csps + ['onprem']
-class PyrapidsUnitTest: # pylint: disable=too-few-public-methods
+class SparkRapidsToolsUT: # pylint: disable=too-few-public-methods
@pytest.fixture(autouse=True)
def get_ut_data_dir(self):
diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/databricks/aws-cpu-00.json b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/aws-cpu-00.json
similarity index 100%
rename from user_tools/tests/pyrapids_unit/resources/cluster/databricks/aws-cpu-00.json
rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/aws-cpu-00.json
diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/databricks/azure-cpu-00.json b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/azure-cpu-00.json
similarity index 100%
rename from user_tools/tests/pyrapids_unit/resources/cluster/databricks/azure-cpu-00.json
rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/azure-cpu-00.json
diff --git a/user_tools/tests/pyrapids_unit/resources/eventlogs/.gitkeep b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/test-azure-instances-catalog.json
similarity index 100%
rename from user_tools/tests/pyrapids_unit/resources/eventlogs/.gitkeep
rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/test-azure-instances-catalog.json
diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/dataproc/cpu-00.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/dataproc/cpu-00.yaml
similarity index 100%
rename from user_tools/tests/pyrapids_unit/resources/cluster/dataproc/cpu-00.yaml
rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/dataproc/cpu-00.yaml
diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/emr/cpu-00.json b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/emr/cpu-00.json
similarity index 100%
rename from user_tools/tests/pyrapids_unit/resources/cluster/emr/cpu-00.json
rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/emr/cpu-00.json
diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/onprem/cpu-00.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/onprem/cpu-00.yaml
similarity index 100%
rename from user_tools/tests/pyrapids_unit/resources/cluster/onprem/cpu-00.yaml
rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/onprem/cpu-00.yaml
diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/eventlogs/.gitkeep b/user_tools/tests/spark_rapids_tools_ut/resources/eventlogs/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/user_tools/tests/pyrapids_unit/test_cluster.py b/user_tools/tests/spark_rapids_tools_ut/test_cluster.py
similarity index 81%
rename from user_tools/tests/pyrapids_unit/test_cluster.py
rename to user_tools/tests/spark_rapids_tools_ut/test_cluster.py
index c6e6cbf03..3524c64da 100644
--- a/user_tools/tests/pyrapids_unit/test_cluster.py
+++ b/user_tools/tests/spark_rapids_tools_ut/test_cluster.py
@@ -16,13 +16,13 @@
import pytest # pylint: disable=import-error
-from pyrapids import CspPath
-from pyrapids.cloud import ClientCluster
-from pyrapids.exceptions import InvalidPropertiesSchema
-from .conftest import PyrapidsUnitTest, all_cpu_cluster_props
+from spark_rapids_tools import CspPath
+from spark_rapids_tools.cloud import ClientCluster
+from spark_rapids_tools.exceptions import InvalidPropertiesSchema
+from .conftest import SparkRapidsToolsUT, all_cpu_cluster_props
-class TestClusterCSP(PyrapidsUnitTest): # pylint: disable=too-few-public-methods
+class TestClusterCSP(SparkRapidsToolsUT): # pylint: disable=too-few-public-methods
"""
Class testing identifying the cluster type by comparing the properties to
the defined Schema
diff --git a/user_tools/tests/pyrapids_unit/test_tool_argprocessor.py b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py
similarity index 96%
rename from user_tools/tests/pyrapids_unit/test_tool_argprocessor.py
rename to user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py
index 1dca5ba9f..04ce37aa3 100644
--- a/user_tools/tests/pyrapids_unit/test_tool_argprocessor.py
+++ b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py
@@ -21,10 +21,10 @@
import fire
import pytest # pylint: disable=import-error
-from pyrapids import CspEnv
-from pyrapids.cmdli.argprocessor import AbsToolUserArgModel, ArgValueCase
-from pyrapids.enums import QualFilterApp
-from .conftest import PyrapidsUnitTest, all_cpu_cluster_props, csp_cpu_cluster_props, csps
+from spark_rapids_tools import CspEnv
+from spark_rapids_tools.cmdli.argprocessor import AbsToolUserArgModel, ArgValueCase
+from spark_rapids_tools.enums import QualFilterApp
+from .conftest import SparkRapidsToolsUT, all_cpu_cluster_props, csp_cpu_cluster_props, csps
@dataclasses.dataclass
@@ -55,7 +55,7 @@ def decorator(func_cb: Callable):
return decorator
-class TestToolArgProcessor(PyrapidsUnitTest): # pylint: disable=too-few-public-methods
+class TestToolArgProcessor(SparkRapidsToolsUT): # pylint: disable=too-few-public-methods
"""
Class testing toolArgProcessor functionalities
"""