From bfb37347848245ff3acf2151003d0cbf7a494ba1 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:24:07 -0500 Subject: [PATCH 01/16] Improve AutoTuner plugin recommendation for Fat mode (#543) Signed-off-by: Ahmed Hussein (amahussein) Fixes #536 - when the AutoTuner fails to read the latest plugin version, it shows a warning message - the autotuner adds a comment instructing the user to validate that the current version is the latest on the mvn URL --- .../nvidia/spark/rapids/tool/profiling/AutoTuner.scala | 9 ++++++++- .../spark/sql/rapids/tool/util/WebCrawlerUtil.scala | 7 +++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala index 6a974cf21..b2aeb2920 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala @@ -706,7 +706,14 @@ class AutoTuner( s" $jarURL\n" + s" Version used in application is $jarVer.") } - case None => logError("Could not pull the latest release of plugin jar.") + case None => + logError("Could not pull the latest release of RAPIDS-plugin jar.") + val pluginRepoUrl = WebCrawlerUtil.getMVNArtifactURL("rapids.plugin") + appendComment( + "Failed to validate the latest release of Apache Spark plugin.\n" + + s" Verify that the version used in application ($jarVer) is the latest on:\n" + + s" $pluginRepoUrl") + } } } diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala index 9d0074ee5..afa00e7bf 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/WebCrawlerUtil.scala @@ -41,7 +41,10 @@ object WebCrawlerUtil extends Logging { private val ARTIFACT_VERSION_REGEX = "\\d{2}\\.\\d{2}\\.\\d+/" // given an artifactID returns the full mvn url that lists all the // releases - private def getMVNArtifactURL(artifactID: String) : String = s"$NV_MVN_BASE_URL/$artifactID" + def getMVNArtifactURL(artifactID: String) : String = { + val artifactUrlPart = NV_ARTIFACTS_LOOKUP.getOrElse(artifactID, artifactID) + s"$NV_MVN_BASE_URL/$artifactUrlPart" + } /** * Given a valid URL, this method recursively picks all the hrefs defined in the HTML doc. 
@@ -72,7 +75,7 @@ object WebCrawlerUtil extends Logging { } } catch { case x: IOException => - logError(s"Exception while visiting webURL $webURL", x) + logWarning(s"Exception while visiting webURL $webURL: ${x.toString}") } } } From c9ddfd25a158b1fba95276cd5abe159bd6e08c4d Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:24:32 -0500 Subject: [PATCH 02/16] Bump default build to use Spark-333 (#537) Signed-off-by: Ahmed Hussein (amahussein) Fixes #515 - update pom file to use spark-333 - upgrade hadoop client versions - update snapshots and releases - update the dependencies in the user-tools --- .github/workflows/mvn-verify-check.yml | 2 +- core/README.md | 6 +- core/pom.xml | 118 +++++++++++++++--- .../resources/databricks_aws-configs.json | 12 +- .../resources/databricks_azure-configs.json | 10 +- .../resources/dataproc-configs.json | 14 +-- .../resources/emr-configs.json | 12 +- .../resources/onprem-configs.json | 6 +- 8 files changed, 130 insertions(+), 50 deletions(-) diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 12bcb9d79..227dbdd0b 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - spark-version: ['311', '320', '330', '341'] + spark-version: ['311', '320', '333', '341'] steps: - uses: actions/checkout@v3 diff --git a/core/README.md b/core/README.md index a3b8359f8..8000a5460 100644 --- a/core/README.md +++ b/core/README.md @@ -20,14 +20,14 @@ mvn clean package ``` After a successful build, the jar of 'rapids-4-spark-tools_2.12-*-SNAPSHOT.jar' will be in 'target/' directory. -This will build the plugin for a single version of Spark. By default, this is Apache Spark 3.1.1. +This will build the plugin for a single version of Spark. By default, this is Apache Spark 3.3.3. For development purpose, you may need to run the tests against different spark versions. To run the tests against a specific Spark version, you can use the `-Dbuildver=XXX` command line option. -For instance to build Spark 3.3.0 you would use: +For instance to build Spark 3.4.1 you would use: ```shell script -mvn -Dbuildver=330 clean package +mvn -Dbuildver=341 clean package ``` Run `mvn help:all-profiles` to list supported Spark versions. 
diff --git a/core/pom.xml b/core/pom.xml index ae3902d40..ad8412983 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -73,7 +73,6 @@ release311 - true buildver 311 @@ -83,7 +82,7 @@ 311 ${spark311.version} ${delta10x.version} - 3.2.0 + 3.3.6 @@ -98,7 +97,7 @@ 312 ${spark312.version} ${delta10x.version} - 3.2.0 + 3.3.6 @@ -113,7 +112,22 @@ 313 ${spark313.version} ${delta10x.version} - 3.2.0 + 3.3.6 + + + + release314 + + + buildver + 314 + + + + 314 + ${spark314.version} + ${delta10x.version} + 3.3.6 @@ -128,7 +142,7 @@ 320 ${spark320.version} ${delta20x.version} - 3.3.1 + 3.3.6 @@ -143,7 +157,7 @@ 321 ${spark321.version} ${delta20x.version} - 3.3.1 + 3.3.6 @@ -158,7 +172,7 @@ 322 ${spark322.version} ${delta20x.version} - 3.3.1 + 3.3.6 @@ -173,7 +187,7 @@ 323 ${spark323.version} ${delta20x.version} - 3.3.1 + 3.3.6 @@ -188,7 +202,7 @@ 324 ${spark324.version} ${delta20x.version} - 3.3.1 + 3.3.6 @@ -203,7 +217,7 @@ 325 ${spark325.version} ${delta20x.version} - 3.3.5 + 3.3.6 @@ -218,7 +232,7 @@ 330 ${spark330.version} ${delta23x.version} - 3.3.2 + 3.3.6 @@ -233,7 +247,7 @@ 331 ${spark331.version} ${delta23x.version} - 3.3.2 + 3.3.6 @@ -248,12 +262,13 @@ 332 ${spark332.version} ${delta23x.version} - 3.3.2 + 3.3.6 release333 + true buildver 333 @@ -263,7 +278,22 @@ 333 ${spark333.version} ${delta23x.version} - 3.3.4 + 3.3.6 + + + + release334 + + + buildver + 334 + + + + 334 + ${spark334.version} + ${delta23x.version} + 3.3.6 @@ -278,7 +308,7 @@ 340 ${spark340.version} ${delta24x.version} - 3.3.4 + 3.3.6 @@ -293,7 +323,22 @@ 341 ${spark341.version} ${delta24x.version} - 3.3.4 + 3.3.6 + + + + release342 + + + buildver + 342 + + + + 342 + ${spark342.version} + ${delta24x.version} + 3.3.6 @@ -308,7 +353,37 @@ 350 ${spark350.version} ${delta24x.version} - 3.3.5 + 3.3.6 + + + + release351 + + + buildver + 351 + + + + 351 + ${spark351.version} + ${delta24x.version} + 3.3.6 + + + + release400 + + + buildver + 400 + + + + 400 + ${spark400.version} + ${delta24x.version} + 3.3.6 @@ -316,6 +391,7 @@ 3.1.1 3.1.2 3.1.3 + 3.1.4-SNAPSHOT 3.2.0 3.2.1 3.2.2 @@ -325,16 +401,20 @@ 3.3.0 3.3.1 3.3.2 - 3.3.3-SNAPSHOT + 3.3.3 + 3.3.4-SNAPSHOT 3.4.0 3.4.1 + 3.4.2-SNAPSHOT 3.5.0-SNAPSHOT + 3.5.1-SNAPSHOT + 4.0.0-SNAPSHOT 2.12 4.3.0 2.0.2 3.2.0 2.12.15 - ${spark311.version} + ${spark333.version} 2.0 3.5.1 3.0.5 diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json index a5dd63c6d..3b178b232 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json @@ -3,15 +3,15 @@ "deployMode": { "LOCAL": [ { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz", + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "type": "archive", "relativePath": "jars/*", - "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff", - "size": 299350810 + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 }, { - "name": "Hadoop AWS", + "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", "type": "jar", "md5": "59907e790ce713441955015d79f670bc", @@ -19,7 +19,7 @@ 
"size": 962685 }, { - "name": "AWS Java SDK Bundled", + "name": "AWS Java SDK Bundled", "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", "type": "jar", "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json index fbbe8ceae..31deab290 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json @@ -3,15 +3,15 @@ "deployMode": { "LOCAL": [ { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz", + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "type": "archive", "relativePath": "jars/*", - "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff", - "size": 299350810 + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 }, { - "name": "Hadoop Azure", + "name": "Hadoop Azure", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", "type": "jar", "md5": "1ec4cbd59548412010fe1515070eef73", diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json index 3e9234f7a..7bb53c2e2 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json @@ -4,19 +4,19 @@ "LOCAL": [ { "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "type": "archive", "relativePath": "jars/*", - "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff", - "size": 299350810 + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 }, { "name": "GCS Connector Hadoop3", - "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.11/gcs-connector-hadoop3-2.2.11-shaded.jar", + "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", "type": "jar", - "md5": "7639d38fff9f88fe80d7e4d9d47fb946", - "sha1": "519e60640b7ffdbb00dbed20b852c2a406ddaff9", - "size": 36497606 + "md5": "41aea3add826dfbf3384a2c638148709", + "sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66", + "size": 38413466 } ] } diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json index c4bfd3d98..b585ebc8f 100644 --- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json @@ -3,15 +3,15 @@ "deployMode": { "LOCAL": [ { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz", + "name": "Apache Spark", + "uri": 
"https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "type": "archive", "relativePath": "jars/*", - "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff", - "size": 299350810 + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 }, { - "name": "Hadoop AWS", + "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", "type": "jar", "md5": "59907e790ce713441955015d79f670bc", @@ -19,7 +19,7 @@ "size": 962685 }, { - "name": "AWS Java SDK Bundled", + "name": "AWS Java SDK Bundled", "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", "type": "jar", "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", diff --git a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json index 4e993671a..2b6421093 100644 --- a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json @@ -4,11 +4,11 @@ "LOCAL": [ { "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "type": "archive", "relativePath": "jars/*", - "sha512": "769db39a560a95fd88b58ed3e9e7d1e92fb68ee406689fb4d30c033cb5911e05c1942dcc70e5ec4585df84e80aabbc272b9386a208debda89522efff1335c8ff", - "size": 299350810 + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 } ] } From 9c39862325583b93ad8e8245ed7adb3444ac1c85 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:14:48 -0500 Subject: [PATCH 03/16] Improve parsing of aggregate expressions (#535) Signed-off-by: Ahmed Hussein (amahussein) Fixes #519 - `parseAggregateExpressions` function was very specific on the pattern of the aggregate expression. - the chnages update the implementation to correctly pull any possible function name from the aggregates - added 1 unit test with complex expression --- .../tool/planparser/SQLPlanParser.scala | 46 +++++++++++-------- .../tool/planparser/SqlPlanParserSuite.scala | 14 ++++++ 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala index 18ea53d7a..4ec00d8d7 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala @@ -404,25 +404,33 @@ object SQLPlanParser extends Logging { // This parser is used for SortAggregateExec, HashAggregateExec and ObjectHashAggregateExec def parseAggregateExpressions(exprStr: String): Array[String] = { val parsedExpressions = ArrayBuffer[String]() - // (key=[num#83], functions=[partial_collect_list(letter#84, 0, 0), partial_count(letter#84)]) - val pattern = """functions=\[([\w#, +*\\\-\.<>=\`\(\)]+\])""".r - val aggregatesString = pattern.findFirstMatchIn(exprStr) - // This is to split multiple column names in AggregateExec. 
Each column will be aggregating - // based on the aggregate function. Here "partial_" and "merge_" is removed and - // only function name is preserved. Below regex will first remove the - // "functions=" from the string followed by removing "partial_" and "merge_". That string is - // split which produces an array containing column names. Finally we remove the parentheses - // from the beginning and end to get only the expressions. Result will be as below. - // paranRemoved = Array(collect_list(letter#84, 0, 0),, count(letter#84)) - if (aggregatesString.isDefined) { - val paranRemoved = aggregatesString.get.toString.replaceAll("functions=", ""). - replaceAll("partial_", "").replaceAll("merge_", "").split("(?<=\\),)").map(_.trim). - map(_.replaceAll("""^\[+""", "").replaceAll("""\]+$""", "")) - paranRemoved.foreach { case expr => - val functionName = getFunctionName(functionPattern, expr) - functionName match { - case Some(func) => parsedExpressions += func - case _ => // NO OP + // (keys=[num#83], functions=[partial_collect_list(letter#84, 0, 0), partial_count(letter#84)]) + // Currently we only parse the functions expressions. + // "Keys" parsing is disabled for now because we won't be able to detect the types + + // A map (value -> parseEnabled) between the group and the parsing metadata + val patternMap = Map( + "functions" -> true, + "keys" -> false + ) + // It won't hurt to define a pattern that is neutral to the order of the functions/keys. + // This can avoid mismatches when exprStr comes in the fom of (functions=[], keys=[]). + val pattern = """^\((keys|functions)=\[(.*)\]\s*,\s*(keys|functions)=\[(.*)\]\s*\)$""".r + // Iterate through the matches and exclude disabled clauses + pattern.findAllMatchIn(exprStr).foreach { m => + // The matching groups are: + // 0 -> entire expression + // 1 -> "keys"; 2 -> keys' expression + // 3 -> "functions"; 4 -> functions' expression + Array(1, 3).foreach { group_ind => + val group_value = m.group(group_ind) + if (patternMap.getOrElse(group_value, false)) { + val clauseExpr = m.group(group_ind + 1) + // Here "partial_" and "merge_" is removed and only function name is preserved. 
+ val processedExpr = clauseExpr.replaceAll("partial_", "").replaceAll("merge_", "") + // No need to split the expr any further because we are only interested in function names + val used_functions = getAllFunctionNames(functionPrefixPattern, processedExpr) + parsedExpressions ++= used_functions } } } diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala index ca9dfe573..f641904fd 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala @@ -1022,4 +1022,18 @@ class SQLPlanParserSuite extends BaseTestSuite { val expressions = SQLPlanParser.parseFilterExpressions(exprString) expressions should ===(expected) } + + + test("Parse aggregate expressions") { + val exprString = "(keys=[], functions=[split(split(split(replace(replace(replace(replace(" + + "trim(replace(cast(unbase64(content#192) as string), , ), Some( )), *., ), *, ), " + + "https://, ), http://, ), /, -1)[0], :, -1)[0], \\?, -1)[0]#199, " + + "CASE WHEN (instr(replace(cast(unbase64(content#192) as string), , ), *) = 0) " + + "THEN concat(replace(cast(unbase64(content#192) as string), , ), %) " + + "ELSE replace(replace(replace(cast(unbase64(content#192) as string), , ), %, " + + "\\%), *, %) END#200])" + val expected = Array("replace", "concat", "instr", "split", "trim", "unbase64") + val expressions = SQLPlanParser.parseAggregateExpressions(exprString) + expressions should ===(expected) + } } From 0e8640a3597afae2cb8cb264882f4f1586f73921 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 7 Sep 2023 16:34:31 -0500 Subject: [PATCH 04/16] Improve tool error message for files with text extensions (#544) * Improve tool error message for files with text extensions Fixes #506 - Check if an unsupported filepath has the extension `txt` or `log`. the warning message indicates that no extension is expected. 
--------- Signed-off-by: Ahmed Hussein (amahussein) --- .../rapids/tool/EventLogPathProcessor.scala | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala index 8b7b05599..e9f24b8b4 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala @@ -67,11 +67,20 @@ object EventLogPathProcessor extends Logging { val SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER = SPARK_SHORT_COMPRESSION_CODEC_NAMES ++ Set("gz") + // Show a special message if the eventlog is one of the following formats + // https://github.com/NVIDIA/spark-rapids-tools/issues/506 + val EVENTLOGS_IN_PLAIN_TEXT_CODEC_NAMES = Set("log", "txt") + def eventLogNameFilter(logFile: Path): Boolean = { EventLogFileWriter.codecName(logFile) .forall(suffix => SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.contains(suffix)) } + def isPlainTxtFileName(logFile: Path): Boolean = { + EventLogFileWriter.codecName(logFile) + .forall(suffix => EVENTLOGS_IN_PLAIN_TEXT_CODEC_NAMES.contains(suffix)) + } + // Databricks has the latest events in file named eventlog and then any rolled in format // eventlog-2021-06-14--20-00.gz, here we assume that if any files start with eventlog // then the directory is a Databricks event log directory. @@ -126,10 +135,17 @@ object EventLogPathProcessor extends Logging { val fileName = filePath.getName() if (fileStatus.isFile() && !eventLogNameFilter(filePath)) { - logWarning(s"File: $fileName it not a supported file type. " + - "Supported compression types are: " + - s"${SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.mkString(", ")}. " + - "Skipping this file.") + val msg = if (isPlainTxtFileName(filePath)) { + // if the file is plain text, we want to show that the filePath without extension + // could be supported.. + s"File: $fileName. Detected a text file. No extension is expected. skipping this file." + } else { + s"File: $fileName is not a supported file type. " + + "Supported compression types are: " + + s"${SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.mkString(", ")}. " + + "Skipping this file." + } + logWarning(msg) Map.empty[EventLogInfo, Long] } else if (fileStatus.isDirectory && isEventLogDir(fileStatus)) { // either event logDir v2 directory or regular event log From 61460fd160aa7f91d86931276737a3d6f1bcc733 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 7 Sep 2023 20:32:04 -0500 Subject: [PATCH 05/16] [DOC] Fix help command in documentation (#540) Signed-off-by: Ahmed Hussein (amahussein) Fixes #496 --- user_tools/docs/user-tools-aws-emr.md | 8 ++++---- user_tools/docs/user-tools-databricks-aws.md | 4 ++-- user_tools/docs/user-tools-databricks-azure.md | 4 ++-- user_tools/docs/user-tools-dataproc.md | 6 +++--- user_tools/docs/user-tools-onprem.md | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/user_tools/docs/user-tools-aws-emr.md b/user_tools/docs/user-tools-aws-emr.md index 3faad8ce9..24f945f5e 100644 --- a/user_tools/docs/user-tools-aws-emr.md +++ b/user_tools/docs/user-tools-aws-emr.md @@ -32,7 +32,7 @@ the applications running on AWS EMR. 
- from source: `pip install -e .` - verify the command is installed correctly by running ```bash - spark_rapids_user_tools emr --help + spark_rapids_user_tools emr -- --help ``` ### 4.Environment variables @@ -55,7 +55,7 @@ Before running any command, you can set environment variables to specify configu ``` spark_rapids_user_tools emr qualification [options] -spark_rapids_user_tools emr qualification --help +spark_rapids_user_tools emr qualification -- --help ``` The local deployment runs on the local development machine. It requires: @@ -307,7 +307,7 @@ The CLI is triggered by providing the location where the yaml file is stored `-- ``` spark_rapids_user_tools emr bootstrap [options] -spark_rapids_user_tools emr bootstrap --help +spark_rapids_user_tools emr bootstrap -- --help ``` The command generates an output with a list of properties to be applied to Spark configurations. @@ -370,7 +370,7 @@ generate an output while displaying warning that the remote changes failed. ``` spark_rapids_user_tools emr diagnostic [options] -spark_rapids_user_tools emr diagnostic --help +spark_rapids_user_tools emr diagnostic -- --help ``` Run diagnostic command to collects information from EMR cluster, such as OS version, # of worker diff --git a/user_tools/docs/user-tools-databricks-aws.md b/user_tools/docs/user-tools-databricks-aws.md index 0116923d8..7cdd77832 100644 --- a/user_tools/docs/user-tools-databricks-aws.md +++ b/user_tools/docs/user-tools-databricks-aws.md @@ -35,7 +35,7 @@ The tool currently only supports event logs stored on S3 (no DBFS paths). The re - from source: `pip install -e .` - verify the command is installed correctly by running ```bash - spark_rapids_user_tools databricks-aws --help + spark_rapids_user_tools databricks-aws -- --help ``` ### 5.Environment variables @@ -53,7 +53,7 @@ Before running any command, you can set environment variables to specify configu ``` spark_rapids_user_tools databricks-aws qualification [options] -spark_rapids_user_tools databricks-aws qualification --help +spark_rapids_user_tools databricks-aws qualification -- --help ``` The local deployment runs on the local development machine. It requires: diff --git a/user_tools/docs/user-tools-databricks-azure.md b/user_tools/docs/user-tools-databricks-azure.md index 9a6fd2b62..b461c3f8b 100644 --- a/user_tools/docs/user-tools-databricks-azure.md +++ b/user_tools/docs/user-tools-databricks-azure.md @@ -39,7 +39,7 @@ The tool currently only supports event logs stored on ABFS ([Azure Blob File Sys - from source: `pip install -e .` - Verify the command is installed correctly by running ```bash - spark_rapids_user_tools databricks-azure --help + spark_rapids_user_tools databricks-azure -- --help ``` ### 5.Environment variables @@ -57,7 +57,7 @@ Before running any command, you can set environment variables to specify configu ``` spark_rapids_user_tools databricks-azure qualification [options] -spark_rapids_user_tools databricks-azure qualification --help +spark_rapids_user_tools databricks-azure qualification -- --help ``` The local deployment runs on the local development machine. It requires: diff --git a/user_tools/docs/user-tools-dataproc.md b/user_tools/docs/user-tools-dataproc.md index 12ff51b8b..e6da32b78 100644 --- a/user_tools/docs/user-tools-dataproc.md +++ b/user_tools/docs/user-tools-dataproc.md @@ -35,7 +35,7 @@ the applications running on _Google Cloud Dataproc_. 
- from source: `pip install -e .` - verify the command is installed correctly by running ```bash - spark_rapids_user_tools dataproc --help + spark_rapids_user_tools dataproc -- --help ``` ### 4.Environment variables @@ -55,7 +55,7 @@ RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`: ``` spark_rapids_user_tools dataproc qualification [options] -spark_rapids_user_tools dataproc qualification --help +spark_rapids_user_tools dataproc qualification -- --help ``` The local deployment runs on the local development machine. It requires: @@ -380,7 +380,7 @@ generate an output while displaying warning that the remote changes failed. ``` spark_rapids_user_tools dataproc diagnostic [options] -spark_rapids_user_tools dataproc diagnostic --help +spark_rapids_user_tools dataproc diagnostic -- --help ``` Run diagnostic command to collects information from Dataproc cluster, such as OS version, # of worker diff --git a/user_tools/docs/user-tools-onprem.md b/user_tools/docs/user-tools-onprem.md index 6126150c7..565fb17f8 100644 --- a/user_tools/docs/user-tools-onprem.md +++ b/user_tools/docs/user-tools-onprem.md @@ -23,7 +23,7 @@ The tool currently only supports event logs stored on local path. The remote out - from source: `pip install -e .` - verify the command is installed correctly by running ```bash - spark_rapids_user_tools onprem --help + spark_rapids_user_tools onprem -- --help ``` ### 3.Environment variables @@ -39,7 +39,7 @@ Before running any command, you can set environment variables to specify configu ``` spark_rapids_user_tools onprem qualification [options] -spark_rapids_user_tools onprem qualification --help +spark_rapids_user_tools onprem qualification -- --help ``` The local deployment runs on the local development machine. It requires: From 79827013e882f8fae0403f16089876bb086c14a7 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Fri, 8 Sep 2023 09:34:49 -0500 Subject: [PATCH 06/16] Qualification should treat promote_precision as supported (#545) * Qualification should treat promote_precision as supported Fixes #517 - Added column value `promote_precision` to `SQL Func` - Added new unit test to verify that promote_precision is supported for Spark LT 3.4.0 Signed-off-by: Ahmed Hussein (amahussein) --------- Signed-off-by: Ahmed Hussein (amahussein) --- core/src/main/resources/supportedExprs.csv | 4 +- .../nvidia/spark/rapids/BaseTestSuite.scala | 5 +++ .../tool/planparser/SqlPlanParserSuite.scala | 39 ++++++++++++++++++- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/core/src/main/resources/supportedExprs.csv b/core/src/main/resources/supportedExprs.csv index c1ff8979a..2c15cf7dd 100644 --- a/core/src/main/resources/supportedExprs.csv +++ b/core/src/main/resources/supportedExprs.csv @@ -377,8 +377,8 @@ Pow,S,`pow`; `power`,None,AST,rhs,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA Pow,S,`pow`; `power`,None,AST,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA PreciseTimestampConversion,S, ,None,project,input,NA,NA,NA,NA,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA PreciseTimestampConversion,S, ,None,project,result,NA,NA,NA,NA,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA -PromotePrecision,S, ,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA -PromotePrecision,S, ,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA +PromotePrecision,S,`promote_precision`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA 
+PromotePrecision,S,`promote_precision`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA PythonUDF,S, ,None,aggregation,param,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS PythonUDF,S, ,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NA,PS,NS,PS,NA PythonUDF,S, ,None,reduction,param,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS diff --git a/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala index e080a0e66..6f18946f3 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala @@ -68,6 +68,11 @@ class BaseTestSuite extends FunSuite with BeforeAndAfterEach with Logging { "Spark340 does not parse the eventlog correctly") } + protected def ignoreExprForSparkGTE340(): (Boolean, String) = { + (!ToolUtils.isSpark340OrLater(), + "Spark340+ does not support the expression") + } + def runConditionalTest(testName: String, assumeCondition: () => (Boolean, String)) (fun: => Unit): Unit = { val (isAllowed, ignoreMessage) = assumeCondition() diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala index f641904fd..d8e6c7812 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.functions.{ceil, col, collect_list, count, explode, import org.apache.spark.sql.rapids.tool.ToolUtils import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil -import org.apache.spark.sql.types.StringType class SQLPlanParserSuite extends BaseTestSuite { @@ -864,6 +863,7 @@ class SQLPlanParserSuite extends BaseTestSuite { val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "ProjectExprsSupported") { spark => import spark.implicits._ + import org.apache.spark.sql.types.StringType val df1 = Seq(9.9, 10.2, 11.6, 12.5).toDF("value") df1.write.parquet(s"$parquetoutputLoc/testtext") val df2 = spark.read.parquet(s"$parquetoutputLoc/testtext") @@ -1036,4 +1036,41 @@ class SQLPlanParserSuite extends BaseTestSuite { val expressions = SQLPlanParser.parseAggregateExpressions(exprString) expressions should ===(expected) } + + runConditionalTest("promote_precision is supported for Spark LT 3.4.0: issue-517", + ignoreExprForSparkGTE340) { + // Spark-3.4.0 removed the promote_precision SQL function + // the SQL generates the following physical plan + // (1) Project [CheckOverflow((promote_precision(cast(dec1#24 as decimal(13,2))) + // + promote_precision(cast(dec2#25 as decimal(13,2)))), DecimalType(13,2)) + // AS (dec1 + dec2)#30] + // For Spark3.4.0, the promote_precision was removed from the plan. 
+ // (1) Project [(dec1#24 + dec2#25) AS (dec1 + dec2)#30] + TrampolineUtil.withTempDir { parquetoutputLoc => + TrampolineUtil.withTempDir { eventLogDir => + val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, + "projectPromotePrecision") { spark => + import spark.implicits._ + import org.apache.spark.sql.types.DecimalType + val df = Seq(("12347.21", "1234154"), ("92233.08", "1")).toDF + .withColumn("dec1", col("_1").cast(DecimalType(7, 2))) + .withColumn("dec2", col("_2").cast(DecimalType(10, 0))) + // write the df to parquet to transform localTableScan to projectExec + df.write.parquet(s"$parquetoutputLoc/testPromotePrecision") + val df2 = spark.read.parquet(s"$parquetoutputLoc/testPromotePrecision") + df2.selectExpr("dec1+dec2") + } + val pluginTypeChecker = new PluginTypeChecker() + val app = createAppFromEventlog(eventLog) + val parsedPlans = app.sqlPlans.map { case (sqlID, plan) => + SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app) + } + // The promote_precision should be part of the project exec. + val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq) + val projExecs = allExecInfo.filter(_.exec.contains("Project")) + assertSizeAndSupported(1, projExecs) + } + } + } + } From e943d89e4f2eb1ed942d6a126ff8c5ab86689677 Mon Sep 17 00:00:00 2001 From: Matt Ahrens Date: Mon, 11 Sep 2023 08:42:56 -0500 Subject: [PATCH 07/16] Adding TakeOrderedAndProject and BroadcastNestedLoopJoin, removing Project from speedup generation (#548) Signed-off-by: Matt Ahrens --- .../custom_speedup_factors/generate_speedup_factors.py | 6 ++++-- user_tools/custom_speedup_factors/operatorsList.csv | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/user_tools/custom_speedup_factors/generate_speedup_factors.py b/user_tools/custom_speedup_factors/generate_speedup_factors.py index edf6b538b..8a47cc7ec 100644 --- a/user_tools/custom_speedup_factors/generate_speedup_factors.py +++ b/user_tools/custom_speedup_factors/generate_speedup_factors.py @@ -155,8 +155,6 @@ scores_dict["FileSourceScanExec"] = str(round(cpu_stage_totals['Scan orc '] / gpu_stage_totals['GpuScan orc '], 2)) # Other operators -if 'Project' in cpu_stage_totals and 'GpuProject' in gpu_stage_totals: - scores_dict["ProjectExec"] = str(round(cpu_stage_totals['Project'] / gpu_stage_totals['GpuProject'], 2)) if 'Expand' in cpu_stage_totals and 'GpuExpand' in gpu_stage_totals: scores_dict["ExpandExec"] = str(round(cpu_stage_totals['Expand'] / gpu_stage_totals['GpuExpand'], 2)) if 'CartesianProduct' in cpu_stage_totals and 'GpuCartesianProduct' in gpu_stage_totals: @@ -173,6 +171,10 @@ scores_dict["HashAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2)) scores_dict["ObjectHashAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2)) scores_dict["SortAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2)) +if 'TakeOrderedAndProject' in cpu_stage_totals and 'GpuTopN' in gpu_stage_totals: + scores_dict["TakeOrderedAndProjectExec"] = str(round(cpu_stage_totals['TakeOrderedAndProject'] / gpu_stage_totals['GpuTopN'], 2)) +if 'BroadcastNestedLoopJoin' in cpu_stage_totals and 'GpuBroadcastNestedLoopJoin' in gpu_stage_totals: + scores_dict["BroadcastNestedLoopJoinExec"] = str(round(cpu_stage_totals['BroadcastNestedLoopJoin'] / gpu_stage_totals['GpuBroadcastNestedLoopJoin'], 2)) # Set minimum to 1.0 for speedup factors for key in scores_dict: diff --git 
a/user_tools/custom_speedup_factors/operatorsList.csv b/user_tools/custom_speedup_factors/operatorsList.csv index bb60ebf1c..97665b164 100644 --- a/user_tools/custom_speedup_factors/operatorsList.csv +++ b/user_tools/custom_speedup_factors/operatorsList.csv @@ -11,7 +11,6 @@ ProjectExec RangeExec SampleExec SortExec -SubqueryBroadcastExec TakeOrderedAndProjectExec HashAggregateExec ObjectHashAggregateExec @@ -19,7 +18,6 @@ SortAggregateExec DataWritingCommandExec ExecutedCommandExec BatchScanExec -BroadcastExchangeExec ShuffleExchangeExec BroadcastHashJoinExec BroadcastNestedLoopJoinExec From cdbe837c57ff58db3d560d22eca1dbd518250f5c Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Mon, 11 Sep 2023 07:00:55 -0700 Subject: [PATCH 08/16] Add `translate` as supported expression in qualification tools (#546) * added translate as supported expr in tools --------- Signed-off-by: cindyyuanjiang --- .../operatorsScore-databricks-aws.csv | 1 + .../operatorsScore-databricks-azure.csv | 1 + .../resources/operatorsScore-dataproc-l4.csv | 1 + .../resources/operatorsScore-dataproc-t4.csv | 1 + .../main/resources/operatorsScore-emr-a10.csv | 1 + .../main/resources/operatorsScore-emr-t4.csv | 1 + core/src/main/resources/operatorsScore.csv | 1 + core/src/main/resources/supportedExprs.csv | 4 +++ .../tool/planparser/SqlPlanParserSuite.scala | 32 ++++++++++++++++++- 9 files changed, 42 insertions(+), 1 deletion(-) diff --git a/core/src/main/resources/operatorsScore-databricks-aws.csv b/core/src/main/resources/operatorsScore-databricks-aws.csv index eabc0078a..cd12f33a5 100644 --- a/core/src/main/resources/operatorsScore-databricks-aws.csv +++ b/core/src/main/resources/operatorsScore-databricks-aws.csv @@ -217,6 +217,7 @@ StringRepeat,2.45 StringReplace,2.45 StringSplit,2.45 StringToMap,2.45 +StringTranslate,2.45 StringTrim,2.45 StringTrimLeft,2.45 StringTrimRight,2.45 diff --git a/core/src/main/resources/operatorsScore-databricks-azure.csv b/core/src/main/resources/operatorsScore-databricks-azure.csv index dbf7f9da3..9b4c7b4bf 100644 --- a/core/src/main/resources/operatorsScore-databricks-azure.csv +++ b/core/src/main/resources/operatorsScore-databricks-azure.csv @@ -217,6 +217,7 @@ StringRepeat,2.73 StringReplace,2.73 StringSplit,2.73 StringToMap,2.73 +StringTranslate,2.73 StringTrim,2.73 StringTrimLeft,2.73 StringTrimRight,2.73 diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv index 66ff092de..541c15d15 100644 --- a/core/src/main/resources/operatorsScore-dataproc-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv @@ -217,6 +217,7 @@ StringRepeat,4.16 StringReplace,4.16 StringSplit,4.16 StringToMap,4.16 +StringTranslate,4.16 StringTrim,4.16 StringTrimLeft,4.16 StringTrimRight,4.16 diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv index 2ea664b85..94779648c 100644 --- a/core/src/main/resources/operatorsScore-dataproc-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv @@ -217,6 +217,7 @@ StringRepeat,4.88 StringReplace,4.88 StringSplit,4.88 StringToMap,4.88 +StringTranslate,4.88 StringTrim,4.88 StringTrimLeft,4.88 StringTrimRight,4.88 diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv index b233887ec..dd039b0bf 100644 --- a/core/src/main/resources/operatorsScore-emr-a10.csv +++ 
b/core/src/main/resources/operatorsScore-emr-a10.csv @@ -217,6 +217,7 @@ StringRepeat,2.59 StringReplace,2.59 StringSplit,2.59 StringToMap,2.59 +StringTranslate,2.59 StringTrim,2.59 StringTrimLeft,2.59 StringTrimRight,2.59 diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv index 57f54922d..e91bf7dba 100644 --- a/core/src/main/resources/operatorsScore-emr-t4.csv +++ b/core/src/main/resources/operatorsScore-emr-t4.csv @@ -217,6 +217,7 @@ StringRepeat,2.07 StringReplace,2.07 StringSplit,2.07 StringToMap,2.07 +StringTranslate,2.07 StringTrim,2.07 StringTrimLeft,2.07 StringTrimRight,2.07 diff --git a/core/src/main/resources/operatorsScore.csv b/core/src/main/resources/operatorsScore.csv index 4ffb0d902..4d84db4d6 100644 --- a/core/src/main/resources/operatorsScore.csv +++ b/core/src/main/resources/operatorsScore.csv @@ -222,6 +222,7 @@ StringRepeat,4 StringReplace,4 StringSplit,4 StringToMap,4 +StringTranslate,4 StringTrim,4 StringTrimLeft,4 StringTrimRight,4 diff --git a/core/src/main/resources/supportedExprs.csv b/core/src/main/resources/supportedExprs.csv index 2c15cf7dd..01d457771 100644 --- a/core/src/main/resources/supportedExprs.csv +++ b/core/src/main/resources/supportedExprs.csv @@ -501,6 +501,10 @@ StringToMap,S,`str_to_map`,None,project,str,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,N StringToMap,S,`str_to_map`,None,project,pairDelim,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA StringToMap,S,`str_to_map`,None,project,keyValueDelim,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA StringToMap,S,`str_to_map`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA +StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA +StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,from,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA +StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,to,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA +StringTranslate,S,`translate`,This is not 100% compatible with the Spark version because the GPU implementation supports all unicode code points. 
In Spark versions < 3.2.0; translate() does not support unicode characters with code point >= U+10000 (See SPARK-34094),project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA StringTrim,S,`trim`,None,project,src,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA StringTrim,S,`trim`,None,project,trimStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA StringTrim,S,`trim`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala index d8e6c7812..485d5e87c 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala @@ -30,7 +30,7 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.sql.TrampolineUtil import org.apache.spark.sql.expressions.Window -import org.apache.spark.sql.functions.{ceil, col, collect_list, count, explode, floor, hex, json_tuple, round, row_number, sum} +import org.apache.spark.sql.functions.{ceil, col, collect_list, count, explode, floor, hex, json_tuple, round, row_number, sum, translate} import org.apache.spark.sql.rapids.tool.ToolUtils import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil @@ -916,6 +916,36 @@ class SQLPlanParserSuite extends BaseTestSuite { } } + test("translate is supported in ProjectExec") { + TrampolineUtil.withTempDir { parquetoutputLoc => + TrampolineUtil.withTempDir { eventLogDir => + val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, + "ProjectExprsSupported") { spark => + import spark.implicits._ + val df1 = Seq("", "abc", "ABC", "AaBbCc").toDF("value") + // write df1 to parquet to transform LocalTableScan to ProjectExec + df1.write.parquet(s"$parquetoutputLoc/testtext") + val df2 = spark.read.parquet(s"$parquetoutputLoc/testtext") + // translate should be part of ProjectExec + df2.select(translate(df2("value"), "ABC", "123")) + } + val pluginTypeChecker = new PluginTypeChecker() + val app = createAppFromEventlog(eventLog) + assert(app.sqlPlans.size == 2) + val parsedPlans = app.sqlPlans.map { case (sqlID, plan) => + SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app) + } + val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq) + val wholeStages = allExecInfo.filter(_.exec.contains("WholeStageCodegen")) + assert(wholeStages.size == 1) + assert(wholeStages.forall(_.duration.nonEmpty)) + val allChildren = wholeStages.flatMap(_.children).flatten + val projects = allChildren.filter(_.exec == "Project") + assertSizeAndSupported(1, projects) + } + } + } + test("Parse SQL function Name in HashAggregateExec") { TrampolineUtil.withTempDir { eventLogDir => val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "sqlmetric") { spark => From ead8bf8cda374f0a43d06149fb3c2a710ab45a26 Mon Sep 17 00:00:00 2001 From: Matt Ahrens Date: Mon, 11 Sep 2023 09:44:32 -0500 Subject: [PATCH 09/16] Changing max_value to total based on profiler core changes (#555) Signed-off-by: Matt Ahrens --- user_tools/custom_speedup_factors/generate_speedup_factors.py | 4 ++-- .../validate_qualification_estimates.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/user_tools/custom_speedup_factors/generate_speedup_factors.py 
b/user_tools/custom_speedup_factors/generate_speedup_factors.py index 8a47cc7ec..c09afddbd 100644 --- a/user_tools/custom_speedup_factors/generate_speedup_factors.py +++ b/user_tools/custom_speedup_factors/generate_speedup_factors.py @@ -66,11 +66,11 @@ cpu_sql_combined = cpu_sql_times.set_index('nodeName').join(mapping_info.set_index('SQL Node'), how='left') # - parse WholeStageCodegen durations with child node mapping - cpu_sql_times_df = cpu_sql_combined[['Child Node', 'max_value']] + cpu_sql_times_df = cpu_sql_combined[['Child Node', 'total']] for index, row in cpu_sql_times_df.iterrows(): operators = str(row['Child Node']).split(',') - duration = row['max_value']/len(operators)/1000.0 + duration = row['total']/len(operators)/1000.0 for operator in operators: if operator in cpu_stage_log[app_name]: cpu_stage_log[app_name][operator] = cpu_stage_log[app_name][operator] + duration diff --git a/user_tools/custom_speedup_factors/validate_qualification_estimates.py b/user_tools/custom_speedup_factors/validate_qualification_estimates.py index 036b75245..26e6b8ae2 100644 --- a/user_tools/custom_speedup_factors/validate_qualification_estimates.py +++ b/user_tools/custom_speedup_factors/validate_qualification_estimates.py @@ -67,7 +67,7 @@ gpu_profile_dir = gpu_profile else: gpu_profile_dir = f"{output}/gpu_profile" - subprocess.run(f"spark_rapids_user_tools onprem profiling --csv --local_folder {gpu_profile_dir} --eventlogs {gpu_log}", shell=True) + subprocess.run(f"spark_rapids_user_tools onprem profiling --csv {jar_arg} --local_folder {gpu_profile_dir} --eventlogs {gpu_log}", shell=True) if speedups is None: ### run CPU profiler if needed @@ -76,7 +76,7 @@ cpu_profile_dir = cpu_profile else: cpu_profile_dir = f"{output}/cpu_profile" - subprocess.run(f"spark_rapids_user_tools onprem profiling --csv --local_folder {cpu_profile_dir} --eventlogs {cpu_log}", shell=True) + subprocess.run(f"spark_rapids_user_tools onprem profiling --csv {jar_arg} --local_folder {cpu_profile_dir} --eventlogs {cpu_log}", shell=True) ### run speedup factor generation subprocess.run(f"python generate_speedup_factors.py --cpu {cpu_profile_dir}/*/rapids_4_spark_profile --gpu {gpu_profile_dir}/*/rapids_4_spark_profile --output {output}/generatedScores.csv", shell=True) From e570b961ba5f24fb89a160a8d9b127bd3019694e Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Mon, 11 Sep 2023 12:50:10 -0500 Subject: [PATCH 10/16] Fix handling of current_databse and ArrayBuffer (#556) * Fix handling of current_databse and ArrayBuffer Fixes #547, Fixes #554 - current_database() is a scala value that does not cause any fallback to CPU. It should be added to the ignored-lists - ArrayBuffer is not Spark sql function. 
Add it to to the ignored list - Create 2 new unit tests --------- Signed-off-by: Ahmed Hussein (amahussein) --- .../tool/planparser/SQLPlanParser.scala | 7 +- .../tool/planparser/SqlPlanParserSuite.scala | 64 +++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala index 4ec00d8d7..7fc8ef864 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala @@ -76,7 +76,12 @@ object SQLPlanParser extends Logging { val windowFunctionPattern = """(\w+)\(""".r - val ignoreExpressions = Array("any", "cast", "decimal", "decimaltype", "every", "some", "list") + val ignoreExpressions = Array("any", "cast", "decimal", "decimaltype", "every", "some", + "list", + // current_database does not cause any CPU fallbacks + "current_database", + // ArrayBuffer is a Scala function and may appear in some of the JavaRDDs/UDAFs) + "arraybuffer") /** * This function is used to create a set of nodes that should be skipped while parsing the Execs diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala index 485d5e87c..a9e6f8c2d 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala @@ -1103,4 +1103,68 @@ class SQLPlanParserSuite extends BaseTestSuite { } } + test("current_database is not listed as unsupported: issue-547") { + // current_database is a scalar value that does not cause any CPU fallbacks + // we should not list it as unsupported expression if it appears in the SQL. + TrampolineUtil.withTempDir { parquetoutputLoc => + TrampolineUtil.withTempDir { eventLogDir => + val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, + "ignoreCurrentDatabase") { spark => + import spark.implicits._ + val df1 = spark.sparkContext.parallelize(List(10, 20, 30, 40)).toDF + df1.write.parquet(s"$parquetoutputLoc/ignore_current_database") + val df2 = spark.read.parquet(s"$parquetoutputLoc/ignore_current_database") + // Note that the current_database will show up in project only if it is used as a column + // name. For example: + // > SELECT current_database(), current_database() as my_db; + // +------------------+-------+ + // |current_database()| my_db| + // +------------------+-------+ + // | default|default| + // | default|default| + // | default|default| + // | default|default| + // +------------------+-------+ + // == Physical Plan == + // *(1) Project [default AS current_database()#10, default AS my_db#9] + // +- *(1) ColumnarToRow + // +- FileScan parquet [] Batched: true, DataFilters: [], Format: Parquet, Location: + // InMemoryFileIndex(1 paths)[file:/tmp_folder/T/toolTest..., PartitionFilters: [] + // PushedFilters: [], ReadSchema: struct<> + df2.selectExpr("current_database()","current_database() as my_db") + } + val pluginTypeChecker = new PluginTypeChecker() + val app = createAppFromEventlog(eventLog) + val parsedPlans = app.sqlPlans.map { case (sqlID, plan) => + SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app) + } + // The current_database should be part of the project-exec and the parser should ignore it. 
+ val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq) + val projExecs = allExecInfo.filter(_.exec.contains("Project")) + assertSizeAndSupported(1, projExecs) + } + } + } + + test("ArrayBuffer should be ignored in Expand: issue-554") { + // ArrayBuffer appears in some SQL queries. + // It is a non-Spark expression and it should not be considered as SQL-Function + val exprString = "[" + + "ArrayBuffer(cast((cast(ds#1241L as double) / 100.0) as bigint), null, null, 0," + + " cast(if ((ret_type#1236 = 2)) 1 else 0 as bigint))," + + "ArrayBuffer(cast((cast(ds#1241L as double) / 100.0) as bigint), wuid#1234, null, 1, null)," + + "ArrayBuffer(cast((cast(ds#1241L as double) / 100.0) as bigint), null," + + " if ((if ((ret_type#1236 = 2)) 1 else 0 = 1)) wuid#1234 else null, 2, null)," + + "[" + + "CAST((CAST(supersql_t12.`ds` AS DOUBLE) / 100.0D) AS BIGINT)#1297L," + + "supersql_t12.`wuid`#1298," + + "(IF(((IF((supersql_t12.`ret_type` = 2), 1, 0)) = 1), supersql_t12.`wuid`," + + "CAST(NULL AS STRING)))#1299," + + "gid#1296," + + "CAST((IF((supersql_t12.`ret_type` = 2), 1, 0)) AS BIGINT)#1300L]]" + // Only "IF" should be picked up as a function name + val expected = Array("IF") + val expressions = SQLPlanParser.parseExpandExpressions(exprString) + expressions should ===(expected) + } } From 92c4bfe874106bc41c7b3a10e0deb37a4bdc0ec4 Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Mon, 11 Sep 2023 11:18:35 -0700 Subject: [PATCH 11/16] [FEA] Add support to TIMESTAMP functions (#549) * updated supportedExprs and operatorsScore files for timstamp functions * added unit tests --------- Signed-off-by: cindyyuanjiang --- .../operatorsScore-databricks-aws.csv | 3 ++ .../operatorsScore-databricks-azure.csv | 3 ++ .../resources/operatorsScore-dataproc-l4.csv | 3 ++ .../resources/operatorsScore-dataproc-t4.csv | 3 ++ .../main/resources/operatorsScore-emr-a10.csv | 3 ++ .../main/resources/operatorsScore-emr-t4.csv | 3 ++ core/src/main/resources/operatorsScore.csv | 3 ++ core/src/main/resources/supportedExprs.csv | 6 ++++ .../tool/planparser/SqlPlanParserSuite.scala | 30 +++++++++++++++++++ 9 files changed, 57 insertions(+) diff --git a/core/src/main/resources/operatorsScore-databricks-aws.csv b/core/src/main/resources/operatorsScore-databricks-aws.csv index cd12f33a5..b80514227 100644 --- a/core/src/main/resources/operatorsScore-databricks-aws.csv +++ b/core/src/main/resources/operatorsScore-databricks-aws.csv @@ -155,6 +155,8 @@ MapKeys,2.45 MapValues,2.45 Max,2.45 Md5,2.45 +MicrosToTimestamp,2.45 +MillisToTimestamp,2.45 Min,2.45 Minute,2.45 MonotonicallyIncreasingID,2.45 @@ -193,6 +195,7 @@ RowNumber,2.45 ScalaUDF,2.45 ScalarSubquery,2.45 Second,2.45 +SecondsToTimestamp,2.45 Sequence,2.45 ShiftLeft,2.45 ShiftRight,2.45 diff --git a/core/src/main/resources/operatorsScore-databricks-azure.csv b/core/src/main/resources/operatorsScore-databricks-azure.csv index 9b4c7b4bf..efc7b7215 100644 --- a/core/src/main/resources/operatorsScore-databricks-azure.csv +++ b/core/src/main/resources/operatorsScore-databricks-azure.csv @@ -155,6 +155,8 @@ MapKeys,2.73 MapValues,2.73 Max,2.73 Md5,2.73 +MicrosToTimestamp,2.73 +MillisToTimestamp,2.73 Min,2.73 Minute,2.73 MonotonicallyIncreasingID,2.73 @@ -193,6 +195,7 @@ RowNumber,2.73 ScalaUDF,2.73 ScalarSubquery,2.73 Second,2.73 +SecondsToTimestamp,2.73 Sequence,2.73 ShiftLeft,2.73 ShiftRight,2.73 diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv index 
541c15d15..0fdaea9d2 100644 --- a/core/src/main/resources/operatorsScore-dataproc-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv @@ -155,6 +155,8 @@ MapKeys,4.16 MapValues,4.16 Max,4.16 Md5,4.16 +MicrosToTimestamp,4.16 +MillisToTimestamp,4.16 Min,4.16 Minute,4.16 MonotonicallyIncreasingID,4.16 @@ -193,6 +195,7 @@ RowNumber,4.16 ScalaUDF,4.16 ScalarSubquery,4.16 Second,4.16 +SecondsToTimestamp,4.16 Sequence,4.16 ShiftLeft,4.16 ShiftRight,4.16 diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv index 94779648c..48ab44cab 100644 --- a/core/src/main/resources/operatorsScore-dataproc-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv @@ -155,6 +155,8 @@ MapKeys,4.88 MapValues,4.88 Max,4.88 Md5,4.88 +MicrosToTimestamp,4.88 +MillisToTimestamp,4.88 Min,4.88 Minute,4.88 MonotonicallyIncreasingID,4.88 @@ -193,6 +195,7 @@ RowNumber,4.88 ScalaUDF,4.88 ScalarSubquery,4.88 Second,4.88 +SecondsToTimestamp,4.88 Sequence,4.88 ShiftLeft,4.88 ShiftRight,4.88 diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv index dd039b0bf..ba697fb92 100644 --- a/core/src/main/resources/operatorsScore-emr-a10.csv +++ b/core/src/main/resources/operatorsScore-emr-a10.csv @@ -155,6 +155,8 @@ MapKeys,2.59 MapValues,2.59 Max,2.59 Md5,2.59 +MicrosToTimestamp,2.59 +MillisToTimestamp,2.59 Min,2.59 Minute,2.59 MonotonicallyIncreasingID,2.59 @@ -193,6 +195,7 @@ RowNumber,2.59 ScalaUDF,2.59 ScalarSubquery,2.59 Second,2.59 +SecondsToTimestamp,2.59 Sequence,2.59 ShiftLeft,2.59 ShiftRight,2.59 diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv index e91bf7dba..1662703c8 100644 --- a/core/src/main/resources/operatorsScore-emr-t4.csv +++ b/core/src/main/resources/operatorsScore-emr-t4.csv @@ -155,6 +155,8 @@ MapKeys,2.07 MapValues,2.07 Max,2.07 Md5,2.07 +MicrosToTimestamp,2.07 +MillisToTimestamp,2.07 Min,2.07 Minute,2.07 MonotonicallyIncreasingID,2.07 @@ -193,6 +195,7 @@ RowNumber,2.07 ScalaUDF,2.07 ScalarSubquery,2.07 Second,2.07 +SecondsToTimestamp,2.07 Sequence,2.07 ShiftLeft,2.07 ShiftRight,2.07 diff --git a/core/src/main/resources/operatorsScore.csv b/core/src/main/resources/operatorsScore.csv index 4d84db4d6..7c00bb79b 100644 --- a/core/src/main/resources/operatorsScore.csv +++ b/core/src/main/resources/operatorsScore.csv @@ -160,6 +160,8 @@ MapKeys,4 MapValues,4 Max,4 Md5,4 +MicrosToTimestamp,4 +MillisToTimestamp,4 Min,4 Minute,4 MonotonicallyIncreasingID,4 @@ -198,6 +200,7 @@ RowNumber,4 ScalaUDF,4 ScalarSubquery,4 Second,4 +SecondsToTimestamp,4 Sequence,4 ShiftLeft,4 ShiftRight,4 diff --git a/core/src/main/resources/supportedExprs.csv b/core/src/main/resources/supportedExprs.csv index 01d457771..7358bb832 100644 --- a/core/src/main/resources/supportedExprs.csv +++ b/core/src/main/resources/supportedExprs.csv @@ -330,6 +330,10 @@ MapValues,S,`map_values`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, MapValues,S,`map_values`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA Md5,S,`md5`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA Md5,S,`md5`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA +MicrosToTimestamp,S,`timestamp_micros`,None,project,input,NA,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +MicrosToTimestamp,S,`timestamp_micros`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA 
+MillisToTimestamp,S,`timestamp_millis`,None,project,input,NA,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +MillisToTimestamp,S,`timestamp_millis`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA Minute,S,`minute`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA Minute,S,`minute`,None,project,result,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA MonotonicallyIncreasingID,S,`monotonically_increasing_id`,None,project,result,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -430,6 +434,8 @@ ScalaUDF,S, ,None,project,param,S,S,S,S,S,S,S,S,PS,S,S,S,S,S,PS,PS,PS,NS ScalaUDF,S, ,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,S,PS,PS,PS,NS Second,S,`second`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA Second,S,`second`,None,project,result,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +SecondsToTimestamp,S,`timestamp_seconds`,None,project,input,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA +SecondsToTimestamp,S,`timestamp_seconds`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA Sequence,S,`sequence`,None,project,start,NA,S,S,S,S,NA,NA,NS,NS,NA,NA,NA,NA,NA,NA,NA,NA,NA Sequence,S,`sequence`,None,project,stop,NA,S,S,S,S,NA,NA,NS,NS,NA,NA,NA,NA,NA,NA,NA,NA,NA Sequence,S,`sequence`,None,project,step,NA,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NS,NA,NA,NA,NA diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala index a9e6f8c2d..a5a7c8262 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala @@ -946,6 +946,36 @@ class SQLPlanParserSuite extends BaseTestSuite { } } + test("Timestamp functions supported in ProjectExec") { + TrampolineUtil.withTempDir { parquetoutputLoc => + TrampolineUtil.withTempDir { eventLogDir => + val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, + "ProjectExprsSupported") { spark => + import spark.implicits._ + val init_df = Seq((1230219000123123L, 1230219000123L, 1230219000.123)) + val df1 = init_df.toDF("micro", "millis", "seconds") + df1.write.parquet(s"$parquetoutputLoc/testtext") + val df2 = spark.read.parquet(s"$parquetoutputLoc/testtext") + df2.selectExpr("timestamp_micros(micro)", "timestamp_millis(millis)", + "timestamp_seconds(seconds)") + } + val pluginTypeChecker = new PluginTypeChecker() + val app = createAppFromEventlog(eventLog) + assert(app.sqlPlans.size == 2) + val parsedPlans = app.sqlPlans.map { case (sqlID, plan) => + SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app) + } + val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq) + val wholeStages = allExecInfo.filter(_.exec.contains("WholeStageCodegen")) + assert(wholeStages.size == 1) + assert(wholeStages.forall(_.duration.nonEmpty)) + val allChildren = wholeStages.flatMap(_.children).flatten + val projects = allChildren.filter(_.exec == "Project") + assertSizeAndSupported(1, projects) + } + } + } + test("Parse SQL function Name in HashAggregateExec") { TrampolineUtil.withTempDir { eventLogDir => val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "sqlmetric") { spark => From 04a037b79ed282286e5358ed5399a8ab72a0c5b5 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:35:56 -0500 Subject: [PATCH 12/16] Remove 
memoryOverhead recommendations for Standalone Spark (#557) * Remove memoryOverhead recommendations for Standalone Spark Fixes #553 - Add functionality to disable memoryOverhead when spark is standalone - Add unit-test with standalone enabled --------- Signed-off-by: Ahmed Hussein (amahussein) --- .../rapids/tool/profiling/AutoTuner.scala | 31 ++++++--- .../tool/profiling/AutoTunerSuite.scala | 63 +++++++++++++++++++ 2 files changed, 87 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala index b2aeb2920..86909d98d 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala @@ -546,6 +546,7 @@ class AutoTuner( /** * Flow: + * if "spark.master" is standalone => Do Nothing * if "spark.rapids.memory.pinnedPool.size" is set * if yarn -> recommend "spark.executor.memoryOverhead" * if using k8s -> @@ -553,13 +554,15 @@ class AutoTuner( * else recommend "spark.kubernetes.memoryOverheadFactor" and add comment if missing */ def addRecommendationForMemoryOverhead(recomValue: String): Unit = { - val memOverheadLookup = memoryOverheadLabel - appendRecommendationForMemoryMB(memOverheadLookup, recomValue) - getPropertyValue("spark.rapids.memory.pinnedPool.size").foreach { lookup => - if (lookup != "spark.executor.memoryOverhead") { - if (getPropertyValue(memOverheadLookup).isEmpty) { - appendComment(s"'$memOverheadLookup' must be set if using " + - s"'spark.rapids.memory.pinnedPool.size") + if (enableMemoryOverheadRecommendation(getPropertyValue("spark.master"))) { + val memOverheadLookup = memoryOverheadLabel + appendRecommendationForMemoryMB(memOverheadLookup, recomValue) + getPropertyValue("spark.rapids.memory.pinnedPool.size").foreach { lookup => + if (lookup != "spark.executor.memoryOverhead") { + if (getPropertyValue(memOverheadLookup).isEmpty) { + appendComment(s"'$memOverheadLookup' must be set if using " + + s"'spark.rapids.memory.pinnedPool.size") + } } } } @@ -1138,4 +1141,18 @@ object AutoTuner extends Logging { f"$sizeNum%.2f$sizeUnit" } } + + /** + * Given the spark property "spark.master", it checks whether memoryOverhead should be + * enabled/disabled. For Spark Standalone Mode, memoryOverhead property is skipped. + * @param confValue the value of property "spark.master" + * @return False if the value is a spark standalone. True if the value is not defined or + * set for yarn/Mesos + */ + def enableMemoryOverheadRecommendation(confValue: Option[String]): Boolean = { + confValue match { + case Some(sparkMaster) if sparkMaster.startsWith("spark:") => false + case _ => true + } + } } diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala index 86f0fad3e..cbcf439d9 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala @@ -1230,4 +1230,67 @@ class AutoTunerSuite extends FunSuite with BeforeAndAfterEach with Logging { // Assert recommendations are skipped in comments assert(comments.map(_.comment).forall(autoTuner.selectedPlatform.isValidComment)) } + + // When spark is running as a standalone, the memoryOverhead should not be listed as a + // recommendation: issue-553. 
+ test("memoryOverhead should not be recommended for Spark Standalone") { + // This UT sets a custom spark-property "spark.master" pointing to a spark-standalone value + // The Autotuner should detects that the spark-master is standalone and refrains from + // recommending memoryOverhead value + val customProps = mutable.LinkedHashMap( + "spark.executor.cores" -> "8", + "spark.executor.memory" -> "47222m", + "spark.rapids.shuffle.multiThreaded.reader.threads" -> "8", + "spark.rapids.shuffle.multiThreaded.writer.threads" -> "8", + "spark.rapids.sql.concurrentGpuTasks" -> "2", + "spark.rapids.sql.multiThreadedRead.numThreads" -> "20", + "spark.shuffle.manager" -> "com.nvidia.spark.rapids.spark311.RapidsShuffleManager", + "spark.task.resource.gpu.amount" -> "0.0625") + // mock the properties loaded from eventLog + val logEventsProps: mutable.Map[String, String] = + mutable.LinkedHashMap[String, String]( + "spark.master" -> "spark://HOST_NAME:7077", + "spark.executor.cores" -> "16", + "spark.executor.instances" -> "1", + "spark.executor.memory" -> "80g", + "spark.executor.resource.gpu.amount" -> "1", + "spark.executor.instances" -> "1", + "spark.sql.shuffle.partitions" -> "200", + "spark.sql.files.maxPartitionBytes" -> "1g", + "spark.task.resource.gpu.amount" -> "0.0625", + "spark.rapids.memory.pinnedPool.size" -> "5g", + "spark.rapids.sql.enabled" -> "true", + "spark.plugins" -> "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.concurrentGpuTasks" -> "4") + val dataprocWorkerInfo = buildWorkerInfoAsString(Some(customProps), Some(32), + Some("212992MiB"), Some(5), Some(4), Some("15109MiB"), Some("Tesla T4")) + val infoProvider = getMockInfoProvider(8126464.0, Seq(0), Seq(0.004), logEventsProps, + Some(defaultSparkVersion)) + val autoTuner: AutoTuner = AutoTuner.buildAutoTunerFromProps(dataprocWorkerInfo, infoProvider) + val (properties, comments) = autoTuner.getRecommendedProperties() + val autoTunerOutput = Profiler.getAutoTunerResultsAsString(properties, comments) + // scalastyle:off line.size.limit + val expectedResults = + s"""| + |Spark Properties: + |--conf spark.executor.cores=8 + |--conf spark.executor.instances=20 + |--conf spark.executor.memory=16384m + |--conf spark.rapids.memory.pinnedPool.size=4096m + |--conf spark.rapids.sql.concurrentGpuTasks=2 + |--conf spark.sql.adaptive.advisoryPartitionSizeInBytes=128m + |--conf spark.sql.adaptive.coalescePartitions.minPartitionNum=160 + |--conf spark.sql.files.maxPartitionBytes=4096m + |--conf spark.task.resource.gpu.amount=0.125 + | + |Comments: + |- 'spark.sql.adaptive.advisoryPartitionSizeInBytes' was not set. + |- 'spark.sql.adaptive.coalescePartitions.minPartitionNum' was not set. + |- 'spark.sql.adaptive.enabled' should be enabled for better performance. 
+ |- ${AutoTuner.classPathComments("rapids.jars.missing")} + |- ${AutoTuner.classPathComments("rapids.shuffle.jars")} + |""".stripMargin + // scalastyle:on line.size.limit + assert(expectedResults == autoTunerOutput) + } } From b38a981524d623c407e3f7e0e61dde214be33260 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Sep 2023 16:51:29 +0000 Subject: [PATCH 13/16] Update dev-version by jenkins-spark-rapids-tools-auto-release-40 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- core/pom.xml | 2 +- user_tools/src/spark_rapids_pytools/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index ad8412983..07f4958f8 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-tools_2.12 RAPIDS Accelerator for Apache Spark tools RAPIDS Accelerator for Apache Spark tools - 23.08.1-SNAPSHOT + 23.08.2-SNAPSHOT jar http://github.com/NVIDIA/spark-rapids-tools diff --git a/user_tools/src/spark_rapids_pytools/__init__.py b/user_tools/src/spark_rapids_pytools/__init__.py index ebaf6f580..1c1792bef 100644 --- a/user_tools/src/spark_rapids_pytools/__init__.py +++ b/user_tools/src/spark_rapids_pytools/__init__.py @@ -16,5 +16,5 @@ from spark_rapids_pytools.build import get_version -VERSION = '23.08.1' +VERSION = '23.08.2' __version__ = get_version(VERSION) From 4a50bf1045fdde12889ea050dcd18cb38ca5359e Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Wed, 13 Sep 2023 13:55:15 -0500 Subject: [PATCH 14/16] Fix sdk_monitor exception thrown by abfs protocol (#569) Signed-off-by: Ahmed Hussein (amahussein) Fixes #568 --- user_tools/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/user_tools/pyproject.toml b/user_tools/pyproject.toml index 6bde6fe28..6e50e3a75 100644 --- a/user_tools/pyproject.toml +++ b/user_tools/pyproject.toml @@ -37,6 +37,8 @@ dependencies = [ # this will include numpy "pyarrow==12.0.1", # used for ADLS filesystem implementation + # Issue-568: use 12.17.0 as the new 12.18.0 causes an error in runtime + "azure-storage-blob==12.17.0", "adlfs==2023.4.0" ] dynamic=["entry-points", "version"] From 355f6d236840d706982aaf81ccbb7059ecfb4af3 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Wed, 13 Sep 2023 15:00:31 -0500 Subject: [PATCH 15/16] Rename and change pyrapids to spark_rapids_tools (#570) Signed-off-by: Ahmed Hussein (amahussein) Fixes #567 --- user_tools/pyproject.toml | 2 +- .../spark_rapids_pytools/cloud_api/databricks_aws.py | 2 +- .../spark_rapids_pytools/cloud_api/databricks_azure.py | 2 +- .../src/spark_rapids_pytools/cloud_api/dataproc.py | 2 +- user_tools/src/spark_rapids_pytools/cloud_api/emr.py | 2 +- .../src/spark_rapids_pytools/cloud_api/onprem.py | 2 +- .../src/spark_rapids_pytools/cloud_api/sp_types.py | 2 +- .../src/spark_rapids_pytools/common/prop_manager.py | 2 +- .../src/spark_rapids_pytools/pricing/emr_pricing.py | 2 +- .../src/spark_rapids_pytools/rapids/qualification.py | 2 +- .../src/spark_rapids_pytools/rapids/rapids_tool.py | 2 +- .../src/spark_rapids_pytools/rapids/tool_ctxt.py | 2 +- .../resources/dev/prepackage_mgr.py | 2 +- .../wrappers/databricks_aws_wrapper.py | 2 +- .../wrappers/databricks_azure_wrapper.py | 2 +- .../spark_rapids_pytools/wrappers/dataproc_wrapper.py | 2 +- .../src/spark_rapids_pytools/wrappers/emr_wrapper.py | 2 +- 
.../spark_rapids_pytools/wrappers/onprem_wrapper.py | 2 +- .../src/{pyrapids => spark_rapids_tools}/__init__.py | 0 .../{pyrapids => spark_rapids_tools}/cloud/__init__.py | 0 .../{pyrapids => spark_rapids_tools}/cloud/cluster.py | 0 .../cloud/databricks/__init__.py | 0 .../cloud/databricks/dbcluster.py | 4 ++-- .../cloud/dataproc/__init__.py | 0 .../cloud/dataproc/dataproccluster.py | 4 ++-- .../cloud/emr/__init__.py | 0 .../cloud/emr/emrcluster.py | 4 ++-- .../cloud/onprem/__init__.py | 0 .../cloud/onprem/onpremcluster.py | 6 +++--- .../{pyrapids => spark_rapids_tools}/cmdli/__init__.py | 4 ++-- .../cmdli/argprocessor.py | 8 ++++---- .../cmdli/tools_cli.py} | 8 ++++---- .../src/{pyrapids => spark_rapids_tools}/enums.py | 0 .../src/{pyrapids => spark_rapids_tools}/exceptions.py | 0 .../storagelib/__init__.py | 0 .../storagelib/adls/__init__.py | 0 .../storagelib/adls/adlsfs.py | 0 .../storagelib/adls/adlspath.py | 0 .../storagelib/cspfs.py | 0 .../storagelib/csppath.py | 8 ++++---- .../storagelib/gcs/__init__.py | 0 .../storagelib/gcs/gcsfs.py | 0 .../storagelib/gcs/gcspath.py | 0 .../storagelib/hdfs/__init__.py | 0 .../storagelib/hdfs/hdfsfs.py | 0 .../storagelib/hdfs/hdfspath.py | 0 .../storagelib/local/__init__.py | 0 .../storagelib/local/localfs.py | 0 .../storagelib/local/localpath.py | 0 .../storagelib/s3/__init__.py | 0 .../storagelib/s3/s3fs.py | 2 +- .../storagelib/s3/s3path.py | 0 .../{pyrapids => spark_rapids_tools}/tools/__init__.py | 0 .../tools/autotuner.py | 2 +- .../{pyrapids => spark_rapids_tools}/utils/__init__.py | 0 .../utils/propmanager.py | 6 +++--- .../src/{pyrapids => spark_rapids_tools}/utils/util.py | 6 +++--- .../__init__.py | 0 .../conftest.py | 4 ++-- .../resources/cluster/databricks/aws-cpu-00.json | 0 .../resources/cluster/databricks/azure-cpu-00.json | 0 .../resources/cluster/dataproc/cpu-00.yaml | 0 .../resources/cluster/emr/cpu-00.json | 0 .../resources/cluster/onprem/cpu-00.yaml | 0 .../resources/eventlogs/.gitkeep | 0 .../test_cluster.py | 10 +++++----- .../test_tool_argprocessor.py | 10 +++++----- 67 files changed, 61 insertions(+), 61 deletions(-) rename user_tools/src/{pyrapids => spark_rapids_tools}/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/cluster.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/databricks/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/databricks/dbcluster.py (89%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/dataproc/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/dataproc/dataproccluster.py (83%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/emr/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/emr/emrcluster.py (82%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/onprem/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cloud/onprem/onpremcluster.py (85%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cmdli/__init__.py (92%) rename user_tools/src/{pyrapids => spark_rapids_tools}/cmdli/argprocessor.py (98%) rename user_tools/src/{pyrapids/cmdli/pyrapids_cli.py => spark_rapids_tools/cmdli/tools_cli.py} (97%) rename user_tools/src/{pyrapids => spark_rapids_tools}/enums.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/exceptions.py (100%) rename user_tools/src/{pyrapids => 
spark_rapids_tools}/storagelib/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/adls/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/adls/adlsfs.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/adls/adlspath.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/cspfs.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/csppath.py (97%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/gcs/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/gcs/gcsfs.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/gcs/gcspath.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/hdfs/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/hdfs/hdfsfs.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/hdfs/hdfspath.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/local/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/local/localfs.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/local/localpath.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/s3/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/s3/s3fs.py (95%) rename user_tools/src/{pyrapids => spark_rapids_tools}/storagelib/s3/s3path.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/tools/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/tools/autotuner.py (89%) rename user_tools/src/{pyrapids => spark_rapids_tools}/utils/__init__.py (100%) rename user_tools/src/{pyrapids => spark_rapids_tools}/utils/propmanager.py (94%) rename user_tools/src/{pyrapids => spark_rapids_tools}/utils/util.py (95%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/__init__.py (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/conftest.py (92%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/resources/cluster/databricks/aws-cpu-00.json (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/resources/cluster/databricks/azure-cpu-00.json (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/resources/cluster/dataproc/cpu-00.yaml (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/resources/cluster/emr/cpu-00.json (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/resources/cluster/onprem/cpu-00.yaml (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/resources/eventlogs/.gitkeep (100%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/test_cluster.py (81%) rename user_tools/tests/{pyrapids_unit => spark_rapids_tools_ut}/test_tool_argprocessor.py (96%) diff --git a/user_tools/pyproject.toml b/user_tools/pyproject.toml index 6e50e3a75..0ea4a163c 100644 --- a/user_tools/pyproject.toml +++ b/user_tools/pyproject.toml @@ -45,7 +45,7 @@ dynamic=["entry-points", "version"] [project.scripts] spark_rapids_user_tools = "spark_rapids_pytools.wrapper:main" -ascli = "pyrapids.cmdli.pyrapids_cli:main" +ascli = "spark_rapids_tools.cmdli.tools_cli:main" [tool.setuptools] package-dir = {"" = "src"} diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py index 
aed603d3f..d0fc047e5 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py +++ b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py @@ -18,7 +18,7 @@ from dataclasses import dataclass, field from typing import Any, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.databricks_aws_job import DBAWSLocalRapidsJob from spark_rapids_pytools.cloud_api.emr import EMRNode, EMRPlatform from spark_rapids_pytools.cloud_api.s3storage import S3StorageDriver diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py index c958f23f4..b92b70954 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py +++ b/user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py @@ -20,7 +20,7 @@ from dataclasses import dataclass, field from typing import Any, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.azurestorage import AzureStorageDriver from spark_rapids_pytools.cloud_api.databricks_azure_job import DBAzureLocalRapidsJob from spark_rapids_pytools.cloud_api.sp_types import CMDDriverBase, ClusterBase, ClusterNode, \ diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py index e737441ec..afd81ae7a 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py +++ b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py @@ -19,7 +19,7 @@ from dataclasses import dataclass, field from typing import Any, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.dataproc_job import DataprocLocalRapidsJob from spark_rapids_pytools.cloud_api.gstorage import GStorageDriver from spark_rapids_pytools.cloud_api.sp_types import PlatformBase, CMDDriverBase, \ diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py index ce885ce37..ed67b1bd9 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py +++ b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py @@ -19,7 +19,7 @@ from dataclasses import field, dataclass from typing import Any, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.emr_job import EmrLocalRapidsJob from spark_rapids_pytools.cloud_api.s3storage import S3StorageDriver from spark_rapids_pytools.cloud_api.sp_types import PlatformBase, ClusterBase, CMDDriverBase, \ diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py b/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py index b6656c171..ef96c6420 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py +++ b/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py @@ -18,7 +18,7 @@ from dataclasses import dataclass from typing import Any, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.rapids.rapids_job import RapidsLocalJob from spark_rapids_pytools.cloud_api.sp_types import PlatformBase, ClusterBase, ClusterNode, \ CMDDriverBase, ClusterGetAccessor, GpuDevice, \ diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py index 9dc6331d7..65fb960cc 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py +++ 
b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py @@ -22,7 +22,7 @@ from logging import Logger from typing import Type, Any, List, Callable -from pyrapids import EnumeratedType, CspEnv +from spark_rapids_tools import EnumeratedType, CspEnv from spark_rapids_pytools.common.prop_manager import AbstractPropertiesContainer, JSONPropertiesContainer, \ get_elem_non_safe from spark_rapids_pytools.common.sys_storage import StorageDriver, FSUtil diff --git a/user_tools/src/spark_rapids_pytools/common/prop_manager.py b/user_tools/src/spark_rapids_pytools/common/prop_manager.py index 35864c0fe..fdafcc71f 100644 --- a/user_tools/src/spark_rapids_pytools/common/prop_manager.py +++ b/user_tools/src/spark_rapids_pytools/common/prop_manager.py @@ -22,7 +22,7 @@ import yaml -from pyrapids import get_elem_from_dict, get_elem_non_safe +from spark_rapids_tools import get_elem_from_dict, get_elem_non_safe def convert_dict_to_camel_case(dic: dict): diff --git a/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py b/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py index 73a52844c..6670e6698 100644 --- a/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py +++ b/user_tools/src/spark_rapids_pytools/pricing/emr_pricing.py @@ -16,7 +16,7 @@ from dataclasses import dataclass, field -from pyrapids import get_elem_from_dict, get_elem_non_safe +from spark_rapids_tools import get_elem_from_dict, get_elem_non_safe from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer from spark_rapids_pytools.common.sys_storage import FSUtil from spark_rapids_pytools.pricing.price_provider import PriceProvider diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py index b10b06c13..a8861c6a1 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py +++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py @@ -22,7 +22,7 @@ import pandas as pd from tabulate import tabulate -from pyrapids.enums import QualFilterApp, QualGpuClusterReshapeType +from spark_rapids_tools.enums import QualFilterApp, QualGpuClusterReshapeType from spark_rapids_pytools.cloud_api.sp_types import ClusterReshape, NodeHWInfo from spark_rapids_pytools.common.sys_storage import FSUtil from spark_rapids_pytools.common.utilities import Utils, TemplateGenerator diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index c88867376..4b116c0d7 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -26,7 +26,7 @@ from logging import Logger from typing import Any, Callable, Dict, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import get_platform, \ ClusterBase, DeployMode, NodeHWInfo from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer diff --git a/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py b/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py index c4851a0b1..503547450 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py +++ b/user_tools/src/spark_rapids_pytools/rapids/tool_ctxt.py @@ -21,7 +21,7 @@ from logging import Logger from typing import Type, Any, ClassVar, List -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import PlatformBase from 
spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer from spark_rapids_pytools.common.sys_storage import FSUtil diff --git a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py index 6a8a90057..6d3108ff3 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py +++ b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py @@ -25,7 +25,7 @@ import fire -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer from spark_rapids_pytools.common.sys_storage import FSUtil from spark_rapids_pytools.common.utilities import Utils diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py index 7a91a7b6b..083d5d56f 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py @@ -14,7 +14,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AWS.""" -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py index 2eb1cf8a7..87a9b2891 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py @@ -14,7 +14,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AZURE.""" -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal diff --git a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py index 4fd0ca4fb..05ae9eb60 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py @@ -14,7 +14,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on Dataproc.""" -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging from spark_rapids_pytools.rapids.bootstrap import Bootstrap diff --git a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py index 63069291d..65de11341 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py @@ -14,7 +14,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on AWS-EMR.""" -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging from 
spark_rapids_pytools.rapids.bootstrap import Bootstrap diff --git a/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py index ba4971399..1e4ea0c56 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py @@ -14,7 +14,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on On-Prem cluster.""" -from pyrapids import CspEnv +from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal diff --git a/user_tools/src/pyrapids/__init__.py b/user_tools/src/spark_rapids_tools/__init__.py similarity index 100% rename from user_tools/src/pyrapids/__init__.py rename to user_tools/src/spark_rapids_tools/__init__.py diff --git a/user_tools/src/pyrapids/cloud/__init__.py b/user_tools/src/spark_rapids_tools/cloud/__init__.py similarity index 100% rename from user_tools/src/pyrapids/cloud/__init__.py rename to user_tools/src/spark_rapids_tools/cloud/__init__.py diff --git a/user_tools/src/pyrapids/cloud/cluster.py b/user_tools/src/spark_rapids_tools/cloud/cluster.py similarity index 100% rename from user_tools/src/pyrapids/cloud/cluster.py rename to user_tools/src/spark_rapids_tools/cloud/cluster.py diff --git a/user_tools/src/pyrapids/cloud/databricks/__init__.py b/user_tools/src/spark_rapids_tools/cloud/databricks/__init__.py similarity index 100% rename from user_tools/src/pyrapids/cloud/databricks/__init__.py rename to user_tools/src/spark_rapids_tools/cloud/databricks/__init__.py diff --git a/user_tools/src/pyrapids/cloud/databricks/dbcluster.py b/user_tools/src/spark_rapids_tools/cloud/databricks/dbcluster.py similarity index 89% rename from user_tools/src/pyrapids/cloud/databricks/dbcluster.py rename to user_tools/src/spark_rapids_tools/cloud/databricks/dbcluster.py index 58c950889..4b1c0ffc2 100644 --- a/user_tools/src/pyrapids/cloud/databricks/dbcluster.py +++ b/user_tools/src/spark_rapids_tools/cloud/databricks/dbcluster.py @@ -18,8 +18,8 @@ from typing import ClassVar, Type, Optional -from pyrapids.cloud.cluster import register_client_cluster, register_cluster_prop_mgr, ClusterPropMgr, ClientCluster -from pyrapids.utils.propmanager import PropValidatorSchema +from spark_rapids_tools.cloud.cluster import register_client_cluster, register_cluster_prop_mgr, ClusterPropMgr, ClientCluster +from spark_rapids_tools.utils.propmanager import PropValidatorSchema class DBAwsClusterSchema(PropValidatorSchema): diff --git a/user_tools/src/pyrapids/cloud/dataproc/__init__.py b/user_tools/src/spark_rapids_tools/cloud/dataproc/__init__.py similarity index 100% rename from user_tools/src/pyrapids/cloud/dataproc/__init__.py rename to user_tools/src/spark_rapids_tools/cloud/dataproc/__init__.py diff --git a/user_tools/src/pyrapids/cloud/dataproc/dataproccluster.py b/user_tools/src/spark_rapids_tools/cloud/dataproc/dataproccluster.py similarity index 83% rename from user_tools/src/pyrapids/cloud/dataproc/dataproccluster.py rename to user_tools/src/spark_rapids_tools/cloud/dataproc/dataproccluster.py index 482d18e50..20ebf526a 100644 --- a/user_tools/src/pyrapids/cloud/dataproc/dataproccluster.py +++ b/user_tools/src/spark_rapids_tools/cloud/dataproc/dataproccluster.py @@ -18,8 +18,8 @@ from typing import ClassVar, Type -from pyrapids.cloud.cluster 
import ClientCluster, register_client_cluster, ClusterPropMgr, register_cluster_prop_mgr -from pyrapids.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema +from spark_rapids_tools.cloud.cluster import ClientCluster, register_client_cluster, ClusterPropMgr, register_cluster_prop_mgr +from spark_rapids_tools.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema class DataprocClusterSchema(PropValidatorSchemaCamel): diff --git a/user_tools/src/pyrapids/cloud/emr/__init__.py b/user_tools/src/spark_rapids_tools/cloud/emr/__init__.py similarity index 100% rename from user_tools/src/pyrapids/cloud/emr/__init__.py rename to user_tools/src/spark_rapids_tools/cloud/emr/__init__.py diff --git a/user_tools/src/pyrapids/cloud/emr/emrcluster.py b/user_tools/src/spark_rapids_tools/cloud/emr/emrcluster.py similarity index 82% rename from user_tools/src/pyrapids/cloud/emr/emrcluster.py rename to user_tools/src/spark_rapids_tools/cloud/emr/emrcluster.py index 7c676d074..d94830ca6 100644 --- a/user_tools/src/pyrapids/cloud/emr/emrcluster.py +++ b/user_tools/src/spark_rapids_tools/cloud/emr/emrcluster.py @@ -18,8 +18,8 @@ from typing import ClassVar, Type -from pyrapids.cloud.cluster import register_cluster_prop_mgr, register_client_cluster, ClusterPropMgr, ClientCluster -from pyrapids.utils.propmanager import PropValidatorSchemaUpper, PropValidatorSchema +from spark_rapids_tools.cloud.cluster import register_cluster_prop_mgr, register_client_cluster, ClusterPropMgr, ClientCluster +from spark_rapids_tools.utils.propmanager import PropValidatorSchemaUpper, PropValidatorSchema class EmrClusterSchema(PropValidatorSchemaUpper): diff --git a/user_tools/src/pyrapids/cloud/onprem/__init__.py b/user_tools/src/spark_rapids_tools/cloud/onprem/__init__.py similarity index 100% rename from user_tools/src/pyrapids/cloud/onprem/__init__.py rename to user_tools/src/spark_rapids_tools/cloud/onprem/__init__.py diff --git a/user_tools/src/pyrapids/cloud/onprem/onpremcluster.py b/user_tools/src/spark_rapids_tools/cloud/onprem/onpremcluster.py similarity index 85% rename from user_tools/src/pyrapids/cloud/onprem/onpremcluster.py rename to user_tools/src/spark_rapids_tools/cloud/onprem/onpremcluster.py index 8c6a87214..85e8972d5 100644 --- a/user_tools/src/pyrapids/cloud/onprem/onpremcluster.py +++ b/user_tools/src/spark_rapids_tools/cloud/onprem/onpremcluster.py @@ -19,9 +19,9 @@ from typing import ClassVar, Type from typing_extensions import TypedDict from pydantic import ConfigDict -from pyrapids.cloud.cluster import ClientCluster, ClusterPropMgr, register_cluster_prop_mgr, register_client_cluster -from pyrapids.utils.propmanager import PropValidatorSchema -from pyrapids.utils.util import to_camel_case +from spark_rapids_tools.cloud.cluster import ClientCluster, ClusterPropMgr, register_cluster_prop_mgr, register_client_cluster +from spark_rapids_tools.utils.propmanager import PropValidatorSchema +from spark_rapids_tools.utils.util import to_camel_case class OnPremDriverConfigSchema(TypedDict): diff --git a/user_tools/src/pyrapids/cmdli/__init__.py b/user_tools/src/spark_rapids_tools/cmdli/__init__.py similarity index 92% rename from user_tools/src/pyrapids/cmdli/__init__.py rename to user_tools/src/spark_rapids_tools/cmdli/__init__.py index ed252dbfc..1344d5a37 100644 --- a/user_tools/src/pyrapids/cmdli/__init__.py +++ b/user_tools/src/spark_rapids_tools/cmdli/__init__.py @@ -14,8 +14,8 @@ """init file of the user CLI used to run the tools""" -from .pyrapids_cli import PyRapids +from 
.tools_cli import ToolsCLI __all__ = [ - 'PyRapids' + 'ToolsCLI' ] diff --git a/user_tools/src/pyrapids/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py similarity index 98% rename from user_tools/src/pyrapids/cmdli/argprocessor.py rename to user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index 7a332461a..1803f7602 100644 --- a/user_tools/src/pyrapids/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -25,9 +25,9 @@ from pydantic.dataclasses import dataclass from pydantic_core import PydanticCustomError -from pyrapids.cloud import ClientCluster -from pyrapids.exceptions import IllegalArgumentError -from pyrapids.utils import AbstractPropContainer, is_http_file +from spark_rapids_tools.cloud import ClientCluster +from spark_rapids_tools.exceptions import IllegalArgumentError +from spark_rapids_tools.utils import AbstractPropContainer, is_http_file from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging from spark_rapids_pytools.rapids.qualification import QualGpuClusterReshapeType @@ -105,7 +105,7 @@ class AbsToolUserArgModel: 'meta': {}, 'toolArgs': {} }) - logger: ClassVar[Logger] = ToolLogging.get_and_setup_logger('pyrapids.argparser') + logger: ClassVar[Logger] = ToolLogging.get_and_setup_logger('spark_rapids_tools.argparser') tool_name: ClassVar[str] = None @classmethod diff --git a/user_tools/src/pyrapids/cmdli/pyrapids_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py similarity index 97% rename from user_tools/src/pyrapids/cmdli/pyrapids_cli.py rename to user_tools/src/spark_rapids_tools/cmdli/tools_cli.py index 45d6ff799..001472e36 100644 --- a/user_tools/src/pyrapids/cmdli/pyrapids_cli.py +++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py @@ -17,15 +17,15 @@ import fire -from pyrapids.enums import QualGpuClusterReshapeType -from pyrapids.utils.util import gen_app_banner +from spark_rapids_tools.enums import QualGpuClusterReshapeType +from spark_rapids_tools.utils.util import gen_app_banner from spark_rapids_pytools.rapids.bootstrap import Bootstrap from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal from spark_rapids_pytools.rapids.qualification import QualificationAsLocal from .argprocessor import AbsToolUserArgModel -class PyRapids(object): # pylint: disable=too-few-public-methods +class ToolsCLI(object): # pylint: disable=too-few-public-methods """CLI that provides a runtime environment that simplifies running cost and performance analysis using the RAPIDS Accelerator for Apache Spark. 
@@ -167,7 +167,7 @@ def main(): # Make Python Fire not use a pager when it prints a help text fire.core.Display = lambda lines, out: out.write('\n'.join(lines) + '\n') print(gen_app_banner()) - fire.Fire(PyRapids()) + fire.Fire(ToolsCLI()) if __name__ == '__main__': diff --git a/user_tools/src/pyrapids/enums.py b/user_tools/src/spark_rapids_tools/enums.py similarity index 100% rename from user_tools/src/pyrapids/enums.py rename to user_tools/src/spark_rapids_tools/enums.py diff --git a/user_tools/src/pyrapids/exceptions.py b/user_tools/src/spark_rapids_tools/exceptions.py similarity index 100% rename from user_tools/src/pyrapids/exceptions.py rename to user_tools/src/spark_rapids_tools/exceptions.py diff --git a/user_tools/src/pyrapids/storagelib/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/__init__.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/__init__.py rename to user_tools/src/spark_rapids_tools/storagelib/__init__.py diff --git a/user_tools/src/pyrapids/storagelib/adls/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/adls/__init__.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/adls/__init__.py rename to user_tools/src/spark_rapids_tools/storagelib/adls/__init__.py diff --git a/user_tools/src/pyrapids/storagelib/adls/adlsfs.py b/user_tools/src/spark_rapids_tools/storagelib/adls/adlsfs.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/adls/adlsfs.py rename to user_tools/src/spark_rapids_tools/storagelib/adls/adlsfs.py diff --git a/user_tools/src/pyrapids/storagelib/adls/adlspath.py b/user_tools/src/spark_rapids_tools/storagelib/adls/adlspath.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/adls/adlspath.py rename to user_tools/src/spark_rapids_tools/storagelib/adls/adlspath.py diff --git a/user_tools/src/pyrapids/storagelib/cspfs.py b/user_tools/src/spark_rapids_tools/storagelib/cspfs.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/cspfs.py rename to user_tools/src/spark_rapids_tools/storagelib/cspfs.py diff --git a/user_tools/src/pyrapids/storagelib/csppath.py b/user_tools/src/spark_rapids_tools/storagelib/csppath.py similarity index 97% rename from user_tools/src/pyrapids/storagelib/csppath.py rename to user_tools/src/spark_rapids_tools/storagelib/csppath.py index 923899b0c..50d1b8c2d 100644 --- a/user_tools/src/pyrapids/storagelib/csppath.py +++ b/user_tools/src/spark_rapids_tools/storagelib/csppath.py @@ -190,18 +190,18 @@ class CspPath(metaclass=CspPathMeta): Create a new path subclass from a gcs URI: >>> gs_path = CspPath('gs://bucket-name/folder_00/subfolder_01') - + or from S3 URI: >>> s3_path = CspPath('s3://bucket-name/folder_00/subfolder_01') - + or from local file URI: >>> local_path1, local_path2 = (CspPath('~/my_folder'), CspPath('file:///my_folder')) - + Print the data from the file with `open_input_file()`: diff --git a/user_tools/src/pyrapids/storagelib/gcs/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/gcs/__init__.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/gcs/__init__.py rename to user_tools/src/spark_rapids_tools/storagelib/gcs/__init__.py diff --git a/user_tools/src/pyrapids/storagelib/gcs/gcsfs.py b/user_tools/src/spark_rapids_tools/storagelib/gcs/gcsfs.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/gcs/gcsfs.py rename to user_tools/src/spark_rapids_tools/storagelib/gcs/gcsfs.py diff --git a/user_tools/src/pyrapids/storagelib/gcs/gcspath.py 
b/user_tools/src/spark_rapids_tools/storagelib/gcs/gcspath.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/gcs/gcspath.py rename to user_tools/src/spark_rapids_tools/storagelib/gcs/gcspath.py diff --git a/user_tools/src/pyrapids/storagelib/hdfs/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/hdfs/__init__.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/hdfs/__init__.py rename to user_tools/src/spark_rapids_tools/storagelib/hdfs/__init__.py diff --git a/user_tools/src/pyrapids/storagelib/hdfs/hdfsfs.py b/user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfsfs.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/hdfs/hdfsfs.py rename to user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfsfs.py diff --git a/user_tools/src/pyrapids/storagelib/hdfs/hdfspath.py b/user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfspath.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/hdfs/hdfspath.py rename to user_tools/src/spark_rapids_tools/storagelib/hdfs/hdfspath.py diff --git a/user_tools/src/pyrapids/storagelib/local/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/local/__init__.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/local/__init__.py rename to user_tools/src/spark_rapids_tools/storagelib/local/__init__.py diff --git a/user_tools/src/pyrapids/storagelib/local/localfs.py b/user_tools/src/spark_rapids_tools/storagelib/local/localfs.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/local/localfs.py rename to user_tools/src/spark_rapids_tools/storagelib/local/localfs.py diff --git a/user_tools/src/pyrapids/storagelib/local/localpath.py b/user_tools/src/spark_rapids_tools/storagelib/local/localpath.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/local/localpath.py rename to user_tools/src/spark_rapids_tools/storagelib/local/localpath.py diff --git a/user_tools/src/pyrapids/storagelib/s3/__init__.py b/user_tools/src/spark_rapids_tools/storagelib/s3/__init__.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/s3/__init__.py rename to user_tools/src/spark_rapids_tools/storagelib/s3/__init__.py diff --git a/user_tools/src/pyrapids/storagelib/s3/s3fs.py b/user_tools/src/spark_rapids_tools/storagelib/s3/s3fs.py similarity index 95% rename from user_tools/src/pyrapids/storagelib/s3/s3fs.py rename to user_tools/src/spark_rapids_tools/storagelib/s3/s3fs.py index e64922acc..b4054cf0a 100644 --- a/user_tools/src/pyrapids/storagelib/s3/s3fs.py +++ b/user_tools/src/spark_rapids_tools/storagelib/s3/s3fs.py @@ -15,7 +15,7 @@ """Wrapper for the S3 File system""" -from pyrapids.storagelib.cspfs import register_fs_class, CspFs +from spark_rapids_tools.storagelib.cspfs import register_fs_class, CspFs @register_fs_class('s3', 'S3FileSystem') diff --git a/user_tools/src/pyrapids/storagelib/s3/s3path.py b/user_tools/src/spark_rapids_tools/storagelib/s3/s3path.py similarity index 100% rename from user_tools/src/pyrapids/storagelib/s3/s3path.py rename to user_tools/src/spark_rapids_tools/storagelib/s3/s3path.py diff --git a/user_tools/src/pyrapids/tools/__init__.py b/user_tools/src/spark_rapids_tools/tools/__init__.py similarity index 100% rename from user_tools/src/pyrapids/tools/__init__.py rename to user_tools/src/spark_rapids_tools/tools/__init__.py diff --git a/user_tools/src/pyrapids/tools/autotuner.py b/user_tools/src/spark_rapids_tools/tools/autotuner.py similarity index 89% rename from 
user_tools/src/pyrapids/tools/autotuner.py rename to user_tools/src/spark_rapids_tools/tools/autotuner.py index b65ff094a..a390fefad 100644 --- a/user_tools/src/pyrapids/tools/autotuner.py +++ b/user_tools/src/spark_rapids_tools/tools/autotuner.py @@ -16,7 +16,7 @@ from typing import Optional, ClassVar, Type -from pyrapids.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema, AbstractPropContainer +from spark_rapids_tools.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema, AbstractPropContainer class AutoTunerInputSchema(PropValidatorSchemaCamel): diff --git a/user_tools/src/pyrapids/utils/__init__.py b/user_tools/src/spark_rapids_tools/utils/__init__.py similarity index 100% rename from user_tools/src/pyrapids/utils/__init__.py rename to user_tools/src/spark_rapids_tools/utils/__init__.py diff --git a/user_tools/src/pyrapids/utils/propmanager.py b/user_tools/src/spark_rapids_tools/utils/propmanager.py similarity index 94% rename from user_tools/src/pyrapids/utils/propmanager.py rename to user_tools/src/spark_rapids_tools/utils/propmanager.py index fbe55d0b5..c7dd6d822 100644 --- a/user_tools/src/pyrapids/utils/propmanager.py +++ b/user_tools/src/spark_rapids_tools/utils/propmanager.py @@ -23,9 +23,9 @@ import yaml from pydantic import BaseModel, ConfigDict, model_validator, ValidationError -from pyrapids.exceptions import JsonLoadException, YamlLoadException, InvalidPropertiesSchema -from pyrapids.storagelib.csppath import CspPath, CspPathT -from pyrapids.utils.util import to_camel_case, to_camel_capital_case, get_elem_from_dict, get_elem_non_safe +from spark_rapids_tools.exceptions import JsonLoadException, YamlLoadException, InvalidPropertiesSchema +from spark_rapids_tools.storagelib.csppath import CspPath, CspPathT +from spark_rapids_tools.utils.util import to_camel_case, to_camel_capital_case, get_elem_from_dict, get_elem_non_safe def load_json(file_path: Union[str, CspPathT]) -> Any: diff --git a/user_tools/src/pyrapids/utils/util.py b/user_tools/src/spark_rapids_tools/utils/util.py similarity index 95% rename from user_tools/src/pyrapids/utils/util.py rename to user_tools/src/spark_rapids_tools/utils/util.py index 07bf2439b..8d1676979 100644 --- a/user_tools/src/pyrapids/utils/util.py +++ b/user_tools/src/spark_rapids_tools/utils/util.py @@ -26,7 +26,7 @@ from pydantic import ValidationError, AnyHttpUrl, TypeAdapter import spark_rapids_pytools -from pyrapids.exceptions import CspPathAttributeError +from spark_rapids_tools.exceptions import CspPathAttributeError def get_elem_from_dict(data, keys): @@ -87,8 +87,8 @@ def to_snake_case(word: str) -> str: def dump_tool_usage(tool_name: Optional[str], raise_sys_exit: Optional[bool] = True): - imported_module = __import__('pyrapids.cmdli', globals(), locals(), ['PyRapids']) - wrapper_clzz = getattr(imported_module, 'PyRapids') + imported_module = __import__('spark_rapids_tools.cmdli', globals(), locals(), ['ToolsCLI']) + wrapper_clzz = getattr(imported_module, 'ToolsCLI') help_name = 'ascli' usage_cmd = f'{tool_name} --help' try: diff --git a/user_tools/tests/pyrapids_unit/__init__.py b/user_tools/tests/spark_rapids_tools_ut/__init__.py similarity index 100% rename from user_tools/tests/pyrapids_unit/__init__.py rename to user_tools/tests/spark_rapids_tools_ut/__init__.py diff --git a/user_tools/tests/pyrapids_unit/conftest.py b/user_tools/tests/spark_rapids_tools_ut/conftest.py similarity index 92% rename from user_tools/tests/pyrapids_unit/conftest.py rename to 
user_tools/tests/spark_rapids_tools_ut/conftest.py index 0b886a042..dbb5fb383 100644 --- a/user_tools/tests/pyrapids_unit/conftest.py +++ b/user_tools/tests/spark_rapids_tools_ut/conftest.py @@ -25,7 +25,7 @@ def get_test_resources_path(): import importlib_resources else: import importlib.resources as importlib_resources - pkg = importlib_resources.files('tests.pyrapids_unit') + pkg = importlib_resources.files('tests.spark_rapids_tools_ut') return pkg / 'resources' @@ -47,7 +47,7 @@ def gen_cpu_cluster_props(): all_csps = csps + ['onprem'] -class PyrapidsUnitTest: # pylint: disable=too-few-public-methods +class SparkRapidsToolsUT: # pylint: disable=too-few-public-methods @pytest.fixture(autouse=True) def get_ut_data_dir(self): diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/databricks/aws-cpu-00.json b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/aws-cpu-00.json similarity index 100% rename from user_tools/tests/pyrapids_unit/resources/cluster/databricks/aws-cpu-00.json rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/aws-cpu-00.json diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/databricks/azure-cpu-00.json b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/azure-cpu-00.json similarity index 100% rename from user_tools/tests/pyrapids_unit/resources/cluster/databricks/azure-cpu-00.json rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/databricks/azure-cpu-00.json diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/dataproc/cpu-00.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/dataproc/cpu-00.yaml similarity index 100% rename from user_tools/tests/pyrapids_unit/resources/cluster/dataproc/cpu-00.yaml rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/dataproc/cpu-00.yaml diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/emr/cpu-00.json b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/emr/cpu-00.json similarity index 100% rename from user_tools/tests/pyrapids_unit/resources/cluster/emr/cpu-00.json rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/emr/cpu-00.json diff --git a/user_tools/tests/pyrapids_unit/resources/cluster/onprem/cpu-00.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/cluster/onprem/cpu-00.yaml similarity index 100% rename from user_tools/tests/pyrapids_unit/resources/cluster/onprem/cpu-00.yaml rename to user_tools/tests/spark_rapids_tools_ut/resources/cluster/onprem/cpu-00.yaml diff --git a/user_tools/tests/pyrapids_unit/resources/eventlogs/.gitkeep b/user_tools/tests/spark_rapids_tools_ut/resources/eventlogs/.gitkeep similarity index 100% rename from user_tools/tests/pyrapids_unit/resources/eventlogs/.gitkeep rename to user_tools/tests/spark_rapids_tools_ut/resources/eventlogs/.gitkeep diff --git a/user_tools/tests/pyrapids_unit/test_cluster.py b/user_tools/tests/spark_rapids_tools_ut/test_cluster.py similarity index 81% rename from user_tools/tests/pyrapids_unit/test_cluster.py rename to user_tools/tests/spark_rapids_tools_ut/test_cluster.py index c6e6cbf03..3524c64da 100644 --- a/user_tools/tests/pyrapids_unit/test_cluster.py +++ b/user_tools/tests/spark_rapids_tools_ut/test_cluster.py @@ -16,13 +16,13 @@ import pytest # pylint: disable=import-error -from pyrapids import CspPath -from pyrapids.cloud import ClientCluster -from pyrapids.exceptions import InvalidPropertiesSchema -from .conftest import PyrapidsUnitTest, all_cpu_cluster_props +from spark_rapids_tools import 
CspPath +from spark_rapids_tools.cloud import ClientCluster +from spark_rapids_tools.exceptions import InvalidPropertiesSchema +from .conftest import SparkRapidsToolsUT, all_cpu_cluster_props -class TestClusterCSP(PyrapidsUnitTest): # pylint: disable=too-few-public-methods +class TestClusterCSP(SparkRapidsToolsUT): # pylint: disable=too-few-public-methods """ Class testing identifying the cluster type by comparing the properties to the defined Schema diff --git a/user_tools/tests/pyrapids_unit/test_tool_argprocessor.py b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py similarity index 96% rename from user_tools/tests/pyrapids_unit/test_tool_argprocessor.py rename to user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py index 1dca5ba9f..04ce37aa3 100644 --- a/user_tools/tests/pyrapids_unit/test_tool_argprocessor.py +++ b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py @@ -21,10 +21,10 @@ import fire import pytest # pylint: disable=import-error -from pyrapids import CspEnv -from pyrapids.cmdli.argprocessor import AbsToolUserArgModel, ArgValueCase -from pyrapids.enums import QualFilterApp -from .conftest import PyrapidsUnitTest, all_cpu_cluster_props, csp_cpu_cluster_props, csps +from spark_rapids_tools import CspEnv +from spark_rapids_tools.cmdli.argprocessor import AbsToolUserArgModel, ArgValueCase +from spark_rapids_tools.enums import QualFilterApp +from .conftest import SparkRapidsToolsUT, all_cpu_cluster_props, csp_cpu_cluster_props, csps @dataclasses.dataclass @@ -55,7 +55,7 @@ def decorator(func_cb: Callable): return decorator -class TestToolArgProcessor(PyrapidsUnitTest): # pylint: disable=too-few-public-methods +class TestToolArgProcessor(SparkRapidsToolsUT): # pylint: disable=too-few-public-methods """ Class testing toolArgProcessor functionalities """ From 5c3da28112686e8db43f1433db615fd4bf20140c Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Tue, 19 Sep 2023 17:54:16 -0700 Subject: [PATCH 16/16] [FEA] Remove URLs from pydantic error messages (#560) * replaced IllegalArgumentError with PydanticCustomError for improved error message Signed-off-by: cindyyuanjiang * refactored pydantic err message Signed-off-by: cindyyuanjiang * merge conflict Signed-off-by: cindyyuanjiang * fixed incorrect imports Signed-off-by: cindyyuanjiang --------- Signed-off-by: cindyyuanjiang --- .../spark_rapids_tools/cmdli/argprocessor.py | 31 ++++++++++--------- .../src/spark_rapids_tools/exceptions.py | 4 --- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index 1803f7602..9a0617a89 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -26,7 +26,6 @@ from pydantic_core import PydanticCustomError from spark_rapids_tools.cloud import ClientCluster -from spark_rapids_tools.exceptions import IllegalArgumentError from spark_rapids_tools.utils import AbstractPropContainer, is_http_file from spark_rapids_pytools.cloud_api.sp_types import DeployMode from spark_rapids_pytools.common.utilities import ToolLogging @@ -115,7 +114,7 @@ def create_tool_args(cls, tool_name: str, *args: Any, **kwargs: Any) -> Optional impl_class = impl_entry.validator_class new_obj = impl_class(*args, **kwargs) return new_obj.build_tools_args() - except (ValidationError, IllegalArgumentError) as e: + except (ValidationError, 
PydanticCustomError) as e: impl_class.logger.error('Validation err: %s\n', e) dump_tool_usage(impl_class.tool_name) return None @@ -126,8 +125,9 @@ def get_eventlogs(self) -> Optional[str]: return None def raise_validation_exception(self, validation_err: str): - raise IllegalArgumentError( - f'Invalid arguments: {validation_err}') + raise PydanticCustomError( + 'invalid_argument', + f'{validation_err}\n Error:') def determine_cluster_arg_type(self) -> ArgValueCase: # self.cluster is provided. then we need to verify that the expected files are there @@ -137,13 +137,14 @@ def determine_cluster_arg_type(self) -> ArgValueCase: # the file cannot be a http_url if is_http_file(self.cluster): # we do not accept http://urls - raise IllegalArgumentError( - f'Cluster properties cannot be a web URL path: {self.cluster}') + raise PydanticCustomError( + 'invalid_argument', + f'Cluster properties cannot be a web URL path: {self.cluster}\n Error:') cluster_case = ArgValueCase.VALUE_B else: raise PydanticCustomError( 'file_path', - 'Cluster property file is not in valid format {.json, .yaml, or .yml}') + 'Cluster property file is not in valid format {.json, .yaml, or .yml}\n Error:') else: cluster_case = ArgValueCase.VALUE_A return cluster_case @@ -167,8 +168,9 @@ def detect_platform_from_eventlogs_prefix(self): def validate_onprem_with_cluster_name(self): if self.platform == CspEnv.ONPREM: - raise IllegalArgumentError( - f'Invalid arguments: Cannot run cluster by name with platform [{CspEnv.ONPREM}]') + raise PydanticCustomError( + 'invalid_argument', + f'Cannot run cluster by name with platform [{CspEnv.ONPREM}]\n Error:') def init_extra_arg_cases(self) -> list: return [] @@ -229,8 +231,9 @@ def post_platform_assignment_validation(self, assigned_platform): if self.argv_cases[1] == ArgValueCase.VALUE_A: if assigned_platform == CspEnv.ONPREM: # it is not allowed to run cluster_by_name on an OnPrem platform - raise IllegalArgumentError( - f'Invalid arguments: Cannot run cluster by name with platform [{CspEnv.ONPREM}]') + raise PydanticCustomError( + 'invalid_argument', + f'Cannot run cluster by name with platform [{CspEnv.ONPREM}]\n Error:') @dataclass @@ -356,10 +359,10 @@ def build_tools_args(self) -> dict: self.p_args['toolArgs']['targetPlatform'] = None else: if not self.p_args['toolArgs']['targetPlatform'] in equivalent_pricing_list: - raise IllegalArgumentError( - 'Invalid arguments: ' + raise PydanticCustomError( + 'invalid_argument', f'The platform [{self.p_args["toolArgs"]["targetPlatform"]}] is currently ' - f'not supported to calculate savings from [{runtime_platform}] cluster') + f'not supported to calculate savings from [{runtime_platform}] cluster\n Error:') else: # target platform is not set, then we disable cost savings if the runtime platform if # onprem diff --git a/user_tools/src/spark_rapids_tools/exceptions.py b/user_tools/src/spark_rapids_tools/exceptions.py index 893fc0f84..d9acbb156 100644 --- a/user_tools/src/spark_rapids_tools/exceptions.py +++ b/user_tools/src/spark_rapids_tools/exceptions.py @@ -70,7 +70,3 @@ def __init__(self, msg: str, pydantic_err: Optional[ValidationError] = None): content.append(str.join('. ', single_err)) self.message = str.join('\n', content) super().__init__(self.message) - - -class IllegalArgumentError(CspPathException, ValueError): - pass
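
Note on the pattern in PATCH 16: raising `PydanticCustomError` with a project-specific error type (here `invalid_argument`) keeps the reported message limited to the supplied text, whereas pydantic's built-in error types get a "For further information visit https://errors.pydantic.dev/..." hint appended when the error is rendered. Below is a minimal, self-contained sketch of the idea, assuming pydantic>=2 is installed; the `validate_platform` helper and its message are illustrative only and not part of the patch.

```python
from pydantic_core import PydanticCustomError


def validate_platform(platform: str) -> None:
    # Raise a custom error type so that pydantic does not attach a
    # documentation URL to the message it reports.
    if platform == 'onprem':
        raise PydanticCustomError(
            'invalid_argument',
            f'Cannot run cluster by name with platform [{platform}]')


try:
    validate_platform('onprem')
except PydanticCustomError as err:
    print(err)  # -> Cannot run cluster by name with platform [onprem]
```

Callers such as `create_tool_args()` can then catch `PydanticCustomError` alongside `ValidationError`, as the patch does, and dump the tool usage instead of surfacing a raw traceback.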