diff --git a/core/src/main/resources/operatorsScore-databricks-aws.csv b/core/src/main/resources/operatorsScore-databricks-aws.csv
index 394437c13..afd6848ce 100644
--- a/core/src/main/resources/operatorsScore-databricks-aws.csv
+++ b/core/src/main/resources/operatorsScore-databricks-aws.csv
@@ -176,6 +176,7 @@ Not,2.45
 NthValue,2.45
 OctetLength,2.45
 Or,2.45
+ParseUrl,2.45
 Percentile,2.45
 PercentRank,2.45
 PivotFirst,2.45
diff --git a/core/src/main/resources/operatorsScore-databricks-azure.csv b/core/src/main/resources/operatorsScore-databricks-azure.csv
index 86daf247d..6b7396c0d 100644
--- a/core/src/main/resources/operatorsScore-databricks-azure.csv
+++ b/core/src/main/resources/operatorsScore-databricks-azure.csv
@@ -176,6 +176,7 @@ Not,2.73
 NthValue,2.73
 OctetLength,2.73
 Or,2.73
+ParseUrl,2.73
 Percentile,2.73
 PercentRank,2.73
 PivotFirst,2.73
diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv
index e1d3678d4..204866e8a 100644
--- a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv
@@ -168,6 +168,7 @@ NormalizeNaNAndZero,3.74
 Not,3.74
 NthValue,3.74
 OctetLength,3.74
+ParseUrl,3.74
 Or,3.74
 Percentile,3.74
 PercentRank,3.74
diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv
index 2777068b7..bf6395056 100644
--- a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv
@@ -168,6 +168,7 @@ NormalizeNaNAndZero,3.65
 Not,3.65
 NthValue,3.65
 OctetLength,3.65
+ParseUrl,3.65
 Or,3.65
 Percentile,3.65
 PercentRank,3.65
diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv
index ad371bb8f..251c67c9f 100644
--- a/core/src/main/resources/operatorsScore-dataproc-l4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv
@@ -176,6 +176,7 @@ Not,4.16
 NthValue,4.16
 OctetLength,4.16
 Or,4.16
+ParseUrl,4.16
 Percentile,4.16
 PercentRank,4.16
 PivotFirst,4.16
diff --git a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv
index fbe0d057e..eb22ba760 100644
--- a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv
@@ -169,6 +169,7 @@ Not,4.25
 NthValue,4.25
 OctetLength,4.25
 Or,4.25
+ParseUrl,4.25
 Percentile,4.25
 PercentRank,4.25
 PivotFirst,4.25
diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv
index c6f3a7654..789829e69 100644
--- a/core/src/main/resources/operatorsScore-dataproc-t4.csv
+++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv
@@ -176,6 +176,7 @@ Not,4.88
 NthValue,4.88
 OctetLength,4.88
 Or,4.88
+ParseUrl,4.88
 Percentile,4.88
 PercentRank,4.88
 PivotFirst,4.88
diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv
index 91a2a840d..600614084 100644
--- a/core/src/main/resources/operatorsScore-emr-a10.csv
+++ b/core/src/main/resources/operatorsScore-emr-a10.csv
@@ -176,6 +176,7 @@ Not,2.59
 NthValue,2.59
 OctetLength,2.59
 Or,2.59
+ParseUrl,2.59
 Percentile,2.59
 PercentRank,2.59
 PivotFirst,2.59
diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv
index 8d34a914c..bf4c818ae 100644
--- a/core/src/main/resources/operatorsScore-emr-t4.csv
+++ b/core/src/main/resources/operatorsScore-emr-t4.csv
@@ -176,6 +176,7 @@ Not,2.07
 NthValue,2.07
 OctetLength,2.07
 Or,2.07
+ParseUrl,2.07
 Percentile,2.07
 PercentRank,2.07
 PivotFirst,2.07
diff --git a/core/src/main/resources/operatorsScore-onprem.csv b/core/src/main/resources/operatorsScore-onprem.csv
index 50ec61028..d5638c96e 100644
--- a/core/src/main/resources/operatorsScore-onprem.csv
+++ b/core/src/main/resources/operatorsScore-onprem.csv
@@ -181,6 +181,7 @@ Not,4
 NthValue,4
 OctetLength,4
 Or,4
+ParseUrl,4
 Percentile,4
 PercentRank,4
 PivotFirst,4
diff --git a/core/src/main/resources/supportedExprs.csv b/core/src/main/resources/supportedExprs.csv
index f6d4ee1fc..c5f273891 100644
--- a/core/src/main/resources/supportedExprs.csv
+++ b/core/src/main/resources/supportedExprs.csv
@@ -382,6 +382,10 @@ Or,S,`or`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
 Or,S,`or`,None,AST,lhs,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
 Or,S,`or`,None,AST,rhs,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
 Or,S,`or`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+ParseUrl,S,`parse_url`,None,project,url,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
+ParseUrl,S,`parse_url`,None,project,partToExtract,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
+ParseUrl,S,`parse_url`,None,project,key,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
+ParseUrl,S,`parse_url`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
 PercentRank,S,`percent_rank`,None,window,ordering,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NS,NS,NS
 PercentRank,S,`percent_rank`,None,window,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
 Pmod,S,`pmod`,None,project,lhs,NA,S,S,S,S,S,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA
@@ -539,7 +543,7 @@ StringTrimLeft,S,`ltrim`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,
 StringTrimRight,S,`rtrim`,None,project,src,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
 StringTrimRight,S,`rtrim`,None,project,trimStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
 StringTrimRight,S,`rtrim`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
-StructsToJson,NS,`to_json`,This is disabled by default because to_json support is experimental. See compatibility guide for more information.,project,struct,S,S,S,S,S,S,S,NA,NA,S,NA,NA,NA,NA,S,S,S,NA
+StructsToJson,NS,`to_json`,This is disabled by default because to_json support is experimental. See compatibility guide for more information.,project,struct,S,S,S,S,S,S,S,S,PS,S,S,NA,NA,NA,PS,PS,PS,NA
 StructsToJson,NS,`to_json`,This is disabled by default because to_json support is experimental. See compatibility guide for more information.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
 Substring,S,`substr`; `substring`,None,project,str,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NS,NA,NA,NA,NA,NA
 Substring,S,`substr`; `substring`,None,project,pos,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
index b809d9521..47b38d847 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
@@ -381,31 +381,48 @@ object SQLPlanParser extends Logging {
     funcName
   }
 
-  private def getAllFunctionNames(functionPattern: Regex, expr: String,
+  // This method aims at doing some common processing to an expression before
+  // we start parsing it. For example, some special handling is required for some functions.
+  private def processSpecialFunctions(expr: String): String = {
+    // For parse_url, we only support parse_url(*,Host,*); parse_url(*,Protocol,*)
+    // So we want to be able to define that parse_url(*,QUERY,*) is not supported.
+
+    // The following regex uses forward references to find matches for parse_url(*)
+    // we need to use forward references because otherwise multiple occurrences will be matched
+    // only once.
+    // https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-a-
+    // regex-without-using-recursion-or/47162099#47162099
+    // example parse_url:
+    // Project [url_col#7, parse_url(url_col#7, HOST, false) AS HOST#9,
+    // parse_url(url_col#7, QUERY, false) AS QUERY#10]
+    val parseURLPattern = ("parse_url(?=\\()(?:(?=.*?\\((?!.*?\\1)(.*\\)(?!.*\\2).*))(?=.*?\\)" +
+      "(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^(]*(?=\\2$)").r
+    var newExpr = expr
+    parseURLPattern.findAllMatchIn(expr).foreach { parse_call =>
+      // iterate on all matches replacing parse_url by parse_url_query
+      // note that we do replaceFirst because we want to map 1-to-1 and the order does
+      // not matter here.
+      if (parse_call.matched.matches("parse_url\\(.*,\\s*(?i)query\\s*,.*\\)")) {
+        newExpr = newExpr.replaceFirst("parse_url\\(", "parse_url_query(")
+      }
+    }
+    newExpr
+  }
+
+  private def getAllFunctionNames(regPattern: Regex, expr: String,
       groupInd: Int = 1): Set[String] = {
     // Returns all matches in an expression. This can be used when the SQL expression is not
     // tokenized.
-    functionPattern.findAllMatchIn(expr).map(_.group(groupInd)).toSet.filterNot(ignoreExpression(_))
+    val newExpr = processSpecialFunctions(expr)
+    regPattern.findAllMatchIn(newExpr).map(_.group(groupInd)).toSet.filterNot(ignoreExpression(_))
   }
 
   def parseProjectExpressions(exprStr: String): Array[String] = {
-    val parsedExpressions = ArrayBuffer[String]()
     // Project [cast(value#136 as string) AS value#144, CEIL(value#136) AS CEIL(value)#143L]
     // This is to split the string such that only function names are extracted. The pattern is
     // such that function name is succeeded by `(`. We use regex to extract all the function names
     // below:
-    // paranRemoved = Array(cast(value#136 as string), CEIL(value#136))
-    val pattern: Regex = "([a-zA-Z0-9_]+)\\(".r
-    val functionNamePattern: Regex = """(\w+)""".r
-    val paranRemoved = pattern.findAllMatchIn(exprStr).toArray.map(_.group(1))
-    paranRemoved.foreach { case expr =>
-      val functionName = getFunctionName(functionNamePattern, expr)
-      functionName match {
-        case Some(func) => parsedExpressions += func
-        case _ => // NO OP
-      }
-    }
-    parsedExpressions.distinct.toArray
+    getAllFunctionNames(functionPrefixPattern, exprStr).toArray
   }
 
   // This parser is used for SortAggregateExec, HashAggregateExec and ObjectHashAggregateExec
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala
index 0d8806237..a5a4ae4c2 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala
@@ -687,7 +687,7 @@ class SQLPlanParserSuite extends BaseTestSuite {
     }
   }
 
-  test("WindowExec and expressions within WIndowExec") {
+  test("WindowExec and expressions within WindowExec") {
     TrampolineUtil.withTempDir { eventLogDir =>
       val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "sqlmetric") { spark =>
         import spark.implicits._
@@ -1047,6 +1047,37 @@ class SQLPlanParserSuite extends BaseTestSuite {
     }
   }
 
+  test("ParseUrl is supported except that for parse_url_query") {
+    // parse_url(*,QUERY,*) should cause the project to be unsupported
+    // the expression will appear in the unsupportedExpression summary
+    TrampolineUtil.withTempDir { parquetoutputLoc =>
+      TrampolineUtil.withTempDir { eventLogDir =>
+        val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir,
+          "ParseURL") { spark =>
+          import spark.implicits._
+          val df1 = Seq("https://spark.apache.org/downloads.html?query=50",
+            "https://docs.nvidia.com/spark-rapids/user-guide/23.12/spark-profiling-tool.html"
+          ).toDF("url_col")
+          // write df1 to parquet to transform LocalTableScan to ProjectExec
+          df1.write.parquet(s"$parquetoutputLoc/testparse")
+          val df2 = spark.read.parquet(s"$parquetoutputLoc/testparse")
+          df2.selectExpr("*", "parse_url(`url_col`, 'HOST') as HOST",
+            "parse_url(`url_col`,'QUERY') as QUERY")
+        }
+        val pluginTypeChecker = new PluginTypeChecker()
+        val app = createAppFromEventlog(eventLog)
+
+        assert(app.sqlPlans.size == 2)
+        val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
+          SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
+        }
+        val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq)
+        val projects = allExecInfo.filter(_.exec.contains("Project"))
+        assertSizeAndNotSupported(1, projects)
+      }
+    }
+  }
+
   test("xxhash64 is supported in ProjectExec") {
     TrampolineUtil.withTempDir { parquetoutputLoc =>
       TrampolineUtil.withTempDir { eventLogDir =>
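
A minimal, self-contained sketch of the idea behind processSpecialFunctions, for readers who want to try the rewrite outside the tool: any parse_url call whose partToExtract argument is QUERY is renamed to parse_url_query before function names are extracted, so getAllFunctionNames can report the QUERY variant separately from the supported parse_url form. The pattern below is deliberately simplified (it assumes the parse_url argument list contains no nested parentheses) rather than the forward-reference regex used in the patch, and the object and method names are illustrative only.

import scala.util.matching.Regex

object ParseUrlRewriteDemo {
  // Simplified stand-in for processSpecialFunctions: only handles parse_url
  // calls whose argument list contains no nested parentheses.
  def rewriteParseUrlQuery(expr: String): String = {
    val simpleParseUrl: Regex = """parse_url\(([^()]*)\)""".r
    simpleParseUrl.replaceAllIn(expr, m => {
      val args = m.group(1)
      // Rename the call only when the partToExtract argument is QUERY.
      val isQuery = args.split(",").map(_.trim.toUpperCase).contains("QUERY")
      val rewritten = if (isQuery) s"parse_url_query($args)" else m.matched
      // quoteReplacement keeps any '$' or '\' in the plan string literal.
      Regex.quoteReplacement(rewritten)
    })
  }

  def main(args: Array[String]): Unit = {
    val projectExpr = "Project [url_col#7, parse_url(url_col#7, HOST, false) AS HOST#9, " +
      "parse_url(url_col#7, QUERY, false) AS QUERY#10]"
    // Only the QUERY call is renamed; the HOST call keeps its supported name.
    println(rewriteParseUrlQuery(projectExpr))
  }
}

Run against the Project string from the comment in SQLPlanParser.scala, this prints the same expression with only the QUERY call renamed, which is the shape the function-name extraction then tokenizes.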