
Qualification tool: Add penalty for row conversions #471

Merged: 25 commits, Oct 16, 2023

Changes from 6 commits

Commits (25)
5008ad8
Qualification tool: Add penalty for row conversions
nartal1 Aug 1, 2023
a32d81c
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into apple…
nartal1 Aug 1, 2023
af962b8
optimize code
nartal1 Aug 2, 2023
1ae0eaa
addressed review comments
nartal1 Aug 8, 2023
7da1acf
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Aug 14, 2023
e2dbb12
addressed review comments and added test
nartal1 Aug 18, 2023
29f0e8c
fix unit test
nartal1 Aug 22, 2023
a52803a
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Aug 22, 2023
7b6803f
addressed review comments
nartal1 Aug 23, 2023
2607489
addressed review comments and updated test results
nartal1 Aug 28, 2023
f8a867f
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Sep 19, 2023
0c5b342
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Sep 22, 2023
89fccf4
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Oct 8, 2023
2ba211a
Address review comments
nartal1 Oct 9, 2023
5e06370
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Oct 9, 2023
8dfa245
address review comments
nartal1 Oct 10, 2023
7e1ebe0
update tests
nartal1 Oct 11, 2023
132605f
Revert "update tests"
nartal1 Oct 11, 2023
f608365
add penalty to durations
nartal1 Oct 12, 2023
f8c8d40
change transitiontime calculation
nartal1 Oct 13, 2023
db6eaf0
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Oct 13, 2023
36f8d56
update variable name
nartal1 Oct 13, 2023
4a8ab84
addressed review comments
nartal1 Oct 13, 2023
b17df32
Merge branch 'dev' of github.com:NVIDIA/spark-rapids-tools into cpu_g…
nartal1 Oct 14, 2023
052b22f
change penaly percentage
nartal1 Oct 16, 2023
@@ -391,6 +391,7 @@ object QualOutputWriter {
val ESTIMATED_GPU_SPEEDUP = "Estimated GPU Speedup"
val ESTIMATED_GPU_TIMESAVED = "Estimated GPU Time Saved"
val STAGE_ESTIMATED_STR = "Stage Estimated"
val NUM_TRANSITIONS = "Number of CPU-GPU Transitions"
val UNSUPPORTED_EXECS = "Unsupported Execs"
val UNSUPPORTED_EXPRS = "Unsupported Expressions"
val CLUSTER_TAGS = "Cluster Tags"
@@ -856,7 +857,8 @@ object QualOutputWriter {
AVERAGE_SPEEDUP_STR -> AVERAGE_SPEEDUP_STR.size,
STAGE_DUR_STR -> STAGE_DUR_STR.size,
UNSUPPORTED_TASK_DURATION_STR -> UNSUPPORTED_TASK_DURATION_STR.size,
STAGE_ESTIMATED_STR -> STAGE_ESTIMATED_STR.size
STAGE_ESTIMATED_STR -> STAGE_ESTIMATED_STR.size,
NUM_TRANSITIONS -> NUM_TRANSITIONS.size
)
detailedHeadersAndFields
}
@@ -878,7 +880,8 @@
headersAndSizes(AVERAGE_SPEEDUP_STR),
info.stageTaskTime.toString -> headersAndSizes(STAGE_DUR_STR),
info.unsupportedTaskDur.toString -> headersAndSizes(UNSUPPORTED_TASK_DURATION_STR),
info.estimated.toString -> headersAndSizes(STAGE_ESTIMATED_STR))
info.estimated.toString -> headersAndSizes(STAGE_ESTIMATED_STR),
info.numTransitions.toString -> headersAndSizes(NUM_TRANSITIONS))
constructOutputRow(data, delimiter, prettyPrint)
}
}
@@ -16,6 +16,8 @@

package org.apache.spark.sql.rapids.tool.qualification

import java.util.concurrent.TimeUnit

import scala.collection.mutable.{ArrayBuffer, HashMap}

import com.nvidia.spark.rapids.tool.EventLogInfo
@@ -51,6 +53,8 @@ class QualificationAppInfo(
HashMap.empty[Long, StageTaskQualificationSummary]
val stageIdToTaskEndSum: HashMap[Long, StageTaskQualificationSummary] =
HashMap.empty[Long, StageTaskQualificationSummary]
val stageIdToGpuCpuTransitions: HashMap[Int, Int] = HashMap.empty[Int, Int]
var execsNoStageTransitions: Int = 0

val stageIdToSqlID: HashMap[Int, Long] = HashMap.empty[Int, Long]
val sqlIDtoFailures: HashMap[Long, ArrayBuffer[String]] = HashMap.empty[Long, ArrayBuffer[String]]
@@ -161,11 +165,11 @@ class QualificationAppInfo(
}

private def calculateSQLSupportedTaskDuration(all: Seq[StageQualSummaryInfo]): Long = {
all.map(s => s.stageTaskTime - s.unsupportedTaskDur).sum
all.map(s => s.stageTaskTime - s.unsupportedTaskDur).sum - calculateNoExecsStageDurations(all)
Collaborator:
So I'm a bit unclear how this works with the job overhead we add later and/or the mapping we try to do with execs without stages already.

Is this counting it twice?

nartal1 (Author):
Removed this.
}

private def calculateSQLUnsupportedTaskDuration(all: Seq[StageQualSummaryInfo]): Long = {
all.map(_.unsupportedTaskDur).sum
all.map(_.unsupportedTaskDur).sum + calculateNoExecsStageDurations(all)
}

private def calculateSpeedupFactor(all: Seq[StageQualSummaryInfo]): Double = {
@@ -174,6 +178,23 @@ class QualificationAppInfo(
res
}

private def calculateNoExecsStageDurations(all: Seq[StageQualSummaryInfo]): Long = {
Collaborator:
Nit: rename to calculateExecsNoStageDurations. Or, since this is actually durations due to transitions, it should have something like that in the name.

nartal1 (Author):
Removed this method. Filed a follow-on issue to add penalties for execs not associated with any stages - #514

// If there are Execs not associated with any stage, then some of the Execs may not be
// supported on GPU. We need to estimate the duration of these Execs and add it to the
// unsupportedTaskDur. We estimate the duration by taking the average of the
// unsupportedTaskDur of all the stages and multiplying it by the number of Execs that are
// not associated with any stage. We then multiply by a penalty factor of 0.05.
// TODO: Need to come up with better heuristics for the penalty factor.
val unsupportedTaskSize = all.map(_.unsupportedTaskDur).size
if (execsNoStageTransitions != 0 && unsupportedTaskSize != 0) {
(execsNoStageTransitions *
(all.map(_.unsupportedTaskDur).sum / unsupportedTaskSize) * 0.05).toLong
} else {
0L
}
}

Collaborator:
I'm not sure I follow this estimation. We are trying to give some penalty for execs that have transitions but don't map to a stage (i.e. we don't have a duration), correct?

I'm wondering if we are already calculating this with either the stages with no execs or the job overhead time.
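
For reference, a minimal standalone sketch of this penalty heuristic with toy numbers (all values are hypothetical; only the 0.05 factor and the shape of the formula come from the diff above):

// Per-stage unsupported task durations in milliseconds (made-up figures).
val unsupportedDurs = Seq(1000L, 3000L, 2000L)
// Unsupported execs that could not be mapped to any stage (made-up figure).
val execsNoStageTransitions = 4
val penaltyMs =
if (execsNoStageTransitions != 0 && unsupportedDurs.nonEmpty) {
// average unsupported duration (2000 ms) * 4 execs * 0.05 = 400 ms
(execsNoStageTransitions * (unsupportedDurs.sum / unsupportedDurs.size) * 0.05).toLong
} else {
0L
}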

private def getAllReadFileFormats: Seq[String] = {
dataSourceInfo.map { ds =>
s"${ds.format.toLowerCase()}[${ds.schema}]"
@@ -241,13 +262,74 @@
stages.map { stageId =>
val stageTaskTime = stageIdToTaskEndSum.get(stageId)
.map(_.totalTaskDuration).getOrElse(0L)
val numTransitions = stageIdToGpuCpuTransitions.getOrElse(stageId, 0)
Collaborator:
If we have a flag that forces numTransitions to be 0, then theoretically we can disable the fall-back penalty, right?

nartal1 (Author):
Updated the PR to add a config for transitions. The default is true; it can be disabled by setting the config. Please let me know if that's fine.
val transitionsTime = numTransitions match {
case gpuCpuTransitions if gpuCpuTransitions > 0 =>
// Duration to transfer data from GPU to CPU and vice versa.
// Assuming it's a PCI-E Gen3, but also assuming that some of the result could be
// spilled to disk.
// Duration in Spark metrics is in milliseconds and the CPU-GPU transfer rate is in
// bytes/sec, so we need to convert the transition time to milliseconds.
val totalBytesRead =
stageIdToTaskEndSum.get(stageId).map(_.totalbytesRead).getOrElse(0L)
if (totalBytesRead > 0) {
TimeUnit.SECONDS.toMillis(
totalBytesRead / QualificationAppInfo.CPU_GPU_TRANSFER_RATE) * gpuCpuTransitions
} else {
0L
}
case _ => 0L
}
// Update totaltaskduration of stageIdToTaskEndSum to include transitions time
val stageIdToTasksMetrics = stageIdToTaskEndSum.get(stageId).orElse(None)
if (stageIdToTasksMetrics.isDefined) {
stageIdToTasksMetrics.get.totalTaskDuration += transitionsTime
Collaborator:
I'm confused by this. Why are we changing the task durations here? This has traditionally been the real task durations, and we add/remove things later. Is this because supported + unsupported is now longer due to the transition time? It seems odd to change it in this data structure that holds the real values from the file.

nartal1 (Author):
This is done because we are adding transitionsTime to unsupportedTaskDuration, i.e. unsupportedTaskDuration = eachStageUnsupported + transitionsTime. So the totalTaskDuration should also include transitionsTime, else we would end up in a case where unsupportedTaskDuration > totalTaskDuration (which would be incorrect).

nartal1 (Author), Aug 28, 2023:
Updated this code. Now we consider transitionsTime in the unsupported durations only. stageTaskTime is the totalTaskDuration from the event log. transitionsTime is returned from StageQualSummaryInfo so that it can be used in the calculation of calculateNonSQLTaskDataframeDuration.
}
StageQualSummaryInfo(stageId, allSpeedupFactorAvg, stageTaskTime,
eachStageUnsupported, estimated)
eachStageUnsupported + transitionsTime, estimated, numTransitions)
}.toSet
}
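
A toy sketch of the invariant discussed in the review thread above (all numbers are made up):

// If unsupported time grows by transitionsTime, total time must grow by the same
// amount, otherwise unsupportedTaskDuration could exceed totalTaskDuration.
val eachStageUnsupported = 5000L // ms, made-up figure
val transitionsTime = 1200L // ms, made-up figure
val unsupportedTaskDuration = eachStageUnsupported + transitionsTime // 6200 ms
val totalTaskDuration = 10000L + transitionsTime // 10000 ms from the event log, plus transitions
assert(unsupportedTaskDuration <= totalTaskDuration)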

def summarizeStageLevel(execInfos: Seq[ExecInfo], sqlID: Long): Set[StageQualSummaryInfo] = {
val (allStagesToExecs, execsNoStage) = getStageToExec(execInfos)

// Get the total number of transitions between CPU and GPU for each stage and
// store it in a Map.
allStagesToExecs.foreach { case (stageId, execs) =>
// Flatten all the Execs within a stage.
// Example: Exchange;WholeStageCodegen (14);Exchange;WholeStageCodegen (13);Exchange
// will be flattened to Exchange;Sort;Exchange;Sort;SortMergeJoin;SortMergeJoin;Exchange;
val allExecs = execs.map(x => if (x.exec.startsWith("WholeStage")) {
x.children.getOrElse(Seq.empty)
} else {
Seq(x)
}).flatten.reverse

// If it's a shuffle stage, then we need to keep the first and last Exchange and remove
// all the intermediate Exchanges, as the input size is captured in the Exchange node.
val dedupedExecs = if (allExecs.size > 2) {
allExecs.head +:
allExecs.tail.init.filter(x => x.exec != "Exchange") :+ allExecs.last
} else {
allExecs
}
// Create a list of transitions by zipping dedupedExecs with itself shifted by one element.
// This will create a list of adjacent pairs.
// Example: if the execs are (ScanExec, FilterExec, SortExec, ProjectExec), then it will
// create a list of tuples as follows:
// (ScanExec, FilterExec), (FilterExec, SortExec), (SortExec, ProjectExec)
val transitions = dedupedExecs.zip(dedupedExecs.drop(1)).count {
// If the current exec (currExec) is supported and the next exec (nextExec) is not
// supported, or if the current exec is not supported and the next exec is supported,
// then we count this as a transition.
case (currExec, nextExec) => (currExec.isSupported && !nextExec.isSupported) ||
(!currExec.isSupported && nextExec.isSupported)
}
stageIdToGpuCpuTransitions(stageId) = transitions
}
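
The pairwise check above counts every place the supported flag flips between adjacent execs; the two-clause condition in the diff is equivalent to a simple inequality check. A minimal standalone demonstration (SimpleExec is a hypothetical stand-in for ExecInfo):

// Hypothetical stand-in for ExecInfo, just enough for the demo.
case class SimpleExec(name: String, isSupported: Boolean)
val demoExecs = Seq(
SimpleExec("Scan", true),
SimpleExec("Filter", true),
SimpleExec("CustomUDF", false), // unsupported, so this exec stays on the CPU
SimpleExec("Project", true))
// Adjacent pairs: (Scan,Filter), (Filter,CustomUDF), (CustomUDF,Project)
val demoTransitions = demoExecs.zip(demoExecs.drop(1)).count {
case (curr, next) => curr.isSupported != next.isSupported
}
// demoTransitions == 2: GPU-to-CPU before CustomUDF and CPU-to-GPU after it.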
if (execsNoStage.nonEmpty) {
execsNoStageTransitions += execsNoStage.filterNot(exec => exec.isSupported).size
}
if (allStagesToExecs.isEmpty) {
// use job level
// also get the job ids associated with the SQLId
@@ -670,7 +752,8 @@ class StageTaskQualificationSummary(
val stageAttemptId: Int,
var executorRunTime: Long,
var executorCPUTime: Long,
var totalTaskDuration: Long)
var totalTaskDuration: Long,
var totalbytesRead: Long)

case class QualApplicationInfo(
appName: String,
@@ -736,7 +819,8 @@ case class StageQualSummaryInfo(
averageSpeedup: Double,
stageTaskTime: Long,
unsupportedTaskDur: Long,
estimated: Boolean = false)
estimated: Boolean = false,
numTransitions: Int)

object QualificationAppInfo extends Logging {
// define recommendation constants
@@ -746,6 +830,13 @@ object QualificationAppInfo extends Logging {
val NOT_APPLICABLE = "Not Applicable"
val LOWER_BOUND_RECOMMENDED = 1.3
val LOWER_BOUND_STRONGLY_RECOMMENDED = 2.5
// Below is the total time taken whenever there are ColumnarToRow or RowToColumnar
// transitions. This includes the time taken to convert the data from one format to another
// and the time taken to transfer the data from CPU to GPU and vice versa. The current
// transfer rate is 10 MB/s and is based on testing with a few event logs.
// TODO: Need to test this on more event logs, including NDS queries,
// and come up with a better transfer rate.
val CPU_GPU_TRANSFER_RATE = 10000000L
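
To make the arithmetic concrete, a hedged sketch of how this constant feeds the transitionsTime calculation earlier in the diff (the byte and transition counts are made-up figures):

import java.util.concurrent.TimeUnit

val cpuGpuTransferRate = 10000000L // 10 MB/s, the constant above
val totalBytesRead = 2000000000L // hypothetical: the stage reads 2 GB
val numTransitions = 3 // hypothetical: three CPU-GPU boundaries
// 2e9 bytes / 1e7 bytes-per-sec = 200 s -> 200000 ms; times 3 transitions = 600000 ms
val transitionsTimeMs =
TimeUnit.SECONDS.toMillis(totalBytesRead / cpuGpuTransferRate) * numTransitions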

private def handleException(e: Exception, path: EventLogInfo): String = {
val message: String = e match {
@@ -72,16 +72,26 @@ class QualificationEventProcessor(app: QualificationAppInfo, perSqlOnly: Boolean
super.doSparkListenerTaskEnd(app, event)
// keep all stage task times to see for nonsql duration
val taskSum = app.stageIdToTaskEndSum.getOrElseUpdate(event.stageId, {
new StageTaskQualificationSummary(event.stageId, event.stageAttemptId, 0, 0, 0)
new StageTaskQualificationSummary(event.stageId, event.stageAttemptId, 0, 0, 0, 0)
})
taskSum.executorRunTime += event.taskMetrics.executorRunTime
taskSum.executorCPUTime += NANOSECONDS.toMillis(event.taskMetrics.executorCpuTime)
taskSum.totalTaskDuration += event.taskInfo.duration
// Add the total bytes read from the task if available. This comes from inputMetrics when
// reading from a data source, or from shuffleReadMetrics when reading from a shuffle.
val inputMetrics = event.taskMetrics.inputMetrics
if (inputMetrics != null) {
taskSum.totalbytesRead += inputMetrics.bytesRead
}
val shuffleReadMetrics = event.taskMetrics.shuffleReadMetrics
if (shuffleReadMetrics != null) {
taskSum.totalbytesRead += shuffleReadMetrics.totalBytesRead
}

// Adds in everything (including failures)
app.stageIdToSqlID.get(event.stageId).foreach { sqlID =>
val taskSum = app.sqlIDToTaskEndSum.getOrElseUpdate(sqlID, {
new StageTaskQualificationSummary(event.stageId, event.stageAttemptId, 0, 0, 0)
new StageTaskQualificationSummary(event.stageId, event.stageAttemptId, 0, 0, 0, 0)
})
taskSum.executorRunTime += event.taskMetrics.executorRunTime
taskSum.executorCPUTime += NANOSECONDS.toMillis(event.taskMetrics.executorCpuTime)
@@ -1,2 +1,2 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Spark shell","app-20211019113801-0001","Not Recommended",1.0,569385.42,2581.57,3627,19894,571967,3503,28.41,"","JDBC[*]","","","","",1812,544575,677,19217,3.8,false,"Scan JDBCRelation(TBLS) [numPartitions=1];Execute CreateViewCommand;CollectLimit","",30
"Spark shell","app-20211019113801-0001","Not Recommended",1.0,569387.57,2579.42,3627,19894,571967,3500,28.41,"","JDBC[*]","","","","",1812,544575,693,19201,3.8,false,"Scan JDBCRelation(TBLS) [numPartitions=1];Execute CreateViewCommand;CollectLimit","",30
@@ -1,5 +1,5 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018","Recommended",1.92,8472.65,7846.34,12434,132257,16319,10589,37.7,"","","JSON","","","",7143,4717,19616,112641,3.86,false,"SerializeFromObject;Execute InsertIntoHadoopFsRelationCommand json;DeserializeToObject;Filter;MapElements;Scan","",1
"Spark shell","local-1651187225439","Not Recommended",1.0,355483.43,153.56,760,180,355637,350,87.88,"","JSON[string:bigint:int]","","","","",498,343411,97,83,1.78,false,"SerializeFromObject;CollectLimit;DeserializeToObject;Scan json;Filter;MapElements","",1
"Spark shell","local-1651188809790","Not Recommended",1.0,166199.97,15.02,911,283,166215,45,81.18,"","JSON[string:bigint:int]","","","","UDF",715,133608,269,14,1.5,false,"CollectLimit;Scan json;Project","UDF",1
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390","Not Recommended",1.0,6240.0,0.0,2032,4666,6240,0,46.27,"","JSON[string:bigint:int]","JSON","","","UDF",1209,5793,4664,2,1.0,false,"Execute InsertIntoHadoopFsRelationCommand json;LocalTableScan;Project;Scan json;Execute CreateViewCommand","UDF",1
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018","Recommended",1.92,8477.87,7841.12,12434,132257,16319,10582,37.7,"","","JSON","","","",7143,4717,19691,112566,3.86,false,"SerializeFromObject;Execute InsertIntoHadoopFsRelationCommand json;DeserializeToObject;Filter;MapElements;Scan","",1
"Spark shell","local-1651187225439","Not Recommended",1.0,355490.83,146.16,760,180,355637,333,87.88,"","JSON[string:bigint:int]","","","","",498,343411,101,79,1.78,false,"SerializeFromObject;CollectLimit;DeserializeToObject;Scan json;Filter;MapElements","",1
"Spark shell","local-1651188809790","Not Recommended",1.0,166213.92,1.07,911,283,166215,3,81.18,"","JSON[string:bigint:int]","","","","UDF",715,133608,282,1,1.5,false,"CollectLimit;Scan json;Project","UDF",1
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390","Not Recommended",1.0,6240.0,0.0,2032,4666,6240,-151,46.27,"","JSON[string:bigint:int]","JSON","","","UDF",1209,5793,5013,-347,1.0,false,"Execute InsertIntoHadoopFsRelationCommand json;LocalTableScan;Project;Scan json;Execute CreateViewCommand","UDF",1
@@ -1,18 +1,18 @@
App Name,App ID,SQL ID,SQL Description,SQL DF Duration,GPU Opportunity,Estimated GPU Duration,Estimated GPU Speedup,Estimated GPU Time Saved,Recommendation
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",1,"count at QualificationInfoUtils.scala:94",7143,6719,2078.49,3.43,5064.5,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",3,"count at QualificationInfoUtils.scala:94",2052,1660,800.56,2.56,1251.43,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",2,"count at QualificationInfoUtils.scala:94",1933,1551,763.96,2.53,1169.03,"Strongly Recommended"
"Spark shell","local-1651187225439",0,"show at <console>:26",498,249,373.5,1.33,124.5,"Recommended"
"Spark shell","local-1651188809790",1,"show at <console>:26",196,98,147.0,1.33,49.0,"Recommended"
"Spark shell","local-1651187225439",1,"show at <console>:26",262,60,240.54,1.08,21.45,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",0,"json at QualificationInfoUtils.scala:76",1306,187,1246.97,1.04,59.02,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",0,"json at QualificationInfoUtils.scala:130",1209,0,1209.0,1.0,0.0,"Not Recommended"
"Spark shell","local-1651188809790",0,"show at <console>:26",715,2,715.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",2,"json at QualificationInfoUtils.scala:136",321,0,321.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",5,"json at QualificationInfoUtils.scala:136",129,0,129.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",8,"json at QualificationInfoUtils.scala:136",127,0,127.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",6,"json at QualificationInfoUtils.scala:130",110,0,110.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",3,"json at QualificationInfoUtils.scala:130",108,0,108.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",1,"count at QualificationInfoUtils.scala:94",7143,6714,2082.44,3.43,5060.55,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",3,"count at QualificationInfoUtils.scala:94",2052,1655,804.22,2.55,1247.77,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",2,"count at QualificationInfoUtils.scala:94",1933,1546,767.5,2.51,1165.49,"Strongly Recommended"
"Spark shell","local-1651188809790",1,"show at <console>:26",196,90,150.76,1.3,45.23,"Recommended"
"Spark shell","local-1651187225439",0,"show at <console>:26",498,226,384.81,1.29,113.18,"Not Recommended"
"Spark shell","local-1651187225439",1,"show at <console>:26",262,40,247.69,1.05,14.3,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",0,"json at QualificationInfoUtils.scala:76",1306,131,1264.57,1.03,41.42,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",0,"json at QualificationInfoUtils.scala:130",1209,-543,1209.0,1.0,0.0,"Not Recommended"
"Spark shell","local-1651188809790",0,"show at <console>:26",715,-66,715.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",2,"json at QualificationInfoUtils.scala:136",321,-144,321.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",5,"json at QualificationInfoUtils.scala:136",129,-57,129.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",8,"json at QualificationInfoUtils.scala:136",127,-56,127.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",3,"json at QualificationInfoUtils.scala:130",108,-48,108.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",4,"createOrReplaceTempView at QualificationInfoUtils.scala:133",22,22,22.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",7,"createOrReplaceTempView at QualificationInfoUtils.scala:133",4,4,4.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",1,"createOrReplaceTempView at QualificationInfoUtils.scala:133",2,2,2.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",6,"json at QualificationInfoUtils.scala:130",110,-49,110.0,0.99,-0.01,"Not Recommended"