Commit 518d17a

Merge dev into main

Signed-off-by: spark-rapids automation <[email protected]>
2 parents: 8fcef52 + a8ed8f3

12 files changed: +211 -127 lines
core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Analysis.scala

Lines changed: 24 additions & 30 deletions
@@ -29,7 +29,7 @@ class Analysis(apps: Seq[ApplicationInfo]) {
 
   def getDurations(tcs: ArrayBuffer[TaskCase]): (Long, Long, Long, Double) = {
     val durations = tcs.map(_.duration)
-    if (durations.size > 0 ) {
+    if (durations.nonEmpty) {
       (durations.sum, durations.max, durations.min,
         ToolUtils.calculateAverage(durations.sum, durations.size, 1))
     } else {
@@ -49,22 +49,20 @@ class Analysis(apps: Seq[ApplicationInfo]) {
   def jobAndStageMetricsAggregation(): Seq[JobStageAggTaskMetricsProfileResult] = {
     val allJobRows = apps.flatMap { app =>
       app.jobIdToInfo.map { case (id, jc) =>
-        val stageIdsInJob = jc.stageIds
         val stagesInJob = app.stageIdToInfo.filterKeys { case (sid, _) =>
-          stageIdsInJob.contains(sid)
-        }.keys.map(_._1).toSeq
+          jc.stageIds.contains(sid)
+        }.keys.map(_._1).toSet
         if (stagesInJob.isEmpty) {
           None
         } else {
           val tasksInJob = app.taskEnd.filter { tc =>
             stagesInJob.contains(tc.stageId)
           }
           // count duplicate task attempts
-          val numTaskAttempt = tasksInJob.size
           val (durSum, durMax, durMin, durAvg) = getDurations(tasksInJob)
           Some(JobStageAggTaskMetricsProfileResult(app.index,
             s"job_$id",
-            numTaskAttempt,
+            tasksInJob.size,
             jc.duration,
             tasksInJob.map(_.diskBytesSpilled).sum,
             durSum,
@@ -100,9 +98,8 @@ class Analysis(apps: Seq[ApplicationInfo]) {
     }
     val allJobStageRows = apps.flatMap { app =>
       app.jobIdToInfo.flatMap { case (_, jc) =>
-        val stageIdsInJob = jc.stageIds
         val stagesInJob = app.stageIdToInfo.filterKeys { case (sid, _) =>
-          stageIdsInJob.contains(sid)
+          jc.stageIds.contains(sid)
         }
         if (stagesInJob.isEmpty) {
           None
@@ -111,12 +108,10 @@ class Analysis(apps: Seq[ApplicationInfo]) {
           val tasksInStage = app.taskEnd.filter { tc =>
             tc.stageId == id
           }
-          // count duplicate task attempts
-          val numAttempts = tasksInStage.size
           val (durSum, durMax, durMin, durAvg) = getDurations(tasksInStage)
           Some(JobStageAggTaskMetricsProfileResult(app.index,
             s"stage_$id",
-            numAttempts,
+            tasksInStage.size,
             sc.duration,
             tasksInStage.map(_.diskBytesSpilled).sum,
             durSum,
@@ -153,17 +148,16 @@ class Analysis(apps: Seq[ApplicationInfo]) {
     }
     // stages that are missing from a job, perhaps dropped events
     val stagesWithoutJobs = apps.flatMap { app =>
-      val allStageinJobs = app.jobIdToInfo.flatMap { case (_, jc) =>
-        val stageIdsInJob = jc.stageIds
+      val allStageInJobs = app.jobIdToInfo.flatMap { case (_, jc) =>
         app.stageIdToInfo.filterKeys { case (sid, _) =>
-          stageIdsInJob.contains(sid)
+          jc.stageIds.contains(sid)
         }
       }
-      val missing = app.stageIdToInfo.keys.toSeq.diff(allStageinJobs.keys.toSeq)
+      val missing = app.stageIdToInfo.keys.toSet.diff(allStageInJobs.keys.toSet)
       if (missing.isEmpty) {
         Seq.empty
       } else {
-        missing.map { case ((id, saId)) =>
+        missing.map { case (id, saId) =>
          val scOpt = app.stageIdToInfo.get((id, saId))
          scOpt match {
            case None =>
@@ -214,11 +208,11 @@ class Analysis(apps: Seq[ApplicationInfo]) {
     }
 
     val allRows = allJobRows ++ allJobStageRows ++ stagesWithoutJobs
-    val filteredRows = allRows.filter(_.isDefined).map(_.get)
-    if (filteredRows.size > 0) {
+    val filteredRows = allRows.flatMap(row => row)
+    if (filteredRows.nonEmpty) {
       val sortedRows = filteredRows.sortBy { cols =>
         val sortDur = cols.duration.getOrElse(0L)
-        (cols.appIndex, -(sortDur), cols.id)
+        (cols.appIndex, -sortDur, cols.id)
       }
       sortedRows
     } else {
@@ -231,12 +225,12 @@ class Analysis(apps: Seq[ApplicationInfo]) {
     val allRows = apps.flatMap { app =>
       app.sqlIdToInfo.map { case (sqlId, sqlCase) =>
         val jcs = app.jobIdToInfo.filter { case (_, jc) =>
-          jc.sqlID.getOrElse(-1) == sqlId
+          jc.sqlID.isDefined && jc.sqlID.get == sqlId
         }
         if (jcs.isEmpty) {
           None
         } else {
-          val stageIdsForSQL = jcs.flatMap(_._2.stageIds).toSeq
+          val stageIdsForSQL = jcs.flatMap(_._2.stageIds).toSet
           val tasksInSQL = app.taskEnd.filter { tc =>
             stageIdsForSQL.contains(tc.stageId)
           }
@@ -298,7 +292,7 @@ class Analysis(apps: Seq[ApplicationInfo]) {
         }
       }
     }
-    val allFiltered = allRows.filter(_.isDefined).map(_.get)
+    val allFiltered = allRows.flatMap(row => row)
     if (allFiltered.size > 0) {
       val sortedRows = allFiltered.sortBy { cols =>
         val sortDur = cols.duration.getOrElse(0L)
@@ -314,12 +308,12 @@ class Analysis(apps: Seq[ApplicationInfo]) {
     val allRows = apps.flatMap { app =>
       app.sqlIdToInfo.map { case (sqlId, _) =>
         val jcs = app.jobIdToInfo.filter { case (_, jc) =>
-          jc.sqlID.getOrElse(-1) == sqlId
+          jc.sqlID.isDefined && jc.sqlID.get == sqlId
         }
         if (jcs.isEmpty) {
           None
         } else {
-          val stageIdsForSQL = jcs.flatMap(_._2.stageIds).toSeq
+          val stageIdsForSQL = jcs.flatMap(_._2.stageIds).toSet
 
           val tasksInSQL = app.taskEnd.filter { tc =>
             stageIdsForSQL.contains(tc.stageId)
@@ -344,7 +338,7 @@ class Analysis(apps: Seq[ApplicationInfo]) {
         }
       }
     }
-    val allFiltered = allRows.filter(_.isDefined).map(_.get)
+    val allFiltered = allRows.flatMap(row => row)
     if (allFiltered.size > 0) {
       val sortedRows = allFiltered.sortBy { cols =>
         (cols.appIndex, cols.sqlId)
@@ -359,12 +353,12 @@ class Analysis(apps: Seq[ApplicationInfo]) {
     apps.map { app =>
       val maxOfSqls = app.sqlIdToInfo.map { case (sqlId, _) =>
         val jcs = app.jobIdToInfo.filter { case (_, jc) =>
-          jc.sqlID.getOrElse(-1) == sqlId
+          jc.sqlID.isDefined && jc.sqlID.get == sqlId
        }
        if (jcs.isEmpty) {
          0L
        } else {
-          val stageIdsForSQL = jcs.flatMap(_._2.stageIds).toSeq
+          val stageIdsForSQL = jcs.flatMap(_._2.stageIds).toSet
           val tasksInSQL = app.taskEnd.filter { tc =>
             stageIdsForSQL.contains(tc.stageId)
           }
@@ -394,7 +388,7 @@ class Analysis(apps: Seq[ApplicationInfo]) {
           sqlCase.sqlCpuTimePercent)
       }
     }
-    if (allRows.size > 0) {
+    if (allRows.nonEmpty) {
       val sortedRows = allRows.sortBy { cols =>
         val sortDur = cols.duration.getOrElse(0L)
         (cols.appIndex, cols.sqlID, sortDur)
@@ -443,8 +437,8 @@ class Analysis(apps: Seq[ApplicationInfo]) {
       }
     }
 
-    val allNonEmptyRows = allRows.filter(_.isDefined).map(_.get)
-    if (allNonEmptyRows.size > 0) {
+    val allNonEmptyRows = allRows.flatMap(row => row)
+    if (allNonEmptyRows.nonEmpty) {
       val sortedRows = allNonEmptyRows.sortBy { cols =>
         (cols.appIndex, cols.stageId, cols.stageAttemptId, cols.taskId, cols.taskAttemptId)
       }
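
Note: two idioms recur throughout this file: membership tests moved from Seq to Set (Seq.contains scans linearly, Set.contains is an effectively constant-time hash lookup), and sequences of Options flattened with flatMap instead of filter(_.isDefined).map(_.get). A minimal standalone sketch of both patterns, with illustrative names that are not from the tool:

object MembershipAndFlatten {
  final case class Task(stageId: Int, duration: Long)

  def main(args: Array[String]): Unit = {
    val stageIds: Seq[Int] = 1 to 100
    val tasks = Seq(Task(3, 10L), Task(7, 20L), Task(999, 5L))

    // One Seq-to-Set conversion up front turns each O(n) contains
    // into an effectively O(1) lookup inside the filter.
    val stageIdSet = stageIds.toSet
    val tasksInStages = tasks.filter(t => stageIdSet.contains(t.stageId))

    // flatMap over Option drops the Nones and unwraps the Somes in a
    // single pass, replacing the two-pass filter(_.isDefined).map(_.get).
    val rows: Seq[Option[Long]] = tasksInStages.map(t => Some(t.duration))
    val durations: Seq[Long] = rows.flatMap(row => row)

    println(durations.sum)  // 30
  }
}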

core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala

Lines changed: 4 additions & 4 deletions
@@ -856,8 +856,8 @@ class AutoTuner(
 
   /**
    * Recommendation for 'spark.sql.files.maxPartitionBytes' based on input size for each task.
-   * Note that the logic can be disabled by adding the property to [[limitedLogicRecommendations]]
-   * which is one of the arguments of [[getRecommendedProperties()]].
+   * Note that the logic can be disabled by adding the property to "limitedLogicRecommendations"
+   * which is one of the arguments of [[getRecommendedProperties]].
    */
   private def recommendMaxPartitionBytes(): Unit = {
     val maxPartitionProp =
@@ -873,8 +873,8 @@ class AutoTuner(
 
   /**
    * Recommendations for 'spark.sql.shuffle.partitions' based on spills and skew in shuffle stages.
-   * Note that the logic can be disabled by adding the property to [[limitedLogicRecommendations]]
-   * which is one of the arguments of [[getRecommendedProperties()]].
+   * Note that the logic can be disabled by adding the property to "limitedLogicRecommendations"
+   * which is one of the arguments of [[getRecommendedProperties]].
    */
   def recommendShufflePartitions(): Unit = {
     val lookup = "spark.sql.shuffle.partitions"
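
Note: the doc-comment fix reflects how Scaladoc resolves [[...]] links: they can target types and members, but not method parameters, and the trailing () is dropped from the member link, presumably because it failed to resolve. A hedged sketch of the convention on a hypothetical object, not from the tool:

object LinkStyleExample {
  /**
   * The parameter is referred to in quotes ("verbose") because [[...]]
   * cannot link to a parameter, while the member link [[run]] is written
   * without parentheses so Scaladoc can resolve it.
   */
  def configure(verbose: Boolean): Unit = if (verbose) run()

  def run(): Unit = println("running")
}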

core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala

Lines changed: 7 additions & 7 deletions
@@ -236,7 +236,7 @@ class CollectInformation(apps: Seq[ApplicationInfo]) extends Logging {
       CollectInformation.addNewProps(propsToKeep, props, numApps)
     }
     val allRows = props.map { case (k, v) => Seq(k) ++ v }.toSeq
-    if (allRows.size > 0) {
+    if (allRows.nonEmpty) {
       val resRows = allRows.map(r => RapidsPropertyProfileResult(r(0), outputHeaders, r))
       resRows.sortBy(cols => cols.key)
     } else {
@@ -259,7 +259,7 @@ class CollectInformation(apps: Seq[ApplicationInfo]) extends Logging {
     val allWholeStages = apps.flatMap { app =>
       app.wholeStage
     }
-    if (allWholeStages.size > 0) {
+    if (allWholeStages.nonEmpty) {
       allWholeStages.sortBy(cols => (cols.appIndex, cols.sqlID, cols.nodeID))
     } else {
       Seq.empty
@@ -269,7 +269,7 @@ class CollectInformation(apps: Seq[ApplicationInfo]) extends Logging {
   // Print SQL Plan Metrics
   def getSQLPlanMetrics: Seq[SQLAccumProfileResults] = {
     val sqlAccums = CollectInformation.generateSQLAccums(apps)
-    if (sqlAccums.size > 0) {
+    if (sqlAccums.nonEmpty) {
       sqlAccums.sortBy(cols => (cols.appIndex, cols.sqlID, cols.nodeID,
         cols.nodeName, cols.accumulatorId, cols.metricType))
     } else {
@@ -286,11 +286,11 @@ object CollectInformation extends Logging {
   def generateSQLAccums(apps: Seq[ApplicationInfo]): Seq[SQLAccumProfileResults] = {
     val allRows = apps.flatMap { app =>
       app.allSQLMetrics.map { metric =>
-        val sqlId = metric.sqlID
         val jobsForSql = app.jobIdToInfo.filter { case (_, jc) =>
-          jc.sqlID.getOrElse(-1) == sqlId
+          // Avoid getOrElse to reduce memory allocations
+          jc.sqlID.isDefined && jc.sqlID.get == metric.sqlID
         }
-        val stageIdsForSQL = jobsForSql.flatMap(_._2.stageIds).toSeq
+        val stageIdsForSQL = jobsForSql.flatMap(_._2.stageIds).toSet
         val accumsOpt = app.taskStageAccumMap.get(metric.accumulatorId)
         val taskMax = accumsOpt match {
           case Some(accums) =>
@@ -326,7 +326,7 @@ object CollectInformation extends Logging {
         val driverMax = driverAccumsOpt match {
           case Some(accums) =>
             val filtered = accums.filter { a =>
-              a.sqlID == sqlId
+              a.sqlID == metric.sqlID
             }
             val accumValues = filtered.map(_.value).sortWith(_ < _)
             if (accumValues.isEmpty) {
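
Note: jc.sqlID.getOrElse(-1) == sqlId materializes a boxed default for every job inspected, which is what the new "Avoid getOrElse" comment in this hunk is about. A small sketch of three equivalent forms, with illustrative names:

object OptionEqualityForms {
  final case class JobInfo(sqlID: Option[Long])

  def main(args: Array[String]): Unit = {
    val jobs = Seq(JobInfo(Some(42L)), JobInfo(None), JobInfo(Some(7L)))
    val sqlId = 42L

    // Old form: allocates/boxes a -1 default on every comparison.
    val a = jobs.filter(_.sqlID.getOrElse(-1L) == sqlId)

    // Commit's form: short-circuits on None, compares only when defined.
    val b = jobs.filter(jc => jc.sqlID.isDefined && jc.sqlID.get == sqlId)

    // Equivalent single call on Option, for reference.
    val c = jobs.filter(_.sqlID.contains(sqlId))

    println(a == b && b == c)  // true
  }
}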

core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala

Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -144,7 +144,7 @@ case class QueryPlanWithMetrics(plan: SparkPlanInfoWithStage, metrics: Map[Long,
 * Each graph is defined with a set of nodes and a set of edges. Each node represents a node in the
 * SparkPlan tree, and each edge represents a parent-child relationship between two nodes.
 */
-case class SparkPlanGraph(
+case class SparkPlanGraphForDot(
    nodes: Seq[SparkPlanGraphNode],
    edges: Seq[SparkPlanGraphEdge],
    appId: String,
@@ -187,14 +187,14 @@ object SparkPlanGraph {
      appId: String,
      sqlId: String,
      physicalPlan: String,
-     stageIdToStageMetrics: Map[Int, StageMetrics]): SparkPlanGraph = {
+     stageIdToStageMetrics: Map[Int, StageMetrics]): SparkPlanGraphForDot = {
    val nodeIdGenerator = new AtomicLong(0)
    val nodes = mutable.ArrayBuffer[SparkPlanGraphNode]()
    val edges = mutable.ArrayBuffer[SparkPlanGraphEdge]()
    val exchanges = mutable.HashMap[SparkPlanInfoWithStage, SparkPlanGraphNode]()
    buildSparkPlanGraphNode(planInfo, nodeIdGenerator, nodes, edges, null, null, null, exchanges,
      stageIdToStageMetrics)
-   new SparkPlanGraph(nodes, edges, appId, sqlId, physicalPlan)
+   SparkPlanGraphForDot(nodes, edges, appId, sqlId, physicalPlan)
   }
 
   @tailrec
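
Note: renaming the local case class to SparkPlanGraphForDot disambiguates it from the similarly named graph class in Spark's own UI package (presumably the motivation; the diff does not state it), and the construction site drops new in favor of the case-class companion apply. A tiny illustration of that sugar, with hypothetical names:

final case class Graph(nodes: Seq[String], appId: String)

object CaseClassApplyDemo {
  def main(args: Array[String]): Unit = {
    val viaNew = new Graph(Seq("scan"), "app-1")  // explicit constructor
    val viaApply = Graph(Seq("scan"), "app-1")    // companion apply, same value
    println(viaNew == viaApply)                   // true: structural equality
  }
}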

core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala

Lines changed: 5 additions & 1 deletion
@@ -176,7 +176,11 @@ class SQLExecutionInfoClass(
     var duration: Option[Long],
     var hasDatasetOrRDD: Boolean,
     var problematic: String = "",
-    var sqlCpuTimePercent: Double = -1)
+    var sqlCpuTimePercent: Double = -1) {
+  def setDsOrRdd(value: Boolean): Unit = {
+    hasDatasetOrRDD = value
+  }
+}
 
 case class SQLAccumProfileResults(appIndex: Int, sqlID: Long, nodeID: Long,
   nodeName: String, accumulatorId: Long, name: String, min: Long, median:Long,
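
Note: the new setDsOrRdd method wraps assignment to the public var, giving call sites one named mutation point. A sketch of how a caller would use it; the lite class and the caller below are assumed, not part of this diff:

class SQLExecutionInfoLite(
    var hasDatasetOrRDD: Boolean,      // mirrors the real class's mutable field
    var sqlCpuTimePercent: Double = -1) {
  // Mirrors the setter added in this commit.
  def setDsOrRdd(value: Boolean): Unit = {
    hasDatasetOrRDD = value
  }
}

object SetterDemo {
  def main(args: Array[String]): Unit = {
    val sqlInfo = new SQLExecutionInfoLite(hasDatasetOrRDD = false)
    sqlInfo.setDsOrRdd(true)  // e.g. once a Dataset/RDD node is seen in the plan
    println(sqlInfo.hasDatasetOrRDD)  // true
  }
}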

core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala

Lines changed: 10 additions & 5 deletions
@@ -306,7 +306,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolean
       val app = new ApplicationInfo(hadoopConf, path, index)
       EventLogPathProcessor.logApplicationInfo(app)
       val endTime = System.currentTimeMillis()
-      logInfo(s"Took ${endTime - startTime}ms to process ${path.eventLog.toString}")
+      logInfo(s"Took ${endTime - startTime}ms to create App for ${path.eventLog.toString}")
       Some(app)
     } catch {
       case _: com.fasterxml.jackson.core.JsonParseException =>
@@ -327,9 +327,12 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolean
   * and returns the summary information. The summary information is much smaller than
   * the ApplicationInfo because it has processed and combined many of the raw events.
   */
-  private def processApps(apps: Seq[ApplicationInfo], printPlans: Boolean,
-      profileOutputWriter: ProfileOutputWriter): (ApplicationSummaryInfo,
-      Option[CompareSummaryInfo]) = {
+  private def processApps(
+      apps: Seq[ApplicationInfo],
+      printPlans: Boolean,
+      profileOutputWriter: ProfileOutputWriter)
+    : (ApplicationSummaryInfo, Option[CompareSummaryInfo]) = {
+    val startTime = System.currentTimeMillis()
 
     val collect = new CollectInformation(apps)
     val appInfo = collect.getAppInfo
@@ -403,7 +406,9 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolean
         s"to $outputDir in $duration second(s)\n")
       }
     }
-    (ApplicationSummaryInfo(appInfo, dsInfo, execInfo, jobInfo, rapidsProps,
+    val endTime = System.currentTimeMillis()
+    logInfo(s"Took ${endTime - startTime}ms to Process [${appInfo.head.appId}]")
+    (ApplicationSummaryInfo(appInfo, dsInfo, execInfo, jobInfo, rapidsProps,
       rapidsJar, sqlMetrics, jsMetAgg, sqlTaskAggMetrics, durAndCpuMet, skewInfo,
       failedTasks, failedStages, failedJobs, removedBMs, removedExecutors,
       unsupportedOps, sparkProps, sqlStageInfo, wholeStage, maxTaskInputInfo,
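
Note: both Profiler changes bracket a phase with System.currentTimeMillis() and log the elapsed wall time. A generic sketch of that pattern as a reusable helper; the helper is illustrative, the commit inlines the calls:

object PhaseTiming {
  // Runs body, prints how long the named phase took, and returns the result.
  def timed[T](phase: String)(body: => T): T = {
    val startTime = System.currentTimeMillis()
    val result = body
    val endTime = System.currentTimeMillis()
    println(s"Took ${endTime - startTime}ms to $phase")
    result
  }

  def main(args: Array[String]): Unit = {
    val summary = timed("process app") {
      Thread.sleep(50)  // stand-in for the real aggregation work
      "application summary"
    }
    println(summary)
  }
}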
