From 48e92c32deed1210e42b723d612e61d36b514d95 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Mon, 12 Jul 2021 17:04:54 -0500 Subject: [PATCH] Profiling tool: add app index to tables that don't have it (#2914) * Add app index to tables that don't have it. Signed-off-by: Thomas Graves * update README * Add missing union for removed executors and block managers to handle multiple apps * spacing --- tools/README.md | 18 +- .../rapids/tool/profiling/HealthCheck.scala | 32 ++-- .../tool/profiling/ApplicationInfo.scala | 6 +- ...executors_removed_eventlog_expectation.csv | 6 +- ...s_join_eventlog_sqlmetrics_expectation.csv | 168 +++++++++--------- ...ved_blockManagers_eventlog_expectation.csv | 6 +- 6 files changed, 120 insertions(+), 116 deletions(-) diff --git a/tools/README.md b/tools/README.md index ab358dce25c..63023536c62 100644 --- a/tools/README.md +++ b/tools/README.md @@ -363,15 +363,15 @@ Rapids Accelerator Jar and cuDF Jar: These are also called accumulables in Spark. ``` SQL Plan Metrics for Application: -+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+ -|sqlID|nodeID|nodeName |accumulatorId|name |max_value |metricType| -+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+ -|0 |1 |GpuColumnarExchange |111 |output rows |1111111111 |sum | -|0 |1 |GpuColumnarExchange |112 |output columnar batches|222222 |sum | -|0 |1 |GpuColumnarExchange |113 |data size |333333333333 |size | -|0 |1 |GpuColumnarExchange |114 |shuffle bytes written |444444444444 |size | -|0 |1 |GpuColumnarExchange |115 |shuffle records written|555555 |sum | -|0 |1 |GpuColumnarExchange |116 |shuffle write time |666666666666 |nsTiming | ++--------+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+ +|appIndex|sqlID|nodeID|nodeName 
|accumulatorId|name |max_value |metricType| ++--------+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+ +|1 |0 |1 |GpuColumnarExchange |111 |output rows |1111111111 |sum | +|1 |0 |1 |GpuColumnarExchange |112 |output columnar batches|222222 |sum | +|1 |0 |1 |GpuColumnarExchange |113 |data size |333333333333 |size | +|1 |0 |1 |GpuColumnarExchange |114 |shuffle bytes written |444444444444 |size | +|1 |0 |1 |GpuColumnarExchange |115 |shuffle records written|555555 |sum | +|1 |0 |1 |GpuColumnarExchange |116 |shuffle write time |666666666666 |nsTiming | ``` - Print SQL Plans (-p option): diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/HealthCheck.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/HealthCheck.scala index b7ddd6ef3e1..38f79776964 100644 --- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/HealthCheck.scala +++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/HealthCheck.scala @@ -71,25 +71,29 @@ class HealthCheck(apps: Seq[ApplicationInfo], textFileWriter: ToolTextFileWriter //Function to list all SparkListenerBlockManagerRemoved def listRemovedBlockManager(): Unit = { - for (app <- apps) { - if (app.allDataFrames.contains(s"blockManagersRemovedDF_${app.index}")) { - val blockManagersMessageHeader = - s"Removed BlockManager(s):\n" - app.runQuery(query = app.getblockManagersRemoved, fileWriter = Some(textFileWriter), - messageHeader = blockManagersMessageHeader) - } + val header = "\nRemoved BlockManager(s):\n" + val query = apps + .filter { p => + (p.allDataFrames.contains(s"blockManagersRemovedDF_${p.index}")) + }.map(app => "(" + app.getblockManagersRemoved + ")") + .mkString(" union ") + if (query.nonEmpty) { + apps.head.runQuery(query + "order by appIndex, executorID", false, + fileWriter = Some(textFileWriter), messageHeader = header) } } //Function to list all SparkListenerExecutorRemoved def 
listRemovedExecutors(): Unit = { - for (app <- apps) { - if (app.allDataFrames.contains(s"executorsRemovedDF_${app.index}")) { - val executorsRemovedMessageHeader = - s"Removed Executors(s):\n" - app.runQuery(query = app.getExecutorsRemoved, fileWriter = Some(textFileWriter), - messageHeader = executorsRemovedMessageHeader) - } + val header = "\nRemoved Executors(s):\n" + val query = apps + .filter { p => + (p.allDataFrames.contains(s"executorsRemovedDF_${p.index}")) + }.map(app => "(" + app.getExecutorsRemoved + ")") + .mkString(" union ") + if (query.nonEmpty) { + apps.head.runQuery(query + "order by appIndex, executorID", false, + fileWriter = Some(textFileWriter), messageHeader = header) } } diff --git a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala index 31f8b9a5efe..576d94b5283 100644 --- a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala +++ b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala @@ -884,7 +884,7 @@ class ApplicationInfo( |and s.accumulatorId=t.accumulatorId |and s.sqlID=p.sqlID and s.accumulatorId=p.accumulatorId |) - |select sqlID, nodeID, nodeName, + |select $index as appIndex, sqlID, nodeID, nodeName, |accumulatorId, name, max(value) as max_value, metricType |from allaccums |group by sqlID, nodeID, nodeName, accumulatorId, name, metricType @@ -920,14 +920,14 @@ class ApplicationInfo( } def getblockManagersRemoved: String = { - s"""select executorID, time + s"""select $index as appIndex, executorID, time |from blockManagersRemovedDF_$index |order by cast(executorID as long) |""".stripMargin } def getExecutorsRemoved: String = { - s"""select executorID, time, + s"""select $index as appIndex, executorID, time, |substr(reason, 1, 100) reason_first100char |from executorsRemovedDF_$index |order by cast(executorID as long) diff --git 
a/tools/src/test/resources/ProfilingExpectations/executors_removed_eventlog_expectation.csv b/tools/src/test/resources/ProfilingExpectations/executors_removed_eventlog_expectation.csv index b47bd574bd6..ffe393e32d9 100644 --- a/tools/src/test/resources/ProfilingExpectations/executors_removed_eventlog_expectation.csv +++ b/tools/src/test/resources/ProfilingExpectations/executors_removed_eventlog_expectation.csv @@ -1,3 +1,3 @@ -executorID,time,reason_first100char -1,1623285426806,Executor Process Lost -2,1623285426654,Executor Process Lost +appIndex,executorID,time,reason_first100char +1,1,1623285426806,Executor Process Lost +1,2,1623285426654,Executor Process Lost diff --git a/tools/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetrics_expectation.csv b/tools/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetrics_expectation.csv index aa9b3984a58..52fc579443b 100644 --- a/tools/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetrics_expectation.csv +++ b/tools/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetrics_expectation.csv @@ -1,84 +1,84 @@ -sqlID,nodeID,nodeName,accumulatorId,name,max_value,metricType -0,0,GpuColumnarToRow,33,total time,857404,nsTiming -0,1,GpuHashAggregate,34,output rows,1,sum -0,1,GpuHashAggregate,35,output columnar batches,1,sum -0,1,GpuHashAggregate,36,total time,4212819,nsTiming -0,1,GpuHashAggregate,37,aggregation time,3846803,nsTiming -0,2,GpuShuffleCoalesce,39,output rows,200,sum -0,2,GpuShuffleCoalesce,40,output columnar batches,1,sum -0,2,GpuShuffleCoalesce,41,total time,3803240,nsTiming -0,2,GpuShuffleCoalesce,42,collect batch time,3277904,nsTiming -0,2,GpuShuffleCoalesce,43,concat batch time,392509,nsTiming -0,3,GpuColumnarExchange,44,partition data size,16000,sum -0,3,GpuColumnarExchange,45,partitions,1,sum -0,3,GpuColumnarExchange,46,output rows,200,sum -0,3,GpuColumnarExchange,47,output columnar batches,200,sum -0,3,GpuColumnarExchange,48,data 
size,19600,size -0,3,GpuColumnarExchange,50,local blocks read,200,sum -0,3,GpuColumnarExchange,53,local bytes read,15400,size -0,3,GpuColumnarExchange,54,fetch wait time,0,timing -0,3,GpuColumnarExchange,55,records read,200,sum -0,3,GpuColumnarExchange,56,shuffle bytes written,15400,size -0,3,GpuColumnarExchange,57,shuffle records written,200,sum -0,3,GpuColumnarExchange,58,shuffle write time,93193331,nsTiming -0,4,GpuHashAggregate,59,output rows,200,sum -0,4,GpuHashAggregate,60,output columnar batches,200,sum -0,4,GpuHashAggregate,61,total time,80781515,nsTiming -0,4,GpuHashAggregate,62,aggregation time,31923387,nsTiming -0,5,GpuProject,64,total time,5377158,nsTiming -0,6,GpuShuffledHashJoin,65,output rows,10000000,sum -0,6,GpuShuffledHashJoin,66,output columnar batches,200,sum -0,6,GpuShuffledHashJoin,67,total time,3904332009,nsTiming -0,6,GpuShuffledHashJoin,68,build side size,80000000,size -0,6,GpuShuffledHashJoin,69,build time,3448606506,nsTiming -0,6,GpuShuffledHashJoin,70,stream time,260796041,nsTiming -0,6,GpuShuffledHashJoin,71,join time,178084313,nsTiming -0,6,GpuShuffledHashJoin,72,join output rows,10000000,sum -0,7,GpuShuffleCoalesce,74,output rows,10000000,sum -0,7,GpuShuffleCoalesce,75,output columnar batches,200,sum -0,7,GpuShuffleCoalesce,76,total time,261389422,nsTiming -0,7,GpuShuffleCoalesce,77,collect batch time,167775821,nsTiming -0,7,GpuShuffleCoalesce,78,concat batch time,83550919,nsTiming -0,8,GpuColumnarExchange,79,partition data size,42872100,sum -0,8,GpuColumnarExchange,80,partitions,200,sum -0,8,GpuColumnarExchange,81,output rows,10000000,sum -0,8,GpuColumnarExchange,82,output columnar batches,1200,sum -0,8,GpuColumnarExchange,83,data size,40076192,size -0,8,GpuColumnarExchange,85,local blocks read,1200,sum -0,8,GpuColumnarExchange,88,local bytes read,40132258,size -0,8,GpuColumnarExchange,89,fetch wait time,0,timing -0,8,GpuColumnarExchange,90,records read,1200,sum -0,8,GpuColumnarExchange,91,shuffle bytes written,40132258,size 
-0,8,GpuColumnarExchange,92,shuffle records written,1200,sum -0,8,GpuColumnarExchange,93,shuffle write time,508750471,nsTiming -0,9,GpuProject,94,total time,6667140,nsTiming -0,10,GpuRowToColumnar,95,total time,61112304,nsTiming -0,11,WholeStageCodegen (1),96,duration,5463,timing -0,13,Scan,97,number of output rows,10000000,sum -0,14,GpuCoalesceBatches,98,output rows,10000000,sum -0,14,GpuCoalesceBatches,99,output columnar batches,200,sum -0,14,GpuCoalesceBatches,100,total time,3383354389,nsTiming -0,14,GpuCoalesceBatches,101,collect batch time,3275108263,nsTiming -0,14,GpuCoalesceBatches,102,concat batch time,20312708,nsTiming -0,14,GpuCoalesceBatches,103,peak device memory,80000000,size -0,15,GpuShuffleCoalesce,107,output rows,10000000,sum -0,15,GpuShuffleCoalesce,108,output columnar batches,200,sum -0,15,GpuShuffleCoalesce,109,total time,3266208420,nsTiming -0,15,GpuShuffleCoalesce,110,collect batch time,359397047,nsTiming -0,15,GpuShuffleCoalesce,111,concat batch time,104974316,nsTiming -0,16,GpuColumnarExchange,112,partition data size,42872100,sum -0,16,GpuColumnarExchange,113,partitions,200,sum -0,16,GpuColumnarExchange,114,output rows,10000000,sum -0,16,GpuColumnarExchange,115,output columnar batches,1200,sum -0,16,GpuColumnarExchange,116,data size,40076192,size -0,16,GpuColumnarExchange,118,local blocks read,1200,sum -0,16,GpuColumnarExchange,121,local bytes read,40132250,size -0,16,GpuColumnarExchange,122,fetch wait time,0,timing -0,16,GpuColumnarExchange,123,records read,1200,sum -0,16,GpuColumnarExchange,124,shuffle bytes written,40132250,size -0,16,GpuColumnarExchange,125,shuffle records written,1200,sum -0,16,GpuColumnarExchange,126,shuffle write time,400284505,nsTiming -0,17,GpuProject,127,total time,207820,nsTiming -0,18,GpuRowToColumnar,128,total time,58640462,nsTiming -0,19,WholeStageCodegen (2),129,duration,5920,timing -0,21,Scan,130,number of output rows,10000000,sum +appIndex,sqlID,nodeID,nodeName,accumulatorId,name,max_value,metricType 
+1,0,0,GpuColumnarToRow,33,total time,857404,nsTiming +1,0,1,GpuHashAggregate,34,output rows,1,sum +1,0,1,GpuHashAggregate,35,output columnar batches,1,sum +1,0,1,GpuHashAggregate,36,total time,4212819,nsTiming +1,0,1,GpuHashAggregate,37,aggregation time,3846803,nsTiming +1,0,2,GpuShuffleCoalesce,39,output rows,200,sum +1,0,2,GpuShuffleCoalesce,40,output columnar batches,1,sum +1,0,2,GpuShuffleCoalesce,41,total time,3803240,nsTiming +1,0,2,GpuShuffleCoalesce,42,collect batch time,3277904,nsTiming +1,0,2,GpuShuffleCoalesce,43,concat batch time,392509,nsTiming +1,0,3,GpuColumnarExchange,44,partition data size,16000,sum +1,0,3,GpuColumnarExchange,45,partitions,1,sum +1,0,3,GpuColumnarExchange,46,output rows,200,sum +1,0,3,GpuColumnarExchange,47,output columnar batches,200,sum +1,0,3,GpuColumnarExchange,48,data size,19600,size +1,0,3,GpuColumnarExchange,50,local blocks read,200,sum +1,0,3,GpuColumnarExchange,53,local bytes read,15400,size +1,0,3,GpuColumnarExchange,54,fetch wait time,0,timing +1,0,3,GpuColumnarExchange,55,records read,200,sum +1,0,3,GpuColumnarExchange,56,shuffle bytes written,15400,size +1,0,3,GpuColumnarExchange,57,shuffle records written,200,sum +1,0,3,GpuColumnarExchange,58,shuffle write time,93193331,nsTiming +1,0,4,GpuHashAggregate,59,output rows,200,sum +1,0,4,GpuHashAggregate,60,output columnar batches,200,sum +1,0,4,GpuHashAggregate,61,total time,80781515,nsTiming +1,0,4,GpuHashAggregate,62,aggregation time,31923387,nsTiming +1,0,5,GpuProject,64,total time,5377158,nsTiming +1,0,6,GpuShuffledHashJoin,65,output rows,10000000,sum +1,0,6,GpuShuffledHashJoin,66,output columnar batches,200,sum +1,0,6,GpuShuffledHashJoin,67,total time,3904332009,nsTiming +1,0,6,GpuShuffledHashJoin,68,build side size,80000000,size +1,0,6,GpuShuffledHashJoin,69,build time,3448606506,nsTiming +1,0,6,GpuShuffledHashJoin,70,stream time,260796041,nsTiming +1,0,6,GpuShuffledHashJoin,71,join time,178084313,nsTiming +1,0,6,GpuShuffledHashJoin,72,join output rows,10000000,sum 
+1,0,7,GpuShuffleCoalesce,74,output rows,10000000,sum +1,0,7,GpuShuffleCoalesce,75,output columnar batches,200,sum +1,0,7,GpuShuffleCoalesce,76,total time,261389422,nsTiming +1,0,7,GpuShuffleCoalesce,77,collect batch time,167775821,nsTiming +1,0,7,GpuShuffleCoalesce,78,concat batch time,83550919,nsTiming +1,0,8,GpuColumnarExchange,79,partition data size,42872100,sum +1,0,8,GpuColumnarExchange,80,partitions,200,sum +1,0,8,GpuColumnarExchange,81,output rows,10000000,sum +1,0,8,GpuColumnarExchange,82,output columnar batches,1200,sum +1,0,8,GpuColumnarExchange,83,data size,40076192,size +1,0,8,GpuColumnarExchange,85,local blocks read,1200,sum +1,0,8,GpuColumnarExchange,88,local bytes read,40132258,size +1,0,8,GpuColumnarExchange,89,fetch wait time,0,timing +1,0,8,GpuColumnarExchange,90,records read,1200,sum +1,0,8,GpuColumnarExchange,91,shuffle bytes written,40132258,size +1,0,8,GpuColumnarExchange,92,shuffle records written,1200,sum +1,0,8,GpuColumnarExchange,93,shuffle write time,508750471,nsTiming +1,0,9,GpuProject,94,total time,6667140,nsTiming +1,0,10,GpuRowToColumnar,95,total time,61112304,nsTiming +1,0,11,WholeStageCodegen (1),96,duration,5463,timing +1,0,13,Scan,97,number of output rows,10000000,sum +1,0,14,GpuCoalesceBatches,98,output rows,10000000,sum +1,0,14,GpuCoalesceBatches,99,output columnar batches,200,sum +1,0,14,GpuCoalesceBatches,100,total time,3383354389,nsTiming +1,0,14,GpuCoalesceBatches,101,collect batch time,3275108263,nsTiming +1,0,14,GpuCoalesceBatches,102,concat batch time,20312708,nsTiming +1,0,14,GpuCoalesceBatches,103,peak device memory,80000000,size +1,0,15,GpuShuffleCoalesce,107,output rows,10000000,sum +1,0,15,GpuShuffleCoalesce,108,output columnar batches,200,sum +1,0,15,GpuShuffleCoalesce,109,total time,3266208420,nsTiming +1,0,15,GpuShuffleCoalesce,110,collect batch time,359397047,nsTiming +1,0,15,GpuShuffleCoalesce,111,concat batch time,104974316,nsTiming +1,0,16,GpuColumnarExchange,112,partition data size,42872100,sum 
+1,0,16,GpuColumnarExchange,113,partitions,200,sum +1,0,16,GpuColumnarExchange,114,output rows,10000000,sum +1,0,16,GpuColumnarExchange,115,output columnar batches,1200,sum +1,0,16,GpuColumnarExchange,116,data size,40076192,size +1,0,16,GpuColumnarExchange,118,local blocks read,1200,sum +1,0,16,GpuColumnarExchange,121,local bytes read,40132250,size +1,0,16,GpuColumnarExchange,122,fetch wait time,0,timing +1,0,16,GpuColumnarExchange,123,records read,1200,sum +1,0,16,GpuColumnarExchange,124,shuffle bytes written,40132250,size +1,0,16,GpuColumnarExchange,125,shuffle records written,1200,sum +1,0,16,GpuColumnarExchange,126,shuffle write time,400284505,nsTiming +1,0,17,GpuProject,127,total time,207820,nsTiming +1,0,18,GpuRowToColumnar,128,total time,58640462,nsTiming +1,0,19,WholeStageCodegen (2),129,duration,5920,timing +1,0,21,Scan,130,number of output rows,10000000,sum diff --git a/tools/src/test/resources/ProfilingExpectations/removed_blockManagers_eventlog_expectation.csv b/tools/src/test/resources/ProfilingExpectations/removed_blockManagers_eventlog_expectation.csv index 02289800d5b..03bf9bc62a8 100644 --- a/tools/src/test/resources/ProfilingExpectations/removed_blockManagers_eventlog_expectation.csv +++ b/tools/src/test/resources/ProfilingExpectations/removed_blockManagers_eventlog_expectation.csv @@ -1,3 +1,3 @@ -executorID,time -1,1623285426800 -2,1623285426638 +appIndex,executorID,time +1,1,1623285426800 +1,2,1623285426638