Skip to content

Commit

Permalink
Profiling tool: add app index to tables that don't have it (#2914)
Browse files Browse the repository at this point in the history
* Add app index to tables that don't have it.

Signed-off-by: Thomas Graves <tgraves@nvidia.com>

* update README

* Add the missing union for removed executors and block managers to handle
multiple apps

* spacing
  • Loading branch information
tgravescs authored Jul 12, 2021
1 parent 6fbee07 commit 48e92c3
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 116 deletions.
18 changes: 9 additions & 9 deletions tools/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -363,15 +363,15 @@ Rapids Accelerator Jar and cuDF Jar:
These are also called accumulables in Spark.
```
SQL Plan Metrics for Application:
+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+
|sqlID|nodeID|nodeName |accumulatorId|name |max_value |metricType|
+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+
|0 |1 |GpuColumnarExchange |111 |output rows |1111111111 |sum |
|0 |1 |GpuColumnarExchange |112 |output columnar batches|222222 |sum |
|0 |1 |GpuColumnarExchange |113 |data size |333333333333 |size |
|0 |1 |GpuColumnarExchange |114 |shuffle bytes written |444444444444 |size |
|0 |1 |GpuColumnarExchange |115 |shuffle records written|555555 |sum |
|0 |1 |GpuColumnarExchange |116 |shuffle write time |666666666666 |nsTiming |
+--------+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+
|appIndex|sqlID|nodeID|nodeName |accumulatorId|name |max_value |metricType|
+--------+-----+------+-----------------------------------------------------------+-------------+-----------------------+-------------+----------+
|1 |0 |1 |GpuColumnarExchange |111 |output rows |1111111111 |sum |
|1 |0 |1 |GpuColumnarExchange |112 |output columnar batches|222222 |sum |
|1 |0 |1 |GpuColumnarExchange |113 |data size |333333333333 |size |
|1 |0 |1 |GpuColumnarExchange |114 |shuffle bytes written |444444444444 |size |
|1 |0 |1 |GpuColumnarExchange |115 |shuffle records written|555555 |sum |
|1 |0 |1 |GpuColumnarExchange |116 |shuffle write time |666666666666 |nsTiming |
```
- Print SQL Plans (-p option):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,25 +71,29 @@ class HealthCheck(apps: Seq[ApplicationInfo], textFileWriter: ToolTextFileWriter

//Function to list all SparkListenerBlockManagerRemoved
def listRemovedBlockManager(): Unit = {
for (app <- apps) {
if (app.allDataFrames.contains(s"blockManagersRemovedDF_${app.index}")) {
val blockManagersMessageHeader =
s"Removed BlockManager(s):\n"
app.runQuery(query = app.getblockManagersRemoved, fileWriter = Some(textFileWriter),
messageHeader = blockManagersMessageHeader)
}
val header = "\nRemoved BlockManager(s):\n"
val query = apps
.filter { p =>
(p.allDataFrames.contains(s"blockManagersRemovedDF_${p.index}"))
}.map(app => "(" + app.getblockManagersRemoved + ")")
.mkString(" union ")
if (query.nonEmpty) {
apps.head.runQuery(query + "order by appIndex, executorID", false,
fileWriter = Some(textFileWriter), messageHeader = header)
}
}

//Function to list all SparkListenerExecutorRemoved
def listRemovedExecutors(): Unit = {
for (app <- apps) {
if (app.allDataFrames.contains(s"executorsRemovedDF_${app.index}")) {
val executorsRemovedMessageHeader =
s"Removed Executors(s):\n"
app.runQuery(query = app.getExecutorsRemoved, fileWriter = Some(textFileWriter),
messageHeader = executorsRemovedMessageHeader)
}
val header = "\nRemoved Executors(s):\n"
val query = apps
.filter { p =>
(p.allDataFrames.contains(s"executorsRemovedDF_${p.index}"))
}.map(app => "(" + app.getExecutorsRemoved + ")")
.mkString(" union ")
if (query.nonEmpty) {
apps.head.runQuery(query + "order by appIndex, executorID", false,
fileWriter = Some(textFileWriter), messageHeader = header)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ class ApplicationInfo(
|and s.accumulatorId=t.accumulatorId
|and s.sqlID=p.sqlID and s.accumulatorId=p.accumulatorId
|)
|select sqlID, nodeID, nodeName,
|select $index as appIndex, sqlID, nodeID, nodeName,
|accumulatorId, name, max(value) as max_value, metricType
|from allaccums
|group by sqlID, nodeID, nodeName, accumulatorId, name, metricType
Expand Down Expand Up @@ -920,14 +920,14 @@ class ApplicationInfo(
}

def getblockManagersRemoved: String = {
s"""select executorID, time
s"""select $index as appIndex, executorID, time
|from blockManagersRemovedDF_$index
|order by cast(executorID as long)
|""".stripMargin
}

def getExecutorsRemoved: String = {
s"""select executorID, time,
s"""select $index as appIndex, executorID, time,
|substr(reason, 1, 100) reason_first100char
|from executorsRemovedDF_$index
|order by cast(executorID as long)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
executorID,time,reason_first100char
1,1623285426806,Executor Process Lost
2,1623285426654,Executor Process Lost
appIndex,executorID,time,reason_first100char
1,1,1623285426806,Executor Process Lost
1,2,1623285426654,Executor Process Lost
Original file line number Diff line number Diff line change
@@ -1,84 +1,84 @@
sqlID,nodeID,nodeName,accumulatorId,name,max_value,metricType
0,0,GpuColumnarToRow,33,total time,857404,nsTiming
0,1,GpuHashAggregate,34,output rows,1,sum
0,1,GpuHashAggregate,35,output columnar batches,1,sum
0,1,GpuHashAggregate,36,total time,4212819,nsTiming
0,1,GpuHashAggregate,37,aggregation time,3846803,nsTiming
0,2,GpuShuffleCoalesce,39,output rows,200,sum
0,2,GpuShuffleCoalesce,40,output columnar batches,1,sum
0,2,GpuShuffleCoalesce,41,total time,3803240,nsTiming
0,2,GpuShuffleCoalesce,42,collect batch time,3277904,nsTiming
0,2,GpuShuffleCoalesce,43,concat batch time,392509,nsTiming
0,3,GpuColumnarExchange,44,partition data size,16000,sum
0,3,GpuColumnarExchange,45,partitions,1,sum
0,3,GpuColumnarExchange,46,output rows,200,sum
0,3,GpuColumnarExchange,47,output columnar batches,200,sum
0,3,GpuColumnarExchange,48,data size,19600,size
0,3,GpuColumnarExchange,50,local blocks read,200,sum
0,3,GpuColumnarExchange,53,local bytes read,15400,size
0,3,GpuColumnarExchange,54,fetch wait time,0,timing
0,3,GpuColumnarExchange,55,records read,200,sum
0,3,GpuColumnarExchange,56,shuffle bytes written,15400,size
0,3,GpuColumnarExchange,57,shuffle records written,200,sum
0,3,GpuColumnarExchange,58,shuffle write time,93193331,nsTiming
0,4,GpuHashAggregate,59,output rows,200,sum
0,4,GpuHashAggregate,60,output columnar batches,200,sum
0,4,GpuHashAggregate,61,total time,80781515,nsTiming
0,4,GpuHashAggregate,62,aggregation time,31923387,nsTiming
0,5,GpuProject,64,total time,5377158,nsTiming
0,6,GpuShuffledHashJoin,65,output rows,10000000,sum
0,6,GpuShuffledHashJoin,66,output columnar batches,200,sum
0,6,GpuShuffledHashJoin,67,total time,3904332009,nsTiming
0,6,GpuShuffledHashJoin,68,build side size,80000000,size
0,6,GpuShuffledHashJoin,69,build time,3448606506,nsTiming
0,6,GpuShuffledHashJoin,70,stream time,260796041,nsTiming
0,6,GpuShuffledHashJoin,71,join time,178084313,nsTiming
0,6,GpuShuffledHashJoin,72,join output rows,10000000,sum
0,7,GpuShuffleCoalesce,74,output rows,10000000,sum
0,7,GpuShuffleCoalesce,75,output columnar batches,200,sum
0,7,GpuShuffleCoalesce,76,total time,261389422,nsTiming
0,7,GpuShuffleCoalesce,77,collect batch time,167775821,nsTiming
0,7,GpuShuffleCoalesce,78,concat batch time,83550919,nsTiming
0,8,GpuColumnarExchange,79,partition data size,42872100,sum
0,8,GpuColumnarExchange,80,partitions,200,sum
0,8,GpuColumnarExchange,81,output rows,10000000,sum
0,8,GpuColumnarExchange,82,output columnar batches,1200,sum
0,8,GpuColumnarExchange,83,data size,40076192,size
0,8,GpuColumnarExchange,85,local blocks read,1200,sum
0,8,GpuColumnarExchange,88,local bytes read,40132258,size
0,8,GpuColumnarExchange,89,fetch wait time,0,timing
0,8,GpuColumnarExchange,90,records read,1200,sum
0,8,GpuColumnarExchange,91,shuffle bytes written,40132258,size
0,8,GpuColumnarExchange,92,shuffle records written,1200,sum
0,8,GpuColumnarExchange,93,shuffle write time,508750471,nsTiming
0,9,GpuProject,94,total time,6667140,nsTiming
0,10,GpuRowToColumnar,95,total time,61112304,nsTiming
0,11,WholeStageCodegen (1),96,duration,5463,timing
0,13,Scan,97,number of output rows,10000000,sum
0,14,GpuCoalesceBatches,98,output rows,10000000,sum
0,14,GpuCoalesceBatches,99,output columnar batches,200,sum
0,14,GpuCoalesceBatches,100,total time,3383354389,nsTiming
0,14,GpuCoalesceBatches,101,collect batch time,3275108263,nsTiming
0,14,GpuCoalesceBatches,102,concat batch time,20312708,nsTiming
0,14,GpuCoalesceBatches,103,peak device memory,80000000,size
0,15,GpuShuffleCoalesce,107,output rows,10000000,sum
0,15,GpuShuffleCoalesce,108,output columnar batches,200,sum
0,15,GpuShuffleCoalesce,109,total time,3266208420,nsTiming
0,15,GpuShuffleCoalesce,110,collect batch time,359397047,nsTiming
0,15,GpuShuffleCoalesce,111,concat batch time,104974316,nsTiming
0,16,GpuColumnarExchange,112,partition data size,42872100,sum
0,16,GpuColumnarExchange,113,partitions,200,sum
0,16,GpuColumnarExchange,114,output rows,10000000,sum
0,16,GpuColumnarExchange,115,output columnar batches,1200,sum
0,16,GpuColumnarExchange,116,data size,40076192,size
0,16,GpuColumnarExchange,118,local blocks read,1200,sum
0,16,GpuColumnarExchange,121,local bytes read,40132250,size
0,16,GpuColumnarExchange,122,fetch wait time,0,timing
0,16,GpuColumnarExchange,123,records read,1200,sum
0,16,GpuColumnarExchange,124,shuffle bytes written,40132250,size
0,16,GpuColumnarExchange,125,shuffle records written,1200,sum
0,16,GpuColumnarExchange,126,shuffle write time,400284505,nsTiming
0,17,GpuProject,127,total time,207820,nsTiming
0,18,GpuRowToColumnar,128,total time,58640462,nsTiming
0,19,WholeStageCodegen (2),129,duration,5920,timing
0,21,Scan,130,number of output rows,10000000,sum
appIndex,sqlID,nodeID,nodeName,accumulatorId,name,max_value,metricType
1,0,0,GpuColumnarToRow,33,total time,857404,nsTiming
1,0,1,GpuHashAggregate,34,output rows,1,sum
1,0,1,GpuHashAggregate,35,output columnar batches,1,sum
1,0,1,GpuHashAggregate,36,total time,4212819,nsTiming
1,0,1,GpuHashAggregate,37,aggregation time,3846803,nsTiming
1,0,2,GpuShuffleCoalesce,39,output rows,200,sum
1,0,2,GpuShuffleCoalesce,40,output columnar batches,1,sum
1,0,2,GpuShuffleCoalesce,41,total time,3803240,nsTiming
1,0,2,GpuShuffleCoalesce,42,collect batch time,3277904,nsTiming
1,0,2,GpuShuffleCoalesce,43,concat batch time,392509,nsTiming
1,0,3,GpuColumnarExchange,44,partition data size,16000,sum
1,0,3,GpuColumnarExchange,45,partitions,1,sum
1,0,3,GpuColumnarExchange,46,output rows,200,sum
1,0,3,GpuColumnarExchange,47,output columnar batches,200,sum
1,0,3,GpuColumnarExchange,48,data size,19600,size
1,0,3,GpuColumnarExchange,50,local blocks read,200,sum
1,0,3,GpuColumnarExchange,53,local bytes read,15400,size
1,0,3,GpuColumnarExchange,54,fetch wait time,0,timing
1,0,3,GpuColumnarExchange,55,records read,200,sum
1,0,3,GpuColumnarExchange,56,shuffle bytes written,15400,size
1,0,3,GpuColumnarExchange,57,shuffle records written,200,sum
1,0,3,GpuColumnarExchange,58,shuffle write time,93193331,nsTiming
1,0,4,GpuHashAggregate,59,output rows,200,sum
1,0,4,GpuHashAggregate,60,output columnar batches,200,sum
1,0,4,GpuHashAggregate,61,total time,80781515,nsTiming
1,0,4,GpuHashAggregate,62,aggregation time,31923387,nsTiming
1,0,5,GpuProject,64,total time,5377158,nsTiming
1,0,6,GpuShuffledHashJoin,65,output rows,10000000,sum
1,0,6,GpuShuffledHashJoin,66,output columnar batches,200,sum
1,0,6,GpuShuffledHashJoin,67,total time,3904332009,nsTiming
1,0,6,GpuShuffledHashJoin,68,build side size,80000000,size
1,0,6,GpuShuffledHashJoin,69,build time,3448606506,nsTiming
1,0,6,GpuShuffledHashJoin,70,stream time,260796041,nsTiming
1,0,6,GpuShuffledHashJoin,71,join time,178084313,nsTiming
1,0,6,GpuShuffledHashJoin,72,join output rows,10000000,sum
1,0,7,GpuShuffleCoalesce,74,output rows,10000000,sum
1,0,7,GpuShuffleCoalesce,75,output columnar batches,200,sum
1,0,7,GpuShuffleCoalesce,76,total time,261389422,nsTiming
1,0,7,GpuShuffleCoalesce,77,collect batch time,167775821,nsTiming
1,0,7,GpuShuffleCoalesce,78,concat batch time,83550919,nsTiming
1,0,8,GpuColumnarExchange,79,partition data size,42872100,sum
1,0,8,GpuColumnarExchange,80,partitions,200,sum
1,0,8,GpuColumnarExchange,81,output rows,10000000,sum
1,0,8,GpuColumnarExchange,82,output columnar batches,1200,sum
1,0,8,GpuColumnarExchange,83,data size,40076192,size
1,0,8,GpuColumnarExchange,85,local blocks read,1200,sum
1,0,8,GpuColumnarExchange,88,local bytes read,40132258,size
1,0,8,GpuColumnarExchange,89,fetch wait time,0,timing
1,0,8,GpuColumnarExchange,90,records read,1200,sum
1,0,8,GpuColumnarExchange,91,shuffle bytes written,40132258,size
1,0,8,GpuColumnarExchange,92,shuffle records written,1200,sum
1,0,8,GpuColumnarExchange,93,shuffle write time,508750471,nsTiming
1,0,9,GpuProject,94,total time,6667140,nsTiming
1,0,10,GpuRowToColumnar,95,total time,61112304,nsTiming
1,0,11,WholeStageCodegen (1),96,duration,5463,timing
1,0,13,Scan,97,number of output rows,10000000,sum
1,0,14,GpuCoalesceBatches,98,output rows,10000000,sum
1,0,14,GpuCoalesceBatches,99,output columnar batches,200,sum
1,0,14,GpuCoalesceBatches,100,total time,3383354389,nsTiming
1,0,14,GpuCoalesceBatches,101,collect batch time,3275108263,nsTiming
1,0,14,GpuCoalesceBatches,102,concat batch time,20312708,nsTiming
1,0,14,GpuCoalesceBatches,103,peak device memory,80000000,size
1,0,15,GpuShuffleCoalesce,107,output rows,10000000,sum
1,0,15,GpuShuffleCoalesce,108,output columnar batches,200,sum
1,0,15,GpuShuffleCoalesce,109,total time,3266208420,nsTiming
1,0,15,GpuShuffleCoalesce,110,collect batch time,359397047,nsTiming
1,0,15,GpuShuffleCoalesce,111,concat batch time,104974316,nsTiming
1,0,16,GpuColumnarExchange,112,partition data size,42872100,sum
1,0,16,GpuColumnarExchange,113,partitions,200,sum
1,0,16,GpuColumnarExchange,114,output rows,10000000,sum
1,0,16,GpuColumnarExchange,115,output columnar batches,1200,sum
1,0,16,GpuColumnarExchange,116,data size,40076192,size
1,0,16,GpuColumnarExchange,118,local blocks read,1200,sum
1,0,16,GpuColumnarExchange,121,local bytes read,40132250,size
1,0,16,GpuColumnarExchange,122,fetch wait time,0,timing
1,0,16,GpuColumnarExchange,123,records read,1200,sum
1,0,16,GpuColumnarExchange,124,shuffle bytes written,40132250,size
1,0,16,GpuColumnarExchange,125,shuffle records written,1200,sum
1,0,16,GpuColumnarExchange,126,shuffle write time,400284505,nsTiming
1,0,17,GpuProject,127,total time,207820,nsTiming
1,0,18,GpuRowToColumnar,128,total time,58640462,nsTiming
1,0,19,WholeStageCodegen (2),129,duration,5920,timing
1,0,21,Scan,130,number of output rows,10000000,sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
executorID,time
1,1623285426800
2,1623285426638
appIndex,executorID,time
1,1,1623285426800
1,2,1623285426638

0 comments on commit 48e92c3

Please sign in to comment.