
Commit 634bd96

Adds aggregation across metrics for failed/succeeded and non-completed stages (#1558)

Fixes #1552.

Currently we store stage information using the `StageModelManager` class, which maps incoming stage information during the following events:

1. doSparkListenerStageCompleted — https://github.com/NVIDIA/spark-rapids-tools/blob/1f037fa867e4df0952e29d82164cc7fc507c9b4e/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala#L475
2. doSparkListenerStageSubmitted — https://github.com/NVIDIA/spark-rapids-tools/blob/1f037fa867e4df0952e29d82164cc7fc507c9b4e/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala#L464

So a stage's information is updated once when the stage is submitted and once when it completes. A StageCompleted event arrives for every attempt of a stage (e.g., a stage that fails on its first attempt and succeeds on its second produces two StageSubmitted and two StageCompleted events). This PR changes that behavior to aggregate all attempts of a stage (failed + succeeded).

### Changes

This pull request includes several changes to improve the handling of stage attempts and task metrics in the Spark RAPIDS tool. The most important changes are adding logic to handle multiple stage attempts, modifying methods to aggregate metrics across those attempts, and updating the `AccumManager` to simplify task accumulation.

Handling multiple stage attempts:

* [`core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala`](diffhunk://#diff-4b0aab10a86746bb7480cc3bde4e013c04707758c61782934c07604443160b40L450-R455): Added logic to handle multiple stage attempts by aggregating the metrics of each attempt.
* [`core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala`](diffhunk://#diff-8d5819c9445c1489d61ee8d03fd2b1ee1e0cb33896f402f4ceb7782c35deed69R688-R746): Introduced the `aggregateStageProfileMetric` method to combine metrics for multiple attempts of the same stage.

Simplifying task accumulation:

* [`core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala`](diffhunk://#diff-9b551b7ad326fd9175e0c5b0ba69e947058ee2587922f1fe059e85623604e9c1L372-R372): Modified the `addAccToTask` method to remove the `taskId` parameter and simplify task accumulation.
* [`core/src/main/scala/org/apache/spark/sql/rapids/tool/store/AccumInfo.scala`](diffhunk://#diff-2cdf5cec29c5cfc15962382b2134c8e88b6623afdfd7cc6a81ec3dfc5663b4a1L87-R89): Updated the `addAccumToTask` method to remove the `taskId` parameter.
* [`core/src/main/scala/org/apache/spark/sql/rapids/tool/store/AccumManager.scala`](diffhunk://#diff-ff390301f53c6470012e1c36878c1987f176c7eeaa52e30e18f93f76e58587b3L43-R45): Simplified the `addAccToTask` method by removing the `taskId` parameter.

### Testing

This change has been tested against internal event logs, and integration tests have been updated to keep this behavior covered in the future.

---------

Signed-off-by: Sayed Bilal Bari <[email protected]>
1 parent 6cf44b3 commit 634bd96

6 files changed (+87, -11 lines changed)
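Before the per-file diffs, a toy sketch may help pin down the behavior change described in the commit message. The `StageCompleted` case class below is a hypothetical stand-in, not Spark's actual listener event types or the tool's classes:

```scala
// Hypothetical miniature of the retry scenario from the commit message: a stage
// that fails on attempt 0 and succeeds on attempt 1 emits one submitted/completed
// event pair per attempt.
case class StageCompleted(stageId: Int, attempt: Int, failed: Boolean, taskTimeMs: Long)

object RetryScenario extends App {
  val events = Seq(
    StageCompleted(stageId = 4, attempt = 0, failed = true, taskTimeMs = 1200L),
    StageCompleted(stageId = 4, attempt = 1, failed = false, taskTimeMs = 900L)
  )

  // Keyed by stageId alone, the later attempt silently overwrites the earlier one.
  val lastAttemptOnly = events.map(e => e.stageId -> e.taskTimeMs).toMap

  // Aggregating across attempts (failed + succeeded) keeps the full cost of the stage.
  val allAttempts = events.groupBy(_.stageId).map { case (id, es) =>
    id -> es.map(_.taskTimeMs).sum
  }

  println(lastAttemptOnly) // Map(4 -> 900)
  println(allAttempts)     // Map(4 -> 2100)
}
```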

`core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala` (11 additions, 6 deletions)

```diff
@@ -320,9 +320,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
       AccumProfileResults(0, 0, AccumMetaRef.EMPTY_ACCUM_META_REF, 0L, 0L, 0L, 0L)
     val emptyNodeNames = Seq.empty[String]
     val emptyDiagnosticMetrics = HashMap.empty[String, AccumProfileResults]
-    // TODO: this has stage attempts. we should handle different attempts
     app.stageManager.getAllStages.map { sm =>
-      // TODO: Should we only consider successful tasks?
       val tasksInStage = app.taskManager.getTasks(sm.stageInfo.stageId,
         sm.stageInfo.attemptNumber())
       // count duplicate task attempts
@@ -358,13 +356,12 @@
   }
 
   /**
-   * Aggregates the SparkMetrics by stage. This is an internal method to populate the cached metrics
+   * Aggregates the SparkMetrics by completed stage information.
+   * This is an internal method to populate the cached metrics
    * to be used by other aggregators.
    * @param index AppIndex (used by the profiler tool)
    */
   private def aggregateSparkMetricsByStageInternal(index: Int): Unit = {
-    // TODO: this has stage attempts. we should handle different attempts
-
     // For Photon apps, peak memory and shuffle write time need to be calculated from accumulators
     // instead of task metrics.
     // Approach:
@@ -447,7 +444,15 @@
         perStageRec.swBytesWrittenSum,
         perStageRec.swRecordsWrittenSum,
         perStageRec.swWriteTimeSum) // converted to milliseconds by the aggregator
-      stageLevelSparkMetrics(index).put(sm.stageInfo.stageId, stageRow)
+      // This logic is to handle the case where there are multiple attempts for a stage.
+      // We check if the StageLevelCache already has a row for the stage.
+      // If yes, we aggregate the metrics of the new row with the existing row.
+      // If no, we just store the new row.
+      val rowToStore = stageLevelSparkMetrics(index)
+        .get(sm.stageInfo.stageId)
+        .map(_.aggregateStageProfileMetric(stageRow))
+        .getOrElse(stageRow)
+      stageLevelSparkMetrics(index).put(sm.stageInfo.stageId, rowToStore)
     }
   }
 }
```
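The merge-or-insert step at the end of this diff is the core of the fix. As a rough illustration of the same pattern, here is a self-contained sketch with a hypothetical `MetricsRow` type standing in for the tool's `StageAggTaskMetricsProfileResult`:

```scala
import scala.collection.mutable

// Hypothetical simplified row; the real tool uses StageAggTaskMetricsProfileResult.
case class MetricsRow(stageId: Int, numTasks: Int, durationSum: Long) {
  // Combine two aggregated rows for the same stage (e.g., two attempts).
  def merge(other: MetricsRow): MetricsRow =
    MetricsRow(stageId, numTasks + other.numTasks, durationSum + other.durationSum)
}

object MergeOrInsertExample extends App {
  val cache = mutable.HashMap.empty[Int, MetricsRow]

  def store(row: MetricsRow): Unit = {
    // If the stage already has a row (an earlier attempt), merge; otherwise insert.
    val rowToStore = cache.get(row.stageId).map(_.merge(row)).getOrElse(row)
    cache.put(row.stageId, rowToStore)
  }

  store(MetricsRow(stageId = 4, numTasks = 8, durationSum = 1200L)) // attempt 0 (failed)
  store(MetricsRow(stageId = 4, numTasks = 8, durationSum = 900L))  // attempt 1 (succeeded)
  println(cache(4)) // MetricsRow(4,16,2100)
}
```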

`core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala` (59 additions, 0 deletions)

```diff
@@ -694,6 +694,65 @@ case class StageAggTaskMetricsProfileResult(
   swRecordsWrittenSum: Long,
   swWriteTimeSum: Long // milliseconds
 ) extends BaseJobStageAggTaskMetricsProfileResult {
+
+  /**
+   * Combines two StageAggTaskMetricsProfileResults for the same stage.
+   * This method aggregates the metrics from the current instance and the provided `other` instance.
+   *
+   * Detailed explanation:
+   * 1. A stage can have multiple attempts (failed and successful).
+   * 2. We store the information for each of those attempts using the StageManager.
+   * 3. During aggregation, we combine the metrics for a stage at a stageID
+   *    level.
+   * 4. To combine aggregated information for multiple stage attempts, we merge the
+   *    per-attempt aggregates into one using the method below.
+   *
+   * @param other The StageAggTaskMetricsProfileResult to be combined with the current instance.
+   * @return A new StageAggTaskMetricsProfileResult with aggregated metrics.
+   */
+  def aggregateStageProfileMetric(
+      other: StageAggTaskMetricsProfileResult
+  ): StageAggTaskMetricsProfileResult = {
+    StageAggTaskMetricsProfileResult(
+      appIndex = this.appIndex,
+      id = this.id,
+      numTasks = this.numTasks + other.numTasks,
+      duration = Option(this.duration.getOrElse(0L) + other.duration.getOrElse(0L)),
+      diskBytesSpilledSum = this.diskBytesSpilledSum + other.diskBytesSpilledSum,
+      durationSum = this.durationSum + other.durationSum,
+      durationMax = Math.max(this.durationMax, other.durationMax),
+      durationMin = Math.min(this.durationMin, other.durationMin),
+      durationAvg = (this.durationAvg + other.durationAvg) / 2,
+      executorCPUTimeSum = this.executorCPUTimeSum + other.executorCPUTimeSum,
+      executorDeserializeCpuTimeSum = this.executorDeserializeCpuTimeSum +
+        other.executorDeserializeCpuTimeSum,
+      executorDeserializeTimeSum = this.executorDeserializeTimeSum +
+        other.executorDeserializeTimeSum,
+      executorRunTimeSum = this.executorRunTimeSum + other.executorRunTimeSum,
+      inputBytesReadSum = this.inputBytesReadSum + other.inputBytesReadSum,
+      inputRecordsReadSum = this.inputRecordsReadSum + other.inputRecordsReadSum,
+      jvmGCTimeSum = this.jvmGCTimeSum + other.jvmGCTimeSum,
+      memoryBytesSpilledSum = this.memoryBytesSpilledSum + other.memoryBytesSpilledSum,
+      outputBytesWrittenSum = this.outputBytesWrittenSum + other.outputBytesWrittenSum,
+      outputRecordsWrittenSum = this.outputRecordsWrittenSum + other.outputRecordsWrittenSum,
+      peakExecutionMemoryMax = Math.max(this.peakExecutionMemoryMax, other.peakExecutionMemoryMax),
+      resultSerializationTimeSum = this.resultSerializationTimeSum +
+        other.resultSerializationTimeSum,
+      resultSizeMax = Math.max(this.resultSizeMax, other.resultSizeMax),
+      srFetchWaitTimeSum = this.srFetchWaitTimeSum + other.srFetchWaitTimeSum,
+      srLocalBlocksFetchedSum = this.srLocalBlocksFetchedSum + other.srLocalBlocksFetchedSum,
+      srRemoteBlocksFetchSum = this.srRemoteBlocksFetchSum + other.srRemoteBlocksFetchSum,
+      srRemoteBytesReadSum = this.srRemoteBytesReadSum + other.srRemoteBytesReadSum,
+      srRemoteBytesReadToDiskSum = this.srRemoteBytesReadToDiskSum +
+        other.srRemoteBytesReadToDiskSum,
+      srTotalBytesReadSum = this.srTotalBytesReadSum + other.srTotalBytesReadSum,
+      srcLocalBytesReadSum = this.srcLocalBytesReadSum + other.srcLocalBytesReadSum,
+      swBytesWrittenSum = this.swBytesWrittenSum + other.swBytesWrittenSum,
+      swRecordsWrittenSum = this.swRecordsWrittenSum + other.swRecordsWrittenSum,
+      swWriteTimeSum = this.swWriteTimeSum + other.swWriteTimeSum
+    )
+  }
+
   override def idHeader = "stageId"
 }
```
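As a hedged sketch of how the combine rules above behave on two attempts of one stage: totals are summed, extrema keep the max/min, and the average is the unweighted mean of the two per-attempt averages. `Attempt` is a hypothetical reduced type, not the full result class:

```scala
// Hypothetical reduced view of the aggregation rules in aggregateStageProfileMetric.
case class Attempt(numTasks: Int, durationSum: Long, durationMax: Long,
    durationMin: Long, durationAvg: Double) {
  def combine(other: Attempt): Attempt = Attempt(
    numTasks = numTasks + other.numTasks,                   // totals are summed
    durationSum = durationSum + other.durationSum,
    durationMax = math.max(durationMax, other.durationMax), // extrema keep max/min
    durationMin = math.min(durationMin, other.durationMin),
    durationAvg = (durationAvg + other.durationAvg) / 2     // unweighted mean of averages
  )
}

object CombineRules extends App {
  val failedAttempt = Attempt(numTasks = 4, durationSum = 400L,
    durationMax = 150L, durationMin = 50L, durationAvg = 100.0)
  val retriedAttempt = Attempt(numTasks = 4, durationSum = 800L,
    durationMax = 300L, durationMin = 100L, durationAvg = 200.0)
  println(failedAttempt.combine(retriedAttempt))
  // Attempt(8,1200,300,50,150.0)
}
```

Note that the `(a + b) / 2` average mirrors the committed code; in this example both attempts run the same number of tasks, so it coincides with the task-weighted mean.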

`core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala` (1 addition, 1 deletion)

```diff
@@ -369,7 +369,7 @@ abstract class EventProcessorBase[T <: AppBase](app: T) extends SparkListener wi
     // Parse task accumulables
     for (res <- event.taskInfo.accumulables) {
       try {
-        app.accumManager.addAccToTask(event.stageId, event.taskInfo.taskId, res)
+        app.accumManager.addAccToTask(event.stageId, res)
       } catch {
         case NonFatal(e) =>
           logWarning("Exception when parsing accumulables on task-completed "
```

`core/src/main/scala/org/apache/spark/sql/rapids/tool/store/AccumInfo.scala` (1 addition, 2 deletions)

```diff
@@ -84,10 +84,9 @@ class AccumInfo(val infoRef: AccumMetaRef) {
    * attempt information with give no Stats at accumulable level
    *
    * @param stageId The ID of the stage containing the task
-   * @param taskId The ID of the completed task
    * @param accumulableInfo Accumulator information from the TaskEnd event
    */
-  def addAccumToTask(stageId: Int, taskId: Long, accumulableInfo: AccumulableInfo): Unit = {
+  def addAccumToTask(stageId: Int, accumulableInfo: AccumulableInfo): Unit = {
     // 1. We first extract the incoming task update value
     // 2. Then allocate a new Statistic metric object with min,max as incoming update
     // 3. Use count to calculate rolling average
```

`core/src/main/scala/org/apache/spark/sql/rapids/tool/store/AccumManager.scala` (2 additions, 2 deletions)

```diff
@@ -40,9 +40,9 @@ class AccumManager {
     accumInfoRef.addAccumToStage(stageId, accumulableInfo)
   }
 
-  def addAccToTask(stageId: Int, taskId: Long, accumulableInfo: AccumulableInfo): Unit = {
+  def addAccToTask(stageId: Int, accumulableInfo: AccumulableInfo): Unit = {
     val accumInfoRef = getOrCreateAccumInfo(accumulableInfo.id, accumulableInfo.name)
-    accumInfoRef.addAccumToTask(stageId, taskId, accumulableInfo)
+    accumInfoRef.addAccumToTask(stageId, accumulableInfo)
   }
 
   def getAccStageIds(id: Long): Set[Int] = {
```
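To make the simplified call chain concrete, here is a minimal sketch of the manager-delegates-to-per-accumulator-store pattern, using hypothetical simplified types in place of `AccumulableInfo` and `AccumInfo`:

```scala
import scala.collection.mutable

// Hypothetical simplified stand-in for Spark's AccumulableInfo.
case class AccInfo(id: Long, name: Option[String], update: Option[Long])

// Hypothetical per-accumulator store; after this change, task updates are
// keyed by stage only, so no taskId needs to be threaded through.
class AccStore(val name: Option[String]) {
  private val stageTotals = mutable.HashMap.empty[Int, Long]
  def addToTask(stageId: Int, info: AccInfo): Unit =
    stageTotals(stageId) = stageTotals.getOrElse(stageId, 0L) + info.update.getOrElse(0L)
}

class Manager {
  private val stores = mutable.HashMap.empty[Long, AccStore]
  private def getOrCreate(id: Long, name: Option[String]): AccStore =
    stores.getOrElseUpdate(id, new AccStore(name))
  // Mirrors the two-argument addAccToTask signature introduced above.
  def addAccToTask(stageId: Int, info: AccInfo): Unit =
    getOrCreate(info.id, info.name).addToTask(stageId, info)
}
```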

`core/src/main/scala/org/apache/spark/sql/rapids/tool/store/StageModel.scala` (13 additions, 0 deletions)

```diff
@@ -69,10 +69,23 @@ class StageModel private(sInfo: StageInfo) {
     ProfileUtils.optionLongMinusOptionLong(stageInfo.completionTime, stageInfo.submissionTime)
   }
 
+  /**
+   * Returns true if this stage attempt has failed.
+   * There can be multiple attempts (retries) of a stage,
+   * and earlier attempts may fail before a later attempt succeeds.
+   *
+   * @return true if this stage attempt has failed.
+   */
   def hasFailed: Boolean = {
     stageInfo.failureReason.isDefined
   }
 
+  /**
+   * Returns the failure reason if the stage attempt has failed.
+   * A set failure reason is the definitive indicator of a failed stage attempt.
+   *
+   * @return the failure reason if the stage attempt has failed, or an empty string otherwise
+   */
   def getFailureReason: String = {
     stageInfo.failureReason.getOrElse("")
   }
```
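As a usage illustration of the two accessors documented above, here is a hypothetical sketch with a stand-in for the stage-attempt data (not the tool's actual StageModel):

```scala
// Hypothetical stand-in mirroring the documented accessors.
case class StageAttempt(stageId: Int, attemptNumber: Int, failureReason: Option[String]) {
  def hasFailed: Boolean = failureReason.isDefined
  def getFailureReason: String = failureReason.getOrElse("")
}

object FailedAttempts extends App {
  val attempts = Seq(
    StageAttempt(4, 0, Some("FetchFailed: executor lost")), // failed first attempt
    StageAttempt(4, 1, None)                                // successful retry
  )
  // Report every failed attempt; a later attempt of the same stage may still succeed.
  attempts.filter(_.hasFailed).foreach { a =>
    println(s"stage ${a.stageId} attempt ${a.attemptNumber} failed: ${a.getFailureReason}")
  }
}
```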
