stczwd commented on pull request #35185:
URL: https://github.com/apache/spark/pull/35185#issuecomment-1055414369


   >  I wanted to confirm whether we have tests where task index != partitionId 
- for example stage resubmission case.
   
   Yes, we have already tested this. Here is a simple example of stage 
recomputation with a FetchFailed error.
   
   ```
   {
     "status" : "FAILED",
     "stageId" : 39,
     "attemptId" : 1,
     "numTasks" : 407,
     "numActiveTasks" : 0,
     "numCompleteTasks" : 155,
     "numFailedTasks" : 77,
     "numKilledTasks" : 0,
     "numCompletedIndices" : 155,
     "executorRunTime" : 821532,
     "executorCpuTime" : 806253615510,
     "submissionTime" : "2022-03-01T12:36:32.018GMT",
     "firstTaskLaunchedTime" : "2022-03-01T12:36:32.028GMT",
     "completionTime" : "2022-03-01T12:37:33.643GMT",
     "failureReason" : "org.apache.spark.shuffle.MetadataFetchFailedException: 
Missing an output location for shuffle 1\n",
     "inputBytes" : 0,
     "inputRecords" : 0,
     "outputBytes" : 0,
     "outputRecords" : 0,
     "shuffleReadBytes" : 22927763615,
     "shuffleReadRecords" : 943893085,
     "shuffleWriteBytes" : 0,
     "shuffleWriteRecords" : 0,
     "memoryBytesSpilled" : 0,
     "diskBytesSpilled" : 0,
     "name" : "run at ThreadPoolExecutor.java:1149",
     "description" : "benchmark q24a-v2.4",
     "details" : 
"scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)\njava.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\njava.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\njava.lang.Thread.run(Thread.java:748)",
     "schedulingPool" : "default",
     "rddIds" : [ 93, 92, 91, 90, 89 ],
     "accumulatorUpdates" : [ ],
     "tasks" : {
       "14343" : {
         "taskId" : 14343,
         "index" : 65,
         "attempt" : 0,
         "partitionId" : 157,
         "launchTime" : "2022-03-01T12:36:32.031GMT",
         "duration" : 5475,
         "executorId" : "5",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14367" : {
         "taskId" : 14367,
         "index" : 89,
         "attempt" : 0,
         "partitionId" : 182,
         "launchTime" : "2022-03-01T12:36:37.415GMT",
         "duration" : 5322,
         "executorId" : "8",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14404" : {
         "taskId" : 14404,
         "index" : 126,
         "attempt" : 0,
         "partitionId" : 219,
         "launchTime" : "2022-03-01T12:36:37.900GMT",
         "duration" : 5246,
         "executorId" : "7",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14463" : {
         "taskId" : 14463,
         "index" : 185,
         "attempt" : 0,
         "partitionId" : 278,
         "launchTime" : "2022-03-01T12:36:42.840GMT",
         "duration" : 103059,
         "executorId" : "6",
         "host" : "",
         "status" : "FAILED",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "errorMessage" : "FetchFailed",
         "taskMetrics" : {
         }
       },
       "14305" : {
         "taskId" : 14305,
         "index" : 27,
         "attempt" : 0,
         "partitionId" : 98,
         "launchTime" : "2022-03-01T12:36:32.030GMT",
         "duration" : 4910,
         "executorId" : "10",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14358" : {
         "taskId" : 14358,
         "index" : 80,
         "attempt" : 0,
         "partitionId" : 173,
         "launchTime" : "2022-03-01T12:36:37.257GMT",
         "duration" : 5062,
         "executorId" : "9",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14419" : {
         "taskId" : 14419,
         "index" : 141,
         "attempt" : 0,
         "partitionId" : 234,
         "launchTime" : "2022-03-01T12:36:41.673GMT",
         "duration" : 4798,
         "executorId" : "10",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14399" : {
         "taskId" : 14399,
         "index" : 121,
         "attempt" : 0,
         "partitionId" : 214,
         "launchTime" : "2022-03-01T12:36:37.797GMT",
         "duration" : 5337,
         "executorId" : "7",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14296" : {
         "taskId" : 14296,
         "index" : 18,
         "attempt" : 0,
         "partitionId" : 83,
         "launchTime" : "2022-03-01T12:36:32.029GMT",
         "duration" : 4944,
         "executorId" : "10",
         "host" : "",
         "status" : "SUCCESS",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "taskMetrics" : {
         }
       },
       "14468" : {
         "taskId" : 14468,
         "index" : 190,
         "attempt" : 0,
         "partitionId" : 283,
         "launchTime" : "2022-03-01T12:36:43.134GMT",
         "duration" : 103319,
         "executorId" : "7",
         "host" : "",
         "status" : "FAILED",
         "taskLocality" : "PROCESS_LOCAL",
         "speculative" : false,
         "accumulatorUpdates" : [ ],
         "errorMessage" : "FetchFailed",
         "taskMetrics" : {
         }
       }
   ....
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to