[ https://issues.apache.org/jira/browse/MAPREDUCE-7474?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17838338#comment-17838338 ]
ASF GitHub Bot commented on MAPREDUCE-7474:
-------------------------------------------

steveloughran commented on code in PR #6716:
URL: https://github.com/apache/hadoop/pull/6716#discussion_r1569262867

##########
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java:
##########
@@ -142,64 +154,93 @@ protected Result executeStage(
     }
     Outcome outcome = null;
-    IOException exception;
+    IOException exception = null;
+    boolean baseDirDeleted = false;
     // to delete.
     LOG.info("{}: Deleting job directory {}", getName(), baseDir);
     if (args.deleteTaskAttemptDirsInParallel) {
-      // Attempt to do a parallel delete of task attempt dirs;
-      // don't overreact if a delete fails, but stop trying
-      // to delete the others, and fall back to deleting the
-      // job dir.
-      Path taskSubDir
-          = getStageConfig().getJobAttemptTaskSubDir();
-      try (DurationInfo info = new DurationInfo(LOG,
-          "parallel deletion of task attempts in %s",
-          taskSubDir)) {
-        RemoteIterator<FileStatus> dirs =
-            RemoteIterators.filteringRemoteIterator(
-                listStatusIterator(taskSubDir),
-                FileStatus::isDirectory);
-        TaskPool.foreach(dirs)
-            .executeWith(getIOProcessors())
-            .stopOnFailure()
-            .suppressExceptions(false)
-            .run(this::rmTaskAttemptDir);
-        getIOStatistics().aggregate((retrieveIOStatistics(dirs)));
-
-        if (getLastDeleteException() != null) {
-          // one of the task attempts failed.
-          throw getLastDeleteException();
+
+      // parallel delete of task attempt dirs.
+
+      if (args.parallelDeleteAttemptBaseDeleteFirst) {
+        // attempt to delete the base dir first.
+        // This can reduce ABFS delete load but may time out
+        // (which the fallback to parallel delete will handle).
+        // on GCS it is slow.
+        try (DurationInfo info = new DurationInfo(LOG, true,
+            "Initial delete of %s", baseDir)) {
+          exception = deleteOneDir(baseDir);
+          if (exception == null) {
+            // success: record this as the outcome, which
+            // will skip the parallel delete.
+            outcome = Outcome.DELETED;
+            baseDirDeleted = true;
+          } else {
+            // failure: log and continue
+            LOG.warn("{}: Exception on initial attempt at deleting base dir {}\n"
+                + "attempting parallel delete",
+                getName(), baseDir, exception);
+          }
+        }
+      }
+      if (!baseDirDeleted) {

Review Comment:
   it gets set on L180; will comment

> [ABFS] Improve commit resilience and performance in Manifest Committer
> ----------------------------------------------------------------------
>
> Key: MAPREDUCE-7474
> URL: https://issues.apache.org/jira/browse/MAPREDUCE-7474
> Project: Hadoop Map/Reduce
> Issue Type: Bug
> Components: client
> Affects Versions: 3.4.0, 3.3.6
> Reporter: Steve Loughran
> Assignee: Steve Loughran
> Priority: Major
> Labels: pull-request-available
>
> * Manifest committer is not resilient to rename failures on task commit
> without HADOOP-18012 rename recovery enabled.
> * large burst of delete calls noted: are they needed?
> relates to HADOOP-19093 but takes a more minimal approach with the goal of
> changes in manifest committer only.
> Initial proposed changes
> * retry recovery on task commit rename, always (repeat save, delete, rename)
> * audit delete use and see if it can be pruned

--
This message was sent by Atlassian Jira (v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: mapreduce-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: mapreduce-issues-h...@hadoop.apache.org