zhouyejoe commented on a change in pull request #33078:
URL: https://github.com/apache/spark/pull/33078#discussion_r663593359
##########
File path:
common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
##########
@@ -403,38 +394,78 @@ public MergeStatuses
finalizeShuffleMerge(FinalizeShuffleMerge msg) throws IOExc
reduceIds.add(partition.reduceId);
sizes.add(partition.getLastChunkOffset());
} catch (IOException ioe) {
- logger.warn("Exception while finalizing shuffle partition {} {}
{}", msg.appId,
- msg.shuffleId, partition.reduceId, ioe);
+ logger.warn("Exception while finalizing shuffle partition {}_{} {}
{}", msg.appId,
+ msg.attemptId, msg.shuffleId, partition.reduceId, ioe);
} finally {
partition.closeAllFiles();
- // The partition should be removed after the files are written so
that any new stream
- // for the same reduce partition will see that the data file
exists.
- partitionsIter.remove();
}
}
}
mergeStatuses = new MergeStatuses(msg.shuffleId,
bitmaps.toArray(new RoaringBitmap[bitmaps.size()]),
Ints.toArray(reduceIds),
Longs.toArray(sizes));
}
- partitions.remove(appShuffleId);
- logger.info("Finalized shuffle {} from Application {}.", msg.shuffleId,
msg.appId);
+ logger.info("Finalized shuffle {} from Application {}_{}.",
+ msg.shuffleId, msg.appId, msg.attemptId);
return mergeStatuses;
}
@Override
public void registerExecutor(String appId, ExecutorShuffleInfo executorInfo)
{
if (logger.isDebugEnabled()) {
logger.debug("register executor with RemoteBlockPushResolver {}
local-dirs {} "
- + "num sub-dirs {}", appId, Arrays.toString(executorInfo.localDirs),
- executorInfo.subDirsPerLocalDir);
+ + "num sub-dirs {} shuffleManager {}", appId,
Arrays.toString(executorInfo.localDirs),
+ executorInfo.subDirsPerLocalDir, executorInfo.shuffleManager);
+ }
+ String shuffleManagerMeta = executorInfo.shuffleManager;
+ if (shuffleManagerMeta.contains(":")) {
+ String mergeDirInfo =
shuffleManagerMeta.substring(shuffleManagerMeta.indexOf(":") + 1);
+ try {
+ ObjectMapper mapper = new ObjectMapper();
+ MergeDirectoryMeta mergeDirectoryMeta =
+ mapper.readValue(mergeDirInfo, MergeDirectoryMeta.class);
+ if (mergeDirectoryMeta.attemptId == ATTEMPT_ID_UNDEFINED) {
+ // When attemptId is -1, there is no attemptId stored in the
ExecutorShuffleInfo.
+ // Only the first ExecutorRegister message can register the merge
dirs
+ appsShuffleInfo.computeIfAbsent(appId, id ->
+ new AppShuffleInfo(
+ appId, mergeDirectoryMeta.attemptId,
+ new AppPathsInfo(appId, executorInfo.localDirs,
+ mergeDirectoryMeta.mergeDir, executorInfo.subDirsPerLocalDir)
+ ));
+ } else {
+ // If attemptId is not -1, there is attemptId stored in the
ExecutorShuffleInfo.
+ // The first ExecutorRegister message from the same application
attempt wil register
+ // the merge dirs in External Shuffle Service. Any later
ExecutorRegister message
+ // from the same application attempt will not override the merge
dirs. But it can
+ // be overridden by ExecutorRegister message from newer application
attempt,
+ // and former attempts' shuffle partitions information will also be
cleaned up.
+ ConcurrentMap<Integer, AppShuffleInfo> appShuffleInfoToBeCleanedUp =
+ Maps.newConcurrentMap();
+ appsShuffleInfo.compute(appId, (id, appShuffleInfo) -> {
+ if (appShuffleInfo == null || (appShuffleInfo != null
+ && mergeDirectoryMeta.attemptId > appShuffleInfo.attemptId)) {
+
appShuffleInfoToBeCleanedUp.putIfAbsent(appShuffleInfo.attemptId,
appShuffleInfo);
+ appShuffleInfo =
+ new AppShuffleInfo(
+ appId, mergeDirectoryMeta.attemptId,
+ new AppPathsInfo(appId, executorInfo.localDirs,
+ mergeDirectoryMeta.mergeDir,
executorInfo.subDirsPerLocalDir));
+ }
+ return appShuffleInfo;
+ });
+ for (AppShuffleInfo appShuffleInfo:
appShuffleInfoToBeCleanedUp.values()) {
+ logger.info("Remove shuffle info for {}_{} as new application
attempt registered",
+ appId, appShuffleInfo.attemptId);
+ appShuffleInfo.cleanupShufflePartitionInfo();
Review comment:
Updated to perform the cleanup asynchronously.
##########
File path:
common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
##########
@@ -403,38 +394,78 @@ public MergeStatuses
finalizeShuffleMerge(FinalizeShuffleMerge msg) throws IOExc
reduceIds.add(partition.reduceId);
sizes.add(partition.getLastChunkOffset());
} catch (IOException ioe) {
- logger.warn("Exception while finalizing shuffle partition {} {}
{}", msg.appId,
- msg.shuffleId, partition.reduceId, ioe);
+ logger.warn("Exception while finalizing shuffle partition {}_{} {}
{}", msg.appId,
+ msg.attemptId, msg.shuffleId, partition.reduceId, ioe);
} finally {
partition.closeAllFiles();
- // The partition should be removed after the files are written so
that any new stream
- // for the same reduce partition will see that the data file
exists.
- partitionsIter.remove();
}
}
}
mergeStatuses = new MergeStatuses(msg.shuffleId,
bitmaps.toArray(new RoaringBitmap[bitmaps.size()]),
Ints.toArray(reduceIds),
Longs.toArray(sizes));
}
- partitions.remove(appShuffleId);
- logger.info("Finalized shuffle {} from Application {}.", msg.shuffleId,
msg.appId);
+ logger.info("Finalized shuffle {} from Application {}_{}.",
+ msg.shuffleId, msg.appId, msg.attemptId);
return mergeStatuses;
}
@Override
public void registerExecutor(String appId, ExecutorShuffleInfo executorInfo)
{
if (logger.isDebugEnabled()) {
logger.debug("register executor with RemoteBlockPushResolver {}
local-dirs {} "
- + "num sub-dirs {}", appId, Arrays.toString(executorInfo.localDirs),
- executorInfo.subDirsPerLocalDir);
+ + "num sub-dirs {} shuffleManager {}", appId,
Arrays.toString(executorInfo.localDirs),
+ executorInfo.subDirsPerLocalDir, executorInfo.shuffleManager);
+ }
+ String shuffleManagerMeta = executorInfo.shuffleManager;
+ if (shuffleManagerMeta.contains(":")) {
+ String mergeDirInfo =
shuffleManagerMeta.substring(shuffleManagerMeta.indexOf(":") + 1);
+ try {
+ ObjectMapper mapper = new ObjectMapper();
+ MergeDirectoryMeta mergeDirectoryMeta =
+ mapper.readValue(mergeDirInfo, MergeDirectoryMeta.class);
+ if (mergeDirectoryMeta.attemptId == ATTEMPT_ID_UNDEFINED) {
+ // When attemptId is -1, there is no attemptId stored in the
ExecutorShuffleInfo.
+ // Only the first ExecutorRegister message can register the merge
dirs
+ appsShuffleInfo.computeIfAbsent(appId, id ->
+ new AppShuffleInfo(
+ appId, mergeDirectoryMeta.attemptId,
+ new AppPathsInfo(appId, executorInfo.localDirs,
+ mergeDirectoryMeta.mergeDir, executorInfo.subDirsPerLocalDir)
+ ));
+ } else {
+ // If attemptId is not -1, there is attemptId stored in the
ExecutorShuffleInfo.
+ // The first ExecutorRegister message from the same application
attempt wil register
+ // the merge dirs in External Shuffle Service. Any later
ExecutorRegister message
+ // from the same application attempt will not override the merge
dirs. But it can
+ // be overridden by ExecutorRegister message from newer application
attempt,
+ // and former attempts' shuffle partitions information will also be
cleaned up.
+ ConcurrentMap<Integer, AppShuffleInfo> appShuffleInfoToBeCleanedUp =
+ Maps.newConcurrentMap();
+ appsShuffleInfo.compute(appId, (id, appShuffleInfo) -> {
+ if (appShuffleInfo == null || (appShuffleInfo != null
+ && mergeDirectoryMeta.attemptId > appShuffleInfo.attemptId)) {
+
appShuffleInfoToBeCleanedUp.putIfAbsent(appShuffleInfo.attemptId,
appShuffleInfo);
+ appShuffleInfo =
+ new AppShuffleInfo(
+ appId, mergeDirectoryMeta.attemptId,
+ new AppPathsInfo(appId, executorInfo.localDirs,
+ mergeDirectoryMeta.mergeDir,
executorInfo.subDirsPerLocalDir));
+ }
+ return appShuffleInfo;
+ });
+ for (AppShuffleInfo appShuffleInfo:
appShuffleInfoToBeCleanedUp.values()) {
+ logger.info("Remove shuffle info for {}_{} as new application
attempt registered",
+ appId, appShuffleInfo.attemptId);
+ appShuffleInfo.cleanupShufflePartitionInfo();
Review comment:
A unit test (UT) still needs to be added.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]