zuston commented on code in PR #383:
URL: https://github.com/apache/incubator-uniffle/pull/383#discussion_r1038748521


##########
server/src/main/java/org/apache/uniffle/server/ShuffleFlushManager.java:
##########
@@ -142,87 +143,96 @@ public void addToFlushQueue(ShuffleDataFlushEvent event) {
     }
   }
 
-  private void flushToFile(ShuffleDataFlushEvent event) {
-
-    Storage storage = storageManager.selectStorage(event);
-    if (storage != null && !storage.canWrite()) {
-      addPendingEvents(event);
-      return;
-    }
-
+  private void flushToFileImpl(ShuffleDataFlushEvent event) {
     long start = System.currentTimeMillis();
-    List<ShufflePartitionedBlock> blocks = event.getShuffleBlocks();
     boolean writeSuccess = false;
-    try {
-      // storage info maybe null if the application cache was cleared already
-      if (storage != null) {
-        if (blocks == null || blocks.isEmpty()) {
-          LOG.info("There is no block to be flushed: " + event);
-        } else if (!event.isValid()) {
-          //  avoid printing error log
+
+    while (true) {
+      try {
+        if (!event.isValid()) {
           writeSuccess = true;
           LOG.warn("AppId {} was removed already, event {} should be dropped", 
event.getAppId(), event);
-        } else {
-          String user = StringUtils.defaultString(
-              
shuffleServer.getShuffleTaskManager().getUserByAppId(event.getAppId()),
-              StringUtils.EMPTY
-          );
-          CreateShuffleWriteHandlerRequest request = new 
CreateShuffleWriteHandlerRequest(
-                  storageType,
-                  event.getAppId(),
-                  event.getShuffleId(),
-                  event.getStartPartition(),
-                  event.getEndPartition(),
-                  storageBasePaths.toArray(new 
String[storageBasePaths.size()]),
-                  shuffleServerId,
-                  hadoopConf,
-                  storageDataReplica,
-                  user);
-          ShuffleWriteHandler handler = 
storage.getOrCreateWriteHandler(request);
-          do {
-            if (event.getRetryTimes() > retryMax) {
-              LOG.error("Failed to write data for " + event + " in " + 
retryMax + " times, shuffle data will be lost");
-              
ShuffleServerMetrics.incStorageFailedCounter(storage.getStorageHost());
-              break;
-            }
-            if (!event.isValid()) {
-              LOG.warn("AppId {} was removed already, event {} should be 
dropped, may leak one handler",
-                  event.getAppId(), event);
-              //  avoid printing error log
-              writeSuccess = true;
-              break;
-            }
+          break;
+        }
+
+        if (event.getRetryTimes() > retryMax) {
+          LOG.error("Failed to write data for " + event + " in " + retryMax + 
" times, shuffle data will be lost");
+          
ShuffleServerMetrics.incStorageFailedCounter(event.getUnderStorage().getStorageHost());
+          break;
+        }
 
-            writeSuccess = storageManager.write(storage, handler, event);
+        List<ShufflePartitionedBlock> blocks = event.getShuffleBlocks();
+        if (blocks == null || blocks.isEmpty()) {
+          LOG.info("There is no block to be flushed: " + event);
+          break;
+        }
 
-            if (writeSuccess) {
-              updateCommittedBlockIds(event.getAppId(), event.getShuffleId(), 
blocks);
-              
ShuffleServerMetrics.incStorageSuccessCounter(storage.getStorageHost());
+        Storage storage = storageManager.selectStorage(event);
+        if (storage == null) {
+          break;
+        }
+
+        if (!storage.canWrite()) {
+          if (storageManager instanceof MultiStorageManager) {
+            event.increaseRetryTimes();
+            
ShuffleServerMetrics.incStorageRetryCounter(storage.getStorageHost());
+            continue;
+          } else {
+            if (event.isPended()) {
+              // add metrics
               break;

Review Comment:
   If the event has been pushed into pending queue and re-enter the flush 
queue, when its disk has problems again, it will be re-pushed to pending queue, 
that will make the system too much pressure. We should drop these events 
directly. Right?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to