[GitHub] [hbase] sandeepvinayak commented on a change in pull request #2975: HBASE-25596: Fix NPE and avoid permanent unreplicated data due to EOF

GitBox Mon, 22 Feb 2021 14:40:49 -0800


sandeepvinayak commented on a change in pull request #2975:
URL: https://github.com/apache/hbase/pull/2975#discussion_r580644411




##########
File path: 
hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReaderThread.java
##########
@@ -133,71 +134,126 @@ public 
ReplicationSourceWALReaderThread(ReplicationSourceManager manager,
   @Override
   public void run() {
     int sleepMultiplier = 1;
-    while (isReaderRunning()) { // we only loop back here if something fatal 
happened to our stream
-      try (WALEntryStream entryStream =
-          new WALEntryStream(logQueue, fs, conf, lastReadPosition, metrics)) {
-        while (isReaderRunning()) { // loop here to keep reusing stream while 
we can
-          if (!source.isPeerEnabled()) {
-            Threads.sleep(sleepForRetries);
-            continue;
-          }
-          if (!checkQuota()) {
-            continue;
-          }
-          WALEntryBatch batch = new 
WALEntryBatch(replicationBatchCountCapacity);
-          boolean hasNext;
-          while ((hasNext = entryStream.hasNext()) == true) {
-            Entry entry = entryStream.next();
-            entry = filterEntry(entry);
-            if (entry != null) {
-              WALEdit edit = entry.getEdit();
-              if (edit != null && !edit.isEmpty()) {
-                long entrySize = getEntrySizeIncludeBulkLoad(entry);
-                long entrySizeExcludeBulkLoad = 
getEntrySizeExcludeBulkLoad(entry);
-                batch.addEntry(entry, entrySize);
-                updateBatchStats(batch, entry, entryStream.getPosition(), 
entrySize);
-                boolean totalBufferTooLarge = 
acquireBufferQuota(entrySizeExcludeBulkLoad);
-                // Stop if too many entries or too big
-                if (totalBufferTooLarge || batch.getHeapSize() >= 
replicationBatchSizeCapacity
+    WALEntryBatch batch = null;
+    WALEntryStream entryStream =
+      new WALEntryStream(logQueue, fs, conf, lastReadPosition, metrics);
+    try {
+      while (isReaderRunning()) { // we only loop back here if something fatal 
happens to stream
+        try {
+          entryStream = new WALEntryStream(logQueue, fs, conf, 
lastReadPosition, metrics);
+          while (isReaderRunning()) { // loop here to keep reusing stream 
while we can
+            if (!source.isPeerEnabled()) {
+              Threads.sleep(sleepForRetries);
+              continue;
+            }
+            if (!checkQuota()) {
+              continue;
+            }
+            batch = new WALEntryBatch(replicationBatchCountCapacity);
+            boolean hasNext = entryStream.hasNext();
+            while (hasNext) {
+              Entry entry = entryStream.next();
+              entry = filterEntry(entry);
+              if (entry != null) {
+                WALEdit edit = entry.getEdit();
+                if (edit != null && !edit.isEmpty()) {
+                  long entrySize = getEntrySizeIncludeBulkLoad(entry);
+                  long entrySizeExcludeBulkLoad = 
getEntrySizeExcludeBulkLoad(entry);
+                  batch.addEntry(entry, entrySize);
+                  updateBatchStats(batch, entry, entryStream.getPosition(), 
entrySize);
+                  boolean totalBufferTooLarge = 
acquireBufferQuota(entrySizeExcludeBulkLoad);
+                  // Stop if too many entries or too big
+                  if (totalBufferTooLarge || batch.getHeapSize() >= 
replicationBatchSizeCapacity
                     || batch.getNbEntries() >= replicationBatchCountCapacity) {
-                  break;
+                    break;
+                  }
                 }
               }
+              hasNext = entryStream.hasNext();
             }
-          }
 
-          updateBatch(entryStream, batch, hasNext);
-          if (isShippable(batch)) {
-            sleepMultiplier = 1;
-            entryBatchQueue.put(batch);
-            if (!batch.hasMoreEntries()) {
-              // we're done with queue recovery, shut ourselves down
-              setReaderRunning(false);
+            // If the batch has data to max capacity or stream doesn't have 
anything
+            // try to ship it
+            if (isBatchQueuedToBeShipped(entryStream, batch, hasNext, false)) {
+              sleepMultiplier = 1;
             }
+          }
+        } catch (IOException | WALEntryStreamRuntimeException e) { // stream 
related
+          if (handleEofException(e, entryStream, batch)) {
+            sleepMultiplier = 1;
           } else {
-            Thread.sleep(sleepForRetries);
+            if (sleepMultiplier < maxRetriesMultiplier) {
+              LOG.debug("Failed to read stream of replication entries: " + e);
+              sleepMultiplier++;
+            } else {
+              LOG.error("Failed to read stream of replication entries", e);
+            }
+            Threads.sleep(sleepForRetries * sleepMultiplier);
           }
-          resetStream(entryStream);
+        } catch (InterruptedException e) {
+          LOG.trace("Interrupted while sleeping between WAL reads");
+          Thread.currentThread().interrupt();
+        } finally {
+          entryStream.close();
         }
-      } catch (IOException | WALEntryStreamRuntimeException e) { // stream 
related
-        if (sleepMultiplier < maxRetriesMultiplier) {
-          LOG.debug("Failed to read stream of replication entries: " + e);
-          sleepMultiplier++;
-        } else {
-          LOG.error("Failed to read stream of replication entries", e);
-          handleEofException(e);
-        }
-        Threads.sleep(sleepForRetries * sleepMultiplier);
-      } catch (InterruptedException e) {
-        LOG.trace("Interrupted while sleeping between WAL reads");
-        Thread.currentThread().interrupt();
       }
+    } catch (IOException e) {

Review comment:
       Yes, ideally the inner `catch` would handle everything thrown by the 
inner `try`. However, the inner `catch` itself may now throw, because we try to 
add data to the batch from there, so we handle that case in the outer `catch`. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hbase] sandeepvinayak commented on a change in pull request #2975: HBASE-25596: Fix NPE and avoid permanent unreplicated data due to EOF

Reply via email to