This is an automated email from the ASF dual-hosted git repository.

hsaputra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 79768fe  ISSUE #2615: Fix for invalid ensemble issue during ledger 
recovery
79768fe is described below

commit 79768fee1c28d42f4883eb80c5ab6dd88e38a4f5
Author: Jack Vanlightly <[email protected]>
AuthorDate: Mon Apr 12 11:06:52 2021 -0700

    ISSUE #2615: Fix for invalid ensemble issue during ledger recovery
    
    Ensures that only entries of the current ensemble are included in the 
ledger recovery process, thus avoiding a ledger recovery failure scenario where 
it tries to append an ensemble with a lower first entry id than the prior 
ensemble.
    
    Descriptions of the changes in this PR:
    
    This PR includes a small change in the LedgerRecoveryOp that avoids a 
scenario where ledger recovery tries to create an invalid ensemble thereby 
failing. This could cause data unavailability for as long as trigger conditions 
last.
    
    During ledger recovery, only entries of the current ensemble can be 
included in the read and write back phase. Prior ensembles, if any, are 
immutable. But it is possible, in a multi-ensemble ledger, for the current 
ensemble to return an LAC of -1. This then causes the recovery to read entries 
from prior ensembles and write them back to the current ensemble. This does not 
cause any data loss, but it is wasteful of both space and time. The main issue 
is that if an ensemble change occurs [...]
    
    If a bookie of the current ensemble were to be down, then the ledger would 
be unrecoverable until it became available again.
    
    The solution is that the lowest safe LAC for recovery is: first entry id of 
current ensemble - 1.
    
    ### Changes
    
    Change to LedgerRecoveryOp as described above.
    New unit test in LedgerRecoveryTest2.
    
    Master Issue: #2615
    
    
    Reviewers: Andrey Yegorov, Enrico Olivelli <[email protected]>, Flavio 
Junqueira
    
    This closes #2654 from Vanlightly/fix-invalid-ensemble-change, closes #2615
---
 .../apache/bookkeeper/client/LedgerRecoveryOp.java |  19 +++-
 .../bookkeeper/client/LedgerRecovery2Test.java     | 102 +++++++++++++++++++--
 2 files changed, 114 insertions(+), 7 deletions(-)

diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerRecoveryOp.java
 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerRecoveryOp.java
index 534ea83..08a9f2e 100644
--- 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerRecoveryOp.java
+++ 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerRecoveryOp.java
@@ -104,7 +104,24 @@ class LedgerRecoveryOp implements ReadEntryListener, 
AddCallback {
                     public void readLastConfirmedDataComplete(int rc, 
RecoveryData data) {
                         if (rc == BKException.Code.OK) {
                             synchronized (lh) {
-                                lh.lastAddPushed = lh.lastAddConfirmed = 
data.getLastAddConfirmed();
+                                /**
+                                 The lowest an LAC can be for use in recovery 
is the first entry id
+                                 of the current ensemble - 1.
+                                 All ensembles prior to the current one, if 
any, are confirmed and
+                                 immutable (so are not part of the recovery 
process).
+                                 So we take the highest of:
+                                 - the LAC returned by the current bookie 
ensemble (could be -1)
+                                 - the first entry id of the current ensemble 
- 1.
+                                 */
+                                Long lastEnsembleEntryId = 
lh.getVersionedLedgerMetadata()
+                                        .getValue()
+                                        .getAllEnsembles()
+                                        .lastEntry()
+                                        .getKey();
+
+                                lh.lastAddPushed = lh.lastAddConfirmed = 
Math.max(data.getLastAddConfirmed(),
+                                        (lastEnsembleEntryId - 1));
+
                                 lh.length = data.getLength();
                                 lh.pendingAddsSequenceHead = 
lh.lastAddConfirmed;
                                 startEntryToRead = endEntryToRead = 
lh.lastAddConfirmed;
diff --git 
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/LedgerRecovery2Test.java
 
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/LedgerRecovery2Test.java
index 38e4879..95d93ef 100644
--- 
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/LedgerRecovery2Test.java
+++ 
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/LedgerRecovery2Test.java
@@ -59,14 +59,31 @@ public class LedgerRecovery2Test {
     private static final BookieId b5 = new BookieSocketAddress("b5", 
3181).toBookieId();
 
     private static Versioned<LedgerMetadata> setupLedger(ClientContext 
clientCtx, long ledgerId,
-                                              List<BookieId> bookies) throws 
Exception {
+                                                         List<BookieId> 
bookies) throws Exception {
         LedgerMetadata md = LedgerMetadataBuilder.create()
-            .withId(ledgerId)
-            .withPassword(PASSWD).withDigestType(DigestType.CRC32C)
-            .newEnsembleEntry(0, bookies).build();
-        return clientCtx.getLedgerManager().createLedgerMetadata(1L, md).get();
+                .withId(ledgerId)
+                .withPassword(PASSWD).withDigestType(DigestType.CRC32C)
+                .withWriteQuorumSize(bookies.size())
+                .newEnsembleEntry(0, bookies).build();
+        return clientCtx.getLedgerManager().createLedgerMetadata(ledgerId, 
md).get();
     }
 
+    private static Versioned<LedgerMetadata> setupLedger(ClientContext 
clientCtx, long ledgerId,
+                                                         List<BookieId> 
bookies,
+                                                         int ensembleSize,
+                                                         int writeQuorumSize,
+                                                         int ackQuorumSize) 
throws Exception {
+        LedgerMetadata md = LedgerMetadataBuilder.create()
+                .withId(ledgerId)
+                .withPassword(PASSWD).withDigestType(DigestType.CRC32C)
+                .withEnsembleSize(ensembleSize)
+                .withWriteQuorumSize(writeQuorumSize)
+                .withAckQuorumSize(ackQuorumSize)
+                .newEnsembleEntry(0, bookies).build();
+        return clientCtx.getLedgerManager().createLedgerMetadata(ledgerId, 
md).get();
+    }
+
+
     @Test
     public void testCantRecoverAllDown() throws Exception {
         MockClientContext clientCtx = MockClientContext.create();
@@ -491,4 +508,77 @@ public class LedgerRecovery2Test {
             reachedStepFuture.get();
         } catch (Exception e) {}
     }
-}
+
+
+    /*
+     * This test verifies that an IllegalStateException does not occur during 
recovery because of an attempt
+     * to create a new ensemble that has a lower first entry id than an 
existing ledger.
+     *
+     * To reproduce original issue, revert the fix and run this test.
+     * The fix was to apply max(LAC from current ensemble, (first entry of 
current ensemble - 1)) as the LAC
+     * of the recovery phase rather than accept a value of -1 that might be 
returned by the LAC reads.
+     */
+    @Test
+    public void testRecoveryWhenSecondEnsembleReturnsLacMinusOne() throws 
Exception {
+        MockClientContext clientCtx = MockClientContext.create();
+        clientCtx.getMockRegistrationClient().addBookies(b4).get();
+
+        // at least two non-empty ensembles required as else the first 
ensemble would
+        // only be replaced, thus avoiding the issue.
+
+        // initial state: 2 ensembles due to a write failure of e1 to b2
+        // ensemble 1
+        Versioned<LedgerMetadata> md = setupLedger(clientCtx, 1, 
Lists.newArrayList(b1, b2), 2, 2, 2);
+        clientCtx.getMockBookieClient().getMockBookies().seedEntries(b1, 1L, 
0L, -1L);
+        clientCtx.getMockBookieClient().getMockBookies().seedEntries(b2, 1L, 
0L, -1L);
+        clientCtx.getMockBookieClient().getMockBookies().seedEntries(b1, 1L, 
1L, -1L);
+        // write to b2 failed, causing ensemble change
+
+        // ensemble 2 - the write of e1 to b2 failed, so new ensemble with b3 
created
+        ClientUtil.transformMetadata(clientCtx, 1L,
+                (metadata) -> 
LedgerMetadataBuilder.from(metadata).newEnsembleEntry(1L, 
Lists.newArrayList(b1, b3))
+                        .build());
+        clientCtx.getMockBookieClient().getMockBookies().seedEntries(b3, 1L, 
1L, 0L);
+
+        ReadOnlyLedgerHandle lh = new ReadOnlyLedgerHandle(
+                clientCtx, 1L, md, BookKeeper.DigestType.CRC32C, PASSWD, 
false);
+
+        // however, any read or write to b3 fails, which will:
+        // 1. cause the LAC read to return -1 (b1 has -1)
+        // 2. cause an ensemble change during recovery write back phase
+        clientCtx.getMockBookieClient().setPreWriteHook(
+                (bookie, ledgerId, entryId) -> {
+                    if (bookie.equals(b3)) {
+                        return FutureUtils.exception(new 
BKException.BKWriteException());
+                    } else {
+                        return FutureUtils.value(null);
+                    }
+                });
+
+        clientCtx.getMockBookieClient().setPreReadHook(
+                (bookie, ledgerId, entryId) -> {
+                    if (bookie.equals(b3)) {
+                        return FutureUtils.exception(new 
BKException.BKTimeoutException());
+                    } else {
+                        return FutureUtils.value(null);
+                    }
+                });
+
+        // writer 2 starts recovery (the subject of this test)
+        // (either the writer failed or simply has not yet sent the pending 
writes to the new ensemble)
+        GenericCallbackFuture<Void> recoveryPromise = new 
GenericCallbackFuture<>();
+        lh.recover(recoveryPromise, null, false);
+        recoveryPromise.get();
+
+        // The recovery process is successfully able to complete recovery, 
with the expected ensembles.
+        Assert.assertEquals(lh.getLedgerMetadata().getAllEnsembles().size(), 
2);
+        
Assert.assertTrue(lh.getLedgerMetadata().getAllEnsembles().get(0L).contains(b1));
+        
Assert.assertTrue(lh.getLedgerMetadata().getAllEnsembles().get(0L).contains(b2));
+        
Assert.assertTrue(lh.getLedgerMetadata().getAllEnsembles().get(1L).contains(b1));
+        
Assert.assertTrue(lh.getLedgerMetadata().getAllEnsembles().get(1L).contains(b4));
+
+        // the ledger is closed with entry id 1
+        Assert.assertEquals(lh.getLastAddConfirmed(), 1L);
+        Assert.assertEquals(lh.getLedgerMetadata().getLastEntryId(), 1L);
+    }
+}
\ No newline at end of file

Reply via email to