This is an automated email from the ASF dual-hosted git repository.

zhaijia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 8e056d4  Issue #753: Allow option to disable data sync on journal
8e056d4 is described below

commit 8e056d4dd0262f5ed5245a8dc8abe608b737f6f9
Author: Matteo Merli <[email protected]>
AuthorDate: Thu Nov 23 16:26:18 2017 +0800

    Issue #753: Allow option to disable data sync on journal
    
    Context in #753
    
    For deployments where a RAID battery-backed cache is not available and 
where durability is not a requirements, we should allow bookies to rely on page 
cache and relax durability.
    
    Author: Matteo Merli <[email protected]>
    
    Reviewers: Enrico Olivelli <[email protected]>, Sijie Guo 
<[email protected]>
    
    This closes #754 from merlimat/option-to-disable-sync-master, closes #753
---
 bookkeeper-server/conf/bk_server.conf              |  8 +++
 .../java/org/apache/bookkeeper/bookie/Journal.java | 26 +++++++--
 .../bookkeeper/conf/ServerConfiguration.java       | 31 +++++++++++
 .../bookkeeper/bookie/BookieJournalNoSyncTest.java | 63 ++++++++++++++++++++++
 4 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/bookkeeper-server/conf/bk_server.conf 
b/bookkeeper-server/conf/bk_server.conf
index b4f37e4..7bdfa0d 100755
--- a/bookkeeper-server/conf/bk_server.conf
+++ b/bookkeeper-server/conf/bk_server.conf
@@ -296,6 +296,14 @@ journalDirectory=/tmp/bk-txn
 # Should we remove pages from page cache after force write
 # journalRemoveFromPageCache=false
 
+# Should the data be fsynced on journal before acknowledgment.
+# By default, data sync is enabled to guarantee durability of writes.
+# Beware: while disabling data sync in the Bookie journal might improve the 
bookie write performance, it will also
+# introduce the possibility of data loss. With no sync, the journal entries 
are written in the OS page cache but
+# not flushed to disk. In case of power failure, the affected bookie might 
lose the unflushed data. If the ledger
+# is replicated to multiple bookies, the chances of data loss are reduced 
though still present.
+# journalSyncData=true
+
 # Should we group journal force writes, which optimize group commit
 # for higher throughput
 # journalAdaptiveGroupWrites=true
diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
index 1f8aef4..1a3635c 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
@@ -499,6 +499,9 @@ public class Journal extends BookieCriticalThread 
implements CheckpointSource {
     // should we hint the filesystem to remove pages from cache after force 
write
     private final boolean removePagesFromCache;
 
+    // Should data be fsynced on disk before triggering the callback
+    private final boolean syncData;
+
     private final LastLogMark lastLogMark = new LastLogMark(0, 0);
 
     /**
@@ -543,6 +546,7 @@ public class Journal extends BookieCriticalThread 
implements CheckpointSource {
         this.maxJournalSize = conf.getMaxJournalSizeMB() * MB;
         this.journalPreAllocSize = conf.getJournalPreAllocSizeMB() * MB;
         this.journalWriteBufferSize = conf.getJournalWriteBufferSizeKB() * KB;
+        this.syncData = conf.getJournalSyncData();
         this.maxBackupJournals = conf.getMaxBackupJournals();
         this.forceWriteThread = new ForceWriteThread(this, 
conf.getJournalAdaptiveGroupWrites());
         this.maxGroupWaitInNanos = 
TimeUnit.MILLISECONDS.toNanos(conf.getJournalMaxGroupWaitMSec());
@@ -914,9 +918,25 @@ public class Journal extends BookieCriticalThread 
implements CheckpointSource {
                             
forceWriteBatchEntriesStats.registerSuccessfulValue(toFlush.size());
                             
forceWriteBatchBytesStats.registerSuccessfulValue(batchSize);
 
-                            forceWriteRequests.put(new 
ForceWriteRequest(logFile, logId, lastFlushPosition, toFlush,
-                                    (lastFlushPosition > maxJournalSize), 
false));
-                            toFlush = new LinkedList<QueueEntry>();
+                            boolean shouldRolloverJournal = (lastFlushPosition 
> maxJournalSize);
+                            if (syncData) {
+                                // Trigger data sync to disk in the 
"Force-Write" thread. Callback will be triggered after data is committed to disk
+                                forceWriteRequests.put(new 
ForceWriteRequest(logFile, logId, lastFlushPosition, toFlush, 
shouldRolloverJournal, false));
+                                toFlush = new LinkedList<QueueEntry>();
+                            } else {
+                                // Data is already written on the file (though 
it might still be in the OS page-cache)
+                                lastLogMark.setCurLogMark(logId, 
lastFlushPosition);
+                                for (int i = 0; i < toFlush.size(); i++) {
+                                    cbThreadPool.execute((QueueEntry) 
toFlush.get(i));
+                                }
+
+                                toFlush.clear();
+                                if (shouldRolloverJournal) {
+                                    forceWriteRequests.put(new 
ForceWriteRequest(logFile, logId, lastFlushPosition,
+                                            new LinkedList<>(), 
shouldRolloverJournal, false));
+                                }
+                            }
+
                             batchSize = 0L;
                             // check whether journal file is over file limit
                             if (bc.position() > maxJournalSize) {
diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
index ad7002b..b49969d 100644
--- 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
+++ 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
@@ -71,6 +71,7 @@ public class ServerConfiguration extends 
AbstractConfiguration {
     // Journal Parameters
     protected final static String MAX_JOURNAL_SIZE = "journalMaxSizeMB";
     protected final static String MAX_BACKUP_JOURNALS = "journalMaxBackups";
+    protected final static String JOURNAL_SYNC_DATA = "journalSyncData";
     protected final static String JOURNAL_ADAPTIVE_GROUP_WRITES = 
"journalAdaptiveGroupWrites";
     protected final static String JOURNAL_MAX_GROUP_WAIT_MSEC = 
"journalMaxGroupWaitMSec";
     protected final static String JOURNAL_BUFFERED_WRITES_THRESHOLD = 
"journalBufferedWritesThreshold";
@@ -1537,6 +1538,36 @@ public class ServerConfiguration extends 
AbstractConfiguration {
     }
 
     /**
+     * Should the data be fsynced on journal before acknowledgment
+     *
+     * Default is true
+     *
+     * @return
+     */
+    public boolean getJournalSyncData() {
+        return getBoolean(JOURNAL_SYNC_DATA, true);
+    }
+
+    /**
+     * Enable or disable journal syncs.
+     * <p>
+     * By default, data sync is enabled to guarantee durability of writes.
+     * <p>
+     * Beware: while disabling data sync in the Bookie journal might improve 
the bookie write performance, it will also
+     * introduce the possibility of data loss. With no sync, the journal 
entries are written in the OS page cache but
+     * not flushed to disk. In case of power failure, the affected bookie 
might lose the unflushed data. If the ledger
+     * is replicated to multiple bookies, the chances of data loss are reduced 
though still present.
+     *
+     * @param syncData
+     *            whether to sync data on disk before acknowledgement
+     * @return server configuration object
+     */
+    public ServerConfiguration setJournalSyncData(boolean syncData) {
+        setProperty(JOURNAL_SYNC_DATA, syncData);
+        return this;
+    }
+
+    /**
      * Should we group journal force writes
      *
      * @return group journal force writes
diff --git 
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/BookieJournalNoSyncTest.java
 
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/BookieJournalNoSyncTest.java
new file mode 100644
index 0000000..baa2d7d
--- /dev/null
+++ 
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/BookieJournalNoSyncTest.java
@@ -0,0 +1,63 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.bookkeeper.bookie;
+
+import java.util.Enumeration;
+
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.bookkeeper.client.LedgerEntry;
+import org.apache.bookkeeper.client.LedgerHandle;
+import org.apache.bookkeeper.test.BookKeeperClusterTestCase;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class BookieJournalNoSyncTest extends BookKeeperClusterTestCase {
+
+    public BookieJournalNoSyncTest() {
+        super(1);
+
+        baseConf.setJournalSyncData(false);
+    }
+
+    @Test
+    public void testWriteToJournal() throws Exception {
+        LedgerHandle lh = bkc.createLedger(1, 1, DigestType.CRC32, new 
byte[0]);
+
+        int N = 10;
+
+        long ledgerId = lh.getId();
+
+        for (int i = 0; i < N; i++) {
+            lh.addEntry(("entry-" + i).getBytes());
+        }
+
+        restartBookies();
+
+        LedgerHandle readLh = bkc.openLedger(ledgerId, DigestType.CRC32, new 
byte[0]);
+
+        Enumeration<LedgerEntry> entries = readLh.readEntries(0, N - 1);
+        for (int i = 0; i < N; i++) {
+            LedgerEntry entry = entries.nextElement();
+            Assert.assertEquals("entry-" + i, new String(entry.getEntry()));
+        }
+    }
+
+}

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to