This is an automated email from the ASF dual-hosted git repository.
zhaijia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git
The following commit(s) were added to refs/heads/master by this push:
new 8e056d4 Issue #753: Allow option to disable data sync on journal
8e056d4 is described below
commit 8e056d4dd0262f5ed5245a8dc8abe608b737f6f9
Author: Matteo Merli <[email protected]>
AuthorDate: Thu Nov 23 16:26:18 2017 +0800
Issue #753: Allow option to disable data sync on journal
Context in #753
For deployments where a RAID battery-backed cache is not available and
where durability is not a requirements, we should allow bookies to rely on page
cache and relax durability.
Author: Matteo Merli <[email protected]>
Reviewers: Enrico Olivelli <[email protected]>, Sijie Guo
<[email protected]>
This closes #754 from merlimat/option-to-disable-sync-master, closes #753
---
bookkeeper-server/conf/bk_server.conf | 8 +++
.../java/org/apache/bookkeeper/bookie/Journal.java | 26 +++++++--
.../bookkeeper/conf/ServerConfiguration.java | 31 +++++++++++
.../bookkeeper/bookie/BookieJournalNoSyncTest.java | 63 ++++++++++++++++++++++
4 files changed, 125 insertions(+), 3 deletions(-)
diff --git a/bookkeeper-server/conf/bk_server.conf
b/bookkeeper-server/conf/bk_server.conf
index b4f37e4..7bdfa0d 100755
--- a/bookkeeper-server/conf/bk_server.conf
+++ b/bookkeeper-server/conf/bk_server.conf
@@ -296,6 +296,14 @@ journalDirectory=/tmp/bk-txn
# Should we remove pages from page cache after force write
# journalRemoveFromPageCache=false
+# Should the data be fsynced on journal before acknowledgment.
+# By default, data sync is enabled to guarantee durability of writes.
+# Beware: while disabling data sync in the Bookie journal might improve the
bookie write performance, it will also
+# introduce the possibility of data loss. With no sync, the journal entries
are written in the OS page cache but
+# not flushed to disk. In case of power failure, the affected bookie might
lose the unflushed data. If the ledger
+# is replicated to multiple bookies, the chances of data loss are reduced
though still present.
+# journalSyncData=true
+
# Should we group journal force writes, which optimize group commit
# for higher throughput
# journalAdaptiveGroupWrites=true
diff --git
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
index 1f8aef4..1a3635c 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/Journal.java
@@ -499,6 +499,9 @@ public class Journal extends BookieCriticalThread
implements CheckpointSource {
// should we hint the filesystem to remove pages from cache after force
write
private final boolean removePagesFromCache;
+ // Should data be fsynced on disk before triggering the callback
+ private final boolean syncData;
+
private final LastLogMark lastLogMark = new LastLogMark(0, 0);
/**
@@ -543,6 +546,7 @@ public class Journal extends BookieCriticalThread
implements CheckpointSource {
this.maxJournalSize = conf.getMaxJournalSizeMB() * MB;
this.journalPreAllocSize = conf.getJournalPreAllocSizeMB() * MB;
this.journalWriteBufferSize = conf.getJournalWriteBufferSizeKB() * KB;
+ this.syncData = conf.getJournalSyncData();
this.maxBackupJournals = conf.getMaxBackupJournals();
this.forceWriteThread = new ForceWriteThread(this,
conf.getJournalAdaptiveGroupWrites());
this.maxGroupWaitInNanos =
TimeUnit.MILLISECONDS.toNanos(conf.getJournalMaxGroupWaitMSec());
@@ -914,9 +918,25 @@ public class Journal extends BookieCriticalThread
implements CheckpointSource {
forceWriteBatchEntriesStats.registerSuccessfulValue(toFlush.size());
forceWriteBatchBytesStats.registerSuccessfulValue(batchSize);
- forceWriteRequests.put(new
ForceWriteRequest(logFile, logId, lastFlushPosition, toFlush,
- (lastFlushPosition > maxJournalSize),
false));
- toFlush = new LinkedList<QueueEntry>();
+ boolean shouldRolloverJournal = (lastFlushPosition
> maxJournalSize);
+ if (syncData) {
+ // Trigger data sync to disk in the
"Force-Write" thread. Callback will be triggered after data is committed to disk
+ forceWriteRequests.put(new
ForceWriteRequest(logFile, logId, lastFlushPosition, toFlush,
shouldRolloverJournal, false));
+ toFlush = new LinkedList<QueueEntry>();
+ } else {
+ // Data is already written on the file (though
it might still be in the OS page-cache)
+ lastLogMark.setCurLogMark(logId,
lastFlushPosition);
+ for (int i = 0; i < toFlush.size(); i++) {
+ cbThreadPool.execute((QueueEntry)
toFlush.get(i));
+ }
+
+ toFlush.clear();
+ if (shouldRolloverJournal) {
+ forceWriteRequests.put(new
ForceWriteRequest(logFile, logId, lastFlushPosition,
+ new LinkedList<>(),
shouldRolloverJournal, false));
+ }
+ }
+
batchSize = 0L;
// check whether journal file is over file limit
if (bc.position() > maxJournalSize) {
diff --git
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
index ad7002b..b49969d 100644
---
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
+++
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
@@ -71,6 +71,7 @@ public class ServerConfiguration extends
AbstractConfiguration {
// Journal Parameters
protected final static String MAX_JOURNAL_SIZE = "journalMaxSizeMB";
protected final static String MAX_BACKUP_JOURNALS = "journalMaxBackups";
+ protected final static String JOURNAL_SYNC_DATA = "journalSyncData";
protected final static String JOURNAL_ADAPTIVE_GROUP_WRITES =
"journalAdaptiveGroupWrites";
protected final static String JOURNAL_MAX_GROUP_WAIT_MSEC =
"journalMaxGroupWaitMSec";
protected final static String JOURNAL_BUFFERED_WRITES_THRESHOLD =
"journalBufferedWritesThreshold";
@@ -1537,6 +1538,36 @@ public class ServerConfiguration extends
AbstractConfiguration {
}
/**
+ * Should the data be fsynced on journal before acknowledgment
+ *
+ * Default is true
+ *
+ * @return
+ */
+ public boolean getJournalSyncData() {
+ return getBoolean(JOURNAL_SYNC_DATA, true);
+ }
+
+ /**
+ * Enable or disable journal syncs.
+ * <p>
+ * By default, data sync is enabled to guarantee durability of writes.
+ * <p>
+ * Beware: while disabling data sync in the Bookie journal might improve
the bookie write performance, it will also
+ * introduce the possibility of data loss. With no sync, the journal
entries are written in the OS page cache but
+ * not flushed to disk. In case of power failure, the affected bookie
might lose the unflushed data. If the ledger
+ * is replicated to multiple bookies, the chances of data loss are reduced
though still present.
+ *
+ * @param syncData
+ * whether to sync data on disk before acknowledgement
+ * @return server configuration object
+ */
+ public ServerConfiguration setJournalSyncData(boolean syncData) {
+ setProperty(JOURNAL_SYNC_DATA, syncData);
+ return this;
+ }
+
+ /**
* Should we group journal force writes
*
* @return group journal force writes
diff --git
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/BookieJournalNoSyncTest.java
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/BookieJournalNoSyncTest.java
new file mode 100644
index 0000000..baa2d7d
--- /dev/null
+++
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/BookieJournalNoSyncTest.java
@@ -0,0 +1,63 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.bookkeeper.bookie;
+
+import java.util.Enumeration;
+
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.bookkeeper.client.LedgerEntry;
+import org.apache.bookkeeper.client.LedgerHandle;
+import org.apache.bookkeeper.test.BookKeeperClusterTestCase;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class BookieJournalNoSyncTest extends BookKeeperClusterTestCase {
+
+ public BookieJournalNoSyncTest() {
+ super(1);
+
+ baseConf.setJournalSyncData(false);
+ }
+
+ @Test
+ public void testWriteToJournal() throws Exception {
+ LedgerHandle lh = bkc.createLedger(1, 1, DigestType.CRC32, new
byte[0]);
+
+ int N = 10;
+
+ long ledgerId = lh.getId();
+
+ for (int i = 0; i < N; i++) {
+ lh.addEntry(("entry-" + i).getBytes());
+ }
+
+ restartBookies();
+
+ LedgerHandle readLh = bkc.openLedger(ledgerId, DigestType.CRC32, new
byte[0]);
+
+ Enumeration<LedgerEntry> entries = readLh.readEntries(0, N - 1);
+ for (int i = 0; i < N; i++) {
+ LedgerEntry entry = entries.nextElement();
+ Assert.assertEquals("entry-" + i, new String(entry.getEntry()));
+ }
+ }
+
+}
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].