[
https://issues.apache.org/jira/browse/CASSANDRA-2463?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
C. Scott Andreas updated CASSANDRA-2463:
----------------------------------------
Comment: was deleted
(was: diff --git a/src/java/org/apache/cassandra/db/BinaryMemtable.java
b/src/java/org/apache/cassandra/db/BinaryMemtable.java
index 4b4e2ff..14665ad 100644
--- a/src/java/org/apache/cassandra/db/BinaryMemtable.java
+++ b/src/java/org/apache/cassandra/db/BinaryMemtable.java
@@ -125,7 +125,7 @@ public class BinaryMemtable implements IFlushable
private SSTableReader writeSortedContents(List<DecoratedKey> sortedKeys)
throws IOException
{
logger.info("Writing " + this);
- SSTableWriter writer = cfs.createFlushWriter(sortedKeys.size());
+ SSTableWriter writer = cfs.createFlushWriter(sortedKeys.size(),
currentSize.get());
for (DecoratedKey key : sortedKeys)
{
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 8ff9f82..14e984b 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -2183,9 +2183,9 @@ public class ColumnFamilyStore implements
ColumnFamilyStoreMBean
}
}
- public SSTableWriter createFlushWriter(long estimatedRows) throws
IOException
+ public SSTableWriter createFlushWriter(long estimatedRows, long
estimatedSize) throws IOException
{
- return new SSTableWriter(getFlushPath(), estimatedRows, metadata,
partitioner);
+ return new SSTableWriter(getFlushPath(), estimatedRows, estimatedSize,
metadata, partitioner);
}
public SSTableWriter createCompactionWriter(long estimatedRows, String
location) throws IOException
diff --git a/src/java/org/apache/cassandra/db/Memtable.java
b/src/java/org/apache/cassandra/db/Memtable.java
index db65f01..3acb7a9 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java
@@ -155,7 +155,7 @@ public class Memtable implements Comparable<Memtable>,
IFlushable
private SSTableReader writeSortedContents() throws IOException
{
logger.info("Writing " + this);
- SSTableWriter writer = cfs.createFlushWriter(columnFamilies.size());
+ SSTableWriter writer = cfs.createFlushWriter(columnFamilies.size(),
currentThroughput.get());
for (Map.Entry<DecoratedKey, ColumnFamily> entry :
columnFamilies.entrySet())
writer.append(entry.getKey(), entry.getValue());
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
index 809a3f4..d05542f 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
@@ -25,6 +25,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
+import java.lang.Math;
import com.google.common.collect.Sets;
@@ -65,7 +66,7 @@ public class SSTableWriter extends SSTable
this(filename, keyCount,
DatabaseDescriptor.getCFMetaData(Descriptor.fromFilename(filename)),
StorageService.getPartitioner());
}
- public SSTableWriter(String filename, long keyCount, CFMetaData metadata,
IPartitioner partitioner) throws IOException
+ public SSTableWriter(String filename, long keyCount, long bufferSize,
CFMetaData metadata, IPartitioner partitioner) throws IOException
{
super(Descriptor.fromFilename(filename),
new HashSet<Component>(Arrays.asList(Component.DATA,
Component.FILTER, Component.PRIMARY_INDEX, Component.STATS)),
@@ -75,7 +76,17 @@ public class SSTableWriter extends SSTable
SSTable.defaultColumnHistogram());
iwriter = new IndexWriter(descriptor, partitioner, keyCount);
dbuilder =
SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
- dataFile = new BufferedRandomAccessFile(new File(getFilename()), "rw",
DatabaseDescriptor.getInMemoryCompactionLimit(), true);
+
+ if (bufferSize == 0)
+ bufferSize = BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE;
+ else
+ bufferSize = Math.min(bufferSize,
BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE);
+
+ dataFile = new BufferedRandomAccessFile(new File(getFilename()), "rw",
(int) bufferSize, true);
+ }
+
+ public SSTableWriter(String filename, long keyCount, CFMetaData metadata,
IPartitioner partitioner) throws IOException {
+ this(filename, keyCount, BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE,
metadata, partitioner);
}
public void mark()
)
> Flush and Compaction Unnecessarily Allocate 256MB Contiguous Buffers
> --------------------------------------------------------------------
>
> Key: CASSANDRA-2463
> URL: https://issues.apache.org/jira/browse/CASSANDRA-2463
> Project: Cassandra
> Issue Type: Bug
> Components: Core
> Affects Versions: 0.7.4
> Environment: Any
> Reporter: C. Scott Andreas
> Labels: patch
> Fix For: 0.7.4
>
> Original Estimate: 72h
> Remaining Estimate: 72h
>
> Currently, Cassandra 0.7.x allocates a 256MB contiguous byte array at the
> beginning of a memtable flush or compaction (presently hard-coded as
> Config.in_memory_compaction_limit_in_mb). When several memtable flushes are
> triggered at once (as by `nodetool flush` or `nodetool snapshot`), the
> tenured generation will typically experience extreme pressure as it attempts
> to locate [n] contiguous 256mb chunks of heap to allocate. This will often
> trigger a promotion failure, resulting in a stop-the-world GC until the
> allocation can be made. (Note that in the case of the "release valve" being
> triggered, the problem is even further exacerbated; the release valve will
> ironically trigger two contiguous 256MB allocations when attempting to flush
> the two largest memtables).
> This patch sets the buffer to be used by BufferedRandomAccessFile to
> Math.min(bytesToWrite, BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE) rather
> than a hard-coded 256MB. The typical resulting buffer size is 64kb.
> I've taken some time to measure the impact of this change on the base 0.7.4
> release and with this patch applied. This test involved launching Cassandra,
> performing four million writes across three column families from three
> clients, and monitoring heap usage and garbage collections. Cassandra was
> launched with 2GB of heap and the default JVM options shipped with the
> project. This configuration has 7 column families with a total of 15GB of
> data.
> Here's the base 0.7.4 release:
> http://cl.ly/413g2K06121z252e2t10
> Note that on launch, we see a flush + compaction triggered almost
> immediately, resulting in at least 7x very quick 256MB allocations maxing out
> the heap, resulting in a promotion failure and a full GC. As flushes
> proceed, we see that most of these have a corresponding CMS, consistent with
> the pattern of a large allocation and immediate collection. We see a second
> promotion failure and full GC at the 75% mark as the allocations cannot be
> satisfied without a collection, along with several CMSs in between. In the
> failure cases, the allocation requests occur so quickly that a standard CMS
> phase cannot be completed before a ParNew attempts to promote the surviving byte
> array into the tenured generation. The heap usage and GC profile of this
> graph is very unhealthy.
> Here's the 0.7.4 release with this patch applied:
> http://cl.ly/050I1g26401B1X0w3s1f
> This graph is very different. At launch, rather than an immediate spike to
> full allocation and a promotion failure, we see a slow allocation slope
> reaching only 1/8th of total heap size. As writes begin, we see several
> flushes and compactions, but none result in immediate, large allocations. The
> ParNew collector keeps up with collections far more ably, resulting in only
> one healthy CMS collection with no promotion failure. Unlike the unhealthy
> rapid allocation and massive collection pattern we see in the first graph,
> this graph depicts a healthy sawtooth pattern of ParNews and an occasional
> effective CMS with no danger of heap fragmentation resulting in a promotion
> failure.
> The bottom line is that there's no need to allocate a hard-coded 256MB write
> buffer for flushing memtables and compactions to disk. Doing so results in
> unhealthy rapid allocation patterns and increases the probability of
> triggering promotion failures and full stop-the-world GCs which can cause
> nodes to become unresponsive and shunned from the ring during flushes and
> compactions.
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira