[ 
https://issues.apache.org/jira/browse/CASSANDRA-2463?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

C. Scott Andreas updated CASSANDRA-2463:
----------------------------------------

    Comment: was deleted

(was: diff --git a/src/java/org/apache/cassandra/db/BinaryMemtable.java 
b/src/java/org/apache/cassandra/db/BinaryMemtable.java
index 4b4e2ff..14665ad 100644
--- a/src/java/org/apache/cassandra/db/BinaryMemtable.java
+++ b/src/java/org/apache/cassandra/db/BinaryMemtable.java
@@ -125,7 +125,7 @@ public class BinaryMemtable implements IFlushable
     private SSTableReader writeSortedContents(List<DecoratedKey> sortedKeys) 
throws IOException
     {
         logger.info("Writing " + this);
-        SSTableWriter writer = cfs.createFlushWriter(sortedKeys.size());
+        SSTableWriter writer = cfs.createFlushWriter(sortedKeys.size(), 
currentSize.get());
 
         for (DecoratedKey key : sortedKeys)
         {
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java 
b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 8ff9f82..14e984b 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -2183,9 +2183,9 @@ public class ColumnFamilyStore implements 
ColumnFamilyStoreMBean
         }
     }
 
-    public SSTableWriter createFlushWriter(long estimatedRows) throws 
IOException
+    public SSTableWriter createFlushWriter(long estimatedRows, long 
estimatedSize) throws IOException
     {
-        return new SSTableWriter(getFlushPath(), estimatedRows, metadata, 
partitioner);
+        return new SSTableWriter(getFlushPath(), estimatedRows, estimatedSize, 
metadata, partitioner);
     }
 
     public SSTableWriter createCompactionWriter(long estimatedRows, String 
location) throws IOException
diff --git a/src/java/org/apache/cassandra/db/Memtable.java 
b/src/java/org/apache/cassandra/db/Memtable.java
index db65f01..3acb7a9 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java
@@ -155,7 +155,7 @@ public class Memtable implements Comparable<Memtable>, 
IFlushable
     private SSTableReader writeSortedContents() throws IOException
     {
         logger.info("Writing " + this);
-        SSTableWriter writer = cfs.createFlushWriter(columnFamilies.size());
+        SSTableWriter writer = cfs.createFlushWriter(columnFamilies.size(), 
currentThroughput.get());
 
         for (Map.Entry<DecoratedKey, ColumnFamily> entry : 
columnFamilies.entrySet())
             writer.append(entry.getKey(), entry.getValue());
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java 
b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
index 809a3f4..d05542f 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
@@ -25,6 +25,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
+import java.lang.Math;
 
 import com.google.common.collect.Sets;
 
@@ -65,7 +66,7 @@ public class SSTableWriter extends SSTable
         this(filename, keyCount, 
DatabaseDescriptor.getCFMetaData(Descriptor.fromFilename(filename)), 
StorageService.getPartitioner());
     }
 
-    public SSTableWriter(String filename, long keyCount, CFMetaData metadata, 
IPartitioner partitioner) throws IOException
+    public SSTableWriter(String filename, long keyCount, long bufferSize, 
CFMetaData metadata, IPartitioner partitioner) throws IOException
     {
         super(Descriptor.fromFilename(filename),
               new HashSet<Component>(Arrays.asList(Component.DATA, 
Component.FILTER, Component.PRIMARY_INDEX, Component.STATS)),
@@ -75,7 +76,17 @@ public class SSTableWriter extends SSTable
               SSTable.defaultColumnHistogram());
         iwriter = new IndexWriter(descriptor, partitioner, keyCount);
         dbuilder = 
SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
-        dataFile = new BufferedRandomAccessFile(new File(getFilename()), "rw", 
DatabaseDescriptor.getInMemoryCompactionLimit(), true);
+
+        if (bufferSize == 0)
+            bufferSize = BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE;
+        else
+            bufferSize = Math.min(bufferSize, 
BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE);
+
+        dataFile = new BufferedRandomAccessFile(new File(getFilename()), "rw", 
(int) bufferSize, true);
+    }
+
+    public SSTableWriter(String filename, long keyCount, CFMetaData metadata, 
IPartitioner partitioner) throws IOException {
+        this(filename, keyCount, BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE, 
metadata, partitioner);
     }
     
     public void mark()
)

> Flush and Compaction Unnecessarily Allocate 256MB Contiguous Buffers
> --------------------------------------------------------------------
>
>                 Key: CASSANDRA-2463
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-2463
>             Project: Cassandra
>          Issue Type: Bug
>          Components: Core
>    Affects Versions: 0.7.4
>         Environment: Any
>            Reporter: C. Scott Andreas
>              Labels: patch
>             Fix For: 0.7.4
>
>   Original Estimate: 72h
>  Remaining Estimate: 72h
>
> Currently, Cassandra 0.7.x allocates a 256MB contiguous byte array at the 
> beginning of a memtable flush or compaction (presently hard-coded as 
> Config.in_memory_compaction_limit_in_mb). When several memtable flushes are 
> triggered at once (as by `nodetool flush` or `nodetool snapshot`), the 
> tenured generation will typically experience extreme pressure as it attempts 
> to locate [n] contiguous 256mb chunks of heap to allocate. This will often 
> trigger a promotion failure, resulting in a stop-the-world GC until the 
> allocation can be made. (Note that in the case of the "release valve" being 
> triggered, the problem is even further exacerbated; the release valve will 
> ironically trigger two contiguous 256MB allocations when attempting to flush 
> the two largest memtables).
> This patch sets the buffer to be used by BufferedRandomAccessFile to 
> Math.min(bytesToWrite, BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE) rather 
> than a hard-coded 256MB. The typical resulting buffer size is 64kb.
> I've taken some time to measure the impact of this change on the base 0.7.4 
> release and with this patch applied. This test involved launching Cassandra, 
> performing four million writes across three column families from three 
> clients, and monitoring heap usage and garbage collections. Cassandra was 
> launched with 2GB of heap and the default JVM options shipped with the 
> project. This configuration has 7 column families with a total of 15GB of 
> data.
> Here's the base 0.7.4 release:
> http://cl.ly/413g2K06121z252e2t10
> Note that on launch, we see a flush + compaction triggered almost 
> immediately, resulting in at least 7x very quick 256MB allocations maxing out 
> the heap, resulting in a promotion failure and a full GC. As flushes 
> proceed, we see that most of these have a corresponding CMS, consistent with 
> the pattern of a large allocation and immediate collection. We see a second 
> promotion failure and full GC at the 75% mark as the allocations cannot be 
> satisfied without a collection, along with several CMSs in between. In the 
> failure cases, the allocation requests occur so quickly that a standard CMS 
> phase cannot complete before a ParNew attempts to promote the surviving byte 
> array into the tenured generation. The heap usage and GC profile of this 
> graph is very unhealthy.
> Here's the 0.7.4 release with this patch applied:
> http://cl.ly/050I1g26401B1X0w3s1f
> This graph is very different. At launch, rather than an immediate spike to 
> full allocation and a promotion failure, we see a slow allocation slope 
> reaching only 1/8th of total heap size. As writes begin, we see several 
> flushes and compactions, but none result in immediate, large allocations. The 
> ParNew collector keeps up with collections far more ably, resulting in only 
> one healthy CMS collection with no promotion failure. Unlike the unhealthy 
> rapid allocation and massive collection pattern we see in the first graph, 
> this graph depicts a healthy sawtooth pattern of ParNews and an occasional 
> effective CMS with no danger of heap fragmentation resulting in a promotion 
> failure.
> The bottom line is that there's no need to allocate a hard-coded 256MB write 
> buffer for flushing memtables and compactions to disk. Doing so results in 
> unhealthy rapid allocation patterns and increases the probability of 
> triggering promotion failures and full stop-the-world GCs which can cause 
> nodes to become unresponsive and shunned from the ring during flushes and 
> compactions.

--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to