Author: jbellis
Date: Mon Dec 26 05:21:06 2011
New Revision: 1224679

URL: http://svn.apache.org/viewvc?rev=1224679&view=rev
Log:
Optimize key count estimation when opening sstable on startup
patch by Melvin Wang and jbellis; reviewed by slebresne for CASSANDRA-2988

Modified:
    cassandra/trunk/CHANGES.txt
    cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTable.java
    cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
    cassandra/trunk/src/java/org/apache/cassandra/utils/EstimatedHistogram.java

Modified: cassandra/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/CHANGES.txt?rev=1224679&r1=1224678&r2=1224679&view=diff
==============================================================================
--- cassandra/trunk/CHANGES.txt (original)
+++ cassandra/trunk/CHANGES.txt Mon Dec 26 05:21:06 2011
@@ -1,4 +1,6 @@
 1.1-dev
+ * Optimize key count estimation when opening sstable on startup
+   (CASSANDRA-2988)
  * multi-dc replication optimization supporting CL > ONE (CASSANDRA-3577)
  * add command to stop compactions (CASSANDRA-1740, 3566, 3582)
  * multithreaded streaming (CASSANDRA-3494)

Modified: cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTable.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTable.java?rev=1224679&r1=1224678&r2=1224679&view=diff
==============================================================================
--- cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTable.java 
(original)
+++ cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTable.java Mon 
Dec 26 05:21:06 2011
@@ -207,25 +207,6 @@ public abstract class SSTable
         return components;
     }
 
-    /** @return An estimate of the number of keys contained in the given data file. */
-    static long estimateRowsFromData(Descriptor desc, RandomAccessReader dfile) throws IOException
-    {
-        // collect sizes for the first 1000 keys, or first 100 megabytes of data
-        final int SAMPLES_CAP = 1000, BYTES_CAP = (int)Math.min(100000000, dfile.length());
-        int keys = 0;
-        long dataPosition = 0;
-        while (dataPosition < BYTES_CAP && keys < SAMPLES_CAP)
-        {
-            dfile.seek(dataPosition);
-            ByteBufferUtil.skipShortLength(dfile);
-            long dataSize = SSTableReader.readRowSize(dfile, desc);
-            dataPosition = dfile.getFilePointer() + dataSize;
-            keys++;
-        }
-        dfile.seek(0);
-        return dfile.length() / (dataPosition / keys);
-    }
-
     /** @return An estimate of the number of keys contained in the given index file. */
     static long estimateRowsFromIndex(RandomAccessReader ifile) throws IOException
     {

Modified: 
cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTableReader.java?rev=1224679&r1=1224678&r2=1224679&view=diff
==============================================================================
--- cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTableReader.java 
(original)
+++ cassandra/trunk/src/java/org/apache/cassandra/io/sstable/SSTableReader.java 
Mon Dec 26 05:21:06 2011
@@ -335,11 +335,14 @@ public class SSTableReader extends SSTab
         try
         {
             long indexSize = input.length();
-            long estimatedKeys = SSTable.estimateRowsFromIndex(input);
+            long histogramCount = sstableMetadata.estimatedRowSize.count();
+            long estimatedKeys = histogramCount > 0 && !sstableMetadata.estimatedRowSize.isOverflowed()
+                               ? histogramCount
+                               : SSTable.estimateRowsFromIndex(input); // statistics is supposed to be optional
             indexSummary = new IndexSummary(estimatedKeys);
             if (recreatebloom)
-                // estimate key count based on index length
                 bf = LegacyBloomFilter.getFilter(estimatedKeys, 15);
+
             while (true)
             {
                 long indexPosition = input.getFilePointer();

Modified: 
cassandra/trunk/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/utils/EstimatedHistogram.java?rev=1224679&r1=1224678&r2=1224679&view=diff
==============================================================================
--- cassandra/trunk/src/java/org/apache/cassandra/utils/EstimatedHistogram.java 
(original)
+++ cassandra/trunk/src/java/org/apache/cassandra/utils/EstimatedHistogram.java 
Mon Dec 26 05:21:06 2011
@@ -183,6 +183,17 @@ public class EstimatedHistogram
     }
 
     /**
+     * @return the sum of the counts held in all buckets, i.e. the total number of values recorded
+     */
+    public long count()
+    {
+       long sum = 0L;
+       for (int i = 0; i < buckets.length(); i++) 
+           sum += buckets.get(i);
+       return sum;
+    }
+
+    /**
      * @return true if this histogram has overflowed -- that is, a value larger than our largest bucket could bound was added
      */
     public boolean isOverflowed()


Reply via email to