[14/50] [abbrv] hbase git commit: Revert "HBASE-15477 Purge 'next block header' from cached blocks"

syuanjiang Wed, 30 Mar 2016 09:04:14 -0700

Revert "HBASE-15477 Purge 'next block header' from cached blocks"


Overcommit. Revert to fix.

This reverts commit 000117ad9fd7eb59074c9bb0da2cf1f9544d4bed.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/54a543de
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/54a543de
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/54a543de

Branch: refs/heads/hbase-12439
Commit: 54a543de229b358b438c920f04f7f2d1ff767cab
Parents: 3f3613a
Author: stack <[email protected]>
Authored: Tue Mar 22 18:37:25 2016 -0700
Committer: stack <[email protected]>
Committed: Tue Mar 22 18:37:25 2016 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hbase/io/hfile/BlockType.java |   4 -
 .../hbase/io/hfile/HFileContextBuilder.java     |  20 -
 .../org/apache/hadoop/hbase/nio/ByteBuff.java   |   6 -
 .../hbase/io/hfile/MemcachedBlockCache.java     |   2 +-
 .../hadoop/hbase/io/hfile/ChecksumUtil.java     |   5 +-
 .../hadoop/hbase/io/hfile/HFileBlock.java       | 997 ++++++++++---------
 .../hadoop/hbase/io/hfile/HFileBlockIndex.java  |   2 +-
 .../hadoop/hbase/io/hfile/HFileReaderImpl.java  |  26 +-
 .../hadoop/hbase/io/hfile/HFileScanner.java     |  12 -
 .../hbase/io/hfile/bucket/BucketCache.java      |  15 +-
 .../hbase/regionserver/KeyValueScanner.java     |  12 +-
 .../hadoop/hbase/regionserver/StoreFile.java    |   4 +-
 .../hadoop/hbase/io/hfile/CacheTestUtils.java   |  23 +-
 .../hadoop/hbase/io/hfile/TestCacheOnWrite.java |  10 +-
 .../hadoop/hbase/io/hfile/TestChecksum.java     |  27 +-
 .../hadoop/hbase/io/hfile/TestHFileBlock.java   |  27 +-
 .../io/hfile/TestHFileBlockCompatibility.java   | 750 ++++++++++++++
 .../hbase/io/hfile/TestHFileBlockIndex.java     |   3 +-
 .../io/hfile/TestHFileDataBlockEncoder.java     |  10 +-
 .../hbase/io/hfile/TestHFileEncryption.java     |   2 +-
 .../hbase/io/hfile/TestHFileWriterV3.java       |   7 +-
 .../hadoop/hbase/io/hfile/TestPrefetch.java     |   9 +-
 .../regionserver/TestCacheOnWriteInSchema.java  |   8 +-
 23 files changed, 1374 insertions(+), 607 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java
----------------------------------------------------------------------
diff --git 
a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java 
b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java
index 32eb0b2..4228f57 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java
@@ -132,10 +132,6 @@ public enum BlockType {
     out.write(magic);
   }
 
-  public void write(ByteBuffer buf) {
-    buf.put(magic);
-  }
-
   public void write(ByteBuff buf) {
     buf.put(magic);
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java
----------------------------------------------------------------------
diff --git 
a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java
 
b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java
index a6645a6..6d3bb13 100644
--- 
a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java
+++ 
b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java
@@ -55,26 +55,6 @@ public class HFileContextBuilder {
 
   private String hfileName = null;
 
-  public HFileContextBuilder() {}
-
-  /**
-   * Use this constructor if you want to change a few settings only in another 
context.
-   */
-  public HFileContextBuilder(final HFileContext hfc) {
-    this.usesHBaseChecksum = hfc.isUseHBaseChecksum();
-    this.includesMvcc = hfc.isIncludesMvcc();
-    this.includesTags = hfc.isIncludesTags();
-    this.compression = hfc.getCompression();
-    this.compressTags = hfc.isCompressTags();
-    this.checksumType = hfc.getChecksumType();
-    this.bytesPerChecksum = hfc.getBytesPerChecksum();
-    this.blocksize = hfc.getBlocksize();
-    this.encoding = hfc.getDataBlockEncoding();
-    this.cryptoContext = hfc.getEncryptionContext();
-    this.fileCreateTime = hfc.getFileCreateTime();
-    this.hfileName = hfc.getHFileName();
-  }
-
   public HFileContextBuilder withHBaseCheckSum(boolean useHBaseCheckSum) {
     this.usesHBaseChecksum = useHBaseCheckSum;
     return this;

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java
----------------------------------------------------------------------
diff --git 
a/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java 
b/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java
index 183a031..1e0e957 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java
@@ -496,12 +496,6 @@ public abstract class ByteBuff {
     return -(low + 1); // key not found.
   }
 
-  @Override
-  public String toString() {
-    return this.getClass().getSimpleName() + "[pos=" + position() + ", lim=" + 
limit() +
-        ", cap= " + capacity() + "]";
-  }
-
   public static String toStringBinary(final ByteBuff b, int off, int len) {
     StringBuilder result = new StringBuilder();
     // Just in case we are passed a 'len' that is > buffer length...

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java
----------------------------------------------------------------------
diff --git 
a/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java
 
b/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java
index ae871c4..536872e 100644
--- 
a/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java
+++ 
b/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java
@@ -260,7 +260,7 @@ public class MemcachedBlockCache implements BlockCache {
     public HFileBlock decode(CachedData d) {
       try {
         ByteBuff buf = new SingleByteBuff(ByteBuffer.wrap(d.getData()));
-        return (HFileBlock) HFileBlock.BLOCK_DESERIALIZER.deserialize(buf, 
true,
+        return (HFileBlock) HFileBlock.blockDeserializer.deserialize(buf, true,
           MemoryType.EXCLUSIVE);
       } catch (IOException e) {
         LOG.warn("Error deserializing data from memcached",e);

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java
index b0b1714..69f4330 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java
@@ -91,7 +91,7 @@ public class ChecksumUtil {
 
     // If this is an older version of the block that does not have
     // checksums, then return false indicating that checksum verification
-    // did not succeed. Actually, this method should never be called
+    // did not succeed. Actually, this methiod should never be called
     // when the minorVersion is 0, thus this is a defensive check for a
     // cannot-happen case. Since this is a cannot-happen case, it is
     // better to return false to indicate a checksum validation failure.
@@ -141,7 +141,8 @@ public class ChecksumUtil {
    * @return The number of bytes needed to store the checksum values
    */
   static long numBytes(long datasize, int bytesPerChecksum) {
-    return numChunks(datasize, bytesPerChecksum) * HFileBlock.CHECKSUM_SIZE;
+    return numChunks(datasize, bytesPerChecksum) *
+                     HFileBlock.CHECKSUM_SIZE;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
index f3402da..6268f2e 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
@@ -56,131 +56,50 @@ import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 
 /**
- * Reads {@link HFile} version 2 blocks to HFiles and via {@link Cacheable} 
Interface to caches.
- * Version 2 was introduced in hbase-0.92.0. No longer has support for version 
1 blocks since
- * hbase-1.3.0.
- *
- * <p>Version 1 was the original file block. Version 2 was introduced when we 
changed the hbase file
- * format to support multi-level block indexes and compound bloom filters 
(HBASE-3857).
+ * Reads {@link HFile} version 1 and version 2 blocks but writes version 2 
blocks only.
+ * Version 2 was introduced in hbase-0.92.0. Does read and write out to the 
filesystem but also
+ * the read and write to Cache.
  *
+ * <h3>HFileBlock: Version 1</h3>
+ * As of this writing, there should be no more version 1 blocks found out in 
the wild. Version 2
+ * as introduced in hbase-0.92.0.
+ * In version 1 all blocks are always compressed or uncompressed, as
+ * specified by the {@link HFile}'s compression algorithm, with a type-specific
+ * magic record stored in the beginning of the compressed data (i.e. one needs
+ * to uncompress the compressed block to determine the block type). There is
+ * only a single compression algorithm setting for all blocks. Offset and size
+ * information from the block index are required to read a block.
  * <h3>HFileBlock: Version 2</h3>
  * In version 2, a block is structured as follows:
  * <ul>
- * <li><b>Header:</b> See Writer#putHeader() for where header is written; 
header total size is
- * HFILEBLOCK_HEADER_SIZE
+ * <li><b>Header:</b> See Writer#putHeader(); header total size is 
HFILEBLOCK_HEADER_SIZE)
  * <ul>
- * <li>0. blockType: Magic record identifying the {@link BlockType} (8 bytes):
- * e.g. <code>DATABLK*</code>
- * <li>1. onDiskSizeWithoutHeader: Compressed -- a.k.a 'on disk' -- block 
size, excluding header,
- * but including tailing checksum bytes (4 bytes)
- * <li>2. uncompressedSizeWithoutHeader: Uncompressed block size, excluding 
header, and excluding
- * checksum bytes (4 bytes)
- * <li>3. prevBlockOffset: The offset of the previous block of the same type 
(8 bytes). This is
+ * <li>Magic record identifying the {@link BlockType} (8 bytes): e.g. 
<code>DATABLK*</code>
+ * <li>Compressed -- a.k.a 'on disk' -- block size, excluding header, but 
including
+ *     tailing checksum bytes (4 bytes)
+ * <li>Uncompressed block size, excluding header, and excluding checksum bytes 
(4 bytes)
+ * <li>The offset of the previous block of the same type (8 bytes). This is
  * used to navigate to the previous block without having to go to the block 
index
- * <li>4: For minorVersions &gt;=1, the ordinal describing checksum type (1 
byte)
- * <li>5: For minorVersions &gt;=1, the number of data bytes/checksum chunk (4 
bytes)
- * <li>6: onDiskDataSizeWithHeader: For minorVersions &gt;=1, the size of data 
'on disk', including
- * header, excluding checksums (4 bytes)
+ * <li>For minorVersions &gt;=1, the ordinal describing checksum type (1 byte)
+ * <li>For minorVersions &gt;=1, the number of data bytes/checksum chunk (4 
bytes)
+ * <li>For minorVersions &gt;=1, the size of data 'on disk', including header,
+ * excluding checksums (4 bytes)
  * </ul>
  * </li>
- * <li><b>Raw/Compressed/Encrypted/Encoded data:</b> The compression
- * algorithm is the same for all the blocks in an {@link HFile}. If 
compression is NONE, this is
- * just raw, serialized Cells.
+ * <li><b>Raw/Compressed/Encrypted/Encoded data:</b> The compression algorithm 
is the
+ * same for all the blocks in the {@link HFile}, similarly to what was done in
+ * version 1. If compression is NONE, this is just raw, serialized Cells.
  * <li><b>Tail:</b> For minorVersions &gt;=1, a series of 4 byte checksums, 
one each for
  * the number of bytes specified by bytesPerChecksum.
  * </ul>
- *
- * <h3>Caching</h3>
- * Caches cache whole blocks with trailing checksums if any. We then tag on 
some metadata, the
- * content of BLOCK_METADATA_SPACE which will be flag on if we are doing 
'hbase'
- * checksums and then the offset into the file which is needed when we re-make 
a cache key
- * when we return the block to the cache as 'done'. See {@link 
Cacheable#serialize(ByteBuffer)} and
- * {@link Cacheable#getDeserializer()}.
- *
- * <p>TODO: Should we cache the checksums? Down in 
Writer#getBlockForCaching(CacheConfig) where
- * we make a block to cache-on-write, there is an attempt at turning off 
checksums. This is not the
- * only place we get blocks to cache. We also will cache the raw return from 
an hdfs read. In this
- * case, the checksums may be present. If the cache is backed by something 
that doesn't do ECC,
- * say an SSD, we might want to preserve checksums. For now this is open 
question.
- * <p>TODO: Over in BucketCache, we save a block allocation by doing a custom 
serialization.
- * Be sure to change it if serialization changes in here. Could we add a 
method here that takes an
- * IOEngine and that then serializes to it rather than expose our internals 
over in BucketCache?
- * IOEngine is in the bucket subpackage. Pull it up? Then this class knows 
about bucketcache. Ugh.
+ * <p>Be aware that when we read from HDFS, we overread pulling in the next 
blocks' header too.
+ * We do this to save having to do two seeks to read an HFileBlock; a seek to 
read the header
+ * to figure lengths, etc., and then another seek to pull in the data.
  */
 @InterfaceAudience.Private
 public class HFileBlock implements Cacheable {
   private static final Log LOG = LogFactory.getLog(HFileBlock.class);
 
-  /** Type of block. Header field 0. */
-  private BlockType blockType;
-
-  /**
-   * Size on disk excluding header, including checksum. Header field 1.
-   * @see Writer#putHeader(byte[], int, int, int, int)
-   */
-  private int onDiskSizeWithoutHeader;
-
-  /**
-   * Size of pure data. Does not include header or checksums. Header field 2.
-   * @see Writer#putHeader(byte[], int, int, int, int)
-   */
-  private int uncompressedSizeWithoutHeader;
-
-  /**
-   * The offset of the previous block on disk. Header field 3.
-   * @see Writer#putHeader(byte[], int, int, int, int)
-   */
-  private long prevBlockOffset;
-
-  /**
-   * Size on disk of header + data. Excludes checksum. Header field 6,
-   * OR calculated from {@link #onDiskSizeWithoutHeader} when using HDFS 
checksum.
-   * @see Writer#putHeader(byte[], int, int, int, int)
-   */
-  private int onDiskDataSizeWithHeader;
-
-
-  /**
-   * The in-memory representation of the hfile block. Can be on or offheap. 
Can be backed by
-   * a single ByteBuffer or by many. Make no assumptions.
-   *
-   * <p>Be careful reading from this <code>buf</code>. Duplicate and work on 
the duplicate or if
-   * not, be sure to reset position and limit else trouble down the road.
-   *
-   * <p>TODO: Make this read-only once made.
-   *
-   * <p>We are using the ByteBuff type. ByteBuffer is not extensible yet we 
need to be able to have
-   * a ByteBuffer-like API across multiple ByteBuffers reading from a cache 
such as BucketCache.
-   * So, we have this ByteBuff type. Unfortunately, it is spread all about 
HFileBlock. Would be
-   * good if could be confined to cache-use only but hard-to-do.
-   */
-  private ByteBuff buf;
-
-  /** Meta data that holds meta information on the hfileblock.
-   */
-  private HFileContext fileContext;
-
-  /**
-   * The offset of this block in the file. Populated by the reader for
-   * convenience of access. This offset is not part of the block header.
-   */
-  private long offset = UNSET;
-
-  private MemoryType memType = MemoryType.EXCLUSIVE;
-
-  /**
-   * The on-disk size of the next block, including the header and checksums if 
present, obtained by
-   * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of 
the next block's
-   * header, or UNSET if unknown.
-   *
-   * Blocks try to carry the size of the next block to read in this data 
member. They will even have
-   * this value when served from cache. Could save a seek in the case where we 
are iterating through
-   * a file and some of the blocks come from cache. If from cache, then having 
this info to hand
-   * will save us doing a seek to read the header so we can read the body of a 
block.
-   * TODO: see how effective this is at saving seeks.
-   */
-  private int nextBlockOnDiskSize = UNSET;
-
   /**
    * On a checksum failure, do these many succeeding read requests using hdfs 
checksums before
    * auto-reenabling hbase checksum verification.
@@ -196,18 +115,14 @@ public class HFileBlock implements Cacheable {
       (int)ClassSize.estimateBase(MultiByteBuff.class, false);
 
   /**
-   * Space for metadata on a block that gets stored along with the block when 
we cache it.
-   * There are a few bytes stuck on the end of the HFileBlock that we pull in 
from HDFS (note,
+   * See #blockDeserializer method for more info.
+   * 13 bytes of extra stuff stuck on the end of the HFileBlock that we pull 
in from HDFS (note,
    * when we read from HDFS, we pull in an HFileBlock AND the header of the 
next block if one).
-   * 8 bytes are offset of this block (long) in the file. Offset is important 
because
-   * used when we remake the CacheKey when we return the block to cache when 
done. There is also
-   * a flag on whether checksumming is being done by hbase or not. See class 
comment for note on
-   * uncertain state of checksumming of blocks that come out of cache (should 
we or should we not?).
-   * Finally there 4 bytes to hold the length of the next block which can save 
a seek on occasion.
-   * <p>This EXTRA came in with original commit of the bucketcache, 
HBASE-7404. Was formerly
-   * known as EXTRA_SERIALIZATION_SPACE.
+   * The 13 bytes are: usesHBaseChecksum (1 byte) + offset of this block 
(long) +
+   * nextBlockOnDiskSizeWithHeader (int).
    */
-  static final int BLOCK_METADATA_SPACE = Bytes.SIZEOF_BYTE + 
Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT;
+  public static final int EXTRA_SERIALIZATION_SPACE =
+      Bytes.SIZEOF_BYTE + Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG;
 
   /**
    * Each checksum value is an integer that can be stored in 4 bytes.
@@ -220,47 +135,57 @@ public class HFileBlock implements Cacheable {
   /**
    * Used deserializing blocks from Cache.
    *
-   * <code>
+   * Serializing to cache is a little hard to follow. See Writer#finishBlock 
for where it is done.
+   * When we start to append to a new HFileBlock,
+   * we skip over where the header should go before we start adding Cells. 
When the block is
+   * done, we'll then go back and fill in the header and the checksum tail. Be 
aware that what
+   * gets serialized into the blockcache is a byte array that contains an 
HFileBlock followed by
+   * its checksums and then the header of the next HFileBlock (needed to help 
navigate), followed
+   * again by an extra 13 bytes of meta info needed when time to recreate the 
HFileBlock from cache.
+   *
    * ++++++++++++++
    * + HFileBlock +
    * ++++++++++++++
-   * + Checksums  + <= Optional
+   * + Checksums  +
+   * ++++++++++++++
+   * + NextHeader +
    * ++++++++++++++
-   * + Metadata!  +
+   * + ExtraMeta! +
    * ++++++++++++++
-   * </code>
-   * @see #serialize(ByteBuffer)
+   *
+   * TODO: Fix it so we do NOT put the NextHeader into blockcache. It is not 
necessary.
    */
-  static final CacheableDeserializer<Cacheable> BLOCK_DESERIALIZER =
+  static final CacheableDeserializer<Cacheable> blockDeserializer =
       new CacheableDeserializer<Cacheable>() {
         public HFileBlock deserialize(ByteBuff buf, boolean reuse, MemoryType 
memType)
         throws IOException {
-          // The buf has the file block followed by block metadata.
-          // Set limit to just before the BLOCK_METADATA_SPACE then rewind.
-          buf.limit(buf.limit() - BLOCK_METADATA_SPACE).rewind();
-          // Get a new buffer to pass the HFileBlock for it to 'own'.
-          ByteBuff newByteBuff;
+          // Rewind to just before the EXTRA_SERIALIZATION_SPACE.
+          buf.limit(buf.limit() - 
HFileBlock.EXTRA_SERIALIZATION_SPACE).rewind();
+          // Get a new buffer to pass the deserialized HFileBlock for it to 
'own'.
+          ByteBuff newByteBuffer;
           if (reuse) {
-            newByteBuff = buf.slice();
+            newByteBuffer = buf.slice();
           } else {
             int len = buf.limit();
-            newByteBuff = new SingleByteBuff(ByteBuffer.allocate(len));
-            newByteBuff.put(0, buf, buf.position(), len);
+            newByteBuffer = new SingleByteBuff(ByteBuffer.allocate(len));
+            newByteBuffer.put(0, buf, buf.position(), len);
           }
-          // Read out the BLOCK_METADATA_SPACE content and shove into our 
HFileBlock.
+          // Read out the EXTRA_SERIALIZATION_SPACE content and shove into our 
HFileBlock.
           buf.position(buf.limit());
-          buf.limit(buf.limit() + HFileBlock.BLOCK_METADATA_SPACE);
+          buf.limit(buf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE);
           boolean usesChecksum = buf.get() == (byte)1;
-          long offset = buf.getLong();
-          int nextBlockOnDiskSize = buf.getInt();
-          HFileBlock hFileBlock =
-              new HFileBlock(newByteBuff, usesChecksum, memType, offset, 
nextBlockOnDiskSize, null);
+          HFileBlock hFileBlock = new HFileBlock(newByteBuffer, usesChecksum, 
memType);
+          hFileBlock.offset = buf.getLong();
+          hFileBlock.nextBlockOnDiskSizeWithHeader = buf.getInt();
+          if (hFileBlock.hasNextBlockHeader()) {
+            hFileBlock.buf.limit(hFileBlock.buf.limit() - 
hFileBlock.headerSize());
+          }
           return hFileBlock;
         }
 
         @Override
         public int getDeserialiserIdentifier() {
-          return DESERIALIZER_IDENTIFIER;
+          return deserializerIdentifier;
         }
 
         @Override
@@ -270,36 +195,65 @@ public class HFileBlock implements Cacheable {
         }
       };
 
-  private static final int DESERIALIZER_IDENTIFIER;
+  private static final int deserializerIdentifier;
   static {
-    DESERIALIZER_IDENTIFIER =
-        
CacheableDeserializerIdManager.registerDeserializer(BLOCK_DESERIALIZER);
+    deserializerIdentifier = CacheableDeserializerIdManager
+        .registerDeserializer(blockDeserializer);
   }
 
+  /** Type of block. Header field 0. */
+  private BlockType blockType;
+
   /**
-   * Copy constructor. Creates a shallow copy of {@code that}'s buffer.
+   * Size on disk excluding header, including checksum. Header field 1.
+   * @see Writer#putHeader(byte[], int, int, int, int)
    */
-  private HFileBlock(HFileBlock that) {
-    this.blockType = that.blockType;
-    this.onDiskSizeWithoutHeader = that.onDiskSizeWithoutHeader;
-    this.uncompressedSizeWithoutHeader = that.uncompressedSizeWithoutHeader;
-    this.prevBlockOffset = that.prevBlockOffset;
-    this.buf = that.buf.duplicate();
-    this.offset = that.offset;
-    this.onDiskDataSizeWithHeader = that.onDiskDataSizeWithHeader;
-    this.fileContext = that.fileContext;
-    this.nextBlockOnDiskSize = that.nextBlockOnDiskSize;
-  }
+  private int onDiskSizeWithoutHeader;
+
+  /**
+   * Size of pure data. Does not include header or checksums. Header field 2.
+   * @see Writer#putHeader(byte[], int, int, int, int)
+   */
+  private final int uncompressedSizeWithoutHeader;
+
+  /**
+   * The offset of the previous block on disk. Header field 3.
+   * @see Writer#putHeader(byte[], int, int, int, int)
+   */
+  private final long prevBlockOffset;
+
+  /**
+   * Size on disk of header + data. Excludes checksum. Header field 6,
+   * OR calculated from {@link #onDiskSizeWithoutHeader} when using HDFS 
checksum.
+   * @see Writer#putHeader(byte[], int, int, int, int)
+   */
+  private final int onDiskDataSizeWithHeader;
+
+  /** The in-memory representation of the hfile block */
+  private ByteBuff buf;
+
+  /** Meta data that holds meta information on the hfileblock */
+  private HFileContext fileContext;
+
+  /**
+   * The offset of this block in the file. Populated by the reader for
+   * convenience of access. This offset is not part of the block header.
+   */
+  private long offset = UNSET;
+
+  /**
+   * The on-disk size of the next block, including the header, obtained by
+   * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of 
the next block's
+   * header, or -1 if unknown.
+   */
+  private int nextBlockOnDiskSizeWithHeader = UNSET;
+
+  private MemoryType memType = MemoryType.EXCLUSIVE;
 
   /**
    * Creates a new {@link HFile} block from the given fields. This constructor
    * is used when the block data has already been read and uncompressed,
-   * and is sitting in a byte buffer and we want to stuff the block into cache.
-   * See {@link Writer#getBlockForCaching(CacheConfig)}.
-   *
-   * <p>TODO: The caller presumes no checksumming
-   * required of this block instance since going into cache; checksum already 
verified on
-   * underlying block data pulled in from filesystem. Is that correct? What if 
cache is SSD?
+   * and is sitting in a byte buffer.
    *
    * @param blockType the type of this block, see {@link BlockType}
    * @param onDiskSizeWithoutHeader see {@link #onDiskSizeWithoutHeader}
@@ -313,94 +267,86 @@ public class HFileBlock implements Cacheable {
    * @param fileContext HFile meta data
    */
   HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader, int 
uncompressedSizeWithoutHeader,
-      long prevBlockOffset, ByteBuffer b, boolean fillHeader, long offset,
-      final int nextBlockOnDiskSize, int onDiskDataSizeWithHeader, 
HFileContext fileContext) {
-    init(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader,
-        prevBlockOffset, offset, onDiskDataSizeWithHeader, 
nextBlockOnDiskSize, fileContext);
-    this.buf = new SingleByteBuff(b);
+      long prevBlockOffset, ByteBuff buf, boolean fillHeader, long offset,
+      int onDiskDataSizeWithHeader, HFileContext fileContext) {
+    this.blockType = blockType;
+    this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader;
+    this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader;
+    this.prevBlockOffset = prevBlockOffset;
+    this.buf = buf;
+    this.offset = offset;
+    this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader;
+    this.fileContext = fileContext;
     if (fillHeader) {
       overwriteHeader();
     }
     this.buf.rewind();
   }
 
-  /**
-   * Creates a block from an existing buffer starting with a header. Rewinds
-   * and takes ownership of the buffer. By definition of rewind, ignores the
-   * buffer position, but if you slice the buffer beforehand, it will rewind
-   * to that point.
-   * @param buf Has header, content, and trailing checksums if present.
-   */
-  HFileBlock(ByteBuff buf, boolean usesHBaseChecksum, MemoryType memType, 
final long offset,
-      final int nextBlockOnDiskSize, HFileContext fileContext) throws 
IOException {
-    buf.rewind();
-    final BlockType blockType = BlockType.read(buf);
-    final int onDiskSizeWithoutHeader = buf.getInt();
-    final int uncompressedSizeWithoutHeader = buf.getInt();
-    final long prevBlockOffset = buf.getLong();
-    byte checksumType = buf.get();
-    int bytesPerChecksum = buf.getInt();
-    int onDiskDataSizeWithHeader = buf.getInt();
-    // This constructor is called when we deserialize a block from cache and 
when we read a block in
-    // from the fs. fileCache is null when deserialized from cache so need to 
make up one.
-    HFileContextBuilder fileContextBuilder = fileContext != null?
-        new HFileContextBuilder(fileContext): new HFileContextBuilder();
-    fileContextBuilder.withHBaseCheckSum(usesHBaseChecksum);
-    if (usesHBaseChecksum) {
-      // Use the checksum type and bytes per checksum from header, not from 
filecontext.
-      
fileContextBuilder.withChecksumType(ChecksumType.codeToType(checksumType));
-      fileContextBuilder.withBytesPerCheckSum(bytesPerChecksum);
-    } else {
-      fileContextBuilder.withChecksumType(ChecksumType.NULL);
-      fileContextBuilder.withBytesPerCheckSum(0);
-      // Need to fix onDiskDataSizeWithHeader; there are not checksums 
after-block-data
-      onDiskDataSizeWithHeader = onDiskSizeWithoutHeader + 
headerSize(usesHBaseChecksum);
-    }
-    fileContext = fileContextBuilder.build();
-    assert usesHBaseChecksum == fileContext.isUseHBaseChecksum();
-    init(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader,
-        prevBlockOffset, offset, onDiskDataSizeWithHeader, 
nextBlockOnDiskSize, fileContext);
-    this.memType = memType;
-    this.offset = offset;
-    this.buf = buf;
-    this.buf.rewind();
+  HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader, int 
uncompressedSizeWithoutHeader,
+      long prevBlockOffset, ByteBuffer buf, boolean fillHeader, long offset,
+      int onDiskDataSizeWithHeader, HFileContext fileContext) {
+    this(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader, 
prevBlockOffset,
+        new SingleByteBuff(buf), fillHeader, offset, onDiskDataSizeWithHeader, 
fileContext);
   }
 
   /**
-   * Called from constructors.
+   * Copy constructor. Creates a shallow copy of {@code that}'s buffer.
    */
-  private void init(BlockType blockType, int onDiskSizeWithoutHeader,
-      int uncompressedSizeWithoutHeader, long prevBlockOffset,
-      long offset, int onDiskDataSizeWithHeader, final int nextBlockOnDiskSize,
-      HFileContext fileContext) {
-    this.blockType = blockType;
-    this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader;
-    this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader;
-    this.prevBlockOffset = prevBlockOffset;
-    this.offset = offset;
-    this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader;
-    this.nextBlockOnDiskSize = nextBlockOnDiskSize;
-    this.fileContext = fileContext;
+  HFileBlock(HFileBlock that) {
+    this.blockType = that.blockType;
+    this.onDiskSizeWithoutHeader = that.onDiskSizeWithoutHeader;
+    this.uncompressedSizeWithoutHeader = that.uncompressedSizeWithoutHeader;
+    this.prevBlockOffset = that.prevBlockOffset;
+    this.buf = that.buf.duplicate();
+    this.offset = that.offset;
+    this.onDiskDataSizeWithHeader = that.onDiskDataSizeWithHeader;
+    this.fileContext = that.fileContext;
+    this.nextBlockOnDiskSizeWithHeader = that.nextBlockOnDiskSizeWithHeader;
+  }
+
+  HFileBlock(ByteBuffer b, boolean usesHBaseChecksum) throws IOException {
+    this(new SingleByteBuff(b), usesHBaseChecksum);
   }
 
   /**
-   * Parse total ondisk size including header and checksum. Its second field 
in header after
-   * the magic bytes.
-   * @param headerBuf Header ByteBuffer. Presumed exact size of header.
-   * @return Size of the block with header included.
+   * Creates a block from an existing buffer starting with a header. Rewinds
+   * and takes ownership of the buffer. By definition of rewind, ignores the
+   * buffer position, but if you slice the buffer beforehand, it will rewind
+   * to that point.
    */
-  private static int getOnDiskSizeWithHeader(final ByteBuffer headerBuf) {
-    // Set hbase checksum to true always calling headerSize.
-    return headerBuf.getInt(BlockType.MAGIC_LENGTH) + headerSize(true);
+  HFileBlock(ByteBuff b, boolean usesHBaseChecksum) throws IOException {
+    this(b, usesHBaseChecksum, MemoryType.EXCLUSIVE);
   }
 
   /**
-   * @return the on-disk size of the next block (including the header size and 
any checksums if
-   * present) read by peeking into the next block's header; use as a hint when 
doing
-   * a read of the next block when scanning or running over a file.
+   * Creates a block from an existing buffer starting with a header. Rewinds
+   * and takes ownership of the buffer. By definition of rewind, ignores the
+   * buffer position, but if you slice the buffer beforehand, it will rewind
+   * to that point.
    */
-  public int getNextBlockOnDiskSize() {
-    return nextBlockOnDiskSize;
+  HFileBlock(ByteBuff b, boolean usesHBaseChecksum, MemoryType memType) throws 
IOException {
+    b.rewind();
+    blockType = BlockType.read(b);
+    onDiskSizeWithoutHeader = b.getInt();
+    uncompressedSizeWithoutHeader = b.getInt();
+    prevBlockOffset = b.getLong();
+    HFileContextBuilder contextBuilder = new HFileContextBuilder();
+    contextBuilder.withHBaseCheckSum(usesHBaseChecksum);
+    if (usesHBaseChecksum) {
+      contextBuilder.withChecksumType(ChecksumType.codeToType(b.get()));
+      contextBuilder.withBytesPerCheckSum(b.getInt());
+      this.onDiskDataSizeWithHeader = b.getInt();
+    } else {
+      contextBuilder.withChecksumType(ChecksumType.NULL);
+      contextBuilder.withBytesPerCheckSum(0);
+      this.onDiskDataSizeWithHeader =
+          onDiskSizeWithoutHeader + 
HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
+    }
+    this.fileContext = contextBuilder.build();
+    this.memType = memType;
+    buf = b;
+    buf.rewind();
   }
 
   public BlockType getBlockType() {
@@ -468,26 +414,49 @@ public class HFileBlock implements Cacheable {
    * @return the buffer with header skipped and checksum omitted.
    */
   public ByteBuff getBufferWithoutHeader() {
-    ByteBuff dup = getBufferReadOnly();
-    // Now set it up so Buffer spans content only -- no header or no checksums.
-    return dup.position(headerSize()).limit(buf.limit() - 
totalChecksumBytes()).slice();
+    ByteBuff dup = this.buf.duplicate();
+    dup.position(headerSize());
+    dup.limit(buf.limit() - totalChecksumBytes());
+    return dup.slice();
   }
 
   /**
-   * Returns a read-only duplicate of the buffer this block stores internally 
ready to be read.
-   * Clients must not modify the buffer object though they may set position 
and limit on the
-   * returned buffer since we pass back a duplicate. This method has to be 
public because it is used
+   * Returns the buffer this block stores internally. The clients must not
+   * modify the buffer object. This method has to be public because it is used
    * in {@link CompoundBloomFilter} to avoid object creation on every Bloom
-   * filter lookup, but has to be used with caution. Buffer holds header, 
block content,
-   * and any follow-on checksums if present.
+   * filter lookup, but has to be used with caution. Checksum data is not
+   * included in the returned buffer but header data is.
    *
    * @return the buffer of this block for read-only operations
    */
-  public ByteBuff getBufferReadOnly() {
-    // TODO: ByteBuf does not support asReadOnlyBuffer(). Fix.
+  ByteBuff getBufferReadOnly() {
+    ByteBuff dup = this.buf.duplicate();
+    dup.limit(buf.limit() - totalChecksumBytes());
+    return dup.slice();
+  }
+
+  /**
+   * Returns the buffer of this block, including header data. The clients must
+   * not modify the buffer object. This method has to be public because it is
+   * used in {@link org.apache.hadoop.hbase.io.hfile.bucket.BucketCache} to 
avoid buffer copy.
+   *
+   * @return the buffer with header and checksum included for read-only 
operations
+   */
+  public ByteBuff getBufferReadOnlyWithHeader() {
     ByteBuff dup = this.buf.duplicate();
-    assert dup.position() == 0;
-    return dup;
+    return dup.slice();
+  }
+
+  /**
+   * Returns a byte buffer of this block, including header data and checksum, 
positioned at
+   * the beginning of header. The underlying data array is not copied.
+   *
+   * @return the byte buffer with header and checksum included
+   */
+  ByteBuff getBufferWithHeader() {
+    ByteBuff dupBuf = buf.duplicate();
+    dupBuf.rewind();
+    return dupBuf;
   }
 
   private void sanityCheckAssertion(long valueFromBuf, long valueFromField,
@@ -512,38 +481,39 @@ public class HFileBlock implements Cacheable {
    * valid header consistent with the fields. Assumes a packed block structure.
    * This function is primary for testing and debugging, and is not
    * thread-safe, because it alters the internal buffer pointer.
-   * Used by tests only.
    */
-  @VisibleForTesting
   void sanityCheck() throws IOException {
-    // Duplicate so no side-effects
-    ByteBuff dup = this.buf.duplicate().rewind();
-    sanityCheckAssertion(BlockType.read(dup), blockType);
+    buf.rewind();
+
+    sanityCheckAssertion(BlockType.read(buf), blockType);
 
-    sanityCheckAssertion(dup.getInt(), onDiskSizeWithoutHeader, 
"onDiskSizeWithoutHeader");
+    sanityCheckAssertion(buf.getInt(), onDiskSizeWithoutHeader,
+        "onDiskSizeWithoutHeader");
 
-    sanityCheckAssertion(dup.getInt(), uncompressedSizeWithoutHeader,
+    sanityCheckAssertion(buf.getInt(), uncompressedSizeWithoutHeader,
         "uncompressedSizeWithoutHeader");
 
-    sanityCheckAssertion(dup.getLong(), prevBlockOffset, "prevBlockOffset");
+    sanityCheckAssertion(buf.getLong(), prevBlockOffset, "prevBlocKOffset");
     if (this.fileContext.isUseHBaseChecksum()) {
-      sanityCheckAssertion(dup.get(), 
this.fileContext.getChecksumType().getCode(), "checksumType");
-      sanityCheckAssertion(dup.getInt(), 
this.fileContext.getBytesPerChecksum(),
+      sanityCheckAssertion(buf.get(), 
this.fileContext.getChecksumType().getCode(), "checksumType");
+      sanityCheckAssertion(buf.getInt(), 
this.fileContext.getBytesPerChecksum(),
           "bytesPerChecksum");
-      sanityCheckAssertion(dup.getInt(), onDiskDataSizeWithHeader, 
"onDiskDataSizeWithHeader");
+      sanityCheckAssertion(buf.getInt(), onDiskDataSizeWithHeader, 
"onDiskDataSizeWithHeader");
     }
 
     int cksumBytes = totalChecksumBytes();
     int expectedBufLimit = onDiskDataSizeWithHeader + cksumBytes;
-    if (dup.limit() != expectedBufLimit) {
-      throw new AssertionError("Expected limit " + expectedBufLimit + ", got " 
+ dup.limit());
+    if (buf.limit() != expectedBufLimit) {
+      throw new AssertionError("Expected buffer limit " + expectedBufLimit
+          + ", got " + buf.limit());
     }
 
     // We might optionally allocate HFILEBLOCK_HEADER_SIZE more bytes to read 
the next
     // block's header, so there are two sensible values for buffer capacity.
     int hdrSize = headerSize();
-    if (dup.capacity() != expectedBufLimit && dup.capacity() != 
expectedBufLimit + hdrSize) {
-      throw new AssertionError("Invalid buffer capacity: " + dup.capacity() +
+    if (buf.capacity() != expectedBufLimit &&
+        buf.capacity() != expectedBufLimit + hdrSize) {
+      throw new AssertionError("Invalid buffer capacity: " + buf.capacity() +
           ", expected " + expectedBufLimit + " or " + (expectedBufLimit + 
hdrSize));
     }
   }
@@ -590,6 +560,30 @@ public class HFileBlock implements Cacheable {
   }
 
   /**
+   * Called after reading a block with provided onDiskSizeWithHeader.
+   */
+  private void validateOnDiskSizeWithoutHeader(int 
expectedOnDiskSizeWithoutHeader)
+  throws IOException {
+    if (onDiskSizeWithoutHeader != expectedOnDiskSizeWithoutHeader) {
+      String dataBegin = null;
+      if (buf.hasArray()) {
+        dataBegin = Bytes.toStringBinary(buf.array(), buf.arrayOffset(), 
Math.min(32, buf.limit()));
+      } else {
+        ByteBuff bufDup = getBufferReadOnly();
+        byte[] dataBeginBytes = new byte[Math.min(32, bufDup.limit() - 
bufDup.position())];
+        bufDup.get(dataBeginBytes);
+        dataBegin = Bytes.toStringBinary(dataBeginBytes);
+      }
+      String blockInfoMsg =
+        "Block offset: " + offset + ", data starts with: " + dataBegin;
+      throw new IOException("On-disk size without header provided is "
+          + expectedOnDiskSizeWithoutHeader + ", but block "
+          + "header contains " + onDiskSizeWithoutHeader + ". " +
+          blockInfoMsg);
+    }
+  }
+
+  /**
    * Retrieves the decompressed/decrypted view of this block. An encoded block 
remains in its
    * encoded structure. Internal structures are shared between instances where 
applicable.
    */
@@ -613,10 +607,33 @@ public class HFileBlock implements Cacheable {
     ctx.prepareDecoding(unpacked.getOnDiskSizeWithoutHeader(),
       unpacked.getUncompressedSizeWithoutHeader(), 
unpacked.getBufferWithoutHeader(),
       dup);
+
+    // Preserve the next block's header bytes in the new block if we have them.
+    if (unpacked.hasNextBlockHeader()) {
+      // Both the buffers are limited till checksum bytes and avoid the next 
block's header.
+      // Below call to copyFromBufferToBuffer() will try positional read/write 
from/to buffers when
+      // any of the buffer is DBB. So we change the limit on a dup buffer. No 
copying just create
+      // new BB objects
+      ByteBuff inDup = this.buf.duplicate();
+      inDup.limit(inDup.limit() + headerSize());
+      ByteBuff outDup = unpacked.buf.duplicate();
+      outDup.limit(outDup.limit() + unpacked.headerSize());
+      outDup.put(
+          unpacked.headerSize() + unpacked.uncompressedSizeWithoutHeader
+              + unpacked.totalChecksumBytes(), inDup, 
this.onDiskDataSizeWithHeader,
+          unpacked.headerSize());
+    }
     return unpacked;
   }
 
   /**
+   * Return true when this buffer includes next block's header.
+   */
+  private boolean hasNextBlockHeader() {
+    return nextBlockOnDiskSizeWithHeader > 0;
+  }
+
+  /**
    * Always allocates a new buffer of the correct size. Copies header bytes
    * from the existing buffer. Does not change header fields.
    * Reserve room to keep checksum bytes too.
@@ -624,7 +641,8 @@ public class HFileBlock implements Cacheable {
   private void allocateBuffer() {
     int cksumBytes = totalChecksumBytes();
     int headerSize = headerSize();
-    int capacityNeeded = headerSize + uncompressedSizeWithoutHeader + 
cksumBytes;
+    int capacityNeeded = headerSize + uncompressedSizeWithoutHeader +
+        cksumBytes + (hasNextBlockHeader() ? headerSize : 0);
 
     // TODO we need consider allocating offheap here?
     ByteBuffer newBuf = ByteBuffer.allocate(capacityNeeded);
@@ -652,8 +670,9 @@ public class HFileBlock implements Cacheable {
   }
 
   /** An additional sanity-check in case no compression or encryption is being 
used. */
-  public void sanityCheckUncompressedSize() throws IOException {
-    if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + 
totalChecksumBytes()) {
+  public void assumeUncompressed() throws IOException {
+    if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader +
+        totalChecksumBytes()) {
       throw new IOException("Using no compression but "
           + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", "
           + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader
@@ -661,14 +680,11 @@ public class HFileBlock implements Cacheable {
     }
   }
 
-  /**
-   * Cannot be {@link #UNSET}. Must be a legitimate value. Used re-making the 
{@link CacheKey} when
-   * block is returned to the cache.
-   * @return the offset of this block in the file it was read from
-   */
+  /** @return the offset of this block in the file it was read from */
   long getOffset() {
     if (offset < 0) {
-      throw new IllegalStateException("HFile block offset not initialized 
properly");
+      throw new IllegalStateException(
+          "HFile block offset not initialized properly");
     }
     return offset;
   }
@@ -728,6 +744,7 @@ public class HFileBlock implements Cacheable {
         // We could not read the "extra data", but that is OK.
         break;
       }
+
       if (ret < 0) {
         throw new IOException("Premature EOF from inputStream (read "
             + "returned " + ret + ", was trying to read " + necessaryLen
@@ -782,6 +799,14 @@ public class HFileBlock implements Cacheable {
   }
 
   /**
+   * @return the on-disk size of the next block (including the header size)
+   *         that was read by peeking into the next block's header
+   */
+  public int getNextBlockOnDiskSizeWithHeader() {
+    return nextBlockOnDiskSizeWithHeader;
+  }
+
+  /**
    * Unified version 2 {@link HFile} block writer. The intended usage pattern
    * is as follows:
    * <ol>
@@ -813,8 +838,8 @@ public class HFileBlock implements Cacheable {
     private HFileBlockDefaultEncodingContext defaultBlockEncodingCtx;
 
     /**
-     * The stream we use to accumulate data into a block in an uncompressed 
format.
-     * We reset this stream at the end of each block and reuse it. The
+     * The stream we use to accumulate data in uncompressed format for each
+     * block. We reset this stream at the end of each block and reuse it. The
      * header is written as the first {@link 
HConstants#HFILEBLOCK_HEADER_SIZE} bytes into this
      * stream.
      */
@@ -842,7 +867,7 @@ public class HFileBlock implements Cacheable {
      * if compression is turned on. It also includes the checksum data that
      * immediately follows the block data. (header + data + checksums)
      */
-    private byte[] onDiskBlockBytesWithHeader;
+    private byte[] onDiskBytesWithHeader;
 
     /**
      * The size of the checksum data on disk. It is used only if data is
@@ -859,7 +884,7 @@ public class HFileBlock implements Cacheable {
      * {@link org.apache.hadoop.hbase.HConstants#HFILEBLOCK_HEADER_SIZE}.
      * Does not store checksums.
      */
-    private byte[] uncompressedBlockBytesWithHeader;
+    private byte[] uncompressedBytesWithHeader;
 
     /**
      * Current block's start offset in the {@link HFile}. Set in
@@ -967,19 +992,18 @@ public class HFileBlock implements Cacheable {
       Preconditions.checkState(state != State.INIT,
           "Unexpected state: " + state);
 
-      if (state == State.BLOCK_READY) {
+      if (state == State.BLOCK_READY)
         return;
-      }
 
       // This will set state to BLOCK_READY.
       finishBlock();
     }
 
     /**
-     * Finish up writing of the block.
-     * Flushes the compressing stream (if using compression), fills out the 
header,
-     * does any compression/encryption of bytes to flush out to disk, and 
manages
-     * the cache on write content, if applicable. Sets block write state to 
"block ready".
+     * An internal method that flushes the compressing stream (if using
+     * compression), serializes the header, and takes care of the separate
+     * uncompressed stream for caching on write, if applicable. Sets block
+     * write state to "block ready".
      */
     private void finishBlock() throws IOException {
       if (blockType == BlockType.DATA) {
@@ -988,40 +1012,41 @@ public class HFileBlock implements Cacheable {
         blockType = dataBlockEncodingCtx.getBlockType();
       }
       userDataStream.flush();
-      // This does an array copy, so it is safe to cache this byte array when 
cache-on-write.
+      // This does an array copy, so it is safe to cache this byte array.
       // Header is still the empty, 'dummy' header that is yet to be filled 
out.
-      uncompressedBlockBytesWithHeader = baosInMemory.toByteArray();
+      uncompressedBytesWithHeader = baosInMemory.toByteArray();
       prevOffset = prevOffsetByType[blockType.getId()];
 
-      // We need to set state before we can package the block up for 
cache-on-write. In a way, the
-      // block is ready, but not yet encoded or compressed.
+      // We need to set state before we can package the block up for
+      // cache-on-write. In a way, the block is ready, but not yet encoded or
+      // compressed.
       state = State.BLOCK_READY;
       if (blockType == BlockType.DATA || blockType == BlockType.ENCODED_DATA) {
-        onDiskBlockBytesWithHeader = dataBlockEncodingCtx.
-            compressAndEncrypt(uncompressedBlockBytesWithHeader);
+        onDiskBytesWithHeader = dataBlockEncodingCtx
+            .compressAndEncrypt(uncompressedBytesWithHeader);
       } else {
-        onDiskBlockBytesWithHeader = defaultBlockEncodingCtx.
-            compressAndEncrypt(uncompressedBlockBytesWithHeader);
+        onDiskBytesWithHeader = this.defaultBlockEncodingCtx.
+            compressAndEncrypt(uncompressedBytesWithHeader);
       }
       // Calculate how many bytes we need for checksum on the tail of the 
block.
       int numBytes = (int) ChecksumUtil.numBytes(
-          onDiskBlockBytesWithHeader.length,
+          onDiskBytesWithHeader.length,
           fileContext.getBytesPerChecksum());
 
       // Put the header for the on disk bytes; header currently is unfilled-out
-      putHeader(onDiskBlockBytesWithHeader, 0,
-          onDiskBlockBytesWithHeader.length + numBytes,
-          uncompressedBlockBytesWithHeader.length, 
onDiskBlockBytesWithHeader.length);
+      putHeader(onDiskBytesWithHeader, 0,
+          onDiskBytesWithHeader.length + numBytes,
+          uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length);
       // Set the header for the uncompressed bytes (for cache-on-write) -- IFF 
different from
-      // onDiskBlockBytesWithHeader array.
-      if (onDiskBlockBytesWithHeader != uncompressedBlockBytesWithHeader) {
-        putHeader(uncompressedBlockBytesWithHeader, 0,
-          onDiskBlockBytesWithHeader.length + numBytes,
-          uncompressedBlockBytesWithHeader.length, 
onDiskBlockBytesWithHeader.length);
+      // onDiskBytesWithHeader array.
+      if (onDiskBytesWithHeader != uncompressedBytesWithHeader) {
+        putHeader(uncompressedBytesWithHeader, 0,
+          onDiskBytesWithHeader.length + numBytes,
+          uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length);
       }
       onDiskChecksum = new byte[numBytes];
       ChecksumUtil.generateChecksums(
-          onDiskBlockBytesWithHeader, 0, onDiskBlockBytesWithHeader.length,
+          onDiskBytesWithHeader, 0, onDiskBytesWithHeader.length,
           onDiskChecksum, 0, fileContext.getChecksumType(), 
fileContext.getBytesPerChecksum());
     }
 
@@ -1076,7 +1101,7 @@ public class HFileBlock implements Cacheable {
     protected void finishBlockAndWriteHeaderAndData(DataOutputStream out)
       throws IOException {
       ensureBlockReady();
-      out.write(onDiskBlockBytesWithHeader);
+      out.write(onDiskBytesWithHeader);
       out.write(onDiskChecksum);
     }
 
@@ -1095,12 +1120,12 @@ public class HFileBlock implements Cacheable {
       // This is not very optimal, because we are doing an extra copy.
       // But this method is used only by unit tests.
       byte[] output =
-          new byte[onDiskBlockBytesWithHeader.length
+          new byte[onDiskBytesWithHeader.length
               + onDiskChecksum.length];
-      System.arraycopy(onDiskBlockBytesWithHeader, 0, output, 0,
-          onDiskBlockBytesWithHeader.length);
+      System.arraycopy(onDiskBytesWithHeader, 0, output, 0,
+          onDiskBytesWithHeader.length);
       System.arraycopy(onDiskChecksum, 0, output,
-          onDiskBlockBytesWithHeader.length, onDiskChecksum.length);
+          onDiskBytesWithHeader.length, onDiskChecksum.length);
       return output;
     }
 
@@ -1128,7 +1153,7 @@ public class HFileBlock implements Cacheable {
      */
     int getOnDiskSizeWithoutHeader() {
       expectState(State.BLOCK_READY);
-      return onDiskBlockBytesWithHeader.length +
+      return onDiskBytesWithHeader.length +
           onDiskChecksum.length - HConstants.HFILEBLOCK_HEADER_SIZE;
     }
 
@@ -1141,7 +1166,7 @@ public class HFileBlock implements Cacheable {
      */
     int getOnDiskSizeWithHeader() {
       expectState(State.BLOCK_READY);
-      return onDiskBlockBytesWithHeader.length + onDiskChecksum.length;
+      return onDiskBytesWithHeader.length + onDiskChecksum.length;
     }
 
     /**
@@ -1149,7 +1174,7 @@ public class HFileBlock implements Cacheable {
      */
     int getUncompressedSizeWithoutHeader() {
       expectState(State.BLOCK_READY);
-      return uncompressedBlockBytesWithHeader.length - 
HConstants.HFILEBLOCK_HEADER_SIZE;
+      return uncompressedBytesWithHeader.length - 
HConstants.HFILEBLOCK_HEADER_SIZE;
     }
 
     /**
@@ -1157,7 +1182,7 @@ public class HFileBlock implements Cacheable {
      */
     int getUncompressedSizeWithHeader() {
       expectState(State.BLOCK_READY);
-      return uncompressedBlockBytesWithHeader.length;
+      return uncompressedBytesWithHeader.length;
     }
 
     /** @return true if a block is being written  */
@@ -1187,7 +1212,7 @@ public class HFileBlock implements Cacheable {
      */
     ByteBuffer getUncompressedBufferWithHeader() {
       expectState(State.BLOCK_READY);
-      return ByteBuffer.wrap(uncompressedBlockBytesWithHeader);
+      return ByteBuffer.wrap(uncompressedBytesWithHeader);
     }
 
     /**
@@ -1200,7 +1225,7 @@ public class HFileBlock implements Cacheable {
      */
     ByteBuffer getOnDiskBufferWithHeader() {
       expectState(State.BLOCK_READY);
-      return ByteBuffer.wrap(onDiskBlockBytesWithHeader);
+      return ByteBuffer.wrap(onDiskBytesWithHeader);
     }
 
     private void expectState(State expectedState) {
@@ -1232,10 +1257,6 @@ public class HFileBlock implements Cacheable {
      * block does not have checksum data even though the header minor
      * version is MINOR_VERSION_WITH_CHECKSUM. This is indicated by setting a
      * 0 value in bytesPerChecksum.
-     *
-     * <p>TODO: Should there be an option where a cache can ask that hbase 
preserve block
-     * checksums for checking after a block comes out of the cache? Otehrwise, 
cache is responsible
-     * for blocks being wholesome (ECC memory or if file-backed, it does 
checksumming).
      */
     HFileBlock getBlockForCaching(CacheConfig cacheConf) {
       HFileContext newContext = new HFileContextBuilder()
@@ -1249,13 +1270,13 @@ public class HFileBlock implements Cacheable {
                                 .withIncludesMvcc(fileContext.isIncludesMvcc())
                                 .withIncludesTags(fileContext.isIncludesTags())
                                 .build();
-       return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(),
+      return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(),
           getUncompressedSizeWithoutHeader(), prevOffset,
-          cacheConf.shouldCacheCompressed(blockType.getCategory())?
+          cacheConf.shouldCacheCompressed(blockType.getCategory()) ?
             getOnDiskBufferWithHeader() :
             getUncompressedBufferWithHeader(),
-          FILL_HEADER, startOffset, UNSET,
-          onDiskBlockBytesWithHeader.length + onDiskChecksum.length, 
newContext);
+          FILL_HEADER, startOffset,
+          onDiskBytesWithHeader.length + onDiskChecksum.length, newContext);
     }
   }
 
@@ -1301,9 +1322,12 @@ public class HFileBlock implements Cacheable {
      * @param offset
      * @param onDiskSize the on-disk size of the entire block, including all
      *          applicable headers, or -1 if unknown
+     * @param uncompressedSize the uncompressed size of the compressed part of
+     *          the block, or -1 if unknown
      * @return the newly read block
      */
-    HFileBlock readBlockData(long offset, long onDiskSize, boolean pread) 
throws IOException;
+    HFileBlock readBlockData(long offset, long onDiskSize,
+        int uncompressedSize, boolean pread) throws IOException;
 
     /**
      * Creates a block iterator over the given portion of the {@link HFile}.
@@ -1356,11 +1380,6 @@ public class HFileBlock implements Cacheable {
     /** Default context used when BlockType != {@link BlockType#ENCODED_DATA}. 
*/
     private final HFileBlockDefaultDecodingContext defaultDecodingCtx;
 
-    /**
-     * When we read a block, we overread and pull in the next blocks header 
too. We will save it
-     * here. If moving serially through the file, we will trip over this 
caching of the next blocks
-     * header so we won't have to do explicit seek to find next blocks 
lengths, etc.
-     */
     private ThreadLocal<PrefetchedHeader> prefetchedHeaderForThread =
         new ThreadLocal<PrefetchedHeader>() {
       @Override
@@ -1424,7 +1443,7 @@ public class HFileBlock implements Cacheable {
         public HFileBlock nextBlock() throws IOException {
           if (offset >= endOffset)
             return null;
-          HFileBlock b = readBlockData(offset, -1, false);
+          HFileBlock b = readBlockData(offset, -1, -1, false);
           offset += b.getOnDiskSizeWithHeader();
           return b.unpack(fileContext, owner);
         }
@@ -1444,7 +1463,7 @@ public class HFileBlock implements Cacheable {
 
     /**
      * Does a positional read or a seek and read into the given buffer. Returns
-     * the on-disk size of the next block, or -1 if it could not be 
read/determined; e.g. EOF.
+     * the on-disk size of the next block, or -1 if it could not be determined.
      *
      * @param dest destination buffer
      * @param destOffset offset into the destination buffer at where to put 
the bytes we read
@@ -1454,8 +1473,7 @@ public class HFileBlock implements Cacheable {
      * @param pread whether we should do a positional read
      * @param istream The input source of data
      * @return the on-disk size of the next block with header size included, or
-     *         -1 if it could not be determined; if not -1, the 
<code>dest</code> INCLUDES the
-     *         next header
+     *         -1 if it could not be determined
      * @throws IOException
      */
     protected int readAtOffset(FSDataInputStream istream, byte [] dest, int 
destOffset, int size,
@@ -1487,16 +1505,16 @@ public class HFileBlock implements Cacheable {
           }
 
           // Try to read the next block header.
-          if (!readWithExtra(istream, dest, destOffset, size, hdrSize)) {
+          if (!readWithExtra(istream, dest, destOffset, size, hdrSize))
             return -1;
-          }
         } finally {
           streamLock.unlock();
         }
       } else {
         // Positional read. Better for random reads; or when the streamLock is 
already locked.
         int extraSize = peekIntoNextBlock ? hdrSize : 0;
-        if (!positionalReadWithExtra(istream, fileOffset, dest, destOffset, 
size, extraSize)) {
+        if (!positionalReadWithExtra(istream, fileOffset, dest, destOffset,
+            size, extraSize)) {
           return -1;
         }
       }
@@ -1512,12 +1530,16 @@ public class HFileBlock implements Cacheable {
      * @param offset the offset in the stream to read at
      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
      *          the header, or -1 if unknown
+     * @param uncompressedSize the uncompressed size of the the block. Always
+     *          expected to be -1. This parameter is only used in version 1.
      * @param pread whether to use a positional read
      */
     @Override
-    public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL, 
boolean pread)
+    public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL,
+        int uncompressedSize, boolean pread)
     throws IOException {
-      // Get a copy of the current state of whether to validate
+
+      // get a copy of the current state of whether to validate
       // hbase checksums or not for this read call. This is not
       // thread-safe but the one constaint is that if we decide
       // to skip hbase checksum verification then we are
@@ -1526,7 +1548,8 @@ public class HFileBlock implements Cacheable {
       FSDataInputStream is = 
streamWrapper.getStream(doVerificationThruHBaseChecksum);
 
       HFileBlock blk = readBlockDataInternal(is, offset,
-                         onDiskSizeWithHeaderL, pread,
+                         onDiskSizeWithHeaderL,
+                         uncompressedSize, pread,
                          doVerificationThruHBaseChecksum);
       if (blk == null) {
         HFile.LOG.warn("HBase checksum verification failed for file " +
@@ -1553,7 +1576,8 @@ public class HFileBlock implements Cacheable {
         // a few more than precisely this number.
         is = 
this.streamWrapper.fallbackToFsChecksum(CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD);
         doVerificationThruHBaseChecksum = false;
-        blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL, pread,
+        blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL,
+                                    uncompressedSize, pread,
                                     doVerificationThruHBaseChecksum);
         if (blk != null) {
           HFile.LOG.warn("HDFS checksum verification suceeded for file " +
@@ -1581,139 +1605,175 @@ public class HFileBlock implements Cacheable {
     }
 
     /**
-     * @return Check <code>onDiskSizeWithHeaderL</code> size is healthy and 
then return it as an int
-     * @throws IOException
-     */
-    private static int checkAndGetSizeAsInt(final long onDiskSizeWithHeaderL, 
final int hdrSize)
-    throws IOException {
-      if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1)
-          || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) {
-        throw new IOException("Invalid onDisksize=" + onDiskSizeWithHeaderL
-            + ": expected to be at least " + hdrSize
-            + " and at most " + Integer.MAX_VALUE + ", or -1");
-      }
-      return (int)onDiskSizeWithHeaderL;
-    }
-
-    /**
-     * Check threadlocal cache for this block's header; we usually read it on 
the tail of reading
-     * the previous block to save a seek. Otherwise, we have to do a seek to 
read the header before
-     * we can pull in the block.
-     * @return The cached block header or null if not found.
-     * @see #cacheNextBlockHeader(long, byte[], int, int)
-     */
-    private ByteBuffer getCachedHeader(final long offset) {
-      PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get();
-      // PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get();
-      return prefetchedHeader != null && prefetchedHeader.offset == offset?
-          prefetchedHeader.buf: null;
-    }
-
-    /**
-     * Save away the next blocks header in thread local.
-     * @see #getCachedHeader(long)
-     */
-    private void cacheNextBlockHeader(final long nextBlockOffset,
-        final byte [] header, final int headerOffset, final int headerLength) {
-      PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get();
-      prefetchedHeader.offset = nextBlockOffset;
-      System.arraycopy(header, headerOffset, prefetchedHeader.header, 0, 
headerLength);
-    }
-
-    /**
-     * Verify the passed in onDiskSizeWithHeader aligns with what is in the 
header else something
-     * is not right.
-     * @throws IOException
-     */
-    private void verifyOnDiskSizeMatchesHeader(final int passedIn, final 
ByteBuffer headerBuf,
-        final long offset)
-    throws IOException {
-      // Assert size provided aligns with what is in the header
-      int fromHeader = getOnDiskSizeWithHeader(headerBuf);
-      if (passedIn != fromHeader) {
-        throw new IOException("Passed in onDiskSizeWithHeader=" + passedIn + " 
!= " + fromHeader +
-            ", offset=" + offset + ", fileContext=" + this.fileContext);
-      }
-    }
-
-    /**
      * Reads a version 2 block.
      *
      * @param offset the offset in the stream to read at
      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
-     *          the header and checksums if present or -1 if unknown
+     *          the header, or -1 if unknown
+     * @param uncompressedSize the uncompressed size of the the block. Always
+     *          expected to be -1. This parameter is only used in version 1.
      * @param pread whether to use a positional read
      * @param verifyChecksum Whether to use HBase checksums.
      *        If HBase checksum is switched off, then use HDFS checksum.
      * @return the HFileBlock or null if there is a HBase checksum mismatch
      */
     private HFileBlock readBlockDataInternal(FSDataInputStream is, long offset,
-        long onDiskSizeWithHeaderL, boolean pread, boolean verifyChecksum)
+        long onDiskSizeWithHeaderL, int uncompressedSize, boolean pread,
+        boolean verifyChecksum)
     throws IOException {
       if (offset < 0) {
         throw new IOException("Invalid offset=" + offset + " trying to read "
-            + "block (onDiskSize=" + onDiskSizeWithHeaderL + ")");
+            + "block (onDiskSize=" + onDiskSizeWithHeaderL
+            + ", uncompressedSize=" + uncompressedSize + ")");
       }
-      int onDiskSizeWithHeader = checkAndGetSizeAsInt(onDiskSizeWithHeaderL, 
hdrSize);
-      ByteBuffer headerBuf = getCachedHeader(offset);
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("Reading " + this.fileContext.getHFileName() + " at offset=" 
+ offset +
-          ", pread=" + pread + ", verifyChecksum=" + verifyChecksum + ", 
cachedHeader=" +
-          headerBuf + ", onDiskSizeWithHeader=" + onDiskSizeWithHeader);
+
+      if (uncompressedSize != -1) {
+        throw new IOException("Version 2 block reader API does not need " +
+            "the uncompressed size parameter");
+      }
+
+      if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1)
+          || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) {
+        throw new IOException("Invalid onDisksize=" + onDiskSizeWithHeaderL
+            + ": expected to be at least " + hdrSize
+            + " and at most " + Integer.MAX_VALUE + ", or -1 (offset="
+            + offset + ", uncompressedSize=" + uncompressedSize + ")");
+      }
+
+      int onDiskSizeWithHeader = (int) onDiskSizeWithHeaderL;
+
+      // See if we can avoid reading the header. This is desirable, because
+      // we will not incur a backward seek operation if we have already
+      // read this block's header as part of the previous read's look-ahead.
+      // And we also want to skip reading the header again if it has already
+      // been read.
+      // TODO: How often does this optimization fire? Has to be same thread so 
the thread local
+      // is pertinent and we have to be reading next block as in a big scan.
+      ByteBuffer headerBuf = null;
+      PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get();
+      boolean preReadHeader = false;
+      if (prefetchedHeader != null && prefetchedHeader.offset == offset) {
+        headerBuf = prefetchedHeader.buf;
+        preReadHeader = true;
       }
-      if (onDiskSizeWithHeader <= 0) {
-        // We were not passed the block size. Need to get it from the header. 
If header was not in
-        // cache, need to seek to pull it in. This latter might happen when we 
are doing the first
-        // read in a series of reads or a random read, and we don't have 
access to the block index.
-        // This is costly and should happen very rarely.
+      // Allocate enough space to fit the next block's header too.
+      int nextBlockOnDiskSize = 0;
+      byte[] onDiskBlock = null;
+
+      HFileBlock b = null;
+      boolean fastPath = false;
+      boolean readHdrOnly = false;
+      if (onDiskSizeWithHeader > 0) {
+        fastPath = true;
+        // We know the total on-disk size. Read the entire block into memory,
+        // then parse the header. This code path is used when
+        // doing a random read operation relying on the block index, as well as
+        // when the client knows the on-disk size from peeking into the next
+        // block's header (e.g. this block's header) when reading the previous
+        // block. This is the faster and more preferable case.
+
+        // Size that we have to skip in case we have already read the header.
+        int preReadHeaderSize = headerBuf == null ? 0 : hdrSize;
+        onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize]; // room for 
this block plus the
+                                                                // next 
block's header
+        nextBlockOnDiskSize = readAtOffset(is, onDiskBlock,
+            preReadHeaderSize, onDiskSizeWithHeader - preReadHeaderSize,
+            true, offset + preReadHeaderSize, pread);
+        if (headerBuf != null) {
+          // the header has been read when reading the previous block, copy
+          // to this block's header
+          // headerBuf is HBB
+          assert headerBuf.hasArray();
+          System.arraycopy(headerBuf.array(),
+              headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize);
+        } else {
+          headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize);
+        }
+        // We know the total on-disk size but not the uncompressed size. Parse 
the header.
+        try {
+          // TODO: FIX!!! Expensive parse just to get a length
+          b = new HFileBlock(headerBuf, fileContext.isUseHBaseChecksum());
+        } catch (IOException ex) {
+          // Seen in load testing. Provide comprehensive debug info.
+          throw new IOException("Failed to read compressed block at "
+              + offset
+              + ", onDiskSizeWithoutHeader="
+              + onDiskSizeWithHeader
+              + ", preReadHeaderSize="
+              + hdrSize
+              + ", header.length="
+              + prefetchedHeader.header.length
+              + ", header bytes: "
+              + Bytes.toStringBinary(prefetchedHeader.header, 0,
+                  hdrSize), ex);
+        }
+        // if the caller specifies a onDiskSizeWithHeader, validate it.
+        int onDiskSizeWithoutHeader = onDiskSizeWithHeader - hdrSize;
+        assert onDiskSizeWithoutHeader >= 0;
+        b.validateOnDiskSizeWithoutHeader(onDiskSizeWithoutHeader);
+      } else {
+        // Check headerBuf to see if we have read this block's header as part 
of
+        // reading the previous block. This is an optimization of peeking into
+        // the next block's header (e.g.this block's header) when reading the
+        // previous block. This is the faster and more preferable case. If the
+        // header is already there, don't read the header again.
+
+        // Unfortunately, we still have to do a separate read operation to
+        // read the header.
         if (headerBuf == null) {
+          readHdrOnly = true;
+          // From the header, determine the on-disk size of the given hfile
+          // block, and read the remaining data, thereby incurring two read
+          // operations. This might happen when we are doing the first read
+          // in a series of reads or a random read, and we don't have access
+          // to the block index. This is costly and should happen very rarely.
           headerBuf = ByteBuffer.allocate(hdrSize);
-          readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(), 
hdrSize, false,
-              offset, pread);
+          // headerBuf is HBB
+          readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(),
+              hdrSize, false, offset, pread);
         }
-        onDiskSizeWithHeader = getOnDiskSizeWithHeader(headerBuf);
-      }
-      int preReadHeaderSize = headerBuf == null? 0 : hdrSize;
-      // Allocate enough space to fit the next block's header too; saves a 
seek next time through.
-      // onDiskBlock is whole block + header + checksums then extra hdrSize to 
read next header;
-      // onDiskSizeWithHeader is header, body, and any checksums if present.
-      // TODO: Make this ByteBuffer-based. Will make it easier to go to HDFS 
with BBPool (offheap).
-      byte[] onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize];
-      int nextBlockOnDiskSize = readAtOffset(is, onDiskBlock, 
preReadHeaderSize,
-          onDiskSizeWithHeader - preReadHeaderSize, true, offset + 
preReadHeaderSize, pread);
-      if (headerBuf != null) {
-        // The header has been read when reading the previous block OR in a 
distinct header-only
-        // read. Copy to this block's header.
+        // TODO: FIX!!! Expensive parse just to get a length
+        b = new HFileBlock(headerBuf, fileContext.isUseHBaseChecksum());
+        // onDiskBlock is whole block + header + checksums then extra hdrSize 
to read next header
+        onDiskBlock = new byte[b.getOnDiskSizeWithHeader() + hdrSize];
+        // headerBuf is HBB. Copy hdr into onDiskBlock
         System.arraycopy(headerBuf.array(), headerBuf.arrayOffset(), 
onDiskBlock, 0, hdrSize);
-      } else {
-        headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize);
+        nextBlockOnDiskSize = readAtOffset(is, onDiskBlock, hdrSize,
+            b.getOnDiskSizeWithHeader() - hdrSize, true, offset + hdrSize, 
pread);
+        onDiskSizeWithHeader = b.onDiskSizeWithoutHeader + hdrSize;
       }
-      // Do a few checks before we go instantiate HFileBlock.
-      assert onDiskSizeWithHeader > this.hdrSize;
-      verifyOnDiskSizeMatchesHeader(onDiskSizeWithHeader, headerBuf, offset);
-      // The onDiskBlock will become the headerAndDataBuffer for this block.
-      // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already
-      // contains the header of next block, so no need to set next block's 
header in it.
-      HFileBlock hFileBlock =
-          new HFileBlock(new SingleByteBuff(ByteBuffer.wrap(onDiskBlock, 0, 
onDiskSizeWithHeader)),
-              this.fileContext.isUseHBaseChecksum(), MemoryType.EXCLUSIVE, 
offset,
-              nextBlockOnDiskSize, fileContext);
-      // Run check on uncompressed sizings.
+
       if (!fileContext.isCompressedOrEncrypted()) {
-        hFileBlock.sanityCheckUncompressed();
+        b.assumeUncompressed();
       }
-      if (verifyChecksum && !validateBlockChecksum(hFileBlock, offset, 
onDiskBlock, hdrSize)) {
-        return null;
+
+      if (verifyChecksum && !validateBlockChecksum(b, offset, onDiskBlock, 
hdrSize)) {
+        return null;             // checksum mismatch
       }
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("Read " + hFileBlock);
+
+      // The onDiskBlock will become the headerAndDataBuffer for this block.
+      // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already
+      // contains the header of next block, so no need to set next
+      // block's header in it.
+      b = new HFileBlock(ByteBuffer.wrap(onDiskBlock, 0, onDiskSizeWithHeader),
+        this.fileContext.isUseHBaseChecksum());
+
+      b.nextBlockOnDiskSizeWithHeader = nextBlockOnDiskSize;
+
+      // Set prefetched header
+      if (b.hasNextBlockHeader()) {
+        prefetchedHeader.offset = offset + b.getOnDiskSizeWithHeader();
+        System.arraycopy(onDiskBlock, onDiskSizeWithHeader, 
prefetchedHeader.header, 0, hdrSize);
       }
-      // Cache next block header if we read it for the next time through here.
-      if (nextBlockOnDiskSize != -1) {
-        cacheNextBlockHeader(offset + hFileBlock.getOnDiskSizeWithHeader(),
-            onDiskBlock, onDiskSizeWithHeader, hdrSize);
+
+      b.offset = offset;
+      b.fileContext.setIncludesTags(this.fileContext.isIncludesTags());
+      b.fileContext.setIncludesMvcc(this.fileContext.isIncludesMvcc());
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Read preReadHeader=" + preReadHeader + ", fastPath=" + 
fastPath +
+            ", readHdrOnly=" + readHdrOnly + ", " + b);
       }
-      return hFileBlock;
+      return b;
     }
 
     @Override
@@ -1759,73 +1819,42 @@ public class HFileBlock implements Cacheable {
     }
   }
 
-  /** An additional sanity-check in case no compression or encryption is being 
used. */
-  void sanityCheckUncompressed() throws IOException {
-    if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader +
-        totalChecksumBytes()) {
-      throw new IOException("Using no compression but "
-          + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", "
-          + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader
-          + ", numChecksumbytes=" + totalChecksumBytes());
-    }
-  }
-
-  // Cacheable implementation
   @Override
   public int getSerializedLength() {
     if (buf != null) {
-      // Include extra bytes for block metadata.
-      return this.buf.limit() + BLOCK_METADATA_SPACE;
+      // include extra bytes for the next header when it's available.
+      int extraSpace = hasNextBlockHeader() ? headerSize() : 0;
+      return this.buf.limit() + extraSpace + 
HFileBlock.EXTRA_SERIALIZATION_SPACE;
     }
     return 0;
   }
 
-  // Cacheable implementation
   @Override
   public void serialize(ByteBuffer destination) {
-    // BE CAREFUL!! There is a custom version of this serialization over in 
BucketCache#doDrain.
-    // Make sure any changes in here are reflected over there.
-    this.buf.get(destination, 0, getSerializedLength() - BLOCK_METADATA_SPACE);
-    destination = addMetaData(destination);
-
-    // Make it ready for reading. flip sets position to zero and limit to 
current position which
-    // is what we want if we do not want to serialize the block plus checksums 
if present plus
-    // metadata.
-    destination.flip();
-  }
-
-  /**
-   * For use by bucketcache. This exposes internals.
-   */
-  public ByteBuffer getMetaData() {
-    ByteBuffer bb = ByteBuffer.allocate(BLOCK_METADATA_SPACE);
-    bb = addMetaData(bb);
-    bb.flip();
-    return bb;
+    this.buf.get(destination, 0, getSerializedLength() - 
EXTRA_SERIALIZATION_SPACE);
+    serializeExtraInfo(destination);
   }
 
   /**
-   * Adds metadata at current position (position is moved forward). Does not 
flip or reset.
-   * @return The passed <code>destination</code> with metadata added.
+   * Write out the content of EXTRA_SERIALIZATION_SPACE. Public so can be 
accessed by BucketCache.
    */
-  private ByteBuffer addMetaData(final ByteBuffer destination) {
+  public void serializeExtraInfo(ByteBuffer destination) {
     destination.put(this.fileContext.isUseHBaseChecksum() ? (byte) 1 : (byte) 
0);
     destination.putLong(this.offset);
-    destination.putInt(this.nextBlockOnDiskSize);
-    return destination;
+    destination.putInt(this.nextBlockOnDiskSizeWithHeader);
+    destination.rewind();
   }
 
-  // Cacheable implementation
   @Override
   public CacheableDeserializer<Cacheable> getDeserializer() {
-    return HFileBlock.BLOCK_DESERIALIZER;
+    return HFileBlock.blockDeserializer;
   }
 
   @Override
   public int hashCode() {
     int result = 1;
     result = result * 31 + blockType.hashCode();
-    result = result * 31 + nextBlockOnDiskSize;
+    result = result * 31 + nextBlockOnDiskSizeWithHeader;
     result = result * 31 + (int) (offset ^ (offset >>> 32));
     result = result * 31 + onDiskSizeWithoutHeader;
     result = result * 31 + (int) (prevBlockOffset ^ (prevBlockOffset >>> 32));
@@ -1851,10 +1880,9 @@ public class HFileBlock implements Cacheable {
     if (castedComparison.blockType != this.blockType) {
       return false;
     }
-    if (castedComparison.nextBlockOnDiskSize != this.nextBlockOnDiskSize) {
+    if (castedComparison.nextBlockOnDiskSizeWithHeader != 
this.nextBlockOnDiskSizeWithHeader) {
       return false;
     }
-    // Offset is important. Needed when we have to remake cachekey when block 
is returned to cache.
     if (castedComparison.offset != this.offset) {
       return false;
     }
@@ -1940,7 +1968,7 @@ public class HFileBlock implements Cacheable {
   }
 
   /**
-   * @return This HFileBlocks fileContext which will a derivative of the
+   * @return the HFileContext used to create this HFileBlock. Not necessary the
    * fileContext for the file from which this block's data was originally read.
    */
   HFileContext getHFileContext() {
@@ -1964,7 +1992,6 @@ public class HFileBlock implements Cacheable {
    * This is mostly helpful for debugging. This assumes that the block
    * has minor version > 0.
    */
-  @VisibleForTesting
   static String toStringHeader(ByteBuff buf) throws IOException {
     byte[] magicBuf = new byte[Math.min(buf.limit() - buf.position(), 
BlockType.MAGIC_LENGTH)];
     buf.get(magicBuf);

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
index 506f08d..9f29f97 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
@@ -60,7 +60,7 @@ import org.apache.hadoop.util.StringUtils;
  * Examples of how to use the block index writer can be found in
  * {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilterWriter} and
  *  {@link HFileWriterImpl}. Examples of how to use the reader can be
- *  found in {@link HFileReaderImpl} and
+ *  found in {@link HFileWriterImpl} and
  *  {@link org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex}.
  */
 @InterfaceAudience.Private

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java
index d71911f..8f5040e 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java
@@ -252,20 +252,18 @@ public class HFileReaderImpl implements HFile.Reader, 
Configurable {
           long end = 0;
           try {
             end = getTrailer().getLoadOnOpenDataOffset();
+            HFileBlock prevBlock = null;
             if (LOG.isTraceEnabled()) {
               LOG.trace("File=" + path.toString() + ", offset=" + offset + ", 
end=" + end);
             }
-            // TODO: Could we use block iterator in here? Would that get stuff 
into the cache?
-            HFileBlock prevBlock = null;
             while (offset < end) {
               if (Thread.interrupted()) {
                 break;
               }
-              // Perhaps we got our block from cache? Unlikely as this may be, 
if it happens, then
-              // the internal-to-hfileblock thread local which holds the 
overread that gets the
-              // next header, will not have happened...so, pass in the 
onDiskSize gotten from the
-              // cached block. This 'optimization' triggers extremely rarely 
I'd say.
-              long onDiskSize = prevBlock != null? 
prevBlock.getNextBlockOnDiskSize(): -1;
+              long onDiskSize = -1;
+              if (prevBlock != null) {
+                onDiskSize = prevBlock.getNextBlockOnDiskSizeWithHeader();
+              }
               HFileBlock block = readBlock(offset, onDiskSize, true, false, 
false, false,
                 null, null);
               // Need not update the current block. Ideally here the readBlock 
won't find the
@@ -905,8 +903,9 @@ public class HFileReaderImpl implements HFile.Reader, 
Configurable {
 
         // We are reading the next block without block type validation, because
         // it might turn out to be a non-data block.
-        block = reader.readBlock(block.getOffset() + 
block.getOnDiskSizeWithHeader(),
-            block.getNextBlockOnDiskSize(), cacheBlocks, pread,
+        block = reader.readBlock(block.getOffset()
+            + block.getOnDiskSizeWithHeader(),
+            block.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
             isCompaction, true, null, getEffectiveDataBlockEncoding());
         if (block != null && !block.getBlockType().isData()) { // Findbugs: 
NP_NULL_ON_SOME_PATH
           // Whatever block we read we will be returning it unless
@@ -1440,8 +1439,8 @@ public class HFileReaderImpl implements HFile.Reader, 
Configurable {
         // Cache Miss, please load.
       }
 
-      HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset, 
blockSize, true).
-          unpack(hfileContext, fsBlockReader);
+      HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
+          blockSize, -1, true).unpack(hfileContext, fsBlockReader);
 
       // Cache the block
       if (cacheBlock) {
@@ -1527,8 +1526,8 @@ public class HFileReaderImpl implements HFile.Reader, 
Configurable {
           traceScope.getSpan().addTimelineAnnotation("blockCacheMiss");
         }
         // Load block from filesystem.
-        HFileBlock hfileBlock =
-            fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, 
pread);
+        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, 
onDiskBlockSize, -1,
+            pread);
         validateBlockType(hfileBlock, expectedBlockType);
         HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
         BlockType.BlockCategory category = 
hfileBlock.getBlockType().getCategory();
@@ -1872,7 +1871,6 @@ public class HFileReaderImpl implements HFile.Reader, 
Configurable {
    * @return Scanner on this file.
    */
   @Override
-  @VisibleForTesting
   public HFileScanner getScanner(boolean cacheBlocks, final boolean pread) {
     return getScanner(cacheBlocks, pread, false);
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java
index e0f3d74..c67bdd4 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java
@@ -99,21 +99,18 @@ public interface HFileScanner extends Shipper, Closeable {
    * @throws IOException
    */
   boolean seekTo() throws IOException;
-
   /**
    * Scans to the next entry in the file.
    * @return Returns false if you are at the end otherwise true if more in 
file.
    * @throws IOException
    */
   boolean next() throws IOException;
-
   /**
    * Gets the current key in the form of a cell. You must call
    * {@link #seekTo(Cell)} before this method.
    * @return gets the current key as a Cell.
    */
   Cell getKey();
-
   /**
    * Gets a buffer view to the current value.  You must call
    * {@link #seekTo(Cell)} before this method.
@@ -122,35 +119,26 @@ public interface HFileScanner extends Shipper, Closeable {
    * the position is 0, the start of the buffer view.
    */
   ByteBuffer getValue();
-
   /**
    * @return Instance of {@link org.apache.hadoop.hbase.Cell}.
    */
   Cell getCell();
-
   /**
    * Convenience method to get a copy of the key as a string - interpreting the
    * bytes as UTF8. You must call {@link #seekTo(Cell)} before this method.
    * @return key as a string
-   * @deprecated Since hbase-2.0.0
    */
-  @Deprecated
   String getKeyString();
-
   /**
    * Convenience method to get a copy of the value as a string - interpreting
    * the bytes as UTF8. You must call {@link #seekTo(Cell)} before this method.
    * @return value as a string
-   * @deprecated Since hbase-2.0.0
    */
-  @Deprecated
   String getValueString();
-
   /**
    * @return Reader that underlies this Scanner instance.
    */
   HFile.Reader getReader();
-
   /**
    * @return True is scanner has had one of the seek calls invoked; i.e.
    * {@link #seekBefore(Cell)} or {@link #seekTo()} or {@link #seekTo(Cell)}.

[14/50] [abbrv] hbase git commit: Revert "HBASE-15477 Purge 'next block header' from cached blocks"

Reply via email to