Revert "HBASE-15477 Purge 'next block header' from cached blocks"
Overcommit. Revert to fix. This reverts commit 000117ad9fd7eb59074c9bb0da2cf1f9544d4bed. Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/54a543de Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/54a543de Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/54a543de Branch: refs/heads/hbase-12439 Commit: 54a543de229b358b438c920f04f7f2d1ff767cab Parents: 3f3613a Author: stack <[email protected]> Authored: Tue Mar 22 18:37:25 2016 -0700 Committer: stack <[email protected]> Committed: Tue Mar 22 18:37:25 2016 -0700 ---------------------------------------------------------------------- .../apache/hadoop/hbase/io/hfile/BlockType.java | 4 - .../hbase/io/hfile/HFileContextBuilder.java | 20 - .../org/apache/hadoop/hbase/nio/ByteBuff.java | 6 - .../hbase/io/hfile/MemcachedBlockCache.java | 2 +- .../hadoop/hbase/io/hfile/ChecksumUtil.java | 5 +- .../hadoop/hbase/io/hfile/HFileBlock.java | 997 ++++++++++--------- .../hadoop/hbase/io/hfile/HFileBlockIndex.java | 2 +- .../hadoop/hbase/io/hfile/HFileReaderImpl.java | 26 +- .../hadoop/hbase/io/hfile/HFileScanner.java | 12 - .../hbase/io/hfile/bucket/BucketCache.java | 15 +- .../hbase/regionserver/KeyValueScanner.java | 12 +- .../hadoop/hbase/regionserver/StoreFile.java | 4 +- .../hadoop/hbase/io/hfile/CacheTestUtils.java | 23 +- .../hadoop/hbase/io/hfile/TestCacheOnWrite.java | 10 +- .../hadoop/hbase/io/hfile/TestChecksum.java | 27 +- .../hadoop/hbase/io/hfile/TestHFileBlock.java | 27 +- .../io/hfile/TestHFileBlockCompatibility.java | 750 ++++++++++++++ .../hbase/io/hfile/TestHFileBlockIndex.java | 3 +- .../io/hfile/TestHFileDataBlockEncoder.java | 10 +- .../hbase/io/hfile/TestHFileEncryption.java | 2 +- .../hbase/io/hfile/TestHFileWriterV3.java | 7 +- .../hadoop/hbase/io/hfile/TestPrefetch.java | 9 +- .../regionserver/TestCacheOnWriteInSchema.java | 8 +- 23 files changed, 1374 insertions(+), 607 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java ---------------------------------------------------------------------- diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java index 32eb0b2..4228f57 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java @@ -132,10 +132,6 @@ public enum BlockType { out.write(magic); } - public void write(ByteBuffer buf) { - buf.put(magic); - } - public void write(ByteBuff buf) { buf.put(magic); } http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java ---------------------------------------------------------------------- diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java index a6645a6..6d3bb13 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileContextBuilder.java @@ -55,26 +55,6 @@ public class HFileContextBuilder { private String hfileName = null; - public HFileContextBuilder() {} - - /** - * Use this constructor if you want to change a few settings only in another context. - */ - public HFileContextBuilder(final HFileContext hfc) { - this.usesHBaseChecksum = hfc.isUseHBaseChecksum(); - this.includesMvcc = hfc.isIncludesMvcc(); - this.includesTags = hfc.isIncludesTags(); - this.compression = hfc.getCompression(); - this.compressTags = hfc.isCompressTags(); - this.checksumType = hfc.getChecksumType(); - this.bytesPerChecksum = hfc.getBytesPerChecksum(); - this.blocksize = hfc.getBlocksize(); - this.encoding = hfc.getDataBlockEncoding(); - this.cryptoContext = hfc.getEncryptionContext(); - this.fileCreateTime = hfc.getFileCreateTime(); - this.hfileName = hfc.getHFileName(); - } - public HFileContextBuilder withHBaseCheckSum(boolean useHBaseCheckSum) { this.usesHBaseChecksum = useHBaseCheckSum; return this; http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java ---------------------------------------------------------------------- diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java index 183a031..1e0e957 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/ByteBuff.java @@ -496,12 +496,6 @@ public abstract class ByteBuff { return -(low + 1); // key not found. } - @Override - public String toString() { - return this.getClass().getSimpleName() + "[pos=" + position() + ", lim=" + limit() + - ", cap= " + capacity() + "]"; - } - public static String toStringBinary(final ByteBuff b, int off, int len) { StringBuilder result = new StringBuilder(); // Just in case we are passed a 'len' that is > buffer length... http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java ---------------------------------------------------------------------- diff --git a/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java b/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java index ae871c4..536872e 100644 --- a/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java +++ b/hbase-external-blockcache/src/main/java/org/apache/hadoop/hbase/io/hfile/MemcachedBlockCache.java @@ -260,7 +260,7 @@ public class MemcachedBlockCache implements BlockCache { public HFileBlock decode(CachedData d) { try { ByteBuff buf = new SingleByteBuff(ByteBuffer.wrap(d.getData())); - return (HFileBlock) HFileBlock.BLOCK_DESERIALIZER.deserialize(buf, true, + return (HFileBlock) HFileBlock.blockDeserializer.deserialize(buf, true, MemoryType.EXCLUSIVE); } catch (IOException e) { LOG.warn("Error deserializing data from memcached",e); http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java index b0b1714..69f4330 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/ChecksumUtil.java @@ -91,7 +91,7 @@ public class ChecksumUtil { // If this is an older version of the block that does not have // checksums, then return false indicating that checksum verification - // did not succeed. Actually, this method should never be called + // did not succeed. Actually, this methiod should never be called // when the minorVersion is 0, thus this is a defensive check for a // cannot-happen case. Since this is a cannot-happen case, it is // better to return false to indicate a checksum validation failure. @@ -141,7 +141,8 @@ public class ChecksumUtil { * @return The number of bytes needed to store the checksum values */ static long numBytes(long datasize, int bytesPerChecksum) { - return numChunks(datasize, bytesPerChecksum) * HFileBlock.CHECKSUM_SIZE; + return numChunks(datasize, bytesPerChecksum) * + HFileBlock.CHECKSUM_SIZE; } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java index f3402da..6268f2e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java @@ -56,131 +56,50 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; /** - * Reads {@link HFile} version 2 blocks to HFiles and via {@link Cacheable} Interface to caches. - * Version 2 was introduced in hbase-0.92.0. No longer has support for version 1 blocks since - * hbase-1.3.0. - * - * <p>Version 1 was the original file block. Version 2 was introduced when we changed the hbase file - * format to support multi-level block indexes and compound bloom filters (HBASE-3857). + * Reads {@link HFile} version 1 and version 2 blocks but writes version 2 blocks only. + * Version 2 was introduced in hbase-0.92.0. Does read and write out to the filesystem but also + * the read and write to Cache. * + * <h3>HFileBlock: Version 1</h3> + * As of this writing, there should be no more version 1 blocks found out in the wild. Version 2 + * as introduced in hbase-0.92.0. + * In version 1 all blocks are always compressed or uncompressed, as + * specified by the {@link HFile}'s compression algorithm, with a type-specific + * magic record stored in the beginning of the compressed data (i.e. one needs + * to uncompress the compressed block to determine the block type). There is + * only a single compression algorithm setting for all blocks. Offset and size + * information from the block index are required to read a block. * <h3>HFileBlock: Version 2</h3> * In version 2, a block is structured as follows: * <ul> - * <li><b>Header:</b> See Writer#putHeader() for where header is written; header total size is - * HFILEBLOCK_HEADER_SIZE + * <li><b>Header:</b> See Writer#putHeader(); header total size is HFILEBLOCK_HEADER_SIZE) * <ul> - * <li>0. blockType: Magic record identifying the {@link BlockType} (8 bytes): - * e.g. <code>DATABLK*</code> - * <li>1. onDiskSizeWithoutHeader: Compressed -- a.k.a 'on disk' -- block size, excluding header, - * but including tailing checksum bytes (4 bytes) - * <li>2. uncompressedSizeWithoutHeader: Uncompressed block size, excluding header, and excluding - * checksum bytes (4 bytes) - * <li>3. prevBlockOffset: The offset of the previous block of the same type (8 bytes). This is + * <li>Magic record identifying the {@link BlockType} (8 bytes): e.g. <code>DATABLK*</code> + * <li>Compressed -- a.k.a 'on disk' -- block size, excluding header, but including + * tailing checksum bytes (4 bytes) + * <li>Uncompressed block size, excluding header, and excluding checksum bytes (4 bytes) + * <li>The offset of the previous block of the same type (8 bytes). This is * used to navigate to the previous block without having to go to the block index - * <li>4: For minorVersions >=1, the ordinal describing checksum type (1 byte) - * <li>5: For minorVersions >=1, the number of data bytes/checksum chunk (4 bytes) - * <li>6: onDiskDataSizeWithHeader: For minorVersions >=1, the size of data 'on disk', including - * header, excluding checksums (4 bytes) + * <li>For minorVersions >=1, the ordinal describing checksum type (1 byte) + * <li>For minorVersions >=1, the number of data bytes/checksum chunk (4 bytes) + * <li>For minorVersions >=1, the size of data 'on disk', including header, + * excluding checksums (4 bytes) * </ul> * </li> - * <li><b>Raw/Compressed/Encrypted/Encoded data:</b> The compression - * algorithm is the same for all the blocks in an {@link HFile}. If compression is NONE, this is - * just raw, serialized Cells. + * <li><b>Raw/Compressed/Encrypted/Encoded data:</b> The compression algorithm is the + * same for all the blocks in the {@link HFile}, similarly to what was done in + * version 1. If compression is NONE, this is just raw, serialized Cells. * <li><b>Tail:</b> For minorVersions >=1, a series of 4 byte checksums, one each for * the number of bytes specified by bytesPerChecksum. * </ul> - * - * <h3>Caching</h3> - * Caches cache whole blocks with trailing checksums if any. We then tag on some metadata, the - * content of BLOCK_METADATA_SPACE which will be flag on if we are doing 'hbase' - * checksums and then the offset into the file which is needed when we re-make a cache key - * when we return the block to the cache as 'done'. See {@link Cacheable#serialize(ByteBuffer)} and - * {@link Cacheable#getDeserializer()}. - * - * <p>TODO: Should we cache the checksums? Down in Writer#getBlockForCaching(CacheConfig) where - * we make a block to cache-on-write, there is an attempt at turning off checksums. This is not the - * only place we get blocks to cache. We also will cache the raw return from an hdfs read. In this - * case, the checksums may be present. If the cache is backed by something that doesn't do ECC, - * say an SSD, we might want to preserve checksums. For now this is open question. - * <p>TODO: Over in BucketCache, we save a block allocation by doing a custom serialization. - * Be sure to change it if serialization changes in here. Could we add a method here that takes an - * IOEngine and that then serializes to it rather than expose our internals over in BucketCache? - * IOEngine is in the bucket subpackage. Pull it up? Then this class knows about bucketcache. Ugh. + * <p>Be aware that when we read from HDFS, we overread pulling in the next blocks' header too. + * We do this to save having to do two seeks to read an HFileBlock; a seek to read the header + * to figure lengths, etc., and then another seek to pull in the data. */ @InterfaceAudience.Private public class HFileBlock implements Cacheable { private static final Log LOG = LogFactory.getLog(HFileBlock.class); - /** Type of block. Header field 0. */ - private BlockType blockType; - - /** - * Size on disk excluding header, including checksum. Header field 1. - * @see Writer#putHeader(byte[], int, int, int, int) - */ - private int onDiskSizeWithoutHeader; - - /** - * Size of pure data. Does not include header or checksums. Header field 2. - * @see Writer#putHeader(byte[], int, int, int, int) - */ - private int uncompressedSizeWithoutHeader; - - /** - * The offset of the previous block on disk. Header field 3. - * @see Writer#putHeader(byte[], int, int, int, int) - */ - private long prevBlockOffset; - - /** - * Size on disk of header + data. Excludes checksum. Header field 6, - * OR calculated from {@link #onDiskSizeWithoutHeader} when using HDFS checksum. - * @see Writer#putHeader(byte[], int, int, int, int) - */ - private int onDiskDataSizeWithHeader; - - - /** - * The in-memory representation of the hfile block. Can be on or offheap. Can be backed by - * a single ByteBuffer or by many. Make no assumptions. - * - * <p>Be careful reading from this <code>buf</code>. Duplicate and work on the duplicate or if - * not, be sure to reset position and limit else trouble down the road. - * - * <p>TODO: Make this read-only once made. - * - * <p>We are using the ByteBuff type. ByteBuffer is not extensible yet we need to be able to have - * a ByteBuffer-like API across multiple ByteBuffers reading from a cache such as BucketCache. - * So, we have this ByteBuff type. Unfortunately, it is spread all about HFileBlock. Would be - * good if could be confined to cache-use only but hard-to-do. - */ - private ByteBuff buf; - - /** Meta data that holds meta information on the hfileblock. - */ - private HFileContext fileContext; - - /** - * The offset of this block in the file. Populated by the reader for - * convenience of access. This offset is not part of the block header. - */ - private long offset = UNSET; - - private MemoryType memType = MemoryType.EXCLUSIVE; - - /** - * The on-disk size of the next block, including the header and checksums if present, obtained by - * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the next block's - * header, or UNSET if unknown. - * - * Blocks try to carry the size of the next block to read in this data member. They will even have - * this value when served from cache. Could save a seek in the case where we are iterating through - * a file and some of the blocks come from cache. If from cache, then having this info to hand - * will save us doing a seek to read the header so we can read the body of a block. - * TODO: see how effective this is at saving seeks. - */ - private int nextBlockOnDiskSize = UNSET; - /** * On a checksum failure, do these many succeeding read requests using hdfs checksums before * auto-reenabling hbase checksum verification. @@ -196,18 +115,14 @@ public class HFileBlock implements Cacheable { (int)ClassSize.estimateBase(MultiByteBuff.class, false); /** - * Space for metadata on a block that gets stored along with the block when we cache it. - * There are a few bytes stuck on the end of the HFileBlock that we pull in from HDFS (note, + * See #blockDeserializer method for more info. + * 13 bytes of extra stuff stuck on the end of the HFileBlock that we pull in from HDFS (note, * when we read from HDFS, we pull in an HFileBlock AND the header of the next block if one). - * 8 bytes are offset of this block (long) in the file. Offset is important because - * used when we remake the CacheKey when we return the block to cache when done. There is also - * a flag on whether checksumming is being done by hbase or not. See class comment for note on - * uncertain state of checksumming of blocks that come out of cache (should we or should we not?). - * Finally there 4 bytes to hold the length of the next block which can save a seek on occasion. - * <p>This EXTRA came in with original commit of the bucketcache, HBASE-7404. Was formerly - * known as EXTRA_SERIALIZATION_SPACE. + * The 13 bytes are: usesHBaseChecksum (1 byte) + offset of this block (long) + + * nextBlockOnDiskSizeWithHeader (int). */ - static final int BLOCK_METADATA_SPACE = Bytes.SIZEOF_BYTE + Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT; + public static final int EXTRA_SERIALIZATION_SPACE = + Bytes.SIZEOF_BYTE + Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG; /** * Each checksum value is an integer that can be stored in 4 bytes. @@ -220,47 +135,57 @@ public class HFileBlock implements Cacheable { /** * Used deserializing blocks from Cache. * - * <code> + * Serializing to cache is a little hard to follow. See Writer#finishBlock for where it is done. + * When we start to append to a new HFileBlock, + * we skip over where the header should go before we start adding Cells. When the block is + * done, we'll then go back and fill in the header and the checksum tail. Be aware that what + * gets serialized into the blockcache is a byte array that contains an HFileBlock followed by + * its checksums and then the header of the next HFileBlock (needed to help navigate), followed + * again by an extra 13 bytes of meta info needed when time to recreate the HFileBlock from cache. + * * ++++++++++++++ * + HFileBlock + * ++++++++++++++ - * + Checksums + <= Optional + * + Checksums + + * ++++++++++++++ + * + NextHeader + * ++++++++++++++ - * + Metadata! + + * + ExtraMeta! + * ++++++++++++++ - * </code> - * @see #serialize(ByteBuffer) + * + * TODO: Fix it so we do NOT put the NextHeader into blockcache. It is not necessary. */ - static final CacheableDeserializer<Cacheable> BLOCK_DESERIALIZER = + static final CacheableDeserializer<Cacheable> blockDeserializer = new CacheableDeserializer<Cacheable>() { public HFileBlock deserialize(ByteBuff buf, boolean reuse, MemoryType memType) throws IOException { - // The buf has the file block followed by block metadata. - // Set limit to just before the BLOCK_METADATA_SPACE then rewind. - buf.limit(buf.limit() - BLOCK_METADATA_SPACE).rewind(); - // Get a new buffer to pass the HFileBlock for it to 'own'. - ByteBuff newByteBuff; + // Rewind to just before the EXTRA_SERIALIZATION_SPACE. + buf.limit(buf.limit() - HFileBlock.EXTRA_SERIALIZATION_SPACE).rewind(); + // Get a new buffer to pass the deserialized HFileBlock for it to 'own'. + ByteBuff newByteBuffer; if (reuse) { - newByteBuff = buf.slice(); + newByteBuffer = buf.slice(); } else { int len = buf.limit(); - newByteBuff = new SingleByteBuff(ByteBuffer.allocate(len)); - newByteBuff.put(0, buf, buf.position(), len); + newByteBuffer = new SingleByteBuff(ByteBuffer.allocate(len)); + newByteBuffer.put(0, buf, buf.position(), len); } - // Read out the BLOCK_METADATA_SPACE content and shove into our HFileBlock. + // Read out the EXTRA_SERIALIZATION_SPACE content and shove into our HFileBlock. buf.position(buf.limit()); - buf.limit(buf.limit() + HFileBlock.BLOCK_METADATA_SPACE); + buf.limit(buf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE); boolean usesChecksum = buf.get() == (byte)1; - long offset = buf.getLong(); - int nextBlockOnDiskSize = buf.getInt(); - HFileBlock hFileBlock = - new HFileBlock(newByteBuff, usesChecksum, memType, offset, nextBlockOnDiskSize, null); + HFileBlock hFileBlock = new HFileBlock(newByteBuffer, usesChecksum, memType); + hFileBlock.offset = buf.getLong(); + hFileBlock.nextBlockOnDiskSizeWithHeader = buf.getInt(); + if (hFileBlock.hasNextBlockHeader()) { + hFileBlock.buf.limit(hFileBlock.buf.limit() - hFileBlock.headerSize()); + } return hFileBlock; } @Override public int getDeserialiserIdentifier() { - return DESERIALIZER_IDENTIFIER; + return deserializerIdentifier; } @Override @@ -270,36 +195,65 @@ public class HFileBlock implements Cacheable { } }; - private static final int DESERIALIZER_IDENTIFIER; + private static final int deserializerIdentifier; static { - DESERIALIZER_IDENTIFIER = - CacheableDeserializerIdManager.registerDeserializer(BLOCK_DESERIALIZER); + deserializerIdentifier = CacheableDeserializerIdManager + .registerDeserializer(blockDeserializer); } + /** Type of block. Header field 0. */ + private BlockType blockType; + /** - * Copy constructor. Creates a shallow copy of {@code that}'s buffer. + * Size on disk excluding header, including checksum. Header field 1. + * @see Writer#putHeader(byte[], int, int, int, int) */ - private HFileBlock(HFileBlock that) { - this.blockType = that.blockType; - this.onDiskSizeWithoutHeader = that.onDiskSizeWithoutHeader; - this.uncompressedSizeWithoutHeader = that.uncompressedSizeWithoutHeader; - this.prevBlockOffset = that.prevBlockOffset; - this.buf = that.buf.duplicate(); - this.offset = that.offset; - this.onDiskDataSizeWithHeader = that.onDiskDataSizeWithHeader; - this.fileContext = that.fileContext; - this.nextBlockOnDiskSize = that.nextBlockOnDiskSize; - } + private int onDiskSizeWithoutHeader; + + /** + * Size of pure data. Does not include header or checksums. Header field 2. + * @see Writer#putHeader(byte[], int, int, int, int) + */ + private final int uncompressedSizeWithoutHeader; + + /** + * The offset of the previous block on disk. Header field 3. + * @see Writer#putHeader(byte[], int, int, int, int) + */ + private final long prevBlockOffset; + + /** + * Size on disk of header + data. Excludes checksum. Header field 6, + * OR calculated from {@link #onDiskSizeWithoutHeader} when using HDFS checksum. + * @see Writer#putHeader(byte[], int, int, int, int) + */ + private final int onDiskDataSizeWithHeader; + + /** The in-memory representation of the hfile block */ + private ByteBuff buf; + + /** Meta data that holds meta information on the hfileblock */ + private HFileContext fileContext; + + /** + * The offset of this block in the file. Populated by the reader for + * convenience of access. This offset is not part of the block header. + */ + private long offset = UNSET; + + /** + * The on-disk size of the next block, including the header, obtained by + * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the next block's + * header, or -1 if unknown. + */ + private int nextBlockOnDiskSizeWithHeader = UNSET; + + private MemoryType memType = MemoryType.EXCLUSIVE; /** * Creates a new {@link HFile} block from the given fields. This constructor * is used when the block data has already been read and uncompressed, - * and is sitting in a byte buffer and we want to stuff the block into cache. - * See {@link Writer#getBlockForCaching(CacheConfig)}. - * - * <p>TODO: The caller presumes no checksumming - * required of this block instance since going into cache; checksum already verified on - * underlying block data pulled in from filesystem. Is that correct? What if cache is SSD? + * and is sitting in a byte buffer. * * @param blockType the type of this block, see {@link BlockType} * @param onDiskSizeWithoutHeader see {@link #onDiskSizeWithoutHeader} @@ -313,94 +267,86 @@ public class HFileBlock implements Cacheable { * @param fileContext HFile meta data */ HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader, int uncompressedSizeWithoutHeader, - long prevBlockOffset, ByteBuffer b, boolean fillHeader, long offset, - final int nextBlockOnDiskSize, int onDiskDataSizeWithHeader, HFileContext fileContext) { - init(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader, - prevBlockOffset, offset, onDiskDataSizeWithHeader, nextBlockOnDiskSize, fileContext); - this.buf = new SingleByteBuff(b); + long prevBlockOffset, ByteBuff buf, boolean fillHeader, long offset, + int onDiskDataSizeWithHeader, HFileContext fileContext) { + this.blockType = blockType; + this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader; + this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader; + this.prevBlockOffset = prevBlockOffset; + this.buf = buf; + this.offset = offset; + this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader; + this.fileContext = fileContext; if (fillHeader) { overwriteHeader(); } this.buf.rewind(); } - /** - * Creates a block from an existing buffer starting with a header. Rewinds - * and takes ownership of the buffer. By definition of rewind, ignores the - * buffer position, but if you slice the buffer beforehand, it will rewind - * to that point. - * @param buf Has header, content, and trailing checksums if present. - */ - HFileBlock(ByteBuff buf, boolean usesHBaseChecksum, MemoryType memType, final long offset, - final int nextBlockOnDiskSize, HFileContext fileContext) throws IOException { - buf.rewind(); - final BlockType blockType = BlockType.read(buf); - final int onDiskSizeWithoutHeader = buf.getInt(); - final int uncompressedSizeWithoutHeader = buf.getInt(); - final long prevBlockOffset = buf.getLong(); - byte checksumType = buf.get(); - int bytesPerChecksum = buf.getInt(); - int onDiskDataSizeWithHeader = buf.getInt(); - // This constructor is called when we deserialize a block from cache and when we read a block in - // from the fs. fileCache is null when deserialized from cache so need to make up one. - HFileContextBuilder fileContextBuilder = fileContext != null? - new HFileContextBuilder(fileContext): new HFileContextBuilder(); - fileContextBuilder.withHBaseCheckSum(usesHBaseChecksum); - if (usesHBaseChecksum) { - // Use the checksum type and bytes per checksum from header, not from filecontext. - fileContextBuilder.withChecksumType(ChecksumType.codeToType(checksumType)); - fileContextBuilder.withBytesPerCheckSum(bytesPerChecksum); - } else { - fileContextBuilder.withChecksumType(ChecksumType.NULL); - fileContextBuilder.withBytesPerCheckSum(0); - // Need to fix onDiskDataSizeWithHeader; there are not checksums after-block-data - onDiskDataSizeWithHeader = onDiskSizeWithoutHeader + headerSize(usesHBaseChecksum); - } - fileContext = fileContextBuilder.build(); - assert usesHBaseChecksum == fileContext.isUseHBaseChecksum(); - init(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader, - prevBlockOffset, offset, onDiskDataSizeWithHeader, nextBlockOnDiskSize, fileContext); - this.memType = memType; - this.offset = offset; - this.buf = buf; - this.buf.rewind(); + HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader, int uncompressedSizeWithoutHeader, + long prevBlockOffset, ByteBuffer buf, boolean fillHeader, long offset, + int onDiskDataSizeWithHeader, HFileContext fileContext) { + this(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader, prevBlockOffset, + new SingleByteBuff(buf), fillHeader, offset, onDiskDataSizeWithHeader, fileContext); } /** - * Called from constructors. + * Copy constructor. Creates a shallow copy of {@code that}'s buffer. */ - private void init(BlockType blockType, int onDiskSizeWithoutHeader, - int uncompressedSizeWithoutHeader, long prevBlockOffset, - long offset, int onDiskDataSizeWithHeader, final int nextBlockOnDiskSize, - HFileContext fileContext) { - this.blockType = blockType; - this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader; - this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader; - this.prevBlockOffset = prevBlockOffset; - this.offset = offset; - this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader; - this.nextBlockOnDiskSize = nextBlockOnDiskSize; - this.fileContext = fileContext; + HFileBlock(HFileBlock that) { + this.blockType = that.blockType; + this.onDiskSizeWithoutHeader = that.onDiskSizeWithoutHeader; + this.uncompressedSizeWithoutHeader = that.uncompressedSizeWithoutHeader; + this.prevBlockOffset = that.prevBlockOffset; + this.buf = that.buf.duplicate(); + this.offset = that.offset; + this.onDiskDataSizeWithHeader = that.onDiskDataSizeWithHeader; + this.fileContext = that.fileContext; + this.nextBlockOnDiskSizeWithHeader = that.nextBlockOnDiskSizeWithHeader; + } + + HFileBlock(ByteBuffer b, boolean usesHBaseChecksum) throws IOException { + this(new SingleByteBuff(b), usesHBaseChecksum); } /** - * Parse total ondisk size including header and checksum. Its second field in header after - * the magic bytes. - * @param headerBuf Header ByteBuffer. Presumed exact size of header. - * @return Size of the block with header included. + * Creates a block from an existing buffer starting with a header. Rewinds + * and takes ownership of the buffer. By definition of rewind, ignores the + * buffer position, but if you slice the buffer beforehand, it will rewind + * to that point. */ - private static int getOnDiskSizeWithHeader(final ByteBuffer headerBuf) { - // Set hbase checksum to true always calling headerSize. - return headerBuf.getInt(BlockType.MAGIC_LENGTH) + headerSize(true); + HFileBlock(ByteBuff b, boolean usesHBaseChecksum) throws IOException { + this(b, usesHBaseChecksum, MemoryType.EXCLUSIVE); } /** - * @return the on-disk size of the next block (including the header size and any checksums if - * present) read by peeking into the next block's header; use as a hint when doing - * a read of the next block when scanning or running over a file. + * Creates a block from an existing buffer starting with a header. Rewinds + * and takes ownership of the buffer. By definition of rewind, ignores the + * buffer position, but if you slice the buffer beforehand, it will rewind + * to that point. */ - public int getNextBlockOnDiskSize() { - return nextBlockOnDiskSize; + HFileBlock(ByteBuff b, boolean usesHBaseChecksum, MemoryType memType) throws IOException { + b.rewind(); + blockType = BlockType.read(b); + onDiskSizeWithoutHeader = b.getInt(); + uncompressedSizeWithoutHeader = b.getInt(); + prevBlockOffset = b.getLong(); + HFileContextBuilder contextBuilder = new HFileContextBuilder(); + contextBuilder.withHBaseCheckSum(usesHBaseChecksum); + if (usesHBaseChecksum) { + contextBuilder.withChecksumType(ChecksumType.codeToType(b.get())); + contextBuilder.withBytesPerCheckSum(b.getInt()); + this.onDiskDataSizeWithHeader = b.getInt(); + } else { + contextBuilder.withChecksumType(ChecksumType.NULL); + contextBuilder.withBytesPerCheckSum(0); + this.onDiskDataSizeWithHeader = + onDiskSizeWithoutHeader + HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM; + } + this.fileContext = contextBuilder.build(); + this.memType = memType; + buf = b; + buf.rewind(); } public BlockType getBlockType() { @@ -468,26 +414,49 @@ public class HFileBlock implements Cacheable { * @return the buffer with header skipped and checksum omitted. */ public ByteBuff getBufferWithoutHeader() { - ByteBuff dup = getBufferReadOnly(); - // Now set it up so Buffer spans content only -- no header or no checksums. - return dup.position(headerSize()).limit(buf.limit() - totalChecksumBytes()).slice(); + ByteBuff dup = this.buf.duplicate(); + dup.position(headerSize()); + dup.limit(buf.limit() - totalChecksumBytes()); + return dup.slice(); } /** - * Returns a read-only duplicate of the buffer this block stores internally ready to be read. - * Clients must not modify the buffer object though they may set position and limit on the - * returned buffer since we pass back a duplicate. This method has to be public because it is used + * Returns the buffer this block stores internally. The clients must not + * modify the buffer object. This method has to be public because it is used * in {@link CompoundBloomFilter} to avoid object creation on every Bloom - * filter lookup, but has to be used with caution. Buffer holds header, block content, - * and any follow-on checksums if present. + * filter lookup, but has to be used with caution. Checksum data is not + * included in the returned buffer but header data is. * * @return the buffer of this block for read-only operations */ - public ByteBuff getBufferReadOnly() { - // TODO: ByteBuf does not support asReadOnlyBuffer(). Fix. + ByteBuff getBufferReadOnly() { + ByteBuff dup = this.buf.duplicate(); + dup.limit(buf.limit() - totalChecksumBytes()); + return dup.slice(); + } + + /** + * Returns the buffer of this block, including header data. The clients must + * not modify the buffer object. This method has to be public because it is + * used in {@link org.apache.hadoop.hbase.io.hfile.bucket.BucketCache} to avoid buffer copy. + * + * @return the buffer with header and checksum included for read-only operations + */ + public ByteBuff getBufferReadOnlyWithHeader() { ByteBuff dup = this.buf.duplicate(); - assert dup.position() == 0; - return dup; + return dup.slice(); + } + + /** + * Returns a byte buffer of this block, including header data and checksum, positioned at + * the beginning of header. The underlying data array is not copied. + * + * @return the byte buffer with header and checksum included + */ + ByteBuff getBufferWithHeader() { + ByteBuff dupBuf = buf.duplicate(); + dupBuf.rewind(); + return dupBuf; } private void sanityCheckAssertion(long valueFromBuf, long valueFromField, @@ -512,38 +481,39 @@ public class HFileBlock implements Cacheable { * valid header consistent with the fields. Assumes a packed block structure. * This function is primary for testing and debugging, and is not * thread-safe, because it alters the internal buffer pointer. - * Used by tests only. */ - @VisibleForTesting void sanityCheck() throws IOException { - // Duplicate so no side-effects - ByteBuff dup = this.buf.duplicate().rewind(); - sanityCheckAssertion(BlockType.read(dup), blockType); + buf.rewind(); + + sanityCheckAssertion(BlockType.read(buf), blockType); - sanityCheckAssertion(dup.getInt(), onDiskSizeWithoutHeader, "onDiskSizeWithoutHeader"); + sanityCheckAssertion(buf.getInt(), onDiskSizeWithoutHeader, + "onDiskSizeWithoutHeader"); - sanityCheckAssertion(dup.getInt(), uncompressedSizeWithoutHeader, + sanityCheckAssertion(buf.getInt(), uncompressedSizeWithoutHeader, "uncompressedSizeWithoutHeader"); - sanityCheckAssertion(dup.getLong(), prevBlockOffset, "prevBlockOffset"); + sanityCheckAssertion(buf.getLong(), prevBlockOffset, "prevBlocKOffset"); if (this.fileContext.isUseHBaseChecksum()) { - sanityCheckAssertion(dup.get(), this.fileContext.getChecksumType().getCode(), "checksumType"); - sanityCheckAssertion(dup.getInt(), this.fileContext.getBytesPerChecksum(), + sanityCheckAssertion(buf.get(), this.fileContext.getChecksumType().getCode(), "checksumType"); + sanityCheckAssertion(buf.getInt(), this.fileContext.getBytesPerChecksum(), "bytesPerChecksum"); - sanityCheckAssertion(dup.getInt(), onDiskDataSizeWithHeader, "onDiskDataSizeWithHeader"); + sanityCheckAssertion(buf.getInt(), onDiskDataSizeWithHeader, "onDiskDataSizeWithHeader"); } int cksumBytes = totalChecksumBytes(); int expectedBufLimit = onDiskDataSizeWithHeader + cksumBytes; - if (dup.limit() != expectedBufLimit) { - throw new AssertionError("Expected limit " + expectedBufLimit + ", got " + dup.limit()); + if (buf.limit() != expectedBufLimit) { + throw new AssertionError("Expected buffer limit " + expectedBufLimit + + ", got " + buf.limit()); } // We might optionally allocate HFILEBLOCK_HEADER_SIZE more bytes to read the next // block's header, so there are two sensible values for buffer capacity. int hdrSize = headerSize(); - if (dup.capacity() != expectedBufLimit && dup.capacity() != expectedBufLimit + hdrSize) { - throw new AssertionError("Invalid buffer capacity: " + dup.capacity() + + if (buf.capacity() != expectedBufLimit && + buf.capacity() != expectedBufLimit + hdrSize) { + throw new AssertionError("Invalid buffer capacity: " + buf.capacity() + ", expected " + expectedBufLimit + " or " + (expectedBufLimit + hdrSize)); } } @@ -590,6 +560,30 @@ public class HFileBlock implements Cacheable { } /** + * Called after reading a block with provided onDiskSizeWithHeader. + */ + private void validateOnDiskSizeWithoutHeader(int expectedOnDiskSizeWithoutHeader) + throws IOException { + if (onDiskSizeWithoutHeader != expectedOnDiskSizeWithoutHeader) { + String dataBegin = null; + if (buf.hasArray()) { + dataBegin = Bytes.toStringBinary(buf.array(), buf.arrayOffset(), Math.min(32, buf.limit())); + } else { + ByteBuff bufDup = getBufferReadOnly(); + byte[] dataBeginBytes = new byte[Math.min(32, bufDup.limit() - bufDup.position())]; + bufDup.get(dataBeginBytes); + dataBegin = Bytes.toStringBinary(dataBeginBytes); + } + String blockInfoMsg = + "Block offset: " + offset + ", data starts with: " + dataBegin; + throw new IOException("On-disk size without header provided is " + + expectedOnDiskSizeWithoutHeader + ", but block " + + "header contains " + onDiskSizeWithoutHeader + ". " + + blockInfoMsg); + } + } + + /** * Retrieves the decompressed/decrypted view of this block. An encoded block remains in its * encoded structure. Internal structures are shared between instances where applicable. */ @@ -613,10 +607,33 @@ public class HFileBlock implements Cacheable { ctx.prepareDecoding(unpacked.getOnDiskSizeWithoutHeader(), unpacked.getUncompressedSizeWithoutHeader(), unpacked.getBufferWithoutHeader(), dup); + + // Preserve the next block's header bytes in the new block if we have them. + if (unpacked.hasNextBlockHeader()) { + // Both the buffers are limited till checksum bytes and avoid the next block's header. + // Below call to copyFromBufferToBuffer() will try positional read/write from/to buffers when + // any of the buffer is DBB. So we change the limit on a dup buffer. No copying just create + // new BB objects + ByteBuff inDup = this.buf.duplicate(); + inDup.limit(inDup.limit() + headerSize()); + ByteBuff outDup = unpacked.buf.duplicate(); + outDup.limit(outDup.limit() + unpacked.headerSize()); + outDup.put( + unpacked.headerSize() + unpacked.uncompressedSizeWithoutHeader + + unpacked.totalChecksumBytes(), inDup, this.onDiskDataSizeWithHeader, + unpacked.headerSize()); + } return unpacked; } /** + * Return true when this buffer includes next block's header. + */ + private boolean hasNextBlockHeader() { + return nextBlockOnDiskSizeWithHeader > 0; + } + + /** * Always allocates a new buffer of the correct size. Copies header bytes * from the existing buffer. Does not change header fields. * Reserve room to keep checksum bytes too. @@ -624,7 +641,8 @@ public class HFileBlock implements Cacheable { private void allocateBuffer() { int cksumBytes = totalChecksumBytes(); int headerSize = headerSize(); - int capacityNeeded = headerSize + uncompressedSizeWithoutHeader + cksumBytes; + int capacityNeeded = headerSize + uncompressedSizeWithoutHeader + + cksumBytes + (hasNextBlockHeader() ? headerSize : 0); // TODO we need consider allocating offheap here? ByteBuffer newBuf = ByteBuffer.allocate(capacityNeeded); @@ -652,8 +670,9 @@ public class HFileBlock implements Cacheable { } /** An additional sanity-check in case no compression or encryption is being used. */ - public void sanityCheckUncompressedSize() throws IOException { - if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + totalChecksumBytes()) { + public void assumeUncompressed() throws IOException { + if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + + totalChecksumBytes()) { throw new IOException("Using no compression but " + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", " + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader @@ -661,14 +680,11 @@ public class HFileBlock implements Cacheable { } } - /** - * Cannot be {@link #UNSET}. Must be a legitimate value. Used re-making the {@link CacheKey} when - * block is returned to the cache. - * @return the offset of this block in the file it was read from - */ + /** @return the offset of this block in the file it was read from */ long getOffset() { if (offset < 0) { - throw new IllegalStateException("HFile block offset not initialized properly"); + throw new IllegalStateException( + "HFile block offset not initialized properly"); } return offset; } @@ -728,6 +744,7 @@ public class HFileBlock implements Cacheable { // We could not read the "extra data", but that is OK. break; } + if (ret < 0) { throw new IOException("Premature EOF from inputStream (read " + "returned " + ret + ", was trying to read " + necessaryLen @@ -782,6 +799,14 @@ public class HFileBlock implements Cacheable { } /** + * @return the on-disk size of the next block (including the header size) + * that was read by peeking into the next block's header + */ + public int getNextBlockOnDiskSizeWithHeader() { + return nextBlockOnDiskSizeWithHeader; + } + + /** * Unified version 2 {@link HFile} block writer. The intended usage pattern * is as follows: * <ol> @@ -813,8 +838,8 @@ public class HFileBlock implements Cacheable { private HFileBlockDefaultEncodingContext defaultBlockEncodingCtx; /** - * The stream we use to accumulate data into a block in an uncompressed format. - * We reset this stream at the end of each block and reuse it. The + * The stream we use to accumulate data in uncompressed format for each + * block. We reset this stream at the end of each block and reuse it. The * header is written as the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes into this * stream. */ @@ -842,7 +867,7 @@ public class HFileBlock implements Cacheable { * if compression is turned on. It also includes the checksum data that * immediately follows the block data. (header + data + checksums) */ - private byte[] onDiskBlockBytesWithHeader; + private byte[] onDiskBytesWithHeader; /** * The size of the checksum data on disk. It is used only if data is @@ -859,7 +884,7 @@ public class HFileBlock implements Cacheable { * {@link org.apache.hadoop.hbase.HConstants#HFILEBLOCK_HEADER_SIZE}. * Does not store checksums. */ - private byte[] uncompressedBlockBytesWithHeader; + private byte[] uncompressedBytesWithHeader; /** * Current block's start offset in the {@link HFile}. Set in @@ -967,19 +992,18 @@ public class HFileBlock implements Cacheable { Preconditions.checkState(state != State.INIT, "Unexpected state: " + state); - if (state == State.BLOCK_READY) { + if (state == State.BLOCK_READY) return; - } // This will set state to BLOCK_READY. finishBlock(); } /** - * Finish up writing of the block. - * Flushes the compressing stream (if using compression), fills out the header, - * does any compression/encryption of bytes to flush out to disk, and manages - * the cache on write content, if applicable. Sets block write state to "block ready". + * An internal method that flushes the compressing stream (if using + * compression), serializes the header, and takes care of the separate + * uncompressed stream for caching on write, if applicable. Sets block + * write state to "block ready". */ private void finishBlock() throws IOException { if (blockType == BlockType.DATA) { @@ -988,40 +1012,41 @@ public class HFileBlock implements Cacheable { blockType = dataBlockEncodingCtx.getBlockType(); } userDataStream.flush(); - // This does an array copy, so it is safe to cache this byte array when cache-on-write. + // This does an array copy, so it is safe to cache this byte array. // Header is still the empty, 'dummy' header that is yet to be filled out. - uncompressedBlockBytesWithHeader = baosInMemory.toByteArray(); + uncompressedBytesWithHeader = baosInMemory.toByteArray(); prevOffset = prevOffsetByType[blockType.getId()]; - // We need to set state before we can package the block up for cache-on-write. In a way, the - // block is ready, but not yet encoded or compressed. + // We need to set state before we can package the block up for + // cache-on-write. In a way, the block is ready, but not yet encoded or + // compressed. state = State.BLOCK_READY; if (blockType == BlockType.DATA || blockType == BlockType.ENCODED_DATA) { - onDiskBlockBytesWithHeader = dataBlockEncodingCtx. - compressAndEncrypt(uncompressedBlockBytesWithHeader); + onDiskBytesWithHeader = dataBlockEncodingCtx + .compressAndEncrypt(uncompressedBytesWithHeader); } else { - onDiskBlockBytesWithHeader = defaultBlockEncodingCtx. - compressAndEncrypt(uncompressedBlockBytesWithHeader); + onDiskBytesWithHeader = this.defaultBlockEncodingCtx. + compressAndEncrypt(uncompressedBytesWithHeader); } // Calculate how many bytes we need for checksum on the tail of the block. int numBytes = (int) ChecksumUtil.numBytes( - onDiskBlockBytesWithHeader.length, + onDiskBytesWithHeader.length, fileContext.getBytesPerChecksum()); // Put the header for the on disk bytes; header currently is unfilled-out - putHeader(onDiskBlockBytesWithHeader, 0, - onDiskBlockBytesWithHeader.length + numBytes, - uncompressedBlockBytesWithHeader.length, onDiskBlockBytesWithHeader.length); + putHeader(onDiskBytesWithHeader, 0, + onDiskBytesWithHeader.length + numBytes, + uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length); // Set the header for the uncompressed bytes (for cache-on-write) -- IFF different from - // onDiskBlockBytesWithHeader array. - if (onDiskBlockBytesWithHeader != uncompressedBlockBytesWithHeader) { - putHeader(uncompressedBlockBytesWithHeader, 0, - onDiskBlockBytesWithHeader.length + numBytes, - uncompressedBlockBytesWithHeader.length, onDiskBlockBytesWithHeader.length); + // onDiskBytesWithHeader array. + if (onDiskBytesWithHeader != uncompressedBytesWithHeader) { + putHeader(uncompressedBytesWithHeader, 0, + onDiskBytesWithHeader.length + numBytes, + uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length); } onDiskChecksum = new byte[numBytes]; ChecksumUtil.generateChecksums( - onDiskBlockBytesWithHeader, 0, onDiskBlockBytesWithHeader.length, + onDiskBytesWithHeader, 0, onDiskBytesWithHeader.length, onDiskChecksum, 0, fileContext.getChecksumType(), fileContext.getBytesPerChecksum()); } @@ -1076,7 +1101,7 @@ public class HFileBlock implements Cacheable { protected void finishBlockAndWriteHeaderAndData(DataOutputStream out) throws IOException { ensureBlockReady(); - out.write(onDiskBlockBytesWithHeader); + out.write(onDiskBytesWithHeader); out.write(onDiskChecksum); } @@ -1095,12 +1120,12 @@ public class HFileBlock implements Cacheable { // This is not very optimal, because we are doing an extra copy. // But this method is used only by unit tests. byte[] output = - new byte[onDiskBlockBytesWithHeader.length + new byte[onDiskBytesWithHeader.length + onDiskChecksum.length]; - System.arraycopy(onDiskBlockBytesWithHeader, 0, output, 0, - onDiskBlockBytesWithHeader.length); + System.arraycopy(onDiskBytesWithHeader, 0, output, 0, + onDiskBytesWithHeader.length); System.arraycopy(onDiskChecksum, 0, output, - onDiskBlockBytesWithHeader.length, onDiskChecksum.length); + onDiskBytesWithHeader.length, onDiskChecksum.length); return output; } @@ -1128,7 +1153,7 @@ public class HFileBlock implements Cacheable { */ int getOnDiskSizeWithoutHeader() { expectState(State.BLOCK_READY); - return onDiskBlockBytesWithHeader.length + + return onDiskBytesWithHeader.length + onDiskChecksum.length - HConstants.HFILEBLOCK_HEADER_SIZE; } @@ -1141,7 +1166,7 @@ public class HFileBlock implements Cacheable { */ int getOnDiskSizeWithHeader() { expectState(State.BLOCK_READY); - return onDiskBlockBytesWithHeader.length + onDiskChecksum.length; + return onDiskBytesWithHeader.length + onDiskChecksum.length; } /** @@ -1149,7 +1174,7 @@ public class HFileBlock implements Cacheable { */ int getUncompressedSizeWithoutHeader() { expectState(State.BLOCK_READY); - return uncompressedBlockBytesWithHeader.length - HConstants.HFILEBLOCK_HEADER_SIZE; + return uncompressedBytesWithHeader.length - HConstants.HFILEBLOCK_HEADER_SIZE; } /** @@ -1157,7 +1182,7 @@ public class HFileBlock implements Cacheable { */ int getUncompressedSizeWithHeader() { expectState(State.BLOCK_READY); - return uncompressedBlockBytesWithHeader.length; + return uncompressedBytesWithHeader.length; } /** @return true if a block is being written */ @@ -1187,7 +1212,7 @@ public class HFileBlock implements Cacheable { */ ByteBuffer getUncompressedBufferWithHeader() { expectState(State.BLOCK_READY); - return ByteBuffer.wrap(uncompressedBlockBytesWithHeader); + return ByteBuffer.wrap(uncompressedBytesWithHeader); } /** @@ -1200,7 +1225,7 @@ public class HFileBlock implements Cacheable { */ ByteBuffer getOnDiskBufferWithHeader() { expectState(State.BLOCK_READY); - return ByteBuffer.wrap(onDiskBlockBytesWithHeader); + return ByteBuffer.wrap(onDiskBytesWithHeader); } private void expectState(State expectedState) { @@ -1232,10 +1257,6 @@ public class HFileBlock implements Cacheable { * block does not have checksum data even though the header minor * version is MINOR_VERSION_WITH_CHECKSUM. This is indicated by setting a * 0 value in bytesPerChecksum. - * - * <p>TODO: Should there be an option where a cache can ask that hbase preserve block - * checksums for checking after a block comes out of the cache? Otehrwise, cache is responsible - * for blocks being wholesome (ECC memory or if file-backed, it does checksumming). */ HFileBlock getBlockForCaching(CacheConfig cacheConf) { HFileContext newContext = new HFileContextBuilder() @@ -1249,13 +1270,13 @@ public class HFileBlock implements Cacheable { .withIncludesMvcc(fileContext.isIncludesMvcc()) .withIncludesTags(fileContext.isIncludesTags()) .build(); - return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(), + return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(), getUncompressedSizeWithoutHeader(), prevOffset, - cacheConf.shouldCacheCompressed(blockType.getCategory())? + cacheConf.shouldCacheCompressed(blockType.getCategory()) ? getOnDiskBufferWithHeader() : getUncompressedBufferWithHeader(), - FILL_HEADER, startOffset, UNSET, - onDiskBlockBytesWithHeader.length + onDiskChecksum.length, newContext); + FILL_HEADER, startOffset, + onDiskBytesWithHeader.length + onDiskChecksum.length, newContext); } } @@ -1301,9 +1322,12 @@ public class HFileBlock implements Cacheable { * @param offset * @param onDiskSize the on-disk size of the entire block, including all * applicable headers, or -1 if unknown + * @param uncompressedSize the uncompressed size of the compressed part of + * the block, or -1 if unknown * @return the newly read block */ - HFileBlock readBlockData(long offset, long onDiskSize, boolean pread) throws IOException; + HFileBlock readBlockData(long offset, long onDiskSize, + int uncompressedSize, boolean pread) throws IOException; /** * Creates a block iterator over the given portion of the {@link HFile}. @@ -1356,11 +1380,6 @@ public class HFileBlock implements Cacheable { /** Default context used when BlockType != {@link BlockType#ENCODED_DATA}. */ private final HFileBlockDefaultDecodingContext defaultDecodingCtx; - /** - * When we read a block, we overread and pull in the next blocks header too. We will save it - * here. If moving serially through the file, we will trip over this caching of the next blocks - * header so we won't have to do explicit seek to find next blocks lengths, etc. - */ private ThreadLocal<PrefetchedHeader> prefetchedHeaderForThread = new ThreadLocal<PrefetchedHeader>() { @Override @@ -1424,7 +1443,7 @@ public class HFileBlock implements Cacheable { public HFileBlock nextBlock() throws IOException { if (offset >= endOffset) return null; - HFileBlock b = readBlockData(offset, -1, false); + HFileBlock b = readBlockData(offset, -1, -1, false); offset += b.getOnDiskSizeWithHeader(); return b.unpack(fileContext, owner); } @@ -1444,7 +1463,7 @@ public class HFileBlock implements Cacheable { /** * Does a positional read or a seek and read into the given buffer. Returns - * the on-disk size of the next block, or -1 if it could not be read/determined; e.g. EOF. + * the on-disk size of the next block, or -1 if it could not be determined. * * @param dest destination buffer * @param destOffset offset into the destination buffer at where to put the bytes we read @@ -1454,8 +1473,7 @@ public class HFileBlock implements Cacheable { * @param pread whether we should do a positional read * @param istream The input source of data * @return the on-disk size of the next block with header size included, or - * -1 if it could not be determined; if not -1, the <code>dest</code> INCLUDES the - * next header + * -1 if it could not be determined * @throws IOException */ protected int readAtOffset(FSDataInputStream istream, byte [] dest, int destOffset, int size, @@ -1487,16 +1505,16 @@ public class HFileBlock implements Cacheable { } // Try to read the next block header. - if (!readWithExtra(istream, dest, destOffset, size, hdrSize)) { + if (!readWithExtra(istream, dest, destOffset, size, hdrSize)) return -1; - } } finally { streamLock.unlock(); } } else { // Positional read. Better for random reads; or when the streamLock is already locked. int extraSize = peekIntoNextBlock ? hdrSize : 0; - if (!positionalReadWithExtra(istream, fileOffset, dest, destOffset, size, extraSize)) { + if (!positionalReadWithExtra(istream, fileOffset, dest, destOffset, + size, extraSize)) { return -1; } } @@ -1512,12 +1530,16 @@ public class HFileBlock implements Cacheable { * @param offset the offset in the stream to read at * @param onDiskSizeWithHeaderL the on-disk size of the block, including * the header, or -1 if unknown + * @param uncompressedSize the uncompressed size of the the block. Always + * expected to be -1. This parameter is only used in version 1. * @param pread whether to use a positional read */ @Override - public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL, boolean pread) + public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL, + int uncompressedSize, boolean pread) throws IOException { - // Get a copy of the current state of whether to validate + + // get a copy of the current state of whether to validate // hbase checksums or not for this read call. This is not // thread-safe but the one constaint is that if we decide // to skip hbase checksum verification then we are @@ -1526,7 +1548,8 @@ public class HFileBlock implements Cacheable { FSDataInputStream is = streamWrapper.getStream(doVerificationThruHBaseChecksum); HFileBlock blk = readBlockDataInternal(is, offset, - onDiskSizeWithHeaderL, pread, + onDiskSizeWithHeaderL, + uncompressedSize, pread, doVerificationThruHBaseChecksum); if (blk == null) { HFile.LOG.warn("HBase checksum verification failed for file " + @@ -1553,7 +1576,8 @@ public class HFileBlock implements Cacheable { // a few more than precisely this number. is = this.streamWrapper.fallbackToFsChecksum(CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD); doVerificationThruHBaseChecksum = false; - blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL, pread, + blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL, + uncompressedSize, pread, doVerificationThruHBaseChecksum); if (blk != null) { HFile.LOG.warn("HDFS checksum verification suceeded for file " + @@ -1581,139 +1605,175 @@ public class HFileBlock implements Cacheable { } /** - * @return Check <code>onDiskSizeWithHeaderL</code> size is healthy and then return it as an int - * @throws IOException - */ - private static int checkAndGetSizeAsInt(final long onDiskSizeWithHeaderL, final int hdrSize) - throws IOException { - if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1) - || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) { - throw new IOException("Invalid onDisksize=" + onDiskSizeWithHeaderL - + ": expected to be at least " + hdrSize - + " and at most " + Integer.MAX_VALUE + ", or -1"); - } - return (int)onDiskSizeWithHeaderL; - } - - /** - * Check threadlocal cache for this block's header; we usually read it on the tail of reading - * the previous block to save a seek. Otherwise, we have to do a seek to read the header before - * we can pull in the block. - * @return The cached block header or null if not found. - * @see #cacheNextBlockHeader(long, byte[], int, int) - */ - private ByteBuffer getCachedHeader(final long offset) { - PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get(); - // PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get(); - return prefetchedHeader != null && prefetchedHeader.offset == offset? - prefetchedHeader.buf: null; - } - - /** - * Save away the next blocks header in thread local. - * @see #getCachedHeader(long) - */ - private void cacheNextBlockHeader(final long nextBlockOffset, - final byte [] header, final int headerOffset, final int headerLength) { - PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get(); - prefetchedHeader.offset = nextBlockOffset; - System.arraycopy(header, headerOffset, prefetchedHeader.header, 0, headerLength); - } - - /** - * Verify the passed in onDiskSizeWithHeader aligns with what is in the header else something - * is not right. - * @throws IOException - */ - private void verifyOnDiskSizeMatchesHeader(final int passedIn, final ByteBuffer headerBuf, - final long offset) - throws IOException { - // Assert size provided aligns with what is in the header - int fromHeader = getOnDiskSizeWithHeader(headerBuf); - if (passedIn != fromHeader) { - throw new IOException("Passed in onDiskSizeWithHeader=" + passedIn + " != " + fromHeader + - ", offset=" + offset + ", fileContext=" + this.fileContext); - } - } - - /** * Reads a version 2 block. * * @param offset the offset in the stream to read at * @param onDiskSizeWithHeaderL the on-disk size of the block, including - * the header and checksums if present or -1 if unknown + * the header, or -1 if unknown + * @param uncompressedSize the uncompressed size of the the block. Always + * expected to be -1. This parameter is only used in version 1. * @param pread whether to use a positional read * @param verifyChecksum Whether to use HBase checksums. * If HBase checksum is switched off, then use HDFS checksum. * @return the HFileBlock or null if there is a HBase checksum mismatch */ private HFileBlock readBlockDataInternal(FSDataInputStream is, long offset, - long onDiskSizeWithHeaderL, boolean pread, boolean verifyChecksum) + long onDiskSizeWithHeaderL, int uncompressedSize, boolean pread, + boolean verifyChecksum) throws IOException { if (offset < 0) { throw new IOException("Invalid offset=" + offset + " trying to read " - + "block (onDiskSize=" + onDiskSizeWithHeaderL + ")"); + + "block (onDiskSize=" + onDiskSizeWithHeaderL + + ", uncompressedSize=" + uncompressedSize + ")"); } - int onDiskSizeWithHeader = checkAndGetSizeAsInt(onDiskSizeWithHeaderL, hdrSize); - ByteBuffer headerBuf = getCachedHeader(offset); - if (LOG.isTraceEnabled()) { - LOG.trace("Reading " + this.fileContext.getHFileName() + " at offset=" + offset + - ", pread=" + pread + ", verifyChecksum=" + verifyChecksum + ", cachedHeader=" + - headerBuf + ", onDiskSizeWithHeader=" + onDiskSizeWithHeader); + + if (uncompressedSize != -1) { + throw new IOException("Version 2 block reader API does not need " + + "the uncompressed size parameter"); + } + + if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1) + || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) { + throw new IOException("Invalid onDisksize=" + onDiskSizeWithHeaderL + + ": expected to be at least " + hdrSize + + " and at most " + Integer.MAX_VALUE + ", or -1 (offset=" + + offset + ", uncompressedSize=" + uncompressedSize + ")"); + } + + int onDiskSizeWithHeader = (int) onDiskSizeWithHeaderL; + + // See if we can avoid reading the header. This is desirable, because + // we will not incur a backward seek operation if we have already + // read this block's header as part of the previous read's look-ahead. + // And we also want to skip reading the header again if it has already + // been read. + // TODO: How often does this optimization fire? Has to be same thread so the thread local + // is pertinent and we have to be reading next block as in a big scan. + ByteBuffer headerBuf = null; + PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get(); + boolean preReadHeader = false; + if (prefetchedHeader != null && prefetchedHeader.offset == offset) { + headerBuf = prefetchedHeader.buf; + preReadHeader = true; } - if (onDiskSizeWithHeader <= 0) { - // We were not passed the block size. Need to get it from the header. If header was not in - // cache, need to seek to pull it in. This latter might happen when we are doing the first - // read in a series of reads or a random read, and we don't have access to the block index. - // This is costly and should happen very rarely. + // Allocate enough space to fit the next block's header too. + int nextBlockOnDiskSize = 0; + byte[] onDiskBlock = null; + + HFileBlock b = null; + boolean fastPath = false; + boolean readHdrOnly = false; + if (onDiskSizeWithHeader > 0) { + fastPath = true; + // We know the total on-disk size. Read the entire block into memory, + // then parse the header. This code path is used when + // doing a random read operation relying on the block index, as well as + // when the client knows the on-disk size from peeking into the next + // block's header (e.g. this block's header) when reading the previous + // block. This is the faster and more preferable case. + + // Size that we have to skip in case we have already read the header. + int preReadHeaderSize = headerBuf == null ? 0 : hdrSize; + onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize]; // room for this block plus the + // next block's header + nextBlockOnDiskSize = readAtOffset(is, onDiskBlock, + preReadHeaderSize, onDiskSizeWithHeader - preReadHeaderSize, + true, offset + preReadHeaderSize, pread); + if (headerBuf != null) { + // the header has been read when reading the previous block, copy + // to this block's header + // headerBuf is HBB + assert headerBuf.hasArray(); + System.arraycopy(headerBuf.array(), + headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize); + } else { + headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize); + } + // We know the total on-disk size but not the uncompressed size. Parse the header. + try { + // TODO: FIX!!! Expensive parse just to get a length + b = new HFileBlock(headerBuf, fileContext.isUseHBaseChecksum()); + } catch (IOException ex) { + // Seen in load testing. Provide comprehensive debug info. + throw new IOException("Failed to read compressed block at " + + offset + + ", onDiskSizeWithoutHeader=" + + onDiskSizeWithHeader + + ", preReadHeaderSize=" + + hdrSize + + ", header.length=" + + prefetchedHeader.header.length + + ", header bytes: " + + Bytes.toStringBinary(prefetchedHeader.header, 0, + hdrSize), ex); + } + // if the caller specifies a onDiskSizeWithHeader, validate it. + int onDiskSizeWithoutHeader = onDiskSizeWithHeader - hdrSize; + assert onDiskSizeWithoutHeader >= 0; + b.validateOnDiskSizeWithoutHeader(onDiskSizeWithoutHeader); + } else { + // Check headerBuf to see if we have read this block's header as part of + // reading the previous block. This is an optimization of peeking into + // the next block's header (e.g.this block's header) when reading the + // previous block. This is the faster and more preferable case. If the + // header is already there, don't read the header again. + + // Unfortunately, we still have to do a separate read operation to + // read the header. if (headerBuf == null) { + readHdrOnly = true; + // From the header, determine the on-disk size of the given hfile + // block, and read the remaining data, thereby incurring two read + // operations. This might happen when we are doing the first read + // in a series of reads or a random read, and we don't have access + // to the block index. This is costly and should happen very rarely. headerBuf = ByteBuffer.allocate(hdrSize); - readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(), hdrSize, false, - offset, pread); + // headerBuf is HBB + readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(), + hdrSize, false, offset, pread); } - onDiskSizeWithHeader = getOnDiskSizeWithHeader(headerBuf); - } - int preReadHeaderSize = headerBuf == null? 0 : hdrSize; - // Allocate enough space to fit the next block's header too; saves a seek next time through. - // onDiskBlock is whole block + header + checksums then extra hdrSize to read next header; - // onDiskSizeWithHeader is header, body, and any checksums if present. - // TODO: Make this ByteBuffer-based. Will make it easier to go to HDFS with BBPool (offheap). - byte[] onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize]; - int nextBlockOnDiskSize = readAtOffset(is, onDiskBlock, preReadHeaderSize, - onDiskSizeWithHeader - preReadHeaderSize, true, offset + preReadHeaderSize, pread); - if (headerBuf != null) { - // The header has been read when reading the previous block OR in a distinct header-only - // read. Copy to this block's header. + // TODO: FIX!!! Expensive parse just to get a length + b = new HFileBlock(headerBuf, fileContext.isUseHBaseChecksum()); + // onDiskBlock is whole block + header + checksums then extra hdrSize to read next header + onDiskBlock = new byte[b.getOnDiskSizeWithHeader() + hdrSize]; + // headerBuf is HBB. Copy hdr into onDiskBlock System.arraycopy(headerBuf.array(), headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize); - } else { - headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize); + nextBlockOnDiskSize = readAtOffset(is, onDiskBlock, hdrSize, + b.getOnDiskSizeWithHeader() - hdrSize, true, offset + hdrSize, pread); + onDiskSizeWithHeader = b.onDiskSizeWithoutHeader + hdrSize; } - // Do a few checks before we go instantiate HFileBlock. - assert onDiskSizeWithHeader > this.hdrSize; - verifyOnDiskSizeMatchesHeader(onDiskSizeWithHeader, headerBuf, offset); - // The onDiskBlock will become the headerAndDataBuffer for this block. - // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already - // contains the header of next block, so no need to set next block's header in it. - HFileBlock hFileBlock = - new HFileBlock(new SingleByteBuff(ByteBuffer.wrap(onDiskBlock, 0, onDiskSizeWithHeader)), - this.fileContext.isUseHBaseChecksum(), MemoryType.EXCLUSIVE, offset, - nextBlockOnDiskSize, fileContext); - // Run check on uncompressed sizings. + if (!fileContext.isCompressedOrEncrypted()) { - hFileBlock.sanityCheckUncompressed(); + b.assumeUncompressed(); } - if (verifyChecksum && !validateBlockChecksum(hFileBlock, offset, onDiskBlock, hdrSize)) { - return null; + + if (verifyChecksum && !validateBlockChecksum(b, offset, onDiskBlock, hdrSize)) { + return null; // checksum mismatch } - if (LOG.isTraceEnabled()) { - LOG.trace("Read " + hFileBlock); + + // The onDiskBlock will become the headerAndDataBuffer for this block. + // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already + // contains the header of next block, so no need to set next + // block's header in it. + b = new HFileBlock(ByteBuffer.wrap(onDiskBlock, 0, onDiskSizeWithHeader), + this.fileContext.isUseHBaseChecksum()); + + b.nextBlockOnDiskSizeWithHeader = nextBlockOnDiskSize; + + // Set prefetched header + if (b.hasNextBlockHeader()) { + prefetchedHeader.offset = offset + b.getOnDiskSizeWithHeader(); + System.arraycopy(onDiskBlock, onDiskSizeWithHeader, prefetchedHeader.header, 0, hdrSize); } - // Cache next block header if we read it for the next time through here. - if (nextBlockOnDiskSize != -1) { - cacheNextBlockHeader(offset + hFileBlock.getOnDiskSizeWithHeader(), - onDiskBlock, onDiskSizeWithHeader, hdrSize); + + b.offset = offset; + b.fileContext.setIncludesTags(this.fileContext.isIncludesTags()); + b.fileContext.setIncludesMvcc(this.fileContext.isIncludesMvcc()); + if (LOG.isTraceEnabled()) { + LOG.trace("Read preReadHeader=" + preReadHeader + ", fastPath=" + fastPath + + ", readHdrOnly=" + readHdrOnly + ", " + b); } - return hFileBlock; + return b; } @Override @@ -1759,73 +1819,42 @@ public class HFileBlock implements Cacheable { } } - /** An additional sanity-check in case no compression or encryption is being used. */ - void sanityCheckUncompressed() throws IOException { - if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + - totalChecksumBytes()) { - throw new IOException("Using no compression but " - + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", " - + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader - + ", numChecksumbytes=" + totalChecksumBytes()); - } - } - - // Cacheable implementation @Override public int getSerializedLength() { if (buf != null) { - // Include extra bytes for block metadata. - return this.buf.limit() + BLOCK_METADATA_SPACE; + // include extra bytes for the next header when it's available. + int extraSpace = hasNextBlockHeader() ? headerSize() : 0; + return this.buf.limit() + extraSpace + HFileBlock.EXTRA_SERIALIZATION_SPACE; } return 0; } - // Cacheable implementation @Override public void serialize(ByteBuffer destination) { - // BE CAREFUL!! There is a custom version of this serialization over in BucketCache#doDrain. - // Make sure any changes in here are reflected over there. - this.buf.get(destination, 0, getSerializedLength() - BLOCK_METADATA_SPACE); - destination = addMetaData(destination); - - // Make it ready for reading. flip sets position to zero and limit to current position which - // is what we want if we do not want to serialize the block plus checksums if present plus - // metadata. - destination.flip(); - } - - /** - * For use by bucketcache. This exposes internals. - */ - public ByteBuffer getMetaData() { - ByteBuffer bb = ByteBuffer.allocate(BLOCK_METADATA_SPACE); - bb = addMetaData(bb); - bb.flip(); - return bb; + this.buf.get(destination, 0, getSerializedLength() - EXTRA_SERIALIZATION_SPACE); + serializeExtraInfo(destination); } /** - * Adds metadata at current position (position is moved forward). Does not flip or reset. - * @return The passed <code>destination</code> with metadata added. + * Write out the content of EXTRA_SERIALIZATION_SPACE. Public so can be accessed by BucketCache. */ - private ByteBuffer addMetaData(final ByteBuffer destination) { + public void serializeExtraInfo(ByteBuffer destination) { destination.put(this.fileContext.isUseHBaseChecksum() ? (byte) 1 : (byte) 0); destination.putLong(this.offset); - destination.putInt(this.nextBlockOnDiskSize); - return destination; + destination.putInt(this.nextBlockOnDiskSizeWithHeader); + destination.rewind(); } - // Cacheable implementation @Override public CacheableDeserializer<Cacheable> getDeserializer() { - return HFileBlock.BLOCK_DESERIALIZER; + return HFileBlock.blockDeserializer; } @Override public int hashCode() { int result = 1; result = result * 31 + blockType.hashCode(); - result = result * 31 + nextBlockOnDiskSize; + result = result * 31 + nextBlockOnDiskSizeWithHeader; result = result * 31 + (int) (offset ^ (offset >>> 32)); result = result * 31 + onDiskSizeWithoutHeader; result = result * 31 + (int) (prevBlockOffset ^ (prevBlockOffset >>> 32)); @@ -1851,10 +1880,9 @@ public class HFileBlock implements Cacheable { if (castedComparison.blockType != this.blockType) { return false; } - if (castedComparison.nextBlockOnDiskSize != this.nextBlockOnDiskSize) { + if (castedComparison.nextBlockOnDiskSizeWithHeader != this.nextBlockOnDiskSizeWithHeader) { return false; } - // Offset is important. Needed when we have to remake cachekey when block is returned to cache. if (castedComparison.offset != this.offset) { return false; } @@ -1940,7 +1968,7 @@ public class HFileBlock implements Cacheable { } /** - * @return This HFileBlocks fileContext which will a derivative of the + * @return the HFileContext used to create this HFileBlock. Not necessary the * fileContext for the file from which this block's data was originally read. */ HFileContext getHFileContext() { @@ -1964,7 +1992,6 @@ public class HFileBlock implements Cacheable { * This is mostly helpful for debugging. This assumes that the block * has minor version > 0. */ - @VisibleForTesting static String toStringHeader(ByteBuff buf) throws IOException { byte[] magicBuf = new byte[Math.min(buf.limit() - buf.position(), BlockType.MAGIC_LENGTH)]; buf.get(magicBuf); http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java index 506f08d..9f29f97 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java @@ -60,7 +60,7 @@ import org.apache.hadoop.util.StringUtils; * Examples of how to use the block index writer can be found in * {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilterWriter} and * {@link HFileWriterImpl}. Examples of how to use the reader can be - * found in {@link HFileReaderImpl} and + * found in {@link HFileWriterImpl} and * {@link org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex}. */ @InterfaceAudience.Private http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java index d71911f..8f5040e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java @@ -252,20 +252,18 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { long end = 0; try { end = getTrailer().getLoadOnOpenDataOffset(); + HFileBlock prevBlock = null; if (LOG.isTraceEnabled()) { LOG.trace("File=" + path.toString() + ", offset=" + offset + ", end=" + end); } - // TODO: Could we use block iterator in here? Would that get stuff into the cache? - HFileBlock prevBlock = null; while (offset < end) { if (Thread.interrupted()) { break; } - // Perhaps we got our block from cache? Unlikely as this may be, if it happens, then - // the internal-to-hfileblock thread local which holds the overread that gets the - // next header, will not have happened...so, pass in the onDiskSize gotten from the - // cached block. This 'optimization' triggers extremely rarely I'd say. - long onDiskSize = prevBlock != null? prevBlock.getNextBlockOnDiskSize(): -1; + long onDiskSize = -1; + if (prevBlock != null) { + onDiskSize = prevBlock.getNextBlockOnDiskSizeWithHeader(); + } HFileBlock block = readBlock(offset, onDiskSize, true, false, false, false, null, null); // Need not update the current block. Ideally here the readBlock won't find the @@ -905,8 +903,9 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { // We are reading the next block without block type validation, because // it might turn out to be a non-data block. - block = reader.readBlock(block.getOffset() + block.getOnDiskSizeWithHeader(), - block.getNextBlockOnDiskSize(), cacheBlocks, pread, + block = reader.readBlock(block.getOffset() + + block.getOnDiskSizeWithHeader(), + block.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread, isCompaction, true, null, getEffectiveDataBlockEncoding()); if (block != null && !block.getBlockType().isData()) { // Findbugs: NP_NULL_ON_SOME_PATH // Whatever block we read we will be returning it unless @@ -1440,8 +1439,8 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { // Cache Miss, please load. } - HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset, blockSize, true). - unpack(hfileContext, fsBlockReader); + HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset, + blockSize, -1, true).unpack(hfileContext, fsBlockReader); // Cache the block if (cacheBlock) { @@ -1527,8 +1526,8 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { traceScope.getSpan().addTimelineAnnotation("blockCacheMiss"); } // Load block from filesystem. - HFileBlock hfileBlock = - fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, pread); + HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1, + pread); validateBlockType(hfileBlock, expectedBlockType); HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader); BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory(); @@ -1872,7 +1871,6 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { * @return Scanner on this file. */ @Override - @VisibleForTesting public HFileScanner getScanner(boolean cacheBlocks, final boolean pread) { return getScanner(cacheBlocks, pread, false); } http://git-wip-us.apache.org/repos/asf/hbase/blob/54a543de/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java index e0f3d74..c67bdd4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java @@ -99,21 +99,18 @@ public interface HFileScanner extends Shipper, Closeable { * @throws IOException */ boolean seekTo() throws IOException; - /** * Scans to the next entry in the file. * @return Returns false if you are at the end otherwise true if more in file. * @throws IOException */ boolean next() throws IOException; - /** * Gets the current key in the form of a cell. You must call * {@link #seekTo(Cell)} before this method. * @return gets the current key as a Cell. */ Cell getKey(); - /** * Gets a buffer view to the current value. You must call * {@link #seekTo(Cell)} before this method. @@ -122,35 +119,26 @@ public interface HFileScanner extends Shipper, Closeable { * the position is 0, the start of the buffer view. */ ByteBuffer getValue(); - /** * @return Instance of {@link org.apache.hadoop.hbase.Cell}. */ Cell getCell(); - /** * Convenience method to get a copy of the key as a string - interpreting the * bytes as UTF8. You must call {@link #seekTo(Cell)} before this method. * @return key as a string - * @deprecated Since hbase-2.0.0 */ - @Deprecated String getKeyString(); - /** * Convenience method to get a copy of the value as a string - interpreting * the bytes as UTF8. You must call {@link #seekTo(Cell)} before this method. * @return value as a string - * @deprecated Since hbase-2.0.0 */ - @Deprecated String getValueString(); - /** * @return Reader that underlies this Scanner instance. */ HFile.Reader getReader(); - /** * @return True is scanner has had one of the seek calls invoked; i.e. * {@link #seekBefore(Cell)} or {@link #seekTo()} or {@link #seekTo(Cell)}.
