This is an automated email from the ASF dual-hosted git repository.
yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 9ca72f9d932b [HUDI-9791] Fix native HFile writer to be compatible with
HBase HFile reader (#13873)
9ca72f9d932b is described below
commit 9ca72f9d932b14b414ca605560d1145de7e525a6
Author: Lin Liu <[email protected]>
AuthorDate: Fri Sep 12 08:11:50 2025 -0700
[HUDI-9791] Fix native HFile writer to be compatible with HBase HFile
reader (#13873)
Co-authored-by: Y Ethan Guo <[email protected]>
---
hudi-io/pom.xml | 36 +++
.../java/org/apache/hudi/io/hfile/HFileBlock.java | 9 +-
.../org/apache/hudi/io/hfile/HFileContext.java | 19 +-
.../org/apache/hudi/io/hfile/HFileDataBlock.java | 24 +-
.../apache/hudi/io/hfile/HFileFileInfoBlock.java | 4 +
.../java/org/apache/hudi/io/hfile/HFileInfo.java | 17 +-
.../org/apache/hudi/io/hfile/HFileWriterImpl.java | 47 ++-
.../hudi/io/hfile/TestHFileCompatibility.java | 311 ++++++++++++++++++++
.../hudi/io/hfile/TestHFileReadCompatibility.java | 315 +++++++++++++++++++++
.../org/apache/hudi/io/hfile/TestHFileWriter.java | 23 +-
.../org/apache/hudi/io/hfile/TestHfileBlock.java | 60 ++++
.../hudi/io/storage/HoodieHBaseKVComparator.java | 29 ++
.../src/test/resources/hfile/hbase-generated.hfile | Bin 0 -> 4619 bytes
.../src/test/resources/hfile/hudi-generated.hfile | Bin 0 -> 4654 bytes
pom.xml | 6 +-
15 files changed, 868 insertions(+), 32 deletions(-)
diff --git a/hudi-io/pom.xml b/hudi-io/pom.xml
index 94b05512575d..c2a08e7e22fb 100644
--- a/hudi-io/pom.xml
+++ b/hudi-io/pom.xml
@@ -173,5 +173,41 @@
<version>${slf4j.version}</version>
<scope>provided</scope>
</dependency>
+
+ <!-- HBase dependencies for testing HFile compatibility -->
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-common</artifactId>
+ <version>2.4.13</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-client</artifactId>
+ <version>2.4.13</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-server</artifactId>
+ <version>2.4.13</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-testing-util</artifactId>
+ <version>2.4.13</version>
+ <scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
</dependencies>
</project>
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
index 14290362e48e..4c61e6b3b8be 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
@@ -189,6 +189,9 @@ public abstract class HFileBlock {
* @return the number of checksum chunks.
*/
static int numChecksumChunks(long numBytes, int bytesPerChecksum) {
+ if (bytesPerChecksum == 0) {
+ return 0;
+ }
long numChunks = numBytes / bytesPerChecksum;
if (numBytes % bytesPerChecksum != 0) {
numChunks++;
@@ -283,7 +286,11 @@ public abstract class HFileBlock {
// 5. Checksum type.
buf.put(context.getChecksumType().getCode());
// 6. Bytes covered per checksum.
- buf.putInt(DEFAULT_BYTES_PER_CHECKSUM);
+ // Note that the default value is 16K. There is a check on
+ // onDiskSizeWithoutHeader = uncompressedSizeWithoutHeader + Checksum.
+ // In order to pass this check, either we make isUseHBaseChecksum false in
HFileContext (hbase),
+ // or we set this value to zero.
+ buf.putInt(0);
// 7. onDiskDataSizeWithHeader
int onDiskDataSizeWithHeader =
HFileBlock.HFILEBLOCK_HEADER_SIZE + compressedBlockData.limit();
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileContext.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileContext.java
index d2864e21ac50..8523e8cddc61 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileContext.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileContext.java
@@ -31,12 +31,17 @@ public class HFileContext {
private final HoodieCompressor compressor;
private final ChecksumType checksumType;
private final int blockSize;
+ private final long fileCreationTime;
- private HFileContext(CompressionCodec compressionCodec, int blockSize,
ChecksumType checksumType) {
+ private HFileContext(CompressionCodec compressionCodec,
+ int blockSize,
+ ChecksumType checksumType,
+ long fileCreationTime) {
this.compressionCodec = compressionCodec;
this.compressor = HoodieCompressorFactory.getCompressor(compressionCodec);
this.blockSize = blockSize;
this.checksumType = checksumType;
+ this.fileCreationTime = fileCreationTime;
}
CompressionCodec getCompressionCodec() {
@@ -55,6 +60,10 @@ public class HFileContext {
return checksumType;
}
+ long getFileCreationTime() {
+ return fileCreationTime;
+ }
+
public static Builder builder() {
return new Builder();
}
@@ -63,6 +72,7 @@ public class HFileContext {
private CompressionCodec compressionCodec = CompressionCodec.NONE;
private int blockSize = 1024 * 1024;
private ChecksumType checksumType = ChecksumType.NULL;
+ private long fileCreationTime = System.currentTimeMillis();
public Builder blockSize(int blockSize) {
this.blockSize = blockSize;
@@ -79,8 +89,13 @@ public class HFileContext {
return this;
}
+ public Builder fileCreationTime(long fileCreationTime) {
+ this.fileCreationTime = fileCreationTime;
+ return this;
+ }
+
public HFileContext build() {
- return new HFileContext(compressionCodec, blockSize, checksumType);
+ return new HFileContext(compressionCodec, blockSize, checksumType,
fileCreationTime);
}
}
}
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
index 7f62b1c54162..230c9c686000 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
@@ -26,7 +26,9 @@ import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
+import static org.apache.hudi.io.hfile.DataSize.SIZEOF_BYTE;
import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT16;
+import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT64;
import static
org.apache.hudi.io.hfile.HFileReader.SEEK_TO_BEFORE_BLOCK_FIRST_KEY;
import static org.apache.hudi.io.hfile.HFileReader.SEEK_TO_FOUND;
import static org.apache.hudi.io.hfile.HFileReader.SEEK_TO_IN_RANGE;
@@ -36,11 +38,18 @@ import static org.apache.hudi.io.hfile.KeyValue.KEY_OFFSET;
* Represents a {@link HFileBlockType#DATA} block.
*/
public class HFileDataBlock extends HFileBlock {
+ private static final int KEY_LENGTH_LENGTH = SIZEOF_INT16;
+ private static final int COLUMN_FAMILY_LENGTH = SIZEOF_BYTE;
+ private static final int VERSION_TIMESTAMP_LENGTH = SIZEOF_INT64;
+ private static final int KEY_TYPE_LENGTH = SIZEOF_BYTE;
// Hudi does not use HFile MVCC timestamp version so the version
// is always 0, thus the byte length of the version is always 1.
// This assumption is also validated when parsing {@link HFileInfo},
// i.e., the maximum MVCC timestamp in a HFile must be 0.
private static final long ZERO_TS_VERSION_BYTE_LENGTH = 1;
+ // Hudi does not set version timestamp for key value pairs,
+ // so the latest timestamp is used.
+ private static final long LATEST_TIMESTAMP = Long.MAX_VALUE;
// End offset of content in the block, relative to the start of the block
protected final int uncompressedContentEndRelativeOffset;
@@ -203,17 +212,30 @@ public class HFileDataBlock extends HFileBlock {
ByteBuffer dataBuf = ByteBuffer.allocate(context.getBlockSize());
for (KeyValueEntry kv : entriesToWrite) {
// Length of key + length of a short variable indicating length of key.
- dataBuf.putInt(kv.key.length + SIZEOF_INT16);
+ // Note that 10 extra bytes are required by hbase reader.
+ // That is: 1 byte for column family length, 8 bytes for timestamp, 1 byte for key type.
+ dataBuf.putInt(kv.key.length + KEY_LENGTH_LENGTH + COLUMN_FAMILY_LENGTH
+ VERSION_TIMESTAMP_LENGTH + KEY_TYPE_LENGTH);
// Length of value.
dataBuf.putInt(kv.value.length);
// Key content length.
dataBuf.putShort((short)kv.key.length);
// Key.
dataBuf.put(kv.key);
+ // Column family length: constant 0.
+ dataBuf.put((byte)0);
+ // Column qualifier: assume 0 bits.
+ // Timestamp: using the latest.
+ dataBuf.putLong(LATEST_TIMESTAMP);
+ // Key type: constant Put (4) in Hudi.
+ // Minimum((byte) 0), Put((byte) 4), Delete((byte) 8),
+ // DeleteFamilyVersion((byte) 10), DeleteColumn((byte) 12),
+ // DeleteFamily((byte) 14), Maximum((byte) 255).
+ dataBuf.put((byte)4);
// Value.
dataBuf.put(kv.value);
// MVCC.
dataBuf.put((byte)0);
+
// Copy to output stream.
baos.write(dataBuf.array(), 0, dataBuf.position());
// Clear the buffer.
diff --git
a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileFileInfoBlock.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileFileInfoBlock.java
index 2f9918f6d07a..e6387e804e8d 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileFileInfoBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileFileInfoBlock.java
@@ -82,6 +82,10 @@ public class HFileFileInfoBlock extends HFileBlock {
fileInfoToWrite.put(name, value);
}
+ public boolean containsKey(String name) {
+ return fileInfoToWrite.containsKey(name);
+ }
+
@Override
public ByteBuffer getUncompressedBlockDataToWrite() {
ByteBuffer buff = ByteBuffer.allocate(context.getBlockSize() * 2);
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileInfo.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileInfo.java
index b2c8e141675b..824ec1328e6c 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileInfo.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileInfo.java
@@ -31,28 +31,27 @@ public class HFileInfo {
private static final String RESERVED_PREFIX = "hfile.";
static final UTF8StringKey LAST_KEY =
new UTF8StringKey(RESERVED_PREFIX + "LASTKEY");
- private static final UTF8StringKey FILE_CREATION_TIME_TS =
+ static final UTF8StringKey FILE_CREATION_TIME_TS =
new UTF8StringKey(RESERVED_PREFIX + "CREATE_TIME_TS");
- private static final UTF8StringKey KEY_VALUE_VERSION =
+ static final UTF8StringKey KEY_VALUE_VERSION =
new UTF8StringKey("KEY_VALUE_VERSION");
static final UTF8StringKey MAX_MVCC_TS_KEY =
new UTF8StringKey("MAX_MEMSTORE_TS_KEY");
-
- private static final int KEY_VALUE_VERSION_WITH_MVCC_TS = 1;
+ static final UTF8StringKey AVG_KEY_LEN =
+ new UTF8StringKey(RESERVED_PREFIX + "AVG_KEY_LEN");
+ static final UTF8StringKey AVG_VALUE_LEN =
+ new UTF8StringKey(RESERVED_PREFIX + "AVG_VALUE_LEN");
+ static final int KEY_VALUE_VERSION_WITH_MVCC_TS = 1;
private final Map<UTF8StringKey, byte[]> infoMap;
private final long fileCreationTime;
private final Option<Key> lastKey;
- private final long maxMvccTs;
- private final boolean containsMvccTs;
public HFileInfo(Map<UTF8StringKey, byte[]> infoMap) {
this.infoMap = infoMap;
this.fileCreationTime = parseFileCreationTime();
this.lastKey = parseLastKey();
- this.maxMvccTs = parseMaxMvccTs();
- this.containsMvccTs = maxMvccTs > 0;
- if (containsMvccTs) {
+ if (parseMaxMvccTs() > 0) {
// The HFile written by Hudi does not contain MVCC timestamps.
// Parsing MVCC timestamps is not supported.
throw new UnsupportedOperationException("HFiles with MVCC timestamps are
not supported");
diff --git
a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileWriterImpl.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileWriterImpl.java
index c8153328ca7d..b9c835d06186 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileWriterImpl.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileWriterImpl.java
@@ -22,8 +22,6 @@ package org.apache.hudi.io.hfile;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.io.hfile.protobuf.generated.HFileProtos;
-import com.google.protobuf.ByteString;
-
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
@@ -35,9 +33,11 @@ import java.util.Map;
import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT16;
import static
org.apache.hudi.io.hfile.HFileBlock.getVariableLengthEncodedBytes;
import static org.apache.hudi.io.hfile.HFileBlockType.TRAILER;
+import static
org.apache.hudi.io.hfile.HFileInfo.KEY_VALUE_VERSION_WITH_MVCC_TS;
import static org.apache.hudi.io.hfile.HFileInfo.LAST_KEY;
import static org.apache.hudi.io.hfile.HFileInfo.MAX_MVCC_TS_KEY;
import static org.apache.hudi.io.hfile.HFileTrailer.TRAILER_SIZE;
+import static org.apache.hudi.io.util.IOUtils.toBytes;
/**
* Pure Java implementation of HFile writer (HFile v3 format) for Hudi.
@@ -68,6 +68,8 @@ public class HFileWriterImpl implements HFileWriter {
private long firstDataBlockOffset = -1;
private long lastDataBlockOffset;
private long totalNumberOfRecords = 0;
+ private long totalKeyLength = 0;
+ private long totalValueLength = 0;
public HFileWriterImpl(HFileContext context, OutputStream outputStream) {
this.outputStream = outputStream;
@@ -87,6 +89,8 @@ public class HFileWriterImpl implements HFileWriter {
public void append(String key, byte[] value) throws IOException {
byte[] keyBytes = StringUtils.getUTF8Bytes(key);
lastKey = keyBytes;
+ totalKeyLength += keyBytes.length;
+ totalValueLength += value.length;
// Records with the same key must be put into the same block.
// Here 9 = 4 bytes of key length + 4 bytes of value length + 1 byte MVCC.
if (!Arrays.equals(currentDataBlock.getLastKeyContent(), keyBytes)
@@ -170,10 +174,7 @@ public class HFileWriterImpl implements HFileWriter {
metaIndexBlock.setStartOffsetInBuffForWrite(currentOffset);
writeBuffer(metaIndexBuffer);
// Write File Info.
- fileInfoBlock.add(
- new String(LAST_KEY.getBytes(), StandardCharsets.UTF_8),
- addKeyLength(lastKey));
- fileInfoBlock.setStartOffsetInBuffForWrite(currentOffset);
+ finishFileInfo();
writeBuffer(fileInfoBlock.serialize());
}
@@ -191,7 +192,6 @@ public class HFileWriterImpl implements HFileWriter {
builder.setLastDataBlockOffset(lastDataBlockOffset);
builder.setComparatorClassName(COMPARATOR_CLASS_NAME);
builder.setCompressionCodec(context.getCompressionCodec().getId());
- builder.setEncryptionKey(ByteString.EMPTY);
HFileProtos.TrailerProto trailerProto = builder.build();
ByteBuffer trailer = ByteBuffer.allocate(TRAILER_SIZE);
@@ -216,7 +216,38 @@ public class HFileWriterImpl implements HFileWriter {
private void initFileInfo() {
fileInfoBlock.add(
new String(MAX_MVCC_TS_KEY.getBytes(), StandardCharsets.UTF_8),
- new byte[]{0});
+ toBytes(0L));
+ }
+
+ protected void finishFileInfo() {
+ // Record last key.
+ fileInfoBlock.add(
+ new String(LAST_KEY.getBytes(), StandardCharsets.UTF_8),
+ addKeyLength(lastKey));
+ fileInfoBlock.setStartOffsetInBuffForWrite(currentOffset);
+
+ // Average key length.
+ int avgKeyLen = totalNumberOfRecords == 0
+ ? 0 : (int) (totalKeyLength / totalNumberOfRecords);
+ fileInfoBlock.add(
+ new String(HFileInfo.AVG_KEY_LEN.getBytes(), StandardCharsets.UTF_8),
+ toBytes(avgKeyLen));
+ fileInfoBlock.add(
+ new String(HFileInfo.FILE_CREATION_TIME_TS.getBytes(),
StandardCharsets.UTF_8),
+ toBytes(context.getFileCreationTime()));
+
+ // Average value length.
+ int avgValueLen = totalNumberOfRecords == 0
+ ? 0 : (int) (totalValueLength / totalNumberOfRecords);
+ fileInfoBlock.add(
+ new String(HFileInfo.AVG_VALUE_LEN.getBytes(), StandardCharsets.UTF_8),
+ toBytes(avgValueLen));
+
+ // NOTE: Set this property to make sure the key value and MVCC timestamp
+ // pairs are properly decoded
+ appendFileInfo(
+ new String(HFileInfo.KEY_VALUE_VERSION.getBytes(),
StandardCharsets.UTF_8),
+ toBytes(KEY_VALUE_VERSION_WITH_MVCC_TS));
}
// Note: HFileReaderImpl assumes that:
diff --git
a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileCompatibility.java
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileCompatibility.java
new file mode 100644
index 000000000000..97dc821cb79d
--- /dev/null
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileCompatibility.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.hfile;
+
+import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
+import org.apache.hudi.io.ByteArraySeekableDataInputStream;
+import org.apache.hudi.io.SeekableDataInputStream;
+import org.apache.hudi.io.util.IOUtils;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.CellComparatorImpl;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.io.compress.Compression;
+import org.apache.hadoop.hbase.io.hfile.CacheConfig;
+import org.apache.hadoop.hbase.io.hfile.HFile;
+import org.apache.hadoop.hbase.io.hfile.HFileContext;
+import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
+import org.apache.hadoop.hbase.io.hfile.HFileScanner;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import static org.apache.hudi.common.util.FileIOUtils.readAsByteArray;
+import static org.apache.hudi.io.hfile.HFileInfo.KEY_VALUE_VERSION;
+import static org.apache.hudi.io.util.IOUtils.readInt;
+import static org.apache.hudi.io.util.IOUtils.toBytes;
+
+class TestHFileCompatibility {
+ // Test data - simple key-value pairs
+ private static final List<TestRecord> TEST_RECORDS = Arrays.asList(
+ new TestRecord("row1", "value1"),
+ new TestRecord("row2", "value2"),
+ new TestRecord("row3", "value3"),
+ new TestRecord("row4", "value4"),
+ new TestRecord("row5", "value5")
+ );
+
+ @ParameterizedTest
+ @CsvSource({
+ "/hfile/hudi-generated.hfile,/hfile/hbase-generated.hfile",
+ "/hfile/hbase-generated.hfile,/hfile/hudi-generated.hfile"
+ })
+ void testHFileReadCompatibility(String hudiFilePath, String hbaseFilePath)
throws Exception {
+ try (HFileReader hudiReader = createHFileReaderFromResource(hudiFilePath);
+ org.apache.hadoop.hbase.io.hfile.HFile.Reader hbaseReader =
+ createHBaseHFileReaderFromResource(hbaseFilePath)) {
+ // Validate number of entries.
+ Assertions.assertEquals(5, hudiReader.getNumKeyValueEntries());
+ Assertions.assertEquals(5, hbaseReader.getEntries());
+ // Validate data block content.
+ hudiReader.seekTo();
+ HFileScanner scanner = hbaseReader.getScanner(true, true);
+ scanner.seekTo();
+ int i = 0;
+ do {
+ org.apache.hudi.io.hfile.KeyValue keyValue =
hudiReader.getKeyValue().get();
+ Cell cell = scanner.getCell();
+ // Ensure Hudi record is correct.
+ Assertions.assertEquals(TEST_RECORDS.get(i).key,
keyValue.getKey().getContentInString());
+ byte[] value = Arrays.copyOfRange(
+ keyValue.getBytes(),
+ keyValue.getValueOffset(),
+ keyValue.getValueOffset() + keyValue.getValueLength());
+ Assertions.assertArrayEquals(value,
TEST_RECORDS.get(i).value.getBytes());
+ // Ensure Hbase record is correct.
+ byte[] key = Arrays.copyOfRange(
+ cell.getRowArray(),
+ cell.getRowOffset(),
+ cell.getRowOffset() + cell.getRowLength());
+ Assertions.assertArrayEquals(TEST_RECORDS.get(i).key.getBytes(), key);
+ value = Arrays.copyOfRange(
+ cell.getValueArray(),
+ cell.getValueOffset(),
+ cell.getValueOffset() + cell.getValueLength());
+ Assertions.assertArrayEquals(value,
TEST_RECORDS.get(i).value.getBytes());
+ i++;
+ } while (hudiReader.next() && scanner.next());
+
+ // Compare some meta information.
+ // LAST KEY.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().containsKey(HFileInfo.LAST_KEY.getBytes()));
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.LAST_KEY).isPresent());
+ if (hudiReader.getMetaInfo(HFileInfo.LAST_KEY).get().length
+ <
hbaseReader.getHFileInfo().get(HFileInfo.LAST_KEY.getBytes()).length) {
+ Assertions.assertTrue(isPrefix(
+ hudiReader.getMetaInfo(HFileInfo.LAST_KEY).get(),
+ hbaseReader.getHFileInfo().get(HFileInfo.LAST_KEY.getBytes())));
+ } else {
+ Assertions.assertTrue(isPrefix(
+ hbaseReader.getHFileInfo().get(HFileInfo.LAST_KEY.getBytes()),
+ hudiReader.getMetaInfo(HFileInfo.LAST_KEY).get()));
+ }
+ // Average key length.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().containsKey(HFileInfo.AVG_KEY_LEN.getBytes()));
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.AVG_KEY_LEN).isPresent());
+ // Average value length.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().containsKey(HFileInfo.AVG_VALUE_LEN.getBytes()));
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.AVG_VALUE_LEN).isPresent());
+ Assertions.assertTrue(
+ hbaseReader.getHFileInfo().getAvgValueLen()
+ >=
readInt(hudiReader.getMetaInfo(HFileInfo.AVG_VALUE_LEN).get(), 0));
+ // MVCC.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().shouldIncludeMemStoreTS());
+ // Note that MemStoreTS is not set.
+ Assertions.assertFalse(hbaseReader.getHFileInfo().isDecodeMemstoreTS());
+
Assertions.assertTrue(hudiReader.getMetaInfo(KEY_VALUE_VERSION).isPresent());
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.MAX_MVCC_TS_KEY).isPresent());
+ Assertions.assertEquals(0L,
+ IOUtils.readLong(
+ hudiReader.getMetaInfo(HFileInfo.MAX_MVCC_TS_KEY).get(), 0));
+ }
+ }
+
+ @Test
+ void testHbaseReaderSucceedsWhenKeyValueVersionIsSetTo1() throws IOException
{
+ String fileName = "hudi-generated-for-keyvalue-versions";
+ Path tempFile = new Path(Files.createTempFile(fileName,
".hfile").toString());
+ // By default this value is set to 1. Here we explicitly set it to 1 for
test purpose.
+ writeHFileWithHudi(tempFile, 1);
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ // Create HBase HFile.Reader from the temporary file
+ HFile.Reader reader = HFile.createReader(fs, new
Path(tempFile.toString()), conf);
+ byte[] keyValueVersion =
reader.getHFileInfo().get(KEY_VALUE_VERSION.getBytes());
+ Assertions.assertEquals(1, IOUtils.readInt(keyValueVersion, 0));
+ // Values from trailer still works.
+ Assertions.assertEquals(5, reader.getEntries());
+ // Scanning the file succeeds.
+ HFileScanner scanner = reader.getScanner(true, true);
+ scanner.seekTo();
+ Assertions.assertDoesNotThrow(() -> {
+ int i = 0;
+ do {
+ Cell cell = scanner.getCell();
+ byte[] key = Arrays.copyOfRange(
+ cell.getRowArray(),
+ cell.getRowOffset(),
+ cell.getRowOffset() + cell.getRowLength());
+ Assertions.assertArrayEquals(TEST_RECORDS.get(i).key.getBytes(), key);
+ i++;
+ } while (scanner.next());
+ });
+ }
+
+ static boolean isPrefix(byte[] prefix, byte[] array) {
+ if (prefix.length > array.length) {
+ return false; // can't be prefix if longer
+ }
+ for (int i = 0; i < prefix.length; i++) {
+ if (prefix[i] != array[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ static HFileReader createHFileReaderFromResource(String fileName) throws
IOException {
+ // Read HFile data from resources
+ byte[] hfileData = readHFileFromResources(fileName);
+ // Convert to ByteBuffer
+ ByteBuffer buffer = ByteBuffer.wrap(hfileData);
+ // Create SeekableDataInputStream
+ SeekableDataInputStream inputStream = new ByteArraySeekableDataInputStream(
+ new ByteBufferBackedInputStream(buffer)
+ );
+ // Create and return HFileReaderImpl
+ return new HFileReaderImpl(inputStream, hfileData.length);
+ }
+
+ static HFile.Reader createHBaseHFileReaderFromResource(String fileName)
throws IOException {
+ // Read HFile data from resources
+ byte[] hfileData = readHFileFromResources(fileName);
+ // Create a temporary file to write the HFile data
+ Path tempFile = new Path(Files.createTempFile("hbase_hfile_",
".hfile").toString());
+ try {
+ // Write the byte array to temporary file
+ Files.write(Paths.get(tempFile.toString()), hfileData);
+ // Create Hadoop Configuration and FileSystem
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ // Create HBase HFile.Reader from the temporary file
+ HFile.Reader reader = HFile.createReader(fs, new
Path(tempFile.toString()), conf);
+ // Note: The temporary file will be cleaned up when the reader is closed
+ // or you can manually delete it after use
+ return reader;
+ } catch (IOException e) {
+ // Clean up temp file if creation fails
+ Files.deleteIfExists(Paths.get(tempFile.toString()));
+ throw e;
+ }
+ }
+
+ private static byte[] readHFileFromResources(String filename) throws
IOException {
+ long size =
Objects.requireNonNull(TestHFileCompatibility.class.getResource(filename))
+ .openConnection().getContentLength();
+ return readAsByteArray(
+ TestHFileReader.class.getResourceAsStream(filename), (int) size);
+ }
+
+ /**
+ * The following are the functions used to generate hfile used in the tests.
+ */
+ void testWriteHFiles() throws IOException, URISyntaxException {
+ String hbaseFile =
Paths.get("src/test/resources/hfile/hbase-generated.hfile").toAbsolutePath().toString();
+ String hudiFile =
Paths.get("src/test/resources/hfile/hudi-generated.hfile").toAbsolutePath().toString();
+ writeHFileWithHbase(new Path(hbaseFile));
+ writeHFileWithHudi(new Path(hudiFile));
+ }
+
+ private void writeHFileWithHbase(Path filePath) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+
+ // Create HFile context with appropriate settings
+ HFileContext context = new HFileContextBuilder()
+ .withBlockSize(64 * 1024)
+ .withCompression(Compression.Algorithm.NONE)
+ .withCellComparator(CellComparatorImpl.COMPARATOR)
+ .withIncludesMvcc(true)
+ .build();
+
+ // Create HBase HFile writer
+ try (HFile.Writer writer = HFile.getWriterFactory(conf, new
CacheConfig(conf))
+ .withPath(fs, filePath).withFileContext(context).create()) {
+ // Write test records as HBase KeyValue objects
+ for (TestRecord record : TEST_RECORDS) {
+ KeyValue kv = new KeyValue(
+ Bytes.toBytes(record.key), // row
+ new byte[0], // family
+ new byte[0], // qualifier
+ 0L, // timestamp
+ Bytes.toBytes(record.value) // value
+ );
+ writer.append(kv);
+ }
+ }
+ }
+
+ private void writeHFileWithHudi(Path filePath) throws IOException {
+ writeHFileWithHudi(filePath, 1);
+ }
+
+ private void writeHFileWithHudi(Path filePath, int keyValueVersion) throws
IOException {
+ org.apache.hudi.io.hfile.HFileContext context =
org.apache.hudi.io.hfile.HFileContext.builder()
+ .blockSize(64 * 1024)
+ .build();
+ try (DataOutputStream outputStream = new DataOutputStream(
+ Files.newOutputStream(Paths.get(filePath.toString())));
+ HFileWriter writer = new HFileWriterImpl(context, outputStream)) {
+ for (TestRecord record : TEST_RECORDS) {
+ writer.append(record.key, record.value.getBytes("UTF-8"));
+ }
+ writer.appendMetaInfo("bloom_filter", "random_string".getBytes());
+ // To validate if a specific KEY_VALUE_VERSION value should be set.
+ if (keyValueVersion != 1) {
+ writer.appendFileInfo(
+ new String(KEY_VALUE_VERSION.getBytes(), StandardCharsets.UTF_8),
toBytes(keyValueVersion));
+ }
+ }
+ }
+
+ // Simple test record class
+ private static class TestRecord {
+ final String key;
+ final String value;
+
+ TestRecord(String key, String value) {
+ this.key = key;
+ this.value = value;
+ }
+
+ @Override
+ public String toString() {
+ return "TestRecord{key='" + key + "', value='" + value + "'}";
+ }
+ }
+}
diff --git
a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReadCompatibility.java
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReadCompatibility.java
new file mode 100644
index 000000000000..45f1dab9c931
--- /dev/null
+++
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReadCompatibility.java
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.hfile;
+
+import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
+import org.apache.hudi.io.ByteArraySeekableDataInputStream;
+import org.apache.hudi.io.SeekableDataInputStream;
+import org.apache.hudi.io.util.IOUtils;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.CellComparatorImpl;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.io.compress.Compression;
+import org.apache.hadoop.hbase.io.hfile.CacheConfig;
+import org.apache.hadoop.hbase.io.hfile.HFile;
+import org.apache.hadoop.hbase.io.hfile.HFileContext;
+import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
+import org.apache.hadoop.hbase.io.hfile.HFileScanner;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import static org.apache.hudi.common.util.FileIOUtils.readAsByteArray;
+import static org.apache.hudi.io.hfile.HFileInfo.KEY_VALUE_VERSION;
+import static org.apache.hudi.io.util.IOUtils.readInt;
+import static org.apache.hudi.io.util.IOUtils.toBytes;
+
+class TestHFileReadCompatibility {
+ // Test data - simple key-value pairs
+ private static final List<TestRecord> TEST_RECORDS = Arrays.asList(
+ new TestRecord("row1", "value1"),
+ new TestRecord("row2", "value2"),
+ new TestRecord("row3", "value3"),
+ new TestRecord("row4", "value4"),
+ new TestRecord("row5", "value5")
+ );
+
+ @ParameterizedTest
+ @CsvSource({
+ "/hfile/hudi-generated.hfile,/hfile/hbase-generated.hfile",
+ "/hfile/hbase-generated.hfile,/hfile/hudi-generated.hfile"
+ })
+ void testHFileReadCompatibility(String hudiFilePath, String hbaseFilePath)
throws Exception {
+ try (HFileReader hudiReader = createHFileReaderFromResource(hudiFilePath);
+ org.apache.hadoop.hbase.io.hfile.HFile.Reader hbaseReader =
+ createHBaseHFileReaderFromResource(hbaseFilePath)) {
+ // Validate number of entries.
+ Assertions.assertEquals(5, hudiReader.getNumKeyValueEntries());
+ Assertions.assertEquals(5, hbaseReader.getEntries());
+ // Validate data block content.
+ hudiReader.seekTo();
+ HFileScanner scanner = hbaseReader.getScanner(true, true);
+ scanner.seekTo();
+ int i = 0;
+ do {
+ org.apache.hudi.io.hfile.KeyValue keyValue =
hudiReader.getKeyValue().get();
+ Cell cell = scanner.getCell();
+ // Ensure Hudi record is correct.
+ Assertions.assertEquals(TEST_RECORDS.get(i).key,
keyValue.getKey().getContentInString());
+ byte[] value = Arrays.copyOfRange(
+ keyValue.getBytes(),
+ keyValue.getValueOffset(),
+ keyValue.getValueOffset() + keyValue.getValueLength());
+ Assertions.assertArrayEquals(value,
TEST_RECORDS.get(i).value.getBytes());
+ // Ensure Hbase record is correct.
+ byte[] key = Arrays.copyOfRange(
+ cell.getRowArray(),
+ cell.getRowOffset(),
+ cell.getRowOffset() + cell.getRowLength());
+ Assertions.assertArrayEquals(TEST_RECORDS.get(i).key.getBytes(), key);
+ value = Arrays.copyOfRange(
+ cell.getValueArray(),
+ cell.getValueOffset(),
+ cell.getValueOffset() + cell.getValueLength());
+ Assertions.assertArrayEquals(value,
TEST_RECORDS.get(i).value.getBytes());
+ i++;
+ } while (hudiReader.next() && scanner.next());
+
+ // Compare some meta information.
+ // LAST KEY.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().containsKey(HFileInfo.LAST_KEY.getBytes()));
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.LAST_KEY).isPresent());
+ // The last key value returned from hbase contains the extra fields,
+ // e.g., column family, column qualifier, timestamp, key type, which is
10 more bytes.
+ // Therefore, the last key value from hudi should be the prefix since
hudi does not use these
+ // extra fields.
+ if (hudiReader.getMetaInfo(HFileInfo.LAST_KEY).get().length
+ <
hbaseReader.getHFileInfo().get(HFileInfo.LAST_KEY.getBytes()).length) {
+ Assertions.assertTrue(isPrefix(
+ hudiReader.getMetaInfo(HFileInfo.LAST_KEY).get(),
+ hbaseReader.getHFileInfo().get(HFileInfo.LAST_KEY.getBytes())));
+ } else {
+ Assertions.assertTrue(isPrefix(
+ hbaseReader.getHFileInfo().get(HFileInfo.LAST_KEY.getBytes()),
+ hudiReader.getMetaInfo(HFileInfo.LAST_KEY).get()));
+ }
+ // Average key length.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().containsKey(HFileInfo.AVG_KEY_LEN.getBytes()));
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.AVG_KEY_LEN).isPresent());
+ // Average value length.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().containsKey(HFileInfo.AVG_VALUE_LEN.getBytes()));
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.AVG_VALUE_LEN).isPresent());
+ Assertions.assertTrue(
+ hbaseReader.getHFileInfo().getAvgValueLen()
+ >=
readInt(hudiReader.getMetaInfo(HFileInfo.AVG_VALUE_LEN).get(), 0));
+ // MVCC.
+
Assertions.assertTrue(hbaseReader.getHFileInfo().shouldIncludeMemStoreTS());
+ // Note that MemStoreTS is not set.
+ Assertions.assertFalse(hbaseReader.getHFileInfo().isDecodeMemstoreTS());
+
Assertions.assertTrue(hudiReader.getMetaInfo(KEY_VALUE_VERSION).isPresent());
+
Assertions.assertTrue(hudiReader.getMetaInfo(HFileInfo.MAX_MVCC_TS_KEY).isPresent());
+ Assertions.assertEquals(0L,
+ IOUtils.readLong(
+ hudiReader.getMetaInfo(HFileInfo.MAX_MVCC_TS_KEY).get(), 0));
+ }
+ }
+
+ @Test
+ void testHbaseReaderSucceedsWhenKeyValueVersionIsSetTo1() throws IOException
{
+ String fileName = "hudi-generated-for-keyvalue-versions";
+ Path tempFile = new Path(Files.createTempFile(fileName,
".hfile").toString());
+ // By default this value is set to 1. Here we explicitly set it to 1 for
testing purposes.
+ writeHFileWithHudi(tempFile, 1);
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ // Create HBase HFile.Reader from the temporary file
+ HFile.Reader reader = HFile.createReader(fs, new
Path(tempFile.toString()), conf);
+ byte[] keyValueVersion =
reader.getHFileInfo().get(KEY_VALUE_VERSION.getBytes());
+ Assertions.assertEquals(1, IOUtils.readInt(keyValueVersion, 0));
+ // Values from the trailer still work.
+ Assertions.assertEquals(5, reader.getEntries());
+ // Scanning the file succeeds.
+ HFileScanner scanner = reader.getScanner(true, true);
+ scanner.seekTo();
+ Assertions.assertDoesNotThrow(() -> {
+ int i = 0;
+ do {
+ Cell cell = scanner.getCell();
+ byte[] key = Arrays.copyOfRange(
+ cell.getRowArray(),
+ cell.getRowOffset(),
+ cell.getRowOffset() + cell.getRowLength());
+ Assertions.assertArrayEquals(TEST_RECORDS.get(i).key.getBytes(), key);
+ i++;
+ } while (scanner.next());
+ });
+ }
+
+ static boolean isPrefix(byte[] prefix, byte[] array) {
+ if (prefix.length > array.length) {
+ return false; // can't be prefix if longer
+ }
+ for (int i = 0; i < prefix.length; i++) {
+ if (prefix[i] != array[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ static HFileReader createHFileReaderFromResource(String fileName) throws
IOException {
+ // Read HFile data from resources
+ byte[] hfileData = readHFileFromResources(fileName);
+ // Convert to ByteBuffer
+ ByteBuffer buffer = ByteBuffer.wrap(hfileData);
+ // Create SeekableDataInputStream
+ SeekableDataInputStream inputStream = new ByteArraySeekableDataInputStream(
+ new ByteBufferBackedInputStream(buffer)
+ );
+ // Create and return HFileReaderImpl
+ return new HFileReaderImpl(inputStream, hfileData.length);
+ }
+
+ static HFile.Reader createHBaseHFileReaderFromResource(String fileName)
throws IOException {
+ // Read HFile data from resources
+ byte[] hfileData = readHFileFromResources(fileName);
+ // Create a temporary file to write the HFile data
+ Path tempFile = new Path(Files.createTempFile("hbase_hfile_",
".hfile").toString());
+ try {
+ // Write the byte array to temporary file
+ Files.write(Paths.get(tempFile.toString()), hfileData);
+ // Create Hadoop Configuration and FileSystem
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ // Create HBase HFile.Reader from the temporary file
+ HFile.Reader reader = HFile.createReader(fs, new
Path(tempFile.toString()), conf);
+ // Note: The temporary file will be cleaned up when the reader is closed
+ // or you can manually delete it after use
+ return reader;
+ } catch (IOException e) {
+ // Clean up temp file if creation fails
+ Files.deleteIfExists(Paths.get(tempFile.toString()));
+ throw e;
+ }
+ }
+
+ private static byte[] readHFileFromResources(String filename) throws
IOException {
+ long size =
Objects.requireNonNull(TestHFileReadCompatibility.class.getResource(filename))
+ .openConnection().getContentLength();
+ return readAsByteArray(
+ TestHFileReader.class.getResourceAsStream(filename), (int) size);
+ }
+
+ /**
+ * The following are the functions used to generate the HFiles used in the tests.
+ */
+ void testWriteHFiles() throws IOException, URISyntaxException {
+ String hbaseFile =
Paths.get("src/test/resources/hfile/hbase-generated.hfile").toAbsolutePath().toString();
+ String hudiFile =
Paths.get("src/test/resources/hfile/hudi-generated.hfile").toAbsolutePath().toString();
+ writeHFileWithHbase(new Path(hbaseFile));
+ writeHFileWithHudi(new Path(hudiFile));
+ }
+
+ private void writeHFileWithHbase(Path filePath) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+
+ // Create HFile context with appropriate settings
+ HFileContext context = new HFileContextBuilder()
+ .withBlockSize(64 * 1024)
+ .withCompression(Compression.Algorithm.NONE)
+ .withCellComparator(CellComparatorImpl.COMPARATOR)
+ .withIncludesMvcc(true)
+ .build();
+
+ // Create HBase HFile writer
+ try (HFile.Writer writer = HFile.getWriterFactory(conf, new
CacheConfig(conf))
+ .withPath(fs, filePath).withFileContext(context).create()) {
+ // Write test records as HBase KeyValue objects
+ for (TestRecord record : TEST_RECORDS) {
+ KeyValue kv = new KeyValue(
+ Bytes.toBytes(record.key), // row
+ new byte[0], // family
+ new byte[0], // qualifier
+ 0L, // timestamp
+ Bytes.toBytes(record.value) // value
+ );
+ writer.append(kv);
+ }
+ }
+ }
+
+ private void writeHFileWithHudi(Path filePath) throws IOException {
+ writeHFileWithHudi(filePath, 1);
+ }
+
+ private void writeHFileWithHudi(Path filePath, int keyValueVersion) throws
IOException {
+ org.apache.hudi.io.hfile.HFileContext context =
org.apache.hudi.io.hfile.HFileContext.builder()
+ .blockSize(64 * 1024)
+ .build();
+ try (DataOutputStream outputStream = new DataOutputStream(
+ Files.newOutputStream(Paths.get(filePath.toString())));
+ HFileWriter writer = new HFileWriterImpl(context, outputStream)) {
+ for (TestRecord record : TEST_RECORDS) {
+ writer.append(record.key, record.value.getBytes("UTF-8"));
+ }
+ writer.appendMetaInfo("bloom_filter", "random_string".getBytes());
+ // Only write KEY_VALUE_VERSION explicitly when it differs from the default of 1.
+ if (keyValueVersion != 1) {
+ writer.appendFileInfo(
+ new String(KEY_VALUE_VERSION.getBytes(), StandardCharsets.UTF_8),
toBytes(keyValueVersion));
+ }
+ }
+ }
+
+ // Simple test record class
+ private static class TestRecord {
+ final String key;
+ final String value;
+
+ TestRecord(String key, String value) {
+ this.key = key;
+ this.value = value;
+ }
+
+ @Override
+ public String toString() {
+ return "TestRecord{key='" + key + "', value='" + value + "'}";
+ }
+ }
+}
diff --git
a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
index fe9cf62dfbb0..da87dddf1ae1 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
@@ -41,6 +41,9 @@ import java.util.Arrays;
import static org.apache.hudi.io.hfile.HFileBlockType.DATA;
import static org.apache.hudi.io.hfile.HFileBlockType.TRAILER;
+import static org.apache.hudi.io.hfile.HFileInfo.AVG_KEY_LEN;
+import static org.apache.hudi.io.hfile.HFileInfo.AVG_VALUE_LEN;
+import static org.apache.hudi.io.hfile.HFileInfo.KEY_VALUE_VERSION;
import static org.apache.hudi.io.hfile.HFileInfo.LAST_KEY;
import static org.apache.hudi.io.hfile.HFileInfo.MAX_MVCC_TS_KEY;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -72,7 +75,7 @@ class TestHFileWriter {
@Test
void testSameKeyLocation() throws IOException {
// 50 bytes for data part limit.
- HFileContext context = new HFileContext.Builder().blockSize(50).build();
+ HFileContext context = new HFileContext.Builder().blockSize(100).build();
String testFile = TEST_FILE;
try (DataOutputStream outputStream =
new DataOutputStream(Files.newOutputStream(Paths.get(testFile)));
@@ -98,7 +101,7 @@ class TestHFileWriter {
reader.initializeMetadata();
assertEquals(20, reader.getNumKeyValueEntries());
HFileTrailer trailer = reader.getTrailer();
- assertEquals(6, trailer.getDataIndexCount());
+ assertEquals(4, trailer.getDataIndexCount());
int i = 0;
for (Key key : reader.getDataBlockIndexMap().keySet()) {
assertArrayEquals(
@@ -107,7 +110,7 @@ class TestHFileWriter {
if (i == 0) {
i++;
} else {
- i += 2;
+ i += 4;
}
}
}
@@ -116,7 +119,7 @@ class TestHFileWriter {
@Test
void testUniqueKeyLocation() throws IOException {
// 50 bytes for data part limit.
- HFileContext context = new HFileContext.Builder().blockSize(50).build();
+ HFileContext context = new HFileContext.Builder().blockSize(100).build();
String testFile = TEST_FILE;
try (DataOutputStream outputStream =
new DataOutputStream(Files.newOutputStream(Paths.get(testFile)));
@@ -138,7 +141,7 @@ class TestHFileWriter {
reader.initializeMetadata();
assertEquals(50, reader.getNumKeyValueEntries());
HFileTrailer trailer = reader.getTrailer();
- assertEquals(25, trailer.getDataIndexCount());
+ assertEquals(13, trailer.getDataIndexCount());
reader.seekTo();
for (int i = 0; i < 50; i++) {
KeyValue kv = reader.getKeyValue().get();
@@ -171,7 +174,7 @@ class TestHFileWriter {
private static void validateHFileSize() throws IOException {
Path path = Paths.get(TEST_FILE);
long actualSize = Files.size(path);
- long expectedSize = 4366L;
+ long expectedSize = 4521;
assertEquals(expectedSize, actualSize);
}
@@ -193,7 +196,11 @@ class TestHFileWriter {
reader.initializeMetadata();
assertEquals(3, reader.getNumKeyValueEntries());
assertTrue(reader.getMetaInfo(LAST_KEY).isPresent());
- assertEquals(1, reader.getMetaInfo(MAX_MVCC_TS_KEY).get().length);
+ assertEquals(4, reader.getMetaInfo(AVG_KEY_LEN).get().length);
+ assertEquals(4, reader.getMetaInfo(AVG_VALUE_LEN).get().length);
+ assertEquals(8, reader.getMetaInfo(MAX_MVCC_TS_KEY).get().length);
+ assertEquals(1,
+
ByteBuffer.wrap(reader.getMetaInfo(KEY_VALUE_VERSION).get()).getInt());
}
}
@@ -244,7 +251,7 @@ class TestHFileWriter {
byte[] key = new byte[keyLen];
buf.get(key);
- byte[] keyContent = Arrays.copyOfRange(key, 2, key.length);
+ byte[] keyContent = Arrays.copyOfRange(key, 2, key.length - 10);
assertArrayEquals(expectedKey.getBytes(StandardCharsets.UTF_8),
keyContent);
byte[] value = new byte[valLen];
diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
new file mode 100644
index 000000000000..93f90141b155
--- /dev/null
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.hfile;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+class TestHfileBlock {
+ @Test
+ void testNumChecksumChunksZeroBytes() {
+ Assertions.assertEquals(0, HFileBlock.numChecksumChunks(0L, 512));
+ }
+
+ @Test
+ void testNumChecksumChunksZeroBytesPerChecksum() {
+ Assertions.assertEquals(0, HFileBlock.numChecksumChunks(100L, 0));
+ }
+
+ @Test
+ void testNumChecksumChunksExactDivision() {
+ Assertions.assertEquals(2, HFileBlock.numChecksumChunks(1024L, 512));
+ }
+
+ @Test
+ void testNumChecksumChunksWithRemainder() {
+ Assertions.assertEquals(3, HFileBlock.numChecksumChunks(1200L, 512));
+ }
+
+ @Test
+ void testNumChecksumChunksSingleChunk() {
+ Assertions.assertEquals(1, HFileBlock.numChecksumChunks(200L, 512));
+ }
+
+ @Test
+ void testNumChecksumChunksOverflowThrows() {
+ long numBytes = ((long) Integer.MAX_VALUE / HFileBlock.CHECKSUM_SIZE + 1)
+ * 1024; // force too many chunks
+ assertThrows(IllegalArgumentException.class,
+ () -> HFileBlock.numChecksumChunks(numBytes, 1024));
+ }
+}
diff --git
a/hudi-io/src/test/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java
b/hudi-io/src/test/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java
new file mode 100644
index 000000000000..c6d66fb10e02
--- /dev/null
+++
b/hudi-io/src/test/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hadoop.hbase.CellComparatorImpl;
+
+/**
+ * This class is used by the HBase HFile reader to read HFiles written by Hudi
+ * together with this comparator class.
+ */
+public class HoodieHBaseKVComparator extends CellComparatorImpl {
+}
diff --git a/hudi-io/src/test/resources/hfile/hbase-generated.hfile
b/hudi-io/src/test/resources/hfile/hbase-generated.hfile
new file mode 100644
index 000000000000..b1ea85fa3648
Binary files /dev/null and
b/hudi-io/src/test/resources/hfile/hbase-generated.hfile differ
diff --git a/hudi-io/src/test/resources/hfile/hudi-generated.hfile
b/hudi-io/src/test/resources/hfile/hudi-generated.hfile
new file mode 100644
index 000000000000..27d5f9af1345
Binary files /dev/null and
b/hudi-io/src/test/resources/hfile/hudi-generated.hfile differ
diff --git a/pom.xml b/pom.xml
index 6fc24473c8a1..984bff9dd673 100644
--- a/pom.xml
+++ b/pom.xml
@@ -415,9 +415,9 @@
<exclude>log4j:log4j</exclude>
<exclude>ch.qos.logback:logback-classic</exclude>
<!-- NOTE: We're banning any HBase deps -->
- <exclude>org.apache.hbase:hbase-common:*</exclude>
- <exclude>org.apache.hbase:hbase-client:*</exclude>
- <exclude>org.apache.hbase:hbase-server:*</exclude>
+
<exclude>org.apache.hbase:hbase-common:*:*:compile</exclude>
+
<exclude>org.apache.hbase:hbase-client:*:*:compile</exclude>
+
<exclude>org.apache.hbase:hbase-server:*:*:compile</exclude>
<!--To upgrade snappy because pre 1.1.8.2 does not work on
m1 mac-->
<exclude>org.xerial.snappy:snappy-java:*</exclude>
</excludes>