This is an automated email from the ASF dual-hosted git repository.
yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new d92b97516d6 [HUDI-6578] Fix log compaction on parquet data blocks in log files (#9253)
d92b97516d6 is described below
commit d92b97516d6ebe8e4455b019bf1fca8af39a0057
Author: Y Ethan Guo <[email protected]>
AuthorDate: Fri Jul 21 13:24:59 2023 -0700
[HUDI-6578] Fix log compaction on parquet data blocks in log files (#9253)
---
.../apache/hudi/table/TestHoodieMergeOnReadTable.java | 16 ++++++++++++----
.../apache/hudi/common/config/HoodieStorageConfig.java | 5 +++++
.../common/table/log/AbstractHoodieLogRecordReader.java | 1 +
3 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java
index 43dea6d3b83..0b410f5a9d1 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java
@@ -23,6 +23,7 @@ import org.apache.hudi.client.SparkRDDReadClient;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.config.HoodieStorageConfig;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieCommitMetadata;
@@ -67,6 +68,7 @@ import org.apache.spark.sql.Row;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
@@ -309,19 +311,25 @@ public class TestHoodieMergeOnReadTable extends SparkClientFunctionalTestHarness
}
@ParameterizedTest
- @ValueSource(booleans = {true})
- public void testLogBlocksCountsAfterLogCompaction(boolean populateMetaFields) throws Exception {
+ @CsvSource({"true,avro", "true,parquet", "false,avro", "false,parquet"})
+ public void testLogBlocksCountsAfterLogCompaction(boolean populateMetaFields, String logFileFormat) throws Exception {
HoodieCompactionConfig compactionConfig = HoodieCompactionConfig.newBuilder()
.withMaxNumDeltaCommitsBeforeCompaction(1)
.withLogCompactionBlocksThreshold(1)
.build();
- // insert 100 recordsx
+ // insert 100 records
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true)
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build())
.withCompactionConfig(compactionConfig);
addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
- HoodieWriteConfig config = cfgBuilder.build();
+ HoodieWriteConfig config = cfgBuilder
+ .withStorageConfig(HoodieStorageConfig.newBuilder()
+ .hfileMaxFileSize(1024 * 1024 * 1024)
+ .parquetMaxFileSize(1024 * 1024 * 1024)
+ .logFileDataBlockFormat(logFileFormat)
+ .build())
+ .build();
setUp(config.getProps());
try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java
index ac432435868..cec7f8f18c5 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java
@@ -339,6 +339,11 @@ public class HoodieStorageConfig extends HoodieConfig {
return this;
}
+ public Builder logFileDataBlockFormat(String format) {
+ storageConfig.setValue(LOGFILE_DATA_BLOCK_FORMAT, format);
+ return this;
+ }
+
public Builder logFileDataBlockMaxSize(long dataBlockSize) {
storageConfig.setValue(LOGFILE_DATA_BLOCK_MAX_SIZE, String.valueOf(dataBlockSize));
return this;
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
index 78e701a07d4..6ef1a6f5542 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
@@ -477,6 +477,7 @@ public abstract class AbstractHoodieLogRecordReader {
switch (logBlock.getBlockType()) {
case HFILE_DATA_BLOCK:
case AVRO_DATA_BLOCK:
+ case PARQUET_DATA_BLOCK:
case DELETE_BLOCK:
List<HoodieLogBlock> logBlocksList = instantToBlocksMap.getOrDefault(instantTime, new ArrayList<>());
if (logBlocksList.size() == 0) {