This is an automated email from the ASF dual-hosted git repository.
vbalaji pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 5e769ecb66e [HUDI-7278] make bloom filter skippable for CPU saving (#10457)
5e769ecb66e is described below
commit 5e769ecb66e1d1b6f8c270f7b10f3860a927f273
Author: kongwei <[email protected]>
AuthorDate: Fri Jan 12 17:37:51 2024 +0800
[HUDI-7278] make bloom filter skippable for CPU saving (#10457)
* make bloom filter skippable for CPU saving
---------
Co-authored-by: wei.kong <[email protected]>
---
.../org/apache/hudi/config/HoodieWriteConfig.java | 4 +++
.../io/storage/HoodieSparkFileWriterFactory.java | 3 +--
.../storage/TestHoodieAvroFileWriterFactory.java | 31 ++++++++++++++++++++++
.../hudi/common/config/HoodieStorageConfig.java | 11 ++++++++
.../io/storage/HoodieAvroFileWriterFactory.java | 3 +--
.../hudi/io/storage/HoodieFileWriterFactory.java | 10 +++++++
6 files changed, 58 insertions(+), 4 deletions(-)
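For context, a minimal usage sketch of the new switch (not part of this commit; the
table path and index type below are illustrative): a writer config for a table on a
non-bloom index where the parquet bloom filter is skipped to save CPU.

    import org.apache.hudi.common.config.HoodieStorageConfig;
    import org.apache.hudi.config.HoodieIndexConfig;
    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.hudi.index.HoodieIndex;

    // Sketch: with a SIMPLE index the bloom filter is not needed for lookups,
    // so it can be skipped via the builder method added by this commit.
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi_table") // illustrative path
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.SIMPLE).build())
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .parquetBloomFilterEnable(false)
            .build())
        .build();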
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index dd589cf4618..8fd3546671e 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -2150,6 +2150,10 @@ public class HoodieWriteConfig extends HoodieConfig {
return getString(HoodieStorageConfig.PARQUET_FIELD_ID_WRITE_ENABLED);
}
+  public boolean parquetBloomFilterEnabled() {
+    return getBooleanOrDefault(HoodieStorageConfig.PARQUET_WITH_BLOOM_FILTER_ENABLED);
+  }
+
public Option<HoodieLogBlock.HoodieLogBlockType> getLogDataBlockFormat() {
    return Option.ofNullable(getString(HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT))
.map(HoodieLogBlock.HoodieLogBlockType::fromId);
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
index d2ab83f1481..5feefa3bee2 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
@@ -44,8 +44,7 @@ public class HoodieSparkFileWriterFactory extends HoodieFileWriterFactory {
      String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema,
TaskContextSupplier taskContextSupplier) throws IOException {
    boolean populateMetaFields = config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS);
-    boolean enableBloomFilter = populateMetaFields;
-    Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
+    Option<BloomFilter> filter = enableBloomFilter(populateMetaFields, config) ? Option.of(createBloomFilter(config)) : Option.empty();
    String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME);
// Support PARQUET_COMPRESSION_CODEC_NAME is ""
if (compressionCodecName.isEmpty()) {
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java
index 3afe6ee6708..120ae4fe891 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java
@@ -19,9 +19,11 @@
package org.apache.hudi.io.storage;
import org.apache.hudi.client.SparkTaskContextSupplier;
+import org.apache.hudi.common.config.HoodieStorageConfig;
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.index.HoodieIndex.IndexType;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.testutils.HoodieClientTestBase;
@@ -31,6 +33,7 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -74,4 +77,32 @@ public class TestHoodieAvroFileWriterFactory extends HoodieClientTestBase {
}, "should fail since log storage writer is not supported yet.");
assertTrue(thrown.getMessage().contains("format not supported yet."));
}
+
+  @Test
+  public void testEnableBloomFilter() {
+    HoodieWriteConfig config = getConfig(IndexType.BLOOM);
+    assertTrue(HoodieFileWriterFactory.enableBloomFilter(true, config));
+    assertFalse(HoodieFileWriterFactory.enableBloomFilter(false, config));
+
+    config = getConfig(IndexType.SIMPLE);
+    assertTrue(HoodieFileWriterFactory.enableBloomFilter(true, config));
+
+    config = getConfig(IndexType.SIMPLE);
+    assertTrue(HoodieFileWriterFactory.enableBloomFilter(true, config));
+
+    config = getConfigBuilder(IndexType.BLOOM)
+        .withStorageConfig(HoodieStorageConfig.newBuilder()
+            .parquetBloomFilterEnable(false).build()).build();
+    assertTrue(HoodieFileWriterFactory.enableBloomFilter(true, config));
+
+    config = getConfigBuilder(IndexType.SIMPLE)
+        .withStorageConfig(HoodieStorageConfig.newBuilder()
+            .parquetBloomFilterEnable(true).build()).build();
+    assertTrue(HoodieFileWriterFactory.enableBloomFilter(true, config));
+
+    config = getConfigBuilder(IndexType.SIMPLE)
+        .withStorageConfig(HoodieStorageConfig.newBuilder()
+            .parquetBloomFilterEnable(false).build()).build();
+    assertFalse(HoodieFileWriterFactory.enableBloomFilter(true, config));
+  }
}
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java
index 2660b0b22c8..d68b8326ca8 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java
@@ -152,6 +152,12 @@ public class HoodieStorageConfig extends HoodieConfig {
.withDocumentation("Would only be effective with Spark 3.3+. Sets
spark.sql.parquet.fieldId.write.enabled. "
+ "If enabled, Spark will write out parquet native field ids that
are stored inside StructField's metadata as parquet.field.id to parquet
files.");
+ public static final ConfigProperty<Boolean>
PARQUET_WITH_BLOOM_FILTER_ENABLED = ConfigProperty
+ .key("hoodie.parquet.bloom.filter.enabled")
+ .defaultValue(true)
+ .withDocumentation("Control whether to write bloom filter or not.
Default true. "
+ + "We can set to false in non bloom index cases for CPU resource
saving.");
+
public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM_NAME
= ConfigProperty
.key("hoodie.hfile.compression.algorithm")
.defaultValue("GZ")
@@ -420,6 +426,11 @@ public class HoodieStorageConfig extends HoodieConfig {
return this;
}
+    public Builder parquetBloomFilterEnable(boolean parquetBloomFilterEnable) {
+      storageConfig.setValue(PARQUET_WITH_BLOOM_FILTER_ENABLED, String.valueOf(parquetBloomFilterEnable));
+      return this;
+    }
+
    public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) {
      storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm);
return this;
diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java
index 8ed597ed920..471ab149fa5 100644
--- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java
+++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java
@@ -51,8 +51,7 @@ public class HoodieAvroFileWriterFactory extends HoodieFileWriterFactory {
      String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema,
TaskContextSupplier taskContextSupplier) throws IOException {
    boolean populateMetaFields = config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS);
-    boolean enableBloomFilter = populateMetaFields;
-    HoodieAvroWriteSupport writeSupport = getHoodieAvroWriteSupport(conf, schema, config, enableBloomFilter);
+    HoodieAvroWriteSupport writeSupport = getHoodieAvroWriteSupport(conf, schema, config, enableBloomFilter(populateMetaFields, config));
    String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME);
// Support PARQUET_COMPRESSION_CODEC_NAME is ""
diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java
index b968d0572b8..133feedb0ea 100644
--- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java
+++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java
@@ -128,4 +128,14 @@ public class HoodieFileWriterFactory {
config.getIntOrDefault(HoodieStorageConfig.BLOOM_FILTER_DYNAMIC_MAX_ENTRIES),
config.getStringOrDefault(HoodieStorageConfig.BLOOM_FILTER_TYPE));
}
+
+  /**
+   * Checks whether the bloom filter should be enabled.
+   */
+  public static boolean enableBloomFilter(boolean populateMetaFields, HoodieConfig config) {
+    return populateMetaFields && (config.getBooleanOrDefault(HoodieStorageConfig.PARQUET_WITH_BLOOM_FILTER_ENABLED)
+        // HoodieIndexConfig lives in hudi-client-common, which itself depends on hudi-common,
+        // so referencing HoodieIndexConfig here would create a circular dependency; check the raw key instead.
+        || (config.contains("hoodie.index.type") && config.getString("hoodie.index.type").contains("BLOOM")));
+  }
}
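For reference, the same switch expressed through the raw config key when writing via
the Spark datasource (a sketch; "df" is assumed to be a Dataset<Row>, and the table
name and path are illustrative):

    import org.apache.spark.sql.SaveMode;

    df.write().format("hudi")
        .option("hoodie.table.name", "my_table")
        .option("hoodie.index.type", "SIMPLE")
        // key added by this commit; with a non-bloom index and this set to false,
        // enableBloomFilter(...) above returns false and no bloom filter is
        // computed during the write
        .option("hoodie.parquet.bloom.filter.enabled", "false")
        .mode(SaveMode.Append)
        .save("/tmp/hudi_table");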