This is an automated email from the ASF dual-hosted git repository.
vhs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 236dc2233a70 test(lance): Add test of bloomFilter support to
TestLanceDataSource (#18388)
236dc2233a70 is described below
commit 236dc2233a707dfd768047afaf1be6aeffaed21d
Author: Vova Kolmakov <[email protected]>
AuthorDate: Mon Mar 30 22:56:54 2026 +0700
test(lance): Add test of bloomFilter support to TestLanceDataSource (#18388)
Co-authored-by: Vova Kolmakov <[email protected]>
---
.../hudi/functional/TestLanceDataSource.scala | 69 +++++++++++++++++++++-
1 file changed, 68 insertions(+), 1 deletion(-)
diff --git
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLanceDataSource.scala
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLanceDataSource.scala
index 8dabe5bfdacf..662a6885e3e7 100644
---
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLanceDataSource.scala
+++
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLanceDataSource.scala
@@ -19,19 +19,26 @@ package org.apache.hudi.functional
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.DefaultSparkRecordMerger
+import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig}
+import org.apache.hudi.common.engine.HoodieLocalEngineContext
import org.apache.hudi.common.model.HoodieTableType
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
+import org.apache.hudi.common.table.view.{FileSystemViewManager,
FileSystemViewStorageConfig}
import org.apache.hudi.common.testutils.HoodieTestUtils
import org.apache.hudi.config.HoodieWriteConfig
+import org.apache.hudi.io.storage.HoodieSparkLanceReader
+import org.apache.hudi.storage.StoragePath
import org.apache.hudi.testutils.HoodieSparkClientTestBase
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.junit.jupiter.api.{AfterEach, BeforeEach}
-import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
+import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse,
assertNotNull, assertTrue}
import org.junit.jupiter.api.condition.DisabledIfSystemProperty
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.EnumSource
+import java.util.stream.Collectors
+
import scala.collection.JavaConverters._
/**
@@ -717,6 +724,66 @@ class TestLanceDataSource extends
HoodieSparkClientTestBase {
assertTrue(actual.except(expectedDf).isEmpty)
}
+  /**
+   * Writes three records through `writeDataframe`, then opens every latest base file with
+   * [[HoodieSparkLanceReader]] and verifies its bloom filter and its min/max record keys.
+   *
+   * The file-system view is closed in a `finally` block so that a failing assertion does not
+   * leak it.
+   *
+   * @param tableType table type under test; the test runs once per [[HoodieTableType]] value
+   */
+  @ParameterizedTest
+  @EnumSource(value = classOf[HoodieTableType])
+  def testBloomFilterAndMinMaxKeys(tableType: HoodieTableType): Unit = {
+    val tableName = s"test_lance_bloom_${tableType.name().toLowerCase}"
+    val tablePath = s"$basePath/$tableName"
+
+    val records = Seq(
+      (1, "Alice", 30, 95.5),
+      (2, "Bob", 25, 87.3),
+      (3, "Charlie", 35, 92.1)
+    )
+    val df = createDataFrame(records)
+
+    writeDataframe(tableType, tableName, tablePath, df, saveMode = SaveMode.Overwrite,
+      operation = Some("insert"))
+
+    // Build MetaClient and FileSystemView to find lance base files
+    val metaClient = HoodieTableMetaClient.builder()
+      .setConf(HoodieTestUtils.getDefaultStorageConf)
+      .setBasePath(tablePath)
+      .build()
+
+    val engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf)
+    val metadataConfig = HoodieMetadataConfig.newBuilder.build
+    val viewManager = FileSystemViewManager.createViewManager(
+      engineContext, metadataConfig, FileSystemViewStorageConfig.newBuilder.build,
+      HoodieCommonConfig.newBuilder.build,
+      (mc: HoodieTableMetaClient) => metaClient.getTableFormat
+        .getMetadataFactory.create(engineContext, mc.getStorage, metadataConfig, tablePath))
+    val fsView = viewManager.getFileSystemView(metaClient)
+
+    // Everything below may throw (failed assertion, reader error); always release the view.
+    try {
+      // Latest base files under the empty relative partition path.
+      // NOTE(review): presumably the table is non-partitioned, so this covers every base file;
+      // confirm against writeDataframe.
+      val baseFiles = fsView.getLatestBaseFiles("")
+        .collect(Collectors.toList[org.apache.hudi.common.model.HoodieBaseFile])
+      assertFalse(baseFiles.isEmpty, "Should have at least one base file")
+
+      baseFiles.asScala.foreach { baseFile =>
+        val reader = new HoodieSparkLanceReader(new StoragePath(baseFile.getPath))
+        try {
+          // Verify bloom filter
+          val bloomFilter = reader.readBloomFilter()
+          assertNotNull(bloomFilter, "Bloom filter should not be null")
+
+          // All written record keys should be present.
+          // Record keys are expected to be the "id" field values as strings.
+          Seq("1", "2", "3").foreach { key =>
+            assertTrue(bloomFilter.mightContain(key), s"Bloom filter should contain key $key")
+          }
+          // Probabilistic check: a bloom filter can report false positives, so a non-existent
+          // key is only very likely (not guaranteed) to be reported as absent.
+          assertFalse(bloomFilter.mightContain("nonexistent_key"),
+            "Bloom filter should not contain nonexistent key")
+
+          // Verify min/max record keys; check the shape first so a short result fails with a
+          // clear message instead of an index error.
+          val minMaxKeys = reader.readMinMaxRecordKeys()
+          assertEquals(2, minMaxKeys.length, "Expected exactly a min key and a max key")
+          assertEquals("1", minMaxKeys(0), "Min key should be '1'")
+          assertEquals("3", minMaxKeys(1), "Max key should be '3'")
+        } finally {
+          reader.close()
+        }
+      }
+    } finally {
+      fsView.close()
+    }
+  }
+
/**
 * Turns `(Int, String, Int, Double)` tuples into a DataFrame whose columns are named
 * "id", "name", "age" and "score", coalesced to a single partition.
 *
 * @param records rows to convert, one tuple per row
 * @return the named DataFrame with one partition
 */
private def createDataFrame(records: Seq[(Int, String, Int, Double)]) = {
  val columnNames = Seq("id", "name", "age", "score")
  val namedRows = spark.createDataFrame(records).toDF(columnNames: _*)
  namedRows.coalesce(1)
}