yihua commented on code in PR #11579:
URL: https://github.com/apache/hudi/pull/11579#discussion_r1692356320
##########
hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java:
##########
@@ -332,7 +332,7 @@ public final class HoodieMetadataConfig extends HoodieConfig {
public static final ConfigProperty<Boolean> ENABLE_METADATA_INDEX_PARTITION_STATS = ConfigProperty
.key(METADATA_PREFIX + ".index.partition.stats.enable")
- .defaultValue(true)
+ .defaultValue(false)
Review Comment:
Have you tried enabling this config? Do all tests still pass?
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestPartitionStatsIndexWithSql.scala:
##########
@@ -261,41 +261,75 @@ class TestPartitionStatsIndexWithSql extends HoodieSparkSqlTestBase {
|""".stripMargin
)
+ writeAndValidatePartitionStats(tableName, tablePath)
+ }
+ }
+ }
+
+ test(s"Test partition stats index without configuring columns to index") {
+ Seq("cow", "mor").foreach { tableType =>
+ withTempDir { tmp =>
+ val tableName = generateTableName
+ val tablePath = s"${tmp.getCanonicalPath}/$tableName"
+ // create table and enable partition stats without configuring columns to index
spark.sql(
s"""
- | insert into $tableName
- | values (1, 'a1', 1000, 10), (2, 'a2', 2000, 20), (3, 'a3', 3000, 30), (4, 'a4', 2000, 10), (5, 'a5', 3000, 20), (6, 'a6', 4000, 30)
- | """.stripMargin
+ |create table $tableName (
+ | id int,
+ | name string,
+ | price int,
+ | ts long
+ |) using hudi
+ |partitioned by (ts)
+ |tblproperties (
+ | type = '$tableType',
+ | primaryKey = 'id',
+ | preCombineField = 'price',
+ | hoodie.metadata.index.partition.stats.enable = 'true'
+ |)
+ |location '$tablePath'
+ |""".stripMargin
)
- // Validate partition_stats index exists
- val metaClient = HoodieTableMetaClient.builder()
- .setBasePath(tablePath)
- .setConf(HoodieTestUtils.getDefaultStorageConf)
- .build()
- assertResult(tableName)(metaClient.getTableConfig.getTableName)
- assertTrue(metaClient.getTableConfig.getMetadataPartitions.contains(PARTITION_STATS.getPartitionPath))
+ writeAndValidatePartitionStats(tableName, tablePath)
+ }
+ }
+ }
- spark.sql("set hoodie.metadata.enable=true")
- spark.sql("set hoodie.enable.data.skipping=true")
- spark.sql("set hoodie.fileIndex.dataSkippingFailureMode=strict")
- checkAnswer(s"select id, name, price, ts from $tableName where price>3000")(
- Seq(6, "a6", 4000, 30)
- )
+ private def writeAndValidatePartitionStats(tableName: String, tablePath: String): Unit = {
+ spark.sql(
+ s"""
+ | insert into $tableName
+ | values (1, 'a1', 1000, 10), (2, 'a2', 2000, 20), (3, 'a3', 3000, 30), (4, 'a4', 2000, 10), (5, 'a5', 3000, 20), (6, 'a6', 4000, 30)
+ | """.stripMargin
+ )
- // Test price update, assert latest value and ensure file pruning
- spark.sql(s"update $tableName set price = price + 1 where id = 6")
- checkAnswer(s"select id, name, price, ts from $tableName where price>3000")(
- Seq(6, "a6", 4001, 30)
- )
+ // Validate partition_stats index exists
+ val metaClient = HoodieTableMetaClient.builder()
+ .setBasePath(tablePath)
+ .setConf(HoodieTestUtils.getDefaultStorageConf)
+ .build()
+ assertResult(tableName)(metaClient.getTableConfig.getTableName)
+ assertTrue(metaClient.getTableConfig.getMetadataPartitions.contains(PARTITION_STATS.getPartitionPath))
Review Comment:
Could we also directly read the partition_stats index and validate the index
content?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]