yihua commented on code in PR #11579:
URL: https://github.com/apache/hudi/pull/11579#discussion_r1737442509


##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestPartitionStatsIndexWithSql.scala:
##########
@@ -261,43 +262,89 @@ class TestPartitionStatsIndexWithSql extends 
HoodieSparkSqlTestBase {
              |""".stripMargin
         )
 
+        writeAndValidatePartitionStats(tableName, tablePath)
+      }
+    }
+  }
+
+  test(s"Test partition stats index without configuring columns to index") {
+    Seq("cow", "mor").foreach { tableType =>
+      withTempDir { tmp =>
+        val tableName = generateTableName
+        val tablePath = s"${tmp.getCanonicalPath}/$tableName"
+        // create table and enable partition stats without configuring columns 
to index
         spark.sql(
           s"""
-             | insert into $tableName
-             | values (1, 'a1', 1000, 10), (2, 'a2', 2000, 20), (3, 'a3', 
3000, 30), (4, 'a4', 2000, 10), (5, 'a5', 3000, 20), (6, 'a6', 4000, 30)
-             | """.stripMargin
+             |create table $tableName (
+             |  id int,
+             |  name string,
+             |  price int,
+             |  ts long
+             |) using hudi
+             |partitioned by (ts)
+             |tblproperties (
+             |  type = '$tableType',
+             |  primaryKey = 'id',
+             |  preCombineField = 'price',
+             |  hoodie.metadata.index.partition.stats.enable = 'true'
+             |)
+             |location '$tablePath'
+             |""".stripMargin
         )
 
-        // Validate partition_stats index exists
-        val metaClient = HoodieTableMetaClient.builder()
-          .setBasePath(tablePath)
-          .setConf(HoodieTestUtils.getDefaultStorageConf)
-          .build()
-        assertResult(tableName)(metaClient.getTableConfig.getTableName)
-        
assertTrue(metaClient.getTableConfig.getMetadataPartitions.contains(PARTITION_STATS.getPartitionPath))
-
-        spark.sql("set hoodie.metadata.enable=true")
-        spark.sql("set hoodie.enable.data.skipping=true")
-        spark.sql("set hoodie.fileIndex.dataSkippingFailureMode=strict")
-        checkAnswer(s"select id, name, price, ts from $tableName where 
price>3000")(
-          Seq(6, "a6", 4000, 30)
+        writeAndValidatePartitionStats(tableName, tablePath)
+        // validate partition stats index for id column
+        checkAnswer(s"select key, ColumnStatsMetadata.minValue.member1.value, 
ColumnStatsMetadata.maxValue.member1.value from hudi_metadata('$tableName') 
where type=3 and ColumnStatsMetadata.columnName='id'")(
+          Seq(getPartitionStatsIndexKey("ts=10", "id"), 1, 4),
+          Seq(getPartitionStatsIndexKey("ts=20", "id"), 2, 5),
+          Seq(getPartitionStatsIndexKey("ts=30", "id"), 3, 6)
         )
-
-        // Test price update, assert latest value and ensure file pruning
-        spark.sql(s"update $tableName set price = price + 1 where id = 6")
-        checkAnswer(s"select id, name, price, ts from $tableName where 
price>3000")(
-          Seq(6, "a6", 4001, 30)
+        // validate partition stats index for name column
+        checkAnswer(s"select key, ColumnStatsMetadata.minValue.member6.value, 
ColumnStatsMetadata.maxValue.member6.value from hudi_metadata('$tableName') 
where type=3 and ColumnStatsMetadata.columnName='name'")(
+          Seq(getPartitionStatsIndexKey("ts=10", "name"), "a1", "a4"),
+          Seq(getPartitionStatsIndexKey("ts=20", "name"), "a2", "a5"),
+          Seq(getPartitionStatsIndexKey("ts=30", "name"), "a3", "a6")
         )
-
-        verifyFilePruning(
-          Map.apply(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key -> "true", 
HoodieMetadataConfig.ENABLE.key -> "true"),
-          GreaterThan(AttributeReference("price", IntegerType)(), 
Literal(3000)),
-          HoodieTableMetaClient.reload(metaClient),
-          isDataSkippingExpected = true)
       }
     }
   }
 
+  private def writeAndValidatePartitionStats(tableName: String, tablePath: 
String): Unit = {
+    spark.sql(
+      s"""
+         | insert into $tableName
+         | values (1, 'a1', 1000, 10), (2, 'a2', 2000, 20), (3, 'a3', 3000, 
30), (4, 'a4', 2000, 10), (5, 'a5', 3000, 20), (6, 'a6', 4000, 30)
+         | """.stripMargin
+    )
+
+    // Validate partition_stats index exists
+    val metaClient = HoodieTableMetaClient.builder()
+      .setBasePath(tablePath)
+      .setConf(HoodieTestUtils.getDefaultStorageConf)
+      .build()
+    assertResult(tableName)(metaClient.getTableConfig.getTableName)
+    
assertTrue(metaClient.getTableConfig.getMetadataPartitions.contains(PARTITION_STATS.getPartitionPath))
+
+    spark.sql("set hoodie.metadata.enable=true")
+    spark.sql("set hoodie.enable.data.skipping=true")
+    spark.sql("set hoodie.fileIndex.dataSkippingFailureMode=strict")
+    checkAnswer(s"select id, name, price, ts from $tableName where 
price>3000")(
+      Seq(6, "a6", 4000, 30)
+    )
+
+    // Test price update, assert latest value and ensure file pruning
+    spark.sql(s"update $tableName set price = price + 1 where id = 6")
+    checkAnswer(s"select id, name, price, ts from $tableName where 
price>3000")(
+      Seq(6, "a6", 4001, 30)
+    )
+
+    verifyFilePruning(

Review Comment:
   Could we also invoke this validation with data skipping disabled, to verify 
that the partition stats index is then not used for file pruning?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to