This is an automated email from the ASF dual-hosted git repository.
codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new ca058f35b41 [HUDI-8742] Add expression index test with auto keygen (#12517)
ca058f35b41 is described below
commit ca058f35b413bce99ca192db575b8d5e90594f75
Author: Lokesh Jain <[email protected]>
AuthorDate: Mon Dec 23 20:36:18 2024 +0530
[HUDI-8742] Add expression index test with auto keygen (#12517)
---
.../hudi/command/index/TestExpressionIndex.scala | 88 ++++++++++++++++++++--
1 file changed, 81 insertions(+), 7 deletions(-)
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestExpressionIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestExpressionIndex.scala
index e6329809628..c3dbc771be6 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestExpressionIndex.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestExpressionIndex.scala
@@ -35,7 +35,7 @@ import org.apache.hudi.config.{HoodieCleanConfig,
HoodieCompactionConfig, Hoodie
import org.apache.hudi.hive.testutils.HiveTestUtil
import org.apache.hudi.hive.{HiveSyncTool, HoodieHiveSyncClient}
import org.apache.hudi.index.HoodieIndex
-import org.apache.hudi.index.expression.{HoodieExpressionIndex,
HoodieSparkExpressionIndex}
+import org.apache.hudi.index.expression.HoodieExpressionIndex
import org.apache.hudi.metadata.{HoodieMetadataFileSystemView,
MetadataPartitionType}
import org.apache.hudi.storage.StoragePath
import org.apache.hudi.sync.common.HoodieSyncConfig.{META_SYNC_BASE_PATH,
META_SYNC_DATABASE_NAME, META_SYNC_NO_PARTITION_METADATA, META_SYNC_TABLE_NAME}
@@ -743,11 +743,87 @@ class TestExpressionIndex extends HoodieSparkSqlTestBase {
}
}
+ /**
+ * Test expression index with auto generation of record keys
+ */
+ test("Test Expression Index With AutoGen") {
+ withTempDir { tmp =>
+ Seq("cow", "mor").foreach { tableType =>
+ val tableName = generateTableName + s"_stats_pruning_binary_$tableType"
+ val basePath = s"${tmp.getCanonicalPath}/$tableName"
+
+ spark.sql("set hoodie.fileIndex.dataSkippingFailureMode=strict")
+
+ spark.sql(
+ s"""
+ CREATE TABLE $tableName (
+ | ts LONG,
+ | id STRING,
+ | rider STRING,
+ | driver STRING,
+ | fare DOUBLE,
+ | dateDefault STRING,
+ | date STRING,
+ | city STRING,
+ | state STRING
+ |) USING HUDI
+ |options(
+ | type = '$tableType',
+ | hoodie.metadata.enable = 'true',
+ | hoodie.enable.data.skipping = 'true'
+ |)
+ |PARTITIONED BY (state)
+ |location '$basePath'
+ |""".stripMargin)
+
+ spark.sql("set hoodie.parquet.small.file.limit=0")
+ if (HoodieSparkUtils.gteqSpark3_4) {
+ spark.sql("set spark.sql.defaultColumn.enabled=false")
+ }
+
+ spark.sql(
+ s"""
+ |insert into $tableName(ts, id, rider, driver, fare, dateDefault,
date, city, state) VALUES
+ | (1695414520,'trip2','rider-C','driver-M',27.70,'2024-11-30
01:30:40', '2024-11-30', 'sunnyvale','california'),
+ | (1699349649,'trip5','rider-A','driver-Q',3.32, '2019-11-30
01:30:40', '2019-11-30', 'san_diego','texas')
+ |""".stripMargin)
+ val tableSchema: StructType =
+ StructType(
+ Seq(
+ StructField("ts", LongType),
+ StructField("id", StringType),
+ StructField("rider", StringType),
+ StructField("driver", StringType),
+ StructField("fare", DoubleType),
+ StructField("dateDefault", StringType),
+ StructField("date", StringType),
+ StructField("city", StringType),
+ StructField("state", StringType)
+ )
+ )
+
+ spark.sql(s"create index idx_datestr on $tableName using
column_stats(ts) options(expr='from_unixtime')")
+ checkAnswer(s"select id, rider, from_unixtime(ts) from $tableName
where from_unixtime(ts) > '1970-01-03'")(
+ Seq("trip2", "rider-C", "2023-09-22 20:28:40"),
+ Seq("trip5", "rider-A", "2023-11-07 09:34:09")
+ )
+
+ // validate pruning
+ val opts = Map.apply(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key ->
"true", HoodieMetadataConfig.ENABLE.key -> "true")
+ val metaClient = createMetaClient(spark, basePath)
+ val fromUnixTime = resolveExpr(spark,
unapply(functions.from_unixtime(functions.col("ts"))).get, tableSchema)
+ val literal = Literal.create("2023-09-22 20:28:40")
+ val dataFilter = EqualTo(fromUnixTime, literal)
+ verifyFilePruning(opts, dataFilter, metaClient, isDataSkippingExpected
= true)
+ spark.sql(s"drop index idx_datestr on $tableName")
+ }
+ }
+ }
+
/**
* Test expression index with invalid options
*/
- @Test
- def testInvalidOptions(): Unit = {
+ test("Test Expression Index Creation With Invalid Options") {
withTempDir { tmp =>
Seq("cow", "mor").foreach { tableType =>
val tableName = generateTableName + s"_stats_pruning_binary_$tableType"
@@ -802,8 +878,7 @@ class TestExpressionIndex extends HoodieSparkSqlTestBase {
/**
* Test expression index with data skipping for date and timestamp based
expressions.
*/
- @Test
- def testColumnStatsPruningWithDateTimestampExpressions(): Unit = {
+ test("Test Expression Index Column Stat Pruning With Timestamp Expressions")
{
withTempDir { tmp =>
Seq("cow", "mor").foreach { tableType =>
val tableName = generateTableName +
s"_stats_pruning_date_expr_$tableType"
@@ -952,8 +1027,7 @@ class TestExpressionIndex extends HoodieSparkSqlTestBase {
/**
* Test expression index with data skipping for string expressions.
*/
- @Test
- def testColumnStatsPruningWithStringExpressions(): Unit = {
+ test("Test Expression Index Column Stat Pruning With String Expression") {
withTempDir { tmp =>
Seq("cow", "mor").foreach { tableType =>
val tableName = generateTableName +
s"_stats_pruning_string_expr_$tableType"