codope commented on code in PR #12420:
URL: https://github.com/apache/hudi/pull/12420#discussion_r1874239940
##########
website/docs/sql_ddl.md:
##########
@@ -342,65 +377,65 @@ CREATE TABLE hudi_table_func_index (
) USING HUDI
tblproperties (primaryKey = 'uuid')
PARTITIONED BY (city)
-location 'file:///tmp/hudi_table_func_index';
+location 'file:///tmp/hudi_table_expr_index';
-- disable small file handling so the each insert creates new file --
set hoodie.parquet.small.file.limit=0;
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
-
--- Query with hour function filter but no idex yet --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
+
+-- Query with hour function filter but no index yet --
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.208 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#3465, fare#3464, rider#3462, driver#3463],
Statistics(sizeInBytes=899.5 KiB)
+- Filter ((isnotnull(city#3465) AND isnotnull(ts#3460)) AND (NOT (city#3465 =
chennai) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=2.5 MiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
== Physical Plan ==
*(1) Project [city#3465, fare#3464, rider#3462, driver#3463]
+- *(1) Filter (isnotnull(ts#3460) AND (hour(cast(ts#3460 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
--- create the functional index --
-CREATE INDEX ts_hour ON hudi_table_func_index USING column_stats(ts)
options(func='hour');
+-- create the expression index --
+CREATE INDEX ts_hour ON hudi_table_expr_index USING column_stats(ts)
options(expr='hour');
-- query after creating the index --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.202 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#2970, fare#2969, rider#2967, driver#2968],
Statistics(sizeInBytes=449.8 KiB)
+- Filter ((isnotnull(city#2970) AND isnotnull(ts#2965)) AND (NOT (city#2970 =
chennai) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=1278.3 KiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
== Physical Plan ==
*(1) Project [city#2970, fare#2969, rider#2967, driver#2968]
+- *(1) Filter (isnotnull(ts#2965) AND (hour(cast(ts#2965 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
```
</details>
-#### Create Partition Stats and Secondary Index (Experimental)
+#### Create Partition Stats and Secondary Index
Review Comment:
You're right. Now those two need not be set, as they are true by default.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]