vinothchandar commented on code in PR #12420:
URL: https://github.com/apache/hudi/pull/12420#discussion_r1874184392
##########
website/docs/quick-start-guide.md:
##########
@@ -1140,11 +1175,17 @@ multiple records with the same key. See section below.
## Ordering Field
Hudi also allows users to specify a _precombine_ field, which will be used to
order and resolve conflicts between multiple versions of the same record. This
is very important for
-use-cases like applying database CDC logs to a Hudi table, where a given
record may be appear multiple times in the source data due to repeated upstream
updates.
+use-cases like applying database CDC logs to a Hudi table, where a given
record may appear multiple times in the source data due to repeated upstream
updates.
Hudi also uses this mechanism to support out-of-order data arrival into a
table, where records may need to be resolved in a different order than their
commit time.
-For e.g using a _created_at_ timestamp field as the precombine field will
prevent older versions of a record from overwriting newer ones or being exposed
to queries, even
+For e.g. using a _created_at_ timestamp field as the precombine field will
prevent older versions of a record from overwriting newer ones or being exposed
to queries, even
if they are written at a later commit time to the table. This is one of the
key features, that makes Hudi, best suited for dealing with streaming data.
+To enable different merge semantics, Hudi supports merge modes, which define
how the base and log files are ordered in a
+file slice and further how different records with the same record key within
that file slice are merged consistently to
Review Comment:
can we use `ordering.field` instead of precombine? We have two configs, right.
##########
website/docs/sql_ddl.md:
##########
@@ -342,65 +377,65 @@ CREATE TABLE hudi_table_func_index (
) USING HUDI
tblproperties (primaryKey = 'uuid')
PARTITIONED BY (city)
-location 'file:///tmp/hudi_table_func_index';
+location 'file:///tmp/hudi_table_expr_index';
-- disable small file handling so the each insert creates new file --
set hoodie.parquet.small.file.limit=0;
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
-
--- Query with hour function filter but no idex yet --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
+
+-- Query with hour function filter but no index yet --
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.208 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#3465, fare#3464, rider#3462, driver#3463],
Statistics(sizeInBytes=899.5 KiB)
+- Filter ((isnotnull(city#3465) AND isnotnull(ts#3460)) AND (NOT (city#3465 =
chennai) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=2.5 MiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
== Physical Plan ==
*(1) Project [city#3465, fare#3464, rider#3462, driver#3463]
+- *(1) Filter (isnotnull(ts#3460) AND (hour(cast(ts#3460 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
--- create the functional index --
-CREATE INDEX ts_hour ON hudi_table_func_index USING column_stats(ts)
options(func='hour');
+-- create the expression index --
+CREATE INDEX ts_hour ON hudi_table_expr_index USING column_stats(ts)
options(expr='hour');
-- query after creating the index --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.202 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#2970, fare#2969, rider#2967, driver#2968],
Statistics(sizeInBytes=449.8 KiB)
+- Filter ((isnotnull(city#2970) AND isnotnull(ts#2965)) AND (NOT (city#2970 =
chennai) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=1278.3 KiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
== Physical Plan ==
*(1) Project [city#2970, fare#2969, rider#2967, driver#2968]
+- *(1) Filter (isnotnull(ts#2965) AND (hour(cast(ts#2965 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
```
</details>
-#### Create Partition Stats and Secondary Index (Experimental)
+#### Create Partition Stats and Secondary Index
Review Comment:
do we need to set hoodie.metadata.enable and hoodie.enable.data.skipping
anymore? Aren't they the defaults?
##########
website/docs/quick-start-guide.md:
##########
@@ -627,13 +647,28 @@ WHEN NOT MATCHED THEN INSERT *
:::info Key requirements
1. For a Hudi table with user defined primary record [keys](#keys), the join
condition is expected to contain the primary keys of the table.
-For a Hudi table with Hudi generated primary keys, the join condition can be
on any arbitrary data columns.
-2. For Merge-On-Read tables, partial column updates are not yet supported,
i.e. **all columns** need to be SET from a
-MERGE statement either using `SET *` or using `SET column1 = expression1 [,
column2 = expression2 ...]`.
+For a Hudi table with Hudi generated primary keys, the join condition can be
on any arbitrary data columns.
:::
</TabItem>
</Tabs>
+### Merging Data with Partial Updates {#merge-partial-update}
+
+Partial updates only write updated columns instead of full update record. This
is useful when you have hundreds of
+columns and only a few columns are updated. It reduces the write amplification
as well as helps in lowering the query
+latency. `MERGE INTO` statement above can be modified to use partial updates
as shown below.
+
+```sql
+MERGE INTO hudi_table AS target
+USING fare_adjustment AS source
+ON target.uuid = source.uuid
+WHEN MATCHED THEN UPDATE SET fare = source.fare
Review Comment:
Sorry, what exactly is the modification from the example above? The previous
one does not use `UPDATE SET *`.
##########
website/docs/sql_ddl.md:
##########
@@ -473,8 +508,10 @@ Time taken: 0.83 seconds, Fetched 2 row(s)
- Predicate on internal meta fields such as `_hoodie_record_key` or
`_hoodie_partition_path` cannot be used for data
skipping. Queries with such predicates cannot leverage the indexes.
- Secondary index is not supported for nested fields.
+- Secondary index can be created only if record index is available in the table
+- Secondary index can only be used for tables using
OverwriteWithLatestAvroPayload payload or COMMIT_TIME_ORDERING merge mode
Review Comment:
What's the equivalent? Should we say it's not supported for tables with
different event time merge modes? We cannot write all this in terms of payload
alone. We should write based on merge mode and then also add a note on how it
works for payloads.
##########
website/docs/sql_ddl.md:
##########
@@ -103,6 +103,49 @@ TBLPROPERTIES (
);
```
+### Create table with record merge mode {#create-table-with-record-merge-mode}
+
+Hudi supports different [record merge modes](/docs/next/record_merger) to
handle merge of incoming records with existing
+records. To create a table with specific record merge mode, you can set
`recordMergeMode` option.
+
+```sql
+CREATE TABLE IF NOT EXISTS hudi_table_merge_mode (
+ id INT,
+ name STRING,
+ ts LONG,
+ price DOUBLE
+) USING hudi
+TBLPROPERTIES (
+ type = 'mor',
+ primaryKey = 'id',
+ precombineField = 'ts',
+ recordMergeMode = 'EVENT_TIME_ORDERING'
+)
+LOCATION 'file:///tmp/hudi_table_merge_mode/';
+```
+
+With `EVENT_TIME_ORDERING`, the record with the larger event time
(`precombineField`) overwrites the record with the
+smaller event time on the same key, regardless of transaction time. Users can
set `CUSTOM` mode to provide their own
+merge logic. With `CUSTOM` merge mode, you also need to provide your payload
class that implements the merge logic. For
+example, you can use `PartialUpdateAvroPayload` to merge the records as below.
+
+```sql
+CREATE TABLE IF NOT EXISTS hudi_table_merge_mode_custom (
+ id INT,
+ name STRING,
+ ts LONG,
+ price DOUBLE
+) USING hudi
+TBLPROPERTIES (
+ type = 'mor',
+ primaryKey = 'id',
+ precombineField = 'ts',
+ recordMergeMode = 'CUSTOM',
+ 'hoodie.datasource.write.payload.class' =
'org.apache.hudi.common.model.PartialUpdateAvroPayload'
Review Comment:
we need to show merger... not a payload.
##########
website/docs/sql_ddl.md:
##########
@@ -342,65 +377,65 @@ CREATE TABLE hudi_table_func_index (
) USING HUDI
tblproperties (primaryKey = 'uuid')
PARTITIONED BY (city)
-location 'file:///tmp/hudi_table_func_index';
+location 'file:///tmp/hudi_table_expr_index';
-- disable small file handling so the each insert creates new file --
set hoodie.parquet.small.file.limit=0;
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
-
--- Query with hour function filter but no idex yet --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
+
+-- Query with hour function filter but no index yet --
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.208 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#3465, fare#3464, rider#3462, driver#3463],
Statistics(sizeInBytes=899.5 KiB)
+- Filter ((isnotnull(city#3465) AND isnotnull(ts#3460)) AND (NOT (city#3465 =
chennai) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=2.5 MiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
== Physical Plan ==
*(1) Project [city#3465, fare#3464, rider#3462, driver#3463]
+- *(1) Filter (isnotnull(ts#3460) AND (hour(cast(ts#3460 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
--- create the functional index --
-CREATE INDEX ts_hour ON hudi_table_func_index USING column_stats(ts)
options(func='hour');
+-- create the expression index --
+CREATE INDEX ts_hour ON hudi_table_expr_index USING column_stats(ts)
options(expr='hour');
-- query after creating the index --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.202 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#2970, fare#2969, rider#2967, driver#2968],
Statistics(sizeInBytes=449.8 KiB)
+- Filter ((isnotnull(city#2970) AND isnotnull(ts#2965)) AND (NOT (city#2970 =
chennai) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=1278.3 KiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
== Physical Plan ==
*(1) Project [city#2970, fare#2969, rider#2967, driver#2968]
+- *(1) Filter (isnotnull(ts#2965) AND (hour(cast(ts#2965 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
```
</details>
-#### Create Partition Stats and Secondary Index (Experimental)
+#### Create Partition Stats and Secondary Index
Review Comment:
we should simplify this down as much as possible..
##########
website/docs/metadata_indexing.md:
##########
@@ -11,27 +11,30 @@ of Hudi depends on the metadata table. Different types of
index, from `files` in
to `column_stats` index for data skipping, are part of the metadata table. A
fundamental tradeoff in any data system
that supports indices is to balance the write throughput with index updates. A
brute-force way is to lock out the writes
while indexing. Hudi supports index creation using SQL, Datasource as well as
async indexing. However, very large tables
-can take hours to index. This is where Hudi's novel asynchronous metadata
indexing comes into play. Indexes in Hudi are
-created in two phases and uses a mix of optimistic concurrency control and
log-based concurrency control models. The two
+can take hours to index. This is where Hudi's novel concurrent indexing comes
into play.
+
+## Concurrent Indexing
+
+Indexes in Hudi are created in two phases and uses a mix of optimistic
concurrency control and log-based concurrency control models. The two
phase approach ensures that the other writers are unblocked.
- Scheduling - This is the first phase which schedules an indexing plan and is
protected by a lock. Indexing plan considers all the completed commits upto
indexing instant.
- Execution - This phase creates the index files as mentioned in the index
plan. At the end of the phase Hudi ensures the completed commits after indexing
instant used already created index plan to add corresponding index metadata.
This check is protected by a metadata table lock and in case of failures
indexing is aborted.
-We can now create different metadata indices, including `files`,
`bloom_filters`, `column_stats`, `partition_stats` and `record_index`
-asynchronously in Hudi, which are then used by readers and writers to improve
performance. Being able to index without blocking writing
-has two benefits,
+We can now create different metadata indices, including `files`,
`bloom_filters`, `column_stats`, `partition_stats`, `record_index`,
`secondary_index`
Review Comment:
again, terminology.
metadata table : underlying infrastructure to store metadata+indexes.
metadata : file listings, col_stats, partition_stats (whatever is needed to
plan standard queries)
index: rli, ei, si.. bloom index.. (actual indexes)
##########
website/docs/sql_ddl.md:
##########
@@ -103,6 +103,49 @@ TBLPROPERTIES (
);
```
+### Create table with record merge mode {#create-table-with-record-merge-mode}
+
+Hudi supports different [record merge modes](/docs/next/record_merger) to
handle merge of incoming records with existing
+records. To create a table with specific record merge mode, you can set
`recordMergeMode` option.
+
+```sql
+CREATE TABLE IF NOT EXISTS hudi_table_merge_mode (
+ id INT,
+ name STRING,
+ ts LONG,
+ price DOUBLE
+) USING hudi
+TBLPROPERTIES (
+ type = 'mor',
+ primaryKey = 'id',
+ precombineField = 'ts',
Review Comment:
same. can we use ordering field cfg
##########
website/docs/quick-start-guide.md:
##########
@@ -454,27 +454,47 @@ VALUES
(1695173887,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai' ),
(1695115999,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
--- Create bloom filter expression index on city column
-CREATE INDEX idx_bloom_city ON hudi_indexed_table USING bloom_filters(city)
OPTIONS(expr='identity');
+-- Create bloom filter expression index on driver column
Review Comment:
Above — why do we need to configure the payload class? Can we remove this?
hoodie.datasource.write.payload.class =
"org.apache.hudi.common.model.OverwriteWithLatestAvroPayload"
##########
website/docs/quick-start-guide.md:
##########
@@ -454,27 +454,47 @@ VALUES
(1695173887,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai' ),
(1695115999,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
--- Create bloom filter expression index on city column
-CREATE INDEX idx_bloom_city ON hudi_indexed_table USING bloom_filters(city)
OPTIONS(expr='identity');
+-- Create bloom filter expression index on driver column
+CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING
bloom_filters(driver) OPTIONS(expr='identity');
-- It would show bloom filter expression index
SHOW INDEXES FROM hudi_indexed_table;
--- Query on city column would prune the data using the idx_bloom_city index
-SELECT uuid, rider FROM hudi_indexed_table WHERE city = 'san_francisco';
+-- Query on driver column would prune the data using the idx_bloom_driver index
+SELECT uuid, rider FROM hudi_indexed_table WHERE driver = 'driver-S';
-- Create column stat expression index on ts column
-CREATE INDEX idx_column_driver ON hudi_indexed_table USING column_stats(rider)
OPTIONS(expr='upper');
+CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts)
OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd');
-- Shows both expression indexes
SHOW INDEXES FROM hudi_indexed_table;
-- Query on ts column would prune the data using the idx_column_ts index
-SELECT * FROM hudi_indexed_table WHERE upper(driver) = 'DRIVER-S';
+SELECT * FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') =
'2023-09-24';
-- Create secondary index on rider column
CREATE INDEX record_index ON hudi_indexed_table (uuid);
CREATE INDEX idx_rider ON hudi_indexed_table (rider);
+SET hoodie.metadata.record.index.enable=true;
Review Comment:
To confirm — is data skipping already enabled by default?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]