vinothchandar commented on code in PR #12420:
URL: https://github.com/apache/hudi/pull/12420#discussion_r1874184392
##########
website/docs/quick-start-guide.md:
##########
@@ -1140,11 +1175,17 @@ multiple records with the same key. See section below.
## Ordering Field
Hudi also allows users to specify a _precombine_ field, which will be used to
order and resolve conflicts between multiple versions of the same record. This
is very important for
-use-cases like applying database CDC logs to a Hudi table, where a given
record may be appear multiple times in the source data due to repeated upstream
updates.
+use-cases like applying database CDC logs to a Hudi table, where a given
record may appear multiple times in the source data due to repeated upstream
updates.
Hudi also uses this mechanism to support out-of-order data arrival into a
table, where records may need to be resolved in a different order than their
commit time.
-For e.g using a _created_at_ timestamp field as the precombine field will
prevent older versions of a record from overwriting newer ones or being exposed
to queries, even
+For e.g. using a _created_at_ timestamp field as the precombine field will
prevent older versions of a record from overwriting newer ones or being exposed
to queries, even
if they are written at a later commit time to the table. This is one of the
key features, that makes Hudi, best suited for dealing with streaming data.
+To enable different merge semantics, Hudi supports merge modes, which define
how the base and log files are ordered in a
+file slice and further how different records with the same record key within
that file slice are merged consistently to
Review Comment:
can we use `ordering.field` instead of precombine? We have two configs, right.
##########
website/docs/sql_ddl.md:
##########
@@ -342,65 +377,65 @@ CREATE TABLE hudi_table_func_index (
) USING HUDI
tblproperties (primaryKey = 'uuid')
PARTITIONED BY (city)
-location 'file:///tmp/hudi_table_func_index';
+location 'file:///tmp/hudi_table_expr_index';
-- disable small file handling so the each insert creates new file --
set hoodie.parquet.small.file.limit=0;
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
-
--- Query with hour function filter but no idex yet --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
+
+-- Query with hour function filter but no index yet --
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.208 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#3465, fare#3464, rider#3462, driver#3463],
Statistics(sizeInBytes=899.5 KiB)
+- Filter ((isnotnull(city#3465) AND isnotnull(ts#3460)) AND (NOT (city#3465 =
chennai) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=2.5 MiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
== Physical Plan ==
*(1) Project [city#3465, fare#3464, rider#3462, driver#3463]
+- *(1) Filter (isnotnull(ts#3460) AND (hour(cast(ts#3460 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
--- create the functional index --
-CREATE INDEX ts_hour ON hudi_table_func_index USING column_stats(ts)
options(func='hour');
+-- create the expression index --
+CREATE INDEX ts_hour ON hudi_table_expr_index USING column_stats(ts)
options(expr='hour');
-- query after creating the index --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.202 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#2970, fare#2969, rider#2967, driver#2968],
Statistics(sizeInBytes=449.8 KiB)
+- Filter ((isnotnull(city#2970) AND isnotnull(ts#2965)) AND (NOT (city#2970 =
chennai) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=1278.3 KiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
== Physical Plan ==
*(1) Project [city#2970, fare#2969, rider#2967, driver#2968]
+- *(1) Filter (isnotnull(ts#2965) AND (hour(cast(ts#2965 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
```
</details>
-#### Create Partition Stats and Secondary Index (Experimental)
+#### Create Partition Stats and Secondary Index
Review Comment:
do we need to set hoodie.metadata.enable and hoodie.enable.data.skipping
anymore? Aren't they the defaults?
##########
website/docs/quick-start-guide.md:
##########
@@ -627,13 +647,28 @@ WHEN NOT MATCHED THEN INSERT *
:::info Key requirements
1. For a Hudi table with user defined primary record [keys](#keys), the join
condition is expected to contain the primary keys of the table.
-For a Hudi table with Hudi generated primary keys, the join condition can be
on any arbitrary data columns.
-2. For Merge-On-Read tables, partial column updates are not yet supported,
i.e. **all columns** need to be SET from a
-MERGE statement either using `SET *` or using `SET column1 = expression1 [,
column2 = expression2 ...]`.
+For a Hudi table with Hudi generated primary keys, the join condition can be
on any arbitrary data columns.
:::
</TabItem>
</Tabs>
+### Merging Data with Partial Updates {#merge-partial-update}
+
+Partial updates only write updated columns instead of full update record. This
is useful when you have hundreds of
+columns and only a few columns are updated. It reduces the write amplification
as well as helps in lowering the query
+latency. `MERGE INTO` statement above can be modified to use partial updates
as shown below.
+
+```sql
+MERGE INTO hudi_table AS target
+USING fare_adjustment AS source
+ON target.uuid = source.uuid
+WHEN MATCHED THEN UPDATE SET fare = source.fare
Review Comment:
Sorry, what exactly is the modification from the example above? The previous
one does not use `UPDATE SET *`.
##########
website/docs/sql_ddl.md:
##########
@@ -473,8 +508,10 @@ Time taken: 0.83 seconds, Fetched 2 row(s)
- Predicate on internal meta fields such as `_hoodie_record_key` or
`_hoodie_partition_path` cannot be used for data
skipping. Queries with such predicates cannot leverage the indexes.
- Secondary index is not supported for nested fields.
+- Secondary index can be created only if record index is available in the table
+- Secondary index can only be used for tables using
OverwriteWithLatestAvroPayload payload or COMMIT_TIME_ORDERING merge mode
Review Comment:
What's the equivalent? Should we say it's not supported for tables with
different event time merge modes? We cannot write all this in terms of payload
alone. We should write based on merge mode and then also add a note on how it
works for payloads.
##########
website/docs/sql_ddl.md:
##########
@@ -103,6 +103,49 @@ TBLPROPERTIES (
);
```
+### Create table with record merge mode {#create-table-with-record-merge-mode}
+
+Hudi supports different [record merge modes](/docs/next/record_merger) to
handle merge of incoming records with existing
+records. To create a table with specific record merge mode, you can set
`recordMergeMode` option.
+
+```sql
+CREATE TABLE IF NOT EXISTS hudi_table_merge_mode (
+ id INT,
+ name STRING,
+ ts LONG,
+ price DOUBLE
+) USING hudi
+TBLPROPERTIES (
+ type = 'mor',
+ primaryKey = 'id',
+ precombineField = 'ts',
+ recordMergeMode = 'EVENT_TIME_ORDERING'
+)
+LOCATION 'file:///tmp/hudi_table_merge_mode/';
+```
+
+With `EVENT_TIME_ORDERING`, the record with the larger event time
(`precombineField`) overwrites the record with the
+smaller event time on the same key, regardless of transaction time. Users can
set `CUSTOM` mode to provide their own
+merge logic. With `CUSTOM` merge mode, you also need to provide your payload
class that implements the merge logic. For
+example, you can use `PartialUpdateAvroPayload` to merge the records as below.
+
+```sql
+CREATE TABLE IF NOT EXISTS hudi_table_merge_mode_custom (
+ id INT,
+ name STRING,
+ ts LONG,
+ price DOUBLE
+) USING hudi
+TBLPROPERTIES (
+ type = 'mor',
+ primaryKey = 'id',
+ precombineField = 'ts',
+ recordMergeMode = 'CUSTOM',
+ 'hoodie.datasource.write.payload.class' =
'org.apache.hudi.common.model.PartialUpdateAvroPayload'
Review Comment:
we need to show merger... not a payload.
##########
website/docs/sql_ddl.md:
##########
@@ -342,65 +377,65 @@ CREATE TABLE hudi_table_func_index (
) USING HUDI
tblproperties (primaryKey = 'uuid')
PARTITIONED BY (city)
-location 'file:///tmp/hudi_table_func_index';
+location 'file:///tmp/hudi_table_expr_index';
-- disable small file handling so the each insert creates new file --
set hoodie.parquet.small.file.limit=0;
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
-INSERT INTO hudi_table_func_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
-
--- Query with hour function filter but no idex yet --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
03:58:59','334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
08:46:34','e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-18
17:45:31','9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
13:12:56','1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-24
06:15:45','e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-22
15:21:36','7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
,'sao_paulo');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-20
12:35:45','3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai');
+INSERT INTO hudi_table_expr_index VALUES ('2023-09-19
05:34:56','c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
+
+-- Query with hour function filter but no index yet --
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.208 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#3465, fare#3464, rider#3462, driver#3463],
Statistics(sizeInBytes=899.5 KiB)
+- Filter ((isnotnull(city#3465) AND isnotnull(ts#3460)) AND (NOT (city#3465 =
chennai) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=2.5 MiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465]
parquet, Statistics(sizeInBytes=2.5 MiB)
== Physical Plan ==
*(1) Project [city#3465, fare#3464, rider#3462, driver#3463]
+- *(1) Filter (isnotnull(ts#3460) AND (hour(cast(ts#3460 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465]
Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
--- create the functional index --
-CREATE INDEX ts_hour ON hudi_table_func_index USING column_stats(ts)
options(func='hour');
+-- create the expression index --
+CREATE INDEX ts_hour ON hudi_table_expr_index USING column_stats(ts)
options(expr='hour');
-- query after creating the index --
-spark-sql> SELECT city, fare, rider, driver FROM hudi_table_func_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE
city NOT IN ('chennai') AND hour(ts) > 12;
san_francisco 93.5 rider-E driver-O
san_francisco 33.9 rider-D driver-L
sao_paulo 43.4 rider-G driver-Q
Time taken: 0.202 seconds, Fetched 3 row(s)
-spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_func_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
+spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM
hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12;
== Optimized Logical Plan ==
Project [city#2970, fare#2969, rider#2967, driver#2968],
Statistics(sizeInBytes=449.8 KiB)
+- Filter ((isnotnull(city#2970) AND isnotnull(ts#2965)) AND (NOT (city#2970 =
chennai) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12))),
Statistics(sizeInBytes=1278.3 KiB)
- +- Relation
default.hudi_table_func_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
+ +- Relation
default.hudi_table_expr_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970]
parquet, Statistics(sizeInBytes=1278.3 KiB)
== Physical Plan ==
*(1) Project [city#2970, fare#2969, rider#2967, driver#2968]
+- *(1) Filter (isnotnull(ts#2965) AND (hour(cast(ts#2965 as timestamp),
Some(Asia/Kolkata)) > 12))
+- *(1) ColumnarToRow
- +- FileScan parquet
default.hudi_table_func_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_func_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
+ +- FileScan parquet
default.hudi_table_expr_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970]
Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as
timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location:
HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters:
[isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters:
[IsNotNull(ts)], ReadSchema:
struct<ts:string,rider:string,driver:string,fare:double>
```
</details>
-#### Create Partition Stats and Secondary Index (Experimental)
+#### Create Partition Stats and Secondary Index
Review Comment:
we should simplify this down as much as possible..
##########
website/docs/metadata_indexing.md:
##########
@@ -11,27 +11,30 @@ of Hudi depends on the metadata table. Different types of
index, from `files` in
to `column_stats` index for data skipping, are part of the metadata table. A
fundamental tradeoff in any data system
that supports indices is to balance the write throughput with index updates. A
brute-force way is to lock out the writes
while indexing. Hudi supports index creation using SQL, Datasource as well as
async indexing. However, very large tables
-can take hours to index. This is where Hudi's novel asynchronous metadata
indexing comes into play. Indexes in Hudi are
-created in two phases and uses a mix of optimistic concurrency control and
log-based concurrency control models. The two
+can take hours to index. This is where Hudi's novel concurrent indexing comes
into play.
+
+## Concurrent Indexing
+
+Indexes in Hudi are created in two phases and uses a mix of optimistic
concurrency control and log-based concurrency control models. The two
phase approach ensures that the other writers are unblocked.
- Scheduling - This is the first phase which schedules an indexing plan and is
protected by a lock. Indexing plan considers all the completed commits upto
indexing instant.
- Execution - This phase creates the index files as mentioned in the index
plan. At the end of the phase Hudi ensures the completed commits after indexing
instant used already created index plan to add corresponding index metadata.
This check is protected by a metadata table lock and in case of failures
indexing is aborted.
-We can now create different metadata indices, including `files`,
`bloom_filters`, `column_stats`, `partition_stats` and `record_index`
-asynchronously in Hudi, which are then used by readers and writers to improve
performance. Being able to index without blocking writing
-has two benefits,
+We can now create different metadata indices, including `files`,
`bloom_filters`, `column_stats`, `partition_stats`, `record_index`,
`secondary_index`
Review Comment:
again, terminology.
metadata table : underlying infrastructure to store metadata+indexes.
metadata : file listings, col_stats, partition_stats (whatever is needed to
plan standard queries)
index: rli, ei, si.. bloom index.. (actual indexes)
##########
website/docs/sql_ddl.md:
##########
@@ -103,6 +103,49 @@ TBLPROPERTIES (
);
```
+### Create table with record merge mode {#create-table-with-record-merge-mode}
+
+Hudi supports different [record merge modes](/docs/next/record_merger) to
handle merge of incoming records with existing
+records. To create a table with specific record merge mode, you can set
`recordMergeMode` option.
+
+```sql
+CREATE TABLE IF NOT EXISTS hudi_table_merge_mode (
+ id INT,
+ name STRING,
+ ts LONG,
+ price DOUBLE
+) USING hudi
+TBLPROPERTIES (
+ type = 'mor',
+ primaryKey = 'id',
+ precombineField = 'ts',
Review Comment:
same. can we use ordering field cfg
##########
website/docs/quick-start-guide.md:
##########
@@ -454,27 +454,47 @@ VALUES
(1695173887,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai' ),
(1695115999,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
--- Create bloom filter expression index on city column
-CREATE INDEX idx_bloom_city ON hudi_indexed_table USING bloom_filters(city)
OPTIONS(expr='identity');
+-- Create bloom filter expression index on driver column
Review Comment:
Above — why do we need to configure the payload class? Can we remove this?
hoodie.datasource.write.payload.class =
"org.apache.hudi.common.model.OverwriteWithLatestAvroPayload"
##########
website/docs/quick-start-guide.md:
##########
@@ -454,27 +454,47 @@ VALUES
(1695173887,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
,'chennai' ),
(1695115999,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
--- Create bloom filter expression index on city column
-CREATE INDEX idx_bloom_city ON hudi_indexed_table USING bloom_filters(city)
OPTIONS(expr='identity');
+-- Create bloom filter expression index on driver column
+CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING
bloom_filters(driver) OPTIONS(expr='identity');
-- It would show bloom filter expression index
SHOW INDEXES FROM hudi_indexed_table;
--- Query on city column would prune the data using the idx_bloom_city index
-SELECT uuid, rider FROM hudi_indexed_table WHERE city = 'san_francisco';
+-- Query on driver column would prune the data using the idx_bloom_driver index
+SELECT uuid, rider FROM hudi_indexed_table WHERE driver = 'driver-S';
-- Create column stat expression index on ts column
-CREATE INDEX idx_column_driver ON hudi_indexed_table USING column_stats(rider)
OPTIONS(expr='upper');
+CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts)
OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd');
-- Shows both expression indexes
SHOW INDEXES FROM hudi_indexed_table;
-- Query on ts column would prune the data using the idx_column_ts index
-SELECT * FROM hudi_indexed_table WHERE upper(driver) = 'DRIVER-S';
+SELECT * FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') =
'2023-09-24';
-- Create secondary index on rider column
CREATE INDEX record_index ON hudi_indexed_table (uuid);
CREATE INDEX idx_rider ON hudi_indexed_table (rider);
+SET hoodie.metadata.record.index.enable=true;
Review Comment:
To confirm — is data skipping already enabled by default?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]