This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new e87bf7c26c6 [HUDI-6861] update sql pages for 0.14.0 (#9699)
e87bf7c26c6 is described below
commit e87bf7c26c6dbf3bf2e374445d9d2a5c797d3b0f
Author: Jon Vexler <[email protected]>
AuthorDate: Fri Sep 15 22:44:00 2023 -0400
[HUDI-6861] update sql pages for 0.14.0 (#9699)
---------
Co-authored-by: Jonathan Vexler <=>
---
website/docs/procedures.md | 175 +++++++++++++++++++++++++--------------
website/docs/table_management.md | 31 +++----
2 files changed, 129 insertions(+), 77 deletions(-)
diff --git a/website/docs/procedures.md b/website/docs/procedures.md
index b280afb346c..ba2d1c06968 100644
--- a/website/docs/procedures.md
+++ b/website/docs/procedures.md
@@ -468,14 +468,14 @@ archive commits.
**Input**
-| Parameter Name | Type | Required | Default Value | Description
|
-|-----------------|---------|----------|---------------|--------------------------------------------------|
-| table | String | N | None | Hudi table name
|
-| path | String | N | None | Path of table
|
-| min_commits | Int | N | 20 | Configuration as
'hoodie.keep.min.commits' |
-| max_commits | Int | N | 30 | Configuration as
'hoodie.keep.max.commits' |
-| retain_commits | Int | N | 10 | Configuration as
'hoodie.commits.archival.batch' |
-| enable_metadata | Boolean | N | false | Enable the internal
metadata table |
+| Parameter Name |
Type | Required | Default Value | Description
|
+|------------------------------------------------------------------------|---------|----------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| table |
String | N | None | Hudi table name
|
+| path |
String | N | None | Path of table
|
+| [min_commits](/docs/next/configurations#hoodiekeepmincommits) | Int
| N | 20 | Similar to hoodie.keep.max.commits, but
controls the minimum number of instants to retain in the active timeline.
|
+| [max_commits](/docs/next/configurations#hoodiekeepmaxcommits) | Int
| N | 30 | Archiving service moves older entries from
timeline into an archived log after each write, to keep the metadata overhead
constant, even as the table size grows. This config controls the maximum number
of instants to retain in the active timeline. |
+| [retain_commits](/docs/next/configurations#hoodiecommitsarchivalbatch) | Int
| N | 10 | Archiving of instants is batched in
best-effort manner, to pack more instants into a single archive log. This
config controls such archival batch size.
|
+| [enable_metadata](/docs/next/configurations#hoodiemetadataenable) |
Boolean | N | false | Enable the internal metadata table
|
**Output**
@@ -669,16 +669,16 @@ copy table to a temporary view.
**Input**
-| Parameter Name | Type | Required | Default Value | Description
|
-|---------------------|---------|----------|---------------|-------------------------------------------------|
-| table | String | Y | None | Hudi table name
|
-| query_type | String | N | "snapshot" | Configuration as
'hoodie.datasource.query.type' |
-| view_name | String | Y | None | Name of view
|
-| begin_instance_time | String | N | "" | Begin instance
time |
-| end_instance_time | String | N | "" | End instance time
|
-| as_of_instant | String | N | "" | As of instant
time |
-| replace | Boolean | N | false | Replace an
existed view |
-| global | Boolean | N | false | Global view
|
+| Parameter Name | Type
| Required | Default Value | Description
|
+|-------------------------------------------------------------------|---------|----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| table | String
| Y | None | Hudi table name
|
+| [query_type](/docs/next/configurations#hoodiedatasourcequerytype) | String
| N | "snapshot" | Whether data needs to be read, in `incremental`
mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest
view, based on base files) (or) `snapshot` mode (obtain latest view, by merging
base and (if any) log files) |
+| view_name | String
| Y | None | Name of view
|
+| begin_instance_time | String
| N | "" | Begin instance time
|
+| end_instance_time | String
| N | "" | End instance time
|
+| as_of_instant | String
| N | "" | As of instant time
|
+| replace | Boolean
| N | false | Replace an existing view
|
+| global | Boolean
| N | false | Global view
|
**Output**
@@ -702,15 +702,15 @@ copy table to a new table.
**Input**
-| Parameter Name | Type | Required | Default Value | Description
|
-|---------------------|--------|----------|---------------|-------------------------------------------------|
-| table | String | Y | None | Hudi table name
|
-| query_type | String | N | "snapshot" | Configuration as
'hoodie.datasource.query.type' |
-| new_table | String | Y | None | Name of new table
|
-| begin_instance_time | String | N | "" | Begin instance
time |
-| end_instance_time | String | N | "" | End instance time
|
-| as_of_instant | String | N | "" | As of instant time
|
-| save_mode | String | N | "overwrite" | Save mode
|
+| Parameter Name | Type |
Required | Default Value | Description
|
+|-------------------------------------------------------------------|--------|----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| table | String |
Y | None | Hudi table name
|
+| [query_type](/docs/next/configurations#hoodiedatasourcequerytype) | String |
N | "snapshot" | Whether data needs to be read, in `incremental` mode
(new data since an instantTime) (or) `read_optimized` mode (obtain latest view,
based on base files) (or) `snapshot` mode (obtain latest view, by merging base
and (if any) log files) |
+| new_table | String |
Y | None | Name of new table
|
+| begin_instance_time | String |
N | "" | Begin instance time
|
+| end_instance_time | String |
N | "" | End instance time
|
+| as_of_instant | String |
N | "" | As of instant time
|
+| save_mode | String |
N | "overwrite" | Save mode
|
**Output**
@@ -1348,12 +1348,13 @@ If both parameters are given, ``table`` will take
effect.
**Input**
-| Parameter Name | Type | Required | Default Value | Description
|
-|----------------|--------|----------|---------------|-------------------------------------|
-| op | String | N | None | Operation type, `RUN`
or `SCHEDULE` |
-| table | String | N | None | Name of table to be
compacted |
-| path | String | N | None | Path of table to be
compacted |
-| timestamp | String | N | None | Instant time
|
+| Parameter Name | Type | Required | Default Value | Description
|
+|----------------|--------|----------|---------------|----------------------------------------------------------------------------------------------------|
+| op | String | N | None | Operation type, `RUN`
or `SCHEDULE` |
+| table | String | N | None | Name of table to be
compacted |
+| path | String | N | None | Path of table to be
compacted |
+| timestamp | String | N | None | Instant time
|
+| options | String | N | None | Comma-separated list
of Hudi configs for compaction in the format "config1=value1,config2=value2" |
**Output**
@@ -1379,6 +1380,10 @@ Run compaction with table path and timestamp
```
call run_compaction(op => 'run', path => '/tmp/hoodie/test_hudi_table',
timestamp => '20220408153658568');
```
+Run compaction with options
+```
+call run_compaction(op => 'run', table => 'test_hudi_table', options =>
'hoodie.compaction.strategy=org.apache.hudi.table.action.compact.strategy.LogFileNumBasedCompactionStrategy,hoodie.compaction.logfile.num.threshold=3');
+```
Schedule compaction with table name
```
@@ -1458,6 +1463,47 @@ call show_compaction(table => 'test_hudi_table', limit
=> 1);
|-------------------|------------|---------|
| 20220408153707928 | compaction | 10 |
+### run_clean
+
+Run cleaner on a hoodie table.
+
+**Input**
+
+| Parameter Name
| Type | Required | Default Value | Description
[...]
+|---------------------------------------------------------------------------------------|---------|----------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[...]
+| table
| String | Y | None | Name of table to be cleaned
[...]
+| schedule_in_line
| Boolean | N | true | Set "true" if you want to
schedule and run a clean. Set false if you have already scheduled a clean and
want to run that.
[...]
+| [clean_policy](/docs/next/configurations#hoodiecleanerpolicy)
| String | N | None |
org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used.
The cleaner service deletes older file slices to reclaim space. Long
running query plans may often refer to older file slices and will break if
those are cleaned, before the query has had a chance to run. So, it is good to
make sure that the data is retained for more than the ma [...]
+| [retain_commits](/docs/next/configurations#hoodiecleanercommitsretained)
| Int | N | None | When KEEP_LATEST_COMMITS
cleaning policy is used, the number of commits to retain, without cleaning.
This will be retained for num_of_commits * time_between_commits (scheduled).
This also directly translates into how much data retention the table supports
for incremental queries.
[...]
+| [hours_retained](/docs/next/configurations#hoodiecleanerhoursretained)
| Int | N | None | When KEEP_LATEST_BY_HOURS
cleaning policy is used, the number of hours for which commits need to be
retained. This config provides a more flexible option as compared to number of
commits retained for cleaning service. Setting this property ensures all the
files, but the latest in a file group, corresponding to commits with commit
times older than the configured n [...]
+|
[file_versions_retained](/docs/next/configurations#hoodiecleanerfileversionsretained)
| Int | N | None | When KEEP_LATEST_FILE_VERSIONS cleaning
policy is used, the minimum number of file slices to retain in each file group,
during cleaning.
[...]
+| [trigger_strategy](/docs/next/configurations#hoodiecleantriggerstrategy)
| String | N | None |
org.apache.hudi.table.action.clean.CleaningTriggerStrategy: Controls when
cleaning is scheduled. NUM_COMMITS(default): Trigger the cleaning service
every N commits, determined by `hoodie.clean.max.commits`
[...]
+| [trigger_max_commits](/docs/next/configurations/#hoodiecleanmaxcommits)
| Int | N | None | Number of commits after the
last clean operation, before scheduling of a new clean is attempted.
[...]
+| [options](/docs/next/configurations/#Clean-Configs)
| String | N | None | Comma-separated list of Hudi
configs for cleaning in the format "config1=value1,config2=value2"
[...]
+
+**Output**
+
+| Parameter Name | Type |
+|---------------------------|--------|
+| start_clean_time | String |
+| time_taken_in_millis | Long |
+| total_files_deleted | Int |
+| earliest_commit_to_retain | String |
+| bootstrap_part_metadata | String |
+| version | Int |
+
+**Example**
+
+Run clean with table name
+```
+call run_clean(table => 'test_hudi_table');
+```
+
+Run clean with keep latest file versions policy
+```
+call run_clean(table => 'test_hudi_table', trigger_max_commits => 2,
clean_policy => 'KEEP_LATEST_FILE_VERSIONS', file_versions_retained => 1);
+```
+
### delete_marker
Delete marker files of a hudi table.
@@ -1521,17 +1567,20 @@ Sync the table's latest schema to Hive metastore.
**Input**
-| Parameter Name | Type | Required | Default Value | Description
|
-|---------------------------|--------|----------|---------------|--------------------------------------------------------------------------|
-| table | String | Y | None | Hudi table
name |
-| metastore_uri | String | N | "" |
Metastore_uri |
-| username | String | N | "" | User name
|
-| password | String | N | "" | Password
|
-| use_jdbc | String | N | "" | Configration
as 'hoodie.datasource.hive_sync.use_jdbc' |
-| mode | String | N | "" |
Configuration as 'hoodie.datasource.hive_sync.mode' |
-| partition_fields | String | N | "" |
Configuration as 'hoodie.datasource.hive_sync.partition_fields' |
|
-| partition_extractor_class | String | N | "" |
Configuration as 'hoodie.datasource.hive_sync.partition_extractor_class' |
-| strategy | String | N | "" |
Configuration as 'hoodie.datasource.hive_sync.table.strategy' |
+| Parameter Name
| Type | Required | Default Value | Description
|
+|-----------------------------------------------------------------------------------------------------------|--------|----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| table
| String | Y | None | Hudi table
name
|
+| metastore_uri
| String | N | "" |
Metastore_uri
|
+| username
| String | N | "" | User name
|
+| password
| String | N | "" | Password
|
+| [use_jdbc](/docs/next/configurations#hoodiedatasourcehive_syncuse_jdbc)
| String | N | "" | Use JDBC
when hive synchronization is enabled
|
+| [mode](/docs/next/configurations#hoodiedatasourcehive_syncmode)
| String | N | "" | Mode to
choose for Hive ops. Valid values are hms, jdbc and hiveql.
|
+|
[partition_fields](/docs/next/configurations#hoodiedatasourcehive_syncpartition_fields)
| String | N | "" | Field in the table to
use for determining hive partition columns.
|
+|
[partition_extractor_class](/docs/next/configurations#hoodiedatasourcehive_syncpartition_extractor_class)
| String | N | "" | Class which implements
PartitionValueExtractor to extract the partition values, default
'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
|
+| [strategy](/docs/next/configurations#hoodiedatasourcehive_synctablestrategy)
| String | N | "" | Hive table
synchronization strategy. Available option: RO, RT, ALL.
|
+| [sync_incremental](/docs/next/configurations#hoodiemetasyncincremental)
| String | N | "" | Whether to
incrementally sync the partitions to the metastore, i.e., only added, changed,
and deleted partitions based on the commit metadata. If set to `false`, the
meta sync executes a full partition sync operation when partitions are lost. |
+
+
**Output**
@@ -1735,25 +1784,25 @@ Convert an existing table to Hudi.
**Input**
-| Parameter Name | Type | Required | Default Value
| Description
|
-|-------------------------------|---------|----------|-------------------------------------------------------------------------------|--------------------------------------------|
-| table | String | Y | None
| Name of table to be
clustered |
-| table_type | String | Y | None
| Table type,
MERGE_ON_READ or COPY_ON_WRITE |
-| bootstrap_path | String | Y | None
| Bootstrap path
|
-| base_path | String | Y | None
| Base path
|
-| rowKey_field | String | Y | None
| Primary key field
|
-| base_file_format | String | N | "PARQUET"
| Format of base file
|
-| partition_path_field | String | N | ""
| Partitioned column
field |
-| bootstrap_index_class | String | N |
"org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex" |
Class of bootstrap index |
-| selector_class | String | N |
"org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector" |
Class of selector |
-| key_generator_class | String | N |
"org.apache.hudi.keygen.SimpleKeyGenerator" |
Class of key generator |
-| full_bootstrap_input_provider | String | N |
"org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider" |
Class of full bootstrap input provider |
-| schema_provider_class | String | N | ""
| Class of schema
provider |
-| payload_class | String | N |
"org.apache.hudi.common.model.OverwriteWithLatestAvroPayload" |
Class of payload |
-| parallelism | Int | N | 1500
| Parallelism
|
-| enable_hive_sync | Boolean | N | false
| Whether to enable hive
sync |
-| props_file_path | String | N | ""
| Path of properties file
|
-| bootstrap_overwrite | Boolean | N | false
| Overwrite bootstrap
path |
+| Parameter Name
| Type | Required | Default Value
| Description
[...]
+|------------------------------------------------------------------------------|---------|----------|-------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[...]
+| table
| String | Y | None
| Name of table to be bootstrapped
[...]
+| table_type
| String | Y | None
| Table type, MERGE_ON_READ or COPY_ON_WRITE
[...]
+| [bootstrap_path](/docs/next/configurations#hoodiebootstrapbasepath)
| String | Y | None
| Base path of the dataset that needs to be bootstrapped
as a Hudi table
[...]
+| base_path
| String | Y | None
| Base path
[...]
+| rowKey_field
| String | Y | None
| Primary key field
[...]
+| base_file_format
| String | N | "PARQUET"
| Format of base file
[...]
+| partition_path_field
| String | N | ""
| Partitioned column field
[...]
+| [bootstrap_index_class](/docs/next/configurations#hoodiebootstrapindexclass)
| String | N |
"org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex" |
Implementation to use, for mapping a skeleton base file to a bootstrap base
file.
[...]
+| [selector_class](/docs/next/configurations#hoodiebootstrapmodeselector)
| String | N |
"org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector" |
Selects the mode in which each file/partition in the bootstrapped dataset gets
bootstrapped
[...]
+| key_generator_class
| String | N | "org.apache.hudi.keygen.SimpleKeyGenerator"
| Class of key generator
[...]
+| full_bootstrap_input_provider
| String | N |
"org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider" |
Class of full bootstrap input provider
[...]
+| schema_provider_class
| String | N | ""
| Class of schema provider
[...]
+| payload_class
| String | N |
"org.apache.hudi.common.model.OverwriteWithLatestAvroPayload" |
Class of payload
[...]
+| [parallelism](/docs/next/configurations#hoodiebootstrapparallelism)
| Int | N | 1500
| For metadata-only bootstrap, Hudi parallelizes the
operation so that each table partition is handled by one Spark task. This
config limits the number of parallelism. We pick the configured parallelism if
the number of table partitions is larger than this configured value. The
parallelism is assigned to the nu [...]
+| enable_hive_sync
| Boolean | N | false
| Whether to enable hive sync
[...]
+| props_file_path
| String | N | ""
| Path of properties file
[...]
+| bootstrap_overwrite
| Boolean | N | false
| Overwrite bootstrap path
[...]
**Output**
diff --git a/website/docs/table_management.md b/website/docs/table_management.md
index d00797ac99e..672ade8ccb8 100644
--- a/website/docs/table_management.md
+++ b/website/docs/table_management.md
@@ -18,13 +18,13 @@ Only SparkSQL needs an explicit Create Table command. No
Create Table command is
Users can set table options while creating a hudi table.
-| Parameter Name | Description | (Optional/Required) : Default Value |
-|------------|--------|--------|
-| primaryKey | The primary key names of the table, multiple fields separated
by commas. | (Optional) : `id`|
-| type | The type of table to create ([read more](/docs/table_types)).
<br></br> `cow` = COPY-ON-WRITE, `mor` = MERGE-ON-READ.| (Optional) : `cow` |
-| preCombineField | The Pre-Combine field of the table. | (Optional) : `ts`|
+| Parameter Name | Default | Description
|
+|-----------------|----------------|-----------------------------------------------------------------------------------------------------------------------|
+| primaryKey | id (Optional) | The primary key names of the table,
multiple fields separated by commas.
|
+| type | cow (Optional) | The type of table to create ([read
more](/docs/table_types)). <br></br> `cow` = COPY-ON-WRITE, `mor` =
MERGE-ON-READ. |
+| preCombineField | ts (Optional) | The Pre-Combine field of the table.
|
-To set any custom hudi config(like index type, max parquet size, etc), see the
"Set hudi config section" .
+To set any custom hudi config (like index type, max parquet size, etc.), see the
section [Set hudi config options](#set-hoodie-config-options).
### Table Type
Here is an example of creating a COW table.
@@ -36,7 +36,7 @@ create table if not exists hudi_table2(
name string,
price double
) using hudi
-options (
+tblproperties (
type = 'cow'
);
```
@@ -51,7 +51,7 @@ create table if not exists hudi_table0 (
name string,
price double
) using hudi
-options (
+tblproperties (
type = 'cow',
primaryKey = 'id'
);
@@ -69,7 +69,7 @@ create table if not exists hudi_table1 (
price double,
ts bigint
) using hudi
-options (
+tblproperties (
type = 'mor',
primaryKey = 'id,name',
preCombineField = 'ts'
@@ -77,6 +77,9 @@ options (
```
### Partitioned Table
+:::note
+When created in spark-sql, partition columns will always be the last columns
of the table.
+:::
Here is an example of creating a COW partitioned table.
```sql
create table if not exists hudi_table_p0 (
@@ -85,7 +88,7 @@ name string,
dt string,
hh string
) using hudi
-options (
+tblproperties (
type = 'cow',
primaryKey = 'id'
)
@@ -118,7 +121,7 @@ select 1 as id, 'a1' as name, 10 as price;
```sql
create table h2 using hudi
-options (type = 'cow', primaryKey = 'id')
+tblproperties (type = 'cow', primaryKey = 'id')
partitioned by (dt)
as
select 1 as id, 'a1' as name, 10 as price, 1000 as dt;
@@ -131,7 +134,7 @@ select 1 as id, 'a1' as name, 10 as price, 1000 as dt;
create table parquet_mngd using parquet location
'file:///tmp/parquet_dataset/*.parquet';
# CTAS by loading data into hudi table
-create table hudi_tbl using hudi location 'file:/tmp/hudi/hudi_tbl/' options (
+create table hudi_tbl using hudi location 'file:/tmp/hudi/hudi_tbl/'
tblproperties (
type = 'cow',
primaryKey = 'id',
preCombineField = 'ts'
@@ -148,7 +151,7 @@ create table if not exists h3(
name string,
price double
) using hudi
-options (
+tblproperties (
primaryKey = 'id',
type = 'mor',
${hoodie.config.key1} = '${hoodie.config.value2}',
@@ -162,7 +165,7 @@ create table if not exists h3(
name string,
price double
) using hudi
-options (
+tblproperties (
primaryKey = 'id',
type = 'mor',
hoodie.cleaner.fileversions.retained = '20',