[
https://issues.apache.org/jira/browse/HUDI-2781?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17451641#comment-17451641
]
Yann Byron commented on HUDI-2781:
----------------------------------
[~xushiyan]
Environment:
spark 3.0.3, hive3.1.2, hudi release-0.10.0-rc2
Conclusions:
* `primaryKey` must be provided when creating a table, so the related
content should be removed from [https://hudi.apache.org/docs/quick-start-guide];
* The result of `show partitions` is wrong after partitions are deleted/dropped;
* Everything else is OK.
Details:
Create Table:
{code:java}
-- COW, non-partitioned, no preCombineField
create table if not exists cow_nonpt_nonpcf_tbl (
    id    int,
    name  string,
    price double
)
using hudi
options (
    type = 'cow',
    primaryKey = 'id'
);

-- MOR, non-partitioned, no preCombineField
create table if not exists mor_nonpt_nonpcf_tbl (
    id    int,
    name  string,
    price double
)
using hudi
options (
    type = 'mor',
    primaryKey = 'id'
);

-- COW, non-partitioned, preCombineField = ts
create table if not exists cow_nonpt_pcf_tbl (
    id    int,
    name  string,
    price double,
    ts    bigint
)
using hudi
options (
    type = 'cow',
    primaryKey = 'id',
    preCombineField = 'ts'
);

-- MOR, non-partitioned, preCombineField = ts
create table if not exists mor_nonpt_pcf_tbl (
    id    int,
    name  string,
    price double,
    ts    bigint
)
using hudi
options (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'ts'
);

-- COW (no explicit `type` property is set), partitioned by dt/hh,
-- no preCombineField; configured through `tblproperties`
create table if not exists cow_pt_nonpcf_tbl (
    id   bigint,
    name string,
    dt   string,
    hh   string
)
using hudi
location 'file:///tmp/hudi/cow_pt_nonpcf_tbl'
tblproperties (
    primaryKey = 'id'
)
partitioned by (dt, hh);

-- MOR, partitioned by dt/hh, preCombineField = ts;
-- configured through `tblproperties`
create table if not exists mor_pt_pcf_tbl (
    id   bigint,
    name string,
    ts   bigint,
    dt   string,
    hh   string
)
using hudi
location 'file:///tmp/hudi/mor_pt_pcf_tbl'
tblproperties (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'ts'
)
partitioned by (dt, hh); {code}
CTAS
{code:java}
-- CTAS into a table partitioned by dt, configured through `options`
create table ctas_cow_pt_nonpcf_tbl
using hudi
options (
    type = 'cow',
    primaryKey = 'id'
)
partitioned by (dt)
as
select
    1    as id,
    'a1' as name,
    10   as price,
    1000 as dt;

-- CTAS into a non-partitioned table, configured through `tblproperties`
create table ctas_cow_nonpt_nonpcf_tbl
using hudi
tblproperties (
    primaryKey = 'id'
)
as
select
    1    as id,
    'a1' as name,
    10   as price; {code}
Create table based on existing path
{code:java}
-- No column list is given; the table is declared on top of the path
-- named in `location` and partitioned by dt.
create table existing_hudi_tbl
using hudi
options (
    primaryKey = 'id',
    preCombineField = 'ts'
)
partitioned by (dt)
location 'file:///tmp/hudi/dataframe_hudi_table'; {code}
Insert
{code:java}
-- plain inserts into the four non-partitioned tables
insert into cow_nonpt_nonpcf_tbl
select 1, 'a1', 20;

insert into mor_nonpt_nonpcf_tbl
select 1, 'a1', 20;

insert into cow_nonpt_pcf_tbl
select 1, 'a1', 20, 1000;

insert into mor_nonpt_pcf_tbl
select 1, 'a1', 20, 1000;

-- static partition: dt/hh are fixed in the partition clause,
-- so the select supplies only the remaining columns
insert into cow_pt_nonpcf_tbl partition (dt = '2021-01-02', hh = '10')
select 1, 'a1';

insert into mor_pt_pcf_tbl partition (dt = '2021-01-02', hh = '10')
select 1, 'a1', '1000';

-- dynamic partition: no partition clause, dt/hh values are the
-- trailing select columns
insert into cow_pt_nonpcf_tbl
select 2, 'a2', '2021-01-02', '11';

insert into mor_pt_pcf_tbl
select 2, 'a2', '1000', '2021-01-02', '11'; {code}
Insert overwrite
{code:java}
-- overwrite the non-partitioned tables
-- (both the `insert overwrite table t` and `insert overwrite t` spellings are used)
insert overwrite table cow_nonpt_nonpcf_tbl
select 3, 'a3', 30;

insert overwrite table mor_nonpt_nonpcf_tbl
select 3, 'a3', 30;

insert overwrite cow_nonpt_pcf_tbl
select 3, 'a3', 30, 1000;

insert overwrite mor_nonpt_pcf_tbl
select 3, 'a3', 30, 1000;

-- overwrite with a static partition (dt/hh fixed in the partition clause)
insert overwrite cow_pt_nonpcf_tbl partition (dt = '2021-01-02', hh = '10')
select 1, 'a1_1';

insert overwrite mor_pt_pcf_tbl partition (dt = '2021-01-02', hh = '10')
select 1, 'a1_1', '1100';

-- overwrite with dynamic partitions (dt/hh come from the select list)
insert overwrite table cow_pt_nonpcf_tbl
select
    2 as id,
    'a2_2',
    '2021-01-02' as dt,
    '11' as hh;

insert overwrite table mor_pt_pcf_tbl
select
    2 as id,
    'a2_2',
    '2200',
    '2021-01-02' as dt,
    '11' as hh; {code}
Update
* If no preCombineField is provided, the `update` syntax cannot be used.
{code:java}
-- all three targets were created with preCombineField = 'ts'

-- single-row updates on the non-partitioned tables
update cow_nonpt_pcf_tbl
set price = price * 2,
    name = 'a3_3',
    ts = 3000
where id = 3;

update mor_nonpt_pcf_tbl
set price = price * 2,
    name = 'a3_3',
    ts = 3000
where id = 3;

-- update every even-id row of the partitioned MOR table
update mor_pt_pcf_tbl
set name = 'aa_2',
    ts = 2222
where id % 2 = 0; {code}
Merge into
{code:java}
-- merge source for the non-partitioned targets; note it is itself a
-- hudi table (declared `using hudi`)
create table merge_source (
    id    int,
    name  string,
    price double,
    ts    bigint
)
using hudi
tblproperties (
    primaryKey = 'id',
    preCombineField = 'ts'
);

insert into merge_source values
    (1, "new_a1", 22.22, 4001),
    (2, "new_a2", 33.33, 4001),
    (3, "new_a3", 44.44, 4001);

-- upsert every source row into the COW target
merge into cow_nonpt_nonpcf_tbl as target
using merge_source as source
on target.id = source.id
when matched then
    update set *
when not matched then
    insert *
;

-- same upsert against the MOR target
merge into mor_nonpt_pcf_tbl as target
using merge_source as source
on target.id = source.id
when matched then
    update set *
when not matched then
    insert *
;

-- parquet-backed merge source for the partitioned targets; `flag`
-- selects the action taken in the merge statements below
create table merge_source2 (
    id   int,
    name string,
    flag string,
    dt   string,
    hh   string
)
using parquet;

insert into merge_source2 values
    (1, "new_a1", 'update', '2021-01-02', '10'),
    (2, "new_a2", 'delete', '2021-01-02', '11'),
    (3, "new_a3", 'insert', '2021-01-02', '12');

-- matched rows are updated or deleted depending on flag; unmatched rows are inserted
merge into cow_pt_nonpcf_tbl as target
using (
    select id, name, flag, dt, hh
    from merge_source2
) source
on target.id = source.id
when matched and flag != 'delete' then
    update set id = source.id, name = source.name
when matched and flag = 'delete' then
    delete
when not matched then
    insert (id, name, dt, hh)
    values (source.id, source.name, source.dt, source.hh)
;

-- same flag-driven merge for the MOR target; the subquery adds a
-- constant ts column because merge_source2 has none
merge into mor_pt_pcf_tbl as target
using (
    select id, name, '1000' as ts, flag, dt, hh
    from merge_source2
) source
on target.id = source.id
when matched and flag != 'delete' then
    update set *
when matched and flag = 'delete' then
    delete
when not matched then
    insert (id, name, ts, dt, hh)
    values (source.id, source.name, source.ts, source.dt, source.hh)
; {code}
Delete
{code:java}
-- delete the row with id = 1 from one COW and one MOR table
delete from cow_nonpt_nonpcf_tbl
where id = 1;

delete from mor_nonpt_pcf_tbl
where id = 1; {code}
Alter
{code:java}
-- rename table
alter table cow_nonpt_nonpcf_tbl
    rename to cow_nonpt_nonpcf_tbl_2;

-- add a column (to the renamed table)
alter table cow_nonpt_nonpcf_tbl_2
    add columns (ext0 string);

-- change a column's type: id -> bigint
alter table cow_nonpt_nonpcf_tbl_2
    change column id id bigint;

-- show partitions
show partitions mor_pt_pcf_tbl;
show partitions cow_pt_nonpcf_tbl;

-- drop a partition
alter table cow_pt_nonpcf_tbl
    drop partition (dt = '2021-01-02', hh = '10');

-- set table properties / serde properties
alter table mor_nonpt_pcf_tbl
    set tblproperties (hoodie.keep.max.commits = '10');

alter table mor_nonpt_pcf_tbl
    set serdeproperties (hoodie.keep.max.commits = '10'); {code}
> Test 0.10 RC for Spark 3.x
> --------------------------
>
> Key: HUDI-2781
> URL: https://issues.apache.org/jira/browse/HUDI-2781
> Project: Apache Hudi
> Issue Type: Test
> Components: Spark Integration
> Reporter: Raymond Xu
> Assignee: Yann Byron
> Priority: Blocker
> Labels: pull-request-available, sev:high
> Fix For: 0.10.0
>
>
> Combinations
> # Spark 3.0 & 3.1.x against Hive 2
> # Spark 3.0 & 3.1.X against Hive 3
> # Spark 3.2 against Hive 2
>
> Let's test a COW and MOR long running DAG across these environments and get a
> report with bugs/issues
>
> We have YAMLs here, that can be run across all different environments listed
> here.
> [https://github.com/apache/hudi/tree/master/docker/demo/config/test-suite]
--
This message was sent by Atlassian Jira
(v8.20.1#820001)