[ 
https://issues.apache.org/jira/browse/HUDI-5857?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Jing Zhang updated HUDI-5857:
-----------------------------
    Description: 
Snapshot query result is wrong after applying insert overwrite to an existing table 
with simple bucket index.

The bug could be produced by the following steps.
 # create a mor table with bucket index
{code:java}
create table test_hudi_zj0221(  
  id int,  
  name string,  
  price double, 
  ts long,  
  dt string) using hudi partitioned by (dt)
options(  
  type='mor',  
  primaryKey='id',  
  preCombineField = 'ts', 
  'hoodie.index.type'='BUCKET',
  'hoodie.bucket.index.num.buckets'='8') {code}

 # insert into data
{code:java}
insert
into test_hudi_zj0221 select 8 as id, 'hudi3' as name, 30 as price, 3000 as ts,
'2021-05-05' as dt;

insert
into test_hudi_zj0221 select 9 as id, 'hudi3' as name, 30 as price, 3000 as ts,
'2021-05-05' as dt;

insert
into test_hudi_zj0221 select 10 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 11 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 12 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 13 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 14 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 15 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt; {code}

 # find something wrong, use insert overwrite to overwrite a partition
{code:java}
 insert overwrite table test_hudi_zj0221 partition(dt = '2021-05-05') select 
2222, 'a2',30, 3000; {code}

 # snapshot query on the table
{code:java}
select * from test_hudi_zj0221 where dt='2021-05-05';
-- or
select * from test_hudi_zj0221; {code}

  was:
Snapshot query result is wrong after apply insert overwrite to an existed table 
with simple bucket index.

The bug could be produced by the following steps.
 # create a mor table with bucket index
{code:java}
create table test_hudi_zj0221(  
  id int,  
  name string,  
  price double, 
  ts long,  
  dt string) using hudi partitioned by (dt)
options(  
  type='mor',  
  primaryKey='id',  
  preCombineField = 'ts', 
  'hoodie.index.type'='BUCKET',
'hoodie.storage.layout.partitioner.class'='org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner',
 'hoodie.bucket.index.num.buckets'='8', 
'hoodie.datasource.write.recordkey.field' = 'id', 
'hoodie.storage.layout.type'='BUCKET') {code}

 # insert into data
{code:java}
insert
into test_hudi_zj0221 select 8 as id, 'hudi3' as name, 30 as price, 3000 as ts,
'2021-05-05' as dt;

insert
into test_hudi_zj0221 select 9 as id, 'hudi3' as name, 30 as price, 3000 as ts,
'2021-05-05' as dt;

insert
into test_hudi_zj0221 select 10 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 11 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 12 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 13 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 14 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt;

insert
into test_hudi_zj0221 select 15 as id, 'hudi3' as name, 30 as price, 3000 as
ts, '2021-05-05' as dt; {code}

 # find something wrong, use insert overwrite to overwrite a partition
{code:java}
 insert overwrite table test_hudi_zj0221 partition(dt = '2021-05-05') select 
2222, 'a2',30, 3000; {code}

 # snapshot query on the table
{code:java}
select * from test_hudi_zj0221 where dt='2021-05-05';
-- or
select * from test_hudi_zj0221; {code}


> Snapshot query result is wrong after apply insert overwrite to an existed 
> table with simple bucket index
> --------------------------------------------------------------------------------------------------------
>
>                 Key: HUDI-5857
>                 URL: https://issues.apache.org/jira/browse/HUDI-5857
>             Project: Apache Hudi
>          Issue Type: Bug
>            Reporter: Jing Zhang
>            Assignee: Jing Zhang
>            Priority: Major
>              Labels: pull-request-available
>         Attachments: image-2023-02-27-15-55-26-568.png
>
>
> Snapshot query result is wrong after applying insert overwrite to an existing 
> table with simple bucket index.
> The bug could be produced by the following steps.
>  # create a mor table with bucket index
> {code:java}
> create table test_hudi_zj0221(  
>   id int,  
>   name string,  
>   price double, 
>   ts long,  
>   dt string) using hudi partitioned by (dt)
> options(  
>   type='mor',  
>   primaryKey='id',  
>   preCombineField = 'ts', 
>   'hoodie.index.type'='BUCKET',
>   'hoodie.bucket.index.num.buckets'='8') {code}
>  # insert into data
> {code:java}
> insert
> into test_hudi_zj0221 select 8 as id, 'hudi3' as name, 30 as price, 3000 as 
> ts,
> '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 9 as id, 'hudi3' as name, 30 as price, 3000 as 
> ts,
> '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 10 as id, 'hudi3' as name, 30 as price, 3000 as
> ts, '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 11 as id, 'hudi3' as name, 30 as price, 3000 as
> ts, '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 12 as id, 'hudi3' as name, 30 as price, 3000 as
> ts, '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 13 as id, 'hudi3' as name, 30 as price, 3000 as
> ts, '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 14 as id, 'hudi3' as name, 30 as price, 3000 as
> ts, '2021-05-05' as dt;
> insert
> into test_hudi_zj0221 select 15 as id, 'hudi3' as name, 30 as price, 3000 as
> ts, '2021-05-05' as dt; {code}
>  # find something wrong, use insert overwrite to overwrite a partition
> {code:java}
>  insert overwrite table test_hudi_zj0221 partition(dt = '2021-05-05') select 
> 2222, 'a2',30, 3000; {code}
>  # snapshot query on the table
> {code:java}
> select * from test_hudi_zj0221 where dt='2021-05-05';
> -- or
> select * from test_hudi_zj0221; {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to