[ https://issues.apache.org/jira/browse/HIVE-28277?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
yongzhi.shao updated HIVE-28277:
--------------------------------
    Description: 
Currently, when I run an UPDATE against a location_based_table from Hive, Hive incorrectly empties the table's data and metadata directories. After the UPDATE statement is executed, the Iceberg table is corrupted.

{code:java}
--spark 3.4.1 + iceberg 1.5.2:
CREATE TABLE IF NOT EXISTS datacenter.default.test_data_04 (
  id string,
  name string
)
USING iceberg
PARTITIONED BY (name)
TBLPROPERTIES (
  'read.orc.vectorization.enabled'='true',
  'write.format.default'='orc',
  'write.orc.bloom.filter.columns'='id',
  'write.orc.compression-codec'='zstd',
  'write.metadata.previous-versions-max'='3',
  'write.metadata.delete-after-commit.enabled'='true'
);

INSERT INTO datacenter.default.test_data_04 (id, name) VALUES ('1','a'), ('2','b');

--hive4:
CREATE EXTERNAL TABLE default.test_data_04
STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
LOCATION 'hdfs://xxxx/iceberg-catalog/warehouse/default/test_data_04'
TBLPROPERTIES ('iceberg.catalog'='location_based_table','engine.hive.enabled'='true');

SELECT DISTINCT id, name FROM (SELECT id, name FROM default.test_data_04 LIMIT 10) s1;  -- 2 rows

UPDATE test_data_04 SET name = 'adasd' WHERE id = '1';

ERROR:
2024-05-23T10:26:32,028 ERROR [HiveServer2-Background-Pool: Thread-297] hive.HiveIcebergStorageHandler: Error while trying to commit job: job_17061635207991_169536, job_17061635207990_169536, job_17061635207992_169536, starting rollback changes for table: default.test_data_04
org.apache.iceberg.exceptions.NoSuchTableException: Table does not exist at location: /iceberg-catalog/warehouse/default/test_data_04

BEFORE UPDATE, ICEBERG TABLE DIR:
[root@xxxx ~]# hdfs dfs -ls /iceberg-catalog/warehouse/default/test_data_04
Found 2 items
drwxr-xr-x   - hive hdfs          0 2024-05-23 09:26 /iceberg-catalog/warehouse/default/test_data_04/data
drwxr-xr-x   - hive hdfs          0 2024-05-23 09:26 /iceberg-catalog/warehouse/default/test_data_04/metadata

AFTER UPDATE, ICEBERG TABLE DIR:
[root@XXX ~]# hdfs dfs -ls /iceberg-catalog/warehouse/default/test_data_04
Found 3 items
drwxr-xr-x   - hive hdfs          0 2024-05-23 10:26 /iceberg-catalog/warehouse/default/test_data_04/-tmp.HIVE_UNION_SUBDIR_1
drwxr-xr-x   - hive hdfs          0 2024-05-23 10:26 /iceberg-catalog/warehouse/default/test_data_04/-tmp.HIVE_UNION_SUBDIR_2
drwxr-xr-x   - hive hdfs          0 2024-05-23 10:26 /iceberg-catalog/warehouse/default/test_data_04/-tmp.HIVE_UNION_SUBDIR_3
{code}

was: (identical except that the first comment line read "--spark:" rather than "--spark 3.4.1 + iceberg 1.5.2:")

> HIVE does not support update operations for ICEBERG of type location_based_table
> ---------------------------------------------------------------------------------
>
>                 Key: HIVE-28277
>                 URL: https://issues.apache.org/jira/browse/HIVE-28277
>             Project: Hive
>          Issue Type: Improvement
>          Components: Iceberg integration
>    Affects Versions: 4.0.0
>         Environment: ICEBERG: 1.5.2
>                      HIVE: 4.0.0
>            Reporter: yongzhi.shao
>            Priority: Major
>
> Currently, when I run an UPDATE against a location_based_table from Hive, Hive
> incorrectly empties the table's data and metadata directories.
> After the UPDATE statement is executed, the Iceberg table is corrupted.
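For context on why the commit fails with NoSuchTableException: a location-based (HadoopTables) Iceberg table is resolved entirely from the numbered metadata files under `<location>/metadata`, so once the UPDATE has replaced the table directory with `-tmp.HIVE_UNION_SUBDIR_*` output directories there is nothing left to load. A rough Python sketch of that lookup (illustrative only — the real logic is Iceberg's Java HadoopTableOperations, which also consults a `version-hint.text` pointer):

```python
import os
import re
import tempfile

def find_current_metadata(table_location):
    """Roughly mimic how a location-based Iceberg table resolves its current
    state: pick the highest-numbered v<N>.metadata.json under
    <location>/metadata. If that directory is missing (as after the buggy
    UPDATE above), there is no table at the location."""
    metadata_dir = os.path.join(table_location, "metadata")
    if not os.path.isdir(metadata_dir):
        raise FileNotFoundError(
            f"Table does not exist at location: {table_location}")
    versions = [int(m.group(1))
                for name in os.listdir(metadata_dir)
                if (m := re.match(r"v(\d+)\.metadata\.json$", name))]
    if not versions:
        raise FileNotFoundError(
            f"Table does not exist at location: {table_location}")
    return os.path.join(metadata_dir, f"v{max(versions)}.metadata.json")

# Demo: a healthy layout resolves; a wiped one does not.
with tempfile.TemporaryDirectory() as root:
    os.makedirs(os.path.join(root, "metadata"))
    for v in ("v1", "v2"):
        open(os.path.join(root, "metadata", f"{v}.metadata.json"), "w").close()
    print(os.path.basename(find_current_metadata(root)))  # v2.metadata.json
```

This is only a sketch of the lookup the error message implies; once the `metadata` directory is deleted, no engine (Hive or Spark) can recover the table from the location alone.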
>
>

--
This message was sent by Atlassian Jira
(v8.20.10#820010)