[
https://issues.apache.org/jira/browse/HIVE-27901?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
yongzhi.shao updated HIVE-27901:
--------------------------------
Description:
I am using HIVE4.0.0-BETA for testing.
BTW,I found that the performance of HIVE reading ICEBERG table is still very
slow.
How should I deal with this problem?
I count a 7 billion table and compare the performance difference between HIVE
reading ICEBERG-ORC and ORC table respectively.
We use ICEBERG 1.4.2, ICEBERG-ORC with ZSTD compression enabled.
ORC with SNAPPY compression.
HADOOP version 3.1.1 (native zstd not supported).
{code:java}
--spark3.4.1+iceberg 1.4.2
CREATE TABLE datacenter.dwd.b_std_trade (
uni_order_id STRING,
data_from BIGINT,
partner STRING,
plat_code STRING,
order_id STRING,
uni_shop_id STRING,
uni_id STRING,
guide_id STRING,
shop_id STRING,
plat_account STRING,
total_fee DOUBLE,
item_discount_fee DOUBLE,
trade_discount_fee DOUBLE,
adjust_fee DOUBLE,
post_fee DOUBLE,
discount_rate DOUBLE,
payment_no_postfee DOUBLE,
payment DOUBLE,
pay_time STRING,
product_num BIGINT,
order_status STRING,
is_refund STRING,
refund_fee DOUBLE,
insert_time STRING,
created STRING,
endtime STRING,
modified STRING,
trade_type STRING,
receiver_name STRING,
receiver_country STRING,
receiver_state STRING,
receiver_city STRING,
receiver_district STRING,
receiver_town STRING,
receiver_address STRING,
receiver_mobile STRING,
trade_source STRING,
delivery_type STRING,
consign_time STRING,
orders_num BIGINT,
is_presale BIGINT,
presale_status STRING,
first_fee_paytime STRING,
last_fee_paytime STRING,
first_paid_fee DOUBLE,
tenant STRING,
tidb_modified STRING,
step_paid_fee DOUBLE,
seller_flag STRING,
is_used_store_card BIGINT,
store_card_used DOUBLE,
store_card_basic_used DOUBLE,
store_card_expand_used DOUBLE,
order_promotion_num BIGINT,
item_promotion_num BIGINT,
buyer_remark STRING,
seller_remark STRING,
trade_business_type STRING)
USING iceberg
PARTITIONED BY (uni_shop_id, truncate(4, created))
LOCATION '/iceberg-catalog/warehouse/dwd/b_std_trade'
TBLPROPERTIES (
'current-snapshot-id' = '7217819472703702905',
'format' = 'iceberg/orc',
'format-version' = '1',
'hive.stored-as' = 'iceberg',
'read.orc.vectorization.enabled' = 'true',
'sort-order' = 'uni_shop_id ASC NULLS FIRST, created ASC NULLS FIRST',
'write.distribution-mode' = 'hash',
'write.format.default' = 'orc',
'write.metadata.delete-after-commit.enabled' = 'true',
'write.metadata.previous-versions-max' = '3',
'write.orc.bloom.filter.columns' = 'order_id',
'write.orc.compression-codec' = 'zstd')
--hive-iceberg
CREATE EXTERNAL TABLE iceberg_dwd.b_std_trade
STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
LOCATION 'hdfs://xxxx/iceberg-catalog/warehouse/dwd/b_std_trade'
TBLPROPERTIES
('iceberg.catalog'='location_based_table','engine.hive.enabled'='true');
--inner orc table( set hive default format = orc )
create table if not exists iceberg_dwd.orc_inner_table as select * from
iceberg_dwd.b_std_trade;{code}
!image-2023-11-22-18-32-28-344.png!
!image-2023-11-22-18-33-01-885.png!
Also, I have another question. The Submit Plan statistic is clearly incorrect.
Is this something that needs to be fixed?
!image-2023-11-22-18-33-32-915.png!
was:
I am using HIVE4.0.0-BETA for testing.
BTW,I found that the performance of HIVE reading ICEBERG table is still very
slow.
How should I deal with this problem?
I count a 7 billion table and compare the performance difference between HIVE
reading ICEBERG-ORC and ORC table respectively.
We use ICEBERG 1.4.2, ICEBERG-ORC with ZSTD compression enabled.
ORC with SNAPPY compression.
HADOOP version 3.1.1 (native zstd not supported).
!image-2023-11-22-18-32-28-344.png!
!image-2023-11-22-18-33-01-885.png!
Also, I have another question. The Submit Plan statistic is clearly incorrect.
Is this something that needs to be fixed?
!image-2023-11-22-18-33-32-915.png!
> Hive's performance for querying the Iceberg table is very poor.
> ---------------------------------------------------------------
>
> Key: HIVE-27901
> URL: https://issues.apache.org/jira/browse/HIVE-27901
> Project: Hive
> Issue Type: Bug
> Components: Iceberg integration
> Affects Versions: 4.0.0-beta-1
> Reporter: yongzhi.shao
> Priority: Major
> Attachments: image-2023-11-22-18-32-28-344.png,
> image-2023-11-22-18-33-01-885.png, image-2023-11-22-18-33-32-915.png
>
>
> I am using HIVE4.0.0-BETA for testing.
> BTW,I found that the performance of HIVE reading ICEBERG table is still very
> slow.
> How should I deal with this problem?
> I count a 7 billion table and compare the performance difference between HIVE
> reading ICEBERG-ORC and ORC table respectively.
> We use ICEBERG 1.4.2, ICEBERG-ORC with ZSTD compression enabled.
> ORC with SNAPPY compression.
> HADOOP version 3.1.1 (native zstd not supported).
> {code:java}
> --spark3.4.1+iceberg 1.4.2
> CREATE TABLE datacenter.dwd.b_std_trade (
> uni_order_id STRING,
> data_from BIGINT,
> partner STRING,
> plat_code STRING,
> order_id STRING,
> uni_shop_id STRING,
> uni_id STRING,
> guide_id STRING,
> shop_id STRING,
> plat_account STRING,
> total_fee DOUBLE,
> item_discount_fee DOUBLE,
> trade_discount_fee DOUBLE,
> adjust_fee DOUBLE,
> post_fee DOUBLE,
> discount_rate DOUBLE,
> payment_no_postfee DOUBLE,
> payment DOUBLE,
> pay_time STRING,
> product_num BIGINT,
> order_status STRING,
> is_refund STRING,
> refund_fee DOUBLE,
> insert_time STRING,
> created STRING,
> endtime STRING,
> modified STRING,
> trade_type STRING,
> receiver_name STRING,
> receiver_country STRING,
> receiver_state STRING,
> receiver_city STRING,
> receiver_district STRING,
> receiver_town STRING,
> receiver_address STRING,
> receiver_mobile STRING,
> trade_source STRING,
> delivery_type STRING,
> consign_time STRING,
> orders_num BIGINT,
> is_presale BIGINT,
> presale_status STRING,
> first_fee_paytime STRING,
> last_fee_paytime STRING,
> first_paid_fee DOUBLE,
> tenant STRING,
> tidb_modified STRING,
> step_paid_fee DOUBLE,
> seller_flag STRING,
> is_used_store_card BIGINT,
> store_card_used DOUBLE,
> store_card_basic_used DOUBLE,
> store_card_expand_used DOUBLE,
> order_promotion_num BIGINT,
> item_promotion_num BIGINT,
> buyer_remark STRING,
> seller_remark STRING,
> trade_business_type STRING)
> USING iceberg
> PARTITIONED BY (uni_shop_id, truncate(4, created))
> LOCATION '/iceberg-catalog/warehouse/dwd/b_std_trade'
> TBLPROPERTIES (
> 'current-snapshot-id' = '7217819472703702905',
> 'format' = 'iceberg/orc',
> 'format-version' = '1',
> 'hive.stored-as' = 'iceberg',
> 'read.orc.vectorization.enabled' = 'true',
> 'sort-order' = 'uni_shop_id ASC NULLS FIRST, created ASC NULLS FIRST',
> 'write.distribution-mode' = 'hash',
> 'write.format.default' = 'orc',
> 'write.metadata.delete-after-commit.enabled' = 'true',
> 'write.metadata.previous-versions-max' = '3',
> 'write.orc.bloom.filter.columns' = 'order_id',
> 'write.orc.compression-codec' = 'zstd')
> --hive-iceberg
> CREATE EXTERNAL TABLE iceberg_dwd.b_std_trade
> STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
> LOCATION 'hdfs://xxxx/iceberg-catalog/warehouse/dwd/b_std_trade'
> TBLPROPERTIES
> ('iceberg.catalog'='location_based_table','engine.hive.enabled'='true');
> --inner orc table( set hive default format = orc )
> create table if not exists iceberg_dwd.orc_inner_table as select * from
> iceberg_dwd.b_std_trade;{code}
>
> !image-2023-11-22-18-32-28-344.png!
> !image-2023-11-22-18-33-01-885.png!
> Also, I have another question. The Submit Plan statistic is clearly
> incorrect. Is this something that needs to be fixed?
> !image-2023-11-22-18-33-32-915.png!
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)