[ 
https://issues.apache.org/jira/browse/HIVE-26975?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Rajesh Balamohan updated HIVE-26975:
------------------------------------
    Summary: Iceberg: MERGE: Wrong reducer estimate causing smaller files to be 
created  (was: MERGE: Wrong reducer estimate causing smaller files to be 
created)

> Iceberg: MERGE: Wrong reducer estimate causing smaller files to be created
> --------------------------------------------------------------------------
>
>                 Key: HIVE-26975
>                 URL: https://issues.apache.org/jira/browse/HIVE-26975
>             Project: Hive
>          Issue Type: Improvement
>          Components: Iceberg integration
>            Reporter: Rajesh Balamohan
>            Priority: Major
>              Labels: performance
>
> * "Merge into" estimates the wrong number of reducers, causing a large number of 
> small files to be created, e.g. 400+ files of 3+ MB each.
>  * This can be reproduced by writing data into the "store_sales" table in Iceberg 
> format from another source table (using MERGE INTO).
>  ** E.g. running the statement below a few times will create the wrong number of reduce tasks, 
> causing a lot of small files to be created in the Iceberg table.
> {noformat}
> MERGE INTO store_sales_t t
> using ssv s
> ON ( t.ss_item_sk = s.ss_item_sk
>      AND t.ss_customer_sk = s.ss_customer_sk
>      AND t.ss_sold_date_sk = "2451181"
>      AND ( ( Floor(( s.ss_item_sk ) / 1000) * 1000 ) BETWEEN 1000 AND 2000 )
>      AND s.ss_ext_discount_amt < 0.0 )
> WHEN matched AND t.ss_ext_discount_amt IS NULL THEN
>   UPDATE SET ss_ext_discount_amt = 0.0
> WHEN NOT matched THEN
>   INSERT ( ss_sold_time_sk,
>            ss_item_sk,
>            ss_customer_sk,
>            ss_cdemo_sk,
>            ss_hdemo_sk,
>            ss_addr_sk,
>            ss_store_sk,
>            ss_promo_sk,
>            ss_ticket_number,
>            ss_quantity,
>            ss_wholesale_cost,
>            ss_list_price,
>            ss_sales_price,
>            ss_ext_discount_amt,
>            ss_ext_sales_price,
>            ss_ext_wholesale_cost,
>            ss_ext_list_price,
>            ss_ext_tax,
>            ss_coupon_amt,
>            ss_net_paid,
>            ss_net_paid_inc_tax,
>            ss_net_profit,
>            ss_sold_date_sk )
>   VALUES ( s.ss_sold_time_sk,
>            s.ss_item_sk,
>            s.ss_customer_sk,
>            s.ss_cdemo_sk,
>            s.ss_hdemo_sk,
>            s.ss_addr_sk,
>            s.ss_store_sk,
>            s.ss_promo_sk,
>            s.ss_ticket_number,
>            s.ss_quantity,
>            s.ss_wholesale_cost,
>            s.ss_list_price,
>            s.ss_sales_price,
>            s.ss_ext_discount_amt,
>            s.ss_ext_sales_price,
>            s.ss_ext_wholesale_cost,
>            s.ss_ext_list_price,
>            s.ss_ext_tax,
>            s.ss_coupon_amt,
>            s.ss_net_paid,
>            s.ss_net_paid_inc_tax,
>            s.ss_net_profit,
>            "2451181") 
> {noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to