Sourabh Badhya created HIVE-27536:
-------------------------------------

             Summary: Merge task must be invoked after optimisation for 
external CTAS queries
                 Key: HIVE-27536
                 URL: https://issues.apache.org/jira/browse/HIVE-27536
             Project: Hive
          Issue Type: Bug
            Reporter: Sourabh Badhya
            Assignee: Sourabh Badhya


Merge task is not invoked on S3 file system / object stores when CTAS query is 
performed. 
Repro test - Test.q
{code:java}
--! qt:dataset:src
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.tezfiles=true;
set hive.blobstore.supported.schemes=hdfs,file;
set hive.merge.smallfiles.avgsize=7500;

-- SORT_QUERY_RESULTS

create table part_source(key string, value string) partitioned by (ds string);
create table source(key string);

-- The partitioned table must have 2 files per partition (necessary for merge 
task)
insert overwrite table part_source partition(ds='102') select * from src;
insert into table part_source partition(ds='102') select * from src;
insert overwrite table part_source partition(ds='103') select * from src;
insert into table part_source partition(ds='102') select * from src;

-- The unpartitioned table must have 2 files.
insert overwrite table source select key from src;
insert into table source select key from src;

-- Create CTAS tables both for unpartitioned and partitioned cases for ORC 
formats.
explain analyze create external table ctas_table stored as orc as select * from 
source;
create external table ctas_table stored as orc as select * from source;
explain analyze create external table ctas_part_table partitioned by (ds) 
stored as orc as select * from part_source;
create external table ctas_part_table partitioned by (ds) stored as orc as 
select * from part_source;

-- This must be 1 indicating there is 1 file after merge.
select count(distinct(INPUT__FILE__NAME)) from ctas_table;
-- This must be 2 indicating there is 1 file per partition after merge.
select count(distinct(INPUT__FILE__NAME)) from ctas_part_table;

-- Create CTAS tables both for unpartitioned and partitioned cases for non-ORC 
formats.
explain analyze create external table ctas_table_non_orc as select * from 
source;
create external table ctas_table_non_orc as select * from source;
explain analyze create external table ctas_part_table_non_orc partitioned by 
(ds) as select * from part_source;
create external table ctas_part_table_non_orc partitioned by (ds) as select * 
from part_source;

-- This must be 1 indicating there is 1 file after merge.
select count(distinct(INPUT__FILE__NAME)) from ctas_table_non_orc;
-- This must be 2 indicating there is 1 file per partition after merge.
select count(distinct(INPUT__FILE__NAME)) from ctas_part_table_non_orc;
{code}
 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to