[ 
https://issues.apache.org/jira/browse/HIVE-27536?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ASF GitHub Bot updated HIVE-27536:
----------------------------------
    Labels: pull-request-available  (was: )

> Merge task must be invoked after optimisation for external CTAS queries
> -----------------------------------------------------------------------
>
>                 Key: HIVE-27536
>                 URL: https://issues.apache.org/jira/browse/HIVE-27536
>             Project: Hive
>          Issue Type: Bug
>            Reporter: Sourabh Badhya
>            Assignee: Sourabh Badhya
>            Priority: Major
>              Labels: pull-request-available
>
> The merge task is not invoked on the S3 file system / object stores when an
> external CTAS query is performed.
> Repro test - Test.q
> {code:java}
> --! qt:dataset:src
> set hive.mapred.mode=nonstrict;
> set hive.explain.user=false;
> set hive.merge.mapredfiles=true;
> set hive.merge.mapfiles=true;
> set hive.merge.tezfiles=true;
> set hive.blobstore.supported.schemes=hdfs,file;
> set hive.merge.smallfiles.avgsize=7500;
> -- SORT_QUERY_RESULTS
> create table part_source(key string, value string) partitioned by (ds string);
> create table source(key string);
> -- The partitioned table must have 2 files per partition (necessary for the
> -- merge task).
> insert overwrite table part_source partition(ds='102') select * from src;
> insert into table part_source partition(ds='102') select * from src;
> insert overwrite table part_source partition(ds='103') select * from src;
> insert into table part_source partition(ds='103') select * from src;
> -- The unpartitioned table must have 2 files.
> insert overwrite table source select key from src;
> insert into table source select key from src;
> -- Create CTAS tables both for unpartitioned and partitioned cases for ORC
> -- formats.
> explain analyze create external table ctas_table stored as orc as select * 
> from source;
> create external table ctas_table stored as orc as select * from source;
> explain analyze create external table ctas_part_table partitioned by (ds) 
> stored as orc as select * from part_source;
> create external table ctas_part_table partitioned by (ds) stored as orc as 
> select * from part_source;
> -- This must be 1 indicating there is 1 file after merge.
> select count(distinct(INPUT__FILE__NAME)) from ctas_table;
> -- This must be 2 indicating there is 1 file per partition after merge.
> select count(distinct(INPUT__FILE__NAME)) from ctas_part_table;
> -- Create CTAS tables both for unpartitioned and partitioned cases for
> -- non-ORC formats.
> explain analyze create external table ctas_table_non_orc as select * from 
> source;
> create external table ctas_table_non_orc as select * from source;
> explain analyze create external table ctas_part_table_non_orc partitioned by 
> (ds) as select * from part_source;
> create external table ctas_part_table_non_orc partitioned by (ds) as select * 
> from part_source;
> -- This must be 1 indicating there is 1 file after merge.
> select count(distinct(INPUT__FILE__NAME)) from ctas_table_non_orc;
> -- This must be 2 indicating there is 1 file per partition after merge.
> select count(distinct(INPUT__FILE__NAME)) from ctas_part_table_non_orc;
> {code}
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to