[
https://issues.apache.org/jira/browse/HIVE-27303?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
ASF GitHub Bot updated HIVE-27303:
----------------------------------
Labels: pull-request-available (was: )
> select query result is different when enable/disable mapjoin with UNION ALL
> ---------------------------------------------------------------------------
>
> Key: HIVE-27303
> URL: https://issues.apache.org/jira/browse/HIVE-27303
> Project: Hive
> Issue Type: Bug
> Reporter: Mahesh Raju Somalaraju
> Assignee: Mahesh Raju Somalaraju
> Priority: Major
> Labels: pull-request-available
>
> select query result is different when enable/disable mapjoin with UNION ALL
> Below are the reproduce steps.
> As per query when map.join is disabled it should not give rows(duplicate).
> Same is working fine with map.join=true.
> Expected result: Empty rows.
> Problem: returning duplicate rows.
> Steps:
> ----------
> SET hive.server2.tez.queue.access.check=true;
> SET tez.queue.name=default
> SET hive.query.results.cache.enabled=false;
> SET hive.fetch.task.conversion=none;
> SET hive.execution.engine=tez;
> SET hive.stats.autogather=true;
> SET hive.server2.enable.doAs=false;
> SET hive.auto.convert.join=false;
> drop table if exists hive1_tbl_data;
> drop table if exists hive2_tbl_data;
> drop table if exists hive3_tbl_data;
> drop table if exists hive4_tbl_data;
> CREATE EXTERNAL TABLE hive1_tbl_data (COLUMID string,COLUMN_FN
> string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM
> string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
> TBLPROPERTIES (
> 'TRANSLATED_TO_EXTERNAL'='true',
> 'bucketing_version'='2',
> 'external.table.purge'='true',
> 'parquet.compression'='SNAPPY');
> CREATE EXTERNAL TABLE hive2_tbl_data (COLUMID string,COLUMN_FN
> string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM
> string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
> TBLPROPERTIES (
> 'TRANSLATED_TO_EXTERNAL'='true',
> 'bucketing_version'='2',
> 'external.table.purge'='true',
> 'parquet.compression'='SNAPPY');
> CREATE EXTERNAL TABLE hive3_tbl_data (COLUMID string,COLUMN_FN
> string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM
> string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
> TBLPROPERTIES (
> 'TRANSLATED_TO_EXTERNAL'='true',
> 'bucketing_version'='2',
> 'external.table.purge'='true',
> 'parquet.compression'='SNAPPY');
> CREATE EXTERNAL TABLE hive4_tbl_data (COLUMID string,COLUMN_FN
> string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM
> string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
> TBLPROPERTIES (
> 'TRANSLATED_TO_EXTERNAL'='true',
> 'bucketing_version'='2',
> 'external.table.purge'='true',
> 'parquet.compression'='SNAPPY');
>
> insert into table hive1_tbl_data select
> '00001','john','doe','[email protected]','2014-01-01 12:01:02','4000-10000';
> insert into table hive1_tbl_data select
> '00002','john','doe','[email protected]','2014-01-01
> 12:01:02','4000-10000';insert into table hive2_tbl_data select
> '00001','john','doe','[email protected]','2014-01-01 12:01:02','00001';
> insert into table hive2_tbl_data select
> '00002','john','doe','[email protected]','2014-01-01 12:01:02','00001';
>
> select
> t.COLUMID
> from (
> select distinct
> t.COLUMID as COLUMID
> from (SELECT COLUMID FROM hive3_tbl_data UNION ALL SELECT COLUMID FROM
> hive1_tbl_data) t
> ) t
> left join (
> select
> distinct t.COLUMID
> from (SELECT COLUMID FROM hive4_tbl_data UNION ALL SELECT COLUMID FROM
> hive2_tbl_data) t
> ) t1 on t.COLUMID = t1.COLUMID
> where t1.COLUMID is null;
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)