Sahil Takiar created HIVE-14082:
-----------------------------------

             Summary: Multi-Insert Query Fails with GROUP BY, DISTINCT, and 
WHERE clauses
                 Key: HIVE-14082
                 URL: https://issues.apache.org/jira/browse/HIVE-14082
             Project: Hive
          Issue Type: Bug
    Affects Versions: 2.1.0, 1.1.0
            Reporter: Sahil Takiar


The following MULTI-INSERT Query Fails in Hive. I've listed the query required 
to re-produce this failure, as well as a few similar queries that work properly.

Setup Queries:

{code}
DROP SCHEMA IF EXISTS multi_table_insert_bug CASCADE;
CREATE SCHEMA multi_table_insert_bug;
USE multi_table_insert_bug;

DROP TABLE IF EXISTS multi_table_insert_source;
DROP TABLE IF EXISTS multi_table_insert_test;

CREATE TABLE multi_table_insert_source (
  date_column DATE,
  column_1 STRING,
  column_2 STRING,
  column_3 STRING,
  column_4 STRING
);

CREATE TABLE multi_table_insert_test (
  column_1 STRING,
  column_2 STRING,
  line_count INT,
  distinct_count_by_1_column INT,
  distinct_count_by_2_columns INT
)
PARTITIONED BY (partition_column INT);

INSERT OVERWRITE TABLE multi_table_insert_source VALUES
  ('2016-01-22', 'value_1_1', 'value_1_2', 'value_1_3', 'value_1_4'),
  ('2016-01-22', 'value_2_1', 'value_2_2', 'value_2_3', 'value_2_4'),
  ('2016-01-22', 'value_3_1', 'value_3_2', 'value_3_3', 'value_3_4'),
  ('2016-01-22', 'value_4_1', 'value_4_2', 'value_4_3', 'value_4_4'),
  ('2016-01-22', 'value_5_1', 'value_5_2', 'value_5_3', 'value_5_4');
{code}


The following queries run successfully:

*Query 1:*

{code}
FROM multi_table_insert_source
  INSERT OVERWRITE TABLE multi_table_insert_test PARTITION (partition_column = 
365)
  SELECT
    column_1,
    column_2,
    COUNT(*) AS line_count,
    COUNT(DISTINCT column_3) AS distinct_count_by_1_column,
    COUNT(DISTINCT date_column, column_3) AS distinct_count_by_2_columns
  WHERE date_column >= DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP()), 365)
  GROUP BY
    column_1,
    column_2;
{code}

*Query 2:*

{code}
FROM multi_table_insert_source
  INSERT OVERWRITE TABLE multi_table_insert_test PARTITION (partition_column = 
365)
  SELECT
    column_1,
    column_2,
    COUNT(*) AS line_count,
    COUNT(DISTINCT column_3) AS distinct_count_by_1_column,
    COUNT(DISTINCT date_column, column_3) AS distinct_count_by_2_columns
--  WHERE date_column >= DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP()), 365)
  GROUP BY
    column_1,
    column_2
  INSERT OVERWRITE TABLE multi_table_insert_test PARTITION (partition_column = 
1096)
  SELECT
    column_1,
    column_2,
    COUNT(*) AS line_count,
    COUNT(DISTINCT column_3) AS distinct_count_by_1_column,
    COUNT(DISTINCT date_column, column_3) AS distinct_count_by_2_columns
--  WHERE date_column >= DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP()), 1096)
  GROUP BY
    column_1,
    column_2;
{code}

The following query fails with an {{IndexOutOfBoundsException Index: 3, Size: 
3}} the only difference between this query and the previous one is the WHERE 
clause that I've commented out above.

*Query 3:*

{code}
FROM multi_table_insert_source
  INSERT OVERWRITE TABLE multi_table_insert_test PARTITION (partition_column = 
365)
  SELECT
    column_1,
    column_2,
    COUNT(*) AS line_count,
    COUNT(DISTINCT column_3) AS distinct_count_by_1_column,
    COUNT(DISTINCT date_column, column_3) AS distinct_count_by_2_columns
  WHERE date_column >= DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP()), 365)
  GROUP BY
    column_1,
    column_2
  INSERT OVERWRITE TABLE multi_table_insert_test PARTITION (partition_column = 
1096)
  SELECT
    column_1,
    column_2,
    COUNT(*) AS line_count,
    COUNT(DISTINCT column_3) AS distinct_count_by_1_column,
    COUNT(DISTINCT date_column, column_3) AS distinct_count_by_2_columns
  WHERE date_column >= DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP()), 1096)
  GROUP BY
    column_1,
    column_2;
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to