[
https://issues.apache.org/jira/browse/HIVE-23889?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
László Bodor updated HIVE-23889:
--------------------------------
Description:
HIVE-21784 uses a new WriterOptions instead of the field in OrcRecordUpdater:
https://github.com/apache/hive/commit/f62379ba279f41b843fcd5f3d4a107b6fcd04dec#diff-bb969e858664d98848960a801fd58b5cR580-R583
so in this scenario, the overwrite creates an empty bucket file, which is fine
as that was the intention of that patch, but it creates the file with an
invalid schema:
{code}
CREATE TABLE default.bdaa28846 (
cda_id int,
cda_run_id varchar(255),
cda_load_ts timestamp,
global_party_id string)
PARTITIONED BY (
cda_date int,
cda_job_name varchar(12))
CLUSTERED BY (cda_id)
INTO 2 BUCKETS
STORED AS ORC;
INSERT OVERWRITE TABLE default.bdaa28846 PARTITION (cda_date = 20200601 ,
cda_job_name = 'core_base')
SELECT 1 as cda_id,'cda_run_id' as cda_run_id, NULL as cda_load_ts,
'global_party_id' global_party_id
UNION ALL
SELECT 2 as cda_id,'cda_run_id' as cda_run_id, NULL as cda_load_ts,
'global_party_id' global_party_id;
ALTER TABLE default.bdaa28846 ADD COLUMNS (group_id string) CASCADE ;
INSERT OVERWRITE TABLE default.bdaa28846 PARTITION (cda_date = 20200601 ,
cda_job_name = 'core_base')
SELECT 1 as cda_id,'cda_run_id' as cda_run_id, NULL as cda_load_ts,
'global_party_id' global_party_id, 'group_id' as group_id;
{code}
was:HIVE-
> Empty bucket files are inserted with invalid schema after HIVE-21784
> --------------------------------------------------------------------
>
> Key: HIVE-23889
> URL: https://issues.apache.org/jira/browse/HIVE-23889
> Project: Hive
> Issue Type: Bug
> Reporter: László Bodor
> Priority: Major
>
> HIVE-21784 uses a new WriterOptions instead of the field in OrcRecordUpdater:
> https://github.com/apache/hive/commit/f62379ba279f41b843fcd5f3d4a107b6fcd04dec#diff-bb969e858664d98848960a801fd58b5cR580-R583
> so in this scenario, the overwrite creates an empty bucket file, which is
> fine as that was the intention of that patch, but it creates the file with
> an invalid schema:
> {code}
> CREATE TABLE default.bdaa28846 (
> cda_id int,
> cda_run_id varchar(255),
> cda_load_ts timestamp,
> global_party_id string)
> PARTITIONED BY (
> cda_date int,
> cda_job_name varchar(12))
> CLUSTERED BY (cda_id)
> INTO 2 BUCKETS
> STORED AS ORC;
> INSERT OVERWRITE TABLE default.bdaa28846 PARTITION (cda_date = 20200601 ,
> cda_job_name = 'core_base')
> SELECT 1 as cda_id,'cda_run_id' as cda_run_id, NULL as cda_load_ts,
> 'global_party_id' global_party_id
> UNION ALL
> SELECT 2 as cda_id,'cda_run_id' as cda_run_id, NULL as cda_load_ts,
> 'global_party_id' global_party_id;
> ALTER TABLE default.bdaa28846 ADD COLUMNS (group_id string) CASCADE ;
> INSERT OVERWRITE TABLE default.bdaa28846 PARTITION (cda_date = 20200601 ,
> cda_job_name = 'core_base')
> SELECT 1 as cda_id,'cda_run_id' as cda_run_id, NULL as cda_load_ts,
> 'global_party_id' global_party_id, 'group_id' as group_id;
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)