[ 
https://issues.apache.org/jira/browse/HIVE-29328?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Zsolt Miskolczi updated HIVE-29328:
-----------------------------------
    Description: 
The Orc file format allows metadata field names regardless of casing. For 
example, we have manual tests where query-based compaction creates Orc files 
with lowercase fields (the root cause of this is under investigation).

As OrcInputFormat.isOriginal currently checks the field names with strict 
casing, FixAcidKeyIndex can fail if the Orc file footer contains its metadata 
fields (like currentTransaction) in lowercase.

 

To repro the issue manually: 

 
{{set hive.support.concurrency=true;}}
{{set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;}}
{{set hive.compactor.crud.query.based=true;}}

{{DROP TABLE IF EXISTS testtable;}}
{{DROP TABLE IF EXISTS testtable_chg1;}}
{{DROP TABLE IF EXISTS testtable_chg2;}}

{{CREATE TABLE testtable (iD int, tXt string) STORED AS ORC}}
{{ TBLPROPERTIES ('NO_AUTO_COMPACTION'='true', 'transactional'='true');}}
{{CREATE TABLE testtable_chg1 (iD int, tXt string) STORED AS ORC;}}
{{CREATE TABLE testtable_chg2 (iD int, tXt string) STORED AS ORC;}}

{{INSERT INTO testtable VALUES (1, 'base-A'), (2, 'base-B');}}
{{INSERT INTO testtable_chg1 VALUES (1, 'chg1-A'), (3, 'chg1-C'), (4, 
'chg1-D');}}
{{INSERT INTO testtable_chg2 VALUES (1, 'chg2-A'), (3, 'chg2-C'), (5, 
'chg2-E');}}

{{MERGE INTO testtable AS T}}
{{ USING testtable_chg1 AS S ON T.id = S.id}}
{{ WHEN MATCHED AND (T.txt != S.txt AND S.txt IS NOT NULL) THEN UPDATE SET txt 
= S.txt}}
{{ WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.txt);}}

{{MERGE INTO testtable AS T}}
{{ USING testtable_chg2 AS S ON T.id = S.id}}
{{ WHEN MATCHED AND (T.txt != S.txt AND S.txt IS NOT NULL) THEN UPDATE SET txt 
= S.txt}}
{{ WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.txt);}}

{{ALTER TABLE testtable COMPACT 'MINOR' AND WAIT;}}

 

And in terminal:

 

{{orc-tools data delta_0000001_0000003_v0000005/bucket_00000}}
{{Processing data file delta_0000001_0000003_v0000005/bucket_00000 [length: 
777]}}
{{{"operation":0,"originaltransaction":1,"bucket":536870912,"rowid":0,"currenttransaction":1,"row":{"id":1,"txt":"base-A"}}}}
{{{"operation":0,"originaltransaction":1,"bucket":536870912,"rowid":1,"currenttransaction":1,"row":{"id":2,"txt":"base-B"}}}}
{{{"operation":0,"originaltransaction":2,"bucket":536870913,"rowid":0,"currenttransaction":2,"row":{"id":1,"txt":"chg1-A"}}}}
{{{"operation":0,"originaltransaction":2,"bucket":536870914,"rowid":0,"currenttransaction":2,"row":{"id":3,"txt":"chg1-C"}}}}
{{{"operation":0,"originaltransaction":2,"bucket":536870914,"rowid":1,"currenttransaction":2,"row":{"id":4,"txt":"chg1-D"}}}}
{{{"operation":0,"originaltransaction":3,"bucket":536870913,"rowid":0,"currenttransaction":3,"row":{"id":1,"txt":"chg2-A"}}}}
{{{"operation":0,"originaltransaction":3,"bucket":536870913,"rowid":1,"currenttransaction":3,"row":{"id":3,"txt":"chg2-C"}}}}
{{{"operation":0,"originaltransaction":3,"bucket":536870914,"rowid":0,"currenttransaction":3,"row":{"id":5,"txt":"chg2-E"}}}}

 

  was:
The Orc file format allows metadata field names regardless of casing. For 
example, we have manual tests where query-based compaction creates Orc files 
with lowercase fields (the root cause of this is under investigation).

As OrcInputFormat.isOriginal currently checks the field names with strict 
casing, FixAcidKeyIndex can fail if the Orc file footer contains its metadata 
fields (like currentTransaction) in lowercase.

 

To repro the issue manually: 

 
{{set hive.support.concurrency=true;}}
{{set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;}}
{{set hive.compactor.crud.query.based=true;}}

{{DROP TABLE IF EXISTS testtable;}}
{{DROP TABLE IF EXISTS testtable_chg1;}}
{{DROP TABLE IF EXISTS testtable_chg2;}}

{{CREATE TABLE testtable (iD int, tXt string) STORED AS ORC}}
{{ TBLPROPERTIES ('NO_AUTO_COMPACTION'='true', 'transactional'='true');}}
{{CREATE TABLE testtable_chg1 (iD int, tXt string) STORED AS ORC;}}
{{CREATE TABLE testtable_chg2 (iD int, tXt string) STORED AS ORC;}}

{{INSERT INTO testtable VALUES (1, 'base-A'), (2, 'base-B');}}
{{INSERT INTO testtable_chg1 VALUES (1, 'chg1-A'), (3, 'chg1-C'), (4, 
'chg1-D');}}
{{INSERT INTO testtable_chg2 VALUES (1, 'chg2-A'), (3, 'chg2-C'), (5, 
'chg2-E');}}

{{MERGE INTO testtable AS T}}
{{ USING testtable_chg1 AS S ON T.id = S.id}}
{{ WHEN MATCHED AND (T.txt != S.txt AND S.txt IS NOT NULL) THEN UPDATE SET txt 
= S.txt}}
{{ WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.txt);}}

{{MERGE INTO testtable AS T}}
{{ USING testtable_chg2 AS S ON T.id = S.id}}
{{ WHEN MATCHED AND (T.txt != S.txt AND S.txt IS NOT NULL) THEN UPDATE SET txt 
= S.txt}}
{{ WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.txt);}}

{{ALTER TABLE testtable COMPACT 'MINOR' AND WAIT;}}

 

 


> Orc acid footer metadata should be case insensitive
> ---------------------------------------------------
>
>                 Key: HIVE-29328
>                 URL: https://issues.apache.org/jira/browse/HIVE-29328
>             Project: Hive
>          Issue Type: Bug
>            Reporter: Zsolt Miskolczi
>            Priority: Major
>              Labels: pull-request-available
>
> The Orc file format allows metadata field names regardless of casing. For 
> example, we have manual tests where query-based compaction creates Orc files 
> with lowercase fields (the root cause of this is under investigation).
> As OrcInputFormat.isOriginal currently checks the field names with strict 
> casing, FixAcidKeyIndex can fail if the Orc file footer contains its metadata 
> fields (like currentTransaction) in lowercase.
>  
> To repro the issue manually: 
>  
> {{set hive.support.concurrency=true;}}
> {{set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;}}
> {{set hive.compactor.crud.query.based=true;}}
> {{DROP TABLE IF EXISTS testtable;}}
> {{DROP TABLE IF EXISTS testtable_chg1;}}
> {{DROP TABLE IF EXISTS testtable_chg2;}}
> {{CREATE TABLE testtable (iD int, tXt string) STORED AS ORC}}
> {{ TBLPROPERTIES ('NO_AUTO_COMPACTION'='true', 'transactional'='true');}}
> {{CREATE TABLE testtable_chg1 (iD int, tXt string) STORED AS ORC;}}
> {{CREATE TABLE testtable_chg2 (iD int, tXt string) STORED AS ORC;}}
> {{INSERT INTO testtable VALUES (1, 'base-A'), (2, 'base-B');}}
> {{INSERT INTO testtable_chg1 VALUES (1, 'chg1-A'), (3, 'chg1-C'), (4, 
> 'chg1-D');}}
> {{INSERT INTO testtable_chg2 VALUES (1, 'chg2-A'), (3, 'chg2-C'), (5, 
> 'chg2-E');}}
> {{MERGE INTO testtable AS T}}
> {{ USING testtable_chg1 AS S ON T.id = S.id}}
> {{ WHEN MATCHED AND (T.txt != S.txt AND S.txt IS NOT NULL) THEN UPDATE SET 
> txt = S.txt}}
> {{ WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.txt);}}
> {{MERGE INTO testtable AS T}}
> {{ USING testtable_chg2 AS S ON T.id = S.id}}
> {{ WHEN MATCHED AND (T.txt != S.txt AND S.txt IS NOT NULL) THEN UPDATE SET 
> txt = S.txt}}
> {{ WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.txt);}}
> {{ALTER TABLE testtable COMPACT 'MINOR' AND WAIT;}}
>  
> And in terminal:
>  
> {{orc-tools data delta_0000001_0000003_v0000005/bucket_00000}}
> {{Processing data file delta_0000001_0000003_v0000005/bucket_00000 [length: 
> 777]}}
> {{{"operation":0,"originaltransaction":1,"bucket":536870912,"rowid":0,"currenttransaction":1,"row":{"id":1,"txt":"base-A"}}}}
> {{{"operation":0,"originaltransaction":1,"bucket":536870912,"rowid":1,"currenttransaction":1,"row":{"id":2,"txt":"base-B"}}}}
> {{{"operation":0,"originaltransaction":2,"bucket":536870913,"rowid":0,"currenttransaction":2,"row":{"id":1,"txt":"chg1-A"}}}}
> {{{"operation":0,"originaltransaction":2,"bucket":536870914,"rowid":0,"currenttransaction":2,"row":{"id":3,"txt":"chg1-C"}}}}
> {{{"operation":0,"originaltransaction":2,"bucket":536870914,"rowid":1,"currenttransaction":2,"row":{"id":4,"txt":"chg1-D"}}}}
> {{{"operation":0,"originaltransaction":3,"bucket":536870913,"rowid":0,"currenttransaction":3,"row":{"id":1,"txt":"chg2-A"}}}}
> {{{"operation":0,"originaltransaction":3,"bucket":536870913,"rowid":1,"currenttransaction":3,"row":{"id":3,"txt":"chg2-C"}}}}
> {{{"operation":0,"originaltransaction":3,"bucket":536870914,"rowid":0,"currenttransaction":3,"row":{"id":5,"txt":"chg2-E"}}}}
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to