[
https://issues.apache.org/jira/browse/HUDI-7955?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
voon updated HUDI-7955:
-----------------------
Description:
The invocation of *getPrimitiveJavaObject* returns a different implementation
of timestamp in Hive3 and Hive2.
- Hive2: *java.sql.Timestamp*
- Hive3: *org.apache.hadoop.hive.common.type.Timestamp*
Hudi common is compiled against Hive2, but Trino uses Hive3, causing a
discrepancy between compile time and runtime. The error is triggered when
execution reaches this section of the code under the conditions listed below:
1. MOR table is used
2. User is querying the _rt table
3. User's table has a *TIMESTAMP* type and query requires it
4. Merge is required as record is present in both Parquet and Log file
Error below will be thrown:
{code:java}
Query 20240704_075218_05052_yfmfc failed: 'java.sql.Timestamp
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector.getPrimitiveJavaObject(java.lang.Object)'
java.lang.NoSuchMethodError: 'java.sql.Timestamp
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector.getPrimitiveJavaObject(java.lang.Object)'
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.serializePrimitive(HiveAvroSerializer.java:304)
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.serialize(HiveAvroSerializer.java:212)
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.setUpRecordFieldFromWritable(HiveAvroSerializer.java:121)
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.serialize(HiveAvroSerializer.java:108)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.convertArrayWritableToHoodieRecord(RealtimeCompactedRecordReader.java:185)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.mergeRecord(RealtimeCompactedRecordReader.java:172)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.next(RealtimeCompactedRecordReader.java:114)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.next(RealtimeCompactedRecordReader.java:49)
at
org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.next(HoodieRealtimeRecordReader.java:88)
at
org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.next(HoodieRealtimeRecordReader.java:36)
at
io.trino.plugin.hive.GenericHiveRecordCursor.advanceNextPosition(GenericHiveRecordCursor.java:215)
at
io.trino.spi.connector.RecordPageSource.getNextPage(RecordPageSource.java:88)
at
io.trino.plugin.hudi.HudiPageSource.getNextPage(HudiPageSource.java:120){code}
h1. Hive3
!image-2024-07-05-18-11-33-420.png|width=509,height=572!
h1. Hive2
!image-2024-07-05-18-13-28-135.png|width=507,height=501!
h1. How to reproduce
{code:java}
CREATE TABLE dev_hudi.hudi_7955__hive3_timestamp_issue (
id INT,
name STRING,
timestamp_col TIMESTAMP,
grass_region STRING
) USING hudi
PARTITIONED BY (grass_region)
tblproperties (
primaryKey = 'id',
type = 'mor',
precombineField = 'id',
hoodie.index.type = 'BUCKET',
hoodie.index.bucket.engine = 'CONSISTENT_HASHING',
hoodie.compact.inline = 'true'
)
LOCATION 'hdfs://path/to/hudi_tables/hudi_7955__hive3_timestamp_issue';
-- 5 separate commits to trigger compaction
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (1, 'alex1',
now(), 'SG');
-- No error here as no MERGE is required between Parquet + Log
SELECT _hoodie_file_name, id, timestamp_col FROM
dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
'%parquet%';
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (2, 'alex2',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (3, 'alex3',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (4, 'alex4',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (5, 'alex5',
now(), 'SG');
-- Should contain no rows here as the table is compacted and all rows are in
-- the parquet file
SELECT _hoodie_file_name, id, timestamp_col FROM
dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
'%parquet%';
-- 2 separate commits which will be written to logs (update)
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (4, 'alex4_1',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (5, 'alex5_1',
now(), 'SG');
-- Not querying any TIMESTAMP columns, no errors will be thrown
SELECT _hoodie_file_name, id FROM dev_hudi.hudi_7955__hive3_timestamp_issue_rt
WHERE _hoodie_file_name NOT LIKE '%parquet%';
-- Error should be thrown here as we are including the timestamp column
SELECT _hoodie_file_name, id, timestamp_col FROM
dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
'%parquet%';{code}
h1. Solution
Hive shimming should also be applied when obtaining *getPrimitiveJavaObject*.
was:
The invocation of *getPrimitiveJavaObject* returns a different implementation
of timestamp in Hive3 and Hive2.
- Hive2: *java.sql.Timestamp*
- Hive3: *org.apache.hadoop.hive.common.type.Timestamp*
Hudi common is compiled against Hive2, but Trino uses Hive3, causing a
discrepancy between compile time and runtime. The error is triggered when
execution reaches this section of the code under the conditions listed below:
1. MOR table is used
2. User is querying the _rt table
3. User's table has a *TIMESTAMP* type and query requires it
4. Merge is required as record is present in both Parquet and Log file
Error below will be thrown:
{code:java}
Query 20240704_075218_05052_yfmfc failed: 'java.sql.Timestamp
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector.getPrimitiveJavaObject(java.lang.Object)'
java.lang.NoSuchMethodError: 'java.sql.Timestamp
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector.getPrimitiveJavaObject(java.lang.Object)'
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.serializePrimitive(HiveAvroSerializer.java:304)
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.serialize(HiveAvroSerializer.java:212)
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.setUpRecordFieldFromWritable(HiveAvroSerializer.java:121)
at
org.apache.hudi.hadoop.utils.HiveAvroSerializer.serialize(HiveAvroSerializer.java:108)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.convertArrayWritableToHoodieRecord(RealtimeCompactedRecordReader.java:185)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.mergeRecord(RealtimeCompactedRecordReader.java:172)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.next(RealtimeCompactedRecordReader.java:114)
at
org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.next(RealtimeCompactedRecordReader.java:49)
at
org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.next(HoodieRealtimeRecordReader.java:88)
at
org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.next(HoodieRealtimeRecordReader.java:36)
at
io.trino.plugin.hive.GenericHiveRecordCursor.advanceNextPosition(GenericHiveRecordCursor.java:215)
at
io.trino.spi.connector.RecordPageSource.getNextPage(RecordPageSource.java:88)
at
io.trino.plugin.hudi.HudiPageSource.getNextPage(HudiPageSource.java:120){code}
h1. Hive3
!image-2024-07-05-18-11-33-420.png|width=509,height=572!
h1. Hive2
!image-2024-07-05-18-13-28-135.png|width=507,height=501!
h1. How to reproduce
{code:java}
CREATE TABLE dev_hudi.hudi_7955__hive3_timestamp_issue (
id INT,
name STRING,
timestamp_col TIMESTAMP,
grass_region STRING
) USING hudi
PARTITIONED BY (grass_region)
tblproperties (
primaryKey = 'id',
type = 'mor',
precombineField = 'id',
hoodie.index.type = 'BUCKET',
hoodie.index.bucket.engine = 'CONSISTENT_HASHING',
hoodie.compact.inline = 'true'
)
LOCATION 'hdfs://path/to/hudi_tables/hudi_7955__hive3_timestamp_issue';
-- 5 separate commits to trigger compaction
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (1, 'alex1',
now(), 'SG');
-- No error here as no MERGE is required between Parquet + Log
SELECT _hoodie_file_name, id, timestamp_col FROM
dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
'%parquet%';
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (2, 'alex2',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (3, 'alex3',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (4, 'alex4',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (5, 'alex5',
now(), 'SG');
-- Should contain no rows here as the table is compacted and all rows are in
-- the parquet file
SELECT _hoodie_file_name, id, timestamp_col FROM
dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
'%parquet%';
-- 2 separate commits which will be written to logs (update)
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (4, 'alex4_1',
now(), 'SG');
INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (5, 'alex5_1',
now(), 'SG');
-- Not querying any TIMESTAMP columns, no errors will be thrown
SELECT _hoodie_file_name, id FROM dev_hudi.hudi_7955__hive3_timestamp_issue_rt
WHERE _hoodie_file_name NOT LIKE '%parquet%';
-- Error should be thrown here as we are including the timestamp column
SELECT _hoodie_file_name, id, timestamp_col FROM
dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
'%parquet%';{code}
h1. Solution
Hive shimming should also be applied when obtaining *getPrimitiveJavaObject*.
> Account for WritableTimestampObjectInspector#getPrimitiveJavaObject Hive3 and
> Hive2 discrepancies
> -------------------------------------------------------------------------------------------------
>
> Key: HUDI-7955
> URL: https://issues.apache.org/jira/browse/HUDI-7955
> Project: Apache Hudi
> Issue Type: Bug
> Reporter: voon
> Assignee: voon
> Priority: Major
> Attachments: image-2024-07-05-18-11-33-420.png,
> image-2024-07-05-18-13-28-135.png
>
>
> The invocation of *getPrimitiveJavaObject* returns a different implementation
> of timestamp in Hive3 and Hive2.
> - Hive2: *java.sql.Timestamp*
> - Hive3: *org.apache.hadoop.hive.common.type.Timestamp*
> Hudi common is compiled against Hive2, but Trino uses Hive3, causing a
> discrepancy between compile time and runtime. The error is triggered when
> execution reaches this section of the code under the conditions listed below:
> 1. MOR table is used
> 2. User is querying the _rt table
> 3. User's table has a *TIMESTAMP* type and query requires it
> 4. Merge is required as record is present in both Parquet and Log file
> Error below will be thrown:
> {code:java}
> Query 20240704_075218_05052_yfmfc failed: 'java.sql.Timestamp
> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector.getPrimitiveJavaObject(java.lang.Object)'
> java.lang.NoSuchMethodError: 'java.sql.Timestamp
> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector.getPrimitiveJavaObject(java.lang.Object)'
> at
> org.apache.hudi.hadoop.utils.HiveAvroSerializer.serializePrimitive(HiveAvroSerializer.java:304)
> at
> org.apache.hudi.hadoop.utils.HiveAvroSerializer.serialize(HiveAvroSerializer.java:212)
> at
> org.apache.hudi.hadoop.utils.HiveAvroSerializer.setUpRecordFieldFromWritable(HiveAvroSerializer.java:121)
> at
> org.apache.hudi.hadoop.utils.HiveAvroSerializer.serialize(HiveAvroSerializer.java:108)
> at
> org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.convertArrayWritableToHoodieRecord(RealtimeCompactedRecordReader.java:185)
> at
> org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.mergeRecord(RealtimeCompactedRecordReader.java:172)
> at
> org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.next(RealtimeCompactedRecordReader.java:114)
> at
> org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.next(RealtimeCompactedRecordReader.java:49)
> at
> org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.next(HoodieRealtimeRecordReader.java:88)
> at
> org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.next(HoodieRealtimeRecordReader.java:36)
> at
> io.trino.plugin.hive.GenericHiveRecordCursor.advanceNextPosition(GenericHiveRecordCursor.java:215)
> at
> io.trino.spi.connector.RecordPageSource.getNextPage(RecordPageSource.java:88)
> at
> io.trino.plugin.hudi.HudiPageSource.getNextPage(HudiPageSource.java:120){code}
> h1. Hive3
> !image-2024-07-05-18-11-33-420.png|width=509,height=572!
> h1. Hive2
> !image-2024-07-05-18-13-28-135.png|width=507,height=501!
>
> h1. How to reproduce
>
>
> {code:java}
> CREATE TABLE dev_hudi.hudi_7955__hive3_timestamp_issue (
> id INT,
> name STRING,
> timestamp_col TIMESTAMP,
> grass_region STRING
> ) USING hudi
> PARTITIONED BY (grass_region)
> tblproperties (
> primaryKey = 'id',
> type = 'mor',
> precombineField = 'id',
> hoodie.index.type = 'BUCKET',
> hoodie.index.bucket.engine = 'CONSISTENT_HASHING',
> hoodie.compact.inline = 'true'
> )
> LOCATION 'hdfs://path/to/hudi_tables/hudi_7955__hive3_timestamp_issue';
> -- 5 separate commits to trigger compaction
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (1, 'alex1',
> now(), 'SG');
> -- No error here as no MERGE is required between Parquet + Log
> SELECT _hoodie_file_name, id, timestamp_col FROM
> dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
> '%parquet%';
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (2, 'alex2',
> now(), 'SG');
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (3, 'alex3',
> now(), 'SG');
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (4, 'alex4',
> now(), 'SG');
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (5, 'alex5',
> now(), 'SG');
> -- Should contain no rows here as the table is compacted and all rows are in
> -- the parquet file
> SELECT _hoodie_file_name, id, timestamp_col FROM
> dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
> '%parquet%';
> -- 2 separate commits which will be written to logs (update)
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (4, 'alex4_1',
> now(), 'SG');
> INSERT INTO dev_hudi.hudi_7955__hive3_timestamp_issue VALUES (5, 'alex5_1',
> now(), 'SG');
> -- Not querying any TIMESTAMP columns, no errors will be thrown
> SELECT _hoodie_file_name, id FROM
> dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
> '%parquet%';
> -- Error should be thrown here as we are including the timestamp column
> SELECT _hoodie_file_name, id, timestamp_col FROM
> dev_hudi.hudi_7955__hive3_timestamp_issue_rt WHERE _hoodie_file_name NOT LIKE
> '%parquet%';{code}
>
> h1. Solution
> Hive shimming should also be applied when obtaining *getPrimitiveJavaObject*.
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)