[
https://issues.apache.org/jira/browse/HUDI-1894?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17343960#comment-17343960
]
Eldhose Paul commented on HUDI-1894:
------------------------------------
[~shivnarayan] I did a couple of tests, and here is what I found,
*Data in both parquet file and Avro(log files), where Avro has the latest data.
Data is populated as intended.*
{code:java}
scala> val ji2 =
spark.read.format("org.apache.hudi").load("/user/hive/warehouse/jira_expl.db/jiraissue_events")
ji2: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
_hoodie_commit_seqno: string ... 49 more fields]
scala> ji2.filter($"id" === 1242436).select($"_hoodie_commit_time",
$"_hoodie_commit_seqno", $"_hoodie_record_key", $"_hoodie_partition_path",
$"_hoodie_file_name",$"resolutiondate", $"archiveddate").show(false)
+-------------------+--------------------+------------------+----------------------+--------------------------------------+--------------+------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|resolutiondate|archiveddate|
+-------------------+--------------------+------------------+----------------------+--------------------------------------+--------------+------------+
|20210513112220 |20210513112220_4_172|1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0|null |null |
+-------------------+--------------------+------------------+----------------------+--------------------------------------+--------------+------------+
//Data in parquet:
scala> val ji =
spark.read.format("parquet").load("/user/hive/warehouse/jira_expl.db/jiraissue_events/*.parquet")
ji: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
_hoodie_commit_seqno: string ... 49 more fields]
scala> ji.filter($"id" === 1242436).withColumn("inputfile",
input_file_name()).select($"_hoodie_commit_time", $"_hoodie_commit_seqno",
$"_hoodie_record_key", $"_hoodie_partition_path",
$"_hoodie_file_name",$"resolutiondate", $"archiveddate",
$"inputfile").show(false)
+-------------------+--------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|resolutiondate|archiveddate|inputfile
|
+-------------------+--------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|20210513111030 |20210513111030_5_778|1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-3403-164433_20210513111030.parquet|null
|null
|hdfs://nameservice1/user/hive/warehouse/jira_expl.db/jiraissue_events/2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-4339-208912_20210513112035.parquet|
|20210513111030 |20210513111030_5_778|1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-3403-164433_20210513111030.parquet|null
|null
|hdfs://nameservice1/user/hive/warehouse/jira_expl.db/jiraissue_events/2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-3403-164433_20210513111030.parquet|
+-------------------+--------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
{code}
*Compaction Happens for log file:
.2d6621ac-7da9-46d7-be43-c4f45f785e11-0_20210513112035.log.1_4-4499-216741*
{code:java}
-rwxrwx--x+ 3 hive hive 9996 2021-05-13 11:30
/user/hive/warehouse/jira_expl.db/jiraissue_events/.hoodie/20210513113040.commit
-rwxrwx--x+ 3 hive hive 0 2021-05-13 11:30
/user/hive/warehouse/jira_expl.db/jiraissue_events/.hoodie/20210513113040.compaction.inflight
-rwxrwx--x+ 3 hive hive 3970 2021-05-13 11:30
/user/hive/warehouse/jira_expl.db/jiraissue_events/.hoodie/20210513113040.compaction.requested
hdfs dfs -cat
/user/hive/warehouse/jira_expl.db/jiraissue_events/.hoodie/20210513113040.compaction.requested
Objavro.schema▒{"type":"record","name":"HoodieCompactionPlan","namespace":"org.apache.hudi.avro.model","fields":[{"name":"operations","type":["null",{"type":"array","items":{"type":"record","name":"HoodieCompactionOperation","fields":[{"name":"baseInstantTime","type":["null",{"type":"string","avro.java.string":"String"}]},{"name":"deltaFilePaths","type":["null",{"type":"array","items":{"type":"string","avro.java.string":"String"}}],"default":null},{"name":"dataFilePath","type":["null",{"type":"string","avro.java.string":"String"}],"default":null},{"name":"fileId","type":["null",{"type":"string","avro.java.string":"String"}]},{"name":"partitionPath","type":["null",{"type":"string","avro.java.string":"String"}],"default":null},{"name":"metrics","type":["null",{"type":"map","values":"double","avro.java.string":"String"}],"default":null},{"name":"bootstrapFilePath","type":["null",{"type":"string","avro.java.string":"String"}],"default":null}]}}],"default":null},{"name":"extraMetadata","type":["null",{"type":"map","values":{"type":"string","avro.java.string":"String"},"avro.java.string":"String"}],"default":null},{"name":"version","type":["int","null"],"default":1}]}▒I▒▒O▒H▒▒▒3Շ▒*20210513112035▒.bae6f29e-9458-4e34-a3b2-570db49b9cf1-0_20210513112035.log.1_1-4499-216740▒bae6f29e-9458-4e34-a3b2-570db49b9cf1-0_1-4339-208908_20210513112035.parquetLbae6f29e-9458-4e34-a3b2-570db49b9cf1-0
TOTAL_LOG_FILES▒?TOTAL_IO_MBO@
TOTAL_IO_READ_MB?@(TOTAL_LOG_FILES_SIZE▒▒▒@"TOTAL_IO_WRITE_MB?@20210513112035▒.031a359b-f8f0-417a-888b-45f2a0b3a26f-0_20210513112035.log.1_5-4499-216738▒031a359b-f8f0-417a-888b-45f2a0b3a26f-0_0-4339-208907_20210513112035.parquetL031a359b-f8f0-417a-888b-45f2a0b3a26f-0
TOTAL_LOG_FILES▒?TOTAL_IO_MB@
TOTAL_IO_READ_MB▒?(TOTAL_LOG_FILES_SIZEd▒@"TOTAL_IO_WRITE_MB▒?20210513112035▒.e54bd2c3-6b9c-4e4b-95d7-0a7b81b83dd0-0_20210513112035.log.1_0-4499-216737▒e54bd2c3-6b9c-4e4b-95d7-0a7b81b83dd0-0_2-4339-208909_20210513112035.parquetLe54bd2c3-6b9c-4e4b-95d7-0a7b81b83dd0-0
TOTAL_LOG_FILES▒?TOTAL_IO_MB<@
TOTAL_IO_READ_MB,@(TOTAL_LOG_FILES_SIZE▒▒▒@"TOTAL_IO_WRITE_MB,@20210513112035▒.7cf9fc72-7612-4ec6-8df3-e41086cea8f2-0_20210513112035.log.1_5-4702-226479▒7cf9fc72-7612-4ec6-8df3-e41086cea8f2-0_6-4339-208913_20210513112035.parquetL7cf9fc72-7612-4ec6-8df3-e41086cea8f2-0
TOTAL_LOG_FILES▒?TOTAL_IO_MBH@
TOTAL_IO_READ_MB8@(TOTAL_LOG_FILES_SIZE▒▒@"TOTAL_IO_WRITE_MB8@20210513112035▒.2d6621ac-7da9-46d7-be43-c4f45f785e11-0_20210513112035.log.1_4-4499-216741▒2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-4339-208912_20210513112035.parquetL2d6621ac-7da9-46d7-be43-c4f45f785e11-0
TOTAL_LOG_FILES▒?TOTAL_IO_MB6@
TOTAL_IO_READ_MB&@(TOTAL_LOG_FILES_SIZE.▒@"TOTAL_IO_WRITE_MB&@20210513112035▒.39681b5f-33e4-4b51-bc9e-9225d78bbb16-0_20210513112035.log.1_2-4499-216736▒39681b5f-33e4-4b51-bc9e-9225d78bbb16-0_3-4339-208910_20210513112035.parquetL39681b5f-33e4-4b51-bc9e-9225d78bbb16-0
TOTAL_LOG_FILES▒?TOTAL_IO_MB6@
TOTAL_IO_READ_MB&@(TOTAL_LOG_FILES_SIZEк@"TOTAL_IO_WRITE_MB&@20210513110051▒.cae7231d-8549-423f-8e0f-fd7246fb0780-0_20210513110051.log.1_2-4702-226475▒cae7231d-8549-423f-8e0f-fd7246fb0780-0_8-2437-118170_20210513110051.parquetLcae7231d-8549-423f-8e0f-fd7246fb0780-0
TOTAL_LOG_FILES▒?TOTAL_IO_MB4@
TOTAL_IO_READ_MB$@(TOTAL_LOG_FILES_SIZE▒▒@"TOTAL_IO_WRITE_MB$@20210513105028▒.7a457126-49ed-4a01-a256-c20bfceafa25-0_20210513105028.log.1_0-5269-253664▒7a457126-49ed-4a01-a256-c20bfceafa25-0_8-1367-66146_20210513105028.parquetL7a457126-49ed-4a01-a256-c20bfceafa25-0
TOTAL_LOG_FILES▒?TOTAL_IO_MBG@
TOTAL_IO_READ_MB7@(TOTAL_LOG_FILES_SIZE>▒@"TOTAL_IO_WRITE_MB7@▒I▒▒O▒H▒▒▒3Շ
{code}
*After compaction 1st Try@11:31 ish - looks perfect*
{code:java}
scala> val ji2 =
spark.read.format("org.apache.hudi").load("/user/hive/warehouse/jira_expl.db/jiraissue_events")
ji2: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
_hoodie_commit_seqno: string ... 49 more fields]
scala> ji2.filter($"id" === 1242436).select($"_hoodie_commit_time",
$"_hoodie_commit_seqno", $"_hoodie_record_key", $"_hoodie_partition_path",
$"_hoodie_file_name",$"resolutiondate", $"archiveddate").show(false)
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|resolutiondate|archiveddate|
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+
|20210513113040 |20210513113040_4_1602|1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0_4-5279-253683_20210513113040.parquet|null
|null |
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+
{code}
*After Compaction 2nd Try @11:33 ish - :( messed up*
{code:java}
scala> val ji2 =
spark.read.format("org.apache.hudi").load("/user/hive/warehouse/jira_expl.db/jiraissue_events")
ji2: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
_hoodie_commit_seqno: string ... 49 more fields]
scala> ji2.filter($"id" === 1242436).select($"_hoodie_commit_time",
$"_hoodie_commit_seqno", $"_hoodie_record_key", $"_hoodie_partition_path",
$"_hoodie_file_name",$"resolutiondate", $"archiveddate").show(false)
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+-----------------------+-------------------+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|resolutiondate |archiveddate
|
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+-----------------------+-------------------+
|20210513113040 |20210513113040_4_1602|1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0_4-5279-253683_20210513113040.parquet|2020-02-13
09:04:07.482|1969-12-31 19:00:00|
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+-----------------------+-------------------+
scala> val ji =
spark.read.format("parquet").load("/user/hive/warehouse/jira_expl.db/jiraissue_events/*.parquet")
ji: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
_hoodie_commit_seqno: string ... 49 more fields]
scala> ji.filter($"id" === 1242436).withColumn("inputfile",
input_file_name()).select($"_hoodie_commit_time", $"_hoodie_commit_seqno",
$"_hoodie_record_key", $"_hoodie_partition_path",
$"_hoodie_file_name",$"resolutiondate", $"archiveddate",
$"inputfile").show(false)
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|resolutiondate|archiveddate|inputfile
|
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|20210513111030 |20210513111030_5_778 |1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-3403-164433_20210513111030.parquet|null
|null
|hdfs://nameservice1/user/hive/warehouse/jira_expl.db/jiraissue_events/2d6621ac-7da9-46d7-be43-c4f45f785e11-0_5-4339-208912_20210513112035.parquet|
|20210513113040 |20210513113040_4_1602|1242436.0 |
|2d6621ac-7da9-46d7-be43-c4f45f785e11-0_4-5279-253683_20210513113040.parquet|null
|null
|hdfs://nameservice1/user/hive/warehouse/jira_expl.db/jiraissue_events/2d6621ac-7da9-46d7-be43-c4f45f785e11-0_4-5279-253683_20210513113040.parquet|
+-------------------+---------------------+------------------+----------------------+---------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
{code}
> NULL values in timestamp column defaulted
> ------------------------------------------
>
> Key: HUDI-1894
> URL: https://issues.apache.org/jira/browse/HUDI-1894
> Project: Apache Hudi
> Issue Type: Bug
> Components: Spark Integration
> Reporter: Eldhose Paul
> Assignee: sivabalan narayanan
> Priority: Major
> Labels: sev:critical
>
> Reading timestamp column from hudi and underlying parquet files in spark
> gives different results.
> *hudi properties:*
> {code:java}
> hdfs dfs -cat
> /user/hive/warehouse/jira_expl.db/jiraissue_events/.hoodie/hoodie.properties
> #Properties saved on Tue May 11 17:17:22 EDT 2021
> #Tue May 11 17:17:22 EDT 2021
> hoodie.compaction.payload.class=org.apache.hudi.common.model.OverwriteWithLatestAvroPayload
> hoodie.table.name=jiraissue
> hoodie.archivelog.folder=archived
> hoodie.table.type=MERGE_ON_READ
> hoodie.table.version=1
> hoodie.timeline.layout.version=1
> {code}
>
> *Reading directly from parquet using Spark:*
> {code:java}
> scala> val ji =
> spark.read.format("parquet").load("/user/hive/warehouse/jira_expl.db/jiraissue_events/*.parquet")
> ji: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
> _hoodie_commit_seqno: string ... 49 more fields]scala> ji.filter($"id" ===
> 1237858).withColumn("inputfile",
> input_file_name()).select($"_hoodie_commit_time", $"_hoodie_commit_seqno",
> $"_hoodie_record_key", $"_hoodie_partition_path",
> $"_hoodie_file_name",$"resolutiondate", $"archiveddate",
> $"inputfile").show(false)
> +-------------------+----------------------+------------------+----------------------+-----------------------------------------------------------------------+--------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------+
> |_hoodie_commit_time|_hoodie_commit_seqno
> |_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
> |resolutiondate|archiveddate|inputfile
>
> |
> +-------------------+----------------------+------------------+----------------------+-----------------------------------------------------------------------+--------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------+
> |20210511171722 |20210511171722_7_13718|1237858.0 |
>
> |832cf07f-637b-4a4c-ab08-6929554f003a-0_7-98-5106_20210511171722.parquet|null
> |null
> |hdfs://nameservice1/user/hive/warehouse/jira_expl.db/jiraissue_events/832cf07f-637b-4a4c-ab08-6929554f003a-0_7-98-5106_20210511171722.parquet
> |
> |20210511171722 |20210511171722_7_13718|1237858.0 |
>
> |832cf07f-637b-4a4c-ab08-6929554f003a-0_7-98-5106_20210511171722.parquet|null
> |null
> |hdfs://nameservice1/user/hive/warehouse/jira_expl.db/jiraissue_events/832cf07f-637b-4a4c-ab08-6929554f003a-0_8-1610-78711_20210511173615.parquet|
> +-------------------+----------------------+------------------+----------------------+-----------------------------------------------------------------------+--------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------+
> {code}
> *Reading `hudi` using Spark:*
> {code:java}
> scala> val jih =
> spark.read.format("org.apache.hudi").load("/user/hive/warehouse/jira_expl.db/jiraissue_events")
> jih: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
> _hoodie_commit_seqno: string ... 49 more fields]scala> jih.filter($"id" ===
> 1237858).select($"_hoodie_commit_time", $"_hoodie_commit_seqno",
> $"_hoodie_record_key", $"_hoodie_partition_path",
> $"_hoodie_file_name",$"resolutiondate", $"archiveddate").show(false)
> +-------------------+----------------------+------------------+----------------------+-----------------------------------------------------------------------+-------------------+-------------------+
> |_hoodie_commit_time|_hoodie_commit_seqno
> |_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
> |resolutiondate |archiveddate |
> +-------------------+----------------------+------------------+----------------------+-----------------------------------------------------------------------+-------------------+-------------------+
> |20210511171722 |20210511171722_7_13718|1237858.0 |
>
> |832cf07f-637b-4a4c-ab08-6929554f003a-0_7-98-5106_20210511171722.parquet|2018-07-30
> 14:58:52|1969-12-31 19:00:00|
> +-------------------+----------------------+------------------+----------------------+-----------------------------------------------------------------------+-------------------+-------------------+
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)