[ 
https://issues.apache.org/jira/browse/HUDI-4825?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Raymond Xu closed HUDI-4825.
----------------------------
    Resolution: Fixed

> Commit metadata in Json contains redundant information
> ------------------------------------------------------
>
>                 Key: HUDI-4825
>                 URL: https://issues.apache.org/jira/browse/HUDI-4825
>             Project: Apache Hudi
>          Issue Type: Bug
>            Reporter: Ethan Guo
>            Assignee: Ethan Guo
>            Priority: Critical
>              Labels: pull-request-available
>             Fix For: 0.12.1
>
>
> The commit metadata in JSON (*.commit, *.deltacommit) written to the Hudi 
> timeline under .hoodie contains redundant fields that can be trimmed.  As 
> shown below, the same set of write stats is written to both 
> "partitionToWriteStats" and "writeStats", doubling the size and increasing 
> the serde overhead.  Other fields like "totalRecordsDeleted", 
> "writePartitionPaths", "fileIdAndRelativePaths", etc., can be removed as 
> well, since they are derived from "partitionToWriteStats" and are not 
> directly used by the HoodieCommitMetadata class.
> Example commit metadata:
>  
> {code:java}
> {
>   "partitionToWriteStats" : {
>     "2022/1/31" : [ {
>       "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
>       "path" : 
> "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
>       "prevCommit" : "20220410134320333",
>       "numWrites" : 250175,
>       "numDeletes" : 0,
>       "numUpdateWrites" : 0,
>       "numInserts" : 50035,
>       "totalWriteBytes" : 90720802,
>       "totalWriteErrors" : 0,
>       "tempPath" : null,
>       "partitionPath" : "2022/1/31",
>       "totalLogRecords" : 0,
>       "totalLogFilesCompacted" : 0,
>       "totalLogSizeCompacted" : 0,
>       "totalUpdatedRecordsCompacted" : 0,
>       "totalLogBlocks" : 0,
>       "totalCorruptLogBlock" : 0,
>       "totalRollbackBlocks" : 0,
>       "fileSizeInBytes" : 90720802,
>       "minEventTime" : null,
>       "maxEventTime" : null
>     } ],
>     ...
>   },
>   "compacted" : false,
>   "extraMetadata" : {
>     "schema" : 
> "{\"type\":\"record\",\"name\":\"hoodie_source\",\"namespace\":\"hoodie.source\",\"fields\":[{\"name\":\"key\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"ts\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"textField\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"decimalField\",\"type\":[\"null\",\"float\"],\"default\":null},{\"name\":\"longField\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"arrayField\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"mapField\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"round\",\"type\":[\"null\",\"int\"],\"default\":null}]}",
>     "deltastreamer.checkpoint.key" : "17"
>   },
>   "operationType" : "INSERT",
>   "writeStats" : [ {
>     "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
>     "path" : 
> "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
>     "prevCommit" : "20220410134320333",
>     "numWrites" : 250175,
>     "numDeletes" : 0,
>     "numUpdateWrites" : 0,
>     "numInserts" : 50035,
>     "totalWriteBytes" : 90720802,
>     "totalWriteErrors" : 0,
>     "tempPath" : null,
>     "partitionPath" : "2022/1/31",
>     "totalLogRecords" : 0,
>     "totalLogFilesCompacted" : 0,
>     "totalLogSizeCompacted" : 0,
>     "totalUpdatedRecordsCompacted" : 0,
>     "totalLogBlocks" : 0,
>     "totalCorruptLogBlock" : 0,
>     "totalRollbackBlocks" : 0,
>     "fileSizeInBytes" : 90720802,
>     "minEventTime" : null,
>     "maxEventTime" : null
>   }, 
>   ... 
>   ],
>   "totalRecordsDeleted" : 0,
>   "totalLogFilesSize" : 0,
>   "totalScanTime" : 0,
>   "totalCreateTime" : 0,
>   "totalUpsertTime" : 309120,
>   "minAndMaxEventTime" : {
>     "Optional.empty" : {
>       "val" : null,
>       "present" : false
>     }
>   },
>   "writePartitionPaths" : [ "2022/1/31", "2022/1/30", "2022/1/28", 
> "2022/1/27", "2022/2/2", "2022/1/29", "2022/1/24", "2022/2/1", "2022/1/26", 
> "2022/1/25" ],
>   "fileIdAndRelativePaths" : {
>     "3e31414c-fb4c-4ce9-aa27-a43640d94430-0" : 
> "2022/1/25/3e31414c-fb4c-4ce9-aa27-a43640d94430-0_9-9-47_20220410134618909.parquet",
>     ...
>   },
>   "totalLogRecordsCompacted" : 0,
>   "totalLogFilesCompacted" : 0,
>   "totalCompactedRecordsUpdated" : 0
> } {code}
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to