[
https://issues.apache.org/jira/browse/HUDI-4825?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Raymond Xu closed HUDI-4825.
----------------------------
Resolution: Fixed
> Commit metadata in Json contains redundant information
> ------------------------------------------------------
>
> Key: HUDI-4825
> URL: https://issues.apache.org/jira/browse/HUDI-4825
> Project: Apache Hudi
> Issue Type: Bug
> Reporter: Ethan Guo
> Assignee: Ethan Guo
> Priority: Critical
> Labels: pull-request-available
> Fix For: 0.12.1
>
>
> The JSON commit metadata (*.commit, *.deltacommit) written to the Hudi
> timeline under .hoodie contains redundant fields that can be trimmed. As
> shown below, the same set of write stats is written to both
> "partitionToWriteStats" and "writeStats", doubling the size and increasing
> the serde overhead. Other fields like "totalRecordsDeleted",
> "writePartitionPaths", "fileIdAndRelativePaths", etc., can also be removed,
> since they are derived from "partitionToWriteStats" and are not directly used
> by the HoodieCommitMetadata class.
> Example commit metadata:
>
> {code:java}
> {
> "partitionToWriteStats" : {
> "2022/1/31" : [ {
> "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
> "path" :
> "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
> "prevCommit" : "20220410134320333",
> "numWrites" : 250175,
> "numDeletes" : 0,
> "numUpdateWrites" : 0,
> "numInserts" : 50035,
> "totalWriteBytes" : 90720802,
> "totalWriteErrors" : 0,
> "tempPath" : null,
> "partitionPath" : "2022/1/31",
> "totalLogRecords" : 0,
> "totalLogFilesCompacted" : 0,
> "totalLogSizeCompacted" : 0,
> "totalUpdatedRecordsCompacted" : 0,
> "totalLogBlocks" : 0,
> "totalCorruptLogBlock" : 0,
> "totalRollbackBlocks" : 0,
> "fileSizeInBytes" : 90720802,
> "minEventTime" : null,
> "maxEventTime" : null
> } ],
> ...
> },
> "compacted" : false,
> "extraMetadata" : {
> "schema" :
> "{\"type\":\"record\",\"name\":\"hoodie_source\",\"namespace\":\"hoodie.source\",\"fields\":[{\"name\":\"key\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"ts\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"textField\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"decimalField\",\"type\":[\"null\",\"float\"],\"default\":null},{\"name\":\"longField\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"arrayField\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"mapField\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"round\",\"type\":[\"null\",\"int\"],\"default\":null}]}",
> "deltastreamer.checkpoint.key" : "17"
> },
> "operationType" : "INSERT",
> "writeStats" : [ {
> "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
> "path" :
> "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
> "prevCommit" : "20220410134320333",
> "numWrites" : 250175,
> "numDeletes" : 0,
> "numUpdateWrites" : 0,
> "numInserts" : 50035,
> "totalWriteBytes" : 90720802,
> "totalWriteErrors" : 0,
> "tempPath" : null,
> "partitionPath" : "2022/1/31",
> "totalLogRecords" : 0,
> "totalLogFilesCompacted" : 0,
> "totalLogSizeCompacted" : 0,
> "totalUpdatedRecordsCompacted" : 0,
> "totalLogBlocks" : 0,
> "totalCorruptLogBlock" : 0,
> "totalRollbackBlocks" : 0,
> "fileSizeInBytes" : 90720802,
> "minEventTime" : null,
> "maxEventTime" : null
> },
> ...
> ],
> "totalRecordsDeleted" : 0,
> "totalLogFilesSize" : 0,
> "totalScanTime" : 0,
> "totalCreateTime" : 0,
> "totalUpsertTime" : 309120,
> "minAndMaxEventTime" : {
> "Optional.empty" : {
> "val" : null,
> "present" : false
> }
> },
> "writePartitionPaths" : [ "2022/1/31", "2022/1/30", "2022/1/28",
> "2022/1/27", "2022/2/2", "2022/1/29", "2022/1/24", "2022/2/1", "2022/1/26",
> "2022/1/25" ],
> "fileIdAndRelativePaths" : {
> "3e31414c-fb4c-4ce9-aa27-a43640d94430-0" :
> "2022/1/25/3e31414c-fb4c-4ce9-aa27-a43640d94430-0_9-9-47_20220410134618909.parquet",
> ...
> },
> "totalLogRecordsCompacted" : 0,
> "totalLogFilesCompacted" : 0,
> "totalCompactedRecordsUpdated" : 0
> } {code}
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)