mgstahl-sophos commented on issue #13995:
URL: https://github.com/apache/hudi/issues/13995#issuecomment-3381945745

   With this patch applied, `AvgRecordSize` is now very similar between the 
versions. I believe the small difference is due to different compression.
   
   ```
   printf "%-10s %s\n" "Hudi 0.14" "Hudi 1.0"
   paste <(cat 
/tmp/hudi-ingest-testperfhudi014registryo-driver-logs-at-start.txt | grep 
AvgRecordSize | awk '{print $NF}') \
         <(cat 
/tmp/hudi-ingest-testperfhudi1tv6registryo-driver-logs-at-start.txt | grep 
AvgRecordSize | awk '{print $NF}') | head -23
   Hudi 0.14  Hudi 1.0
   1024 1024
   166  155
   132  126
   130  125
   128  123
   125  121
   124  120
   123  120
   122  119
   122  119
   121  118
   122  119
   120  117
   122  119
   122  120
   122  119
   122  119
   122  119
   122  119
   121  118
   121  118
   120  117
   121  117
   ```
   
   `totalFileGroupCount` and `totalFileCount` are also very similar:
   ```
   jq -n -L ./scripts/timeline/ --slurpfile hudi014 
/tmp/timeline-stats-testperfhudi014registryo.json \
      --slurpfile hudi1tv6 /tmp/timeline-stats-testperfhudi1tv6registryo.json '
      import "timeline-functions" as timeline;
      timeline::compare_stat_files($hudi014[0]; $hudi1tv6[0]; "hudi014"; 
"hudi1tv6")' | jq '.|del(.cumulativeByCommit)'
   
   {
     "summary": {
       "hudi014": {
         "commitCount": 31,
         "elapsed": "PT50M45S",
         "averageCommitInterval": "PT1M41S",
         "totalFileGroupCount": 1981,
         "totalFileCount": 19488,
         "totalFileBytes": 506819459949,
         "totalInsertCount": 1166269879,
         "totalRecordCount": 4169856806,
         "totalUpsertTime": 218687753,
         "averageFileSizeBytes": 26006746,
         "averageInsertCount": 59846,
         "averageRecordCount": 213971,
         "averageRecordSize": 122,
         "averageUpsertTime": 11222
       },
       "hudi1tv6": {
         "commitCount": 31,
         "elapsed": "PT1H0M16S",
         "averageCommitInterval": "PT2M0S",
         "totalFileGroupCount": 1973,
         "totalFileCount": 19483,
         "totalFileBytes": 497038826251,
         "totalInsertCount": 1166269879,
         "totalRecordCount": 4200996840,
         "totalUpsertTime": 234184768,
         "averageFileSizeBytes": 25511412,
         "averageInsertCount": 59861,
         "averageRecordCount": 215624,
         "averageRecordSize": 119,
         "averageUpsertTime": 12020
       }
     },
     "differences": {
       "elapsed": "PT9M31S",
       "averageCommitInterval": "PT19S",
       "totalFileGroupCount": -8,
       "totalFileCount": -5,
       "totalFileBytes": -9780633698,
       "totalInsertCount": 0,
       "totalRecordCount": 31140034,
       "totalUpsertTime": 15497015,
       "averageFileSizeBytes": -495334,
       "averageInsertCount": 15,
       "averageRecordCount": 1653,
       "averageRecordSize": -3,
       "averageUpsertTime": 798
     },
     "ratios": {
       "elapsed": 1.19,
       "averageCommitInterval": 1.19,
       "totalFileGroupCount": 1,
       "totalFileCount": 1,
       "totalFileBytes": 0.98,
       "totalInsertCount": 1,
       "totalRecordCount": 1.01,
       "totalUpsertTime": 1.07,
       "averageFileSizeBytes": 0.98,
       "averageInsertCount": 1,
       "averageRecordCount": 1.01,
       "averageRecordSize": 0.98,
       "averageUpsertTime": 1.07
     }
   }
   ```
   
   I'll close this ticket.
   
   Thank you for the quick resolution!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to