[
https://issues.apache.org/jira/browse/HUDI-6679?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Ethan Guo updated HUDI-6679:
----------------------------
Description:
When both the files and record_index partitions are enabled and the first
commit in the data table (DT) fails while the second partition of the metadata
table (MDT) is being initialized, the timelines look like below. When the
pipeline is then restarted, the rollback triggers irrelevant bootstrap rollback
logic, which corrupts the MDT and does not properly re-initialize the
record_index partition.
DT
{code:java}
<commit_time>.commit.requested
<commit_time>.commit.inflight {code}
MDT
{code:java}
00000000000000010.deltacommit.requested
00000000000000010.deltacommit.inflight
00000000000000010.deltacommit
00000000000000011.deltacommit.requested
00000000000000011.deltacommit.inflight{code}
Afterwards
{code:java}
╔═════╤══════════════════════╤═══════════════════╤═══════════╤════════════════╤════════════════╤════════════════╤═════════════╤═══════════╤════════════════╤════════════════╤════════════════╗
║ No. │ Instant              │ Action            │ State     │ Requested      │ Inflight       │ Completed      │ MT          │ MT        │ MT             │ MT             │ MT             ║
║     │                      │                   │           │ Time           │ Time           │ Time           │ Action      │ State     │ Requested      │ Inflight       │ Completed      ║
║     │                      │                   │           │                │                │                │             │           │ Time           │ Time           │ Time           ║
╠═════╪══════════════════════╪═══════════════════╪═══════════╪════════════════╪════════════════╪════════════════╪═════════════╪═══════════╪════════════════╪════════════════╪════════════════╣
║ 0   │ 20230807063905364    │ rollback          │ COMPLETED │ 08-06 23:39:06 │ 08-06 23:39:07 │ 08-06 23:40:38 │ -           │ -         │ -              │ -              │ -              ║
║     │                      │ Rolls back        │           │                │                │                │             │           │                │                │                ║
║     │                      │ 20230807063647472 │           │                │                │                │             │           │                │                │                ║
╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
║ 1   │ 20230807063905364010 │ -                 │ -         │ -              │ -              │ -              │ deltacommit │ COMPLETED │ 08-06 23:40:49 │ 08-06 23:40:49 │ 08-06 23:40:51 ║
╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
║ 2   │ 20230807064006967    │ deltacommit       │ REQUESTED │ 08-06 23:40:39 │ -              │ -              │ -           │ -         │ -              │ -              │ -              ║
║     │                      │ Rolled back by    │           │                │                │                │             │           │                │                │                ║
║     │                      │ 20230807064227290 │           │                │                │                │             │           │                │                │                ║
╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
║ 3   │ 20230807064041714    │ -                 │ -         │ -              │ -              │ -              │ restore     │ COMPLETED │ 08-06 23:40:43 │ 08-06 23:40:43 │ 08-06 23:40:48 ║
╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
║ 4   │ 20230807064227290    │ rollback          │ INFLIGHT  │ 08-06 23:42:28 │ 08-06 23:42:29 │ -              │ -           │ -         │ -              │ -              │ -              ║
║     │                      │ Rolls back        │           │                │                │                │             │           │                │                │                ║
║     │                      │ 20230807064006967 │           │                │                │                │             │           │                │                │                ║
╚═════╧══════════════════════╧═══════════════════╧═══════════╧════════════════╧════════════════╧════════════════╧═════════════╧═══════════╧════════════════╧════════════════╧════════════════╝
{code}
{code:java}
org.apache.hudi.exception.HoodieRollbackException: Failed to rollback s3a://<base_path>/hoodie_table commits 20230807064006967
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:918)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:865)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:739)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:723)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:718)
    at org.apache.hudi.client.BaseHoodieWriteClient.lambda$startCommitWithTime$97cdbdca$1(BaseHoodieWriteClient.java:928)
    at org.apache.hudi.common.util.CleanerUtils.rollbackFailedWrites(CleanerUtils.java:222)
    at org.apache.hudi.client.BaseHoodieWriteClient.startCommitWithTime(BaseHoodieWriteClient.java:927)
    at org.apache.hudi.client.BaseHoodieWriteClient.startCommitWithTime(BaseHoodieWriteClient.java:920)
    at org.apache.hudi.utilities.streamer.StreamSync.startCommit(StreamSync.java:890)
    at org.apache.hudi.utilities.streamer.StreamSync.writeToSink(StreamSync.java:767)
    at org.apache.hudi.utilities.streamer.StreamSync.syncOnce(StreamSync.java:445)
    at org.apache.hudi.utilities.streamer.HoodieStreamer$StreamSyncService.lambda$startService$1(HoodieStreamer.java:767)
    at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.IllegalArgumentException: FileGroup count for MDT partition files should be >0
    at org.apache.hudi.common.util.ValidationUtils.checkArgument(ValidationUtils.java:42)
    at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.prepRecords(HoodieBackedTableMetadataWriter.java:1098)
    at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commitInternal(SparkHoodieBackedTableMetadataWriter.java:135)
    at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commit(SparkHoodieBackedTableMetadataWriter.java:122)
    at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.processAndCommit(HoodieBackedTableMetadataWriter.java:837)
    at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.update(HoodieBackedTableMetadataWriter.java:1013)
    at org.apache.hudi.table.action.BaseActionExecutor.lambda$writeTableMetadata$2(BaseActionExecutor.java:77)
    at org.apache.hudi.common.util.Option.ifPresent(Option.java:97)
    at org.apache.hudi.table.action.BaseActionExecutor.writeTableMetadata(BaseActionExecutor.java:77)
    at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.finishRollback(BaseRollbackActionExecutor.java:264)
    at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.runRollback(BaseRollbackActionExecutor.java:120)
    at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.execute(BaseRollbackActionExecutor.java:141)
    at org.apache.hudi.table.HoodieSparkMergeOnReadTable.rollback(HoodieSparkMergeOnReadTable.java:218)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:901)
    ... 16 more {code}
was:
When both files and record_index partitions are enabled, for the first commit
in the data table, the transaction fails when initializing the second partition
in the MDT. In this case, the timelines look like below. In this case, when
restarting the pipeline, the rollback triggers irrelevant bootstrap rollback
logic causing MDT to be corrupted.
DT
{code:java}
<commit_time>.commit.requested
<commit_time>.commit.inflight {code}
MDT
{code:java}
00000000000000010.deltacommit.requested
00000000000000010.deltacommit.inflight
00000000000000010.deltacommit
00000000000000011.deltacommit.requested
00000000000000011.deltacommit.inflight{code}
{code:java}
org.apache.hudi.exception.HoodieRollbackException: Failed to rollback s3a://<base_path>/hoodie_table commits 20230807064006967
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:918)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:865)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:739)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:723)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:718)
    at org.apache.hudi.client.BaseHoodieWriteClient.lambda$startCommitWithTime$97cdbdca$1(BaseHoodieWriteClient.java:928)
    at org.apache.hudi.common.util.CleanerUtils.rollbackFailedWrites(CleanerUtils.java:222)
    at org.apache.hudi.client.BaseHoodieWriteClient.startCommitWithTime(BaseHoodieWriteClient.java:927)
    at org.apache.hudi.client.BaseHoodieWriteClient.startCommitWithTime(BaseHoodieWriteClient.java:920)
    at org.apache.hudi.utilities.streamer.StreamSync.startCommit(StreamSync.java:890)
    at org.apache.hudi.utilities.streamer.StreamSync.writeToSink(StreamSync.java:767)
    at org.apache.hudi.utilities.streamer.StreamSync.syncOnce(StreamSync.java:445)
    at org.apache.hudi.utilities.streamer.HoodieStreamer$StreamSyncService.lambda$startService$1(HoodieStreamer.java:767)
    at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.IllegalArgumentException: FileGroup count for MDT partition files should be >0
    at org.apache.hudi.common.util.ValidationUtils.checkArgument(ValidationUtils.java:42)
    at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.prepRecords(HoodieBackedTableMetadataWriter.java:1098)
    at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commitInternal(SparkHoodieBackedTableMetadataWriter.java:135)
    at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commit(SparkHoodieBackedTableMetadataWriter.java:122)
    at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.processAndCommit(HoodieBackedTableMetadataWriter.java:837)
    at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.update(HoodieBackedTableMetadataWriter.java:1013)
    at org.apache.hudi.table.action.BaseActionExecutor.lambda$writeTableMetadata$2(BaseActionExecutor.java:77)
    at org.apache.hudi.common.util.Option.ifPresent(Option.java:97)
    at org.apache.hudi.table.action.BaseActionExecutor.writeTableMetadata(BaseActionExecutor.java:77)
    at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.finishRollback(BaseRollbackActionExecutor.java:264)
    at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.runRollback(BaseRollbackActionExecutor.java:120)
    at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.execute(BaseRollbackActionExecutor.java:141)
    at org.apache.hudi.table.HoodieSparkMergeOnReadTable.rollback(HoodieSparkMergeOnReadTable.java:218)
    at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:901)
    ... 16 more {code}
> Fix initialization of metadata table partitions upon failure
> ------------------------------------------------------------
>
> Key: HUDI-6679
> URL: https://issues.apache.org/jira/browse/HUDI-6679
> Project: Apache Hudi
> Issue Type: Bug
> Reporter: Ethan Guo
> Assignee: Ethan Guo
> Priority: Blocker
> Fix For: 0.14.0
>
>
> When both the files and record_index partitions are enabled and the first
> commit in the data table (DT) fails while the second partition of the
> metadata table (MDT) is being initialized, the timelines look like below.
> When the pipeline is then restarted, the rollback triggers irrelevant
> bootstrap rollback logic, which corrupts the MDT and does not properly
> re-initialize the record_index partition.
> DT
> {code:java}
> <commit_time>.commit.requested
> <commit_time>.commit.inflight {code}
> MDT
> {code:java}
> 00000000000000010.deltacommit.requested
> 00000000000000010.deltacommit.inflight
> 00000000000000010.deltacommit
> 00000000000000011.deltacommit.requested
> 00000000000000011.deltacommit.inflight{code}
> Afterwards
> {code:java}
> ╔═════╤══════════════════════╤═══════════════════╤═══════════╤════════════════╤════════════════╤════════════════╤═════════════╤═══════════╤════════════════╤════════════════╤════════════════╗
> ║ No. │ Instant              │ Action            │ State     │ Requested      │ Inflight       │ Completed      │ MT          │ MT        │ MT             │ MT             │ MT             ║
> ║     │                      │                   │           │ Time           │ Time           │ Time           │ Action      │ State     │ Requested      │ Inflight       │ Completed      ║
> ║     │                      │                   │           │                │                │                │             │           │ Time           │ Time           │ Time           ║
> ╠═════╪══════════════════════╪═══════════════════╪═══════════╪════════════════╪════════════════╪════════════════╪═════════════╪═══════════╪════════════════╪════════════════╪════════════════╣
> ║ 0   │ 20230807063905364    │ rollback          │ COMPLETED │ 08-06 23:39:06 │ 08-06 23:39:07 │ 08-06 23:40:38 │ -           │ -         │ -              │ -              │ -              ║
> ║     │                      │ Rolls back        │           │                │                │                │             │           │                │                │                ║
> ║     │                      │ 20230807063647472 │           │                │                │                │             │           │                │                │                ║
> ╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
> ║ 1   │ 20230807063905364010 │ -                 │ -         │ -              │ -              │ -              │ deltacommit │ COMPLETED │ 08-06 23:40:49 │ 08-06 23:40:49 │ 08-06 23:40:51 ║
> ╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
> ║ 2   │ 20230807064006967    │ deltacommit       │ REQUESTED │ 08-06 23:40:39 │ -              │ -              │ -           │ -         │ -              │ -              │ -              ║
> ║     │                      │ Rolled back by    │           │                │                │                │             │           │                │                │                ║
> ║     │                      │ 20230807064227290 │           │                │                │                │             │           │                │                │                ║
> ╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
> ║ 3   │ 20230807064041714    │ -                 │ -         │ -              │ -              │ -              │ restore     │ COMPLETED │ 08-06 23:40:43 │ 08-06 23:40:43 │ 08-06 23:40:48 ║
> ╟─────┼──────────────────────┼───────────────────┼───────────┼────────────────┼────────────────┼────────────────┼─────────────┼───────────┼────────────────┼────────────────┼────────────────╢
> ║ 4   │ 20230807064227290    │ rollback          │ INFLIGHT  │ 08-06 23:42:28 │ 08-06 23:42:29 │ -              │ -           │ -         │ -              │ -              │ -              ║
> ║     │                      │ Rolls back        │           │                │                │                │             │           │                │                │                ║
> ║     │                      │ 20230807064006967 │           │                │                │                │             │           │                │                │                ║
> ╚═════╧══════════════════════╧═══════════════════╧═══════════╧════════════════╧════════════════╧════════════════╧═════════════╧═══════════╧════════════════╧════════════════╧════════════════╝
> {code}
>
> {code:java}
> org.apache.hudi.exception.HoodieRollbackException: Failed to rollback s3a://<base_path>/hoodie_table commits 20230807064006967
>     at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:918)
>     at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:865)
>     at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:739)
>     at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:723)
>     at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:718)
>     at org.apache.hudi.client.BaseHoodieWriteClient.lambda$startCommitWithTime$97cdbdca$1(BaseHoodieWriteClient.java:928)
>     at org.apache.hudi.common.util.CleanerUtils.rollbackFailedWrites(CleanerUtils.java:222)
>     at org.apache.hudi.client.BaseHoodieWriteClient.startCommitWithTime(BaseHoodieWriteClient.java:927)
>     at org.apache.hudi.client.BaseHoodieWriteClient.startCommitWithTime(BaseHoodieWriteClient.java:920)
>     at org.apache.hudi.utilities.streamer.StreamSync.startCommit(StreamSync.java:890)
>     at org.apache.hudi.utilities.streamer.StreamSync.writeToSink(StreamSync.java:767)
>     at org.apache.hudi.utilities.streamer.StreamSync.syncOnce(StreamSync.java:445)
>     at org.apache.hudi.utilities.streamer.HoodieStreamer$StreamSyncService.lambda$startService$1(HoodieStreamer.java:767)
>     at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>     at java.lang.Thread.run(Thread.java:750)
> Caused by: java.lang.IllegalArgumentException: FileGroup count for MDT partition files should be >0
>     at org.apache.hudi.common.util.ValidationUtils.checkArgument(ValidationUtils.java:42)
>     at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.prepRecords(HoodieBackedTableMetadataWriter.java:1098)
>     at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commitInternal(SparkHoodieBackedTableMetadataWriter.java:135)
>     at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commit(SparkHoodieBackedTableMetadataWriter.java:122)
>     at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.processAndCommit(HoodieBackedTableMetadataWriter.java:837)
>     at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.update(HoodieBackedTableMetadataWriter.java:1013)
>     at org.apache.hudi.table.action.BaseActionExecutor.lambda$writeTableMetadata$2(BaseActionExecutor.java:77)
>     at org.apache.hudi.common.util.Option.ifPresent(Option.java:97)
>     at org.apache.hudi.table.action.BaseActionExecutor.writeTableMetadata(BaseActionExecutor.java:77)
>     at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.finishRollback(BaseRollbackActionExecutor.java:264)
>     at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.runRollback(BaseRollbackActionExecutor.java:120)
>     at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.execute(BaseRollbackActionExecutor.java:141)
>     at org.apache.hudi.table.HoodieSparkMergeOnReadTable.rollback(HoodieSparkMergeOnReadTable.java:218)
>     at org.apache.hudi.client.BaseHoodieTableServiceClient.rollback(BaseHoodieTableServiceClient.java:901)
>     ... 16 more {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)