pvary commented on code in PR #6528:
URL: https://github.com/apache/iceberg/pull/6528#discussion_r1063261916


##########
flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java:
##########
@@ -554,21 +579,178 @@ public void testMultipleJobsWriteSameTable() throws 
Exception {
         assertFlinkManifests(0);
         SimpleDataUtil.assertTableRows(table, tableRows);
         assertSnapshotSize(i + 1);
-        assertMaxCommittedCheckpointId(jobId, checkpointId + 1);
+        assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId + 1);
       }
     }
   }
 
+  @Test
+  public void testMultipleSinksWriteSameTable() throws Exception {
+    long timestamp = 0;
+    List<RowData> tableRows = Lists.newArrayList();
+
+    JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()};
+    OperatorID[] operatorIds1 =
+        new OperatorID[] {new OperatorID(), new OperatorID(), new 
OperatorID()};
+    OperatorID[] operatorIds2 =
+        new OperatorID[] {new OperatorID(), new OperatorID(), new 
OperatorID()};
+    for (int i = 0; i < 20; i++) {
+      int jobIndex = i % 3;
+      int checkpointId = i / 3;
+      JobID jobId = jobs[jobIndex];
+      OperatorID operatorId1 = operatorIds1[jobIndex];
+      OperatorID operatorId2 = operatorIds2[jobIndex];
+      try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness1 = 
createStreamSink(jobId);
+          OneInputStreamOperatorTestHarness<WriteResult, Void> harness2 = 
createStreamSink(jobId)) {
+        harness1.getStreamConfig().setOperatorID(operatorId1);
+        harness1.setup();
+        harness1.open();
+        harness2.getStreamConfig().setOperatorID(operatorId2);
+        harness2.setup();
+        harness2.open();
+
+        assertSnapshotSize(2 * i);
+        assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId == 0 ? 
-1 : checkpointId);
+        assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId == 0 ? 
-1 : checkpointId);
+
+        List<RowData> rows1 = 
Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-1-" + i));
+        tableRows.addAll(rows1);
+
+        DataFile dataFile1 = writeDataFile(String.format("data-1-%d", i), 
rows1);
+        harness1.processElement(of(dataFile1), ++timestamp);
+        harness1.snapshot(checkpointId + 1, ++timestamp);
+
+        List<RowData> rows2 = 
Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-2-" + i));
+        tableRows.addAll(rows2);
+
+        DataFile dataFile2 = writeDataFile(String.format("data-2-%d", i), 
rows2);
+        harness2.processElement(of(dataFile2), ++timestamp);
+        harness2.snapshot(checkpointId + 1, ++timestamp);
+
+        assertFlinkManifests(2);
+
+        harness1.notifyOfCompletedCheckpoint(checkpointId + 1);
+        harness2.notifyOfCompletedCheckpoint(checkpointId + 1);
+
+        assertFlinkManifests(0);
+        SimpleDataUtil.assertTableRows(table, tableRows);
+        assertSnapshotSize(2 * i + 2);
+        assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId + 1);
+        assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId + 1);
+      }
+    }
+  }
+
+  @Test
+  public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception {
+    long checkpointId = 0;
+    long timestamp = 0;
+    List<RowData> expectedRows = Lists.newArrayList();
+    OperatorSubtaskState snapshot1;
+    OperatorSubtaskState snapshot2;
+
+    JobID jobId = new JobID();
+    OperatorID operatorId1 = new OperatorID();
+    OperatorID operatorId2 = new OperatorID();
+    try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness1 = 
createStreamSink(jobId);
+        OneInputStreamOperatorTestHarness<WriteResult, Void> harness2 = 
createStreamSink(jobId)) {
+      harness1.getStreamConfig().setOperatorID(operatorId1);
+      harness1.setup();
+      harness1.open();
+      harness2.getStreamConfig().setOperatorID(operatorId2);
+      harness2.setup();
+      harness2.open();
+
+      assertSnapshotSize(0);
+      assertMaxCommittedCheckpointId(jobId, operatorId1, -1L);
+      assertMaxCommittedCheckpointId(jobId, operatorId2, -1L);
+
+      RowData row1 = SimpleDataUtil.createRowData(1, "hello1");
+      expectedRows.add(row1);
+      DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1));
+
+      harness1.processElement(of(dataFile1), ++timestamp);
+      snapshot1 = harness1.snapshot(++checkpointId, ++timestamp);
+
+      RowData row2 = SimpleDataUtil.createRowData(1, "hello2");
+      expectedRows.add(row2);
+      DataFile dataFile2 = writeDataFile("data-1-2", ImmutableList.of(row2));
+
+      harness2.processElement(of(dataFile2), ++timestamp);
+      snapshot2 = harness2.snapshot(checkpointId, ++timestamp);
+      assertFlinkManifests(2);
+
+      // Only notify one of the committers
+      harness1.notifyOfCompletedCheckpoint(checkpointId);
+      assertFlinkManifests(1);
+
+      // Only the first row is committed at this point
+      SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1));
+      assertSnapshotSize(1);
+      assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId);
+      assertMaxCommittedCheckpointId(jobId, operatorId2, -1);
+    }
+
+    // Restore from the given snapshot
+    try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness1 = 
createStreamSink(jobId);
+        OneInputStreamOperatorTestHarness<WriteResult, Void> harness2 = 
createStreamSink(jobId)) {
+      harness1.getStreamConfig().setOperatorID(operatorId1);
+      harness1.setup();
+      harness1.initializeState(snapshot1);
+      harness1.open();
+
+      harness2.getStreamConfig().setOperatorID(operatorId2);

Review Comment:
   I might be misunderstanding something.
   
   We need the operatorId to be constant and static. If it changes, we will 
have issues with recovery, because all of the previously collected but not yet 
committed changes will be lost (the state is lost for the operator). 
This is already mentioned in the javadoc for 
`FlinkSink.Builder.uidPrefix(String)`. This behaviour does not change with this 
PR.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to