pnowojski commented on code in PR #20233:
URL: https://github.com/apache/flink/pull/20233#discussion_r925283630


##########
flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/channel/ChannelStateWriterImpl.java:
##########
@@ -60,89 +63,73 @@
 public class ChannelStateWriterImpl implements ChannelStateWriter {
 
     private static final Logger LOG = 
LoggerFactory.getLogger(ChannelStateWriterImpl.class);
-    private static final int DEFAULT_MAX_CHECKPOINTS =
-            1000; // includes max-concurrent-checkpoints + checkpoints to be 
aborted (scheduled via
-    // mailbox)
 
     private final String taskName;
     private final ChannelStateWriteRequestExecutor executor;
-    private final ConcurrentMap<Long, ChannelStateWriteResult> results;
-    private final int maxCheckpoints;
 
-    /**
-     * Creates a {@link ChannelStateWriterImpl} with {@link 
#DEFAULT_MAX_CHECKPOINTS} as {@link
-     * #maxCheckpoints}.
-     */
-    public ChannelStateWriterImpl(
-            String taskName, int subtaskIndex, CheckpointStorageWorkerView 
streamFactoryResolver) {
-        this(taskName, subtaskIndex, streamFactoryResolver, 
DEFAULT_MAX_CHECKPOINTS);
-    }
+    private final Object lock = new Object();
+
+    @GuardedBy("lock")
+    private long ongoingCheckpointId;
+
+    // The checkpoint that checkpointId is less than or equal to 
maxAbortedCheckpointId should be
+    // aborted.
+    @GuardedBy("lock")
+    private long maxAbortedCheckpointId;
+
+    // The channel state write result of ongoingCheckpointId.
+    @GuardedBy("lock")
+    private ChannelStateWriteResult result;
 
     /**
      * Creates a {@link ChannelStateWriterImpl} with {@link 
ChannelStateSerializerImpl default}
      * {@link ChannelStateSerializer}, and a {@link 
ChannelStateWriteRequestExecutorImpl}.
      *
      * @param taskName
      * @param streamFactoryResolver a factory to obtain output stream factory 
for a given checkpoint
-     * @param maxCheckpoints maximum number of checkpoints to be written 
currently or finished but
-     *     not taken yet.
      */
-    ChannelStateWriterImpl(
-            String taskName,
-            int subtaskIndex,
-            CheckpointStorageWorkerView streamFactoryResolver,
-            int maxCheckpoints) {
+    public ChannelStateWriterImpl(
+            String taskName, int subtaskIndex, CheckpointStorageWorkerView 
streamFactoryResolver) {
         this(
                 taskName,
-                new ConcurrentHashMap<>(maxCheckpoints),
                 new ChannelStateWriteRequestExecutorImpl(
                         taskName,
                         new ChannelStateWriteRequestDispatcherImpl(
                                 taskName,
                                 subtaskIndex,
                                 streamFactoryResolver,
-                                new ChannelStateSerializerImpl())),
-                maxCheckpoints);
+                                new ChannelStateSerializerImpl())));
     }
 
-    ChannelStateWriterImpl(
-            String taskName,
-            ConcurrentMap<Long, ChannelStateWriteResult> results,
-            ChannelStateWriteRequestExecutor executor,
-            int maxCheckpoints) {
+    ChannelStateWriterImpl(String taskName, ChannelStateWriteRequestExecutor 
executor) {
         this.taskName = taskName;
-        this.results = results;
-        this.maxCheckpoints = maxCheckpoints;
         this.executor = executor;
+        this.ongoingCheckpointId = 0;
+        this.maxAbortedCheckpointId = 0;
     }
 
     @Override
     public void start(long checkpointId, CheckpointOptions checkpointOptions) {
         LOG.debug("{} starting checkpoint {} ({})", taskName, checkpointId, 
checkpointOptions);
-        ChannelStateWriteResult result = new ChannelStateWriteResult();
-        ChannelStateWriteResult put =
-                results.computeIfAbsent(
+        synchronized (lock) {
+            Preconditions.checkState(checkpointId > ongoingCheckpointId);
+            if (isCheckpointSubsumedOrAborted(checkpointId)) {
+                LOG.debug(
+                        "The checkpoint {} of task {} has been aborted, so 
don't start.",
                         checkpointId,
-                        id -> {
-                            Preconditions.checkState(
-                                    results.size() < maxCheckpoints,
-                                    String.format(
-                                            "%s can't start %d, results.size() 
> maxCheckpoints: %d > %d",
-                                            taskName,
-                                            checkpointId,
-                                            results.size(),
-                                            maxCheckpoints));
-                            enqueue(
-                                    new CheckpointStartRequest(
-                                            checkpointId,
-                                            result,
-                                            
checkpointOptions.getTargetLocation()),
-                                    false);
-                            return result;
-                        });
-        Preconditions.checkArgument(
-                put == result,
-                taskName + " result future already present for checkpoint " + 
checkpointId);
+                        taskName);
+                return;
+            }
+            if (result != null) {
+                result.fail(new CancellationException("Cancel old 
checkpoint."));

Review Comment:
   I think the exception should be `new 
CheckpointException(CheckpointFailureReason#CHECKPOINT_DECLINED_SUBSUMED)`?



##########
flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/channel/ChannelStateWriterImpl.java:
##########
@@ -60,89 +63,73 @@
 public class ChannelStateWriterImpl implements ChannelStateWriter {
 
     private static final Logger LOG = 
LoggerFactory.getLogger(ChannelStateWriterImpl.class);
-    private static final int DEFAULT_MAX_CHECKPOINTS =
-            1000; // includes max-concurrent-checkpoints + checkpoints to be 
aborted (scheduled via
-    // mailbox)
 
     private final String taskName;
     private final ChannelStateWriteRequestExecutor executor;
-    private final ConcurrentMap<Long, ChannelStateWriteResult> results;
-    private final int maxCheckpoints;
 
-    /**
-     * Creates a {@link ChannelStateWriterImpl} with {@link 
#DEFAULT_MAX_CHECKPOINTS} as {@link
-     * #maxCheckpoints}.
-     */
-    public ChannelStateWriterImpl(
-            String taskName, int subtaskIndex, CheckpointStorageWorkerView 
streamFactoryResolver) {
-        this(taskName, subtaskIndex, streamFactoryResolver, 
DEFAULT_MAX_CHECKPOINTS);
-    }
+    private final Object lock = new Object();
+
+    @GuardedBy("lock")
+    private long ongoingCheckpointId;
+
+    // The checkpoint that checkpointId is less than or equal to 
maxAbortedCheckpointId should be
+    // aborted.
+    @GuardedBy("lock")
+    private long maxAbortedCheckpointId;
+
+    // The channel state write result of ongoingCheckpointId.
+    @GuardedBy("lock")
+    private ChannelStateWriteResult result;
 
     /**
      * Creates a {@link ChannelStateWriterImpl} with {@link 
ChannelStateSerializerImpl default}
      * {@link ChannelStateSerializer}, and a {@link 
ChannelStateWriteRequestExecutorImpl}.
      *
      * @param taskName
      * @param streamFactoryResolver a factory to obtain output stream factory 
for a given checkpoint
-     * @param maxCheckpoints maximum number of checkpoints to be written 
currently or finished but
-     *     not taken yet.
      */
-    ChannelStateWriterImpl(
-            String taskName,
-            int subtaskIndex,
-            CheckpointStorageWorkerView streamFactoryResolver,
-            int maxCheckpoints) {
+    public ChannelStateWriterImpl(
+            String taskName, int subtaskIndex, CheckpointStorageWorkerView 
streamFactoryResolver) {
         this(
                 taskName,
-                new ConcurrentHashMap<>(maxCheckpoints),
                 new ChannelStateWriteRequestExecutorImpl(
                         taskName,
                         new ChannelStateWriteRequestDispatcherImpl(
                                 taskName,
                                 subtaskIndex,
                                 streamFactoryResolver,
-                                new ChannelStateSerializerImpl())),
-                maxCheckpoints);
+                                new ChannelStateSerializerImpl())));
     }
 
-    ChannelStateWriterImpl(
-            String taskName,
-            ConcurrentMap<Long, ChannelStateWriteResult> results,
-            ChannelStateWriteRequestExecutor executor,
-            int maxCheckpoints) {
+    ChannelStateWriterImpl(String taskName, ChannelStateWriteRequestExecutor 
executor) {
         this.taskName = taskName;
-        this.results = results;
-        this.maxCheckpoints = maxCheckpoints;
         this.executor = executor;
+        this.ongoingCheckpointId = 0;
+        this.maxAbortedCheckpointId = 0;
     }
 
     @Override
     public void start(long checkpointId, CheckpointOptions checkpointOptions) {
         LOG.debug("{} starting checkpoint {} ({})", taskName, checkpointId, 
checkpointOptions);
-        ChannelStateWriteResult result = new ChannelStateWriteResult();
-        ChannelStateWriteResult put =
-                results.computeIfAbsent(
+        synchronized (lock) {
+            Preconditions.checkState(checkpointId > ongoingCheckpointId);
+            if (isCheckpointSubsumedOrAborted(checkpointId)) {
+                LOG.debug(
+                        "The checkpoint {} of task {} has been aborted, so 
don't start.",
                         checkpointId,
-                        id -> {
-                            Preconditions.checkState(
-                                    results.size() < maxCheckpoints,
-                                    String.format(
-                                            "%s can't start %d, results.size() 
> maxCheckpoints: %d > %d",
-                                            taskName,
-                                            checkpointId,
-                                            results.size(),
-                                            maxCheckpoints));
-                            enqueue(
-                                    new CheckpointStartRequest(
-                                            checkpointId,
-                                            result,
-                                            
checkpointOptions.getTargetLocation()),
-                                    false);
-                            return result;
-                        });
-        Preconditions.checkArgument(
-                put == result,
-                taskName + " result future already present for checkpoint " + 
checkpointId);
+                        taskName);
+                return;
+            }
+            if (result != null) {
+                result.fail(new CancellationException("Cancel old 
checkpoint."));

Review Comment:
   I think it would be safer to fail the futures outside of this lock. We don't 
know what other locks will be acquired from the actions that are connected to 
those futures.
   
   So something like:
   
   ```
   @Nullable ChannelStatWriteResult toFail = null;
   
   synchronized (lock) { 
     ...
     toFail = result;
     ...
   }
   
   if (toFail != null) {
     toFail.fail();
   }
   ```
   ?
   
   Or maybe for the sake of being consistent use the same pattern as in other 
places?
   
   ```
    enqueue(ChannelStateWriteRequest.abort(...));
   ```
   
   as it _seems_ to achieve a similar thing?
   
   I'm not sure which of this option is better here.



##########
flink-runtime/src/test/java/org/apache/flink/runtime/checkpoint/channel/ChannelStateWriterImplTest.java:
##########
@@ -120,15 +141,28 @@ public void testAbortIgnoresMissing() throws Exception {
         runWithSyncWorker(this::callAbort);
     }
 
+    @Test
+    public void testOldCheckpointIsAborted() {
+        ChannelStateWriterImpl writer = openWriter();
+        callStart(1, writer);
+        ChannelStateWriteResult result1 = writer.getWriteResult(1);
+        assertFalse(result1.isDone());
+
+        callStart(2, writer);
+        assertNull(writer.getWriteResult(1));
+        assertFalse(writer.getWriteResult(2).isDone());
+
+        assertWriteResultIsCompletedExceptionally(result1);

Review Comment:
   Can you add assertion that the exception is 
`CheckpointException(CheckpointFailureReason#CHECKPOINT_DECLINED_SUBSUMED)`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to