1996fanrui commented on code in PR #20233:
URL: https://github.com/apache/flink/pull/20233#discussion_r923351868
##########
flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/channel/ChannelStateWriterImpl.java:
##########
@@ -60,89 +63,78 @@
public class ChannelStateWriterImpl implements ChannelStateWriter {
private static final Logger LOG =
LoggerFactory.getLogger(ChannelStateWriterImpl.class);
- private static final int DEFAULT_MAX_CHECKPOINTS =
- 1000; // includes max-concurrent-checkpoints + checkpoints to be
aborted (scheduled via
- // mailbox)
Review Comment:
Hi @pnowojski , thanks for your review.
The `checkpointId < nextExpectedCheckpointId` may happen. For example,
`AlternatingWaitingForFirstBarrierUnaligned#barrierReceived` don't check
whether checkpointId is aborted, if received a barrier, it will call the
ChannelStateWriter.start().
So, we should use `ongoingCheckpointId`, and just update it within
`ChannelStateWriterImpl#start`
We should ignore checkpoint when the checkpointId is aborted, so we need
the `NavigableSet<Long> abortedCheckpointIds;`.
##########
flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/channel/ChannelStateWriterImpl.java:
##########
@@ -60,89 +63,78 @@
public class ChannelStateWriterImpl implements ChannelStateWriter {
private static final Logger LOG =
LoggerFactory.getLogger(ChannelStateWriterImpl.class);
- private static final int DEFAULT_MAX_CHECKPOINTS =
- 1000; // includes max-concurrent-checkpoints + checkpoints to be
aborted (scheduled via
- // mailbox)
private final String taskName;
private final ChannelStateWriteRequestExecutor executor;
- private final ConcurrentMap<Long, ChannelStateWriteResult> results;
- private final int maxCheckpoints;
- /**
- * Creates a {@link ChannelStateWriterImpl} with {@link
#DEFAULT_MAX_CHECKPOINTS} as {@link
- * #maxCheckpoints}.
- */
- public ChannelStateWriterImpl(
- String taskName, int subtaskIndex, CheckpointStorageWorkerView
streamFactoryResolver) {
- this(taskName, subtaskIndex, streamFactoryResolver,
DEFAULT_MAX_CHECKPOINTS);
- }
+ private final Object lock = new Object();
+
+ @GuardedBy("lock")
+ private long ongoingCheckpointId;
+
+ @GuardedBy("lock")
+ private final NavigableSet<Long> abortedCheckpointIds;
+
+ // The result of ongoingCheckpointId, the checkpoint that CheckpointId is
less than
+ // ongoingCheckpointId should be aborted due to concurrent unaligned
checkpoint is currently not
+ // supported.
+ @GuardedBy("lock")
+ private ChannelStateWriteResult result;
/**
* Creates a {@link ChannelStateWriterImpl} with {@link
ChannelStateSerializerImpl default}
* {@link ChannelStateSerializer}, and a {@link
ChannelStateWriteRequestExecutorImpl}.
*
* @param taskName
* @param streamFactoryResolver a factory to obtain output stream factory
for a given checkpoint
- * @param maxCheckpoints maximum number of checkpoints to be written
currently or finished but
- * not taken yet.
*/
- ChannelStateWriterImpl(
- String taskName,
- int subtaskIndex,
- CheckpointStorageWorkerView streamFactoryResolver,
- int maxCheckpoints) {
+ public ChannelStateWriterImpl(
+ String taskName, int subtaskIndex, CheckpointStorageWorkerView
streamFactoryResolver) {
this(
taskName,
- new ConcurrentHashMap<>(maxCheckpoints),
new ChannelStateWriteRequestExecutorImpl(
taskName,
new ChannelStateWriteRequestDispatcherImpl(
taskName,
subtaskIndex,
streamFactoryResolver,
- new ChannelStateSerializerImpl())),
- maxCheckpoints);
+ new ChannelStateSerializerImpl())));
}
- ChannelStateWriterImpl(
- String taskName,
- ConcurrentMap<Long, ChannelStateWriteResult> results,
- ChannelStateWriteRequestExecutor executor,
- int maxCheckpoints) {
+ ChannelStateWriterImpl(String taskName, ChannelStateWriteRequestExecutor
executor) {
this.taskName = taskName;
- this.results = results;
- this.maxCheckpoints = maxCheckpoints;
this.executor = executor;
+ this.abortedCheckpointIds = new TreeSet<>();
+ this.ongoingCheckpointId = 0;
}
@Override
public void start(long checkpointId, CheckpointOptions checkpointOptions) {
LOG.debug("{} starting checkpoint {} ({})", taskName, checkpointId,
checkpointOptions);
- ChannelStateWriteResult result = new ChannelStateWriteResult();
- ChannelStateWriteResult put =
- results.computeIfAbsent(
+ synchronized (lock) {
+ if (isUnavailableCheckpoint(checkpointId)) {
+ LOG.debug(
+ "The checkpoint {} of task {} has been aborted, so
don't start.",
checkpointId,
- id -> {
- Preconditions.checkState(
- results.size() < maxCheckpoints,
- String.format(
- "%s can't start %d, results.size()
> maxCheckpoints: %d > %d",
- taskName,
- checkpointId,
- results.size(),
- maxCheckpoints));
- enqueue(
- new CheckpointStartRequest(
- checkpointId,
- result,
-
checkpointOptions.getTargetLocation()),
- false);
- return result;
- });
- Preconditions.checkArgument(
- put == result,
- taskName + " result future already present for checkpoint " +
checkpointId);
+ taskName);
Review Comment:
It may happen. For example,
`AlternatingWaitingForFirstBarrierUnaligned#barrierReceived` don't check
whether checkpointId is aborted, if received a barrier, it will call the
ChannelStateWriter.start().
##########
flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/channel/ChannelStateWriterImpl.java:
##########
@@ -60,89 +63,78 @@
public class ChannelStateWriterImpl implements ChannelStateWriter {
private static final Logger LOG =
LoggerFactory.getLogger(ChannelStateWriterImpl.class);
- private static final int DEFAULT_MAX_CHECKPOINTS =
- 1000; // includes max-concurrent-checkpoints + checkpoints to be
aborted (scheduled via
- // mailbox)
private final String taskName;
private final ChannelStateWriteRequestExecutor executor;
- private final ConcurrentMap<Long, ChannelStateWriteResult> results;
- private final int maxCheckpoints;
- /**
- * Creates a {@link ChannelStateWriterImpl} with {@link
#DEFAULT_MAX_CHECKPOINTS} as {@link
- * #maxCheckpoints}.
- */
- public ChannelStateWriterImpl(
- String taskName, int subtaskIndex, CheckpointStorageWorkerView
streamFactoryResolver) {
- this(taskName, subtaskIndex, streamFactoryResolver,
DEFAULT_MAX_CHECKPOINTS);
- }
+ private final Object lock = new Object();
+
+ @GuardedBy("lock")
+ private long ongoingCheckpointId;
+
+ @GuardedBy("lock")
+ private final NavigableSet<Long> abortedCheckpointIds;
+
+ // The result of ongoingCheckpointId, the checkpoint that CheckpointId is
less than
+ // ongoingCheckpointId should be aborted due to concurrent unaligned
checkpoint is currently not
+ // supported.
+ @GuardedBy("lock")
+ private ChannelStateWriteResult result;
Review Comment:
Removed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]