[ https://issues.apache.org/jira/browse/IGNITE-15192?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ilya Kazakov updated IGNITE-15192: ---------------------------------- Description: It is about a race which was detected in https://issues.apache.org/jira/browse/IGNITE-15099. The fix from the ticket above fixed the wrong heartbeat, but did not fix the race, which allows the checkpointer thread to go ahead without awaiting on ctx0.awaitPendingTasksFinished() (in CheckpointWorkflow.markCheckpointBegin), which leads to: - the checkpointer thread enters a blocking section - and after this the checkpointer thread heartbeat could be updated by a parallel thread. {code:java} // CheckpointContextImpl#executor @Override public Executor executor() { return asyncRunner == null ? null : cmd -> { try { GridFutureAdapter<?> res = new GridFutureAdapter<>(); res.listen(fut -> heartbeatUpdater.updateHeartbeat()); // Listener is invoked concurrently with pending future finish asyncRunner.execute(U.wrapIgniteFuture(cmd, res)); pendingTaskFuture.add(res); } catch (RejectedExecutionException e) { assert false : "A task should never be rejected by async runner"; } }; } {code} {code:java} // Checkpointer#waitCheckpointEvent try { synchronized (this) { long remaining = U.nanosToMillis(scheduledCp.nextCpNanos - System.nanoTime()); while (remaining > 0 && !isCancelled()) { blockingSectionBegin(); try { wait(remaining); // At this point and till blockingSectionEnd call heartbeat should be equal to Long.MAX_VALUE remaining = U.nanosToMillis(scheduledCp.nextCpNanos - System.nanoTime()); } finally { blockingSectionEnd(); } } } } {code} Discussion is here: [https://lists.apache.org/thread.html/r789abd9005d70a8fa1de29d3af394069e859ca6e1eea8bfd3e3e0494%40%3Cdev.ignite.apache.org%3E] was: It is about race which was detected in https://issues.apache.org/jira/browse/IGNITE-15099. {code:java} // CheckpointContextImpl#executor @Override public Executor executor() { return asyncRunner == null ? 
null : cmd -> { try { GridFutureAdapter<?> res = new GridFutureAdapter<>(); res.listen(fut -> heartbeatUpdater.updateHeartbeat()); // Listener is invoked concurrently with pending future finish asyncRunner.execute(U.wrapIgniteFuture(cmd, res)); pendingTaskFuture.add(res); } catch (RejectedExecutionException e) { assert false : "A task should never be rejected by async runner"; } }; } {code} {code:java} // Checkpointer#waitCheckpointEvent try { synchronized (this) { long remaining = U.nanosToMillis(scheduledCp.nextCpNanos - System.nanoTime()); while (remaining > 0 && !isCancelled()) { blockingSectionBegin(); try { wait(remaining); // At this point and till blockingSectionEnd call heartbeat should be equal to Long.MAX_VALUE remaining = U.nanosToMillis(scheduledCp.nextCpNanos - System.nanoTime()); } finally { blockingSectionEnd(); } } } } {code} Discussion is here: https://lists.apache.org/thread.html/r789abd9005d70a8fa1de29d3af394069e859ca6e1eea8bfd3e3e0494%40%3Cdev.ignite.apache.org%3E > Fix race in Checkpointer listeners invocation and illegal > Checkpointer-heartbeat update from different threads > -------------------------------------------------------------------------------------------------------------- > > Key: IGNITE-15192 > URL: https://issues.apache.org/jira/browse/IGNITE-15192 > Project: Ignite > Issue Type: Improvement > Affects Versions: 2.10 > Reporter: Ilya Kazakov > Assignee: Ilya Kazakov > Priority: Minor > Time Spent: 10m > Remaining Estimate: 0h > > It is about a race which was detected in > https://issues.apache.org/jira/browse/IGNITE-15099. > The fix from the ticket above fixed the wrong heartbeat, but did not fix the > race, which allows the checkpointer thread to go ahead without awaiting on > ctx0.awaitPendingTasksFinished() (in CheckpointWorkflow.markCheckpointBegin), > which leads to: > - the checkpointer thread enters a blocking section > - and after this the checkpointer thread heartbeat could be updated by a parallel > thread. 
> > {code:java} > // CheckpointContextImpl#executor > @Override public Executor executor() { > return asyncRunner == null ? null : cmd -> { > try { > GridFutureAdapter<?> res = new GridFutureAdapter<>(); > res.listen(fut -> heartbeatUpdater.updateHeartbeat()); // > Listener is invoked concurrently with pending future finish > asyncRunner.execute(U.wrapIgniteFuture(cmd, res)); > pendingTaskFuture.add(res); > } > catch (RejectedExecutionException e) { > assert false : "A task should never be rejected by async > runner"; > } > }; > } > {code} > > {code:java} > // Checkpointer#waitCheckpointEvent > try { > synchronized (this) { > long remaining = U.nanosToMillis(scheduledCp.nextCpNanos - > System.nanoTime()); > while (remaining > 0 && !isCancelled()) { > blockingSectionBegin(); > try { > wait(remaining); > // At this point and till blockingSectionEnd call heartbeat > should be equal to Long.MAX_VALUE > remaining = U.nanosToMillis(scheduledCp.nextCpNanos - > System.nanoTime()); > } > finally { > blockingSectionEnd(); > } > } > } > } > {code} > Discussion is here: > [https://lists.apache.org/thread.html/r789abd9005d70a8fa1de29d3af394069e859ca6e1eea8bfd3e3e0494%40%3Cdev.ignite.apache.org%3E] -- This message was sent by Atlassian Jira (v8.3.4#803005)