Repository: aurora Updated Branches: refs/heads/master c69ccb9f1 -> e2ea19147
Fix cron id collision bug by avoiding state in Quartz jobs There is a pretty rare situation that can occur that will cause the scheduler to crash. The steps are: 1. Schedule and start a cron (runs every minute, graceful shutdown period > 1 minute) 2. Perform 2 runs of the cron 3. Deschedule the cron 4. Reschedule the cron 5. Perform 3 runs of the cron 6. Scheduler will crash on the 3rd run due to an ID collision between the already running cron and a new cron trying to start The reason for this bug is that some state is persisted between cron scheduling/descheduling via `killFollowups`. We use Quartz `JobDataMap` to hold a "work in progress" token, while the `killFollowups` set indicates "completion" in order to ensure there are no concurrent runs. Descheduling a cron will remove the "work in progress" token while ignoring the "completion" token in `killFollowups`. Later, a "work in progress" token may be added and a "completion" token may be seen mistakenly from a previous schedule, causing a concurrent run. For the example above, the runs in step 2 will add the key to the set to show that all runs are finished and another run can start. The 3rd run in step 5 will mistakenly see that the 2nd run has started and finished since the "completion" token was preserved from the first set of runs in step 2. This will erroneously trigger a concurrent run causing a ID collision. We should not preserve any state between cron scheduling/descheduling outside of the given Quartz `JobDataMap` abstraction. We can use the presence of a value here to achieve the same thing as `killFollowups`. Reviewed at https://reviews.apache.org/r/65680/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/e2ea1914 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/e2ea1914 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/e2ea1914 Branch: refs/heads/master Commit: e2ea191473397691605602c6e40c6aad8a56d81a Parents: c69ccb9 Author: Jordan Ly <[email protected]> Authored: Mon Feb 19 21:33:21 2018 -0800 Committer: Jordan Ly <[email protected]> Committed: Mon Feb 19 21:33:21 2018 -0800 ---------------------------------------------------------------------- .../scheduler/cron/quartz/AuroraCronJob.java | 17 ++++++++--------- .../scheduler/cron/quartz/AuroraCronJobTest.java | 9 +++++++-- 2 files changed, 15 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/e2ea1914/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java index 3604dd4..b77e032 100644 --- a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java +++ b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java @@ -27,7 +27,6 @@ import javax.inject.Qualifier; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; import org.apache.aurora.common.stats.Stats; import org.apache.aurora.common.stats.StatsProvider; @@ -92,7 +91,6 @@ class AuroraCronJob implements Job, EventSubscriber { private final StateManager stateManager; private final BackoffHelper delayedStartBackoff; private final BatchWorker<NoResult> batchWorker; - private final Set<IJobKey> killFollowups = Sets.newConcurrentHashSet(); /** * Annotation for the max cron batch size. @@ -145,17 +143,16 @@ class AuroraCronJob implements Job, EventSubscriber { // Prevent a concurrent run for this job in case a previous trigger took longer to run. // This approach relies on saving the "work in progress" token within the job context itself - // (see below) and relying on killFollowups to signal "work completion". + // (see below). if (context.getJobDetail().getJobDataMap().containsKey(path)) { CRON_JOB_CONCURRENT_RUNS.incrementAndGet(); - if (killFollowups.contains(key)) { - context.getJobDetail().getJobDataMap().remove(path); - killFollowups.remove(key); - LOG.info("Resetting job context for cron {}", path); - } else { + if (context.getJobDetail().getJobDataMap().get(path) == null) { LOG.info("Ignoring trigger as another concurrent run is active for cron {}", path); return; } + + context.getJobDetail().getJobDataMap().remove(path); + LOG.info("Resetting job context for cron {}", path); } CompletableFuture<NoResult> scheduleResult = batchWorker.<NoResult>execute(storeProvider -> { @@ -203,6 +200,8 @@ class AuroraCronJob implements Job, EventSubscriber { LOG.info("Waiting for job to terminate before launching cron job " + path); // Use job detail map to signal a "work in progress" condition to subsequent triggers. + // A null value indicates that work is still in progress, while a non-null value (key in + // this case) indicates that the work has completed. context.getJobDetail().getJobDataMap().put(path, null); batchWorker.executeWithReplay( delayedStartBackoff.getBackoffStrategy(), @@ -218,7 +217,7 @@ class AuroraCronJob implements Job, EventSubscriber { } }) .thenAccept(ignored -> { - killFollowups.add(key); + context.getJobDetail().getJobDataMap().put(path, key); LOG.info("Finished delayed launch for cron " + path); }); break; http://git-wip-us.apache.org/repos/asf/aurora/blob/e2ea1914/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java index 5083fcd..8ae9bb5 100644 --- a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java +++ b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java @@ -50,7 +50,7 @@ import static org.easymock.EasyMock.capture; import static org.easymock.EasyMock.eq; import static org.easymock.EasyMock.expect; import static org.easymock.EasyMock.expectLastCall; -import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class AuroraCronJobTest extends EasyMockTest { @@ -157,13 +157,18 @@ public class AuroraCronJobTest extends EasyMockTest { // Simulate a trigger in progress. jobDetails.getJobDataMap().put(JobKeys.canonicalString(AURORA_JOB_KEY), null); - assertFalse(jobDetails.getJobDataMap().isEmpty()); + assertEquals( + jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)), + null); // Attempt a concurrent run that must be rejected. auroraCronJob.doExecute(context); // Complete previous run and trigger another one. killResult.complete(BatchWorker.NO_RESULT); + assertEquals( + jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)), + AURORA_JOB_KEY); auroraCronJob.doExecute(context); assertTrue(jobDetails.getJobDataMap().isEmpty()); }
