Repository: aurora Updated Branches: refs/heads/master d95fc2ff9 -> e3f496a65
Revert "Fix cron id collision bug by avoiding state in Quartz jobs" This reverts commit e2ea191473397691605602c6e40c6aad8a56d81a. A bug was found where jobs that were killed via the KILL_EXISTING flag would set `path` as `null` in `JobDataMap` that would block concurrent runs, but that value would never be set to `key` after the the delayed run finished because it would run outside of the `Job` execution. The issue in https://reviews.apache.org/r/65680/ will occur again, but it is rare and has been around for a few years. This bug was not caught in the unit test `testKillExisting` because `executeWithReplay` is mocked and runs synchronously within the `Job` execution, allowing the `key` to be persisted. Reviewed at https://reviews.apache.org/r/65810/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/e3f496a6 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/e3f496a6 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/e3f496a6 Branch: refs/heads/master Commit: e3f496a65eae03dde0392627f3fcde7b93a82df9 Parents: d95fc2f Author: Jordan Ly <[email protected]> Authored: Tue Feb 27 10:18:46 2018 -0800 Committer: Jordan Ly <[email protected]> Committed: Tue Feb 27 10:18:46 2018 -0800 ---------------------------------------------------------------------- .../scheduler/cron/quartz/AuroraCronJob.java | 17 +++++++++-------- .../scheduler/cron/quartz/AuroraCronJobTest.java | 9 ++------- 2 files changed, 11 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/e3f496a6/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java index b77e032..3604dd4 100644 --- a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java +++ b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java @@ -27,6 +27,7 @@ import javax.inject.Qualifier; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; import org.apache.aurora.common.stats.Stats; import org.apache.aurora.common.stats.StatsProvider; @@ -91,6 +92,7 @@ class AuroraCronJob implements Job, EventSubscriber { private final StateManager stateManager; private final BackoffHelper delayedStartBackoff; private final BatchWorker<NoResult> batchWorker; + private final Set<IJobKey> killFollowups = Sets.newConcurrentHashSet(); /** * Annotation for the max cron batch size. @@ -143,16 +145,17 @@ class AuroraCronJob implements Job, EventSubscriber { // Prevent a concurrent run for this job in case a previous trigger took longer to run. // This approach relies on saving the "work in progress" token within the job context itself - // (see below). + // (see below) and relying on killFollowups to signal "work completion". if (context.getJobDetail().getJobDataMap().containsKey(path)) { CRON_JOB_CONCURRENT_RUNS.incrementAndGet(); - if (context.getJobDetail().getJobDataMap().get(path) == null) { + if (killFollowups.contains(key)) { + context.getJobDetail().getJobDataMap().remove(path); + killFollowups.remove(key); + LOG.info("Resetting job context for cron {}", path); + } else { LOG.info("Ignoring trigger as another concurrent run is active for cron {}", path); return; } - - context.getJobDetail().getJobDataMap().remove(path); - LOG.info("Resetting job context for cron {}", path); } CompletableFuture<NoResult> scheduleResult = batchWorker.<NoResult>execute(storeProvider -> { @@ -200,8 +203,6 @@ class AuroraCronJob implements Job, EventSubscriber { LOG.info("Waiting for job to terminate before launching cron job " + path); // Use job detail map to signal a "work in progress" condition to subsequent triggers. - // A null value indicates that work is still in progress, while a non-null value (key in - // this case) indicates that the work has completed. context.getJobDetail().getJobDataMap().put(path, null); batchWorker.executeWithReplay( delayedStartBackoff.getBackoffStrategy(), @@ -217,7 +218,7 @@ class AuroraCronJob implements Job, EventSubscriber { } }) .thenAccept(ignored -> { - context.getJobDetail().getJobDataMap().put(path, key); + killFollowups.add(key); LOG.info("Finished delayed launch for cron " + path); }); break; http://git-wip-us.apache.org/repos/asf/aurora/blob/e3f496a6/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java index 8ae9bb5..5083fcd 100644 --- a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java +++ b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java @@ -50,7 +50,7 @@ import static org.easymock.EasyMock.capture; import static org.easymock.EasyMock.eq; import static org.easymock.EasyMock.expect; import static org.easymock.EasyMock.expectLastCall; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; public class AuroraCronJobTest extends EasyMockTest { @@ -157,18 +157,13 @@ public class AuroraCronJobTest extends EasyMockTest { // Simulate a trigger in progress. jobDetails.getJobDataMap().put(JobKeys.canonicalString(AURORA_JOB_KEY), null); - assertEquals( - jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)), - null); + assertFalse(jobDetails.getJobDataMap().isEmpty()); // Attempt a concurrent run that must be rejected. auroraCronJob.doExecute(context); // Complete previous run and trigger another one. killResult.complete(BatchWorker.NO_RESULT); - assertEquals( - jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)), - AURORA_JOB_KEY); auroraCronJob.doExecute(context); assertTrue(jobDetails.getJobDataMap().isEmpty()); }
