Repository: aurora
Updated Branches:
  refs/heads/master c69ccb9f1 -> e2ea19147


Fix cron id collision bug by avoiding state in Quartz jobs

There is a pretty rare situation that can occur that will cause the scheduler 
to crash.

The steps are:

1. Schedule and start a cron (runs every minute, graceful shutdown period
   > 1 minute)
2. Perform 2 runs of the cron
3. Deschedule the cron
4. Reschedule the cron
5. Perform 3 runs of the cron
6. Scheduler will crash on the 3rd run due to an ID collision between the 
already running cron and
a new cron trying to start

The reason for this bug is that some state is persisted between cron
scheduling/descheduling via `killFollowups`. We use Quartz `JobDataMap` to hold 
a "work in progress"
token, while the `killFollowups` set indicates "completion" in order to ensure 
there are no
concurrent runs. Descheduling a cron will remove the "work in progress" token 
while ignoring the
"completion" token in `killFollowups`. Later, a "work in progress" token may be 
added and
a "completion" token may be seen mistakenly from a previous schedule, causing a 
concurrent run.

For the example above, the runs in step 2 will add the key to the set to show 
that all runs are
finished and another run can start. The 3rd run in step
5 will mistakenly see that the 2nd run has started and finished since the 
"completion" token was
preserved from the first set of runs in step 2. This will erroneously trigger a 
concurrent run
causing a ID collision.

We should not preserve any state between cron scheduling/descheduling outside 
of the given Quartz
`JobDataMap` abstraction. We can use the presence of a value here to achieve 
the same thing as
`killFollowups`.

Reviewed at https://reviews.apache.org/r/65680/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/e2ea1914
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/e2ea1914
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/e2ea1914

Branch: refs/heads/master
Commit: e2ea191473397691605602c6e40c6aad8a56d81a
Parents: c69ccb9
Author: Jordan Ly <jordan....@gmail.com>
Authored: Mon Feb 19 21:33:21 2018 -0800
Committer: Jordan Ly <j...@twitter.com>
Committed: Mon Feb 19 21:33:21 2018 -0800

----------------------------------------------------------------------
 .../scheduler/cron/quartz/AuroraCronJob.java       | 17 ++++++++---------
 .../scheduler/cron/quartz/AuroraCronJobTest.java   |  9 +++++++--
 2 files changed, 15 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/e2ea1914/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java 
b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
index 3604dd4..b77e032 100644
--- a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
+++ b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
@@ -27,7 +27,6 @@ import javax.inject.Qualifier;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Sets;
 
 import org.apache.aurora.common.stats.Stats;
 import org.apache.aurora.common.stats.StatsProvider;
@@ -92,7 +91,6 @@ class AuroraCronJob implements Job, EventSubscriber {
   private final StateManager stateManager;
   private final BackoffHelper delayedStartBackoff;
   private final BatchWorker<NoResult> batchWorker;
-  private final Set<IJobKey> killFollowups = Sets.newConcurrentHashSet();
 
   /**
    * Annotation for the max cron batch size.
@@ -145,17 +143,16 @@ class AuroraCronJob implements Job, EventSubscriber {
 
     // Prevent a concurrent run for this job in case a previous trigger took 
longer to run.
     // This approach relies on saving the "work in progress" token within the 
job context itself
-    // (see below) and relying on killFollowups to signal "work completion".
+    // (see below).
     if (context.getJobDetail().getJobDataMap().containsKey(path)) {
       CRON_JOB_CONCURRENT_RUNS.incrementAndGet();
-      if (killFollowups.contains(key)) {
-        context.getJobDetail().getJobDataMap().remove(path);
-        killFollowups.remove(key);
-        LOG.info("Resetting job context for cron {}", path);
-      } else {
+      if (context.getJobDetail().getJobDataMap().get(path) == null) {
         LOG.info("Ignoring trigger as another concurrent run is active for 
cron {}", path);
         return;
       }
+
+      context.getJobDetail().getJobDataMap().remove(path);
+      LOG.info("Resetting job context for cron {}", path);
     }
 
     CompletableFuture<NoResult> scheduleResult = 
batchWorker.<NoResult>execute(storeProvider -> {
@@ -203,6 +200,8 @@ class AuroraCronJob implements Job, EventSubscriber {
 
           LOG.info("Waiting for job to terminate before launching cron job " + 
path);
           // Use job detail map to signal a "work in progress" condition to 
subsequent triggers.
+          // A null value indicates that work is still in progress, while a 
non-null value (key in
+          // this case) indicates that the work has completed.
           context.getJobDetail().getJobDataMap().put(path, null);
           batchWorker.executeWithReplay(
               delayedStartBackoff.getBackoffStrategy(),
@@ -218,7 +217,7 @@ class AuroraCronJob implements Job, EventSubscriber {
                 }
               })
               .thenAccept(ignored -> {
-                killFollowups.add(key);
+                context.getJobDetail().getJobDataMap().put(path, key);
                 LOG.info("Finished delayed launch for cron " + path);
               });
           break;

http://git-wip-us.apache.org/repos/asf/aurora/blob/e2ea1914/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java 
b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
index 5083fcd..8ae9bb5 100644
--- 
a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
+++ 
b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
@@ -50,7 +50,7 @@ import static org.easymock.EasyMock.capture;
 import static org.easymock.EasyMock.eq;
 import static org.easymock.EasyMock.expect;
 import static org.easymock.EasyMock.expectLastCall;
-import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 public class AuroraCronJobTest extends EasyMockTest {
@@ -157,13 +157,18 @@ public class AuroraCronJobTest extends EasyMockTest {
 
     // Simulate a trigger in progress.
     jobDetails.getJobDataMap().put(JobKeys.canonicalString(AURORA_JOB_KEY), 
null);
-    assertFalse(jobDetails.getJobDataMap().isEmpty());
+    assertEquals(
+        
jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)),
+        null);
 
     // Attempt a concurrent run that must be rejected.
     auroraCronJob.doExecute(context);
 
     // Complete previous run and trigger another one.
     killResult.complete(BatchWorker.NO_RESULT);
+    assertEquals(
+        
jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)),
+        AURORA_JOB_KEY);
     auroraCronJob.doExecute(context);
     assertTrue(jobDetails.getJobDataMap().isEmpty());
   }

Reply via email to