Repository: aurora
Updated Branches:
  refs/heads/master d95fc2ff9 -> e3f496a65


Revert "Fix cron id collision bug by avoiding state in Quartz jobs"

This reverts commit e2ea191473397691605602c6e40c6aad8a56d81a.

A bug was found where jobs that were killed via the KILL_EXISTING flag would 
set `path` as `null` in
`JobDataMap` that would block concurrent runs, but that value would never be 
set to `key` after the
the delayed run finished because it would run outside of the `Job` execution.

The issue in https://reviews.apache.org/r/65680/ will occur again, but it is 
rare and has been
around for a few years.

This bug was not caught in the unit test `testKillExisting` because 
`executeWithReplay` is mocked
and runs synchronously within the `Job` execution, allowing the `key` to be 
persisted.

Reviewed at https://reviews.apache.org/r/65810/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/e3f496a6
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/e3f496a6
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/e3f496a6

Branch: refs/heads/master
Commit: e3f496a65eae03dde0392627f3fcde7b93a82df9
Parents: d95fc2f
Author: Jordan Ly <[email protected]>
Authored: Tue Feb 27 10:18:46 2018 -0800
Committer: Jordan Ly <[email protected]>
Committed: Tue Feb 27 10:18:46 2018 -0800

----------------------------------------------------------------------
 .../scheduler/cron/quartz/AuroraCronJob.java       | 17 +++++++++--------
 .../scheduler/cron/quartz/AuroraCronJobTest.java   |  9 ++-------
 2 files changed, 11 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/e3f496a6/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java 
b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
index b77e032..3604dd4 100644
--- a/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
+++ b/src/main/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJob.java
@@ -27,6 +27,7 @@ import javax.inject.Qualifier;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
 
 import org.apache.aurora.common.stats.Stats;
 import org.apache.aurora.common.stats.StatsProvider;
@@ -91,6 +92,7 @@ class AuroraCronJob implements Job, EventSubscriber {
   private final StateManager stateManager;
   private final BackoffHelper delayedStartBackoff;
   private final BatchWorker<NoResult> batchWorker;
+  private final Set<IJobKey> killFollowups = Sets.newConcurrentHashSet();
 
   /**
    * Annotation for the max cron batch size.
@@ -143,16 +145,17 @@ class AuroraCronJob implements Job, EventSubscriber {
 
     // Prevent a concurrent run for this job in case a previous trigger took 
longer to run.
     // This approach relies on saving the "work in progress" token within the 
job context itself
-    // (see below).
+    // (see below) and relying on killFollowups to signal "work completion".
     if (context.getJobDetail().getJobDataMap().containsKey(path)) {
       CRON_JOB_CONCURRENT_RUNS.incrementAndGet();
-      if (context.getJobDetail().getJobDataMap().get(path) == null) {
+      if (killFollowups.contains(key)) {
+        context.getJobDetail().getJobDataMap().remove(path);
+        killFollowups.remove(key);
+        LOG.info("Resetting job context for cron {}", path);
+      } else {
         LOG.info("Ignoring trigger as another concurrent run is active for 
cron {}", path);
         return;
       }
-
-      context.getJobDetail().getJobDataMap().remove(path);
-      LOG.info("Resetting job context for cron {}", path);
     }
 
     CompletableFuture<NoResult> scheduleResult = 
batchWorker.<NoResult>execute(storeProvider -> {
@@ -200,8 +203,6 @@ class AuroraCronJob implements Job, EventSubscriber {
 
           LOG.info("Waiting for job to terminate before launching cron job " + 
path);
           // Use job detail map to signal a "work in progress" condition to 
subsequent triggers.
-          // A null value indicates that work is still in progress, while a 
non-null value (key in
-          // this case) indicates that the work has completed.
           context.getJobDetail().getJobDataMap().put(path, null);
           batchWorker.executeWithReplay(
               delayedStartBackoff.getBackoffStrategy(),
@@ -217,7 +218,7 @@ class AuroraCronJob implements Job, EventSubscriber {
                 }
               })
               .thenAccept(ignored -> {
-                context.getJobDetail().getJobDataMap().put(path, key);
+                killFollowups.add(key);
                 LOG.info("Finished delayed launch for cron " + path);
               });
           break;

http://git-wip-us.apache.org/repos/asf/aurora/blob/e3f496a6/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java 
b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
index 8ae9bb5..5083fcd 100644
--- 
a/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
+++ 
b/src/test/java/org/apache/aurora/scheduler/cron/quartz/AuroraCronJobTest.java
@@ -50,7 +50,7 @@ import static org.easymock.EasyMock.capture;
 import static org.easymock.EasyMock.eq;
 import static org.easymock.EasyMock.expect;
 import static org.easymock.EasyMock.expectLastCall;
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 public class AuroraCronJobTest extends EasyMockTest {
@@ -157,18 +157,13 @@ public class AuroraCronJobTest extends EasyMockTest {
 
     // Simulate a trigger in progress.
     jobDetails.getJobDataMap().put(JobKeys.canonicalString(AURORA_JOB_KEY), 
null);
-    assertEquals(
-        
jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)),
-        null);
+    assertFalse(jobDetails.getJobDataMap().isEmpty());
 
     // Attempt a concurrent run that must be rejected.
     auroraCronJob.doExecute(context);
 
     // Complete previous run and trigger another one.
     killResult.complete(BatchWorker.NO_RESULT);
-    assertEquals(
-        
jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)),
-        AURORA_JOB_KEY);
     auroraCronJob.doExecute(context);
     assertTrue(jobDetails.getJobDataMap().isEmpty());
   }

Reply via email to