kfaraz commented on code in PR #19541:
URL: https://github.com/apache/druid/pull/19541#discussion_r3401743639
##########
indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java:
##########
@@ -186,14 +186,133 @@ public boolean
createOrUpdateAndStartSupervisor(SupervisorSpec spec)
synchronized (lock) {
Preconditions.checkState(started, "SupervisorManager not started");
- final boolean shouldUpdateSpec = shouldUpdateSupervisor(spec);
+ // Persist whenever the spec actually changed (or is new) — independent
of whether a restart is
+ // required. This stops/recreates the supervisor regardless; persistence
must not be gated on the
+ // restart decision, otherwise a no-restart change (e.g. taskCount under
autoscaling) would be
+ // applied to the running supervisor but lost from the metadata store.
+ final boolean specChanged = isSpecChangedAndValidate(spec);
SupervisorSpec existingSpec =
possiblyStopAndRemoveSupervisorInternal(spec.getId(), false);
spec.merge(existingSpec);
- createAndStartSupervisorInternal(spec, shouldUpdateSpec);
- return shouldUpdateSpec;
+ createAndStartSupervisorInternal(spec, specChanged);
+ return specChanged;
}
}
+ /**
+ * Result of applying a submitted supervisor spec. {@code modified} means
the persisted spec changed;
+ * {@code restarted} means the running supervisor was stopped and recreated.
+ */
+ public static final class SpecUpdateResult
+ {
+ private final boolean modified;
+ private final boolean restarted;
+
+ private SpecUpdateResult(final boolean modified, final boolean restarted)
+ {
+ this.modified = modified;
+ this.restarted = restarted;
+ }
+
+ public static SpecUpdateResult of(final boolean modified, final boolean
restarted)
+ {
+ return new SpecUpdateResult(modified, restarted);
+ }
+
+ public boolean isModified()
+ {
+ return modified;
+ }
+
+ public boolean isRestarted()
+ {
+ return restarted;
+ }
+ }
+
+ /**
+ * Decides whether the submitted spec needs a restart and applies it under a
single lock, so the decision
+ * cannot go stale between deciding and acting (which would let a concurrent
POST drop a write or persist a
+ * spec that the running supervisor needs to be recreated for). With {@code
skipRestartIfUnmodified} set, an
+ * unchanged spec is a no-op and a changed spec whose {@link
SupervisorSpec#requireRestart} is false (e.g. a
+ * taskCount change under autoscaling) is persisted without recreating the
supervisor; otherwise the
+ * supervisor is stopped and recreated (the only behavior when the flag is
false).
+ */
+ public SpecUpdateResult createOrUpdateAndStartSupervisor(
Review Comment:
It seems error-prone to have two flavors of the
`createOrUpdateAndStartSupervisor` method.
Please try to merge them into a single method and pass an appropriate value
for the skip-restart flag where needed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]