Github user geomacy commented on a diff in the pull request: https://github.com/apache/brooklyn-server/pull/947#discussion_r167329657 --- Diff: software/base/src/main/java/org/apache/brooklyn/entity/software/base/SoftwareProcessImpl.java --- @@ -385,33 +384,76 @@ protected void callRebindHooks() { connectSensors(); } else { long delay = (long) (Math.random() * configuredMaxDelay.toMilliseconds()); - LOG.debug("Scheduled reconnection of sensors on {} in {}ms", this, delay); + LOG.debug("Scheduling reconnection of sensors on {} in {}ms", this, delay); - // This is functionally equivalent to new scheduledExecutor.schedule(job, delay, TimeUnit.MILLISECONDS). - // It uses the entity's execution context to schedule and thus execute the job. - Map<?,?> flags = MutableMap.of("delay", Duration.millis(delay), "maxIterations", 1, "cancelOnException", true); - Callable<Void> job = new Callable<Void>() { - public Void call() { - try { - if (getManagementSupport().isNoLongerManaged()) { - LOG.debug("Entity {} no longer managed; ignoring scheduled connect sensors on rebind", SoftwareProcessImpl.this); - return null; - } - connectSensors(); - } catch (Throwable e) { - LOG.warn("Problem connecting sensors on rebind of "+SoftwareProcessImpl.this, e); - Exceptions.propagateIfFatal(e); - } - return null; - }}; - ScheduledTask scheduledTask = new ScheduledTask(flags, new BasicTask<Void>(job)); - getExecutionContext().submit(scheduledTask); + scheduleConnectSensorsOnRebind(Duration.millis(delay)); } + // don't wait here - it may be long-running, e.g. if remote entity has died, and we don't want to block rebind waiting or cause it to fail // the service will subsequently show service not up and thus failure // waitForServiceUp(); } + protected void scheduleConnectSensorsOnRebind(Duration delay) { + Callable<Void> job = new Callable<Void>() { + public Void call() { + try { + if (!getManagementContext().isRunning()) { + LOG.debug("Management context not running; entity {} ignoring scheduled connect-sensors on rebind", SoftwareProcessImpl.this); + return null; + } + if (getManagementSupport().isNoLongerManaged()) { + LOG.debug("Entity {} no longer managed; ignoring scheduled connect-sensors on rebind", SoftwareProcessImpl.this); + return null; + } + + // Don't call connectSensors until the entity is actually managed. + // See https://issues.apache.org/jira/browse/BROOKLYN-580 + boolean rebindActive = getManagementContext().getRebindManager().isRebindActive(); + if (!getManagementSupport().wasDeployed()) { + if (rebindActive) { + // We are still rebinding, and entity not yet managed - reschedule. + Duration configuredMaxDelay = getConfig(MAXIMUM_REBIND_SENSOR_CONNECT_DELAY); + if (configuredMaxDelay == null) configuredMaxDelay = Duration.millis(100); + long delay = (long) (Math.random() * configuredMaxDelay.toMilliseconds()); + delay = Math.max(10, delay); + LOG.debug("Entity {} not yet managed; re-scheduling connect-sensors on rebind in {}ms", SoftwareProcessImpl.this, delay); + + scheduleConnectSensorsOnRebind(Duration.millis(delay)); --- End diff -- I assume this is safe from the point of view of not risking an infinite loop of recursive calls? E.g. if somehow there were to be an exception in the entity that meant `wasDeployed()` never became true? I take it this would be safe because sooner or later `rebindActive` will be false. I see the comment you added below about this on `rebindTest()` as well. It seems improbable that there would be some combination of circumstances which would prevent `wasDeployed` becoming true but somehow also prevent `rebindActive` becoming false. Nevertheless you might like to think about adding some failsafe "circuit breaker" capability here?
---