sashapolo commented on code in PR #784:
URL: https://github.com/apache/ignite-3/pull/784#discussion_r857365488
##########
modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java:
##########
@@ -396,90 +390,83 @@ public CompletableFuture<Ignite> start(@Language("HOCON")
@Nullable String cfg)
nodeCfgMgr.configurationRegistry().initializeDefaults();
}
- waitForJoinPermission();
-
- // Start the remaining components.
- List<IgniteComponent> otherComponents = List.of(
+ // Start the components that are required to join the cluster.
+ lifecycleManager.startComponents(
nettyBootstrapFactory,
clusterSvc,
restComponent,
- computeComponent,
raftMgr,
- txManager,
- clusterCfgMgr,
- cmgMgr,
- metaStorageMgr,
- baselineMgr,
- dataStorageMgr,
- distributedTblMgr,
- qryEngine,
- clientHandlerModule
- );
-
- for (IgniteComponent component : otherComponents) {
- doStartComponent(name, startedComponents, component);
- }
-
- CompletableFuture<Void> configurationCatchUpFuture =
RecoveryCompletionFutureFactory.create(
- metaStorageMgr,
- clusterCfgMgr,
- fut -> new ConfigurationCatchUpListener(cfgStorage, fut,
LOG)
+ cmgMgr
);
- notifyConfigurationListeners();
-
- // Deploy all registered watches because all components are ready
and have registered their listeners.
- metaStorageMgr.deployWatches();
-
- if (!status.compareAndSet(Status.STARTING, Status.STARTED)) {
- throw new NodeStoppingException();
- }
-
- return configurationCatchUpFuture.thenApply(v -> this);
+ return cmgMgr.joinFuture()
+ .thenAccept(v -> {
+ // Start all other components after the join request
has completed and the node has been validated.
+ try {
+ lifecycleManager.startComponents(
+ metaStorageMgr,
+ clusterCfgMgr,
+ computeComponent,
+ txManager,
+ baselineMgr,
+ dataStorageMgr,
+ distributedTblMgr,
+ qryEngine,
+ clientHandlerModule
+ );
+
+ // Deploy all registered watches because all
components are ready and have registered their listeners.
+ metaStorageMgr.deployWatches();
+ } catch (NodeStoppingException e) {
+ throw new CompletionException(e);
+ }
+ })
+ .thenCompose(v -> {
+ // Recovery future must be created before
configuration listeners are triggered.
+ CompletableFuture<Void> recoveryFuture =
RecoveryCompletionFutureFactory.create(
+ clusterCfgMgr,
+ fut -> new
ConfigurationCatchUpListener(cfgStorage, fut, LOG)
+ );
+
+ return
CompletableFuture.allOf(notifyConfigurationListeners(), recoveryFuture);
+ })
+ // Signal that local recovery is complete and the node is
ready to join the cluster.
+ .thenCompose(v -> cmgMgr.onJoinReady())
+ .thenAccept(v -> {
+ try {
+ // Transfer the node to the STARTED state.
+ lifecycleManager.onStartComplete();
+ } catch (NodeStoppingException e) {
+ throw new CompletionException(e);
+ }
+ })
+ .handle((v, e) -> {
+ if (e != null) {
+ throw handleStartException(e);
+ }
+
+ return this;
+ });
} catch (Exception e) {
- String errMsg = "Unable to start node=[" + name + "].";
+ throw handleStartException(e);
+ }
+ }
- LOG.error(errMsg, e);
+ private RuntimeException handleStartException(Throwable e) {
+ String errMsg = "Unable to start node=[" + name + "].";
- doStopNode(startedComponents);
+ LOG.error(errMsg, e);
- throw new IgniteException(errMsg, e);
- }
- }
+ lifecycleManager.stopAllComponents();
- /**
- * Awaits for a permission to join the cluster, i.e. node join response
from Cluster Management group.
- * After the completion of this method, the node is considered as
validated.
- */
- private void waitForJoinPermission() {
- // TODO: implement, see
https://issues.apache.org/jira/browse/IGNITE-16472
+ return new IgniteException(errMsg, e);
}
/**
* Stops ignite node.
*/
public void stop() {
- if (status.getAndSet(Status.STOPPING) == Status.STARTED) {
- doStopNode(List.of(
- longJvmPauseDetector,
- vaultMgr,
- nodeCfgMgr,
- clusterSvc,
- computeComponent,
- raftMgr,
- txManager,
- metaStorageMgr,
- clusterCfgMgr,
- cmgMgr,
- baselineMgr,
- dataStorageMgr,
- distributedTblMgr,
- qryEngine,
- restComponent,
- clientHandlerModule,
- nettyBootstrapFactory
- ));
- }
+ lifecycleManager.stopNode();
Review Comment:
I've fixed this comment, but then I remembered that I actually did this on
purpose: it is possible that a node may hang during join in which case it will
never acquire the STARTED state. This will make it impossible to stop it in
"afterEach" in tests, for example
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]