Mmuzaf commented on a change in pull request #7714:
URL: https://github.com/apache/ignite/pull/7714#discussion_r413699375



##########
File path: 
modules/core/src/main/java/org/apache/ignite/internal/util/distributed/DistributedProcess.java
##########
@@ -257,11 +258,20 @@ private void sendSingleMessage(Process p) {
         SingleNodeMessage<R> singleMsg = new SingleNodeMessage<>(p.id, type, 
p.resFut.result(),
             (Exception)p.resFut.error());
 
-        if (F.eq(ctx.localNodeId(), p.crdId))
-            onSingleNodeMessageReceived(singleMsg, p.crdId);
+        UUID crdId = p.crdId;
+
+        if (F.eq(ctx.localNodeId(), crdId))
+            onSingleNodeMessageReceived(singleMsg, crdId);
         else {
             try {
-                ctx.io().sendToGridTopic(p.crdId, 
GridTopic.TOPIC_DISTRIBUTED_PROCESS, singleMsg, SYSTEM_POOL);
+                ctx.io().sendToGridTopic(crdId, 
GridTopic.TOPIC_DISTRIBUTED_PROCESS, singleMsg, SYSTEM_POOL);
+            }
+            catch (ClusterTopologyCheckedException e) {
+                // The coordinator was failed. The single message will be sent 
when a new coordinator initilized.

Review comment:
       initilized -> initialized

##########
File path: 
modules/core/src/main/java/org/apache/ignite/internal/util/distributed/DistributedProcess.java
##########
@@ -257,11 +258,20 @@ private void sendSingleMessage(Process p) {
         SingleNodeMessage<R> singleMsg = new SingleNodeMessage<>(p.id, type, 
p.resFut.result(),
             (Exception)p.resFut.error());
 
-        if (F.eq(ctx.localNodeId(), p.crdId))
-            onSingleNodeMessageReceived(singleMsg, p.crdId);
+        UUID crdId = p.crdId;
+
+        if (F.eq(ctx.localNodeId(), crdId))
+            onSingleNodeMessageReceived(singleMsg, crdId);
         else {
             try {
-                ctx.io().sendToGridTopic(p.crdId, 
GridTopic.TOPIC_DISTRIBUTED_PROCESS, singleMsg, SYSTEM_POOL);
+                ctx.io().sendToGridTopic(crdId, 
GridTopic.TOPIC_DISTRIBUTED_PROCESS, singleMsg, SYSTEM_POOL);
+            }
+            catch (ClusterTopologyCheckedException e) {
+                // The coordinator was failed. The single message will be sent 
when a new coordinator initilized.
+                if (log.isDebugEnabled()) {
+                    log.debug("Failed to send a single message to coordinator: 
[crdId=" + crdId +
+                        ", processId=" + p.id +", error=" + e.getMessage() + 
']');
+                }
             }
             catch (IgniteCheckedException e) {
                 log.error("Unable to send message to coordinator.", e);

Review comment:
       Can you please check whether we need logging here? It seems `FailureHandler` 
will log exactly the same message.

##########
File path: 
modules/core/src/main/java/org/apache/ignite/internal/util/distributed/DistributedProcess.java
##########
@@ -367,6 +377,9 @@ private Process(UUID id) {
 
     /** Defines distributed processes. */
     public enum DistributedProcessType {
+        /** For test purpose only. */

Review comment:
       Personally, I don't support such cases:
   - We can use the `DEFAULT` type of the distributed process, which can be used 
on all nodes. It would also be nice to mention that a newly created distributed 
process must be registered on all nodes (e.g. via compute?).
   - Existing types can also be reused in tests.

##########
File path: 
modules/core/src/test/java/org/apache/ignite/internal/util/DistributedProcessCoordinatorLeftTest.java
##########
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.util;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.UUID;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.apache.ignite.Ignite;
+import org.apache.ignite.configuration.IgniteConfiguration;
+import org.apache.ignite.failure.FailureContext;
+import org.apache.ignite.failure.FailureHandler;
+import org.apache.ignite.internal.IgniteEx;
+import org.apache.ignite.internal.util.distributed.DistributedProcess;
+import org.apache.ignite.internal.util.typedef.G;
+import org.apache.ignite.testframework.GridTestUtils;
+import org.apache.ignite.testframework.ListeningTestLogger;
+import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
+import org.junit.Test;
+
+import static java.util.concurrent.TimeUnit.MILLISECONDS;
+import static org.apache.ignite.events.EventType.EVT_NODE_FAILED;
+import static org.apache.ignite.events.EventType.EVT_NODE_LEFT;
+import static 
org.apache.ignite.internal.util.distributed.DistributedProcess.DistributedProcessType.TEST_PROCESS;
+
+/**
+ * Tests {@link DistributedProcess} in case of coordinator node left.
+ */
+public class DistributedProcessCoordinatorLeftTest extends 
GridCommonAbstractTest {
+    /** */
+    public static final long TIMEOUT = 20_000L;
+
+    /** */
+    public static final int NODES_CNT = 3;
+
+    /** Latch to send single message on node left. */
+    private final CountDownLatch nodeLeftLatch = new CountDownLatch(NODES_CNT 
- 1);
+
+    /** Latch to await sending single messages to a failed coordinator. */
+    private final CountDownLatch msgSendLatch = new CountDownLatch(NODES_CNT - 
1);
+
+    /** Failure handler invocation flag. */
+    private final AtomicBoolean failure = new AtomicBoolean();
+
+    /** */
+    private final ListeningTestLogger listeningLog = new 
ListeningTestLogger(true, log);
+
+    /** {@inheritDoc} */
+    @Override protected IgniteConfiguration getConfiguration(String 
igniteInstanceName) throws Exception {
+        IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName);
+
+        cfg.setGridLogger(listeningLog);
+
+        cfg.setLocalEventListeners(Collections.singletonMap(event -> {
+            nodeLeftLatch.countDown();
+
+            try {
+                msgSendLatch.await();
+            }
+            catch (InterruptedException e) {
+                fail("Unexpected interrupt.");
+            }
+
+            return false;
+        }, new int[] {EVT_NODE_LEFT, EVT_NODE_FAILED}));
+
+        cfg.setFailureHandler(new FailureHandler() {
+            @Override public boolean onFailure(Ignite ignite, FailureContext 
failureCtx) {
+                failure.set(true);
+
+                return false;
+            }
+        });
+
+        return cfg;
+    }
+
+    /**
+     * Tests that coordinator failing during sending single result not cause 
node failure and the process finishes.
+     *
+     * <ol>
+     *  <li>Start new process of {@link DistributedProcess}.</li>
+     *  <li>The coordinator fails.</li>
+     *  <li>Nodes try to send a single message to the not-alive 
coordinator.</li>
+     *  <li>{@link DistributedProcess} process a node left event and 
reinitialize a new coordinator.</li>
+     *  <li>Process finishes.</li>
+     * </ol>
+     *
+     * @throws Exception If failed.
+     */
+    @Test
+    public void testCoordinatorFailed() throws Exception {
+        startGrids(NODES_CNT);
+
+        CountDownLatch startLatch = new CountDownLatch(NODES_CNT);
+        CountDownLatch finishLatch = new CountDownLatch(NODES_CNT - 1);
+
+        HashMap<String, DistributedProcess<Integer, Integer>> processes = new 
HashMap<>();
+
+        int processRes = 1;
+
+        for (Ignite grid : G.allGrids()) {
+            DistributedProcess<Integer, Integer> dp = new 
DistributedProcess<>(((IgniteEx)grid).context(), TEST_PROCESS,
+                req -> GridTestUtils.runAsync(() -> {
+                    startLatch.countDown();

Review comment:
       Should we release the latch only after the `single-message` future has 
completed? It's still possible that a new coordinator will be assigned before 
the single message is sent.

##########
File path: 
modules/core/src/main/java/org/apache/ignite/internal/util/distributed/DistributedProcess.java
##########
@@ -257,11 +258,20 @@ private void sendSingleMessage(Process p) {
         SingleNodeMessage<R> singleMsg = new SingleNodeMessage<>(p.id, type, 
p.resFut.result(),
             (Exception)p.resFut.error());
 
-        if (F.eq(ctx.localNodeId(), p.crdId))
-            onSingleNodeMessageReceived(singleMsg, p.crdId);
+        UUID crdId = p.crdId;
+
+        if (F.eq(ctx.localNodeId(), crdId))
+            onSingleNodeMessageReceived(singleMsg, crdId);
         else {
             try {
-                ctx.io().sendToGridTopic(p.crdId, 
GridTopic.TOPIC_DISTRIBUTED_PROCESS, singleMsg, SYSTEM_POOL);
+                ctx.io().sendToGridTopic(crdId, 
GridTopic.TOPIC_DISTRIBUTED_PROCESS, singleMsg, SYSTEM_POOL);
+            }
+            catch (ClusterTopologyCheckedException e) {
+                // The coordinator was failed. The single message will be sent 
when a new coordinator initilized.

Review comment:
       was failed -> has failed?

##########
File path: 
modules/core/src/test/java/org/apache/ignite/testsuites/IgniteUtilSelfTestSuite.java
##########
@@ -136,7 +137,10 @@
     // control.sh
     CommandHandlerParsingTest.class,
 
-    GridCountDownCallbackTest.class
+    GridCountDownCallbackTest.class,
+
+    // Distributed process.

Review comment:
       I don't think we need such comments :-)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to