prashantpogde commented on a change in pull request #1998:
URL: https://github.com/apache/ozone/pull/1998#discussion_r604587035



##########
File path: hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHDDSUpgrade.java
##########
@@ -305,4 +400,579 @@ public void testFinalizationFromInitialVersionToLatestVersion()
     // Verify that new pipeline can be created with upgraded datanodes.
     testPostUpgradePipelineCreation();
   }
+
+  /*
+   * All the subsequent tests here are failure cases. Some of the tests below
+   * could simultaneously fail one or more nodes at specific execution points
+   * and in different thread contexts.
+   * Upgrade path key execution points are defined in
+   * UpgradeFinalizer:UpgradeTestInjectionPoints.
+   */
+
+  /*
+   * Helper function to inject SCM failure and an SCM restart at a given
+   * execution point during SCM-Upgrade.
+   *
+   * Injects Failure in  : SCM
+   * Executing-Thread-Context : SCM-Upgrade
+   */
+  private Boolean injectSCMFailureDuringSCMUpgrade()
+      throws InterruptedException, TimeoutException, AuthenticationException,
+      IOException {
+    // For some tests this could get called in a different thread context.
+    // We need to guard concurrent updates to the cluster.
+    synchronized(cluster) {
+      cluster.restartStorageContainerManager(true);
+      loadSCMState();
+    }
+    // The ongoing current SCM Upgrade is getting aborted at this point. We
+    // need to schedule a new SCM Upgrade on a different thread context.
+    Thread t = new Thread(new Runnable() {
+      @Override
+      public void run() {
+        try {
+          loadSCMState();
+          scm.finalizeUpgrade("xyz");
+        } catch (IOException e) {
+          e.printStackTrace();
+          Assert.fail(e.getMessage());
+        }
+      }
+    });
+    t.start();
+    return true;
+  }
+
+  /*
+   * Helper function to inject DataNode failures and DataNode restarts at a
+   * given execution point during SCM-Upgrade. Please note that it fails all
+   * the DataNodes in the cluster and is part of test cases that simulate
+   * multi-node failure at specific code-execution points during SCM Upgrade.
+   * Please note that this helper function should be called in the thread
+   * context of an SCM-Upgrade only. The return value signifies that the
+   * currently ongoing SCM upgrade is not aborted. Because this failure
+   * injection does not fail the SCM node and only impacts DataNodes, we do
+   * not need to schedule another scm-finalize-upgrade here.
+   *
+   * Injects Failure in  : All the DataNodes
+   * Executing-Thread-Context : SCM-Upgrade
+   */
+  private Boolean injectDataNodeFailureDuringSCMUpgrade() {
+    try {
+      // Work on a Copy of current set of DataNodes to avoid
+      // running into tricky situations.
+      List<HddsDatanodeService> currentDataNodes =
+          new ArrayList<>(cluster.getHddsDatanodes());
+      for (HddsDatanodeService ds: currentDataNodes) {
+        DatanodeDetails dn = ds.getDatanodeDetails();
+        cluster.restartHddsDatanode(dn, false);
+      }
+      cluster.waitForClusterToBeReady();
+    } catch (Exception e) {
+      LOG.info("DataNode Restarts Failed!");
+      Assert.fail(e.getMessage());
+    }
+    loadSCMState();
+    // returning false from injection function, continues currently ongoing
+    // SCM-Upgrade-Finalization.
+    return false;
+  }
+
+  /*
+   * Helper function to inject a DataNode failure and restart for a specific
+   * DataNode. This injection function can target a specific DataNode and
+   * thus facilitates getting called in the upgrade-finalization thread context
+   * of that specific DataNode.
+   *
+   * Injects Failure in  : Given DataNodes
+   * Executing-Thread-Context : the same DataNode that we are failing here.
+   */
+  private Thread injectDataNodeFailureDuringDataNodeUpgrade(
+      DatanodeDetails dn) {
+    Thread t = null;
+    try {
+      // Schedule the DataNode restart on a separate thread context
+      // otherwise DataNode restart will hang. Also any cluster modification
+      // needs to be guarded since it could get modified in multiple
+      // independent threads.
+      t = new Thread(new Runnable() {
+        @Override
+        public void run() {
+          try {
+            synchronized (cluster) {
+              cluster.restartHddsDatanode(dn, true);
+            }
+          } catch (Exception e) {
+            e.printStackTrace();
+            Assert.fail(e.getMessage());
+          }
+        }
+      });
+    } catch (Exception e) {
+      LOG.info("DataNode Restart Failed!");
+      Assert.fail(e.getMessage());
+    }
+    return t;
+  }
+
+  /*
+   * Helper function to inject coordinated failures and restarts across
+   * all the DataNodes as well as SCM. This can help create targeted test cases
+   * to inject such comprehensive failures in SCM-Upgrade-Context as well as
+   * DataNode-Upgrade-Context.
+   *
+   * Injects Failure in  : SCM as well as ALL the DataNodes.
+   * Executing-Thread-Context : Either the SCM-Upgrade-Finalizer or the
+   *                            DataNode-Upgrade-Finalizer.
+   */
+  private Thread injectSCMAndDataNodeFailureTogetherAtTheSameTime()
+      throws InterruptedException, TimeoutException, AuthenticationException,
+      IOException {
+    // This needs to happen in a separate thread context otherwise
+    // DataNode restart will hang.
+    return new Thread(new Runnable() {
+      @Override
+      public void run() {
+        try {
+          // Since we are modifying cluster in an independent thread context,
+          // we synchronize access to it to avoid concurrent modification
+          // exception.
+          synchronized (cluster) {
+            // Work on a Copy of current set of DataNodes to avoid
+            // running into tricky situations.
+            List<HddsDatanodeService> currentDataNodes =
+                new ArrayList<>(cluster.getHddsDatanodes());
+            for (HddsDatanodeService ds: currentDataNodes) {
+              DatanodeDetails dn = ds.getDatanodeDetails();
+              cluster.restartHddsDatanode(dn, false);
+            }
+            cluster.restartStorageContainerManager(false);
+            cluster.waitForClusterToBeReady();
+          }
+        } catch (Exception e) {
+          e.printStackTrace();
+          Assert.fail(e.getMessage());
+        }
+      }
+    });
+  }
+
+  /*
+   * We have various test cases to target single-node or multi-node failures
+   * below.
+   **/
+
+  /*
+   * One node(SCM) failure case:
+   * Thread-Context : SCM-Upgrade
+   *
+   * Test SCM failure During SCM Upgrade before execution point
+   * "PreFinalizeUpgrade". All meaningful Upgrade execution points
+   * are defined in UpgradeFinalizer:UpgradeTestInjectionPoints.
+   */
+  @Test
+  public void testScmFailuresBeforeScmPreFinalizeUpgrade()
+      throws Exception {
+    scm.getUpgradeFinalizer().configureTestInjectionFunction(
+        BeforePreFinalizeUpgrade,
+        () -> {
+          return injectSCMFailureDuringSCMUpgrade();
+        });

Review comment:
       done.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to