MarkGaox commented on code in PR #2736:
URL: https://github.com/apache/helix/pull/2736#discussion_r1468994177


##########
helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java:
##########
@@ -302,16 +305,36 @@ private Response batchGetStoppableInstances(String 
clusterId, JsonNode node, boo
       ClusterService clusterService =
           new ClusterServiceImpl(getDataAccssor(clusterId), 
getConfigAccessor());
       ClusterTopology clusterTopology = 
clusterService.getClusterTopology(clusterId);
-      StoppableInstancesSelector stoppableInstancesSelector =
+      StoppableInstancesSelector.StoppableInstancesSelectorBuilder builder =
           new StoppableInstancesSelector.StoppableInstancesSelectorBuilder()
               .setClusterId(clusterId)
               .setOrderOfZone(orderOfZone)
               .setCustomizedInput(customizedInput)
               .setMaintenanceService(maintenanceService)
               .setClusterTopology(clusterTopology)
               .setDataAccessor((ZKHelixDataAccessor) getDataAccssor(clusterId))
-              .build();
+              .setContinueOnFailure(continueOnFailures);
+
+      if (notExceedingMaxOfflineInstances) {
+        ClusterConfig clusterConfig = 
getConfigAccessor().getClusterConfig(clusterId);
+        if (clusterConfig == null) {
+          String message =
+              "Invalid cluster name: " + clusterId + ". Cluster config does 
not exist.";
+          _logger.error(message);
+          return badRequest(message);
+        }
+        // If maxOfflineInstancesAllowed is not set, it means there is no 
limit on the number of offline instances.
+        // Therefore, builder sets the maxOfflineInstancesAllowed to the 
default value, Integer.MAX_VALUE.
+        if (clusterConfig.getMaxOfflineInstancesAllowed() != -1) {
+          
builder.setMaxAdditionalOfflineInstances(clusterConfig.getMaxOfflineInstancesAllowed());
+        }
+      }
+
+      StoppableInstancesSelector stoppableInstancesSelector = builder.build();
       stoppableInstancesSelector.calculateOrderOfZone(instances, random);
+      Set<String> invalidInstances = new HashSet<>(toBeStoppedInstances);

Review Comment:
   Good catch!



##########
helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java:
##########
@@ -129,32 +136,53 @@ public ObjectNode 
getStoppableInstancesCrossZones(List<String> instances,
       if (instanceSet.isEmpty()) {
         continue;
       }
-      populateStoppableInstances(new ArrayList<>(instanceSet), 
toBeStoppedInstancesSet, stoppableInstances,
-          failedStoppableInstances);
+      populateStoppableInstances(new ArrayList<>(instanceSet), 
toBeStoppedInstancesSet,
+          stoppableInstances, failedStoppableInstances,
+          _maxAdditionalOfflineInstances - toBeStoppedInstancesSet.size());
     }
     processNonexistentInstances(instances, failedStoppableInstances);
     return result;
   }
 
   private void populateStoppableInstances(List<String> instances, Set<String> 
toBeStoppedInstances,
-      ArrayNode stoppableInstances, ObjectNode failedStoppableInstances) 
throws IOException {
+      ArrayNode stoppableInstances, ObjectNode failedStoppableInstances,
+      int allowedOfflineInstances) throws IOException {
     Map<String, StoppableCheck> instancesStoppableChecks =
         _maintenanceService.batchGetInstancesStoppableChecks(_clusterId, 
instances,
             _customizedInput, toBeStoppedInstances);
 
     for (Map.Entry<String, StoppableCheck> instanceStoppableCheck : 
instancesStoppableChecks.entrySet()) {
       String instance = instanceStoppableCheck.getKey();
       StoppableCheck stoppableCheck = instanceStoppableCheck.getValue();
-      if (!stoppableCheck.isStoppable()) {
-        ArrayNode failedReasonsNode = 
failedStoppableInstances.putArray(instance);
-        for (String failedReason : stoppableCheck.getFailedChecks()) {
-          
failedReasonsNode.add(JsonNodeFactory.instance.textNode(failedReason));
-        }
-      } else {
+      if (stoppableCheck.isStoppable() && allowedOfflineInstances > 0) {
         stoppableInstances.add(instance);
         // Update the toBeStoppedInstances set with the currently identified 
stoppable instance.
         // This ensures that subsequent checks in other zones are aware of 
this instance's stoppable status.
         toBeStoppedInstances.add(instance);
+        allowedOfflineInstances--;
+        continue;
+      }
+      ArrayNode failedReasonsNode = 
failedStoppableInstances.putArray(instance);
+      boolean failedHelixOwnChecks = false;
+      if (allowedOfflineInstances <= 0) {

Review Comment:
   It won't work if the size of `toBeStoppedInstances` is greater than 
`maxAdditionalOfflineInstance`.



##########
helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java:
##########
@@ -129,32 +136,53 @@ public ObjectNode 
getStoppableInstancesCrossZones(List<String> instances,
       if (instanceSet.isEmpty()) {
         continue;
       }
-      populateStoppableInstances(new ArrayList<>(instanceSet), 
toBeStoppedInstancesSet, stoppableInstances,
-          failedStoppableInstances);
+      populateStoppableInstances(new ArrayList<>(instanceSet), 
toBeStoppedInstancesSet,
+          stoppableInstances, failedStoppableInstances,
+          _maxAdditionalOfflineInstances - toBeStoppedInstancesSet.size());
     }
     processNonexistentInstances(instances, failedStoppableInstances);
     return result;
   }
 
   private void populateStoppableInstances(List<String> instances, Set<String> 
toBeStoppedInstances,
-      ArrayNode stoppableInstances, ObjectNode failedStoppableInstances) 
throws IOException {
+      ArrayNode stoppableInstances, ObjectNode failedStoppableInstances,
+      int allowedOfflineInstances) throws IOException {
     Map<String, StoppableCheck> instancesStoppableChecks =
         _maintenanceService.batchGetInstancesStoppableChecks(_clusterId, 
instances,
             _customizedInput, toBeStoppedInstances);
 
     for (Map.Entry<String, StoppableCheck> instanceStoppableCheck : 
instancesStoppableChecks.entrySet()) {
       String instance = instanceStoppableCheck.getKey();
       StoppableCheck stoppableCheck = instanceStoppableCheck.getValue();
-      if (!stoppableCheck.isStoppable()) {
-        ArrayNode failedReasonsNode = 
failedStoppableInstances.putArray(instance);
-        for (String failedReason : stoppableCheck.getFailedChecks()) {
-          
failedReasonsNode.add(JsonNodeFactory.instance.textNode(failedReason));
-        }
-      } else {
+      if (stoppableCheck.isStoppable() && allowedOfflineInstances > 0) {
         stoppableInstances.add(instance);
         // Update the toBeStoppedInstances set with the currently identified 
stoppable instance.
         // This ensures that subsequent checks in other zones are aware of 
this instance's stoppable status.
         toBeStoppedInstances.add(instance);
+        allowedOfflineInstances--;

Review Comment:
   Good catch. I will count the number of offline instances in the current 
cluster and set `maxAdditionalOfflineInstance` = `maxAllowedOfflineInstance` - 
`currentOfflineInstance`.



##########
helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java:
##########
@@ -302,16 +305,36 @@ private Response batchGetStoppableInstances(String 
clusterId, JsonNode node, boo
       ClusterService clusterService =
           new ClusterServiceImpl(getDataAccssor(clusterId), 
getConfigAccessor());
       ClusterTopology clusterTopology = 
clusterService.getClusterTopology(clusterId);
-      StoppableInstancesSelector stoppableInstancesSelector =
+      StoppableInstancesSelector.StoppableInstancesSelectorBuilder builder =
           new StoppableInstancesSelector.StoppableInstancesSelectorBuilder()
               .setClusterId(clusterId)
               .setOrderOfZone(orderOfZone)
               .setCustomizedInput(customizedInput)
               .setMaintenanceService(maintenanceService)
               .setClusterTopology(clusterTopology)
               .setDataAccessor((ZKHelixDataAccessor) getDataAccssor(clusterId))
-              .build();
+              .setContinueOnFailure(continueOnFailures);
+
+      if (notExceedingMaxOfflineInstances) {
+        ClusterConfig clusterConfig = 
getConfigAccessor().getClusterConfig(clusterId);
+        if (clusterConfig == null) {
+          String message =
+              "Invalid cluster name: " + clusterId + ". Cluster config does 
not exist.";
+          _logger.error(message);
+          return badRequest(message);
+        }
+        // If maxOfflineInstancesAllowed is not set, it means there is no 
limit on the number of offline instances.
+        // Therefore, builder sets the maxOfflineInstancesAllowed to the 
default value, Integer.MAX_VALUE.
+        if (clusterConfig.getMaxOfflineInstancesAllowed() != -1) {
+          
builder.setMaxAdditionalOfflineInstances(clusterConfig.getMaxOfflineInstancesAllowed());

Review Comment:
   I think an even more reasonable solution is to not allow users to run a 
stoppableCheck if they haven't provided `maxOfflineInstancesAllowed` in their 
cluster config. What do you think?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to