siddharthteotia commented on a change in pull request #4446: Add support in the 
rebalancer for the user to provide minimum number of serving replicas
URL: https://github.com/apache/incubator-pinot/pull/4446#discussion_r308990986
 
 

 ##########
 File path: 
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/TableRebalancer.java
 ##########
 @@ -139,112 +153,293 @@ public RebalanceResult rebalance(TableConfig 
tableConfig, RebalanceSegmentStrate
       }
 
       if (EqualityUtils.isEqual(targetIdealState, currentIdealState)) {
-        LOGGER.info("Table {} is rebalanced.", tableNameWithType);
-
         LOGGER.info("Finished rebalancing table {} in {} ms.", 
tableNameWithType,
             TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime));
         
result.setIdealStateMapping(targetIdealState.getRecord().getMapFields());
         result.setPartitionAssignment(targetPartitionAssignment);
+        result.setStatus("Successfully finished rebalancing");
+        result.setStatusCode(RebalanceResult.RebalanceStatus.DONE);
         return result;
       }
 
       // if ideal state needs to change, get the next 'safe' state (based on 
whether downtime is OK or not)
       IdealState nextIdealState = getNextState(currentIdealState, 
targetIdealState, rebalanceConfig);
+      LOGGER.info("Got new ideal state after making changes to partition map 
of all segments. Will attempt to persist this in ZK. Logging the difference 
between new and target ideal state");
+      printIdealStateDifference(targetIdealState, nextIdealState);
 
       // If the ideal state is large enough, enable compression
       if (HelixHelper.MAX_PARTITION_COUNT_IN_UNCOMPRESSED_IDEAL_STATE < 
nextIdealState.getPartitionSet().size()) {
         nextIdealState.getRecord().setBooleanField("enableCompression", true);
       }
 
-      // Check version and set ideal state
+      // Check version and set ideal state to nextIdealState
       try {
-        LOGGER.info("Updating IdealState for table {}", tableNameWithType);
+        LOGGER.info("Going to update current IdealState in ZK for table {}, 
current version {} and creation time {}",
+            tableNameWithType, currentIdealState.getRecord().getVersion(), 
currentIdealState.getRecord().getCreationTime());
         if (zkBaseDataAccessor
             .set(idealStateKey.getPath(), nextIdealState.getRecord(), 
currentIdealState.getRecord().getVersion(),
                 AccessOption.PERSISTENT)) {
+          LOGGER.info("Successfully persisted the ideal state for table {} in 
ZK. Will wait for External view to converge",
+              tableNameWithType);
+          ++_rebalancerStats.updatestoIdealStateInZK;
           // if we succeeded, wait for the change to stabilize
           waitForStable(tableNameWithType);
           // clear retries as it tracks failures with each idealstate update 
attempt
           retries = 0;
+          LOGGER.info("External view converged for the change in ideal state 
for table {}. Will start the next iteration (if any)",
+              tableNameWithType);
           continue;
         }
-        // in case of any error, we retry a bounded number of types
+        // in case of any error, we retry a bounded number of times
       } catch (ZkBadVersionException e) {
-        LOGGER.warn("Version changed while updating ideal state for resource: 
{}", tableNameWithType);
+        // we will go back in the loop and reattempt by recomputing the target 
ideal state
+        LOGGER.info("Version changed while updating ideal state for resource: 
{}, was expecting version {}",
+            tableNameWithType, currentIdealState.getRecord().getVersion());
       } catch (Exception e) {
-        LOGGER.warn("Caught exception while updating ideal state for resource: 
{}", tableNameWithType, e);
+        if (e instanceof IllegalStateException && e.getCause() instanceof 
ExternalViewErrored) {
+          if (e.getCause() instanceof ExternalViewErrored) {
+            LOGGER.error("External view reported error for table {} after 
updating ideal state", tableNameWithType);
+          } else if (e.getCause() instanceof ExternalViewConvergeTimeout) {
+            LOGGER.error("Timedout while waiting for external view to converge 
for table {}", tableNameWithType);
+          }
+          // remove the cause as it is private and is only used to detect 
exact error within this class
+          throw new IllegalStateException(e.getMessage());
+        } else {
+          LOGGER.error("Caught exception while or after updating ideal state 
for resource: {}", tableNameWithType, e);
+          throw e;
+        }
       }
 
+      // if we are here, we failed to persist the updated ideal state in ZK
+      // due to version mismatch, will attempt again
       previousIdealState = currentIdealState;
       if (retries++ > MAX_RETRIES) {
         LOGGER.error("Unable to rebalance table {} in {} attempts. Giving up", 
tableNameWithType, MAX_RETRIES);
+        result.setStatus("Cancelling the rebalance operation as we have 
exhausted the max number of retries after ZK version mismatch");
+        result.setStatusCode(RebalanceResult.RebalanceStatus.FAILED);
         return result;
       }
       // wait before retrying
       try {
         Thread.sleep(retryDelayMs);
       } catch (InterruptedException e) {
         LOGGER.error("Got interrupted while rebalancing table {}", 
tableNameWithType);
+        result.setStatus("Rebalancer got interrupted");
+        result.setStatusCode(RebalanceResult.RebalanceStatus.FAILED);
         Thread.currentThread().interrupt();
         return result;
       }
     }
   }
 
+  private void validateMinReplicas(final String tableNameWithType,
 
 Review comment:
   Done. The code no longer iterates over the ideal state; it now just uses 
idealState.getReplicas().

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to