This is an automated email from the ASF dual-hosted git repository. jxue pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/helix.git
commit 4f863c3549631b5e9fccc6f40b513fff6fe435fa Author: narendly <[email protected]> AuthorDate: Mon Feb 25 18:09:08 2019 -0800 [HELIX-801] HELIX: Implement maintenance history for maintenance mode This diff implements maintenance history for entering and exiting maintenance mode. Changelist: 1. Implement a separate DataUpdater for LeaderHistory ZNode update 2. Implement recording of maintenance history in LeaderHistory ZNode 3. Fix the bug where only the last few history entries are kept --- .../main/java/org/apache/helix/PropertyKey.java | 6 +- .../java/org/apache/helix/PropertyPathBuilder.java | 4 +- .../helix/controller/GenericHelixController.java | 3 + .../stages/MaintenanceRecoveryStage.java | 1 - .../manager/zk/DistributedLeaderElection.java | 29 ++- .../org/apache/helix/manager/zk/ZKHelixAdmin.java | 32 +++- .../org/apache/helix/model/ControllerHistory.java | 212 +++++++++++++++++++++ .../java/org/apache/helix/model/LeaderHistory.java | 118 ------------ .../org/apache/helix/model/MaintenanceSignal.java | 2 +- .../controller/TestClusterMaintenanceMode.java | 98 ++++++++-- .../controller/TestControllerHistory.java | 14 +- .../server/resources/helix/ClusterAccessor.java | 4 +- 12 files changed, 357 insertions(+), 166 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/PropertyKey.java b/helix-core/src/main/java/org/apache/helix/PropertyKey.java index 1ba3108..e16198d 100644 --- a/helix-core/src/main/java/org/apache/helix/PropertyKey.java +++ b/helix-core/src/main/java/org/apache/helix/PropertyKey.java @@ -30,7 +30,7 @@ import org.apache.helix.model.HealthStat; import org.apache.helix.model.HelixConfigScope.ConfigScopeProperty; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; -import org.apache.helix.model.LeaderHistory; +import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.LiveInstance; import org.apache.helix.model.MaintenanceSignal; import org.apache.helix.model.Message; @@ -656,11 +656,11 @@ public class PropertyKey { } /** - * Get a property key associated with {@link LeaderHistory} + * Get a property key associated with {@link ControllerHistory} * @return {@link PropertyKey} */ public PropertyKey controllerLeaderHistory() { - return new PropertyKey(HISTORY, LeaderHistory.class, _clusterName); + return new PropertyKey(HISTORY, ControllerHistory.class, _clusterName); } /** diff --git a/helix-core/src/main/java/org/apache/helix/PropertyPathBuilder.java b/helix-core/src/main/java/org/apache/helix/PropertyPathBuilder.java index e85431c..2bde247 100644 --- a/helix-core/src/main/java/org/apache/helix/PropertyPathBuilder.java +++ b/helix-core/src/main/java/org/apache/helix/PropertyPathBuilder.java @@ -29,7 +29,7 @@ import org.apache.helix.model.CurrentState; import org.apache.helix.model.ExternalView; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; -import org.apache.helix.model.LeaderHistory; +import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.LiveInstance; import org.apache.helix.model.MaintenanceSignal; import org.apache.helix.model.Message; @@ -63,7 +63,7 @@ public class PropertyPathBuilder { typeToClassMapping.put(MESSAGES, Message.class); typeToClassMapping.put(CURRENTSTATES, CurrentState.class); typeToClassMapping.put(STATUSUPDATES, StatusUpdate.class); - typeToClassMapping.put(HISTORY, LeaderHistory.class); + typeToClassMapping.put(HISTORY, ControllerHistory.class); typeToClassMapping.put(PAUSE, PauseSignal.class); typeToClassMapping.put(MAINTENANCE, MaintenanceSignal.class); // TODO: Below must handle the case for future versions of Task Framework with a different path diff --git a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java index 1ba26f1..dda4552 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java +++ b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java @@ -304,6 +304,9 @@ public class GenericHelixController implements IdealStateChangeListener, Pipeline rebalancePipeline = new Pipeline(pipelineName); rebalancePipeline.addStage(new BestPossibleStateCalcStage()); rebalancePipeline.addStage(new IntermediateStateCalcStage()); + // Need to add MaintenanceRecoveryStage here because MAX_PARTITIONS_PER_INSTANCE check could + // only occur after IntermediateStateCalcStage calculation + rebalancePipeline.addStage(new MaintenanceRecoveryStage()); rebalancePipeline.addStage(new ResourceMessageGenerationPhase()); rebalancePipeline.addStage(new MessageSelectionStage()); rebalancePipeline.addStage(new MessageThrottleStage()); diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java index 11d3c92..e366ffa 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java @@ -126,7 +126,6 @@ public class MaintenanceRecoveryStage extends AbstractAsyncBaseStage { // Config is not set; return return false; } - Map<String, PartitionStateMap> resourceStatesMap = intermediateStateOutput.getResourceStatesMap(); Map<String, Integer> instancePartitionCounts = new HashMap<>(); diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/DistributedLeaderElection.java b/helix-core/src/main/java/org/apache/helix/manager/zk/DistributedLeaderElection.java index a8fb704..9d9d522 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/DistributedLeaderElection.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/DistributedLeaderElection.java @@ -22,6 +22,8 @@ package org.apache.helix.manager.zk; import java.lang.management.ManagementFactory; import java.util.List; +import org.I0Itec.zkclient.DataUpdater; +import org.apache.helix.AccessOption; import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixException; import org.apache.helix.HelixManager; @@ -30,9 +32,10 @@ import org.apache.helix.InstanceType; import org.apache.helix.NotificationContext; import org.apache.helix.PropertyKey.Builder; import org.apache.helix.PropertyType; +import org.apache.helix.ZNRecord; import org.apache.helix.api.listeners.ControllerChangeListener; import org.apache.helix.controller.GenericHelixController; -import org.apache.helix.model.LeaderHistory; +import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.LiveInstance; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -156,14 +159,22 @@ public class DistributedLeaderElection implements ControllerChangeListener { private void updateHistory(HelixManager manager) { HelixDataAccessor accessor = manager.getHelixDataAccessor(); Builder keyBuilder = accessor.keyBuilder(); - - LeaderHistory history = accessor.getProperty(keyBuilder.controllerLeaderHistory()); - if (history == null) { - history = new LeaderHistory(PropertyType.HISTORY.toString()); - } - history - .updateHistory(manager.getClusterName(), manager.getInstanceName(), manager.getVersion()); - if (!accessor.setProperty(keyBuilder.controllerLeaderHistory(), history)) { + final String clusterName = manager.getClusterName(); + final String instanceName = manager.getInstanceName(); + final String version = manager.getVersion(); + + // Record a MaintenanceSignal history + if (!accessor.getBaseDataAccessor().update(keyBuilder.controllerLeaderHistory().getPath(), + new DataUpdater<ZNRecord>() { + @Override + public ZNRecord update(ZNRecord oldRecord) { + if (oldRecord == null) { + oldRecord = new ZNRecord(PropertyType.HISTORY.toString()); + } + return new ControllerHistory(oldRecord).updateHistory(clusterName, instanceName, + version); + } + }, AccessOption.PERSISTENT)) { LOG.error("Failed to persist leader history to ZK!"); } } diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index d388afe..5a1310a 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -60,6 +60,7 @@ import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.ClusterConstraints; import org.apache.helix.model.ClusterConstraints.ConstraintType; import org.apache.helix.model.ConstraintItem; +import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.CurrentState; import org.apache.helix.model.ExternalView; import org.apache.helix.model.HelixConfigScope; @@ -417,9 +418,9 @@ public class ZKHelixAdmin implements HelixAdmin { * @param customFields * @param triggeringEntity */ - private void processMaintenanceMode(String clusterName, boolean enabled, String reason, - MaintenanceSignal.AutoTriggerReason internalReason, Map<String, String> customFields, - MaintenanceSignal.TriggeringEntity triggeringEntity) { + private void processMaintenanceMode(String clusterName, final boolean enabled, final String reason, + final MaintenanceSignal.AutoTriggerReason internalReason, final Map<String, String> customFields, + final MaintenanceSignal.TriggeringEntity triggeringEntity) { HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor<ZNRecord>(_zkClient)); Builder keyBuilder = accessor.keyBuilder(); @@ -427,6 +428,7 @@ public class ZKHelixAdmin implements HelixAdmin { triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically" : "manually", enabled ? "enters" : "exits", reason == null ? "NULL" : reason); + final long currentTime = System.currentTimeMillis(); if (!enabled) { // Exit maintenance mode accessor.removeProperty(keyBuilder.maintenance()); @@ -436,7 +438,7 @@ public class ZKHelixAdmin implements HelixAdmin { if (reason != null) { maintenanceSignal.setReason(reason); } - maintenanceSignal.setTimestamp(System.currentTimeMillis()); + maintenanceSignal.setTimestamp(currentTime); maintenanceSignal.setTriggeringEntity(triggeringEntity); switch (triggeringEntity) { case CONTROLLER: @@ -458,9 +460,29 @@ public class ZKHelixAdmin implements HelixAdmin { break; } if (!accessor.createMaintenance(maintenanceSignal)) { - throw new HelixException("Failed to create maintenance signal"); + throw new HelixException("Failed to create maintenance signal!"); } } + + // Record a MaintenanceSignal history + if (!accessor.getBaseDataAccessor().update(keyBuilder.controllerLeaderHistory().getPath(), + new DataUpdater<ZNRecord>() { + @Override + public ZNRecord update(ZNRecord oldRecord) { + try { + if (oldRecord == null) { + oldRecord = new ZNRecord(PropertyType.HISTORY.toString()); + } + return new ControllerHistory(oldRecord).updateMaintenanceHistory(enabled, reason, + currentTime, internalReason, customFields, triggeringEntity); + } catch (IOException e) { + logger.error("Failed to update maintenance history! Exception: {}", e); + return oldRecord; + } + } + }, AccessOption.PERSISTENT)) { + logger.error("Failed to write maintenance history to ZK!"); + } } @Override diff --git a/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java b/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java new file mode 100644 index 0000000..4d87ffc --- /dev/null +++ b/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java @@ -0,0 +1,212 @@ +package org.apache.helix.model; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.io.IOException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; + +import org.apache.helix.HelixProperty; +import org.apache.helix.ZNRecord; +import org.codehaus.jackson.map.ObjectMapper; + +/** + * The history of instances that have served as the leader controller + */ +public class ControllerHistory extends HelixProperty { + private final static int HISTORY_SIZE = 10; + private final static int MAINTENANCE_HISTORY_SIZE = 20; + + private enum ConfigProperty { + HISTORY, + TIME, + DATE, + VERSION, + CONTROLLER + } + + private enum MaintenanceConfigKey { + MAINTENANCE_HISTORY, + OPERATION_TYPE, + DATE, + REASON + + } + + private enum OperationType { + // The following are options for OPERATION_TYPE in MaintenanceConfigKey + ENTER, + EXIT + } + + public enum HistoryUpdaterType { + CONTROLLER_LEADERSHIP, + MAINTENANCE + } + + public ControllerHistory(String id) { + super(id); + } + + public ControllerHistory(ZNRecord record) { + super(record); + } + + /** + * Save up to HISTORY_SIZE number of leaders in FIFO order + * @param clusterName the cluster the instance leads + * @param instanceName the name of the leader instance + */ + public ZNRecord updateHistory(String clusterName, String instanceName, String version) { + /* keep this for back-compatible */ + // TODO: remove this in future when we confirmed no one consumes it + List<String> list = _record.getListField(clusterName); + if (list == null) { + list = new ArrayList<>(); + _record.setListField(clusterName, list); + } + + while (list.size() >= HISTORY_SIZE) { + list.remove(0); + } + list.add(instanceName); + // TODO: remove above in future when we confirmed no one consumes it */ + + List<String> historyList = _record.getListField(ConfigProperty.HISTORY.name()); + if (historyList == null) { + historyList = new ArrayList<>(); + _record.setListField(ConfigProperty.HISTORY.name(), historyList); + } + + // Keep only the last HISTORY_SIZE entries + while (historyList.size() >= HISTORY_SIZE) { + historyList.remove(0); + } + + Map<String, String> historyEntry = new HashMap<>(); + + long currentTime = System.currentTimeMillis(); + DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:mm:ss"); + df.setTimeZone(TimeZone.getTimeZone("UTC")); + String dateTime = df.format(new Date(currentTime)); + + historyEntry.put(ConfigProperty.CONTROLLER.name(), instanceName); + historyEntry.put(ConfigProperty.TIME.name(), String.valueOf(currentTime)); + historyEntry.put(ConfigProperty.DATE.name(), dateTime); + historyEntry.put(ConfigProperty.VERSION.name(), version); + + historyList.add(historyEntry.toString()); + return _record; + } + + /** + * Get history list + * @return + */ + public List<String> getHistoryList() { + List<String> historyList = _record.getListField(ConfigProperty.HISTORY.name()); + if (historyList == null) { + historyList = new ArrayList<>(); + } + + return historyList; + } + + /** + * Record up to MAINTENANCE_HISTORY_SIZE number of changes to MaintenanceSignal in FIFO order. + * @param enabled + * @param reason + * @param currentTime + * @param internalReason + * @param customFields + * @param triggeringEntity + */ + public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long currentTime, + MaintenanceSignal.AutoTriggerReason internalReason, Map<String, String> customFields, + MaintenanceSignal.TriggeringEntity triggeringEntity) throws IOException { + List<String> maintenanceHistoryList = + _record.getListField(MaintenanceConfigKey.MAINTENANCE_HISTORY.name()); + if (maintenanceHistoryList == null) { + maintenanceHistoryList = new ArrayList<>(); + _record.setListField(MaintenanceConfigKey.MAINTENANCE_HISTORY.name(), maintenanceHistoryList); + } + + // Keep only the last MAINTENANCE_HISTORY_SIZE entries + while (maintenanceHistoryList.size() >= MAINTENANCE_HISTORY_SIZE) { + maintenanceHistoryList.remove(0); + } + + DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:" + "mm:ss"); + df.setTimeZone(TimeZone.getTimeZone("UTC")); + String dateTime = df.format(new Date(currentTime)); + + // Populate maintenanceEntry + Map<String, String> maintenanceEntry = new HashMap<>(); + maintenanceEntry.put(MaintenanceConfigKey.OPERATION_TYPE.name(), + enabled ? OperationType.ENTER.name() : OperationType.EXIT.name()); + maintenanceEntry.put(MaintenanceConfigKey.REASON.name(), reason); + maintenanceEntry.put(MaintenanceConfigKey.DATE.name(), dateTime); + maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(currentTime)); + maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + triggeringEntity.name()); + if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) { + // If auto-triggered + maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.AUTO_TRIGGER_REASON.name(), + internalReason.name()); + } else { + // If manually triggered + if (customFields != null && !customFields.isEmpty()) { + for (Map.Entry<String, String> customFieldEntry : customFields.entrySet()) { + if (!maintenanceEntry.containsKey(customFieldEntry.getKey())) { + // Make sure custom entries do not overwrite pre-defined fields + maintenanceEntry.put(customFieldEntry.getKey(), customFieldEntry.getValue()); + } + } + } + } + maintenanceHistoryList.add(new ObjectMapper().writeValueAsString(maintenanceEntry)); + return _record; + } + + /** + * Get maintenance history list. + * @return + */ + public List<String> getMaintenanceHistoryList() { + List<String> maintenanceHistoryList = + _record.getListField(MaintenanceConfigKey.MAINTENANCE_HISTORY.name()); + if (maintenanceHistoryList == null) { + maintenanceHistoryList = new ArrayList<>(); + } + return maintenanceHistoryList; + } + + @Override + public boolean isValid() { + return true; + } +} diff --git a/helix-core/src/main/java/org/apache/helix/model/LeaderHistory.java b/helix-core/src/main/java/org/apache/helix/model/LeaderHistory.java deleted file mode 100644 index 31d125f..0000000 --- a/helix-core/src/main/java/org/apache/helix/model/LeaderHistory.java +++ /dev/null @@ -1,118 +0,0 @@ -package org.apache.helix.model; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; - -import org.apache.helix.HelixProperty; -import org.apache.helix.ZNRecord; - -/** - * The history of instances that have served as the leader controller - */ -public class LeaderHistory extends HelixProperty { - private final static int HISTORY_SIZE = 10; - - private enum ConfigProperty { - HISTORY, - TIME, - DATE, - VERSION, - CONTROLLER - } - - public LeaderHistory(String id) { - super(id); - } - - public LeaderHistory(ZNRecord record) { - super(record); - } - - /** - * Save up to HISTORY_SIZE number of leaders in FIFO order - * @param clusterName the cluster the instance leads - * @param instanceName the name of the leader instance - */ - public void updateHistory(String clusterName, String instanceName, String version) { - /* keep this for back-compatible */ - // TODO: remove this in future when we confirmed no one consumes it - List<String> list = _record.getListField(clusterName); - if (list == null) { - list = new ArrayList<String>(); - _record.setListField(clusterName, list); - } - - if (list.size() == HISTORY_SIZE) { - list.remove(0); - } - list.add(instanceName); - // TODO: remove above in future when we confirmed no one consumes it */ - - List<String> historyList = _record.getListField(ConfigProperty.HISTORY.name()); - if (historyList == null) { - historyList = new ArrayList<String>(); - _record.setListField(ConfigProperty.HISTORY.name(), historyList); - } - - if (historyList.size() == HISTORY_SIZE) { - historyList.remove(0); - } - - Map<String, String> historyEntry = new HashMap<String, String>(); - - long currentTime = System.currentTimeMillis(); - DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:mm:ss"); - df.setTimeZone(TimeZone.getTimeZone("UTC")); - String dateTime = df.format(new Date(currentTime)); - - historyEntry.put(ConfigProperty.CONTROLLER.name(), instanceName); - historyEntry.put(ConfigProperty.TIME.name(), String.valueOf(currentTime)); - historyEntry.put(ConfigProperty.DATE.name(), dateTime); - historyEntry.put(ConfigProperty.VERSION.name(), version); - - historyList.add(historyEntry.toString()); - } - - /** - * Get history list - * @return - */ - public List<String> getHistoryList() { - List<String> historyList = _record.getListField(ConfigProperty.HISTORY.name()); - if (historyList == null) { - historyList = new ArrayList<String>(); - } - - return historyList; - } - - @Override - public boolean isValid() { - return true; - } -} diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index 41cbd0e..511e6f3 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -29,7 +29,7 @@ public class MaintenanceSignal extends PauseSignal { /** * Pre-defined fields set by Helix Controller only. */ - private enum MaintenanceSignalProperty { + public enum MaintenanceSignalProperty { TRIGGERED_BY, TIMESTAMP, AUTO_TRIGGER_REASON diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index 510cfe1..ccf7c12 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -20,14 +20,21 @@ package org.apache.helix.integration.controller; */ import com.google.common.collect.ImmutableMap; +import java.io.IOException; +import java.util.HashMap; import java.util.Map; +import org.apache.helix.HelixDataAccessor; +import org.apache.helix.PropertyKey; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.integration.task.TaskTestBase; import org.apache.helix.integration.task.WorkflowGenerator; import org.apache.helix.model.ClusterConfig; +import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.ExternalView; import org.apache.helix.model.IdealState; import org.apache.helix.model.MaintenanceSignal; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.map.type.TypeFactory; import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; @@ -37,6 +44,8 @@ public class TestClusterMaintenanceMode extends TaskTestBase { private MockParticipantManager _newInstance; private String newResourceAddedDuringMaintenanceMode = String.format("%s_%s", WorkflowGenerator.DEFAULT_TGT_DB, 1); + private HelixDataAccessor _dataAccessor; + private PropertyKey.Builder _keyBuilder; @BeforeClass public void beforeClass() throws Exception { @@ -45,6 +54,8 @@ public class TestClusterMaintenanceMode extends TaskTestBase { _numReplicas = 3; _numPartitions = 5; super.beforeClass(); + _dataAccessor = _manager.getHelixDataAccessor(); + _keyBuilder = _dataAccessor.keyBuilder(); } @AfterClass @@ -143,8 +154,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // Check that the cluster is in maintenance - MaintenanceSignal maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(maintenanceSignal); // Now bring up 2 instances @@ -156,8 +166,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // Check that the cluster is no longer in maintenance (auto-recovered) - maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNull(maintenanceSignal); } @@ -181,8 +190,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // The cluster should still be in maintenance because it was enabled manually - MaintenanceSignal maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(maintenanceSignal); } @@ -202,8 +210,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // Check that maintenance signal was triggered by Controller - MaintenanceSignal maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(maintenanceSignal); Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); @@ -216,8 +223,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // Check that maintenance mode has successfully overwritten with the right TRIGGERED_BY field - maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); for (Map.Entry<String, String> entry : customFields.entrySet()) { @@ -244,8 +250,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { // Since 3 instances are missing, the cluster should have gone back under maintenance // automatically - MaintenanceSignal maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(maintenanceSignal); Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); @@ -261,8 +266,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // Check that the cluster exited maintenance - maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNull(maintenanceSignal); // Kill 3 instances, which would put cluster in maintenance automatically @@ -272,8 +276,7 @@ public class TestClusterMaintenanceMode extends TaskTestBase { Thread.sleep(500L); // Check that cluster is back under maintenance - maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(maintenanceSignal); Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); @@ -298,12 +301,71 @@ public class TestClusterMaintenanceMode extends TaskTestBase { // Check that the cluster is still in maintenance (should not have auto-exited because it would // fail the MaxPartitionsPerInstance check) - maintenanceSignal = _manager.getHelixDataAccessor() - .getProperty(_manager.getHelixDataAccessor().keyBuilder().maintenance()); + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(maintenanceSignal); Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(maintenanceSignal.getAutoTriggerReason(), MaintenanceSignal.AutoTriggerReason.MAX_PARTITION_PER_INSTANCE_EXCEEDED); } + + /** + * Test that the Controller correctly records maintenance history in various situations. + * @throws InterruptedException + */ + @Test(dependsOnMethods = "testMaxPartitionLimit") + public void testMaintenanceHistory() throws InterruptedException, IOException { + // In maintenance mode, by controller, for MAX_PARTITION_PER_INSTANCE_EXCEEDED + ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + Map<String, String> lastHistoryEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + + // **The KV pairs are hard-coded in here for the ease of reading!** + Assert.assertEquals(lastHistoryEntry.get("OPERATION_TYPE"), "ENTER"); + Assert.assertEquals(lastHistoryEntry.get("TRIGGERED_BY"), "CONTROLLER"); + Assert.assertEquals(lastHistoryEntry.get("AUTO_TRIGGER_REASON"), + "MAX_PARTITION_PER_INSTANCE_EXCEEDED"); + + // Remove the maxPartitionPerInstance config + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + System.out.println("Set clusterconfig"); + + Thread.sleep(500L); + + // Now check that the cluster exited maintenance + // EXIT, CONTROLLER, for MAX_PARTITION_PER_INSTANCE_EXCEEDED + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastHistoryEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastHistoryEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastHistoryEntry.get("TRIGGERED_BY"), "CONTROLLER"); + Assert.assertEquals(lastHistoryEntry.get("AUTO_TRIGGER_REASON"), + "MAX_PARTITION_PER_INSTANCE_EXCEEDED"); + + // Manually put the cluster in maintenance with a custom field + Map<String, String> customFieldMap = ImmutableMap.of("k1", "v1", "k2", "v2"); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "TEST", + customFieldMap); + Thread.sleep(500L); + // ENTER, USER, for reason TEST, no internalReason + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastHistoryEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastHistoryEntry.get("OPERATION_TYPE"), "ENTER"); + Assert.assertEquals(lastHistoryEntry.get("TRIGGERED_BY"), "USER"); + Assert.assertEquals(lastHistoryEntry.get("REASON"), "TEST"); + Assert.assertNull(lastHistoryEntry.get("AUTO_TRIGGER_REASON")); + } + + /** + * Convert a String representation of a Map into a Map object for verification purposes. + * @param value + * @return + */ + private static Map<String, String> convertStringToMap(String value) throws IOException { + return new ObjectMapper().readValue(value, + TypeFactory.mapType(HashMap.class, String.class, String.class)); + } } diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerHistory.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerHistory.java index 8a20b22..a01afcd 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerHistory.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerHistory.java @@ -26,7 +26,7 @@ import org.apache.helix.InstanceType; import org.apache.helix.PropertyKey; import org.apache.helix.integration.common.ZkStandAloneCMTestBase; import org.apache.helix.integration.manager.ClusterControllerManager; -import org.apache.helix.model.LeaderHistory; +import org.apache.helix.model.ControllerHistory; import org.testng.Assert; import org.testng.annotations.Test; @@ -40,9 +40,9 @@ public class TestControllerHistory extends ZkStandAloneCMTestBase { PropertyKey.Builder keyBuilder = new PropertyKey.Builder(CLUSTER_NAME); PropertyKey propertyKey = keyBuilder.controllerLeaderHistory(); - LeaderHistory leaderHistory = manager.getHelixDataAccessor().getProperty(propertyKey); - Assert.assertNotNull(leaderHistory); - List<String> list = leaderHistory.getRecord().getListField("HISTORY"); + ControllerHistory controllerHistory = manager.getHelixDataAccessor().getProperty(propertyKey); + Assert.assertNotNull(controllerHistory); + List<String> list = controllerHistory.getRecord().getListField("HISTORY"); Assert.assertEquals(list.size(), 1); for (int i = 0; i <= 12; i++) { @@ -51,9 +51,9 @@ public class TestControllerHistory extends ZkStandAloneCMTestBase { _controller.syncStart(); } - leaderHistory = manager.getHelixDataAccessor().getProperty(propertyKey); - Assert.assertNotNull(leaderHistory); - list = leaderHistory.getRecord().getListField("HISTORY"); + controllerHistory = manager.getHelixDataAccessor().getProperty(propertyKey); + Assert.assertNotNull(controllerHistory); + list = controllerHistory.getRecord().getListField("HISTORY"); Assert.assertEquals(list.size(), 10); manager.disconnect(); } diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java index 3892fc6..e901d9c 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java @@ -43,7 +43,7 @@ import org.apache.helix.manager.zk.ZKUtil; import org.apache.helix.manager.zk.client.HelixZkClient; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.HelixConfigScope; -import org.apache.helix.model.LeaderHistory; +import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.LiveInstance; import org.apache.helix.model.Message; import org.apache.helix.model.StateModelDefinition; @@ -329,7 +329,7 @@ public class ClusterAccessor extends AbstractHelixResource { Map<String, Object> controllerHistory = new HashMap<>(); controllerHistory.put(Properties.id.name(), clusterId); - LeaderHistory history = + ControllerHistory history = dataAccessor.getProperty(dataAccessor.keyBuilder().controllerLeaderHistory()); if (history != null) { controllerHistory.put(Properties.history.name(), history.getHistoryList());
