Repository: ambari Updated Branches: refs/heads/branch-2.4 5b6b782ff -> 3ffb0e848
AMBARI-17127 - Ambari Stale Alert Triggers For Server-Side Performance Alert (jonathanhurley) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/3ffb0e84 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/3ffb0e84 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/3ffb0e84 Branch: refs/heads/branch-2.4 Commit: 3ffb0e848061e682d3692e2d85093465b589f928 Parents: 5b6b782 Author: Jonathan Hurley <[email protected]> Authored: Wed Jun 8 14:20:00 2016 -0400 Committer: Jonathan Hurley <[email protected]> Committed: Thu Jun 9 12:56:11 2016 -0400 ---------------------------------------------------------------------- .../ambari/server/alerts/AlertRunnable.java | 29 ++++ .../alerts/AmbariPerformanceRunnable.java | 29 ---- .../server/alerts/StaleAlertRunnable.java | 139 ++++++++++++++++--- ambari-server/src/main/resources/alerts.json | 13 +- .../server/alerts/StaleAlertRunnableTest.java | 116 +++++++++++++++- 5 files changed, 276 insertions(+), 50 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/3ffb0e84/ambari-server/src/main/java/org/apache/ambari/server/alerts/AlertRunnable.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/alerts/AlertRunnable.java b/ambari-server/src/main/java/org/apache/ambari/server/alerts/AlertRunnable.java index 21ae9b0..ea583e4 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/alerts/AlertRunnable.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/alerts/AlertRunnable.java @@ -29,6 +29,7 @@ import org.apache.ambari.server.orm.entities.AlertDefinitionEntity; import org.apache.ambari.server.state.Alert; import org.apache.ambari.server.state.Cluster; import org.apache.ambari.server.state.Clusters; +import org.apache.commons.lang.math.NumberUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -133,4 +134,32 @@ public abstract class AlertRunnable implements Runnable { LOG.error("Unable to run the {} alert", m_definitionName, exception); } } + + /** + * Converts the given value to an integer safely. + * + * @param value + * @param defaultValue + * @return + */ + int getThresholdValue(Object value, int defaultValue) { + if (null == value) { + return defaultValue; + } + + if (value instanceof Number) { + return ((Number) value).intValue(); + } + + if (!(value instanceof String)) { + value = value.toString(); + } + + if (!NumberUtils.isNumber((String) value)) { + return defaultValue; + } + + Number number = NumberUtils.createNumber((String) value); + return number.intValue(); + } } http://git-wip-us.apache.org/repos/asf/ambari/blob/3ffb0e84/ambari-server/src/main/java/org/apache/ambari/server/alerts/AmbariPerformanceRunnable.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/alerts/AmbariPerformanceRunnable.java b/ambari-server/src/main/java/org/apache/ambari/server/alerts/AmbariPerformanceRunnable.java index 7ed2427..bbb583b 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/alerts/AmbariPerformanceRunnable.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/alerts/AmbariPerformanceRunnable.java @@ -48,7 +48,6 @@ import org.apache.ambari.server.state.alert.ParameterizedSource.AlertParameter; import org.apache.ambari.server.state.alert.ServerSource; import org.apache.ambari.server.state.services.AmbariServerAlertService; import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.math.NumberUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.security.core.context.SecurityContextHolder; @@ -388,34 +387,6 @@ public class AmbariPerformanceRunnable extends AlertRunnable { } /** - * Converts the given value to an integer safely. - * - * @param value - * @param defaultValue - * @return - */ - int getThresholdValue(Object value, int defaultValue) { - if (null == value) { - return defaultValue; - } - - if (value instanceof Number) { - return ((Number) value).intValue(); - } - - if (!(value instanceof String)) { - value = value.toString(); - } - - if (!NumberUtils.isNumber((String) value)) { - return defaultValue; - } - - Number number = NumberUtils.createNumber((String) value); - return number.intValue(); - } - - /** * The {@link PerformanceResult} class is used to wrap the result of a * {@link PerformanceArea}. */ http://git-wip-us.apache.org/repos/asf/ambari/blob/3ffb0e84/ambari-server/src/main/java/org/apache/ambari/server/alerts/StaleAlertRunnable.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/alerts/StaleAlertRunnable.java b/ambari-server/src/main/java/org/apache/ambari/server/alerts/StaleAlertRunnable.java index cf12bbf..7cb679b 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/alerts/StaleAlertRunnable.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/alerts/StaleAlertRunnable.java @@ -17,6 +17,8 @@ */ package org.apache.ambari.server.alerts; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; import java.text.MessageFormat; import java.util.Collections; import java.util.HashMap; @@ -33,9 +35,15 @@ import org.apache.ambari.server.state.Alert; import org.apache.ambari.server.state.AlertState; import org.apache.ambari.server.state.Cluster; import org.apache.ambari.server.state.MaintenanceState; +import org.apache.ambari.server.state.alert.AlertDefinition; +import org.apache.ambari.server.state.alert.AlertDefinitionFactory; +import org.apache.ambari.server.state.alert.ParameterizedSource.AlertParameter; +import org.apache.ambari.server.state.alert.ServerSource; import org.apache.ambari.server.state.alert.SourceType; import org.apache.ambari.server.state.services.AmbariServerAlertService; import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.inject.Inject; @@ -48,6 +56,11 @@ import com.google.inject.Inject; */ public class StaleAlertRunnable extends AlertRunnable { /** + * Logger. + */ + private final static Logger LOG = LoggerFactory.getLogger(StaleAlertRunnable.class); + + /** * The message for the alert when all services have run in their designated * intervals. */ @@ -72,12 +85,33 @@ public class StaleAlertRunnable extends AlertRunnable { private static final int MINUTES_PER_HOUR = 60; /** + * The multiplier for the interval of the definition which is being checked + * for staleness. If this value is {@code 2}, then alerts are considered stale + * if they haven't run in more than 2x their interval value. + */ + private static final int INTERVAL_WAIT_FACTOR_DEFAULT = 2; + + /** + * A parameter which exposes the interval multipler to use for calculating + * staleness. If this does not exist, then + * {@link #INTERVAL_WAIT_FACTOR_DEFAULT} will be used. + */ + private static final String STALE_INTERVAL_MULTIPLIER_PARAM_KEY = "stale.interval.multiplier"; + + /** * Used to get the current alerts and the last time they ran. */ @Inject private AlertsDAO m_alertsDao; /** + * Used for converting {@link AlertDefinitionEntity} into + * {@link AlertDefinition} instances. + */ + @Inject + private AlertDefinitionFactory m_definitionFactory; + + /** * Constructor. * * @param definitionName @@ -91,8 +125,17 @@ public class StaleAlertRunnable extends AlertRunnable { */ @Override List<Alert> execute(Cluster cluster, AlertDefinitionEntity myDefinition) { - Set<String> staleAlerts = new TreeSet<String>(); - Map<String, Set<String>> staleHostAlerts = new HashMap<>(); + // get the multiplier + int waitFactor = getWaitFactorMultiplier(myDefinition); + + // use the uptime of the Ambari Server as a way to determine if we need to + // give the alert more time to report in + RuntimeMXBean rb = ManagementFactory.getRuntimeMXBean(); + long uptime = rb.getUptime(); + + int totalStaleAlerts = 0; + Set<String> staleAlertGroupings = new TreeSet<String>(); + Map<String, Set<String>> staleAlertsByHost = new HashMap<>(); Set<String> hostsWithStaleAlerts = new TreeSet<>(); // get the cluster's current alerts @@ -102,7 +145,8 @@ public class StaleAlertRunnable extends AlertRunnable { long now = System.currentTimeMillis(); // for each current alert, check to see if the last time it ran is - // more than 2x its interval value (indicating it hasn't run) + // more than INTERVAL_WAIT_FACTOR * its interval value (indicating it hasn't + // run) for (AlertCurrentEntity current : currentAlerts) { AlertHistoryEntity history = current.getAlertHistory(); AlertDefinitionEntity currentDefinition = history.getAlertDefinition(); @@ -130,37 +174,52 @@ public class StaleAlertRunnable extends AlertRunnable { // convert minutes to milliseconds for the definition's interval long intervalInMillis = currentDefinition.getScheduleInterval() * MINUTE_TO_MS_CONVERSION; - // if the last time it was run is >= 2x the interval, it's stale + // if the server hasn't been up long enough to consider this alert stale, + // then don't mark it stale - this is to protect against cases where + // Ambari was down for a while and after startup it hasn't received the + // alert because it has a longer interval than this stale alert check: + // + // Stale alert check - every 5 minutes + // Foo alert cehck - every 10 minutes + // Ambari down for 35 minutes for upgrade + if (uptime <= waitFactor * intervalInMillis) { + continue; + } + + // if the last time it was run is >= INTERVAL_WAIT_FACTOR * the interval, + // it's stale long timeDifference = now - current.getLatestTimestamp(); - if (timeDifference >= 2 * intervalInMillis) { + if (timeDifference >= waitFactor * intervalInMillis) { + // increase the count + totalStaleAlerts++; // it is technically possible to have a null/blank label; if so, // default to the name of the definition String label = currentDefinition.getLabel(); if (StringUtils.isEmpty(label)) { label = currentDefinition.getDefinitionName(); - } + } if (null != history.getHostName()) { // keep track of the host, if not null String hostName = history.getHostName(); hostsWithStaleAlerts.add(hostName); - if (!staleHostAlerts.containsKey(hostName)) { - staleHostAlerts.put(hostName, new TreeSet<String>()); - } + if (!staleAlertsByHost.containsKey(hostName)) { + staleAlertsByHost.put(hostName, new TreeSet<String>()); + } - staleHostAlerts.get(hostName).add(MessageFormat.format(TIMED_LABEL_MSG, label, + staleAlertsByHost.get(hostName).add(MessageFormat.format(TIMED_LABEL_MSG, label, millisToHumanReadableStr(timeDifference))); } else { // non host alerts - staleAlerts.add(label); - } + staleAlertGroupings.add(label); } + } } - for (String host : staleHostAlerts.keySet()) { - staleAlerts.add(MessageFormat.format(HOST_LABEL_MSG, host, - StringUtils.join(staleHostAlerts.get(host), ",\n "))); + for (String host : staleAlertsByHost.keySet()) { + staleAlertGroupings.add(MessageFormat.format(HOST_LABEL_MSG, host, + StringUtils.join(staleAlertsByHost.get(host), ",\n "))); } AlertState alertState = AlertState.OK; @@ -168,10 +227,10 @@ public class StaleAlertRunnable extends AlertRunnable { // if there are stale alerts, mark as CRITICAL with the list of // alerts - if (!staleAlerts.isEmpty()) { + if (!staleAlertGroupings.isEmpty()) { alertState = AlertState.CRITICAL; - alertText = MessageFormat.format(STALE_ALERTS_MSG, staleAlerts.size(), - hostsWithStaleAlerts.size(), StringUtils.join(staleAlerts, ",\n")); + alertText = MessageFormat.format(STALE_ALERTS_MSG, totalStaleAlerts, + hostsWithStaleAlerts.size(), StringUtils.join(staleAlertGroupings, ",\n")); } Alert alert = new Alert(myDefinition.getDefinitionName(), null, myDefinition.getServiceName(), @@ -209,5 +268,49 @@ public class StaleAlertRunnable extends AlertRunnable { } return result.trim(); } + + /** + * Gets the wait factor multiplier off of the definition, returning + * {@link #INTERVAL_WAIT_FACTOR_DEFAULT} if not specified. This will look for + * {@link #STALE_INTERVAL_MULTIPLIER_PARAM_KEY} in the definition parameters. + * The value returned from this method will be guaranteed to be in the range + * of 2 to 10. + * + * @param entity + * the definition to read + * @return the wait factor interval multiplier + */ + private int getWaitFactorMultiplier(AlertDefinitionEntity entity) { + // start with the default + int waitFactor = INTERVAL_WAIT_FACTOR_DEFAULT; + + // coerce the entity into a business object so that the list of parameters + // can be extracted and used for threshold calculation + try { + AlertDefinition definition = m_definitionFactory.coerce(entity); + ServerSource serverSource = (ServerSource) definition.getSource(); + List<AlertParameter> parameters = serverSource.getParameters(); + for (AlertParameter parameter : parameters) { + Object value = parameter.getValue(); + + if (StringUtils.equals(parameter.getName(), STALE_INTERVAL_MULTIPLIER_PARAM_KEY)) { + waitFactor = getThresholdValue(value, INTERVAL_WAIT_FACTOR_DEFAULT); + } + } + + if (waitFactor < 2 || waitFactor > 10) { + LOG.warn( + "The interval multipler of {} is outside the valid range for {} and will be set to 2", + waitFactor, entity.getLabel()); + + waitFactor = 2; + } + } catch (Exception exception) { + LOG.error("Unable to read the {} parameter for {}", STALE_INTERVAL_MULTIPLIER_PARAM_KEY, + StaleAlertRunnable.class.getSimpleName(), exception); + } + + return waitFactor; + } } http://git-wip-us.apache.org/repos/asf/ambari/blob/3ffb0e84/ambari-server/src/main/resources/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/alerts.json b/ambari-server/src/main/resources/alerts.json index f55211f..9cffff5 100644 --- a/ambari-server/src/main/resources/alerts.json +++ b/ambari-server/src/main/resources/alerts.json @@ -25,7 +25,18 @@ "enabled": true, "source": { "type": "SERVER", - "class": "org.apache.ambari.server.alerts.StaleAlertRunnable" + "class": "org.apache.ambari.server.alerts.StaleAlertRunnable", + "parameters": [ + { + "name": "stale.interval.multiplier", + "display_name": "Interval Multiplier", + "value": 2, + "type": "NUMERIC", + "description": "The number of intervals which must pass before an an alert is considered stale. This value is a multiplier for each alert's individual interval.", + "units": "Intervals", + "threshold": "CRITICAL" + } + ] } }, { http://git-wip-us.apache.org/repos/asf/ambari/blob/3ffb0e84/ambari-server/src/test/java/org/apache/ambari/server/alerts/StaleAlertRunnableTest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/alerts/StaleAlertRunnableTest.java b/ambari-server/src/test/java/org/apache/ambari/server/alerts/StaleAlertRunnableTest.java index 9a9d989..2111beb 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/alerts/StaleAlertRunnableTest.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/alerts/StaleAlertRunnableTest.java @@ -22,8 +22,11 @@ import static junit.framework.Assert.assertEquals; import static org.easymock.EasyMock.createNiceMock; import static org.easymock.EasyMock.expect; import static org.easymock.EasyMock.replay; +import static org.easymock.EasyMock.reset; import static org.easymock.EasyMock.verify; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; import java.lang.reflect.Field; import java.util.ArrayList; import java.util.HashMap; @@ -52,6 +55,10 @@ import org.easymock.EasyMock; import org.junit.After; import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.powermock.api.easymock.PowerMock; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.junit4.PowerMockRunner; import com.google.common.eventbus.EventBus; import com.google.inject.Binder; @@ -62,6 +69,8 @@ import com.google.inject.Module; /** * Tests {@link StaleAlertRunnableTest}. */ +@RunWith(PowerMockRunner.class) +@PrepareForTest({ StaleAlertRunnable.class, ManagementFactory.class }) public class StaleAlertRunnableTest { private final static long CLUSTER_ID = 1; @@ -84,6 +93,7 @@ public class StaleAlertRunnableTest { private AlertEventPublisher m_eventPublisher; private EventBus m_synchronizedBus; + private RuntimeMXBean m_runtimeMXBean; /** * @@ -113,12 +123,16 @@ public class StaleAlertRunnableTest { clusterMap.put(CLUSTER_NAME, m_cluster); // mock the definition for the alert + expect(m_definition.getDefinitionId()).andReturn(1L).atLeastOnce(); expect(m_definition.getDefinitionName()).andReturn(DEFINITION_NAME).atLeastOnce(); expect(m_definition.getServiceName()).andReturn(DEFINITION_SERVICE).atLeastOnce(); expect(m_definition.getComponentName()).andReturn(DEFINITION_COMPONENT).atLeastOnce(); expect(m_definition.getLabel()).andReturn(DEFINITION_LABEL).atLeastOnce(); expect(m_definition.getEnabled()).andReturn(true).atLeastOnce(); expect(m_definition.getScheduleInterval()).andReturn(DEFINITION_INTERVAL).atLeastOnce(); + expect(m_definition.getClusterId()).andReturn(CLUSTER_ID).atLeastOnce(); + + expect(m_definition.getSource()).andReturn("{\"type\" : \"SERVER\"}").anyTimes(); // mock the cluster expect(m_cluster.getClusterId()).andReturn(CLUSTER_ID).atLeastOnce(); @@ -134,8 +148,16 @@ public class StaleAlertRunnableTest { expect(m_alertsDao.findCurrentByCluster(CLUSTER_ID)).andReturn( m_currentAlerts).atLeastOnce(); + // mock out the uptime to be a while (since most tests are not testing + // system uptime) + m_runtimeMXBean = EasyMock.createNiceMock(RuntimeMXBean.class); + PowerMock.mockStatic(ManagementFactory.class); + expect(ManagementFactory.getRuntimeMXBean()).andReturn(m_runtimeMXBean).atLeastOnce(); + PowerMock.replay(ManagementFactory.class); + expect(m_runtimeMXBean.getUptime()).andReturn(360000L); + replay(m_definition, m_cluster, m_clusters, - m_definitionDao, m_alertsDao); + m_definitionDao, m_alertsDao, m_runtimeMXBean); } /** @@ -260,7 +282,7 @@ public class StaleAlertRunnableTest { */ @Test public void testStaleAlertInMaintenaceMode() { - // create current alerts that are not stale + // create current alerts that are stale AlertDefinitionEntity definition = new AlertDefinitionEntity(); definition.setClusterId(CLUSTER_ID); definition.setDefinitionName("foo-definition"); @@ -322,6 +344,96 @@ public class StaleAlertRunnableTest { } /** + * Tests that stale alerts are not reported if the server has not be running + * long enough. + */ + @Test + public void testStaleAlertWithServerUptime() { + // reset the Runtime MX bean to a low value + reset(m_runtimeMXBean); + expect(m_runtimeMXBean.getUptime()).andReturn(1000L); + replay(m_runtimeMXBean); + + // create current alerts that are stale (5 minute interval) + AlertDefinitionEntity definition = new AlertDefinitionEntity(); + definition.setClusterId(CLUSTER_ID); + definition.setDefinitionName("foo-definition"); + definition.setServiceName("HDFS"); + definition.setComponentName("NAMENODE"); + definition.setEnabled(true); + definition.setScheduleInterval(5); + + // create current alerts that are stale + AlertCurrentEntity current1 = createNiceMock(AlertCurrentEntity.class); + AlertHistoryEntity history1 = createNiceMock(AlertHistoryEntity.class); + + expect(current1.getAlertHistory()).andReturn(history1).atLeastOnce(); + expect(history1.getAlertDefinition()).andReturn(definition).atLeastOnce(); + + // use a timestamp that would trigger the alert, say 3x the interval ago (so + // 15 minutes ago) + long now = System.currentTimeMillis(); + long staleTime = now - (definition.getScheduleInterval() * 60 * 1000 * 3); + + expect(current1.getMaintenanceState()).andReturn(MaintenanceState.OFF).atLeastOnce(); + expect(current1.getLatestTimestamp()).andReturn(staleTime).atLeastOnce(); + + replay(current1, history1); + + m_currentAlerts.add(current1); + + // precondition that no events were fired + assertEquals(0, m_listener.getAlertEventReceivedCount(AlertReceivedEvent.class)); + + // instantiate and inject mocks + StaleAlertRunnable runnable = new StaleAlertRunnable(m_definition.getDefinitionName()); + m_injector.injectMembers(runnable); + + // run the alert + runnable.run(); + + // ensure that our mock MX bean was used + verify(m_runtimeMXBean); + + // verify that our uptime was too short so nothing should have been + // triggered + assertEquals(1, m_listener.getAlertEventReceivedCount(AlertReceivedEvent.class)); + List<AlertEvent> events = m_listener.getAlertEventInstances(AlertReceivedEvent.class); + assertEquals(1, events.size()); + + AlertReceivedEvent event = (AlertReceivedEvent) events.get(0); + Alert alert = event.getAlert(); + assertEquals("AMBARI", alert.getService()); + assertEquals("AMBARI_SERVER", alert.getComponent()); + assertEquals(AlertState.OK, alert.getState()); + assertEquals(DEFINITION_NAME, alert.getName()); + + // now reset the mocks to indicate that Ambari has been up long enough + m_listener.reset(); + long uptime = definition.getScheduleInterval() * 60 * 1000 * 4; + reset(m_runtimeMXBean); + expect(m_runtimeMXBean.getUptime()).andReturn(uptime); + replay(m_runtimeMXBean); + + // run the alert again and verify that the same stale alert caused a + // CRITICAL + runnable.run(); + + // recheck for the stale alert + events = m_listener.getAlertEventInstances(AlertReceivedEvent.class); + assertEquals(1, events.size()); + + event = (AlertReceivedEvent) events.get(0); + alert = event.getAlert(); + assertEquals("AMBARI", alert.getService()); + assertEquals("AMBARI_SERVER", alert.getComponent()); + assertEquals(AlertState.CRITICAL, alert.getState()); + assertEquals(DEFINITION_NAME, alert.getName()); + + assertEquals(1, m_listener.getAlertEventReceivedCount(AlertReceivedEvent.class)); + } + + /** * */ private class MockModule implements Module {
