Repository: oozie Updated Branches: refs/heads/master f82c12408 -> b45a314b2
OOZIE-1826 Add thread which detects JVM pauses (rkanter) Project: http://git-wip-us.apache.org/repos/asf/oozie/repo Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/b45a314b Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/b45a314b Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/b45a314b Branch: refs/heads/master Commit: b45a314b21a0f15e103a7ca8ce4e7f87985722bb Parents: f82c124 Author: Robert Kanter <[email protected]> Authored: Wed May 14 11:04:06 2014 -0700 Committer: Robert Kanter <[email protected]> Committed: Wed May 14 11:05:15 2014 -0700 ---------------------------------------------------------------------- .../oozie/service/JvmPauseMonitorService.java | 185 +++++++++++++++++++ core/src/main/resources/oozie-default.xml | 27 ++- release-log.txt | 1 + 3 files changed, 212 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java new file mode 100644 index 0000000..0f1a5f6 --- /dev/null +++ b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java @@ -0,0 +1,185 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.oozie.service; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.base.Stopwatch; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import java.lang.management.GarbageCollectorMXBean; +import java.lang.management.ManagementFactory; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Daemon; +import org.apache.oozie.util.Instrumentation; +import org.apache.oozie.util.XLog; + +/** + * This class sets up a simple thread that sleeps for a short period of time. If the sleep takes significantly longer than its + * target time, it implies that the JVM or host machine has paused processing, which may cause other problems. If such a pause is + * detected, the thread logs a message and updates the Instrumentation. + * + * Adapted from org.apache.hadoop.util.JvmPauseMonitor + */ +public class JvmPauseMonitorService implements Service { + + private static XLog LOG = XLog.getLog(JvmPauseMonitorService.class); + + public static final String CONF_PREFIX = Service.CONF_PREFIX + "JvmPauseMonitorService."; + + /** + * The target sleep time + */ + private static final long SLEEP_INTERVAL_MS = 500; + + /** + * log WARN if we detect a pause longer than this threshold + */ + private long warnThresholdMs; + private static final String WARN_THRESHOLD_KEY = CONF_PREFIX + "warn-threshold.ms"; + private static final long WARN_THRESHOLD_DEFAULT = 10000; + + /** + * log INFO if we detect a pause longer than this threshold + */ + private long infoThresholdMs; + private static final String INFO_THRESHOLD_KEY = CONF_PREFIX + "info-threshold.ms"; + private static final long INFO_THRESHOLD_DEFAULT = 1000; + + private Thread monitorThread; + private volatile boolean shouldRun = true; + private Instrumentation instrumentation; + + @Override + public void init(Services services) throws ServiceException { + Configuration conf = services.getConf(); + warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT); + infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT); + + instrumentation = services.get(InstrumentationService.class).get(); + + Preconditions.checkState(monitorThread == null, + "Already started"); + monitorThread = new Daemon(new Monitor()); + monitorThread.start(); + } + + @Override + public void destroy() { + shouldRun = false; + monitorThread.interrupt(); + try { + monitorThread.join(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + @Override + public Class<? extends Service> getInterface() { + return JvmPauseMonitorService.class; + } + + private String formatMessage(long extraSleepTime, Map<String, GcTimes> gcTimesAfterSleep, + Map<String, GcTimes> gcTimesBeforeSleep) { + Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(), gcTimesBeforeSleep.keySet()); + List<String> gcDiffs = Lists.newArrayList(); + for (String name : gcBeanNames) { + GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name)); + if (diff.gcCount != 0) { + gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString()); + } + } + + String ret = "Detected pause in JVM or host machine (eg GC): pause of approximately " + extraSleepTime + "ms\n"; + if (gcDiffs.isEmpty()) { + ret += "No GCs detected"; + } else { + ret += Joiner.on("\n").join(gcDiffs); + } + return ret; + } + + private Map<String, GcTimes> getGcTimes() { + Map<String, GcTimes> map = Maps.newHashMap(); + List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans(); + for (GarbageCollectorMXBean gcBean : gcBeans) { + map.put(gcBean.getName(), new GcTimes(gcBean)); + } + return map; + } + + private static class GcTimes { + + private GcTimes(GarbageCollectorMXBean gcBean) { + gcCount = gcBean.getCollectionCount(); + gcTimeMillis = gcBean.getCollectionTime(); + } + + private GcTimes(long count, long time) { + this.gcCount = count; + this.gcTimeMillis = time; + } + + private GcTimes subtract(GcTimes other) { + return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis); + } + + @Override + public String toString() { + return "count=" + gcCount + " time=" + gcTimeMillis + "ms"; + } + + private long gcCount; + private long gcTimeMillis; + } + + private class Monitor implements Runnable { + + @Override + public void run() { + Stopwatch sw = new Stopwatch(); + Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes(); + while (shouldRun) { + sw.reset().start(); + try { + Thread.sleep(SLEEP_INTERVAL_MS); + } catch (InterruptedException ie) { + return; + } + long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS; + Map<String, GcTimes> gcTimesAfterSleep = getGcTimes(); + + if (extraSleepTime > warnThresholdMs) { + LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep)); + instrumentation.incr("jvm", "pause.warn-threshold", 1); + } else if (extraSleepTime > infoThresholdMs) { + LOG.info(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep)); + instrumentation.incr("jvm", "pause.info-threshold", 1); + } + instrumentation.incr("jvm", "pause.extraSleepTime", extraSleepTime); + + gcTimesBeforeSleep = gcTimesAfterSleep; + } + } + } +} http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/core/src/main/resources/oozie-default.xml ---------------------------------------------------------------------- diff --git a/core/src/main/resources/oozie-default.xml b/core/src/main/resources/oozie-default.xml index c35d090..9788daf 100644 --- a/core/src/main/resources/oozie-default.xml +++ b/core/src/main/resources/oozie-default.xml @@ -129,7 +129,8 @@ org.apache.oozie.service.PauseTransitService, org.apache.oozie.service.GroupsService, org.apache.oozie.service.ProxyUserService, - org.apache.oozie.service.XLogStreamingService + org.apache.oozie.service.XLogStreamingService, + org.apache.oozie.service.JvmPauseMonitorService </value> <description> All services to be created and managed by Oozie Services singleton. @@ -2081,4 +2082,28 @@ </description> </property> + <!-- JvmPauseMonitorService Configuration --> + <property> + <name>oozie.service.JvmPauseMonitorService.warn-threshold.ms</name> + <value>10000</value> + <description> + The JvmPauseMonitorService runs a thread that repeatedly tries to detect when the JVM pauses, which could indicate + that the JVM or host machine is overloaded or other problems. This thread sleeps for 500ms; if it sleeps for + significantly longer, then there is likely a problem. This property specifies the threadshold for when Oozie should log + a WARN level message; there is also a counter named "jvm.pause.warn-threshold". + </description> + </property> + + <property> + <name>oozie.service.JvmPauseMonitorService.info-threshold.ms</name> + <value>1000</value> + <description> + The JvmPauseMonitorService runs a thread that repeatedly tries to detect when the JVM pauses, which could indicate + that the JVM or host machine is overloaded or other problems. This thread sleeps for 500ms; if it sleeps for + significantly longer, then there is likely a problem. This property specifies the threadshold for when Oozie should log + an INFO level message; there is also a counter named "jvm.pause.info-threshold". + </description> + </property> + + </configuration> http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/release-log.txt ---------------------------------------------------------------------- diff --git a/release-log.txt b/release-log.txt index da86677..9223025 100644 --- a/release-log.txt +++ b/release-log.txt @@ -1,5 +1,6 @@ -- Oozie 4.1.0 release (trunk - unreleased) +OOZIE-1826 Add thread which detects JVM pauses (rkanter) OOZIE-1791 add IGNORED status to Coordinator Job and Action (ryota) OOZIE-1825 Optimize wf_jobs protoconf storage (puru via rohini) OOZIE-1831 Oozie upgrade fails if workflow jobs are in running or suspended state (satish.mittal via rohini)
