Author: jing9
Date: Fri Aug 30 07:36:45 2013
New Revision: 1518899

URL: http://svn.apache.org/r1518899
Log:
HDFS-5140. Too many safemode monitor threads being created in the standby namenode causing it to fail with out of memory error. Contributed by Jing Zhao.

Modified:
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1518899&r1=1518898&r2=1518899&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Fri Aug 30 07:36:45 2013
@@ -407,6 +407,9 @@ Release 2.1.1-beta - UNRELEASED
     HDFS-5077. NPE in FSNamesystem.commitBlockSynchronization().
     (Plamen Jeliazkov via shv)
 
+    HDFS-5140. Too many safemode monitor threads being created in the standby
+    namenode causing it to fail with out of memory error. (jing9)
+
 Release 2.1.0-beta - 2013-08-22
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=1518899&r1=1518898&r2=1518899&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Fri Aug 30 07:36:45 2013
@@ -366,7 +366,7 @@ public class FSNamesystem implements Nam
 
   final LeaseManager leaseManager = new LeaseManager(this);
 
-  Daemon smmthread = null;  // SafeModeMonitor thread
+  volatile Daemon smmthread = null;  // SafeModeMonitor thread
 
   Daemon nnrmthread = null; // NamenodeResourceMonitor thread
 
@@ -4555,7 +4555,9 @@ public class FSNamesystem implements Nam
       // Have to have write-lock since leaving safemode initializes
       // repl queues, which requires write lock
       assert hasWriteLock();
-      if (needEnter()) {
+      // if smmthread is already running, the block threshold must have been
+      // reached before, there is no need to enter the safe mode again
+      if (smmthread == null && needEnter()) {
         enter();
         // check if we are ready to initialize replication queues
         if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
@@ -4564,7 +4566,7 @@ public class FSNamesystem implements Nam
         reportStatus("STATE* Safe mode ON.", false);
         return;
       }
-      // the threshold is reached
+      // the threshold is reached or was reached before
       if (!isOn() ||                           // safe mode is off
           extension <= 0 || threshold <= 0) {  // don't need to wait
         this.leave(); // leave safe mode
@@ -4576,9 +4578,11 @@ public class FSNamesystem implements Nam
       }
       // start monitor
       reached = now();
-      smmthread = new Daemon(new SafeModeMonitor());
-      smmthread.start();
-      reportStatus("STATE* Safe mode extension entered.", true);
+      if (smmthread == null) {
+        smmthread = new Daemon(new SafeModeMonitor());
+        smmthread.start();
+        reportStatus("STATE* Safe mode extension entered.", true);
+      }
 
       // check if we are ready to initialize replication queues
       if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
@@ -4814,6 +4818,7 @@ public class FSNamesystem implements Nam
           if (safeMode.canLeave()) {
             // Leave safe mode.
             safeMode.leave();
+            smmthread = null;
             break;
           }
         } finally {
@@ -4829,7 +4834,6 @@ public class FSNamesystem implements Nam
       if (!fsRunning) {
         LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
       }
-      smmthread = null;
     }
   }
 