Author: cutting
Date: Wed May 2 14:37:21 2007
New Revision: 534624

URL: http://svn.apache.org/viewvc?view=rev&rev=534624
Log:
HADOOP-1312. Fix a ConcurrentModificationException in NameNode that killed the heartbeat monitoring thread. Contributed by Dhruba.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=534624&r1=534623&r2=534624
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed May 2 14:37:21 2007
@@ -312,6 +312,10 @@
 92. HADOOP-1308. Use generics to restrict types when classes are
     passed as parameters to JobConf methods. (Michael Bieniosek via cutting)
 
+93. HADOOP-1312. Fix a ConcurrentModificationException in NameNode
+    that killed the heartbeat monitoring thread.
+    (Dhruba Borthakur via cutting)
+
 
 Release 0.12.3 - 2007-04-06
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?view=diff&rev=534624&r1=534623&r2=534624
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Wed May 2 14:37:21 2007
@@ -1253,29 +1253,33 @@
    ******************************************************/
   class LeaseMonitor implements Runnable {
     public void run() {
-      while (fsRunning) {
-        synchronized (FSNamesystem.this) {
-          synchronized (leases) {
-            Lease top;
-            while ((sortedLeases.size() > 0) &&
-                   ((top = sortedLeases.first()) != null)) {
-              if (top.expiredHardLimit()) {
-                top.releaseLocks();
-                leases.remove(top.holder);
-                LOG.info("Removing lease " + top + ", leases remaining: " + sortedLeases.size());
-                if (!sortedLeases.remove(top)) {
-                  LOG.info("Unknown failure trying to remove " + top + " from lease set.");
+      try {
+        while (fsRunning) {
+          synchronized (FSNamesystem.this) {
+            synchronized (leases) {
+              Lease top;
+              while ((sortedLeases.size() > 0) &&
+                     ((top = sortedLeases.first()) != null)) {
+                if (top.expiredHardLimit()) {
+                  top.releaseLocks();
+                  leases.remove(top.holder);
+                  LOG.info("Removing lease " + top + ", leases remaining: " + sortedLeases.size());
+                  if (!sortedLeases.remove(top)) {
+                    LOG.info("Unknown failure trying to remove " + top + " from lease set.");
+                  }
+                } else {
+                  break;
                 }
-              } else {
-                break;
               }
             }
           }
+          try {
+            Thread.sleep(2000);
+          } catch (InterruptedException ie) {
+          }
         }
-        try {
-          Thread.sleep(2000);
-        } catch (InterruptedException ie) {
-        }
+      } catch (Exception e) {
+        FSNamesystem.LOG.error(StringUtils.stringifyException(e));
       }
     }
   }
@@ -1636,7 +1640,11 @@
      */
     public void run() {
       while (fsRunning) {
-        heartbeatCheck();
+        try {
+          heartbeatCheck();
+        } catch (Exception e) {
+          FSNamesystem.LOG.error(StringUtils.stringifyException(e));
+        }
         try {
           Thread.sleep(heartbeatRecheckInterval);
         } catch (InterruptedException ie) {
@@ -1809,10 +1817,12 @@
    * @author hairong
    */
   private void removeDatanode(DatanodeDescriptor nodeInfo) {
-    if (nodeInfo.isAlive) {
-      updateStats(nodeInfo, false);
-      heartbeats.remove(nodeInfo);
-      nodeInfo.isAlive = false;
+    synchronized (heartbeats) {
+      if (nodeInfo.isAlive) {
+        updateStats(nodeInfo, false);
+        heartbeats.remove(nodeInfo);
+        nodeInfo.isAlive = false;
+      }
     }
 
     for (Iterator<Block> it = nodeInfo.getBlockIterator(); it.hasNext();) {