Author: cutting Date: Fri Jun 15 14:45:03 2007 New Revision: 547800 URL: http://svn.apache.org/viewvc?view=rev&rev=547800 Log: HADOOP-1300. Improve removal of excess block replicas to be rack-aware. Contributed by Hairong.
Modified: lucene/hadoop/trunk/CHANGES.txt lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Modified: lucene/hadoop/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=547800&r1=547799&r2=547800 ============================================================================== --- lucene/hadoop/trunk/CHANGES.txt (original) +++ lucene/hadoop/trunk/CHANGES.txt Fri Jun 15 14:45:03 2007 @@ -140,6 +140,10 @@ 44. HADOOP-1482. Fix secondary namenode to roll info port. (Dhruba Borthakur via cutting) + 45. HADOOP-1300. Improve removal of excess block replicas to be + rack-aware. Attempts are now made to keep replicas on more + racks. (Hairong Kuang via cutting) + Release 0.13.0 - 2007-06-08 Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?view=diff&rev=547800&r1=547799&r2=547800 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Fri Jun 15 14:45:03 2007 @@ -29,6 +29,7 @@ import java.io.*; import java.util.*; +import java.util.Map.Entry; /*************************************************** * FSNamesystem does the actual bookkeeping work for the @@ -2179,18 +2180,57 @@ * * srcNodes.size() - dstNodes.size() == replication * - * We pick node with least free space - * In the future, we might enforce some kind of policy - * (like making sure replicates are spread across racks). + * We pick node that make sure that replicas are spread across racks and + * also try hard to pick one with least free space. + * The algorithm is first to pick a node with least free space from nodes + * that are on a rack holding more than one replicas of the block. + * So removing such a replica won't remove a rack. + * If no such a node is available, + * then pick a node with least free space */ void chooseExcessReplicates(Collection<DatanodeDescriptor> nonExcess, Block b, short replication) { + // first form a rack to datanodes map and + HashMap<String, ArrayList<DatanodeDescriptor>> rackMap = + new HashMap<String, ArrayList<DatanodeDescriptor>>(); + for (Iterator<DatanodeDescriptor> iter = nonExcess.iterator(); + iter.hasNext();) { + DatanodeDescriptor node = iter.next(); + String rackName = node.getNetworkLocation(); + ArrayList<DatanodeDescriptor> datanodeList = rackMap.get(rackName); + if(datanodeList==null) { + datanodeList = new ArrayList<DatanodeDescriptor>(); + } + datanodeList.add(node); + rackMap.put(rackName, datanodeList); + } + + // split nodes into two sets + // priSet contains nodes on rack with more than one replica + // remains contains the remaining nodes + ArrayList<DatanodeDescriptor> priSet = new ArrayList<DatanodeDescriptor>(); + ArrayList<DatanodeDescriptor> remains = new ArrayList<DatanodeDescriptor>(); + for( Iterator<Entry<String, ArrayList<DatanodeDescriptor>>> iter = + rackMap.entrySet().iterator(); iter.hasNext(); ) { + Entry<String, ArrayList<DatanodeDescriptor>> rackEntry = iter.next(); + ArrayList<DatanodeDescriptor> datanodeList = rackEntry.getValue(); + if( datanodeList.size() == 1 ) { + remains.add(datanodeList.get(0)); + } else { + priSet.addAll(datanodeList); + } + } + + // pick one node with least space from priSet if it is not empty + // otherwise one node with least space from remains while (nonExcess.size() - replication > 0) { DatanodeInfo cur = null; long minSpace = Long.MAX_VALUE; - - for (Iterator<DatanodeDescriptor> iter = nonExcess.iterator(); iter.hasNext();) { - DatanodeInfo node = iter.next(); + + Iterator<DatanodeDescriptor> iter = + priSet.isEmpty() ? remains.iterator() : priSet.iterator(); + while( iter.hasNext() ) { + DatanodeDescriptor node = iter.next(); long free = node.getRemaining(); if (minSpace > free) { @@ -2198,7 +2238,24 @@ cur = node; } } - + + // adjust rackmap, priSet, and remains + String rack = cur.getNetworkLocation(); + ArrayList<DatanodeDescriptor> datanodes = rackMap.get(rack); + datanodes.remove(cur); + if(datanodes.isEmpty()) { + rackMap.remove(rack); + } + if (priSet.isEmpty()) { + remains.remove(cur); + } else { + priSet.remove(cur); + if (datanodes.size() == 1) { + priSet.remove(datanodes.get(0)); + remains.add(datanodes.get(0)); + } + } + nonExcess.remove(cur); Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());