Author: dhruba Date: Mon Sep 17 23:39:50 2007 New Revision: 576729 URL: http://svn.apache.org/viewvc?rev=576729&view=rev Log: HADOOP-1762. The Namenode fsimage does not contain the list of Datanodes. (Raghu Angadi via dhruba)
Modified: lucene/hadoop/trunk/CHANGES.txt lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSEditLog.java lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSImage.java lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Modified: lucene/hadoop/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=576729&r1=576728&r2=576729&view=diff ============================================================================== --- lucene/hadoop/trunk/CHANGES.txt (original) +++ lucene/hadoop/trunk/CHANGES.txt Mon Sep 17 23:39:50 2007 @@ -83,6 +83,9 @@ BUG FIXES + HADOOP-1762. The Namenode fsimage does not contain a list of + Datanodes. (Raghu Angadi via dhruba) + HADOOP-1890. Removed debugging prints introduced by HADOOP-1774. (Raghu Angadi via dhruba) Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java?rev=576729&r1=576728&r2=576729&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java Mon Sep 17 23:39:50 2007 @@ -36,6 +36,8 @@ import java.io.*; import java.net.*; import java.util.*; +import java.security.NoSuchAlgorithmException; +import java.security.SecureRandom; import org.apache.hadoop.metrics.MetricsContext; import org.apache.hadoop.metrics.MetricsRecord; import org.apache.hadoop.metrics.Updater; @@ -337,6 +339,36 @@ return "<namenode>"; } + private void setNewStorageID(DatanodeRegistration dnReg) { + /* Return + * "DS-randInt-ipaddr-currentTimeMillis" + * It is considered extremely rare for all these numbers to match + * on a different machine accidentally for the following + * a) SecureRandom(INT_MAX) is pretty much random (1 in 2 billion), and + * b) Good 
chance ip address would be different, and + * c) Even on the same machine, Datanode is designed to use different ports. + * d) Good chance that these are started at different times. + * For a conflict to occur all the 4 above have to match! + * The format of this string can be changed anytime in future without + * affecting its functionality. + */ + String ip = "unknownIP"; + try { + ip = DNS.getDefaultIP("default"); + } catch (UnknownHostException ignored) { + LOG.warn("Could not find ip address of \"default\" interface."); + } + + int rand = 0; + try { + rand = SecureRandom.getInstance("SHA1PRNG").nextInt(Integer.MAX_VALUE); + } catch (NoSuchAlgorithmException e) { + LOG.warn("Could not use SecureRandom"); + rand = (new Random()).nextInt(Integer.MAX_VALUE); + } + dnReg.storageID = "DS-" + rand + "-"+ ip + "-" + dnReg.getPort() + "-" + + System.currentTimeMillis(); + } /** * Register datanode * <p> @@ -349,6 +381,9 @@ * @throws IOException */ private void register() throws IOException { + if (dnRegistration.getStorageID().equals("")) { + setNewStorageID(dnRegistration); + } while(shouldRun) { try { // reset name to machineName. Mainly for web interface. 
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSEditLog.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSEditLog.java?rev=576729&r1=576728&r2=576729&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSEditLog.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSEditLog.java Mon Sep 17 23:39:50 2007 @@ -43,8 +43,9 @@ private static final byte OP_DELETE = 2; private static final byte OP_MKDIR = 3; private static final byte OP_SET_REPLICATION = 4; - private static final byte OP_DATANODE_ADD = 5; - private static final byte OP_DATANODE_REMOVE = 6; + //the following two are used only for backward compatibility : + @Deprecated private static final byte OP_DATANODE_ADD = 5; + @Deprecated private static final byte OP_DATANODE_REMOVE = 6; private ArrayList<EditLogOutputStream> editStreams = null; private FSImage fsimage = null; @@ -383,8 +384,7 @@ + " for version " + logVersion); FSImage.DatanodeImage nodeimage = new FSImage.DatanodeImage(); nodeimage.readFields(in); - DatanodeDescriptor node = nodeimage.getDatanodeDescriptor(); - fsNamesys.unprotectedAddDatanode(node); + //Datanodes are not persistent any more. break; } case OP_DATANODE_REMOVE: { @@ -394,11 +394,7 @@ DatanodeID nodeID = new DatanodeID(); nodeID.readFields(in); DatanodeDescriptor node = fsNamesys.getDatanode(nodeID); - if (node != null) { - fsNamesys.unprotectedRemoveDatanode(node); - // physically remove node from datanodeMap - fsNamesys.wipeDatanode(nodeID); - } + //Datanodes are not persistent any more. break; } default: { @@ -550,22 +546,6 @@ new UTF8(src), FSEditLog.toLogLong(timestamp)}; logEdit(OP_DELETE, new ArrayWritable(UTF8.class, info), null); - } - - /** - * Creates a record in edit log corresponding to a new data node - * registration event. 
- */ - void logAddDatanode(DatanodeDescriptor node) { - logEdit(OP_DATANODE_ADD, new FSImage.DatanodeImage(node), null); - } - - /** - * Creates a record in edit log corresponding to a data node - * removal event. - */ - void logRemoveDatanode(DatanodeID nodeID) { - logEdit(OP_DATANODE_REMOVE, new DatanodeID(nodeID), null); } static UTF8 toLogReplication(short replication) { Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSImage.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSImage.java?rev=576729&r1=576728&r2=576729&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSImage.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSImage.java Mon Sep 17 23:39:50 2007 @@ -850,32 +850,25 @@ } /** - * Save list of datanodes contained in {@link FSNamesystem#datanodeMap}. - * Only the {@link DatanodeInfo} part is stored. - * The {@link DatanodeDescriptor#blocks} is transient. + * Earlier versions used to store all the known datanodes. + * DFS doesn't store datanodes anymore. * * @param out output stream * @throws IOException */ void saveDatanodes(DataOutputStream out) throws IOException { - Map datanodeMap = FSNamesystem.getFSNamesystem().datanodeMap; - int size = datanodeMap.size(); - out.writeInt(size); - for(Iterator it = datanodeMap.values().iterator(); it.hasNext();) { - DatanodeImage nodeImage = new DatanodeImage((DatanodeDescriptor) it.next()); - nodeImage.write(out); - } + // we don't store datanodes anymore. 
+ out.writeInt(0); } void loadDatanodes(int version, DataInputStream in) throws IOException { if (version > -3) // pre datanode image version return; - FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem(); int size = in.readInt(); for(int i = 0; i < size; i++) { DatanodeImage nodeImage = new DatanodeImage(); nodeImage.readFields(in); - fsNamesys.unprotectedAddDatanode(nodeImage.getDatanodeDescriptor()); + // We don't need to add these descriptors any more. } } Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?rev=576729&r1=576728&r2=576729&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Mon Sep 17 23:39:50 2007 @@ -1557,15 +1557,7 @@ * * @see DataNode#register() */ - public void registerDatanode(DatanodeRegistration nodeReg, - String networkLocation - ) throws IOException { - registerDatanodeInternal(nodeReg, networkLocation); - getEditLog().logSync(); - } - - private synchronized void registerDatanodeInternal( - DatanodeRegistration nodeReg, + public synchronized void registerDatanode(DatanodeRegistration nodeReg, String networkLocation ) throws IOException { @@ -1604,8 +1596,6 @@ removeDatanode(nodeN); // physically remove node from datanodeMap wipeDatanode(nodeN); - // and log removal - getEditLog().logRemoveDatanode(nodeN); nodeN = null; } @@ -1618,13 +1608,19 @@ + "node restarted."); } else { // nodeS is found - // The registering datanode is a replacement node for the existing - // data storage, which from now on will be served by a new node. 
- NameNode.stateChangeLog.debug( - "BLOCK* NameSystem.registerDatanode: " + /* The registering datanode is a replacement node for the existing + data storage, which from now on will be served by a new node. + If this message repeats, both nodes might have same storageID + by (insanely rare) random chance. User needs to restart one of the + nodes with its data cleared (or user can just remove the StorageID + value in "VERSION" file under the data directory of the datanode, + but this might not work if VERSION file format has changed) + */ + NameNode.stateChangeLog.info( "BLOCK* NameSystem.registerDatanode: " + "node " + nodeS.getName() - + " is replaced by " + nodeReg.getName() + "."); - getEditLog().logRemoveDatanode(nodeS); + + " is replaced by " + nodeReg.getName() + + " with the same storageID " + + nodeReg.getStorageID()); } // update cluster map clusterMap.remove(nodeS); @@ -1632,9 +1628,6 @@ nodeS.setNetworkLocation(networkLocation); clusterMap.add(nodeS); nodeS.setHostName(hostName); - if ( nodeS != nodeN ) { - getEditLog().logAddDatanode( nodeS ); - } // also treat the registration message as a heartbeat synchronized(heartbeats) { @@ -1662,7 +1655,6 @@ = new DatanodeDescriptor(nodeReg, networkLocation, hostName); unprotectedAddDatanode(nodeDescr); clusterMap.add(nodeDescr); - getEditLog().logAddDatanode(nodeDescr); // also treat the registration message as a heartbeat synchronized(heartbeats) {