Author: hairong
Date: Wed Dec 3 11:59:31 2008
New Revision: 723047
URL: http://svn.apache.org/viewvc?rev=723047&view=rev
Log:
Merge -r 723012:723013 from trunk to move the change log of HADOOP-4679 into
branch 0.18
Added:
hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java
Modified:
hadoop/core/branches/branch-0.18/CHANGES.txt
hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java
hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java
Modified: hadoop/core/branches/branch-0.18/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/CHANGES.txt?rev=723047&r1=723046&r2=723047&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/CHANGES.txt Wed Dec 3 11:59:31 2008
@@ -63,6 +63,9 @@
HADOOP-4726. Fix documentation typos "the the". (Edward J. Yoon via
szetszwo)
+ HADOOP-4679. Datanode prints tons of log messages: waiting for threadgroup
+ to exit, active threads is XX. (hairong)
+
Release 0.18.2 - 2008-11-03
BUG FIXES
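The heart of this fix shows up in the DataNode.java hunks below: the data-transfer server socket now gets an accept timeout, so the DataXceiveServer thread wakes up periodically and re-checks whether the datanode should keep running instead of blocking in accept() forever, which is what produced the endless "waiting for threadgroup to exit" messages named in the change log entry above. A minimal standalone sketch of that pattern, with illustrative names rather than the actual branch-0.18 code:

    import java.io.IOException;
    import java.net.ServerSocket;
    import java.net.Socket;
    import java.net.SocketTimeoutException;

    public class TimedAcceptLoop {
        // mirrors DataNode.shouldRun; volatile so all threads see updates
        private volatile boolean shouldRun = true;

        void serve(ServerSocket ss) throws IOException {
            // 30s wake-up, as dfs.dataXceiver.timeoutInMS sets below
            ss.setSoTimeout(30000);
            while (shouldRun) {
                try {
                    Socket s = ss.accept();
                    s.setTcpNoDelay(true);
                    // hand the socket off to a worker thread here
                } catch (SocketTimeoutException ignored) {
                    // timed out: loop around and re-check shouldRun
                }
            }
        }
    }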
Modified: hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java?rev=723047&r1=723046&r2=723047&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java (original)
+++ hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java Wed Dec 3 11:59:31 2008
@@ -316,6 +316,7 @@
ServerSocketChannel.open().socket() : new ServerSocket();
Server.bind(ss, socAddr, 0);
ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE);
+ ss.setSoTimeout(conf.getInt("dfs.dataXceiver.timeoutInMS", 30000)); //30s
// adjust machine name with the actual port
tmpPort = ss.getLocalPort();
selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
@@ -556,6 +557,8 @@
/**
* Shut down this instance of the datanode.
* Returns only after shutdown is complete.
+ * This method can only be called by the offerService thread.
+ * Otherwise, deadlock might occur.
*/
public void shutdown() {
if (infoServer != null) {
@@ -622,7 +625,8 @@
* when IOException occurs.
* If so, handle the error */
private void checkDiskError( IOException e ) throws IOException {
- if (e.getMessage().startsWith("No space left on device")) {
+ if (e.getMessage() != null &&
+ e.getMessage().startsWith("No space left on device")) {
throw new DiskOutOfSpaceException("No space left on device");
} else {
checkDiskError();
@@ -640,12 +644,12 @@
private void handleDiskError(String errMsgr) {
LOG.warn("DataNode is shutting down.\n" + errMsgr);
+ shouldRun = false;
try {
namenode.errorReport(
dnRegistration, DatanodeProtocol.DISK_ERROR,
errMsgr);
} catch(IOException ignored) {
}
- shutdown();
}
/** Number of concurrent xceivers per node. */
@@ -834,7 +838,9 @@
return false;
case DatanodeProtocol.DNA_REGISTER:
// namenode requested a registration - at start or if NN lost contact
- register();
+ if (shouldRun) {
+ register();
+ }
break;
case DatanodeProtocol.DNA_FINALIZE:
storage.finalizeUpgrade();
@@ -984,6 +990,8 @@
Socket s = ss.accept();
s.setTcpNoDelay(true);
new Daemon(threadGroup, new DataXceiver(s)).start();
+ } catch (SocketTimeoutException ignored) {
+ // wake up to see if should continue to run
} catch (IOException ie) {
LOG.warn(dnRegistration + ":DataXceiveServer: "
+ StringUtils.stringifyException(ie));
@@ -2373,6 +2381,11 @@
}
} catch(IOException ioe) {
IOUtils.closeStream(this);
+ IOException cause = FSDataset.getCauseIfDiskError(ioe);
+ if (cause != null) { // possible disk error
+ ioe = cause;
+ checkDiskError(ioe);
+ }
throw ioe;
}
}
@@ -2965,6 +2978,11 @@
dn.dataNodeThread.start();
}
}
+
+ /** check if a datanode is up */
+ static boolean isDatanodeUp(DataNode dn) {
+ return dn.dataNodeThread != null && dn.dataNodeThread.isAlive();
+ }
/** Instantiate a single datanode object. This must be run by invoking
* {@link DataNode#runDatanodeDaemon(DataNode)} subsequently.
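Taken together, the handleDiskError and shutdown() hunks above change who performs the shutdown: a thread that hits a fatal disk error now only clears the run flag and reports to the namenode, leaving the actual shutdown() call to the offerService thread, which avoids the deadlock the new javadoc warns about. A self-contained sketch of that ordering (simplified, illustrative names, not the real DataNode code):

    public class ShutdownOrdering {
        private volatile boolean shouldRun = true;

        /** Called from any worker thread on a fatal disk error. */
        void handleDiskError(String errMsg) {
            System.err.println("DataNode is shutting down.\n" + errMsg);
            shouldRun = false;   // flag only; do NOT call shutdown() here
        }

        /** The sole caller of shutdown(), analogous to the offerService loop. */
        void offerService() throws InterruptedException {
            while (shouldRun) {
                Thread.sleep(1000);   // stands in for heartbeat work
            }
            shutdown();   // safe: runs on the service thread itself
        }

        void shutdown() {
            // interrupt and join worker threads, close server sockets, etc.
        }
    }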
Modified: hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java?rev=723047&r1=723046&r2=723047&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java (original)
+++ hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java Wed Dec 3 11:59:31 2008
@@ -386,7 +386,13 @@
}
// Create the zero-length temp file
//
- if (!f.createNewFile()) {
+ boolean fileCreated = false;
+ try {
+ fileCreated = f.createNewFile();
+ } catch (IOException ioe) {
+ throw (IOException)new IOException(DISK_ERROR +f).initCause(ioe);
+ }
+ if (!fileCreated) {
throw new IOException("Unexpected problem in creating temporary file for "+
b + ". File " + f + " should be creatable, but is already present.");
}
@@ -905,6 +911,20 @@
}
}
+ private final static String DISK_ERROR = "Possible disk error on file creation: ";
+ /** Get the cause of an I/O exception if caused by a possible disk error
+ * @param ioe an I/O exception
+ * @return cause if the I/O exception is caused by a possible disk error;
+ * null otherwise.
+ */
+ static IOException getCauseIfDiskError(IOException ioe) {
+ if (ioe.getMessage()!=null && ioe.getMessage().startsWith(DISK_ERROR)) {
+ return (IOException)ioe.getCause();
+ } else {
+ return null;
+ }
+ }
+
/**
* Start writing to a block file
* If isRecovery is true and the block pre-exists, then we kill all
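The two FSDataset.java hunks above work as a pair: the temp-file creation path tags an IOException from File.createNewFile() with a recognizable message prefix, and getCauseIfDiskError() lets the caller back in DataNode recover the original exception and trigger checkDiskError. A condensed, self-contained sketch of the tag-and-unwrap pattern, consolidated into one class for illustration:

    import java.io.File;
    import java.io.IOException;

    public class DiskErrorTagging {
        static final String DISK_ERROR = "Possible disk error on file creation: ";

        /** Tag a creation failure so callers can spot a possible disk error. */
        static void createTempFile(File f) throws IOException {
            try {
                f.createNewFile();
            } catch (IOException ioe) {
                // preserve the original exception as the cause
                throw (IOException) new IOException(DISK_ERROR + f).initCause(ioe);
            }
        }

        /** @return the original cause if ioe was tagged above; null otherwise. */
        static IOException getCauseIfDiskError(IOException ioe) {
            if (ioe.getMessage() != null && ioe.getMessage().startsWith(DISK_ERROR)) {
                return (IOException) ioe.getCause();
            }
            return null;
        }
    }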
Added: hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java?rev=723047&view=auto
==============================================================================
--- hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java (added)
+++ hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java Wed Dec 3 11:59:31 2008
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import java.io.File;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.dfs.DFSTestUtil;
+import org.apache.hadoop.dfs.MiniDFSCluster;
+
+import junit.framework.TestCase;
+
+/** Test if a datanode can handle disk errors correctly */
+public class TestDiskError extends TestCase {
+ public void testShutdown() throws Exception {
+ // bring up a cluster of 3
+ Configuration conf = new Configuration();
+ conf.setLong("dfs.block.size", 512L);
+ conf.setInt("dfs.dataXceiver.timeoutInMS", 1000);
+ MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
+ cluster.waitActive();
+ FileSystem fs = cluster.getFileSystem();
+ final int dnIndex = 0;
+ File dataDir = new File(
+ System.getProperty("test.build.data", "build/test/data"), "dfs");
+ dataDir = new File(dataDir, "data");
+ File dir1 = new File(new File(dataDir, "data"+(2*dnIndex+1)), "tmp");
+ File dir2 = new File(new File(dataDir, "data"+(2*dnIndex+2)), "tmp");
+ try {
+ // make the data directories of the first datanode read-only
+ assertTrue(dir1.setReadOnly());
+ assertTrue(dir2.setReadOnly());
+
+ // create files until the first datanode is brought down
+ DataNode dn = cluster.getDataNodes().get(dnIndex);
+ for (int i=0; DataNode.isDatanodeUp(dn); i++) {
+ Path fileName = new Path("/test.txt"+i);
+ DFSTestUtil.createFile(fs, fileName, 1024, (short)2, 1L);
+ DFSTestUtil.waitReplication(fs, fileName, (short)2);
+ }
+ } finally {
+ // restore the old permissions
+ dir1.setWritable(true);
+ dir2.setWritable(true);
+ cluster.shutdown();
+ }
+ }
+}
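For context, the new test drives exactly this failure path: it makes the first datanode's tmp directories read-only so that File.createNewFile() in FSDataset fails, the exception is tagged with the DISK_ERROR prefix, and the datanode takes itself down. The write loop simply keeps creating files until isDatanodeUp() reports the datanode dead, and the 1-second dfs.dataXceiver.timeoutInMS setting keeps the accept loop, and therefore the shutdown, from hanging.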