Author: todd
Date: Wed Apr 4 23:21:43 2012
New Revision: 1309623
URL: http://svn.apache.org/viewvc?rev=1309623&view=rev
Log:
HDFS-1378. Edit log replay should track and report file offsets in case of
errors. Backport by Colin Patrick McCabe.
Added:
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java
Modified:
hadoop/common/branches/branch-1/CHANGES.txt
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1309623&r1=1309622&r2=1309623&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Wed Apr 4 23:21:43 2012
@@ -84,6 +84,9 @@ Release 1.1.0 - unreleased
HDFS-3131. Improve TestStorageRestore. (Brandon Li via atm)
+ HDFS-1378. Edit log replay should track and report file offsets in case of
+ errors. (atm and todd, backport by Colin Patrick McCabe via todd)
+
BUG FIXES
MAPREDUCE-4087. [Gridmix] GenerateDistCacheData job of Gridmix can
Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java?rev=1309623&r1=1309622&r2=1309623&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java Wed Apr 4 23:21:43 2012
@@ -25,9 +25,12 @@ import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.FilterInputStream;
+import java.io.InputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Iterator;
import java.lang.Math;
import java.nio.channels.FileChannel;
@@ -502,7 +505,14 @@ public class FSEditLog {
long highestGenStamp = -1;
long startTime = FSNamesystem.now();
- DataInputStream in = new DataInputStream(new BufferedInputStream(edits));
+ // Keep track of the file offsets of the last several opcodes.
+ // This is handy when manually recovering corrupted edits files.
+ PositionTrackingInputStream tracker =
+ new PositionTrackingInputStream(new BufferedInputStream(edits));
+ long recentOpcodeOffsets[] = new long[4];
+ Arrays.fill(recentOpcodeOffsets, -1);
+
+ DataInputStream in = new DataInputStream(tracker);
try {
// Read log file version. Could be missing.
in.mark(4);
@@ -542,6 +552,8 @@ public class FSEditLog {
} catch (EOFException e) {
break; // no more transactions
}
+ recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] =
+ tracker.getPos();
numEdits++;
switch (opcode) {
case OP_ADD:
@@ -850,21 +862,34 @@ public class FSEditLog {
}
}
}
- } catch (IOException ex) {
- // Failed to load 0.20.203 version edits during upgrade. This version has
- // conflicting opcodes with the later releases. The editlog must be
- // emptied by restarting the namenode, before proceeding with the upgrade.
+ } catch (Throwable t) {
+ // Catch Throwable because in the case of a truly corrupt edits log, any
+ // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.)
if (Storage.is203LayoutVersion(logVersion) &&
logVersion != FSConstants.LAYOUT_VERSION) {
+ // Failed to load 0.20.203 version edits during upgrade. This version has
+ // conflicting opcodes with the later releases. The editlog must be
+ // emptied by restarting the namenode, before proceeding with the upgrade.
String msg = "During upgrade, failed to load the editlog version " +
- logVersion + " from release 0.20.203. Please go back to the old " +
- " release and restart the namenode. This empties the editlog " +
- " and saves the namespace. Resume the upgrade after this step.";
- throw new IOException(msg, ex);
- } else {
- throw ex;
+ logVersion + " from release 0.20.203. Please go back to the old " +
+ " release and restart the namenode. This empties the editlog " +
+ " and saves the namespace. Resume the upgrade after this step.";
+ throw new IOException(msg, t);
}
-
+ StringBuilder sb = new StringBuilder();
+ sb.append("Error replaying edit log at offset " + tracker.getPos());
+ if (recentOpcodeOffsets[0] != -1) {
+ Arrays.sort(recentOpcodeOffsets);
+ sb.append("\nRecent opcode offsets:");
+ for (long offset : recentOpcodeOffsets) {
+ if (offset != -1) {
+ sb.append(' ').append(offset);
+ }
+ }
+ }
+ String errorMessage = sb.toString();
+ FSImage.LOG.error(errorMessage);
+ throw new IOException(errorMessage, t);
} finally {
in.close();
}
@@ -1407,4 +1432,52 @@ public class FSEditLog {
}
return blocks;
}
+
+ /**
+ * Stream wrapper that keeps track of the current file position.
+ */
+ private static class PositionTrackingInputStream extends FilterInputStream {
+ private long curPos = 0;
+ private long markPos = -1;
+
+ public PositionTrackingInputStream(InputStream is) {
+ super(is);
+ }
+
+ public int read() throws IOException {
+ int ret = super.read();
+ if (ret != -1) curPos++;
+ return ret;
+ }
+
+ public int read(byte[] data) throws IOException {
+ int ret = super.read(data);
+ if (ret > 0) curPos += ret;
+ return ret;
+ }
+
+ public int read(byte[] data, int offset, int length) throws IOException {
+ int ret = super.read(data, offset, length);
+ if (ret > 0) curPos += ret;
+ return ret;
+ }
+
+ public void mark(int limit) {
+ super.mark(limit);
+ markPos = curPos;
+ }
+
+ public void reset() throws IOException {
+ if (markPos == -1) {
+ throw new IOException("Not marked!");
+ }
+ super.reset();
+ curPos = markPos;
+ markPos = -1;
+ }
+
+ public long getPos() {
+ return curPos;
+ }
+ }
}
Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java?rev=1309623&r1=1309622&r2=1309623&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java (original)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java Wed Apr 4 23:21:43 2012
@@ -1001,4 +1001,9 @@ public class MiniDFSCluster {
public String getDataDirectory() {
return data_dir.getAbsolutePath();
}
+
+ public static File getBaseDir() {
+ return new File(System.getProperty(
+ "test.build.data", "build/test/data"), "dfs/");
+ }
}
Added: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java?rev=1309623&view=auto
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java (added)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java Wed Apr 4 23:21:43 2012
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.Test;
+
+public class TestEditLogLoading {
+
+ private static final int NUM_DATA_NODES = 0;
+
+ @Test
+ public void testDisplayRecentEditLogOpCodes() throws IOException {
+ // start a cluster
+ Configuration conf = new Configuration();
+ conf.set("dfs.name.dir", new File(MiniDFSCluster.getBaseDir(), "name").getPath());
+
+ MiniDFSCluster cluster = null;
+ FileSystem fileSys = null;
+ cluster = new MiniDFSCluster(0, conf, NUM_DATA_NODES, true, false, null, null);
+ cluster.waitActive();
+ fileSys = cluster.getFileSystem();
+ final FSNamesystem namesystem = cluster.getNameNode().getNamesystem();
+
+ FSImage fsimage = namesystem.getFSImage();
+ final FSEditLog editLog = fsimage.getEditLog();
+ for (int i = 0; i < 20; i++) {
+ fileSys.mkdirs(new Path("/tmp/tmp" + i));
+ }
+ File editFile = editLog.getFsEditName();
+ System.out.println("edit log file: " + editFile);
+ editLog.close();
+ cluster.shutdown();
+
+ // Corrupt the edits file.
+ long fileLen = editFile.length();
+ RandomAccessFile rwf = new RandomAccessFile(editFile, "rw");
+ rwf.seek(fileLen - 40);
+ for (int i = 0; i < 20; i++) {
+ rwf.write((byte) 2); // FSEditLog.DELETE
+ }
+ rwf.close();
+
+ String expectedErrorMessage = "^Error replaying edit log at offset \\d+\n";
+ expectedErrorMessage += "Recent opcode offsets: (\\d+\\s*){4}$";
+ try {
+ cluster = new MiniDFSCluster(0, conf, NUM_DATA_NODES, false, false, null, null);
+ cluster.waitActive();
+ fail("should not be able to start");
+ } catch (IOException e) {
+ assertTrue("error message contains opcodes message",
+ e.getMessage().matches(expectedErrorMessage));
+ }
+ }
+}