Author: wang Date: Wed Aug 14 23:38:07 2013 New Revision: 1514097 URL: http://svn.apache.org/r1514097 Log: HDFS-4816. transitionToActive blocks if the SBN is doing checkpoint image transfer. (Andrew Wang)
Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1514097&r1=1514096&r2=1514097&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Wed Aug 14 23:38:07 2013 @@ -44,6 +44,9 @@ Release 2.3.0 - UNRELEASED HDFS-5065. TestSymlinkHdfsDisable fails on Windows. (ivanmi) + HDFS-4816. transitionToActive blocks if the SBN is doing checkpoint image + transfer. (Andrew Wang) + Release 2.1.1-beta - UNRELEASED INCOMPATIBLE CHANGES Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java?rev=1514097&r1=1514096&r2=1514097&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java Wed Aug 14 23:38:07 2013 @@ -17,9 +17,17 @@ */ package org.apache.hadoop.hdfs.server.namenode.ha; +import static org.apache.hadoop.util.Time.now; + import java.io.IOException; import java.net.InetSocketAddress; import java.security.PrivilegedAction; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.ThreadFactory; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -38,10 +46,10 @@ import org.apache.hadoop.hdfs.util.Cance import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.UserGroupInformation; -import static org.apache.hadoop.util.Time.now; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.util.concurrent.ThreadFactoryBuilder; /** * Thread which runs inside the NN when it's in Standby state, @@ -57,6 +65,7 @@ public class StandbyCheckpointer { private final FSNamesystem namesystem; private long lastCheckpointTime; private final CheckpointerThread thread; + private final ThreadFactory uploadThreadFactory; private String activeNNAddress; private InetSocketAddress myNNAddress; @@ -72,6 +81,8 @@ public class StandbyCheckpointer { this.namesystem = ns; this.checkpointConf = new CheckpointConf(conf); this.thread = new CheckpointerThread(); + this.uploadThreadFactory = new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat("TransferFsImageUpload-%d").build(); setNameNodeAddresses(conf); } @@ -142,7 +153,7 @@ public class StandbyCheckpointer { private void doCheckpoint() throws InterruptedException, IOException { assert canceler != null; - long txid; + final long txid; namesystem.writeLockInterruptibly(); try { @@ -171,9 +182,26 @@ public class StandbyCheckpointer { } // Upload the saved checkpoint back to the active - TransferFsImage.uploadImageFromStorage( - activeNNAddress, myNNAddress, - namesystem.getFSImage().getStorage(), txid); + // Do this in a separate thread to avoid blocking transition to active + // See HDFS-4816 + ExecutorService executor = + Executors.newSingleThreadExecutor(uploadThreadFactory); + Future<Void> upload = executor.submit(new Callable<Void>() { + @Override + public Void call() throws IOException { + TransferFsImage.uploadImageFromStorage( + activeNNAddress, myNNAddress, + namesystem.getFSImage().getStorage(), txid); + return null; + } + }); + executor.shutdown(); + try { + upload.get(); + } catch (ExecutionException e) { + throw new IOException("Exception during image upload: " + e.getMessage(), + e.getCause()); + } } /** @@ -301,6 +329,7 @@ public class StandbyCheckpointer { LOG.info("Checkpoint was cancelled: " + ce.getMessage()); canceledCount++; } catch (InterruptedException ie) { + LOG.info("Interrupted during checkpointing", ie); // Probably requested shutdown. continue; } catch (Throwable t) { Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java?rev=1514097&r1=1514096&r2=1514097&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java Wed Aug 14 23:38:07 2013 @@ -239,6 +239,34 @@ public class TestStandbyCheckpoints { assertTrue(canceledOne); } + + /** + * Test cancellation of ongoing checkpoints when failover happens + * mid-checkpoint during image upload from standby to active NN. + */ + @Test(timeout=60000) + public void testCheckpointCancellationDuringUpload() throws Exception { + // don't compress, we want a big image + cluster.getConfiguration(0).setBoolean( + DFSConfigKeys.DFS_IMAGE_COMPRESS_KEY, false); + cluster.getConfiguration(1).setBoolean( + DFSConfigKeys.DFS_IMAGE_COMPRESS_KEY, false); + // Throttle SBN upload to make it hang during upload to ANN + cluster.getConfiguration(1).setLong( + DFSConfigKeys.DFS_IMAGE_TRANSFER_RATE_KEY, 100); + cluster.restartNameNode(0); + cluster.restartNameNode(1); + nn0 = cluster.getNameNode(0); + nn1 = cluster.getNameNode(1); + + cluster.transitionToActive(0); + + doEdits(0, 100); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(104)); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + } /** * Make sure that clients will receive StandbyExceptions even when a