Author: wang Date: Thu Jan 30 00:00:40 2014 New Revision: 1562646 URL: http://svn.apache.org/r1562646 Log: HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools.
Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1562646&r1=1562645&r2=1562646&view=diff ============================================================================== --- hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original) +++ hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Thu Jan 30 00:00:40 2014 @@ -38,7 +38,6 @@ Release 2.3.0 - UNRELEASED HDFS-4949. Centralized cache management in HDFS (wang and cmccabe) - IMPROVEMENTS HDFS-5360. Improvement of usage message of renameSnapshot and @@ -522,6 +521,9 @@ Release 2.3.0 - UNRELEASED HDFS-5721. sharedEditsImage in Namenode#initializeSharedEdits() should be closed before method returns (Ted Yu via todd) + HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools. + (wang) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. Add storage type to the protocol and expose it in block report Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java?rev=1562646&r1=1562645&r2=1562646&view=diff ============================================================================== --- hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java (original) +++ hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java Thu Jan 30 00:00:40 2014 @@ -193,6 +193,17 @@ public final class CacheManager { } + /** + * Resets all tracked directives and pools. Called during 2NN checkpointing to + * reset FSNamesystem state. See {FSNamesystem{@link #clear()}. + */ + void clear() { + directivesById.clear(); + directivesByPath.clear(); + cachePools.clear(); + nextDirectiveId = 1; + } + public void startMonitorThread() { crmLock.lock(); try { Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=1562646&r1=1562645&r2=1562646&view=diff ============================================================================== --- hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original) +++ hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Thu Jan 30 00:00:40 2014 @@ -521,6 +521,7 @@ public class FSNamesystem implements Nam leaseManager.removeAllLeases(); inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID); snapshotManager.clearSnapshottableDirs(); + cacheManager.clear(); } @VisibleForTesting Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java?rev=1562646&r1=1562645&r2=1562646&view=diff ============================================================================== --- hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java (original) +++ hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java Thu Jan 30 00:00:40 2014 @@ -1001,7 +1001,12 @@ public class SecondaryNameNode implement sig.mostRecentCheckpointTxId + " even though it should have " + "just been downloaded"); } - dstImage.reloadFromImageFile(file, dstNamesystem); + dstNamesystem.writeLock(); + try { + dstImage.reloadFromImageFile(file, dstNamesystem); + } finally { + dstNamesystem.writeUnlock(); + } dstNamesystem.dir.imageLoadComplete(); } // error simulation code for junit test Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java?rev=1562646&r1=1562645&r2=1562646&view=diff ============================================================================== --- hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java (original) +++ hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java Thu Jan 30 00:00:40 2014 @@ -69,6 +69,7 @@ import org.apache.hadoop.hdfs.protocol.C import org.apache.hadoop.hdfs.protocol.CachePoolStats; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type; import org.apache.hadoop.hdfs.server.datanode.DataNode; @@ -528,77 +529,111 @@ public class TestCacheDirectives { @Test(timeout=60000) public void testCacheManagerRestart() throws Exception { - // Create and validate a pool - final String pool = "poolparty"; - String groupName = "partygroup"; - FsPermission mode = new FsPermission((short)0777); - long limit = 747; - dfs.addCachePool(new CachePoolInfo(pool) - .setGroupName(groupName) - .setMode(mode) - .setLimit(limit)); - RemoteIterator<CachePoolEntry> pit = dfs.listCachePools(); - assertTrue("No cache pools found", pit.hasNext()); - CachePoolInfo info = pit.next().getInfo(); - assertEquals(pool, info.getPoolName()); - assertEquals(groupName, info.getGroupName()); - assertEquals(mode, info.getMode()); - assertEquals(limit, (long)info.getLimit()); - assertFalse("Unexpected # of cache pools found", pit.hasNext()); - - // Create some cache entries - int numEntries = 10; - String entryPrefix = "/party-"; - long prevId = -1; - final Date expiry = new Date(); - for (int i=0; i<numEntries; i++) { - prevId = dfs.addCacheDirective( - new CacheDirectiveInfo.Builder(). - setPath(new Path(entryPrefix + i)).setPool(pool). - setExpiration( - CacheDirectiveInfo.Expiration.newAbsolute(expiry.getTime())). - build()); - } - RemoteIterator<CacheDirectiveEntry> dit - = dfs.listCacheDirectives(null); - for (int i=0; i<numEntries; i++) { - assertTrue("Unexpected # of cache entries: " + i, dit.hasNext()); - CacheDirectiveInfo cd = dit.next().getInfo(); - assertEquals(i+1, cd.getId().longValue()); - assertEquals(entryPrefix + i, cd.getPath().toUri().getPath()); - assertEquals(pool, cd.getPool()); - } - assertFalse("Unexpected # of cache directives found", dit.hasNext()); - - // Restart namenode - cluster.restartNameNode(); + SecondaryNameNode secondary = null; + try { + // Start a secondary namenode + conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, + "0.0.0.0:0"); + secondary = new SecondaryNameNode(conf); - // Check that state came back up - pit = dfs.listCachePools(); - assertTrue("No cache pools found", pit.hasNext()); - info = pit.next().getInfo(); - assertEquals(pool, info.getPoolName()); - assertEquals(pool, info.getPoolName()); - assertEquals(groupName, info.getGroupName()); - assertEquals(mode, info.getMode()); - assertEquals(limit, (long)info.getLimit()); - assertFalse("Unexpected # of cache pools found", pit.hasNext()); + // Create and validate a pool + final String pool = "poolparty"; + String groupName = "partygroup"; + FsPermission mode = new FsPermission((short)0777); + long limit = 747; + dfs.addCachePool(new CachePoolInfo(pool) + .setGroupName(groupName) + .setMode(mode) + .setLimit(limit)); + RemoteIterator<CachePoolEntry> pit = dfs.listCachePools(); + assertTrue("No cache pools found", pit.hasNext()); + CachePoolInfo info = pit.next().getInfo(); + assertEquals(pool, info.getPoolName()); + assertEquals(groupName, info.getGroupName()); + assertEquals(mode, info.getMode()); + assertEquals(limit, (long)info.getLimit()); + assertFalse("Unexpected # of cache pools found", pit.hasNext()); + + // Create some cache entries + int numEntries = 10; + String entryPrefix = "/party-"; + long prevId = -1; + final Date expiry = new Date(); + for (int i=0; i<numEntries; i++) { + prevId = dfs.addCacheDirective( + new CacheDirectiveInfo.Builder(). + setPath(new Path(entryPrefix + i)).setPool(pool). + setExpiration( + CacheDirectiveInfo.Expiration.newAbsolute(expiry.getTime())). + build()); + } + RemoteIterator<CacheDirectiveEntry> dit + = dfs.listCacheDirectives(null); + for (int i=0; i<numEntries; i++) { + assertTrue("Unexpected # of cache entries: " + i, dit.hasNext()); + CacheDirectiveInfo cd = dit.next().getInfo(); + assertEquals(i+1, cd.getId().longValue()); + assertEquals(entryPrefix + i, cd.getPath().toUri().getPath()); + assertEquals(pool, cd.getPool()); + } + assertFalse("Unexpected # of cache directives found", dit.hasNext()); + + // Checkpoint once to set some cache pools and directives on 2NN side + secondary.doCheckpoint(); + + // Add some more CacheManager state + final String imagePool = "imagePool"; + dfs.addCachePool(new CachePoolInfo(imagePool)); + prevId = dfs.addCacheDirective(new CacheDirectiveInfo.Builder() + .setPath(new Path("/image")).setPool(imagePool).build()); + + // Save a new image to force a fresh fsimage download + dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER); + dfs.saveNamespace(); + dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE); + + // Checkpoint again forcing a reload of FSN state + boolean fetchImage = secondary.doCheckpoint(); + assertTrue("Secondary should have fetched a new fsimage from NameNode", + fetchImage); + + // Remove temp pool and directive + dfs.removeCachePool(imagePool); + + // Restart namenode + cluster.restartNameNode(); + + // Check that state came back up + pit = dfs.listCachePools(); + assertTrue("No cache pools found", pit.hasNext()); + info = pit.next().getInfo(); + assertEquals(pool, info.getPoolName()); + assertEquals(pool, info.getPoolName()); + assertEquals(groupName, info.getGroupName()); + assertEquals(mode, info.getMode()); + assertEquals(limit, (long)info.getLimit()); + assertFalse("Unexpected # of cache pools found", pit.hasNext()); + + dit = dfs.listCacheDirectives(null); + for (int i=0; i<numEntries; i++) { + assertTrue("Unexpected # of cache entries: " + i, dit.hasNext()); + CacheDirectiveInfo cd = dit.next().getInfo(); + assertEquals(i+1, cd.getId().longValue()); + assertEquals(entryPrefix + i, cd.getPath().toUri().getPath()); + assertEquals(pool, cd.getPool()); + assertEquals(expiry.getTime(), cd.getExpiration().getMillis()); + } + assertFalse("Unexpected # of cache directives found", dit.hasNext()); - dit = dfs.listCacheDirectives(null); - for (int i=0; i<numEntries; i++) { - assertTrue("Unexpected # of cache entries: " + i, dit.hasNext()); - CacheDirectiveInfo cd = dit.next().getInfo(); - assertEquals(i+1, cd.getId().longValue()); - assertEquals(entryPrefix + i, cd.getPath().toUri().getPath()); - assertEquals(pool, cd.getPool()); - assertEquals(expiry.getTime(), cd.getExpiration().getMillis()); + long nextId = dfs.addCacheDirective( + new CacheDirectiveInfo.Builder(). + setPath(new Path("/foobar")).setPool(pool).build()); + assertEquals(prevId + 1, nextId); + } finally { + if (secondary != null) { + secondary.shutdown(); + } } - assertFalse("Unexpected # of cache directives found", dit.hasNext()); - - long nextId = dfs.addCacheDirective( - new CacheDirectiveInfo.Builder(). - setPath(new Path("/foobar")).setPool(pool).build()); - assertEquals(prevId + 1, nextId); } /** Modified: hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java?rev=1562646&r1=1562645&r2=1562646&view=diff ============================================================================== --- hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java (original) +++ hadoop/common/branches/branch-2.3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java Thu Jan 30 00:00:40 2014 @@ -1637,7 +1637,7 @@ public class TestCheckpoint { * Test that the secondary namenode correctly deletes temporary edits * on startup. */ - @Test(timeout = 30000) + @Test(timeout = 60000) public void testDeleteTemporaryEditsOnStartup() throws IOException { Configuration conf = new HdfsConfiguration(); SecondaryNameNode secondary = null;