[ https://issues.apache.org/jira/browse/HDFS-17599?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17870826#comment-17870826 ]
ASF GitHub Bot commented on HDFS-17599:
---------------------------------------

haiyang1987 commented on code in PR #6980:
URL: https://github.com/apache/hadoop/pull/6980#discussion_r1703134443


##########
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/mover/TestMover.java:
##########
@@ -1005,6 +1011,146 @@ public void testMoverWithStripedFile() throws Exception {
     }
   }
 
+  @Test(timeout = 300000)
+  public void testMoverWithStripedFileMaintenance() throws Exception {
+    final Configuration conf = new HdfsConfiguration();
+    initConfWithStripe(conf);
+
+    // Start 9 datanodes
+    int numOfDatanodes = 9;
+    int storagesPerDatanode = 2;
+    long capacity = 9 * defaultBlockSize;
+    long[][] capacities = new long[numOfDatanodes][storagesPerDatanode];
+    for (int i = 0; i < numOfDatanodes; i++) {
+      for (int j = 0; j < storagesPerDatanode; j++) {
+        capacities[i][j] = capacity;
+      }
+    }
+    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(numOfDatanodes)
+        .storagesPerDatanode(storagesPerDatanode)
+        .storageTypes(new StorageType[][]{
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD},
+            {StorageType.SSD, StorageType.SSD}})
+        .storageCapacities(capacities)
+        .build();
+
+    try {
+      cluster.waitActive();
+      cluster.getFileSystem().enableErasureCodingPolicy(
+          StripedFileTestUtil.getDefaultECPolicy().getName());
+
+      ClientProtocol client = NameNodeProxies.createProxy(conf,
+          cluster.getFileSystem(0).getUri(), ClientProtocol.class).getProxy();
+      String barDir = "/bar";
+      client.mkdirs(barDir, new FsPermission((short) 777), true);
+      // Set "/bar" directory with ALL_SSD storage policy.
+      client.setStoragePolicy(barDir, "ALL_SSD");
+      // Set an EC policy on "/bar" directory
+      client.setErasureCodingPolicy(barDir,
+          StripedFileTestUtil.getDefaultECPolicy().getName());
+
+      // Write file to barDir
+      final String fooFile = "/bar/foo";
+      long fileLen = 6 * defaultBlockSize;
+      DFSTestUtil.createFile(cluster.getFileSystem(), new Path(fooFile),
+          fileLen, (short) 3, 0);
+
+      // Verify storage types and locations
+      LocatedBlocks locatedBlocks =
+          client.getBlockLocations(fooFile, 0, fileLen);
+      DatanodeInfoWithStorage location = null;
+      for (LocatedBlock lb : locatedBlocks.getLocatedBlocks()) {
+        location = lb.getLocations()[8];
+        for (StorageType type : lb.getStorageTypes()) {
+          Assert.assertEquals(StorageType.SSD, type);
+        }
+      }
+
+      // Maintain the last datanode later
+      FSNamesystem ns = cluster.getNamesystem(0);
+      DatanodeManager datanodeManager = ns.getBlockManager().getDatanodeManager();
+      DatanodeDescriptor dn = datanodeManager.getDatanode(location.getDatanodeUuid());
+
+      StripedFileTestUtil.verifyLocatedStripedBlocks(locatedBlocks,
+          dataBlocks + parityBlocks);
+
+      // Start 5 more datanodes for mover
+      capacities = new long[5][storagesPerDatanode];
+      for (int i = 0; i < 5; i++) {
+        for (int j = 0; j < storagesPerDatanode; j++) {
+          capacities[i][j] = capacity;
+        }
+      }
+      cluster.startDataNodes(conf, 5,
+          new StorageType[][]{
+              {StorageType.DISK, StorageType.DISK},
+              {StorageType.DISK, StorageType.DISK},
+              {StorageType.DISK, StorageType.DISK},
+              {StorageType.DISK, StorageType.DISK},
+              {StorageType.DISK, StorageType.DISK}},
+          true, null, null, null, capacities,
+          null, false, false, false, null, null, null);
+      cluster.triggerHeartbeats();
+
+      // Move blocks to DISK
+      client.setStoragePolicy(barDir, "HOT");
+      int rc = ToolRunner.run(conf, new Mover.Cli(),
+          new String[] { "-p", barDir });
+

Review Comment:
   How about adding logic to verify the number of DISK storage types? For example:
   ```
   // Verify the number of DISK storage types
   waitForLocatedBlockWithDiskStorageType(cluster.getFileSystem(), fooFile, 5);
   ```


> Fix the mismatch between locations and indices for mover
> --------------------------------------------------------
>
>                 Key: HDFS-17599
>                 URL: https://issues.apache.org/jira/browse/HDFS-17599
>             Project: Hadoop HDFS
>          Issue Type: Bug
>    Affects Versions: 3.3.0, 3.4.0
>            Reporter: Tao Li
>            Assignee: Tao Li
>            Priority: Major
>              Labels: pull-request-available
>         Attachments: image-2024-08-03-17-59-08-059.png, image-2024-08-03-18-00-01-950.png
>
>
> We set the EC policy to (6+3) and also had nodes in the ENTERING_MAINTENANCE state.
>
> When we moved the data of some directories from SSD to HDD, some blocks failed to move because the target disk was full, as shown in the figure below (blk_-9223372033441574269).
> We tried the move again and got the error "{color:#ff0000}Replica does not exist{color}".
> Looking at the fsck output, we found that the wrong block id (blk_-9223372033441574270) was used when moving the block.
>
> {*}Mover Logs{*}:
> !image-2024-08-03-17-59-08-059.png|width=741,height=85!
>
> {*}FSCK Info{*}:
> !image-2024-08-03-18-00-01-950.png|width=738,height=120!
>
> {*}Root Cause{*}:
> Similar to HDFS-16333, when the mover is initialized, only `LIVE` nodes are processed. As a result, the datanode in the `ENTERING_MAINTENANCE` state is filtered out of the locations when `DBlockStriped` is initialized, but the indices are not adjusted accordingly, so the lengths of locations and indices no longer match.
> Finally, the wrong block id is computed for the EC block when getting the internal block (see `DBlockStriped#getInternalBlock`).
>
> We added debug logs, and a few key messages are shown below.
> {color:#ff0000}The result is an incorrect correspondence: xx.xx.7.31 -> -9223372033441574270{color}.
> {code:java}
> DBlock getInternalBlock(StorageGroup storage) {
>   // storage == xx.xx.7.31
>   // idxInLocs == 1 (locations are [xx.xx.85.29:DISK, xx.xx.7.31:DISK, xx.xx.207.22:DISK,
>   // xx.xx.8.25:DISK, xx.xx.79.30:DISK, xx.xx.87.21:DISK, xx.xx.8.38:DISK];
>   // xx.xx.179.31, which is in the ENTERING_MAINTENANCE state, has been filtered out)
>   int idxInLocs = locations.indexOf(storage);
>   if (idxInLocs == -1) {
>     return null;
>   }
>   // idxInGroup == 2 (indices is [1, 2, 3, 4, 5, 6, 7, 8])
>   byte idxInGroup = indices[idxInLocs];
>   // blkId: -9223372033441574272 + 2 = -9223372033441574270
>   long blkId = getBlock().getBlockId() + idxInGroup;
>   long numBytes = getInternalBlockLength(getNumBytes(), cellSize,
>       dataBlockNum, idxInGroup);
>   Block blk = new Block(getBlock());
>   blk.setBlockId(blkId);
>   blk.setNumBytes(numBytes);
>   DBlock dblk = new DBlock(blk);
>   dblk.addLocation(storage);
>   return dblk;
> }
> {code}
> {*}Solution{*}:
> When initializing `DBlockStriped`, if any location is filtered out, remove the corresponding element from the indices so that the two arrays stay aligned.
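
For illustration of the idea in the Solution above, here is a minimal, self-contained sketch. It is not the actual HDFS-17599 patch; the class and names (StripedIndicesFilterSketch, filterIndices, locationKept) are hypothetical, and the assumption that the maintenance node held internal block index 1 and was the first entry in the unfiltered locations is made up for the example.

{code:java}
import java.util.Arrays;

// Illustration only, not the actual HDFS-17599 patch: when the locations of a
// striped block group are filtered (here, dropping an ENTERING_MAINTENANCE
// datanode), the parallel indices array must be filtered with the same mask.
// Otherwise indices[idxInLocs] names the wrong internal block, and the block
// id computed as "block group id + index" is wrong, as in the debug log above.
public class StripedIndicesFilterSketch {

  // Keep only the index entries whose corresponding location survived the
  // filtering, so that locations and indices stay aligned and equally long.
  static byte[] filterIndices(byte[] indices, boolean[] locationKept) {
    byte[] adjusted = new byte[indices.length];
    int kept = 0;
    for (int i = 0; i < indices.length; i++) {
      if (locationKept[i]) {
        adjusted[kept++] = indices[i];
      }
    }
    return Arrays.copyOf(adjusted, kept);
  }

  public static void main(String[] args) {
    long blockGroupId = -9223372033441574272L;
    // Assumption for illustration: the maintenance node held internal block
    // index 1 and was the first entry in the unfiltered locations.
    byte[] indices = {1, 2, 3, 4, 5, 6, 7, 8};
    boolean[] locationKept = {false, true, true, true, true, true, true, true};
    byte[] adjusted = filterIndices(indices, locationKept);

    // xx.xx.7.31 is the second surviving location (idxInLocs == 1).
    int idxInLocs = 1;
    // Without the adjustment: -9223372033441574270, the replica the mover
    // reports as "Replica does not exist".
    System.out.println("without fix: " + (blockGroupId + indices[idxInLocs]));
    // With the adjustment: -9223372033441574269, the internal block that fsck
    // actually shows on that node.
    System.out.println("with fix:    " + (blockGroupId + adjusted[idxInLocs]));
  }
}
{code}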