[ https://issues.apache.org/jira/browse/HDFS-14446?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
HuangTao updated HDFS-14446:
----------------------------
    Description: 
{code:java}
[2019-04-20T18:59:16.564+08:00] [INFO] [org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker$1@44e74aa0] : BlockRecoveryWorker: NameNode at /XXX.YYY.134.11:8021 calls recoverBlock(BP-1516815134-XXX.YYY.134.11-1555507434409:blk_-9223372036854775424_1037, targets=[DatanodeInfoWithStorage[XXX.YYY.134.135:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.202:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.196:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.197:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.138:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.199:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.235:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.134:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.35:50010,null,null]], newGenerationStamp=1038, newBlock=null, isStriped=true)
[2019-04-20T18:59:17.298+08:00] [INFO] [org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker$1@44e74aa0] : initReplicaRecovery: blk_-9223372036854775417_1037, recoveryId=1038, replica=null
[2019-04-20T18:59:17.398+08:00] [WARN] [org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker$1@44e74aa0] : recoverBlocks FAILED: RecoveringStripedBlock{BP-1516815134-XXX.YYY.134.11-1555507434409:blk_-9223372036854775424_1037; getBlockSize()=0; corrupt=false; offset=-1; locs=[DatanodeInfoWithStorage[XXX.YYY.134.135:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.202:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.196:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.197:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.138:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.199:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.235:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.134:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.35:50010,null,null]]}
java.io.IOException: BP-1516815134-XXX.YYY.134.11-1555507434409:blk_-9223372036854775424_1037 has no enough internal blocks, unable to start recovery. Locations=[DatanodeInfoWithStorage[XXX.YYY.134.135:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.202:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.196:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.197:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.138:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.199:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.235:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.134:50010,null,null], DatanodeInfoWithStorage[XXX.YYY.134.35:50010,null,null]]
	at org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker$RecoveryTaskStriped.checkLocations(BlockRecoveryWorker.java:532)
	at org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker$RecoveryTaskStriped.recover(BlockRecoveryWorker.java:437)
	at org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker$1.run(BlockRecoveryWorker.java:602)
	at java.lang.Thread.run(Thread.java:745)
{code}
The NameNode tries to recover an under-construction erasure-coded file, but no RBW file for the last block group was ever written on the DataNodes. The DataNode therefore cannot find enough internal blocks and never tells the NameNode to delete the last block, which causes the NameNode to loop forever while trying to release the file's lease.
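For context, the exception above comes from the internal-block count guard in RecoveryTaskStriped: striped block recovery is aborted when fewer usable internal blocks than the EC policy's number of data units are found. The snippet below is only a condensed, self-contained sketch of that check, not the Hadoop source; the class, constructor, and fields are invented for illustration, and it assumes an RS-6-3 style policy with 6 data units. Because the check throws before any recovery work is done, the DataNode never sends the commitBlockSynchronization call that would let the NameNode finalize or drop the block, so the lease monitor keeps scheduling the same recovery.
{code:java}
import java.io.IOException;
import java.util.Arrays;

// Minimal sketch (NOT the actual Hadoop class) of the guard that fails in
// BlockRecoveryWorker$RecoveryTaskStriped.checkLocations(...): striped block
// recovery is refused when fewer internal blocks than the EC policy's data
// units (e.g. 6 for an RS-6-3 policy) are available on the DataNodes.
public class StripedRecoveryCheckSketch {

  private final String block;     // e.g. "blk_-9223372036854775424_1037"
  private final String[] locs;    // DataNode locations handed down by the NameNode
  private final int numDataUnits; // data units of the EC policy, e.g. 6

  StripedRecoveryCheckSketch(String block, String[] locs, int numDataUnits) {
    this.block = block;
    this.locs = locs;
    this.numDataUnits = numDataUnits;
  }

  // locationCount = internal blocks for which initReplicaRecovery actually found
  // a replica; "replica=null" replies, as in the log above, do not count.
  void checkLocations(int locationCount) throws IOException {
    if (locationCount < numDataUnits) {
      throw new IOException(block + " has no enough internal blocks"
          + ", unable to start recovery. Locations=" + Arrays.asList(locs));
    }
  }

  public static void main(String[] args) {
    StripedRecoveryCheckSketch sketch = new StripedRecoveryCheckSketch(
        "blk_-9223372036854775424_1037", new String[] {"dn1", "dn2"}, 6);
    try {
      // No RBW replica was ever written, so zero internal blocks are usable.
      sketch.checkLocations(0);
    } catch (IOException e) {
      System.out.println("recovery aborted: " + e.getMessage());
    }
  }
}
{code}
Running the sketch reproduces the same "has no enough internal blocks, unable to start recovery" message seen in the log above.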
> No enough internal blocks causes NN infinitely loop on releasing lease
> ----------------------------------------------------------------------
>
>                 Key: HDFS-14446
>                 URL: https://issues.apache.org/jira/browse/HDFS-14446
>             Project: Hadoop HDFS
>          Issue Type: Bug
>          Components: datanode, erasure-coding, namenode
>    Affects Versions: 3.3.0
>            Reporter: HuangTao
>            Priority: Major