[
https://issues.apache.org/jira/browse/HDFS-15175?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17133970#comment-17133970
]
Wan Chang commented on HDFS-15175:
----------------------------------
It seems that I can't attach a file in a comment, so here is the diff content:
{code:java}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
index 963628f9ac4..2d5ed13b6f9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
@@ -412,6 +412,17 @@ private static void write(List<AclEntry> aclEntries, DataOutputStream out)
return PBHelperClient.convertXAttrs(proto.getXAttrsList());
}
+ private static Block[] deepCopy(Block[] blocks) {
+ if (blocks == null || blocks.length == 0) {
+ return blocks;
+ }
+ Block[] copy = new Block[blocks.length];
+ for (int i = 0; i < blocks.length; ++i) {
+ copy[i] = blocks[i] == null ? null : new Block(blocks[i]);
+ }
+ return copy;
+ }
+
@SuppressWarnings("unchecked")
static abstract class AddCloseOp
extends FSEditLogOp
@@ -500,7 +511,7 @@ public String getPath() {
throw new RuntimeException("Can't have more than " + MAX_BLOCKS +
" in an AddCloseOp.");
}
- this.blocks = blocks;
+ this.blocks = FSEditLogOp.deepCopy(blocks);
return (T)this;
}
@@ -978,7 +989,7 @@ public String getPath() {
}
AddBlockOp setPenultimateBlock(Block pBlock) {
- this.penultimateBlock = pBlock;
+ this.penultimateBlock = pBlock == null ? null : new Block(pBlock);
return this;
}
@@ -987,7 +998,7 @@ Block getPenultimateBlock() {
}
AddBlockOp setLastBlock(Block lastBlock) {
- this.lastBlock = lastBlock;
+ this.lastBlock = lastBlock == null ? null : new Block(lastBlock);
return this;
}
@@ -1090,7 +1101,7 @@ public String getPath() {
}
UpdateBlocksOp setBlocks(Block[] blocks) {
- this.blocks = blocks;
+ this.blocks = FSEditLogOp.deepCopy(blocks);
return this;
}
@@ -2881,7 +2892,8 @@ TruncateOp setTimestamp(long timestamp) {
}
TruncateOp setTruncateBlock(Block truncateBlock) {
- this.truncateBlock = truncateBlock;
+ this.truncateBlock = truncateBlock == null ?
+ null : new Block(truncateBlock);
return this;
}
{code}
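For context, here is a small sketch (not part of the patch; the class name and scaffolding are mine) of why the defensive copy matters. Two ops that hold the same Block instance both observe a later mutation, while copying with new Block(...) isolates the earlier op. The block id, sizes and genstamp are taken from the editlog excerpt below.
{code:java}
import org.apache.hadoop.hdfs.protocol.Block;

public class SharedBlockDemo {
  public static void main(String[] args) {
    // Block as it looked when the first close happened:
    // id 5568434562, 185818648 bytes, genstamp 4495417845.
    Block shared = new Block(5568434562L, 185818648L, 4495417845L);

    // Without the patch, a CloseOp-style holder keeps the caller's instance.
    Block heldByFirstCloseOp = shared;
    // With the patch, the holder keeps its own copy instead.
    Block heldWithDeepCopy = new Block(shared);

    // A later truncate shrinks the very same Block instance before the
    // first op has been serialized to the JournalNode.
    shared.setNumBytes(185818644L);

    // 185818644 - the first CloseOp now carries the wrong size.
    System.out.println(heldByFirstCloseOp.getNumBytes());
    // 185818648 - the deep copy is unaffected.
    System.out.println(heldWithDeepCopy.getNumBytes());
  }
}
{code}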
> Multiple CloseOp shared block instance causes the standby namenode to crash when rolling editlog
> ------------------------------------------------------------------------------------------------
>
> Key: HDFS-15175
> URL: https://issues.apache.org/jira/browse/HDFS-15175
> Project: Hadoop HDFS
> Issue Type: Bug
> Affects Versions: 2.9.2
> Reporter: Yicong Cai
> Assignee: Yicong Cai
> Priority: Critical
>
>
> {panel:title=Crash exception}
> 2020-02-16 09:24:46,426 [507844305] - ERROR [Edit log tailer:FSEditLogLoader@245] - Encountered exception on operation CloseOp [length=0, inodeId=0, path=..., replication=3, mtime=1581816138774, atime=1581814760398, blockSize=536870912, blocks=[blk_5568434562_4495417845], permissions=da_music:hdfs:rw-r-----, aclEntries=null, clientName=, clientMachine=, overwrite=false, storagePolicyId=0, opCode=OP_CLOSE, txid=32625024993]
> java.io.IOException: File is not under construction: ......
> at org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.applyEditLogOp(FSEditLogLoader.java:442)
> at org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:237)
> at org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:146)
> at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:891)
> at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:872)
> at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer.doTailEdits(EditLogTailer.java:262)
> at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.doWork(EditLogTailer.java:395)
> at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.access$300(EditLogTailer.java:348)
> at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread$1.run(EditLogTailer.java:365)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:360)
> at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1873)
> at org.apache.hadoop.security.SecurityUtil.doAsLoginUserOrFatal(SecurityUtil.java:479)
> at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.run(EditLogTailer.java:361)
> {panel}
>
> {panel:title=Editlog}
> <RECORD>
> <OPCODE>OP_REASSIGN_LEASE</OPCODE>
> <DATA>
> <TXID>32625021150</TXID>
> <LEASEHOLDER>DFSClient_NONMAPREDUCE_-969060727_197760</LEASEHOLDER>
> <PATH>......</PATH>
> <NEWHOLDER>DFSClient_NONMAPREDUCE_1000868229_201260</NEWHOLDER>
> </DATA>
> </RECORD>
> ......
> <RECORD>
> <OPCODE>OP_CLOSE</OPCODE>
> <DATA>
> <TXID>32625023743</TXID>
> <LENGTH>0</LENGTH>
> <INODEID>0</INODEID>
> <PATH>......</PATH>
> <REPLICATION>3</REPLICATION>
> <MTIME>1581816135883</MTIME>
> <ATIME>1581814760398</ATIME>
> <BLOCKSIZE>536870912</BLOCKSIZE>
> <CLIENT_NAME></CLIENT_NAME>
> <CLIENT_MACHINE></CLIENT_MACHINE>
> <OVERWRITE>false</OVERWRITE>
> <BLOCK>
> <BLOCK_ID>5568434562</BLOCK_ID>
> <NUM_BYTES>185818644</NUM_BYTES>
> <GENSTAMP>4495417845</GENSTAMP>
> </BLOCK>
> <PERMISSION_STATUS>
> <USERNAME>da_music</USERNAME>
> <GROUPNAME>hdfs</GROUPNAME>
> <MODE>416</MODE>
> </PERMISSION_STATUS>
> </DATA>
> </RECORD>
> ......
> <RECORD>
> <OPCODE>OP_TRUNCATE</OPCODE>
> <DATA>
> <TXID>32625024049</TXID>
> <SRC>......</SRC>
> <CLIENTNAME>DFSClient_NONMAPREDUCE_1000868229_201260</CLIENTNAME>
> <CLIENTMACHINE>......</CLIENTMACHINE>
> <NEWLENGTH>185818644</NEWLENGTH>
> <TIMESTAMP>1581816136336</TIMESTAMP>
> <BLOCK>
> <BLOCK_ID>5568434562</BLOCK_ID>
> <NUM_BYTES>185818648</NUM_BYTES>
> <GENSTAMP>4495417845</GENSTAMP>
> </BLOCK>
> </DATA>
> </RECORD>
> ......
> <RECORD>
> <OPCODE>OP_CLOSE</OPCODE>
> <DATA>
> <TXID>32625024993</TXID>
> <LENGTH>0</LENGTH>
> <INODEID>0</INODEID>
> <PATH>......</PATH>
> <REPLICATION>3</REPLICATION>
> <MTIME>1581816138774</MTIME>
> <ATIME>1581814760398</ATIME>
> <BLOCKSIZE>536870912</BLOCKSIZE>
> <CLIENT_NAME></CLIENT_NAME>
> <CLIENT_MACHINE></CLIENT_MACHINE>
> <OVERWRITE>false</OVERWRITE>
> <BLOCK>
> <BLOCK_ID>5568434562</BLOCK_ID>
> <NUM_BYTES>185818644</NUM_BYTES>
> <GENSTAMP>4495417845</GENSTAMP>
> </BLOCK>
> <PERMISSION_STATUS>
> <USERNAME>da_music</USERNAME>
> <GROUPNAME>hdfs</GROUPNAME>
> <MODE>416</MODE>
> </PERMISSION_STATUS>
> </DATA>
> </RECORD>
> {panel}
>
>
> The block size in the first CloseOp should be 185818648. After the truncate, the
> block size becomes 185818644. The CloseOp/TruncateOp/CloseOp sequence is
> synchronized to the JournalNode in the same batch, and both CloseOps hold the
> same Block instance, so the first CloseOp is recorded with the wrong block
> size. When the SNN rolls the editlog, TruncateOp does not put the file back
> into the UnderConstruction state. Then, when the second CloseOp is replayed,
> the file is not under construction and the SNN crashes.