[ https://issues.apache.org/jira/browse/HDFS-15175?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17133970#comment-17133970 ]

Wan Chang commented on HDFS-15175:
----------------------------------

It seems that I can't attach a file to a comment.

Here is the diff content:
{code:java}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
index 963628f9ac4..2d5ed13b6f9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
@@ -412,6 +412,17 @@ private static void write(List<AclEntry> aclEntries, DataOutputStream out)
     return PBHelperClient.convertXAttrs(proto.getXAttrsList());
   }
 
+  private static Block[] deepCopy(Block[] blocks) {
+    if (blocks == null || blocks.length == 0) {
+      return blocks;
+    }
+    Block[] copy = new Block[blocks.length];
+    for (int i = 0; i < blocks.length; ++i) {
+      copy[i] = blocks[i] == null ? null : new Block(blocks[i]);
+    }
+    return copy;
+  }
+
   @SuppressWarnings("unchecked")
   static abstract class AddCloseOp
          extends FSEditLogOp
@@ -500,7 +511,7 @@ public String getPath() {
         throw new RuntimeException("Can't have more than " + MAX_BLOCKS +
             " in an AddCloseOp.");
       }
-      this.blocks = blocks;
+      this.blocks = FSEditLogOp.deepCopy(blocks);
       return (T)this;
     }
     
@@ -978,7 +989,7 @@ public String getPath() {
     }
 
     AddBlockOp setPenultimateBlock(Block pBlock) {
-      this.penultimateBlock = pBlock;
+      this.penultimateBlock = pBlock == null ? null : new Block(pBlock);
       return this;
     }
     
@@ -987,7 +998,7 @@ Block getPenultimateBlock() {
     }
     
     AddBlockOp setLastBlock(Block lastBlock) {
-      this.lastBlock = lastBlock;
+      this.lastBlock = lastBlock == null ? null : new Block(lastBlock);
       return this;
     }
     
@@ -1090,7 +1101,7 @@ public String getPath() {
     }
 
     UpdateBlocksOp setBlocks(Block[] blocks) {
-      this.blocks = blocks;
+      this.blocks = FSEditLogOp.deepCopy(blocks);
       return this;
     }
     
@@ -2881,7 +2892,8 @@ TruncateOp setTimestamp(long timestamp) {
     }
 
     TruncateOp setTruncateBlock(Block truncateBlock) {
-      this.truncateBlock = truncateBlock;
+      this.truncateBlock = truncateBlock == null ?
+          null : new Block(truncateBlock);
       return this;
     }
{code}
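
As a side note, here is a minimal standalone sketch of the defensive-copy pattern the patch applies. MutableBlock and DeepCopySketch are illustrative names, not the real HDFS classes: the helper is null-safe, copies the array itself, and copies every element, so later mutation of the caller's blocks can no longer leak into an already-built op.
{code:java}
import java.util.Arrays;

/** Illustrative stand-in for a mutable block; not the real HDFS Block class. */
class MutableBlock {
  long numBytes;
  MutableBlock(long numBytes) { this.numBytes = numBytes; }
  MutableBlock(MutableBlock other) { this.numBytes = other.numBytes; } // copy constructor
  @Override public String toString() { return "numBytes=" + numBytes; }
}

public class DeepCopySketch {
  /** Same shape as the patch's FSEditLogOp.deepCopy: null-safe, element-wise copy. */
  static MutableBlock[] deepCopy(MutableBlock[] blocks) {
    if (blocks == null || blocks.length == 0) {
      return blocks;
    }
    MutableBlock[] copy = new MutableBlock[blocks.length];
    for (int i = 0; i < blocks.length; ++i) {
      copy[i] = blocks[i] == null ? null : new MutableBlock(blocks[i]);
    }
    return copy;
  }

  public static void main(String[] args) {
    MutableBlock[] shared = { new MutableBlock(185818648L) };
    MutableBlock[] captured = deepCopy(shared);    // what the op now stores
    shared[0].numBytes = 185818644L;               // later mutation (e.g. a truncate)
    System.out.println(Arrays.toString(captured)); // still [numBytes=185818648]
  }
}
{code}
Running it prints [numBytes=185818648]: the captured copy keeps the pre-truncate size even after the shared array is mutated.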

> Multiple CloseOp shared block instance causes the standby namenode to crash 
> when rolling editlog
> ------------------------------------------------------------------------------------------------
>
>                 Key: HDFS-15175
>                 URL: https://issues.apache.org/jira/browse/HDFS-15175
>             Project: Hadoop HDFS
>          Issue Type: Bug
>    Affects Versions: 2.9.2
>            Reporter: Yicong Cai
>            Assignee: Yicong Cai
>            Priority: Critical
>
>  
> {panel:title=Crash exception}
> 2020-02-16 09:24:46,426 [507844305] - ERROR [Edit log tailer:FSEditLogLoader@245] - Encountered exception on operation CloseOp [length=0, inodeId=0, path=..., replication=3, mtime=1581816138774, atime=1581814760398, blockSize=536870912, blocks=[blk_5568434562_4495417845], permissions=da_music:hdfs:rw-r-----, aclEntries=null, clientName=, clientMachine=, overwrite=false, storagePolicyId=0, opCode=OP_CLOSE, txid=32625024993]
>  java.io.IOException: File is not under construction: ......
>  at org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.applyEditLogOp(FSEditLogLoader.java:442)
>  at org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:237)
>  at org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:146)
>  at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:891)
>  at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:872)
>  at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer.doTailEdits(EditLogTailer.java:262)
>  at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.doWork(EditLogTailer.java:395)
>  at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.access$300(EditLogTailer.java:348)
>  at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread$1.run(EditLogTailer.java:365)
>  at java.security.AccessController.doPrivileged(Native Method)
>  at javax.security.auth.Subject.doAs(Subject.java:360)
>  at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1873)
>  at org.apache.hadoop.security.SecurityUtil.doAsLoginUserOrFatal(SecurityUtil.java:479)
>  at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.run(EditLogTailer.java:361)
> {panel}
>  
> {panel:title=Editlog}
> <RECORD>
>  <OPCODE>OP_REASSIGN_LEASE</OPCODE>
>  <DATA>
>  <TXID>32625021150</TXID>
>  <LEASEHOLDER>DFSClient_NONMAPREDUCE_-969060727_197760</LEASEHOLDER>
>  <PATH>......</PATH>
>  <NEWHOLDER>DFSClient_NONMAPREDUCE_1000868229_201260</NEWHOLDER>
>  </DATA>
>  </RECORD>
> ......
> <RECORD>
>  <OPCODE>OP_CLOSE</OPCODE>
>  <DATA>
>  <TXID>32625023743</TXID>
>  <LENGTH>0</LENGTH>
>  <INODEID>0</INODEID>
>  <PATH>......</PATH>
>  <REPLICATION>3</REPLICATION>
>  <MTIME>1581816135883</MTIME>
>  <ATIME>1581814760398</ATIME>
>  <BLOCKSIZE>536870912</BLOCKSIZE>
>  <CLIENT_NAME></CLIENT_NAME>
>  <CLIENT_MACHINE></CLIENT_MACHINE>
>  <OVERWRITE>false</OVERWRITE>
>  <BLOCK>
>  <BLOCK_ID>5568434562</BLOCK_ID>
>  <NUM_BYTES>185818644</NUM_BYTES>
>  <GENSTAMP>4495417845</GENSTAMP>
>  </BLOCK>
>  <PERMISSION_STATUS>
>  <USERNAME>da_music</USERNAME>
>  <GROUPNAME>hdfs</GROUPNAME>
>  <MODE>416</MODE>
>  </PERMISSION_STATUS>
>  </DATA>
>  </RECORD>
> ......
> <RECORD>
>  <OPCODE>OP_TRUNCATE</OPCODE>
>  <DATA>
>  <TXID>32625024049</TXID>
>  <SRC>......</SRC>
>  <CLIENTNAME>DFSClient_NONMAPREDUCE_1000868229_201260</CLIENTNAME>
>  <CLIENTMACHINE>......</CLIENTMACHINE>
>  <NEWLENGTH>185818644</NEWLENGTH>
>  <TIMESTAMP>1581816136336</TIMESTAMP>
>  <BLOCK>
>  <BLOCK_ID>5568434562</BLOCK_ID>
>  <NUM_BYTES>185818648</NUM_BYTES>
>  <GENSTAMP>4495417845</GENSTAMP>
>  </BLOCK>
>  </DATA>
>  </RECORD>
> ......
> <RECORD>
>  <OPCODE>OP_CLOSE</OPCODE>
>  <DATA>
>  <TXID>32625024993</TXID>
>  <LENGTH>0</LENGTH>
>  <INODEID>0</INODEID>
>  <PATH>......</PATH>
>  <REPLICATION>3</REPLICATION>
>  <MTIME>1581816138774</MTIME>
>  <ATIME>1581814760398</ATIME>
>  <BLOCKSIZE>536870912</BLOCKSIZE>
>  <CLIENT_NAME></CLIENT_NAME>
>  <CLIENT_MACHINE></CLIENT_MACHINE>
>  <OVERWRITE>false</OVERWRITE>
>  <BLOCK>
>  <BLOCK_ID>5568434562</BLOCK_ID>
>  <NUM_BYTES>185818644</NUM_BYTES>
>  <GENSTAMP>4495417845</GENSTAMP>
>  </BLOCK>
>  <PERMISSION_STATUS>
>  <USERNAME>da_music</USERNAME>
>  <GROUPNAME>hdfs</GROUPNAME>
>  <MODE>416</MODE>
>  </PERMISSION_STATUS>
>  </DATA>
>  </RECORD>
> {panel}
>  
>  
> The block size should be 185818648 in the first CloseOp, but after the 
> truncate it becomes 185818644. The CloseOp/TruncateOp/CloseOp records are 
> synced to the JournalNode in the same batch, and both CloseOps reference the 
> same Block instance, so the first CloseOp is serialized with the wrong block 
> size. When the SNN rolls the editlog, the TruncateOp therefore does not put 
> the file into the UnderConstruction state. Then, when the second CloseOp is 
> applied, the file is not under construction, and the SNN crashes.
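
To make the aliasing above concrete, here is a hedged, simplified sketch. SharedBlock, CloseRecord, and AliasingSketch are illustrative names, not the real FSEditLogOp classes; it shows how two ops holding the same mutable Block instance both end up written with the post-truncate size, which is exactly the leak the deep-copy patch in the comment above closes.
{code:java}
/** Illustrative mutable block; not the real HDFS Block class. */
class SharedBlock {
  long numBytes;
  SharedBlock(long numBytes) { this.numBytes = numBytes; }
}

/** Illustrative stand-in for an edit log op that aliases, not copies, its block. */
class CloseRecord {
  private final SharedBlock block;            // aliased, not defensively copied
  CloseRecord(SharedBlock block) { this.block = block; }
  String serialize() { return "CloseOp numBytes=" + block.numBytes; }
}

public class AliasingSketch {
  public static void main(String[] args) {
    SharedBlock block = new SharedBlock(185818648L);

    CloseRecord firstClose = new CloseRecord(block);  // first OP_CLOSE queued
    block.numBytes = 185818644L;                      // OP_TRUNCATE shrinks the same instance
    CloseRecord secondClose = new CloseRecord(block); // second OP_CLOSE queued

    // Both records are flushed in the same batch; by then the first CloseOp
    // already observes the truncated size.
    System.out.println(firstClose.serialize());   // CloseOp numBytes=185818644 (wrong)
    System.out.println(secondClose.serialize());  // CloseOp numBytes=185818644
  }
}
{code}
Both println calls print numBytes=185818644, mirroring how the first OP_CLOSE in the editlog above carries 185818644 instead of 185818648.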


