This is an automated email from the ASF dual-hosted git repository.
huhaiyang pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/trunk by this push:
new 56192be95af HDFS-17861. The mis-behavior of commitBlockSynchronization
may cause standby namenode and observer namenode crash. (#8120)
56192be95af is described below
commit 56192be95af06c0a04ae6503858199d240f823e0
Author: hfutatzhanghb <[email protected]>
AuthorDate: Fri Feb 13 17:11:17 2026 +0800
HDFS-17861. The mis-behavior of commitBlockSynchronization may cause
standby namenode and observer namenode crash. (#8120)
---
.../hadoop/hdfs/server/namenode/FSNamesystem.java | 1 +
...TestBlockRecoveryCauseStandbyNameNodeCrash.java | 177 +++++++++++++++++++++
2 files changed, 178 insertions(+)
diff --git
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index c4aa3778253..5938cd46684 100644
---
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -4107,6 +4107,7 @@ void commitBlockSynchronization(ExtendedBlock oldBlock,
boolean remove = iFile.removeLastBlock(blockToDel) != null;
if (remove) {
blockManager.removeBlock(storedBlock);
+ FSDirWriteFileOp.persistBlocks(dir, src, iFile, false);
}
} else {
// update last block
diff --git
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockRecoveryCauseStandbyNameNodeCrash.java
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockRecoveryCauseStandbyNameNodeCrash.java
new file mode 100644
index 00000000000..56fa0d49a01
--- /dev/null
+++
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockRecoveryCauseStandbyNameNodeCrash.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
+import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.Whitebox;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.concurrent.TimeoutException;
+import java.util.function.Supplier;
+
+import static org.junit.jupiter.api.Assertions.fail;
+
+public class TestBlockRecoveryCauseStandbyNameNodeCrash {
+ public static final Logger LOG = LoggerFactory
+ .getLogger(TestBlockRecoveryCauseStandbyNameNodeCrash.class);
+ private final ErasureCodingPolicy ecPolicy =
+ StripedFileTestUtil.getDefaultECPolicy();
+ private final int dataBlocks = ecPolicy.getNumDataUnits();
+ private final int parityBlocks = ecPolicy.getNumParityUnits();
+ private final int cellSize = ecPolicy.getCellSize();
+ private final int stripesPerBlock = 4;
+ private final int blockSize = cellSize * stripesPerBlock;
+
+ private final String fakeUsername = "fakeUser1";
+ private final String fakeGroup = "supergroup";
+
+ private MiniDFSCluster cluster;
+ private DistributedFileSystem dfs;
+ private Configuration conf;
+ private Configuration newConf;
+ private final Path dir = new Path("/" + this.getClass().getSimpleName());
+ private Path p = new Path(dir, "testfile");
+
+ @BeforeEach
+ public void setup() throws IOException {
+ conf = new HdfsConfiguration();
+ conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize);
+ conf.setLong(HdfsClientConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY, 60000L);
+ conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 0);
+ conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ final int numDNs = dataBlocks + parityBlocks;
+ cluster = new MiniDFSCluster.Builder(conf)
+ .numDataNodes(numDNs)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology(2, 50070))
+ .build();
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ newConf = cluster.getConfiguration(0);
+ dfs = cluster.getFileSystem(0);
+ dfs.enableErasureCodingPolicy(ecPolicy.getName());
+ dfs.mkdirs(dir);
+ dfs.setErasureCodingPolicy(dir, ecPolicy.getName());
+ }
+
+ @AfterEach
+ public void tearDown() {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * 1. PauseIBR on some datanodes and write 25MB data (two block groups).
+ * 2. Mock the client quitting exceptionally.
+ * 3. Trigger lease recovery.
+ * 4. The standby NameNode crashes.
+ */
+ @Test
+ public void testCommitBlockSynchronizationWithDeleteECBlockgroupCommitted() {
+ int curCellSize = (int) 1024 * 1024;
+ try {
+ for (int i = 0; i < parityBlocks + 1; i++) {
+ DataNodeTestUtils.pauseIBR(cluster.getDataNodes().get(i));
+ }
+ final FSDataOutputStream out = dfs.create(p);
+ final DFSStripedOutputStream stripedOut = (DFSStripedOutputStream) out
+ .getWrappedStream();
+ for (int pos = 0; pos < (stripesPerBlock * dataBlocks + 1) *
curCellSize; pos++) {
+ out.write(StripedFileTestUtil.getByte(pos));
+ }
+ for (int i = 0; i < dataBlocks + parityBlocks; i++) {
+ StripedDataStreamer s = stripedOut.getStripedDataStreamer(i);
+ waitStreamerAllAcked(s);
+ stopBlockStream(s);
+ }
+ recoverLease();
+ LOG.info("Trigger recover lease manually successfully.");
+ } catch (Throwable e) {
+ String msg = "failed testCase" + StringUtils.stringifyException(e);
+ fail(msg);
+ } finally {
+ for (int i = 0; i < parityBlocks + 1; i++) {
+ DataNodeTestUtils.resumeIBR(cluster.getDataNodes().get(i));
+ }
+ }
+ }
+
+ /**
+ * Stop the block stream without immediately inducing a hard failure.
+ * Packets can continue to be queued until the streamer hits a socket
timeout.
+ *
+ * @param s the streamer to stop.
+ * @throws Exception
+ */
+ private void stopBlockStream(StripedDataStreamer s) throws Exception {
+ IOUtils.NullOutputStream nullOutputStream = new IOUtils.NullOutputStream();
+ Whitebox.setInternalState(s, "blockStream",
+ new DataOutputStream(nullOutputStream));
+ }
+
+ private void recoverLease() throws Exception {
+ final DistributedFileSystem dfs2 =
+ (DistributedFileSystem) getFSAsAnotherUser(newConf);
+ try {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ try {
+ return dfs2.recoverLease(p);
+ } catch (IOException e) {
+ LOG.info("BZL#Test. recoverLease() failed: " + e.getMessage());
+ return false;
+ }
+ }
+ }, 5000, 24000);
+ } catch (TimeoutException e) {
+ throw new IOException("Timeout waiting for recoverLease()");
+ }
+ }
+
+ private FileSystem getFSAsAnotherUser(final Configuration c)
+ throws IOException, InterruptedException {
+ return FileSystem.get(FileSystem.getDefaultUri(c), c,
+ UserGroupInformation
+ .createUserForTesting(fakeUsername, new String[]{fakeGroup})
+ .getUserName());
+ }
+
+ public static void waitStreamerAllAcked(DataStreamer s) throws IOException {
+ long toWaitFor = s.getLastQueuedSeqno();
+ s.waitForAckedSeqno(toWaitFor);
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]