This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new e7b71991335 HBASE-29294 Master crashed because of failing to update
master region (#6979)
e7b71991335 is described below
commit e7b71991335016c712da7ccdb46c143d350a6548
Author: Duo Zhang <[email protected]>
AuthorDate: Wed May 14 20:00:37 2025 +0800
HBASE-29294 Master crashed because of failing to update master region
(#6979)
Signed-off-by: Viraj Jasani <[email protected]>
(cherry picked from commit 91ecd467410d413d08193f9720ff946063712d51)
---
.../org/apache/hadoop/hbase/ipc/RpcServer.java | 11 ++--
.../hadoop/hbase/master/region/MasterRegion.java | 29 +++++++--
.../store/region/RegionProcedureStore.java | 75 ++++++++--------------
.../hbase/master/region/MasterRegionTestBase.java | 8 ++-
.../master/region/TestMasterRegionRpcTimeout.java | 58 +++++++++++++++++
.../TestMasterRegionWALSyncTimeoutIOException.java | 3 +
6 files changed, 120 insertions(+), 64 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
index eacecdcddbc..bba1e66b1f9 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
@@ -745,10 +745,9 @@ public abstract class RpcServer implements
RpcServerInterface, ConfigurationObse
}
/**
- * Used by {@link
org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore}. For
- * master's rpc call, it may generate new procedure and mutate the region
which store procedure.
- * There are some check about rpc when mutate region, such as rpc timeout
check. So unset the rpc
- * call to avoid the rpc check.
+ * Used by {@link org.apache.hadoop.hbase.master.region.MasterRegion}, to
avoid hit row lock
+ * timeout when updating master region in a rpc call. See HBASE-23895,
HBASE-29251 and HBASE-29294
+ * for more details.
* @return the currently ongoing rpc call
*/
public static Optional<RpcCall> unsetCurrentCall() {
@@ -758,8 +757,8 @@ public abstract class RpcServer implements
RpcServerInterface, ConfigurationObse
}
/**
- * Used by {@link
org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore}. Set the
- * rpc call back after mutate region.
+ * Used by {@link org.apache.hadoop.hbase.master.region.MasterRegion}. Set
the rpc call back after
+ * mutate region.
*/
public static void setCurrentCall(RpcCall rpcCall) {
CurCall.set(rpcCall);
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
index b956b4ff417..7fd7f6fa5c4 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
@@ -22,6 +22,7 @@ import static
org.apache.hadoop.hbase.HConstants.HREGION_LOGDIR_NAME;
import com.google.errorprone.annotations.RestrictedApi;
import java.io.IOException;
import java.util.List;
+import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -40,6 +41,8 @@ import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.ipc.RpcCall;
+import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.log.HBaseMarkers;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegion.FlushResult;
@@ -154,12 +157,7 @@ public final class MasterRegion {
}
}
- /**
- * Performs the mutation to the master region using UpdateMasterRegion
update action.
- * @param action Update region action.
- * @throws IOException IO error that causes active master to abort.
- */
- public void update(UpdateMasterRegion action) throws IOException {
+ private void update0(UpdateMasterRegion action) throws IOException {
for (int tries = 0; tries < maxRetriesForRegionUpdates; tries++) {
try {
// If the update is successful, return immediately.
@@ -191,6 +189,25 @@ public final class MasterRegion {
}
}
+ /**
+ * Performs the mutation to the master region using UpdateMasterRegion
update action.
+ * @param action Update region action.
+ * @throws IOException IO error that causes active master to abort.
+ */
+ public void update(UpdateMasterRegion action) throws IOException {
+ // Since now we will abort master when updating master region fails, and
when updating, if the
+ // rpc is already timed out, we will hit a TimeoutIOException which
indicates that we can not
+ // get the row lock in time, so here we need to unset the rpc call to
prevent this, otherwise
+ // master will abort with a rpc timeout, which is not necessary...
+ // See HBASE-29294.
+ Optional<RpcCall> rpcCall = RpcServer.unsetCurrentCall();
+ try {
+ update0(action);
+ } finally {
+ rpcCall.ifPresent(RpcServer::setCurrentCall);
+ }
+ }
+
/**
* Log the error and abort the master daemon immediately. Use this utility
only when procedure
* state store update fails and the only way to recover is by terminating
the active master so
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
index 9cc42bf587f..0e666fe6c94 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
@@ -29,7 +29,6 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.Optional;
import org.apache.commons.lang3.mutable.MutableLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -41,8 +40,6 @@ import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
-import org.apache.hadoop.hbase.ipc.RpcCall;
-import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.log.HBaseMarkers;
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
import org.apache.hadoop.hbase.master.assignment.MoveRegionProcedure;
@@ -304,20 +301,6 @@ public class RegionProcedureStore extends
ProcedureStoreBase {
rowsToLock.add(row);
}
- /**
- * Insert procedure may be called by master's rpc call. There are some check
about the rpc call
- * when mutate region. Here unset the current rpc call and set it back in
finally block. See
- * HBASE-23895 for more details.
- */
- private void runWithoutRpcCall(Runnable runnable) {
- Optional<RpcCall> rpcCall = RpcServer.unsetCurrentCall();
- try {
- runnable.run();
- } finally {
- rpcCall.ifPresent(RpcServer::setCurrentCall);
- }
- }
-
@Override
public void insert(Procedure<?> proc, Procedure<?>[] subProcs) {
if (subProcs == null || subProcs.length == 0) {
@@ -327,50 +310,44 @@ public class RegionProcedureStore extends
ProcedureStoreBase {
}
List<Mutation> mutations = new ArrayList<>(subProcs.length + 1);
List<byte[]> rowsToLock = new ArrayList<>(subProcs.length + 1);
- runWithoutRpcCall(() -> {
- try {
- serializePut(proc, mutations, rowsToLock);
- for (Procedure<?> subProc : subProcs) {
- serializePut(subProc, mutations, rowsToLock);
- }
- region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock,
NO_NONCE, NO_NONCE));
- } catch (IOException e) {
- LOG.error(HBaseMarkers.FATAL, "Failed to insert proc {}, sub procs
{}", proc,
- Arrays.toString(subProcs), e);
- throw new UncheckedIOException(e);
+ try {
+ serializePut(proc, mutations, rowsToLock);
+ for (Procedure<?> subProc : subProcs) {
+ serializePut(subProc, mutations, rowsToLock);
}
- });
+ region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock,
NO_NONCE, NO_NONCE));
+ } catch (IOException e) {
+ LOG.error(HBaseMarkers.FATAL, "Failed to insert proc {}, sub procs {}",
proc,
+ Arrays.toString(subProcs), e);
+ throw new UncheckedIOException(e);
+ }
}
@Override
public void insert(Procedure<?>[] procs) {
List<Mutation> mutations = new ArrayList<>(procs.length);
List<byte[]> rowsToLock = new ArrayList<>(procs.length);
- runWithoutRpcCall(() -> {
- try {
- for (Procedure<?> proc : procs) {
- serializePut(proc, mutations, rowsToLock);
- }
- region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock,
NO_NONCE, NO_NONCE));
- } catch (IOException e) {
- LOG.error(HBaseMarkers.FATAL, "Failed to insert procs {}",
Arrays.toString(procs), e);
- throw new UncheckedIOException(e);
+ try {
+ for (Procedure<?> proc : procs) {
+ serializePut(proc, mutations, rowsToLock);
}
- });
+ region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock,
NO_NONCE, NO_NONCE));
+ } catch (IOException e) {
+ LOG.error(HBaseMarkers.FATAL, "Failed to insert procs {}",
Arrays.toString(procs), e);
+ throw new UncheckedIOException(e);
+ }
}
@Override
public void update(Procedure<?> proc) {
- runWithoutRpcCall(() -> {
- try {
- ProcedureProtos.Procedure proto =
ProcedureUtil.convertToProtoProcedure(proc);
- region.update(r -> r.put(new
Put(Bytes.toBytes(proc.getProcId())).addColumn(PROC_FAMILY,
- PROC_QUALIFIER, proto.toByteArray())));
- } catch (IOException e) {
- LOG.error(HBaseMarkers.FATAL, "Failed to update proc {}", proc, e);
- throw new UncheckedIOException(e);
- }
- });
+ try {
+ ProcedureProtos.Procedure proto =
ProcedureUtil.convertToProtoProcedure(proc);
+ region.update(r -> r.put(new
Put(Bytes.toBytes(proc.getProcId())).addColumn(PROC_FAMILY,
+ PROC_QUALIFIER, proto.toByteArray())));
+ } catch (IOException e) {
+ LOG.error(HBaseMarkers.FATAL, "Failed to update proc {}", proc, e);
+ throw new UncheckedIOException(e);
+ }
}
@Override
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
index 11da1b03c11..fe8eef23ab1 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
@@ -53,6 +53,8 @@ public class MasterRegionTestBase {
protected DirScanPool logCleanerPool;
+ protected Server server;
+
protected static byte[] CF1 = Bytes.toBytes("f1");
protected static byte[] CF2 = Bytes.toBytes("f2");
@@ -92,9 +94,9 @@ public class MasterRegionTestBase {
Configuration conf = htu.getConfiguration();
configure(conf);
choreService = new ChoreService(getClass().getSimpleName());
- hfileCleanerPool =
DirScanPool.getHFileCleanerScanPool(htu.getConfiguration());
- logCleanerPool = DirScanPool.getLogCleanerScanPool(htu.getConfiguration());
- Server server = mock(Server.class);
+ hfileCleanerPool = DirScanPool.getHFileCleanerScanPool(conf);
+ logCleanerPool = DirScanPool.getLogCleanerScanPool(conf);
+ server = mock(Server.class);
when(server.getConfiguration()).thenReturn(conf);
when(server.getServerName())
.thenReturn(ServerName.valueOf("localhost", 12345,
EnvironmentEdgeManager.currentTime()));
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionRpcTimeout.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionRpcTimeout.java
new file mode 100644
index 00000000000..66cdaf4cc76
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionRpcTimeout.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.region;
+
+import static org.junit.Assert.assertSame;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.io.IOException;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.ipc.RpcCall;
+import org.apache.hadoop.hbase.ipc.RpcServer;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Make sure that we will not get rpc timeout when updating master region.
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestMasterRegionRpcTimeout extends MasterRegionTestBase {
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestMasterRegionRpcTimeout.class);
+
+ @Test
+ public void testRpcTimeout() throws IOException {
+ RpcCall call = mock(RpcCall.class);
+ // fake a RpcCall which is already timed out
+ when(call.getDeadline()).thenReturn(EnvironmentEdgeManager.currentTime() -
10000);
+ RpcServer.setCurrentCall(call);
+ // make sure that the update operation does not throw timeout exception
+ region.update(
+ r -> r.put(new Put(Bytes.toBytes("row")).addColumn(CF1, QUALIFIER,
Bytes.toBytes("value"))));
+ assertSame(call, RpcServer.getCurrentCall().get());
+ }
+}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
index 810135b8370..f9d75f0d742 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.hbase.master.region;
import static org.junit.Assert.assertThrows;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.verify;
import java.io.IOException;
import java.time.Duration;
@@ -80,6 +82,7 @@ public class TestMasterRegionWALSyncTimeoutIOException
extends MasterRegionTestB
Thread.sleep(Duration.ofSeconds(1).toMillis());
}
});
+ verify(server).abort(any(), any());
}
public static class SlowAsyncFSWAL extends AsyncFSWAL {