Repository: hbase Updated Branches: refs/heads/master a8b642067 -> 9e42a9e31
HBASE-21472 Should not persist the dispatched field for RegionRemoteProcedureBase Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/9e42a9e3 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/9e42a9e3 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/9e42a9e3 Branch: refs/heads/master Commit: 9e42a9e3103944f3ab338d8e79f05faefc0ac093 Parents: a8b6420 Author: zhangduo <zhang...@apache.org> Authored: Wed Nov 14 21:22:24 2018 +0800 Committer: zhangduo <zhang...@apache.org> Committed: Thu Nov 15 09:41:38 2018 +0800 ---------------------------------------------------------------------- .../src/main/protobuf/MasterProcedure.proto | 1 - .../master/assignment/CloseRegionProcedure.java | 2 +- .../master/assignment/OpenRegionProcedure.java | 2 +- .../assignment/RegionRemoteProcedureBase.java | 26 ++- ...stRegionAssignedToMultipleRegionServers.java | 180 +++++++++++++++++++ 5 files changed, 201 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/9e42a9e3/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto ---------------------------------------------------------------------- diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 073e444..8bb69cf 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -541,7 +541,6 @@ message RegionStateTransitionStateData { message RegionRemoteProcedureBaseStateData { required RegionInfo region = 1; required ServerName target_server = 2; - required bool dispatched = 3; } message OpenRegionProcedureStateData { http://git-wip-us.apache.org/repos/asf/hbase/blob/9e42a9e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java index 0d22a9c..fd672fa 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java @@ -83,6 +83,6 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { @Override protected boolean shouldDispatch(RegionStateNode regionNode) { - return !regionNode.isInState(RegionState.State.CLOSED, RegionState.State.ABNORMALLY_CLOSED); + return regionNode.isInState(RegionState.State.CLOSING); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/9e42a9e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java index 125ef11..ed5836d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java @@ -68,6 +68,6 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { @Override protected boolean shouldDispatch(RegionStateNode regionNode) { - return !regionNode.isInState(RegionState.State.OPEN); + return regionNode.isInState(RegionState.State.OPENING); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/9e42a9e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index 3e05cde..0809759 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -108,6 +108,17 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur } @Override + protected boolean waitInitialized(MasterProcedureEnv env) { + if (TableName.isMetaTableName(getTableName())) { + return false; + } + // First we need meta to be loaded, and second, if meta is not online then we will likely to + // fail when updating meta so we wait until it is assigned. + AssignmentManager am = env.getAssignmentManager(); + return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, region); + } + + @Override protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException { throw new UnsupportedOperationException(); } @@ -120,11 +131,13 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur /** * Check whether we still need to make the call to RS. * <p/> - * Usually this will not happen if we do not allow assigning a already onlined region. But if we - * have something wrong in the RSProcedureDispatcher, where we have already sent the request to - * RS, but then we tell the upper layer the remote call is failed due to rpc timeout or connection - * closed or anything else, then this issue can still happen. So here we add a check to make it - * more robust. + * This could happen when master restarts. Since we do not know whether a request has already been + * sent to the region server after we add a remote operation to the dispatcher, so the safe way is + * to not persist the dispatched field and try to add the remote operation again. But it is + * possible that we do have already sent the request to region server and it has also sent back + * the response, so here we need to check the region state, if it is not in the expecting state, + * we should give up, otherwise we may hang for ever, as the region server will just ignore + * redundant calls. */ protected abstract boolean shouldDispatch(RegionStateNode regionNode); @@ -165,7 +178,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { serializer.serialize(RegionRemoteProcedureBaseStateData.newBuilder() .setRegion(ProtobufUtil.toRegionInfo(region)) - .setTargetServer(ProtobufUtil.toServerName(targetServer)).setDispatched(dispatched).build()); + .setTargetServer(ProtobufUtil.toServerName(targetServer)).build()); } @Override @@ -174,6 +187,5 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur serializer.deserialize(RegionRemoteProcedureBaseStateData.class); region = ProtobufUtil.toRegionInfo(data.getRegion()); targetServer = ProtobufUtil.toServerName(data.getTargetServer()); - dispatched = data.getDispatched(); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/9e42a9e3/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionAssignedToMultipleRegionServers.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionAssignedToMultipleRegionServers.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionAssignedToMultipleRegionServers.java new file mode 100644 index 0000000..0d8202b --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionAssignedToMultipleRegionServers.java @@ -0,0 +1,180 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.assignment; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.PleaseHoldException; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.StartMiniClusterOption; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.RegionPlan; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.zookeeper.KeeperException; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestRegionAssignedToMultipleRegionServers { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestRegionAssignedToMultipleRegionServers.class); + + private static final List<ServerName> EXCLUDE_SERVERS = new ArrayList<>(); + + private static boolean HALT = false; + + private static boolean KILL = false; + + private static CountDownLatch ARRIVE; + + private static final class ServerManagerForTest extends ServerManager { + + public ServerManagerForTest(MasterServices master) { + super(master); + } + + @Override + public List<ServerName> createDestinationServersList() { + return super.createDestinationServersList(EXCLUDE_SERVERS); + } + } + + private static final class AssignmentManagerForTest extends AssignmentManager { + + public AssignmentManagerForTest(MasterServices master) { + super(master); + } + + @Override + public ReportRegionStateTransitionResponse reportRegionStateTransition( + ReportRegionStateTransitionRequest req) throws PleaseHoldException { + if (req.getTransition(0).getTransitionCode() == TransitionCode.OPENED) { + if (ARRIVE != null) { + ARRIVE.countDown(); + ARRIVE = null; + } + while (HALT) { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + if (KILL) { + throw new PleaseHoldException("Inject error!"); + } + } + } + return super.reportRegionStateTransition(req); + } + } + + public static final class HMasterForTest extends HMaster { + + public HMasterForTest(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + protected AssignmentManager createAssignmentManager(MasterServices master) { + return new AssignmentManagerForTest(master); + } + + @Override + protected ServerManager createServerManager(MasterServices master) throws IOException { + setupClusterConnection(); + return new ServerManagerForTest(master); + } + } + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static TableName NAME = TableName.valueOf("Assign"); + + private static byte[] CF = Bytes.toBytes("cf"); + + @BeforeClass + public static void setUp() throws Exception { + UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); + UTIL + .startMiniCluster(StartMiniClusterOption.builder().numMasters(2).numRegionServers(2).build()); + UTIL.createTable(NAME, CF); + UTIL.waitTableAvailable(NAME); + UTIL.getAdmin().balancerSwitch(false, true); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void test() throws Exception { + RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo(); + AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); + RegionStateNode rsn = am.getRegionStates().getRegionStateNode(region); + + ServerName sn = rsn.getRegionLocation(); + ARRIVE = new CountDownLatch(1); + HALT = true; + am.moveAsync(new RegionPlan(region, sn, sn)); + ARRIVE.await(); + + // let's restart the master + EXCLUDE_SERVERS.add(rsn.getRegionLocation()); + KILL = true; + HMaster activeMaster = UTIL.getMiniHBaseCluster().getMaster(); + activeMaster.abort("For testing"); + activeMaster.getThread().join(); + KILL = false; + + // sleep a while to reproduce the problem, as after the fix in HBASE-21472 the execution logic + // is changed so the old code to reproduce the problem can not compile... + Thread.sleep(10000); + HALT = false; + Thread.sleep(5000); + + HRegionServer rs = UTIL.getMiniHBaseCluster().getRegionServer(sn); + assertNotNull(rs.getRegion(region.getEncodedName())); + assertNull(UTIL.getOtherRegionServer(rs).getRegion(region.getEncodedName())); + } +}