This is an automated email from the ASF dual-hosted git repository.
wchevreuil pushed a commit to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-1 by this push:
new f99e899 HBASE-23693 Split failure may cause region hole and data loss
when use zk assign (#1071)
f99e899 is described below
commit f99e899ca3a6d28d935793a42af16c527e8e0d87
Author: thangTang <[email protected]>
AuthorDate: Tue Feb 11 00:57:30 2020 +0800
HBASE-23693 Split failure may cause region hole and data loss when use zk
assign (#1071)
---
.../org/apache/hadoop/hbase/MetaTableAccessor.java | 14 +++++
.../apache/hadoop/hbase/master/RegionStates.java | 66 +++++++++++++++++++---
2 files changed, 73 insertions(+), 7 deletions(-)
diff --git
a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index 440f8c6..1624364 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -843,6 +843,20 @@ public class MetaTableAccessor {
/**
* Returns the daughter regions by reading the corresponding columns of the catalog table
* Result.
+ * @param connection connection we're using; handed to getRegionResult for the lookup
+ * @param parent region information of parent; must not be null, its region name is the lookup key
+ * @return a pair of HRegionInfo or PairOfSameType(null, null) if the region is not a split
+ * parent
+ */
+ public static PairOfSameType<HRegionInfo> getDaughterRegionsFromParent(
+ final Connection connection, HRegionInfo parent) throws IOException {
+ Result parentResult = getRegionResult(connection, parent.getRegionName()); // look up the parent by region name
+ return getDaughterRegions(parentResult); // extract the daughter pair from that Result
+ }
+
+ /**
+ * Returns the daughter regions by reading the corresponding columns of the
catalog table
+ * Result.
* @param data a Result object from the catalog table scan
* @return a pair of HRegionInfo or PairOfSameType(null, null) if the region
is not a split
* parent
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 3a02bdb..e31868e 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -44,13 +44,17 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableStateManager;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.client.Mutation;
+import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
+import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ConfigUtil;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.PairOfSameType;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
@@ -737,11 +741,13 @@ public class RegionStates {
public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final
ServerName sn) {
// Offline all regions on this server not already in transition.
List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
- Set<HRegionInfo> regionsToClean = new HashSet<HRegionInfo>();
+ Set<Pair<HRegionInfo, HRegionInfo>> regionsToClean =
+ new HashSet<Pair<HRegionInfo, HRegionInfo>>();
// Offline regions outside the loop and synchronized block to avoid
// ConcurrentModificationException and deadlock in case of meta anassigned,
// but RegionState a blocked.
Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
+ Map<String, HRegionInfo> daughter2Parent = new HashMap<>();
synchronized (this) {
Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
if (assignedRegions == null) {
@@ -758,8 +764,20 @@ public class RegionStates {
// Delete the ZNode if exists
ZKAssign.deleteNodeFailSilent(watcher, region);
regionsToOffline.add(region);
+ PairOfSameType<HRegionInfo> daughterRegions =
+
MetaTableAccessor.getDaughterRegionsFromParent(this.server.getConnection(),
region);
+ if (daughterRegions != null) {
+ if (daughterRegions.getFirst() != null) {
+
daughter2Parent.put(daughterRegions.getFirst().getEncodedName(), region);
+ }
+ if (daughterRegions.getSecond() != null) {
+
daughter2Parent.put(daughterRegions.getSecond().getEncodedName(), region);
+ }
+ }
} catch (KeeperException ke) {
server.abort("Unexpected ZK exception deleting node " + region,
ke);
+ } catch (IOException e) {
+ LOG.warn("get daughter from meta exception " + region, e);
}
}
}
@@ -783,10 +801,20 @@ public class RegionStates {
LOG.info("Found region in " + state +
" to be reassigned by ServerCrashProcedure for " + sn);
rits.add(hri);
- } else if(state.isSplittingNew() || state.isMergingNew()) {
- LOG.info("Offline/Cleanup region if no meta entry exists, hri: " +
hri +
- " state: " + state);
- regionsToClean.add(state.getRegion());
+ } else if (state.isSplittingNew() || state.isMergingNew()) {
+ LOG.info(
+ "Offline/Cleanup region if no meta entry exists, hri: " + hri +
" state: " + state);
+ if (daughter2Parent.containsKey(hri.getEncodedName())) {
+ HRegionInfo parent = daughter2Parent.get(hri.getEncodedName());
+ HRegionInfo info = getHRIFromMeta(parent);
+ if (info != null && info.isSplit() && info.isOffline()) {
+ regionsToClean.add(Pair.newPair(state.getRegion(), info));
+ } else {
+ regionsToClean.add(Pair.<HRegionInfo,
HRegionInfo>newPair(state.getRegion(), null));
+ }
+ } else {
+ regionsToClean.add(Pair.<HRegionInfo,
HRegionInfo>newPair(state.getRegion(), null));
+ }
} else {
LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
}
@@ -803,6 +831,19 @@ public class RegionStates {
return rits;
}
+ private HRegionInfo getHRIFromMeta(HRegionInfo parent) { // HRegionInfo of parent as read back through MetaTableAccessor; may be null
+ Result result = null; // assigned inside the try below
+ try {
+ result =
+ MetaTableAccessor.getRegionResult(this.server.getConnection(), parent.getRegionName());
+ HRegionInfo info = MetaTableAccessor.getHRegionInfo(result); // presumably null when the Result carries no region info -- TODO confirm
+ return info;
+ } catch (IOException e) {
+ LOG.error("got exception when query meta with region " + parent.getEncodedName(), e);
+ return null; // lookup failure is logged, not rethrown; callers must handle null
+ }
+ }
+
/**
* This method does an RPC to hbase:meta. Do not call this method with a
lock/synchronize held.
* In ZK mode we rollback and hence cleanup daughters/merged region. We also
cleanup if
@@ -810,12 +851,14 @@ public class RegionStates {
*
* @param hris The hris to check if empty in hbase:meta and if so, clean
them up.
*/
- private void cleanFailedSplitMergeRegions(Set<HRegionInfo> hris) {
+ private void cleanFailedSplitMergeRegions(Set<Pair<HRegionInfo,
HRegionInfo>> hris) {
if (hris.isEmpty()) {
return;
}
- for (HRegionInfo hri : hris) {
+ for (Pair<HRegionInfo, HRegionInfo> hriPair : hris) {
+ HRegionInfo hri = hriPair.getFirst();
+ HRegionInfo parentInfo = hriPair.getSecond();
// This is RPC to meta table. It is done while we have a synchronize on
// regionstates. No progress will be made if meta is not available at
this time.
// This is a cleanup task. Not critical.
@@ -829,6 +872,15 @@ public class RegionStates {
if (regionPair != null) {
MetaTableAccessor.deleteRegion(this.server.getConnection(), hri);
}
+ if (parentInfo != null) {
+ List<Mutation> mutations = new ArrayList<Mutation>();
+ HRegionInfo copyOfParent = new HRegionInfo(parentInfo);
+ copyOfParent.setOffline(false);
+ copyOfParent.setSplit(false);
+ Put putParent =
MetaTableAccessor.makePutFromRegionInfo(copyOfParent);
+ mutations.add(putParent);
+ MetaTableAccessor.mutateMetaTable(this.server.getConnection(),
mutations);
+ }
LOG.debug("Cleaning up HDFS since no meta entry exists, hri: " +
hri);
FSUtils.deleteRegionDir(server.getConfiguration(), hri);
}