This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/master by this push:
     new d75a700  HBASE-23369 Auto-close 'unknown' Regions reported as OPEN on 
RegionServers
d75a700 is described below

commit d75a7001b176bfb189aee83f89c834821b50e4a7
Author: stack <[email protected]>
AuthorDate: Tue Dec 3 16:26:17 2019 -0800

    HBASE-23369 Auto-close 'unknown' Regions reported as OPEN on RegionServers
    
    Master force-closes unknown/incorrect Regions OPEN on RS
    
    M hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
     Added a note and small refactor.
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java
     Fix an NPE when CJ ran.
    
    M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
     Minor clean up of log message; make it clearer.
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
     Make it so closeRegionSilentlyAndWait can be used w/o timeout.
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
     If a RegionServer Report notes a Region is OPEN and the Master does not
     know of said Region, close it (We used to crash out the RegionServer)
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
     Minor tweak of toString -- label should be state, not rit (confusing).
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
     Doc.
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
     Add region name to exception.
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
     Be more careful about which Regions we queue up for reassign. This
     procedure is run by the operator so could happen at any time. We
     will likely be running this when Master has some accounting of
     cluster members so check its answers for what Regions were on
     server before running.
    
    M 
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
     Doc and we were misrepresenting the case where a Region as not in RIT
     when we got CLOSE -- we were reporting it as though it was already
     trying to CLOSE.
---
 .../org/apache/hadoop/hbase/MetaTableAccessor.java |  5 +-
 .../apache/hadoop/hbase/master/CatalogJanitor.java |  2 +-
 .../org/apache/hadoop/hbase/master/HMaster.java    |  2 +-
 .../apache/hadoop/hbase/master/ServerManager.java  |  8 +++-
 .../hbase/master/assignment/AssignmentManager.java | 54 ++++++++++++++++------
 .../hbase/master/assignment/RegionStateNode.java   |  2 +-
 .../hbase/master/assignment/RegionStates.java      |  8 +++-
 .../assignment/TransitRegionStateProcedure.java    |  5 +-
 .../hadoop/hbase/regionserver/HRegionServer.java   |  5 +-
 9 files changed, 66 insertions(+), 25 deletions(-)

diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index 57bb76b..cba3f2d 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -320,6 +320,7 @@ public class MetaTableAccessor {
    * is stored in the name, so the returned object should only be used for the 
fields
    * in the regionName.
    */
+  // This should be moved to RegionInfo? TODO.
   public static RegionInfo parseRegionInfoFromRegionName(byte[] regionName) 
throws IOException {
     byte[][] fields = RegionInfo.parseRegionName(regionName);
     long regionId = Long.parseLong(Bytes.toString(fields[2]));
@@ -1249,9 +1250,7 @@ public class MetaTableAccessor {
    * Generates and returns a Put containing the region into for the catalog 
table
    */
   public static Put makePutFromRegionInfo(RegionInfo regionInfo, long ts) 
throws IOException {
-    Put put = new Put(regionInfo.getRegionName(), ts);
-    addRegionInfo(put, regionInfo);
-    return put;
+    return addRegionInfo(new Put(regionInfo.getRegionName(), ts), regionInfo);
   }
 
   /**
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java
index d7e97ec..4b792f9 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java
@@ -735,7 +735,7 @@ public class CatalogJanitor extends ScheduledChore {
           continue;
         }
         RegionState rs = 
this.services.getAssignmentManager().getRegionStates().getRegionState(ri);
-        if (rs.isClosedOrAbnormallyClosed()) {
+        if (rs == null || rs.isClosedOrAbnormallyClosed()) {
           // If closed against an 'Unknown Server', that is should be fine.
           continue;
         }
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 9cf38f6..ceef6b5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1855,7 +1855,7 @@ public class HMaster extends HRegionServer implements 
MasterServices {
         } catch (HBaseIOException hioe) {
           //should ignore failed plans here, avoiding the whole balance plans 
be aborted
           //later calls of balance() can fetch up the failed and skipped plans
-          LOG.warn("Failed balance plan: {}, just skip it", plan, hioe);
+          LOG.warn("Failed balance plan {}, skipping...", plan, hioe);
         }
         //rpCount records balance plans processed, does not care if a plan 
succeeds
         rpCount++;
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index f9e5c77..a61f96a 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -675,8 +675,9 @@ public class ServerManager {
   }
 
   /**
-   * Contacts a region server and waits up to timeout ms to close the region. 
This bypasses the
-   * active hmaster.
+   * Contacts a region server and waits up to timeout ms
+   * to close the region.  This bypasses the active hmaster.
+   * Pass -1 as timeout if you do not want to wait on result.
    */
   public static void closeRegionSilentlyAndWait(AsyncClusterConnection 
connection,
       ServerName server, RegionInfo region, long timeout) throws IOException, 
InterruptedException {
@@ -687,6 +688,9 @@ public class ServerManager {
     } catch (IOException e) {
       LOG.warn("Exception when closing region: " + 
region.getRegionNameAsString(), e);
     }
+    if (timeout < 0) {
+      return;
+    }
     long expiration = timeout + System.currentTimeMillis();
     while (System.currentTimeMillis() < expiration) {
       try {
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index a236c15..d6ae254 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -36,6 +36,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.HBaseIOException;
 import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MetaTableAccessor;
 import org.apache.hadoop.hbase.PleaseHoldException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
@@ -629,7 +630,8 @@ public class AssignmentManager {
       ServerName targetServer) throws HBaseIOException {
     RegionStateNode regionNode = 
this.regionStates.getRegionStateNode(regionInfo);
     if (regionNode == null) {
-      throw new UnknownRegionException("No RegionState found for " + 
regionInfo.getEncodedName());
+      throw new UnknownRegionException("No RegionStateNode found for " +
+          regionInfo.getEncodedName() + "(Closed/Deleted?)");
     }
     TransitRegionStateProcedure proc;
     regionNode.lock();
@@ -944,7 +946,7 @@ public class AssignmentManager {
     if (regionNode == null) {
       // the table/region is gone. maybe a delete, split, merge
       throw new UnexpectedStateException(String.format(
-        "Server %s was trying to transition region %s to %s. but the region 
was removed.",
+        "Server %s was trying to transition region %s to %s. but Region is not 
known.",
         serverName, regionInfo, state));
     }
     LOG.trace("Update region transition serverName={} region={} 
regionState={}", serverName,
@@ -966,7 +968,8 @@ public class AssignmentManager {
           state.equals(TransitionCode.CLOSED)) {
           LOG.info("RegionServer {} {}", state, 
regionNode.getRegionInfo().getEncodedName());
         } else {
-          LOG.warn("No matching procedure found for {} transition to {}", 
regionNode, state);
+          LOG.warn("No matching procedure found for {} transition on {} to {}",
+              serverName, regionNode, state);
         }
       }
     } finally {
@@ -1092,7 +1095,27 @@ public class AssignmentManager {
     checkOnlineRegionsReport(serverNode, regionNames);
   }
 
-  // just check and output possible inconsistency, without actually doing 
anything
+  /**
+   * Close <code>regionName</code> on <code>sn</code> silently and immediately 
without
+   * using a Procedure or going via hbase:meta. For case where a 
RegionServer's hosting
+   * of a Region is not aligned w/ the Master's accounting of Region state. 
This is for
+   * cleaning up an error in accounting.
+   */
+  private void closeRegionSilently(ServerName sn, byte [] regionName) {
+    try {
+      RegionInfo ri = 
MetaTableAccessor.parseRegionInfoFromRegionName(regionName);
+      // Pass -1 for timeout. Means do not wait.
+      
ServerManager.closeRegionSilentlyAndWait(this.master.getClusterConnection(), 
sn, ri, -1);
+    } catch (Exception e) {
+      LOG.error("Failed trying to close {} on {}", 
Bytes.toStringBinary(regionName), sn, e);
+    }
+  }
+
+  /**
+   * Check that what the RegionServer reports aligns with the Master's image.
+   * If disagreement, we will tell the RegionServer to expediently close
+   * a Region we do not think it should have.
+   */
   private void checkOnlineRegionsReport(ServerStateNode serverNode, 
Set<byte[]> regionNames) {
     ServerName serverName = serverNode.getServerName();
     for (byte[] regionName : regionNames) {
@@ -1101,10 +1124,13 @@ public class AssignmentManager {
       }
       RegionStateNode regionNode = 
regionStates.getRegionStateNodeFromName(regionName);
       if (regionNode == null) {
-        LOG.warn("No region state node for {}, it should already be on {}",
-          Bytes.toStringBinary(regionName), serverName);
+        String regionNameAsStr = Bytes.toStringBinary(regionName);
+        LOG.warn("No RegionStateNode for {} but reported as up on {}; 
closing...",
+            regionNameAsStr, serverName);
+        closeRegionSilently(serverNode.getServerName(), regionName);
         continue;
       }
+      final long lag = 1000;
       regionNode.lock();
       try {
         long diff = EnvironmentEdgeManager.currentTime() - 
regionNode.getLastUpdate();
@@ -1112,17 +1138,19 @@ public class AssignmentManager {
           // This is possible as a region server has just closed a region but 
the region server
           // report is generated before the closing, but arrive after the 
closing. Make sure there
           // is some elapsed time so less false alarms.
-          if (!regionNode.getRegionLocation().equals(serverName) && diff > 
1000) {
-            LOG.warn("{} reported OPEN on server={} but state has otherwise", 
regionNode,
-              serverName);
+          if (!regionNode.getRegionLocation().equals(serverName) && diff > 
lag) {
+            LOG.warn("Reporting {} server does not match {} (time since last " 
+
+                    "update={}ms); closing...",
+              serverName, regionNode, diff);
+            closeRegionSilently(serverNode.getServerName(), regionName);
           }
         } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
           // So, we can get report that a region is CLOSED or SPLIT because a 
heartbeat
           // came in at about same time as a region transition. Make sure 
there is some
           // elapsed time so less false alarms.
-          if (diff > 1000) {
-            LOG.warn("{} reported an unexpected OPEN on {}; time since last 
update={}ms",
-              regionNode, serverName, diff);
+          if (diff > lag) {
+            LOG.warn("Reporting {} state does not match {} (time since last 
update={}ms)",
+              serverName, regionNode, diff);
           }
         }
       } finally {
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
index 30e6706..e4a18d0 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
@@ -282,7 +282,7 @@ public class RegionStateNode implements 
Comparable<RegionStateNode> {
 
   public String toShortString() {
     // rit= is the current Region-In-Transition State -- see State enum.
-    return String.format("rit=%s, location=%s", getState(), 
getRegionLocation());
+    return String.format("state=%s, location=%s", getState(), 
getRegionLocation());
   }
 
   public String toDescriptiveString() {
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
index 3eba00d..8d22c0e 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
@@ -96,6 +96,9 @@ public class RegionStates {
 
   public RegionStates() { }
 
+  /**
+   * Called on stop of AssignmentManager.
+   */
   public void clear() {
     regionsMap.clear();
     regionInTransition.clear();
@@ -728,12 +731,15 @@ public class RegionStates {
     return serverMap.computeIfAbsent(serverName, key -> new 
ServerStateNode(key));
   }
 
+  /**
+   * Called by SCP at end of successful processing.
+   */
   public void removeServer(final ServerName serverName) {
     serverMap.remove(serverName);
   }
 
   /**
-   * @return Pertinent ServerStateNode or NULL if none found.
+   * @return Pertinent ServerStateNode or NULL if none found (Do not make 
modifications).
    */
   @VisibleForTesting
   public ServerStateNode getServerNode(final ServerName serverName) {
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index 3cb41eb..f63de8b 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -248,7 +248,8 @@ public class TransitRegionStateProcedure
 
     if (retries > 
env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
       // Throw exception to backoff and retry when failed open too many times
-      throw new HBaseIOException("Failed to open region");
+      throw new HBaseIOException("Failed confirm OPEN of " + regionNode +
+          " (remote log may yield more detail on why).");
     } else {
       // Here we do not throw exception because we want to the region to be 
online ASAP
       return Flow.HAS_MORE_STATE;
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 6a778b8c..524fcd9 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -3170,7 +3170,7 @@ public class HRegionServer extends HasThread implements
 
   /**
    * Close asynchronously a region, can be called from the master or 
internally by the regionserver
-   * when stopping. If called from the master, the region will update the 
znode status.
+   * when stopping. If called from the master, the region will update the 
status.
    *
    * <p>
    * If an opening was in progress, this method will cancel it, but will not 
start a new close. The
@@ -3200,6 +3200,7 @@ public class HRegionServer extends HasThread implements
       }
     }
 
+    // previous can come back 'null' if not in map.
     final Boolean previous = 
this.regionsInTransitionInRS.putIfAbsent(Bytes.toBytes(encodedName),
         Boolean.FALSE);
 
@@ -3221,6 +3222,8 @@ public class HRegionServer extends HasThread implements
         throw new NotServingRegionException("The region " + encodedName +
           " was opening but not yet served. Opening is cancelled.");
       }
+    } else if (previous == null) {
+      LOG.info("Received CLOSE for {}", encodedName);
     } else if (Boolean.FALSE.equals(previous)) {
       LOG.info("Received CLOSE for the region: " + encodedName +
         ", which we are already trying to CLOSE, but not completed yet");

Reply via email to