This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.5 by this push:
     new 2ac964cf354 HBASE-28113 Modify the way of acquiring the 
RegionStateNode lock in checkOnlineRegionsReport to tryLock (#5442)
2ac964cf354 is described below

commit 2ac964cf35418f95d27719cc098fb5a3a9ab4137
Author: hiping-tech <58875741+hiping-t...@users.noreply.github.com>
AuthorDate: Thu Oct 19 23:22:10 2023 +0800

    HBASE-28113 Modify the way of acquiring the RegionStateNode lock in 
checkOnlineRegionsReport to tryLock (#5442)
    
    * To prevent threads from being blocked by the lock of RegionStateNode, 
modify the way of acquiring the RegionStateNode lock in 
checkOnlineRegionsReport to tryLock.
    
    Co-authored-by: lvhaiping.lhp <lvhaiping....@alibaba-inc.com>
    Signed-off-by: Duo Zhang <zhang...@apache.org>
    (cherry picked from commit e07d1fe0059d5dc17d1aad7c582e486c0f75fe52)
---
 .../hbase/master/assignment/AssignmentManager.java | 52 +++++++++++++---------
 .../hbase/master/assignment/RegionStateNode.java   |  4 ++
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index c7bc2171791..6ae0bdebb38 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1347,29 +1347,39 @@ public class AssignmentManager {
         continue;
       }
       final long lag = 1000;
-      regionNode.lock();
-      try {
-        long diff = EnvironmentEdgeManager.currentTime() - 
regionNode.getLastUpdate();
-        if (regionNode.isInState(State.OPENING, State.OPEN)) {
-          // This is possible as a region server has just closed a region but 
the region server
-          // report is generated before the closing, but arrive after the 
closing. Make sure there
-          // is some elapsed time so less false alarms.
-          if (!regionNode.getRegionLocation().equals(serverName) && diff > 
lag) {
-            LOG.warn("Reporting {} server does not match {} (time since last "
-              + "update={}ms); closing...", serverName, regionNode, diff);
-            closeRegionSilently(serverNode.getServerName(), regionName);
-          }
-        } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
-          // So, we can get report that a region is CLOSED or SPLIT because a 
heartbeat
-          // came in at about same time as a region transition. Make sure 
there is some
-          // elapsed time so less false alarms.
-          if (diff > lag) {
-            LOG.warn("Reporting {} state does not match {} (time since last 
update={}ms)",
-              serverName, regionNode, diff);
+      // This is just a fallback check designed to identify unexpected data 
inconsistencies, so we
+      // use tryLock to attempt to acquire the lock, and if the lock cannot be 
acquired, we skip the
+      // check. This will not cause any additional problems and also prevents 
the regionServerReport
+      // call from being stuck for too long which may cause deadlock on region 
assignment.
+      if (regionNode.tryLock()) {
+        try {
+          long diff = EnvironmentEdgeManager.currentTime() - 
regionNode.getLastUpdate();
+          if (regionNode.isInState(State.OPENING, State.OPEN)) {
+            // This is possible as a region server has just closed a region 
but the region server
+            // report is generated before the closing, but arrive after the 
closing. Make sure
+            // there
+            // is some elapsed time so less false alarms.
+            if (!regionNode.getRegionLocation().equals(serverName) && diff > 
lag) {
+              LOG.warn("Reporting {} server does not match {} (time since last 
"
+                + "update={}ms); closing...", serverName, regionNode, diff);
+              closeRegionSilently(serverNode.getServerName(), regionName);
+            }
+          } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
+            // So, we can get report that a region is CLOSED or SPLIT because 
a heartbeat
+            // came in at about same time as a region transition. Make sure 
there is some
+            // elapsed time so less false alarms.
+            if (diff > lag) {
+              LOG.warn("Reporting {} state does not match {} (time since last 
update={}ms)",
+                serverName, regionNode, diff);
+            }
           }
+        } finally {
+          regionNode.unlock();
         }
-      } finally {
-        regionNode.unlock();
+      } else {
+        LOG.warn(
+          "Unable to acquire lock for regionNode {}. It is likely that another 
thread is currently holding the lock. To avoid deadlock, skip execution for 
now.",
+          regionNode);
       }
     }
   }
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
index 7d2d47db630..2c61d3d427b 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
@@ -319,6 +319,10 @@ public class RegionStateNode implements 
Comparable<RegionStateNode> {
     lock.lock();
   }
 
+  public boolean tryLock() {
+    return lock.tryLock();
+  }
+
   public void unlock() {
     lock.unlock();
   }

Reply via email to