This is an automated email from the ASF dual-hosted git repository. zhangduo pushed a commit to branch branch-2.5 in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.5 by this push: new 2ac964cf354 HBASE-28113 Modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock (#5442) 2ac964cf354 is described below commit 2ac964cf35418f95d27719cc098fb5a3a9ab4137 Author: hiping-tech <58875741+hiping-t...@users.noreply.github.com> AuthorDate: Thu Oct 19 23:22:10 2023 +0800 HBASE-28113 Modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock (#5442) * To prevent threads from being blocked by the lock of RegionStateNode, modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock. Co-authored-by: lvhaiping.lhp <lvhaiping....@alibaba-inc.com> Signed-off-by: Duo Zhang <zhang...@apache.org> (cherry picked from commit e07d1fe0059d5dc17d1aad7c582e486c0f75fe52) --- .../hbase/master/assignment/AssignmentManager.java | 52 +++++++++++++--------- .../hbase/master/assignment/RegionStateNode.java | 4 ++ 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index c7bc2171791..6ae0bdebb38 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1347,29 +1347,39 @@ public class AssignmentManager { continue; } final long lag = 1000; - regionNode.lock(); - try { - long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate(); - if (regionNode.isInState(State.OPENING, State.OPEN)) { - // This is possible as a region server has just closed a region but the region server - // report is generated before the closing, but arrive after the closing. Make sure there - // is some elapsed time so less false alarms. - if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) { - LOG.warn("Reporting {} server does not match {} (time since last " - + "update={}ms); closing...", serverName, regionNode, diff); - closeRegionSilently(serverNode.getServerName(), regionName); - } - } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) { - // So, we can get report that a region is CLOSED or SPLIT because a heartbeat - // came in at about same time as a region transition. Make sure there is some - // elapsed time so less false alarms. - if (diff > lag) { - LOG.warn("Reporting {} state does not match {} (time since last update={}ms)", - serverName, regionNode, diff); + // This is just a fallback check designed to identify unexpected data inconsistencies, so we + // use tryLock to attempt to acquire the lock, and if the lock cannot be acquired, we skip the + // check. This will not cause any additional problems and also prevents the regionServerReport + // call from being stuck for too long which may cause deadlock on region assignment. + if (regionNode.tryLock()) { + try { + long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate(); + if (regionNode.isInState(State.OPENING, State.OPEN)) { + // This is possible as a region server has just closed a region but the region server + // report is generated before the closing, but arrive after the closing. Make sure + // there + // is some elapsed time so less false alarms. + if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) { + LOG.warn("Reporting {} server does not match {} (time since last " + + "update={}ms); closing...", serverName, regionNode, diff); + closeRegionSilently(serverNode.getServerName(), regionName); + } + } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) { + // So, we can get report that a region is CLOSED or SPLIT because a heartbeat + // came in at about same time as a region transition. Make sure there is some + // elapsed time so less false alarms. + if (diff > lag) { + LOG.warn("Reporting {} state does not match {} (time since last update={}ms)", + serverName, regionNode, diff); + } } + } finally { + regionNode.unlock(); } - } finally { - regionNode.unlock(); + } else { + LOG.warn( + "Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.", + regionNode); } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java index 7d2d47db630..2c61d3d427b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java @@ -319,6 +319,10 @@ public class RegionStateNode implements Comparable<RegionStateNode> { lock.lock(); } + public boolean tryLock() { + return lock.tryLock(); + } + public void unlock() { lock.unlock(); }