This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
new cd3a83fb6dc HBASE-28113 Modify the way of acquiring the
RegionStateNode lock in checkOnlineRegionsReport to tryLock (#5442)
cd3a83fb6dc is described below
commit cd3a83fb6dcf1d8e22a7fd5a12caa222df5b7667
Author: hiping-tech <[email protected]>
AuthorDate: Thu Oct 19 23:22:10 2023 +0800
HBASE-28113 Modify the way of acquiring the RegionStateNode lock in
checkOnlineRegionsReport to tryLock (#5442)
* To prevent threads from being blocked by the lock of RegionStateNode,
modify the way of acquiring the RegionStateNode lock in
checkOnlineRegionsReport to tryLock.
Co-authored-by: lvhaiping.lhp <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
(cherry picked from commit e07d1fe0059d5dc17d1aad7c582e486c0f75fe52)
---
.../hbase/master/assignment/AssignmentManager.java | 52 +++++++++++++---------
.../hbase/master/assignment/RegionStateNode.java | 4 ++
2 files changed, 35 insertions(+), 21 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 2a5f31ef86a..d385c786618 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1279,29 +1279,39 @@ public class AssignmentManager {
continue;
}
final long lag = 1000;
- regionNode.lock();
- try {
- long diff = EnvironmentEdgeManager.currentTime() -
regionNode.getLastUpdate();
- if (regionNode.isInState(State.OPENING, State.OPEN)) {
- // This is possible as a region server has just closed a region but
the region server
- // report is generated before the closing, but arrive after the
closing. Make sure there
- // is some elapsed time so less false alarms.
- if (!regionNode.getRegionLocation().equals(serverName) && diff >
lag) {
- LOG.warn("Reporting {} server does not match {} (time since last "
- + "update={}ms); closing...", serverName, regionNode, diff);
- closeRegionSilently(serverNode.getServerName(), regionName);
- }
- } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
- // So, we can get report that a region is CLOSED or SPLIT because a
heartbeat
- // came in at about same time as a region transition. Make sure
there is some
- // elapsed time so less false alarms.
- if (diff > lag) {
- LOG.warn("Reporting {} state does not match {} (time since last
update={}ms)",
- serverName, regionNode, diff);
+ // This is just a fallback check designed to identify unexpected data
inconsistencies, so we
+ // use tryLock to attempt to acquire the lock, and if the lock cannot be
acquired, we skip the
+ // check. This will not cause any additional problems and also prevents
the regionServerReport
+ // call from being stuck for too long which may cause deadlock on region
assignment.
+ if (regionNode.tryLock()) {
+ try {
+ long diff = EnvironmentEdgeManager.currentTime() -
regionNode.getLastUpdate();
+ if (regionNode.isInState(State.OPENING, State.OPEN)) {
+ // This is possible as a region server has just closed a region
but the region server
+ // report is generated before the closing, but arrive after the
closing. Make sure
+ // there
+ // is some elapsed time so less false alarms.
+ if (!regionNode.getRegionLocation().equals(serverName) && diff >
lag) {
+ LOG.warn("Reporting {} server does not match {} (time since last
"
+ + "update={}ms); closing...", serverName, regionNode, diff);
+ closeRegionSilently(serverNode.getServerName(), regionName);
+ }
+ } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
+ // So, we can get report that a region is CLOSED or SPLIT because
a heartbeat
+ // came in at about same time as a region transition. Make sure
there is some
+ // elapsed time so less false alarms.
+ if (diff > lag) {
+ LOG.warn("Reporting {} state does not match {} (time since last
update={}ms)",
+ serverName, regionNode, diff);
+ }
}
+ } finally {
+ regionNode.unlock();
}
- } finally {
- regionNode.unlock();
+ } else {
+ LOG.warn(
+ "Unable to acquire lock for regionNode {}. It is likely that another
thread is currently holding the lock. To avoid deadlock, skip execution for
now.",
+ regionNode);
}
}
}
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
index 0ff7f9b014a..8df2ce41048 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
@@ -307,6 +307,10 @@ public class RegionStateNode implements
Comparable<RegionStateNode> {
lock.lock();
}
+ public boolean tryLock() {
+ return lock.tryLock();
+ }
+
public void unlock() {
lock.unlock();
}