virajjasani commented on code in PR #5513:
URL: https://github.com/apache/hbase/pull/5513#discussion_r1389596832
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java:
##########
@@ -1399,23 +1408,75 @@ private void
createMissingCFsInMetaDuringUpgrade(TableDescriptor metaDescriptor)
* Check hbase:meta is up and ready for reading. For use during Master
startup only.
* @return True if meta is UP and online and startup can progress.
Otherwise, meta is not online
* and we will hold here until operator intervention.
+ * @throws IOException If the master restart is required.
*/
@InterfaceAudience.Private
- public boolean waitForMetaOnline() {
+ public boolean waitForMetaOnline() throws IOException {
return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO);
}
/**
+ * Wait until the region is reported online on a live regionserver.
+ * @param ri Region info.
* @return True if region is online and scannable else false if an error or
shutdown (Otherwise we
* just block in here holding up all forward-progess).
+ * @throws IOException If the master restart is required.
*/
- private boolean isRegionOnline(RegionInfo ri) {
+ private boolean isRegionOnline(RegionInfo ri) throws IOException {
RetryCounter rc = null;
while (!isStopped()) {
RegionState rs =
this.assignmentManager.getRegionStates().getRegionState(ri);
if (rs != null && rs.isOpened()) {
if (this.getServerManager().isServerOnline(rs.getServerName())) {
return true;
+ } else {
+ ServerName serverNameForRegion = rs.getServerName();
+ Optional<Procedure<MasterProcedureEnv>> scpForServer =
+ this.procedureExecutor.getProcedures().stream()
+ .filter(p -> p instanceof ServerCrashProcedure
+ && serverNameForRegion.equals(((ServerCrashProcedure)
p).getServerName()))
+ .findFirst();
+ if (!scpForServer.isPresent()) {
+ LOG.info("{} has state {} but the server {} is not online,
scheduling recovery.",
+ ri.getRegionNameAsString(), rs, rs.getServerName());
+ this.getServerManager().expireServer(rs.getServerName(), true);
+ int numRetries = this.getConfiguration()
+ .getInt(HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_RETRIES, 20);
+ int sleepInterval = this.getConfiguration()
+ .getInt(HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_INTERVAL_MS,
2000);
+ while (numRetries > 0) {
+ scpForServer = this.procedureExecutor.getProcedures().stream()
+ .filter(p -> p instanceof ServerCrashProcedure
+ && serverNameForRegion.equals(((ServerCrashProcedure)
p).getServerName()))
+ .findFirst();
+ if (scpForServer.isPresent()) {
+ ServerCrashProcedure proc = (ServerCrashProcedure)
scpForServer.get();
+ if (proc.isFinished() || proc.isSuccess()) {
+ rs =
this.assignmentManager.getRegionStates().getRegionState(ri);
+ if (rs != null && rs.isOpened()) {
+ if
(this.getServerManager().isServerOnline(rs.getServerName())) {
+ return true;
+ }
+ }
+ }
+ }
+ Threads.sleep(sleepInterval);
+ numRetries--;
+ }
+ if (numRetries == 0) {
+ rs = this.assignmentManager.getRegionStates().getRegionState(ri);
+ if (rs != null && rs.isOpened()) {
+ if
(this.getServerManager().isServerOnline(rs.getServerName())) {
+ return true;
+ }
+ }
+ throw new PleaseRestartMasterException("Scheduled SCP for old
server for region "
Review Comment:
Here we crash only because, even after scheduling an SCP for the server that
hosted the region (meta or namespace), we still find the region not online.
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java:
##########
@@ -1399,23 +1408,75 @@ private void
createMissingCFsInMetaDuringUpgrade(TableDescriptor metaDescriptor)
* Check hbase:meta is up and ready for reading. For use during Master
startup only.
* @return True if meta is UP and online and startup can progress.
Otherwise, meta is not online
* and we will hold here until operator intervention.
+ * @throws IOException If the master restart is required.
*/
@InterfaceAudience.Private
- public boolean waitForMetaOnline() {
+ public boolean waitForMetaOnline() throws IOException {
return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO);
}
/**
+ * Wait until the region is reported online on a live regionserver.
+ * @param ri Region info.
* @return True if region is online and scannable else false if an error or
shutdown (Otherwise we
* just block in here holding up all forward-progess).
+ * @throws IOException If the master restart is required.
*/
- private boolean isRegionOnline(RegionInfo ri) {
+ private boolean isRegionOnline(RegionInfo ri) throws IOException {
RetryCounter rc = null;
while (!isStopped()) {
RegionState rs =
this.assignmentManager.getRegionStates().getRegionState(ri);
if (rs != null && rs.isOpened()) {
if (this.getServerManager().isServerOnline(rs.getServerName())) {
return true;
+ } else {
+ ServerName serverNameForRegion = rs.getServerName();
+ Optional<Procedure<MasterProcedureEnv>> scpForServer =
+ this.procedureExecutor.getProcedures().stream()
+ .filter(p -> p instanceof ServerCrashProcedure
+ && serverNameForRegion.equals(((ServerCrashProcedure)
p).getServerName()))
+ .findFirst();
+ if (!scpForServer.isPresent()) {
+ LOG.info("{} has state {} but the server {} is not online,
scheduling recovery.",
+ ri.getRegionNameAsString(), rs, rs.getServerName());
+ this.getServerManager().expireServer(rs.getServerName(), true);
+ int numRetries = this.getConfiguration()
+ .getInt(HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_RETRIES, 20);
+ int sleepInterval = this.getConfiguration()
+ .getInt(HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_INTERVAL_MS,
2000);
+ while (numRetries > 0) {
+ scpForServer = this.procedureExecutor.getProcedures().stream()
+ .filter(p -> p instanceof ServerCrashProcedure
+ && serverNameForRegion.equals(((ServerCrashProcedure)
p).getServerName()))
+ .findFirst();
+ if (scpForServer.isPresent()) {
Review Comment:
That is also fine, because the final check below will eventually succeed:
```
if (numRetries == 0) {
rs =
this.assignmentManager.getRegionStates().getRegionState(ri);
if (rs != null && rs.isOpened()) {
if
(this.getServerManager().isServerOnline(rs.getServerName())) {
return true;
}
}
```
So if the region (meta/namespace) was assigned successfully, we will return
true from here eventually.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]