Apache9 commented on code in PR #5534:
URL: https://github.com/apache/hbase/pull/5534#discussion_r1407666801
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java:
##########
@@ -139,33 +170,57 @@ protected Flow executeFromState(MasterProcedureEnv env,
ReopenTableRegionsState
case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED:
regions =
regions.stream().map(env.getAssignmentManager().getRegionStates()::checkReopened)
.filter(l -> l != null).collect(Collectors.toList());
- if (regions.isEmpty()) {
- return Flow.NO_MORE_STATE;
+ // we need to create a set of region names because the HRegionLocation
hashcode is only
+ // based
+ // on the server name
+ Set<byte[]> currentRegionBatchNames = currentRegionBatch.stream()
+ .map(r -> r.getRegion().getRegionName()).collect(Collectors.toSet());
+ currentRegionBatch = regions.stream()
+ .filter(r ->
currentRegionBatchNames.contains(r.getRegion().getRegionName()))
+ .collect(Collectors.toList());
+ if (currentRegionBatch.isEmpty()) {
+ if (regions.isEmpty()) {
+ return Flow.NO_MORE_STATE;
+ } else {
+
setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS);
+ if (reopenBatchBackoffMillis > 0) {
+ backoff(reopenBatchBackoffMillis);
+ }
+ return Flow.HAS_MORE_STATE;
+ }
}
- if (regions.stream().anyMatch(loc -> canSchedule(env, loc))) {
+ if (currentRegionBatch.stream().anyMatch(loc -> canSchedule(env,
loc))) {
retryCounter = null;
setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS);
+ if (reopenBatchBackoffMillis > 0) {
+ backoff(reopenBatchBackoffMillis);
+ }
return Flow.HAS_MORE_STATE;
}
// We can not schedule TRSP for all the regions need to reopen, wait
for a while and retry
// again.
if (retryCounter == null) {
retryCounter =
ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
}
- long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
+ long backoffMillis = retryCounter.getBackoffTimeAndIncrementAttempts();
LOG.info(
- "There are still {} region(s) which need to be reopened for table {}
are in "
+ "There are still {} region(s) which need to be reopened for table
{}. {} are in "
+ "OPENING state, suspend {}secs and try again later",
- regions.size(), tableName, backoff / 1000);
- setTimeout(Math.toIntExact(backoff));
- setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
- skipPersistence();
+ regions.size(), tableName, currentRegionBatch.size(), backoffMillis
/ 1000);
+ backoff(backoffMillis);
throw new ProcedureSuspendedException();
default:
throw new UnsupportedOperationException("unhandled state=" + state);
}
}
+ private void backoff(long millis) throws ProcedureSuspendedException {
+ setTimeout(Math.toIntExact(millis));
+ setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
+ skipPersistence();
Review Comment:
The flow for ReopenTableRegionsProcedure is
1. Get all regions of the table, and then go to step 2
2. Schedule TRSP to reopen the regions in the list, and then go to step 3
3. (After all the TRSPs are finished) Check whether there are still regions
that need to be reopened, using the RegionStates.checkReopened method, and
update the regions list. If no regions need to be reopened, we are done.
Otherwise, go to step 2.
So if we are still in step 3 after suspending, it is OK not to serialize
anything, as we rely on RegionStates.checkReopened, not on any procedure
state, to filter out the regions which have already been reopened.
After the master restarts, the region states will be reconstructed by reading meta.
Thanks.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]