bbeaudreault commented on code in PR #5534: URL: https://github.com/apache/hbase/pull/5534#discussion_r1407617976
########## hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java: ########## @@ -139,33 +170,57 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED: regions = regions.stream().map(env.getAssignmentManager().getRegionStates()::checkReopened) .filter(l -> l != null).collect(Collectors.toList()); - if (regions.isEmpty()) { - return Flow.NO_MORE_STATE; + // we need to create a set of region names because the HRegionLocation hashcode is only + // based + // on the server name + Set<byte[]> currentRegionBatchNames = currentRegionBatch.stream() + .map(r -> r.getRegion().getRegionName()).collect(Collectors.toSet()); + currentRegionBatch = regions.stream() + .filter(r -> currentRegionBatchNames.contains(r.getRegion().getRegionName())) + .collect(Collectors.toList()); + if (currentRegionBatch.isEmpty()) { + if (regions.isEmpty()) { + return Flow.NO_MORE_STATE; + } else { + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + if (reopenBatchBackoffMillis > 0) { + backoff(reopenBatchBackoffMillis); + } + return Flow.HAS_MORE_STATE; + } } - if (regions.stream().anyMatch(loc -> canSchedule(env, loc))) { + if (currentRegionBatch.stream().anyMatch(loc -> canSchedule(env, loc))) { retryCounter = null; setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + if (reopenBatchBackoffMillis > 0) { + backoff(reopenBatchBackoffMillis); + } return Flow.HAS_MORE_STATE; } // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry // again. 
if (retryCounter == null) { retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); } - long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); + long backoffMillis = retryCounter.getBackoffTimeAndIncrementAttempts(); LOG.info( - "There are still {} region(s) which need to be reopened for table {} are in " + "There are still {} region(s) which need to be reopened for table {}. {} are in " + "OPENING state, suspend {}secs and try again later", - regions.size(), tableName, backoff / 1000); - setTimeout(Math.toIntExact(backoff)); - setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); - skipPersistence(); + regions.size(), tableName, currentRegionBatch.size(), backoffMillis / 1000); + backoff(backoffMillis); throw new ProcedureSuspendedException(); default: throw new UnsupportedOperationException("unhandled state=" + state); } } + private void backoff(long millis) throws ProcedureSuspendedException { + setTimeout(Math.toIntExact(millis)); + setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); + skipPersistence(); Review Comment: I think we need some way to track which regions still have to be restarted after backoff. I don't think we're modifying the proto here yet, so I agree persistence wouldn't help much yet. But maybe we need to update the proto? Otherwise, in case of a master restart, it'll just reopen all regions again? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org