This is an automated email from the ASF dual-hosted git repository. onichols pushed a commit to branch support/1.13 in repository https://gitbox.apache.org/repos/asf/geode.git
commit 53d32c325c808d6e965a45cdf36aca1a71db2183 Author: Kamilla Aslami <[email protected]> AuthorDate: Fri Jan 8 14:57:02 2021 -0800 GEODE-7861: Improve error reporting in GMSJoinLeave.join() (#5839) * GEODE-7861: Improve error reporting in GMSJoinLeave.join() * Fix LocatorDUnitTest.testNoLocator * Changes after the code review * Fix typo (cherry picked from commit 089c1ba7e20606f8201a4cd8f7221f6adc60ba5c) --- .../apache/geode/distributed/LocatorDUnitTest.java | 3 +- .../gms/membership/GMSJoinLeaveJUnitTest.java | 100 ++++++++++++++++----- .../internal/membership/gms/GMSMembership.java | 7 +- .../membership/gms/interfaces/JoinLeave.java | 12 ++- .../membership/gms/membership/GMSJoinLeave.java | 76 ++++++++++------ 5 files changed, 143 insertions(+), 55 deletions(-) diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java index d3c1733..e0f8966 100644 --- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java +++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java @@ -1002,7 +1002,8 @@ public class LocatorDUnitTest implements Serializable { } catch (GemFireConfigException ex) { String s = ex.getMessage(); - assertThat(s.contains("Locator does not exist")).isTrue(); + assertThat(s.contains("Could not contact any of the locators")) + .isTrue(); } } diff --git a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java index 3a86a1e..fd15db6 100644 --- a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java +++ b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java @@ -22,12 +22,14 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Matchers.isA; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; +import static org.mockito.Mockito.spy; import static org.mockito.Mockito.timeout; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -115,11 +117,18 @@ public class GMSJoinLeaveJUnitTest { public void initMocks(boolean enableNetworkPartition, boolean useTestGMSJoinLeave) throws Exception { + String locator = "localhost[12345]"; + initMocks(enableNetworkPartition, useTestGMSJoinLeave, locator, locator); + } + + public void initMocks(boolean enableNetworkPartition, boolean useTestGMSJoinLeave, + String locators, String startLocator) + throws Exception { mockConfig = mock(MembershipConfig.class); when(mockConfig.isNetworkPartitionDetectionEnabled()).thenReturn(enableNetworkPartition); when(mockConfig.getSecurityUDPDHAlgo()).thenReturn(""); - when(mockConfig.getStartLocator()).thenReturn("localhost[12345]"); - when(mockConfig.getLocators()).thenReturn("localhost[12345]"); + when(mockConfig.getStartLocator()).thenReturn(startLocator); + when(mockConfig.getLocators()).thenReturn(locators); when(mockConfig.getMcastPort()).thenReturn(0); when(mockConfig.getMemberTimeout()).thenReturn(2000L); @@ -1423,14 +1432,7 @@ public class GMSJoinLeaveJUnitTest { @Test public void testCoordinatorFindRequestSuccess() throws Exception { initMocks(false); - HashSet<MemberIdentifier> registrants = new HashSet<>(); - registrants.add(mockMembers[0]); - FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], false, - null, registrants, false, true, null); - - when(locatorClient.requestToServer(isA(HostAndPort.class), - isA(FindCoordinatorRequest.class), anyInt(), anyBoolean())) - .thenReturn(fcr); + mockRequestToServer(isA(HostAndPort.class)); boolean foundCoordinator = gmsJoinLeave.findCoordinator(); assertTrue(gmsJoinLeave.searchState.toString(), foundCoordinator); @@ -1441,24 +1443,82 @@ public class GMSJoinLeaveJUnitTest { public void testCoordinatorFindRequestFailure() throws Exception { try { initMocks(false); - HashSet<MemberIdentifier> registrants = new HashSet<>(); - registrants.add(mockMembers[0]); - FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], - false, null, registrants, false, true, null); + mockRequestToServer(eq(new HostAndPort("localhost", 12346))); GMSMembershipView view = createView(); JoinResponseMessage jrm = new JoinResponseMessage(mockMembers[0], view, 0); gmsJoinLeave.setJoinResponseMessage(jrm); - when(locatorClient.requestToServer(eq(new HostAndPort("localhost", 12346)), - isA(FindCoordinatorRequest.class), anyInt(), anyBoolean())) - .thenReturn(fcr); - - assertFalse("Should not be able to join ", gmsJoinLeave.join()); + assertThatThrownBy(gmsJoinLeave::join) + .isInstanceOf(MembershipConfigurationException.class); } finally { - } } + @Test + public void testJoinFailureWhenSleepInterrupted() throws Exception { + initMocks(false); + mockRequestToServer(isA(HostAndPort.class)); + + when(mockConfig.getMemberTimeout()).thenReturn(100L); + when(mockConfig.getJoinTimeout()).thenReturn(1000L); + + GMSJoinLeave spyGmsJoinLeave = spy(gmsJoinLeave); + when(spyGmsJoinLeave.pauseIfThereIsNoCoordinator(-1, GMSJoinLeave.JOIN_RETRY_SLEEP)) + .thenThrow(new InterruptedException()); + + assertThatThrownBy(spyGmsJoinLeave::join) + .isInstanceOf(MembershipConfigurationException.class) + .hasMessageContaining("Retry sleep interrupted"); + } + + @Test + public void testJoinFailureWhenTimeout() throws Exception { + initMocks(false); + mockRequestToServer(isA(HostAndPort.class)); + + assertThatThrownBy(() -> gmsJoinLeave.join()) + .isInstanceOf(MembershipConfigurationException.class) + .hasMessageContaining("Operation timed out"); + } + + @Test + public void testPauseIfThereIsNoCoordinator() throws InterruptedException { + locatorClient = mock(TcpClient.class); + gmsJoinLeave = new GMSJoinLeave(locatorClient); + assertThat(gmsJoinLeave.pauseIfThereIsNoCoordinator(-1, GMSJoinLeave.JOIN_RETRY_SLEEP)) + .isFalse(); + assertThat(gmsJoinLeave.pauseIfThereIsNoCoordinator(1, GMSJoinLeave.JOIN_RETRY_SLEEP)).isTrue(); + } + + @Test + public void testJoinFailureWhenNoLocator() throws Exception { + final String locator1 = "locator1[12345]"; + final String locator2 = "locator2[54321]"; + locatorClient = mock(TcpClient.class); + + initMocks(false, false, locator1 + ',' + locator2, locator1); + when(locatorClient.requestToServer(any(), any(), anyInt(), anyBoolean())) + .thenThrow(IOException.class); + + assertThatThrownBy(gmsJoinLeave::join) + .isInstanceOf(MembershipConfigurationException.class) + .hasMessageContaining( + "Could not contact any of the locators: [HostAndPort[locator1:12345], HostAndPort[locator2:54321]]") + .hasCauseInstanceOf(IOException.class); + } + + private void mockRequestToServer(HostAndPort hostAndPort) + throws IOException, ClassNotFoundException { + HashSet<MemberIdentifier> registrants = new HashSet<>(); + registrants.add(mockMembers[0]); + + FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], false, + null, registrants, false, true, null); + when(locatorClient.requestToServer(hostAndPort, + isA(FindCoordinatorRequest.class), anyInt(), anyBoolean())) + .thenReturn(fcr); + } + private void waitForViewAndFinalCheckInProgress(int viewId) throws InterruptedException { // wait for the view processing thread to collect and process the requests int sleeps = 0; diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java index 1986931..434093c 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java @@ -571,12 +571,7 @@ public class GMSMembership<ID extends MemberIdentifier> implements Membership<ID this.isJoining = true; // added for bug #44373 // connect - boolean ok = services.getJoinLeave().join(); - - if (!ok) { - throw new MembershipConfigurationException("Unable to join the distributed system. " - + "Operation either timed out, was stopped or Locator does not exist."); - } + services.getJoinLeave().join(); MembershipView<ID> initialView = createGeodeView(services.getJoinLeave().getView()); latestView = new MembershipView<>(initialView, initialView.getViewId()); diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java index 7228162..1880a26 100755 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java @@ -16,6 +16,7 @@ package org.apache.geode.distributed.internal.membership.gms.interfaces; import org.apache.geode.distributed.internal.membership.api.MemberIdentifier; import org.apache.geode.distributed.internal.membership.api.MemberStartupException; +import org.apache.geode.distributed.internal.membership.api.MembershipConfigurationException; import org.apache.geode.distributed.internal.membership.gms.GMSMembershipView; /** @@ -26,10 +27,15 @@ import org.apache.geode.distributed.internal.membership.gms.GMSMembershipView; public interface JoinLeave<ID extends MemberIdentifier> extends Service<ID> { /** - * joins the distributed system and returns true if successful, false if not. Throws - * MemberStartupException and MemberConfigurationException + * joins the distributed system. + * + * @throws MemberStartupException if there was a problem joining the cluster after membership + * configuration has + * completed. + * @throws MembershipConfigurationException if operation either timed out, was stopped or locator + * does not exist. */ - boolean join() throws MemberStartupException; + void join() throws MemberStartupException; /** * leaves the distributed system. Should be invoked before stop() diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java index 208cfb5..87587d5 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java @@ -273,11 +273,13 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID> int lastFindCoordinatorInViewId = -1000; final Set<FindCoordinatorResponse<ID>> responses = new HashSet<>(); public int responsesExpected; + Exception lastLocatorException; void cleanup() { alreadyTried.clear(); possibleCoordinator = null; view = null; + lastLocatorException = null; synchronized (responses) { responses.clear(); } @@ -315,14 +317,14 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID> * @return true if successful, false if not */ @Override - public boolean join() throws MemberStartupException { + public void join() throws MemberStartupException { try { if (Boolean.getBoolean(BYPASS_DISCOVERY_PROPERTY)) { synchronized (viewInstallationLock) { becomeCoordinator(); } - return true; + return; } SearchState<ID> state = searchState; @@ -355,11 +357,11 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID> synchronized (viewInstallationLock) { becomeCoordinator(); } - return true; + return; } } else { if (attemptToJoin()) { - return true; + return; } if (this.isStopping) { break; @@ -383,40 +385,45 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID> break; } } - try { - if (found && !state.hasContactedAJoinedLocator) { - // if locators are restarting they may be handing out IDs from a stale view that - // we should go through quickly. Otherwise we should sleep a bit to let failure - // detection select a new coordinator - if (state.possibleCoordinator.getVmViewId() < 0) { - logger.debug("sleeping for {} before making another attempt to find the coordinator", - retrySleep); - Thread.sleep(retrySleep); - } else { + if (found && !state.hasContactedAJoinedLocator) { + try { + if (pauseIfThereIsNoCoordinator(state.possibleCoordinator.getVmViewId(), retrySleep)) { // since we were given a coordinator that couldn't be used we should keep trying tries = 0; giveupTime = System.currentTimeMillis() + timeout; } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new MembershipConfigurationException( + "Retry sleep interrupted. Giving up on joining the distributed system."); } - } catch (InterruptedException e) { - logger.debug("retry sleep interrupted - giving up on joining the distributed system"); - return false; } } // for if (!this.isJoined) { logger.debug("giving up attempting to join the distributed system after " + (System.currentTimeMillis() - startTime) + "ms"); - } - // to preserve old behavior we need to throw a MemberStartupException if - // unable to contact any of the locators - if (!this.isJoined && state.hasContactedAJoinedLocator) { - throw new MemberStartupException("Unable to join the distributed system in " - + (System.currentTimeMillis() - startTime) + "ms"); - } + // to preserve old behavior we need to throw a MemberStartupException if + // unable to contact any of the locators + if (state.hasContactedAJoinedLocator) { + throw new MemberStartupException("Unable to join the distributed system in " + + (System.currentTimeMillis() - startTime) + "ms"); + } - return this.isJoined; + if (state.locatorsContacted == 0) { + throw new MembershipConfigurationException( + "Unable to join the distributed system. Could not contact any of the locators: " + + locators, + state.lastLocatorException); + } + + if (System.currentTimeMillis() > giveupTime) { + throw new MembershipConfigurationException( + "Unable to join the distributed system. Operation timed out"); + } + } + return; } finally { // notify anyone waiting on the address to be completed if (this.isJoined) { @@ -428,6 +435,24 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID> } } + boolean pauseIfThereIsNoCoordinator(int viewId, long retrySleep) + throws InterruptedException { + // if locators are restarting they may be handing out IDs from a stale view that + // we should go through quickly. Otherwise we should sleep a bit to let failure + // detection select a new coordinator + if (viewId < 0) { + // the process hasn't finished joining the cluster. + logger.debug("sleeping for {} before making another attempt to find the coordinator", + retrySleep); + Thread.sleep(retrySleep); + } else { + // the member has joined the cluster. + return true; + } + + return false; + } + /** * send a join request and wait for a reply. Process the reply. This may throw a * MemberStartupException or an exception from the authenticator, if present. @@ -1199,6 +1224,7 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID> } catch (IOException | ClassNotFoundException problem) { logger.info("Unable to contact locator " + laddr + ": " + problem); logger.debug("Exception thrown when contacting a locator", problem); + state.lastLocatorException = problem; if (state.locatorsContacted == 0 && System.currentTimeMillis() < giveUpTime) { try { Thread.sleep(FIND_LOCATOR_RETRY_SLEEP);
