[ https://issues.apache.org/jira/browse/GOBBLIN-1107?focusedWorklogId=414478&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-414478 ]
ASF GitHub Bot logged work on GOBBLIN-1107: ------------------------------------------- Author: ASF GitHub Bot Created on: 02/Apr/20 04:35 Start Date: 02/Apr/20 04:35 Worklog Time Spent: 10m Work Description: autumnust commented on pull request #2947: GOBBLIN-1107: Lazily initialize Helix TaskStateModelFactory in Gobbli… URL: https://github.com/apache/incubator-gobblin/pull/2947#discussion_r402046579 ########## File path: gobblin-cluster/src/test/java/org/apache/gobblin/cluster/GobblinTaskRunnerTest.java ########## @@ -156,13 +179,73 @@ public void testConnectHelixManagerWithRetry() { //Ensure that connect with retry succeeds corruptGobblinTaskRunner.connectHelixManagerWithRetry(); Assert.assertTrue(true); + + corruptGobblinTaskRunner.disconnectHelixManager(); + } + + @Test (groups = {"disabledOnTravis"}) + public void testTaskAssignmentAfterHelixConnectionRetry() + throws Exception { + this.suite = new TaskAssignmentAfterConnectionRetry(); + + String zkConnectString = suite.getManagerConfig().getString(GobblinClusterConfigurationKeys.ZK_CONNECTION_STRING_KEY); + String clusterName = suite.getManagerConfig().getString(GobblinClusterConfigurationKeys.HELIX_CLUSTER_NAME_KEY); + //A test manager instance for observing the state of the cluster + HelixManager helixManager = HelixManagerFactory.getZKHelixManager(clusterName, "TestManager", InstanceType.SPECTATOR, zkConnectString); + + suite.startCluster(); + + helixManager.connect(); + + //Ensure that Helix has created a workflow + AssertWithBackoff.create().maxSleepMs(1000).backoffFactor(1). + assertTrue(ClusterIntegrationTest.isTaskStarted(helixManager, TaskAssignmentAfterConnectionRetry.JOB_ID), "Waiting for the job to start..."); + + //Ensure that the SleepingTask is running + AssertWithBackoff.create().maxSleepMs(100).timeoutMs(2000).backoffFactor(1). 
+ assertTrue(ClusterIntegrationTest.isTaskRunning(TaskAssignmentAfterConnectionRetry.TASK_STATE_FILE),"Waiting for the task to enter running state"); + + helixManager.disconnect(); } + + public static class TaskAssignmentAfterConnectionRetry extends IntegrationBasicSuite { Review comment: Can we rename `HelixAssignedParticipantCheckTest#IntegrationJobSuite` to something more generic, and have this class extend that instead? `overrideJobConfigs` seems to be commonly used when we want to simulate scenarios where a task has to fail somehow. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 414478) Time Spent: 1h 10m (was: 1h) > Lazily initialize Helix TaskStateModelFactory in GobblinTaskRunner > ------------------------------------------------------------------ > > Key: GOBBLIN-1107 > URL: https://issues.apache.org/jira/browse/GOBBLIN-1107 > Project: Apache Gobblin > Issue Type: Improvement > Components: gobblin-cluster > Affects Versions: 0.15.0 > Reporter: Sudarshan Vasudevan > Assignee: Hung Tran > Priority: Major > Fix For: 0.15.0 > > Time Spent: 1h 10m > Remaining Estimate: 0h > > Current behavior eagerly initializes the taskStateModelFactory inside the > constructor. This causes Helix task assignment to be stuck when a Helix > manager connection needs to be retried. -- This message was sent by Atlassian Jira (v8.3.4#803005)