[GitHub] [flink] tillrohrmann commented on a change in pull request #15071: [FLINK-21135][coord] Adjust parallelism of job for reactive mode

GitBox Wed, 03 Mar 2021 05:32:21 -0800


tillrohrmann commented on a change in pull request #15071:
URL: https://github.com/apache/flink/pull/15071#discussion_r586410604




##########
File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java
##########
@@ -248,9 +250,24 @@ public AdaptiveScheduler(
                         declarativeSlotPool::reserveFreeSlot,
                         declarativeSlotPool::freeReservedSlot);
 
-        for (JobVertex vertex : jobGraph.getVertices()) {
-            if (vertex.getParallelism() == 
ExecutionConfig.PARALLELISM_DEFAULT) {
-                vertex.setParallelism(1);
+        if (configuration.get(JobManagerOptions.SCHEDULER_MODE)
+                == SchedulerExecutionMode.REACTIVE) {
+            LOG.info("Modifying job parallelism for running in reactive 
mode.");
+            for (JobVertex vertex : jobGraph.getVertices()) {
+                if (vertex.getMaxParallelism() == 
JobVertex.MAX_PARALLELISM_DEFAULT) {
+                    
vertex.setParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);
+                    
vertex.setMaxParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);
+                } else {
+                    vertex.setParallelism(vertex.getMaxParallelism());
+                }
+            }
+        } else {
+            // non-reactive mode (test execution with adaptive scheduler): 
ensure parallelism is set
+            // for all vertices.
+            for (JobVertex vertex : jobGraph.getVertices()) {
+                if (vertex.getParallelism() == 
ExecutionConfig.PARALLELISM_DEFAULT) {
+                    vertex.setParallelism(1);
+                }
             }

Review comment:
       I think it should not be the `AdaptiveScheduler`'s responsibility to set 
up the job correctly. If we do it, then we make the `AdaptiveScheduler` aware 
of the `ReactiveMode`, which it does not need to know about. The reactive mode 
is a user of the `AdaptiveScheduler` and thus needs to know about the 
`AdaptiveScheduler`. The other direction is not necessary.

##########
File path: 
flink-tests/src/test/java/org/apache/flink/test/scheduling/ReactiveModeITCase.java
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.scheduling;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.ClusterOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.configuration.SchedulerExecutionMode;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.runtime.testutils.MiniClusterResource;
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
+import org.apache.flink.test.util.MiniClusterWithClientResource;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+
+/** Tests for Reactive Mode (FLIP-159). */
+public class ReactiveModeITCase extends TestLogger {
+    private static final int NUMBER_SLOTS_PER_TASK_MANAGER = 2;
+    private static final int INITIAL_NUMBER_TASK_MANAGERS = 1;
+
+    @ClassRule
+    public static final MiniClusterResource MINI_CLUSTER_WITH_CLIENT_RESOURCE =
+            new MiniClusterWithClientResource(
+                    new MiniClusterResourceConfiguration.Builder()
+                            .setConfiguration(getReactiveModeConfiguration())
+                            
.setNumberTaskManagers(INITIAL_NUMBER_TASK_MANAGERS)
+                            
.setNumberSlotsPerTaskManager(NUMBER_SLOTS_PER_TASK_MANAGER)
+                            .build());
+
+    private static Configuration getReactiveModeConfiguration() {
+        final Configuration conf = new Configuration();
+
+        conf.set(JobManagerOptions.SCHEDULER, 
JobManagerOptions.SchedulerType.Adaptive);
+        conf.set(JobManagerOptions.SCHEDULER_MODE, 
SchedulerExecutionMode.REACTIVE);
+        conf.set(ClusterOptions.ENABLE_DECLARATIVE_RESOURCE_MANAGEMENT, true);
+
+        return conf;
+    }
+
+    @Test
+    public void testScaleUpAndDownWithMaxParallelism() throws Exception {
+        StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment();
+        env.setParallelism(1); // we set parallelism to ensure it's overwritten
+        
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 
0L));
+        final DataStream<String> input = env.addSource(new 
ParallelismTrackingSource());
+        // we set maxParallelism = 1 and assert it never exceeds it
+        input.addSink(new 
ParallelismTrackingSink<>()).getTransformation().setMaxParallelism(1);

Review comment:
       We could also think about testing that an explicitly configured max 
parallelism is honoured in a separate test case. That way the individual test 
cases are easier to understand.

##########
File path: 
flink-tests/src/test/java/org/apache/flink/test/scheduling/ReactiveModeITCase.java
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.scheduling;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.ClusterOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.configuration.SchedulerExecutionMode;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.runtime.testutils.MiniClusterResource;
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
+import org.apache.flink.test.util.MiniClusterWithClientResource;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+
+/** Tests for Reactive Mode (FLIP-159). */
+public class ReactiveModeITCase extends TestLogger {
+    private static final int NUMBER_SLOTS_PER_TASK_MANAGER = 2;
+    private static final int INITIAL_NUMBER_TASK_MANAGERS = 1;
+
+    @ClassRule
+    public static final MiniClusterResource MINI_CLUSTER_WITH_CLIENT_RESOURCE =
+            new MiniClusterWithClientResource(
+                    new MiniClusterResourceConfiguration.Builder()
+                            .setConfiguration(getReactiveModeConfiguration())
+                            
.setNumberTaskManagers(INITIAL_NUMBER_TASK_MANAGERS)
+                            
.setNumberSlotsPerTaskManager(NUMBER_SLOTS_PER_TASK_MANAGER)
+                            .build());
+
+    private static Configuration getReactiveModeConfiguration() {
+        final Configuration conf = new Configuration();
+
+        conf.set(JobManagerOptions.SCHEDULER, 
JobManagerOptions.SchedulerType.Adaptive);
+        conf.set(JobManagerOptions.SCHEDULER_MODE, 
SchedulerExecutionMode.REACTIVE);
+        conf.set(ClusterOptions.ENABLE_DECLARATIVE_RESOURCE_MANAGEMENT, true);
+
+        return conf;
+    }
+
+    @Test
+    public void testScaleUpAndDownWithMaxParallelism() throws Exception {
+        StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment();
+        env.setParallelism(1); // we set parallelism to ensure it's overwritten
+        
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 
0L));
+        final DataStream<String> input = env.addSource(new 
ParallelismTrackingSource());
+        // we set maxParallelism = 1 and assert it never exceeds it
+        input.addSink(new 
ParallelismTrackingSink<>()).getTransformation().setMaxParallelism(1);
+
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        env.executeAsync();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        ParallelismTrackingSource.expectInstances(2 * 
NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        final MiniCluster miniCluster = 
MINI_CLUSTER_WITH_CLIENT_RESOURCE.getMiniCluster();
+
+        // add additional TaskManager
+        miniCluster.startTaskManager();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        // prepare for and scale down
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        miniCluster.terminateTaskManager(0);
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+    }
+
+    private static class ParallelismTrackingSource implements 
ParallelSourceFunction<String> {
+        private volatile boolean running = true;
+
+        private static CountDownLatch instances;
+
+        public static void expectInstances(int count) {
+            instances = new CountDownLatch(count);

Review comment:
       I think the static `instances` field needs to be volatile, or access to 
it needs to be synchronized. Otherwise the test might become unstable.

##########
File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java
##########
@@ -248,9 +250,24 @@ public AdaptiveScheduler(
                         declarativeSlotPool::reserveFreeSlot,
                         declarativeSlotPool::freeReservedSlot);
 
-        for (JobVertex vertex : jobGraph.getVertices()) {
-            if (vertex.getParallelism() == 
ExecutionConfig.PARALLELISM_DEFAULT) {
-                vertex.setParallelism(1);
+        if (configuration.get(JobManagerOptions.SCHEDULER_MODE)
+                == SchedulerExecutionMode.REACTIVE) {
+            LOG.info("Modifying job parallelism for running in reactive 
mode.");
+            for (JobVertex vertex : jobGraph.getVertices()) {
+                if (vertex.getMaxParallelism() == 
JobVertex.MAX_PARALLELISM_DEFAULT) {
+                    
vertex.setParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);

Review comment:
       Didn't we say in the FLIP that the reactive mode only works if no 
parallelism has been set?

##########
File path: 
flink-tests/src/test/java/org/apache/flink/test/scheduling/ReactiveModeITCase.java
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.scheduling;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.ClusterOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.configuration.SchedulerExecutionMode;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.runtime.testutils.MiniClusterResource;
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
+import org.apache.flink.test.util.MiniClusterWithClientResource;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+
+/** Tests for Reactive Mode (FLIP-159). */
+public class ReactiveModeITCase extends TestLogger {
+    private static final int NUMBER_SLOTS_PER_TASK_MANAGER = 2;
+    private static final int INITIAL_NUMBER_TASK_MANAGERS = 1;
+
+    @ClassRule
+    public static final MiniClusterResource MINI_CLUSTER_WITH_CLIENT_RESOURCE =
+            new MiniClusterWithClientResource(
+                    new MiniClusterResourceConfiguration.Builder()
+                            .setConfiguration(getReactiveModeConfiguration())
+                            
.setNumberTaskManagers(INITIAL_NUMBER_TASK_MANAGERS)
+                            
.setNumberSlotsPerTaskManager(NUMBER_SLOTS_PER_TASK_MANAGER)
+                            .build());
+
+    private static Configuration getReactiveModeConfiguration() {
+        final Configuration conf = new Configuration();
+
+        conf.set(JobManagerOptions.SCHEDULER, 
JobManagerOptions.SchedulerType.Adaptive);
+        conf.set(JobManagerOptions.SCHEDULER_MODE, 
SchedulerExecutionMode.REACTIVE);
+        conf.set(ClusterOptions.ENABLE_DECLARATIVE_RESOURCE_MANAGEMENT, true);
+
+        return conf;
+    }
+
+    @Test
+    public void testScaleUpAndDownWithMaxParallelism() throws Exception {
+        StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment();
+        env.setParallelism(1); // we set parallelism to ensure it's overwritten

Review comment:
       Doesn't this configure the parallelism for every operator? In our FLIP, 
we said that reactive mode only works if no explicit parallelism has been 
configured.

##########
File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java
##########
@@ -248,9 +250,24 @@ public AdaptiveScheduler(
                         declarativeSlotPool::reserveFreeSlot,
                         declarativeSlotPool::freeReservedSlot);
 
-        for (JobVertex vertex : jobGraph.getVertices()) {
-            if (vertex.getParallelism() == 
ExecutionConfig.PARALLELISM_DEFAULT) {
-                vertex.setParallelism(1);
+        if (configuration.get(JobManagerOptions.SCHEDULER_MODE)
+                == SchedulerExecutionMode.REACTIVE) {
+            LOG.info("Modifying job parallelism for running in reactive 
mode.");
+            for (JobVertex vertex : jobGraph.getVertices()) {
+                if (vertex.getMaxParallelism() == 
JobVertex.MAX_PARALLELISM_DEFAULT) {
+                    
vertex.setParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);
+                    
vertex.setMaxParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);
+                } else {
+                    vertex.setParallelism(vertex.getMaxParallelism());
+                }
+            }
+        } else {
+            // non-reactive mode (test execution with adaptive scheduler): 
ensure parallelism is set
+            // for all vertices.

Review comment:
       The code reads a bit as if we have this branch here to make some tests 
work. If this is the case, then I would suggest not doing it and instead fixing 
the test code.

##########
File path: 
flink-tests/src/test/java/org/apache/flink/test/scheduling/ReactiveModeITCase.java
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.scheduling;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.ClusterOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.configuration.SchedulerExecutionMode;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.runtime.testutils.MiniClusterResource;
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
+import org.apache.flink.test.util.MiniClusterWithClientResource;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+
+/** Tests for Reactive Mode (FLIP-159). */
+public class ReactiveModeITCase extends TestLogger {
+    private static final int NUMBER_SLOTS_PER_TASK_MANAGER = 2;
+    private static final int INITIAL_NUMBER_TASK_MANAGERS = 1;
+
+    @ClassRule
+    public static final MiniClusterResource MINI_CLUSTER_WITH_CLIENT_RESOURCE =
+            new MiniClusterWithClientResource(
+                    new MiniClusterResourceConfiguration.Builder()
+                            .setConfiguration(getReactiveModeConfiguration())
+                            
.setNumberTaskManagers(INITIAL_NUMBER_TASK_MANAGERS)
+                            
.setNumberSlotsPerTaskManager(NUMBER_SLOTS_PER_TASK_MANAGER)
+                            .build());
+
+    private static Configuration getReactiveModeConfiguration() {
+        final Configuration conf = new Configuration();
+
+        conf.set(JobManagerOptions.SCHEDULER, 
JobManagerOptions.SchedulerType.Adaptive);
+        conf.set(JobManagerOptions.SCHEDULER_MODE, 
SchedulerExecutionMode.REACTIVE);
+        conf.set(ClusterOptions.ENABLE_DECLARATIVE_RESOURCE_MANAGEMENT, true);
+
+        return conf;
+    }
+
+    @Test
+    public void testScaleUpAndDownWithMaxParallelism() throws Exception {
+        StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment();
+        env.setParallelism(1); // we set parallelism to ensure it's overwritten
+        
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 
0L));
+        final DataStream<String> input = env.addSource(new 
ParallelismTrackingSource());
+        // we set maxParallelism = 1 and assert it never exceeds it
+        input.addSink(new 
ParallelismTrackingSink<>()).getTransformation().setMaxParallelism(1);
+
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        env.executeAsync();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        ParallelismTrackingSource.expectInstances(2 * 
NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        final MiniCluster miniCluster = 
MINI_CLUSTER_WITH_CLIENT_RESOURCE.getMiniCluster();
+
+        // add additional TaskManager
+        miniCluster.startTaskManager();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        // prepare for and scale down
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        miniCluster.terminateTaskManager(0);
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();

Review comment:
       I would split scaling up and down into two test cases.

##########
File path: 
flink-tests/src/test/java/org/apache/flink/test/scheduling/ReactiveModeITCase.java
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.scheduling;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.ClusterOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.configuration.SchedulerExecutionMode;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.runtime.testutils.MiniClusterResource;
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
+import org.apache.flink.test.util.MiniClusterWithClientResource;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+
+/** Tests for Reactive Mode (FLIP-159). */
+public class ReactiveModeITCase extends TestLogger {
+    private static final int NUMBER_SLOTS_PER_TASK_MANAGER = 2;
+    private static final int INITIAL_NUMBER_TASK_MANAGERS = 1;
+
+    @ClassRule
+    public static final MiniClusterResource MINI_CLUSTER_WITH_CLIENT_RESOURCE =
+            new MiniClusterWithClientResource(
+                    new MiniClusterResourceConfiguration.Builder()
+                            .setConfiguration(getReactiveModeConfiguration())
+                            
.setNumberTaskManagers(INITIAL_NUMBER_TASK_MANAGERS)
+                            
.setNumberSlotsPerTaskManager(NUMBER_SLOTS_PER_TASK_MANAGER)
+                            .build());
+
+    private static Configuration getReactiveModeConfiguration() {
+        final Configuration conf = new Configuration();
+
+        conf.set(JobManagerOptions.SCHEDULER, 
JobManagerOptions.SchedulerType.Adaptive);
+        conf.set(JobManagerOptions.SCHEDULER_MODE, 
SchedulerExecutionMode.REACTIVE);
+        conf.set(ClusterOptions.ENABLE_DECLARATIVE_RESOURCE_MANAGEMENT, true);
+
+        return conf;
+    }
+
+    @Test
+    public void testScaleUpAndDownWithMaxParallelism() throws Exception {
+        StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment();
+        env.setParallelism(1); // we set parallelism to ensure it's overwritten
+        
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 
0L));
+        final DataStream<String> input = env.addSource(new 
ParallelismTrackingSource());
+        // we set maxParallelism = 1 and assert it never exceeds it
+        input.addSink(new 
ParallelismTrackingSink<>()).getTransformation().setMaxParallelism(1);
+
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        env.executeAsync();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        ParallelismTrackingSource.expectInstances(2 * 
NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        final MiniCluster miniCluster = 
MINI_CLUSTER_WITH_CLIENT_RESOURCE.getMiniCluster();
+
+        // add additional TaskManager
+        miniCluster.startTaskManager();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        // prepare for and scale down
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        miniCluster.terminateTaskManager(0);
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+    }
+
+    private static class ParallelismTrackingSource implements 
ParallelSourceFunction<String> {
+        private volatile boolean running = true;
+
+        private static CountDownLatch instances;
+
+        public static void expectInstances(int count) {
+            instances = new CountDownLatch(count);
+        }
+
+        public static void waitForInstances() throws InterruptedException {
+            instances.await();
+        }
+
+        @Override
+        public void run(SourceContext<String> ctx) throws Exception {
+            instances.countDown();
+            while (running) {
+                ctx.collect("test");

Review comment:
       Let's use the checkpointing lock to stick to the contract.

##########
File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java
##########
@@ -248,9 +250,24 @@ public AdaptiveScheduler(
                         declarativeSlotPool::reserveFreeSlot,
                         declarativeSlotPool::freeReservedSlot);
 
-        for (JobVertex vertex : jobGraph.getVertices()) {
-            if (vertex.getParallelism() == 
ExecutionConfig.PARALLELISM_DEFAULT) {
-                vertex.setParallelism(1);
+        if (configuration.get(JobManagerOptions.SCHEDULER_MODE)
+                == SchedulerExecutionMode.REACTIVE) {
+            LOG.info("Modifying job parallelism for running in reactive 
mode.");
+            for (JobVertex vertex : jobGraph.getVertices()) {
+                if (vertex.getMaxParallelism() == 
JobVertex.MAX_PARALLELISM_DEFAULT) {
+                    
vertex.setParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);
+                    
vertex.setMaxParallelism(Transformation.UPPER_BOUND_MAX_PARALLELISM);
+                } else {
+                    vertex.setParallelism(vertex.getMaxParallelism());
+                }
+            }
+        } else {
+            // non-reactive mode (test execution with adaptive scheduler): 
ensure parallelism is set
+            // for all vertices.
+            for (JobVertex vertex : jobGraph.getVertices()) {
+                if (vertex.getParallelism() == 
ExecutionConfig.PARALLELISM_DEFAULT) {
+                    vertex.setParallelism(1);
+                }
             }

Review comment:
       Moreover, I would suggest requiring that the `JobGraph` is already 
properly configured when it arrives at the `AdaptiveScheduler`.

##########
File path: 
flink-tests/src/test/java/org/apache/flink/test/scheduling/ReactiveModeITCase.java
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.scheduling;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.ClusterOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.configuration.SchedulerExecutionMode;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.runtime.testutils.MiniClusterResource;
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
+import org.apache.flink.test.util.MiniClusterWithClientResource;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+
+/** Tests for Reactive Mode (FLIP-159). */
+public class ReactiveModeITCase extends TestLogger {
+    private static final int NUMBER_SLOTS_PER_TASK_MANAGER = 2;
+    private static final int INITIAL_NUMBER_TASK_MANAGERS = 1;
+
+    @ClassRule
+    public static final MiniClusterResource MINI_CLUSTER_WITH_CLIENT_RESOURCE =
+            new MiniClusterWithClientResource(
+                    new MiniClusterResourceConfiguration.Builder()
+                            .setConfiguration(getReactiveModeConfiguration())
+                            
.setNumberTaskManagers(INITIAL_NUMBER_TASK_MANAGERS)
+                            
.setNumberSlotsPerTaskManager(NUMBER_SLOTS_PER_TASK_MANAGER)
+                            .build());
+
+    private static Configuration getReactiveModeConfiguration() {
+        final Configuration conf = new Configuration();
+
+        conf.set(JobManagerOptions.SCHEDULER, 
JobManagerOptions.SchedulerType.Adaptive);
+        conf.set(JobManagerOptions.SCHEDULER_MODE, 
SchedulerExecutionMode.REACTIVE);
+        conf.set(ClusterOptions.ENABLE_DECLARATIVE_RESOURCE_MANAGEMENT, true);
+
+        return conf;
+    }
+
+    @Test
+    public void testScaleUpAndDownWithMaxParallelism() throws Exception {
+        StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment();
+        env.setParallelism(1); // we set parallelism to ensure it's overwritten
+        
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 
0L));
+        final DataStream<String> input = env.addSource(new 
ParallelismTrackingSource());
+        // we set maxParallelism = 1 and assert it never exceeds it
+        input.addSink(new 
ParallelismTrackingSink<>()).getTransformation().setMaxParallelism(1);
+
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        env.executeAsync();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        ParallelismTrackingSource.expectInstances(2 * 
NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        final MiniCluster miniCluster = 
MINI_CLUSTER_WITH_CLIENT_RESOURCE.getMiniCluster();
+
+        // add additional TaskManager
+        miniCluster.startTaskManager();
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+        // prepare for and scale down
+        
ParallelismTrackingSource.expectInstances(NUMBER_SLOTS_PER_TASK_MANAGER);
+        ParallelismTrackingSink.expectInstances(1);
+
+        miniCluster.terminateTaskManager(0);
+
+        ParallelismTrackingSource.waitForInstances();
+        ParallelismTrackingSink.waitForInstances();
+    }
+
+    private static class ParallelismTrackingSource implements 
ParallelSourceFunction<String> {
+        private volatile boolean running = true;
+
+        private static CountDownLatch instances;
+
+        public static void expectInstances(int count) {
+            instances = new CountDownLatch(count);
+        }
+
+        public static void waitForInstances() throws InterruptedException {
+            instances.await();
+        }
+
+        @Override
+        public void run(SourceContext<String> ctx) throws Exception {
+            instances.countDown();
+            while (running) {
+                ctx.collect("test");
+                Thread.sleep(100);
+            }
+        }
+
+        @Override
+        public void cancel() {
+            running = false;
+        }
+    }
+
+    private static class ParallelismTrackingSink<T> extends 
RichSinkFunction<T> {
+        private static CountDownLatch instances;
+
+        public static void expectInstances(int count) {
+            instances = new CountDownLatch(count);

Review comment:
       Same here.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [flink] tillrohrmann commented on a change in pull request #15071: [FLINK-21135][coord] Adjust parallelism of job for reactive mode

Reply via email to