[flink-kubernetes-operator] branch main updated: [FLINK-32002] Adjust autoscaler defaults for release (#586)

mxm Tue, 09 May 2023 12:13:12 -0700

This is an automated email from the ASF dual-hosted git repository.

mxm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/flink-kubernetes-operator.git



The following commit(s) were added to refs/heads/main by this push:
     new 063f22c0 [FLINK-32002] Adjust autoscaler defaults for release (#586)
063f22c0 is described below

commit 063f22c09519ee53968474d2a4fd2289bb1ae442
Author: Maximilian Michels <[email protected]>
AuthorDate: Tue May 9 21:12:06 2023 +0200

    [FLINK-32002] Adjust autoscaler defaults for release (#586)
    
    The goal is to make the out-of-the-box experience as smooth as 
possible.
    Hence, the defaults are adjusted to have a larger metric window, wider 
bounds for
    scaling up and down, less quick downscaling, and a maximum parallelism of 
200.
---
 .../generated/auto_scaler_configuration.html          | 18 +++++++++---------
 .../operator/autoscaler/config/AutoScalerOptions.java | 19 ++++++++++---------
 .../MetricsCollectionAndEvaluationTest.java           |  4 ++--
 .../operator/autoscaler/ScalingExecutorTest.java      |  2 +-
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/docs/layouts/shortcodes/generated/auto_scaler_configuration.html 
b/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
index 11461049..e97f9382 100644
--- a/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
+++ b/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
@@ -16,7 +16,7 @@
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.catch-up.duration</h5></td>
-            <td style="word-wrap: break-word;">10 min</td>
+            <td style="word-wrap: break-word;">5 min</td>
             <td>Duration</td>
             <td>The target duration for fully processing any backlog after a 
scaling operation. Set to 0 to disable backlog based scaling.</td>
         </tr>
@@ -34,7 +34,7 @@
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.history.max.count</h5></td>
-            <td style="word-wrap: break-word;">1</td>
+            <td style="word-wrap: break-word;">3</td>
             <td>Integer</td>
             <td>Maximum number of past scaling decisions to retain per 
vertex.</td>
         </tr>
@@ -46,13 +46,13 @@
         </tr>
         <tr>
             <td><h5>kubernetes.operator.job.autoscaler.metrics.window</h5></td>
-            <td style="word-wrap: break-word;">5 min</td>
+            <td style="word-wrap: break-word;">10 min</td>
             <td>Duration</td>
             <td>Scaling metrics aggregation window size.</td>
         </tr>
         <tr>
             <td><h5>kubernetes.operator.job.autoscaler.restart.time</h5></td>
-            <td style="word-wrap: break-word;">5 min</td>
+            <td style="word-wrap: break-word;">3 min</td>
             <td>Duration</td>
             <td>Expected restart time to be used until the operator can 
determine it reliably from history.</td>
         </tr>
@@ -64,9 +64,9 @@
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.scale-up.grace-period</h5></td>
-            <td style="word-wrap: break-word;">10 min</td>
+            <td style="word-wrap: break-word;">1 h</td>
             <td>Duration</td>
-            <td>Period in which no scale down is allowed after a scale up</td>
+            <td>Duration in which no scale down of a vertex is allowed after 
it has been scaled up.</td>
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.scaling.effectiveness.detection.enabled</h5></td>
@@ -100,9 +100,9 @@
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.target.utilization.boundary</h5></td>
-            <td style="word-wrap: break-word;">0.1</td>
+            <td style="word-wrap: break-word;">0.4</td>
             <td>Double</td>
-            <td>Target vertex utilization boundary. Scaling won't be performed 
if utilization is within (target - boundary, target + boundary)</td>
+            <td>Target vertex utilization boundary. Scaling won't be performed 
if the current processing rate is within [target_rate / (target_utilization - 
boundary), (target_rate / (target_utilization + boundary)]</td>
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.vertex.exclude.ids</h5></td>
@@ -112,7 +112,7 @@
         </tr>
         <tr>
             
<td><h5>kubernetes.operator.job.autoscaler.vertex.max-parallelism</h5></td>
-            <td style="word-wrap: break-word;">2147483647</td>
+            <td style="word-wrap: break-word;">200</td>
             <td>Integer</td>
             <td>The maximum parallelism the autoscaler can use. Note that this 
limit will be ignored if it is higher than the max parallelism configured in 
the Flink config or directly on each operator.</td>
         </tr>
diff --git 
a/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
 
b/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
index 4d496f02..df32aa5e 100644
--- 
a/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
+++ 
b/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
@@ -49,7 +49,7 @@ public class AutoScalerOptions {
     public static final ConfigOption<Duration> METRICS_WINDOW =
             autoScalerConfig("metrics.window")
                     .durationType()
-                    .defaultValue(Duration.ofMinutes(5))
+                    .defaultValue(Duration.ofMinutes(10))
                     .withDescription("Scaling metrics aggregation window 
size.");
 
     public static final ConfigOption<Duration> STABILIZATION_INTERVAL =
@@ -68,15 +68,16 @@ public class AutoScalerOptions {
     public static final ConfigOption<Double> TARGET_UTILIZATION_BOUNDARY =
             autoScalerConfig("target.utilization.boundary")
                     .doubleType()
-                    .defaultValue(0.1)
+                    .defaultValue(0.4)
                     .withDescription(
-                            "Target vertex utilization boundary. Scaling won't 
be performed if utilization is within (target - boundary, target + boundary)");
+                            "Target vertex utilization boundary. Scaling won't 
be performed if the current processing rate is within [target_rate / 
(target_utilization - boundary), (target_rate / (target_utilization + 
boundary)]");
 
     public static final ConfigOption<Duration> SCALE_UP_GRACE_PERIOD =
             autoScalerConfig("scale-up.grace-period")
                     .durationType()
-                    .defaultValue(Duration.ofMinutes(10))
-                    .withDescription("Period in which no scale down is allowed 
after a scale up");
+                    .defaultValue(Duration.ofHours(1))
+                    .withDescription(
+                            "Duration in which no scale down of a vertex is 
allowed after it has been scaled up.");
 
     public static final ConfigOption<Integer> VERTEX_MIN_PARALLELISM =
             autoScalerConfig("vertex.min-parallelism")
@@ -87,7 +88,7 @@ public class AutoScalerOptions {
     public static final ConfigOption<Integer> VERTEX_MAX_PARALLELISM =
             autoScalerConfig("vertex.max-parallelism")
                     .intType()
-                    .defaultValue(Integer.MAX_VALUE)
+                    .defaultValue(200)
                     .withDescription(
                             "The maximum parallelism the autoscaler can use. 
Note that this limit will be ignored if it is higher than the max parallelism 
configured in the Flink config or directly on each operator.");
 
@@ -101,14 +102,14 @@ public class AutoScalerOptions {
     public static final ConfigOption<Duration> CATCH_UP_DURATION =
             autoScalerConfig("catch-up.duration")
                     .durationType()
-                    .defaultValue(Duration.ofMinutes(10))
+                    .defaultValue(Duration.ofMinutes(5))
                     .withDescription(
                             "The target duration for fully processing any 
backlog after a scaling operation. Set to 0 to disable backlog based scaling.");
 
     public static final ConfigOption<Duration> RESTART_TIME =
             autoScalerConfig("restart.time")
                     .durationType()
-                    .defaultValue(Duration.ofMinutes(5))
+                    .defaultValue(Duration.ofMinutes(3))
                     .withDescription(
                             "Expected restart time to be used until the 
operator can determine it reliably from history.");
 
@@ -136,7 +137,7 @@ public class AutoScalerOptions {
     public static final ConfigOption<Integer> VERTEX_SCALING_HISTORY_COUNT =
             autoScalerConfig("history.max.count")
                     .intType()
-                    .defaultValue(1)
+                    .defaultValue(3)
                     .withDescription(
                             "Maximum number of past scaling decisions to 
retain per vertex.");
 
diff --git 
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
 
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
index 57c74452..88ae09b5 100644
--- 
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
+++ 
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
@@ -387,10 +387,10 @@ public class MetricsCollectionAndEvaluationTest {
                 5000.,
                 
evaluation.get(source1).get(ScalingMetric.TRUE_PROCESSING_RATE).getCurrent());
         assertEquals(
-                833.,
+                1667.,
                 
evaluation.get(source1).get(ScalingMetric.SCALE_DOWN_RATE_THRESHOLD).getCurrent());
         assertEquals(
-                625.,
+                500.,
                 
evaluation.get(source1).get(ScalingMetric.SCALE_UP_RATE_THRESHOLD).getCurrent());
 
         scalingExecutor.scaleResource(app, scalingInfo, conf, evaluation);
diff --git 
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
 
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
index cfe0695e..0bff7357 100644
--- 
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
+++ 
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
@@ -200,7 +200,7 @@ public class ScalingExecutorTest {
                         source,
                         evaluated(10, 80, 100),
                         filterOperator,
-                        evaluated(10, 60, 100),
+                        evaluated(10, 30, 100),
                         sink,
                         evaluated(10, 80, 100));
         // filter operator should not scale

[flink-kubernetes-operator] branch main updated: [FLINK-32002] Adjust autoscaler defaults for release (#586)

Reply via email to