This is an automated email from the ASF dual-hosted git repository.
mxm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/flink-kubernetes-operator.git
The following commit(s) were added to refs/heads/main by this push:
new 063f22c0 [FLINK-32002] Adjust autoscaler defaults for release (#586)
063f22c0 is described below
commit 063f22c09519ee53968474d2a4fd2289bb1ae442
Author: Maximilian Michels <[email protected]>
AuthorDate: Tue May 9 21:12:06 2023 +0200
[FLINK-32002] Adjust autoscaler defaults for release (#586)
The goal is to make the out-of-the-box experience as smooth as
possible.
Hence, the defaults are adjusted to have a larger metrics window, wider
bounds for scaling up and down, less aggressive downscaling, and a
maximum parallelism of 200.
---
.../generated/auto_scaler_configuration.html | 18 +++++++++---------
.../operator/autoscaler/config/AutoScalerOptions.java | 19 ++++++++++---------
.../MetricsCollectionAndEvaluationTest.java | 4 ++--
.../operator/autoscaler/ScalingExecutorTest.java | 2 +-
4 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
b/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
index 11461049..e97f9382 100644
--- a/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
+++ b/docs/layouts/shortcodes/generated/auto_scaler_configuration.html
@@ -16,7 +16,7 @@
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.catch-up.duration</h5></td>
- <td style="word-wrap: break-word;">10 min</td>
+ <td style="word-wrap: break-word;">5 min</td>
<td>Duration</td>
<td>The target duration for fully processing any backlog after a
scaling operation. Set to 0 to disable backlog based scaling.</td>
</tr>
@@ -34,7 +34,7 @@
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.history.max.count</h5></td>
- <td style="word-wrap: break-word;">1</td>
+ <td style="word-wrap: break-word;">3</td>
<td>Integer</td>
<td>Maximum number of past scaling decisions to retain per
vertex.</td>
</tr>
@@ -46,13 +46,13 @@
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.metrics.window</h5></td>
- <td style="word-wrap: break-word;">5 min</td>
+ <td style="word-wrap: break-word;">10 min</td>
<td>Duration</td>
<td>Scaling metrics aggregation window size.</td>
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.restart.time</h5></td>
- <td style="word-wrap: break-word;">5 min</td>
+ <td style="word-wrap: break-word;">3 min</td>
<td>Duration</td>
<td>Expected restart time to be used until the operator can
determine it reliably from history.</td>
</tr>
@@ -64,9 +64,9 @@
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.scale-up.grace-period</h5></td>
- <td style="word-wrap: break-word;">10 min</td>
+ <td style="word-wrap: break-word;">1 h</td>
<td>Duration</td>
- <td>Period in which no scale down is allowed after a scale up</td>
+ <td>Duration in which no scale down of a vertex is allowed after
it has been scaled up.</td>
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.scaling.effectiveness.detection.enabled</h5></td>
@@ -100,9 +100,9 @@
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.target.utilization.boundary</h5></td>
- <td style="word-wrap: break-word;">0.1</td>
+ <td style="word-wrap: break-word;">0.4</td>
<td>Double</td>
- <td>Target vertex utilization boundary. Scaling won't be performed
if utilization is within (target - boundary, target + boundary)</td>
+ <td>Target vertex utilization boundary. Scaling won't be performed
if the current processing rate is within [target_rate / (target_utilization -
boundary), (target_rate / (target_utilization + boundary)]</td>
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.vertex.exclude.ids</h5></td>
@@ -112,7 +112,7 @@
</tr>
<tr>
<td><h5>kubernetes.operator.job.autoscaler.vertex.max-parallelism</h5></td>
- <td style="word-wrap: break-word;">2147483647</td>
+ <td style="word-wrap: break-word;">200</td>
<td>Integer</td>
<td>The maximum parallelism the autoscaler can use. Note that this
limit will be ignored if it is higher than the max parallelism configured in
the Flink config or directly on each operator.</td>
</tr>
diff --git
a/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
b/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
index 4d496f02..df32aa5e 100644
---
a/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
+++
b/flink-kubernetes-operator-autoscaler/src/main/java/org/apache/flink/kubernetes/operator/autoscaler/config/AutoScalerOptions.java
@@ -49,7 +49,7 @@ public class AutoScalerOptions {
public static final ConfigOption<Duration> METRICS_WINDOW =
autoScalerConfig("metrics.window")
.durationType()
- .defaultValue(Duration.ofMinutes(5))
+ .defaultValue(Duration.ofMinutes(10))
.withDescription("Scaling metrics aggregation window
size.");
public static final ConfigOption<Duration> STABILIZATION_INTERVAL =
@@ -68,15 +68,16 @@ public class AutoScalerOptions {
public static final ConfigOption<Double> TARGET_UTILIZATION_BOUNDARY =
autoScalerConfig("target.utilization.boundary")
.doubleType()
- .defaultValue(0.1)
+ .defaultValue(0.4)
.withDescription(
- "Target vertex utilization boundary. Scaling won't
be performed if utilization is within (target - boundary, target + boundary)");
+ "Target vertex utilization boundary. Scaling won't
be performed if the current processing rate is within [target_rate /
(target_utilization - boundary), (target_rate / (target_utilization +
boundary)]");
public static final ConfigOption<Duration> SCALE_UP_GRACE_PERIOD =
autoScalerConfig("scale-up.grace-period")
.durationType()
- .defaultValue(Duration.ofMinutes(10))
- .withDescription("Period in which no scale down is allowed
after a scale up");
+ .defaultValue(Duration.ofHours(1))
+ .withDescription(
+ "Duration in which no scale down of a vertex is
allowed after it has been scaled up.");
public static final ConfigOption<Integer> VERTEX_MIN_PARALLELISM =
autoScalerConfig("vertex.min-parallelism")
@@ -87,7 +88,7 @@ public class AutoScalerOptions {
public static final ConfigOption<Integer> VERTEX_MAX_PARALLELISM =
autoScalerConfig("vertex.max-parallelism")
.intType()
- .defaultValue(Integer.MAX_VALUE)
+ .defaultValue(200)
.withDescription(
"The maximum parallelism the autoscaler can use.
Note that this limit will be ignored if it is higher than the max parallelism
configured in the Flink config or directly on each operator.");
@@ -101,14 +102,14 @@ public class AutoScalerOptions {
public static final ConfigOption<Duration> CATCH_UP_DURATION =
autoScalerConfig("catch-up.duration")
.durationType()
- .defaultValue(Duration.ofMinutes(10))
+ .defaultValue(Duration.ofMinutes(5))
.withDescription(
"The target duration for fully processing any
backlog after a scaling operation. Set to 0 to disable backlog based scaling.");
public static final ConfigOption<Duration> RESTART_TIME =
autoScalerConfig("restart.time")
.durationType()
- .defaultValue(Duration.ofMinutes(5))
+ .defaultValue(Duration.ofMinutes(3))
.withDescription(
"Expected restart time to be used until the
operator can determine it reliably from history.");
@@ -136,7 +137,7 @@ public class AutoScalerOptions {
public static final ConfigOption<Integer> VERTEX_SCALING_HISTORY_COUNT =
autoScalerConfig("history.max.count")
.intType()
- .defaultValue(1)
+ .defaultValue(3)
.withDescription(
"Maximum number of past scaling decisions to
retain per vertex.");
diff --git
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
index 57c74452..88ae09b5 100644
---
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
+++
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/MetricsCollectionAndEvaluationTest.java
@@ -387,10 +387,10 @@ public class MetricsCollectionAndEvaluationTest {
5000.,
evaluation.get(source1).get(ScalingMetric.TRUE_PROCESSING_RATE).getCurrent());
assertEquals(
- 833.,
+ 1667.,
evaluation.get(source1).get(ScalingMetric.SCALE_DOWN_RATE_THRESHOLD).getCurrent());
assertEquals(
- 625.,
+ 500.,
evaluation.get(source1).get(ScalingMetric.SCALE_UP_RATE_THRESHOLD).getCurrent());
scalingExecutor.scaleResource(app, scalingInfo, conf, evaluation);
diff --git
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
index cfe0695e..0bff7357 100644
---
a/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
+++
b/flink-kubernetes-operator-autoscaler/src/test/java/org/apache/flink/kubernetes/operator/autoscaler/ScalingExecutorTest.java
@@ -200,7 +200,7 @@ public class ScalingExecutorTest {
source,
evaluated(10, 80, 100),
filterOperator,
- evaluated(10, 60, 100),
+ evaluated(10, 30, 100),
sink,
evaluated(10, 80, 100));
// filter operator should not scale