This is an automated email from the ASF dual-hosted git repository.
villebro pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/superset-kubernetes-operator.git
The following commit(s) were added to refs/heads/main by this push:
new 22d8688 fix(lifecycle): validate cron schedules and unstick maintenance on webServer removal (#48)
22d8688 is described below
commit 22d868809a46f11cd503131c4733aaf36709e5ab
Author: Ville Brofeldt <[email protected]>
AuthorDate: Tue May 12 10:27:18 2026 -0700
fix(lifecycle): validate cron schedules and unstick maintenance on webServer removal (#48)
---
docs/architecture/internals.md | 9 ++++++++-
docs/index.md | 13 ++++++-------
internal/controller/lifecycle.go | 3 +++
internal/controller/maintenance.go | 8 ++++++++
internal/controller/schedule.go | 24 ++++++++++++++++++++++++
5 files changed, 49 insertions(+), 8 deletions(-)
diff --git a/docs/architecture/internals.md b/docs/architecture/internals.md
index 2796218..2a67b5a 100644
--- a/docs/architecture/internals.md
+++ b/docs/architecture/internals.md
@@ -336,12 +336,19 @@ During lifecycle drain, the parent:
- Service selector changes propagate in ~1 second via the endpoints controller,
giving instant traffic switchover regardless of ingress implementation
-- Works for all access patterns: Ingress, direct Service, port-forward
+- Works for all access patterns: Ingress, Gateway API, direct Service
- No orphan deletion complexity — the Service is always owned by the parent,
so GC of child CRs never affects it
- The child `SupersetWebServer` reconciler skips Service management (the parent
handles it), keeping the child controller simple
+> **Note for developers using `kubectl port-forward`:** port-forward establishes a
+> tunnel to a specific pod, not through the Service selector. When that pod is
+> deleted during drain, the tunnel breaks with a "lost connection to pod" error.
+> This does not affect Ingress/Gateway users — they route through EndpointSlices
+> and see seamless transitions. Restart port-forward to reconnect to the
+> maintenance pod.
+
### Alternatives Considered
**Orphan deletion + selector patch** (previous design): Used
`propagationPolicy:
diff --git a/docs/index.md b/docs/index.md
index 84a7a87..72862a7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -29,14 +29,13 @@ The operator manages the full Superset lifecycle: database
migrations, configura
## Features
- **Sane defaults** — production-ready settings out of the box that adapt automatically to your workload
-- **Painless management** — structured configuration fields with per-component config generated automatically
-- **Full control** — every default is overridable, from high-level presets down to individual fields, with a raw Python escape hatch for anything not covered
-- **Flat configuration** — shared top-level defaults inherited by all components, with per-component overrides (primitives replace, collections merge)
+- **Automatic config rendering** — structured fields for metastore, Valkey, Gunicorn, and Celery generate correct `superset_config.py` per component; config changes trigger rolling restarts
+- **Full control** — every default is overridable, from high-level presets down to individual container fields, with a raw Python escape hatch for anything not covered
- **Component toggle** — enable CeleryWorker, CeleryBeat, CeleryFlower, WebsocketServer, or McpServer by setting their spec; omit to disable
-- **Lifecycle management** — database cloning, migration, and initialization run as managed Pods before components deploy
-- **Checksum-driven rollouts** — config changes automatically trigger rolling restarts of affected components
-- **Networking** — Gateway API (HTTPRoute) and Ingress support
-- **HPA with custom metrics**, PodDisruptionBudgets, NetworkPolicies, Prometheus ServiceMonitor
+- **Zero-downtime upgrades** — maintenance page serves users during database migrations; the operator drains components gracefully, runs lifecycle tasks, and restores traffic only after the new version is healthy
+- **Database cloning** — snapshot a production database into staging or QA environments on demand or on a cron schedule, with automatic migration and init afterward
+- **Networking** — Gateway API (HTTPRoute) and Ingress support with per-component routing
+- **Production hardening** — HPA with custom metrics, PodDisruptionBudgets, NetworkPolicies, Prometheus ServiceMonitor
## What it looks like
diff --git a/internal/controller/lifecycle.go b/internal/controller/lifecycle.go
index c7598b8..fd76dd0 100644
--- a/internal/controller/lifecycle.go
+++ b/internal/controller/lifecycle.go
@@ -105,6 +105,9 @@ func (r *SupersetReconciler) reconcileLifecycle(
superset.Status.Lifecycle = &supersetv1alpha1.LifecycleStatus{}
}
+ // Validate cron schedules early so invalid expressions are surfaced immediately.
+ r.validateSchedules(superset)
+
// Resolve the current lifecycle image.
var imageOverride *supersetv1alpha1.ImageOverrideSpec
if superset.Spec.Lifecycle != nil {
diff --git a/internal/controller/maintenance.go b/internal/controller/maintenance.go
index 79412b2..8b16366 100644
--- a/internal/controller/maintenance.go
+++ b/internal/controller/maintenance.go
@@ -128,6 +128,14 @@ func (r *SupersetReconciler) reconcileMaintenanceReturn(
}
log := logf.FromContext(ctx)
+ // If webServer was removed while maintenance is active, clear immediately
+ // rather than waiting forever for a Deployment that won't come.
+ if superset.Spec.WebServer == nil {
+ superset.Status.Lifecycle.MaintenanceActive = false
+ log.Info("WebServer removed while maintenance active, clearing
maintenance")
+ return true, nil
+ }
+
// Check web-server Deployment readiness before switching traffic.
webDeployName := naming.ResourceBaseName(superset.Name,
naming.ComponentWebServer)
deploy := &appsv1.Deployment{}
diff --git a/internal/controller/schedule.go b/internal/controller/schedule.go
index 231ba3e..2e23b02 100644
--- a/internal/controller/schedule.go
+++ b/internal/controller/schedule.go
@@ -21,6 +21,7 @@ package controller
import (
"time"
+ corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
supersetv1alpha1
"github.com/apache/superset-kubernetes-operator/api/v1alpha1"
@@ -105,3 +106,26 @@ func (r *SupersetReconciler)
projectScheduleStatus(superset *supersetv1alpha1.Su
taskRef.NextScheduleAt = &t
}
}
+
+// validateSchedules checks all active cron expressions for validity and sets
+// a warning condition + event if any are invalid.
+func (r *SupersetReconciler) validateSchedules(superset
*supersetv1alpha1.Superset) {
+ if superset.Spec.Lifecycle == nil {
+ return
+ }
+ if superset.Spec.Lifecycle.Clone != nil &&
superset.Spec.Lifecycle.Clone.CronSchedule != nil &&
+ !isDisabled(superset.Spec.Lifecycle.Clone.Disabled) {
+ expr := *superset.Spec.Lifecycle.Clone.CronSchedule
+ if err := schedule.Validate(expr); err != nil {
+ setCondition(&superset.Status.Conditions,
conditionTypeScheduleValid,
+ metav1.ConditionFalse, "InvalidCronSchedule",
err.Error(), superset.Generation)
+ r.Recorder.Eventf(superset, nil,
corev1.EventTypeWarning, "InvalidCronSchedule", "Lifecycle",
+ "Clone cron schedule is invalid: %v", err)
+ return
+ }
+ }
+ setCondition(&superset.Status.Conditions, conditionTypeScheduleValid,
+ metav1.ConditionTrue, "SchedulesValid", "All cron schedules are
valid", superset.Generation)
+}
+
+const conditionTypeScheduleValid = "ScheduleValid"