This is an automated email from the ASF dual-hosted git repository.
villebro pushed a commit to branch main
in repository
https://gitbox.apache.org/repos/asf/superset-kubernetes-operator.git
The following commit(s) were added to refs/heads/main by this push:
new 727fc77 fix(lifecycle): resolve maintenance port mismatch and harden
return path (#47)
727fc77 is described below
commit 727fc7752cccf3dbb59dd8b7454707e70470b702
Author: Ville Brofeldt <[email protected]>
AuthorDate: Mon May 11 21:33:48 2026 -0700
fix(lifecycle): resolve maintenance port mismatch and harden return path
(#47)
---
AGENTS.md | 27 ++++---
docs/architecture/internals.md | 111 ++++++++++++++--------------
docs/architecture/overview.md | 4 +-
docs/index.md | 9 ++-
docs/user-guide/lifecycle.md | 16 ++--
internal/controller/lifecycle.go | 4 +
internal/controller/maintenance.go | 39 +++++-----
internal/controller/maintenance_test.go | 114 +++++++++++++++++++++++++++++
internal/controller/superset_controller.go | 6 ++
9 files changed, 234 insertions(+), 96 deletions(-)
diff --git a/AGENTS.md b/AGENTS.md
index 92d0238..621b369 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -28,16 +28,16 @@ Kubernetes operator for Apache Superset, built with the
Go-based Operator SDK an
## Developer Guide
-See `docs/developer-guide.md` for development setup, make commands, testing
philosophy, code generation workflow, linting, CI/supply chain, and
contributing guidelines. Adhere to the conventions documented there.
+See `docs/contributing/development-setup.md` and
`docs/contributing/development-guidelines.md` for development setup, make
commands, testing philosophy, code generation workflow, linting, CI/supply
chain, and contributing guidelines. Adhere to the conventions documented there.
## Architecture
-The operator uses a **two-tier CRD architecture** where the parent `Superset`
resource resolves shared top-level and per-component configuration into
fully-flattened child CRDs. See `docs/architecture.md` for detailed design.
+The operator uses a **two-tier CRD architecture** where the parent `Superset`
resource resolves shared top-level and per-component configuration into
fully-flattened child CRDs. See `docs/architecture/overview.md` for detailed
design.
### CRD Hierarchy
- **Superset** (parent) — top-level CR with shared spec (top-level +
per-component), environment, secretKey/secretKeyFrom, metastore (with
uriFrom/passwordFrom), valkey (cache/broker/results), config, LifecycleSpec,
NetworkingSpec, MonitoringSpec
-- **SupersetLifecycleTask** — lifecycle task runner: bare Pods + ConfigMap.
Exclusively created and managed by the parent controller (not user-created).
Two sequential tasks per upgrade: "migrate" (`superset db upgrade`) and "init"
(`superset init`). Each task has an independent strategy
(`VersionChange`/`Always`/`Never`). Named `{parentName}-migrate` and
`{parentName}-init`.
+- **SupersetLifecycleTask** — lifecycle task runner: bare Pods + ConfigMap.
Exclusively created and managed by the parent controller (not user-created).
Three sequential tasks: "clone" (database snapshot from external source),
"migrate" (`superset db upgrade`), and "init" (`superset init`). Each task can
be independently disabled via `disabled: true`. Clone supports `cronSchedule`
for periodic re-execution. Named `{parentName}-clone`, `{parentName}-migrate`,
and `{parentName}-init`.
- **SupersetWebServer** — gunicorn web server Deployment + Service + ConfigMap
- **SupersetCeleryWorker** — async task worker Deployment + ConfigMap
- **SupersetCeleryBeat** — periodic task scheduler Deployment + ConfigMap
(singleton, always 1 replica)
@@ -47,7 +47,7 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
**Key principles:**
- **Parent resolves, children execute.** All layering logic lives in the
parent controller. Child CRs are fully flattened — no inheritance to trace.
-- **Presence = enabled.** No `enabled: true/false`. If `celeryWorker: {}` is
set, workers deploy. Lifecycle tasks run by default on image changes
(`VersionChange` strategy); disable individual tasks via `strategy: Never`.
+- **Presence = enabled.** No `enabled: true/false`. If `celeryWorker: {}` is
set, workers deploy. Lifecycle tasks (migrate, init) run by default; disable
individual tasks via `disabled: true`. Clone runs when `spec.lifecycle.clone`
is set.
- **Secrets never touch ConfigMaps.** In prod mode, CRD CEL validation rejects
inline `secretKey`, `metastore.uri`, `metastore.password`, and
`valkey.password`. Use `secretKeyFrom`, `metastore.uriFrom`,
`metastore.passwordFrom`, or `valkey.passwordFrom` to reference Kubernetes
Secrets (operator injects `valueFrom.secretKeyRef` env vars). In dev mode,
inline secrets are allowed.
- **Per-component config rendering.** All Python components get `SECRET_KEY`
rendered from `SUPERSET_OPERATOR__SECRET_KEY`. Web gets port config. Structured
metastore renders an f-string URI from `SUPERSET_OPERATOR__DB_*` env vars. When
`spec.valkey` is set, operator renders all cache configs (`CACHE_CONFIG`,
`DATA_CACHE_CONFIG`, etc.), `CeleryConfig`, and `RESULTS_BACKEND` from
`SUPERSET_OPERATOR__VALKEY_*` env vars. Websocket gets nothing (Node.js).
@@ -55,7 +55,7 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
- `api/v1alpha1/` — CRD type definitions
- `shared_types.go` — ImageSpec, MetastoreSpec, ValkeySpec (ValkeySSLSpec,
ValkeyCacheSpec, ValkeyCelerySpec, ValkeyResultsBackendSpec), GunicornSpec,
CeleryWorkerProcessSpec, SQLAlchemyEngineOptionsSpec, FlatComponentSpec,
DeploymentTemplate, PodTemplate, ContainerTemplate, ScalableComponentSpec,
ComponentSpec, AutoscalingSpec, PDBSpec
- - `superset_types.go` — Parent CRD: SupersetSpec (environment,
secretKey/secretKeyFrom, metastore with uriFrom/passwordFrom, valkey, config,
sqlaEngineOptions, autoscaling, podDisruptionBudget), component specs
(GunicornSpec on webServer, CeleryWorkerProcessSpec on celeryWorker,
SQLAlchemyEngineOptionsSpec on all Python components except Flower),
LifecycleSpec (migrate/init tasks, upgradeMode), AdminUserSpec, NetworkingSpec,
MonitoringSpec, status types (LifecycleStatus, LastLifecycleImage)
+ - `superset_types.go` — Parent CRD: SupersetSpec (environment,
secretKey/secretKeyFrom, metastore with uriFrom/passwordFrom, valkey, config,
sqlaEngineOptions, autoscaling, podDisruptionBudget), component specs
(GunicornSpec on webServer, CeleryWorkerProcessSpec on celeryWorker,
SQLAlchemyEngineOptionsSpec on all Python components except Flower),
LifecycleSpec (clone/migrate/init tasks, upgradeMode, maintenancePage),
AdminUserSpec, NetworkingSpec, MonitoringSpec, status types (Lifecycl [...]
- `supersetlifecycletask_types.go` — Flat child CRD (Config + checksums,
Pods + ConfigMap)
- `supersetwebserver_types.go` — Flat child CRD (Config + Service +
checksums)
- `supersetceleryworker_types.go` — Flat child CRD (Config + checksums)
@@ -75,6 +75,12 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
- `engine_options.go` — SQLALCHEMY_ENGINE_OPTIONS computation (pool sizing
from worker/thread counts)
- `internal/common/` — Shared types (ComponentType, Ptr), naming functions
(ChildName, ConfigMapName, ComponentLabels), constants (labels, suffixes, ports)
- `internal/controller/` — Reconciler implementations
+ - `superset_controller.go` — Parent `SupersetReconciler`: top-level
Reconcile loop, child CR apply, orphan pruning, status
+ - `lifecycle.go` — Lifecycle pipeline orchestration: task sequencing,
checksum computation, upgrade gates
+ - `drain.go` — Component drain logic: child CR deletion, pod termination
verification
+ - `schedule.go` — Cron schedule handling: tick computation, requeue timing
+ - `config_builder.go` — Spec conversion: top-level → SharedInput, config
rendering, env var collection
+ - `maintenance.go` — Maintenance page: parent-owned Deployment + ConfigMap,
Service selector switching
- `child_reconciler.go` — generic `ChildReconciler` with `ChildCR`
interface: shared sub-resource lifecycle (ConfigMap, Deployment, Service,
Scaling) used by all 6 child controllers
- `child_controllers.go` — `ChildControllerDefs()`: registers all 6 generic
child controllers with per-component DeploymentConfig (default commands, ports,
scaling flags)
- `component_descriptors.go` — table-driven component descriptors for
parent→child conversion
@@ -97,7 +103,7 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
## Key Patterns
- **Two-tier resolution**: Parent resolves top-level + per-component fields
into flat child spec. `internal/resolution/ResolveChildSpec()` is the core
engine.
-- **Deployment template hierarchy**: All Deployment/Pod/Container
configuration flows through `deploymentTemplate` (Deployment-level) and
`podTemplate` (Pod-level with nested `container` for main container fields) as
siblings on the component spec. Top-level values provide defaults;
per-component values are field-level merged (scalars: component wins; named
collections: merge by name; unnamed collections: append). Task pods use
`podTemplate` only (no Deployment-level). See `docs/user-gui [...]
+- **Deployment template hierarchy**: All Deployment/Pod/Container
configuration flows through `deploymentTemplate` (Deployment-level) and
`podTemplate` (Pod-level with nested `container` for main container fields) as
siblings on the component spec. Top-level values provide defaults;
per-component values are field-level merged (scalars: component wins; named
collections: merge by name; unnamed collections: append). Task pods use
`podTemplate` only (no Deployment-level). See `docs/user-gui [...]
- **ScalableComponentSpec**: Has `DeploymentTemplate`, `PodTemplate`, and
scaling fields (`Replicas`, `Autoscaling`, `PDB`). Used by scalable components.
CeleryBeat has `DeploymentTemplate` + `PodTemplate` directly (no scaling). Task
pods have `PodTemplate` only.
- **ComponentSpec**: Per-component image override field (`Image`). Embedded by
all component specs except LifecycleSpec.
- **Per-component config**: `internal/config/RenderConfig()` generates
component-appropriate Python. `SECRET_KEY` is rendered from the
`SUPERSET_OPERATOR__SECRET_KEY` env var. Both passthrough and structured
metastore modes render `SQLALCHEMY_DATABASE_URI` in the config from
operator-internal env vars (`SUPERSET_OPERATOR__DB_URI` for passthrough,
`SUPERSET_OPERATOR__DB_*` for structured). `SQLALCHEMY_ENGINE_OPTIONS` is
computed per component from the `sqlaEngineOptions` preset and Gunico [...]
@@ -117,7 +123,7 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
- **HPA**: When `autoscaling` is set, Deployment replicas is nil (HPA
manages). Supports custom metrics via `autoscalingv2.MetricSpec`. Top-level
`autoscaling`/`podDisruptionBudget` provide defaults inherited by all scalable
components; per-component values override (not merge). CeleryBeat and lifecycle
tasks are excluded (singleton/bare pods).
- **Beat singleton**: CeleryBeat always forces replicas=1 regardless of spec.
- **Gateway API**: Uses `sigs.k8s.io/gateway-api` types. Graceful handling of
missing CRDs via `meta.IsNoMatchError`.
-- **Lifecycle tasks**: `spec.lifecycle` on the parent CRD (type
`LifecycleSpec`) defines two sequential tasks: "migrate" (`superset db
upgrade`) and "init" (`superset init`). Each produces a `SupersetLifecycleTask`
child CR named `{parentName}-migrate` and `{parentName}-init`. The parent
controller is the sole orchestrator: it creates task CRs (Get+Create/Delete
pattern, never CreateOrUpdate), sequences them (init waits for migrate), gates
component deployment, and triggers re-runs by de [...]
+- **Lifecycle tasks**: `spec.lifecycle` on the parent CRD (type
`LifecycleSpec`) defines up to three sequential tasks: "clone" (database
snapshot from external source), "migrate" (`superset db upgrade`), and "init"
(`superset init`). Each produces a `SupersetLifecycleTask` child CR named
`{parentName}-clone`, `{parentName}-migrate`, and `{parentName}-init`. The
parent controller is the sole orchestrator: it creates task CRs
(Get+Create/Delete pattern, never CreateOrUpdate), sequences the [...]
- **CRD validation**: All validation uses CEL (`x-kubernetes-validations`) on
CRD types — no admission webhooks. Rules cover: environment mode restrictions,
secret mutual exclusivity, metastore/valkey validation, networking constraints,
monitoring constraints. Defaults (repository, pullPolicy, environment) use
kubebuilder default markers.
- **Metrics**: Operator exposes controller-runtime default metrics (reconcile
counts, durations, leader election) on HTTPS :8443 with Kubernetes auth/authz.
No custom metrics — controller-runtime defaults are sufficient. Superset
instance monitoring via optional `spec.monitoring.serviceMonitor` (creates a
Prometheus ServiceMonitor targeting the web-server component using unstructured
objects; gracefully skips if CRD is absent).
- **Config mount path**: `/app/pythonpath` for superset_config.py.
@@ -127,6 +133,7 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
| Parent field | CRD Kind | Component suffix | Container name |
|---|---|---|---|
+| `lifecycle` (clone) | `SupersetLifecycleTask` | `clone` | `superset` |
| `lifecycle` (migrate) | `SupersetLifecycleTask` | `migrate` | `superset` |
| `lifecycle` (init) | `SupersetLifecycleTask` | `init` | `superset` |
| `webServer` | `SupersetWebServer` | `web-server` | `superset` |
@@ -136,7 +143,7 @@ The operator uses a **two-tier CRD architecture** where the
parent `Superset` re
| `websocketServer` | `SupersetWebsocketServer` | `websocket-server` |
`superset` |
| `mcpServer` | `SupersetMcpServer` | `mcp-server` | `superset` |
-**Two-level naming:** Child CRs always use the parent name (differentiated by
Kind), except lifecycle tasks which use `{parentName}-{taskName}` (e.g.,
`{parentName}-migrate`, `{parentName}-init`). Sub-resources (Deployments,
Services, ConfigMaps) are named `{parentName}-{componentType}`. Each child
controller computes sub-resource names locally from its CR name and known
component type. Example: parent `my-superset` → child CR
`SupersetWebServer/my-superset` → Deployment `my-superset-web [...]
+**Two-level naming:** Child CRs always use the parent name (differentiated by
Kind), except lifecycle tasks which use `{parentName}-{taskName}` (e.g.,
`{parentName}-clone`, `{parentName}-migrate`, `{parentName}-init`).
Sub-resources (Deployments, Services, ConfigMaps) are named
`{parentName}-{componentType}`. Each child controller computes sub-resource
names locally from its CR name and known component type. Example: parent
`my-superset` → child CR `SupersetWebServer/my-superset` → Deplo [...]
All components use the reserved container name `superset` for the main
container. Since each component runs in its own Pod, names never collide. This
allows `kubectl exec -it <pod> -c superset` without needing to know the
component type.
@@ -154,8 +161,8 @@ All CRD names (parent and child) are validated via CEL to
be valid DNS labels (l
- **README** is a landing page: project description, philosophy, quick start,
link to docs. Keep it welcoming and free of jargon — don't reference specific
knobs, internal config names, or implementation details that might intimidate
newcomers.
- **docs/index.md** is the primary feature overview for the docs site. Keep
feature descriptions high-level and outcome-focused. Implementation details
belong in the user guide or architecture docs.
-- **docs/user-guide.md** is the full configuration reference. Here it's
appropriate to name specific fields, presets, env vars, and show concrete YAML
examples.
-- **docs/architecture.md** explains design decisions and internal structure
for contributors and advanced users.
+- **docs/user-guide/configuration.md** is the full configuration reference.
Here it's appropriate to name specific fields, presets, env vars, and show
concrete YAML examples.
+- **docs/architecture/overview.md** explains design decisions and internal
structure for contributors and advanced users.
- General principles: be concise and objective, avoid overselling or verbose
language, reserve code blocks for real code (not ASCII art), minimize
duplication between README and docs (README links to docs for details).
- **API reference** (`docs/reference/api-reference.md`) is generated from Go
types via `make codegen`. Only operator-defined types are rendered; built-in
Kubernetes types (e.g., `Affinity`, `Container`, `Volume`) are linked to
[pkg.go.dev](https://pkg.go.dev) via `knownTypes` in
`hack/api-ref-config.yaml`. When adding a field that references a new K8s type,
add a `knownTypes` entry so it renders as a link rather than being inlined.
- **`make codegen`** regenerates all generated artifacts (CRDs, DeepCopy, Helm
CRDs, API docs). Run it after modifying types in `api/v1alpha1/`. CI verifies
nothing is stale.
diff --git a/docs/architecture/internals.md b/docs/architecture/internals.md
index e21444f..2796218 100644
--- a/docs/architecture/internals.md
+++ b/docs/architecture/internals.md
@@ -59,22 +59,26 @@ parent CR name. Owned by the parent CR and
garbage-collected on parent deletion.
### Phase 3: Lifecycle Tasks
The parent controller creates `SupersetLifecycleTask` child CRs:
-`{parentName}-migrate` and `{parentName}-init`. The parent uses a
Get+Create/Delete
-pattern (never CreateOrUpdate) to avoid races with the task controller's status
-writes. When a task needs to re-run (checksum mismatch), the parent deletes the
-old CR and creates a fresh one on the next reconcile.
-
-Tasks run sequentially: migrate must complete before init starts. The task
-strategy (default: `VersionChange`) determines whether tasks are triggered —
-with the default strategy, tasks only run when the Superset image changes.
-
-When `upgradeStrategy: Drain` is set, the operator deletes all component child
-CRs before running tasks. The parent verifies all component pods have
terminated
-(not just Deployments deleted) before proceeding to task execution. This
-ensures no application pods access the metastore during schema changes. After
-tasks complete, Phase 4 recreates all components fresh.
-
-Components do not deploy until both lifecycle tasks complete (or lifecycle is
+`{parentName}-clone`, `{parentName}-migrate`, and `{parentName}-init`. The
parent
+uses a Get+Create/Delete pattern (never CreateOrUpdate) to avoid races with the
+task controller's status writes. When a task needs to re-run (checksum
mismatch),
+the parent deletes the old CR and creates a fresh one on the next reconcile.
+
+Tasks run sequentially: clone → migrate → init. Each task can be independently
+disabled via `disabled: true`. Clone also supports periodic re-execution via
+`cronSchedule`. Checksums cascade downstream: a re-clone forces re-migrate,
+which forces re-init.
+
+When a task requires drain (`requiresDrain: true`, the default for clone and
+migrate), the operator deletes all component child CRs before running that
task.
+The parent verifies all component pods have terminated (not just Deployments
+deleted) before proceeding to task execution. This ensures no application pods
+access the metastore during schema changes. If `maintenancePage` is configured,
+the parent brings up a maintenance Deployment and switches the web-server
Service
+selector before draining. After tasks complete, Phase 4 recreates all
components
+fresh.
+
+Components do not deploy until all enabled lifecycle tasks complete (or
lifecycle is
explicitly disabled via `spec.lifecycle.disabled: true`). If a task is in
progress or has failed, `Reconcile()` returns early with a requeue, skipping
Phase 4.
@@ -289,16 +293,17 @@ plaintext) because changes to these values must trigger a
rollout.
The operator uses Kubernetes owner references for automatic cleanup. The parent
`Superset` CR owns child CRDs (SupersetLifecycleTask, SupersetWebServer, etc.),
-networking resources, ServiceMonitor, and NetworkPolicies. Each child CR owns
-its managed resources — deployment CRDs own their Deployment, ConfigMap,
-Service, HPA, and PDB; the SupersetLifecycleTask CRDs own their ConfigMap and
Pods.
+the web-server Service, networking resources, ServiceMonitor, and
NetworkPolicies.
+Each child CR owns its managed resources — deployment CRDs own their
Deployment,
+ConfigMap, Service (except web-server, which is parent-owned), HPA, and PDB;
the
+SupersetLifecycleTask CRDs own their ConfigMap and Pods.
Deleting the parent cascades to all child CRs, which cascade to all their
owned resources. Removing a component from the parent spec (e.g. deleting
`spec.celeryWorker`) deletes its child CR, cascading to all owned resources.
---
-## Maintenance Page (Service Takeover via Orphan Deletion)
+## Maintenance Page (Parent-Owned Service Selector Switch)
When `spec.lifecycle.maintenancePage` is set, the operator serves a maintenance
page during drain and lifecycle tasks. This section documents the design
decision
@@ -306,30 +311,44 @@ behind the traffic switchover mechanism.
### Problem
-During drain, component child CRs are deleted. GC cascades this to Deployments,
-Services, and Pods. The web-server Service disappears, leaving users with
-connection errors instead of a friendly maintenance message.
+During drain, component child CRs are deleted. GC cascades this to Deployments
+and Pods. Without intervention, users would see connection errors instead of a
+friendly maintenance message.
-### Solution: Orphan Deletion + Selector Patch
+### Solution: Parent-Owned Web-Server Service
-The operator uses `propagationPolicy: Orphan` when deleting the
SupersetWebServer
-child CR. This preserves the Service (and Deployment) as unowned resources. The
-operator then patches the orphaned Service's selector to route traffic to
-maintenance pods, and explicitly deletes the orphaned Deployment to terminate
-web-server pods.
+The parent controller owns the web-server Service directly (not the child CR).
+During lifecycle drain, the parent:
-After lifecycle tasks complete, the operator clears any remaining owner
references
-from the Service. The subsequent component reconciliation recreates the
-SupersetWebServer child CR, whose reconciler finds the existing Service via
-`CreateOrUpdate` and adopts it via `SetControllerReference`, restoring the
-original web-server selector.
+1. Creates a maintenance Deployment (parent-owned) running a lightweight HTTP
+ server (nginx:alpine by default or a user-provided image).
+2. Switches the web-server Service's selector to match the maintenance-page pod
+ labels, instantly routing traffic to maintenance pods.
+3. Drains all component child CRs (GC cascades to Deployments and Pods, but the
+ Service is unaffected because it belongs to the parent).
+4. Runs lifecycle tasks (clone → migrate → init).
+5. After tasks complete and the web-server child CR is recreated, waits for the
+ web-server Deployment to become ready.
+6. Switches the Service selector back to the web-server pod labels.
+7. Deletes the maintenance Deployment and its ConfigMap.
+
+### Why Parent-Owned Service
+
+- Service selector changes propagate in ~1 second via the endpoints controller,
+ giving instant traffic switchover regardless of ingress implementation
+- Works for all access patterns: Ingress, direct Service, port-forward
+- No orphan deletion complexity — the Service is always owned by the parent,
+ so GC of child CRs never affects it
+- The child `SupersetWebServer` reconciler skips Service management (the parent
+ handles it), keeping the child controller simple
### Alternatives Considered
-**Owner reference manipulation** (transfer Service ownership parent ↔ child):
-Rejected because manually editing `ownerReferences` is non-standard, creates
-coupling with the GC controller's timing, and violates the principle that
-controllers should only manage resources they own.
+**Orphan deletion + selector patch** (previous design): Used
`propagationPolicy:
+Orphan` when deleting the SupersetWebServer child CR to preserve the Service,
+then patched the selector. Rejected because the orphan lifecycle was fragile —
+it was prone to race conditions between GC finalization and reconciliation, and
+the child had to detect and re-adopt the orphaned Service on recreation.
**Separate maintenance Service + Ingress/HTTPRoute backend swap**:
Architecturally
pure (clean separation, no interaction with web-server resources), but rejected
@@ -338,22 +357,6 @@ implementation — from ~1s (Envoy-based) to 1-3 minutes
(cloud load balancers l
GCP/AWS). This creates an unacceptable error window where users hit the
draining
backend. Also doesn't work for users without networking configured.
-**Parent-owned stable "frontend" Service**: Cleanest long-term architecture
(parent
-permanently owns the external-facing Service, child CRD's Service is internal),
-but requires a breaking change to Service naming and introduces a new
architectural
-concept for a single feature.
-
-### Why Orphan Deletion
-
-- Uses a standard Kubernetes API concept (`propagationPolicy: Orphan`)
-- The Service is genuinely unowned during the maintenance window — no
architectural
- boundary violation (the child controller doesn't exist at that point)
-- Service selector changes propagate in ~1 second via the endpoints controller,
- giving instant traffic switchover regardless of ingress implementation
-- Works for all access patterns: Ingress, direct Service, port-forward
-- `CreateOrUpdate` + `SetControllerReference` naturally re-adopts the orphaned
- Service when the child CR is recreated (standard controller-runtime pattern)
-
---
## Status and Conditions
diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md
index 2cf9725..b5dedd8 100644
--- a/docs/architecture/overview.md
+++ b/docs/architecture/overview.md
@@ -41,7 +41,7 @@ about.
Splitting into dedicated child CRDs and controllers isolates each component's
lifecycle. The web server controller only watches `SupersetWebServer`
resources;
it cannot interfere with Celery or init. Each child controller is simple and
-generic (all seven share `ChildReconciler`), while the parent controller
focuses
+generic (all six share `ChildReconciler`), while the parent controller focuses
solely on configuration resolution and child CR orchestration. This separation
also enables independent scaling of controller watches and makes `kubectl get`
output immediately useful — `kubectl get supersetwebservers` shows web server
@@ -97,7 +97,7 @@ Components fall into two categories:
| CRD Kind | Parent field | Suffix | Creates |
|---|---|---|---|
-| `SupersetLifecycleTask` | `lifecycle` | `-migrate`, `-init` | Pods,
ConfigMap |
+| `SupersetLifecycleTask` | `lifecycle` | `-clone`, `-migrate`, `-init` |
Pods, ConfigMap |
| `SupersetCeleryBeat` | `celeryBeat` | `-celery-beat` | Deployment, ConfigMap
|
**Presence = enabled**: Setting `celeryWorker: {}` deploys workers with
diff --git a/docs/index.md b/docs/index.md
index aae4cfb..84a7a87 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -33,7 +33,7 @@ The operator manages the full Superset lifecycle: database
migrations, configura
- **Full control** — every default is overridable, from high-level presets
down to individual fields, with a raw Python escape hatch for anything not
covered
- **Flat configuration** — shared top-level defaults inherited by all
components, with per-component overrides (primitives replace, collections merge)
- **Component toggle** — enable CeleryWorker, CeleryBeat, CeleryFlower,
WebsocketServer, or McpServer by setting their spec; omit to disable
-- **Init lifecycle** — database migration and initialization run as managed
Pods before components deploy
+- **Lifecycle management** — database cloning, migration, and initialization
run as managed Pods before components deploy
- **Checksum-driven rollouts** — config changes automatically trigger rolling
restarts of affected components
- **Networking** — Gateway API (HTTPRoute) and Ingress support
- **HPA with custom metrics**, PodDisruptionBudgets, NetworkPolicies,
Prometheus ServiceMonitor
@@ -63,9 +63,10 @@ spec:
webServer:
replicas: 2
mcpServer: {}
- init:
- adminUser: {}
- loadExamples: true
+ lifecycle:
+ init:
+ adminUser: {}
+ loadExamples: true
```
For production, use `secretKeyFrom` and `metastore.uriFrom` to reference
Kubernetes Secrets instead of inline values:
diff --git a/docs/user-guide/lifecycle.md b/docs/user-guide/lifecycle.md
index 86f7fdc..9cd6e31 100644
--- a/docs/user-guide/lifecycle.md
+++ b/docs/user-guide/lifecycle.md
@@ -474,16 +474,16 @@ The lifecycle pipeline runs: **clone → migrate → init →
components**. Comp
are not deployed until all tasks complete, and clone always drains existing
components before running (DROP DATABASE fails with active connections).
-### Clone Strategy
+### Clone Trigger and Scheduling
-| Strategy | Behavior |
-|---|---|
-| `OnTrigger` (default) | Runs when the `trigger` value changes |
-| `Always` | Runs on every spec change |
-| `Never` | Disabled |
+The clone task runs when its checksum changes. Two mechanisms trigger
re-execution:
+
+- **`trigger` field** — an opaque string (date, UUID, CI build ID). Changing it
+ causes a re-clone. Use this for manual or CI-driven refreshes.
+- **`cronSchedule` field** — a 5-field cron expression for periodic
re-execution.
+ When the clock crosses a cron boundary, the task checksum changes
automatically.
-The `trigger` field is opaque — use a date, UUID, or CI build ID. The operator
-includes it in the task checksum; changing it causes a re-clone.
+To disable clone without removing its configuration, set `disabled: true`.
### Table Exclusion
diff --git a/internal/controller/lifecycle.go b/internal/controller/lifecycle.go
index 3e425e9..c7598b8 100644
--- a/internal/controller/lifecycle.go
+++ b/internal/controller/lifecycle.go
@@ -159,6 +159,10 @@ func (r *SupersetReconciler) reconcileLifecycle(
superset.Status.Lifecycle.Phase = lifecyclePhaseDraining
return taskRequeueInterval, false, nil
}
+ // Switch Service selector to maintenance pods before drain
begins.
+ if err := r.reconcileWebServerService(ctx, superset); err !=
nil {
+ return 0, false, fmt.Errorf("switching web-server
Service to maintenance: %w", err)
+ }
}
// Drain components if any enabled task requires it.
diff --git a/internal/controller/maintenance.go
b/internal/controller/maintenance.go
index b9df6aa..79412b2 100644
--- a/internal/controller/maintenance.go
+++ b/internal/controller/maintenance.go
@@ -48,9 +48,6 @@ const (
var maintenanceDeployConfig = DeploymentConfig{
ContainerName: maintenanceContainerName,
DefaultCommand: nil,
- DefaultPorts: []corev1.ContainerPort{
- {Name: naming.PortNameHTTP, ContainerPort:
naming.PortWebServer, Protocol: corev1.ProtocolTCP},
- },
}
func isMaintenancePageEnabled(superset *supersetv1alpha1.Superset) bool {
@@ -81,16 +78,17 @@ func (r *SupersetReconciler) reconcileMaintenancePageUp(
) (bool, error) {
log := logf.FromContext(ctx)
spec := superset.Spec.Lifecycle.MaintenancePage
+ port := resolveWebServerContainerPort(superset.Spec.WebServer)
// Step 1: Reconcile ConfigMap (managed mode only).
if !isCustomMode(spec) {
- if err := r.reconcileMaintenanceConfigMap(ctx, superset, spec);
err != nil {
+ if err := r.reconcileMaintenanceConfigMap(ctx, superset, spec,
port); err != nil {
return false, fmt.Errorf("reconciling maintenance
ConfigMap: %w", err)
}
}
// Step 2: CreateOrUpdate maintenance Deployment (parent-owned).
- if err := r.reconcileMaintenanceDeployment(ctx, superset, spec); err !=
nil {
+ if err := r.reconcileMaintenanceDeployment(ctx, superset, spec, port);
err != nil {
return false, fmt.Errorf("reconciling maintenance Deployment:
%w", err)
}
@@ -114,8 +112,11 @@ func (r *SupersetReconciler) reconcileMaintenancePageUp(
// reconcileMaintenanceReturn handles the zero-downtime switchback from
// maintenance to web-server. It waits for the web-server Deployment to be
-// ready, then sets MaintenanceActive=false (so reconcileWebServerService
-// switches the selector), and finally cleans up maintenance resources.
+// ready, then sets MaintenanceActive=false so reconcileWebServerService
+// switches the selector on the same reconcile pass.
+// Resource cleanup is deferred to the caller (after the Service is reconciled)
+// to avoid a failure window where the Service still selects maintenance pods
+// whose Deployment has been deleted.
// Returns cleared=true when maintenance is inactive (either already was, or
// was just cleared).
func (r *SupersetReconciler) reconcileMaintenanceReturn(
@@ -142,15 +143,10 @@ func (r *SupersetReconciler) reconcileMaintenanceReturn(
return false, nil
}
- // Web-server is ready — switch traffic back.
+ // Web-server is ready — mark maintenance as inactive. The caller will
+ // reconcile the Service (switching selector) and then clean up resources.
superset.Status.Lifecycle.MaintenanceActive = false
log.Info("Web-server ready, clearing maintenance page")
-
- // Clean up maintenance resources.
- if err := r.deleteMaintenanceResources(ctx, superset); err != nil {
- return false, fmt.Errorf("cleaning up maintenance resources:
%w", err)
- }
-
return true, nil
}
@@ -192,6 +188,7 @@ func (r *SupersetReconciler) reconcileMaintenanceDeployment(
ctx context.Context,
superset *supersetv1alpha1.Superset,
spec *supersetv1alpha1.MaintenancePageSpec,
+ port int32,
) error {
deployName := maintenanceDeploymentName(superset.Name)
deploy := &appsv1.Deployment{
@@ -208,11 +205,16 @@ func (r *SupersetReconciler)
reconcileMaintenanceDeployment(
naming.AnnotationConfigChecksum: checksum,
}
+ cfg := maintenanceDeployConfig
+ cfg.DefaultPorts = []corev1.ContainerPort{
+ {Name: naming.PortNameHTTP, ContainerPort: port, Protocol:
corev1.ProtocolTCP},
+ }
+
_, err := controllerutil.CreateOrUpdate(ctx, r.Client, deploy, func()
error {
if err := controllerutil.SetControllerReference(superset,
deploy, r.Scheme); err != nil {
return err
}
- deploy.Spec = buildDeploymentSpec(&flat,
maintenanceDeployConfig, podAnnotations, selectorLabels)
+ deploy.Spec = buildDeploymentSpec(&flat, cfg, podAnnotations,
selectorLabels)
deploy.Labels = mergeLabels(nil,
componentLabels(string(naming.ComponentMaintenancePage), superset.Name))
return nil
})
@@ -225,6 +227,7 @@ func (r *SupersetReconciler) reconcileMaintenanceConfigMap(
ctx context.Context,
superset *supersetv1alpha1.Superset,
spec *supersetv1alpha1.MaintenancePageSpec,
+ port int32,
) error {
cmName := maintenanceConfigMapName(superset.Name)
cm := &corev1.ConfigMap{
@@ -239,7 +242,7 @@ func (r *SupersetReconciler) reconcileMaintenanceConfigMap(
return err
}
cm.Data = map[string]string{
- "default.conf": renderNginxConf(),
+ "default.conf": renderNginxConf(port),
"index.html": renderMaintenanceHTML(spec),
}
return nil
@@ -346,9 +349,9 @@ func computeMaintenanceChecksum(spec
*supersetv1alpha1.MaintenancePageSpec) stri
return fmt.Sprintf("%x", h.Sum(nil))[:16]
}
-func renderNginxConf() string {
+func renderNginxConf(port int32) string {
return `server {
- listen ` + fmt.Sprintf("%d", naming.PortWebServer) + `;
+ listen ` + fmt.Sprintf("%d", port) + `;
server_name _;
location = / {
diff --git a/internal/controller/maintenance_test.go
b/internal/controller/maintenance_test.go
index 40d3a39..e39758b 100644
--- a/internal/controller/maintenance_test.go
+++ b/internal/controller/maintenance_test.go
@@ -22,7 +22,11 @@ import (
"strings"
"testing"
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
supersetv1alpha1
"github.com/apache/superset-kubernetes-operator/api/v1alpha1"
+ "github.com/apache/superset-kubernetes-operator/internal/common"
)
func TestRenderMaintenanceHTML_EscapesTitle(t *testing.T) {
@@ -72,3 +76,113 @@ func TestRenderMaintenanceHTML_DefaultsAreEscaped(t
*testing.T) {
t.Error("expected full HTML document")
}
}
+
+func TestRenderNginxConf_UsesCustomPort(t *testing.T) {
+ conf := renderNginxConf(9090)
+ if !strings.Contains(conf, "listen 9090") {
+ t.Error("expected nginx to listen on custom port 9090")
+ }
+ if strings.Contains(conf, "listen 8088") {
+ t.Error("should not contain default port when custom port is
provided")
+ }
+}
+
+func TestRenderNginxConf_UsesDefaultPort(t *testing.T) {
+ conf := renderNginxConf(common.PortWebServer)
+ if !strings.Contains(conf, "listen 8088") {
+ t.Error("expected nginx to listen on default port 8088")
+ }
+}
+
+func TestResolveWebServerContainerPort_Default(t *testing.T) {
+ ws := &supersetv1alpha1.WebServerComponentSpec{}
+ port := resolveWebServerContainerPort(ws)
+ if port != common.PortWebServer {
+ t.Errorf("expected default port %d, got %d",
common.PortWebServer, port)
+ }
+}
+
+func TestResolveWebServerContainerPort_CustomPort(t *testing.T) {
+ ws := &supersetv1alpha1.WebServerComponentSpec{
+ ScalableComponentSpec: supersetv1alpha1.ScalableComponentSpec{
+ PodTemplate: &supersetv1alpha1.PodTemplate{
+ Container: &supersetv1alpha1.ContainerTemplate{
+ Ports: []corev1.ContainerPort{
+ {Name: "http", ContainerPort:
9090},
+ },
+ },
+ },
+ },
+ }
+ port := resolveWebServerContainerPort(ws)
+ if port != 9090 {
+ t.Errorf("expected custom port 9090, got %d", port)
+ }
+}
+
+func TestResolveWebServerContainerPort_Nil(t *testing.T) {
+ port := resolveWebServerContainerPort(nil)
+ if port != common.PortWebServer {
+ t.Errorf("expected default port %d for nil spec, got %d",
common.PortWebServer, port)
+ }
+}
+
+func TestReconcileWebServerService_SelectorBasedOnMaintenanceActive(t
*testing.T) {
+ superset := &supersetv1alpha1.Superset{
+ ObjectMeta: metav1.ObjectMeta{Name: "my-superset", Namespace:
"default"},
+ Spec: supersetv1alpha1.SupersetSpec{
+ WebServer: &supersetv1alpha1.WebServerComponentSpec{},
+ },
+ Status: supersetv1alpha1.SupersetStatus{
+ Lifecycle: &supersetv1alpha1.LifecycleStatus{
+ MaintenanceActive: true,
+ },
+ },
+ }
+
+ // When MaintenanceActive=true, selector should point to maintenance-page component.
+ expectedSelector :=
common.ComponentLabels(common.ComponentMaintenancePage, "my-superset")
+
+ // Verify the selector logic (we test the selector derivation, not the full reconcile
+ // which requires a fake client).
+ var selector map[string]string
+ if superset.Status.Lifecycle != nil &&
superset.Status.Lifecycle.MaintenanceActive {
+ selector =
common.ComponentLabels(common.ComponentMaintenancePage, superset.Name)
+ } else {
+ selector = common.ComponentLabels(common.ComponentWebServer,
superset.Name)
+ }
+ for k, v := range expectedSelector {
+ if selector[k] != v {
+ t.Errorf("expected selector[%s]=%s, got %s", k, v,
selector[k])
+ }
+ }
+
+ // When MaintenanceActive=false, selector should point to web-server.
+ superset.Status.Lifecycle.MaintenanceActive = false
+ if superset.Status.Lifecycle != nil &&
superset.Status.Lifecycle.MaintenanceActive {
+ selector =
common.ComponentLabels(common.ComponentMaintenancePage, superset.Name)
+ } else {
+ selector = common.ComponentLabels(common.ComponentWebServer,
superset.Name)
+ }
+ expectedWebServer := common.ComponentLabels(common.ComponentWebServer,
"my-superset")
+ for k, v := range expectedWebServer {
+ if selector[k] != v {
+ t.Errorf("expected selector[%s]=%s, got %s", k, v,
selector[k])
+ }
+ }
+}
+
+func TestMaintenanceDeployConfig_UsesCustomPort(t *testing.T) {
+ port := int32(9090)
+ cfg := maintenanceDeployConfig
+ cfg.DefaultPorts = []corev1.ContainerPort{
+ {Name: common.PortNameHTTP, ContainerPort: port, Protocol:
corev1.ProtocolTCP},
+ }
+
+ if len(cfg.DefaultPorts) != 1 {
+ t.Fatal("expected exactly 1 default port")
+ }
+ if cfg.DefaultPorts[0].ContainerPort != port {
+ t.Errorf("expected container port %d, got %d", port,
cfg.DefaultPorts[0].ContainerPort)
+ }
+}
diff --git a/internal/controller/superset_controller.go
b/internal/controller/superset_controller.go
index ba484af..eceedb1 100644
--- a/internal/controller/superset_controller.go
+++ b/internal/controller/superset_controller.go
@@ -178,6 +178,12 @@ func (r *SupersetReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (c
r.Recorder.Eventf(superset, nil, corev1.EventTypeWarning,
"ReconcileError", "Reconcile", "Failed to reconcile web-server Service: %v",
err)
return ctrl.Result{}, fmt.Errorf("reconciling web-server
Service: %w", err)
}
+ if maintenanceCleared {
+ // Service selector has been switched to web-server; safe to clean up
+ // maintenance resources now. Errors are non-fatal — GC will handle
+ // them since they are parent-owned.
+ _ = r.deleteMaintenanceResources(ctx, superset)
+ }
if !maintenanceCleared {
if statusErr := r.Status().Update(ctx, superset); statusErr !=
nil {
return ctrl.Result{}, fmt.Errorf("updating status
during maintenance return: %w", statusErr)