This is an automated email from the ASF dual-hosted git repository.
villebro pushed a commit to branch main
in repository
https://gitbox.apache.org/repos/asf/superset-kubernetes-operator.git
The following commit(s) were added to refs/heads/main by this push:
new a6e8a58 feat(controller): non-root helpers, task self-heal, unified routing (#99)
a6e8a58 is described below
commit a6e8a585c7c6362a91bd58760b82e5f394fa57f0
Author: Ville Brofeldt <[email protected]>
AuthorDate: Fri May 29 21:51:12 2026 -0700
feat(controller): non-root helpers, task self-heal, unified routing (#99)
---
api/v1alpha1/superset_types.go | 9 +-
.../crds/superset.apache.org_supersets.yaml | 4 +-
.../crd/bases/superset.apache.org_supersets.yaml | 4 +-
config/samples/dev-dependencies.yaml | 162 +++++++++++
config/samples/kind-cluster.yaml | 58 ++++
config/samples/kustomization.yaml | 1 +
.../superset_v1alpha1_superset_dev_full.yaml | 230 +++++++++++++++
docs/index.md | 2 +-
docs/reference/api-reference.md | 6 +-
docs/user-guide/configuration.md | 8 +-
docs/user-guide/lifecycle.md | 10 +
docs/user-guide/networking-and-monitoring.md | 19 ++
internal/common/names.go | 6 +
internal/controller/component_descriptors.go | 18 ++
internal/controller/component_resources.go | 7 +-
internal/controller/lifecycle.go | 21 +-
internal/controller/lifecycle_create_db.go | 59 +++-
internal/controller/lifecycle_create_db_test.go | 88 ++++++
internal/controller/lifecycle_job.go | 122 +++++++-
internal/controller/lifecycle_pod_health.go | 89 ++++++
internal/controller/lifecycle_pod_health_test.go | 314 +++++++++++++++++++++
internal/controller/maintenance.go | 56 ++++
internal/controller/maintenance_test.go | 74 +++++
internal/controller/networking.go | 146 ++++++----
internal/controller/networking_test.go | 133 +++++++++
25 files changed, 1563 insertions(+), 83 deletions(-)
diff --git a/api/v1alpha1/superset_types.go b/api/v1alpha1/superset_types.go
index 520242c..77b6932 100644
--- a/api/v1alpha1/superset_types.go
+++ b/api/v1alpha1/superset_types.go
@@ -35,7 +35,7 @@ import (
// +kubebuilder:validation:XValidation:rule="(has(self.environment) &&
self.environment == 'Development') || !has(self.lifecycle) ||
!has(self.lifecycle.init) ||
!has(self.lifecycle.init.adminUser)",message="lifecycle.init.adminUser is only
allowed when environment is Development"
// +kubebuilder:validation:XValidation:rule="(has(self.environment) &&
self.environment == 'Development') || !has(self.lifecycle) ||
!has(self.lifecycle.init) ||
!has(self.lifecycle.init.loadExamples)",message="lifecycle.init.loadExamples is
only allowed when environment is Development"
// +kubebuilder:validation:XValidation:rule="(has(self.environment) &&
self.environment == 'Development') || !has(self.websocketServer) ||
!has(self.websocketServer.config)",message="websocketServer.config is only
allowed when environment is Development; use websocketServer.configFrom to
reference a Secret in Staging or Production"
-// +kubebuilder:validation:XValidation:rule="!has(self.networking) ||
!has(self.networking.ingress) ||
has(self.webServer)",message="spec.networking.ingress requires spec.webServer
to be set (all Ingress rules target the web server service)"
+// +kubebuilder:validation:XValidation:rule="!has(self.networking) ||
!has(self.networking.ingress) ||
has(self.webServer)",message="spec.networking.ingress requires spec.webServer
to be set (it provides the catch-all '/' route; other components are routed by
path)"
// +kubebuilder:validation:XValidation:rule="!has(self.networking) ||
!has(self.networking.gateway) || has(self.webServer) ||
has(self.websocketServer) || has(self.mcpServer) ||
has(self.celeryFlower)",message="spec.networking.gateway requires at least one
component with a routable service (webServer, websocketServer, mcpServer, or
celeryFlower)"
// +kubebuilder:validation:XValidation:rule="!has(self.monitoring) ||
!has(self.monitoring.serviceMonitor) ||
has(self.webServer)",message="spec.monitoring.serviceMonitor requires
spec.webServer to be set (scrapes the web server service)"
// +kubebuilder:validation:XValidation:rule="(has(self.environment) &&
(self.environment == 'Development' || self.environment == 'Staging')) ||
!has(self.lifecycle) || !has(self.lifecycle.clone) ||
(has(self.lifecycle.clone.disabled) &&
self.lifecycle.clone.disabled)",message="lifecycle.clone is only allowed when
environment is Development or Staging; cloning performs a destructive DROP
DATABASE on the target metastore"
@@ -141,6 +141,9 @@ type SupersetSpec struct {
// +optional
CeleryFlower *CeleryFlowerComponentSpec `json:"celeryFlower,omitempty"`
// WebSocket server for real-time updates (Node.js, no Python config).
+ // Experimental: the websocket server is not yet well supported (it requires a
+ // custom Node.js image, and gateway/ingress routing for it is unvalidated).
+ // Treat it as subject to change.
// +optional
WebsocketServer *WebsocketServerComponentSpec
`json:"websocketServer,omitempty"`
// FastMCP server component for AI tooling integration.
@@ -265,6 +268,10 @@ type CeleryFlowerComponentSpec struct {
// WebsocketServerComponentSpec defines the websocket server component on the
parent CRD.
// The websocket server is a Node.js app — the default Superset image does not
contain
// websocket_server.js, so an image override is required.
+//
+// Experimental: this component is not yet well supported. It requires a custom
+// Node.js image, and path-based gateway/ingress routing to it has not been
+// validated. The shape and behavior may change in a future release.
// +kubebuilder:validation:XValidation:rule="has(self.image) &&
has(self.image.repository) && size(self.image.repository) >
0",message="websocketServer.image.repository is required: the default Superset
image does not include websocket_server.js"
// +kubebuilder:validation:XValidation:rule="!(has(self.config) &&
has(self.configFrom))",message="websocketServer.config and
websocketServer.configFrom are mutually exclusive"
type WebsocketServerComponentSpec struct {
diff --git a/charts/superset-operator/crds/superset.apache.org_supersets.yaml
b/charts/superset-operator/crds/superset.apache.org_supersets.yaml
index e381d80..0245961 100644
--- a/charts/superset-operator/crds/superset.apache.org_supersets.yaml
+++ b/charts/superset-operator/crds/superset.apache.org_supersets.yaml
@@ -39279,8 +39279,8 @@ spec:
in Staging or Production
rule: (has(self.environment) && self.environment ==
'Development') ||
!has(self.websocketServer) || !has(self.websocketServer.config)
- - message: spec.networking.ingress requires spec.webServer to be
set (all
- Ingress rules target the web server service)
+ - message: spec.networking.ingress requires spec.webServer to be
set (it
+ provides the catch-all '/' route; other components are routed
by path)
rule: '!has(self.networking) || !has(self.networking.ingress) ||
has(self.webServer)'
- message: spec.networking.gateway requires at least one component
with
a routable service (webServer, websocketServer, mcpServer, or
celeryFlower)
diff --git a/config/crd/bases/superset.apache.org_supersets.yaml
b/config/crd/bases/superset.apache.org_supersets.yaml
index e381d80..0245961 100644
--- a/config/crd/bases/superset.apache.org_supersets.yaml
+++ b/config/crd/bases/superset.apache.org_supersets.yaml
@@ -39279,8 +39279,8 @@ spec:
in Staging or Production
rule: (has(self.environment) && self.environment ==
'Development') ||
!has(self.websocketServer) || !has(self.websocketServer.config)
- - message: spec.networking.ingress requires spec.webServer to be
set (all
- Ingress rules target the web server service)
+ - message: spec.networking.ingress requires spec.webServer to be
set (it
+ provides the catch-all '/' route; other components are routed
by path)
rule: '!has(self.networking) || !has(self.networking.ingress) ||
has(self.webServer)'
- message: spec.networking.gateway requires at least one component
with
a routable service (webServer, websocketServer, mcpServer, or
celeryFlower)
diff --git a/config/samples/dev-dependencies.yaml
b/config/samples/dev-dependencies.yaml
new file mode 100644
index 0000000..2010a1b
--- /dev/null
+++ b/config/samples/dev-dependencies.yaml
@@ -0,0 +1,162 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Throwaway PostgreSQL + Valkey for local DEVELOPMENT/testing on Kind.
+# Provides the two backing services that superset_v1alpha1_superset_dev_full.yaml
+# expects: a PostgreSQL at host "postgres" and a Valkey at host "valkey", with
+# matching credentials (user/db "superset", password "superset").
+#
+# WARNING: ephemeral (emptyDir) storage, plain-text passwords, single replica —
+# NOT for production. Data is lost when the pod restarts.
+#
+# kubectl apply -f config/samples/dev-dependencies.yaml
+#
+# Prefer Helm? The Bitnami charts provide the same services with auth flags.
+# Set fullnameOverride so the Service names match the sample (or update
+# spec.metastore.host / spec.valkey.host to the chart's Service names):
+#
+# helm install postgres oci://registry-1.docker.io/bitnamicharts/postgresql \
+# --set auth.username=superset --set auth.password=superset \
+# --set auth.database=superset --set fullnameOverride=postgres
+#
+# helm install valkey oci://registry-1.docker.io/bitnamicharts/valkey \
+# --set architecture=standalone --set auth.password=superset \
+# --set fullnameOverride=valkey
+#
+# After a Helm install, run `kubectl get svc` and point the sample at the
+# resulting Service names if they differ (e.g. valkey-primary).
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: postgres
+ labels:
+ app: postgres
+ app.kubernetes.io/part-of: superset-dev
+spec:
+ selector:
+ app: postgres
+ ports:
+ - name: postgres
+ port: 5432
+ targetPort: 5432
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: postgres
+ labels:
+ app: postgres
+ app.kubernetes.io/part-of: superset-dev
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: postgres
+ template:
+ metadata:
+ labels:
+ app: postgres
+ app.kubernetes.io/part-of: superset-dev
+ spec:
+ containers:
+ - name: postgres
+ image: postgres:17-alpine
+ env:
+ - name: POSTGRES_USER
+ value: superset
+ - name: POSTGRES_PASSWORD
+ value: superset
+ - name: POSTGRES_DB
+ value: superset
+ # Write data under a subdir so the emptyDir mount root stays clean.
+ - name: PGDATA
+ value: /var/lib/postgresql/data/pgdata
+ ports:
+ - name: postgres
+ containerPort: 5432
+ readinessProbe:
+ exec:
+ command: ["pg_isready", "-U", "superset", "-d", "superset"]
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ limits:
+ cpu: "1"
+ memory: 512Mi
+ volumeMounts:
+ - name: data
+ mountPath: /var/lib/postgresql/data
+ volumes:
+ - name: data
+ emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: valkey
+ labels:
+ app: valkey
+ app.kubernetes.io/part-of: superset-dev
+spec:
+ selector:
+ app: valkey
+ ports:
+ - name: valkey
+ port: 6379
+ targetPort: 6379
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: valkey
+ labels:
+ app: valkey
+ app.kubernetes.io/part-of: superset-dev
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: valkey
+ template:
+ metadata:
+ labels:
+ app: valkey
+ app.kubernetes.io/part-of: superset-dev
+ spec:
+ containers:
+ - name: valkey
+ image: valkey/valkey:8-alpine
+ # Require a password (matches spec.valkey.password in the sample) and
+ # disable on-disk persistence for an ephemeral dev cache.
+ command: ["valkey-server", "--requirepass", "superset", "--save", ""]
+ ports:
+ - name: valkey
+ containerPort: 6379
+ readinessProbe:
+ exec:
+ command: ["valkey-cli", "-a", "superset", "ping"]
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 500m
+ memory: 256Mi
diff --git a/config/samples/kind-cluster.yaml b/config/samples/kind-cluster.yaml
new file mode 100644
index 0000000..bf7af1a
--- /dev/null
+++ b/config/samples/kind-cluster.yaml
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kind cluster config for locally testing the comprehensive dev sample through an
+# ingress controller WITHOUT port-forwards. It maps host ports 80/443 onto the
+# cluster and labels the control-plane node `ingress-ready=true` so ingress-nginx
+# (Kind provider) schedules its controller there and binds those ports.
+#
+# extraPortMappings can only be set at cluster creation, so this requires a fresh
+# cluster:
+#
+# kind create cluster --config config/samples/kind-cluster.yaml
+# # then ingress-nginx (Kind provider):
+# kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
+# kubectl -n ingress-nginx wait --for=condition=ready pod \
+# -l app.kubernetes.io/component=controller --timeout=120s
+# # then the backing services + sample:
+# kubectl apply -f config/samples/dev-dependencies.yaml
+# kubectl apply -f config/samples/superset_v1alpha1_superset_dev_full.yaml
+#
+# *.localhost resolves to 127.0.0.1, so the instance is then reachable directly,
+# no /etc/hosts edit and no port-forward:
+#
+# http://superset-dev.localhost/ -> web server
+# http://superset-dev.localhost/flower/ -> Celery Flower
+# http://superset-dev.localhost/mcp -> MCP server
+#
+# If host ports 80/443 are already in use (or privileged on your setup), change
+# the hostPort values below (e.g. 8080/8443) and use that port in the URLs.
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+ - role: control-plane
+ kubeadmConfigPatches:
+ - |
+ kind: InitConfiguration
+ nodeRegistration:
+ kubeletExtraArgs:
+ node-labels: "ingress-ready=true"
+ extraPortMappings:
+ - containerPort: 80
+ hostPort: 80
+ protocol: TCP
+ - containerPort: 443
+ hostPort: 443
+ protocol: TCP
diff --git a/config/samples/kustomization.yaml
b/config/samples/kustomization.yaml
index b5740d9..7b53e9a 100644
--- a/config/samples/kustomization.yaml
+++ b/config/samples/kustomization.yaml
@@ -1,5 +1,6 @@
## Append samples of your project ##
resources:
- superset_v1alpha1_superset.yaml
+- superset_v1alpha1_superset_dev_full.yaml
- superset_v1alpha1_superset_prod.yaml
# +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/config/samples/superset_v1alpha1_superset_dev_full.yaml
b/config/samples/superset_v1alpha1_superset_dev_full.yaml
new file mode 100644
index 0000000..55c18d6
--- /dev/null
+++ b/config/samples/superset_v1alpha1_superset_dev_full.yaml
@@ -0,0 +1,230 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Comprehensive Development-mode sample. Brings up nearly every component and
+# feature at once for end-to-end smoke testing (intended for a local Kind
+# cluster). NOT for production — it uses inline secrets and dev-only fields.
+#
+# Exercises: web server, Celery worker, Celery beat, Flower, MCP server,
+# deployment/pod templating, template processing, thumbnails, and alerts &
+# reports. The websocket server is intentionally omitted (experimental).
+#
+# Prerequisites (in the same namespace):
+# - A PostgreSQL reachable at host "postgres" (db/user/pass "superset").
+# - A Valkey (or Redis) reachable at host "valkey" (password "superset").
+# The companion manifest provides both for local testing:
+# kubectl apply -f config/samples/dev-dependencies.yaml
+# (It also documents an equivalent Bitnami Helm install.)
+#
+# Accessing the UI:
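+# - NodePort: http://localhost:30088 (no ingress controller needed). Kind nodes
+# run as containers, so localhost:30088 is only reachable when the cluster was
+# created with an extraPortMappings entry for port 30088 (the provided
+# config/samples/kind-cluster.yaml maps only 80/443); otherwise use the
+# port-forward below.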
+# - Port-forward (universal):
+# kubectl port-forward svc/superset-dev-web-server 8088:8088
+# - Ingress: spec.networking.ingress below targets the nginx IngressClass and
+# host "superset-dev.localhost". It only takes effect if an ingress-nginx
+# controller is installed; it is harmless otherwise. Because the host has no
+# explicit paths, the operator fans it out per component, so you also get
+# http://superset-dev.localhost/flower/ and /mcp on the same host. To reach
+# it on :80 with no port-forward, create the cluster from
+# config/samples/kind-cluster.yaml (maps host 80/443) and install ingress-nginx.
+# Log in with admin / admin (created by the init task).
+#
+# Alerts & Reports caveat: the operator renders all the necessary config
+# (FEATURE_FLAGS, beat schedule, WebDriver settings), but actual screenshot
+# generation requires a headless browser in the image. The stock Superset image
+# does not ship one, so reports/thumbnails will configure but may not render
+# screenshots without additional browser dependencies.
+apiVersion: superset.apache.org/v1alpha1
+kind: Superset
+metadata:
+ name: superset-dev
+spec:
+ image:
+ tag: 6.1.0-dev
+ environment: Development
+ secretKey: thisIsNotSecure_changeInProduction!
+
+ # Structured metastore (dev-mode inline password). Renders
SQLALCHEMY_DATABASE_URI
+ # as an f-string from operator-injected env vars. createDatabase attaches a
+ # one-shot init container to the migrate lifecycle Job that creates the
target
+ # database if it does not already exist (idempotent).
+ metastore:
+ type: PostgreSQL
+ host: postgres
+ port: 5432
+ database: superset
+ username: superset
+ password: superset
+ createDatabase: true
+
+ # Valkey drives cache, Celery broker/results, and the thumbnail cache. The
+ # operator renders CACHE_CONFIG, THUMBNAIL_CACHE_CONFIG, CeleryConfig, and
+ # RESULTS_BACKEND automatically. Cache sub-specs below are shown explicitly
to
+ # demonstrate the knobs; broker/result backends use their defaults.
+ valkey:
+ host: valkey
+ port: 6379
+ password: superset
+ cache:
+ database: 1
+ defaultTimeout: 300
+ thumbnailCache:
+ database: 5
+ defaultTimeout: 3600
+
+ # Application feature flags -> FEATURE_FLAGS dict in superset_config.py.
+ featureFlags:
+ ENABLE_TEMPLATE_PROCESSING: true
+ THUMBNAILS: true
+ ALERT_REPORTS: true
+
+ # Top-level deployment/pod templating — defaults inherited by every component
+ # AND by the lifecycle task pods.
+ deploymentTemplate:
+ revisionHistoryLimit: 3
+ podTemplate:
+ # runAsNonRoot is enforced everywhere, including lifecycle task pods.
+ # runAsUser must be a NUMERIC UID: the Superset image declares `USER superset`
+ # (a name), and kubelet cannot verify a non-numeric user is non-root — so
+ # runAsNonRoot alone is rejected with "image has non-numeric user". The
+ # official Superset image's `superset` user is UID 1000. (The operator's own
+ # helper containers — create-database, maintenance page — default to a
+ # non-root UID on their own, so they need nothing here.)
+ podSecurityContext:
+ runAsNonRoot: true
+ runAsUser: 1000
+ runAsGroup: 1000
+ seccompProfile:
+ type: RuntimeDefault
+ container:
+ securityContext:
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop: ["ALL"]
+ readOnlyRootFilesystem: false # Superset needs /tmp and /app writes
+ resources:
+ requests:
+ cpu: 250m
+ memory: 512Mi
+ limits:
+ cpu: "1"
+ memory: 1Gi
+ env:
+ - name: SUPERSET_LOG_LEVEL
+ value: info
+
+ # Shared Python config (applies to all Python components). WebDriver/screenshot
+ # settings power thumbnails and alerts & reports; CeleryConfig.imports/prefetch
+ # are shared Celery app behavior layered onto the operator-rendered CeleryConfig.
+ config: |
+ # --- Reports & thumbnails: WebDriver / screenshot settings ---
+ WEBDRIVER_TYPE = "chrome"
+ WEBDRIVER_BASEURL = "http://superset-dev-web-server:8088/"
+ WEBDRIVER_BASEURL_USER_FRIENDLY = "http://localhost:30088/"
+ # Selenium/Playwright logs in as this user to render screenshots.
+ THUMBNAIL_SELENIUM_USER = "admin"
+ SCREENSHOT_LOCATE_WAIT = 30
+ SCREENSHOT_LOAD_WAIT = 60
+
+ # --- Shared Celery app behavior (layered onto the operator's CeleryConfig) ---
+ CeleryConfig.imports = (
+ "superset.sql_lab",
+ "superset.tasks.scheduler",
+ "superset.tasks.thumbnails",
+ "superset.tasks.cache",
+ )
+ CeleryConfig.worker_prefetch_multiplier = 1
+ CeleryConfig.task_acks_late = False
+
+ webServer:
+ replicas: 2
+ service:
+ type: NodePort
+ nodePort: 30088
+ # Per-component template override demonstrating field-level merge: larger
+ # resources than the top-level default, plus a graceful-shutdown preStop
hook.
+ podTemplate:
+ container:
+ resources:
+ requests:
+ cpu: 500m
+ memory: 1Gi
+ limits:
+ cpu: "2"
+ memory: 2Gi
+ lifecycle:
+ preStop:
+ exec:
+ command: ["/bin/sh", "-c", "sleep 15"]
+
+ celeryWorker:
+ replicas: 2
+ celery:
+ preset: balanced
+
+ # Singleton scheduler. Beat-only settings live here so schedule edits roll
just
+ # the Beat Deployment. beat_schedule mutates the operator-rendered
CeleryConfig.
+ celeryBeat:
+ config: |
+ from celery.schedules import crontab
+
+ CeleryConfig.beat_schedule = {
+ "reports.scheduler": {
+ "task": "reports.scheduler",
+ "schedule": crontab(minute="*", hour="*"),
+ },
+ "reports.prune_log": {
+ "task": "reports.prune_log",
+ "schedule": crontab(minute=0, hour=0),
+ },
+ }
+
+ # Flower is not bundled in the stock Superset image. Install it at startup via
+ # the bootstrap script (sourced before the flower command). The runtime
+ # `uv pip install` needs to write to the venv (and uv's cache), so this one pod
+ # runs as root — overriding the top-level non-root policy. Dev convenience only;
+ # for production, bake flower into a custom image and drop this override.
+ celeryFlower:
+ bootstrapScript: |
+ uv pip install --no-cache flower
+ podTemplate:
+ podSecurityContext:
+ runAsNonRoot: false
+ runAsUser: 0
+
+ mcpServer: {}
+
+ # Optional Ingress demo — only effective when an ingress-nginx controller is
+ # present. With no explicit paths, the operator routes every component by
path
+ # under this host (/ → web, /flower → flower, /mcp → mcp), so all services
are
+ # reachable from one origin. Use the NodePort or port-forward above
otherwise.
+ networking:
+ ingress:
+ className: nginx
+ host: superset-dev.localhost
+
+ lifecycle:
+ # Lightweight maintenance page served while components are drained for
+ # lifecycle tasks. Managed mode (no image) renders an nginx:alpine page
from
+ # the title/message below. Only started when a drain actually runs and an
+ # existing web-server workload is present.
+ maintenancePage:
+ title: Superset is upgrading
+ message: We'll be back shortly — applying database migrations.
+ migrate: {}
+ init:
+ adminUser: {}
+ loadExamples: true
diff --git a/docs/index.md b/docs/index.md
index b63d37c..c2f2927 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -31,7 +31,7 @@ The operator manages the full Superset lifecycle: database
migrations, configura
- **Sensible defaults** — defaults adjust based on the components and presets
you configure
- **Automatic config rendering** — structured fields for metastore, Valkey,
Gunicorn, and Celery generate `superset_config.py` per component; config
changes trigger rolling restarts
- **Configurable** — defaults can be overridden at the preset,
deployment-template, or container level, with a raw Python escape hatch in
`spec.config`/`spec.<component>.config` for settings not surfaced as typed
fields
-- **Component toggle** — enable CeleryWorker, CeleryBeat, CeleryFlower,
WebsocketServer, or McpServer by setting their spec; omit to disable
+- **Component toggle** — enable CeleryWorker, CeleryBeat, CeleryFlower,
McpServer, or WebsocketServer (experimental) by setting their spec; omit to
disable
- **Maintenance-backed upgrades** — when database migrations need to run, the
operator drains components, runs the lifecycle tasks, and restores traffic only
after the new version is healthy; an optional maintenance page can serve users
during the window when configured
- **Lifecycle automation** — database cloning, schema migrations, secret key
rotation, and application init run as sequenced tasks with automatic change
detection and checksum-based re-execution
- **Networking** — Gateway API (HTTPRoute) and Ingress support with
per-component routing
diff --git a/docs/reference/api-reference.md b/docs/reference/api-reference.md
index 02a1def..f068d87 100644
--- a/docs/reference/api-reference.md
+++ b/docs/reference/api-reference.md
@@ -1079,7 +1079,7 @@ _Appears in:_
| `celeryWorker` _[CeleryWorkerComponentSpec](#celeryworkercomponentspec)_ |
Celery async task worker component. Uses spec.valkey as broker/backend when
set;<br />otherwise the broker must be configured manually via spec.config. |
| Optional: \{\} <br /> |
| `celeryBeat` _[CeleryBeatComponentSpec](#celerybeatcomponentspec)_ | Celery
periodic task scheduler (singleton, always 1 replica). Uses spec.valkey<br />as
broker/backend when set; otherwise the broker must be configured manually<br
/>via spec.config. | | Optional: \{\} <br /> |
| `celeryFlower` _[CeleryFlowerComponentSpec](#celeryflowercomponentspec)_ |
Celery Flower monitoring UI component. | | Optional: \{\} <br /> |
-| `websocketServer`
_[WebsocketServerComponentSpec](#websocketservercomponentspec)_ | WebSocket
server for real-time updates (Node.js, no Python config). | | Optional: \{\}
<br /> |
+| `websocketServer`
_[WebsocketServerComponentSpec](#websocketservercomponentspec)_ | WebSocket
server for real-time updates (Node.js, no Python config).<br />Experimental:
the websocket server is not yet well supported (it requires a<br />custom
Node.js image, and gateway/ingress routing for it is unvalidated).<br />Treat
it as subject to change. | | Optional: \{\} <br /> |
| `mcpServer` _[McpServerComponentSpec](#mcpservercomponentspec)_ | FastMCP
server component for AI tooling integration. | | Optional: \{\} <br /> |
| `lifecycle` _[LifecycleSpec](#lifecyclespec)_ | Lifecycle configuration
(database migration, init, upgrade mode). | | Optional: \{\} <br /> |
| `networking` _[NetworkingSpec](#networkingspec)_ | Networking configuration
(Ingress or Gateway API). | | Optional: \{\} <br /> |
@@ -1300,6 +1300,10 @@ WebsocketServerComponentSpec defines the websocket
server component on the paren
The websocket server is a Node.js app — the default Superset image does not
contain
websocket_server.js, so an image override is required.
+Experimental: this component is not yet well supported. It requires a custom
+Node.js image, and path-based gateway/ingress routing to it has not been
+validated. The shape and behavior may change in a future release.
+
_Appears in:_
diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md
index 5dc7f67..f169943 100644
--- a/docs/user-guide/configuration.md
+++ b/docs/user-guide/configuration.md
@@ -175,7 +175,7 @@ Requirements and caveats:
- **Structured metastore only.** Rejected by CRD validation when `uri` or
`uriFrom` is set — the operator needs the individual host/database/username
fields to issue admin-level statements.
- **Privileges.** The configured metastore user must have `CREATEDB`
(PostgreSQL) or `CREATE` (MySQL) privilege on the server. The init container
connects to the `postgres` admin database (PostgreSQL) or runs `CREATE DATABASE
IF NOT EXISTS` (MySQL).
- **Init container image.** The operator uses `postgres:17-alpine` or
`mysql:8-alpine` (matching the clone task) — the Superset image is not assumed
to ship database client tools.
-- **Resources and securityContext are inherited from
`spec.lifecycle.podTemplate.container`.** Whatever you set on
`spec.lifecycle.podTemplate.container.resources` and
`spec.lifecycle.podTemplate.container.securityContext` is applied to the
create-database init container. This lets you satisfy strict admission policies
(Pod Security Standards `restricted`, Kyverno, OPA) without a dedicated knob.
+- **Resources and securityContext are inherited from
`spec.lifecycle.podTemplate.container`.** Whatever you set on
`spec.lifecycle.podTemplate.container.resources` and
`spec.lifecycle.podTemplate.container.securityContext` is applied to the
create-database init container. This lets you satisfy strict admission policies
(Pod Security Standards `restricted`, Kyverno, OPA) without a dedicated knob.
The init container also defaults to a non-root UID (matching its DB-tool
image), so it starts [...]
- **Redundant with `lifecycle.clone`.** Clone already drops and re-creates its
target database every time it runs, so toggling `createDatabase` on alongside
clone is harmless but does no extra work in practice — the init container
detects the existing database (created by clone) and no-ops.
## Valkey
@@ -611,6 +611,12 @@ spec:
## Websocket Server
+!!! warning "Experimental"
+ The websocket server is **experimental and not yet well supported**. It
+ requires a custom Node.js image (below), and path-based gateway/ingress
+ routing to it is unvalidated. Treat its spec and behavior as subject to
+ change.
+
Enable Superset's async event streaming by setting `websocketServer`. This
deploys a **Node.js** application (not Python) that pushes real-time updates to
dashboards via WebSocket connections.
diff --git a/docs/user-guide/lifecycle.md b/docs/user-guide/lifecycle.md
index e0f3e38..dba6af4 100644
--- a/docs/user-guide/lifecycle.md
+++ b/docs/user-guide/lifecycle.md
@@ -362,6 +362,16 @@ On failure, the operator retries with exponential backoff
(`10s * 2^(attempt-1)`
capped at 5m). If a Job exceeds the timeout while Running or Pending, it counts
as a failed attempt.
+If a task pod cannot start at all — a `CreateContainerConfigError` (for example a
+`runAsNonRoot` policy the image can't satisfy), an image pull failure, or an
+unschedulable pod — the operator surfaces the reason on the `Superset` status and
+events rather than silently waiting. Once you change the task's pod
+configuration (`securityContext`, resources, image, etc.), the operator
+recreates the task Job from the corrected spec automatically. This also rescues a
+task that has already exhausted its retries: editing how the pod runs lets it run
+again, with no need to delete the Job by hand. A purely application-level failure
+(unchanged pod, e.g. a broken migration) stays terminal so it does not loop.
+
**Task retention policies:**
| Policy | On Success | On Failure |
diff --git a/docs/user-guide/networking-and-monitoring.md
b/docs/user-guide/networking-and-monitoring.md
index fe70ff5..36540fc 100644
--- a/docs/user-guide/networking-and-monitoring.md
+++ b/docs/user-guide/networking-and-monitoring.md
@@ -78,6 +78,25 @@ spec:
Gateway API and Ingress are mutually exclusive — set one or the other, not
both.
+Ingress uses the **same per-component path routing** as Gateway API: a host
with
+no explicit `paths` is expanded by the operator into one rule per present
+component (`/` → web server, plus `/flower`, `/mcp`, `/ws` for the components
+that are enabled), reusing each component's `service.gatewayPath`. Requests are
+forwarded as-is (no path rewrite), so each component owns its subpath the same
+way it does under Gateway API (e.g. Flower via its `--url_prefix`).
+
+```yaml
+spec:
+ networking:
+ ingress:
+ className: nginx
+ host: superset.example.com # no explicit paths -> all components
routed by path
+```
+
+A host **with** explicit `paths` is treated as a user-controlled override and
+those paths route to the web server only — use this when you want full control
+of the path rules for a host:
+
```yaml
spec:
networking:
diff --git a/internal/common/names.go b/internal/common/names.go
index be8fd2f..b204024 100644
--- a/internal/common/names.go
+++ b/internal/common/names.go
@@ -59,6 +59,12 @@ const (
// Annotation keys.
const (
AnnotationConfigChecksum = "superset.apache.org/config-checksum"
+ // AnnotationTaskPodSpecHash records a hash of a lifecycle task Job's rendered
+ // pod spec. Unlike the semantic task checksum, it changes for any pod-spec
+ // edit (resources, securityContext, etc.). It lets the controller recreate a
+ // task Job that is wedged (un-startable), or retry one that has exhausted its
+ // retries, once the pod spec changes — without re-running healthy tasks on
+ // cosmetic edits.
+ AnnotationTaskPodSpecHash = "superset.apache.org/task-pod-spec-hash"
)
// Default ports.
diff --git a/internal/controller/component_descriptors.go
b/internal/controller/component_descriptors.go
index 4e471d4..3d3fcd9 100644
--- a/internal/controller/component_descriptors.go
+++ b/internal/controller/component_descriptors.go
@@ -302,10 +302,28 @@ func (r *SupersetReconciler) reconcileComponent(
)
}
+ // Flower serves under its URL prefix (--url_prefix), so its HTTP health
+ // endpoint lives at <prefix>/healthcheck, not /healthcheck. Build the probes
+ // from the same prefix used for the env var; otherwise the probes 404 and
+ // kubelet keeps restarting the (otherwise healthy) pod into CrashLoopBackOff.
+ applyFlowerProbes(desc.componentType, &cfg, accessor.service)
+
return reconcileComponentResources(ctx, r.Client, r.Scheme, r.Recorder,
superset,
&flatSpec, cfg, workloadChecksum, accessor.service,
flatSpec.Autoscaling, flatSpec.PodDisruptionBudget)
}
+// applyFlowerProbes overrides the Celery Flower health probes so they target
+// Flower's prefixed health endpoint (<url_prefix>/healthcheck). No-op for other
+// components. See flowerHealthPath for why the prefix matters.
+func applyFlowerProbes(componentType naming.ComponentType, cfg
*componentReconcilerConfig, svc *supersetv1alpha1.ComponentServiceSpec) {
+ if componentType != naming.ComponentCeleryFlower {
+ return
+ }
+ probePath := flowerHealthPath(svc)
+ cfg.deployConfig.DefaultLivenessProbe = httpProbe(probePath,
naming.PortCeleryFlower, 15)
+ cfg.deployConfig.DefaultReadinessProbe = httpProbe(probePath,
naming.PortCeleryFlower, 5)
+}
+
func (r *SupersetReconciler) deleteComponentResources(ctx context.Context,
superset *supersetv1alpha1.Superset, desc *componentDescriptor) error {
resourceBaseName := naming.ResourceBaseName(superset.Name,
desc.componentType)
diff --git a/internal/controller/component_resources.go
b/internal/controller/component_resources.go
index ac97af4..94bf83d 100644
--- a/internal/controller/component_resources.go
+++ b/internal/controller/component_resources.go
@@ -125,8 +125,11 @@ func ComponentResourceDefs() []ComponentResourceDef {
DefaultPorts: []corev1.ContainerPort{
{Name: common.PortNameHTTP,
ContainerPort: common.PortCeleryFlower, Protocol: corev1.ProtocolTCP},
},
- DefaultLivenessProbe:
httpProbe("/api/workers", common.PortCeleryFlower, 15),
- DefaultReadinessProbe:
httpProbe("/api/workers", common.PortCeleryFlower, 5),
+ // Path is overridden per-instance by
applyFlowerProbes to include
+ // Flower's URL prefix
(<prefix>/healthcheck); the bare path here is
+ // only a placeholder for the no-prefix
case.
+ DefaultLivenessProbe:
httpProbe("/healthcheck", common.PortCeleryFlower, 15),
+ DefaultReadinessProbe:
httpProbe("/healthcheck", common.PortCeleryFlower, 5),
},
defaultPort: common.PortCeleryFlower,
hasScaling: true,
diff --git a/internal/controller/lifecycle.go b/internal/controller/lifecycle.go
index 4c2fe8f..468d7ff 100644
--- a/internal/controller/lifecycle.go
+++ b/internal/controller/lifecycle.go
@@ -520,10 +520,23 @@ func (r *SupersetReconciler) reconcileLifecycleTask(
}
if taskRef.State == taskStateFailed && taskRef.CompletedChecksum ==
taskChecksum && taskRef.Attempts >= taskRef.MaxRetries {
- log.Info("Task permanently failed", "task", taskType)
- setCondition(&superset.Status.Conditions,
supersetv1alpha1.ConditionTypeLifecycleComplete,
- metav1.ConditionFalse, "TaskFailed", fmt.Sprintf("%s:
%s", taskType, taskRef.Message), superset.Generation)
- return lifecycleTerminal(), nil
+ // A terminally failed task is normally not retried. But if the
user has
+ // since changed the task's pod spec (e.g. fixed a
securityContext or
+ // bumped resources), that change may be exactly what fixes the
failure —
+ // so fall through to the stale-reset path below to give it
another run.
+ // Absent a pod-spec change we stay terminal, so genuine
failures (bad
+ // migration SQL, etc.) don't loop.
+ podSpecChanged, err := r.taskPodSpecChanged(ctx, superset,
taskName, &flatSpec)
+ if err != nil {
+ return lifecycleResult{}, fmt.Errorf("checking pod spec
for failed task %s: %w", taskName, err)
+ }
+ if !podSpecChanged {
+ log.Info("Task permanently failed", "task", taskType)
+ setCondition(&superset.Status.Conditions,
supersetv1alpha1.ConditionTypeLifecycleComplete,
+ metav1.ConditionFalse, "TaskFailed",
fmt.Sprintf("%s: %s", taskType, taskRef.Message), superset.Generation)
+ return lifecycleTerminal(), nil
+ }
+ log.Info("Task previously failed but pod spec changed;
retrying", "task", taskType)
}
if taskRef.State == taskStateComplete || taskRef.State ==
taskStateFailed || (taskRef.CompletedChecksum != "" &&
taskRef.CompletedChecksum != taskChecksum) {
diff --git a/internal/controller/lifecycle_create_db.go
b/internal/controller/lifecycle_create_db.go
index 9cd7f76..d02b769 100644
--- a/internal/controller/lifecycle_create_db.go
+++ b/internal/controller/lifecycle_create_db.go
@@ -90,6 +90,13 @@ echo "Ensured MySQL database $SUPERSET_OPERATOR__DB_NAME
exists"`
// image stays operator-default — the migrate task uses the Superset image,
// but this init container needs psql/mysql clients (postgres:17-alpine /
// mysql:8-alpine).
+//
+// The DB-tool images run as root by default, so an inherited pod-level
+// runAsNonRoot would make kubelet reject this container with
+// CreateContainerConfigError. The client tools connect over TCP and run
+// correctly as any UID, so when no UID is pinned (container- or pod-level) we
+// default to the image's built-in non-root user. That keeps createDatabase
+// compatible with runAsNonRoot pod security contexts out of the box.
func buildCreateDatabaseInitContainer(superset *supersetv1alpha1.Superset,
lifecyclePod *supersetv1alpha1.PodTemplate) *corev1.Container {
if !createDatabaseEnabled(superset) {
return nil
@@ -107,15 +114,59 @@ func buildCreateDatabaseInitContainer(superset
*supersetv1alpha1.Superset, lifec
Command: []string{"/bin/sh", "-c", script},
Env: createDatabaseEnvVars(superset.Spec.Metastore),
}
- if lifecyclePod != nil && lifecyclePod.Container != nil {
- if lifecyclePod.Container.Resources != nil {
- ctr.Resources = *lifecyclePod.Container.Resources
+ var containerSC *corev1.SecurityContext
+ var podSC *corev1.PodSecurityContext
+ if lifecyclePod != nil {
+ if lifecyclePod.Container != nil {
+ if lifecyclePod.Container.Resources != nil {
+ ctr.Resources =
*lifecyclePod.Container.Resources
+ }
+ containerSC = lifecyclePod.Container.SecurityContext
}
- ctr.SecurityContext = lifecyclePod.Container.SecurityContext
+ podSC = lifecyclePod.PodSecurityContext
}
+ ctr.SecurityContext = helperNonRootSecurityContext(containerSC, podSC,
helperNonRootUID(dbType))
return ctr
}
+// helperNonRootUID returns a non-root UID present in the DB-tool image, used
as
+// the default runAsUser for the create-database init container so it satisfies
+// a pod-level runAsNonRoot policy. Using the image's built-in service-account
+// UID (postgres=70 on alpine, mysql=999) guarantees a matching /etc/passwd
+// entry; the client tools themselves work as any UID.
+func helperNonRootUID(dbType string) int64 {
+ if dbType == dbTypeMySQL {
+ return 999
+ }
+ return 70
+}
+
+// helperNonRootSecurityContext returns the SecurityContext for an
+// operator-managed helper container. It preserves any user-provided container
+// securityContext and, when neither the container nor the pod pins a UID,
+// defaults runAsUser to a non-root value so the container can start under a
+// runAsNonRoot pod security context. An explicit UID at either level is
+// respected.
+func helperNonRootSecurityContext(containerSC *corev1.SecurityContext, podSC
*corev1.PodSecurityContext, defaultUID int64) *corev1.SecurityContext {
+ sc := containerSC.DeepCopy()
+ if sc == nil {
+ sc = &corev1.SecurityContext{}
+ }
+ podPinsUser := podSC != nil && podSC.RunAsUser != nil
+ if sc.RunAsUser == nil && !podPinsUser {
+ uid := defaultUID
+ sc.RunAsUser = &uid
+ // Only assert runAsNonRoot when neither level already speaks
to it, so
+ // an explicit user choice (including a deliberate
runAsNonRoot: false)
+ // is never overridden.
+ if sc.RunAsNonRoot == nil && (podSC == nil ||
podSC.RunAsNonRoot == nil) {
+ nonRoot := true
+ sc.RunAsNonRoot = &nonRoot
+ }
+ }
+ return sc
+}
+
// createDatabaseEnabled reports whether spec.metastore.createDatabase is true
// AND the structured fields the init container relies on are present. CEL
// already enforces this, but checking here keeps the controller defensive
diff --git a/internal/controller/lifecycle_create_db_test.go
b/internal/controller/lifecycle_create_db_test.go
index 0919851..141783d 100644
--- a/internal/controller/lifecycle_create_db_test.go
+++ b/internal/controller/lifecycle_create_db_test.go
@@ -302,6 +302,10 @@ func
TestBuildCreateDatabaseInitContainer_InheritsFromMigrateContainerTemplate(t
RunAsNonRoot: common.Ptr(true),
AllowPrivilegeEscalation: common.Ptr(false),
ReadOnlyRootFilesystem: common.Ptr(true),
+ // The user set runAsNonRoot but no UID; the operator defaults
the helper
+ // to the image's non-root UID so kubelet does not reject the
root-default
+ // postgres image with CreateContainerConfigError.
+ RunAsUser: common.Ptr(int64(70)),
}
migratePod := &supersetv1alpha1.PodTemplate{
Container: &supersetv1alpha1.ContainerTemplate{
@@ -622,3 +626,87 @@ func podHasContainer(containers []corev1.Container, name
string) bool {
}
return false
}
+
+func TestHelperNonRootSecurityContext(t *testing.T) {
+ // The DB-tool images (postgres/mysql) run as root by default, so an
+ // inherited pod-level runAsNonRoot would make kubelet reject the
+ // create-database init container with CreateContainerConfigError. These
+ // cases pin that the operator defaults a non-root UID when (and only
when)
+ // neither the container nor the pod already pins one.
+ cases := map[string]struct {
+ containerSC *corev1.SecurityContext
+ podSC *corev1.PodSecurityContext
+ dbType string
+ wantUID *int64
+ wantNonRoot *bool
+ }{
+ "nothing pinned -> default postgres uid": {
+ dbType: dbTypePostgresql,
+ wantUID: common.Ptr(int64(70)),
+ wantNonRoot: common.Ptr(true),
+ },
+ "nothing pinned -> default mysql uid": {
+ dbType: dbTypeMySQL,
+ wantUID: common.Ptr(int64(999)),
+ wantNonRoot: common.Ptr(true),
+ },
+ "pod runAsNonRoot but no uid -> default uid, do not override
nonRoot": {
+ podSC: &corev1.PodSecurityContext{RunAsNonRoot:
common.Ptr(true)},
+ dbType: dbTypePostgresql,
+ wantUID: common.Ptr(int64(70)),
+ wantNonRoot: nil, // pod-level runAsNonRoot already
applies; container stays unset
+ },
+ "explicit container uid respected": {
+ containerSC: &corev1.SecurityContext{RunAsUser:
common.Ptr(int64(1234))},
+ dbType: dbTypePostgresql,
+ wantUID: common.Ptr(int64(1234)),
+ wantNonRoot: nil,
+ },
+ "pod pins uid -> container not defaulted": {
+ podSC: &corev1.PodSecurityContext{RunAsUser:
common.Ptr(int64(2000))},
+ dbType: dbTypePostgresql,
+ wantUID: nil,
+ wantNonRoot: nil,
+ },
+ "explicit runAsNonRoot false honored (no uid forced)": {
+ containerSC: &corev1.SecurityContext{RunAsNonRoot:
common.Ptr(false)},
+ dbType: dbTypePostgresql,
+ // no uid pinned anywhere, so the helper still defaults
a uid, but
+ // must not flip the user's explicit runAsNonRoot:false.
+ wantUID: common.Ptr(int64(70)),
+ wantNonRoot: common.Ptr(false),
+ },
+ }
+ for name, tc := range cases {
+ t.Run(name, func(t *testing.T) {
+ got := helperNonRootSecurityContext(tc.containerSC,
tc.podSC, helperNonRootUID(tc.dbType))
+ if !int64PtrEqual(got.RunAsUser, tc.wantUID) {
+ t.Errorf("RunAsUser = %v, want %v",
derefInt64(got.RunAsUser), derefInt64(tc.wantUID))
+ }
+ if !boolPtrEqual(got.RunAsNonRoot, tc.wantNonRoot) {
+ t.Errorf("RunAsNonRoot = %v, want %v",
got.RunAsNonRoot, tc.wantNonRoot)
+ }
+ })
+ }
+}
+
+func int64PtrEqual(a, b *int64) bool {
+ if a == nil || b == nil {
+ return a == b
+ }
+ return *a == *b
+}
+
+func boolPtrEqual(a, b *bool) bool {
+ if a == nil || b == nil {
+ return a == b
+ }
+ return *a == *b
+}
+
+func derefInt64(p *int64) any {
+ if p == nil {
+ return nil
+ }
+ return *p
+}
diff --git a/internal/controller/lifecycle_job.go
b/internal/controller/lifecycle_job.go
index 9325f3d..c2cd8c7 100644
--- a/internal/controller/lifecycle_job.go
+++ b/internal/controller/lifecycle_job.go
@@ -182,6 +182,10 @@ func (r *SupersetReconciler) reconcileLifecycleTaskJob(
return lifecycleCheckpoint(), nil
}
+ if result, handled, err := r.handleStuckTaskPod(ctx, superset,
existingJob, taskType, taskName, flatSpec, taskRef); handled || err != nil {
+ return result, err
+ }
+
taskRef.State = taskStateRunning
if taskRef.StartedAt == nil {
if existingJob.Status.StartTime != nil {
@@ -225,6 +229,105 @@ func (r *SupersetReconciler) reconcileLifecycleTaskJob(
return lifecycleWait(), nil
}
+// reasonTaskCannotStart is the condition reason and event reason used when a
+// task Pod is wedged (un-startable) and the controller cannot self-heal it.
+const reasonTaskCannotStart = "TaskCannotStart"
+
+// handleStuckTaskPod detects a wedged task Pod (one that cannot start its
+// containers without intervention) and either self-heals or surfaces it.
+//
+// Self-heal: a Job's pod template is immutable, so a spec fix can only take
+// effect by replacing the Job. When the desired pod-spec hash differs from the
+// hash stamped on the existing Job, the spec that would fix the wedge has
+// changed, so the controller deletes the Job and lets the next reconcile
+// recreate it from the current spec — without anyone having to delete it by
+// hand. When the hash matches (spec unchanged), it instead records a clear,
+// actionable status/condition/event and waits, rather than silently looping
+// until the Job's activeDeadlineSeconds.
+//
+// Returns handled=true when it took ownership of this reconcile step.
+func (r *SupersetReconciler) handleStuckTaskPod(
+ ctx context.Context,
+ superset *supersetv1alpha1.Superset,
+ existingJob *batchv1.Job,
+ taskType, taskName string,
+ flatSpec *supersetv1alpha1.FlatComponentSpec,
+ taskRef *supersetv1alpha1.TaskRefStatus,
+) (lifecycleResult, bool, error) {
+ log := logf.FromContext(ctx)
+
+ msg, stuck, err := r.taskPodStartupError(ctx, superset, taskName)
+ if err != nil {
+ return lifecycleResult{}, false, err
+ }
+ if !stuck {
+ return lifecycleResult{}, false, nil
+ }
+
+ desiredHash := podSpecHash(buildInitPod(flatSpec))
+ if existingJob.Annotations[naming.AnnotationTaskPodSpecHash] !=
desiredHash {
+ log.Info("Task pod cannot start and pod spec changed;
recreating task job",
+ "task", taskType, "reason", msg)
+ if err := r.Delete(ctx, existingJob);
client.IgnoreNotFound(err) != nil {
+ return lifecycleResult{}, false, fmt.Errorf("deleting
wedged task job %s: %w", existingJob.Name, err)
+ }
+ taskRef.State = taskStatePending
+ taskRef.Message = "Recreating task after spec change: " + msg
+ r.Recorder.Eventf(superset, nil, corev1.EventTypeNormal,
"TaskRecovering", "Lifecycle",
+ "%s task pod could not start (%s); recreating with
updated spec", taskType, msg)
+ return lifecycleWait(), true, nil
+ }
+
+ // Spec unchanged: cannot self-heal automatically. Surface the blocker
so it
+ // is visible on the Superset status and as an event, and keep waiting.
+ taskRef.Message = msg
+ if !hasLifecycleConditionReason(superset, reasonTaskCannotStart) {
+ r.Recorder.Eventf(superset, nil, corev1.EventTypeWarning,
reasonTaskCannotStart, "Lifecycle",
+ "%s task pod cannot start: %s", taskType, msg)
+ }
+ setCondition(&taskRef.Conditions,
supersetv1alpha1.ConditionTypeTaskComplete,
+ metav1.ConditionFalse, reasonTaskCannotStart, msg,
superset.Generation)
+ setCondition(&superset.Status.Conditions,
supersetv1alpha1.ConditionTypeLifecycleComplete,
+ metav1.ConditionFalse, reasonTaskCannotStart,
+ fmt.Sprintf("%s task pod cannot start: %s", taskType, msg),
superset.Generation)
+ return lifecycleWait(), true, nil
+}
+
+// taskPodStartupError lists the Pods of a lifecycle task Job and returns the
+// first un-startable one's reason, if any.
+func (r *SupersetReconciler) taskPodStartupError(ctx context.Context, superset
*supersetv1alpha1.Superset, taskName string) (string, bool, error) {
+ pods := &corev1.PodList{}
+ if err := r.List(ctx, pods,
+ client.InNamespace(superset.Namespace),
+ client.MatchingLabels{labelInitInstance: taskName},
+ ); err != nil {
+ return "", false, fmt.Errorf("listing task pods for %s: %w",
taskName, err)
+ }
+ for i := range pods.Items {
+ if msg, stuck := podStartupError(&pods.Items[i]); stuck {
+ return msg, true, nil
+ }
+ }
+ return "", false, nil
+}
+
+// taskPodSpecChanged reports whether the hash of the desired pod spec for a
+// task differs from the hash stamped on its existing Job
+// (AnnotationTaskPodSpecHash). It lets the controller retry a terminally failed
+// task once the user changes something about how the pod runs (securityContext,
+// resources, etc.) — that change may be exactly what fixes the failure. A Job
+// that carries no such annotation compares as changed. Returns false when no
+// Job exists, so the normal create path applies.
+func (r *SupersetReconciler) taskPodSpecChanged(ctx context.Context, superset
*supersetv1alpha1.Superset, taskName string, flatSpec
*supersetv1alpha1.FlatComponentSpec) (bool, error) {
+ job, err := r.getLifecycleTaskJob(ctx, superset, taskName)
+ if err != nil {
+ return false, err
+ }
+ if job == nil {
+ return false, nil
+ }
+ return job.Annotations[naming.AnnotationTaskPodSpecHash] !=
podSpecHash(buildInitPod(flatSpec)), nil
+}
+
func (r *SupersetReconciler) getLifecycleTaskJob(ctx context.Context, superset
*supersetv1alpha1.Superset, taskName string) (*batchv1.Job, error) {
job := &batchv1.Job{}
if err := r.Get(ctx, types.NamespacedName{Name: taskName, Namespace:
superset.Namespace}, job); err != nil {
@@ -246,9 +349,15 @@ func (r *SupersetReconciler) buildLifecycleTaskJob(
podSpec := buildInitPod(flatSpec)
pt := safePodTemplatePtr(flatSpec.PodTemplate)
labels := mergeLabels(pt.Labels, r.lifecycleTaskLabels(superset,
taskName, taskType))
- annotations := mergeAnnotations(pt.Annotations, map[string]string{
+ // Pod-template annotations carry the semantic task checksum (drives
rolling
+ // restarts). The pod-spec hash is stamped on the Job only — never on
the pod
+ // template — so it cannot feed back into its own input.
+ podAnnotations := mergeAnnotations(pt.Annotations, map[string]string{
naming.AnnotationConfigChecksum: taskChecksum,
})
+ jobAnnotations := mergeAnnotations(podAnnotations, map[string]string{
+ naming.AnnotationTaskPodSpecHash: podSpecHash(podSpec),
+ })
var activeDeadlineSeconds *int64
if timeout > 0 {
seconds := int64(timeout.Seconds())
@@ -263,7 +372,7 @@ func (r *SupersetReconciler) buildLifecycleTaskJob(
Name: taskName,
Namespace: superset.Namespace,
Labels: labels,
- Annotations: annotations,
+ Annotations: jobAnnotations,
},
Spec: batchv1.JobSpec{
BackoffLimit: ptrInt32(jobBackoffLimit),
@@ -273,7 +382,7 @@ func (r *SupersetReconciler) buildLifecycleTaskJob(
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
- Annotations: annotations,
+ Annotations: podAnnotations,
},
Spec: podSpec,
},
@@ -285,6 +394,13 @@ func ptrInt32(v int32) *int32 {
return &v
}
+// podSpecHash returns a stable hash of a task Job's rendered pod spec. It is
+// stamped on the Job (AnnotationTaskPodSpecHash) so the controller can tell
+// whether the desired pod spec has changed since a wedged Job was created.
+func podSpecHash(podSpec corev1.PodSpec) string {
+ return computeChecksum(podSpec)
+}
+
func (r *SupersetReconciler) taskJobMatchesChecksum(job *batchv1.Job,
taskChecksum string) bool {
if job.Annotations == nil ||
job.Annotations[naming.AnnotationConfigChecksum] == "" {
return !jobComplete(job) && !jobFailed(job)
diff --git a/internal/controller/lifecycle_pod_health.go
b/internal/controller/lifecycle_pod_health.go
new file mode 100644
index 0000000..e06aca6
--- /dev/null
+++ b/internal/controller/lifecycle_pod_health.go
@@ -0,0 +1,89 @@
+/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+ "fmt"
+ "strings"
+
+ corev1 "k8s.io/api/core/v1"
+)
+
+// terminalStartupWaitingReasons are container "waiting" reasons that mean the
+// container will never start on its own without a change to the Pod or its
+// dependencies (a bad image reference, an invalid container config such as a
+// runAsNonRoot violation, or an unpullable image). They are distinct from the
+// benign transient reasons (ContainerCreating, PodInitializing) a Pod passes
+// through during normal startup.
+var terminalStartupWaitingReasons = map[string]bool{
+ "CreateContainerConfigError": true,
+ "CreateContainerError": true,
+ "RunContainerError": true,
+ "InvalidImageName": true,
+ "ErrImagePull": true,
+ "ImagePullBackOff": true,
+}
+
+// podStartupError reports whether a Pod is wedged — unable to start its
+// containers without external intervention — and returns a human-readable
+// reason. A Job-backed task Pod in this state never sets a JobFailed condition
+// (the container never runs), so the controller would otherwise wait on it
+// until the Job's activeDeadlineSeconds with no actionable signal. Detecting
it
+// lets the controller surface the blocker and self-heal once the spec changes.
+//
+// It is intentionally conservative: only container config/image errors and
+// definitive scheduling failures count. Transient states (image pulling,
+// ContainerCreating) return false so normal startup is never misreported.
+func podStartupError(pod *corev1.Pod) (string, bool) {
+ if pod.DeletionTimestamp != nil {
+ return "", false
+ }
+
+ statuses := make([]corev1.ContainerStatus, 0,
+
len(pod.Status.InitContainerStatuses)+len(pod.Status.ContainerStatuses))
+ statuses = append(statuses, pod.Status.InitContainerStatuses...)
+ statuses = append(statuses, pod.Status.ContainerStatuses...)
+ for i := range statuses {
+ w := statuses[i].State.Waiting
+ if w != nil && terminalStartupWaitingReasons[w.Reason] {
+ return formatContainerStartupError(statuses[i].Name,
w.Reason, w.Message), true
+ }
+ }
+
+ if pod.Status.Phase == corev1.PodPending {
+ for i := range pod.Status.Conditions {
+ c := pod.Status.Conditions[i]
+ if c.Type == corev1.PodScheduled &&
+ c.Status == corev1.ConditionFalse &&
+ c.Reason == corev1.PodReasonUnschedulable {
+ return strings.TrimSpace(fmt.Sprintf("pod
unschedulable: %s", c.Message)), true
+ }
+ }
+ }
+
+ return "", false
+}
+
+func formatContainerStartupError(name, reason, message string) string {
+ msg := fmt.Sprintf("container %q: %s", name, reason)
+ if trimmed := strings.TrimSpace(message); trimmed != "" {
+ msg += ": " + trimmed
+ }
+ return msg
+}
diff --git a/internal/controller/lifecycle_pod_health_test.go
b/internal/controller/lifecycle_pod_health_test.go
new file mode 100644
index 0000000..b714c7e
--- /dev/null
+++ b/internal/controller/lifecycle_pod_health_test.go
@@ -0,0 +1,314 @@
+/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+ "context"
+ "strings"
+ "testing"
+
+ batchv1 "k8s.io/api/batch/v1"
+ corev1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/types"
+ "k8s.io/client-go/tools/events"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+ supersetv1alpha1
"github.com/apache/superset-kubernetes-operator/api/v1alpha1"
+ "github.com/apache/superset-kubernetes-operator/internal/common"
+)
+
+func TestPodStartupError(t *testing.T) {
+ waiting := func(reason string) corev1.ContainerStatus {
+ return corev1.ContainerStatus{Name: "c", State:
corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: reason,
Message: "boom"}}}
+ }
+ cases := map[string]struct {
+ pod corev1.Pod
+ wantErr bool
+ contains string
+ }{
+ "create container config error (init)": {
+ pod: corev1.Pod{Status: corev1.PodStatus{
+ Phase: corev1.PodPending,
+ InitContainerStatuses:
[]corev1.ContainerStatus{waiting("CreateContainerConfigError")},
+ }},
+ wantErr: true,
+ contains: "CreateContainerConfigError",
+ },
+ "image pull backoff (main)": {
+ pod: corev1.Pod{Status: corev1.PodStatus{
+ Phase: corev1.PodPending,
+ ContainerStatuses:
[]corev1.ContainerStatus{waiting("ImagePullBackOff")},
+ }},
+ wantErr: true,
+ contains: "ImagePullBackOff",
+ },
+ "unschedulable": {
+ pod: corev1.Pod{Status: corev1.PodStatus{
+ Phase: corev1.PodPending,
+ Conditions: []corev1.PodCondition{{
+ Type: corev1.PodScheduled,
+ Status: corev1.ConditionFalse,
+ Reason: corev1.PodReasonUnschedulable,
+ Message: "0/3 nodes available",
+ }},
+ }},
+ wantErr: true,
+ contains: "unschedulable",
+ },
+ "container creating is transient": {
+ pod: corev1.Pod{Status: corev1.PodStatus{
+ Phase: corev1.PodPending,
+ InitContainerStatuses:
[]corev1.ContainerStatus{waiting("ContainerCreating")},
+ }},
+ wantErr: false,
+ },
+ "pod initializing is transient": {
+ pod: corev1.Pod{Status: corev1.PodStatus{
+ Phase: corev1.PodPending,
+ ContainerStatuses:
[]corev1.ContainerStatus{waiting("PodInitializing")},
+ }},
+ wantErr: false,
+ },
+ "running pod is fine": {
+ pod: corev1.Pod{Status: corev1.PodStatus{
+ Phase: corev1.PodRunning,
+ ContainerStatuses:
[]corev1.ContainerStatus{{Name: "c", State: corev1.ContainerState{Running:
&corev1.ContainerStateRunning{}}}},
+ }},
+ wantErr: false,
+ },
+ "terminating pod ignored": {
+ pod: corev1.Pod{
+ ObjectMeta:
metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{Time: metav1.Now().Time}},
+ Status: corev1.PodStatus{
+ Phase:
corev1.PodPending,
+ InitContainerStatuses:
[]corev1.ContainerStatus{waiting("CreateContainerConfigError")},
+ },
+ },
+ wantErr: false,
+ },
+ }
+ for name, tc := range cases {
+ t.Run(name, func(t *testing.T) {
+ msg, got := podStartupError(&tc.pod)
+ if got != tc.wantErr {
+ t.Fatalf("podStartupError stuck=%v, want %v
(msg=%q)", got, tc.wantErr, msg)
+ }
+ if tc.wantErr && tc.contains != "" &&
!strings.Contains(msg, tc.contains) {
+ t.Errorf("message %q does not contain %q", msg,
tc.contains)
+ }
+ })
+ }
+}
+
+func TestPodSpecHash_SensitiveToSecurityContext(t *testing.T) {
+ base := &supersetv1alpha1.FlatComponentSpec{
+ Image: supersetv1alpha1.ImageSpec{Repository:
"apache/superset", Tag: "latest"},
+ }
+ hardened := &supersetv1alpha1.FlatComponentSpec{
+ Image: supersetv1alpha1.ImageSpec{Repository:
"apache/superset", Tag: "latest"},
+ PodTemplate: &supersetv1alpha1.PodTemplate{
+ PodSecurityContext:
&corev1.PodSecurityContext{RunAsNonRoot: common.Ptr(true)},
+ },
+ }
+ if podSpecHash(buildInitPod(base)) ==
podSpecHash(buildInitPod(hardened)) {
+ t.Error("expected pod-spec hash to change when the pod security
context changes")
+ }
+ // Stable for identical input.
+ first := podSpecHash(buildInitPod(base))
+ second := podSpecHash(buildInitPod(base))
+ if first != second {
+ t.Error("expected pod-spec hash to be stable for identical
input")
+ }
+}
+
+func TestHandleStuckTaskPod_SelfHealsWhenSpecChanged(t *testing.T) {
+ // A wedged task Pod plus a Job whose stamped pod-spec hash no longer
matches
+ // the desired spec means the user fixed the spec. The operator must
delete
+ // the Job (so the next reconcile recreates it) instead of waiting
forever —
+ // the user should never have to delete the Job by hand.
+ ctx := context.Background()
+ scheme := testScheme(t)
+ superset := &supersetv1alpha1.Superset{
+ ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace:
"default", UID: "uid-1"},
+ }
+ job := &batchv1.Job{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-migrate",
+ Namespace: "default",
+ Annotations:
map[string]string{common.AnnotationTaskPodSpecHash: "stale-hash"},
+ },
+ }
+ pod := wedgedTaskPod("test-migrate", "CreateContainerConfigError")
+
+ c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(superset,
job, pod).Build()
+ r := &SupersetReconciler{Client: c, Scheme: scheme, Recorder:
events.NewFakeRecorder(10)}
+
+ flatSpec := &supersetv1alpha1.FlatComponentSpec{Image:
supersetv1alpha1.ImageSpec{Repository: "apache/superset", Tag: "latest"}}
+ taskRef := &supersetv1alpha1.TaskRefStatus{State: taskStateRunning,
MaxRetries: 3}
+
+ res, handled, err := r.handleStuckTaskPod(ctx, superset, job,
taskTypeMigrate, "test-migrate", flatSpec, taskRef)
+ if err != nil {
+ t.Fatalf("handleStuckTaskPod: %v", err)
+ }
+ if !handled {
+ t.Fatal("expected handled=true for a wedged pod")
+ }
+ if res.RequeueAfter <= 0 {
+ t.Errorf("expected a requeue, got %#v", res)
+ }
+ if taskRef.State != taskStatePending {
+ t.Errorf("expected taskRef reset to Pending for recreation, got
%q", taskRef.State)
+ }
+ if err := c.Get(ctx, types.NamespacedName{Name: "test-migrate",
Namespace: "default"}, &batchv1.Job{}); !apierrors.IsNotFound(err) {
+ t.Errorf("expected wedged Job to be deleted for recreation, got
%v", err)
+ }
+}
+
+func TestHandleStuckTaskPod_SurfacesWhenSpecUnchanged(t *testing.T) {
+	// The Job is stamped with the hash of the pod spec the operator would build
+	// right now, i.e. nothing changed that a recreation could fix. The wedge
+	// must therefore be reported (condition + task message) while the Job is
+	// left in place, rather than being deleted and recreated in a loop.
+	const (
+		jobName   = "test-migrate"
+		namespace = "default"
+	)
+	ctx := context.Background()
+	scheme := testScheme(t)
+
+	flatSpec := &supersetv1alpha1.FlatComponentSpec{
+		Image: supersetv1alpha1.ImageSpec{Repository: "apache/superset", Tag: "latest"},
+	}
+	superset := &supersetv1alpha1.Superset{
+		ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: namespace, UID: "uid-1"},
+	}
+	job := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      jobName,
+			Namespace: namespace,
+			Annotations: map[string]string{
+				common.AnnotationTaskPodSpecHash: podSpecHash(buildInitPod(flatSpec)),
+			},
+		},
+	}
+
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(superset, job, wedgedTaskPod(jobName, "CreateContainerConfigError")).
+		Build()
+	reconciler := &SupersetReconciler{Client: fakeClient, Scheme: scheme, Recorder: events.NewFakeRecorder(10)}
+	taskRef := &supersetv1alpha1.TaskRefStatus{State: taskStateRunning, MaxRetries: 3}
+
+	_, handled, err := reconciler.handleStuckTaskPod(ctx, superset, job, taskTypeMigrate, jobName, flatSpec, taskRef)
+	if err != nil {
+		t.Fatalf("handleStuckTaskPod: %v", err)
+	}
+	if !handled {
+		t.Fatal("expected handled=true for a wedged pod")
+	}
+
+	// The Job must still be retrievable: no delete/recreate churn.
+	jobKey := types.NamespacedName{Name: jobName, Namespace: namespace}
+	if getErr := fakeClient.Get(ctx, jobKey, &batchv1.Job{}); getErr != nil {
+		t.Errorf("expected Job to be kept when spec is unchanged, got %v", getErr)
+	}
+	if !conditionHasReason(superset.Status.Conditions, supersetv1alpha1.ConditionTypeLifecycleComplete, reasonTaskCannotStart) {
+		t.Error("expected parent LifecycleComplete condition with reason TaskCannotStart")
+	}
+	if taskRef.Message == "" {
+		t.Error("expected taskRef.Message to describe the startup failure")
+	}
+}
+
+func TestHandleStuckTaskPod_NotStuckPassesThrough(t *testing.T) {
+	// A task Pod in the Running phase is healthy: handleStuckTaskPod must
+	// return handled=false so the caller continues with its normal handling.
+	const taskName = "test-migrate"
+	ctx := context.Background()
+	scheme := testScheme(t)
+
+	superset := &supersetv1alpha1.Superset{
+		ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
+	}
+	job := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: taskName, Namespace: "default"},
+	}
+	runningPod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-migrate-x",
+			Namespace: "default",
+			Labels:    map[string]string{labelInitInstance: taskName},
+		},
+		Status: corev1.PodStatus{Phase: corev1.PodRunning},
+	}
+
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(superset, job, runningPod).
+		Build()
+	reconciler := &SupersetReconciler{Client: fakeClient, Scheme: scheme, Recorder: events.NewFakeRecorder(10)}
+
+	flatSpec := &supersetv1alpha1.FlatComponentSpec{
+		Image: supersetv1alpha1.ImageSpec{Repository: "apache/superset", Tag: "latest"},
+	}
+	taskRef := &supersetv1alpha1.TaskRefStatus{}
+
+	_, handled, err := reconciler.handleStuckTaskPod(ctx, superset, job, taskTypeMigrate, taskName, flatSpec, taskRef)
+	if err != nil {
+		t.Fatalf("handleStuckTaskPod: %v", err)
+	}
+	if handled {
+		t.Error("expected handled=false for a healthy pod so normal handling proceeds")
+	}
+}
+
+func TestTaskPodSpecChanged(t *testing.T) {
+	// taskPodSpecChanged backs the terminal-failed retry gate: it has to tell
+	// whether the spec differs from the one the existing Job was created with,
+	// so the controller knows when rerunning the task is worthwhile.
+	ctx := context.Background()
+	scheme := testScheme(t)
+	superset := &supersetv1alpha1.Superset{
+		ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
+	}
+	flatSpec := &supersetv1alpha1.FlatComponentSpec{
+		Image: supersetv1alpha1.ImageSpec{Repository: "apache/superset", Tag: "latest"},
+	}
+
+	// jobWithHash builds the migrate Job stamped with the given pod-spec hash.
+	jobWithHash := func(hash string) *batchv1.Job {
+		return &batchv1.Job{ObjectMeta: metav1.ObjectMeta{
+			Name:        "test-migrate",
+			Namespace:   "default",
+			Annotations: map[string]string{common.AnnotationTaskPodSpecHash: hash},
+		}}
+	}
+
+	cases := []struct {
+		name    string
+		job     *batchv1.Job // nil: no Job is registered with the fake client
+		want    bool
+		failFmt string
+	}{
+		{
+			name:    "no job -> not changed",
+			job:     nil,
+			want:    false,
+			failFmt: "expected (false,nil) when no Job exists, got (%v,%v)",
+		},
+		{
+			name:    "matching hash -> not changed",
+			job:     jobWithHash(podSpecHash(buildInitPod(flatSpec))),
+			want:    false,
+			failFmt: "expected (false,nil) for matching hash, got (%v,%v)",
+		},
+		{
+			name:    "stale hash -> changed",
+			job:     jobWithHash("stale"),
+			want:    true,
+			failFmt: "expected (true,nil) for stale hash, got (%v,%v)",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			builder := fake.NewClientBuilder().WithScheme(scheme).WithObjects(superset)
+			if tc.job != nil {
+				builder = builder.WithObjects(tc.job)
+			}
+			reconciler := &SupersetReconciler{Client: builder.Build(), Scheme: scheme, Recorder: events.NewFakeRecorder(10)}
+
+			changed, err := reconciler.taskPodSpecChanged(ctx, superset, "test-migrate", flatSpec)
+			if err != nil || changed != tc.want {
+				t.Fatalf(tc.failFmt, changed, err)
+			}
+		})
+	}
+}
+
+// wedgedTaskPod returns a Pending Pod labelled as an instance of taskName whose
+// "create-database" init container is stuck in a Waiting state with the given
+// reason. The waiting message is fixed regardless of reason.
+func wedgedTaskPod(taskName, reason string) *corev1.Pod {
+	waiting := &corev1.ContainerStateWaiting{
+		Reason:  reason,
+		Message: "container has runAsNonRoot and image will run as root",
+	}
+
+	pod := &corev1.Pod{}
+	pod.Name = taskName + "-abcde"
+	pod.Namespace = "default"
+	pod.Labels = map[string]string{labelInitInstance: taskName}
+	pod.Status.Phase = corev1.PodPending
+	pod.Status.InitContainerStatuses = []corev1.ContainerStatus{{
+		Name:  "create-database",
+		State: corev1.ContainerState{Waiting: waiting},
+	}}
+	return pod
+}
+
+// conditionHasReason reports whether conditions contains at least one entry
+// whose Type equals condType and whose Reason equals reason.
+func conditionHasReason(conditions []metav1.Condition, condType, reason string) bool {
+	for i := range conditions {
+		cond := &conditions[i]
+		if cond.Type != condType {
+			continue
+		}
+		if cond.Reason == reason {
+			return true
+		}
+	}
+	return false
+}
diff --git a/internal/controller/maintenance.go
b/internal/controller/maintenance.go
index ae49065..5ea7e44 100644
--- a/internal/controller/maintenance.go
+++ b/internal/controller/maintenance.go
@@ -44,6 +44,12 @@ const (
maintenanceDefaultTag = "alpine"
maintenanceContainerName = "maintenance-page"
+
+ // maintenanceNonRootUID is the default UID for the maintenance nginx
+ // container. The rendered nginx.conf routes the pid file and temp
paths to
+ // /tmp, so nginx runs correctly as any non-root UID regardless of the
+ // image's built-in nginx user.
+ maintenanceNonRootUID int64 = 101
)
var maintenanceDeployConfig = DeploymentConfig{
@@ -317,6 +323,7 @@ func (r *SupersetReconciler) reconcileMaintenanceConfigMap(
return err
}
cm.Data = map[string]string{
+ "nginx.conf":
renderMaintenanceNginxMainConf(),
"default.conf": renderNginxConf(port),
"index.html": renderMaintenanceHTML(spec),
}
@@ -368,6 +375,7 @@ func buildMaintenanceFlatSpec(parentName string, spec
*supersetv1alpha1.Maintena
},
}
volumeMounts := []corev1.VolumeMount{
+ {Name: "maintenance-config", MountPath:
"/etc/nginx/nginx.conf", SubPath: "nginx.conf"},
{Name: "maintenance-config", MountPath:
"/etc/nginx/conf.d/default.conf", SubPath: "default.conf"},
{Name: "maintenance-config", MountPath:
"/usr/share/nginx/html/index.html", SubPath: "index.html"},
}
@@ -380,6 +388,14 @@ func buildMaintenanceFlatSpec(parentName string, spec
*supersetv1alpha1.Maintena
flat.PodTemplate.Container =
&supersetv1alpha1.ContainerTemplate{}
}
flat.PodTemplate.Container.VolumeMounts =
append(flat.PodTemplate.Container.VolumeMounts, volumeMounts...)
+
+ // Default the maintenance container to non-root. Stock nginx
runs as root
+ // by default, so it would be rejected under a runAsNonRoot pod
security
+ // context (e.g. a hardened maintenancePage.podTemplate). The
rendered
+ // nginx.conf redirects the pid file and temp paths to /tmp, so
any
+ // non-root UID works. An explicit user securityContext is
respected.
+ flat.PodTemplate.Container.SecurityContext =
maintenanceSecurityContext(
+ flat.PodTemplate.Container.SecurityContext,
flat.PodTemplate.PodSecurityContext)
}
// Inject env vars.
@@ -421,6 +437,46 @@ func computeMaintenanceChecksum(spec
*supersetv1alpha1.MaintenancePageSpec) stri
return fmt.Sprintf("%x", h.Sum(nil))[:16]
}
+// maintenanceSecurityContext builds the security context for the maintenance
+// container. The non-root defaulting (with maintenanceNonRootUID as the
+// fallback UID) is delegated to helperNonRootSecurityContext; on top of that
+// result this function disables privilege escalation and drops all
+// capabilities, but only where those fields are still unset, so explicit
+// values on the returned context are left untouched.
+func maintenanceSecurityContext(containerSC *corev1.SecurityContext, podSC *corev1.PodSecurityContext) *corev1.SecurityContext {
+	hardened := helperNonRootSecurityContext(containerSC, podSC, maintenanceNonRootUID)
+
+	if hardened.Capabilities == nil {
+		hardened.Capabilities = &corev1.Capabilities{Drop: []corev1.Capability{"ALL"}}
+	}
+	if hardened.AllowPrivilegeEscalation == nil {
+		// new(bool) yields a pointer to the zero value, i.e. false.
+		hardened.AllowPrivilegeEscalation = new(bool)
+	}
+	return hardened
+}
+
+// renderMaintenanceNginxMainConf returns the static top-level nginx.conf used
+// by the managed maintenance page. The configuration places the pid file and
+// every *_temp_path under /tmp, sends the error log to /dev/stderr and the
+// access log to /dev/stdout, and includes /etc/nginx/conf.d/*.conf, which is
+// where the per-server configuration (renderNginxConf) is mounted.
+func renderMaintenanceNginxMainConf() string {
+	const mainConf = `worker_processes 1;
+pid /tmp/nginx.pid;
+error_log /dev/stderr warn;
+events {
+ worker_connections 1024;
+}
+http {
+ access_log /dev/stdout;
+ client_body_temp_path /tmp/client_temp;
+ proxy_temp_path /tmp/proxy_temp;
+ fastcgi_temp_path /tmp/fastcgi_temp;
+ uwsgi_temp_path /tmp/uwsgi_temp;
+ scgi_temp_path /tmp/scgi_temp;
+ include /etc/nginx/mime.types;
+ default_type application/octet-stream;
+ include /etc/nginx/conf.d/*.conf;
+}`
+	return mainConf
+}
+
func renderNginxConf(port int32) string {
return `server {
listen ` + fmt.Sprintf("%d", port) + `;
diff --git a/internal/controller/maintenance_test.go
b/internal/controller/maintenance_test.go
index 08001c1..e66df43 100644
--- a/internal/controller/maintenance_test.go
+++ b/internal/controller/maintenance_test.go
@@ -96,6 +96,80 @@ func TestRenderNginxConf_UsesDefaultPort(t *testing.T) {
}
}
+func TestRenderMaintenanceNginxMainConf_RunsNonRoot(t *testing.T) {
+	// nginx can only start as a non-root user if the pid file and temp paths
+	// live outside root-owned directories; otherwise a runAsNonRoot
+	// maintenance pod fails to write /run/nginx.pid. Pin the directives that
+	// make that work, plus the conf.d include the server block relies on.
+	required := [...]string{
+		"pid /tmp/nginx.pid;",
+		"client_body_temp_path /tmp/client_temp;",
+		"proxy_temp_path /tmp/proxy_temp;",
+		"include /etc/nginx/conf.d/*.conf;",
+	}
+	rendered := renderMaintenanceNginxMainConf()
+	for i := range required {
+		if strings.Contains(rendered, required[i]) {
+			continue
+		}
+		t.Errorf("nginx.conf missing %q\n--- conf ---\n%s", required[i], rendered)
+	}
+}
+
+func TestBuildMaintenanceFlatSpec_DefaultsNonRoot(t *testing.T) {
+	// Managed mode must default the container to a non-root, hardened security
+	// context and mount the custom nginx.conf, so the maintenance page
+	// satisfies restricted Pod Security Standards out of the box.
+	title := "down"
+	flat := buildMaintenanceFlatSpec("parent", &supersetv1alpha1.MaintenancePageSpec{Title: &title})
+
+	// Guard the pointer chain so a regression is reported as a test failure
+	// rather than a nil-dereference panic.
+	if flat.PodTemplate == nil || flat.PodTemplate.Container == nil {
+		t.Fatal("expected a pod template with a container")
+	}
+	sc := flat.PodTemplate.Container.SecurityContext
+	if sc == nil {
+		t.Fatal("expected a default container securityContext")
+	}
+
+	// Pointer fields are dereferenced before formatting: printing a *int64 or
+	// *bool with %v would show a memory address instead of the actual value.
+	switch {
+	case sc.RunAsUser == nil:
+		t.Errorf("RunAsUser = <nil>, want %d", maintenanceNonRootUID)
+	case *sc.RunAsUser != maintenanceNonRootUID:
+		t.Errorf("RunAsUser = %d, want %d", *sc.RunAsUser, maintenanceNonRootUID)
+	}
+	switch {
+	case sc.RunAsNonRoot == nil:
+		t.Error("RunAsNonRoot = <nil>, want true")
+	case !*sc.RunAsNonRoot:
+		t.Error("RunAsNonRoot = false, want true")
+	}
+	switch {
+	case sc.AllowPrivilegeEscalation == nil:
+		t.Error("AllowPrivilegeEscalation = <nil>, want false")
+	case *sc.AllowPrivilegeEscalation:
+		t.Error("AllowPrivilegeEscalation = true, want false")
+	}
+	if sc.Capabilities == nil || len(sc.Capabilities.Drop) != 1 || sc.Capabilities.Drop[0] != "ALL" {
+		t.Errorf("Capabilities.Drop = %+v, want [ALL]", sc.Capabilities)
+	}
+
+	if !maintenanceMountsConf(flat, "/etc/nginx/nginx.conf") {
+		t.Error("expected nginx.conf to be mounted at /etc/nginx/nginx.conf")
+	}
+}
+
+func TestBuildMaintenanceFlatSpec_RespectsUserRunAsUser(t *testing.T) {
+	// An explicit user UID must win over the operator default.
+	title := "down"
+	flat := buildMaintenanceFlatSpec("parent", &supersetv1alpha1.MaintenancePageSpec{
+		Title: &title,
+		PodTemplate: &supersetv1alpha1.PodTemplate{
+			Container: &supersetv1alpha1.ContainerTemplate{
+				SecurityContext: &corev1.SecurityContext{RunAsUser: common.Ptr(int64(2020))},
+			},
+		},
+	})
+
+	// Guard the pointer chain so a dropped security context fails the test
+	// instead of panicking on a nil dereference.
+	if flat.PodTemplate == nil || flat.PodTemplate.Container == nil || flat.PodTemplate.Container.SecurityContext == nil {
+		t.Fatal("expected the user-provided container securityContext to be preserved")
+	}
+
+	// Dereference before formatting: %v on a *int64 prints an address.
+	got := flat.PodTemplate.Container.SecurityContext.RunAsUser
+	switch {
+	case got == nil:
+		t.Error("expected user RunAsUser=2020 to be respected, got <nil>")
+	case *got != 2020:
+		t.Errorf("expected user RunAsUser=2020 to be respected, got %d", *got)
+	}
+}
+
+// maintenanceMountsConf reports whether the container template in flat has a
+// volume mount whose MountPath equals path. It returns false when the pod
+// template or its container is nil.
+func maintenanceMountsConf(flat supersetv1alpha1.FlatComponentSpec, path string) bool {
+	tmpl := flat.PodTemplate
+	if tmpl == nil {
+		return false
+	}
+	container := tmpl.Container
+	if container == nil {
+		return false
+	}
+	mounts := container.VolumeMounts
+	for i := range mounts {
+		if mounts[i].MountPath == path {
+			return true
+		}
+	}
+	return false
+}
+
func TestResolveWebServerPort_Default(t *testing.T) {
s := &supersetv1alpha1.Superset{
Spec: supersetv1alpha1.SupersetSpec{
diff --git a/internal/controller/networking.go
b/internal/controller/networking.go
index 950d0b7..c085c15 100644
--- a/internal/controller/networking.go
+++ b/internal/controller/networking.go
@@ -21,6 +21,7 @@ package controller
import (
"context"
"fmt"
+ "strings"
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
@@ -174,10 +175,6 @@ func (r *SupersetReconciler) reconcileHTTPRoute(ctx
context.Context, superset *s
},
}
- _, webServerPort := webServerServiceRef(superset)
- webServerSvcName :=
gatewayv1.ObjectName(webServerDescriptor.resourceBaseName(&superset.Spec,
superset.Name))
- gwPort := webServerPort
-
_, err := controllerutil.CreateOrUpdate(ctx, r.Client, route, func()
error {
if err := controllerutil.SetControllerReference(superset,
route, r.Scheme); err != nil {
return err
@@ -186,35 +183,10 @@ func (r *SupersetReconciler) reconcileHTTPRoute(ctx
context.Context, superset *s
route.Labels = mergeLabels(gw.Labels,
parentLabels(superset.Name))
route.Annotations = mergeAnnotations(nil, gw.Annotations)
+ // Rules are ordered most-specific first (web "/" last) by
componentRoutes.
var rules []gatewayv1.HTTPRouteRule
-
- // Add websocket route FIRST (more specific) if websocket
server is enabled.
- if superset.Spec.WebsocketServer != nil {
- svcName :=
gatewayv1.ObjectName(websocketServerDescriptor.resourceBaseName(&superset.Spec,
superset.Name))
- port :=
resolveServicePort(superset.Spec.WebsocketServer.Service, common.PortWebsocket)
- path :=
resolveGatewayPath(superset.Spec.WebsocketServer.Service, "/ws")
- rules = append(rules, buildHTTPRouteRule(svcName, port,
path))
- }
-
- // Add MCP route if MCP server is enabled.
- if superset.Spec.McpServer != nil {
- svcName :=
gatewayv1.ObjectName(mcpServerDescriptor.resourceBaseName(&superset.Spec,
superset.Name))
- port :=
resolveServicePort(superset.Spec.McpServer.Service, common.PortMcpServer)
- path :=
resolveGatewayPath(superset.Spec.McpServer.Service, "/mcp")
- rules = append(rules, buildHTTPRouteRule(svcName, port,
path))
- }
-
- // Add Celery Flower route if enabled.
- if superset.Spec.CeleryFlower != nil {
- svcName :=
gatewayv1.ObjectName(celeryFlowerDescriptor.resourceBaseName(&superset.Spec,
superset.Name))
- port :=
resolveServicePort(superset.Spec.CeleryFlower.Service, common.PortCeleryFlower)
- path :=
resolveGatewayPath(superset.Spec.CeleryFlower.Service, "/flower")
- rules = append(rules, buildHTTPRouteRule(svcName, port,
path))
- }
-
- // Default route for web server (less specific, listed LAST).
- if superset.Spec.WebServer != nil {
- rules = append(rules,
buildHTTPRouteRule(webServerSvcName, gwPort, "/"))
+ for _, rt := range componentRoutes(superset) {
+ rules = append(rules,
buildHTTPRouteRule(gatewayv1.ObjectName(rt.svcName), rt.port, rt.path))
}
route.Spec = gatewayv1.HTTPRouteSpec{
@@ -251,6 +223,16 @@ func resolveGatewayPath(svc
*supersetv1alpha1.ComponentServiceSpec, defaultPath
return defaultPath
}
+// flowerHealthPath returns the HTTP path used to health-check Flower: the
+// component's gateway path (default "/flower") with any trailing slashes
+// removed, followed by "/healthcheck". Flower is started with --url_prefix set
+// to that gateway path, so its endpoints live under the prefix. /healthcheck
+// is probed instead of /api/workers because the latter is gated behind
+// FLOWER_UNAUTHENTICATED_API in Flower 2.0+ and would answer 401; a probe that
+// 404s or 401s would CrashLoopBackOff an otherwise-healthy pod.
+func flowerHealthPath(svc *supersetv1alpha1.ComponentServiceSpec) string {
+	prefix := resolveGatewayPath(svc, "/flower")
+	// Strip every trailing slash so the join never produces "//healthcheck".
+	prefix = strings.TrimRight(prefix, "/")
+	return prefix + "/healthcheck"
+}
+
// buildHTTPRouteRule constructs a single HTTPRouteRule for a component
backend.
func buildHTTPRouteRule(svcName gatewayv1.ObjectName, port
gatewayv1.PortNumber, path string) gatewayv1.HTTPRouteRule {
pathPrefix := gatewayv1.PathMatchPathPrefix
@@ -276,6 +258,67 @@ func buildHTTPRouteRule(svcName gatewayv1.ObjectName, port
gatewayv1.PortNumber,
}
}
+// componentRoute maps one URL path to the backend Service of a routable
+// component. Both the Gateway (HTTPRoute) and the Ingress reconcilers build
+// their rules from these values, so components are exposed the same way by
+// either: plain path-prefix forwarding without rewriting, with each component
+// serving under its own subpath (e.g. Flower via --url_prefix).
+type componentRoute struct {
+	// svcName is the name of the backend Service.
+	svcName string
+	// port is the Service port traffic is forwarded to.
+	port int32
+	// path is the URL path routed to the Service.
+	path string
+}
+
+// componentRoutes lists one route per routable component that is set on the
+// Superset spec. The order is fixed: websocket server, MCP server, Celery
+// Flower, then the web server, so the web server's "/" catch-all always comes
+// last, after the more specific component paths. A component that is nil in
+// the spec contributes no route; with none present the result is nil.
+func componentRoutes(superset *supersetv1alpha1.Superset) []componentRoute {
+	spec := &superset.Spec
+	var routes []componentRoute
+
+	if ws := spec.WebsocketServer; ws != nil {
+		routes = append(routes, componentRoute{
+			svcName: websocketServerDescriptor.resourceBaseName(spec, superset.Name),
+			port:    resolveServicePort(ws.Service, common.PortWebsocket),
+			path:    resolveGatewayPath(ws.Service, "/ws"),
+		})
+	}
+
+	if mcp := spec.McpServer; mcp != nil {
+		routes = append(routes, componentRoute{
+			svcName: mcpServerDescriptor.resourceBaseName(spec, superset.Name),
+			port:    resolveServicePort(mcp.Service, common.PortMcpServer),
+			path:    resolveGatewayPath(mcp.Service, "/mcp"),
+		})
+	}
+
+	if flower := spec.CeleryFlower; flower != nil {
+		routes = append(routes, componentRoute{
+			svcName: celeryFlowerDescriptor.resourceBaseName(spec, superset.Name),
+			port:    resolveServicePort(flower.Service, common.PortCeleryFlower),
+			path:    resolveGatewayPath(flower.Service, "/flower"),
+		})
+	}
+
+	if spec.WebServer != nil {
+		// Only the port is needed from the web server Service reference.
+		_, webPort := webServerServiceRef(superset)
+		routes = append(routes, componentRoute{
+			svcName: webServerDescriptor.resourceBaseName(spec, superset.Name),
+			port:    webPort,
+			path:    "/",
+		})
+	}
+
+	return routes
+}
+
+// ingressPath returns an Ingress HTTP path entry that sends requests matching
+// path (interpreted according to pathType) to port on the Service svcName.
+func ingressPath(path string, pathType networkingv1.PathType, svcName string, port int32) networkingv1.HTTPIngressPath {
+	backend := networkingv1.IngressBackend{
+		Service: &networkingv1.IngressServiceBackend{
+			Name: svcName,
+			Port: networkingv1.ServiceBackendPort{Number: port},
+		},
+	}
+	// pathType is this call's own copy of the argument, so the pointer stored
+	// in the result never aliases a variable owned by the caller.
+	return networkingv1.HTTPIngressPath{
+		Path:     path,
+		PathType: &pathType,
+		Backend:  backend,
+	}
+}
+
func (r *SupersetReconciler) reconcileIngress(ctx context.Context, superset
*supersetv1alpha1.Superset) error {
ing := superset.Spec.Networking.Ingress
@@ -309,7 +352,12 @@ func (r *SupersetReconciler) reconcileIngress(ctx
context.Context, superset *sup
}
}
- // Build rules from hosts.
+ // Build rules from hosts. A host without explicit Paths gets
the full
+ // per-component fan-out (web "/" plus /flower, /mcp, /ws for
present
+ // components), mirroring the Gateway HTTPRoute. A host with
explicit
+ // Paths is treated as a user-controlled override and routes
those paths
+ // to the web server.
+ routes := componentRoutes(superset)
for _, h := range hosts {
rule := networkingv1.IngressRule{
Host: h.Host,
@@ -324,37 +372,11 @@ func (r *SupersetReconciler) reconcileIngress(ctx
context.Context, superset *sup
if p.PathType != nil {
pathType = *p.PathType
}
- rule.HTTP.Paths =
append(rule.HTTP.Paths,
- networkingv1.HTTPIngressPath{
- Path: p.Path,
- PathType: &pathType,
- Backend:
networkingv1.IngressBackend{
- Service:
&networkingv1.IngressServiceBackend{
- Name:
webServerSvcName,
- Port:
networkingv1.ServiceBackendPort{
-
Number: webServerPort,
- },
- },
- },
- },
- )
+ rule.HTTP.Paths =
append(rule.HTTP.Paths, ingressPath(p.Path, pathType, webServerSvcName,
webServerPort))
}
} else {
- // Default path.
- pathType := networkingv1.PathTypePrefix
- rule.HTTP.Paths =
[]networkingv1.HTTPIngressPath{
- {
- Path: "/",
- PathType: &pathType,
- Backend:
networkingv1.IngressBackend{
- Service:
&networkingv1.IngressServiceBackend{
- Name:
webServerSvcName,
- Port:
networkingv1.ServiceBackendPort{
- Number:
webServerPort,
- },
- },
- },
- },
+ for _, rt := range routes {
+ rule.HTTP.Paths =
append(rule.HTTP.Paths, ingressPath(rt.path, networkingv1.PathTypePrefix,
rt.svcName, rt.port))
}
}
diff --git a/internal/controller/networking_test.go
b/internal/controller/networking_test.go
index c0eb90a..06c4539 100644
--- a/internal/controller/networking_test.go
+++ b/internal/controller/networking_test.go
@@ -20,6 +20,7 @@ package controller
import (
"context"
+ "reflect"
"testing"
networkingv1 "k8s.io/api/networking/v1"
@@ -379,6 +380,138 @@ func TestResolveGatewayPath(t *testing.T) {
}
}
+func TestFlowerHealthPath(t *testing.T) {
+	// Flower serves under its URL prefix, so the health probe path must carry
+	// that prefix. /healthcheck is used (200, no auth); /api/workers 404s
+	// without the prefix and 401s under Flower 2.0's default API auth, and
+	// either would CrashLoopBackOff an otherwise-healthy pod.
+	withPath := func(p string) *supersetv1alpha1.ComponentServiceSpec {
+		return &supersetv1alpha1.ComponentServiceSpec{GatewayPath: &p}
+	}
+
+	cases := []struct {
+		name string
+		svc  *supersetv1alpha1.ComponentServiceSpec
+		want string
+	}{
+		{name: "default prefix", svc: nil, want: "/flower/healthcheck"},
+		{name: "empty gatewayPath uses default", svc: &supersetv1alpha1.ComponentServiceSpec{}, want: "/flower/healthcheck"},
+		{name: "custom gatewayPath", svc: withPath("/monitoring"), want: "/monitoring/healthcheck"},
+		{name: "trailing slash not doubled", svc: withPath("/flower/"), want: "/flower/healthcheck"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := flowerHealthPath(tc.svc)
+			if got == tc.want {
+				return
+			}
+			t.Errorf("flowerHealthPath() = %s, want %s", got, tc.want)
+		})
+	}
+}
+
+func TestComponentRoutes(t *testing.T) {
+	// componentRoutes feeds both the Ingress and the HTTPRoute, so this pins
+	// what both reconcilers rely on: the route order (component paths first,
+	// web "/" last) and each component's path/backend mapping. The websocket
+	// server is left unset and must not produce a route.
+	spec := supersetv1alpha1.SupersetSpec{
+		WebServer:    &supersetv1alpha1.WebServerComponentSpec{},
+		McpServer:    &supersetv1alpha1.McpServerComponentSpec{},
+		CeleryFlower: &supersetv1alpha1.CeleryFlowerComponentSpec{},
+	}
+	superset := &supersetv1alpha1.Superset{
+		ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
+		Spec:       spec,
+	}
+
+	expected := []componentRoute{
+		{svcName: "test-mcp-server", port: common.PortMcpServer, path: "/mcp"},
+		{svcName: "test-celery-flower", port: common.PortCeleryFlower, path: "/flower"},
+		{svcName: "test-web-server", port: common.PortWebServer, path: "/"},
+	}
+
+	actual := componentRoutes(superset)
+	if reflect.DeepEqual(actual, expected) {
+		return
+	}
+	t.Errorf("componentRoutes() = %+v, want %+v", actual, expected)
+}
+
+func TestReconcileIngress_MultiComponentFanout(t *testing.T) {
+	// With no explicit Paths on the host, the Ingress must carry one path per
+	// present component (matching what the Gateway HTTPRoute does) instead of
+	// sending everything to the web server.
+	ctx := context.Background()
+	scheme := testScheme(t)
+
+	spec := supersetv1alpha1.SupersetSpec{
+		Image:        supersetv1alpha1.ImageSpec{Repository: "apache/superset", Tag: "latest"},
+		WebServer:    &supersetv1alpha1.WebServerComponentSpec{},
+		McpServer:    &supersetv1alpha1.McpServerComponentSpec{},
+		CeleryFlower: &supersetv1alpha1.CeleryFlowerComponentSpec{},
+		Lifecycle:    &supersetv1alpha1.LifecycleSpec{Disabled: boolPtr(true)},
+		Networking: &supersetv1alpha1.NetworkingSpec{
+			Ingress: &supersetv1alpha1.IngressSpec{Host: "superset.example.com"},
+		},
+	}
+	superset := &supersetv1alpha1.Superset{
+		ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default", UID: "uid-1"},
+		Spec:       spec,
+	}
+
+	fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(superset).Build()
+	reconciler := &SupersetReconciler{Client: fakeClient, Scheme: scheme, Recorder: events.NewFakeRecorder(10)}
+	if err := reconciler.reconcileIngress(ctx, superset); err != nil {
+		t.Fatalf("reconcileIngress: %v", err)
+	}
+
+	ingress := &networkingv1.Ingress{}
+	ingressKey := types.NamespacedName{Name: "test", Namespace: "default"}
+	if err := fakeClient.Get(ctx, ingressKey, ingress); err != nil {
+		t.Fatalf("expected Ingress: %v", err)
+	}
+	if ruleCount := len(ingress.Spec.Rules); ruleCount != 1 {
+		t.Fatalf("expected 1 rule, got %d", ruleCount)
+	}
+
+	// Collapse the rule into path -> backend Service name for comparison.
+	backendByPath := make(map[string]string)
+	for _, httpPath := range ingress.Spec.Rules[0].HTTP.Paths {
+		backendByPath[httpPath.Path] = httpPath.Backend.Service.Name
+	}
+	expected := map[string]string{
+		"/mcp":    "test-mcp-server",
+		"/flower": "test-celery-flower",
+		"/":       "test-web-server",
+	}
+	if !reflect.DeepEqual(backendByPath, expected) {
+		t.Errorf("ingress paths = %+v, want %+v", backendByPath, expected)
+	}
+}
+
+func TestReconcileIngress_ExplicitPathsRouteToWebServer(t *testing.T) {
+	// A host WITH explicit Paths is a user-controlled override: those paths
+	// route to the web server, not the component fan-out. Celery Flower is
+	// enabled here precisely to prove it gets no path of its own.
+	scheme := testScheme(t)
+	superset := &supersetv1alpha1.Superset{
+		ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default", UID: "uid-1"},
+		Spec: supersetv1alpha1.SupersetSpec{
+			Image:        supersetv1alpha1.ImageSpec{Repository: "apache/superset", Tag: "latest"},
+			WebServer:    &supersetv1alpha1.WebServerComponentSpec{},
+			CeleryFlower: &supersetv1alpha1.CeleryFlowerComponentSpec{},
+			Lifecycle:    &supersetv1alpha1.LifecycleSpec{Disabled: boolPtr(true)},
+			Networking: &supersetv1alpha1.NetworkingSpec{
+				Ingress: &supersetv1alpha1.IngressSpec{
+					Hosts: []supersetv1alpha1.IngressHost{
+						{Host: "superset.example.com", Paths: []supersetv1alpha1.IngressPath{
+							{Path: "/", PathType: pathTypePtr(networkingv1.PathTypePrefix)},
+						}},
+					},
+				},
+			},
+		},
+	}
+
+	c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(superset).Build()
+	r := &SupersetReconciler{Client: c, Scheme: scheme, Recorder: events.NewFakeRecorder(10)}
+	if err := r.reconcileIngress(context.Background(), superset); err != nil {
+		t.Fatalf("reconcileIngress: %v", err)
+	}
+	ingress := &networkingv1.Ingress{}
+	if err := c.Get(context.Background(), types.NamespacedName{Name: "test", Namespace: "default"}, ingress); err != nil {
+		t.Fatalf("expected Ingress: %v", err)
+	}
+
+	// Check the rule shape before indexing into it, so a regression shows up
+	// as a test failure rather than an index-out-of-range or nil panic.
+	if len(ingress.Spec.Rules) != 1 {
+		t.Fatalf("expected 1 rule, got %d", len(ingress.Spec.Rules))
+	}
+	if ingress.Spec.Rules[0].HTTP == nil {
+		t.Fatal("expected the rule to carry HTTP paths")
+	}
+	paths := ingress.Spec.Rules[0].HTTP.Paths
+	if len(paths) != 1 || paths[0].Path != "/" || paths[0].Backend.Service == nil ||
+		paths[0].Backend.Service.Name != "test-web-server" {
+		t.Errorf("explicit paths should route to web server only, got %+v", paths)
+	}
+}
+
func TestReconcileIngress_CreatesIngress(t *testing.T) {
scheme := testScheme(t)