This is an automated email from the ASF dual-hosted git repository.
HoustonPutman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr-operator.git
The following commit(s) were added to refs/heads/main by this push:
new e0d36ee Support online resizing (expansion) of persistent data PVCs
(#712)
e0d36ee is described below
commit e0d36ee7c1d5995a5617fdf0396effaf788149df
Author: Houston Putman <[email protected]>
AuthorDate: Mon Jun 8 11:17:01 2026 -0700
Support online resizing (expansion) of persistent data PVCs (#712)
Grow SolrCloud data PVCs in place via the storage request; the operator
resizes the PVCs and rolls the pods.
The integration (e2e) tests now use the `rawfile-localpv` provisioner that
supports resizing.
---
Makefile | 2 +-
config/rbac/role.yaml | 18 ++++
controllers/solr_cluster_ops_util.go | 107 ++++++++++++++++++-
controllers/solr_pvc_expansion_test.go | 94 +++++++++++++++++
controllers/solrcloud_controller.go | 184 ++++++++++++++++++++++++++++++---
controllers/suite_test.go | 5 +-
controllers/util/solr_util.go | 24 +++++
docs/solr-cloud/solr-cloud-crd.md | 9 +-
docs/upgrade-notes.md | 2 +-
helm/solr-operator/Chart.yaml | 7 ++
helm/solr-operator/templates/role.yaml | 18 ++++
helm/solr/Chart.yaml | 9 +-
main.go | 5 +-
tests/e2e/solrcloud_storage_test.go | 171 ++++++++++++++++++++++++++++++
tests/e2e/suite_test.go | 85 ++++++++++-----
tests/scripts/manage_e2e_tests.sh | 10 +-
16 files changed, 693 insertions(+), 57 deletions(-)
diff --git a/Makefile b/Makefile
index c637225..c9a1044 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ KUSTOMIZE_VERSION=v4.5.2
CONTROLLER_GEN_VERSION=v0.16.4
GO_LICENSES_VERSION=v1.6.0
GINKGO_VERSION = $(shell cat go.mod | grep 'github.com/onsi/ginkgo' | sed
's/.*\(v.*\)$$/\1/g')
-KIND_VERSION=v0.23.0
+KIND_VERSION=v0.30.0
YQ_VERSION=v4.33.3
CONTROLLER_RUNTIME_VERSION = $(shell cat go.mod | grep
'sigs.k8s.io/controller-runtime' | sed 's/.*\(v\(.*\)\.[^.]*\)$$/\2/g')
# ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be
downloaded by envtest binary.
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index d8239bb..53b8d47 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -43,6 +43,16 @@ rules:
- ""
resources:
- persistentvolumeclaims
+ verbs:
+ - delete
+ - get
+ - list
+ - patch
+ - update
+ - watch
+- apiGroups:
+ - ""
+ resources:
- pods
verbs:
- delete
@@ -144,6 +154,14 @@ rules:
- get
- patch
- update
+- apiGroups:
+ - storage.k8s.io
+ resources:
+ - storageclasses
+ verbs:
+ - get
+ - list
+ - watch
- apiGroups:
- zookeeper.pravega.io
resources:
diff --git a/controllers/solr_cluster_ops_util.go
b/controllers/solr_cluster_ops_util.go
index 916446b..deecd21 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -21,18 +21,20 @@ import (
"context"
"encoding/json"
"errors"
+ "net/url"
+ "strconv"
+ "time"
+
solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
"github.com/apache/solr-operator/controllers/util"
"github.com/apache/solr-operator/controllers/util/solr_api"
"github.com/go-logr/logr"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"
- "net/url"
"sigs.k8s.io/controller-runtime/pkg/client"
- "strconv"
- "time"
)
// SolrClusterOp contains metadata for cluster operations performed on
SolrClouds.
@@ -53,6 +55,7 @@ const (
ScaleUpLock SolrClusterOperationType = "ScalingUp"
UpdateLock SolrClusterOperationType = "RollingUpdate"
BalanceReplicasLock SolrClusterOperationType = "BalanceReplicas"
+ PvcExpansionLock SolrClusterOperationType = "PVCExpansion"
)
// RollingUpdateMetadata contains metadata for rolling update cluster
operations.
@@ -150,6 +153,101 @@ func retryNextQueuedClusterOpWithQueue(statefulSet
*appsv1.StatefulSet, clusterO
return hasOp, err
}
+// determinePvcExpansionClusterOpLockIfNecessary decides whether a PVCExpansion cluster operation
+// should be started for the given SolrCloud.
+//
+// An operation is returned only when all of the following hold:
+//   - the SolrCloud uses persistent storage and explicitly requests a storage size,
+//   - the StatefulSet already records a minimum data PVC size (util.StorageMinimumSizeAnnotation),
+//   - the requested size is strictly larger than the recorded size, and
+//   - the storage class backing the data PVCs does not explicitly disallow volume expansion.
+//
+// A smaller request, or a storage class that disallows expansion, is surfaced as a warning event
+// (when a Recorder is configured) and no operation is returned. The returned clusterOp carries the
+// requested size (as a string) in its Metadata. retryLaterDuration is never set by this function;
+// err is only set when the recorded size cannot be parsed.
+func determinePvcExpansionClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, logger logr.Logger) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err error) {
+	persistentStorage := instance.Spec.StorageOptions.PersistentStorage
+	if persistentStorage == nil {
+		return
+	}
+	// ResourceList.Storage() never returns nil (it yields a zero Quantity when the key is absent),
+	// so the presence of a storage request has to be checked on the map itself.
+	newSize, hasNewSize := persistentStorage.PersistentVolumeClaimTemplate.Spec.Resources.Requests[corev1.ResourceStorage]
+	if !hasNewSize {
+		return
+	}
+	// If there is no old size to update, the StatefulSet can just be set to use the new PVC size without any issue.
+	// Only do a cluster operation if we are expanding from an existing size to a new size.
+	oldSizeStr, hasOldSize := statefulSet.Annotations[util.StorageMinimumSizeAnnotation]
+	if !hasOldSize || newSize.String() == oldSizeStr {
+		return
+	}
+	oldSize, e := resource.ParseQuantity(oldSizeStr)
+	if e != nil {
+		err = e
+		logger.Error(err, "Could not parse the existing minimum PVC size from the StatefulSet annotation", "annotation", util.StorageMinimumSizeAnnotation, "value", oldSizeStr)
+		if r.Recorder != nil {
+			r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionError",
+				"Could not parse the existing minimum data PVC size %q recorded on the StatefulSet: %v", oldSizeStr, e)
+		}
+		return
+	}
+	sizeCmp := newSize.Cmp(oldSize)
+	if sizeCmp == 0 {
+		// Same quantity written in a different notation (e.g. "1Gi" vs "1024Mi"): nothing to
+		// expand, and it is not a shrink either, so do not emit a warning.
+		return
+	}
+	// PVCs cannot be shrunk, so only proceed if the new size is strictly bigger than the recorded size.
+	if sizeCmp < 0 {
+		logger.Info("Cannot shrink existing data PVCs; ignoring the decreased storage request", "currentSize", oldSize.String(), "requestedSize", newSize.String())
+		if r.Recorder != nil {
+			r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionForbidden",
+				"Cannot shrink data PersistentVolumeClaims from %s to %s; PersistentVolumeClaims can only be expanded.", oldSize.String(), newSize.String())
+		}
+		return
+	}
+	// Pre-flight: make sure the storage class backing the data PVCs allows volume expansion. If it
+	// explicitly does not, there is no point acquiring a cluster operation lock that can never
+	// complete; surface it as an event instead.
+	if allowed, className, scErr := r.storageClassAllowsExpansion(ctx, instance, statefulSet.Spec.Selector.MatchLabels); scErr != nil {
+		// Could not determine; proceed best-effort and let the PVC patch surface any hard rejection.
+		logger.Error(scErr, "Could not verify whether the storage class allows volume expansion; proceeding with the expansion attempt")
+	} else if !allowed {
+		logger.Info("Storage class does not allow volume expansion; ignoring the increased storage request", "storageClass", className, "currentSize", oldSize.String(), "requestedSize", newSize.String())
+		if r.Recorder != nil {
+			r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionForbidden",
+				"Storage class %q does not allow volume expansion (allowVolumeExpansion); cannot expand data PersistentVolumeClaims from %s to %s.", className, oldSize.String(), newSize.String())
+		}
+		return
+	}
+	clusterOp = &SolrClusterOp{
+		Operation: PvcExpansionLock,
+		Metadata:  newSize.String(),
+	}
+	return
+}
+
+// handlePvcExpansion drives a "locked" PVCExpansion cluster operation.
+//
+// clusterOp.Metadata holds the target PVC size. Each call asks expandPVCs to request/verify the
+// expansion of every data PVC selected by the StatefulSet's selector labels:
+//   - When every PVC reports its expansion as complete, the new size is recorded in the
+//     util.StorageMinimumSizeAnnotation annotation on both the StatefulSet and its Pod template,
+//     and operationComplete is returned as true. If that patch fails, operationComplete is reset
+//     to false and the error is returned.
+//   - When the expansion is still pending, retryLaterDuration is 5 seconds, or 1 minute (plus a
+//     warning event, when a Recorder is configured) if a PVC reports the resize as infeasible.
+//   - When the metadata cannot be parsed or expandPVCs fails, the error is returned as-is.
+func handlePvcExpansion(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, clusterOp *SolrClusterOp, logger logr.Logger) (operationComplete bool, retryLaterDuration time.Duration, err error) {
+	var newSize resource.Quantity
+	newSize, err = resource.ParseQuantity(clusterOp.Metadata)
+	if err != nil {
+		logger.Error(err, "Could not convert PvcExpansion metadata to a resource.Quantity, as it represents the new size of PVCs", "metadata", clusterOp.Metadata)
+		return
+	}
+	var resizeInfeasible bool
+	operationComplete, resizeInfeasible, err = r.expandPVCs(ctx, instance, statefulSet.Spec.Selector.MatchLabels, newSize, logger)
+	if err != nil {
+		return
+	}
+	if operationComplete {
+		originalStatefulSet := statefulSet.DeepCopy()
+		// Guard both annotation maps; writing to a nil map panics.
+		if statefulSet.Annotations == nil {
+			statefulSet.Annotations = make(map[string]string, 1)
+		}
+		statefulSet.Annotations[util.StorageMinimumSizeAnnotation] = newSize.String()
+		if statefulSet.Spec.Template.Annotations == nil {
+			statefulSet.Spec.Template.Annotations = make(map[string]string, 1)
+		}
+		statefulSet.Spec.Template.Annotations[util.StorageMinimumSizeAnnotation] = newSize.String()
+		if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
+			logger.Error(err, "Error while patching StatefulSet to set the new minimum PVC size after the completion of PVC resizing", "newSize", newSize)
+			operationComplete = false
+		} else {
+			logger.Info("All PersistentVolumeClaims have been expanded, now issuing a rolling restart", "statefulSet", statefulSet.Name)
+		}
+		// Return and wait for the StatefulSet to be updated which will call the reconcile to start the rolling restart
+		retryLaterDuration = 0
+		return
+	}
+	if resizeInfeasible {
+		// The storage backend has declared the requested size infeasible. There is nothing the
+		// operator can do until the user lowers the requested size, so surface it as an event and
+		// back off significantly instead of retrying tightly.
+		if r.Recorder != nil {
+			r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionInfeasible",
+				"The storage backend reported that expanding the data PersistentVolumeClaims to %s is infeasible (e.g. it exceeds backend or quota limits). Reduce the requested storage size to a feasible value to recover.",
+				newSize.String())
+		}
+		retryLaterDuration = time.Minute
+	} else {
+		retryLaterDuration = time.Second * 5
+	}
+	return
+}
+
func determineScaleClusterOpLockIfNecessary(ctx context.Context, r
*SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet
*appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod,
blockReconciliationOfStatefulSet bool, logger logr.Logger) (clusterOp
*SolrClusterOp, retryLaterDuration time.Duration, err error) {
desiredPods := int(*instance.Spec.Replicas)
configuredPods := int(*statefulSet.Spec.Replicas)
@@ -291,7 +389,8 @@ func cleanupManagedCloudScaleDown(ctx context.Context, r
*SolrCloudReconciler, p
// handleManagedCloudScaleUp does the logic of a managed and "locked" cloud
scale up operation.
// This will likely take many reconcile loops to complete, as it is moving
replicas to the pods that have recently been scaled up.
func handleManagedCloudScaleUp(ctx context.Context, r *SolrCloudReconciler,
instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, clusterOp
*SolrClusterOp, podList []corev1.Pod, logger logr.Logger) (operationComplete
bool, nextClusterOperation *SolrClusterOp, err error) {
- desiredPods, err := strconv.Atoi(clusterOp.Metadata)
+ desiredPods := 0
+ desiredPods, err = strconv.Atoi(clusterOp.Metadata)
if err != nil {
logger.Error(err, "Could not convert ScaleUp metadata to int,
as it represents the number of nodes to scale to", "metadata",
clusterOp.Metadata)
return
diff --git a/controllers/solr_pvc_expansion_test.go
b/controllers/solr_pvc_expansion_test.go
new file mode 100644
index 0000000..9ac4ca2
--- /dev/null
+++ b/controllers/solr_pvc_expansion_test.go
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controllers
+
+import (
+ "testing"
+
+ corev1 "k8s.io/api/core/v1"
+)
+
+// pvcWithCondition returns a PVC whose status holds exactly one condition of the given type and
+// status.
+func pvcWithCondition(condType corev1.PersistentVolumeClaimConditionType, status corev1.ConditionStatus) *corev1.PersistentVolumeClaim {
+	claim := &corev1.PersistentVolumeClaim{}
+	claim.Status.Conditions = []corev1.PersistentVolumeClaimCondition{
+		{Type: condType, Status: status},
+	}
+	return claim
+}
+
+// pvcWithAllocatedStatus returns a PVC whose status.allocatedResourceStatuses maps the storage
+// resource to the given status.
+func pvcWithAllocatedStatus(status corev1.ClaimResourceStatus) *corev1.PersistentVolumeClaim {
+	claim := &corev1.PersistentVolumeClaim{}
+	claim.Status.AllocatedResourceStatuses = map[corev1.ResourceName]corev1.ClaimResourceStatus{
+		corev1.ResourceStorage: status,
+	}
+	return claim
+}
+
+// TestPvcControllerExpansionComplete checks which PVC states pvcControllerExpansionComplete treats
+// as "controller-side expansion finished": a true FileSystemResizePending condition, or a node
+// resize that is pending / in progress. Other conditions and controller-side statuses must not
+// count, so that the rolling restart is not gated on status.capacity.
+func TestPvcControllerExpansionComplete(t *testing.T) {
+	type testCase struct {
+		name string
+		pvc  *corev1.PersistentVolumeClaim
+		want bool
+	}
+	testCases := []testCase{
+		{name: "empty pvc", pvc: &corev1.PersistentVolumeClaim{}, want: false},
+		{name: "filesystem resize pending (offline ready-to-restart)", pvc: pvcWithCondition(corev1.PersistentVolumeClaimFileSystemResizePending, corev1.ConditionTrue), want: true},
+		{name: "filesystem resize pending but condition false", pvc: pvcWithCondition(corev1.PersistentVolumeClaimFileSystemResizePending, corev1.ConditionFalse), want: false},
+		{name: "unrelated resizing condition", pvc: pvcWithCondition(corev1.PersistentVolumeClaimResizing, corev1.ConditionTrue), want: false},
+		{name: "node resize pending status", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimNodeResizePending), want: true},
+		{name: "node resize in progress status", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimNodeResizeInProgress), want: true},
+		{name: "controller resize in progress status", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimControllerResizeInProgress), want: false},
+		{name: "controller resize infeasible status", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimControllerResizeInfeasible), want: false},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := pvcControllerExpansionComplete(tc.pvc)
+			if got != tc.want {
+				t.Errorf("pvcControllerExpansionComplete() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestPvcResizeInfeasible checks that pvcResizeInfeasible reports true only for the controller- and
+// node-side "infeasible" allocatedResourceStatuses of the storage resource.
+func TestPvcResizeInfeasible(t *testing.T) {
+	type testCase struct {
+		name string
+		pvc  *corev1.PersistentVolumeClaim
+		want bool
+	}
+	testCases := []testCase{
+		{name: "empty pvc", pvc: &corev1.PersistentVolumeClaim{}, want: false},
+		{name: "controller resize infeasible", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimControllerResizeInfeasible), want: true},
+		{name: "node resize infeasible", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimNodeResizeInfeasible), want: true},
+		{name: "node resize pending is not infeasible", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimNodeResizePending), want: false},
+		{name: "controller resize in progress is not infeasible", pvc: pvcWithAllocatedStatus(corev1.PersistentVolumeClaimControllerResizeInProgress), want: false},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := pvcResizeInfeasible(tc.pvc)
+			if got != tc.want {
+				t.Errorf("pvcResizeInfeasible() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
diff --git a/controllers/solrcloud_controller.go
b/controllers/solrcloud_controller.go
index b18dbd1..e94ef7b 100644
--- a/controllers/solrcloud_controller.go
+++ b/controllers/solrcloud_controller.go
@@ -21,13 +21,15 @@ import (
"context"
"crypto/md5"
"fmt"
- policyv1 "k8s.io/api/policy/v1"
- "k8s.io/apimachinery/pkg/runtime"
"reflect"
"sort"
"strings"
"time"
+ policyv1 "k8s.io/api/policy/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ "k8s.io/apimachinery/pkg/runtime"
+
solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
"github.com/apache/solr-operator/controllers/util"
"github.com/go-logr/logr"
@@ -35,11 +37,13 @@ import (
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
netv1 "k8s.io/api/networking/v1"
+ storagev1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
+ "k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -53,7 +57,8 @@ import (
// SolrCloudReconciler reconciles a SolrCloud object
type SolrCloudReconciler struct {
client.Client
- Scheme *runtime.Scheme
+ Scheme *runtime.Scheme
+ Recorder record.EventRecorder
}
var useZkCRD bool
@@ -72,7 +77,8 @@ func UseZkCRD(useCRD bool) {
//+kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses/status,verbs=get
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=configmaps/status,verbs=get
-//+kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;delete
+//+kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;update;patch;delete
+//+kubebuilder:rbac:groups=storage.k8s.io,resources=storageclasses,verbs=get;list;watch
//+kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=zookeeper.pravega.io,resources=zookeeperclusters,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=zookeeper.pravega.io,resources=zookeeperclusters/status,verbs=get
@@ -493,6 +499,11 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
operationComplete, nextClusterOperation, err =
handleManagedCloudScaleUp(ctx, r, instance, statefulSet, clusterOp, podList,
logger)
case BalanceReplicasLock:
operationComplete, requestInProgress,
retryLaterDuration, err = util.BalanceReplicasForCluster(ctx, instance,
statefulSet, clusterOp.Metadata, clusterOp.Metadata, logger)
+ case PvcExpansionLock:
+ operationComplete, retryLaterDuration, err =
handlePvcExpansion(ctx, r, instance, statefulSet, clusterOp, logger)
+ // PVC expansion (the controller-side volume resize)
can take a long time on some provisioners,
+ // so it should use the long requeue timeout rather
than being preempted after a minute.
+ shortTimeoutForRequeue = false
default:
operationFound = false
// This shouldn't happen, but we don't want to be stuck
if it does.
@@ -561,6 +572,15 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
clusterOp = nil
}
+ if clusterOp == nil {
+ clusterOp, retryLaterDuration, err =
determinePvcExpansionClusterOpLockIfNecessary(ctx, r, instance, statefulSet,
logger)
+ // If the new clusterOperation is an update to
a queued PVC expansion clusterOp, just change the operation that is already
queued
+ if queueIdx, opIsQueued :=
queuedRetryOps[PvcExpansionLock]; clusterOp != nil && opIsQueued {
+ clusterOpQueue[queueIdx] = *clusterOp
+ clusterOp = nil
+ }
+ }
+
// If a non-managed scale needs to take place, this
method will update the StatefulSet without starting
// a "locked" cluster operation
if clusterOp == nil {
@@ -1018,6 +1038,144 @@ func (r *SolrCloudReconciler) reconcileZk(ctx
context.Context, logger logr.Logge
return nil
}
+// expandPVCs runs expandPVC against every PVC in the SolrCloud's namespace that matches
+// pvcLabelSelector.
+//
+// expansionComplete is true only when no PVC returned an error and every PVC reported its
+// expansion as complete. resizeInfeasible is true when at least one successfully processed PVC
+// reported the resize as infeasible. All PVCs are processed even if one fails; err holds the last
+// error encountered.
+func (r *SolrCloudReconciler) expandPVCs(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string, newSize resource.Quantity, logger logr.Logger) (expansionComplete bool, resizeInfeasible bool, err error) {
+	var pvcList corev1.PersistentVolumeClaimList
+	if pvcList, err = r.getPVCList(ctx, cloud, pvcLabelSelector); err != nil {
+		return
+	}
+	completed := 0
+	for i := range pvcList.Items {
+		done, infeasible, expandErr := r.expandPVC(ctx, &pvcList.Items[i], newSize, logger)
+		if expandErr != nil {
+			err = expandErr
+			continue
+		}
+		if done {
+			completed++
+		}
+		resizeInfeasible = resizeInfeasible || infeasible
+	}
+	// If all PVCs have completed their controller-side expansion, then we are done
+	expansionComplete = err == nil && completed == len(pvcList.Items)
+	return
+}
+
+// expandPVC requests the expansion of a single PVC to newSize and reports whether the
+// controller-side part of that expansion has finished.
+//
+// The expansion counts as complete once either the PVC's status.capacity has reached newSize, or
+// pvcControllerExpansionComplete reports that only a node-side filesystem resize remains. It
+// deliberately does not wait for the filesystem resize: some provisioners only resize "offline",
+// when the volume is remounted during the restart, so waiting for status.capacity would deadlock
+// (capacity cannot update until the pod restarts, and the operator would not restart the pod until
+// capacity updated).
+//
+// When the expansion is not complete, resizeInfeasible carries the result of pvcResizeInfeasible,
+// and the PVC's storage request is patched to newSize unless it already equals it. err is the
+// patch error, if any.
+func (r *SolrCloudReconciler) expandPVC(ctx context.Context, pvc *corev1.PersistentVolumeClaim, newSize resource.Quantity, logger logr.Logger) (expansionComplete bool, resizeInfeasible bool, err error) {
+	// A missing capacity entry reads as the zero Quantity from the map lookup.
+	currentCapacity := pvc.Status.Capacity[corev1.ResourceStorage]
+	if currentCapacity.Cmp(newSize) >= 0 || pvcControllerExpansionComplete(pvc) {
+		// Either the volume is already fully expanded (online resize), or only the node/filesystem
+		// resize remains (offline resize), which the subsequent rolling restart completes on remount.
+		expansionComplete = true
+		return
+	}
+
+	// Best-effort: report a backend that has declared the requested size infeasible, so it is not
+	// silently retried forever.
+	resizeInfeasible = pvcResizeInfeasible(pvc)
+
+	// Nothing to patch when the PVC already requests exactly the desired size.
+	if currentRequest, hasRequest := pvc.Spec.Resources.Requests[corev1.ResourceStorage]; hasRequest && currentRequest.Equal(newSize) {
+		return
+	}
+
+	// The new size may be smaller than the currently requested size (e.g. the previous request was
+	// too big for the storage quota and was lowered). That is fine as long as the PVC's current
+	// capacity is still below the new size, which was established above.
+	unmodifiedPvc := pvc.DeepCopy()
+	if pvc.Spec.Resources.Requests == nil {
+		pvc.Spec.Resources.Requests = corev1.ResourceList{}
+	}
+	pvc.Spec.Resources.Requests[corev1.ResourceStorage] = newSize
+	err = r.Patch(ctx, pvc, client.StrategicMergeFrom(unmodifiedPvc))
+	if err != nil {
+		logger.Error(err, "Error while expanding PersistentVolumeClaim size", "persistentVolumeClaim", pvc.Name, "size", newSize)
+		return
+	}
+	logger.Info("Expanded PersistentVolumeClaim size", "persistentVolumeClaim", pvc.Name, "size", newSize)
+	return
+}
+
+// pvcControllerExpansionComplete reports whether the PVC signals that its controller-side
+// expansion is done and only a node-side filesystem resize is left, i.e. that it is safe (and, for
+// offline provisioners, necessary) to move on to the rolling restart.
+//
+// It returns true when the PVC has a FileSystemResizePending condition with status True, or when
+// the storage entry of status.allocatedResourceStatuses is NodeResizePending or
+// NodeResizeInProgress. The latter is a best-effort fallback, as that field is only populated on
+// clusters with RecoverVolumeExpansionFailure (Kubernetes >= 1.34).
+func pvcControllerExpansionComplete(pvc *corev1.PersistentVolumeClaim) bool {
+	for i := range pvc.Status.Conditions {
+		condition := pvc.Status.Conditions[i]
+		if condition.Type != corev1.PersistentVolumeClaimFileSystemResizePending {
+			continue
+		}
+		if condition.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	// A missing map entry yields the empty status, which matches neither case below.
+	switch pvc.Status.AllocatedResourceStatuses[corev1.ResourceStorage] {
+	case corev1.PersistentVolumeClaimNodeResizePending, corev1.PersistentVolumeClaimNodeResizeInProgress:
+		return true
+	default:
+		return false
+	}
+}
+
+// pvcResizeInfeasible reports (best-effort) whether the storage backend has declared the requested
+// expansion infeasible, e.g. because the size exceeds backend or quota limits.
+//
+// It returns true when the storage entry of status.allocatedResourceStatuses is
+// ControllerResizeInfeasible or NodeResizeInfeasible. That field is only populated on clusters
+// with the RecoverVolumeExpansionFailure feature (GA in 1.34); on older clusters this is never
+// true.
+func pvcResizeInfeasible(pvc *corev1.PersistentVolumeClaim) bool {
+	// A missing map entry yields the empty status, which matches neither case below.
+	switch pvc.Status.AllocatedResourceStatuses[corev1.ResourceStorage] {
+	case corev1.PersistentVolumeClaimControllerResizeInfeasible, corev1.PersistentVolumeClaimNodeResizeInfeasible:
+		return true
+	default:
+		return false
+	}
+}
+
+// storageClassAllowsExpansion reports whether the storage class backing the SolrCloud's data PVCs
+// allows volume expansion.
+//
+// The class name is taken from the first listed PVC that has a non-empty spec.storageClassName.
+// Results:
+//   - listing the PVCs fails: (false, "", err)
+//   - no PVC names a storage class: (true, "", nil), so the expansion is still attempted
+//   - the storage class does not exist: (true, className, nil), leaving it to the PVC patch to
+//     surface any error
+//   - fetching the storage class fails otherwise: (false, className, err)
+//   - otherwise: allowed is true only when allowVolumeExpansion is set and true
+func (r *SolrCloudReconciler) storageClassAllowsExpansion(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (allowed bool, className string, err error) {
+	var pvcList corev1.PersistentVolumeClaimList
+	if pvcList, err = r.getPVCList(ctx, cloud, pvcLabelSelector); err != nil {
+		return false, "", err
+	}
+	for i := range pvcList.Items {
+		name := pvcList.Items[i].Spec.StorageClassName
+		if name == nil || *name == "" {
+			continue
+		}
+		className = *name
+		break
+	}
+	if className == "" {
+		// Could not determine the storage class; allow the attempt.
+		return true, "", nil
+	}
+	storageClass := &storagev1.StorageClass{}
+	getErr := r.Get(ctx, types.NamespacedName{Name: className}, storageClass)
+	switch {
+	case getErr == nil:
+		expansionFlag := storageClass.AllowVolumeExpansion
+		return expansionFlag != nil && *expansionFlag, className, nil
+	case errors.IsNotFound(getErr):
+		// Could not find the storage class; allow the attempt and let the PVC patch surface any error.
+		return true, className, nil
+	default:
+		return false, className, getErr
+	}
+}
+
// Logic derived from:
// - https://book.kubebuilder.io/reference/using-finalizers.html
// -
https://github.com/pravega/zookeeper-operator/blob/v0.2.9/pkg/controller/zookeepercluster/zookeepercluster_controller.go#L629
@@ -1064,16 +1222,15 @@ func (r *SolrCloudReconciler)
reconcileStorageFinalizer(ctx context.Context, clo
return nil
}
-func (r *SolrCloudReconciler) getPVCCount(ctx context.Context, cloud
*solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (pvcCount int, err
error) {
+func (r *SolrCloudReconciler) getPVCCount(ctx context.Context, cloud
*solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (int, error) {
pvcList, err := r.getPVCList(ctx, cloud, pvcLabelSelector)
if err != nil {
return -1, err
}
- pvcCount = len(pvcList.Items)
- return pvcCount, nil
+ return len(pvcList.Items), nil
}
-func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx context.Context, cloud
*solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, pvcLabelSelector
map[string]string, logger logr.Logger) (err error) {
+func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx context.Context, cloud
*solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, pvcLabelSelector
map[string]string, logger logr.Logger) error {
// this check should make sure we do not delete the PVCs before the STS
has scaled down
if cloud.Status.ReadyReplicas == cloud.Status.Replicas {
pvcList, err := r.getPVCList(ctx, cloud, pvcLabelSelector)
@@ -1093,24 +1250,25 @@ func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx
context.Context, cloud *solr
}
}
}
+ return err
}
return nil
}
-func (r *SolrCloudReconciler) getPVCList(ctx context.Context, cloud
*solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (pvList
corev1.PersistentVolumeClaimList, err error) {
+func (r *SolrCloudReconciler) getPVCList(ctx context.Context, cloud
*solrv1beta1.SolrCloud, pvcLabelSelector map[string]string)
(corev1.PersistentVolumeClaimList, error) {
selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
MatchLabels: pvcLabelSelector,
})
- pvclistOps := &client.ListOptions{
+ pvcListOps := &client.ListOptions{
Namespace: cloud.Namespace,
LabelSelector: selector,
}
pvcList := &corev1.PersistentVolumeClaimList{}
- err = r.Client.List(ctx, pvcList, pvclistOps)
+ err = r.Client.List(ctx, pvcList, pvcListOps)
return *pvcList, err
}
-func (r *SolrCloudReconciler) cleanUpAllPVCs(ctx context.Context, cloud
*solrv1beta1.SolrCloud, pvcLabelSelector map[string]string, logger logr.Logger)
(err error) {
+func (r *SolrCloudReconciler) cleanUpAllPVCs(ctx context.Context, cloud
*solrv1beta1.SolrCloud, pvcLabelSelector map[string]string, logger logr.Logger)
error {
pvcList, err := r.getPVCList(ctx, cloud, pvcLabelSelector)
if err != nil {
return err
@@ -1118,7 +1276,7 @@ func (r *SolrCloudReconciler) cleanUpAllPVCs(ctx
context.Context, cloud *solrv1b
for _, pvcItem := range pvcList.Items {
r.deletePVC(ctx, pvcItem, logger)
}
- return nil
+ return err
}
func (r *SolrCloudReconciler) deletePVC(ctx context.Context, pvcItem
corev1.PersistentVolumeClaim, logger logr.Logger) {
diff --git a/controllers/suite_test.go b/controllers/suite_test.go
index 4d49ef5..7b89ee8 100644
--- a/controllers/suite_test.go
+++ b/controllers/suite_test.go
@@ -106,8 +106,9 @@ var _ = BeforeSuite(func(ctx context.Context) {
// Start up Reconcilers
By("starting the reconcilers")
Expect((&SolrCloudReconciler{
- Client: k8sManager.GetClient(),
- Scheme: k8sManager.GetScheme(),
+ Client: k8sManager.GetClient(),
+ Scheme: k8sManager.GetScheme(),
+ Recorder:
k8sManager.GetEventRecorderFor("solrcloud-controller"),
}).SetupWithManager(k8sManager)).To(Succeed())
Expect((&SolrPrometheusExporterReconciler{
diff --git a/controllers/util/solr_util.go b/controllers/util/solr_util.go
index 2cce36a..a3101c3 100644
--- a/controllers/util/solr_util.go
+++ b/controllers/util/solr_util.go
@@ -62,6 +62,7 @@ const (
// These are to be saved on a statefulSet update
ClusterOpsLockAnnotation = "solr.apache.org/clusterOpsLock"
ClusterOpsRetryQueueAnnotation = "solr.apache.org/clusterOpsRetryQueue"
+ StorageMinimumSizeAnnotation = "solr.apache.org/storageMinimumSize"
SolrIsNotStoppedReadinessCondition =
"solr.apache.org/isNotStopped"
SolrReplicasNotEvictedReadinessCondition =
"solr.apache.org/replicasNotEvicted"
@@ -217,6 +218,13 @@ func GenerateStatefulSet(solrCloud *solr.SolrCloud,
solrCloudStatus *solr.SolrCl
Spec: pvc.Spec,
},
}
+ if pvc.Spec.Resources.Requests.Storage() != nil {
+ annotations[StorageMinimumSizeAnnotation] =
pvc.Spec.Resources.Requests.Storage().String()
+ if podAnnotations == nil {
+ podAnnotations = make(map[string]string, 1)
+ }
+ podAnnotations[StorageMinimumSizeAnnotation] =
pvc.Spec.Resources.Requests.Storage().String()
+ }
} else {
ephemeralVolume := corev1.Volume{
Name: solrDataVolumeName,
@@ -687,6 +695,22 @@ func MaintainPreservedStatefulSetFields(expected, found
*appsv1.StatefulSet) {
}
expected.Annotations[ClusterOpsRetryQueueAnnotation] =
queue
}
+ if storage, hasStorage :=
found.Annotations[StorageMinimumSizeAnnotation]; hasStorage {
+ if expected.Annotations == nil {
+ expected.Annotations = make(map[string]string,
1)
+ }
+ expected.Annotations[StorageMinimumSizeAnnotation] =
storage
+ }
+ }
+ if found.Spec.Template.Annotations != nil {
+ // Note: the Pod template storage annotation is used to start a
rolling restart,
+ // it should always match the StatefulSet's storage annotation
+ if storage, hasStorage :=
found.Spec.Template.Annotations[StorageMinimumSizeAnnotation]; hasStorage {
+ if expected.Spec.Template.Annotations == nil {
+ expected.Spec.Template.Annotations =
make(map[string]string, 1)
+ }
+
expected.Spec.Template.Annotations[StorageMinimumSizeAnnotation] = storage
+ }
}
// Scaling (i.e. changing) the number of replicas in the SolrCloud
statefulSet is handled during the clusterOps
diff --git a/docs/solr-cloud/solr-cloud-crd.md
b/docs/solr-cloud/solr-cloud-crd.md
index 52027f0..0305188 100644
--- a/docs/solr-cloud/solr-cloud-crd.md
+++ b/docs/solr-cloud/solr-cloud-crd.md
@@ -61,8 +61,13 @@ These options can be found in `SolrCloud.spec.dataStorage`
- **`pvcTemplate`** - The template of the PVC to use for the solr data PVCs.
By default the name will be "data".
Only the `pvcTemplate.spec` field is required, metadata is optional.
- Note: This template cannot be changed unless the SolrCloud is deleted and
recreated.
- This is a [limitation of StatefulSets and PVCs in
Kubernetes](https://github.com/kubernetes/enhancements/issues/661).
+ Note: Currently, [Kubernetes does not support PVC resizing (expanding) in
StatefulSets](https://github.com/kubernetes/enhancements/issues/661).
+ However, the Solr Operator will manage the PVC expansion for users until
this is supported by default in Kubernetes.
+ Therefore `pvcTemplate.spec.resources.requests` may be updated after creation,
but all other fields of `pvcTemplate.spec` should be
considered immutable.
+
+ The storage size can only be increased (PVCs cannot be shrunk), and the
backing [`StorageClass` must allow volume
expansion](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#expanding-persistent-volumes-claims)
(`allowVolumeExpansion: true`).
+ When the size is increased, the operator resizes the data PVCs and then
performs a rolling restart of the SolrCloud so the new capacity is picked up on
each node.
+ If the storage class does not allow expansion, or the request would shrink
the PVCs, the operator emits a warning event on the SolrCloud and leaves the
storage unchanged.
- **`ephemeral`**
There are two types of ephemeral volumes that can be specified.
diff --git a/docs/upgrade-notes.md b/docs/upgrade-notes.md
index 267a80b..2a2eb17 100644
--- a/docs/upgrade-notes.md
+++ b/docs/upgrade-notes.md
@@ -134,7 +134,7 @@ _Note that the Helm chart version does not contain a `v`
prefix, which the downl
### v0.8.0
- **The minimum supported Solr version is now 8.11**
If you are unable to use a newer version of Solr, please install the
`v0.7.1` version of the Solr Operator.
- However, it is strongly suggested to upgrade to newer versions of Solr that
are actively supported.q
+ However, it is strongly suggested to upgrade to newer versions of Solr that
are actively supported.
See the [version compatibility matrix](#solr-versions) for more information.
- **Kubernetes support is now limited to 1.22+.**
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index 81138d0..a214d50 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -55,6 +55,13 @@ annotations:
# Allowed syntax is described at:
https://artifacthub.io/docs/topics/annotations/helm/#example
# 'kind' accepts values: "added", "changed", "deprecated", "removed",
"fixed" and "security"
artifacthub.io/changes: |
+ - kind: added
+ description: The operator can now resize (expand) persistent data PVCs,
which requires new RBAC permissions for persistentvolumeclaims (update/patch)
and storageclasses (get/list/watch)
+ links:
+ - name: Github Issue
+ url: https://github.com/apache/solr-operator/issues/709
+ - name: Github PR
+ url: https://github.com/apache/solr-operator/pull/712
- kind: changed
description: A container PostStart Hook is no longer used to create the
ZooKeeper ChRoot, instead the initContainer will manage this
links:
diff --git a/helm/solr-operator/templates/role.yaml
b/helm/solr-operator/templates/role.yaml
index 08c5fe4..6a267a0 100644
--- a/helm/solr-operator/templates/role.yaml
+++ b/helm/solr-operator/templates/role.yaml
@@ -47,6 +47,16 @@ rules:
- ""
resources:
- persistentvolumeclaims
+ verbs:
+ - delete
+ - get
+ - list
+ - patch
+ - update
+ - watch
+- apiGroups:
+ - ""
+ resources:
- pods
verbs:
- delete
@@ -148,6 +158,14 @@ rules:
- get
- patch
- update
+- apiGroups:
+ - storage.k8s.io
+ resources:
+ - storageclasses
+ verbs:
+ - get
+ - list
+ - watch
- apiGroups:
- zookeeper.pravega.io
resources:
diff --git a/helm/solr/Chart.yaml b/helm/solr/Chart.yaml
index 66e0251..2a59a34 100644
--- a/helm/solr/Chart.yaml
+++ b/helm/solr/Chart.yaml
@@ -42,15 +42,12 @@ annotations:
# Allowed syntax is described at:
https://artifacthub.io/docs/topics/annotations/helm/#example
artifacthub.io/changes: |
- kind: added
- description: Addition 1
+ description: Allow resizing (expanding) of persistent data PVCs
links:
- name: Github Issue
- url: https://github.com/issue-url
- - kind: changed
- description: Change 2
- links:
+ url: https://github.com/apache/solr-operator/issues/709
- name: Github PR
- url: https://github.com/pr-url
+ url: https://github.com/apache/solr-operator/pull/712
artifacthub.io/containsSecurityUpdates: "false"
artifacthub.io/recommendations: |
- url: https://artifacthub.io/packages/helm/apache-solr/solr-operator
diff --git a/main.go b/main.go
index c4aee80..d504995 100644
--- a/main.go
+++ b/main.go
@@ -199,8 +199,9 @@ func main() {
}
if err = (&controllers.SolrCloudReconciler{
- Client: mgr.GetClient(),
- Scheme: mgr.GetScheme(),
+ Client: mgr.GetClient(),
+ Scheme: mgr.GetScheme(),
+ Recorder: mgr.GetEventRecorderFor("solrcloud-controller"),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller",
"controller", "SolrCloud")
os.Exit(1)
diff --git a/tests/e2e/solrcloud_storage_test.go
b/tests/e2e/solrcloud_storage_test.go
new file mode 100644
index 0000000..9c96d05
--- /dev/null
+++ b/tests/e2e/solrcloud_storage_test.go
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package e2e
+
+import (
+ "context"
+ "time"
+
+ solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
+ "github.com/apache/solr-operator/controllers"
+ "github.com/apache/solr-operator/controllers/util"
+ . "github.com/onsi/ginkgo/v2"
+ . "github.com/onsi/gomega"
+ appsv1 "k8s.io/api/apps/v1"
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ "k8s.io/apimachinery/pkg/labels"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+var _ = Describe("E2E - SolrCloud - Storage", func() {
+	var (
+		solrCloud *solrv1beta1.SolrCloud
+
+		solrCollection1 = "e2e-1"
+
+		solrCollection2 = "e2e-2"
+	)
+
+	BeforeEach(func() {
+		solrCloud = generateBaseSolrCloud(2)
+	})
+
+	JustBeforeEach(func(ctx context.Context) {
+		By("creating the SolrCloud")
+		Expect(k8sClient.Create(ctx, solrCloud)).To(Succeed())
+
+		DeferCleanup(func(ctx context.Context) {
+			cleanupTest(ctx, solrCloud)
+		})
+
+		By("Waiting for the SolrCloud to come up healthy")
+		solrCloud = expectSolrCloudToBeReady(ctx, solrCloud)
+
+		By("creating a first Solr Collection")
+		createAndQueryCollection(ctx, solrCloud, solrCollection1, 1, 2)
+
+		By("creating a second Solr Collection")
+		createAndQueryCollection(ctx, solrCloud, solrCollection2, 2, 1)
+	})
+
+	Context("Persistent Data - Expansion", func() {
+		BeforeEach(func() {
+			solrCloud.Spec.StorageOptions = solrv1beta1.SolrDataStorageOptions{
+				PersistentStorage: &solrv1beta1.SolrPersistentDataStorageOptions{
+					PersistentVolumeClaimTemplate: solrv1beta1.PersistentVolumeClaimTemplate{
+						Spec: corev1.PersistentVolumeClaimSpec{
+							StorageClassName: new("rawfile-localpv"),
+							Resources: corev1.VolumeResourceRequirements{
+								Requests: map[corev1.ResourceName]resource.Quantity{
+									corev1.ResourceStorage: resource.MustParse("1G"),
+								},
+							},
+						},
+					},
+				},
+			}
+		})
+
+		It("Fully Expands", func(ctx context.Context) {
+			newStorageSize := resource.MustParse("1500M")
+			patchedSolrCloud := solrCloud.DeepCopy()
+			patchedSolrCloud.Spec.StorageOptions.PersistentStorage.PersistentVolumeClaimTemplate.Spec.Resources.Requests[corev1.ResourceStorage] = newStorageSize
+			By("increasing the requested storage size of the data PVCs")
+			Expect(k8sClient.Patch(ctx, patchedSolrCloud, client.MergeFrom(solrCloud))).To(Succeed(), "Could not patch the SolrCloud to increase the data PVC storage size")
+
+			// Wait for the StatefulSet to carry a PvcExpansion clusterOp lock, the first step checked after the storage request is raised
+			expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*50, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a PvcExpansion lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.PvcExpansionLock), "StatefulSet does not have a PvcExpansion lock after starting managed update.")
+			})
+
+			By("waiting for the expansion's rolling restart to begin")
+			solrCloud = expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, time.Second*30, time.Millisecond*100, func(g Gomega, found *solrv1beta1.SolrCloud) {
+				g.Expect(found.Status.UpToDateNodes).To(BeZero(), "Cloud did not get to a state with zero up-to-date replicas when rolling restart began.")
+				for _, nodeStatus := range found.Status.SolrNodes {
+					g.Expect(nodeStatus.SpecUpToDate).To(BeFalse(), "Node not starting as out-of-date when rolling restart begins: %s", nodeStatus.Name)
+				}
+			})
+
+			By("checking that the resize has been requested on all PVCs when the restart begins")
+			internalLabels := map[string]string{
+				util.SolrPVCTechnologyLabel: util.SolrCloudPVCTechnology,
+				util.SolrPVCStorageLabel:    util.SolrCloudPVCDataStorage,
+				util.SolrPVCInstanceLabel:   solrCloud.Name,
+			}
+			pvcListOps := &client.ListOptions{
+				Namespace:     solrCloud.Namespace,
+				LabelSelector: labels.SelectorFromSet(internalLabels),
+			}
+
+			foundPVCs := &corev1.PersistentVolumeClaimList{}
+			Expect(k8sClient.List(ctx, foundPVCs, pvcListOps)).To(Succeed(), "Could not fetch PVC list")
+			Expect(foundPVCs.Items).To(HaveLen(int(*solrCloud.Spec.Replicas)), "Did not find the same number of PVCs as Solr Pods")
+			for _, pvc := range foundPVCs.Items {
+				// The resize request (spec) is always set when the operator hands off to the rolling restart.
+				// The node-side filesystem resize (status.capacity) may still be pending here, since some
+				// provisioners only complete it when the volume is remounted during the restart below.
+				Expect(pvc.Spec.Resources.Requests).To(HaveKeyWithValue(corev1.ResourceStorage, newStorageSize), "The PVC %q does not have the new storage size in its resource requests", pvc.Name)
+			}
+
+			statefulSet := expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), 1, time.Millisecond, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a RollingUpdate lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.UpdateLock), "StatefulSet does not have a RollingUpdate lock after starting managed update to increase the storage size.")
+				// The lock metadata is the JSON-encoded RollingUpdateMetadata. PVC-backed clouds do not require replica migration.
+				g.Expect(clusterOp.Metadata).To(Equal(`{"requiresReplicaMigration":false}`), "StatefulSet should not require replica migration, since PVCs are being used.")
+			})
+
+			By("waiting for the rolling restart to complete")
+			// Use the default (longer) timeout, since a managed rolling restart of multiple pods waits for
+			// Solr replicas to recover between pod restarts and can take a while on a busy cluster.
+			expectSolrCloudWithChecks(ctx, solrCloud, func(g Gomega, cloud *solrv1beta1.SolrCloud) {
+				g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(*statefulSet.Spec.Replicas), "The Rolling Update never completed, not all replicas up to date")
+				g.Expect(cloud.Status.ReadyReplicas).To(BeEquivalentTo(*statefulSet.Spec.Replicas), "The Rolling Update never completed, not all replicas ready")
+			})
+
+			By("waiting for the cluster operation lock to be cleared")
+			expectStatefulSetWithConsistentChecksAndDuration(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*2, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).To(BeNil(), "StatefulSet should not have any cluster lock after finishing its rolling update.")
+			})
+
+			By("checking that all PVCs have been fully expanded (status.capacity) after the restart")
+			// The node-side filesystem resize completes as the volumes are remounted during the rolling
+			// restart, so the reported capacity is only guaranteed to reflect the new size once the
+			// restart has finished. This holds for both online- and offline-resizing provisioners.
+			Eventually(func(g Gomega) {
+				updatedPVCs := &corev1.PersistentVolumeClaimList{}
+				g.Expect(k8sClient.List(ctx, updatedPVCs, pvcListOps)).To(Succeed(), "Could not fetch PVC list")
+				g.Expect(updatedPVCs.Items).To(HaveLen(int(*solrCloud.Spec.Replicas)), "Did not find the same number of PVCs as Solr Pods")
+				for _, pvc := range updatedPVCs.Items {
+					g.Expect(pvc.Status.Capacity).To(HaveKeyWithValue(corev1.ResourceStorage, newStorageSize), "The PVC %q does not have the new storage size in its status.capacity", pvc.Name)
+				}
+			}).WithContext(ctx).WithTimeout(time.Second * 90).WithPolling(time.Second).Should(Succeed())
+
+			By("checking that the collections can be queried after the restart")
+			queryCollection(ctx, solrCloud, solrCollection1, 0)
+			queryCollection(ctx, solrCloud, solrCollection2, 0)
+		})
+	})
+})
diff --git a/tests/e2e/suite_test.go b/tests/e2e/suite_test.go
index 1ac10f8..b63d227 100644
--- a/tests/e2e/suite_test.go
+++ b/tests/e2e/suite_test.go
@@ -19,10 +19,17 @@ package e2e
import (
"bufio"
- "bytes"
"context"
"encoding/json"
"fmt"
+ "io"
+ "math/rand"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+ "time"
+
solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
"github.com/apache/solr-operator/version"
certManagerApi "github.com/cert-manager/cert-manager/pkg/api"
@@ -31,7 +38,6 @@ import (
zkApi "github.com/pravega/zookeeper-operator/api/v1beta1"
"golang.org/x/text/cases"
"golang.org/x/text/language"
- "io"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -40,16 +46,10 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
- "math/rand"
- "os"
- "path/filepath"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
- "strings"
- "testing"
- "time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
@@ -229,7 +229,7 @@ var _ = JustAfterEach(func(ctx context.Context) {
getSolrOperatorPodName(ctx,
solrOperatorReleaseNamespace),
solrOperatorReleaseNamespace,
&startTime,
- fmt.Sprintf("%q: %q", "namespace", testNamespace()),
+ fmt.Sprintf("%q:%q", "namespace", testNamespace()),
)
// Always save the logs of the Solr Operator for the test
writeAllSolrInfoToFiles(
@@ -313,11 +313,26 @@ func writeAllSolrInfoToFiles(ctx context.Context,
directory string, namespace st
for _, pod := range foundPods.Items {
writeAllPodInfoToFiles(
ctx,
- directory+pod.Name,
+ directory+pod.Name+".pod",
&pod,
)
}
+ listOps = &client.ListOptions{
+ Namespace: namespace,
+ LabelSelector: labelSelector,
+ }
+
+ foundPVCs := &corev1.PersistentVolumeClaimList{}
+ Expect(k8sClient.List(ctx, foundPVCs, listOps)).To(Succeed(), "Could
not fetch Solr PVCs")
+ Expect(foundPVCs).ToNot(BeNil(), "No Solr PVCs could be found")
+ for _, pvc := range foundPVCs.Items {
+ writeAllPvcInfoToFiles(
+ directory+pvc.Name+".pvc",
+ &pvc,
+ )
+ }
+
foundStatefulSets := &appsv1.StatefulSetList{}
Expect(k8sClient.List(ctx, foundStatefulSets, listOps)).To(Succeed(),
"Could not fetch Solr statefulSets")
Expect(foundStatefulSets).ToNot(BeNil(), "No Solr statefulSet could be
found")
@@ -388,8 +403,8 @@ func writeSolrClusterStatusInfoToFile(ctx context.Context,
baseFilename string,
func writeAllStatefulSetInfoToFiles(baseFilename string, statefulSet
*appsv1.StatefulSet) {
// Write statefulSet to a file
statusFile, err := os.Create(baseFilename + ".status.json")
- defer statusFile.Close()
Expect(err).ToNot(HaveOccurred(), "Could not open file to save
statefulSet status: %s", baseFilename+".status.json")
+ defer statusFile.Close()
jsonBytes, marshErr := json.MarshalIndent(statefulSet, "", "\t")
Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize statefulSet
json")
_, writeErr := statusFile.Write(jsonBytes)
@@ -397,8 +412,8 @@ func writeAllStatefulSetInfoToFiles(baseFilename string,
statefulSet *appsv1.Sta
// Write events for statefulSet to a file
eventsFile, err := os.Create(baseFilename + ".events.json")
- defer eventsFile.Close()
Expect(err).ToNot(HaveOccurred(), "Could not open file to save
statefulSet events: %s", baseFilename+".events.yaml")
+ defer eventsFile.Close()
eventList, err :=
rawK8sClient.CoreV1().Events(statefulSet.Namespace).Search(scheme.Scheme,
statefulSet)
Expect(err).ToNot(HaveOccurred(), "Could not find events for
statefulSet: %s", statefulSet.Name)
@@ -408,13 +423,39 @@ func writeAllStatefulSetInfoToFiles(baseFilename string,
statefulSet *appsv1.Sta
Expect(writeErr).ToNot(HaveOccurred(), "Could not write statefulSet
events json to file")
}
+// writeAllPvcInfoToFiles writes the following each to a separate file with the given base name & directory.
+//   - PVC Spec/Status, as indented JSON, to baseFilename + ".status.json"
+//   - PVC Events, as indented JSON, to baseFilename + ".events.json"
+func writeAllPvcInfoToFiles(baseFilename string, pvc *corev1.PersistentVolumeClaim) {
+	// Write PVC to a file
+	statusFile, err := os.Create(baseFilename + ".status.json")
+	Expect(err).ToNot(HaveOccurred(), "Could not open file to save PVC status: %s", baseFilename+".status.json")
+	defer statusFile.Close()
+	jsonBytes, marshErr := json.MarshalIndent(pvc, "", "\t")
+	Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize PVC json")
+	_, writeErr := statusFile.Write(jsonBytes)
+	Expect(writeErr).ToNot(HaveOccurred(), "Could not write PVC json to file")
+
+	// Write events for PVC to a file
+	eventsFile, err := os.Create(baseFilename + ".events.json")
+	Expect(err).ToNot(HaveOccurred(), "Could not open file to save PVC events: %s", baseFilename+".events.json")
+	defer eventsFile.Close()
+
+	eventList, err := rawK8sClient.CoreV1().Events(pvc.Namespace).Search(scheme.Scheme, pvc)
+	Expect(err).ToNot(HaveOccurred(), "Could not find events for PVC: %s", pvc.Name)
+	jsonBytes, marshErr = json.MarshalIndent(eventList, "", "\t")
+	Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize PVC events json")
+	_, writeErr = eventsFile.Write(jsonBytes)
+	Expect(writeErr).ToNot(HaveOccurred(), "Could not write PVC events json to file")
+}
+
// writeAllServiceInfoToFiles writes the following each to a separate file
with the given base name & directory.
// - Service
func writeAllServiceInfoToFiles(baseFilename string, service *corev1.Service) {
// Write service to a file
statusFile, err := os.Create(baseFilename + ".json")
- defer statusFile.Close()
Expect(err).ToNot(HaveOccurred(), "Could not open file to save service
status: %s", baseFilename+".json")
+ defer statusFile.Close()
jsonBytes, marshErr := json.MarshalIndent(service, "", "\t")
Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize service
json")
_, writeErr := statusFile.Write(jsonBytes)
@@ -426,8 +467,8 @@ func writeAllServiceInfoToFiles(baseFilename string,
service *corev1.Service) {
func writeAllSecretInfoToFiles(baseFilename string, secret *corev1.Secret) {
// Write service to a file
statusFile, err := os.Create(baseFilename + ".json")
- defer statusFile.Close()
Expect(err).ToNot(HaveOccurred(), "Could not open file to save secret
status: %s", baseFilename+".json")
+ defer statusFile.Close()
jsonBytes, marshErr := json.MarshalIndent(secret, "", "\t")
Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize secret
json")
_, writeErr := statusFile.Write(jsonBytes)
@@ -441,8 +482,8 @@ func writeAllSecretInfoToFiles(baseFilename string, secret
*corev1.Secret) {
func writeAllPodInfoToFiles(ctx context.Context, baseFilename string, pod
*corev1.Pod) {
// Write pod to a file
statusFile, err := os.Create(baseFilename + ".status.json")
- defer statusFile.Close()
Expect(err).ToNot(HaveOccurred(), "Could not open file to save pod
status: %s", baseFilename+".status.json")
+ defer statusFile.Close()
jsonBytes, marshErr := json.MarshalIndent(pod, "", "\t")
Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize pod json")
_, writeErr := statusFile.Write(jsonBytes)
@@ -450,8 +491,8 @@ func writeAllPodInfoToFiles(ctx context.Context,
baseFilename string, pod *corev
// Write events for pod to a file
eventsFile, err := os.Create(baseFilename + ".events.json")
- defer eventsFile.Close()
Expect(err).ToNot(HaveOccurred(), "Could not open file to save pod
events: %s", baseFilename+".events.yaml")
+ defer eventsFile.Close()
eventList, err :=
rawK8sClient.CoreV1().Events(pod.Namespace).Search(scheme.Scheme, pod)
Expect(err).ToNot(HaveOccurred(), "Could not find events for pod: %s",
pod.Name)
@@ -489,22 +530,18 @@ func writePodLogsToFile(ctx context.Context, filename
string, podName string, po
Expect(logsErr).ToNot(HaveOccurred(), "Could not open stream to fetch
pod logs. namespace: %s, pod: %s", podNamespace, podName)
defer podLogs.Close()
- var logReader io.Reader
- logReader = podLogs
-
if filterLinesWithString != "" {
- filteredWriter := bytes.NewBufferString("")
scanner := bufio.NewScanner(podLogs)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, filterLinesWithString) {
- io.WriteString(filteredWriter, line)
- io.WriteString(filteredWriter, "\n")
+ _, err = io.WriteString(logFile, line)
+ _, err = io.WriteString(logFile, "\n")
}
}
- logReader = filteredWriter
+ } else {
+ _, err = io.Copy(logFile, podLogs)
}
- _, err = io.Copy(logFile, logReader)
Expect(err).ToNot(HaveOccurred(), "Could not write podLogs to file:
%s", filename)
}
diff --git a/tests/scripts/manage_e2e_tests.sh
b/tests/scripts/manage_e2e_tests.sh
index 09a0c77..c9cfd7f 100755
--- a/tests/scripts/manage_e2e_tests.sh
+++ b/tests/scripts/manage_e2e_tests.sh
@@ -73,7 +73,7 @@ if [[ -z "${OPERATOR_IMAGE:-}" ]]; then
echo "Specify a Docker image for the Solr Operator through -i, or through
the OPERATOR_IMAGE env var" >&2 && exit 1
fi
if [[ -z "${KUBERNETES_VERSION:-}" ]]; then
- KUBERNETES_VERSION="v1.26.6"
+ KUBERNETES_VERSION="v1.33.7"
fi
if [[ -z "${SOLR_IMAGE:-}" ]]; then
SOLR_IMAGE="${SOLR_VERSION:-9.10.0}"
@@ -96,7 +96,8 @@ export RAW_GINKGO
export REUSE_KIND_CLUSTER_IF_EXISTS="${REUSE_KIND_CLUSTER_IF_EXISTS:-true}" #
This is used for all start_cluster calls
export LEAVE_KIND_CLUSTER_ON_SUCCESS="${LEAVE_KIND_CLUSTER_ON_SUCCESS:-false}"
# This is only used when using run_tests or run_with_cluster
-export CERT_MANAGER_VERSION=1.12.3
+export RAWFILE_LOCAL_PV_VERSION=0.13.1
+export CERT_MANAGER_VERSION=1.17.4
export CERT_MANAGER_CSI_DRIVER_VERSION=0.5.0
function add_image_to_kind_repo_if_local() {
@@ -190,6 +191,11 @@ function setup_cluster() {
kubectl get configmap coredns -n kube-system -o yaml | sed 's/\(.*\)ttl
30\(.*\)/\1ttl 5\2/' | kubectl replace -n kube-system -f -
echo ""
+ printf "Installing Rawfile LocalPV Provisioner\n"
+ helm repo add rawfile-localpv https://openebs.github.io/rawfile-localpv
--force-update
+ helm upgrade -i -n openebs --create-namespace rawfile-localpv
rawfile-localpv/rawfile-localpv --version "${RAWFILE_LOCAL_PV_VERSION}" --set
analytics.enabled=false
+ echo ""
+
printf "Installing Cert Manager\n"
helm repo add cert-manager https://charts.jetstack.io --force-update
helm upgrade -i -n cert-manager --create-namespace cert-manager
cert-manager/cert-manager --version "${CERT_MANAGER_VERSION}" --set
installCRDs=true