(solr-operator) branch main updated: Fix scaling when using ingress-addressed nodes (#692)

houston Wed, 03 Apr 2024 13:51:16 -0700

This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr-operator.git



The following commit(s) were added to refs/heads/main by this push:
     new 1cc55ef  Fix scaling when using ingress-addressed nodes (#692)
1cc55ef is described below

commit 1cc55eff50b8711275fec987ddb644701993a258
Author: Houston Putman <[email protected]>
AuthorDate: Wed Apr 3 16:49:59 2024 -0400

    Fix scaling when using ingress-addressed nodes (#692)
---
 api/v1beta1/solrcloud_types.go              |   7 +
 controllers/solr_cluster_ops_util.go        |  22 ++-
 controllers/solrcloud_controller.go         |  71 +++++-----
 controllers/util/common.go                  |  24 +++-
 controllers/util/solr_update_util.go        |  14 +-
 controllers/util/solr_util.go               |   3 +
 helm/solr-operator/Chart.yaml               |   8 +-
 tests/e2e/solrcloud_rolling_upgrade_test.go |   2 +-
 tests/e2e/solrcloud_scaling_test.go         | 199 +++++++++++++++++++++++++++-
 tests/e2e/suite_test.go                     |  33 +++++
 tests/e2e/test_utils_test.go                |   4 +-
 11 files changed, 341 insertions(+), 46 deletions(-)

diff --git a/api/v1beta1/solrcloud_types.go b/api/v1beta1/solrcloud_types.go
index c028337..995a9ac 100644
--- a/api/v1beta1/solrcloud_types.go
+++ b/api/v1beta1/solrcloud_types.go
@@ -1227,6 +1227,13 @@ func (sc *SolrCloud) GetAllSolrPodNames() []string {
        if sc.Spec.Replicas != nil {
                replicas = int(*sc.Spec.Replicas)
        }
+       if int(sc.Status.Replicas) > replicas {
+               replicas = int(sc.Status.Replicas)
+       }
+       return sc.GetSolrPodNames(replicas)
+}
+
+func (sc *SolrCloud) GetSolrPodNames(replicas int) []string {
        podNames := make([]string, replicas)
        statefulSetName := sc.StatefulSetName()
        for i := range podNames {
diff --git a/controllers/solr_cluster_ops_util.go 
b/controllers/solr_cluster_ops_util.go
index c9f352a..07452eb 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -151,7 +151,7 @@ func retryNextQueuedClusterOpWithQueue(statefulSet 
*appsv1.StatefulSet, clusterO
        return hasOp, err
 }
 
-func determineScaleClusterOpLockIfNecessary(ctx context.Context, r 
*SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet 
*appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod, logger 
logr.Logger) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err 
error) {
+func determineScaleClusterOpLockIfNecessary(ctx context.Context, r 
*SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet 
*appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod, 
blockReconciliationOfStatefulSet bool, logger logr.Logger) (clusterOp 
*SolrClusterOp, retryLaterDuration time.Duration, err error) {
        desiredPods := int(*instance.Spec.Replicas)
        configuredPods := int(*statefulSet.Spec.Replicas)
        if desiredPods != configuredPods {
@@ -170,11 +170,26 @@ func determineScaleClusterOpLockIfNecessary(ctx 
context.Context, r *SolrCloudRec
                                Metadata:  strconv.Itoa(configuredPods - 1),
                        }
                } else if desiredPods > configuredPods && 
(instance.Spec.Scaling.PopulatePodsOnScaleUp == nil || 
*instance.Spec.Scaling.PopulatePodsOnScaleUp) {
+                       // We need to wait for all pods to become healthy to 
scale up in a managed fashion, otherwise
+                       // the balancing will skip some pods
                        if len(podList) < configuredPods {
                                // There are not enough pods, the statefulSet 
controller has yet to create the previously desired pods.
                                // Do not start the scale up until these 
missing pods are created.
                                return nil, time.Second * 5, nil
                        }
+                       // If Solr nodes are advertised by their individual 
node services (through an ingress)
+                       // then make sure that the host aliases are set for all 
desired pods before starting a scale-up.
+                       // If the host aliases do not already include the 
soon-to-be created pods, then Solr might not be able to balance
+                       // replicas onto the new hosts.
+                       // We need to make sure that the StatefulSet is updated 
with these new hostAliases before the scale up occurs.
+                       if instance.UsesIndividualNodeServices() && 
instance.Spec.SolrAddressability.External.UseExternalAddress {
+                               for _, pod := range podList {
+                                       if len(pod.Spec.HostAliases) < 
desiredPods {
+                                               return nil, time.Second * 5, nil
+                                       }
+                               }
+                       }
+
                        clusterOp = &SolrClusterOp{
                                Operation: ScaleUpLock,
                                Metadata:  strconv.Itoa(desiredPods),
@@ -349,7 +364,8 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r 
*SolrCloudReconciler
                }
                operationComplete = true
                // Only do a re-balancing for rolling restarts that migrated 
replicas
-               if updateMetadata.RequiresReplicaMigration {
+               // If a scale-up will occur afterwards, skip the re-balancing, 
because it will occur after the scale-up anyway
+               if updateMetadata.RequiresReplicaMigration && 
*instance.Spec.Replicas <= *statefulSet.Spec.Replicas {
                        nextClusterOp = &SolrClusterOp{
                                Operation: BalanceReplicasLock,
                                Metadata:  "RollingUpdateComplete",
@@ -371,7 +387,7 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r 
*SolrCloudReconciler
                // We won't kill pods that we need the cluster state for, but 
we can kill the pods that are already not running.
                // This is important for scenarios where there is a bad pod 
config and nothing is running, but we need to do
                // a restart to get a working pod config.
-               state, retryLater, apiError := util.GetNodeReplicaState(ctx, 
instance, hasReadyPod, logger)
+               state, retryLater, apiError := util.GetNodeReplicaState(ctx, 
instance, statefulSet, hasReadyPod, logger)
                if apiError != nil {
                        return false, true, 0, nil, apiError
                } else if !retryLater {
diff --git a/controllers/solrcloud_controller.go 
b/controllers/solrcloud_controller.go
index 5ad2213..4d832cf 100644
--- a/controllers/solrcloud_controller.go
+++ b/controllers/solrcloud_controller.go
@@ -152,6 +152,11 @@ func (r *SolrCloudReconciler) Reconcile(ctx 
context.Context, req ctrl.Request) (
        hostNameIpMap := make(map[string]string)
        // Generate a service for every Node
        if instance.UsesIndividualNodeServices() {
+               // When updating the statefulSet below, the hostNameIpMap is 
just used to add new IPs or modify existing ones.
+               // When scaling down, the hostAliases that are no longer found 
here will not be removed from the hostAliases in the statefulSet pod spec.
+               // Therefore, it should be ok that we are not reconciling the 
node services that will be scaled down in the future.
+               // This is unfortunately the reality since we don't have the 
statefulSet yet to determine how many Solr pods are still running,
+               // we just have Spec.replicas which is the requested pod count.
                for _, nodeName := range solrNodeNames {
                        err, ip := r.reconcileNodeService(ctx, logger, 
instance, nodeName)
                        if err != nil {
@@ -161,6 +166,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx 
context.Context, req ctrl.Request) (
                        if 
instance.Spec.SolrAddressability.External.UseExternalAddress {
                                if ip == "" {
                                        // If we are using this IP in the 
hostAliases of the statefulSet, it needs to be set for every service before 
trying to update the statefulSet
+                                       // TODO: Make an event here
                                        blockReconciliationOfStatefulSet = true
                                } else {
                                        
hostNameIpMap[instance.AdvertisedNodeHost(nodeName)] = ip
@@ -319,36 +325,6 @@ func (r *SolrCloudReconciler) Reconcile(ctx 
context.Context, req ctrl.Request) (
                }
        }
 
-       extAddressabilityOpts := instance.Spec.SolrAddressability.External
-       if extAddressabilityOpts != nil && extAddressabilityOpts.Method == 
solrv1beta1.Ingress {
-               // Generate Ingress
-               ingress := util.GenerateIngress(instance, solrNodeNames)
-
-               // Check if the Ingress already exists
-               ingressLogger := logger.WithValues("ingress", ingress.Name)
-               foundIngress := &netv1.Ingress{}
-               err = r.Get(ctx, types.NamespacedName{Name: ingress.Name, 
Namespace: ingress.Namespace}, foundIngress)
-               if err != nil && errors.IsNotFound(err) {
-                       ingressLogger.Info("Creating Ingress")
-                       if err = 
controllerutil.SetControllerReference(instance, ingress, r.Scheme); err == nil {
-                               err = r.Create(ctx, ingress)
-                       }
-               } else if err == nil {
-                       var needsUpdate bool
-                       needsUpdate, err = util.OvertakeControllerRef(instance, 
foundIngress, r.Scheme)
-                       needsUpdate = util.CopyIngressFields(ingress, 
foundIngress, ingressLogger) || needsUpdate
-
-                       // Update the found Ingress and write the result back 
if there are any changes
-                       if needsUpdate && err == nil {
-                               ingressLogger.Info("Updating Ingress")
-                               err = r.Update(ctx, foundIngress)
-                       }
-               }
-               if err != nil {
-                       return requeueOrNot, err
-               }
-       }
-
        var statefulSet *appsv1.StatefulSet
 
        if !blockReconciliationOfStatefulSet {
@@ -416,6 +392,39 @@ func (r *SolrCloudReconciler) Reconcile(ctx 
context.Context, req ctrl.Request) (
        if err != nil {
                return requeueOrNot, err
        }
+       if statefulSet != nil && statefulSet.Spec.Replicas != nil {
+               solrNodeNames = 
instance.GetSolrPodNames(int(*statefulSet.Spec.Replicas))
+       }
+
+       extAddressabilityOpts := instance.Spec.SolrAddressability.External
+       if extAddressabilityOpts != nil && extAddressabilityOpts.Method == 
solrv1beta1.Ingress {
+               // Generate Ingress
+               ingress := util.GenerateIngress(instance, solrNodeNames)
+
+               // Check if the Ingress already exists
+               ingressLogger := logger.WithValues("ingress", ingress.Name)
+               foundIngress := &netv1.Ingress{}
+               err = r.Get(ctx, types.NamespacedName{Name: ingress.Name, 
Namespace: ingress.Namespace}, foundIngress)
+               if err != nil && errors.IsNotFound(err) {
+                       ingressLogger.Info("Creating Ingress")
+                       if err = 
controllerutil.SetControllerReference(instance, ingress, r.Scheme); err == nil {
+                               err = r.Create(ctx, ingress)
+                       }
+               } else if err == nil {
+                       var needsUpdate bool
+                       needsUpdate, err = util.OvertakeControllerRef(instance, 
foundIngress, r.Scheme)
+                       needsUpdate = util.CopyIngressFields(ingress, 
foundIngress, ingressLogger) || needsUpdate
+
+                       // Update the found Ingress and write the result back 
if there are any changes
+                       if needsUpdate && err == nil {
+                               ingressLogger.Info("Updating Ingress")
+                               err = r.Update(ctx, foundIngress)
+                       }
+               }
+               if err != nil {
+                       return requeueOrNot, err
+               }
+       }
 
        // *********************************************************
        // The operations after this require a statefulSet to exist,
@@ -546,7 +555,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx 
context.Context, req ctrl.Request) (
                        // a "locked" cluster operation
                        if clusterOp == nil {
                                _, scaleDownOpIsQueued := 
queuedRetryOps[ScaleDownLock]
-                               clusterOp, retryLaterDuration, err = 
determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, 
scaleDownOpIsQueued, podList, logger)
+                               clusterOp, retryLaterDuration, err = 
determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, 
scaleDownOpIsQueued, podList, blockReconciliationOfStatefulSet, logger)
 
                                // If the new clusterOperation is an update to 
a queued clusterOp, just change the operation that is already queued
                                if clusterOp != nil {
diff --git a/controllers/util/common.go b/controllers/util/common.go
index b887f6d..3eca00b 100644
--- a/controllers/util/common.go
+++ b/controllers/util/common.go
@@ -432,7 +432,29 @@ func CopyPodTemplates(from, to *corev1.PodTemplateSpec, 
basePath string, logger
 
        if !DeepEqualWithNils(to.Spec.HostAliases, from.Spec.HostAliases) {
                requireUpdate = true
-               to.Spec.HostAliases = from.Spec.HostAliases
+               if to.Spec.HostAliases == nil {
+                       to.Spec.HostAliases = from.Spec.HostAliases
+               } else {
+                       // Do not remove aliases that are no longer used.
+                       // This is in case Solr is scaling down and we want to 
keep the old addresses for future use.
+                       for _, fromAlias := range from.Spec.HostAliases {
+                               found := false
+                               for i, toAlias := range to.Spec.HostAliases {
+                                       if fromAlias.Hostnames[0] == 
toAlias.Hostnames[0] {
+                                               found = true
+                                               if !DeepEqualWithNils(toAlias, 
fromAlias) {
+                                                       requireUpdate = true
+                                                       to.Spec.HostAliases[i] 
= fromAlias
+                                                       break
+                                               }
+                                       }
+                               }
+                               if !found {
+                                       requireUpdate = true
+                                       to.Spec.HostAliases = 
append(to.Spec.HostAliases, fromAlias)
+                               }
+                       }
+               }
                logger.Info("Update required because field changed", "field", 
basePath+"Spec.HostAliases", "from", to.Spec.HostAliases, "to", 
from.Spec.HostAliases)
        }
 
diff --git a/controllers/util/solr_update_util.go 
b/controllers/util/solr_update_util.go
index 4f9eebe..3123544 100644
--- a/controllers/util/solr_update_util.go
+++ b/controllers/util/solr_update_util.go
@@ -24,6 +24,7 @@ import (
        "github.com/apache/solr-operator/controllers/util/solr_api"
        "github.com/go-logr/logr"
        "github.com/robfig/cron/v3"
+       appsv1 "k8s.io/api/apps/v1"
        corev1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/util/intstr"
        "net/url"
@@ -119,7 +120,7 @@ func (state NodeReplicaState) PodHasReplicas(cloud 
*solr.SolrCloud, podName stri
        return isInClusterState && contents.replicas > 0
 }
 
-func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud, 
hasReadyPod bool, logger logr.Logger) (state NodeReplicaState, retryLater bool, 
err error) {
+func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud, 
statefulSet *appsv1.StatefulSet, hasReadyPod bool, logger logr.Logger) (state 
NodeReplicaState, retryLater bool, err error) {
        clusterResp := &solr_api.SolrClusterStatusResponse{}
        overseerResp := &solr_api.SolrOverseerStatusResponse{}
 
@@ -138,7 +139,7 @@ func GetNodeReplicaState(ctx context.Context, cloud 
*solr.SolrCloud, hasReadyPod
                        }
                }
                if err == nil {
-                       state = findSolrNodeContents(clusterResp.ClusterStatus, 
overseerResp.Leader, GetAllManagedSolrNodeNames(cloud))
+                       state = findSolrNodeContents(clusterResp.ClusterStatus, 
overseerResp.Leader, GetManagedSolrNodeNames(cloud, 
int(*statefulSet.Spec.Replicas)))
                } else {
                        logger.Error(err, "Could not fetch cluster state 
information for cloud")
                }
@@ -535,6 +536,15 @@ func GetAllManagedSolrNodeNames(solrCloud *solr.SolrCloud) 
map[string]bool {
        return allNodeNames
 }
 
+func GetManagedSolrNodeNames(solrCloud *solr.SolrCloud, 
currentlyConfiguredPodCount int) map[string]bool { // Returns the set of Solr node names for the pod names given by solrCloud.GetSolrPodNames(currentlyConfiguredPodCount).
+       podNames := solrCloud.GetSolrPodNames(currentlyConfiguredPodCount)
+       allNodeNames := make(map[string]bool, len(podNames)) // Pre-sized to hold one entry per pod name.
+       for _, podName := range podNames {
+               allNodeNames[SolrNodeName(solrCloud, podName)] = true // Keyed by the pod's Solr node name; the value only marks membership.
+       }
+       return allNodeNames
+}
+
 // EvictReplicasForPodIfNecessary takes a solr Pod and migrates all replicas 
off of that Pod.
 // For updates this will only be called for pods using ephemeral data.
 // For scale-down operations, this can be called for pods using ephemeral or 
persistent data.
diff --git a/controllers/util/solr_util.go b/controllers/util/solr_util.go
index 47f8c07..de44d7c 100644
--- a/controllers/util/solr_util.go
+++ b/controllers/util/solr_util.go
@@ -556,6 +556,9 @@ func GenerateStatefulSet(solrCloud *solr.SolrCloud, 
solrCloudStatus *solr.SolrCl
                        VolumeClaimTemplates: pvcs,
                },
        }
+       if solrCloud.UsesHeadlessService() {
+               stateful.Spec.Template.Spec.Subdomain = 
solrCloud.HeadlessServiceName()
+       }
 
        var imagePullSecrets []corev1.LocalObjectReference
 
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index fd2b6da..5a954c2 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -61,7 +61,6 @@ annotations:
           url: https://github.com/apache/solr-operator/issues/624
         - name: Github PR
           url: https://github.com/apache/solr-operator/pull/648
-    - kind: fixed
       description: Avoid reset of security.json if get request fails  
       links:
         - name: Github Issue
@@ -89,6 +88,13 @@ annotations:
           url: https://issues.apache.org/jira/browse/SOLR-17216
         - name: Github PR
           url: https://github.com/apache/solr-operator/pull/698
+    - kind: fixed
+      description: SolrClouds addressed via an Ingress now scale up and down 
safely.
+      links:
+        - name: Github Issue
+          url: https://github.com/apache/solr-operator/issues/682
+        - name: Github PR
+          url: https://github.com/apache/solr-operator/pull/692
   artifacthub.io/images: |
     - name: solr-operator
       image: apache/solr-operator:v0.9.0-prerelease
diff --git a/tests/e2e/solrcloud_rolling_upgrade_test.go 
b/tests/e2e/solrcloud_rolling_upgrade_test.go
index 6143342..faf2175 100644
--- a/tests/e2e/solrcloud_rolling_upgrade_test.go
+++ b/tests/e2e/solrcloud_rolling_upgrade_test.go
@@ -163,7 +163,7 @@ var _ = FDescribe("E2E - SolrCloud - Rolling Upgrades", 
func() {
                        }
 
                        By("waiting for the balanceReplicas to finish")
-                       expectStatefulSetWithChecks(ctx, solrCloud, 
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+                       expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*30, time.Second, func(g Gomega, found 
*appsv1.StatefulSet) {
                                clusterOp, err := 
controllers.GetCurrentClusterOp(found)
                                g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
                                g.Expect(clusterOp).To(BeNil(), "StatefulSet 
should not have a balanceReplicas lock after balancing is complete.")
diff --git a/tests/e2e/solrcloud_scaling_test.go 
b/tests/e2e/solrcloud_scaling_test.go
index 53d3630..106dd4f 100644
--- a/tests/e2e/solrcloud_scaling_test.go
+++ b/tests/e2e/solrcloud_scaling_test.go
@@ -140,6 +140,110 @@ var _ = FDescribe("E2E - SolrCloud - Scale Down", func() {
                })
        })
 
+       FContext("with replica migration using an ingress for node addresses", 
func() {
+
+               BeforeEach(func() {
+                       solrCloud.Spec.SolrAddressability = 
solrv1beta1.SolrAddressabilityOptions{
+                               External: &solrv1beta1.ExternalAddressability{
+                                       Method:             solrv1beta1.Ingress,
+                                       UseExternalAddress: true,
+                                       DomainName:         "test.solr.org",
+                               },
+                       }
+               })
+
+               FIt("Scales Down", func(ctx context.Context) {
+                       originalSolrCloud := solrCloud.DeepCopy()
+                       solrCloud.Spec.Replicas = pointer.Int32(1)
+                       By("triggering a scale down via solrCloud replicas")
+                       Expect(k8sClient.Patch(ctx, solrCloud, 
client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud 
replicas to initiate scale down")
+
+                       By("waiting for the scaleDown of first pod to begin")
+                       expectStatefulSetWithChecks(ctx, solrCloud, 
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+                               
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should still have 3 pods, because the scale down should first move Solr 
replicas")
+                               clusterOp, err := 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet 
does not have a scaleDown lock.")
+                               
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet 
does not have a scaleDown lock.")
+                               g.Expect(clusterOp.Metadata).To(Equal("2"), 
"StatefulSet scaling lock operation has the wrong metadata.")
+                       })
+                       queryCollection(ctx, solrCloud, solrCollection2, 0)
+
+                       // Make sure that ingress still has 3 nodes at this 
point
+                       ingress := expectIngress(ctx, solrCloud, 
solrCloud.CommonIngressName())
+                       Expect(ingress.Spec.Rules).To(HaveLen(4), "Wrong number 
of ingress rules. 3 nodes + 1 common endpoint. The third node rule should not 
be deleted until the node itself is deleted")
+
+                       By("waiting for the scaleDown of the first pod to 
finish")
+                       expectStatefulSetWithChecks(ctx, solrCloud, 
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+                               
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet 
should now have 2 pods, after the replicas have been moved off the first pod.")
+                               clusterOp, err := 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet 
does not have a scaleDown lock.")
+                               
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet 
does not have a scaleDown lock.")
+                               g.Expect(clusterOp.Metadata).To(Equal("2"), 
"StatefulSet scaling lock operation has the wrong metadata.")
+                       })
+                       queryCollection(ctx, solrCloud, solrCollection2, 0)
+
+                       // Wait till the pod has actually been deleted
+                       expectStatefulSetWithChecks(ctx, solrCloud, 
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+                               
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet 
should now have 2 pods, after the replicas have been moved off the first pod.")
+                       })
+
+                       By("waiting for the scaleDown of second pod to begin")
+                       statefulSet := expectStatefulSetWithChecks(ctx, 
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found 
*appsv1.StatefulSet) {
+                               clusterOp, err := 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet 
does not have a scaleDown lock.")
+                               
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet 
does not have a scaleDown lock.")
+                               g.Expect(clusterOp.Metadata).To(Equal("1"), 
"StatefulSet scaling lock operation has the wrong metadata.")
+                       })
+
+                       // Make sure that ingress still has 2 nodes at this 
point
+                       ingress = expectIngress(ctx, solrCloud, 
solrCloud.CommonIngressName())
+                       Expect(ingress.Spec.Rules).To(HaveLen(3), "Wrong number 
of ingress rules. 2 nodes + 1 common endpoint. The second node rule should not 
be deleted until the node itself is deleted")
+
+                       // When the next scale down happens, the 3rd solr pod 
(ordinal 2) should be gone, and the statefulSet replicas should be 2 across the 
board.
+                       // The first scale down should not be complete until 
this is done.
+                       
Expect(statefulSet.Spec.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet 
should still have 2 pods configured, because the scale down should first move 
Solr replicas")
+                       
Expect(statefulSet.Status.Replicas).To(HaveValue(BeEquivalentTo(2)), 
"StatefulSet should only have 2 pods running, because previous pod scale down 
should have completely finished")
+                       // This pod check must happen after the above 
clusterLock and replicas check.
+                       // The StatefulSet controller might take a good amount 
of time to actually delete the pod,
+                       // and the replica migration/cluster op might already 
be done by the time the first pod is deleted.
+                       expectNoPodNow(ctx, solrCloud, 
solrCloud.GetSolrPodName(2))
+                       queryCollection(ctx, solrCloud, solrCollection1, 0)
+
+                       By("waiting for the scaleDown to finish")
+                       statefulSet = expectStatefulSetWithChecks(ctx, 
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found 
*appsv1.StatefulSet) {
+                               
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(1)), "StatefulSet 
should now have 1 pods, after the replicas have been moved.")
+                       })
+                       // Once the scale down actually occurs, the clusterOp 
is not complete. We need to wait till the last pod is deleted
+                       clusterOp, err := 
controllers.GetCurrentClusterOp(statefulSet)
+                       Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not 
have a scaleDown lock.")
+                       
Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet 
does not have a scaleDown lock.")
+                       Expect(clusterOp.Metadata).To(Equal("1"), "StatefulSet 
scaling lock operation has the wrong metadata.")
+
+                       // Wait for the last pod to be deleted
+                       expectStatefulSetWithChecks(ctx, solrCloud, 
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+                               
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(1)), "StatefulSet 
should now have 1 pods, after the replicas have been moved.")
+                       })
+                       // Once the scale down actually occurs, the statefulSet 
annotations should be removed very soon
+                       expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*2, time.Millisecond*500, func(g 
Gomega, found *appsv1.StatefulSet) {
+                               clusterOp, err = 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).To(BeNil(), "StatefulSet 
should not have a ScaleDown lock after scaling is complete.")
+                       })
+
+                       // Make sure that ingress has 1 node at this point
+                       ingress = expectIngress(ctx, solrCloud, 
solrCloud.CommonIngressName())
+                       Expect(ingress.Spec.Rules).To(HaveLen(2), "Wrong number 
of ingress rules. 1 nodes + 1 common endpoint.")
+
+                       expectNoPod(ctx, solrCloud, solrCloud.GetSolrPodName(1))
+
+                       queryCollection(ctx, solrCloud, solrCollection1, 0)
+                       queryCollection(ctx, solrCloud, solrCollection2, 0)
+               })
+       })
+
        FContext("with replica migration and deleted persistent data", func() {
 
                BeforeEach(func() {
@@ -240,7 +344,7 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
                FIt("Scales Up", func(ctx context.Context) {
                        originalSolrCloud := solrCloud.DeepCopy()
                        solrCloud.Spec.Replicas = pointer.Int32(int32(3))
-                       By("triggering a scale down via solrCloud replicas")
+                       By("triggering a scale up via solrCloud replicas")
                        Expect(k8sClient.Patch(ctx, solrCloud, 
client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud 
replicas to initiate scale up")
 
                        By("waiting for the scaleUp to begin")
@@ -254,9 +358,11 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
 
                        // The first step is to increase the number of pods
                        // Check very often, as the new pods will be created 
quickly, which will cause the cluster op to change.
-                       statefulSet = 
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, 
found *appsv1.StatefulSet) {
-                               
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should still have 3 pods, because the scale down should first move Solr 
replicas")
-                       })
+                       if int(*statefulSet.Spec.Replicas) != 3 {
+                               statefulSet = 
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, 
found *appsv1.StatefulSet) {
+                                       
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should be configured for 3 pods when scaling up")
+                               })
+                       }
                        clusterOp, err := 
controllers.GetCurrentClusterOp(statefulSet)
                        Expect(err).ToNot(HaveOccurred(), "Error occurred while 
finding clusterLock for SolrCloud")
                        Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not 
have a scaleUp lock.")
@@ -265,7 +371,7 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
 
                        // Wait for new pods to come up, and when they do we 
should be doing a balanceReplicas clusterOp
                        statefulSet = expectStatefulSetWithChecks(ctx, 
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found 
*appsv1.StatefulSet) {
-                               
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should still have 3 pods, because the scale down should first move Solr 
replicas")
+                               
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should eventually have 3 pods created for it")
                        })
                        clusterOp, err = 
controllers.GetCurrentClusterOp(statefulSet)
                        Expect(err).ToNot(HaveOccurred(), "Error occurred while 
finding clusterLock for SolrCloud")
@@ -285,6 +391,89 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
                })
        })
 
+       FContext("with replica migration using an ingress for node addresses", 
func() {
+
+               BeforeEach(func() {
+                       solrCloud.Spec.Replicas = pointer.Int32(2)
+                       solrCloud.Spec.SolrAddressability = 
solrv1beta1.SolrAddressabilityOptions{
+                               External: &solrv1beta1.ExternalAddressability{
+                                       Method:             solrv1beta1.Ingress,
+                                       UseExternalAddress: true,
+                                       DomainName:         "test.solr.org",
+                               },
+                       }
+               })
+
+               FIt("Scales Up", func(ctx context.Context) {
+                       originalSolrCloud := solrCloud.DeepCopy()
+                       solrCloud.Spec.Replicas = pointer.Int32(int32(3))
+                       By("triggering a scale up via solrCloud replicas")
+                       Expect(k8sClient.Patch(ctx, solrCloud, 
client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud 
replicas to initiate scale up")
+
+                       By("waiting for the rolling restart to begin with 
hostAliases")
+                       statefulSet := 
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, 
found *appsv1.StatefulSet) {
+                               clusterOp, err := 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet 
does not have a update lock to add hostAliases.")
+                               
g.Expect(clusterOp.Operation).To(Equal(controllers.UpdateLock), "StatefulSet 
does not have a update lock to add hostAliases.")
+                               
g.Expect(found.Spec.Template.Spec.HostAliases).To(HaveLen(3), "Host aliases in 
the pod template do not have length 3, which is the number of pods to scale up 
to")
+                       })
+                       expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, 
time.Second*5, time.Millisecond*5, func(g Gomega, cloud *solrv1beta1.SolrCloud) 
{
+                               
g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(0), "The Rolling Update 
never started, upToDateNodes should eventually be 0 when starting a restart")
+                       })
+
+                       By("Wait for the rolling update to be done")
+                       expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, 
time.Second*90, time.Millisecond*5, func(g Gomega, cloud 
*solrv1beta1.SolrCloud) {
+                               
g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(*statefulSet.Spec.Replicas),
 "The Rolling Update never completed, not all replicas up to date")
+                               
g.Expect(cloud.Status.ReadyReplicas).To(BeEquivalentTo(*statefulSet.Spec.Replicas),
 "The Rolling Update never completed, not all replicas ready")
+                       })
+
+                       By("waiting for the scaleUp to begin")
+                       statefulSet = 
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond, func(g Gomega, 
found *appsv1.StatefulSet) {
+                               clusterOp, err := 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet 
does not have a scaleUp lock.")
+                               
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleUpLock), "StatefulSet 
does not have a scaleUp lock.")
+                               g.Expect(clusterOp.Metadata).To(Equal("3"), 
"StatefulSet scaling lock operation has the wrong metadata.")
+                       })
+
+                       // The first step is to increase the number of pods
+                       // Check very often, as the new pods will be created 
quickly, which will cause the cluster op to change.
+                       if int(*statefulSet.Spec.Replicas) != 3 {
+                               statefulSet = 
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, 
found *appsv1.StatefulSet) {
+                                       
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should be configured for 3 pods when scaling up")
+                               })
+                       }
+                       clusterOp, err := 
controllers.GetCurrentClusterOp(statefulSet)
+                       Expect(err).ToNot(HaveOccurred(), "Error occurred while 
finding clusterLock for SolrCloud")
+                       Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not 
have a scaleUp lock.")
+                       
Expect(clusterOp.Operation).To(Equal(controllers.ScaleUpLock), "StatefulSet 
does not have a scaleUp lock.")
+                       Expect(clusterOp.Metadata).To(Equal("3"), "StatefulSet 
scaling lock operation has the wrong metadata.")
+
+                       // Wait for new pods to come up, and when they do we 
should be doing a balanceReplicas clusterOp
+                       statefulSet = expectStatefulSetWithChecks(ctx, 
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found 
*appsv1.StatefulSet) {
+                               
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet 
should eventually have 3 pods created for it")
+                       })
+                       statefulSet = 
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, 
solrCloud.StatefulSetName(), time.Millisecond*50, time.Millisecond*10, func(g 
Gomega, found *appsv1.StatefulSet) {
+                               clusterOp, err = 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet 
does not have a balanceReplicas lock after new pods are created.")
+                               
g.Expect(clusterOp.Operation).To(Equal(controllers.BalanceReplicasLock), 
"StatefulSet does not have a balanceReplicas lock after new pods are created.")
+                               
g.Expect(clusterOp.Metadata).To(Equal("ScaleUp"), "StatefulSet balanceReplicas 
lock operation has the wrong metadata.")
+                       })
+
+                       By("waiting for the scaleUp to finish")
+                       statefulSet = expectStatefulSetWithChecks(ctx, 
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found 
*appsv1.StatefulSet) {
+                               clusterOp, err := 
controllers.GetCurrentClusterOp(found)
+                               g.Expect(err).ToNot(HaveOccurred(), "Error 
occurred while finding clusterLock for SolrCloud")
+                               g.Expect(clusterOp).To(BeNil(), "StatefulSet 
should not have a balanceReplicas lock after balancing is complete.")
+                       })
+
+                       queryCollection(ctx, solrCloud, solrCollection1, 0)
+                       queryCollection(ctx, solrCloud, solrCollection2, 0)
+               })
+       })
+
        FContext("without replica migration", func() {
 
                BeforeEach(func() {
diff --git a/tests/e2e/suite_test.go b/tests/e2e/suite_test.go
index f27a90f..8b38a98 100644
--- a/tests/e2e/suite_test.go
+++ b/tests/e2e/suite_test.go
@@ -311,6 +311,26 @@ func writeAllSolrInfoToFiles(ctx context.Context, 
directory string, namespace st
                        &statefulSet,
                )
        }
+
+       // Unfortunately the services don't have the technology label
+       req, err = labels.NewRequirement("solr-cloud", selection.Exists, 
make([]string, 0))
+       Expect(err).ToNot(HaveOccurred())
+
+       labelSelector = labels.Everything().Add(*req)
+       listOps = &client.ListOptions{
+               Namespace:     namespace,
+               LabelSelector: labelSelector,
+       }
+
+       foundServices := &corev1.ServiceList{}
+       Expect(k8sClient.List(ctx, foundServices, listOps)).To(Succeed(), 
"Could not fetch Solr pods")
+       Expect(foundServices).ToNot(BeNil(), "No Solr services could be found")
+       for _, service := range foundServices.Items {
+               writeAllServiceInfoToFiles(
+                       directory+service.Name+".service",
+                       &service,
+               )
+       }
 }
 
 // writeAllStatefulSetInfoToFiles writes the following each to a separate file 
with the given base name & directory.
@@ -339,6 +359,19 @@ func writeAllStatefulSetInfoToFiles(baseFilename string, 
statefulSet *appsv1.Sta
        Expect(writeErr).ToNot(HaveOccurred(), "Could not write statefulSet 
events json to file")
 }
 
+// writeAllServiceInfoToFiles writes the following each to a separate file with the given base name & directory.
+//   - Service
+func writeAllServiceInfoToFiles(baseFilename string, service *corev1.Service) {
+       // Write service to a file; Close is deferred only after Create is known to have succeeded
+       statusFile, err := os.Create(baseFilename + ".json")
+       Expect(err).ToNot(HaveOccurred(), "Could not open file to save service status: %s", baseFilename+".json")
+       defer statusFile.Close()
+       jsonBytes, marshErr := json.MarshalIndent(service, "", "\t")
+       Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize service json")
+       _, writeErr := statusFile.Write(jsonBytes)
+       Expect(writeErr).ToNot(HaveOccurred(), "Could not write service json to file")
+}
+
 // writeAllPodInfoToFile writes the following each to a separate file with the 
given base name & directory.
 //   - Pod Spec/Status
 //   - Pod Events
diff --git a/tests/e2e/test_utils_test.go b/tests/e2e/test_utils_test.go
index b28df88..c3e693c 100644
--- a/tests/e2e/test_utils_test.go
+++ b/tests/e2e/test_utils_test.go
@@ -475,10 +475,10 @@ func callSolrApiInPod(ctx context.Context, solrCloud 
*solrv1beta1.SolrCloud, htt
                "-verbose",
                "-" + strings.ToLower(httpMethod),
                fmt.Sprintf(
-                       "\"%s://%s:%d%s%s\"",
+                       "\"%s://%s%s%s%s\"",
                        solrCloud.UrlScheme(false),
                        hostname,
-                       solrCloud.Spec.SolrAddressability.PodPort,
+                       solrCloud.NodePortSuffix(false),
                        apiPath,
                        queryParamsString),
        }

(solr-operator) branch main updated: Fix scaling when using ingress-addressed nodes (#692)

Reply via email to