This is an automated email from the ASF dual-hosted git repository.
houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr-operator.git
The following commit(s) were added to refs/heads/main by this push:
new 1cc55ef Fix scaling when using ingress-addressed nodes (#692)
1cc55ef is described below
commit 1cc55eff50b8711275fec987ddb644701993a258
Author: Houston Putman <[email protected]>
AuthorDate: Wed Apr 3 16:49:59 2024 -0400
Fix scaling when using ingress-addressed nodes (#692)
---
api/v1beta1/solrcloud_types.go | 7 +
controllers/solr_cluster_ops_util.go | 22 ++-
controllers/solrcloud_controller.go | 71 +++++-----
controllers/util/common.go | 24 +++-
controllers/util/solr_update_util.go | 14 +-
controllers/util/solr_util.go | 3 +
helm/solr-operator/Chart.yaml | 8 +-
tests/e2e/solrcloud_rolling_upgrade_test.go | 2 +-
tests/e2e/solrcloud_scaling_test.go | 199 +++++++++++++++++++++++++++-
tests/e2e/suite_test.go | 33 +++++
tests/e2e/test_utils_test.go | 4 +-
11 files changed, 341 insertions(+), 46 deletions(-)
diff --git a/api/v1beta1/solrcloud_types.go b/api/v1beta1/solrcloud_types.go
index c028337..995a9ac 100644
--- a/api/v1beta1/solrcloud_types.go
+++ b/api/v1beta1/solrcloud_types.go
@@ -1227,6 +1227,13 @@ func (sc *SolrCloud) GetAllSolrPodNames() []string {
if sc.Spec.Replicas != nil {
replicas = int(*sc.Spec.Replicas)
}
+ if int(sc.Status.Replicas) > replicas {
+ replicas = int(sc.Status.Replicas)
+ }
+ return sc.GetSolrPodNames(replicas)
+}
+
+func (sc *SolrCloud) GetSolrPodNames(replicas int) []string {
podNames := make([]string, replicas)
statefulSetName := sc.StatefulSetName()
for i := range podNames {
diff --git a/controllers/solr_cluster_ops_util.go
b/controllers/solr_cluster_ops_util.go
index c9f352a..07452eb 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -151,7 +151,7 @@ func retryNextQueuedClusterOpWithQueue(statefulSet
*appsv1.StatefulSet, clusterO
return hasOp, err
}
-func determineScaleClusterOpLockIfNecessary(ctx context.Context, r
*SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet
*appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod, logger
logr.Logger) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err
error) {
+func determineScaleClusterOpLockIfNecessary(ctx context.Context, r
*SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet
*appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod,
blockReconciliationOfStatefulSet bool, logger logr.Logger) (clusterOp
*SolrClusterOp, retryLaterDuration time.Duration, err error) {
desiredPods := int(*instance.Spec.Replicas)
configuredPods := int(*statefulSet.Spec.Replicas)
if desiredPods != configuredPods {
@@ -170,11 +170,26 @@ func determineScaleClusterOpLockIfNecessary(ctx
context.Context, r *SolrCloudRec
Metadata: strconv.Itoa(configuredPods - 1),
}
} else if desiredPods > configuredPods &&
(instance.Spec.Scaling.PopulatePodsOnScaleUp == nil ||
*instance.Spec.Scaling.PopulatePodsOnScaleUp) {
+ // We need to wait for all pods to become healthy to
scale up in a managed fashion, otherwise
+ // the balancing will skip some pods
if len(podList) < configuredPods {
// There are not enough pods, the statefulSet
controller has yet to create the previously desired pods.
// Do not start the scale up until these
missing pods are created.
return nil, time.Second * 5, nil
}
+ // If Solr nodes are advertised by their individual
node services (through an ingress)
+ // then make sure that the host aliases are set for all
desired pods before starting a scale-up.
+ // If the host aliases do not already include the
soon-to-be created pods, then Solr might not be able to balance
+ // replicas onto the new hosts.
+ // We need to make sure that the StatefulSet is updated
with these new hostAliases before the scale up occurs.
+ if instance.UsesIndividualNodeServices() &&
instance.Spec.SolrAddressability.External.UseExternalAddress {
+ for _, pod := range podList {
+ if len(pod.Spec.HostAliases) <
desiredPods {
+ return nil, time.Second * 5, nil
+ }
+ }
+ }
+
clusterOp = &SolrClusterOp{
Operation: ScaleUpLock,
Metadata: strconv.Itoa(desiredPods),
@@ -349,7 +364,8 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r
*SolrCloudReconciler
}
operationComplete = true
// Only do a re-balancing for rolling restarts that migrated
replicas
- if updateMetadata.RequiresReplicaMigration {
+ // If a scale-up will occur afterwards, skip the re-balancing,
because it will occur after the scale-up anyway
+ if updateMetadata.RequiresReplicaMigration &&
*instance.Spec.Replicas <= *statefulSet.Spec.Replicas {
nextClusterOp = &SolrClusterOp{
Operation: BalanceReplicasLock,
Metadata: "RollingUpdateComplete",
@@ -371,7 +387,7 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r
*SolrCloudReconciler
// We won't kill pods that we need the cluster state for, but
we can kill the pods that are already not running.
// This is important for scenarios where there is a bad pod
config and nothing is running, but we need to do
// a restart to get a working pod config.
- state, retryLater, apiError := util.GetNodeReplicaState(ctx,
instance, hasReadyPod, logger)
+ state, retryLater, apiError := util.GetNodeReplicaState(ctx,
instance, statefulSet, hasReadyPod, logger)
if apiError != nil {
return false, true, 0, nil, apiError
} else if !retryLater {
diff --git a/controllers/solrcloud_controller.go
b/controllers/solrcloud_controller.go
index 5ad2213..4d832cf 100644
--- a/controllers/solrcloud_controller.go
+++ b/controllers/solrcloud_controller.go
@@ -152,6 +152,11 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
hostNameIpMap := make(map[string]string)
// Generate a service for every Node
if instance.UsesIndividualNodeServices() {
+ // When updating the statefulSet below, the hostNameIpMap is
just used to add new IPs or modify existing ones.
+ // When scaling down, the hostAliases that are no longer found
here will not be removed from the hostAliases in the statefulSet pod spec.
+ // Therefore, it should be ok that we are not reconciling the
node services that will be scaled down in the future.
+ // This is unfortunately the reality since we don't have the
statefulSet yet to determine how many Solr pods are still running,
+ // we just have Spec.replicas which is the requested pod count.
for _, nodeName := range solrNodeNames {
err, ip := r.reconcileNodeService(ctx, logger,
instance, nodeName)
if err != nil {
@@ -161,6 +166,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
if
instance.Spec.SolrAddressability.External.UseExternalAddress {
if ip == "" {
// If we are using this IP in the
hostAliases of the statefulSet, it needs to be set for every service before
trying to update the statefulSet
+ // TODO: Make an event here
blockReconciliationOfStatefulSet = true
} else {
hostNameIpMap[instance.AdvertisedNodeHost(nodeName)] = ip
@@ -319,36 +325,6 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
}
}
- extAddressabilityOpts := instance.Spec.SolrAddressability.External
- if extAddressabilityOpts != nil && extAddressabilityOpts.Method ==
solrv1beta1.Ingress {
- // Generate Ingress
- ingress := util.GenerateIngress(instance, solrNodeNames)
-
- // Check if the Ingress already exists
- ingressLogger := logger.WithValues("ingress", ingress.Name)
- foundIngress := &netv1.Ingress{}
- err = r.Get(ctx, types.NamespacedName{Name: ingress.Name,
Namespace: ingress.Namespace}, foundIngress)
- if err != nil && errors.IsNotFound(err) {
- ingressLogger.Info("Creating Ingress")
- if err =
controllerutil.SetControllerReference(instance, ingress, r.Scheme); err == nil {
- err = r.Create(ctx, ingress)
- }
- } else if err == nil {
- var needsUpdate bool
- needsUpdate, err = util.OvertakeControllerRef(instance,
foundIngress, r.Scheme)
- needsUpdate = util.CopyIngressFields(ingress,
foundIngress, ingressLogger) || needsUpdate
-
- // Update the found Ingress and write the result back
if there are any changes
- if needsUpdate && err == nil {
- ingressLogger.Info("Updating Ingress")
- err = r.Update(ctx, foundIngress)
- }
- }
- if err != nil {
- return requeueOrNot, err
- }
- }
-
var statefulSet *appsv1.StatefulSet
if !blockReconciliationOfStatefulSet {
@@ -416,6 +392,39 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
if err != nil {
return requeueOrNot, err
}
+ if statefulSet != nil && statefulSet.Spec.Replicas != nil {
+ solrNodeNames =
instance.GetSolrPodNames(int(*statefulSet.Spec.Replicas))
+ }
+
+ extAddressabilityOpts := instance.Spec.SolrAddressability.External
+ if extAddressabilityOpts != nil && extAddressabilityOpts.Method ==
solrv1beta1.Ingress {
+ // Generate Ingress
+ ingress := util.GenerateIngress(instance, solrNodeNames)
+
+ // Check if the Ingress already exists
+ ingressLogger := logger.WithValues("ingress", ingress.Name)
+ foundIngress := &netv1.Ingress{}
+ err = r.Get(ctx, types.NamespacedName{Name: ingress.Name,
Namespace: ingress.Namespace}, foundIngress)
+ if err != nil && errors.IsNotFound(err) {
+ ingressLogger.Info("Creating Ingress")
+ if err =
controllerutil.SetControllerReference(instance, ingress, r.Scheme); err == nil {
+ err = r.Create(ctx, ingress)
+ }
+ } else if err == nil {
+ var needsUpdate bool
+ needsUpdate, err = util.OvertakeControllerRef(instance,
foundIngress, r.Scheme)
+ needsUpdate = util.CopyIngressFields(ingress,
foundIngress, ingressLogger) || needsUpdate
+
+ // Update the found Ingress and write the result back
if there are any changes
+ if needsUpdate && err == nil {
+ ingressLogger.Info("Updating Ingress")
+ err = r.Update(ctx, foundIngress)
+ }
+ }
+ if err != nil {
+ return requeueOrNot, err
+ }
+ }
// *********************************************************
// The operations after this require a statefulSet to exist,
@@ -546,7 +555,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx
context.Context, req ctrl.Request) (
// a "locked" cluster operation
if clusterOp == nil {
_, scaleDownOpIsQueued :=
queuedRetryOps[ScaleDownLock]
- clusterOp, retryLaterDuration, err =
determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet,
scaleDownOpIsQueued, podList, logger)
+ clusterOp, retryLaterDuration, err =
determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet,
scaleDownOpIsQueued, podList, blockReconciliationOfStatefulSet, logger)
// If the new clusterOperation is an update to
a queued clusterOp, just change the operation that is already queued
if clusterOp != nil {
diff --git a/controllers/util/common.go b/controllers/util/common.go
index b887f6d..3eca00b 100644
--- a/controllers/util/common.go
+++ b/controllers/util/common.go
@@ -432,7 +432,29 @@ func CopyPodTemplates(from, to *corev1.PodTemplateSpec,
basePath string, logger
if !DeepEqualWithNils(to.Spec.HostAliases, from.Spec.HostAliases) {
requireUpdate = true
- to.Spec.HostAliases = from.Spec.HostAliases
+ if to.Spec.HostAliases == nil {
+ to.Spec.HostAliases = from.Spec.HostAliases
+ } else {
+ // Do not remove aliases that are no longer used.
+ // This is in case Solr is scaling down and we want to
keep the old addresses for future use.
+ for _, fromAlias := range from.Spec.HostAliases {
+ found := false
+ for i, toAlias := range to.Spec.HostAliases {
+ if fromAlias.Hostnames[0] ==
toAlias.Hostnames[0] {
+ found = true
+ if !DeepEqualWithNils(toAlias,
fromAlias) {
+ requireUpdate = true
+ to.Spec.HostAliases[i]
= fromAlias
+ break
+ }
+ }
+ }
+ if !found {
+ requireUpdate = true
+ to.Spec.HostAliases =
append(to.Spec.HostAliases, fromAlias)
+ }
+ }
+ }
logger.Info("Update required because field changed", "field",
basePath+"Spec.HostAliases", "from", to.Spec.HostAliases, "to",
from.Spec.HostAliases)
}
diff --git a/controllers/util/solr_update_util.go
b/controllers/util/solr_update_util.go
index 4f9eebe..3123544 100644
--- a/controllers/util/solr_update_util.go
+++ b/controllers/util/solr_update_util.go
@@ -24,6 +24,7 @@ import (
"github.com/apache/solr-operator/controllers/util/solr_api"
"github.com/go-logr/logr"
"github.com/robfig/cron/v3"
+ appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"net/url"
@@ -119,7 +120,7 @@ func (state NodeReplicaState) PodHasReplicas(cloud
*solr.SolrCloud, podName stri
return isInClusterState && contents.replicas > 0
}
-func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud,
hasReadyPod bool, logger logr.Logger) (state NodeReplicaState, retryLater bool,
err error) {
+func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud,
statefulSet *appsv1.StatefulSet, hasReadyPod bool, logger logr.Logger) (state
NodeReplicaState, retryLater bool, err error) {
clusterResp := &solr_api.SolrClusterStatusResponse{}
overseerResp := &solr_api.SolrOverseerStatusResponse{}
@@ -138,7 +139,7 @@ func GetNodeReplicaState(ctx context.Context, cloud
*solr.SolrCloud, hasReadyPod
}
}
if err == nil {
- state = findSolrNodeContents(clusterResp.ClusterStatus,
overseerResp.Leader, GetAllManagedSolrNodeNames(cloud))
+ state = findSolrNodeContents(clusterResp.ClusterStatus,
overseerResp.Leader, GetManagedSolrNodeNames(cloud,
int(*statefulSet.Spec.Replicas)))
} else {
logger.Error(err, "Could not fetch cluster state
information for cloud")
}
@@ -535,6 +536,15 @@ func GetAllManagedSolrNodeNames(solrCloud *solr.SolrCloud)
map[string]bool {
return allNodeNames
}
+func GetManagedSolrNodeNames(solrCloud *solr.SolrCloud,
currentlyConfiguredPodCount int) map[string]bool {
+ podNames := solrCloud.GetSolrPodNames(currentlyConfiguredPodCount)
+ allNodeNames := make(map[string]bool, len(podNames))
+ for _, podName := range podNames {
+ allNodeNames[SolrNodeName(solrCloud, podName)] = true
+ }
+ return allNodeNames
+}
+
// EvictReplicasForPodIfNecessary takes a solr Pod and migrates all replicas
off of that Pod.
// For updates this will only be called for pods using ephemeral data.
// For scale-down operations, this can be called for pods using ephemeral or
persistent data.
diff --git a/controllers/util/solr_util.go b/controllers/util/solr_util.go
index 47f8c07..de44d7c 100644
--- a/controllers/util/solr_util.go
+++ b/controllers/util/solr_util.go
@@ -556,6 +556,9 @@ func GenerateStatefulSet(solrCloud *solr.SolrCloud,
solrCloudStatus *solr.SolrCl
VolumeClaimTemplates: pvcs,
},
}
+ if solrCloud.UsesHeadlessService() {
+ stateful.Spec.Template.Spec.Subdomain =
solrCloud.HeadlessServiceName()
+ }
var imagePullSecrets []corev1.LocalObjectReference
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index fd2b6da..5a954c2 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -61,7 +61,6 @@ annotations:
url: https://github.com/apache/solr-operator/issues/624
- name: Github PR
url: https://github.com/apache/solr-operator/pull/648
- - kind: fixed
description: Avoid reset of security.json if get request fails
links:
- name: Github Issue
@@ -89,6 +88,13 @@ annotations:
url: https://issues.apache.org/jira/browse/SOLR-17216
- name: Github PR
url: https://github.com/apache/solr-operator/pull/698
+ - kind: fixed
+ description: SolrClouds addressed via an Ingress now scale up and down
safely.
+ links:
+ - name: Github Issue
+ url: https://github.com/apache/solr-operator/issues/682
+ - name: Github PR
+ url: https://github.com/apache/solr-operator/pull/692
artifacthub.io/images: |
- name: solr-operator
image: apache/solr-operator:v0.9.0-prerelease
diff --git a/tests/e2e/solrcloud_rolling_upgrade_test.go
b/tests/e2e/solrcloud_rolling_upgrade_test.go
index 6143342..faf2175 100644
--- a/tests/e2e/solrcloud_rolling_upgrade_test.go
+++ b/tests/e2e/solrcloud_rolling_upgrade_test.go
@@ -163,7 +163,7 @@ var _ = FDescribe("E2E - SolrCloud - Rolling Upgrades",
func() {
}
By("waiting for the balanceReplicas to finish")
- expectStatefulSetWithChecks(ctx, solrCloud,
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+ expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*30, time.Second, func(g Gomega, found
*appsv1.StatefulSet) {
clusterOp, err :=
controllers.GetCurrentClusterOp(found)
g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
g.Expect(clusterOp).To(BeNil(), "StatefulSet
should not have a balanceReplicas lock after balancing is complete.")
diff --git a/tests/e2e/solrcloud_scaling_test.go
b/tests/e2e/solrcloud_scaling_test.go
index 53d3630..106dd4f 100644
--- a/tests/e2e/solrcloud_scaling_test.go
+++ b/tests/e2e/solrcloud_scaling_test.go
@@ -140,6 +140,110 @@ var _ = FDescribe("E2E - SolrCloud - Scale Down", func() {
})
})
+ FContext("with replica migration using an ingress for node addresses",
func() {
+
+ BeforeEach(func() {
+ solrCloud.Spec.SolrAddressability =
solrv1beta1.SolrAddressabilityOptions{
+ External: &solrv1beta1.ExternalAddressability{
+ Method: solrv1beta1.Ingress,
+ UseExternalAddress: true,
+ DomainName: "test.solr.org",
+ },
+ }
+ })
+
+ FIt("Scales Down", func(ctx context.Context) {
+ originalSolrCloud := solrCloud.DeepCopy()
+ solrCloud.Spec.Replicas = pointer.Int32(1)
+ By("triggering a scale down via solrCloud replicas")
+ Expect(k8sClient.Patch(ctx, solrCloud,
client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud
replicas to initiate scale down")
+
+ By("waiting for the scaleDown of first pod to begin")
+ expectStatefulSetWithChecks(ctx, solrCloud,
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should still have 3 pods, because the scale down should first move Solr
replicas")
+ clusterOp, err :=
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet
does not have a scaleDown lock.")
+
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet
does not have a scaleDown lock.")
+ g.Expect(clusterOp.Metadata).To(Equal("2"),
"StatefulSet scaling lock operation has the wrong metadata.")
+ })
+ queryCollection(ctx, solrCloud, solrCollection2, 0)
+
+ // Make sure that ingress still has 3 nodes at this
point
+ ingress := expectIngress(ctx, solrCloud,
solrCloud.CommonIngressName())
+ Expect(ingress.Spec.Rules).To(HaveLen(4), "Wrong number
of ingress rules. 3 nodes + 1 common endpoint. The third node rule should not
be deleted until the node itself is deleted")
+
+ By("waiting for the scaleDown of the first pod to
finish")
+ expectStatefulSetWithChecks(ctx, solrCloud,
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet
should now have 2 pods, after the replicas have been moved off the first pod.")
+ clusterOp, err :=
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet
does not have a scaleDown lock.")
+
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet
does not have a scaleDown lock.")
+ g.Expect(clusterOp.Metadata).To(Equal("2"),
"StatefulSet scaling lock operation has the wrong metadata.")
+ })
+ queryCollection(ctx, solrCloud, solrCollection2, 0)
+
+ // Wait till the pod has actually been deleted
+ expectStatefulSetWithChecks(ctx, solrCloud,
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet
should now have 2 pods, after the replicas have been moved off the first pod.")
+ })
+
+ By("waiting for the scaleDown of second pod to begin")
+ statefulSet := expectStatefulSetWithChecks(ctx,
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found
*appsv1.StatefulSet) {
+ clusterOp, err :=
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet
does not have a scaleDown lock.")
+
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet
does not have a scaleDown lock.")
+ g.Expect(clusterOp.Metadata).To(Equal("1"),
"StatefulSet scaling lock operation has the wrong metadata.")
+ })
+
+ // Make sure that ingress still has 2 nodes at this
point
+ ingress = expectIngress(ctx, solrCloud,
solrCloud.CommonIngressName())
+ Expect(ingress.Spec.Rules).To(HaveLen(3), "Wrong number
of ingress rules. 2 nodes + 1 common endpoint. The second node rule should not
be deleted until the node itself is deleted")
+
+ // When the next scale down happens, the 3rd solr pod
(ordinal 2) should be gone, and the statefulSet replicas should be 2 across the
board.
+ // The first scale down should not be complete until
this is done.
+
Expect(statefulSet.Spec.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet
should still have 2 pods configured, because the scale down should first move
Solr replicas")
+
Expect(statefulSet.Status.Replicas).To(HaveValue(BeEquivalentTo(2)),
"StatefulSet should only have 2 pods running, because previous pod scale down
should have completely finished")
+ // This pod check must happen after the above
clusterLock and replicas check.
+ // The StatefulSet controller might take a good amount
of time to actually delete the pod,
+ // and the replica migration/cluster op might already
be done by the time the first pod is deleted.
+ expectNoPodNow(ctx, solrCloud,
solrCloud.GetSolrPodName(2))
+ queryCollection(ctx, solrCloud, solrCollection1, 0)
+
+ By("waiting for the scaleDown to finish")
+ statefulSet = expectStatefulSetWithChecks(ctx,
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found
*appsv1.StatefulSet) {
+
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(1)), "StatefulSet
should now have 1 pods, after the replicas have been moved.")
+ })
+ // Once the scale down actually occurs, the clusterOp
is not complete. We need to wait till the last pod is deleted
+ clusterOp, err :=
controllers.GetCurrentClusterOp(statefulSet)
+ Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not
have a scaleDown lock.")
+
Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet
does not have a scaleDown lock.")
+ Expect(clusterOp.Metadata).To(Equal("1"), "StatefulSet
scaling lock operation has the wrong metadata.")
+
+ // Wait for the last pod to be deleted
+ expectStatefulSetWithChecks(ctx, solrCloud,
solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(1)), "StatefulSet
should now have 1 pods, after the replicas have been moved.")
+ })
+ // Once the scale down actually occurs, the statefulSet
annotations should be removed very soon
+ expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*2, time.Millisecond*500, func(g
Gomega, found *appsv1.StatefulSet) {
+ clusterOp, err =
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).To(BeNil(), "StatefulSet
should not have a ScaleDown lock after scaling is complete.")
+ })
+
+ // Make sure that ingress has 1 node at this point
+ ingress = expectIngress(ctx, solrCloud,
solrCloud.CommonIngressName())
+ Expect(ingress.Spec.Rules).To(HaveLen(2), "Wrong number
of ingress rules. 1 nodes + 1 common endpoint.")
+
+ expectNoPod(ctx, solrCloud, solrCloud.GetSolrPodName(1))
+
+ queryCollection(ctx, solrCloud, solrCollection1, 0)
+ queryCollection(ctx, solrCloud, solrCollection2, 0)
+ })
+ })
+
FContext("with replica migration and deleted persistent data", func() {
BeforeEach(func() {
@@ -240,7 +344,7 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
FIt("Scales Up", func(ctx context.Context) {
originalSolrCloud := solrCloud.DeepCopy()
solrCloud.Spec.Replicas = pointer.Int32(int32(3))
- By("triggering a scale down via solrCloud replicas")
+ By("triggering a scale up via solrCloud replicas")
Expect(k8sClient.Patch(ctx, solrCloud,
client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud
replicas to initiate scale up")
By("waiting for the scaleUp to begin")
@@ -254,9 +358,11 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
// The first step is to increase the number of pods
// Check very often, as the new pods will be created
quickly, which will cause the cluster op to change.
- statefulSet =
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega,
found *appsv1.StatefulSet) {
-
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should still have 3 pods, because the scale down should first move Solr
replicas")
- })
+ if int(*statefulSet.Spec.Replicas) != 3 {
+ statefulSet =
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega,
found *appsv1.StatefulSet) {
+
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should be configured for 3 pods when scaling up")
+ })
+ }
clusterOp, err :=
controllers.GetCurrentClusterOp(statefulSet)
Expect(err).ToNot(HaveOccurred(), "Error occurred while
finding clusterLock for SolrCloud")
Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not
have a scaleUp lock.")
@@ -265,7 +371,7 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
// Wait for new pods to come up, and when they do we
should be doing a balanceReplicas clusterOp
statefulSet = expectStatefulSetWithChecks(ctx,
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found
*appsv1.StatefulSet) {
-
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should still have 3 pods, because the scale down should first move Solr
replicas")
+
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should eventually have 3 pods created for it")
})
clusterOp, err =
controllers.GetCurrentClusterOp(statefulSet)
Expect(err).ToNot(HaveOccurred(), "Error occurred while
finding clusterLock for SolrCloud")
@@ -285,6 +391,89 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
})
})
+ FContext("with replica migration using an ingress for node addresses",
func() {
+
+ BeforeEach(func() {
+ solrCloud.Spec.Replicas = pointer.Int32(2)
+ solrCloud.Spec.SolrAddressability =
solrv1beta1.SolrAddressabilityOptions{
+ External: &solrv1beta1.ExternalAddressability{
+ Method: solrv1beta1.Ingress,
+ UseExternalAddress: true,
+ DomainName: "test.solr.org",
+ },
+ }
+ })
+
+ FIt("Scales Up", func(ctx context.Context) {
+ originalSolrCloud := solrCloud.DeepCopy()
+ solrCloud.Spec.Replicas = pointer.Int32(int32(3))
+ By("triggering a scale up via solrCloud replicas")
+ Expect(k8sClient.Patch(ctx, solrCloud,
client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud
replicas to initiate scale up")
+
+ By("waiting for the rolling restart to begin with
hostAliases")
+ statefulSet :=
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega,
found *appsv1.StatefulSet) {
+ clusterOp, err :=
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet
does not have a update lock to add hostAliases.")
+
g.Expect(clusterOp.Operation).To(Equal(controllers.UpdateLock), "StatefulSet
does not have a update lock to add hostAliases.")
+
g.Expect(found.Spec.Template.Spec.HostAliases).To(HaveLen(3), "Host aliases in
the pod template do not have length 3, which is the number of pods to scale up
to")
+ })
+ expectSolrCloudWithChecksAndTimeout(ctx, solrCloud,
time.Second*5, time.Millisecond*5, func(g Gomega, cloud *solrv1beta1.SolrCloud)
{
+
g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(0), "The Rolling Update
never started, upToDateNodes should eventually be 0 when starting a restart")
+ })
+
+ By("Wait for the rolling update to be done")
+ expectSolrCloudWithChecksAndTimeout(ctx, solrCloud,
time.Second*90, time.Millisecond*5, func(g Gomega, cloud
*solrv1beta1.SolrCloud) {
+
g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(*statefulSet.Spec.Replicas),
"The Rolling Update never completed, not all replicas up to date")
+
g.Expect(cloud.Status.ReadyReplicas).To(BeEquivalentTo(*statefulSet.Spec.Replicas),
"The Rolling Update never completed, not all replicas ready")
+ })
+
+ By("waiting for the scaleUp to begin")
+ statefulSet =
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond, func(g Gomega,
found *appsv1.StatefulSet) {
+ clusterOp, err :=
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet
does not have a scaleUp lock.")
+
g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleUpLock), "StatefulSet
does not have a scaleUp lock.")
+ g.Expect(clusterOp.Metadata).To(Equal("3"),
"StatefulSet scaling lock operation has the wrong metadata.")
+ })
+
+ // The first step is to increase the number of pods
+ // Check very often, as the new pods will be created
quickly, which will cause the cluster op to change.
+ if int(*statefulSet.Spec.Replicas) != 3 {
+ statefulSet =
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega,
found *appsv1.StatefulSet) {
+
g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should be configured for 3 pods when scaling up")
+ })
+ }
+ clusterOp, err :=
controllers.GetCurrentClusterOp(statefulSet)
+ Expect(err).ToNot(HaveOccurred(), "Error occurred while
finding clusterLock for SolrCloud")
+ Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not
have a scaleUp lock.")
+
Expect(clusterOp.Operation).To(Equal(controllers.ScaleUpLock), "StatefulSet
does not have a scaleUp lock.")
+ Expect(clusterOp.Metadata).To(Equal("3"), "StatefulSet
scaling lock operation has the wrong metadata.")
+
+ // Wait for new pods to come up, and when they do we
should be doing a balanceReplicas clusterOp
+ statefulSet = expectStatefulSetWithChecks(ctx,
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found
*appsv1.StatefulSet) {
+
g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet
should eventually have 3 pods created for it")
+ })
+ statefulSet =
expectStatefulSetWithChecksAndTimeout(ctx, solrCloud,
solrCloud.StatefulSetName(), time.Millisecond*50, time.Millisecond*10, func(g
Gomega, found *appsv1.StatefulSet) {
+ clusterOp, err =
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet
does not have a balanceReplicas lock after new pods are created.")
+
g.Expect(clusterOp.Operation).To(Equal(controllers.BalanceReplicasLock),
"StatefulSet does not have a balanceReplicas lock after new pods are created.")
+
g.Expect(clusterOp.Metadata).To(Equal("ScaleUp"), "StatefulSet balanceReplicas
lock operation has the wrong metadata.")
+ })
+
+ By("waiting for the scaleUp to finish")
+ statefulSet = expectStatefulSetWithChecks(ctx,
solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found
*appsv1.StatefulSet) {
+ clusterOp, err :=
controllers.GetCurrentClusterOp(found)
+ g.Expect(err).ToNot(HaveOccurred(), "Error
occurred while finding clusterLock for SolrCloud")
+ g.Expect(clusterOp).To(BeNil(), "StatefulSet
should not have a balanceReplicas lock after balancing is complete.")
+ })
+
+ queryCollection(ctx, solrCloud, solrCollection1, 0)
+ queryCollection(ctx, solrCloud, solrCollection2, 0)
+ })
+ })
+
FContext("without replica migration", func() {
BeforeEach(func() {
diff --git a/tests/e2e/suite_test.go b/tests/e2e/suite_test.go
index f27a90f..8b38a98 100644
--- a/tests/e2e/suite_test.go
+++ b/tests/e2e/suite_test.go
@@ -311,6 +311,26 @@ func writeAllSolrInfoToFiles(ctx context.Context,
directory string, namespace st
&statefulSet,
)
}
+
+ // Unfortunately the services don't have the technology label
+ req, err = labels.NewRequirement("solr-cloud", selection.Exists,
make([]string, 0))
+ Expect(err).ToNot(HaveOccurred())
+
+ labelSelector = labels.Everything().Add(*req)
+ listOps = &client.ListOptions{
+ Namespace: namespace,
+ LabelSelector: labelSelector,
+ }
+
+ foundServices := &corev1.ServiceList{}
+ Expect(k8sClient.List(ctx, foundServices, listOps)).To(Succeed(),
"Could not fetch Solr pods")
+ Expect(foundServices).ToNot(BeNil(), "No Solr services could be found")
+ for _, service := range foundServices.Items {
+ writeAllServiceInfoToFiles(
+ directory+service.Name+".service",
+ &service,
+ )
+ }
}
// writeAllStatefulSetInfoToFiles writes the following each to a separate file
with the given base name & directory.
@@ -339,6 +359,19 @@ func writeAllStatefulSetInfoToFiles(baseFilename string,
statefulSet *appsv1.Sta
Expect(writeErr).ToNot(HaveOccurred(), "Could not write statefulSet
events json to file")
}
+// writeAllServiceInfoToFiles writes the following each to a separate file
with the given base name & directory.
+// - Service
+func writeAllServiceInfoToFiles(baseFilename string, service *corev1.Service) {
+ // Write service to a file
+ statusFile, err := os.Create(baseFilename + ".json")
+ defer statusFile.Close()
+ Expect(err).ToNot(HaveOccurred(), "Could not open file to save service
status: %s", baseFilename+".json")
+ jsonBytes, marshErr := json.MarshalIndent(service, "", "\t")
+ Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize service
json")
+ _, writeErr := statusFile.Write(jsonBytes)
+ Expect(writeErr).ToNot(HaveOccurred(), "Could not write service json to
file")
+}
+
// writeAllPodInfoToFile writes the following each to a separate file with the
given base name & directory.
// - Pod Spec/Status
// - Pod Events
diff --git a/tests/e2e/test_utils_test.go b/tests/e2e/test_utils_test.go
index b28df88..c3e693c 100644
--- a/tests/e2e/test_utils_test.go
+++ b/tests/e2e/test_utils_test.go
@@ -475,10 +475,10 @@ func callSolrApiInPod(ctx context.Context, solrCloud
*solrv1beta1.SolrCloud, htt
"-verbose",
"-" + strings.ToLower(httpMethod),
fmt.Sprintf(
- "\"%s://%s:%d%s%s\"",
+ "\"%s://%s%s%s%s\"",
solrCloud.UrlScheme(false),
hostname,
- solrCloud.Spec.SolrAddressability.PodPort,
+ solrCloud.NodePortSuffix(false),
apiPath,
queryParamsString),
}