pbacsko commented on code in PR #429:
URL: https://github.com/apache/yunikorn-k8shim/pull/429#discussion_r888072297
##########
test/e2e/recovery_and_restart/recovery_and_restart_test.go:
##########
@@ -128,6 +128,125 @@ var _ = ginkgo.Describe("", func() {
gomega.Ω(int64(resMap["vcore"].(float64))).To(gomega.Equal(core))
})
+ ginkgo.It("Verify_SleepJobs_Restart_YK", func() {
+ kClient = k8s.KubeCtl{}
+ Ω(kClient.SetClient()).To(gomega.BeNil())
+ defer restorePortForwarding(&kClient)
+
+ appID1 := normalSleepJobPrefix + "-" + common.RandSeq(5)
+ sleepPodConfig1 := common.SleepPodConfig{Name:
"normal-sleep-job", NS: dev, Time: 20, AppID: appID1}
+ pod1 := common.InitSleepPod(sleepPodConfig1)
+
+ appID2 := normalSleepJobPrefix + "-" + common.RandSeq(5)
+ sleepPodConfig2 := common.SleepPodConfig{Name:
"normal-sleep-job-2", NS: dev, Time: 20, AppID: appID2}
+ pod2 := common.InitSleepPod(sleepPodConfig2)
+
+ ginkgo.By("Submitting two normal sleep jobs")
+ job1 := common.InitTestJob(appID1, parallelism, parallelism,
pod1)
+ _, createErr := kClient.CreateJob(job1, dev)
+ Ω(createErr).NotTo(gomega.HaveOccurred())
+ job2 := common.InitTestJob(appID2, parallelism, parallelism,
pod2)
+ _, createErr2 := kClient.CreateJob(job2, dev)
+ Ω(createErr2).NotTo(gomega.HaveOccurred())
+
+ ginkgo.By("Restart the scheduler pod immediately")
+ restartYunikorn(&kClient)
+
+ ginkgo.By("Listing pods")
+ pods, err := kClient.GetPods(dev)
+ Ω(err).NotTo(gomega.HaveOccurred())
+ fmt.Fprintf(ginkgo.GinkgoWriter, "Total number of pods in
namespace %s: %d\n",
+ dev, len(pods.Items))
+ for _, pod := range pods.Items {
+ fmt.Fprintf(ginkgo.GinkgoWriter, "Pod name:
%-40s\tStatus: %s\n", pod.GetName(), pod.Status.Phase)
+ }
+
+ ginkgo.By("Waiting for sleep pods to be running")
+ err = kClient.WaitForJobPodsRunning(dev, job1.Name,
parallelism, 30*time.Second)
+ Ω(err).NotTo(gomega.HaveOccurred())
+ err = kClient.WaitForJobPodsRunning(dev, job2.Name,
parallelism, 30*time.Second)
+ Ω(err).NotTo(gomega.HaveOccurred())
+
+ ginkgo.By("Waiting for sleep pods to finish")
+ err = kClient.WaitForJobPodsSucceeded(dev, job1.Name,
parallelism, 30*time.Second)
+ Ω(err).NotTo(gomega.HaveOccurred())
+ err = kClient.WaitForJobPodsSucceeded(dev, job2.Name,
parallelism, 30*time.Second)
+ Ω(err).NotTo(gomega.HaveOccurred())
+ })
+
+ ginkgo.It("Verify_GangScheduling_TwoGangs_Restart_YK", func() {
+ kClient = k8s.KubeCtl{}
+ Ω(kClient.SetClient()).To(gomega.BeNil())
+ defer restorePortForwarding(&kClient)
+
+ appID := gangSleepJobPrefix + "-" + common.RandSeq(5)
+ sleepPodConfig := common.SleepPodConfig{Name: "gang-sleep-job",
NS: dev, Time: 1, AppID: appID}
+ taskGroups := common.InitTaskGroups(sleepPodConfig, taskGroupA,
taskGroupB, parallelism)
+ pod := common.InitSleepPod(sleepPodConfig)
+ pod = common.DecoratePodForGangScheduling(30, "Soft",
taskGroupA,
+ taskGroups, pod)
+
+ ginkgo.By("Submitting gang sleep job")
+ job := common.InitTestJob(appID, parallelism, parallelism, pod)
+ _, err := kClient.CreateJob(job, dev)
+ Ω(err).NotTo(gomega.HaveOccurred())
+
+ ginkgo.By("Waiting job pods to be created")
+ createErr := kClient.WaitForJobPodsCreated(dev, job.Name,
parallelism, 30*time.Second)
+ Ω(createErr).NotTo(gomega.HaveOccurred())
+
+ ginkgo.By("Waiting for placeholders in task group A (expected
state: Running)")
+ err = kClient.WaitForPlaceholders(dev, taskGroupAprefix,
parallelism, 30*time.Second, v1.PodRunning)
+ Ω(err).NotTo(gomega.HaveOccurred())
+
+ ginkgo.By("Waiting for placeholders in task group B (expected
state: Pending)")
+ err = kClient.WaitForPlaceholders(dev, taskGroupBprefix,
parallelism+1, 30*time.Second, v1.PodPending)
+ Ω(err).NotTo(gomega.HaveOccurred())
+
+ ginkgo.By("Restart the scheduler pod")
+ restartYunikorn(&kClient)
+
+ // give YK some time to go through internal state transitions
Review Comment:
This is only an extra safety net: in case the node that runs the test is a
bit slow, we want to ensure that once `restartYunikorn()` has returned, YK
has definitely re-created all of its internal context/app/task objects and
nothing is still in progress internally.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]