This is an automated email from the ASF dual-hosted git repository.
pbacsko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-k8shim.git
The following commit(s) were added to refs/heads/master by this push:
new dbfae6e9 [YUNIKORN-2294] Flaky E2E Test: "Verify_Hard_GS_Failed_State"
polling short-lived "Failing" application status (#759)
dbfae6e9 is described below
commit dbfae6e945d3de8f0e2797c0b6b9dde0e1408909
Author: Yu-Lin Chen <[email protected]>
AuthorDate: Tue Jan 9 23:18:25 2024 +0100
[YUNIKORN-2294] Flaky E2E Test: "Verify_Hard_GS_Failed_State" polling
short-lived "Failing" application status (#759)
Closes: #759
Signed-off-by: Peter Bacsko <[email protected]>
---
go.mod | 2 +-
go.sum | 4 +-
test/e2e/framework/configmanager/constants.go | 21 ++++-----
.../framework/helpers/yunikorn/rest_api_utils.go | 52 +++++++++++++++++++++-
test/e2e/gang_scheduling/gang_scheduling_test.go | 10 ++++-
5 files changed, 72 insertions(+), 17 deletions(-)
diff --git a/go.mod b/go.mod
index f75befce..64dfed5b 100644
--- a/go.mod
+++ b/go.mod
@@ -21,7 +21,7 @@ module github.com/apache/yunikorn-k8shim
go 1.20
require (
- github.com/apache/yunikorn-core v0.0.0-20240103094035-ba62c5db9f61
+ github.com/apache/yunikorn-core v0.0.0-20240105094327-77e19f6aca27
github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240102192148-d4b43d6910c9
github.com/google/go-cmp v0.6.0
github.com/google/uuid v1.3.1
diff --git a/go.sum b/go.sum
index d8acf5b3..b23f5b95 100644
--- a/go.sum
+++ b/go.sum
@@ -49,8 +49,8 @@ github.com/alecthomas/units
v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRF
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod
h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
github.com/antlr/antlr4/runtime/Go/antlr v1.4.10
h1:yL7+Jz0jTC6yykIK/Wh74gnTJnrGr5AyrNMXuA0gves=
github.com/antlr/antlr4/runtime/Go/antlr v1.4.10/go.mod
h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY=
-github.com/apache/yunikorn-core v0.0.0-20240103094035-ba62c5db9f61
h1:pvuFiYiYS6pqgex+ouI42h9V9Zr1DAit/9zUZxazTdQ=
-github.com/apache/yunikorn-core v0.0.0-20240103094035-ba62c5db9f61/go.mod
h1:lSAZNt47HGygsVG6mJTl0rW7acBl3tbN/Fg2wGJXYs8=
+github.com/apache/yunikorn-core v0.0.0-20240105094327-77e19f6aca27
h1:o38fyYaBl7YurkVf8Lm8IFpPSxAVjRGBh0tlAN5YohY=
+github.com/apache/yunikorn-core v0.0.0-20240105094327-77e19f6aca27/go.mod
h1:lSAZNt47HGygsVG6mJTl0rW7acBl3tbN/Fg2wGJXYs8=
github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240102192148-d4b43d6910c9
h1:9Nj1XB52J7CjUysHwwu1Jf1HNC7fil5F/LkslwZEkN0=
github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240102192148-d4b43d6910c9/go.mod
h1:zDWV5y9Zh9DM1C65RCVXT1nhNNO8kykVW7bzPFamNYw=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5
h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
diff --git a/test/e2e/framework/configmanager/constants.go
b/test/e2e/framework/configmanager/constants.go
index e00ae776..e66fb0a1 100644
--- a/test/e2e/framework/configmanager/constants.go
+++ b/test/e2e/framework/configmanager/constants.go
@@ -39,16 +39,17 @@ const (
YKAdmCtrlName = "yunikorn-admission-controller-service" // YuniKorn
Admission controller service name
// REST endpoints of YuniKorn
- PartitionsPath = "ws/v1/partitions"
- QueuesPath = "ws/v1/partition/%s/queues"
- AppsPath = "ws/v1/partition/%s/queue/%s/applications"
- AppPath = "ws/v1/partition/%s/queue/%s/application/%s"
- ClustersPath = "ws/v1/clusters"
- NodesPath = "ws/v1/partition/%s/nodes"
- UserUsagePath = "ws/v1/partition/%s/usage/user/%s"
- GroupUsagePath = "ws/v1/partition/%s/usage/group/%s"
- HealthCheckPath = "ws/v1/scheduler/healthcheck"
- ValidateConfPath = "ws/v1/validate-conf"
+ PartitionsPath = "ws/v1/partitions"
+ QueuesPath = "ws/v1/partition/%s/queues"
+ AppsPath = "ws/v1/partition/%s/queue/%s/applications"
+ AppPath = "ws/v1/partition/%s/queue/%s/application/%s"
+ CompletedAppsPath = "ws/v1/partition/%s/applications/completed"
+ ClustersPath = "ws/v1/clusters"
+ NodesPath = "ws/v1/partition/%s/nodes"
+ UserUsagePath = "ws/v1/partition/%s/usage/user/%s"
+ GroupUsagePath = "ws/v1/partition/%s/usage/group/%s"
+ HealthCheckPath = "ws/v1/scheduler/healthcheck"
+ ValidateConfPath = "ws/v1/validate-conf"
// YuniKorn Service Details
DefaultYuniKornHost = "localhost"
diff --git a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
index dc7e4079..a2b0c54b 100644
--- a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
+++ b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
@@ -108,7 +108,7 @@ func (c *RClient) GetHealthCheck()
(dao.SchedulerHealthDAOInfo, error) {
}
func (c *RClient) WaitforQueueToAppear(partition string, queueName string,
timeout int) error {
- return wait.PollUntilContextTimeout(context.TODO(),
300*time.Microsecond, time.Duration(timeout)*time.Second, false,
c.IsQueuePresent(partition, queueName).WithContext())
+ return wait.PollUntilContextTimeout(context.TODO(), time.Second,
time.Duration(timeout)*time.Second, false, c.IsQueuePresent(partition,
queueName).WithContext())
}
func (c *RClient) IsQueuePresent(partition string, queueName string)
wait.ConditionFunc {
@@ -169,6 +169,35 @@ func (c *RClient) GetAppInfo(partition string, queueName
string, appID string) (
return app, err
}
+func (c *RClient) GetCompletedAppInfo(partition string, appID string)
(*dao.ApplicationDAOInfo, error) {
+ req, err := c.newRequest("GET",
fmt.Sprintf(configmanager.CompletedAppsPath, partition), nil)
+ if err != nil {
+ return nil, err
+ }
+ var apps []*dao.ApplicationDAOInfo
+ _, err = c.do(req, &apps)
+ if err != nil {
+ return nil, err
+ }
+
+ // ApplicationID is not unique in the completed applications list. Try
to get the latest one.
+ var latestApp *dao.ApplicationDAOInfo
+ var latestSubmissionTime = int64(0)
+
+ for _, app := range apps {
+ if app.ApplicationID == appID && app.SubmissionTime >
latestSubmissionTime {
+ latestApp = app
+ latestSubmissionTime = app.SubmissionTime
+ }
+ }
+
+ if latestApp != nil {
+ return latestApp, nil
+ }
+
+ return nil, fmt.Errorf("No application found with ID %s in 'Failed',
'Expired', 'Completed' state", appID)
+}
+
func (c *RClient) GetAllocationLog(partition string, queueName string, appID
string, podName string) ([]*dao.AllocationAskLogDAOInfo, error) {
reqs, err := c.GetAppInfo(partition, queueName, appID)
if err != nil {
@@ -221,6 +250,21 @@ func (c *RClient) isAppInDesiredState(partition string,
queue string, appID stri
}
}
+func (c *RClient) isAppInDesiredCompletedState(partition string, appID string,
state string) wait.ConditionFunc {
+ // Completed state including 'Expired', 'Completed', 'Failed'
+ return func() (bool, error) {
+ appInfo, err := c.GetCompletedAppInfo(partition, appID)
+ if err != nil {
+ return false, nil // returning nil here for wait & loop
+ }
+
+ if appInfo.State == state {
+ return true, nil
+ }
+ return false, nil
+ }
+}
+
func (c *RClient) GetNodes(partition string) (*[]dao.NodeDAOInfo, error) {
req, err := c.newRequest("GET", fmt.Sprintf(configmanager.NodesPath,
partition), nil)
if err != nil {
@@ -232,7 +276,11 @@ func (c *RClient) GetNodes(partition string)
(*[]dao.NodeDAOInfo, error) {
}
func (c *RClient) WaitForAppStateTransition(partition string, queue string,
appID string, state string, timeout int) error {
- return wait.PollUntilContextTimeout(context.TODO(),
time.Millisecond*300, time.Duration(timeout)*time.Second, false,
c.isAppInDesiredState(partition, queue, appID, state).WithContext())
+ return wait.PollUntilContextTimeout(context.TODO(), time.Second,
time.Duration(timeout)*time.Second, false, c.isAppInDesiredState(partition,
queue, appID, state).WithContext())
+}
+
+func (c *RClient) WaitForCompletedAppStateTransition(partition string, appID
string, state string, timeout int) error {
+ return wait.PollUntilContextTimeout(context.TODO(), time.Second,
time.Duration(timeout)*time.Second, false,
c.isAppInDesiredCompletedState(partition, appID, state).WithContext())
}
func (c *RClient) AreAllExecPodsAllotted(partition string, queueName string,
appID string, execPodCount int) wait.ConditionFunc {
diff --git a/test/e2e/gang_scheduling/gang_scheduling_test.go
b/test/e2e/gang_scheduling/gang_scheduling_test.go
index 6673c1ce..46d722df 100644
--- a/test/e2e/gang_scheduling/gang_scheduling_test.go
+++ b/test/e2e/gang_scheduling/gang_scheduling_test.go
@@ -285,10 +285,10 @@ var _ = Describe("", func() {
// Wait for placeholder timeout
time.Sleep(time.Duration(pdTimeout) * time.Second)
- checkAppStatus(appID, yunikorn.States().Application.Failing)
+ checkCompletedAppStatus(appID,
yunikorn.States().Application.Failed)
// Ensure placeholders are timed out and allocations count is
correct as app started running normal because of 'soft' gang style
- appDaoInfo, appDaoInfoErr :=
restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID)
+ appDaoInfo, appDaoInfoErr :=
restClient.GetCompletedAppInfo(configmanager.DefaultPartition, appID)
Ω(appDaoInfoErr).NotTo(HaveOccurred())
Ω(len(appDaoInfo.PlaceholderData)).To(Equal(2), "Placeholder
count is not correct")
checkPlaceholderData(appDaoInfo, groupA, 3, 0, 3)
@@ -646,6 +646,12 @@ func checkAppStatus(applicationID, state string) {
Ω(timeoutErr).NotTo(HaveOccurred())
}
+func checkCompletedAppStatus(applicationID, state string) {
+ By(fmt.Sprintf("Verify application %s status is %s", applicationID,
state))
+ timeoutErr :=
restClient.WaitForCompletedAppStateTransition(configmanager.DefaultPartition,
applicationID, state, 120)
+ Ω(timeoutErr).NotTo(HaveOccurred())
+}
+
func checkPlaceholderData(appDaoInfo *dao.ApplicationDAOInfo, tgName string,
count, replaced, timeout int) {
verified := false
for _, placeholderData := range appDaoInfo.PlaceholderData {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]