This is an automated email from the ASF dual-hosted git repository.

pbacsko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-k8shim.git


The following commit(s) were added to refs/heads/master by this push:
     new dbfae6e9 [YUNIKORN-2294] Flaky E2E Test: "Verify_Hard_GS_Failed_State" polling short-lived "Failing" application status (#759)
dbfae6e9 is described below

commit dbfae6e945d3de8f0e2797c0b6b9dde0e1408909
Author: Yu-Lin Chen <[email protected]>
AuthorDate: Tue Jan 9 23:18:25 2024 +0100

    [YUNIKORN-2294] Flaky E2E Test: "Verify_Hard_GS_Failed_State" polling short-lived "Failing" application status (#759)
    
    Closes: #759
    
    Signed-off-by: Peter Bacsko <[email protected]>
---
 go.mod                                             |  2 +-
 go.sum                                             |  4 +-
 test/e2e/framework/configmanager/constants.go      | 21 ++++-----
 .../framework/helpers/yunikorn/rest_api_utils.go   | 52 +++++++++++++++++++++-
 test/e2e/gang_scheduling/gang_scheduling_test.go   | 10 ++++-
 5 files changed, 72 insertions(+), 17 deletions(-)

diff --git a/go.mod b/go.mod
index f75befce..64dfed5b 100644
--- a/go.mod
+++ b/go.mod
@@ -21,7 +21,7 @@ module github.com/apache/yunikorn-k8shim
 go 1.20
 
 require (
-       github.com/apache/yunikorn-core v0.0.0-20240103094035-ba62c5db9f61
+       github.com/apache/yunikorn-core v0.0.0-20240105094327-77e19f6aca27
        github.com/apache/yunikorn-scheduler-interface 
v0.0.0-20240102192148-d4b43d6910c9
        github.com/google/go-cmp v0.6.0
        github.com/google/uuid v1.3.1
diff --git a/go.sum b/go.sum
index d8acf5b3..b23f5b95 100644
--- a/go.sum
+++ b/go.sum
@@ -49,8 +49,8 @@ github.com/alecthomas/units 
v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRF
 github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod 
h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
 github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 
h1:yL7+Jz0jTC6yykIK/Wh74gnTJnrGr5AyrNMXuA0gves=
 github.com/antlr/antlr4/runtime/Go/antlr v1.4.10/go.mod 
h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY=
-github.com/apache/yunikorn-core v0.0.0-20240103094035-ba62c5db9f61 
h1:pvuFiYiYS6pqgex+ouI42h9V9Zr1DAit/9zUZxazTdQ=
-github.com/apache/yunikorn-core v0.0.0-20240103094035-ba62c5db9f61/go.mod 
h1:lSAZNt47HGygsVG6mJTl0rW7acBl3tbN/Fg2wGJXYs8=
+github.com/apache/yunikorn-core v0.0.0-20240105094327-77e19f6aca27 
h1:o38fyYaBl7YurkVf8Lm8IFpPSxAVjRGBh0tlAN5YohY=
+github.com/apache/yunikorn-core v0.0.0-20240105094327-77e19f6aca27/go.mod 
h1:lSAZNt47HGygsVG6mJTl0rW7acBl3tbN/Fg2wGJXYs8=
 github.com/apache/yunikorn-scheduler-interface 
v0.0.0-20240102192148-d4b43d6910c9 
h1:9Nj1XB52J7CjUysHwwu1Jf1HNC7fil5F/LkslwZEkN0=
 github.com/apache/yunikorn-scheduler-interface 
v0.0.0-20240102192148-d4b43d6910c9/go.mod 
h1:zDWV5y9Zh9DM1C65RCVXT1nhNNO8kykVW7bzPFamNYw=
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 
h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
diff --git a/test/e2e/framework/configmanager/constants.go 
b/test/e2e/framework/configmanager/constants.go
index e00ae776..e66fb0a1 100644
--- a/test/e2e/framework/configmanager/constants.go
+++ b/test/e2e/framework/configmanager/constants.go
@@ -39,16 +39,17 @@ const (
        YKAdmCtrlName = "yunikorn-admission-controller-service" // YuniKorn 
Admission controller serivce name
 
        // REST endpoints of YuniKorn
-       PartitionsPath   = "ws/v1/partitions"
-       QueuesPath       = "ws/v1/partition/%s/queues"
-       AppsPath         = "ws/v1/partition/%s/queue/%s/applications"
-       AppPath          = "ws/v1/partition/%s/queue/%s/application/%s"
-       ClustersPath     = "ws/v1/clusters"
-       NodesPath        = "ws/v1/partition/%s/nodes"
-       UserUsagePath    = "ws/v1/partition/%s/usage/user/%s"
-       GroupUsagePath   = "ws/v1/partition/%s/usage/group/%s"
-       HealthCheckPath  = "ws/v1/scheduler/healthcheck"
-       ValidateConfPath = "ws/v1/validate-conf"
+       PartitionsPath    = "ws/v1/partitions"
+       QueuesPath        = "ws/v1/partition/%s/queues"
+       AppsPath          = "ws/v1/partition/%s/queue/%s/applications"
+       AppPath           = "ws/v1/partition/%s/queue/%s/application/%s"
+       CompletedAppsPath = "ws/v1/partition/%s/applications/completed"
+       ClustersPath      = "ws/v1/clusters"
+       NodesPath         = "ws/v1/partition/%s/nodes"
+       UserUsagePath     = "ws/v1/partition/%s/usage/user/%s"
+       GroupUsagePath    = "ws/v1/partition/%s/usage/group/%s"
+       HealthCheckPath   = "ws/v1/scheduler/healthcheck"
+       ValidateConfPath  = "ws/v1/validate-conf"
 
        // YuniKorn Service Details
        DefaultYuniKornHost   = "localhost"
diff --git a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go 
b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
index dc7e4079..a2b0c54b 100644
--- a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
+++ b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go
@@ -108,7 +108,7 @@ func (c *RClient) GetHealthCheck() 
(dao.SchedulerHealthDAOInfo, error) {
 }
 
 func (c *RClient) WaitforQueueToAppear(partition string, queueName string, 
timeout int) error {
-       return wait.PollUntilContextTimeout(context.TODO(), 
300*time.Microsecond, time.Duration(timeout)*time.Second, false, 
c.IsQueuePresent(partition, queueName).WithContext())
+       return wait.PollUntilContextTimeout(context.TODO(), time.Second, 
time.Duration(timeout)*time.Second, false, c.IsQueuePresent(partition, 
queueName).WithContext())
 }
 
 func (c *RClient) IsQueuePresent(partition string, queueName string) 
wait.ConditionFunc {
@@ -169,6 +169,35 @@ func (c *RClient) GetAppInfo(partition string, queueName 
string, appID string) (
        return app, err
 }
 
+func (c *RClient) GetCompletedAppInfo(partition string, appID string) 
(*dao.ApplicationDAOInfo, error) {
+       req, err := c.newRequest("GET", 
fmt.Sprintf(configmanager.CompletedAppsPath, partition), nil)
+       if err != nil {
+               return nil, err
+       }
+       var apps []*dao.ApplicationDAOInfo
+       _, err = c.do(req, &apps)
+       if err != nil {
+               return nil, err
+       }
+
+       // ApplicationID is not unique in the completed applications list. Try 
to get the latest one.
+       var latestApp *dao.ApplicationDAOInfo
+       var latestSubmissionTime = int64(0)
+
+       for _, app := range apps {
+               if app.ApplicationID == appID && app.SubmissionTime > 
latestSubmissionTime {
+                       latestApp = app
+                       latestSubmissionTime = app.SubmissionTime
+               }
+       }
+
+       if latestApp != nil {
+               return latestApp, nil
+       }
+
+       return nil, fmt.Errorf("No application found with ID %s in 'Failed', 
'Expired', 'Completed' state", appID)
+}
+
 func (c *RClient) GetAllocationLog(partition string, queueName string, appID 
string, podName string) ([]*dao.AllocationAskLogDAOInfo, error) {
        reqs, err := c.GetAppInfo(partition, queueName, appID)
        if err != nil {
@@ -221,6 +250,21 @@ func (c *RClient) isAppInDesiredState(partition string, 
queue string, appID stri
        }
 }
 
+func (c *RClient) isAppInDesiredCompletedState(partition string, appID string, 
state string) wait.ConditionFunc {
+       // Completed state including 'Expired', 'Completed', 'Failed'
+       return func() (bool, error) {
+               appInfo, err := c.GetCompletedAppInfo(partition, appID)
+               if err != nil {
+                       return false, nil // returning nil here for wait & loop
+               }
+
+               if appInfo.State == state {
+                       return true, nil
+               }
+               return false, nil
+       }
+}
+
 func (c *RClient) GetNodes(partition string) (*[]dao.NodeDAOInfo, error) {
        req, err := c.newRequest("GET", fmt.Sprintf(configmanager.NodesPath, 
partition), nil)
        if err != nil {
@@ -232,7 +276,11 @@ func (c *RClient) GetNodes(partition string) 
(*[]dao.NodeDAOInfo, error) {
 }
 
 func (c *RClient) WaitForAppStateTransition(partition string, queue string, 
appID string, state string, timeout int) error {
-       return wait.PollUntilContextTimeout(context.TODO(), 
time.Millisecond*300, time.Duration(timeout)*time.Second, false, 
c.isAppInDesiredState(partition, queue, appID, state).WithContext())
+       return wait.PollUntilContextTimeout(context.TODO(), time.Second, 
time.Duration(timeout)*time.Second, false, c.isAppInDesiredState(partition, 
queue, appID, state).WithContext())
+}
+
+func (c *RClient) WaitForCompletedAppStateTransition(partition string, appID 
string, state string, timeout int) error {
+       return wait.PollUntilContextTimeout(context.TODO(), time.Second, 
time.Duration(timeout)*time.Second, false, 
c.isAppInDesiredCompletedState(partition, appID, state).WithContext())
 }
 
 func (c *RClient) AreAllExecPodsAllotted(partition string, queueName string, 
appID string, execPodCount int) wait.ConditionFunc {
diff --git a/test/e2e/gang_scheduling/gang_scheduling_test.go 
b/test/e2e/gang_scheduling/gang_scheduling_test.go
index 6673c1ce..46d722df 100644
--- a/test/e2e/gang_scheduling/gang_scheduling_test.go
+++ b/test/e2e/gang_scheduling/gang_scheduling_test.go
@@ -285,10 +285,10 @@ var _ = Describe("", func() {
                // Wait for placeholder timeout
                time.Sleep(time.Duration(pdTimeout) * time.Second)
 
-               checkAppStatus(appID, yunikorn.States().Application.Failing)
+               checkCompletedAppStatus(appID, 
yunikorn.States().Application.Failed)
 
                // Ensure placeholders are timed out and allocations count is 
correct as app started running normal because of 'soft' gang style
-               appDaoInfo, appDaoInfoErr := 
restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID)
+               appDaoInfo, appDaoInfoErr := 
restClient.GetCompletedAppInfo(configmanager.DefaultPartition, appID)
                Ω(appDaoInfoErr).NotTo(HaveOccurred())
                Ω(len(appDaoInfo.PlaceholderData)).To(Equal(2), "Placeholder 
count is not correct")
                checkPlaceholderData(appDaoInfo, groupA, 3, 0, 3)
@@ -646,6 +646,12 @@ func checkAppStatus(applicationID, state string) {
        Ω(timeoutErr).NotTo(HaveOccurred())
 }
 
+func checkCompletedAppStatus(applicationID, state string) {
+       By(fmt.Sprintf("Verify application %s status is %s", applicationID, 
state))
+       timeoutErr := 
restClient.WaitForCompletedAppStateTransition(configmanager.DefaultPartition, 
applicationID, state, 120)
+       Ω(timeoutErr).NotTo(HaveOccurred())
+}
+
 func checkPlaceholderData(appDaoInfo *dao.ApplicationDAOInfo, tgName string, 
count, replaced, timeout int) {
        verified := false
        for _, placeholderData := range appDaoInfo.PlaceholderData {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to