This is an automated email from the ASF dual-hosted git repository.

wwei pushed a commit to branch soak-test
in repository https://gitbox.apache.org/repos/asf/yunikorn-release.git


The following commit(s) were added to refs/heads/soak-test by this push:
     new 3b51902  [YUNIKORN-3008] Add framework core bits and add clusterloader 
end to end test (#197)
3b51902 is described below

commit 3b519023c90bec0ebaef3383e8b5da8faafc8f3d
Author: Shravan Achar <[email protected]>
AuthorDate: Tue May 13 10:30:09 2025 -0700

    [YUNIKORN-3008] Add framework core bits and add clusterloader end to end 
test (#197)
    
    Co-authored-by: Shravan Achar <[email protected]>
---
 soak/autoscaler/run.go                             |  72 ++++++++++
 soak/{pkg/setup => autoscaler}/setup.go            | 101 ++++++-------
 soak/{pkg/setup => autoscaler}/setup_test.go       |  20 +--
 soak/conf.yaml                                     | 160 ++-------------------
 soak/{conf.yaml => conf.yaml.example}              |   1 +
 soak/framework/config.go                           |  13 +-
 soak/framework/interface.go                        |  33 +++++
 soak/logger/log.go                                 |  15 ++
 soak/main.go                                       |  35 ++++-
 soak/pkg/setup/test_conf.yaml                      |  29 ----
 soak/templates/autoscaler-configmap.yaml           |  21 ---
 soak/templates/kwok-node-template.yaml             |  14 +-
 soak/templates/kwok-provider-config.yaml           |  46 ++++++
 .../basic-scheduler-throughput/cl2-metadata.json   |   1 +
 soak/tests/basic-scheduler-throughput/config.yaml  |  81 +++++++++++
 .../basic-scheduler-throughput/pod-default.yaml    |  25 ++++
 16 files changed, 390 insertions(+), 277 deletions(-)

diff --git a/soak/autoscaler/run.go b/soak/autoscaler/run.go
new file mode 100644
index 0000000..a746065
--- /dev/null
+++ b/soak/autoscaler/run.go
@@ -0,0 +1,72 @@
+package autoscaler
+
+import (
+       "fmt"
+       "github.com/apache/yunikorn-release/soak/framework"
+       "github.com/apache/yunikorn-release/soak/logger"
+       "go.uber.org/zap"
+       "os/exec"
+       "path/filepath"
+       "strings"
+)
+
// log is the package-wide logger shared by all autoscaler scenario code.
var log = logger.Logger
+
// AutoscalingScenario drives the "autoscaling" soak-test scenario: it holds
// the template configuration used to set up the cluster and the list of test
// cases that are executed through clusterloader2.
type AutoscalingScenario struct {
	templateConf framework.Template   // shared template settings (kubeconfig, node, scheduler)
	testCases    []framework.TestCase // individual clusterloader2 test cases to run
}
+
+func New(config *framework.Config) *AutoscalingScenario {
+       for _, c := range config.Tests {
+               if c.Name == "autoscaling" {
+                       return &AutoscalingScenario{
+                               templateConf: c.Template,
+                               testCases:    c.TestCases,
+                       }
+               }
+       }
+       return nil
+}
+
+func (a *AutoscalingScenario) GetName() string {
+       return "autoscaling"
+}
+
+func (a *AutoscalingScenario) Init() error {
+       if err := a.upgradeSchedulerPerConfig(); err != nil {
+               return err
+       }
+
+       return a.setAutoscalerPerConfig()
+}
+
+func (a *AutoscalingScenario) Tests() []framework.TestCase {
+       // enable or disable test cases here
+       return a.testCases
+}
+
+func (a *AutoscalingScenario) Run() ([]string, error) {
+       log := logger.Logger
+       results := make([]string, len(a.testCases))
+       for idx, tests := range a.testCases {
+               clusterLoaderConfigPath := tests.ClusterLoaderConfigPath
+               reportDir := filepath.Dir(clusterLoaderConfigPath)
+               args := []string{fmt.Sprintf("----testconfig=%s", 
clusterLoaderConfigPath),
+                       "--provider=kind", fmt.Sprintf("--kubeconfig=%s", 
a.templateConf.Kubeconfig.Path),
+                       "--v=4", fmt.Sprintf("--report-dir=%s", reportDir)}
+               cmd := exec.Command("clusterloader2", args...)
+               log.Info("Clusterloader command to be executed",
+                       zap.String("command", fmt.Sprintf("clusterloader2 %s", 
strings.Join(args, " "))))
+               results[idx] = reportDir
+               _, err := cmd.CombinedOutput()
+               if err != nil {
+                       log.Error("Clusterloader command failed. Check results 
directory for more info",
+                               zap.String("command", 
fmt.Sprintf("clusterloader2 %s", strings.Join(args, " "))))
+
+                       return results, err
+
+               }
+       }
+       return results, nil
+}
diff --git a/soak/pkg/setup/setup.go b/soak/autoscaler/setup.go
similarity index 65%
rename from soak/pkg/setup/setup.go
rename to soak/autoscaler/setup.go
index 000375c..8d674f4 100644
--- a/soak/pkg/setup/setup.go
+++ b/soak/autoscaler/setup.go
@@ -14,12 +14,10 @@
  limitations under the License.
 */
 
-package setup
+package autoscaler
 
 import (
        "fmt"
-       "github.com/apache/yunikorn-core/pkg/log"
-       "github.com/apache/yunikorn-release/soak/framework"
        "github.com/apache/yunikorn-release/soak/pkg/constants"
        "go.uber.org/zap"
        "gopkg.in/yaml.v3"
@@ -29,23 +27,24 @@ import (
        "strings"
 )
 
-var logger *zap.Logger = log.Log(log.Test)
-
-func setK8sContext() error {
+func (a *AutoscalingScenario) setK8sContext() error {
        homeDir, err := os.UserHomeDir()
        if err != nil {
                return fmt.Errorf("failed to get home directory: %v", err)
        }
        kubeconfigPath := filepath.Join(homeDir, ".kube", "config")
+       if len(a.templateConf.Kubeconfig.Path) > 0 {
+               kubeconfigPath = a.templateConf.Kubeconfig.Path
+       }
        os.Setenv("KUBECONFIG", kubeconfigPath)
-       logger.Info("Set KUBECONFIG", zap.String("path", kubeconfigPath))
+       log.Info("Set KUBECONFIG", zap.String("path", kubeconfigPath))
 
        contextCmd := exec.Command("kubectl", "config", "use-context", 
constants.KindSoakTestCluster)
        contextOutput, err := contextCmd.CombinedOutput()
        if err != nil {
                return fmt.Errorf("failed to switch kubectl context: %v, 
output: %s", err, string(contextOutput))
        }
-       logger.Info("Kubectl context switch output", zap.String("output", 
strings.TrimSpace(string(contextOutput))))
+       log.Info("Kubectl context switch output", zap.String("output", 
strings.TrimSpace(string(contextOutput))))
 
        currentContextCmd := exec.Command("kubectl", "config", 
"current-context")
        _, err = currentContextCmd.CombinedOutput()
@@ -56,13 +55,16 @@ func setK8sContext() error {
        return nil
 }
 
-func upgradeSchedulerPerConfig(scheduler framework.SchedulerFields) error {
-       if err := setK8sContext(); err != nil {
-               logger.Fatal("failed to set kubernetes context", zap.Error(err))
+func (a *AutoscalingScenario) upgradeSchedulerPerConfig() error {
+       if err := a.setK8sContext(); err != nil {
+               log.Fatal("failed to set kubernetes context", zap.Error(err))
                return err
        }
 
-       logger.Info("Scheduler details",
+       // TODO: Support multiple yunikorn scheduler config directives. 
Currently take the first one
+       scheduler := a.templateConf.Scheduler[0]
+
+       log.Info("Scheduler details",
                zap.String("VcoreRequests", scheduler.VcoreRequests),
                zap.String("MemoryRequests", scheduler.MemoryRequests),
                zap.String("VcoreLimits", scheduler.VcoreLimits),
@@ -96,7 +98,7 @@ func upgradeSchedulerPerConfig(scheduler 
framework.SchedulerFields) error {
 
                cmd := exec.Command("helm", args...)
 
-               logger.Info("Helm command to be executed",
+               log.Info("Helm command to be executed",
                        zap.String("command", fmt.Sprintf("helm %s", 
strings.Join(args, " "))))
 
                output, err := cmd.CombinedOutput()
@@ -104,7 +106,7 @@ func upgradeSchedulerPerConfig(scheduler 
framework.SchedulerFields) error {
                        return fmt.Errorf("helm upgrade failed: %v", err)
                }
 
-               logger.Info("Helm upgrade successful",
+               log.Info("Helm upgrade successful",
                        zap.String("command", fmt.Sprintf("helm %s", 
strings.Join(args, " "))),
                        zap.String("output", string(output)))
        }
@@ -113,74 +115,77 @@ func upgradeSchedulerPerConfig(scheduler 
framework.SchedulerFields) error {
                kubectlArgs := []string{"apply"}
                kubectlArgs = append(kubectlArgs, "-f", scheduler.Path, "-n", 
"yunikorn")
                kubectlCmd := exec.Command("kubectl", kubectlArgs...)
-               logger.Info("Kubectl command to be executed",
+               log.Info("Kubectl command to be executed",
                        zap.String("command", fmt.Sprintf("kubectl %s", 
strings.Join(kubectlArgs, " "))))
 
                kubectlOutput, err := kubectlCmd.CombinedOutput()
                if err != nil {
                        return fmt.Errorf("kubectl apply failed: %v", err)
                }
-               logger.Info("Kubectl apply successful", zap.String("output", 
strings.TrimSpace(string(kubectlOutput))))
+               log.Info("Kubectl apply successful", zap.String("output", 
strings.TrimSpace(string(kubectlOutput))))
        }
 
        return nil
 }
 
-func setAutoscalerPerConfig(node framework.NodeFields) error {
-       if err := setK8sContext(); err != nil {
-               logger.Fatal("failed to set kubernetes context", zap.Error(err))
+func (a *AutoscalingScenario) setAutoscalerPerConfig() error {
+       if err := a.setK8sContext(); err != nil {
+               log.Fatal("failed to set kubernetes context", zap.Error(err))
                return err
        }
 
-       logger.Info("Node details",
-               zap.String("path", node.Path),
-               zap.String("NodesDesiredCount", node.DesiredCount),
-               zap.String("maxCount", node.MaxCount))
+       // TODO: Support multiple kwok node configs. Currently take the first 
node template
+       nodeConfig := a.templateConf.Node[0]
+
+       log.Info("Node details",
+               zap.String("path", nodeConfig.Path),
+               zap.String("NodesDesiredCount", nodeConfig.DesiredCount),
+               zap.String("maxCount", nodeConfig.MaxCount))
 
-       templateContent, err := os.ReadFile(node.Path)
+       templateContent, err := os.ReadFile(nodeConfig.Path)
        if err != nil {
-               logger.Error("failed to read template file", zap.Error(err))
+               log.Error("failed to read template file", zap.Error(err))
                return err
        }
 
        var nodeTemplate map[string]interface{}
        err = yaml.Unmarshal(templateContent, &nodeTemplate)
        if err != nil {
-               logger.Error("failed to parse template YAML", zap.Error(err))
+               log.Error("failed to parse template YAML", zap.Error(err))
                return err
        }
 
        metadata, ok := nodeTemplate["metadata"].(map[string]interface{})
        if !ok {
-               logger.Error("invalid metadata format in node template")
+               log.Error("invalid metadata format in node template")
                return fmt.Errorf("invalid metadata format in node template")
        }
 
        annotations, ok := metadata["annotations"].(map[string]interface{})
        if !ok {
-               logger.Error("invalid annotations format in node template")
+               log.Error("invalid annotations format in node template")
                return fmt.Errorf("invalid annotations format in node template")
        }
 
-       annotations["cluster-autoscaler.kwok.nodegroup/max-count"] = 
node.MaxCount
-       annotations["cluster-autoscaler.kwok.nodegroup/min-count"] = 
node.DesiredCount
-       annotations["cluster-autoscaler.kwok.nodegroup/desired-count"] = 
node.DesiredCount
+       annotations["cluster-autoscaler.kwok.nodegroup/max-count"] = 
nodeConfig.MaxCount
+       annotations["cluster-autoscaler.kwok.nodegroup/min-count"] = 
nodeConfig.DesiredCount
+       annotations["cluster-autoscaler.kwok.nodegroup/desired-count"] = 
nodeConfig.DesiredCount
 
-       autoscalerConfigmapPath := "../../templates/autoscaler-configmap.yaml"
+       kwokProviderConfigmap := "../../templates/kwok-provider-config.yaml"
 
-       autoscalerConfigmap, err := os.ReadFile(autoscalerConfigmapPath)
+       autoscalerConfigmap, err := os.ReadFile(kwokProviderConfigmap)
        if err != nil {
-               logger.Error("failed to read autoscaler configmap template", 
zap.Error(err))
+               log.Error("failed to read autoscaler configmap template", 
zap.Error(err))
                return err
        }
 
        var autoscalerNodeList map[string]interface{}
        err = yaml.Unmarshal(autoscalerConfigmap, &autoscalerNodeList)
        if err != nil {
-               logger.Error("failed to parse autoscalerConfigmap YAML", 
zap.Error(err))
+               log.Error("failed to parse autoscalerConfigmap YAML", 
zap.Error(err))
                return err
        }
-       logger.Info("Autoscaler Node List", zap.Any("autoscalerNodeList", 
autoscalerNodeList))
+       log.Info("Autoscaler Node List", zap.Any("autoscalerNodeList", 
autoscalerNodeList))
 
        var itemsSlice []interface{}
        itemsSlice = append(itemsSlice, nodeTemplate)
@@ -188,14 +193,14 @@ func setAutoscalerPerConfig(node framework.NodeFields) 
error {
 
        autoscalerNodeListYaml, err := yaml.Marshal(autoscalerNodeList)
        if err != nil {
-               logger.Error("failed to convert updated autoscalerNodeList to 
YAML", zap.Error(err))
+               log.Error("failed to convert updated autoscalerNodeList to 
YAML", zap.Error(err))
                return err
        }
-       logger.Info("Encoded autoscalerNodeListYaml", 
zap.Any("autoscalerNodeListYaml", autoscalerNodeListYaml))
+       log.Info("Encoded autoscalerNodeListYaml", 
zap.Any("autoscalerNodeListYaml", autoscalerNodeListYaml))
 
        updatedAcCmTempFile, err := os.CreateTemp("", 
"updated-autoscaler-configmap-temp.yaml")
        if err != nil {
-               logger.Error("failed to create 
updated-autoscaler-configmap-temp file", zap.Error(err))
+               log.Error("failed to create updated-autoscaler-configmap-temp 
file", zap.Error(err))
                return err
        }
 
@@ -204,11 +209,11 @@ func setAutoscalerPerConfig(node framework.NodeFields) 
error {
 
        if _, err = updatedAcCmTempFile.Write(autoscalerNodeListYaml); err != 
nil {
                updatedAcCmTempFile.Close()
-               logger.Error("failed to write to 
updated-autoscaler-configmap-temp file", zap.Error(err))
+               log.Error("failed to write to updated-autoscaler-configmap-temp 
file", zap.Error(err))
                return err
        }
        if err = updatedAcCmTempFile.Close(); err != nil {
-               logger.Error("failed to close updated-autoscaler-configmap-temp 
file", zap.Error(err))
+               log.Error("failed to close updated-autoscaler-configmap-temp 
file", zap.Error(err))
                return err
        }
 
@@ -216,31 +221,31 @@ func setAutoscalerPerConfig(node framework.NodeFields) 
error {
        deleteConfigMapCmd := exec.Command("kubectl", "delete", "cm", 
"kwok-provider-templates")
        deleteConfigMapCmdOutput, err := deleteConfigMapCmd.CombinedOutput()
        if err != nil {
-               logger.Error("fail to delete configmap", zap.Error(err))
+               log.Error("fail to delete configmap", zap.Error(err))
                return err
        }
-       logger.Info(string(deleteConfigMapCmdOutput))
+       log.Info(string(deleteConfigMapCmdOutput))
 
        // Create a new autoscaler configMap
        createConfigMapCmd := exec.Command("kubectl", "create", "cm", 
"kwok-provider-templates",
                "--from-file=templates="+updatedAcCmTempFilePath)
        createConfigMapCmdOutput, err := createConfigMapCmd.CombinedOutput()
        if err != nil {
-               logger.Error("fail to create new configmap", zap.Error(err))
+               log.Error("fail to create new configmap", zap.Error(err))
                return err
        }
-       logger.Info(string(createConfigMapCmdOutput))
+       log.Info(string(createConfigMapCmdOutput))
 
        // Restart the autoscaler pod after updating the configmap
        restartAutoscalerPodCmd := exec.Command("kubectl", "rollout", 
"restart", "deployment", "autoscaler-kwok-cluster-autoscaler")
        restartAutoscalerPodCmdOutput, err := 
restartAutoscalerPodCmd.CombinedOutput()
        if err != nil {
-               logger.Error("failed to restart autoscaler deployment", 
zap.Error(err))
+               log.Error("failed to restart autoscaler deployment", 
zap.Error(err))
                return err
        }
-       logger.Info("Restarted autoscaler deployment", zap.String("output", 
string(restartAutoscalerPodCmdOutput)))
+       log.Info("Restarted autoscaler deployment", zap.String("output", 
string(restartAutoscalerPodCmdOutput)))
 
-       logger.Info("Successfully set up kwok provider cluster autoscaler for 
desiredNodeCount and MaxNodeCount")
+       log.Info("Successfully set up kwok provider cluster autoscaler for 
desiredNodeCount and MaxNodeCount")
 
        return nil
 }
diff --git a/soak/pkg/setup/setup_test.go b/soak/autoscaler/setup_test.go
similarity index 70%
rename from soak/pkg/setup/setup_test.go
rename to soak/autoscaler/setup_test.go
index eb64e57..b223983 100644
--- a/soak/pkg/setup/setup_test.go
+++ b/soak/autoscaler/setup_test.go
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 
-package setup
+package autoscaler
 
 import (
        "github.com/apache/yunikorn-release/soak/framework"
@@ -24,18 +24,12 @@ import (
 )
 
 func TestSetAutoScalerPerConfig(t *testing.T) {
-       conf, err := framework.InitConfig("test_conf.yaml")
+       conf, err := framework.InitConfig("conf.yaml")
        if err != nil {
-               logger.Fatal("failed to parse config", zap.Error(err))
-       }
-       logger.Info("config successfully loaded", zap.Any("conf", conf))
-
-       for _, test := range conf.Tests {
-               if len(test.Template.Node) > 0 {
-                       for _, nodeTemplate := range test.Template.Node {
-                               err := setAutoscalerPerConfig(nodeTemplate)
-                               assert.NoError(t, err)
-                       }
-               }
+               log.Fatal("failed to parse config", zap.Error(err))
        }
+       log.Info("config successfully loaded", zap.Any("conf", conf))
+       a := New(conf)
+       err = a.setAutoscalerPerConfig()
+       assert.NoError(t, err)
 }
diff --git a/soak/conf.yaml b/soak/conf.yaml
index a469951..99a2a97 100644
--- a/soak/conf.yaml
+++ b/soak/conf.yaml
@@ -1,4 +1,3 @@
-#
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -16,150 +15,15 @@
 # limitations under the License.
 
 tests:
-- name: autoscaling
-  template:
-    kubeconfig:
-      path: ../templates/kubeconfig
-    node:
-      - path: ../templates/nodeGroupTemplates.yaml
-        maxCount: "$nodesMaxCount"
-        desiredCount: "$nodesDesiredCount"
-    job:
-      - path: ../templates/jobATemplate.yaml
-        count: "$numJobs"
-        podCount: "$numPods"
-        mode: "always" #one of ["always", "random-max-percent", 
"fixed-percent"]
-        value: "50" # when mode is "random-max-percent" or "fixed-percent"
-      - path: ../templates/jobBTemplate.yaml
-        count: "$numJobs"
-        podCount: "$numPods"
-    scheduler:
-      - path: ../templates/autoscaling-queues.yaml
-        vcoreRequests: 2
-        vcoreLimits: 2
-        memoryRequests: 16Gi
-        memoryLimits: 16Gi
-  testCases:
-    - name: "1000-nodes-cluster"
-      params:
-        nodesMaxCount: 1000
-        nodesDesiredCount: 20
-        numPods: 5000
-        numJobs: 200
-      schedule: once
-      labels: ["short"]
-      # labels: ["soak-test"]
-      threshold:
-        maxRuntime: "10m"
-        pendingPods: 0
-        metrics:
-          maxAllocationDelay: "5s"
-    - name: "5000-nodes-cluster"
-      params:
-        nodesMaxCount: 5000
-        nodesDesiredCount: 20
-        numPods: 20000
-        numJobs: 700
-      schedule: once
-      runs: 1
-      # labels: ["soak-test", "benchmark-test"]
-      labels: ["short"]
-      threshold:
-        maxRuntime: "60m"
-        pendingPods: 0
-        maxAllocationDelay: "20s"
-    - name: "300-nodes-cluster-schedule"
-      params:
-        nodesMaxCount: 300
-        nodesDesiredCount: 0
-        numPods: 2000
-        numJobs: 150
-      schedule: "*/15 * * * *"
-      runs: 10
-      #labels: ["soak-test"]
-      labels: ["super-long"]
-      threshold:
-        maxRuns: 10
-        pendingPods: 0
-        metrics:
-          maxAllocationDelay: "5s"
-- name: chaos-faults
-  template:
-    kubeconfig:
-      path: ../templates/kubeconfig
-    node:
-      - path: ../templates/nodeGroupTemplates.yaml
-        maxCount: "$nodesMaxCount"
-        desiredCount: "$nodesDesiredCount"
-    job:
-      - path: ../templates/jobATemplate.yaml
-        count: "$numJobs"
-        podCount: "$numPods"
-    choas:
-      - path: ../templates/chaos.yaml
-        count: "$numChaos"
-    scheduler:
-      - path: ../templates/chaos-queues.yaml
-        vcoreRequests: 2
-        vcoreLimits: 2
-        memoryRequests: 16Gi
-        memoryLimits: 16Gi
-  testCases:
-    - name: "1000-nodes-cluster"
-      params:
-        nodesMaxCount: 1000
-        nodesDesiredCount: 20
-        numPods: 5000
-        numJobs: 200
-        numChaos: 0
-      schedule: once
-      labels: ["short"]
-      # labels: ["soak-test", "benchmark-test", "integration-test"]
-      threshold:
-        maxRuntime: "10m"
-        pendingPods: 0
-        detectDeadlock: false
-        metrics:
-          schedulerRestarts: 0
-          maxAllocationDelay: "10s"
-    - name: "5000-nodes-cluster"
-      params:
-        nodesMaxCount: 5000
-        nodesDesiredCount: 20
-        numPods: 20000
-        numJobs: 700
-        numChaos: 200
-        schedule: once
-        runs: 1
-        labels: ["long"]
-        # labels: ["soak-test", "benchmark-test"]
-        threshold:
-          maxRuntime: "60m"
-          pendingPods: 0
-          detectDeadlock: true
-          metrics:
-            schedulerRestarts: 1
-            maxAllocationDelay: "60s"
-    - name: "300-nodes-cluster-schedule"
-      params:
-        nodesMaxCount: 300
-        nodesDesiredCount: 0
-        numPods: 2000
-        numJobs: 150
-        numChaos: 10
-      schedule: "*/15 * * * *"
-      runs: 10
-      # labels: ["soak-test"]
-      labels: ["super-long"]
-      threshold:
-        maxRuntime: "60m"
-        pendingPods: 0
-        detectDeadlock: true
-        metrics:
-          schedulerRestarts: 5
-          maxAllocationDelay: "60s"
-          prom:
-            - query: 
'sum(rate(go_memstats_heap_inuse_bytes{service="yunikorn"}[60m])) by (service)'
-              expression: 'sprintf("%.0f", query_result / 1000000)'
-              value: '20'
-              op: '<='
\ No newline at end of file
+  - name: autoscaling
+    template:
+      node:
+        - path: ../../templates/kwok-node-template.yaml
+          maxCount: "10"
+          desiredCount: "5"
+      scheduler:
+        - path: ../../templates/autoscaling-queues.yaml
+          vcoreRequests: 2
+          vcoreLimits: 2
+          memoryRequests: 16Gi
+          memoryLimits: 16Gi
\ No newline at end of file
diff --git a/soak/conf.yaml b/soak/conf.yaml.example
similarity index 97%
copy from soak/conf.yaml
copy to soak/conf.yaml.example
index a469951..30c42f2 100644
--- a/soak/conf.yaml
+++ b/soak/conf.yaml.example
@@ -41,6 +41,7 @@ tests:
         memoryLimits: 16Gi
   testCases:
     - name: "1000-nodes-cluster"
+      clusterLoaderConfigPath: ../tests/basic-scheduler-throughput/config.yaml
       params:
         nodesMaxCount: 1000
         nodesDesiredCount: 20
diff --git a/soak/framework/config.go b/soak/framework/config.go
index 1de9b14..70582de 100644
--- a/soak/framework/config.go
+++ b/soak/framework/config.go
@@ -89,12 +89,13 @@ type Threshold struct {
 }
 
 type TestCase struct {
-       Name      string         `yaml:"name,omitempty"`
-       Params    TestCaseParams `yaml:"params,omitempty"`
-       Schedule  string         `yaml:"schedule,omitempty"`
-       Runs      int            `yaml:"runs,omitempty"`
-       Labels    []string       `yaml:"labels,omitempty"`
-       Threshold Threshold      `yaml:"threshold,omitempty"`
+       Name                    string         `yaml:"name,omitempty"`
+       Params                  TestCaseParams `yaml:"params,omitempty"`
+       Schedule                string         `yaml:"schedule,omitempty"`
+       Runs                    int            `yaml:"runs,omitempty"`
+       Labels                  []string       `yaml:"labels,omitempty"`
+       Threshold               Threshold      `yaml:"threshold,omitempty"`
+       ClusterLoaderConfigPath string         
`yaml:"clusterLoaderConfigPath,omitempty"`
 }
 
 type Test struct {
diff --git a/soak/framework/interface.go b/soak/framework/interface.go
new file mode 100644
index 0000000..e4ae8a2
--- /dev/null
+++ b/soak/framework/interface.go
@@ -0,0 +1,33 @@
+package framework
+
+import (
+       "github.com/apache/yunikorn-release/soak/logger"
+       "go.uber.org/zap"
+)
+
+var log = logger.Logger
+
// Scenarios is the registry of known test scenarios, keyed by scenario name.
type Scenarios struct {
	registeredTestScenarios map[string]TestScenario
}

// testScenarios is the package-level registry instance shared by Register
// and GetRegisteredTestScenarios.
var testScenarios Scenarios

// init allocates the registry map so Register can be called safely during
// other packages' initialization.
func init() {
	testScenarios.registeredTestScenarios = make(map[string]TestScenario)
}
+
// Register adds the given scenario to the global registry under its name and
// logs the registration. Registering a second scenario with the same name
// overwrites the earlier entry.
func Register(ts TestScenario) {
	testScenarios.registeredTestScenarios[ts.GetName()] = ts
	log.Info("register scenario", zap.String("scenarioName", ts.GetName()))
}
+
+func GetRegisteredTestScenarios() map[string]TestScenario {
+       return testScenarios.registeredTestScenarios
+}
+
// TestScenario is implemented by every soak-test scenario that can be
// registered with and driven by the framework.
type TestScenario interface {
	// GetName returns the unique scenario name used as the registry key.
	GetName() string
	// Init performs one-time cluster preparation before Run.
	Init() error
	// Run executes the scenario and returns per-test-case report
	// directories.
	Run() ([]string, error)
}
diff --git a/soak/logger/log.go b/soak/logger/log.go
new file mode 100644
index 0000000..36696d0
--- /dev/null
+++ b/soak/logger/log.go
@@ -0,0 +1,15 @@
+package logger
+
+import (
+       "github.com/apache/yunikorn-core/pkg/log"
+       "go.uber.org/zap"
+       "strconv"
+)
+
// Logger is the shared zap logger for the soak-test tooling, backed by the
// yunikorn-core test logger.
var Logger *zap.Logger = log.Log(log.Test)

// SetLogLevel adjusts the global logging level. The integer level is passed
// through verbatim to the underlying yunikorn-core logging configuration
// under the "log.level" key.
func SetLogLevel(level int) {
	log.UpdateLoggingConfig(map[string]string{
		"log.level": strconv.Itoa(level),
	})
}
diff --git a/soak/main.go b/soak/main.go
index c048fbb..690c6c3 100644
--- a/soak/main.go
+++ b/soak/main.go
@@ -19,8 +19,9 @@
 package main
 
 import (
-       "github.com/apache/yunikorn-core/pkg/log"
+       "github.com/apache/yunikorn-release/soak/autoscaler"
        "github.com/apache/yunikorn-release/soak/framework"
+       "github.com/apache/yunikorn-release/soak/logger"
        "go.uber.org/zap"
 )
 
@@ -28,12 +29,36 @@ const (
        ConfigFileName = "conf.yaml"
 )
 
-var logger *zap.Logger = log.Log(log.Test)
-
 func main() {
        conf, err := framework.InitConfig(ConfigFileName)
+       log := logger.Logger
        if err != nil {
-               logger.Fatal("failed to parse config", zap.Error(err))
+               log.Fatal("failed to parse config", zap.Error(err))
+       }
+       log.Info("config successfully loaded", zap.Any("conf", conf))
+
+       // Register scenarios
+       a := autoscaler.New(conf)
+       if a != nil {
+               framework.Register(a)
+       }
+
+       for _, ts := range framework.GetRegisteredTestScenarios() {
+               err = ts.Init()
+               if err != nil {
+                       log.Fatal("failed to initialize scenario", 
zap.String("scenarioName", ts.GetName()),
+                               zap.Error(err))
+               }
+
+               reportDirs, err := ts.Run()
+               if err != nil {
+                       log.Error("failed to run scenario", 
zap.String("scenarioName", ts.GetName()),
+                               zap.Error(err))
+               }
+               log.Info("Reports are generated for scenario",
+                       zap.String("scenarioName", ts.GetName()),
+                       zap.Strings("reportDirectories", reportDirs))
+
        }
-       logger.Info("config successully loaded", zap.Any("conf", conf))
+
 }
diff --git a/soak/pkg/setup/test_conf.yaml b/soak/pkg/setup/test_conf.yaml
deleted file mode 100644
index 99a2a97..0000000
--- a/soak/pkg/setup/test_conf.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-tests:
-  - name: autoscaling
-    template:
-      node:
-        - path: ../../templates/kwok-node-template.yaml
-          maxCount: "10"
-          desiredCount: "5"
-      scheduler:
-        - path: ../../templates/autoscaling-queues.yaml
-          vcoreRequests: 2
-          vcoreLimits: 2
-          memoryRequests: 16Gi
-          memoryLimits: 16Gi
\ No newline at end of file
diff --git a/soak/templates/autoscaler-configmap.yaml 
b/soak/templates/autoscaler-configmap.yaml
deleted file mode 100644
index 3ca6613..0000000
--- a/soak/templates/autoscaler-configmap.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-apiVersion: v1
-items:
-kind: List
-metadata:
-  resourceVersion: ""
diff --git a/soak/templates/kwok-node-template.yaml 
b/soak/templates/kwok-node-template.yaml
index eea8399..c33e8c1 100644
--- a/soak/templates/kwok-node-template.yaml
+++ b/soak/templates/kwok-node-template.yaml
@@ -18,9 +18,9 @@ apiVersion: v1
 kind: Node
 metadata:
   annotations:
-    cluster-autoscaler.kwok.nodegroup/max-count:
-    cluster-autoscaler.kwok.nodegroup/min-count:
-    cluster-autoscaler.kwok.nodegroup/desired-count:
+    cluster-autoscaler.kwok.nodegroup/max-count: {{$MAX_COUNT}}
+    cluster-autoscaler.kwok.nodegroup/min-count: {{$MIN_COUNT}}
+    cluster-autoscaler.kwok.nodegroup/desired-count: {{$DESIRED_COUNT}}
   labels:
     beta.kubernetes.io/arch: amd64
     beta.kubernetes.io/os: linux
@@ -31,13 +31,13 @@ metadata:
   name: kwok-node
 status:
   allocatable:
-    cpu: 32
+    cpu: "32"
     memory: 256Gi
-    pods: 110
+    pods: "110"
   capacity:
-    cpu: 32
+    cpu: "32"
     memory: 256Gi
-    pods: 110
+    pods: "110"
   nodeInfo:
     architecture: amd64
     bootID: ""
diff --git a/soak/templates/kwok-provider-config.yaml 
b/soak/templates/kwok-provider-config.yaml
new file mode 100644
index 0000000..c5c8760
--- /dev/null
+++ b/soak/templates/kwok-provider-config.yaml
@@ -0,0 +1,46 @@
+apiVersion: v1
+data:
+  config: |-
+    # if you see '\n' everywhere, remove all the trailing spaces
+    apiVersion: v1alpha1
+    readNodesFrom: configmap # possible values: [cluster,configmap]
+    nodegroups:
+      # to specify how to group nodes into a nodegroup
+      # e.g., you want to treat nodes with same instance type as a nodegroup
+      # node1: m5.xlarge
+      # node2: c5.xlarge
+      # node3: m5.xlarge
+      # nodegroup1: [node1,node3]
+      # nodegroup2: [node2]
+      fromNodeLabelKey: "kwok-nodegroup"
+      # you can either specify fromNodeLabelKey OR fromNodeAnnotation
+      # fromNodeAnnotation: "abc.domain.com/nodegroup"
+    nodes:
+      # gpuConfig:
+      #   # to tell kwok provider what label should be considered as GPU label
+      #   gpuLabelKey: "abc.domain.com/accelerator"
+      #   availableGPUTypes:
+      #     "nvidia-tesla-k80": {}
+      #     "nvidia-tesla-p100": {}
+    configmap:
+      name: kwok-provider-templates
+    kwok: {} # default: fetch latest release of kwok from github and install it
+    # # you can also manually specify which kwok release you want to install
+    # # for example:
+    # kwok:
+    #   release: v0.3.0
+    # # you can also disable installing kwok in CA code (and install your own 
kwok release)
+    # kwok:
+    #   install: false (true if not specified)
+kind: ConfigMap
+metadata:
+  annotations:
+    meta.helm.sh/release-name: autoscaler
+    meta.helm.sh/release-namespace: default
+  creationTimestamp: "2025-03-07T18:36:19Z"
+  labels:
+    app.kubernetes.io/managed-by: Helm
+  name: kwok-provider-config
+  namespace: default
+  resourceVersion: "3713"
+  uid: 6c058143-1de9-4f91-8944-51d59cdb17e1
diff --git a/soak/tests/basic-scheduler-throughput/cl2-metadata.json 
b/soak/tests/basic-scheduler-throughput/cl2-metadata.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/soak/tests/basic-scheduler-throughput/cl2-metadata.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/soak/tests/basic-scheduler-throughput/config.yaml 
b/soak/tests/basic-scheduler-throughput/config.yaml
new file mode 100644
index 0000000..0f4f8f5
--- /dev/null
+++ b/soak/tests/basic-scheduler-throughput/config.yaml
@@ -0,0 +1,81 @@
+{{$totalSchedulerThroughputPods := DefaultParam .CL2_SCHEDULER_THROUGHPUT_PODS 
10}}
+{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 10}}
+{{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 5}}
+{{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 10}}
+
+{{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam 
.CL2_SCHEDULER_THROUGHPUT_THRESHOLD 10}}
+
+name: direct-scheduler-throughput
+namespace:
+  number: 1
+tuningSets:
+# default is a tuningset that is meant to be used when we don't have any 
specific requirements on pace of operations.
+- name: default
+  globalQPSLoad:
+    qps: {{$defaultQps}}
+    burst: {{$defaultBurst}}
+- name: UniformQPS
+  qpsLoad:
+    qps: {{$uniformQps}}
+steps:
+- name: Creating scheduler throughput measurements
+  measurements:
+  - Identifier: DirectSchedulerThroughputPodStartupLatency
+    Method: PodStartupLatency
+    Params:
+      action: start
+      labelSelector: group = direct-scheduler-throughput
+      threshold: 5s
+  - Identifier: DirectSchedulingThroughput
+# TODO: Move to SchedulingThroughputPrometheus, which requires the CL2 Prometheus
+stack to be set up as a prerequisite
+    Method: SchedulingThroughput
+    Params:
+      action: start
+      labelSelector: group = direct-scheduler-throughput
+      measurmentInterval: 1s
+- name: create scheduler throughput pods
+  phases:
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: {{$totalSchedulerThroughputPods}}
+    tuningSet: UniformQPS
+    objectBundle:
+    - basename: direct-scheduler-throughput-pod
+      objectTemplatePath: pod-default.yaml
+      templateFillMap:
+        Group: direct-scheduler-throughput
+- name: Waiting for scheduler throughput pods to be created
+  measurements:
+  - Identifier: WaitForDirectSchedulerThroughputPods
+    Method: WaitForRunningPods
+    Params:
+      action: gather
+      timeout: 5m
+      desiredPodCount: {{$totalSchedulerThroughputPods}}
+      labelSelector: group = direct-scheduler-throughput
+- name: Collecting scheduler throughput measurements
+  measurements:
+  - Identifier: DirectSchedulerThroughputPodStartupLatency
+    Method: PodStartupLatency
+    Params:
+      action: gather
+      schedulerName: yunikorn
+  - Identifier: DirectSchedulingThroughput
+    Method: SchedulingThroughput
+    Params:
+      action: gather
+      enableViolations: true
+      threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}}
+- name: Delete scheduler throughput pods
+  phases:
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: 0
+    tuningSet: default
+    objectBundle:
+    - basename: direct-scheduler-throughput-pod
+      objectTemplatePath: pod-default.yaml
+      templateFillMap:
+        Group: direct-scheduler-throughput
diff --git a/soak/tests/basic-scheduler-throughput/pod-default.yaml 
b/soak/tests/basic-scheduler-throughput/pod-default.yaml
new file mode 100644
index 0000000..74d600f
--- /dev/null
+++ b/soak/tests/basic-scheduler-throughput/pod-default.yaml
@@ -0,0 +1,25 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  generateName: pod-churn-
+  labels:
+    group: {{.Group}}
+spec:
+  schedulerName: yunikorn
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: kwok-nodegroup
+                operator: In
+                values:
+                - kind-worker
+                - kind-worker2
+  tolerations:
+    - key: "kwok-provider"
+      operator: "Exists"
+      effect: "NoSchedule"
+  containers:
+  - image: registry.k8s.io/pause:3.9
+    name: pause


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to