Hello,

We run the multifactor plugin with fairshare. In our environment we have hundreds of users but <100 very heavy users. We would like each account to have equal access to resources and therefore equal fairshare. Fairshare works great for the light users. For heavy users, from testing, it appears the fairshare component decays to 0 at around 10x their fairshare usage. So, for example, a user who has used 10x their expected utilization and someone who has used 100x get roughly the same fairshare priority.

We have this situation happen quite often, due to the makeup of our user base. We don't use any sort of hierarchical fairshare setup, as all of our users should be treated equally.

We have been running the attached patch for several weeks and it seems to be working pretty well. Previously, in sshare, we were seeing fairshare priority with pretty discontinuous values, either near the max or at 0. With this patch the values are pretty spread out. Now users who go over their expected usage are still sorted by overall usage and get a fairshare priority value.

We introduced a new parameter, FairShareDampeningFactor, which will modify the calculation of fairshare based on usage. The default is "1", which will not change the existing functionality. We have used "5" and that seems to work pretty well. Previously, once you used about 5-10x your allocation, the fairshare priority would go to 0. With this parameter set, the fairshare priority goes to zero at about 30x utilization.

I should note that this is different from the existing PriorityDecayHalfLife parameter. This new parameter affects the fairshare priority based on whatever the utilization has been calculated as.

I apologize for the convoluted description. I can send some graphs of our analysis if that is more clear. It would be great to have this included if you feel appropriate.

Thanks

Martins
--- ./src/common/slurm_protocol_api.c.orig      2013-10-04 13:53:39.014827390 
-0400
+++ ./src/common/slurm_protocol_api.c   2013-10-04 13:57:13.628236585 -0400
@@ -626,6 +626,24 @@
        return factor;
 }
 
+/* slurm_get_fs_dampening_factor
+ * returns the fairshare dampening factor (fs_dampening_factor) from slurmctld_conf object
+ * RET uint32_t - factor; stays at its initial value of 1 when slurmdbd_conf is set.
+ */
+uint32_t slurm_get_fs_dampening_factor(void)
+{
+        uint32_t factor = 1;
+        slurm_ctl_conf_t *conf;
+
+        if (slurmdbd_conf) {	/* conf is not read here; factor stays 1 */
+        } else {
+                conf = slurm_conf_lock();	/* read the value while holding the conf lock */
+                factor = conf->fs_dampening_factor;
+                slurm_conf_unlock();
+        }
+
+        return factor;
+}
 
 /* slurm_get_priority_weight_job_size
  * returns the priority weight for job size from slurmctld_conf object
--- ./src/common/slurm_protocol_pack.c.orig     2013-10-04 13:58:07.733360261 
-0400
+++ ./src/common/slurm_protocol_pack.c  2013-10-04 14:00:15.121475294 -0400
@@ -5004,6 +5004,7 @@
                packstr(build_ptr->priority_type, buffer);
                pack32(build_ptr->priority_weight_age, buffer);
                pack32(build_ptr->priority_weight_fs, buffer);
+               pack32(build_ptr->fs_dampening_factor, buffer);
                pack32(build_ptr->priority_weight_js, buffer);
                pack32(build_ptr->priority_weight_part, buffer);
                pack32(build_ptr->priority_weight_qos, buffer);
@@ -5211,6 +5212,7 @@
                packstr(build_ptr->priority_type, buffer);
                pack32(build_ptr->priority_weight_age, buffer);
                pack32(build_ptr->priority_weight_fs, buffer);
+               pack32(build_ptr->fs_dampening_factor, buffer);
                pack32(build_ptr->priority_weight_js, buffer);
                pack32(build_ptr->priority_weight_part, buffer);
                pack32(build_ptr->priority_weight_qos, buffer);
@@ -5412,6 +5414,7 @@
                packstr(build_ptr->priority_type, buffer);
                pack32(build_ptr->priority_weight_age, buffer);
                pack32(build_ptr->priority_weight_fs, buffer);
+               pack32(build_ptr->fs_dampening_factor, buffer);
                pack32(build_ptr->priority_weight_js, buffer);
                pack32(build_ptr->priority_weight_part, buffer);
                pack32(build_ptr->priority_weight_qos, buffer);
@@ -5695,6 +5698,7 @@
                                       buffer);
                safe_unpack32(&build_ptr->priority_weight_age, buffer);
                safe_unpack32(&build_ptr->priority_weight_fs, buffer);
+               safe_unpack32(&build_ptr->fs_dampening_factor, buffer);
                safe_unpack32(&build_ptr->priority_weight_js, buffer);
                safe_unpack32(&build_ptr->priority_weight_part, buffer);
                safe_unpack32(&build_ptr->priority_weight_qos, buffer);
@@ -5985,6 +5989,7 @@
                                       buffer);
                safe_unpack32(&build_ptr->priority_weight_age, buffer);
                safe_unpack32(&build_ptr->priority_weight_fs, buffer);
+               safe_unpack32(&build_ptr->fs_dampening_factor, buffer);
                safe_unpack32(&build_ptr->priority_weight_js, buffer);
                safe_unpack32(&build_ptr->priority_weight_part, buffer);
                safe_unpack32(&build_ptr->priority_weight_qos, buffer);
@@ -6262,6 +6267,7 @@
                                       buffer);
                safe_unpack32(&build_ptr->priority_weight_age, buffer);
                safe_unpack32(&build_ptr->priority_weight_fs, buffer);
+               safe_unpack32(&build_ptr->fs_dampening_factor, buffer);
                safe_unpack32(&build_ptr->priority_weight_js, buffer);
                safe_unpack32(&build_ptr->priority_weight_part, buffer);
                safe_unpack32(&build_ptr->priority_weight_qos, buffer);
--- ./src/common/read_config.c.orig     2013-10-04 13:43:25.864357474 -0400
+++ ./src/common/read_config.c  2013-10-04 13:50:54.346390808 -0400
@@ -250,6 +250,7 @@
        {"PriorityFlags", S_P_STRING},
        {"PriorityWeightAge", S_P_UINT32},
        {"PriorityWeightFairshare", S_P_UINT32},
+       {"FairShareDampeningFactor", S_P_UINT32},
        {"PriorityWeightJobSize", S_P_UINT32},
        {"PriorityWeightPartition", S_P_UINT32},
        {"PriorityWeightQOS", S_P_UINT32},
@@ -3272,6 +3273,9 @@
        if (!s_p_get_uint32(&conf->priority_weight_fs,
                            "PriorityWeightFairshare", hashtbl))
                conf->priority_weight_fs = 0;
+       if (!s_p_get_uint32(&conf->fs_dampening_factor,
+                           "FairShareDampeningFactor", hashtbl))
+               conf->fs_dampening_factor = 1;
        if (!s_p_get_uint32(&conf->priority_weight_js,
                            "PriorityWeightJobSize", hashtbl))
                conf->priority_weight_js = 0;
--- ./src/plugins/priority/multifactor/priority_multifactor.c.orig      
2013-10-04 13:16:26.524994685 -0400
+++ ./src/plugins/priority/multifactor/priority_multifactor.c   2013-10-04 
14:06:07.676759254 -0400
@@ -1679,6 +1679,7 @@
                                        long double shares_norm)
 {
        double priority_fs = 0.0;
+       long double fairshare_dampening_factor = (long 
double)slurm_get_fs_dampening_factor();
 
        if (fuzzy_equal(usage_efctv, NO_VAL))
                return priority_fs;
@@ -1691,7 +1692,7 @@
                        usage_efctv = MIN_USAGE_FACTOR * shares_norm;
                priority_fs = shares_norm / usage_efctv;
        } else {
-               priority_fs = pow(2.0, -(usage_efctv / shares_norm));
+               priority_fs = pow(2.0, -(usage_efctv / 
shares_norm)/fairshare_dampening_factor);
        }
 
        return priority_fs;
--- ./src/slurmctld/proc_req.c.orig     2013-10-04 13:52:16.421585379 -0400
+++ ./src/slurmctld/proc_req.c  2013-10-04 13:53:11.265685546 -0400
@@ -622,6 +622,7 @@
        conf_ptr->priority_type       = xstrdup(conf->priority_type);
        conf_ptr->priority_weight_age = conf->priority_weight_age;
        conf_ptr->priority_weight_fs  = conf->priority_weight_fs;
+       conf_ptr->fs_dampening_factor = conf->fs_dampening_factor;
        conf_ptr->priority_weight_js  = conf->priority_weight_js;
        conf_ptr->priority_weight_part= conf->priority_weight_part;
        conf_ptr->priority_weight_qos = conf->priority_weight_qos;
--- ./src/api/config_info.c.orig        2013-10-04 13:39:24.703756802 -0400
+++ ./src/api/config_info.c     2013-10-04 13:43:03.052534606 -0400
@@ -762,6 +762,13 @@
                list_append(ret_list, key_pair);
 
                snprintf(tmp_str, sizeof(tmp_str), "%u",
+                        slurm_ctl_conf_ptr->fs_dampening_factor);
+               key_pair = xmalloc(sizeof(config_key_pair_t));
+               key_pair->name = xstrdup("FairShareDampeningFactor");
+               key_pair->value = xstrdup(tmp_str);
+               list_append(ret_list, key_pair);
+
+               snprintf(tmp_str, sizeof(tmp_str), "%u",
                         slurm_ctl_conf_ptr->priority_weight_js);
                key_pair = xmalloc(sizeof(config_key_pair_t));
                key_pair->name = xstrdup("PriorityWeightJobSize");
--- ./slurm/slurm.h.in.orig     2013-10-04 14:03:18.763114377 -0400
+++ ./slurm/slurm.h.in  2013-10-04 14:02:51.669466647 -0400
@@ -2043,6 +2043,7 @@
        char *priority_type;    /* priority type plugin */
        uint32_t priority_weight_age; /* weight for age factor */
        uint32_t priority_weight_fs; /* weight for Fairshare factor */
+       uint32_t fs_dampening_factor; /*  dampening for Fairshare factor */
        uint32_t priority_weight_js; /* weight for Job Size factor */
        uint32_t priority_weight_part; /* weight for Partition factor */
        uint32_t priority_weight_qos; /* weight for QOS factor */

Reply via email to