Thanks for noticing this! Here is a patch that will fix this and the other
limits that weren't being reset correctly on a reconfig. As you noticed, this
only affected the QOS limits, not the account associations.
Let me know if you see anything else.
Danny
On 02/13/11 12:26, Bjørn-Helge Mevik wrote:
[email protected] (Bjørn-Helge Mevik) writes:
We think we've discovered an error in slurm 2.2.1. When we do a
scontrol reconfig (or kill -hup the slurmctld), the usage->grp_used_cpus
field of qos'es and associations gets set to its previous value + the
current actual usage. (If we restart slurm, it gets set to the current
actual usage.)
[...]
We can reproduce it on an un-patched slurm 2.2.1 like this (this is for
QoS limits, but we see the same behaviour for account limits):
A small correction to myself: we have seen this only for _QOSes_, not
for _accounts_. I guess I got a little confused when looking at the
logs (with the current debug level, we get about 1 GB per day. :-)
If I were to guess what happens, I'd suggest the problem is that
_restore_job_dependencies() calls assoc_mgr_clear_used_info(), which
clears the association limits but doesn't clear the QOS limits.
Index: src/common/assoc_mgr.c
===================================================================
--- src/common/assoc_mgr.c (revision 22469)
+++ src/common/assoc_mgr.c (working copy)
@@ -109,9 +109,9 @@
return SLURM_SUCCESS;
}
-static int _clear_used_info(slurmdb_association_rec_t *assoc)
+static int _clear_used_assoc_info(slurmdb_association_rec_t *assoc)
{
- if (!assoc)
+ if (!assoc || !assoc->usage)
return SLURM_ERROR;
assoc->usage->grp_used_cpus = 0;
@@ -127,6 +127,24 @@
return SLURM_SUCCESS;
}
+static int _clear_used_qos_info(slurmdb_qos_rec_t *qos)
+{
+ if (!qos || !qos->usage)
+ return SLURM_ERROR;
+
+ qos->usage->grp_used_cpus = 0;
+ qos->usage->grp_used_nodes = 0;
+
+ qos->usage->grp_used_jobs = 0;
+ qos->usage->grp_used_submit_jobs = 0;
+ /* do not reset usage_raw or grp_used_wall.
+ * if you need to reset it do it
+ * else where since sometimes we call this and do not want
+ * shares reset */
+
+ return SLURM_SUCCESS;
+}
+
/* Locks should be in place before calling this. */
static int _change_user_name(slurmdb_user_rec_t *user)
{
@@ -2452,7 +2470,7 @@
changed we could have different usage
*/
if (!object->user) {
- _clear_used_info(object);
+ _clear_used_assoc_info(object);
object->usage->usage_raw = 0;
object->usage->grp_used_wall = 0;
}
@@ -3091,18 +3109,27 @@
{
ListIterator itr = NULL;
slurmdb_association_rec_t * found_assoc = NULL;
+ slurmdb_qos_rec_t * found_qos = NULL;
assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK,
- NO_LOCK, NO_LOCK, NO_LOCK };
+ WRITE_LOCK, NO_LOCK, NO_LOCK };
- if (!assoc_mgr_association_list)
- return;
-
assoc_mgr_lock(&locks);
- itr = list_iterator_create(assoc_mgr_association_list);
- while ((found_assoc = list_next(itr))) {
- _clear_used_info(found_assoc);
+ if (assoc_mgr_association_list) {
+ itr = list_iterator_create(assoc_mgr_association_list);
+ while ((found_assoc = list_next(itr))) {
+ _clear_used_assoc_info(found_assoc);
+ }
+ list_iterator_destroy(itr);
}
- list_iterator_destroy(itr);
+
+ if (assoc_mgr_qos_list) {
+ itr = list_iterator_create(assoc_mgr_qos_list);
+ while ((found_qos = list_next(itr))) {
+ _clear_used_qos_info(found_qos);
+ }
+ list_iterator_destroy(itr);
+ }
+
assoc_mgr_unlock(&locks);
}