Thanks for noticing this!  Here is a patch that will fix this and the other 
limits that weren't doing the right thing on a reconfig.  As you noticed, this 
only had to do with QOS and not with the account associations.

Let me know if you see anything else.

Danny



On 02/13/11 12:26, Bjørn-Helge Mevik wrote:
[email protected] (Bjørn-Helge Mevik) writes:

We think we've discovered an error in slurm 2.2.1.  When we do a
scontrol reconfig (or kill -hup the slurmctld), the usage->grp_used_cpus
field of qos'es and associations gets set to its previous value + the
current actual usage.  (If we restart slurm, it gets set to the current
actual usage.)

[...]

We can reproduce it on an un-patched slurm 2.2.1 like this (this is for
QoS limits, but we see the same behaviour for account limits):

A small correction to myself: we have seen this only for _QOSes_, not
for _accounts_.  I guess I got a little confused when looking at the
logs (with the current debug level, we get about 1 GB per day. :-)

If I were to guess what happens, I'd suggest the problem is that
_restore_job_dependencies() calls assoc_mgr_clear_used_info() which
clears association limits, but it doesn't clear the QOS limits.

Index: src/common/assoc_mgr.c
===================================================================
--- src/common/assoc_mgr.c	(revision 22469)
+++ src/common/assoc_mgr.c	(working copy)
@@ -109,9 +109,9 @@
 	return SLURM_SUCCESS;
 }
 
-static int _clear_used_info(slurmdb_association_rec_t *assoc)
+static int _clear_used_assoc_info(slurmdb_association_rec_t *assoc)
 {
-	if (!assoc)
+	if (!assoc || !assoc->usage)
 		return SLURM_ERROR;
 
 	assoc->usage->grp_used_cpus = 0;
@@ -127,6 +127,24 @@
 	return SLURM_SUCCESS;
 }
 
+static int _clear_used_qos_info(slurmdb_qos_rec_t *qos)
+{
+	if (!qos || !qos->usage)
+		return SLURM_ERROR;
+
+	qos->usage->grp_used_cpus = 0;
+	qos->usage->grp_used_nodes = 0;
+
+	qos->usage->grp_used_jobs  = 0;
+	qos->usage->grp_used_submit_jobs = 0;
+	/* do not reset usage_raw or grp_used_wall.
+	 * if you need to reset it do it
+	 * else where since sometimes we call this and do not want
+	 * shares reset */
+
+	return SLURM_SUCCESS;
+}
+
 /* Locks should be in place before calling this. */
 static int _change_user_name(slurmdb_user_rec_t *user)
 {
@@ -2452,7 +2470,7 @@
 			   changed we could have different usage
 			*/
 			if (!object->user) {
-				_clear_used_info(object);
+				_clear_used_assoc_info(object);
 				object->usage->usage_raw = 0;
 				object->usage->grp_used_wall = 0;
 			}
@@ -3091,18 +3109,27 @@
 {
 	ListIterator itr = NULL;
 	slurmdb_association_rec_t * found_assoc = NULL;
+	slurmdb_qos_rec_t * found_qos = NULL;
 	assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK,
-				   NO_LOCK, NO_LOCK, NO_LOCK };
+				   WRITE_LOCK, NO_LOCK, NO_LOCK };
 
-	if (!assoc_mgr_association_list)
-		return;
-
 	assoc_mgr_lock(&locks);
-	itr = list_iterator_create(assoc_mgr_association_list);
-	while ((found_assoc = list_next(itr))) {
-		_clear_used_info(found_assoc);
+	if (assoc_mgr_association_list) {
+		itr = list_iterator_create(assoc_mgr_association_list);
+		while ((found_assoc = list_next(itr))) {
+			_clear_used_assoc_info(found_assoc);
+		}
+		list_iterator_destroy(itr);
 	}
-	list_iterator_destroy(itr);
+
+	if (assoc_mgr_qos_list) {
+		itr = list_iterator_create(assoc_mgr_qos_list);
+		while ((found_qos = list_next(itr))) {
+			_clear_used_qos_info(found_qos);
+		}
+		list_iterator_destroy(itr);
+	}
+
 	assoc_mgr_unlock(&locks);
 }
 

Reply via email to