Hi, we are experiencing a problem with the behavior of the sacct command when only jobs running on a subset of nodes are requested. In some situations, we get results corresponding to a different subset of nodes.
The running configuration is:

[root@lascaux0 ~]# grep StorageType /etc/slurm/slurm.conf
AccountingStorageType=accounting_storage/slurmdbd
[root@lascaux0 ~]# grep StorageType /etc/slurm/slurmdbd.conf
StorageType=accounting_storage/mysql
[root@lascaux0 ~]#

Example of the problem:

[root@lascaux0 ~]# sacct -Xno nodelist%100 -N lascaux9014
lascaux5011
lascaux[5001,5003,5011-5012,5014,5039,5050,5093,5117-5118,5123,5125,5134,5162,5165]
[root@lascaux0 ~]#
[root@lascaux0 ~]# sacct -Xno nodelist%100 -N lascaux5011
lascaux[9011-9012]
lascaux[9011-9013]
lascaux[9011-9013]
lascaux[9011-9013]
lascaux[9004-9005,9011-9012]
lascaux[9004-9005,9011-9012]
lascaux[9004-9005,9011-9012]
[root@lascaux0 ~]#

This problem seems to be due to the fact that the engine that selects the jobs matching the nodelist criteria compares two bitmaps that are not built against the same reference. For the MySQL plugin, the comparison is done in as_mysql_jobacct_process.c:949:

	job_bitmap = bit_alloc(hostlist_count((*curr_cluster)->hl));
	bit_unfmt(job_bitmap, node_inx);
	if (!bit_overlap((*curr_cluster)->asked_bitmap, job_bitmap)) {
		FREE_NULL_BITMAP(job_bitmap);
		return 0;
	}

However, asked_bitmap is built with the cluster node list as the reference, whereas node_inx is built against node_record_table_ptr (see for example accounting_storage_slurmdbd.c:178). The order of the nodes in node_record_table_ptr depends on the order in which they are described in slurm.conf, while the order in the cluster node list always corresponds to the numerically sorted list of the cluster's nodes. As a result, the two references do not match whenever the node declarations in slurm.conf do not follow the numerical sort order, which is our case:

[root@lascaux0 slurm-2.2.7]# grep NodeName= /etc/slurm/slurm.conf | egrep "9000|5000"
NodeName=lascaux[9000-9199] NodeAddr=lascaux[9000-9199] Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=128900 Weight=30 State=UNKNOWN
NodeName=lascaux[5000-5197] NodeAddr=lascaux[5000-5197] Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=23200 Weight=40 State=UNKNOWN
[root@lascaux0 slurm-2.2.7]#

A clean solution would be to make the two references identical, i.e. build node_inx against a numerically sorted node list rather than slurmctld's internal node table. However, the entries already stored in the database carry a wrong node_inx value, so the correct indexes would have to be recomputed for everything to work again.

As a workaround that lets us access our accounting history on our clusters with a working node-subset functionality, I have made a patch (enclosed) that no longer uses node_inx in the comparison but uses the job's node list directly instead. It is certainly a bit less optimized than using the index, but it avoids the coherency problem described above without having to regenerate every node_inx entry already stored in the database. FYI, the PostgreSQL part of the patch is not tested/validated, but that plugin should have the same problem without the workaround.

I will let you judge how best to handle this: either switch to a presumably less optimized algorithm such as the one in the enclosed patch, or make the node_inx generation coherent and add a hook to recompute all the node_inx entries of jobs and steps already stored in the database. I do not have figures comparing the performance of the two algorithms; if you do not have a clear preference either, I could try to benchmark them to help with the choice.

Regards,
Matthieu
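PS: To make the reference mismatch concrete, here is a minimal sketch, to be built inside the slurm-2.2.7 tree against the common library. Only the node names come from our configuration; the rest is pure illustration. The same node gets a different index depending on which list serves as the reference, so a bitmap built against one cannot be compared with a bitmap built against the other:

#include <stdio.h>
#include "src/common/hostlist.h"

int main(void)
{
	/* Reference used for asked_bitmap: the cluster node list,
	 * which is always numerically sorted. */
	hostlist_t sorted = hostlist_create("lascaux[5000-5197,9000-9199]");

	/* Reference used for node_inx: node_record_table_ptr, which
	 * follows the declaration order of slurm.conf -- on our
	 * cluster the 9000 range is declared before the 5000 range. */
	hostlist_t conf_order =
		hostlist_create("lascaux[9000-9199],lascaux[5000-5197]");

	printf("lascaux5011 in sorted reference:     %d\n",
	       hostlist_find(sorted, "lascaux5011"));	/* prints 11 */
	printf("lascaux5011 in slurm.conf reference: %d\n",
	       hostlist_find(conf_order, "lascaux5011"));	/* prints 211 */

	hostlist_destroy(sorted);
	hostlist_destroy(conf_order);
	return 0;
}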
--- slurm-2.2.7.base/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.h	2011-06-10 18:55:39.000000000 +0200
+++ slurm-2.2.7/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.h	2011-09-09 15:12:21.968667214 +0200
@@ -52,6 +52,9 @@
 extern int good_nodes_from_inx(List local_cluster_list,
			       void **object, char *node_inx,
			       int submit);
+extern int good_nodes_from_nodelist(List local_cluster_list,
+				    void **object, char *nodelist,
+				    int submit);
 extern char *setup_job_cluster_cond_limits(mysql_conn_t *mysql_conn,
					   slurmdb_job_cond_t *job_cond,
					   char *cluster_name, char **extra);
--- slurm-2.2.7.base/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c	2011-06-10 18:57:28.000000000 +0200
+++ slurm-2.2.7/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c	2011-09-09 15:36:05.038848356 +0200
@@ -442,9 +442,9 @@
 
 		/* check the bitmap to see if this is one of the jobs
		   we are looking for */
-		if (!good_nodes_from_inx(local_cluster_list,
-					 (void **)&curr_cluster,
-					 row[JOB_REQ_NODE_INX], submit)) {
+		if (!good_nodes_from_nodelist(local_cluster_list,
+					      (void **)&curr_cluster,
+					      row[JOB_REQ_NODELIST], submit)) {
 			last_id = curr_id;
 			continue;
 		}
@@ -672,10 +672,10 @@
	while ((step_row = mysql_fetch_row(step_result))) {
 		/* check the bitmap to see if this is one of the
		   steps we are looking for */
-		if (!good_nodes_from_inx(local_cluster_list,
-					 (void **)&curr_cluster,
-					 step_row[STEP_REQ_NODE_INX],
-					 submit))
+		if (!good_nodes_from_nodelist(local_cluster_list,
+					      (void **)&curr_cluster,
+					      step_row[STEP_REQ_NODELIST],
+					      submit))
 			continue;
 
 		step = slurmdb_create_step_rec();
@@ -956,6 +956,64 @@
	return 1;
 }
 
+extern int good_nodes_from_nodelist(List local_cluster_list,
+				    void **object, char *nodelist,
+				    int submit)
+{
+	local_cluster_t **curr_cluster = (local_cluster_t **)object;
+	int loc = 0;
+	char *host = NULL;
+	hostlist_t temp_hl = NULL;
+	hostlist_iterator_t h_itr = NULL;
+
+	/* check the bitmap to see if this is one of the jobs
+	   we are looking for */
+	if (*curr_cluster) {
+		bitstr_t *job_bitmap = NULL;
+		if (!nodelist || !nodelist[0])
+			return 0;
+		if ((submit < (*curr_cluster)->start)
+		    || (submit > (*curr_cluster)->end)) {
+			local_cluster_t *local_cluster = NULL;
+
+			ListIterator itr =
+				list_iterator_create(local_cluster_list);
+			while ((local_cluster = list_next(itr))) {
+				if ((submit >= local_cluster->start)
+				    && (submit <= local_cluster->end)) {
+					*curr_cluster = local_cluster;
+					break;
+				}
+			}
+			list_iterator_destroy(itr);
+			if (!local_cluster)
+				return 0;
+		}
+		job_bitmap = bit_alloc(hostlist_count((*curr_cluster)->hl));
+		temp_hl = hostlist_create(nodelist);
+		if (hostlist_count(temp_hl) <= 0) {
+			hostlist_destroy(temp_hl);
+			FREE_NULL_BITMAP(job_bitmap);
+			return 0;
+		}
+		h_itr = hostlist_iterator_create(temp_hl);
+		while ((host = hostlist_next(h_itr))) {
+			if ((loc = hostlist_find(
+				     (*curr_cluster)->hl, host)) != -1)
+				bit_set(job_bitmap, loc);
+			free(host);
+		}
+		hostlist_iterator_destroy(h_itr);
+		hostlist_destroy(temp_hl);
+		if (!bit_overlap((*curr_cluster)->asked_bitmap, job_bitmap)) {
+			FREE_NULL_BITMAP(job_bitmap);
+			return 0;
+		}
+		FREE_NULL_BITMAP(job_bitmap);
+	}
+	return 1;
+}
+
 extern char *setup_job_cluster_cond_limits(mysql_conn_t *mysql_conn,
					   slurmdb_job_cond_t *job_cond,
					   char *cluster_name, char **extra)
--- slurm-2.2.7.base/src/plugins/accounting_storage/mysql/as_mysql_resv.c	2011-06-10 18:55:39.000000000 +0200
+++ slurm-2.2.7/src/plugins/accounting_storage/mysql/as_mysql_resv.c	2011-09-09 15:03:21.418548270 +0200
@@ -593,8 +593,8 @@
 		int start = slurm_atoul(row[RESV_REQ_START]);
 		list_append(resv_list, resv);
 
-		if (!good_nodes_from_inx(local_cluster_list, &curr_cluster,
-					 row[RESV_REQ_NODE_INX], start))
+		if (!good_nodes_from_nodelist(local_cluster_list, &curr_cluster,
+					      row[RESV_REQ_NODES], start))
 			continue;
 
 		resv->id = slurm_atoul(row[RESV_REQ_ID]);
--- slurm-2.2.7.base/src/plugins/accounting_storage/pgsql/as_pg_common.c	2011-06-10 18:55:39.000000000 +0200
+++ slurm-2.2.7/src/plugins/accounting_storage/pgsql/as_pg_common.c	2011-09-09 15:22:51.478653581 +0200
@@ -478,6 +478,66 @@
	return 1;
 }
 
+/*
+ * good_nodes_from_nodelist - whether nodelist is within the used nodes
+ *   of specified cluster
+ */
+extern int
+good_nodes_from_nodelist(cluster_nodes_t *cnodes, char *nodelist, int submit)
+{
+	bitstr_t *job_bitmap = NULL;
+	int loc = 0;
+	char *host = NULL;
+	hostlist_t temp_hl = NULL;
+	hostlist_iterator_t h_itr = NULL;
+
+	if (! cnodes)
+		return 1;
+
+	if(!nodelist || !nodelist[0])
+		return 0;
+
+	if(!cnodes->curr_cluster ||
+	   (submit < (cnodes->curr_cluster)->start) ||
+	   (submit > (cnodes->curr_cluster)->end)) {
+		local_cluster_t *local_cluster = NULL;
+		ListIterator itr =
+			list_iterator_create(cnodes->cluster_list);
+		while((local_cluster = list_next(itr))) {
+			if((submit >= local_cluster->start)
+			   && (submit <= local_cluster->end)) {
+				cnodes->curr_cluster = local_cluster;
+				break;
+			}
+		}
+		list_iterator_destroy(itr);
+		if (! local_cluster)
+			return 0;
+	}
+	job_bitmap = bit_alloc(hostlist_count((cnodes->curr_cluster)->hl));
+	temp_hl = hostlist_create(nodelist);
+	if (hostlist_count(temp_hl) <= 0) {
+		hostlist_destroy(temp_hl);
+		FREE_NULL_BITMAP(job_bitmap);
+		return 0;
+	}
+	h_itr = hostlist_iterator_create(temp_hl);
+	while ((host = hostlist_next(h_itr))) {
+		if ((loc = hostlist_find(
+			     (cnodes->curr_cluster)->hl, host)) != -1)
+			bit_set(job_bitmap, loc);
+		free(host);
+	}
+	hostlist_iterator_destroy(h_itr);
+	hostlist_destroy(temp_hl);
+	if(!bit_overlap((cnodes->curr_cluster)->asked_bitmap, job_bitmap)) {
+		FREE_NULL_BITMAP(job_bitmap);
+		return 0;
+	}
+	FREE_NULL_BITMAP(job_bitmap);
+	return 1;
+}
+
 /* rollback and discard updates */
 extern void
 reset_pgsql_conn(pgsql_conn_t *pg_conn)
--- slurm-2.2.7.base/src/plugins/accounting_storage/pgsql/as_pg_get_jobs.c	2011-06-10 18:57:28.000000000 +0200
+++ slurm-2.2.7/src/plugins/accounting_storage/pgsql/as_pg_get_jobs.c	2011-09-09 15:18:24.028546398 +0200
@@ -570,7 +570,7 @@
 
 		/* check the bitmap to see if this is one of the jobs
		   we are looking for */
-		if(!good_nodes_from_inx(cnodes, ROW(JF_NODE_INX), submit))
+		if(!good_nodes_from_nodelist(cnodes, ROW(JF_NODELIST), submit))
 			continue;
 
 		debug3("as/pg: get_jobs_cond: job %d past node test", curr_id);
@@ -772,8 +772,8 @@
	FOR_EACH_ROW2 {
 		/* check the bitmap to see if this is one of the
		   steps we are looking for */
-		if(!good_nodes_from_inx(cnodes, ROW2(SF_NODE_INX),
-					submit))
+		if(!good_nodes_from_nodelist(cnodes, ROW2(SF_NODELIST),
+					     submit))
 			continue;
 
 		step = slurmdb_create_step_rec();
--- slurm-2.2.7.base/src/plugins/accounting_storage/pgsql/as_pg_resv.c	2011-06-10 18:55:39.000000000 +0200
+++ slurm-2.2.7/src/plugins/accounting_storage/pgsql/as_pg_resv.c	2011-09-09 15:18:11.358554295 +0200
@@ -535,7 +535,7 @@
 		int start;
 
 		start = atoi(ROW(F_START));
-		if(!good_nodes_from_inx(cnodes, ROW(F_NODE_INX), start))
+		if(!good_nodes_from_nodelist(cnodes, ROW(F_NODELIST), start))
 			continue;
 
 		resv = xmalloc(sizeof(slurmdb_reservation_rec_t));
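
PS2: For reference, the heart of the workaround can be exercised in isolation. The sketch below is not part of the patch: it is a hypothetical harness (the nodelist_overlaps helper and the asked node are made up) that reproduces what good_nodes_from_nodelist does, namely map the job's node list onto a bitmap relative to the sorted cluster hostlist and then test the overlap with the nodes the user asked about.

#include <stdio.h>
#include "src/common/hostlist.h"
#include "src/common/bitstring.h"

/* Build a bitmap of nodelist relative to the sorted cluster hostlist
 * cluster_hl, then report whether it overlaps asked_bitmap. */
static int nodelist_overlaps(hostlist_t cluster_hl, bitstr_t *asked_bitmap,
			     char *nodelist)
{
	bitstr_t *job_bitmap = bit_alloc(hostlist_count(cluster_hl));
	hostlist_t temp_hl = hostlist_create(nodelist);
	hostlist_iterator_t h_itr = hostlist_iterator_create(temp_hl);
	char *host;
	int loc, rc;

	while ((host = hostlist_next(h_itr))) {
		if ((loc = hostlist_find(cluster_hl, host)) != -1)
			bit_set(job_bitmap, loc);
		free(host);
	}
	hostlist_iterator_destroy(h_itr);
	hostlist_destroy(temp_hl);

	rc = bit_overlap(asked_bitmap, job_bitmap);
	FREE_NULL_BITMAP(job_bitmap);
	return rc;
}

int main(void)
{
	hostlist_t cluster_hl = hostlist_create("lascaux[5000-5197,9000-9199]");
	bitstr_t *asked_bitmap = bit_alloc(hostlist_count(cluster_hl));
	int loc = hostlist_find(cluster_hl, "lascaux5011");

	/* The user asked for -N lascaux5011 (index 11 in the sorted list). */
	if (loc != -1)
		bit_set(asked_bitmap, loc);

	printf("%d\n", nodelist_overlaps(cluster_hl, asked_bitmap,
					 "lascaux[5011-5012]"));	/* 1 */
	printf("%d\n", nodelist_overlaps(cluster_hl, asked_bitmap,
					 "lascaux[9011-9013]"));	/* 0 */

	FREE_NULL_BITMAP(asked_bitmap);
	hostlist_destroy(cluster_hl);
	return 0;
}

Compared to bit_unfmt() on a stored index string, this costs a hostlist_create() plus one hostlist_find() per node of the job, but it depends only on node names, so it stays correct whatever the declaration order in slurm.conf.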