Dear all,
we are using the cpu_load information stored in node_info as part of our
monitoring to get a ballpark figure for the CPU utilization by the workload on
a particular node. This works quite nicely and fits well in our overall
infrastructure. However, currently we lack a corresponding measure for the
memory usage on nodes.
We are thinking about patching our Slurm version to provide the free memory in
the same way as the CPU load is recorded. In contrast to CPU load, where Slurm
records the 5-minute average, that number is the instantaneous memory usage,
which is admittedly not perfect but good enough from my point of view.
Before taking this step, however, I would like to understand whether it would
be possible to bring the required changes upstream or whether we would need to
maintain the patch set for a longer period of time. I am pasting the diff - on
top of the current HEAD - at the end of the e-mail (please note that this is an
early first version). If there is a chance to include these changes in the
official version I would be glad to clean this up and send it in for review.
Thank you for your time and consideration.
Best regards,
Dorian
diff --git a/contribs/slurm_completion_help/slurm_completion.sh
b/contribs/slurm_completion_help/slurm_completion.sh
index e5994d9..6f57163 100644
--- a/contribs/slurm_completion_help/slurm_completion.sh
+++ b/contribs/slurm_completion_help/slurm_completion.sh
@@ -1190,7 +1190,7 @@ _sinfo()
%E(reason) %f(features) %F(nodes_usage) %g(group) %G(Gres)\
%h(shared) %H(timestamp) %l(time_limit) %L(default_time) %m(mem)\
%M(preemt_mode) %n(hostnames) %N(node_names) %o(node_addr)\
- %O(cpu_load) %p(partition_prio) %P(partition) %r(root_jobs)\
+ %O(cpu_load) %e(free_mem) %p(partition_prio) %P(partition)
%r(root_jobs)\
%R(reason) %s(max_job_size) %S(allowed_allocating_nodes)\
%t(state) %T(state) %u(user) %U(uID) %w(weight)\
%X(sockets_per_node) %Y(cores_per_socket)\
diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1
index 36697ee..0a418d1 100644
--- a/doc/man/man1/sinfo.1
+++ b/doc/man/man1/sinfo.1
@@ -197,6 +197,9 @@ List of node communication addresses
\fB%O\fR
CPU load of a node
.TP
+\fB%e\fR
+Free memory of a node
+.TP
\fB%p\fR
Partition scheduling priority
.TP
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index e708883..5b40a7d 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1814,6 +1814,7 @@ typedef struct node_info {
uint16_t cores; /* number of cores per socket */
uint16_t core_spec_cnt; /* number of specialized cores on node */
uint32_t cpu_load; /* CPU load * 100 */
+ uint32_t free_mem; /* free memory in MiB */
uint16_t cpus; /* configured count of cpus running on
* the node */
char *cpu_spec_list; /* node's specialized cpus */
diff --git a/src/api/node_info.c b/src/api/node_info.c
index 9284aaa..d7a40e9 100644
--- a/src/api/node_info.c
+++ b/src/api/node_info.c
@@ -128,7 +128,7 @@ slurm_sprint_node_table (node_info_t * node_ptr,
{
uint32_t my_state = node_ptr->node_state;
char *cloud_str = "", *comp_str = "", *drain_str = "", *power_str = "";
- char load_str[32], tmp_line[512], time_str[32], owner_str[32];
+ char load_str[32], mem_str[32], tmp_line[512], time_str[32],
owner_str[32];
char *out = NULL, *reason_str = NULL, *select_reason_str = NULL;
uint16_t err_cpus = 0, alloc_cpus = 0;
int cpus_per_node = 1;
@@ -278,13 +278,18 @@ slurm_sprint_node_table (node_info_t * node_ptr,
snprintf(tmp_line, sizeof(tmp_line), "OS=%s ", node_ptr->os);
xstrcat(out, tmp_line);
}
+ if (node_ptr->free_mem == NO_VAL)
+ strcpy(mem_str, "N/A");
+ else {
+ snprintf(mem_str, sizeof(mem_str), "%u", node_ptr->free_mem);
+ }
slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
SELECT_NODEDATA_MEM_ALLOC,
NODE_STATE_ALLOCATED,
&alloc_memory);
snprintf(tmp_line, sizeof(tmp_line),
- "RealMemory=%u AllocMem=%u Sockets=%u Boards=%u",
- node_ptr->real_memory, alloc_memory,
+ "RealMemory=%u AllocMem=%u FreeMem=%s Sockets=%u Boards=%u",
+ node_ptr->real_memory, alloc_memory, mem_str,
node_ptr->sockets, node_ptr->boards);
xstrcat(out, tmp_line);
if (one_liner)
diff --git a/src/common/node_conf.c b/src/common/node_conf.c
index 118bc5c..22e20b0 100644
--- a/src/common/node_conf.c
+++ b/src/common/node_conf.c
@@ -793,6 +793,7 @@ extern struct node_record *create_node_record (
/* these values will be overwritten when the node actually registers */
node_ptr->cpus = config_ptr->cpus;
node_ptr->cpu_load = NO_VAL;
+ node_ptr->free_mem = NO_VAL;
node_ptr->cpu_spec_list = xstrdup(config_ptr->cpu_spec_list);
node_ptr->boards = config_ptr->boards;
node_ptr->sockets = config_ptr->sockets;
diff --git a/src/common/node_conf.h b/src/common/node_conf.h
index 20677b1..7f52f89 100644
--- a/src/common/node_conf.h
+++ b/src/common/node_conf.h
@@ -168,6 +168,8 @@ struct node_record {
* to access contents */
uint32_t cpu_load; /* CPU load * 100 */
time_t cpu_load_time; /* Time when cpu_load last set */
+ uint32_t free_mem; /* Free memory in MiB */
+ time_t free_mem_time; /* Time when free_mem last set */
uint16_t protocol_version; /* Slurm version number */
char *version; /* Slurm version */
bitstr_t *node_spec_bitmap; /* node cpu specialization bitmap */
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 4f4fa2f..4991732 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -1067,6 +1067,7 @@ typedef struct suspend_int_msg {
typedef struct ping_slurmd_resp_msg {
uint32_t cpu_load; /* CPU load * 100 */
+ uint32_t free_mem; /* Free memory in MiB */
} ping_slurmd_resp_msg_t;
typedef struct license_info_request_msg {
@@ -1087,6 +1088,7 @@ typedef struct slurm_node_registration_status_msg {
uint16_t cores;
uint16_t cpus;
uint32_t cpu_load; /* CPU load * 100 */
+ uint32_t free_mem; /* Free memory in MiB */
char *cpu_spec_list; /* list of specialized CPUs */
acct_gather_energy_t *energy;
Buf gres_info; /* generic resource info */
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index bbe6145..c35d18c 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -3328,6 +3328,7 @@
_pack_node_registration_status_msg(slurm_node_registration_status_msg_t *
pack32(msg->up_time, buffer);
pack32(msg->hash_val, buffer);
pack32(msg->cpu_load, buffer);
+ pack32(msg->free_mem, buffer);
pack32(msg->job_count, buffer);
for (i = 0; i < msg->job_count; i++) {
@@ -3366,6 +3367,7 @@
_pack_node_registration_status_msg(slurm_node_registration_status_msg_t *
pack32(msg->up_time, buffer);
pack32(msg->hash_val, buffer);
pack32(msg->cpu_load, buffer);
+ pack32(msg->free_mem, buffer);
pack32(msg->job_count, buffer);
for (i = 0; i < msg->job_count; i++) {
@@ -3431,6 +3433,7 @@
_unpack_node_registration_status_msg(slurm_node_registration_status_msg_t
safe_unpack32(&node_reg_ptr->up_time, buffer);
safe_unpack32(&node_reg_ptr->hash_val, buffer);
safe_unpack32(&node_reg_ptr->cpu_load, buffer);
+ safe_unpack32(&node_reg_ptr->free_mem, buffer);
safe_unpack32(&node_reg_ptr->job_count, buffer);
node_reg_ptr->job_id =
@@ -3488,6 +3491,7 @@
_unpack_node_registration_status_msg(slurm_node_registration_status_msg_t
safe_unpack32(&node_reg_ptr->up_time, buffer);
safe_unpack32(&node_reg_ptr->hash_val, buffer);
safe_unpack32(&node_reg_ptr->cpu_load, buffer);
+ safe_unpack32(&node_reg_ptr->free_mem, buffer);
safe_unpack32(&node_reg_ptr->job_count, buffer);
node_reg_ptr->job_id =
@@ -3944,6 +3948,7 @@ _unpack_node_info_members(node_info_t * node, Buf buffer,
buffer);
safe_unpack32(&node->cpu_load, buffer);
+ safe_unpack32(&node->free_mem, buffer);
safe_unpack32(&node->weight, buffer);
safe_unpack32(&node->reason_uid, buffer);
@@ -3995,6 +4000,7 @@ _unpack_node_info_members(node_info_t * node, Buf buffer,
buffer);
safe_unpack32(&node->cpu_load, buffer);
+ safe_unpack32(&node->free_mem, buffer);
safe_unpack32(&node->weight, buffer);
safe_unpack32(&node->reason_uid, buffer);
@@ -4037,6 +4043,7 @@ _unpack_node_info_members(node_info_t * node, Buf buffer,
safe_unpack32(&node->real_memory, buffer);
safe_unpack32(&node->tmp_disk, buffer);
safe_unpack32(&node->cpu_load, buffer);
+ safe_unpack32(&node->free_mem, buffer);
safe_unpack32(&node->weight, buffer);
safe_unpack32(&node->reason_uid, buffer);
@@ -12091,6 +12098,7 @@ static void
_pack_ping_slurmd_resp(ping_slurmd_resp_msg_t *msg,
xassert (msg != NULL);
pack32(msg->cpu_load, buffer);
+ pack32(msg->free_mem, buffer);
}
static int _unpack_ping_slurmd_resp(ping_slurmd_resp_msg_t **msg_ptr,
@@ -12102,6 +12110,7 @@ static int
_unpack_ping_slurmd_resp(ping_slurmd_resp_msg_t **msg_ptr,
msg = xmalloc(sizeof(ping_slurmd_resp_msg_t));
*msg_ptr = msg;
safe_unpack32(&msg->cpu_load, buffer);
+ safe_unpack32(&msg->free_mem, buffer);
return SLURM_SUCCESS;
diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c
index d72058d..cce1449 100644
--- a/src/sinfo/opts.c
+++ b/src/sinfo/opts.c
@@ -700,6 +700,12 @@ _parse_format( char* format )
field_size,
right_justify,
suffix );
+ } else if (field[0] == 'e') {
+ params.match_flags.free_mem_flag = true;
+ format_add_free_mem( params.format_list,
+ field_size,
+ right_justify,
+ suffix );
} else if (field[0] == 'p') {
params.match_flags.priority_flag = true;
format_add_priority( params.format_list,
diff --git a/src/sinfo/print.c b/src/sinfo/print.c
index a697e4f..36a7a8d 100644
--- a/src/sinfo/print.c
+++ b/src/sinfo/print.c
@@ -67,6 +67,9 @@ static int _build_min_max_32_string(char *buffer, int
buf_size,
static int _build_cpu_load_min_max_32(char *buffer, int buf_size,
uint32_t min, uint32_t max,
bool range);
+static int _build_free_mem_min_max_32(char *buffer, int buf_size,
+ uint32_t min, uint32_t max,
+ bool range);
static void _print_reservation(reserve_info_t *resv_ptr, int width);
static int _print_secs(long time, int width, bool right, bool cut_output);
static int _print_str(char *str, int width, bool right, bool cut_output);
@@ -317,6 +320,39 @@ _build_cpu_load_min_max_32(char *buffer, int buf_size,
return snprintf(buffer, buf_size, "%s+", tmp_min);
}
+/*
+ * _build_free_mem_min_max_32 - format a free memory range for display
+ * OUT buffer - destination for the formatted text
+ * IN buf_size - size of buffer in bytes
+ * IN min, max - lowest and highest free memory values, NO_VAL prints "N/A"
+ * IN range - if true print "min-max", otherwise print "min+"
+ * RET the value returned by snprintf()
+ * NOTE: equal min and max always print as a single value
+ */
+static int
+_build_free_mem_min_max_32(char *buffer, int buf_size,
+			   uint32_t min, uint32_t max,
+			   bool range)
+{
+	char lo_str[16], hi_str[16];
+
+	if (min != NO_VAL)
+		snprintf(lo_str, sizeof(lo_str), "%u", min);
+	else
+		strcpy(lo_str, "N/A");
+
+	if (max != NO_VAL)
+		snprintf(hi_str, sizeof(hi_str), "%u", max);
+	else
+		strcpy(hi_str, "N/A");
+
+	if (min == max)
+		return snprintf(buffer, buf_size, "%s", hi_str);
+	if (range)
+		return snprintf(buffer, buf_size, "%s-%s", lo_str, hi_str);
+	return snprintf(buffer, buf_size, "%s+", lo_str);
+}
+
int
format_add_function(List list, int width, bool right, char *suffix,
int (*function) (sinfo_data_t *, int, bool, char*))
@@ -1190,6 +1222,33 @@ int _print_cpu_load(sinfo_data_t * sinfo_data, int width,
return SLURM_SUCCESS;
}
+/*
+ * _print_free_mem - print the free memory field of one sinfo record
+ * A NULL sinfo_data prints the column heading "FREE_MEM" instead.
+ * If suffix is non-NULL it is printed after the field.
+ * RET SLURM_SUCCESS
+ */
+int _print_free_mem(sinfo_data_t * sinfo_data, int width,
+		    bool right_justify, char *suffix)
+{
+	char mem_range[FORMAT_STRING_SIZE];
+
+	if (!sinfo_data) {
+		_print_str("FREE_MEM", width, right_justify, true);
+	} else {
+		_build_free_mem_min_max_32(mem_range, sizeof(mem_range),
+					   sinfo_data->min_free_mem,
+					   sinfo_data->max_free_mem,
+					   true);
+		_print_str(mem_range, width, right_justify, true);
+	}
+
+	if (suffix != NULL)
+		printf("%s", suffix);
+
+	return SLURM_SUCCESS;
+}
+
int _print_max_cpus_per_node(sinfo_data_t * sinfo_data, int width,
bool right_justify, char *suffix)
{
diff --git a/src/sinfo/print.h b/src/sinfo/print.h
index 4fdb6bc..07ad188 100644
--- a/src/sinfo/print.h
+++ b/src/sinfo/print.h
@@ -146,6 +146,8 @@ void print_sinfo_reservation(reserve_info_msg_t *resv_ptr);
format_add_function(list,wid,right,suffix,_print_com_invalid)
#define format_add_cpu_load(list,wid,right,suffix) \
format_add_function(list,wid,right,suffix,_print_cpu_load)
+#define format_add_free_mem(list,wid,right,suffix) \
+ format_add_function(list,wid,right,suffix,_print_free_mem)
#define format_add_max_cpus_per_node(list,wid,right,suffix) \
format_add_function(list,wid,right,suffix,_print_max_cpus_per_node)
#define format_add_version(list,wid,right,suffix) \
@@ -231,6 +233,8 @@ int _print_com_invalid(sinfo_data_t * sinfo_data, int width,
bool right_justify, char *suffix);
int _print_cpu_load(sinfo_data_t * node_ptr, int width,
bool right_justify, char *suffix);
+int _print_free_mem(sinfo_data_t * node_ptr, int width,
+ bool right_justify, char *suffix);
int _print_max_cpus_per_node(sinfo_data_t * sinfo_data, int width,
bool right_justify, char *suffix);
int _print_version(sinfo_data_t * sinfo_data, int width,
diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c
index 3819a3a..cdfcf9b 100644
--- a/src/sinfo/sinfo.c
+++ b/src/sinfo/sinfo.c
@@ -807,6 +807,9 @@ static bool _match_node_data(sinfo_data_t *sinfo_ptr,
node_info_t *node_ptr)
if (params.match_flags.cpu_load_flag &&
(node_ptr->cpu_load != sinfo_ptr->min_cpu_load))
return false;
+ if (params.match_flags.free_mem_flag &&
+ (node_ptr->free_mem != sinfo_ptr->min_free_mem))
+ return false;
if (params.match_flags.version_flag &&
(node_ptr->version != sinfo_ptr->version))
return false;
@@ -926,6 +929,8 @@ static void _update_sinfo(sinfo_data_t *sinfo_ptr,
node_info_t *node_ptr,
sinfo_ptr->max_weight = node_ptr->weight;
sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
+ sinfo_ptr->min_free_mem = node_ptr->free_mem;
+ sinfo_ptr->max_free_mem = node_ptr->free_mem;
sinfo_ptr->max_cpus_per_node = sinfo_ptr->part_info->
max_cpus_per_node;
sinfo_ptr->version = node_ptr->version;
@@ -973,6 +978,11 @@ static void _update_sinfo(sinfo_data_t *sinfo_ptr,
node_info_t *node_ptr,
sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
if (sinfo_ptr->max_cpu_load < node_ptr->cpu_load)
sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
+
+ if (sinfo_ptr->min_free_mem > node_ptr->free_mem)
+ sinfo_ptr->min_free_mem = node_ptr->free_mem;
+ if (sinfo_ptr->max_free_mem < node_ptr->free_mem)
+ sinfo_ptr->max_free_mem = node_ptr->free_mem;
}
hostlist_push_host(sinfo_ptr->nodes, node_ptr->name);
diff --git a/src/sinfo/sinfo.h b/src/sinfo/sinfo.h
index cf6be03..415f929 100644
--- a/src/sinfo/sinfo.h
+++ b/src/sinfo/sinfo.h
@@ -100,6 +100,8 @@ typedef struct {
uint32_t max_weight;
uint32_t min_cpu_load;
uint32_t max_cpu_load;
+ uint32_t min_free_mem;
+ uint32_t max_free_mem;
uint32_t max_cpus_per_node;
@@ -151,6 +153,7 @@ struct sinfo_match_flags {
bool reason_timestamp_flag;
bool reason_user_flag;
bool cpu_load_flag;
+ bool free_mem_flag;
bool max_cpus_per_node_flag;
bool version_flag;
};
diff --git a/src/sinfo/sort.c b/src/sinfo/sort.c
index 4d98937..12ca54a 100644
--- a/src/sinfo/sort.c
+++ b/src/sinfo/sort.c
@@ -54,6 +54,7 @@ static void _get_sinfo_from_void(sinfo_data_t **s1,
sinfo_data_t **s2,
void *v1, void *v2);
static int _sort_by_avail(void *void1, void *void2);
static int _sort_by_cpu_load(void *void1, void *void2);
+static int _sort_by_free_mem(void *void1, void *void2);
static int _sort_by_cpus(void *void1, void *void2);
static int _sort_by_sct(void *void1, void *void2);
static int _sort_by_sockets(void *void1, void *void2);
@@ -143,6 +144,8 @@ void sort_sinfo_list(List sinfo_list)
list_sort(sinfo_list, _sort_by_node_addr);
else if (params.sort[i] == 'O')
list_sort(sinfo_list, _sort_by_cpu_load);
+ else if (params.sort[i] == 'e')
+ list_sort(sinfo_list, _sort_by_free_mem);
else if (params.sort[i] == 'p')
list_sort(sinfo_list, _sort_by_priority);
else if (params.sort[i] == 'P')
@@ -227,6 +230,19 @@ static int _sort_by_cpu_load(void *void1, void *void2)
return diff;
}
+/* list_sort() comparator: order sinfo records by their minimum free memory,
+ * negating the result when reverse_order is set */
+static int _sort_by_free_mem(void *void1, void *void2)
+{
+	sinfo_data_t *lhs, *rhs;
+	int cmp;
+
+	_get_sinfo_from_void(&lhs, &rhs, void1, void2);
+	cmp = _diff_uint32(lhs->min_free_mem, rhs->min_free_mem);
+
+	return reverse_order ? -cmp : cmp;
+}
+
static int _sort_by_cpus(void *void1, void *void2)
{
int diff;
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 859c554..e02ccf9 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -935,6 +935,8 @@ static void *_thread_per_group_rpc(void *args)
lock_slurmctld(node_write_lock);
reset_node_load(ret_data_info->node_name,
ping_resp->cpu_load);
+ reset_node_free_mem(ret_data_info->node_name,
+ ping_resp->free_mem);
unlock_slurmctld(node_write_lock);
}
/* SPECIAL CASE: Mark node as IDLE if job already complete */
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 60268f6..de3fdfd 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -924,6 +924,7 @@ static void _pack_node (struct node_record *dump_node_ptr,
Buf buffer,
packstr(dump_node_ptr->cpu_spec_list, buffer);
pack32(dump_node_ptr->cpu_load, buffer);
+ pack32(dump_node_ptr->free_mem, buffer);
pack32(dump_node_ptr->config_ptr->weight, buffer);
pack32(dump_node_ptr->reason_uid, buffer);
@@ -997,6 +998,7 @@ static void _pack_node (struct node_record *dump_node_ptr,
Buf buffer,
packstr(dump_node_ptr->cpu_spec_list, buffer);
pack32(dump_node_ptr->cpu_load, buffer);
+ pack32(dump_node_ptr->free_mem, buffer);
pack32(dump_node_ptr->config_ptr->weight, buffer);
pack32(dump_node_ptr->reason_uid, buffer);
@@ -1065,6 +1067,7 @@ static void _pack_node (struct node_record
*dump_node_ptr, Buf buffer,
}
#endif
pack32(dump_node_ptr->cpu_load, buffer);
+ pack32(dump_node_ptr->free_mem, buffer);
pack32(dump_node_ptr->config_ptr->weight, buffer);
pack32(dump_node_ptr->reason_uid, buffer);
@@ -2224,6 +2227,11 @@ extern int
validate_node_specs(slurm_node_registration_status_msg_t *reg_msg,
node_ptr->cpu_load_time = now;
last_node_update = now;
}
+ if (node_ptr->free_mem != reg_msg->free_mem) {
+ node_ptr->free_mem = reg_msg->free_mem;
+ node_ptr->free_mem_time = now;
+ last_node_update = now;
+ }
if (IS_NODE_NO_RESPOND(node_ptr)) {
if (IS_NODE_POWER_UP(node_ptr))
@@ -3526,3 +3534,32 @@ extern void reset_node_load(char *node_name, uint32_t cpu_load)
error("is_node_resp unable to find node %s", node_name);
#endif
}
+
+/*
+ * reset_node_free_mem - record a newly reported free memory value for a node
+ * IN node_name - name of the node whose record is updated
+ * IN free_mem - free memory value to store in the node record
+ * NOTE: does nothing when built with HAVE_FRONT_END
+ */
+extern void reset_node_free_mem(char *node_name, uint32_t free_mem)
+{
+#ifdef HAVE_FRONT_END
+	return;
+#else
+	struct node_record *node_ptr;
+
+	node_ptr = find_node_record(node_name);
+	if (node_ptr) {
+		time_t now = time(NULL);
+		node_ptr->free_mem = free_mem;
+		node_ptr->free_mem_time = now;
+		/* stamp the global node-update time as well */
+		last_node_update = now;
+	} else {
+		/* report under this function's own name */
+		error("reset_node_free_mem unable to find node %s",
+		      node_name);
+	}
+#endif
+}
+
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index 60b5fc0..e437c7e 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -152,6 +152,7 @@ void ping_nodes (void)
#else
struct node_record *node_ptr = NULL;
time_t old_cpu_load_time = now - slurmctld_conf.slurmd_timeout;
+ time_t old_free_mem_time = now - slurmctld_conf.slurmd_timeout;
#endif
ping_agent_args = xmalloc (sizeof (agent_arg_t));
@@ -322,7 +323,8 @@ void ping_nodes (void)
if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
(node_ptr->last_response >= still_live_time) &&
- (node_ptr->cpu_load_time >= old_cpu_load_time))
+ (node_ptr->cpu_load_time >= old_cpu_load_time) &&
+ (node_ptr->free_mem_time >= old_free_mem_time))
continue;
/* Do not keep pinging down nodes since this can induce
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index b8b69ee..dfaddb4 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -1895,6 +1895,9 @@ extern void reset_job_bitmaps (void);
/* Reset a node's CPU load value */
extern void reset_node_load(char *node_name, uint32_t cpu_load);
+/* Reset a node's free memory value */
+extern void reset_node_free_mem(char *node_name, uint32_t free_mem);
+
/* Reset all scheduling statistics
* level IN - clear backfilled_jobs count if set */
extern void reset_stats(int level);
diff --git a/src/slurmd/slurmd/get_mach_stat.c
b/src/slurmd/slurmd/get_mach_stat.c
index d7c5eb1..798c673 100644
--- a/src/slurmd/slurmd/get_mach_stat.c
+++ b/src/slurmd/slurmd/get_mach_stat.c
@@ -352,6 +352,33 @@ extern int get_cpu_load(uint32_t *cpu_load)
return 0;
}
+/*
+ * get_free_mem - Return the amount of currently free memory on this system
+ * OUT free_mem - free RAM in MiB, or 0 if it could not be determined
+ * RET 0 on success, errno if sysinfo() fails
+ */
+extern int get_free_mem(uint32_t *free_mem)
+{
+#if defined(HAVE_AIX) || defined(__sun) || defined(__APPLE__) || \
+    defined(__NetBSD__) || defined(__FreeBSD__) || defined(__CYGWIN__)
+	/* Not sure how to get free memory on above systems.
+	 * Perhaps some method below works. */
+	*free_mem = 0;
+#else
+	struct sysinfo info;
+
+	if (sysinfo(&info) < 0) {
+		*free_mem = 0;
+		return errno;
+	}
+
+	/* freeram is counted in units of mem_unit bytes; widen to 64 bits
+	 * before multiplying so the byte count cannot wrap on 32-bit builds */
+	*free_mem = (((uint64_t )info.freeram)*info.mem_unit)/(1024*1024);
+#endif
+	return 0;
+}
+
#ifdef USE_CPU_SPEED
/* _chk_cpuinfo_str
* check a line of cpuinfo data (buffer) for a keyword. If it
diff --git a/src/slurmd/slurmd/get_mach_stat.h
b/src/slurmd/slurmd/get_mach_stat.h
index 83bf9a4..9515864 100644
--- a/src/slurmd/slurmd/get_mach_stat.h
+++ b/src/slurmd/slurmd/get_mach_stat.h
@@ -52,6 +52,7 @@
#endif /* HAVE_CONFIG_H */
extern int get_cpu_load(uint32_t *cpu_load);
+extern int get_free_mem(uint32_t *free_mem);
extern int get_mach_name(char *node_name);
extern int get_memory(uint32_t *real_memory);
extern int get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs);
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index b7ab12c..295fbdb 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2467,6 +2467,7 @@ _rpc_ping(slurm_msg_t *msg)
slurm_msg_t resp_msg;
ping_slurmd_resp_msg_t ping_resp;
get_cpu_load(&ping_resp.cpu_load);
+ get_free_mem(&ping_resp.free_mem);
slurm_msg_t_copy(&resp_msg, msg);
resp_msg.msg_type = RESPONSE_PING_SLURMD;
resp_msg.data = &ping_resp;
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index 9b6086b..229cddf 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -671,6 +671,7 @@ _fill_registration_msg(slurm_node_registration_status_msg_t
*msg)
msg->tmp_disk = conf->tmp_disk_space;
msg->hash_val = slurm_get_hash_val();
get_cpu_load(&msg->cpu_load);
+ get_free_mem(&msg->free_mem);
gres_info = init_buf(1024);
if (gres_plugin_node_config_pack(gres_info) != SLURM_SUCCESS)
diff --git a/src/sview/node_info.c b/src/sview/node_info.c
index e9872bc..b76a59e 100644
--- a/src/sview/node_info.c
+++ b/src/sview/node_info.c
@@ -49,6 +49,7 @@ enum {
SORTID_COLOR,
SORTID_CPUS,
SORTID_CPU_LOAD,
+ SORTID_FREE_MEM,
SORTID_CONSUMED_ENERGY,
SORTID_CORES,
SORTID_CURRENT_WATTS,
@@ -140,6 +141,8 @@ static display_data_t display_data_node[] = {
create_model_node, admin_edit_node},
{G_TYPE_STRING, SORTID_CPU_LOAD, "CPU Load", FALSE, EDIT_NONE,
refresh_node, create_model_node, admin_edit_node},
+ {G_TYPE_STRING, SORTID_FREE_MEM, "Free Memory", FALSE, EDIT_NONE,
+ refresh_node, create_model_node, admin_edit_node},
{G_TYPE_STRING, SORTID_ARCH, "Arch", FALSE,
EDIT_NONE, refresh_node, create_model_node, admin_edit_node},
{G_TYPE_STRING, SORTID_FEATURES, "Features", FALSE,
@@ -275,6 +278,17 @@ static void _layout_node_record(GtkTreeView *treeview,
SORTID_CPU_LOAD),
tmp_cnt);
+ if (node_ptr->free_mem == NO_VAL) {
+ snprintf(tmp_cnt, sizeof(tmp_cnt), "N/A");
+ } else {
+ snprintf(tmp_cnt, sizeof(tmp_cnt), "%uM",
+ node_ptr->free_mem);
+ }
+ add_display_treestore_line(update, treestore, &iter,
+ find_col_name(display_data_node,
+ SORTID_FREE_MEM),
+ tmp_cnt);
+
select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
SELECT_NODEDATA_SUBCNT,
NODE_STATE_ALLOCATED,
@@ -468,7 +482,7 @@ static void _update_node_record(sview_node_info_t
*sview_node_info_ptr,
node_info_t *node_ptr = sview_node_info_ptr->node_ptr;
char tmp_disk[20], tmp_cpus[20], tmp_err_cpus[20], tmp_idle_cpus[20];
char tmp_mem[20], tmp_used_memory[20];
- char tmp_used_cpus[20], tmp_cpu_load[20], tmp_owner[32];
+ char tmp_used_cpus[20], tmp_cpu_load[20], tmp_free_mem[20],
tmp_owner[32];
char tmp_current_watts[50], tmp_base_watts[50], tmp_consumed_energy[50];
char tmp_cap_watts[50], tmp_version[50];
char *tmp_state_lower, *tmp_state_upper;
@@ -504,6 +518,13 @@ static void _update_node_record(sview_node_info_t
*sview_node_info_ptr,
"%.2f", (node_ptr->cpu_load / 100.0));
}
+ if (node_ptr->free_mem == NO_VAL) {
+ strcpy(tmp_free_mem, "N/A");
+ } else {
+ snprintf(tmp_free_mem, sizeof(tmp_free_mem),
+ "%uM", node_ptr->free_mem);
+ }
+
convert_num_unit((float)node_ptr->cpus, tmp_cpus,
sizeof(tmp_cpus), UNIT_NONE);
@@ -594,6 +615,7 @@ static void _update_node_record(sview_node_info_t
*sview_node_info_ptr,
SORTID_CPUS, tmp_cpus,
SORTID_CURRENT_WATTS, tmp_current_watts,
SORTID_CPU_LOAD, tmp_cpu_load,
+ SORTID_FREE_MEM, tmp_free_mem,
SORTID_DISK, tmp_disk,
SORTID_ERR_CPUS, tmp_err_cpus,
SORTID_IDLE_CPUS, tmp_idle_cpus,
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
Forschungszentrum Juelich GmbH
52425 Juelich
Sitz der Gesellschaft: Juelich
Eingetragen im Handelsregister des Amtsgerichts Dueren Nr. HR B 3498
Vorsitzender des Aufsichtsrats: MinDir Dr. Karl Eugen Huthmacher
Geschaeftsfuehrung: Prof. Dr.-Ing. Wolfgang Marquardt (Vorsitzender),
Karsten Beneke (stellv. Vorsitzender), Prof. Dr.-Ing. Harald Bolt,
Prof. Dr. Sebastian M. Schmidt
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------