The attached patch changes the type of several variables in the PMI code from uint16_t to uint32_t, to allow running more than 65536 tasks.
diff -X DONTDIFF -uNr slurm-2.2.3/src/api/pmi_server.c /home/hjcao/work/slurm-2.2.3/src/api/pmi_server.c
--- slurm-2.2.3/src/api/pmi_server.c 2010-11-30 07:21:10.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/api/pmi_server.c 2011-03-26 15:41:21.000000000 +0800
@@ -64,8 +64,8 @@
char *hostname;
}; /* details for barrier task communcations */
struct barrier_resp *barrier_ptr = NULL;
-uint16_t barrier_resp_cnt = 0; /* tasks having reached barrier */
-uint16_t barrier_cnt = 0; /* tasks needing to reach barrier */
+uint32_t barrier_resp_cnt = 0; /* tasks having reached barrier */
+uint32_t barrier_cnt = 0; /* tasks needing to reach barrier */
pthread_mutex_t agent_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
diff -X DONTDIFF -uNr slurm-2.2.3/src/api/slurm_pmi.h /home/hjcao/work/slurm-2.2.3/src/api/slurm_pmi.h
--- slurm-2.2.3/src/api/slurm_pmi.h 2010-11-30 07:21:10.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/api/slurm_pmi.h 2011-03-26 10:40:04.000000000 +0800
@@ -60,7 +60,7 @@
#define PMI_MAX_VAL_LEN 256 /* Maximum size of a PMI value */
struct kvs_hosts {
- uint16_t task_id; /* job step's task id */
+ uint32_t task_id; /* job step's task id */
uint16_t port; /* communication port */
char * hostname; /* communication host */
};
diff -X DONTDIFF -uNr slurm-2.2.3/src/common/slurm_protocol_defs.h /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_defs.h
--- slurm-2.2.3/src/common/slurm_protocol_defs.h 2010-11-10 00:23:40.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_defs.h 2011-03-26 10:37:51.000000000 +0800
@@ -827,8 +827,8 @@
} suspend_msg_t;
typedef struct kvs_get_msg {
- uint16_t task_id; /* job step's task id */
- uint16_t size; /* count of tasks in job */
+ uint32_t task_id; /* job step's task id */
+ uint32_t size; /* count of tasks in job */
uint16_t port; /* port to be sent the kvs data */
char * hostname; /* hostname to be sent the kvs data */
} kvs_get_msg_t;
diff -X DONTDIFF -uNr slurm-2.2.3/src/common/slurm_protocol_pack.c /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_pack.c
--- slurm-2.2.3/src/common/slurm_protocol_pack.c 2010-11-10 00:23:40.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_pack.c 2011-03-26 10:45:07.000000000 +0800
@@ -7671,7 +7671,7 @@
static void _pack_kvs_host_rec(struct kvs_hosts *msg_ptr, Buf buffer,
uint16_t protocol_version)
{
- pack16(msg_ptr->task_id, buffer);
+ pack32(msg_ptr->task_id, buffer);
pack16(msg_ptr->port, buffer);
packstr(msg_ptr->hostname, buffer);
}
@@ -7681,7 +7681,7 @@
{
uint32_t uint32_tmp;
- safe_unpack16(&msg_ptr->task_id, buffer);
+ safe_unpack32(&msg_ptr->task_id, buffer);
safe_unpack16(&msg_ptr->port, buffer);
safe_unpackstr_xmalloc(&msg_ptr->hostname, &uint32_tmp, buffer);
return SLURM_SUCCESS;
@@ -7818,8 +7818,8 @@
static void _pack_kvs_get(kvs_get_msg_t *msg_ptr, Buf buffer,
uint16_t protocol_version)
{
- pack16((uint16_t)msg_ptr->task_id, buffer);
- pack16((uint16_t)msg_ptr->size, buffer);
+ pack32((uint32_t)msg_ptr->task_id, buffer);
+ pack32((uint32_t)msg_ptr->size, buffer);
pack16((uint16_t)msg_ptr->port, buffer);
packstr(msg_ptr->hostname, buffer);
}
@@ -7832,8 +7832,8 @@
msg = xmalloc(sizeof(struct kvs_get_msg));
*msg_ptr = msg;
- safe_unpack16(&msg->task_id, buffer);
- safe_unpack16(&msg->size, buffer);
+ safe_unpack32(&msg->task_id, buffer);
+ safe_unpack32(&msg->size, buffer);
safe_unpack16(&msg->port, buffer);
safe_unpackstr_xmalloc(&msg->hostname, &uint32_tmp, buffer);
return SLURM_SUCCESS;
signature.asc
Description: 这是信件的数字签名部分
