The attached patch changes the type of some variables in the PMI code from
uint16_t to uint32_t to allow running more than 65536 tasks.
diff -X DONTDIFF -uNr slurm-2.2.3/src/api/pmi_server.c /home/hjcao/work/slurm-2.2.3/src/api/pmi_server.c
--- slurm-2.2.3/src/api/pmi_server.c	2010-11-30 07:21:10.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/api/pmi_server.c	2011-03-26 15:41:21.000000000 +0800
@@ -64,8 +64,8 @@
 	char *hostname;
 };				/* details for barrier task communcations */
 struct barrier_resp *barrier_ptr = NULL;
-uint16_t barrier_resp_cnt = 0;	/* tasks having reached barrier */
-uint16_t barrier_cnt = 0;	/* tasks needing to reach barrier */
+uint32_t barrier_resp_cnt = 0;	/* tasks having reached barrier */
+uint32_t barrier_cnt = 0;	/* tasks needing to reach barrier */
 
 pthread_mutex_t agent_mutex = PTHREAD_MUTEX_INITIALIZER;
 pthread_cond_t  agent_cond  = PTHREAD_COND_INITIALIZER;
diff -X DONTDIFF -uNr slurm-2.2.3/src/api/slurm_pmi.h /home/hjcao/work/slurm-2.2.3/src/api/slurm_pmi.h
--- slurm-2.2.3/src/api/slurm_pmi.h	2010-11-30 07:21:10.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/api/slurm_pmi.h	2011-03-26 10:40:04.000000000 +0800
@@ -60,7 +60,7 @@
 #define PMI_MAX_VAL_LEN     256	/* Maximum size of a PMI value */
 
 struct kvs_hosts {
-	uint16_t	task_id;	/* job step's task id */
+	uint32_t	task_id;	/* job step's task id */
 	uint16_t	port;		/* communication port */
 	char *		hostname;	/* communication host */
 };
diff -X DONTDIFF -uNr slurm-2.2.3/src/common/slurm_protocol_defs.h /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_defs.h
--- slurm-2.2.3/src/common/slurm_protocol_defs.h	2010-11-10 00:23:40.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_defs.h	2011-03-26 10:37:51.000000000 +0800
@@ -827,8 +827,8 @@
 } suspend_msg_t;
 
 typedef struct kvs_get_msg {
-	uint16_t task_id;	/* job step's task id */
-	uint16_t size;		/* count of tasks in job */
+	uint32_t task_id;	/* job step's task id */
+	uint32_t size;		/* count of tasks in job */
 	uint16_t port;		/* port to be sent the kvs data */
 	char * hostname;	/* hostname to be sent the kvs data */
 } kvs_get_msg_t;
diff -X DONTDIFF -uNr slurm-2.2.3/src/common/slurm_protocol_pack.c /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_pack.c
--- slurm-2.2.3/src/common/slurm_protocol_pack.c	2010-11-10 00:23:40.000000000 +0800
+++ /home/hjcao/work/slurm-2.2.3/src/common/slurm_protocol_pack.c	2011-03-26 10:45:07.000000000 +0800
@@ -7671,7 +7671,7 @@
 static void _pack_kvs_host_rec(struct kvs_hosts *msg_ptr, Buf buffer,
 			       uint16_t protocol_version)
 {
-	pack16(msg_ptr->task_id, buffer);
+	pack32(msg_ptr->task_id, buffer);
 	pack16(msg_ptr->port, buffer);
 	packstr(msg_ptr->hostname, buffer);
 }
@@ -7681,7 +7681,7 @@
 {
 	uint32_t uint32_tmp;
 
-	safe_unpack16(&msg_ptr->task_id, buffer);
+	safe_unpack32(&msg_ptr->task_id, buffer);
 	safe_unpack16(&msg_ptr->port, buffer);
 	safe_unpackstr_xmalloc(&msg_ptr->hostname, &uint32_tmp, buffer);
 	return SLURM_SUCCESS;
@@ -7818,8 +7818,8 @@
 static void _pack_kvs_get(kvs_get_msg_t *msg_ptr, Buf buffer,
 			  uint16_t protocol_version)
 {
-	pack16((uint16_t)msg_ptr->task_id, buffer);
-	pack16((uint16_t)msg_ptr->size, buffer);
+	pack32((uint32_t)msg_ptr->task_id, buffer);
+	pack32((uint32_t)msg_ptr->size, buffer);
 	pack16((uint16_t)msg_ptr->port, buffer);
 	packstr(msg_ptr->hostname, buffer);
 }
@@ -7832,8 +7832,8 @@
 
 	msg = xmalloc(sizeof(struct kvs_get_msg));
 	*msg_ptr = msg;
-	safe_unpack16(&msg->task_id, buffer);
-	safe_unpack16(&msg->size, buffer);
+	safe_unpack32(&msg->task_id, buffer);
+	safe_unpack32(&msg->size, buffer);
 	safe_unpack16(&msg->port, buffer);
 	safe_unpackstr_xmalloc(&msg->hostname, &uint32_tmp, buffer);
 	return SLURM_SUCCESS;

Attachment: signature.asc
Description: 这是信件的数字签名部分



Reply via email to