Index: src/plugins/task/cgroup/task_cgroup_cpuset.c
===================================================================
RCS file: /cvsroot/slurm/slurm/src/plugins/task/cgroup/Attic/task_cgroup_cpuset.c,v
retrieving revision 1.1.1.4
diff -u -r1.1.1.4 task_cgroup_cpuset.c
--- src/plugins/task/cgroup/task_cgroup_cpuset.c	19 Oct 2011 18:16:10 -0000	1.1.1.4
+++ src/plugins/task/cgroup/task_cgroup_cpuset.c	18 Jan 2012 20:50:02 -0000
@@ -3,6 +3,8 @@
  *****************************************************************************
  *  Copyright (C) 2009 CEA/DAM/DIF
  *  Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
+ *  Portions copyright (C) 2012 Bull
+ *  Written by Martin Perry <martin.perry@bull.com>
  *
  *  This file is part of SLURM, a resource management program.
  *  For details, see <http://www.schedmd.com/slurmdocs/>.
@@ -75,6 +77,24 @@
 
 static int _xcgroup_cpuset_init(xcgroup_t* cg);
 
+static int _task_cgroup_cpuset_dist_cyclic(
+		hwloc_topology_t topology, hwloc_obj_type_t hwtype,
+		hwloc_obj_type_t req_hwtype, uint32_t nobj, slurmd_job_t *job,
+#if HWLOC_API_VERSION <= 0x00010000
+		hwloc_cpuset_t cpuset);
+#else
+		hwloc_bitmap_t cpuset);
+#endif
+
+static int _task_cgroup_cpuset_dist_block(
+		hwloc_topology_t topology, hwloc_obj_type_t hwtype,
+		hwloc_obj_type_t req_hwtype, uint32_t nobj, slurmd_job_t *job,
+#if HWLOC_API_VERSION <= 0x00010000
+		hwloc_cpuset_t cpuset);
+#else
+		hwloc_bitmap_t cpuset);
+#endif
+
 extern int task_cgroup_cpuset_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
 {
 	char release_agent_path[PATH_MAX];
@@ -279,9 +299,9 @@
 		goto error;
 	}
 	debug("task/cgroup: job physical cores are '%s'",
-	      job->job_alloc_cores);
+	      job_alloc_cores);
 	debug("task/cgroup: step physical cores are '%s'",
-	      job->step_alloc_cores);
+	      step_alloc_cores);
 
 	/*
 	 * create user cgroup in the cpuset ns (it could already exist)
@@ -430,15 +450,14 @@
 
 	hwloc_topology_t topology;
 #if HWLOC_API_VERSION <= 0x00010000
-	hwloc_cpuset_t cpuset,ct;
+	hwloc_cpuset_t cpuset;
 #else
-	hwloc_bitmap_t cpuset,ct;
+	hwloc_bitmap_t cpuset;
 #endif
 	hwloc_obj_t obj;
 	struct hwloc_obj *pobj;
 	hwloc_obj_type_t hwtype;
 	hwloc_obj_type_t req_hwtype;
-	int hwdepth;
 
 	size_t tssize;
 	cpu_set_t ts;
@@ -513,6 +532,7 @@
 						       HWLOC_OBJ_SOCKET);
 	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
 						     HWLOC_OBJ_NODE);
+
 	hwtype = HWLOC_OBJ_MACHINE;
 	nobj = 1;
 	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
@@ -543,7 +563,7 @@
 	}
 
 	/*
-	 * Perform a block binding on the detected object respecting the
+	 * Bind the detected object to the taskid, respecting the
 	 * granularity.
 	 * If not enough objects to do the job, revert to no affinity mode
 	 */
@@ -564,81 +584,17 @@
 			info("task/cgroup: task[%u] using %s granularity",
 			     taskid,hwloc_obj_type_string(hwtype));
 		}
-		if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
-			/* cores or threads granularity */
-			pfirst = taskid *  job->cpus_per_task ;
-			plast = pfirst + job->cpus_per_task - 1;
-		} else {
-			/* sockets or ldoms granularity */
-			pfirst = taskid;
-			plast = pfirst;
-		}
-
-		hwdepth = hwloc_get_type_depth(topology,hwtype);
-		for (i = pfirst; i <= plast && i < nobj ; i++) {
-			obj = hwloc_get_obj_by_depth(topology,hwdepth,(int)i);
-
-			/* if requested binding overlap the granularity */
-			/* use the ancestor cpuset instead of the object one */
-			if (hwloc_compare_types(hwtype,req_hwtype) > 0) {
-
-				/* Get the parent object of req_hwtype or the */
-				/* one just above if not found (meaning of >0)*/
-				/* (useful for ldoms binding with !NUMA nodes)*/
-				pobj = obj->parent;
-				while (pobj != NULL &&
-					hwloc_compare_types(pobj->type,
-							    req_hwtype) > 0)
-					pobj = pobj->parent;
-
-				if (pobj != NULL) {
-					if (verbose)
-						info("task/cgroup: task[%u] "
-						     "higher level %s found",
-						     taskid,
-						     hwloc_obj_type_string(
-							     pobj->type));
-#if HWLOC_API_VERSION <= 0x00010000
-					ct = hwloc_cpuset_dup(pobj->
-							      allowed_cpuset);
-					hwloc_cpuset_or(cpuset,cpuset,ct);
-					hwloc_cpuset_free(ct);
-#else
-					ct = hwloc_bitmap_dup(pobj->
-							      allowed_cpuset);
-					hwloc_bitmap_or(cpuset,cpuset,ct);
-					hwloc_bitmap_free(ct);
-#endif
-				} else {
-					/* should not be executed */
-					if (verbose)
-						info("task/cgroup: task[%u] "
-						     "no higher level found",
-						     taskid);
-#if HWLOC_API_VERSION <= 0x00010000
-					ct = hwloc_cpuset_dup(obj->
-							      allowed_cpuset);
-					hwloc_cpuset_or(cpuset,cpuset,ct);
-					hwloc_cpuset_free(ct);
-#else
-					ct = hwloc_bitmap_dup(obj->
-							      allowed_cpuset);
-					hwloc_bitmap_or(cpuset,cpuset,ct);
-					hwloc_bitmap_free(ct);
-#endif
-				}
-
-			} else {
-#if HWLOC_API_VERSION <= 0x00010000
-				ct = hwloc_cpuset_dup(obj->allowed_cpuset);
-				hwloc_cpuset_or(cpuset,cpuset,ct);
-				hwloc_cpuset_free(ct);
-#else
-				ct = hwloc_bitmap_dup(obj->allowed_cpuset);
-				hwloc_bitmap_or(cpuset,cpuset,ct);
-				hwloc_bitmap_free(ct);
-#endif
-			}
+		switch(job->task_dist) {
+		case SLURM_DIST_CYCLIC:
+		case SLURM_DIST_BLOCK:
+		case SLURM_DIST_CYCLIC_CYCLIC:
+		case SLURM_DIST_BLOCK_CYCLIC:
+			_task_cgroup_cpuset_dist_cyclic(topology, hwtype, req_hwtype,
+					nobj, job, cpuset);
+			break;
+		default:
+			_task_cgroup_cpuset_dist_block(topology, hwtype, req_hwtype,
+					nobj, job, cpuset);
 		}
 
 		char *str;
@@ -665,7 +621,6 @@
 			fstatus = SLURM_ERROR;
 		}
 		free(str);
-
 	}
 
 	/* Destroy hwloc objects */
@@ -747,3 +702,246 @@
 	xcgroup_destroy(&acg);
 	return XCGROUP_SUCCESS;
 }
+
+/*
+ * _task_cgroup_cpuset_add_obj - add the cpus of one hwloc object to a cpuset
+ *
+ * IN obj        - object of type hwtype selected for the task
+ * IN hwtype     - granularity at which objects are distributed
+ * IN req_hwtype - object type of the requested binding
+ * IN taskid     - local task id, only used in verbose messages
+ * IN/OUT cpuset - cpuset the allowed cpus are ORed into
+ *
+ * If the requested binding overlaps the granularity, the allowed cpuset of
+ * the matching ancestor is used instead of the one of obj itself.
+ */
+static void _task_cgroup_cpuset_add_obj(hwloc_obj_t obj,
+		hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
+		uint32_t taskid,
+#if HWLOC_API_VERSION <= 0x00010000
+		hwloc_cpuset_t cpuset)
+#else
+		hwloc_bitmap_t cpuset)
+#endif
+{
+#if HWLOC_API_VERSION <= 0x00010000
+	hwloc_cpuset_t ct;
+#else
+	hwloc_bitmap_t ct;
+#endif
+	hwloc_obj_t src = obj;
+
+	if (hwloc_compare_types(hwtype, req_hwtype) > 0) {
+		/* Get the parent object of req_hwtype or the */
+		/* one just above if not found (meaning of >0)*/
+		/* (useful for ldoms binding with !NUMA nodes)*/
+		hwloc_obj_t pobj = obj->parent;
+
+		while (pobj != NULL &&
+		       hwloc_compare_types(pobj->type, req_hwtype) > 0)
+			pobj = pobj->parent;
+
+		if (pobj != NULL) {
+			if (verbose)
+				info("task/cgroup: task[%u] "
+				     "higher level %s found",
+				     taskid,
+				     hwloc_obj_type_string(pobj->type));
+			src = pobj;
+		} else if (verbose) {
+			/* should not be executed */
+			info("task/cgroup: task[%u] "
+			     "no higher level found", taskid);
+		}
+	}
+
+#if HWLOC_API_VERSION <= 0x00010000
+	ct = hwloc_cpuset_dup(src->allowed_cpuset);
+	hwloc_cpuset_or(cpuset, cpuset, ct);
+	hwloc_cpuset_free(ct);
+#else
+	ct = hwloc_bitmap_dup(src->allowed_cpuset);
+	hwloc_bitmap_or(cpuset, cpuset, ct);
+	hwloc_bitmap_free(ct);
+#endif
+}
+
+/*
+ * _task_cgroup_cpuset_dist_cyclic - pick the objects of a task by cycling
+ *	over the sockets
+ *
+ * IN topology   - hwloc topology the objects are looked up in
+ * IN hwtype     - granularity at which objects are distributed
+ * IN req_hwtype - object type of the requested binding
+ * IN nobj       - not used by this distribution
+ * IN job        - job of the task (local task id, cpus_per_task, task_dist)
+ * IN/OUT cpuset - cpuset the selected cpus are ORed into
+ * RET XCGROUP_SUCCESS, or XCGROUP_ERROR if the sockets ran out of hwtype
+ *	objects before the task got its share
+ */
+static int _task_cgroup_cpuset_dist_cyclic(hwloc_topology_t topology,
+		hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
+		uint32_t nobj, slurmd_job_t *job,
+#if HWLOC_API_VERSION <= 0x00010000
+		hwloc_cpuset_t cpuset)
+#else
+		hwloc_bitmap_t cpuset)
+#endif
+{
+	hwloc_obj_t obj;
+	uint32_t *obj_idx;
+	uint32_t i, sock_idx, pass_start, nfound;
+	uint32_t npskip, npdist, nsockets;
+	uint32_t taskid = job->envtp->localid;
+	int nsock;
+	int rc = XCGROUP_SUCCESS;
+
+	if (verbose)
+		info("task/cgroup: task[%u] using cyclic distribution, task_dist %u",
+				taskid, job->task_dist);
+
+	/* without any socket object the loops below could never end */
+	nsock = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET);
+	if (nsock <= 0) {
+		error("task/cgroup: task[%u] no socket to distribute over",
+		      taskid);
+		return XCGROUP_ERROR;
+	}
+	nsockets = (uint32_t) nsock;
+
+	/*
+	 * Per socket index of the next hwtype object to look up. The
+	 * entries are never set explicitly, this relies on xmalloc()
+	 * handing out zeroed memory.
+	 */
+	obj_idx = xmalloc(nsockets * sizeof(uint32_t));
+
+	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
+		/* cores or threads granularity */
+		npskip = taskid * job->cpus_per_task;
+		npdist = job->cpus_per_task;
+	} else {
+		/* sockets or ldoms granularity */
+		npskip = taskid;
+		npdist = 1;
+	}
+
+	/* skip objs for lower taskids */
+	i = 0;
+	sock_idx = 0;
+	while (i < npskip) {
+		nfound = 0;
+		while ((sock_idx < nsockets) && (i < npskip)) {
+			obj = hwloc_get_obj_below_by_type(topology,
+					HWLOC_OBJ_SOCKET, sock_idx,
+					hwtype, obj_idx[sock_idx]);
+			if (obj != NULL) {
+				obj_idx[sock_idx]++;
+				i++;
+				nfound++;
+			}
+			sock_idx++;
+		}
+		if (i < npskip) {
+			/* a whole round found nothing: all objs are used */
+			if (nfound == 0) {
+				rc = XCGROUP_ERROR;
+				goto done;
+			}
+			sock_idx = 0;
+		}
+	}
+
+	/* distribute objs cyclically across sockets */
+	i = npdist;
+	while (i > 0) {
+		/* the first round resumes where the skipping stopped */
+		pass_start = sock_idx;
+		nfound = 0;
+		while ((sock_idx < nsockets) && (i > 0)) {
+			obj = hwloc_get_obj_below_by_type(topology,
+					HWLOC_OBJ_SOCKET, sock_idx,
+					hwtype, obj_idx[sock_idx]);
+			if (obj != NULL) {
+				obj_idx[sock_idx]++;
+				_task_cgroup_cpuset_add_obj(obj, hwtype,
+						req_hwtype, taskid, cpuset);
+				i--;
+				nfound++;
+			}
+			sock_idx++;
+		}
+		/* only a round started at socket 0 proves exhaustion */
+		if ((i > 0) && (nfound == 0) && (pass_start == 0)) {
+			rc = XCGROUP_ERROR;
+			break;
+		}
+		sock_idx = 0;
+	}
+
+done:
+	if (rc != XCGROUP_SUCCESS)
+		error("task/cgroup: task[%u] not enough %s objects for a "
+		      "cyclic distribution", taskid,
+		      hwloc_obj_type_string(hwtype));
+	xfree(obj_idx);
+	return rc;
+}
+
+/*
+ * _task_cgroup_cpuset_dist_block - pick the objects of a task as a block of
+ *	consecutive objects
+ *
+ * IN topology   - hwloc topology the objects are looked up in
+ * IN hwtype     - granularity at which objects are distributed
+ * IN req_hwtype - object type of the requested binding
+ * IN nobj       - number of hwtype objects, upper bound of the block
+ * IN job        - job of the task (local task id, cpus_per_task, task_dist)
+ * IN/OUT cpuset - cpuset the selected cpus are ORed into
+ * RET XCGROUP_SUCCESS, or XCGROUP_ERROR if an object of the block could not
+ *	be found in the topology
+ */
+static int _task_cgroup_cpuset_dist_block(hwloc_topology_t topology,
+		hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
+		uint32_t nobj, slurmd_job_t *job,
+#if HWLOC_API_VERSION <= 0x00010000
+		hwloc_cpuset_t cpuset)
+#else
+		hwloc_bitmap_t cpuset)
+#endif
+{
+	hwloc_obj_t obj;
+	uint32_t i, pfirst, plast;
+	uint32_t taskid = job->envtp->localid;
+	int hwdepth;
+	int rc = XCGROUP_SUCCESS;
+
+	if (verbose)
+		info("task/cgroup: task[%u] using block distribution, task_dist %u",
+				taskid, job->task_dist);
+	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
+		/* cores or threads granularity */
+		pfirst = taskid * job->cpus_per_task;
+		plast = pfirst + job->cpus_per_task - 1;
+	} else {
+		/* sockets or ldoms granularity */
+		pfirst = taskid;
+		plast = pfirst;
+	}
+
+	hwdepth = hwloc_get_type_depth(topology, hwtype);
+	for (i = pfirst; i <= plast && i < nobj; i++) {
+		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
+		if (obj == NULL) {
+			/* do not dereference a missing object */
+			error("task/cgroup: task[%u] no %s object at "
+			      "index %u", taskid,
+			      hwloc_obj_type_string(hwtype), i);
+			rc = XCGROUP_ERROR;
+			continue;
+		}
+		_task_cgroup_cpuset_add_obj(obj, hwtype, req_hwtype,
+				taskid, cpuset);
+	}
+	return rc;
+}
