The attached patch (against 4.0.2) should fix it. I'll prepare a PR to
fix this upstream.
Brice
Le 27/11/2019 à 00:41, Brice Goglin via users a écrit :
> It looks like NUMA is broken, while other split types such as SOCKET and
> L3CACHE work fine. A quick look at opal_hwloc_base_get_relative_locality()
> and friends tells me that those functions were not properly updated for the
> hwloc 2.0 NUMA changes. I'll try to understand what's going on tomorrow.
>
> Rebuilding OMPI with an external hwloc 1.11.x might avoid the issue in
> the meantime.
>
> Beware that splitting on NUMA might become meaningless on some platforms
> in the future (there are already some x86 platforms where some NUMA
> nodes are attached to the Packages while others are attached to each
> half of the same Packages).
>
> Brice
>
>
> Le 26/11/2019 à 23:12, Hatem Elshazly via users a écrit :
>> Hello,
>>
>>
>> I'm trying to split the world communicator by NUMA using
>> MPI_Comm_split_type. I expected to get as many sub-communicators as
>> there are NUMA nodes, but what I get is as many sub-communicators as
>> there are MPI processes, each containing one process.
>>
>>
>> Attached is a reproducer. I tried it with version 4.0.2 built
>> with GNU 9.2.0 on Skylake and Haswell machines, and both behave
>> the same way.
>>
>>
>> Can anyone explain why it behaves like that? Is this expected,
>> or am I confusing something?
>>
>>
>> Thanks in advance,
>>
>> Hatem
>>
>> Junior Researcher -- Barcelona Supercomputing Center (BSC)
>>
>>
>>
>> http://bsc.es/disclaimer
diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c
index ba26ba0ac6..daf2fa2064 100644
--- a/opal/mca/hwloc/base/hwloc_base_util.c
+++ b/opal/mca/hwloc/base/hwloc_base_util.c
@@ -1215,16 +1215,84 @@ int opal_hwloc_base_cpu_list_parse(const char *slot_str,
return OPAL_SUCCESS;
}
+static void opal_hwloc_base_get_relative_locality_by_depth(hwloc_topology_t topo, unsigned d,
+ hwloc_cpuset_t loc1, hwloc_cpuset_t loc2,
+ opal_hwloc_locality_t *locality, bool *shared)
+{
+ unsigned width, w;
+ hwloc_obj_t obj;
+ int sect1, sect2;
+
+ /* get the width of the topology at this depth */
+ width = hwloc_get_nbobjs_by_depth(topo, d);
+
+ /* scan all objects at this depth to see if
+ * our locations overlap with them
+ */
+ for (w=0; w < width; w++) {
+ /* get the object at this depth/index */
+ obj = hwloc_get_obj_by_depth(topo, d, w);
+ /* see if our locations intersect with the cpuset for this obj */
+ sect1 = hwloc_bitmap_intersects(obj->cpuset, loc1);
+ sect2 = hwloc_bitmap_intersects(obj->cpuset, loc2);
+ /* if both intersect, then we share this level */
+ if (sect1 && sect2) {
+ *shared = true;
+ switch(obj->type) {
+ case HWLOC_OBJ_NODE:
+ *locality |= OPAL_PROC_ON_NUMA;
+ break;
+ case HWLOC_OBJ_SOCKET:
+ *locality |= OPAL_PROC_ON_SOCKET;
+ break;
+#if HWLOC_API_VERSION < 0x20000
+ case HWLOC_OBJ_CACHE:
+ if (3 == obj->attr->cache.depth) {
+ *locality |= OPAL_PROC_ON_L3CACHE;
+ } else if (2 == obj->attr->cache.depth) {
+ *locality |= OPAL_PROC_ON_L2CACHE;
+ } else {
+ *locality |= OPAL_PROC_ON_L1CACHE;
+ }
+ break;
+#else
+ case HWLOC_OBJ_L3CACHE:
+ *locality |= OPAL_PROC_ON_L3CACHE;
+ break;
+ case HWLOC_OBJ_L2CACHE:
+ *locality |= OPAL_PROC_ON_L2CACHE;
+ break;
+ case HWLOC_OBJ_L1CACHE:
+ *locality |= OPAL_PROC_ON_L1CACHE;
+ break;
+#endif
+ case HWLOC_OBJ_CORE:
+ *locality |= OPAL_PROC_ON_CORE;
+ break;
+ case HWLOC_OBJ_PU:
+ *locality |= OPAL_PROC_ON_HWTHREAD;
+ break;
+ default:
+ /* just ignore it */
+ break;
+ }
+ break;
+ }
+ /* otherwise, we don't share this
+ * object - but we still might share another object
+ * on this level, so we have to keep searching
+ */
+ }
+}
+
opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t topo,
char *cpuset1, char *cpuset2)
{
opal_hwloc_locality_t locality;
- hwloc_obj_t obj;
- unsigned depth, d, width, w;
+ hwloc_cpuset_t loc1, loc2;
+ unsigned depth, d;
bool shared;
hwloc_obj_type_t type;
- int sect1, sect2;
- hwloc_cpuset_t loc1, loc2;
/* start with what we know - they share a node on a cluster
* NOTE: we may alter that latter part as hwloc's ability to
@@ -1265,66 +1333,8 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
HWLOC_OBJ_PU != type) {
continue;
}
- /* get the width of the topology at this depth */
- width = hwloc_get_nbobjs_by_depth(topo, d);
+ opal_hwloc_base_get_relative_locality_by_depth(topo, d, loc1, loc2, &locality, &shared);
- /* scan all objects at this depth to see if
- * our locations overlap with them
- */
- for (w=0; w < width; w++) {
- /* get the object at this depth/index */
- obj = hwloc_get_obj_by_depth(topo, d, w);
- /* see if our locations intersect with the cpuset for this obj */
- sect1 = hwloc_bitmap_intersects(obj->cpuset, loc1);
- sect2 = hwloc_bitmap_intersects(obj->cpuset, loc2);
- /* if both intersect, then we share this level */
- if (sect1 && sect2) {
- shared = true;
- switch(obj->type) {
- case HWLOC_OBJ_NODE:
- locality |= OPAL_PROC_ON_NUMA;
- break;
- case HWLOC_OBJ_SOCKET:
- locality |= OPAL_PROC_ON_SOCKET;
- break;
-#if HWLOC_API_VERSION < 0x20000
- case HWLOC_OBJ_CACHE:
- if (3 == obj->attr->cache.depth) {
- locality |= OPAL_PROC_ON_L3CACHE;
- } else if (2 == obj->attr->cache.depth) {
- locality |= OPAL_PROC_ON_L2CACHE;
- } else {
- locality |= OPAL_PROC_ON_L1CACHE;
- }
- break;
-#else
- case HWLOC_OBJ_L3CACHE:
- locality |= OPAL_PROC_ON_L3CACHE;
- break;
- case HWLOC_OBJ_L2CACHE:
- locality |= OPAL_PROC_ON_L2CACHE;
- break;
- case HWLOC_OBJ_L1CACHE:
- locality |= OPAL_PROC_ON_L1CACHE;
- break;
-#endif
- case HWLOC_OBJ_CORE:
- locality |= OPAL_PROC_ON_CORE;
- break;
- case HWLOC_OBJ_PU:
- locality |= OPAL_PROC_ON_HWTHREAD;
- break;
- default:
- /* just ignore it */
- break;
- }
- break;
- }
- /* otherwise, we don't share this
- * object - but we still might share another object
- * on this level, so we have to keep searching
- */
- }
/* if we spanned the entire width without finding
* a point of intersection, then no need to go
* deeper
@@ -1333,6 +1343,9 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
break;
}
}
+#if HWLOC_API_VERSION >= 0x20000
+ opal_hwloc_base_get_relative_locality_by_depth(topo, HWLOC_TYPE_DEPTH_NUMANODE, loc1, loc2, &locality, &shared);
+#endif
opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
"locality: %s",
@@ -2063,12 +2076,40 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
return sig;
}
+static int opal_hwloc_base_get_locality_string_by_depth(hwloc_topology_t topo,
+ int d,
+ hwloc_cpuset_t cpuset,
+ hwloc_cpuset_t result)
+{
+ hwloc_obj_t obj;
+ unsigned width, w;
+
+ /* get the width of the topology at this depth */
+ width = hwloc_get_nbobjs_by_depth(topo, d);
+ if (0 == width) {
+ return -1;
+ }
+
+ /* scan all objects at this depth to see if
+ * the location overlaps with them
+ */
+ for (w=0; w < width; w++) {
+ /* get the object at this depth/index */
+ obj = hwloc_get_obj_by_depth(topo, d, w);
+ /* see if the location intersects with it */
+ if (hwloc_bitmap_intersects(obj->cpuset, cpuset)) {
+ hwloc_bitmap_set(result, w);
+ }
+ }
+
+ return 0;
+}
+
char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo,
char *bitmap)
{
- hwloc_obj_t obj;
char *locality=NULL, *tmp, *t2;
- unsigned depth, d, width, w;
+ unsigned depth, d;
hwloc_cpuset_t cpuset, result;
hwloc_obj_type_t type;
@@ -2111,28 +2152,15 @@ char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo,
continue;
}
- /* get the width of the topology at this depth */
- width = hwloc_get_nbobjs_by_depth(topo, d);
- if (0 == width) {
+ if (opal_hwloc_base_get_locality_string_by_depth(topo, d, cpuset, result) < 0) {
continue;
}
- /* scan all objects at this depth to see if
- * the location overlaps with them
- */
- for (w=0; w < width; w++) {
- /* get the object at this depth/index */
- obj = hwloc_get_obj_by_depth(topo, d, w);
- /* see if the location intersects with it */
- if (hwloc_bitmap_intersects(obj->cpuset, cpuset)) {
- hwloc_bitmap_set(result, w);
- }
- }
/* it should be impossible, but allow for the possibility
* that we came up empty at this depth */
if (!hwloc_bitmap_iszero(result)) {
hwloc_bitmap_list_asprintf(&tmp, result);
- switch(obj->type) {
+ switch(type) {
case HWLOC_OBJ_NODE:
asprintf(&t2, "%sNM%s:", (NULL == locality) ? "" : locality, tmp);
if (NULL != locality) {
@@ -2217,6 +2245,24 @@ char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo,
}
hwloc_bitmap_zero(result);
}
+
+#if HWLOC_API_VERSION >= 0x20000
+ if (opal_hwloc_base_get_locality_string_by_depth(topo, HWLOC_TYPE_DEPTH_NUMANODE, cpuset, result) == 0) {
+ /* it should be impossible, but allow for the possibility
+ * that we came up empty at this depth */
+ if (!hwloc_bitmap_iszero(result)) {
+ hwloc_bitmap_list_asprintf(&tmp, result);
+ asprintf(&t2, "%sNM%s:", (NULL == locality) ? "" : locality, tmp);
+ if (NULL != locality) {
+ free(locality);
+ }
+ locality = t2;
+ free(tmp);
+ }
+ hwloc_bitmap_zero(result);
+ }
+#endif
+
hwloc_bitmap_free(result);
hwloc_bitmap_free(cpuset);