Hi!
There are 2 problems with the current HWLOC code. The questions are at
the bottom.
1. Old kernels (RHEL5.*) do expose some numa nodes via sysfs but there
is no information regarting cache (L1/L2/L3) and CPU threads. RHEL6 does
that. The proposed patch parses PowerPC's /proc/device-tree and add
necessary information into the topology.
2. The HWLOC expects numa nodes to be numbered consecutively, like
1-2-3-4-5.... However this is not necessary true for PowerPC with LPARs
or on systems with numa hotswap (do they exist? don't know). This was
before the patch:
=========================
os node 0 has cpuset 0xffffffff
os node 1 has cpuset 0xffffffff,0x0
os node 4 has cpuset 0xffffffff,,0x0
os node 5 has cpuset 0xffffffff,,,0x0
os node 8 has cpuset 0xffffffff,,,,0x0
os node 9 has cpuset 0xffffffff,,,,,0x0
os node 12 has cpuset 0xffffffff,,,,,,0x0
os node 13 has cpuset 0xffffffff,,,,,,,0x0
node distance matrix:
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 10 20 40 40 40 40 40 40 0 1 128 3596701896 0 1
1 20 10 40 40 40 40 40 40 4095 3642405872 4095 3642406288 0
65536
2 128 3596490848 4095 3642406160 4095 3642406048 0 0 128
3597792932 0 0 0 0
3 128 3598856792 0 0 0 0 0 1 0 218840 0 1 0 0
4 40 40 10 20 40 40 40 40 128 3596902928 128 3596700232 4095
3642406320
5 40 40 20 10 40 40 40 40 0 5 4095 3642406432 0 0
6 4095 3642406256 0 0 128 3596923832 256 276108416 4095
3642406272 0 0 0 0
7 0 0 0 0 256 276173984 128 3598846040 0 191376 128
3598846016 256 276108400
8 40 40 40 40 10 20 40 40 4095 2587230208 4095 2587260160 0 0
9 40 40 40 40 20 10 40 40 4095 3642406320 4095 3642406464 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 128 3596680064 128 3596687552 128
3596679872
12 40 40 40 40 40 40 10 20 128 3597793376 128 3598856792 128
3597315568
13 40 40 40 40 40 40 20 10 0 0 128 3598779600 429496729
2576980377
distance matrix asymmetric ([0,2]=40 != [2,0]=128), aborting
=========================
This is how it is supposed to look like:
=========================
node distance matrix:
0 1 4 5 8 9 12 13
0 10 20 40 40 40 40 40 40
1 20 10 40 40 40 40 40 40
4 40 40 10 20 40 40 40 40
5 40 40 20 10 40 40 40 40
8 40 40 40 40 10 20 40 40
9 40 40 40 40 20 10 40 40
12 40 40 40 40 40 40 10 20
13 40 40 40 40 40 40 20 10
trying to group NUMANode objects into misc objects according to physical
distances
found minimal distance 20 between objects
object 1 is minimally connected to 0
found transitive graph with 2 objects with minimal distance 20
object 3 is minimally connected to 2
found transitive graph with 2 objects with minimal distance 20
object 5 is minimally connected to 4
found transitive graph with 2 objects with minimal distance 20
object 7 is minimally connected to 6
found transitive graph with 2 objects with minimal distance 20
adding misc object with 2 objects and cpuset 0xffffffff,0xffffffff
adding misc object with 2 objects and cpuset 0xffffffff,0xffffffff,,0x0
adding misc object with 2 objects and cpuset 0xffffffff,0xffffffff,,,,0x0
adding misc object with 2 objects and cpuset 0xffffffff,0xffffffff,,,,,,0x0
group distances:
15 40 40 40
40 15 40 40
40 40 15 40
40 40 40 15
trying to group Group objects into misc objects according to physical
distances
found minimal distance 40 between objects
object 1 is minimally connected to 0
object 2 is minimally connected to 0
object 3 is minimally connected to 0
found transitive graph with 4 objects with minimal distance 40
=========================
The questions are:
1. What should I change in my patch to have it committed into the svn?
Specifically:
- where do I put IBM-specific code?
- may be there is a better way to detect that no cache info was fetched
from sysfs
- is the coding style ok? :-)
2. Do not I miss something in my patch in order to solve the problems
mentioned in the beginning of this mail?
Thank you.
Index: include/private/private.h
===================================================================
--- include/private/private.h (revision 2427)
+++ include/private/private.h (working copy)
@@ -112,7 +112,7 @@
extern void hwloc_setup_pu_level(struct hwloc_topology *topology, unsigned nb_pus);
-extern void hwloc_setup_misc_level_from_distances(struct hwloc_topology *topology, unsigned nbobjs, struct hwloc_obj **objs, unsigned *_distances/*[nbnobjs][nbobjs]*/);
+extern void hwloc_setup_misc_level_from_distances(struct hwloc_topology *topology, unsigned nbobjs, struct hwloc_obj **objs, unsigned *_distances/*[nbnobjs][nbobjs]*/, unsigned *distance_indexes);
extern int hwloc_get_sysctlbyname(const char *name, int64_t *n);
extern int hwloc_get_sysctl(int name[], unsigned namelen, int *n);
extern unsigned hwloc_fallback_nbprocessors(struct hwloc_topology *topology);
Index: src/topology-linux.c
===================================================================
--- src/topology-linux.c (revision 2427)
+++ src/topology-linux.c (working copy)
@@ -98,6 +98,8 @@
#ifdef HAVE_OPENAT
/* Use our own filesystem functions if we have openat */
+#define HWLOC_NBMAXCPUS 1024 /* FIXME: drop */
+
static FILE *
hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
{
@@ -1302,7 +1304,7 @@
look_sysfsnode(struct hwloc_topology *topology, const char *path, unsigned *found)
{
unsigned osnode;
- unsigned nbnodes = 1;
+ unsigned nbnodes = 0;
DIR *dir;
struct dirent *dirent;
hwloc_obj_t node;
@@ -1314,12 +1316,9 @@
{
while ((dirent = readdir(dir)) != NULL)
{
- unsigned long numnode;
if (strncmp(dirent->d_name, "node", 4))
continue;
- numnode = strtoul(dirent->d_name+4, NULL, 0);
- if (nbnodes < numnode+1)
- nbnodes = numnode+1;
+ ++nbnodes;
}
closedir(dir);
}
@@ -1329,10 +1328,30 @@
/* For convenience, put these declarations inside a block. Saves us
from a bunch of mallocs, particularly with the 2D array. */
+ dir = hwloc_opendir(path, topology->backend_params.sysfs.root_fd);
+ if (NULL == dir)
+ return;
+
{
hwloc_obj_t nodes[nbnodes];
unsigned distances[nbnodes][nbnodes];
- for (osnode=0; osnode < nbnodes; osnode++) {
+ unsigned distance_indexes[nbnodes];
+ for (unsigned nnode = 0; ; ++nnode)
+ {
+ while (1)
+ {
+ dirent = readdir(dir);
+ if (NULL == dirent)
+ break;
+ if (0 == strncmp(dirent->d_name, "node", 4))
+ break;
+ }
+ if (NULL == dirent)
+ break;
+
+ unsigned int osnode = strtoul(dirent->d_name+4, NULL, 0);
+ distance_indexes[nnode] = osnode;
+
char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
hwloc_cpuset_t cpuset;
@@ -1351,18 +1370,256 @@
hwloc_debug_1arg_cpuset("os node %u has cpuset %s\n",
osnode, node->cpuset);
hwloc_insert_object_by_cpuset(topology, node);
- nodes[osnode] = node;
+ nodes[nnode] = node;
sprintf(nodepath, "%s/node%u/distance", path, osnode);
- hwloc_parse_node_distance(nodepath, nbnodes, distances[osnode], topology->backend_params.sysfs.root_fd);
+ hwloc_parse_node_distance(nodepath, nbnodes, distances[nnode], topology->backend_params.sysfs.root_fd);
+ }
+ hwloc_setup_misc_level_from_distances(topology, nbnodes, nodes, (unsigned*) distances, distance_indexes);
+ }
+ *found = nbnodes;
+}
+
+/* IBM PowerPC device-tree helpers */
+static inline int
+of_get_str(const char *p, const char *p1, char *buf, size_t nbuf, int root_fd)
+{
+ char fname[256];
+ int ret = -1;
+ sprintf(fname, "%s/%s", p, p1);
+ FILE *file = hwloc_fopen(fname, "r", root_fd);
+ if (NULL == file)
+ return ret;
+ if (NULL != fgets(buf, nbuf, file))
+ ret = 0;
+ fclose(file);
+ buf[nbuf-1] = 0;
+ return ret;
+}
+
+static inline size_t
+of_get_int_arr(const char *p, const char *p1, unsigned int *buf, size_t nbuf,
+ int root_fd)
+{
+ char fname[256];
+ size_t cb = 0;
+ sprintf(fname, "%s/%s", p, p1);
+ FILE *file = hwloc_fopen(fname, "r", root_fd);
+ if (NULL == file)
+ return cb;
+ cb = fread(buf, 1, nbuf, file);
+ fclose(file);
+ return cb;
+}
+
+/*
+The look_powerpc_device_tree_discover_cache function is supposed to fetch
+the L1/L2/... cache and CPU threads information which is absent in sysfs when
+running RHEL5.* kernels.
+*/
+typedef struct {
+ unsigned int n;
+ struct {
+ hwloc_cpuset_t cpuset;
+ unsigned int ibm_phandle;
+ unsigned int l2_cache;
+#ifdef HWLOC_DEBUG
+ char name[64];
+#endif
+ } p[HWLOC_NBMAXCPUS];
+} ppcnodes_t;
+
+static int
+look_powerpc_device_tree_discover_cache(ppcnodes_t *pnodes,
+ unsigned int ibm_phandle, unsigned int *level, hwloc_cpuset_t cpuset)
+{
+ int ret = -1;
+ if ((NULL == level) || (NULL == cpuset))
+ return ret;
+ for (unsigned int i = 0; i < pnodes->n; ++i) {
+ if (ibm_phandle != pnodes->p[i].l2_cache)
+ continue;
+ if (NULL != pnodes->p[i].cpuset) {
+ hwloc_cpuset_or(cpuset, cpuset, pnodes->p[i].cpuset);
+ ret = 0;
+ } else {
+ ++(*level);
+ if (0 == look_powerpc_device_tree_discover_cache(pnodes,
+ pnodes->p[i].ibm_phandle, level, cpuset))
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+static void
+look_powerpc_device_tree(struct hwloc_topology *topology)
+{
+ ppcnodes_t nodes;
+ nodes.n = 0;
+ const char ofroot[] = "/proc/device-tree/cpus";
+
+ int root_fd = topology->backend_params.sysfs.root_fd;
+ DIR *dt = hwloc_opendir(ofroot, root_fd);
+ if (NULL == dt)
+ return;
+
+ struct dirent *dirent;
+ while (NULL != (dirent = readdir(dt))) {
+
+ char cpu[256];
+ sprintf(cpu, "%s/%s", ofroot, dirent->d_name);
+
+ char device_type[64];
+ if (0 > of_get_str(cpu, "device_type", device_type, sizeof(device_type),
+ root_fd))
+ continue;
+
+ // cpumask_t cpumask = {0};
+ hwloc_cpuset_t cpuset = NULL;
+ if (0 == strcmp(device_type, "cpu")) {
+ unsigned int threads[128], nthreads = 0;
+ nthreads = of_get_int_arr(cpu, "ibm,ppc-interrupt-server#s",
+ threads, sizeof(threads), root_fd) / sizeof(threads[0]);
+ if (0 != nthreads) {
+ cpuset = hwloc_cpuset_alloc();
+ for (unsigned int i = 0; i < nthreads; ++i)
+ hwloc_cpuset_set(cpuset, threads[i]);
}
+ }
- hwloc_setup_misc_level_from_distances(topology, nbnodes, nodes, (unsigned*) distances);
+ if ((0 == strcmp(device_type, "cpu")) ||
+ (0 == strcmp(device_type, "cache"))) {
+ /* Save core number and cache phandle */
+ unsigned int l2_cache = -1, ibm_phandle = -1;
+ of_get_int_arr(cpu, "l2-cache", &l2_cache, sizeof(l2_cache), root_fd);
+ of_get_int_arr(cpu, "ibm,phandle", &ibm_phandle, sizeof(ibm_phandle),
+ root_fd);
+ if ( ((unsigned)-1 != ibm_phandle) || ((unsigned)-1 != l2_cache)) {
+ nodes.p[nodes.n].ibm_phandle = ibm_phandle;
+ nodes.p[nodes.n].cpuset = (NULL == cpuset)?NULL:hwloc_cpuset_dup(cpuset);
+ nodes.p[nodes.n].l2_cache = l2_cache;
+#ifdef HWLOC_DEBUG
+ strncpy(nodes.p[nodes.n].name, dirent->d_name, sizeof(nodes.p[0].name)-1);
+#endif
+ ++nodes.n;
+ }
+ }
+
+ if (0 == strcmp(device_type, "cpu")) {
+ /* PARP: For a cpu node, the first and only value of the âregâ property
+ shall be the number of the per-processor interrupt
+ line assigned to the processor represented by the node */
+ unsigned int reg = -1;
+ if (0 != of_get_int_arr(cpu, "reg", ®, sizeof(reg), root_fd)) {
+ /* Add core */
+ struct hwloc_obj *core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
+ core->cpuset = hwloc_cpuset_dup(cpuset);
+ hwloc_insert_object_by_cpuset(topology, core);
+
+ /* Add socket */
+ /* -1 - to discuss */
+ struct hwloc_obj *socket = hwloc_alloc_setup_object(HWLOC_OBJ_SOCKET, -1);
+ socket->cpuset = hwloc_cpuset_dup(cpuset);
+ hwloc_insert_object_by_cpuset(topology, socket);
+ }
+
+ /* Add L1 cache */
+ /* Ignore Instruction caches */
+
+ /* d-cache-block-size - ignore */
+ /* d-cache-line-size - to read, in bytes */
+ /* d-cache-sets - ignore */
+ /* d-cache-size - to read, in bytes */
+ /* d-tlb-sets - ignore */
+ /* d-tlb-size - ignore, always 0 on power6 */
+ /* i-cache-* and i-tlb-* represent instruction cache, ignore */
+ unsigned int d_cache_line_size = 0, d_cache_size = 0;
+ if ( (0 != of_get_int_arr(cpu, "d-cache-line-size",
+ &d_cache_line_size, sizeof(d_cache_line_size), root_fd)) &&
+ (0 != of_get_int_arr(cpu, "d-cache-size",
+ &d_cache_size, sizeof(d_cache_size), root_fd)) ) {
+ struct hwloc_obj *cache =
+ hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ cache->attr->cache.size = d_cache_size;
+ cache->attr->cache.depth = 1;
+ cache->attr->cache.linesize = d_cache_line_size;
+ cache->cpuset = hwloc_cpuset_dup(cpuset);
+ hwloc_debug_cpuset("cache depth 1 has cpuset %s\n", cache->cpuset);
+ hwloc_insert_object_by_cpuset(topology, cache);
+ }
+ }
+ hwloc_cpuset_free(cpuset);
}
+ closedir(dt);
- *found = nbnodes;
+ /* No cores and L2 cache were found, exiting */
+ if (0 == nodes.n) {
+ hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
+ return;
+ }
+
+#ifdef HWLOC_DEBUG
+ for (unsigned int i; i < nodes.n; ++i)
+ printf("%i: %s ibm,phandle=%08X l2_cache=%08X cpuset=%08X\n",
+ i, nodes.p[i].name, nodes.p[i].ibm_phandle, nodes.p[i].l2_cache,
+ (unsigned int) nodes.p[i].cpuset);
+#endif
+
+ /* Scan L2/L3/... caches */
+ dt = hwloc_opendir(ofroot, root_fd);
+ if (NULL == dt)
+ return; /* No device-tree, exiting (cannot possibly happen though) */
+
+ while (NULL != (dirent = readdir(dt))) {
+ char cache[256];
+ sprintf(cache, "%s/%s", ofroot, dirent->d_name);
+ char device_type[64];
+
+ if (0 > of_get_str(cache, "device_type",
+ device_type, sizeof(device_type), root_fd))
+ continue;
+ if (0 != strcmp(device_type, "cache"))
+ continue;
+ unsigned int ibm_phandle = 0;
+ if (0 == of_get_int_arr(cache, "ibm,phandle",
+ &ibm_phandle, sizeof(ibm_phandle), root_fd))
+ continue;
+
+ /* Calculate cache level and CPU mask */
+ unsigned int level = 2;
+ hwloc_cpuset_t cpuset = hwloc_cpuset_alloc();
+ if (0 == look_powerpc_device_tree_discover_cache(&nodes,
+ ibm_phandle, &level, cpuset)) {
+ /* Check for "cache-unified" - if it is present, CPU has unified L1 cache */
+ /* d-cache-line-size - to read, in bytes */
+ /* d-cache-sets - ignore */
+ /* d-cache-size - to read, in bytes */
+ /* i-cache-* represent instruction cache, ignore */
+ unsigned int d_cache_line_size = 0, d_cache_size = 0;
+ if ( (0 != of_get_int_arr(cache, "d-cache-line-size",
+ &d_cache_line_size, sizeof(d_cache_line_size), root_fd)) &&
+ (0 != of_get_int_arr(cache, "d-cache-size",
+ &d_cache_size, sizeof(d_cache_size), root_fd)) ) {
+ struct hwloc_obj *c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ c->attr->cache.size = d_cache_size;
+ c->attr->cache.depth = level;
+ c->attr->cache.linesize = d_cache_line_size;
+ c->cpuset = hwloc_cpuset_dup(cpuset);
+ hwloc_debug_1arg_cpuset("cache depth %d has cpuset %s\n", level, c->cpuset);
+ hwloc_insert_object_by_cpuset(topology, c);
+ }
+ }
+ hwloc_cpuset_free(cpuset);
+ }
+ closedir(dt);
+
+ /* Do cleanup */
+ for (unsigned int i = 0; i < nodes.n; ++i)
+ hwloc_cpuset_free(nodes.p[i].cpuset);
}
+
/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
static void
look_sysfscpu(struct hwloc_topology *topology, const char *path)
@@ -1425,6 +1682,7 @@
hwloc_debug_1arg_cpuset("found %d cpu topologies, cpuset %s\n",
hwloc_cpuset_weight(cpuset), cpuset);
+ unsigned caches_added = 0;
hwloc_cpuset_foreach_begin(i, cpuset)
{
struct hwloc_obj *socket, *core, *thread;
@@ -1552,6 +1810,7 @@
hwloc_debug_1arg_cpuset("cache depth %d has cpuset %s\n",
depth, cacheset);
hwloc_insert_object_by_cpuset(topology, cache);
+ ++caches_added;
} else
hwloc_cpuset_free(cacheset);
}
@@ -1559,6 +1818,9 @@
}
hwloc_cpuset_foreach_end();
+ if (0 == caches_added)
+ look_powerpc_device_tree(topology);
+
hwloc_cpuset_free(cpuset);
}
@@ -1567,7 +1829,6 @@
# define PROCESSOR "processor"
# define PHYSID "physical id"
# define COREID "core id"
-#define HWLOC_NBMAXCPUS 1024 /* FIXME: drop */
static int
look_cpuinfo(struct hwloc_topology *topology, const char *path,
hwloc_cpuset_t online_cpuset)
Index: src/topology.c
===================================================================
--- src/topology.c (revision 2427)
+++ src/topology.c (working copy)
@@ -291,7 +291,8 @@
hwloc_setup_misc_level_from_distances(struct hwloc_topology *topology,
unsigned nbobjs,
struct hwloc_obj **objs,
- unsigned *_distances)
+ unsigned *_distances,
+ unsigned *distance_indexes)
{
unsigned (*distances)[nbobjs][nbobjs] = (unsigned (*)[nbobjs][nbobjs])_distances;
unsigned i,j;
@@ -303,11 +304,11 @@
hwloc_debug("%s", "node distance matrix:\n");
hwloc_debug("%s", " ");
for(j=0; j<nbobjs; j++)
- hwloc_debug(" %3u", j);
+ hwloc_debug(" %3u", distance_indexes[j]);
hwloc_debug("%s", "\n");
for(i=0; i<nbobjs; i++) {
- hwloc_debug("%3u", i);
+ hwloc_debug("%3u", distance_indexes[i]);
for(j=0; j<nbobjs; j++)
hwloc_debug(" %3u", (*distances)[i][j]);
hwloc_debug("%s", "\n");