While debugging libnuma helpers, I needed a way to emulate remote topologies in libnuma the way hwloc does, so that we can run HWLOC_FSROOT=/foo/bar tests/linux-libnuma on any of my existing remote topologies. I modified numactl 2.0.8-rc3 with the attached patch so that it honors HWLOC_FSROOT when reading /sys and /proc files.
It's not perfect: * We need to retrieve the remote /proc/self/status and manually add it to $FSROOT/proc/self/ (libnuma needs it, hwloc doesn't gather it). * The remote topology cannot use cpumaps that are larger than the local kernel cpumap. Basically, you need to remove leading zeros in $FSROOT/sys/devices/system/node/node*/cpumap until it's no longer than your local /sys/devices/system/node/node0/cpumap. * Some corner cases, such as highly sparse node ids, don't work, maybe because CONFIG_NODES_SHIFT=6 in my kernel. The patch also brings back the old "numa_all_nodes" behavior (before libnuma 2.0.6) wrt nodes with no memory if HWLOC_OLD_LIBNUMA=1. But this shouldn't matter anyway because I removed the nodemask_t helpers from trunk. Still no reply to my bug report on numa-devel about this: http://thread.gmane.org/gmane.linux.kernel.numa/716 I am sharing this so that it doesn't get lost, in case somebody ever has to debug this again. Brice
diff -ur numactl-2.0.8~rc3/affinity.c numactl-2.0.8~rc3-new/affinity.c --- numactl-2.0.8~rc3/affinity.c 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/affinity.c 2012-02-20 13:50:29.537251813 +0100 @@ -49,6 +49,8 @@ #include "affinity.h" #include "rtnetlink.h" +extern char *rootpath; + static int badchar(char *s) { if (strpbrk(s, "/.")) @@ -88,7 +90,7 @@ much about the actual sysfs layout. */ char path[1024]; char *fn = NULL; - if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0 && + if (asprintf(&fn, "%s/sys/class/%s/%s", rootpath, cls, dev) > 0 && readlink(fn, path, sizeof path) > 0) { regex_t re; regmatch_t match[2]; @@ -104,7 +106,7 @@ assert(match[0].rm_eo > 0); path[match[1].rm_eo + 1] = 0; p = path + match[0].rm_so; - ret = sysfs_node_read(mask, "/sys/%s/numa_node", p); + ret = sysfs_node_read(mask, "%s/sys/%s/numa_node", rootpath, p); if (ret < 0) return node_parse_failure(ret, NULL, p); return ret; @@ -112,8 +114,8 @@ } free(fn); - ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node", - cls, dev); + ret = sysfs_node_read(mask, "%s/sys/class/%s/%s/device/numa_node", + rootpath, cls, dev); if (ret < 0) return node_parse_failure(ret, cls, dev); return 0; @@ -131,7 +133,7 @@ struct dirent de, *dep; cls = "block"; - char fn[sizeof("/sys/class/") + strlen(cls)]; + char fn[1024]; if (stat(file, &st) < 0) { numa_warn(W_blockdev1, "Cannot stat file %s", file); return -1; @@ -145,7 +147,7 @@ } else if (S_ISBLK(st.st_mode)) d = st.st_rdev; - sprintf(fn, "/sys/class/%s", cls); + sprintf(fn, "%s/sys/class/%s", rootpath, cls); dir = opendir(fn); if (!dir) { numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs", @@ -157,10 +159,10 @@ if (*name == '.') continue; char *dev; - char fn2[sizeof("/sys/class/block//dev") + strlen(name)]; + char fn2[1024]; n = -1; - if (sprintf(fn2, "/sys/class/block/%s/dev", name) < 0) + if (sprintf(fn2, "%s/sys/class/block/%s/dev", rootpath, name) < 0) break; dev = sysfs_read(fn2); if (dev) { @@ -299,8 +301,8 
@@ return -1; } ret = sysfs_node_read(mask, - "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node", - seg, bus, seg, bus, dev, func); + "%s/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node", + rootpath, seg, bus, seg, bus, dev, func); if (ret < 0) return node_parse_failure(ret, cls, id); return 0; diff -ur numactl-2.0.8~rc3/distance.c numactl-2.0.8~rc3-new/distance.c --- numactl-2.0.8~rc3/distance.c 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/distance.c 2012-02-20 13:46:15.713260676 +0100 @@ -61,7 +61,8 @@ for (nd = 0;; nd++) { char fn[100]; FILE *dfh; - sprintf(fn, "/sys/devices/system/node/node%d/distance", nd); + char *env = getenv("HWLOC_FSROOT"); + sprintf(fn, "%s/sys/devices/system/node/node%d/distance", env ? : "", nd); dfh = fopen(fn, "r"); if (!dfh) { if (errno == ENOENT && nd > 0) diff -ur numactl-2.0.8~rc3/libnuma.c numactl-2.0.8~rc3-new/libnuma.c --- numactl-2.0.8~rc3/libnuma.c 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/libnuma.c 2012-02-20 13:40:18.225273153 +0100 @@ -77,6 +77,9 @@ int numa_exit_on_warn = 0; static void set_sizes(void); +char *rootpath = ""; +static char *mask_size_file; + /* * There are two special functions, _init(void) and _fini(void), which * are called automatically by the dynamic loader whenever a library is loaded. 
@@ -87,6 +90,12 @@ numa_init(void) { int max,i; + char *env; + + env = getenv("HWLOC_FSROOT"); + if (env) + rootpath = env; + asprintf(&mask_size_file, "%s/proc/self/status", rootpath); if (sizes_set) return; @@ -320,11 +329,13 @@ DIR *d; struct dirent *de; long long freep; + char *path; numa_memnode_ptr = numa_allocate_nodemask(); numa_nodes_ptr = numa_allocate_nodemask(); - d = opendir("/sys/devices/system/node"); + asprintf(&path, "%s/sys/devices/system/node", rootpath); + d = opendir(path); if (!d) { maxconfigurednode = 0; } else { @@ -341,6 +352,7 @@ } closedir(d); } + free(path); } /* @@ -358,7 +370,6 @@ return strncmp(s, pre, strlen(pre)) == 0; } -static const char *mask_size_file = "/proc/self/status"; static const char *nodemask_prefix = "Mems_allowed:\t"; /* * (do this the way Paul Jackson's libcpuset does it) @@ -557,10 +568,13 @@ static void set_configured_cpus(void) { - char *dirnamep = "/sys/devices/system/cpu"; + char *dirnamep; struct dirent *dirent; DIR *dir; + + asprintf(&dirnamep, "%s/sys/devices/system/cpu", rootpath); dir = opendir(dirnamep); + free(dirnamep); if (dir == NULL) { /* fall back to using the online cpu count */ @@ -600,6 +614,8 @@ int numa_num_configured_nodes(void) { + if (getenv("HWLOC_OLD_LIBNUMA")) + return maxconfigurednode+1; /* * NOTE: this function's behavior matches the documentation (ie: it * returns a count of nodes with memory) despite the poor function @@ -743,13 +759,13 @@ char *line = NULL; long long size = -1; FILE *f; - char fn[64]; + char fn[512]; int ok = 0; int required = freep ? 
2 : 1; if (freep) *freep = -1; - sprintf(fn,"/sys/devices/system/node/node%d/meminfo", node); + sprintf(fn, "%s/sys/devices/system/node/node%d/meminfo", rootpath, node); f = fopen(fn, "r"); if (!f) return -1; @@ -1254,7 +1270,7 @@ numa_node_to_cpus_v1(int node, unsigned long *buffer, int bufferlen) { int err = 0; - char fn[64]; + char fn[512]; FILE *f; char *line = NULL; size_t len = 0; @@ -1281,7 +1297,7 @@ mask = (unsigned long *)buffer; memset(mask, 0, buflen_needed); - sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); + sprintf(fn, "%s/sys/devices/system/node/node%d/cpumap", rootpath, node); f = fopen(fn, "r"); if (!f || getdelim(&line, &len, '\n', f) < 1) { numa_warn(W_nosysfs2, @@ -1330,7 +1346,7 @@ { int err = 0, bufferlen; int nnodes = numa_max_node(); - char fn[64], *line = NULL; + char fn[512], *line = NULL; FILE *f; size_t len = 0; struct bitmask *mask; @@ -1359,7 +1375,7 @@ mask = numa_allocate_cpumask(); /* this is a kernel cpumask_t (see node_read_cpumap()) */ - sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); + sprintf(fn, "%s/sys/devices/system/node/node%d/cpumap", rootpath, node); f = fopen(fn, "r"); if (!f || getdelim(&line, &len, '\n', f) < 1) { numa_warn(W_nosysfs2, diff -ur numactl-2.0.8~rc3/migspeed.c numactl-2.0.8~rc3-new/migspeed.c --- numactl-2.0.8~rc3/migspeed.c 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/migspeed.c 2012-02-20 13:51:17.457250140 +0100 @@ -4,6 +4,7 @@ * (C) 2007 Silicon Graphics, Inc. Christoph Lameter <clame...@sgi.com> * */ +#define _GNU_SOURCE 1 #include <stdio.h> #include <stdlib.h> #include "numa.h" @@ -40,10 +41,16 @@ void displaymap(void) { - FILE *f = fopen("/proc/self/numa_maps","r"); + char *env = getenv("HWLOC_FSROOT"); + char *path; + FILE *f; + + asprintf(&path, "%s/proc/self/numa_maps", env ? 
: ""); + f = fopen(path, "r"); + free(path); if (!f) { - printf("/proc/self/numa_maps not accessible.\n"); + printf("%s not accessible.\n", path); exit(1); } diff -ur numactl-2.0.8~rc3/numamon.c numactl-2.0.8~rc3-new/numamon.c --- numactl-2.0.8~rc3/numamon.c 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/numamon.c 2012-02-20 13:51:35.681249506 +0100 @@ -243,7 +243,12 @@ char *line = NULL; size_t size = 0; int bad = 0; - FILE *f = fopen("/proc/cpuinfo", "r"); + char *env = getenv("HWLOC_FSROOT"); + char *path; + FILE *f; + asprintf(&path, "%s/proc/cpuinfo", env ? : ""); + f = fopen(path, "r"); + free(path); if (!f) return; while (getline(&line, &size, f) > 0) { diff -ur numactl-2.0.8~rc3/numastat numactl-2.0.8~rc3-new/numastat --- numactl-2.0.8~rc3/numastat 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/numastat 2012-02-19 12:48:29.520445939 +0100 @@ -39,7 +39,7 @@ } $WIDTH = 32 if $WIDTH < 32; -if (! -d "/sys/devices/system/node" ) { +if (! -d "/home/bgoglin/SOFT/hwloc/topology-chroot/power7vnode/sys/devices/system/node" ) { print STDERR "sysfs not mounted or system not NUMA aware\n"; exit 1; } @@ -47,7 +47,7 @@ %stat = (); $title = ""; $mode = 0; -opendir(NODES, "/sys/devices/system/node") || exit 1; +opendir(NODES, "/home/bgoglin/SOFT/hwloc/topology-chroot/power7vnode/sys/devices/system/node") || exit 1; foreach $nd (readdir(NODES)) { next unless $nd =~ /node(\d+)/; # On newer kernels, readdir may enumerate the 'node(\d+) subdirs @@ -59,7 +59,7 @@ if (!$title && $nd =~ /node0/) { $mode = 1; } - open(STAT, "/sys/devices/system/node/$nd/numastat") || + open(STAT, "/home/bgoglin/SOFT/hwloc/topology-chroot/power7vnode/sys/devices/system/node/$nd/numastat") || die "cannot open $nd: $!\n"; if (! $mode) { $title = sprintf("%16s",$nd) . 
$title; diff -ur numactl-2.0.8~rc3/shm.c numactl-2.0.8~rc3-new/shm.c --- numactl-2.0.8~rc3/shm.c 2011-12-19 15:51:35.000000000 +0100 +++ numactl-2.0.8~rc3-new/shm.c 2012-02-20 13:43:10.161267153 +0100 @@ -44,11 +44,19 @@ int shmflags; static int shm_pagesize; +extern char *rootpath; + long huge_page_size(void) { size_t len = 0; char *line = NULL; - FILE *f = fopen("/proc/meminfo", "r"); + char *env = getenv("HWLOC_FSROOT"); + char *path; + FILE *f; + + asprintf(&path, "%s/proc/meminfo", env ? : ""); + f = fopen(path, "r"); + free(path); if (f != NULL) { while (getdelim(&line, &len, '\n', f) > 0) { int ps;