On Tue, Mar 17, 2015 at 03:48:38PM +0000, Igor Mammedov wrote: > since commit > dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT > Linux kernel actually tries to use CPU to Node mapping from > QEMU provided SRAT table instead of discarding it, and that > in some cases breaks build_sched_domains() which expects > sane mapping where cores/threads belonging to the same socket > are on the same NUMA node. > > With current default round-robin mapping of VCPUs to nodes > guest ends-up with cores/threads belonging to the same socket > being on different NUMA nodes. > > For example with following CLI: > qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \ > -numa node,nodeid=0 -numa node,nodeid=1 > 2.6.32 based kernels will hang on boot due to incorrectly build > sched_group-s list in update_sd_lb_stats() > so comment in QEMU justifying dumb default mapping: > " > guest OSes must cope with this anyway, because there are BIOSes > out there in real machines which also use this scheme. > " > isn't really valid. > > Replacing default mapping withi a manual, where VCPUs belonging to > the same socket are on the same NUMA node, fixes issue for > guests which can't handle nonsense topology i.e. cnaging CLI to: > -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7 > > So instead of simply scattering VCPUs around nodes, map > the same socket VCPUs to the same NUMA node, which is what > guest would expect from a sane hardware/BIOS. > > Signed-off-by: Igor Mammedov <imamm...@redhat.com>
I believe the proposed behavior is much better. But if we are going to break compatibility, shouldn't we at least do that before the first -rc so we get feedback in case it break existing configurations? About qemu_cpu_socket_id_from_index(): all qemu-system-* binaries have smp_cores and smp_threads available (even if machines ignore it), but the default stub can return values that are larger than the number of sockets if smp_cores*smp_threads > 1, which would be obviously incorrect. Isn't it easier to simply make "cpu_index/(smp_cores*smp_sockets)" be the default cpu_index->socket mapping function, and allow machine-specific (not arch-specific) overrides if necessary? > --- > include/sysemu/cpus.h | 3 +++ > numa.c | 14 ++++++++++---- > stubs/Makefile.objs | 1 + > stubs/qemu_cpu_socket_id_from_index.c | 6 ++++++ > target-i386/cpu.c | 11 +++++++++++ > 5 files changed, 31 insertions(+), 4 deletions(-) > create mode 100644 stubs/qemu_cpu_socket_id_from_index.c > > diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h > index 3f162a9..aacabcb 100644 > --- a/include/sysemu/cpus.h > +++ b/include/sysemu/cpus.h > @@ -1,5 +1,6 @@ > #ifndef QEMU_CPUS_H > #define QEMU_CPUS_H > +#include "qemu-common.h" > > /* cpus.c */ > void qemu_init_cpu_loop(void); > @@ -18,6 +19,8 @@ void qtest_clock_warp(int64_t dest); > /* vl.c */ > extern int smp_cores; > extern int smp_threads; > + > +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index); > #else > /* *-user doesn't have configurable SMP topology */ > #define smp_cores 1 > diff --git a/numa.c b/numa.c > index ffbec68..5297749 100644 > --- a/numa.c > +++ b/numa.c > @@ -26,6 +26,7 @@ > #include "exec/cpu-common.h" > #include "qemu/bitmap.h" > #include "qom/cpu.h" > +#include "sysemu/cpus.h" > #include "qemu/error-report.h" > #include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */ > #include "qapi-visit.h" > @@ -233,13 +234,18 @@ void parse_numa_opts(void) > break; > } > } > - /* assigning the VCPUs round-robin is easier to implement, guest OSes > - * must cope with this anyway, because there are BIOSes out there in > - * real machines which also use this scheme. > + /* Assign VCPUs from the same socket to the same node. > + * Since mapping is arch dependent, target that care about > + * correct mapping of VCPUs to node should implement > + * qemu_cpu_socket_id_from_index() function that maps cpu_index to > + * a socket #, for all other cases legacy round-robin mode > + * will be used. > */ > if (i == nb_numa_nodes) { > for (i = 0; i < max_cpus; i++) { > - set_bit(i, numa_info[i % nb_numa_nodes].node_cpu); > + unsigned socket_id = qemu_cpu_socket_id_from_index(i); > + > + set_bit(i, numa_info[socket_id % nb_numa_nodes].node_cpu); > } > } > } > diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs > index 8beff4c..86b8060 100644 > --- a/stubs/Makefile.objs > +++ b/stubs/Makefile.objs > @@ -39,3 +39,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o > stub-obj-y += cpus.o > stub-obj-y += kvm.o > stub-obj-y += qmp_pc_dimm_device_list.o > +stub-obj-y += qemu_cpu_socket_id_from_index.o > diff --git a/stubs/qemu_cpu_socket_id_from_index.c > b/stubs/qemu_cpu_socket_id_from_index.c > new file mode 100644 > index 0000000..3d8ea8b > --- /dev/null > +++ b/stubs/qemu_cpu_socket_id_from_index.c > @@ -0,0 +1,6 @@ > +#include "sysemu/cpus.h" > + > +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index) > +{ > + return cpu_index; > +} > diff --git a/target-i386/cpu.c b/target-i386/cpu.c > index ed7e5d5..7a7e236 100644 > --- a/target-i386/cpu.c > +++ b/target-i386/cpu.c > @@ -47,6 +47,7 @@ > #include "hw/xen/xen.h" > #include "hw/i386/apic_internal.h" > #endif > +#include "hw/i386/topology.h" > > > /* Cache topology CPUID constants: */ > @@ -2822,6 +2823,16 @@ out: > } > } > > +#ifndef CONFIG_USER_ONLY > +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index) > +{ > + unsigned pkg_id, core_id, smt_id; > + x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index, > + &pkg_id, &core_id, &smt_id); > + return pkg_id; > +} > +#endif > + > static void x86_cpu_initfn(Object *obj) > { > CPUState *cs = CPU(obj); > -- > 1.8.3.1 > -- Eduardo