Re: Build cpu topology on amd64.
On Fri, Jul 13, 2012 at 03:06:34PM +0200, Mark Kettenis wrote: > > Date: Fri, 13 Jul 2012 14:57:11 +0200 > > From: "Christiano F. Haesbaert" > > > > Ok so here is the version with #ifndef SMALL_KERNEL, the only question > > that remains is: do we keep the printf in dmesg ? or shall I take that > > out ? > > > > I'd like to keep it so we may know if the detection is correctly just by > > looking at sent dmesgs. > > Can you shelve this until you: > > a) Have the equivalent code for i386. Sure, that should actually be the same code, I just need to make the identifycpu() stuff run on each cpu on i386 as in amd64. > b) Have something that actually uses this? That won't be so simple, but ok :) Let me explain why, when I started all this I wanted to favor migration from procs on the same core, and then on the same package. So you would pay a penalty to cross cores and a double penalty to cross packages. But this is naive and stupid, sometimes, you want procs to go as far away as possible: think of 2 procs that trash the cachelines, Brett and I found a good metric from a paper from Alexandra Fedorova, it involves calculating a "pain" parameter, but we're far away from making that possible and viable, we have easier/bigger gains right now doing other stuff. I'll hold onto it, at least it's on the mailing lists so users can play with it :). Cheers > > Cheers, > > Mark > > > Index: arch/amd64/amd64/identcpu.c > > === > > RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v > > retrieving revision 1.36 > > diff -d -u -p -r1.36 identcpu.c > > --- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 - 1.36 > > +++ arch/amd64/amd64/identcpu.c 13 Jul 2012 11:45:58 - > > @@ -446,4 +446,126 @@ identifycpu(struct cpu_info *ci) > > sensordev_install(&ci->ci_sensordev); > > #endif > > } > > +#ifndef SMALL_KERNEL > > + cpu_topology(ci); > > +#endif > > +} > > + > > +#ifndef SMALL_KERNEL > > +/* > > + * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). 
> > + */ > > +static int > > +log2(unsigned int i) > > +{ > > + int ret = 0; > > + > > + while (i >>= 1) > > + ret++; > > + > > + return (ret); > > +} > > + > > +static int > > +mask_width(u_int x) > > +{ > > + int bit; > > + int mask; > > + int powerof2; > > + > > + powerof2 = ((x - 1) & x) == 0; > > + mask = (x << (1 - powerof2)) - 1; > > + > > + /* fls */ > > + if (mask == 0) > > + return (0); > > + for (bit = 1; mask != 1; bit++) > > + mask = (unsigned int)mask >> 1; > > + > > + return (bit); > > +} > > + > > +/* > > + * Build up cpu topology for given cpu, must run on the core itself. > > + */ > > +void > > +cpu_topology(struct cpu_info *ci) > > +{ > > + u_int32_t eax, ebx, ecx, edx; > > + u_int32_t apicid, max_apicid, max_coreid; > > + u_int32_t smt_bits, core_bits, pkg_bits; > > + u_int32_t smt_mask, core_mask, pkg_mask; > > + > > + /* We need at least apicid at CPUID 1 */ > > + CPUID(0, eax, ebx, ecx, edx); > > + if (eax < 1) > > + goto no_topology; > > + > > + /* Initial apicid */ > > + CPUID(1, eax, ebx, ecx, edx); > > + apicid = (ebx >> 24) & 0xff; > > + > > + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { > > + /* We need at least apicid at CPUID 0x80000008 */ > > + CPUID(0x80000000, eax, ebx, ecx, edx); > > + if (eax < 0x80000008) > > + goto no_topology; > > + > > + CPUID(0x80000008, eax, ebx, ecx, edx); > > + core_bits = (ecx >> 12) & 0xf; > > + if (core_bits == 0) > > + goto no_topology; > > + /* So coreidsize 2 gives 3, 3 gives 7... 
*/ > > + core_mask = (1 << core_bits) - 1; > > + /* Core id is the least significant considering mask */ > > + ci->ci_core_id = apicid & core_mask; > > + /* Pkg id is the upper remaining bits */ > > + ci->ci_pkg_id = apicid & ~core_mask; > > + ci->ci_pkg_id >>= core_bits; > > + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { > > + /* We only support leaf 1/4 detection */ > > + CPUID(0, eax, ebx, ecx, edx); > > + if (eax < 4) > > + goto no_topology; > > + /* Get max_apicid */ > > + CPUID(1, eax, ebx, ecx, edx); > > + max_apicid = (ebx >> 16) & 0xff; > > + /* Get max_coreid */ > > + CPUID2(4, 0, eax, ebx, ecx, edx); > > + max_coreid = ((eax >> 26) & 0x3f) + 1; > > + /* SMT */ > > + smt_bits = mask_width(max_apicid / max_coreid); > > + smt_mask = (1 << smt_bits) - 1; > > + /* Core */ > > + core_bits = log2(max_coreid); > > + core_mask = (1 << (core_bits + smt_bits)) - 1; > > + core_mask ^= smt_mask; > > + /* Pkg */ > > + pkg_bits = core_bits + s
Re: Build cpu topology on amd64.
> Date: Fri, 13 Jul 2012 14:57:11 +0200 > From: "Christiano F. Haesbaert" > > Ok so here is the version with #ifndef SMALL_KERNEL, the only question > that remains is: do we keep the printf in dmesg ? or shall I take that > out ? > > I'd like to keep it so we may know if the detection is correctly just by > looking at sent dmesgs. Can you shelve this until you: a) Have the equivalent code for i386. b) Have something that actually uses this? Cheers, Mark > Index: arch/amd64/amd64/identcpu.c > === > RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v > retrieving revision 1.36 > diff -d -u -p -r1.36 identcpu.c > --- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 - 1.36 > +++ arch/amd64/amd64/identcpu.c 13 Jul 2012 11:45:58 - > @@ -446,4 +446,126 @@ identifycpu(struct cpu_info *ci) > sensordev_install(&ci->ci_sensordev); > #endif > } > +#ifndef SMALL_KERNEL > + cpu_topology(ci); > +#endif > +} > + > +#ifndef SMALL_KERNEL > +/* > + * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). > + */ > +static int > +log2(unsigned int i) > +{ > + int ret = 0; > + > + while (i >>= 1) > + ret++; > + > + return (ret); > +} > + > +static int > +mask_width(u_int x) > +{ > + int bit; > + int mask; > + int powerof2; > + > + powerof2 = ((x - 1) & x) == 0; > + mask = (x << (1 - powerof2)) - 1; > + > + /* fls */ > + if (mask == 0) > + return (0); > + for (bit = 1; mask != 1; bit++) > + mask = (unsigned int)mask >> 1; > + > + return (bit); > +} > + > +/* > + * Build up cpu topology for given cpu, must run on the core itself. 
> + */ > +void > +cpu_topology(struct cpu_info *ci) > +{ > + u_int32_t eax, ebx, ecx, edx; > + u_int32_t apicid, max_apicid, max_coreid; > + u_int32_t smt_bits, core_bits, pkg_bits; > + u_int32_t smt_mask, core_mask, pkg_mask; > + > + /* We need at least apicid at CPUID 1 */ > + CPUID(0, eax, ebx, ecx, edx); > + if (eax < 1) > + goto no_topology; > + > + /* Initial apicid */ > + CPUID(1, eax, ebx, ecx, edx); > + apicid = (ebx >> 24) & 0xff; > + > + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { > + /* We need at least apicid at CPUID 0x80000008 */ > + CPUID(0x80000000, eax, ebx, ecx, edx); > + if (eax < 0x80000008) > + goto no_topology; > + > + CPUID(0x80000008, eax, ebx, ecx, edx); > + core_bits = (ecx >> 12) & 0xf; > + if (core_bits == 0) > + goto no_topology; > + /* So coreidsize 2 gives 3, 3 gives 7... */ > + core_mask = (1 << core_bits) - 1; > + /* Core id is the least significant considering mask */ > + ci->ci_core_id = apicid & core_mask; > + /* Pkg id is the upper remaining bits */ > + ci->ci_pkg_id = apicid & ~core_mask; > + ci->ci_pkg_id >>= core_bits; > + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { > + /* We only support leaf 1/4 detection */ > + CPUID(0, eax, ebx, ecx, edx); > + if (eax < 4) > + goto no_topology; > + /* Get max_apicid */ > + CPUID(1, eax, ebx, ecx, edx); > + max_apicid = (ebx >> 16) & 0xff; > + /* Get max_coreid */ > + CPUID2(4, 0, eax, ebx, ecx, edx); > + max_coreid = ((eax >> 26) & 0x3f) + 1; > + /* SMT */ > + smt_bits = mask_width(max_apicid / max_coreid); > + smt_mask = (1 << smt_bits) - 1; > + /* Core */ > + core_bits = log2(max_coreid); > + core_mask = (1 << (core_bits + smt_bits)) - 1; > + core_mask ^= smt_mask; > + /* Pkg */ > + pkg_bits = core_bits + smt_bits; > + pkg_mask = -1 << core_bits; > + > + ci->ci_smt_id = apicid & smt_mask; > + ci->ci_core_id = (apicid & core_mask) >> smt_bits; > + ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits; > + } else > + goto no_topology; > +#ifdef DEBUG > + printf("cpu%d: smt %u, core %u, pkg %u " > 
+ "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, > smt_mask 0x%x, " > + "core_bits 0x%x, core_mask 0x%x, pkg_bits 0x%x, pkg_mask 0x%x)\n", > + ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id, > + apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits, > + core_mask, pkg_bits, pkg_mask); > +#else > + printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid, > + ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id); > + > +#endif > + return; > + /* We can't map, so consider ci_core_id as ci_cpuid */ > +no_topology: > + c
Re: Build cpu topology on amd64.
Ok, so here is the version with #ifndef SMALL_KERNEL. The only question that remains is: do we keep the printf in dmesg, or shall I take that out? I'd like to keep it so we may know if the detection is correct just by looking at sent dmesgs. Index: arch/amd64/amd64/identcpu.c === RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v retrieving revision 1.36 diff -d -u -p -r1.36 identcpu.c --- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 -0000 1.36 +++ arch/amd64/amd64/identcpu.c 13 Jul 2012 11:45:58 -0000 @@ -446,4 +446,126 @@ identifycpu(struct cpu_info *ci) sensordev_install(&ci->ci_sensordev); #endif } +#ifndef SMALL_KERNEL + cpu_topology(ci); +#endif +} + +#ifndef SMALL_KERNEL +/* + * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). + */ +static int +log2(unsigned int i) +{ + int ret = 0; + + while (i >>= 1) + ret++; + + return (ret); +} + +static int +mask_width(u_int x) +{ + int bit; + int mask; + int powerof2; + + powerof2 = ((x - 1) & x) == 0; + mask = (x << (1 - powerof2)) - 1; + + /* fls */ + if (mask == 0) + return (0); + for (bit = 1; mask != 1; bit++) + mask = (unsigned int)mask >> 1; + + return (bit); +} + +/* + * Build up cpu topology for given cpu, must run on the core itself. + */ +void +cpu_topology(struct cpu_info *ci) +{ + u_int32_t eax, ebx, ecx, edx; + u_int32_t apicid, max_apicid, max_coreid; + u_int32_t smt_bits, core_bits, pkg_bits; + u_int32_t smt_mask, core_mask, pkg_mask; + + /* We need at least apicid at CPUID 1 */ + CPUID(0, eax, ebx, ecx, edx); + if (eax < 1) + goto no_topology; + + /* Initial apicid */ + CPUID(1, eax, ebx, ecx, edx); + apicid = (ebx >> 24) & 0xff; + + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + /* We need at least apicid at CPUID 0x80000008 */ + CPUID(0x80000000, eax, ebx, ecx, edx); + if (eax < 0x80000008) + goto no_topology; + + CPUID(0x80000008, eax, ebx, ecx, edx); + core_bits = (ecx >> 12) & 0xf; + if (core_bits == 0) + goto no_topology; + /* So coreidsize 2 gives 3, 3 gives 7... 
*/ + core_mask = (1 << core_bits) - 1; + /* Core id is the least significant considering mask */ + ci->ci_core_id = apicid & core_mask; + /* Pkg id is the upper remaining bits */ + ci->ci_pkg_id = apicid & ~core_mask; + ci->ci_pkg_id >>= core_bits; + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + /* We only support leaf 1/4 detection */ + CPUID(0, eax, ebx, ecx, edx); + if (eax < 4) + goto no_topology; + /* Get max_apicid */ + CPUID(1, eax, ebx, ecx, edx); + max_apicid = (ebx >> 16) & 0xff; + /* Get max_coreid */ + CPUID2(4, 0, eax, ebx, ecx, edx); + max_coreid = ((eax >> 26) & 0x3f) + 1; + /* SMT */ + smt_bits = mask_width(max_apicid / max_coreid); + smt_mask = (1 << smt_bits) - 1; + /* Core */ + core_bits = log2(max_coreid); + core_mask = (1 << (core_bits + smt_bits)) - 1; + core_mask ^= smt_mask; + /* Pkg */ + pkg_bits = core_bits + smt_bits; + pkg_mask = -1 << core_bits; + + ci->ci_smt_id = apicid & smt_mask; + ci->ci_core_id = (apicid & core_mask) >> smt_bits; + ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits; + } else + goto no_topology; +#ifdef DEBUG + printf("cpu%d: smt %u, core %u, pkg %u " + "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, smt_mask 0x%x, " + "core_bits 0x%x, core_mask 0x%x, pkg_bits 0x%x, pkg_mask 0x%x)\n", + ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id, + apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits, + core_mask, pkg_bits, pkg_mask); +#else + printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid, + ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id); + +#endif + return; + /* We can't map, so consider ci_core_id as ci_cpuid */ +no_topology: + ci->ci_smt_id = 0; + ci->ci_core_id = ci->ci_cpuid; + ci->ci_pkg_id = 0; } +#endif /* SMALL_KERNEL */ Index: arch/amd64/include/cpu.h === RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.73 diff -d -u -p -r1.73 cpu.h
Re: Build cpu topology on amd64.
On Sun, Jul 08, 2012 at 11:47:42AM +0200, Christiano F. Haesbaert wrote: > [...] > Do we want this ? > [...] I definitely want it, at least for my EEVDF experiments (maybe that patch is the kick in the butt I needed to finally get that into some sensible shape). So yeah, even if it won't get into the tree, I'll have a use for it. Thanks :) -- Gregor Best [demime 1.01d removed an attachment of type application/pgp-signature]
Build cpu topology on amd64.
Heya, I have had this rotting in my tree, since actually using it effectively is way harder than it seems. Anyhow, this correctly builds the topology on amd64; we now know 3 things about each cpu: - thread id (smt id) - core id - package id This is not complete, but it is enough IMHO; it lacks x2apic detection. I've tried to trim it up, but the mask logic is a bit cryptic. Note: I left a print in dmesg just so that people can test; I intend to remove it if it goes in. An atom d270 reports the following: cpu0: smt 0, core 0, package 0 cpu1: smt 1, core 0, package 0 cpu2: smt 0, core 1, package 0 cpu3: smt 1, core 1, package 0 A core2duo L7500: cpu0: smt 0, core 0, package 0 cpu1: smt 0, core 1, package 0 Do we want this? Index: arch/amd64/amd64/identcpu.c === RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v retrieving revision 1.36 diff -d -u -p -r1.36 identcpu.c --- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 -0000 1.36 +++ arch/amd64/amd64/identcpu.c 8 Jul 2012 09:03:02 -0000 @@ -446,4 +446,123 @@ identifycpu(struct cpu_info *ci) sensordev_install(&ci->ci_sensordev); #endif } + + cpu_topology(ci); +} + +/* + * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). + */ +static int +log2(unsigned int i) +{ + int ret = 0; + + while (i >>= 1) + ret++; + + return (ret); +} + +static int +mask_width(u_int x) +{ + int bit; + int mask; + int powerof2; + + powerof2 = ((x - 1) & x) == 0; + mask = (x << (1 - powerof2)) - 1; + + /* fls */ + if (mask == 0) + return (0); + for (bit = 1; mask != 1; bit++) + mask = (unsigned int)mask >> 1; + + return (bit); +} + +/* + * Build up cpu topology for given cpu, must run on the core itself. 
+ */ +void +cpu_topology(struct cpu_info *ci) +{ + u_int32_t eax, ebx, ecx, edx; + u_int32_t apicid, max_apicid, max_coreid; + u_int32_t smt_bits, core_bits, pkg_bits; + u_int32_t smt_mask, core_mask, pkg_mask; + + /* We need at least apicid at CPUID 1 */ + CPUID(0, eax, ebx, ecx, edx); + if (eax < 1) + goto no_topology; + + /* Initial apicid */ + CPUID(1, eax, ebx, ecx, edx); + apicid = (ebx >> 24) & 0xff; + + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + /* We need at least apicid at CPUID 0x80000008 */ + CPUID(0x80000000, eax, ebx, ecx, edx); + if (eax < 0x80000008) + goto no_topology; + + CPUID(0x80000008, eax, ebx, ecx, edx); + core_bits = (ecx >> 12) & 0xf; + if (core_bits == 0) + goto no_topology; + /* So coreidsize 2 gives 3, 3 gives 7... */ + core_mask = (1 << core_bits) - 1; + /* Core id is the least significant considering mask */ + ci->ci_core_id = apicid & core_mask; + /* Pkg id is the upper remaining bits */ + ci->ci_pkg_id = apicid & ~core_mask; + ci->ci_pkg_id >>= core_bits; + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + /* We only support leaf 1/4 detection */ + CPUID(0, eax, ebx, ecx, edx); + if (eax < 4) + goto no_topology; + /* Get max_apicid */ + CPUID(1, eax, ebx, ecx, edx); + max_apicid = (ebx >> 16) & 0xff; + /* Get max_coreid */ + CPUID2(4, 0, eax, ebx, ecx, edx); + max_coreid = ((eax >> 26) & 0x3f) + 1; + /* SMT */ + smt_bits = mask_width(max_apicid / max_coreid); + smt_mask = (1 << smt_bits) - 1; + /* Core */ + core_bits = log2(max_coreid); + core_mask = (1 << (core_bits + smt_bits)) - 1; + core_mask ^= smt_mask; + /* Pkg */ + pkg_bits = core_bits + smt_bits; + pkg_mask = -1 << core_bits; + + ci->ci_smt_id = apicid & smt_mask; + ci->ci_core_id = (apicid & core_mask) >> smt_bits; + ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits; + } else + goto no_topology; +#ifdef DEBUG + printf("cpu%d: smt %u, core %u, pkg %u " + "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, smt_mask 0x%x, " + "core_bits 0x%x, core_mask 0x%x, pkg_bits 
0x%x, pkg_mask 0x%x)\n", + ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id, + apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits, + core_mask, pkg_bits, pkg_mask); +#else + printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid, + ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id); +