Re: Build cpu topology on amd64.

2012-07-13 Thread Christiano F. Haesbaert
On Fri, Jul 13, 2012 at 03:06:34PM +0200, Mark Kettenis wrote:
> > Date: Fri, 13 Jul 2012 14:57:11 +0200
> > From: "Christiano F. Haesbaert" 
> > 
> > Ok, so here is the version with #ifndef SMALL_KERNEL. The only question
> > that remains is: do we keep the printf in dmesg, or shall I take that
> > out?
> > 
> > I'd like to keep it so we can tell whether the detection is correct just by
> > looking at submitted dmesgs.
> 
> Can you shelve this until you:
> 
> a) Have the equivalent code for i386.

Sure, that should actually be the same code; I just need to make the
identifycpu() stuff run on each cpu on i386, as it does on amd64.

> b) Have something that actually uses this?

That won't be so simple, but ok :)

Let me explain why. When I started all this, I wanted to favor migrating
procs within the same core first, and then within the same package. So you
would pay a penalty to cross cores and a double penalty to cross packages.

But this is naive and stupid: sometimes you want procs to go as far
away as possible. Think of 2 procs that thrash each other's cachelines.
Brett and I found a good metric in a paper by Alexandra Fedorova; it
involves calculating a "pain" parameter, but we're far away from making
that possible and viable, and we have easier/bigger gains right now doing
other stuff.

I'll hold onto it, at least it's on the mailing lists so users can play
with it :).

Cheers 

> 
> Cheers,
> 
> Mark
> 
> > Index: arch/amd64/amd64/identcpu.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
> > retrieving revision 1.36
> > diff -d -u -p -r1.36 identcpu.c
> > --- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 -  1.36
> > +++ arch/amd64/amd64/identcpu.c 13 Jul 2012 11:45:58 -
> > @@ -446,4 +446,126 @@ identifycpu(struct cpu_info *ci)
> > sensordev_install(&ci->ci_sensordev);
> >  #endif
> > }
> > +#ifndef SMALL_KERNEL
> > +   cpu_topology(ci);
> > +#endif
> > +}
> > +
> > +#ifndef SMALL_KERNEL
> > +/*
> > + * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
> > + */
> > +static int
> > +log2(unsigned int i)
> > +{
> > +   int ret = 0;
> > +
> > +   while (i >>= 1)
> > +   ret++;
> > +
> > +   return (ret);
> > +}
> > +
> > +static int
> > +mask_width(u_int x)
> > +{
> > +   int bit;
> > +   int mask;
> > +   int powerof2;
> > +
> > +   powerof2 = ((x - 1) & x) == 0;
> > +   mask = (x << (1 - powerof2)) - 1;
> > +
> > +   /* fls */
> > +   if (mask == 0)
> > +   return (0);
> > +   for (bit = 1; mask != 1; bit++)
> > +   mask = (unsigned int)mask >> 1;
> > +
> > +   return (bit);
> > +}
> > +
> > +/*
> > + * Build up cpu topology for given cpu, must run on the core itself.
> > + */
> > +void
> > +cpu_topology(struct cpu_info *ci)
> > +{
> > +   u_int32_t eax, ebx, ecx, edx;
> > +   u_int32_t apicid, max_apicid, max_coreid;
> > +   u_int32_t smt_bits, core_bits, pkg_bits;
> > +   u_int32_t smt_mask, core_mask, pkg_mask;
> > +   
> > +   /* We need at least apicid at CPUID 1 */
> > +   CPUID(0, eax, ebx, ecx, edx);
> > +   if (eax < 1)
> > +   goto no_topology;
> > +   
> > +   /* Initial apicid */
> > +   CPUID(1, eax, ebx, ecx, edx);
> > +   apicid = (ebx >> 24) & 0xff;
> > +   
> > +   if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
> > +   /* We need at least apicid at CPUID 0x8008 */
> > +   CPUID(0x8000, eax, ebx, ecx, edx);
> > +   if (eax < 0x8008)
> > +   goto no_topology;
> > +   
> > +   CPUID(0x8008, eax, ebx, ecx, edx);
> > +   core_bits = (ecx >> 12) & 0xf;
> > +   if (core_bits == 0)
> > +   goto no_topology;
> > +   /* So coreidsize 2 gives 3, 3 gives 7... */
> > +   core_mask = (1 << core_bits) - 1;
> > +   /* Core id is the least significant considering mask */
> > +   ci->ci_core_id = apicid & core_mask;
> > +   /* Pkg id is the upper remaining bits */
> > +   ci->ci_pkg_id = apicid & ~core_mask;
> > +   ci->ci_pkg_id >>= core_bits;
> > +   } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
> > +   /* We only support leaf 1/4 detection */
> > +   CPUID(0, eax, ebx, ecx, edx);
> > +   if (eax < 4)
> > +   goto no_topology;
> > +   /* Get max_apicid */
> > +   CPUID(1, eax, ebx, ecx, edx);
> > +   max_apicid = (ebx >> 16) & 0xff;
> > +   /* Get max_coreid */
> > +   CPUID2(4, 0, eax, ebx, ecx, edx);
> > +   max_coreid = ((eax >> 26) & 0x3f) + 1;
> > +   /* SMT */
> > +   smt_bits = mask_width(max_apicid / max_coreid);
> > +   smt_mask = (1 << smt_bits) - 1;
> > +   /* Core */
> > +   core_bits = log2(max_coreid);
> > +   core_mask = (1 << (core_bits + smt_bits)) - 1;
> > +   core_mask ^= smt_mask;
> > +   /* Pkg */
> > +   pkg_bits = core_bits + s

Re: Build cpu topology on amd64.

2012-07-13 Thread Mark Kettenis
> Date: Fri, 13 Jul 2012 14:57:11 +0200
> From: "Christiano F. Haesbaert" 
> 
> Ok, so here is the version with #ifndef SMALL_KERNEL. The only question
> that remains is: do we keep the printf in dmesg, or shall I take that
> out?
> 
> I'd like to keep it so we can tell whether the detection is correct just by
> looking at submitted dmesgs.

Can you shelve this until you:

a) Have the equivalent code for i386.
b) Have something that actually uses this?

Cheers,

Mark

> Index: arch/amd64/amd64/identcpu.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
> retrieving revision 1.36
> diff -d -u -p -r1.36 identcpu.c
> --- arch/amd64/amd64/identcpu.c   22 Apr 2012 19:36:09 -  1.36
> +++ arch/amd64/amd64/identcpu.c   13 Jul 2012 11:45:58 -
> @@ -446,4 +446,126 @@ identifycpu(struct cpu_info *ci)
>   sensordev_install(&ci->ci_sensordev);
>  #endif
>   }
> +#ifndef SMALL_KERNEL
> + cpu_topology(ci);
> +#endif
> +}
> +
> +#ifndef SMALL_KERNEL
> +/*
> + * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
> + */
> +static int
> +log2(unsigned int i)
> +{
> + int ret = 0;
> +
> + while (i >>= 1)
> + ret++;
> +
> + return (ret);
> +}
> +
> +static int
> +mask_width(u_int x)
> +{
> + int bit;
> + int mask;
> + int powerof2;
> +
> + powerof2 = ((x - 1) & x) == 0;
> + mask = (x << (1 - powerof2)) - 1;
> +
> + /* fls */
> + if (mask == 0)
> + return (0);
> + for (bit = 1; mask != 1; bit++)
> + mask = (unsigned int)mask >> 1;
> +
> + return (bit);
> +}
> +
> +/*
> + * Build up cpu topology for given cpu, must run on the core itself.
> + */
> +void
> +cpu_topology(struct cpu_info *ci)
> +{
> + u_int32_t eax, ebx, ecx, edx;
> + u_int32_t apicid, max_apicid, max_coreid;
> + u_int32_t smt_bits, core_bits, pkg_bits;
> + u_int32_t smt_mask, core_mask, pkg_mask;
> + 
> + /* We need at least apicid at CPUID 1 */
> + CPUID(0, eax, ebx, ecx, edx);
> + if (eax < 1)
> + goto no_topology;
> + 
> + /* Initial apicid */
> + CPUID(1, eax, ebx, ecx, edx);
> + apicid = (ebx >> 24) & 0xff;
> + 
> + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
> + /* We need at least apicid at CPUID 0x8008 */
> + CPUID(0x8000, eax, ebx, ecx, edx);
> + if (eax < 0x8008)
> + goto no_topology;
> + 
> + CPUID(0x8008, eax, ebx, ecx, edx);
> + core_bits = (ecx >> 12) & 0xf;
> + if (core_bits == 0)
> + goto no_topology;
> + /* So coreidsize 2 gives 3, 3 gives 7... */
> + core_mask = (1 << core_bits) - 1;
> + /* Core id is the least significant considering mask */
> + ci->ci_core_id = apicid & core_mask;
> + /* Pkg id is the upper remaining bits */
> + ci->ci_pkg_id = apicid & ~core_mask;
> + ci->ci_pkg_id >>= core_bits;
> + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
> + /* We only support leaf 1/4 detection */
> + CPUID(0, eax, ebx, ecx, edx);
> + if (eax < 4)
> + goto no_topology;
> + /* Get max_apicid */
> + CPUID(1, eax, ebx, ecx, edx);
> + max_apicid = (ebx >> 16) & 0xff;
> + /* Get max_coreid */
> + CPUID2(4, 0, eax, ebx, ecx, edx);
> + max_coreid = ((eax >> 26) & 0x3f) + 1;
> + /* SMT */
> + smt_bits = mask_width(max_apicid / max_coreid);
> + smt_mask = (1 << smt_bits) - 1;
> + /* Core */
> + core_bits = log2(max_coreid);
> + core_mask = (1 << (core_bits + smt_bits)) - 1;
> + core_mask ^= smt_mask;
> + /* Pkg */
> + pkg_bits = core_bits + smt_bits;
> + pkg_mask = -1 << core_bits;
> +  
> + ci->ci_smt_id = apicid & smt_mask;
> + ci->ci_core_id = (apicid & core_mask) >> smt_bits;
> + ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits;
> + } else
> + goto no_topology;
> +#ifdef DEBUG
> + printf("cpu%d: smt %u, core %u, pkg %u "
> + "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, 
> smt_mask 0x%x, "
> + "core_bits 0x%x, core_mask 0x%x, pkg_bits 0x%x, pkg_mask 0x%x)\n",
> + ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id,
> + apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits,
> + core_mask, pkg_bits, pkg_mask);
> +#else
> + printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid,
> + ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id);
> + 
> +#endif
> + return;
> + /* We can't map, so consider ci_core_id as ci_cpuid */
> +no_topology:
> + c

Re: Build cpu topology on amd64.

2012-07-13 Thread Christiano F. Haesbaert
Ok, so here is the version with #ifndef SMALL_KERNEL. The only question
that remains is: do we keep the printf in dmesg, or shall I take that
out?

I'd like to keep it so we can tell whether the detection is correct just by
looking at submitted dmesgs.

Index: arch/amd64/amd64/identcpu.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
retrieving revision 1.36
diff -d -u -p -r1.36 identcpu.c
--- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 -  1.36
+++ arch/amd64/amd64/identcpu.c 13 Jul 2012 11:45:58 -
@@ -446,4 +446,126 @@ identifycpu(struct cpu_info *ci)
sensordev_install(&ci->ci_sensordev);
 #endif
}
+#ifndef SMALL_KERNEL
+   cpu_topology(ci);
+#endif
+}
+
+#ifndef SMALL_KERNEL
+/*
+ * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
+ */
+static int
+log2(unsigned int i)
+{
+   int ret = 0;
+
+   while (i >>= 1)
+   ret++;
+
+   return (ret);
+}
+
+static int
+mask_width(u_int x)
+{
+   int bit;
+   int mask;
+   int powerof2;
+
+   powerof2 = ((x - 1) & x) == 0;
+   mask = (x << (1 - powerof2)) - 1;
+
+   /* fls */
+   if (mask == 0)
+   return (0);
+   for (bit = 1; mask != 1; bit++)
+   mask = (unsigned int)mask >> 1;
+
+   return (bit);
+}
+
+/*
+ * Build up cpu topology for given cpu, must run on the core itself.
+ */
+void
+cpu_topology(struct cpu_info *ci)
+{
+   u_int32_t eax, ebx, ecx, edx;
+   u_int32_t apicid, max_apicid, max_coreid;
+   u_int32_t smt_bits, core_bits, pkg_bits;
+   u_int32_t smt_mask, core_mask, pkg_mask;
+   
+   /* We need at least apicid at CPUID 1 */
+   CPUID(0, eax, ebx, ecx, edx);
+   if (eax < 1)
+   goto no_topology;
+   
+   /* Initial apicid */
+   CPUID(1, eax, ebx, ecx, edx);
+   apicid = (ebx >> 24) & 0xff;
+   
+   if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+   /* We need at least apicid at CPUID 0x8008 */
+   CPUID(0x8000, eax, ebx, ecx, edx);
+   if (eax < 0x8008)
+   goto no_topology;
+   
+   CPUID(0x8008, eax, ebx, ecx, edx);
+   core_bits = (ecx >> 12) & 0xf;
+   if (core_bits == 0)
+   goto no_topology;
+   /* So coreidsize 2 gives 3, 3 gives 7... */
+   core_mask = (1 << core_bits) - 1;
+   /* Core id is the least significant considering mask */
+   ci->ci_core_id = apicid & core_mask;
+   /* Pkg id is the upper remaining bits */
+   ci->ci_pkg_id = apicid & ~core_mask;
+   ci->ci_pkg_id >>= core_bits;
+   } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+   /* We only support leaf 1/4 detection */
+   CPUID(0, eax, ebx, ecx, edx);
+   if (eax < 4)
+   goto no_topology;
+   /* Get max_apicid */
+   CPUID(1, eax, ebx, ecx, edx);
+   max_apicid = (ebx >> 16) & 0xff;
+   /* Get max_coreid */
+   CPUID2(4, 0, eax, ebx, ecx, edx);
+   max_coreid = ((eax >> 26) & 0x3f) + 1;
+   /* SMT */
+   smt_bits = mask_width(max_apicid / max_coreid);
+   smt_mask = (1 << smt_bits) - 1;
+   /* Core */
+   core_bits = log2(max_coreid);
+   core_mask = (1 << (core_bits + smt_bits)) - 1;
+   core_mask ^= smt_mask;
+   /* Pkg */
+   pkg_bits = core_bits + smt_bits;
+   pkg_mask = -1 << core_bits;
+
+   ci->ci_smt_id = apicid & smt_mask;
+   ci->ci_core_id = (apicid & core_mask) >> smt_bits;
+   ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits;
+   } else
+   goto no_topology;
+#ifdef DEBUG
+   printf("cpu%d: smt %u, core %u, pkg %u "
+   "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, 
smt_mask 0x%x, "
+   "core_bits 0x%x, core_mask 0x%x, pkg_bits 0x%x, pkg_mask 0x%x)\n",
+   ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id,
+   apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits,
+   core_mask, pkg_bits, pkg_mask);
+#else
+   printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid,
+   ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id);
+   
+#endif
+   return;
+   /* We can't map, so consider ci_core_id as ci_cpuid */
+no_topology:
+   ci->ci_smt_id  = 0;
+   ci->ci_core_id = ci->ci_cpuid;
+   ci->ci_pkg_id  = 0;
 }
+#endif /* SMALL_KERNEL */
Index: arch/amd64/include/cpu.h
===
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.73
diff -d -u -p -r1.73 cpu.h

Re: Build cpu topology on amd64.

2012-07-08 Thread Gregor Best
On Sun, Jul 08, 2012 at 11:47:42AM +0200, Christiano F. Haesbaert wrote:
> [...]
> Do we want this ?
> [...]

I definitely want it, at least for my EEVDF experiments (maybe that patch is
the kick in the butt I needed to finally get that into
some sensible shape). So yeah, even if it won't get into the tree, I'll have a
use for it. Thanks :)

--
Gregor Best

[demime 1.01d removed an attachment of type application/pgp-signature]



Build cpu topology on amd64.

2012-07-08 Thread Christiano F. Haesbaert
Heya, 

I have this rotting in my tree, since actually using it effectively is
way harder than it seems. Anyhow, this correctly builds the topology on
amd64; we now know 3 things about each cpu:

- thread id (smt id)
- core id
- package id

This is not complete, but it is enough IMHO; it lacks x2apic detection.
I've tried to trim it up, but the mask logic is a bit cryptic.

Note: I left a printf in dmesg just so that people can test; I intend to
remove it if it goes in.

an atom d270 reports the following:
cpu0: smt 0, core 0, package 0
cpu1: smt 1, core 0, package 0
cpu2: smt 0, core 1, package 0
cpu3: smt 1, core 1, package 0

a core2duo L7500:
cpu0: smt 0, core 0, package 0
cpu1: smt 0, core 1, package 0

Do we want this ? 

Index: arch/amd64/amd64/identcpu.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
retrieving revision 1.36
diff -d -u -p -r1.36 identcpu.c
--- arch/amd64/amd64/identcpu.c 22 Apr 2012 19:36:09 -  1.36
+++ arch/amd64/amd64/identcpu.c 8 Jul 2012 09:03:02 -
@@ -446,4 +446,123 @@ identifycpu(struct cpu_info *ci)
sensordev_install(&ci->ci_sensordev);
 #endif
}
+
+   cpu_topology(ci);
+}
+
+/*
+ * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
+ */
+static int
+log2(unsigned int i)
+{
+   int ret = 0;
+
+   while (i >>= 1)
+   ret++;
+
+   return (ret);
+}
+
+static int
+mask_width(u_int x)
+{
+   int bit;
+   int mask;
+   int powerof2;
+
+   powerof2 = ((x - 1) & x) == 0;
+   mask = (x << (1 - powerof2)) - 1;
+
+   /* fls */
+   if (mask == 0)
+   return (0);
+   for (bit = 1; mask != 1; bit++)
+   mask = (unsigned int)mask >> 1;
+
+   return (bit);
+}
+
+/*
+ * Build up cpu topology for given cpu, must run on the core itself.
+ */
+void
+cpu_topology(struct cpu_info *ci)
+{
+   u_int32_t eax, ebx, ecx, edx;
+   u_int32_t apicid, max_apicid, max_coreid;
+   u_int32_t smt_bits, core_bits, pkg_bits;
+   u_int32_t smt_mask, core_mask, pkg_mask;
+   
+   /* We need at least apicid at CPUID 1 */
+   CPUID(0, eax, ebx, ecx, edx);
+   if (eax < 1)
+   goto no_topology;
+   
+   /* Initial apicid */
+   CPUID(1, eax, ebx, ecx, edx);
+   apicid = (ebx >> 24) & 0xff;
+   
+   if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+   /* We need at least apicid at CPUID 0x8008 */
+   CPUID(0x8000, eax, ebx, ecx, edx);
+   if (eax < 0x8008)
+   goto no_topology;
+   
+   CPUID(0x8008, eax, ebx, ecx, edx);
+   core_bits = (ecx >> 12) & 0xf;
+   if (core_bits == 0)
+   goto no_topology;
+   /* So coreidsize 2 gives 3, 3 gives 7... */
+   core_mask = (1 << core_bits) - 1;
+   /* Core id is the least significant considering mask */
+   ci->ci_core_id = apicid & core_mask;
+   /* Pkg id is the upper remaining bits */
+   ci->ci_pkg_id = apicid & ~core_mask;
+   ci->ci_pkg_id >>= core_bits;
+   } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+   /* We only support leaf 1/4 detection */
+   CPUID(0, eax, ebx, ecx, edx);
+   if (eax < 4)
+   goto no_topology;
+   /* Get max_apicid */
+   CPUID(1, eax, ebx, ecx, edx);
+   max_apicid = (ebx >> 16) & 0xff;
+   /* Get max_coreid */
+   CPUID2(4, 0, eax, ebx, ecx, edx);
+   max_coreid = ((eax >> 26) & 0x3f) + 1;
+   /* SMT */
+   smt_bits = mask_width(max_apicid / max_coreid);
+   smt_mask = (1 << smt_bits) - 1;
+   /* Core */
+   core_bits = log2(max_coreid);
+   core_mask = (1 << (core_bits + smt_bits)) - 1;
+   core_mask ^= smt_mask;
+   /* Pkg */
+   pkg_bits = core_bits + smt_bits;
+   pkg_mask = -1 << core_bits;
+
+   ci->ci_smt_id = apicid & smt_mask;
+   ci->ci_core_id = (apicid & core_mask) >> smt_bits;
+   ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits;
+   } else
+   goto no_topology;
+#ifdef DEBUG
+   printf("cpu%d: smt %u, core %u, pkg %u "
+   "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, 
smt_mask 0x%x, "
+   "core_bits 0x%x, core_mask 0x%x, pkg_bits 0x%x, pkg_mask 0x%x)\n",
+   ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id,
+   apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits,
+   core_mask, pkg_bits, pkg_mask);
+#else
+   printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid,
+   ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id);
+