Re: [PATCH v5 2/2 gnumach] percpu area using gs segment
Damien Zammit, le dim. 24 sept. 2023 10:35:10 +, a ecrit:
> Untested on 64 bit

It will just not work :) On 64-bit, the fs/gs segment base needs to be set instead.

Samuel
Re: [PATCH v5 2/2 gnumach] percpu area using gs segment
Applied, thanks! Damien Zammit, le dim. 24 sept. 2023 10:35:10 +, a ecrit: > This speeds up smp again, by storing the struct processor > in a percpu area and avoiding an expensive cpu_number every call > of current_processor(), as well as getting the cpu_number by > an offset into the percpu area. Untested on 64 bit > and work remains to use other percpu arrays. > > TESTED: (NCPUS=8) -smp 1 boots to login shell ~2x slower than uniprocessor > TESTED: (NCPUS=8) -smp 2 boots to INIT but hangs there > TESTED: (NCPUS=8) -smp 4 gets stuck seemingly within rumpdisk and hangs > TESTED: (NCPUS=1) uniprocessor is a bit faster than normal > > --- > i386/Makefrag.am| 2 + > i386/i386/cpu_number.h | 17 +++-- > i386/i386/fpu.c | 2 +- > i386/i386/gdt.c | 21 +-- > i386/i386/gdt.h | 8 ++-- > i386/i386/i386asm.sym | 2 + > i386/i386/locore.S | 20 ++ > i386/i386/mp_desc.c | 3 +- > i386/i386/percpu.c | 31 +++ > i386/i386/percpu.h | 83 + > i386/i386/pit.c | 2 +- > i386/i386/spl.S | 16 > i386/i386at/model_dep.c | 1 + > kern/cpu_number.h | 3 +- > kern/processor.c| 7 +--- > kern/processor.h| 17 +++-- > kern/startup.c | 3 +- > x86_64/Makefrag.am | 2 + > x86_64/locore.S | 7 ++-- > 19 files changed, 194 insertions(+), 53 deletions(-) > create mode 100644 i386/i386/percpu.c > create mode 100644 i386/i386/percpu.h > > diff --git a/i386/Makefrag.am b/i386/Makefrag.am > index 274e8695..c1724cea 100644 > --- a/i386/Makefrag.am > +++ b/i386/Makefrag.am > @@ -108,6 +108,8 @@ libkernel_a_SOURCES += \ > i386/i386/irq.c \ > i386/i386/irq.h \ > i386/i386/msr.h \ > + i386/i386/percpu.c \ > + i386/i386/percpu.h \ > i386/i386/pit.c \ > i386/i386/pit.h > > diff --git a/i386/i386/cpu_number.h b/i386/i386/cpu_number.h > index 8357be84..6ba46e4b 100644 > --- a/i386/i386/cpu_number.h > +++ b/i386/i386/cpu_number.h > @@ -30,6 +30,8 @@ > #ifndef _I386_CPU_NUMBER_H_ > #define _I386_CPU_NUMBER_H_ > > +#define MY(stm) %gs:PERCPU_##stm > + > #if NCPUS > 1 > > #ifdef __i386__ > @@ -45,8 +47,8 @@ > shrl$24, reg;\ > 
movl%cs:CX(cpu_id_lut, reg), reg;\ > > -/* Never call CPU_NUMBER(%esi) */ > -#define CPU_NUMBER(reg) \ > +/* Never call CPU_NUMBER_NO_GS(%esi) */ > +#define CPU_NUMBER_NO_GS(reg)\ > pushl %esi;\ > pushl %eax;\ > pushl %ebx;\ > @@ -63,20 +65,29 @@ > movl%esi, reg ;\ > popl%esi;\ > > +#define CPU_NUMBER(reg) \ > + movlMY(CPU_ID), reg; > + > #ifndef __ASSEMBLER__ > #include > #include > +#include > > -static inline int cpu_number(void) > +static inline int cpu_number_slow(void) > { > return cpu_id_lut[apic_get_current_cpu()]; > } > > +static inline int cpu_number(void) > +{ > + return percpu_get(int, cpu_id); > +} > #endif > > #else/* NCPUS == 1 */ > > #define CPU_NUMBER_NO_STACK(reg) > +#define CPU_NUMBER_NO_GS(reg) > #define CPU_NUMBER(reg) > #define CX(addr,reg)addr > > diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c > index fefe5e49..e1818683 100644 > --- a/i386/i386/fpu.c > +++ b/i386/i386/fpu.c > @@ -119,7 +119,7 @@ init_fpu(void) > #else/* MACH_RING1 */ > unsigned int native = 0; > > - if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486) > + if (machine_slot[cpu_number_slow()].cpu_type >= CPU_TYPE_I486) > native = CR0_NE; > > /* > diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c > index ddda603b..4edd3ec5 100644 > --- a/i386/i386/gdt.c > +++ b/i386/i386/gdt.c > @@ -35,6 +35,8 @@ > > #include > #include > +#include > +#include > > #include "vm_param.h" > #include "seg.h" > @@ -48,7 +50,7 @@ extern > struct real_descriptor gdt[GDTSZ]; > > static void > -gdt_fill(struct real_descriptor *mygdt) > +gdt_fill(int cpu, struct real_descriptor *mygdt) > { > /* Initialize the kernel code and data segment descriptors. 
*/ > #ifdef __x86_64__ > @@ -73,6 +75,16 @@ gdt_fill(struct real_descriptor *mygdt) > 0x, > ACC_PL_K|ACC_DATA_W, SZ_32); > #endif /* MACH_PV_DESCRIPTORS */ > + vm_offset_t thiscpu = kvtolin(_array[cpu]); > + _fill_gdt_descriptor(mygdt, PERCPU_DS, > + thiscpu, > + thiscpu + sizeof(struct percpu) - 1, > +#ifdef __x86_64__ > + ACC_PL_K|ACC_DATA_W, SZ_64 > +#else > + ACC_PL_K|ACC_DATA_W, SZ_32 > +#endif > + ); > #endif > > #ifdef MACH_PV_DESCRIPTORS > @@ -119,15 +131,16 @@ reload_segs(void) > >"movw
[PATCH v5 2/2 gnumach] percpu area using gs segment
This speeds up smp again, by storing the struct processor in a percpu area and avoiding an expensive cpu_number every call of current_processor(), as well as getting the cpu_number by an offset into the percpu area. Untested on 64 bit and work remains to use other percpu arrays. TESTED: (NCPUS=8) -smp 1 boots to login shell ~2x slower than uniprocessor TESTED: (NCPUS=8) -smp 2 boots to INIT but hangs there TESTED: (NCPUS=8) -smp 4 gets stuck seemingly within rumpdisk and hangs TESTED: (NCPUS=1) uniprocessor is a bit faster than normal --- i386/Makefrag.am| 2 + i386/i386/cpu_number.h | 17 +++-- i386/i386/fpu.c | 2 +- i386/i386/gdt.c | 21 +-- i386/i386/gdt.h | 8 ++-- i386/i386/i386asm.sym | 2 + i386/i386/locore.S | 20 ++ i386/i386/mp_desc.c | 3 +- i386/i386/percpu.c | 31 +++ i386/i386/percpu.h | 83 + i386/i386/pit.c | 2 +- i386/i386/spl.S | 16 i386/i386at/model_dep.c | 1 + kern/cpu_number.h | 3 +- kern/processor.c| 7 +--- kern/processor.h| 17 +++-- kern/startup.c | 3 +- x86_64/Makefrag.am | 2 + x86_64/locore.S | 7 ++-- 19 files changed, 194 insertions(+), 53 deletions(-) create mode 100644 i386/i386/percpu.c create mode 100644 i386/i386/percpu.h diff --git a/i386/Makefrag.am b/i386/Makefrag.am index 274e8695..c1724cea 100644 --- a/i386/Makefrag.am +++ b/i386/Makefrag.am @@ -108,6 +108,8 @@ libkernel_a_SOURCES += \ i386/i386/irq.c \ i386/i386/irq.h \ i386/i386/msr.h \ + i386/i386/percpu.c \ + i386/i386/percpu.h \ i386/i386/pit.c \ i386/i386/pit.h diff --git a/i386/i386/cpu_number.h b/i386/i386/cpu_number.h index 8357be84..6ba46e4b 100644 --- a/i386/i386/cpu_number.h +++ b/i386/i386/cpu_number.h @@ -30,6 +30,8 @@ #ifndef_I386_CPU_NUMBER_H_ #define_I386_CPU_NUMBER_H_ +#define MY(stm)%gs:PERCPU_##stm + #ifNCPUS > 1 #ifdef __i386__ @@ -45,8 +47,8 @@ shrl$24, reg;\ movl%cs:CX(cpu_id_lut, reg), reg;\ -/* Never call CPU_NUMBER(%esi) */ -#define CPU_NUMBER(reg)\ +/* Never call CPU_NUMBER_NO_GS(%esi) */ +#define CPU_NUMBER_NO_GS(reg) \ pushl %esi;\ pushl %eax;\ pushl %ebx;\ @@ 
-63,20 +65,29 @@ movl%esi, reg ;\ popl%esi;\ +#define CPU_NUMBER(reg)\ + movlMY(CPU_ID), reg; + #ifndef __ASSEMBLER__ #include #include +#include -static inline int cpu_number(void) +static inline int cpu_number_slow(void) { return cpu_id_lut[apic_get_current_cpu()]; } +static inline int cpu_number(void) +{ + return percpu_get(int, cpu_id); +} #endif #else /* NCPUS == 1 */ #defineCPU_NUMBER_NO_STACK(reg) +#defineCPU_NUMBER_NO_GS(reg) #defineCPU_NUMBER(reg) #defineCX(addr,reg)addr diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c index fefe5e49..e1818683 100644 --- a/i386/i386/fpu.c +++ b/i386/i386/fpu.c @@ -119,7 +119,7 @@ init_fpu(void) #else /* MACH_RING1 */ unsigned int native = 0; - if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486) + if (machine_slot[cpu_number_slow()].cpu_type >= CPU_TYPE_I486) native = CR0_NE; /* diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c index ddda603b..4edd3ec5 100644 --- a/i386/i386/gdt.c +++ b/i386/i386/gdt.c @@ -35,6 +35,8 @@ #include #include +#include +#include #include "vm_param.h" #include "seg.h" @@ -48,7 +50,7 @@ extern struct real_descriptor gdt[GDTSZ]; static void -gdt_fill(struct real_descriptor *mygdt) +gdt_fill(int cpu, struct real_descriptor *mygdt) { /* Initialize the kernel code and data segment descriptors. */ #ifdef __x86_64__ @@ -73,6 +75,16 @@ gdt_fill(struct real_descriptor *mygdt) 0x, ACC_PL_K|ACC_DATA_W, SZ_32); #endif /* MACH_PV_DESCRIPTORS */ + vm_offset_t thiscpu = kvtolin(_array[cpu]); + _fill_gdt_descriptor(mygdt, PERCPU_DS, + thiscpu, + thiscpu + sizeof(struct percpu) - 1, +#ifdef __x86_64__ + ACC_PL_K|ACC_DATA_W, SZ_64 +#else + ACC_PL_K|ACC_DATA_W, SZ_32 +#endif + ); #endif #ifdef MACH_PV_DESCRIPTORS @@ -119,15 +131,16 @@ reload_segs(void) "movw %w1,%%ds\n" "movw %w1,%%es\n" +"movw %w3,%%gs\n" "movw %w1,%%ss\n" -: : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0)); +: : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0), "r" (PERCPU_DS)); #endif } void gdt_init(void)