Re: [PATCH v5 2/2 gnumach] percpu area using gs segment

2023-09-24 Thread Samuel Thibault
Damien Zammit, le dim. 24 sept. 2023 10:35:10 +0000, a ecrit:
> Untested on 64 bit

It will just not work :)

64-bit needs to set an FS/GS base instead.

Samuel



Re: [PATCH v5 2/2 gnumach] percpu area using gs segment

2023-09-24 Thread Samuel Thibault
Applied, thanks!

Damien Zammit, le dim. 24 sept. 2023 10:35:10 +0000, a ecrit:
> This speeds up smp again, by storing the struct processor
> in a percpu area and avoiding an expensive cpu_number every call
> of current_processor(), as well as getting the cpu_number by
> an offset into the percpu area.  Untested on 64 bit
> and work remains to use other percpu arrays.
> 
> TESTED: (NCPUS=8) -smp 1 boots to login shell ~2x slower than uniprocessor
> TESTED: (NCPUS=8) -smp 2 boots to INIT but hangs there
> TESTED: (NCPUS=8) -smp 4 gets stuck seemingly within rumpdisk and hangs
> TESTED: (NCPUS=1) uniprocessor is a bit faster than normal
> 
> ---
>  i386/Makefrag.am|  2 +
>  i386/i386/cpu_number.h  | 17 +++--
>  i386/i386/fpu.c |  2 +-
>  i386/i386/gdt.c | 21 +--
>  i386/i386/gdt.h |  8 ++--
>  i386/i386/i386asm.sym   |  2 +
>  i386/i386/locore.S  | 20 ++
>  i386/i386/mp_desc.c |  3 +-
>  i386/i386/percpu.c  | 31 +++
>  i386/i386/percpu.h  | 83 +
>  i386/i386/pit.c |  2 +-
>  i386/i386/spl.S | 16 
>  i386/i386at/model_dep.c |  1 +
>  kern/cpu_number.h   |  3 +-
>  kern/processor.c|  7 +---
>  kern/processor.h| 17 +++--
>  kern/startup.c  |  3 +-
>  x86_64/Makefrag.am  |  2 +
>  x86_64/locore.S |  7 ++--
>  19 files changed, 194 insertions(+), 53 deletions(-)
>  create mode 100644 i386/i386/percpu.c
>  create mode 100644 i386/i386/percpu.h
> 
> diff --git a/i386/Makefrag.am b/i386/Makefrag.am
> index 274e8695..c1724cea 100644
> --- a/i386/Makefrag.am
> +++ b/i386/Makefrag.am
> @@ -108,6 +108,8 @@ libkernel_a_SOURCES += \
>   i386/i386/irq.c \
>   i386/i386/irq.h \
>   i386/i386/msr.h \
> + i386/i386/percpu.c \
> + i386/i386/percpu.h \
>   i386/i386/pit.c \
>   i386/i386/pit.h
>  
> diff --git a/i386/i386/cpu_number.h b/i386/i386/cpu_number.h
> index 8357be84..6ba46e4b 100644
> --- a/i386/i386/cpu_number.h
> +++ b/i386/i386/cpu_number.h
> @@ -30,6 +30,8 @@
>  #ifndef  _I386_CPU_NUMBER_H_
>  #define  _I386_CPU_NUMBER_H_
>  
> +#define MY(stm)  %gs:PERCPU_##stm
> +
>  #if  NCPUS > 1
>  
>  #ifdef __i386__
> @@ -45,8 +47,8 @@
>   shrl$24, reg;\
>   movl%cs:CX(cpu_id_lut, reg), reg;\
>  
> -/* Never call CPU_NUMBER(%esi) */
> -#define CPU_NUMBER(reg)  \
> +/* Never call CPU_NUMBER_NO_GS(%esi) */
> +#define CPU_NUMBER_NO_GS(reg)\
>   pushl   %esi;\
>   pushl   %eax;\
>   pushl   %ebx;\
> @@ -63,20 +65,29 @@
>   movl%esi, reg   ;\
>   popl%esi;\
>  
> +#define CPU_NUMBER(reg)  \
> + movlMY(CPU_ID), reg;
> +
>  #ifndef __ASSEMBLER__
>  #include 
>  #include 
> +#include 
>  
> -static inline int cpu_number(void)
> +static inline int cpu_number_slow(void)
>  {
>   return cpu_id_lut[apic_get_current_cpu()];
>  }
>  
> +static inline int cpu_number(void)
> +{
> + return percpu_get(int, cpu_id);
> +}
>  #endif
>  
>  #else/* NCPUS == 1 */
>  
>  #define  CPU_NUMBER_NO_STACK(reg)
> +#define  CPU_NUMBER_NO_GS(reg)
>  #define  CPU_NUMBER(reg)
>  #define  CX(addr,reg)addr
>  
> diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c
> index fefe5e49..e1818683 100644
> --- a/i386/i386/fpu.c
> +++ b/i386/i386/fpu.c
> @@ -119,7 +119,7 @@ init_fpu(void)
>  #else/* MACH_RING1 */
>   unsigned int native = 0;
>  
> - if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486)
> + if (machine_slot[cpu_number_slow()].cpu_type >= CPU_TYPE_I486)
>   native = CR0_NE;
>  
>   /*
> diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c
> index ddda603b..4edd3ec5 100644
> --- a/i386/i386/gdt.c
> +++ b/i386/i386/gdt.c
> @@ -35,6 +35,8 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #include "vm_param.h"
>  #include "seg.h"
> @@ -48,7 +50,7 @@ extern
>  struct real_descriptor gdt[GDTSZ];
>  
>  static void
> -gdt_fill(struct real_descriptor *mygdt)
> +gdt_fill(int cpu, struct real_descriptor *mygdt)
>  {
>   /* Initialize the kernel code and data segment descriptors.  */
>  #ifdef __x86_64__
> @@ -73,6 +75,16 @@ gdt_fill(struct real_descriptor *mygdt)
>   0xffffffff,
>   ACC_PL_K|ACC_DATA_W, SZ_32);
>  #endif   /* MACH_PV_DESCRIPTORS */
> + vm_offset_t thiscpu = kvtolin(&percpu_array[cpu]);
> + _fill_gdt_descriptor(mygdt, PERCPU_DS,
> + thiscpu,
> + thiscpu + sizeof(struct percpu) - 1,
> +#ifdef __x86_64__
> + ACC_PL_K|ACC_DATA_W, SZ_64
> +#else
> + ACC_PL_K|ACC_DATA_W, SZ_32
> +#endif
> + );
>  #endif
>  
>  #ifdef   MACH_PV_DESCRIPTORS
> @@ -119,15 +131,16 @@ reload_segs(void)
>
>"movw

[PATCH v5 2/2 gnumach] percpu area using gs segment

2023-09-24 Thread Damien Zammit
This speeds up SMP again by storing the struct processor in a percpu
area, which avoids an expensive cpu_number() lookup on every call of
current_processor(), and by reading the cpu number from an offset into
the percpu area.  Untested on 64-bit; work remains to convert the other
percpu arrays.

TESTED: (NCPUS=8) -smp 1 boots to login shell ~2x slower than uniprocessor
TESTED: (NCPUS=8) -smp 2 boots to INIT but hangs there
TESTED: (NCPUS=8) -smp 4 gets stuck seemingly within rumpdisk and hangs
TESTED: (NCPUS=1) uniprocessor is a bit faster than normal

---
 i386/Makefrag.am|  2 +
 i386/i386/cpu_number.h  | 17 +++--
 i386/i386/fpu.c |  2 +-
 i386/i386/gdt.c | 21 +--
 i386/i386/gdt.h |  8 ++--
 i386/i386/i386asm.sym   |  2 +
 i386/i386/locore.S  | 20 ++
 i386/i386/mp_desc.c |  3 +-
 i386/i386/percpu.c  | 31 +++
 i386/i386/percpu.h  | 83 +
 i386/i386/pit.c |  2 +-
 i386/i386/spl.S | 16 
 i386/i386at/model_dep.c |  1 +
 kern/cpu_number.h   |  3 +-
 kern/processor.c|  7 +---
 kern/processor.h| 17 +++--
 kern/startup.c  |  3 +-
 x86_64/Makefrag.am  |  2 +
 x86_64/locore.S |  7 ++--
 19 files changed, 194 insertions(+), 53 deletions(-)
 create mode 100644 i386/i386/percpu.c
 create mode 100644 i386/i386/percpu.h

diff --git a/i386/Makefrag.am b/i386/Makefrag.am
index 274e8695..c1724cea 100644
--- a/i386/Makefrag.am
+++ b/i386/Makefrag.am
@@ -108,6 +108,8 @@ libkernel_a_SOURCES += \
i386/i386/irq.c \
i386/i386/irq.h \
i386/i386/msr.h \
+   i386/i386/percpu.c \
+   i386/i386/percpu.h \
i386/i386/pit.c \
i386/i386/pit.h
 
diff --git a/i386/i386/cpu_number.h b/i386/i386/cpu_number.h
index 8357be84..6ba46e4b 100644
--- a/i386/i386/cpu_number.h
+++ b/i386/i386/cpu_number.h
@@ -30,6 +30,8 @@
 #ifndef _I386_CPU_NUMBER_H_
 #define _I386_CPU_NUMBER_H_
 
+#define MY(stm)%gs:PERCPU_##stm
+
 #if NCPUS > 1
 
 #ifdef __i386__
@@ -45,8 +47,8 @@
 shrl    $24, reg;\
 movl    %cs:CX(cpu_id_lut, reg), reg;\
 
-/* Never call CPU_NUMBER(%esi) */
-#define CPU_NUMBER(reg)\
+/* Never call CPU_NUMBER_NO_GS(%esi) */
+#define CPU_NUMBER_NO_GS(reg)  \
pushl   %esi;\
pushl   %eax;\
pushl   %ebx;\
@@ -63,20 +65,29 @@
 movl    %esi, reg   ;\
 popl    %esi;\
 
+#define CPU_NUMBER(reg) \
+   movl    MY(CPU_ID), reg;
+
 #ifndef __ASSEMBLER__
 #include 
 #include 
+#include 
 
-static inline int cpu_number(void)
+static inline int cpu_number_slow(void)
 {
return cpu_id_lut[apic_get_current_cpu()];
 }
 
+static inline int cpu_number(void)
+{
+   return percpu_get(int, cpu_id);
+}
 #endif
 
 #else  /* NCPUS == 1 */
 
 #define CPU_NUMBER_NO_STACK(reg)
+#define CPU_NUMBER_NO_GS(reg)
 #define CPU_NUMBER(reg)
 #define CX(addr,reg) addr
 
diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c
index fefe5e49..e1818683 100644
--- a/i386/i386/fpu.c
+++ b/i386/i386/fpu.c
@@ -119,7 +119,7 @@ init_fpu(void)
 #else  /* MACH_RING1 */
unsigned int native = 0;
 
-   if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486)
+   if (machine_slot[cpu_number_slow()].cpu_type >= CPU_TYPE_I486)
native = CR0_NE;
 
/*
diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c
index ddda603b..4edd3ec5 100644
--- a/i386/i386/gdt.c
+++ b/i386/i386/gdt.c
@@ -35,6 +35,8 @@
 
 #include 
 #include 
+#include 
+#include 
 
 #include "vm_param.h"
 #include "seg.h"
@@ -48,7 +50,7 @@ extern
 struct real_descriptor gdt[GDTSZ];
 
 static void
-gdt_fill(struct real_descriptor *mygdt)
+gdt_fill(int cpu, struct real_descriptor *mygdt)
 {
/* Initialize the kernel code and data segment descriptors.  */
 #ifdef __x86_64__
@@ -73,6 +75,16 @@ gdt_fill(struct real_descriptor *mygdt)
 0xffffffff,
ACC_PL_K|ACC_DATA_W, SZ_32);
 #endif /* MACH_PV_DESCRIPTORS */
+   vm_offset_t thiscpu = kvtolin(&percpu_array[cpu]);
+   _fill_gdt_descriptor(mygdt, PERCPU_DS,
+   thiscpu,
+   thiscpu + sizeof(struct percpu) - 1,
+#ifdef __x86_64__
+   ACC_PL_K|ACC_DATA_W, SZ_64
+#else
+   ACC_PL_K|ACC_DATA_W, SZ_32
+#endif
+   );
 #endif
 
 #ifdef MACH_PV_DESCRIPTORS
@@ -119,15 +131,16 @@ reload_segs(void)
 
 "movw  %w1,%%ds\n"
 "movw  %w1,%%es\n"
+"movw  %w3,%%gs\n"
 "movw  %w1,%%ss\n"
-: : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0));
+: : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0), "r" (PERCPU_DS));
 #endif
 }
 
 void
 gdt_init(void)