Re: [PATCH v3 gnumach] percpu area using gs segment

2023-09-26 Thread Richard Braun
On Sun, Sep 24, 2023 at 12:31:20AM +0200, Samuel Thibault wrote:
> For an inline to actually get optimized, you have to put it in the .h
> file, otherwise the compiler won't be able to inline it at all since it
> won't have the definition when compiling other .c files.

It could be a good thing to add whole program optimizations like LTOs,
although this requires the code quality to be excellent, but that should
eventually be the case anyway.

-- 
Richard Braun



Re: [PATCH v3 gnumach] percpu area using gs segment

2023-09-23 Thread Samuel Thibault
Damien Zammit, le sam. 23 sept. 2023 10:57:08 +, a ecrit:
> diff --git a/i386/i386/cpu_number.c b/i386/i386/cpu_number.c
> index ef19e11f..83c86d82 100644
> --- a/i386/i386/cpu_number.c
> +++ b/i386/i386/cpu_number.c
> @@ -20,11 +20,17 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
> 
>  #if NCPUS > 1
> -int cpu_number(void)
> +inline int cpu_number_slow(void)
>  {
> return cpu_id_lut[apic_get_current_cpu()];
>  }
> +
> +inline int cpu_number(void)
> +{
> +   return percpu_get(int, cpu_id);
> +}
>  #endif

For an inline to actually get optimized, you have to put it in the .h
file, otherwise the compiler won't be able to inline it at all since it
won't have the definition when compiling other .c files.



> diff --git a/i386/i386/gdt.h b/i386/i386/gdt.h
> index 5def73cb..c7da012a 100644
> --- a/i386/i386/gdt.h
> +++ b/i386/i386/gdt.h
> @@ -77,11 +77,11 @@
>  
>  /*   0x58   used by user TSS in 64bit mode */
>  
> +#ifndef __ASSEMBLER__
>  
>  extern struct real_descriptor gdt[GDTSZ];
>  
> @@ -117,4 +117,5 @@ extern struct real_descriptor gdt[GDTSZ];
>  extern void gdt_init(void);
>  extern void ap_gdt_init(int cpu);
>  
> +#endif /* __ASSEMBLER__ */
>  #endif /* _I386_GDT_ */
> diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
> index 436e296a..b19dfe7c 100644
> --- a/i386/i386/i386asm.sym
> +++ b/i386/i386/i386asm.sym
> @@ -154,17 +156,11 @@ exprNPTES   
> PTES_PER_PAGE
>  expr INTEL_PTE_VALID|INTEL_PTE_WRITE INTEL_PTE_KERNEL
>  
>  expr IDTSZ
> -expr GDTSZ
> -expr LDTSZ
>  
>  expr KERNEL_RING
>  
>  expr KERNEL_CS
>  expr KERNEL_DS
> -expr KERNEL_TSS
> -#ifndef  MACH_PV_DESCRIPTORS
> -expr KERNEL_LDT
> -#endif   /* MACH_PV_DESCRIPTORS */
>  
>  expr (VM_MIN_KERNEL_ADDRESS>>PDESHIFT)*sizeof(pt_entry_t)KERNELBASEPDE
>  
> diff --git a/i386/i386/locore.S b/i386/i386/locore.S
> index 55aa9d60..870db785 100644
> --- a/i386/i386/locore.S
> +++ b/i386/i386/locore.S
> @@ -33,6 +33,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 

As I wrote before, please put these into a separate commit, so that
people looking at the commit that does actually change behavior doesn't
contain unrelated changes that are just cleanups.



> diff --git a/i386/i386/percpu.c b/i386/i386/percpu.c
> new file mode 100644
> index ..b2b8afa7
> --- /dev/null
> +++ b/i386/i386/percpu.c
> +#include 
> +#include 
> +#include 
> +
> +struct percpu percpu_array[NCPUS] __aligned(0x8000) = {0};

Why aligning?


Samuel



[PATCH v3 gnumach] percpu area using gs segment

2023-09-23 Thread Damien Zammit
This speeds up smp again, by storing the struct processor
in a percpu area and avoiding an expensive cpu_number every call
of current_processor(), as well as getting the cpu_number by
an offset into the percpu area.  Untested on 64 bit
and work remains to use other percpu arrays.

TESTED: (NCPUS=8) -smp 1 boots to login shell ~2x slower than uniprocessor
TESTED: (NCPUS=8) -smp 4 boots to INIT but hangs there
TESTED: (NCPUS=1) uniprocessor is a bit faster than normal
---
 i386/Makefrag.am|  2 +
 i386/i386/cpu_number.c  |  8 +++-
 i386/i386/cpu_number.h  | 13 --
 i386/i386/fpu.c |  2 +-
 i386/i386/gdt.c | 20 --
 i386/i386/gdt.h | 11 +++---
 i386/i386/i386asm.sym   |  8 +---
 i386/i386/locore.S  | 21 ++
 i386/i386/mp_desc.c |  3 +-
 i386/i386/percpu.c  | 30 ++
 i386/i386/percpu.h  | 88 +
 i386/i386/pit.c |  2 +-
 i386/i386/spl.S | 16 
 i386/i386at/model_dep.c |  1 +
 kern/cpu_number.h   |  3 +-
 kern/processor.c|  7 +---
 kern/processor.h| 18 +++--
 kern/startup.c  |  3 +-
 x86_64/Makefrag.am  |  2 +
 x86_64/locore.S |  8 ++--
 20 files changed, 206 insertions(+), 60 deletions(-)
 create mode 100644 i386/i386/percpu.c
 create mode 100644 i386/i386/percpu.h

diff --git a/i386/Makefrag.am b/i386/Makefrag.am
index 274e8695..c1724cea 100644
--- a/i386/Makefrag.am
+++ b/i386/Makefrag.am
@@ -108,6 +108,8 @@ libkernel_a_SOURCES += \
i386/i386/irq.c \
i386/i386/irq.h \
i386/i386/msr.h \
+   i386/i386/percpu.c \
+   i386/i386/percpu.h \
i386/i386/pit.c \
i386/i386/pit.h
 
diff --git a/i386/i386/cpu_number.c b/i386/i386/cpu_number.c
index ef19e11f..83c86d82 100644
--- a/i386/i386/cpu_number.c
+++ b/i386/i386/cpu_number.c
@@ -20,11 +20,17 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #if NCPUS > 1
-int cpu_number(void)
+inline int cpu_number_slow(void)
 {
return cpu_id_lut[apic_get_current_cpu()];
 }
+
+inline int cpu_number(void)
+{
+   return percpu_get(int, cpu_id);
+}
 #endif
diff --git a/i386/i386/cpu_number.h b/i386/i386/cpu_number.h
index 479a847a..b50ae275 100644
--- a/i386/i386/cpu_number.h
+++ b/i386/i386/cpu_number.h
@@ -30,6 +30,8 @@
 #ifndef_I386_CPU_NUMBER_H_
 #define_I386_CPU_NUMBER_H_
 
+#define MY(stm)%gs:PERCPU_##stm
+
 #ifNCPUS > 1
 
 #ifdef __i386__
@@ -45,8 +47,8 @@
shrl$24, reg;\
movl%cs:CX(cpu_id_lut, reg), reg;\
 
-/* Never call CPU_NUMBER(%esi) */
-#define CPU_NUMBER(reg)\
+/* Never call CPU_NUMBER_NO_GS(%esi) */
+#define CPU_NUMBER_NO_GS(reg)  \
pushl   %esi;\
pushl   %eax;\
pushl   %ebx;\
@@ -63,14 +65,19 @@
movl%esi, reg   ;\
popl%esi;\
 
+#define CPU_NUMBER(reg)\
+   movlMY(CPU_ID), reg;
+
 #ifndef __ASSEMBLER__
 #include "kern/cpu_number.h"
-int cpu_number(void);
+inline int cpu_number_slow(void);
+inline int cpu_number(void);
 #endif
 
 #else  /* NCPUS == 1 */
 
 #defineCPU_NUMBER_NO_STACK(reg)
+#defineCPU_NUMBER_NO_GS(reg)
 #defineCPU_NUMBER(reg)
 #defineCX(addr,reg)addr
 
diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c
index fefe5e49..e1818683 100644
--- a/i386/i386/fpu.c
+++ b/i386/i386/fpu.c
@@ -119,7 +119,7 @@ init_fpu(void)
 #else  /* MACH_RING1 */
unsigned int native = 0;
 
-   if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486)
+   if (machine_slot[cpu_number_slow()].cpu_type >= CPU_TYPE_I486)
native = CR0_NE;
 
/*
diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c
index ddda603b..524b1e85 100644
--- a/i386/i386/gdt.c
+++ b/i386/i386/gdt.c
@@ -35,6 +35,7 @@
 
 #include 
 #include 
+#include 
 
 #include "vm_param.h"
 #include "seg.h"
@@ -48,7 +49,7 @@ extern
 struct real_descriptor gdt[GDTSZ];
 
 static void
-gdt_fill(struct real_descriptor *mygdt)
+gdt_fill(int cpu, struct real_descriptor *mygdt)
 {
/* Initialize the kernel code and data segment descriptors.  */
 #ifdef __x86_64__
@@ -73,6 +74,16 @@ gdt_fill(struct real_descriptor *mygdt)
0x,
ACC_PL_K|ACC_DATA_W, SZ_32);
 #endif /* MACH_PV_DESCRIPTORS */
+   vm_offset_t thiscpu = kvtolin(&percpu_array[cpu]);
+   _fill_gdt_descriptor(mygdt, PERCPU_DS,
+   thiscpu,
+   thiscpu + sizeof(struct percpu) - 1,
+#ifdef __x86_64__
+   ACC_PL_K|ACC_DATA_W, SZ_64
+#else
+   ACC_PL_K|ACC_DATA_W, SZ_32
+#endif
+   );
 #endif
 
 #ifdef MACH_PV_DESCRIPTORS
@@ -119,15 +130,16 @@ reload_segs(void)
 
 "movw  %w1,%%ds\n"
 "movw  %w1,%%es\n"
+