Re: [PATCH 05/10] powerpc/mm/slice: implement a slice mask cache

2018-03-06 Thread Nicholas Piggin
On Tue, 6 Mar 2018 14:49:57 +0100
Christophe LEROY  wrote:

> On 06/03/2018 at 14:25, Nicholas Piggin wrote:

> > @@ -201,6 +206,15 @@ typedef struct {
> > unsigned char low_slices_psize[SLICE_ARRAY_SIZE];
> > unsigned char high_slices_psize[0];
> > unsigned long slb_addr_limit;
> > +# ifdef CONFIG_PPC_16K_PAGES
> > +   struct slice_mask mask_16k;
> > +# else
> > +   struct slice_mask mask_4k;
> > +# endif  
> 
> Could we just call it mask_base or something like that regardless of the 
> standard page size?

[...]

> > +#elif defined(CONFIG_PPC_8xx)
> > +static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
> > +{
> > +#ifdef CONFIG_PPC_16K_PAGES
> > +   if (psize == MMU_PAGE_16K)
> > +   return &mm->context.mask_16k;
> > +#else
> > +   if (psize == MMU_PAGE_4K)
> > +   return &mm->context.mask_4k;
> > +#endif  
> 
> What about the following instead:
> + if (psize == mmu_virtual_psize)
> + return &mm->context.mask_base;

Sure if you prefer. It should generate the same code, right?

Thanks,
Nick
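
For illustration, a minimal sketch of the 8xx helper with that rename applied. The mask_base name is the field proposed above (the posted patch uses mask_4k/mask_16k), and the hugetlb branches plus the BUG() fallthrough are assumptions based on the mask_512k/mask_8m fields the patch adds to mm_context_t, not quoted code:

static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
{
	/* The kernel's base page size (4K or 16K on 8xx) gets a single,
	 * generically named cache instead of mask_4k/mask_16k. */
	if (psize == mmu_virtual_psize)
		return &mm->context.mask_base;
#ifdef CONFIG_HUGETLB_PAGE
	if (psize == MMU_PAGE_512K)
		return &mm->context.mask_512k;
	if (psize == MMU_PAGE_8M)
		return &mm->context.mask_8m;
#endif
	BUG();	/* no cached mask for this page size */
}

Assuming mmu_virtual_psize resolves to a compile-time constant on 8xx, the comparison folds away and the object code should match the #ifdef variant, as Nick expects above.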


Re: [PATCH 05/10] powerpc/mm/slice: implement a slice mask cache

2018-03-06 Thread Christophe LEROY



On 06/03/2018 at 14:25, Nicholas Piggin wrote:

Calculating the slice mask can become a significant overhead for
get_unmapped_area. This patch adds a struct slice_mask for
each page size in the mm_context, and keeps these in sync with
the slices psize arrays and slb_addr_limit.

On Book3S/64 this adds 288 bytes to the mm_context_t for the
slice mask caches.

On POWER8, this increases vfork+exec+exit performance by 9.9%
and reduces time to mmap+munmap a 64kB page by 28%.

Reduces time to mmap+munmap by about 10% on 8xx.

Cc: Benjamin Herrenschmidt 
Cc: Anton Blanchard 
---
  arch/powerpc/include/asm/book3s/64/mmu.h |  18 +
  arch/powerpc/include/asm/mmu-8xx.h   |  14 
  arch/powerpc/mm/slice.c  | 118 ---
  3 files changed, 107 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index bef6e39ed63a..78579305 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -80,6 +80,16 @@ struct spinlock;
  /* Maximum possible number of NPUs in a system. */
  #define NV_MAX_NPUS 8
  
+/*
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ */
+struct slice_mask {
+   u64 low_slices;
+   DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
+};
+
  typedef struct {
mm_context_id_t id;
u16 user_psize; /* page size index */
@@ -95,6 +105,14 @@ typedef struct {
unsigned char low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE];
unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
unsigned long slb_addr_limit;
+# ifdef CONFIG_PPC_64K_PAGES
+   struct slice_mask mask_64k;
+# endif
+   struct slice_mask mask_4k;
+# ifdef CONFIG_HUGETLB_PAGE
+   struct slice_mask mask_16m;
+   struct slice_mask mask_16g;
+# endif
  #else
u16 sllp;   /* SLB page size encoding */
  #endif
diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h
index d3d7e79140c6..4c3b14703b3e 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -192,6 +192,11 @@
  #endif
  
  #ifndef __ASSEMBLY__
+struct slice_mask {
+   u64 low_slices;
+   DECLARE_BITMAP(high_slices, 0);
+};
+
  typedef struct {
unsigned int id;
unsigned int active;
@@ -201,6 +206,15 @@ typedef struct {
unsigned char low_slices_psize[SLICE_ARRAY_SIZE];
unsigned char high_slices_psize[0];
unsigned long slb_addr_limit;
+# ifdef CONFIG_PPC_16K_PAGES
+   struct slice_mask mask_16k;
+# else
+   struct slice_mask mask_4k;
+# endif


Could we just call it mask_base or something like that regardless of the 
standard page size?



+# ifdef CONFIG_HUGETLB_PAGE
+   struct slice_mask mask_512k;
+   struct slice_mask mask_8m;
+# endif
  #endif
  } mm_context_t;
  
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 233c42d593dc..2115efe5e869 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -37,15 +37,6 @@
  #include 
  
  static DEFINE_SPINLOCK(slice_convert_lock);

-/*
- * One bit per slice. We have lower slices which cover 256MB segments
- * upto 4G range. That gets us 16 low slices. For the rest we track slices
- * in 1TB size.
- */
-struct slice_mask {
-   u64 low_slices;
-   DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
-};
  
  #ifdef DEBUG
  int _slice_debug = 1;
@@ -149,37 +140,44 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
__set_bit(i, ret->high_slices);
  }
  
-static void slice_mask_for_size(struct mm_struct *mm, int psize,
-   struct slice_mask *ret,
-   unsigned long high_limit)
+#ifdef CONFIG_PPC_BOOK3S_64
+static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
  {
-   unsigned char *hpsizes, *lpsizes;
-   int index, mask_index;
-   unsigned long i;
-
-   ret->low_slices = 0;
-   if (SLICE_NUM_HIGH)
-   bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
-
-   lpsizes = mm->context.low_slices_psize;
-   for (i = 0; i < SLICE_NUM_LOW; i++) {
-   mask_index = i & 0x1;
-   index = i >> 1;
-   if (((lpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-   ret->low_slices |= 1u << i;
-   }
-
-   if (high_limit <= SLICE_LOW_TOP)
-   return;
-
-   hpsizes = mm->context.high_slices_psize;
-   for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) {
-   mask_index = i & 0x1;
-   index = i >> 1;
-   if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-   __set_bit(i, ret->high_slices);
-   }
+#ifdef CONFIG_PPC_64K_PAGES
+
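
The Book3S/64 body is cut off in the quote above. Going by the mask_64k, mask_4k, mask_16m and mask_16g fields added to mm_context_t earlier in the patch, the lookup plausibly continues along these lines (a sketch inferred from those fields, not the literal hunk):

#ifdef CONFIG_PPC_BOOK3S_64
static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
{
#ifdef CONFIG_PPC_64K_PAGES
	if (psize == MMU_PAGE_64K)
		return &mm->context.mask_64k;
#endif
	if (psize == MMU_PAGE_4K)
		return &mm->context.mask_4k;
#ifdef CONFIG_HUGETLB_PAGE
	if (psize == MMU_PAGE_16M)
		return &mm->context.mask_16m;
	if (psize == MMU_PAGE_16G)
		return &mm->context.mask_16g;
#endif
	BUG();	/* caller asked for a page size that has no cached mask */
}
#endif /* CONFIG_PPC_BOOK3S_64 */

Handing back a pointer into mm_context_t rather than recomputing the bitmap on each get_unmapped_area call is where the mmap/munmap and vfork+exec+exit improvements quoted in the changelog come from.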