Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()

2022-02-18 Thread Thomas Zimmermann

Hi Sam

Am 18.02.22 um 11:14 schrieb Sam Ravnborg:

Hi Thomas,

On Thu, Feb 17, 2022 at 11:34:05AM +0100, Thomas Zimmermann wrote:

Improve the performance of sys_imageblit() by manually unrolling
the inner blitting loop and moving some invariants out. The compiler
failed to do this automatically. The resulting binary code was even
slower than the cfb_imageblit() helper, which uses the same algorithm,
but operates on I/O memory.


It would be super to have the same optimization done to cfb_imageblit(),
to prevent that the two codebases diverge more than necessary.
Also I think cfb_ version would also see a performance gain from this.


Yes, I can do that.



The actual implementation looks good.
So with or without the extra un-rolling the patch is:
Acked-by: Sam Ravnborg 

One small nit belwo.

Sam



A microbenchmark measures the average number of CPU cycles
for sys_imageblit() after a stabilizing period of a few minutes
(i7-4790, FullHD, simpledrm, kernel with debugging). The value
for CFB is given as a reference.

   sys_imageblit(), new: 25934 cycles
   sys_imageblit(), old: 35944 cycles
   cfb_imageblit():  30566 cycles

In the optimized case, sys_imageblit() is now ~30% faster than before
and ~20% faster than cfb_imageblit().

Signed-off-by: Thomas Zimmermann 
---
  drivers/video/fbdev/core/sysimgblt.c | 51 +---
  1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/video/fbdev/core/sysimgblt.c 
b/drivers/video/fbdev/core/sysimgblt.c
index a4d05b1b17d7..d70d65af6fcb 100644
--- a/drivers/video/fbdev/core/sysimgblt.c
+++ b/drivers/video/fbdev/core/sysimgblt.c
@@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, 
struct fb_info *p,
  {
u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
-   u32 bit_mask, end_mask, eorx, shift;
+   u32 bit_mask, eorx;
const char *s = image->data, *src;
u32 *dst;
-   const u32 *tab = NULL;
-   int i, j, k;
+   const u32 *tab;
+   size_t tablen;
+   u32 colortab[16];
+   int i, j, k, jdecr;
+
+   if ((uintptr_t)dst1 % 8)
+   return;

This check is new - and should not trigger ever. Maybe add an unlikely
and a WARN_ON_ONCE()?


I think I can remove this test. It was supposed to tell the compiler 
that dst1 is 8-aligned, but I don't think it worked.


Best regards
Thomas




  
  	switch (bpp) {

case 8:
tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
+   tablen = 16;
break;
case 16:
tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
+   tablen = 4;
break;
case 32:
-   default:
tab = cfb_tab32;
+   tablen = 2;
break;
+   default:
+   return;
}
  
  	for (i = ppw-1; i--; ) {

@@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, 
struct fb_info *p,
bit_mask = (1 << ppw) - 1;
eorx = fgx ^ bgx;
k = image->width/ppw;
+   jdecr = 8 / ppw;
+
+   for (i = 0; i < tablen; ++i)
+   colortab[i] = (tab[i] & eorx) ^ bgx;

This code could have been embedded with the switch (bpp) {
That would have made some sense I think.
But both ways works, so this was just a small observation.

Sam


--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev


OpenPGP_signature
Description: OpenPGP digital signature


Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()

2022-02-18 Thread Sam Ravnborg
Hi Thomas,

On Thu, Feb 17, 2022 at 11:34:05AM +0100, Thomas Zimmermann wrote:
> Improve the performance of sys_imageblit() by manually unrolling
> the inner blitting loop and moving some invariants out. The compiler
> failed to do this automatically. The resulting binary code was even
> slower than the cfb_imageblit() helper, which uses the same algorithm,
> but operates on I/O memory.

It would be super to have the same optimization done to cfb_imageblit(),
to prevent that the two codebases diverge more than necessary.
Also I think cfb_ version would also see a performance gain from this.

The actual implementation looks good.
So with or without the extra un-rolling the patch is:
Acked-by: Sam Ravnborg 

One small nit belwo.

Sam

> 
> A microbenchmark measures the average number of CPU cycles
> for sys_imageblit() after a stabilizing period of a few minutes
> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
> for CFB is given as a reference.
> 
>   sys_imageblit(), new: 25934 cycles
>   sys_imageblit(), old: 35944 cycles
>   cfb_imageblit():  30566 cycles
> 
> In the optimized case, sys_imageblit() is now ~30% faster than before
> and ~20% faster than cfb_imageblit().
> 
> Signed-off-by: Thomas Zimmermann 
> ---
>  drivers/video/fbdev/core/sysimgblt.c | 51 +---
>  1 file changed, 39 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/video/fbdev/core/sysimgblt.c 
> b/drivers/video/fbdev/core/sysimgblt.c
> index a4d05b1b17d7..d70d65af6fcb 100644
> --- a/drivers/video/fbdev/core/sysimgblt.c
> +++ b/drivers/video/fbdev/core/sysimgblt.c
> @@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image 
> *image, struct fb_info *p,
>  {
>   u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
>   u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
> - u32 bit_mask, end_mask, eorx, shift;
> + u32 bit_mask, eorx;
>   const char *s = image->data, *src;
>   u32 *dst;
> - const u32 *tab = NULL;
> - int i, j, k;
> + const u32 *tab;
> + size_t tablen;
> + u32 colortab[16];
> + int i, j, k, jdecr;
> +
> + if ((uintptr_t)dst1 % 8)
> + return;
This check is new - and should not trigger ever. Maybe add an unlikely
and a WARN_ON_ONCE()?


>  
>   switch (bpp) {
>   case 8:
>   tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
> + tablen = 16;
>   break;
>   case 16:
>   tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
> + tablen = 4;
>   break;
>   case 32:
> - default:
>   tab = cfb_tab32;
> + tablen = 2;
>   break;
> + default:
> + return;
>   }
>  
>   for (i = ppw-1; i--; ) {
> @@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image 
> *image, struct fb_info *p,
>   bit_mask = (1 << ppw) - 1;
>   eorx = fgx ^ bgx;
>   k = image->width/ppw;
> + jdecr = 8 / ppw;
> +
> + for (i = 0; i < tablen; ++i)
> + colortab[i] = (tab[i] & eorx) ^ bgx;
This code could have been embedded with the switch (bpp) {
That would have made some sense I think.
But both ways works, so this was just a small observation.

Sam


Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()

2022-02-18 Thread Javier Martinez Canillas
Hello Thomas,

On 2/17/22 11:34, Thomas Zimmermann wrote:
> Improve the performance of sys_imageblit() by manually unrolling
> the inner blitting loop and moving some invariants out. The compiler
> failed to do this automatically. The resulting binary code was even
> slower than the cfb_imageblit() helper, which uses the same algorithm,
> but operates on I/O memory.
> 
> A microbenchmark measures the average number of CPU cycles
> for sys_imageblit() after a stabilizing period of a few minutes
> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
> for CFB is given as a reference.
> 
>   sys_imageblit(), new: 25934 cycles
>   sys_imageblit(), old: 35944 cycles
>   cfb_imageblit():  30566 cycles
> 
> In the optimized case, sys_imageblit() is now ~30% faster than before
> and ~20% faster than cfb_imageblit().
> 
> Signed-off-by: Thomas Zimmermann 
> ---

This patch looks good to me as well.

Reviewed-by: Javier Martinez Canillas 

Best regards,
-- 
Javier Martinez Canillas
Linux Engineering
Red Hat



Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()

2022-02-17 Thread Thomas Zimmermann

Hi

Am 17.02.22 um 12:05 schrieb Gerd Hoffmann:

-   for (j = k; j--; ) {
-   shift -= ppw;
-   end_mask = tab[(*src >> shift) & bit_mask];
-   *dst++ = (end_mask & eorx) ^ bgx;
-   if (!shift) {
-   shift = 8;
-   src++;
+   for (j = k; j; j -= jdecr, ++src) {
+   switch (ppw) {
+   case 4: /* 8 bpp */
+   *dst++ = colortab[(*src >> 4) & bit_mask];
+   *dst++ = colortab[(*src >> 0) & bit_mask];
+   break;
+   case 2: /* 16 bpp */
+   *dst++ = colortab[(*src >> 6) & bit_mask];
+   *dst++ = colortab[(*src >> 4) & bit_mask];
+   *dst++ = colortab[(*src >> 2) & bit_mask];
+   *dst++ = colortab[(*src >> 0) & bit_mask];
+   break;
+   case 1: /* 32 bpp */
+   *dst++ = colortab[(*src >> 7) & bit_mask];
+   *dst++ = colortab[(*src >> 6) & bit_mask];
+   *dst++ = colortab[(*src >> 5) & bit_mask];
+   *dst++ = colortab[(*src >> 4) & bit_mask];
+   *dst++ = colortab[(*src >> 3) & bit_mask];
+   *dst++ = colortab[(*src >> 2) & bit_mask];
+   *dst++ = colortab[(*src >> 1) & bit_mask];
+   *dst++ = colortab[(*src >> 0) & bit_mask];
+   break;
}


How about moving the switch out of the loop, i.e.

switch (ppw) {
case 4:
 for (j = ...) {
 *dst++ = colortab[(*src >> 4) & bit_mask];
 *dst++ = colortab[(*src >> 0) & bit_mask];
 }
[ ... ]
}

?


No difference. Values for the microbenchmark (rdtsc around 
sys_imageblit()) and the directory listing stabilize at the same 
numbers.  I'll still go with you suggestion, because the code is more 
readable.


Best regards
Thomas



take care,
   Gerd



--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev


OpenPGP_signature
Description: OpenPGP digital signature


Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()

2022-02-17 Thread Gerd Hoffmann
> - for (j = k; j--; ) {
> - shift -= ppw;
> - end_mask = tab[(*src >> shift) & bit_mask];
> - *dst++ = (end_mask & eorx) ^ bgx;
> - if (!shift) {
> - shift = 8;
> - src++;
> + for (j = k; j; j -= jdecr, ++src) {
> + switch (ppw) {
> + case 4: /* 8 bpp */
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> + case 2: /* 16 bpp */
> + *dst++ = colortab[(*src >> 6) & bit_mask];
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 2) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> + case 1: /* 32 bpp */
> + *dst++ = colortab[(*src >> 7) & bit_mask];
> + *dst++ = colortab[(*src >> 6) & bit_mask];
> + *dst++ = colortab[(*src >> 5) & bit_mask];
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 3) & bit_mask];
> + *dst++ = colortab[(*src >> 2) & bit_mask];
> + *dst++ = colortab[(*src >> 1) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
>   }

How about moving the switch out of the loop, i.e.

switch (ppw) {
case 4:
for (j = ...) {
*dst++ = colortab[(*src >> 4) & bit_mask];
*dst++ = colortab[(*src >> 0) & bit_mask];
}
[ ... ]
}

?

take care,
  Gerd



[PATCH 2/2] fbdev: Improve performance of sys_imageblit()

2022-02-17 Thread Thomas Zimmermann
Improve the performance of sys_imageblit() by manually unrolling
the inner blitting loop and moving some invariants out. The compiler
failed to do this automatically. The resulting binary code was even
slower than the cfb_imageblit() helper, which uses the same algorithm,
but operates on I/O memory.

A microbenchmark measures the average number of CPU cycles
for sys_imageblit() after a stabilizing period of a few minutes
(i7-4790, FullHD, simpledrm, kernel with debugging). The value
for CFB is given as a reference.

  sys_imageblit(), new: 25934 cycles
  sys_imageblit(), old: 35944 cycles
  cfb_imageblit():  30566 cycles

In the optimized case, sys_imageblit() is now ~30% faster than before
and ~20% faster than cfb_imageblit().

Signed-off-by: Thomas Zimmermann 
---
 drivers/video/fbdev/core/sysimgblt.c | 51 +---
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/video/fbdev/core/sysimgblt.c 
b/drivers/video/fbdev/core/sysimgblt.c
index a4d05b1b17d7..d70d65af6fcb 100644
--- a/drivers/video/fbdev/core/sysimgblt.c
+++ b/drivers/video/fbdev/core/sysimgblt.c
@@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, 
struct fb_info *p,
 {
u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
-   u32 bit_mask, end_mask, eorx, shift;
+   u32 bit_mask, eorx;
const char *s = image->data, *src;
u32 *dst;
-   const u32 *tab = NULL;
-   int i, j, k;
+   const u32 *tab;
+   size_t tablen;
+   u32 colortab[16];
+   int i, j, k, jdecr;
+
+   if ((uintptr_t)dst1 % 8)
+   return;
 
switch (bpp) {
case 8:
tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
+   tablen = 16;
break;
case 16:
tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
+   tablen = 4;
break;
case 32:
-   default:
tab = cfb_tab32;
+   tablen = 2;
break;
+   default:
+   return;
}
 
for (i = ppw-1; i--; ) {
@@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, 
struct fb_info *p,
bit_mask = (1 << ppw) - 1;
eorx = fgx ^ bgx;
k = image->width/ppw;
+   jdecr = 8 / ppw;
+
+   for (i = 0; i < tablen; ++i)
+   colortab[i] = (tab[i] & eorx) ^ bgx;
 
for (i = image->height; i--; ) {
dst = dst1;
-   shift = 8;
src = s;
 
-   for (j = k; j--; ) {
-   shift -= ppw;
-   end_mask = tab[(*src >> shift) & bit_mask];
-   *dst++ = (end_mask & eorx) ^ bgx;
-   if (!shift) {
-   shift = 8;
-   src++;
+   for (j = k; j; j -= jdecr, ++src) {
+   switch (ppw) {
+   case 4: /* 8 bpp */
+   *dst++ = colortab[(*src >> 4) & bit_mask];
+   *dst++ = colortab[(*src >> 0) & bit_mask];
+   break;
+   case 2: /* 16 bpp */
+   *dst++ = colortab[(*src >> 6) & bit_mask];
+   *dst++ = colortab[(*src >> 4) & bit_mask];
+   *dst++ = colortab[(*src >> 2) & bit_mask];
+   *dst++ = colortab[(*src >> 0) & bit_mask];
+   break;
+   case 1: /* 32 bpp */
+   *dst++ = colortab[(*src >> 7) & bit_mask];
+   *dst++ = colortab[(*src >> 6) & bit_mask];
+   *dst++ = colortab[(*src >> 5) & bit_mask];
+   *dst++ = colortab[(*src >> 4) & bit_mask];
+   *dst++ = colortab[(*src >> 3) & bit_mask];
+   *dst++ = colortab[(*src >> 2) & bit_mask];
+   *dst++ = colortab[(*src >> 1) & bit_mask];
+   *dst++ = colortab[(*src >> 0) & bit_mask];
+   break;
}
}
dst1 += p->fix.line_length;
-- 
2.34.1