Re: [PATCH 4/4] sm501: Optimise 1 pixel 2d ops

2020-06-15 Thread BALATON Zoltan

On Mon, 15 Jun 2020, Peter Maydell wrote:

On Sat, 6 Jun 2020 at 20:22, BALATON Zoltan  wrote:


Some guests do 1x1 blits which is faster to do directly than calling a
function for it so avoid overhead in this case.


How much does the performance improve by ?


I haven't actually measured due to lack of time experimenting with it and 
those who I've asked to check it only reported it felt a bit faster but no 
numerical measurements so that does not prove anything. Anyway these 
special cases should not hurt and simple enough to have it here even if 
may not improve performance very much. The biggest loss is probably 
converting 16 bit screen on every display update to 32 bit because the 
guest driver does not allow higher bit depth than 16 for some reason. But 
I don't plan to spend more time with improving sm501, only done it because 
I had to touch it anyway. Longer term plan is to finish ati-vga which 
should have better guest drivers and better performance.



Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c | 40 +---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 3397ca9fbf..59693fbb5c 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -793,6 +793,25 @@ static void sm501_2d_operation(SM501State *s)
 src_x == dst_x && src_y == dst_y) {
 break;
 }
+/* Some clients also do 1 pixel blits, avoid overhead for these */
+if (width == 1 && height == 1) {
+unsigned int si = (src_x + src_y * src_pitch) * (1 << format);
+unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << format);
+switch (format) {
+case 0:
+s->local_mem[dst_base + di] = s->local_mem[src_base + si];
+break;
+case 1:
+*(uint16_t *)>local_mem[dst_base + di] =
+*(uint16_t *)>local_mem[src_base + si];
+break;
+case 2:
+*(uint32_t *)>local_mem[dst_base + di] =
+*(uint32_t *)>local_mem[src_base + si];
+break;
+}


You could write this more compactly as
  stn_he_p(>local_mem[dst_base + di], 1 << format,
   ldn_he_p(>local_mem[src_base + si], 1
<< format));

(which handles the length-cases for you and also doesn't rely on
casting a uint8_t* giving you something correctly aligned for a
wider access).


OK, I never know these and they are a bit hard to find because they are 
defined as glued together macros so grepping for it does not show the 
definition. Maybe adding a comment with the names where these are defined 
in bswap.h might help. stn_he_p seems to ultimately call __builtin_memcpy 
that probably does the same direct assignment for short values. I wonder 
how much overhead that has going through all the function calls but 
hopefully those are inlined.


Regards,
BALATON Zoltan


+break;
+}
 /* Check for overlaps, this could be made more exact */
 uint32_t sb, se, db, de;
 sb = src_base + src_x + src_y * (width + src_pitch);
@@ -841,9 +860,24 @@ static void sm501_2d_operation(SM501State *s)
 color = cpu_to_le16(color);
 }

-pixman_fill((uint32_t *)>local_mem[dst_base],
-dst_pitch * (1 << format) / sizeof(uint32_t),
-8 * (1 << format), dst_x, dst_y, width, height, color);
+if (width == 1 && height == 1) {
+unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
+switch (format) {
+case 0:
+s->local_mem[dst_base + i] = color & 0xff;
+break;
+case 1:
+*(uint16_t *)>local_mem[dst_base + i] = color & 0x;
+break;
+case 2:
+*(uint32_t *)>local_mem[dst_base + i] = color;
+break;
+}


  stn_he_p(>local_mem[dst_base + i], 1 << format, color);




Re: [PATCH 4/4] sm501: Optimise 1 pixel 2d ops

2020-06-15 Thread Peter Maydell
On Sat, 6 Jun 2020 at 20:22, BALATON Zoltan  wrote:
>
> Some guests do 1x1 blits which is faster to do directly than calling a
> function for it so avoid overhead in this case.

How much does the performance improve by ?

> Signed-off-by: BALATON Zoltan 
> ---
>  hw/display/sm501.c | 40 +---
>  1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/hw/display/sm501.c b/hw/display/sm501.c
> index 3397ca9fbf..59693fbb5c 100644
> --- a/hw/display/sm501.c
> +++ b/hw/display/sm501.c
> @@ -793,6 +793,25 @@ static void sm501_2d_operation(SM501State *s)
>  src_x == dst_x && src_y == dst_y) {
>  break;
>  }
> +/* Some clients also do 1 pixel blits, avoid overhead for these 
> */
> +if (width == 1 && height == 1) {
> +unsigned int si = (src_x + src_y * src_pitch) * (1 << 
> format);
> +unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << 
> format);
> +switch (format) {
> +case 0:
> +s->local_mem[dst_base + di] = s->local_mem[src_base + 
> si];
> +break;
> +case 1:
> +*(uint16_t *)>local_mem[dst_base + di] =
> +*(uint16_t *)>local_mem[src_base + 
> si];
> +break;
> +case 2:
> +*(uint32_t *)>local_mem[dst_base + di] =
> +*(uint32_t *)>local_mem[src_base + 
> si];
> +break;
> +}

You could write this more compactly as
   stn_he_p(>local_mem[dst_base + di], 1 << format,
ldn_he_p(>local_mem[src_base + si], 1
<< format));

(which handles the length-cases for you and also doesn't rely on
casting a uint8_t* giving you something correctly aligned for a
wider access).

> +break;
> +}
>  /* Check for overlaps, this could be made more exact */
>  uint32_t sb, se, db, de;
>  sb = src_base + src_x + src_y * (width + src_pitch);
> @@ -841,9 +860,24 @@ static void sm501_2d_operation(SM501State *s)
>  color = cpu_to_le16(color);
>  }
>
> -pixman_fill((uint32_t *)>local_mem[dst_base],
> -dst_pitch * (1 << format) / sizeof(uint32_t),
> -8 * (1 << format), dst_x, dst_y, width, height, color);
> +if (width == 1 && height == 1) {
> +unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
> +switch (format) {
> +case 0:
> +s->local_mem[dst_base + i] = color & 0xff;
> +break;
> +case 1:
> +*(uint16_t *)>local_mem[dst_base + i] = color & 0x;
> +break;
> +case 2:
> +*(uint32_t *)>local_mem[dst_base + i] = color;
> +break;
> +}

   stn_he_p(>local_mem[dst_base + i], 1 << format, color);


> +} else {
> +pixman_fill((uint32_t *)>local_mem[dst_base],
> +dst_pitch * (1 << format) / sizeof(uint32_t),
> +8 * (1 << format), dst_x, dst_y, width, height, 
> color);
> +}
>  break;
>  }
>  default:

thanks
-- PMM



[PATCH 4/4] sm501: Optimise 1 pixel 2d ops

2020-06-06 Thread BALATON Zoltan
Some guests do 1x1 blits which is faster to do directly than calling a
function for it so avoid overhead in this case.

Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c | 40 +---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 3397ca9fbf..59693fbb5c 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -793,6 +793,25 @@ static void sm501_2d_operation(SM501State *s)
 src_x == dst_x && src_y == dst_y) {
 break;
 }
+/* Some clients also do 1 pixel blits, avoid overhead for these */
+if (width == 1 && height == 1) {
+unsigned int si = (src_x + src_y * src_pitch) * (1 << format);
+unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << format);
+switch (format) {
+case 0:
+s->local_mem[dst_base + di] = s->local_mem[src_base + si];
+break;
+case 1:
+*(uint16_t *)>local_mem[dst_base + di] =
+*(uint16_t *)>local_mem[src_base + si];
+break;
+case 2:
+*(uint32_t *)>local_mem[dst_base + di] =
+*(uint32_t *)>local_mem[src_base + si];
+break;
+}
+break;
+}
 /* Check for overlaps, this could be made more exact */
 uint32_t sb, se, db, de;
 sb = src_base + src_x + src_y * (width + src_pitch);
@@ -841,9 +860,24 @@ static void sm501_2d_operation(SM501State *s)
 color = cpu_to_le16(color);
 }
 
-pixman_fill((uint32_t *)>local_mem[dst_base],
-dst_pitch * (1 << format) / sizeof(uint32_t),
-8 * (1 << format), dst_x, dst_y, width, height, color);
+if (width == 1 && height == 1) {
+unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
+switch (format) {
+case 0:
+s->local_mem[dst_base + i] = color & 0xff;
+break;
+case 1:
+*(uint16_t *)>local_mem[dst_base + i] = color & 0x;
+break;
+case 2:
+*(uint32_t *)>local_mem[dst_base + i] = color;
+break;
+}
+} else {
+pixman_fill((uint32_t *)>local_mem[dst_base],
+dst_pitch * (1 << format) / sizeof(uint32_t),
+8 * (1 << format), dst_x, dst_y, width, height, color);
+}
 break;
 }
 default:
-- 
2.21.3