On Sat, 6 Jun 2020 at 20:22, BALATON Zoltan <bala...@eik.bme.hu> wrote:
>
> Some guests do 1x1 blits which is faster to do directly than calling a
> function for it so avoid overhead in this case.

By how much does this improve performance?

> Signed-off-by: BALATON Zoltan <bala...@eik.bme.hu>
> ---
>  hw/display/sm501.c | 40 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/hw/display/sm501.c b/hw/display/sm501.c
> index 3397ca9fbf..59693fbb5c 100644
> --- a/hw/display/sm501.c
> +++ b/hw/display/sm501.c
> @@ -793,6 +793,25 @@ static void sm501_2d_operation(SM501State *s)
>                  src_x == dst_x && src_y == dst_y) {
>                  break;
>              }
> +            /* Some clients also do 1 pixel blits, avoid overhead for these */
> +            if (width == 1 && height == 1) {
> +                unsigned int si = (src_x + src_y * src_pitch) * (1 << format);
> +                unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << format);
> +                switch (format) {
> +                case 0:
> +                    s->local_mem[dst_base + di] = s->local_mem[src_base + si];
> +                    break;
> +                case 1:
> +                    *(uint16_t *)&s->local_mem[dst_base + di] =
> +                                    *(uint16_t *)&s->local_mem[src_base + si];
> +                    break;
> +                case 2:
> +                    *(uint32_t *)&s->local_mem[dst_base + di] =
> +                                    *(uint32_t *)&s->local_mem[src_base + si];
> +                    break;
> +                }

You could write this more compactly as
                   stn_he_p(&s->local_mem[dst_base + di], 1 << format,
                            ldn_he_p(&s->local_mem[src_base + si],
                                     1 << format));

(This handles each access width for you, and it also doesn't rely on
a cast from uint8_t * yielding a pointer that is correctly aligned
for the wider access.)

> +                break;
> +            }
>              /* Check for overlaps, this could be made more exact */
>              uint32_t sb, se, db, de;
>              sb = src_base + src_x + src_y * (width + src_pitch);
> @@ -841,9 +860,24 @@ static void sm501_2d_operation(SM501State *s)
>              color = cpu_to_le16(color);
>          }
>
> -        pixman_fill((uint32_t *)&s->local_mem[dst_base],
> -                    dst_pitch * (1 << format) / sizeof(uint32_t),
> -                    8 * (1 << format), dst_x, dst_y, width, height, color);
> +        if (width == 1 && height == 1) {
> +            unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
> +            switch (format) {
> +            case 0:
> +                s->local_mem[dst_base + i] = color & 0xff;
> +                break;
> +            case 1:
> +                *(uint16_t *)&s->local_mem[dst_base + i] = color & 0xffff;
> +                break;
> +            case 2:
> +                *(uint32_t *)&s->local_mem[dst_base + i] = color;
> +                break;
> +            }

Similarly, this switch could be replaced with:
               stn_he_p(&s->local_mem[dst_base + i], 1 << format, color);


> +        } else {
> +            pixman_fill((uint32_t *)&s->local_mem[dst_base],
> +                        dst_pitch * (1 << format) / sizeof(uint32_t),
> +                        8 * (1 << format), dst_x, dst_y, width, height, color);
> +        }
>          break;
>      }
>      default:

thanks
-- PMM

Reply via email to