Re: [Pixman] [PATCH 2/3] SSE2 optimizations for scaled over_8888_8888 with nearest filter

Siarhei Siamashka Thu, 09 Sep 2010 02:51:20 -0700

On Wednesday 08 September 2010 10:45:07 Siarhei Siamashka wrote:
> +/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
> +static force_inline void
> +scaled_nearest_scanline_sse2_8888_8888_none_OVER (uint32_t*       pd,
> +                                                  const uint32_t* ps,
> +                                                  int32_t         w,
> +                                                  pixman_fixed_t  vx,
> +                                                  pixman_fixed_t  unit_x,
> +                                                  pixman_fixed_t  max_vx)
> +{
> +    uint32_t s, d;
> +    const uint32_t* pm = NULL;
> +
> +    __m128i xmm_dst_lo, xmm_dst_hi;
> +    __m128i xmm_src_lo, xmm_src_hi;
> +    __m128i xmm_alpha_lo, xmm_alpha_hi;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w && ((unsigned long)pd & 15))
> +    {
> +     d = *pd;
> +     s = combine1 (ps + (vx >> 16), pm);
> +     vx += unit_x;
> +
> +     *pd++ = core_combine_over_u_pixel_sse2 (s, d);
> +     if (pm)
> +         pm++;
> +     w--;
> +    }
> +
> +    while (w >= 4)
> +    {
> +     __m128i tmp;
> +     uint32_t tmp1, tmp2, tmp3, tmp4;
> +
> +     tmp1 = ps[vx >> 16];
> +     vx += unit_x;
> +     tmp2 = ps[vx >> 16];
> +     vx += unit_x;
> +     tmp3 = ps[vx >> 16];
> +     vx += unit_x;
> +     tmp4 = ps[vx >> 16];
> +     vx += unit_x;
> +
> +     tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
> +
> +     xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
> +
> +     if (is_opaque (xmm_src_hi))
> +     {
> +         save_128_aligned ((__m128i*)pd, xmm_src_hi);
> +     }
> +     else if (!is_zero (xmm_src_hi))
> +     {
> +         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
> +
> +         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
> +         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
> +
> +         expand_alpha_2x128 (
> +             xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
> +
> +         over_2x128 (&xmm_src_lo, &xmm_src_hi,
> +                     &xmm_alpha_lo, &xmm_alpha_hi,
> +                     &xmm_dst_lo, &xmm_dst_hi);
> +
> +         /* rebuid the 4 pixel data and save*/
> +         save_128_aligned ((__m128i*)pd,
> +                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
> +     }
> +
> +     w -= 4;
> +     pd += 4;
> +     if (pm)
> +         pm += 4;
> +    }
> +
> +    while (w)
> +    {
> +     d = *pd;
> +     s = combine1 (ps + (vx >> 16), pm);
> +     vx += unit_x;
> +
> +     *pd++ = core_combine_over_u_pixel_sse2 (s, d);
> +     if (pm)
> +         pm++;
> +
> +     w--;
> +    }
> +}


Actually there is a problem here which I discovered after also trying the
patch on a 32-bit x86 system. The floating point registers may become corrupted
unless _mm_empty() is added somewhere at the end of this scaling fast path
code.

I have sent a patch which can help to detect such issues automatically by just
running pixman test suite (maybe not totally bulletproof, but still quite
useful):
http://lists.freedesktop.org/archives/pixman/2010-September/000486.html

-- 
Best regards,
Siarhei Siamashka
_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman

Re: [Pixman] [PATCH 2/3] SSE2 optimizations for scaled over_8888_8888 with nearest filter

Reply via email to