[Pixman] [test PATCH] Use _mm_maddubs_epi16 in BILINEAR_INTERPOLATE_ONE_PIXEL

2012-09-29 Thread Matt Turner
Siarhei, can you measure any performance improvement with this? I
can't... :(
---
 pixman/pixman-sse2.c |8 +++-
 1 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index efed310..4fbc045 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -32,6 +32,7 @@
 
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
+#include <tmmintrin.h> /* for SSSE3 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-inlines.h"
@@ -5414,7 +5415,7 @@ FAST_NEAREST_MAINLOOP_COMMON 
(sse2_8888_n_8888_normal_OVER,
 
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)
\
 do {   
\
-__m128i xmm_wh, xmm_lo, xmm_hi, a; 
\
+__m128i xmm_wh, a; 
\
 /* fetch 2x2 pixel block into sse2 registers */
\
 __m128i tltr = _mm_loadl_epi64 (   
\
(__m128i *)&src_top[pixman_fixed_to_int (vx)]); 
\
@@ -5443,10 +5444,7 @@ do { 
\
_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); 
\
xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);  
\
/* horizontal interpolation */  
\
-   xmm_lo = _mm_mullo_epi16 (a, xmm_wh);   
\
-   xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);   
\
-   a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), 
\
-  _mm_unpackhi_epi16 (xmm_lo, xmm_hi));
\
+   a = _mm_maddubs_epi16 (a, xmm_wh);  
\
 }  
\
 /* shift and pack the result */
\
 a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);   
\
-- 
1.7.8.6

___
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman


Re: [Pixman] [test PATCH] Use _mm_maddubs_epi16 in BILINEAR_INTERPOLATE_ONE_PIXEL

2012-09-29 Thread Siarhei Siamashka
On Sat, 29 Sep 2012 00:12:16 -0700
Matt Turner <matts...@gmail.com> wrote:

> Siarhei, can you measure any performance improvement with this? I
> can't... :(

I guess that's because you patched the code for the 8-bit
interpolation precision, and pixman is now using 7 bits by default.

But PMADDUBSW can only be used for the first step of interpolation
(vertical) and not the second one (horizontal), because the first step
does an 8-bit * 7-bit -> 15-bit multiplication, while the second step
does a wider 15-bit * 7-bit -> 22-bit multiplication.

The needed changes may look like this:

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index efed310..b260c95 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -32,6 +32,7 @@
 
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
+#include <tmmintrin.h> /* for SSSE3 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-inlines.h"
@@ -5401,15 +5402,14 @@ FAST_NEAREST_MAINLOOP_COMMON 
(sse2_8888_n_8888_normal_OVER,
 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
 
 #define BILINEAR_DECLARE_VARIABLES 
\
-const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); 
\
-const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); 
\
+const __m128i xmm_wtb = _mm_set_epi8 (wt, wb, wt, wb, wt, wb, wt, wb,  
\
+  wt, wb, wt, wb, wt, wb, wt, wb); 
\
 const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, 
BMSK);\
 const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);  
\
 const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, 
BMSK);\
 const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);  
\
 const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,  
\
  unit_x, unit_x, unit_x, unit_x);  
\
-const __m128i xmm_zero = _mm_setzero_si128 (); 
\
 __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
 
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)
\
@@ -5422,10 +5422,7 @@ do { 
\
(__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);  
\
 vx += unit_x;  
\
 /* vertical interpolation */   
\
-a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),
\
-   xmm_wt),
\
-  _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), 
\
-   xmm_wb));   
\
+a = _mm_maddubs_epi16 (_mm_unpacklo_epi8 (blbr, tltr), xmm_wtb);   
\
 if (BILINEAR_INTERPOLATION_BITS < 8)   
\
 {  
\
/* calculate horizontal weights */  
\


And I'm getting the following performance improvement on Core i7 860
when running lowlevel-blt-bench -b src_8888_8888:

before:
   src_8888_8888 =  L1: 318.11  L2: 314.48  M:311.16
after:
   src_8888_8888 =  L1: 356.75  L2: 352.18  M:348.76

That's just ~12% faster. The next step would be to try taking the
compiler out of the way and ensuring that no CPU cycles are wasted :)

-- 
Best regards,
Siarhei Siamashka
___
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman