---
Reduces runtime of firefox-planet-gnome trace from 156 to 153 seconds on 
Loongson.

Increases runtime of firefox-fishtank trace from 1030 to 1060 seconds. Why?

 pixman/pixman-mmx.c |   45 ++++++++++++++++++++++++++++++++-------------
 1 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index ea732bb..bff8585 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3526,11 +3526,14 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 }
 
 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
+#define BMSK (BSHIFT - 1)
 
 #define BILINEAR_DECLARE_VARIABLES                                             
\
     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);                         
\
     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);                         
\
     const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);     
\
+    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);                          
\
+    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);                    
\
     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);         
\
     const __m64 mm_zero = _mm_setzero_si64 ();                                 
\
     __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
@@ -3548,21 +3551,37 @@ do { \
     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);        
        \
     __m64 hi = _mm_add_pi16 (t_hi, b_hi);                                      
\
     __m64 lo = _mm_add_pi16 (t_lo, b_lo);                                      
\
-    /* calculate horizontal weights */                                         
\
-    __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,             
\
+    if (BILINEAR_INTERPOLATION_BITS < 8)                                       
\
+    {                                                                          
\
+       /* calculate horizontal weights */                                      
\
+       __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,           
\
+                         _mm_srli_pi16 (mm_x,                                  
\
+                                        16 - BILINEAR_INTERPOLATION_BITS)));   
\
+       mm_x = _mm_add_pi16 (mm_x, mm_ux);                                      
\
+       /* horizontal interpolation */                                          
\
+       __m64 p = _mm_unpacklo_pi16 (lo, hi);                                   
\
+       __m64 q = _mm_unpackhi_pi16 (lo, hi);                                   
\
+       lo = _mm_madd_pi16 (p, mm_wh);                                          
\
+       hi = _mm_madd_pi16 (q, mm_wh);                                          
\
+    }                                                                          
\
+    else                                                                       
\
+    {                                                                          
\
+       /* calculate horizontal weights */                                      
\
+       __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,          
\
                                        16 - BILINEAR_INTERPOLATION_BITS));     
\
-    __m64 mm_wh_hi = _mm_srli_pi16 (mm_x,                                      
\
+       __m64 mm_wh_hi = _mm_srli_pi16 (mm_x,                                   
\
                                        16 - BILINEAR_INTERPOLATION_BITS);      
\
-    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                         
\
-    /* horizontal interpolation */                                             
\
-    __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);                            
\
-    __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);                            
\
-    __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);                            
\
-    __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);                            
\
-    lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),                 
\
-                      _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));                 
\
-    hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),                 
\
-                      _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));                 
\
+       mm_x = _mm_add_pi16 (mm_x, mm_ux);                                      
\
+       /* horizontal interpolation */                                          
\
+       __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);                         
\
+       __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);                         
\
+       __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);                         
\
+       __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);                         
\
+       lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),              
\
+                          _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));             
\
+       hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),              
\
+                          _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));             
\
+    }                                                                          
\
     /* shift and pack the result */                                            
\
     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);                  
\
     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);                  
\
-- 
1.7.3.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman

Reply via email to