01: Add neon for upscaling and map routines in evas.

Yury Usischev - Enlightenment Git Fri, 02 Aug 2013 02:52:59 -0700

raster pushed a commit to branch master.

commit bd6de4ba8c9711c1c010a3b1b311738d248c26ce
Author: Yury Usischev <[email protected]>
Date:   Fri Aug 2 18:06:55 2013 +0900


    Add neon for upscaling and map routines in evas.
---
 AUTHORS                                           |  1 +
 ChangeLog                                         |  4 +
 NEWS                                              |  1 +
 src/lib/evas/common/evas_map_image_core.c         |  3 +
 src/lib/evas/common/evas_map_image_loop.c         | 90 +++++++++++++++++++++--
 src/lib/evas/common/evas_scale_smooth.c           | 44 ++++++++++-
 src/lib/evas/common/evas_scale_smooth_scaler_up.c | 26 +++++++
 src/lib/evas/include/evas_blend_ops.h             | 58 +++++++++++++++
 8 files changed, 220 insertions(+), 7 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index a42f1e4..e0e0cec 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -125,6 +125,7 @@ Patryk Kaczmarek <[email protected]>
 Zbigniew Kosinski <[email protected]>
 Paulo Cavalcanti <[email protected]>
 Jean-Philippe Andre <[email protected]>
+Yury Usischev <[email protected]>
 
 
 Ecore
diff --git a/ChangeLog b/ChangeLog
index 4cd2a4f..d45dab1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-02  Yury Usischev
+
+        * Add neon optimizations for several scaling/map routines in evas
+
 2013-08-02  Cedric Bail
 
         * Evas: change mapping policy for image loader (RANDOM during header,
diff --git a/NEWS b/NEWS
index bbbdc06..243bf6d 100644
--- a/NEWS
+++ b/NEWS
@@ -201,6 +201,7 @@ Improvements:
      - Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
      - Optimized path for when map use the same color for all corner.
      - Asynchronous preload of GL texture.
+     - Add neon assembly for upscaling and map routines
     * Ecore_Con:
      - Rebase dns.c against upstream
     * Edje:
diff --git a/src/lib/evas/common/evas_map_image_core.c b/src/lib/evas/common/evas_map_image_core.c
index 7e44c4b..6e2be0e 100644
--- a/src/lib/evas/common/evas_map_image_core.c
+++ b/src/lib/evas/common/evas_map_image_core.c
@@ -19,6 +19,9 @@
 #ifdef SCALE_USING_MMX
              pxor_r2r(mm0, mm0);
              MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+             FPU_NEON; // emits the ".fpu neon" assembler directive
+             VMOV_I2R_NEON(q2, #255); // q2: 255 in every 16-bit lane, passed as reg255 to INTERP_256_NEON / MUL4_SYM_NEON
 #endif
                
              line = &(spans[y - ystart]);
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fc32286..a8a49eb 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -1,13 +1,27 @@
 #ifdef SMOOTH
 {
 # ifdef SCALE_USING_MMX
-#   ifdef COLMUL
-#    ifdef COLSAME
+#  ifdef COLMUL
+#   ifdef COLSAME
    MOV_P2R(c1, mm7, mm0); // col
-#    endif   
 #   endif   
 #  endif   
-   while (ww > 0)
+# endif
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // hoisted out of the loop: c1 and c2 are constants in the cycle
+   FPU_NEON;
+   VMOV_M2R_NEON(d18, c1); // low 32 bits of d18 = c1
+   VEOR_NEON(q8); // q8 = 0, supplies the zero bytes for the zip below
+   VMOV_M2R_NEON(d19, c2); // low 32 bits of d19 = c2
+   VZIP_NEON(q9, q8); // interleave with zero bytes: d18 = widened c1, d16 = widened c2
+   VMOV_R2R_NEON(d19, d16); // d19 = d16 (vmov writes its first operand)
+   // q9 now holds c1 (d18) and c2 (d19), each byte widened to a 16-bit lane
+#   endif
+#  endif
+# endif
+     while (ww > 0)
      {
 # ifdef COLBLACK
         *d = 0xff000000; // col
@@ -77,6 +91,41 @@
 #    endif        
 #   endif                            
         MOV_R2P(mm1, *d, mm0);
+#  elif defined SCALE_USING_NEON
+        // not sure if we need this condition, but it doesn't affect the result
+        if (val1 | val2 | val3 | val4)
+          {
+            FPU_NEON;
+#   ifdef COLMUL
+            VDUP_NEON(d15, cv >> 16); // blend factor for the interpolation of c1 and c2
+            VMOV_R2R_NEON(q6, q9); // work on a copy of c1 and c2 as the algorithm will overwrite it
+#   endif
+            VMOV_M2R_NEON(d8, val1);
+            VEOR_NEON(q0);
+            VMOV_M2R_NEON(d9, val3);
+            VMOV_M2R_NEON(d10, val2);
+            VEOR_NEON(q1);
+            VMOV_M2R_NEON(d11, val4);
+            VDUP_NEON(q3, ru);
+            VDUP_NEON(d14, rv);
+            VZIP_NEON(q4, q0);
+            VZIP_NEON(q5, q1);
+            VMOV_R2R_NEON(d9, d0);
+            VMOV_R2R_NEON(d11, d2);
+            // by this point we have all required data in right registers
+            INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
+            VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
+            INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
+#   ifdef COLMUL
+            MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
+#   endif
+            VMOV_R2M_NEON(q4, d8, d); // save result to d
+          }
+        else
+          *d = val1;
+#   ifdef COLMUL
+        cv += cd; // col: advance once per pixel, also when the all-zero shortcut above was taken
+#   endif
 #  else
         val1 = INTERP_256(ru, val2, val1);
         val3 = INTERP_256(ru, val4, val3);
@@ -102,10 +151,23 @@
 }
 #else
 {
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constants inside the cycle, so widen them once: q5 = c1 (d10), c2 (d11)
+   FPU_NEON;
+   VMOV_M2R_NEON(d10, c1); // low 32 bits of d10 = c1
+   VEOR_NEON(q0); // q0 = 0, supplies the zero bytes for the zip below
+   VMOV_M2R_NEON(d11, c2); // low 32 bits of d11 = c2
+   VZIP_NEON(q5, q0); // interleave with zero bytes: d10 = widened c1, d0 = widened c2
+   VMOV_R2R_NEON(d11, d0); // d11 = d0 (vmov writes its first operand)
+#   endif
+#  endif
+# endif
    while (ww > 0)
      {
 # ifdef COLMUL
-#  ifndef COLBLACK        
+#  ifndef COLBLACK
         DATA32 val1;
 #   ifdef COLSAME
 #   else        
@@ -121,11 +183,28 @@
 #  ifdef COLMUL
         val1 = *s; // col
 #   ifdef COLSAME
+#    ifdef SCALE_USING_NEON
+        *d = MUL4_SYM(c1, val1); // XXX: do this in neon
+#    else
         *d = MUL4_SYM(c1, val1);
-#   else        
+#    endif
+#   else
+#    ifdef SCALE_USING_NEON
+        FPU_NEON;
+        VMOV_M2R_NEON(d12, val1);
+        VMOV_R2R_NEON(q4, q5); // work on a copy of c1 (d8) and c2 (d9): the interpolation overwrites both
+        VEOR_NEON(q1);
+        VDUP_NEON(d15, cv >> 16); // blend factor between c1 and c2 for this pixel
+        cv += cd; // col: advance once per pixel, exactly as the C path below does
+        VZIP_NEON(q6, q1); // widen the bytes of val1 to 16-bit lanes in d12
+        INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
+        MUL4_SYM_NEON(d8, d12, d4); // multiply
+        VMOV_R2M_NEON(q4, d8, d); // save result
+#    else
         cval = INTERP_256((cv >> 16), c2, c1); // col
         *d = MUL4_SYM(cval, val1);
         cv += cd; // col              
+#    endif
 #   endif        
 #  else
         *d = *s;
diff --git a/src/lib/evas/common/evas_scale_smooth.c b/src/lib/evas/common/evas_scale_smooth.c
index 02dbe7d..61bda22b 100644
--- a/src/lib/evas/common/evas_scale_smooth.c
+++ b/src/lib/evas/common/evas_scale_smooth.c
@@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
 # include "evas_scale_smooth_scaler.c"
 #endif
 
+#ifdef BUILD_NEON
+# undef SCALE_FUNC
+# undef SCALE_USING_NEON
+# define SCALE_USING_NEON
+# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
+# include "evas_scale_smooth_scaler.c" /* same scaler source included again, this time with the NEON name and SCALE_USING_NEON set */
+# undef SCALE_USING_NEON
+#endif
+
 #undef SCALE_FUNC
 #define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
 #undef SCALE_USING_MMX
@@ -197,6 +206,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
    else
 #endif
+#ifdef BUILD_NEON
+     if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) // use the NEON scaler only when the CPU reports the feature
+       cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
+   else // binds to the C callback assignment that follows the #endif
+#endif
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
 
    return evas_common_scale_rgba_in_to_out_clip_cb(src, dst, dc,
@@ -223,6 +237,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
    else
 #endif
+#ifdef BUILD_NEON
+     if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+       _evas_common_scale_rgba_in_to_out_clip_smooth_neon // NOTE(review): the NEON SCALE_FUNC in this file is defined without the leading underscore - confirm this symbol exists
+     (src, dst,
+         dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
+         mul_col, render_op,
+         src_region_x, src_region_y, src_region_w, src_region_h,
+         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
+   else // binds to the C scaler call that follows the #endif
+#endif
      _evas_common_scale_rgba_in_to_out_clip_smooth_c
        (src, dst,
         dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
@@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                               dst_region_w, dst_region_h);
        else
 # endif
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+        else
+#endif
          evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
@@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                               dst_region_w, dst_region_h);
        else
 # endif
-         evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+        else
+#endif
+            evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
                                                          dst_region_x, dst_region_y,
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index e43e0c7..4b21d59 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
@@ -172,6 +172,10 @@
            MOV_A2R(ay, mm4)
            pxor_r2r(mm0, mm0);
            MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+           FPU_NEON; /* emits the ".fpu neon" assembler directive */
+           VDUP_NEON(d12, ay); /* ay in every 16-bit lane of d12; read by the second INTERP_256_NEON of each pixel */
+           VMOV_I2R_NEON(q2, #255); /* q2: 255 in every 16-bit lane, passed as reg255 to INTERP_256_NEON */
 #endif
            pbuf = buf;  pbuf_end = buf + dst_clip_w;
            sxx = sxx0;
@@ -210,6 +214,28 @@
                INTERP_256_R2R(mm4, mm2, mm1, mm5)
                MOV_R2P(mm1, *pbuf, mm0)
                pbuf++;
+#elif defined SCALE_USING_NEON
+               if (p0 | p1 | p2 | p3)
+                 {
+                   FPU_NEON;
+                   VMOV_M2R_NEON(d8, p0);
+                   VEOR_NEON(q0); /* q0 = 0, zero bytes for widening q4 */
+                   VMOV_M2R_NEON(d9, p2);
+                   VMOV_M2R_NEON(d10, p1);
+                   VEOR_NEON(q1); /* q1 = 0, zero bytes for widening q5 */
+                   VMOV_M2R_NEON(d11, p3);
+                   VDUP_NEON(q3, ax); /* ax in every 16-bit lane of q3 */
+                   VZIP_NEON(q4, q0); /* d8 = widened p0, d0 = widened p2 */
+                   VZIP_NEON(q5, q1); /* d10 = widened p1, d2 = widened p3 */
+                   VMOV_R2R_NEON(d9, d0); /* q4 = widened p0 (d8), p2 (d9) */
+                   VMOV_R2R_NEON(d11, d2); /* q5 = widened p1 (d10), p3 (d11) */
+                   INTERP_256_NEON(q3, q5, q4, q2); /* d8 = p0/p1 blended by ax, d9 = p2/p3 blended by ax */
+                   INTERP_256_NEON(d12, d9, d8, d5); /* d8 = the two intermediate results blended by ay (d12) */
+                   VMOV_R2M_NEON(q4, d8, pbuf); /* narrow the lanes to bytes and store one 32-bit pixel at pbuf */
+                   pbuf++;
+                 }
+               else /* all four samples are 0, so the output pixel is 0 too */
+                 *pbuf++ = p0;
 #else
                if (p0 | p1)
                  p0 = INTERP_256(ax, p1, p0);
diff --git a/src/lib/evas/include/evas_blend_ops.h b/src/lib/evas/include/evas_blend_ops.h
index 0a78843..3ae9437 100644
--- a/src/lib/evas/include/evas_blend_ops.h
+++ b/src/lib/evas/include/evas_blend_ops.h
@@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
 
 #endif
 
+/* some useful NEON macros */
+
+#ifdef BUILD_NEON
+#define FPU_NEON \
+       __asm__ __volatile__(".fpu neon \n\t");
+
+/* copy reg2 to reg1 (vmov writes its first operand) */
+#define VMOV_R2R_NEON(reg1, reg2) \
+       __asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
+
+/* copy 32bit value to lower bits of register reg */
+#define VMOV_M2R_NEON(reg, value) \
+       __asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* save 32bit value from lower 64 bits of register regq to memory location */
+/* pointed to by pointer, using 64bit register regd as temporary location */
+#define VMOV_R2M_NEON(regq, regd, pointer) \
+       __asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
+                            "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
+
+/* set every 16-bit lane of register reg to the constant imm */
+#define VMOV_I2R_NEON(reg, imm) \
+       __asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
+
+/* set every 16-bit lane of register reg to the low 16 bits of value */
+#define VDUP_NEON(reg, value) \
+       __asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* interleave the bytes of reg1 and reg2 (both registers are rewritten) */
+#define VZIP_NEON(reg1, reg2) \
+       __asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* swap contents of two registers */
+#define VSWP_NEON(reg1, reg2) \
+       __asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* set register to zero */
+#define VEOR_NEON(reg) \
+       __asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
+
+/* per 16-bit lane: regy = (regy + (((regx - regy) * rega) >> 8)) & reg255; regx is overwritten */
+#define INTERP_256_NEON(rega, regx, regy, reg255) \
+       __asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
+                            "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                            "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vand " #regy ", " #regx ", " #reg255 " \n\t" \
+                            ::: #regx, #regy );
+
+/* per 16-bit lane: regx = ((regx * regy + reg255) >> 8) & reg255; callers pass 255 lanes as reg255 */
+#define MUL4_SYM_NEON(regx, regy, reg255) \
+       __asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
+                            "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                            "vand " #regx ", " #regx ", " #reg255 " \n\t" \
+                            ::: #regx );
+
+#endif
 
 /* some useful SSE3 inline functions */
 

-- 

------------------------------------------------------------------------------
Get your SQL database under version control now!
Version control is standard for application code, but databases haven't
caught up. So what steps can you take to put your SQL databases under 
version control? Why should you start doing it? Read more to find out.
http://pubads.g.doubleclick.net/gampad/clk?id=49501711&iu=/4140/ostg.clktrk

[EGIT] [core/efl] master 01/01: Add neon for upscaling and map routines in evas.

Reply via email to