cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c
commit 71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c Author: Yury Usishchev <[email protected]> Date: Wed Apr 15 17:21:33 2015 +0200 evas: enable NEON-optimized code for aarch64. Summary: Add a new define, BUILD_NEON_INTRINSICS, to control whether NEON inline assembly code or NEON intrinsics should be built. GCC NEON intrinsics can be built for both armv7 and armv8; however, NEON inline assembly code can be built only for armv7. @feature Reviewers: raster, stefan_schmidt, cedric Subscribers: cedric, stefan_schmidt Projects: #efl Differential Revision: https://phab.enlightenment.org/D2309 Signed-off-by: Cedric BAIL <[email protected]> --- configure.ac | 18 +++++++++ src/lib/evas/common/evas_blit_main.c | 8 ++++ src/lib/evas/common/evas_cpu.c | 9 +++++ .../common/evas_op_blend/op_blend_color_neon.c | 10 ++++- .../evas_op_blend/op_blend_mask_color_neon.c | 47 ++++++++++++++++++++++ .../evas_op_blend/op_blend_pixel_color_neon.c | 14 ++++++- .../common/evas_op_blend/op_blend_pixel_neon.c | 33 ++++++++++++++- .../evas/common/evas_op_copy/op_copy_color_neon.c | 9 +++++ 8 files changed, 145 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 9eed98c..63cc54d 100644 --- a/configure.ac +++ b/configure.ac @@ -576,6 +576,21 @@ case $host_cpu in CFLAGS="${CFLAGS_save}" fi ;; + aarch64*) + if test "x${want_neon}" = "xyes"; then + build_cpu_neon="yes" + AC_MSG_CHECKING([whether to use NEON instructions]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_neon.h>]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([BUILD_NEON], [1], [Build NEON Code]) + AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics]) + build_cpu_neon="yes" + ],[ + AC_MSG_RESULT([no]) + build_cpu_neon="no" + ]) + fi + ;; esac AC_SUBST([ALTIVEC_CFLAGS]) @@ -4741,6 +4756,9 @@ case $host_cpu in arm*) EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) ;; + aarch64*) + EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) + ;; esac if test "${have_linux}" = "yes"; 
then diff --git a/src/lib/evas/common/evas_blit_main.c b/src/lib/evas/common/evas_blit_main.c index 7f8faa1..4da4034 100644 --- a/src/lib/evas/common/evas_blit_main.c +++ b/src/lib/evas/common/evas_blit_main.c @@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len) static void evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) { +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_rev_c(src, dst, len); +#else uint32_t *tmp = (void *)37; #define AP "evas_common_copy_rev_pixels_neon_" asm volatile ( @@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) ); #undef AP +#endif } #endif @@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len) #ifdef BUILD_NEON static void evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_c(src, dst, len); +#else uint32_t *e,*tmp = (void *)37; e = dst + len; #define AP "evas_common_copy_pixels_neon_" @@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ ); #undef AP +#endif } #endif /* BUILD_NEON */ diff --git a/src/lib/evas/common/evas_cpu.c b/src/lib/evas/common/evas_cpu.c index 4139098..0f83258 100644 --- a/src/lib/evas/common/evas_cpu.c +++ b/src/lib/evas/common/evas_cpu.c @@ -2,6 +2,11 @@ #ifdef BUILD_MMX #include "evas_mmx.h" #endif +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include <arm_neon.h> +#endif +#endif #if defined BUILD_SSE3 #include <immintrin.h> #endif @@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void) { //#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70) #ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS + volatile uint32x4_t temp = vdupq_n_u32(0x1); +#else asm volatile ( ".fpu neon \n\t" "vqadd.u8 d0, d1, d0\n" @@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void) "d0", "d1" ); #endif +#endif //#endif } diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c index 9e94298..2bf14c1 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e, a = 256 - (c >> 24); + UNROLL8_PLD_WHILE(d, l, e, + { + *d = c + MUL_256(a, *d); + d++; + }); +#else DATA32 *e, *tmp = 0; #define AP "B_C_DP" asm volatile ( @@ -142,7 +150,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3 ); #undef AP - +#endif } #define _op_blend_caa_dp_neon _op_blend_c_dp_neon diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c index 99f4b38..dbeb063 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c @@ -19,6 +19,30 @@ #ifdef BUILD_NEON static void _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha = 256 - (c >> 24); + UNROLL8_PLD_WHILE(d, l, e, + { + DATA32 a = *m; + switch(a) + { + case 0: + break; + case 255: + *d = c + MUL_256(alpha, *d); + break; + default: + { + DATA32 mc = MUL_SYM(a, c); + a = 256 - (mc >> 24); + *d = mc + MUL_256(a, *d); + } + break; + } + m++; d++; + }); +#else DATA32 *e = d + l; // everything we can do only once per cycle @@ -142,12 +166,34 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in "q10", "q15", "q14", "memory" ); } +#endif } #endif #ifdef BUILD_NEON static void _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + alpha = *m; + switch(alpha) + { + case 0: + break; + 
case 255: + *d = c; + break; + default: + alpha++; + *d = INTERP_256(alpha, c, *d); + break; + } + m++; d++; + }); +#else DATA32 *e,*tmp; int alpha; @@ -372,6 +418,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, ); #undef AP +#endif } #endif diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c index d6b3a73..c47ec7c 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c @@ -7,7 +7,18 @@ * reads, then two writes, a miss on read is 'just' two reads */ static void _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) { - +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + DATA32 sc = MUL4_SYM(c, *s); + alpha = 256 - (sc >> 24); + *d = sc + MUL_256(alpha, *d); + d++; + s++; + }); +#else #define AP "blend_p_c_dp_" asm volatile ( ".fpu neon\n\t" @@ -92,6 +103,7 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory" ); #undef AP +#endif } static void diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c index 4b9993b..3c32790 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c @@ -3,6 +3,16 @@ #ifdef BUILD_NEON static void _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + alpha = 256 - (*s >> 24); + *d = *s++ + MUL_256(alpha, *d); + d++; + }); +#else #define AP "blend_p_dp_" asm volatile ( ".fpu neon \n\t" @@ -238,11 +248,31 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { : "q0", "q1", "q2","q3", 
"q4","q5","q6", "q7","q8","memory" // clobbered ); #undef AP - +#endif } static void _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + switch (*s & 0xff000000) + { + case 0: + break; + case 0xff000000: + *d = *s; + break; + default: + alpha = 256 - (*s >> 24); + *d = *s + MUL_256(alpha, *d); + break; + } + s++; d++; + }); +#else #define AP "blend_pas_dp_" DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912; asm volatile ( @@ -447,6 +477,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory" ); #undef AP +#endif } #define _op_blend_pan_dp_neon NULL diff --git a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c index 96310cd..009bd75 100644 --- a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c +++ b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + UNROLL8_PLD_WHILE(d, l, e, + { + *d = c; + d++; + }); +#else #define AP "COPY_C_DP_" uint32_t *e = d + l,*tmp; asm volatile ( @@ -85,6 +93,7 @@ _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { ); +#endif } #define _op_copy_cn_dp_neon _op_copy_c_dp_neon --
