cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=a0d0c9883995e0e04979f5382fc8954941b19edc
commit a0d0c9883995e0e04979f5382fc8954941b19edc
Author: Yury Usishchev <y.usishc...@samsung.com>
Date:   Thu Apr 16 19:23:29 2015 +0200

    evas: improve _op_blend_mas_c_dp_neon intrinsics implementation.

    Summary:
    Use vceqq and vbsl instead of twice as many vmovl and vadd instructions.
    Replace vaddq_u8 with vaddq_u32.
    This allows the NEON code to behave exactly like the C version.

    Reviewers: cedric, raster

    Projects: #efl

    Differential Revision: https://phab.enlightenment.org/D2362

    Signed-off-by: Cedric BAIL <ced...@osg.samsung.com>
---
 .../evas_op_blend/op_blend_mask_color_neon.c | 37 +++++++++++-----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index 0bc8c5c..a09277e 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -25,8 +25,6 @@ static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l)
 {
 #ifdef BUILD_NEON_INTRINSICS
-   uint16x8_t d0_16x8;
-   uint16x8_t d1_16x8;
    uint16x8_t m_16x8;
    uint16x8_t mc0_16x8;
    uint16x8_t mc1_16x8;
@@ -36,14 +34,20 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
    uint32x2_t c_32x2;
    uint32x2_t m_32x2;
    uint32x4_t a_32x4;
+   uint32x4_t ad_32x4;
+   uint32x4_t cond_32x4;
    uint32x4_t d_32x4;
    uint32x4_t m_32x4;
+   uint32x4_t temp_32x4;
+   uint32x4_t mc_32x4;
+   uint32x4_t x0_32x4;
    uint32x4_t x1_32x4;
    uint8x16_t a_8x16;
    uint8x16_t d_8x16;
    uint8x16_t m_8x16;
    uint8x16_t mc_8x16;
    uint8x16_t temp_8x16;
+   uint8x16_t x0_8x16;
    uint8x16_t x1_8x16;
    uint8x8_t a0_8x8;
    uint8x8_t a1_8x8;
@@ -59,6 +63,8 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
    uint8x8_t temp1_8x8;
 
    x1_8x16 = vdupq_n_u8(0x1);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
    x255_16x8 = vdupq_n_u16(0xff);
    x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
    c_32x2 = vdup_n_u32(c);
@@ -66,7 +72,7 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
    DATA32 *start = d;
    int size = l;
-   DATA32 *end = start + (size & ~7);
+   DATA32 *end = start + (size & ~3);
    while (start < end) {
      int k = *((int *)m);
      if (k == 0)
        {
@@ -77,7 +83,6 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
        }
 
      m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
-
      d_32x4 = vld1q_u32(start);
 
      m_8x8 = vreinterpret_u8_u32(m_32x2);
@@ -94,15 +99,15 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
      mc0_16x8 = vmull_u8(m0_8x8, c_8x8);
      mc1_16x8 = vmull_u8(m1_8x8, c_8x8);
-
      mc0_16x8 = vaddq_u16(mc0_16x8, x255_16x8);
      mc1_16x8 = vaddq_u16(mc1_16x8, x255_16x8);
 
      mc0_8x8 = vshrn_n_u16(mc0_16x8, 8);
      mc1_8x8 = vshrn_n_u16(mc1_16x8, 8);
-
      mc_8x16 = vcombine_u8(mc0_8x8, mc1_8x8);
 
-     a_8x16 = vmvnq_u8(mc_8x16);
+
+     a_8x16 = vsubq_u8(x0_8x16, mc_8x16);
+
      a_32x4 = vreinterpretq_u32_u8(a_8x16);
      a_32x4 = vshrq_n_u32(a_32x4, 24);
      a_32x4 = vmulq_u32(a_32x4, x1_32x4);
@@ -112,35 +117,29 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
      a1_8x8 = vget_high_u8(a_8x16);
 
      d_8x16 = vreinterpretq_u8_u32(d_32x4);
-
      d0_8x8 = vget_low_u8(d_8x16);
      d1_8x8 = vget_high_u8(d_8x16);
 
-     d0_16x8 = vmovl_u8(d0_8x8);
-     d1_16x8 = vmovl_u8(d1_8x8);
-
      temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
      temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
-
-     temp0_16x8 = vaddq_u16(temp0_16x8, d0_16x8);
-     temp1_16x8 = vaddq_u16(temp1_16x8, d1_16x8);
-
      temp0_8x8 = vshrn_n_u16(temp0_16x8,8);
      temp1_8x8 = vshrn_n_u16(temp1_16x8,8);
 
      temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+     temp_32x4 = vreinterpretq_u32_u8(temp_8x16);
 
-     d_8x16 = vaddq_u8(mc_8x16, temp_8x16);
+     cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
+     ad_32x4 = vbslq_u32(cond_32x4, d_32x4, temp_32x4);
 
-     d_32x4 = vreinterpretq_u32_u8(d_8x16);
+     mc_32x4 = vreinterpretq_u32_u8(mc_8x16);
+     d_32x4 = vaddq_u32(mc_32x4, ad_32x4);
 
      vst1q_u32(start, d_32x4);
 
      start+=4;
      m+=4;
-
   }
-   end += (size & 7);
+   end += (size & 3);
   while (start < end) {
      DATA32 a = *m;
      DATA32 mc = MUL_SYM(a, c);
--
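For readers who want to see the vceqq/vbsl idea from the summary in isolation, below is a small standalone sketch. It is not part of the commit, and all names and sample values in it are invented for illustration. As far as I can tell from the patch, the point of the compare-and-select is the a == 0 lane case: in the C version the post-multiply alpha can reach 256, which an 8-bit NEON lane wraps to 0, so those lanes have to keep the destination pixel unchanged instead of scaling it to zero. vceqq_u32 builds the per-lane mask and vbslq_u32 does the selection, which is what replaces the widen-and-add (vmovl/vaddq) sequence the patch removes. The sketch needs an ARM target with NEON to build and run (e.g. gcc -O2 -mfpu=neon on 32-bit ARM; on AArch64 NEON is implicit).

/* Illustrative sketch only, not part of the commit. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* per-pixel multiplier "a" splatted across the four bytes of each lane;
    * 0x00000000 stands in for the C code's alpha of 256 wrapped to 0 in 8 bits */
   uint32_t a_vals[4] = { 0x00000000, 0x80808080, 0x40404040, 0x00000000 };
   uint32_t d_vals[4] = { 0x11223344, 0xffeeddcc, 0x01020304, 0xa0b0c0d0 };
   uint32_t scaled[4];
   uint32_t out[4];
   int i, b;

   /* stand-in for the per-byte (a * d) >> 8 that the real code builds with
    * vmull_u8/vshrn_n_u16; computed in scalar C here to keep the sketch short */
   for (i = 0; i < 4; i++)
     {
        scaled[i] = 0;
        for (b = 0; b < 32; b += 8)
          scaled[i] |= ((((a_vals[i] >> b) & 0xff) * ((d_vals[i] >> b) & 0xff)) >> 8) << b;
     }

   uint32x4_t a_32x4    = vld1q_u32(a_vals);
   uint32x4_t d_32x4    = vld1q_u32(d_vals);
   uint32x4_t temp_32x4 = vld1q_u32(scaled);
   uint32x4_t x0_32x4   = vdupq_n_u32(0);

   /* the selection: one compare plus one bitwise select.  Lanes where
    * a == 0 keep the destination pixel untouched, all other lanes take
    * the scaled (a * d) >> 8 value. */
   uint32x4_t cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
   uint32x4_t ad_32x4   = vbslq_u32(cond_32x4, d_32x4, temp_32x4);

   vst1q_u32(out, ad_32x4);
   for (i = 0; i < 4; i++)
     printf("lane %d: a=%08x d=%08x -> %08x\n", i, a_vals[i], d_vals[i], out[i]);
   return 0;
}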