cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=76a5efe13ae76ce44d02e1f5921db9465e8a739b
commit 76a5efe13ae76ce44d02e1f5921db9465e8a739b
Author: Cedric BAIL <ced...@osg.samsung.com>
Date:   Tue Apr 28 23:36:04 2015 +0200

    evas: implement pixel_color blending functions using NEON intrinsics.

    Summary:
    NEON intrinsics can be built both for armv7 and armv8.

    Implemented functions:
    _op_blend_pan_c_dp_neon
    _op_blend_p_can_dp_neon
    _op_blend_pan_can_dp_neon
    _op_blend_p_caa_dp_neon
    _op_blend_pan_caa_dp_neon

    Reviewers: raster, cedric

    Subscribers: cedric

    Projects: #efl

    Maniphest Tasks: T2341

    Differential Revision: https://phab.enlightenment.org/D2409

    Signed-off-by: Cedric BAIL <ced...@osg.samsung.com>
---
 .../evas_op_blend/op_blend_pixel_color_neon.c | 654 +++++++++++++++------
 1 file changed, 465 insertions(+), 189 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
index b1bfc25..aec1c86 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@@ -1,8 +1,3 @@
-#ifdef BUILD_NEON
-#ifdef BUILD_NEON_INTRINSICS
-#include <arm_neon.h>
-#endif
-#endif
 /* blend pixel x color --> dst */
 #ifdef BUILD_NEON
 
@@ -202,240 +197,521 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT
 #endif
 }
 
-static void
-_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        *d++ = 0xff000000 + MUL3_SYM(c, *s);
-                        s++;
-                     });
-}
 static void
-_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
-#if 1
-   DATA32 *e;
-   DATA32 sc;
-   int alpha;
-   c = 1 + (c & 0xff);
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        sc = MUL_256(c, *s);
-                        alpha = 256 - (sc >> 24);
-                        *d = sc + MUL_256(alpha, *d);
-                        d++;
-                        s++;
-                     });
-#else // the below neon is buggy!! misses rendering of spans, i think with alignment. quick - just disable this.
-#define AP "_op_blend_pan_caa_dp_"
-   DATA32 *e = d + l, *tmp = (void*)73;
-   asm volatile (
-                 ".fpu neon \n\t"
-                 /* Set up 'c' */
-                 "vdup.u8 d14, %[c] \n\t"
-                 "vmov.i8 d15, #1 \n\t"
-                 "vaddl.u8 q15, d14, d15 \n\t"
-                 "vshr.u8 q15,#1 \n\t"
-
-                 // Pick a loop
-                 "andS %[tmp], %[d], $0xf \n\t"
-                 "beq "AP"quadstart \n\t"
-
-                 "andS %[tmp], %[d], $0x4 \n\t"
-                 "beq "AP"dualstart \n\t"
-
-                 AP"singleloop: \n\t"
-                 "vld1.32 d4[0], [%[d]] \n\t"
-                 "vld1.32 d0[0], [%[s]]! \n\t"
-
-                 // Long version of 'd'
-                 "vmovl.u8 q8, d4 \n\t"
+_op_blend_pan_c_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t sc0_16x8;
+   uint16x8_t sc1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x4_t ad_32x4;
+   uint32x4_t c_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t mask_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t sc_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t c_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t mask_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t sc_8x16;
+   uint8x8_t a_8x8;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t sc0_8x8;
+   uint8x8_t sc1_8x8;
+
+   // alpha can only be 0 if color is 0x0. In that case we can just return.
+   // Otherwise we can assume alpha != 0. This allows more optimization in
+   // NEON code.
+
+   if(!c)
+     return;
+
+   unsigned char a;
+   a = ~(c >> 24) + 1; // 256 - (c >> 24)
+
+   a_8x8 = vdup_n_u8(a);
+   c_32x4 = vdupq_n_u32(c);
+   c_8x16 = vreinterpretq_u8_u32(c_32x4);
+   c_8x8 = vget_low_u8(c_8x16);
+   x255_16x8 = vdupq_n_u16(0xff);
+   mask_32x4 = vdupq_n_u32(0xff000000);
+   mask_8x16 = vreinterpretq_u8_u32(mask_32x4);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
+
+        // multiply MUL_256(a, *d)
+        ad0_16x8 = vmull_u8(a_8x8, d0_8x8);
+        ad1_16x8 = vmull_u8(a_8x8, d1_8x8);
+        ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+        ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+        ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+        ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
+
+        // multiply MUL_SYM(c, *s);
+        sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+        sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+        sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+        sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+        sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+        sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+        sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
 
-                 // Long version of 's'
-                 "vmovl.u8 q6, d0 \n\t"
-
-                 // d8 = s -d
-                 "vsub.s16 d8, d12, d16 \n\t"
-
-                 // Multiply
-                 "vmul.s16 d8, d8, d30 \n\t"
-
-                 // Shift down
-                 "vshr.s16 d8, #7 \n\t"
-
-                 // Add 'd'
-                 "vqadd.s16 d8, d8, d16 \n\t"
-
-                 // Shrink to save
-                 "vqmovun.s16 d0, q4 \n\t"
-                 "vst1.32 d0[0], [%[d]]! \n\t"
-
-                 // Now where?
-                 "andS %[tmp], %[d], $0xf \n\t"
-                 "beq "AP"quadstart \n\t"
-
-                 AP"dualstart: \n\t"
-                 // Check we have enough
-                 "sub %[tmp], %[e], %[d] \n\t"
-                 "cmp %[tmp], #16 \n\t"
-                 "blt "AP"loopout \n\t"
+        // select alpha channel from c
+        sc_8x16 = vbslq_u8(mask_8x16, c_8x16, sc_8x16);
+        sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
 
-                 AP"dualloop:"
-                 "vldm %[d], {d4} \n\t"
-                 "vldm %[s]!, {d0} \n\t"
+        // add up everything
+        d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
 
-                 // Long version of d
-                 "vmovl.u8 q8, d4 \n\t"
+        // save result
+        vst1q_u32(d, d_32x4);
 
-                 // Long version of s
-                 "vmovl.u8 q6, d0 \n\t"
+        d+=4;
+        s+=4;
+     }
 
-                 // q4/q5 = s-d
-                 "vsub.s16 q4, q6, q8 \n\t"
+   end += (l & 3);
+   while (d < end)
+     {
+        *d = ((c & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(a, *d);
+        d++;
+        s++;
+     }
 
-                 // Multiply
-                 "vmul.s16 q4, q4,q15 \n\t"
+}
 
-                 // Shift down
-                 "vshr.s16 q4, #7 \n\t"
+static void
+_op_blend_p_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t sc0_16x8;
+   uint16x8_t sc1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x4_t ad_32x4;
+   uint32x4_t alpha_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t mask_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t sc_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t alpha_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t mask_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t sc_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t alpha0_8x8;
+   uint8x8_t alpha1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t sc0_8x8;
+   uint8x8_t sc1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+   mask_32x4 = vdupq_n_u32(0xff000000);
+   mask_8x16 = vreinterpretq_u8_u32(mask_32x4);
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+   x255_16x8 = vdupq_n_u16(0xff);
 
-                 // Add d
-                 "vqadd.s16 q4, q4, q8 \n\t"
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
 
-                 // Shrink to save
-                 "vqmovun.s16 d0, q4 \n\t"
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
 
-                 "vstm %[d]!, {d0} \n\t"
-                 AP"quadstart: \n\t"
-                 "sub %[tmp], %[e], %[d] \n\t"
-                 "cmp %[tmp], #16 \n\t"
-                 "blt "AP"loopout \n\t"
+        // calculate alpha = 256 - (*s >> 24)
+        alpha_32x4 = vshrq_n_u32(s_32x4, 24);
+        alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
+        alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
+        alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
+        alpha0_8x8 = vget_low_u8(alpha_8x16);
+        alpha1_8x8 = vget_high_u8(alpha_8x16);
 
-                 "sub %[tmp], %[e], #15 \n\t"
+        // multiply MUL_SYM(c, *s);
+        sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+        sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+        sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+        sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+        sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+        sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+        sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
 
-                 AP"quadloop: \n\t"
-                 // load 's' -> q0, 'd' -> q2
-                 "vldm %[d], {d4,d5} \n\t"
-                 "vldm %[s]!, {d0,d1} \n\t"
+        // select alpha channel from *s
+        sc_8x16 = vbslq_u8(mask_8x16, s_8x16, sc_8x16);
+        sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
 
-                 // Long version of d
-                 "vmovl.u8 q8, d4 \n\t"
-                 "vmovl.u8 q9, d5 \n\t"
+        // multiply MUL_256(a, *d)
+        ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
+        ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
+        ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+        ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+        ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+        ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
 
-                 // Long version of s
-                 "vmovl.u8 q6, d0 \n\t"
-                 "vmovl.u8 q7, d1 \n\t"
+        // select d if alpha is 0
+        cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
+        ad_32x4 = vbslq_u32(cond_32x4, d_32x4, ad_32x4);
 
-                 // q4/q5 = s-d
-                 "vsub.s16 q4, q6, q8 \n\t"
-                 "vsub.s16 q5, q7, q9 \n\t"
+        // add up everything
+        d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
 
-                 // Multiply
-                 "vmul.s16 q4, q4,q15 \n\t"
-                 "vmul.s16 q5, q5,q15 \n\t"
+        // save result
+        vst1q_u32(d, d_32x4);
 
-                 // Shift down
-                 "vshr.s16 q4, #7 \n\t"
-                 "vshr.s16 q5, #7 \n\t"
+        d+=4;
+        s+=4;
+     }
 
-                 // Add d
-                 "vqadd.s16 q4, q4, q8 \n\t"
-                 "vqadd.s16 q5, q5, q9 \n\t"
+   end += (l & 3);
+   int alpha;
+   while (d < end)
+     {
+        alpha = 256 - (*s >> 24);
+        *d = ((*s & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(alpha, *d);
+        d++;
+        s++;
+     }
+}
 
-                 // Shrink to save
-                 "vqmovun.s16 d0, q4 \n\t"
-                 "vqmovun.s16 d1, q5 \n\t"
-                 "vstm %[d]!, {d0,d1} \n\t"
-                 "cmp %[tmp], %[d] \n\t"
+static void
+_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t sc00_16x8;
+   uint16x8_t sc01_16x8;
+   uint16x8_t sc10_16x8;
+   uint16x8_t sc11_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x4_t d0_32x4;
+   uint32x4_t d1_32x4;
+   uint32x4_t mask_32x4;
+   uint32x4_t s0_32x4;
+   uint32x4_t s1_32x4;
+   uint32x4_t sc0_32x4;
+   uint32x4_t sc1_32x4;
+   uint8x16_t s0_8x16;
+   uint8x16_t s1_8x16;
+   uint8x16_t sc0_8x16;
+   uint8x16_t sc1_8x16;
+   uint8x8_t c_8x8;
+   uint8x8_t s00_8x8;
+   uint8x8_t s01_8x8;
+   uint8x8_t s10_8x8;
+   uint8x8_t s11_8x8;
+   uint8x8_t sc00_8x8;
+   uint8x8_t sc01_8x8;
+   uint8x8_t sc10_8x8;
+   uint8x8_t sc11_8x8;
+
+   mask_32x4 = vdupq_n_u32(0xff000000);
+   x255_16x8 = vdupq_n_u16(0xff);
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
 
-                 "bhi "AP"quadloop\n\t"
+   DATA32 *end = d + (l & ~7);
+   while (d < end)
+     {
+        // load 8 elements from s
+        s0_32x4 = vld1q_u32(s);
+        s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
+        s00_8x8 = vget_low_u8(s0_8x16);
+        s01_8x8 = vget_high_u8(s0_8x16);
+        s1_32x4 = vld1q_u32(s+4);
+        s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
+        s10_8x8 = vget_low_u8(s1_8x16);
+        s11_8x8 = vget_high_u8(s1_8x16);
+
+        // multiply MUL_SYM(c, *s);
+        sc00_16x8 = vmull_u8(s00_8x8, c_8x8);
+        sc01_16x8 = vmull_u8(s01_8x8, c_8x8);
+        sc10_16x8 = vmull_u8(s10_8x8, c_8x8);
+        sc11_16x8 = vmull_u8(s11_8x8, c_8x8);
+        sc00_16x8 = vaddq_u16(sc00_16x8, x255_16x8);
+        sc01_16x8 = vaddq_u16(sc01_16x8, x255_16x8);
+        sc10_16x8 = vaddq_u16(sc10_16x8, x255_16x8);
+        sc11_16x8 = vaddq_u16(sc11_16x8, x255_16x8);
+        sc00_8x8 = vshrn_n_u16(sc00_16x8, 8);
+        sc01_8x8 = vshrn_n_u16(sc01_16x8, 8);
+        sc10_8x8 = vshrn_n_u16(sc10_16x8, 8);
+        sc11_8x8 = vshrn_n_u16(sc11_16x8, 8);
+        sc0_8x16 = vcombine_u8(sc00_8x8, sc01_8x8);
+        sc1_8x16 = vcombine_u8(sc10_8x8, sc11_8x8);
+
+        // add alpha channel
+        sc0_32x4 = vreinterpretq_u32_u8(sc0_8x16);
+        sc1_32x4 = vreinterpretq_u32_u8(sc1_8x16);
+        d0_32x4 = vorrq_u32(sc0_32x4, mask_32x4);
+        d1_32x4 = vorrq_u32(sc1_32x4, mask_32x4);
+
+        // save result
+        vst1q_u32(d, d0_32x4);
+        vst1q_u32(d+4, d1_32x4);
+
+        d+=8;
+        s+=8;
+     }
+   end += (l & 7);
+   while (d < end)
+     {
+        *d++ = 0xff000000 + MUL3_SYM(c, *s);
+        s++;
+     }
+}
 
-                 "b "AP"done\n\t"
-                 AP"loopout: \n\t"
-                 "cmp %[d], %[e] \n\t"
-                 "beq "AP"done\n\t"
-                 "sub %[tmp],%[e], %[d] \n\t"
-                 "cmp %[tmp],$0x04 \n\t"
-                 "beq "AP"singleloop2 \n\t"
+static void
+_op_blend_p_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t cs0_16x8;
+   uint16x8_t cs1_16x8;
+   uint32x4_t ad_32x4;
+   uint32x4_t alpha_32x4;
+   uint32x4_t c_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t cs_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t alpha_8x16;
+   uint8x16_t c_8x16;
+   uint8x16_t cs_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t alpha0_8x8;
+   uint8x8_t alpha1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t cs0_8x8;
+   uint8x8_t cs1_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
 
-                 AP"dualloop2: \n\t"
-                 "vldm %[d], {d4} \n\t"
-                 "vldm %[s]!, {d0} \n\t"
+   int temp = (1 + c) & 0xff;
 
-                 // Long version of d
-                 "vmovl.u8 q8, d4 \n\t"
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   c_32x4 = vdupq_n_u32(temp);
+   c_32x4 = vmulq_u32(x1_32x4, c_32x4);
+   c_8x16 = vreinterpretq_u8_u32(c_32x4);
+   c_8x8 = vget_low_u8(c_8x16);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
 
-                 // Long version of s
-                 "vmovl.u8 q6, d0 \n\t"
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
 
-                 // q4/q5 = s-d
-                 "vsub.s16 q4, q6, q8 \n\t"
+        // multiply MUL_256(c, *s)
+        cs0_16x8 = vmull_u8(c_8x8, s0_8x8);
+        cs1_16x8 = vmull_u8(c_8x8, s1_8x8);
+        cs0_8x8 = vshrn_n_u16(cs0_16x8,8);
+        cs1_8x8 = vshrn_n_u16(cs1_16x8,8);
+        cs_8x16 = vcombine_u8(cs0_8x8, cs1_8x8);
+        cs_32x4 = vreinterpretq_u32_u8(cs_8x16);
 
-                 // Multiply
-                 "vmul.s16 q4, q4,q15 \n\t"
\n\t" + // select s if c is 0 + cond_32x4 = vceqq_u32(c_32x4, x0_32x4); + cs_32x4 = vbslq_u32(cond_32x4, s_32x4 , cs_32x4); - // Shift down - "vshr.s16 q4, #7 \n\t" + // calculate alpha = 256 - (*s >> 24) + alpha_32x4 = vshrq_n_u32(cs_32x4, 24); + alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4); + alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4); + alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16); + alpha0_8x8 = vget_low_u8(alpha_8x16); + alpha1_8x8 = vget_high_u8(alpha_8x16); - // Add d - "vqadd.s16 q4, q4, q8 \n\t" + // load 4 elements from d + d_32x4 = vld1q_u32(d); + d_8x16 = vreinterpretq_u8_u32(d_32x4); + d0_8x8 = vget_low_u8(d_8x16); + d1_8x8 = vget_high_u8(d_8x16); - // Shrink to save - "vqmovun.s16 d0, q4 \n\t" + // multiply MUL_256(a, *d) + ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8); + ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8); + ad0_8x8 = vshrn_n_u16(ad0_16x8,8); + ad1_8x8 = vshrn_n_u16(ad1_16x8,8); + ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8); + ad_32x4 = vreinterpretq_u32_u8(ad_8x16); - "vstm %[d]!, {d0} \n\t" + // select d if alpha is 0 + alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16); + cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4); + ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4); - "cmp %[d], %[e] \n\t" - "beq "AP"done \n\t" + // add up everything + d_32x4 = vaddq_u32(cs_32x4, ad_32x4); - AP"singleloop2: \n\t" - "vld1.32 d4[0], [%[d]] \n\t" - "vld1.32 d0[0], [%[s]]! \n\t" + // save result + vst1q_u32(d, d_32x4); - // Long version of 'd' - "vmovl.u8 q8, d4 \n\t" + d+=4; + s+=4; + } - // Long version of 's' - "vmovl.u8 q6, d0 \n\t" + end += (l & 3); + int alpha; + c = 1 + (c & 0xff); + while (d < end) + { + DATA32 sc = MUL_256(c, *s); + alpha = 256 - (sc >> 24); + *d = sc + MUL_256(alpha, *d); + d++; + s++; + } - // d8 = s -d - "vsub.s16 d8, d12, d16 \n\t" +} - // Multiply - "vmul.s16 d8, d8, d30 \n\t" +static void +_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { + int16x8_t c_i16x8; + int16x8_t d0_i16x8; + int16x8_t d1_i16x8; + int16x8_t ds0_i16x8; + int16x8_t ds1_i16x8; + int16x8_t s0_i16x8; + int16x8_t s1_i16x8; + int8x16_t ds_i8x16; + int8x8_t ds0_i8x8; + int8x8_t ds1_i8x8; + uint16x8_t c_16x8; + uint16x8_t d0_16x8; + uint16x8_t d1_16x8; + uint16x8_t s0_16x8; + uint16x8_t s1_16x8; + uint32x4_t d_32x4; + uint32x4_t ds_32x4; + uint32x4_t s_32x4; + uint8x16_t d_8x16; + uint8x16_t s_8x16; + uint8x8_t d0_8x8; + uint8x8_t d1_8x8; + uint8x8_t s0_8x8; + uint8x8_t s1_8x8; - // Shift down - "vshr.s16 d8, #7 \n\t" + c = 1 + (c & 0xff); - // Add 'd' - "vqadd.s16 d8, d8, d16 \n\t" + c_16x8 = vdupq_n_u16(c); + c_i16x8 = vreinterpretq_s16_u16(c_16x8); - // Shrink to save - "vqmovun.s16 d0, q4 \n\t" + DATA32 *end = d + (l & ~3); + while (d < end) + { + // load 4 elements from d + d_32x4 = vld1q_u32(d); + d_8x16 = vreinterpretq_u8_u32(d_32x4); + d0_8x8 = vget_low_u8(d_8x16); + d1_8x8 = vget_high_u8(d_8x16); - "vst1.32 d0[0], [%[d]] \n\t" + // spread d so that each channel occupies 16 bit + d0_16x8 = vmovl_u8(d0_8x8); + d1_16x8 = vmovl_u8(d1_8x8); + d0_i16x8 = vreinterpretq_s16_u16(d0_16x8); + d1_i16x8 = vreinterpretq_s16_u16(d1_16x8); + // load 4 elements from s + s_32x4 = vld1q_u32(s); + s_8x16 = vreinterpretq_u8_u32(s_32x4); + s0_8x8 = vget_low_u8(s_8x16); + s1_8x8 = vget_high_u8(s_8x16); - AP"done: \n\t" + // spread s so that each channel occupies 16 bit + s0_16x8 = vmovl_u8(s0_8x8); + s1_16x8 = vmovl_u8(s1_8x8); + s0_i16x8 = vreinterpretq_s16_u16(s0_16x8); + s1_i16x8 = vreinterpretq_s16_u16(s1_16x8); + + // interpolate + ds0_i16x8 = vsubq_s16(s0_i16x8, d0_i16x8); + 
+        ds1_i16x8 = vsubq_s16(s1_i16x8, d1_i16x8);
+        ds0_i16x8 = vmulq_s16(ds0_i16x8, c_i16x8);
+        ds1_i16x8 = vmulq_s16(ds1_i16x8, c_i16x8);
+        ds0_i16x8 = vshrq_n_s16(ds0_i16x8, 8);
+        ds1_i16x8 = vshrq_n_s16(ds1_i16x8, 8);
+        ds0_i16x8 = vaddq_s16(ds0_i16x8, d0_i16x8);
+        ds1_i16x8 = vaddq_s16(ds1_i16x8, d1_i16x8);
+        ds0_i8x8 = vmovn_s16(ds0_i16x8);
+        ds1_i8x8 = vmovn_s16(ds1_i16x8);
+
+        // save result
+        ds_i8x16 = vcombine_s8(ds0_i8x8, ds1_i8x8);
+        ds_32x4 = vreinterpretq_u32_s8(ds_i8x16);
+        vst1q_u32(d, ds_32x4);
+
+        d+=4;
+        s+=4;
+     }
 
-                 // No output
-                 :
-                 // Input
-                 : [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp)
-                 // Clobbered
-                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory"
-                 );
-#undef AP
-#endif
+   end += (l & 3);
+   while (d < end)
+     {
+        *d = INTERP_256(c, *s, *d);
+        d++;
+        s++;
+     }
 }
 
 #define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
-#define _op_blend_pan_c_dp_neon _op_blend_p_c_dp_neon
-#define _op_blend_p_can_dp_neon _op_blend_p_c_dp_neon
 #define _op_blend_pas_can_dp_neon _op_blend_p_c_dp_neon
-#define _op_blend_p_caa_dp_neon _op_blend_p_c_dp_neon
 #define _op_blend_pas_caa_dp_neon _op_blend_p_c_dp_neon
 
 #define _op_blend_p_c_dpan_neon _op_blend_p_c_dp_neon
--
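
For readers who do not speak NEON, the per-pixel operation that the new
_op_blend_pan_c_dp_neon loop vectorizes can be written in scalar C as below.
This is a minimal sketch, not EFL source: mul_256() and mul3_sym() are
illustrative stand-ins for the MUL_256/MUL3_SYM macros from evas_blend_ops.h,
assumed here to compute (x * a) >> 8 and (x * y + 0xff) >> 8 per channel;
only the loop body mirrors the committed tail loop.

    /* scalar reference for the vectorized blend; mul_256()/mul3_sym() are
     * assumed stand-ins for EFL's macros, written for illustration */
    #include <stdint.h>

    typedef uint32_t DATA32; /* one ARGB pixel, premultiplied alpha */

    /* per-channel (x * a) >> 8, two channels per mask, a in [1..256] */
    static DATA32
    mul_256(int a, DATA32 x)
    {
       return ((((x >> 8) & 0x00ff00ff) * a) & 0xff00ff00) +
              ((((x & 0x00ff00ff) * a) >> 8) & 0x00ff00ff);
    }

    /* per-channel (x * y + 0xff) >> 8 on r/g/b only; the scalar twin of the
     * vmull_u8 / vaddq_u16(0xff) / vshrn_n_u16 sequence in the diff above */
    static DATA32
    mul3_sym(DATA32 x, DATA32 y)
    {
       return (((((x >> 16) & 0xff) * ((y >> 16) & 0xff) + 0xff) & 0xff00) << 8) +
              ((((x >> 8) & 0xff) * ((y >> 8) & 0xff) + 0xff) & 0xff00) +
              (((x & 0xff) * (y & 0xff) + 0xff) >> 8);
    }

    /* blend a span of opaque ("pan") src pixels, modulated by color c, onto
     * dst -- the same formula as the NEON tail loop */
    static void
    blend_pan_c_dp_ref(const DATA32 *s, DATA32 c, DATA32 *d, int l)
    {
       int a = 256 - (c >> 24); /* c != 0 is guaranteed, so a is in [1..255] */
       for (int i = 0; i < l; i++)
         d[i] = ((c & 0xff000000) + mul3_sym(c, s[i])) + mul_256(a, d[i]);
    }

Because "pan" sources carry no per-pixel alpha, the blend factor a depends
only on the color and can be hoisted out of the loop; that is what lets the
intrinsics version run a straight-line, 4-pixels-per-iteration loop with no
per-pixel branching.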