cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=b3dc08bf8bf438b531a28fc231c3ac854fe09fc9
commit b3dc08bf8bf438b531a28fc231c3ac854fe09fc9
Author: Cedric BAIL <[email protected]>
Date:   Sat Sep 12 07:31:49 2015 +0200

    ector: cleanup default backend drawer.
---
 src/lib/ector/software/ector_drawhelper.c         |  68 ++++----
 src/lib/ector/software/ector_drawhelper_neon.c    | 203 +++++++++++-----------
 src/lib/ector/software/ector_drawhelper_private.h |  21 ++-
 src/lib/ector/software/ector_drawhelper_sse2.c    |  30 +++-
 4 files changed, 175 insertions(+), 147 deletions(-)

diff --git a/src/lib/ector/software/ector_drawhelper.c b/src/lib/ector/software/ector_drawhelper.c
index eee1d7c..43cf038 100644
--- a/src/lib/ector/software/ector_drawhelper.c
+++ b/src/lib/ector/software/ector_drawhelper.c
@@ -15,28 +15,30 @@
  */

 /*
-  result = s + d * sia
+   result = s + d * sia
   dest = (s + d * sia) * ca + d * cia
        = s * ca + d * (sia * ca + cia)
        = s * ca + d * (1 - sa*ca)
 */
-void
-comp_func_solid_source_over(uint *dest, int length, uint color, uint const_alpha)
+
+static void
+_comp_func_solid_source_over(uint *dest, int length, uint color, uint const_alpha)
 {
    int ialpha, i;
+
    if (const_alpha != 255) color = BYTE_MUL(color, const_alpha);
-   ialpha = _alpha(~color);
+   ialpha = alpha_inverse(color);

    for (i = 0; i < length; ++i)
      dest[i] = color + BYTE_MUL(dest[i], ialpha);
 }

-
 static void
-comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint const_alpha)
+_comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
    int i;
    uint s, sc, sia;
+
    if (const_alpha != 255) color = BYTE_MUL(color, const_alpha);
@@ -49,7 +51,7 @@ comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint
              dest[i] = s;
            else if (s != 0)
              {
-               sia = _alpha(~s);
+               sia = alpha_inverse(s);
                dest[i] = s + BYTE_MUL(dest[i], sia);
              }
          }
@@ -60,7 +62,7 @@ comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint
          {
             s = src[i];
             sc = ECTOR_MUL4_SYM(color, s);
-            sia = _alpha(~sc);
+            sia = alpha_inverse(sc);
             dest[i] = sc + BYTE_MUL(dest[i], sia);
          }
     }
@@ -71,10 +73,14 @@ comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint
    dest = s * ca + d * cia
  */
 static void
-comp_func_solid_source(uint *dest, int length, uint color, uint const_alpha)
+_comp_func_solid_source(uint *dest, int length, uint color, uint const_alpha)
 {
    int ialpha, i;
-   if (const_alpha == 255) _ector_memfill(dest, length, color);
+
+   if (const_alpha == 255)
+     {
+        _ector_memfill(dest, length, color);
+     }
    else
      {
         ialpha = 255 - const_alpha;
@@ -85,20 +91,23 @@ comp_func_solid_source(uint *dest, int length, uint color, uint const_alpha)
 }

 static void
-comp_func_source(uint *dest, const uint *src, int length, uint color, uint const_alpha)
+_comp_func_source(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
    int i, ialpha;
    uint src_color;
+
    if (color == 0xffffffff) // No color multiplier
      {
        if (const_alpha == 255)
-         memcpy(dest, src, length * sizeof(uint));
+         {
+            memcpy(dest, src, length * sizeof(uint));
+         }
       else
-        {
-          ialpha = 255 - const_alpha;
-          for (i = 0; i < length; ++i)
-            dest[i] = INTERPOLATE_PIXEL_256(src[i], const_alpha, dest[i], ialpha);
-        }
+         {
+            ialpha = 255 - const_alpha;
+            for (i = 0; i < length; ++i)
+              dest[i] = INTERPOLATE_PIXEL_256(src[i], const_alpha, dest[i], ialpha);
+         }
      }
    else
      {
@@ -109,24 +118,24 @@ comp_func_source(uint *dest, const uint *src, int length, uint color, uint const
           }
         else
           {
-            ialpha = 255 - const_alpha;
-            for (i = 0; i < length; ++i)
-              {
-                src_color = ECTOR_MUL4_SYM(src[i], color);
-                dest[i] = INTERPOLATE_PIXEL_256(src_color, const_alpha, dest[i], ialpha);
-              }
+             ialpha = 255 - const_alpha;
+             for (i = 0; i < length; ++i)
+               {
+                  src_color = ECTOR_MUL4_SYM(src[i], color);
+                  dest[i] = INTERPOLATE_PIXEL_256(src_color, const_alpha, dest[i], ialpha);
+               }
           }
      }
 }

 RGBA_Comp_Func_Solid func_for_mode_solid[ECTOR_ROP_LAST] = {
-  comp_func_solid_source_over,
-  comp_func_solid_source
+  _comp_func_solid_source_over,
+  _comp_func_solid_source
 };

 RGBA_Comp_Func func_for_mode[ECTOR_ROP_LAST] = {
-  comp_func_source_over,
-  comp_func_source
+  _comp_func_source_over,
+  _comp_func_source
 };

 RGBA_Comp_Func_Solid
@@ -146,13 +155,10 @@ RGBA_Comp_Func ector_comp_func_span_get(Ector_Rop op, uint color, Eina_Bool src_
      {
         if (op == ECTOR_ROP_BLEND) op = ECTOR_ROP_COPY;
      }
+
    return func_for_mode[op];
 }
-extern void init_drawhelper_gradient();
-extern void init_draw_helper_sse2();
-extern void init_draw_helper_neon();
-
 void
 init_draw_helper()
 {
    init_drawhelper_gradient();
diff --git a/src/lib/ector/software/ector_drawhelper_neon.c b/src/lib/ector/software/ector_drawhelper_neon.c
index 3adfdba..59e032f 100644
--- a/src/lib/ector/software/ector_drawhelper_neon.c
+++ b/src/lib/ector/software/ector_drawhelper_neon.c
@@ -7,6 +7,7 @@
 #ifdef BUILD_NEON
 #include <arm_neon.h>
+
 static void
 comp_func_solid_source_over_neon(uint * __restrict dest, int length, uint color, uint const_alpha)
 {
@@ -39,8 +40,8 @@ comp_func_solid_source_over_neon(uint * __restrict dest, int length, uint color,
    // alpha can only be 0 if color is 0x0. In that case we can just return.
    // Otherwise we can assume alpha != 0. This allows more optimization in
    // NEON code.
-   if(!color)
-      return;
+   if (!color)
+     return;

    DATA32 *start = dest;
    int size = length;
@@ -53,46 +54,47 @@ comp_func_solid_source_over_neon(uint * __restrict dest, int length, uint color,
    c_32x4 = vdupq_n_u32(color);
    while (start < end)
-    {
-      d0_32x4 = vld1q_u32(start);
-      d1_32x4 = vld1q_u32(start+4);
-      d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
-      d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
-
-      d00_8x8 = vget_low_u8(d0_8x16);
-      d01_8x8 = vget_high_u8(d0_8x16);
-      d10_8x8 = vget_low_u8(d1_8x16);
-      d11_8x8 = vget_high_u8(d1_8x16);
-
-      temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8);
-      temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8);
-      temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8);
-      temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8);
-
-      temp00_8x8 = vshrn_n_u16(temp00_16x8,8);
-      temp01_8x8 = vshrn_n_u16(temp01_16x8,8);
-      temp10_8x8 = vshrn_n_u16(temp10_16x8,8);
-      temp11_8x8 = vshrn_n_u16(temp11_16x8,8);
-
-      temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8);
-      temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8);
-
-      temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16);
-      temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16);
-
-      d0_32x4 = vaddq_u32(c_32x4, temp0_32x4);
-      d1_32x4 = vaddq_u32(c_32x4, temp1_32x4);
-
-      vst1q_u32(start, d0_32x4);
-      vst1q_u32(start+4, d1_32x4);
-      start+=8;
-    }
+     {
+        d0_32x4 = vld1q_u32(start);
+        d1_32x4 = vld1q_u32(start+4);
+        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+
+        d00_8x8 = vget_low_u8(d0_8x16);
+        d01_8x8 = vget_high_u8(d0_8x16);
+        d10_8x8 = vget_low_u8(d1_8x16);
+        d11_8x8 = vget_high_u8(d1_8x16);
+
+        temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8);
+        temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8);
+        temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8);
+        temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8);
+
+        temp00_8x8 = vshrn_n_u16(temp00_16x8,8);
+        temp01_8x8 = vshrn_n_u16(temp01_16x8,8);
+        temp10_8x8 = vshrn_n_u16(temp10_16x8,8);
+        temp11_8x8 = vshrn_n_u16(temp11_16x8,8);
+
+        temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8);
+        temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8);
+
+        temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16);
+        temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16);
+
+        d0_32x4 = vaddq_u32(c_32x4, temp0_32x4);
+        d1_32x4 = vaddq_u32(c_32x4, temp1_32x4);
+
+        vst1q_u32(start, d0_32x4);
+        vst1q_u32(start+4, d1_32x4);
+        start+=8;
+     }
+
    end += (size & 7);
    while (start < end)
-    {
-      *start = color + MUL_256(alpha, *start);
-      start++;
-    }
+     {
+        *start = color + MUL_256(alpha, *start);
+        start++;
+     }
 }

 /* Note: Optimisation is based on keeping _dest_ aligned: else it's a pair of
@@ -132,6 +134,9 @@ comp_func_source_over_sse2(uint * __restrict dest, const uint * __restrict src,
    uint8x8_t s1_8x8;
    uint8x8_t sc0_8x8;
    uint8x8_t sc1_8x8;
+   int size;
+   DATA32 *start;
+   DATA32 *end;

    if (const_alpha != 255)
      color = BYTE_MUL(color, const_alpha);
@@ -143,69 +148,69 @@ comp_func_source_over_sse2(uint * __restrict dest, const uint * __restrict src,
    x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
    x1_8x16 = vdupq_n_u8(0x1);
    x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
-   DATA32 *start = dest;
-   int size = l;
-   DATA32 *end = start + (size & ~3);
+   start = dest;
+   size = l;
+   end = start + (size & ~3);
+
    while (start < end)
-    {
-
-      s_32x4 = vld1q_u32(src);
-      s_8x16 = vreinterpretq_u8_u32(s_32x4);
-
-      d_32x4 = vld1q_u32(start);
-      d_8x16 = vreinterpretq_u8_u32(d_32x4);
-      d0_8x8 = vget_low_u8(d_8x16);
-      d1_8x8 = vget_high_u8(d_8x16);
-
-      s0_8x8 = vget_low_u8(s_8x16);
-      s1_8x8 = vget_high_u8(s_8x16);
-
-      sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
-      sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
-      sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
-      sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
-      sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
-      sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
-      sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
-
-      alpha_32x4 = vreinterpretq_u32_u8(sc_8x16);
-      alpha_32x4 = vshrq_n_u32(alpha_32x4, 24);
-      alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
-      alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
-      alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
-      alpha0_8x8 = vget_low_u8(alpha_8x16);
-      alpha1_8x8 = vget_high_u8(alpha_8x16);
-
-      ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
-      ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
-      ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
-      ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
-      ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
-      ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
-
-      alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
-      cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
-      ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);
-
-      sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
-      d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
-
-      vst1q_u32(start, d_32x4);
-
-      src+=4;
-      start+=4;
-    }
+     {
+        s_32x4 = vld1q_u32(src);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+
+        d_32x4 = vld1q_u32(start);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
+
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
+
+        sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+        sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+        sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+        sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+        sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+        sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+        sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
+
+        alpha_32x4 = vreinterpretq_u32_u8(sc_8x16);
+        alpha_32x4 = vshrq_n_u32(alpha_32x4, 24);
+        alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
+        alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
+        alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
+        alpha0_8x8 = vget_low_u8(alpha_8x16);
+        alpha1_8x8 = vget_high_u8(alpha_8x16);
+
+        ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
+        ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
+        ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+        ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+        ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+        ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+
+        alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
+        cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
+        ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);
+
+        sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
+        d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
+
+        vst1q_u32(start, d_32x4);
+
+        src+=4;
+        start+=4;
+     }
+
    end += (size & 3);
    while (start < end)
-    {
-      DATA32 sc = MUL4_SYM(color, *s);
-      DATA32 alpha = 256 - (sc >> 24);
-      *start = sc + MUL_256(alpha, *start);
-      start++;
-      src++;
-    }
+     {
+        DATA32 sc = MUL4_SYM(color, *s);
+        DATA32 alpha = 256 - (sc >> 24);
+        *start = sc + MUL_256(alpha, *start);
+        start++;
+        src++;
+     }
 }
-
 #endif

 void
diff --git a/src/lib/ector/software/ector_drawhelper_private.h b/src/lib/ector/software/ector_drawhelper_private.h
index a753987..007ef53 100644
--- a/src/lib/ector/software/ector_drawhelper_private.h
+++ b/src/lib/ector/software/ector_drawhelper_private.h
@@ -1,9 +1,7 @@
 #ifndef ECTOR_DRAWHELPER_PRIVATE_H
 #define ECTOR_DRAWHELPER_PRIVATE_H

-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
+#include "ector_private.h"

 #ifndef MIN
 #define MIN( a, b ) ( (a) < (b) ? (a) : (b) )
@@ -17,11 +15,6 @@
 typedef unsigned int uint;
 #endif

-static inline int _alpha(uint c)
-{
-  return c>>24;
-}
-
 #define ECTOR_ARGB_JOIN(a,r,g,b) \
         (((a) << 24) + ((r) << 16) + ((g) << 8) + (b))
@@ -53,6 +46,13 @@ static inline int _alpha(uint c)
      } \
   }

+static inline int
+alpha_inverse(int color)
+{
+   color = ~color;
+   return A_VAL(&color);
+}
+
 static inline void
 _ector_memfill(uint *dest, int length, uint value)
 {
@@ -89,9 +89,14 @@ INTERPOLATE_PIXEL_256(uint x, uint a, uint y, uint b)
 typedef void (*RGBA_Comp_Func)(uint *dest, const uint *src, int length, uint mul_col, uint const_alpha);
 typedef void (*RGBA_Comp_Func_Solid)(uint *dest, int length, uint color, uint const_alpha);
+
 extern RGBA_Comp_Func_Solid func_for_mode_solid[ECTOR_ROP_LAST];
 extern RGBA_Comp_Func func_for_mode[ECTOR_ROP_LAST];

+void init_drawhelper_gradient();
+void init_draw_helper_sse2();
+void init_draw_helper_neon();
+
 void init_draw_helper();

 RGBA_Comp_Func_Solid ector_comp_func_solid_span_get(Ector_Rop op, uint color);
diff --git a/src/lib/ector/software/ector_drawhelper_sse2.c b/src/lib/ector/software/ector_drawhelper_sse2.c
index 581cbb7..3af1bc1 100644
--- a/src/lib/ector/software/ector_drawhelper_sse2.c
+++ b/src/lib/ector/software/ector_drawhelper_sse2.c
@@ -89,6 +89,7 @@
 v4_mul_color_sse2(__m128i x, __m128i y)
 {
    const __m128i zero = _mm_setzero_si128();
    const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);
+
    __m128i x_l = _mm_unpacklo_epi8(x, zero);
    __m128i x_h = _mm_unpackhi_epi8(x, zero);
@@ -111,6 +112,7 @@
 static inline __m128i
 v4_ialpha_sse2(__m128i c)
 {
    __m128i a = _mm_srli_epi32(c, 24);
+
    return _mm_sub_epi32(_mm_set1_epi32(0xff), a);
 }
@@ -141,10 +143,14 @@ comp_func_helper_sse2 (uint *dest, int length, uint color, uint alpha)
 void
 comp_func_solid_source_sse2(uint *dest, int length, uint color, uint const_alpha)
 {
-   int ialpha;
-   if (const_alpha == 255) _ector_memfill(dest, length, color);
+   if (const_alpha == 255)
+     {
+        _ector_memfill(dest, length, color);
+     }
    else
      {
+        int ialpha;
+
         ialpha = 255 - const_alpha;
         color = BYTE_MUL(color, const_alpha);
         comp_func_helper_sse2(dest, length, color, ialpha);
@@ -155,9 +161,10 @@
 void
 comp_func_solid_source_over_sse2(uint *dest, int length, uint color, uint const_alpha)
 {
    int ialpha;
+
    if (const_alpha != 255) color = BYTE_MUL(color, const_alpha);
-   ialpha = _alpha(~color);
+   ialpha = alpha_inverse(color);
    comp_func_helper_sse2(dest, length, color, ialpha);
 }
@@ -194,21 +201,23 @@ comp_func_solid_source_over_sse2(uint *dest, int length, uint color, uint const_
 #define V4_COMP_OP_SRC \
    v_src = v4_interpolate_color_sse2(v_alpha, v_src, v_dest);

-
-
 static void
 comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
    int ialpha;
    uint src_color;
+
    if (color == 0xffffffff) // No color multiplier
      {
        if (const_alpha == 255)
-         memcpy(dest, src, length * sizeof(uint));
+         {
+            memcpy(dest, src, length * sizeof(uint));
+         }
       else
         {
           ialpha = 255 - const_alpha;
           __m128i v_alpha = _mm_set1_epi32(const_alpha);
+
           LOOP_ALIGNED_U1_A4(dest, length,
             { /* UOP */
                *dest = INTERPOLATE_PIXEL_256(*src, const_alpha, *dest, ialpha);
@@ -225,6 +234,7 @@ comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint
    else
      {
        __m128i v_color = _mm_set1_epi32(color);
+
        if (const_alpha == 255)
          {
            LOOP_ALIGNED_U1_A4(dest, length,
@@ -243,6 +253,7 @@ comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint
        {
          ialpha = 255 - const_alpha;
          __m128i v_alpha = _mm_set1_epi32(const_alpha);
+
          LOOP_ALIGNED_U1_A4(dest, length,
            { /* UOP */
               src_color = ECTOR_MUL4_SYM(*src, color);
@@ -264,6 +275,7 @@
 static void
 comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
    uint s, sia;
+
    if (const_alpha != 255)
      color = BYTE_MUL(color, const_alpha);
@@ -272,7 +284,7 @@ comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color,
        LOOP_ALIGNED_U1_A4(dest, length,
          { /* UOP */
             s = *src;
-            sia = _alpha(~s);
+            sia = alpha_inverse(s);
             *dest = s + BYTE_MUL(*dest, sia);
             dest++; src++; length--;
          },
@@ -286,10 +298,11 @@ comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color,
    else
      {
        __m128i v_color = _mm_set1_epi32(color);
+
        LOOP_ALIGNED_U1_A4(dest, length,
          { /* UOP */
             s = ECTOR_MUL4_SYM(*src, color);
-            sia = _alpha(~s);
+            sia = alpha_inverse(s);
             *dest = s + BYTE_MUL(*dest, sia);
             dest++; src++; length--;
          },
@@ -321,4 +334,3 @@ init_draw_helper_sse2()
      }
 #endif
 }
-

--
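The math comment that opens ector_drawhelper.c, dest = s * ca + d * (1 - sa*ca), is the rule every solid source-over span in this patch implements, whether scalar, NEON or SSE2. As a reading aid, here is a minimal standalone C sketch of that rule; the names pix_mul and solid_source_over_span are hypothetical, and the per-channel multiply divides by 256 instead of reproducing the exact BYTE_MUL/MUL_256 macros from the EFL tree.

#include <stdint.h>

/* Approximate per-channel multiply of a premultiplied ARGB32 pixel by an
 * 8-bit factor: the (A,G) and (R,B) channel pairs are each scaled with a
 * single 32-bit multiply, then shifted back down by 8 bits. */
static inline uint32_t
pix_mul(uint32_t c, uint32_t a)
{
   uint32_t ag = ((((c >> 8) & 0x00ff00ff) * a) >> 8) & 0x00ff00ff;
   uint32_t rb = (((c & 0x00ff00ff) * a) >> 8) & 0x00ff00ff;

   return (ag << 8) | rb;
}

/* Solid source-over span with a constant alpha modifier:
 *   dest = s * ca + dest * (1 - sa * ca)
 * Scale the solid color by const_alpha once, take the inverse of the
 * resulting alpha, then blend every destination pixel against it. */
static void
solid_source_over_span(uint32_t *dest, int length, uint32_t color,
                       uint32_t const_alpha)
{
   int i;
   uint32_t ialpha;

   if (const_alpha != 255) color = pix_mul(color, const_alpha);
   ialpha = 255 - (color >> 24);

   for (i = 0; i < length; i++)
     dest[i] = color + pix_mul(dest[i], ialpha);
}

Called as, say, solid_source_over_span(row, width, 0x80800000, 255), it blends a premultiplied half-opaque red across one row of pixels, which is the same job _comp_func_solid_source_over and its NEON/SSE2 counterparts perform in the patch.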
