This adds an ARMv6 assembly fast path for the over_n_0565 operation (solid source OVER an r5g6b5 destination), used instead of the equivalent C fast path.
lowlevel-blt-bench results, compared to no fast path at all:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 8.2 0.0 38.7 0.5 100.0% +372.7%
L2 7.9 0.1 37.6 0.5 100.0% +376.8%
M 7.3 0.0 38.5 0.1 100.0% +425.6%
HT 6.9 0.0 26.1 0.3 100.0% +279.9%
VT 6.8 0.0 24.5 0.3 100.0% +258.0%
R 6.6 0.1 23.6 0.2 100.0% +255.1%
RT 4.5 0.1 10.9 0.2 100.0% +143.1%
---
pixman/pixman-arm-simd-asm.S | 114 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 4 ++
2 files changed, 118 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a74a0a8..08d6709 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1177,3 +1177,117 @@ generate_composite_function \
over_n_8888_process_tail
/******************************************************************************/
+
+@ Per-function setup for over_n_0565: alias working registers, fetch the
+@ solid source colour and precompute the loop-invariant channel values.
+.macro over_n_0565_init
+ BITMSK5 .req Y @ 0x001f001f (reloaded each line by _newline)
+ BITMSK6 .req STRIDE_D @ 0xfc00fc00 (reloaded each line by _newline)
+ SRCRB .req SRC
+ SRCG .req STRIDE_S
+ HALF .req MASK
+ ALPHA .req STRIDE_M
+ TMP0 .req SCRATCH
+ TMP1 .req ORIG_W
+ line_saved_regs Y, STRIDE_D, ORIG_W @ these aliased regs must survive each line
+ ldr SRC, [sp, #ARGS_STACK_OFFSET] @ solid source colour argument
+ mov ALPHA, #255
+ pkhbt SRCG, SRC, SRC, lsl #16 @ GGGGGGGGxxxxxxxxGGGGGGGGxxxxxxxx
+ sub ALPHA, ALPHA, SRC, lsr #24 @ ALPHA = 255 - source alpha (bits 31:24)
+ mov SRCRB, SRC, lsl #8 @ RRRRRRRRxxxxxxxxBBBBBBBBxxxxxxxx
+ ldr HALF, =0x00800080 @ per-halfword rounding addend for the mla multiplies
+.endm
+
+@ BITMSK5/6 live in registers that double as line-loop state (Y, STRIDE_D),
+@ so the mask constants must be reloaded at the start of every line.
+.macro over_n_0565_newline
+ ldr BITMSK5, =0x001f001f @ low 5 bits of each halfword (5-bit R/B fields)
+ ldr BITMSK6, =0xfc00fc00 @ high 6 bits of each halfword (shifted G fields)
+.endm
+
+@ Release every register alias claimed by over_n_0565_init.
+.macro over_n_0565_cleanup
+ .unreq BITMSK5
+ .unreq BITMSK6
+ .unreq SRCRB
+ .unreq SRCG
+ .unreq HALF
+ .unreq ALPHA
+ .unreq TMP0
+ .unreq TMP1
+.endm
+
+@ Head: only the destination pixels need fetching — the source is a solid
+@ colour already held in registers and there is no mask.
+.macro over_n_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld , numbytes, firstreg, DST, 0
+.endm
+
+@ Blend one 0565 pixel (low halfword of WK&dst) under the solid source.
+@ The pixel is unpacked so R and B share one register and G another, each
+@ channel widened to a 10/12-bit replicated form, scaled by ALPHA (with
+@ HALF providing rounding, then the x/255 correction via uxtab ror #8),
+@ saturating-added to the premultiplied source channels with uqadd8, and
+@ finally repacked to r5g6b5. Bit diagrams show the register after each op.
+.macro over_n_0565_1pixel dst
+ mov TMP1, WK&dst, lsl #16 @ rrrrrggggggbbbbb0000000000000000
+ bic TMP1, TMP1, BITMSK6, lsr #5 @ rrrrr000000bbbbb0000000000000000
+ and TMP0, BITMSK6, WK&dst, lsl #5 @ 0000000000000000gggggg0000000000
+ orr WK&dst, TMP1, TMP1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr TMP0, TMP0, lsr #6 @ 0000000000000000gggggggggggg0000
+ pkhtb WK&dst, WK&dst, WK&dst, asr #5 @ rrrrrrrrrr0xxxxxbbbbbbbbbb000000
+ uxtb TMP0, TMP0, ror #8 @ 000000000000000000000000gggggggg
+ uxtb16 WK&dst, WK&dst, ror #8 @ 00000000rrrrrrrr00000000bbbbbbbb
+ mla TMP0, TMP0, ALPHA, HALF @ xxxxxxxxxxxxxxxxgggggggggggggggg
+ mla WK&dst, WK&dst, ALPHA, HALF @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ uxtab TMP0, TMP0, TMP0, ror #8 @ xxxxxxxxxxxxxxxxgggggggggggggggg
+ uxtab16 WK&dst, WK&dst, WK&dst, ror #8 @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ uqadd8 TMP0, TMP0, SRCG @ xxxxxxxxxxxxxxxxggggggggxxxxxxxx
+ uqadd8 WK&dst, WK&dst, SRCRB @ rrrrrrrrxxxxxxxxbbbbbbbbxxxxxxxx
+ and TMP0, TMP0, BITMSK6 @ xxxxxx0000000000gggggg0000000000
+ and WK&dst, BITMSK5, WK&dst, lsr #11 @ 00000000000rrrrr00000000000bbbbb
+ orr WK&dst, WK&dst, WK&dst, lsr #5 @ 00000000000xxxxxrrrrr000000bbbbb
+ orr WK&dst, WK&dst, TMP0, lsr #5 @ 00000xxxxxxxxxxxrrrrrggggggbbbbb
+.endm
+
+@ Blend two 0565 pixels packed in WK&dst. Upper-case letters in the bit
+@ diagrams are the high (second) pixel, lower-case the low (first) pixel.
+@ Same algorithm as over_n_0565_1pixel, but the two green fields share
+@ one register so only three mla multiplies are needed per pixel pair.
+.macro over_n_0565_2pixels dst
+ bic TMP1, WK&dst, BITMSK6, lsr #5 @ RRRRR000000BBBBBrrrrr000000bbbbb
+ and TMP0, BITMSK6, WK&dst, lsl #5 @ GGGGGG0000000000gggggg0000000000
+ mov WK&dst, TMP1, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ orr TMP0, TMP0, lsr #6 @ GGGGGGGGGGGG0000gggggggggggg0000
+ bic TMP1, TMP1, WK&dst, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK&dst, WK&dst, WK&dst, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr TMP1, TMP1, TMP1, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK&dst, WK&dst, WK&dst, asr #5 @ rrrrrrrrrr0xxxxxbbbbbbbbbb000000
+ pkhtb TMP1, TMP1, TMP1, asr #5 @ RRRRRRRRRR0xxxxxBBBBBBBBBB000000
+ uxtb16 TMP0, TMP0, ror #8 @ 00000000GGGGGGGG00000000gggggggg
+ uxtb16 WK&dst, WK&dst, ror #8 @ 00000000rrrrrrrr00000000bbbbbbbb
+ uxtb16 TMP1, TMP1, ror #8 @ 00000000RRRRRRRR00000000BBBBBBBB
+ mla TMP0, TMP0, ALPHA, HALF @ GGGGGGGGGGGGGGGGgggggggggggggggg
+ mla WK&dst, WK&dst, ALPHA, HALF @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ mla TMP1, TMP1, ALPHA, HALF @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+ uxtab16 TMP0, TMP0, TMP0, ror #8 @ GGGGGGGGGGGGGGGGgggggggggggggggg
+ uxtab16 WK&dst, WK&dst, WK&dst, ror #8 @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+ uxtab16 TMP1, TMP1, TMP1, ror #8 @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+ uqadd8 TMP0, TMP0, SRCG @ GGGGGGGGxxxxxxxxggggggggxxxxxxxx
+ uqadd8 TMP1, TMP1, SRCRB @ RRRRRRRRxxxxxxxxBBBBBBBBxxxxxxxx
+ uqadd8 WK&dst, WK&dst, SRCRB @ rrrrrrrrxxxxxxxxbbbbbbbbxxxxxxxx
+ and TMP0, TMP0, BITMSK6 @ GGGGGG0000000000gggggg0000000000
+ and TMP1, BITMSK5, TMP1, lsr #11 @ 00000000000RRRRR00000000000BBBBB
+ and WK&dst, BITMSK5, WK&dst, lsr #11 @ 00000000000rrrrr00000000000bbbbb
+ orr TMP1, TMP1, TMP1, lsr #5 @ 00000000000xxxxxRRRRR000000BBBBB
+ orr WK&dst, WK&dst, WK&dst, lsr #5 @ 00000000000xxxxxrrrrr000000bbbbb
+ pkhbt TMP1, WK&dst, TMP1, LSL #16 @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr WK&dst, TMP1, TMP0, lsr #5 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+.endm
+
+@ Tail: blend the destination pixels fetched by process_head. Each WK
+@ register holds two 16-bit pixels, so numbytes == 2 is the only
+@ single-pixel case; larger transfers go one register (pixel pair) at a time.
+.macro over_n_0565_process_tail cond, numbytes, firstreg
+ .if numbytes == 2
+ over_n_0565_1pixel firstreg
+ .else
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+ over_n_0565_2pixels %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ .endif
+.endm
+
+@ Instantiate pixman_composite_over_n_0565_asm_armv6 from the macros above.
+@ NOTE(review): 0, 0, 16 presumably are src/mask/dst bits-per-pixel (solid
+@ source, no mask, 16bpp dest) — confirm against generate_composite_function.
+generate_composite_function \
+ pixman_composite_over_n_0565_asm_armv6, 0, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_SPILL_LINE_VARS, \
+ 2, /* prefetch distance */ \
+ over_n_0565_init, \
+ over_n_0565_newline, \
+ over_n_0565_cleanup, \
+ over_n_0565_process_head, \
+ over_n_0565_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 3223010..31f960d 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -53,6 +53,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888,
                                  uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_0565,
+                                 uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
                                  uint32_t, 1)
@@ -246,6 +248,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, armv6_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, armv6_composite_over_n_0565),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
--
1.7.5.4
_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman