[Pixman] [PATCH 2/3] MIPS: DSPr2: Added more fast-paths for SRC operation:
From: Nemanja Lukic nemanja.lu...@rt-rk.com

Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Referent (before):
src_n_8_8888 = L1: 13.79 L2: 22.47 M: 17.55 ( 58.28%) HT: 6.95 VT: 6.46 R: 6.34 RT: 2.07 ( 20Kops/s)
src_n_8_8 = L1: 20.22 L2: 20.21 M: 18.20 ( 24.17%) HT: 6.65 VT: 6.22 R: 6.11 RT: 2.03 ( 20Kops/s)

Optimized:
src_n_8_8888 = L1: 58.31 L2: 53.34 M: 25.69 ( 85.29%) HT: 22.55 VT: 21.44 R: 19.91 RT: 10.34 ( 48Kops/s)
src_n_8_8 = L1: 102.60 L2: 89.43 M: 65.01 ( 86.32%) HT: 37.87 VT: 37.02 R: 32.43 RT: 12.41 ( 51Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S | 133 ++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2.c     |   9 +++
 2 files changed, 142 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 9b9b449..d2482e0 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,139 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
 
 END(pixman_composite_src_x888_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+
+1:
+                       /* a1 = source (32bit constant) */
+    lbu      t0, 0(a2) /* t2 = mask (a8) */
+    lbu      t1, 1(a2) /* t3 = mask (a8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t2, a3, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+
+    beqz     a3, 3f
+     nop
+
+2:
+    lbu      t0, 0(a2)
+    addiu    a2, a2, 1
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5
+
+    sw       t1, 0(a0)
+    addiu    a3, a3, -1
+    addiu    a0, a0, 4
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              t7, 1f
+     addiu            t7, t7, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t2, t2, t3
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a2)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_src_n_8_8_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index bd828df..161377b 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -55,6 +55,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC,
[Pixman] [PATCH 2/3] MIPS: DSPr2: Added more fast-paths for SRC operation:
From: Nemanja Lukic nemanja.lu...@rt-rk.com

Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Referent (before):
src_n_8_8888 = L1: 13.79 L2: 22.47 M: 17.55 ( 58.28%) HT: 6.95 VT: 6.46 R: 6.34 RT: 2.07 ( 20Kops/s)
src_n_8_8 = L1: 20.22 L2: 20.21 M: 18.20 ( 24.17%) HT: 6.65 VT: 6.22 R: 6.11 RT: 2.03 ( 20Kops/s)

Optimized:
src_n_8_8888 = L1: 58.31 L2: 53.34 M: 25.69 ( 85.29%) HT: 22.55 VT: 21.44 R: 19.91 RT: 10.34 ( 48Kops/s)
src_n_8_8 = L1: 102.60 L2: 89.43 M: 65.01 ( 86.32%) HT: 37.87 VT: 37.02 R: 32.43 RT: 12.41 ( 51Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S | 133 ++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2.c     |   9 +++
 2 files changed, 142 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 9b9b449..d2482e0 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,139 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
 
 END(pixman_composite_src_x888_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+
+1:
+                       /* a1 = source (32bit constant) */
+    lbu      t0, 0(a2) /* t2 = mask (a8) */
+    lbu      t1, 1(a2) /* t3 = mask (a8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t2, a3, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+
+    beqz     a3, 3f
+     nop
+
+2:
+    lbu      t0, 0(a2)
+    addiu    a2, a2, 1
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5
+
+    sw       t1, 0(a0)
+    addiu    a3, a3, -1
+    addiu    a0, a0, 4
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              t7, 1f
+     addiu            t7, t7, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t2, t2, t3
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a2)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_src_n_8_8_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index bd828df..161377b 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -55,6 +55,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC,