[Pixman] [PATCH 2/3] MIPS: DSPr2: Added more fast-paths for SRC operation:

2012-11-12 Thread Nemanja Lukic
From: Nemanja Lukic nemanja.lu...@rt-rk.com

Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Reference (before):
    src_n_8_  =  L1:  13.79  L2:  22.47  M: 17.55 ( 58.28%)  HT:  6.95  VT:  6.46  R:  6.34  RT:  2.07 (  20Kops/s)
    src_n_8_8 =  L1:  20.22  L2:  20.21  M: 18.20 ( 24.17%)  HT:  6.65  VT:  6.22  R:  6.11  RT:  2.03 (  20Kops/s)

Optimized:
    src_n_8_  =  L1:  58.31  L2:  53.34  M: 25.69 ( 85.29%)  HT: 22.55  VT: 21.44  R: 19.91  RT: 10.34 (  48Kops/s)
    src_n_8_8 =  L1: 102.60  L2:  89.43  M: 65.01 ( 86.32%)  HT: 37.87  VT: 37.02  R: 32.43  RT: 12.41 (  51Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S |  133 
 pixman/pixman-mips-dspr2.c |9 +++
 2 files changed, 142 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 9b9b449..d2482e0 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,139 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888__asm_mips)
 
 END(pixman_composite_src_x888__asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8__asm_mips)
+/* SRC, solid source through an a8 mask; two pixels per main-loop pass.
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w    (pixel count; 0 stores nothing)
+ */
+
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff          /* constant handed to both MUL macros */
+
+    beqz     a3, 3f                  /* w == 0: nothing to do */
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f                  /* w == 1: single-pixel path only */
+     nop
+
+1:                                   /* pair loop, entered only with a3 >= 2 */
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t0 = mask[0]     (a8) */
+    lbu      t1, 1(a2) /* t1 = mask[1]     (a8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t2, a3, -1              /* t2 > 0 iff at least two pixels remain */
+    bgtz     t2, 1b
+     addiu   a0, a0, 8               /* delay slot: dst += 2 pixels */
+
+    beqz     a3, 3f                  /* even width: no leftover pixel */
+     nop
+
+2:                                   /* exactly one pixel left */
+    lbu      t0, 0(a2)
+    addiu    a2, a2, 1
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5
+
+    sw       t1, 0(a0)
+    addiu    a3, a3, -1
+    addiu    a0, a0, 4
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_n_8__asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips)
+/* SRC, solid source through an a8 mask into a8; 4 pixels per pass, then a tail.
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w    (pixel count; 0 stores nothing)
+ */
+
+    li                t9, 0x00ff00ff  /* keeps the low byte of each halfword */
+    beqz              a3, 3f          /* w == 0: nothing to do */
+     nop
+    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* fewer than 4 pixels: tail loop only */
+     nop
+
+    srl               t8, a1, 24  /* t8 = top byte of src (presumably alpha) */
+    replv.ph          t8, t8      /* replicate it into both halfwords */
+
+0:
+    beqz              t7, 1f      /* all groups of 4 done */
+     addiu            t7, t7, -1  /* delay slot: one group fewer */
+    lbu               t0, 0(a2)   /* t0..t3 = mask bytes m0..m3 */
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0   /* t1 = m1 | m0 (halfwords) */
+    precr_sra.ph.w    t3, t2, 0   /* t3 = m3 | m2 (halfwords) */
+    precr.qb.ph       t0, t3, t1  /* t0 = m3 m2 m1 m0 (bytes) */
+
+    muleu_s.ph.qbl    t2, t0, t8  /* t2 = m3*t8 | m2*t8 (16-bit products) */
+    muleu_s.ph.qbr    t3, t0, t8  /* t3 = m1*t8 | m0*t8 */
+    shra_r.ph         t4, t2, 8   /* rescale each product x to 8 bits:    */
+    shra_r.ph         t5, t3, 8   /*   (x + ((x >> 8) & 0xff)) >> 8,      */
+    and               t4, t4, t9  /*   both shifts being rounding shifts  */
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t2, t2, t3  /* t2 = r3 r2 r1 r0 (result bytes) */
+
+    sb                t2, 0(a0)   /* store low byte first, then shift down */
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4   /* delay slot: dst += 4 pixels */
+
+1:
+    beqz              a3, 3f      /* no leftover pixels */
+     nop
+    srl               t8, a1, 24  /* reload: t8 is replicated or unset here */
+2:
+    lbu               t0, 0(a2)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8  /* byte * byte, fits in 16 bits */
+    shra_r.ph         t3, t2, 8   /* same rescale as above, low halfword */
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1   /* delay slot: dst += 1 pixel */
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_src_n_8_8_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n___ca_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index bd828df..161377b 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -55,6 +55,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add__,
 uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_,
+   uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8,
+   uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, 

[Pixman] [PATCH 2/3] MIPS: DSPr2: Added more fast-paths for SRC operation:

2012-11-04 Thread Nemanja Lukic
From: Nemanja Lukic nemanja.lu...@rt-rk.com

Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Reference (before):
    src_n_8_  =  L1:  13.79  L2:  22.47  M: 17.55 ( 58.28%)  HT:  6.95  VT:  6.46  R:  6.34  RT:  2.07 (  20Kops/s)
    src_n_8_8 =  L1:  20.22  L2:  20.21  M: 18.20 ( 24.17%)  HT:  6.65  VT:  6.22  R:  6.11  RT:  2.03 (  20Kops/s)

Optimized:
    src_n_8_  =  L1:  58.31  L2:  53.34  M: 25.69 ( 85.29%)  HT: 22.55  VT: 21.44  R: 19.91  RT: 10.34 (  48Kops/s)
    src_n_8_8 =  L1: 102.60  L2:  89.43  M: 65.01 ( 86.32%)  HT: 37.87  VT: 37.02  R: 32.43  RT: 12.41 (  51Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S |  133 
 pixman/pixman-mips-dspr2.c |9 +++
 2 files changed, 142 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 9b9b449..d2482e0 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,139 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888__asm_mips)
 
 END(pixman_composite_src_x888__asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8__asm_mips)
+/* SRC, solid source through an a8 mask; two pixels per main-loop pass.
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w    (pixel count; 0 stores nothing)
+ */
+
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff          /* constant handed to both MUL macros */
+
+    beqz     a3, 3f                  /* w == 0: nothing to do */
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f                  /* w == 1: single-pixel path only */
+     nop
+
+1:                                   /* pair loop, entered only with a3 >= 2 */
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t0 = mask[0]     (a8) */
+    lbu      t1, 1(a2) /* t1 = mask[1]     (a8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t2, a3, -1              /* t2 > 0 iff at least two pixels remain */
+    bgtz     t2, 1b
+     addiu   a0, a0, 8               /* delay slot: dst += 2 pixels */
+
+    beqz     a3, 3f                  /* even width: no leftover pixel */
+     nop
+
+2:                                   /* exactly one pixel left */
+    lbu      t0, 0(a2)
+    addiu    a2, a2, 1
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5
+
+    sw       t1, 0(a0)
+    addiu    a3, a3, -1
+    addiu    a0, a0, 4
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_n_8__asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips)
+/* SRC, solid source through an a8 mask into a8; 4 pixels per pass, then a tail.
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w    (pixel count; 0 stores nothing)
+ */
+
+    li                t9, 0x00ff00ff  /* keeps the low byte of each halfword */
+    beqz              a3, 3f          /* w == 0: nothing to do */
+     nop
+    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* fewer than 4 pixels: tail loop only */
+     nop
+
+    srl               t8, a1, 24  /* t8 = top byte of src (presumably alpha) */
+    replv.ph          t8, t8      /* replicate it into both halfwords */
+
+0:
+    beqz              t7, 1f      /* all groups of 4 done */
+     addiu            t7, t7, -1  /* delay slot: one group fewer */
+    lbu               t0, 0(a2)   /* t0..t3 = mask bytes m0..m3 */
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0   /* t1 = m1 | m0 (halfwords) */
+    precr_sra.ph.w    t3, t2, 0   /* t3 = m3 | m2 (halfwords) */
+    precr.qb.ph       t0, t3, t1  /* t0 = m3 m2 m1 m0 (bytes) */
+
+    muleu_s.ph.qbl    t2, t0, t8  /* t2 = m3*t8 | m2*t8 (16-bit products) */
+    muleu_s.ph.qbr    t3, t0, t8  /* t3 = m1*t8 | m0*t8 */
+    shra_r.ph         t4, t2, 8   /* rescale each product x to 8 bits:    */
+    shra_r.ph         t5, t3, 8   /*   (x + ((x >> 8) & 0xff)) >> 8,      */
+    and               t4, t4, t9  /*   both shifts being rounding shifts  */
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t2, t2, t3  /* t2 = r3 r2 r1 r0 (result bytes) */
+
+    sb                t2, 0(a0)   /* store low byte first, then shift down */
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4   /* delay slot: dst += 4 pixels */
+
+1:
+    beqz              a3, 3f      /* no leftover pixels */
+     nop
+    srl               t8, a1, 24  /* reload: t8 is replicated or unset here */
+2:
+    lbu               t0, 0(a2)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8  /* byte * byte, fits in 16 bits */
+    shra_r.ph         t3, t2, 8   /* same rescale as above, low halfword */
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1   /* delay slot: dst += 1 pixel */
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_src_n_8_8_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n___ca_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index bd828df..161377b 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -55,6 +55,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add__,
 uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_,
+   uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8,
+   uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC,