[FFmpeg-devel] [PATCH v2] avcodec/mips: msa optimizations for vc1dsp

2019-10-21 Thread gxw
WMV3 decoding performance has improved from 3.66x to 5.23x (tested on 3A4000).
---
 libavcodec/mips/Makefile|   1 +
 libavcodec/mips/vc1dsp_init_mips.c  |  30 ++-
 libavcodec/mips/vc1dsp_mips.h   |  23 ++
 libavcodec/mips/vc1dsp_msa.c| 461 
 libavutil/mips/generic_macros_msa.h |   3 +
 5 files changed, 514 insertions(+), 4 deletions(-)
 create mode 100644 libavcodec/mips/vc1dsp_msa.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index c5b54d5..b4993f6 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -89,3 +89,4 @@ MMI-OBJS-$(CONFIG_WMV2DSP)+= 
mips/wmv2dsp_mmi.o
 MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
 MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
 MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
+MSA-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_msa.o
diff --git a/libavcodec/mips/vc1dsp_init_mips.c 
b/libavcodec/mips/vc1dsp_init_mips.c
index 4adc9e1..c0007ff 100644
--- a/libavcodec/mips/vc1dsp_init_mips.c
+++ b/libavcodec/mips/vc1dsp_init_mips.c
@@ -23,6 +23,10 @@
 #include "vc1dsp_mips.h"
 #include "config.h"
 
+#define FN_ASSIGN(OP, X, Y, INSN) \
+dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##INSN; \
+dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
+
 #if HAVE_MMI
 static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
 {
@@ -49,10 +53,6 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
 dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi;
 dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi;
 
-#define FN_ASSIGN(OP, X, Y, INSN) \
-dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##INSN; \
-dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
-
 FN_ASSIGN(put_, 0, 0, _mmi);
 FN_ASSIGN(put_, 0, 1, _mmi);
 FN_ASSIGN(put_, 0, 2, _mmi);
@@ -100,9 +100,31 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
 }
 #endif /* HAVE_MMI */
 
+#if HAVE_MSA
+static av_cold void vc1dsp_init_msa(VC1DSPContext *dsp)
+{
+dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_msa;
+dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_msa;
+dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_msa;
+
+FN_ASSIGN(put_, 1, 1, _msa);
+FN_ASSIGN(put_, 1, 2, _msa);
+FN_ASSIGN(put_, 1, 3, _msa);
+FN_ASSIGN(put_, 2, 1, _msa);
+FN_ASSIGN(put_, 2, 2, _msa);
+FN_ASSIGN(put_, 2, 3, _msa);
+FN_ASSIGN(put_, 3, 1, _msa);
+FN_ASSIGN(put_, 3, 2, _msa);
+FN_ASSIGN(put_, 3, 3, _msa);
+}
+#endif /* HAVE_MSA */
+
 av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp)
 {
 #if HAVE_MMI
 vc1dsp_init_mmi(dsp);
 #endif /* HAVE_MMI */
+#if HAVE_MSA
+vc1dsp_init_msa(dsp);
+#endif /* HAVE_MSA */
 }
diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h
index 0db85fa..5f72e60 100644
--- a/libavcodec/mips/vc1dsp_mips.h
+++ b/libavcodec/mips/vc1dsp_mips.h
@@ -191,4 +191,27 @@ void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* 
align 8 */,
   uint8_t *src /* align 1 */,
   int stride, int h, int x, int y);
 
+void ff_vc1_inv_trans_8x8_msa(int16_t block[64]);
+void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t 
*block);
+void ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t 
*block);
+
+#define FF_PUT_VC1_MSPEL_MC_MSA(hmode, vmode) \
+void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst,  \
+  const uint8_t *src, \
+  ptrdiff_t stride, int rnd); \
+void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst,   \
+  const uint8_t *src, \
+  ptrdiff_t stride, int rnd);
+
+FF_PUT_VC1_MSPEL_MC_MSA(1, 1);
+FF_PUT_VC1_MSPEL_MC_MSA(1, 2);
+FF_PUT_VC1_MSPEL_MC_MSA(1, 3);
+
+FF_PUT_VC1_MSPEL_MC_MSA(2, 1);
+FF_PUT_VC1_MSPEL_MC_MSA(2, 2);
+FF_PUT_VC1_MSPEL_MC_MSA(2, 3);
+
+FF_PUT_VC1_MSPEL_MC_MSA(3, 1);
+FF_PUT_VC1_MSPEL_MC_MSA(3, 2);
+FF_PUT_VC1_MSPEL_MC_MSA(3, 3);
 #endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */
diff --git a/libavcodec/mips/vc1dsp_msa.c b/libavcodec/mips/vc1dsp_msa.c
new file mode 100644
index 000..6e588e8
--- /dev/null
+++ b/libavcodec/mips/vc1dsp_msa.c
@@ -0,0 +1,461 @@
+/*
+ * Loongson SIMD optimized vc1dsp
+ *
+ * Copyright (c) 2019 Loongson Technology Corporation Limited
+ *gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Fo

Re: [FFmpeg-devel] [PATCH] avcodec/mips: msa optimizations for vc1dsp

2019-10-21 Thread gxw
>>+TRANSPOSE4x4_SW_SW(in_l0, in_l1, in_l2, in_l3, t_l1, t_l2, t_l3, 
t_l4);
>>+TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_l0, in_l1, in_l2, 
in_l3);
>>+TRANSPOSE4x4_SW_SW(in_l4, in_l5, in_l6, in_l7, in_l4, in_l5, in_l6, 
in_l7);
>>+in_r4 = t_l1, in_r5 = t_l2, in_r6 = t_l3, in_r7 = t_l4;
>
>It's better to transpose 'in_l0, in_l1, in_l2, in_l3' directly into 
themselves, and likewise for 'in_r4, in_r5, in_r6, in_r7'.
>>+PUT_VC1_MSPEL_MC_MSA(2, 1);
>>+PUT_VC1_MSPEL_MC_MSA(2, 2);
>>+PUT_VC1_MSPEL_MC_MSA(2, 3);
>>+
>>+PUT_VC1_MSPEL_MC_MSA(3, 1);
>>+PUT_VC1_MSPEL_MC_MSA(3, 2);
>>+PUT_VC1_MSPEL_MC_MSA(3, 3);
>
>Regarding the 'cnst_para*' used in put_vc1_mspel_mc_h_v_msa, maybe you can 
follow the way 'shift_value' is used.
>It may eliminate some 'if' clauses, especially in 'ff_put_vc1_mspel_mc ## hmode 
## vmode ## _16_msa'.

Thx, will fix in v2.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avcodec/mips: Fixed four warnings in vc1dsp

2019-10-11 Thread gxw
Change the stride argument to ptrdiff_t in the following functions:
ff_put_no_rnd_vc1_chroma_mc8_mmi, ff_put_no_rnd_vc1_chroma_mc4_mmi,
ff_avg_no_rnd_vc1_chroma_mc8_mmi, ff_avg_no_rnd_vc1_chroma_mc4_mmi.
---
 libavcodec/mips/vc1dsp_mips.h | 8 
 libavcodec/mips/vc1dsp_mmi.c  | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h
index 5f72e60..5897dae 100644
--- a/libavcodec/mips/vc1dsp_mips.h
+++ b/libavcodec/mips/vc1dsp_mips.h
@@ -180,16 +180,16 @@ void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, 
int pq);
 
 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y);
+  ptrdiff_t stride, int h, int x, int y);
 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y);
+  ptrdiff_t stride, int h, int x, int y);
 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y);
+  ptrdiff_t stride, int h, int x, int y);
 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y);
+  ptrdiff_t stride, int h, int x, int y);
 
 void ff_vc1_inv_trans_8x8_msa(int16_t block[64]);
 void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t 
*block);
diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c
index db314de..9837868 100644
--- a/libavcodec/mips/vc1dsp_mmi.c
+++ b/libavcodec/mips/vc1dsp_mmi.c
@@ -2241,7 +2241,7 @@ DECLARE_FUNCTION(3, 3)
 
 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y)
+  ptrdiff_t stride, int h, int x, int y)
 {
 const int A = (8 - x) * (8 - y);
 const int B = (x) * (8 - y);
@@ -2296,7 +2296,7 @@ void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* 
align 8 */,
 
 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y)
+  ptrdiff_t stride, int h, int x, int y)
 {
 const int A = (8 - x) * (8 - y);
 const int B = (x) * (8 - y);
@@ -2349,7 +2349,7 @@ void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* 
align 8 */,
 
 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y)
+  ptrdiff_t stride, int h, int x, int y)
 {
 const int A = (8 - x) * (8 - y);
 const int B = (x) * (8 - y);
@@ -2407,7 +2407,7 @@ void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* 
align 8 */,
 
 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
   uint8_t *src /* align 1 */,
-  int stride, int h, int x, int y)
+  ptrdiff_t stride, int h, int x, int y)
 {
 const int A = (8 - x) * (8 - y);
 const int B = (x) * (8 - y);
-- 
2.1.0


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avcodec/mips: msa optimizations for vc1dsp

2019-10-11 Thread gxw
WMV3 decoding performance has improved from 3.66x to 5.23x (tested on 3A4000).
---
 libavcodec/mips/Makefile|   1 +
 libavcodec/mips/vc1dsp_init_mips.c  |  30 ++-
 libavcodec/mips/vc1dsp_mips.h   |  23 ++
 libavcodec/mips/vc1dsp_msa.c| 483 
 libavutil/mips/generic_macros_msa.h |   3 +
 5 files changed, 536 insertions(+), 4 deletions(-)
 create mode 100644 libavcodec/mips/vc1dsp_msa.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index c5b54d5..b4993f6 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -89,3 +89,4 @@ MMI-OBJS-$(CONFIG_WMV2DSP)+= 
mips/wmv2dsp_mmi.o
 MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
 MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
 MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
+MSA-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_msa.o
diff --git a/libavcodec/mips/vc1dsp_init_mips.c 
b/libavcodec/mips/vc1dsp_init_mips.c
index 4adc9e1..c0007ff 100644
--- a/libavcodec/mips/vc1dsp_init_mips.c
+++ b/libavcodec/mips/vc1dsp_init_mips.c
@@ -23,6 +23,10 @@
 #include "vc1dsp_mips.h"
 #include "config.h"
 
+#define FN_ASSIGN(OP, X, Y, INSN) \
+dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##INSN; \
+dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
+
 #if HAVE_MMI
 static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
 {
@@ -49,10 +53,6 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
 dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi;
 dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi;
 
-#define FN_ASSIGN(OP, X, Y, INSN) \
-dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##INSN; \
-dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = 
ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
-
 FN_ASSIGN(put_, 0, 0, _mmi);
 FN_ASSIGN(put_, 0, 1, _mmi);
 FN_ASSIGN(put_, 0, 2, _mmi);
@@ -100,9 +100,31 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
 }
 #endif /* HAVE_MMI */
 
+#if HAVE_MSA
+static av_cold void vc1dsp_init_msa(VC1DSPContext *dsp)
+{
+dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_msa;
+dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_msa;
+dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_msa;
+
+FN_ASSIGN(put_, 1, 1, _msa);
+FN_ASSIGN(put_, 1, 2, _msa);
+FN_ASSIGN(put_, 1, 3, _msa);
+FN_ASSIGN(put_, 2, 1, _msa);
+FN_ASSIGN(put_, 2, 2, _msa);
+FN_ASSIGN(put_, 2, 3, _msa);
+FN_ASSIGN(put_, 3, 1, _msa);
+FN_ASSIGN(put_, 3, 2, _msa);
+FN_ASSIGN(put_, 3, 3, _msa);
+}
+#endif /* HAVE_MSA */
+
 av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp)
 {
 #if HAVE_MMI
 vc1dsp_init_mmi(dsp);
 #endif /* HAVE_MMI */
+#if HAVE_MSA
+vc1dsp_init_msa(dsp);
+#endif /* HAVE_MSA */
 }
diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h
index 0db85fa..5f72e60 100644
--- a/libavcodec/mips/vc1dsp_mips.h
+++ b/libavcodec/mips/vc1dsp_mips.h
@@ -191,4 +191,27 @@ void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* 
align 8 */,
   uint8_t *src /* align 1 */,
   int stride, int h, int x, int y);
 
+void ff_vc1_inv_trans_8x8_msa(int16_t block[64]);
+void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t 
*block);
+void ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t 
*block);
+
+#define FF_PUT_VC1_MSPEL_MC_MSA(hmode, vmode) \
+void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst,  \
+  const uint8_t *src, \
+  ptrdiff_t stride, int rnd); \
+void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst,   \
+  const uint8_t *src, \
+  ptrdiff_t stride, int rnd);
+
+FF_PUT_VC1_MSPEL_MC_MSA(1, 1);
+FF_PUT_VC1_MSPEL_MC_MSA(1, 2);
+FF_PUT_VC1_MSPEL_MC_MSA(1, 3);
+
+FF_PUT_VC1_MSPEL_MC_MSA(2, 1);
+FF_PUT_VC1_MSPEL_MC_MSA(2, 2);
+FF_PUT_VC1_MSPEL_MC_MSA(2, 3);
+
+FF_PUT_VC1_MSPEL_MC_MSA(3, 1);
+FF_PUT_VC1_MSPEL_MC_MSA(3, 2);
+FF_PUT_VC1_MSPEL_MC_MSA(3, 3);
 #endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */
diff --git a/libavcodec/mips/vc1dsp_msa.c b/libavcodec/mips/vc1dsp_msa.c
new file mode 100644
index 000..1619ea4
--- /dev/null
+++ b/libavcodec/mips/vc1dsp_msa.c
@@ -0,0 +1,483 @@
+/*
+ * Loongson SIMD optimized vc1dsp
+ *
+ * Copyright (c) 2019 Loongson Technology Corporation Limited
+ *gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Fo

[FFmpeg-devel] [PATCH] avcodec/mips: simplified code in vp3dsp_idct_msa.c.

2019-09-15 Thread gxw
Use the ADD8 macro to replace runs of consecutive addition operations.
---
 libavcodec/mips/vp3dsp_idct_msa.c   | 80 -
 libavutil/mips/generic_macros_msa.h |  6 +++
 2 files changed, 22 insertions(+), 64 deletions(-)

diff --git a/libavcodec/mips/vp3dsp_idct_msa.c 
b/libavcodec/mips/vp3dsp_idct_msa.c
index 90c578f..e4cd377 100644
--- a/libavcodec/mips/vp3dsp_idct_msa.c
+++ b/libavcodec/mips/vp3dsp_idct_msa.c
@@ -178,14 +178,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t 
*input, int type)
c0, c1, c2, c3);
 ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
c4, c5, c6, c7);
-A += c0;
-B += c7;
-C += c1;
-D += c2;
-E += c3;
-F += c4;
-G += c5;
-H += c6;
+ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6,
+ A, B, C, D, E, F, G, H);
 }
 CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
 sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
@@ -208,14 +202,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t 
*input, int type)
 Gd = Bdd;
 Hd = Bdd;
 } else {
-Ad = Add + c0;
-Bd = Add + c1;
-Cd = Add + c2;
-Dd = Add + c3;
-Ed = Add + c4;
-Fd = Add + c5;
-Gd = Add + c6;
-Hd = Add + c7;
+ADD8(Add, c0, Add, c1, Add, c2, Add, c3, Add, c4, Add, c5, Add, c6,
+ Add, c7, Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
 CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
 }
 Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
@@ -235,14 +223,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t 
*input, int type)
 F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
 G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
 H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
-r0_r = Ad + A;
-r1_r = Bd + C;
-r2_r = Cd + D;
-r3_r = Dd + E;
-r0_l = Ed + F;
-r1_l = Fd + G;
-r2_l = Gd + H;
-r3_l = Hd + B;
+ADD8(Ad, A, Bd, C, Cd, D, Dd, E, Ed, F, Fd, G, Gd, H, Hd, B,
+ r0_r, r1_r, r2_r, r3_r, r0_l, r1_l, r2_l, r3_l);
 
 /* Row 4 to 7 */
 TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
@@ -286,14 +268,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t 
*input, int type)
c0, c1, c2, c3);
 ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
c4, c5, c6, c7);
-A += c0;
-B += c7;
-C += c1;
-D += c2;
-E += c3;
-F += c4;
-G += c5;
-H += c6;
+ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6,
+ A, B, C, D, E, F, G, H);
 }
 CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
 sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
@@ -316,14 +292,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t 
*input, int type)
 Gd = Bdd;
 Hd = Bdd;
 } else {
-Ad = Add + c0;
-Bd = Add + c1;
-Cd = Add + c2;
-Dd = Add + c3;
-Ed = Add + c4;
-Fd = Add + c5;
-Gd = Add + c6;
-Hd = Add + c7;
+ADD8(Add, c0, Add, c1, Add, c2, Add, c3, Add, c4, Add, c5, Add, c6,
+ Add, c7, Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
 CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
 }
 Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
@@ -343,14 +313,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t 
*input, int type)
 F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
 G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
 H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
-r4_r = Ad + A;
-r5_r = Bd + C;
-r6_r = Cd + D;
-r7_r = Dd + E;
-r4_l = Ed + F;
-r5_l = Fd + G;
-r6_l = Gd + H;
-r7_l = Hd + B;
+ADD8(Ad, A, Bd, C, Cd, D, Dd, E, Ed, F, Fd, G, Gd, H, Hd, B,
+ r4_r, r5_r, r6_r, r7_r, r4_l, r5_l, r6_l, r7_l);
 VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
 VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
 VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
@@ -400,14 +364,8 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t 
line_size, int16_t *block)
e0, e1, e2, e3);
 ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
e4, e5, e6, e7);
-e0 += dc;
-e1 += dc;
-e2 += dc;
-e3 += dc;
-e4 += dc;
-e5 += dc;
-e6 += dc;
-e7 += dc;
+ADD8(e0, dc, e1, dc, e2, dc, e3, dc, e4, dc, e5, dc, e6, dc, e7, dc,
+ e0, e1, e2, e3, e4, e5, e6, e7);
 CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
 
 /* Left part */
@@ -415,14 +373,8 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t 
line_size, int16_t *block)
r0, r1, r2, r3);
 ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
r4, r5, r6, r7);
-r0 += dc;
-r1 += dc;
-r2 += dc;
-r3 += dc;
-r4 += dc;
-r5 += dc;
-r6 

[FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.

2019-08-07 Thread gxw
Details of the changes are as follows:
1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
   the source vector.
2. Refine the implementation of the macros 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
   VP8 decoding performance has improved by about 1.1% (from 7.03x to 7.11x).
   H264 decoding performance has improved by about 0.5% (from 4.35x to 4.37x).
   Theora decoding performance has improved by about 0.7% (from 5.79x to 5.83x).
3. Remove the redundant macros 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
   instead, because there is no difference in the effect of these two macros.
---
 libavcodec/mips/h264dsp_msa.c   |  39 +--
 libavcodec/mips/h264idct_msa.c  |   7 +-
 libavcodec/mips/hevc_idct_msa.c |  21 +++---
 libavcodec/mips/hevc_lpf_sao_msa.c  | 132 ++--
 libavcodec/mips/hevc_mc_bi_msa.c|  44 ++--
 libavcodec/mips/hevc_mc_biw_msa.c   |  56 +++
 libavcodec/mips/hevc_mc_uniw_msa.c  |  40 +--
 libavcodec/mips/hevcpred_msa.c  |   8 +--
 libavcodec/mips/idctdsp_msa.c   |   9 +--
 libavcodec/mips/qpeldsp_msa.c   |   4 +-
 libavcodec/mips/simple_idct_msa.c   |  98 +++---
 libavcodec/mips/vp3dsp_idct_msa.c   |  68 +++
 libavcodec/mips/vp8_idct_msa.c  |   5 +-
 libavcodec/mips/vp9_idct_msa.c  |  10 ++-
 libavutil/mips/generic_macros_msa.h | 119 +---
 15 files changed, 280 insertions(+), 380 deletions(-)

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index c4ba8c4..dd05982 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
-CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
 PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 
 SRA_4V(temp0, temp1, temp2, temp3, denom);
 SRA_4V(temp4, temp5, temp6, temp7, denom);
-CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
-CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
 dst0, dst1, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 temp = p1_or_q1_org_in << 1;  \
 clip3 = clip3 - temp; \
 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\
-clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);  \
+CLIP_SH(clip3, negate_tc_in, tc_in);  \
 p1_or_q1_out = p1_or_q1_org_in + clip3;   \
 }
 
@@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;  \
 delta >>= 3;\
 \
-delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+CLIP_SH(delta, negate_threshold_in, threshold_in);  \
 \
 p0_or_q0_out = p0_or_q0_org_in + delta; \
 q0_or_p0_out = q0_or_p0_org_in - delta; \
@@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;   \
 delta = __msa_srari_h(delta, 3); \
  \
-delta = CLIP_SH(delta, -tc, tc); \
+CLIP_SH(delta, -tc, tc); \
  \
 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\
  \
@@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 q0_sub_p0 <<= 2;   \
 delta = q0_sub_p0 + p1_sub_q1; \
 delta = __msa_srari_h(delta, 3);   \
-

[FFmpeg-devel] [PATCH v3] avutil/mips: refine msa macros CLIP_*.

2019-08-07 Thread gxw
Details of the changes are as follows:
1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in
   the source vector.
2. Refine the implementation of the macros 'CLIP_SH_0_255' and 'CLIP_SW_0_255'.
   VP8 decoding performance has improved by about 1.1% (from 7.03x to 7.11x).
3. Remove the redundant macros 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255'
   instead, because there is no difference in the effect of these two macros.
---
 libavcodec/mips/h264dsp_msa.c   |  39 +--
 libavcodec/mips/h264idct_msa.c  |   7 +-
 libavcodec/mips/hevc_idct_msa.c |  21 +++---
 libavcodec/mips/hevc_lpf_sao_msa.c  | 132 ++--
 libavcodec/mips/hevc_mc_bi_msa.c|  44 ++--
 libavcodec/mips/hevc_mc_biw_msa.c   |  56 +++
 libavcodec/mips/hevc_mc_uniw_msa.c  |  40 +--
 libavcodec/mips/hevcpred_msa.c  |   8 +--
 libavcodec/mips/idctdsp_msa.c   |   9 +--
 libavcodec/mips/qpeldsp_msa.c   |   4 +-
 libavcodec/mips/simple_idct_msa.c   |  98 +++---
 libavcodec/mips/vp3dsp_idct_msa.c   |  68 +++
 libavcodec/mips/vp8_idct_msa.c  |   5 +-
 libavcodec/mips/vp9_idct_msa.c  |  10 ++-
 libavutil/mips/generic_macros_msa.h | 119 +---
 15 files changed, 280 insertions(+), 380 deletions(-)

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index c4ba8c4..dd05982 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
-CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
 PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 
 SRA_4V(temp0, temp1, temp2, temp3, denom);
 SRA_4V(temp4, temp5, temp6, temp7, denom);
-CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
-CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
 dst0, dst1, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 temp = p1_or_q1_org_in << 1;  \
 clip3 = clip3 - temp; \
 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\
-clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);  \
+CLIP_SH(clip3, negate_tc_in, tc_in);  \
 p1_or_q1_out = p1_or_q1_org_in + clip3;   \
 }
 
@@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;  \
 delta >>= 3;\
 \
-delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+CLIP_SH(delta, negate_threshold_in, threshold_in);  \
 \
 p0_or_q0_out = p0_or_q0_org_in + delta; \
 q0_or_p0_out = q0_or_p0_org_in - delta; \
@@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;   \
 delta = __msa_srari_h(delta, 3); \
  \
-delta = CLIP_SH(delta, -tc, tc); \
+CLIP_SH(delta, -tc, tc); \
  \
 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\
  \
@@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 q0_sub_p0 <<= 2;   \
 delta = q0_sub_p0 + p1_sub_q1; \
 delta = __msa_srari_h(delta, 3);   \
-delta = CLIP_SH(delta, -tc, tc);   \
+CLIP_SH(delta, -tc, tc);   \

[FFmpeg-devel] [PATCH v2] avutil/mips: refine msa macros CLIP_*.

2019-08-07 Thread gxw
Details of the changes are as follows:
1. Remove the local variable out_m in CLIP_SH. Results are assigned
   to the input vector, which reduces data copying.
2. Reimplement the macros CLIP_SH/Wn_0_255. VP8 decoding performance
   has improved by 1.1% (7.03x to 7.11x, tested on Loongson 3A4000).
3. Remove CLIP_SH/Wn_0_255_MAX_SATU. CLIP_SH/Wn_0_255_MAX_SATU and
   CLIP_SH/Wn_0_255 have the same function. It is not necessary to
   keep both; use CLIP_SH/Wn_0_255 instead.
---
 libavcodec/mips/h264dsp_msa.c   |  39 +--
 libavcodec/mips/h264idct_msa.c  |   7 +-
 libavcodec/mips/hevc_idct_msa.c |  21 +++---
 libavcodec/mips/hevc_lpf_sao_msa.c  | 132 ++--
 libavcodec/mips/hevc_mc_bi_msa.c|  44 ++--
 libavcodec/mips/hevc_mc_biw_msa.c   |  56 +++
 libavcodec/mips/hevc_mc_uniw_msa.c  |  40 +--
 libavcodec/mips/hevcpred_msa.c  |   8 +--
 libavcodec/mips/idctdsp_msa.c   |   9 +--
 libavcodec/mips/qpeldsp_msa.c   |   4 +-
 libavcodec/mips/simple_idct_msa.c   |  98 +++---
 libavcodec/mips/vp3dsp_idct_msa.c   |  68 +++
 libavcodec/mips/vp8_idct_msa.c  |   5 +-
 libavcodec/mips/vp9_idct_msa.c  |  10 ++-
 libavutil/mips/generic_macros_msa.h | 119 +---
 15 files changed, 280 insertions(+), 380 deletions(-)

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index c4ba8c4..dd05982 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
-CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
 PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 
 SRA_4V(temp0, temp1, temp2, temp3, denom);
 SRA_4V(temp4, temp5, temp6, temp7, denom);
-CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
-CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
 dst0, dst1, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 temp = p1_or_q1_org_in << 1;  \
 clip3 = clip3 - temp; \
 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\
-clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);  \
+CLIP_SH(clip3, negate_tc_in, tc_in);  \
 p1_or_q1_out = p1_or_q1_org_in + clip3;   \
 }
 
@@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;  \
 delta >>= 3;\
 \
-delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+CLIP_SH(delta, negate_threshold_in, threshold_in);  \
 \
 p0_or_q0_out = p0_or_q0_org_in + delta; \
 q0_or_p0_out = q0_or_p0_org_in - delta; \
@@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;   \
 delta = __msa_srari_h(delta, 3); \
  \
-delta = CLIP_SH(delta, -tc, tc); \
+CLIP_SH(delta, -tc, tc); \
  \
 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\
  \
@@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 q0_sub_p0 <<= 2;   \
 delta = q0_sub_p0 + p1_sub_q1; \
 delta = __msa_srari_h(delta, 3);   \
-delta = CLIP_SH(delta, -tc, tc);   \
+CLIP_SH(delta, -tc, tc);

[FFmpeg-devel] [PATCH] avutil/mips: refine msa macros CLIP_*.

2019-08-06 Thread gxw
Details of the changes are as follows:
1. Refine CLIP_SH; results are now stored in place in the input vectors.
2. Reimplement the macros CLIP_SH/Wn_0_255. The new macros are more
   efficient than before.
3. Remove CLIP_SH/Wn_0_255_MAX_SATU. CLIP_SH/Wn_0_255_MAX_SATU and
   CLIP_SH/Wn_0_255 have the same function. It is not necessary to
   keep both; use CLIP_SH/Wn_0_255 instead.
---
 libavcodec/mips/h264dsp_msa.c   |  39 +--
 libavcodec/mips/h264idct_msa.c  |   7 +-
 libavcodec/mips/hevc_idct_msa.c |  21 +++---
 libavcodec/mips/hevc_lpf_sao_msa.c  | 132 ++--
 libavcodec/mips/hevc_mc_bi_msa.c|  44 ++--
 libavcodec/mips/hevc_mc_biw_msa.c   |  56 +++
 libavcodec/mips/hevc_mc_uniw_msa.c  |  40 +--
 libavcodec/mips/hevcpred_msa.c  |   8 +--
 libavcodec/mips/idctdsp_msa.c   |   9 +--
 libavcodec/mips/qpeldsp_msa.c   |   4 +-
 libavcodec/mips/simple_idct_msa.c   |  98 +++---
 libavcodec/mips/vp3dsp_idct_msa.c   |  68 +++
 libavcodec/mips/vp8_idct_msa.c  |   5 +-
 libavcodec/mips/vp9_idct_msa.c  |  10 ++-
 libavutil/mips/generic_macros_msa.h | 119 +---
 15 files changed, 280 insertions(+), 380 deletions(-)

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index c4ba8c4..dd05982 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
-CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
-CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
 PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 
 SRA_4V(temp0, temp1, temp2, temp3, denom);
 SRA_4V(temp4, temp5, temp6, temp7, denom);
-CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
-CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
 dst0, dst1, dst2, dst3);
 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
@@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 temp = p1_or_q1_org_in << 1;  \
 clip3 = clip3 - temp; \
 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\
-clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);  \
+CLIP_SH(clip3, negate_tc_in, tc_in);  \
 p1_or_q1_out = p1_or_q1_org_in + clip3;   \
 }
 
@@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;  \
 delta >>= 3;\
 \
-delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+CLIP_SH(delta, negate_threshold_in, threshold_in);  \
 \
 p0_or_q0_out = p0_or_q0_org_in + delta; \
 q0_or_p0_out = q0_or_p0_org_in - delta; \
@@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 delta = q0_sub_p0 + p1_sub_q1;   \
 delta = __msa_srari_h(delta, 3); \
  \
-delta = CLIP_SH(delta, -tc, tc); \
+CLIP_SH(delta, -tc, tc); \
  \
 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\
  \
@@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
 q0_sub_p0 <<= 2;   \
 delta = q0_sub_p0 + p1_sub_q1; \
 delta = __msa_srari_h(delta, 3);   \
-delta = CLIP_SH(delta, -tc, tc);   \
+CLIP_SH(delta, -tc, tc);   \
\
 

[FFmpeg-devel] [PATCH v2] avutil/mips: refactor msa SLDI_Bn_0 and SLDI_Bn macros.

2019-08-06 Thread gxw
Details of the changes are as follows:
1. The previous order of the parameters was irregular and difficult to
   understand. Adjust the order of the parameters according to the
   rule: (RTYPE, input registers, input mask/input index/..., output registers).
   Most of the existing msa macros follow this rule.
2. Remove the redundant macro SLDI_Bn_0 and use SLDI_Bn instead.
---
 libavcodec/mips/h264dsp_msa.c   |  9 ++--
 libavcodec/mips/h264qpel_msa.c  | 64 ++--
 libavcodec/mips/hevc_lpf_sao_msa.c  | 70 ---
 libavcodec/mips/hevcpred_msa.c  | 30 ++---
 libavcodec/mips/hpeldsp_msa.c   | 66 ++---
 libavcodec/mips/me_cmp_msa.c|  8 ++--
 libavcodec/mips/qpeldsp_msa.c   | 84 ++---
 libavcodec/mips/vp8_mc_msa.c|  4 +-
 libavcodec/mips/vp9_idct_msa.c  |  3 +-
 libavcodec/mips/vp9_lpf_msa.c   |  3 +-
 libavcodec/mips/vp9_mc_msa.c| 16 +++
 libavutil/mips/generic_macros_msa.h | 80 ++-
 12 files changed, 222 insertions(+), 215 deletions(-)

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 89fe399..c4ba8c4 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -620,7 +620,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
  \
 out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
 out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);\
-SLDI_B2_0_UB(out1, out2, out2, out3, 2); \
+SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3);   \
 }
 
 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
@@ -1025,7 +1025,8 @@ static void 
avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
 
 ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
 ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
-SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
+SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
+   8, src0, src2, src4, src7);
 
 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
@@ -1116,10 +1117,10 @@ static void 
avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
 ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
 
 ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
-SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
+SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
 dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
 dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
-SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
+SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
 
 out0 = __msa_copy_u_w((v4i32) dst0, 0);
 out1 = __msa_copy_u_h((v8i16) dst0, 2);
diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index df7e3e2..e435c18 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -790,8 +790,8 @@ void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const 
uint8_t *src,
  minus5b, res4, res5, res6, res7);
 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
  plus20b, res4, res5, res6, res7);
-SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
-SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
+SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
+   src0, src2, src4, src6);
 SRARI_H4_SH(res0, res1, res2, res3, 5);
 SRARI_H4_SH(res4, res5, res6, res7, 5);
 SAT_SH4_SH(res0, res1, res2, res3, 7);
@@ -858,8 +858,8 @@ void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const 
uint8_t *src,
  minus5b, res4, res5, res6, res7);
 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
  plus20b, res4, res5, res6, res7);
-SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
-SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
+SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
+   src0, src2, src4, src6);
 SRARI_H4_SH(res0, res1, res2, res3, 5);
 SRARI_H4_SH(res4, res5, res6, res7, 5);
 SAT_SH4_SH(res0, res1, res2, res3, 7);
@@ -911,10 +911,10 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const 
uint8_t *src,
 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
  res4, res5, res6, res7);
-SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
-SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
-SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
-SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
+SLDI_B4_SB(src0, 

[FFmpeg-devel] [PATCH] avutil/mips: refactor msa SLDI_Bn_0 and SLDI_Bn macros.

2019-08-05 Thread gxw
Details of the changes are as follows:
1. Modify the parameter order of SLDI_Bn. The previous order of
   parameters was difficult to understand.
2. Remove the redundant macro SLDI_Bn_0 and use SLDI_Bn instead.
---
 libavcodec/mips/h264dsp_msa.c   |  9 ++--
 libavcodec/mips/h264qpel_msa.c  | 64 ++--
 libavcodec/mips/hevc_lpf_sao_msa.c  | 70 ---
 libavcodec/mips/hevcpred_msa.c  | 30 ++---
 libavcodec/mips/hpeldsp_msa.c   | 66 ++---
 libavcodec/mips/me_cmp_msa.c|  8 ++--
 libavcodec/mips/qpeldsp_msa.c   | 84 ++---
 libavcodec/mips/vp8_mc_msa.c|  4 +-
 libavcodec/mips/vp9_idct_msa.c  |  3 +-
 libavcodec/mips/vp9_lpf_msa.c   |  3 +-
 libavcodec/mips/vp9_mc_msa.c| 16 +++
 libavutil/mips/generic_macros_msa.h | 80 ++-
 12 files changed, 222 insertions(+), 215 deletions(-)

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 89fe399..c4ba8c4 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -620,7 +620,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, 
int32_t stride,
  \
 out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
 out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);\
-SLDI_B2_0_UB(out1, out2, out2, out3, 2); \
+SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3);   \
 }
 
 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
@@ -1025,7 +1025,8 @@ static void 
avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
 
 ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
 ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
-SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
+SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
+   8, src0, src2, src4, src7);
 
 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
@@ -1116,10 +1117,10 @@ static void 
avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
 ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
 
 ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
-SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
+SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
 dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
 dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
-SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
+SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
 
 out0 = __msa_copy_u_w((v4i32) dst0, 0);
 out1 = __msa_copy_u_h((v8i16) dst0, 2);
diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index df7e3e2..e435c18 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -790,8 +790,8 @@ void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const 
uint8_t *src,
  minus5b, res4, res5, res6, res7);
 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
  plus20b, res4, res5, res6, res7);
-SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
-SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
+SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
+   src0, src2, src4, src6);
 SRARI_H4_SH(res0, res1, res2, res3, 5);
 SRARI_H4_SH(res4, res5, res6, res7, 5);
 SAT_SH4_SH(res0, res1, res2, res3, 7);
@@ -858,8 +858,8 @@ void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const 
uint8_t *src,
  minus5b, res4, res5, res6, res7);
 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
  plus20b, res4, res5, res6, res7);
-SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
-SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
+SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
+   src0, src2, src4, src6);
 SRARI_H4_SH(res0, res1, res2, res3, 5);
 SRARI_H4_SH(res4, res5, res6, res7, 5);
 SAT_SH4_SH(res0, res1, res2, res3, 7);
@@ -911,10 +911,10 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const 
uint8_t *src,
 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
  res4, res5, res6, res7);
-SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
-SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
-SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
-SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
+SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
+   src0, src1, src2, src3);
+SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
+

Re: [FFmpeg-devel] [PATCH v2] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions

2019-02-25 Thread gxw

> 在 2019年2月24日,上午10:55,Shiyou Yin  写道:
> 
> 
> 
>> -Original Message-
>> From: ffmpeg-devel-boun...@ffmpeg.org 
>> <mailto:ffmpeg-devel-boun...@ffmpeg.org> 
>> [mailto:ffmpeg-devel-boun...@ffmpeg.org 
>> <mailto:ffmpeg-devel-boun...@ffmpeg.org>] On Behalf Of gxw
>> Sent: Thursday, February 21, 2019 8:39 PM
>> To: ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> Cc: gxw
>> Subject: [FFmpeg-devel] [PATCH v2] avcodec/mips: [loongson] mmi 
>> optimizations for VP9 put and avg
>> functions
>> 
>> VP9 decoding speed improved about 60.5%(from 38fps to 61fps, tested on 
>> loongson 3A3000).
>> ---
>> libavcodec/mips/Makefile   |   1 +
>> libavcodec/mips/vp9_mc_mmi.c   | 692 
>> +
>> libavcodec/mips/vp9dsp_init_mips.c |  42 +++
>> libavcodec/mips/vp9dsp_mips.h  |  50 +++
>> libavutil/mips/mmiutils.h  |  15 +
>> 5 files changed, 800 insertions(+)
>> create mode 100644 libavcodec/mips/vp9_mc_mmi.c
>> 
>> diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
>> index c827649..c5b54d5 100644
>> --- a/libavcodec/mips/Makefile
>> +++ b/libavcodec/mips/Makefile
>> @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= 
>> mips/vc1dsp_mmi.o
>> MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o
>> MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
>> MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
>> +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
>> diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c
>> new file mode 100644
>> index 000..58a920b
>> --- /dev/null
>> +++ b/libavcodec/mips/vp9_mc_mmi.c
>> @@ -0,0 +1,692 @@
>> +/*
>> + * Copyright (c) 2019 gxw 
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
>> USA
>> + */
>> +
>> +#include "libavcodec/vp9dsp.h"
>> +#include "libavutil/mips/mmiutils.h"
>> +#include "vp9dsp_mips.h"
>> +
>> +#define GET_DATA_H_MMI   \
>> +"pmaddhw%[ftmp4],%[ftmp4],   %[filter1]\n\t" \
>> +"pmaddhw%[ftmp5],%[ftmp5],   %[filter2]\n\t" \
>> +"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
>> +"punpckhwd  %[ftmp5],%[ftmp4],   %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
>> +"pmaddhw%[ftmp6],%[ftmp6],   %[filter1]\n\t" \
>> +"pmaddhw%[ftmp7],%[ftmp7],   %[filter2]\n\t" \
>> +"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
>> +"punpckhwd  %[ftmp7],%[ftmp6],   %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
>> +"punpcklwd  %[srcl], %[ftmp4],   %[ftmp6]  \n\t" \
>> +"pmaddhw%[ftmp8],%[ftmp8],   %[filter1]\n\t" \
>> +"pmaddhw%[ftmp9],%[ftmp9],   %[filter2]\n\t" \
>> +"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
>> +"punpckhwd  %[ftmp9],%[ftmp8],   %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
>> +"pmaddhw%[ftmp10],   %[ftmp10],  %[filter1]\n\t" \
>> +"pmaddhw%[ftmp11],   %[ftmp11],  %[filter2]\n\t" \
>> +"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
>> +"punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
>> +"punpcklwd

[FFmpeg-devel] [PATCH v3] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions

2019-02-25 Thread gxw
VP9 decoding speed improved by about 60.5% (from 38fps to 61fps, tested on
Loongson 3A3000).
---
 libavcodec/mips/Makefile   |   1 +
 libavcodec/mips/vp9_mc_mmi.c   | 628 +
 libavcodec/mips/vp9dsp_init_mips.c |  42 +++
 libavcodec/mips/vp9dsp_mips.h  |  50 +++
 libavutil/mips/mmiutils.h  |  15 +
 5 files changed, 736 insertions(+)
 create mode 100644 libavcodec/mips/vp9_mc_mmi.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index c827649..c5b54d5 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o
 MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o
 MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
 MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
+MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c
new file mode 100644
index 000..e7a8387
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_mmi.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2019 gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/mmiutils.h"
+#include "vp9dsp_mips.h"
+
+#define GET_DATA_H_MMI   \
+"pmaddhw%[ftmp4],%[ftmp4],   %[filter1]\n\t" \
+"pmaddhw%[ftmp5],%[ftmp5],   %[filter2]\n\t" \
+"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
+"punpckhwd  %[ftmp5],%[ftmp4],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[ftmp6],%[ftmp6],   %[filter1]\n\t" \
+"pmaddhw%[ftmp7],%[ftmp7],   %[filter2]\n\t" \
+"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
+"punpckhwd  %[ftmp7],%[ftmp6],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
+"punpcklwd  %[srcl], %[ftmp4],   %[ftmp6]  \n\t" \
+"pmaddhw%[ftmp8],%[ftmp8],   %[filter1]\n\t" \
+"pmaddhw%[ftmp9],%[ftmp9],   %[filter2]\n\t" \
+"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
+"punpckhwd  %[ftmp9],%[ftmp8],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
+"pmaddhw%[ftmp10],   %[ftmp10],  %[filter1]\n\t" \
+"pmaddhw%[ftmp11],   %[ftmp11],  %[filter2]\n\t" \
+"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
+"punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]  \n\t" \
+"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
+"punpcklwd  %[srch], %[ftmp8],   %[ftmp10] \n\t"
+
+#define GET_DATA_V_MMI   \
+"punpcklhw  %[srcl], %[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[srcl], %[srcl],%[filter10]   \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11] \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpckhhw  %[srch], %[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[srch], %[srch],%[filter10]   \n\t" \
+"punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+"paddw  %[srch], %[srch],%[ftmp12]

[FFmpeg-devel] [PATCH v2] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions

2019-02-21 Thread gxw
VP9 decoding speed improved by about 60.5% (from 38fps to 61fps, tested on
Loongson 3A3000).
---
 libavcodec/mips/Makefile   |   1 +
 libavcodec/mips/vp9_mc_mmi.c   | 692 +
 libavcodec/mips/vp9dsp_init_mips.c |  42 +++
 libavcodec/mips/vp9dsp_mips.h  |  50 +++
 libavutil/mips/mmiutils.h  |  15 +
 5 files changed, 800 insertions(+)
 create mode 100644 libavcodec/mips/vp9_mc_mmi.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index c827649..c5b54d5 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o
 MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o
 MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
 MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
+MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c
new file mode 100644
index 000..58a920b
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_mmi.c
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2019 gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/mmiutils.h"
+#include "vp9dsp_mips.h"
+
+#define GET_DATA_H_MMI   \
+"pmaddhw%[ftmp4],%[ftmp4],   %[filter1]\n\t" \
+"pmaddhw%[ftmp5],%[ftmp5],   %[filter2]\n\t" \
+"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
+"punpckhwd  %[ftmp5],%[ftmp4],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[ftmp6],%[ftmp6],   %[filter1]\n\t" \
+"pmaddhw%[ftmp7],%[ftmp7],   %[filter2]\n\t" \
+"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
+"punpckhwd  %[ftmp7],%[ftmp6],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
+"punpcklwd  %[srcl], %[ftmp4],   %[ftmp6]  \n\t" \
+"pmaddhw%[ftmp8],%[ftmp8],   %[filter1]\n\t" \
+"pmaddhw%[ftmp9],%[ftmp9],   %[filter2]\n\t" \
+"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
+"punpckhwd  %[ftmp9],%[ftmp8],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
+"pmaddhw%[ftmp10],   %[ftmp10],  %[filter1]\n\t" \
+"pmaddhw%[ftmp11],   %[ftmp11],  %[filter2]\n\t" \
+"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
+"punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]  \n\t" \
+"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
+"punpcklwd  %[srch], %[ftmp8],   %[ftmp10] \n\t"
+
+#define GET_DATA_V_MMI   \
+"punpcklhw  %[srcl], %[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[srcl], %[srcl],%[filter10]   \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11] \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpckhhw  %[srch], %[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[srch], %[srch],%[filter10]   \n\t" \
+"punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+"paddw  %[srch], %[srch],%[ftmp12]

Re: [FFmpeg-devel] [PATCH] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions

2019-02-20 Thread gxw

> 在 2019年2月21日,上午9:55,Shiyou Yin  写道:
> 
>> -Original Message-
>> From: ffmpeg-devel-boun...@ffmpeg.org 
>> <mailto:ffmpeg-devel-boun...@ffmpeg.org> 
>> [mailto:ffmpeg-devel-boun...@ffmpeg.org 
>> <mailto:ffmpeg-devel-boun...@ffmpeg.org>] On Behalf Of gxw
>> Sent: Tuesday, February 19, 2019 11:02 AM
>> To: ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> Cc: gxw
>> Subject: [FFmpeg-devel] [PATCH] avcodec/mips: [loongson] mmi optimizations 
>> for VP9 put and avg
>> functions
>> 
>> VP9 decoding speed improved about 109.3%(from 32fps to 67fps, tested on 
>> loongson 3A3000).
>> ---
>> libavcodec/mips/Makefile   |   1 +
>> libavcodec/mips/vp9_mc_mmi.c   | 680 
>> +
>> libavcodec/mips/vp9dsp_init_mips.c |  42 +++
>> libavcodec/mips/vp9dsp_mips.h  |  50 +++
>> 4 files changed, 773 insertions(+)
>> create mode 100644 libavcodec/mips/vp9_mc_mmi.c
>> 
>> diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
>> index c827649..c5b54d5 100644
>> --- a/libavcodec/mips/Makefile
>> +++ b/libavcodec/mips/Makefile
>> @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= 
>> mips/vc1dsp_mmi.o
>> MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o
>> MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
>> MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
>> +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
>> diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c
>> new file mode 100644
>> index 000..145bbff
>> --- /dev/null
>> +++ b/libavcodec/mips/vp9_mc_mmi.c
>> @@ -0,0 +1,680 @@
>> +/*
>> + * Copyright (c) 2019 gxw 
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
>> USA
>> + */
>> +
>> +#include "libavcodec/vp9dsp.h"
>> +#include "libavutil/mips/mmiutils.h"
>> +#include "vp9dsp_mips.h"
>> +
>> +#define GET_DATA_H_MMI   \
>> +"pmaddhw%[ftmp4],%[ftmp4],   %[filter1]\n\t" \
>> +"pmaddhw%[ftmp5],%[ftmp5],   %[filter2]\n\t" \
>> +"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
>> +"punpckhwd  %[ftmp5],%[ftmp4],   %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
>> +"pmaddhw%[ftmp6],%[ftmp6],   %[filter1]\n\t" \
>> +"pmaddhw%[ftmp7],%[ftmp7],   %[filter2]\n\t" \
>> +"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
>> +"punpckhwd  %[ftmp7],%[ftmp6],   %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
>> +"punpcklwd  %[srcl], %[ftmp4],   %[ftmp6]  \n\t" \
>> +"pmaddhw%[ftmp8],%[ftmp8],   %[filter1]\n\t" \
>> +"pmaddhw%[ftmp9],%[ftmp9],   %[filter2]\n\t" \
>> +"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
>> +"punpckhwd  %[ftmp9],%[ftmp8],   %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
>> +"pmaddhw%[ftmp10],   %[ftmp10],  %[filter1]\n\t" \
>> +"pmaddhw%[ftmp11],   %[ftmp11],  %[filter2]\n\t" \
>> +"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
>> +"punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]  \n\t" \
>> +"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
>> +"punpcklwd  %[srch], %[ftmp8],   %[ftmp10] \n\t"
>> 

[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions

2019-02-18 Thread gxw
VP9 decoding speed improved by about 109.3% (from 32fps to 67fps, tested on
Loongson 3A3000).
---
 libavcodec/mips/Makefile   |   1 +
 libavcodec/mips/vp9_mc_mmi.c   | 680 +
 libavcodec/mips/vp9dsp_init_mips.c |  42 +++
 libavcodec/mips/vp9dsp_mips.h  |  50 +++
 4 files changed, 773 insertions(+)
 create mode 100644 libavcodec/mips/vp9_mc_mmi.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index c827649..c5b54d5 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o
 MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o
 MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
 MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
+MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o
diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c
new file mode 100644
index 000..145bbff
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_mmi.c
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2019 gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/mmiutils.h"
+#include "vp9dsp_mips.h"
+
+#define GET_DATA_H_MMI   \
+"pmaddhw%[ftmp4],%[ftmp4],   %[filter1]\n\t" \
+"pmaddhw%[ftmp5],%[ftmp5],   %[filter2]\n\t" \
+"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
+"punpckhwd  %[ftmp5],%[ftmp4],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp4],%[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[ftmp6],%[ftmp6],   %[filter1]\n\t" \
+"pmaddhw%[ftmp7],%[ftmp7],   %[filter2]\n\t" \
+"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
+"punpckhwd  %[ftmp7],%[ftmp6],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp6],%[ftmp6],   %[ftmp7]  \n\t" \
+"punpcklwd  %[srcl], %[ftmp4],   %[ftmp6]  \n\t" \
+"pmaddhw%[ftmp8],%[ftmp8],   %[filter1]\n\t" \
+"pmaddhw%[ftmp9],%[ftmp9],   %[filter2]\n\t" \
+"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
+"punpckhwd  %[ftmp9],%[ftmp8],   %[ftmp0]  \n\t" \
+"paddw  %[ftmp8],%[ftmp8],   %[ftmp9]  \n\t" \
+"pmaddhw%[ftmp10],   %[ftmp10],  %[filter1]\n\t" \
+"pmaddhw%[ftmp11],   %[ftmp11],  %[filter2]\n\t" \
+"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
+"punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]  \n\t" \
+"paddw  %[ftmp10],   %[ftmp10],  %[ftmp11] \n\t" \
+"punpcklwd  %[srch], %[ftmp8],   %[ftmp10] \n\t"
+
+#define GET_DATA_V_MMI   \
+"punpcklhw  %[srcl], %[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[srcl], %[srcl],%[filter10]   \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11] \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+"paddw  %[srcl], %[srcl],%[ftmp12] \n\t" \
+"punpckhhw  %[srch], %[ftmp4],   %[ftmp5]  \n\t" \
+"pmaddhw%[srch], %[srch],%[filter10]   \n\t" \
+"punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]  \n\t" \
+"pmaddhw%[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+"paddw  %[srch], %[srch],%[ftmp12] \n\t" \
+

[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] optimize theora decoding with mmi.

2019-02-12 Thread gxw
Optimize Theora decoding with MMI in the following functions:
1. ff_vp3_idct_add_mmi
2. ff_vp3_idct_put_mmi
3. ff_vp3_idct_dc_add_mmi
4. ff_put_no_rnd_pixels_l2_mmi

Theora decoding speed improved by about 32% (from 88fps to 116fps, tested on
Loongson 3A3000).
---
 libavcodec/mips/Makefile   |   1 +
 libavcodec/mips/vp3dsp_idct_mmi.c  | 769 +
 libavcodec/mips/vp3dsp_init_mips.c |  14 +
 libavcodec/mips/vp3dsp_mips.h  |   6 +
 4 files changed, 790 insertions(+)
 create mode 100644 libavcodec/mips/vp3dsp_idct_mmi.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 3029872..c827649 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -87,3 +87,4 @@ MMI-OBJS-$(CONFIG_HPELDSP)+= 
mips/hpeldsp_mmi.o
 MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o
 MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o
 MMI-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_mmi.o
+MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
diff --git a/libavcodec/mips/vp3dsp_idct_mmi.c 
b/libavcodec/mips/vp3dsp_idct_mmi.c
new file mode 100644
index 000..c5c4cf3
--- /dev/null
+++ b/libavcodec/mips/vp3dsp_idct_mmi.c
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2018 gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp3dsp_mips.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mips/mmiutils.h"
+#include "libavutil/common.h"
+#include "libavcodec/rnd_avg.h"
+
+#define LOAD_CONST(dst, value)\
+"li %[tmp1],  "#value"  \n\t" \
+"dmtc1  %[tmp1],  "#dst"\n\t" \
+"pshufh "#dst",   "#dst", %[ftmp10] \n\t"
+
+static void idct_row_mmi(int16_t *input)
+{
+double ftmp[23];
+uint64_t tmp[2];
+__asm__ volatile (
+"xor%[ftmp10],  %[ftmp10],%[ftmp10] \n\t"
+LOAD_CONST(%[csth_1], 1)
+"li %[tmp0],0x02\n\t"
+"1: \n\t"
+/* Load input */
+"ldc1   %[ftmp0],   0x00(%[input])  \n\t"
+"ldc1   %[ftmp1],   0x10(%[input])  \n\t"
+"ldc1   %[ftmp2],   0x20(%[input])  \n\t"
+"ldc1   %[ftmp3],   0x30(%[input])  \n\t"
+"ldc1   %[ftmp4],   0x40(%[input])  \n\t"
+"ldc1   %[ftmp5],   0x50(%[input])  \n\t"
+"ldc1   %[ftmp6],   0x60(%[input])  \n\t"
+"ldc1   %[ftmp7],   0x70(%[input])  \n\t"
+LOAD_CONST(%[ftmp8], 64277)
+LOAD_CONST(%[ftmp9], 12785)
+"pmulhh %[A],   %[ftmp9], %[ftmp7]  \n\t"
+"pcmpgth%[C],   %[ftmp10],%[ftmp1]  \n\t"
+"or %[mask],%[C], %[csth_1] \n\t"
+"pmullh %[B],   %[ftmp1], %[mask]   \n\t"
+"pmulhuh%[B],   %[ftmp8], %[B]  \n\t"
+"pmullh %[B],   %[B], %[mask]   \n\t"
+"paddh  %[A],   %[A], %[B]  \n\t"
+"paddh  %[A],   %[A], %[C]  \n\t"
+"pcmpgth%[D],   %[ftmp10],%[ftmp7]  \n\t"
+"or %[mask],%[D], %[csth_1] \n\t"
+"pmullh %[ftmp7],   %[ftmp7], %[mask]   \n\t"
+"pmulhuh%[B],   %[ftmp8], %[ftmp7]  \n\t"
+"pmullh %[B],   %[B], %[mask]   \n\t"
+"pmulhh %[C],   %[ftmp9], %[ftmp1]  \n\t"
+"psubh  %[B],   %[C], %[B]  \n\t"
+"psubh  %[B],   %[B],  

[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] optimize theora decoding in vp3dsp.

2018-12-26 Thread gxw
Optimize Theora decoding with MSA in the following functions:
1. ff_vp3_idct_add_msa
2. ff_vp3_idct_put_msa
3. ff_vp3_idct_dc_add_msa
4. ff_vp3_v_loop_filter_msa
5. ff_vp3_h_loop_filter_msa
6. ff_put_no_rnd_pixels_l2_msa

Theora decoding speed improved by about 36% (from 22 fps to 30 fps, tested on
Loongson 2K1000).
---
 libavcodec/mips/Makefile   |   2 +
 libavcodec/mips/vp3dsp_idct_msa.c  | 662 +
 libavcodec/mips/vp3dsp_init_mips.c |  46 +++
 libavcodec/mips/vp3dsp_mips.h  |  37 +++
 libavcodec/vp3dsp.c|   2 +
 libavcodec/vp3dsp.h|   1 +
 6 files changed, 750 insertions(+)
 create mode 100644 libavcodec/mips/vp3dsp_idct_msa.c
 create mode 100644 libavcodec/mips/vp3dsp_init_mips.c
 create mode 100644 libavcodec/mips/vp3dsp_mips.h

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 1f659a0..3571207 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -22,6 +22,7 @@ OBJS-$(CONFIG_HEVC_DECODER)   += 
mips/hevcdsp_init_mips.o  \
  mips/hevcpred_init_mips.o
 OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9dsp_init_mips.o
 OBJS-$(CONFIG_VP8_DECODER)+= mips/vp8dsp_init_mips.o
+OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_init_mips.o
 OBJS-$(CONFIG_H264DSP)+= mips/h264dsp_init_mips.o
 OBJS-$(CONFIG_H264QPEL)   += mips/h264qpel_init_mips.o
 OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_init_mips.o
@@ -54,6 +55,7 @@ MSA-OBJS-$(CONFIG_VP9_DECODER)+= 
mips/vp9_mc_msa.o \
 MSA-OBJS-$(CONFIG_VP8_DECODER)+= mips/vp8_mc_msa.o \
  mips/vp8_idct_msa.o   \
  mips/vp8_lpf_msa.o
+MSA-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_msa.o
 MSA-OBJS-$(CONFIG_H264DSP)+= mips/h264dsp_msa.o\
  mips/h264idct_msa.o
 MSA-OBJS-$(CONFIG_H264QPEL)   += mips/h264qpel_msa.o
diff --git a/libavcodec/mips/vp3dsp_idct_msa.c 
b/libavcodec/mips/vp3dsp_idct_msa.c
new file mode 100644
index 000..5427ac5
--- /dev/null
+++ b/libavcodec/mips/vp3dsp_idct_msa.c
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2018 gxw 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp3dsp_mips.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavutil/intreadwrite.h"
+#include "libavcodec/rnd_avg.h"
+
+static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
+{
+v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
+v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
+  r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
+v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
+v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
+v16u8 sign_l;
+v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
+v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
+v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
+v4i32 sign_t;
+v16i8 zero = {0};
+v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+v4i32 cnst64277w = {64277, 64277, 64277, 64277};
+v4i32 cnst60547w = {60547, 60547, 60547, 60547};
+v4i32 cnst54491w = {54491, 54491, 54491, 54491};
+v4i32 cnst46341w = {46341, 46341, 46341, 46341};
+v4i32 cnst36410w = {36410, 36410, 36410, 36410};
+v4i32 cnst25080w = {25080, 25080, 25080, 25080};
+v4i32 cnst12785w = {12785, 12785, 12785, 12785};
+v4i32 cnst8w = {8, 8, 8, 8};
+v4i32 cnst2048w = {2048, 2048, 2048, 2048};
+v4i32 cnst128w = {128, 128, 128, 128};
+int nstride = stride;
+
+/* Extended input data */
+LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
+sign = __msa_clti_s_h(r0, 0);
+r0_r = (v4i32) __msa_ilvr_h(sign, r0);
+r0_l = (v4i32) __msa_ilvl_h(sign, r0);
+sign = __msa_clti_s_h(r1, 0);
+r1_r = (v4i32) __msa_ilvr_h(sign, r1);
+r1_l = (v4i32) __msa_ilvl_h(sign, r1);
+sign = __msa_clti_s_h(r2, 0);
+r2_r = (v4i32) __msa_ilvr_h(sign, r2);
+r2_l = (v4i32) __msa_ilvl_h(sign, r2);
+sign = __msa_clti_s_h(r3, 0);
+r3_r = (v4i32) __

[FFmpeg-devel] [PATCH v3] avcodec/mips: Fix failed case: hevc-conformance-AMP_A_Samsung_* when enable msa

2018-12-23 Thread gxw
AV_INPUT_BUFFER_PADDING_SIZE has been increased to 64, but the function
ff_hevc_sao_edge_filter_8_msa still hard-codes the old value of 32. Use
AV_INPUT_BUFFER_PADDING_SIZE directly so the two cannot diverge again.
Also use MAX_PB_SIZE directly instead of the literal 64. FATE tests pass.
---
 libavcodec/mips/hevc_lpf_sao_msa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c 
b/libavcodec/mips/hevc_lpf_sao_msa.c
index 5b5537a..adcafde 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -2630,7 +2630,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t 
*src,
int16_t *sao_offset_val,
int eo, int width, int height)
 {
-ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
+ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / 
sizeof(uint8_t);
 
 switch (eo) {
 case 0:
-- 
2.1.0


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2] avcodec/mips: Fix failed case: hevc-conformance-AMP_A_Samsung_* when enable msa

2018-12-19 Thread gxw
AV_INPUT_BUFFER_PADDING_SIZE has been increased to 64, but the function
ff_hevc_sao_edge_filter_8_msa still hard-codes the old value of 32. Use
AV_INPUT_BUFFER_PADDING_SIZE directly so the two cannot diverge again.
FATE tests pass.
---
 libavcodec/mips/hevc_lpf_sao_msa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c 
b/libavcodec/mips/hevc_lpf_sao_msa.c
index 5b5537a..b146bb1 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -2630,7 +2630,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t 
*src,
int16_t *sao_offset_val,
int eo, int width, int height)
 {
-ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
+ptrdiff_t stride_src = (2 * 64 + AV_INPUT_BUFFER_PADDING_SIZE) / 
sizeof(uint8_t);
 
 switch (eo) {
 case 0:
-- 
2.1.0


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] fix failed case: hevc-conformance-AMP_A_Samsung_* in loongson2k

2018-12-17 Thread gxw
AV_INPUT_BUFFER_PADDING_SIZE has been increased to 64, but the function
ff_hevc_sao_edge_filter_8_msa still hard-codes the old value of 32. Change the
hard-coded value to 64 to match.
FATE tests pass.
---
 libavcodec/mips/hevc_lpf_sao_msa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c 
b/libavcodec/mips/hevc_lpf_sao_msa.c
index 5b5537a..bb883d0 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -2630,7 +2630,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t 
*src,
int16_t *sao_offset_val,
int eo, int width, int height)
 {
-ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
+ptrdiff_t stride_src = (2 * 64 + 64) / sizeof(uint8_t);
 
 switch (eo) {
 case 0:
-- 
2.1.0


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel