[FFmpeg-devel] [PATCH] Add prefetch for mips

2017-07-12 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/Makefile|1 +
 libavcodec/mips/videodsp_init.c |   51 +++
 libavcodec/videodsp.c   |2 ++
 libavcodec/videodsp.h   |1 +
 4 files changed, 55 insertions(+)
 create mode 100644 libavcodec/mips/videodsp_init.c
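
For context on the hook being wired up here: VideoDSPContext.prefetch is
called by decoders ahead of motion compensation to pull reference lines
into cache. A minimal sketch of a caller, assuming only the prototype
added by this patch (the surrounding names are illustrative, not from
the patch):

    #include "libavcodec/videodsp.h"

    /* Warm the cache with the next four rows of a reference block
     * before the interpolation code reads them. */
    static void mc_block(VideoDSPContext *vdsp, uint8_t *ref,
                         ptrdiff_t stride)
    {
        vdsp->prefetch(ref, stride, 4);
        /* ... interpolate from ref here ... */
    }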

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 797df09..1f659a0 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -38,6 +38,7 @@ OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
 OBJS-$(CONFIG_MPEG4_DECODER)  += mips/xvididct_init_mips.o
 OBJS-$(CONFIG_VC1DSP) += mips/vc1dsp_init_mips.o
 OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_init_mips.o
+OBJS-$(CONFIG_VIDEODSP)   += mips/videodsp_init.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_msa.o\
  mips/hevc_mc_uni_msa.o\
  mips/hevc_mc_uniw_msa.o   \
diff --git a/libavcodec/mips/videodsp_init.c b/libavcodec/mips/videodsp_init.c
new file mode 100644
index 0000000..8170404
--- /dev/null
+++ b/libavcodec/mips/videodsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 Kaustubh Raste (kaustubh.ra...@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/videodsp.h"
+
+#if HAVE_MSA
+static void prefetch_mips(uint8_t *mem, ptrdiff_t stride, int h)
+{
+register const uint8_t *p = mem;
+
+__asm__ volatile (
+"1: \n\t"
+"pref  4,  0(%[p])  \n\t"
+"pref  4,  32(%[p]) \n\t"
+PTR_ADDIU"  %[h],  %[h], -1 \n\t"
+PTR_ADDU "  %[p],  %[p], %[stride]  \n\t"
+
+"bnez   %[h],  1b   \n\t"
+
+: [p] "+r" (p), [h] "+r" (h)
+: [stride] "r" (stride)
+);
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc)
+{
+#if HAVE_MSA
+ctx->prefetch = prefetch_mips;
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c
index ba618a7..ce9e9eb 100644
--- a/libavcodec/videodsp.c
+++ b/libavcodec/videodsp.c
@@ -52,4 +52,6 @@ av_cold void ff_videodsp_init(VideoDSPContext *ctx, int bpc)
 ff_videodsp_init_ppc(ctx, bpc);
 if (ARCH_X86)
 ff_videodsp_init_x86(ctx, bpc);
+if (ARCH_MIPS)
+ff_videodsp_init_mips(ctx, bpc);
 }
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h
index fc01a31..c0545f2 100644
--- a/libavcodec/videodsp.h
+++ b/libavcodec/videodsp.h
@@ -83,5 +83,6 @@ void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc);
+void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc);
 
 #endif /* AVCODEC_VIDEODSP_H */
-- 
1.7.9.5


[FFmpeg-devel] [PATCH] Add prefetch for mips

2017-07-11 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/Makefile|1 +
 libavcodec/mips/videodsp_mips.c |   42 +++
 libavcodec/videodsp.c   |2 ++
 libavcodec/videodsp.h   |1 +
 4 files changed, 46 insertions(+)
 create mode 100644 libavcodec/mips/videodsp_mips.c

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 797df09..e2a779c 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -38,6 +38,7 @@ OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
 OBJS-$(CONFIG_MPEG4_DECODER)  += mips/xvididct_init_mips.o
 OBJS-$(CONFIG_VC1DSP) += mips/vc1dsp_init_mips.o
 OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_init_mips.o
+OBJS-$(CONFIG_VIDEODSP)   += mips/videodsp_mips.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)   += mips/hevcdsp_msa.o\
  mips/hevc_mc_uni_msa.o\
  mips/hevc_mc_uniw_msa.o   \
diff --git a/libavcodec/mips/videodsp_mips.c b/libavcodec/mips/videodsp_mips.c
new file mode 100644
index 0000000..c2c8282
--- /dev/null
+++ b/libavcodec/mips/videodsp_mips.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 Kaustubh Raste (kaustubh.ra...@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/videodsp.h"
+
+#if HAVE_MSA
+static void prefetch_mips(uint8_t *mem, ptrdiff_t stride, int h)
+{
+register const uint8_t *p = mem;
+do {
+__asm__ volatile ("pref 4, 0(%[p])" : : [p] "r" (p));
+__asm__ volatile ("pref 4, 32(%[p])" : : [p] "r" (p));
+p += stride;
+} while(--h);
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc)
+{
+#if HAVE_MSA
+ctx->prefetch = prefetch_mips;
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c
index ba618a7..ce9e9eb 100644
--- a/libavcodec/videodsp.c
+++ b/libavcodec/videodsp.c
@@ -52,4 +52,6 @@ av_cold void ff_videodsp_init(VideoDSPContext *ctx, int bpc)
 ff_videodsp_init_ppc(ctx, bpc);
 if (ARCH_X86)
 ff_videodsp_init_x86(ctx, bpc);
+if (ARCH_MIPS)
+ff_videodsp_init_mips(ctx, bpc);
 }
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h
index fc01a31..c0545f2 100644
--- a/libavcodec/videodsp.h
+++ b/libavcodec/videodsp.h
@@ -83,5 +83,6 @@ void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc);
+void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc);
 
 #endif /* AVCODEC_VIDEODSP_H */
-- 
1.7.9.5


[FFmpeg-devel] [PATCH] libavutil/mips: Updated msa generic macros

2017-07-21 Thread kaustubh.raste
From: Kaustubh Raste 

Reduced MSA load-store code.
Removed inline asm of GP load-store for 64-bit.
Updated variable names in GP load-store macros for naming consistency.
Corrected macro descriptions.

Signed-off-by: Kaustubh Raste 
---
 libavutil/mips/generic_macros_msa.h |  629 ++-
 1 file changed, 245 insertions(+), 384 deletions(-)
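
The shape of the change is easiest to see in miniature. A sketch of the
two simplifications, mirroring the macros in the diff below (an
illustrative reduction, not the full header):

    /* One generic vector load; the typed variants only choose RTYPE,
     * so the former LD_B/LD_H/LD_W collapse into a single LD_V. */
    #define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
    #define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
    #define LD_SH(...) LD_V(v8i16, __VA_ARGS__)

    /* On 64-bit (__mips == 64), the former inline-asm "ld" GP load
     * becomes a plain C dereference the compiler can schedule. */
    #define LD(psrc)                                  \
    ( {                                               \
        uint64_t val_ld_m = *(uint64_t *)(psrc);      \
        val_ld_m;                                     \
    } )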

diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index 0a59619..61a8ee0 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -27,202 +27,163 @@
 #define ALIGNMENT   16
 #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
 
-#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
-#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
+#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
+#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
+#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
+#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
+
+#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
+#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
+#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
+#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
+#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
 
 #if (__mips_isa_rev >= 6)
-#define LW(psrc)   \
-( {\
-uint8_t *psrc_m = (uint8_t *) (psrc);  \
-uint32_t val_m;\
-   \
-__asm__ volatile ( \
-"lw  %[val_m],  %[psrc_m]  \n\t"   \
-   \
-: [val_m] "=r" (val_m) \
-: [psrc_m] "m" (*psrc_m)   \
-); \
-   \
-val_m; \
+#define LH(psrc)  \
+( {   \
+uint16_t val_lh_m = *(uint16_t *)(psrc);  \
+val_lh_m; \
+} )
+
+#define LW(psrc)  \
+( {   \
+uint32_t val_lw_m = *(uint32_t *)(psrc);  \
+val_lw_m; \
 } )
 
 #if (__mips == 64)
-#define LD(psrc)   \
-( {\
-uint8_t *psrc_m = (uint8_t *) (psrc);  \
-uint64_t val_m = 0;\
-   \
-__asm__ volatile ( \
-"ld  %[val_m],  %[psrc_m]  \n\t"   \
-   \
-: [val_m] "=r" (val_m) \
-: [psrc_m] "m" (*psrc_m)   \
-); \
-   \
-val_m; \
+#define LD(psrc)   \
+( {\
+uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
+val_ld_m;  \
 } )
 #else  // !(__mips == 64)
-#define LD(psrc)  \
-( {   \
-uint8_t *psrc_ld_m = (uint8_t *) (psrc);  \
-uint32_t val0_m, val1_m;  \
-uint64_t val_m = 0;   \
-  \
-val0_m = LW(psrc_ld_m);   \
-val1_m = LW(psrc_ld_m + 4);   \
- 

[FFmpeg-devel] [PATCH] libavcodec/mips: Improve avc dequant-idct luma dc msa function

2017-07-28 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264idct_msa.c |   66 +++-
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index 81e09e9..861befe 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -40,17 +40,20 @@ static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
  int32_t de_q_val)
 {
 #define DC_DEST_STRIDE 16
-int16_t out0, out1, out2, out3;
-v8i16 src0, src1, src2, src3;
+int16_t out0, out1, out2, out3, out4, out5, out6, out7;
+v8i16 src1, src3;
 v8i16 vec0, vec1, vec2, vec3;
+v8i16 tmp0, tmp1, tmp2, tmp3;
 v8i16 hres0, hres1, hres2, hres3;
 v8i16 vres0, vres1, vres2, vres3;
 v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
-v4i32 de_q_vec = __msa_fill_w(de_q_val);
+const v4i32 de_q_vec = __msa_fill_w(de_q_val);
+const v8i16 src0 = LD_SH(src);
+const v8i16 src2 = LD_SH(src + 8);
 
-LD4x4_SH(src, src0, src1, src2, src3);
-TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, src0, src1, src2, src3);
-BUTTERFLY_4(src0, src2, src3, src1, vec0, vec3, vec2, vec1);
+ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
+TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+BUTTERFLY_4(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
 BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
 TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
 BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
@@ -72,40 +75,35 @@ static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
 out1 = __msa_copy_s_h(vec0, 1);
 out2 = __msa_copy_s_h(vec0, 2);
 out3 = __msa_copy_s_h(vec0, 3);
-SH(out0, dst);
-SH(out1, (dst + 2 * DC_DEST_STRIDE));
-SH(out2, (dst + 8 * DC_DEST_STRIDE));
+out4 = __msa_copy_s_h(vec0, 4);
+out5 = __msa_copy_s_h(vec0, 5);
+out6 = __msa_copy_s_h(vec0, 6);
+out7 = __msa_copy_s_h(vec0, 7);
+SH(out0, (dst + 0  * DC_DEST_STRIDE));
+SH(out1, (dst + 2  * DC_DEST_STRIDE));
+SH(out2, (dst + 8  * DC_DEST_STRIDE));
 SH(out3, (dst + 10 * DC_DEST_STRIDE));
-dst += DC_DEST_STRIDE;
-
-out0 = __msa_copy_s_h(vec0, 4);
-out1 = __msa_copy_s_h(vec0, 5);
-out2 = __msa_copy_s_h(vec0, 6);
-out3 = __msa_copy_s_h(vec0, 7);
-SH(out0, dst);
-SH(out1, (dst + 2 * DC_DEST_STRIDE));
-SH(out2, (dst + 8 * DC_DEST_STRIDE));
-SH(out3, (dst + 10 * DC_DEST_STRIDE));
-dst += (3 * DC_DEST_STRIDE);
+SH(out4, (dst + 1  * DC_DEST_STRIDE));
+SH(out5, (dst + 3  * DC_DEST_STRIDE));
+SH(out6, (dst + 9  * DC_DEST_STRIDE));
+SH(out7, (dst + 11 * DC_DEST_STRIDE));
 
 out0 = __msa_copy_s_h(vec1, 0);
 out1 = __msa_copy_s_h(vec1, 1);
 out2 = __msa_copy_s_h(vec1, 2);
 out3 = __msa_copy_s_h(vec1, 3);
-SH(out0, dst);
-SH(out1, (dst + 2 * DC_DEST_STRIDE));
-SH(out2, (dst + 8 * DC_DEST_STRIDE));
-SH(out3, (dst + 10 * DC_DEST_STRIDE));
-dst += DC_DEST_STRIDE;
-
-out0 = __msa_copy_s_h(vec1, 4);
-out1 = __msa_copy_s_h(vec1, 5);
-out2 = __msa_copy_s_h(vec1, 6);
-out3 = __msa_copy_s_h(vec1, 7);
-SH(out0, dst);
-SH(out1, (dst + 2 * DC_DEST_STRIDE));
-SH(out2, (dst + 8 * DC_DEST_STRIDE));
-SH(out3, (dst + 10 * DC_DEST_STRIDE));
+out4 = __msa_copy_s_h(vec1, 4);
+out5 = __msa_copy_s_h(vec1, 5);
+out6 = __msa_copy_s_h(vec1, 6);
+out7 = __msa_copy_s_h(vec1, 7);
+SH(out0, (dst + 4  * DC_DEST_STRIDE));
+SH(out1, (dst + 6  * DC_DEST_STRIDE));
+SH(out2, (dst + 12 * DC_DEST_STRIDE));
+SH(out3, (dst + 14 * DC_DEST_STRIDE));
+SH(out4, (dst + 5  * DC_DEST_STRIDE));
+SH(out5, (dst + 7  * DC_DEST_STRIDE));
+SH(out6, (dst + 13 * DC_DEST_STRIDE));
+SH(out7, (dst + 15 * DC_DEST_STRIDE));
 
 #undef DC_DEST_STRIDE
 }
-- 
1.7.9.5


[FFmpeg-devel] [PATCH] libavcodec/mips: Optimize avc idct 4x4 for msa

2017-07-24 Thread kaustubh.raste
From: Kaustubh Raste 

Removed memset call and improved performance.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264idct_msa.c  |  104 +++
 libavutil/mips/generic_macros_msa.h |   18 ++
 2 files changed, 74 insertions(+), 48 deletions(-)
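
The memset removal works by folding the coefficient clearing into the
transform itself. A sketch of the pattern, assuming the MSA types and
macros from generic_macros_msa.h:

    /* Load all 16 coefficients as two v8i16 vectors, then immediately
     * store zeros back: two vector stores replace the separate
     * memset(src, 0, 16 * sizeof(dctcoef)) while the lines are still
     * hot in cache. */
    const v8i16 src0 = LD_SH(src);      /* coefficients 0..7  */
    const v8i16 src2 = LD_SH(src + 8);  /* coefficients 8..15 */
    const v8i16 zero = { 0 };
    ST_SH2(zero, zero, src, 8);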

diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index fac1e7a..81e09e9 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -36,48 +36,6 @@
 BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3);  \
 }
 
-static void avc_idct4x4_addblk_msa(uint8_t *dst, int16_t *src,
-   int32_t dst_stride)
-{
-v8i16 src0, src1, src2, src3;
-v8i16 hres0, hres1, hres2, hres3;
-v8i16 vres0, vres1, vres2, vres3;
-v8i16 zeros = { 0 };
-
-LD4x4_SH(src, src0, src1, src2, src3);
-AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
-TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
-AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
-SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
-ADDBLK_ST4x4_UB(vres0, vres1, vres2, vres3, dst, dst_stride);
-ST_SH2(zeros, zeros, src, 8);
-}
-
-static void avc_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
-  int32_t dst_stride)
-{
-int16_t dc;
-uint32_t src0, src1, src2, src3;
-v16u8 pred = { 0 };
-v16i8 out;
-v8i16 input_dc, pred_r, pred_l;
-
-dc = (src[0] + 32) >> 6;
-input_dc = __msa_fill_h(dc);
-src[0] = 0;
-
-LW4(dst, dst_stride, src0, src1, src2, src3);
-INSERT_W4_UB(src0, src1, src2, src3, pred);
-UNPCK_UB_SH(pred, pred_r, pred_l);
-
-pred_r += input_dc;
-pred_l += input_dc;
-
-CLIP_SH2_0_255(pred_r, pred_l);
-out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
-ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
 static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
  int32_t de_q_val)
 {
@@ -317,11 +275,45 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
 ST8x4_UB(dst2, dst3, dst, dst_stride);
 }
 
-void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src,
-  int32_t dst_stride)
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
 {
-avc_idct4x4_addblk_msa(dst, src, dst_stride);
-memset(src, 0, 16 * sizeof(dctcoef));
+uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
+v16i8 dst0_m = { 0 };
+v16i8 dst1_m = { 0 };
+v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
+v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
+const v8i16 src0 = LD_SH(src);
+const v8i16 src2 = LD_SH(src + 8);
+const v8i16 zero = { 0 };
+const uint8_t *dst1 = dst + dst_stride;
+const uint8_t *dst2 = dst + 2 * dst_stride;
+const uint8_t *dst3 = dst + 3 * dst_stride;
+
+ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
+ST_SH2(zero, zero, src, 8);
+AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
+TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
+src0_m = LW(dst);
+src1_m = LW(dst1);
+SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
+src2_m = LW(dst2);
+src3_m = LW(dst3);
+ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
+INSERT_W2_SB(src0_m, src1_m, dst0_m);
+INSERT_W2_SB(src2_m, src3_m, dst1_m);
+ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
+ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
+CLIP_SH2_0_255(res0_m, res1_m);
+PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
+out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
+out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
+out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
+out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
+SW(out0_m, dst);
+SW(out1_m, dst1);
+SW(out2_m, dst2);
+SW(out3_m, dst3);
 }
 
 void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
@@ -334,7 +326,23 @@ void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
 void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
int32_t dst_stride)
 {
-avc_idct4x4_addblk_dc_msa(dst, src, dst_stride);
+v16u8 pred = { 0 };
+v16i8 out;
+v8i16 pred_r, pred_l;
+const uint32_t src0 = LW(dst);
+const uint32_t src1 = LW(dst + dst_stride);
+const uint32_t src2 = LW(dst + 2 * dst_stride);
+const uint32_t src3 = LW(dst + 3 * dst_stride);
+const int16_t dc = (src[0] + 32) >> 6;

[FFmpeg-devel] [PATCH] libavcodec/mips: Improve avc idct8 msa function

2017-07-31 Thread kaustubh.raste
From: Kaustubh Raste 

Replace memset call with msa stores.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264idct_msa.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
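
The arithmetic behind the swap: an 8x8 block holds 64 int16
coefficients, i.e. 128 bytes, which is exactly eight 16-byte MSA
vectors, so one ST_SH8 of a zero vector issued right after the LD_SH8
covers what memset(src, 0, 64 * sizeof(dctcoef)) did. A sketch,
assuming the macros from generic_macros_msa.h:

    v8i16 zeros = { 0 };   /* v8i16 to match ST_SH8's element type */

    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);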

diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index 861befe..1e1a5c8 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -120,11 +120,12 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
 v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
 v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
 v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-v16i8 zeros = { 0 };
+v8i16 zeros = { 0 };
 
 src[0] += 32;
 
 LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);
 
 vec0 = src0 + src4;
 vec1 = src0 - src4;
@@ -318,7 +319,6 @@ void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
   int32_t dst_stride)
 {
 avc_idct8_addblk_msa(dst, src, dst_stride);
-memset(src, 0, 64 * sizeof(dctcoef));
 }
 
 void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
-- 
1.7.9.5


[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc sao band filter msa functions

2017-09-15 Thread kaustubh.raste
From: Kaustubh Raste 

Preload data in band filter 0-8 for better pipeline parallelization.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_lpf_sao_msa.c  |  174 ++-
 libavutil/mips/generic_macros_msa.h |1 +
 2 files changed, 112 insertions(+), 63 deletions(-)
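
The preload pattern is classic software pipelining: issue the next
iteration's loads before storing the current results, so load latency
overlaps with the ALU work, and peel the final iteration so the loop
never reads past the block. An illustrative skeleton (not the exact
filter body):

    LD_UB4(src, stride, s0, s1, s2, s3);        /* prologue load */
    for (height -= 4; height; height -= 4) {
        src += 4 * stride;
        /* ... filter s0..s3 into d0 ... */
        LD_UB4(src, stride, s0, s1, s2, s3);    /* preload next rows */
        /* ... store d0 ... */
        dst += 4 * stride;
    }
    /* peeled epilogue: filter and store the last preloaded rows */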

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
index 79b156f..1d77432 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -1049,29 +1049,28 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
 int16_t *sao_offset_val,
 int32_t height)
 {
-int32_t h_cnt;
 v16u8 src0, src1, src2, src3;
 v16i8 src0_r, src1_r;
 v16i8 offset, offset_val, mask;
-v16i8 offset0 = { 0 };
-v16i8 offset1 = { 0 };
+v16i8 dst0, offset0, offset1;
 v16i8 zero = { 0 };
-v8i16 temp0, temp1, dst0, dst1;
 
 offset_val = LD_SB(sao_offset_val + 1);
 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
 
 offset_val = __msa_pckev_b(offset_val, offset_val);
-offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
-offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
+offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
 offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
 
+/* load in advance. */
+LD_UB4(src, src_stride, src0, src1, src2, src3);
+
 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
 SWAP(offset0, offset1);
 }
 
-for (h_cnt = height >> 2; h_cnt--;) {
-LD_UB4(src, src_stride, src0, src1, src2, src3);
+for (height -= 4; height; height -= 4) {
 src += (4 * src_stride);
 
 ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
@@ -1080,14 +1079,30 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
 mask = __msa_srli_b(src0_r, 3);
 offset = __msa_vshf_b(mask, offset1, offset0);
 
-UNPCK_SB_SH(offset, temp0, temp1);
-ILVRL_B2_SH(zero, src0_r, dst0, dst1);
-ADD2(dst0, temp0, dst1, temp1, dst0, dst1);
-CLIP_SH2_0_255(dst0, dst1);
-dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
+dst0 = __msa_adds_s_b(src0_r, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+/* load in advance. */
+LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+/* store results */
 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
 dst += (4 * dst_stride);
 }
+
+ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
+
+src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
+mask = __msa_srli_b(src0_r, 3);
+offset = __msa_vshf_b(mask, offset1, offset0);
+
+src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
+dst0 = __msa_adds_s_b(src0_r, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+/* store results */
+ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
@@ -1096,51 +1111,69 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
 int16_t *sao_offset_val,
 int32_t height)
 {
-int32_t h_cnt;
 v16u8 src0, src1, src2, src3;
 v16i8 src0_r, src1_r, mask0, mask1;
-v16i8 offset, offset_val;
-v16i8 offset0 = { 0 };
-v16i8 offset1 = { 0 };
+v16i8 offset_mask0, offset_mask1, offset_val;
+v16i8 offset0, offset1, dst0, dst1;
 v16i8 zero = { 0 };
-v8i16 dst0, dst1, dst2, dst3;
-v8i16 temp0, temp1, temp2, temp3;
 
 offset_val = LD_SB(sao_offset_val + 1);
 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
 offset_val = __msa_pckev_b(offset_val, offset_val);
-offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
-offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
+offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
 offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
 
+/* load in advance. */
+LD_UB4(src, src_stride, src0, src1, src2, src3);
+
 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
 SWAP(offset0, offset1);
 }
 
-for (h_cnt = height >> 2; h_cnt--;) {
-LD_UB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
+for (height -= 4; height; height -= 4) {
+

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc lpf msa functions

2017-09-14 Thread kaustubh.raste
From: Kaustubh Raste 

Optimize luma intra case by reducing conditional cases.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264dsp_msa.c |  428 +
 1 file changed, 138 insertions(+), 290 deletions(-)
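
The restructuring keeps only the cheap alpha/beta mask computation in
front of a single vector test and moves every expensive load and unpack
inside the taken branch; the scalar thresholds are compared directly,
so the splatted alpha/beta registers disappear. A condensed sketch of
the control flow, assuming GCC vector extensions and the MSA builtins
used in this file:

    v16u8 lt_alpha = (v16u8) (p0_asub_q0 < alpha_in);
    v16u8 lt_beta0 = (v16u8) (p1_asub_p0 < beta_in);
    v16u8 lt_beta1 = (v16u8) (q1_asub_q0 < beta_in);
    v16u8 is_less_than = lt_alpha & lt_beta0 & lt_beta1;

    if (!__msa_test_bz_v(is_less_than)) {
        /* only now load p2/q2 rows, unpack to 16-bit and filter */
    }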

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 16e4858..a17eacb 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Parag Salasakar (parag.salasa...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Parag Salasakar (parag.salasa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -644,96 +644,69 @@ static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
uint8_t beta_in,
uint32_t img_width)
 {
-v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
-v16u8 alpha, beta;
-v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
-v16u8 p2, p1, p0, q0, q1, q2;
-v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
-v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-v8i16 p2_r = { 0 };
-v8i16 p1_r = { 0 };
-v8i16 p0_r = { 0 };
-v8i16 q0_r = { 0 };
-v8i16 q1_r = { 0 };
-v8i16 q2_r = { 0 };
-v8i16 p2_l = { 0 };
-v8i16 p1_l = { 0 };
-v8i16 p0_l = { 0 };
-v8i16 q0_l = { 0 };
-v8i16 q1_l = { 0 };
-v8i16 q2_l = { 0 };
-v16u8 tmp_flag;
-v16i8 zero = { 0 };
-
-alpha = (v16u8) __msa_fill_b(alpha_in);
-beta = (v16u8) __msa_fill_b(beta_in);
+v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
+v16u8 p1_org, p0_org, q0_org, q1_org;
 
 LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
 
-{
-v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
-
-p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
-p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
-q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 
-is_less_than_alpha = (p0_asub_q0 < alpha);
-is_less_than_beta = (p1_asub_p0 < beta);
-is_less_than = is_less_than_beta & is_less_than_alpha;
-is_less_than_beta = (q1_asub_q0 < beta);
-is_less_than = is_less_than_beta & is_less_than;
-}
+is_less_than_alpha = (p0_asub_q0 < alpha_in);
+is_less_than_beta = (p1_asub_p0 < beta_in);
+is_less_than = is_less_than_beta & is_less_than_alpha;
+is_less_than_beta = (q1_asub_q0 < beta_in);
+is_less_than = is_less_than_beta & is_less_than;
 
 if (!__msa_test_bz_v(is_less_than)) {
-q2_org = LD_UB(data + (2 * img_width));
-p3_org = LD_UB(data - (img_width << 2));
-p2_org = LD_UB(data - (3 * img_width));
+v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
+v8i16 p0_r = { 0 };
+v8i16 q0_r = { 0 };
+v8i16 p0_l = { 0 };
+v8i16 q0_l = { 0 };
+v16i8 zero = { 0 };
+v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+v16u8 q2_org = LD_UB(data + (2 * img_width));
+v16u8 p2_org = LD_UB(data - (3 * img_width));
+v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
 
 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
 
-tmp_flag = alpha >> 2;
-tmp_flag = tmp_flag + 2;
 tmp_flag = (p0_asub_q0 < tmp_flag);
 
 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
-is_less_than_beta = (p2_asub_p0 < beta);
+is_less_than_beta = (p2_asub_p0 < beta_in);
 is_less_than_beta = is_less_than_beta & tmp_flag;
 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 is_less_than_beta = is_less_than_beta & is_less_than;
 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
-{
-v8u16 is_less_than_beta_l, is_less_than_beta_r;
-
-q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
-
-is_less_than_beta_r =
-(v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
-if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
-v8i16 p3_org_r;
-
-ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
-AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
- p2_r, q1_org_r, p0_r, p1_r, p2_r);
-}
-
-q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
 
-is_less_than_beta_l =
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc mc copy msa functions

2017-09-15 Thread kaustubh.raste
From: Kaustubh Raste 

Remove loops and unroll as block sizes are known.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c |   81 +---
 1 file changed, 75 insertions(+), 6 deletions(-)
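
With the block size fixed by the function name, the generic
width/height loop can be replaced by straight-line code. A sketch of
the idea for the 16x16 copy, assuming the LD_UB8/ST_UB8 macros from
generic_macros_msa.h:

    /* copy_width16_msa(src, stride, dst, stride, 16) becomes two
     * load/store groups of eight rows each - no counter, no branch. */
    LD_UB8(src, stride, s0, s1, s2, s3, s4, s5, s6, s7);
    ST_UB8(s0, s1, s2, s3, s4, s5, s6, s7, dst, stride);
    LD_UB8(src + 8 * stride, stride, s0, s1, s2, s3, s4, s5, s6, s7);
    ST_UB8(s0, s1, s2, s3, s4, s5, s6, s7, dst + 8 * stride, stride);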

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index 43d21f7..05dffea 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Parag Salasakar (parag.salasa...@imgtec.com)
+ * Copyright (c) 2015 -2017 Parag Salasakar (parag.salasa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -2966,31 +2966,100 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
  ptrdiff_t stride)
 {
-copy_width16_msa(src, stride, dst, stride, 16);
+v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+src += (8 * stride);
+LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
+
+ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
+dst += (8 * stride);
+ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
 }
 
 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
 ptrdiff_t stride)
 {
-copy_width8_msa(src, stride, dst, stride, 8);
+uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
+
+LD4(src, stride, src0, src1, src2, src3);
+src += 4 * stride;
+LD4(src, stride, src4, src5, src6, src7);
+SD4(src0, src1, src2, src3, dst, stride);
+dst += 4 * stride;
+SD4(src4, src5, src6, src7, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
  ptrdiff_t stride)
 {
-avg_width16_msa(src, stride, dst, stride, 16);
+v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+src += (8 * stride);
+LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+dst2, dst3);
+AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+dst6, dst7);
+ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+dst += (8 * stride);
+
+LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+dst2, dst3);
+AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+dst6, dst7);
+ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
 }
 
 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
 ptrdiff_t stride)
 {
-avg_width8_msa(src, stride, dst, stride, 8);
+uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+
+LD4(src, stride, tp0, tp1, tp2, tp3);
+src += 4 * stride;
+LD4(src, stride, tp4, tp5, tp6, tp7);
+INSERT_D2_UB(tp0, tp1, src0);
+INSERT_D2_UB(tp2, tp3, src1);
+INSERT_D2_UB(tp4, tp5, src2);
+INSERT_D2_UB(tp6, tp7, src3);
+
+LD4(dst, stride, tp0, tp1, tp2, tp3);
+LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
+INSERT_D2_UB(tp0, tp1, dst0);
+INSERT_D2_UB(tp2, tp3, dst1);
+INSERT_D2_UB(tp4, tp5, dst2);
+INSERT_D2_UB(tp6, tp7, dst3);
+
+AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+dst2, dst3);
+
+ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
 ptrdiff_t stride)
 {
-avg_width4_msa(src, stride, dst, stride, 4);
+uint32_t tp0, tp1, tp2, tp3;
+v16u8 src0 = { 0 }, dst0 = { 0 };
+
+LW4(src, stride, tp0, tp1, tp2, tp3);
+INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+LW4(dst, stride, tp0, tp1, tp2, tp3);
+INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+
+dst0 = __msa_aver_u_b(src0, dst0);
+
+ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
-- 
1.7.9.5


[FFmpeg-devel] [PATCH] avcodec/mips: Unrolled loops in avc intra msa functions

2017-09-21 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264pred_msa.c |  318 
 1 file changed, 158 insertions(+), 160 deletions(-)
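
The same unrolling idea applied to intra prediction: each predicted row
is a single value broadcast across a vector, so a 16x16 horizontal
prediction becomes sixteen splats and two ST_UB8 groups rather than a
counted loop. A one-row sketch, assuming the MSA builtins used in this
file:

    /* broadcast the left-edge pixel of this row across 16 lanes */
    v16u8 row = (v16u8) __msa_fill_b(src[0 * src_stride]);
    ST_UB(row, dst);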

diff --git a/libavcodec/mips/h264pred_msa.c b/libavcodec/mips/h264pred_msa.c
index cddcd2e..6c7e756 100644
--- a/libavcodec/mips/h264pred_msa.c
+++ b/libavcodec/mips/h264pred_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Shivraj Patil (shivraj.pa...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Shivraj Patil (shivraj.pa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -24,31 +24,21 @@
 static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
int32_t dst_stride)
 {
-uint32_t row;
-uint32_t src_data1, src_data2;
-
-src_data1 = LW(src);
-src_data2 = LW(src + 4);
+uint64_t out = LD(src);
 
-for (row = 8; row--;) {
-SW(src_data1, dst);
-SW(src_data2, (dst + 4));
-dst += dst_stride;
-}
+SD4(out, out, out, out, dst, dst_stride);
+dst += (4 * dst_stride);
+SD4(out, out, out, out, dst, dst_stride);
 }
 
 static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
  int32_t dst_stride)
 {
-uint32_t row;
-v16u8 src0;
-
-src0 = LD_UB(src);
+v16u8 out = LD_UB(src);
 
-for (row = 16; row--;) {
-ST_UB(src0, dst);
-dst += dst_stride;
-}
+ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+dst += (8 * dst_stride);
+ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
 }
 
 static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
@@ -73,28 +63,47 @@ static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
 static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
   uint8_t *dst, int32_t dst_stride)
 {
-uint32_t row;
 uint8_t inp0, inp1, inp2, inp3;
-v16u8 src0, src1, src2, src3;
-
-for (row = 4; row--;) {
-inp0 = src[0];
-src += src_stride;
-inp1 = src[0];
-src += src_stride;
-inp2 = src[0];
-src += src_stride;
-inp3 = src[0];
-src += src_stride;
-
-src0 = (v16u8) __msa_fill_b(inp0);
-src1 = (v16u8) __msa_fill_b(inp1);
-src2 = (v16u8) __msa_fill_b(inp2);
-src3 = (v16u8) __msa_fill_b(inp3);
-
-ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-dst += (4 * dst_stride);
-}
+v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+inp0 = src[0 * src_stride];
+inp1 = src[1 * src_stride];
+inp2 = src[2 * src_stride];
+inp3 = src[3 * src_stride];
+src0 = (v16u8) __msa_fill_b(inp0);
+src1 = (v16u8) __msa_fill_b(inp1);
+src2 = (v16u8) __msa_fill_b(inp2);
+src3 = (v16u8) __msa_fill_b(inp3);
+inp0 = src[4 * src_stride];
+inp1 = src[5 * src_stride];
+inp2 = src[6 * src_stride];
+inp3 = src[7 * src_stride];
+src4 = (v16u8) __msa_fill_b(inp0);
+src5 = (v16u8) __msa_fill_b(inp1);
+src6 = (v16u8) __msa_fill_b(inp2);
+src7 = (v16u8) __msa_fill_b(inp3);
+inp0 = src[ 8 * src_stride];
+inp1 = src[ 9 * src_stride];
+inp2 = src[10 * src_stride];
+inp3 = src[11 * src_stride];
+src8 = (v16u8) __msa_fill_b(inp0);
+src9 = (v16u8) __msa_fill_b(inp1);
+src10 = (v16u8) __msa_fill_b(inp2);
+src11 = (v16u8) __msa_fill_b(inp3);
+inp0 = src[12 * src_stride];
+inp1 = src[13 * src_stride];
+inp2 = src[14 * src_stride];
+inp3 = src[15 * src_stride];
+src12 = (v16u8) __msa_fill_b(inp0);
+src13 = (v16u8) __msa_fill_b(inp1);
+src14 = (v16u8) __msa_fill_b(inp2);
+src15 = (v16u8) __msa_fill_b(inp3);
+
+ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+dst += (8 * dst_stride);
+ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+   dst, dst_stride);
 }
 
 static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
@@ -206,39 +215,29 @@ static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
 }
 }
 
-#define INTRA_PREDICT_VALDC_8X8_MSA(val) \
-static void intra_predict_##val##dc_8x8_msa(uint8_t *dst,\
-int32_t dst_stride)  \
-{\
-uint32_t row, out;   \
-v16i8 store; \
- \
-store = __msa_ldi_b(val);\
-out = __msa_copy_u_w((v4i32) store, 0);  \
- \
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma horiz mc msa functions

2017-09-21 Thread kaustubh.raste
From: Kaustubh Raste 

Replace generic with block size specific function.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  231 ++
 1 file changed, 133 insertions(+), 98 deletions(-)
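
Two things change in the interface: the wrapper's height dispatch now
reaches only fixed-size bodies, and the separate src/dst strides merge
into one parameter, which is valid because these H.264 chroma call
sites pass equal strides (the patch itself relies on this). A sketch of
the simplified wrapper, matching the diff below:

    static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst,
                                     int32_t stride, uint32_t coeff0,
                                     uint32_t coeff1, int32_t height)
    {
        if (2 == height)
            avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
        else if (4 == height)
            avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }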

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 940e12d..c27830d 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Shivraj Patil (shivraj.pa...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Shivraj Patil (shivraj.pa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -29,8 +29,7 @@ static const uint8_t chroma_mask_arr[16 * 5] = {
 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
 };
 
-static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coeff0, uint32_t coeff1)
 {
 uint16_t out0, out1;
@@ -44,7 +43,7 @@ static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
 
mask = LD_SB(&chroma_mask_arr[0]);
 
-LD_SB2(src, src_stride, src0, src1);
+LD_SB2(src, stride, src0, src1);
 
 src0 = __msa_vshf_b(mask, src1, src0);
 res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
@@ -57,12 +56,11 @@ static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
 out1 = __msa_copy_u_h(res, 2);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coeff0, uint32_t coeff1)
 {
 v16u8 src0, src1, src2, src3;
@@ -75,7 +73,7 @@ static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
 
mask = LD_SB(&chroma_mask_arr[64]);
 
-LD_UB4(src, src_stride, src0, src1, src2, src3);
+LD_UB4(src, stride, src0, src1, src2, src3);
 
 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
@@ -87,64 +85,21 @@ static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
 res_r = __msa_sat_u_h(res_r, 7);
 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
-ST2x4_UB(res, 0, dst, dst_stride);
-}
-
-static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
-  uint32_t coeff0, uint32_t coeff1)
-{
-v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-v8u16 res_r;
-v8i16 res;
-v16i8 mask;
-v16i8 coeff_vec0 = __msa_fill_b(coeff0);
-v16i8 coeff_vec1 = __msa_fill_b(coeff1);
-v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
-
-mask = LD_SB(&chroma_mask_arr[64]);
-
-LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-
-VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
-VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
-
-ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
-
-res_r = __msa_dotp_u_h(src0, coeff_vec);
-res_r <<= 3;
-res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
-res_r = __msa_sat_u_h(res_r, 7);
-res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-
-ST2x4_UB(res, 0, dst, dst_stride);
-dst += (4 * dst_stride);
-
-res_r = __msa_dotp_u_h(src4, coeff_vec);
-res_r <<= 3;
-res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
-res_r = __msa_sat_u_h(res_r, 7);
-res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-
-ST2x4_UB(res, 0, dst, dst_stride);
+ST2x4_UB(res, 0, dst, stride);
 }
 
-static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
  uint32_t coeff0, uint32_t coeff1,
  int32_t height)
 {
 if (2 == height) {
-avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
 } else if (4 == height) {
-avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
-} else if (8 == height) {
-avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
 }
 }
 
-static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coeff0, uint32_t coeff1)
 

[FFmpeg-devel] [PATCH] avcodec/mips: preload data in hevc sao edge 90 degree filter msa functions

2017-09-21 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_lpf_sao_msa.c |  181 
 1 file changed, 122 insertions(+), 59 deletions(-)

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
index 3472d32..39c647e 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -1568,23 +1568,25 @@ static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
  int16_t *sao_offset_val,
  int32_t height)
 {
-int32_t h_cnt;
 uint32_t dst_val0, dst_val1;
-v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 v16u8 const1 = (v16u8) __msa_ldi_b(1);
 v16i8 dst0;
-v16i8 zero = { 0 };
+v16i8 sao_offset = LD_SB(sao_offset_val);
 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
 v16u8 src_minus10, src_minus11, src10, src11;
 v16i8 src_zero0, src_zero1;
-v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;
+v16i8 offset;
+v8i16 offset_mask0, offset_mask1;
 
-sao_offset = LD_SH(sao_offset_val);
+sao_offset = __msa_pckev_b(sao_offset, sao_offset);
 
+/* load in advance */
 LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+LD_UB2(src + src_stride, src_stride, src10, src11);
 
-for (h_cnt = (height >> 1); h_cnt--;) {
-LD_UB2(src + src_stride, src_stride, src10, src11);
+for (height -= 2; height; height -= 2) {
+src += (src_stride << 1);
 
 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
@@ -1604,19 +1606,22 @@ static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
 
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
-   offset_mask0, offset_mask0, offset_mask0);
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
-   offset_mask1, offset_mask1, offset_mask1);
-ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
-ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
- offset_mask1);
-CLIP_SH2_0_255(offset_mask0, offset_mask1);
-dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+   offset, offset);
+
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+dst0 = __msa_adds_s_b(dst0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
 
 src_minus10 = src10;
 src_minus11 = src11;
 
+/* load in advance */
+LD_UB2(src + src_stride, src_stride, src10, src11);
+
 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
 SW(dst_val0, dst);
@@ -1624,8 +1629,41 @@ static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
 SW(dst_val1, dst);
 
 dst += dst_stride;
-src += (src_stride << 1);
 }
+
+src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
+   offset, offset, offset);
+
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+dst0 = __msa_adds_s_b(dst0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+dst_val0 = __msa_copy_u_w((v4i32) dst0, 

[FFmpeg-devel] [PATCH] avcodec/mips: Remove generic func use in hevc non-uni copy mc msa functions

2017-09-21 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevcdsp_msa.c |  168 +++--
 1 file changed, 160 insertions(+), 8 deletions(-)

diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
index f2bc748..1a854b2 100644
--- a/libavcodec/mips/hevcdsp_msa.c
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -302,8 +302,34 @@ static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
 } else if (0 == (height % 8)) {
-hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride,
-   height, 16);
+uint32_t loop_cnt;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+for (loop_cnt = (height >> 3); loop_cnt--;) {
+LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
+   src7);
+src += (8 * src_stride);
+ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
+   in1_r, in2_r, in3_r);
+ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
+   in1_l, in2_l, in3_l);
+SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+dst += (4 * dst_stride);
+
+ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
+   in1_r, in2_r, in3_r);
+ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
+   in1_l, in2_l, in3_l);
+SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+dst += (4 * dst_stride);
+}
 }
 }
 
@@ -311,29 +337,155 @@ static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
   int16_t *dst, int32_t dst_stride,
   int32_t height)
 {
-hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
-hevc_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride, height);
+uint32_t loop_cnt;
+v16i8 zero = { 0 };
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+for (loop_cnt = (height >> 2); loop_cnt--;) {
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
+src += (4 * src_stride);
+ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
+   in2_r, in3_r);
+ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
+   in2_l, in3_l);
+SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
+   in2_r, in3_r);
+SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
+dst += (4 * dst_stride);
+}
 }
 
 static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
   int16_t *dst, int32_t dst_stride,
   int32_t height)
 {
-hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+uint32_t loop_cnt;
+v16i8 zero = { 0 };
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+for (loop_cnt = (height >> 2); loop_cnt--;) {
+LD_SB4(src, src_stride, src0, src2, src4, src6);
+LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
+src += (4 * src_stride);
+
+ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
+   in2_r, in3_r);
+ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
+   in2_l, in3_l);
+SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
+dst += 

[FFmpeg-devel] [PATCH] avcodec/mips: Fixed rnd_val variable to 6 in hevc uni mc msa functions

2017-09-18 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uni_msa.c |  372 +
 1 file changed, 133 insertions(+), 239 deletions(-)
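
The title's "fixed to 6" means the run-time rounding amount became a
compile-time constant, so the variable arithmetic-shift-round macros
(SRAR_H*, which need an rnd_vec register filled via __msa_fill_h) can
become their immediate forms (SRARI_H*), dropping a splat and freeing a
register per function. A sketch of the substitution as it appears
throughout the diff:

    /* before: rnd_vec = __msa_fill_h(rnd_val);
     *         SRAR_H2_SH(out0, out1, rnd_vec);
     * after: */
    SRARI_H2_SH(out0, out1, 6);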

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index 754fbdb..cf22e7f 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -359,16 +359,14 @@ static const uint8_t mc_filt_mask_arr[16 * 3] = {
 
 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
  uint8_t *dst, int32_t dst_stride,
- const int8_t *filter, uint8_t rnd_val)
+ const int8_t *filter)
 {
 v16u8 mask0, mask1, mask2, mask3, out;
 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 v8i16 filt, out0, out1;
-v8i16 rnd_vec;
 
mask0 = LD_UB(&mc_filt_mask_arr[16]);
 src -= 3;
-rnd_vec = __msa_fill_h(rnd_val);
 
 /* rearranging filter */
 filt = LD_SH(filter);
@@ -382,7 +380,7 @@ static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
 XORI_B4_128_SB(src0, src1, src2, src3);
 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
-SRAR_H2_SH(out0, out1, rnd_vec);
+SRARI_H2_SH(out0, out1, 6);
 SAT_SH2_SH(out0, out1, 7);
 out = PCKEV_XORI128_UB(out0, out1);
 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
@@ -390,17 +388,15 @@ static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
 
 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
  uint8_t *dst, int32_t dst_stride,
- const int8_t *filter, uint8_t rnd_val)
+ const int8_t *filter)
 {
 v16i8 filt0, filt1, filt2, filt3;
 v16i8 src0, src1, src2, src3;
 v16u8 mask0, mask1, mask2, mask3, out;
 v8i16 filt, out0, out1, out2, out3;
-v8i16 rnd_vec;
 
mask0 = LD_UB(&mc_filt_mask_arr[16]);
 src -= 3;
-rnd_vec = __msa_fill_h(rnd_val);
 
 /* rearranging filter */
 filt = LD_SH(filter);
@@ -419,7 +415,7 @@ static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
 XORI_B4_128_SB(src0, src1, src2, src3);
 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
-SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
 SAT_SH4_SH(out0, out1, out2, out3, 7);
 out = PCKEV_XORI128_UB(out0, out1);
 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
@@ -430,16 +426,14 @@ static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
 
 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
   uint8_t *dst, int32_t dst_stride,
-  const int8_t *filter, uint8_t rnd_val)
+  const int8_t *filter)
 {
 v16u8 mask0, mask1, mask2, mask3, out;
 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 v8i16 filt, out0, out1, out2, out3;
-v8i16 rnd_vec;
 
mask0 = LD_UB(&mc_filt_mask_arr[16]);
 src -= 3;
-rnd_vec = __msa_fill_h(rnd_val);
 
 /* rearranging filter */
 filt = LD_SH(filter);
@@ -459,7 +453,7 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
 src += (4 * src_stride);
 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
-SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
 SAT_SH4_SH(out0, out1, out2, out3, 7);
 out = PCKEV_XORI128_UB(out0, out1);
 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
@@ -479,7 +473,7 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
 
-SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
 SAT_SH4_SH(out0, out1, out2, out3, 7);
 out = PCKEV_XORI128_UB(out0, out1);
 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
@@ -490,30 +484,27 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
 
 static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
 uint8_t *dst, int32_t dst_stride,
-const int8_t *filter, int32_t height, uint8_t 

[FFmpeg-devel] [PATCH] avcodec/mips: preload data in hevc sao edge 0 degree filter msa functions

2017-09-18 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_lpf_sao_msa.c |  232 +---
 1 file changed, 138 insertions(+), 94 deletions(-)

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
index 1d77432..3472d32 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -1265,54 +1265,51 @@ static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
 int16_t *sao_offset_val,
 int32_t height)
 {
-int32_t h_cnt;
 uint32_t dst_val0, dst_val1;
-v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
+v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+v16i8 sao_offset = LD_SB(sao_offset_val);
+v16i8 src_plus10, offset, src0, dst0;
 v16u8 const1 = (v16u8) __msa_ldi_b(1);
-v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
-v16u8 src_minus10, src_minus11;
 v16i8 zero = { 0 };
-v16i8 src_zero0, src_zero1, src_plus10, src_plus11, dst0;
-v8i16 offset_mask0, offset_mask1;
-v8i16 sao_offset, src00, src01;
 
-sao_offset = LD_SH(sao_offset_val);
+sao_offset = __msa_pckev_b(sao_offset, sao_offset);
 src -= 1;
 
-for (h_cnt = (height >> 1); h_cnt--;) {
-LD_UB2(src, src_stride, src_minus10, src_minus11);
+/* load in advance */
+LD_UB2(src, src_stride, src_minus10, src_minus11);
+
+for (height -= 2; height; height -= 2) {
 src += (2 * src_stride);
 
-SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
-SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
-ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
-   src_minus10, src_minus11);
-ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
-   src_zero1);
+src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
+(v2i64) src_minus10);
 
-cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
+src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
+
+cmp_minus10 = ((v16u8) src0 == src_minus10);
 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
-cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+cmp_minus10 = (src_minus10 < (v16u8) src0);
 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
 
-cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
-diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
-cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
-diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
+diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
+cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
+diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
 
-offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 
2);
-offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 
2);
+offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
 
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
-   offset_mask0, offset_mask0, offset_mask0);
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
-   offset_mask1, offset_mask1, offset_mask1);
-ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
-ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
- offset_mask1);
-CLIP_SH2_0_255(offset_mask0, offset_mask1);
+/* load in advance */
+LD_UB2(src, src_stride, src_minus10, src_minus11);
+
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+   offset, offset);
+
+src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
+dst0 = __msa_adds_s_b(src0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
 
-dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
 SW(dst_val0, dst);
@@ -1320,6 +1317,37 @@ static void 
hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
 SW(dst_val1, dst);
 dst += dst_stride;
 }
+
+src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
+(v2i64) src_minus10);
+
+src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
+src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
+
+  

[FFmpeg-devel] [PATCH] avcodec/mips: Unrolled loops and expanded functions in avc put mc 10 & 30 msa functions

2017-09-18 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c |  284 +++-
 1 file changed, 278 insertions(+), 6 deletions(-)
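
The pattern, sketched in scalar C with a hypothetical row-copy kernel standing in for the filter: the generic helper takes height as a run-time parameter, while the specialized variant exploits the fixed 16-row block with a constant trip count and four unrolled rows per iteration, matching the loop_cnt = 4 loops below.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Generic helper: height is a run-time parameter, so the loop carries
 * per-row bookkeeping the caller's fixed block size never needs. */
static void rows16_generic(uint8_t *dst, const uint8_t *src,
                           ptrdiff_t stride, int height)
{
    for (int y = 0; y < height; y++) {
        memcpy(dst, src, 16);
        src += stride;
        dst += stride;
    }
}

/* Specialized variant: qpel16 always covers 16 rows, so the trip count
 * is a compile-time constant, 4 iterations of 4 unrolled rows. */
static void rows16_unrolled(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int cnt = 4; cnt--;) {
        memcpy(dst + 0 * stride, src + 0 * stride, 16);
        memcpy(dst + 1 * stride, src + 1 * stride, 16);
        memcpy(dst + 2 * stride, src + 2 * stride, 16);
        memcpy(dst + 3 * stride, src + 3 * stride, 16);
        src += 4 * stride;
        dst += 4 * stride;
    }
}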

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index 05dffea..b7f6c3d 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -3065,37 +3065,309 @@ void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const 
uint8_t *src,
 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
  ptrdiff_t stride)
 {
-avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 0);
+uint32_t loop_cnt;
+v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
+v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+v16i8 minus5b = __msa_ldi_b(-5);
+v16i8 plus20b = __msa_ldi_b(20);
+
+LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+mask3 = mask0 + 8;
+mask4 = mask1 + 8;
+mask5 = mask2 + 8;
+src -= 2;
+
+for (loop_cnt = 4; loop_cnt--;) {
+LD_SB2(src, 16, src0, src1);
+src += stride;
+LD_SB2(src, 16, src2, src3);
+src += stride;
+LD_SB2(src, 16, src4, src5);
+src += stride;
+LD_SB2(src, 16, src6, src7);
+src += stride;
+
+XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
+VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
+VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
+VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
+VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
+VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
+HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+ minus5b, res0, res1, res2, res3);
+DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+ plus20b, res0, res1, res2, res3);
+VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
+VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
+VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
+VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
+VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
+VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
+HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+ minus5b, res4, res5, res6, res7);
+DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+ plus20b, res4, res5, res6, res7);
+SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
+SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
+SRARI_H4_SH(res0, res1, res2, res3, 5);
+SRARI_H4_SH(res4, res5, res6, res7, 5);
+SAT_SH4_SH(res0, res1, res2, res3, 7);
+SAT_SH4_SH(res4, res5, res6, res7, 7);
+PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
+PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
+dst0 = __msa_aver_s_b(dst0, src0);
+dst1 = __msa_aver_s_b(dst1, src2);
+dst2 = __msa_aver_s_b(dst2, src4);
+dst3 = __msa_aver_s_b(dst3, src6);
+XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
+dst += (4 * stride);
+}
 }
 
 void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
  ptrdiff_t stride)
 {
-avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 1);
+uint32_t loop_cnt;
+v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
+v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+v16i8 minus5b = __msa_ldi_b(-5);
+v16i8 plus20b = __msa_ldi_b(20);
+
+LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+mask3 = mask0 + 8;
+mask4 = mask1 + 8;
+mask5 = mask2 + 8;
+src -= 2;
+
+for (loop_cnt = 4; loop_cnt--;) {
+LD_SB2(src, 16, src0, src1);
+src += stride;
+LD_SB2(src, 16, src2, src3);
+src += stride;
+LD_SB2(src, 16, src4, src5);
+src += stride;
+LD_SB2(src, 16, src6, src7);
+src += stride;
+
+XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, 

[FFmpeg-devel] [PATCH] avcodec/mips: Reduced conditional cases in avc inter lpf msa functions

2017-09-18 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264dsp_msa.c |  274 +
 1 file changed, 110 insertions(+), 164 deletions(-)
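
The restructuring is easiest to see in scalar form; a hedged sketch with placeholder filter math: declarations and setup needed only on the filtered path move inside the early-out branches, so the common no-op cases pay for no extra state.

#include <stdint.h>

static void lpf_edge_sketch(uint8_t *p, uint8_t *q, int bs, int alpha)
{
    if (!bs)
        return;                            /* first gate: strength zero */

    int d = p[0] > q[0] ? p[0] - q[0] : q[0] - p[0];
    if (d >= alpha)
        return;                            /* second gate: threshold */

    /* temporaries live only when filtering actually happens */
    int avg = (p[0] + q[0] + 1) >> 1;
    p[0] = (uint8_t)((p[0] + avg + 1) >> 1);
    q[0] = (uint8_t)((q[0] + avg + 1) >> 1);
}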

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index a17eacb..422703d 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -1250,21 +1250,7 @@ static void 
avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
uint8_t beta_in,
uint32_t img_width)
 {
-uint8_t *src;
-v16u8 beta, tmp_vec, bs = { 0 };
-v16u8 tc = { 0 };
-v16u8 is_less_than, is_less_than_beta;
-v16u8 p1, p0, q0, q1;
-v8i16 p0_r, q0_r, p1_r = { 0 };
-v8i16 q1_r = { 0 };
-v8i16 p0_l, q0_l, p1_l = { 0 };
-v8i16 q1_l = { 0 };
-v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
-v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
-v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
-v8i16 tc_r, tc_l;
-v16i8 zero = { 0 };
-v16u8 is_bs_greater_than0;
+v16u8 tmp_vec, bs = { 0 };
 
 tmp_vec = (v16u8) __msa_fill_b(bs0);
 bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
@@ -1276,6 +1262,14 @@ static void 
avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
 bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
 
 if (!__msa_test_bz_v(bs)) {
+uint8_t *src = data - 4;
+v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
+v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
+v16u8 is_bs_greater_than0;
+v16u8 tc = { 0 };
+v16i8 zero = { 0 };
+
 tmp_vec = (v16u8) __msa_fill_b(tc0);
 tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
 tmp_vec = (v16u8) __msa_fill_b(tc1);
@@ -1291,9 +1285,6 @@ static void 
avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
 
-src = data;
-src -= 4;
-
 LD_UB8(src, img_width,
row0, row1, row2, row3, row4, row5, row6, row7);
 src += (8 * img_width);
@@ -1306,27 +1297,28 @@ static void 
avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
 p3_org, p2_org, p1_org, p0_org,
 q0_org, q1_org, q2_org, q3_org);
 }
-{
-v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
-v16u8 is_less_than_alpha;
-
-p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
-p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
-q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
-
-alpha = (v16u8) __msa_fill_b(alpha_in);
-beta = (v16u8) __msa_fill_b(beta_in);
-
-is_less_than_alpha = (p0_asub_q0 < alpha);
-is_less_than_beta = (p1_asub_p0 < beta);
-is_less_than = is_less_than_beta & is_less_than_alpha;
-is_less_than_beta = (q1_asub_q0 < beta);
-is_less_than = is_less_than_beta & is_less_than;
-is_less_than = is_less_than & is_bs_greater_than0;
-}
+
+p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+alpha = (v16u8) __msa_fill_b(alpha_in);
+beta = (v16u8) __msa_fill_b(beta_in);
+
+is_less_than_alpha = (p0_asub_q0 < alpha);
+is_less_than_beta = (p1_asub_p0 < beta);
+is_less_than = is_less_than_beta & is_less_than_alpha;
+is_less_than_beta = (q1_asub_q0 < beta);
+is_less_than = is_less_than_beta & is_less_than;
+is_less_than = is_less_than & is_bs_greater_than0;
+
 if (!__msa_test_bz_v(is_less_than)) {
 v16i8 negate_tc, sign_negate_tc;
-v8i16 negate_tc_r, i16_negatetc_l;
+v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
+v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
+v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+v8i16 p0_r, q0_r, p0_l, q0_l;
 
 negate_tc = zero - (v16i8) tc;
 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
@@ -1338,34 +1330,22 @@ static void 
avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
 
-{
-v16u8 p2_asub_p0;
-v16u8 is_less_than_beta_r, is_less_than_beta_l;
-
-p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni-w copy mc msa functions

2017-09-21 Thread kaustubh.raste
From: Kaustubh Raste 

Load only the specific destination bytes instead of an MSA load and pack.
Pack the data to half-words before clipping.
Use immediate unsigned saturation for the clip to max, saving one vector register.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uniw_msa.c  |  559 ---
 libavutil/mips/generic_macros_msa.h |   30 ++
 2 files changed, 415 insertions(+), 174 deletions(-)
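
A sketch of the first point, with load_word() as a hypothetical scalar stand-in for the LW2/INSERT_W2 macros used below: for a 4-pixel-wide block, two 32-bit loads fetch exactly the bytes needed, where a vector load would read 16 bytes per row and then require a pack step to combine the rows.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t load_word(const uint8_t *p)
{
    uint32_t v;
    memcpy(&v, p, 4);                      /* alignment-safe 4-byte load */
    return v;
}

static void gather_4x2(const uint8_t *src, ptrdiff_t stride, uint32_t out[2])
{
    out[0] = load_word(src);               /* row 0: exactly 4 bytes */
    out[1] = load_word(src + stride);      /* row 1: exactly 4 bytes */
}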

diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c 
b/libavcodec/mips/hevc_mc_uniw_msa.c
index ce10f41..d184419 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -62,6 +62,31 @@
 out2_r, out3_r, out2_l, out3_l);   \
 }
 
+#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
+   out0_h, out1_h)\
+{ \
+v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \
+  \
+ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);  \
+ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);  \
+DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,  \
+wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);   \
+SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);\
+PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);  \
+ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);  \
+CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);  \
+}
+
+#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
+   offset_h, rnd_w, out0_h, out1_h,\
+   out2_h, out3_h) \
+{  \
+HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
+   out0_h, out1_h);\
+HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
+   out2_h, out3_h);\
+}
+
 static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
 int32_t src_stride,
 uint8_t *dst,
@@ -71,76 +96,60 @@ static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
 int32_t offset,
 int32_t rnd_val)
 {
+uint32_t loop_cnt, tp0, tp1, tp2, tp3;
 v16i8 zero = { 0 };
-v4i32 weight_vec, offset_vec, rnd_vec;
+v16u8 out0, out1;
+v16i8 src0 = { 0 }, src1 = { 0 };
+v8i16 dst0, dst1, dst2, dst3, offset_vec;
+v4i32 weight_vec, rnd_vec;
 
 weight = weight & 0x;
 weight_vec = __msa_fill_w(weight);
-offset_vec = __msa_fill_w(offset);
+offset_vec = __msa_fill_h(offset);
 rnd_vec = __msa_fill_w(rnd_val);
 
 if (2 == height) {
-v16i8 src0, src1;
-v8i16 dst0;
 v4i32 dst0_r, dst0_l;
 
-LD_SB2(src, src_stride, src0, src1);
-src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+LW2(src, src_stride, tp0, tp1);
+INSERT_W2_SB(tp0, tp1, src0);
 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
 dst0 <<= 6;
 
 ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
-ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
-dst0_r = CLIP_SW_0_255(dst0_r);
-dst0_l = CLIP_SW_0_255(dst0_l);
-
-HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
-ST4x2_UB(dst0_r, dst, dst_stride);
+dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+dst0 += offset_vec;
+dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+ST4x2_UB(out0, dst, dst_stride);
 } else if (4 == height) {
-v16i8 src0, src1, src2, src3;
-v8i16 dst0, dst1;
-v4i32 dst0_r, dst1_r;
-v4i32 dst0_l, dst1_l;
-
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
-ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
-dst0 <<= 6;
-dst1 <<= 6;
-
-HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
-dst0_r, 

[FFmpeg-devel] [PATCH] avcodec/mips: Cleanup unused functions

2017-10-05 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c |  746 
 libavcodec/mips/hevc_mc_uniw_msa.c |   67 
 libavcodec/mips/hevcdsp_msa.c  |   50 ---
 3 files changed, 863 deletions(-)

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index 0b42bc4..afc0183 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -159,300 +159,6 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 out0_m; \
 } )
 
-static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   int32_t height)
-{
-uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3;
-v8i16 res0, res1;
-v16u8 out;
-v16i8 mask0, mask1, mask2;
-v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v16i8 minus5b = __msa_ldi_b(-5);
-v16i8 plus20b = __msa_ldi_b(20);
-
-LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
-
-XORI_B4_128_SB(src0, src1, src2, src3);
-VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
-HADD_SB2_SH(vec0, vec1, res0, res1);
-VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
-DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
-VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
-DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
-SRARI_H2_SH(res0, res1, 5);
-SAT_SH2_SH(res0, res1, 7);
-out = PCKEV_XORI128_UB(res0, res1);
-ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-dst += (4 * dst_stride);
-}
-}
-
-static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   int32_t height)
-{
-uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3;
-v8i16 res0, res1, res2, res3;
-v16i8 mask0, mask1, mask2;
-v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
-v16i8 minus5b = __msa_ldi_b(-5);
-v16i8 plus20b = __msa_ldi_b(20);
-v16u8 out0, out1;
-
-LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
-
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
-
-XORI_B4_128_SB(src0, src1, src2, src3);
-VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
-VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
-HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
-VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
-VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
-DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, 
minus5b,
- res0, res1, res2, res3);
-VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
-VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
-DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
- plus20b, res0, res1, res2, res3);
-SRARI_H4_SH(res0, res1, res2, res3, 5);
-SAT_SH4_SH(res0, res1, res2, res3, 7);
-out0 = PCKEV_XORI128_UB(res0, res1);
-out1 = PCKEV_XORI128_UB(res2, res3);
-ST8x4_UB(out0, out1, dst, dst_stride);
-dst += (4 * dst_stride);
-}
-}
-
-static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t src_stride,
-uint8_t *dst, int32_t dst_stride,
-int32_t height)
-{
-uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
-v16i8 mask0, mask1, mask2;
-v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
-v16i8 minus5b = __msa_ldi_b(-5);
-v16i8 plus20b = __msa_ldi_b(20);
-
-LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
-
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB2(src, 8, src0, src1);
-src += src_stride;
-LD_SB2(src, 8, src2, src3);
-src += src_stride;
-
-XORI_B4_128_SB(src0, src1, src2, src3);
-VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
-VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
-VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
-VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
-VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
-VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc lpf msa functions

2017-09-12 Thread kaustubh.raste
From: Kaustubh Raste 

Separate the filter processing into all-strong, all-weak and strong + weak cases.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_lpf_sao_msa.c |  750 ++--
 1 file changed, 556 insertions(+), 194 deletions(-)
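
In outline, the dispatch this introduces (flag computation elided, kernels are placeholders): classify the edge once, then run a specialized kernel when every lane agrees, so the per-lane select/blend work is confined to the mixed case.

#include <stdint.h>

static void filter_strong(uint8_t *src) { src[0] = (uint8_t)(src[0] ^ 1); }
static void filter_weak(uint8_t *src)   { src[0] = (uint8_t)(src[0] ^ 2); }

static void filter_edge_dispatch(uint8_t *src, int all_strong, int all_weak)
{
    if (all_strong) {
        filter_strong(src);                /* strong-only: no selects */
    } else if (all_weak) {
        filter_weak(src);                  /* weak-only: skips strong math */
    } else {
        filter_strong(src);                /* mixed: compute both ... */
        filter_weak(src);                  /* ... and blend per lane */
    }
}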

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c 
b/libavcodec/mips/hevc_lpf_sao_msa.c
index da1db51..79b156f 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 -2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -35,12 +35,14 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, 
int32_t stride,
 uint8_t *q3 = src + (stride << 1) + stride;
 uint8_t flag0, flag1;
 int32_t dp00, dq00, dp30, dq30, d00, d30;
+int32_t d0030, d0434;
 int32_t dp04, dq04, dp34, dq34, d04, d34;
 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
 uint64_t dst_val0, dst_val1;
 v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+v2i64 cmp3;
 v8u16 temp0, temp1;
 v8i16 temp2;
 v8i16 tc_pos, tc_neg;
@@ -54,62 +56,86 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, 
int32_t stride,
 dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
 d00 = dp00 + dq00;
 d30 = dp30 + dq30;
-p_is_pcm0 = p_is_pcm[0];
-q_is_pcm0 = q_is_pcm[0];
 dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
 dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
 dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
 dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
 d04 = dp04 + dq04;
 d34 = dp34 + dq34;
+
+p_is_pcm0 = p_is_pcm[0];
 p_is_pcm4 = p_is_pcm[1];
+q_is_pcm0 = q_is_pcm[0];
 q_is_pcm4 = q_is_pcm[1];
 
-if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
-if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
-p3_src = LD_UH(p3);
-p2_src = LD_UH(p2);
-p1_src = LD_UH(p1);
-p0_src = LD_UH(p0);
-q0_src = LD_UH(q0);
-q1_src = LD_UH(q1);
-q2_src = LD_UH(q2);
-q3_src = LD_UH(q3);
-
-tc0 = tc[0];
-beta30 = beta >> 3;
-beta20 = beta >> 2;
-tc250 = ((tc0 * 5 + 1) >> 1);
-tc4 = tc[1];
-tc254 = ((tc4 * 5 + 1) >> 1);
-
-flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
- abs(p0[0] - q0[0]) < tc250 &&
- abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
- abs(p0[3] - q0[3]) < tc250 &&
- (d00 << 1) < beta20 && (d30 << 1) < beta20);
-cmp0 = __msa_fill_d(flag0);
-
-flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
- abs(p0[4] - q0[4]) < tc254 &&
- abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
- abs(p0[7] - q0[7]) < tc254 &&
- (d04 << 1) < beta20 && (d34 << 1) < beta20);
-cmp1 = __msa_fill_d(flag1);
-cmp2 = __msa_ilvev_d(cmp1, cmp0);
-cmp2 = __msa_ceqi_d(cmp2, 0);
-
-ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
-   zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
-   p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
-   q3_src);
-
-cmp0 = (v2i64) __msa_fill_h(tc0);
-cmp1 = (v2i64) __msa_fill_h(tc4);
-tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+cmp0 = __msa_fill_d(p_is_pcm0);
+cmp1 = __msa_fill_d(p_is_pcm4);
+p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+d0030 = (d00 + d30) >= beta;
+d0434 = (d04 + d34) >= beta;
+
+cmp0 = (v2i64) __msa_fill_w(d0030);
+cmp1 = (v2i64) __msa_fill_w(d0434);
+cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
+
+if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+(!d0030 || !d0434)) {
+p3_src = LD_UH(p3);
+p2_src = LD_UH(p2);
+p1_src = LD_UH(p1);
+p0_src = LD_UH(p0);
+
+cmp0 = __msa_fill_d(q_is_pcm0);
+cmp1 = __msa_fill_d(q_is_pcm4);
+q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+tc0 = tc[0];
+beta30 = beta >> 3;
+beta20 = beta >> 2;
+tc250 = ((tc0 * 5 + 1) >> 1);
+tc4 = tc[1];
+tc254 = ((tc4 * 5 + 1) >> 1);
+
+cmp0 = (v2i64) __msa_fill_h(tc0);
+cmp1 = (v2i64) __msa_fill_h(tc4);
+
+ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc idct msa functions

2017-09-12 Thread kaustubh.raste
From: Kaustubh Raste 

Align the buffers. Remove the redundant constant array.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_idct_msa.c |  255 ++-
 1 file changed, 171 insertions(+), 84 deletions(-)
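
The alignment trick, extracted from the diff below as a standalone sketch: over-allocate the stack scratch by 15 int32_t (60 bytes) and mask the pointer down to a 64-byte boundary. Since the array itself is at least 4-byte aligned, at most 60 bytes of slack are ever needed, so the aligned pointer stays in bounds and a full 8 * 32 words of aligned scratch remain.

#include <stdint.h>

static int32_t *align_scratch_64(int32_t *tmp_buf)
{
    int32_t *p = tmp_buf + 15;             /* worst-case slack */
    return (int32_t *)(((uintptr_t) p) & ~(uintptr_t) 63);
}

/* usage: int32_t tmp_buf[8 * 32 + 15];
 *        int32_t *tmp_buf_ptr = align_scratch_64(tmp_buf); */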

diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c
index d483707..0943119 100644
--- a/libavcodec/mips/hevc_idct_msa.c
+++ b/libavcodec/mips/hevc_idct_msa.c
@@ -21,18 +21,18 @@
 #include "libavutil/mips/generic_macros_msa.h"
 #include "libavcodec/mips/hevcdsp_mips.h"
 
-static const int16_t gt8x8_cnst[16] = {
+static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
 64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
 };
 
-static const int16_t gt16x16_cnst[64] = {
+static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
 64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
 64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
 64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
 64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
 };
 
-static const int16_t gt32x32_cnst0[256] = {
+static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
@@ -51,21 +51,17 @@ static const int16_t gt32x32_cnst0[256] = {
 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
 };
 
-static const int16_t gt32x32_cnst1[64] = {
+static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
 };
 
-static const int16_t gt32x32_cnst2[16] = {
+static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
 };
 
-static const int16_t gt32x32_cnst3[16] = {
-64, 64, 64, 64, 83, 36, -36, -83, 64, -64, -64, 64, 36, -83, 83, -36
-};
-
 #define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,  \
  sum0, sum1, sum2, sum3, shift)   \
 { \
@@ -323,8 +319,12 @@ static void hevc_idct_4x4_msa(int16_t *coeffs)
 HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
 TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
 HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
-TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, sum0, sum1, sum2, sum3);
-PCKEV_H2_SH(sum1, sum0, sum3, sum2, in0, in1);
+
+/* Pack and transpose */
+PCKEV_H2_SH(sum2, sum0, sum3, sum1, in0, in1);
+ILVRL_H2_SW(in1, in0, sum0, sum1);
+ILVRL_W2_SH(sum1, sum0, in0, in1);
+
 ST_SH2(in0, in1, coeffs, 8);
 }
 
@@ -432,27 +432,35 @@ static void hevc_idct_8x32_column_msa(int16_t *coeffs, 
uint8_t buf_pitch,
 const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
 const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
 const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
-const int16_t *filter_ptr3 = &gt32x32_cnst3[0];
+const int16_t *filter_ptr3 = &gt8x8_cnst[0];
 int16_t *src0 = (coeffs + buf_pitch);
 int16_t *src1 = (coeffs + 2 * buf_pitch);
 int16_t *src2 = (coeffs + 4 * buf_pitch);
 int16_t *src3 = (coeffs);
 int32_t cnst0, cnst1;
-int32_t tmp_buf[8 * 32];
-int32_t *tmp_buf_ptr = &tmp_buf[0];
+int32_t tmp_buf[8 * 32 + 15];
+int32_t *tmp_buf_ptr = tmp_buf + 15;
 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
 v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
 v8i16 filt0, filter0, filter1, filter2, filter3;
 v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
 
+/* Align pointer to 64 byte boundary */
+tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
 /* process coeff 4, 12, 20, 28 */
 LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
 ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
 ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);
 
+LD_SH2(src3, 16 * buf_pitch, in4, in6);
+LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7);
+ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r);
+ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l);
+
 /* loop for all columns of constants */
-for (i = 0; i < 4; i++) {
+for (i = 0; i < 2; i++) {
 /* processing single column of constants */
 cnst0 = LW(filter_ptr2);
 

[FFmpeg-devel] [PATCH] avcodec/mips: Removed generic function call in avc intra msa functions

2017-09-25 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264pred_msa.c |  215 +---
 1 file changed, 92 insertions(+), 123 deletions(-)
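
For reference, the 16x16 DC rule the inlined code below implements when both neighbour edges are available (the (sum + 16) >> 5 rounding is visible in the removed helper), as a scalar model:

#include <stddef.h>
#include <stdint.h>

static void pred16x16_dc_ref(uint8_t *src, ptrdiff_t stride)
{
    uint32_t sum = 0;

    for (int i = 0; i < 16; i++)
        sum += src[i - stride];            /* top row */
    for (int i = 0; i < 16; i++)
        sum += src[i * stride - 1];        /* left column */

    uint8_t dc = (uint8_t)((sum + 16) >> 5);

    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] = dc;      /* fill block with the average */
}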

diff --git a/libavcodec/mips/h264pred_msa.c b/libavcodec/mips/h264pred_msa.c
index c297aec..b9990c1 100644
--- a/libavcodec/mips/h264pred_msa.c
+++ b/libavcodec/mips/h264pred_msa.c
@@ -106,115 +106,6 @@ static void intra_predict_horiz_16x16_msa(uint8_t *src, 
int32_t src_stride,
dst, dst_stride);
 }
 
-static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
- int32_t src_stride_left,
- uint8_t *dst, int32_t dst_stride,
- uint8_t is_above, uint8_t is_left)
-{
-uint32_t row;
-uint32_t out, addition = 0;
-v16u8 src_above, store;
-v8u16 sum_above;
-v4u32 sum_top;
-v2u64 sum;
-
-if (is_left && is_above) {
-src_above = LD_UB(src_top);
-
-sum_above = __msa_hadd_u_h(src_above, src_above);
-sum_top = __msa_hadd_u_w(sum_above, sum_above);
-sum = __msa_hadd_u_d(sum_top, sum_top);
-addition = __msa_copy_u_w((v4i32) sum, 0);
-
-for (row = 0; row < 8; row++) {
-addition += src_left[row * src_stride_left];
-}
-
-addition = (addition + 8) >> 4;
-store = (v16u8) __msa_fill_b(addition);
-} else if (is_left) {
-for (row = 0; row < 8; row++) {
-addition += src_left[row * src_stride_left];
-}
-
-addition = (addition + 4) >> 3;
-store = (v16u8) __msa_fill_b(addition);
-} else if (is_above) {
-src_above = LD_UB(src_top);
-
-sum_above = __msa_hadd_u_h(src_above, src_above);
-sum_top = __msa_hadd_u_w(sum_above, sum_above);
-sum = __msa_hadd_u_d(sum_top, sum_top);
-sum = (v2u64) __msa_srari_d((v2i64) sum, 3);
-store = (v16u8) __msa_splati_b((v16i8) sum, 0);
-} else {
-store = (v16u8) __msa_ldi_b(128);
-}
-
-out = __msa_copy_u_w((v4i32) store, 0);
-
-for (row = 8; row--;) {
-SW(out, dst);
-SW(out, (dst + 4));
-dst += dst_stride;
-}
-}
-
-static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
-   int32_t src_stride_left,
-   uint8_t *dst, int32_t dst_stride,
-   uint8_t is_above, uint8_t is_left)
-{
-uint32_t row;
-uint32_t addition = 0;
-v16u8 src_above, store;
-v8u16 sum_above;
-v4u32 sum_top;
-v2u64 sum;
-
-if (is_left && is_above) {
-src_above = LD_UB(src_top);
-
-sum_above = __msa_hadd_u_h(src_above, src_above);
-sum_top = __msa_hadd_u_w(sum_above, sum_above);
-sum = __msa_hadd_u_d(sum_top, sum_top);
-sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
-sum = __msa_hadd_u_d(sum_top, sum_top);
-addition = __msa_copy_u_w((v4i32) sum, 0);
-
-for (row = 0; row < 16; row++) {
-addition += src_left[row * src_stride_left];
-}
-
-addition = (addition + 16) >> 5;
-store = (v16u8) __msa_fill_b(addition);
-} else if (is_left) {
-for (row = 0; row < 16; row++) {
-addition += src_left[row * src_stride_left];
-}
-
-addition = (addition + 8) >> 4;
-store = (v16u8) __msa_fill_b(addition);
-} else if (is_above) {
-src_above = LD_UB(src_top);
-
-sum_above = __msa_hadd_u_h(src_above, src_above);
-sum_top = __msa_hadd_u_w(sum_above, sum_above);
-sum = __msa_hadd_u_d(sum_top, sum_top);
-sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
-sum = __msa_hadd_u_d(sum_top, sum_top);
-sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
-store = (v16u8) __msa_splati_b((v16i8) sum, 0);
-} else {
-store = (v16u8) __msa_ldi_b(128);
-}
-
-for (row = 16; row--;) {
-ST_UB(store, dst);
-dst += dst_stride;
-}
-}
-
 #define INTRA_PREDICT_VALDC_8X8_MSA(val)   
\
 static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, int32_t dst_stride)  
\
 {  
\
@@ -646,8 +537,42 @@ void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, 
ptrdiff_t stride)
 uint8_t *src_top = src - stride;
 uint8_t *src_left = src - 1;
 uint8_t *dst = src;
+uint32_t addition = 0;
+v16u8 src_above, out;
+v8u16 sum_above;
+v4u32 sum_top;
+v2u64 sum;
 
-intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 1, 1);
+src_above = LD_UB(src_top);
+
+sum_above = __msa_hadd_u_h(src_above, src_above);
+sum_top = __msa_hadd_u_w(sum_above, sum_above);
+sum = __msa_hadd_u_d(sum_top, sum_top);

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma vert mc msa functions

2017-09-25 Thread kaustubh.raste
From: Kaustubh Raste 

Replace the generic functions with block-size-specific ones.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  237 ++
 1 file changed, 112 insertions(+), 125 deletions(-)
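
What these kernels compute, as a scalar model: a vertical 2-tap blend with coeff0 + coeff1 == 8; the <<= 3 / SRARI 6 sequence in the vector code performs the same rounding as the + 4 >> 3 here. Which tap weights which row is illustrative.

#include <stddef.h>
#include <stdint.h>

static void chroma_vt_ref(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                          int w, int h, int coeff0, int coeff1)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = (uint8_t)((coeff0 * src[x] +
                                coeff1 * src[x + stride] + 4) >> 3);
        src += stride;
        dst += stride;
    }
}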

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index c27830d..16e2fe4 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -302,8 +302,7 @@ static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t 
*dst, int32_t stride,
 }
 }
 
-static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coeff0, uint32_t coeff1)
 {
 uint16_t out0, out1;
@@ -315,7 +314,7 @@ static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t 
src_stride,
 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
-LD_SB3(src, src_stride, src0, src1, src2);
+LD_SB3(src, stride, src0, src1, src2);
 
 ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
 
@@ -331,12 +330,11 @@ static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t 
src_stride,
 out1 = __msa_copy_u_h(res, 2);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coeff0, uint32_t coeff1)
 {
 v16u8 src0, src1, src2, src3, src4;
@@ -347,39 +345,7 @@ static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t 
src_stride,
 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
-LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
-   tmp0, tmp1, tmp2, tmp3);
-ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
-
-tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
-
-res_r = __msa_dotp_u_h(tmp0, coeff_vec);
-res_r <<= 3;
-res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
-res_r = __msa_sat_u_h(res_r, 7);
-
-res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-
-ST2x4_UB(res, 0, dst, dst_stride);
-}
-
-static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
-  uint32_t coeff0, uint32_t coeff1)
-{
-v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-v16u8 tmp0, tmp1, tmp2, tmp3;
-v8i16 res;
-v8u16 res_r;
-v16i8 coeff_vec0 = __msa_fill_b(coeff0);
-v16i8 coeff_vec1 = __msa_fill_b(coeff1);
-v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
-
-LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-src += (5 * src_stride);
-LD_UB4(src, src_stride, src5, src6, src7, src8);
-
+LD_UB5(src, stride, src0, src1, src2, src3, src4);
 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
 ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
@@ -393,42 +359,21 @@ static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t 
src_stride,
 
 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
-ST2x4_UB(res, 0, dst, dst_stride);
-dst += (4 * dst_stride);
-
-ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
-   tmp0, tmp1, tmp2, tmp3);
-ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
-
-tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
-
-res_r = __msa_dotp_u_h(tmp0, coeff_vec);
-res_r <<= 3;
-res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
-res_r = __msa_sat_u_h(res_r, 7);
-
-res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-
-ST2x4_UB(res, 0, dst, dst_stride);
-dst += (4 * dst_stride);
+ST2x4_UB(res, 0, dst, stride);
 }
 
-static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
  uint32_t coeff0, uint32_t coeff1,
  int32_t height)
 {
 if (2 == height) {
-avc_chroma_vt_2x2_msa(src, src_stride, dst, dst_stride, coeff0, 
coeff1);
+avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
 } else if (4 == height) {
-avc_chroma_vt_2x4_msa(src, src_stride, dst, dst_stride, coeff0, 
coeff1);
-} else if (8 == height) {
-avc_chroma_vt_2x8_msa(src, src_stride, dst, dst_stride, coeff0, 
coeff1);
+

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 20, 01 and 03 msa functions

2017-09-26 Thread kaustubh.raste
From: Kaustubh Raste 

Remove the loops and unroll, as the block sizes are known.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c |  441 +++-
 1 file changed, 432 insertions(+), 9 deletions(-)
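
All of these paths evaluate the H.264 6-tap half-pel filter; a scalar model matching the minus5b/plus20b constants and the SRARI shift by 5 in the code:

#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* One horizontal half-pel output: taps (1, -5, 20, 20, -5, 1),
 * rounded by 16 and shifted right by 5. */
static uint8_t luma_hpel_h(const uint8_t *s)
{
    int sum = s[-2] - 5 * s[-1] + 20 * s[0] + 20 * s[1] - 5 * s[2] + s[3];
    return clip_u8((sum + 16) >> 5);
}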

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index b7f6c3d..0b42bc4 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -148,6 +148,17 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 hz_out_m;\
 } )
 
+#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)   \
+( { \
+v8i16 out0_m;   \
+\
+out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);   \
+out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
+out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
+\
+out0_m; \
+} )
+
 static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
@@ -3373,55 +3384,467 @@ void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const 
uint8_t *src,
 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
  ptrdiff_t stride)
 {
-avc_luma_hz_16w_msa(src - 2, stride, dst, stride, 16);
+uint32_t loop_cnt;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+v16i8 vec11;
+v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+v16i8 minus5b = __msa_ldi_b(-5);
+v16i8 plus20b = __msa_ldi_b(20);
+
+LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+src -= 2;
+
+for (loop_cnt = 4; loop_cnt--;) {
+LD_SB2(src, 8, src0, src1);
+src += stride;
+LD_SB2(src, 8, src2, src3);
+src += stride;
+LD_SB2(src, 8, src4, src5);
+src += stride;
+LD_SB2(src, 8, src6, src7);
+src += stride;
+
+XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+ minus5b, res0, res1, res2, res3);
+DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+ plus20b, res0, res1, res2, res3);
+VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+ minus5b, res4, res5, res6, res7);
+DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+ plus20b, res4, res5, res6, res7);
+SRARI_H4_SH(res0, res1, res2, res3, 5);
+SRARI_H4_SH(res4, res5, res6, res7, 5);
+SAT_SH4_SH(res0, res1, res2, res3, 7);
+SAT_SH4_SH(res4, res5, res6, res7, 7);
+PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
+vec2, vec3);
+XORI_B4_128_SB(vec0, vec1, vec2, vec3);
+ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
+dst += (4 * stride);
+}
 }
 
 void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
 ptrdiff_t stride)
 {
-avc_luma_hz_8w_msa(src - 2, stride, dst, stride, 8);
+v16u8 out0, out1, out2, out3;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+v16i8 vec11;
+v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+v16i8 minus5b = __msa_ldi_b(-5);
+v16i8 plus20b = __msa_ldi_b(20);
+

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc weighted mc msa functions

2017-09-25 Thread kaustubh.raste
From: Kaustubh Raste 

Replace the generic functions with block-size-specific ones.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264dsp_msa.c   |  423 ++-
 libavutil/mips/generic_macros_msa.h |   36 +++
 2 files changed, 306 insertions(+), 153 deletions(-)
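
The per-sample operation, as a scalar model: the offset is pre-shifted (offset_val = offset << log2_denom, as in the patch) so a single rounding shift, matching the srlr instruction, finishes the sample.

#include <stdint.h>

static uint8_t weight_pixel(uint8_t src, int w, int log2_denom, int offset)
{
    int val = w * src + (offset << log2_denom);

    if (log2_denom)
        val += 1 << (log2_denom - 1);      /* rounding term of the shift */
    val >>= log2_denom;
    return (uint8_t)(val < 0 ? 0 : val > 255 ? 255 : val);
}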

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 422703d..5b06bd9 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -25,187 +25,201 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
 int32_t log2_denom, int32_t src_weight,
 int32_t offset_in)
 {
-uint32_t data0, data1;
+uint32_t tp0, tp1, offset_val;
 v16u8 zero = { 0 };
-v16u8 src0, src1;
-v4i32 res0, res1;
-v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;
-v8u16 out0, out1;
+v16u8 src0 = { 0 };
+v8i16 src0_r, tmp0, wgt, denom, offset;
 
-offset_in <<= (log2_denom);
-
-if (log2_denom) {
-offset_in += (1 << (log2_denom - 1));
-}
+offset_val = (unsigned) offset_in << log2_denom;
 
 wgt = __msa_fill_h(src_weight);
-offset = __msa_fill_h(offset_in);
+offset = __msa_fill_h(offset_val);
 denom = __msa_fill_h(log2_denom);
 
-data0 = LW(data);
-data1 = LW(data + stride);
-
-src0 = (v16u8) __msa_fill_w(data0);
-src1 = (v16u8) __msa_fill_w(data1);
+LW2(data, stride, tp0, tp1);
+INSERT_W2_UB(tp0, tp1, src0);
+src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
+tmp0 = wgt * src0_r;
+tmp0 = __msa_adds_s_h(tmp0, offset);
+tmp0 = __msa_maxi_s_h(tmp0, 0);
+tmp0 = __msa_srlr_h(tmp0, denom);
+tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
+src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+ST4x2_UB(src0, data, stride);
+}
 
-ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
-MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
-ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);
-MAXI_SH2_SH(temp0, temp1, 0);
+static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+int32_t src_weight, int32_t offset_in)
+{
+uint32_t tp0, tp1, tp2, tp3, offset_val;
+v16u8 src0 = { 0 };
+v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
 
-out0 = (v8u16) __msa_srl_h(temp0, denom);
-out1 = (v8u16) __msa_srl_h(temp1, denom);
+offset_val = (unsigned) offset_in << log2_denom;
 
-SAT_UH2_UH(out0, out1, 7);
-PCKEV_B2_SW(out0, out0, out1, out1, res0, res1);
+wgt = __msa_fill_h(src_weight);
+offset = __msa_fill_h(offset_val);
+denom = __msa_fill_h(log2_denom);
 
-data0 = __msa_copy_u_w(res0, 0);
-data1 = __msa_copy_u_w(res1, 0);
-SW(data0, data);
-data += stride;
-SW(data1, data);
+LW4(data, stride, tp0, tp1, tp2, tp3);
+INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+UNPCK_UB_SH(src0, src0_r, src1_r);
+MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
+ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
+MAXI_SH2_SH(tmp0, tmp1, 0);
+tmp0 = __msa_srlr_h(tmp0, denom);
+tmp1 = __msa_srlr_h(tmp1, denom);
+SAT_UH2_SH(tmp0, tmp1, 7);
+src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
 }
 
-static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride,
-int32_t height, int32_t log2_denom,
-int32_t src_weight, int32_t offset_in)
+static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+int32_t src_weight, int32_t offset_in)
 {
-uint8_t cnt;
-uint32_t data0, data1, data2, data3;
-v16u8 zero = { 0 };
-v16u8 src0, src1, src2, src3;
-v8u16 temp0, temp1, temp2, temp3, wgt;
-v8i16 denom, offset;
+uint32_t tp0, tp1, tp2, tp3, offset_val;
+v16u8 src0 = { 0 }, src1 = { 0 };
+v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
+v8i16 wgt, denom, offset;
 
-offset_in <<= (log2_denom);
+offset_val = (unsigned) offset_in << log2_denom;
 
-if (log2_denom) {
-offset_in += (1 << (log2_denom - 1));
-}
-
-wgt = (v8u16) __msa_fill_h(src_weight);
-offset = __msa_fill_h(offset_in);
+wgt = __msa_fill_h(src_weight);
+offset = __msa_fill_h(offset_val);
 denom = __msa_fill_h(log2_denom);
 
-for (cnt = height / 4; cnt--;) {
-LW4(data, stride, data0, data1, data2, data3);
-
-src0 = (v16u8) __msa_fill_w(data0);
-src1 = (v16u8) __msa_fill_w(data1);
-src2 = (v16u8) __msa_fill_w(data2);
-src3 = (v16u8) __msa_fill_w(data3);
-
-ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
-   temp0, temp1, temp2, temp3);
-MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
- temp0, temp1, temp2, temp3);
-  

[FFmpeg-devel] [PATCH] avcodec/mips: preload data in hevc sao edge 45 degree filter msa functions

2017-09-25 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_lpf_sao_msa.c |  197 
 1 file changed, 135 insertions(+), 62 deletions(-)
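
For reference, the classification these loops vectorize, as a scalar model: compare the centre pixel with its two diagonal neighbours (up-right and down-left in the 45-degree case), remap the category through the edge_idx table { 1, 2, 0, 3, 4 } seen in the code, and add the selected offset.

#include <stddef.h>
#include <stdint.h>

static int sign3(int a, int b)
{
    return (a > b) - (a < b);              /* -1, 0 or +1 */
}

static uint8_t sao_eo45_pixel(const uint8_t *src, ptrdiff_t stride,
                              const int8_t sao_offset[5])
{
    static const uint8_t edge_idx[5] = { 1, 2, 0, 3, 4 };
    int c = src[0];
    int cat = 2 + sign3(c, src[-stride + 1]) + sign3(c, src[stride - 1]);
    int v = c + sao_offset[edge_idx[cat]];

    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}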

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c 
b/libavcodec/mips/hevc_lpf_sao_msa.c
index 39c647e..c192265 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -1878,23 +1878,25 @@ static void 
hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
  int32_t height)
 {
 uint8_t *src_orig;
-int32_t h_cnt;
 uint32_t dst_val0, dst_val1;
-v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 v16u8 const1 = (v16u8) __msa_ldi_b(1);
-v16i8 zero = { 0 };
+v16i8 offset, sao_offset = LD_SB(sao_offset_val);
 v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
 v16u8 src_minus11, src10, src11;
 v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
-v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;
+v8i16 offset_mask0, offset_mask1;
 
-sao_offset = LD_SH(sao_offset_val);
+sao_offset = __msa_pckev_b(sao_offset, sao_offset);
 
 src_orig = src - 1;
+
+/* load in advance */
 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+LD_UB2(src_orig + src_stride, src_stride, src10, src11);
 
-for (h_cnt = (height >> 1); h_cnt--;) {
-LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+for (height -= 2; height; height -= 2) {
+src_orig += (src_stride << 1);
 
 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
 SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
@@ -1917,20 +1919,22 @@ static void 
hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 
2);
 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 
2);
 
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
-   offset_mask0, offset_mask0, offset_mask0);
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
-   offset_mask1, offset_mask1, offset_mask1);
-ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
-ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
- offset_mask1);
-CLIP_SH2_0_255(offset_mask0, offset_mask1);
+offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
 
-dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
+   offset, offset, offset);
+
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+dst0 = __msa_adds_s_b(dst0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
 
 src_minus10 = src10;
 src_minus11 = src11;
 
+/* load in advance */
+LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
 SW(dst_val0, dst);
@@ -1938,8 +1942,44 @@ static void 
hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
 SW(dst_val1, dst);
 
 dst += dst_stride;
-src_orig += (src_stride << 1);
 }
+
+SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+
+ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
+   src_minus11);
+ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+   src_zero1);
+
+cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+   offset, offset);
+
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+dst0 = __msa_adds_s_b(dst0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+dst_val0 = 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve vp9 idct msa functions

2017-09-04 Thread kaustubh.raste
From: Kaustubh Raste 

Removed memset calls.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/vp9_idct_msa.c |  118 
 1 file changed, 70 insertions(+), 48 deletions(-)
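
A hedged sketch of the idea: the VP9 inverse-transform helpers are expected to leave the coefficient block zeroed, and the clear is cheapest while the block is already being read. In the MSA code this is done with stores of a zero vector (ST_SH2 and friends); memset models that here.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void idct_add_sketch(int16_t *coeffs, uint8_t *dst, ptrdiff_t stride)
{
    int16_t in[16];

    memcpy(in, coeffs, sizeof(in));        /* load the 4x4 block ... */
    memset(coeffs, 0, sizeof(in));         /* ... and clear it in place */

    for (int y = 0; y < 4; y++)            /* placeholder transform+add */
        for (int x = 0; x < 4; x++) {
            int v = dst[y * stride + x] + (in[y * 4 + x] >> 4);
            dst[y * stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
}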

diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c
index 25ea16c..bd762f2 100644
--- a/libavcodec/mips/vp9_idct_msa.c
+++ b/libavcodec/mips/vp9_idct_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Shivraj Patil (shivraj.pa...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Shivraj Patil (shivraj.pa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -352,6 +352,7 @@ static void vp9_idct4x4_1_add_msa(int16_t *input, uint8_t 
*dst,
 out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
 out = ROUND_POWER_OF_TWO(out, 4);
 vec = __msa_fill_h(out);
+input[0] = 0;
 
 ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
 }
@@ -360,9 +361,11 @@ static void vp9_idct4x4_colcol_addblk_msa(int16_t *input, 
uint8_t *dst,
   int32_t dst_stride)
 {
 v8i16 in0, in1, in2, in3;
+v8i16 zero = { 0 };
 
 /* load vector elements of 4x4 block */
 LD4x4_SH(input, in0, in1, in2, in3);
+ST_SH2(zero, zero, input, 8);
 /* rows */
 VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
 /* columns */
@@ -377,9 +380,11 @@ static void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, 
uint8_t *dst,
int32_t dst_stride)
 {
 v8i16 in0, in1, in2, in3;
+v8i16 zero = { 0 };
 
 /* load vector elements of 4x4 block */
 LD4x4_SH(input, in0, in1, in2, in3);
+ST_SH2(zero, zero, input, 8);
 /* rows */
 VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
 /* columns */
@@ -394,9 +399,11 @@ static void vp9_iadst_idct_4x4_add_msa(int16_t *input, 
uint8_t *dst,
int32_t dst_stride, int32_t eob)
 {
 v8i16 in0, in1, in2, in3;
+v8i16 zero = { 0 };
 
 /* load vector elements of 4x4 block */
 LD4x4_SH(input, in0, in1, in2, in3);
+ST_SH2(zero, zero, input, 8);
 /* cols */
 VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
 /* columns */
@@ -411,9 +418,11 @@ static void vp9_idct_iadst_4x4_add_msa(int16_t *input, 
uint8_t *dst,
int32_t dst_stride, int32_t eob)
 {
 v8i16 in0, in1, in2, in3;
+v8i16 zero = { 0 };
 
 /* load vector elements of 4x4 block */
 LD4x4_SH(input, in0, in1, in2, in3);
+ST_SH2(zero, zero, input, 8);
 /* cols */
 VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
 /* columns */
@@ -585,6 +594,7 @@ static void vp9_idct8x8_1_add_msa(int16_t *input, uint8_t 
*dst,
 out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
 val = ROUND_POWER_OF_TWO(out, 5);
 vec = __msa_fill_h(val);
+input[0] = 0;
 
 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
 dst += (4 * dst_stride);
@@ -601,9 +611,9 @@ static void vp9_idct8x8_12_colcol_addblk_msa(int16_t 
*input, uint8_t *dst,
 
 /* load vector elements of 8x8 block */
 LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
-//TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
 
 /* stage1 */
 ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
@@ -659,9 +669,11 @@ static void vp9_idct8x8_colcol_addblk_msa(int16_t *input, 
uint8_t *dst,
   int32_t dst_stride)
 {
 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+v8i16 zero = { 0 };
 
 /* load vector elements of 8x8 block */
 LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
 /* 1D idct8x8 */
 VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
@@ -689,10 +701,11 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t 
*input, uint8_t *dst,
 v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
 v8i16 cnst0, cnst1, cnst2, cnst3, cnst4;
 v8i16 temp0, temp1, temp2, temp3, s0, s1;
-v16i8 zero = { 0 };
+v8i16 zero = { 0 };
 
 /* load vector elements of 8x8 block */
 LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
 
 /* 1D adst8x8 */
 VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
@@ -736,13 +749,13 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t 
*input, uint8_t *dst,
 dst0 = LD_UB(dst + 0 * dst_stride);
 dst7 = LD_UB(dst + 7 * dst_stride);
 
-res0 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst0);
+res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0);
 res0 += out0;
 res0 = 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve vp9 lpf msa functions

2017-09-04 Thread kaustubh.raste
From: Kaustubh Raste 

Updated the VP9_LPF_FILTER4_4W macro to process 8-bit data.
Replaced VP9_LPF_FILTER4_8W with VP9_LPF_FILTER4_4W.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/vp9_lpf_msa.c |   94 ++---
 1 file changed, 14 insertions(+), 80 deletions(-)
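
The core of the 8-bit rework, as a scalar model operating on the sign-flipped (xori 0x80) samples seen in the macro: the common-mode term sat(3 * (q0 - p0) + (p1 - q1)) is accumulated with saturating byte adds instead of widening to 16 bits for a dot product.

#include <stdint.h>

static int8_t sat8(int v)
{
    return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static int8_t filt_delta_8bit(int8_t p1, int8_t p0, int8_t q0, int8_t q1)
{
    int8_t d = sat8(q0 - p0);
    int8_t f = sat8(p1 - q1);

    f = sat8(f + d);                       /* three saturating adds ... */
    f = sat8(f + d);
    f = sat8(f + d);                       /* ... give 3*(q0-p0)+(p1-q1) */
    return f;
}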

diff --git a/libavcodec/mips/vp9_lpf_msa.c b/libavcodec/mips/vp9_lpf_msa.c
index eef8afc..c82a9e9 100644
--- a/libavcodec/mips/vp9_lpf_msa.c
+++ b/libavcodec/mips/vp9_lpf_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Shivraj Patil (shivraj.pa...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Shivraj Patil (shivraj.pa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -22,63 +22,12 @@
 #include "libavutil/mips/generic_macros_msa.h"
 #include "vp9dsp_mips.h"
 
-#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
-   p1_out, p0_out, q0_out, q1_out)   \
-{\
-v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;  \
-v16i8 filt, filt1, filt2, cnst4b, cnst3b;\
-v8i16 q0_sub_p0_r, filt_r, cnst3h;   \
- \
-p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);\
-p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);\
-q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);\
-q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);\
- \
-filt = __msa_subs_s_b(p1_m, q1_m);   \
-filt = filt & (v16i8) hev_in;\
-q0_sub_p0 = q0_m - p0_m; \
-filt_sign = __msa_clti_s_b(filt, 0); \
- \
-cnst3h = __msa_ldi_h(3); \
-q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);\
-q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
-filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);  \
-filt_r += q0_sub_p0_r;   \
-filt_r = __msa_sat_s_h(filt_r, 7);   \
- \
-/* combine left and right part */\
-filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r);\
- \
-filt = filt & (v16i8) mask_in;   \
-cnst4b = __msa_ldi_b(4); \
-filt1 = __msa_adds_s_b(filt, cnst4b);\
-filt1 >>= 3; \
- \
-cnst3b = __msa_ldi_b(3); \
-filt2 = __msa_adds_s_b(filt, cnst3b);\
-filt2 >>= 3; \
- \
-q0_m = __msa_subs_s_b(q0_m, filt1);  \
-q0_out = __msa_xori_b((v16u8) q0_m, 0x80);   \
-p0_m = __msa_adds_s_b(p0_m, filt2);  \
-p0_out = __msa_xori_b((v16u8) p0_m, 0x80);   \
- \
-filt = __msa_srari_b(filt1, 1);  \
-hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
-filt = filt & (v16i8) hev_in;\
- \
-q1_m = __msa_subs_s_b(q1_m, filt);   \
-q1_out = __msa_xori_b((v16u8) q1_m, 0x80);   \
-p1_m = __msa_adds_s_b(p1_m, filt);   \
-p1_out = __msa_xori_b((v16u8) p1_m, 0x80);   \
-}
-
 #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
p1_out, p0_out, q0_out, q1_out)   \
 {\
-v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;  \
-v16i8 filt, filt1, filt2, cnst4b, cnst3b;\
-v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;

[FFmpeg-devel] [PATCH] avcodec/mips: Improve vp9 mc msa functions

2017-09-04 Thread kaustubh.raste
From: Kaustubh Raste 

Load only the specific destination bytes instead of an MSA load and pack.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c  |   17 +-
 libavcodec/mips/vp9_mc_msa.c|  759 ---
 libavutil/mips/generic_macros_msa.h |   24 +-
 3 files changed, 369 insertions(+), 431 deletions(-)

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index c38f1f7..43d21f7 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -1479,7 +1479,8 @@ static void avc_luma_hz_and_aver_dst_8x8_msa(const 
uint8_t *src,
  plus20b, res0, res1, res2, res3);
 SRARI_H4_SH(res0, res1, res2, res3, 5);
 SAT_SH4_SH(res0, res1, res2, res3, 7);
-CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
 dst, dst_stride);
 
 dst += (4 * dst_stride);
@@ -1825,8 +1826,8 @@ static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
 SRARI_H4_SH(out0, out1, out2, out3, 5);
 SAT_SH4_SH(out0, out1, out2, out3, 7);
 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
 dst, dst_stride);
 dst += (4 * dst_stride);
 
@@ -2229,7 +2230,8 @@ static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
 res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
hz_out6, hz_out7, hz_out8);
 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
 dst, dst_stride);
 
 dst += (4 * dst_stride);
@@ -2518,8 +2520,8 @@ static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
 res1 = __msa_aver_s_h(res2, res3);
 res2 = __msa_aver_s_h(res4, res5);
 res3 = __msa_aver_s_h(res6, res7);
-
-CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
 dst, dst_stride);
 dst += (4 * dst_stride);
 
@@ -2676,7 +2678,8 @@ static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
 
 SAT_SH4_SH(out0, out1, out2, out3, 7);
-CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
 dst, dst_stride);
 dst += (4 * dst_stride);
 
diff --git a/libavcodec/mips/vp9_mc_msa.c b/libavcodec/mips/vp9_mc_msa.c
index 1671d97..749e8cb 100644
--- a/libavcodec/mips/vp9_mc_msa.c
+++ b/libavcodec/mips/vp9_mc_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Shivraj Patil (shivraj.pa...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Shivraj Patil (shivraj.pa...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -145,16 +145,15 @@ static const int8_t vp9_bilinear_filters_msa[15][2] = {
 ST_UB(tmp_m, (pdst)); \
 }
 
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
-   pdst, stride)\
-{   \
-v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;   \
-uint8_t *pdst_m = (uint8_t *) (pdst);   \
-\
-PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);\
-PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);\
-AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);\
-ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);   \
+#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3,  dst0, dst1,   \
+   pdst, stride)  \
+{ \
+v16u8 tmp0_m, tmp1_m; \
+uint8_t *pdst_m = (uint8_t *) (pdst); \
+  \
+PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);  \
+

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc bi-weighted mc msa functions

2017-10-09 Thread kaustubh.raste
From: Kaustubh Raste 

Replace generic with block size specific function.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264dsp_msa.c   |  469 +++
 libavutil/mips/generic_macros_msa.h |4 +
 2 files changed, 311 insertions(+), 162 deletions(-)
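
"Replace generic with block size specific function" means the height-looped
helper is split into fixed-size bodies plus a thin dispatcher, so loop control
and the height argument vanish from the hot path. A hypothetical sketch of the
dispatch shape (stub bodies, weight arguments dropped for brevity):

    #include <stdint.h>

    /* Stubs standing in for avc_biwgt_4x2/4x4/4x8_msa. */
    static void biwgt_4x2(uint8_t *src, uint8_t *dst, int32_t stride) { (void) src; (void) dst; (void) stride; }
    static void biwgt_4x4(uint8_t *src, uint8_t *dst, int32_t stride) { (void) src; (void) dst; (void) stride; }
    static void biwgt_4x8(uint8_t *src, uint8_t *dst, int32_t stride) { (void) src; (void) dst; (void) stride; }

    /* A known height selects a straight-line specialisation instead of
     * steering a loop inside one generic body. */
    static void biwgt_4w(uint8_t *src, uint8_t *dst, int32_t stride, int height)
    {
        if (height == 2)
            biwgt_4x2(src, dst, stride);
        else if (height == 4)
            biwgt_4x4(src, dst, stride);
        else /* height == 8 */
            biwgt_4x8(src, dst, stride);
    }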

diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 5b06bd9..e50f5ca 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -223,217 +223,242 @@ static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
 }
 }
 
-static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   int32_t log2_denom, int32_t src_weight,
   int32_t dst_weight, int32_t offset_in)
 {
-uint32_t load0, load1, out0, out1;
-v16i8 src_wgt, dst_wgt, wgt;
-v16i8 src0, src1, dst0, dst1;
-v8i16 temp0, temp1, denom, offset, add_val;
-int32_t val = 128 * (src_weight + dst_weight);
+uint32_t tp0, tp1;
+v16i8 src_wgt, dst_wgt, wgt, vec0;
+v16u8 src0 = { 0 }, dst0 = { 0 };
+v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
 
-offset_in = ((offset_in + 1) | 1) << log2_denom;
+offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+offset_in += (128 * (src_weight + dst_weight));
 
 src_wgt = __msa_fill_b(src_weight);
 dst_wgt = __msa_fill_b(dst_weight);
 offset = __msa_fill_h(offset_in);
 denom = __msa_fill_h(log2_denom + 1);
-add_val = __msa_fill_h(val);
-offset += add_val;
 
 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
-load0 = LW(src);
-src += src_stride;
-load1 = LW(src);
-
-src0 = (v16i8) __msa_fill_w(load0);
-src1 = (v16i8) __msa_fill_w(load1);
-
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
-
-dst0 = (v16i8) __msa_fill_w(load0);
-dst1 = (v16i8) __msa_fill_w(load1);
+LW2(src, stride, tp0, tp1);
+INSERT_W2_UB(tp0, tp1, src0);
+LW2(dst, stride, tp0, tp1);
+INSERT_W2_UB(tp0, tp1, dst0);
+XORI_B2_128_UB(src0, dst0);
+vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
+tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+tmp0 >>= denom;
+tmp0 = __msa_maxi_s_h(tmp0, 0);
+tmp0 = __msa_min_s_h(max255, tmp0);
+dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+ST4x2_UB(dst0, dst, stride);
+}
 
-XORI_B4_128_SB(src0, src1, dst0, dst1);
-ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);
+static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+  int32_t log2_denom, int32_t src_weight,
+  int32_t dst_weight, int32_t offset_in)
+{
+uint32_t tp0, tp1, tp2, tp3;
+v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
+v16u8 src0, dst0;
+v8i16 tmp0, tmp1, denom, offset;
 
-temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
-temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+offset_in += (128 * (src_weight + dst_weight));
 
-temp0 >>= denom;
-temp1 >>= denom;
+src_wgt = __msa_fill_b(src_weight);
+dst_wgt = __msa_fill_b(dst_weight);
+offset = __msa_fill_h(offset_in);
+denom = __msa_fill_h(log2_denom + 1);
 
-CLIP_SH2_0_255(temp0, temp1);
-PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);
+wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
-out0 = __msa_copy_u_w((v4i32) dst0, 0);
-out1 = __msa_copy_u_w((v4i32) dst1, 0);
-SW(out0, dst);
-dst += dst_stride;
-SW(out1, dst);
+LW4(src, stride, tp0, tp1, tp2, tp3);
+INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+LW4(dst, stride, tp0, tp1, tp2, tp3);
+INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+XORI_B2_128_UB(src0, dst0);
+ILVRL_B2_SB(dst0, src0, vec0, vec1);
+tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+tmp0 >>= denom;
+tmp1 >>= denom;
+CLIP_SH2_0_255(tmp0, tmp1);
+dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
 }
 
-static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
-  int32_t height, int32_t log2_denom,
-  int32_t src_weight, int32_t dst_weight,
-  int32_t offset_in)
+static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+  int32_t log2_denom, int32_t src_weight,
+  int32_t dst_weight, int32_t offset_in)
 {
-uint8_t cnt;
-uint32_t load0, load1, load2, load3;
-v16i8 src_wgt, dst_wgt, wgt;
-   

[FFmpeg-devel] [PATCH] avcodec/mips: preload data in hevc sao edge 135 degree filter msa functions

2017-10-09 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_lpf_sao_msa.c |  194 
 1 file changed, 132 insertions(+), 62 deletions(-)
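
The pattern is software pipelining: loads for the next row pair are issued
while the current pair is being filtered, and the last iteration is peeled so
the loop never reads past the block. A scalar sketch of the control-flow
change only (single bytes stand in for whole vector rows; names are
illustrative):

    #include <stdint.h>

    /* Rows for iteration n+1 are fetched while iteration n computes,
     * and the final pair is handled after the loop, as in the patch's
     * "for (height -= 2; height; height -= 2)" shape. */
    static void filter_pipelined(const uint8_t *src, int stride, int height,
                                 void (*compute)(uint8_t a, uint8_t b))
    {
        uint8_t row0 = src[0];               /* load in advance */
        uint8_t row1 = src[stride];

        for (height -= 2; height; height -= 2) {
            src += 2 * stride;
            uint8_t n0 = src[0];             /* next pair's loads overlap */
            uint8_t n1 = src[stride];        /* with this pair's compute  */
            compute(row0, row1);
            row0 = n0;
            row1 = n1;
        }
        compute(row0, row1);                 /* peeled final pair */
    }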

diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
index c192265..5b5537a 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -2226,23 +2226,24 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
   int32_t height)
 {
 uint8_t *src_orig;
-int32_t h_cnt;
 uint32_t dst_val0, dst_val1;
-v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 v16u8 const1 = (v16u8) __msa_ldi_b(1);
-v16i8 zero = { 0 };
+v16i8 offset, sao_offset = LD_SB(sao_offset_val);
 v16i8 src_zero0, src_zero1, dst0;
 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
 v16u8 src_minus10, src10, src_minus11, src11;
-v8i16 offset_mask0, offset_mask1, sao_offset, src00, src01;
+v8i16 offset_mask0, offset_mask1;
 
-sao_offset = LD_SH(sao_offset_val);
+sao_offset = __msa_pckev_b(sao_offset, sao_offset);
 src_orig = src - 1;
 
+/* load in advance */
 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+LD_UB2(src_orig + src_stride, src_stride, src10, src11);
 
-for (h_cnt = (height >> 1); h_cnt--;) {
-LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+for (height -= 2; height; height -= 2) {
+src_orig += (src_stride << 1);
 
 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
@@ -2265,19 +2266,22 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
 
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
-   offset_mask0, offset_mask0, offset_mask0);
-VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
-   offset_mask1, offset_mask1, offset_mask1);
-ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
-ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
- offset_mask1);
-CLIP_SH2_0_255(offset_mask0, offset_mask1);
-dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+   offset, offset);
+
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+dst0 = __msa_adds_s_b(dst0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
 
 src_minus10 = src10;
 src_minus11 = src11;
 
+/* load in advance */
+LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
 
@@ -2286,8 +2290,46 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
 SW(dst_val1, dst);
 
 dst += dst_stride;
-src_orig += (src_stride << 1);
 }
+
+SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+
+ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+   src_minus11);
+ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+   src_zero1);
+
+cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+   offset, offset);
+
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+dst0 = __msa_adds_s_b(dst0, offset);
+dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+   

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 21, 23 and 02 msa functions

2017-10-09 Thread kaustubh.raste
From: Kaustubh Raste 

Remove loops and unroll as block sizes are known.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c | 1219 ++--
 1 file changed, 802 insertions(+), 417 deletions(-)
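
With the qpel block sizes fixed at 4, 8 and 16, a "for (cnt = height >> 2;
cnt--;)" loop always runs a known number of times, so the patch replaces it
with straight-line copies of the body. The transform in miniature (names are
illustrative, not from the patch):

    #include <stdint.h>

    typedef void (*row_kernel)(uint8_t *row);

    /* Generic shape: runtime loop. */
    static void filt_rows_loop(uint8_t *dst, int stride, int height, row_kernel k)
    {
        for (int i = 0; i < height; i++)
            k(dst + i * stride);
    }

    /* Unrolled shape for a known 4-row block: no counter, and every
     * offset is a compile-time constant the compiler can schedule. */
    static void filt_rows_4(uint8_t *dst, int stride, row_kernel k)
    {
        k(dst + 0 * stride);
        k(dst + 1 * stride);
        k(dst + 2 * stride);
        k(dst + 3 * stride);
    }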

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index afc0183..a22a482 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -45,25 +45,6 @@
 out0_m;  \
 } )
 
-#define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2) \
-( { \
-v8i16 out0_m, out1_m;   \
-v16i8 tmp0_m, tmp1_m;   \
-v16i8 minus5b = __msa_ldi_b(-5);\
-v16i8 plus20b = __msa_ldi_b(20);\
-\
-tmp0_m = __msa_vshf_b((v16i8) mask0, in, in);   \
-out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);\
-\
-tmp0_m = __msa_vshf_b((v16i8) mask1, in, in);   \
-out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);  \
-\
-tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in); \
-out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m);  \
-\
-out1_m; \
-} )
-
 static const uint8_t luma_mask_arr[16 * 8] = {
 /* 8 width cases */
 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
@@ -148,6 +129,25 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 hz_out_m;\
 } )
 
+#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
+( {\
+v8i16 out0_m;  \
+v16i8 tmp0_m;  \
+v16i8 minus5b = __msa_ldi_b(-5);   \
+v16i8 plus20b = __msa_ldi_b(20);   \
+   \
+tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);\
+out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);   \
+   \
+tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);\
+out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
+   \
+tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);\
+out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
+   \
+out0_m;\
+} )
+
 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)   \
 ( { \
 v8i16 out0_m;   \
@@ -159,175 +159,17 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 out0_m; \
 } )
 
-static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   int32_t height)
-{
-int32_t loop_cnt;
-int16_t filt_const0 = 0xfb01;
-int16_t filt_const1 = 0x1414;
-int16_t filt_const2 = 0x1fb;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
-v16i8 src87_r, src2110, src4332, src6554, src8776;
-v16i8 filt0, filt1, filt2;
-v8i16 out10, out32;
-v16u8 out;
-
-filt0 = (v16i8) __msa_fill_h(filt_const0);
-filt1 = (v16i8) __msa_fill_h(filt_const1);
-filt2 = (v16i8) __msa_fill_h(filt_const2);
-
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-src += (5 * src_stride);
-
-ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
-   src10_r, src21_r, src32_r, src43_r);
-ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-XORI_B2_128_SB(src2110, src4332);
-
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src, src_stride, src5, src6, src7, src8);
-src += (4 * src_stride);
-
-ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
-   src54_r, src65_r, src76_r, src87_r);
-ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
-XORI_B2_128_SB(src6554, src8776);
-out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
-out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
-SRARI_H2_SH(out10, out32, 5);
-SAT_SH2_SH(out10, out32, 7);
-out = 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma hv mc msa functions

2017-10-09 Thread kaustubh.raste
From: Kaustubh Raste 

Replace generic with block size specific function.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  309 --
 1 file changed, 166 insertions(+), 143 deletions(-)
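
Alongside the per-size split, the new signatures merge src_stride and
dst_stride into a single stride, which matches the H.264 chroma mc interface
(one stride serves both) and frees a register. The shape of the change, as
hypothetical prototypes only:

    #include <stdint.h>

    /* Old shape: separate source and destination strides. */
    static void hv_2x2_old(uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           uint32_t ch0, uint32_t ch1,
                           uint32_t cv0, uint32_t cv1);

    /* New shape: callers always pass equal strides, so one argument
     * suffices (assumption drawn from the patched signatures). */
    static void hv_2x2_new(uint8_t *src, uint8_t *dst, int32_t stride,
                           uint32_t ch0, uint32_t ch1,
                           uint32_t cv0, uint32_t cv1);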

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 16e2fe4..b8fcf6d 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -526,8 +526,7 @@ static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
 }
 }
 
-static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coef_hor0, uint32_t coef_hor1,
   uint32_t coef_ver0, uint32_t coef_ver1)
 {
@@ -544,7 +543,7 @@ static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
 
 mask = LD_SB(&chroma_mask_arr[48]);
 
-LD_UB3(src, src_stride, src0, src1, src2);
+LD_UB3(src, stride, src0, src1, src2);
 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -558,12 +557,11 @@ static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
 out1 = __msa_copy_u_h(res_vert, 1);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
   uint32_t coef_hor0, uint32_t coef_hor1,
   uint32_t coef_ver0, uint32_t coef_ver1)
 {
@@ -580,7 +578,7 @@ static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
 
 mask = LD_SB(&chroma_mask_arr[48]);
 
-LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+LD_UB5(src, stride, src0, src1, src2, src3, src4);
 
 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
 VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
@@ -591,83 +589,27 @@ static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
 res_vt0 += res_vt1;
 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 res_vt0 = __msa_sat_u_h(res_vt0, 7);
-res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-
-ST2x4_UB(res, 0, dst, dst_stride);
-}
-
-static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
-  uint32_t coef_hor0, uint32_t coef_hor1,
-  uint32_t coef_ver0, uint32_t coef_ver1)
-{
-v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-v16u8 tmp0, tmp1, tmp2, tmp3;
-v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
-v8i16 res;
-v16i8 mask;
-v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
-v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
-v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
-v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
-v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
-
-mask = LD_SB(&chroma_mask_arr[48]);
-
-LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-src += (5 * src_stride);
-LD_UB4(src, src_stride, src5, src6, src7, src8);
-
-VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
-VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
-ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
-VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
-ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
-DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-res_vt0 += res_vt1;
-res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
-ST2x4_UB(res, 0, dst, dst_stride);
-dst += (4 * dst_stride);
-
-DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-res_vt0 += res_vt1;
-res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-res_vt0 = __msa_sat_u_h(res_vt0, 7);
-
-res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-
-ST2x4_UB(res, 0, dst, dst_stride);
+ST2x4_UB(res, 0, dst, stride);
 }
 
-static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni-w horiz mc msa functions

2017-10-09 Thread kaustubh.raste
From: Kaustubh Raste 

Load the specific destination bytes instead of MSA load and pack.
Pack the data to half word before clipping.
Use immediate unsigned saturation for the clip to max, saving one vector register.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_macros_msa.h  |   13 +-
 libavcodec/mips/hevc_mc_uniw_msa.c |  641 +---
 2 files changed, 386 insertions(+), 268 deletions(-)
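
The clip-to-[0, 255] previously needed a max-with-zero plus a min against a
register holding 255; unsigned saturation with an immediate (__msa_sat_u_h(x,
7) clamps to 255, as used throughout these files) removes that constant
register. A scalar model of the two shapes (names illustrative):

    #include <stdint.h>

    /* Old shape: upper clamp consumes a register holding 255. */
    static int16_t clip_with_reg(int16_t v, int16_t max255)
    {
        v = v < 0 ? 0 : v;
        return v > max255 ? max255 : v;
    }

    /* New shape: after the max-with-zero, unsigned saturation to 8 bits
     * is an immediate-form op, so no constant register is needed. */
    static uint16_t clip_sat_imm(int16_t v)
    {
        uint16_t u = v < 0 ? 0 : (uint16_t) v;
        return u > 255 ? 255 : u;
    }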

diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h
index b06c5ad..7dcfea0 100644
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -58,6 +58,17 @@
 out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m);  \
 }
 
+#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,\
+  filt0, filt1, filt2, filt3)\
+( {  \
+v8i16 out_m; \
+ \
+out_m = __msa_dotp_s_h((v16i8) in0, (v16i8) filt0);  \
+out_m = __msa_dpadd_s_h(out_m, (v16i8) in1, (v16i8) filt1);  \
+DPADD_SB2_SH(in2, in3, filt2, filt3, out_m, out_m);  \
+out_m;   \
+} )
+
 #define HEVC_FILT_8TAP(in0, in1, in2, in3,   \
filt0, filt1, filt2, filt3)   \
 ( {  \
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
index 38a8844..7c01c32 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -22,6 +22,13 @@
 #include "libavcodec/mips/hevcdsp_mips.h"
 #include "libavcodec/mips/hevc_macros_msa.h"
 
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+/* 8 width cases */
+0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+/* 4 width cases */
+0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
 #define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,   \
out0, out1, out2, out3) \
 {  \
@@ -624,28 +631,35 @@ static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
  int32_t rnd_val)
 {
 uint32_t loop_cnt;
+v16u8 out0, out1;
 v8i16 filt0, filt1, filt2, filt3;
 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-v16i8 mask1, mask2, mask3;
-v8i16 filter_vec, const_vec;
-v16i8 vec0, vec1, vec2, vec3;
-v8i16 dst0, dst1, dst2, dst3;
-v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
-v4i32 weight_vec, offset_vec, rnd_vec;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
+v8i16 filter_vec, dst01, dst23, dst45, dst67;
+v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
+v4i32 weight_vec, rnd_vec;
 
 src -= 3;
 weight = weight & 0x;
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 
 weight_vec = __msa_fill_w(weight);
-offset_vec = __msa_fill_w(offset);
 rnd_vec = __msa_fill_w(rnd_val);
 
+weight *= 128;
+rnd_val -= 6;
+
+weight_vec_h = __msa_fill_h(weight);
+offset_vec = __msa_fill_h(offset);
+denom_vec = __msa_fill_h(rnd_val);
+
+weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
 filter_vec = LD_SH(filter);
 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
+mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 mask1 = mask0 + 2;
 mask2 = mask0 + 4;
 mask3 = mask0 + 6;
@@ -657,34 +671,27 @@ static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
 
 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
-
-dst0 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst0, dst0, dst0, dst0);
 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
-dst1 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst1, dst1, dst1, dst1);
+   vec4, vec5, vec6, vec7);
 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni copy mc msa functions

2017-10-09 Thread kaustubh.raste
From: Kaustubh Raste 

Load the specific bytes instead of MSA load.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uni_msa.c |  245 +++--
 1 file changed, 100 insertions(+), 145 deletions(-)
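
For plain 8-wide copies the rows never need a vector register: a 64-bit
scalar LD/SD pair moves each row, and the height % 12 special case collapses
into 2 / 6 / 8n / 4n branches. A portable scalar model of the new inner copy
(memcpy standing in for LD/SD; not the MSA code itself):

    #include <stdint.h>
    #include <string.h>

    /* Copy an 8-byte-wide block with 64-bit scalar moves, as the patched
     * copy_width8_msa does, instead of vector loads + __msa_copy_u_d. */
    static void copy_w8(const uint8_t *src, int sstride,
                        uint8_t *dst, int dstride, int height)
    {
        for (int i = 0; i < height; i++) {
            uint64_t t;
            memcpy(&t, src + i * sstride, 8);   /* LD */
            memcpy(dst + i * dstride, &t, 8);   /* SD */
        }
    }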

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index cf22e7f..eead591 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -28,83 +28,39 @@ static void copy_width8_msa(uint8_t *src, int32_t src_stride,
 {
 int32_t cnt;
 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
-if (0 == height % 12) {
-for (cnt = (height / 12); cnt--;) {
-LD_UB8(src, src_stride,
-   src0, src1, src2, src3, src4, src5, src6, src7);
-src += (8 * src_stride);
-
-out0 = __msa_copy_u_d((v2i64) src0, 0);
-out1 = __msa_copy_u_d((v2i64) src1, 0);
-out2 = __msa_copy_u_d((v2i64) src2, 0);
-out3 = __msa_copy_u_d((v2i64) src3, 0);
-out4 = __msa_copy_u_d((v2i64) src4, 0);
-out5 = __msa_copy_u_d((v2i64) src5, 0);
-out6 = __msa_copy_u_d((v2i64) src6, 0);
-out7 = __msa_copy_u_d((v2i64) src7, 0);
 
-SD4(out0, out1, out2, out3, dst, dst_stride);
-dst += (4 * dst_stride);
-SD4(out4, out5, out6, out7, dst, dst_stride);
-dst += (4 * dst_stride);
-
-LD_UB4(src, src_stride, src0, src1, src2, src3);
+if (2 == height) {
+LD2(src, src_stride, out0, out1);
+SD(out0, dst);
+dst += dst_stride;
+SD(out1, dst);
+} else if (6 == height) {
+LD4(src, src_stride, out0, out1, out2, out3);
+src += (4 * src_stride);
+SD4(out0, out1, out2, out3, dst, dst_stride);
+dst += (4 * dst_stride);
+LD2(src, src_stride, out0, out1);
+SD(out0, dst);
+dst += dst_stride;
+SD(out1, dst);
+} else if (0 == (height % 8)) {
+for (cnt = (height >> 3); cnt--;) {
+LD4(src, src_stride, out0, out1, out2, out3);
+src += (4 * src_stride);
+LD4(src, src_stride, out4, out5, out6, out7);
 src += (4 * src_stride);
-
-out0 = __msa_copy_u_d((v2i64) src0, 0);
-out1 = __msa_copy_u_d((v2i64) src1, 0);
-out2 = __msa_copy_u_d((v2i64) src2, 0);
-out3 = __msa_copy_u_d((v2i64) src3, 0);
-
-SD4(out0, out1, out2, out3, dst, dst_stride);
-dst += (4 * dst_stride);
-}
-} else if (0 == height % 8) {
-for (cnt = height >> 3; cnt--;) {
-LD_UB8(src, src_stride,
-   src0, src1, src2, src3, src4, src5, src6, src7);
-src += (8 * src_stride);
-
-out0 = __msa_copy_u_d((v2i64) src0, 0);
-out1 = __msa_copy_u_d((v2i64) src1, 0);
-out2 = __msa_copy_u_d((v2i64) src2, 0);
-out3 = __msa_copy_u_d((v2i64) src3, 0);
-out4 = __msa_copy_u_d((v2i64) src4, 0);
-out5 = __msa_copy_u_d((v2i64) src5, 0);
-out6 = __msa_copy_u_d((v2i64) src6, 0);
-out7 = __msa_copy_u_d((v2i64) src7, 0);
-
 SD4(out0, out1, out2, out3, dst, dst_stride);
 dst += (4 * dst_stride);
 SD4(out4, out5, out6, out7, dst, dst_stride);
 dst += (4 * dst_stride);
 }
-} else if (0 == height % 4) {
-for (cnt = (height / 4); cnt--;) {
-LD_UB4(src, src_stride, src0, src1, src2, src3);
+} else if (0 == (height % 4)) {
+for (cnt = (height >> 2); cnt--;) {
+LD4(src, src_stride, out0, out1, out2, out3);
 src += (4 * src_stride);
-out0 = __msa_copy_u_d((v2i64) src0, 0);
-out1 = __msa_copy_u_d((v2i64) src1, 0);
-out2 = __msa_copy_u_d((v2i64) src2, 0);
-out3 = __msa_copy_u_d((v2i64) src3, 0);
-
 SD4(out0, out1, out2, out3, dst, dst_stride);
 dst += (4 * dst_stride);
 }
-} else if (0 == height % 2) {
-for (cnt = (height / 2); cnt--;) {
-LD_UB2(src, src_stride, src0, src1);
-src += (2 * src_stride);
-out0 = __msa_copy_u_d((v2i64) src0, 0);
-out1 = __msa_copy_u_d((v2i64) src1, 0);
-
-SD(out0, dst);
-dst += dst_stride;
-SD(out1, dst);
-dst += dst_stride;
-}
 }
 }
 
@@ -122,33 +78,6 @@ static void copy_width12_msa(uint8_t *src, int32_t src_stride,
 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 }
 
-static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride,
-  int32_t height, int32_t width)
-{
-int32_t cnt, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 11, 31, 13 and 33 msa functions

2017-10-24 Thread kaustubh.raste
From: Kaustubh Raste 

Remove loops and unroll as block sizes are known.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c |  400 
 1 file changed, 240 insertions(+), 160 deletions(-)
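
At the 11/31/13/33 quarter-pel positions the result is the rounded average of
the horizontal and vertical 6-tap outputs, out = __msa_srari_h(hz + vt, 1) in
the source. A per-sample scalar model of that final step:

    #include <stdint.h>

    /* Rounded average of the two filter outputs, as __msa_srari_h(x, 1)
     * computes per lane: (x + 1) >> 1 with arithmetic rounding. */
    static int16_t qpel_diag_avg(int16_t hz, int16_t vt)
    {
        return (int16_t) ((hz + vt + 1) >> 1);
    }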

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index f11fce8..fcccb98 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -171,23 +171,27 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 out0_m; \
 } )
 
-static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
-   int32_t src_stride, uint8_t *dst,
-   int32_t dst_stride, int32_t height)
+static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
+uint8_t *dst, int32_t stride)
 {
-uint32_t loop_cnt;
-v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
-v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
-v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
-v16i8 mask0, mask1, mask2;
-v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
-v8i16 out0, out1;
+const int16_t filt_const0 = 0xfb01;
+const int16_t filt_const1 = 0x1414;
+const int16_t filt_const2 = 0x1fb;
 v16u8 out;
+v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
+v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
+v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
+v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
+
+filt0 = (v16i8) __msa_fill_h(filt_const0);
+filt1 = (v16i8) __msa_fill_h(filt_const1);
+filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
 
-LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
-src_y += (5 * src_stride);
+LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+src_y += (5 * stride);
 
 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
@@ -196,149 +200,237 @@ static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
 
 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
 
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
-src_x += (4 * src_stride);
-
-XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
-
-hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
-  src_hz1, mask0,
-  mask1, mask2);
-hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
-  src_hz3, mask0,
-  mask1, mask2);
-
-SRARI_H2_SH(hz_out0, hz_out1, 5);
-SAT_SH2_SH(hz_out0, hz_out1, 7);
-
-LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
-src_y += (4 * src_stride);
-
-src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
-src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
-src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
-src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
-
-XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
+hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
 
-/* filter calc */
-vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
-  src_vt2, src_vt3,
-  src_vt4, src_vt5);
-vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
-  src_vt4, src_vt5,
-  src_vt6, src_vt7);
+SRARI_H2_SH(hz_out0, hz_out1, 5);
+SAT_SH2_SH(hz_out0, hz_out1, 7);
 
-SRARI_H2_SH(vert_out0, vert_out1, 5);
-SAT_SH2_SH(vert_out0, vert_out1, 7);
+LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
-out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
-out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+src_vt6 = (v16i8) 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi weighted hv mc msa functions

2017-10-24 Thread kaustubh.raste
From: Kaustubh Raste 

Use immediate unsigned saturation for the clip to max, saving one vector register.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_biw_msa.c   |  706 ++-
 libavutil/mips/generic_macros_msa.h |   35 ++
 2 files changed, 489 insertions(+), 252 deletions(-)
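
The two weights travel in one 32-bit lane (weight = weight0 | (weight1 << 16)
in the setup), so a single dot-product-accumulate applies both weights to the
interleaved sample pair on top of the prepared offset. One output lane,
modelled in scalar code (name illustrative):

    #include <stdint.h>

    /* Model of one lane of __msa_dpadd_s_w(offset, weight_vec, ilvd):
     * two interleaved 16-bit samples times packed 16-bit weights,
     * accumulated onto the preseeded offset; the real code then
     * shifts by (rnd_val + 1) and clips. */
    static int32_t biwgt_lane(int16_t s0, int16_t s1,
                              int16_t w0, int16_t w1, int32_t offset)
    {
        return offset + s0 * w0 + s1 * w1;
    }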

diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
index 05a28ec..458e73d 100644
--- a/libavcodec/mips/hevc_mc_biw_msa.c
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (manojkumar.bhos...@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -22,6 +22,12 @@
 #include "libavcodec/mips/hevcdsp_mips.h"
 #include "libavcodec/mips/hevc_macros_msa.h"
 
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+/* 8 width cases */
+0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
 #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
out0_r, out1_r, out0_l, out1_l)  \
 {   \
@@ -1831,23 +1837,23 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
 int32_t rnd_val)
 {
 uint32_t loop_cnt;
-int32_t offset;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-v8i16 in0, in1;
+uint64_t tp0, tp1;
+int32_t offset, weight;
+v16u8 out;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+v8i16 in0 = { 0 }, in1 = { 0 };
 v8i16 filt0, filt1, filt2, filt3;
-v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 v16i8 mask1, mask2, mask3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec, weight_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
-v4i32 dst0_r, dst1_r;
-v4i32 tmp1, tmp2;
-v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
-v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
-v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+v8i16 tmp0, tmp1, tmp2, tmp3;
+v8i16 dst10, dst32, dst54, dst76;
+v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
+v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 
 src0_ptr -= ((3 * src_stride) + 3);
 
@@ -1855,10 +1861,9 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 mask1 = mask0 + 2;
 mask2 = mask0 + 4;
@@ -1866,13 +1871,14 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
 
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x;
+weight = weight0 | (weight1 << 16);
 
-const_vec = __msa_ldi_h(128);
+const_vec = __msa_fill_w((128 * weight1));
 const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
-weight_vec0 = __msa_fill_w(weight0);
-weight_vec1 = __msa_fill_w(weight1);
 rnd_vec = __msa_fill_w(rnd_val + 1);
+offset_vec += const_vec;
+weight_vec = (v8i16) __msa_fill_w(weight);
 
 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
 src0_ptr += (7 * src_stride);
@@ -1886,70 +1892,77 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
vec12, vec13, vec14, vec15);
 
-dst30 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst30, dst30, dst30, dst30);
-dst41 = const_vec;
-DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
- dst41, dst41, dst41, dst41);
-dst52 = const_vec;
-DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
- dst52, dst52, dst52, dst52);
-dst63 = const_vec;
-DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
- dst63, dst63, dst63, dst63);
-
-ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
-   dst10_r, dst21_r, dst32_r);
-dst43_r = __msa_ilvl_h(dst41, dst30);
-dst54_r = __msa_ilvl_h(dst52, dst41);
-dst65_r = 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma copy and avg vert mc msa functions

2017-10-24 Thread kaustubh.raste
From: Kaustubh Raste 

Replace generic with block size specific function.
Load the specific destination bytes instead of MSA load and pack.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  627 +-
 1 file changed, 275 insertions(+), 352 deletions(-)
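
For 2-pixel columns the destination is now read two bytes at a time with LH
and the halfwords inserted into one vector, where the old code loaded four
bytes per row with LW. A scalar model of the gather (illustrative name, not
the MSA code):

    #include <stdint.h>
    #include <string.h>

    /* Gather 4 destination rows of 2 bytes each, mirroring the patched
     * LH + __msa_insert_h sequence in avc_chroma_vt_and_aver_dst_2x4_msa. */
    static void gather_dst_2x4(const uint8_t *dst, int stride, uint16_t lane[4])
    {
        for (int i = 0; i < 4; i++)
            memcpy(&lane[i], dst + i * stride, 2);   /* LH(dst + i*stride) */
    }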

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 2a54675..a5c3334 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1124,24 +1124,25 @@ static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
 }
 }
 
-static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   uint32_t coeff0, uint32_t coeff1)
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+   int32_t stride, uint32_t coeff0,
+   uint32_t coeff1)
 {
 uint16_t out0, out1;
-uint32_t load0, load1;
 v16i8 src0, src1, src2, tmp0, tmp1, res;
 v16u8 dst_data = { 0 };
+v8i16 out;
 v8u16 res_r;
 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
-LD_SB3(src, src_stride, src0, src1, src2);
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
+LD_SB3(src, stride, src0, src1, src2);
+out0 = LH(dst);
+out1 = LH(dst + stride);
 
-INSERT_W2_UB(load0, load1, dst_data);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
 
 ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
 
@@ -1151,20 +1152,20 @@ static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 res_r = __msa_sat_u_h(res_r, 7);
 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-dst_data = __msa_aver_u_b((v16u8) res, dst_data);
-out0 = __msa_copy_u_h((v8i16) dst_data, 0);
-out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+out0 = __msa_copy_u_h(out, 0);
+out1 = __msa_copy_u_h(out, 2);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   uint32_t coeff0, uint32_t coeff1)
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+   int32_t stride, uint32_t coeff0,
+   uint32_t coeff1)
 {
-uint32_t load0, load1;
+uint16_t tp0, tp1, tp2, tp3;
 v16i8 src0, src1, src2, src3, src4;
 v16u8 tmp0, tmp1, tmp2, tmp3;
 v8u16 res_r;
@@ -1174,19 +1175,16 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 v16u8 dst_data = { 0 };
 
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
-
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+LD_SB5(src, stride, src0, src1, src2, src3, src4);
 
-load0 = LW(dst + 2 * dst_stride);
-load1 = LW(dst + 3 * dst_stride);
-
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+tp0 = LH(dst);
+tp1 = LH(dst + stride);
+tp2 = LH(dst + 2 * stride);
+tp3 = LH(dst + 3 * stride);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
 
 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
@@ -1202,102 +1200,26 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
-ST2x4_UB(res, 0, dst, dst_stride);
-dst += (4 * dst_stride);
-}
-
-static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   uint32_t coeff0, uint32_t coeff1)
-{
-uint32_t load0, load1, load2, load3;
-v16i8 src0, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc avg mc 10, 30, 01 and 03 msa functions

2017-11-05 Thread kaustubh.raste
From: Kaustubh Raste 

Align the mask buffer to 64 bytes.
Load the specific destination bytes instead of MSA load and pack.
Remove unused macros and functions.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c | 1269 
 1 file changed, 751 insertions(+), 518 deletions(-)
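
The alignment part of this patch is a one-line declaration change: a
64-byte-aligned table keeps every 16-byte mask load inside a single cache
line. The pattern, shown with the first row of luma_mask_arr from this file:

    #include <stdint.h>

    /* 64-byte alignment: each 16-byte mask vector loaded from the table
     * sits within one cache line. */
    static const uint8_t mask_row[16] __attribute__((aligned(0x40))) = {
        0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    };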

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index dd11f00..9c779bd 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -21,7 +21,7 @@
 #include "libavutil/mips/generic_macros_msa.h"
 #include "h264dsp_mips.h"
 
-static const uint8_t luma_mask_arr[16 * 8] = {
+static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
 /* 8 width cases */
 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
@@ -31,9 +31,6 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
-
-2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
-3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
 };
 
 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
@@ -356,414 +353,6 @@ static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
 }
 }
 
-static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- uint8_t hor_offset)
-{
-uint8_t slide;
-v16i8 src0, src1, src2, src3;
-v16u8 dst0, dst1, dst2, dst3;
-v16i8 mask0, mask1, mask2;
-v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v8i16 out0, out1;
-v16i8 minus5b = __msa_ldi_b(-5);
-v16i8 plus20b = __msa_ldi_b(20);
-v16u8 res0, res1;
-
-LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
-
-if (hor_offset) {
-slide = 3;
-} else {
-slide = 2;
-}
-
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-XORI_B4_128_SB(src0, src1, src2, src3);
-VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
-HADD_SB2_SH(vec0, vec1, out0, out1);
-VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
-DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
-VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
-DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
-SRARI_H2_SH(out0, out1, 5);
-SAT_SH2_SH(out0, out1, 7);
-
-PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
-
-src0 = __msa_sld_b(src0, src0, slide);
-src1 = __msa_sld_b(src1, src1, slide);
-src2 = __msa_sld_b(src2, src2, slide);
-src3 = __msa_sld_b(src3, src3, slide);
-src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
-src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
-res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
-res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
-
-XORI_B2_128_UB(res0, res1);
-
-dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
-dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
-
-AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
-
-ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- uint8_t hor_offset)
-{
-uint8_t slide;
-uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3;
-v16i8 mask0, mask1, mask2;
-v16u8 dst0, dst1, dst2, dst3;
-v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
-v8i16 out0, out1, out2, out3;
-v16i8 minus5b = __msa_ldi_b(-5);
-v16i8 plus20b = __msa_ldi_b(20);
-v16i8 res0, res1, res2, res3;
-
-LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
-
-if (hor_offset) {
-slide = 3;
-} else {
-slide = 2;
-}
-
-for (loop_cnt = 2; loop_cnt--;) {
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
-
-LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-XORI_B4_128_SB(src0, src1, src2, src3);
-VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
-VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
-HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
-VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi 4 tap hv mc msa functions

2017-11-05 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.
Use immediate unsigned saturation for the clip to max, saving one vector register.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_bi_msa.c | 1140 +++---
 1 file changed, 685 insertions(+), 455 deletions(-)
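
The shared table replaces per-function v16i8 initializers that the compiler
would otherwise materialize from its own anonymous constants; every 4-tap hv
function now loads the 4-width mask from offset 16. The table as added by
this series:

    #include <stdint.h>

    /* Shared, aligned mask table; functions load it with
     * LD_SB(ff_hevc_mask_arr + 16) for the 4-width case instead of
     * keeping a local v16i8 initializer. */
    static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
        0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,         /* 8 width */
        0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20  /* 4 width */
    };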

diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
index e9c9184..b17 100644
--- a/libavcodec/mips/hevc_mc_bi_msa.c
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -3772,20 +3772,20 @@ static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
   uint8_t *dst,
   int32_t dst_stride,
   const int8_t *filter_x,
-  const int8_t *filter_y,
-  int32_t height)
+  const int8_t *filter_y)
 {
-v8i16 in0, in1;
+uint64_t tp0, tp1;
+v16u8 out;
+v8i16 in0 = { 0 };
 v16i8 src0, src1, src2, src3, src4;
 v8i16 filt0, filt1;
-v4i32 filt_h0, filt_h1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+v8i16 filt_h0, filt_h1;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 v16i8 mask1;
 v8i16 filter_vec, const_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v8i16 dst0, dst1, dst2, dst3, dst4;
-v4i32 dst0_r, dst1_r;
-v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
+v4i32 dst0, dst1;
 
 src0_ptr -= (src_stride + 1);
 
@@ -3793,56 +3793,43 @@ static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
 
 mask1 = mask0 + 2;
 
 const_vec = __msa_ldi_h(128);
 const_vec <<= 6;
 
-LD_SB3(src0_ptr, src_stride, src0, src1, src2);
-src0_ptr += (3 * src_stride);
-XORI_B3_128_SB(src0, src1, src2);
-
-VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
-VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
-VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
-dst1 = const_vec;
-DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
-dst2 = const_vec;
-DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
-ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
-
-LD_SB2(src0_ptr, src_stride, src3, src4);
-LD_SH2(src1_ptr, src2_stride, in0, in1);
-in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
-XORI_B2_128_SB(src3, src4);
-/* row 3 */
-VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
-dst3 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
-dst32_r = __msa_ilvr_h(dst3, dst2);
-dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
-dst0_r >>= 6;
-/* row 4 */
-VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
-dst4 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
-dst43_r = __msa_ilvr_h(dst4, dst3);
-dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
-dst1_r >>= 6;
-dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
-dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
-dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
-dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
-
-dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
-ST4x2_UB(dst0_r, dst, dst_stride);
+LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+LD2(src1_ptr, src2_stride, tp0, tp1);
+INSERT_D2_SH(tp0, tp1, in0);
+in0 = __msa_adds_s_h(in0, const_vec);
+
+VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
+
+dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+ILVRL_H2_SH(dst31, dst20, dst10, dst32);
+ILVRL_H2_SH(dst42, dst31, dst21, dst43);
+
+dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+dst0 >>= 6;
+dst1 >>= 6;
+tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+tmp = __msa_adds_s_h(tmp, in0);
+tmp = __msa_srari_h(tmp, 7);
+tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
+

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hv mc msa functions

2017-11-05 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.
Use immediate unsigned saturation for the clip to max, saving one vector register.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_biw_msa.c | 1396 +++--
 1 file changed, 872 insertions(+), 524 deletions(-)
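
A setup-hoisting detail visible in this diff: the constant 128 * weight1 bias
(shifted left by 6) is folded into the rounding offset once, outside the
pixel loop, instead of being re-added per block as const_vec. A scalar model
of the one-time setup (function name hypothetical):

    #include <stdint.h>

    /* One-time setup, as in the patched hevc_hv_biwgt_4t_4x2_msa:
     * the (128 * weight1) << 6 bias joins the rounding offset up front. */
    static int32_t make_offset(int32_t offset0, int32_t offset1,
                               int32_t weight1, int32_t rnd_val)
    {
        int32_t offset = (offset0 + offset1) << rnd_val;
        return offset + ((128 * weight1) << 6);
    }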

diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
index 0e5f8a0..ea65f00 100644
--- a/libavcodec/mips/hevc_mc_biw_msa.c
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -4495,26 +4495,25 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
  int32_t dst_stride,
  const int8_t *filter_x,
  const int8_t *filter_y,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
+uint64_t tp0, tp1;
 int32_t offset, weight;
-v8i16 in0, in1;
+v8i16 in0 = { 0 };
+v16u8 out;
 v16i8 src0, src1, src2, src3, src4;
 v8i16 filt0, filt1;
-v4i32 filt_h0, filt_h1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+v8i16 filt_h0, filt_h1;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 v16i8 mask1;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec, tmp, weight_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v8i16 dst0, dst1, dst2, dst3, dst4;
-v4i32 dst0_r, dst1_r, dst0_l;
-v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
-v4i32 weight_vec, offset_vec, rnd_vec;
+v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
+v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
 
 src0_ptr -= (src_stride + 1);
 
@@ -4522,10 +4521,9 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
 
 mask1 = mask0 + 2;
 
@@ -4533,56 +4531,44 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 weight0 = weight0 & 0x;
 weight = weight0 | (weight1 << 16);
 
-const_vec = __msa_ldi_h(128);
+const_vec = __msa_fill_w((128 * weight1));
 const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
-weight_vec = __msa_fill_w(weight);
+weight_vec = (v8i16) __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
+offset_vec += const_vec;
 
-LD_SB3(src0_ptr, src_stride, src0, src1, src2);
-src0_ptr += (3 * src_stride);
-XORI_B3_128_SB(src0, src1, src2);
+LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+XORI_B5_128_SB(src0, src1, src2, src3, src4);
 
-VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
-VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
-VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
-dst1 = const_vec;
-DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
-dst2 = const_vec;
-DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
-ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
 
-LD_SB2(src0_ptr, src_stride, src3, src4);
-LD_SH2(src1_ptr, src2_stride, in0, in1);
-in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
-XORI_B2_128_SB(src3, src4);
-/* row 3 */
-VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
-dst3 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
-dst32_r = __msa_ilvr_h(dst3, dst2);
-dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
-dst0_r >>= 6;
-/* row 4 */
-VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
-dst4 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
-dst43_r = __msa_ilvr_h(dst4, dst3);
-dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
-dst1_r >>= 6;
-dst1_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
 
-ILVRL_H2_SW(dst1_r, in0, dst0_r, dst0_l);
-dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc non-uni hv mc msa functions

2017-11-06 Thread kaustubh.raste
From: Kaustubh Raste 

Use mask buffer.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevcdsp_msa.c | 1478 -
 1 file changed, 870 insertions(+), 608 deletions(-)
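
Only mask0 is loaded from the new table; mask1, mask2 and mask3 are derived
with vector-plus-immediate adds (mask1 = mask0 + 2 in the source), so one
32-byte buffer serves every shuffle pattern of the 8-tap filter. A scalar
model of the derivation:

    #include <stdint.h>

    /* mask1 = mask0 + 2 in the source is a per-byte vector add; the
     * remaining shuffle masks are derived the same way. */
    static void derive_masks(const uint8_t mask0[16], uint8_t mask1[16],
                             uint8_t mask2[16], uint8_t mask3[16])
    {
        for (int i = 0; i < 16; i++) {
            mask1[i] = mask0[i] + 2;
            mask2[i] = mask0[i] + 4;
            mask3[i] = mask0[i] + 6;
        }
    }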

diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
index 73cc3ea..b17127c 100644
--- a/libavcodec/mips/hevcdsp_msa.c
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -22,6 +22,13 @@
 #include "libavcodec/mips/hevcdsp_mips.h"
 #include "libavcodec/mips/hevc_macros_msa.h"
 
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+/* 8 width cases */
+0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+/* 4 width cases */
+0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
 static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
  int16_t *dst, int32_t dst_stride,
  int32_t height)
@@ -1308,31 +1315,28 @@ static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
   int32_t height)
 {
 uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+int32_t dst_stride_in_bytes = 2 * dst_stride;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 v8i16 filt0, filt1, filt2, filt3;
-v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 v16i8 mask1, mask2, mask3;
 v8i16 filter_vec, const_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
-v4i32 dst0_r, dst1_r;
-v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
-v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
-v16i8 mask0 = {
-0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
-};
-v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 
 src -= ((3 * src_stride) + 3);
 filter_vec = LD_SH(filter_x);
 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 mask1 = mask0 + 2;
 mask2 = mask0 + 4;
@@ -1364,47 +1368,56 @@ static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
  dst63, dst63, dst63, dst63);
 
-ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
-   dst10_r, dst21_r, dst32_r);
-dst43_r = __msa_ilvl_h(dst41, dst30);
-dst54_r = __msa_ilvl_h(dst52, dst41);
-dst65_r = __msa_ilvl_h(dst63, dst52);
+ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
 
-for (loop_cnt = height >> 1; loop_cnt--;) {
-LD_SB2(src, src_stride, src7, src8);
-src += (2 * src_stride);
-XORI_B2_128_SB(src7, src8);
+for (loop_cnt = height >> 2; loop_cnt--;) {
+LD_SB4(src, src_stride, src7, src8, src9, src10);
+src += (4 * src_stride);
+XORI_B4_128_SB(src7, src8, src9, src10);
 
-VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
-dst87 = const_vec;
+VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
+   vec4, vec5, vec6, vec7);
+dst97 = const_vec;
+dst108 = const_vec;
 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst87, dst87, dst87, dst87);
-dst76_r = __msa_ilvr_h(dst87, dst66);
+ dst97, dst97, dst97, dst97);
+DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+ dst108, dst108, dst108, dst108);
+
+dst76_r = __msa_ilvr_h(dst97, dst66);
+ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
+dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+dst98_r = __msa_ilvr_h(dst66, dst108);
+
 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
 filt_h0, filt_h1, filt_h2, filt_h3);
-dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hv mc msa functions

2017-11-06 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.
Remove unused macro and table.
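For context, a scalar model of the separable 4-tap hv path these functions vectorize (a minimal sketch, ignoring the xor-128 bias handling; names are hypothetical, not the patch code):

#include <stdint.h>

/* One output sample: horizontal 4-tap into a 16-bit intermediate per row,
 * then a vertical 4-tap across the four intermediates, normalized by >> 6
 * exactly as the MSA code does after HEVC_FILT_4TAP. */
static int32_t hv_4tap_one(const uint8_t *src, int32_t stride,
                           const int8_t *fx, const int8_t *fy)
{
    int32_t mid[4], r;

    for (r = 0; r < 4; r++)
        mid[r] = src[r * stride + 0] * fx[0] + src[r * stride + 1] * fx[1] +
                 src[r * stride + 2] * fx[2] + src[r * stride + 3] * fx[3];

    return (mid[0] * fy[0] + mid[1] * fy[1] +
            mid[2] * fy[2] + mid[3] * fy[3]) >> 6;
}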

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uni_msa.c | 1125 +++--
 1 file changed, 566 insertions(+), 559 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index 993dad0..740c970 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -283,25 +283,6 @@ static void copy_width64_msa(uint8_t *src, int32_t src_stride,
 }
 }
 
-static const uint8_t mc_filt_mask_arr[16 * 3] = {
-/* 8 width cases */
-0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-/* 4 width cases */
-0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
-/* 4 width cases */
-8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
-};
-
-#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)   \
-( { \
-v8i16 tmp0; \
-\
-tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
-tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
-\
-tmp0;   \
-} )
-
 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
  uint8_t *dst, int32_t dst_stride,
  const int8_t *filter)
@@ -3109,19 +3090,18 @@ static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
-   const int8_t *filter_y,
-   int32_t height)
+   const int8_t *filter_y)
 {
+v16u8 out;
 v16i8 src0, src1, src2, src3, src4;
 v8i16 filt0, filt1;
-v4i32 filt_h0, filt_h1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+v8i16 filt_h0, filt_h1;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 v16i8 mask1;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec, tmp;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-v8i16 dst0, dst1, dst2, dst3, dst4;
-v4i32 dst0_r, dst1_r;
-v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
+v4i32 dst0, dst1;
 
 src -= (src_stride + 1);
 
@@ -3129,60 +3109,35 @@ static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
 
 mask1 = mask0 + 2;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
-
-LD_SB3(src, src_stride, src0, src1, src2);
-src += (3 * src_stride);
-
-XORI_B3_128_SB(src0, src1, src2);
-
-VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
-VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
-VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
-
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
-dst1 = const_vec;
-DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
-dst2 = const_vec;
-DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
-
-ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
-LD_SB2(src, src_stride, src3, src4);
-XORI_B2_128_SB(src3, src4);
-
-/* row 3 */
-VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
-dst3 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
-
-dst32_r = __msa_ilvr_h(dst3, dst2);
-dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
-dst0_r >>= 6;
-
-/* row 4 */
-VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
-dst4 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
-
-dst43_r = __msa_ilvr_h(dst4, dst3);
-dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
-dst1_r >>= 6;
-
-dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
-dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
-dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
-dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+XORI_B5_128_SB(src0, src1, src2, src3, src4);
 
-ST4x2_UB(dst0_r, dst, dst_stride);
+VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+VSHF_B2_SB(src1, src3, src1, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted 4 tap vt mc msa functions

2017-11-06 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.
Use immediate unsigned saturation for clip to max saving one vector register.
Remove unused macro.
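The saturation trick in scalar form (illustrative sketch; the MSA version is a signed max against zero followed by sat_u.h with an immediate, so no vector register has to hold the 255 bound):

#include <stdint.h>

/* Clip to [0, 255]: max with 0 first, then unsigned saturation to
 * (7 + 1) bits -- the immediate 7 encodes the upper bound, which is what
 * saves one vector register compared with loading a 255 constant. */
static uint8_t clip_pixel(int16_t x)
{
    if (x < 0)
        x = 0;
    if (x > 255)
        x = 255;
    return (uint8_t) x;
}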

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uniw_msa.c | 1052 ++--
 1 file changed, 529 insertions(+), 523 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
index 31fec73..f9ecb41 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -29,33 +29,6 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
 };
 
-#define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,  \
-out0_r, out1_r, out0_l, out1_l)  \
-{\
-ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r);  \
-ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l);  \
-DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt,  \
-out0_r, out1_r, out0_l, out1_l); \
-SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
-ADD4(out0_r, offset, out1_r, offset, \
- out0_l, offset, out1_l, offset, \
- out0_r, out1_r, out0_l, out1_l);\
-out0_r = CLIP_SW_0_255(out0_r);  \
-out1_r = CLIP_SW_0_255(out1_r);  \
-out0_l = CLIP_SW_0_255(out0_l);  \
-out1_l = CLIP_SW_0_255(out1_l);  \
-}
-
-#define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,  \
-out0_r, out1_r, out2_r, out3_r,\
-out0_l, out1_l, out2_l, out3_l)\
-{  \
-HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,\
-out0_r, out1_r, out0_l, out1_l);   \
-HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd,\
-out2_r, out3_r, out2_l, out3_l);   \
-}
-
 #define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
out0_h, out1_h)\
 { \
@@ -3266,55 +3239,54 @@ static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
   uint8_t *dst,
   int32_t dst_stride,
   const int8_t *filter,
-  int32_t height,
   int32_t weight,
   int32_t offset,
   int32_t rnd_val)
 {
+v16u8 out;
 v16i8 src0, src1, src2, src3, src4;
 v16i8 src10_r, src32_r, src21_r, src43_r;
 v16i8 src2110, src4332;
-v8i16 dst10;
+v8i16 dst0;
 v4i32 dst0_r, dst0_l;
 v8i16 filt0, filt1;
-v8i16 filter_vec, const_vec;
-v4i32 weight_vec, offset_vec, rnd_vec;
+v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+v4i32 weight_vec, rnd_vec;
 
 src -= src_stride;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 weight = weight & 0x0000ffff;
 
 weight_vec = __msa_fill_w(weight);
-offset_vec = __msa_fill_w(offset);
 rnd_vec = __msa_fill_w(rnd_val);
 
+weight *= 128;
+rnd_val -= 6;
+
+weight_vec_h = __msa_fill_h(weight);
+offset_vec = __msa_fill_h(offset);
+denom_vec = __msa_fill_h(rnd_val);
+
+weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
 filter_vec = LD_SH(filter);
 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
 
-LD_SB3(src, src_stride, src0, src1, src2);
-src += (3 * src_stride);
+LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
-src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
-src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
-LD_SB2(src, src_stride, src3, src4);
 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
-src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
-src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
-
-dst10 = const_vec;
-DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
-
-ILVRL_H2_SW(dst10, dst10, dst0_r, dst0_l);
+ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+XORI_B2_128_SB(src2110, src4332);
+dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc non-uni hz and vt mc msa functions

2017-11-09 Thread kaustubh.raste
From: Kaustubh Raste 

Use mask buffer.
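Besides the mask buffer, the 12-width hunk below folds the former 8w + 4w helper calls into a single pass over the block; the loop shape, as a plain scalar sketch (src already points 3 columns left of the block, as in the real function):

#include <stdint.h>

static void filt_12w_fused(const uint8_t *src, int32_t src_stride,
                           int16_t *dst, int32_t dst_stride,
                           const int8_t *filt, int32_t height)
{
    int32_t row, col, k, sum;

    for (row = 0; row < height; row++) {
        /* one pass covers all 12 columns; the MSA version does columns
         * 0..7 with the 8-width masks and 8..11 with the 4-width masks */
        for (col = 0; col < 12; col++) {
            sum = 0;
            for (k = 0; k < 8; k++)
                sum += src[col + k] * filt[k];
            dst[col] = (int16_t) sum;
        }
        src += src_stride;
        dst += dst_stride;
    }
}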

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevcdsp_msa.c |  541 -
 1 file changed, 312 insertions(+), 229 deletions(-)

diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
index b17127c..81db62b 100644
--- a/libavcodec/mips/hevcdsp_msa.c
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -456,7 +456,7 @@ static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
 v16i8 vec0, vec1, vec2, vec3;
 v8i16 dst0, dst1, dst2, dst3;
 v8i16 filter_vec, const_vec;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 
 src -= 3;
 const_vec = __msa_ldi_h(128);
@@ -511,7 +511,7 @@ static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
 v16i8 vec0, vec1, vec2, vec3;
 v8i16 dst0, dst1, dst2, dst3;
 v8i16 filter_vec, const_vec;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
 
 src -= 3;
 const_vec = __msa_ldi_h(128);
@@ -559,8 +559,75 @@ static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
int16_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height)
 {
-hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height);
-hevc_hz_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, filter, height);
+uint32_t loop_cnt;
+int64_t res0, res1, res2, res3;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
+v8i16 filter_vec, const_vec;
+
+src -= 3;
+const_vec = __msa_ldi_h(128);
+const_vec <<= 6;
+
+filter_vec = LD_SH(filter);
+SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+mask0 = LD_SB(ff_hevc_mask_arr);
+mask1 = mask0 + 2;
+mask2 = mask0 + 4;
+mask3 = mask0 + 6;
+mask4 = LD_SB(ff_hevc_mask_arr + 16);
+mask5 = mask4 + 2;
+mask6 = mask4 + 4;
+mask7 = mask4 + 6;
+
+for (loop_cnt = 4; loop_cnt--;) {
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
+src += (4 * src_stride);
+XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+dst0 = const_vec;
+dst1 = const_vec;
+dst2 = const_vec;
+dst3 = const_vec;
+dst4 = const_vec;
+dst5 = const_vec;
+VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+ dst1, dst2, dst3);
+DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
+VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+ dst1, dst2, dst3);
+DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
+VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+ dst1, dst2, dst3);
+DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
+VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
+VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
+VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+ dst1, dst2, dst3);
+DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
+
+res0 = __msa_copy_s_d((v2i64) dst4, 0);
+res1 = __msa_copy_s_d((v2i64) dst4, 1);
+res2 = __msa_copy_s_d((v2i64) dst5, 0);
+res3 = __msa_copy_s_d((v2i64) dst5, 1);
+ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
+dst += (4 * dst_stride);
+}
 }
 
 static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
@@ -568,13 +635,13 @@ static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
const int8_t *filter, int32_t height)
 {
 uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+v16i8 

[FFmpeg-devel] [PATCH] avcodec/mips: cleanup unused macros

2017-11-08 Thread kaustubh.raste
From: Kaustubh Raste 

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_macros_msa.h |   37 -
 1 file changed, 37 deletions(-)

diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h
index 27c69ff..ea53812 100644
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -21,43 +21,6 @@
 #ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H
 #define AVCODEC_MIPS_HEVC_MACROS_MSA_H
 
-#define HEVC_PCK_SW_SB2(in0, in1, out)\
-{ \
-v8i16 tmp0_m; \
-  \
-tmp0_m = __msa_pckev_h((v8i16) in0, (v8i16) in1); \
-out = (v4i32) __msa_pckev_b((v16i8) tmp0_m, (v16i8) tmp0_m);  \
-}
-
-#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)  \
-{ \
-v8i16 tmp0_m, tmp1_m; \
-  \
-PCKEV_H2_SH(in0, in1, in2, in3, tmp0_m, tmp1_m);  \
-out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m);  \
-}
-
-#define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1)  \
-{\
-v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;\
- \
-PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,  \
-tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
-PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
-}
-
-#define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7,   \
- in8, in9, in10, in11, out0, out1, out2)   \
-{  \
-v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;  \
-   \
-PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,\
-tmp0_m, tmp1_m, tmp2_m, tmp3_m);   \
-PCKEV_H2_SH(in8, in9, in10, in11, tmp4_m, tmp5_m); \
-PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);   \
-out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m);  \
-}
-
 #define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,\
   filt0, filt1, filt2, filt3)\
 ( {  \
-- 
1.7.9.5

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc avg mc 20, 21 and 23 msa functions

2017-11-02 Thread kaustubh.raste
From: Kaustubh Raste 

Load the specific destination bytes instead of MSA load and pack.
Remove unused macros and functions.
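The destination-side change, as a scalar sketch (not the MSA code): touch only the bytes the block really covers, then apply the same +1 rounding average that AVER_UB performs.

#include <stdint.h>

/* 4-wide averaged store: read just the 4 destination bytes per row (the
 * MSA version inserts an LW result into a vector lane) instead of doing a
 * 16-byte load plus pack, then round-average with the filtered result. */
static void avg_store_4(const uint8_t *res, uint8_t *dst)
{
    int i;

    for (i = 0; i < 4; i++)
        dst[i] = (uint8_t) ((res[i] + dst[i] + 1) >> 1);
}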

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c | 1274 ++--
 1 file changed, 834 insertions(+), 440 deletions(-)

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index 3df72f5..dd11f00 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -21,30 +21,6 @@
 #include "libavutil/mips/generic_macros_msa.h"
 #include "h264dsp_mips.h"
 
-#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5)\
-( {  \
-v4i32 tmp0_m, tmp1_m;\
-v8i16 out0_m, out1_m, out2_m, out3_m;\
-v8i16 minus5h_m = __msa_ldi_h(-5);   \
-v8i16 plus20h_m = __msa_ldi_h(20);   \
- \
-ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m);   \
- \
-tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \
-tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
- \
-ILVRL_H2_SH(in1, in4, out0_m, out1_m);   \
-DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m);  \
-ILVRL_H2_SH(in2, in3, out2_m, out3_m);   \
-DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m);  \
- \
-SRARI_W2_SW(tmp0_m, tmp1_m, 10); \
-SAT_SW2_SW(tmp0_m, tmp1_m, 7);   \
-out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m);  \
- \
-out0_m;  \
-} )
-
 static const uint8_t luma_mask_arr[16 * 8] = {
 /* 8 width cases */
 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
@@ -75,44 +51,6 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);  \
 }
 
-#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
-( {  \
-v4i32 tmp1_m;  \
-v8i16 tmp2_m, tmp3_m;  \
-v8i16 minus5h_m = __msa_ldi_h(-5);  \
-v8i16 plus20h_m = __msa_ldi_h(20);  \
- \
-tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0);  \
-tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);  \
- \
-ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m);  \
-DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m);  \
- \
-tmp1_m = __msa_srari_w(tmp1_m, 10);  \
-tmp1_m = __msa_sat_s_w(tmp1_m, 7);  \
- \
-tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m);  \
- \
-tmp2_m;  \
-} )
-
-#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,  \
-mask0, mask1, mask2) \
-( {  \
-v8i16 hz_out_m;  \
-v16i8 vec0_m, vec1_m, vec2_m;\
-v16i8 minus5b_m = __msa_ldi_b(-5);   \
-v16i8 plus20b_m = __msa_ldi_b(20);   \
- \
-vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0);\
-hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m);   \
- 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi 4 tap hz and vt mc msa functions

2017-11-02 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_bi_msa.c |  428 +++---
 1 file changed, 210 insertions(+), 218 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
index 9c03ef8..e9c9184 100644
--- a/libavcodec/mips/hevc_mc_bi_msa.c
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -2183,7 +2183,7 @@ static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
 v8i16 filt0, filt1;
 v16i8 src0, src1, dst0, vec0, vec1;
 v8i16 in0, in1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1;
 v8i16 tmp0;
 v8i16 filter_vec, const_vec;
@@ -2226,7 +2226,8 @@ static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
 v8i16 filt0, filt1;
 v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
 v8i16 in0, in1, in2, in3;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 vec2, vec3;
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1;
 v8i16 tmp0, tmp1;
 v8i16 filter_vec, const_vec;
@@ -2247,12 +2248,12 @@ static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
 XORI_B4_128_SB(src0, src1, src2, src3);
 
-VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
 tmp0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
-VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
 tmp1 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
+VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
+ tmp0, tmp1);
 HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
 dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
@@ -2273,8 +2274,8 @@ static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
 v16i8 dst0, dst1;
 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-v16i8 mask1, vec0, vec1;
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+v16i8 mask1, vec0, vec1, vec2, vec3;
 v8i16 tmp0, tmp1, tmp2, tmp3;
 v8i16 filter_vec, const_vec;
 
@@ -2300,18 +2301,18 @@ static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 
-VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
 tmp0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
-VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
 tmp1 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
-VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
 tmp2 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
-VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
 tmp3 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);
+VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
+ tmp1, tmp2, tmp3);
+VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
+VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
+ tmp1, tmp2, tmp3);
 
 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
   tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
@@ -2357,9 +2358,9 @@ static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
 v8i16 filt0, filt1;
 v16i8 src0, src1, src2, src3;
 v8i16 in0, in1, in2, in3;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
 v16i8 mask1;
-v16i8 vec0, vec1;
+v16i8 vec0, vec1, vec2, vec3;
 v8i16 dst0, dst1, dst2, dst3;
 v8i16 filter_vec, const_vec;
 
@@ -2380,18 +2381,18 @@ static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
 src1_ptr += (4 * src2_stride);
 XORI_B4_128_SB(src0, src1, src2, src3);
 
-VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
-VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 dst1 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions

2017-11-03 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.
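One detail worth calling out in the hunks below: dropping const_vec works because the 8-bit plane is XORed with 128 before filtering (the 4 taps sum to 64, a 6-bit gain), and that bias, scaled by weight1, can be folded into the scalar offset once instead of being added per vector; a sketch of the arithmetic:

#include <stdint.h>

/* offset as the patch computes it: the per-vector const_vec (128 << 6)
 * addition is replaced by a one-time compensation scaled by weight1 */
static int32_t biwgt_offset(int32_t offset0, int32_t offset1,
                            int32_t weight1, int32_t rnd_val)
{
    int32_t offset = (offset0 + offset1) << rnd_val;

    return offset + ((128 * weight1) << 6);
}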

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_biw_msa.c |  587 -
 1 file changed, 247 insertions(+), 340 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
index 75c1c7a..0e5f8a0 100644
--- a/libavcodec/mips/hevc_mc_biw_msa.c
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -2633,22 +2633,21 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
  uint8_t *dst,
  int32_t dst_stride,
  const int8_t *filter,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
-int32_t offset, weight;
+int32_t offset, weight, constant;
 v8i16 filt0, filt1;
 v16i8 src0, src1;
 v8i16 in0, in1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1, vec0, vec1;
 v8i16 dst0;
 v4i32 dst0_r, dst0_l;
-v8i16 filter_vec, const_vec;
+v8i16 out0, filter_vec;
 v4i32 weight_vec, offset_vec, rnd_vec;
 
 src0_ptr -= 1;
@@ -2661,9 +2660,10 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x0000ffff;
 weight = weight0 | (weight1 << 16);
+constant = 128 * weight1;
+constant <<= 6;
+offset += constant;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
 weight_vec = __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
@@ -2674,18 +2674,16 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 XORI_B2_128_SB(src0, src1);
 
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 
 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
-dst0_r = CLIP_SW_0_255(dst0_r);
-dst0_l = CLIP_SW_0_255(dst0_l);
-
-HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
-ST4x2_UB(dst0_r, dst, dst_stride);
+dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+out0 = CLIP_SH_0_255(dst0_r);
+out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
+ST4x2_UB(out0, dst, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@@ -2695,22 +2693,21 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
  uint8_t *dst,
  int32_t dst_stride,
  const int8_t *filter,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
-int32_t offset, weight;
+int32_t offset, weight, constant;
 v8i16 filt0, filt1;
 v16i8 src0, src1, src2, src3;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1;
 v8i16 dst0, dst1;
 v16i8 vec0, vec1;
 v8i16 in0, in1, in2, in3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec;
 v4i32 weight_vec, offset_vec, rnd_vec;
 
 src0_ptr -= 1;
@@ -2724,9 +2721,10 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x0000ffff;
 weight = weight0 | (weight1 << 16);
+constant = 128 * weight1;
+constant <<= 6;
+offset += constant;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
 weight_vec = __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
@@ -2737,11 +2735,9 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
 
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
-dst1 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+dst1 = 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions

2017-11-03 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uni_msa.c |  509 -
 1 file changed, 274 insertions(+), 235 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index 7d24858..993dad0 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -1947,7 +1947,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
 v16u8 out;
 v8i16 filt, res0;
 
-mask0 = LD_SB(&mc_filt_mask_arr[16]);
+mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -1959,7 +1959,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
 LD_SB2(src, src_stride, src0, src1);
 XORI_B2_128_SB(src0, src1);
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 res0 = __msa_srari_h(res0, 6);
 res0 = __msa_sat_s_h(res0, 7);
 out = PCKEV_XORI128_UB(res0, res0);
@@ -1974,7 +1974,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
 v8i16 filt, out0, out1;
 v16u8 out;
 
-mask0 = LD_SB(&mc_filt_mask_arr[16]);
+mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -2001,7 +2001,7 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
 v16u8 out;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(&mc_filt_mask_arr[16]);
+mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -2038,7 +2038,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
 v16u8 out;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(&mc_filt_mask_arr[16]);
+mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -2098,12 +2098,11 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
 uint8_t *dst, int32_t dst_stride,
 const int8_t *filter, int32_t height)
 {
-uint32_t loop_cnt;
 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
 v16u8 out4, out5;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(&mc_filt_mask_arr[0]);
+mask0 = LD_SB(&ff_hevc_mask_arr[0]);
 src -= 1;
 
 /* rearranging filter */
@@ -2112,21 +2111,31 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
 
 mask1 = mask0 + 2;
 
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+src += (4 * src_stride);
 
-XORI_B4_128_SB(src0, src1, src2, src3);
-HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
-   filt1, out0, out1, out2, out3);
-SRARI_H4_SH(out0, out1, out2, out3, 6);
-SAT_SH4_SH(out0, out1, out2, out3, 7);
+XORI_B4_128_SB(src0, src1, src2, src3);
+HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+   filt1, out0, out1, out2, out3);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
+SAT_SH4_SH(out0, out1, out2, out3, 7);
+out4 = PCKEV_XORI128_UB(out0, out1);
+out5 = PCKEV_XORI128_UB(out2, out3);
+ST6x4_UB(out4, out5, dst, dst_stride);
+dst += (4 * dst_stride);
 
-out4 = PCKEV_XORI128_UB(out0, out1);
-out5 = PCKEV_XORI128_UB(out2, out3);
-ST6x4_UB(out4, out5, dst, dst_stride);
-dst += (4 * dst_stride);
-}
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+src += (4 * src_stride);
+
+XORI_B4_128_SB(src0, src1, src2, src3);
+HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+   filt1, out0, out1, out2, out3);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
+SAT_SH4_SH(out0, out1, out2, out3, 7);
+out4 = PCKEV_XORI128_UB(out0, out1);
+out5 = PCKEV_XORI128_UB(out2, out3);
+ST6x4_UB(out4, out5, dst, dst_stride);
+dst += (4 * dst_stride);
 }
 
 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
@@ -2138,7 +2147,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
 v16u8 out;
 v8i16 filt, vec0, vec1, vec2, vec3;
 
-mask0 = LD_SB(&mc_filt_mask_arr[0]);
+mask0 = LD_SB(&ff_hevc_mask_arr[0]);
 src -= 1;
 
 filt = LD_SH(filter);
@@ -2172,7 +2181,7 @@ static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
 v16u8 tmp0, tmp1;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(&mc_filt_mask_arr[0]);
+mask0 = LD_SB(&ff_hevc_mask_arr[0]);
 src -= 1;
 
 /* rearranging filter */
@@ -2221,8 +2230,8 @@ static void 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted 4 tap hz mc msa functions

2017-11-03 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.
Use immediate unsigned saturation for clip to max saving one vector register.
Remove unused macro.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uniw_msa.c |  892 ++--
 1 file changed, 448 insertions(+), 444 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
index 0796b0a..31fec73 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -29,19 +29,6 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
 };
 
-#define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,   \
-   out0, out1, out2, out3) \
-{  \
-MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3);  \
-SRAR_W4_SW(out0, out1, out2, out3, rnd);   \
-ADD4(out0, offset, out1, offset, out2, offset, out3, offset,   \
- out0, out1, out2, out3);  \
-out0 = CLIP_SW_0_255(out0);\
-out1 = CLIP_SW_0_255(out1);\
-out2 = CLIP_SW_0_255(out2);\
-out3 = CLIP_SW_0_255(out3);\
-}
-
 #define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,  \
 out0_r, out1_r, out0_l, out1_l)  \
 {\
@@ -2382,19 +2369,19 @@ static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
   uint8_t *dst,
   int32_t dst_stride,
   const int8_t *filter,
-  int32_t height,
   int32_t weight,
   int32_t offset,
   int32_t rnd_val)
 {
+v16u8 out;
 v8i16 filt0, filt1;
 v16i8 src0, src1, vec0, vec1;
 v16i8 mask1;
 v8i16 dst0;
 v4i32 dst0_r, dst0_l;
-v8i16 filter_vec, const_vec;
-v4i32 weight_vec, offset_vec, rnd_vec;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+v4i32 weight_vec, rnd_vec;
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 
 src -= 1;
 
@@ -2405,29 +2392,33 @@ static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
 
 weight = weight & 0x0000ffff;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
-
 weight_vec = __msa_fill_w(weight);
-offset_vec = __msa_fill_w(offset);
 rnd_vec = __msa_fill_w(rnd_val);
 
+weight *= 128;
+rnd_val -= 6;
+
+weight_vec_h = __msa_fill_h(weight);
+offset_vec = __msa_fill_h(offset);
+denom_vec = __msa_fill_h(rnd_val);
+
+weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
 LD_SB2(src, src_stride, src0, src1);
 XORI_B2_128_SB(src0, src1);
 
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 
 ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
-ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
-dst0_r = CLIP_SW_0_255(dst0_r);
-dst0_l = CLIP_SW_0_255(dst0_l);
-
-HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
-ST4x2_UB(dst0_r, dst, dst_stride);
+dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+dst0 = __msa_adds_s_h(dst0, offset_vec);
+dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+ST4x2_UB(out, dst, dst_stride);
 dst += (4 * dst_stride);
 }
 
@@ -2436,19 +2427,18 @@ static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
   uint8_t *dst,
   int32_t dst_stride,
   const int8_t *filter,
-  int32_t height,
   int32_t weight,
   int32_t offset,
   int32_t rnd_val)
 {
+v16u8 out;
 v8i16 filt0, filt1;
 v16i8 src0, src1, src2, src3;
-v16i8 mask1, vec0, vec1;
+v16i8 mask1, vec0, vec1, vec2, vec3;
 v8i16 dst0, dst1;
-v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
-v8i16 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted hv mc msa functions

2017-10-31 Thread kaustubh.raste
From: Kaustubh Raste 

Use immediate unsigned saturation for clip to max saving one vector register.
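This patch also introduces the HEVC_FILT_4TAP_SH helper (hunk below), which computes, per 16-bit lane, two byte-pair dot products accumulated together; a scalar model of one lane (illustrative only):

#include <stdint.h>

/* dotp_s.h then dpadd_s.h, one lane: (in0 . f0) + (in1 . f1) over
 * interleaved byte pairs */
static int16_t filt_4tap_lane(const int8_t in0[2], const int8_t in1[2],
                              const int8_t f0[2], const int8_t f1[2])
{
    int32_t t = in0[0] * f0[0] + in0[1] * f0[1]
              + in1[0] * f1[0] + in1[1] * f1[1];

    return (int16_t) t;
}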

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_macros_msa.h  |9 +
 libavcodec/mips/hevc_mc_uniw_msa.c | 1598 +---
 2 files changed, 965 insertions(+), 642 deletions(-)

diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h
index 7dcfea0..27c69ff 100644
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -80,6 +80,15 @@
 out_m;   \
 } )
 
+#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)\
+( {  \
+v8i16 out_m; \
+ \
+out_m = __msa_dotp_s_h((v16i8) in0, (v16i8) filt0);  \
+out_m = __msa_dpadd_s_h(out_m, (v16i8) in1, (v16i8) filt1);  \
+out_m;   \
+} )
+
 #define HEVC_FILT_4TAP(in0, in1, filt0, filt1)   \
 ( {  \
 v4i32 out_m; \
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
index 28c7062f..0796b0a 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -1801,40 +1801,42 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
  int32_t rnd_val)
 {
 uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+v16u8 out;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 v8i16 filt0, filt1, filt2, filt3;
-v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 v16i8 mask1, mask2, mask3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
-v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
-v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
-v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 
 src -= ((3 * src_stride) + 3);
 filter_vec = LD_SH(filter_x);
 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 mask1 = mask0 + 2;
 mask2 = mask0 + 4;
 mask3 = mask0 + 6;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
-
 weight_vec = __msa_fill_w(weight);
 offset_vec = __msa_fill_w(offset);
 rnd_vec = __msa_fill_w(rnd_val);
+denom_vec = rnd_vec - 6;
+
+const_128 = __msa_ldi_w(128);
+const_128 *= weight_vec;
+offset_vec += __msa_srar_w(const_128, denom_vec);
 
 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 src += (7 * src_stride);
@@ -1847,64 +1849,68 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
vec8, vec9, vec10, vec11);
 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
vec12, vec13, vec14, vec15);
-dst30 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst30, dst30, dst30, dst30);
-dst41 = const_vec;
-DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
- dst41, dst41, dst41, dst41);
-dst52 = const_vec;
-DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
- dst52, dst52, dst52, dst52);
-dst63 = const_vec;
-DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
- dst63, dst63, dst63, dst63);
-
-ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
-   dst10_r, dst21_r, dst32_r);
-
-dst43_r = __msa_ilvl_h(dst41, dst30);
-dst54_r = __msa_ilvl_h(dst52, dst41);
-dst65_r = __msa_ilvl_h(dst63, dst52);
+dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+  filt3);
+dst41 = 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc avg mc 02, 12 and 32 msa functions

2017-11-01 Thread kaustubh.raste
From: Kaustubh Raste 

Remove loops and unroll as block sizes are known.
Load the specific destination bytes instead of MSA load and pack.
Remove unused macro and functions.
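The unrolling idea, sketched with a hypothetical 4x4 kernel (the qpel avg block sizes are fixed at 4, 8 and 16, so the height-driven loops can be flattened into straight-line code):

#include <stdint.h>

static void avg_row4(const uint8_t *res, uint8_t *dst)
{
    int i;

    for (i = 0; i < 4; i++)
        dst[i] = (uint8_t) ((res[i] + dst[i] + 1) >> 1);
}

/* straight-line 4x4: no loop counter, no backward branch */
static void avg_4x4(const uint8_t *res, int32_t res_stride,
                    uint8_t *dst, int32_t dst_stride)
{
    avg_row4(res, dst);
    avg_row4(res + res_stride, dst + dst_stride);
    avg_row4(res + 2 * res_stride, dst + 2 * dst_stride);
    avg_row4(res + 3 * res_stride, dst + 3 * dst_stride);
}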

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c | 1002 +++-
 1 file changed, 676 insertions(+), 326 deletions(-)

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index e3a8634..3df72f5 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -749,177 +749,6 @@ static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
 }
 }
 
-static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst, int32_t dst_stride)
-{
-int16_t filt_const0 = 0xfb01;
-int16_t filt_const1 = 0x1414;
-int16_t filt_const2 = 0x1fb;
-v16u8 dst0, dst1, dst2, dst3;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
-v16i8 src87_r, src2110, src4332, src6554, src8776;
-v8i16 out10, out32;
-v16i8 filt0, filt1, filt2;
-v16u8 res;
-
-filt0 = (v16i8) __msa_fill_h(filt_const0);
-filt1 = (v16i8) __msa_fill_h(filt_const1);
-filt2 = (v16i8) __msa_fill_h(filt_const2);
-
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-src += (5 * src_stride);
-
-ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
-   src10_r, src21_r, src32_r, src43_r);
-ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-XORI_B2_128_SB(src2110, src4332);
-LD_SB4(src, src_stride, src5, src6, src7, src8);
-ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
-   src54_r, src65_r, src76_r, src87_r);
-ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
-XORI_B2_128_SB(src6554, src8776);
-out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
-out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
-SRARI_H2_SH(out10, out32, 5);
-SAT_SH2_SH(out10, out32, 7);
-LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-res = PCKEV_XORI128_UB(out10, out32);
-
-ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-
-dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
-dst0 = __msa_aver_u_b(res, dst0);
-
-ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst, int32_t dst_stride)
-{
-int32_t loop_cnt;
-int16_t filt_const0 = 0xfb01;
-int16_t filt_const1 = 0x1414;
-int16_t filt_const2 = 0x1fb;
-v16u8 dst0, dst1, dst2, dst3;
-v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
-v16i8 src10_r, src32_r, src76_r, src98_r;
-v16i8 src21_r, src43_r, src87_r, src109_r;
-v8i16 out0, out1, out2, out3;
-v16i8 filt0, filt1, filt2;
-
-filt0 = (v16i8) __msa_fill_h(filt_const0);
-filt1 = (v16i8) __msa_fill_h(filt_const1);
-filt2 = (v16i8) __msa_fill_h(filt_const2);
-
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-src += (5 * src_stride);
-
-XORI_B5_128_SB(src0, src1, src2, src3, src4);
-ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
-   src10_r, src21_r, src32_r, src43_r);
-
-for (loop_cnt = 2; loop_cnt--;) {
-LD_SB4(src, src_stride, src7, src8, src9, src10);
-src += (4 * src_stride);
-
-XORI_B4_128_SB(src7, src8, src9, src10);
-ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
-   src76_r, src87_r, src98_r, src109_r);
-out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
-out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
-out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
-out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
-SRARI_H4_SH(out0, out1, out2, out3, 5);
-SAT_SH4_SH(out0, out1, out2, out3, 7);
-LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
-dst, dst_stride);
-dst += (4 * dst_stride);
-
-src10_r = src76_r;
-src32_r = src98_r;
-src21_r = src87_r;
-src43_r = src109_r;
-src4 = src10;
-}
-}
-
-static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
-   int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride)
-{
-int32_t loop_cnt;
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc avg mc 22, 11, 31, 13 and 33 msa functions

2017-10-27 Thread kaustubh.raste
From: Kaustubh Raste 

Remove loops and unroll as block sizes are known.
Load the specific destination bytes instead of MSA load and pack.
Remove unused macro and functions.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264qpel_msa.c |  781 ++--
 1 file changed, 513 insertions(+), 268 deletions(-)

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index fcccb98..e3a8634 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -75,22 +75,6 @@ static const uint8_t luma_mask_arr[16 * 8] = {
 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);  \
 }
 
-#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
-( {  \
-v8i16 tmp1_m;  \
-v16i8 tmp0_m, tmp2_m;  \
-v16i8 minus5b_m = __msa_ldi_b(-5);  \
-v16i8 plus20b_m = __msa_ldi_b(20);  \
- \
-tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0);  \
-tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m);  \
- \
-ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m);  \
-DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m);  \
- \
-tmp1_m;  \
-} )
-
-#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
-( {  \
-v4i32 tmp1_m;  \
@@ -1157,128 +1141,6 @@ static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
 }
 }
 
-static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
-  int32_t src_stride,
-  uint8_t *dst, int32_t dst_stride)
-{
-v16i8 src0, src1, src2, src3, src4;
-v16i8 mask0, mask1, mask2;
-v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
-v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
-v8i16 res0, res1, res2, res3;
-v16u8 dst0, dst1, dst2, dst3;
-v16u8 tmp0, tmp1, tmp2, tmp3;
-
-LD_SB3(_mask_arr[48], 16, mask0, mask1, mask2);
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-src += (5 * src_stride);
-
-XORI_B5_128_SB(src0, src1, src2, src3, src4);
-
-hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
-  mask0, mask1, mask2);
-hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
-  mask0, mask1, mask2);
-
-PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
-
-hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
-
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-XORI_B4_128_SB(src0, src1, src2, src3);
-
-hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
-  mask0, mask1, mask2);
-hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
-  mask0, mask1, mask2);
-
-PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
-
-res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
- hz_out3, hz_out4, hz_out5);
-res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
- hz_out4, hz_out5, hz_out6);
-res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
- hz_out5, hz_out6, hz_out7);
-res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
- hz_out6, hz_out7, hz_out8);
-LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-tmp0 = PCKEV_XORI128_UB(res0, res1);
-tmp1 = PCKEV_XORI128_UB(res2, res3);
-PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
-AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
-
-ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
-}
-
-static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
-   

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma avg hv mc msa functions

2017-10-27 Thread kaustubh.raste
From: Kaustubh Raste 

Replace generic with block size specific function.
Load the specific destination bytes instead of MSA load and pack.
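How a caller picks the specialized routines after this change (sketch; the wrapper itself is hypothetical, the two leaf signatures are the ones introduced in the diff below):

/* height is resolved at the call site, so the leaves lose the parameter */
static void chroma_hv_avg_2w(uint8_t *src, uint8_t *dst, int32_t stride,
                             uint32_t ch0, uint32_t ch1,
                             uint32_t cv0, uint32_t cv1, int32_t height)
{
    if (2 == height)
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride,
                                           ch0, ch1, cv0, cv1);
    else if (4 == height)
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride,
                                           ch0, ch1, cv0, cv1);
}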

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  438 +-
 1 file changed, 238 insertions(+), 200 deletions(-)

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index a5c3334..4c25761 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1408,15 +1408,15 @@ static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
 }
 }
 
-static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+   int32_t stride,
uint32_t coef_hor0,
uint32_t coef_hor1,
uint32_t coef_ver0,
uint32_t coef_ver1)
 {
 uint16_t out0, out1;
-v16u8 dst0, dst1;
+v16u8 dst0 = { 0 };
 v16u8 src0, src1, src2;
 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 v16i8 res, mask;
@@ -1428,8 +1428,11 @@ static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
 
 mask = LD_SB(&chroma_mask_arr[48]);
 
-LD_UB3(src, src_stride, src0, src1, src2);
-LD_UB2(dst, dst_stride, dst0, dst1);
+LD_UB3(src, stride, src0, src1, src2);
+out0 = LH(dst);
+out1 = LH(dst + stride);
+dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
+dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -1438,67 +1441,26 @@ static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 res_vt0 = __msa_sat_u_h(res_vt0, 7);
 res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
 dst0 = __msa_aver_u_b((v16u8) res, dst0);
 out0 = __msa_copy_u_h((v8i16) dst0, 0);
 out1 = __msa_copy_u_h((v8i16) dst0, 1);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+   int32_t stride,
uint32_t coef_hor0,
uint32_t coef_hor1,
uint32_t coef_ver0,
uint32_t coef_ver1)
 {
+uint16_t tp0, tp1, tp2, tp3;
 v16u8 src0, src1, src2, src3, src4;
 v16u8 tmp0, tmp1, tmp2, tmp3;
-v16u8 dst0, dst1, dst2, dst3;
-v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
-v16i8 res, mask;
-v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
-v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
-v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
-v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
-v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
-
-mask = LD_SB(&chroma_mask_arr[48]);
-
-LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
-VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
-ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-res_vt0 += res_vt1;
-res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-res_vt0 = __msa_sat_u_h(res_vt0, 7);
-res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-
-dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
-dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
-dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
-dst0 = __msa_aver_u_b((v16u8) res, dst0);
-
-ST2x4_UB(dst0, 0, dst, dst_stride);
-}
-
-static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
-   uint8_t *dst, int32_t dst_stride,
-   uint32_t coef_hor0,
-   uint32_t 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi hz and hv mc msa functions

2017-10-27 Thread kaustubh.raste
From: Kaustubh Raste 

Align the mask buffer.
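The alignment is the point here: at 0x40 (one 64-byte line), both 16-byte mask rows sit inside a single cache line and each LD_SB of a row is an aligned load that cannot straddle lines (assuming 64-byte lines); the declaration pattern, as a minimal sketch:

#include <stdint.h>

/* 64-byte alignment keeps the rows at offsets 0 and 16 in one cache line */
static const uint8_t example_masks[16 * 2] __attribute__((aligned(0x40))) = {
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};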

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_bi_msa.c |  940 --
 1 file changed, 595 insertions(+), 345 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
index ccc3f8a..9c03ef8 100644
--- a/libavcodec/mips/hevc_mc_bi_msa.c
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -22,6 +22,12 @@
 #include "libavcodec/mips/hevcdsp_mips.h"
 #include "libavcodec/mips/hevc_macros_msa.h"
 
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+/* 8 width cases */
+0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
 { \
 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);\
@@ -531,7 +537,7 @@ static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
 v8i16 dst0, dst1, dst2, dst3;
 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 v8i16 filter_vec, const_vec;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 
 src0_ptr -= 3;
 
@@ -557,26 +563,26 @@ static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 
-VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst0 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst0, dst0, dst0, dst0);
-VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst1 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst1, dst1, dst1, dst1);
-VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst2 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst2, dst2, dst2, dst2);
-VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst3 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst3, dst3, dst3, dst3);
+VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+ dst1, dst2, dst3);
+VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
+VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+ dst1, dst2, dst3);
+VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
+VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+ dst1, dst2, dst3);
+VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
+VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
+DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+ dst1, dst2, dst3);
 
 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
@@ -604,7 +610,7 @@ static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
 v8i16 dst0, dst1, dst2, dst3;
 v8i16 in0, in1, in2, in3;
 v8i16 filter_vec, const_vec;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
 
 src0_ptr -= 3;
 
@@ -625,26 +631,26 @@ static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
 src1_ptr += (4 * src2_stride);
 XORI_B4_128_SB(src0, src1, src2, src3);
 
-VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst0 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst0, dst0, dst0, dst0);
-VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst1 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst1, dst1, dst1, dst1);
-VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
-   vec0, vec1, vec2, vec3);
 dst2 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst2, dst2, dst2, dst2);
-

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni vt and hv mc msa functions

2017-10-27 Thread kaustubh.raste
From: Kaustubh Raste 

Remove unused macro.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uni_msa.c |  744 +
 1 file changed, 499 insertions(+), 245 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index 3a6c5b0..7d24858 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -292,20 +292,6 @@ static const uint8_t mc_filt_mask_arr[16 * 3] = {
 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
 };
 
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
-filt0, filt1, filt2, filt3) \
-( { \
-v8i16 tmp0, tmp1;   \
-\
-tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
-tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
-tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
-tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
-tmp0 = __msa_adds_s_h(tmp0, tmp1);  \
-\
-tmp0;   \
-} )
-
 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)   \
 ( { \
 v8i16 tmp0; \
@@ -944,12 +930,14 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
 const int8_t *filter, int32_t height)
 {
 uint32_t loop_cnt;
+v16u8 out0, out1;
 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+v16i8 src11, src12, src13, src14;
 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src1210, src14131312;
 v16i8 src10998, filt0, filt1, filt2, filt3;
-v16u8 out;
-v8i16 filt, out10, out32;
+v8i16 filt, out10, out32, out54, out76;
 
 src -= (3 * src_stride);
 
@@ -966,28 +954,45 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
src4332, src6554);
 XORI_B3_128_SB(src2110, src4332, src6554);
 
-for (loop_cnt = (height >> 2); loop_cnt--;) {
+for (loop_cnt = (height >> 3); loop_cnt--;) {
 LD_SB4(src, src_stride, src7, src8, src9, src10);
 src += (4 * src_stride);
+LD_SB4(src, src_stride, src11, src12, src13, src14);
+src += (4 * src_stride);
 
 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
+ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+   src1110_r, src1211_r, src1312_r, src1413_r);
 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
+   src1210, src14131312);
 XORI_B2_128_SB(src8776, src10998);
-out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
-filt1, filt2, filt3);
-out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
-filt1, filt2, filt3);
+XORI_B2_128_SB(src1210, src14131312);
+
+DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
+DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
+DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
+DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
+DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
+DPADD_SB2_SH(src10998, src1210, filt2, filt2, out54, out76);
+DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
+DPADD_SB2_SH(src1210, src14131312, filt3, filt3, out54, out76);
 SRARI_H2_SH(out10, out32, 6);
+SRARI_H2_SH(out54, out76, 6);
 SAT_SH2_SH(out10, out32, 7);
-out = PCKEV_XORI128_UB(out10, out32);
-ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+SAT_SH2_SH(out54, out76, 7);
+out0 = PCKEV_XORI128_UB(out10, out32);
+out1 = PCKEV_XORI128_UB(out54, out76);
+ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+dst += (4 * dst_stride);
+ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
 dst += (4 * dst_stride);
 
-src2110 = src6554;
-src4332 = src8776;
-src6554 = src10998;
-src6 = src10;
+src2110 = src10998;
+src4332 = src1210;
+src6554 = src14131312;
+src6 = src14;
 }
 }
 
@@