Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma copy and avg vert mc msa functions

2017-10-25 Thread Michael Niedermayer
On Tue, Oct 24, 2017 at 11:21:32AM +, Manojkumar Bhosale wrote:
> LGTM

will apply

thanks

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No great genius has ever existed without some touch of madness. -- Aristotle




Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma copy and avg vert mc msa functions

2017-10-24 Thread Manojkumar Bhosale
LGTM

-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of kaustubh.ra...@imgtec.com
Sent: Tuesday, October 24, 2017 12:41 PM
To: ffmpeg-devel@ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma copy and avg vert mc msa functions

From: Kaustubh Raste <kaustubh.ra...@imgtec.com>

Replace generic with block size specific function.
Load the specific destination bytes instead of MSA load and pack.

Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com>
---
 libavcodec/mips/h264chroma_msa.c |  627 +-
 1 file changed, 275 insertions(+), 352 deletions(-)

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 2a54675..a5c3334 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1124,24 +1124,25 @@ static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t 
*src, uint8_t *dst,
 }
 }
 
-static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t 
src_stride,
-   uint8_t *dst, int32_t 
dst_stride,
-   uint32_t coeff0, uint32_t 
coeff1)
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+   int32_t stride, uint32_t coeff0,
+   uint32_t coeff1)
 {
 uint16_t out0, out1;
-uint32_t load0, load1;
 v16i8 src0, src1, src2, tmp0, tmp1, res;
 v16u8 dst_data = { 0 };
+v8i16 out;
 v8u16 res_r;
 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
-LD_SB3(src, src_stride, src0, src1, src2);
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
+LD_SB3(src, stride, src0, src1, src2);
+out0 = LH(dst);
+out1 = LH(dst + stride);
 
-INSERT_W2_UB(load0, load1, dst_data);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
 
 ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
 
@@ -1151,20 +1152,20 @@ static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t 
*src, int32_t src_stride,
 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 res_r = __msa_sat_u_h(res_r, 7);
 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-dst_data = __msa_aver_u_b((v16u8) res, dst_data);
-out0 = __msa_copy_u_h((v8i16) dst_data, 0);
-out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+out0 = __msa_copy_u_h(out, 0);
+out1 = __msa_copy_u_h(out, 2);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t 
src_stride,
-   uint8_t *dst, int32_t 
dst_stride,
-   uint32_t coeff0, uint32_t 
coeff1)
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+   int32_t stride, uint32_t coeff0,
+   uint32_t coeff1)
 {
-uint32_t load0, load1;
+uint16_t tp0, tp1, tp2, tp3;
 v16i8 src0, src1, src2, src3, src4;
 v16u8 tmp0, tmp1, tmp2, tmp3;
 v8u16 res_r;
@@ -1174,19 +1175,16 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t 
*src, int32_t src_stride,
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 v16u8 dst_data = { 0 };
 
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
-
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+LD_SB5(src, stride, src0, src1, src2, src3, src4);
 
-load0 = LW(dst + 2 * dst_stride);
-load1 = LW(dst + 3 * dst_stride);
-
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+tp0 = LH(dst);
+tp1 = LH(dst + stride);
+tp2 = LH(dst + 2 * stride);
+tp3 = LH(dst + 3 * stride);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
 
 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3); @@ -1202,102 +1200,26 @@ static void 
avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
-ST2x4_UB(res, 0, dst, dst_stride);
- 

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma copy and avg vert mc msa functions

2017-10-24 Thread kaustubh.raste
From: Kaustubh Raste 

Replace generic with block size specific function.
Load the specific destination bytes instead of MSA load and pack.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  627 +-
 1 file changed, 275 insertions(+), 352 deletions(-)
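
For context, here is a minimal scalar sketch of what the 2x2 avg-vert path computes, assuming coeff0/coeff1 are the two vertical taps (coeff0 applied to the upper source row) and that they sum to 64 as in the generic C chroma code; the helper name is made up for illustration and clipping is omitted, since under that assumption the weighted sum stays within 0..255. It also shows why only two destination bytes per row are ever read back, which is what the switch from LW (32-bit) to LH (16-bit) destination loads in the diff exploits.

#include <stdint.h>

/* Scalar sketch (not the MSA code) of a 2x2 vertical chroma MC with
 * averaging: weight two vertically adjacent source rows, round with
 * (+32 >> 6), then do a rounding average with the existing dst pixels.
 * Only dst[0..1] of each row is read and written, i.e. 2 bytes per row. */
static void avg_chroma_vt_2x2_scalar(const uint8_t *src, uint8_t *dst,
                                     int stride, uint32_t coeff0,
                                     uint32_t coeff1)
{
    for (int y = 0; y < 2; y++) {
        for (int x = 0; x < 2; x++) {
            /* assumed: coeff0 weights the upper row, coeff1 the lower one,
             * and coeff0 + coeff1 == 64 */
            uint32_t t = (coeff0 * src[x] + coeff1 * src[x + stride] + 32) >> 6;
            dst[x] = (uint8_t)((dst[x] + t + 1) >> 1);
        }
        src += stride;
        dst += stride;
    }
}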

diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 2a54675..a5c3334 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1124,24 +1124,25 @@ static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
 }
 }
 
-static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
-                                               uint32_t coeff0, uint32_t coeff1)
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
 {
 uint16_t out0, out1;
-uint32_t load0, load1;
 v16i8 src0, src1, src2, tmp0, tmp1, res;
 v16u8 dst_data = { 0 };
+v8i16 out;
 v8u16 res_r;
 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
-LD_SB3(src, src_stride, src0, src1, src2);
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
+LD_SB3(src, stride, src0, src1, src2);
+out0 = LH(dst);
+out1 = LH(dst + stride);
 
-INSERT_W2_UB(load0, load1, dst_data);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
 
 ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
 
@@ -1151,20 +1152,20 @@ static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 res_r = __msa_sat_u_h(res_r, 7);
 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
-dst_data = __msa_aver_u_b((v16u8) res, dst_data);
-out0 = __msa_copy_u_h((v8i16) dst_data, 0);
-out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+out0 = __msa_copy_u_h(out, 0);
+out1 = __msa_copy_u_h(out, 2);
 
 SH(out0, dst);
-dst += dst_stride;
+dst += stride;
 SH(out1, dst);
 }
 
-static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
-                                               uint32_t coeff0, uint32_t coeff1)
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
 {
-uint32_t load0, load1;
+uint16_t tp0, tp1, tp2, tp3;
 v16i8 src0, src1, src2, src3, src4;
 v16u8 tmp0, tmp1, tmp2, tmp3;
 v8u16 res_r;
@@ -1174,19 +1175,16 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 v16u8 dst_data = { 0 };
 
-LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-load0 = LW(dst);
-load1 = LW(dst + dst_stride);
-
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+LD_SB5(src, stride, src0, src1, src2, src3, src4);
 
-load0 = LW(dst + 2 * dst_stride);
-load1 = LW(dst + 3 * dst_stride);
-
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
-dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+tp0 = LH(dst);
+tp1 = LH(dst + stride);
+tp2 = LH(dst + 2 * stride);
+tp3 = LH(dst + 3 * stride);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
+dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
 
 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                tmp0, tmp1, tmp2, tmp3);
@@ -1202,102 +1200,26 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
-ST2x4_UB(res, 0, dst, dst_stride);
-dst += (4 * dst_stride);
-}
-
-static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
-                                               uint32_t coeff0, uint32_t coeff1)
-{
-uint32_t load0, load1, load2, load3;
-v16i8 src0,