Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma avg hv mc msa functions

2017-10-31 Thread Michael Niedermayer
On Mon, Oct 30, 2017 at 11:36:41AM +, Manojkumar Bhosale wrote:
> LGTM

applied

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If a bugfix only changes things apparently unrelated to the bug with no
further explanation, that is a good sign that the bugfix is wrong.




Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma avg hv mc msa functions

2017-10-30 Thread Manojkumar Bhosale
LGTM

-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of kaustubh.ra...@imgtec.com
Sent: Friday, October 27, 2017 5:03 PM
To: ffmpeg-devel@ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma avg hv mc msa functions

From: Kaustubh Raste <kaustubh.ra...@imgtec.com>

Replace the generic function with block-size-specific functions.
Load only the destination bytes that are needed instead of doing a full MSA load and pack.

Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com>
[...]

[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc chroma avg hv mc msa functions

2017-10-27 Thread kaustubh.raste
From: Kaustubh Raste 

Replace the generic function with block-size-specific functions.
Load only the destination bytes that are needed instead of doing a full MSA load and pack.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/h264chroma_msa.c |  438 +-
 1 file changed, 238 insertions(+), 200 deletions(-)

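For readers skimming the diff below, here is the gist of the destination-load change as a minimal standalone sketch. This is not part of the patch: the helper names load_dst_2x2_old/load_dst_2x2_new are hypothetical, and it assumes an MSA-enabled MIPS toolchain plus the LH() and LD_UB2() macros from libavutil/mips/generic_macros_msa.h.

#include <stdint.h>
#include "libavutil/mips/generic_macros_msa.h"

/* Old pattern: two full 16-byte vector loads, then packing element 0 of
 * dst1 into element 1 of dst0.  For a 2x2 block, 28 of the 32 loaded
 * bytes are thrown away. */
static inline v16u8 load_dst_2x2_old(uint8_t *dst, int32_t stride)
{
    v16u8 dst0, dst1;

    LD_UB2(dst, stride, dst0, dst1);
    return (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
}

/* New pattern: load exactly the two 16-bit pairs the block needs with
 * scalar LH() and insert them from GPRs: no wide loads, no pack. */
static inline v16u8 load_dst_2x2_new(uint8_t *dst, int32_t stride)
{
    uint16_t tp0 = LH(dst);
    uint16_t tp1 = LH(dst + stride);
    v16u8 dst0 = { 0 };

    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
    return dst0;
}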
diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index a5c3334..4c25761 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1408,15 +1408,15 @@ static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
     }
 }
 
-static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
                                                uint32_t coef_hor0,
                                                uint32_t coef_hor1,
                                                uint32_t coef_ver0,
                                                uint32_t coef_ver1)
 {
     uint16_t out0, out1;
-    v16u8 dst0, dst1;
+    v16u8 dst0 = { 0 };
     v16u8 src0, src1, src2;
     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
     v16i8 res, mask;
@@ -1428,8 +1428,11 @@ static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[48]);
 
-    LD_UB3(src, src_stride, src0, src1, src2);
-    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB3(src, stride, src0, src1, src2);
+    out0 = LH(dst);
+    out1 = LH(dst + stride);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -1438,67 +1441,26 @@ static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
     res_vt0 = __msa_sat_u_h(res_vt0, 7);
     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
     dst0 = __msa_aver_u_b((v16u8) res, dst0);
     out0 = __msa_copy_u_h((v8i16) dst0, 0);
     out1 = __msa_copy_u_h((v8i16) dst0, 1);
 
     SH(out0, dst);
-    dst += dst_stride;
+    dst += stride;
     SH(out1, dst);
 }
 
-static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
                                                uint32_t coef_hor0,
                                                uint32_t coef_hor1,
                                                uint32_t coef_ver0,
                                                uint32_t coef_ver1)
 {
+    uint16_t tp0, tp1, tp2, tp3;
     v16u8 src0, src1, src2, src3, src4;
     v16u8 tmp0, tmp1, tmp2, tmp3;
-    v16u8 dst0, dst1, dst2, dst3;
-    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
-    v16i8 res, mask;
-    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
-    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
-    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
-    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
-    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
-
-    mask = LD_SB(&chroma_mask_arr[48]);
-
-    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
-    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
-    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-    res_vt0 += res_vt1;
-    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-    res_vt0 = __msa_sat_u_h(res_vt0, 7);
-    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
-    dst0 = __msa_aver_u_b((v16u8) res, dst0);
-
-    ST2x4_UB(dst0, 0, dst, dst_stride);
-}
-
-static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
-                                               uint32_t coef_hor0,
-                                               uint32_t
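
A note on the "block size specific" half of the commit message: the public h264_chroma_mc_func prototype in libavcodec/h264chroma.h passes one stride that is used for both source and destination, which is also why the separate src_stride/dst_stride arguments above collapse into a single stride. Below is a sketch of the per-size dispatch shape used in h264chroma_msa.c; the heights shown are illustrative and the body is paraphrased, not quoted from the (truncated) patch.

/* Width-2 dispatcher: pick a fixed-size kernel per height, so each kernel
 * can drop its row loop and be scheduled for exactly one block shape.
 * Argument order follows the patch's new (src, dst, stride, ...) style. */
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}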