Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions
On Fri, Nov 03, 2017 at 12:22:50PM +0000, Manojkumar Bhosale wrote: > LGTM applied [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Avoid a single point of failure, be that a person or equipment. signature.asc Description: Digital signature _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions
LGTM -----Original Message----- From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of kaustubh.ra...@imgtec.com Sent: Friday, November 3, 2017 11:59 AM To: ffmpeg-devel@ffmpeg.org Cc: Kaustubh Raste Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions From: Kaustubh Raste <kaustubh.ra...@imgtec.com> Use global mask buffer for appropriate mask load. Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com> --- libavcodec/mips/hevc_mc_biw_msa.c | 587 - 1 file changed, 247 insertions(+), 340 deletions(-) diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c index 75c1c7a..0e5f8a0 100644 --- a/libavcodec/mips/hevc_mc_biw_msa.c +++ b/libavcodec/mips/hevc_mc_biw_msa.c @@ -2633,22 +2633,21 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, uint8_t *dst, int32_t dst_stride, const int8_t *filter, - int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val) { -int32_t offset, weight; +int32_t offset, weight, constant; v8i16 filt0, filt1; v16i8 src0, src1; v8i16 in0, in1; -v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; +v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); v16i8 mask1, vec0, vec1; v8i16 dst0; v4i32 dst0_r, dst0_l; -v8i16 filter_vec, const_vec; +v8i16 out0, filter_vec; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 1; @@ -2661,9 +2660,10 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); +constant = 128 * weight1; +constant <<= 6; +offset += constant; -const_vec = __msa_ldi_h(128); -const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -2674,18 +2674,16 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, XORI_B2_128_SB(src0, src1); VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); -dst0 = const_vec; 
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); +dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); -dst0_r = CLIP_SW_0_255(dst0_r); -dst0_l = CLIP_SW_0_255(dst0_l); - -HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); -ST4x2_UB(dst0_r, dst, dst_stride); +dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); +out0 = CLIP_SH_0_255(dst0_r); +out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); +ST4x2_UB(out0, dst, dst_stride); } static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, @@ -2695,22 +2693,21 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, uint8_t *dst, int32_t dst_stride, const int8_t *filter, - int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val) { -int32_t offset, weight; +int32_t offset, weight, constant; v8i16 filt0, filt1; v16i8 src0, src1, src2, src3; -v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; +v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); v16i8 mask1; v8i16 dst0, dst1; v16i8 vec0, vec1; v8i16 in0, in1, in2, in3; -v8i16 filter_vec, const_vec; +v8i16 filter_vec; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 1; @@ -2724,9 +2721,10 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); +constant = 128 * weight1; +constant <<= 6; +offset += constant; -const_vec = __msa_ldi_h(128); -const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -2737,11 +2735,9 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, ILVR_D2_SH(in1
[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions
From: Kaustubh Raste <kaustubh.ra...@imgtec.com> Use global mask buffer for appropriate mask load. Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com> --- libavcodec/mips/hevc_mc_biw_msa.c | 587 - 1 file changed, 247 insertions(+), 340 deletions(-) diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c index 75c1c7a..0e5f8a0 100644 --- a/libavcodec/mips/hevc_mc_biw_msa.c +++ b/libavcodec/mips/hevc_mc_biw_msa.c @@ -2633,22 +2633,21 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, uint8_t *dst, int32_t dst_stride, const int8_t *filter, - int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val) { -int32_t offset, weight; +int32_t offset, weight, constant; v8i16 filt0, filt1; v16i8 src0, src1; v8i16 in0, in1; -v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; +v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); v16i8 mask1, vec0, vec1; v8i16 dst0; v4i32 dst0_r, dst0_l; -v8i16 filter_vec, const_vec; +v8i16 out0, filter_vec; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 1; @@ -2661,9 +2660,10 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); +constant = 128 * weight1; +constant <<= 6; +offset += constant; -const_vec = __msa_ldi_h(128); -const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -2674,18 +2674,16 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, XORI_B2_128_SB(src0, src1); VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); -dst0 = const_vec; -DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); +dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); -dst0_r = CLIP_SW_0_255(dst0_r); -dst0_l = 
CLIP_SW_0_255(dst0_l); - -HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); -ST4x2_UB(dst0_r, dst, dst_stride); +dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); +out0 = CLIP_SH_0_255(dst0_r); +out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); +ST4x2_UB(out0, dst, dst_stride); } static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, @@ -2695,22 +2693,21 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, uint8_t *dst, int32_t dst_stride, const int8_t *filter, - int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val) { -int32_t offset, weight; +int32_t offset, weight, constant; v8i16 filt0, filt1; v16i8 src0, src1, src2, src3; -v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; +v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); v16i8 mask1; v8i16 dst0, dst1; v16i8 vec0, vec1; v8i16 in0, in1, in2, in3; -v8i16 filter_vec, const_vec; +v8i16 filter_vec; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 1; @@ -2724,9 +2721,10 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); +constant = 128 * weight1; +constant <<= 6; +offset += constant; -const_vec = __msa_ldi_h(128); -const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -2737,11 +2735,9 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, ILVR_D2_SH(in1, in0, in3, in2, in0, in1); VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); -dst0 = const_vec; -DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); +dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); -dst1 = const_vec; -DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); +dst1 =