Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions
On Sat, Nov 04, 2017 at 01:07:41AM +0000, Manojkumar Bhosale wrote: > LGTM will apply thanks [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Does the universe only have a finite lifespan? No, it's going to go on forever, it's just that you won't like living in it. -- Hiranya Peiri signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions
LGTM From: ffmpeg-devel [ffmpeg-devel-boun...@ffmpeg.org] on behalf of kaustubh.ra...@imgtec.com [kaustubh.ra...@imgtec.com] Sent: Friday, November 03, 2017 12:29 PM To: ffmpeg-devel@ffmpeg.org Cc: Kaustubh Raste Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions From: Kaustubh Raste <kaustubh.ra...@imgtec.com> Use global mask buffer for appropriate mask load. Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com> --- libavcodec/mips/hevc_mc_uni_msa.c | 509 - 1 file changed, 274 insertions(+), 235 deletions(-) diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c index 7d24858..993dad0 100644 --- a/libavcodec/mips/hevc_mc_uni_msa.c +++ b/libavcodec/mips/hevc_mc_uni_msa.c @@ -1947,7 +1947,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, res0; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -1959,7 +1959,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, LD_SB2(src, src_stride, src0, src1); XORI_B2_128_SB(src0, src1); VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); -res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1); +res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); res0 = __msa_srari_h(res0, 6); res0 = __msa_sat_s_h(res0, 7); out = PCKEV_XORI128_UB(res0, res0); @@ -1974,7 +1974,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, v8i16 filt, out0, out1; v16u8 out; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -2001,7 +2001,7 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, out0, out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -2038,7 +2038,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, out0, 
out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -2098,12 +2098,11 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height) { -uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; v16u8 out4, out5; v8i16 filt, out0, out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[0]); +mask0 = LD_SB(_hevc_mask_arr[0]); src -= 1; /* rearranging filter */ @@ -2112,21 +2111,31 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, mask1 = mask0 + 2; -for (loop_cnt = (height >> 2); loop_cnt--;) { -LD_SB4(src, src_stride, src0, src1, src2, src3); -src += (4 * src_stride); +LD_SB4(src, src_stride, src0, src1, src2, src3); +src += (4 * src_stride); -XORI_B4_128_SB(src0, src1, src2, src3); -HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, - filt1, out0, out1, out2, out3); -SRARI_H4_SH(out0, out1, out2, out3, 6); -SAT_SH4_SH(out0, out1, out2, out3, 7); +XORI_B4_128_SB(src0, src1, src2, src3); +HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); +SRARI_H4_SH(out0, out1, out2, out3, 6); +SAT_SH4_SH(out0, out1, out2, out3, 7); +out4 = PCKEV_XORI128_UB(out0, out1); +out5 = PCKEV_XORI128_UB(out2, out3); +ST6x4_UB(out4, out5, dst, dst_stride); +dst += (4 * dst_stride); -out4 = PCKEV_XORI128_UB(out0, out1); -out5 = PCKEV_XORI128_UB(out2, out3); -ST6x4_UB(out4, out5, dst, dst_stride); -dst += (4 * dst_stride); -} +LD_SB4(src, src_stride, src0, src1, src2, src3); +src += (4 * src_stride); + +XORI_B4_128_SB(src0, src1, src2, src3); +HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); +SRARI_H4_SH(out0, out1, out2, out3, 6); +SAT_SH4_SH(out0, out1, out2, out3, 7); +out4 = PCKEV_XORI128_UB(out0, out1); +out5 = PCKEV_XORI128_UB(out2, out3); +ST6x4_UB(out4, out5, dst, 
dst_stride); +dst += (4 * dst_stride); } static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, @@ -2138,7 +2147,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, vec0, vec1, vec2, vec3; -mask0 = LD_SB(_filt_mask_arr[0]); +mask0 = LD_SB(_hevc_mask_arr[0]); src -= 1;
[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions
From: Kaustubh Raste <kaustubh.ra...@imgtec.com> Use global mask buffer for appropriate mask load. Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com> --- libavcodec/mips/hevc_mc_uni_msa.c | 509 - 1 file changed, 274 insertions(+), 235 deletions(-) diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c index 7d24858..993dad0 100644 --- a/libavcodec/mips/hevc_mc_uni_msa.c +++ b/libavcodec/mips/hevc_mc_uni_msa.c @@ -1947,7 +1947,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, res0; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -1959,7 +1959,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, LD_SB2(src, src_stride, src0, src1); XORI_B2_128_SB(src0, src1); VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); -res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1); +res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); res0 = __msa_srari_h(res0, 6); res0 = __msa_sat_s_h(res0, 7); out = PCKEV_XORI128_UB(res0, res0); @@ -1974,7 +1974,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, v8i16 filt, out0, out1; v16u8 out; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -2001,7 +2001,7 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, out0, out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -2038,7 +2038,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, out0, out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[16]); +mask0 = LD_SB(_hevc_mask_arr[16]); src -= 1; /* rearranging filter */ @@ -2098,12 +2098,11 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height) { -uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 
v16u8 out4, out5; v8i16 filt, out0, out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[0]); +mask0 = LD_SB(_hevc_mask_arr[0]); src -= 1; /* rearranging filter */ @@ -2112,21 +2111,31 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, mask1 = mask0 + 2; -for (loop_cnt = (height >> 2); loop_cnt--;) { -LD_SB4(src, src_stride, src0, src1, src2, src3); -src += (4 * src_stride); +LD_SB4(src, src_stride, src0, src1, src2, src3); +src += (4 * src_stride); -XORI_B4_128_SB(src0, src1, src2, src3); -HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, - filt1, out0, out1, out2, out3); -SRARI_H4_SH(out0, out1, out2, out3, 6); -SAT_SH4_SH(out0, out1, out2, out3, 7); +XORI_B4_128_SB(src0, src1, src2, src3); +HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); +SRARI_H4_SH(out0, out1, out2, out3, 6); +SAT_SH4_SH(out0, out1, out2, out3, 7); +out4 = PCKEV_XORI128_UB(out0, out1); +out5 = PCKEV_XORI128_UB(out2, out3); +ST6x4_UB(out4, out5, dst, dst_stride); +dst += (4 * dst_stride); -out4 = PCKEV_XORI128_UB(out0, out1); -out5 = PCKEV_XORI128_UB(out2, out3); -ST6x4_UB(out4, out5, dst, dst_stride); -dst += (4 * dst_stride); -} +LD_SB4(src, src_stride, src0, src1, src2, src3); +src += (4 * src_stride); + +XORI_B4_128_SB(src0, src1, src2, src3); +HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); +SRARI_H4_SH(out0, out1, out2, out3, 6); +SAT_SH4_SH(out0, out1, out2, out3, 7); +out4 = PCKEV_XORI128_UB(out0, out1); +out5 = PCKEV_XORI128_UB(out2, out3); +ST6x4_UB(out4, out5, dst, dst_stride); +dst += (4 * dst_stride); } static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, @@ -2138,7 +2147,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, v16u8 out; v8i16 filt, vec0, vec1, vec2, vec3; -mask0 = LD_SB(_filt_mask_arr[0]); +mask0 = LD_SB(_hevc_mask_arr[0]); src -= 1; filt = LD_SH(filter); @@ -2172,7 +2181,7 @@ 
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, v16u8 tmp0, tmp1; v8i16 filt, out0, out1, out2, out3; -mask0 = LD_SB(_filt_mask_arr[0]); +mask0 = LD_SB(_hevc_mask_arr[0]); src -= 1; /* rearranging filter */ @@ -2221,8 +2230,8 @@ static void