Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 11, 31, 13 and 33 msa functions
On Tue, Oct 24, 2017 at 11:21:21AM +0000, Manojkumar Bhosale wrote: > LGTM will apply thanks [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Rewriting code that is poorly written but fully understood is good. Rewriting code that one doesn't understand is a sign that one is less smart than the original author, trying to rewrite it will not make it better. signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 11, 31, 13 and 33 msa functions
LGTM -Original Message- From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of kaustubh.ra...@imgtec.com Sent: Tuesday, October 24, 2017 12:39 PM To: ffmpeg-devel@ffmpeg.org Cc: Kaustubh Raste Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 11, 31, 13 and 33 msa functions From: Kaustubh Raste <kaustubh.ra...@imgtec.com> Remove loops and unroll as block sizes are known. Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com> --- libavcodec/mips/h264qpel_msa.c | 400 1 file changed, 240 insertions(+), 160 deletions(-) diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c index f11fce8..fcccb98 100644 --- a/libavcodec/mips/h264qpel_msa.c +++ b/libavcodec/mips/h264qpel_msa.c @@ -171,23 +171,27 @@ static const uint8_t luma_mask_arr[16 * 8] = { out0_m; \ } ) -static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int32_t height) +static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, +uint8_t *dst, int32_t stride) { -uint32_t loop_cnt; -v16i8 src_hz0, src_hz1, src_hz2, src_hz3; -v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4; -v16i8 src_vt5, src_vt6, src_vt7, src_vt8; -v16i8 mask0, mask1, mask2; -v8i16 hz_out0, hz_out1, vert_out0, vert_out1; -v8i16 out0, out1; +const int16_t filt_const0 = 0xfb01; +const int16_t filt_const1 = 0x1414; +const int16_t filt_const2 = 0x1fb; v16u8 out; +v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8; +v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; +v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r; +v16i8 mask0, mask1, mask2, filt0, filt1, filt2; +v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1; + +filt0 = (v16i8) __msa_fill_h(filt_const0); +filt1 = (v16i8) __msa_fill_h(filt_const1); +filt2 = (v16i8) __msa_fill_h(filt_const2); LD_SB3(_mask_arr[48], 16, mask0, mask1, mask2); -LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, 
src_vt4); -src_y += (5 * src_stride); +LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); +src_y += (5 * stride); src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); @@ -196,149 +200,237 @@ static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y, XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); -for (loop_cnt = (height >> 2); loop_cnt--;) { -LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3); -src_x += (4 * src_stride); - -XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); - -hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, - src_hz1, mask0, - mask1, mask2); -hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, - src_hz3, mask0, - mask1, mask2); - -SRARI_H2_SH(hz_out0, hz_out1, 5); -SAT_SH2_SH(hz_out0, hz_out1, 7); - -LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8); -src_y += (4 * src_stride); - -src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); -src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); -src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); -src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); - -XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); +LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); +XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); +hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2); +hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, + mask2); -/* filter calc */ -vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, - src_vt2, src_vt3, - src_vt4, src_vt5); -vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, - src_vt4, src_vt5, - src_vt6, src_vt7); +SRARI_H2_SH(hz_out0, hz_out1, 5); +SAT_SH2_SH(hz_out0, hz_out1, 7); -SRARI_H2_SH(vert_out0, vert_out1, 5); -SAT_SH2_SH(vert_out0, vert_out1, 7); +LD_SB4(src_y, stride
[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc put mc 11, 31, 13 and 33 msa functions
From: Kaustubh RasteRemove loops and unroll as block sizes are known. Signed-off-by: Kaustubh Raste --- libavcodec/mips/h264qpel_msa.c | 400 1 file changed, 240 insertions(+), 160 deletions(-) diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c index f11fce8..fcccb98 100644 --- a/libavcodec/mips/h264qpel_msa.c +++ b/libavcodec/mips/h264qpel_msa.c @@ -171,23 +171,27 @@ static const uint8_t luma_mask_arr[16 * 8] = { out0_m; \ } ) -static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int32_t height) +static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, +uint8_t *dst, int32_t stride) { -uint32_t loop_cnt; -v16i8 src_hz0, src_hz1, src_hz2, src_hz3; -v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4; -v16i8 src_vt5, src_vt6, src_vt7, src_vt8; -v16i8 mask0, mask1, mask2; -v8i16 hz_out0, hz_out1, vert_out0, vert_out1; -v8i16 out0, out1; +const int16_t filt_const0 = 0xfb01; +const int16_t filt_const1 = 0x1414; +const int16_t filt_const2 = 0x1fb; v16u8 out; +v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8; +v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; +v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r; +v16i8 mask0, mask1, mask2, filt0, filt1, filt2; +v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1; + +filt0 = (v16i8) __msa_fill_h(filt_const0); +filt1 = (v16i8) __msa_fill_h(filt_const1); +filt2 = (v16i8) __msa_fill_h(filt_const2); LD_SB3(_mask_arr[48], 16, mask0, mask1, mask2); -LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); -src_y += (5 * src_stride); +LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); +src_y += (5 * stride); src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); @@ -196,149 +200,237 @@ static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t 
*src_y, XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); -for (loop_cnt = (height >> 2); loop_cnt--;) { -LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3); -src_x += (4 * src_stride); - -XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); - -hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, - src_hz1, mask0, - mask1, mask2); -hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, - src_hz3, mask0, - mask1, mask2); - -SRARI_H2_SH(hz_out0, hz_out1, 5); -SAT_SH2_SH(hz_out0, hz_out1, 7); - -LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8); -src_y += (4 * src_stride); - -src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); -src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); -src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); -src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); - -XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); +LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); +XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); +hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2); +hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2); -/* filter calc */ -vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, - src_vt2, src_vt3, - src_vt4, src_vt5); -vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, - src_vt4, src_vt5, - src_vt6, src_vt7); +SRARI_H2_SH(hz_out0, hz_out1, 5); +SAT_SH2_SH(hz_out0, hz_out1, 7); -SRARI_H2_SH(vert_out0, vert_out1, 5); -SAT_SH2_SH(vert_out0, vert_out1, 7); +LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); -out0 = __msa_srari_h((hz_out0 + vert_out0), 1); -out1 = __msa_srari_h((hz_out1 + vert_out1), 1); +src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); +src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); +src_vt6 = (v16i8)