Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions

2017-11-04 Thread Michael Niedermayer
On Sat, Nov 04, 2017 at 01:07:41AM +0000, Manojkumar Bhosale wrote:
> LGTM

will apply

thanks

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Does the universe only have a finite lifespan? No, it's going to go on
forever, it's just that you won't like living in it. -- Hiranya Peiri


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions

2017-11-03 Thread Manojkumar Bhosale
LGTM

From: ffmpeg-devel [ffmpeg-devel-boun...@ffmpeg.org] on behalf of 
kaustubh.ra...@imgtec.com [kaustubh.ra...@imgtec.com]
Sent: Friday, November 03, 2017 12:29 PM
To: ffmpeg-devel@ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and 
vt mc msa functions

From: Kaustubh Raste <kaustubh.ra...@imgtec.com>

Use global mask buffer for appropriate mask load.

Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com>
---
 libavcodec/mips/hevc_mc_uni_msa.c |  509 -
 1 file changed, 274 insertions(+), 235 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index 7d24858..993dad0 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -1947,7 +1947,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t 
src_stride,
 v16u8 out;
 v8i16 filt, res0;

-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;

 /* rearranging filter */
@@ -1959,7 +1959,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t 
src_stride,
 LD_SB2(src, src_stride, src0, src1);
 XORI_B2_128_SB(src0, src1);
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 res0 = __msa_srari_h(res0, 6);
 res0 = __msa_sat_s_h(res0, 7);
 out = PCKEV_XORI128_UB(res0, res0);
@@ -1974,7 +1974,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t 
src_stride,
 v8i16 filt, out0, out1;
 v16u8 out;

-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;

 /* rearranging filter */
@@ -2001,7 +2001,7 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t 
src_stride,
 v16u8 out;
 v8i16 filt, out0, out1, out2, out3;

-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;

 /* rearranging filter */
@@ -2038,7 +2038,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t 
src_stride,
 v16u8 out;
 v8i16 filt, out0, out1, out2, out3;

-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;

 /* rearranging filter */
@@ -2098,12 +2098,11 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t 
src_stride,
 uint8_t *dst, int32_t dst_stride,
 const int8_t *filter, int32_t height)
 {
-uint32_t loop_cnt;
 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
 v16u8 out4, out5;
 v8i16 filt, out0, out1, out2, out3;

-mask0 = LD_SB(_filt_mask_arr[0]);
+mask0 = LD_SB(_hevc_mask_arr[0]);
 src -= 1;

 /* rearranging filter */
@@ -2112,21 +2111,31 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t 
src_stride,

 mask1 = mask0 + 2;

-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+src += (4 * src_stride);

-XORI_B4_128_SB(src0, src1, src2, src3);
-HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
-   filt1, out0, out1, out2, out3);
-SRARI_H4_SH(out0, out1, out2, out3, 6);
-SAT_SH4_SH(out0, out1, out2, out3, 7);
+XORI_B4_128_SB(src0, src1, src2, src3);
+HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+   filt1, out0, out1, out2, out3);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
+SAT_SH4_SH(out0, out1, out2, out3, 7);
+out4 = PCKEV_XORI128_UB(out0, out1);
+out5 = PCKEV_XORI128_UB(out2, out3);
+ST6x4_UB(out4, out5, dst, dst_stride);
+dst += (4 * dst_stride);

-out4 = PCKEV_XORI128_UB(out0, out1);
-out5 = PCKEV_XORI128_UB(out2, out3);
-ST6x4_UB(out4, out5, dst, dst_stride);
-dst += (4 * dst_stride);
-}
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+src += (4 * src_stride);
+
+XORI_B4_128_SB(src0, src1, src2, src3);
+HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+   filt1, out0, out1, out2, out3);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
+SAT_SH4_SH(out0, out1, out2, out3, 7);
+out4 = PCKEV_XORI128_UB(out0, out1);
+out5 = PCKEV_XORI128_UB(out2, out3);
+ST6x4_UB(out4, out5, dst, dst_stride);
+dst += (4 * dst_stride);
 }

 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
@@ -2138,7 +2147,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, 
int32_t src_stride,
 v16u8 out;
 v8i16 filt, vec0, vec1, vec2, vec3;

-mask0 = LD_SB(_filt_mask_arr[0]);
+mask0 = LD_SB(_hevc_mask_arr[0]);
 src -= 1;

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni 4 tap hz and vt mc msa functions

2017-11-03 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_uni_msa.c |  509 -
 1 file changed, 274 insertions(+), 235 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
index 7d24858..993dad0 100644
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -1947,7 +1947,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t 
src_stride,
 v16u8 out;
 v8i16 filt, res0;
 
-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -1959,7 +1959,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t 
src_stride,
 LD_SB2(src, src_stride, src0, src1);
 XORI_B2_128_SB(src0, src1);
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 res0 = __msa_srari_h(res0, 6);
 res0 = __msa_sat_s_h(res0, 7);
 out = PCKEV_XORI128_UB(res0, res0);
@@ -1974,7 +1974,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t 
src_stride,
 v8i16 filt, out0, out1;
 v16u8 out;
 
-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -2001,7 +2001,7 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t 
src_stride,
 v16u8 out;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -2038,7 +2038,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t 
src_stride,
 v16u8 out;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(_filt_mask_arr[16]);
+mask0 = LD_SB(_hevc_mask_arr[16]);
 src -= 1;
 
 /* rearranging filter */
@@ -2098,12 +2098,11 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t 
src_stride,
 uint8_t *dst, int32_t dst_stride,
 const int8_t *filter, int32_t height)
 {
-uint32_t loop_cnt;
 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
 v16u8 out4, out5;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(_filt_mask_arr[0]);
+mask0 = LD_SB(_hevc_mask_arr[0]);
 src -= 1;
 
 /* rearranging filter */
@@ -2112,21 +2111,31 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t 
src_stride,
 
 mask1 = mask0 + 2;
 
-for (loop_cnt = (height >> 2); loop_cnt--;) {
-LD_SB4(src, src_stride, src0, src1, src2, src3);
-src += (4 * src_stride);
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+src += (4 * src_stride);
 
-XORI_B4_128_SB(src0, src1, src2, src3);
-HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
-   filt1, out0, out1, out2, out3);
-SRARI_H4_SH(out0, out1, out2, out3, 6);
-SAT_SH4_SH(out0, out1, out2, out3, 7);
+XORI_B4_128_SB(src0, src1, src2, src3);
+HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+   filt1, out0, out1, out2, out3);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
+SAT_SH4_SH(out0, out1, out2, out3, 7);
+out4 = PCKEV_XORI128_UB(out0, out1);
+out5 = PCKEV_XORI128_UB(out2, out3);
+ST6x4_UB(out4, out5, dst, dst_stride);
+dst += (4 * dst_stride);
 
-out4 = PCKEV_XORI128_UB(out0, out1);
-out5 = PCKEV_XORI128_UB(out2, out3);
-ST6x4_UB(out4, out5, dst, dst_stride);
-dst += (4 * dst_stride);
-}
+LD_SB4(src, src_stride, src0, src1, src2, src3);
+src += (4 * src_stride);
+
+XORI_B4_128_SB(src0, src1, src2, src3);
+HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+   filt1, out0, out1, out2, out3);
+SRARI_H4_SH(out0, out1, out2, out3, 6);
+SAT_SH4_SH(out0, out1, out2, out3, 7);
+out4 = PCKEV_XORI128_UB(out0, out1);
+out5 = PCKEV_XORI128_UB(out2, out3);
+ST6x4_UB(out4, out5, dst, dst_stride);
+dst += (4 * dst_stride);
 }
 
 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
@@ -2138,7 +2147,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, 
int32_t src_stride,
 v16u8 out;
 v8i16 filt, vec0, vec1, vec2, vec3;
 
-mask0 = LD_SB(_filt_mask_arr[0]);
+mask0 = LD_SB(_hevc_mask_arr[0]);
 src -= 1;
 
 filt = LD_SH(filter);
@@ -2172,7 +2181,7 @@ static void common_hz_4t_8x4mult_msa(uint8_t *src, 
int32_t src_stride,
 v16u8 tmp0, tmp1;
 v8i16 filt, out0, out1, out2, out3;
 
-mask0 = LD_SB(_filt_mask_arr[0]);
+mask0 = LD_SB(_hevc_mask_arr[0]);
 src -= 1;
 
 /* rearranging filter */
@@ -2221,8 +2230,8 @@ static void