Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions

2017-11-03 Thread Michael Niedermayer
On Fri, Nov 03, 2017 at 12:22:50PM +0000, Manojkumar Bhosale wrote:
> LGTM

applied


[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Avoid a single point of failure, be that a person or equipment.


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions

2017-11-03 Thread Manojkumar Bhosale
LGTM

-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of 
kaustubh.ra...@imgtec.com
Sent: Friday, November 3, 2017 11:59 AM
To: ffmpeg-devel@ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and 
vt mc msa functions

From: Kaustubh Raste <kaustubh.ra...@imgtec.com>

Use global mask buffer for appropriate mask load.

Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com>
---
 libavcodec/mips/hevc_mc_biw_msa.c |  587 -
 1 file changed, 247 insertions(+), 340 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_biw_msa.c 
b/libavcodec/mips/hevc_mc_biw_msa.c
index 75c1c7a..0e5f8a0 100644
--- a/libavcodec/mips/hevc_mc_biw_msa.c
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -2633,22 +2633,21 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
  uint8_t *dst,
  int32_t dst_stride,
  const int8_t *filter,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
-int32_t offset, weight;
+int32_t offset, weight, constant;
 v8i16 filt0, filt1;
 v16i8 src0, src1;
 v8i16 in0, in1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1, vec0, vec1;
 v8i16 dst0;
 v4i32 dst0_r, dst0_l;
-v8i16 filter_vec, const_vec;
+v8i16 out0, filter_vec;
 v4i32 weight_vec, offset_vec, rnd_vec;
 
 src0_ptr -= 1;
@@ -2661,9 +2660,10 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x0000FFFF;
 weight = weight0 | (weight1 << 16);
+constant = 128 * weight1;
+constant <<= 6;
+offset += constant;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
 weight_vec = __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
@@ -2674,18 +2674,16 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 XORI_B2_128_SB(src0, src1);
 
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 
 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
-dst0_r = CLIP_SW_0_255(dst0_r);
-dst0_l = CLIP_SW_0_255(dst0_l);
-
-HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
-ST4x2_UB(dst0_r, dst, dst_stride);
+dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+out0 = CLIP_SH_0_255(dst0_r);
+out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
+ST4x2_UB(out0, dst, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@@ -2695,22 +2693,21 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
  uint8_t *dst,
  int32_t dst_stride,
  const int8_t *filter,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
-int32_t offset, weight;
+int32_t offset, weight, constant;
 v8i16 filt0, filt1;
 v16i8 src0, src1, src2, src3;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1;
 v8i16 dst0, dst1;
 v16i8 vec0, vec1;
 v8i16 in0, in1, in2, in3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec;
 v4i32 weight_vec, offset_vec, rnd_vec;
 
 src0_ptr -= 1;
@@ -2724,9 +2721,10 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x0000FFFF;
 weight = weight0 | (weight1 << 16);
+constant = 128 * weight1;
+constant <<= 6;
+offset += constant;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
 weight_vec = __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
@@ -2737,11 +2735,9 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
 ILVR_D2_SH(in1

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc bi wgt 4 tap hz and vt mc msa functions

2017-11-03 Thread kaustubh.raste
From: Kaustubh Raste 

Use global mask buffer for appropriate mask load.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_mc_biw_msa.c |  587 -
 1 file changed, 247 insertions(+), 340 deletions(-)

diff --git a/libavcodec/mips/hevc_mc_biw_msa.c 
b/libavcodec/mips/hevc_mc_biw_msa.c
index 75c1c7a..0e5f8a0 100644
--- a/libavcodec/mips/hevc_mc_biw_msa.c
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -2633,22 +2633,21 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
  uint8_t *dst,
  int32_t dst_stride,
  const int8_t *filter,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
-int32_t offset, weight;
+int32_t offset, weight, constant;
 v8i16 filt0, filt1;
 v16i8 src0, src1;
 v8i16 in0, in1;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1, vec0, vec1;
 v8i16 dst0;
 v4i32 dst0_r, dst0_l;
-v8i16 filter_vec, const_vec;
+v8i16 out0, filter_vec;
 v4i32 weight_vec, offset_vec, rnd_vec;
 
 src0_ptr -= 1;
@@ -2661,9 +2660,10 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x0000FFFF;
 weight = weight0 | (weight1 << 16);
+constant = 128 * weight1;
+constant <<= 6;
+offset += constant;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
 weight_vec = __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
@@ -2674,18 +2674,16 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
 XORI_B2_128_SB(src0, src1);
 
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 
 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
-dst0_r = CLIP_SW_0_255(dst0_r);
-dst0_l = CLIP_SW_0_255(dst0_l);
-
-HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
-ST4x2_UB(dst0_r, dst, dst_stride);
+dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+out0 = CLIP_SH_0_255(dst0_r);
+out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
+ST4x2_UB(out0, dst, dst_stride);
 }
 
 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@@ -2695,22 +2693,21 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
  uint8_t *dst,
  int32_t dst_stride,
  const int8_t *filter,
- int32_t height,
  int32_t weight0,
  int32_t weight1,
  int32_t offset0,
  int32_t offset1,
  int32_t rnd_val)
 {
-int32_t offset, weight;
+int32_t offset, weight, constant;
 v8i16 filt0, filt1;
 v16i8 src0, src1, src2, src3;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
 v16i8 mask1;
 v8i16 dst0, dst1;
 v16i8 vec0, vec1;
 v8i16 in0, in1, in2, in3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec;
 v4i32 weight_vec, offset_vec, rnd_vec;
 
 src0_ptr -= 1;
@@ -2724,9 +2721,10 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
 offset = (offset0 + offset1) << rnd_val;
 weight0 = weight0 & 0x0000FFFF;
 weight = weight0 | (weight1 << 16);
+constant = 128 * weight1;
+constant <<= 6;
+offset += constant;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
 offset_vec = __msa_fill_w(offset);
 weight_vec = __msa_fill_w(weight);
 rnd_vec = __msa_fill_w(rnd_val + 1);
@@ -2737,11 +2735,9 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
 
 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
-dst0 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
-dst1 = const_vec;
-DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+dst1 =