Re: [libav-devel] [PATCH] vc1: make overlap filter for I-frames bit-exact.

Kostya Sun, 01 May 2011 22:42:53 -0700

On Sun, May 01, 2011 at 10:35:14PM -0400, Ronald S. Bultje wrote:
> ---
>  libavcodec/vc1.h    |    2 +
>  libavcodec/vc1dec.c |  327 
> ++++++++++++++++++++++++++++++++++++++++-----------
>  libavcodec/vc1dsp.c |   58 +++++++++-
>  libavcodec/vc1dsp.h |    6 +-
>  4 files changed, 321 insertions(+), 72 deletions(-)
> 
> diff --git a/libavcodec/vc1.h b/libavcodec/vc1.h
> index db8a7f4..96e5744 100644
> --- a/libavcodec/vc1.h
> +++ b/libavcodec/vc1.h
> @@ -317,6 +317,8 @@ typedef struct VC1Context{
>      int bi_type;
>      int x8_type;
>  
> +    DCTELEM (*block)[6][64];
> +    int n_allocated_blks, cur_blk_idx, left_blk_idx, topleft_blk_idx, 
> top_blk_idx;
>      uint32_t *cbp_base, *cbp;
>      uint8_t *is_intra_base, *is_intra;
>      int16_t (*luma_mv_base)[2], (*luma_mv)[2];
> diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
> index 7097c81..2768ad0 100644
> --- a/libavcodec/vc1dec.c
> +++ b/libavcodec/vc1dec.c
> @@ -160,6 +160,70 @@ enum Imode {
>  
>  /** @} */ //Bitplane group
>  
> +static void vc1_put_signed_blocks_clamped(VC1Context *v)
> +{
> +    MpegEncContext *s= &v->s;


I am against such formatting.

> +
> +    /* The put pixels loop is always one MB row behind the decoding loop,
> +     * because we can only put pixels when overlap filtering is done, and
> +     * for filtering of the bottom edge of a MB, we need the next MB row
> +     * present as well.
> +     * Within the row, the put pixels loop is also one MB col behind the
> +     * decoding loop. The reason for this is again, because for filtering
> +     * of the right MB edge, we need the next MB present. */
> +    if (!s->first_slice_line) {
> +        if (s->mb_x) {
> +            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
> +                                             s->dest[0] - 16 * s->linesize - 
> 16,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1],
> +                                             s->dest[0] - 16 * s->linesize - 
> 8,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2],
> +                                             s->dest[0] - 8 * s->linesize - 
> 16,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
> +                                             s->dest[0] - 8 * s->linesize - 
> 8,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
> +                                             s->dest[1] - 8 * s->uvlinesize 
> - 8,
> +                                             s->uvlinesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
> +                                             s->dest[2] - 8 * s->uvlinesize 
> - 8,
> +                                             s->uvlinesize);
> +        }
> +        if (s->mb_x == s->mb_width - 1) {
> +            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
> +                                             s->dest[0] - 16 * s->linesize,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1],
> +                                             s->dest[0] - 16 * s->linesize + 
> 8,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2],
> +                                             s->dest[0] - 8 * s->linesize,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
> +                                             s->dest[0] - 8 * s->linesize + 
> 8,
> +                                             s->linesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
> +                                             s->dest[1] - 8 * s->uvlinesize,
> +                                             s->uvlinesize);
> +            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
> +                                             s->dest[2] - 8 * s->uvlinesize,
> +                                             s->uvlinesize);
> +        }
> +    }
> +
> +#define inc_blk_idx(x) do { \
> +        if (++x >= v->n_allocated_blks) x = 0; \
> +    } while (0)
> +
> +    inc_blk_idx(v->topleft_blk_idx);
> +    inc_blk_idx(v->top_blk_idx);
> +    inc_blk_idx(v->left_blk_idx);
> +    inc_blk_idx(v->cur_blk_idx);

I'd simply use
 idx++;
 if (idx >= v->n_allocated_blks)
   idx = 0;

and it's a bit silly to define a macro for such a simple statement when it is only used on four lines

> +}
> +
>  static void vc1_loop_filter_iblk(VC1Context *v, int pq)
>  {
>      MpegEncContext *s = &v->s;
> @@ -187,6 +251,150 @@ static void vc1_loop_filter_iblk(VC1Context *v, int pq)
[...]
> +
> +static void vc1_smooth_overlap_filter_iblk(VC1Context *v)
> +{
> +    MpegEncContext *s = &v->s;
> +    int mb_pos;
> +
> +    if (v->condover == CONDOVER_NONE) return;

you can add a newline here if you want to

[...]
> @@ -2764,7 +2972,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
>  
>  /** Decode blocks of I-frame for advanced profile
>   */
> -static void vc1_decode_i_blocks_adv(VC1Context *v, int mby_start, int 
> mby_end)
> +static void vc1_decode_i_blocks_adv(VC1Context *v)

This change to how the slice start/end is passed seems fairly independent of
the rest; maybe make it a separate patch?

[...]
> @@ -3056,20 +3239,24 @@ static void vc1_decode_skip_blocks(VC1Context *v)
>      s->pict_type = FF_P_TYPE;
>  }
>  
> -static void vc1_decode_blocks(VC1Context *v, int mby_start, int mby_end)
> +static void vc1_decode_blocks(VC1Context *v)
>  {
>  
>      v->s.esc3_level_length = 0;
>      if(v->x8_type){
>          ff_intrax8_decode_picture(&v->x8, 2*v->pq+v->halfpq, 
> v->pq*(!v->pquantizer) );
>      }else{
> +        v->cur_blk_idx = 0;
> +        v->left_blk_idx = -1;
> +        v->topleft_blk_idx = 1;
> +        v->top_blk_idx = 2;
>          switch(v->s.pict_type) {
>          case FF_I_TYPE:
>              if(v->profile == PROFILE_ADVANCED)
>  {
>  #undef printf
>  //printf("I\n");

ahem

> -                vc1_decode_i_blocks_adv(v, mby_start, mby_end);
> +                vc1_decode_i_blocks_adv(v);
>  }
>              else
>                  vc1_decode_i_blocks(v);
> @@ -3083,7 +3270,7 @@ static void vc1_decode_blocks(VC1Context *v, int 
> mby_start, int mby_end)
>              else
>  {
>  //printf("P\n");

Diego would cry because of those ugly debug lines

> -                vc1_decode_p_blocks(v, mby_start, mby_end);
> +                vc1_decode_p_blocks(v);
>  }
>              break;
>          case FF_B_TYPE:
> @@ -3091,14 +3278,14 @@ static void vc1_decode_blocks(VC1Context *v, int 
> mby_start, int mby_end)
>                  if(v->profile == PROFILE_ADVANCED)
>  {
>  //printf("BI\n");
> -                    vc1_decode_i_blocks_adv(v, mby_start, mby_end);
> +                    vc1_decode_i_blocks_adv(v);
>  }
>                  else
>                      vc1_decode_i_blocks(v);
>              }else
>  {
>  //printf("B\n");
> -                vc1_decode_b_blocks(v, mby_start, mby_end);
> +                vc1_decode_b_blocks(v);
>  }
>              break;
>          }
> @@ -3349,6 +3536,8 @@ static av_cold int vc1_decode_init(AVCodecContext 
> *avctx)
>      v->acpred_plane = av_malloc(s->mb_stride * s->mb_height);
>      v->over_flags_plane = av_malloc(s->mb_stride * s->mb_height);
>  
> +    v->n_allocated_blks = s->mb_width + 2;
> +    v->block = av_malloc(sizeof(*v->block) * v->n_allocated_blks);

I wonder why nobody cared about checking the allocation result back then.

>      v->cbp_base = av_malloc(sizeof(v->cbp_base[0]) * 2 * s->mb_stride);
>      v->cbp = v->cbp_base + s->mb_stride;
>      v->ttblk_base = av_malloc(sizeof(v->ttblk_base[0]) * 2 * s->mb_stride);
> @@ -3555,8 +3744,9 @@ static int vc1_decode_frame(AVCodecContext *avctx,
>          for (i = 0; i <= n_slices; i++) {
>              if (i && get_bits1(&s->gb))
>                  vc1_parse_frame_header_adv(v, &s->gb);
> -            vc1_decode_blocks(v, i == 0 ? 0 : FFMAX(0, 
> slices[i-1].mby_start),
> -                i == n_slices ? s->mb_height : FFMIN(s->mb_height, 
> slices[i].mby_start));
> +            s->start_mb_y = (i == 0)        ? 0 : FFMAX(0, 
> slices[i-1].mby_start);
> +            s->end_mb_y   = (i == n_slices) ? s->mb_height : 
> FFMIN(s->mb_height, slices[i].mby_start);
> +            vc1_decode_blocks(v);
>              if (i != n_slices) s->gb = slices[i].gb;
>          }
>  //av_log(s->avctx, AV_LOG_INFO, "Consumed %i/%i bits\n", 
> get_bits_count(&s->gb), s->gb.size_in_bits);
> @@ -3613,6 +3803,7 @@ static av_cold int vc1_decode_end(AVCodecContext *avctx)
>      av_freep(&v->acpred_plane);
>      av_freep(&v->over_flags_plane);
>      av_freep(&v->mb_type_base);
> +    av_freep(&v->block);
>      av_freep(&v->cbp_base);
>      av_freep(&v->ttblk_base);
>      av_freep(&v->is_intra_base); // FIXME use v->mb_type[]
> diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
> index e131553..14f0dc3 100644
> --- a/libavcodec/vc1dsp.c
> +++ b/libavcodec/vc1dsp.c
> @@ -78,6 +78,58 @@ static void vc1_h_overlap_c(uint8_t* src, int stride)
>      }
>  }
>  
> +static void vc1_v_s_overlap_c(DCTELEM *top,  DCTELEM *bottom)
> +{
> +    int i;
> +    int a, b, c, d;
> +    int d1, d2;
> +    int rnd1 = 4, rnd2 = 3;
> +    for(i = 0; i < 8; i++) {
> +        a = top[48];
> +        b = top[56];
> +        c = bottom[0];
> +        d = bottom[8];
> +        d1 = a - d;
> +        d2 = a - d + b - c;
> +        
> +        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
> +        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
> +        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
> +        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
> +
> +        bottom++;
> +        top++;
> +        rnd2 = 7 - rnd2;
> +        rnd1 = 7 - rnd1;
> +    }
> +}
> +
> +static void vc1_h_s_overlap_c(DCTELEM *left, DCTELEM *right)
> +{
> +    int i;
> +    int a, b, c, d;
> +    int d1, d2;
> +    int rnd1 = 4, rnd2 = 3;
> +    for(i = 0; i < 8; i++) {
> +        a = left[6];
> +        b = left[7];
> +        c = right[0];
> +        d = right[1];
> +        d1 = a - d;
> +        d2 = a - d + b - c;
> +
> +        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
> +        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
> +        right[0] = ((c << 3) + d2 + rnd1) >> 3;
> +        right[1] = ((d << 3) + d1 + rnd2) >> 3;
> +
> +        right += 8;
> +        left += 8;
> +        rnd2 = 7 - rnd2;
> +        rnd1 = 7 - rnd1;
> +    }
> +}
> +
>  /**
>   * VC-1 in-loop deblocking filter for one line
>   * @param src source block type
> @@ -672,6 +724,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
>      dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;
>      dsp->vc1_h_overlap = vc1_h_overlap_c;
>      dsp->vc1_v_overlap = vc1_v_overlap_c;
> +    dsp->vc1_h_s_overlap = vc1_h_s_overlap_c;
> +    dsp->vc1_v_s_overlap = vc1_v_s_overlap_c;
>      dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c;
>      dsp->vc1_h_loop_filter4 = vc1_h_loop_filter4_c;
>      dsp->vc1_v_loop_filter8 = vc1_v_loop_filter8_c;
> @@ -718,6 +772,6 @@ av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
>  
>      if (HAVE_ALTIVEC)
>          ff_vc1dsp_init_altivec(dsp);
> -    if (HAVE_MMX)
> -        ff_vc1dsp_init_mmx(dsp);
> +    //if (HAVE_MMX)
> +    //    ff_vc1dsp_init_mmx(dsp);

ahem, why?

>  }
> diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
> index 7b1ae10..e1b6ba0 100644
> --- a/libavcodec/vc1dsp.h
> +++ b/libavcodec/vc1dsp.h
> @@ -40,8 +40,10 @@ typedef struct VC1DSPContext {
>      void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM 
> *block);
>      void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM 
> *block);
>      void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM 
> *block);
> -    void (*vc1_v_overlap)(uint8_t* src, int stride);
> -    void (*vc1_h_overlap)(uint8_t* src, int stride);
> +    void (*vc1_v_overlap)(uint8_t *src, int stride);
> +    void (*vc1_h_overlap)(uint8_t *src, int stride);
> +    void (*vc1_v_s_overlap)(DCTELEM *top,  DCTELEM *bottom);
> +    void (*vc1_h_s_overlap)(DCTELEM *left, DCTELEM *right);
>      void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
>      void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
>      void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
> -- 

Overall, very nice work
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] vc1: make overlap filter for I-frames bit-exact.

Reply via email to