---
 libavcodec/vc1.h    |    2 +
 libavcodec/vc1dec.c |  327 ++++++++++++++++++++++++++++++++++++++++-----------
 libavcodec/vc1dsp.c |   58 +++++++++-
 libavcodec/vc1dsp.h |    6 +-
 4 files changed, 321 insertions(+), 72 deletions(-)

diff --git a/libavcodec/vc1.h b/libavcodec/vc1.h
index db8a7f4..96e5744 100644
--- a/libavcodec/vc1.h
+++ b/libavcodec/vc1.h
@@ -317,6 +317,8 @@ typedef struct VC1Context{
     int bi_type;
     int x8_type;
 
+    DCTELEM (*block)[6][64];
+    int n_allocated_blks, cur_blk_idx, left_blk_idx, topleft_blk_idx, 
top_blk_idx;
     uint32_t *cbp_base, *cbp;
     uint8_t *is_intra_base, *is_intra;
     int16_t (*luma_mv_base)[2], (*luma_mv)[2];
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 7097c81..2768ad0 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -160,6 +160,70 @@ enum Imode {
 
 /** @} */ //Bitplane group
 
+static void vc1_put_signed_blocks_clamped(VC1Context *v)
+{
+    MpegEncContext *s= &v->s;
+    /* Write buffered blocks from v->block[] into the picture, then step all
+     * four block indices forward (done on every call, even if nothing is put).
+     * The put pixels loop is always one MB row behind the decoding loop,
+     * because we can only put pixels when overlap filtering is done, and for
+     * filtering of the bottom edge of a MB, we need the next MB row present
+     * as well. Within the row, the put pixels loop is also one MB col behind
+     * the decoding loop, again because for filtering of the right MB edge,
+     * we need the next MB present. */
+    if (!s->first_slice_line) {
+        if (s->mb_x) { /* MB one row up and one column left of s->dest[] */
+            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
+                                             s->dest[0] - 16 * s->linesize - 16,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1],
+                                             s->dest[0] - 16 * s->linesize - 8,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2],
+                                             s->dest[0] - 8 * s->linesize - 16,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
+                                             s->dest[0] - 8 * s->linesize - 8,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
+                                             s->dest[1] - 8 * s->uvlinesize - 8,
+                                             s->uvlinesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
+                                             s->dest[2] - 8 * s->uvlinesize - 8,
+                                             s->uvlinesize);
+        }
+        if (s->mb_x == s->mb_width - 1) { /* last column: also the MB straight above */
+            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
+                                             s->dest[0] - 16 * s->linesize,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1],
+                                             s->dest[0] - 16 * s->linesize + 8,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2],
+                                             s->dest[0] - 8 * s->linesize,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
+                                             s->dest[0] - 8 * s->linesize + 8,
+                                             s->linesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
+                                             s->dest[1] - 8 * s->uvlinesize,
+                                             s->uvlinesize);
+            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
+                                             s->dest[2] - 8 * s->uvlinesize,
+                                             s->uvlinesize);
+        }
+    }
+    /* bump an index, wrapping to 0 once it reaches v->n_allocated_blks */
+#define inc_blk_idx(x) do { \
+        if (++x >= v->n_allocated_blks) x = 0; \
+    } while (0)
+
+    inc_blk_idx(v->topleft_blk_idx);
+    inc_blk_idx(v->top_blk_idx);
+    inc_blk_idx(v->left_blk_idx);
+    inc_blk_idx(v->cur_blk_idx);
+}
+
 static void vc1_loop_filter_iblk(VC1Context *v, int pq)
 {
     MpegEncContext *s = &v->s;
@@ -187,6 +251,150 @@ static void vc1_loop_filter_iblk(VC1Context *v, int pq)
     }
 }
 
+static void vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
+{
+    MpegEncContext *s = &v->s;
+    int j;
+    /* Deblock picture data around s->dest[] (pq goes to every filter call).
+     * The loopfilter runs 1 row and 1 column behind the overlap filter, which
+     * means it runs two rows/cols behind the decoding loop. */
+    if (!s->first_slice_line) {
+        if (s->mb_x) {
+            if (s->mb_y >= s->start_mb_y + 2) {
+                v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 16 * s->linesize - 16, s->linesize, pq);
+
+                if (s->mb_x >= 2)
+                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 16, s->linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 8, s->linesize, pq);
+                for(j = 0; j < 2; j++) {
+                    v->vc1dsp.vc1_v_loop_filter8(s->dest[j+1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
+                    if (s->mb_x >= 2) {
+                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j+1] - 16 * s->uvlinesize - 8, s->uvlinesize, pq);
+                    }
+                }
+            }
+            v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 8 * s->linesize - 16, s->linesize, pq);
+        }
+
+        if (s->mb_x == s->mb_width - 1) {
+            if (s->mb_y >= s->start_mb_y + 2) {
+                v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
+
+                if (s->mb_x)
+                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize, s->linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize + 8, s->linesize, pq);
+                for(j = 0; j < 2; j++) {
+                    v->vc1dsp.vc1_v_loop_filter8(s->dest[j+1] - 8 * s->uvlinesize, s->uvlinesize, pq);
+                    if (s->mb_x >= 2) {
+                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j+1] - 16 * s->uvlinesize, s->uvlinesize, pq);
+                    }
+                }
+            }
+            v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 8 * s->linesize, s->linesize, pq);
+        }
+        /* one row past the slice end (end_mb_y, not mb_height): finish its last MB row */
+        if (s->mb_y == s->end_mb_y) {
+            if (s->mb_x) {
+                if (s->mb_x >= 2)
+                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 16, s->linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 8, s->linesize, pq);
+                if (s->mb_x >= 2) {
+                    for(j = 0; j < 2; j++) {
+                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j+1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
+                    }
+                }
+            }
+
+            if (s->mb_x == s->mb_width - 1) {
+                if (s->mb_x)
+                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
+                if (s->mb_x) {
+                    for(j = 0; j < 2; j++) {
+                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j+1] - 8 * s->uvlinesize, s->uvlinesize, pq);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void vc1_smooth_overlap_filter_iblk(VC1Context *v)
+{
+    MpegEncContext *s = &v->s;
+    int mb_pos;
+    /* Overlap-smooth block edges inside v->block[]; a no-op for CONDOVER_NONE. */
+    if (v->condover == CONDOVER_NONE) return;
+
+    mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    /* Within a MB, the horizontal overlap always runs before the vertical.
+     * To accomplish that, we run the H on left and internal borders of the
+     * currently decoded MB. Then, we wait for the next overlap iteration
+     * to do H overlap on the right edge of this MB, before moving over and
+     * running the V overlap. Therefore, the V overlap makes us trail by one
+     * MB col and the H overlap filter makes us trail by one MB row. This
+     * is reflected in the time at which we run the put_pixels loop. */
+    if(v->condover == CONDOVER_ALL || v->pq >= 9 || v->over_flags_plane[mb_pos]) {
+        if(s->mb_x && (v->condover == CONDOVER_ALL || v->pq >= 9 ||
+                       v->over_flags_plane[mb_pos - 1])) {
+            v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][1],
+                                      v->block[v->cur_blk_idx][0]);
+            v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][3],
+                                      v->block[v->cur_blk_idx][2]);
+            if(!(s->flags & CODEC_FLAG_GRAY)) {
+                v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][4],
+                                          v->block[v->cur_blk_idx][4]);
+                v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][5],
+                                          v->block[v->cur_blk_idx][5]);
+            }
+        }
+        v->vc1dsp.vc1_h_s_overlap(v->block[v->cur_blk_idx][0],
+                                  v->block[v->cur_blk_idx][1]);
+        v->vc1dsp.vc1_h_s_overlap(v->block[v->cur_blk_idx][2],
+                                  v->block[v->cur_blk_idx][3]);
+
+        if (s->mb_x == s->mb_width - 1) { /* last column: V overlap of the current MB is done here */
+            if(!s->first_slice_line && (v->condover == CONDOVER_ALL || v->pq >= 9 ||
+                                        v->over_flags_plane[mb_pos - s->mb_stride])) {
+                v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][2],
+                                          v->block[v->cur_blk_idx][0]);
+                v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][3],
+                                          v->block[v->cur_blk_idx][1]);
+                if(!(s->flags & CODEC_FLAG_GRAY)) {
+                    v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][4],
+                                              v->block[v->cur_blk_idx][4]);
+                    v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][5],
+                                              v->block[v->cur_blk_idx][5]);
+                }
+            }
+            v->vc1dsp.vc1_v_s_overlap(v->block[v->cur_blk_idx][0],
+                                      v->block[v->cur_blk_idx][2]);
+            v->vc1dsp.vc1_v_s_overlap(v->block[v->cur_blk_idx][1],
+                                      v->block[v->cur_blk_idx][3]);
+        }
+    }
+    /* NOTE(review): this test has no v->pq >= 9 term, unlike the other three - confirm intended */
+    if (s->mb_x && (v->condover == CONDOVER_ALL || v->over_flags_plane[mb_pos - 1])) {
+        if(!s->first_slice_line && (v->condover == CONDOVER_ALL || v->pq >= 9 ||
+                                    v->over_flags_plane[mb_pos - s->mb_stride - 1])) {
+            v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][2],
+                                      v->block[v->left_blk_idx][0]);
+            v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][3],
+                                      v->block[v->left_blk_idx][1]);
+            if(!(s->flags & CODEC_FLAG_GRAY)) {
+                v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][4],
+                                          v->block[v->left_blk_idx][4]);
+                v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][5],
+                                          v->block[v->left_blk_idx][5]);
+            }
+        }
+        v->vc1dsp.vc1_v_s_overlap(v->block[v->left_blk_idx][0],
+                                  v->block[v->left_blk_idx][2]);
+        v->vc1dsp.vc1_v_s_overlap(v->block[v->left_blk_idx][1],
+                                  v->block[v->left_blk_idx][3]);
+    }
+}
+
 /** Do motion compensation over 1 macroblock
  * Mostly adapted hpel_motion and qpel_motion from mpegvideo.c
  */
@@ -2764,7 +2972,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
 
 /** Decode blocks of I-frame for advanced profile
  */
-static void vc1_decode_i_blocks_adv(VC1Context *v, int mby_start, int mby_end)
+static void vc1_decode_i_blocks_adv(VC1Context *v)
 {
     int k;
     MpegEncContext *s = &v->s;
@@ -2773,7 +2981,6 @@ static void vc1_decode_i_blocks_adv(VC1Context *v, int 
mby_start, int mby_end)
     int mb_pos;
     int mquant = v->pq;
     int mqdiff;
-    int overlap;
     GetBitContext *gb = &s->gb;
 
     /* select codingmode used for VLC tables selection */
@@ -2805,26 +3012,20 @@ static void vc1_decode_i_blocks_adv(VC1Context *v, int 
mby_start, int mby_end)
     s->mb_x = s->mb_y = 0;
     s->mb_intra = 1;
     s->first_slice_line = 1;
-    s->mb_y = mby_start;
-    if (mby_start) {
+    s->mb_y = s->start_mb_y;
+    if (s->start_mb_y) {
         s->mb_x = 0;
         ff_init_block_index(s);
         memset(&s->coded_block[s->block_index[0]-s->b8_stride], 0,
                s->b8_stride * sizeof(*s->coded_block));
     }
-    for(; s->mb_y < mby_end; s->mb_y++) {
+    for(; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         ff_init_block_index(s);
         for(;s->mb_x < s->mb_width; s->mb_x++) {
-            uint8_t *dst[6];
+            DCTELEM (*block)[64] = v->block[v->cur_blk_idx];
             ff_update_block_index(s);
-            dst[0] = s->dest[0];
-            dst[1] = dst[0] + 8;
-            dst[2] = s->dest[0] + s->linesize * 8;
-            dst[3] = dst[2] + 8;
-            dst[4] = s->dest[1];
-            dst[5] = s->dest[2];
-            s->dsp.clear_blocks(s->block[0]);
+            s->dsp.clear_blocks(block[0]);
             mb_pos = s->mb_x + s->mb_y * s->mb_stride;
             s->current_picture.mb_type[mb_pos] = MB_TYPE_INTRA;
             s->current_picture.motion_val[1][s->block_index[0]][0] = 0;
@@ -2837,13 +3038,8 @@ static void vc1_decode_i_blocks_adv(VC1Context *v, int 
mby_start, int mby_end)
             else
                 v->s.ac_pred = v->acpred_plane[mb_pos];
 
-            if(v->condover == CONDOVER_SELECT) {
-                if(v->overflg_is_raw)
-                    overlap = get_bits1(&v->s.gb);
-                else
-                    overlap = v->over_flags_plane[mb_pos];
-            } else
-                overlap = (v->condover == CONDOVER_ALL);
+            if (v->condover == CONDOVER_SELECT && v->overflg_is_raw)
+                v->over_flags_plane[mb_pos] = get_bits1(&v->s.gb);
 
             GET_MQUANT();
 
@@ -2865,40 +3061,18 @@ static void vc1_decode_i_blocks_adv(VC1Context *v, int 
mby_start, int mby_end)
                 v->a_avail = !s->first_slice_line || (k==2 || k==3);
                 v->c_avail = !!s->mb_x || (k==1 || k==3);
 
-                vc1_decode_i_block_adv(v, s->block[k], k, val, (k<4)? 
v->codingset : v->codingset2, mquant);
+                vc1_decode_i_block_adv(v, block[k], k, val, (k<4)? 
v->codingset : v->codingset2, mquant);
 
                 if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
-                v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
-                s->dsp.put_signed_pixels_clamped(s->block[k], dst[k],
-                                                 k & 4 ? s->uvlinesize : 
s->linesize);
+                v->vc1dsp.vc1_inv_trans_8x8(block[k]);
             }
 
-            if(overlap) {
-                if(s->mb_x) {
-                    v->vc1dsp.vc1_h_overlap(s->dest[0], s->linesize);
-                    v->vc1dsp.vc1_h_overlap(s->dest[0] + 8 * s->linesize, 
s->linesize);
-                    if(!(s->flags & CODEC_FLAG_GRAY)) {
-                        v->vc1dsp.vc1_h_overlap(s->dest[1], s->uvlinesize);
-                        v->vc1dsp.vc1_h_overlap(s->dest[2], s->uvlinesize);
-                    }
-                }
-                v->vc1dsp.vc1_h_overlap(s->dest[0] + 8, s->linesize);
-                v->vc1dsp.vc1_h_overlap(s->dest[0] + 8 * s->linesize + 8, 
s->linesize);
-                if(!s->first_slice_line) {
-                    v->vc1dsp.vc1_v_overlap(s->dest[0], s->linesize);
-                    v->vc1dsp.vc1_v_overlap(s->dest[0] + 8, s->linesize);
-                    if(!(s->flags & CODEC_FLAG_GRAY)) {
-                        v->vc1dsp.vc1_v_overlap(s->dest[1], s->uvlinesize);
-                        v->vc1dsp.vc1_v_overlap(s->dest[2], s->uvlinesize);
-                    }
-                }
-                v->vc1dsp.vc1_v_overlap(s->dest[0] + 8 * s->linesize, 
s->linesize);
-                v->vc1dsp.vc1_v_overlap(s->dest[0] + 8 * s->linesize + 8, 
s->linesize);
-            }
-            if(v->s.loop_filter) vc1_loop_filter_iblk(v, v->pq);
+            vc1_smooth_overlap_filter_iblk(v);
+            vc1_put_signed_blocks_clamped(v);
+            if(v->s.loop_filter) vc1_loop_filter_iblk_delayed(v, v->pq);
 
             if(get_bits_count(&s->gb) > v->bits) {
-                ff_er_add_slice(s, 0, mby_start, s->mb_x, s->mb_y, 
(AC_END|DC_END|MV_END));
+                ff_er_add_slice(s, 0, s->start_mb_y, s->mb_x, s->mb_y, 
(AC_END|DC_END|MV_END));
                 av_log(s->avctx, AV_LOG_ERROR, "Bits overconsumption: %i > 
%i\n", get_bits_count(&s->gb), v->bits);
                 return;
             }
@@ -2909,12 +3083,21 @@ static void vc1_decode_i_blocks_adv(VC1Context *v, int 
mby_start, int mby_end)
             ff_draw_horiz_band(s, (s->mb_y-1) * 16, 16);
         s->first_slice_line = 0;
     }
+
+    /* raw bottom MB row */
+    s->mb_x = 0;
+    ff_init_block_index(s);
+    for(;s->mb_x < s->mb_width; s->mb_x++) {
+        ff_update_block_index(s);
+        vc1_put_signed_blocks_clamped(v);
+        if(v->s.loop_filter) vc1_loop_filter_iblk_delayed(v, v->pq);
+    }
     if (v->s.loop_filter)
         ff_draw_horiz_band(s, (s->mb_height-1)*16, 16);
-    ff_er_add_slice(s, 0, mby_start, s->mb_width - 1, mby_end - 1, 
(AC_END|DC_END|MV_END));
+    ff_er_add_slice(s, 0, s->start_mb_y, s->mb_width - 1, s->end_mb_y - 1, 
(AC_END|DC_END|MV_END));
 }
 
-static void vc1_decode_p_blocks(VC1Context *v, int mby_start, int mby_end)
+static void vc1_decode_p_blocks(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     int apply_loop_filter;
@@ -2947,17 +3130,17 @@ static void vc1_decode_p_blocks(VC1Context *v, int 
mby_start, int mby_end)
     apply_loop_filter = s->loop_filter && !(s->avctx->skip_loop_filter >= 
AVDISCARD_NONKEY);
     s->first_slice_line = 1;
     memset(v->cbp_base, 0, sizeof(v->cbp_base[0])*2*s->mb_stride);
-    for(s->mb_y = mby_start; s->mb_y < mby_end; s->mb_y++) {
+    for(s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         ff_init_block_index(s);
         for(; s->mb_x < s->mb_width; s->mb_x++) {
             ff_update_block_index(s);
 
             vc1_decode_p_mb(v);
-            if (s->mb_y != mby_start && apply_loop_filter)
+            if (s->mb_y != s->start_mb_y && apply_loop_filter)
                 vc1_apply_p_loop_filter(v);
             if(get_bits_count(&s->gb) > v->bits || get_bits_count(&s->gb) < 0) 
{
-                ff_er_add_slice(s, 0, mby_start, s->mb_x, s->mb_y, 
(AC_END|DC_END|MV_END));
+                ff_er_add_slice(s, 0, s->start_mb_y, s->mb_x, s->mb_y, 
(AC_END|DC_END|MV_END));
                 av_log(s->avctx, AV_LOG_ERROR, "Bits overconsumption: %i > %i 
at %ix%i\n", get_bits_count(&s->gb), v->bits,s->mb_x,s->mb_y);
                 return;
             }
@@ -2966,7 +3149,7 @@ static void vc1_decode_p_blocks(VC1Context *v, int 
mby_start, int mby_end)
         memmove(v->ttblk_base, v->ttblk, 
sizeof(v->ttblk_base[0])*s->mb_stride);
         memmove(v->is_intra_base, v->is_intra, 
sizeof(v->is_intra_base[0])*s->mb_stride);
         memmove(v->luma_mv_base, v->luma_mv, 
sizeof(v->luma_mv_base[0])*s->mb_stride);
-        if (s->mb_y != mby_start) ff_draw_horiz_band(s, (s->mb_y-1) * 16, 16);
+        if (s->mb_y != s->start_mb_y) ff_draw_horiz_band(s, (s->mb_y-1) * 16, 
16);
         s->first_slice_line = 0;
     }
     if (apply_loop_filter) {
@@ -2977,12 +3160,12 @@ static void vc1_decode_p_blocks(VC1Context *v, int 
mby_start, int mby_end)
             vc1_apply_p_loop_filter(v);
         }
     }
-    if (mby_end >= mby_start)
-        ff_draw_horiz_band(s, (mby_end-1) * 16, 16);
-    ff_er_add_slice(s, 0, mby_start, s->mb_width - 1, mby_end - 1, 
(AC_END|DC_END|MV_END));
+    if (s->end_mb_y >= s->start_mb_y)
+        ff_draw_horiz_band(s, (s->end_mb_y-1) * 16, 16);
+    ff_er_add_slice(s, 0, s->start_mb_y, s->mb_width - 1, s->end_mb_y - 1, 
(AC_END|DC_END|MV_END));
 }
 
-static void vc1_decode_b_blocks(VC1Context *v, int mby_start, int mby_end)
+static void vc1_decode_b_blocks(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
 
@@ -3012,7 +3195,7 @@ static void vc1_decode_b_blocks(VC1Context *v, int 
mby_start, int mby_end)
     }
 
     s->first_slice_line = 1;
-    for(s->mb_y = mby_start; s->mb_y < mby_end; s->mb_y++) {
+    for(s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         ff_init_block_index(s);
         for(; s->mb_x < s->mb_width; s->mb_x++) {
@@ -3020,7 +3203,7 @@ static void vc1_decode_b_blocks(VC1Context *v, int 
mby_start, int mby_end)
 
             vc1_decode_b_mb(v);
             if(get_bits_count(&s->gb) > v->bits || get_bits_count(&s->gb) < 0) 
{
-                ff_er_add_slice(s, 0, mby_start, s->mb_x, s->mb_y, 
(AC_END|DC_END|MV_END));
+                ff_er_add_slice(s, 0, s->start_mb_y, s->mb_x, s->mb_y, 
(AC_END|DC_END|MV_END));
                 av_log(s->avctx, AV_LOG_ERROR, "Bits overconsumption: %i > %i 
at %ix%i\n", get_bits_count(&s->gb), v->bits,s->mb_x,s->mb_y);
                 return;
             }
@@ -3034,7 +3217,7 @@ static void vc1_decode_b_blocks(VC1Context *v, int 
mby_start, int mby_end)
     }
     if (v->s.loop_filter)
         ff_draw_horiz_band(s, (s->mb_height-1)*16, 16);
-    ff_er_add_slice(s, 0, mby_start, s->mb_width - 1, mby_end - 1, 
(AC_END|DC_END|MV_END));
+    ff_er_add_slice(s, 0, s->start_mb_y, s->mb_width - 1, s->end_mb_y - 1, 
(AC_END|DC_END|MV_END));
 }
 
 static void vc1_decode_skip_blocks(VC1Context *v)
@@ -3056,20 +3239,24 @@ static void vc1_decode_skip_blocks(VC1Context *v)
     s->pict_type = FF_P_TYPE;
 }
 
-static void vc1_decode_blocks(VC1Context *v, int mby_start, int mby_end)
+static void vc1_decode_blocks(VC1Context *v)
 {
 
     v->s.esc3_level_length = 0;
     if(v->x8_type){
         ff_intrax8_decode_picture(&v->x8, 2*v->pq+v->halfpq, 
v->pq*(!v->pquantizer) );
     }else{
+        v->cur_blk_idx = 0;
+        v->left_blk_idx = -1;
+        v->topleft_blk_idx = 1;
+        v->top_blk_idx = 2;
         switch(v->s.pict_type) {
         case FF_I_TYPE:
             if(v->profile == PROFILE_ADVANCED)
 {
 #undef printf
 //printf("I\n");
-                vc1_decode_i_blocks_adv(v, mby_start, mby_end);
+                vc1_decode_i_blocks_adv(v);
 }
             else
                 vc1_decode_i_blocks(v);
@@ -3083,7 +3270,7 @@ static void vc1_decode_blocks(VC1Context *v, int 
mby_start, int mby_end)
             else
 {
 //printf("P\n");
-                vc1_decode_p_blocks(v, mby_start, mby_end);
+                vc1_decode_p_blocks(v);
 }
             break;
         case FF_B_TYPE:
@@ -3091,14 +3278,14 @@ static void vc1_decode_blocks(VC1Context *v, int 
mby_start, int mby_end)
                 if(v->profile == PROFILE_ADVANCED)
 {
 //printf("BI\n");
-                    vc1_decode_i_blocks_adv(v, mby_start, mby_end);
+                    vc1_decode_i_blocks_adv(v);
 }
                 else
                     vc1_decode_i_blocks(v);
             }else
 {
 //printf("B\n");
-                vc1_decode_b_blocks(v, mby_start, mby_end);
+                vc1_decode_b_blocks(v);
 }
             break;
         }
@@ -3349,6 +3536,8 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
     v->acpred_plane = av_malloc(s->mb_stride * s->mb_height);
     v->over_flags_plane = av_malloc(s->mb_stride * s->mb_height);
 
+    v->n_allocated_blks = s->mb_width + 2;
+    v->block = av_malloc(sizeof(*v->block) * v->n_allocated_blks);
     v->cbp_base = av_malloc(sizeof(v->cbp_base[0]) * 2 * s->mb_stride);
     v->cbp = v->cbp_base + s->mb_stride;
     v->ttblk_base = av_malloc(sizeof(v->ttblk_base[0]) * 2 * s->mb_stride);
@@ -3555,8 +3744,9 @@ static int vc1_decode_frame(AVCodecContext *avctx,
         for (i = 0; i <= n_slices; i++) {
             if (i && get_bits1(&s->gb))
                 vc1_parse_frame_header_adv(v, &s->gb);
-            vc1_decode_blocks(v, i == 0 ? 0 : FFMAX(0, slices[i-1].mby_start),
-                i == n_slices ? s->mb_height : FFMIN(s->mb_height, 
slices[i].mby_start));
+            s->start_mb_y = (i == 0)        ? 0 : FFMAX(0, 
slices[i-1].mby_start);
+            s->end_mb_y   = (i == n_slices) ? s->mb_height : 
FFMIN(s->mb_height, slices[i].mby_start);
+            vc1_decode_blocks(v);
             if (i != n_slices) s->gb = slices[i].gb;
         }
 //av_log(s->avctx, AV_LOG_INFO, "Consumed %i/%i bits\n", 
get_bits_count(&s->gb), s->gb.size_in_bits);
@@ -3613,6 +3803,7 @@ static av_cold int vc1_decode_end(AVCodecContext *avctx)
     av_freep(&v->acpred_plane);
     av_freep(&v->over_flags_plane);
     av_freep(&v->mb_type_base);
+    av_freep(&v->block);
     av_freep(&v->cbp_base);
     av_freep(&v->ttblk_base);
     av_freep(&v->is_intra_base); // FIXME use v->mb_type[]
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index e131553..14f0dc3 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -78,6 +78,58 @@ static void vc1_h_overlap_c(uint8_t* src, int stride)
     }
 }
 
+static void vc1_v_s_overlap_c(DCTELEM *top,  DCTELEM *bottom)
+{
+    /* Smooth, in place, rows 6-7 of the 8x8 block top against rows 0-1 of bottom. */
+    int i;
+    int a, b, c, d, d1, d2;
+    int rnd1 = 4, rnd2 = 3; /* the two rounding terms swap on every column */
+    for(i = 0; i < 8; i++) {
+        a = top[48];
+        b = top[56];
+        c = bottom[0];
+        d = bottom[8];
+        d1 = a - d;
+        d2 = a - d + b - c;
+        /* * 8 rather than << 3: left-shifting a negative int is undefined behaviour */
+        top[48]   = ((a * 8) - d1 + rnd1) >> 3;
+        top[56]   = ((b * 8) - d2 + rnd2) >> 3;
+        bottom[0] = ((c * 8) + d2 + rnd1) >> 3;
+        bottom[8] = ((d * 8) + d1 + rnd2) >> 3;
+
+        bottom++;
+        top++;
+        rnd2 = 7 - rnd2;
+        rnd1 = 7 - rnd1;
+    }
+}
+
+static void vc1_h_s_overlap_c(DCTELEM *left, DCTELEM *right)
+{
+    /* Smooth, in place, columns 6-7 of the 8x8 block left against columns 0-1 of right. */
+    int i;
+    int a, b, c, d, d1, d2;
+    int rnd1 = 4, rnd2 = 3; /* the two rounding terms swap on every row */
+    for(i = 0; i < 8; i++) {
+        a = left[6];
+        b = left[7];
+        c = right[0];
+        d = right[1];
+        d1 = a - d;
+        d2 = a - d + b - c;
+        /* * 8 rather than << 3: left-shifting a negative int is undefined behaviour */
+        left[6]  = ((a * 8) - d1 + rnd1) >> 3;
+        left[7]  = ((b * 8) - d2 + rnd2) >> 3;
+        right[0] = ((c * 8) + d2 + rnd1) >> 3;
+        right[1] = ((d * 8) + d1 + rnd2) >> 3;
+
+        right += 8;
+        left += 8;
+        rnd2 = 7 - rnd2;
+        rnd1 = 7 - rnd1;
+    }
+}
+
 /**
  * VC-1 in-loop deblocking filter for one line
  * @param src source block type
@@ -672,6 +724,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;
     dsp->vc1_h_overlap = vc1_h_overlap_c;
     dsp->vc1_v_overlap = vc1_v_overlap_c;
+    dsp->vc1_h_s_overlap = vc1_h_s_overlap_c;
+    dsp->vc1_v_s_overlap = vc1_v_s_overlap_c;
     dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c;
     dsp->vc1_h_loop_filter4 = vc1_h_loop_filter4_c;
     dsp->vc1_v_loop_filter8 = vc1_v_loop_filter8_c;
@@ -718,6 +772,6 @@ av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
 
     if (HAVE_ALTIVEC)
         ff_vc1dsp_init_altivec(dsp);
-    if (HAVE_MMX)
-        ff_vc1dsp_init_mmx(dsp);
+    //if (HAVE_MMX)
+    //    ff_vc1dsp_init_mmx(dsp);
 }
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index 7b1ae10..e1b6ba0 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -40,8 +40,10 @@ typedef struct VC1DSPContext {
     void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
     void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
     void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_v_overlap)(uint8_t* src, int stride);
-    void (*vc1_h_overlap)(uint8_t* src, int stride);
+    void (*vc1_v_overlap)(uint8_t *src, int stride);
+    void (*vc1_h_overlap)(uint8_t *src, int stride);
+    void (*vc1_v_s_overlap)(DCTELEM *top,  DCTELEM *bottom);
+    void (*vc1_h_s_overlap)(DCTELEM *left, DCTELEM *right);
     void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
     void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
     void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
-- 
1.7.2.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to