"Ronald S. Bultje" <[email protected]> writes:
> Hi,
>
> as you can see, our fate is yellow since I committed H264/10bit. While
> debugging, I found that h264pred appears to have some aliasing
> problems, in the form of:
> (pixel4=uint64_t)
>
> ((pixel4 *) dst)[x] = ((pixel4 *) src)[x];
>
> ... and this is partially responsible for the fate failures. Basically
> what happens is that very ocasionally, some pixels are simply not
> stored in memory. Unfortunately I didn't get to test this patch
> because Kostylev's machine died while I was debugging the issue. Seems
> to me that replacing each occurrence of the above with something like
> AV_WN4PA(dst, AV_RN4PA(src)); should fix it. This patch doesn't break
> 10bit H264 on my system. It may fix fate on the currently failing
> machines, not sure...
Whether or not it fixes the current failure, the code as is currently
wrong and your patch is in the right direction.
> diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
> index 066e837..c600133 100644
> --- a/libavcodec/h264pred_template.c
> +++ b/libavcodec/h264pred_template.c
> @@ -31,20 +31,21 @@
> static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
> int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - const pixel4 a= ((pixel4*)(src-stride))[0];
> - ((pixel4*)(src+0*stride))[0]= a;
> - ((pixel4*)(src+1*stride))[0]= a;
> - ((pixel4*)(src+2*stride))[0]= a;
> - ((pixel4*)(src+3*stride))[0]= a;
> + const pixel4 a= AV_RN4PA(src-stride);
> +
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
> static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t
> *topright, int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
> - ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
> - ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
> + AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
> + AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
> + AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
> + AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
> }
>
> static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int
> _stride){
> @@ -52,60 +53,69 @@ static void FUNCC(pred4x4_dc)(uint8_t *_src, const
> uint8_t *topright, int _strid
> int stride = _stride/sizeof(pixel);
> const int dc= ( src[-stride] + src[1-stride] + src[2-stride] +
> src[3-stride]
> + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride]
> + src[-1+3*stride] + 4) >>3;
> + const pixel4 a = PIXEL_SPLAT_X4(dc);
>
> - ((pixel4*)(src+0*stride))[0]=
> - ((pixel4*)(src+1*stride))[0]=
> - ((pixel4*)(src+2*stride))[0]=
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
> static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
> int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride]
> + src[-1+3*stride] + 2) >>2;
> + const pixel4 a = PIXEL_SPLAT_X4(dc);
>
> - ((pixel4*)(src+0*stride))[0]=
> - ((pixel4*)(src+1*stride))[0]=
> - ((pixel4*)(src+2*stride))[0]=
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
> static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
> int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> const int dc= ( src[-stride] + src[1-stride] + src[2-stride] +
> src[3-stride] + 2) >>2;
> + const pixel4 a = PIXEL_SPLAT_X4(dc);
>
> - ((pixel4*)(src+0*stride))[0]=
> - ((pixel4*)(src+1*stride))[0]=
> - ((pixel4*)(src+2*stride))[0]=
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
> static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
> int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - ((pixel4*)(src+0*stride))[0]=
> - ((pixel4*)(src+1*stride))[0]=
> - ((pixel4*)(src+2*stride))[0]=
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
> + const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
> +
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
> static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
> int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - ((pixel4*)(src+0*stride))[0]=
> - ((pixel4*)(src+1*stride))[0]=
> - ((pixel4*)(src+2*stride))[0]=
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
> + const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
> +
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
> static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
> int _stride){
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - ((pixel4*)(src+0*stride))[0]=
> - ((pixel4*)(src+1*stride))[0]=
> - ((pixel4*)(src+2*stride))[0]=
> - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
> + const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
> +
> + AV_WN4PA(src+0*stride, a);
> + AV_WN4PA(src+1*stride, a);
> + AV_WN4PA(src+2*stride, a);
> + AV_WN4PA(src+3*stride, a);
> }
>
>
> @@ -286,16 +296,16 @@ static void FUNCC(pred16x16_vertical)(uint8_t *_src,
> int _stride){
> int i;
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - const pixel4 a = ((pixel4*)(src-stride))[0];
> - const pixel4 b = ((pixel4*)(src-stride))[1];
> - const pixel4 c = ((pixel4*)(src-stride))[2];
> - const pixel4 d = ((pixel4*)(src-stride))[3];
> + const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
> + const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
> + const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
> + const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
>
> for(i=0; i<16; i++){
> - ((pixel4*)(src+i*stride))[0] = a;
> - ((pixel4*)(src+i*stride))[1] = b;
> - ((pixel4*)(src+i*stride))[2] = c;
> - ((pixel4*)(src+i*stride))[3] = d;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
> + AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
> + AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
> }
> }
>
> @@ -305,19 +315,21 @@ static void FUNCC(pred16x16_horizontal)(uint8_t *_src,
> int stride){
> stride /= sizeof(pixel);
>
> for(i=0; i<16; i++){
> - ((pixel4*)(src+i*stride))[0] =
> - ((pixel4*)(src+i*stride))[1] =
> - ((pixel4*)(src+i*stride))[2] =
> - ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
> + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
> +
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
> + AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
> + AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
> }
> }
The above look OK.
> #define PREDICT_16x16_DC(v)\
> for(i=0; i<16; i++){\
> - AV_WN4P(src+ 0, v);\
> - AV_WN4P(src+ 4, v);\
> - AV_WN4P(src+ 8, v);\
> - AV_WN4P(src+12, v);\
> + AV_WN4PA(src+ 0, v);\
> + AV_WN4PA(src+ 4, v);\
> + AV_WN4PA(src+ 8, v);\
> + AV_WN4PA(src+12, v);\
> src += stride;\
> }
This looks odd. If src is 4-pixel aligned, it's correct, otherwise not.
The A stands for "aligned".
> @@ -432,12 +444,12 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, int
> _stride){
> int i;
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> - const pixel4 a= ((pixel4*)(src-stride))[0];
> - const pixel4 b= ((pixel4*)(src-stride))[1];
> + const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
> + const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
>
> for(i=0; i<8; i++){
> - ((pixel4*)(src+i*stride))[0]= a;
> - ((pixel4*)(src+i*stride))[1]= b;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
> }
> }
>
> @@ -447,19 +459,21 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src,
> int stride){
> stride /= sizeof(pixel);
>
> for(i=0; i<8; i++){
> - ((pixel4*)(src+i*stride))[0]=
> - ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
> + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
> }
> }
>
> #define PRED8x8_X(n, v)\
> static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
> int i;\
> + const pixel4 a = PIXEL_SPLAT_X4(v);\
> pixel *src = (pixel*)_src;\
> stride /= sizeof(pixel);\
> for(i=0; i<8; i++){\
> - ((pixel4*)(src+i*stride))[0]=\
> - ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
> }\
> }
>
> @@ -483,12 +497,12 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int
> stride){
> dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
>
> for(i=0; i<4; i++){
> - ((pixel4*)(src+i*stride))[0]=
> - ((pixel4*)(src+i*stride))[1]= dc0splat;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
> }
> for(i=4; i<8; i++){
> - ((pixel4*)(src+i*stride))[0]=
> - ((pixel4*)(src+i*stride))[1]= dc2splat;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
> }
> }
>
> @@ -508,12 +522,12 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int
> stride){
> dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
>
> for(i=0; i<4; i++){
> - ((pixel4*)(src+i*stride))[0]= dc0splat;
> - ((pixel4*)(src+i*stride))[1]= dc1splat;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
> }
> for(i=4; i<8; i++){
> - ((pixel4*)(src+i*stride))[0]= dc0splat;
> - ((pixel4*)(src+i*stride))[1]= dc1splat;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
> }
> }
>
> @@ -536,12 +550,12 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, int
> stride){
> dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
>
> for(i=0; i<4; i++){
> - ((pixel4*)(src+i*stride))[0]= dc0splat;
> - ((pixel4*)(src+i*stride))[1]= dc1splat;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
> }
> for(i=4; i<8; i++){
> - ((pixel4*)(src+i*stride))[0]= dc2splat;
> - ((pixel4*)(src+i*stride))[1]= dc3splat;
> + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
> + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
> }
> }
>
> @@ -636,8 +650,8 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, int
> _stride){
> #define PREDICT_8x8_DC(v) \
> int y; \
> for( y = 0; y < 8; y++ ) { \
> - ((pixel4*)src)[0] = \
> - ((pixel4*)src)[1] = v; \
> + AV_WN4PA(((pixel4*)src)+0, v); \
> + AV_WN4PA(((pixel4*)src)+1, v); \
> src += stride; \
> }
These also look OK.
> @@ -693,6 +707,7 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int
> has_topleft, int has_top
> int y;
> pixel *src = (pixel*)_src;
> int stride = _stride/sizeof(pixel);
> + pixel4 a, b;
>
> PREDICT_8x8_LOAD_TOP;
> src[0] = t0;
> @@ -703,9 +718,11 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int
> has_topleft, int has_top
> src[5] = t5;
> src[6] = t6;
> src[7] = t7;
> + a = AV_RN4PA(((pixel4*)src)+0);
> + b = AV_RN4PA(((pixel4*)src)+1);
> for( y = 1; y < 8; y++ ) {
> - ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
> - ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
> + AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
> + AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
AV_COPY* might be used here.
> }
> }
> static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int
> has_topright, int _stride)
> diff --git a/libavcodec/high_bit_depth.h b/libavcodec/high_bit_depth.h
> index 6f2b6a7..511cd00 100644
> --- a/libavcodec/high_bit_depth.h
> +++ b/libavcodec/high_bit_depth.h
> @@ -14,6 +14,7 @@
> # undef rnd_avg_pixel4
> # undef AV_RN2P
> # undef AV_RN4P
> +# undef AV_RN4PA
> # undef AV_WN2P
> # undef AV_WN4P
> # undef AV_WN4PA
> @@ -46,6 +47,7 @@ CLIP_PIXEL(10)
> # define rnd_avg_pixel4 rnd_avg64
> # define AV_RN2P AV_RN32
> # define AV_RN4P AV_RN64
> +# define AV_RN4PA AV_RN64A
> # define AV_WN2P AV_WN32
> # define AV_WN4P AV_WN64
> # define AV_WN4PA AV_WN64A
> @@ -61,6 +63,7 @@ CLIP_PIXEL(10)
> # define rnd_avg_pixel4 rnd_avg32
> # define AV_RN2P AV_RN16
> # define AV_RN4P AV_RN32
> +# define AV_RN4PA AV_RN32A
> # define AV_WN2P AV_WN16
> # define AV_WN4P AV_WN32
> # define AV_WN4PA AV_WN32A
OK
--
Måns Rullgård
[email protected]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel