On 9/9/2017 1:27 PM, Michael Niedermayer wrote:
+    // If the image is sufficiently aligned, compute 8 samples at once
+    if (!(((uintptr_t)dst) & 7)) {
+        uint64_t *dst64 = (uint64_t *)dst;
+        int w = avctx->width>>1;
+        for (x = 0; x < w; x++) {
+            dst64[x] = (dst64[x] << 3) & 0xFCFCFCFCFCFCFCFCULL;
+        }
+        x *= 8;
+    } else
+        x = 0;
+    for (; x < avctx->width * 4; x++) {
         dst[x] = dst[x] << 3;
     }
Forgive me if I'm not understanding the code correctly, but couldn't you always apply the optimization if you first process the (up to) 7 leading unaligned samples one at a time, so that the rest of the buffer is 8-byte aligned?
Pseudocode:

    uint64_t *dst64 = (uint64_t *)dst;
    int w = avctx->width>>1;
    x=0

    // compute un-aligned beginning samples
    for (; x < (avctx->width * 4) && (((uintptr_t)dst) & 7); x++) {
        dst[x] = dst[x] << 3;
    }

    // compute aligned samples
    for (; x < w; x+=8) {
        dst64[x] = (dst64[x] << 3) & 0xFCFCFCFCFCFCFCFCULL;
    }
    x -= 8;

    // compute un-aligned ending samples
    for (; x < avctx->width * 4; x++) {
        dst[x] = dst[x] << 3;
    }

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel