On 9/9/2017 1:27 PM, Michael Niedermayer wrote:
+            // If the image is sufficiently aligned, compute 8 samples at once
+            if (!(((uintptr_t)dst) & 7)) {
+                uint64_t *dst64 = (uint64_t *)dst;
+                int w = avctx->width>>1;
+                for (x = 0; x < w; x++) {
+                    dst64[x] = (dst64[x] << 3) & 0xFCFCFCFCFCFCFCFCULL;
+                }
+                x *= 8;
+            } else
+                x = 0;
+            for (; x < avctx->width * 4; x++) {
                  dst[x] = dst[x] << 3;
              }

Forgive me if I'm not understanding the code correctly, but couldn't you always apply the optimization by first handling the (up to) 7 unaligned leading samples one at a time, so that the rest of the row starts on an 8-byte boundary?

Pseudocode:

uint64_t *dst64 = (uint64_t *)dst;
int w = avctx->width>>1;
x=0
// compute un-aligned beginning samples
for (; x < (avctx->width * 4) && (((uintptr_t)dst) & 7); x++) {
    dst[x] = dst[x] << 3;
}
// compute aligned samples
for (; x < w; x+=8) {
    dst64[x] = (dst64[x] << 3) & 0xFCFCFCFCFCFCFCFCULL;
}
x -= 8;
// compute un-aligned ending samples
for (; x < avctx->width * 4; x++) {
    dst[x] = dst[x] << 3;
}
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to