These patches are updates to patches previously posted to the mailing lists, with some bugs fixed and the reasoning behind some changes expanded on.
This adds macros in postprocess.c that use inline asm on x86, __builtin_prefetch when using a recent enough GCC-compatible compiler, and do nothing otherwise. The inline asm in postprocess_template.c was replaced by these macros. --- libpostproc/postprocess.c | 10 ++++++ libpostproc/postprocess_template.c | 63 +++++--------------------------------- 2 files changed, 18 insertions(+), 55 deletions(-) diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c index 9d89782..f8d28ba 100644 --- a/libpostproc/postprocess.c +++ b/libpostproc/postprocess.c @@ -197,6 +197,16 @@ static inline void prefetcht2(const void *p) : : "r" (p) ); } +#elif AV_GCC_VERSION_AT_LEAST(3,2) +#define prefetchnta(p) __builtin_prefetch(p,0,0) +#define prefetcht0(p) __builtin_prefetch(p,0,1) +#define prefetcht1(p) __builtin_prefetch(p,0,2) +#define prefetcht2(p) __builtin_prefetch(p,0,3) +#else +#define prefetchnta(p) +#define prefetcht0(p) +#define prefetcht1(p) +#define prefetcht2(p) #endif /* The horizontal functions exist only in C because the MMX diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c index 16e441a..6377ea7 100644 --- a/libpostproc/postprocess_template.c +++ b/libpostproc/postprocess_template.c @@ -3368,34 +3368,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[ // finish 1 block before the next otherwise we might have a problem // with the L1 Cache of the P4 ... 
or only a few blocks at a time or something for(x=0; x<width; x+=BLOCK_SIZE){ - -#if TEMPLATE_PP_MMXEXT && HAVE_6REGS -/* - prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); -*/ - - __asm__( - "mov %4, %%"REG_a" \n\t" - "shr $2, %%"REG_a" \n\t" - "and $6, %%"REG_a" \n\t" - "add %5, %%"REG_a" \n\t" - "mov %%"REG_a", %%"REG_d" \n\t" - "imul %1, %%"REG_a" \n\t" - "imul %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - "add %1, %%"REG_a" \n\t" - "add %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), - "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) - : "%"REG_a, "%"REG_d - ); -#endif + prefetchnta(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); + prefetchnta(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); + prefetcht0(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); + prefetcht0(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); @@ -3474,33 +3450,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[ uint8_t *dstBlockStart = dstBlock; const uint8_t *srcBlockStart = srcBlock; for(; x < endx; x+=BLOCK_SIZE){ -#if TEMPLATE_PP_MMXEXT && HAVE_6REGS -/* - prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); -*/ - - __asm__( - "mov %4, %%"REG_a" \n\t" - "shr $2, %%"REG_a" \n\t" - "and $6, %%"REG_a" \n\t" - "add %5, %%"REG_a" \n\t" - "mov %%"REG_a", %%"REG_d" \n\t" - "imul 
%1, %%"REG_a" \n\t" - "imul %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - "add %1, %%"REG_a" \n\t" - "add %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), - "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) - : "%"REG_a, "%"REG_d - ); -#endif + prefetchnta(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); + prefetchnta(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); + prefetcht0(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); + prefetcht0(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); -- 2.3.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel