[FFmpeg-cvslog] checkasm/sw_scale: hscale does not require cpuflag test.
ffmpeg | branch: master | Alan Kelly | Fri Jul 15 17:01:31 2022 +0200| [da0a37bab7434ef485146ce8575c7948db1fe3e2] | committer: Anton Khirnov checkasm/sw_scale: hscale does not require cpuflag test. This is done in ff_shuffle_filter_coefficients. Signed-off-by: Anton Khirnov > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=da0a37bab7434ef485146ce8575c7948db1fe3e2 --- tests/checkasm/sw_scale.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 9c07dd0421..86d266fb3e 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -278,8 +278,6 @@ static void check_hscale(void) const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); -int cpu_flags = av_get_cpu_flags(); - ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); @@ -328,8 +326,7 @@ static void check_hscale(void) ctx->dstW = ctx->chrDstW = input_sizes[dstWi]; ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale: Enable hscale_avx2 for all input sizes.
ffmpeg | branch: master | Alan Kelly | Fri Jul 15 16:59:43 2022 +0200| [a38293e4448c9389e604af9858984361a5677a20] | committer: Anton Khirnov libswscale: Enable hscale_avx2 for all input sizes. ff_shuffle_filter_coefficients shuffles the tail as required. Signed-off-by: Anton Khirnov > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a38293e4448c9389e604af9858984361a5677a20 --- libswscale/utils.c| 19 --- libswscale/x86/swscale.c | 6 ++ tests/checkasm/sw_scale.c | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 34503e57f4..baa1791ebe 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -268,8 +268,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); -// avx2 hscale filter processes 16 pixel blocks. -if (!filter || dstW % 16 != 0) +if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -281,9 +280,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -297,6 +298,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 89ef9f5d2b..ec1ca0e01c 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -625,10 +625,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { -if (c->chrDstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if (c->dstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index cbe4460a99..9c07dd0421 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -329,7 +329,7 @@ static void check_hscale(void) ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] sws: allow avx2 hscale to process inputs of any size.
ffmpeg | branch: master | Alan Kelly | Tue Apr 26 10:00:02 2022 +0200| [a6724285fd45111436dd5242eab2c489182aa5c2] | committer: Anton Khirnov sws: allow avx2 hscale to process inputs of any size. The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. Signed-off-by: Anton Khirnov > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a6724285fd45111436dd5242eab2c489182aa5c2 --- libswscale/x86/scale_avx2.asm | 44 ++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 20acdbd633..37095e596a 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, mova m14, [four] shr fltsized, 2 %endif +cmp wq, 0x10 +jl .tail_loop +sub wq, 0x10 .loop: movu m1, [fltposq] movu m2, [fltposq+32] @@ -101,7 +104,46 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, add fltposq, 0x40 add countq, 0x10 cmp countq, wq -jl .loop +jle .loop + +add wq, 0x10 +cmp countq, wq +jge .end + +.tail_loop: +movu xm1, [fltposq] +%ifidn %1, X4 +pxor xm9, xm9 +pxor xm10, xm10 +xor innerq, innerq +.tail_innerloop: +%endif +vpcmpeqd xm13, xm13 +vpgatherdd xm3,[srcmemq + xm1], xm13 +vpunpcklbw xm5, xm3, xm0 +vpunpckhbw xm6, xm3, xm0 +vpmaddwd xm5, xm5, [filterq] +vpmaddwd xm6, xm6, [filterq + 0x10] +add filterq, 0x20 +%ifidn %1, X4 +paddd xm9, xm5 +paddd xm10, xm6 +paddd xm1, xm14 +add innerq, 1 +cmp innerq, fltsizeq +jl .tail_innerloop +vphaddd xm5, xm9, xm10 +%else +vphaddd xm5, xm5, xm6 +%endif +vpsrad xm5, 7 +vpackssdw xm5, xm5, xm5 +vmovq [dstq + countq * 2], xm5 +add fltposq, 0x10 +add countq, 0x4 +cmp countq, wq +jl .tail_loop +.end: REP_RET %endmacro ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject 
"unsubscribe".
[FFmpeg-cvslog] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext
ffmpeg | branch: master | Alan Kelly | Wed Aug 17 11:20:39 2022 +0200| [51a34e8525fea2bbc29b42831d7a17f34e8518d3] | committer: Andreas Rheinhardt sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext Signed-off-by: Andreas Rheinhardt > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=51a34e8525fea2bbc29b42831d7a17f34e8518d3 --- libswscale/x86/swscale.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 32d441245d..89ef9f5d2b 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -205,20 +205,17 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ int remainder = (dstW % step); \ int pixelsProcessed = dstW - remainder; \ if(((uintptr_t)dest) & 15){ \ -yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ +yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ if(pixelsProcessed > 0) \ ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ - ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ + ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ return; \ } -#if HAVE_MMX_EXTERNAL -YUV2YUVX_FUNC_MMX(mmx, 16) -#endif #if HAVE_MMXEXT_EXTERNAL YUV2YUVX_FUNC_MMX(mmxext, 16) #endif ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale: Re-factor ff_shuffle_filter_coefficients.
ffmpeg | branch: master | Alan Kelly | Thu Feb 17 11:03:52 2022 +0100| [e534d98af3bfdc2c926b15301404e2d85524a048] | committer: Michael Niedermayer libswscale: Re-factor ff_shuffle_filter_coefficients. Make the code more readable and follow the style guide. Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e534d98af3bfdc2c926b15301404e2d85524a048 --- libswscale/utils.c | 66 +- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 344c87dfdf..7c8e1bbdde 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,42 +278,48 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, + int filterSize, int16_t *filter, + int dstW) +{ #if ARCH_X86_64 -int i, j, k, l; +int i, j, k; int cpu_flags = av_get_cpu_flags(); +// avx2 hscale filter processes 16 pixel blocks. 
+if (!filter || dstW % 16 != 0) +return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { -if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ -if (dstW % 16 == 0){ -if (filter != NULL){ -for (i = 0; i < dstW; i += 8){ -FFSWAP(int, filterPos[i + 2], filterPos[i+4]); -FFSWAP(int, filterPos[i + 3], filterPos[i+5]); -} -if (filterSize > 4){ -int16_t *tmp2 = av_malloc(dstW * filterSize * 2); -if (!tmp2) -return AVERROR(ENOMEM); -memcpy(tmp2, filter, dstW * filterSize * 2); -for (i = 0; i < dstW; i += 16){//pixel -for (k = 0; k < filterSize / 4; ++k){//fcoeff -for (j = 0; j < 16; ++j){//inner pixel -for (l = 0; l < 4; ++l){//coeff -int from = i * filterSize + j * filterSize + k * 4 + l; -int to = (i) * filterSize + j * 4 + l + k * 64; -filter[to] = tmp2[from]; -} -} -} -} -av_free(tmp2); -} -} -} +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { + int16_t *filterCopy = NULL; + if (filterSize > 4) { + if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize)) + return AVERROR(ENOMEM); + memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t)); + } + // Do not swap filterPos for pixels which won't be processed by + // the main loop. + for (i = 0; i + 8 <= dstW; i += 8) { + FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); + FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + } + if (filterSize > 4) { + // 16 pixels are processed at a time. + for (i = 0; i + 16 <= dstW; i += 16) { + // 4 filter coeffs are processed at a time. 
+ for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < 16; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 16; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } + } + av_free(filterCopy); } } -return 0; #endif +return 0; } int sws_isSupportedInput(enum AVPixelFormat pix_fmt) ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale: Check and propagate memory allocation errors from ff_shuffle_filter_coefficients.
ffmpeg | branch: master | Alan Kelly | Thu Feb 17 11:03:21 2022 +0100| [f1a5414c97a594d6f1d011860753794681ec56c5] | committer: Michael Niedermayer libswscale: Check and propagate memory allocation errors from ff_shuffle_filter_coefficients. Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f1a5414c97a594d6f1d011860753794681ec56c5 --- libswscale/swscale_internal.h | 2 +- libswscale/utils.c| 11 --- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 3a78d95ba6..26d28d42e6 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, #define MAX_LINES_AHEAD 4 //shuffle filter and filterPos for hyScale and hcScale filters in avx2 -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index c5ea8853d5..344c87dfdf 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,7 +278,7 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ #if ARCH_X86_64 int i, j, k, l; int cpu_flags = av_get_cpu_flags(); @@ -292,6 +292,8 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz } if (filterSize > 4){ int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +if (!tmp2) +return AVERROR(ENOMEM); memcpy(tmp2, filter, dstW * filterSize * 2); for (i = 0; i < dstW; i += 16){//pixel for (k = 0; k < filterSize / 4; ++k){//fcoeff @@ 
-310,6 +312,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz } } } +return 0; #endif } @@ -1836,7 +1839,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); +if (ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW) < 0) +goto nomem; if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1846,7 +1850,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); +if (ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW) < 0) +goto nomem; } } // initialize horizontal stuff ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
ffmpeg | branch: master | Alan Kelly | Tue Dec 21 20:56:41 2021 +0100| [ffbab99f2c22be06ef3c564fd38320d40e48a2b5] | committer: James Almer libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER. This flag is set on Haswell and earlier and all AMD cpus. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ffbab99f2c22be06ef3c564fd38320d40e48a2b5 --- doc/APIchanges | 3 +++ libavutil/cpu.h | 1 + libavutil/version.h | 4 ++-- libavutil/x86/cpu.c | 15 ++- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/APIchanges b/doc/APIchanges index 93fc45ced4..ac75040274 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -14,6 +14,9 @@ libavutil: 2021-04-27 API changes, most recent first: +2021-12-21 - xx - lavu 57.12.100 - cpu.h + Add AV_CPU_FLAG_SLOW_GATHER. + 2021-12-20 - xx - lavu 57.11.101 - display.h Modified the documentation of av_display_rotation_set() to match its longstanding actual behaviour of treating diff --git a/libavutil/cpu.h b/libavutil/cpu.h index ae443eccad..ce9bf14bf7 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -54,6 +54,7 @@ #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used +#define AV_CPU_FLAG_SLOW_GATHER 0x200 ///< CPU has slow gathers. 
#define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard #define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 diff --git a/libavutil/version.h b/libavutil/version.h index 678401fcf5..668f9206fe 100644 --- a/libavutil/version.h +++ b/libavutil/version.h @@ -79,8 +79,8 @@ */ #define LIBAVUTIL_VERSION_MAJOR 57 -#define LIBAVUTIL_VERSION_MINOR 11 -#define LIBAVUTIL_VERSION_MICRO 101 +#define LIBAVUTIL_VERSION_MINOR 12 +#define LIBAVUTIL_VERSION_MICRO 100 #define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \ LIBAVUTIL_VERSION_MINOR, \ diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..441b4695d5 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -146,8 +146,21 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) { rval |= AV_CPU_FLAG_AVX2; +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +/* Haswell has slow gather */ +if (!strncmp(vendor.c, "GenuineIntel", 12)) +if (family == 6 && model < 70) +rval |= AV_CPU_FLAG_SLOW_GATHER; +/* Zen 3 and earlier have slow gather */ +if (!strncmp(vendor.c, "AuthenticAMD", 12)) +if (family <= 0x19) +rval |= AV_CPU_FLAG_SLOW_GATHER; +} + #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.
ffmpeg | branch: master | Alan Kelly | Mon Dec 20 15:45:45 2021 +0100| [eebe406c808e6061ee76e93a616537b5369dbf40] | committer: James Almer libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions. This is instead of EXTERNAL_AVX2_FAST so that the avx2 hscale functions are only used where they are faster. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=eebe406c808e6061ee76e93a616537b5369dbf40 --- libswscale/utils.c| 2 +- libswscale/x86/swscale.c | 2 +- tests/checkasm/sw_scale.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index d4a72d3ce1..7158384f0b 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -282,7 +282,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz #if ARCH_X86_64 int i, j, k, l; int cpu_flags = av_get_cpu_flags(); -if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ if (dstW % 16 == 0){ if (filter != NULL){ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index c49a05c37b..ffc7691c12 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -578,7 +578,7 @@ switch(c->dstBpc){ \ break; \ } -if (EXTERNAL_AVX2_FAST(cpu_flags)) { +if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { if (c->chrDstW % 16 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index f4912e6c2c..3c0a083b42 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -217,7 +217,7 @@ static void check_hscale(void) } ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if (cpu_flags & AV_CPU_FLAG_AVX2) +if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) 
ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] x86/scale_avx2: Change asm indent from 2 to 4 spaces.
ffmpeg | branch: master | Alan Kelly | Thu Dec 16 17:27:10 2021 +0100| [9092e58c4469f5488f305fdb85e34e61bba7b04a] | committer: James Almer x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9092e58c4469f5488f305fdb85e34e61bba7b04a --- libswscale/x86/scale_avx2.asm | 96 +-- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 2cd7e968d3..eb472db12f 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -45,63 +45,63 @@ SECTION .text %macro SCALE_FUNC 1 cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner - pxor m0, m0 - mova m15, [swizzle] - mov countq, $0 - movsxd wq, wd +pxor m0, m0 +mova m15, [swizzle] +mov countq, $0 +movsxd wq, wd %ifidn %1, X4 - mova m14, [four] - shr fltsized, 2 +mova m14, [four] +shr fltsized, 2 %endif .loop: - movu m1, [fltposq] - movu m2, [fltposq+32] +movu m1, [fltposq] +movu m2, [fltposq+32] %ifidn %1, X4 - pxor m9, m9 - pxor m10, m10 - pxor m11, m11 - pxor m12, m12 - mov innerq, $0 +pxor m9, m9 +pxor m10, m10 +pxor m11, m11 +pxor m12, m12 +mov innerq, $0 .innerloop: %endif - vpcmpeqd m13, m13 - vpgatherdd m3,[srcmemq + m1], m13 - vpcmpeqd m13, m13 - vpgatherdd m4,[srcmemq + m2], m13 - vpunpcklbw m5, m3, m0 - vpunpckhbw m6, m3, m0 - vpunpcklbw m7, m4, m0 - vpunpckhbw m8, m4, m0 - vpmaddwd m5, m5, [filterq] - vpmaddwd m6, m6, [filterq + 32] - vpmaddwd m7, m7, [filterq + 64] - vpmaddwd m8, m8, [filterq + 96] - add filterq, $80 +vpcmpeqd m13, m13 +vpgatherdd m3,[srcmemq + m1], m13 +vpcmpeqd m13, m13 +vpgatherdd m4,[srcmemq + m2], m13 +vpunpcklbw m5, m3, m0 +vpunpckhbw m6, m3, m0 +vpunpcklbw m7, m4, m0 +vpunpckhbw m8, m4, m0 +vpmaddwd m5, m5, [filterq] +vpmaddwd m6, m6, [filterq + 32] +vpmaddwd m7, m7, [filterq + 64] +vpmaddwd m8, m8, [filterq + 96] +add filterq, $80 %ifidn %1, X4 - paddd m9, m5 - paddd m10, 
m6 - paddd m11, m7 - paddd m12, m8 - paddd m1, m14 - paddd m2, m14 - add innerq, $1 - cmp innerq, fltsizeq - jl .innerloop - vphaddd m5, m9, m10 - vphaddd m6, m11, m12 +paddd m9, m5 +paddd m10, m6 +paddd m11, m7 +paddd m12, m8 +paddd m1, m14 +paddd m2, m14 +add innerq, $1 +cmp innerq, fltsizeq +jl .innerloop +vphaddd m5, m9, m10 +vphaddd m6, m11, m12 %else - vphaddd m5, m5, m6 - vphaddd m6, m7, m8 +vphaddd m5, m5, m6 +vphaddd m6, m7, m8 %endif - vpsrad m5, 7 - vpsrad m6, 7 - vpackssdw m5, m5, m6 - vpermd m5, m15, m5 - vmovdqu [dstq + countq * 2], m5 - add fltposq, $40 - add countq, $10 - cmp countq, wq - jl .loop +vpsrad m5, 7 +vpsrad m6, 7 +vpackssdw m5, m5, m6 +vpermd m5, m15, m5 +vmovdqu [dstq + countq * 2], m5 +add fltposq, $40 +add countq, $10 +cmp countq, wq +jl .loop REP_RET %endmacro ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] x86/swscale: fix minor coding style issues
ffmpeg | branch: master | Alan Kelly | Thu Dec 16 17:05:48 2021 +0100| [86663963e6419a127cf52a03758855f4f8f8689f] | committer: James Almer x86/swscale: fix minor coding style issues Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=86663963e6419a127cf52a03758855f4f8f8689f --- libswscale/x86/swscale.c | 14 +++--- tests/checkasm/sw_scale.c | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 164b06d6ba..c49a05c37b 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -578,13 +578,13 @@ switch(c->dstBpc){ \ break; \ } -if (EXTERNAL_AVX2_FAST(cpu_flags)){ - if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ -if(c->chrDstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if(c->dstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); - } +if (EXTERNAL_AVX2_FAST(cpu_flags)) { +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { +if (c->chrDstW % 16 == 0) +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); +if (c->dstW % 16 == 0) +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); +} } if (EXTERNAL_AVX2_FAST(cpu_flags)) { diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 011cb46428..f4912e6c2c 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -217,9 +217,8 @@ static void check_hscale(void) } ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if (cpu_flags & AV_CPU_FLAG_AVX2){ +if (cpu_flags & AV_CPU_FLAG_AVX2) ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); -} if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email 
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
ffmpeg | branch: master | Alan Kelly | Wed Dec 15 10:35:02 2021 +0100| [f900a19fa94b1a55b660ec2e5c13419d59754bc0] | committer: James Almer libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f900a19fa94b1a55b660ec2e5c13419d59754bc0 --- libswscale/swscale_internal.h | 2 + libswscale/utils.c| 37 ++ libswscale/x86/Makefile | 1 + libswscale/x86/scale_avx2.asm | 112 ++ libswscale/x86/swscale.c | 19 +++ tests/checkasm/sw_scale.c | 20 ++-- 6 files changed, 186 insertions(+), 5 deletions(-) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 708facba67..64aa0b9804 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index ae92ac9fbc..d4a72d3ce1 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 0 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 +int i, j, k, l; +int cpu_flags = av_get_cpu_flags(); +if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ +if (dstW % 16 == 0){ +if (filter != NULL){ +for (i = 0; i < dstW; i += 8){ +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); +} +if (filterSize > 4){ +int16_t 
*tmp2 = av_malloc(dstW * filterSize * 2); +memcpy(tmp2, filter, dstW * filterSize * 2); +for (i = 0; i < dstW; i += 16){//pixel +for (k = 0; k < filterSize / 4; ++k){//fcoeff +for (j = 0; j < 16; ++j){//inner pixel +for (l = 0; l < 4; ++l){//coeff +int from = i * filterSize + j * filterSize + k * 4 + l; +int to = (i) * filterSize + j * 4 + l + k * 64; +filter[to] = tmp2[from]; +} +} +} +} +av_free(tmp2); +} +} +} +} +} +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/scale_avx2.o \
[FFmpeg-cvslog] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0
ffmpeg | branch: release/4.4 | Alan Kelly | Thu Apr 1 12:00:16 2021 +0200| [95aacf30e3803c57d91ff62975b375e394f61d49] | committer: Michael Niedermayer libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0 Signed-off-by: Michael Niedermayer (cherry picked from commit dc57762cb43619f91fd2a5d95510fa3b14cfeaaf) Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=95aacf30e3803c57d91ff62975b375e394f61d49 --- libswscale/x86/swscale.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index cc9e8b0155..0848a31461 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,7 +197,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ +if(dstW > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ return; \ } @@ -215,7 +216,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ +if(pixelsProcessed > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext
ffmpeg | branch: release/4.4 | Alan Kelly | Thu Apr 1 12:00:15 2021 +0200| [4aeedf4c2a8f35be667d5dd40c84bd27730ef1db] | committer: Michael Niedermayer libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext Signed-off-by: Michael Niedermayer (cherry picked from commit 3ce8d092448827842c451807f03010ad5129fd8f) Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4aeedf4c2a8f35be667d5dd40c84bd27730ef1db --- libswscale/x86/yuv2yuvX.asm | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 521880dabe..b6294cb919 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -37,8 +37,10 @@ SECTION .text cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %if notcpuflag(sse3) %define movr mova +%define unroll 1 %else %define movr movdqu +%define unroll 2 %endif movsxdifnidn dstWq, dstWd movsxdifnidn offsetq, offsetd @@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset .outerloop: mova m4, m7 mova m3, m7 +%if cpuflag(sse3) mova m6, m7 mova m1, m7 +%endif .loop: %if cpuflag(avx2) vpbroadcastq m0, [filterSizeq + 8] @@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] paddw m3, m3, m2 paddw m4, m4, m5 +%if cpuflag(sse3) pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] paddw m6, m6, m2 paddw m1, m1, m5 +%endif add filterSizeq, $10 mov srcq, [filterSizeq] test srcq, srcq jnz .loop psraw m3, m3, 3 psraw m4, m4, 3 +%if cpuflag(sse3) psraw m6, m6, 3 psraw m1, m1, 3 +%endif packuswb m3, m3, m4 +%if cpuflag(sse3) packuswb m6, m6, m1 +%endif mov srcq, [filterq] %if cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif movr [destq + offsetq], m3 +%if cpuflag(sse3) movr [destq + offsetq + mmsize], m6 -add offsetq, mmsize * 2 +%endif +add 
offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq jb .outerloop ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] tests/checkasm/sw_scale: adds additional tests sizes for yux2yuvX
ffmpeg | branch: release/4.4 | Alan Kelly | Thu Apr 1 12:00:17 2021 +0200| [6bc2058d00b119d265c9970eac213d2922d15129] | committer: Michael Niedermayer tests/checkasm/sw_scale: adds additional tests sizes for yux2yuvX Signed-off-by: Michael Niedermayer (cherry picked from commit e1484bc455dff500f8b35b58d434924bca0e03d6) Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6bc2058d00b119d265c9970eac213d2922d15129 --- tests/checkasm/sw_scale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index a10118704b..3ac0f9082f 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -68,8 +68,8 @@ static void check_yuv2yuvX(void) #define FILTER_SIZES 4 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16}; #define LARGEST_INPUT_SIZE 512 -#define INPUT_SIZES 4 -static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512}; +#define INPUT_SIZES 6 +static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512}; declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, @@ -107,7 +107,7 @@ static void check_yuv2yuvX(void) for(j = 0; j < 4; ++j) vFilterData[i].coeff[j + 4] = filter_coeff[i]; } -if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", filter_sizes[fsi], osi)){ +if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", filter_sizes[fsi], osi, dstW)){ memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0])); memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0])); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0
ffmpeg | branch: master | Alan Kelly | Thu Apr 1 12:00:16 2021 +0200| [dc57762cb43619f91fd2a5d95510fa3b14cfeaaf] | committer: Michael Niedermayer libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0 Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=dc57762cb43619f91fd2a5d95510fa3b14cfeaaf --- libswscale/x86/swscale.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index cc9e8b0155..0848a31461 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,7 +197,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ +if(dstW > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ return; \ } @@ -215,7 +216,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ +if(pixelsProcessed > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext
ffmpeg | branch: master | Alan Kelly | Thu Apr 1 12:00:15 2021 +0200| [3ce8d092448827842c451807f03010ad5129fd8f] | committer: Michael Niedermayer libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3ce8d092448827842c451807f03010ad5129fd8f --- libswscale/x86/yuv2yuvX.asm | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 521880dabe..b6294cb919 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -37,8 +37,10 @@ SECTION .text cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %if notcpuflag(sse3) %define movr mova +%define unroll 1 %else %define movr movdqu +%define unroll 2 %endif movsxdifnidn dstWq, dstWd movsxdifnidn offsetq, offsetd @@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset .outerloop: mova m4, m7 mova m3, m7 +%if cpuflag(sse3) mova m6, m7 mova m1, m7 +%endif .loop: %if cpuflag(avx2) vpbroadcastq m0, [filterSizeq + 8] @@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] paddw m3, m3, m2 paddw m4, m4, m5 +%if cpuflag(sse3) pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] paddw m6, m6, m2 paddw m1, m1, m5 +%endif add filterSizeq, $10 mov srcq, [filterSizeq] test srcq, srcq jnz .loop psraw m3, m3, 3 psraw m4, m4, 3 +%if cpuflag(sse3) psraw m6, m6, 3 psraw m1, m1, 3 +%endif packuswb m3, m3, m4 +%if cpuflag(sse3) packuswb m6, m6, m1 +%endif mov srcq, [filterq] %if cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif movr [destq + offsetq], m3 +%if cpuflag(sse3) movr [destq + offsetq + mmsize], m6 -add offsetq, mmsize * 2 +%endif +add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq jb .outerloop ___ ffmpeg-cvslog mailing 
list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] tests/checkasm/sw_scale: adds additional tests sizes for yux2yuvX
ffmpeg | branch: master | Alan Kelly | Thu Apr 1 12:00:17 2021 +0200| [e1484bc455dff500f8b35b58d434924bca0e03d6] | committer: Michael Niedermayer tests/checkasm/sw_scale: adds additional tests sizes for yux2yuvX Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e1484bc455dff500f8b35b58d434924bca0e03d6 --- tests/checkasm/sw_scale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index a10118704b..3ac0f9082f 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -68,8 +68,8 @@ static void check_yuv2yuvX(void) #define FILTER_SIZES 4 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16}; #define LARGEST_INPUT_SIZE 512 -#define INPUT_SIZES 4 -static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512}; +#define INPUT_SIZES 6 +static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512}; declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, @@ -107,7 +107,7 @@ static void check_yuv2yuvX(void) for(j = 0; j < 4; ++j) vFilterData[i].coeff[j + 4] = filter_coeff[i]; } -if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", filter_sizes[fsi], osi)){ +if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", filter_sizes[fsi], osi, dstW)){ memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0])); memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0])); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] checkasm/sw_scale: properly initialize src_pixer and filter_coeff buffers
ffmpeg | branch: master | Alan Kelly | Fri Feb 19 14:55:39 2021 +0100| [ee18edb13a9ae3041df961dd5003c2055b5cab35] | committer: James Almer checkasm/sw_scale: properly initialize src_pixer and filter_coeff buffers Fixes valgrind uninitialised value warnings. Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ee18edb13a9ae3041df961dd5003c2055b5cab35 --- tests/checkasm/sw_scale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 7504f8b45f..dee1af820c 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -86,8 +86,8 @@ static void check_yuv2yuvX(void) uint16_t coeff[8]; } *vFilterData; uint8_t d_val = rnd(); -randomize_buffers(filter_coeff, LARGEST_FILTER); -randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE); +randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE * sizeof(int16_t)); +randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER * sizeof(int16_t)); ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] swscale: move yuv2yuvX_sse3 to yasm, unrolls main loop
ffmpeg | branch: master | Alan Kelly | Thu Jan 14 15:47:03 2021 +0100| [554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f] | committer: Paul B Mahol swscale: move yuv2yuvX_sse3 to yasm, unrolls main loop And other small optimizations for ~20% speedup. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f --- libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c | 130 +++- libswscale/x86/swscale_template.c | 82 --- libswscale/x86/yuv2yuvX.asm | 136 ++ tests/checkasm/sw_scale.c | 103 + 5 files changed, 294 insertions(+), 158 deletions(-) diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 15c0b22f20..3df193a067 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= 0x0001000100010001ULL; +#define YUV2YUVX_FUNC_DECL(opt) \ +static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const int16_t **src, \ + uint8_t *dest, int dstW, \ + const uint8_t *dither, int offset); \ + +YUV2YUVX_FUNC_DECL(mmx) +YUV2YUVX_FUNC_DECL(mmxext) +YUV2YUVX_FUNC_DECL(sse3) +YUV2YUVX_FUNC_DECL(avx2) + //MMX versions #if HAVE_MMX_INLINE #undef RENAME @@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) & 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 
\n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? */\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c")