[FFmpeg-cvslog] checkasm/sw_scale: hscale does not require a cpuflag test.

2022-08-18 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Fri 
Jul 15 17:01:31 2022 +0200| [da0a37bab7434ef485146ce8575c7948db1fe3e2] | 
committer: Anton Khirnov

checkasm/sw_scale: hscale does not require a cpuflag test.

This is done in ff_shuffle_filter_coefficients.

Signed-off-by: Anton Khirnov 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=da0a37bab7434ef485146ce8575c7948db1fe3e2
---

 tests/checkasm/sw_scale.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 9c07dd0421..86d266fb3e 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -278,8 +278,6 @@ static void check_hscale(void)
   const uint8_t *src, const int16_t *filter,
   const int32_t *filterPos, int filterSize);
 
-int cpu_flags = av_get_cpu_flags();
-
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
@@ -328,8 +326,7 @@ static void check_hscale(void)
 ctx->dstW = ctx->chrDstW = input_sizes[dstWi];
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, ctx->dstW);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, ctx->dstW);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", 
ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libswscale: Enable hscale_avx2 for all input sizes.

2022-08-18 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Fri 
Jul 15 16:59:43 2022 +0200| [a38293e4448c9389e604af9858984361a5677a20] | 
committer: Anton Khirnov

libswscale: Enable hscale_avx2 for all input sizes.

ff_shuffle_filter_coefficients shuffles the tail as required.

Signed-off-by: Anton Khirnov 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a38293e4448c9389e604af9858984361a5677a20
---

 libswscale/utils.c| 19 ---
 libswscale/x86/swscale.c  |  6 ++
 tests/checkasm/sw_scale.c |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 34503e57f4..baa1791ebe 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -268,8 +268,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 #if ARCH_X86_64
 int i, j, k;
 int cpu_flags = av_get_cpu_flags();
-// avx2 hscale filter processes 16 pixel blocks.
-if (!filter || dstW % 16 != 0)
+if (!filter)
 return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
@@ -281,9 +280,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
// Do not swap filterPos for pixels which won't be processed by
// the main loop.
-   for (i = 0; i + 8 <= dstW; i += 8) {
+   for (i = 0; i + 16 <= dstW; i += 16) {
FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
+   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
}
if (filterSize > 4) {
// 16 pixels are processed at a time.
@@ -297,6 +298,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
}
}
+   // 4 pixels are processed at a time in the tail.
+   for (; i < dstW; i += 4) {
+   // 4 filter coeffs are processed at a time.
+   int rem = dstW - i >= 4 ? 4 : dstW - i;
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < rem; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 4;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
}
av_free(filterCopy);
 }
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 89ef9f5d2b..ec1ca0e01c 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -625,10 +625,8 @@ switch(c->dstBpc){ \
 
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
-if (c->chrDstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if (c->dstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
 }
 }
 
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index cbe4460a99..9c07dd0421 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -329,7 +329,7 @@ static void check_hscale(void)
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
 if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, ctx->dstW);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", 
ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] sws: allow avx2 hscale to process inputs of any size.

2022-08-18 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Tue 
Apr 26 10:00:02 2022 +0200| [a6724285fd45111436dd5242eab2c489182aa5c2] | 
committer: Anton Khirnov

sws: allow avx2 hscale to process inputs of any size.

The main loop processes blocks of 16 pixels. The tail processes blocks
of size 4.

Signed-off-by: Anton Khirnov 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a6724285fd45111436dd5242eab2c489182aa5c2
---

 libswscale/x86/scale_avx2.asm | 44 ++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 20acdbd633..37095e596a 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 mova m14, [four]
 shr fltsized, 2
 %endif
+cmp wq, 0x10
+jl .tail_loop
+sub wq, 0x10
 .loop:
 movu m1, [fltposq]
 movu m2, [fltposq+32]
@@ -101,7 +104,46 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 add fltposq, 0x40
 add countq, 0x10
 cmp countq, wq
-jl .loop
+jle .loop
+
+add wq, 0x10
+cmp countq, wq
+jge .end
+
+.tail_loop:
+movu xm1, [fltposq]
+%ifidn %1, X4
+pxor xm9, xm9
+pxor xm10, xm10
+xor innerq, innerq
+.tail_innerloop:
+%endif
+vpcmpeqd  xm13, xm13
+vpgatherdd xm3,[srcmemq + xm1], xm13
+vpunpcklbw xm5, xm3, xm0
+vpunpckhbw xm6, xm3, xm0
+vpmaddwd xm5, xm5, [filterq]
+vpmaddwd xm6, xm6, [filterq + 0x10]
+add filterq, 0x20
+%ifidn %1, X4
+paddd xm9, xm5
+paddd xm10, xm6
+paddd xm1, xm14
+add innerq, 1
+cmp innerq, fltsizeq
+jl .tail_innerloop
+vphaddd xm5, xm9, xm10
+%else
+vphaddd xm5, xm5, xm6
+%endif
+vpsrad  xm5, 7
+vpackssdw xm5, xm5, xm5
+vmovq [dstq + countq * 2], xm5
+add fltposq, 0x10
+add countq, 0x4
+cmp countq, wq
+jl .tail_loop
+.end:
 REP_RET
 %endmacro
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext

2022-08-18 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Wed 
Aug 17 11:20:39 2022 +0200| [51a34e8525fea2bbc29b42831d7a17f34e8518d3] | 
committer: Andreas Rheinhardt

sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext

Signed-off-by: Andreas Rheinhardt 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=51a34e8525fea2bbc29b42831d7a17f34e8518d3
---

 libswscale/x86/swscale.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 32d441245d..89ef9f5d2b 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -205,20 +205,17 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 int remainder = (dstW % step); \
 int pixelsProcessed = dstW - remainder; \
 if(((uintptr_t)dest) & 15){ \
-yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
+yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
 if(pixelsProcessed > 0) \
 ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
-  ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
+  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - 
offset, pixelsProcessed + remainder + offset, dither, offset); \
 } \
 return; \
 }
 
-#if HAVE_MMX_EXTERNAL
-YUV2YUVX_FUNC_MMX(mmx, 16)
-#endif
 #if HAVE_MMXEXT_EXTERNAL
 YUV2YUVX_FUNC_MMX(mmxext, 16)
 #endif

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libswscale: Re-factor ff_shuffle_filter_coefficients.

2022-02-17 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Feb 17 
11:03:52 2022 +0100| [e534d98af3bfdc2c926b15301404e2d85524a048] | committer: 
Michael Niedermayer

libswscale: Re-factor ff_shuffle_filter_coefficients.

Make the code more readable and follow the style guide.

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e534d98af3bfdc2c926b15301404e2d85524a048
---

 libswscale/utils.c | 66 +-
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 344c87dfdf..7c8e1bbdde 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,42 +278,48 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
+   int filterSize, int16_t *filter,
+   int dstW)
+{
 #if ARCH_X86_64
-int i, j, k, l;
+int i, j, k;
 int cpu_flags = av_get_cpu_flags();
+// avx2 hscale filter processes 16 pixel blocks.
+if (!filter || dstW % 16 != 0)
+return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
-if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
-if (dstW % 16 == 0){
-if (filter != NULL){
-for (i = 0; i < dstW; i += 8){
-FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
-FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
-}
-if (filterSize > 4){
-int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
-if (!tmp2)
-return AVERROR(ENOMEM);
-memcpy(tmp2, filter, dstW * filterSize * 2);
-for (i = 0; i < dstW; i += 16){//pixel
-for (k = 0; k < filterSize / 4; ++k){//fcoeff
-for (j = 0; j < 16; ++j){//inner pixel
-for (l = 0; l < 4; ++l){//coeff
-int from = i * filterSize + j * 
filterSize + k * 4 + l;
-int to = (i) * filterSize + j * 4 + l 
+ k * 64;
-filter[to] = tmp2[from];
-}
-}
-}
-}
-av_free(tmp2);
-}
-}
-}
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+   int16_t *filterCopy = NULL;
+   if (filterSize > 4) {
+   if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize))
+   return AVERROR(ENOMEM);
+   memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t));
+   }
+   // Do not swap filterPos for pixels which won't be processed by
+   // the main loop.
+   for (i = 0; i + 8 <= dstW; i += 8) {
+   FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
+   FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   }
+   if (filterSize > 4) {
+   // 16 pixels are processed at a time.
+   for (i = 0; i + 16 <= dstW; i += 16) {
+   // 4 filter coeffs are processed at a time.
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < 16; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 16;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
+   }
+   av_free(filterCopy);
 }
 }
-return 0;
 #endif
+return 0;
 }
 
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libswscale: Check and propagate memory allocation errors from ff_shuffle_filter_coefficients.

2022-02-17 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Feb 17 
11:03:21 2022 +0100| [f1a5414c97a594d6f1d011860753794681ec56c5] | committer: 
Michael Niedermayer

libswscale: Check and propagate memory allocation errors from 
ff_shuffle_filter_coefficients.

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f1a5414c97a594d6f1d011860753794681ec56c5
---

 libswscale/swscale_internal.h |  2 +-
 libswscale/utils.c| 11 ---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 3a78d95ba6..26d28d42e6 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 #define MAX_LINES_AHEAD 4
 
 //shuffle filter and filterPos for hyScale and hcScale filters in avx2
-void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
+int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index c5ea8853d5..344c87dfdf 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,7 +278,7 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
 #if ARCH_X86_64
 int i, j, k, l;
 int cpu_flags = av_get_cpu_flags();
@@ -292,6 +292,8 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 }
 if (filterSize > 4){
 int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+if (!tmp2)
+return AVERROR(ENOMEM);
 memcpy(tmp2, filter, dstW * filterSize * 2);
 for (i = 0; i < dstW; i += 16){//pixel
 for (k = 0; k < filterSize / 4; ++k){//fcoeff
@@ -310,6 +312,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 }
 }
 }
+return 0;
 #endif
 }
 
@@ -1836,7 +1839,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
+if (ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW) < 0)
+goto nomem;
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1846,7 +1850,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
+if (ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW) < 0)
+goto nomem;
 }
 } // initialize horizontal stuff
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

2021-12-21 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Tue 
Dec 21 20:56:41 2021 +0100| [ffbab99f2c22be06ef3c564fd38320d40e48a2b5] | 
committer: James Almer

libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

This flag is set on Haswell and earlier and all AMD cpus.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ffbab99f2c22be06ef3c564fd38320d40e48a2b5
---

 doc/APIchanges  |  3 +++
 libavutil/cpu.h |  1 +
 libavutil/version.h |  4 ++--
 libavutil/x86/cpu.c | 15 ++-
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 93fc45ced4..ac75040274 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -14,6 +14,9 @@ libavutil: 2021-04-27
 
 API changes, most recent first:
 
+2021-12-21 - xx - lavu 57.12.100 - cpu.h
+  Add AV_CPU_FLAG_SLOW_GATHER.
+
 2021-12-20 - xx - lavu 57.11.101 - display.h
   Modified the documentation of av_display_rotation_set()
   to match its longstanding actual behaviour of treating
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS 
support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x200 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC  0x0001 ///< standard
 #define AV_CPU_FLAG_VSX  0x0002 ///< ISA 2.06
diff --git a/libavutil/version.h b/libavutil/version.h
index 678401fcf5..668f9206fe 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -79,8 +79,8 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  57
-#define LIBAVUTIL_VERSION_MINOR  11
-#define LIBAVUTIL_VERSION_MICRO 101
+#define LIBAVUTIL_VERSION_MINOR  12
+#define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
LIBAVUTIL_VERSION_MINOR, \
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..441b4695d5 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,21 @@ int ff_get_cpu_flags_x86(void)
 if (max_std_level >= 7) {
 cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
+if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) {
 rval |= AV_CPU_FLAG_AVX2;
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+/* Haswell has slow gather */
+if (!strncmp(vendor.c, "GenuineIntel", 12))
+if (family == 6 && model < 70)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
+/* Zen 3 and earlier have slow gather */
+if (!strncmp(vendor.c, "AuthenticAMD", 12))
+if (family <= 0x19)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
+}
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.

2021-12-21 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Mon 
Dec 20 15:45:45 2021 +0100| [eebe406c808e6061ee76e93a616537b5369dbf40] | 
committer: James Almer

libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.

This is instead of EXTERNAL_AVX2_FAST so that the avx2 hscale functions
are only used where they are faster.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=eebe406c808e6061ee76e93a616537b5369dbf40
---

 libswscale/utils.c| 2 +-
 libswscale/x86/swscale.c  | 2 +-
 tests/checkasm/sw_scale.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index d4a72d3ce1..7158384f0b 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -282,7 +282,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 #if ARCH_X86_64
 int i, j, k, l;
 int cpu_flags = av_get_cpu_flags();
-if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
 if (dstW % 16 == 0){
 if (filter != NULL){
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index c49a05c37b..ffc7691c12 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -578,7 +578,7 @@ switch(c->dstBpc){ \
  break; \
 }
 
-if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
 if (c->chrDstW % 16 == 0)
 ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index f4912e6c2c..3c0a083b42 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -217,7 +217,7 @@ static void check_hscale(void)
 }
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if (cpu_flags & AV_CPU_FLAG_AVX2)
+if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
 ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] x86/scale_avx2: Change asm indent from 2 to 4 spaces.

2021-12-16 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Dec 16 
17:27:10 2021 +0100| [9092e58c4469f5488f305fdb85e34e61bba7b04a] | committer: 
James Almer

x86/scale_avx2: Change asm indent from 2 to 4 spaces.

Signed-off-by: James Almer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9092e58c4469f5488f305fdb85e34e61bba7b04a
---

 libswscale/x86/scale_avx2.asm | 96 +--
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 2cd7e968d3..eb472db12f 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -45,63 +45,63 @@ SECTION .text
 
 %macro SCALE_FUNC 1
 cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, 
fltsize, count, inner
-  pxor m0, m0
-  mova m15, [swizzle]
-  mov countq, $0
-  movsxd wq, wd
+pxor m0, m0
+mova m15, [swizzle]
+mov countq, $0
+movsxd wq, wd
 %ifidn %1, X4
-  mova m14, [four]
-  shr fltsized, 2
+mova m14, [four]
+shr fltsized, 2
 %endif
 .loop:
-  movu m1, [fltposq]
-  movu m2, [fltposq+32]
+movu m1, [fltposq]
+movu m2, [fltposq+32]
 %ifidn %1, X4
-  pxor m9, m9
-  pxor m10, m10
-  pxor m11, m11
-  pxor m12, m12
-  mov innerq, $0
+pxor m9, m9
+pxor m10, m10
+pxor m11, m11
+pxor m12, m12
+mov innerq, $0
 .innerloop:
 %endif
-  vpcmpeqd  m13, m13
-  vpgatherdd m3,[srcmemq + m1], m13
-  vpcmpeqd  m13, m13
-  vpgatherdd m4,[srcmemq + m2], m13
-  vpunpcklbw m5, m3, m0
-  vpunpckhbw m6, m3, m0
-  vpunpcklbw m7, m4, m0
-  vpunpckhbw m8, m4, m0
-  vpmaddwd m5, m5, [filterq]
-  vpmaddwd m6, m6, [filterq + 32]
-  vpmaddwd m7, m7, [filterq + 64]
-  vpmaddwd m8, m8, [filterq + 96]
-  add filterq, $80
+vpcmpeqd  m13, m13
+vpgatherdd m3,[srcmemq + m1], m13
+vpcmpeqd  m13, m13
+vpgatherdd m4,[srcmemq + m2], m13
+vpunpcklbw m5, m3, m0
+vpunpckhbw m6, m3, m0
+vpunpcklbw m7, m4, m0
+vpunpckhbw m8, m4, m0
+vpmaddwd m5, m5, [filterq]
+vpmaddwd m6, m6, [filterq + 32]
+vpmaddwd m7, m7, [filterq + 64]
+vpmaddwd m8, m8, [filterq + 96]
+add filterq, $80
 %ifidn %1, X4
-  paddd m9, m5
-  paddd m10, m6
-  paddd m11, m7
-  paddd m12, m8
-  paddd m1, m14
-  paddd m2, m14
-  add innerq, $1
-  cmp innerq, fltsizeq
-  jl .innerloop
-  vphaddd m5, m9, m10
-  vphaddd m6, m11, m12
+paddd m9, m5
+paddd m10, m6
+paddd m11, m7
+paddd m12, m8
+paddd m1, m14
+paddd m2, m14
+add innerq, $1
+cmp innerq, fltsizeq
+jl .innerloop
+vphaddd m5, m9, m10
+vphaddd m6, m11, m12
 %else
-  vphaddd m5, m5, m6
-  vphaddd m6, m7, m8
+vphaddd m5, m5, m6
+vphaddd m6, m7, m8
 %endif
-  vpsrad  m5, 7
-  vpsrad  m6, 7
-  vpackssdw m5, m5, m6
-  vpermd m5, m15, m5
-  vmovdqu [dstq + countq * 2], m5
-  add fltposq, $40
-  add countq, $10
-  cmp countq, wq
-  jl .loop
+vpsrad  m5, 7
+vpsrad  m6, 7
+vpackssdw m5, m5, m6
+vpermd m5, m15, m5
+vmovdqu [dstq + countq * 2], m5
+add fltposq, $40
+add countq, $10
+cmp countq, wq
+jl .loop
 REP_RET
 %endmacro
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] x86/swscale: fix minor coding style issues

2021-12-16 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Dec 16 
17:05:48 2021 +0100| [86663963e6419a127cf52a03758855f4f8f8689f] | committer: 
James Almer

x86/swscale: fix minor coding style issues

Signed-off-by: James Almer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=86663963e6419a127cf52a03758855f4f8f8689f
---

 libswscale/x86/swscale.c  | 14 +++---
 tests/checkasm/sw_scale.c |  3 +--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 164b06d6ba..c49a05c37b 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -578,13 +578,13 @@ switch(c->dstBpc){ \
  break; \
 }
 
-if (EXTERNAL_AVX2_FAST(cpu_flags)){
-  if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
-if(c->chrDstW % 16 == 0)
-  ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if(c->dstW % 16 == 0)
-  ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
-  }
+if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+if (c->chrDstW % 16 == 0)
+ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+if (c->dstW % 16 == 0)
+ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+}
 }
 
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 011cb46428..f4912e6c2c 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -217,9 +217,8 @@ static void check_hscale(void)
 }
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if (cpu_flags & AV_CPU_FLAG_AVX2){
+if (cpu_flags & AV_CPU_FLAG_AVX2)
 ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
-}
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-12-15 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Wed Dec 15 
10:35:02 2021 +0100| [f900a19fa94b1a55b660ec2e5c13419d59754bc0] | committer: 
James Almer

libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all 
filter sizes.

Fixes so that fate under 64 bit Windows passes.

These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.

Signed-off-by: James Almer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f900a19fa94b1a55b660ec2e5c13419d59754bc0
---

 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c|  37 ++
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++
 libswscale/x86/swscale.c  |  19 +++
 tests/checkasm/sw_scale.c |  20 ++--
 6 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 708facba67..64aa0b9804 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//shuffle filter and filterPos for hyScale and hcScale filters in avx2
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ae92ac9fbc..d4a72d3ce1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 0 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+#if ARCH_X86_64
+int i, j, k, l;
+int cpu_flags = av_get_cpu_flags();
+if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+if (dstW % 16 == 0){
+if (filter != NULL){
+for (i = 0; i < dstW; i += 8){
+FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+}
+if (filterSize > 4){
+int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+memcpy(tmp2, filter, dstW * filterSize * 2);
+for (i = 0; i < dstW; i += 16){//pixel
+for (k = 0; k < filterSize / 4; ++k){//fcoeff
+for (j = 0; j < 16; ++j){//inner pixel
+for (l = 0; l < 4; ++l){//coeff
+int from = i * filterSize + j * 
filterSize + k * 4 + l;
+int to = (i) * filterSize + j * 4 + l 
+ k * 64;
+filter[to] = tmp2[from];
+}
+}
+}
+}
+av_free(tmp2);
+}
+}
+}
+}
+}
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
 return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
 }
 } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS += x86/input.o  \
x86/output.o \
x86/scale.o  \
+   x86/scale_avx2.o  \
  

[FFmpeg-cvslog] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0

2021-04-03 Thread Alan Kelly
ffmpeg | branch: release/4.4 | Alan Kelly  | Thu Apr  1 
12:00:16 2021 +0200| [95aacf30e3803c57d91ff62975b375e394f61d49] | committer: 
Michael Niedermayer

libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0

Signed-off-by: Michael Niedermayer 
(cherry picked from commit dc57762cb43619f91fd2a5d95510fa3b14cfeaaf)
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=95aacf30e3803c57d91ff62975b375e394f61d49
---

 libswscale/x86/swscale.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index cc9e8b0155..0848a31461 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,7 +197,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+if(dstW > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
 return; \
 }
 
@@ -215,7 +216,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
+if(pixelsProcessed > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
   ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
 } \

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

2021-04-03 Thread Alan Kelly
ffmpeg | branch: release/4.4 | Alan Kelly  | Thu Apr  1 
12:00:15 2021 +0200| [4aeedf4c2a8f35be667d5dd40c84bd27730ef1db] | committer: 
Michael Niedermayer

libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

Signed-off-by: Michael Niedermayer 
(cherry picked from commit 3ce8d092448827842c451807f03010ad5129fd8f)
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4aeedf4c2a8f35be667d5dd40c84bd27730ef1db
---

 libswscale/x86/yuv2yuvX.asm | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 521880dabe..b6294cb919 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -37,8 +37,10 @@ SECTION .text
 cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %if notcpuflag(sse3)
 %define movr mova
+%define unroll 1
 %else
 %define movr movdqu
+%define unroll 2
 %endif
 movsxdifnidn dstWq, dstWd
 movsxdifnidn offsetq, offsetd
@@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 .outerloop:
 mova m4, m7
 mova m3, m7
+%if cpuflag(sse3)
 mova m6, m7
 mova m1, m7
+%endif
 .loop:
 %if cpuflag(avx2)
 vpbroadcastq m0, [filterSizeq + 8]
@@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
 paddw    m3, m3, m2
 paddw    m4, m4, m5
+%if cpuflag(sse3)
 pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
 pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
 paddw    m6, m6, m2
 paddw    m1, m1, m5
+%endif
 add  filterSizeq, $10
 mov  srcq, [filterSizeq]
 test srcq, srcq
 jnz  .loop
 psraw    m3, m3, 3
 psraw    m4, m4, 3
+%if cpuflag(sse3)
 psraw    m6, m6, 3
 psraw    m1, m1, 3
+%endif
 packuswb m3, m3, m4
+%if cpuflag(sse3)
 packuswb m6, m6, m1
+%endif
 mov  srcq, [filterq]
 %if cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
 movr [destq + offsetq], m3
+%if cpuflag(sse3)
 movr [destq + offsetq + mmsize], m6
-add  offsetq, mmsize * 2
+%endif
+add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
 jb  .outerloop

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX

2021-04-03 Thread Alan Kelly
ffmpeg | branch: release/4.4 | Alan Kelly  | Thu Apr  1 
12:00:17 2021 +0200| [6bc2058d00b119d265c9970eac213d2922d15129] | committer: 
Michael Niedermayer

tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX

Signed-off-by: Michael Niedermayer 
(cherry picked from commit e1484bc455dff500f8b35b58d434924bca0e03d6)
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6bc2058d00b119d265c9970eac213d2922d15129
---

 tests/checkasm/sw_scale.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index a10118704b..3ac0f9082f 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -68,8 +68,8 @@ static void check_yuv2yuvX(void)
 #define FILTER_SIZES 4
 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16};
 #define LARGEST_INPUT_SIZE 512
-#define INPUT_SIZES 4
-static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512};
+#define INPUT_SIZES 6
+static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512};
 
 declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter,
   int filterSize, const int16_t **src, uint8_t *dest,
@@ -107,7 +107,7 @@ static void check_yuv2yuvX(void)
 for(j = 0; j < 4; ++j)
 vFilterData[i].coeff[j + 4] = filter_coeff[i];
 }
-if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", 
filter_sizes[fsi], osi)){
+if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", 
filter_sizes[fsi], osi, dstW)){
 memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
 memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0

2021-04-01 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Apr  1 
12:00:16 2021 +0200| [dc57762cb43619f91fd2a5d95510fa3b14cfeaaf] | committer: 
Michael Niedermayer

libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=dc57762cb43619f91fd2a5d95510fa3b14cfeaaf
---

 libswscale/x86/swscale.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index cc9e8b0155..0848a31461 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,7 +197,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+if(dstW > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
 return; \
 }
 
@@ -215,7 +216,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
+if(pixelsProcessed > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
   ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
 } \

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

2021-04-01 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Apr  1 
12:00:15 2021 +0200| [3ce8d092448827842c451807f03010ad5129fd8f] | committer: 
Michael Niedermayer

libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3ce8d092448827842c451807f03010ad5129fd8f
---

 libswscale/x86/yuv2yuvX.asm | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 521880dabe..b6294cb919 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -37,8 +37,10 @@ SECTION .text
 cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %if notcpuflag(sse3)
 %define movr mova
+%define unroll 1
 %else
 %define movr movdqu
+%define unroll 2
 %endif
 movsxdifnidn dstWq, dstWd
 movsxdifnidn offsetq, offsetd
@@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 .outerloop:
 mova m4, m7
 mova m3, m7
+%if cpuflag(sse3)
 mova m6, m7
 mova m1, m7
+%endif
 .loop:
 %if cpuflag(avx2)
 vpbroadcastq m0, [filterSizeq + 8]
@@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
 paddw    m3, m3, m2
 paddw    m4, m4, m5
+%if cpuflag(sse3)
 pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
 pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
 paddw    m6, m6, m2
 paddw    m1, m1, m5
+%endif
 add  filterSizeq, $10
 mov  srcq, [filterSizeq]
 test srcq, srcq
 jnz  .loop
 psraw    m3, m3, 3
 psraw    m4, m4, 3
+%if cpuflag(sse3)
 psraw    m6, m6, 3
 psraw    m1, m1, 3
+%endif
 packuswb m3, m3, m4
+%if cpuflag(sse3)
 packuswb m6, m6, m1
+%endif
 mov  srcq, [filterq]
 %if cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
 movr [destq + offsetq], m3
+%if cpuflag(sse3)
 movr [destq + offsetq + mmsize], m6
-add  offsetq, mmsize * 2
+%endif
+add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
 jb  .outerloop

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX

2021-04-01 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu Apr  1 
12:00:17 2021 +0200| [e1484bc455dff500f8b35b58d434924bca0e03d6] | committer: 
Michael Niedermayer

tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e1484bc455dff500f8b35b58d434924bca0e03d6
---

 tests/checkasm/sw_scale.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index a10118704b..3ac0f9082f 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -68,8 +68,8 @@ static void check_yuv2yuvX(void)
 #define FILTER_SIZES 4
 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16};
 #define LARGEST_INPUT_SIZE 512
-#define INPUT_SIZES 4
-static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512};
+#define INPUT_SIZES 6
+static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512};
 
 declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter,
   int filterSize, const int16_t **src, uint8_t *dest,
@@ -107,7 +107,7 @@ static void check_yuv2yuvX(void)
 for(j = 0; j < 4; ++j)
 vFilterData[i].coeff[j + 4] = filter_coeff[i];
 }
-if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", 
filter_sizes[fsi], osi)){
+if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", 
filter_sizes[fsi], osi, dstW)){
 memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
 memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] checkasm/sw_scale: properly initialize src_pixels and filter_coeff buffers

2021-02-19 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Fri 
Feb 19 14:55:39 2021 +0100| [ee18edb13a9ae3041df961dd5003c2055b5cab35] | 
committer: James Almer

checkasm/sw_scale: properly initialize src_pixels and filter_coeff buffers

Fixes valgrind uninitialised value warnings.

Signed-off-by: James Almer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ee18edb13a9ae3041df961dd5003c2055b5cab35
---

 tests/checkasm/sw_scale.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 7504f8b45f..dee1af820c 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -86,8 +86,8 @@ static void check_yuv2yuvX(void)
 uint16_t coeff[8];
 } *vFilterData;
 uint8_t d_val = rnd();
-randomize_buffers(filter_coeff, LARGEST_FILTER);
-randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE);
+randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * 
LARGEST_INPUT_SIZE * sizeof(int16_t));
+randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER * 
sizeof(int16_t));
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-cvslog] swscale: move yuv2yuvX_sse3 to yasm, unrolls main loop

2021-02-17 Thread Alan Kelly
ffmpeg | branch: master | Alan Kelly  | Thu 
Jan 14 15:47:03 2021 +0100| [554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f] | 
committer: Paul B Mahol

swscale: move yuv2yuvX_sse3 to yasm, unrolls main loop

And other small optimizations for ~20% speedup.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f
---

 libswscale/x86/Makefile   |   1 +
 libswscale/x86/swscale.c  | 130 +++-
 libswscale/x86/swscale_template.c |  82 ---
 libswscale/x86/yuv2yuvX.asm   | 136 ++
 tests/checkasm/sw_scale.c | 103 +
 5 files changed, 294 insertions(+), 158 deletions(-)

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 15c0b22f20..3df193a067 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 
0x8080808080808080ULL;
 DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= 
0x0001000100010001ULL;
 
 
+#define YUV2YUVX_FUNC_DECL(opt)  \
+static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const 
int16_t **src, \
+   uint8_t *dest, int dstW, \
+   const uint8_t *dither, int offset); \
+
+YUV2YUVX_FUNC_DECL(mmx)
+YUV2YUVX_FUNC_DECL(mmxext)
+YUV2YUVX_FUNC_DECL(sse3)
+YUV2YUVX_FUNC_DECL(avx2)
+
 //MMX versions
 #if HAVE_MMX_INLINE
 #undef RENAME
@@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c")