On Fri, Jun 7, 2024 at 4:05 PM Ramiro Polla <ramiro.po...@gmail.com> wrote: > > chrRangeFromJpeg_8_c: 19.9 > chrRangeFromJpeg_8_sse4: 16.2 > chrRangeFromJpeg_24_c: 60.7 > chrRangeFromJpeg_24_sse4: 28.9 > chrRangeFromJpeg_128_c: 325.7 > chrRangeFromJpeg_128_sse4: 160.2 > chrRangeFromJpeg_144_c: 364.2 > chrRangeFromJpeg_144_sse4: 194.9 > chrRangeFromJpeg_256_c: 630.7 > chrRangeFromJpeg_256_sse4: 337.4 > chrRangeFromJpeg_512_c: 1240.4 > chrRangeFromJpeg_512_sse4: 668.4 > chrRangeToJpeg_8_c: 37.7 > chrRangeToJpeg_8_sse4: 19.7 > chrRangeToJpeg_24_c: 114.7 > chrRangeToJpeg_24_sse4: 30.2 > chrRangeToJpeg_128_c: 636.4 > chrRangeToJpeg_128_sse4: 161.7 > chrRangeToJpeg_144_c: 715.7 > chrRangeToJpeg_144_sse4: 272.9 > chrRangeToJpeg_256_c: 1256.7 > chrRangeToJpeg_256_sse4: 341.9 > chrRangeToJpeg_512_c: 2498.7 > chrRangeToJpeg_512_sse4: 668.4 > lumRangeFromJpeg_8_c: 11.7 > lumRangeFromJpeg_8_sse4: 12.4 > lumRangeFromJpeg_24_c: 36.9 > lumRangeFromJpeg_24_sse4: 17.7 > lumRangeFromJpeg_128_c: 228.4 > lumRangeFromJpeg_128_sse4: 85.2 > lumRangeFromJpeg_144_c: 272.9 > lumRangeFromJpeg_144_sse4: 96.9 > lumRangeFromJpeg_256_c: 463.4 > lumRangeFromJpeg_256_sse4: 183.9 > lumRangeFromJpeg_512_c: 879.9 > lumRangeFromJpeg_512_sse4: 355.9 > lumRangeToJpeg_8_c: 17.7 > lumRangeToJpeg_8_sse4: 15.4 > lumRangeToJpeg_24_c: 56.2 > lumRangeToJpeg_24_sse4: 18.4 > lumRangeToJpeg_128_c: 331.4 > lumRangeToJpeg_128_sse4: 84.4 > lumRangeToJpeg_144_c: 375.2 > lumRangeToJpeg_144_sse4: 96.9 > lumRangeToJpeg_256_c: 649.7 > lumRangeToJpeg_256_sse4: 184.4 > lumRangeToJpeg_512_c: 1281.9 > lumRangeToJpeg_512_sse4: 355.9 > --- > libswscale/swscale_internal.h | 1 + > libswscale/utils.c | 2 + > libswscale/x86/Makefile | 1 + > libswscale/x86/range_convert.asm | 100 +++++++++++++++++++++++++++++++ > libswscale/x86/swscale.c | 36 +++++++++++ > 5 files changed, 140 insertions(+) > create mode 100644 libswscale/x86/range_convert.asm > > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h > index 
d4b0c3cee2..92f6105443 100644 > --- a/libswscale/swscale_internal.h > +++ b/libswscale/swscale_internal.h > @@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY); > > av_cold void ff_sws_init_range_convert(SwsContext *c); > av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c); > +av_cold void ff_sws_init_range_convert_x86(SwsContext *c); > > SwsFunc ff_yuv2rgb_init_x86(SwsContext *c); > SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c); > diff --git a/libswscale/utils.c b/libswscale/utils.c > index 476a24fea5..8dfa57b5ff 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, > const int inv_table[4], > ff_sws_init_range_convert(c); > #if ARCH_LOONGARCH64 > ff_sws_init_range_convert_loongarch(c); > +#elif ARCH_X86 > + ff_sws_init_range_convert_x86(c); > #endif > } > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index 68391494be..f00154941d 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -12,6 +12,7 @@ X86ASM-OBJS += x86/input.o > \ > x86/output.o \ > x86/scale.o \ > x86/scale_avx2.o > \ > + x86/range_convert.o \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/range_convert.asm > b/libswscale/x86/range_convert.asm > new file mode 100644 > index 0000000000..333265fb65 > --- /dev/null > +++ b/libswscale/x86/range_convert.asm > @@ -0,0 +1,100 @@ > +;****************************************************************************** > +;* Copyright (c) 2024 Ramiro Polla > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. 
> +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +; NOTE: there is no need to clamp the input when converting to jpeg range > +; (like we do in the C code) because packssdw will saturate the output. > + > +;----------------------------------------------------------------------------- > +; lumConvertRange > +; > +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width); > +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width); > +; > +;----------------------------------------------------------------------------- > + > +%macro LUMCONVERTRANGE 4 > +SECTION_RODATA > +mult_%1: times 4 dd %2 > +offset_%1: times 4 dd %3 > +SECTION .text > +cglobal %1, 2, 3, 3, dst, width, x > + movsxdifnidn widthq, widthd > + xor xq, xq > + mova m1, [mult_%1] > + mova m2, [offset_%1] > +.loop: > + pmovsxwd m0, [dstq+xq*2] > + pmulld m0, m1 > + paddd m0, m2 > + psrad m0, %4 > + packssdw m0, m0 > + movh [dstq+xq*2], m0 > + add xq, mmsize / 4 > + cmp xd, widthd > + jl .loop > + RET > +%endmacro > + > +;----------------------------------------------------------------------------- > +; chrConvertRange > +; > +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); > +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); > +; > +;----------------------------------------------------------------------------- > + > +%macro CHRCONVERTRANGE 4 > +SECTION_RODATA > +mult_%1: times 4 dd %2 > 
+offset_%1: times 4 dd %3 > +SECTION .text > +cglobal %1, 3, 4, 4, dstU, dstV, width, x > + movsxdifnidn widthq, widthd > + xor xq, xq > + mova m1, [mult_%1] > + mova m2, [offset_%1] > +.loop: > + pmovsxwd m0, [dstUq+xq*2] > + pmulld m0, m1 > + paddd m0, m2 > + psrad m0, %4 > + packssdw m0, m0 > + movh [dstUq+xq*2], m0 > + pmovsxwd m0, [dstVq+xq*2] > + pmulld m0, m1 > + paddd m0, m2 > + psrad m0, %4 > + packssdw m0, m0 > + movh [dstVq+xq*2], m0 > + add xq, mmsize / 4 > + cmp xd, widthd > + jl .loop > + RET > +%endmacro > + > +%if ARCH_X86_64 > +INIT_XMM sse4 > +LUMCONVERTRANGE lumRangeToJpeg, 19077, -39057361, 14 > +CHRCONVERTRANGE chrRangeToJpeg, 4663, -9289992, 12 > +LUMCONVERTRANGE lumRangeFromJpeg, 14071, 33561947, 14 > +CHRCONVERTRANGE chrRangeFromJpeg, 1799, 4081085, 11 > +%endif > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index fff8bb4396..c5ddfb5605 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -447,6 +447,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2); > INPUT_PLANAR_RGB_A_ALL_DECL(avx2); > #endif > > +#if ARCH_X86_64 > +#define RANGE_CONVERT_FUNCS(opt) do { \ > + if (c->dstBpc <= 14) { \ > + if (c->srcRange) { \ > + c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \ > + c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \ > + } else { \ > + c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \ > + c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \ > + } \ > + } \ > +} while (0) > + > +#define RANGE_CONVERT_FUNCS_DECL(opt) \ > +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \ > +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ > +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \ > +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ > + > +RANGE_CONVERT_FUNCS_DECL(sse4); > + > +av_cold void ff_sws_init_range_convert_x86(SwsContext *c) > +{ > + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { > + int cpu_flags = av_get_cpu_flags(); > + 
if (EXTERNAL_SSE4(cpu_flags)) { > + RANGE_CONVERT_FUNCS(sse4); > + } > + } > +} > +#endif > + > av_cold void ff_sws_init_swscale_x86(SwsContext *c) > { > int cpu_flags = av_get_cpu_flags(); > @@ -805,4 +837,8 @@ switch(c->dstBpc){ \ > } > > #endif > + > +#if ARCH_X86_64 > + ff_sws_init_range_convert_x86(c); > +#endif > } > -- > 2.30.2 >
The attached version is slightly different: it moves the constants out of the macros (so that they can be reused by an avx2 version) and processes twice as much data per loop iteration.
From b8f72b1c4c8393becea9962378af6d7dffabbce2 Mon Sep 17 00:00:00 2001 From: Ramiro Polla <ramiro.po...@gmail.com> Date: Thu, 6 Jun 2024 18:33:34 +0200 Subject: [PATCH] swscale/x86: add sse4 {lum,chr}ConvertRange chrRangeFromJpeg_8_c: 19.9 chrRangeFromJpeg_8_sse4: 16.2 chrRangeFromJpeg_24_c: 60.7 chrRangeFromJpeg_24_sse4: 28.9 chrRangeFromJpeg_128_c: 325.7 chrRangeFromJpeg_128_sse4: 160.2 chrRangeFromJpeg_144_c: 364.2 chrRangeFromJpeg_144_sse4: 194.9 chrRangeFromJpeg_256_c: 630.7 chrRangeFromJpeg_256_sse4: 337.4 chrRangeFromJpeg_512_c: 1240.4 chrRangeFromJpeg_512_sse4: 668.4 chrRangeToJpeg_8_c: 37.7 chrRangeToJpeg_8_sse4: 19.7 chrRangeToJpeg_24_c: 114.7 chrRangeToJpeg_24_sse4: 30.2 chrRangeToJpeg_128_c: 636.4 chrRangeToJpeg_128_sse4: 161.7 chrRangeToJpeg_144_c: 715.7 chrRangeToJpeg_144_sse4: 272.9 chrRangeToJpeg_256_c: 1256.7 chrRangeToJpeg_256_sse4: 341.9 chrRangeToJpeg_512_c: 2498.7 chrRangeToJpeg_512_sse4: 668.4 lumRangeFromJpeg_8_c: 11.7 lumRangeFromJpeg_8_sse4: 12.4 lumRangeFromJpeg_24_c: 36.9 lumRangeFromJpeg_24_sse4: 17.7 lumRangeFromJpeg_128_c: 228.4 lumRangeFromJpeg_128_sse4: 85.2 lumRangeFromJpeg_144_c: 272.9 lumRangeFromJpeg_144_sse4: 96.9 lumRangeFromJpeg_256_c: 463.4 lumRangeFromJpeg_256_sse4: 183.9 lumRangeFromJpeg_512_c: 879.9 lumRangeFromJpeg_512_sse4: 355.9 lumRangeToJpeg_8_c: 17.7 lumRangeToJpeg_8_sse4: 15.4 lumRangeToJpeg_24_c: 56.2 lumRangeToJpeg_24_sse4: 18.4 lumRangeToJpeg_128_c: 331.4 lumRangeToJpeg_128_sse4: 84.4 lumRangeToJpeg_144_c: 375.2 lumRangeToJpeg_144_sse4: 96.9 lumRangeToJpeg_256_c: 649.7 lumRangeToJpeg_256_sse4: 184.4 lumRangeToJpeg_512_c: 1281.9 lumRangeToJpeg_512_sse4: 355.9 --- libswscale/swscale_internal.h | 1 + libswscale/utils.c | 2 + libswscale/x86/Makefile | 1 + libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++ libswscale/x86/swscale.c | 36 +++++++++ 5 files changed, 170 insertions(+) create mode 100644 libswscale/x86/range_convert.asm diff --git a/libswscale/swscale_internal.h 
b/libswscale/swscale_internal.h index d4b0c3cee2..92f6105443 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY); av_cold void ff_sws_init_range_convert(SwsContext *c); av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c); +av_cold void ff_sws_init_range_convert_x86(SwsContext *c); SwsFunc ff_yuv2rgb_init_x86(SwsContext *c); SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c); diff --git a/libswscale/utils.c b/libswscale/utils.c index 476a24fea5..8dfa57b5ff 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], ff_sws_init_range_convert(c); #if ARCH_LOONGARCH64 ff_sws_init_range_convert_loongarch(c); +#elif ARCH_X86 + ff_sws_init_range_convert_x86(c); #endif } diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 68391494be..f00154941d 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -12,6 +12,7 @@ X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ x86/scale_avx2.o \ + x86/range_convert.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ x86/yuv2yuvX.o \ diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm new file mode 100644 index 0000000000..13983a386b --- /dev/null +++ b/libswscale/x86/range_convert.asm @@ -0,0 +1,130 @@ +;****************************************************************************** +;* Copyright (c) 2024 Ramiro Polla +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +chr_to_mult: times 4 dd 4663 +chr_to_offset: times 4 dd -9289992 +%define chr_to_shift 12 + +chr_from_mult: times 4 dd 1799 +chr_from_offset: times 4 dd 4081085 +%define chr_from_shift 11 + +lum_to_mult: times 4 dd 19077 +lum_to_offset: times 4 dd -39057361 +%define lum_to_shift 14 + +lum_from_mult: times 4 dd 14071 +lum_from_offset: times 4 dd 33561947 +%define lum_from_shift 14 + +SECTION .text + +; NOTE: there is no need to clamp the input when converting to jpeg range +; (like we do in the C code) because packssdw will saturate the output. 
+ +;----------------------------------------------------------------------------- +; lumConvertRange +; +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width); +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width); +; +;----------------------------------------------------------------------------- + +%macro LUMCONVERTRANGE 4 +cglobal %1, 2, 3, 3, dst, width, x + movsxdifnidn widthq, widthd + xor xq, xq + mova m4, [%2] + mova m5, [%3] +.loop: + pmovsxwd m0, [dstq+xq*2] + pmovsxwd m1, [dstq+xq*2+mmsize/2] + pmulld m0, m4 + pmulld m1, m4 + paddd m0, m5 + paddd m1, m5 + psrad m0, %4 + psrad m1, %4 + packssdw m0, m0 + packssdw m1, m1 + movq [dstq+xq*2], m0 + movq [dstq+xq*2+mmsize/2], m1 + add xq, mmsize / 2 + cmp xd, widthd + jl .loop + RET +%endmacro + +;----------------------------------------------------------------------------- +; chrConvertRange +; +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); +; +;----------------------------------------------------------------------------- + +%macro CHRCONVERTRANGE 4 +cglobal %1, 3, 4, 4, dstU, dstV, width, x + movsxdifnidn widthq, widthd + xor xq, xq + mova m4, [%2] + mova m5, [%3] +.loop: + pmovsxwd m0, [dstUq+xq*2] + pmovsxwd m1, [dstUq+xq*2+mmsize/2] + pmovsxwd m2, [dstVq+xq*2] + pmovsxwd m3, [dstVq+xq*2+mmsize/2] + pmulld m0, m4 + pmulld m1, m4 + pmulld m2, m4 + pmulld m3, m4 + paddd m0, m5 + paddd m1, m5 + paddd m2, m5 + paddd m3, m5 + psrad m0, %4 + psrad m1, %4 + psrad m2, %4 + psrad m3, %4 + packssdw m0, m0 + packssdw m1, m1 + packssdw m2, m2 + packssdw m3, m3 + movq [dstUq+xq*2], m0 + movq [dstUq+xq*2+mmsize/2], m1 + movq [dstVq+xq*2], m2 + movq [dstVq+xq*2+mmsize/2], m3 + add xq, mmsize / 2 + cmp xd, widthd + jl .loop + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse4 +LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift +CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, 
chr_to_shift +LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift +CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift +%endif diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index fff8bb4396..c5ddfb5605 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -447,6 +447,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2); INPUT_PLANAR_RGB_A_ALL_DECL(avx2); #endif +#if ARCH_X86_64 +#define RANGE_CONVERT_FUNCS(opt) do { \ + if (c->dstBpc <= 14) { \ + if (c->srcRange) { \ + c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \ + c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \ + } else { \ + c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \ + c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \ + } \ + } \ +} while (0) + +#define RANGE_CONVERT_FUNCS_DECL(opt) \ +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \ +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \ +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ + +RANGE_CONVERT_FUNCS_DECL(sse4); + +av_cold void ff_sws_init_range_convert_x86(SwsContext *c) +{ + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { + int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_SSE4(cpu_flags)) { + RANGE_CONVERT_FUNCS(sse4); + } + } +} +#endif + av_cold void ff_sws_init_swscale_x86(SwsContext *c) { int cpu_flags = av_get_cpu_flags(); @@ -805,4 +837,8 @@ switch(c->dstBpc){ \ } #endif + +#if ARCH_X86_64 + ff_sws_init_range_convert_x86(c); +#endif } -- 2.30.2
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".