PR #20714 opened by MakarDev URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20714 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20714.patch
Signed-off-by: MakarDev <[email protected]> Added 1D AVX2 assembly vectorization support for the sliding window accumulator on the avfilter/boxblur filter. Benchmarking results AVX2: - vf_boxblur.boxblur_row8 [OK] - vf_boxblur.boxblur_row16 [OK] checkasm: all 2 tests passed boxblur_blur_row8_c: 884.7 ( 1.00x) boxblur_blur_row8_avx2: 92.7 ( 9.54x) boxblur_blur_row16_c: 315.8 ( 1.00x) boxblur_blur_row16_avx2: 255.3 ( 1.24x) From 7915f3f232ac5e57fb7ac7e342653108c2119719 Mon Sep 17 00:00:00 2001 From: MakarDev <[email protected]> Date: Thu, 16 Oct 2025 22:44:31 -0700 Subject: [PATCH] avfilter/boxblur: add AVX2 horizontal pass Signed-off-by: MakarDev <[email protected]> --- libavfilter/vf_boxblur_dsp.h | 45 +++ libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_boxblur.asm | 575 ++++++++++++++++++++++++++++++ libavfilter/x86/vf_boxblur_init.c | 71 ++++ tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_boxblur.c | 165 +++++++++ 8 files changed, 863 insertions(+) create mode 100644 libavfilter/vf_boxblur_dsp.h create mode 100644 libavfilter/x86/vf_boxblur.asm create mode 100644 libavfilter/x86/vf_boxblur_init.c create mode 100644 tests/checkasm/vf_boxblur.c diff --git a/libavfilter/vf_boxblur_dsp.h b/libavfilter/vf_boxblur_dsp.h new file mode 100644 index 0000000000..246c748eea --- /dev/null +++ b/libavfilter/vf_boxblur_dsp.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025 Makar Kuznietsov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_BOXBLUR_DSP_H +#define AVFILTER_BOXBLUR_DSP_H + +#include <stddef.h> +#include <stdint.h> + +typedef struct FFBoxblurDSPContext { + /* 1D horizontal blur on one row of len pixels */ + void (*blur_row8)(uint8_t *dst, ptrdiff_t dst_step, + const uint8_t *src, ptrdiff_t src_step, + int len, int radius); + + void (*blur_row16)(uint16_t *dst, ptrdiff_t dst_step, + const uint16_t *src, ptrdiff_t src_step, + int len, int radius); +} FFBoxblurDSPContext; + +/* C initializers */ +void ff_boxblur_dsp_init(FFBoxblurDSPContext *dsp); +void ff_boxblur_dsp_init_aarch64(FFBoxblurDSPContext *dsp); +void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp); + +#endif /* AVFILTER_BOXBLUR_DSP_H */ + + diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index b485c10fbe..a89e9e4b78 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -14,6 +14,7 @@ OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq_init.o OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur_init.o OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o +OBJS-$(CONFIG_BOXBLUR_FILTER) += x86/vf_boxblur_init.o OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate_init.o OBJS-$(CONFIG_HALDCLUT_FILTER) += x86/vf_lut3d_init.o OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip_init.o @@ -63,6 +64,7 @@ X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate.o X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o X86ASM-OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur.o X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o +X86ASM-OBJS-$(CONFIG_BOXBLUR_FILTER) += x86/vf_boxblur.o X86ASM-OBJS-$(CONFIG_HALDCLUT_FILTER) += x86/vf_lut3d.o X86ASM-OBJS-$(CONFIG_HFLIP_FILTER) += 
x86/vf_hflip.o X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o diff --git a/libavfilter/x86/vf_boxblur.asm b/libavfilter/x86/vf_boxblur.asm new file mode 100644 index 0000000000..48bd64d8f6 --- /dev/null +++ b/libavfilter/x86/vf_boxblur.asm @@ -0,0 +1,575 @@ +;***************************************************************************** +;* x86 AVX2-optimized functions for boxblur 1D row blur +;* +;* Copyright (C) 2025 Makar Kuznietsov +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +; --------------------------------------------------------------------------- +; void ff_boxblur_blur_rowb_avx2(uint8_t *dst, ptrdiff_t dst_step, +; const uint8_t *src, ptrdiff_t src_step, +; int len, int radius) +; AVX2 implementation for 8-bit pixels +; --------------------------------------------------------------------------- +%if ARCH_X86_64 +GLOBAL ff_boxblur_blur_rowb_avx2 +ff_boxblur_blur_rowb_avx2: + ; System V AMD64 args: + ; RDI=dst (uint8_t*), RSI=dst_step, RDX=src (uint8_t*), RCX=src_step, R8d=len, R9d=radius + push RBP + mov RBP, RSP + push RBX + push R12 + push R13 + push R14 + push R15 + + mov R12, RDI ; dst + mov 
R13, RSI ; dst_step + mov R14, RDX ; src + mov R15, RCX ; src_step + ; R8d = len, R9d = radius (already in place) + + ; Compute inv = ((1<<16) + length/2) / length + mov eax, R9d ; radius + lea edx, [RAX*2+1] ; edx = length = 2*radius+1 + mov ecx, edx ; ecx = length + mov ebx, edx + shr ebx, 1 ; ebx = length/2 + mov eax, 1 + shl eax, 16 ; eax = 1<<16 + add eax, ebx ; eax = (1<<16) + length/2 + xor edx, edx + div ecx ; eax = inv = ((1<<16)+len/2)/len + mov R11d, eax ; R11d = inv + + ; sum = src[radius*src_step] + mov eax, R9d ; eax = radius + imul RAX, R15 ; RAX = radius * src_step + movzx ebx, byte [R14+RAX] + mov R10d, ebx ; R10d = sum (int) + + ; for (x=0;x<radius;x++) sum += src[x*src_step]<<1 + xor eax, eax ; x = 0 +.sum_doubles_loop: + cmp eax, R9d + jge .sum_done + mov edx, eax + imul RDX, R15 ; RDX = x*src_step + movzx ebx, byte [R14+RDX] + add R10d, ebx + add R10d, ebx ; sum += val*2 + inc eax + jmp .sum_doubles_loop +.sum_done: + + ; sum = sum*inv + (1<<15), keep in Q16 (R10) + movsxd RDX, R10d + imul RDX, R11 + add RDX, 1<<15 + mov R10, RDX ; R10 = fixed-point accumulator (Q16) + + ; --------------------------- + ; Loop 1: head (reflect-left) + ; --------------------------- + xor eax, eax ; x = 0 +.loop1: + cmp eax, R8d ; x < len ? + jge .after_loop1 + cmp eax, R9d ; x <= radius ? 
+ jg .after_loop1 + ; diff = src[(radius+x)*src_step] - src[(radius-x)*src_step] + mov edx, R9d + add edx, eax + imul RDX, R15 + movzx ebx, byte [R14+RDX] + mov esi, R9d + sub esi, eax + movsxd RSI, esi + imul RSI, R15 + movzx ecx, byte [R14+RSI] + sub ebx, ecx ; ebx = diff + movsxd RDX, ebx + imul RDX, R11 ; diff*inv (Q16) + add R10, RDX ; acc += diff*inv + mov RDX, R10 + sar RDX, 16 + mov RSI, RAX + imul RSI, R13 + mov [R12 + RSI], dl + inc eax + jmp .loop1 +.after_loop1: + + ; -------------------------------------------- + ; Loop 2: steady-state (no reflection) – AVX2 + ; -------------------------------------------- + ; Fast path requires unit strides and >= 8 pixels remaining before tail. + ; Condition: (R13 == 1) && (R15 == 1) && (x < len - radius - 8) + cmp R13, 1 + jne .loop2_scalar + cmp R15, 1 + jne .loop2_scalar + + ; Check if we have at least 16 pixels in steady state + mov edi, R8d + sub edi, R9d + sub edi, 16 ; edi = len - radius - 16 + cmp eax, edi + jg .loop2_scalar + + ; Setup pointers for AVX2 block processing + ; pNew = src + (radius + x) + ; pOld = src + (x - radius - 1) + ; pDst = dst + x + ; Use: RDI=pNew, RSI=pOld, RBX=pDst (R13 dst_step not needed since stride=1) + + mov edi, eax + add edi, R9d + lea RDI, [R14+RDI] ; RDI = pNew + mov esi, eax + sub esi, R9d + dec esi + lea RSI, [R14+RSI] ; RSI = pOld + lea RBX, [R12+RAX] ; RBX = pDst + + ; Broadcast inv (AVX2: must go through XMM first) + vmovd xmm5, R11d ; Move R11d to xmm5 + vpbroadcastd ymm5, xmm5 ; ymm5 = inv (int32 broadcast) + +.loop2_avx2: + ; Check if we still have 16 pixels: x <= (len - radius - 16) ? + mov ecx, R8d + sub ecx, R9d + sub ecx, 16 + cmp eax, ecx + jg .loop2_avx2_done + + ; Load 16 incoming bytes (new edge) and 16 outgoing bytes (old edge) + ; new: src[(radius + x) .. +15] + ; old: src[(x - radius - 1) .. 
+15] + vmovdqu xmm0, [RDI] ; new 16B from pNew + vmovdqu xmm1, [RSI] ; old 16B from pOld + + ; Extend 16x u8 -> 16x u16, then compute diff + vpmovzxbw ymm0, xmm0 ; 16x u8 -> 16x u16 in ymm0 + vpmovzxbw ymm1, xmm1 ; 16x u8 -> 16x u16 in ymm1 + vpsubw ymm0, ymm0, ymm1 ; 16x s16 (diff) + + ; Split into low and high 8 elements for 32-bit processing + vextracti128 xmm1, ymm0, 1 ; high 8x s16 + vpmovsxwd ymm2, xmm0 ; low 8x s16 -> 8x s32 in ymm2 + vpmovsxwd ymm3, xmm1 ; high 8x s16 -> 8x s32 in ymm3 + + ; Multiply by inv (broadcast in ymm5) + vpmulld ymm2, ymm2, ymm5 ; low 8x s32 increments + vpmulld ymm3, ymm3, ymm5 ; high 8x s32 increments + + ; Compute prefix sum on ymm2 (first 8 elements) + vpslldq ymm6, ymm2, 4 + vpaddd ymm2, ymm2, ymm6 + vpslldq ymm6, ymm2, 8 + vpaddd ymm2, ymm2, ymm6 + ; Cross-lane add for ymm2 + vextracti128 xmm6, ymm2, 0 + vpshufd xmm7, xmm6, 0xFF + vextracti128 xmm8, ymm2, 1 + vpaddd xmm8, xmm8, xmm7 + vinserti128 ymm2, ymm2, xmm8, 1 + + ; Get the last element of ymm2 to carry to ymm3 + vextracti128 xmm8, ymm2, 1 + vpshufd xmm8, xmm8, 0xFF ; last element of first 8 + vpbroadcastd ymm8, xmm8 ; broadcast to all lanes + + ; Compute prefix sum on ymm3 (next 8 elements) + vpslldq ymm6, ymm3, 4 + vpaddd ymm3, ymm3, ymm6 + vpslldq ymm6, ymm3, 8 + vpaddd ymm3, ymm3, ymm6 + vextracti128 xmm6, ymm3, 0 + vpshufd xmm7, xmm6, 0xFF + vextracti128 xmm9, ymm3, 1 + vpaddd xmm9, xmm9, xmm7 + vinserti128 ymm3, ymm3, xmm9, 1 + vpaddd ymm3, ymm3, ymm8 ; add carry from first 8 + + ; Add previous accumulator to both + vmovd xmm1, R10d + vpbroadcastd ymm1, xmm1 + vpaddd ymm2, ymm2, ymm1 ; first 8 accumulators + vpaddd ymm3, ymm3, ymm1 ; next 8 accumulators (already includes carry) + + ; Extract last accumulator for next iteration (from ymm3) + vextracti128 xmm4, ymm3, 1 + vpshufd xmm6, xmm4, 0xFF + movd edx, xmm6 + movsxd RDX, edx + mov R10, RDX + + ; Shift and pack both registers + vpsrad ymm2, ymm2, 16 ; first 8 results + vpsrad ymm3, ymm3, 16 ; next 8 results + + ; 
Pack ymm2 and ymm3 down to 16 bytes + vpackssdw ymm2, ymm2, ymm3 ; 16x s16 (but lanes are mixed) + vpermq ymm2, ymm2, 0xD8 ; fix lane order: 11011000b = (0,2,1,3) + vextracti128 xmm3, ymm2, 1 + vpackuswb xmm2, xmm2, xmm3 ; 16x u8 + vmovdqu [RBX], xmm2 ; store 16 bytes + + ; Advance pointers and x by 16 + add RDI, 16 ; pNew += 16 + add RSI, 16 ; pOld += 16 + add RBX, 16 ; pDst += 16 + add eax, 16 ; x += 16 + jmp .loop2_avx2 + +.loop2_avx2_done: + ; Fall through to scalar remainder + +.loop2_scalar: + ; Loop 2 scalar: for (; x < len - radius; x++) + cmp eax, R8d + jge .after_loop2 + mov edx, R8d + sub edx, R9d + cmp eax, edx ; x < len - radius ? + jge .after_loop2 + ; diff = src[(radius+x)*src_step] - src[(x-radius-1)*src_step] + mov edx, R9d + add edx, eax + imul RDX, R15 + movzx ebx, byte [R14+RDX] + mov edx, eax + sub edx, R9d + dec edx + movsxd RDX, edx + imul RDX, R15 + movzx ecx, byte [R14+RDX] + sub ebx, ecx + movsxd RDX, ebx + imul RDX, R11 + add R10, RDX + mov RDX, R10 + sar RDX, 16 + mov RSI, RAX + imul RSI, R13 + mov [R12 + RSI], dl + inc eax + jmp .loop2_scalar +.after_loop2: + + ; --------------------------- + ; Loop 3: tail (reflect-right) + ; --------------------------- +.loop3: + cmp eax, R8d + jge .end8 + ; diff = src[(2*len-radius-x-1)*src_step] - src[(x-radius-1)*src_step] + mov edx, R8d + lea edx, [edx*2] + sub edx, R9d + sub edx, eax + dec edx + movsxd RDX, edx + imul RDX, R15 + movzx ebx, byte [R14+RDX] + mov edx, eax + sub edx, R9d + dec edx + movsxd RDX, edx + imul RDX, R15 + movzx ecx, byte [R14+RDX] + sub ebx, ecx + movsxd RDX, ebx + imul RDX, R11 + add R10, RDX + mov RDX, R10 + sar RDX, 16 + mov RSI, RAX + imul RSI, R13 + mov [R12 + RSI], dl + inc eax + jmp .loop3 + +.end8: + vzeroupper + pop R15 + pop R14 + pop R13 + pop R12 + pop RBX + pop RBP + ret +%endif + + +; --------------------------------------------------------------------------- +; void ff_boxblur_blur_roww_avx2(uint16_t *dst, ptrdiff_t dst_step, +; const uint16_t *src, 
ptrdiff_t src_step, +; int bytes, int radius) +; AVX2 implementation for 16-bit (8 pixels per iteration) +; --------------------------------------------------------------------------- +%if ARCH_X86_64 +GLOBAL ff_boxblur_blur_roww_avx2 +ff_boxblur_blur_roww_avx2: + ; RDI=dst(uint16_t*), RSI=dst_step, RDX=src(uint16_t*), RCX=src_step, R8d=bytes, R9d=radius + push RBP + mov RBP, RSP + push RBX + push R12 + push R13 + push R14 + push R15 + + mov R12, RDI ; dst + mov R13, RSI ; dst_step (in bytes) + mov R14, RDX ; src + mov R15, RCX ; src_step (in bytes) + + ; len = bytes/2 + mov eax, R8d + shr eax, 1 ; eax = len (number of uint16 elements) + mov R8d, eax + + ; Compute inv + mov eax, R9d ; radius + lea edx, [RAX*2+1] ; length = 2*radius+1 + mov ecx, edx + mov ebx, edx + shr ebx, 1 ; length/2 + mov eax, 1 + shl eax, 16 + add eax, ebx + xor edx, edx + div ecx ; eax = inv + mov R11d, eax ; R11 = inv + + ; Initialize sum = src[radius] + mov eax, R9d + shl eax, 1 ; radius * 2 (since src_step is in bytes for uint16) + movzx ebx, word [R14 + RAX] + mov R10d, ebx + + ; sum += src[x] * 2 for x in [0..radius) + xor eax, eax +.sum_loop16: + cmp eax, R9d + jge .sum_done16 + shl eax, 1 ; x * 2 (byte offset) + movzx ebx, word [R14 + RAX] + shr eax, 1 ; restore x + add R10d, ebx + add R10d, ebx + inc eax + jmp .sum_loop16 +.sum_done16: + + ; sum = sum*inv + (1<<15) + movsxd RDX, R10d + imul RDX, R11 + add RDX, 1<<15 + mov R10, RDX + + ; Loop 1: head (x=0..radius) + xor eax, eax +.loop1_16: + cmp eax, R8d + jge .after_loop1_16 + cmp eax, R9d + jg .after_loop1_16 + ; diff = src[(radius+x)*2] - src[(radius-x)*2] + mov edx, R9d + add edx, eax + shl edx, 1 ; byte offset + movzx ebx, word [R14 + RDX] + mov edx, R9d + sub edx, eax + shl edx, 1 + movzx ecx, word [R14 + RDX] + sub ebx, ecx + movsxd RDX, ebx + imul RDX, R11 + add R10, RDX + mov RDX, R10 + sar RDX, 16 + mov RSI, RAX + shl RSI, 1 ; dst byte offset + mov [R12 + RSI], dx + inc eax + jmp .loop1_16 +.after_loop1_16: + + ; Loop 2: 
steady-state with AVX2 + ; Check for unit strides (step = 2 for uint16) + cmp R13, 2 + jne .loop2_scalar16 + cmp R15, 2 + jne .loop2_scalar16 + + ; Check if we have at least 8 pixels + mov edi, R8d + sub edi, R9d + sub edi, 8 + cmp eax, edi + jg .loop2_scalar16 + + ; Setup pointers + lea RDI, [R14 + RAX*2] ; pNew = src + (x)*2 + add RDI, R9 + add RDI, R9 ; += radius*2 + lea RSI, [R14 + RAX*2] ; pOld = src + (x-radius-1)*2 + sub RSI, R9 + sub RSI, R9 ; -= radius*2 + sub RSI, 2 ; -= 2 + lea RBX, [R12 + RAX*2] ; pDst = dst + x*2 + + ; Broadcast inv + vmovd xmm5, R11d + vpbroadcastd ymm5, xmm5 + +.loop2_avx2_16: + mov ecx, R8d + sub ecx, R9d + sub ecx, 8 + cmp eax, ecx + jg .loop2_avx2_done16 + + ; Load 8x uint16 (16 bytes) + vmovdqu xmm0, [RDI] + vmovdqu xmm1, [RSI] + + ; Extend to 8x int32 + vpmovzxwd ymm0, xmm0 + vpmovzxwd ymm1, xmm1 + vpsubd ymm0, ymm0, ymm1 ; 8x s32 (diff) + vpmulld ymm0, ymm0, ymm5 ; 8x s32 (Q16 increments) + + ; Prefix sum + vpslldq ymm2, ymm0, 4 + vpaddd ymm0, ymm0, ymm2 + vpslldq ymm2, ymm0, 8 + vpaddd ymm0, ymm0, ymm2 + vextracti128 xmm2, ymm0, 0 + vpshufd xmm3, xmm2, 0xFF + vextracti128 xmm4, ymm0, 1 + vpaddd xmm4, xmm4, xmm3 + vinserti128 ymm0, ymm0, xmm4, 1 + + ; Add prev accumulator + vmovd xmm1, R10d + vpbroadcastd ymm1, xmm1 + vpaddd ymm0, ymm0, ymm1 + + ; Extract last for next iteration + vextracti128 xmm4, ymm0, 1 + vpshufd xmm6, xmm4, 0xFF + movd edx, xmm6 + movsxd RDX, edx + mov R10, RDX + + ; Shift and pack + vpsrad ymm0, ymm0, 16 + vextracti128 xmm2, ymm0, 0 + vextracti128 xmm3, ymm0, 1 + vpackssdw xmm2, xmm2, xmm3 + vmovdqu [RBX], xmm2 + + ; Advance by 8 pixels + add RDI, 16 + add RSI, 16 + add RBX, 16 + add eax, 8 + jmp .loop2_avx2_16 + +.loop2_avx2_done16: + ; Fall through to scalar + +.loop2_scalar16: + cmp eax, R8d + jge .after_loop2_16 + mov edx, R8d + sub edx, R9d + cmp eax, edx + jge .after_loop2_16 + ; diff = src[(radius+x)*2] - src[(x-radius-1)*2] + mov edx, R9d + add edx, eax + shl edx, 1 + movzx ebx, word [R14 + RDX] + 
mov edx, eax + sub edx, R9d + dec edx + shl edx, 1 + movzx ecx, word [R14 + RDX] + sub ebx, ecx + movsxd RDX, ebx + imul RDX, R11 + add R10, RDX + mov RDX, R10 + sar RDX, 16 + mov RSI, RAX + shl RSI, 1 + mov [R12 + RSI], dx + inc eax + jmp .loop2_scalar16 +.after_loop2_16: + + ; Loop 3: tail +.loop3_16: + cmp eax, R8d + jge .end16 + ; diff = src[(2*len-radius-x-1)*2] - src[(x-radius-1)*2] + mov edx, R8d + lea edx, [edx*2] + sub edx, R9d + sub edx, eax + dec edx + shl edx, 1 + movzx ebx, word [R14 + RDX] + mov edx, eax + sub edx, R9d + dec edx + shl edx, 1 + movzx ecx, word [R14 + RDX] + sub ebx, ecx + movsxd RDX, ebx + imul RDX, R11 + add R10, RDX + mov RDX, R10 + sar RDX, 16 + mov RSI, RAX + shl RSI, 1 + mov [R12 + RSI], dx + inc eax + jmp .loop3_16 +.end16: + vzeroupper + pop R15 + pop R14 + pop R13 + pop R12 + pop RBX + pop RBP + ret +%endif diff --git a/libavfilter/x86/vf_boxblur_init.c b/libavfilter/x86/vf_boxblur_init.c new file mode 100644 index 0000000000..16013d35d9 --- /dev/null +++ b/libavfilter/x86/vf_boxblur_init.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2025 Makar Kuznietsov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/x86/cpu.h" + +#include "libavfilter/vf_boxblur_dsp.h" + +/* + * We implement x86 CPU dispatch for 1D row blurs used by boxblur's + * separable horizontal/vertical passes. + */ + +#if HAVE_X86ASM +#if HAVE_AVX2_EXTERNAL +/* 32-byte vector width */ +void ff_boxblur_blur_rowb_avx2(uint8_t *dst, ptrdiff_t dst_step, + const uint8_t *src, ptrdiff_t src_step, + int bytes, int radius); +void ff_boxblur_blur_roww_avx2(uint16_t *dst, ptrdiff_t dst_step, + const uint16_t *src, ptrdiff_t src_step, + int bytes, int radius); + +static void blur_row8_avx2(uint8_t *dst, ptrdiff_t dst_step, + const uint8_t *src, ptrdiff_t src_step, + int len, int radius) +{ + ff_boxblur_blur_rowb_avx2(dst, dst_step, src, src_step, len, radius); +} + +static void blur_row16_avx2(uint16_t *dst, ptrdiff_t dst_step, + const uint16_t *src, ptrdiff_t src_step, + int len, int radius) +{ + ff_boxblur_blur_roww_avx2(dst, dst_step, src, src_step, len * 2, radius); +} +#endif +#endif /* HAVE_X86ASM */ + +av_cold void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); +#if HAVE_AVX2_EXTERNAL + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + dsp->blur_row8 = blur_row8_avx2; + dsp->blur_row16 = blur_row16_avx2; + } +#endif +#endif + (void) dsp; +} + + diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index e47070d90f..265790639f 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o +AVFILTEROBJS-$(CONFIG_BOXBLUR_FILTER) += 
vf_boxblur.o AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o AVFILTEROBJS-$(CONFIG_IDET_FILTER) += vf_idet.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 4469e043f5..99e1704cca 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -300,6 +300,9 @@ static const struct { #if CONFIG_GBLUR_FILTER { "vf_gblur", checkasm_check_vf_gblur }, #endif + #if CONFIG_BOXBLUR_FILTER + { "vf_boxblur", checkasm_check_boxblur }, + #endif #if CONFIG_HFLIP_FILTER { "vf_hflip", checkasm_check_vf_hflip }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index e1ccd4011b..6da78e6f30 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -152,6 +152,7 @@ void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); void checkasm_check_vf_sobel(void); +void checkasm_check_boxblur(void); void checkasm_check_vp3dsp(void); void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); diff --git a/tests/checkasm/vf_boxblur.c b/tests/checkasm/vf_boxblur.c new file mode 100644 index 0000000000..ed4111d00d --- /dev/null +++ b/tests/checkasm/vf_boxblur.c @@ -0,0 +1,165 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <string.h> +#include "checkasm.h" +#include "libavutil/mem_internal.h" + +#include "libavfilter/vf_boxblur_dsp.h" + +static void blur_row8_ref(uint8_t *dst, ptrdiff_t dst_step, + const uint8_t *src, ptrdiff_t src_step, + int len, int radius) +{ + if (radius <= 0 || len <= 0) { + for (int i = 0; i < len; i++) + dst[i * dst_step] = src[i * src_step]; + return; + } + const int length = radius * 2 + 1; + const int inv = ((1 << 16) + length / 2) / length; + int sum = src[radius * src_step]; + for (int x = 0; x < radius; x++) + sum += (int)src[x * src_step] << 1; + sum = sum * inv + (1 << 15); + int x = 0; + for (; x <= radius && x < len; x++) { + const int right = (radius + x) * src_step; + const int left = (radius - x) * src_step; + sum += ((int)src[right] - (int)src[left]) * inv; + dst[x * dst_step] = (uint8_t)(sum >> 16); + } + for (; x < len - radius; x++) { + const int in = (radius + x) * src_step; + const int out = (x - radius - 1) * src_step; + sum += ((int)src[in] - (int)src[out]) * inv; + dst[x * dst_step] = (uint8_t)(sum >> 16); + } + for (; x < len; x++) { + const int in = (2 * len - radius - x - 1) * src_step; + const int out = (x - radius - 1) * src_step; + sum += ((int)src[in] - (int)src[out]) * inv; + dst[x * dst_step] = (uint8_t)(sum >> 16); + } +} + +static void blur_row16_ref(uint16_t *dst, ptrdiff_t dst_step, + const uint16_t *src, ptrdiff_t src_step, + int len, int radius) +{ + if (radius <= 0 || len <= 0) { + for (int i = 0; i < len; i++) + *(uint16_t *)((uint8_t *)dst + i * dst_step) = *(const uint16_t *)((const uint8_t *)src + i * src_step); + return; + } + const int step_e = (int)(src_step >> 1); + const int dstep_e = (int)(dst_step >> 1); + const int length = radius * 2 + 1; + const int inv = ((1 << 16) + length / 2) / 
length; + int sum = src[radius * step_e]; + for (int x = 0; x < radius; x++) + sum += (int)src[x * step_e] << 1; + sum = sum * inv + (1 << 15); + int x = 0; + for (; x <= radius && x < len; x++) { + const int right = (radius + x) * step_e; + const int left = (radius - x) * step_e; + sum += ((int)src[right] - (int)src[left]) * inv; + dst[x * dstep_e] = (uint16_t)(sum >> 16); + } + for (; x < len - radius; x++) { + const int in = (radius + x) * step_e; + const int out = (x - radius - 1) * step_e; + sum += ((int)src[in] - (int)src[out]) * inv; + dst[x * dstep_e] = (uint16_t)(sum >> 16); + } + for (; x < len; x++) { + const int in = (2 * len - radius - x - 1) * step_e; + const int out = (x - radius - 1) * step_e; + sum += ((int)src[in] - (int)src[out]) * inv; + dst[x * dstep_e] = (uint16_t)(sum >> 16); + } +} + +static void check_row8(void) +{ + FFBoxblurDSPContext dsp = {0}; + /* Set ref by default, then let x86 override */ + dsp.blur_row8 = blur_row8_ref; + ff_boxblur_dsp_init_x86(&dsp); + + declare_func(void, uint8_t *, ptrdiff_t, const uint8_t *, ptrdiff_t, int, int); + + LOCAL_ALIGNED_32(uint8_t, src, [2048]); + LOCAL_ALIGNED_32(uint8_t, dst_ref, [2048]); + LOCAL_ALIGNED_32(uint8_t, dst_new, [2048]); + + for (int iter = 0; iter < 16; iter++) { + const int len = 32 + (rnd() % 256); + const int radius = FFMIN((len - 1) / 2, rnd() % 16); + for (int i = 0; i < len; i++) + src[i] = rnd(); + + if (check_func(dsp.blur_row8, "boxblur_blur_row8")) { + call_ref(dst_ref, 1, src, 1, len, radius); + call_new(dst_new, 1, src, 1, len, radius); + if (memcmp(dst_ref, dst_new, len)) + fail(); + bench_new(dst_new, 1, src, 1, len, radius); + } + } +} + +static void check_row16(void) +{ + FFBoxblurDSPContext dsp = {0}; + dsp.blur_row16 = blur_row16_ref; + ff_boxblur_dsp_init_x86(&dsp); + + declare_func(void, uint16_t *, ptrdiff_t, const uint16_t *, ptrdiff_t, int, int); + + LOCAL_ALIGNED_32(uint16_t, src, [2048]); + LOCAL_ALIGNED_32(uint16_t, dst_ref, [2048]); + 
LOCAL_ALIGNED_32(uint16_t, dst_new, [2048]); + + for (int iter = 0; iter < 16; iter++) { + const int len = 32 + (rnd() % 256); + const int radius = FFMIN((len - 1) / 2, rnd() % 16); + for (int i = 0; i < len; i++) + src[i] = rnd(); + + if (check_func(dsp.blur_row16, "boxblur_blur_row16")) { + call_ref(dst_ref, 2, src, 2, len, radius); + call_new(dst_new, 2, src, 2, len, radius); + if (memcmp(dst_ref, dst_new, len * sizeof(uint16_t))) + fail(); + bench_new(dst_new, 2, src, 2, len, radius); + } + } +} + +void checkasm_check_boxblur(void); +void checkasm_check_boxblur(void) +{ + check_row8(); + report("boxblur_row8"); + check_row16(); + report("boxblur_row16"); +} + + -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
