[FFmpeg-devel] [PATCH] avfilter/boxblur: add AVX2 horizontal pass (PR #20714)

MakarDev via ffmpeg-devel Sat, 18 Oct 2025 08:20:18 -0700

PR #20714 opened by MakarDev
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20714
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20714.patch


Signed-off-by: MakarDev <[email protected]>
Added 1D AVX2 assembly vectorization support for the sliding window accumulator
of the avfilter/boxblur filter.
Benchmarking results:

AVX2:
 - vf_boxblur.boxblur_row8  [OK]
 - vf_boxblur.boxblur_row16 [OK]
checkasm: all 2 tests passed
boxblur_blur_row8_c:                                   884.7 ( 1.00x)
boxblur_blur_row8_avx2:                                 92.7 ( 9.54x)
boxblur_blur_row16_c:                                  315.8 ( 1.00x)
boxblur_blur_row16_avx2:                               255.3 ( 1.24x)


From 7915f3f232ac5e57fb7ac7e342653108c2119719 Mon Sep 17 00:00:00 2001
From: MakarDev <[email protected]>
Date: Thu, 16 Oct 2025 22:44:31 -0700
Subject: [PATCH] avfilter/boxblur: add AVX2 horizontal pass

Signed-off-by: MakarDev <[email protected]>
---
 libavfilter/vf_boxblur_dsp.h      |  45 +++
 libavfilter/x86/Makefile          |   2 +
 libavfilter/x86/vf_boxblur.asm    | 575 ++++++++++++++++++++++++++++++
 libavfilter/x86/vf_boxblur_init.c |  71 ++++
 tests/checkasm/Makefile           |   1 +
 tests/checkasm/checkasm.c         |   3 +
 tests/checkasm/checkasm.h         |   1 +
 tests/checkasm/vf_boxblur.c       | 165 +++++++++
 8 files changed, 863 insertions(+)
 create mode 100644 libavfilter/vf_boxblur_dsp.h
 create mode 100644 libavfilter/x86/vf_boxblur.asm
 create mode 100644 libavfilter/x86/vf_boxblur_init.c
 create mode 100644 tests/checkasm/vf_boxblur.c

diff --git a/libavfilter/vf_boxblur_dsp.h b/libavfilter/vf_boxblur_dsp.h
new file mode 100644
index 0000000000..246c748eea
--- /dev/null
+++ b/libavfilter/vf_boxblur_dsp.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2025 Makar Kuznietsov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_BOXBLUR_DSP_H
+#define AVFILTER_BOXBLUR_DSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct FFBoxblurDSPContext {
+    /* 1D box blur of one line of len 8-bit pixels with a window of
+     * 2 * radius + 1 samples. dst_step / src_step give the distance between
+     * consecutive samples; the checkasm test passes 1 for contiguous data. */
+    void (*blur_row8)(uint8_t *dst, ptrdiff_t dst_step,
+                      const uint8_t *src, ptrdiff_t src_step,
+                      int len, int radius);
+
+    /* 16-bit variant; len is still a pixel count. The steps are expressed in
+     * bytes here: the checkasm test passes 2 for contiguous uint16_t data and
+     * its reference halves the step before indexing. */
+    void (*blur_row16)(uint16_t *dst, ptrdiff_t dst_step,
+                       const uint16_t *src, ptrdiff_t src_step,
+                       int len, int radius);
+} FFBoxblurDSPContext;
+
+/* C initializers: each receives the context whose function pointers it fills.
+ * NOTE(review): only ff_boxblur_dsp_init_x86() is defined by this patch as
+ * posted; no definition of the generic or aarch64 initializer is visible
+ * here -- confirm they exist before calling them. */
+void ff_boxblur_dsp_init(FFBoxblurDSPContext *dsp);
+void ff_boxblur_dsp_init_aarch64(FFBoxblurDSPContext *dsp);
+void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp);
+
+#endif /* AVFILTER_BOXBLUR_DSP_H */
+
+
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index b485c10fbe..a89e9e4b78 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -14,6 +14,7 @@ OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq_init.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += x86/vf_gblur_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
+OBJS-$(CONFIG_BOXBLUR_FILTER)                += x86/vf_boxblur_init.o
 OBJS-$(CONFIG_FRAMERATE_FILTER)              += x86/vf_framerate_init.o
 OBJS-$(CONFIG_HALDCLUT_FILTER)               += x86/vf_lut3d_init.o
 OBJS-$(CONFIG_HFLIP_FILTER)                  += x86/vf_hflip_init.o
@@ -63,6 +64,7 @@ X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
 X86ASM-OBJS-$(CONFIG_GBLUR_FILTER)           += x86/vf_gblur.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
+X86ASM-OBJS-$(CONFIG_BOXBLUR_FILTER)         += x86/vf_boxblur.o
 X86ASM-OBJS-$(CONFIG_HALDCLUT_FILTER)        += x86/vf_lut3d.o
 X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
 X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
diff --git a/libavfilter/x86/vf_boxblur.asm b/libavfilter/x86/vf_boxblur.asm
new file mode 100644
index 0000000000..48bd64d8f6
--- /dev/null
+++ b/libavfilter/x86/vf_boxblur.asm
@@ -0,0 +1,575 @@
+;*****************************************************************************
+;* x86 AVX2-optimized functions for boxblur 1D row blur
+;*
+;* Copyright (C) 2025 Makar Kuznietsov
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; ---------------------------------------------------------------------------
+; void ff_boxblur_blur_rowb_avx2(uint8_t *dst, ptrdiff_t dst_step,
+;                                const uint8_t *src, ptrdiff_t src_step,
+;                                int len, int radius)
+; AVX2 implementation for 8-bit pixels
+;
+; Sliding-window box blur of one line, using a Q16 fixed-point accumulator.
+; Three phases: head (window reflected at the left edge), steady state
+; (vectorized 16 pixels at a time when both steps are 1, scalar otherwise)
+; and tail (window reflected at the right edge).
+;
+; Register roles after the prologue:
+;   R12 = dst, R13 = dst_step, R14 = src, R15 = src_step,
+;   R8d = len, R9d = radius, R11d = inv, R10 = accumulator, eax = x
+;
+; NOTE(review): arguments are read directly from the System V AMD64 argument
+; registers and the symbol is exported with a plain GLOBAL; other x86-64
+; calling conventions are not handled here -- confirm the supported targets.
+; ---------------------------------------------------------------------------
+%if ARCH_X86_64
+GLOBAL ff_boxblur_blur_rowb_avx2
+ff_boxblur_blur_rowb_avx2:
+    ; System V AMD64 args:
+    ; RDI=dst (uint8_t*), RSI=dst_step, RDX=src (uint8_t*), RCX=src_step, R8d=len, R9d=radius
+    push RBP
+    mov RBP, RSP
+    push RBX
+    push R12
+    push R13
+    push R14
+    push R15
+
+    mov R12, RDI            ; dst
+    mov R13, RSI            ; dst_step
+    mov R14, RDX            ; src
+    mov R15, RCX            ; src_step
+    ; R8d = len, R9d = radius (already in place)
+
+    ; Compute inv = ((1<<16) + length/2) / length
+    mov eax, R9d            ; radius
+    lea edx, [RAX*2+1]      ; edx = length = 2*radius+1
+    mov ecx, edx            ; ecx = length
+    mov ebx, edx
+    shr ebx, 1              ; ebx = length/2
+    mov eax, 1
+    shl eax, 16             ; eax = 1<<16
+    add eax, ebx            ; eax = (1<<16) + length/2
+    xor edx, edx
+    div ecx                 ; eax = inv = ((1<<16)+length/2)/length
+    mov R11d, eax           ; R11d = inv
+
+    ; sum = src[radius*src_step]
+    mov eax, R9d            ; eax = radius
+    imul RAX, R15           ; RAX = radius * src_step
+    movzx ebx, byte [R14+RAX]
+    mov R10d, ebx           ; R10d = sum (int)
+
+    ; for (x=0;x<radius;x++) sum += src[x*src_step]<<1
+    xor eax, eax            ; x = 0
+.sum_doubles_loop:
+    cmp eax, R9d
+    jge .sum_done
+    mov edx, eax
+    imul RDX, R15           ; RDX = x*src_step
+    movzx ebx, byte [R14+RDX]
+    add R10d, ebx
+    add R10d, ebx           ; sum += val*2
+    inc eax
+    jmp .sum_doubles_loop
+.sum_done:
+
+    ; sum = sum*inv + (1<<15), keep in Q16 (R10)
+    movsxd RDX, R10d
+    imul RDX, R11
+    add RDX, 1<<15
+    mov R10, RDX            ; R10 = fixed-point accumulator (Q16)
+
+    ; ---------------------------
+    ; Loop 1: head (reflect-left)
+    ; ---------------------------
+    xor eax, eax            ; x = 0
+.loop1:
+    cmp eax, R8d            ; x < len ?
+    jge .after_loop1
+    cmp eax, R9d            ; x <= radius ?
+    jg .after_loop1
+    ; diff = src[(radius+x)*src_step] - src[(radius-x)*src_step]
+    mov edx, R9d
+    add edx, eax
+    imul RDX, R15
+    movzx ebx, byte [R14+RDX]
+    mov esi, R9d
+    sub esi, eax
+    movsxd RSI, esi
+    imul RSI, R15
+    movzx ecx, byte [R14+RSI]
+    sub ebx, ecx            ; ebx = diff
+    movsxd RDX, ebx
+    imul RDX, R11           ; diff*inv (Q16)
+    add R10, RDX            ; acc += diff*inv
+    mov RDX, R10
+    sar RDX, 16
+    mov RSI, RAX
+    imul RSI, R13
+    mov [R12 + RSI], dl
+    inc eax
+    jmp .loop1
+.after_loop1:
+
+    ; --------------------------------------------
+    ; Loop 2: steady-state (no reflection) – AVX2
+    ; --------------------------------------------
+    ; Fast path requires unit strides and >= 16 pixels remaining before tail.
+    ; Condition: (R13 == 1) && (R15 == 1) && (x <= len - radius - 16)
+    cmp R13, 1
+    jne .loop2_scalar
+    cmp R15, 1
+    jne .loop2_scalar
+    
+    ; Check if we have at least 16 pixels in steady state
+    mov edi, R8d
+    sub edi, R9d
+    sub edi, 16             ; edi = len - radius - 16
+    cmp eax, edi
+    jg  .loop2_scalar
+
+    ; Setup pointers for AVX2 block processing
+    ; pNew = src + (radius + x)
+    ; pOld = src + (x - radius - 1)
+    ; pDst = dst + x
+    ; Use: RDI=pNew, RSI=pOld, RBX=pDst (R13 dst_step not needed since stride=1)
+    
+    mov edi, eax
+    add edi, R9d
+    lea RDI, [R14+RDI]      ; RDI = pNew
+    mov esi, eax
+    sub esi, R9d
+    dec esi
+    lea RSI, [R14+RSI]      ; RSI = pOld
+    lea RBX, [R12+RAX]      ; RBX = pDst
+
+    ; Broadcast inv (AVX2: must go through XMM first)
+    vmovd xmm5, R11d        ; Move R11d to xmm5
+    vpbroadcastd ymm5, xmm5 ; ymm5 = inv (int32 broadcast)
+
+.loop2_avx2:
+    ; Check if we still have 16 pixels: x <= (len - radius - 16) ?
+    mov ecx, R8d
+    sub ecx, R9d
+    sub ecx, 16
+    cmp eax, ecx
+    jg  .loop2_avx2_done
+
+    ; Load 16 incoming bytes (new edge) and 16 outgoing bytes (old edge)
+    ; new: src[(radius + x) .. +15]
+    ; old: src[(x - radius - 1) .. +15]
+    vmovdqu xmm0, [RDI]         ; new 16B from pNew
+    vmovdqu xmm1, [RSI]         ; old 16B from pOld
+    
+    ; Extend 16x u8 -> 16x u16, then compute diff
+    vpmovzxbw ymm0, xmm0        ; 16x u8 -> 16x u16 in ymm0
+    vpmovzxbw ymm1, xmm1        ; 16x u8 -> 16x u16 in ymm1
+    vpsubw ymm0, ymm0, ymm1     ; 16x s16 (diff)
+
+    ; Split into low and high 8 elements for 32-bit processing
+    vextracti128 xmm1, ymm0, 1  ; high 8x s16
+    vpmovsxwd ymm2, xmm0        ; low 8x s16 -> 8x s32 in ymm2
+    vpmovsxwd ymm3, xmm1        ; high 8x s16 -> 8x s32 in ymm3
+    
+    ; Multiply by inv (broadcast in ymm5)
+    vpmulld ymm2, ymm2, ymm5    ; low 8x s32 increments
+    vpmulld ymm3, ymm3, ymm5    ; high 8x s32 increments
+
+    ; Compute prefix sum on ymm2 (first 8 elements)
+    ; The byte shifts work per 128-bit half, so this yields two independent
+    ; 4-element prefix sums; the cross-lane step below joins them.
+    vpslldq ymm6, ymm2, 4
+    vpaddd ymm2, ymm2, ymm6
+    vpslldq ymm6, ymm2, 8
+    vpaddd ymm2, ymm2, ymm6
+    ; Cross-lane add for ymm2
+    vextracti128 xmm6, ymm2, 0
+    vpshufd xmm7, xmm6, 0xFF
+    vextracti128 xmm8, ymm2, 1
+    vpaddd xmm8, xmm8, xmm7
+    vinserti128 ymm2, ymm2, xmm8, 1
+
+    ; Get the last element of ymm2 to carry to ymm3
+    vextracti128 xmm8, ymm2, 1
+    vpshufd xmm8, xmm8, 0xFF    ; last element of first 8
+    vpbroadcastd ymm8, xmm8     ; broadcast to all lanes
+
+    ; Compute prefix sum on ymm3 (next 8 elements)
+    vpslldq ymm6, ymm3, 4
+    vpaddd ymm3, ymm3, ymm6
+    vpslldq ymm6, ymm3, 8
+    vpaddd ymm3, ymm3, ymm6
+    vextracti128 xmm6, ymm3, 0
+    vpshufd xmm7, xmm6, 0xFF
+    vextracti128 xmm9, ymm3, 1
+    vpaddd xmm9, xmm9, xmm7
+    vinserti128 ymm3, ymm3, xmm9, 1
+    vpaddd ymm3, ymm3, ymm8     ; add carry from first 8
+
+    ; Add previous accumulator to both
+    vmovd xmm1, R10d
+    vpbroadcastd ymm1, xmm1
+    vpaddd ymm2, ymm2, ymm1     ; first 8 accumulators
+    vpaddd ymm3, ymm3, ymm1     ; next 8 accumulators (already includes carry)
+
+    ; Extract last accumulator for next iteration (from ymm3)
+    ; NOTE(review): movd below is the only SIMD instruction in this loop
+    ; written without the v prefix -- confirm whether vmovd was intended.
+    vextracti128 xmm4, ymm3, 1
+    vpshufd xmm6, xmm4, 0xFF
+    movd edx, xmm6
+    movsxd RDX, edx
+    mov R10, RDX
+
+    ; Shift and pack both registers
+    vpsrad ymm2, ymm2, 16       ; first 8 results
+    vpsrad ymm3, ymm3, 16       ; next 8 results
+    
+    ; Pack ymm2 and ymm3 down to 16 bytes
+    vpackssdw ymm2, ymm2, ymm3  ; 16x s16 (but lanes are mixed)
+    vpermq ymm2, ymm2, 0xD8     ; fix lane order: 11011000b = (0,2,1,3)
+    vextracti128 xmm3, ymm2, 1
+    vpackuswb xmm2, xmm2, xmm3  ; 16x u8
+    vmovdqu [RBX], xmm2         ; store 16 bytes
+
+    ; Advance pointers and x by 16
+    add RDI, 16         ; pNew += 16
+    add RSI, 16         ; pOld += 16
+    add RBX, 16         ; pDst += 16
+    add eax, 16         ; x += 16
+    jmp .loop2_avx2
+
+.loop2_avx2_done:
+    ; Fall through to scalar remainder
+
+.loop2_scalar:
+    ; Loop 2 scalar: for (; x < len - radius; x++)
+    cmp eax, R8d
+    jge .after_loop2
+    mov edx, R8d
+    sub edx, R9d
+    cmp eax, edx            ; x < len - radius ?
+    jge .after_loop2
+    ; diff = src[(radius+x)*src_step] - src[(x-radius-1)*src_step]
+    mov edx, R9d
+    add edx, eax
+    imul RDX, R15
+    movzx ebx, byte [R14+RDX]
+    mov edx, eax
+    sub edx, R9d
+    dec edx
+    movsxd RDX, edx
+    imul RDX, R15
+    movzx ecx, byte [R14+RDX]
+    sub ebx, ecx
+    movsxd RDX, ebx
+    imul RDX, R11
+    add R10, RDX
+    mov RDX, R10
+    sar RDX, 16
+    mov RSI, RAX
+    imul RSI, R13
+    mov [R12 + RSI], dl
+    inc eax
+    jmp .loop2_scalar
+.after_loop2:
+
+    ; ---------------------------
+    ; Loop 3: tail (reflect-right)
+    ; ---------------------------
+.loop3:
+    cmp eax, R8d
+    jge .end8
+    ; diff = src[(2*len-radius-x-1)*src_step] - src[(x-radius-1)*src_step]
+    mov edx, R8d
+    lea edx, [edx*2]
+    sub edx, R9d
+    sub edx, eax
+    dec edx
+    movsxd RDX, edx
+    imul RDX, R15
+    movzx ebx, byte [R14+RDX]
+    mov edx, eax
+    sub edx, R9d
+    dec edx
+    movsxd RDX, edx
+    imul RDX, R15
+    movzx ecx, byte [R14+RDX]
+    sub ebx, ecx
+    movsxd RDX, ebx
+    imul RDX, R11
+    add R10, RDX
+    mov RDX, R10
+    sar RDX, 16
+    mov RSI, RAX
+    imul RSI, R13
+    mov [R12 + RSI], dl
+    inc eax
+    jmp .loop3
+
+.end8:
+    vzeroupper
+    pop R15
+    pop R14
+    pop R13
+    pop R12
+    pop RBX
+    pop RBP
+    ret
+%endif
+
+
+; ---------------------------------------------------------------------------
+; void ff_boxblur_blur_roww_avx2(uint16_t *dst, ptrdiff_t dst_step,
+;                                const uint16_t *src, ptrdiff_t src_step,
+;                                int bytes, int radius)
+; AVX2 implementation for 16-bit (8 pixels per iteration)
+;
+; bytes is the line size in bytes and is halved on entry to get the pixel
+; count. dst_step / src_step are byte distances between consecutive pixels.
+; The vector path is taken only when both steps are 2 (contiguous data);
+; every scalar path scales its index by the actual step.
+;
+; Register roles after the prologue:
+;   R12 = dst, R13 = dst_step, R14 = src, R15 = src_step,
+;   R8d = len (pixels), R9d = radius, R11d = inv, R10 = accumulator, eax = x
+; ---------------------------------------------------------------------------
+%if ARCH_X86_64
+GLOBAL ff_boxblur_blur_roww_avx2
+ff_boxblur_blur_roww_avx2:
+    ; RDI=dst(uint16_t*), RSI=dst_step, RDX=src(uint16_t*), RCX=src_step, R8d=bytes, R9d=radius
+    push RBP
+    mov RBP, RSP
+    push RBX
+    push R12
+    push R13
+    push R14
+    push R15
+
+    mov R12, RDI            ; dst
+    mov R13, RSI            ; dst_step (in bytes)
+    mov R14, RDX            ; src
+    mov R15, RCX            ; src_step (in bytes)
+    
+    ; len = bytes/2
+    mov eax, R8d
+    shr eax, 1              ; eax = len (number of uint16 elements)
+    mov R8d, eax
+
+    ; Compute inv
+    mov eax, R9d            ; radius
+    lea edx, [RAX*2+1]      ; length = 2*radius+1
+    mov ecx, edx
+    mov ebx, edx
+    shr ebx, 1              ; length/2
+    mov eax, 1
+    shl eax, 16
+    add eax, ebx
+    xor edx, edx
+    div ecx                 ; eax = inv
+    mov R11d, eax           ; R11 = inv
+
+    ; Initialize sum = src[radius]
+    mov eax, R9d
+    imul RAX, R15           ; RAX = radius * src_step (byte offset)
+    movzx ebx, word [R14 + RAX]
+    mov R10d, ebx
+
+    ; sum += src[x] * 2 for x in [0..radius)
+    xor eax, eax
+.sum_loop16:
+    cmp eax, R9d
+    jge .sum_done16
+    mov edx, eax
+    imul RDX, R15           ; RDX = x * src_step (byte offset)
+    movzx ebx, word [R14 + RDX]
+    add R10d, ebx
+    add R10d, ebx
+    inc eax
+    jmp .sum_loop16
+.sum_done16:
+
+    ; sum = sum*inv + (1<<15)
+    movsxd RDX, R10d
+    imul RDX, R11
+    add RDX, 1<<15
+    mov R10, RDX
+
+    ; Loop 1: head (x=0..radius)
+    xor eax, eax
+.loop1_16:
+    cmp eax, R8d
+    jge .after_loop1_16
+    cmp eax, R9d
+    jg .after_loop1_16
+    ; diff = src[(radius+x)*src_step] - src[(radius-x)*src_step]
+    mov edx, R9d
+    add edx, eax
+    imul RDX, R15           ; byte offset of src[radius+x]
+    movzx ebx, word [R14 + RDX]
+    mov edx, R9d
+    sub edx, eax
+    imul RDX, R15           ; byte offset of src[radius-x]
+    movzx ecx, word [R14 + RDX]
+    sub ebx, ecx
+    movsxd RDX, ebx
+    imul RDX, R11
+    add R10, RDX
+    mov RDX, R10
+    sar RDX, 16
+    mov RSI, RAX
+    imul RSI, R13           ; dst byte offset = x * dst_step
+    mov [R12 + RSI], dx
+    inc eax
+    jmp .loop1_16
+.after_loop1_16:
+
+    ; Loop 2: steady-state with AVX2
+    ; Check for unit strides (step = 2 for uint16)
+    cmp R13, 2
+    jne .loop2_scalar16
+    cmp R15, 2
+    jne .loop2_scalar16
+    
+    ; Check if we have at least 8 pixels
+    mov edi, R8d
+    sub edi, R9d
+    sub edi, 8
+    cmp eax, edi
+    jg .loop2_scalar16
+
+    ; Setup pointers. Indices are formed in 32-bit registers because radius is
+    ; an int argument: only R9d is defined, the upper half of R9 is not.
+    mov edi, eax
+    add edi, R9d            ; edi = x + radius
+    lea RDI, [R14 + RDI*2]  ; pNew = src + (x+radius)*2
+    mov esi, eax
+    sub esi, R9d
+    dec esi                 ; esi = x - radius - 1 (>= 0 in steady state)
+    lea RSI, [R14 + RSI*2]  ; pOld = src + (x-radius-1)*2
+    lea RBX, [R12 + RAX*2]  ; pDst = dst + x*2
+
+    ; Broadcast inv
+    vmovd xmm5, R11d
+    vpbroadcastd ymm5, xmm5
+
+.loop2_avx2_16:
+    mov ecx, R8d
+    sub ecx, R9d
+    sub ecx, 8
+    cmp eax, ecx
+    jg .loop2_avx2_done16
+
+    ; Load 8x uint16 (16 bytes)
+    vmovdqu xmm0, [RDI]
+    vmovdqu xmm1, [RSI]
+    
+    ; Extend to 8x int32
+    vpmovzxwd ymm0, xmm0
+    vpmovzxwd ymm1, xmm1
+    vpsubd ymm0, ymm0, ymm1     ; 8x s32 (diff)
+    vpmulld ymm0, ymm0, ymm5    ; 8x s32 (Q16 increments)
+
+    ; Prefix sum
+    vpslldq ymm2, ymm0, 4
+    vpaddd ymm0, ymm0, ymm2
+    vpslldq ymm2, ymm0, 8
+    vpaddd ymm0, ymm0, ymm2
+    vextracti128 xmm2, ymm0, 0
+    vpshufd xmm3, xmm2, 0xFF
+    vextracti128 xmm4, ymm0, 1
+    vpaddd xmm4, xmm4, xmm3
+    vinserti128 ymm0, ymm0, xmm4, 1
+
+    ; Add prev accumulator
+    vmovd xmm1, R10d
+    vpbroadcastd ymm1, xmm1
+    vpaddd ymm0, ymm0, ymm1
+
+    ; Extract last for next iteration
+    vextracti128 xmm4, ymm0, 1
+    vpshufd xmm6, xmm4, 0xFF
+    movd edx, xmm6
+    movsxd RDX, edx
+    mov R10, RDX
+
+    ; Shift and pack
+    vpsrad ymm0, ymm0, 16
+    vextracti128 xmm2, ymm0, 0
+    vextracti128 xmm3, ymm0, 1
+    vpackssdw xmm2, xmm2, xmm3
+    vmovdqu [RBX], xmm2
+
+    ; Advance by 8 pixels
+    add RDI, 16
+    add RSI, 16
+    add RBX, 16
+    add eax, 8
+    jmp .loop2_avx2_16
+
+.loop2_avx2_done16:
+    ; Fall through to scalar
+
+.loop2_scalar16:
+    cmp eax, R8d
+    jge .after_loop2_16
+    mov edx, R8d
+    sub edx, R9d
+    cmp eax, edx
+    jge .after_loop2_16
+    ; diff = src[(radius+x)*src_step] - src[(x-radius-1)*src_step]
+    mov edx, R9d
+    add edx, eax
+    imul RDX, R15
+    movzx ebx, word [R14 + RDX]
+    mov edx, eax
+    sub edx, R9d
+    dec edx
+    imul RDX, R15
+    movzx ecx, word [R14 + RDX]
+    sub ebx, ecx
+    movsxd RDX, ebx
+    imul RDX, R11
+    add R10, RDX
+    mov RDX, R10
+    sar RDX, 16
+    mov RSI, RAX
+    imul RSI, R13
+    mov [R12 + RSI], dx
+    inc eax
+    jmp .loop2_scalar16
+.after_loop2_16:
+
+    ; Loop 3: tail
+.loop3_16:
+    cmp eax, R8d
+    jge .end16
+    ; diff = src[(2*len-radius-x-1)*src_step] - src[(x-radius-1)*src_step]
+    mov edx, R8d
+    lea edx, [edx*2]
+    sub edx, R9d
+    sub edx, eax
+    dec edx
+    imul RDX, R15
+    movzx ebx, word [R14 + RDX]
+    mov edx, eax
+    sub edx, R9d
+    dec edx
+    imul RDX, R15
+    movzx ecx, word [R14 + RDX]
+    sub ebx, ecx
+    movsxd RDX, ebx
+    imul RDX, R11
+    add R10, RDX
+    mov RDX, R10
+    sar RDX, 16
+    mov RSI, RAX
+    imul RSI, R13
+    mov [R12 + RSI], dx
+    inc eax
+    jmp .loop3_16
+.end16:
+    vzeroupper
+    pop R15
+    pop R14
+    pop R13
+    pop R12
+    pop RBX
+    pop RBP
+    ret
+%endif
diff --git a/libavfilter/x86/vf_boxblur_init.c b/libavfilter/x86/vf_boxblur_init.c
new file mode 100644
index 0000000000..16013d35d9
--- /dev/null
+++ b/libavfilter/x86/vf_boxblur_init.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2025 Makar Kuznietsov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/vf_boxblur_dsp.h"
+
+/*
+ * We implement x86 CPU dispatch for 1D row blurs used by boxblur's
+ * separable horizontal/vertical passes.
+ */
+
+#if HAVE_X86ASM
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+/* Implemented in vf_boxblur.asm, which only assembles them for x86-64;
+ * the guard above must match or 32-bit builds reference missing symbols. */
+void ff_boxblur_blur_rowb_avx2(uint8_t *dst, ptrdiff_t dst_step,
+                               const uint8_t *src, ptrdiff_t src_step,
+                               int len, int radius);
+void ff_boxblur_blur_roww_avx2(uint16_t *dst, ptrdiff_t dst_step,
+                               const uint16_t *src, ptrdiff_t src_step,
+                               int bytes, int radius);
+
+/* 8-bit entry point: arguments are forwarded unchanged. */
+static void blur_row8_avx2(uint8_t *dst, ptrdiff_t dst_step,
+                           const uint8_t *src, ptrdiff_t src_step,
+                           int len, int radius)
+{
+    ff_boxblur_blur_rowb_avx2(dst, dst_step, src, src_step, len, radius);
+}
+
+/* 16-bit entry point: the asm routine takes the line size in bytes,
+ * so the pixel count is doubled before the call. */
+static void blur_row16_avx2(uint16_t *dst, ptrdiff_t dst_step,
+                            const uint16_t *src, ptrdiff_t src_step,
+                            int len, int radius)
+{
+    ff_boxblur_blur_roww_avx2(dst, dst_step, src, src_step, len * 2, radius);
+}
+#endif
+#endif /* HAVE_X86ASM */
+
+/* Installs the AVX2 row blurs into dsp when they were built and the running
+ * CPU passes EXTERNAL_AVX2_FAST(); otherwise dsp is left untouched. */
+av_cold void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp)
+{
+#if HAVE_X86ASM
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        dsp->blur_row8  = blur_row8_avx2;
+        dsp->blur_row16 = blur_row16_avx2;
+    }
+#endif
+#endif
+    (void) dsp; /* keeps the parameter referenced when nothing is compiled in */
+}
+
+
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index e47070d90f..265790639f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)      += vf_bwdif.o
 AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
 AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
 AVFILTEROBJS-$(CONFIG_EQ_FILTER)         += vf_eq.o
+AVFILTEROBJS-$(CONFIG_BOXBLUR_FILTER)    += vf_boxblur.o
 AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_IDET_FILTER)       += vf_idet.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 4469e043f5..99e1704cca 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -300,6 +300,9 @@ static const struct {
     #if CONFIG_GBLUR_FILTER
         { "vf_gblur", checkasm_check_vf_gblur },
     #endif
+    #if CONFIG_BOXBLUR_FILTER
+        { "vf_boxblur", checkasm_check_boxblur },
+    #endif
     #if CONFIG_HFLIP_FILTER
         { "vf_hflip", checkasm_check_vf_hflip },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index e1ccd4011b..6da78e6f30 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -152,6 +152,7 @@ void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
 void checkasm_check_vf_sobel(void);
+void checkasm_check_boxblur(void);
 void checkasm_check_vp3dsp(void);
 void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
diff --git a/tests/checkasm/vf_boxblur.c b/tests/checkasm/vf_boxblur.c
new file mode 100644
index 0000000000..ed4111d00d
--- /dev/null
+++ b/tests/checkasm/vf_boxblur.c
@@ -0,0 +1,165 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavfilter/vf_boxblur_dsp.h"
+
+/* Scalar reference for the 8-bit row blur: a running Q16 sum over a window of
+ * 2 * radius + 1 samples, mirrored at both ends of the line. With
+ * radius <= 0 the line is copied as is. */
+static void blur_row8_ref(uint8_t *dst, ptrdiff_t dst_step,
+                          const uint8_t *src, ptrdiff_t src_step,
+                          int len, int radius)
+{
+    if (radius <= 0 || len <= 0) {
+        int i = 0;
+        while (i < len) {
+            dst[i * dst_step] = src[i * src_step];
+            i++;
+        }
+        return;
+    }
+
+    const int taps  = 2 * radius + 1;
+    const int recip = ((1 << 16) + taps / 2) / taps; /* rounded 65536 / taps */
+
+    /* Seed: centre sample once, the radius samples before it twice. */
+    int acc = src[radius * src_step];
+    for (int k = 0; k < radius; k++)
+        acc += 2 * (int)src[k * src_step];
+    acc = acc * recip + (1 << 15);
+
+    for (int pos = 0; pos < len; pos++) {
+        int add_idx, drop_idx;
+
+        if (pos <= radius) {                /* head: mirrored left edge */
+            add_idx  = radius + pos;
+            drop_idx = radius - pos;
+        } else if (pos < len - radius) {    /* steady state */
+            add_idx  = radius + pos;
+            drop_idx = pos - radius - 1;
+        } else {                            /* tail: mirrored right edge */
+            add_idx  = 2 * len - radius - pos - 1;
+            drop_idx = pos - radius - 1;
+        }
+
+        const int add_off  = add_idx  * src_step;
+        const int drop_off = drop_idx * src_step;
+        acc += ((int)src[add_off] - (int)src[drop_off]) * recip;
+        dst[pos * dst_step] = (uint8_t)(acc >> 16);
+    }
+}
+
+/* Scalar reference for the 16-bit row blur. dst_step / src_step are byte
+ * distances: the copy path applies them to byte pointers and the blur path
+ * halves them to get element strides. With radius <= 0 the line is copied. */
+static void blur_row16_ref(uint16_t *dst, ptrdiff_t dst_step,
+                           const uint16_t *src, ptrdiff_t src_step,
+                           int len, int radius)
+{
+    if (radius <= 0 || len <= 0) {
+        int i = 0;
+        while (i < len) {
+            uint16_t       *out = (uint16_t *)((uint8_t *)dst + i * dst_step);
+            const uint16_t *in  = (const uint16_t *)((const uint8_t *)src + i * src_step);
+            *out = *in;
+            i++;
+        }
+        return;
+    }
+
+    const int src_stride = (int)(src_step >> 1); /* bytes -> elements */
+    const int dst_stride = (int)(dst_step >> 1);
+    const int taps  = 2 * radius + 1;
+    const int recip = ((1 << 16) + taps / 2) / taps; /* rounded 65536 / taps */
+
+    /* Seed: centre sample once, the radius samples before it twice. */
+    int acc = src[radius * src_stride];
+    for (int k = 0; k < radius; k++)
+        acc += 2 * (int)src[k * src_stride];
+    acc = acc * recip + (1 << 15);
+
+    for (int pos = 0; pos < len; pos++) {
+        int add_idx, drop_idx;
+
+        if (pos <= radius) {                /* head: mirrored left edge */
+            add_idx  = radius + pos;
+            drop_idx = radius - pos;
+        } else if (pos < len - radius) {    /* steady state */
+            add_idx  = radius + pos;
+            drop_idx = pos - radius - 1;
+        } else {                            /* tail: mirrored right edge */
+            add_idx  = 2 * len - radius - pos - 1;
+            drop_idx = pos - radius - 1;
+        }
+
+        const int add_off  = add_idx  * src_stride;
+        const int drop_off = drop_idx * src_stride;
+        acc += ((int)src[add_off] - (int)src[drop_off]) * recip;
+        dst[pos * dst_stride] = (uint16_t)(acc >> 16);
+    }
+}
+
+/* Compares the selected 8-bit row blur against blur_row8_ref on 16 random
+ * contiguous lines, then benchmarks it on a fixed-size line. */
+static void check_row8(void)
+{
+    FFBoxblurDSPContext dsp = {0};
+    /* Set ref by default, then let x86 override */
+    dsp.blur_row8 = blur_row8_ref;
+    ff_boxblur_dsp_init_x86(&dsp);
+
+    declare_func(void, uint8_t *, ptrdiff_t, const uint8_t *, ptrdiff_t, int, int);
+
+    LOCAL_ALIGNED_32(uint8_t, src, [2048]);
+    LOCAL_ALIGNED_32(uint8_t, dst_ref, [2048]);
+    LOCAL_ALIGNED_32(uint8_t, dst_new, [2048]);
+
+    /* check_func() returns non-NULL only the first time it sees a given
+     * implementation, so it must be queried once, outside the loop;
+     * otherwise only the first random case would ever run. */
+    if (!check_func(dsp.blur_row8, "boxblur_blur_row8"))
+        return;
+
+    for (int iter = 0; iter < 16; iter++) {
+        const int len = 32 + (rnd() % 256);
+        const int radius = FFMIN((len - 1) / 2, rnd() % 16);
+        for (int i = 0; i < len; i++)
+            src[i] = rnd();
+
+        call_ref(dst_ref, 1, src, 1, len, radius);
+        call_new(dst_new, 1, src, 1, len, radius);
+        if (memcmp(dst_ref, dst_new, len))
+            fail();
+    }
+
+    /* Benchmark on a fixed, fully initialised line so timings are comparable. */
+    const int bench_len = 256, bench_radius = 8;
+    for (int i = 0; i < bench_len; i++)
+        src[i] = rnd();
+    bench_new(dst_new, 1, src, 1, bench_len, bench_radius);
+}
+
+/* Compares the selected 16-bit row blur against blur_row16_ref on 16 random
+ * contiguous lines (byte step 2), then benchmarks it on a fixed-size line. */
+static void check_row16(void)
+{
+    FFBoxblurDSPContext dsp = {0};
+    /* Set ref by default, then let x86 override */
+    dsp.blur_row16 = blur_row16_ref;
+    ff_boxblur_dsp_init_x86(&dsp);
+
+    declare_func(void, uint16_t *, ptrdiff_t, const uint16_t *, ptrdiff_t, int, int);
+
+    LOCAL_ALIGNED_32(uint16_t, src, [2048]);
+    LOCAL_ALIGNED_32(uint16_t, dst_ref, [2048]);
+    LOCAL_ALIGNED_32(uint16_t, dst_new, [2048]);
+
+    /* check_func() returns non-NULL only the first time it sees a given
+     * implementation, so it must be queried once, outside the loop;
+     * otherwise only the first random case would ever run. */
+    if (!check_func(dsp.blur_row16, "boxblur_blur_row16"))
+        return;
+
+    for (int iter = 0; iter < 16; iter++) {
+        const int len = 32 + (rnd() % 256);
+        const int radius = FFMIN((len - 1) / 2, rnd() % 16);
+        for (int i = 0; i < len; i++)
+            src[i] = rnd();
+
+        call_ref(dst_ref, 2, src, 2, len, radius);
+        call_new(dst_new, 2, src, 2, len, radius);
+        if (memcmp(dst_ref, dst_new, len * sizeof(uint16_t)))
+            fail();
+    }
+
+    /* Benchmark on a fixed, fully initialised line so timings are comparable. */
+    const int bench_len = 256, bench_radius = 8;
+    for (int i = 0; i < bench_len; i++)
+        src[i] = rnd();
+    bench_new(dst_new, 2, src, 2, bench_len, bench_radius);
+}
+
+/* checkasm entry point for the boxblur row functions; the prototype comes
+ * from checkasm.h, so no local forward declaration is needed. */
+void checkasm_check_boxblur(void)
+{
+    check_row8();
+    report("boxblur_row8");
+    check_row16();
+    report("boxblur_row16");
+}
+
+
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] avfilter/boxblur: add AVX2 horizontal pass (PR #20714)

Reply via email to