This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit ddd720ae619784b9834131ac34a2d86004ec3b73
Author:     David Christle <[email protected]>
AuthorDate: Mon Mar 2 08:49:54 2026 -0800
Commit:     Martin Storsjö <[email protected]>
CommitDate: Wed Mar 4 10:30:08 2026 +0000

    swscale/aarch64: add NEON rgb24tobgr24 byte-swap

    Add a NEON rgb24tobgr24 using ld3/st3 to swap R and B channels
    in packed 24bpp RGB buffers. Handles all input sizes with a
    16-pixel NEON fast path, 8-pixel NEON cleanup, and scalar tail.

    checkasm --bench on Apple M3 Max (1920*3 = 5760 bytes):
      rgb24tobgr24_c:    722.0 ( 1.00x)
      rgb24tobgr24_neon:  94.9 ( 7.61x)

    Signed-off-by: David Christle <[email protected]>
---
 libswscale/aarch64/rgb2rgb.c      |  3 +++
 libswscale/aarch64/rgb2rgb_neon.S | 43 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index f474228298..5873439db5 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -51,6 +51,8 @@ static void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
     }
 }
 
+void ff_rgb24tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+
 void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride);
@@ -85,6 +87,7 @@ av_cold void rgb2rgb_init_aarch64(void)
 
     if (have_neon(cpu_flags)) {
         ff_rgb24toyv12 = rgb24toyv12;
+        rgb24tobgr24 = ff_rgb24tobgr24_neon;
         interleaveBytes = ff_interleave_bytes_neon;
         deinterleaveBytes = ff_deinterleave_bytes_neon;
         shuffle_bytes_0321 = ff_shuffle_bytes_0321_neon;
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index f6d625f11f..30bec45c2d 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -241,6 +241,49 @@ function ff_rgb24toyv12_neon, export=1
         ret
 endfunc
 
+// void ff_rgb24tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+function ff_rgb24tobgr24_neon, export=1
+        // x0 = src, x1 = dst, w2 = src_size (bytes)
+
+        // Fast path: 48 bytes (16 pixels) per iteration
+        subs            w2, w2, #48
+        b.lt            2f
+1:
+        ld3             {v0.16b, v1.16b, v2.16b}, [x0], #48
+        subs            w2, w2, #48
+        mov             v3.16b, v0.16b
+        mov             v0.16b, v2.16b
+        mov             v2.16b, v3.16b
+        st3             {v0.16b, v1.16b, v2.16b}, [x1], #48
+        b.ge            1b
+2:
+        add             w2, w2, #48
+        // Medium path: 24 bytes (8 pixels)
+        cmp             w2, #24
+        b.lt            3f
+        ld3             {v0.8b, v1.8b, v2.8b}, [x0], #24
+        sub             w2, w2, #24
+        mov             v3.8b, v0.8b
+        mov             v0.8b, v2.8b
+        mov             v2.8b, v3.8b
+        st3             {v0.8b, v1.8b, v2.8b}, [x1], #24
+3:
+        // Scalar tail: 3 bytes (1 pixel) at a time
+        cmp             w2, #3
+        b.lt            4f
+5:
+        ldrb            w4, [x0, #1]
+        ldrb            w5, [x0, #2]
+        ldrb            w3, [x0], #3
+        subs            w2, w2, #3
+        strb            w4, [x1, #1]
+        strb            w3, [x1, #2]
+        strb            w5, [x1], #3
+        b.gt            5b
+4:
+        ret
+endfunc
+
 // void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
 //                               uint8_t *dest, int width, int height,
 //                               int src1Stride, int src2Stride, int dstStride);
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
