The branch, master has been updated
       via  6f9e8a599dd94911cbc6713b53ae9bf01701c35c (commit)
       via  ca2a88c1b3f31417cda689bdb9b2ae2c9f607ca6 (commit)
       via  49477972b7175284663c9ef4124345c71dc9c7a1 (commit)
      from  5929d46f7bd8a19b40dbb266161a2146beed3afb (commit)

- Log ----------------------------------------------------------------- commit 6f9e8a599dd94911cbc6713b53ae9bf01701c35c Author: Dash Santosh <santda...@gmail.com> AuthorDate: Mon Aug 11 10:13:19 2025 +0530 Commit: Martin Storsjö <mar...@martin.st> CommitDate: Tue Aug 12 09:05:00 2025 +0000 checkasm/swscale: fix whitespace issues diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 0306f02695..52e3ebf75c 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -159,7 +159,7 @@ static void check_yuv2yuv1(int accurate) (int) dst0[fail_offset], (int) dst1[fail_offset]); } - if(dstW == LARGEST_INPUT_SIZE) + if (dstW == LARGEST_INPUT_SIZE) bench_new(src_pixels, dst1, dstW, dither, offset); } } @@ -266,7 +266,7 @@ static void check_yuv2yuvX(int accurate, int bit_depth, int dst_pix_format) show_differences_16(dst0, dst1, LARGEST_INPUT_SIZE); } } - if(dstW == LARGEST_INPUT_SIZE) + if (dstW == LARGEST_INPUT_SIZE) bench_new(filter, filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi); } commit ca2a88c1b3f31417cda689bdb9b2ae2c9f607ca6 Author: Dash Santosh <santda...@gmail.com> AuthorDate: Mon Aug 11 10:10:53 2025 +0530 Commit: Martin Storsjö <mar...@martin.st> CommitDate: Tue Aug 12 09:05:00 2025 +0000 swscale/output: Implement yuv2nv12cx neon assembly yuv2nv12cX_2_512_accurate_c: 3540.1 ( 1.00x) yuv2nv12cX_2_512_accurate_neon: 408.0 ( 8.68x) yuv2nv12cX_2_512_approximate_c: 3521.4 ( 1.00x) yuv2nv12cX_2_512_approximate_neon: 409.2 ( 8.61x) yuv2nv12cX_4_512_accurate_c: 4740.0 ( 1.00x) yuv2nv12cX_4_512_accurate_neon: 604.4 ( 7.84x) yuv2nv12cX_4_512_approximate_c: 4681.9 ( 1.00x) yuv2nv12cX_4_512_approximate_neon: 603.3 ( 7.76x) yuv2nv12cX_8_512_accurate_c: 7273.1 ( 1.00x) yuv2nv12cX_8_512_accurate_neon: 1012.2 ( 7.19x) yuv2nv12cX_8_512_approximate_c: 7223.0 ( 1.00x) yuv2nv12cX_8_512_approximate_neon: 1015.8 ( 7.11x) yuv2nv12cX_16_512_accurate_c: 13762.0 ( 1.00x) yuv2nv12cX_16_512_accurate_neon: 1761.4 ( 7.81x) 
yuv2nv12cX_16_512_approximate_c: 13884.0 ( 1.00x) yuv2nv12cX_16_512_approximate_neon: 1766.8 ( 7.86x) Benchmarked on: Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Oryon(TM) CPU 3417 Mhz, 12 Core(s), 12 Logical Processor(s) diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S index 4945633856..a650d72f54 100644 --- a/libswscale/aarch64/output.S +++ b/libswscale/aarch64/output.S @@ -402,3 +402,230 @@ function ff_yuv2plane1_8_neon, export=1 b.gt 2b // loop until width consumed ret endfunc + +function ff_yuv2nv12cX_neon_asm, export=1 +// w0 - isSwapped +// x1 - uint8_t *chrDither +// x2 - int16_t *chrFilter +// x3 - int chrFilterSize +// x4 - int16_t **chrUSrc +// x5 - int16_t **chrVSrc +// x6 - uint8_t *dest +// x7 - int chrDstW + + stp x19, x20, [sp, #-32]! + stp x21, x22, [sp, #16] + + ld1 {v0.8b}, [x1] // chrDither[0..7] + ext v1.8b, v0.8b, v0.8b, #3 // Rotate for V: (i+3)&7 + + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + + ushll v2.4s, v0.4h, #12 // U dither low + ushll2 v3.4s, v0.8h, #12 // U dither high + ushll v4.4s, v1.4h, #12 // V dither low + ushll2 v5.4s, v1.8h, #12 // V dither high + + mov x8, #0 // i = 0 +1: + cmp w7, #16 + blt 5f + + mov v16.16b, v2.16b // U acc low + mov v17.16b, v3.16b // U acc high + mov v18.16b, v4.16b // V acc low + mov v19.16b, v5.16b // V acc high + + mov v20.16b, v2.16b + mov v21.16b, v3.16b + mov v22.16b, v4.16b + mov v23.16b, v5.16b + + mov w9, w3 // chrFilterSize counter + mov x10, x2 // chrFilter pointer + mov x11, x4 // chrUSrc base + mov x12, x5 // chrVSrc base + +2: + ldr h6, [x10], #2 // Load filter coefficient + + ldr x13, [x11], #8 // chrUSrc[j] + ldr x14, [x12], #8 // chrVSrc[j] + add x13, x13, x8, lsl #1 // &chrUSrc[j][i] + add x14, x14, x8, lsl #1 // &chrVSrc[j][i] + add x15, x13, #16 + add x16, x14, #16 + + ld1 {v24.8h}, [x13] // U samples 0-7 + ld1 {v25.8h}, [x14] // V samples 0-7 + + ld1 {v26.8h}, [x15] // U samples 8-15 + ld1 {v27.8h}, [x16] // V samples 8-15 + subs w9, w9, #1 + + smlal v16.4s, 
v24.4h, v6.h[0] + smlal2 v17.4s, v24.8h, v6.h[0] + smlal v18.4s, v25.4h, v6.h[0] + smlal2 v19.4s, v25.8h, v6.h[0] + + smlal v20.4s, v26.4h, v6.h[0] + smlal2 v21.4s, v26.8h, v6.h[0] + smlal v22.4s, v27.4h, v6.h[0] + smlal2 v23.4s, v27.8h, v6.h[0] + + b.gt 2b + + sqshrun v24.4h, v16.4s, #16 // Process and store first 8 pixels + sqshrun2 v24.8h, v17.4s, #16 + sqshrun v25.4h, v18.4s, #16 + sqshrun2 v25.8h, v19.4s, #16 + + sqshrun v26.4h, v20.4s, #16 // Process and store next 8 pixels + sqshrun2 v26.8h, v21.4s, #16 + sqshrun v27.4h, v22.4s, #16 + sqshrun2 v27.8h, v23.4s, #16 + + cbz w0, 3f + + uqshrn v28.8b, v24.8h, #3 // Storing U + uqshrn2 v28.16b, v26.8h, #3 + uqshrn v29.8b, v25.8h, #3 // Storing V + uqshrn2 v29.16b, v27.8h, #3 + + st2 {v28.16b, v29.16b}, [x6], #32 + b 4f +3: + uqshrn v28.8b, v25.8h, #3 // Storing V + uqshrn2 v28.16b, v27.8h, #3 + uqshrn v29.8b, v24.8h, #3 // Storing U + uqshrn2 v29.16b, v26.8h, #3 + + st2 {v28.16b, v29.16b}, [x6], #32 +4: + subs w7, w7, #16 + add x8, x8, #16 + b.gt 1b + +5: + cmp w7, #8 + blt 10f +6: + mov v16.16b, v2.16b // U acc low + mov v17.16b, v3.16b // U acc high + mov v18.16b, v4.16b // V acc low + mov v19.16b, v5.16b // V acc high + + mov w9, w3 // chrFilterSize counter + mov x10, x2 // chrFilter pointer + mov x11, x4 // chrUSrc base + mov x12, x5 // chrVSrc base + +7: + ldr h6, [x10], #2 // Load filter coefficient + + ldr x13, [x11], #8 // chrUSrc[j] + ldr x14, [x12], #8 // chrVSrc[j] + add x13, x13, x8, lsl #1 // &chrUSrc[j][i] + add x14, x14, x8, lsl #1 // &chrVSrc[j][i] + + ld1 {v20.8h}, [x13] // U samples + ld1 {v21.8h}, [x14] // V samples + subs w9, w9, #1 + + smlal v16.4s, v20.4h, v6.h[0] + smlal2 v17.4s, v20.8h, v6.h[0] + smlal v18.4s, v21.4h, v6.h[0] + smlal2 v19.4s, v21.8h, v6.h[0] + + b.gt 7b + + sqshrun v26.4h, v16.4s, #16 // Final processing and store + sqshrun2 v26.8h, v17.4s, #16 + sqshrun v27.4h, v18.4s, #16 + sqshrun2 v27.8h, v19.4s, #16 + + cbz w0, 8f + uqshrn v28.8b, v26.8h, #3 // Storing U + uqshrn 
v29.8b, v27.8h, #3 // Storing V + st2 {v28.8b, v29.8b}, [x6], #16 + b 9f +8: + uqshrn v28.8b, v27.8h, #3 // Storing V + uqshrn v29.8b, v26.8h, #3 // Storing U + st2 {v28.8b, v29.8b}, [x6], #16 +9: + subs w7, w7, #8 + add x8, x8, #8 + +10: + cbz w7, 15f // Scalar loop + +11: + and x15, x8, #7 + ldrb w9, [x1, x15] + sxtw x9, w9 + lsl x9, x9, #12 // u = chrDither[i & 7] << 12; + + add x15, x8, #3 + and x15, x15, #7 + ldrb w10, [x1, x15] + sxtw x10, w10 + lsl x10, x10, #12 // v = chrDither[(i + 3) & 7] << 12; + + mov w11, w3 // chrFilterSize counter + mov x12, x2 // chrFilter pointer + mov x13, x4 // chrUSrc base + mov x14, x5 // chrVSrc base + +12: + ldrsh x16, [x12], #2 + + ldr x17, [x13], #8 // chrUSrc[j] + ldr x19, [x14], #8 // chrVSrc[j] + add x17, x17, x8, lsl #1 // &chrUSrc[j][i] + add x19, x19, x8, lsl #1 // &chrVSrc[j][i] + + ldrsh x20, [x17] + ldrsh x21, [x19] + + madd x9, x16, x20, x9 + madd x10, x16, x21, x10 + + subs w11, w11, #1 + b.gt 12b + + asr x9, x9, #19 // Process and store U and V + asr x10, x10, #19 + + cmp x9, #0 + csel x9, x9, xzr, ge + cmp x10, #0 + csel x10, x10, xzr, ge + + mov x22, #1 + lsl x22, x22, #8 + sub x22, x22, #1 + + cmp x9, x22 + csel x9, x22, x9, gt + cmp x10, x22 + csel x10, x22, x10, gt + + cbz w0, 13f + strb w9, [x6], #1 // Storing U + strb w10, [x6], #1 // Storing V + b 14f +13: + strb w10, [x6], #1 // Storing V + strb w9, [x6], #1 // Storing U + +14: + subs w7, w7, #1 + add x8, x8, #1 + b.gt 11b +15: + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #32 + ret +endfunc diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 6fd4cc7265..55fff03a5a 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -191,6 +191,25 @@ void ff_yuv2plane1_8_neon( const uint8_t *dither, int offset); +void ff_yuv2nv12cX_neon_asm(int isSwapped, const uint8_t *chrDither, + const int16_t *chrFilter, int chrFilterSize, + const int16_t **chrUSrc, const int16_t **chrVSrc, + uint8_t *dest, int chrDstW); + 
+static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither, + const int16_t *chrFilter, int chrFilterSize, + const int16_t **chrUSrc, const int16_t **chrVSrc, + uint8_t *dest, int chrDstW) +{ + if (!isSwappedChroma(dstFormat)) { + ff_yuv2nv12cX_neon_asm(1, chrDither, chrFilter, chrFilterSize, + chrUSrc, chrVSrc, dest, chrDstW); + } else { + ff_yuv2nv12cX_neon_asm(0, chrDither, chrFilter, chrFilterSize, + chrUSrc, chrVSrc, dest, chrDstW); + } +} + #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do { \ if (c->srcBpc == 8) { \ if(c->dstBpc <= 14) { \ @@ -300,6 +319,8 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon); if (c->dstBpc == 8) { c->yuv2planeX = ff_yuv2planeX_8_neon; + if (isSemiPlanarYUV(dstFormat) && !isDataInHighBits(dstFormat)) + c->yuv2nv12cX = ff_yuv2nv12cX_neon; } if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) && !isDataInHighBits(dstFormat)) { commit 49477972b7175284663c9ef4124345c71dc9c7a1 Author: Logaprakash Ramajayam <logaprakash.ramaja...@multicorewareinc.com> AuthorDate: Tue Jul 1 23:48:36 2025 -0700 Commit: Martin Storsjö <mar...@martin.st> CommitDate: Tue Aug 12 09:05:00 2025 +0000 swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template() yuv2yuvX_8_2_0_512_accurate_c: 2213.4 ( 1.00x) yuv2yuvX_8_2_0_512_accurate_neon: 147.5 (15.01x) yuv2yuvX_8_2_0_512_approximate_c: 2203.9 ( 1.00x) yuv2yuvX_8_2_0_512_approximate_neon: 154.1 (14.30x) yuv2yuvX_8_2_16_512_accurate_c: 2147.2 ( 1.00x) yuv2yuvX_8_2_16_512_accurate_neon: 150.8 (14.24x) yuv2yuvX_8_2_16_512_approximate_c: 2149.7 ( 1.00x) yuv2yuvX_8_2_16_512_approximate_neon: 146.8 (14.64x) yuv2yuvX_8_2_32_512_accurate_c: 2078.9 ( 1.00x) yuv2yuvX_8_2_32_512_accurate_neon: 139.0 (14.95x) yuv2yuvX_8_2_32_512_approximate_c: 2083.7 ( 1.00x) yuv2yuvX_8_2_32_512_approximate_neon: 140.5 (14.84x) yuv2yuvX_8_2_48_512_accurate_c: 2010.7 ( 1.00x) yuv2yuvX_8_2_48_512_accurate_neon: 138.2 (14.55x) 
yuv2yuvX_8_2_48_512_approximate_c: 2012.6 ( 1.00x) yuv2yuvX_8_2_48_512_approximate_neon: 141.2 (14.26x) yuv2yuvX_10LE_16_0_512_accurate_c: 7874.1 ( 1.00x) yuv2yuvX_10LE_16_0_512_accurate_neon: 831.6 ( 9.47x) yuv2yuvX_10LE_16_0_512_approximate_c: 7918.1 ( 1.00x) yuv2yuvX_10LE_16_0_512_approximate_neon: 836.1 ( 9.47x) yuv2yuvX_10LE_16_16_512_accurate_c: 7630.9 ( 1.00x) yuv2yuvX_10LE_16_16_512_accurate_neon: 804.5 ( 9.49x) yuv2yuvX_10LE_16_16_512_approximate_c: 7724.7 ( 1.00x) yuv2yuvX_10LE_16_16_512_approximate_neon: 808.6 ( 9.55x) yuv2yuvX_10LE_16_32_512_accurate_c: 7436.4 ( 1.00x) yuv2yuvX_10LE_16_32_512_accurate_neon: 780.4 ( 9.53x) yuv2yuvX_10LE_16_32_512_approximate_c: 7366.7 ( 1.00x) yuv2yuvX_10LE_16_32_512_approximate_neon: 780.5 ( 9.44x) yuv2yuvX_10LE_16_48_512_accurate_c: 7099.9 ( 1.00x) yuv2yuvX_10LE_16_48_512_accurate_neon: 761.0 ( 9.33x) yuv2yuvX_10LE_16_48_512_approximate_c: 7097.6 ( 1.00x) yuv2yuvX_10LE_16_48_512_approximate_neon: 754.6 ( 9.41x) Benchmarked on: Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Oryon(TM) CPU 3417 Mhz, 12 Core(s), 12 Logical Processor(s) diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S index 190c438870..4945633856 100644 --- a/libswscale/aarch64/output.S +++ b/libswscale/aarch64/output.S @@ -20,6 +20,182 @@ #include "libavutil/aarch64/asm.S" +function ff_yuv2planeX_10_neon, export=1 +// x0 = filter (int16_t*) +// w1 = filterSize +// x2 = src (int16_t**) +// x3 = dest (uint16_t*) +// w4 = dstW +// w5 = big_endian +// w6 = output_bits + + mov w8, #27 + sub w8, w8, w6 // shift = 11 + 16 - output_bits + + sub w9, w8, #1 + mov w10, #1 + lsl w9, w10, w9 // val = 1 << (shift - 1) + + dup v1.4s, w9 + dup v2.4s, w9 // Create vectors with val + + neg w16, w8 + dup v20.4s, w16 // Create (-shift) vector for right shift + + mov w10, #1 + lsl w10, w10, w6 + sub w10, w10, #1 // (1U << output_bits) - 1 + dup v21.4s, w10 // Create Clip vector for upper bound + dup v27.8h, w10 + + mov x7, #0 // i = 0 + +1: + cmp w4, #16 
// Process 16-pixels if available + blt 4f + + mov v3.16b, v1.16b + mov v4.16b, v2.16b + mov v5.16b, v1.16b + mov v6.16b, v2.16b + + mov w11, w1 // tmpfilterSize = filterSize + mov x12, x2 // srcp = src + mov x13, x0 // filterp = filter + +2: // Filter loop + ldp x14, x15, [x12], #16 // get 2 pointers: src[j] and src[j+1] + ldr s7, [x13], #4 // load filter coefficients + add x14, x14, x7, lsl #1 + add x15, x15, x7, lsl #1 + ld1 {v16.8h, v17.8h}, [x14] + ld1 {v18.8h, v19.8h}, [x15] + + subs w11, w11, #2 // tmpfilterSize -= 2 + + smlal v3.4s, v16.4h, v7.h[0] // Multiply-accumulate + smlal2 v4.4s, v16.8h, v7.h[0] + smlal v5.4s, v17.4h, v7.h[0] + smlal2 v6.4s, v17.8h, v7.h[0] + + smlal v3.4s, v18.4h, v7.h[1] + smlal2 v4.4s, v18.8h, v7.h[1] + smlal v5.4s, v19.4h, v7.h[1] + smlal2 v6.4s, v19.8h, v7.h[1] + + b.gt 2b // continue filter loop + + sshl v3.4s, v3.4s, v20.4s // Shift results + sshl v4.4s, v4.4s, v20.4s + sshl v5.4s, v5.4s, v20.4s + sshl v6.4s, v6.4s, v20.4s + + sqxtun v23.4h, v3.4s // Narrow and clamp to 0 + sqxtun2 v23.8h, v4.4s + sqxtun v24.4h, v5.4s + sqxtun2 v24.8h, v6.4s + + umin v23.8h, v23.8h, v27.8h + umin v24.8h, v24.8h, v27.8h + + cbz w5, 3f // Check if big endian + rev16 v23.16b, v23.16b + rev16 v24.16b, v24.16b // Swap bits for big endian +3: + st1 {v23.8h, v24.8h}, [x3], #32 + + subs w4, w4, #16 // dstW = dstW - 16 + add x7, x7, #16 // i = i + 16 + b 1b // Continue loop + +4: + cmp w4, #8 // Process 8-pixels if available + blt 8f +5: + mov v3.16b, v1.16b + mov v4.16b, v2.16b + + mov w11, w1 // tmpfilterSize = filterSize + mov x12, x2 // srcp = src + mov x13, x0 // filterp = filter + +6: // Filter loop + ldp x14, x15, [x12], #16 + ldr s7, [x13], #4 + add x14, x14, x7, lsl #1 + add x15, x15, x7, lsl #1 + ld1 {v5.8h}, [x14] + ld1 {v6.8h}, [x15] + + subs w11, w11, #2 // tmpfilterSize -= 2 + + smlal v3.4s, v5.4h, v7.h[0] // Multiply-accumulate + smlal2 v4.4s, v5.8h, v7.h[0] + smlal v3.4s, v6.4h, v7.h[1] + smlal2 v4.4s, v6.8h, v7.h[1] + + b.gt 6b // loop 
until filterSize consumed + + sshl v3.4s, v3.4s, v20.4s // Shift results + sshl v4.4s, v4.4s, v20.4s + + sqxtun v25.4h, v3.4s // Narrow and clamp to 0 + sqxtun2 v25.8h, v4.4s + + umin v25.8h, v25.8h, v27.8h + + cbz w5, 7f // Check if big endian + rev16 v25.16b, v25.16b + +7: + st1 {v25.8h}, [x3], #16 // Store 8 pixels + + subs w4, w4, #8 // dstW = dstW - 8 + add x7, x7, #8 // i = i + 8 +8: + cbz w4, 12f // Scalar loop for remaining pixels +9: + mov w11, w1 // tmpfilterSize = filterSize + mov x12, x2 // srcp = src + mov x13, x0 // filterp = filter + sxtw x9, w9 + mov x17, x9 + +10: // Filter loop + ldr x14, [x12], #8 // Load src pointer + ldrsh w15, [x13], #2 // Load filter coefficient + add x14, x14, x7, lsl #1 // Add pixel offset + ldrh w16, [x14] + + sxtw x16, w16 + sxtw x15, w15 + madd x17, x16, x15, x17 + + subs w11, w11, #1 // tmpfilterSize -= 1 + b.gt 10b // loop until filterSize consumed + + sxtw x8, w8 + asr x17, x17, x8 + cmp x17, #0 + csel x17, x17, xzr, ge // Clamp to 0 if negative + + sxtw x10, w10 + cmp x17, x10 + csel x17, x10, x17, gt // Clamp to max if greater than max + + cbz w5, 11f // Check if big endian + rev16 x17, x17 // Swap bits for big endian +11: + strh w17, [x3], #2 + + subs w4, w4, #1 // dstW = dstW - 1 + add x7, x7, #1 // i = i + 1 + b.gt 9b // Loop if more pixels + +12: + ret +endfunc + function ff_yuv2planeX_8_neon, export=1 // x0 - const int16_t *filter, // x1 - int filterSize, diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 6e5a721c1f..6fd4cc7265 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ ALL_SCALE_FUNCS(neon); +void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits); + +#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \ +static void yuv2planeX_ ## bits ## BE_LE ## 
_neon(const int16_t *filter, int filterSize, \ + const int16_t **src, uint8_t *dest, int dstW, \ + const uint8_t *dither, int offset) \ +{ \ + ff_yuv2planeX_## template_size ## _neon(filter, \ + filterSize, (const typeX_t **) src, \ + (uint16_t *) dest, dstW, is_be, bits); \ +} + +yuv2NBPS( 9, BE, 1, 10, int16_t) +yuv2NBPS( 9, LE, 0, 10, int16_t) +yuv2NBPS(10, BE, 1, 10, int16_t) +yuv2NBPS(10, LE, 0, 10, int16_t) +yuv2NBPS(12, BE, 1, 10, int16_t) +yuv2NBPS(12, LE, 0, 10, int16_t) +yuv2NBPS(14, BE, 1, 10, int16_t) +yuv2NBPS(14, LE, 0, 10, int16_t) + void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset); @@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) { int cpu_flags = av_get_cpu_flags(); + enum AVPixelFormat dstFormat = c->opts.dst_format; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat); if (have_neon(cpu_flags)) { ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon); @@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) if (c->dstBpc == 8) { c->yuv2planeX = ff_yuv2planeX_8_neon; } + + if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) && !isDataInHighBits(dstFormat)) { + if (desc->comp[0].depth == 9) { + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon : yuv2planeX_9LE_neon; + } else if (desc->comp[0].depth == 10) { + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon : yuv2planeX_10LE_neon; + } else if (desc->comp[0].depth == 12) { + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon : yuv2planeX_12LE_neon; + } else if (desc->comp[0].depth == 14) { + c->yuv2planeX = isBE(dstFormat) ? 
yuv2planeX_14BE_neon : yuv2planeX_14LE_neon; + } else + av_assert0(0); + } switch (c->opts.src_format) { case AV_PIX_FMT_ABGR: c->lumToYV12 = ff_abgr32ToY_neon; diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 051b2bb4bf..0306f02695 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -52,50 +52,59 @@ static void yuv2planeX_8_ref(const int16_t *filter, int filterSize, } } -static int cmp_off_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy) -{ - for (size_t i = 0; i < n; i++) { - if (abs(ref[i] - test[i]) > accuracy) - return 1; - } - return 0; +#define CMP_FUNC(bits) \ +static int cmp_off_by_n_##bits(const uint##bits##_t *ref, const uint##bits##_t *test, \ + size_t n, int accuracy) \ +{ \ + for (size_t i = 0; i < n; i++) { \ + if (abs((int)ref[i] - (int)test[i]) > accuracy) \ + return 1; \ + } \ + return 0; \ } -static void print_data(uint8_t *p, size_t len, size_t offset) -{ - size_t i = 0; - for (; i < len; i++) { - if (i % 8 == 0) { - printf("0x%04zx: ", i+offset); - } - printf("0x%02x ", (uint32_t) p[i]); - if (i % 8 == 7) { - printf("\n"); - } - } - if (i % 8 != 0) { - printf("\n"); - } +CMP_FUNC(8) +CMP_FUNC(16) + +#define SHOW_DIFF_FUNC(bits) \ +static void print_data_##bits(const uint##bits##_t *p, size_t len, size_t offset) \ +{ \ + size_t i = 0; \ + for (; i < len; i++) { \ + if (i % 8 == 0) { \ + printf("0x%04zx: ", i+offset); \ + } \ + printf("0x%02x ", (uint32_t) p[i]); \ + if (i % 8 == 7) { \ + printf("\n"); \ + } \ + } \ + if (i % 8 != 0) { \ + printf("\n"); \ + } \ +} \ +static size_t show_differences_##bits(const uint##bits##_t *a, const uint##bits##_t *b, \ + size_t len) \ +{ \ + for (size_t i = 0; i < len; i++) { \ + if (a[i] != b[i]) { \ + size_t offset_of_mismatch = i; \ + size_t offset; \ + if (i >= 8) i-=8; \ + offset = i & (~7); \ + printf("test a:\n"); \ + print_data_##bits(&a[offset], 32, offset); \ + printf("\ntest b:\n"); \ + print_data_##bits(&b[offset], 32, offset); \ + 
printf("\n"); \ + return offset_of_mismatch; \ + } \ + } \ + return len; \ } -static size_t show_differences(uint8_t *a, uint8_t *b, size_t len) -{ - for (size_t i = 0; i < len; i++) { - if (a[i] != b[i]) { - size_t offset_of_mismatch = i; - size_t offset; - if (i >= 8) i-=8; - offset = i & (~7); - printf("test a:\n"); - print_data(&a[offset], 32, offset); - printf("\ntest b:\n"); - print_data(&b[offset], 32, offset); - printf("\n"); - return offset_of_mismatch; - } - } - return len; -} +SHOW_DIFF_FUNC(8) +SHOW_DIFF_FUNC(16) static void check_yuv2yuv1(int accurate) { @@ -140,10 +149,10 @@ static void check_yuv2yuv1(int accurate) call_ref(src_pixels, dst0, dstW, dither, offset); call_new(src_pixels, dst1, dstW, dither, offset); - if (cmp_off_by_n(dst0, dst1, dstW * sizeof(dst0[0]), accurate ? 0 : 2)) { + if (cmp_off_by_n_8(dst0, dst1, dstW * sizeof(dst0[0]), accurate ? 0 : 2)) { fail(); printf("failed: yuv2yuv1_%d_%di_%s\n", offset, dstW, accurate_str); - fail_offset = show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0])); + fail_offset = show_differences_8(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0])); printf("failing values: src: 0x%04x dither: 0x%02x dst-c: %02x dst-asm: %02x\n", (int) src_pixels[fail_offset], (int) dither[(fail_offset + fail_offset) & 7], @@ -158,7 +167,7 @@ static void check_yuv2yuv1(int accurate) sws_freeContext(sws); } -static void check_yuv2yuvX(int accurate) +static void check_yuv2yuvX(int accurate, int bit_depth, int dst_pix_format) { SwsContext *sws; SwsInternal *c; @@ -179,8 +188,8 @@ static void check_yuv2yuvX(int accurate) const int16_t **src; LOCAL_ALIGNED_16(int16_t, src_pixels, [LARGEST_FILTER * LARGEST_INPUT_SIZE]); LOCAL_ALIGNED_16(int16_t, filter_coeff, [LARGEST_FILTER]); - LOCAL_ALIGNED_16(uint8_t, dst0, [LARGEST_INPUT_SIZE]); - LOCAL_ALIGNED_16(uint8_t, dst1, [LARGEST_INPUT_SIZE]); + LOCAL_ALIGNED_16(uint16_t, dst0, [LARGEST_INPUT_SIZE]); + LOCAL_ALIGNED_16(uint16_t, dst1, [LARGEST_INPUT_SIZE]); 
LOCAL_ALIGNED_16(uint8_t, dither, [LARGEST_INPUT_SIZE]); union VFilterData{ const int16_t *src; @@ -190,12 +199,14 @@ static void check_yuv2yuvX(int accurate) memset(dither, d_val, LARGEST_INPUT_SIZE); randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE * sizeof(int16_t)); sws = sws_alloc_context(); + sws->dst_format = dst_pix_format; if (accurate) sws->flags |= SWS_ACCURATE_RND; if (sws_init_context(sws, NULL, NULL) < 0) fail(); c = sws_internal(sws); + c->dstBpc = bit_depth; ff_sws_init_scale(c); for(isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); ++isi){ dstW = input_sizes[isi]; @@ -227,24 +238,36 @@ static void check_yuv2yuvX(int accurate) for(j = 0; j < 4; ++j) vFilterData[i].coeff[j + 4] = filter_coeff[i]; } - if (check_func(c->yuv2planeX, "yuv2yuvX_%d_%d_%d_%s", filter_sizes[fsi], osi, dstW, accurate_str)){ + if (check_func(c->yuv2planeX, "yuv2yuvX_%d%s_%d_%d_%d_%s", bit_depth, (bit_depth == 8) ? "" : (isBE(dst_pix_format) ? "BE" : "LE"), filter_sizes[fsi], osi, dstW, accurate_str)) { // use vFilterData for the mmx function const int16_t *filter = c->use_mmx_vfilter ? (const int16_t*)vFilterData : &filter_coeff[0]; memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0])); memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0])); - // We can't use call_ref here, because we don't know if use_mmx_vfilter was set for that - // function or not, so we can't pass it the parameters correctly. - yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi); - - call_new(filter, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi); - if (cmp_off_by_n(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]), accurate ? 
0 : 2)) { - fail(); - printf("failed: yuv2yuvX_%d_%d_%d_%s\n", filter_sizes[fsi], osi, dstW, accurate_str); - show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0])); + if (c->dstBpc == 8) { + // We can't use call_ref here, because we don't know if use_mmx_vfilter was set for that + // function or not, so we can't pass it the parameters correctly. + + yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, (uint8_t*)dst0, dstW - osi, dither, osi); + call_new(filter, filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi); + + if (cmp_off_by_n_8((uint8_t*)dst0, (uint8_t*)dst1, LARGEST_INPUT_SIZE, accurate ? 0 : 2)) { + fail(); + printf("failed: yuv2yuvX_%d_%d_%d_%d_%s\n", bit_depth, filter_sizes[fsi], osi, dstW, accurate_str); + show_differences_8((uint8_t*)dst0, (uint8_t*)dst1, LARGEST_INPUT_SIZE); + } + } else { + call_ref(&filter_coeff[0], filter_sizes[fsi], src, (uint8_t*)dst0, dstW - osi, dither, osi); + call_new(&filter_coeff[0], filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi); + + if (cmp_off_by_n_16(dst0, dst1, LARGEST_INPUT_SIZE, accurate ? 0 : 2)) { + fail(); + printf("failed: yuv2yuvX_%d%s_%d_%d_%d_%s\n", bit_depth, isBE(dst_pix_format) ? "BE" : "LE", filter_sizes[fsi], osi, dstW, accurate_str); + show_differences_16(dst0, dst1, LARGEST_INPUT_SIZE); + } } if(dstW == LARGEST_INPUT_SIZE) - bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi); + bench_new(filter, filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi); } av_freep(&src); @@ -311,10 +334,10 @@ static void check_yuv2nv12cX(int accurate) call_ref(sws->dst_format, dither, &filter_coeff[0], filter_size, srcU, srcV, dst0, dstW); call_new(sws->dst_format, dither, &filter_coeff[0], filter_size, srcU, srcV, dst1, dstW); - if (cmp_off_by_n(dst0, dst1, dstW * 2 * sizeof(dst0[0]), accurate ? 0 : 2)) { + if (cmp_off_by_n_8(dst0, dst1, dstW * 2 * sizeof(dst0[0]), accurate ? 
0 : 2)) { fail(); printf("failed: yuv2nv12wX_%d_%d_%s\n", filter_size, dstW, accurate_str); - show_differences(dst0, dst1, dstW * 2 * sizeof(dst0[0])); + show_differences_8(dst0, dst1, dstW * 2 * sizeof(dst0[0])); } if (dstW == LARGEST_INPUT_SIZE) bench_new(sws->dst_format, dither, &filter_coeff[0], filter_size, srcU, srcV, dst1, dstW); @@ -441,9 +464,33 @@ void checkasm_check_sw_scale(void) check_yuv2yuv1(0); check_yuv2yuv1(1); report("yuv2yuv1"); - check_yuv2yuvX(0); - check_yuv2yuvX(1); - report("yuv2yuvX"); + check_yuv2yuvX(0, 8, AV_PIX_FMT_YUV420P); + check_yuv2yuvX(1, 8, AV_PIX_FMT_YUV420P); + report("yuv2yuvX_8"); + check_yuv2yuvX(0, 9, AV_PIX_FMT_YUV420P9LE); + check_yuv2yuvX(1, 9, AV_PIX_FMT_YUV420P9LE); + report("yuv2yuvX_9LE"); + check_yuv2yuvX(0, 9, AV_PIX_FMT_YUV420P9BE); + check_yuv2yuvX(1, 9, AV_PIX_FMT_YUV420P9BE); + report("yuv2yuvX_9BE"); + check_yuv2yuvX(0, 10, AV_PIX_FMT_YUV420P10LE); + check_yuv2yuvX(1, 10, AV_PIX_FMT_YUV420P10LE); + report("yuv2yuvX_10LE"); + check_yuv2yuvX(0, 10, AV_PIX_FMT_YUV420P10BE); + check_yuv2yuvX(1, 10, AV_PIX_FMT_YUV420P10BE); + report("yuv2yuvX_10BE"); + check_yuv2yuvX(0, 12, AV_PIX_FMT_YUV420P12LE); + check_yuv2yuvX(1, 12, AV_PIX_FMT_YUV420P12LE); + report("yuv2yuvX_12LE"); + check_yuv2yuvX(0, 12, AV_PIX_FMT_YUV420P12BE); + check_yuv2yuvX(1, 12, AV_PIX_FMT_YUV420P12BE); + report("yuv2yuvX_12BE"); + check_yuv2yuvX(0, 14, AV_PIX_FMT_YUV420P14LE); + check_yuv2yuvX(1, 14, AV_PIX_FMT_YUV420P14LE); + report("yuv2yuvX_14LE"); + check_yuv2yuvX(0, 14, AV_PIX_FMT_YUV420P14BE); + check_yuv2yuvX(1, 14, AV_PIX_FMT_YUV420P14BE); + report("yuv2yuvX_14BE"); check_yuv2nv12cX(0); check_yuv2nv12cX(1); report("yuv2nv12cX"); ----------------------------------------------------------------------- Summary of changes: libswscale/aarch64/output.S | 403 +++++++++++++++++++++++++++++++++++++++++++ libswscale/aarch64/swscale.c | 59 +++++++ tests/checkasm/sw_scale.c | 173 ++++++++++++------- 3 files changed, 572 insertions(+), 63 deletions(-) 
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".