UPDATE: dav1d 1.2.0

Brad Smith Fri, 05 May 2023 15:30:43 -0700

Here is an update to dav1d 1.2.0.


Changes for 1.2.0 'Arctic Peregrine Falcon':
-------------------------------------------

1.2.0 is a small release of dav1d, adding more SIMD and fixes

- Improvements on attachments of props and T.35 entries on output pictures
- NEON z1/z3 high bit-depth optimizations and improvements for 8bpc
- SSSE3 z2/z3 8bpc and SSSE3 z1/z3 high bit-depth optimizations
- refmvs.save_tmvs optimizations in SSSE3/AVX2/AVX-512
- AVX-512 optimizations for high bit-depth itx (16x64, 32x64, 64x16, 64x32,
  64x64)
- AVX2 optimizations for 12bpc for 16x32, 32x16, 32x32 itx


Index: Makefile
===================================================================
RCS file: /home/cvs/ports/multimedia/dav1d/Makefile,v
retrieving revision 1.32
diff -u -p -u -p -r1.32 Makefile
--- Makefile    24 Apr 2023 21:06:59 -0000      1.32
+++ Makefile    5 May 2023 20:14:06 -0000
@@ -1,8 +1,7 @@
 COMMENT=       small and fast AV1 decoder
 
-VER=           1.1.0
+VER=           1.2.0
 DISTNAME=      dav1d-${VER}
-REVISION=      0
 CATEGORIES=    multimedia
 MASTER_SITES=  https://downloads.videolan.org/pub/videolan/dav1d/${VER}/
 EXTRACT_SUFX=  .tar.xz
Index: distinfo
===================================================================
RCS file: /home/cvs/ports/multimedia/dav1d/distinfo,v
retrieving revision 1.16
diff -u -p -u -p -r1.16 distinfo
--- distinfo    9 Apr 2023 19:52:58 -0000       1.16
+++ distinfo    5 May 2023 20:14:13 -0000
@@ -1,2 +1,2 @@
-SHA256 (dav1d-1.1.0.tar.xz) = +1eq54dfKMMPs9uuSjaD0n4vkd3gnOXGDCLO+bxY39E=
-SIZE (dav1d-1.1.0.tar.xz) = 845284
+SHA256 (dav1d-1.2.0.tar.xz) = Ixvti8G7KKQdiNprTCwRjehLkuXx1nyv+ht/garqjG4=
+SIZE (dav1d-1.2.0.tar.xz) = 866120
Index: patches/patch-src_arm_64_ipred16_S
===================================================================
RCS file: /home/cvs/ports/multimedia/dav1d/patches/patch-src_arm_64_ipred16_S,v
retrieving revision 1.1
diff -u -p -u -p -r1.1 patch-src_arm_64_ipred16_S
--- patches/patch-src_arm_64_ipred16_S  24 Apr 2023 21:06:59 -0000      1.1
+++ patches/patch-src_arm_64_ipred16_S  5 May 2023 21:20:53 -0000
@@ -386,8 +386,8 @@ Index: src/arm/64/ipred16.S
 +      .popsection
  endfunc
  
- // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -1418,13 +1436,13 @@ function ipred_filter_\bpc\()bpc_neon
+ const padding_mask_buf
+@@ -2445,13 +2463,13 @@ function ipred_filter_\bpc\()bpc_neon
          add             x6,  x6,  w5, uxtw
          ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
          clz             w9,  w3
@@ -404,7 +404,7 @@ Index: src/arm/64/ipred16.S
          sxtl            v18.8h,  v18.8b
          sxtl            v19.8h,  v19.8b
          add             x6,  x0,  x1
-@@ -1698,11 +1716,13 @@ function ipred_filter_\bpc\()bpc_neon
+@@ -2725,11 +2743,13 @@ function ipred_filter_\bpc\()bpc_neon
  9:
          ret
  
@@ -422,7 +422,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  .endm
  
-@@ -1722,11 +1742,11 @@ endfunc
+@@ -2749,11 +2769,11 @@ endfunc
  function pal_pred_16bpc_neon, export=1
          ld1             {v30.8h}, [x2]
          clz             w9,  w4
@@ -437,7 +437,7 @@ Index: src/arm/64/ipred16.S
          br              x6
  40:
          AARCH64_VALID_JUMP_TARGET
-@@ -1895,12 +1915,14 @@ function pal_pred_16bpc_neon, export=1
+@@ -2922,12 +2942,14 @@ function pal_pred_16bpc_neon, export=1
          b.gt            64b
          ret
  
@@ -457,7 +457,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -1911,12 +1933,12 @@ endfunc
+@@ -2938,12 +2960,12 @@ endfunc
  function ipred_cfl_128_16bpc_neon, export=1
          dup             v31.8h,  w7   // bitdepth_max
          clz             w9,  w3
@@ -473,7 +473,7 @@ Index: src/arm/64/ipred16.S
          add             x6,  x0,  x1
          lsl             x1,  x1,  #1
          movi            v30.8h,  #0
-@@ -2048,12 +2070,14 @@ L(ipred_cfl_splat_w16):
+@@ -3075,12 +3097,14 @@ L(ipred_cfl_splat_w16):
          b.gt            1b
          ret
  
@@ -492,7 +492,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2064,12 +2088,12 @@ endfunc
+@@ -3091,12 +3115,12 @@ endfunc
  function ipred_cfl_top_16bpc_neon, export=1
          dup             v31.8h,  w7   // bitdepth_max
          clz             w9,  w3
@@ -508,7 +508,7 @@ Index: src/arm/64/ipred16.S
          add             x6,  x0,  x1
          lsl             x1,  x1,  #1
          movi            v30.8h,  #0
-@@ -2107,11 +2131,13 @@ function ipred_cfl_top_16bpc_neon, export=1
+@@ -3134,11 +3158,13 @@ function ipred_cfl_top_16bpc_neon, export=1
          dup             v0.8h,   v0.h[0]
          b               L(ipred_cfl_splat_w16)
  
@@ -526,7 +526,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2124,15 +2150,15 @@ function ipred_cfl_left_16bpc_neon, export=1
+@@ -3151,15 +3177,15 @@ function ipred_cfl_left_16bpc_neon, export=1
          sub             x2,  x2,  w4, uxtw #1
          clz             w9,  w3
          clz             w8,  w4
@@ -548,7 +548,7 @@ Index: src/arm/64/ipred16.S
          add             x6,  x0,  x1
          lsl             x1,  x1,  #1
          movi            v30.8h,  #0
-@@ -2174,11 +2200,13 @@ L(ipred_cfl_left_h32):
+@@ -3201,11 +3227,13 @@ L(ipred_cfl_left_h32):
          dup             v0.8h,   v0.h[0]
          br              x9
  
@@ -566,7 +566,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2194,16 +2222,15 @@ function ipred_cfl_16bpc_neon, export=1
+@@ -3221,16 +3249,15 @@ function ipred_cfl_16bpc_neon, export=1
          clz             w9,  w3
          clz             w6,  w4
          dup             v16.4s, w8               // width + height
@@ -587,7 +587,7 @@ Index: src/arm/64/ipred16.S
          ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
          dup             v17.4s,  w8              // -ctz(width + height)
          add             x6,  x0,  x1
-@@ -2327,15 +2354,17 @@ L(ipred_cfl_w32):
+@@ -3354,15 +3381,17 @@ L(ipred_cfl_w32):
          dup             v0.8h,   v0.h[0]
          b               L(ipred_cfl_splat_w16)
  
@@ -613,7 +613,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -2344,14 +2373,14 @@ endfunc
+@@ -3371,14 +3400,14 @@ endfunc
  function ipred_cfl_ac_420_16bpc_neon, export=1
          clz             w8,  w5
          lsl             w4,  w4,  #2
@@ -631,7 +631,7 @@ Index: src/arm/64/ipred16.S
          sub             w8,  w6,  w4         // height - h_pad
          rbit            w9,  w5              // rbit(width)
          rbit            w10, w6              // rbit(height)
-@@ -2483,9 +2512,9 @@ L(ipred_cfl_ac_420_w8_hpad):
+@@ -3510,9 +3539,9 @@ L(ipred_cfl_ac_420_w8_hpad):
  
  L(ipred_cfl_ac_420_w16):
          AARCH64_VALID_JUMP_TARGET
@@ -644,7 +644,7 @@ Index: src/arm/64/ipred16.S
          br              x7
  
  L(ipred_cfl_ac_420_w16_wpad0):
-@@ -2662,17 +2691,19 @@ L(ipred_cfl_ac_420_w16_hpad):
+@@ -3689,17 +3718,19 @@ L(ipred_cfl_ac_420_w16_hpad):
          lsl             w6,  w6,  #2
          b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
  
@@ -672,7 +672,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -2681,14 +2712,14 @@ endfunc
+@@ -3708,14 +3739,14 @@ endfunc
  function ipred_cfl_ac_422_16bpc_neon, export=1
          clz             w8,  w5
          lsl             w4,  w4,  #2
@@ -690,7 +690,7 @@ Index: src/arm/64/ipred16.S
          sub             w8,  w6,  w4         // height - h_pad
          rbit            w9,  w5              // rbit(width)
          rbit            w10, w6              // rbit(height)
-@@ -2789,9 +2820,9 @@ L(ipred_cfl_ac_422_w8_wpad):
+@@ -3816,9 +3847,9 @@ L(ipred_cfl_ac_422_w8_wpad):
  
  L(ipred_cfl_ac_422_w16):
          AARCH64_VALID_JUMP_TARGET
@@ -703,7 +703,7 @@ Index: src/arm/64/ipred16.S
          br              x7
  
  L(ipred_cfl_ac_422_w16_wpad0):
-@@ -2910,17 +2941,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
+@@ -3937,17 +3968,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
          mov             v1.16b,  v3.16b
          b               L(ipred_cfl_ac_420_w16_hpad)
  
@@ -731,7 +731,7 @@ Index: src/arm/64/ipred16.S
  endfunc
  
  // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -2929,14 +2962,14 @@ endfunc
+@@ -3956,14 +3989,14 @@ endfunc
  function ipred_cfl_ac_444_16bpc_neon, export=1
          clz             w8,  w5
          lsl             w4,  w4,  #2
@@ -749,7 +749,7 @@ Index: src/arm/64/ipred16.S
          sub             w8,  w6,  w4         // height - h_pad
          rbit            w9,  w5              // rbit(width)
          rbit            w10, w6              // rbit(height)
-@@ -3045,10 +3078,11 @@ L(ipred_cfl_ac_444_w16_wpad):
+@@ -4072,10 +4105,11 @@ L(ipred_cfl_ac_444_w16_wpad):
  
  L(ipred_cfl_ac_444_w32):
          AARCH64_VALID_JUMP_TARGET
@@ -764,7 +764,7 @@ Index: src/arm/64/ipred16.S
          br              x7
  
  L(ipred_cfl_ac_444_w32_wpad0):
-@@ -3163,15 +3197,17 @@ L(ipred_cfl_ac_444_w32_hpad):
+@@ -4190,15 +4224,17 @@ L(ipred_cfl_ac_444_w32_hpad):
          lsl             w6,  w6,  #3
          b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
  
Index: patches/patch-src_arm_64_ipred_S
===================================================================
RCS file: /home/cvs/ports/multimedia/dav1d/patches/patch-src_arm_64_ipred_S,v
retrieving revision 1.1
diff -u -p -u -p -r1.1 patch-src_arm_64_ipred_S
--- patches/patch-src_arm_64_ipred_S    24 Apr 2023 21:06:59 -0000      1.1
+++ patches/patch-src_arm_64_ipred_S    5 May 2023 21:20:55 -0000
@@ -387,7 +387,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  const padding_mask_buf
-@@ -1604,11 +1622,11 @@ endfunc
+@@ -1614,11 +1632,11 @@ endfunc
  //                               const int dx, const int max_base_x);
  function ipred_z1_fill1_8bpc_neon, export=1
          clz             w9,  w3
@@ -402,7 +402,7 @@ Index: src/arm/64/ipred.S
          ld1r            {v31.16b}, [x10]          // padding
          mov             w7,  w5
          mov             w15, #64
-@@ -1767,12 +1785,14 @@ function ipred_z1_fill1_8bpc_neon, export=1
+@@ -1777,12 +1795,14 @@ function ipred_z1_fill1_8bpc_neon, export=1
          mov             w3,  w12
          b               169b
  
@@ -422,7 +422,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  function ipred_z1_fill2_8bpc_neon, export=1
-@@ -1890,11 +1910,11 @@ endconst
+@@ -1900,11 +1920,11 @@ endconst
  function ipred_z3_fill1_8bpc_neon, export=1
          cmp             w6,  #64
          clz             w9,  w3
@@ -437,7 +437,7 @@ Index: src/arm/64/ipred.S
          movrel          x11, increments
          ld1r            {v31.16b}, [x10]          // padding
          ld1             {v30.8h},  [x11]          // increments
-@@ -2229,17 +2249,20 @@ L(ipred_z3_fill1_large_w16):
+@@ -2243,17 +2263,20 @@ L(ipred_z3_fill1_large_h16):
  9:
          ret
  
@@ -464,7 +464,7 @@ Index: src/arm/64/ipred.S
          b.gt            L(ipred_z3_fill_padding_wide)
          // w3 = remaining width, w4 = constant height
          mov             w12, w4
-@@ -2250,8 +2273,7 @@ function ipred_z3_fill_padding_neon, export=0
+@@ -2264,8 +2287,7 @@ function ipred_z3_fill_padding_neon, export=0
          // power of two in the remaining width, and repeating.
          clz             w9,  w3
          sub             w9,  w9,  #25
@@ -474,7 +474,7 @@ Index: src/arm/64/ipred.S
          br              x9
  
  2:
-@@ -2331,13 +2353,15 @@ function ipred_z3_fill_padding_neon, export=0
+@@ -2345,13 +2367,15 @@ function ipred_z3_fill_padding_neon, export=0
  9:
          ret
  
@@ -496,17 +496,7 @@ Index: src/arm/64/ipred.S
  
  L(ipred_z3_fill_padding_wide):
          // Fill a WxH rectangle with padding, with W > 16.
-@@ -2367,7 +2391,8 @@ L(ipred_z3_fill_padding_wide):
- endfunc
- 
- function ipred_z3_fill2_8bpc_neon, export=1
--        adr             x8,  L(ipred_z3_fill1_tbl)
-+        adrp            x8,  L(ipred_z3_fill1_tbl)
-+        add             x8,  x8, :lo12: L(ipred_z3_fill1_tbl)
-         add             x10, x2,  w6,  uxtw       // left[max_base_y]
-         movrel          x11, increments
-         ld1r            {v31.16b}, [x10]          // padding
-@@ -2493,13 +2518,13 @@ function ipred_filter_8bpc_neon, export=1
+@@ -2506,13 +2530,13 @@ function ipred_filter_8bpc_neon, export=1
          add             x6,  x6,  w5, uxtw
          ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
          clz             w9,  w3
@@ -523,7 +513,7 @@ Index: src/arm/64/ipred.S
          sxtl            v18.8h,  v18.8b
          sxtl            v19.8h,  v19.8b
          add             x6,  x0,  x1
-@@ -2640,11 +2665,13 @@ function ipred_filter_8bpc_neon, export=1
+@@ -2653,11 +2677,13 @@ function ipred_filter_8bpc_neon, export=1
  9:
          ret
  
@@ -541,7 +531,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2653,11 +2680,11 @@ endfunc
+@@ -2666,11 +2692,11 @@ endfunc
  function pal_pred_8bpc_neon, export=1
          ld1             {v0.8h}, [x2]
          clz             w9,  w4
@@ -556,7 +546,7 @@ Index: src/arm/64/ipred.S
          add             x2,  x0,  x1
          lsl             x1,  x1,  #1
          br              x6
-@@ -2735,12 +2762,14 @@ function pal_pred_8bpc_neon, export=1
+@@ -2748,12 +2774,14 @@ function pal_pred_8bpc_neon, export=1
          b.gt            64b
          ret
  
@@ -576,7 +566,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2749,12 +2778,12 @@ endfunc
+@@ -2762,12 +2790,12 @@ endfunc
  //                              const int16_t *ac, const int alpha);
  function ipred_cfl_128_8bpc_neon, export=1
          clz             w9,  w3
@@ -592,7 +582,7 @@ Index: src/arm/64/ipred.S
          add             x6,  x0,  x1
          lsl             x1,  x1,  #1
          br              x7
-@@ -2859,12 +2888,14 @@ L(ipred_cfl_splat_w16):
+@@ -2872,12 +2900,14 @@ L(ipred_cfl_splat_w16):
          b.gt            1b
          ret
  
@@ -611,7 +601,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2873,12 +2904,12 @@ endfunc
+@@ -2886,12 +2916,12 @@ endfunc
  //                              const int16_t *ac, const int alpha);
  function ipred_cfl_top_8bpc_neon, export=1
          clz             w9,  w3
@@ -627,7 +617,7 @@ Index: src/arm/64/ipred.S
          add             x6,  x0,  x1
          lsl             x1,  x1,  #1
          br              x7
-@@ -2913,11 +2944,13 @@ function ipred_cfl_top_8bpc_neon, export=1
+@@ -2926,11 +2956,13 @@ function ipred_cfl_top_8bpc_neon, export=1
          dup             v0.8h,   v2.h[0]
          b               L(ipred_cfl_splat_w16)
  
@@ -645,7 +635,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2928,15 +2961,15 @@ function ipred_cfl_left_8bpc_neon, export=1
+@@ -2941,15 +2973,15 @@ function ipred_cfl_left_8bpc_neon, export=1
          sub             x2,  x2,  w4, uxtw
          clz             w9,  w3
          clz             w8,  w4
@@ -667,7 +657,7 @@ Index: src/arm/64/ipred.S
          add             x6,  x0,  x1
          lsl             x1,  x1,  #1
          br              x7
-@@ -2975,11 +3008,13 @@ L(ipred_cfl_left_h32):
+@@ -2988,11 +3020,13 @@ L(ipred_cfl_left_h32):
          dup             v0.8h,   v2.h[0]
          br              x9
  
@@ -685,7 +675,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -2993,16 +3028,15 @@ function ipred_cfl_8bpc_neon, export=1
+@@ -3006,16 +3040,15 @@ function ipred_cfl_8bpc_neon, export=1
          clz             w9,  w3
          clz             w6,  w4
          dup             v16.8h, w8               // width + height
@@ -706,7 +696,7 @@ Index: src/arm/64/ipred.S
          ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
          dup             v17.8h,  w8              // -ctz(width + height)
          add             x6,  x0,  x1
-@@ -3119,15 +3153,17 @@ L(ipred_cfl_w32):
+@@ -3132,15 +3165,17 @@ L(ipred_cfl_w32):
          dup             v0.8h,   v0.h[0]
          b               L(ipred_cfl_splat_w16)
  
@@ -732,7 +722,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -3136,14 +3172,14 @@ endfunc
+@@ -3149,14 +3184,14 @@ endfunc
  function ipred_cfl_ac_420_8bpc_neon, export=1
          clz             w8,  w5
          lsl             w4,  w4,  #2
@@ -750,7 +740,7 @@ Index: src/arm/64/ipred.S
          sub             w8,  w6,  w4         // height - h_pad
          rbit            w9,  w5              // rbit(width)
          rbit            w10, w6              // rbit(height)
-@@ -3282,9 +3318,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
+@@ -3295,9 +3330,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
  
  L(ipred_cfl_ac_420_w16):
          AARCH64_VALID_JUMP_TARGET
@@ -763,7 +753,7 @@ Index: src/arm/64/ipred.S
          br              x7
  
  L(ipred_cfl_ac_420_w16_wpad0):
-@@ -3441,17 +3477,19 @@ L(ipred_cfl_ac_420_w16_hpad):
+@@ -3454,17 +3489,19 @@ L(ipred_cfl_ac_420_w16_hpad):
          lsl             w6,  w6,  #1
          b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
  
@@ -791,7 +781,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -3460,14 +3498,14 @@ endfunc
+@@ -3473,14 +3510,14 @@ endfunc
  function ipred_cfl_ac_422_8bpc_neon, export=1
          clz             w8,  w5
          lsl             w4,  w4,  #2
@@ -809,7 +799,7 @@ Index: src/arm/64/ipred.S
          sub             w8,  w6,  w4         // height - h_pad
          rbit            w9,  w5              // rbit(width)
          rbit            w10, w6              // rbit(height)
-@@ -3558,9 +3596,9 @@ L(ipred_cfl_ac_422_w8_wpad):
+@@ -3571,9 +3608,9 @@ L(ipred_cfl_ac_422_w8_wpad):
  
  L(ipred_cfl_ac_422_w16):
          AARCH64_VALID_JUMP_TARGET
@@ -822,7 +812,7 @@ Index: src/arm/64/ipred.S
          br              x7
  
  L(ipred_cfl_ac_422_w16_wpad0):
-@@ -3663,17 +3701,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
+@@ -3676,17 +3713,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
          mov             v1.16b,  v3.16b
          b               L(ipred_cfl_ac_420_w16_hpad)
  
@@ -850,7 +840,7 @@ Index: src/arm/64/ipred.S
  endfunc
  
  // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -3682,14 +3722,14 @@ endfunc
+@@ -3695,14 +3734,14 @@ endfunc
  function ipred_cfl_ac_444_8bpc_neon, export=1
          clz             w8,  w5
          lsl             w4,  w4,  #2
@@ -868,7 +858,7 @@ Index: src/arm/64/ipred.S
          sub             w8,  w6,  w4         // height - h_pad
          rbit            w9,  w5              // rbit(width)
          rbit            w10, w6              // rbit(height)
-@@ -3810,9 +3850,10 @@ L(ipred_cfl_ac_444_w16_wpad):
+@@ -3823,9 +3862,10 @@ L(ipred_cfl_ac_444_w16_wpad):
  
  L(ipred_cfl_ac_444_w32):
          AARCH64_VALID_JUMP_TARGET
@@ -882,7 +872,7 @@ Index: src/arm/64/ipred.S
          br              x7
  
  L(ipred_cfl_ac_444_w32_wpad0):
-@@ -3958,15 +3999,17 @@ L(ipred_cfl_ac_444_w32_hpad):
+@@ -3971,15 +4011,17 @@ L(ipred_cfl_ac_444_w32_hpad):
          dup             v4.8h,   v4.h[0]
          b               L(ipred_cfl_ac_420_w8_subtract_dc)
  
Index: patches/patch-src_thread_task_c
===================================================================
RCS file: patches/patch-src_thread_task_c
diff -N patches/patch-src_thread_task_c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_thread_task_c     5 May 2023 22:24:18 -0000
@@ -0,0 +1,28 @@
+threading: Fix a race on task_thread.init_done
+
+Index: src/thread_task.c
+--- src/thread_task.c.orig
++++ src/thread_task.c
+@@ -327,6 +327,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *co
+         f->task_thread.pending_tasks.tail->next = &tasks[0];
+     f->task_thread.pending_tasks.tail = prev_t;
+     atomic_store(&f->task_thread.pending_tasks.merge, 1);
++    atomic_store(&f->task_thread.init_done, 1);
+     pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+ 
+     return 0;
+@@ -730,14 +731,11 @@ void *dav1d_worker_task(void *data) {
+                             dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
+                             f->n_tile_data = 0;
+                             pthread_cond_signal(&f->task_thread.cond);
+-                            atomic_store(&f->task_thread.init_done, 1);
+-                            continue;
+                         } else {
+                             pthread_mutex_unlock(&ttd->lock);
+                         }
+                     }
+                 }
+-                atomic_store(&f->task_thread.init_done, 1);
+                 pthread_mutex_lock(&ttd->lock);
+             } else {
+                 pthread_mutex_lock(&ttd->lock);

UPDATE: dav1d 1.2.0

Reply via email to