Hello community,

here is the log from the commit of package dav1d for openSUSE:Factory checked 
in at 2019-03-13 09:16:24
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/dav1d (Old)
 and      /work/SRC/openSUSE:Factory/.dav1d.new.28833 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "dav1d"

Wed Mar 13 09:16:24 2019 rev:3 rq:684500 version:0.2.1

Changes:
--------
--- /work/SRC/openSUSE:Factory/dav1d/dav1d.changes      2019-03-05 
12:25:29.820838223 +0100
+++ /work/SRC/openSUSE:Factory/.dav1d.new.28833/dav1d.changes   2019-03-13 
09:16:39.191377919 +0100
@@ -1,0 +2,10 @@
+Tue Mar 12 22:23:22 UTC 2019 - klaatu <[email protected]>
+
+- Update to version 0.2.1
+  * SSSE3 optimization for cdef_dir
+  * AVX-2 improvements of the existing CDEF optimizations
+  * NEON improvements of the existing CDEF and wiener
+    optimizations
+  * Clarification about the numbering/versioning scheme
+
+-------------------------------------------------------------------

Old:
----
  dav1d-0.2.0.tar.gz

New:
----
  dav1d-0.2.1.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ dav1d.spec ++++++
--- /var/tmp/diff_new_pack.WHLmum/_old  2019-03-13 09:16:39.951377841 +0100
+++ /var/tmp/diff_new_pack.WHLmum/_new  2019-03-13 09:16:39.955377841 +0100
@@ -18,7 +18,7 @@
 
 %define sover   1
 Name:           dav1d
-Version:        0.2.0
+Version:        0.2.1
 Release:        0
 Summary:        An AV1 decoder
 License:        BSD-2-Clause

++++++ dav1d-0.2.0.tar.gz -> dav1d-0.2.1.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/NEWS new/dav1d-0.2.1/NEWS
--- old/dav1d-0.2.0/NEWS        2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/NEWS        2019-03-12 15:28:36.000000000 +0100
@@ -1,3 +1,12 @@
+Changes for 0.2.1 'Antelope':
+----------------------------
+
+ - SSSE3 optimization for cdef_dir
+ - AVX-2 improvements of the existing CDEF optimizations
+ - NEON improvements of the existing CDEF and wiener optimizations
+ - Clarification about the numbering/versioning scheme
+
+
 Changes for 0.2.0 'Antelope':
 ----------------------------
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/THANKS.md new/dav1d-0.2.1/THANKS.md
--- old/dav1d-0.2.0/THANKS.md   2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/THANKS.md   2019-03-12 15:28:36.000000000 +0100
@@ -16,4 +16,4 @@
 
 And all the dav1d Authors (git shortlog -sn), including:
 
-Janne Grunau, Ronald S. Bultje, James Almer, Marvin Scholz, Henrik Gramner, 
Martin Storsjö, Luc Trudeau, David Michael Barr, Hugo Beauzée-Luyssen, Steve 
Lhomme, Jean-Baptiste Kempf, Derek Buitenhuis, Nathan E. Egge, Raphaël Zumer, 
Francois Cartegnie, Niklas Haas, Konstantin Pavlov, Boyuan Xiao, Raphael Zumer 
and Michael Bradshaw.
+Janne Grunau, Ronald S. Bultje, Martin Storsjö, James Almer, Henrik Gramner, 
Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo 
Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. 
Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël 
Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle 
Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan 
Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale 
Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, 
Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr 
and skal.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/include/dav1d/dav1d.h 
new/dav1d-0.2.1/include/dav1d/dav1d.h
--- old/dav1d-0.2.0/include/dav1d/dav1d.h       2019-03-04 15:21:54.000000000 
+0100
+++ new/dav1d-0.2.1/include/dav1d/dav1d.h       2019-03-12 15:28:36.000000000 
+0100
@@ -74,16 +74,6 @@
 DAV1D_API const char *dav1d_version(void);
 
 /**
- * Get library version based on version control system.
- */
-DAV1D_API const char *dav1d_version_vcs(void);
-
-/**
- * Get library version as unsigned int.
- */
-DAV1D_API unsigned int dav1d_version_int(void);
-
-/**
  * Initialize settings to default values.
  *
  * @param s Input settings context.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/include/dav1d/meson.build 
new/dav1d-0.2.1/include/dav1d/meson.build
--- old/dav1d-0.2.0/include/dav1d/meson.build   2019-03-04 15:21:54.000000000 
+0100
+++ new/dav1d-0.2.1/include/dav1d/meson.build   2019-03-12 15:28:36.000000000 
+0100
@@ -24,9 +24,9 @@
 
 # installed version.h header generation
 version_h_data = configuration_data()
-version_h_data.set('DAV1D_VERSION_MAJOR', dav1d_version_major)
-version_h_data.set('DAV1D_VERSION_MINOR', dav1d_version_minor)
-version_h_data.set('DAV1D_VERSION_PATCH', dav1d_version_revision)
+version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
+version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
+version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
 version_h_target = configure_file(input: 'version.h.in',
                                   output: 'version.h',
                                   configuration: version_h_data)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/include/dav1d/version.h.in 
new/dav1d-0.2.1/include/dav1d/version.h.in
--- old/dav1d-0.2.0/include/dav1d/version.h.in  2019-03-04 15:21:54.000000000 
+0100
+++ new/dav1d-0.2.1/include/dav1d/version.h.in  2019-03-12 15:28:36.000000000 
+0100
@@ -27,12 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H
 
-#define DAV1D_VERSION_MAJOR @DAV1D_VERSION_MAJOR@
-#define DAV1D_VERSION_MINOR @DAV1D_VERSION_MINOR@
-#define DAV1D_VERSION_PATCH @DAV1D_VERSION_PATCH@
-
-#define DAV1D_VERSION 
"@DAV1D_VERSION_MAJOR@.@DAV1D_VERSION_MINOR@.@DAV1D_VERSION_PATCH@"
-
-#define DAV1D_VERSION_INT (@DAV1D_VERSION_MAJOR@ << 16 | @DAV1D_VERSION_MINOR@ 
<< 8 | @DAV1D_VERSION_PATCH@)
+#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
+#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
+#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
 
 #endif /* DAV1D_VERSION_H */
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/include/vcs_version.h.in 
new/dav1d-0.2.1/include/vcs_version.h.in
--- old/dav1d-0.2.0/include/vcs_version.h.in    2019-03-04 15:21:54.000000000 
+0100
+++ new/dav1d-0.2.1/include/vcs_version.h.in    2019-03-12 15:28:36.000000000 
+0100
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION_VCS "@VCS_TAG@"
+#define DAV1D_VERSION "@VCS_TAG@"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/meson.build new/dav1d-0.2.1/meson.build
--- old/dav1d-0.2.0/meson.build 2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/meson.build 2019-03-12 15:28:36.000000000 +0100
@@ -23,18 +23,18 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '0.2.0',
+    version: '0.2.1',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
     meson_version: '>= 0.47.0')
 
-dav1d_soname_version   = '1.0.0'
-dav1d_version_array    = dav1d_soname_version.split('.')
-dav1d_version_major    = dav1d_version_array[0]
-dav1d_version_minor    = dav1d_version_array[1]
-dav1d_version_revision = dav1d_version_array[2]
+dav1d_soname_version   = '1.0.1'
+dav1d_api_version_array    = dav1d_soname_version.split('.')
+dav1d_api_version_major    = dav1d_api_version_array[0]
+dav1d_api_version_minor    = dav1d_api_version_array[1]
+dav1d_api_version_revision = dav1d_api_version_array[2]
 
 dav1d_src_root = meson.current_source_dir()
 cc = meson.get_compiler('c')
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/arm/32/looprestoration.S 
new/dav1d-0.2.1/src/arm/32/looprestoration.S
--- old/dav1d-0.2.0/src/arm/32/looprestoration.S        2019-03-04 
15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/arm/32/looprestoration.S        2019-03-12 
15:28:36.000000000 +0100
@@ -283,14 +283,12 @@
         .word 66f - L(variable_shift_tbl) + CONFIG_THUMB
         .word 77f - L(variable_shift_tbl) + CONFIG_THUMB
 
+44:     // 4 pixels valid in d2/d16, fill d3/d17 with padding.
+        vmov            d3,  d4
+        vmov            d17, d18
+        b               88f
         // Shift q1 right, shifting out invalid pixels,
         // shift q1 left to the original offset, shifting in padding pixels.
-44:     // 4 pixels valid
-        vext.8          q1,  q1,  q1,  #8
-        vext.8          q1,  q1,  q2,  #8
-        vext.8          q8,  q8,  q8,  #8
-        vext.8          q8,  q8,  q9,  #8
-        b               88f
 55:     // 5 pixels valid
         vext.8          q1,  q1,  q1,  #10
         vext.8          q1,  q1,  q2,  #6
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/arm/64/cdef.S 
new/dav1d-0.2.1/src/arm/64/cdef.S
--- old/dav1d-0.2.0/src/arm/64/cdef.S   2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/arm/64/cdef.S   2019-03-12 15:28:36.000000000 +0100
@@ -136,8 +136,7 @@
 
 .macro padding_func w, stride, rn, rw
 function cdef_padding\w\()_neon, export=1
-        movi            v30.16b, #255
-        ushr            v30.8h, v30.8h, #1 // INT16_MAX
+        movi            v30.8h,  #0x80, lsl #8
         mov             v31.16b, v30.16b
         sub             x0,  x0,  #2*(2*\stride+2)
         tst             w6,  #4 // CDEF_HAVE_TOP
@@ -290,29 +289,23 @@
 .endif
 .endm
 .macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
-        cmeq            v16.8h,  \s1\().8h,  v31.8h
-        cmeq            v17.8h,  \s2\().8h,  v31.8h
-        bic             v16.16b, \s1\().16b, v16.16b
-        bic             v17.16b, \s2\().16b, v17.16b
         umin            v2.8h,   v2.8h,  \s1\().8h
-        umax            v3.8h,   v3.8h,  v16.8h
+        smax            v3.8h,   v3.8h,  \s1\().8h
         umin            v2.8h,   v2.8h,  \s2\().8h
-        umax            v3.8h,   v3.8h,  v17.8h
+        smax            v3.8h,   v3.8h,  \s2\().8h
 
         cbz             \threshold, 3f
         uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
         uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
         ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
         ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
-        sub             v17.8h, \thresh_vec, v17.8h // threshold - (abs(diff) 
>> shift)
-        sub             v21.8h, \thresh_vec, v21.8h // threshold - (abs(diff) 
>> shift)
-        smax            v17.8h, v29.8h, v17.8h      // imax(0, threshold - ())
-        smax            v21.8h, v29.8h, v21.8h      // imax(0, threshold - ())
+        uqsub           v17.8h, \thresh_vec, v17.8h // imax(0, threshold - 
(abs(diff) >> shift))
+        uqsub           v21.8h, \thresh_vec, v21.8h // imax(0, threshold - 
(abs(diff) >> shift))
         cmhi            v18.8h, v0.8h,  \s1\().8h   // px > p0
         cmhi            v22.8h, v0.8h,  \s2\().8h   // px > p1
-        smin            v17.8h, v17.8h, v16.8h      // imin(abs(diff), imax())
-        smin            v21.8h, v21.8h, v20.8h      // imin(abs(diff), imax())
-        dup             v19.8h, \tap                // taps[k]/taps[k]
+        umin            v17.8h, v17.8h, v16.8h      // imin(abs(diff), imax())
+        umin            v21.8h, v21.8h, v20.8h      // imin(abs(diff), imax())
+        dup             v19.8h, \tap                // taps[k]
         neg             v16.8h, v17.8h              // -imin()
         neg             v20.8h, v21.8h              // -imin()
         bsl             v18.16b, v16.16b, v17.16b   // constrain() = 
apply_sign()
@@ -332,11 +325,8 @@
         add             x8,  x8,  w9, uxtw #1
         movrel          x9,  directions\w
         add             x5,  x9,  w5, uxtw #1
-        movi            v31.16b,  #255
         movi            v30.8h,   #15
-        movi            v29.8h,   #0
         dup             v28.8h,   w6                // damping
-        ushr            v31.8h,   v31.8h, #1        // INT16_MAX
 
         dup             v25.8h, w3                  // threshold
         dup             v27.8h, w4                  // threshold
@@ -344,10 +334,8 @@
         clz             v26.8h, v27.8h              // clz(threshold)
         sub             v24.8h, v30.8h, v24.8h      // ulog2(threshold)
         sub             v26.8h, v30.8h, v26.8h      // ulog2(threshold)
-        sub             v24.8h, v28.8h, v24.8h      // damping - 
ulog2(threshold)
-        sub             v26.8h, v28.8h, v26.8h      // damping - 
ulog2(threshold)
-        smax            v24.8h, v29.8h, v24.8h      // shift = imax(0, damping 
- ulog2(threshold))
-        smax            v26.8h, v29.8h, v26.8h      // shift = imax(0, damping 
- ulog2(threshold))
+        uqsub           v24.8h, v28.8h, v24.8h      // shift = imax(0, damping 
- ulog2(threshold))
+        uqsub           v26.8h, v28.8h, v26.8h      // shift = imax(0, damping 
- ulog2(threshold))
         neg             v24.8h, v24.8h              // -shift
         neg             v26.8h, v26.8h              // -shift
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/arm/64/looprestoration.S 
new/dav1d-0.2.1/src/arm/64/looprestoration.S
--- old/dav1d-0.2.0/src/arm/64/looprestoration.S        2019-03-04 
15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/arm/64/looprestoration.S        2019-03-12 
15:28:36.000000000 +0100
@@ -224,31 +224,25 @@
         mov             v3.16b,  v28.16b
         mov             v5.16b,  v29.16b
         br              x11
+44:     // 4 pixels valid in v2/v4, fill the high half with padding.
+        ins             v2.d[1], v3.d[0]
+        ins             v4.d[1], v5.d[0]
+        b               88f
         // Shift v2 right, shifting out invalid pixels,
         // shift v2 left to the original offset, shifting in padding pixels.
-44:     // 4 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #8
-        ext             v2.16b,  v2.16b,  v3.16b,  #8
-        ext             v4.16b,  v4.16b,  v4.16b,  #8
-        ext             v4.16b,  v4.16b,  v5.16b,  #8
-        b               88f
 55:     // 5 pixels valid
         ext             v2.16b,  v2.16b,  v2.16b,  #10
         ext             v2.16b,  v2.16b,  v3.16b,  #6
         ext             v4.16b,  v4.16b,  v4.16b,  #10
         ext             v4.16b,  v4.16b,  v5.16b,  #6
         b               88f
-66:     // 6 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #12
-        ext             v2.16b,  v2.16b,  v3.16b,  #4
-        ext             v4.16b,  v4.16b,  v4.16b,  #12
-        ext             v4.16b,  v4.16b,  v5.16b,  #4
+66:     // 6 pixels valid, fill the upper 2 pixels with padding.
+        ins             v2.s[3], v3.s[0]
+        ins             v4.s[3], v5.s[0]
         b               88f
-77:     // 7 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #14
-        ext             v2.16b,  v2.16b,  v3.16b,  #2
-        ext             v4.16b,  v4.16b,  v4.16b,  #14
-        ext             v4.16b,  v4.16b,  v5.16b,  #2
+77:     // 7 pixels valid, fill the last pixel with padding.
+        ins             v2.h[7], v3.h[0]
+        ins             v4.h[7], v5.h[0]
         b               88f
 
 L(variable_shift_tbl):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/arm/looprestoration_init_tmpl.c 
new/dav1d-0.2.1/src/arm/looprestoration_init_tmpl.c
--- old/dav1d-0.2.0/src/arm/looprestoration_init_tmpl.c 2019-03-04 
15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/arm/looprestoration_init_tmpl.c 2019-03-12 
15:28:36.000000000 +0100
@@ -29,8 +29,6 @@
 #include "src/looprestoration.h"
 
 #include "common/attributes.h"
-#include "common/intops.h"
-#include "src/tables.h"
 
 #if BITDEPTH == 8
 // This calculates things slightly differently than the reference C version.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/dav1d.rc.in 
new/dav1d-0.2.1/src/dav1d.rc.in
--- old/dav1d-0.2.0/src/dav1d.rc.in     2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/dav1d.rc.in     2019-03-12 15:28:36.000000000 +0100
@@ -1,13 +1,15 @@
-#define VERSION_NUMBER 
@VERSION_MAJOR@,@VERSION_MINOR@,@VERSION_REVISION@,@VERSION_EXTRA@
-#define VERSION_NUMBER_STR "@VERSION_MAJOR@.@VERSION_MINOR@.@VERSION_REVISION@"
+#define API_VERSION_NUMBER 
@API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR 
"@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER 
@PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR 
"@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
 
 #include <windows.h>
 
 1 VERSIONINFO
 FILETYPE VFT_DLL
 FILEOS VOS_NT_WINDOWS32
-PRODUCTVERSION VERSION_NUMBER
-FILEVERSION VERSION_NUMBER
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
 BEGIN
   BLOCK "StringFileInfo"
   BEGIN
@@ -15,9 +17,9 @@
     BEGIN
       VALUE "CompanyName", "VideoLAN"
       VALUE "ProductName", "dav1d"
-      VALUE "ProductVersion", VERSION_NUMBER_STR
-      VALUE "FileVersion", VERSION_NUMBER_STR
-      VALUE "FileDescription", "dav1d AV1 decoder"
+      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+      VALUE "FileVersion", API_VERSION_NUMBER_STR
+      VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 
decoder"
       VALUE "InternalName", "dav1d"
       VALUE "OriginalFilename", "libdav1d.dll"
       VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and 
dav1d Authors"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/lib.c new/dav1d-0.2.1/src/lib.c
--- old/dav1d-0.2.0/src/lib.c   2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/lib.c   2019-03-12 15:28:36.000000000 +0100
@@ -56,14 +56,6 @@
     return DAV1D_VERSION;
 }
 
-const char *dav1d_version_vcs(void) {
-    return DAV1D_VERSION_VCS;
-}
-
-unsigned int dav1d_version_int(void) {
-    return DAV1D_VERSION_INT;
-}
-
 void dav1d_default_settings(Dav1dSettings *const s) {
     s->n_frame_threads = 1;
     s->n_tile_threads = 1;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/meson.build 
new/dav1d-0.2.1/src/meson.build
--- old/dav1d-0.2.0/src/meson.build     2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/meson.build     2019-03-12 15:28:36.000000000 +0100
@@ -147,10 +147,12 @@
     rc_version_array = meson.project_version().split('.')
     winmod = import('windows')
     rc_data = configuration_data()
-    rc_data.set('VERSION_MAJOR', rc_version_array[0])
-    rc_data.set('VERSION_MINOR', rc_version_array[1])
-    rc_data.set('VERSION_REVISION', rc_version_array[2])
-    rc_data.set('VERSION_EXTRA', '0')
+    rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
+    rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
+    rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
+    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
+    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
+    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
     rc_data.set('COPYRIGHT_YEARS', '2019')
 
     rc_file = configure_file(
@@ -201,7 +203,7 @@
 if host_machine.system() == 'windows'
     dav1d_soversion = ''
 else
-    dav1d_soversion = dav1d_version_major
+    dav1d_soversion = dav1d_api_version_major
 endif
 
 libdav1d = library('dav1d',
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/x86/cdef.asm 
new/dav1d-0.2.1/src/x86/cdef.asm
--- old/dav1d-0.2.0/src/x86/cdef.asm    2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/x86/cdef.asm    2019-03-12 15:28:36.000000000 +0100
@@ -33,10 +33,13 @@
 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
            dd 420, 210, 140, 105
 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
-shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 pw_128: times 2 dw 128
 pw_2048: times 2 dw 2048
-tap_table: dw 4, 2, 3, 3, 2, 1
+tap_table: ; masks for 8 bit shifts
+           db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+           ; weights
+           db 4, 2, 3, 3, 2, 1
            db -1 * 16 + 1, -2 * 16 + 2
            db  0 * 16 + 1, -1 * 16 + 2
            db  0 * 16 + 1,  0 * 16 + 2
@@ -55,56 +58,59 @@
 
 SECTION .text
 
-%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride
     ; load p0/p1
     movsx         offq, byte [dirq+kq+%1]       ; off1
-%if %5 == 4
-    movq           xm5, [stkq+offq*2+%6*0]      ; p0
-    movq           xm6, [stkq+offq*2+%6*2]
-    movhps         xm5, [stkq+offq*2+%6*1]
-    movhps         xm6, [stkq+offq*2+%6*3]
+%if %6 == 4
+    movq           xm5, [stkq+offq*2+%7*0]      ; p0
+    movq           xm6, [stkq+offq*2+%7*2]
+    movhps         xm5, [stkq+offq*2+%7*1]
+    movhps         xm6, [stkq+offq*2+%7*3]
     vinserti128     m5, xm6, 1
 %else
-    movu           xm5, [stkq+offq*2+%6*0]      ; p0
-    vinserti128     m5, [stkq+offq*2+%6*1], 1
+    movu           xm5, [stkq+offq*2+%7*0]      ; p0
+    vinserti128     m5, [stkq+offq*2+%7*1], 1
 %endif
     neg           offq                          ; -off1
-%if %5 == 4
-    movq           xm6, [stkq+offq*2+%6*0]      ; p1
-    movq           xm9, [stkq+offq*2+%6*2]
-    movhps         xm6, [stkq+offq*2+%6*1]
-    movhps         xm9, [stkq+offq*2+%6*3]
+%if %6 == 4
+    movq           xm6, [stkq+offq*2+%7*0]      ; p1
+    movq           xm9, [stkq+offq*2+%7*2]
+    movhps         xm6, [stkq+offq*2+%7*1]
+    movhps         xm9, [stkq+offq*2+%7*3]
     vinserti128     m6, xm9, 1
 %else
-    movu           xm6, [stkq+offq*2+%6*0]      ; p1
-    vinserti128     m6, [stkq+offq*2+%6*1], 1
+    movu           xm6, [stkq+offq*2+%7*0]      ; p1
+    vinserti128     m6, [stkq+offq*2+%7*1], 1
 %endif
-    pcmpeqw         m9, m14, m5
-    pcmpeqw        m10, m14, m6
-    pandn           m9, m5
-    pandn          m10, m6
-    pmaxsw          m7, m9                      ; max after p0
-    pminsw          m8, m5                      ; min after p0
-    pmaxsw          m7, m10                     ; max after p1
-    pminsw          m8, m6                      ; min after p1
+    ; out of bounds values are set to a value that is both a large unsigned
+    ; value and a negative signed value.
+    ; use signed max and unsigned min to remove them
+    pmaxsw          m7, m5                      ; max after p0
+    pminuw          m8, m5                      ; min after p0
+    pmaxsw          m7, m6                      ; max after p1
+    pminuw          m8, m6                      ; min after p1
 
     ; accumulate sum[m15] over p0/p1
+    ; calculate difference before converting
     psubw           m5, m4                      ; diff_p0(p0 - px)
     psubw           m6, m4                      ; diff_p1(p1 - px)
-    pabsw           m9, m5
-    pabsw          m10, m6
-    psignw         m11, %4, m5
-    psignw         m12, %4, m6
-    psrlw           m5, m9, %2
-    psrlw           m6, m10, %2
-    psubusw         m5, %3, m5
-    psubusw         m6, %3, m6
-    pminsw          m5, m9                      ; constrain(diff_p0)
-    pminsw          m6, m10                     ; constrain(diff_p1)
-    pmullw          m5, m11                     ; constrain(diff_p0) * taps
-    pmullw          m6, m12                     ; constrain(diff_p1) * taps
+
+    ; convert to 8-bits with signed saturation
+    ; saturating to large diffs has no impact on the results
+    packsswb        m5, m6
+
+    ; group into pairs so we can accumulate using maddubsw
+    pshufb          m5, m12
+    pabsb           m9, m5
+    psignb         m10, %5, m5
+    psrlw           m5, m9, %2                  ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+
+    ; use unsigned min since abs diff can equal 0x80
+    pminub          m5, m9
+    pmaddubsw       m5, m10
     paddw          m15, m5
-    paddw          m15, m6
 %endmacro
 
 %macro cdef_filter_fn 3 ; w, h, stride
@@ -118,7 +124,7 @@
 %endif
 %define px rsp+2*16+2*%3
     pcmpeqw        m14, m14
-    psrlw          m14, 1                   ; 0x7fff
+    psllw          m14, 15                  ; 0x8000
     mov          edged, r8m
 
     ; prepare pixel buffers - body/right
@@ -358,6 +364,9 @@
     INIT_YMM avx2
     DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
 %undef edged
+    ; register to shuffle values into after packing
+    vbroadcasti128 m12, [shufb_lohi]
+
     movifnidn     prid, prim
     movifnidn     secd, secm
     mov       dampingd, r7m
@@ -378,21 +387,25 @@
     mov        [rsp+0], pridmpq                 ; pri_shift
     mov        [rsp+8], secdmpq                 ; sec_shift
 
+    DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
+    lea         tableq, [tap_table]
+    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
+    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
+
     ; pri/sec_taps[k] [4 total]
-    DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3
+    DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
     movd           xm0, prid
     movd           xm1, secd
-    vpbroadcastw    m0, xm0                     ; pri_strength
-    vpbroadcastw    m1, xm1                     ; sec_strength
+    vpbroadcastb    m0, xm0                     ; pri_strength
+    vpbroadcastb    m1, xm1                     ; sec_strength
     and           prid, 1
-    lea           tapq, [tap_table]
-    lea           priq, [tapq+priq*4]           ; pri_taps
-    lea           secq, [tapq+8]                ; sec_taps
+    lea           priq, [tableq+priq*2+8]       ; pri_taps
+    lea           secq, [tableq+12]             ; sec_taps
 
     ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
-    DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3
+    DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
     mov           dird, r6m
-    lea           tapq, [tapq+dirq*2+12]
+    lea           dirq, [tapq+dirq*2+14]
 %if %1*%2*2/mmsize > 1
  %if %1 == 4
     DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
@@ -404,7 +417,7 @@
     DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
 %endif
     lea           stkq, [px]
-    pxor           m13, m13
+    pxor           m11, m11
 %if %1*%2*2/mmsize > 1
 .v_loop:
 %endif
@@ -423,20 +436,20 @@
     mova            m7, m4                      ; max
     mova            m8, m4                      ; min
 .k_loop:
-    vpbroadcastw    m2, [priq+kq*2]             ; pri_taps
-    vpbroadcastw    m3, [secq+kq*2]             ; sec_taps
+    vpbroadcastb    m2, [priq+kq]               ; pri_taps
+    vpbroadcastb    m3, [secq+kq]               ; sec_taps
 
-    ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3
-    ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3
-    ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3
+    ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3
+    ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3
+    ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3
 
     dec             kq
     jge .k_loop
 
-    vpbroadcastd   m12, [pw_2048]
-    pcmpgtw        m11, m13, m15
-    paddw          m15, m11
-    pmulhrsw       m15, m12
+    vpbroadcastd   m10, [pw_2048]
+    pcmpgtw         m9, m11, m15
+    paddw          m15, m9
+    pmulhrsw       m15, m10
     paddw           m4, m15
     pminsw          m4, m7
     pmaxsw          m4, m8
@@ -586,9 +599,8 @@
     ; and [upper half]:
     ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
     ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
-    ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
 
-    vbroadcasti128 m14, [shufw_210xxxxx]
     pslldq          m4, m11, 2
     psrldq         m11, 14
     pslldq          m5, m12, 4
@@ -602,7 +614,7 @@
     paddw          m11, m13                 ; partial_sum_alt[3/2] right
     vbroadcasti128 m13, [div_table+32]
     paddw           m4, m5                  ; partial_sum_alt[3/2] left
-    pshufb         m11, m14
+    pshuflw        m11, m11, q3012
     punpckhwd       m6, m4, m11
     punpcklwd       m4, m11
     pmaddwd         m6, m6
@@ -617,7 +629,7 @@
     ; and [upper half]:
     ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
     ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
-    ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
 
     pslldq          m5, m1, 2
     psrldq          m1, 14
@@ -630,7 +642,7 @@
     paddw           m6, m7
     paddw           m1, m3                  ; partial_sum_alt[0/1] right
     paddw           m5, m6                  ; partial_sum_alt[0/1] left
-    pshufb          m1, m14
+    pshuflw         m1, m1, q3012
     punpckhwd       m6, m5, m1
     punpcklwd       m5, m1
     pmaddwd         m6, m6
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/x86/cdef_init_tmpl.c 
new/dav1d-0.2.1/src/x86/cdef_init_tmpl.c
--- old/dav1d-0.2.0/src/x86/cdef_init_tmpl.c    2019-03-04 15:21:54.000000000 
+0100
+++ new/dav1d-0.2.1/src/x86/cdef_init_tmpl.c    2019-03-12 15:28:36.000000000 
+0100
@@ -38,6 +38,7 @@
 decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
 
 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
+decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
 
 void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -45,6 +46,7 @@
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH ==8
+    c->dir = dav1d_cdef_dir_ssse3;
     c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
     c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
     c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dav1d-0.2.0/src/x86/cdef_ssse3.asm 
new/dav1d-0.2.1/src/x86/cdef_ssse3.asm
--- old/dav1d-0.2.0/src/x86/cdef_ssse3.asm      2019-03-04 15:21:54.000000000 
+0100
+++ new/dav1d-0.2.1/src/x86/cdef_ssse3.asm      2019-03-12 15:28:36.000000000 
+0100
@@ -29,10 +29,17 @@
 
 SECTION_RODATA 16
 
+%if ARCH_X86_32
 pb_0: times 16 db 0
+%endif
+pw_128: times 8 dw 128
 pw_256: times 8 dw 256
 pw_2048: times 8 dw 2048
 pw_0x7FFF: times 8 dw 0x7FFF
+pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7
+div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 
120, 105, 105
+           dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 
105, 105, 105
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
 tap_table: dw 4, 2, 3, 3, 2, 1
            db -1 * 16 + 1, -2 * 16 + 2
            db  0 * 16 + 1, -1 * 16 + 2
@@ -711,3 +718,589 @@
 cdef_filter_fn 8, 8, 32
 cdef_filter_fn 4, 8, 32
 cdef_filter_fn 4, 4, 32
+
+%macro MULLD 2
+ %if ARCH_X86_32
+  %define m15 m1
+ %endif
+    pmulhuw        m15, %1, %2
+    pmullw          %1, %2
+    pslld          m15, 16
+    paddd           %1, m15
+%endmacro
+
+%if ARCH_X86_64
+cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
+    lea       stride3q, [strideq*3]
+    movq            m1, [srcq+strideq*0]
+    movhps          m1, [srcq+strideq*1]
+    movq            m3, [srcq+strideq*2]
+    movhps          m3, [srcq+stride3q]
+    lea           srcq, [srcq+strideq*4]
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+
+    pxor            m8, m8
+    psadbw          m0, m1, m8
+    psadbw          m2, m3, m8
+    psadbw          m4, m5, m8
+    psadbw          m6, m7, m8
+    packssdw        m0, m2
+    packssdw        m4, m6
+    packssdw        m0, m4
+    SWAP            m0, m9
+
+    punpcklbw       m0, m1, m8
+    punpckhbw       m1, m8
+    punpcklbw       m2, m3, m8
+    punpckhbw       m3, m8
+    punpcklbw       m4, m5, m8
+    punpckhbw       m5, m8
+    punpcklbw       m6, m7, m8
+    punpckhbw       m7, m8
+
+    mova            m8, [pw_128]
+    psubw           m0, m8
+    psubw           m1, m8
+    psubw           m2, m8
+    psubw           m3, m8
+    psubw           m4, m8
+    psubw           m5, m8
+    psubw           m6, m8
+    psubw           m7, m8
+    psllw           m8, 3
+    psubw           m9, m8                  ; partial_sum_hv[0]
+
+    paddw           m8, m0, m1
+    paddw          m10, m2, m3
+    paddw           m8, m4
+    paddw          m10, m5
+    paddw           m8, m6
+    paddw          m10, m7
+    paddw           m8, m10                 ; partial_sum_hv[1]
+
+    pmaddwd         m8, m8
+    pmaddwd         m9, m9
+    phaddd          m9, m8
+    SWAP            m8, m9
+    MULLD           m8, [div_table+48]
+
+    pslldq          m9, m1, 2
+    psrldq         m10, m1, 14
+    pslldq         m11, m2, 4
+    psrldq         m12, m2, 12
+    pslldq         m13, m3, 6
+    psrldq         m14, m3, 10
+    paddw           m9, m0
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
+    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
+    pslldq         m11, m4, 8
+    psrldq         m12, m4, 8
+    pslldq         m13, m5, 10
+    psrldq         m14, m5, 6
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m6, 12
+    psrldq         m12, m6, 4
+    pslldq         m13, m7, 14
+    psrldq         m14, m7, 2
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
+    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
+    pshufb         m10, [shufw_6543210x]
+    punpckhwd      m11, m9, m10
+    punpcklwd       m9, m10
+    pmaddwd        m11, m11
+    pmaddwd         m9, m9
+    MULLD          m11, [div_table+16]
+    MULLD           m9, [div_table+0]
+    paddd           m9, m11                 ; cost[0a-d]
+
+    pslldq         m10, m0, 14
+    psrldq         m11, m0, 2
+    pslldq         m12, m1, 12
+    psrldq         m13, m1, 4
+    pslldq         m14, m2, 10
+    psrldq         m15, m2, 6
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15
+    pslldq         m12, m3, 8
+    psrldq         m13, m3, 8
+    pslldq         m14, m4, 6
+    psrldq         m15, m4, 10
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15
+    pslldq         m12, m5, 4
+    psrldq         m13, m5, 12
+    pslldq         m14, m6, 2
+    psrldq         m15, m6, 14
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
+    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
+    pshufb         m11, [shufw_6543210x]
+    punpckhwd      m12, m10, m11
+    punpcklwd      m10, m11
+    pmaddwd        m12, m12
+    pmaddwd        m10, m10
+    MULLD          m12, [div_table+16]
+    MULLD          m10, [div_table+0]
+    paddd          m10, m12                 ; cost[4a-d]
+    phaddd          m9, m10                 ; cost[0a/b,4a/b]
+
+    paddw          m10, m0, m1
+    paddw          m11, m2, m3
+    paddw          m12, m4, m5
+    paddw          m13, m6, m7
+    phaddw          m0, m4
+    phaddw          m1, m5
+    phaddw          m2, m6
+    phaddw          m3, m7
+
+    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+    pslldq          m4, m11, 2
+    psrldq          m5, m11, 14
+    pslldq          m6, m12, 4
+    psrldq          m7, m12, 12
+    pslldq         m14, m13, 6
+    psrldq         m15, m13, 10
+    paddw           m4, m10
+    paddw           m5, m7
+    paddw           m4, m6
+    paddw           m5, m15                 ; partial_sum_alt[3] right
+    paddw           m4, m14                 ; partial_sum_alt[3] left
+    pshuflw         m5, m5, q3012
+    punpckhwd       m6, m4, m5
+    punpcklwd       m4, m5
+    pmaddwd         m6, m6
+    pmaddwd         m4, m4
+    MULLD           m6, [div_table+48]
+    MULLD           m4, [div_table+32]
+    paddd           m4, m6                  ; cost[7a-d]
+
+    pslldq          m5, m10, 6
+    psrldq          m6, m10, 10
+    pslldq          m7, m11, 4
+    psrldq         m10, m11, 12
+    pslldq         m11, m12, 2
+    psrldq         m12, 14
+    paddw           m5, m7
+    paddw           m6, m10
+    paddw           m5, m11
+    paddw           m6, m12
+    paddw           m5, m13
+    pshuflw         m6, m6, q3012
+    punpckhwd       m7, m5, m6
+    punpcklwd       m5, m6
+    pmaddwd         m7, m7
+    pmaddwd         m5, m5
+    MULLD           m7, [div_table+48]
+    MULLD           m5, [div_table+32]
+    paddd           m5, m7                  ; cost[5a-d]
+
+    pslldq          m6, m1, 2
+    psrldq          m7, m1, 14
+    pslldq         m10, m2, 4
+    psrldq         m11, m2, 12
+    pslldq         m12, m3, 6
+    psrldq         m13, m3, 10
+    paddw           m6, m0
+    paddw           m7, m11
+    paddw           m6, m10
+    paddw           m7, m13                 ; partial_sum_alt[3] right
+    paddw           m6, m12                 ; partial_sum_alt[3] left
+    pshuflw         m7, m7, q3012
+    punpckhwd      m10, m6, m7
+    punpcklwd       m6, m7
+    pmaddwd        m10, m10
+    pmaddwd         m6, m6
+    MULLD          m10, [div_table+48]
+    MULLD           m6, [div_table+32]
+    paddd           m6, m10                 ; cost[1a-d]
+
+    pshufd          m0, m0, q1032
+    pshufd          m1, m1, q1032
+    pshufd          m2, m2, q1032
+    pshufd          m3, m3, q1032
+
+    pslldq         m10, m0, 6
+    psrldq         m11, m0, 10
+    pslldq         m12, m1, 4
+    psrldq         m13, m1, 12
+    pslldq         m14, m2, 2
+    psrldq          m2, 14
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m2
+    paddw          m10, m3
+    pshuflw        m11, m11, q3012
+    punpckhwd      m12, m10, m11
+    punpcklwd      m10, m11
+    pmaddwd        m12, m12
+    pmaddwd        m10, m10
+    MULLD          m12, [div_table+48]
+    MULLD          m10, [div_table+32]
+    paddd          m10, m12                 ; cost[3a-d]
+
+    phaddd          m0, m9, m8              ; cost[0,4,2,6]
+    phaddd          m6, m5
+    phaddd         m10, m4
+    phaddd          m1, m6, m10             ; cost[1,5,3,7]
+
+    pcmpgtd         m2, m1, m0              ; [1/5/3/7] > [0/4/2/6]
+    pand            m3, m2, m1
+    pandn           m4, m2, m0
+    por             m3, m4                  ; higher 4 values
+    pshufd          m1, m1, q2301
+    pshufd          m0, m0, q2301
+    pand            m1, m2, m1
+    pandn           m4, m2, m0
+    por             m0, m4, m1              ; 4 values at idx^4 offset
+    pand           m14, m2, [pd_0to7+16]
+    pandn          m15, m2, [pd_0to7]
+    por            m15, m14
+
+    punpckhqdq      m4, m3, m0
+    punpcklqdq      m3, m0
+    pcmpgtd         m5, m4, m3              ; [2or3-6or7] > [0or1/4or5]
+    punpcklqdq      m5, m5
+    pand            m6, m5, m4
+    pandn           m7, m5, m3
+    por             m6, m7                  ; { highest 2 values, complements at idx^4 }
+    movhlps        m14, m15
+    pand           m14, m5, m14
+    pandn          m13, m5, m15
+    por            m15, m13, m14
+
+    pshufd          m7, m6, q3311
+    pcmpgtd         m8, m7, m6              ; [4or5or6or7] > [0or1or2or3]
+    punpcklqdq      m8, m8
+    pand            m9, m8, m7
+    pandn          m10, m8, m6
+    por             m9, m10                 ; max
+    movhlps        m10, m9                  ; complement at idx^4
+    psubd           m9, m10
+    psrld           m9, 10
+    movd        [varq], m9
+    pshufd         m14, m15, q1111
+    pand           m14, m8, m14
+    pandn          m13, m8, m15
+    por            m15, m13, m14
+    movd           eax, m15
+%else
+cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
+ %define PIC_reg r4
+    LEA        PIC_reg, PIC_base_offset
+
+    pxor            m0, m0
+    mova            m1, [PIC_sym(pw_128)]
+
+    lea       stride3q, [strideq*3]
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+    psadbw          m2, m5, m0
+    psadbw          m3, m7, m0
+    packssdw        m2, m3
+    punpcklbw       m4, m5, m0
+    punpckhbw       m5, m0
+    punpcklbw       m6, m7, m0
+    punpckhbw       m7, m0
+    psubw           m4, m1
+    psubw           m5, m1
+    psubw           m6, m1
+    psubw           m7, m1
+
+    mova    [esp+0x00], m4
+    mova    [esp+0x10], m5
+    mova    [esp+0x20], m6
+    mova    [esp+0x50], m7
+
+    lea           srcq, [srcq+strideq*4]
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+    psadbw          m3, m5, m0
+    psadbw          m0, m7, m0
+    packssdw        m3, m0
+    pxor            m0, m0
+    packssdw        m2, m3
+    punpcklbw       m4, m5, m0
+    punpckhbw       m5, m0
+    punpcklbw       m6, m7, m0
+    punpckhbw       m7, m0
+    psubw           m4, m1
+    psubw           m5, m1
+    psubw           m6, m1
+    psubw           m7, m1
+
+    psllw           m1, 3
+    psubw           m2, m1                  ; partial_sum_hv[0]
+    pmaddwd         m2, m2
+
+    mova            m3, [esp+0x50]
+    mova            m0, [esp+0x00]
+    paddw           m0, [esp+0x10]
+    paddw           m1, m3, [esp+0x20]
+    paddw           m0, m4
+    paddw           m1, m5
+    paddw           m0, m6
+    paddw           m1, m7
+    paddw           m0, m1                  ; partial_sum_hv[1]
+    pmaddwd         m0, m0
+
+    phaddd          m2, m0
+    MULLD           m2, [PIC_sym(div_table)+48]
+    mova    [esp+0x30], m2
+
+    mova            m1, [esp+0x10]
+    pslldq          m0, m1, 2
+    psrldq          m1, 14
+    paddw           m0, [esp+0x00]
+    pslldq          m2, m3, 6
+    psrldq          m3, 10
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x20]
+    pslldq          m2, m3, 4
+    psrldq          m3, 12
+    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
+    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
+    pslldq          m2, m4, 8
+    psrldq          m3, m4, 8
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m5, 10
+    psrldq          m3, m5, 6
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m6, 12
+    psrldq          m3, m6, 4
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m7, 14
+    psrldq          m3, m7, 2
+    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
+    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
+    mova            m3, [esp+0x50]
+    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+16]
+    MULLD           m0, [PIC_sym(div_table)+0]
+    paddd           m0, m2                  ; cost[0a-d]
+    mova    [esp+0x40], m0
+
+    mova            m1, [esp+0x00]
+    pslldq          m0, m1, 14
+    psrldq          m1, 2
+    paddw           m0, m7
+    pslldq          m2, m3, 8
+    psrldq          m3, 8
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x20]
+    pslldq          m2, m3, 10
+    psrldq          m3, 6
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x10]
+    pslldq          m2, m3, 12
+    psrldq          m3, 4
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m4, 6
+    psrldq          m3, m4, 10
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m5, 4
+    psrldq          m3, m5, 12
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m6, 2
+    psrldq          m3, m6, 14
+    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
+    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
+    mova            m3, [esp+0x50]
+    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+16]
+    MULLD           m0, [PIC_sym(div_table)+0]
+    paddd           m0, m2                  ; cost[4a-d]
+    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
+    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
+    mova    [esp+0x30], m1
+
+    phaddw          m0, [esp+0x00], m4
+    phaddw          m1, [esp+0x10], m5
+    paddw           m4, m5
+    mova            m2, [esp+0x20]
+    paddw           m5, m2, m3
+    phaddw          m2, m6
+    paddw           m6, m7
+    phaddw          m3, m7
+    mova            m7, [esp+0x00]
+    paddw           m7, [esp+0x10]
+    mova    [esp+0x00], m0
+    mova    [esp+0x10], m1
+    mova    [esp+0x20], m2
+
+    pslldq          m1, m4, 4
+    pslldq          m2, m6, 6
+    pslldq          m0, m5, 2
+    paddw           m1, m2
+    paddw           m0, m7
+    psrldq          m2, m5, 14
+    paddw           m0, m1                  ; partial_sum_alt[3] left
+    psrldq          m1, m4, 12
+    paddw           m1, m2
+    psrldq          m2, m6, 10
+    paddw           m1, m2                  ; partial_sum_alt[3] right
+    pshuflw         m1, m1, q3012
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+48]
+    MULLD           m0, [PIC_sym(div_table)+32]
+    paddd           m0, m2                  ; cost[7a-d]
+    mova    [esp+0x40], m0
+
+    pslldq          m0, m7, 6
+    psrldq          m7, 10
+    pslldq          m1, m5, 4
+    psrldq          m5, 12
+    pslldq          m2, m4, 2
+    psrldq          m4, 14
+    paddw           m0, m6
+    paddw           m7, m5
+    paddw           m0, m1
+    paddw           m7, m4
+    paddw           m0, m2
+    pshuflw         m7, m7, q3012
+    punpckhwd       m2, m0, m7
+    punpcklwd       m0, m7
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+48]
+    MULLD           m0, [PIC_sym(div_table)+32]
+    paddd           m0, m2                  ; cost[5a-d]
+    mova    [esp+0x50], m0
+
+    mova            m1, [esp+0x10]
+    mova            m2, [esp+0x20]
+    pslldq          m0, m1, 2
+    psrldq          m1, 14
+    pslldq          m4, m2, 4
+    psrldq          m2, 12
+    pslldq          m5, m3, 6
+    psrldq          m6, m3, 10
+    paddw           m0, [esp+0x00]
+    paddw           m1, m2
+    paddw           m4, m5
+    paddw           m1, m6                  ; partial_sum_alt[3] right
+    paddw           m0, m4                  ; partial_sum_alt[3] left
+    pshuflw         m1, m1, q3012
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+48]
+    MULLD           m0, [PIC_sym(div_table)+32]
+    paddd           m0, m2                  ; cost[1a-d]
+    phaddd          m0, [esp+0x50]
+    mova    [esp+0x50], m0
+
+    pshufd          m0, [esp+0x00], q1032
+    pshufd          m1, [esp+0x10], q1032
+    pshufd          m2, [esp+0x20], q1032
+    pshufd          m3, m3, q1032
+
+    pslldq          m4, m0, 6
+    psrldq          m0, 10
+    pslldq          m5, m1, 4
+    psrldq          m1, 12
+    pslldq          m6, m2, 2
+    psrldq          m2, 14
+    paddw           m4, m3
+    paddw           m0, m1
+    paddw           m5, m6
+    paddw           m0, m2
+    paddw           m4, m5
+    pshuflw         m0, m0, q3012
+    punpckhwd      m2, m4, m0
+    punpcklwd      m4, m0
+    pmaddwd        m2, m2
+    pmaddwd        m4, m4
+    MULLD          m2, [PIC_sym(div_table)+48]
+    MULLD          m4, [PIC_sym(div_table)+32]
+    paddd          m4, m2                   ; cost[3a-d]
+    phaddd         m4, [esp+0x40]
+
+    mova            m1, [esp+0x50]
+    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
+    phaddd          m1, m4                  ; cost[1,5,3,7]
+
+    pcmpgtd         m2, m1, m0              ; [1/5/3/7] > [0/4/2/6]
+    pand            m3, m2, m1
+    pandn           m4, m2, m0
+    por             m3, m4                  ; higher 4 values
+    pshufd          m1, m1, q2301
+    pshufd          m0, m0, q2301
+    pand            m1, m2, m1
+    pandn           m4, m2, m0
+    por             m0, m4, m1              ; 4 values at idx^4 offset
+    pand            m5, m2, [PIC_sym(pd_0to7)+16]
+    pandn           m6, m2, [PIC_sym(pd_0to7)]
+    por             m6, m5
+
+    punpckhqdq      m4, m3, m0
+    punpcklqdq      m3, m0
+    pcmpgtd         m0, m4, m3              ; [2or3-6or7] > [0or1/4or5]
+    punpcklqdq      m0, m0
+    pand            m1, m0, m4
+    pandn           m7, m0, m3
+    por             m1, m7                  ; { highest 2 values, complements at idx^4 }
+    movhlps         m5, m6
+    pand            m5, m0, m5
+    pandn           m3, m0, m6
+    por             m6, m3, m5
+
+    pshufd          m7, m1, q3311
+    pcmpgtd         m2, m7, m1              ; [4or5or6or7] > [0or1or2or3]
+    punpcklqdq      m2, m2
+    pand            m0, m2, m7
+    pandn           m7, m2, m1
+    por             m0, m7                  ; max
+    movhlps         m7, m0                  ; complement at idx^4
+    psubd           m0, m7
+    psrld           m0, 10
+    movd        [varq], m0
+    pshufd          m5, m6, q1111
+    pand            m5, m2, m5
+    pandn           m3, m2, m6
+    por             m6, m3, m5
+    movd           eax, m6
+%endif
+
+    RET
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/x86/looprestoration_ssse3.asm new/dav1d-0.2.1/src/x86/looprestoration_ssse3.asm
--- old/dav1d-0.2.0/src/x86/looprestoration_ssse3.asm   2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/src/x86/looprestoration_ssse3.asm   2019-03-12 15:28:36.000000000 +0100
@@ -35,6 +35,7 @@
              db 1, 2
 pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
                   db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
 pb_0: times 16 db 0
 pb_2: times 16 db 2
 pb_3: times 16 db 3
@@ -509,17 +510,11 @@
 ;;      self-guided     ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-%macro MULLD 2-3 1 ; %3 = is_constant
-    pmuludq       m5, %1, %2
-    psrlq         %1, 32
- %if %3 == 0
-    pshufd        m3, %2, q2301
-    pmuludq       %1, m3
- %else
-    pmuludq       %1, %2
- %endif
-    shufps        %1, m5, q2020
-    pshufd        %1, %1, q1302
+%macro MULLD 2
+    pmulhuw       m5, %1, %2
+    pmullw        %1, %2
+    pslld         m5, 16
+    paddd         %1, m5
 %endmacro
 
 %macro GATHERDD 2
@@ -766,7 +761,7 @@
     jl .loop_x
     RET
 
-cglobal sgr_calc_ab1, 4, 7, 14, a, b, w, h, s
+cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
     movifnidn     sd, sm
     sub           aq, (384+16-1)*4
     sub           bq, (384+16-1)*2
@@ -777,17 +772,20 @@
     SETUP_PIC r5, 0
 %endif
     movd          m6, sd
-    pshufd        m6, m6, 0
+    pshuflw       m6, m6, q0000
+    punpcklqdq    m6, m6
     pxor          m7, m7
     DEFINE_ARGS a, b, w, h, x
 %if ARCH_X86_64
     mova          m8, [pd_0xF00801C7]
     mova          m9, [pw_256]
     psrld        m10, m9, 13                        ; pd_2048
+    mova         m11, [pb_unpcklwdw]
 %else
  %define m8     [PIC_sym(pd_0xF00801C7)]
  %define m9     [PIC_sym(pw_256)]
  %define m10    [PIC_sym(pd_2048)]
+ %define m11    [PIC_sym(pb_unpcklwdw)]
 %endif
 .loop_y:
     mov           xq, -2
@@ -818,10 +816,12 @@
     GATHERDD      m2, m3
     psrld         m4, 24
     psrld         m2, 24
-    MULLD         m0, m4, 0
-    MULLD         m1, m2, 0
-    packssdw      m4, m2
-    psubw         m5, m9, m4
+    packssdw      m3, m4, m2
+    pshufb        m4, m11
+    MULLD         m0, m4
+    pshufb        m2, m11
+    MULLD         m1, m2
+    psubw         m5, m9, m3
     paddd         m0, m10
     paddd         m1, m10
     psrld         m0, 12
@@ -1516,7 +1516,8 @@
     SETUP_PIC r5, 0
 %endif
     movd          m6, sd
-    pshufd        m6, m6, 0
+    pshuflw       m6, m6, q0000
+    punpcklqdq    m6, m6
     pxor          m7, m7
     DEFINE_ARGS a, b, w, h, x
 %if ARCH_X86_64
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/tools/dav1d.c new/dav1d-0.2.1/tools/dav1d.c
--- old/dav1d-0.2.0/tools/dav1d.c       2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/tools/dav1d.c       2019-03-12 15:28:36.000000000 +0100
@@ -73,11 +73,11 @@
     Dav1dContext *c;
     Dav1dData data;
     unsigned n_out = 0, total, fps[2];
-    const char *version = dav1d_version_vcs();
+    const char *version = dav1d_version();
 
-    if (strcmp(version, DAV1D_VERSION_VCS)) {
+    if (strcmp(version, DAV1D_VERSION)) {
         fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
-                version, DAV1D_VERSION_VCS);
+                version, DAV1D_VERSION);
         return -1;
     }
 
@@ -100,7 +100,7 @@
     }
 
     if (!cli_settings.quiet)
-        fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version_vcs());
+        fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version());
 
     // skip frames until a sequence header is found
     if (cli_settings.skip) {
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/tools/dav1d_cli_parse.c new/dav1d-0.2.1/tools/dav1d_cli_parse.c
--- old/dav1d-0.2.0/tools/dav1d_cli_parse.c     2019-03-04 15:21:54.000000000 +0100
+++ new/dav1d-0.2.1/tools/dav1d_cli_parse.c     2019-03-12 15:28:36.000000000 +0100
@@ -263,7 +263,7 @@
                 !!parse_unsigned(optarg, ARG_ALL_LAYERS, argv[0]);
             break;
         case 'v':
-            fprintf(stderr, "%s\n", dav1d_version_vcs());
+            fprintf(stderr, "%s\n", dav1d_version());
             exit(0);
         case ARG_CPU_MASK:
             dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,


Reply via email to