Hello community, here is the log from the commit of package dav1d for openSUSE:Factory checked in at 2019-03-13 09:16:24 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/dav1d (Old) and /work/SRC/openSUSE:Factory/.dav1d.new.28833 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "dav1d" Wed Mar 13 09:16:24 2019 rev:3 rq:684500 version:0.2.1 Changes: -------- --- /work/SRC/openSUSE:Factory/dav1d/dav1d.changes 2019-03-05 12:25:29.820838223 +0100 +++ /work/SRC/openSUSE:Factory/.dav1d.new.28833/dav1d.changes 2019-03-13 09:16:39.191377919 +0100 @@ -1,0 +2,10 @@ +Tue Mar 12 22:23:22 UTC 2019 - klaatu <[email protected]> + +- Update to version 0.2.1 + * SSSE3 optimization for cdef_dir + * AVX-2 improvements of the existing CDEF optimizations + * NEON improvements of the existing CDEF and wiener + optimizations + * Clarification about the numbering/versionning scheme + +------------------------------------------------------------------- Old: ---- dav1d-0.2.0.tar.gz New: ---- dav1d-0.2.1.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ dav1d.spec ++++++ --- /var/tmp/diff_new_pack.WHLmum/_old 2019-03-13 09:16:39.951377841 +0100 +++ /var/tmp/diff_new_pack.WHLmum/_new 2019-03-13 09:16:39.955377841 +0100 @@ -18,7 +18,7 @@ %define sover 1 Name: dav1d -Version: 0.2.0 +Version: 0.2.1 Release: 0 Summary: An AV1 decoder License: BSD-2-Clause ++++++ dav1d-0.2.0.tar.gz -> dav1d-0.2.1.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/NEWS new/dav1d-0.2.1/NEWS --- old/dav1d-0.2.0/NEWS 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/NEWS 2019-03-12 15:28:36.000000000 +0100 @@ -1,3 +1,12 @@ +Changes for 0.2.1 'Antelope': +---------------------------- + + - SSSE3 optimization for cdef_dir + - AVX-2 improvements of the existing CDEF optimizations + - NEON improvements of the existing CDEF and wiener optimizations + - Clarification about the numbering/versionning scheme + + Changes for 0.2.0 'Antelope': ---------------------------- diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/THANKS.md new/dav1d-0.2.1/THANKS.md --- old/dav1d-0.2.0/THANKS.md 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/THANKS.md 2019-03-12 15:28:36.000000000 +0100 @@ -16,4 +16,4 @@ And all the dav1d Authors (git shortlog -sn), including: -Janne Grunau, Ronald S. Bultje, James Almer, Marvin Scholz, Henrik Gramner, Martin Storsjö, Luc Trudeau, David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Jean-Baptiste Kempf, Derek Buitenhuis, Nathan E. Egge, Raphaël Zumer, Francois Cartegnie, Niklas Haas, Konstantin Pavlov, Boyuan Xiao, Raphael Zumer and Michael Bradshaw. +Janne Grunau, Ronald S. Bultje, Martin Storsjö, James Almer, Henrik Gramner, Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr and skal. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/include/dav1d/dav1d.h new/dav1d-0.2.1/include/dav1d/dav1d.h --- old/dav1d-0.2.0/include/dav1d/dav1d.h 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/include/dav1d/dav1d.h 2019-03-12 15:28:36.000000000 +0100 @@ -74,16 +74,6 @@ DAV1D_API const char *dav1d_version(void); /** - * Get library version based on version control system. - */ -DAV1D_API const char *dav1d_version_vcs(void); - -/** - * Get library version as unsigned int. - */ -DAV1D_API unsigned int dav1d_version_int(void); - -/** * Initialize settings to default values. * * @param s Input settings context. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/include/dav1d/meson.build new/dav1d-0.2.1/include/dav1d/meson.build --- old/dav1d-0.2.0/include/dav1d/meson.build 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/include/dav1d/meson.build 2019-03-12 15:28:36.000000000 +0100 @@ -24,9 +24,9 @@ # installed version.h header generation version_h_data = configuration_data() -version_h_data.set('DAV1D_VERSION_MAJOR', dav1d_version_major) -version_h_data.set('DAV1D_VERSION_MINOR', dav1d_version_minor) -version_h_data.set('DAV1D_VERSION_PATCH', dav1d_version_revision) +version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major) +version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor) +version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision) version_h_target = configure_file(input: 'version.h.in', output: 'version.h', configuration: version_h_data) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/include/dav1d/version.h.in new/dav1d-0.2.1/include/dav1d/version.h.in --- old/dav1d-0.2.0/include/dav1d/version.h.in 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/include/dav1d/version.h.in 2019-03-12 15:28:36.000000000 +0100 @@ -27,12 +27,8 @@ #ifndef DAV1D_VERSION_H #define DAV1D_VERSION_H -#define DAV1D_VERSION_MAJOR @DAV1D_VERSION_MAJOR@ -#define DAV1D_VERSION_MINOR @DAV1D_VERSION_MINOR@ -#define DAV1D_VERSION_PATCH @DAV1D_VERSION_PATCH@ - -#define DAV1D_VERSION "@DAV1D_VERSION_MAJOR@.@DAV1D_VERSION_MINOR@.@DAV1D_VERSION_PATCH@" - -#define DAV1D_VERSION_INT (@DAV1D_VERSION_MAJOR@ << 16 | @DAV1D_VERSION_MINOR@ << 8 | @DAV1D_VERSION_PATCH@) +#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@ +#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@ +#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@ #endif /* DAV1D_VERSION_H */ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/include/vcs_version.h.in new/dav1d-0.2.1/include/vcs_version.h.in --- old/dav1d-0.2.0/include/vcs_version.h.in 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/include/vcs_version.h.in 2019-03-12 15:28:36.000000000 +0100 @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION_VCS "@VCS_TAG@" +#define DAV1D_VERSION "@VCS_TAG@" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/meson.build new/dav1d-0.2.1/meson.build --- old/dav1d-0.2.0/meson.build 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/meson.build 2019-03-12 15:28:36.000000000 +0100 @@ -23,18 +23,18 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. project('dav1d', ['c'], - version: '0.2.0', + version: '0.2.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.47.0') -dav1d_soname_version = '1.0.0' -dav1d_version_array = dav1d_soname_version.split('.') -dav1d_version_major = dav1d_version_array[0] -dav1d_version_minor = dav1d_version_array[1] -dav1d_version_revision = dav1d_version_array[2] +dav1d_soname_version = '1.0.1' +dav1d_api_version_array = dav1d_soname_version.split('.') +dav1d_api_version_major = dav1d_api_version_array[0] +dav1d_api_version_minor = dav1d_api_version_array[1] +dav1d_api_version_revision = dav1d_api_version_array[2] dav1d_src_root = meson.current_source_dir() cc = meson.get_compiler('c') diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/arm/32/looprestoration.S new/dav1d-0.2.1/src/arm/32/looprestoration.S --- old/dav1d-0.2.0/src/arm/32/looprestoration.S 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/arm/32/looprestoration.S 2019-03-12 15:28:36.000000000 +0100 @@ -283,14 +283,12 @@ .word 66f - L(variable_shift_tbl) + CONFIG_THUMB .word 77f - L(variable_shift_tbl) + CONFIG_THUMB +44: // 4 pixels valid in d2/d16, fill d3/d17 with padding. + vmov d3, d4 + vmov d17, d18 + b 88f // Shift q1 right, shifting out invalid pixels, // shift q1 left to the original offset, shifting in padding pixels. -44: // 4 pixels valid - vext.8 q1, q1, q1, #8 - vext.8 q1, q1, q2, #8 - vext.8 q8, q8, q8, #8 - vext.8 q8, q8, q9, #8 - b 88f 55: // 5 pixels valid vext.8 q1, q1, q1, #10 vext.8 q1, q1, q2, #6 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/arm/64/cdef.S new/dav1d-0.2.1/src/arm/64/cdef.S --- old/dav1d-0.2.0/src/arm/64/cdef.S 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/arm/64/cdef.S 2019-03-12 15:28:36.000000000 +0100 @@ -136,8 +136,7 @@ .macro padding_func w, stride, rn, rw function cdef_padding\w\()_neon, export=1 - movi v30.16b, #255 - ushr v30.8h, v30.8h, #1 // INT16_MAX + movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) tst w6, #4 // CDEF_HAVE_TOP @@ -290,29 +289,23 @@ .endif .endm .macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap - cmeq v16.8h, \s1\().8h, v31.8h - cmeq v17.8h, \s2\().8h, v31.8h - bic v16.16b, \s1\().16b, v16.16b - bic v17.16b, \s2\().16b, v17.16b umin v2.8h, v2.8h, \s1\().8h - umax v3.8h, v3.8h, v16.8h + smax v3.8h, v3.8h, \s1\().8h umin v2.8h, v2.8h, \s2\().8h - umax v3.8h, v3.8h, v17.8h + smax v3.8h, v3.8h, \s2\().8h cbz \threshold, 3f uabd v16.8h, v0.8h, \s1\().8h // abs(diff) uabd v20.8h, v0.8h, \s2\().8h // abs(diff) ushl v17.8h, v16.8h, \shift // abs(diff) >> shift ushl v21.8h, v20.8h, \shift // abs(diff) >> shift - sub v17.8h, \thresh_vec, v17.8h // threshold - (abs(diff) >> shift) - sub v21.8h, \thresh_vec, v21.8h // threshold - (abs(diff) >> shift) - smax v17.8h, v29.8h, v17.8h // imax(0, threshold - ()) - smax v21.8h, v29.8h, v21.8h // imax(0, threshold - ()) + uqsub v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift)) + uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift)) cmhi v18.8h, v0.8h, \s1\().8h // px > p0 cmhi v22.8h, v0.8h, \s2\().8h // px > p1 - smin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax()) - smin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax()) - dup v19.8h, \tap // taps[k]/taps[k] + umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax()) + umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax()) + dup v19.8h, \tap // taps[k] neg v16.8h, v17.8h // -imin() neg v20.8h, v21.8h // -imin() bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() @@ -332,11 +325,8 @@ add x8, x8, w9, uxtw #1 movrel x9, directions\w add x5, x9, w5, uxtw #1 - movi v31.16b, #255 movi v30.8h, #15 - movi v29.8h, #0 dup v28.8h, w6 // damping - ushr v31.8h, v31.8h, #1 // INT16_MAX dup v25.8h, w3 // threshold dup v27.8h, w4 // threshold @@ -344,10 +334,8 @@ clz v26.8h, v27.8h // clz(threshold) sub v24.8h, v30.8h, v24.8h // ulog2(threshold) sub v26.8h, v30.8h, v26.8h // ulog2(threshold) - sub v24.8h, v28.8h, v24.8h // damping - ulog2(threshold) - sub v26.8h, v28.8h, v26.8h // damping - ulog2(threshold) - smax v24.8h, v29.8h, v24.8h // shift = imax(0, damping - ulog2(threshold)) - smax v26.8h, v29.8h, v26.8h // shift = imax(0, damping - ulog2(threshold)) + uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold)) + uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold)) neg v24.8h, v24.8h // -shift neg v26.8h, v26.8h // -shift diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/arm/64/looprestoration.S new/dav1d-0.2.1/src/arm/64/looprestoration.S --- old/dav1d-0.2.0/src/arm/64/looprestoration.S 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/arm/64/looprestoration.S 2019-03-12 15:28:36.000000000 +0100 @@ -224,31 +224,25 @@ mov v3.16b, v28.16b mov v5.16b, v29.16b br x11 +44: // 4 pixels valid in v2/v4, fill the high half with padding. + ins v2.d[1], v3.d[0] + ins v4.d[1], v5.d[0] + b 88f // Shift v2 right, shifting out invalid pixels, // shift v2 left to the original offset, shifting in padding pixels. -44: // 4 pixels valid - ext v2.16b, v2.16b, v2.16b, #8 - ext v2.16b, v2.16b, v3.16b, #8 - ext v4.16b, v4.16b, v4.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - b 88f 55: // 5 pixels valid ext v2.16b, v2.16b, v2.16b, #10 ext v2.16b, v2.16b, v3.16b, #6 ext v4.16b, v4.16b, v4.16b, #10 ext v4.16b, v4.16b, v5.16b, #6 b 88f -66: // 6 pixels valid - ext v2.16b, v2.16b, v2.16b, #12 - ext v2.16b, v2.16b, v3.16b, #4 - ext v4.16b, v4.16b, v4.16b, #12 - ext v4.16b, v4.16b, v5.16b, #4 +66: // 6 pixels valid, fill the upper 2 pixels with padding. + ins v2.s[3], v3.s[0] + ins v4.s[3], v5.s[0] b 88f -77: // 7 pixels valid - ext v2.16b, v2.16b, v2.16b, #14 - ext v2.16b, v2.16b, v3.16b, #2 - ext v4.16b, v4.16b, v4.16b, #14 - ext v4.16b, v4.16b, v5.16b, #2 +77: // 7 pixels valid, fill the last pixel with padding. + ins v2.h[7], v3.h[0] + ins v4.h[7], v5.h[0] b 88f L(variable_shift_tbl): diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/arm/looprestoration_init_tmpl.c new/dav1d-0.2.1/src/arm/looprestoration_init_tmpl.c --- old/dav1d-0.2.0/src/arm/looprestoration_init_tmpl.c 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/arm/looprestoration_init_tmpl.c 2019-03-12 15:28:36.000000000 +0100 @@ -29,8 +29,6 @@ #include "src/looprestoration.h" #include "common/attributes.h" -#include "common/intops.h" -#include "src/tables.h" #if BITDEPTH == 8 // This calculates things slightly differently than the reference C version. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/dav1d.rc.in new/dav1d-0.2.1/src/dav1d.rc.in --- old/dav1d-0.2.0/src/dav1d.rc.in 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/dav1d.rc.in 2019-03-12 15:28:36.000000000 +0100 @@ -1,13 +1,15 @@ -#define VERSION_NUMBER @VERSION_MAJOR@,@VERSION_MINOR@,@VERSION_REVISION@,@VERSION_EXTRA@ -#define VERSION_NUMBER_STR "@VERSION_MAJOR@.@VERSION_MINOR@.@VERSION_REVISION@" +#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0 +#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@" +#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0 +#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@" #include <windows.h> 1 VERSIONINFO FILETYPE VFT_DLL FILEOS VOS_NT_WINDOWS32 -PRODUCTVERSION VERSION_NUMBER -FILEVERSION VERSION_NUMBER +PRODUCTVERSION PROJECT_VERSION_NUMBER +FILEVERSION API_VERSION_NUMBER BEGIN BLOCK "StringFileInfo" BEGIN @@ -15,9 +17,9 @@ BEGIN VALUE "CompanyName", "VideoLAN" VALUE "ProductName", "dav1d" - VALUE "ProductVersion", VERSION_NUMBER_STR - VALUE "FileVersion", VERSION_NUMBER_STR - VALUE "FileDescription", "dav1d AV1 decoder" + VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR + VALUE "FileVersion", API_VERSION_NUMBER_STR + VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" VALUE "InternalName", "dav1d" VALUE "OriginalFilename", "libdav1d.dll" VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/lib.c new/dav1d-0.2.1/src/lib.c --- old/dav1d-0.2.0/src/lib.c 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/lib.c 2019-03-12 15:28:36.000000000 +0100 @@ -56,14 +56,6 @@ return DAV1D_VERSION; } -const char *dav1d_version_vcs(void) { - return DAV1D_VERSION_VCS; -} - -unsigned int dav1d_version_int(void) { - return DAV1D_VERSION_INT; -} - void dav1d_default_settings(Dav1dSettings *const s) { s->n_frame_threads = 1; s->n_tile_threads = 1; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/meson.build new/dav1d-0.2.1/src/meson.build --- old/dav1d-0.2.0/src/meson.build 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/meson.build 2019-03-12 15:28:36.000000000 +0100 @@ -147,10 +147,12 @@ rc_version_array = meson.project_version().split('.') winmod = import('windows') rc_data = configuration_data() - rc_data.set('VERSION_MAJOR', rc_version_array[0]) - rc_data.set('VERSION_MINOR', rc_version_array[1]) - rc_data.set('VERSION_REVISION', rc_version_array[2]) - rc_data.set('VERSION_EXTRA', '0') + rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) + rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) + rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) + rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) + rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) + rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) rc_data.set('COPYRIGHT_YEARS', '2019') rc_file = configure_file( @@ -201,7 +203,7 @@ if host_machine.system() == 'windows' dav1d_soversion = '' else - dav1d_soversion = dav1d_version_major + dav1d_soversion = dav1d_api_version_major endif libdav1d = library('dav1d', diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/x86/cdef.asm new/dav1d-0.2.1/src/x86/cdef.asm --- old/dav1d-0.2.0/src/x86/cdef.asm 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/x86/cdef.asm 2019-03-12 15:28:36.000000000 +0100 @@ -33,10 +33,13 @@ div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 -shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_128: times 2 dw 128 pw_2048: times 2 dw 2048 -tap_table: dw 4, 2, 3, 3, 2, 1 +tap_table: ; masks for 8 bit shifts + db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + ; weights + db 4, 2, 3, 3, 2, 1 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 @@ -55,56 +58,59 @@ SECTION .text -%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride +%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride ; load p0/p1 movsx offq, byte [dirq+kq+%1] ; off1 -%if %5 == 4 - movq xm5, [stkq+offq*2+%6*0] ; p0 - movq xm6, [stkq+offq*2+%6*2] - movhps xm5, [stkq+offq*2+%6*1] - movhps xm6, [stkq+offq*2+%6*3] +%if %6 == 4 + movq xm5, [stkq+offq*2+%7*0] ; p0 + movq xm6, [stkq+offq*2+%7*2] + movhps xm5, [stkq+offq*2+%7*1] + movhps xm6, [stkq+offq*2+%7*3] vinserti128 m5, xm6, 1 %else - movu xm5, [stkq+offq*2+%6*0] ; p0 - vinserti128 m5, [stkq+offq*2+%6*1], 1 + movu xm5, [stkq+offq*2+%7*0] ; p0 + vinserti128 m5, [stkq+offq*2+%7*1], 1 %endif neg offq ; -off1 -%if %5 == 4 - movq xm6, [stkq+offq*2+%6*0] ; p1 - movq xm9, [stkq+offq*2+%6*2] - movhps xm6, [stkq+offq*2+%6*1] - movhps xm9, [stkq+offq*2+%6*3] +%if %6 == 4 + movq xm6, [stkq+offq*2+%7*0] ; p1 + movq xm9, [stkq+offq*2+%7*2] + movhps xm6, [stkq+offq*2+%7*1] + movhps xm9, [stkq+offq*2+%7*3] vinserti128 m6, xm9, 1 %else - movu xm6, [stkq+offq*2+%6*0] ; p1 - vinserti128 m6, [stkq+offq*2+%6*1], 1 + movu xm6, [stkq+offq*2+%7*0] ; p1 + vinserti128 m6, [stkq+offq*2+%7*1], 1 %endif - pcmpeqw m9, m14, m5 - pcmpeqw m10, m14, m6 - pandn m9, m5 - pandn m10, m6 - pmaxsw m7, m9 ; max after p0 - pminsw m8, m5 ; min after p0 - pmaxsw m7, m10 ; max after p1 - pminsw m8, m6 ; min after p1 + ; out of bounds values are set to a value that is a both a large unsigned + ; value and a negative signed value. + ; use signed max and unsigned min to remove them + pmaxsw m7, m5 ; max after p0 + pminuw m8, m5 ; min after p0 + pmaxsw m7, m6 ; max after p1 + pminuw m8, m6 ; min after p1 ; accumulate sum[m15] over p0/p1 + ; calculate difference before converting psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) - pabsw m9, m5 - pabsw m10, m6 - psignw m11, %4, m5 - psignw m12, %4, m6 - psrlw m5, m9, %2 - psrlw m6, m10, %2 - psubusw m5, %3, m5 - psubusw m6, %3, m6 - pminsw m5, m9 ; constrain(diff_p0) - pminsw m6, m10 ; constrain(diff_p1) - pmullw m5, m11 ; constrain(diff_p0) * taps - pmullw m6, m12 ; constrain(diff_p1) * taps + + ; convert to 8-bits with signed saturation + ; saturating to large diffs has no impact on the results + packsswb m5, m6 + + ; group into pairs so we can accumulate using maddubsw + pshufb m5, m12 + pabsb m9, m5 + psignb m10, %5, m5 + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 + psubusb m5, %4, m5 + + ; use unsigned min since abs diff can equal 0x80 + pminub m5, m9 + pmaddubsw m5, m10 paddw m15, m5 - paddw m15, m6 %endmacro %macro cdef_filter_fn 3 ; w, h, stride @@ -118,7 +124,7 @@ %endif %define px rsp+2*16+2*%3 pcmpeqw m14, m14 - psrlw m14, 1 ; 0x7fff + psllw m14, 15 ; 0x8000 mov edged, r8m ; prepare pixel buffers - body/right @@ -358,6 +364,9 @@ INIT_YMM avx2 DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp %undef edged + ; register to shuffle values into after packing + vbroadcasti128 m12, [shufb_lohi] + movifnidn prid, prim movifnidn secd, secm mov dampingd, r7m @@ -378,21 +387,25 @@ mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3 + DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3 movd xm0, prid movd xm1, secd - vpbroadcastw m0, xm0 ; pri_strength - vpbroadcastw m1, xm1 ; sec_strength + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength and prid, 1 - lea tapq, [tap_table] - lea priq, [tapq+priq*4] ; pri_taps - lea secq, [tapq+8] ; sec_taps + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3 + DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3 mov dird, r6m - lea tapq, [tapq+dirq*2+12] + lea dirq, [tapq+dirq*2+14] %if %1*%2*2/mmsize > 1 %if %1 == 4 DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k @@ -404,7 +417,7 @@ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k %endif lea stkq, [px] - pxor m13, m13 + pxor m11, m11 %if %1*%2*2/mmsize > 1 .v_loop: %endif @@ -423,20 +436,20 @@ mova m7, m4 ; max mova m8, m4 ; min .k_loop: - vpbroadcastw m2, [priq+kq*2] ; pri_taps - vpbroadcastw m3, [secq+kq*2] ; sec_taps + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps - ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 - ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 - ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 + ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3 + ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3 + ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3 dec kq jge .k_loop - vpbroadcastd m12, [pw_2048] - pcmpgtw m11, m13, m15 - paddw m15, m11 - pmulhrsw m15, m12 + vpbroadcastd m10, [pw_2048] + pcmpgtw m9, m11, m15 + paddw m15, m9 + pmulhrsw m15, m10 paddw m4, m15 pminsw m4, m7 pmaxsw m4, m8 @@ -586,9 +599,8 @@ ; and [upper half]: ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx - ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd + ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd - vbroadcasti128 m14, [shufw_210xxxxx] pslldq m4, m11, 2 psrldq m11, 14 pslldq m5, m12, 4 @@ -602,7 +614,7 @@ paddw m11, m13 ; partial_sum_alt[3/2] right vbroadcasti128 m13, [div_table+32] paddw m4, m5 ; partial_sum_alt[3/2] left - pshufb m11, m14 + pshuflw m11, m11, q3012 punpckhwd m6, m4, m11 punpcklwd m4, m11 pmaddwd m6, m6 @@ -617,7 +629,7 @@ ; and [upper half]: ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx - ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd + ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd pslldq m5, m1, 2 psrldq m1, 14 @@ -630,7 +642,7 @@ paddw m6, m7 paddw m1, m3 ; partial_sum_alt[0/1] right paddw m5, m6 ; partial_sum_alt[0/1] left - pshufb m1, m14 + pshuflw m1, m1, q3012 punpckhwd m6, m5, m1 punpcklwd m5, m1 pmaddwd m6, m6 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/x86/cdef_init_tmpl.c new/dav1d-0.2.1/src/x86/cdef_init_tmpl.c --- old/dav1d-0.2.0/src/x86/cdef_init_tmpl.c 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/x86/cdef_init_tmpl.c 2019-03-12 15:28:36.000000000 +0100 @@ -38,6 +38,7 @@ decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3); decl_cdef_dir_fn(dav1d_cdef_dir_avx2); +decl_cdef_dir_fn(dav1d_cdef_dir_ssse3); void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -45,6 +46,7 @@ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; #if BITDEPTH ==8 + c->dir = dav1d_cdef_dir_ssse3; c->fb[0] = dav1d_cdef_filter_8x8_ssse3; c->fb[1] = dav1d_cdef_filter_4x8_ssse3; c->fb[2] = dav1d_cdef_filter_4x4_ssse3; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/x86/cdef_ssse3.asm new/dav1d-0.2.1/src/x86/cdef_ssse3.asm --- old/dav1d-0.2.0/src/x86/cdef_ssse3.asm 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/x86/cdef_ssse3.asm 2019-03-12 15:28:36.000000000 +0100 @@ -29,10 +29,17 @@ SECTION_RODATA 16 +%if ARCH_X86_32 pb_0: times 16 db 0 +%endif +pw_128: times 8 dw 128 pw_256: times 8 dw 256 pw_2048: times 8 dw 2048 pw_0x7FFF: times 8 dw 0x7FFF +pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7 +div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105 + dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105 +shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 tap_table: dw 4, 2, 3, 3, 2, 1 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 @@ -711,3 +718,589 @@ cdef_filter_fn 8, 8, 32 cdef_filter_fn 4, 8, 32 cdef_filter_fn 4, 4, 32 + +%macro MULLD 2 + %if ARCH_X86_32 + %define m15 m1 + %endif + pmulhuw m15, %1, %2 + pmullw %1, %2 + pslld m15, 16 + paddd %1, m15 +%endmacro + +%if ARCH_X86_64 +cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3 + lea stride3q, [strideq*3] + movq m1, [srcq+strideq*0] + movhps m1, [srcq+strideq*1] + movq m3, [srcq+strideq*2] + movhps m3, [srcq+stride3q] + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + + pxor m8, m8 + psadbw m0, m1, m8 + psadbw m2, m3, m8 + psadbw m4, m5, m8 + psadbw m6, m7, m8 + packssdw m0, m2 + packssdw m4, m6 + packssdw m0, m4 + SWAP m0, m9 + + punpcklbw m0, m1, m8 + punpckhbw m1, m8 + punpcklbw m2, m3, m8 + punpckhbw m3, m8 + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + punpcklbw m6, m7, m8 + punpckhbw m7, m8 + + mova m8, [pw_128] + psubw m0, m8 + psubw m1, m8 + psubw m2, m8 + psubw m3, m8 + psubw m4, m8 + psubw m5, m8 + psubw m6, m8 + psubw m7, m8 + psllw m8, 3 + psubw m9, m8 ; partial_sum_hv[0] + + paddw m8, m0, m1 + paddw m10, m2, m3 + paddw m8, m4 + paddw m10, m5 + paddw m8, m6 + paddw m10, m7 + paddw m8, m10 ; partial_sum_hv[1] + + pmaddwd m8, m8 + pmaddwd m9, m9 + phaddd m9, m8 + SWAP m8, m9 + MULLD m8, [div_table+48] + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m0 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 ; partial_sum_diag[0] top/right half + paddw m9, m11 ; partial_sum_diag[0] top/left half + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 ; partial_sum_diag[0][0-7] + paddw m10, m14 ; partial_sum_diag[0][8-14,zero] + pshufb m10, [shufw_6543210x] + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + MULLD m11, [div_table+16] + MULLD m9, [div_table+0] + paddd m9, m11 ; cost[0a-d] + + pslldq m10, m0, 14 + psrldq m11, m0, 2 + pslldq m12, m1, 12 + psrldq m13, m1, 4 + pslldq m14, m2, 10 + psrldq m15, m2, 6 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m3, 8 + psrldq m13, m3, 8 + pslldq m14, m4, 6 + psrldq m15, m4, 10 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m5, 4 + psrldq m13, m5, 12 + pslldq m14, m6, 2 + psrldq m15, m6, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 ; partial_sum_diag[1][8-14,zero] + paddw m10, m7 ; partial_sum_diag[1][0-7] + pshufb m11, [shufw_6543210x] + punpckhwd m12, m10, m11 + punpcklwd m10, m11 + pmaddwd m12, m12 + pmaddwd m10, m10 + MULLD m12, [div_table+16] + MULLD m10, [div_table+0] + paddd m10, m12 ; cost[4a-d] + phaddd m9, m10 ; cost[0a/b,4a/b] + + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) + pslldq m4, m11, 2 + psrldq m5, m11, 14 + pslldq m6, m12, 4 + psrldq m7, m12, 12 + pslldq m14, m13, 6 + psrldq m15, m13, 10 + paddw m4, m10 + paddw m5, m7 + paddw m4, m6 + paddw m5, m15 ; partial_sum_alt[3] right + paddw m4, m14 ; partial_sum_alt[3] left + pshuflw m5, m5, q3012 + punpckhwd m6, m4, m5 + punpcklwd m4, m5 + pmaddwd m6, m6 + pmaddwd m4, m4 + MULLD m6, [div_table+48] + MULLD m4, [div_table+32] + paddd m4, m6 ; cost[7a-d] + + pslldq m5, m10, 6 + psrldq m6, m10, 10 + pslldq m7, m11, 4 + psrldq m10, m11, 12 + pslldq m11, m12, 2 + psrldq m12, 14 + paddw m5, m7 + paddw m6, m10 + paddw m5, m11 + paddw m6, m12 + paddw m5, m13 + pshuflw m6, m6, q3012 + punpckhwd m7, m5, m6 + punpcklwd m5, m6 + pmaddwd m7, m7 + pmaddwd m5, m5 + MULLD m7, [div_table+48] + MULLD m5, [div_table+32] + paddd m5, m7 ; cost[5a-d] + + pslldq m6, m1, 2 + psrldq m7, m1, 14 + pslldq m10, m2, 4 + psrldq m11, m2, 12 + pslldq m12, m3, 6 + psrldq m13, m3, 10 + paddw m6, m0 + paddw m7, m11 + paddw m6, m10 + paddw m7, m13 ; partial_sum_alt[3] right + paddw m6, m12 ; partial_sum_alt[3] left + pshuflw m7, m7, q3012 + punpckhwd m10, m6, m7 + punpcklwd m6, m7 + pmaddwd m10, m10 + pmaddwd m6, m6 + MULLD m10, [div_table+48] + MULLD m6, [div_table+32] + paddd m6, m10 ; cost[1a-d] + + pshufd m0, m0, q1032 + pshufd m1, m1, q1032 + pshufd m2, m2, q1032 + pshufd m3, m3, q1032 + + pslldq m10, m0, 6 + psrldq m11, m0, 10 + pslldq m12, m1, 4 + psrldq m13, m1, 12 + pslldq m14, m2, 2 + psrldq m2, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m2 + paddw m10, m3 + pshuflw m11, m11, q3012 + punpckhwd m12, m10, m11 + punpcklwd m10, m11 + pmaddwd m12, m12 + pmaddwd m10, m10 + MULLD m12, [div_table+48] + MULLD m10, [div_table+32] + paddd m10, m12 ; cost[3a-d] + + phaddd m0, m9, m8 ; cost[0,4,2,6] + phaddd m6, m5 + phaddd m10, m4 + phaddd m1, m6, m10 ; cost[1,5,3,7] + + pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6] + pand m3, m2, m1 + pandn m4, m2, m0 + por m3, m4 ; higher 4 values + pshufd m1, m1, q2301 + pshufd m0, m0, q2301 + pand m1, m2, m1 + pandn m4, m2, m0 + por m0, m4, m1 ; 4 values at idx^4 offset + pand m14, m2, [pd_0to7+16] + pandn m15, m2, [pd_0to7] + por m15, m14 + + punpckhqdq m4, m3, m0 + punpcklqdq m3, m0 + pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5] + punpcklqdq m5, m5 + pand m6, m5, m4 + pandn m7, m5, m3 + por m6, m7 ; { highest 2 values, complements at idx^4 } + movhlps m14, m15 + pand m14, m5, m14 + pandn m13, m5, m15 + por m15, m13, m14 + + pshufd m7, m6, q3311 + pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3] + punpcklqdq m8, m8 + pand m9, m8, m7 + pandn m10, m8, m6 + por m9, m10 ; max + movhlps m10, m9 ; complement at idx^4 + psubd m9, m10 + psrld m9, 10 + movd [varq], m9 + pshufd m14, m15, q1111 + pand m14, m8, m14 + pandn m13, m8, m15 + por m15, m13, m14 + movd eax, m15 +%else +cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3 + %define PIC_reg r4 + LEA PIC_reg, PIC_base_offset + + pxor m0, m0 + mova m1, [PIC_sym(pw_128)] + + lea stride3q, [strideq*3] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + psadbw m2, m5, m0 + psadbw m3, m7, m0 + packssdw m2, m3 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + + mova [esp+0x00], m4 + mova [esp+0x10], m5 + mova [esp+0x20], m6 + mova [esp+0x50], m7 + + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + psadbw m3, m5, m0 + psadbw m0, m7, m0 + packssdw m3, m0 + pxor m0, m0 + packssdw m2, m3 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + + psllw m1, 3 + psubw m2, m1 ; partial_sum_hv[0] + pmaddwd m2, m2 + + mova m3, [esp+0x50] + mova m0, [esp+0x00] + paddw m0, [esp+0x10] + paddw m1, m3, [esp+0x20] + paddw m0, m4 + paddw m1, m5 + paddw m0, m6 + paddw m1, m7 + paddw m0, m1 ; partial_sum_hv[1] + pmaddwd m0, m0 + + phaddd m2, m0 + MULLD m2, [PIC_sym(div_table)+48] + mova [esp+0x30], m2 + + mova m1, [esp+0x10] + pslldq m0, m1, 2 + psrldq m1, 14 + paddw m0, [esp+0x00] + pslldq m2, m3, 6 + psrldq m3, 10 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 4 + psrldq m3, 12 + paddw m0, m2 ; partial_sum_diag[0] top/left half + paddw m1, m3 ; partial_sum_diag[0] top/right half + pslldq m2, m4, 8 + psrldq m3, m4, 8 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 10 + psrldq m3, m5, 6 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 12 + psrldq m3, m6, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m7, 14 + psrldq m3, m7, 2 + paddw m0, m2 ; partial_sum_diag[0][0-7] + paddw m1, m3 ; partial_sum_diag[0][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [PIC_sym(shufw_6543210x)] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [PIC_sym(div_table)+16] + MULLD m0, [PIC_sym(div_table)+0] + paddd m0, m2 ; cost[0a-d] + mova [esp+0x40], m0 + + mova m1, [esp+0x00] + pslldq m0, m1, 14 + psrldq m1, 2 + paddw m0, m7 + pslldq m2, m3, 8 + psrldq m3, 8 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 10 + psrldq m3, 6 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x10] + pslldq m2, m3, 12 + psrldq m3, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m4, 6 + psrldq m3, m4, 10 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 4 + psrldq m3, m5, 12 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 2 + psrldq m3, m6, 14 + paddw m0, m2 ; partial_sum_diag[1][0-7] + paddw m1, m3 ; partial_sum_diag[1][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [PIC_sym(shufw_6543210x)] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [PIC_sym(div_table)+16] + MULLD m0, [PIC_sym(div_table)+0] + paddd m0, m2 ; cost[4a-d] + phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] + phaddd m1, [esp+0x30] ; cost[0,4,2,6] + mova [esp+0x30], m1 + + phaddw m0, [esp+0x00], m4 + phaddw m1, [esp+0x10], m5 + paddw m4, m5 + mova m2, [esp+0x20] + paddw m5, m2, m3 + phaddw m2, m6 + paddw m6, m7 + phaddw m3, m7 + mova m7, [esp+0x00] + paddw m7, [esp+0x10] + mova [esp+0x00], m0 + mova [esp+0x10], m1 + mova [esp+0x20], m2 + + pslldq m1, m4, 4 + pslldq m2, m6, 6 + pslldq m0, m5, 2 + paddw m1, m2 + paddw m0, m7 + psrldq m2, m5, 14 + paddw m0, m1 ; partial_sum_alt[3] left + psrldq m1, m4, 12 + paddw m1, m2 + psrldq m2, m6, 10 + paddw m1, m2 ; partial_sum_alt[3] right + pshuflw m1, m1, q3012 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [PIC_sym(div_table)+48] + MULLD m0, [PIC_sym(div_table)+32] + paddd m0, m2 ; cost[7a-d] + mova [esp+0x40], m0 + + pslldq m0, m7, 6 + psrldq m7, 10 + pslldq m1, m5, 4 + psrldq m5, 12 + pslldq m2, m4, 2 + psrldq m4, 14 + paddw m0, m6 + paddw m7, m5 + paddw m0, m1 + paddw m7, m4 + paddw m0, m2 + pshuflw m7, m7, q3012 + punpckhwd m2, m0, m7 + punpcklwd m0, m7 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [PIC_sym(div_table)+48] + MULLD m0, [PIC_sym(div_table)+32] + paddd m0, m2 ; cost[5a-d] + mova [esp+0x50], m0 + + mova m1, [esp+0x10] + mova m2, [esp+0x20] + pslldq m0, m1, 2 + psrldq m1, 14 + pslldq m4, m2, 4 + psrldq m2, 12 + pslldq m5, m3, 6 + psrldq m6, m3, 10 + paddw m0, [esp+0x00] + paddw m1, m2 + paddw m4, m5 + paddw m1, m6 ; partial_sum_alt[3] right + paddw m0, m4 ; partial_sum_alt[3] left + pshuflw m1, m1, q3012 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [PIC_sym(div_table)+48] + MULLD m0, [PIC_sym(div_table)+32] + paddd m0, m2 ; cost[1a-d] + phaddd m0, [esp+0x50] + mova [esp+0x50], m0 + + pshufd m0, [esp+0x00], q1032 + pshufd m1, [esp+0x10], q1032 + pshufd m2, [esp+0x20], q1032 + pshufd m3, m3, q1032 + + pslldq m4, m0, 6 + psrldq m0, 10 + pslldq m5, m1, 4 + psrldq m1, 12 + pslldq m6, m2, 2 + psrldq m2, 14 + paddw m4, m3 + paddw m0, m1 + paddw m5, m6 + paddw m0, m2 + paddw m4, m5 + pshuflw m0, m0, q3012 + punpckhwd m2, m4, m0 + punpcklwd m4, m0 + pmaddwd m2, m2 + pmaddwd m4, m4 + MULLD m2, [PIC_sym(div_table)+48] + MULLD m4, [PIC_sym(div_table)+32] + paddd m4, m2 ; cost[3a-d] + phaddd m4, [esp+0x40] + + mova m1, [esp+0x50] + mova m0, [esp+0x30] ; cost[0,4,2,6] + phaddd m1, m4 ; cost[1,5,3,7] + + pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6] + pand m3, m2, m1 + pandn m4, m2, m0 + por m3, m4 ; higher 4 values + pshufd m1, m1, q2301 + pshufd m0, m0, q2301 + pand m1, m2, m1 + pandn m4, m2, m0 + por m0, m4, m1 ; 4 values at idx^4 offset + pand m5, m2, [PIC_sym(pd_0to7)+16] + pandn m6, m2, [PIC_sym(pd_0to7)] + por m6, m5 + + punpckhqdq m4, m3, m0 + punpcklqdq m3, m0 + pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5] + punpcklqdq m0, m0 + pand m1, m0, m4 + pandn m7, m0, m3 + por m1, m7 ; { highest 2 values, complements at idx^4 } + movhlps m5, m6 + pand m5, m0, m5 + pandn m3, m0, m6 + por m6, m3, m5 + + pshufd m7, m1, q3311 + pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3] + punpcklqdq m2, m2 + pand m0, m2, m7 + pandn m7, m2, m1 + por m0, m7 ; max + movhlps m7, m0 ; complement at idx^4 + psubd m0, m7 + psrld m0, 10 + movd [varq], m0 + pshufd m5, m6, q1111 + pand m5, m2, m5 + pandn m3, m2, m6 + por m6, m3, m5 + movd eax, m6 +%endif + + RET diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/src/x86/looprestoration_ssse3.asm new/dav1d-0.2.1/src/x86/looprestoration_ssse3.asm --- old/dav1d-0.2.0/src/x86/looprestoration_ssse3.asm 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/src/x86/looprestoration_ssse3.asm 2019-03-12 15:28:36.000000000 +0100 @@ -35,6 +35,7 @@ db 1, 2 pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 +pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 pb_0: times 16 db 0 pb_2: times 16 db 2 pb_3: times 16 db 3 @@ -509,17 +510,11 @@ ;; self-guided ;; ;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro MULLD 2-3 1 ; %3 = is_constant - pmuludq m5, %1, %2 - psrlq %1, 32 - %if %3 == 0 - pshufd m3, %2, q2301 - pmuludq %1, m3 - %else - pmuludq %1, %2 - %endif - shufps %1, m5, q2020 - pshufd %1, %1, q1302 +%macro MULLD 2 + pmulhuw m5, %1, %2 + pmullw %1, %2 + pslld m5, 16 + paddd %1, m5 %endmacro %macro GATHERDD 2 @@ -766,7 +761,7 @@ jl .loop_x RET -cglobal sgr_calc_ab1, 4, 7, 14, a, b, w, h, s +cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s movifnidn sd, sm sub aq, (384+16-1)*4 sub bq, (384+16-1)*2 @@ -777,17 +772,20 @@ SETUP_PIC r5, 0 %endif movd m6, sd - pshufd m6, m6, 0 + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 pxor m7, m7 DEFINE_ARGS a, b, w, h, x %if ARCH_X86_64 mova m8, [pd_0xF00801C7] mova m9, [pw_256] psrld m10, m9, 13 ; pd_2048 + mova m11, [pb_unpcklwdw] %else %define m8 [PIC_sym(pd_0xF00801C7)] %define m9 [PIC_sym(pw_256)] %define m10 [PIC_sym(pd_2048)] + %define m11 [PIC_sym(pb_unpcklwdw)] %endif .loop_y: mov xq, -2 @@ -818,10 +816,12 @@ GATHERDD m2, m3 psrld m4, 24 psrld m2, 24 - MULLD m0, m4, 0 - MULLD m1, m2, 0 - packssdw m4, m2 - psubw m5, m9, m4 + packssdw m3, m4, m2 + pshufb m4, m11 + MULLD m0, m4 + pshufb m2, m11 + MULLD m1, m2 + psubw m5, m9, m3 paddd m0, m10 paddd m1, m10 psrld m0, 12 @@ -1516,7 +1516,8 @@ SETUP_PIC r5, 0 %endif movd m6, sd - pshufd m6, m6, 0 + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 pxor m7, m7 DEFINE_ARGS a, b, w, h, x %if ARCH_X86_64 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/tools/dav1d.c new/dav1d-0.2.1/tools/dav1d.c --- old/dav1d-0.2.0/tools/dav1d.c 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/tools/dav1d.c 2019-03-12 15:28:36.000000000 +0100 @@ -73,11 +73,11 @@ Dav1dContext *c; Dav1dData data; unsigned n_out = 0, total, fps[2]; - const char *version = dav1d_version_vcs(); + const char *version = dav1d_version(); - if (strcmp(version, DAV1D_VERSION_VCS)) { + if (strcmp(version, DAV1D_VERSION)) { fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n", - version, DAV1D_VERSION_VCS); + version, DAV1D_VERSION); return -1; } @@ -100,7 +100,7 @@ } if (!cli_settings.quiet) - fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version_vcs()); + fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version()); // skip frames until a sequence header is found if (cli_settings.skip) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dav1d-0.2.0/tools/dav1d_cli_parse.c new/dav1d-0.2.1/tools/dav1d_cli_parse.c --- old/dav1d-0.2.0/tools/dav1d_cli_parse.c 2019-03-04 15:21:54.000000000 +0100 +++ new/dav1d-0.2.1/tools/dav1d_cli_parse.c 2019-03-12 15:28:36.000000000 +0100 @@ -263,7 +263,7 @@ !!parse_unsigned(optarg, ARG_ALL_LAYERS, argv[0]); break; case 'v': - fprintf(stderr, "%s\n", dav1d_version_vcs()); + fprintf(stderr, "%s\n", dav1d_version()); exit(0); case ARG_CPU_MASK: dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,
