Hello community,

here is the log from the commit of package vapoursynth for openSUSE:Factory checked in at 2020-08-25 09:39:41
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/vapoursynth (Old)
 and /work/SRC/openSUSE:Factory/.vapoursynth.new.3399 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "vapoursynth" Tue Aug 25 09:39:41 2020 rev:14 rq:829017 version:52 Changes: -------- --- /work/SRC/openSUSE:Factory/vapoursynth/vapoursynth.changes 2020-08-12 10:37:49.300330694 +0200 +++ /work/SRC/openSUSE:Factory/.vapoursynth.new.3399/vapoursynth.changes 2020-08-25 09:41:26.728285249 +0200 @@ -1,0 +2,12 @@ +Mon Aug 24 08:07:44 UTC 2020 - Michael Vetter <mvet...@suse.com> + +- Update to 52: + * updated visual studio 2019 runtime version + * updated zimg + * updated vsrepo with support for python wheel packages + * vsgenstubs is now included with vsrepo + * fixed maximum for 16 bit input with diagonal filters and optimizations + * fixed deadlock in fmserial filters introduced in r51 + * fixed more averageframes bugs + +------------------------------------------------------------------- Old: ---- vapoursynth-R51.tar.gz New: ---- vapoursynth-R52.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ vapoursynth.spec ++++++ --- /var/tmp/diff_new_pack.ohUSKu/_old 2020-08-25 09:41:27.392285551 +0200 +++ /var/tmp/diff_new_pack.ohUSKu/_new 2020-08-25 09:41:27.396285552 +0200 @@ -17,7 +17,7 @@ Name: vapoursynth -Version: 51 +Version: 52 Release: 0 Summary: A video processing framework License: LGPL-2.1-only ++++++ vapoursynth-R51.tar.gz -> vapoursynth-R52.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/ChangeLog new/vapoursynth-R52/ChangeLog --- old/vapoursynth-R51/ChangeLog 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/ChangeLog 2020-08-20 19:48:22.000000000 +0200 @@ -1,3 +1,12 @@ +r52: +updated visual studio 2019 runtime version +updated zimg +updated vsrepo with support for python wheel packages +vsgenstubs is now included with vsrepo +fixed maximum for 16 bit input with diagonal filters and optimizations +fixed deadlock in fmserial filters introduced in r51 +fixed more averageframes bugs (sekrit-twc) + 
r51: updated visual studio 2019 runtime version fixed a cache shrinking issue diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' "old/vapoursynth-R51/build instructions windows.md" "new/vapoursynth-R52/build instructions windows.md" --- "old/vapoursynth-R51/build instructions windows.md" 2020-07-29 11:18:16.000000000 +0200 +++ "new/vapoursynth-R52/build instructions windows.md" 2020-08-20 19:48:22.000000000 +0200 @@ -5,7 +5,7 @@ ## Required languages and applications * Needs [Visual Studio 2019](https://visualstudio.microsoft.com/de/vs/) -* It also needs both [32bit](https://www.python.org/ftp/python/3.8.3/python-3.8.3-webinstall.exe) and [64bit](https://www.python.org/ftp/python/3.8.3/python-3.8.3-amd64-webinstall.exe) Python 3.8 series (the msvc project assumes that you installed python for all users.) +* It also needs both [32bit](https://www.python.org/) and [64bit](https://www.python.org/) Python 3.8 series (the msvc project assumes that you installed python for all users.) 
* [InnoSetup 6.x](http://www.jrsoftware.org/isdl.php) is needed to create the installer (default installation path assumed) * [7-zip](https://www.7-zip.org/) is needed to compress the portable version (default installation path assumed) @@ -13,7 +13,7 @@ * Clone VapourSynth * Clone VSRepo into the VapourSynth dir (`git clone https://github.com/vapoursynth/vsrepo`) -* Clone zimg v2.9 branch into the VapourSynth dir (`git clone https://github.com/sekrit-twc/zimg --branch v2.9`) +* Clone zimg into the VapourSynth dir (`git clone https://github.com/sekrit-twc/zimg --branch v2.9`) * Clone avs+ into the VapourSynth dir (`git clone https://github.com/AviSynth/AviSynthPlus.git`) * Compile 32 and 64 bit releases using the VapourSynth solution diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/configure.ac new/vapoursynth-R52/configure.ac --- old/vapoursynth-R51/configure.ac 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/configure.ac 2020-08-20 19:48:22.000000000 +0200 @@ -1,4 +1,4 @@ -AC_INIT([vapoursynth], [51], [https://github.com/vapoursynth/vapoursynth/issues], [vapoursynth], [http://www.vapoursynth.com/]) +AC_INIT([vapoursynth], [52], [https://github.com/vapoursynth/vapoursynth/issues], [vapoursynth], [http://www.vapoursynth.com/]) : ${CFLAGS=""} : ${CXXFLAGS=""} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/doc/conf.py new/vapoursynth-R52/doc/conf.py --- old/vapoursynth-R51/doc/conf.py 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/doc/conf.py 2020-08-20 19:48:22.000000000 +0200 @@ -49,7 +49,7 @@ # built documents. # # The short X.Y version. -version = 'R51' +version = 'R52' # The full version, including alpha/beta/rc tags. 
release = version diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/installer/scripts/products/vcredist2017.iss new/vapoursynth-R52/installer/scripts/products/vcredist2017.iss --- old/vapoursynth-R51/installer/scripts/products/vcredist2017.iss 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/installer/scripts/products/vcredist2017.iss 2020-08-20 19:48:22.000000000 +0200 @@ -10,8 +10,8 @@ [Code] const - vcredist2017_url = 'http://download.visualstudio.microsoft.com/download/pr/d60aa805-26e9-47df-b4e3-cd6fcc392333/A06AAC66734A618AB33C1522920654DDFC44FC13CAFAA0F0AB85B199C3D51DC0/VC_redist.x86.exe'; - vcredist2017_url_x64 = 'http://download.visualstudio.microsoft.com/download/pr/d60aa805-26e9-47df-b4e3-cd6fcc392333/7D7105C52FCD6766BEEE1AE162AA81E278686122C1E44890712326634D0B055E/VC_redist.x64.exe'; + vcredist2017_url = 'http://download.visualstudio.microsoft.com/download/pr/9fe82b83-f3a1-43f5-8f25-ebe24529854c/B4D433E2F66B30B478C0D080CCD5217CA2A963C16E90CAF10B1E0592B7D8D519/VC_redist.x86.exe'; + vcredist2017_url_x64 = 'http://download.visualstudio.microsoft.com/download/pr/fd5d2eea-32b8-4814-b55e-28c83dd72d9c/952A0C6CB4A3DD14C3666EF05BB1982C5FF7F87B7103C2BA896354F00651E358/VC_redist.x64.exe'; vcredist2017_upgradecode = '{65E5BD06-6392-3027-8C26-853107D3CF1A}'; vcredist2017_upgradecode_x64 = '{36F68A90-239C-34DF-B58C-64B30153CE35}'; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/installer/setup.py new/vapoursynth-R52/installer/setup.py --- old/vapoursynth-R51/installer/setup.py 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/installer/setup.py 2020-08-20 19:48:22.000000000 +0200 @@ -1,4 +1,4 @@ -CURRENT_RELEASE = "51" +CURRENT_RELEASE = "52" # Always prefer setuptools over distutils from setuptools import setup, find_packages diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' 
old/vapoursynth-R51/installer/vsinstaller.iss new/vapoursynth-R52/installer/vsinstaller.iss --- old/vapoursynth-R51/installer/vsinstaller.iss 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/installer/vsinstaller.iss 2020-08-20 19:48:22.000000000 +0200 @@ -1,4 +1,4 @@ -#define Version '51' +#define Version '52' #define VersionExtra '' #define PythonVersion '3.8' #define PythonCompactVersion '38' @@ -96,6 +96,8 @@ ;vsrepo Source: ..\vsrepo\vsrepo.py; DestDir: {app}\vsrepo; Flags: ignoreversion uninsrestartdelete restartreplace; Components: vsrepo +Source: ..\vsrepo\vsgenstubs\__init__.py; DestDir: {app}\vsrepo\vsgenstubs; Flags: ignoreversion uninsrestartdelete restartreplace; Components: vsrepo +Source: ..\vsrepo\vsgenstubs\_vapoursynth.part.pyi; DestDir: {app}\vsrepo\vsgenstubs; Flags: ignoreversion uninsrestartdelete restartreplace; Components: vsrepo Source: 7z.exe; DestDir: {app}\vsrepo; Flags: ignoreversion uninsrestartdelete restartreplace; Components: vsrepo Source: 7z.dll; DestDir: {app}\vsrepo; Flags: ignoreversion uninsrestartdelete restartreplace; Components: vsrepo @@ -187,7 +189,7 @@ [Code] -const VSRuntimeVersion = '14.26.28720'; +const VSRuntimeVersion = '14.27.29016'; type TPythonPath = record diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/setup.py new/vapoursynth-R52/setup.py --- old/vapoursynth-R51/setup.py 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/setup.py 2020-08-20 19:48:22.000000000 +0200 @@ -85,7 +85,7 @@ author = "Fredrik Mellbin", author_email = "fredrik.mell...@gmail.com", license = "LGPL 2.1 or later", - version = "51", + version = "52", long_description = "A portable replacement for Avisynth", platforms = "All", ext_modules = [Extension("vapoursynth", [join("src", "cython", "vapoursynth.pyx")], diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/src/core/kernel/x86/generic_avx2.cpp 
new/vapoursynth-R52/src/core/kernel/x86/generic_avx2.cpp --- old/vapoursynth-R51/src/core/kernel/x86/generic_avx2.cpp 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/src/core/kernel/x86/generic_avx2.cpp 2020-08-20 19:48:22.000000000 +0200 @@ -1152,7 +1152,7 @@ filter_plane_3x3<MinMaxFixedWord<STENCIL_ALL, true>>(src, src_stride, dst, dst_stride, *params, width, height); break; default: - filter_plane_3x3<MinMaxByte<true>>(src, src_stride, dst, dst_stride, *params, width, height); + filter_plane_3x3<MinMaxWord<true>>(src, src_stride, dst, dst_stride, *params, width, height); break; } } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/src/core/kernel/x86/generic_sse2.cpp new/vapoursynth-R52/src/core/kernel/x86/generic_sse2.cpp --- old/vapoursynth-R51/src/core/kernel/x86/generic_sse2.cpp 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/src/core/kernel/x86/generic_sse2.cpp 2020-08-20 19:48:22.000000000 +0200 @@ -1178,7 +1178,7 @@ filter_plane_3x3<MinMaxFixedWord<STENCIL_ALL, true>>(src, src_stride, dst, dst_stride, *params, width, height); break; default: - filter_plane_3x3<MinMaxByte<true>>(src, src_stride, dst, dst_stride, *params, width, height); + filter_plane_3x3<MinMaxWord<true>>(src, src_stride, dst, dst_stride, *params, width, height); break; } } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/src/core/version.h new/vapoursynth-R52/src/core/version.h --- old/vapoursynth-R51/src/core/version.h 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/src/core/version.h 2020-08-20 19:48:22.000000000 +0200 @@ -34,7 +34,7 @@ #define XSTR(x) STR(x) #define STR(x) #x -#define VAPOURSYNTH_CORE_VERSION 51 +#define VAPOURSYNTH_CORE_VERSION 52 #if defined(VS_FRAME_GUARD) && !defined(NDEBUG) #define VS_OPTIONS_TEXT "Options: Frame Guard + Extra Assertions\n" #elif defined(VS_FRAME_GUARD) diff -urN '--exclude=CVS' 
'--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/src/core/vsthreadpool.cpp new/vapoursynth-R52/src/core/vsthreadpool.cpp --- old/vapoursynth-R51/src/core/vsthreadpool.cpp 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/src/core/vsthreadpool.cpp 2020-08-20 19:48:22.000000000 +0200 @@ -77,9 +77,6 @@ // Go through all tasks from the top (oldest) and process the first one possible owner->tasks.sort(taskCmp); - // fixme, test if this matters at all! - std::set<VSNode *> seenNodes; - for (auto iter = owner->tasks.begin(); iter != owner->tasks.end(); ++iter) { FrameContext *mainContext = iter->get(); FrameContext *leafContext = nullptr; @@ -108,9 +105,6 @@ mainContext = mainContext->upstreamContext.get(); } - if (!seenNodes.insert(mainContext->clip).second) - continue; - VSNode *clip = mainContext->clip; int filterMode = clip->filterMode; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/src/cython/vapoursynth.pyx new/vapoursynth-R52/src/cython/vapoursynth.pyx --- old/vapoursynth-R51/src/cython/vapoursynth.pyx 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/src/cython/vapoursynth.pyx 2020-08-20 19:48:22.000000000 +0200 @@ -70,7 +70,7 @@ 'core', ] -__version__ = namedtuple("VapourSynthVersion", "release_major release_minor")(51, 0) +__version__ = namedtuple("VapourSynthVersion", "release_major release_minor")(52, 0) __api_version__ = namedtuple("VapourSynthAPIVersion", "api_major api_minor")(VAPOURSYNTH_API_MAJOR, VAPOURSYNTH_API_MINOR) @final diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/vapoursynth-R51/src/filters/misc/miscfilters.cpp new/vapoursynth-R52/src/filters/misc/miscfilters.cpp --- old/vapoursynth-R51/src/filters/misc/miscfilters.cpp 2020-07-29 11:18:16.000000000 +0200 +++ new/vapoursynth-R52/src/filters/misc/miscfilters.cpp 2020-08-20 19:48:22.000000000 +0200 @@ -239,63 +239,111 @@ weights[i / 2] = 
_mm_set1_epi32((static_cast<uint32_t>(weight_hi) << 16) | weight_lo); } if (numSrcs % 2) - weights[numSrcs / 2 - 1] = _mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1])); + weights[numSrcs / 2] = _mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1])); - __m128i bias = _mm_setzero_si128(); __m128 scale = _mm_set_ps1(1.0f / d->scale); - if ((plane == 1 || plane == 2) && (d->vi.format->colorFamily == cmYUV || d->vi.format->colorFamily == cmYCoCg)) - bias = _mm_set1_epi8(128); + if ((plane == 1 || plane == 2) && (d->vi.format->colorFamily == cmYUV || d->vi.format->colorFamily == cmYCoCg)) { + __m128i bias = _mm_set1_epi8(128); - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; w += 16) { - __m128i accum_lolo = _mm_setzero_si128(); - __m128i accum_lohi = _mm_setzero_si128(); - __m128i accum_hilo = _mm_setzero_si128(); - __m128i accum_hihi = _mm_setzero_si128(); - - for (size_t i = 0; i < numSrcs; i += 2) { - __m128i coeffs = weights[i / 2]; - __m128i v1 = _mm_sub_epi8(_mm_load_si128((const __m128i *)(srcpp[i + 0] + w)), bias); - __m128i v2 = _mm_sub_epi8(_mm_load_si128((const __m128i *)(srcpp[i + 1] + w)), bias); - __m128i v1_sign = _mm_cmplt_epi8(v1, _mm_setzero_si128()); - __m128i v2_sign = _mm_cmplt_epi8(v2, _mm_setzero_si128()); - - __m128i v1_lo = _mm_unpacklo_epi8(v1, v1_sign); - __m128i v1_hi = _mm_unpackhi_epi8(v1, v1_sign); - __m128i v2_lo = _mm_unpacklo_epi8(v2, v2_sign); - __m128i v2_hi = _mm_unpackhi_epi8(v2, v2_sign); - - accum_lolo = _mm_add_epi32(accum_lolo, _mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_lo, v2_lo))); - accum_lohi = _mm_add_epi32(accum_lohi, _mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_lo, v2_lo))); - accum_hilo = _mm_add_epi32(accum_hilo, _mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_hi, v2_hi))); - accum_hihi = _mm_add_epi32(accum_hihi, _mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_hi, v2_hi))); - } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; w += 16) { + __m128i accum_lolo = 
_mm_setzero_si128(); + __m128i accum_lohi = _mm_setzero_si128(); + __m128i accum_hilo = _mm_setzero_si128(); + __m128i accum_hihi = _mm_setzero_si128(); - __m128 accumf_lolo = _mm_cvtepi32_ps(accum_lolo); - __m128 accumf_lohi = _mm_cvtepi32_ps(accum_lohi); - __m128 accumf_hilo = _mm_cvtepi32_ps(accum_hilo); - __m128 accumf_hihi = _mm_cvtepi32_ps(accum_hihi); - accumf_lolo = _mm_mul_ps(accumf_lolo, scale); - accumf_lohi = _mm_mul_ps(accumf_lohi, scale); - accumf_hilo = _mm_mul_ps(accumf_hilo, scale); - accumf_hihi = _mm_mul_ps(accumf_hihi, scale); - - accum_lolo = _mm_cvtps_epi32(accumf_lolo); - accum_lohi = _mm_cvtps_epi32(accumf_lohi); - accum_hilo = _mm_cvtps_epi32(accumf_hilo); - accum_hihi = _mm_cvtps_epi32(accumf_hihi); - - accum_lolo = _mm_packs_epi32(accum_lolo, accum_lohi); - accum_hilo = _mm_packs_epi32(accum_hilo, accum_hihi); - accum_lolo = _mm_packs_epi16(accum_lolo, accum_hilo); + for (size_t i = 0; i < numSrcs; i += 2) { + __m128i coeffs = weights[i / 2]; + __m128i v1 = _mm_sub_epi8(_mm_load_si128((const __m128i *)(srcpp[i + 0] + w)), bias); + __m128i v2 = _mm_sub_epi8(_mm_load_si128((const __m128i *)(srcpp[i + 1] + w)), bias); + __m128i v1_sign = _mm_cmplt_epi8(v1, _mm_setzero_si128()); + __m128i v2_sign = _mm_cmplt_epi8(v2, _mm_setzero_si128()); + + __m128i v1_lo = _mm_unpacklo_epi8(v1, v1_sign); + __m128i v1_hi = _mm_unpackhi_epi8(v1, v1_sign); + __m128i v2_lo = _mm_unpacklo_epi8(v2, v2_sign); + __m128i v2_hi = _mm_unpackhi_epi8(v2, v2_sign); + + accum_lolo = _mm_add_epi32(accum_lolo, _mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_lo, v2_lo))); + accum_lohi = _mm_add_epi32(accum_lohi, _mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_lo, v2_lo))); + accum_hilo = _mm_add_epi32(accum_hilo, _mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_hi, v2_hi))); + accum_hihi = _mm_add_epi32(accum_hihi, _mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_hi, v2_hi))); + } - accum_lolo = _mm_add_epi8(accum_lolo, bias); - _mm_store_si128((__m128i *)(dstp + w), accum_lolo); + __m128 
accumf_lolo = _mm_cvtepi32_ps(accum_lolo); + __m128 accumf_lohi = _mm_cvtepi32_ps(accum_lohi); + __m128 accumf_hilo = _mm_cvtepi32_ps(accum_hilo); + __m128 accumf_hihi = _mm_cvtepi32_ps(accum_hihi); + accumf_lolo = _mm_mul_ps(accumf_lolo, scale); + accumf_lohi = _mm_mul_ps(accumf_lohi, scale); + accumf_hilo = _mm_mul_ps(accumf_hilo, scale); + accumf_hihi = _mm_mul_ps(accumf_hihi, scale); + + accum_lolo = _mm_cvtps_epi32(accumf_lolo); + accum_lohi = _mm_cvtps_epi32(accumf_lohi); + accum_hilo = _mm_cvtps_epi32(accumf_hilo); + accum_hihi = _mm_cvtps_epi32(accumf_hihi); + + accum_lolo = _mm_packs_epi32(accum_lolo, accum_lohi); + accum_hilo = _mm_packs_epi32(accum_hilo, accum_hihi); + accum_lolo = _mm_packs_epi16(accum_lolo, accum_hilo); + + accum_lolo = _mm_add_epi8(accum_lolo, bias); + _mm_store_si128((__m128i *)(dstp + w), accum_lolo); + } + + std::transform(srcpp, srcpp + numSrcs, srcpp, [=](const uint8_t *ptr) { return ptr + stride; }); + dstp += stride; } + } else { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; w += 16) { + __m128i accum_lolo = _mm_setzero_si128(); + __m128i accum_lohi = _mm_setzero_si128(); + __m128i accum_hilo = _mm_setzero_si128(); + __m128i accum_hihi = _mm_setzero_si128(); + + for (size_t i = 0; i < numSrcs; i += 2) { + __m128i coeffs = weights[i / 2]; + __m128i v1 = _mm_load_si128((const __m128i *)(srcpp[i + 0] + w)); + __m128i v2 = _mm_load_si128((const __m128i *)(srcpp[i + 1] + w)); + + __m128i v1_lo = _mm_unpacklo_epi8(v1, _mm_setzero_si128()); + __m128i v1_hi = _mm_unpackhi_epi8(v1, _mm_setzero_si128()); + __m128i v2_lo = _mm_unpacklo_epi8(v2, _mm_setzero_si128()); + __m128i v2_hi = _mm_unpackhi_epi8(v2, _mm_setzero_si128()); + + accum_lolo = _mm_add_epi32(accum_lolo, _mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_lo, v2_lo))); + accum_lohi = _mm_add_epi32(accum_lohi, _mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_lo, v2_lo))); + accum_hilo = _mm_add_epi32(accum_hilo, _mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_hi, 
v2_hi))); + accum_hihi = _mm_add_epi32(accum_hihi, _mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_hi, v2_hi))); + } - std::transform(srcpp, srcpp + numSrcs, srcpp, [=](const uint8_t *ptr) { return ptr + stride; }); - dstp += stride; + __m128 accumf_lolo = _mm_cvtepi32_ps(accum_lolo); + __m128 accumf_lohi = _mm_cvtepi32_ps(accum_lohi); + __m128 accumf_hilo = _mm_cvtepi32_ps(accum_hilo); + __m128 accumf_hihi = _mm_cvtepi32_ps(accum_hihi); + accumf_lolo = _mm_mul_ps(accumf_lolo, scale); + accumf_lohi = _mm_mul_ps(accumf_lohi, scale); + accumf_hilo = _mm_mul_ps(accumf_hilo, scale); + accumf_hihi = _mm_mul_ps(accumf_hihi, scale); + + accum_lolo = _mm_cvtps_epi32(accumf_lolo); + accum_lohi = _mm_cvtps_epi32(accumf_lohi); + accum_hilo = _mm_cvtps_epi32(accumf_hilo); + accum_hihi = _mm_cvtps_epi32(accumf_hihi); + + accum_lolo = _mm_packs_epi32(accum_lolo, accum_lohi); + accum_hilo = _mm_packs_epi32(accum_hilo, accum_hihi); + accum_lolo = _mm_packus_epi16(accum_lolo, accum_hilo); + + _mm_store_si128((__m128i *)(dstp + w), accum_lolo); + } + + std::transform(srcpp, srcpp + numSrcs, srcpp, [=](const uint8_t *ptr) { return ptr + stride; }); + dstp += stride; + } } } @@ -318,13 +366,13 @@ __m128i weights[16]; __m128 scale = _mm_set_ps1(1.0f / d->scale); - for (size_t i = 0; i < numSrcs; i += 2) { + for (size_t i = 0; i < (numSrcs & ~1); i += 2) { uint16_t weight_lo = static_cast<int16_t>(d->weights[i]); uint16_t weight_hi = static_cast<int16_t>(d->weights[i + 1]); weights[i / 2] = _mm_set1_epi32((static_cast<uint32_t>(weight_hi) << 16) | weight_lo); } if (numSrcs % 2) - weights[numSrcs / 2 - 1] = _mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1])); + weights[numSrcs / 2] = _mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1])); if ((plane == 1 || plane == 2) && (d->vi.format->colorFamily == cmYUV || d->vi.format->colorFamily == cmYCoCg)) { __m128i bias = _mm_set1_epi16(1U << (d->vi.format->bitsPerSample - 1)); @@ -368,7 +416,7 @@ __m128i accumbias = 
_mm_setzero_si128(); __m128i maxVal = _mm_add_epi16(_mm_set1_epi16((1U << d->vi.format->bitsPerSample) - 1), _mm_set1_epi16(INT16_MIN)); - for (size_t i = 0; i < numSrcs / 2; ++i) { + for (size_t i = 0; i < (numSrcs + 1) / 2; ++i) { accumbias = _mm_add_epi32(accumbias, _mm_madd_epi16(_mm_set1_epi16(INT16_MIN), weights[i])); }