Hello community,

Here is the log from the commit of package vapoursynth for openSUSE:Factory,
checked in at 2020-08-25 09:39:41.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/vapoursynth (Old)
 and      /work/SRC/openSUSE:Factory/.vapoursynth.new.3399 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "vapoursynth"

Tue Aug 25 09:39:41 2020 rev:14 rq:829017 version:52

Changes:
--------
--- /work/SRC/openSUSE:Factory/vapoursynth/vapoursynth.changes  2020-08-12 
10:37:49.300330694 +0200
+++ /work/SRC/openSUSE:Factory/.vapoursynth.new.3399/vapoursynth.changes        
2020-08-25 09:41:26.728285249 +0200
@@ -1,0 +2,12 @@
+Mon Aug 24 08:07:44 UTC 2020 - Michael Vetter <mvet...@suse.com>
+
+- Update to 52:
+  * updated visual studio 2019 runtime version
+  * updated zimg
+  * updated vsrepo with support for python wheel packages
+  * vsgenstubs is now included with vsrepo
+  * fixed maximum for 16 bit input with diagonal filters and optimizations
+  * fixed deadlock in fmserial filters introduced in r51
+  * fixed more averageframes bugs
+
+-------------------------------------------------------------------

Old:
----
  vapoursynth-R51.tar.gz

New:
----
  vapoursynth-R52.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ vapoursynth.spec ++++++
--- /var/tmp/diff_new_pack.ohUSKu/_old  2020-08-25 09:41:27.392285551 +0200
+++ /var/tmp/diff_new_pack.ohUSKu/_new  2020-08-25 09:41:27.396285552 +0200
@@ -17,7 +17,7 @@
 
 
 Name:           vapoursynth
-Version:        51
+Version:        52
 Release:        0
 Summary:        A video processing framework
 License:        LGPL-2.1-only

++++++ vapoursynth-R51.tar.gz -> vapoursynth-R52.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/ChangeLog 
new/vapoursynth-R52/ChangeLog
--- old/vapoursynth-R51/ChangeLog       2020-07-29 11:18:16.000000000 +0200
+++ new/vapoursynth-R52/ChangeLog       2020-08-20 19:48:22.000000000 +0200
@@ -1,3 +1,12 @@
+r52:
+updated visual studio 2019 runtime version
+updated zimg
+updated vsrepo with support for python wheel packages
+vsgenstubs is now included with vsrepo
+fixed maximum for 16 bit input with diagonal filters and optimizations
+fixed deadlock in fmserial filters introduced in r51
+fixed more averageframes bugs (sekrit-twc)
+
 r51:
 updated visual studio 2019 runtime version
 fixed a cache shrinking issue
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' "old/vapoursynth-R51/build instructions windows.md" 
"new/vapoursynth-R52/build instructions windows.md"
--- "old/vapoursynth-R51/build instructions windows.md" 2020-07-29 
11:18:16.000000000 +0200
+++ "new/vapoursynth-R52/build instructions windows.md" 2020-08-20 
19:48:22.000000000 +0200
@@ -5,7 +5,7 @@
 ## Required languages and applications
 
 * Needs [Visual Studio 2019](https://visualstudio.microsoft.com/de/vs/)
-* It also needs both 
[32bit](https://www.python.org/ftp/python/3.8.3/python-3.8.3-webinstall.exe) 
and 
[64bit](https://www.python.org/ftp/python/3.8.3/python-3.8.3-amd64-webinstall.exe)
 Python 3.8 series (the msvc project assumes that you installed python for all 
users.)
+* It also needs both [32bit](https://www.python.org/) and 
[64bit](https://www.python.org/) Python 3.8 series (the msvc project assumes 
that you installed python for all users.)
 * [InnoSetup 6.x](http://www.jrsoftware.org/isdl.php) is needed to create the 
installer (default installation path assumed)
 * [7-zip](https://www.7-zip.org/) is needed to compress the portable version 
(default installation path assumed)
 
@@ -13,7 +13,7 @@
 
 * Clone VapourSynth
 * Clone VSRepo into the VapourSynth dir (`git clone 
https://github.com/vapoursynth/vsrepo`)
-* Clone zimg v2.9 branch into the VapourSynth dir (`git clone 
https://github.com/sekrit-twc/zimg --branch v2.9`)
+* Clone zimg into the VapourSynth dir (`git clone 
https://github.com/sekrit-twc/zimg --branch v2.9`)
 * Clone avs+ into the VapourSynth dir (`git clone 
https://github.com/AviSynth/AviSynthPlus.git`)
 * Compile 32 and 64 bit releases using the VapourSynth solution
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/configure.ac 
new/vapoursynth-R52/configure.ac
--- old/vapoursynth-R51/configure.ac    2020-07-29 11:18:16.000000000 +0200
+++ new/vapoursynth-R52/configure.ac    2020-08-20 19:48:22.000000000 +0200
@@ -1,4 +1,4 @@
-AC_INIT([vapoursynth], [51], 
[https://github.com/vapoursynth/vapoursynth/issues], [vapoursynth], 
[http://www.vapoursynth.com/])
+AC_INIT([vapoursynth], [52], 
[https://github.com/vapoursynth/vapoursynth/issues], [vapoursynth], 
[http://www.vapoursynth.com/])
 
 : ${CFLAGS=""}
 : ${CXXFLAGS=""}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/doc/conf.py 
new/vapoursynth-R52/doc/conf.py
--- old/vapoursynth-R51/doc/conf.py     2020-07-29 11:18:16.000000000 +0200
+++ new/vapoursynth-R52/doc/conf.py     2020-08-20 19:48:22.000000000 +0200
@@ -49,7 +49,7 @@
 # built documents.
 #
 # The short X.Y version.
-version = 'R51'
+version = 'R52'
 # The full version, including alpha/beta/rc tags.
 release = version
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/vapoursynth-R51/installer/scripts/products/vcredist2017.iss 
new/vapoursynth-R52/installer/scripts/products/vcredist2017.iss
--- old/vapoursynth-R51/installer/scripts/products/vcredist2017.iss     
2020-07-29 11:18:16.000000000 +0200
+++ new/vapoursynth-R52/installer/scripts/products/vcredist2017.iss     
2020-08-20 19:48:22.000000000 +0200
@@ -10,8 +10,8 @@
 
 [Code]
 const
-       vcredist2017_url = 
'http://download.visualstudio.microsoft.com/download/pr/d60aa805-26e9-47df-b4e3-cd6fcc392333/A06AAC66734A618AB33C1522920654DDFC44FC13CAFAA0F0AB85B199C3D51DC0/VC_redist.x86.exe';
-       vcredist2017_url_x64 = 
'http://download.visualstudio.microsoft.com/download/pr/d60aa805-26e9-47df-b4e3-cd6fcc392333/7D7105C52FCD6766BEEE1AE162AA81E278686122C1E44890712326634D0B055E/VC_redist.x64.exe';
+       vcredist2017_url = 
'http://download.visualstudio.microsoft.com/download/pr/9fe82b83-f3a1-43f5-8f25-ebe24529854c/B4D433E2F66B30B478C0D080CCD5217CA2A963C16E90CAF10B1E0592B7D8D519/VC_redist.x86.exe';
+       vcredist2017_url_x64 = 
'http://download.visualstudio.microsoft.com/download/pr/fd5d2eea-32b8-4814-b55e-28c83dd72d9c/952A0C6CB4A3DD14C3666EF05BB1982C5FF7F87B7103C2BA896354F00651E358/VC_redist.x64.exe';
 
        vcredist2017_upgradecode = '{65E5BD06-6392-3027-8C26-853107D3CF1A}';
        vcredist2017_upgradecode_x64 = '{36F68A90-239C-34DF-B58C-64B30153CE35}';
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/installer/setup.py 
new/vapoursynth-R52/installer/setup.py
--- old/vapoursynth-R51/installer/setup.py      2020-07-29 11:18:16.000000000 
+0200
+++ new/vapoursynth-R52/installer/setup.py      2020-08-20 19:48:22.000000000 
+0200
@@ -1,4 +1,4 @@
-CURRENT_RELEASE = "51"
+CURRENT_RELEASE = "52"
 
 # Always prefer setuptools over distutils
 from setuptools import setup, find_packages
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/installer/vsinstaller.iss 
new/vapoursynth-R52/installer/vsinstaller.iss
--- old/vapoursynth-R51/installer/vsinstaller.iss       2020-07-29 
11:18:16.000000000 +0200
+++ new/vapoursynth-R52/installer/vsinstaller.iss       2020-08-20 
19:48:22.000000000 +0200
@@ -1,4 +1,4 @@
-#define Version '51'
+#define Version '52'
 #define VersionExtra ''
 #define PythonVersion '3.8'
 #define PythonCompactVersion '38'
@@ -96,6 +96,8 @@
 
 ;vsrepo
 Source: ..\vsrepo\vsrepo.py; DestDir: {app}\vsrepo; Flags: ignoreversion 
uninsrestartdelete restartreplace; Components: vsrepo
+Source: ..\vsrepo\vsgenstubs\__init__.py; DestDir: {app}\vsrepo\vsgenstubs; 
Flags: ignoreversion uninsrestartdelete restartreplace; Components: vsrepo
+Source: ..\vsrepo\vsgenstubs\_vapoursynth.part.pyi; DestDir: 
{app}\vsrepo\vsgenstubs; Flags: ignoreversion uninsrestartdelete 
restartreplace; Components: vsrepo
 Source: 7z.exe; DestDir: {app}\vsrepo; Flags: ignoreversion uninsrestartdelete 
restartreplace; Components: vsrepo
 Source: 7z.dll; DestDir: {app}\vsrepo; Flags: ignoreversion uninsrestartdelete 
restartreplace; Components: vsrepo
 
@@ -187,7 +189,7 @@
 
 [Code]
 
-const VSRuntimeVersion = '14.26.28720';
+const VSRuntimeVersion = '14.27.29016';
 
 type
   TPythonPath = record
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/setup.py new/vapoursynth-R52/setup.py
--- old/vapoursynth-R51/setup.py        2020-07-29 11:18:16.000000000 +0200
+++ new/vapoursynth-R52/setup.py        2020-08-20 19:48:22.000000000 +0200
@@ -85,7 +85,7 @@
     author = "Fredrik Mellbin",
     author_email = "fredrik.mell...@gmail.com",
     license = "LGPL 2.1 or later",
-    version = "51",
+    version = "52",
     long_description = "A portable replacement for Avisynth",
     platforms = "All",
     ext_modules = [Extension("vapoursynth", [join("src", "cython", 
"vapoursynth.pyx")],
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/src/core/kernel/x86/generic_avx2.cpp 
new/vapoursynth-R52/src/core/kernel/x86/generic_avx2.cpp
--- old/vapoursynth-R51/src/core/kernel/x86/generic_avx2.cpp    2020-07-29 
11:18:16.000000000 +0200
+++ new/vapoursynth-R52/src/core/kernel/x86/generic_avx2.cpp    2020-08-20 
19:48:22.000000000 +0200
@@ -1152,7 +1152,7 @@
         filter_plane_3x3<MinMaxFixedWord<STENCIL_ALL, true>>(src, src_stride, 
dst, dst_stride, *params, width, height);
         break;
     default:
-        filter_plane_3x3<MinMaxByte<true>>(src, src_stride, dst, dst_stride, 
*params, width, height);
+        filter_plane_3x3<MinMaxWord<true>>(src, src_stride, dst, dst_stride, 
*params, width, height);
         break;
     }
 }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/src/core/kernel/x86/generic_sse2.cpp 
new/vapoursynth-R52/src/core/kernel/x86/generic_sse2.cpp
--- old/vapoursynth-R51/src/core/kernel/x86/generic_sse2.cpp    2020-07-29 
11:18:16.000000000 +0200
+++ new/vapoursynth-R52/src/core/kernel/x86/generic_sse2.cpp    2020-08-20 
19:48:22.000000000 +0200
@@ -1178,7 +1178,7 @@
         filter_plane_3x3<MinMaxFixedWord<STENCIL_ALL, true>>(src, src_stride, 
dst, dst_stride, *params, width, height);
         break;
     default:
-        filter_plane_3x3<MinMaxByte<true>>(src, src_stride, dst, dst_stride, 
*params, width, height);
+        filter_plane_3x3<MinMaxWord<true>>(src, src_stride, dst, dst_stride, 
*params, width, height);
         break;
     }
 }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/src/core/version.h 
new/vapoursynth-R52/src/core/version.h
--- old/vapoursynth-R51/src/core/version.h      2020-07-29 11:18:16.000000000 
+0200
+++ new/vapoursynth-R52/src/core/version.h      2020-08-20 19:48:22.000000000 
+0200
@@ -34,7 +34,7 @@
 
 #define XSTR(x) STR(x)
 #define STR(x) #x
-#define VAPOURSYNTH_CORE_VERSION 51
+#define VAPOURSYNTH_CORE_VERSION 52
 #if defined(VS_FRAME_GUARD) && !defined(NDEBUG)
 #define VS_OPTIONS_TEXT "Options: Frame Guard + Extra Assertions\n"
 #elif defined(VS_FRAME_GUARD)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/src/core/vsthreadpool.cpp 
new/vapoursynth-R52/src/core/vsthreadpool.cpp
--- old/vapoursynth-R51/src/core/vsthreadpool.cpp       2020-07-29 
11:18:16.000000000 +0200
+++ new/vapoursynth-R52/src/core/vsthreadpool.cpp       2020-08-20 
19:48:22.000000000 +0200
@@ -77,9 +77,6 @@
 // Go through all tasks from the top (oldest) and process the first one 
possible
         owner->tasks.sort(taskCmp);
 
-        // fixme, test if this matters at all!
-        std::set<VSNode *> seenNodes;
-
         for (auto iter = owner->tasks.begin(); iter != owner->tasks.end(); 
++iter) {
             FrameContext *mainContext = iter->get();
             FrameContext *leafContext = nullptr;
@@ -108,9 +105,6 @@
                 mainContext = mainContext->upstreamContext.get();
             }
 
-            if (!seenNodes.insert(mainContext->clip).second)
-                continue;
-
             VSNode *clip = mainContext->clip;
             int filterMode = clip->filterMode;
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/src/cython/vapoursynth.pyx 
new/vapoursynth-R52/src/cython/vapoursynth.pyx
--- old/vapoursynth-R51/src/cython/vapoursynth.pyx      2020-07-29 
11:18:16.000000000 +0200
+++ new/vapoursynth-R52/src/cython/vapoursynth.pyx      2020-08-20 
19:48:22.000000000 +0200
@@ -70,7 +70,7 @@
   'core', 
 ]
     
-__version__ = namedtuple("VapourSynthVersion", "release_major 
release_minor")(51, 0)
+__version__ = namedtuple("VapourSynthVersion", "release_major 
release_minor")(52, 0)
 __api_version__ = namedtuple("VapourSynthAPIVersion", "api_major 
api_minor")(VAPOURSYNTH_API_MAJOR, VAPOURSYNTH_API_MINOR)
 
 @final
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/vapoursynth-R51/src/filters/misc/miscfilters.cpp 
new/vapoursynth-R52/src/filters/misc/miscfilters.cpp
--- old/vapoursynth-R51/src/filters/misc/miscfilters.cpp        2020-07-29 
11:18:16.000000000 +0200
+++ new/vapoursynth-R52/src/filters/misc/miscfilters.cpp        2020-08-20 
19:48:22.000000000 +0200
@@ -239,63 +239,111 @@
         weights[i / 2] = _mm_set1_epi32((static_cast<uint32_t>(weight_hi) << 
16) | weight_lo);
     }
     if (numSrcs % 2)
-        weights[numSrcs / 2 - 1] = 
_mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1]));
+        weights[numSrcs / 2] = 
_mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1]));
 
-    __m128i bias = _mm_setzero_si128();
     __m128 scale = _mm_set_ps1(1.0f / d->scale);
 
-    if ((plane == 1 || plane == 2) && (d->vi.format->colorFamily == cmYUV || 
d->vi.format->colorFamily == cmYCoCg))
-        bias = _mm_set1_epi8(128);
+    if ((plane == 1 || plane == 2) && (d->vi.format->colorFamily == cmYUV || 
d->vi.format->colorFamily == cmYCoCg)) {
+        __m128i bias = _mm_set1_epi8(128);
 
-    for (int h = 0; h < height; ++h) {
-        for (int w = 0; w < width; w += 16) {
-            __m128i accum_lolo = _mm_setzero_si128();
-            __m128i accum_lohi = _mm_setzero_si128();
-            __m128i accum_hilo = _mm_setzero_si128();
-            __m128i accum_hihi = _mm_setzero_si128();
-
-            for (size_t i = 0; i < numSrcs; i += 2) {
-                __m128i coeffs = weights[i / 2];
-                __m128i v1 = _mm_sub_epi8(_mm_load_si128((const __m128i 
*)(srcpp[i + 0] + w)), bias);
-                __m128i v2 = _mm_sub_epi8(_mm_load_si128((const __m128i 
*)(srcpp[i + 1] + w)), bias);
-                __m128i v1_sign = _mm_cmplt_epi8(v1, _mm_setzero_si128());
-                __m128i v2_sign = _mm_cmplt_epi8(v2, _mm_setzero_si128());
-
-                __m128i v1_lo = _mm_unpacklo_epi8(v1, v1_sign);
-                __m128i v1_hi = _mm_unpackhi_epi8(v1, v1_sign);
-                __m128i v2_lo = _mm_unpacklo_epi8(v2, v2_sign);
-                __m128i v2_hi = _mm_unpackhi_epi8(v2, v2_sign);
-
-                accum_lolo = _mm_add_epi32(accum_lolo, _mm_madd_epi16(coeffs, 
_mm_unpacklo_epi16(v1_lo, v2_lo)));
-                accum_lohi = _mm_add_epi32(accum_lohi, _mm_madd_epi16(coeffs, 
_mm_unpackhi_epi16(v1_lo, v2_lo)));
-                accum_hilo = _mm_add_epi32(accum_hilo, _mm_madd_epi16(coeffs, 
_mm_unpacklo_epi16(v1_hi, v2_hi)));
-                accum_hihi = _mm_add_epi32(accum_hihi, _mm_madd_epi16(coeffs, 
_mm_unpackhi_epi16(v1_hi, v2_hi)));
-            }
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; w += 16) {
+                __m128i accum_lolo = _mm_setzero_si128();
+                __m128i accum_lohi = _mm_setzero_si128();
+                __m128i accum_hilo = _mm_setzero_si128();
+                __m128i accum_hihi = _mm_setzero_si128();
 
-            __m128 accumf_lolo = _mm_cvtepi32_ps(accum_lolo);
-            __m128 accumf_lohi = _mm_cvtepi32_ps(accum_lohi);
-            __m128 accumf_hilo = _mm_cvtepi32_ps(accum_hilo);
-            __m128 accumf_hihi = _mm_cvtepi32_ps(accum_hihi);
-            accumf_lolo = _mm_mul_ps(accumf_lolo, scale);
-            accumf_lohi = _mm_mul_ps(accumf_lohi, scale);
-            accumf_hilo = _mm_mul_ps(accumf_hilo, scale);
-            accumf_hihi = _mm_mul_ps(accumf_hihi, scale);
-
-            accum_lolo = _mm_cvtps_epi32(accumf_lolo);
-            accum_lohi = _mm_cvtps_epi32(accumf_lohi);
-            accum_hilo = _mm_cvtps_epi32(accumf_hilo);
-            accum_hihi = _mm_cvtps_epi32(accumf_hihi);
-
-            accum_lolo = _mm_packs_epi32(accum_lolo, accum_lohi);
-            accum_hilo = _mm_packs_epi32(accum_hilo, accum_hihi);
-            accum_lolo = _mm_packs_epi16(accum_lolo, accum_hilo);
+                for (size_t i = 0; i < numSrcs; i += 2) {
+                    __m128i coeffs = weights[i / 2];
+                    __m128i v1 = _mm_sub_epi8(_mm_load_si128((const __m128i 
*)(srcpp[i + 0] + w)), bias);
+                    __m128i v2 = _mm_sub_epi8(_mm_load_si128((const __m128i 
*)(srcpp[i + 1] + w)), bias);
+                    __m128i v1_sign = _mm_cmplt_epi8(v1, _mm_setzero_si128());
+                    __m128i v2_sign = _mm_cmplt_epi8(v2, _mm_setzero_si128());
+
+                    __m128i v1_lo = _mm_unpacklo_epi8(v1, v1_sign);
+                    __m128i v1_hi = _mm_unpackhi_epi8(v1, v1_sign);
+                    __m128i v2_lo = _mm_unpacklo_epi8(v2, v2_sign);
+                    __m128i v2_hi = _mm_unpackhi_epi8(v2, v2_sign);
+
+                    accum_lolo = _mm_add_epi32(accum_lolo, 
_mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_lo, v2_lo)));
+                    accum_lohi = _mm_add_epi32(accum_lohi, 
_mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_lo, v2_lo)));
+                    accum_hilo = _mm_add_epi32(accum_hilo, 
_mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_hi, v2_hi)));
+                    accum_hihi = _mm_add_epi32(accum_hihi, 
_mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_hi, v2_hi)));
+                }
 
-            accum_lolo = _mm_add_epi8(accum_lolo, bias);
-            _mm_store_si128((__m128i *)(dstp + w), accum_lolo);
+                __m128 accumf_lolo = _mm_cvtepi32_ps(accum_lolo);
+                __m128 accumf_lohi = _mm_cvtepi32_ps(accum_lohi);
+                __m128 accumf_hilo = _mm_cvtepi32_ps(accum_hilo);
+                __m128 accumf_hihi = _mm_cvtepi32_ps(accum_hihi);
+                accumf_lolo = _mm_mul_ps(accumf_lolo, scale);
+                accumf_lohi = _mm_mul_ps(accumf_lohi, scale);
+                accumf_hilo = _mm_mul_ps(accumf_hilo, scale);
+                accumf_hihi = _mm_mul_ps(accumf_hihi, scale);
+
+                accum_lolo = _mm_cvtps_epi32(accumf_lolo);
+                accum_lohi = _mm_cvtps_epi32(accumf_lohi);
+                accum_hilo = _mm_cvtps_epi32(accumf_hilo);
+                accum_hihi = _mm_cvtps_epi32(accumf_hihi);
+
+                accum_lolo = _mm_packs_epi32(accum_lolo, accum_lohi);
+                accum_hilo = _mm_packs_epi32(accum_hilo, accum_hihi);
+                accum_lolo = _mm_packs_epi16(accum_lolo, accum_hilo);
+
+                accum_lolo = _mm_add_epi8(accum_lolo, bias);
+                _mm_store_si128((__m128i *)(dstp + w), accum_lolo);
+            }
+
+            std::transform(srcpp, srcpp + numSrcs, srcpp, [=](const uint8_t 
*ptr) { return ptr + stride; });
+            dstp += stride;
         }
+    } else {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; w += 16) {
+                __m128i accum_lolo = _mm_setzero_si128();
+                __m128i accum_lohi = _mm_setzero_si128();
+                __m128i accum_hilo = _mm_setzero_si128();
+                __m128i accum_hihi = _mm_setzero_si128();
+
+                for (size_t i = 0; i < numSrcs; i += 2) {
+                    __m128i coeffs = weights[i / 2];
+                    __m128i v1 = _mm_load_si128((const __m128i *)(srcpp[i + 0] 
+ w));
+                    __m128i v2 = _mm_load_si128((const __m128i *)(srcpp[i + 1] 
+ w));
+
+                    __m128i v1_lo = _mm_unpacklo_epi8(v1, _mm_setzero_si128());
+                    __m128i v1_hi = _mm_unpackhi_epi8(v1, _mm_setzero_si128());
+                    __m128i v2_lo = _mm_unpacklo_epi8(v2, _mm_setzero_si128());
+                    __m128i v2_hi = _mm_unpackhi_epi8(v2, _mm_setzero_si128());
+
+                    accum_lolo = _mm_add_epi32(accum_lolo, 
_mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_lo, v2_lo)));
+                    accum_lohi = _mm_add_epi32(accum_lohi, 
_mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_lo, v2_lo)));
+                    accum_hilo = _mm_add_epi32(accum_hilo, 
_mm_madd_epi16(coeffs, _mm_unpacklo_epi16(v1_hi, v2_hi)));
+                    accum_hihi = _mm_add_epi32(accum_hihi, 
_mm_madd_epi16(coeffs, _mm_unpackhi_epi16(v1_hi, v2_hi)));
+                }
 
-        std::transform(srcpp, srcpp + numSrcs, srcpp, [=](const uint8_t *ptr) 
{ return ptr + stride; });
-        dstp += stride;
+                __m128 accumf_lolo = _mm_cvtepi32_ps(accum_lolo);
+                __m128 accumf_lohi = _mm_cvtepi32_ps(accum_lohi);
+                __m128 accumf_hilo = _mm_cvtepi32_ps(accum_hilo);
+                __m128 accumf_hihi = _mm_cvtepi32_ps(accum_hihi);
+                accumf_lolo = _mm_mul_ps(accumf_lolo, scale);
+                accumf_lohi = _mm_mul_ps(accumf_lohi, scale);
+                accumf_hilo = _mm_mul_ps(accumf_hilo, scale);
+                accumf_hihi = _mm_mul_ps(accumf_hihi, scale);
+
+                accum_lolo = _mm_cvtps_epi32(accumf_lolo);
+                accum_lohi = _mm_cvtps_epi32(accumf_lohi);
+                accum_hilo = _mm_cvtps_epi32(accumf_hilo);
+                accum_hihi = _mm_cvtps_epi32(accumf_hihi);
+
+                accum_lolo = _mm_packs_epi32(accum_lolo, accum_lohi);
+                accum_hilo = _mm_packs_epi32(accum_hilo, accum_hihi);
+                accum_lolo = _mm_packus_epi16(accum_lolo, accum_hilo);
+
+                _mm_store_si128((__m128i *)(dstp + w), accum_lolo);
+            }
+
+            std::transform(srcpp, srcpp + numSrcs, srcpp, [=](const uint8_t 
*ptr) { return ptr + stride; });
+            dstp += stride;
+        }
     }
 }
 
@@ -318,13 +366,13 @@
     __m128i weights[16];
     __m128 scale = _mm_set_ps1(1.0f / d->scale);
 
-    for (size_t i = 0; i < numSrcs; i += 2) {
+    for (size_t i = 0; i < (numSrcs & ~1); i += 2) {
         uint16_t weight_lo = static_cast<int16_t>(d->weights[i]);
         uint16_t weight_hi = static_cast<int16_t>(d->weights[i + 1]);
         weights[i / 2] = _mm_set1_epi32((static_cast<uint32_t>(weight_hi) << 
16) | weight_lo);
     }
     if (numSrcs % 2)
-        weights[numSrcs / 2 - 1] = 
_mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1]));
+        weights[numSrcs / 2] = 
_mm_set1_epi32(static_cast<uint16_t>(d->weights[numSrcs - 1]));
 
     if ((plane == 1 || plane == 2) && (d->vi.format->colorFamily == cmYUV || 
d->vi.format->colorFamily == cmYCoCg)) {
         __m128i bias = _mm_set1_epi16(1U << (d->vi.format->bitsPerSample - 1));
@@ -368,7 +416,7 @@
         __m128i accumbias = _mm_setzero_si128();
         __m128i maxVal = _mm_add_epi16(_mm_set1_epi16((1U << 
d->vi.format->bitsPerSample) - 1), _mm_set1_epi16(INT16_MIN));
 
-        for (size_t i = 0; i < numSrcs / 2; ++i) {
+        for (size_t i = 0; i < (numSrcs + 1) / 2; ++i) {
             accumbias = _mm_add_epi32(accumbias, 
_mm_madd_epi16(_mm_set1_epi16(INT16_MIN), weights[i]));
         }
 


Reply via email to