Title: [102312] trunk/Source/WebCore
Revision
102312
Author
[email protected]
Date
2011-12-07 21:23:42 -0800 (Wed, 07 Dec 2011)

Log Message

Implement the SSE optimization in SincResampler::process()
https://bugs.webkit.org/show_bug.cgi?id=73789

Patch by Xingnan Wang <[email protected]> on 2011-12-07
Reviewed by Benjamin Poulain.

This yields about a 70% performance improvement on the sample-convolution hot spot.

* platform/audio/SincResampler.cpp:

Modified Paths

Diff

Modified: trunk/Source/WebCore/ChangeLog (102311 => 102312)


--- trunk/Source/WebCore/ChangeLog	2011-12-08 05:08:32 UTC (rev 102311)
+++ trunk/Source/WebCore/ChangeLog	2011-12-08 05:23:42 UTC (rev 102312)
@@ -1,3 +1,14 @@
+2011-12-07  Xingnan Wang  <[email protected]>
+
+        Implement the SSE optimization in SincResampler::process()
+        https://bugs.webkit.org/show_bug.cgi?id=73789
+
+        Reviewed by Benjamin Poulain.
+
+        This yields about a 70% performance improvement on the sample-convolution hot spot.
+
+        * platform/audio/SincResampler.cpp:
+
 2011-12-07  Luke Macpherson   <[email protected]>
 
         Implement border image source properties in CSSStyleApplyProperty.

Modified: trunk/Source/WebCore/platform/audio/SincResampler.cpp (102311 => 102312)


--- trunk/Source/WebCore/platform/audio/SincResampler.cpp	2011-12-08 05:08:32 UTC (rev 102311)
+++ trunk/Source/WebCore/platform/audio/SincResampler.cpp	2011-12-08 05:23:42 UTC (rev 102312)
@@ -35,6 +35,10 @@
 #include "AudioBus.h"
 #include <wtf/MathExtras.h>
 
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
 using namespace std;
 
 // Input buffer layout, dividing the total buffer into regions (r0 - r5):
@@ -246,8 +250,6 @@
             // Generate a single output sample. 
             int n = m_kernelSize;
 
-            // FIXME: add SIMD optimizations for the following. The scalar code-path can probably also be optimized better.
-
 #define CONVOLVE_ONE_SAMPLE      \
             input = *inputP++;   \
             sum1 += input * *k1; \
@@ -257,6 +259,76 @@
 
             {
                 float input;
+
+#ifdef __SSE2__
+                // If the inputP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
+                while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
+                    CONVOLVE_ONE_SAMPLE
+                    n--;
+                }
+
+                // Now inputP is aligned, so start applying SSE.
+                float* endP = inputP + n - n % 4;
+                __m128 mInput;
+                __m128 mK1;
+                __m128 mK2;
+                __m128 mul1;
+                __m128 mul2;
+
+                __m128 sums1 = _mm_setzero_ps();
+                __m128 sums2 = _mm_setzero_ps();
+                bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
+                bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
+#define LOAD_DATA(l1, l2)                        \
+                mInput = _mm_load_ps(inputP);    \
+                mK1 = _mm_##l1##_ps(k1);         \
+                mK2 = _mm_##l2##_ps(k2);
+
+#define CONVOLVE_4_SAMPLES                       \
+                mul1 = _mm_mul_ps(mInput, mK1);  \
+                mul2 = _mm_mul_ps(mInput, mK2);  \
+                sums1 = _mm_add_ps(sums1, mul1); \
+                sums2 = _mm_add_ps(sums2, mul2); \
+                inputP += 4;                     \
+                k1 += 4;                         \
+                k2 += 4;
+
+                if (k1Aligned && k2Aligned) { // both aligned
+                    while (inputP < endP) {
+                        LOAD_DATA(load, load)
+                        CONVOLVE_4_SAMPLES
+                    }
+                } else if (!k1Aligned && k2Aligned) { // only k2 aligned
+                    while (inputP < endP) {
+                        LOAD_DATA(loadu, load)
+                        CONVOLVE_4_SAMPLES
+                    }
+                } else if (k1Aligned && !k2Aligned) { // only k1 aligned
+                    while (inputP < endP) {
+                        LOAD_DATA(load, loadu)
+                        CONVOLVE_4_SAMPLES
+                    }
+                } else { // both non-aligned
+                    while (inputP < endP) {
+                        LOAD_DATA(loadu, loadu)
+                        CONVOLVE_4_SAMPLES
+                    }
+                }
+
+                // Sum the SSE results into sum1 and sum2.
+                float* groupSumP = reinterpret_cast<float*>(&sums1);
+                sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
+                groupSumP = reinterpret_cast<float*>(&sums2);
+                sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
+
+                n %= 4;
+                while (n) {
+                    CONVOLVE_ONE_SAMPLE
+                    n--;
+                }
+#else
+                // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
                 
                 // Optimize size 32 and size 64 kernels by unrolling the while loop.
                 // A 20 - 30% speed improvement was measured in some cases by using this approach.
@@ -365,6 +437,7 @@
                         CONVOLVE_ONE_SAMPLE
                     }
                 }
+#endif
             }
 
             // Linearly interpolate the two "convolutions".
_______________________________________________
webkit-changes mailing list
[email protected]
http://lists.webkit.org/mailman/listinfo.cgi/webkit-changes

Reply via email to