Modified: trunk/Source/WebCore/platform/audio/VectorMath.cpp (101893 => 101894)
--- trunk/Source/WebCore/platform/audio/VectorMath.cpp 2011-12-03 02:34:47 UTC (rev 101893)
+++ trunk/Source/WebCore/platform/audio/VectorMath.cpp 2011-12-03 02:43:43 UTC (rev 101894)
@@ -32,6 +32,10 @@
#include <Accelerate/Accelerate.h>
#endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
namespace WebCore {
namespace VectorMath {
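The new <emmintrin.h> include is guarded by __SSE2__, which GCC and Clang predefine whenever SSE2 code generation is enabled: always on x86-64, and with -msse2 on 32-bit x86. The float intrinsics used below (_mm_set_ps1, _mm_mul_ps, _mm_add_ps) are actually SSE1 intrinsics declared in <xmmintrin.h>, which <emmintrin.h> pulls in transitively. A minimal stand-alone sketch of how the guard behaves; the file is illustrative, not part of this patch:

// sse2_guard_check.cpp: hypothetical stand-alone file, not part of the patch.
// Build with g++ -msse2 sse2_guard_check.cpp, or any x86-64 compile where
// __SSE2__ is predefined.
#ifdef __SSE2__
#include <emmintrin.h> // Also provides the SSE1 float intrinsics via <xmmintrin.h>.
#endif
#include <cstdio>

int main()
{
#ifdef __SSE2__
    __m128 a = _mm_set_ps1(2.0f); // Broadcast 2.0f into all four lanes.
    __m128 b = _mm_mul_ps(a, a);  // 4.0f in each lane.
    float out[4];
    _mm_storeu_ps(out, b);        // Unaligned store is always safe here.
    std::printf("SSE2 path: %g\n", out[0]);
#else
    std::printf("scalar fallback only\n");
#endif
    return 0;
}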
@@ -39,7 +43,7 @@
#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
-// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
+// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
@@ -63,7 +67,58 @@
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
- // FIXME: optimize for SSE
+#ifdef __SSE2__
+    if ((sourceStride == 1) && (destStride == 1)) {
+
+        int n = framesToProcess;
+        float k = *scale;
+
+        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) must be processed separately.
+        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
+            *destP = k * *sourceP;
+            sourceP++;
+            destP++;
+            n--;
+        }
+
+        // Now the sourceP address is 16-byte aligned, so SSE can be applied.
+        int group = n / 4;
+        __m128 mScale = _mm_set_ps1(k);
+        __m128* pSource;
+        __m128* pDest;
+        __m128 dest;
+
+        if (reinterpret_cast<size_t>(destP) & 0x0F) {
+            while (group--) {
+                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
+                dest = _mm_mul_ps(*pSource, mScale);
+                _mm_storeu_ps(destP, dest);
+
+                sourceP += 4;
+                destP += 4;
+            }
+        } else {
+            while (group--) {
+                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_mul_ps(*pSource, mScale);
+
+                sourceP += 4;
+                destP += 4;
+            }
+        }
+
+        // Non-SSE handling for the remaining frames (fewer than four).
+        n %= 4;
+        while (n) {
+            *destP = k * *sourceP;
+            sourceP++;
+            destP++;
+            n--;
+        }
+    } else { // If either stride is not 1, fall back to the scalar algorithm.
+#endif
int n = framesToProcess;
float k = *scale;
while (n--) {
@@ -71,11 +126,97 @@
sourceP += sourceStride;
destP += destStride;
}
+#ifdef __SSE2__
+ }
+#endif
}
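The vsmul path above follows a common three-phase SIMD pattern: a scalar prologue that advances sourceP to a 16-byte boundary, a vectorized body that handles four frames per iteration (with an aligned or unaligned store chosen by destP's alignment), and a scalar tail for the remaining n % 4 frames. A condensed, self-contained sketch of the same pattern; simdScale is an illustrative name, not part of the patch:

// Hypothetical illustration of the prologue/body/tail pattern used by vsmul.
#include <emmintrin.h>
#include <cstddef>

static void simdScale(const float* sourceP, float* destP, float k, size_t framesToProcess)
{
    size_t n = framesToProcess;

    // Prologue: scalar multiplies until sourceP reaches a 16-byte boundary.
    while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
        *destP++ = k * *sourceP++;
        n--;
    }

    // Body: four frames per iteration. The source load is aligned by
    // construction; the destination may not be, so for brevity this sketch
    // always uses the unaligned store.
    __m128 mScale = _mm_set_ps1(k);
    for (size_t group = n / 4; group; group--) {
        __m128 source = _mm_load_ps(sourceP);             // Aligned load.
        _mm_storeu_ps(destP, _mm_mul_ps(source, mScale)); // Possibly unaligned store.
        sourceP += 4;
        destP += 4;
    }

    // Tail: at most three leftover frames, handled scalar.
    for (n %= 4; n; n--)
        *destP++ = k * *sourceP++;
}

Aligning on the source rather than the destination lets the body use an aligned load unconditionally; the patch goes one step further and also branches to an aligned store when destP happens to be 16-byte aligned.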
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
- // FIXME: optimize for SSE
+#ifdef __SSE2__
+    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+
+        int n = framesToProcess;
+
+        // If the source1P address is not 16-byte aligned, the first several frames (at most three) must be processed separately.
+        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
+            *destP = *source1P + *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+
+        // Now the source1P address is 16-byte aligned, so SSE can be applied.
+        int group = n / 4;
+        __m128* pSource1;
+        __m128* pSource2;
+        __m128* pDest;
+        __m128 source2;
+        __m128 dest;
+
+        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
+        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);
+
+        if (source2Aligned && destAligned) { // Both source2 and dest aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_add_ps(*pSource1, *pSource2);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
+                dest = _mm_add_ps(*pSource1, *pSource2);
+                _mm_storeu_ps(destP, dest);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                source2 = _mm_loadu_ps(source2P);
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_add_ps(*pSource1, source2);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest is aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                source2 = _mm_loadu_ps(source2P);
+                dest = _mm_add_ps(*pSource1, source2);
+                _mm_storeu_ps(destP, dest);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        }
+
+        // Non-SSE handling for the remaining frames (fewer than four).
+        n %= 4;
+        while (n) {
+            *destP = *source1P + *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+    } else { // If any stride is not 1, fall back to the scalar algorithm.
+#endif
int n = framesToProcess;
while (n--) {
*destP = *source1P + *source2P;
@@ -83,6 +224,9 @@
source2P += sourceStride2;
destP += destStride;
}
+#ifdef __SSE2__
+ }
+#endif
}
#endif // OS(DARWIN)
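Because vadd now dispatches on the run-time alignment of source2P and destP, a quick way to exercise all four branches is to call it with buffers offset by one float (4 bytes) from a 16-byte boundary. A hedged test sketch; checkVadd, the buffer sizes, and the C++11 alignas are illustrative, and it assumes WebCore::VectorMath::vadd is linkable as declared above:

// Hypothetical harness exercising the four alignment branches of vadd.
#include <cassert>
#include <cstddef>

namespace WebCore { namespace VectorMath {
void vadd(const float*, int, const float*, int, float*, int, size_t);
} }

static void checkVadd(size_t offset2, size_t offsetDest)
{
    // 16-byte aligned backing stores; an offset of one float forces misalignment.
    alignas(16) float a[68], b[68], d[68];
    const size_t frames = 64;
    for (size_t i = 0; i < frames + 4; ++i) {
        a[i] = static_cast<float>(i);
        b[i] = 1.0f;
        d[i] = 0.0f;
    }
    WebCore::VectorMath::vadd(a, 1, b + offset2, 1, d + offsetDest, 1, frames);
    for (size_t i = 0; i < frames; ++i)
        assert(d[offsetDest + i] == a[i] + 1.0f);
}

int main()
{
    checkVadd(0, 0); // source2 and dest both aligned
    checkVadd(0, 1); // dest misaligned
    checkVadd(1, 0); // source2 misaligned
    checkVadd(1, 1); // both misaligned
    return 0;
}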