Modified: trunk/Source/WebCore/platform/audio/VectorMath.cpp (101893 => 101894)
--- trunk/Source/WebCore/platform/audio/VectorMath.cpp 2011-12-03 02:34:47 UTC (rev 101893)
+++ trunk/Source/WebCore/platform/audio/VectorMath.cpp 2011-12-03 02:43:43 UTC (rev 101894)
@@ -32,6 +32,10 @@
#include <Accelerate/Accelerate.h>
#endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
namespace WebCore {
namespace VectorMath {
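The new <emmintrin.h> include is guarded by __SSE2__, which GCC and Clang predefine whenever SSE2 code generation is enabled: always on x86-64, and with -msse2 on 32-bit x86. The float intrinsics used below (_mm_set_ps1, _mm_mul_ps, _mm_add_ps) are actually SSE1 intrinsics declared in <xmmintrin.h>, which <emmintrin.h> pulls in transitively. A minimal stand-alone sketch of how the guard behaves; the file is illustrative, not part of this patch:

// sse2_guard_check.cpp: hypothetical stand-alone file, not part of the patch.
// Build with g++ -msse2 sse2_guard_check.cpp, or any x86-64 compile where
// __SSE2__ is predefined.
#ifdef __SSE2__
#include <emmintrin.h> // Also provides the SSE1 float intrinsics via <xmmintrin.h>.
#endif
#include <cstdio>

int main()
{
#ifdef __SSE2__
    __m128 a = _mm_set_ps1(2.0f); // Broadcast 2.0f into all four lanes.
    __m128 b = _mm_mul_ps(a, a);  // 4.0f in each lane.
    float out[4];
    _mm_storeu_ps(out, b);        // Unaligned store is always safe here.
    std::printf("SSE2 path: %g\n", out[0]);
#else
    std::printf("scalar fallback only\n");
#endif
    return 0;
}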
@@ -39,7 +43,7 @@
#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
-// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
+// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
@@ -63,7 +67,58 @@
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
- // FIXME: optimize for SSE
+#ifdef __SSE2__
+    if ((sourceStride == 1) && (destStride == 1)) {
+
+        int n = framesToProcess;
+        float k = *scale;
+
+        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) must be processed separately.
+        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
+            *destP = k * *sourceP;
+            sourceP++;
+            destP++;
+            n--;
+        }
+
+        // Now the sourceP address is 16-byte aligned, so SSE can be applied.
+        int group = n / 4;
+        __m128 mScale = _mm_set_ps1(k);
+        __m128* pSource;
+        __m128* pDest;
+        __m128 dest;
+
+        if (reinterpret_cast<size_t>(destP) & 0x0F) {
+            while (group--) {
+                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
+                dest = _mm_mul_ps(*pSource, mScale);
+                _mm_storeu_ps(destP, dest);
+
+                sourceP += 4;
+                destP += 4;
+            }
+        } else {
+            while (group--) {
+                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_mul_ps(*pSource, mScale);
+
+                sourceP += 4;
+                destP += 4;
+            }
+        }
+
+        // Non-SSE handling for the remaining frames (fewer than four).
+        n %= 4;
+        while (n) {
+            *destP = k * *sourceP;
+            sourceP++;
+            destP++;
+            n--;
+        }
+    } else { // If either stride is not 1, fall back to the scalar algorithm.
+#endif
int n = framesToProcess;
float k = *scale;
while (n--) {
@@ -71,11 +126,97 @@
sourceP += sourceStride;
destP += destStride;
}
+#ifdef __SSE2__
+ }
+#endif
}
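The vsmul path above follows a common three-phase SIMD pattern: a scalar prologue that advances sourceP to a 16-byte boundary, a vectorized body that handles four frames per iteration (with an aligned or unaligned store chosen by destP's alignment), and a scalar tail for the remaining n % 4 frames. A condensed, self-contained sketch of the same pattern; simdScale is an illustrative name, not part of the patch:

// Hypothetical illustration of the prologue/body/tail pattern used by vsmul.
#include <emmintrin.h>
#include <cstddef>

static void simdScale(const float* sourceP, float* destP, float k, size_t framesToProcess)
{
    size_t n = framesToProcess;

    // Prologue: scalar multiplies until sourceP reaches a 16-byte boundary.
    while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
        *destP++ = k * *sourceP++;
        n--;
    }

    // Body: four frames per iteration. The source load is aligned by
    // construction; the destination may not be, so for brevity this sketch
    // always uses the unaligned store.
    __m128 mScale = _mm_set_ps1(k);
    for (size_t group = n / 4; group; group--) {
        __m128 source = _mm_load_ps(sourceP);             // Aligned load.
        _mm_storeu_ps(destP, _mm_mul_ps(source, mScale)); // Possibly unaligned store.
        sourceP += 4;
        destP += 4;
    }

    // Tail: at most three leftover frames, handled scalar.
    for (n %= 4; n; n--)
        *destP++ = k * *sourceP++;
}

Aligning on the source rather than the destination lets the body use an aligned load unconditionally; the patch goes one step further and also branches to an aligned store when destP happens to be 16-byte aligned.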
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
- // FIXME: optimize for SSE
+#ifdef __SSE2__
+    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+
+        int n = framesToProcess;
+
+        // If the source1P address is not 16-byte aligned, the first several frames (at most three) must be processed separately.
+        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
+            *destP = *source1P + *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+
+        // Now the source1P address is 16-byte aligned, so SSE can be applied.
+        int group = n / 4;
+        __m128* pSource1;
+        __m128* pSource2;
+        __m128* pDest;
+        __m128 source2;
+        __m128 dest;
+
+        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
+        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);
+
+        if (source2Aligned && destAligned) { // Both source2 and dest aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_add_ps(*pSource1, *pSource2);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
+                dest = _mm_add_ps(*pSource1, *pSource2);
+                _mm_storeu_ps(destP, dest);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                source2 = _mm_loadu_ps(source2P);
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_add_ps(*pSource1, source2);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest is aligned.
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                source2 = _mm_loadu_ps(source2P);
+                dest = _mm_add_ps(*pSource1, source2);
+                _mm_storeu_ps(destP, dest);
+
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        }
+
+        // Non-SSE handling for the remaining frames (fewer than four).
+        n %= 4;
+        while (n) {
+            *destP = *source1P + *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+    } else { // If any stride is not 1, fall back to the scalar algorithm.
+#endif
int n = framesToProcess;
while (n--) {
*destP = *source1P + *source2P;
@@ -83,6 +224,9 @@
source2P += sourceStride2;
destP += destStride;
}
+#ifdef __SSE2__
+ }
+#endif
}
#endif // OS(DARWIN)
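Because vadd now dispatches on the run-time alignment of source2P and destP, a quick way to exercise all four branches is to call it with buffers offset by one float (4 bytes) from a 16-byte boundary. A hedged test sketch; checkVadd, the buffer sizes, and the C++11 alignas are illustrative, and it assumes WebCore::VectorMath::vadd is linkable as declared above:

// Hypothetical harness exercising the four alignment branches of vadd.
#include <cassert>
#include <cstddef>

namespace WebCore { namespace VectorMath {
void vadd(const float*, int, const float*, int, float*, int, size_t);
} }

static void checkVadd(size_t offset2, size_t offsetDest)
{
    // 16-byte aligned backing stores; an offset of one float forces misalignment.
    alignas(16) float a[68], b[68], d[68];
    const size_t frames = 64;
    for (size_t i = 0; i < frames + 4; ++i) {
        a[i] = static_cast<float>(i);
        b[i] = 1.0f;
        d[i] = 0.0f;
    }
    WebCore::VectorMath::vadd(a, 1, b + offset2, 1, d + offsetDest, 1, frames);
    for (size_t i = 0; i < frames; ++i)
        assert(d[offsetDest + i] == a[i] + 1.0f);
}

int main()
{
    checkVadd(0, 0); // source2 and dest both aligned
    checkVadd(0, 1); // dest misaligned
    checkVadd(1, 0); // source2 misaligned
    checkVadd(1, 1); // both misaligned
    return 0;
}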