Skip to site navigation (Press enter)

[webkit-changes] [138640] trunk/Source/WebCore

benjamin Wed, 02 Jan 2013 13:18:14 -0800

Title: [138640] trunk/Source/WebCore

Revision: 138640
Author: benja...@webkit.org
Date: 2013-01-02 13:19:57 -0800 (Wed, 02 Jan 2013)

Log Message

Optimize TransformationMatrix::multiply() for x86_64
https://bugs.webkit.org/show_bug.cgi?id=105719


Reviewed by Sam Weinig.

On x86_64, we have access to 16 XMM registers. Together they can hold 32 double values.
We can use that in two ways to optimize matrix multiplications:
-Keep the source matrix completely in registers. Write the result directly in
 the source matrix's memory. This avoids the memcpy at the end of the multiplication
 and various memory operations.
-Use SIMD with SSE to perform 2 operations at a time.

The parameters from the second matrix are loaded one by one into XMM registers.
Loading them with SSE and then shuffling the values performs worse than loading
them one by one.

This is only enabled on 64-bit builds, as 32-bit x86 only has access to 8 XMM registers and
the function would need to be written differently for it.

On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.

* platform/graphics/transforms/TransformationMatrix.cpp:
(WebCore::TransformationMatrix::multiply):
* platform/graphics/transforms/TransformationMatrix.h:
(TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.

Modified Paths

trunk/Source/WebCore/ChangeLog
trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h

Diff

Modified: trunk/Source/WebCore/ChangeLog (138639 => 138640)


--- trunk/Source/WebCore/ChangeLog	2013-01-02 21:13:56 UTC (rev 138639)
+++ trunk/Source/WebCore/ChangeLog	2013-01-02 21:19:57 UTC (rev 138640)
@@ -1,3 +1,31 @@
+2013-01-02  Benjamin Poulain  <benja...@webkit.org>
+
+        Optimize TransformationMatrix::multiply() for x86_64
+        https://bugs.webkit.org/show_bug.cgi?id=105719
+
+        Reviewed by Sam Weinig.
+
+        On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
+        We can use that in two ways to optimize matrix multiplications:
+        -Keep the source matrix completely in registers. Write the result directly in
+         the source matrix's memory. This avoids the memcpy at the end of the multiplication
+         and various memory operations.
+        -Use SIMD with SSE to perform 2 operations at a time.
+
+        The parameter from the second matrix are loaded one by one in XMM registers.
+        Loading them with SSE then shuffling the values perform worse than loading
+        one by one.
+
+        This is only enabled on 64bits as x86 only has access to 8 XMM registers and
+        the function should be written differently.
+
+        On a i5, TransformationMatrix::multiply() perform about 3 times faster with the change.
+
+        * platform/graphics/transforms/TransformationMatrix.cpp:
+        (WebCore::TransformationMatrix::multiply):
+        * platform/graphics/transforms/TransformationMatrix.h:
+        (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
+
 2013-01-02  Mike West  <mk...@chromium.org>
 
         Clean up the loadXXXStyle() idiom in StyleResolver.

Modified: trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp (138639 => 138640)


--- trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp	2013-01-02 21:13:56 UTC (rev 138639)
+++ trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp	2013-01-02 21:19:57 UTC (rev 138640)
@@ -36,6 +36,10 @@
 #include <wtf/Assertions.h>
 #include <wtf/MathExtras.h>
 
+#if CPU(X86_64)
+#include <emmintrin.h>
+#endif
+
 using namespace std;
 
 namespace WebCore {
@@ -968,9 +972,7 @@
                                 to.y() - from.y());
 }
 
-//
-// *this = mat * *this
-//
+// this = mat * this.
 TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
 {
 #if CPU(APPLE_ARMV7S)
@@ -1115,6 +1117,130 @@
     }
 #undef MATRIX_MULTIPLY_ONE_LINE
 
+#elif CPU(X86_64)
+    // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
+    __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
+    __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
+    __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
+    __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
+
+    // FIXME: move futher down before it is actually used.
+    __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
+    __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
+    __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
+    __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
+
+    // First row.
+    __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
+    __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
+    __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
+    __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
+
+    // output00 and output01.
+    __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[0][0], accumulator);
+
+    // output02 and output03.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[0][2], accumulator);
+
+    // Second row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
+
+    // output10 and output11.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[1][0], accumulator);
+
+    // output12 and output13.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[1][2], accumulator);
+
+    // Third row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
+
+    // output20 and output21.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[2][0], accumulator);
+
+    // output22 and output23.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[2][2], accumulator);
+
+    // Fourth row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
+
+    // output30 and output31.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[3][0], accumulator);
+
+    // output32 and output33.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[3][2], accumulator);
 #else
     Matrix4 tmp;

Modified: trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h (138639 => 138640)


--- trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h	2013-01-02 21:13:56 UTC (rev 138639)
+++ trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h	2013-01-02 21:19:57 UTC (rev 138640)
@@ -72,7 +72,7 @@
 class TransformationMatrix {
     WTF_MAKE_FAST_ALLOCATED;
 public:
-#if CPU(APPLE_ARMV7S)
+#if CPU(APPLE_ARMV7S) || CPU(X86_64)
     typedef double Matrix4[4][4] __attribute__((aligned (16)));
 #else
     typedef double Matrix4[4][4];
@@ -226,7 +226,7 @@
     double f() const { return m_matrix[3][1]; }
     void setF(double f) { m_matrix[3][1] = f; }
 
-    // this = this * mat
+    // this = mat * this.
     TransformationMatrix& multiply(const TransformationMatrix&);
 
     TransformationMatrix& scale(double);

_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo/webkit-changes