Modified: trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp (138639 => 138640)
--- trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp 2013-01-02 21:13:56 UTC (rev 138639)
+++ trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp 2013-01-02 21:19:57 UTC (rev 138640)
@@ -36,6 +36,10 @@
#include <wtf/Assertions.h>
#include <wtf/MathExtras.h>
+#if CPU(X86_64)
+#include <emmintrin.h>
+#endif
+
using namespace std;
namespace WebCore {
@@ -968,9 +972,7 @@
to.y() - from.y());
}
-//
-// *this = mat * *this
-//
+// this = mat * this.
TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
{
#if CPU(APPLE_ARMV7S)
@@ -1115,6 +1117,130 @@
}
#undef MATRIX_MULTIPLY_ONE_LINE
+#elif CPU(X86_64)
+ // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
+ __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
+ __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
+ __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
+ __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
+
+ // FIXME: move these loads further down, to just before they are actually used.
+ __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
+ __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
+ __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
+ __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
+
+ // First row.
+ __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
+ __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
+ __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
+ __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
+
+ // output00 and output01.
+ __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[0][0], accumulator);
+
+ // output02 and output03.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[0][2], accumulator);
+
+ // Second row.
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
+
+ // output10 and output11.
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[1][0], accumulator);
+
+ // output12 and output13.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[1][2], accumulator);
+
+ // Third row.
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
+
+ // output20 and output21.
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[2][0], accumulator);
+
+ // output22 and output23.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[2][2], accumulator);
+
+ // Fourth row.
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
+
+ // output30 and output31.
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[3][0], accumulator);
+
+ // output32 and output33.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[3][2], accumulator);
#else
Matrix4 tmp;