Skip to site navigation (Press enter)

[webkit-changes] [123605] trunk/Source/WebCore

allan . jensen Wed, 25 Jul 2012 04:54:50 -0700

Title: [123605] trunk/Source/WebCore

Revision: 123605
Author: allan.jen...@nokia.com
Date: 2012-07-25 04:54:36 -0700 (Wed, 25 Jul 2012)

Log Message

Fix arithmetic composite filter for auto-vectorization
https://bugs.webkit.org/show_bug.cgi?id=92123


Reviewed by Nikolas Zimmermann.

Since only clamping of the result prevents GCC from auto-vectorizing the inner loop, this patch
adds a faster version of the inner loop to handle cases where clamping is unnecessary.

* platform/graphics/filters/FEComposite.cpp:
(WebCore::computeArithmeticPixelsUnclamped):
(WebCore::arithmeticSoftware):

Modified Paths

trunk/Source/WebCore/ChangeLog
trunk/Source/WebCore/platform/graphics/filters/FEComposite.cpp

Diff

Modified: trunk/Source/WebCore/ChangeLog (123604 => 123605)


--- trunk/Source/WebCore/ChangeLog	2012-07-25 11:46:56 UTC (rev 123604)
+++ trunk/Source/WebCore/ChangeLog	2012-07-25 11:54:36 UTC (rev 123605)
@@ -1,5 +1,19 @@
 2012-07-25  Allan Sandfeld Jensen  <allan.jen...@nokia.com>
 
+        Fix arithmetic composite filter for auto-vectorization
+        https://bugs.webkit.org/show_bug.cgi?id=92123
+
+        Reviewed by Nikolas Zimmermann.
+
+        Since only clamping of the result prevents GCC from auto-vectorizing the inner loop, this patch 
+        adds a faster version of the inner loop to handle cases where clamping is unnecessary.
+
+        * platform/graphics/filters/FEComposite.cpp:
+        (WebCore::computeArithmeticPixelsUnclamped):
+        (WebCore::arithmeticSoftware):
+
+2012-07-25  Allan Sandfeld Jensen  <allan.jen...@nokia.com>
+
         Fix blend filter for autovectorizing
         https://bugs.webkit.org/show_bug.cgi?id=91398

Modified: trunk/Source/WebCore/platform/graphics/filters/FEComposite.cpp (123604 => 123605)


--- trunk/Source/WebCore/platform/graphics/filters/FEComposite.cpp	2012-07-25 11:46:56 UTC (rev 123604)
+++ trunk/Source/WebCore/platform/graphics/filters/FEComposite.cpp	2012-07-25 11:54:36 UTC (rev 123605)
@@ -131,9 +131,9 @@
     float scaledK1;
     float scaledK4;
     if (b1)
-        scaledK1 = k1 / 255.f;
+        scaledK1 = k1 / 255.0f;
     if (b4)
-        scaledK4 = k4 * 255.f;
+        scaledK4 = k4 * 255.0f;
 
     while (--pixelArrayLength >= 0) {
         unsigned char i1 = *source;
@@ -155,24 +155,63 @@
     }
 }
 
-static inline void arithmeticSoftware(unsigned char* source, unsigned char* destination, int pixelArrayLength,
-                       float k1, float k2, float k3, float k4)
+// computeArithmeticPixelsUnclamped is a faster version of computeArithmeticPixels for the common case where clamping
+// is not necessary. This enables aggressive compiler optimizations such as auto-vectorization.
+template <int b1, int b4>
+static inline void computeArithmeticPixelsUnclamped(unsigned char* source, unsigned char* destination, int pixelArrayLength, float k1, float k2, float k3, float k4)
 {
-    if (!k4) {
-        if (!k1) {
-            computeArithmeticPixels<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
-            return;
-        }
+    float scaledK1;
+    float scaledK4;
+    if (b1)
+        scaledK1 = k1 / 255.0f;
+    if (b4)
+        scaledK4 = k4 * 255.0f;
 
-        computeArithmeticPixels<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
-        return;
+    while (--pixelArrayLength >= 0) {
+        unsigned char i1 = *source;
+        unsigned char i2 = *destination;
+        float result = k2 * i1 + k3 * i2;
+        if (b1)
+            result += scaledK1 * i1 * i2;
+        if (b4)
+            result += scaledK4;
+
+        *destination = result;
+        ++source;
+        ++destination;
     }
+}
 
-    if (!k1) {
-        computeArithmeticPixels<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+static inline void arithmeticSoftware(unsigned char* source, unsigned char* destination, int pixelArrayLength, float k1, float k2, float k3, float k4)
+{
+    float upperLimit = std::max(0.0f, k1) + std::max(0.0f, k2) + std::max(0.0f, k3) + k4;
+    float lowerLimit = std::min(0.0f, k1) + std::min(0.0f, k2) + std::min(0.0f, k3) + k4;
+    if ((k4 >= 0.0f && k4 <= 1.0f) && (upperLimit >= 0.0f && upperLimit <= 1.0f) && (lowerLimit >= 0.0f && lowerLimit <= 1.0f)) {
+        if (k4) {
+            if (k1)
+                computeArithmeticPixelsUnclamped<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+            else
+                computeArithmeticPixelsUnclamped<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        } else {
+            if (k1)
+                computeArithmeticPixelsUnclamped<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+            else
+                computeArithmeticPixelsUnclamped<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        }
         return;
     }
-    computeArithmeticPixels<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+
+    if (k4) {
+        if (k1)
+            computeArithmeticPixels<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        else
+            computeArithmeticPixels<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+    } else {
+        if (k1)
+            computeArithmeticPixels<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        else
+            computeArithmeticPixels<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+    }
 }
 
 inline void FEComposite::platformArithmeticSoftware(Uint8ClampedArray* source, Uint8ClampedArray* destination,

_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo/webkit-changes