Author: post
Date: 2010-06-25 16:25:52 +0200 (Fri, 25 Jun 2010)
New Revision: 3442

Modified:
   trunk/plugins/dcp/dcp-sse4.c
Log:
Move SSE2 curve and contrast into SSE4 code.

Modified: trunk/plugins/dcp/dcp-sse4.c
===================================================================
--- trunk/plugins/dcp/dcp-sse4.c        2010-06-23 07:52:05 UTC (rev 3441)
+++ trunk/plugins/dcp/dcp-sse4.c        2010-06-25 14:25:52 UTC (rev 3442)
@@ -23,6 +23,8 @@
 #ifdef __SSE4_1__
 #include <smmintrin.h>
 
+#include <math.h> /* powf() */
+
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 /* We ignore this pragma, because we are casting a pointer from float to int 
to pass a float using */
 /* _mm_insert_epi32, since no-one was kind enough to include "insertps xmm, 
mem32, imm8" */
@@ -54,7 +56,6 @@
        return acc;
 }
 
-static gfloat _half_ps[4] __attribute__ ((aligned (16))) = 
{0.5f,0.5f,0.5f,0.5f};
 static gfloat _rgb_div_ps[4] __attribute__ ((aligned (16))) = {1.0/65535.0, 
1.0/65535.0, 1.0/65535.0, 1.0/65535.0};
 
 
@@ -86,7 +87,7 @@
        __m128 v_mask = _mm_cmpeq_ps(gap, zero_ps);
        v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask));
 
-       h = _mm_xor_ps(r,r);
+       h = _mm_setzero_ps();
 
        /* Set gap to one where sat = 0, this will avoid divisions by zero, 
these values will not be used */
        ones_ps = _mm_and_ps(ones_ps, v_mask);
@@ -476,6 +477,7 @@
 #define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \
 N[0] = A; N[1] = A; N[2] = A; N[3] = A;
 
+static gfloat _twofiftysix_ps[4] __attribute__ ((aligned (16))) = 
{255.9999f,255.9999f,255.9999f,255.9999f};
 
 
 gboolean
@@ -490,9 +492,17 @@
        __m128 r, g, b, r2, g2, b2;
        __m128i zero;
        
-       gboolean do_contrast = (ABS(1.0f - dcp->contrast) > 0.001f);
+       gboolean do_contrast = (dcp->contrast > 1.001f);
+       gboolean do_highrec = (dcp->contrast < 0.999f);
        __m128 hue_add = _mm_set_ps(dcp->hue, dcp->hue, dcp->hue, dcp->hue);
        __m128 sat = _mm_set_ps(dcp->saturation, dcp->saturation, 
dcp->saturation, dcp->saturation);
+       float exposure_simple = MAX(1.0, powf(2.0f, dcp->exposure));
+       float __recover_radius = 0.5 * exposure_simple;
+       SETFLOAT4_SAME(_inv_recover_radius, 1.0f / __recover_radius);
+       SETFLOAT4_SAME(_recover_radius, 1.0 - __recover_radius);
+       SETFLOAT4_SAME(_contr_base, 0.5f);
+       SETFLOAT4_SAME(_inv_contrast, 1.0f - dcp->contrast);
+       int xfer[4] __attribute__ ((aligned (16)));
 
        SETFLOAT4(_min_cam, 0.0f, dcp->camera_white.z, dcp->camera_white.y, 
dcp->camera_white.x);
        SETFLOAT4_SAME(_black_minus_radius, dcp->exposure_black - 
dcp->exposure_radius);
@@ -611,6 +621,7 @@
                        h = _mm_sub_ps(h, six_masked_gt);
                        h = _mm_add_ps(h, six_masked_lt);
 
+                       __m128 v_stored = v;
                        HSVtoRGB_SSE4(&h, &s, &v);
                        r = h; g = s; b = v;
                        
@@ -650,41 +661,66 @@
                        /* Contrast in gamma 2.0 */
                        if (do_contrast)
                        {
-                               __m128 half_ps = _mm_load_ps(_half_ps);
+                               __m128 contr_base = _mm_load_ps(_contr_base);
                                __m128 contrast = _mm_load_ps(_contrast);
                                min_val = _mm_load_ps(_very_small_ps);
-                               r = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(r), half_ps)), half_ps);
-                               g = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(g), half_ps)), half_ps);
-                               b = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(b), half_ps)), half_ps);
                                r = _mm_max_ps(r, min_val);
                                g = _mm_max_ps(g, min_val);
                                b = _mm_max_ps(b, min_val);
+                               r = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(r), contr_base)), contr_base);
+                               g = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(g), contr_base)), contr_base);
+                               b = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(b), contr_base)), contr_base);
+                               r = _mm_max_ps(r, min_val);
+                               g = _mm_max_ps(g, min_val);
+                               b = _mm_max_ps(b, min_val);
                                r = _mm_mul_ps(r,r);
                                g = _mm_mul_ps(g,g);
                                b = _mm_mul_ps(b,b);
                        }
+                       else if (do_highrec)
+                       {
+                               max_val = _mm_load_ps(_ones_ps);
+                               __m128 inv_contrast = 
_mm_load_ps(_inv_contrast);
+                               __m128 recover_radius = 
_mm_load_ps(_recover_radius);
+                               __m128 inv_recover_radius = 
_mm_load_ps(_inv_recover_radius);
 
+                               /* Distance from 1.0 - radius */
+                               __m128 dist = _mm_sub_ps(v_stored, 
recover_radius);
+                               /* Scale so distance is normalized, clamp */
+                               __m128 dist_scaled = _mm_min_ps(max_val, 
_mm_mul_ps(dist, inv_recover_radius));
+
+                               __m128 mul_val = _mm_sub_ps(max_val, 
_mm_mul_ps(dist_scaled, inv_contrast));
+
+                               r = _mm_mul_ps(r, mul_val);
+                               g = _mm_mul_ps(g, mul_val);
+                               b = _mm_mul_ps(b, mul_val);
+                       }
+
                        /* Convert to HSV */
                        RGBtoHSV_SSE4(&r, &g, &b);
                        h = r; s = g; v = b;
 
                        if (!dcp->curve_is_flat)                        
                        {
-                               /* Convert v to lookup values */
-                               /* TODO: Use 8 bit fraction as interpolation, 
for interpolating
-                               * a more precise lookup using linear 
interpolation. Maybe use less than
-                               * 16 bits for lookup for speed, 10 bits with 
interpolation should be enough */
-                               __m128 v_mul = _mm_load_ps(_16_bit_ps);
-                               v = _mm_mul_ps(v, v_mul);
-                               __m128i lookup = _mm_cvtps_epi32(v);
+                               /* Convert v to lookup values and interpolate */
+                               __m128 v_mul = _mm_mul_ps(v, 
_mm_load_ps(_twofiftysix_ps));
+                               __m128i lookup = _mm_cvtps_epi32(v_mul);
+                               _mm_store_si128((__m128i*)&xfer[0], lookup);
 
-                               __m128i v_curved = lookup;
-                               v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,0)], 0);
-                               v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,1)], 1);
-                               v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,2)], 2);
-                               v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,3)], 3);
-
-                               v = PS(v_curved);
+                               /* Calculate fractions */
+                               __m128 frac = _mm_sub_ps(v_mul, 
_mm_floor_ps(v_mul));
+                               __m128 inv_frac = 
_mm_sub_ps(_mm_load_ps(_ones_ps), frac);
+                               
+                               /* Load two adjacent curve values and 
interpolate between them */
+                               __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]]));
+                               __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]]));
+                               p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&dcp->curve_samples[xfer[1]]);
+                               p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&dcp->curve_samples[xfer[3]]);
+                               
+                               /* Pack all lower values in v0, high in v1 and 
interpolate */
+                               __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, 
_MM_SHUFFLE(2,0,2,0));
+                               __m128 v1 = _mm_shuffle_ps(p0p1, p2p3, 
_MM_SHUFFLE(3,1,3,1));
+                               v = _mm_add_ps(_mm_mul_ps(inv_frac, v0), 
_mm_mul_ps(frac, v1));
                        }
 
                        /* Apply looktable */


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to