Author: post
Date: 2010-10-12 17:48:57 +0200 (Tue, 12 Oct 2010)
New Revision: 3547

Modified:
   trunk/plugins/dcp/dcp-sse2.c
   trunk/plugins/dcp/dcp-sse4.c
   trunk/plugins/dcp/dcp.c
Log:
Use double-size tables for the curve and tone-curve lookups. That way we avoid 
cache-line splits and unaligned 64-bit lookups, which are very expensive on Intel 
processors; the change also makes lookups faster on AMD. 

Modified: trunk/plugins/dcp/dcp-sse2.c
===================================================================
--- trunk/plugins/dcp/dcp-sse2.c        2010-10-10 12:13:25 UTC (rev 3546)
+++ trunk/plugins/dcp/dcp-sse2.c        2010-10-12 15:48:57 UTC (rev 3547)
@@ -29,6 +29,10 @@
 /* _mm_insert_epi32, since no-one was kind enough to include "insertps xmm, 
mem32, imm8" */
 /* as a valid intrinsic. So we use the integer equivalent instead */
 
+/* Regarding table lookups: */
+/* We are using double sized tables to avoid cache-splits, */
+/* when looking up curve and rgb_tone */
+
 static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 
1.0f};
 static gfloat _two_ps[4] __attribute__ ((aligned (16))) = {2.0f, 2.0f, 2.0f, 
2.0f};
 static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15, 
6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15};
@@ -459,10 +463,10 @@
        __m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
 
        /* Load two adjacent curve values and interpolate between them */
-       __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[0]]));
-       __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[2]]));
-       p0p1 = _mm_loadh_pi(p0p1, (__m64*)&tone_lut[xfer[1]]);
-       p2p3 = _mm_loadh_pi(p2p3, (__m64*)&tone_lut[xfer[3]]);
+       __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[0]*2]));
+       __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[2]*2]));
+       p0p1 = _mm_loadh_pi(p0p1, (__m64*)&tone_lut[xfer[1]*2]);
+       p2p3 = _mm_loadh_pi(p2p3, (__m64*)&tone_lut[xfer[3]*2]);
 
        /* Pack all lower values in v0, high in v1 and interpolate */
        __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(2,0,2,0));
@@ -850,10 +854,10 @@
                                __m128 inv_frac = 
_mm_sub_ps(_mm_load_ps(_ones_ps), frac);
                                
                                /* Load two adjacent curve values and 
interpolate between them */
-                               __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]]));
-                               __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]]));
-                               p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&dcp->curve_samples[xfer[1]]);
-                               p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&dcp->curve_samples[xfer[3]]);
+                               __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]*2]));
+                               __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]*2]));
+                               p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&dcp->curve_samples[xfer[1]*2]);
+                               p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&dcp->curve_samples[xfer[3]*2]);
                                
                                /* Pack all lower values in v0, high in v1 and 
interpolate */
                                __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, 
_MM_SHUFFLE(2,0,2,0));

Modified: trunk/plugins/dcp/dcp-sse4.c
===================================================================
--- trunk/plugins/dcp/dcp-sse4.c        2010-10-10 12:13:25 UTC (rev 3546)
+++ trunk/plugins/dcp/dcp-sse4.c        2010-10-12 15:48:57 UTC (rev 3547)
@@ -727,10 +727,10 @@
                                __m128 inv_frac = 
_mm_sub_ps(_mm_load_ps(_ones_ps), frac);
                                
                                /* Load two adjacent curve values and 
interpolate between them */
-                               __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]]));
-                               __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]]));
-                               p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&dcp->curve_samples[xfer[1]]);
-                               p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&dcp->curve_samples[xfer[3]]);
+                               __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]*2]));
+                               __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]*2]));
+                               p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&dcp->curve_samples[xfer[1]*2]);
+                               p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&dcp->curve_samples[xfer[3]*2]);
                                
                                /* Pack all lower values in v0, high in v1 and 
interpolate */
                                __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, 
_MM_SHUFFLE(2,0,2,0));

Modified: trunk/plugins/dcp/dcp.c
===================================================================
--- trunk/plugins/dcp/dcp.c     2010-10-10 12:13:25 UTC (rev 3546)
+++ trunk/plugins/dcp/dcp.c     2010-10-12 15:48:57 UTC (rev 3547)
@@ -24,6 +24,7 @@
 #include "dcp.h"
 #include "adobe-camera-raw-tone.h"
 #include <string.h> /* memcpy */
+#include <stdlib.h>  /* posix_memalign() */
 
 RS_DEFINE_FILTER(rs_dcp, RSDcp)
 
@@ -59,7 +60,8 @@
 {
        RSDcp *dcp = RS_DCP(object);
 
-       g_free(dcp->curve_samples);
+       if (dcp->curve_samples)
+               free(dcp->curve_samples);
        g_free(dcp->_huesatmap_precalc_unaligned);
        g_free(dcp->_looktable_precalc_unaligned);
 
@@ -224,9 +226,9 @@
                                                value = powf(value, 2.2f);
 
                                                /* Store in table */
-                                               dcp->curve_samples[i] = value;
+                                               dcp->curve_samples[i*2] = 
dcp->curve_samples[i*2+1] = value;
                                        }
-                                       dcp->curve_samples[256] = 
dcp->curve_samples[255];
+                                       dcp->curve_samples[256*2] = 
dcp->curve_samples[256*2+1] = dcp->curve_samples[255*2];
                                }
                        }
                        if (knots)
@@ -235,7 +237,7 @@
                else
                        dcp->curve_is_flat = TRUE;
 
-               for(i=0;i<257;i++)
+               for(i=0;i<257*2;i++)
                        dcp->curve_samples[i] = MIN(1.0f, MAX(0.0f, 
dcp->curve_samples[i]));
 
                changed = TRUE;
@@ -260,7 +262,7 @@
        if (dcp->huesatmap2)
                g_object_unref(dcp->huesatmap2);
        if (dcp->tone_curve_lut)
-               g_free(dcp->tone_curve_lut);
+               free(dcp->tone_curve_lut);
        dcp->huesatmap1 = NULL;
        dcp->huesatmap2 = NULL;
        dcp->huesatmap_interpolated = NULL;
@@ -277,8 +279,7 @@
 rs_dcp_init(RSDcp *dcp)
 {
        RSDcpClass *klass = RS_DCP_GET_CLASS(dcp);
-
-       dcp->curve_samples = g_new(gfloat, 257);
+       g_assert(0 == posix_memalign((void**)&dcp->curve_samples, 16, 
sizeof(gfloat)*2*257));
        dcp->huesatmap_interpolated = NULL;
        dcp->use_profile = FALSE;
        dcp->curve_is_flat = TRUE;
@@ -794,8 +795,8 @@
 lookup_tone(gfloat value, const gfloat * const tone_lut)
 {
        gfloat lookup = CLAMP(value * 1024.0f, 0.0f, 1023.9999f);
-       gfloat v0 = tone_lut[(gint)lookup];
-       gfloat v1 = tone_lut[(gint)lookup + 1];
+       gfloat v0 = tone_lut[(gint)lookup*2];
+       gfloat v1 = tone_lut[(gint)lookup*2 + 1];
        lookup -= floorf(lookup);
        return v0 * (1.0f - lookup) + v1 * lookup;      
 }
@@ -887,13 +888,13 @@
        /* Preloads cache with lookup data */
        if (!dcp->curve_is_flat)
        {
-               for (i = 0; i < 257; i+=(cache_line_bytes/sizeof(gfloat)))
+               for (i = 0; i < 514; i+=(cache_line_bytes/sizeof(gfloat)))
                        unused = dcp->curve_samples[i];
        }
 
        if (dcp->tone_curve_lut) 
        {
-               for (i = 0; i < 1025; i+=(cache_line_bytes/sizeof(gfloat)))
+               for (i = 0; i < 2050; i+=(cache_line_bytes/sizeof(gfloat)))
                        unused = dcp->tone_curve_lut[i];
        }
 
@@ -1058,8 +1059,8 @@
                        if (!dcp->curve_is_flat)
                        {
                                gfloat lookup = CLAMP(v * 256.0f, 0.0f, 
255.9999f);
-                               gfloat v0 = dcp->curve_samples[(gint)lookup];
-                               gfloat v1 = dcp->curve_samples[(gint)lookup + 
1];
+                               gfloat v0 = dcp->curve_samples[(gint)lookup*2];
+                               gfloat v1 = dcp->curve_samples[(gint)lookup*2 + 
1];
                                lookup -= floorf(lookup);
                                v = v0 * (1.0f - lookup) + v1 * lookup;
                        }
@@ -1253,6 +1254,7 @@
 static void
 read_profile(RSDcp *dcp, RSDcpFile *dcp_file)
 {
+       gint i;
        free_dcp_profile(dcp);
        
        /* ColorMatrix */
@@ -1268,7 +1270,6 @@
        dcp->tone_curve = rs_dcp_file_get_tonecurve(dcp_file);
        if (!dcp->tone_curve)
        {
-               gint i;
                gint num_knots = adobe_default_table_size;
                gfloat *knots = g_new0(gfloat, adobe_default_table_size * 2);
 
@@ -1280,10 +1281,13 @@
                dcp->tone_curve = rs_spline_new(knots, num_knots, NATURAL);
                g_free(knots);
        }
-       dcp->tone_curve_lut = g_new(gfloat, 1025);
+       g_assert(0 == posix_memalign((void**)&dcp->tone_curve_lut, 16, 
sizeof(gfloat)*2*1025));
        gfloat *tc = rs_spline_sample(dcp->tone_curve, NULL, 1024);
-       memcpy(dcp->tone_curve_lut, tc, 1024*sizeof(gfloat));
-       dcp->tone_curve_lut[1024] = dcp->tone_curve_lut[1023];
+       for (i=0; i< 1024; i++)
+       {
+               dcp->tone_curve_lut[i*2] = dcp->tone_curve_lut[i*2+1] = tc[i];
+       }
+       dcp->tone_curve_lut[1024*2] = dcp->tone_curve_lut[1024*2+1] = tc[1023];
        g_free(tc);
 
        /* ForwardMatrix */


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to