Author: post
Date: 2010-10-16 16:04:18 +0200 (Sat, 16 Oct 2010)
New Revision: 3561

Modified:
   trunk/plugins/dcp/dcp-sse4.c
Log:
Add SSE4 tonecurve assembler.

Modified: trunk/plugins/dcp/dcp-sse4.c
===================================================================
--- trunk/plugins/dcp/dcp-sse4.c        2010-10-16 14:03:21 UTC (rev 3560)
+++ trunk/plugins/dcp/dcp-sse4.c        2010-10-16 14:04:18 UTC (rev 3561)
@@ -470,8 +470,108 @@
        *_v = v;
 }
 
+static gfloat _thousand_24_ps[4] __attribute__ ((aligned (16))) = 
{1023.99999f, 1023.99999f, 1023.99999f, 1023.99999f};
 
+static inline __m128 
+curve_interpolate_lookup(__m128 value, const gfloat * const tone_lut)
+{
+       /* Convert v to lookup values and interpolate */
+       __m128 mul = _mm_mul_ps(value, _mm_load_ps(_thousand_24_ps));
+       __m128i lookup = _mm_cvtps_epi32(mul);
 
+       /* Calculate fractions */
+       __m128 frac = _mm_sub_ps(mul, _mm_floor_ps(mul));
+       __m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
+
+       /* Load two adjacent curve values and interpolate between them */
+       __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[_mm_extract_epi32(lookup,0)*2]));
+       __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[_mm_extract_epi32(lookup,2)*2]));
+       p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&tone_lut[_mm_extract_epi32(lookup,1)*2]);
+       p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&tone_lut[_mm_extract_epi32(lookup,3)*2]);
+
+       /* Pack all lower values in v0, high in v1 and interpolate */
+       __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(2,0,2,0));
+       __m128 v1 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(3,1,3,1));
+       return _mm_add_ps(_mm_mul_ps(inv_frac, v0), _mm_mul_ps(frac, v1));
+}
+
+/* TODO: Possibly use blend_ps instead of masking and or'ing */
+static void 
+rgb_tone_sse4(__m128* _r, __m128* _g, __m128* _b, const gfloat * const 
tone_lut)
+{
+       __m128 r = *_r;
+       __m128 g = *_g;
+       __m128 b = *_b;
+       __m128 small_ps = _mm_load_ps(_very_small_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       
+       /* Clamp  to avoid lookups out of table */
+       r = _mm_min_ps(_mm_max_ps(r, small_ps),ones_ps);
+       g =  _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
+       b =  _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
+       
+       /* Find largest and smallest values */
+       __m128 lg = _mm_max_ps(b, _mm_max_ps(r, g));
+       __m128 sm = _mm_min_ps(b, _mm_min_ps(r, g));
+
+       /* Lookup */
+       __m128 LG = curve_interpolate_lookup(lg, tone_lut);
+       __m128 SM = curve_interpolate_lookup(sm, tone_lut);
+
+       /* Create masks for largest, smallest and medium values */
+       /* This is done in integer SSE2, since they have double the throughput 
*/
+       __m128i ones = _mm_cmpeq_epi32(DW(r), DW(r));
+       __m128i is_r_lg = _mm_cmpeq_epi32(DW(r), DW(lg));
+       __m128i is_g_lg = _mm_cmpeq_epi32(DW(g), DW(lg));
+       __m128i is_b_lg = _mm_cmpeq_epi32(DW(b), DW(lg));
+       
+       __m128i is_r_sm = _mm_andnot_si128(is_r_lg, _mm_cmpeq_epi32(DW(r), 
DW(sm)));
+       __m128i is_g_sm = _mm_andnot_si128(is_g_lg, _mm_cmpeq_epi32(DW(g), 
DW(sm)));
+       __m128i is_b_sm = _mm_andnot_si128(is_b_lg, _mm_cmpeq_epi32(DW(b), 
DW(sm)));
+       
+       __m128i is_r_md = _mm_xor_si128(ones, _mm_or_si128(is_r_lg, is_r_sm));
+       __m128i is_g_md = _mm_xor_si128(ones, _mm_or_si128(is_g_lg, is_g_sm));
+       __m128i is_b_md = _mm_xor_si128(ones, _mm_or_si128(is_b_lg, is_b_sm));
+
+       /* Find all medium values based on masks */
+       __m128 md = PS(_mm_or_si128(_mm_or_si128(
+                                  _mm_and_si128(DW(r), is_r_md), 
+                                                                
_mm_and_si128(DW(g), is_g_md)),
+                                                                               
           _mm_and_si128(DW(b), is_b_md)));
+
+       /* Calculate tone corrected medium value */
+       __m128 p = _mm_rcp_ps(_mm_sub_ps(lg, sm));
+       __m128 q = _mm_sub_ps(md, sm);
+       __m128 o = _mm_sub_ps(LG, SM);
+       __m128 MD = _mm_add_ps(SM, _mm_mul_ps(o, _mm_mul_ps(p, q)));
+
+       /* Inserted here again, to lighten register presssure */
+       is_r_lg = _mm_cmpeq_epi32(DW(r), DW(lg));
+       is_g_lg = _mm_cmpeq_epi32(DW(g), DW(lg));
+       is_b_lg = _mm_cmpeq_epi32(DW(b), DW(lg));
+
+       /* Combine corrected values to output RGB */
+       r = PS(_mm_or_si128( _mm_or_si128(
+                  _mm_and_si128(DW(LG), is_r_lg),
+                                                _mm_and_si128(DW(SM), 
is_r_sm)), 
+                                                                          
_mm_and_si128(DW(MD), is_r_md)));
+       
+       g = PS(_mm_or_si128( _mm_or_si128(
+                  _mm_and_si128(DW(LG), is_g_lg),
+                                                _mm_and_si128(DW(SM), 
is_g_sm)), 
+                                                                          
_mm_and_si128(DW(MD), is_g_md)));
+       
+       b = PS(_mm_or_si128( _mm_or_si128(
+                  _mm_and_si128(DW(LG), is_b_lg),
+                                                _mm_and_si128(DW(SM), 
is_b_sm)), 
+                                                                          
_mm_and_si128(DW(MD), is_b_md)));
+
+       *_r = r;
+       *_g = g;
+       *_b = b;
+}
+
+
 #define SETFLOAT4(N, A, B, C, D) float N[4] __attribute__ ((aligned (16))); \
 N[0] = D; N[1] = C; N[2] = B; N[3] = A;
 
@@ -761,7 +861,7 @@
                        /* Apply Tone Curve  in RGB space*/
                        if (dcp->tone_curve_lut) 
                        {
-                               rgb_tone_sse2( &r, &g, &b, dcp->tone_curve_lut);
+                               rgb_tone_sse4( &r, &g, &b, dcp->tone_curve_lut);
                        }
 
                        /* Convert to 16 bit */


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to