Author: post
Date: 2010-01-31 12:31:53 +0100 (Sun, 31 Jan 2010)
New Revision: 3143

Modified:
   trunk/plugins/lensfun/lensfun-sse2.c
Log:
Lensfun: Calculate image offsets in SSE2 to avoid many expensive GET_PIXEL 
commands.

Modified: trunk/plugins/lensfun/lensfun-sse2.c
===================================================================
--- trunk/plugins/lensfun/lensfun-sse2.c        2010-01-31 10:50:07 UTC (rev 3142)
+++ trunk/plugins/lensfun/lensfun-sse2.c        2010-01-31 11:31:53 UTC (rev 3143)
@@ -27,6 +27,7 @@
 #include <emmintrin.h>
 
 static gfloat twofiftytwo_ps[4] __attribute__ ((aligned (16))) = {256.0f, 256.0f, 256.0f, 0.0f};
+static gint _zero12[4] __attribute__ ((aligned (16))) = {0,1,2,0};
 
 gboolean is_sse2_compiled()
 {
@@ -98,29 +99,61 @@
 
        int xfer[16] __attribute__ ((aligned (16)));
 
-       _mm_store_si128((__m128i*)xfer, _mm_srai_epi32(x, 8));
-       _mm_store_si128((__m128i*)&xfer[4], _mm_srai_epi32(y, 8));
-       _mm_store_si128((__m128i*)&xfer[8], nx);
-       _mm_store_si128((__m128i*)&xfer[12], ny);
+       /* Pitch as pixels */
+       __m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >> 2)<<16));
+
+       /* Remove remainder */
+       __m128i tx = _mm_srai_epi32(x, 8);
+       __m128i ty = _mm_srai_epi32(y, 8);
        
+       /* Multiply y by pitch */
+       ty = _mm_packs_epi32(ty, ty);
+       __m128i ty_lo = _mm_mullo_epi16(ty, pitch);
+       __m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
+       ty = _mm_unpacklo_epi16(ty_lo, ty_hi);
+       
+       /* Same to next pixel */
+       ny = _mm_packs_epi32(ny, ny);
+       __m128i ny_lo = _mm_mullo_epi16(ny, pitch);
+       __m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
+       ny = _mm_unpacklo_epi16(ny_lo, ny_hi);
+       
+       /* Add pitch and x offset */
+       __m128i a_offset =  _mm_add_epi32(tx, ty);
+       __m128i b_offset =  _mm_add_epi32(nx, ty);
+       __m128i c_offset =  _mm_add_epi32(tx, ny);
+       __m128i d_offset =  _mm_add_epi32(nx, ny);
+
+       /* Multiply by pixelsize and add RGB offsets */
+       __m128i zero12 = _mm_load_si128((__m128i*)_zero12);
+       a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
+       b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
+       c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
+       d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));
+
+       _mm_store_si128((__m128i*)xfer, a_offset);
+       _mm_store_si128((__m128i*)&xfer[4], b_offset);
+       _mm_store_si128((__m128i*)&xfer[8], c_offset);
+       _mm_store_si128((__m128i*)&xfer[12], d_offset);
+       
        gushort* pixels[12];
        
        /* Loop unrolled, allows aggressive instruction reordering */
        /* Red, then G & B */
-       pixels[0] = GET_PIXEL(in, xfer[0], xfer[4]);    // a
-       pixels[1] = GET_PIXEL(in, xfer[8], xfer[4]);    // b
-       pixels[2] = GET_PIXEL(in, xfer[0], xfer[12]);   // c
-       pixels[3] = GET_PIXEL(in, xfer[8], xfer[12]);   // d
+       pixels[0] = in->pixels + xfer[0];       // a
+       pixels[1] = in->pixels + xfer[4];       // b
+       pixels[2] = in->pixels + xfer[8];       // c
+       pixels[3] = in->pixels + xfer[12];      // d
                
-       pixels[4] = GET_PIXEL(in, xfer[1], xfer[1+4]) + 1;              // a
-       pixels[4+1] = GET_PIXEL(in, xfer[1+8], xfer[1+4]) + 1;  // b
-       pixels[4+2] = GET_PIXEL(in, xfer[1], xfer[1+12]) + 1;   // c
-       pixels[4+3] = GET_PIXEL(in, xfer[1+8], xfer[1+12]) + 1; // d
+       pixels[4] = in->pixels + xfer[1+0];             // a
+       pixels[5] = in->pixels + xfer[1+4];             // b
+       pixels[6] = in->pixels + xfer[1+8];             // c
+       pixels[7] = in->pixels + xfer[1+12];    // d
 
-       pixels[2*4] = GET_PIXEL(in, xfer[2], xfer[2+4]) + 2;            // a
-       pixels[2*4+1] = GET_PIXEL(in, xfer[2+8], xfer[2+4]) + 2;        // b
-       pixels[2*4+2] = GET_PIXEL(in, xfer[2], xfer[2+12]) + 2;         // c
-       pixels[2*4+3] = GET_PIXEL(in, xfer[2+8], xfer[2+12]) + 2;       // d
+       pixels[8] = in->pixels + xfer[2+0];             // a
+       pixels[9] = in->pixels + xfer[2+4];             // b
+       pixels[10] = in->pixels + xfer[2+8];    // c
+       pixels[11] = in->pixels + xfer[2+12];   // d
 
        /* Calculate distances */
        __m128i twofiftyfive = _mm_set1_epi32(255);


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to