# HG changeset patch
# User Alexey Osipov <si...@lerlan.ru>
# Date 1310760987 -25200
# Branch stabilize_optimize
# Node ID ac09f716b03da55b8a3e5bd47a0f38e377e6ece8
# Parent ca546347d93b572f79923401fad4b90a5334b53c
Using SSE2-optimized code for compareSubImg() and contrastSubImgYUV().
These optimizations are switchable at compile time and honour the
HAVE_ASM_SSE2 define from config.h.
diff -r ca546347d93b -r ac09f716b03d filter/stabilize/filter_stabilize.c
--- a/filter/stabilize/filter_stabilize.c Fri Jul 15 17:10:40 2011 +0700
+++ b/filter/stabilize/filter_stabilize.c Sat Jul 16 03:16:27 2011 +0700
@@ -7,7 +7,7 @@
  *
  *  Copyright (C) Alexey Osipov - Jule 2011
  *  simba at lerlan dot ru
- *  speed optimizations
+ *  speed optimizations including SSE2 code
  *
  *  This file is part of transcode, a video stream processing tool
  *
@@ -68,6 +68,44 @@
  * this is really just for debugging and development
  */
 // #define STABVERBOSE
 
+#ifdef HAVE_ASM_SSE2
+
+/* use SSE2 for compareSubImg */
+#define USE_SSE2_CMP
+
+/* also do the horizontal summation for compareSubImg in SSE2;
+ * sometimes this may be slower;
+ * enabling this also requires SSE2_CMP_SUM_ROWS to be at most 8 */
+#define USE_SSE2_CMP_HOR
+
+/* how many 16-byte blocks to sum up in SSE2 registers
+ * before flushing them to a regular variable,
+ * from 1 to 255;
+ * larger values are faster, but may saturate the 16-bit accumulator lanes,
+ * which leads to incorrect transformation data.
+ * lower values are not much slower, but safer.
+ * if USE_SSE2_CMP_HOR is enabled, this must not be larger than 8 */
+#define SSE2_CMP_SUM_ROWS 8
+
+/* use SSE2 for contrastSubImg (only the YUV version);
+ * may be used without USE_SSE2_CMP */
+#define USE_SSE2_YUV_CONTRAST
+
+
+#ifdef USE_SSE2_CMP
+#define NEED_EMMINTRIN
+#endif
+
+#ifdef USE_SSE2_YUV_CONTRAST
+#define NEED_EMMINTRIN
+#endif
+
+#ifdef NEED_EMMINTRIN
+#include <emmintrin.h>
+#endif
+
+#endif
+
 #define MAXLONG ((unsigned long int)(-1))
 
 typedef struct _field {
@@ -162,6 +200,9 @@
                      const Field* field,
                      int width, int height, int bytesPerPixel,int d_x,int d_y,
                      unsigned long int treshold);
 double contrastSubImgYUV(StabData* sd, const Field* field);
+#ifdef USE_SSE2_YUV_CONTRAST
+double contrastSubImgYUVSSE(unsigned char* const I, const Field* field, int width, int height);
+#endif
 double contrastSubImgRGB(StabData* sd, const Field* field);
 double contrastSubImg(unsigned char* const I, const Field* field,
                       int width, int height, int bytesPerPixel);
@@ -314,26 +355,117 @@
     int s2 = field->size / 2;
     unsigned long int sum = 0;
 
+#ifdef USE_SSE2_CMP
+    static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
+    unsigned char row = 0;
+#ifndef USE_SSE2_CMP_HOR
+    unsigned char summes[16];
+    int i;
+#endif
+    __m128i xmmsum, xmmmask;
+    xmmsum = _mm_setzero_si128();
+    xmmmask = _mm_loadu_si128((__m128i*)mask);
+#endif
+
     p1=I1 + ((field->x - s2) + (field->y - s2)*width)*bytesPerPixel;
     p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width)*bytesPerPixel;
-    // TODO: use some mmx or sse stuff here
     for (j = 0; j < field->size; j++){
+#ifdef USE_SSE2_CMP
+        for (k = 0; k < field->size * bytesPerPixel; k+=16){
+            {
+                __m128i xmm0, xmm1, xmm2;
+                xmm0 = _mm_loadu_si128((__m128i*)p1);
+                xmm1 = _mm_loadu_si128((__m128i*)p2);
+
+                xmm2 = _mm_subs_epu8(xmm0, xmm1);
+                xmm0 = _mm_subs_epu8(xmm1, xmm0);
+                xmm0 = _mm_adds_epu8(xmm0, xmm2);
+
+                xmm1 = _mm_and_si128(xmm0, xmmmask);
+                xmm0 = _mm_srli_si128(xmm0, 1);
+                xmm0 = _mm_and_si128(xmm0, xmmmask);
+
+                xmmsum = _mm_adds_epu16(xmmsum, xmm0);
+                xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+            }
+
+            p1+=16;
+            p2+=16;
+
+            row++;
+            if (row == SSE2_CMP_SUM_ROWS) {
+                row = 0;
+#ifdef USE_SSE2_CMP_HOR
+                {
+                    __m128i xmm1;
+
+                    xmm1 = _mm_srli_si128(xmmsum, 8);
+                    xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+                    xmm1 = _mm_srli_si128(xmmsum, 4);
+                    xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+                    xmm1 = _mm_srli_si128(xmmsum, 2);
+                    xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+                    sum += _mm_extract_epi16(xmmsum, 0);
+                }
+#else
+                _mm_storeu_si128((__m128i*)summes, xmmsum);
+                for(i = 0; i < 16; i+=2)
+                    sum += summes[i] + summes[i+1]*256;
+#endif
+                xmmsum = _mm_setzero_si128();
+            }
+#else
         for (k = 0; k < field->size * bytesPerPixel; k++) {
             sum += abs((int)*p1 - (int)*p2);
             p1++;
             p2++;
+#endif
         }
         if (sum > treshold)
             break;
         p1 += (width - field->size) * bytesPerPixel;
         p2 += (width - field->size) * bytesPerPixel;
     }
+
+#if defined(USE_SSE2_CMP) && (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
+    //process any data left in the accumulator;
+    //this part is compiled out if
+    //SSE2_CMP_SUM_ROWS is one of {1, 2, 4, 8, 16}
+#ifdef USE_SSE2_CMP_HOR
+    {
+        __m128i xmm1;
+
+        xmm1 = _mm_srli_si128(xmmsum, 8);
+        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+        xmm1 = _mm_srli_si128(xmmsum, 4);
+        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+        xmm1 = _mm_srli_si128(xmmsum, 2);
+        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+        sum += _mm_extract_epi16(xmmsum, 0);
+    }
+#else
+    _mm_storeu_si128((__m128i*)summes, xmmsum);
+    for(i = 0; i < 16; i+=2)
+        sum += summes[i] + summes[i+1]*256;
+#endif
+#endif
+
     return sum;
 }
 
 /** \see contrastSubImg called with bytesPerPixel=1*/
 double contrastSubImgYUV(StabData* sd, const Field* field){
+#ifdef USE_SSE2_YUV_CONTRAST
+    return contrastSubImgYUVSSE(sd->curr,field,sd->width,sd->height);
+#else
     return contrastSubImg(sd->curr,field,sd->width,sd->height,1);
+#endif
 }
 
 /**
@@ -347,6 +479,63 @@
             + contrastSubImg(I+2,field,sd->width,sd->height,3))/3;
 }
 
+
+#ifdef USE_SSE2_YUV_CONTRAST
+/**
+   \see contrastSubImg using SSE2 optimization, YUV only
+ */
+double contrastSubImgYUVSSE(unsigned char* const I, const Field* field,
+                            int width, int height)
+{
+    int k, j;
+    unsigned char* p = NULL;
+    int s2 = field->size / 2;
+
+    static unsigned char full[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+
+    p = I + ((field->x - s2) + (field->y - s2)*width);
+
+    __m128i mmin, mmax;
+
+    mmin = _mm_loadu_si128((__m128i*)full);
+    mmax = _mm_setzero_si128();
+
+    for (j = 0; j < field->size; j++){
+        for (k = 0; k < field->size; k += 16) {
+            __m128i xmm0;
+            xmm0 = _mm_loadu_si128((__m128i*)p);
+            mmin = _mm_min_epu8(mmin, xmm0);
+            mmax = _mm_max_epu8(mmax, xmm0);
+            p += 16;
+        }
+        p += (width - field->size);
+    }
+
+    __m128i xmm1;
+    xmm1 = _mm_srli_si128(mmin, 8);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    xmm1 = _mm_srli_si128(mmin, 4);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    xmm1 = _mm_srli_si128(mmin, 2);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    xmm1 = _mm_srli_si128(mmin, 1);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0);
+
+    xmm1 = _mm_srli_si128(mmax, 8);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    xmm1 = _mm_srli_si128(mmax, 4);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    xmm1 = _mm_srli_si128(mmax, 2);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    xmm1 = _mm_srli_si128(mmax, 1);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0);
+
+    return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0
+}
+#endif
+
 /**
    calculates Michelson-contrast in the given small part of the given image
 
@@ -366,7 +555,7 @@
     unsigned char maxi = 0;
 
     p = I + ((field->x - s2) + (field->y - s2)*width)*bytesPerPixel;
-    // TODO: use some mmx or sse stuff here
+
     for (j = 0; j < field->size; j++){
         for (k = 0; k < field->size * bytesPerPixel; k++) {
            mini = (mini < *p) ? mini : *p;
@@ -1000,7 +1189,9 @@
     // shift and size: shakiness 1: height/40; 10: height/4
     sd->maxshift = TC_MAX(4,(TC_MIN(sd->width, sd->height)*sd->shakiness)/40);
     sd->field_size = TC_MAX(4,(TC_MIN(sd->width, sd->height)*sd->shakiness)/40);
-
+#if defined(USE_SSE2_CMP) || defined(USE_SSE2_YUV_CONTRAST)
+    sd->field_size = (sd->field_size / 16 + 1) * 16; //must be multiple of 16 pixels for SSE2
+#endif
     tc_log_info(MOD_NAME, "Fieldsize: %i, Maximal translation: %i pixel",
                 sd->field_size, sd->maxshift);
     if (sd->algo==1) {
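
For reference, the absolute-difference kernel in compareSubImg() works like
this: _mm_subs_epu8() saturates at zero, so subtracting the two blocks in both
directions and adding the results gives |a - b| for every byte; the byte
differences are widened to 16-bit lanes by masking even and odd bytes with a
0x00FF pattern and accumulated with saturating 16-bit adds, which is why the
accumulator has to be flushed every SSE2_CMP_SUM_ROWS blocks before the lanes
can saturate. The following is a minimal standalone sketch of that technique;
the function name sad_sse2, the fixed flush interval of 8 and the assumption
that len is a multiple of 16 are illustrative only, not taken from the patch.

#include <emmintrin.h>

static unsigned long sad_sse2(const unsigned char *a, const unsigned char *b,
                              int len)
{
    const __m128i lowmask = _mm_set1_epi16(0x00FF); /* keeps the even bytes */
    __m128i acc = _mm_setzero_si128();              /* eight 16-bit partial sums */
    unsigned long sum = 0;
    int i, blocks = 0;

    for (i = 0; i < len; i += 16) {
        __m128i x = _mm_loadu_si128((const __m128i *)(a + i));
        __m128i y = _mm_loadu_si128((const __m128i *)(b + i));
        /* |x - y| per byte: one of the two saturating differences is zero */
        __m128i d = _mm_adds_epu8(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));

        /* widen to 16 bit: even bytes, then odd bytes shifted down by one */
        acc = _mm_adds_epu16(acc, _mm_and_si128(d, lowmask));
        acc = _mm_adds_epu16(acc, _mm_and_si128(_mm_srli_si128(d, 1), lowmask));

        /* each block adds at most 2*255 per lane, so flushing every 8 blocks
         * keeps every lane far below the 16-bit saturation limit */
        if (++blocks == 8) {
            unsigned short lanes[8];
            int k;
            _mm_storeu_si128((__m128i *)lanes, acc);
            for (k = 0; k < 8; k++)
                sum += lanes[k];
            acc = _mm_setzero_si128();
            blocks = 0;
        }
    }

    if (blocks) { /* flush whatever is left in the accumulator */
        unsigned short lanes[8];
        int k;
        _mm_storeu_si128((__m128i *)lanes, acc);
        for (k = 0; k < 8; k++)
            sum += lanes[k];
    }
    return sum;
}

SSE2 also provides _mm_sad_epu8() (PSADBW), which sums absolute byte
differences into two 64-bit lanes directly and would make the manual widening
and the flush bookkeeping unnecessary; the patch keeps the explicit 16-bit
accumulation instead.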
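
The contrast kernel is a running min/max scan: _mm_min_epu8() and
_mm_max_epu8() track the per-byte minimum and maximum over the field, and the
16 lanes are then folded into a single byte by repeatedly combining the
register with a copy of itself shifted right by 8, 4, 2 and 1 bytes. A minimal
standalone sketch of the same technique follows; the function name and the
assumption that len is a multiple of 16 are illustrative, not from the patch.

#include <emmintrin.h>

static double michelson_contrast_sse2(const unsigned char *p, int len)
{
    __m128i vmin = _mm_set1_epi8((char)0xFF); /* minimum starts at 255 */
    __m128i vmax = _mm_setzero_si128();       /* maximum starts at 0 */
    int i;

    for (i = 0; i < len; i += 16) {
        __m128i x = _mm_loadu_si128((const __m128i *)(p + i));
        vmin = _mm_min_epu8(vmin, x);
        vmax = _mm_max_epu8(vmax, x);
    }

    /* fold the 16 lanes down to one byte with 8-, 4-, 2- and 1-byte shifts */
    vmin = _mm_min_epu8(vmin, _mm_srli_si128(vmin, 8));
    vmin = _mm_min_epu8(vmin, _mm_srli_si128(vmin, 4));
    vmin = _mm_min_epu8(vmin, _mm_srli_si128(vmin, 2));
    vmin = _mm_min_epu8(vmin, _mm_srli_si128(vmin, 1));
    vmax = _mm_max_epu8(vmax, _mm_srli_si128(vmax, 8));
    vmax = _mm_max_epu8(vmax, _mm_srli_si128(vmax, 4));
    vmax = _mm_max_epu8(vmax, _mm_srli_si128(vmax, 2));
    vmax = _mm_max_epu8(vmax, _mm_srli_si128(vmax, 1));

    {
        unsigned char mini = (unsigned char)_mm_extract_epi16(vmin, 0);
        unsigned char maxi = (unsigned char)_mm_extract_epi16(vmax, 0);
        /* Michelson contrast; the +0.1 avoids division by zero, as in the patch */
        return (maxi - mini) / (maxi + mini + 0.1);
    }
}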
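
Both SSE2 paths read the field in 16-byte steps, which is why init rounds
field_size up when either of them is enabled: (field_size / 16 + 1) * 16 turns
a shakiness-derived size of, say, 30 into 32, and bumps an exact multiple such
as 32 up to 48, so every 16-byte load of a field row stays inside the field.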