On Sat, Aug 02, 2014 at 11:34:07PM +0200, Clément Bœsch wrote: > --- > configure | 2 + > doc/APIchanges | 3 + > libavutil/Makefile | 3 + > libavutil/pixelutils.c | 142 ++++++++++++++++++++++++++++++++++++ > libavutil/pixelutils.h | 52 ++++++++++++++ > libavutil/version.h | 2 +- > libavutil/x86/Makefile | 4 ++ > libavutil/x86/pixelutils.asm | 155 > ++++++++++++++++++++++++++++++++++++++++ > libavutil/x86/pixelutils.h | 26 +++++++ > libavutil/x86/pixelutils_init.c | 58 +++++++++++++++ > tests/fate/libavutil.mak | 5 ++ > tests/ref/fate/pixelutils | 15 ++++ > 12 files changed, 466 insertions(+), 1 deletion(-) > create mode 100644 libavutil/pixelutils.c > create mode 100644 libavutil/pixelutils.h > create mode 100644 libavutil/x86/pixelutils.asm > create mode 100644 libavutil/x86/pixelutils.h > create mode 100644 libavutil/x86/pixelutils_init.c > create mode 100644 tests/ref/fate/pixelutils > > diff --git a/configure b/configure > index 9c3af50..57edd1d 100755 > --- a/configure > +++ b/configure > @@ -144,6 +144,7 @@ Component options: > --disable-mdct disable MDCT code > --disable-rdft disable RDFT code > --disable-fft disable FFT code > + --disable-pixelutils disable pixel utils in libavutil > > Hardware accelerators: > --disable-dxva2 disable DXVA2 code [autodetect] > @@ -1451,6 +1452,7 @@ SUBSYSTEM_LIST=" > lsp > lzo > mdct > + pixelutils > network > rdft > " > diff --git a/doc/APIchanges b/doc/APIchanges > index abca377..69ca682 100644 > --- a/doc/APIchanges > +++ b/doc/APIchanges > @@ -15,6 +15,9 @@ libavutil: 2012-10-22 > > API changes, most recent first: > > +2014-08-02 - xxxxxxx - lavu 52.95.100 - pixelutils.h > + Add pixelutils API with SAD functions > + > 2014-07-30 - ba3e331 - lavu 52.94.100 - frame.h > Add av_frame_side_data_name() > > diff --git a/libavutil/Makefile b/libavutil/Makefile > index 91751dc..d57a741 100644 > --- a/libavutil/Makefile > +++ b/libavutil/Makefile > @@ -44,6 +44,7 @@ HEADERS = adler32.h > \ > opt.h \ > parseutils.h \ > pixdesc.h \ > 
+ pixelutils.h \ > pixfmt.h \ > random_seed.h \ > replaygain.h \ > @@ -113,6 +114,7 @@ OBJS = adler32.o > \ > opt.o \ > parseutils.o \ > pixdesc.o \ > + pixelutils.o \ > random_seed.o \ > rational.o \ > rc4.o \ > @@ -170,6 +172,7 @@ TESTPROGS = adler32 > \ > pca \ > parseutils \ > pixdesc \ > + pixelutils \ > random_seed \ > rational \ > ripemd \ > diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c > new file mode 100644 > index 0000000..278aa80 > --- /dev/null > +++ b/libavutil/pixelutils.c > @@ -0,0 +1,142 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "config.h" > +#include "common.h" > +#include "pixelutils.h" > + > +#if CONFIG_PIXELUTILS > + > +#include "x86/pixelutils.h" > + > +static av_always_inline int sad_wxh(const uint8_t *src1, ptrdiff_t stride1, > + const uint8_t *src2, ptrdiff_t stride2, > + int w, int h) > +{ > + int x, y, sum = 0; > + > + for (y = 0; y < h; y++) { > + for (x = 0; x < w; x++) > + sum += abs(src1[x] - src2[x]); > + src1 += stride1; > + src2 += stride2; > + } > + return sum; > +} > + > +#define DECLARE_BLOCK_FUNCTIONS(size) > \ > +static int block_sad_##size##x##size##_c(const uint8_t *src1, ptrdiff_t > stride1, \ > + const uint8_t *src2, ptrdiff_t > stride2) \ > +{ > \ > + return sad_wxh(src1, stride1, src2, stride2, size, size); > \ > +} > + > +DECLARE_BLOCK_FUNCTIONS(2) > +DECLARE_BLOCK_FUNCTIONS(4) > +DECLARE_BLOCK_FUNCTIONS(8) > +DECLARE_BLOCK_FUNCTIONS(16) > + > +static const av_pixelutils_sad_fn sad_c[] = { > + block_sad_2x2_c, > + block_sad_4x4_c, > + block_sad_8x8_c, > + block_sad_16x16_c, > +}; > + > +#endif /* CONFIG_PIXELUTILS */ > + > +av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int > aligned, void *log_ctx) > +{ > +#if !CONFIG_PIXELUTILS > + av_log(log_ctx, AV_LOG_ERROR, "pixelutils support is required " > + "but libavutil is not compiled with it\n"); > + return NULL; > +#else > + av_pixelutils_sad_fn sad[FF_ARRAY_ELEMS(sad_c)]; > + > + memcpy(sad, sad_c, sizeof(sad)); > + > + if (w_bits < 1 || w_bits > FF_ARRAY_ELEMS(sad) || > + h_bits < 1 || h_bits > FF_ARRAY_ELEMS(sad)) > + return NULL; > + if (w_bits != h_bits) // only squared sad for now > + return NULL; > + > +#if ARCH_X86 > + ff_pixelutils_sad_init_x86(sad, aligned); > +#endif > + > + return sad[w_bits - 1]; > +#endif > +} > + > +#ifdef TEST 
> +#define W1 320 > +#define H1 240 > +#define W2 640 > +#define H2 480 > +int main(void) > +{ > + int i, a, ret = 0; > + DECLARE_ALIGNED(32, uint32_t, buf1)[W1*H1]; > + DECLARE_ALIGNED(32, uint32_t, buf2)[W2*H2]; > + uint32_t state = 0; > + > + for (i = 0; i < W1*H1; i++) { > + buf1[i] = state; > + state = state * 1664525 + 1013904223; > + } > + > + for (i = 0; i < W2*H2; i++) { > + buf2[i] = state; > + state = state * 1664525 + 1013904223; > + }
the code should in addition be tested with maximal and minimal difference cases [...] > +;------------------------------------------------------------------------------- > +; int ff_pixelutils_sad_[au]_16x16_sse(const uint8_t *src1, ptrdiff_t > stride1, > +; const uint8_t *src2, ptrdiff_t > stride2); > +;------------------------------------------------------------------------------- > +%macro SAD_XMM_16x16 1 > +INIT_XMM sse2 > +cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2 > + pxor m2, m2 > +%rep 8 > + mov%1 m0, [src2q] > + mov%1 m1, [src2q + stride2q] > + psadbw m0, [src1q] > + psadbw m1, [src1q + stride1q] > + paddw m2, m0 > + paddw m2, m1 > + lea src1q, [src1q + 2*stride1q] > + lea src2q, [src2q + 2*stride2q] > +%endrep > + movhlps m0, m2 > + paddw m2, m0 > + movd eax, m2 > + RET > +%endmacro there are various improvements possible, though these should be in a separate patch and not in gcc->yasm, but: the pxor can be avoided by lifting the first iteration out and using m2 as destination; it might be faster to use 2 accumulator registers, as that way both could execute with no dependencies on the other; as you unroll the loop, addressing can be done with fewer instructions. LGTM otherwise [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Democracy is the form of government in which you can choose your dictator
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel