Hi all,
I'm trying to get back in the loop with the SSE work.
Here is a first patch using SSE instructions.
I don't remember well (so far) how to format the SSE instructions.


On 17/11/2015 10:37, Nicolas Bertrand wrote:
> From: Nicolas Bertrand <[email protected]>
> 
> Cheers
> Nicolas
> 
> ---
>  libavcodec/jpeg2000dsp.c | 54 
> +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c
> index 6e04c3a..a546b7d 100644
> --- a/libavcodec/jpeg2000dsp.c
> +++ b/libavcodec/jpeg2000dsp.c
> @@ -24,6 +24,10 @@
>  #include "libavutil/attributes.h"
>  #include "jpeg2000dsp.h"
>  
> +#ifdef __SSE__
> +#include <xmmintrin.h>
> +#endif
> +
>  /* Inverse ICT parameters in float and integer.
>   * int value = (float value) * (1<<16) */
>  static const float f_ict_params[4] = {
> @@ -40,9 +44,56 @@ static const int i_ict_params[4] = {
>      116130
>  };
>  
> +static void mct_decode_sse(
> +                float* restrict c0,
> +                float* restrict c1,
> +                float* restrict c2,
> +                int n)
> +{
> +    int i;
> +    __m128 vrv, vgu, vgv, vbu;
> +    vrv = _mm_set1_ps(1.402f);
> +    vgu = _mm_set1_ps(0.34413f);
> +    vgv = _mm_set1_ps(0.71414f);
> +    vbu = _mm_set1_ps(1.772f);
> +    for (i = 0; i < (n >> 3); ++i) {
> +        __m128 vy, vu, vv;
> +        __m128 vr, vg, vb;
> +
> +        vy = _mm_load_ps(c0);
> +        vu = _mm_load_ps(c1);
> +        vv = _mm_load_ps(c2);
> +        vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
> +        vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, 
> vgv));
> +        vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
> +        _mm_store_ps(c0, vr);
> +        _mm_store_ps(c1, vg);
> +        _mm_store_ps(c2, vb);
> +        c0 += 4;
> +        c1 += 4;
> +        c2 += 4;
> +        vy = _mm_load_ps(c0);
> +        vu = _mm_load_ps(c1);
> +        vv = _mm_load_ps(c2);
> +        vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
> +        vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, 
> vgv));
> +        vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
> +        _mm_store_ps(c0, vr);
> +        _mm_store_ps(c1, vg);
> +        _mm_store_ps(c2, vb);
> +        c0 += 4;
> +        c1 += 4;
> +        c2 += 4;
> +    }
> +    n &= 7;
> +}
> +
>  static void ict_float(void *_src0, void *_src1, void *_src2, int csize)
>  {
> -    float *src0 = _src0, *src1 = _src1, *src2 = _src2;
> +   float *src0 = _src0, *src1 = _src1, *src2 = _src2;
> +#ifdef __SSE__
> +    mct_decode_sse(src0, src1, src2, csize);
> +#else
>      float i0f, i1f, i2f;
>      int i;
>  
> @@ -55,6 +106,7 @@ static void ict_float(void *_src0, void *_src1, void 
> *_src2, int csize)
>          *src1++ = i1f;
>          *src2++ = i2f;
>      }
> +#endif
>  }
>  
>  static void ict_int(void *_src0, void *_src1, void *_src2, int csize)
> 

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to