[pulseaudio-discuss] mini svolume vectorization framework

Jason Newton Sun, 03 Apr 2011 04:16:08 -0700

The goal was speed speed speed and DRY as much as possible with a touch of robustness to odd configurations. This code uses intrinsics to do the SIMD stuff. Build time dependency on boost. It should (I hope) be comparable if not faster than the orc stuff. Readability is arguable and I should mention I got the ideas for some of the things I did from Eigen (the template library for linear algebra). Unfortunately, given the need for saturating multiplies, Eigen itself was unsuitable for integral types in the volume code.

Inside is a basic tested version for 16-bit SSE2 svolume mixing; it is only integrated inside the testing routine in svolume_sse.c. Float support was also added but is untested. NEON code was also added but is untested (I don't have an ARM machine to test on). A non-vectorized implementation was also included (yet again untested). So why submit the patch now? To get some feedback from others - i.e. here's what things look like and how they perform; shall we carry forward?

This also led to the discovery of a sort of bug in the reference implementation and others using its same technique:

154: 7fff != 5028 (0012 * 4740b0d)
936: 7fff != 1f2c (0007 * 4740b0d)

This is from the signed short result checking code in said testing routine, where my results differed from the current C reference implementation. The lhs is my result whereas the rhs is from the reference. Clearly the reference implementation is not performing a saturating multiply in all cases, though these are some big volume numbers one probably won't see in practice. Still, it confused me for a while when I first started working on this code, and that big number is a valid volume inside the scope of these functions (int32).


-Jason

>From dbafd38747d0013ffdec1482c86c865f18033924 Mon Sep 17 00:00:00 2001
From: Jason Newton <nev...@gmail.com>
Date: Sat, 2 Apr 2011 03:40:01 -0700
Subject: [PATCH] mini vectorization framework for svolume utilizing C++ and template
 specialization and compiler intrinsics to code once, implement certain
 operations with SIMD and end up with optimum results. Assuming optimum
 results, this is also pretty readable and reusable.

---
 configure.ac                         |    3 +
 src/Makefile.am                      |    2 +-
 src/pulsecore/svolume.txx            |  256 ++++++++++++++++++++++++++++++++++
 src/pulsecore/svolume_neon.txx       |   91 ++++++++++++
 src/pulsecore/svolume_sse.c          |   27 +++-
 src/pulsecore/svolume_sse2.txx       |   90 ++++++++++++
 src/pulsecore/svolume_vectorized.cpp |   27 ++++
 src/pulsecore/svolume_vectorized.h   |    3 +
 8 files changed, 490 insertions(+), 9 deletions(-)
 create mode 100644 src/pulsecore/svolume.txx
 create mode 100644 src/pulsecore/svolume_neon.txx
 create mode 100644 src/pulsecore/svolume_sse2.txx
 create mode 100644 src/pulsecore/svolume_vectorized.cpp
 create mode 100644 src/pulsecore/svolume_vectorized.h

diff --git a/configure.ac b/configure.ac
index 5b41bb4..334fa5c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -78,6 +78,9 @@ AC_PROG_MKDIR_P
 AC_PROG_CC
 AC_PROG_CC_C99
 AM_PROG_CC_C_O
+AC_PROG_CXX
+CXXFLAGS="$CXXFLAGS -std=gnu++0x -Wall -O2 -DNDEBUG"
+
 AC_PROG_GCC_TRADITIONAL
 AC_USE_SYSTEM_EXTENSIONS
 
diff --git a/src/Makefile.am b/src/Makefile.am
index bdedded..751d3e4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -881,7 +881,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/start-child.c pulsecore/start-child.h \
 		pulsecore/thread-mq.c pulsecore/thread-mq.h \
 		pulsecore/time-smoother.c pulsecore/time-smoother.h \
-		pulsecore/database.h
+		pulsecore/database.h pulsecore/svolume_vectorized.cpp
 
 libpulsecore_@PA_MAJORMINOR@_la_CFLAGS = $(AM_CFLAGS) $(LIBSAMPLERATE_CFLAGS) $(LIBSPEEX_CFLAGS) $(LIBSNDFILE_CFLAGS) $(WINSOCK_CFLAGS)
 libpulsecore_@PA_MAJORMINOR@_la_LDFLAGS = $(AM_LDFLAGS) -avoid-version
diff --git a/src/pulsecore/svolume.txx b/src/pulsecore/svolume.txx
new file mode 100644
index 0000000..6444a3c
--- /dev/null
+++ b/src/pulsecore/svolume.txx
@@ -0,0 +1,256 @@
+#include <boost/math/common_factor.hpp>
+#include <limits>
+#include <cstdint>
+#include <cstdlib>
+#include <algorithm>
+#include <cassert>
+
+enum vec_platform_t{
+    NONE,
+    SSE2,
+    AVX,
+    NEON
+};
+
+enum endianness_t{
+    NE,
+    RE
+};
+
+template<typename v_type, typename packet_t>
+struct pload{
+    inline
+    packet_t operator()(v_type *v){
+    }
+};
+
+template<typename sample_t>
+struct pload<sample_t, sample_t>{
+    inline
+    sample_t operator()(sample_t *v){
+        return *v;
+    }
+};
+
+template<typename sample_t, int lcm, int blocksize, vec_platform_t vec_platform, endianness_t endianness, typename packet_t>
+struct BaseMult{
+    sample_t *volume;
+    unsigned n_vol_vecs;
+    packet_t vol0;
+
+    inline BaseMult(sample_t *volume, unsigned n_vol_vecs):volume(volume), n_vol_vecs(n_vol_vecs){
+        if(blocksize == lcm){
+            vol0 = pload<sample_t, packet_t>()(&volume[0]);
+        }
+    }
+    inline
+    packet_t GetVolume(unsigned s_i){//c++ awesomeness decrees you must access this symbol via this->
+        if(blocksize == lcm){
+            return vol0;
+        }else{
+            return pload<sample_t, packet_t>()(&volume[s_i % (blocksize * n_vol_vecs)]);
+        }
+    }
+};
+
+template<typename sample_t, int lcm, int blocksize, vec_platform_t vec_platform, endianness_t endianness>
+struct Mult : BaseMult<sample_t, lcm, blocksize, vec_platform, endianness, sample_t>{
+    //fallback for combinations without a specialization; a struct so the ctor and operator() are public like in the specializations
+    inline Mult(sample_t *volume, unsigned n_vol_vecs): BaseMult<sample_t, lcm, blocksize, vec_platform, endianness, sample_t>(volume, n_vol_vecs){}
+    inline
+    void operator()(unsigned vol_i, sample_t *sample_vec){
+        //default implementation?, need "upper" type map
+    }
+};
+
+template<typename sample_t, int blocksize, vec_platform_t vec_platform, endianness_t endianness, typename packet_t = sample_t *>
+struct EndianSwap{
+    inline
+    void operator()(packet_t packet){
+    }
+};
+
+/* C implementation  operations */
+template<typename v_t>
+struct upper_type{
+    typedef int64_t type;
+};
+template<> struct upper_type<int8_t>{
+    typedef int16_t type;
+};
+template<> struct upper_type<int16_t>{
+    typedef int32_t type;
+};
+template<> struct upper_type<int32_t>{
+    typedef int64_t type;
+};
+
+template<typename sample_t, int lcm, endianness_t endianness>
+struct Mult<sample_t, lcm, 1, NONE, endianness> : BaseMult<sample_t, lcm, 1, NONE, endianness, sample_t>{
+    //scalar (non-SIMD) integer path: scales one sample in place, saturating the product to sample_t's range
+    inline Mult(sample_t *volume, unsigned n_vol_vecs):BaseMult<sample_t, lcm, 1, NONE, endianness, sample_t>(volume, n_vol_vecs){}
+    inline
+    void operator()(unsigned s_i, sample_t *sample_vec){
+        typedef typename upper_type<sample_t>::type upper_t;
+        EndianSwap<sample_t, 1, NONE, endianness, sample_t> bswap;//packet type spelled out so the scalar RE specializations can be selected
+        bswap(*sample_vec);//to native byte order for the arithmetic
+        const upper_t prod = upper_t(*sample_vec) * upper_t(this->GetVolume(s_i));
+        *sample_vec = (sample_t) std::min(std::max(prod, (upper_t) std::numeric_limits<sample_t>::min()), (upper_t) std::numeric_limits<sample_t>::max());
+        bswap(*sample_vec);//back to the stream's byte order
+    }
+};
+
+//scalar (non-SIMD) float path: scales one sample in place by its volume
+template<int lcm, endianness_t endianness>
+struct Mult<float, lcm, 1, NONE, endianness> : BaseMult<float, lcm, 1, NONE, endianness, float>{
+    inline Mult(float *volume, unsigned n_vol_vecs):BaseMult<float, lcm, 1, NONE, endianness, float>(volume, n_vol_vecs){}
+    inline void operator()(unsigned s_i, float *sample_vec){
+        EndianSwap<int32_t, 1, NONE, endianness, int32_t>()(*reinterpret_cast<int32_t *>(sample_vec));//to native byte order
+        *sample_vec *= this->GetVolume(s_i);
+        EndianSwap<int32_t, 1, NONE, endianness, int32_t>()(*reinterpret_cast<int32_t *>(sample_vec));//back to the stream's byte order
+    }
+};
+
+template<>
+struct EndianSwap<int32_t, 1, NONE, RE, int32_t>{
+    //byte-reverses one 32-bit value in place
+    inline EndianSwap(){}
+    inline void operator()(int32_t &v){
+        const uint32_t u = (uint32_t) v;//shift as unsigned: >> on a negative int32_t would drag sign bits into the result
+        v = (int32_t) ((u >> 24) | (u << 24) | ((u >> 8) & 0xff00) | ((u << 8) & 0xff0000));
+    }
+};
+
+template<>
+struct EndianSwap<int16_t, 1, NONE, RE, int16_t>{
+    //byte-reverses one 16-bit value in place
+    inline EndianSwap(){}
+    inline void operator()(int16_t &v){//reference has the sample's own width so an int16_t lvalue binds
+        const uint16_t u = (uint16_t) v;//shift as unsigned so the sign bit is not smeared across the high byte
+        v = (int16_t) (uint16_t) ((u >> 8) | (u << 8));
+    }
+};
+
+//Applies op to samples in place, blocksize elements per call; a short tail is processed through a scratch block.
+template<typename SampOpT, typename sample_t, int blocksize>
+inline void work_loop(SampOpT &op, sample_t *samples, unsigned length){
+    const unsigned rem = length % blocksize;
+    unsigned i = 0;
+    for(; i < (length - rem); i += blocksize){
+        op(i, &samples[i]);
+    }
+    if(rem){
+        sample_t remsamp[blocksize] __attribute__((aligned(16))) = {};//16-byte aligned for aligned SIMD loads; unused lanes zeroed
+        for(unsigned j = 0; j < rem; ++j){
+            remsamp[j] = samples[i + j];
+        }
+        op(i, &remsamp[0]);
+        for(unsigned j = 0; j < rem; ++j){
+            samples[i + j] = remsamp[j];
+        }
+    }
+}
+
+//only handles the case of vol_t >= sample_t, geared to integrals
+template<typename sample_t, typename vol_t, int _lcm, int blocksize, vec_platform_t vec_platform, endianness_t endianness>
+struct MultLoop{
+    inline
+    void operator()(sample_t *samples, vol_t *_volumes, unsigned channels, unsigned length, const unsigned _lcmr = 0){
+        length /= sizeof(sample_t);
+        vol_t min = std::numeric_limits<sample_t>::min();
+        vol_t max = std::numeric_limits<sample_t>::max();
+
+        const unsigned lcm = (_lcm == -1 ? _lcmr : _lcm);
+        const unsigned n_volume_vectors = lcm / blocksize;
+        sample_t volume_vectors[n_volume_vectors * blocksize];//needs alignment ensured
+        for(unsigned i = 0; i < lcm; ++i){
+            volume_vectors[i] = (sample_t) std::max(std::min(_volumes[i % channels], max), min);
+        }
+
+        typedef Mult<sample_t, _lcm, blocksize, vec_platform, endianness> MultOpT;
+        MultOpT multOp(&volume_vectors[0], n_volume_vectors);
+
+        work_loop<MultOpT, sample_t, blocksize>(multOp, samples, length);
+    }
+};
+
+template<typename vol_t, int _lcm, int blocksize, vec_platform_t vec_platform, endianness_t endianness>
+struct MultLoop<float, vol_t, _lcm, blocksize, vec_platform, endianness>{
+    inline
+    void operator()(float *samples, vol_t *_volumes, unsigned channels, unsigned length, const unsigned _lcmr = 0){
+        length /= sizeof(float);
+
+        const unsigned lcm = (_lcm == -1 ? _lcmr : _lcm);
+        const unsigned n_volume_vectors = lcm / blocksize;
+        float volume_vectors[n_volume_vectors * blocksize];//needs alignment ensured
+        for(unsigned i = 0; i < lcm; ++i){
+            volume_vectors[i] = (float) _volumes[i % channels];
+        }
+
+        const unsigned rem = length % blocksize;
+        typedef Mult<float, _lcm, blocksize, vec_platform, endianness> MultOpT;
+        MultOpT multOp(&volume_vectors[0], n_volume_vectors);
+
+        work_loop<MultOpT, float, blocksize>(multOp, samples, length);
+    }
+};
+
+template<typename sample_t, typename vol_t, int channels, int blocksize, vec_platform_t vec_platform, endianness_t endianness>
+inline
+void mult_loop(sample_t *samples, vol_t *_volumes, unsigned length){
+    const unsigned lcm = boost::math::static_lcm<blocksize, channels>::value;
+    MultLoop<sample_t, vol_t, lcm, blocksize, vec_platform, endianness>()(samples, _volumes, channels, length);
+}
+
+template<typename sample_t, typename vol_t, int blocksize, vec_platform_t vec_platform, endianness_t endianness>
+inline
+void mult_loop(sample_t *samples, vol_t *_volumes, unsigned channels, unsigned length){
+    const unsigned lcm = boost::math::lcm((unsigned) blocksize, channels);
+    MultLoop<sample_t, vol_t, -1, blocksize, vec_platform, endianness>()(samples, _volumes, channels, length, lcm);
+}
+
+template<typename sample_t, typename vol_t, int blocksize, vec_platform_t vec_platform, endianness_t endianness>
+inline
+void submain_volume_loop(sample_t *samples, vol_t *volumes, unsigned channels, unsigned length){
+    //list out all the common channels that have an lcm = blocksize
+    switch(channels){
+        case 1:
+            mult_loop<sample_t, vol_t, 1, blocksize, vec_platform, endianness>(samples, volumes, length);
+            break;
+        case 2:
+            mult_loop<sample_t, vol_t, 2, blocksize, vec_platform, endianness>(samples, volumes, length);
+            break;
+        case 4:
+            mult_loop<sample_t, vol_t, 4, blocksize, vec_platform, endianness>(samples, volumes, length);
+            break;
+        case 8:
+            mult_loop<sample_t, vol_t, 8, blocksize, vec_platform, endianness>(samples, volumes, length);
+            break;
+        default:
+            mult_loop<sample_t, vol_t, blocksize, vec_platform, endianness>(samples, volumes, channels, length);
+            break;
+    }
+}
+
+template<typename _value_t, vec_platform_t _vec_platform>
+struct BasicPacketTraits{
+    typedef _value_t value_t;
+    enum{
+        vec_platform = _vec_platform
+    };
+};
+
+template<typename _value_t, vec_platform_t _vec_platform>
+struct PacketTraits : BasicPacketTraits<_value_t, _vec_platform>{
+    enum{
+        blocksize = 1
+    };
+};
+
+#ifdef __SSE2__
+#include "svolume_sse2.txx"
+#endif
+
+#ifdef __ARM_NEON__
+#include "svolume_neon.txx"
+#endif
diff --git a/src/pulsecore/svolume_neon.txx b/src/pulsecore/svolume_neon.txx
new file mode 100644
index 0000000..37c3730
--- /dev/null
+++ b/src/pulsecore/svolume_neon.txx
@@ -0,0 +1,91 @@
+#include <arm_neon.h>
+
+template<>
+struct PacketTraits<int16_t, NEON> : BasicPacketTraits<int16_t, NEON>{
+    enum{
+        blocksize = 8//lanes in an int16x8_t; matches the blocksize of the NEON int16 Mult below
+    };
+};
+
+//Loads eight int16 values into a NEON register.
+template<>
+struct pload<int16_t, int16x8_t>{
+    inline int16x8_t operator()(int16_t *v){
+        return vld1q_s16(v);
+    }
+};
+
+template<>
+struct EndianSwap<int16_t, 8, NEON, RE, int16x8_t>{
+    //byte-reverses each of the eight 16-bit lanes in place
+    inline
+    void operator()(int16x8_t &p){
+        //vrev16q_u8 reverses the bytes inside every 16-bit element; saturating
+        //shifts would clamp bits instead of moving them
+        p = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(p)));
+    }
+};
+
+//NEON int16 path: scales eight samples in place, saturating each product to int16.
+template<int lcm, endianness_t endianness>
+struct Mult<int16_t, lcm, 8, NEON, endianness> : BaseMult<int16_t, lcm, 8, NEON, endianness, int16x8_t>{
+    inline Mult(int16_t *volume, unsigned n_vol_vecs):BaseMult<int16_t, lcm, 8, NEON, endianness, int16x8_t>(volume, n_vol_vecs){}
+    inline
+    void operator()(unsigned s_i, int16_t *sample_vec){
+        int16x8_t vol = this->GetVolume(s_i);
+        int16x8_t samp = pload<int16_t, int16x8_t>()(sample_vec);
+        EndianSwap<int16_t, 8, NEON, endianness, int16x8_t>()(samp);//to native byte order
+
+        //vqdmull_s16 yields the doubled 32-bit product, so the saturating narrowing
+        //shift right by 1 leaves the plain product clamped to int16
+        int32x4_t samplo = vqdmull_s16(vget_low_s16(vol), vget_low_s16(samp));
+        int32x4_t samphi = vqdmull_s16(vget_high_s16(vol), vget_high_s16(samp));
+        samp = vcombine_s16(vqshrn_n_s32(samplo, 1), vqshrn_n_s32(samphi, 1));
+        EndianSwap<int16_t, 8, NEON, endianness, int16x8_t>()(samp);//back to the stream's byte order
+        vst1q_s16(sample_vec, samp);
+    }
+};
+
+//Loads four floats into a NEON register.
+template<>
+struct pload<float, float32x4_t>{
+    inline float32x4_t operator()(float *v){
+        return vld1q_f32(v);
+    }
+};
+
+//Byte-reverses each of the four 32-bit lanes of a NEON register in place.
+template<>
+struct EndianSwap<int32_t, 4, NEON, RE, int32x4_t>{
+    uint32x4_t vmask_2;
+    uint32x4_t vmask_3;
+
+    inline EndianSwap(){
+        vmask_2 = vdupq_n_u32(0x00ff0000);
+        vmask_3 = vdupq_n_u32(0x0000ff00);
+    }
+    inline void operator()(int32x4_t &v){
+        //plain logical shifts on the unsigned view; saturating shifts would clamp bits instead of moving them
+        const uint32x4_t u = vreinterpretq_u32_s32(v);
+        uint32x4_t hi = vshlq_n_u32(u, 24);//byte 0 -> byte 3
+        uint32x4_t lo = vshrq_n_u32(u, 24);//byte 3 -> byte 0
+
+        uint32x4_t midhi = vandq_u32(vshlq_n_u32(u, 8), vmask_2);//byte 1 -> byte 2
+        uint32x4_t midlo = vandq_u32(vshrq_n_u32(u, 8), vmask_3);//byte 2 -> byte 1
+        v = vreinterpretq_s32_u32(vorrq_u32(vorrq_u32(hi, lo), vorrq_u32(midhi, midlo)));
+    }
+};
+
+//NEON float path: scales four packed floats in place by their volumes.
+template<int lcm, endianness_t endianness>
+struct Mult<float, lcm, 4, NEON, endianness> : BaseMult<float, lcm, 4, NEON, endianness, float32x4_t>{
+    EndianSwap<int32_t, 4, NEON, endianness, int32x4_t> swapper;
+    inline Mult(float *volume, unsigned n_vol_vecs):BaseMult<float, lcm, 4, NEON, endianness, float32x4_t>(volume, n_vol_vecs){}
+    inline void operator()(unsigned s_i, float *sample_vec){
+        int32x4_t bits = vreinterpretq_s32_f32(pload<float, float32x4_t>()(sample_vec));
+        swapper(bits);//to native byte order
+        bits = vreinterpretq_s32_f32(vmulq_f32(this->GetVolume(s_i), vreinterpretq_f32_s32(bits)));
+        swapper(bits);//back to the stream's byte order
+        vst1q_f32(sample_vec, vreinterpretq_f32_s32(bits));
+    }
+};
diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c
index ef07a24..8035413 100644
--- a/src/pulsecore/svolume_sse.c
+++ b/src/pulsecore/svolume_sse.c
@@ -247,15 +247,17 @@ static void pa_volume_s16re_sse2(int16_t *samples, int32_t *volumes, unsigned ch
     );
 }
 
-#undef RUN_TEST
+#define RUN_TEST
 
 #ifdef RUN_TEST
 #define CHANNELS 2
-#define SAMPLES 1022
-#define TIMES 1000
+#define SAMPLES 1022*4
+#define TIMES 100000
 #define PADDING 16
 
-static void run_test(void) {
+#include "svolume_vectorized.h"
+
+static void run_test (void) {
     int16_t samples[SAMPLES];
     int16_t samples_ref[SAMPLES];
     int16_t samples_orig[SAMPLES];
@@ -277,8 +279,9 @@ static void run_test(void) {
     for (padding = 0; padding < PADDING; padding++, i++)
         volumes[i] = volumes[padding];
 
-    func(samples_ref, volumes, CHANNELS, sizeof(samples));
-    pa_volume_s16ne_sse2(samples, volumes, CHANNELS, sizeof(samples));
+    func (samples_ref, volumes, CHANNELS, sizeof (samples));
+    //pa_volume_s16ne_sse2 (samples, volumes, CHANNELS, sizeof (samples));
+    pa_volume_s16ne_vectorized(samples, volumes, CHANNELS, sizeof (samples));
     for (i = 0; i < SAMPLES; i++) {
         if (samples[i] != samples_ref[i]) {
             printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
@@ -296,8 +299,16 @@ static void run_test(void) {
 
     start = pa_rtclock_now();
     for (j = 0; j < TIMES; j++) {
-        memcpy(samples_ref, samples_orig, sizeof(samples));
-        func(samples_ref, volumes, CHANNELS, sizeof (samples));
+        memcpy (samples, samples_orig, sizeof (samples));
+        pa_volume_s16ne_vectorized(samples, volumes, CHANNELS, sizeof (samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("vectorized: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (j = 0; j < TIMES; j++) {
+        memcpy (samples_ref, samples_orig, sizeof (samples));
+        func (samples_ref, volumes, CHANNELS, sizeof (samples));
     }
     stop = pa_rtclock_now();
     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
diff --git a/src/pulsecore/svolume_sse2.txx b/src/pulsecore/svolume_sse2.txx
new file mode 100644
index 0000000..5195152
--- /dev/null
+++ b/src/pulsecore/svolume_sse2.txx
@@ -0,0 +1,90 @@
+#include <emmintrin.h>
+
+template<>
+struct PacketTraits<int16_t, SSE2> : BasicPacketTraits<int16_t, SSE2>{
+    enum{
+        blocksize = 8
+    };
+};
+
+//Byte-reverses each of the four 32-bit lanes of an SSE2 register in place.
+template<>
+struct EndianSwap<int32_t, 4, SSE2, RE, __m128i>{
+    __m128i vmask_2;
+    __m128i vmask_3;
+
+    inline EndianSwap(){
+        vmask_2 = _mm_set1_epi32(0x00ff0000);
+        vmask_3 = _mm_set1_epi32(0x0000ff00);
+    }
+    inline void operator()(__m128i &v){
+        __m128i hi = _mm_slli_epi32(v, 24);//byte 0 -> byte 3
+        __m128i lo = _mm_srli_epi32(v, 24);//byte 3 -> byte 0
+        __m128i midhi = _mm_slli_epi32(v, 8);
+        midhi = _mm_and_si128(midhi, vmask_2);//keep byte 1 -> byte 2
+
+        __m128i midlo = _mm_srli_epi32(v, 8);
+        midlo = _mm_and_si128(midlo, vmask_3);//keep byte 2 -> byte 1
+        v = _mm_or_si128(_mm_or_si128(hi, lo), _mm_or_si128(midhi, midlo));
+    }
+};
+
+template<typename sample_t>
+struct pload<sample_t, __m128i>{
+    inline
+    __m128i operator()(sample_t *v){
+        return _mm_load_si128(reinterpret_cast<__m128i *>(v));
+    }
+};
+
+template<>
+struct pload<float, __m128>{
+    inline
+    __m128 operator()(float *v){
+        return _mm_load_ps(v);
+    }
+};
+
+template<>
+struct EndianSwap<int16_t, 8, SSE2, RE, __m128i>{
+    inline
+    void operator()(__m128i &v){
+        __m128i hi = _mm_slli_epi16(v, 8);
+        __m128i lo = _mm_srli_epi16(v, 8);//non arithmetic
+        v = _mm_or_si128(hi, lo);
+    }
+};
+
+//SSE2 int16 path: scales eight samples in place, saturating each 32-bit product to int16.
+template<int lcm, endianness_t endianness>
+struct Mult<int16_t, lcm, 8, SSE2, endianness> : BaseMult<int16_t, lcm, 8, SSE2, endianness, __m128i>{
+    inline Mult(int16_t *volume, unsigned n_vol_vecs):BaseMult<int16_t, lcm, 8, SSE2, endianness, __m128i>(volume, n_vol_vecs){}
+    inline void operator()(unsigned s_i, int16_t *sample_vec){
+        __m128i s = pload<int16_t, __m128i>()(sample_vec);
+        __m128i v = this->GetVolume(s_i);
+        EndianSwap<int16_t, 8, SSE2, endianness, __m128i>()(s);//to native byte order
+        __m128i hiterms = _mm_mulhi_epi16(v, s);
+        __m128i loterms = _mm_mullo_epi16(v, s);
+        __m128i tophalf = _mm_unpackhi_epi16(loterms, hiterms);//lanes 4..7 as 32-bit products
+        __m128i bothalf = _mm_unpacklo_epi16(loterms, hiterms);//lanes 0..3 as 32-bit products
+        __m128i result = _mm_packs_epi32(bothalf, tophalf);//signed-saturating narrow back to 16 bit
+        EndianSwap<int16_t, 8, SSE2, endianness, __m128i>()(result);//back to the stream's byte order
+        _mm_store_si128((__m128i *) sample_vec, result);//store the results
+    }
+};
+
+//SSE2 float path: scales four packed floats in place by their volumes.
+template<int lcm, endianness_t endianness>
+struct Mult<float, lcm, 4, SSE2, endianness> : BaseMult<float, lcm, 4, SSE2, endianness, __m128>{
+    EndianSwap<int32_t, 4, SSE2, endianness, __m128i> swapper;
+    inline Mult(float *volume, unsigned n_vol_vecs):BaseMult<float, lcm, 4, SSE2, endianness, __m128>(volume, n_vol_vecs){}
+    inline
+    void operator()(unsigned s_i, float *sample_vec){
+        __m128 v = this->GetVolume(s_i);
+        __m128i bits = _mm_castps_si128(pload<float, __m128>()(sample_vec));//integer view for the byte swap
+        swapper(bits);//to native byte order
+        bits = _mm_castps_si128(_mm_mul_ps(v, _mm_castsi128_ps(bits)));
+        swapper(bits);//back to the stream's byte order
+        _mm_store_ps(sample_vec, _mm_castsi128_ps(bits));
+    }
+};
diff --git a/src/pulsecore/svolume_vectorized.cpp b/src/pulsecore/svolume_vectorized.cpp
new file mode 100644
index 0000000..8ce1b3d
--- /dev/null
+++ b/src/pulsecore/svolume_vectorized.cpp
@@ -0,0 +1,27 @@
+//#include <Eigen/Core>
+//#include <Eigen/Dense>
+
+#include "svolume.txx"
+
+extern "C"{
+void pa_volume_s16ne_vectorized(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length);
+void pa_volume_s16re_vectorized(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length);
+}
+
+#if defined(__SSE2__)
+const vec_platform_t vec_platform = SSE2;
+#elif defined(__ARM_NEON__)
+const vec_platform_t vec_platform = NEON;//the enumerator declared in vec_platform_t
+#endif
+
+void pa_volume_s16ne_vectorized(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length){
+    typedef int16_t sample_t;
+    const unsigned blocksize = PacketTraits<sample_t, vec_platform>::blocksize;
+    submain_volume_loop<sample_t, int32_t, blocksize, vec_platform, NE>(samples, volumes, channels, length);
+}
+
+void pa_volume_s16re_vectorized(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length){
+    typedef int16_t sample_t;
+    const unsigned blocksize = PacketTraits<sample_t, vec_platform>::blocksize;
+    submain_volume_loop<sample_t, int32_t, blocksize, vec_platform, RE>(samples, volumes, channels, length);
+}
diff --git a/src/pulsecore/svolume_vectorized.h b/src/pulsecore/svolume_vectorized.h
new file mode 100644
index 0000000..eae6b1e
--- /dev/null
+++ b/src/pulsecore/svolume_vectorized.h
@@ -0,0 +1,3 @@
+
+void pa_volume_s16ne_vectorized(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length);
+void pa_volume_s16re_vectorized(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length);
-- 
1.7.3.4

_______________________________________________
pulseaudio-discuss mailing list
pulseaudio-discuss@mail.0pointer.de
https://tango.0pointer.de/mailman/listinfo/pulseaudio-discuss

[pulseaudio-discuss] mini svolume vectorization framework

Reply via email to