[MERGED] osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-05-19 Thread Tom Tsou
Tom Tsou has submitted this change and it was merged.

Change subject: buildenv: Split up SSE3 and SSE4.1 code
..


buildenv: Split up SSE3 and SSE4.1 code

Currently we find SSE3 and SSE4.1 code mixed together along with
generic code in one file. This introduces the risk that the
compiler accidentally mixes SSE4.1 instructions into an SSE3, or
even worse into a generic code path.

This commit splits the SSE3 and SSE4.1 code into separate files
and compiles them with the matching target options.

Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
---
M Transceiver52M/x86/Makefile.am
M Transceiver52M/x86/convert.c
A Transceiver52M/x86/convert_sse_3.c
A Transceiver52M/x86/convert_sse_3.h
A Transceiver52M/x86/convert_sse_4_1.c
A Transceiver52M/x86/convert_sse_4_1.h
M Transceiver52M/x86/convolve.c
A Transceiver52M/x86/convolve_sse_3.c
A Transceiver52M/x86/convolve_sse_3.h
R config/ax_sse.m4
M configure.ac
M utils/convolvetest/Makefile
12 files changed, 893 insertions(+), 661 deletions(-)

Approvals:
  Tom Tsou: Looks good to me, approved
  Neels Hofmeyr: Looks good to me, but someone else must approve
  Harald Welte: Looks good to me, but someone else must approve
  Jenkins Builder: Verified

Objections:
  Vadim Yanitskiy: I would prefer this is not merged as is



diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am
index 7a0b75f..45aa629 100644
--- a/Transceiver52M/x86/Makefile.am
+++ b/Transceiver52M/x86/Makefile.am
@@ -1,7 +1,28 @@
 if !ARCH_ARM
-AM_CFLAGS = -Wall -std=gnu99 $(SIMD_FLAGS) -I${srcdir}/../common
+AM_CFLAGS = -Wall -std=gnu99 -I${srcdir}/../common
 
 noinst_LTLIBRARIES = libarch.la
+noinst_LTLIBRARIES += libarch_sse_3.la
+noinst_LTLIBRARIES += libarch_sse_4_1.la
+
+libarch_la_LIBADD =
+
+# SSE 3 specific code
+if HAVE_SSE3
+libarch_sse_3_la_SOURCES = \
+   convert_sse_3.c \
+   convolve_sse_3.c
+libarch_sse_3_la_CFLAGS = $(AM_CFLAGS) -msse3
+libarch_la_LIBADD += libarch_sse_3.la
+endif
+
+# SSE 4.1 specific code
+if HAVE_SSE4_1
+libarch_sse_4_1_la_SOURCES = \
+   convert_sse_4_1.c
+libarch_sse_4_1_la_CFLAGS = $(AM_CFLAGS) -msse4.1
+libarch_la_LIBADD += libarch_sse_4_1.la
+endif
 
 libarch_la_SOURCES = \
../common/convolve_base.c \
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index 3f76b65..db98050 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -20,6 +20,8 @@
 #include 
 #include 
 #include "convert.h"
+#include "convert_sse_3.h"
+#include "convert_sse_4_1.h"
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -35,140 +37,6 @@
 };
 
 static struct convert_cpu_context c;
-
-#ifdef HAVE_SSE3
-#include 
-#include 
-
-#ifdef HAVE_SSE4_1
-#include 
-
-/* 16*N 16-bit signed integer converted to single precision floats */
-static void _sse_convert_si16_ps_16n(float *restrict out,
-const short *restrict in,
-int len)
-{
-   __m128i m0, m1, m2, m3, m4, m5;
-   __m128 m6, m7, m8, m9;
-
-   for (int i = 0; i < len / 16; i++) {
-   /* Load (unaligned) packed floats */
-   m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]);
-   m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]);
-
-   /* Unpack */
-   m2 = _mm_cvtepi16_epi32(m0);
-   m4 = _mm_cvtepi16_epi32(m1);
-   m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
-   m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
-   m3 = _mm_cvtepi16_epi32(m0);
-   m5 = _mm_cvtepi16_epi32(m1);
-
-   /* Convert */
-   m6 = _mm_cvtepi32_ps(m2);
-   m7 = _mm_cvtepi32_ps(m3);
-   m8 = _mm_cvtepi32_ps(m4);
-   m9 = _mm_cvtepi32_ps(m5);
-
-   /* Store */
-   _mm_storeu_ps(&out[16 * i + 0], m6);
-   _mm_storeu_ps(&out[16 * i + 4], m7);
-   _mm_storeu_ps(&out[16 * i + 8], m8);
-   _mm_storeu_ps(&out[16 * i + 12], m9);
-   }
-}
-
-/* 16*N 16-bit signed integer conversion with remainder */
-static void _sse_convert_si16_ps(float *restrict out,
-const short *restrict in,
-int len)
-{
-   int start = len / 16 * 16;
-
-   _sse_convert_si16_ps_16n(out, in, len);
-
-   for (int i = 0; i < len % 16; i++)
-   out[start + i] = in[start + i];
-}
-#endif /* HAVE_SSE4_1 */
-
-/* 8*N single precision floats scaled and converted to 16-bit signed integer */
-static void _sse_convert_scale_ps_si16_8n(short *restrict out,
- const float *restrict in,
- float scale, int len)
-{
-   __m128 m0, m1, m2;
-   __m128i m4, m5;
-
-   for (int i = 0; i < len / 8; i++) {
-   /* Load (unaligned) packed floats */
-   m0 = 

osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-05-09 Thread Vadim Yanitskiy

Patch Set 4: Code-Review-1

(1 comment)

https://gerrit.osmocom.org/#/c/2134/4/config/ax_sse.m4
File config/ax_sse.m4:

PS4, Line 44:   case $host_cpu in
: i[[3456]]86*|x86_64*|amd64*)
On CPU architectures that don't match this condition (e.g. ARM),
neither HAVE_SSE3 nor HAVE_SSE4_1 will ever be defined.

I made the same mistake in libosmocore, see fix:
https://gerrit.osmocom.org/#/c/2548/1/m4/ax_check_simd.m4


-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 4
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-HasComments: Yes


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-05-02 Thread Tom Tsou

Patch Set 4: Code-Review+2

-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 4
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-Reviewer: Tom Tsou 
Gerrit-HasComments: No


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-04-26 Thread Neels Hofmeyr

Patch Set 4: Code-Review+1

(1 comment)

I possibly need this patch to be able to get a running binary for our
osmo-gsm-tester main unit. Despite building --without-sse, I get an illegal
instruction as soon as osmo-trx starts to access the SDR. Hopefully this fixes
the problem. See https://osmocom.org/issues/1869#note-24 -- Tom, would you mind
reviewing and possibly accepting this?

https://gerrit.osmocom.org/#/c/2134/4//COMMIT_MSG
Commit Message:

Line 11: compiler exidantly mixes SSE4.1 instructions into an SSE3, or
funny typo :) accidentally


-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 4
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-HasComments: Yes


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-04-10 Thread Harald Welte

Patch Set 4: Code-Review+1

-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 4
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-HasComments: No


[PATCH] osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-23 Thread dexter
Hello Max, Neels Hofmeyr, Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2134

to look at the new patch set (#3).

buildenv: Split up SSE3 and SSE4.1 code

Currently we find SSE3 and SSE4.1 code mixed together along with
generic code in one file. This introduces the risk that the
compiler accidentally mixes SSE4.1 instructions into an SSE3, or
even worse into a generic code path.

This commit splits the SSE3 and SSE4.1 code into separate files
and compiles them with the matching target options.

Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
---
M Transceiver52M/x86/Makefile.am
M Transceiver52M/x86/convert.c
A Transceiver52M/x86/convert_sse_3.c
A Transceiver52M/x86/convert_sse_3.h
A Transceiver52M/x86/convert_sse_4_1.c
A Transceiver52M/x86/convert_sse_4_1.h
M Transceiver52M/x86/convolve.c
A Transceiver52M/x86/convolve_sse_3.c
A Transceiver52M/x86/convolve_sse_3.h
R config/ax_sse.m4
M configure.ac
M utils/convolvetest/Makefile
12 files changed, 893 insertions(+), 661 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/osmo-trx refs/changes/34/2134/3

diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am
index 7a0b75f..70b7064 100644
--- a/Transceiver52M/x86/Makefile.am
+++ b/Transceiver52M/x86/Makefile.am
@@ -1,7 +1,28 @@
 if !ARCH_ARM
-AM_CFLAGS = -Wall -std=gnu99 $(SIMD_FLAGS) -I${srcdir}/../common
+AM_CFLAGS = -Wall -std=gnu99 -I${srcdir}/../common
 
 noinst_LTLIBRARIES = libarch.la
+noinst_LTLIBRARIES += libarch_sse_3.la
+noinst_LTLIBRARIES += libarch_sse_4_1.la
+
+libarch_la_LIBADD =
+
+# SSE 3 specific code
+if HAVE_SSE3
+libarch_sse_3_la_SOURCES = \
+   convert_sse_3.c \
+   convolve_sse_3.c
+libarch_sse_3_la_CFLAGS = -msse3
+libarch_la_LIBADD += libarch_sse_3.la
+endif
+
+# SSE 4.1 specific code
+if HAVE_SSE4_1
+libarch_sse_4_1_la_SOURCES = \
+   convert_sse_4_1.c
+libarch_sse_4_1_la_CFLAGS = -msse4.1
+libarch_la_LIBADD += libarch_sse_4_1.la
+endif
 
 libarch_la_SOURCES = \
../common/convolve_base.c \
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index 3f76b65..db98050 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -20,6 +20,8 @@
 #include 
 #include 
 #include "convert.h"
+#include "convert_sse_3.h"
+#include "convert_sse_4_1.h"
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -35,140 +37,6 @@
 };
 
 static struct convert_cpu_context c;
-
-#ifdef HAVE_SSE3
-#include 
-#include 
-
-#ifdef HAVE_SSE4_1
-#include 
-
-/* 16*N 16-bit signed integer converted to single precision floats */
-static void _sse_convert_si16_ps_16n(float *restrict out,
-const short *restrict in,
-int len)
-{
-   __m128i m0, m1, m2, m3, m4, m5;
-   __m128 m6, m7, m8, m9;
-
-   for (int i = 0; i < len / 16; i++) {
-   /* Load (unaligned) packed floats */
-   m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]);
-   m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]);
-
-   /* Unpack */
-   m2 = _mm_cvtepi16_epi32(m0);
-   m4 = _mm_cvtepi16_epi32(m1);
-   m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
-   m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
-   m3 = _mm_cvtepi16_epi32(m0);
-   m5 = _mm_cvtepi16_epi32(m1);
-
-   /* Convert */
-   m6 = _mm_cvtepi32_ps(m2);
-   m7 = _mm_cvtepi32_ps(m3);
-   m8 = _mm_cvtepi32_ps(m4);
-   m9 = _mm_cvtepi32_ps(m5);
-
-   /* Store */
-   _mm_storeu_ps(&out[16 * i + 0], m6);
-   _mm_storeu_ps(&out[16 * i + 4], m7);
-   _mm_storeu_ps(&out[16 * i + 8], m8);
-   _mm_storeu_ps(&out[16 * i + 12], m9);
-   }
-}
-
-/* 16*N 16-bit signed integer conversion with remainder */
-static void _sse_convert_si16_ps(float *restrict out,
-const short *restrict in,
-int len)
-{
-   int start = len / 16 * 16;
-
-   _sse_convert_si16_ps_16n(out, in, len);
-
-   for (int i = 0; i < len % 16; i++)
-   out[start + i] = in[start + i];
-}
-#endif /* HAVE_SSE4_1 */
-
-/* 8*N single precision floats scaled and converted to 16-bit signed integer */
-static void _sse_convert_scale_ps_si16_8n(short *restrict out,
- const float *restrict in,
- float scale, int len)
-{
-   __m128 m0, m1, m2;
-   __m128i m4, m5;
-
-   for (int i = 0; i < len / 8; i++) {
-   /* Load (unaligned) packed floats */
-   m0 = _mm_loadu_ps(&in[8 * i + 0]);
-   m1 = _mm_loadu_ps(&in[8 * i + 4]);
-   m2 = _mm_load1_ps(&scale);
-
-   /* Scale */
-   m0 = _mm_mul_ps(m0, m2);
-   m1 = _mm_mul_ps(m1, m2);
-
-   /* 

osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-23 Thread Harald Welte

Patch Set 1:

> AX_EXT is a macro from the autoconf-archive package - I think it's a very
 > bad idea to supply our own modified copy: this means we'll have to
 > track all upstream fixes manually. I think we should either move
 > this code directly to configure.ac or add our own macro wrapping
 > ax_ext. If neither is possible we should at the very least explicitly
 > mention from which version of autoconf-archive this was
 > copy-pasted.

I guess simply renaming it or merging it into configure.ac is fine.  I don't think we
need or want to track any upstream changes, as what we are doing now is quite
different from upstream.

-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 1
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-HasComments: No


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-23 Thread Harald Welte

Patch Set 1: Code-Review+1

> could you
 > take a brief look and decide which way to go with the 'attribute'
 > style -- are they no longer needed now, or should we rewrite this
 > patch to use those instead?

this is exactly what has happened. The implementation first used those
attributes and now switched to separate files. Guess you have missed the
history here.  Both dexter and I agree that the separate files make us more
comfortable in being certain any version of the compiler will never use the
wrong optimization in the wrong function.

-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 1
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-HasComments: No


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-23 Thread Max

Patch Set 1:

FYI: the latest version of autoconf-archive is 2017.03.21

-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 1
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-HasComments: No


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-20 Thread Neels Hofmeyr

Patch Set 1: Code-Review-1

(3 comments)

in OS#1869 there is mention of the 'attribute' based per-function cpu features,
but I can't see them in this patch. I must apologize because I assumed you were
aware of this way to solve it, since Holger had already posted it a long time ago
somewhere. I should have brought your attention to it more urgently. With those
attributes it may not be necessary to separate the files at all ... could you
take a brief look and decide which way to go with the 'attribute' style -- are
they no longer needed now, or should we rewrite this patch to use those instead?

https://gerrit.osmocom.org/#/c/2134/1/Transceiver52M/x86/Makefile.am
File Transceiver52M/x86/Makefile.am:

Line 33: 
whitespace


https://gerrit.osmocom.org/#/c/2134/1/Transceiver52M/x86/convert.c
File Transceiver52M/x86/convert.c:

Line 36:void (*convert_scale_ps_si16) (short *, const float *, float, 
int);
this doesn't change anything, does it?? drop this chunk.


https://gerrit.osmocom.org/#/c/2134/1/utils/convolvetest/Makefile
File utils/convolvetest/Makefile:

Line 19: 
(we usually avoid empty lines at eof)


-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 1
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Neels Hofmeyr 
Gerrit-HasComments: Yes


osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-20 Thread Max

Patch Set 1: Code-Review-1

AX_EXT is a macro from the autoconf-archive package - I think it's a very bad idea to
supply our own modified copy: this means we'll have to track all upstream fixes
manually. I think we should either move this code directly to configure.ac or
add our own macro wrapping ax_ext. If neither is possible we should at the very
least explicitly mention from which version of autoconf-archive this was
copy-pasted.

-- 
To view, visit https://gerrit.osmocom.org/2134
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Gerrit-PatchSet: 1
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-HasComments: No


[PATCH] osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code

2017-03-20 Thread dexter

Review at  https://gerrit.osmocom.org/2134

buildenv: Split up SSE3 and SSE4.1 code

Currently we find SSE3 and SSE4.1 code mixed together along with
generic code in one file. This introduces the risk that the
compiler accidentally mixes SSE4.1 instructions into an SSE3, or
even worse into a generic code path.

This commit splits the SSE3 and SSE4.1 code into separate files
and compiles them with the matching target options.

Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
---
M Transceiver52M/x86/Makefile.am
M Transceiver52M/x86/convert.c
A Transceiver52M/x86/convert_sse_3.c
A Transceiver52M/x86/convert_sse_3.h
A Transceiver52M/x86/convert_sse_4_1.c
A Transceiver52M/x86/convert_sse_4_1.h
M Transceiver52M/x86/convolve.c
A Transceiver52M/x86/convolve_sse_3.c
A Transceiver52M/x86/convolve_sse_3.h
M config/ax_ext.m4
M configure.ac
M utils/convolvetest/Makefile
12 files changed, 896 insertions(+), 662 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/osmo-trx refs/changes/34/2134/1

diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am
index 7a0b75f..dbf8a9e 100644
--- a/Transceiver52M/x86/Makefile.am
+++ b/Transceiver52M/x86/Makefile.am
@@ -1,7 +1,28 @@
 if !ARCH_ARM
-AM_CFLAGS = -Wall -std=gnu99 $(SIMD_FLAGS) -I${srcdir}/../common
+AM_CFLAGS = -Wall -std=gnu99 -I${srcdir}/../common
 
 noinst_LTLIBRARIES = libarch.la
+noinst_LTLIBRARIES += libarch_sse_3.la
+noinst_LTLIBRARIES += libarch_sse_4_1.la
+
+libarch_la_LIBADD =
+
+# SSE 3 specific code
+if HAVE_SSE3
+libarch_sse_3_la_SOURCES = \
+   convert_sse_3.c \
+   convolve_sse_3.c
+libarch_sse_3_la_CFLAGS = -msse3
+libarch_la_LIBADD += libarch_sse_3.la
+endif
+
+# SSE 4.1 specific code
+if HAVE_SSE4_1
+libarch_sse_4_1_la_SOURCES = \
+   convert_sse_4_1.c
+libarch_sse_4_1_la_CFLAGS = -msse4.1
+libarch_la_LIBADD += libarch_sse_4_1.la
+endif
 
 libarch_la_SOURCES = \
../common/convolve_base.c \
@@ -9,3 +30,4 @@
convert.c \
convolve.c
 endif
+
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index 3f76b65..f3dd125 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -20,6 +20,8 @@
 #include 
 #include 
 #include "convert.h"
+#include "convert_sse_3.h"
+#include "convert_sse_4_1.h"
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -29,146 +31,12 @@
 struct convert_cpu_context {
void (*convert_si16_ps_16n) (float *, const short *, int);
void (*convert_si16_ps) (float *, const short *, int);
-   void (*convert_scale_ps_si16_16n)(short *, const float *, float, int);
-   void (*convert_scale_ps_si16_8n)(short *, const float *, float, int);
-   void (*convert_scale_ps_si16)(short *, const float *, float, int);
+   void (*convert_scale_ps_si16_16n) (short *, const float *, float, int);
+   void (*convert_scale_ps_si16_8n) (short *, const float *, float, int);
+   void (*convert_scale_ps_si16) (short *, const float *, float, int);
 };
 
 static struct convert_cpu_context c;
-
-#ifdef HAVE_SSE3
-#include 
-#include 
-
-#ifdef HAVE_SSE4_1
-#include 
-
-/* 16*N 16-bit signed integer converted to single precision floats */
-static void _sse_convert_si16_ps_16n(float *restrict out,
-const short *restrict in,
-int len)
-{
-   __m128i m0, m1, m2, m3, m4, m5;
-   __m128 m6, m7, m8, m9;
-
-   for (int i = 0; i < len / 16; i++) {
-   /* Load (unaligned) packed floats */
-   m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]);
-   m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]);
-
-   /* Unpack */
-   m2 = _mm_cvtepi16_epi32(m0);
-   m4 = _mm_cvtepi16_epi32(m1);
-   m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
-   m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
-   m3 = _mm_cvtepi16_epi32(m0);
-   m5 = _mm_cvtepi16_epi32(m1);
-
-   /* Convert */
-   m6 = _mm_cvtepi32_ps(m2);
-   m7 = _mm_cvtepi32_ps(m3);
-   m8 = _mm_cvtepi32_ps(m4);
-   m9 = _mm_cvtepi32_ps(m5);
-
-   /* Store */
-   _mm_storeu_ps(&out[16 * i + 0], m6);
-   _mm_storeu_ps(&out[16 * i + 4], m7);
-   _mm_storeu_ps(&out[16 * i + 8], m8);
-   _mm_storeu_ps(&out[16 * i + 12], m9);
-   }
-}
-
-/* 16*N 16-bit signed integer conversion with remainder */
-static void _sse_convert_si16_ps(float *restrict out,
-const short *restrict in,
-int len)
-{
-   int start = len / 16 * 16;
-
-   _sse_convert_si16_ps_16n(out, in, len);
-
-   for (int i = 0; i < len % 16; i++)
-   out[start + i] = in[start + i];
-}
-#endif /* HAVE_SSE4_1 */
-
-/* 8*N single precision floats scaled and converted to 16-bit signed integer */
-static void _sse_convert_scale_ps_si16_8n(short *restrict