[MERGED] osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Tom Tsou has submitted this change and it was merged. Change subject: buildenv: Split up SSE3 and SSE4.1 code .. buildenv: Split up SSE3 and SSE4.1 code Currently we find SSE3 and SSE4.1 code mixed togehter along with generic code in one file. This introduces the risk that the compiler exidantly mixes SSE4.1 instructions into an SSE3, or even worse into a generic code path. This commit splits the SSE3 and SSE4.1 code into separate files and compiles them with the matching target options. Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 --- M Transceiver52M/x86/Makefile.am M Transceiver52M/x86/convert.c A Transceiver52M/x86/convert_sse_3.c A Transceiver52M/x86/convert_sse_3.h A Transceiver52M/x86/convert_sse_4_1.c A Transceiver52M/x86/convert_sse_4_1.h M Transceiver52M/x86/convolve.c A Transceiver52M/x86/convolve_sse_3.c A Transceiver52M/x86/convolve_sse_3.h R config/ax_sse.m4 M configure.ac M utils/convolvetest/Makefile 12 files changed, 893 insertions(+), 661 deletions(-) Approvals: Tom Tsou: Looks good to me, approved Neels Hofmeyr: Looks good to me, but someone else must approve Harald Welte: Looks good to me, but someone else must approve Jenkins Builder: Verified Objections: Vadim Yanitskiy: I would prefer this is not merged as is diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am index 7a0b75f..45aa629 100644 --- a/Transceiver52M/x86/Makefile.am +++ b/Transceiver52M/x86/Makefile.am @@ -1,7 +1,28 @@ if !ARCH_ARM -AM_CFLAGS = -Wall -std=gnu99 $(SIMD_FLAGS) -I${srcdir}/../common +AM_CFLAGS = -Wall -std=gnu99 -I${srcdir}/../common noinst_LTLIBRARIES = libarch.la +noinst_LTLIBRARIES += libarch_sse_3.la +noinst_LTLIBRARIES += libarch_sse_4_1.la + +libarch_la_LIBADD = + +# SSE 3 specific code +if HAVE_SSE3 +libarch_sse_3_la_SOURCES = \ + convert_sse_3.c \ + convolve_sse_3.c +libarch_sse_3_la_CFLAGS = $(AM_CFLAGS) -msse3 +libarch_la_LIBADD += libarch_sse_3.la +endif + +# SSE 4.1 specific code +if HAVE_SSE4_1 +libarch_sse_4_1_la_SOURCES = 
\ + convert_sse_4_1.c +libarch_sse_4_1_la_CFLAGS = $(AM_CFLAGS) -msse4.1 +libarch_la_LIBADD += libarch_sse_4_1.la +endif libarch_la_SOURCES = \ ../common/convolve_base.c \ diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c index 3f76b65..db98050 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -20,6 +20,8 @@ #include #include #include "convert.h" +#include "convert_sse_3.h" +#include "convert_sse_4_1.h" #ifdef HAVE_CONFIG_H #include "config.h" @@ -35,140 +37,6 @@ }; static struct convert_cpu_context c; - -#ifdef HAVE_SSE3 -#include -#include - -#ifdef HAVE_SSE4_1 -#include - -/* 16*N 16-bit signed integer converted to single precision floats */ -static void _sse_convert_si16_ps_16n(float *restrict out, -const short *restrict in, -int len) -{ - __m128i m0, m1, m2, m3, m4, m5; - __m128 m6, m7, m8, m9; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_si128((__m128i *) [16 * i + 0]); - m1 = _mm_loadu_si128((__m128i *) [16 * i + 8]); - - /* Unpack */ - m2 = _mm_cvtepi16_epi32(m0); - m4 = _mm_cvtepi16_epi32(m1); - m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); - m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2)); - m3 = _mm_cvtepi16_epi32(m0); - m5 = _mm_cvtepi16_epi32(m1); - - /* Convert */ - m6 = _mm_cvtepi32_ps(m2); - m7 = _mm_cvtepi32_ps(m3); - m8 = _mm_cvtepi32_ps(m4); - m9 = _mm_cvtepi32_ps(m5); - - /* Store */ - _mm_storeu_ps([16 * i + 0], m6); - _mm_storeu_ps([16 * i + 4], m7); - _mm_storeu_ps([16 * i + 8], m8); - _mm_storeu_ps([16 * i + 12], m9); - } -} - -/* 16*N 16-bit signed integer conversion with remainder */ -static void _sse_convert_si16_ps(float *restrict out, -const short *restrict in, -int len) -{ - int start = len / 16 * 16; - - _sse_convert_si16_ps_16n(out, in, len); - - for (int i = 0; i < len % 16; i++) - out[start + i] = in[start + i]; -} -#endif /* HAVE_SSE4_1 */ - -/* 8*N single precision floats scaled and converted to 16-bit signed 
integer */ -static void _sse_convert_scale_ps_si16_8n(short *restrict out, - const float *restrict in, - float scale, int len) -{ - __m128 m0, m1, m2; - __m128i m4, m5; - - for (int i = 0; i < len / 8; i++) { - /* Load (unaligned) packed floats */ - m0 =
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 4: Code-Review-1 (1 comment) https://gerrit.osmocom.org/#/c/2134/4/config/ax_sse.m4 File config/ax_sse.m4: PS4, Line 44: case $host_cpu in : i[[3456]]86*|x86_64*|amd64*) On some CPU architectures (e.g. ARM), which don't match this condition, both HAVE_SSE3 and HAVE_SSE4_1 will never be defined. I made the same mistake in libosmocore, see fix: https://gerrit.osmocom.org/#/c/2548/1/m4/ax_check_simd.m4 -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 4 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexterGerrit-Reviewer: Harald Welte Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-Reviewer: Tom Tsou Gerrit-Reviewer: Vadim Yanitskiy Gerrit-HasComments: Yes
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 4: Code-Review+2 -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 4 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexterGerrit-Reviewer: Harald Welte Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-Reviewer: Tom Tsou Gerrit-HasComments: No
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 4: Code-Review+1 (1 comment) I possibly need this patch to be able to get a running binary for our osmo-gsm-tester main unit. Despite building --without-sse, I get an illegal instruction as soon as osmo-trx starts to access the SDR. Hopefully this fixes the problem. See https://osmocom.org/issues/1869#note-24 -- Tom, would you mind reviewing and possibly accepting this? https://gerrit.osmocom.org/#/c/2134/4//COMMIT_MSG Commit Message: Line 11: compiler exidantly mixes SSE4.1 instructions into an SSE3, or funny typo :) accidentally -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 4 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexter Gerrit-Reviewer: Harald Welte Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-HasComments: Yes
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 4: Code-Review+1 -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 4 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexterGerrit-Reviewer: Harald Welte Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-HasComments: No
[PATCH] osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Hello Max, Neels Hofmeyr, Harald Welte, Jenkins Builder, I'd like you to reexamine a change. Please visit https://gerrit.osmocom.org/2134 to look at the new patch set (#3). buildenv: Split up SSE3 and SSE4.1 code Currently we find SSE3 and SSE4.1 code mixed togehter along with generic code in one file. This introduces the risk that the compiler exidantly mixes SSE4.1 instructions into an SSE3, or even worse into a generic code path. This commit splits the SSE3 and SSE4.1 code into separate files and compiles them with the matching target options. Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 --- M Transceiver52M/x86/Makefile.am M Transceiver52M/x86/convert.c A Transceiver52M/x86/convert_sse_3.c A Transceiver52M/x86/convert_sse_3.h A Transceiver52M/x86/convert_sse_4_1.c A Transceiver52M/x86/convert_sse_4_1.h M Transceiver52M/x86/convolve.c A Transceiver52M/x86/convolve_sse_3.c A Transceiver52M/x86/convolve_sse_3.h R config/ax_sse.m4 M configure.ac M utils/convolvetest/Makefile 12 files changed, 893 insertions(+), 661 deletions(-) git pull ssh://gerrit.osmocom.org:29418/osmo-trx refs/changes/34/2134/3 diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am index 7a0b75f..70b7064 100644 --- a/Transceiver52M/x86/Makefile.am +++ b/Transceiver52M/x86/Makefile.am @@ -1,7 +1,28 @@ if !ARCH_ARM -AM_CFLAGS = -Wall -std=gnu99 $(SIMD_FLAGS) -I${srcdir}/../common +AM_CFLAGS = -Wall -std=gnu99 -I${srcdir}/../common noinst_LTLIBRARIES = libarch.la +noinst_LTLIBRARIES += libarch_sse_3.la +noinst_LTLIBRARIES += libarch_sse_4_1.la + +libarch_la_LIBADD = + +# SSE 3 specific code +if HAVE_SSE3 +libarch_sse_3_la_SOURCES = \ + convert_sse_3.c \ + convolve_sse_3.c +libarch_sse_3_la_CFLAGS = -msse3 +libarch_la_LIBADD += libarch_sse_3.la +endif + +# SSE 4.1 specific code +if HAVE_SSE4_1 +libarch_sse_4_1_la_SOURCES = \ + convert_sse_4_1.c +libarch_sse_4_1_la_CFLAGS = -msse4.1 +libarch_la_LIBADD += libarch_sse_4_1.la +endif libarch_la_SOURCES = \ 
../common/convolve_base.c \ diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c index 3f76b65..db98050 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -20,6 +20,8 @@ #include #include #include "convert.h" +#include "convert_sse_3.h" +#include "convert_sse_4_1.h" #ifdef HAVE_CONFIG_H #include "config.h" @@ -35,140 +37,6 @@ }; static struct convert_cpu_context c; - -#ifdef HAVE_SSE3 -#include -#include - -#ifdef HAVE_SSE4_1 -#include - -/* 16*N 16-bit signed integer converted to single precision floats */ -static void _sse_convert_si16_ps_16n(float *restrict out, -const short *restrict in, -int len) -{ - __m128i m0, m1, m2, m3, m4, m5; - __m128 m6, m7, m8, m9; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_si128((__m128i *) [16 * i + 0]); - m1 = _mm_loadu_si128((__m128i *) [16 * i + 8]); - - /* Unpack */ - m2 = _mm_cvtepi16_epi32(m0); - m4 = _mm_cvtepi16_epi32(m1); - m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); - m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2)); - m3 = _mm_cvtepi16_epi32(m0); - m5 = _mm_cvtepi16_epi32(m1); - - /* Convert */ - m6 = _mm_cvtepi32_ps(m2); - m7 = _mm_cvtepi32_ps(m3); - m8 = _mm_cvtepi32_ps(m4); - m9 = _mm_cvtepi32_ps(m5); - - /* Store */ - _mm_storeu_ps([16 * i + 0], m6); - _mm_storeu_ps([16 * i + 4], m7); - _mm_storeu_ps([16 * i + 8], m8); - _mm_storeu_ps([16 * i + 12], m9); - } -} - -/* 16*N 16-bit signed integer conversion with remainder */ -static void _sse_convert_si16_ps(float *restrict out, -const short *restrict in, -int len) -{ - int start = len / 16 * 16; - - _sse_convert_si16_ps_16n(out, in, len); - - for (int i = 0; i < len % 16; i++) - out[start + i] = in[start + i]; -} -#endif /* HAVE_SSE4_1 */ - -/* 8*N single precision floats scaled and converted to 16-bit signed integer */ -static void _sse_convert_scale_ps_si16_8n(short *restrict out, - const float *restrict in, - float scale, int len) -{ - __m128 m0, m1, m2; 
- __m128i m4, m5; - - for (int i = 0; i < len / 8; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_ps([8 * i + 0]); - m1 = _mm_loadu_ps([8 * i + 4]); - m2 = _mm_load1_ps(); - - /* Scale */ - m0 = _mm_mul_ps(m0, m2); - m1 = _mm_mul_ps(m1, m2); - - /*
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 1: > AX_EXT is macro from autoconf-archive package - I think it's a very > bad idea to supply our own modified copy: this means we'll have to > track all upstream fixes manually. I think we should either move > this code directly to configure.ac or add our own macro wrapping > ax_ext. If neither possible we should at the very least explicitly > mention from which version of autoconf-archive this was > copy-pasted. I guess simply renaming it or merging it into configure.ac is fine. I don't think we need or want to track any upstream changes, as what we are doing now is quite different from upstream. -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 1 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexter Gerrit-Reviewer: Harald Welte Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-HasComments: No
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 1: Code-Review+1 > could you > take a brief look and decide which way to go with the 'attribute' > style -- are they no longer needed now, or should we rewrite this > patch to use those instead? This is exactly what has happened. The implementation first used those attributes and now switched to separate files. Guess you have missed the history here. Both dexter and I agree that the separate files make us more comfortable in being certain any version of the compiler will never use the wrong optimization in the wrong function. -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 1 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexter Gerrit-Reviewer: Harald Welte Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-HasComments: No
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 1: FYI: the latest version of autoconf-archive is 2017.03.21 -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 1 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexterGerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-HasComments: No
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 1: Code-Review-1 (3 comments) in OS#1869 there is mention of the 'attribute' based per-function cpu features, but I can't see them in this patch. I must apologize because I assumed you were aware of this way to solve, since Holger had already posted it a long time ago somewhere. I should have brought your attention to it more urgently. With those attributes it may not be necessary to separate the files at all ... could you take a brief look and decide which way to go with the 'attribute' style -- are they no longer needed now, or should we rewrite this patch to use those instead? https://gerrit.osmocom.org/#/c/2134/1/Transceiver52M/x86/Makefile.am File Transceiver52M/x86/Makefile.am: Line 33: whitespace https://gerrit.osmocom.org/#/c/2134/1/Transceiver52M/x86/convert.c File Transceiver52M/x86/convert.c: Line 36:void (*convert_scale_ps_si16) (short *, const float *, float, int); this doesn't change anything, does it?? drop this chunk. https://gerrit.osmocom.org/#/c/2134/1/utils/convolvetest/Makefile File utils/convolvetest/Makefile: Line 19: (we usually avoid empty lines at eof) -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 1 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexterGerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-Reviewer: Neels Hofmeyr Gerrit-HasComments: Yes
osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Patch Set 1: Code-Review-1 AX_EXT is a macro from the autoconf-archive package - I think it's a very bad idea to supply our own modified copy: this means we'll have to track all upstream fixes manually. I think we should either move this code directly to configure.ac or add our own macro wrapping ax_ext. If neither is possible we should at the very least explicitly mention from which version of autoconf-archive this was copy-pasted. -- To view, visit https://gerrit.osmocom.org/2134 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: comment Gerrit-Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 Gerrit-PatchSet: 1 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexter Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Max Gerrit-HasComments: No
[PATCH] osmo-trx[master]: buildenv: Split up SSE3 and SSE4.1 code
Review at https://gerrit.osmocom.org/2134 buildenv: Split up SSE3 and SSE4.1 code Currently we find SSE3 and SSE4.1 code mixed togehter along with generic code in one file. This introduces the risk that the compiler exidantly mixes SSE4.1 instructions into an SSE3, or even worse into a generic code path. This commit splits the SSE3 and SSE4.1 code into separate files and compiles them with the matching target options. Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3 --- M Transceiver52M/x86/Makefile.am M Transceiver52M/x86/convert.c A Transceiver52M/x86/convert_sse_3.c A Transceiver52M/x86/convert_sse_3.h A Transceiver52M/x86/convert_sse_4_1.c A Transceiver52M/x86/convert_sse_4_1.h M Transceiver52M/x86/convolve.c A Transceiver52M/x86/convolve_sse_3.c A Transceiver52M/x86/convolve_sse_3.h M config/ax_ext.m4 M configure.ac M utils/convolvetest/Makefile 12 files changed, 896 insertions(+), 662 deletions(-) git pull ssh://gerrit.osmocom.org:29418/osmo-trx refs/changes/34/2134/1 diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am index 7a0b75f..dbf8a9e 100644 --- a/Transceiver52M/x86/Makefile.am +++ b/Transceiver52M/x86/Makefile.am @@ -1,7 +1,28 @@ if !ARCH_ARM -AM_CFLAGS = -Wall -std=gnu99 $(SIMD_FLAGS) -I${srcdir}/../common +AM_CFLAGS = -Wall -std=gnu99 -I${srcdir}/../common noinst_LTLIBRARIES = libarch.la +noinst_LTLIBRARIES += libarch_sse_3.la +noinst_LTLIBRARIES += libarch_sse_4_1.la + +libarch_la_LIBADD = + +# SSE 3 specific code +if HAVE_SSE3 +libarch_sse_3_la_SOURCES = \ + convert_sse_3.c \ + convolve_sse_3.c +libarch_sse_3_la_CFLAGS = -msse3 +libarch_la_LIBADD += libarch_sse_3.la +endif + +# SSE 4.1 specific code +if HAVE_SSE4_1 +libarch_sse_4_1_la_SOURCES = \ + convert_sse_4_1.c +libarch_sse_4_1_la_CFLAGS = -msse4.1 +libarch_la_LIBADD += libarch_sse_4_1.la +endif libarch_la_SOURCES = \ ../common/convolve_base.c \ @@ -9,3 +30,4 @@ convert.c \ convolve.c endif + diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c 
index 3f76b65..f3dd125 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -20,6 +20,8 @@ #include #include #include "convert.h" +#include "convert_sse_3.h" +#include "convert_sse_4_1.h" #ifdef HAVE_CONFIG_H #include "config.h" @@ -29,146 +31,12 @@ struct convert_cpu_context { void (*convert_si16_ps_16n) (float *, const short *, int); void (*convert_si16_ps) (float *, const short *, int); - void (*convert_scale_ps_si16_16n)(short *, const float *, float, int); - void (*convert_scale_ps_si16_8n)(short *, const float *, float, int); - void (*convert_scale_ps_si16)(short *, const float *, float, int); + void (*convert_scale_ps_si16_16n) (short *, const float *, float, int); + void (*convert_scale_ps_si16_8n) (short *, const float *, float, int); + void (*convert_scale_ps_si16) (short *, const float *, float, int); }; static struct convert_cpu_context c; - -#ifdef HAVE_SSE3 -#include -#include - -#ifdef HAVE_SSE4_1 -#include - -/* 16*N 16-bit signed integer converted to single precision floats */ -static void _sse_convert_si16_ps_16n(float *restrict out, -const short *restrict in, -int len) -{ - __m128i m0, m1, m2, m3, m4, m5; - __m128 m6, m7, m8, m9; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_si128((__m128i *) [16 * i + 0]); - m1 = _mm_loadu_si128((__m128i *) [16 * i + 8]); - - /* Unpack */ - m2 = _mm_cvtepi16_epi32(m0); - m4 = _mm_cvtepi16_epi32(m1); - m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); - m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2)); - m3 = _mm_cvtepi16_epi32(m0); - m5 = _mm_cvtepi16_epi32(m1); - - /* Convert */ - m6 = _mm_cvtepi32_ps(m2); - m7 = _mm_cvtepi32_ps(m3); - m8 = _mm_cvtepi32_ps(m4); - m9 = _mm_cvtepi32_ps(m5); - - /* Store */ - _mm_storeu_ps([16 * i + 0], m6); - _mm_storeu_ps([16 * i + 4], m7); - _mm_storeu_ps([16 * i + 8], m8); - _mm_storeu_ps([16 * i + 12], m9); - } -} - -/* 16*N 16-bit signed integer conversion with remainder */ -static void 
_sse_convert_si16_ps(float *restrict out, -const short *restrict in, -int len) -{ - int start = len / 16 * 16; - - _sse_convert_si16_ps_16n(out, in, len); - - for (int i = 0; i < len % 16; i++) - out[start + i] = in[start + i]; -} -#endif /* HAVE_SSE4_1 */ - -/* 8*N single precision floats scaled and converted to 16-bit signed integer */ -static void _sse_convert_scale_ps_si16_8n(short *restrict