[MERGED] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-24 Thread Harald Welte
Harald Welte has submitted this change and it was merged.

Change subject: core/conv: add x86 SSE support for Viterbi decoder
..


core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. Runtime SIMD detection is currently
implemented through the __builtin_cpu_supports call.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 748 insertions(+), 10 deletions(-)

Approvals:
  Tom Tsou: Looks good to me, but someone else must approve
  Alexander Chemeris: Looks good to me, but someone else must approve
  Harald Welte: Looks good to me, approved
  Jenkins Builder: Verified



diff --git a/src/Makefile.am b/src/Makefile.am
index 5724055..a0aa5a0 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -23,6 +23,12 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c sercomm.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+# Per-object flags hack
+viterbi_sse.lo : CFLAGS += $(SIMD_FLAGS)
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..2097a02 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,11 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -43,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -89,12 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -294,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -430,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -456,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func = 

libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-24 Thread Harald Welte

Patch Set 9: Code-Review+2

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-Reviewer: tnt 
Gerrit-HasComments: No


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-23 Thread Tom Tsou

Patch Set 9: Code-Review+1

(1 comment)

https://gerrit.osmocom.org/#/c/2454/9/src/viterbi_sse.c
File src/viterbi_sse.c:

PS9, Line 68: M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, 
M4));
> Same >= vs > handling that we saw in the non-SSE case. SSE does not provide
Done


-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-Reviewer: tnt 
Gerrit-HasComments: Yes


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-22 Thread tnt

Patch Set 9:

I didn't get a chance to read that yet.

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-Reviewer: tnt 
Gerrit-HasComments: No


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-22 Thread Harald Welte

Patch Set 9:

ok, everyone except Tom has voted +1. @tom, any further feedback on this, or 
shall we merge it?

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-Reviewer: tnt 
Gerrit-HasComments: No


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-21 Thread Alexander Chemeris

Patch Set 9: Code-Review+1

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: No


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-20 Thread Harald Welte

Patch Set 9: Code-Review+1

(1 comment)

https://gerrit.osmocom.org/#/c/2454/9/src/viterbi_gen.c
File src/viterbi_gen.c:

Line 132: int16_t *osmo_conv_vdec_malloc(size_t n)
actually, it might make sense in a follow-up patch to use talloc here. We 
generally ues talloc from libosmo* and discourage any direct access to malloc.  
But that's of course unrelated to this patch subject.


-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: Yes


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-19 Thread Vadim Yanitskiy

Patch Set 9:

(1 comment)

https://gerrit.osmocom.org/#/c/2454/9/src/viterbi_sse.c
File src/viterbi_sse.c:

PS9, Line 68: M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, 
M4));
> Same >= vs > handling that we saw in the non-SSE case. SSE does not provide
Great! Thank you very much again :)


-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: Yes


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-19 Thread Tom Tsou

Patch Set 9:

(1 comment)

https://gerrit.osmocom.org/#/c/2454/9/src/viterbi_sse.c
File src/viterbi_sse.c:

PS9, Line 68: M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, 
M4));
Same >= vs > handling that we saw in the non-SSE case. SSE does not provide a 
signed 16-bit integer greater than or equal to instruction, so we use this 
chained combination.


-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 9
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: Yes


[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-19 Thread Tom Tsou
Hello Vadim Yanitskiy, Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#9).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. Runtime SIMD detection is currently
implemented through the __builtin_cpu_supports call.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 748 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/9

diff --git a/src/Makefile.am b/src/Makefile.am
index 6948e1a..e8e67ef 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,12 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+# Per-object flags hack
+viterbi_sse.lo : CFLAGS += $(SIMD_FLAGS)
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..2097a02 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,11 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -43,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -89,12 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -294,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -430,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -456,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+   #endif
break;
case 3:
+   #ifdef HAVE_SSE3
+   

[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-19 Thread Tom Tsou
Hello Vadim Yanitskiy, Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#8).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. Runtime SIMD detection is currently
implemented through the __builtin_cpu_supports call.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 748 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/8

diff --git a/src/Makefile.am b/src/Makefile.am
index 6948e1a..e8e67ef 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,12 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+# Per-object flags hack
+viterbi_sse.lo : CFLAGS += $(SIMD_FLAGS)
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..2097a02 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,11 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -43,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -89,12 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -294,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -430,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -456,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+   #endif
break;
case 3:
+   #ifdef HAVE_SSE3
+   

[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-07 Thread Vadim Yanitskiy
Hello Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#7).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. Runtime SIMD detection is currently
implemented through the __builtin_cpu_supports call.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 748 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/7

diff --git a/src/Makefile.am b/src/Makefile.am
index 6948e1a..e8e67ef 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,12 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+# Per-object flags hack
+viterbi_sse.lo : CFLAGS += $(SIMD_FLAGS)
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..2097a02 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,11 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -43,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -89,12 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -294,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -430,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -456,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+   #endif
break;
case 3:
+   #ifdef HAVE_SSE3
+   dec->metric_func = 

[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-02 Thread Vadim Yanitskiy
Hello Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#6).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. The SIMD detection is currently implemented
through the __builtin_cpu_supports(EXT), which is only supported
by GCC.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 747 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/6

diff --git a/src/Makefile.am b/src/Makefile.am
index 6948e1a..2f19838 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -4,7 +4,7 @@
 LIBVERSION=8:0:0
 
 AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_builddir)/include
-AM_CFLAGS = -Wall $(TALLOC_CFLAGS)
+AM_CFLAGS = -Wall $(TALLOC_CFLAGS) $(SIMD_FLAGS)
 
 lib_LTLIBRARIES = libosmocore.la
 
@@ -19,6 +19,10 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..d68db23 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,11 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -43,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -89,12 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -294,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -430,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -456,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func 

libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-01 Thread Vadim Yanitskiy

Patch Set 4: Code-Review-1

> Build Failed
 > 
 > http://jenkins.osmocom.org/jenkins/job/libosmocore-gerrit/1009/ :
 > FAILURE

Here we have the same problem with FreeBSD, as described in
previous change, but there is also another issue. Have a look
at Debian build log, and you will see, that SCH test fails.

This failure caused by SSE decoder implementation, which
provides one-byte different decoding result from original
implementation. I'll describe more details in the ML.

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 4
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: No


[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-01 Thread Vadim Yanitskiy
Hello Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#4).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. The SIMD detection is currently implemented
through the __builtin_cpu_supports(EXT), which is only supported
by GCC.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 746 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/4

diff --git a/src/Makefile.am b/src/Makefile.am
index 999436d..2f19838 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,10 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..d68db23 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,11 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -43,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -89,12 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -294,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -430,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -456,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+   #endif
break;
case 3:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+ 

[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-01 Thread Vadim Yanitskiy
Hello Harald Welte, Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#3).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. The SIMD detection is currently implemented
through the __builtin_cpu_supports(EXT), which is only supported
by GCC.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 746 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/3

diff --git a/src/Makefile.am b/src/Makefile.am
index 999436d..2f19838 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,10 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index ea4fb21..d68db23 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,12 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
-#define SSE_ALIGN  16
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -44,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -90,20 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Aligned Memory Allocator
- * SSE requires 16-byte memory alignment. We store relevant trellis values
- * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
- * so the allocated memory is casted as such.
- */
-static int16_t *vdec_malloc(size_t n)
-{
-#ifdef HAVE_SSE3
-   return (int16_t *) memalign(SSE_ALIGN, sizeof(int16_t) * n);
-#else
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-#endif
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -303,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -439,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -465,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+

libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-05-01 Thread Harald Welte

Patch Set 2: Code-Review+2

good idea to test with qemu!

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 2
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: No


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-04-30 Thread Vadim Yanitskiy

Patch Set 2:

> (1 comment)

No, of course. SSE3 support is mandatory for accelerated Viterbi
implementation only (viterbi_sse.c). If SSE3 isn't supported by
target CPU, then not-accelerated Viterbi implementation will be
used (viterbi_sse.c).

BTW: I forgot to change commit author :(

-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 2
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: Vadim Yanitskiy 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: No


libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-04-30 Thread Harald Welte

Patch Set 2:

(1 comment)

https://gerrit.osmocom.org/#/c/2454/2//COMMIT_MSG
Commit Message:

Line 11: is the minimal requirement. SSE4.1 and AVX2 are used if available.
Do you mean this patch introduces a mandatory requirement for SSE3 capable 
CPUs? if so, it is not acceptable.


-- 
To view, visit https://gerrit.osmocom.org/2454
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
Gerrit-PatchSet: 2
Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-Owner: Vadim Yanitskiy 
Gerrit-Reviewer: Alexander Chemeris 
Gerrit-Reviewer: Harald Welte 
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: Max 
Gerrit-Reviewer: Tom Tsou 
Gerrit-Reviewer: dexter 
Gerrit-HasComments: Yes


[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-04-30 Thread Vadim Yanitskiy
Hello Jenkins Builder,

I'd like you to reexamine a change.  Please visit

https://gerrit.osmocom.org/2454

to look at the new patch set (#2).

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

To enable this feature, the source code should be configured with
SIMD support (see --enable-simd). Otherwise, the viterbi_sse.c
won't be compiled.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. The SIMD detection is currently implemented
through the __builtin_cpu_supports(EXT), which is only supported
by GCC.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
M src/viterbi_gen.c
A src/viterbi_sse.c
4 files changed, 746 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/2

diff --git a/src/Makefile.am b/src/Makefile.am
index 999436d..2f19838 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,10 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index ea4fb21..d68db23 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,12 +24,34 @@
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
-#define SSE_ALIGN  16
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -44,6 +66,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -90,20 +127,6 @@
void (*metric_func)(const int8_t *, const int16_t *,
int16_t *, int16_t *, int);
 };
-
-/* Aligned Memory Allocator
- * SSE requires 16-byte memory alignment. We store relevant trellis values
- * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
- * so the allocated memory is casted as such.
- */
-static int16_t *vdec_malloc(size_t n)
-{
-#ifdef HAVE_SSE3
-   return (int16_t *) memalign(SSE_ALIGN, sizeof(int16_t) * n);
-#else
-   return (int16_t *) malloc(sizeof(int16_t) * n);
-#endif
-}
 
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
@@ -303,9 +326,9 @@
if (!trellis)
return;
 
+   vdec_free(trellis->outputs);
+   vdec_free(trellis->sums);
free(trellis->vals);
-   free(trellis->outputs);
-   free(trellis->sums);
free(trellis);
 }
 
@@ -439,7 +462,7 @@
if (!dec)
return;
 
-   free(dec->paths[0]);
+   vdec_free(dec->paths[0]);
free(dec->paths);
free_trellis(dec->trellis);
free(dec);
@@ -465,13 +488,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+

[PATCH] libosmocore[master]: core/conv: add x86 SSE support for Viterbi decoder

2017-04-30 Thread Vadim Yanitskiy

Review at  https://gerrit.osmocom.org/2454

core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based
SSE operations. SSE3, found on virtually all modern x86 processors,
is the minimal requirement. SSE4.1 and AVX2 are used if available.

To enable this feature, the source code should be configured with
SIMD support (see --enable-simd). Otherwise, the viterbi_sse.c
won't be compiled.

Also, the original code was extended with runtime SIMD detection,
so only supported extensions will be used by target CPU. It makes
the library more partable, what is very important for binary
packages distribution. The SIMD detection is currently implemented
through the __builtin_cpu_supports(EXT), which is only supported
by GCC.

Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
---
M src/Makefile.am
M src/viterbi.c
A src/viterbi_sse.c
3 files changed, 699 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/54/2454/1

diff --git a/src/Makefile.am b/src/Makefile.am
index 999436d..2f19838 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,6 +19,10 @@
 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 viterbi.c viterbi_gen.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index ea4fb21..1484a31 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -20,16 +20,30 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include 
 #include 
 #include 
 #include 
 
-#include 
 #include "config.h"
+
+#include 
 
 #define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)  (K == 7 ? 64 : 16)
 #define SSE_ALIGN  16
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointer will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
 
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
@@ -44,6 +58,21 @@
int16_t *sums, int16_t *paths, int norm);
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
+
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+   int16_t *sums, int16_t *paths, int norm);
+#endif
 
 /* Trellis State
  * state - Internal lshift register value
@@ -96,13 +125,14 @@
  * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
  * so the allocated memory is casted as such.
  */
-static int16_t *vdec_malloc(size_t n)
+static int16_t *vdec_malloc_no_sse3(size_t n)
 {
-#ifdef HAVE_SSE3
-   return (int16_t *) memalign(SSE_ALIGN, sizeof(int16_t) * n);
-#else
return (int16_t *) malloc(sizeof(int16_t) * n);
-#endif
+}
+
+static int16_t *vdec_malloc_sse3(size_t n)
+{
+   return (int16_t *) memalign(SSE_ALIGN, sizeof(int16_t) * n);
 }
 
 /* Accessor calls */
@@ -465,13 +495,31 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n2 :
+   osmo_conv_gen_metrics_k5_n2_sse;
+   #else
dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+   #endif
break;
case 3:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n3 :
+   osmo_conv_gen_metrics_k5_n3_sse;
+   #else
dec->metric_func = osmo_conv_gen_metrics_k5_n3;
+   #endif
break;
case 4:
+   #ifdef HAVE_SSE3
+   dec->metric_func = !sse3_supported ?
+   osmo_conv_gen_metrics_k5_n4 :
+