Hello,
I am about to prepare some ARM NEON optimized code for PulseAudio;
attached is a stand-alone test program demonstrating
sconv_s16le_from_float() and sconv_s16le_to_float() on 1019 samples.
Questions:
is it acceptable to use ARM NEON intrinsics?
or is __asm__ __volatile__ or assembler source preferred?
or Orc code?
I picked intrinsics due to simplicity; the generated code (gcc-4.6,
-O2) looks clean.
# ./sconv_neon
checking NEON sconv_s16le_from_float(2038)
NEON: 3723 usec.
ref: 64516 usec.
checking NEON sconv_s16le_to_float(2038)
NEON: 1923 usec.
ref: 18280 usec.
Runtime is for 1000 repetitions on a BeagleBoard-xM (NEON vs. reference C
code).
if it looks OK to you, I'll go ahead and submit patches to integrate with
PA...
regards, p.
--
Peter Meerwald
+43-664-2444418 (mobile)
/*
* Copyright 2012 Peter Meerwald <[email protected]>
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>
/* Local type and macro definitions so this test program builds stand-alone. */
typedef short int16_t;
/* Generic conversion callback: a count n, an untyped input a and an untyped output b. */
typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b);
/* Time value type; pa_rtclock_now() returns microseconds in it. */
typedef long long unsigned int pa_usec_t;
/* pa_assert() maps directly onto the standard assert(). */
#define pa_assert(x) assert(x)
/*
 * Clamp x to the range [low, high].
 * This is a function-like macro: x, low and high may each be evaluated more
 * than once, so arguments must not have side effects.
 */
#define PA_CLAMP_UNLIKELY(x, low, high) \
(((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))
/*
 * Print a printf-style formatted message to stdout, followed by a newline.
 *
 * The message is formatted directly to stdout with vprintf(), so its length
 * is not bounded by an intermediate buffer (the previous vsprintf() into a
 * fixed 1024-byte stack buffer could overflow on long messages).
 *
 * format: printf-style format string
 * ...:    arguments matching the conversions in format
 */
static void pa_log_info(const char *format, ...) {
    va_list ap;

    va_start(ap, format);
    vprintf(format, ap);
    va_end(ap);
    putchar('\n');
}
/* Debug messages are printed exactly like info messages. */
#define pa_log_debug pa_log_info
/*
 * Return the current time as reported by gettimeofday(), expressed in
 * microseconds (seconds * 1000000 + microseconds).
 */
static pa_usec_t pa_rtclock_now() {
    struct timeval now;
    pa_usec_t usec;

    gettimeofday(&now, NULL);

    usec = now.tv_sec * 1000000ULL;
    usec += now.tv_usec;
    return usec;
}
#if defined(__arm__)
#include "arm_neon.h"
/*
 * Reference (scalar) conversion of n float samples to signed 16-bit samples.
 *
 * Each input value is clamped to [-1.0, 1.0], multiplied by 0x7FFF and
 * rounded to an integer with lrintf().
 *
 * n: number of samples
 * a: input floats (must not be NULL)
 * b: output 16-bit samples (must not be NULL)
 */
void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) {
    unsigned idx;

    pa_assert(a);
    pa_assert(b);

    for (idx = 0; idx < n; idx++) {
        float sample = a[idx];

        sample = PA_CLAMP_UNLIKELY(sample, -1.0f, 1.0f);
        b[idx] = (int16_t) lrintf(sample * 0x7FFF);
    }
}
/*
 * NEON conversion of n float samples to signed 16-bit samples.
 *
 * Four samples are processed per iteration: clamp to [-1.0, 1.0], scale by
 * 32767, add 0.5 carrying the sign of the scaled value, convert to 32-bit
 * integers with vcvtq_s32_f32() and narrow to 16 bit. The remaining n % 4
 * samples go through the scalar lrintf() path.
 *
 * NOTE: rounding is done by adding +/-0.5 before the integer conversion, so
 * exact .5 ties may round differently from the lrintf()-based scalar code.
 *
 * Memory is accessed with vld1q_f32()/vst1_s16() rather than by
 * dereferencing float/int16_t pointers cast to vector pointers, so a and b
 * only need the alignment of their element type.
 *
 * n: number of samples
 * a: input floats
 * b: output 16-bit samples
 */
void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
    const unsigned vec_end = n & ~3u; /* samples covered by whole vectors */
    const float32x4_t plusone4 = vdupq_n_f32(1.0f);
    const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
    const float32x4_t half4 = vdupq_n_f32(0.5f);
    const float32x4_t scale4 = vdupq_n_f32(32767.0f);
    const uint32x4_t mask4 = vdupq_n_u32(0x80000000); /* IEEE sign bit */
    unsigned i;

    for (i = 0; i < vec_end; i += 4) {
        float32x4_t v4 = vld1q_f32(a + i);
        float32x4_t w4;

        /* clamp to [-1, 1], then scale to the 16-bit range */
        v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4), minusone4), scale4);

        /* w4 = 0.5 with the sign bit of v4 OR-ed in */
        w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
            vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));

        vst1_s16(b + i, vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))));
    }

    /* leftovers */
    for (i = vec_end; i < n; i++) {
        b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
    }
}
/*
 * Reference (scalar) conversion of n signed 16-bit samples to float.
 *
 * Each sample is converted to float and divided by 0x7FFF.
 *
 * n: number of samples
 * a: input 16-bit samples (must not be NULL)
 * b: output floats (must not be NULL)
 */
void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) {
    unsigned idx;

    pa_assert(a);
    pa_assert(b);

    for (idx = 0; idx < n; idx++) {
        const float sample = (float) a[idx];

        b[idx] = sample / (float) 0x7FFF;
    }
}
void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
unsigned i;
const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
for (i = 0; i < n/4; i++) {
((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4);
}
// leftovers
const float invscale = 1.0f / 0x7FFF;
for (i = n & ~3; i < n; i++) {
b[i] = a[i] * invscale;
}
}
/*
 * Number of samples per test buffer. 1019 is not a multiple of 4, so the
 * scalar "leftovers" loops of the NEON functions are exercised as well.
 */
#define SAMPLES 1019
/* Number of iterations of each timing loop in run_test_from()/run_test_to(). */
#define TIMES 300
/*
 * Check and time the float -> s16 conversion.
 *
 * Fills a buffer with random floats in roughly [-1.05, 1.05] (so clamping is
 * exercised), converts it with both the reference and the NEON function,
 * logs every sample where the two results differ, and then reports the time
 * taken by TIMES runs of each implementation.
 */
static void run_test_from(void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    float floats[SAMPLES];
    int i;
    pa_usec_t start, stop;
    /* The pointer type matches the function exactly; calling it through a
     * differently typed pointer (e.g. one taking void *) is undefined. */
    void (*func)(unsigned n, const float *a, int16_t *b) =
        pa_sconv_s16le_from_float32ne;

    /* sizeof yields a size_t, which is printed with %zu */
    pa_log_debug("checking NEON sconv_s16le_from_float(%zu)", sizeof(samples));

    memset(samples_ref, 0, sizeof(samples_ref));
    memset(samples, 0, sizeof(samples));

    for (i = 0; i < SAMPLES; i++) {
        floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
    }

    func(SAMPLES, floats, samples_ref);
    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);

    for (i = 0; i < SAMPLES; i++) {
        if (samples[i] != samples_ref[i]) {
            pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
                floats[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        func(SAMPLES, floats, samples_ref);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}
static void run_test_to(void) {
int16_t samples[SAMPLES];
float floats[SAMPLES];
float floats_ref[SAMPLES];
int i;
pa_usec_t start, stop;
pa_convert_func_t func;
pa_log_debug("checking NEON sconv_s16le_to_float(%zd)", sizeof(samples));
memset(floats_ref, 0, sizeof(floats_ref));
memset(floats, 0, sizeof(float));
for (i = 0; i < SAMPLES; i++) {
samples[i] = rand() - RAND_MAX/2;
}
func = (pa_convert_func_t) pa_sconv_s16le_to_float32ne;
func(SAMPLES, samples, floats_ref);
pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
for (i = 0; i < SAMPLES; i++) {
if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
samples[i]);
}
}
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
}
stop = pa_rtclock_now();
pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
func(SAMPLES, samples, floats_ref);
}
stop = pa_rtclock_now();
pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}
#endif /* defined(__arm__) */
/*
 * Program entry point: run both conversion checks.
 *
 * run_test_from() and run_test_to() are only defined when building for ARM
 * (inside the __arm__ guard), so the calls are guarded the same way. On any
 * other target the program reports that nothing could be tested and fails.
 */
int main(void) {
#if defined(__arm__)
    run_test_from();
    run_test_to();
    return EXIT_SUCCESS;
#else
    fprintf(stderr, "sconv_neon: NEON tests are only built for ARM targets\n");
    return EXIT_FAILURE;
#endif
}
_______________________________________________
pulseaudio-discuss mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pulseaudio-discuss