Hi,
Our Faust DSP audio language (faust.grame.fr) can generate C, C++, ... or
wast/wasm code. On a simple example (a signal integrator), we are comparing
the wasm code generated directly by our wast/wasm backend with the output of
the C backend, subsequently compiled and optimized to wasm with EMCC. Here are the
results:
1) in C we get :
/* Sample type used for the input/output buffers; defaults to float unless
   the including code has already defined FAUSTFLOAT. */
#ifndef FAUSTFLOAT
#define FAUSTFLOAT float
#endif
/* DSP state. computemydsp writes the newest sum into fRec0[0] and then copies
   it into fRec0[1], which is read back on the next iteration.
   fSamplingFreq is not accessed by computemydsp. */
typedef struct {
float fRec0[2];
int fSamplingFreq;
} mydsp;
/* Running sum: for each i in [0, count), adds inputs[0][i] to the previous
   sum held in dsp->fRec0[1] and writes the result to outputs[0][i].
   Only channel 0 of inputs and outputs is used; nothing happens when
   count <= 0. The state persists in *dsp across calls. */
void computemydsp(mydsp* dsp, int count, FAUSTFLOAT** inputs, FAUSTFLOAT**
outputs) {
FAUSTFLOAT* input0 = inputs[0];
FAUSTFLOAT* output0 = outputs[0];
/* C99 loop */
{
int i;
for (i = 0; (i < count); i = (i + 1)) {
/* new sum = current input sample + previous sum */
dsp->fRec0[0] = ((float)input0[i] + dsp->fRec0[1]);
/* the output is read back from the state field, not from a local */
output0[i] = (FAUSTFLOAT)dsp->fRec0[0];
/* shift the state: the current sum becomes the previous sum */
dsp->fRec0[1] = dsp->fRec0[0];
}
}
}
2) in wast from Faust, we get:
;; Running-sum loop over channel 0: out[i] = in[i] + previous sum.
;; The two state slots are addressed with the constant addresses 0 and 4;
;; the $dsp parameter is never read in this function.
;; NOTE(review): the loop condition is tested after the body, so the body
;; executes once even when $count <= 0.
(func $compute (param $dsp i32) (param $count i32) (param $inputs i32)
(param $outputs i32)
(local $input0 i32)
(local $output0 i32)
(local $i i32)
;; explicit zero initialisation of the locals; each one is assigned again below
(set_local $input0 (i32.const 0))
(set_local $output0 (i32.const 0))
(set_local $i (i32.const 0))
;; $input0 = i32 loaded from address $inputs + (0 << 2), i.e. element 0
(set_local $input0 (i32.load (i32.add (get_local $inputs) (i32.shl
(i32.const 0) (i32.const 2)))))
;; $output0 = i32 loaded from address $outputs + (0 << 2), i.e. element 0
(set_local $output0 (i32.load (i32.add (get_local $outputs) (i32.shl
(i32.const 0) (i32.const 2)))))
(set_local $i (i32.const 0))
(loop $for-in-i
(block $for-out-i
;; mem[0] = f32 at ($input0 + ($i << 2)) + mem[4]
(f32.store (i32.const 0) (f32.add (f32.load (i32.add (get_local $input0)
(i32.shl (get_local $i) (i32.const 2)))) (f32.load (i32.const 4))))
;; f32 at ($output0 + ($i << 2)) = mem[0], reloaded from memory
(f32.store (i32.add (get_local $output0) (i32.shl (get_local $i) (i32.const
2))) (f32.load (i32.const 0)))
;; mem[4] = mem[0], a second reload of the value just stored
(f32.store (i32.const 4) (f32.load (i32.const 0)))
(set_local $i (i32.add (get_local $i) (i32.const 1)))
;; continue while $i < $count (signed), otherwise leave the block
(if (i32.lt_s (get_local $i) (get_local $count)) (br $for-in-i) (br
$for-out-i))
)
)
)
==> so we get some f32.load/f32.store operations on the object's fRec0 field that could
possibly be optimized by using an intermediate local variable.
3) With emcc -O1 (emcc testWASM.cpp -O2 -s WASM=1 -s SIDE_MODULE=1 -o
testWASM.wasm)
;; emcc-generated running-sum loop.
;; NOTE(review): the four i32 parameters presumably correspond, in order, to
;; the C arguments (dsp, count, inputs, outputs) — confirm against the build.
;; $var$2 and $var$3 are reused after their initial loads: $var$2 becomes the
;; loop index and $var$3 becomes the address $var$0 + 4.
(func $0 (type $0) (param $var$0 i32) (param $var$1 i32) (param $var$2 i32)
(param $var$3 i32)
(local $var$4 i32)
(local $var$5 i32)
(local $var$6 f32)
(block $label$1
;; $var$4 = i32 loaded from the address in $var$2
(set_local $var$4
(i32.load
(get_local $var$2)
)
)
;; $var$5 = i32 loaded from the address in $var$3
(set_local $var$5
(i32.load
(get_local $var$3)
)
)
;; return early when $var$1 <= 0 (signed), so the loop body never runs
(if
(i32.le_s
(get_local $var$1)
(i32.const 0)
)
(return)
)
;; $var$3 now holds the address $var$0 + 4
(set_local $var$3
(i32.add
(get_local $var$0)
(i32.const 4)
)
)
;; $var$2 now serves as the loop index, starting at 0
(set_local $var$2
(i32.const 0)
)
(loop $label$3
;; f32 at $var$0 = f32 at ($var$4 + index*4) + f32 at $var$3;
;; tee_local also keeps the sum in $var$6
(f32.store
(get_local $var$0)
(tee_local $var$6
(f32.add
(f32.load
(i32.add
(get_local $var$4)
(i32.shl
(get_local $var$2)
(i32.const 2)
)
)
)
(f32.load
(get_local $var$3)
)
)
)
)
;; f32 at ($var$5 + index*4) = $var$6, taken from the local, no reload
(f32.store
(i32.add
(get_local $var$5)
(i32.shl
(get_local $var$2)
(i32.const 2)
)
)
(get_local $var$6)
)
;; copies the 32-bit word at $var$0 to $var$3 using integer load/store
(i32.store
(get_local $var$3)
(i32.load
(get_local $var$0)
)
)
;; increment the index and loop again while it differs from $var$1
(br_if $label$3
(i32.ne
(tee_local $var$2
(i32.add
(get_local $var$2)
(i32.const 1)
)
)
(get_local $var$1)
)
)
)
)
)
4) With emcc -O3
;; emcc -O3 running-sum loop.
;; NOTE(review): the four i32 parameters presumably correspond, in order, to
;; the C arguments (dsp, count, inputs, outputs) — confirm against the build.
;; The f32 previously read from $var$0 + 4 on every iteration is loaded once
;; before the loop and carried in $var$7.
(func $0 (type $0) (param $var$0 i32) (param $var$1 i32) (param $var$2 i32)
(param $var$3 i32)
(local $var$4 i32)
(local $var$5 i32)
(local $var$6 i32)
(local $var$7 f32)
(block $label$1
;; $var$4 = i32 loaded from the address in $var$2
(set_local $var$4
(i32.load
(get_local $var$2)
)
)
;; $var$3 is overwritten with the i32 loaded from its own address
(set_local $var$3
(i32.load
(get_local $var$3)
)
)
;; return early when $var$1 <= 0 (signed), so the loop body never runs
(if
(i32.le_s
(get_local $var$1)
(i32.const 0)
)
(return)
)
;; $var$2 now serves as the loop index, starting at 0
(set_local $var$2
(i32.const 0)
)
;; $var$5 = $var$0 + 4; $var$7 = f32 loaded from that address (once, pre-loop)
(set_local $var$7
(f32.load
(tee_local $var$5
(i32.add
(get_local $var$0)
(i32.const 4)
)
)
)
)
(loop $label$3
;; f32 at $var$0 = f32 at ($var$4 + index*4) + $var$7;
;; tee_local updates $var$7 to the new sum
(f32.store
(get_local $var$0)
(tee_local $var$7
(f32.add
(f32.load
(i32.add
(get_local $var$4)
(i32.shl
(get_local $var$2)
(i32.const 2)
)
)
)
(get_local $var$7)
)
)
)
;; f32 at ($var$3 + index*4) = $var$7
(f32.store
(i32.add
(get_local $var$3)
(i32.shl
(get_local $var$2)
(i32.const 2)
)
)
(get_local $var$7)
)
;; copies the 32-bit word at $var$0 to $var$5 using integer load/store,
;; keeping the raw bits in $var$6
(i32.store
(get_local $var$5)
(tee_local $var$6
(i32.load
(get_local $var$0)
)
)
)
;; $var$7 = the bits just reloaded from $var$0, reinterpreted as f32
;; (a bit cast, not a numeric conversion)
(set_local $var$7
(f32.reinterpret/i32
(get_local $var$6)
)
)
;; increment the index and loop again while it differs from $var$1
(br_if $label$3
(i32.ne
(tee_local $var$2
(i32.add
(get_local $var$2)
(i32.const 1)
)
)
(get_local $var$1)
)
)
)
)
)
The Emcc-optimized code seems rather complicated: one of the f32.store instructions is
turned into an i32.store at the -O1 level. In -O3 mode, the i32 $var$6
intermediate variable is even "reinterpreted" as an f32 with the
f32.reinterpret/i32
instruction. Is this supposed to be faster in the end?
Thanks for any good explanation.
--
You received this message because you are subscribed to the Google Groups
"emscripten-discuss" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.