Strange EMCC optimization on some C code

letz Thu, 19 Oct 2017 05:45:08 -0700

Hi,

Our Faust DSP audio language (faust.grame.fr) can generate C, C++, ... or 
wast/wasm code. On a simple example (a signal integrator), we are comparing 
the wasm code generated directly by our wast/wasm backend with the output of 
the C backend, subsequently compiled/optimized to wasm with EMCC. Here are 
the results:


1) in C we get : 

/* Sample type of the audio buffers; float unless the includer overrides it. */
#ifndef FAUSTFLOAT
#define FAUSTFLOAT float
#endif

/* State of the integrator. */
typedef struct {

   float fRec0[2];      /* [0]: value just computed, [1]: value kept for the next sample */
   int fSamplingFreq;   /* not touched by computemydsp */

} mydsp;

/*
 * Running sum of the first input channel.
 *
 * For each of the `count` samples, adds inputs[0][k] to dsp->fRec0[1],
 * stores the sum in dsp->fRec0[0], writes it to outputs[0][k] and then
 * copies it back into dsp->fRec0[1]. Nothing is written when count <= 0,
 * although inputs[0] and outputs[0] are still read.
 */
void computemydsp(mydsp* dsp, int count, FAUSTFLOAT** inputs, FAUSTFLOAT** outputs) {
   FAUSTFLOAT* in = inputs[0];
   FAUSTFLOAT* out = outputs[0];
   int k = 0;

   while (k < count) {
       /* The state is deliberately read and written through dsp on every step. */
       dsp->fRec0[0] = (float)in[k] + dsp->fRec0[1];
       out[k] = (FAUSTFLOAT)dsp->fRec0[0];
       dsp->fRec0[1] = dsp->fRec0[0];
       k += 1;
   }
}

2) in wast from Faust, we get:

;; Integrator as emitted by the Faust wast backend: four i32 parameters,
;; three i32 locals, no result. $dsp is never read in the body; the state is
;; accessed at the constant addresses 0 and 4 (presumably fRec0[0] and
;; fRec0[1] of the C version, with the dsp object at address 0 -- TODO confirm).
(func $compute (param $dsp i32) (param $count i32) (param $inputs i32) 
(param $outputs i32)
(local $input0 i32)
(local $output0 i32)
(local $i i32)
;; Explicitly set all three locals to 0.
(set_local $input0 (i32.const 0))
(set_local $output0 (i32.const 0))
(set_local $i (i32.const 0))
;; $input0 = i32 loaded from $inputs + (0 << 2); $output0 likewise from $outputs.
(set_local $input0 (i32.load (i32.add (get_local $inputs) (i32.shl 
(i32.const 0) (i32.const 2)))))
(set_local $output0 (i32.load (i32.add (get_local $outputs) (i32.shl 
(i32.const 0) (i32.const 2)))))
;; $i is set to 0 a second time.
(set_local $i (i32.const 0))
;; The body below runs before $i is first compared with $count, so it
;; executes at least once.
(loop $for-in-i 
(block $for-out-i 
;; mem[0] = (f32 at $input0 + ($i << 2)) + mem[4]
(f32.store (i32.const 0) (f32.add (f32.load (i32.add (get_local $input0) 
(i32.shl (get_local $i) (i32.const 2)))) (f32.load (i32.const 4))))
;; (f32 at $output0 + ($i << 2)) = mem[0], reloaded from memory
(f32.store (i32.add (get_local $output0) (i32.shl (get_local $i) (i32.const 
2))) (f32.load (i32.const 0)))
;; mem[4] = mem[0], reloaded from memory again
(f32.store (i32.const 4) (f32.load (i32.const 0)))
;; $i = $i + 1
(set_local $i (i32.add (get_local $i) (i32.const 1)))
;; Signed $i < $count: branch back to the loop head, otherwise branch out of
;; the enclosing block.
(if (i32.lt_s (get_local $i) (get_local $count)) (br $for-in-i) (br 
$for-out-i))
)
)
)


==> so we get some f32.load/f32.store accesses to the object's fRec0 field 
that could possibly be optimized away by using an intermediate local variable.

3) With emcc -O1  (emcc testWASM.cpp -O2 -s WASM=1 -s SIDE_MODULE=1 -o 
testWASM.wasm)

;; Integrator as listed under "emcc -O1". Parameters and locals are numbered
;; rather than named; by position, $var$0..$var$3 presumably correspond to
;; dsp, count, inputs and outputs of the C function -- TODO confirm.
(func $0 (type $0) (param $var$0 i32) (param $var$1 i32) (param $var$2 i32) 
(param $var$3 i32)
 (local $var$4 i32)
 (local $var$5 i32)
 (local $var$6 f32)
 (block $label$1
  ;; $var$4 = i32 loaded from address $var$2
  (set_local $var$4
   (i32.load
    (get_local $var$2)
   )
  )
  ;; $var$5 = i32 loaded from address $var$3
  (set_local $var$5
   (i32.load
    (get_local $var$3)
   )
  )
  ;; Return early when $var$1 <= 0 (signed); the two loads above have
  ;; already been performed at this point.
  (if
   (i32.le_s
    (get_local $var$1)
    (i32.const 0)
   )
   (return)
  )
  ;; $var$3 is reused: it now holds the address $var$0 + 4
  (set_local $var$3
   (i32.add
    (get_local $var$0)
    (i32.const 4)
   )
  )
  ;; $var$2 is reused as the loop counter, starting at 0
  (set_local $var$2
   (i32.const 0)
  )
  (loop $label$3
   ;; mem[$var$0] = $var$6 = (f32 at $var$4 + ($var$2 << 2)) + (f32 at $var$3)
   (f32.store
    (get_local $var$0)
    (tee_local $var$6
     (f32.add
      (f32.load
       (i32.add
        (get_local $var$4)
        (i32.shl
         (get_local $var$2)
         (i32.const 2)
        )
       )
      )
      (f32.load
       (get_local $var$3)
      )
     )
    )
   )
   ;; (f32 at $var$5 + ($var$2 << 2)) = $var$6, taken from the local
   (f32.store
    (i32.add
     (get_local $var$5)
     (i32.shl
      (get_local $var$2)
      (i32.const 2)
     )
    )
    (get_local $var$6)
   )
   ;; Copy the value at $var$0 to $var$3 with an i32 load/store pair; no f32
   ;; instruction is involved in this copy.
   (i32.store
    (get_local $var$3)
    (i32.load
     (get_local $var$0)
    )
   )
   ;; Increment $var$2 and loop again while it differs from $var$1
   (br_if $label$3
    (i32.ne
     (tee_local $var$2
      (i32.add
       (get_local $var$2)
       (i32.const 1)
      )
     )
     (get_local $var$1)
    )
   )
  )
 )
)

4) With emcc -O3

;; Integrator as listed under "emcc -O3". Parameters and locals are numbered
;; rather than named; by position, $var$0..$var$3 presumably correspond to
;; dsp, count, inputs and outputs of the C function -- TODO confirm.
(func $0 (type $0) (param $var$0 i32) (param $var$1 i32) (param $var$2 i32) 
(param $var$3 i32)
 (local $var$4 i32)
 (local $var$5 i32)
 (local $var$6 i32)
 (local $var$7 f32)
 (block $label$1
  ;; $var$4 = i32 loaded from address $var$2
  (set_local $var$4
   (i32.load
    (get_local $var$2)
   )
  )
  ;; $var$3 is overwritten with the i32 loaded from the address it held
  (set_local $var$3
   (i32.load
    (get_local $var$3)
   )
  )
  ;; Return early when $var$1 <= 0 (signed); the two loads above have
  ;; already been performed at this point.
  (if
   (i32.le_s
    (get_local $var$1)
    (i32.const 0)
   )
   (return)
  )
  ;; $var$2 is reused as the loop counter, starting at 0
  (set_local $var$2
   (i32.const 0)
  )
  ;; $var$5 = $var$0 + 4, and $var$7 = f32 loaded from that address; this
  ;; load happens once, before the loop.
  (set_local $var$7
   (f32.load
    (tee_local $var$5
     (i32.add
      (get_local $var$0)
      (i32.const 4)
     )
    )
   )
  )
  (loop $label$3
   ;; mem[$var$0] = $var$7 = (f32 at $var$4 + ($var$2 << 2)) + $var$7
   (f32.store
    (get_local $var$0)
    (tee_local $var$7
     (f32.add
      (f32.load
       (i32.add
        (get_local $var$4)
        (i32.shl
         (get_local $var$2)
         (i32.const 2)
        )
       )
      )
      (get_local $var$7)
     )
    )
   )
   ;; (f32 at $var$3 + ($var$2 << 2)) = $var$7, taken from the local
   (f32.store
    (i32.add
     (get_local $var$3)
     (i32.shl
      (get_local $var$2)
      (i32.const 2)
     )
    )
    (get_local $var$7)
   )
   ;; $var$6 = i32 reloaded from $var$0, and stored as an i32 at $var$5
   (i32.store
    (get_local $var$5)
    (tee_local $var$6
     (i32.load
      (get_local $var$0)
     )
    )
   )
   ;; $var$7 = the bits of $var$6 reinterpreted as f32; this is the value
   ;; the f32.add above uses on the next iteration.
   (set_local $var$7
    (f32.reinterpret/i32
     (get_local $var$6)
    )
   )
   ;; Increment $var$2 and loop again while it differs from $var$1
   (br_if $label$3
    (i32.ne
     (tee_local $var$2
      (i32.add
       (get_local $var$2)
       (i32.const 1)
      )
     )
     (get_local $var$1)
    )
   )
  )
 )
)

The Emcc-optimized code seems rather complicated: one of the f32.store 
instructions is transformed into an i32.store at the -O1 level. In -O3 mode, 
the i32 $var$6 intermediate variable is even "reinterpreted" as an f32 with 
the f32.reinterpret/i32 instruction. Is this supposed to be faster in the 
end?

Thanks for any good explanation.

-- 
You received this message because you are subscribed to the Google Groups 
"emscripten-discuss" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Strange EMCC optimization on some C code

Reply via email to