Re: [wsjt-devel] optimizing wsprd_exp.c for ARMv7-A

Pavel Demin Wed, 30 Mar 2016 10:06:05 -0700

Hi Bill,

Thanks for your suggestion to switch to the single precision FFTW3.

I've replaced 'fftw_' with 'fftwf_' in wsprd_exp.c and tested 'wsprd_exp -J -w' with 410 .wav files. This modification does not affect the number of decoded messages and improves the decoding speed by a couple of percent.

Then I've replaced almost all the remaining doubles with floats in wsprd_exp.c. This modification improves the decoding speed by another couple of percent.

Attached is a patch file with all the modifications. This patch also fixes a small bug in the writec2file function that I described in my previous e-mail.

After applying this patch, I observe the following improvements in the decoding speed:

 ~10% on Intel Pentium
 ~20% on ARM Cortex-A9

Best regards,

Pavel

Index: Makefile
===================================================================
--- Makefile    (revision 6563)
+++ Makefile    (working copy)
@@ -5,7 +5,7 @@
 CFLAGS= -I/usr/include -Wall -Wno-missing-braces -O3 -ffast-math
 LDFLAGS = -L/usr/lib
 FFLAGS =  -O2 -Wall -Wno-conversion
-LIBS = -lfftw3 -lm
+LIBS = -lfftw3f -lm
 
 # Default rules
 %.o: %.c $(DEPS)
Index: wsprd_exp.c
===================================================================
--- wsprd_exp.c (revision 6563)
+++ wsprd_exp.c (working copy)
@@ -46,7 +46,7 @@
 // Possible PATIENCE options: FFTW_ESTIMATE, FFTW_ESTIMATE_PATIENT,
 // FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE
 #define PATIENCE FFTW_ESTIMATE
-fftw_plan PLAN1,PLAN2,PLAN3;
+fftwf_plan PLAN1,PLAN2,PLAN3;
 
 unsigned char pr3[162]=
 {1,1,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,1,0,
@@ -64,7 +64,7 @@
 int printdata=0;
 
 //***************************************************************************
-unsigned long readc2file(char *ptr_to_infile, double *idat, double *qdat,
+unsigned long readc2file(char *ptr_to_infile, float *idat, float *qdat,
                          double *freq, int *wspr_type)
 {
     float *buffer;
@@ -102,7 +102,7 @@
 }
 
 //***************************************************************************
-unsigned long readwavfile(char *ptr_to_infile, int ntrmin, double *idat, 
double *qdat )
+unsigned long readwavfile(char *ptr_to_infile, int ntrmin, float *idat, float 
*qdat )
 {
     unsigned long i, j, npoints;
     int nfft1, nfft2, nh2, i0;
@@ -126,8 +126,8 @@
         return 1;
     }
     
-    double *realin;
-    fftw_complex *fftin, *fftout;
+    float *realin;
+    fftwf_complex *fftin, *fftout;
     
     FILE *fp;
     short int *buf2;
@@ -142,9 +142,9 @@
     nr=fread(buf2,2,npoints,fp);       //Read raw data
     fclose(fp);
     
-    realin=(double*) fftw_malloc(sizeof(double)*nfft1);
-    fftout=(fftw_complex*) fftw_malloc(sizeof(fftw_complex)*nfft1);
-    PLAN1 = fftw_plan_dft_r2c_1d(nfft1, realin, fftout, PATIENCE);
+    realin=(float*) fftwf_malloc(sizeof(float)*nfft1);
+    fftout=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*nfft1);
+    PLAN1 = fftwf_plan_dft_r2c_1d(nfft1, realin, fftout, PATIENCE);
     
     for (i=0; i<npoints; i++) {
         realin[i]=buf2[i]/32768.0;
@@ -155,10 +155,10 @@
     }
     
     free(buf2);
-    fftw_execute(PLAN1);
-    fftw_free(realin);
+    fftwf_execute(PLAN1);
+    fftwf_free(realin);
     
-    fftin=(fftw_complex*) fftw_malloc(sizeof(fftw_complex)*nfft2);
+    fftin=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*nfft2);
     
     for (i=0; i<nfft2; i++) {
         j=i0+i;
@@ -167,10 +167,10 @@
         fftin[i][1]=fftout[j][1];
     }
     
-    fftw_free(fftout);
-    fftout=(fftw_complex*) fftw_malloc(sizeof(fftw_complex)*nfft2);
-    PLAN2 = fftw_plan_dft_1d(nfft2, fftin, fftout, FFTW_BACKWARD, PATIENCE);
-    fftw_execute(PLAN2);
+    fftwf_free(fftout);
+    fftout=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*nfft2);
+    PLAN2 = fftwf_plan_dft_1d(nfft2, fftin, fftout, FFTW_BACKWARD, PATIENCE);
+    fftwf_execute(PLAN2);
     
     for (i=0; i<nfft2; i++) {
         idat[i]=fftout[i][0]/1000.0;
@@ -177,16 +177,16 @@
         qdat[i]=fftout[i][1]/1000.0;
     }
     
-    fftw_free(fftin);
-    fftw_free(fftout);
+    fftwf_free(fftin);
+    fftwf_free(fftout);
     return nfft2;
 }
 
 //***************************************************************************
-void sync_and_demodulate(double *id, double *qd, long np,
-                         unsigned char *symbols, double *f1, int ifmin, int 
ifmax, double fstep,
+void sync_and_demodulate(float *id, float *qd, long np,
+                         unsigned char *symbols, float *f1, int ifmin, int 
ifmax, float fstep,
                          int *shift1, int lagmin, int lagmax, int lagstep,
-                         double *drift1, int symfac, double *sync, int mode)
+                         float *drift1, int symfac, float *sync, int mode)
 {
     /***********************************************************************
      * mode = 0: no frequency or drift search. find best time lag.          *
@@ -195,18 +195,18 @@
      *           symbols using passed frequency and shift.                  *
      ************************************************************************/
     
-    static double fplast=-10000.0;
-    static double dt=1.0/375.0, df=375.0/256.0;
-    static double pi=3.14159265358979323846;
-    double twopidt, df15=df*1.5, df05=df*0.5;
+    static float fplast=-10000.0;
+    static float dt=1.0/375.0, df=375.0/256.0;
+    static float pi=3.14159265358979323846;
+    float twopidt, df15=df*1.5, df05=df*0.5;
 
     int i, j, k, lag;
-    double i0[162],q0[162],i1[162],q1[162],i2[162],q2[162],i3[162],q3[162];
-    double p0,p1,p2,p3,cmet,totp,syncmax,fac;
-    double c0[256],s0[256],c1[256],s1[256],c2[256],s2[256],c3[256],s3[256];
-    double dphi0, cdphi0, sdphi0, dphi1, cdphi1, sdphi1, dphi2, cdphi2, sdphi2,
+    float i0[162],q0[162],i1[162],q1[162],i2[162],q2[162],i3[162],q3[162];
+    float p0,p1,p2,p3,cmet,totp,syncmax,fac;
+    float c0[256],s0[256],c1[256],s1[256],c2[256],s2[256],c3[256],s3[256];
+    float dphi0, cdphi0, sdphi0, dphi1, cdphi1, sdphi1, dphi2, cdphi2, sdphi2,
     dphi3, cdphi3, sdphi3;
-    double f0=0.0, fp, ss, fbest=0.0, fsum=0.0, f2sum=0.0, fsymb[162];
+    float f0=0.0, fp, ss, fbest=0.0, fsum=0.0, f2sum=0.0, fsymb[162];
     int best_shift = 0, ifreq;
 
     syncmax=-1e30;
@@ -221,7 +221,7 @@
             ss=0.0;
             totp=0.0;
             for (i=0; i<162; i++) {
-                fp = f0 + (*drift1/2.0)*((double)i-81.0)/81.0;
+                fp = f0 + (*drift1/2.0)*((float)i-81.0)/81.0;
                 if( i==0 || (fp != fplast) ) {  // only calculate sin/cos if 
necessary
                     dphi0=twopidt*(fp-df15);
                     cdphi0=cos(dphi0);
@@ -332,23 +332,23 @@
 /***************************************************************************
  symbol-by-symbol signal subtraction
  ****************************************************************************/
-void subtract_signal(double *id, double *qd, long np,
-                     double f0, int shift0, double drift0, unsigned char* 
channel_symbols)
+void subtract_signal(float *id, float *qd, long np,
+                     float f0, int shift0, float drift0, unsigned char* 
channel_symbols)
 {
-    double dt=1.0/375.0, df=375.0/256.0;
+    float dt=1.0/375.0, df=375.0/256.0;
     int i, j, k;
-    double pi=4.*atan(1.0),twopidt, fp;
+    float pi=4.*atan(1.0),twopidt, fp;
     
-    double i0,q0;
-    double c0[256],s0[256];
-    double dphi, cdphi, sdphi;
+    float i0,q0;
+    float c0[256],s0[256];
+    float dphi, cdphi, sdphi;
     
     twopidt=2*pi*dt;
     
     for (i=0; i<162; i++) {
-        fp = f0 + ((double)drift0/2.0)*((double)i-81.0)/81.0;
+        fp = f0 + ((float)drift0/2.0)*((float)i-81.0)/81.0;
         
-        dphi=twopidt*(fp+((double)channel_symbols[i]-1.5)*df);
+        dphi=twopidt*(fp+((float)channel_symbols[i]-1.5)*df);
         cdphi=cos(dphi);
         sdphi=sin(dphi);
         
@@ -388,30 +388,30 @@
 /******************************************************************************
  Fully coherent signal subtraction
  
*******************************************************************************/
-void subtract_signal2(double *id, double *qd, long np,
-                      double f0, int shift0, double drift0, unsigned char* 
channel_symbols)
+void subtract_signal2(float *id, float *qd, long np,
+                      float f0, int shift0, float drift0, unsigned char* 
channel_symbols)
 {
-    double dt=1.0/375.0, df=375.0/256.0;
-    double pi=4.*atan(1.0), twopidt, phi=0, dphi, cs;
+    float dt=1.0/375.0, df=375.0/256.0;
+    float pi=4.*atan(1.0), twopidt, phi=0, dphi, cs;
     int i, j, k, ii, nsym=162, nspersym=256,  nfilt=256; //nfilt must be even 
number.
     int nsig=nsym*nspersym;
     int nc2=45000;
     
-    double *refi, *refq, *ci, *cq, *cfi, *cfq;
+    float *refi, *refq, *ci, *cq, *cfi, *cfq;
 
-    refi=malloc(sizeof(double)*nc2);
-    refq=malloc(sizeof(double)*nc2);
-    ci=malloc(sizeof(double)*nc2);
-    cq=malloc(sizeof(double)*nc2);
-    cfi=malloc(sizeof(double)*nc2);
-    cfq=malloc(sizeof(double)*nc2);
+    refi=malloc(sizeof(float)*nc2);
+    refq=malloc(sizeof(float)*nc2);
+    ci=malloc(sizeof(float)*nc2);
+    cq=malloc(sizeof(float)*nc2);
+    cfi=malloc(sizeof(float)*nc2);
+    cfq=malloc(sizeof(float)*nc2);
     
-    memset(refi,0,sizeof(double)*nc2);
-    memset(refq,0,sizeof(double)*nc2);
-    memset(ci,0,sizeof(double)*nc2);
-    memset(cq,0,sizeof(double)*nc2);
-    memset(cfi,0,sizeof(double)*nc2);
-    memset(cfq,0,sizeof(double)*nc2);
+    memset(refi,0,sizeof(float)*nc2);
+    memset(refq,0,sizeof(float)*nc2);
+    memset(ci,0,sizeof(float)*nc2);
+    memset(cq,0,sizeof(float)*nc2);
+    memset(cfi,0,sizeof(float)*nc2);
+    memset(cfq,0,sizeof(float)*nc2);
     
     twopidt=2.0*pi*dt;
     
@@ -427,11 +427,11 @@
     //
     for (i=0; i<nsym; i++) {
         
-        cs=(double)channel_symbols[i];
+        cs=(float)channel_symbols[i];
         
         dphi=twopidt*
         (
-         f0 + (drift0/2.0)*((double)i-(double)nsym/2.0)/((double)nsym/2.0)
+         f0 + (drift0/2.0)*((float)i-(float)nsym/2.0)/((float)nsym/2.0)
          + (cs-1.5)*df
          );
         
@@ -457,10 +457,10 @@
     }
     
     //quick and dirty filter - may want to do better
-    double w[nfilt], norm=0, partialsum[nfilt];
-    memset(partialsum,0,sizeof(double)*nfilt);
+    float w[nfilt], norm=0, partialsum[nfilt];
+    memset(partialsum,0,sizeof(float)*nfilt);
     for (i=0; i<nfilt; i++) {
-        w[i]=sin(pi*(double)i/(double)(nfilt-1));
+        w[i]=sin(pi*(float)i/(float)(nfilt-1));
         norm=norm+w[i];
     }
     for (i=0; i<nfilt; i++) {
@@ -510,12 +510,12 @@
 }
 
 unsigned long writec2file(char *c2filename, int trmin, double freq
-                          , double *idat, double *qdat)
+                          , float *idat, float *qdat)
 {
     int i;
-    double *buffer;
-    buffer=malloc(sizeof(double)*2*45000);
-    memset(buffer,0,sizeof(double)*2*45000);
+    float *buffer;
+    buffer=malloc(sizeof(float)*2*45000);
+    memset(buffer,0,sizeof(float)*2*45000);
     
     FILE *fp;
     
@@ -534,7 +534,7 @@
         buffer[2*i+1]=-qdat[i];
     }
     
-    nwrite = fwrite(buffer, sizeof(double), 2*45000, fp);
+    nwrite = fwrite(buffer, sizeof(float), 2*45000, fp);
     if( nwrite == 2*45000 ) {
         return nwrite;
     } else {
@@ -586,22 +586,22 @@
     int shift1, lagmin, lagmax, lagstep, ifmin, ifmax, worth_a_try, 
not_decoded;
     unsigned int nbits=81, stacksize=200000;
     unsigned int npoints, metric, cycles, maxnp;
-    double df=375.0/256.0/2;
-    double freq0[200],snr0[200],drift0[200],sync0[200];
+    float df=375.0/256.0/2;
+    float freq0[200],snr0[200],drift0[200],sync0[200];
     int shift0[200];
-    double dt=1.0/375.0, dt_print;
+    float dt=1.0/375.0, dt_print;
     double dialfreq_cmdline=0.0, dialfreq, freq_print;
     double dialfreq_error=0.0;
-    double fmin=-110, fmax=110;
-    double f1, fstep, sync1, drift1;
-    double psavg[512];
-    double *idat, *qdat;
+    float fmin=-110, fmax=110;
+    float f1, fstep, sync1, drift1;
+    float psavg[512];
+    float *idat, *qdat;
     clock_t t0,t00;
-    double tfano=0.0,treadwav=0.0,tcandidates=0.0,tsync0=0.0;
-    double tsync1=0.0,tsync2=0.0,ttotal=0.0;
+    float tfano=0.0,treadwav=0.0,tcandidates=0.0,tsync0=0.0;
+    float tsync1=0.0,tsync2=0.0,ttotal=0.0;
     
-    struct result { char date[7]; char time[5]; double sync; double snr;
-                    double dt; double freq; char message[23]; double drift;
+    struct result { char date[7]; char time[5]; float sync; float snr;
+                    float dt; float freq; char message[23]; float drift;
                     unsigned int cycles; int jitter; };
     struct result decodes[50];
     
@@ -617,32 +617,32 @@
 
     callsign=malloc(sizeof(char)*13);
     call_loc_pow=malloc(sizeof(char)*23);
-    double allfreqs[100];
+    float allfreqs[100];
     char allcalls[100][13];
-    memset(allfreqs,0,sizeof(double)*100);
+    memset(allfreqs,0,sizeof(float)*100);
     memset(allcalls,0,sizeof(char)*100*13);
     
     int uniques=0, noprint=0, ndecodes_pass=0;
     
     // Parameters used for performance-tuning:
-    unsigned int maxcycles=10000;             //Decoder timeout limit
-    double minsync1=0.10;                    //First sync limit
-    double minsync2=0.12;                    //Second sync limit
+    unsigned int maxcycles=10000;            //Decoder timeout limit
+    float minsync1=0.10;                     //First sync limit
+    float minsync2=0.12;                     //Second sync limit
     int iifac=8;                             //Step size in final DT peakup
     int symfac=50;                           //Soft-symbol normalizing factor
     int maxdrift=4;                          //Maximum (+/-) drift
-    double minrms=52.0 * (symfac/64.0);      //Final test for plausible 
decoding
+    float minrms=52.0 * (symfac/64.0);      //Final test for plausible decoding
     delta=60;                                //Fano threshold step
-    double bias=0.42;                        //Fano metric bias (used for both 
Fano and stack algorithms)
+    float bias=0.42;                        //Fano metric bias (used for both 
Fano and stack algorithms)
     
     t00=clock();
-    fftw_complex *fftin, *fftout;
+    fftwf_complex *fftin, *fftout;
 #include "./metric_tables.c"
     
     int mettab[2][256];
     
-    idat=malloc(sizeof(double)*maxpts);
-    qdat=malloc(sizeof(double)*maxpts);
+    idat=malloc(sizeof(float)*maxpts);
+    qdat=malloc(sizeof(float)*maxpts);
     
     while ( (c = getopt(argc, argv, "a:cC:de:f:HJmqstwvz:")) !=-1 ) {
         switch (c) {
@@ -714,7 +714,7 @@
         mettab[1][i]=round( 10*(metric_tables[2][255-i]-bias) );
     }
     
-    FILE *fp_fftw_wisdom_file, *fall_wspr, *fwsprd, *fhash, *ftimer;
+    FILE *fp_fftwf_wisdom_file, *fall_wspr, *fwsprd, *fhash, *ftimer;
     strcpy(wisdom_fname,".");
     strcpy(all_fname,".");
     strcpy(spots_fname,".");
@@ -732,9 +732,9 @@
     strncat(spots_fname,"/wspr_spots.txt",20);
     strncat(timer_fname,"/wspr_timer.out",20);
     strncat(hash_fname,"/hashtable.txt",20);
-    if ((fp_fftw_wisdom_file = fopen(wisdom_fname, "r"))) {  //Open FFTW wisdom
-        fftw_import_wisdom_from_file(fp_fftw_wisdom_file);
-        fclose(fp_fftw_wisdom_file);
+    if ((fp_fftwf_wisdom_file = fopen(wisdom_fname, "r"))) {  //Open FFTW 
wisdom
+        fftwf_import_wisdom_from_file(fp_fftwf_wisdom_file);
+        fclose(fp_fftwf_wisdom_file);
     }
     
     fall_wspr=fopen(all_fname,"a");
@@ -755,7 +755,7 @@
         
         t0 = clock();
         npoints=readwavfile(ptr_to_infile, wspr_type, idat, qdat);
-        treadwav += (double)(clock()-t0)/CLOCKS_PER_SEC;
+        treadwav += (float)(clock()-t0)/CLOCKS_PER_SEC;
         
         if( npoints == 1 ) {
             return 1;
@@ -782,12 +782,12 @@
 
     // Do windowed ffts over 2 symbols, stepped by half symbols
     int nffts=4*floor(npoints/512)-1;
-    fftin=(fftw_complex*) fftw_malloc(sizeof(fftw_complex)*512);
-    fftout=(fftw_complex*) fftw_malloc(sizeof(fftw_complex)*512);
-    PLAN3 = fftw_plan_dft_1d(512, fftin, fftout, FFTW_FORWARD, PATIENCE);
+    fftin=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*512);
+    fftout=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*512);
+    PLAN3 = fftwf_plan_dft_1d(512, fftin, fftout, FFTW_FORWARD, PATIENCE);
     
-    double ps[512][nffts];
-    double w[512];
+    float ps[512][nffts];
+    float w[512];
     for(i=0; i<512; i++) {
         w[i]=sin(0.006147931*i);
     }
@@ -811,7 +811,7 @@
         if( ipass > 0 && ndecodes_pass == 0 ) break;
         ndecodes_pass=0;
         
-        memset(ps,0.0, sizeof(double)*512*nffts);
+        memset(ps,0.0, sizeof(float)*512*nffts);
         for (i=0; i<nffts; i++) {
             for(j=0; j<512; j++ ) {
                 k=i*128+j;
@@ -818,7 +818,7 @@
                 fftin[j][0]=idat[k] * w[j];
                 fftin[j][1]=qdat[k] * w[j];
             }
-            fftw_execute(PLAN3);
+            fftwf_execute(PLAN3);
             for (j=0; j<512; j++ ) {
                 k=j+256;
                 if( k>511 )
@@ -828,7 +828,7 @@
         }
         
         // Compute average spectrum
-        memset(psavg,0.0, sizeof(double)*512);
+        memset(psavg,0.0, sizeof(float)*512);
         for (i=0; i<nffts; i++) {
             for (j=0; j<512; j++) {
                 psavg[j]=psavg[j]+ps[j][i];
@@ -837,7 +837,7 @@
         
         // Smooth with 7-point window and limit spectrum to +/-150 Hz
         int window[7]={1,1,1,1,1,1,1};
-        double smspec[411];
+        float smspec[411];
         for (i=0; i<411; i++) {
             smspec[i]=0.0;
             for(j=-3; j<=3; j++) {
@@ -847,14 +847,14 @@
         }
         
         // Sort spectrum values, then pick off noise level as a percentile
-        double tmpsort[411];
+        float tmpsort[411];
         for (j=0; j<411; j++) {
             tmpsort[j]=smspec[j];
         }
-        qsort(tmpsort, 411, sizeof(double), doublecomp);
+        qsort(tmpsort, 411, sizeof(float), floatcomp);
         
         // Noise level of spectrum is estimated as 123/411= 30'th percentile
-        double noise_level = tmpsort[122];
+        float noise_level = tmpsort[122];
         
         /* Renormalize spectrum so that (large) peaks represent an estimate of 
snr.
          * We know from experience that threshold snr is near -7dB in wspr 
bandwidth,
@@ -861,7 +861,7 @@
          * corresponding to -7-26.3=-33.3dB in 2500 Hz bandwidth.
          * The corresponding threshold is -42.3 dB in 2500 Hz bandwidth for 
WSPR-15. */
         
-        double min_snr, snr_scaling_factor;
+        float min_snr, snr_scaling_factor;
         min_snr = pow(10.0,-7.0/10.0); //this is min snr in wspr bw
         if( wspr_type == 2 ) {
             snr_scaling_factor=26.3;
@@ -924,7 +924,7 @@
         
         // bubble sort on snr, bringing freq along for the ride
         int pass;
-        double tmp;
+        float tmp;
         for (pass = 1; pass <= npk - 1; pass++) {
             for (k = 0; k < npk - pass ; k++) {
                 if (snr0[k] < snr0[k+1]) {
@@ -962,7 +962,7 @@
 
         int idrift,ifr,if0,ifd,k0;
         int kindex;
-        double smax,ss,pow,p0,p1,p2,p3;
+        float smax,ss,pow,p0,p1,p2,p3;
         for(j=0; j<npk; j++) {                              //For each 
candidate...
             smax=-1e30;
             if0=freq0[j]/df+256;
@@ -972,7 +972,7 @@
                         ss=0.0;
                         pow=0.0;
                         for (k=0; k<162; k++) {                             
//Sum over symbols
-                            ifd=ifr+((double)k-81.0)/81.0*( (double)idrift 
)/(2.0*df);
+                            ifd=ifr+((float)k-81.0)/81.0*( (float)idrift 
)/(2.0*df);
                             kindex=k0+2*k;
                             if( kindex < nffts ) {
                                 p0=ps[ifd-3][kindex];
@@ -1001,11 +1001,11 @@
                 }
             }
         }
-        tcandidates += (double)(clock()-t0)/CLOCKS_PER_SEC;
+        tcandidates += (float)(clock()-t0)/CLOCKS_PER_SEC;
 
         /*
          Refine the estimates of freq, shift using sync as a metric.
-         Sync is calculated such that it is a double taking values in the range
+         Sync is calculated such that it is a float taking values in the range
          [0.0,1.0].
          
          Function sync_and_demodulate has three modes of operation
@@ -1038,7 +1038,7 @@
             t0 = clock();
             sync_and_demodulate(idat, qdat, npoints, symbols, &f1, ifmin, 
ifmax, fstep, &shift1,
                                 lagmin, lagmax, lagstep, &drift1, symfac, 
&sync1, 0);
-            tsync0 += (double)(clock()-t0)/CLOCKS_PER_SEC;
+            tsync0 += (float)(clock()-t0)/CLOCKS_PER_SEC;
 
             fstep=0.25; ifmin=-2; ifmax=2;
             t0 = clock();
@@ -1047,7 +1047,7 @@
 
             // refine drift estimate
             fstep=0.0; ifmin=0; ifmax=0;
-            double driftp,driftm,syncp,syncm;
+            float driftp,driftm,syncp,syncm;
             driftp=drift1+0.5;
             sync_and_demodulate(idat, qdat, npoints, symbols, &f1, ifmin, 
ifmax, fstep, &shift1,
                                 lagmin, lagmax, lagstep, &driftp, symfac, 
&syncp, 1);
@@ -1064,7 +1064,7 @@
                 sync1=syncm;
             }
 
-            tsync1 += (double)(clock()-t0)/CLOCKS_PER_SEC;
+            tsync1 += (float)(clock()-t0)/CLOCKS_PER_SEC;
 
             // fine-grid lag and freq search
             if( sync1 > minsync1 ) {
@@ -1073,7 +1073,7 @@
                 t0 = clock();
                 sync_and_demodulate(idat, qdat, npoints, symbols, &f1, ifmin, 
ifmax, fstep, &shift1,
                                     lagmin, lagmax, lagstep, &drift1, symfac, 
&sync1, 0);
-                tsync0 += (double)(clock()-t0)/CLOCKS_PER_SEC;
+                tsync0 += (float)(clock()-t0)/CLOCKS_PER_SEC;
             
                 // fine search over frequency
                 fstep=0.05; ifmin=-2; ifmax=2;
@@ -1080,7 +1080,7 @@
                 t0 = clock();
                 sync_and_demodulate(idat, qdat, npoints, symbols, &f1, ifmin, 
ifmax, fstep, &shift1,
                                 lagmin, lagmax, lagstep, &drift1, symfac, 
&sync1, 1);
-                tsync1 += (double)(clock()-t0)/CLOCKS_PER_SEC;
+                tsync1 += (float)(clock()-t0)/CLOCKS_PER_SEC;
 
                 worth_a_try = 1;
             } else {
@@ -1088,7 +1088,7 @@
             }
             
             int idt=0, ii=0, jiggered_shift;
-            double y,sq,rms;
+            float y,sq,rms;
             not_decoded=1;
             
             while ( worth_a_try && not_decoded && idt<=(128/iifac)) {
@@ -1102,11 +1102,11 @@
                 sync_and_demodulate(idat, qdat, npoints, symbols, &f1, ifmin, 
ifmax, fstep,
                                     &jiggered_shift, lagmin, lagmax, lagstep, 
&drift1, symfac,
                                     &sync1, 2);
-                tsync2 += (double)(clock()-t0)/CLOCKS_PER_SEC;
+                tsync2 += (float)(clock()-t0)/CLOCKS_PER_SEC;
 
                 sq=0.0;
                 for(i=0; i<162; i++) {
-                    y=(double)symbols[i] - 128.0;
+                    y=(float)symbols[i] - 128.0;
                     sq += y*y;
                 }
                 rms=sqrt(sq/162.0);
@@ -1123,7 +1123,7 @@
                                            mettab,delta,maxcycles);
                     }
 
-                    tfano += (double)(clock()-t0)/CLOCKS_PER_SEC;
+                    tfano += (float)(clock()-t0)/CLOCKS_PER_SEC;
                     
                 }
                 idt++;
@@ -1236,15 +1236,15 @@
     }
     printf("<DecodeFinished>\n");
     
-    fftw_free(fftin);
-    fftw_free(fftout);
+    fftwf_free(fftin);
+    fftwf_free(fftout);
     
-    if ((fp_fftw_wisdom_file = fopen(wisdom_fname, "w"))) {
-        fftw_export_wisdom_to_file(fp_fftw_wisdom_file);
-        fclose(fp_fftw_wisdom_file);
+    if ((fp_fftwf_wisdom_file = fopen(wisdom_fname, "w"))) {
+        fftwf_export_wisdom_to_file(fp_fftwf_wisdom_file);
+        fclose(fp_fftwf_wisdom_file);
     }
 
-    ttotal += (double)(clock()-t00)/CLOCKS_PER_SEC;
+    ttotal += (float)(clock()-t00)/CLOCKS_PER_SEC;
 
     fprintf(ftimer,"%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n\n",
             treadwav,tcandidates,tsync0,tsync1,tsync2,tfano,ttotal);
@@ -1265,9 +1265,9 @@
     fclose(fwsprd);
     //  fclose(fdiag);
     fclose(ftimer);
-    fftw_destroy_plan(PLAN1);
-    fftw_destroy_plan(PLAN2);
-    fftw_destroy_plan(PLAN3);
+    fftwf_destroy_plan(PLAN1);
+    fftwf_destroy_plan(PLAN2);
+    fftwf_destroy_plan(PLAN3);
     
     if( usehashtable ) {
         fhash=fopen(hash_fname,"w");

------------------------------------------------------------------------------
Transform Data into Opportunity.
Accelerate data analysis in your applications with
Intel Data Analytics Acceleration Library.
Click to learn more.
http://pubads.g.doubleclick.net/gampad/clk?id=278785471&iu=/4140

_______________________________________________
wsjt-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/wsjt-devel

Re: [wsjt-devel] optimizing wsprd_exp.c for ARMv7-A

Reply via email to