#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#include <cassert>
#include <climits>
#include <cmath>
#include <complex>
#include <iostream>

#include <fftw3.h>

using namespace std;

// Collapses a timespec (seconds + nanoseconds) into a single nanosecond count.
uint64_t ts_to_nsec(const timespec &t)
{
  const unsigned long long nsec_per_sec = 1000000000ULL;
  // Scale the seconds field in unsigned 64-bit arithmetic, then add the
  // sub-second remainder.
  const unsigned long long whole_seconds = nsec_per_sec * t.tv_sec;
  return whole_seconds + t.tv_nsec;
}

int main(int argc, char **argv)
{
  assert(argc == 2);

  int N = atoi(argv[1]);

  fftw_complex *in = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N);
  fftw_complex *out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N);
  for(int i = 0; i < N; i++)
    in[i][0] = in[i][1] = 1.0;
  
  fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_EXHAUSTIVE | FFTW_DESTROY_INPUT);
  cout << "Done planning\n";
  double add, mul, fma;
  fftw_flops(p, &add, &mul, &fma);

  timespec start, end;
  clock_gettime(CLOCK_MONOTONIC, &start);
  int iters = 100;

  for(int i = 0; i < iters; i++)
    fftw_execute(p);

  clock_gettime(CLOCK_MONOTONIC, &end);

  uint64_t nsec = ts_to_nsec(end) - ts_to_nsec(start);
  double t = double(nsec) * 1e-9;
  double time_per_sample = t / (N * iters);
  cout << "Took " << nsec << "ns, which is " << 1/time_per_sample << " samples per second\n";
  cout << (add + mul + fma) * iters * 1e-6 / t << " MFLOPS\n";
  cout << add/N << " adds " << mul/N << " muls " << fma/N << " fmas\n";
  cout << N * iters * 2 * sizeof(fftw_complex) * 8 * 1e-6 / t << "Mbps\n";
  cout << 5 * (log(N) / log(2)) * N / (t * 1e6 / iters) << "MFLOPS (canonical)\n";
  fftw_print_plan(p);
  cout << endl;
  fftw_destroy_plan(p);
}
