#include <stdio.h>
#include <stdlib.h>

#include "fftc4.h"
#include "fftc8.h"
#include "timing.h"

/* Shared transform buffers: main allocates 8192 elements for each and
   zeroes them; doit8 and doit4 pass them to the routines being timed. */
complex8 *x8;
complex4 *x4;

/* Number of back-to-back timed calls doit4/doit8 make per routine. */
#define TIMINGS 10
/* Index of the first sample print() reports; samples 0..TIMINGSLOW-1 are
   recorded but never printed (presumably treated as warm-up runs). */
#define TIMINGSLOW 5

void doit4(register timing t[2][TIMINGS],register void (*fft)())
{
  register complex4 *x = x4;
  register int j;
  for (j = 0;j < TIMINGS;++j) {
    timing_now(&t[0][j]); fft(x); timing_now(&t[1][j]);
  }
}

void doit8(register timing t[2][TIMINGS],register void (*fft)())
{
  register complex8 *x = x8;
  register int j;
  for (j = 0;j < TIMINGS;++j) {
    timing_now(&t[0][j]); fft(x); timing_now(&t[1][j]);
  }
}

/*
 * Print one result line for a sample table filled by doit4/doit8.
 *
 *   t    - samples: t[0][j] is the stamp before call j, t[1][j] the stamp
 *          after it.
 *   size - value printed at the start of the line and used as the divisor
 *          for the final per-point figure.
 *
 * Only samples TIMINGSLOW..TIMINGS-1 are shown.  The line ends with the
 * smallest of the printed differences divided by size.
 */
void print(timing t[2][TIMINGS],int size)
{
  double diff;
  /* Initialised so the value is defined even if TIMINGSLOW >= TIMINGS and
     the loop below never executes. */
  double diffmin = 0.0;
  int j;

  printf("%4d:",size);
  for (j = TIMINGSLOW;j < TIMINGS;++j) {
    diff = timing_diff(&t[1][j],&t[0][j]);
    printf(" %9.0f",diff);
    /* The first reported sample seeds the running minimum. */
    if ((j == TIMINGSLOW) || (diff < diffmin)) diffmin = diff;
  }
  printf("  %10.3f/pt\n",diffmin / size);
}

/* Stamps taken by main just before the first and just after the last
   doit call, once with each clock (timing_now and timing_basic_now).
   main divides the two elapsed spans to report nanoseconds per tick. */
timing start;
timing_basic startb;
timing finish;
timing_basic finishb;

/* Sample storage, one [2][TIMINGS] table per routine: row 0 holds the stamp
   taken before each call, row 1 the stamp taken after it.  t4/t8 collect
   the fftc4_N/fftc8_N routines and u4/u8 the matching _unN routines.  main
   fills slots 0..12 only (sizes 2 through 8192); the remaining slots of
   the 20 appear to be unused in this file. */
timing t4[20][2][TIMINGS];
timing u4[20][2][TIMINGS];
timing t8[20][2][TIMINGS];
timing u8[20][2][TIMINGS];

void main()
{
  int j;

  x8 = (complex8 *) malloc(8192 * sizeof(complex8));
  if (!x8) exit(1);
  x4 = (complex4 *) malloc(8192 * sizeof(complex4));
  if (!x4) exit(1);

  for (j = 0;j < 8192;++j) x4[j].re = x4[j].im = 0;
  for (j = 0;j < 8192;++j) x8[j].re = x8[j].im = 0;
  /* are there any modern cpus where fp timing is data-dependent? */
  /* aside from overflows, of course. */

  timing_basic_now(&startb);
  timing_now(&start);

  doit4(t4[0],fftc4_2);
  doit4(u4[0],fftc4_un2);
  doit4(t4[1],fftc4_4);
  doit4(u4[1],fftc4_un4);
  doit4(t4[2],fftc4_8);
  doit4(u4[2],fftc4_un8);
  doit4(t4[3],fftc4_16);
  doit4(u4[3],fftc4_un16);
  doit4(t4[4],fftc4_32);
  doit4(u4[4],fftc4_un32);
  doit4(t4[5],fftc4_64);
  doit4(u4[5],fftc4_un64);
  doit4(t4[6],fftc4_128);
  doit4(u4[6],fftc4_un128);
  doit4(t4[7],fftc4_256);
  doit4(u4[7],fftc4_un256);
  doit4(t4[8],fftc4_512);
  doit4(u4[8],fftc4_un512);
  doit4(t4[9],fftc4_1024);
  doit4(u4[9],fftc4_un1024);
  doit4(t4[10],fftc4_2048);
  doit4(u4[10],fftc4_un2048);
  doit4(t4[11],fftc4_4096);
  doit4(u4[11],fftc4_un4096);
  doit4(t4[12],fftc4_8192);
  doit4(u4[12],fftc4_un8192);

  doit8(t8[0],fftc8_2);
  doit8(u8[0],fftc8_un2);
  doit8(t8[1],fftc8_4);
  doit8(u8[1],fftc8_un4);
  doit8(t8[2],fftc8_8);
  doit8(u8[2],fftc8_un8);
  doit8(t8[3],fftc8_16);
  doit8(u8[3],fftc8_un16);
  doit8(t8[4],fftc8_32);
  doit8(u8[4],fftc8_un32);
  doit8(t8[5],fftc8_64);
  doit8(u8[5],fftc8_un64);
  doit8(t8[6],fftc8_128);
  doit8(u8[6],fftc8_un128);
  doit8(t8[7],fftc8_256);
  doit8(u8[7],fftc8_un256);
  doit8(t8[8],fftc8_512);
  doit8(u8[8],fftc8_un512);
  doit8(t8[9],fftc8_1024);
  doit8(u8[9],fftc8_un1024);
  doit8(t8[10],fftc8_2048);
  doit8(u8[10],fftc8_un2048);
  doit8(t8[11],fftc8_4096);
  doit8(u8[11],fftc8_un4096);
  doit8(t8[12],fftc8_8192);
  doit8(u8[12],fftc8_un8192);

  timing_basic_now(&finishb);
  timing_now(&finish);

  print(t4[0],2);
  print(u4[0],2);
  print(t4[1],4);
  print(u4[1],4);
  print(t4[2],8);
  print(u4[2],8);
  print(t4[3],16);
  print(u4[3],16);
  print(t4[4],32);
  print(u4[4],32);
  print(t4[5],64);
  print(u4[5],64);
  print(t4[6],128);
  print(u4[6],128);
  print(t4[7],256);
  print(u4[7],256);
  print(t4[8],512);
  print(u4[8],512);
  print(t4[9],1024);
  print(u4[9],1024);
  print(t4[10],2048);
  print(u4[10],2048);
  print(t4[11],4096);
  print(u4[11],4096);
  print(t4[12],8192);
  print(u4[12],8192);

  print(t8[0],2);
  print(u8[0],2);
  print(t8[1],4);
  print(u8[1],4);
  print(t8[2],8);
  print(u8[2],8);
  print(t8[3],16);
  print(u8[3],16);
  print(t8[4],32);
  print(u8[4],32);
  print(t8[5],64);
  print(u8[5],64);
  print(t8[6],128);
  print(u8[6],128);
  print(t8[7],256);
  print(u8[7],256);
  print(t8[8],512);
  print(u8[8],512);
  print(t8[9],1024);
  print(u8[9],1024);
  print(t8[10],2048);
  print(u8[10],2048);
  print(t8[11],4096);
  print(u8[11],4096);
  print(t8[12],8192);
  print(u8[12],8192);

  printf("Timings are in ticks. Nanoseconds per tick: approximately %f.\n"
    ,timing_basic_diff(&finishb,&startb) / timing_diff(&finish,&start));
  printf("Timings may be underestimates on systems without hardware tick support.\n");

  exit(0);
}
