/*
 * mult2: in-place product of the two-element blocks x[0..1] and y[0..1].
 * Only x is written; y is only read.
 *
 *   x[0].re = 4 * (x0.re*y0.re + x0.im*y0.im)
 *   x[0].im = 4 * (x0.re*y0.im + x0.im*y0.re)
 *   x[1].re = 4 * (x1.re*y1.re - x1.im*y1.im)
 *   x[1].im = 4 * (x1.re*y1.im + x1.im*y1.re)
 *
 * Element 1 is an ordinary complex product.  Element 0 uses '+' in its
 * .re formula, i.e. it is the product of (a + b z) and (c + d z) reduced
 * modulo z^2 - 1 rather than z^2 + 1.
 * NOTE(review): this suggests x[0] packs two real coefficients rather
 * than one complex value -- confirm against the caller's data layout.
 *
 * All eight products are formed before anything is stored to x.
 *
 * Deliberately NOT declared `inline`: a bare `inline` (no `static`, no
 * `extern`) is only an "inline definition" under C99/C11 and provides no
 * external definition, so the call in mult4 fails to link whenever the
 * compiler chooses not to inline it (e.g. at -O0).  A plain definition
 * has external linkage in every dialect, and the optimizer is still free
 * to inline it within this file.
 */
void mult2(register complex *x,register complex *y)
{
  register real t1, t2, t3, t4, t5, t6, t7, t8;

  /* the four cross products of element 0 ... */
  t1 = x[0].re; t1 *= y[0].re;
  t2 = x[0].im; t2 *= y[0].im;
  t3 = x[0].re; t3 *= y[0].im;
  t4 = x[0].im; t4 *= y[0].re;
  /* ... and of element 1 */
  t5 = x[1].re; t5 *= y[1].re;
  t6 = x[1].im; t6 *= y[1].im;
  t7 = x[1].re; t7 *= y[1].im;
  t8 = x[1].im; t8 *= y[1].re;
  t1 += t2;   /* element 0, .re: sum (not difference) of re*re and im*im */
  t3 += t4;   /* element 0, .im */
  t5 -= t6;   /* element 1, real part of the complex product */
  t7 += t8;   /* element 1, imaginary part */
  /* two doublings: scale every result by 4 */
  t1 += t1; t3 += t3; t5 += t5; t7 += t7;
  t1 += t1; t3 += t3; t5 += t5; t7 += t7;
  x[0].re = t1;
  x[0].im = t3;
  x[1].re = t5;
  x[1].im = t7;
}

/*
 * mult4: in-place combine of x[0..3] with y[0..3].  Only x is written;
 * y is only read.
 *
 * Elements 0 and 1 are handled by mult2(x,y).  Elements 2 and 3 are
 * processed as a pair.  With f0 = x[2], f1 = x[3], g0 = y[2], g1 = y[3]
 * and a trailing "b" meaning complex conjugation, the code computes
 *
 *   P = (f0 - f1b)(g0 - g1b)
 *   Q = (f0 + f1b)(g0 + g1b)
 *   S = Q - i P
 *   D = 2 (f0 g0 - f1b g1b)
 *
 * and stores x[2] = S + D and x[3] = conj(S - D).  This is the same
 * computation transform() performs on its first pair, with the twiddle
 * fixed at w = i.
 *
 * Every load of x[2], x[3], y[2], y[3] happens before the four stores at
 * the end of the function.
 *
 * The loads written as *(volatile real *)&... must be performed each
 * time they appear.  NOTE(review): presumably this, together with the
 * interleaved statement order, is hand scheduling to control register
 * use -- do not reorder or "simplify" without measuring.
 */
void mult4(complex *x,complex *y)
{
  register real t1, t2, t3, t4, t5, t6, t7, t8;

  mult2(x,y);

  /* t1 + t2 i = f0 - f1b,  t3 + t4 i = g0 - g1b */
  t1 = *(volatile real *)&x[2].re;
  t1 -= x[3].re;
  t2 = *(volatile real *)&x[2].im;
  t2 += x[3].im;
  t3 = *(volatile real *)&y[2].re;
  t3 -= y[3].re;
  t4 = *(volatile real *)&y[2].im;
  t4 += y[3].im;
  /* complex product P = (t1 + t2 i)(t3 + t4 i) into t5 + t6 i */
  t5 = t1;
  t5 *= t3;
  t6 = t2;
  t1 *= t4;

  t6 *= t3;

  t2 *= t4;
  t6 += t1;
  t1 = *(volatile real *)&x[2].re;   /* start loading f0 + f1b */
  t5 -= t2;
  t1 += x[3].re;
  t2 = *(volatile real *)&x[2].im;
  t8 = t5;                           /* t8 = Re P */
  t7 = t6;                           /* t7 = Im P */
  t2 -= x[3].im;
  /* t1 + t2 i = f0 + f1b,  t3 + t4 i = g0 + g1b */
  t3 = *(volatile real *)&y[2].re;
  t3 += y[3].re;
  t4 = *(volatile real *)&y[2].im;
  t4 -= y[3].im;

  /* complex product Q = (t1 + t2 i)(t3 + t4 i) into t5 + t6 i */
  t5 = t1;
  t5 *= t3;
  t6 = t2;
  t1 *= t4;

  t6 *= t3;

  t2 *= t4;
  t6 += t1;
  t1 = *(volatile real *)&x[2].re;
  t5 -= t2;
  t6 -= t8;                          /* Im S = Im Q - Re P */
  t1 *= y[2].re;
  t5 += t7;                          /* Re S = Re Q + Im P */
  /* S = Q - i P now in t5 + t6 i; next, f0 g0 into t1 + t2 i */
  t2 = *(volatile real *)&x[2].re;
  t2 *= y[2].im;
  t7 = *(volatile real *)&x[2].im;
  t7 *= y[2].im;
  t8 = *(volatile real *)&x[2].im;
  t8 *= y[2].re;
  t1 -= t7;
  /* f1 g1 into t3 + t4 i */
  t3 = *(volatile real *)&x[3].re;
  t2 += t8;
  t3 *= y[3].re;
  t4 = *(volatile real *)&x[3].re;
  t4 *= y[3].im;
  t7 = *(volatile real *)&x[3].im;
  t7 *= y[3].im;
  t8 = *(volatile real *)&x[3].im;
  t8 *= y[3].re;
  t3 -= t7;
  t7 = t5;                           /* second copy of Re S */
  t4 += t8;
  t1 -= t3;                          /* Re(f0 g0 - f1b g1b) */

  t2 += t4;                          /* Im(f0 g0 - f1b g1b) */
  t1 += t1;

  t2 += t2;
  /* D = 2(f0 g0 - f1b g1b) in t1 + t2 i */
  t5 -= t1;                          /* Re S - Re D */
  t7 += t1;                          /* Re S + Re D */
  t8 = t2;
  t2 -= t6;                          /* Im D - Im S */
  x[3].re = t5;
  t8 += t6;                          /* Im D + Im S */
  x[3].im = t2;
  x[2].re = t7;
  x[2].im = t8;
}

/*
 * transform: in-place combine of two element pairs of x with the
 * matching pairs of y.  Only x is written; y, *wre and *wim are only
 * read.
 *
 *   x, y  base pointers; indexed at 0, k, 1 and k^1 (see below)
 *   k     signed element offset from the first element of a pair to its
 *         partner (callers in this file pass positive and negative
 *         values)
 *   wre   points to the real part of the twiddle w
 *   wim   points to the imaginary part of w
 *
 * Notation: a trailing "b" is complex conjugation, w = *wre + i *wim.
 * For a pair (f0, f1) of x and the matching pair (g0, g1) of y:
 *
 *   P = (f0 - f1b)(g0 - g1b)
 *   Q = (f0 + f1b)(g0 + g1b)
 *   D = 2 (f0 g0 - f1b g1b)
 *
 * First pair:  f0 = x[0],   f1 = x[k],  S = Q - w P
 * Second pair: f0 = x[k^1], f1 = x[1],  S = Q + conj(w) P
 *
 * In both cases the results stored are  f0 <- S + D  and
 * f1 <- conj(S - D).  Within each pair every load precedes the four
 * stores.
 *
 * The loads written as *(volatile real *)&... must be performed each
 * time they appear.  NOTE(review): presumably this, together with the
 * interleaved statement order, is hand scheduling to control register
 * use -- do not reorder or "simplify" without measuring.
 */
static void transform(register complex *x,register complex *y,register int k,register const real *wre,register const real *wim)
{
  register real t1, t2, t3, t4, t5, t6, t7, t8;

  /* f0 in x[0], f1 in x[k] */
  /* g0 in y[0], g1 in y[k] */

  /* t1 + t2 i = f0 - f1b,  t3 + t4 i = g0 - g1b; P goes to t5 + t6 i */
  t1 = x[0].re; t1 -= x[k].re;
  t3 = y[0].re; t3 -= y[k].re;
  t2 = x[0].im; t2 += x[k].im;
  t5 = t1;
  t4 = y[0].im; t4 += y[k].im;
  t5 *= t3;
  t6 = t2;
  t2 *= t4;
  t7 = *(volatile real *)&*wre;
  t6 *= t3;
  t5 -= t2;                          /* t5 = Re P */
  t1 *= t4;
  t8 = *(volatile real *)&*wre;
  t7 *= t5;                          /* wre * Re P */
  t6 += t1;                          /* t6 = Im P */
  t5 *= *wim;                        /* wim * Re P */
  t1 = *(volatile real *)&x[0].re;   /* start loading f0 + f1b */
  t8 *= t6;                          /* wre * Im P */
  t1 += x[k].re;
  t6 *= *wim;                        /* wim * Im P */
  t2 = *(volatile real *)&x[0].im;
  t8 += t5;                          /* t8 = Im(w P) */
  t7 -= t6;                          /* t7 = Re(w P) */
  t2 -= x[k].im;
  /* t1 + t2 i = f0 + f1b,  t3 + t4 i = g0 + g1b; Q goes to t5 + t6 i */
  t3 = *(volatile real *)&y[0].re;
  t3 += y[k].re;
  t4 = *(volatile real *)&y[0].im;
  t4 -= y[k].im;
  t5 = t1;
  t5 *= t3;
  t6 = t2;
  t6 *= t3;

  t1 *= t4;

  t2 *= t4;
  t6 += t1;
  t1 = *(volatile real *)&x[0].re;
  t5 -= t2;
  t1 *= y[0].re;
  t5 -= t7;
  t6 -= t8;
  /* (f0+f1b)(g0+g1b)-w(f0-f1b)(g0-g1b) in t5 + t6i */
  /* next: f0 g0 into t1 + t2 i and f1 g1 into t3 + t4 i */
  t2 = *(volatile real *)&x[0].re;
  t2 *= y[0].im;
  t7 = *(volatile real *)&x[0].im; t7 *= y[0].im;
  t8 = *(volatile real *)&x[0].im; t8 *= y[0].re;
  t1 -= t7;
  t3 = *(volatile real *)&x[k].re;
  t2 += t8;
  t3 *= y[k].re;
  t4 = *(volatile real *)&x[k].re; t4 *= y[k].im;
  t7 = *(volatile real *)&x[k].im; t7 *= y[k].im;
  t8 = *(volatile real *)&x[k].im; t8 *= y[k].re;
  t3 -= t7;
  t7 = t5;                           /* second copy of Re S */
  t4 += t8;
  t1 -= t3;

  t2 += t4;

  t1 += t1;
  t2 += t2;
  /* 2(f0g0-f1bg1b) in t1 + t2i */
  /* given more registers, (f0-f1b)(g0+g1b)+(f0+f1b)(g0-g1b) is faster */

  t5 -= t1;                          /* Re S - Re D */
  t7 += t1;                          /* Re S + Re D */
  t8 = t2;
  t2 -= t6;                          /* Im D - Im S */
  t8 += t6;                          /* Im D + Im S */
  x[k].re = t5;
  x[0].re = t7;
  x[k].im = t2;
  x[0].im = t8;

  /*
   * Re-base for the second pair: afterwards x[0] is the caller's
   * x[k^1] and x[k] is the caller's x[1] (likewise for y).
   */
  k ^= 1;
  x += k;
  y += k;
  k = 1 - k;

  /* same P as above, for the new pair */
  t1 = *(volatile real *)&x[0].re; t1 -= x[k].re;
  t3 = *(volatile real *)&y[0].re; t3 -= y[k].re;
  t2 = *(volatile real *)&x[0].im; t2 += x[k].im;
  t4 = *(volatile real *)&y[0].im; t4 += y[k].im;
  t5 = t1;
  t5 *= t3;
  t6 = t2;
  t2 *= t4;
  t7 = *(volatile real *)&*wre;
  t6 *= t3;
  t5 -= t2;                          /* t5 = Re P */
  t1 *= t4;
  t8 = *(volatile real *)&*wre;
  t7 *= t5;                          /* wre * Re P */
  t6 += t1;                          /* t6 = Im P */
  t5 *= *wim;                        /* wim * Re P */
  t1 = *(volatile real *)&x[0].re;
  t8 *= t6;                          /* wre * Im P */
  t1 += x[k].re;
  t6 *= *wim;                        /* wim * Im P */
  t2 = *(volatile real *)&x[0].im;
  t8 -= t5;                          /* t8 = Im(conj(w) P) */
  t7 += t6;                          /* t7 = Re(conj(w) P) */
  t2 -= x[k].im;
  /* Q for the new pair into t5 + t6 i */
  t3 = *(volatile real *)&y[0].re;
  t3 += y[k].re;
  t4 = *(volatile real *)&y[0].im;
  t4 -= y[k].im;
  t5 = t1;
  t5 *= t3;
  t6 = t2;
  t6 *= t3;

  t1 *= t4;

  t2 *= t4;
  t6 += t1;
  t1 = *(volatile real *)&x[0].re;
  t5 -= t2;
  t6 += t8;
  t1 *= y[0].re;
  t5 += t7;
  /* S = Q + conj(w) P now in t5 + t6 i (sign flipped vs. first pair) */
  t2 = *(volatile real *)&x[0].re; t2 *= y[0].im;
  t7 = *(volatile real *)&x[0].im; t7 *= y[0].im;
  t8 = *(volatile real *)&x[0].im; t8 *= y[0].re;
  t1 -= t7;
  t3 = *(volatile real *)&x[k].re;
  t2 += t8;
  t3 *= y[k].re;
  t4 = *(volatile real *)&x[k].re;
  t4 *= y[k].im;
  t7 = *(volatile real *)&x[k].im;
  t7 *= y[k].im;
  t8 = *(volatile real *)&x[k].im;
  t8 *= y[k].re;
  t3 -= t7;
  t7 = t5;                           /* second copy of Re S */
  t4 += t8;
  t1 -= t3;

  t2 += t4;

  t1 += t1;
  t2 += t2;
  /* D = 2(f0 g0 - f1b g1b) in t1 + t2 i, as for the first pair */

  t5 -= t1;
  t7 += t1;
  t8 = t2;
  t2 -= t6;
  t8 += t6;
  x[k].re = t5;
  x[0].re = t7;
  x[k].im = t2;
  x[0].im = t8;
}

/*
 * mult8: in-place combine of x[0..7] with y[0..7].
 * Elements 4..7 go through one transform() call (pair offset 2, both
 * twiddle components taken from sqrthalf); elements 0..3 go to mult4().
 */
void mult8(complex *x,complex *y)
{
  complex *xhi = &x[4];
  complex *yhi = &y[4];

  transform(xhi,yhi,2,&sqrthalf,&sqrthalf);
  mult4(x,y);
}

/*
 * mult16: in-place combine of x[0..15] with y[0..15].
 * Two transform() calls cover elements 8..15, both using roots16[0];
 * the second passes its .im as the real component and its .re as the
 * imaginary one.  Elements 0..7 go to mult8().
 */
void mult16(complex *x,complex *y)
{
  complex *xhi = &x[8];
  complex *yhi = &y[8];

  transform(xhi,yhi,4,&roots16[0].re,&roots16[0].im);
  /* base element 14, partner 3 elements below it */
  transform(xhi + 6,yhi + 6,-3,&roots16[0].im,&roots16[0].re);

  mult8(x,y);
}

/*
 * mult32: in-place combine of x[0..31] with y[0..31].
 * Four transform() calls cover elements 16..31 using roots32[0] and
 * roots32[2]; each root is used once as (re, im) and once with the two
 * components swapped.  Elements 0..15 go to mult16().
 */
void mult32(complex *x,complex *y)
{
  complex *xhi = &x[16];
  complex *yhi = &y[16];

  transform(xhi,yhi,8,&roots32[0].re,&roots32[0].im);
  transform(xhi + 12,yhi + 12,-6,&roots32[2].re,&roots32[2].im);
  transform(xhi + 4,yhi + 4,10,&roots32[2].im,&roots32[2].re);
  transform(xhi + 10,yhi + 10,-7,&roots32[0].im,&roots32[0].re);

  mult16(x,y);
}

/*
 * Traversal state shared by mult(), doit() and dofreq() while walking
 * the transform() calls of one multiplication.
 */
struct status {
  complex *x;        /* base of the array updated in place; dofreq() offsets it by j */
  complex *y;        /* base of the read-only operand; offset by the same j */
  const complex *w;  /* cursor into the roots table: dofreq() post-increments it
                        while count > 0 and pre-decrements it once count < 0 */
  int count;         /* pre-decremented by every dofreq() call; its sign selects
                        how the twiddle pointers are chosen */
} ;

/*
 * dofreq: pick the twiddle for the next transform() call and run it on
 * xy->x + j, xy->y + j with pair offset k.
 *
 * xy->count is decremented first; the new value selects the twiddle:
 *   > 0   take the root at xy->w as (re, im), then advance xy->w
 *   == 0  use sqrthalf for both components (xy->w is left alone)
 *   < 0   step xy->w back one entry and use that root with its
 *         components swapped (im as real part, re as imaginary part)
 */
static inline void dofreq(struct status *xy,int j,int k)
{
  const real *wre;
  const real *wim;
  const complex *root;
  const int remaining = --xy->count;

  if (remaining == 0) {
    wre = &sqrthalf;
    wim = wre;
  } else if (remaining > 0) {
    root = xy->w++;
    wre = &root->re;
    wim = &root->im;
  } else {
    root = --xy->w;
    wre = &root->im;
    wim = &root->re;
  }

  transform(xy->x + j,xy->y + j,k,wre,wim);
}

/*
 * doit: recursive scheduler for dofreq() calls.
 *
 * Below level 2 it issues a single dofreq(xy,j,k).  Otherwise it
 * recurses three times, in this order:
 *   level-2 with stride 4n, offset j+3n, pair offset k-n
 *   level-1 with stride 2n, offset j,    pair offset k
 *   level-2 with stride 4n, offset j+2n, pair offset k+n
 * The order matters because dofreq() consumes twiddles from xy
 * sequentially.
 */
static void doit(int level,struct status *xy,int n,int j,int k)
{
  if (level < 2) {
    dofreq(xy,j,k);
    return;
  }

  doit(level - 2,xy,4 * n,3 * n + j,k - n);
  doit(level - 1,xy,2 * n,j,k);
  doit(level - 2,xy,4 * n,2 * n + j,k + n);
}

/*
 * mult: generic driver behind the fixed-size multN entry points below.
 *
 *   n      size argument supplied by the wrapper (64, 128, ...)
 *   roots  table of twiddles consumed through struct status
 *   x, y   operands; x is updated in place
 *
 * The first loop starts at span = n/4 and halves span while raising the
 * recursion depth handed to doit().  The second loop starts at span = 4
 * and doubles it while it stays below n/2, lowering the depth each
 * time.  Elements 0..3 are finished by mult4().
 */
static void mult(int n,const complex *roots,complex *x,complex *y)
{
  struct status st;
  const int half = n >> 1;
  int span = half >> 1;
  int depth = 0;

  st.x = x;
  st.y = y;
  st.w = roots;
  st.count = span >> 1;

  do {
    doit(depth,&st,span * 4,span * 2,span);
    ++depth;
    span >>= 1;
  } while (span > 1);

  --depth;
  span = 4;
  do {
    --depth;
    doit(depth,&st,span * 4,span * 3 + 2,1 - span);
    span <<= 1;
  } while (span < half);

  mult4(x,y);
}

/* Size-64 entry point: runs the generic driver mult() with the roots64 table. */
void mult64(complex *x,complex *y)
{
  mult(64,roots64,x,y);
}
/* Size-128 entry point: runs the generic driver mult() with the roots128 table. */
void mult128(complex *x,complex *y)
{
  mult(128,roots128,x,y);
}
/* Size-256 entry point: runs the generic driver mult() with the roots256 table. */
void mult256(complex *x,complex *y)
{
  mult(256,roots256,x,y);
}
/* Size-512 entry point: runs the generic driver mult() with the roots512 table. */
void mult512(complex *x,complex *y)
{
  mult(512,roots512,x,y);
}
/* Size-1024 entry point: runs the generic driver mult() with the roots1024 table. */
void mult1024(complex *x,complex *y)
{
  mult(1024,roots1024,x,y);
}
/* Size-2048 entry point: runs the generic driver mult() with the roots2048 table. */
void mult2048(complex *x,complex *y)
{
  mult(2048,roots2048,x,y);
}
/* Size-4096 entry point: runs the generic driver mult() with the roots4096 table. */
void mult4096(complex *x,complex *y)
{
  mult(4096,roots4096,x,y);
}
/* Size-8192 entry point: runs the generic driver mult() with the roots8192 table. */
void mult8192(complex *x,complex *y)
{
  mult(8192,roots8192,x,y);
}
/* Size-16384 entry point: runs the generic driver mult() with the roots16384 table. */
void mult16384(complex *x,complex *y)
{
  mult(16384,roots16384,x,y);
}
/* Size-32768 entry point: runs the generic driver mult() with the roots32768 table. */
void mult32768(complex *x,complex *y)
{
  mult(32768,roots32768,x,y);
}
/* Size-65536 entry point: runs the generic driver mult() with the roots65536 table. */
void mult65536(complex *x,complex *y)
{
  mult(65536,roots65536,x,y);
}
