/* assumes that sizeof(fftr) is a multiple of 4 */

/*
 * Addressing helpers.  They are not self-contained: each one expands to an
 * expression over the locals a, b, n, w and j of the function it is used in.
 *
 * The functions in this file that use them multiply n by 4 on entry, so the
 * byte offset n * (sizeof(fftc) / 4) is "caller's n" elements of fftc, as
 * long as sizeof(fftc) is divisible by 4.
 *
 *   A0  a
 *   A1  a advanced by n/4 fftc elements
 *   A2  b
 *   A3  b advanced by n/4 fftc elements
 */
#define A0 (a)
#define A1 ((fftc *) ((char *) a + n * (sizeof(fftc) / 4)))
#define A2 (b)
#define A3 ((fftc *) ((char *) b + n * (sizeof(fftc) / 4)))

/*
 * WR  w advanced by n/4 fftr elements (byte offset n * (sizeof(fftr) / 4))
 * WI  w advanced by j/2 fftr elements (byte offset j * (sizeof(fftr) / 2))
 */
#define WR ((fftr *) ((char *) w + n * (sizeof(fftr) / 4)))
#define WI ((fftr *) ((char *) w + j * (sizeof(fftr) / 2)))


/* a[0...4n-1]; w[n...2n-2]; n >= 2 */
/*
 * fft_pass: in-place butterfly pass over four equal slices of a.
 *
 * Writing N for the caller's n, and for k = 0 .. N-1:
 *   A0 = a[k], A1 = a[N+k], A2 = a[2N+k], A3 = a[3N+k]
 *
 *   d0 = A0 - A2          d1 = A1 - A3        (from the old values)
 *   A0 <- A0 + A2         A1 <- A1 + A3
 *   x  = d0 + i*d1        y  = d0 - i*d1
 *   A2 <- x * (wr + i*wi) A3 <- y * (wr - i*wi)
 *
 * k = 0 is peeled off before the loop and stores x and y without any
 * multiplication.  For k >= 1 the factors are read through WR and WI:
 * wr = w[N+k-1] (walks upward), wi = w[2N-1-k] (walks downward), indices
 * relative to the w passed in.
 *
 * The loop tests j only after its body has run, so j must start nonzero;
 * that is the n >= 2 requirement.
 */
void inline fft_pass(register fftc *a,register fftr *w,register int n)
{
  register fftr s1, s2, s3, s4, s5, s6, s7, s8;
  register fftc *b;
  register int j;

  /* From here on n is 4x the caller's value; the A1/A3/WR macros rely on it. */
  n *= 4;

  b = a + n / 2;
  /* j counts down by 4 per loop iteration and also positions WI. */
  j = n - 4;

  /* ---- k = 0: no multiplication ---- */
  s1 = A0->re;
  s2 = A0->im;
  s3 = A1->re;
  s4 = A1->im;
  s5 = A0->re;
  s6 = A0->im;
  s7 = A1->re;
  s8 = A1->im;
  s5 += A2->re;
  s6 += A2->im;
  s1 -= A2->re;
  s2 -= A2->im;
  A0->re = s5;
  s7 += A3->re;
  s8 += A3->im;
  s3 -= A3->re;
  s4 -= A3->im;
  A0->im = s6;
  A1->re = s7;
  A1->im = s8;

  /* Now (s1,s2) = d0 and (s3,s4) = d1. */
  s5 = s1;
  s6 = s2;

  /* Pointers advance early; the stores below use [-1] to reach element 0. */
  ++a;
  ++b;

  s5 -= s4;   /* x.re = d0.re - d1.im */
  s6 += s3;   /* x.im = d0.im + d1.re */
  s2 -= s3;   /* y.im = d0.im - d1.re */
  s1 += s4;   /* y.re = d0.re + d1.im */

  A2[-1].re = s5;
  A2[-1].im = s6;
  A3[-1].im = s2;
  A3[-1].re = s1;

  /* ---- k = 1 .. N-1 ---- */
  loop:
    /* Sums go back to A0/A1; differences d0 = (s1,s2), d1 = (s3,s4). */
    s1 = A0->re;
    s2 = A0->im;
    s3 = A1->re;
    s4 = A1->im;
    s5 = A0->re;
    s6 = A0->im;
    s7 = A1->re;
    s8 = A1->im;
    s5 += A2->re;
    s6 += A2->im;
    s1 -= A2->re;
    s2 -= A2->im;
    A0->re = s5;
    s7 += A3->re;
    s8 += A3->im;
    s3 -= A3->re;
    s4 -= A3->im;
    A0->im = s6;
    A1->re = s7;
    s5 = s1;
    A1->im = s8;
    /* x = d0 + i*d1 is built in s5 (re) and s6 (im), then multiplied. */
    s5 -= s4;
    s6 = s2;
    s8 = *WR;
    s7 = s5;
    s5 *= *WI;
    s6 += s3;
    s7 *= s8;
    /* y = d0 - i*d1 is built in s1 (re) and s2 (im). */
    s1 += s4;
    s8 *= s6;
    s2 -= s3;
    s6 *= *WI;
    s5 += s8;   /* x.re*wi + x.im*wr */
    s8 = s1;
    s1 *= *WI;
    s7 -= s6;   /* x.re*wr - x.im*wi */
    s8 *= *WR;
    s6 = s2;
    A2->im = s5;
    s6 *= *WR;
    A2->re = s7;
    s2 *= *WI;
    s6 -= s1;   /* y.im*wr - y.re*wi */

    /* Both factors for this k have been read; step to the next pair. */
    ++w;
    j -= 4;

    s8 += s2;   /* y.re*wr + y.im*wi */
    A3->im = s6;
    A3->re = s8;

    ++a;
    ++b;

  if (j) goto loop;

  return;
}

/*
 * fft_unpass: in-place butterfly pass over four equal slices of a, with the
 * multiplications applied to A2/A3 before the sums and differences.
 *
 * Writing N for the caller's n, and for k = 0 .. N-1:
 *   A0 = a[k], A1 = a[N+k], A2 = a[2N+k], A3 = a[3N+k]
 *
 *   u = A2 * (wr - i*wi)     v = A3 * (wr + i*wi)
 *   p = u + v                q = -i * (u - v)
 *   A0 <- A0 + p             A2 <- A0 - p
 *   A1 <- A1 + q             A3 <- A1 - q
 *
 * k = 0 is peeled off before the loop with u = A2 and v = A3 (no
 * multiplication).  For k >= 1, wr = w[N+k-1] and wi = w[2N-1-k], indices
 * relative to the w passed in.
 *
 * The loop is a do/while on j, which starts at 4N-4, so the caller's n must
 * be at least 2 for the loop to terminate.
 *
 * NOTE(review): the name and the conjugated factor order suggest this is the
 * unnormalized counterpart of fft_pass; that relationship is not verified
 * here.
 */
void inline fft_unpass(register fftc *a,register fftr *w,register int n)
{
  register fftr s1, s2, s3, s4, s5, s6, s7, s8;
  register fftc *b;
  register int j;

  /* From here on n is 4x the caller's value; the A1/A3/WR macros rely on it. */
  n *= 4;

  b = a + n / 2;
  /* j counts down by 4 per loop iteration and also positions WI. */
  j = n - 4;

  /* ---- k = 0: no multiplication ---- */
  s1 = A2->im;
  s2 = A3->re;
  s3 = A2->im;
  s4 = A3->re;
  s1 += A3->im;   /* p.im = A2.im + A3.im */
  s2 += A2->re;   /* p.re = A3.re + A2.re */
  s3 -= A3->im;   /* q.re = A2.im - A3.im */
  s4 -= A2->re;   /* q.im = A3.re - A2.re */
  s5 = A0->re;
  s6 = A0->im;
  s7 = A1->re;
  s5 -= s2;
  s6 -= s1;
  s7 -= s3;
  A2->re = s5;
  A2->im = s6;
  A3->re = s7;
  s5 = A1->im;

  /* Pointers advance early; the accesses below use [-1] to reach element 0. */
  ++a;
  ++b;

  s5 -= s4;
  s4 += A1[-1].im;
  s2 += A0[-1].re;
  s1 += A0[-1].im;
  s3 += A1[-1].re;
  A3[-1].im = s5;
  A0[-1].re = s2;
  A0[-1].im = s1;
  A1[-1].re = s3;
  A1[-1].im = s4;

  /* ---- k = 1 .. N-1 ---- */
  do {
    /* u = A2 * (wr - i*wi): real part ends up in s1, imaginary part in s4. */
    s1 = A2->re;
    s2 = A2->im;
    s3 = A2->re;
    s1 *= *WR;
    s4 = A2->im;
    s2 *= *WI;
    s5 = A3->re;
    s3 *= *WI;
    s1 += s2;
    s4 *= *WR;
    s6 = A3->im;
    /* v = A3 * (wr + i*wi): real part ends up in s5, imaginary part in s7. */
    s5 *= *WR;
    s7 = A3->re;
    s4 -= s3;
    s6 *= *WI;
    s3 = A3->im;
    s7 *= *WI;
    s5 -= s6;
    s6 = s1;
    s3 *= *WR;
    s1 += s5;   /* p.re = u.re + v.re */
    s5 -= s6;   /* q.im = v.re - u.re */
    s2 = A0->re;
    s7 += s3;
    s8 = s4;
    s2 -= s1;
    s4 += s7;   /* p.im = u.im + v.im */
    s8 -= s7;   /* q.re = u.im - v.im */
    A2->re = s2;
    s3 = A0->im;
    s1 += A0->re;
    s6 = A1->re;
    s3 -= s4;
    s4 += A0->im;
    A0->re = s1;
    s6 -= s8;
    s8 += A1->re;
    s2 = A1->im;
    A2->im = s3;
    s2 -= s5;
    A3->re = s6;
    s5 += A1->im;
    A3->im = s2;
    A0->im = s4;
    A1->re = s8;
    A1->im = s5;

    ++a;
    ++b;
    ++w;
    j -= 4;

  } while (j);
}

/*
 * fft_2: in-place 2-point butterfly.
 *
 *   a[0] <- a[0] + a[1]
 *   a[1] <- a[0] - a[1]      (using the old a[0])
 *
 * Both elements are read in full before anything is written back.
 */
void inline fft_2(register fftc *a)
{
  fftr re0, im0, re1, im1;
  fftr sum_re, sum_im, dif_re, dif_im;

  re0 = a[0].re;
  im0 = a[0].im;
  re1 = a[1].re;
  im1 = a[1].im;

  sum_re = re0 + re1;
  dif_re = re0 - re1;
  sum_im = im0 + im1;
  dif_im = im0 - im1;

  a[0].re = sum_re;
  a[0].im = sum_im;
  a[1].re = dif_re;
  a[1].im = dif_im;
}

/*
 * fft_4: in-place 4-point butterfly on a[0..3].
 *
 *   s02 = a[0] + a[2]      s13 = a[1] + a[3]
 *   d02 = a[0] - a[2]      d13 = a[1] - a[3]
 *
 *   a[0] <- s02 + s13
 *   a[1] <- s02 - s13
 *   a[2] <- d02 + i*d13
 *   a[3] <- d02 - i*d13
 *
 * All four inputs are read before any output is written.  Each intermediate
 * is a single add or subtract assigned to its own fftr variable.
 */
void inline fft_4(register fftc *a)
{
  fftr re0, im0, re1, im1, re2, im2, re3, im3;
  fftr s02_re, s02_im, s13_re, s13_im;
  fftr d02_re, d02_im, d13_re, d13_im;
  fftr out0_re, out0_im, out1_re, out1_im;
  fftr out2_re, out2_im, out3_re, out3_im;

  re0 = a[0].re;
  im0 = a[0].im;
  re1 = a[1].re;
  im1 = a[1].im;
  re2 = a[2].re;
  im2 = a[2].im;
  re3 = a[3].re;
  im3 = a[3].im;

  /* First stage: sums and differences of the (0,2) and (1,3) pairs. */
  s02_re = re0 + re2;
  s02_im = im0 + im2;
  s13_re = re1 + re3;
  s13_im = im1 + im3;
  d02_re = re0 - re2;
  d02_im = im0 - im2;
  d13_re = re1 - re3;
  d13_im = im1 - im3;

  /* Second stage. */
  out0_re = s02_re + s13_re;
  out0_im = s02_im + s13_im;
  out1_re = s02_re - s13_re;
  out1_im = s02_im - s13_im;
  /* i*d13 has real part -d13_im and imaginary part d13_re. */
  out2_re = d02_re - d13_im;
  out2_im = d02_im + d13_re;
  out3_re = d02_re + d13_im;
  out3_im = d02_im - d13_re;

  a[0].re = out0_re;
  a[0].im = out0_im;
  a[1].re = out1_re;
  a[1].im = out1_im;
  a[2].re = out2_re;
  a[2].im = out2_im;
  a[3].re = out3_re;
  a[3].im = out3_im;
}

/*
 * fft_un4: in-place 4-point butterfly on a[0..3], pairing (0,1) and (2,3).
 *
 *   s01 = a[0] + a[1]      d01 = a[0] - a[1]
 *   s23 = a[2] + a[3]      q   = -i * (a[2] - a[3])
 *
 *   a[0] <- s01 + s23
 *   a[2] <- s01 - s23
 *   a[1] <- d01 + q
 *   a[3] <- d01 - q
 *
 * All four inputs are read before any output is written.  Each intermediate
 * is a single add or subtract assigned to its own fftr variable.
 */
void inline fft_un4(register fftc *a)
{
  fftr re0, im0, re1, im1, re2, im2, re3, im3;
  fftr s01_re, s01_im, d01_re, d01_im;
  fftr s23_re, s23_im, q_re, q_im;
  fftr out0_re, out0_im, out1_re, out1_im;
  fftr out2_re, out2_im, out3_re, out3_im;

  re0 = a[0].re;
  im0 = a[0].im;
  re1 = a[1].re;
  im1 = a[1].im;
  re2 = a[2].re;
  im2 = a[2].im;
  re3 = a[3].re;
  im3 = a[3].im;

  /* First stage. */
  s01_re = re0 + re1;
  s01_im = im0 + im1;
  d01_re = re0 - re1;
  d01_im = im0 - im1;
  s23_re = re3 + re2;
  s23_im = im2 + im3;
  /* -i*(a[2]-a[3]) has real part im2-im3 and imaginary part re3-re2. */
  q_re = im2 - im3;
  q_im = re3 - re2;

  /* Second stage. */
  out0_re = s01_re + s23_re;
  out0_im = s01_im + s23_im;
  out2_re = s01_re - s23_re;
  out2_im = s01_im - s23_im;
  out1_re = d01_re + q_re;
  out1_im = d01_im + q_im;
  out3_re = d01_re - q_re;
  out3_im = d01_im - q_im;

  a[0].re = out0_re;
  a[0].im = out0_im;
  a[1].re = out1_re;
  a[1].im = out1_im;
  a[2].re = out2_re;
  a[2].im = out2_im;
  a[3].re = out3_re;
  a[3].im = out3_im;
}
