/* XXX: write 8 from scratch */
/* XXX: maybe write 16 from scratch */
/* XXX: take advantage of *w == sqrt(1/2) (1+i) halfway through pass */

#include "fftc8.h"

/*
 * In-place butterfly pass over a[0...4n-1]; n >= 2.
 *
 * With b = a + 2n, for each k in 0...n-1:
 *   d = a[k] - b[k],   e = a[n+k] - b[n+k]
 *   a[k]   = a[k]   + b[k]
 *   a[n+k] = a[n+k] + b[n+k]
 *   b[k]   = (d + i e) * (c + i s)
 *   b[n+k] = (d - i e) * (c - i s)
 * For k = 0 the factor is 1 and no table entry is read.  For k >= 1,
 * c = w[n+k-1] and s = w[2n-k-1], counted from the w that was passed in,
 * so the table entries read are w[n] ... w[2n-2].
 *
 * n = 1 is not supported: the do/while below would run its body with
 * j == 0 and then step j past zero.
 */
void fftc8_pass(register fftc8 *a,register double *w,register int n)
{
  fftc8 *b = a + n + n;
  int j = n + n - 2;            /* w[j] is the s factor; shrinks by 2 as w advances */
  double pre, pim, qre, qim;    /* a[0], a[n] on entry to a step */
  double rre, rim, tre, tim;    /* b[0], b[n] on entry to a step */
  double dre, dim, ere, eim;    /* d = a[0] - b[0], e = a[n] - b[n] */

  /* k = 0: no multiplication needed. */
  pre = a[0].re;  pim = a[0].im;
  qre = a[n].re;  qim = a[n].im;
  rre = b[0].re;  rim = b[0].im;
  tre = b[n].re;  tim = b[n].im;

  a[0].re = pre + rre;
  a[0].im = pim + rim;
  a[n].re = qre + tre;
  a[n].im = qim + tim;

  dre = pre - rre;  dim = pim - rim;
  ere = qre - tre;  eim = qim - tim;

  b[0].re = dre - eim;          /* d + i e */
  b[0].im = dim + ere;
  b[n].re = dre + eim;          /* d - i e */
  b[n].im = dim - ere;

  ++a;
  ++b;

  /* k = 1 ... n-1 */
  do {
    double c, s;                /* the two table entries for this step */
    double xre, xim, yre, yim;  /* x = d + i e, y = d - i e */
    double xre_c, xre_s, xim_c, xim_s;
    double yre_c, yre_s, yim_c, yim_s;

    pre = a[0].re;  pim = a[0].im;
    qre = a[n].re;  qim = a[n].im;
    rre = b[0].re;  rim = b[0].im;
    tre = b[n].re;  tim = b[n].im;

    a[0].re = pre + rre;
    a[0].im = pim + rim;
    a[n].re = qre + tre;
    a[n].im = qim + tim;

    dre = pre - rre;  dim = pim - rim;
    ere = qre - tre;  eim = qim - tim;

    xre = dre - eim;  xim = dim + ere;
    yre = dre + eim;  yim = dim - ere;

    c = w[n];
    s = w[j];

    /* Every product gets its own statement; sums are formed afterwards. */
    xre_c = xre * c;  xre_s = xre * s;
    xim_c = xim * c;  xim_s = xim * s;
    yre_c = yre * c;  yre_s = yre * s;
    yim_c = yim * c;  yim_s = yim * s;

    b[0].re = xre_c - xim_s;    /* x * (c + i s) */
    b[0].im = xre_s + xim_c;
    b[n].re = yre_c + yim_s;    /* y * (c - i s) */
    b[n].im = yim_c - yre_s;

    ++a;
    ++b;
    ++w;
    j -= 2;
  } while (j);
}

/*
 * Replace the pair a[0], a[1] by its complex sum and difference:
 *   a[0] = a[0] + a[1],  a[1] = a[0] - a[1]   (old values on the right).
 *
 * NOTE(review): plain `inline` (no static/extern) only gives an inline
 * definition under C99 rules unless a non-inline declaration is in
 * scope -- confirm what fftc8.h declares.
 */
void inline fftc8_2(register fftc8 *a)
{
  double re0 = a[0].re;
  double im0 = a[0].im;
  double re1 = a[1].re;
  double im1 = a[1].im;

  a[0].re = re0 + re1;
  a[0].im = im0 + im1;
  a[1].re = re0 - re1;
  a[1].im = im0 - im1;
}

/*
 * Four-element in-place step on a[0...3].  With the old values on the
 * right-hand side:
 *   a[0] = (a[0] + a[2]) + (a[1] + a[3])
 *   a[1] = (a[0] + a[2]) - (a[1] + a[3])
 *   a[2] = (a[0] - a[2]) + i (a[1] - a[3])
 *   a[3] = (a[0] - a[2]) - i (a[1] - a[3])
 *
 * NOTE(review): plain `inline` (no static/extern) only gives an inline
 * definition under C99 rules unless a non-inline declaration is in
 * scope -- confirm what fftc8.h declares.
 */
void inline fftc8_4(register fftc8 *a)
{
  /* Read everything up front; all results depend on the old contents only. */
  double re0 = a[0].re, im0 = a[0].im;
  double re1 = a[1].re, im1 = a[1].im;
  double re2 = a[2].re, im2 = a[2].im;
  double re3 = a[3].re, im3 = a[3].im;

  double sum02re = re0 + re2, sum02im = im0 + im2;
  double sum13re = re1 + re3, sum13im = im1 + im3;
  double dif02re = re0 - re2, dif02im = im0 - im2;
  double dif13re = re1 - re3, dif13im = im1 - im3;

  a[0].re = sum02re + sum13re;
  a[0].im = sum02im + sum13im;
  a[1].re = sum02re - sum13re;
  a[1].im = sum02im - sum13im;

  /* Multiplying (re, im) by i gives (-im, re). */
  a[2].re = dif02re - dif13im;
  a[2].im = dif02im + dif13re;
  a[3].re = dif02re + dif13im;
  a[3].im = dif02im - dif13re;
}

/*
 * Cosine table shared by the FFTC8 instantiations below, which pass
 * roots - 1, roots - 4 and roots - 5 respectively.
 * Entries 0..2 are cos(k pi/8) for k = 1..3; entries 3..9 are
 * cos(k pi/16) for k = 1..7.
 */
static double roots[] = {
  /* [0] cos( pi/8)  */ 0.9238795325112867561281831893967882868224,
  /* [1] cos(2pi/8)  */ 0.7071067811865475244008443621048490392848,
  /* [2] cos(3pi/8)  */ 0.3826834323650897717284599840303988667613,

  /* [3] cos( pi/16) */ 0.9807852804032304491261822361342390369739,
  /* [4] cos(2pi/16) */ 0.9238795325112867561281831893967882868224,
  /* [5] cos(3pi/16) */ 0.8314696123025452370787883776179057567385,
  /* [6] cos(4pi/16) */ 0.7071067811865475244008443621048490392848,
  /* [7] cos(5pi/16) */ 0.5555702330196022247428308139485328743749,
  /* [8] cos(6pi/16) */ 0.3826834323650897717284599840303988667613,
  /* [9] cos(7pi/16) */ 0.1950903220161282678482848684770222409276
};

/*
 * Define fftc8_8, fftc8_16 and fftc8_32 through the FFTC8 macro, which
 * is not defined in this file (presumably it comes from fftc8.h).
 * Each use names the function being defined, two smaller transform
 * functions, a pointer derived from roots, and an integer.
 *
 * NOTE(review): presumably the last two arguments are forwarded to
 * fftc8_pass as w and n -- confirm against the macro.  If so, since
 * fftc8_pass reads w[n] ... w[2n-2], the entries actually used are
 *   roots - 1, n = 2  ->  roots[1]
 *   roots - 4, n = 4  ->  roots[0..2]
 *   roots - 5, n = 8  ->  roots[3..9]
 *
 * NOTE(review): roots - 1, roots - 4 and roots - 5 point before the
 * first element of roots; ISO C only defines pointer arithmetic that
 * stays within an array or one past its end, so these offsets rely on
 * the compiler tolerating the out-of-range intermediate pointer.
 */
FFTC8(inline fftc8_8,fftc8_4,fftc8_2,roots - 1,2)
FFTC8(fftc8_16,fftc8_8,fftc8_4,roots - 4,4)
FFTC8(fftc8_32,fftc8_16,fftc8_8,roots - 5,8)

