#include "zmult.h"

/*
 * Declaration prefix for the scratch temporaries (t0..t7) that the
 * butterfly macros in this file read and write; functions expand it as
 * "reg t0, t1, ...;". The XXX note is the original author's reminder
 * about possibly switching the type to long double.
 */
#define reg register double /* XXX: long double */

/*
 * (u,v) <- (u + v, u - v), element-wise over four doubles, in place.
 *
 * Requirements:
 *  - the enclosing scope must declare the scratch variables t0, t1, t2;
 *  - u and v are pointer expressions that are expanded several times,
 *    so they must be free of side effects.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by ';' is exactly one statement, which keeps it safe under an unbraced
 * if/else/for.
 */
#define butterfly_un0(u,v) do { \
  t0 = (u)[0]; t2 = (v)[0]; t1 = t0; \
  t1 -= t2; (v)[0] = t1; \
  t0 += t2; (u)[0] = t0; \
  t0 = (u)[1]; t2 = (v)[1]; t1 = t0; \
  t1 -= t2; (v)[1] = t1; \
  t0 += t2; (u)[1] = t0; \
  t0 = (u)[2]; t2 = (v)[2]; t1 = t0; \
  t1 -= t2; (v)[2] = t1; \
  t0 += t2; (u)[2] = t0; \
  t0 = (u)[3]; t2 = (v)[3]; t1 = t0; \
  t1 -= t2; (v)[3] = t1; \
  t0 += t2; (u)[3] = t0; \
} while (0)

/*
 * (u,v) <- (u + v, 2^(-48) (u - v)), in place on two 4-double elements.
 *
 * With d = u - v taken element-wise, the new v is
 *   (d[1], d[2], d[3], -d[0]),
 * i.e. d moved down by one position with the wrapped element negated.
 * NOTE(review): that equals division by 2^48 if the four doubles are
 * 48-bit limbs of a value modulo 2^192 + 1 -- confirm against zmult.h.
 *
 * Requirements:
 *  - the enclosing scope must declare the scratch variables t0..t7;
 *  - u and v are pointer expressions that are expanded several times,
 *    so they must be free of side effects.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is a
 * single statement.
 */
#define butterfly_un48(u,v) do { \
  t0 = (u)[0]; t4 = t0; \
  t1 = (u)[1]; t5 = t1; \
  t2 = (u)[2]; t6 = t2; \
  t3 = (u)[3]; t7 = t3; \
  t0 += (v)[0]; (u)[0] = t0; \
  t1 += (v)[1]; (u)[1] = t1; \
  t2 += (v)[2]; (u)[2] = t2; \
  t3 += (v)[3]; (u)[3] = t3; \
  t4 = (v)[0] - t4; \
  t5 -= (v)[1]; \
  (v)[0] = t5; \
  t6 -= (v)[2]; \
  (v)[1] = t6; \
  t7 -= (v)[3]; \
  (v)[2] = t7; \
  (v)[3] = t4; \
} while (0)

/*
 * (u,v) <- (u + v, 2^(-96) (u - v)), in place on two 4-double elements.
 *
 * With d = u - v taken element-wise, the new v is
 *   (d[2], d[3], -d[0], -d[1]),
 * i.e. d moved down by two positions with the wrapped elements negated.
 * NOTE(review): that equals division by 2^96 if the four doubles are
 * 48-bit limbs of a value modulo 2^192 + 1 -- confirm against zmult.h.
 *
 * Requirements:
 *  - the enclosing scope must declare the scratch variables t0..t7;
 *  - u and v are pointer expressions that are expanded several times,
 *    so they must be free of side effects.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is a
 * single statement.
 */
#define butterfly_un96(u,v) do { \
  t0 = (u)[0]; t4 = t0; \
  t1 = (u)[1]; t5 = t1; \
  t2 = (u)[2]; t6 = t2; \
  t3 = (u)[3]; t7 = t3; \
  t0 += (v)[0]; (u)[0] = t0; \
  t1 += (v)[1]; (u)[1] = t1; \
  t2 += (v)[2]; (u)[2] = t2; \
  t3 += (v)[3]; (u)[3] = t3; \
  t4 = (v)[0] - t4; \
  t6 -= (v)[2]; \
  (v)[0] = t6; \
  (v)[2] = t4; \
  t5 = (v)[1] - t5; \
  t7 -= (v)[3]; \
  (v)[1] = t7; \
  (v)[3] = t5; \
} while (0)

/*
 * (u,v) <- (u + v, 2^(-144) (u - v)), in place on two 4-double elements.
 *
 * With d = u - v taken element-wise, the new v is
 *   (d[3], -d[0], -d[1], -d[2]),
 * i.e. d moved down by three positions with the wrapped elements negated.
 * NOTE(review): that equals division by 2^144 if the four doubles are
 * 48-bit limbs of a value modulo 2^192 + 1 -- confirm against zmult.h.
 *
 * Requirements:
 *  - the enclosing scope must declare the scratch variables t0..t7;
 *  - u and v are pointer expressions that are expanded several times,
 *    so they must be free of side effects.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is a
 * single statement.
 */
#define butterfly_un144(u,v) do { \
  t0 = (u)[0]; t4 = t0; \
  t1 = (u)[1]; t5 = t1; \
  t2 = (u)[2]; t6 = t2; \
  t3 = (u)[3]; t7 = t3; \
  t0 += (v)[0]; (u)[0] = t0; \
  t1 += (v)[1]; (u)[1] = t1; \
  t2 += (v)[2]; (u)[2] = t2; \
  t3 += (v)[3]; (u)[3] = t3; \
  t4 = (v)[0] - t4; \
  t5 = (v)[1] - t5; \
  (v)[1] = t4; \
  t6 = (v)[2] - t6; \
  (v)[2] = t5; \
  t7 -= (v)[3]; \
  (v)[3] = t6; \
  (v)[0] = t7; \
} while (0)

/*
 * Applies butterfly_un0 in place to the pair (u[0..3], u[4..7]):
 * afterwards u[0..3] holds the element-wise sums and u[4..7] the
 * element-wise differences (first half minus second half).
 */
static inline void zmult_4fft_un2_0(double u[8])
{
  reg t0, t1, t2; /* the only scratch variables butterfly_un0 uses */
  butterfly_un0(u,u + 4);
}
/*
 * Applies butterfly_un48 in place to the pair (u[0..3], u[4..7]).
 */
static inline void zmult_4fft_un2_96(double u[8])
{
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch for butterfly_un48 */
  double *const upper = u + 4;

  butterfly_un48(u,upper);
}
/*
 * Applies butterfly_un96 in place to the pair (u[0..3], u[4..7]).
 */
static inline void zmult_4fft_un2_192(double u[8])
{
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch for butterfly_un96 */
  double *const upper = u + 4;

  butterfly_un96(u,upper);
}
/*
 * Applies butterfly_un144 in place to the pair (u[0..3], u[4..7]).
 */
static inline void zmult_4fft_un2_288(double u[8])
{
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch for butterfly_un144 */
  double *const upper = u + 4;

  butterfly_un144(u,upper);
}
/*
 * Processes four 4-double elements (u[0..15]) in place:
 * zmult_4fft_un2_0 on the first half, zmult_4fft_un2_192 on the second,
 * then butterfly_un0 across the halves, pairing element k with
 * element k + 2.
 */
static inline void zmult_4fft_un4_0(double u[16])
{
  reg t0, t1, t2; /* the only scratch variables butterfly_un0 uses */
  zmult_4fft_un2_0(u);
  zmult_4fft_un2_192(u + 8);
  butterfly_un0(u,u + 8);
  butterfly_un0(u + 4,u + 12);
}
/*
 * Processes four 4-double elements (u[0..15]) in place:
 * zmult_4fft_un2_96 on the first half, zmult_4fft_un2_288 on the second,
 * then butterfly_un96 across the halves, pairing element k with
 * element k + 2.
 */
static inline void zmult_4fft_un4_192(double u[16])
{
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch for butterfly_un96 */
  double *const lo = u;
  double *const hi = u + 8;

  zmult_4fft_un2_96(lo);
  zmult_4fft_un2_288(hi);
  butterfly_un96(lo,hi);
  butterfly_un96(lo + 4,hi + 4);
}
/*
 * Processes eight 4-double elements (u[0..31]) in place:
 * zmult_4fft_un4_0 on the first half, zmult_4fft_un4_192 on the second,
 * then butterfly_un0 across the halves, pairing element k with
 * element k + 4.
 */
void zmult_4fft_un8_0(double u[32])
{
  reg t0, t1, t2; /* the only scratch variables butterfly_un0 uses */
  zmult_4fft_un4_0(u);
  zmult_4fft_un4_192(u + 16);
  butterfly_un0(u,u + 16);
  butterfly_un0(u + 4,u + 20);
  butterfly_un0(u + 8,u + 24);
  butterfly_un0(u + 12,u + 28);
}

/*
 * Runs zmult_4fft_un8_0 on u[0..31] and then passes the result to
 * zmult_4fft_untwist_12, which is not defined in this file (presumably
 * declared in zmult.h -- confirm).
 *
 * No butterfly macro is expanded here, so no scratch temporaries are
 * declared.
 */
static inline void zmult_4fft_un8_96(double u[32])
{
  zmult_4fft_un8_0(u);
  zmult_4fft_untwist_12(u);
}

/*
 * Runs zmult_4fft_un8_0 on u[0..31] and then passes the result to
 * zmult_4fft_twist_12, which is not defined in this file (presumably
 * declared in zmult.h -- confirm).
 *
 * No butterfly macro is expanded here, so no scratch temporaries are
 * declared.
 */
static inline void zmult_4fft_un8_288(double u[32])
{
  zmult_4fft_un8_0(u);
  zmult_4fft_twist_12(u);
}

/*
 * Processes sixteen 4-double elements (u[0..63]) in place:
 * zmult_4fft_un8_96 on the first half, zmult_4fft_un8_288 on the second,
 * then butterfly_un96 across the halves, pairing element k with
 * element k + 8.
 */
void zmult_4fft_un16_192(double u[64])
{
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch for butterfly_un96 */
  double *const mid = u + 32;
  double *lo = u;
  double *hi = mid;

  zmult_4fft_un8_96(lo);
  zmult_4fft_un8_288(hi);

  /* Walk both halves in lock-step, one 4-double element per iteration. */
  while (lo != mid) {
    butterfly_un96(lo,hi);
    lo += 4;
    hi += 4;
  }
}
