#include "zmult.h"

/*
 * Declaration prefix for the scratch temporaries (t0..t7) that each
 * transform function below declares; the butterfly_* macros assign to
 * those names directly instead of declaring their own.
 */
#define reg register double /* XXX: long double */

/*
 * (u,v) <- (u + v,u - v)
 *
 * Limb-wise sum and difference of two 4-double operands, in place.
 * u and v are pointer expressions; n[u] is the commutative spelling of
 * (u)[n], so an argument such as `u + 4` stays grouped without extra
 * parentheses.  The operands are expected not to overlap.
 *
 * Scratch: assigns t0, t1 and t2, which must be declared in the calling
 * scope.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct as the body of an unbraced if/for.
 * Invoke it with a trailing semicolon.
 */
#define butterfly_0(u,v) \
  do { \
    t0 = 0[u]; t2 = 0[v]; t1 = t0; \
    t1 -= t2; 0[v] = t1; \
    t0 += t2; 0[u] = t0; \
    t0 = 1[u]; t2 = 1[v]; t1 = t0; \
    t1 -= t2; 1[v] = t1; \
    t0 += t2; 1[u] = t0; \
    t0 = 2[u]; t2 = 2[v]; t1 = t0; \
    t1 -= t2; 2[v] = t1; \
    t0 += t2; 2[u] = t0; \
    t0 = 3[u]; t2 = 3[v]; t1 = t0; \
    t1 -= t2; 3[v] = t1; \
    t0 += t2; 3[u] = t0; \
  } while (0)

/*
 * (u,v) <- (u + 2^48 v,u - 2^48 v)
 *
 * In place, on two 4-double operands.  The index pattern pairs limb i of
 * u with limb i-1 of v, and limb 0 of u with limb 3 of v under the
 * opposite sign; this is consistent with limbs weighted 2^48 apart and
 * 2^192 = -1, as in the ring Z/(2^192+1) named on the functions below.
 * Resulting limbs:
 *   u = (u0 - v3, u1 + v0, u2 + v1, u3 + v2)
 *   v = (u0 + v3, u1 - v0, u2 - v1, u3 - v2)
 *
 * The operands are expected not to overlap: loads and stores of u and v
 * are interleaved.
 *
 * Scratch: assigns t0, t1, t2, t3, t4 and t7, which must be declared in
 * the calling scope.
 *
 * Wrapped in do { } while (0) so the macro is a single statement; invoke
 * it with a trailing semicolon.
 */
#define butterfly_48(u,v) \
  do { \
    t0 = 0[u]; \
    t1 = 3[v]; \
    t7 = t0; t0 += t1; t7 -= t1; 0[u] = t7; \
    t1 = 1[u]; \
    t2 = 0[v]; \
    t7 = t1; \
    t1 -= t2; \
    0[v] = t0; \
    t7 += t2; 1[u] = t7; \
    t3 = 1[v]; \
    t2 = 2[u]; \
    t7 = t2; t2 -= t3; \
    1[v] = t1; \
    t7 += t3; 2[u] = t7; \
    t4 = 2[v]; \
    t3 = 3[u]; \
    t7 = t3; t3 -= t4; \
    3[v] = t3; \
    2[v] = t2; \
    t7 += t4; 3[u] = t7; \
  } while (0)

/*
 * (u,v) <- (u + 2^96 v,u - 2^96 v)
 *
 * In place, on two 4-double operands.  The index pattern pairs limb i of
 * u with limb i-2 of v, the two wrapped limbs taking the opposite sign;
 * this is consistent with limbs weighted 2^48 apart and 2^192 = -1, as in
 * the ring Z/(2^192+1) named on the functions below.  Resulting limbs:
 *   u = (u0 - v2, u1 - v3, u2 + v0, u3 + v1)
 *   v = (u0 + v2, u1 + v3, u2 - v0, u3 - v1)
 *
 * The operands are expected not to overlap: loads and stores of u and v
 * are interleaved.
 *
 * Scratch: assigns t0, t1, t2, t3 and t7, which must be declared in the
 * calling scope.
 *
 * Wrapped in do { } while (0) so the macro is a single statement; invoke
 * it with a trailing semicolon.
 */
#define butterfly_96(u,v) \
  do { \
    t0 = 0[u]; \
    t1 = 2[v]; \
    t7 = t0; \
    t0 += t1; \
    t7 -= t1; \
    0[u] = t7; \
    t2 = 2[u]; \
    t3 = 0[v]; \
    t7 = t2; \
    t2 -= t3; \
    0[v] = t0; \
    t7 += t3; \
    2[u] = t7; \
    2[v] = t2; \
    t0 = 1[u]; \
    t1 = 3[v]; \
    t7 = t0; \
    t0 += t1; \
    t7 -= t1; \
    1[u] = t7; \
    t2 = 3[u]; \
    t3 = 1[v]; \
    t7 = t2; \
    t2 -= t3; \
    1[v] = t0; \
    t7 += t3; \
    3[u] = t7; \
    3[v] = t2; \
  } while (0)

/*
 * (u,v) <- (u + 2^144 v,u - 2^144 v)
 *
 * In place, on two 4-double operands.  The index pattern pairs limb i of
 * u with limb i-3 of v, the three wrapped limbs taking the opposite sign;
 * this is consistent with limbs weighted 2^48 apart and 2^192 = -1, as in
 * the ring Z/(2^192+1) named on the functions below.  Resulting limbs:
 *   u = (u0 - v1, u1 - v2, u2 - v3, u3 + v0)
 *   v = (u0 + v1, u1 + v2, u2 + v3, u3 - v0)
 *
 * The operands are expected not to overlap: loads and stores of u and v
 * are interleaved.
 *
 * Scratch: assigns t0, t1, t2, t3, t4 and t7, which must be declared in
 * the calling scope.
 *
 * Wrapped in do { } while (0) so the macro is a single statement; invoke
 * it with a trailing semicolon.
 */
#define butterfly_144(u,v) \
  do { \
    t0 = 0[u]; t1 = 1[v]; \
    t7 = t0; t0 += t1; t7 -= t1; 0[u] = t7; \
    t1 = 3[u]; t2 = 0[v]; \
    t7 = t1; t1 -= t2; \
    t7 += t2; 3[u] = t7; \
    0[v] = t0; \
    t2 = 2[u]; t3 = 3[v]; \
    t7 = t2; t2 += t3; \
    3[v] = t1; \
    t7 -= t3; 2[u] = t7; \
    t3 = 1[u]; t4 = 2[v]; \
    t7 = t3; t3 += t4; \
    2[v] = t2; \
    t7 -= t4; 1[u] = t7; \
    1[v] = t3; \
  } while (0)

/*
 * (Z/(2^192+1))[y]/(y^2-1)
 *
 * Size-2 transform, in place: u holds two 4-double elements back to back
 * and they are replaced by their sum and difference.
 */
static inline void zmult_4fft_2_0(double u[8])
{
  double *lo = u;     /* first element, doubles 0..3 */
  double *hi = u + 4; /* second element, doubles 4..7 */
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  butterfly_0(lo,hi);
}

/*
 * (Z/(2^192+1))[y]/(y^2-2^96)
 *
 * Size-2 transform, in place: the two 4-double elements in u are combined
 * with the 2^48 butterfly (2^48 squares to 2^96).
 */
static inline void zmult_4fft_2_96(double u[8])
{
  double *lo = u;     /* first element, doubles 0..3 */
  double *hi = u + 4; /* second element, doubles 4..7 */
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  butterfly_48(lo,hi);
}

/*
 * (Z/(2^192+1))[y]/(y^2-2^192)
 *
 * Size-2 transform, in place: the two 4-double elements in u are combined
 * with the 2^96 butterfly (2^96 squares to 2^192).
 */
static inline void zmult_4fft_2_192(double u[8])
{
  double *lo = u;     /* first element, doubles 0..3 */
  double *hi = u + 4; /* second element, doubles 4..7 */
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  butterfly_96(lo,hi);
}

/*
 * (Z/(2^192+1))[y]/(y^2-2^288)
 *
 * Size-2 transform, in place: the two 4-double elements in u are combined
 * with the 2^144 butterfly (2^144 squares to 2^288).
 */
static inline void zmult_4fft_2_288(double u[8])
{
  double *lo = u;     /* first element, doubles 0..3 */
  double *hi = u + 4; /* second element, doubles 4..7 */
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  butterfly_144(lo,hi);
}

/*
 * (Z/(2^192+1))[y]/(y^4-1)
 *
 * Size-4 transform, in place, on four 4-double elements.  The first layer
 * pairs element k with element k+2 using the plain butterfly; the two
 * halves are then handled as y^2-1 and y^2-2^192 (2^192 = -1 in this
 * ring, so y^4-1 = (y^2-1)(y^2-2^192)).
 */
static inline void zmult_4fft_4_0(double u[16])
{
  int k;
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  for (k = 0;k < 8;k += 4) {
    butterfly_0(u + k,u + 8 + k);
  }

  zmult_4fft_2_0(u);
  zmult_4fft_2_192(u + 8);
}

/*
 * (Z/(2^192+1))[y]/(y^4-2^192)
 *
 * Size-4 transform, in place, on four 4-double elements.  The first layer
 * pairs element k with element k+2 using the 2^96 butterfly; the two
 * halves are then handled as y^2-2^96 and y^2-2^288 (-2^96 = 2^288 in
 * this ring, so y^4-2^192 = (y^2-2^96)(y^2-2^288)).
 */
static inline void zmult_4fft_4_192(double u[16])
{
  int k;
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  for (k = 0;k < 8;k += 4) {
    butterfly_96(u + k,u + 8 + k);
  }

  zmult_4fft_2_96(u);
  zmult_4fft_2_288(u + 8);
}

/*
 * (Z/(2^192+1))[y]/(y^8-1)
 *
 * Size-8 transform, in place, on eight 4-double elements.  The first
 * layer pairs element k with element k+4 using the plain butterfly; the
 * two halves are then handled as y^4-1 and y^4-2^192.
 */
void zmult_4fft_8_0(double u[32])
{
  int k;
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  for (k = 0;k < 16;k += 4) {
    butterfly_0(u + k,u + 16 + k);
  }

  zmult_4fft_4_0(u);
  zmult_4fft_4_192(u + 16);
}

/*
 * (Z/(2^192+1))[y]/(y^8-2^96)
 *
 * Size-8 transform, in place, built from the y^8-1 transform: the input
 * is first passed through zmult_4fft_twist_12 and then through
 * zmult_4fft_8_0.
 * NOTE(review): presumably the twist scales coefficient i by 2^(12 i), so
 * that y^8-2^96 turns into z^8-1; confirm against zmult_4fft_twist_12.
 */
static inline void zmult_4fft_8_96(double u[32])
{
  double *coeffs = u;

  zmult_4fft_twist_12(coeffs); /* pre-scale the coefficients */
  zmult_4fft_8_0(coeffs);      /* then the plain size-8 transform */
}

/*
 * (Z/(2^192+1))[y]/(y^8-2^288)
 *
 * Size-8 transform, in place, built from the y^8-1 transform: the input
 * is first passed through zmult_4fft_untwist_12 and then through
 * zmult_4fft_8_0.
 * NOTE(review): presumably the untwist scales coefficient i by 2^(-12 i),
 * so that y^8-2^288 turns into z^8-1 (2^-96 = 2^288 in this ring);
 * confirm against zmult_4fft_untwist_12.
 *
 * No butterfly macro is expanded here, so no scratch temporaries are
 * declared (matching zmult_4fft_8_96).
 */
static inline void zmult_4fft_8_288(double u[32])
{
  zmult_4fft_untwist_12(u);
  zmult_4fft_8_0(u);
}

/*
 * (Z/(2^192+1))[y]/(y^16-2^192)
 *
 * Size-16 transform, in place, on sixteen 4-double elements.  The first
 * layer pairs element k with element k+8 using the 2^96 butterfly; the
 * two halves are then handled as y^8-2^96 and y^8-2^288.
 *
 * Defined without `inline`: this function has external linkage and calls
 * static functions.  Under C99/C11 rules a plain `inline` definition may
 * emit no external symbol and may not refer to internal-linkage
 * identifiers, so an ordinary external definition (like zmult_4fft_8_0)
 * is used instead.
 */
void zmult_4fft_16_192(double u[64])
{
  int i;
  reg t0, t1, t2, t3, t4, t5, t6, t7; /* scratch assigned by the macro */

  for (i = 0;i < 32;i += 4) {
    butterfly_96(u + i,u + 32 + i);
  }
  zmult_4fft_8_96(u);
  zmult_4fft_8_288(u + 32);
}
