#include "zmult.h"

/*
 * Shorthand for declaring scratch doubles: `reg a, b;` expands to
 * `register double a, b;`.
 * NOTE(review): the XXX marker is the original author's; what it flags is
 * not recorded in this file.
 */
#define reg register double /* XXX */

/*
 * Z[y], degree < 2, coefficient bound 2^26.
 *
 * out[0..2] = (u[0] + u[1] y) * (v[0] + v[1] y).
 *
 * The bound is presumably on the absolute value of the input coefficients,
 * chosen so that every sum of products stays an exactly representable
 * integer in a double -- TODO confirm against the callers.
 *
 * tmp is never accessed; it exists so that all multipliers in this file
 * share one four-argument signature.
 */
void zmult_poly_2(double out[3],double u[2],double v[2],double tmp[0])
{
  double cross_01;
  double cross_10;

  (void) tmp;

  /* Both cross terms are read before anything is stored to out. */
  cross_01 = u[0] * v[1];
  cross_10 = u[1] * v[0];
  out[1] = cross_01 + cross_10;
  out[0] = u[0] * v[0];
  out[2] = u[1] * v[1];
}

/*
 * Z[y], degree < 4, coefficient bound 2^25.
 *
 * out[k] = sum of u[i] * v[k - i] for 0 <= k <= 6, i.e. the 7 coefficients
 * of the product of two 4-coefficient polynomials.
 *
 * The bound is presumably on the absolute value of the input coefficients,
 * chosen so that every sum of products stays an exactly representable
 * integer in a double -- TODO confirm against the callers.
 *
 * tmp is never accessed; it exists so that all multipliers in this file
 * share one four-argument signature.
 */
void zmult_poly_4(double out[7],double u[4],double v[4],double tmp[0])
{
  int k; /* index of the output coefficient being formed */

  (void) tmp;

  for (k = 0;k < 7;++k) {
    /* u[i] * v[k - i] exists only while both i and k - i lie in 0..3 */
    int lo = (k > 3) ? k - 3 : 0;
    int hi = (k < 3) ? k : 3;
    int i;
    double sum;
    double term;

    /* Start from the first product and add the rest in increasing i. */
    sum = u[lo];
    sum *= v[k - lo];
    for (i = lo + 1;i <= hi;++i) {
      term = u[i];
      term *= v[k - i];
      sum += term;
    }
    out[k] = sum;
  }
}

/*
 * Z[y], degree < 8, coefficient bound 2^25.
 *
 * out[k] = sum of u[i] * v[k - i] for 0 <= k <= 14, i.e. the 15
 * coefficients of the product of two 8-coefficient polynomials.
 *
 * The bound is presumably on the absolute value of the input coefficients,
 * chosen so that every sum of products stays an exactly representable
 * integer in a double -- TODO confirm against the callers.
 *
 * tmp is never accessed; it exists so that all multipliers in this file
 * share one four-argument signature.
 */
void zmult_poly_8(double out[15],double u[8],double v[8],double tmp[0])
{
  int k; /* index of the output coefficient being formed */

  (void) tmp;

  for (k = 0;k < 15;++k) {
    /* u[i] * v[k - i] exists only while both i and k - i lie in 0..7 */
    int lo = (k > 7) ? k - 7 : 0;
    int hi = (k < 7) ? k : 7;
    int i;
    double sum;
    double term;

    /* Start from the first product and add the rest in increasing i. */
    sum = u[lo];
    sum *= v[k - lo];
    for (i = lo + 1;i <= hi;++i) {
      term = u[i];
      term *= v[k - i];
      sum += term;
    }
    out[k] = sum;
  }
}

/*
 * Z[y], degree < 16, coefficient bound 2^24.
 *
 * out[k] = sum of u[i] * v[k - i] for 0 <= k <= 30, i.e. the 31
 * coefficients of the product of two 16-coefficient polynomials.
 *
 * The bound is presumably on the absolute value of the input coefficients,
 * chosen so that every sum of products stays an exactly representable
 * integer in a double -- TODO confirm against the callers.
 *
 * tmp is never accessed, so callers may pass any pointer there (including
 * out itself); it exists so that all multipliers in this file share one
 * four-argument signature.
 */
void zmult_poly_16(double out[31],double u[16],double v[16],double tmp[0])
{
  int k; /* index of the output coefficient being formed */

  (void) tmp;

  for (k = 0;k < 31;++k) {
    /* u[i] * v[k - i] exists only while both i and k - i lie in 0..15 */
    int lo = (k > 15) ? k - 15 : 0;
    int hi = (k < 15) ? k : 15;
    int i;
    double sum;
    double term;

    /* Start from the first product and add the rest in increasing i. */
    sum = u[lo];
    sum *= v[k - lo];
    for (i = lo + 1;i <= hi;++i) {
      term = u[i];
      term *= v[k - i];
      sum += term;
    }
    out[k] = sum;
  }
}

/*
 * Coefficient-wise sum: out[i] = u[i] + v[i] for 0 <= i < n.
 *
 * Entries are handled in increasing index order, each pair being read
 * immediately before its result is stored, so out may be the same array
 * as u (the Karatsuba code in this file calls it that way).
 * Nothing is touched when n is 0.
 */
void zmult_poly_add(double out[],double u[],double v[],unsigned int n)
{
  unsigned int i;

  for (i = 0;i < n;++i) {
    out[i] = u[i] + v[i];
  }
}

/*
 * Coefficient-wise difference: out[i] = u[i] - v[i] for 0 <= i < n.
 *
 * Entries are handled in increasing index order, each pair being read
 * immediately before its result is stored, so out may be the same array
 * as u (the Karatsuba code in this file calls it that way).
 * Nothing is touched when n is 0.
 */
void zmult_poly_sub(double out[],double u[],double v[],unsigned int n)
{
  unsigned int i;

  for (i = 0;i < n;++i) {
    out[i] = u[i] - v[i];
  }
}

/*
 * KARATSUBA(using,half) expands to the complete body of a function whose
 * parameters are named out, u, v and tmp. It multiplies two polynomials of
 * degree < 2*half using three calls to `using`, a multiplier for degree
 * < half that takes the same four arguments (out,u,v,tmp).
 *
 * Write u = u0 + u1 x^half and v = v0 + v1 x^half, and let
 * P0 = u0 v0, P1 = u1 v1, Pm = (u0 - u1)(v0 - v1). Then
 *
 *   u v = (P0 + P1 x^half)(1 + x^half) - Pm x^half
 *
 * Steps, in order (h = half):
 *   1. out[0 .. 2h-2] = P0
 *   2. tmp[0 .. 2h-2] = P1
 *   3. out[h .. 2h-2] += P1[0 .. h-2]
 *   4. for j = h-1 .. 2h-2: out[j + 2h] = P1[j] and
 *      out[j + h] = P1[j] + out[j]   (two values of j per loop pass)
 *   5. out[h .. 2h-2] += P0[0 .. h-2]   (out[0 .. h-2] still holds P0)
 *   6. tmp[0 .. h-1] = u0 - u1, tmp[h .. 2h-1] = v0 - v1
 *   7. tmp[2h .. 4h-2] = Pm
 *   8. out[h .. 3h-2] -= Pm[0 .. 2h-2]
 *
 * The three `using` calls receive scratch space starting at tmp,
 * tmp + (2h-1) and tmp + (4h-1) respectively, so tmp must provide 4h-1
 * entries plus whatever `using` needs beyond tmp + (4h-1).
 *
 * half must be even, because step 4 consumes coefficients in pairs.
 * The expansion declares the locals i, t0 and t1.
 */
#define KARATSUBA(using,half) \
  int i; \
  double t0, t1; \
  using(out,u,v,tmp); \
  using(tmp,u + (half),v + (half),tmp + (2 * (half) - 1)); \
  zmult_poly_add(out + (half),out + (half),tmp,(half) - 1); \
  for (i = (2 * (half) - 1);i < (3 * (half) - 1);i += 2) { \
    t0 = tmp[i - (half)]; out[i + (half)] = t0; \
    t1 = tmp[i - (half) + 1]; out[i + (half) + 1] = t1; \
    t0 += out[i - (half)]; out[i] = t0; \
    t1 += out[i - (half) + 1]; out[i + 1] = t1; \
  } \
  zmult_poly_add(out + (half),out + (half),out,(half) - 1); \
  zmult_poly_sub(tmp,u,u + (half),(half)); \
  zmult_poly_sub(tmp + (half),v,v + (half),(half)); \
  using(tmp + (2 * (half)),tmp,tmp + (half),tmp + (4 * (half) - 1)); \
  zmult_poly_sub(out + (half),out + (half),tmp + (2 * (half)),(half)); \
  zmult_poly_sub(out + (2 * (half)),out + (2 * (half)),tmp + (3 * (half)),(half) - 1);

/*
 * Z[y], degree < 32, coefficient bound 2^23.
 *
 * out[0..62] receives the product of u[0..31] and v[0..31], built by one
 * KARATSUBA step from three zmult_poly_16 products.
 *
 * tmp is scratch space; the step overwrites tmp[0..62]. The scratch
 * pointers handed on to zmult_poly_16 are never dereferenced by it.
 */
void zmult_poly_32(double out[63],double u[32],double v[32],double tmp[63])
{
  KARATSUBA(zmult_poly_16,16)
}

/*
 * Z[y], degree < 64, coefficient bound 2^22.
 *
 * out[0..126] receives the product of u[0..63] and v[0..63], built by one
 * KARATSUBA step from three zmult_poly_32 products.
 *
 * tmp is scratch space. This step keeps its own intermediates in
 * tmp[0..126]; the zmult_poly_32 calls are given scratch at tmp, tmp + 63
 * and tmp + 127, and the last of these needs 63 entries: 127 + 63 = 190.
 */
void zmult_poly_64(double out[127],double u[64],double v[64],double tmp[190])
{
  KARATSUBA(zmult_poly_32,32)
}

/*
 * Z[y], degree < 128, coefficient bound 2^21.
 *
 * out[0..254] receives the product of u[0..127] and v[0..127], built by
 * one KARATSUBA step from three zmult_poly_64 products.
 *
 * tmp is scratch space. This step keeps its own intermediates in
 * tmp[0..254]; the zmult_poly_64 calls are given scratch at tmp, tmp + 127
 * and tmp + 255, and the last of these needs 190 entries: 255 + 190 = 445.
 */
void zmult_poly_128(double out[255],double u[128],double v[128],double tmp[445])
{
  KARATSUBA(zmult_poly_64,64)
}

/*
 * Z[y], degree < 256, coefficient bound 2^20.
 *
 * out[0..510] receives the product of u[0..255] and v[0..255], built by
 * one KARATSUBA step from three zmult_poly_128 products.
 *
 * tmp is scratch space. This step keeps its own intermediates in
 * tmp[0..510]; the zmult_poly_128 calls are given scratch at tmp,
 * tmp + 255 and tmp + 511, and the last of these needs 445 entries:
 * 511 + 445 = 956.
 */
void zmult_poly_256(double out[511],double u[256],double v[256],double tmp[956])
{
  KARATSUBA(zmult_poly_128,128)
}

/*
 * Z[y], degree < 512, coefficient bound 2^19.
 *
 * out[0..1022] receives the product of u[0..511] and v[0..511], built by
 * one KARATSUBA step from three zmult_poly_256 products.
 *
 * tmp is scratch space. This step keeps its own intermediates in
 * tmp[0..1022]; the zmult_poly_256 calls are given scratch at tmp,
 * tmp + 511 and tmp + 1023, and the last of these needs 956 entries:
 * 1023 + 956 = 1979.
 */
void zmult_poly_512(double out[1023],double u[512],double v[512],double tmp[1979])
{
  KARATSUBA(zmult_poly_256,256)
}

/*
 * Z[y]/(y^16+1), coefficient bound 2^24.
 *
 * out[0..15] = u * v reduced modulo y^16 + 1.
 *
 * The full 31-coefficient product is formed in tmp[0..30] first. Since
 * y^16 = -1 in the quotient ring, product coefficient 16 + k is then
 * subtracted from coefficient k; coefficient 15 has no partner and is
 * copied unchanged.
 */
void zmult_poly_16_plus(double out[16],double u[16],double v[16],double tmp[31])
{
  int k;

  /* zmult_poly_16 takes a scratch argument too; tmp is passed again for it. */
  zmult_poly_16(tmp,u,v,tmp);

  for (k = 0;k < 15;++k) {
    out[k] = tmp[k] - tmp[k + 16];
  }
  out[15] = tmp[15];
}

/*
 * Z[y]/(y^512+1), coefficient bound 2^19.
 *
 * out[0..511] = u * v reduced modulo y^512 + 1.
 *
 * tmp[0..1022] receives the full 1023-coefficient product; the remaining
 * 1979 entries starting at tmp + 1023 are the scratch space handed to
 * zmult_poly_512 (1023 + 1979 = 3002). Since y^512 = -1 in the quotient
 * ring, product coefficient 512 + k is subtracted from coefficient k;
 * coefficient 511 has no partner and is copied unchanged.
 */
void zmult_poly_512_plus(double out[512],double u[512],double v[512],double tmp[3002])
{
  double *product = tmp;
  double *scratch = tmp + 1023;
  int k;

  zmult_poly_512(product,u,v,scratch);

  for (k = 0;k < 511;++k) {
    out[k] = product[k] - product[k + 512];
  }
  out[511] = product[511];
}
