#include "nistp224.h"
#include "floatasm.h"
#define FADD FLOATASM_ADD
#define FSUB FLOATASM_SUB
#define FMUL FLOATASM_MUL
#define FMADD FLOATASM_MADD
#define FMSUB FLOATASM_MSUB
#define FNMADD FLOATASM_NMADD
#define FNMSUB FLOATASM_NMSUB

/*
 * Limb weights, written out as exact decimal powers of two:
 *   T0 = 2^0,   T1 = 2^19,  T2 = 2^38,   T3 = 2^56,   T4 = 2^75,
 *   T5 = 2^94,  T6 = 2^112, T7 = 2^131,  T8 = 2^150,  T9 = 2^168,
 *   T10 = 2^187, T11 = 2^206, T12 = 2^224.
 * T0..T11 match the per-field multiples documented on struct `small`;
 * T12 is one step past the top limb.
 */
#define T0 1.0
#define T1 524288.0
#define T2 274877906944.0
#define T3 72057594037927936.0
#define T4 37778931862957161709568.0
#define T5 19807040628566084398385987584.0
#define T6 5192296858534827628530496329220096.0
#define T7 2722258935367507707706996859454145691648.0
#define T8 1427247692705959881058285969449495136382746624.0
#define T9 374144419156711147060143317175368453031918731001856.0
#define T10 196159429230833773869868419475239575503198607639501078528.0
#define T11 102844034832575377634685573909834406561420991602098741459288064.0
#define T12 26959946667150639794667015087019630673637144422540572481103610249216.0

/*
 * Read-only table of floating-point constants, addressed through the
 * macros defined alongside each entry:
 *   twom128   x[0]        2^-128
 *   twom224   x[1]        2^-224
 *   alpha[k]  x[2 + k]    3 * 2^51 * T_k, for k = 0..12
 *   beta[k]   x[15 + k]   (0.5 - 2^-33) * T_k, for k = 0..12
 * In c2d and d2c a value v is rounded to a multiple of T_k by computing
 * (alpha[k] + v) - alpha[k]; d2c subtracts beta[k] first.
 * NOTE(review): the rounding trick assumes round-to-nearest IEEE double
 * arithmetic without excess precision -- confirm against the build flags.
 * NOTE(review): `pad` is not referenced in this file; presumably it exists
 * for alignment/layout expected by the included opt-powerpc-*.c code --
 * confirm there.
 */
static const struct constants { double x[28]; long long pad; } constants = { {
#define twom128 constants.x[0]
  0.00000000000000000000000000000000000000293873587705571876992184134305561419454666389193021880377187926569604314863681793212890625
#define twom224 constants.x[1]
, 0.00000000000000000000000000000000000000000000000000000000000000000003709206150687421385731735261547639513367564778757791002453039058917581340095629358997312082723208437536338919136001159027049567384892725385725498199462890625
#define alpha (constants.x + 2)
, 6755399441055744.0 * T0
, 6755399441055744.0 * T1
, 6755399441055744.0 * T2
, 6755399441055744.0 * T3
, 6755399441055744.0 * T4
, 6755399441055744.0 * T5
, 6755399441055744.0 * T6
, 6755399441055744.0 * T7
, 6755399441055744.0 * T8
, 6755399441055744.0 * T9
, 6755399441055744.0 * T10
, 6755399441055744.0 * T11
, 6755399441055744.0 * T12
#define beta (constants.x + 15)
, 0.499999999883584678173065185546875 * T0
, 0.499999999883584678173065185546875 * T1
, 0.499999999883584678173065185546875 * T2
, 0.499999999883584678173065185546875 * T3
, 0.499999999883584678173065185546875 * T4
, 0.499999999883584678173065185546875 * T5
, 0.499999999883584678173065185546875 * T6
, 0.499999999883584678173065185546875 * T7
, 0.499999999883584678173065185546875 * T8
, 0.499999999883584678173065185546875 * T9
, 0.499999999883584678173065185546875 * T10
, 0.499999999883584678173065185546875 * T11
, 0.499999999883584678173065185546875 * T12
} } ;

/*
 * A field element stored as twelve double-precision limbs.  The value
 * represented is t0 + t1 + ... + t11, where each limb already carries its
 * own weight (it is a multiple of the power of two noted on its line).
 * Limbs may be negative (ecneg simply negates each one).
 * The field order is left untouched because the included opt-powerpc-*.c
 * bodies operate on this type.
 */
typedef struct {
  double t0; /* multiple of 2^0 */
  double t1; /* multiple of 2^19 */
  double t2; /* multiple of 2^38 */
  double t3; /* multiple of 2^56 */
  double t4; /* multiple of 2^75 */
  double t5; /* multiple of 2^94 */
  double t6; /* multiple of 2^112 */
  double t7; /* multiple of 2^131 */
  double t8; /* multiple of 2^150 */
  double t9; /* multiple of 2^168 */
  double t10; /* multiple of 2^187 */
  double t11; /* multiple of 2^206 */
} small;

/* Field element 0: every limb is zero.  Passed as the subtrahend by p_fg and p_f2. */
static const small zero = { 0 };
/* Field element 1: t0 = 1 and all other limbs zero.  Used as the z coordinate in ecunpack. */
static const small one = { 1 };

/*
 * Nine doubles of static scratch storage reached through the spillNN
 * macros.  None of the spillNN names is used in this file directly;
 * NOTE(review): presumably they are register-spill slots for the included
 * opt-powerpc-*.c bodies -- confirm there.
 * The two groups of names overlap: spill63/65/66/67/70/81 refer to the same
 * slots x[0]..x[5] as spill38/40/41/42/43/44.
 * NOTE(review): `pad` is unused here; presumably layout padding -- confirm.
 */
static struct { double x[9]; long long pad; } spills = { { 0 } };
#define spill38 spills.x[0]
#define spill40 spills.x[1]
#define spill41 spills.x[2]
#define spill42 spills.x[3]
#define spill43 spills.x[4]
#define spill44 spills.x[5]
#define spill45 spills.x[6]
#define spill51 spills.x[7]
#define spill53 spills.x[8]
#define spill63 spills.x[0]
#define spill65 spills.x[1]
#define spill66 spills.x[2]
#define spill67 spills.x[3]
#define spill70 spills.x[4]
#define spill81 spills.x[5]

/*
 * File-scope temporaries.  Each object is named <function>_<local> and is
 * used by exactly that function, which maps a short local name onto it
 * with #define.  Because this is static storage shared by every call, the
 * routines that use these objects are not reentrant.
 */

/* p_fg8h2 */
static small p_fg8h2_h8 = { 0 };
/* p_96127 */
static small p_96127_p6[1] = { { 0 } };
static small p_96127_p24[1] = { { 0 } };
static small p_96127_t1[1] = { { 0 } };
static small p_96127_t2[1] = { { 0 } };
static small p_96127_t3[1] = { { 0 } };
/* p_invert */
static small p_invert_p96[1] = { { 0 } };
static small p_invert_t1[1] = { { 0 } };
static small p_invert_t2[1] = { { 0 } };
/* ecdouble */
static small ecdouble_q[1] = { { 0 } };
static small ecdouble_r[1] = { { 0 } };
static small ecdouble_x1r3[1] = { { 0 } };
static small ecdouble_x1r[1] = { { 0 } };
static small ecdouble_a[1] = { { 0 } };
static small ecdouble_b[1] = { { 0 } };
static small ecdouble_b8[1] = { { 0 } };
static small ecdouble_qr[1] = { { 0 } };
static small ecdouble_b4x[1] = { { 0 } };
static small ecdouble_yz[1] = { { 0 } };
/* ecadd */
static small ecadd_a[1] = { { 0 } };
static small ecadd_b[1] = { { 0 } };
static small ecadd_c[1] = { { 0 } };
static small ecadd_d[1] = { { 0 } };
static small ecadd_e[1] = { { 0 } };
static small ecadd_f[1] = { { 0 } };
static small ecadd_g[1] = { { 0 } };
static small ecadd_h[1] = { { 0 } };
static small ecadd_f2g[1] = { { 0 } };
static small ecadd_gx[1] = { { 0 } };
/* ecpack */
static small ecpack_t1[1] = { { 0 } };
static small ecpack_t2[1] = { { 0 } };
static small ecpack_t3[1] = { { 0 } };
/* nistp224_56: 16-entry point table (x,y,z), matching (z^2,z^3) pairs, and accumulators */
static small n56_t[16][3] = { { { 0 } } };
static small n56_rs[16][2] = { { { 0 } } };
static small n56_q[3] = { { 0 } };
static small n56_q2[3] = { { 0 } };
static small n56_q3[3] = { { 0 } };
static small n56_qrs[2] = { { 0 } };
/* p_sqrt */
static small sqrt_u[16][1] = { { { 0 } } };
static small sqrt_t1[1] = { { 0 } };
static small sqrt_t2[1] = { { 0 } };
/* nistp224_uncompress */
static small unc_x[1] = { { 0 } };
static small unc_t1[1] = { { 0 } };
static small unc_t2[1] = { { 0 } };

/*
 * c2d: unpack a 28-byte string into the twelve limbs of *out.
 *
 * in[0] has weight 1, in[1] weight 256, and so on (least significant byte
 * first).  Bytes are accumulated into a running sum; whenever enough bits
 * for the next limb are present the sum is split with the alpha[] rounding
 * trick: the part that is a multiple of the next limb weight is carried
 * forward and the remainder is stored as the current limb.  The final
 * limb t11 receives whatever is left after byte 27.
 */
static void c2d(small *out,const unsigned char in[28])
{
  double scale;  /* weight of the next byte to be added */
  double acc;    /* running sum not yet assigned to a limb */
  double carry;  /* part of acc that belongs to higher limbs */

/* add byte n at the current weight, then advance the weight by 256 */
#define C2D_TAKE(n) do { acc += scale * in[n]; scale *= 256.0; } while (0)
/* store the low part of acc in `limb`, keep the multiple of T_n in acc */
#define C2D_SPLIT(n,limb) do { \
    carry = (alpha[n] + acc) - alpha[n]; \
    acc -= carry; \
    out->limb = acc; \
    acc = carry; \
  } while (0)

  acc = in[0];
  scale = 256.0;
  C2D_TAKE(1);  C2D_TAKE(2);                 C2D_SPLIT(1,t0);
  C2D_TAKE(3);  C2D_TAKE(4);  C2D_TAKE(5);   C2D_SPLIT(2,t1);
  C2D_TAKE(6);  C2D_TAKE(7);                 C2D_SPLIT(3,t2);
  C2D_TAKE(8);  C2D_TAKE(9);  C2D_TAKE(10);  C2D_SPLIT(4,t3);
  C2D_TAKE(11); C2D_TAKE(12);                C2D_SPLIT(5,t4);
  C2D_TAKE(13); C2D_TAKE(14);                C2D_SPLIT(6,t5);
  C2D_TAKE(15); C2D_TAKE(16); C2D_TAKE(17);  C2D_SPLIT(7,t6);
  C2D_TAKE(18); C2D_TAKE(19);                C2D_SPLIT(8,t7);
  C2D_TAKE(20); C2D_TAKE(21);                C2D_SPLIT(9,t8);
  C2D_TAKE(22); C2D_TAKE(23); C2D_TAKE(24);  C2D_SPLIT(10,t9);
  C2D_TAKE(25); C2D_TAKE(26);                C2D_SPLIT(11,t10);
  C2D_TAKE(27);
  out->t11 = acc;

#undef C2D_TAKE
#undef C2D_SPLIT
}

/*
 * d2c: convert the field element *in to 28 bytes, least significant byte
 * first.
 *
 * Step 1 runs a carry-style chain over the limbs of `in` (together with
 *   scaled copies of t7..t11 and of t11) and ends with q rounded to a
 *   multiple of 2^224 (alpha[12]).
 * Step 2 forms x = in - k * (2^224 - 2^96 + 1) with k = q / 2^224:
 *   t0 loses k, t5 gains k * 2^96, t11 loses k * 2^224.
 * Step 3 carries x.t0 .. x.t10 upward so each limb is left with only its
 *   own bit range.
 * Step 4 streams the limbs into bytes through the integer z.
 *
 * `x` is static storage, so the routine is not reentrant.  The order of
 * the floating-point statements is significant; do not rearrange them.
 */
static void d2c(unsigned char out[28],const small *in)
{
  static small x;
  double q;
  int z;

  /* Step 1: each "q -= beta[k]; q += alpha[k]; q -= alpha[k];" rounds q to a multiple of T_k. */
  q = 0.5;
  q -= twom224 * in->t11;
  q -= beta[0]; q += alpha[0]; q -= alpha[0];

  q += in->t0;
  q += twom128 * in->t7;
  q -= beta[1]; q += alpha[1]; q -= alpha[1];

  q += in->t1;
  q += twom128 * in->t8;
  q -= beta[2]; q += alpha[2]; q -= alpha[2];

  q += in->t2;
  q += twom128 * in->t9;
  q -= beta[3]; q += alpha[3]; q -= alpha[3];

  q += in->t3;
  q += twom128 * in->t10;
  q -= beta[4]; q += alpha[4]; q -= alpha[4];

  q += in->t4;
  q += twom128 * in->t11;
  q -= beta[5]; q += alpha[5]; q -= alpha[5];

  q += in->t5;
  q -= beta[6]; q += alpha[6]; q -= alpha[6];

  q += in->t6;
  q -= beta[7]; q += alpha[7]; q -= alpha[7];

  q += in->t7;
  q -= beta[8]; q += alpha[8]; q -= alpha[8];

  q += in->t8;
  q -= beta[9]; q += alpha[9]; q -= alpha[9];

  q += in->t9;
  q -= beta[10]; q += alpha[10]; q -= alpha[10];

  q += in->t10;
  q -= beta[11]; q += alpha[11]; q -= alpha[11];

  q += in->t11;
  q -= beta[12]; q += alpha[12]; q -= alpha[12];

  /* Step 2: q is now a multiple of 2^224; subtract (q / 2^224) * (2^224 - 2^96 + 1). */
  x.t0 = in->t0 - q * twom224;
  x.t1 = in->t1;
  x.t2 = in->t2;
  x.t3 = in->t3;
  x.t4 = in->t4;
  x.t5 = in->t5 + q * twom128;
  x.t6 = in->t6;
  x.t7 = in->t7;
  x.t8 = in->t8;
  x.t9 = in->t9;
  x.t10 = in->t10;
  x.t11 = in->t11 - q;

  /* Step 3: move the multiple-of-T_(k+1) part of each limb k into limb k+1. */
  q = x.t0; q -= beta[1]; q += alpha[1]; q -= alpha[1]; x.t1 += q; x.t0 -= q;
  q = x.t1; q -= beta[2]; q += alpha[2]; q -= alpha[2]; x.t2 += q; x.t1 -= q;
  q = x.t2; q -= beta[3]; q += alpha[3]; q -= alpha[3]; x.t3 += q; x.t2 -= q;
  q = x.t3; q -= beta[4]; q += alpha[4]; q -= alpha[4]; x.t4 += q; x.t3 -= q;
  q = x.t4; q -= beta[5]; q += alpha[5]; q -= alpha[5]; x.t5 += q; x.t4 -= q;
  q = x.t5; q -= beta[6]; q += alpha[6]; q -= alpha[6]; x.t6 += q; x.t5 -= q;
  q = x.t6; q -= beta[7]; q += alpha[7]; q -= alpha[7]; x.t7 += q; x.t6 -= q;
  q = x.t7; q -= beta[8]; q += alpha[8]; q -= alpha[8]; x.t8 += q; x.t7 -= q;
  q = x.t8; q -= beta[9]; q += alpha[9]; q -= alpha[9]; x.t9 += q; x.t8 -= q;
  q = x.t9; q -= beta[10]; q += alpha[10]; q -= alpha[10]; x.t10 += q; x.t9 -= q;
  q = x.t10; q -= beta[11]; q += alpha[11]; q -= alpha[11]; x.t11 += q; x.t10 -= q;

  /*
   * Step 4: z holds the bits not yet written; q tracks 256^-(bytes written)
   * (0.00390625 = 1/256) so that the next limb lands at the right bit
   * offset inside z when it is added.
   */
  z = x.t0;
  q = 1;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t1 * q; /* bits 19...37 - 16 = 3...21 */
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t2 * q; /* bits 38...55 - 32 = 6...23 */
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t3 * q; /* bits 56...74 - 48 = 8...26 */
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t4 * q; /* bits 75...93 - 72 = 3...21 */
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t5 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t6 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t7 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t8 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t9 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t10 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  z += x.t11 * q;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
  *out++ = z; z >>= 8; q *= 0.00390625;
}

/*
 * p_fghi: field primitive whose body is supplied by the included file
 * opt-powerpc-fghi.c (not visible here).
 * NOTE(review): from its uses (p_fg8h2 passes h and 8*h; ecadd passes
 * d, g - x, c, f) this presumably computes out = f*g - h*i in the field --
 * confirm against the included file, including any aliasing rules for out.
 */
static void p_fghi(small *out,const small *f,const small *g,const small *h,const small *i)
{
#include "opt-powerpc-fghi.c"
}

/*
 * p_fg8h2: scale every limb of h by 8 into the private scratch element
 * p_fg8h2_h8, then hand (f, g, h, 8*h) to p_fghi, which writes *out.
 */
static inline void p_fg8h2(small *out,const small *f,const small *g,const small *h)
{
  small *const scaled = &p_fg8h2_h8;

#define FG8H2_LIMBS(DO) \
  DO(t11) DO(t10) DO(t9) DO(t8) DO(t7) DO(t6) \
  DO(t5) DO(t4) DO(t3) DO(t2) DO(t1) DO(t0)
#define FG8H2_TIMES8(k) scaled->k = 8 * h->k;

  FG8H2_LIMBS(FG8H2_TIMES8)

#undef FG8H2_TIMES8
#undef FG8H2_LIMBS

  p_fghi(out,f,g,h,scaled);
}

/*
 * p_fgh: field primitive whose body is supplied by the included file
 * opt-powerpc-fgh.c (not visible here).
 * The comments in nistp224_valid describe p_fgh(t3,t1,x,t2) as producing
 * t1*x - t2, i.e. out = f*g - h; p_fg uses it with h = zero.
 * NOTE(review): reduction and output limb bounds are defined by the
 * included file -- confirm there.
 */
static void p_fgh(small *out,const small *f,const small *g,const small *h)
{
#include "opt-powerpc-fgh.c"
}

/* p_fg: p_fgh with the constant zero element as its last operand. */
static inline void p_fg(small *out,const small *f,const small *g)
{
  const small *const nothing = &zero;

  p_fgh(out,f,g,nothing);
}

/*
 * p_f2g: field primitive whose body is supplied by the included file
 * opt-powerpc-f2g.c (not visible here).
 * The comment in nistp224_valid describes p_f2g(t2,y,&b) as producing
 * y^2 - b, i.e. out = f^2 - g; p_f2 uses it with g = zero.
 * NOTE(review): reduction and output limb bounds are defined by the
 * included file -- confirm there.
 */
static void p_f2g(small *out,const small *f,const small *g)
{
#include "opt-powerpc-f2g.c"
}

/* p_f2: p_f2g with the constant zero element as its last operand. */
static inline void p_f2(small *out,const small *f)
{
  const small *const nothing = &zero;

  p_f2g(out,f,nothing);
}


/*
 * p_96127: raise `in` to two fixed exponents with one shared chain of
 * p_f2 (squaring) and p_fg (multiplication) calls:
 *   *out96  = in^(2^96 - 1)
 *   *out127 = in^(2^127 - 1)
 * The trailing comments give the exponent held by the destination after
 * each step.  *out96 is written before the last use of `in`, and the
 * static temporaries make the routine non-reentrant.
 */
static void p_96127(small *out96,small *out127,const small *in)
{
#define p6 p_96127_p6
#define p24 p_96127_p24
#define t1 p_96127_t1
#define t2 p_96127_t2
#define t3 p_96127_t3
  int i;

  p_f2(t1,in); /* 2 */
  p_fg(t2,t1,in); /* 2^2-1 */
  p_f2(t1,t2); /* 2^3-2 */
  p_fg(t2,t1,in); /* 2^3-1 */
  p_f2(t1,t2); /* 2^4-2^1 */
  p_f2(t3,t1); /* 2^5-2^2 */
  p_f2(t1,t3); /* 2^6-2^3 */
  p_fg(p6,t1,t2); /* 2^6-1 */
  p_f2(t1,p6); /* 2^7-2 */
  p_f2(t2,t1); /* 2^8-2^2 */
  p_f2(t1,t2); /* 2^9-2^3 */
  p_f2(t2,t1); /* 2^10-2^4 */
  p_f2(t1,t2); /* 2^11-2^5 */
  p_f2(t2,t1); /* 2^12-2^6 */
  p_fg(t1,t2,p6); /* 2^12-1 */
  p_f2(t2,t1); /* 2^13-2 */
  p_f2(t3,t2); /* 2^14-2^2 */
  for (i = 0;i < 5;++i) { p_f2(t2,t3); p_f2(t3,t2); } /* 10 squarings: 2^24-2^12 */
  p_fg(p24,t1,t3); /* 2^24-1 */
  p_f2(t1,p24); /* 2^25-2 */
  p_f2(t3,t1); /* 2^26-2^2 */
  for (i = 0;i < 11;++i) { p_f2(t1,t3); p_f2(t3,t1); } /* 22 squarings: 2^48-2^24 */
  p_fg(t1,p24,t3); /* 2^48-1 */
  p_f2(t2,t1); /* 2^49-2 */
  p_f2(t3,t2); /* 2^50-2^2 */
  for (i = 0;i < 23;++i) { p_f2(t2,t3); p_f2(t3,t2); } /* 46 squarings: 2^96-2^48 */
  p_fg(out96,t1,t3); /* 2^96-1 */
  p_f2(t1,out96); /* 2^97-2 */
  p_f2(t2,t1); /* 2^98-2^2 */
  for (i = 0;i < 11;++i) { p_f2(t1,t2); p_f2(t2,t1); } /* 22 squarings: 2^120-2^24 */
  p_fg(t1,p24,t2); /* 2^120-1 */
  for (i = 0;i < 3;++i) { p_f2(t2,t1); p_f2(t1,t2); } /* 6 squarings: 2^126-2^6 */
  p_fg(t2,p6,t1); /* 2^126-1 */
  p_f2(t1,t2); /* 2^127-2 */
  p_fg(out127,t1,in); /* 2^127-1 */
#undef p6
#undef p24
#undef t1
#undef t2
#undef t3
}

/*
 * p_invert: raise `in` to the power 2^224 - 2^96 - 1.
 *
 * p_96127 supplies in^(2^96 - 1) (kept in scratch) and in^(2^127 - 1)
 * (written to *out).  One squaring gives exponent 2^128 - 2, 96 further
 * squarings give 2^224 - 2^97, and the final multiplication by
 * in^(2^96 - 1) yields 2^224 - 2^97 + 2^96 - 1 = 2^224 - 2^96 - 1.
 */
static void p_invert(small *out,const small *in)
{
  small *const pow96 = p_invert_p96;
  small *const ping = p_invert_t1;
  small *const pong = p_invert_t2;
  int pairs;

  p_96127(pow96,out,in);   /* pow96: 2^96-1, out: 2^127-1 */
  p_f2(ping,out);          /* 2^128-2 */

  /* 48 pairs of squarings, alternating between the two temporaries */
  for (pairs = 48;pairs > 0;--pairs) {
    p_f2(pong,ping);
    p_f2(ping,pong);
  }

  p_fg(out,pow96,ping);    /* 2^224-2^96-1 */
}


/*
 * ecneg: negate a point given as three field elements (x, y, z).
 * The y element (index 1) is negated limb by limb; x and z are copied.
 */
static void ecneg(small out[3],const small in[3])
{
#define ECNEG_LIMBS(DO) \
  DO(t0) DO(t1) DO(t2) DO(t3) DO(t4) DO(t5) \
  DO(t6) DO(t7) DO(t8) DO(t9) DO(t10) DO(t11)
#define ECNEG_FLIP(k) out[1].k = -in[1].k;

  ECNEG_LIMBS(ECNEG_FLIP)

#undef ECNEG_FLIP
#undef ECNEG_LIMBS

  out[0] = in[0];
  out[2] = in[2];
}

/*
 * ecdouble: double the point in = (x, y, z), writing (x', y', z') to out.
 *
 * With yy = y^2, zz = z^2, m = 3(x - zz)(x + zz) and s = x*yy, and reading
 * p_f2g(o,f,g) as f^2 - g and p_fg8h2(o,f,g,h) as f*g - 8*h^2:
 *   x' = m^2 - 8s
 *   y' = m(4s - x') - 8 yy^2
 *   z' = (y + z)^2 - (yy + zz)
 * All temporaries are the static ecdouble_* objects.  `in` is not read
 * again once out[0] has been written.
 */
static void ecdouble(small out[3],const small in[3])
{
  const small *const px = in;
  const small *const py = in + 1;
  const small *const pz = in + 2;
  small *const yy = ecdouble_q;
  small *const zz = ecdouble_r;
  small *const xmz3 = ecdouble_x1r3;  /* 3 * (x - zz) */
  small *const xpz = ecdouble_x1r;    /* x + zz */
  small *const slope = ecdouble_a;    /* m */
  small *const xyy = ecdouble_b;      /* s */
  small *const xyy8 = ecdouble_b8;    /* 8s */
  small *const yyzz = ecdouble_qr;    /* yy + zz */
  small *const delta = ecdouble_b4x;  /* 4s - x' */
  small *const ypz = ecdouble_yz;     /* y + z */

#define ECD_LIMBS(DO) \
  DO(t0) DO(t1) DO(t2) DO(t3) DO(t4) DO(t5) \
  DO(t6) DO(t7) DO(t8) DO(t9) DO(t10) DO(t11)
#define ECD_X_TERMS(k) xmz3->k = 3 * (px->k - zz->k); xpz->k = px->k + zz->k;
#define ECD_YZ_TERMS(k) yyzz->k = yy->k + zz->k; ypz->k = py->k + pz->k;
#define ECD_TIMES8(k) xyy8->k = 8 * xyy->k;
#define ECD_DELTA(k) delta->k = 4 * xyy->k - out->k;

  p_f2(yy,py);
  p_f2(zz,pz);

  ECD_LIMBS(ECD_X_TERMS)
  ECD_LIMBS(ECD_YZ_TERMS)

  p_fg(slope,xmz3,xpz);
  p_fg(xyy,px,yy);

  ECD_LIMBS(ECD_TIMES8)
  p_f2g(out,slope,xyy8);

  ECD_LIMBS(ECD_DELTA)
  p_fg8h2(out + 1,slope,delta,yy);

  p_f2g(out + 2,ypz,yyzz);

#undef ECD_DELTA
#undef ECD_TIMES8
#undef ECD_YZ_TERMS
#undef ECD_X_TERMS
#undef ECD_LIMBS
}

/*
 * ecadd: add the points p1 and p2 (each x, y, z), writing the sum to out.
 * rs1 and rs2 hold the pair produced by ecrs for the corresponding point
 * (z^2 at index 0, z^3 at index 1).
 *
 * Reading p_fgh(o,f,g,h) as f*g - h, p_f2g(o,f,g) as f^2 - g and
 * p_fghi(o,f,g,h,i) as f*g - h*i, the steps are:
 *   a = x1*rs2[0]          b = x2*rs1[0] - a
 *   c = y1*rs2[1]          d = y2*rs1[1] - c
 *   e = b^2   f = b*e   g = a*e   h = z1*z2
 *   x3 = d^2 - (2g + f)
 *   z3 = b*h
 *   y3 = d*(g - x3) - c*f
 * All temporaries are the static ecadd_* objects.
 */
static void ecadd(small out[3],const small p1[3],const small rs1[2],const small p2[3],const small rs2[2])
{
  small *const va = ecadd_a;
  small *const vb = ecadd_b;
  small *const vc = ecadd_c;
  small *const vd = ecadd_d;
  small *const ve = ecadd_e;
  small *const vf = ecadd_f;
  small *const vg = ecadd_g;
  small *const vh = ecadd_h;
  small *const sum2gf = ecadd_f2g;  /* 2g + f */
  small *const gminusx = ecadd_gx;  /* g - x3 */

#define ECADD_LIMBS(DO) \
  DO(t0) DO(t1) DO(t2) DO(t3) DO(t4) DO(t5) \
  DO(t6) DO(t7) DO(t8) DO(t9) DO(t10) DO(t11)
#define ECADD_2GF(k) sum2gf->k = 2 * vg->k + vf->k;
#define ECADD_GX(k) gminusx->k = vg->k - out->k;

  p_fg(va,p1,rs2);
  p_fgh(vb,p2,rs1,va);
  p_fg(vc,p1 + 1,rs2 + 1);
  p_fgh(vd,p2 + 1,rs1 + 1,vc);
  p_f2(ve,vb);
  p_fg(vf,vb,ve);
  p_fg(vg,va,ve);
  p_fg(vh,p1 + 2,p2 + 2);

  ECADD_LIMBS(ECADD_2GF)
  p_f2g(out,vd,sum2gf);
  p_fg(out + 2,vb,vh);

  ECADD_LIMBS(ECADD_GX)
  p_fghi(out + 1,vd,gminusx,vc,vf);

#undef ECADD_GX
#undef ECADD_2GF
#undef ECADD_LIMBS
}

/*
 * ecpack: convert the point in = (x, y, z) to 56 bytes.
 * Computes z^-1 with p_invert, then x*z^-2 and y*z^-3, and writes them
 * with d2c to out[0..27] and out[28..55] respectively.
 */
static void ecpack(unsigned char out[56],const small in[3])
{
  small *const ua = ecpack_t1;
  small *const ub = ecpack_t2;
  small *const uc = ecpack_t3;

  p_invert(ua,in + 2);   /* ua = z^-1 */
  p_f2(ub,ua);           /* ub = z^-2 */
  p_fg(uc,ua,ub);        /* uc = z^-3 */
  p_fg(ua,ub,in);        /* ua = x z^-2 */
  p_fg(ub,uc,in + 1);    /* ub = y z^-3 */

  d2c(&out[0],ua);
  d2c(&out[28],ub);
}

/*
 * ecunpack: read x from in[0..27] and y from in[28..55] with c2d and set
 * the z element to the constant `one`.
 */
static void ecunpack(small out[3],unsigned char in[56])
{
  c2d(&out[0],&in[0]);
  c2d(&out[1],&in[28]);
  out[2] = one;
}


/*
 * The constant `b` used by nistp224_valid (y^2 - b) and
 * nistp224_uncompress (x^3 - 3x + b), split into twelve limbs: each hex
 * value is the limb's digit and T_k is its weight.
 * NOTE(review): presumably the NIST P-224 curve coefficient b -- confirm
 * the digits against the standard's published value.
 */
static const small b = { 
  0x5ffb4 * T0
, 0x0646a * T1
, 0x02ce5 * T2
, 0x0ba27 * T3
, 0x2f7fb * T4
, 0x2c2df * T5
, 0x65044 * T6
, 0x0264a * T7
, 0x2afd5 * T8
, 0x404b3 * T9
, 0x150a1 * T10
, 0x2d014 * T11
} ;

/*
 * nistp224_valid: test whether the 56-byte string `in` (28 bytes of x
 * followed by 28 bytes of y) satisfies the curve equation.
 *
 * Evaluates x^3 - 3x + b - y^2 and returns 1 when every limb of the
 * result is exactly zero (the sum of the squared limbs is zero), and 0
 * otherwise.
 * NOTE(review): this relies on p_fgh producing the all-zero limb pattern
 * whenever its result is zero in the field -- confirm against the
 * included multiplication code.
 * Uses static storage, so it is not reentrant.
 */
int nistp224_valid(unsigned char in[56])
{
  static small cx;
  static small cy;
  static small xx3;     /* x^2 - 3 */
  static small yyb;     /* y^2 - b */
  static small resid;   /* x^3 - 3x + b - y^2 */
  static double norm;

  c2d(&cx,in);
  c2d(&cy,in + 28);

  p_f2(&xx3,&cx);
  xx3.t0 -= 3;
  p_f2g(&yyb,&cy,&b);
  p_fgh(&resid,&xx3,&cx,&yyb);

  /* accumulate the squared limbs in limb order */
  norm = resid.t0 * resid.t0;
  norm += resid.t1 * resid.t1;
  norm += resid.t2 * resid.t2;
  norm += resid.t3 * resid.t3;
  norm += resid.t4 * resid.t4;
  norm += resid.t5 * resid.t5;
  norm += resid.t6 * resid.t6;
  norm += resid.t7 * resid.t7;
  norm += resid.t8 * resid.t8;
  norm += resid.t9 * resid.t9;
  norm += resid.t10 * resid.t10;
  norm += resid.t11 * resid.t11;

  return norm ? 0 : 1;
}

/*
 * ecrs: from the z element of `in` (index 2) compute out[0] = z^2 and
 * out[1] = z^3, the pair that ecadd takes as its rs arguments.
 */
static void ecrs(small out[2],small in[3])
{
  const small *const pz = &in[2];
  small *const zsq = &out[0];

  p_f2(zsq,pz);
  p_fg(&out[1],zsq,pz);
}

/*
 * nistp224_56: windowed scalar multiplication of a 56-byte point.
 *
 * in   28 bytes of x followed by 28 bytes of y; rejected unless
 *      nistp224_valid accepts it.
 * e    28 bytes, consumed from e[0] to e[27], high nibble before low
 *      nibble.
 * out  receives the packed result; set to 56 zero bytes on failure.
 *
 * Returns 1 on success, 0 if the input or the computed result fails
 * nistp224_valid.
 *
 * A table t[j] = (j - 8) * P is built for j = 0..15 except j = 8 (with P
 * the input point), along with rs[j] = (z^2, z^3) of each entry.  Negated
 * entries share the rs pair of their positive counterpart because ecneg
 * leaves z unchanged.  Each nibble j costs four doublings of the
 * accumulator followed by the addition of t[j]; nibble 8 means "add
 * nothing", so only the doublings are performed.
 * NOTE(review): every nibble is biased by 8 and the accumulator starts
 * from -(2 * t[0]) = 16 * P, so the multiplier applied is not simply the
 * integer spelled by e -- confirm the intended scalar encoding against
 * the library's documentation.
 *
 * Works on static storage (n56_*), so it is not reentrant.
 */
int nistp224_56(unsigned char out[56],unsigned char in[56],unsigned char e[28])
{
#define t n56_t
#define rs n56_rs
#define q n56_q
#define q2 n56_q2
#define q3 n56_q3
#define qrs n56_qrs
  int i;
  int j;

  if (!nistp224_valid(in)) {
    for (i = 0;i < 56;++i) out[i] = 0;
    return 0;
  }

  /* positive multiples: t[9] = P, t[10] = 2P, ..., t[15] = 7P */
  ecunpack(t[9],in);
  ecdouble(t[10],t[9]);
  ecrs(rs[9],t[9]);
  ecrs(rs[10],t[10]);
  ecadd(t[11],t[10],rs[10],t[9],rs[9]);
  ecdouble(t[12],t[10]);
  ecdouble(t[14],t[11]);
  ecrs(rs[12],t[12]);
  ecadd(t[13],t[12],rs[12],t[9],rs[9]);
  ecrs(rs[14],t[14]);
  ecadd(t[15],t[14],rs[14],t[9],rs[9]);
  /* negative multiples: t[7] = -P, ..., t[1] = -7P, t[0] = 2 * (-4P) = -8P */
  ecneg(t[7],t[9]);
  ecneg(t[6],t[10]);
  ecneg(t[5],t[11]);
  ecneg(t[4],t[12]);
  ecneg(t[3],t[13]);
  ecneg(t[2],t[14]);
  ecneg(t[1],t[15]);
  ecdouble(t[0],t[4]);

  ecrs(rs[11],t[11]);
  ecrs(rs[13],t[13]);
  ecrs(rs[15],t[15]);
  ecrs(rs[0],t[0]);
  /* a negated point has the same z, hence the same (z^2, z^3) */
  rs[1][0] = rs[15][0]; rs[1][1] = rs[15][1];
  rs[2][0] = rs[14][0]; rs[2][1] = rs[14][1];
  rs[3][0] = rs[13][0]; rs[3][1] = rs[13][1];
  rs[4][0] = rs[12][0]; rs[4][1] = rs[12][1];
  rs[5][0] = rs[11][0]; rs[5][1] = rs[11][1];
  rs[6][0] = rs[10][0]; rs[6][1] = rs[10][1];
  rs[7][0] = rs[9][0]; rs[7][1] = rs[9][1];

  /* first nibble: start from -(2 * t[0]) and add t[j] */
  ecdouble(q2,t[0]);
  j = (e[0] >> 4) & 15;
  if (j == 8)
    ecneg(q,q2);
  else {
    ecneg(q3,q2);
    ecrs(qrs,q3);
    ecadd(q,q3,qrs,t[j],rs[j]);
  }
  /* low nibble of e[0]: four doublings, then add t[j] */
  ecdouble(q2,q);
  ecdouble(q3,q2);
  ecdouble(q2,q3);
  j = e[0] & 15;
  if (j == 8)
    ecdouble(q,q2);
  else {
    ecdouble(q3,q2);
    ecrs(qrs,q3);
    ecadd(q,q3,qrs,t[j],rs[j]);
  }

  /* remaining bytes: the same four-doublings-then-add step for each nibble */
  for (i = 1;i < 28;++i) {
    ecdouble(q2,q);
    ecdouble(q3,q2);
    ecdouble(q2,q3);
    j = (e[i] >> 4) & 15;
    if (j == 8)
      ecdouble(q,q2);
    else {
      ecdouble(q3,q2);
      ecrs(qrs,q3);
      ecadd(q,q3,qrs,t[j],rs[j]);
    }
    ecdouble(q2,q);
    ecdouble(q3,q2);
    ecdouble(q2,q3);
    j = e[i] & 15;
    if (j == 8)
      ecdouble(q,q2);
    else {
      ecdouble(q3,q2);
      ecrs(qrs,q3);
      ecadd(q,q3,qrs,t[j],rs[j]);
    }
  }

  ecpack(out,q);

  if (!nistp224_valid(out)) {
    for (i = 0;i < 56;++i) out[i] = 0;
    return 0;
  }
  return 1;
  /* release the short names so they do not leak into the rest of the file */
#undef t
#undef rs
#undef q
#undef q2
#undef q3
#undef qrs
}

/*
 * Table consumed by p_sqrt; the data comes from the included file
 * sqtab53.c (not visible here).  Indexed as sqtab[e][pos][limb]: p_sqrt
 * multiplies entry [limb] by T_limb to rebuild a field element.
 * NOTE(review): the mathematical meaning of the entries is defined by the
 * generator of sqtab53.c -- confirm there.
 */
static const float sqtab[64][16][12] = {
#include "sqtab53.c"
} ;

/*
 * Lookup used by p_sqrt: the low limb t0 of an intermediate value is
 * reduced to the range 0..299 and etab[] maps it to a value in 0..63,
 * which then serves as the first index into sqtab.
 * NOTE(review): the choice of modulus 300 and of the table contents is not
 * derivable from this file -- confirm against the table generator.
 */
static const int etab[300] = {
  0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0, 36, 0, 0, 60, 17,
  0, 0, 0, 0, 0, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 48, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 19, 0, 0,
  0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 3, 8, 0, 0, 23,
  0, 31, 0, 0, 0, 0, 30, 41, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 6,
  25, 56, 21, 0, 0, 0, 0, 0, 0, 37, 0, 0, 0, 42, 0, 52, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,
  0, 0, 0, 58, 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 26, 0, 0,
  0, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 20, 0, 10, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 53, 24,
  57, 38, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 9, 62, 0, 0, 0, 0, 63,
  0, 55, 0, 0, 40, 35, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0, 0, 0, 2,
  0, 0, 0, 51, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 16, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 0,
  0, 49, 28, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 11, 0, 32,
} ;

/*
 * sqrt_load_row: rebuild a field element from one 12-entry row of sqtab by
 * scaling entry k with the limb weight T_k.
 */
static void sqrt_load_row(small *dst,const float row[12])
{
  dst->t0 = row[0] * T0;
  dst->t1 = row[1] * T1;
  dst->t2 = row[2] * T2;
  dst->t3 = row[3] * T3;
  dst->t4 = row[4] * T4;
  dst->t5 = row[5] * T5;
  dst->t6 = row[6] * T6;
  dst->t7 = row[7] * T7;
  dst->t8 = row[8] * T8;
  dst->t9 = row[9] * T9;
  dst->t10 = row[10] * T10;
  dst->t11 = row[11] * T11;
}

/*
 * p_sqrt: table-driven square-root computation used by
 * nistp224_uncompress; the caller validates the result afterwards.
 *
 * Phase 1: out = in^(2^127) and sqrt_u[0] = in^(2^128 - 1); then
 *   sqrt_u[n + 1] = sqrt_u[n]^(2^6) for n = 0..14.
 * Phase 2: for pos = 0..15 a digit is looked up through etab from the low
 *   limb of sqrt_u[15 - pos], and sqrt_u[0 .. 14 - pos] are each
 *   multiplied by the matching sqtab entry.
 * Phase 3: the 16 digits are shifted right by one bit as a single chain
 *   (bit 0 of digit n + 1 moves into bit 5 of digit n).
 * Phase 4: out is multiplied by the sqtab entry selected by each shifted
 *   digit.
 * Works on static storage (sqrt_*), so it is not reentrant.
 */
static void p_sqrt(small *out,const small *in)
{
  int digit[16];
  small *const prod = sqrt_t1;
  small *const factor = sqrt_t2;
  int n;
  int pos;

  p_96127(out,prod,in);     /* prod is 2^127 - 1 */
  p_fg(out,prod,in);        /* 2^127 */
  p_fg(sqrt_u[0],prod,out); /* 2^128 - 1 */

  /* six squarings per step */
  for (n = 0;n < 15;++n) {
    p_f2(prod,sqrt_u[n]);
    p_f2(factor,prod);
    p_f2(prod,factor);
    p_f2(factor,prod);
    p_f2(prod,factor);
    p_f2(sqrt_u[n + 1],prod);
  }

  for (pos = 0;pos < 16;++pos) {
    int idx;

    /* reduce the low limb to 0..299, also for negative values */
    idx = (int) sqrt_u[15 - pos]->t0;
    idx = ((idx % 300) + 300) % 300;
    digit[pos] = etab[idx];

    for (n = 14 - pos;n >= 0;--n) {
      sqrt_load_row(factor,sqtab[digit[pos]][n + pos]);
      p_fg(prod,sqrt_u[n],factor);
      sqrt_u[n][0] = prod[0];
    }
  }

  /* could check at this point whether digit[0] is even */
  digit[0] >>= 1;
  for (pos = 1;pos < 16;++pos) {
    digit[pos - 1] |= ((digit[pos] & 1) << 5);
    digit[pos] >>= 1;
  }

  for (pos = 0;pos < 16;++pos) {
    sqrt_load_row(factor,sqtab[digit[pos]][pos]);
    p_fg(prod,out,factor);
    out[0] = prod[0];
  }
}

int nistp224_uncompress(unsigned char inout[56])
{
#define x unc_x
#define z1 unc_t1
#define z2 unc_t2
  int i;

  c2d(x,inout);
  p_f2(z1,x);
  z1->t0 -= 3; /* z1 = x^2 - 3 */
  p_fg(z2,z1,x);
  z2->t0 += b.t0;
  z2->t1 += b.t1;
  z2->t2 += b.t2;
  z2->t3 += b.t3;
  z2->t4 += b.t4;
  z2->t5 += b.t5;
  z2->t6 += b.t6;
  z2->t7 += b.t7;
  z2->t8 += b.t8;
  z2->t9 += b.t9;
  z2->t10 += b.t10;
  z2->t11 += b.t11;
  p_sqrt(z1,z2);
  d2c(inout + 28,z1);

  if (!nistp224_valid(inout)) {
    for (i = 0;i < 56;++i) inout[i] = 0;
    return 0;
  }
  return 1;
#undef x
#undef z1
#undef z2
}

/*
 * nistp224: x-coordinate-only entry point.
 *
 * Copies the 28-byte x from `in`, reconstructs y with
 * nistp224_uncompress, runs nistp224_56 with exponent `e`, and copies the
 * first 28 bytes of that result to `out`.
 * Returns 1 on success.  On any failure `out` is left as 28 zero bytes
 * and 0 is returned.  Uses static buffers, so it is not reentrant.
 */
int nistp224(unsigned char out[28],unsigned char in[28],unsigned char e[28])
{
  static unsigned char out56[56];
  static unsigned char in56[56];
  int n;

  /* take the input first, then clear the output */
  for (n = 0;n < 28;++n) in56[n] = in[n];
  for (n = 0;n < 28;++n) out[n] = 0;

  if (!nistp224_uncompress(in56)) return 0;
  if (!nistp224_56(out56,in56,e)) return 0;

  for (n = 0;n < 28;++n) out[n] = out56[n];
  return 1;
}
