#include "nistp224.h"

/*
 * Load the x87 FPU control word with 0x137f: all floating-point
 * exceptions masked, precision control set to 64-bit mantissa (extended),
 * rounding to nearest.  The add-then-subtract rounding trick used with
 * alpha[] throughout this file needs a 64-bit mantissa (alpha[0] is
 * 1.5 * 2^63), so the public entry points call this first.
 *
 * fldcw reads a 16-bit memory operand.  The value therefore lives in a
 * named 16-bit object rather than being passed as an integer literal to an
 * "m" constraint (a literal is not an lvalue and is not 16 bits wide).
 * __asm__/__volatile__ are used so the file also builds in strict ISO
 * modes, where the plain `asm` keyword is not available.
 */
static void fpmode(void)
{
  static const unsigned short control_word = 0x137f;

  __asm__ __volatile__("fldcw %0" : : "m"(control_word));
}

/*
 * Limb weights.  Tk = 2^(28*k).  A field element is stored as 8 doubles;
 * limb k holds its share of the value already scaled by 2^(28*k) (see c2d
 * and d2c), not a bare 28-bit digit.
 */
#define T0 1.0
#define T1 268435456.0
#define T2 72057594037927936.0
#define T3 19342813113834066795298816.0
#define T4 5192296858534827628530496329220096.0
#define T5 1393796574908163946345982392040522594123776.0
#define T6 374144419156711147060143317175368453031918731001856.0
#define T7 100433627766186892221372630771322662657637687111424552206336.0

/* two28[k] = 2^(28*k): multiplier that turns digit k into limb k. */
static double two28[8] = { T0, T1, T2, T3, T4, T5, T6, T7 };
/* The field element 0; p_fg passes it as the subtracted term. */
static double zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

/*
 * half28[k] = 2^(-28*k), the reciprocal of two28[k].  d2c multiplies limb k
 * by half28[k] to scale it back down to a plain integer digit before
 * converting it to int.
 */
static double half28[8] = {
  1
, 0.0000000037252902984619140625
, 0.00000000000000001387778780781445675529539585113525390625
, 0.000000000000000000000000051698788284564229679463043254372678347863256931304931640625
, 0.0000000000000000000000000000000001925929944387235853055977942584927318538101648215388195239938795566558837890625
, 0.00000000000000000000000000000000000000000071746481373430634031294954664443705921549411424077607513961896135157303433516062796115875244140625
, 0.000000000000000000000000000000000000000000000000002672764710092195646140536467151481878815196880105048697961937492160398640987130358670498253559344448149204254150390625
, 0.0000000000000000000000000000000000000000000000000000000000099568244445778267314305024860481987849444034699538674129617865836635181344566135058943117170011417871311465432881959713995456695556640625
} ;

/*
 * twom128 = 2^-128 and twom224 = 2^-224.  With p = 2^224 - 2^96 + 1 we have
 * 2^224 = 2^96 - 1 (mod p), so a quantity c lying at or above 2^224 is
 * folded back by adding c * 2^-128 and subtracting c * 2^-224
 * (see p_fghi and d2c).
 */
static double twom128 = 0.00000000000000000000000000000000000000293873587705571876992184134305561419454666389193021880377187926569604314863681793212890625;
static double twom224 = 0.00000000000000000000000000000000000000000000000000000000000000000003709206150687421385731735261547639513367564778757791002453039058917581340095629358997312082723208437536338919136001159027049567384892725385725498199462890625;

/*
 * Rounding constants: alpha[k] = 1.5 * 2^(63 + 28*k).
 *
 * In arithmetic with a 64-bit mantissa (the mode selected by fpmode), the
 * sequence  y = alpha[k]; y += u; y -= alpha[k];  leaves in y the value of
 * u rounded to the nearest multiple of 2^(28*k), provided |u| is small
 * compared with alpha[k].  The code uses this to split a value into a
 * carry (y) and a remainder (u - y).
 */
static double alpha[16] = {
  13835058055282163712.0
, 3713820117856140824697372672.0
, 996920996838686904677855295210258432.0
, 267608942382367477698428619271780338071764992.0
, 71835728478088540235547516897670742982128396352356352.0
, 19283256531107883306503545108093951230266435925393514023616512.0
, 5176309760092922840576066896707769089338331729127789916371893167849472.0
, 1389505070847794345082851820104254894239239815987686768473491008094957555679232.0
, 372992427307339981616536686110115630075342113098010788080347982669869622759400031649792.0
, 100124392308792660112266642476697755372001774485468098591267575366666549645967525782326347825152.0
, 26876936906133650526689307366821331337459745966815794418799869211565502454161931344570529919259285389312.0
, 7214722814281215676076484393336843348098096790246358642812797864546946125152077138320483323038009455714104246272.0
, 1936687408565181442241939379002258905747279344621917634622994517405485716512630895972234014960103373576927378979560620032.0
, 519875567647652786170952659526828194394331951632965607844464921365241495213554804319995201144526170893260852055263169717932654592.0
, 139553035064756522985470171114496870595899141251965067574046118242682743317772401278628681737042604587666404148403106163238643512694013952.0
, 37460982603791906776579166757517995668983177711979653810309883515504460865837187441443673236761903653917923173456879474745375708151459423675482112.0
} ;

/*
 * Floor offsets: beta[k] = 2^(28*k - 1) - 2^(28*k - 33), i.e. just under
 * half of 2^(28*k).
 *
 * d2c subtracts beta[k] immediately before the alpha[k] round-to-nearest
 * step, which turns that rounding into (in effect) rounding down to a
 * multiple of 2^(28*k), so the remainders it produces are non-negative.
 */
static double beta[9] = {
  0.499999999883584678173065185546875
, 134217727.96875
, 36028797010575360.0
, 9671406554665233583964160.0
, 2596148428662950904457933577256960.0
, 696898287291822696343777832628683286773760.0
, 187072209534799430564191535264372276764693034434560.0
, 50216813871401433012039092040031852667088579398464815759360.0
, 13479973330436769029640167161591920625214739003219108518319787868160.0
} ;

/*
 * c2d: unpack 28 bytes (least significant byte first) into 8 limbs.
 *
 * The input is consumed in four groups of 7 bytes (56 bits).  Each group is
 * accumulated into an integer, scaled to its weight 2^(56*group), and then
 * split with the alpha[] rounding trick into
 *   out[2*group + 1] = the part rounded to a multiple of 2^(28*(2*group+1))
 *   out[2*group]     = the remainder (may be negative).
 */
static void c2d(double out[8],unsigned char in[28])
{
  int group;

  for (group = 0;group < 4;++group) {
    const int first = 7 * group;
    const int lo_limb = 2 * group;
    const int hi_limb = lo_limb + 1;
    long double low = 0;
    long double high;
    int pos;

    /* Horner evaluation, most significant byte of the group first. */
    for (pos = first + 6;pos >= first;--pos) {
      low *= 256.0;
      low += (double) (unsigned int) in[pos];
    }

    /* Group 0 already has weight 1; the others are scaled into place. */
    if (group != 0)
      low *= two28[lo_limb];

    high = alpha[hi_limb];
    high += low;
    high -= alpha[hi_limb];
    low -= high;

    out[lo_limb] = low;
    out[hi_limb] = high;
  }
}

/*
 * d2c: convert a field element in limb form (8 doubles, limb k scaled by
 * 2^(28*k)) to 28 bytes, least significant byte first.
 *
 * Steps:
 *   1. Estimate the quotient of the value by p = 2^224 - 2^96 + 1 with a
 *      carry chain (see the derivation in the comment below).
 *   2. Subtract quotient * p from the limbs.
 *   3. Propagate carries so that every limb becomes a non-negative
 *      multiple of 2^(28*k) below 2^(28*(k+1)).
 *   4. Scale each limb to a plain 28-bit digit and pack digit pairs into
 *      7 bytes.
 *
 * NOTE(review): the quotient derivation assumes the input is not much
 * larger than about 2^30 * p; confirm that all callers satisfy this.
 */
static void d2c(unsigned char out[28],double in[8])
{
  double x[8];
  long double q;
  int i;
  int u;
  int v;

/*
Inverse of 1-2^-128+2^-224: 1+2^-128-2^-224+2^-256+...
So t/p is approximately 2^(-224)t(1+2^-128-2^-224).
Claim: floor(t/p) = floor(2^(-224) (t(1+2^-128-2^-224)+1/2)).
Indeed, say t = pq + r, 0 <= r <= p - 1.
2^-224 (t(1+2^-128-2^-224)+1/2) - q
  = 2^-224 ((pq+r)(1+2^-128-2^-224)+1/2 - 2^224 q)
  = 2^-224 (1/2 + r(1+2^-128-2^-224) - 2^-32 q((1+2^-96)^2))
Have 0 <= r <= 2^224 - 2^96 so 0 <= r(1+2^-128-2^-224) <= 2^224 - 1.
So suffices to assume q at most 2^30 or so.
*/

  /*
   * Step 1: accumulate t(1+2^-128-2^-224)+1/2 limb by limb, keeping only
   * the carry at each stage.  Each "-= beta[k]; += alpha[k]; -= alpha[k]"
   * triple rounds q down to a multiple of 2^(28*k).
   */
  q = 0.5;
  q -= twom224 * in[7];
  q -= beta[0]; q += alpha[0]; q -= alpha[0];

  q += in[0];
  q += twom128 * in[5];
  q -= beta[1]; q += alpha[1]; q -= alpha[1];

  q += in[1];
  q += twom128 * in[6];
  q -= beta[2]; q += alpha[2]; q -= alpha[2];

  q += in[2];
  q += twom128 * in[7];
  q -= beta[3]; q += alpha[3]; q -= alpha[3];

  q += in[3];
  q -= beta[4]; q += alpha[4]; q -= alpha[4];

  q += in[4];
  q -= beta[5]; q += alpha[5]; q -= alpha[5];

  q += in[5];
  q -= beta[6]; q += alpha[6]; q -= alpha[6];

  q += in[6];
  q -= beta[7]; q += alpha[7]; q -= alpha[7];

  q += in[7];
  q -= beta[8]; q += alpha[8]; q -= alpha[8];

  /*
   * Step 2: q is now a multiple of 2^224, i.e. quotient * 2^224.
   * Subtract quotient * p = q - q*2^-128 + q*2^-224 from the limbs:
   * the 2^224 term comes off limb 7, the 2^96 term goes onto limb 3 and
   * the unit term comes off limb 0.
   */
  x[0] = in[0] - q * twom224;
  x[1] = in[1];
  x[2] = in[2];
  x[3] = in[3] + q * twom128;
  x[4] = in[4];
  x[5] = in[5];
  x[6] = in[6];
  x[7] = in[7] - q;

  /*
   * Step 3: carry propagation.  For each limb k, q becomes x[k] rounded
   * down to a multiple of 2^(28*(k+1)); that carry moves to limb k+1 and
   * the remainder stays in limb k.
   */
  q = x[0];
  q -= beta[1]; q += alpha[1]; q -= alpha[1];
  x[1] += q;
  x[0] -= q;

  q = x[1];
  q -= beta[2]; q += alpha[2]; q -= alpha[2];
  x[2] += q;
  x[1] -= q;

  q = x[2];
  q -= beta[3]; q += alpha[3]; q -= alpha[3];
  x[3] += q;
  x[2] -= q;

  q = x[3];
  q -= beta[4]; q += alpha[4]; q -= alpha[4];
  x[4] += q;
  x[3] -= q;

  q = x[4];
  q -= beta[5]; q += alpha[5]; q -= alpha[5];
  x[5] += q;
  x[4] -= q;

  q = x[5];
  q -= beta[6]; q += alpha[6]; q -= alpha[6];
  x[6] += q;
  x[5] -= q;

  q = x[6];
  q -= beta[7]; q += alpha[7]; q -= alpha[7];
  x[7] += q;
  x[6] -= q;

  /* Step 4a: remove the 2^(28*k) scaling so each limb is a plain digit. */
  for (i = 0;i < 8;++i)
    x[i] *= half28[i];

  /*
   * Step 4b: pack two 28-bit digits (u = even limb, v = odd limb) into 7
   * bytes.  Byte 3 is shared: its low nibble is the top 4 bits of u and
   * its high nibble is the low 4 bits of v.
   */
  for (i = 0;i < 4;++i) {
    u = x[i * 2];
    v = x[i * 2 + 1];
    out[i * 7] = u & 255; u >>= 8;
    out[i * 7 + 1] = u & 255; u >>= 8;
    out[i * 7 + 2] = u & 255; u >>= 8;
    out[i * 7 + 3] = (u + (v << 4)) & 255; v >>= 4;
    out[i * 7 + 4] = v & 255; v >>= 8;
    out[i * 7 + 5] = v & 255; v >>= 8;
    out[i * 7 + 6] = v & 255;
  }
}

/*
 * p_fghi: out = f*g - h*i in the field modulo p = 2^224 - 2^96 + 1.
 *
 * All operands are 8-limb field elements (limb k scaled by 2^(28*k)).
 * The routine forms the schoolbook product difference column by column:
 * uN is the sum of f[a]*g[b] - h[a]*i[b] over a + b = N.
 *
 * Columns 7..14 reach or exceed 2^224.  Their rounded-off high parts are
 * collected in c[0..7] (c[k] covers weights from 2^(224+28k) upward to the
 * next column) and folded back using 2^224 = 2^96 - 1 (mod p):
 *   + twom128 * c[k]  lands three columns above column k (2^96 = 2^(3*28+12)),
 *   - twom224 * c[k]  lands in column k itself.
 *
 * Between columns the alpha[] add/subtract trick splits off a carry, so
 * the result limbs stay small enough for the next multiplication.  The
 * output is only partially reduced; d2c performs the final reduction.
 *
 * The names u0..u14 are macros that reuse the six long double variables
 * r0..r5, so the order of the statements below matters.
 *
 * Constraints visible from the code:
 *   - c[] is static, so the function is not reentrant.
 *   - out[6] and out[7] are stored before f, g, h, i are read for columns
 *     0..3, so out must not overlap any of the inputs.
 *   - The rounding trick needs the FPU mode set by fpmode().
 */
static void p_fghi(double out[8],const double f[8],const double g[8],const double h[8],const double i[8])
{
  static double c[8];
  long double x;
  long double y;
  long double r0;
  long double r1;
  long double r2;
  long double r3;
  long double r4;
  long double r5;
#define u14 r0
#define u13 r1
#define u12 r2
#define u11 r3
#define u10 r4
#define u9 r5
#define u8 r0
#define u7 r1
#define u6 r2
#define u5 r3
#define u4 r4
#define u3 r5
#define u2 r1
#define u1 r2
#define u0 r0

  /* Column 14 (top).  Its carry becomes c[7]. */
  u14 = f[7]; u14 *= g[7];
  x = h[7]; x *= i[7]; u14 -= x;

  y = alpha[15]; y += u14; y -= alpha[15];
  u14 -= y;
  c[7] = y;

  /* Column 13.  c[6] = carry of column 13 + remainder of column 14. */
  u13 = f[6]; u13 *= g[7];
  x = f[7]; x *= g[6]; u13 += x;
  x = h[6]; x *= i[7]; u13 -= x;
  x = h[7]; x *= i[6]; u13 -= x;

  y = alpha[14]; y += u13; y -= alpha[14];
  u13 -= y;
  y += u14; c[6] = y;

  /* Column 12 -> c[5]. */
  u12 = f[5]; u12 *= g[7];
  x = f[6]; x *= g[6]; u12 += x;
  x = f[7]; x *= g[5]; u12 += x;
  x = h[5]; x *= i[7]; u12 -= x;
  x = h[6]; x *= i[6]; u12 -= x;
  x = h[7]; x *= i[5]; u12 -= x;

  y = alpha[13]; y += u12; y -= alpha[13];
  u12 -= y;
  y += u13; c[5] = y;


  /* Column 11 -> c[4]. */
  u11 = f[4]; u11 *= g[7];
  x = f[5]; x *= g[6]; u11 += x;
  x = f[6]; x *= g[5]; u11 += x;
  x = f[7]; x *= g[4]; u11 += x;
  x = h[4]; x *= i[7]; u11 -= x;
  x = h[5]; x *= i[6]; u11 -= x;
  x = h[6]; x *= i[5]; u11 -= x;
  x = h[7]; x *= i[4]; u11 -= x;

  y = alpha[12]; y += u11; y -= alpha[12];
  u11 -= y;
  y += u12; c[4] = y;

  /* Column 10 -> c[3]; receives the 2^96 fold of c[7]. */
  u10 = f[3]; u10 *= g[7];
  x = f[4]; x *= g[6]; u10 += x;
  x = f[5]; x *= g[5]; u10 += x;
  x = f[6]; x *= g[4]; u10 += x;
  x = f[7]; x *= g[3]; u10 += x;
  x = h[3]; x *= i[7]; u10 -= x;
  x = h[4]; x *= i[6]; u10 -= x;
  x = h[5]; x *= i[5]; u10 -= x;
  x = h[6]; x *= i[4]; u10 -= x;
  x = h[7]; x *= i[3]; u10 -= x;
  x = twom128; x *= c[7]; u10 += x;

  y = alpha[11]; y += u10; y -= alpha[11];
  u10 -= y;
  y += u11; c[3] = y;

  /* Column 9 -> c[2]; receives the 2^96 fold of c[6]. */
  u9 = f[2]; u9 *= g[7];
  x = f[3]; x *= g[6]; u9 += x;
  x = f[4]; x *= g[5]; u9 += x;
  x = f[5]; x *= g[4]; u9 += x;
  x = f[6]; x *= g[3]; u9 += x;
  x = f[7]; x *= g[2]; u9 += x;
  x = h[2]; x *= i[7]; u9 -= x;
  x = h[3]; x *= i[6]; u9 -= x;
  x = h[4]; x *= i[5]; u9 -= x;
  x = h[5]; x *= i[4]; u9 -= x;
  x = h[6]; x *= i[3]; u9 -= x;
  x = h[7]; x *= i[2]; u9 -= x;
  x = twom128; x *= c[6]; u9 += x;

  y = alpha[10]; y += u9; y -= alpha[10];
  u9 -= y;
  y += u10; c[2] = y;

  /* Column 8 -> c[1]; receives the 2^96 fold of c[5]. */
  u8 = f[1]; u8 *= g[7];
  x = f[2]; x *= g[6]; u8 += x;
  x = f[3]; x *= g[5]; u8 += x;
  x = f[4]; x *= g[4]; u8 += x;
  x = f[5]; x *= g[3]; u8 += x;
  x = f[6]; x *= g[2]; u8 += x;
  x = f[7]; x *= g[1]; u8 += x;
  x = h[1]; x *= i[7]; u8 -= x;
  x = h[2]; x *= i[6]; u8 -= x;
  x = h[3]; x *= i[5]; u8 -= x;
  x = h[4]; x *= i[4]; u8 -= x;
  x = h[5]; x *= i[3]; u8 -= x;
  x = h[6]; x *= i[2]; u8 -= x;
  x = h[7]; x *= i[1]; u8 -= x;
  x = twom128; x *= c[5]; u8 += x;

  y = alpha[9]; y += u8; y -= alpha[9];
  u8 -= y;
  y += u9; c[1] = y;


  /* Column 4, with both folds (-c[4] and +2^96 part of c[1]). */
  u4 = f[0]; u4 *= g[4];
  x = f[1]; x *= g[3]; u4 += x;
  x = f[2]; x *= g[2]; u4 += x;
  x = f[3]; x *= g[1]; u4 += x;
  x = f[4]; x *= g[0]; u4 += x;
  x = h[0]; x *= i[4]; u4 -= x;
  x = h[1]; x *= i[3]; u4 -= x;
  x = h[2]; x *= i[2]; u4 -= x;
  x = h[3]; x *= i[1]; u4 -= x;
  x = h[4]; x *= i[0]; u4 -= x;
  x = twom224; x *= c[4]; u4 -= x;
  x = twom128; x *= c[1]; u4 += x;

  /* Column 5. */
  u5 = f[0]; u5 *= g[5];
  x = f[1]; x *= g[4]; u5 += x;
  x = f[2]; x *= g[3]; u5 += x;
  x = f[3]; x *= g[2]; u5 += x;
  x = f[4]; x *= g[1]; u5 += x;
  x = f[5]; x *= g[0]; u5 += x;
  x = h[0]; x *= i[5]; u5 -= x;
  x = h[1]; x *= i[4]; u5 -= x;
  x = h[2]; x *= i[3]; u5 -= x;
  x = h[3]; x *= i[2]; u5 -= x;
  x = h[4]; x *= i[1]; u5 -= x;
  x = h[5]; x *= i[0]; u5 -= x;
  x = twom224; x *= c[5]; u5 -= x;
  x = twom128; x *= c[2]; u5 += x;

  /* Carry column 4 -> column 5. */
  y = alpha[5]; y += u4; y -= alpha[5];
  u5 += y;
  u4 -= y;

  /* Column 6. */
  u6 = f[0]; u6 *= g[6];
  x = f[1]; x *= g[5]; u6 += x;
  x = f[2]; x *= g[4]; u6 += x;
  x = f[3]; x *= g[3]; u6 += x;
  x = f[4]; x *= g[2]; u6 += x;
  x = f[5]; x *= g[1]; u6 += x;
  x = f[6]; x *= g[0]; u6 += x;
  x = h[0]; x *= i[6]; u6 -= x;
  x = h[1]; x *= i[5]; u6 -= x;
  x = h[2]; x *= i[4]; u6 -= x;
  x = h[3]; x *= i[3]; u6 -= x;
  x = h[4]; x *= i[2]; u6 -= x;
  x = h[5]; x *= i[1]; u6 -= x;
  x = h[6]; x *= i[0]; u6 -= x;
  x = twom224; x *= c[6]; u6 -= x;
  x = twom128; x *= c[3]; u6 += x;

  /* Carry column 5 -> column 6. */
  y = alpha[6]; y += u5; y -= alpha[6];
  u6 += y;
  u5 -= y;

  /* Column 7. */
  u7 = f[0]; u7 *= g[7];
  x = f[1]; x *= g[6]; u7 += x;
  x = f[2]; x *= g[5]; u7 += x;
  x = f[3]; x *= g[4]; u7 += x;
  x = f[4]; x *= g[3]; u7 += x;
  x = f[5]; x *= g[2]; u7 += x;
  x = f[6]; x *= g[1]; u7 += x;
  x = f[7]; x *= g[0]; u7 += x;
  x = h[0]; x *= i[7]; u7 -= x;
  x = h[1]; x *= i[6]; u7 -= x;
  x = h[2]; x *= i[5]; u7 -= x;
  x = h[3]; x *= i[4]; u7 -= x;
  x = h[4]; x *= i[3]; u7 -= x;
  x = h[5]; x *= i[2]; u7 -= x;
  x = h[6]; x *= i[1]; u7 -= x;
  x = h[7]; x *= i[0]; u7 -= x;
  x = twom224; x *= c[7]; u7 -= x;
  x = twom128; x *= c[4]; u7 += x;

  /* Carry column 6 -> column 7; limb 6 is final. */
  y = alpha[7]; y += u6; y -= alpha[7];
  u7 += y;
  u6 -= y; out[6] = u6;

  /*
   * Split column 7 at 2^224: the remainder is limb 7, and the carry plus
   * the remainder of column 8 forms c[0], folded into columns 0 and 3.
   */
  y = alpha[8]; y += u7; y -= alpha[8];
  u7 -= y; out[7] = u7;
  y += u8; c[0] = y;


  /* Column 0. */
  u0 = f[0]; u0 *= g[0];
  x = h[0]; x *= i[0]; u0 -= x;
  x = twom224; x *= c[0]; u0 -= x;

  /* Column 1. */
  u1 = f[0]; u1 *= g[1];
  x = f[1]; x *= g[0]; u1 += x;
  x = h[0]; x *= i[1]; u1 -= x;
  x = h[1]; x *= i[0]; u1 -= x;
  x = twom224; x *= c[1]; u1 -= x;

  /* Carry column 0 -> column 1; limb 0 is final. */
  y = alpha[1]; y += u0; y -= alpha[1];
  u1 += y;
  u0 -= y; out[0] = u0;

  /* Column 2. */
  u2 = f[0]; u2 *= g[2];
  x = f[1]; x *= g[1]; u2 += x;
  x = f[2]; x *= g[0]; u2 += x;
  x = h[0]; x *= i[2]; u2 -= x;
  x = h[1]; x *= i[1]; u2 -= x;
  x = h[2]; x *= i[0]; u2 -= x;
  x = twom224; x *= c[2]; u2 -= x;

  /* Carry column 1 -> column 2; limb 1 is final. */
  y = alpha[2]; y += u1; y -= alpha[2];
  u2 += y;
  u1 -= y; out[1] = u1;

  /* Column 3, including the 2^96 fold of c[0]. */
  u3 = f[0]; u3 *= g[3];
  x = f[1]; x *= g[2]; u3 += x;
  x = f[2]; x *= g[1]; u3 += x;
  x = f[3]; x *= g[0]; u3 += x;
  x = h[0]; x *= i[3]; u3 -= x;
  x = h[1]; x *= i[2]; u3 -= x;
  x = h[2]; x *= i[1]; u3 -= x;
  x = h[3]; x *= i[0]; u3 -= x;
  x = twom224; x *= c[3]; u3 -= x;
  x = twom128; x *= c[0]; u3 += x;

  /* Carry column 2 -> column 3; limb 2 is final. */
  y = alpha[3]; y += u2; y -= alpha[3];
  u3 += y;
  u2 -= y; out[2] = u2;

  /* Carry column 3 -> column 4; limb 3 is final. */
  y = alpha[4]; y += u3; y -= alpha[4];
  u4 += y;
  u3 -= y; out[3] = u3;

  /* Second carry column 4 -> column 5; limbs 4 and 5 are final. */
  y = alpha[5]; y += u4; y -= alpha[5];
  u5 += y; out[5] = u5;
  u4 -= y; out[4] = u4;
}

/*
 * p_fg8h2: out = f*g - 8*h^2.
 *
 * Builds 8*h limb by limb in a static scratch buffer and lets p_fghi
 * compute f*g - h*(8h).  Not reentrant because of the static buffer.
 */
static void p_fg8h2(double out[8],const double f[8],const double g[8],const double h[8])
{
  static double eight_h[8];
  int limb;

  /* Filled from the top limb down, matching the original store order. */
  for (limb = 7;limb >= 0;--limb)
    eight_h[limb] = 8 * h[limb];

  p_fghi(out,f,g,h,eight_h);
}

/*
 * p_fgh: out = f*g - h.
 *
 * Implemented as f*g - h*1 by handing p_fghi the field element 1.
 */
static void p_fgh(double out[8],const double f[8],const double g[8],const double h[8])
{
  static const double unit[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };

  p_fghi(out,f,g,h,unit);
}

/*
 * p_f2g: out = f^2 - g.
 *
 * Computed as f*f - g*1 in a single p_fghi call.
 */
static void p_f2g(double out[8],const double f[8],const double g[8])
{
  static const double unit[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };

  p_fghi(out,f,f,g,unit);
}

/*
 * p_fg: out = f*g.
 *
 * Computed as f*g - 0*0 by p_fghi.  Every subtracted product has a zero
 * limb as its first factor, so each contributes exactly +0, just as it
 * does when the second factor is the field element 1.
 */
static void p_fg(double out[8],const double f[8],const double g[8])
{
  p_fghi(out,f,g,zero,zero);
}

/* out = f^2 */
static void p_f2(double out[8],const double f[8])
{
  p_fg(out,f,f);
}

/*
 * p_96127: fixed addition chain computing two powers of `in`:
 *   out96  = in^(2^96 - 1)
 *   out127 = in^(2^127 - 1)
 *
 * The comment on each line gives the exponent of `in` held by the value
 * just written.  Squarings alternate between two scratch buffers because
 * p_f2 must not write its output over its input.  All scratch buffers are
 * static, so the function is not reentrant.
 */
static void p_96127(double out96[8],double out127[8],double in[8])
{
  static double p6[8];  /* in^(2^6 - 1) */
  static double p24[8]; /* in^(2^24 - 1) */
  static double t1[8];
  static double t2[8];
  static double t3[8];
  int i;

  p_f2(t1,in); /* 2 */
  p_fg(t2,t1,in); /* 2^2-1 */
  p_f2(t1,t2); /* 2^3-2 */
  p_fg(t2,t1,in); /* 2^3-1 */
  p_f2(t1,t2); /* 2^4-2^1 */
  p_f2(t3,t1); /* 2^5-2^2 */
  p_f2(t1,t3); /* 2^6-2^3 */
  p_fg(p6,t1,t2); /* 2^6-1 */
  p_f2(t1,p6); /* 2^7-2 */
  p_f2(t2,t1); /* 2^8-2^2 */
  p_f2(t1,t2); /* 2^9-2^3 */
  p_f2(t2,t1); /* 2^10-2^4 */
  p_f2(t1,t2); /* 2^11-2^5 */
  p_f2(t2,t1); /* 2^12-2^6 */
  p_fg(t1,t2,p6); /* 2^12-1 */
  p_f2(t2,t1); /* 2^13-2 */
  p_f2(t3,t2); /* 2^14-2^2 */
  /* 10 more squarings: 2^24-2^12 */
  for (i = 0;i < 5;++i) { p_f2(t2,t3); p_f2(t3,t2); }
  p_fg(p24,t1,t3); /* 2^24-1 */
  p_f2(t1,p24); /* 2^25-2 */
  p_f2(t3,t1); /* 2^26-2^2 */
  /* 22 more squarings: 2^48-2^24 */
  for (i = 0;i < 11;++i) { p_f2(t1,t3); p_f2(t3,t1); }
  p_fg(t1,p24,t3); /* 2^48-1 */
  p_f2(t2,t1); /* 2^49-2 */
  p_f2(t3,t2); /* 2^50-2^2 */
  /* 46 more squarings: 2^96-2^48 */
  for (i = 0;i < 23;++i) { p_f2(t2,t3); p_f2(t3,t2); }
  p_fg(out96,t1,t3); /* 2^96-1 */
  p_f2(t1,out96); /* 2^97-2 */
  p_f2(t2,t1); /* 2^98-2^2 */
  /* 22 more squarings: 2^120-2^24 */
  for (i = 0;i < 11;++i) { p_f2(t1,t2); p_f2(t2,t1); }
  p_fg(t1,p24,t2); /* 2^120-1 */
  /* 6 squarings: 2^126-2^6 */
  for (i = 0;i < 3;++i) { p_f2(t2,t1); p_f2(t1,t2); }
  p_fg(t2,p6,t1); /* 2^126-1 */
  p_f2(t1,t2); /* 2^127-2 */
  p_fg(out127,t1,in); /* 2^127-1 */
}

/*
 * p_invert: out = in^(2^224 - 2^96 - 1) = in^(p - 2), which is the
 * multiplicative inverse of `in` modulo p for nonzero `in`.
 *
 * Starts from in^(2^96 - 1) and in^(2^127 - 1) (p_96127), squares the
 * latter 97 times to reach exponent 2^224 - 2^97, and multiplies the two.
 * Uses static scratch space and is therefore not reentrant.
 */
static void p_invert(double out[8],double in[8])
{
  static double pow96[8]; /* in^(2^96 - 1) */
  static double ping[8];
  static double pong[8];
  int pairs;

  p_96127(pow96,out,in);  /* out = in^(2^127 - 1) */
  p_f2(ping,out);         /* exponent 2^128 - 2 */

  /* 48 pairs of squarings = 96 more doublings of the exponent. */
  for (pairs = 48;pairs > 0;--pairs) {
    p_f2(pong,ping);
    p_f2(ping,pong);
  }

  /* (2^224 - 2^97) + (2^96 - 1) = 2^224 - 2^96 - 1 */
  p_fg(out,pow96,ping);
}


/*
 * ecneg: negate a point given as three 8-limb coordinates (x, y, z).
 * x and z are copied unchanged; every limb of y changes sign.
 */
static void ecneg(double out[24],double in[24])
{
  const double *x = in;
  const double *y = in + 8;
  const double *z = in + 16;
  int limb = 0;

  while (limb < 8) {
    out[limb] = x[limb];
    out[8 + limb] = -y[limb];
    out[16 + limb] = z[limb];
    ++limb;
  }
}

/*
 * ecdouble: out = 2 * in for a point (x, y, z) stored as three 8-limb
 * field elements.
 *
 * With m = 3(x - z^2)(x + z^2) and s = x*y^2 the code computes
 *   x' = m^2 - 8s
 *   y' = m(4s - x') - 8(y^2)^2
 *   z' = (y + z)^2 - (y^2 + z^2)
 * Limb-wise sums and small multiples are formed directly on the limbs;
 * products go through the p_* routines.  Scratch buffers are static, so
 * the function is not reentrant.
 */
static void ecdouble(double out[24],double in[24])
{
  static double ysq[8];        /* y^2 */
  static double zsq[8];        /* z^2 */
  static double x_minus3[8];   /* 3(x - z^2) */
  static double x_plus[8];     /* x + z^2 */
  static double m[8];          /* 3(x - z^2)(x + z^2) */
  static double s[8];          /* x y^2 */
  static double s8[8];         /* 8 x y^2 */
  static double ysq_zsq[8];    /* y^2 + z^2 */
  static double s4_minus_x[8]; /* 4 x y^2 - x' */
  static double y_plus_z[8];   /* y + z */
  const double *x = in;
  const double *y = in + 8;
  const double *z = in + 16;
  int k;

  p_f2(ysq,y);
  p_f2(zsq,z);

  for (k = 0;k < 8;++k) {
    x_minus3[k] = 3 * (x[k] - zsq[k]);
    x_plus[k] = x[k] + zsq[k];
    ysq_zsq[k] = ysq[k] + zsq[k];
    y_plus_z[k] = y[k] + z[k];
  }

  p_fg(m,x_minus3,x_plus);
  p_fg(s,x,ysq);

  for (k = 0;k < 8;++k)
    s8[k] = 8 * s[k];
  p_f2g(out,m,s8);                    /* x' */

  for (k = 0;k < 8;++k)
    s4_minus_x[k] = 4 * s[k] - out[k];
  p_fg8h2(out + 8,m,s4_minus_x,ysq);  /* y' */

  p_f2g(out + 16,y_plus_z,ysq_zsq);   /* z' */
}

/*
 * ecadd: out = p1 + p2 for points (x, y, z) stored as three 8-limb field
 * elements.  rs1 and rs2 hold (z^2, z^3) of p1 and p2 respectively, as
 * produced by ecrs.
 *
 *   u1 = x1 z2^2          du = x2 z1^2 - u1
 *   s1 = y1 z2^3          ds = y2 z1^3 - s1
 *   x3 = ds^2 - (du^3 + 2 u1 du^2)
 *   y3 = ds (u1 du^2 - x3) - s1 du^3
 *   z3 = du z1 z2
 *
 * NOTE(review): these formulas give du = ds = 0 when p1 and p2 are the same
 * point; the code has no special case for that, so callers presumably
 * avoid it - confirm.  Scratch buffers are static (not reentrant).
 */
static void ecadd(double out[24],double p1[24],double rs1[16],double p2[24],double rs2[16])
{
  static double u1[8];
  static double du[8];
  static double s1[8];
  static double ds[8];
  static double du2[8];     /* du^2 */
  static double du3[8];     /* du^3 */
  static double u1du2[8];   /* u1 du^2 */
  static double z1z2[8];
  static double subtrahend[8];
  static double y_factor[8];
  int k;

  p_fg(u1,p1,rs2);
  p_fgh(du,p2,rs1,u1);
  p_fg(s1,p1 + 8,rs2 + 8);
  p_fgh(ds,p2 + 8,rs1 + 8,s1);

  p_f2(du2,du);
  p_fg(du3,du,du2);
  p_fg(u1du2,u1,du2);
  p_fg(z1z2,p1 + 16,p2 + 16);

  for (k = 0;k < 8;++k)
    subtrahend[k] = du3[k] + 2 * u1du2[k];
  p_f2g(out,ds,subtrahend);            /* x3 */

  p_fg(out + 16,du,z1z2);              /* z3 */

  for (k = 0;k < 8;++k)
    y_factor[k] = u1du2[k] - out[k];
  p_fghi(out + 8,ds,y_factor,s1,du3);  /* y3 */
}

/*
 * ecpack: convert a point (x, y, z) to 56 bytes: x / z^2 in out[0..27]
 * followed by y / z^3 in out[28..55], each serialized by d2c.
 * Uses static scratch buffers (not reentrant).
 */
static void ecpack(unsigned char out[56],double in[24])
{
  static double zinv[8];   /* z^-1 */
  static double zinv2[8];  /* z^-2 */
  static double zinv3[8];  /* z^-3 */
  static double xaff[8];   /* x z^-2 */
  static double yaff[8];   /* y z^-3 */

  p_invert(zinv,in + 16);
  p_f2(zinv2,zinv);
  p_fg(zinv3,zinv,zinv2);

  p_fg(xaff,zinv2,in);
  p_fg(yaff,zinv3,in + 8);

  d2c(out,xaff);
  d2c(out + 28,yaff);
}

/*
 * ecunpack: read a 56-byte (x, y) pair into point form with z = 1.
 * x comes from in[0..27], y from in[28..55]; the z coordinate is set to
 * the field element 1 (limb 0 = 1, all other limbs 0).
 */
static void ecunpack(double out[24],unsigned char in[56])
{
  int limb;

  c2d(out,in);
  c2d(out + 8,in + 28);

  out[16] = 1;
  for (limb = 17;limb < 24;++limb)
    out[limb] = 0;
}


/*
 * The curve coefficient b of y^2 = x^3 - 3x + b, in limb form: eight
 * 28-bit digits, least significant first, each multiplied by its weight
 * Tk = 2^(28*k).  Read from the top digit down, the hex digits spell
 * b4050a85 0c04b3ab f5413256 5044b0b7 d7bfd8ba 270b3943 2355ffb4.
 */
static double b[8] = {
  0x355ffb4 * T0,
  0x0b39432 * T1,
  0xfd8ba27 * T2,
  0xb0b7d7b * T3,
  0x2565044 * T4,
  0xabf5413 * T5,
  0x50c04b3 * T6,
  0xb4050a8 * T7
} ;

/*
 * nistp224_valid: test whether the 56-byte (x, y) pair satisfies the curve
 * equation y^2 = x^3 - 3x + b.
 *
 * Returns 1 when every limb of x^3 - 3x + b - y^2, as produced by the
 * field routines, is zero, and 0 otherwise.  Sets the FPU mode itself.
 * Uses static scratch buffers (not reentrant).
 */
int nistp224_valid(unsigned char in[56])
{
  static double px[8];
  static double py[8];
  static double xsq_m3[8];    /* x^2 - 3 */
  static double ysq_mb[8];    /* y^2 - b */
  static double residual[8];  /* x^3 - 3x + b - y^2 */
  static double norm;

  fpmode();

  c2d(px,in);
  c2d(py,in + 28);

  p_f2(xsq_m3,px);
  xsq_m3[0] -= 3;
  p_f2g(ysq_mb,py,b);
  p_fgh(residual,xsq_m3,px,ysq_mb);

  /* Sum of squares of the limbs: zero only if every limb is zero. */
  norm = residual[0] * residual[0] + residual[1] * residual[1] + residual[2] * residual[2] + residual[3] * residual[3] + residual[4] * residual[4] + residual[5] * residual[5] + residual[6] * residual[6] + residual[7] * residual[7];

  return norm ? 0 : 1;
}

/*
 * ecrs: precompute the powers of z that ecadd needs for a point.
 * out[0..7] = z^2 and out[8..15] = z^3, where z = in[16..23].
 */
static void ecrs(double out[16],double in[24])
{
  const double *z = in + 16;
  double *zsq = out;
  double *zcube = out + 8;

  p_fg(zsq,z,z);
  p_fg(zcube,zsq,z);
}

/*
 * nistp224_56: scalar multiplication on uncompressed points.
 *
 *   in  - 56-byte (x, y) input point P
 *   e   - 28-byte scalar, consumed from e[0] to e[27], high nibble of each
 *         byte first
 *   out - 56-byte (x, y) result
 *
 * Returns 1 on success.  Returns 0 and zeroes out[] if the input point
 * fails nistp224_valid, or if the packed result fails nistp224_valid.
 *
 * Method: fixed 4-bit windows with signed table entries.  A nibble value j
 * selects t[j], which holds the multiple (j - 8) * P; j == 8 stands for the
 * multiple 0 and the addition is skipped.  Each window step is four
 * doublings followed by (at most) one addition.
 *
 * NOTE(review): because every nibble is offset by 8 (and the first one by
 * +8 rather than -8, see below), the multiple of P that is computed
 * appears to be e plus a fixed constant rather than e itself; confirm
 * against the library's documented scalar encoding.
 *
 * All working storage is static, so the function is not reentrant.
 */
int nistp224_56(unsigned char out[56],unsigned char in[56],unsigned char e[28])
{
  static double t[16][32]; /* t[8+i] is p^i, for nonzero i */
  static double rs[16][16]; /* rs[j] = (z^2, z^3) of t[j], see ecrs */
  static double q[24];
  static double q2[24];
  static double q3[24];
  static double qrs[16];
  int i;
  int j;

  fpmode();

  if (!nistp224_valid(in)) {
    for (i = 0;i < 56;++i) out[i] = 0;
    return 0;
  }

  /* Positive multiples: t[9] = P, t[10] = 2P, ..., t[15] = 7P. */
  ecunpack(t[9],in);
  ecdouble(t[10],t[9]);
  ecrs(rs[9],t[9]);
  ecrs(rs[10],t[10]);
  ecadd(t[11],t[10],rs[10],t[9],rs[9]);
  ecdouble(t[12],t[10]);
  ecdouble(t[14],t[11]);
  ecrs(rs[12],t[12]);
  ecadd(t[13],t[12],rs[12],t[9],rs[9]);
  ecrs(rs[14],t[14]);
  ecadd(t[15],t[14],rs[14],t[9],rs[9]);
  /* Negative multiples: t[8-i] = -(iP) for i = 1..7. */
  ecneg(t[7],t[9]);
  ecneg(t[6],t[10]);
  ecneg(t[5],t[11]);
  ecneg(t[4],t[12]);
  ecneg(t[3],t[13]);
  ecneg(t[2],t[14]);
  ecneg(t[1],t[15]);
  /* t[0] = 2 * (-4P) = -8P. */
  ecdouble(t[0],t[4]);

  ecrs(rs[11],t[11]);
  ecrs(rs[13],t[13]);
  ecrs(rs[15],t[15]);
  ecrs(rs[0],t[0]);
  /* Negation leaves z unchanged, so -iP shares (z^2, z^3) with iP. */
  for (i = 0;i < 16;++i) rs[1][i] = rs[15][i];
  for (i = 0;i < 16;++i) rs[2][i] = rs[14][i];
  for (i = 0;i < 16;++i) rs[3][i] = rs[13][i];
  for (i = 0;i < 16;++i) rs[4][i] = rs[12][i];
  for (i = 0;i < 16;++i) rs[5][i] = rs[11][i];
  for (i = 0;i < 16;++i) rs[6][i] = rs[10][i];
  for (i = 0;i < 16;++i) rs[7][i] = rs[9][i];

  /*
   * First nibble: q2 = 2 * (-8P) = -16P, which is negated to 16P before
   * t[j] is added, so the accumulator starts at (j + 8) * P.
   */
  ecdouble(q2,t[0]);
  j = (e[0] >> 4) & 15;
  if (j == 8)
    ecneg(q,q2);
  else {
    ecneg(q3,q2);
    ecrs(qrs,q3);
    ecadd(q,q3,qrs,t[j],rs[j]);
  }
  /* Second nibble of e[0]: q = 16q + (j - 8)P. */
  ecdouble(q2,q);
  ecdouble(q3,q2);
  ecdouble(q2,q3);
  j = e[0] & 15;
  if (j == 8)
    ecdouble(q,q2);
  else {
    ecdouble(q3,q2);
    ecrs(qrs,q3);
    ecadd(q,q3,qrs,t[j],rs[j]);
  }

  /* Remaining bytes: two window steps (high nibble, low nibble) each. */
  for (i = 1;i < 28;++i) {
    ecdouble(q2,q);
    ecdouble(q3,q2);
    ecdouble(q2,q3);
    j = (e[i] >> 4) & 15;
    if (j == 8)
      ecdouble(q,q2);
    else {
      ecdouble(q3,q2);
      ecrs(qrs,q3);
      ecadd(q,q3,qrs,t[j],rs[j]);
    }
    ecdouble(q2,q);
    ecdouble(q3,q2);
    ecdouble(q2,q3);
    j = e[i] & 15;
    if (j == 8)
      ecdouble(q,q2);
    else {
      ecdouble(q3,q2);
      ecrs(qrs,q3);
      ecadd(q,q3,qrs,t[j],rs[j]);
    }
  }

  ecpack(out,q);

  /* Final sanity check on the packed result. */
  if (!nistp224_valid(out)) {
    for (i = 0;i < 56;++i) out[i] = 0;
    return 0;
  }
  return 1;
}

/*
 * Table used by p_sqrt: 64 x 16 field elements, each stored as 8 integer
 * digits that p_sqrt multiplies by two28[j] to obtain limbs.  The first
 * index is a value taken from etab (or derived from it), the second a
 * position 0..15.  The contents are pulled in from sqtab64.c, which is not
 * visible here.
 * NOTE(review): presumably powers of a fixed root of unity; confirm
 * against the table generator.
 */
static const int sqtab[64][16][8] = {
#include "sqtab64.c"
} ;

/*
 * Lookup table used by p_sqrt.  The index is the lowest limb of a field
 * element reduced to the range 0..299 (see the "% 300" arithmetic in
 * p_sqrt); the entry is a value in 0..63 that p_sqrt uses as the first
 * index into sqtab.  Slots that are not listed explicitly are 0.
 * NOTE(review): this looks like a hash from a small set of expected field
 * elements to their exponents; confirm which elements map to index 0 and
 * how unexpected inputs are meant to be handled.
 */
static const int etab[300] = {
  0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 49, 0, 0, 8, 0, 14, 0, 27, 19, 0, 0, 0, 0, 0, 52,
  0, 0, 0, 0, 0, 0, 0, 0, 29, 38, 0, 0, 0, 0, 53, 43, 57, 56, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 16, 55, 28, 5, 0, 0, 0, 0, 0, 42, 30, 0,
  0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 7, 0, 26, 0, 0, 0, 0, 0, 22,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 0, 0, 0, 0, 0, 15, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 35, 0, 0,
  0, 0, 0, 2, 12, 0, 0, 0, 0, 1, 0, 33, 0, 0, 0, 0, 44, 34, 0, 0,
  0, 0, 0, 3, 0, 0, 0, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 47, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 54, 0, 0, 0, 0, 0, 58, 0, 39, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
  0, 0, 62, 10, 0, 0, 0, 0, 0, 37, 60, 23, 48, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 24, 25, 11, 21, 0, 0, 0, 0, 6, 61, 0, 0, 0, 0, 0, 0, 0,
  0, 20, 0, 0, 0, 0, 0, 51, 59, 0, 46, 0, 40, 0, 0, 17, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 45, 0, 0, 0, 0, 32,
} ;

/*
 * p_sqrt: compute a candidate square root of `in` modulo p into `out`.
 * The result is not verified here; nistp224_uncompress checks it by
 * validating the resulting point.
 *
 * Outline, as far as the code shows:
 *   - out  = in^(2^127), so out^2 = in * in^(2^128 - 1).
 *   - u[0] = in^(2^128 - 1); u[i+1] = u[i]^(2^6), giving 16 values.
 *   - For k = 0..15, a 6-bit value e[k] is read from etab using the low
 *     limb of u[15-k], and the remaining u[i] are multiplied by
 *     sqtab[e[k]][i+k].
 *   - The 16 values e[k] are then shifted right by one bit as a single
 *     96-bit quantity (bit 0 of e[k] moves to bit 5 of e[k-1]), and out is
 *     multiplied by sqtab[e[k]][k] for each k.
 * NOTE(review): this has the shape of a windowed discrete-log square-root
 * method for p - 1 = 2^96 (2^128 - 1); the exact meaning of the sqtab
 * entries cannot be confirmed from this file.
 *
 * Uses static scratch buffers (not reentrant).
 */
static void p_sqrt(double out[8],double in[8])
{
  int e[16];
  static double u[16][8];
  static double t1[8];
  static double t2[8];
  int i;
  int j;
  int k;

  /* The 2^96-1 power written to out here is overwritten on the next line. */
  p_96127(out,t1,in); /* t1 is 2^127 - 1 */
  p_fg(out,t1,in); /* 2^127 */
  p_fg(u[0],t1,out); /* 2^128 - 1 */

  /* u[i+1] = u[i]^(2^6): six squarings, alternating scratch buffers. */
  for (i = 0;i < 15;++i) {
    p_f2(t1,u[i]);
    p_f2(t2,t1);
    p_f2(t1,t2);
    p_f2(t2,t1);
    p_f2(t1,t2);
    p_f2(u[i + 1],t1);
  }

  for (k = 0;k < 16;++k) {
    int x;

    /* Reduce the low limb to 0..299; the +300 handles negative limbs. */
    x = u[15 - k][0];
    x %= 300;
    x += 300;
    x %= 300;
    e[k] = etab[x];
    /* Apply the table entry for this digit to the u[] still needed. */
    for (i = 14 - k;i >= 0;--i) {
      for (j = 0;j < 8;++j) t2[j] = sqtab[e[k]][i + k][j] * two28[j];
      p_fg(t1,u[i],t2);
      for (j = 0;j < 8;++j) u[i][j] = t1[j];
    }
  }

  /* could check at this point whether e[0] is even */
  /* Halve the 96-bit value made of the sixteen 6-bit digits e[k]. */
  e[0] >>= 1;
  for (k = 1;k < 16;++k) {
    e[k - 1] |= ((e[k] & 1) << 5);
    e[k] >>= 1;
  }

  /* Multiply out by the table entry selected by each halved digit. */
  for (k = 0;k < 16;++k) {
    for (j = 0;j < 8;++j) t2[j] = sqtab[e[k]][k][j] * two28[j];
    p_fg(t1,out,t2);
    for (j = 0;j < 8;++j) out[j] = t1[j];
  }
}

/*
 * nistp224_uncompress: given x in inout[0..27], compute a matching y and
 * store it in inout[28..55].
 *
 * y is taken as p_sqrt(x^3 - 3x + b).  The completed (x, y) pair is then
 * checked with nistp224_valid; if the check fails, all 56 bytes are zeroed
 * and 0 is returned.  Returns 1 on success.
 * Uses static scratch buffers (not reentrant).
 */
int nistp224_uncompress(unsigned char inout[56])
{
  static double px[8];
  static double acc[8];
  static double rhs[8];
  int k;

  fpmode();

  c2d(px,inout);

  p_f2(acc,px);
  acc[0] -= 3;                       /* x^2 - 3 */
  p_fg(rhs,acc,px);                  /* x^3 - 3x */
  for (k = 0;k < 8;++k)
    rhs[k] += b[k];                  /* x^3 - 3x + b */

  p_sqrt(acc,rhs);
  d2c(inout + 28,acc);

  if (nistp224_valid(inout))
    return 1;

  for (k = 0;k < 56;++k)
    inout[k] = 0;
  return 0;
}

/*
 * nistp224: scalar multiplication on compressed (x-only) points.
 *
 *   in  - 28-byte x coordinate of the input point
 *   e   - 28-byte scalar, passed through to nistp224_56
 *   out - 28-byte x coordinate of the result
 *
 * out is cleared first, so it is all zero whenever 0 is returned, which
 * happens if decompression or the multiplication fails.  Returns 1 on
 * success.  Uses static buffers (not reentrant).
 */
int nistp224(unsigned char out[28],unsigned char in[28],unsigned char e[28])
{
  static unsigned char result56[56];
  static unsigned char point56[56];
  int k;

  /* Copy the input before clearing out, in that order. */
  for (k = 0;k < 28;++k)
    point56[k] = in[k];
  for (k = 0;k < 28;++k)
    out[k] = 0;

  if (!nistp224_uncompress(point56))
    return 0;
  if (!nistp224_56(result56,point56,e))
    return 0;

  for (k = 0;k < 28;++k)
    out[k] = result56[k];
  return 1;
}
