#include "nistp224.h"

/*
 * Precomputed integer table consumed by p_sqrt(); the entries are supplied
 * by sqtab64.c.  p_sqrt() indexes it as sqtab[e][row], where e is a value
 * taken from etab[] (or derived from those values) and row runs over 0..15,
 * and multiplies entry j of the selected row by two28[j] before use.
 */
static const int sqtab[64][16][8] = {
#include "sqtab64.c"
} ;

/*
 * 300-entry lookup used by p_sqrt(): the index is element 0 of an
 * intermediate value, converted to int and reduced into 0..299; the value
 * read is used as the first index into sqtab[].
 */
static const int etab[300] = {
  0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 49, 0, 0, 8, 0, 14, 0, 27, 19, 0, 0, 0, 0, 0, 52,
  0, 0, 0, 0, 0, 0, 0, 0, 29, 38, 0, 0, 0, 0, 53, 43, 57, 56, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 16, 55, 28, 5, 0, 0, 0, 0, 0, 42, 30, 0,
  0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 7, 0, 26, 0, 0, 0, 0, 0, 22,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 0, 0, 0, 0, 0, 15, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 35, 0, 0,
  0, 0, 0, 2, 12, 0, 0, 0, 0, 1, 0, 33, 0, 0, 0, 0, 44, 34, 0, 0,
  0, 0, 0, 3, 0, 0, 0, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 47, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 54, 0, 0, 0, 0, 0, 58, 0, 39, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
  0, 0, 62, 10, 0, 0, 0, 0, 0, 37, 60, 23, 48, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 24, 25, 11, 21, 0, 0, 0, 0, 6, 61, 0, 0, 0, 0, 0, 0, 0,
  0, 20, 0, 0, 0, 0, 0, 51, 59, 0, 46, 0, 40, 0, 0, 17, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 45, 0, 0, 0, 0, 32,
} ;

/*
 * Limb weights: Tk = 2^(28*k) for k = 0..7 (T1 = 2^28, T2 = 2^56, ...).
 * Element k of an 8-element field value in this file is stored already
 * multiplied by Tk (see c6[] and the two28[] scaling in p_sqrt()).
 */
#define T0 1.0
#define T1 268435456.0
#define T2 72057594037927936.0
#define T3 19342813113834066795298816.0
#define T4 5192296858534827628530496329220096.0
#define T5 1393796574908163946345982392040522594123776.0
#define T6 374144419156711147060143317175368453031918731001856.0
#define T7 100433627766186892221372630771322662657637687111424552206336.0

/* Field elements 0 and 1 in the 8-limb representation. */
static const double zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static const double one[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
/*
 * Constant term of the curve equation tested in nistp224_valid()
 * (x^3 - 3x + c6 - y^2 == 0), given as eight 28-bit limbs, least
 * significant first, each pre-multiplied by its weight Tk.  Read as one
 * integer the limbs spell b4050a85 0c04b3ab f5413256 5044b0b7 d7bfd8ba
 * 270b3943 2355ffb4, which matches the published P-224 coefficient b.
 */
static const double c6[8] = {
  0x355ffb4 * T0,
  0x0b39432 * T1,
  0xfd8ba27 * T2,
  0xb0b7d7b * T3,
  0x2565044 * T4,
  0xabf5413 * T5,
  0x50c04b3 * T6,
  0xb4050a8 * T7
} ;

/*
 * All floating-point constants live in one array; the interleaved #defines
 * give names to individual slots (scalars) or to sub-ranges (pointers into
 * the array).  The slot numbers in the #defines must match the order of the
 * initializers below.
 */
static const double constants[] = {
#define constant_2 (constants[0])
  2.0
#define constant_3 (constants[1])
, 3.0
#define constant_4 (constants[2])
, 4.0
#define constant_8 (constants[3])
, 8.0
/* 2^-128 */
#define twom128 (constants[4])
, 0.00000000000000000000000000000000000000293873587705571876992184134305561419454666389193021880377187926569604314863681793212890625
/* 2^-224 */
#define twom224 (constants[5])
, 0.00000000000000000000000000000000000000000000000000000000000000000003709206150687421385731735261547639513367564778757791002453039058917581340095629358997312082723208437536338919136001159027049567384892725385725498199462890625
/* two28[k] = Tk, the weight of limb k (8 entries) */
#define two28 (constants + 6)
, T0
, T1
, T2
, T3
, T4
, T5
, T6
, T7
/* half28[k] = 1 / Tk, used to strip the weight from limb k (8 entries) */
#define half28 (constants + 14)
, 1 / T0
, 1 / T1
, 1 / T2
, 1 / T3
, 1 / T4
, 1 / T5
, 1 / T6
, 1 / T7
/*
 * alpha[k] (16 entries): alpha[0] = 3 * 2^62 and alpha[1] = 3 * 2^90; the
 * rest appear to continue as 3 * 2^(62 + 28k).  Used in the pattern
 * "v += alpha[k]; v -= alpha[k];" in d2c() and c2d(), which presumably
 * relies on floating-point rounding to drop the bits of v below 2^(28k)
 * (assumes the precision selected by fpmode() -- TODO confirm).
 */
#define alpha (constants + 22)
, 13835058055282163712.0
, 3713820117856140824697372672.0
, 996920996838686904677855295210258432.0
, 267608942382367477698428619271780338071764992.0
, 71835728478088540235547516897670742982128396352356352.0
, 19283256531107883306503545108093951230266435925393514023616512.0
, 5176309760092922840576066896707769089338331729127789916371893167849472.0
, 1389505070847794345082851820104254894239239815987686768473491008094957555679232.0
, 372992427307339981616536686110115630075342113098010788080347982669869622759400031649792.0
, 100124392308792660112266642476697755372001774485468098591267575366666549645967525782326347825152.0
, 26876936906133650526689307366821331337459745966815794418799869211565502454161931344570529919259285389312.0
, 7214722814281215676076484393336843348098096790246358642812797864546946125152077138320483323038009455714104246272.0
, 1936687408565181442241939379002258905747279344621917634622994517405485716512630895972234014960103373576927378979560620032.0
, 519875567647652786170952659526828194394331951632965607844464921365241495213554804319995201144526170893260852055263169717932654592.0
, 139553035064756522985470171114496870595899141251965067574046118242682743317772401278628681737042604587666404148403106163238643512694013952.0
, 37460982603791906776579166757517995668983177711979653810309883515504460865837187441443673236761903653917923173456879474745375708151459423675482112.0
/*
 * beta[k] (9 entries): beta[0] = 0.5 - 2^-33 and beta[1] = 2^27 - 2^-5,
 * i.e. slightly less than half of 2^(28k).  d2c() subtracts beta[k] just
 * before the alpha[k] rounding step.
 */
#define beta (constants + 38)
, 0.499999999883584678173065185546875
, 134217727.96875
, 36028797010575360.0
, 9671406554665233583964160.0
, 2596148428662950904457933577256960.0
, 696898287291822696343777832628683286773760.0
, 187072209534799430564191535264372276764693034434560.0
, 50216813871401433012039092040031852667088579398464815759360.0
, 13479973330436769029640167161591920625214739003219108518319787868160.0
} ;

/*
 * File-scope scratch storage shared by the routines below.  Each routine
 * addresses it through its own fixed offsets, so which slots are live
 * depends on the call chain.  Because the buffers are static and shared,
 * the routines in this file are not reentrant.
 */
static double tmp[120];
static double tmp2[40];
static double tmp3[16][40];

/* Forward declarations for helpers that are defined further down the file. */
static void ecmul(double *,double *,unsigned char *);
static void p_fghi(double *,const double *,const double *,const double *,const double *);
static void p_fg8h2(double *,const double *,const double *,const double *);
static void p_fgh(double *,const double *,const double *,const double *);
static void p_fg(double *,const double *,const double *);
static void p_f2g(double *,const double *,const double *);
static void p_f2(double *,const double *);
static void p_96127(double *,double *,const double *);
static void d2c(unsigned char *,const double *);

/*
 * Load the x87 FPU control word with 0x137f.  Every public entry point in
 * this file calls this before doing any floating-point work.
 *
 * Per the x87 control-word layout, 0x137f masks all six floating-point
 * exceptions, selects the 64-bit significand (extended) precision and
 * round-to-nearest.
 *
 * fldcw reads a 16-bit memory operand, so the value is kept in a named
 * 16-bit object instead of being passed as an int literal to an "m"
 * constraint.  __asm__/__volatile__ are used so that the function also
 * compiles in strict ISO modes, where the plain asm keyword is unavailable.
 * The previous control word is not saved or restored.
 */
static inline void fpmode(void)
{
  static const unsigned short x87_control_word = 0x137f;

  __asm__ __volatile__("fldcw %0" : : "m"(x87_control_word));
}

static void p_sqrt(double out[8],const double in[8])
{
  static int e[16];
#define u tmp3
#define t1 (tmp + 100)
#define t2 (tmp + 108)
#define t3 (tmp + 116)
  register const int *z;
  register int i;
  register int k;
  register double r0;
  register double r1;
  register double r2;
  register double r3;
  register double r4;
  register double r5;
  register double r6;
  register double r7;

  p_96127(t3,t1,in); /* t1 is 2^127 - 1 */
  p_fg(t3,t1,in); /* 2^127 */
  p_fg(u[0],t1,t3); /* 2^128 - 1 */

  for (i = 0;i < 15;++i) {
    p_f2(t1,u[i]);
    p_f2(t2,t1);
    p_f2(t1,t2);
    p_f2(t2,t1);
    p_f2(t1,t2);
    p_f2(u[i + 1],t1);
  }

  for (k = 0;k < 16;++k) {
    i = u[15 - k][0];
    i %= 300;
    i += 300;
    i %= 300;
    e[k] = etab[i];

    z = sqtab[e[k]][14];
    for (i = 14 - k;i >= 0;--i) {
      r0 = z[0];
      r1 = z[1];
      r4 = u[i][0];
      r0 *= two28[0];
      r5 = u[i][1];
      r1 *= two28[1];
      t1[0] = r4;
      t2[0] = r0;
      t1[1] = r5;
      t2[1] = r1;
      r2 = z[2];
      r3 = z[3];
      r6 = u[i][2];
      r2 *= two28[2];
      r7 = u[i][3];
      r3 *= two28[3];
      t1[2] = r6;
      t2[2] = r2;
      t1[3] = r7;
      t2[3] = r3;
      r0 = z[4];
      r1 = z[5];
      r4 = u[i][4];
      r0 *= two28[4];
      r5 = u[i][5];
      r1 *= two28[5];
      t1[4] = r4;
      t2[4] = r0;
      t1[5] = r5;
      t2[5] = r1;
      r2 = z[6];
      r3 = z[7];
      r6 = u[i][6];
      r2 *= two28[6];
      r7 = u[i][7];
      r3 *= two28[7];
      t1[6] = r6;
      t2[6] = r2;
      t1[7] = r7;
      t2[7] = r3;
      z -= 8;
      p_fg(u[i],t1,t2);
    }
  }

  /* could check at this point whether e[0] is even */
  e[0] >>= 1;
  for (k = 1;k < 16;++k) {
    e[k - 1] |= ((e[k] & 1) << 5);
    e[k] >>= 1;
  }

  for (k = 0;k < 16;++k) {
    z = sqtab[e[k]][k];
    for (i = 0;i < 8;++i) {
      r0 = z[i];
      r4 = t3[i];
      r0 *= two28[i];
      t1[i] = r4;
      t2[i] = r0;
    }
    p_fg(t3,t1,t2);
  }

  for (i = 0;i < 8;++i) {
    r0 = t3[i];
    out[i] = r0;
  }
#undef u
#undef t1
#undef t2
#undef t3
}

/* Square x 2*pairs times using scratch as the intermediate; result ends in x. */
static void p_96127_sqpairs(double *x,double *scratch,int pairs)
{
  while (pairs-- > 0) {
    p_f2(scratch,x);
    p_f2(x,scratch);
  }
}

/*
 * Addition chain producing two powers of in:
 *   out96  = in^(2^96 - 1)
 *   out127 = in^(2^127 - 1)
 * The trailing comments give the exponent of in held by the destination.
 * Uses tmp[0..39] as working storage.  out127 is written only by the final
 * step, so it may share storage with the slot at tmp + 32.
 */
static void p_96127(double out96[8],double out127[8],const double in[8])
{
  double *const pow6 = tmp;
  double *const pow24 = tmp + 8;
  double *const a = tmp + 16;
  double *const b = tmp + 24;
  double *const c = tmp + 32;

  p_f2(a,in); /* 2 */
  p_fg(b,a,in); /* 2^2-1 */
  p_f2(a,b); /* 2^3-2 */
  p_fg(b,a,in); /* 2^3-1 */
  p_f2(a,b); /* 2^4-2^1 */
  p_f2(c,a); /* 2^5-2^2 */
  p_f2(a,c); /* 2^6-2^3 */
  p_fg(pow6,a,b); /* 2^6-1 */

  p_f2(a,pow6); /* 2^7-2 */
  p_f2(b,a); /* 2^8-2^2 */
  p_96127_sqpairs(b,a,2); /* 2^12-2^6 */
  p_fg(a,b,pow6); /* 2^12-1 */

  p_f2(b,a); /* 2^13-2 */
  p_f2(c,b); /* 2^14-2^2 */
  p_96127_sqpairs(c,b,5); /* 2^24-2^12 */
  p_fg(pow24,a,c); /* 2^24-1 */

  p_f2(a,pow24); /* 2^25-2 */
  p_f2(c,a); /* 2^26-2^2 */
  p_96127_sqpairs(c,a,11); /* 2^48-2^24 */
  p_fg(a,pow24,c); /* 2^48-1 */

  p_f2(b,a);
  p_f2(c,b);
  p_96127_sqpairs(c,b,23); /* 2^96-2^48 */
  p_fg(out96,a,c); /* 2^96-1 */

  p_f2(a,out96);
  p_f2(b,a);
  p_96127_sqpairs(b,a,11); /* 2^120-2^24 */
  p_fg(a,pow24,b); /* 2^120-1 */

  p_96127_sqpairs(a,b,3); /* 2^126-2^6 */
  p_fg(b,pow6,a); /* 2^126-1 */
  p_f2(a,b); /* 2^127-2 */
  p_fg(out127,a,in); /* 2^127-1 */
}

/*
 * Convert a point given as three 8-element coordinates (x at in, y at
 * in + 8, z at in + 16) into 56 bytes: the encoding of x * z^-2 followed by
 * the encoding of y * z^-3.  The inverse of z is obtained by raising z to
 * the power 2^224 - 2^96 - 1.
 */
static void ecpack(unsigned char out[56],const double in[24])
{
  double *const acc = tmp;
  double *const inv = tmp + 32;
  double *const aux = tmp + 40;
  int rounds;

  p_96127(aux,inv,in + 16); /* aux = z^(2^96-1), inv = z^(2^127-1) */
  p_f2(acc,inv); /* 2^128-2 */
  for (rounds = 48;rounds > 0;--rounds) {
    p_f2(inv,acc);
    p_f2(acc,inv);
  }
  /* acc now holds 2^224-2^97 */
  p_fg(inv,aux,acc); /* 2^224-2^97+2^96-1 = 2^224-2^96-1 */

  /* inv now has z^-1 */
  p_f2(acc,inv); /* z^-2 */
  p_fg(aux,inv,acc); /* z^-3 */
  p_fg(inv,acc,in); /* xz^-2 */
  p_fg(acc,aux,in + 8); /* yz^-3 */

  d2c(out,inv);
  d2c(out + 28,acc);
}

/*
 * Encode the 8-element field value in[] as 28 bytes in out[].
 *
 * Step 1 builds a value q through a sequence of biased roundings and then
 * subtracts q/2^224 times (2^224 - 2^96 + 1) from the input: element 0
 * loses q * 2^-224, element 3 gains q * 2^-128 and element 7 loses q.
 * Step 2 propagates carries from element 0 upward.  Step 3 removes each
 * element's weight (half28[]) and packs every pair of 28-bit values into
 * 7 bytes, low byte first.
 *
 * The "q -= beta[k]; q += alpha[k]; q -= alpha[k];" pattern must stay as
 * three separate operations in this order: it presumably relies on
 * floating-point rounding at the precision set by fpmode() to truncate q
 * to a multiple of the limb weight -- do not simplify it algebraically.
 *
 * Uses tmp[16..23] as working storage; in must not overlap that range.
 */
static void d2c(unsigned char out[28],const double in[8])
{
  double *const x = tmp + 16;
  register double q;
  int i;
  unsigned int lo;
  unsigned int hi;

  /* step 1: quotient estimate */
  q = 0.5;
  q -= twom224 * in[7];
  q -= beta[0]; q += alpha[0]; q -= alpha[0];

  q += in[0];
  q += twom128 * in[5];
  q -= beta[1]; q += alpha[1]; q -= alpha[1];

  q += in[1];
  q += twom128 * in[6];
  q -= beta[2]; q += alpha[2]; q -= alpha[2];

  q += in[2];
  q += twom128 * in[7];
  q -= beta[3]; q += alpha[3]; q -= alpha[3];

  q += in[3];
  q -= beta[4]; q += alpha[4]; q -= alpha[4];

  q += in[4];
  q -= beta[5]; q += alpha[5]; q -= alpha[5];

  q += in[5];
  q -= beta[6]; q += alpha[6]; q -= alpha[6];

  q += in[6];
  q -= beta[7]; q += alpha[7]; q -= alpha[7];

  q += in[7];
  q -= beta[8]; q += alpha[8]; q -= alpha[8];

  /* subtract (q / 2^224) * (2^224 - 2^96 + 1) */
  x[0] = in[0] - q * twom224;
  x[1] = in[1];
  x[2] = in[2];
  x[3] = in[3] + q * twom128;
  x[4] = in[4];
  x[5] = in[5];
  x[6] = in[6];
  x[7] = in[7] - q;

  /* step 2: carry from element i into element i + 1 */
  q = x[0];
  q -= beta[1]; q += alpha[1]; q -= alpha[1];
  x[1] += q;
  x[0] -= q;

  q = x[1];
  q -= beta[2]; q += alpha[2]; q -= alpha[2];
  x[2] += q;
  x[1] -= q;

  q = x[2];
  q -= beta[3]; q += alpha[3]; q -= alpha[3];
  x[3] += q;
  x[2] -= q;

  q = x[3];
  q -= beta[4]; q += alpha[4]; q -= alpha[4];
  x[4] += q;
  x[3] -= q;

  q = x[4];
  q -= beta[5]; q += alpha[5]; q -= alpha[5];
  x[5] += q;
  x[4] -= q;

  q = x[5];
  q -= beta[6]; q += alpha[6]; q -= alpha[6];
  x[6] += q;
  x[5] -= q;

  q = x[6];
  q -= beta[7]; q += alpha[7]; q -= alpha[7];
  x[7] += q;
  x[6] -= q;

  /* step 3: strip the weights and pack */
  for (i = 0;i < 8;++i)
    x[i] *= half28[i];

  for (i = 0;i < 4;++i) {
    /*
     * Convert through int (as before) and then do the bit manipulation on
     * unsigned values: a 28-bit limb shifted left by 4 does not fit in a
     * signed int, and overflowing a signed shift is undefined behaviour.
     * Only bits 0..31 are ever inspected, so the bytes produced are the same.
     */
    lo = (unsigned int) (int) x[i * 2];
    hi = (unsigned int) (int) x[i * 2 + 1];
    out[i * 7] = lo & 255; lo >>= 8;
    out[i * 7 + 1] = lo & 255; lo >>= 8;
    out[i * 7 + 2] = lo & 255; lo >>= 8;
    out[i * 7 + 3] = (lo + (hi << 4)) & 255; hi >>= 4;
    out[i * 7 + 4] = hi & 255; hi >>= 8;
    out[i * 7 + 5] = hi & 255; hi >>= 8;
    out[i * 7 + 6] = hi & 255;
  }
}

/*
 * Decode 28 bytes into an 8-element field value.
 *
 * The input is handled as four groups of 7 bytes.  Each group is read as a
 * little-endian integer (highest-index byte first, multiplying by 256),
 * scaled by the weight of its lower element (two28[0], [2], [4], [6]; the
 * first group needs no scaling), and then split into two elements: y takes
 * the part that survives "y = alpha; y += u; y -= alpha", and u keeps the
 * remainder u - y.
 *
 * NOTE(review): u accumulates up to 56 bits before the split, which only
 * fits if the arithmetic is carried out at more than double precision;
 * this presumably depends on the mode selected by fpmode() and on u/y
 * staying in registers -- confirm before restructuring.
 */
static void c2d(double out[8],const unsigned char in[28])
{
  int i;
  register double u;
  register double y;

  /* bytes 0..6 -> elements 0 and 1 */
  u = 0;
  for (i = 6;i >= 0;--i) {
    u *= 256.0;
    u += (double) (unsigned int) in[i];
  }
  y = alpha[1]; y += u; y -= alpha[1]; u -= y;
  out[0] = u;
  out[1] = y;

  /* bytes 7..13 -> elements 2 and 3 */
  u = 0;
  for (i = 13;i >= 7;--i) {
    u *= 256.0;
    u += (double) (unsigned int) in[i];
  }
  u *= two28[2];
  y = alpha[3]; y += u; y -= alpha[3]; u -= y;
  out[2] = u;
  out[3] = y;

  /* bytes 14..20 -> elements 4 and 5 */
  u = 0;
  for (i = 20;i >= 14;--i) {
    u *= 256.0;
    u += (double) (unsigned int) in[i];
  }
  u *= two28[4];
  y = alpha[5]; y += u; y -= alpha[5]; u -= y;
  out[4] = u;
  out[5] = y;

  /* bytes 21..27 -> elements 6 and 7 */
  u = 0;
  for (i = 27;i >= 21;--i) {
    u *= 256.0;
    u += (double) (unsigned int) in[i];
  }
  u *= two28[6];
  y = alpha[7]; y += u; y -= alpha[7]; u -= y;
  out[6] = u;
  out[7] = y;
}

/*
 * Return 1 when the sum of squares of in[0..7] is zero, 0 otherwise.
 * The squares are accumulated in index order.
 */
static int iszero(const double *in)
{
  double sumsq = 0;
  int k;

  for (k = 0;k < 8;++k)
    sumsq += in[k] * in[k];

  return !sumsq;
}

/*
 * Check a 56-byte point (28 bytes of x followed by 28 bytes of y) against
 * the curve equation.  Returns the result of iszero() applied to
 * x^3 - 3x + c6 - y^2, i.e. nonzero when that value is all-zero.
 * Uses tmp[0..39] as working storage.
 */
int nistp224_valid(unsigned char in[56])
{
  double *const px = tmp;
  double *const py = tmp + 8;
  double *const xpart = tmp + 16;
  double *const ypart = tmp + 24;
  double *const residue = tmp + 32;

  fpmode();

  c2d(px,in);
  c2d(py,in + 28);

  /* xpart = x^2 - 3 */
  p_f2(xpart,px);
  xpart[0] -= constant_3;

  /* ypart = y^2 - c6 */
  p_f2g(ypart,py,c6);

  /* residue = xpart * x - ypart = x^3 - 3x + c6 - y^2 */
  p_fgh(residue,xpart,px,ypart);

  return iszero(residue);
}

/*
 * Multiply the 56-byte point in (x then y, 28 bytes each) by the 28-byte
 * exponent e using ecmul(), and write the 56-byte result to out.
 *
 * Returns 0 without touching out if the input does not satisfy the curve
 * equation.  Returns 0 and zeroes out if the packed result fails
 * nistp224_valid().  Returns 1 otherwise.
 *
 * Buffer layout in tmp2 (40 doubles): the 24-element result xye sits at
 * tmp2[0..23] and the 16-element decoded input xy at tmp2[24..39], so the
 * output of ecmul() never overlaps its input.
 */
int nistp224_56(unsigned char out[56],unsigned char in[56],unsigned char e[28])
{
#define t1 (tmp)
#define t2 (tmp + 8)
#define t3 (tmp + 16)
#define xye (tmp2)
#define xy (tmp2 + 24)
  int i;

  fpmode();

  c2d(xy,in);
  c2d(xy + 8,in + 28);

  /* same curve-equation test as nistp224_valid(), on the decoded values */
  p_f2(t1,xy);
  t1[0] -= constant_3; /* t1 = x^2 - 3 */
  p_f2g(t2,xy + 8,c6); /* t2 = y^2 - c6 */
  p_fgh(t3,t1,xy,t2); /* t3 = x^3 - 3x + c6 - y^2 */

  if (!iszero(t3)) return 0;

  ecmul(xye,xy,e);
  ecpack(out,xye);

  if (!nistp224_valid(out)) {
    for (i = 0;i < 56;++i) out[i] = 0;
    return 0;
  }
  return 1;
#undef t1
#undef t2
#undef t3
#undef xye
#undef xy
}

int nistp224_uncompress(unsigned char inout[56])
{
#define x (tmp2)
#define t1 (tmp2 + 8)
#define t2 (tmp2 + 16)
  int i;

  fpmode();

  c2d(x,inout);
  p_f2(t1,x);
  t1[0] -= 3; /* t1 = x^2 - 3 */
  p_fg(t2,t1,x);
  for (i = 0;i < 8;++i) t2[i] += c6[i]; /* t2 = x^3 - 3x + c6 */
  p_sqrt(t1,t2);
  d2c(inout + 28,t1);

  if (!nistp224_valid(inout)) {
    for (i = 0;i < 56;++i) inout[i] = 0;
    return 0;
  }
  return 1;
#undef x
#undef t1
#undef t2
}

/*
 * x-only interface: take the 28-byte x value in, recover a matching y with
 * p_sqrt(), multiply the point by the 28-byte exponent e using ecmul(),
 * and write the first 28 bytes of the packed result (its x part) to out.
 *
 * out is zeroed first and stays zero on failure.  Returns 0 if the
 * recovered y does not square to x^3 - 3x + c6 or if the packed result
 * fails nistp224_valid(); returns 1 on success.
 *
 * Buffer layout in tmp2: xye occupies tmp2[0..23], xy occupies tmp2[24..39].
 */
int nistp224(unsigned char out[28],unsigned char in[28],unsigned char e[28])
{
  /* unsigned char, to match the parameter types of ecpack() and nistp224_valid() */
  static unsigned char out56[56];
#define xye (tmp2)
#define xy (tmp2 + 24)
  int i;

  fpmode();

  for (i = 0;i < 28;++i) out[i] = 0;

  c2d(xy,in);
  p_f2(xye,xy);
  xye[0] -= 3; /* xye = x^2 - 3 */
  p_fg(xye + 8,xye,xy);
  for (i = 0;i < 8;++i) xye[8 + i] += c6[i]; /* xye + 8 = x^3 - 3x + c6 */
  p_sqrt(xy + 8,xye + 8); /* candidate y */
  p_f2g(xye,xy + 8,xye + 8); /* y^2 - (x^3 - 3x + c6) */

  if (!iszero(xye)) return 0;

  ecmul(xye,xy,e);
  ecpack(out56,xye);

  if (!nistp224_valid(out56)) return 0;

  for (i = 0;i < 28;++i) out[i] = out56[i];
  return 1;
#undef xye
#undef xy
}

/*
 * out = fg - h
 * The body is supplied by opt-ppro-fgh.c; all operands are 8-element field
 * values.  Callers in this file always pass an out that is distinct from
 * f, g and h.
 */
static void p_fgh(double out[8],const double f[8],const double g[8],const double h[8])
{
#include "opt-ppro-fgh.c"
}

/*
 * The body is supplied by opt-ppro-fg.c.
 * NOTE(review): judging by the exponent comments at its call sites this is
 * the field product out = fg -- confirm against the included file.
 */
static void p_fg(double out[8],const double f[8],const double g[8])
{
#include "opt-ppro-fg.c"
}

/*
 * out = f^2 - g
 * The body is supplied by opt-ppro-f2g.c; all operands are 8-element field
 * values.
 */
static void p_f2g(double out[8],const double f[8],const double g[8])
{
#include "opt-ppro-f2g.c"
}

/*
 * The body is supplied by opt-ppro-f2.c.
 * NOTE(review): judging by the exponent comments at its call sites this is
 * the field square out = f^2 -- confirm against the included file.
 */
static void p_f2(double out[8],const double f[8])
{
#include "opt-ppro-f2.c"
}

/*
 * Point doubling.  in and out each hold three 8-element coordinates
 * (x at +0, y at +8, z at +16).  With q = y^2, r = z^2 and b = x * y^2:
 *   a     = 3 (x - r)(x + r)
 *   out.x = a^2 - 8b
 *   out.y = a (4b - out.x) - 8 q^2
 *   out.z = (y + z)^2 - (q + r)
 * Uses tmp[0..55] as working storage; several slots are reused, as noted
 * on the declarations.  in and out must not overlap that range.
 */
static void ecdouble(double out[24],const double in[24])
{
  double *const ysq = tmp; /* q = y^2 */
  double *const zsq = tmp + 8; /* r = z^2 */
  double *const s16 = tmp + 16; /* 3(x - r), later q + r */
  double *const s24 = tmp + 24; /* x + r, later 4b - out.x */
  double *const slope = tmp + 32; /* a */
  double *const xysq = tmp + 40; /* b = x * y^2 */
  double *const s48 = tmp + 48; /* 8b, later y + z */
  int k;

  p_f2(ysq,in + 8);
  p_f2(zsq,in + 16);
  p_fg(xysq,in,ysq);

  for (k = 0;k < 8;++k) {
    s48[k] = constant_8 * xysq[k];
    s24[k] = in[k] + zsq[k];
    s16[k] = (in[k] - zsq[k]) * constant_3;
  }

  p_fg(slope,s16,s24);
  p_f2g(out,slope,s48);

  for (k = 0;k < 8;++k) {
    s16[k] = ysq[k] + zsq[k];
    s24[k] = constant_4 * xysq[k] - out[k];
    s48[k] = in[k + 8] + in[k + 16];
  }

  p_fg8h2(out + 8,slope,s24,ysq);
  p_f2g(out + 16,s48,s16);
}

/*
 * Point addition: out = p1 + p2.
 *
 * p1 and p2 are 40-element points as prepared by ecrs(): x at +0, y at +8,
 * z at +16, z^2 at +24, z^3 at +32.  out receives x, y, z only.
 * With a = x1 z2^2, b = x2 z1^2 - a, c = y1 z2^3, d = y2 z1^3 - c:
 *   out.z = b z1 z2
 *   out.x = d^2 - (2 a b^2 + b^3)
 *   out.y = p_fghi(d, a b^2 - out.x, c, b^3)
 * Uses tmp[0..39] as working storage; slots are reused in the order shown,
 * so the statement order matters.  There is no special handling for equal
 * or opposite inputs.
 */
static void ecadd(double out[24],double p1[40],double p2[40])
{
  double *const s0 = tmp; /* a, then 2g + f, then g - out.x */
  double *const s8 = tmp + 8; /* b, then c */
  double *const s16 = tmp + 16; /* z1 z2, then b^2, then d */
  double *const bcube = tmp + 24; /* f = b^3 */
  double *const absq = tmp + 32; /* g = a b^2 */
  int k;

  p_fg(s0,p1,p2 + 24); /* a */
  p_fgh(s8,p2,p1 + 24,s0); /* b */
  p_fg(s16,p1 + 16,p2 + 16); /* z1 z2 */
  p_fg(out + 16,s8,s16);
  p_f2(s16,s8); /* b^2 */
  p_fg(absq,s0,s16);
  p_fg(bcube,s8,s16);

  for (k = 0;k < 8;++k)
    s0[k] = absq[k] * constant_2 + bcube[k];

  p_fg(s8,p1 + 8,p2 + 32); /* c */
  p_fgh(s16,p2 + 8,p1 + 32,s8); /* d */
  p_f2g(out,s16,s0);

  for (k = 0;k < 8;++k)
    s0[k] = absq[k] - out[k];

  p_fghi(out + 8,s16,s0,s8,bcube);
}

/*
 * Extend a point for use as an ecadd() operand: from the z coordinate at
 * x + 16, store z^2 at x + 24 and z^3 at x + 32.
 */
static void ecrs(double x[40])
{
  double *const z = x + 16;
  double *const zsq = x + 24;
  double *const zcube = x + 32;

  p_f2(zsq,z);
  p_fg(zcube,zsq,z);
}

/*
 * Windowed scalar multiplication.  in holds x (in[0..7]) and y (in[8..15])
 * of the base point p; e is read four bits at a time, high nibble of e[0]
 * first; the result (x, y, z) is written to q.
 *
 * A 16-row table is built in tmp3 so that row 8 + i holds p^i for
 * i in -8 .. 7 except 0 (row 8 itself is never filled; a nibble equal to 8
 * skips the addition).  tmp[56..79] and tmp[80..119] are used as the
 * doubling and accumulation buffers.  in is read only while the table is
 * being built; q is first written after that.
 *
 * NOTE(review): the branch on nibble == 8 and the table row selected both
 * depend on the bytes of e.
 */
static void ecmul(double q[24],double in[16],unsigned char e[28])
{
  double (*const tab)[40] = tmp3;
  double *const dbl = tmp + 56;
  double *const acc = tmp + 80;
  int n;
  int k;
  int nibble;

  /* row 9 = p, with z = 1 */
  for (k = 0;k < 8;++k) {
    tab[9][k] = in[k];
    tab[9][k + 8] = in[k + 8];
    tab[9][k + 16] = one[k];
  }

  /* rows 10..15 = p^2 .. p^7 */
  ecdouble(tab[10],tab[9]);
  ecrs(tab[9]);
  ecrs(tab[10]);
  ecadd(tab[11],tab[10],tab[9]);
  ecdouble(tab[12],tab[10]);
  ecdouble(tab[14],tab[11]);
  ecrs(tab[12]);
  ecadd(tab[13],tab[12],tab[9]);
  ecrs(tab[14]);
  ecadd(tab[15],tab[14],tab[9]);
  ecrs(tab[11]);
  ecrs(tab[13]);
  ecrs(tab[15]);

  /* rows 1..7 = rows 15..9 with y negated */
  for (n = 1;n < 8;++n) {
    const double *src = tab[16 - n];
    double *dst = tab[n];

    for (k = 0;k < 8;++k) {
      dst[k] = src[k];
      dst[k + 8] = -src[k + 8];
      dst[k + 16] = src[k + 16];
      dst[k + 24] = src[k + 24];
      dst[k + 32] = src[k + 32];
    }
  }

  /* row 0 = double of row 4 */
  ecdouble(tab[0],tab[4]);
  ecrs(tab[0]);

  /* tab[8 + i] is p^i, for i in -8 -7 -6 -5 -4 -3 -2 -1 1 2 3 4 5 6 7 */

  /* starting value: the double of row 0 with y negated, plus the first table entry */
  ecdouble(dbl,tab[0]);
  for (k = 0;k < 8;++k) {
    acc[k] = dbl[k];
    acc[k + 8] = -dbl[k + 8];
    acc[k + 16] = dbl[k + 16];
  }
  nibble = (e[0] >> 4) & 15;
  if (nibble == 8) {
    for (n = 0;n < 24;++n) q[n] = acc[n];
  }
  else {
    ecrs(acc);
    ecadd(q,acc,tab[nibble]);
  }

  /* remaining 55 nibbles: four doublings, then one table addition */
  for (n = 1;n < 56;++n) {
    ecdouble(dbl,q);
    ecdouble(acc,dbl);
    ecdouble(dbl,acc);

    nibble = e[n >> 1];
    if (!(n & 1)) nibble >>= 4;
    nibble &= 15;

    if (nibble == 8) {
      ecdouble(q,dbl);
    }
    else {
      ecdouble(acc,dbl);
      ecrs(acc);
      ecadd(q,acc,tab[nibble]);
    }
  }
}

/*
 * The body is supplied by opt-ppro-fghi.c.
 * NOTE(review): by analogy with p_fgh ("out = fg - h") and its single use
 * in ecadd(), this presumably computes out = fg - hi -- confirm against
 * the included file.
 */
static void p_fghi(double out[8],const double f[8],const double g[8],const double h[8],const double i[8])
{
#include "opt-ppro-fghi.c"
}

/*
 * out = fg - 8h^2
 * The body is supplied by opt-ppro-fg8h2.c; all operands are 8-element
 * field values.  Used by ecdouble() for the y coordinate.
 */
static void p_fg8h2(double out[8],const double f[8],const double g[8],const double h[8])
{
#include "opt-ppro-fg8h2.c"
}
