/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
*/

/* AMCL mod p functions */
/* Small Finite Field arithmetic */
/* SU=m, SU is Stack Usage (NOT_SPECIAL Modulus) */

#include "fp_FP256BN.h"

/* Fast Modular Reduction Methods */

/* r=d mod m */
/* d MUST be normalised */
/* Products must be less than pR in all cases !!! */
/* So when multiplying two numbers, their product *must* be less than MODBITS+BASEBITS*NLEN */
/* Results *may* be one bit bigger than MODBITS */

#if MODTYPE_FP256BN == PSEUDO_MERSENNE
/* r=d mod m */

/* Converts from BIG integer to residue form mod Modulus */
void FP_FP256BN_nres(FP_FP256BN *y,BIG_256_56 x)
{
    BIG_256_56_copy(y->g,x);
    y->XES=1;
}

/* Converts from residue form back to BIG integer form */
void FP_FP256BN_redc(BIG_256_56 x,FP_FP256BN *y)
{
    BIG_256_56_copy(x,y->g);
}

/* reduce a DBIG to a BIG exploiting the special form of the modulus */
void FP_FP256BN_mod(BIG_256_56 r,DBIG_256_56 d)
{
    BIG_256_56 t,b;
    chunk v,tw;
    BIG_256_56_split(t,b,d,MODBITS_FP256BN);

    /* Note that all of the excess gets pushed into t. So if squaring a value with a 4-bit excess, this results in
       t getting all 8 bits of the excess product! So products must be less than pR which is Montgomery compatible */

    if (MConst_FP256BN < NEXCESS_256_56)
    {
        BIG_256_56_imul(t,t,MConst_FP256BN);
        BIG_256_56_norm(t);
        BIG_256_56_add(r,t,b);
        BIG_256_56_norm(r);
        tw=r[NLEN_256_56-1];
        r[NLEN_256_56-1]&=TMASK_FP256BN;
        r[0]+=MConst_FP256BN*((tw>>TBITS_FP256BN));
    }
    else
    {
        v=BIG_256_56_pmul(t,t,MConst_FP256BN);
        BIG_256_56_add(r,t,b);
        BIG_256_56_norm(r);
        tw=r[NLEN_256_56-1];
        r[NLEN_256_56-1]&=TMASK_FP256BN;
#if CHUNK == 16
        r[1]+=muladd_256_56(MConst_FP256BN,((tw>>TBITS_FP256BN)+(v<<(BASEBITS_256_56-TBITS_FP256BN))),0,&r[0]);
#else
        r[0]+=MConst_FP256BN*((tw>>TBITS_FP256BN)+(v<<(BASEBITS_256_56-TBITS_FP256BN)));
#endif
    }
    BIG_256_56_norm(r);
}
#endif

/* This only applies to Curve C448, so specialised (for now) */
#if MODTYPE_FP256BN == GENERALISED_MERSENNE

void FP_FP256BN_nres(FP_FP256BN *y,BIG_256_56 x)
{
    BIG_256_56_copy(y->g,x);
    y->XES=1;
}

/* Converts from residue form back to BIG integer form */
void FP_FP256BN_redc(BIG_256_56 x,FP_FP256BN *y)
{
    BIG_256_56_copy(x,y->g);
}

/* reduce a DBIG to a BIG exploiting the special form of the modulus */
void FP_FP256BN_mod(BIG_256_56 r,DBIG_256_56 d)
{
    BIG_256_56 t,b;
    chunk carry;
    BIG_256_56_split(t,b,d,MBITS_FP256BN);

    BIG_256_56_add(r,t,b);

    BIG_256_56_dscopy(d,t);
    BIG_256_56_dshl(d,MBITS_FP256BN/2);

    BIG_256_56_split(t,b,d,MBITS_FP256BN);

    BIG_256_56_add(r,r,t);
    BIG_256_56_add(r,r,b);
    BIG_256_56_norm(r);
    BIG_256_56_shl(t,MBITS_FP256BN/2);

    BIG_256_56_add(r,r,t);

    carry=r[NLEN_256_56-1]>>TBITS_FP256BN;

    r[NLEN_256_56-1]&=TMASK_FP256BN;
    r[0]+=carry;

    r[224/BASEBITS_256_56]+=carry<<(224%BASEBITS_256_56); /* need to check that this falls mid-word */
    BIG_256_56_norm(r);
}

#endif

#if MODTYPE_FP256BN == MONTGOMERY_FRIENDLY

/* convert to Montgomery n-residue form */
void FP_FP256BN_nres(FP_FP256BN *y,BIG_256_56 x)
{
    DBIG_256_56 d;
    BIG_256_56 r;
    BIG_256_56_rcopy(r,R2modp_FP256BN);
    BIG_256_56_mul(d,x,r);
    FP_FP256BN_mod(y->g,d);
    y->XES=2;
}

/* convert back to regular form */
void FP_FP256BN_redc(BIG_256_56 x,FP_FP256BN *y)
{
    DBIG_256_56 d;
    BIG_256_56_dzero(d);
    BIG_256_56_dscopy(d,y->g);
    FP_FP256BN_mod(x,d);
}

/* fast modular reduction from DBIG to BIG exploiting special form of the modulus */
void FP_FP256BN_mod(BIG_256_56 a,DBIG_256_56 d)
{
    int i;

    for (i=0; i<NLEN_256_56; i++)
        d[NLEN_256_56+i]+=muladd_256_56(d[i],MConst_FP256BN-1,d[i],&d[NLEN_256_56+i-1]);

    BIG_256_56_sducopy(a,d);
    BIG_256_56_norm(a);
}

#endif

#if MODTYPE_FP256BN == NOT_SPECIAL

/* convert to Montgomery n-residue form */
void FP_FP256BN_nres(FP_FP256BN *y,BIG_256_56 x)
{
    DBIG_256_56 d;
    BIG_256_56 r;
    BIG_256_56_rcopy(r,R2modp_FP256BN);
    BIG_256_56_mul(d,x,r);
    FP_FP256BN_mod(y->g,d);
    y->XES=2;
}

/* convert back to regular form */
void FP_FP256BN_redc(BIG_256_56 x,FP_FP256BN *y)
{
    DBIG_256_56 d;
    BIG_256_56_dzero(d);
    BIG_256_56_dscopy(d,y->g);
    FP_FP256BN_mod(x,d);
}


/* reduce a DBIG to a BIG using Montgomery's no trial division method */
/* d is expected to be dnormed before entry */
/* SU= 112 */
void FP_FP256BN_mod(BIG_256_56 a,DBIG_256_56 d)
{
    BIG_256_56 mdls;
    BIG_256_56_rcopy(mdls,Modulus_FP256BN);
    BIG_256_56_monty(a,mdls,MConst_FP256BN,d);
}

#endif

/* test x==0 ? */
/* SU= 48 */
int FP_FP256BN_iszilch(FP_FP256BN *x)
{
    BIG_256_56 m;
    BIG_256_56_rcopy(m,Modulus_FP256BN);
    BIG_256_56_mod(x->g,m);
    return BIG_256_56_iszilch(x->g);
}

void FP_FP256BN_copy(FP_FP256BN *y,FP_FP256BN *x)
{
    BIG_256_56_copy(y->g,x->g);
    y->XES=x->XES;
}

void FP_FP256BN_rcopy(FP_FP256BN *y, const BIG_256_56 c)
{
    BIG_256_56 b;
    BIG_256_56_rcopy(b,c);
    FP_FP256BN_nres(y,b);
}

/* Swap a and b if d=1 */
void FP_FP256BN_cswap(FP_FP256BN *a,FP_FP256BN *b,int d)
{
    sign32 t,c=d;
    BIG_256_56_cswap(a->g,b->g,d);

    c=~(c-1);
    t=c&((a->XES)^(b->XES));
    a->XES^=t;
    b->XES^=t;

}

/* Move b to a if d=1 */
void FP_FP256BN_cmove(FP_FP256BN *a,FP_FP256BN *b,int d)
{
    sign32 c=-d;

    BIG_256_56_cmove(a->g,b->g,d);
    a->XES^=(a->XES^b->XES)&c;
}

void FP_FP256BN_zero(FP_FP256BN *x)
{
    BIG_256_56_zero(x->g);
    x->XES=1;
}

int FP_FP256BN_equals(FP_FP256BN *x,FP_FP256BN *y)
{
    FP_FP256BN_reduce(x);
    FP_FP256BN_reduce(y);
    if (BIG_256_56_comp(x->g,y->g)==0) return 1;
    return 0;
}

/* output FP */
/* SU= 48 */
void FP_FP256BN_output(FP_FP256BN *r)
{
    BIG_256_56 c;
    FP_FP256BN_redc(c,r);
    BIG_256_56_output(c);
}

void FP_FP256BN_rawoutput(FP_FP256BN *r)
{
    BIG_256_56_rawoutput(r->g);
}

#ifdef GET_STATS
int tsqr=0,rsqr=0,tmul=0,rmul=0;
int tadd=0,radd=0,tneg=0,rneg=0;
int tdadd=0,rdadd=0,tdneg=0,rdneg=0;
#endif

#ifdef FUSED_MODMUL

/* Insert fastest code here */

#endif

/* r=a*b mod Modulus */
/* product must be less that p.R - and we need to know this in advance! */
/* SU= 88 */
void FP_FP256BN_mul(FP_FP256BN *r,FP_FP256BN *a,FP_FP256BN *b)
{
    DBIG_256_56 d;
//    chunk ea,eb;
//    BIG_256_56_norm(a);
//    BIG_256_56_norm(b);
//    ea=EXCESS_FP256BN(a->g);
//    eb=EXCESS_FP256BN(b->g);


    if ((sign64)a->XES*b->XES>(sign64)FEXCESS_FP256BN)
    {
#ifdef DEBUG_REDUCE
        printf("Product too large - reducing it\n");
#endif
        FP_FP256BN_reduce(a);  /* it is sufficient to fully reduce just one of them < p */
    }

#ifdef FUSED_MODMUL
    FP_FP256BN_modmul(r->g,a->g,b->g);
#else
    BIG_256_56_mul(d,a->g,b->g);
    FP_FP256BN_mod(r->g,d);
#endif
    r->XES=2;
}


/* multiplication by an integer, r=a*c */
/* SU= 136 */
void FP_FP256BN_imul(FP_FP256BN *r,FP_FP256BN *a,int c)
{
    int s=0;

    if (c<0)
    {
        c=-c;
        s=1;
    }

#if MODTYPE_FP256BN==PSEUDO_MERSENNE || MODTYPE_FP256BN==GENERALISED_MERSENNE
    DBIG_256_56 d;
    BIG_256_56_pxmul(d,a->g,c);
    FP_FP256BN_mod(r->g,d);
    r->XES=2;

#else
    //Montgomery
    BIG_256_56 k;
    FP_FP256BN f;
    if (a->XES*c<=FEXCESS_FP256BN)
    {
        BIG_256_56_pmul(r->g,a->g,c);
        r->XES=a->XES*c;    // careful here - XES jumps!
    }
    else
    {
        // don't want to do this - only a problem for Montgomery modulus and larger constants
        BIG_256_56_zero(k);
        BIG_256_56_inc(k,c);
        FP_FP256BN_nres(&f,k);
        FP_FP256BN_mul(r,a,&f);
    }
#endif
    /*
        if (c<=NEXCESS_256_56 && a->XES*c <= FEXCESS_FP256BN)
    	{
            BIG_256_56_imul(r->g,a->g,c);
    		r->XES=a->XES*c;
    		FP_FP256BN_norm(r);
    	}
        else
        {
                BIG_256_56_pxmul(d,a->g,c);

                BIG_256_56_rcopy(m,Modulus_FP256BN);
    			BIG_256_56_dmod(r->g,d,m);
                //FP_FP256BN_mod(r->g,d);                /// BIG problem here! Too slow for PM, How to do fast for Monty?
    			r->XES=2;
        }
    */
    if (s)
    {
        FP_FP256BN_neg(r,r);
        FP_FP256BN_norm(r);
    }
}

/* Set r=a^2 mod m */
/* SU= 88 */
void FP_FP256BN_sqr(FP_FP256BN *r,FP_FP256BN *a)
{
    DBIG_256_56 d;
//    chunk ea;
//    BIG_256_56_norm(a);
//    ea=EXCESS_FP256BN(a->g);


    if ((sign64)a->XES*a->XES>(sign64)FEXCESS_FP256BN)
    {
#ifdef DEBUG_REDUCE
        printf("Product too large - reducing it\n");
#endif
        FP_FP256BN_reduce(a);
    }

    BIG_256_56_sqr(d,a->g);
    FP_FP256BN_mod(r->g,d);
    r->XES=2;
}

/* SU= 16 */
/* Set r=a+b */
void FP_FP256BN_add(FP_FP256BN *r,FP_FP256BN *a,FP_FP256BN *b)
{
    BIG_256_56_add(r->g,a->g,b->g);
    r->XES=a->XES+b->XES;
    if (r->XES>FEXCESS_FP256BN)
    {
#ifdef DEBUG_REDUCE
        printf("Sum too large - reducing it \n");
#endif
        FP_FP256BN_reduce(r);
    }
}

/* Set r=a-b mod m */
/* SU= 56 */
void FP_FP256BN_sub(FP_FP256BN *r,FP_FP256BN *a,FP_FP256BN *b)
{
    FP_FP256BN n;
//	BIG_256_56_norm(b);
    FP_FP256BN_neg(&n,b);
//	BIG_256_56_norm(n);
    FP_FP256BN_add(r,a,&n);
}

/* SU= 48 */
/* Fully reduce a mod Modulus */
void FP_FP256BN_reduce(FP_FP256BN *a)
{
    BIG_256_56 m;
    BIG_256_56_rcopy(m,Modulus_FP256BN);
    BIG_256_56_mod(a->g,m);
    a->XES=1;
}

void FP_FP256BN_norm(FP_FP256BN *x)
{
    BIG_256_56_norm(x->g);
}

// https://graphics.stanford.edu/~seander/bithacks.html
// constant time log to base 2 (or number of bits in)

static int logb2(unsign32 v)
{
    int r;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    v = v - ((v >> 1) & 0x55555555);
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    r = (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
    return r;
}

/* Set r=-a mod Modulus */
/* SU= 64 */
void FP_FP256BN_neg(FP_FP256BN *r,FP_FP256BN *a)
{
    int sb;
    BIG_256_56 m;

    BIG_256_56_rcopy(m,Modulus_FP256BN);

    sb=logb2(a->XES-1);
    BIG_256_56_fshl(m,sb);
    BIG_256_56_sub(r->g,m,a->g);
    r->XES=((sign32)1<<sb);

    if (r->XES>FEXCESS_FP256BN)
    {
#ifdef DEBUG_REDUCE
        printf("Negation too large -  reducing it \n");
#endif
        FP_FP256BN_reduce(r);
    }

}

/* Set r=a/2. */
/* SU= 56 */
void FP_FP256BN_div2(FP_FP256BN *r,FP_FP256BN *a)
{
    BIG_256_56 m;
    BIG_256_56_rcopy(m,Modulus_FP256BN);
    FP_FP256BN_copy(r,a);
//    BIG_256_56_norm(a);
    if (BIG_256_56_parity(a->g)==0)
    {

        BIG_256_56_fshr(r->g,1);
    }
    else
    {
        BIG_256_56_add(r->g,r->g,m);
        BIG_256_56_norm(r->g);
        BIG_256_56_fshr(r->g,1);
    }
}

/* set w=1/x */
void FP_FP256BN_inv(FP_FP256BN *w,FP_FP256BN *x)
{
    BIG_256_56 m;
    BIG_256_56 b;
    BIG_256_56_rcopy(m,Modulus_FP256BN);
    BIG_256_56_copy(w->g,x->g);
    FP_FP256BN_redc(b,w);

    BIG_256_56_invmodp(b,b,m);
    FP_FP256BN_nres(w,b);
}

/* SU=8 */
/* set n=1 */
void FP_FP256BN_one(FP_FP256BN *n)
{
    BIG_256_56 b;
    BIG_256_56_one(b);
    FP_FP256BN_nres(n,b);
}

/* Set r=a^b mod Modulus */
/* SU= 136 */
void FP_FP256BN_pow(FP_FP256BN *r,FP_FP256BN *a,BIG_256_56 b)
{
    BIG_256_56 z,zilch;
    FP_FP256BN w;
    int bt;
    BIG_256_56_zero(zilch);

    BIG_256_56_norm(b);
    BIG_256_56_copy(z,b);
    FP_FP256BN_copy(&w,a);
    FP_FP256BN_one(r);
    while(1)
    {
        bt=BIG_256_56_parity(z);
        BIG_256_56_fshr(z,1);
        if (bt) FP_FP256BN_mul(r,r,&w);
        if (BIG_256_56_comp(z,zilch)==0) break;
        FP_FP256BN_sqr(&w,&w);
    }
    FP_FP256BN_reduce(r);
}

/* is r a QR? */
int FP_FP256BN_qr(FP_FP256BN *r)
{
    int j;
    BIG_256_56 m;
    BIG_256_56 b;
    BIG_256_56_rcopy(m,Modulus_FP256BN);
    FP_FP256BN_redc(b,r);
    j=BIG_256_56_jacobi(b,m);
    FP_FP256BN_nres(r,b);
    if (j==1) return 1;
    return 0;

}

/* Set a=sqrt(b) mod Modulus */
/* SU= 160 */
void FP_FP256BN_sqrt(FP_FP256BN *r,FP_FP256BN *a)
{
    FP_FP256BN v,i;
    BIG_256_56 b;
    BIG_256_56 m;
    BIG_256_56_rcopy(m,Modulus_FP256BN);
    BIG_256_56_mod(a->g,m);
    BIG_256_56_copy(b,m);
    if (MOD8_FP256BN==5)
    {
        BIG_256_56_dec(b,5);
        BIG_256_56_norm(b);
        BIG_256_56_fshr(b,3); /* (p-5)/8 */
        FP_FP256BN_copy(&i,a);
        BIG_256_56_fshl(i.g,1);
        FP_FP256BN_pow(&v,&i,b);
        FP_FP256BN_mul(&i,&i,&v);
        FP_FP256BN_mul(&i,&i,&v);
        BIG_256_56_dec(i.g,1);
        FP_FP256BN_mul(r,a,&v);
        FP_FP256BN_mul(r,r,&i);
        FP_FP256BN_reduce(r);
    }
    if (MOD8_FP256BN==3 || MOD8_FP256BN==7)
    {
        BIG_256_56_inc(b,1);
        BIG_256_56_norm(b);
        BIG_256_56_fshr(b,2); /* (p+1)/4 */
        FP_FP256BN_pow(r,a,b);
    }
}

/*
int main()
{

	BIG_256_56 r;

	FP_FP256BN_one(r);
	FP_FP256BN_sqr(r,r);

	BIG_256_56_output(r);

	int i,carry;
	DBIG_256_56 c={0,0,0,0,0,0,0,0};
	BIG_256_56 a={1,2,3,4};
	BIG_256_56 b={3,4,5,6};
	BIG_256_56 r={11,12,13,14};
	BIG_256_56 s={23,24,25,15};
	BIG_256_56 w;

//	printf("NEXCESS_256_56= %d\n",NEXCESS_256_56);
//	printf("MConst_FP256BN= %d\n",MConst_FP256BN);

	BIG_256_56_copy(b,Modulus_FP256BN);
	BIG_256_56_dec(b,1);
	BIG_256_56_norm(b);

	BIG_256_56_randomnum(r); BIG_256_56_norm(r); BIG_256_56_mod(r,Modulus_FP256BN);
//	BIG_256_56_randomnum(s); norm(s); BIG_256_56_mod(s,Modulus_FP256BN);

//	BIG_256_56_output(r);
//	BIG_256_56_output(s);

	BIG_256_56_output(r);
	FP_FP256BN_nres(r);
	BIG_256_56_output(r);
	BIG_256_56_copy(a,r);
	FP_FP256BN_redc(r);
	BIG_256_56_output(r);
	BIG_256_56_dscopy(c,a);
	FP_FP256BN_mod(r,c);
	BIG_256_56_output(r);


//	exit(0);

//	copy(r,a);
	printf("r=   "); BIG_256_56_output(r);
	BIG_256_56_modsqr(r,r,Modulus_FP256BN);
	printf("r^2= "); BIG_256_56_output(r);

	FP_FP256BN_nres(r);
	FP_FP256BN_sqrt(r,r);
	FP_FP256BN_redc(r);
	printf("r=   "); BIG_256_56_output(r);
	BIG_256_56_modsqr(r,r,Modulus_FP256BN);
	printf("r^2= "); BIG_256_56_output(r);


//	for (i=0;i<100000;i++) FP_FP256BN_sqr(r,r);
//	for (i=0;i<100000;i++)
		FP_FP256BN_sqrt(r,r);

	BIG_256_56_output(r);
}
*/