Blame crypto/ec/ecp_nistp521.c

Packit c4476c
/*
Packit c4476c
 * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
 *
Packit c4476c
 * Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
 * this file except in compliance with the License.  You can obtain a copy
Packit c4476c
 * in the file LICENSE in the source distribution or at
Packit c4476c
 * https://www.openssl.org/source/license.html
Packit c4476c
 */
Packit c4476c
Packit c4476c
/* Copyright 2011 Google Inc.
Packit c4476c
 *
Packit c4476c
 * Licensed under the Apache License, Version 2.0 (the "License");
Packit c4476c
 *
Packit c4476c
 * you may not use this file except in compliance with the License.
Packit c4476c
 * You may obtain a copy of the License at
Packit c4476c
 *
Packit c4476c
 *     http://www.apache.org/licenses/LICENSE-2.0
Packit c4476c
 *
Packit c4476c
 *  Unless required by applicable law or agreed to in writing, software
Packit c4476c
 *  distributed under the License is distributed on an "AS IS" BASIS,
Packit c4476c
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Packit c4476c
 *  See the License for the specific language governing permissions and
Packit c4476c
 *  limitations under the License.
Packit c4476c
 */
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
Packit c4476c
 *
Packit c4476c
 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
Packit c4476c
 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
Packit c4476c
 * work which got its smarts from Daniel J. Bernstein's work on the same.
Packit c4476c
 */
Packit c4476c
Packit c4476c
#include <openssl/e_os2.h>
Packit c4476c
#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
Packit c4476c
NON_EMPTY_TRANSLATION_UNIT
Packit c4476c
#else
Packit c4476c
Packit c4476c
# include <string.h>
Packit c4476c
# include <openssl/err.h>
Packit c4476c
# include "ec_local.h"
Packit c4476c
Packit c4476c
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
Packit c4476c
  /* even with gcc, the typedef won't work for 32-bit platforms */
Packit c4476c
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
Packit c4476c
                                 * platforms */
Packit c4476c
# else
Packit c4476c
#  error "Your compiler doesn't appear to support 128-bit integer types"
Packit c4476c
# endif
Packit c4476c
Packit c4476c
typedef uint8_t u8;
Packit c4476c
typedef uint64_t u64;
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
Packit c4476c
 * element of this field into 66 bytes where the most significant byte
Packit c4476c
 * contains only a single bit. We call this an felem_bytearray.
Packit c4476c
 */
Packit c4476c
Packit c4476c
typedef u8 felem_bytearray[66];
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
Packit c4476c
 * These values are big-endian.
Packit c4476c
 */
Packit c4476c
static const felem_bytearray nistp521_curve_params[5] = {
Packit c4476c
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff},
Packit c4476c
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Packit c4476c
     0xff, 0xfc},
Packit c4476c
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
Packit c4476c
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
Packit c4476c
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
Packit c4476c
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
Packit c4476c
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
Packit c4476c
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
Packit c4476c
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
Packit c4476c
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
Packit c4476c
     0x3f, 0x00},
Packit c4476c
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
Packit c4476c
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
Packit c4476c
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
Packit c4476c
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
Packit c4476c
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
Packit c4476c
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
Packit c4476c
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
Packit c4476c
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
Packit c4476c
     0xbd, 0x66},
Packit c4476c
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
Packit c4476c
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
Packit c4476c
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
Packit c4476c
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
Packit c4476c
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
Packit c4476c
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
Packit c4476c
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
Packit c4476c
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
Packit c4476c
     0x66, 0x50}
Packit c4476c
};
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * The representation of field elements.
Packit c4476c
 * ------------------------------------
Packit c4476c
 *
Packit c4476c
 * We represent field elements with nine values. These values are either 64 or
Packit c4476c
 * 128 bits and the field element represented is:
Packit c4476c
 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
Packit c4476c
 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
Packit c4476c
 * 58 bits apart, but are greater than 58 bits in length, the most significant
Packit c4476c
 * bits of each limb overlap with the least significant bits of the next.
Packit c4476c
 *
Packit c4476c
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
Packit c4476c
 * 'largefelem' */
Packit c4476c
Packit c4476c
# define NLIMBS 9
Packit c4476c
Packit c4476c
typedef uint64_t limb;
Packit c4476c
typedef limb felem[NLIMBS];
Packit c4476c
typedef uint128_t largefelem[NLIMBS];
Packit c4476c
Packit c4476c
static const limb bottom57bits = 0x1ffffffffffffff;
Packit c4476c
static const limb bottom58bits = 0x3ffffffffffffff;
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * bin66_to_felem takes a little-endian byte array and converts it into felem
Packit c4476c
 * form. This assumes that the CPU is little-endian.
Packit c4476c
 */
Packit c4476c
static void bin66_to_felem(felem out, const u8 in[66])
Packit c4476c
{
Packit c4476c
    out[0] = (*((limb *) & in[0])) & bottom58bits;
Packit c4476c
    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
Packit c4476c
    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
Packit c4476c
    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
Packit c4476c
    out[4] = (*((limb *) & in[29])) & bottom58bits;
Packit c4476c
    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
Packit c4476c
    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
Packit c4476c
    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
Packit c4476c
    out[8] = (*((limb *) & in[58])) & bottom57bits;
Packit c4476c
}
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
Packit c4476c
 * array. This assumes that the CPU is little-endian.
Packit c4476c
 */
Packit c4476c
static void felem_to_bin66(u8 out[66], const felem in)
Packit c4476c
{
Packit c4476c
    memset(out, 0, 66);
Packit c4476c
    (*((limb *) & out[0])) = in[0];
Packit c4476c
    (*((limb *) & out[7])) |= in[1] << 2;
Packit c4476c
    (*((limb *) & out[14])) |= in[2] << 4;
Packit c4476c
    (*((limb *) & out[21])) |= in[3] << 6;
Packit c4476c
    (*((limb *) & out[29])) = in[4];
Packit c4476c
    (*((limb *) & out[36])) |= in[5] << 2;
Packit c4476c
    (*((limb *) & out[43])) |= in[6] << 4;
Packit c4476c
    (*((limb *) & out[50])) |= in[7] << 6;
Packit c4476c
    (*((limb *) & out[58])) = in[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
Packit c4476c
static int BN_to_felem(felem out, const BIGNUM *bn)
Packit c4476c
{
Packit c4476c
    felem_bytearray b_out;
Packit c4476c
    int num_bytes;
Packit c4476c
Packit c4476c
    if (BN_is_negative(bn)) {
Packit c4476c
        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
Packit c4476c
        return 0;
Packit c4476c
    }
Packit c4476c
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
Packit c4476c
    if (num_bytes < 0) {
Packit c4476c
        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
Packit c4476c
        return 0;
Packit c4476c
    }
Packit c4476c
    bin66_to_felem(out, b_out);
Packit c4476c
    return 1;
Packit c4476c
}
Packit c4476c
Packit c4476c
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
Packit c4476c
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
Packit c4476c
{
Packit c4476c
    felem_bytearray b_out;
Packit c4476c
    felem_to_bin66(b_out, in);
Packit c4476c
    return BN_lebin2bn(b_out, sizeof(b_out), out);
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * Field operations
Packit c4476c
 * ----------------
Packit c4476c
 */
Packit c4476c
Packit c4476c
static void felem_one(felem out)
Packit c4476c
{
Packit c4476c
    out[0] = 1;
Packit c4476c
    out[1] = 0;
Packit c4476c
    out[2] = 0;
Packit c4476c
    out[3] = 0;
Packit c4476c
    out[4] = 0;
Packit c4476c
    out[5] = 0;
Packit c4476c
    out[6] = 0;
Packit c4476c
    out[7] = 0;
Packit c4476c
    out[8] = 0;
Packit c4476c
}
Packit c4476c
Packit c4476c
static void felem_assign(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    out[0] = in[0];
Packit c4476c
    out[1] = in[1];
Packit c4476c
    out[2] = in[2];
Packit c4476c
    out[3] = in[3];
Packit c4476c
    out[4] = in[4];
Packit c4476c
    out[5] = in[5];
Packit c4476c
    out[6] = in[6];
Packit c4476c
    out[7] = in[7];
Packit c4476c
    out[8] = in[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/* felem_sum64 sets out = out + in. */
Packit c4476c
static void felem_sum64(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    out[0] += in[0];
Packit c4476c
    out[1] += in[1];
Packit c4476c
    out[2] += in[2];
Packit c4476c
    out[3] += in[3];
Packit c4476c
    out[4] += in[4];
Packit c4476c
    out[5] += in[5];
Packit c4476c
    out[6] += in[6];
Packit c4476c
    out[7] += in[7];
Packit c4476c
    out[8] += in[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/* felem_scalar sets out = in * scalar */
Packit c4476c
static void felem_scalar(felem out, const felem in, limb scalar)
Packit c4476c
{
Packit c4476c
    out[0] = in[0] * scalar;
Packit c4476c
    out[1] = in[1] * scalar;
Packit c4476c
    out[2] = in[2] * scalar;
Packit c4476c
    out[3] = in[3] * scalar;
Packit c4476c
    out[4] = in[4] * scalar;
Packit c4476c
    out[5] = in[5] * scalar;
Packit c4476c
    out[6] = in[6] * scalar;
Packit c4476c
    out[7] = in[7] * scalar;
Packit c4476c
    out[8] = in[8] * scalar;
Packit c4476c
}
Packit c4476c
Packit c4476c
/* felem_scalar64 sets out = out * scalar */
Packit c4476c
static void felem_scalar64(felem out, limb scalar)
Packit c4476c
{
Packit c4476c
    out[0] *= scalar;
Packit c4476c
    out[1] *= scalar;
Packit c4476c
    out[2] *= scalar;
Packit c4476c
    out[3] *= scalar;
Packit c4476c
    out[4] *= scalar;
Packit c4476c
    out[5] *= scalar;
Packit c4476c
    out[6] *= scalar;
Packit c4476c
    out[7] *= scalar;
Packit c4476c
    out[8] *= scalar;
Packit c4476c
}
Packit c4476c
Packit c4476c
/* felem_scalar128 sets out = out * scalar */
Packit c4476c
static void felem_scalar128(largefelem out, limb scalar)
Packit c4476c
{
Packit c4476c
    out[0] *= scalar;
Packit c4476c
    out[1] *= scalar;
Packit c4476c
    out[2] *= scalar;
Packit c4476c
    out[3] *= scalar;
Packit c4476c
    out[4] *= scalar;
Packit c4476c
    out[5] *= scalar;
Packit c4476c
    out[6] *= scalar;
Packit c4476c
    out[7] *= scalar;
Packit c4476c
    out[8] *= scalar;
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_neg sets |out| to |-in|
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^59 + 2^14
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < 2^62
Packit c4476c
 */
Packit c4476c
static void felem_neg(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    /* In order to prevent underflow, we subtract from 0 mod p. */
Packit c4476c
    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
Packit c4476c
    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
Packit c4476c
Packit c4476c
    out[0] = two62m3 - in[0];
Packit c4476c
    out[1] = two62m2 - in[1];
Packit c4476c
    out[2] = two62m2 - in[2];
Packit c4476c
    out[3] = two62m2 - in[3];
Packit c4476c
    out[4] = two62m2 - in[4];
Packit c4476c
    out[5] = two62m2 - in[5];
Packit c4476c
    out[6] = two62m2 - in[6];
Packit c4476c
    out[7] = two62m2 - in[7];
Packit c4476c
    out[8] = two62m2 - in[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_diff64 subtracts |in| from |out|
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^59 + 2^14
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < out[i] + 2^62
Packit c4476c
 */
Packit c4476c
static void felem_diff64(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    /*
Packit c4476c
     * In order to prevent underflow, we add 0 mod p before subtracting.
Packit c4476c
     */
Packit c4476c
    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
Packit c4476c
    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
Packit c4476c
Packit c4476c
    out[0] += two62m3 - in[0];
Packit c4476c
    out[1] += two62m2 - in[1];
Packit c4476c
    out[2] += two62m2 - in[2];
Packit c4476c
    out[3] += two62m2 - in[3];
Packit c4476c
    out[4] += two62m2 - in[4];
Packit c4476c
    out[5] += two62m2 - in[5];
Packit c4476c
    out[6] += two62m2 - in[6];
Packit c4476c
    out[7] += two62m2 - in[7];
Packit c4476c
    out[8] += two62m2 - in[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_diff_128_64 subtracts |in| from |out|
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^62 + 2^17
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < out[i] + 2^63
Packit c4476c
 */
Packit c4476c
static void felem_diff_128_64(largefelem out, const felem in)
Packit c4476c
{
Packit c4476c
    /*
Packit c4476c
     * In order to prevent underflow, we add 64p mod p (which is equivalent
Packit c4476c
     * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
Packit c4476c
     * digit number with all bits set to 1. See "The representation of field
Packit c4476c
     * elements" comment above for a description of how limbs are used to
Packit c4476c
     * represent a number. 64p is represented with 8 limbs containing a number
Packit c4476c
     * with 58 bits set and one limb with a number with 57 bits set.
Packit c4476c
     */
Packit c4476c
    static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
Packit c4476c
    static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
Packit c4476c
Packit c4476c
    out[0] += two63m6 - in[0];
Packit c4476c
    out[1] += two63m5 - in[1];
Packit c4476c
    out[2] += two63m5 - in[2];
Packit c4476c
    out[3] += two63m5 - in[3];
Packit c4476c
    out[4] += two63m5 - in[4];
Packit c4476c
    out[5] += two63m5 - in[5];
Packit c4476c
    out[6] += two63m5 - in[6];
Packit c4476c
    out[7] += two63m5 - in[7];
Packit c4476c
    out[8] += two63m5 - in[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_diff_128_64 subtracts |in| from |out|
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^126
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < out[i] + 2^127 - 2^69
Packit c4476c
 */
Packit c4476c
static void felem_diff128(largefelem out, const largefelem in)
Packit c4476c
{
Packit c4476c
    /*
Packit c4476c
     * In order to prevent underflow, we add 0 mod p before subtracting.
Packit c4476c
     */
Packit c4476c
    static const uint128_t two127m70 =
Packit c4476c
        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
Packit c4476c
    static const uint128_t two127m69 =
Packit c4476c
        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
Packit c4476c
Packit c4476c
    out[0] += (two127m70 - in[0]);
Packit c4476c
    out[1] += (two127m69 - in[1]);
Packit c4476c
    out[2] += (two127m69 - in[2]);
Packit c4476c
    out[3] += (two127m69 - in[3]);
Packit c4476c
    out[4] += (two127m69 - in[4]);
Packit c4476c
    out[5] += (two127m69 - in[5]);
Packit c4476c
    out[6] += (two127m69 - in[6]);
Packit c4476c
    out[7] += (two127m69 - in[7]);
Packit c4476c
    out[8] += (two127m69 - in[8]);
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_square sets |out| = |in|^2
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^62
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < 17 * max(in[i]) * max(in[i])
Packit c4476c
 */
Packit c4476c
static void felem_square(largefelem out, const felem in)
Packit c4476c
{
Packit c4476c
    felem inx2, inx4;
Packit c4476c
    felem_scalar(inx2, in, 2);
Packit c4476c
    felem_scalar(inx4, in, 4);
Packit c4476c
Packit c4476c
    /*-
Packit c4476c
     * We have many cases were we want to do
Packit c4476c
     *   in[x] * in[y] +
Packit c4476c
     *   in[y] * in[x]
Packit c4476c
     * This is obviously just
Packit c4476c
     *   2 * in[x] * in[y]
Packit c4476c
     * However, rather than do the doubling on the 128 bit result, we
Packit c4476c
     * double one of the inputs to the multiplication by reading from
Packit c4476c
     * |inx2|
Packit c4476c
     */
Packit c4476c
Packit c4476c
    out[0] = ((uint128_t) in[0]) * in[0];
Packit c4476c
    out[1] = ((uint128_t) in[0]) * inx2[1];
Packit c4476c
    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
Packit c4476c
    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
Packit c4476c
    out[4] = ((uint128_t) in[0]) * inx2[4] +
Packit c4476c
             ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
Packit c4476c
    out[5] = ((uint128_t) in[0]) * inx2[5] +
Packit c4476c
             ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
Packit c4476c
    out[6] = ((uint128_t) in[0]) * inx2[6] +
Packit c4476c
             ((uint128_t) in[1]) * inx2[5] +
Packit c4476c
             ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
Packit c4476c
    out[7] = ((uint128_t) in[0]) * inx2[7] +
Packit c4476c
             ((uint128_t) in[1]) * inx2[6] +
Packit c4476c
             ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
Packit c4476c
    out[8] = ((uint128_t) in[0]) * inx2[8] +
Packit c4476c
             ((uint128_t) in[1]) * inx2[7] +
Packit c4476c
             ((uint128_t) in[2]) * inx2[6] +
Packit c4476c
             ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * The remaining limbs fall above 2^521, with the first falling at 2^522.
Packit c4476c
     * They correspond to locations one bit up from the limbs produced above
Packit c4476c
     * so we would have to multiply by two to align them. Again, rather than
Packit c4476c
     * operate on the 128-bit result, we double one of the inputs to the
Packit c4476c
     * multiplication. If we want to double for both this reason, and the
Packit c4476c
     * reason above, then we end up multiplying by four.
Packit c4476c
     */
Packit c4476c
Packit c4476c
    /* 9 */
Packit c4476c
    out[0] += ((uint128_t) in[1]) * inx4[8] +
Packit c4476c
              ((uint128_t) in[2]) * inx4[7] +
Packit c4476c
              ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
Packit c4476c
Packit c4476c
    /* 10 */
Packit c4476c
    out[1] += ((uint128_t) in[2]) * inx4[8] +
Packit c4476c
              ((uint128_t) in[3]) * inx4[7] +
Packit c4476c
              ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
Packit c4476c
Packit c4476c
    /* 11 */
Packit c4476c
    out[2] += ((uint128_t) in[3]) * inx4[8] +
Packit c4476c
              ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
Packit c4476c
Packit c4476c
    /* 12 */
Packit c4476c
    out[3] += ((uint128_t) in[4]) * inx4[8] +
Packit c4476c
              ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
Packit c4476c
Packit c4476c
    /* 13 */
Packit c4476c
    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
Packit c4476c
Packit c4476c
    /* 14 */
Packit c4476c
    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
Packit c4476c
Packit c4476c
    /* 15 */
Packit c4476c
    out[6] += ((uint128_t) in[7]) * inx4[8];
Packit c4476c
Packit c4476c
    /* 16 */
Packit c4476c
    out[7] += ((uint128_t) in[8]) * inx2[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_mul sets |out| = |in1| * |in2|
Packit c4476c
 * On entry:
Packit c4476c
 *   in1[i] < 2^64
Packit c4476c
 *   in2[i] < 2^63
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
Packit c4476c
 */
Packit c4476c
static void felem_mul(largefelem out, const felem in1, const felem in2)
Packit c4476c
{
Packit c4476c
    felem in2x2;
Packit c4476c
    felem_scalar(in2x2, in2, 2);
Packit c4476c
Packit c4476c
    out[0] = ((uint128_t) in1[0]) * in2[0];
Packit c4476c
Packit c4476c
    out[1] = ((uint128_t) in1[0]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[0];
Packit c4476c
Packit c4476c
    out[2] = ((uint128_t) in1[0]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[0];
Packit c4476c
Packit c4476c
    out[3] = ((uint128_t) in1[0]) * in2[3] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[3]) * in2[0];
Packit c4476c
Packit c4476c
    out[4] = ((uint128_t) in1[0]) * in2[4] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[3] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[3]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[4]) * in2[0];
Packit c4476c
Packit c4476c
    out[5] = ((uint128_t) in1[0]) * in2[5] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[4] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[3] +
Packit c4476c
             ((uint128_t) in1[3]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[4]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[5]) * in2[0];
Packit c4476c
Packit c4476c
    out[6] = ((uint128_t) in1[0]) * in2[6] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[5] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[4] +
Packit c4476c
             ((uint128_t) in1[3]) * in2[3] +
Packit c4476c
             ((uint128_t) in1[4]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[5]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[6]) * in2[0];
Packit c4476c
Packit c4476c
    out[7] = ((uint128_t) in1[0]) * in2[7] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[6] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[5] +
Packit c4476c
             ((uint128_t) in1[3]) * in2[4] +
Packit c4476c
             ((uint128_t) in1[4]) * in2[3] +
Packit c4476c
             ((uint128_t) in1[5]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[6]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[7]) * in2[0];
Packit c4476c
Packit c4476c
    out[8] = ((uint128_t) in1[0]) * in2[8] +
Packit c4476c
             ((uint128_t) in1[1]) * in2[7] +
Packit c4476c
             ((uint128_t) in1[2]) * in2[6] +
Packit c4476c
             ((uint128_t) in1[3]) * in2[5] +
Packit c4476c
             ((uint128_t) in1[4]) * in2[4] +
Packit c4476c
             ((uint128_t) in1[5]) * in2[3] +
Packit c4476c
             ((uint128_t) in1[6]) * in2[2] +
Packit c4476c
             ((uint128_t) in1[7]) * in2[1] +
Packit c4476c
             ((uint128_t) in1[8]) * in2[0];
Packit c4476c
Packit c4476c
    /* See comment in felem_square about the use of in2x2 here */
Packit c4476c
Packit c4476c
    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[2]) * in2x2[7] +
Packit c4476c
              ((uint128_t) in1[3]) * in2x2[6] +
Packit c4476c
              ((uint128_t) in1[4]) * in2x2[5] +
Packit c4476c
              ((uint128_t) in1[5]) * in2x2[4] +
Packit c4476c
              ((uint128_t) in1[6]) * in2x2[3] +
Packit c4476c
              ((uint128_t) in1[7]) * in2x2[2] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[1];
Packit c4476c
Packit c4476c
    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[3]) * in2x2[7] +
Packit c4476c
              ((uint128_t) in1[4]) * in2x2[6] +
Packit c4476c
              ((uint128_t) in1[5]) * in2x2[5] +
Packit c4476c
              ((uint128_t) in1[6]) * in2x2[4] +
Packit c4476c
              ((uint128_t) in1[7]) * in2x2[3] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[2];
Packit c4476c
Packit c4476c
    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[4]) * in2x2[7] +
Packit c4476c
              ((uint128_t) in1[5]) * in2x2[6] +
Packit c4476c
              ((uint128_t) in1[6]) * in2x2[5] +
Packit c4476c
              ((uint128_t) in1[7]) * in2x2[4] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[3];
Packit c4476c
Packit c4476c
    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[5]) * in2x2[7] +
Packit c4476c
              ((uint128_t) in1[6]) * in2x2[6] +
Packit c4476c
              ((uint128_t) in1[7]) * in2x2[5] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[4];
Packit c4476c
Packit c4476c
    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[6]) * in2x2[7] +
Packit c4476c
              ((uint128_t) in1[7]) * in2x2[6] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[5];
Packit c4476c
Packit c4476c
    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[7]) * in2x2[7] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[6];
Packit c4476c
Packit c4476c
    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
Packit c4476c
              ((uint128_t) in1[8]) * in2x2[7];
Packit c4476c
Packit c4476c
    out[7] += ((uint128_t) in1[8]) * in2x2[8];
Packit c4476c
}
Packit c4476c
Packit c4476c
static const limb bottom52bits = 0xfffffffffffff;
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_reduce converts a largefelem to an felem.
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^128
Packit c4476c
 * On exit:
Packit c4476c
 *   out[i] < 2^59 + 2^14
Packit c4476c
 */
Packit c4476c
static void felem_reduce(felem out, const largefelem in)
Packit c4476c
{
Packit c4476c
    u64 overflow1, overflow2;
Packit c4476c
Packit c4476c
    out[0] = ((limb) in[0]) & bottom58bits;
Packit c4476c
    out[1] = ((limb) in[1]) & bottom58bits;
Packit c4476c
    out[2] = ((limb) in[2]) & bottom58bits;
Packit c4476c
    out[3] = ((limb) in[3]) & bottom58bits;
Packit c4476c
    out[4] = ((limb) in[4]) & bottom58bits;
Packit c4476c
    out[5] = ((limb) in[5]) & bottom58bits;
Packit c4476c
    out[6] = ((limb) in[6]) & bottom58bits;
Packit c4476c
    out[7] = ((limb) in[7]) & bottom58bits;
Packit c4476c
    out[8] = ((limb) in[8]) & bottom58bits;
Packit c4476c
Packit c4476c
    /* out[i] < 2^58 */
Packit c4476c
Packit c4476c
    out[1] += ((limb) in[0]) >> 58;
Packit c4476c
    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
Packit c4476c
    /*-
Packit c4476c
     * out[1] < 2^58 + 2^6 + 2^58
Packit c4476c
     *        = 2^59 + 2^6
Packit c4476c
     */
Packit c4476c
    out[2] += ((limb) (in[0] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[2] += ((limb) in[1]) >> 58;
Packit c4476c
    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
Packit c4476c
    out[3] += ((limb) (in[1] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[3] += ((limb) in[2]) >> 58;
Packit c4476c
    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
Packit c4476c
    out[4] += ((limb) (in[2] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[4] += ((limb) in[3]) >> 58;
Packit c4476c
    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
Packit c4476c
    out[5] += ((limb) (in[3] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[5] += ((limb) in[4]) >> 58;
Packit c4476c
    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
Packit c4476c
    out[6] += ((limb) (in[4] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[6] += ((limb) in[5]) >> 58;
Packit c4476c
    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
Packit c4476c
    out[7] += ((limb) (in[5] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[7] += ((limb) in[6]) >> 58;
Packit c4476c
    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
Packit c4476c
    out[8] += ((limb) (in[6] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    out[8] += ((limb) in[7]) >> 58;
Packit c4476c
    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
Packit c4476c
    /*-
Packit c4476c
     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
Packit c4476c
     *            < 2^59 + 2^13
Packit c4476c
     */
Packit c4476c
    overflow1 = ((limb) (in[7] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    overflow1 += ((limb) in[8]) >> 58;
Packit c4476c
    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
Packit c4476c
    overflow2 = ((limb) (in[8] >> 64)) >> 52;
Packit c4476c
Packit c4476c
    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
Packit c4476c
    overflow2 <<= 1;            /* overflow2 < 2^13 */
Packit c4476c
Packit c4476c
    out[0] += overflow1;        /* out[0] < 2^60 */
Packit c4476c
    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
Packit c4476c
Packit c4476c
    out[1] += out[0] >> 58;
Packit c4476c
    out[0] &= bottom58bits;
Packit c4476c
    /*-
Packit c4476c
     * out[0] < 2^58
Packit c4476c
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
Packit c4476c
     *        < 2^59 + 2^14
Packit c4476c
     */
Packit c4476c
}
Packit c4476c
Packit c4476c
static void felem_square_reduce(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    largefelem tmp;
Packit c4476c
    felem_square(tmp, in);
Packit c4476c
    felem_reduce(out, tmp);
Packit c4476c
}
Packit c4476c
Packit c4476c
static void felem_mul_reduce(felem out, const felem in1, const felem in2)
Packit c4476c
{
Packit c4476c
    largefelem tmp;
Packit c4476c
    felem_mul(tmp, in1, in2);
Packit c4476c
    felem_reduce(out, tmp);
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_inv calculates |out| = |in|^{-1}
Packit c4476c
 *
Packit c4476c
 * Based on Fermat's Little Theorem:
Packit c4476c
 *   a^p = a (mod p)
Packit c4476c
 *   a^{p-1} = 1 (mod p)
Packit c4476c
 *   a^{p-2} = a^{-1} (mod p)
Packit c4476c
 */
Packit c4476c
static void felem_inv(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    felem ftmp, ftmp2, ftmp3, ftmp4;
Packit c4476c
    largefelem tmp;
Packit c4476c
    unsigned i;
Packit c4476c
Packit c4476c
    felem_square(tmp, in);
Packit c4476c
    felem_reduce(ftmp, tmp);    /* 2^1 */
Packit c4476c
    felem_mul(tmp, in, ftmp);
Packit c4476c
    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp);
Packit c4476c
    felem_square(tmp, ftmp);
Packit c4476c
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
Packit c4476c
    felem_mul(tmp, in, ftmp);
Packit c4476c
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
Packit c4476c
    felem_square(tmp, ftmp);
Packit c4476c
    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
Packit c4476c
Packit c4476c
    felem_square(tmp, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
Packit c4476c
    felem_square(tmp, ftmp3);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
Packit c4476c
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
    felem_square(tmp, ftmp3);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
Packit c4476c
    felem_square(tmp, ftmp3);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
Packit c4476c
    felem_square(tmp, ftmp3);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
Packit c4476c
    felem_square(tmp, ftmp3);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
Packit c4476c
    felem_assign(ftmp4, ftmp3);
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp);
Packit c4476c
    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
Packit c4476c
    felem_square(tmp, ftmp4);
Packit c4476c
    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
Packit c4476c
    for (i = 0; i < 8; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
Packit c4476c
    for (i = 0; i < 16; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
Packit c4476c
    for (i = 0; i < 32; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
Packit c4476c
    for (i = 0; i < 64; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
Packit c4476c
    for (i = 0; i < 128; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
Packit c4476c
    felem_assign(ftmp2, ftmp3);
Packit c4476c
Packit c4476c
    for (i = 0; i < 256; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp2);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
Packit c4476c
Packit c4476c
    for (i = 0; i < 9; i++) {
Packit c4476c
        felem_square(tmp, ftmp3);
Packit c4476c
        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp4);
Packit c4476c
    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
Packit c4476c
    felem_mul(tmp, ftmp3, in);
Packit c4476c
    felem_reduce(out, tmp);     /* 2^512 - 3 */
Packit c4476c
}
Packit c4476c
Packit c4476c
/* This is 2^521-1, expressed as an felem */
Packit c4476c
static const felem kPrime = {
Packit c4476c
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
Packit c4476c
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
Packit c4476c
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
Packit c4476c
};
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
Packit c4476c
 * otherwise.
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^59 + 2^14
Packit c4476c
 */
Packit c4476c
static limb felem_is_zero(const felem in)
Packit c4476c
{
Packit c4476c
    felem ftmp;
Packit c4476c
    limb is_zero, is_p;
Packit c4476c
    felem_assign(ftmp, in);
Packit c4476c
Packit c4476c
    ftmp[0] += ftmp[8] >> 57;
Packit c4476c
    ftmp[8] &= bottom57bits;
Packit c4476c
    /* ftmp[8] < 2^57 */
Packit c4476c
    ftmp[1] += ftmp[0] >> 58;
Packit c4476c
    ftmp[0] &= bottom58bits;
Packit c4476c
    ftmp[2] += ftmp[1] >> 58;
Packit c4476c
    ftmp[1] &= bottom58bits;
Packit c4476c
    ftmp[3] += ftmp[2] >> 58;
Packit c4476c
    ftmp[2] &= bottom58bits;
Packit c4476c
    ftmp[4] += ftmp[3] >> 58;
Packit c4476c
    ftmp[3] &= bottom58bits;
Packit c4476c
    ftmp[5] += ftmp[4] >> 58;
Packit c4476c
    ftmp[4] &= bottom58bits;
Packit c4476c
    ftmp[6] += ftmp[5] >> 58;
Packit c4476c
    ftmp[5] &= bottom58bits;
Packit c4476c
    ftmp[7] += ftmp[6] >> 58;
Packit c4476c
    ftmp[6] &= bottom58bits;
Packit c4476c
    ftmp[8] += ftmp[7] >> 58;
Packit c4476c
    ftmp[7] &= bottom58bits;
Packit c4476c
    /* ftmp[8] < 2^57 + 4 */
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
Packit c4476c
     * than our bound for ftmp[8]. Therefore we only have to check if the
Packit c4476c
     * zero is zero or 2^521-1.
Packit c4476c
     */
Packit c4476c
Packit c4476c
    is_zero = 0;
Packit c4476c
    is_zero |= ftmp[0];
Packit c4476c
    is_zero |= ftmp[1];
Packit c4476c
    is_zero |= ftmp[2];
Packit c4476c
    is_zero |= ftmp[3];
Packit c4476c
    is_zero |= ftmp[4];
Packit c4476c
    is_zero |= ftmp[5];
Packit c4476c
    is_zero |= ftmp[6];
Packit c4476c
    is_zero |= ftmp[7];
Packit c4476c
    is_zero |= ftmp[8];
Packit c4476c
Packit c4476c
    is_zero--;
Packit c4476c
    /*
Packit c4476c
     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
Packit c4476c
     * can be set is if is_zero was 0 before the decrement.
Packit c4476c
     */
Packit c4476c
    is_zero = 0 - (is_zero >> 63);
Packit c4476c
Packit c4476c
    is_p = ftmp[0] ^ kPrime[0];
Packit c4476c
    is_p |= ftmp[1] ^ kPrime[1];
Packit c4476c
    is_p |= ftmp[2] ^ kPrime[2];
Packit c4476c
    is_p |= ftmp[3] ^ kPrime[3];
Packit c4476c
    is_p |= ftmp[4] ^ kPrime[4];
Packit c4476c
    is_p |= ftmp[5] ^ kPrime[5];
Packit c4476c
    is_p |= ftmp[6] ^ kPrime[6];
Packit c4476c
    is_p |= ftmp[7] ^ kPrime[7];
Packit c4476c
    is_p |= ftmp[8] ^ kPrime[8];
Packit c4476c
Packit c4476c
    is_p--;
Packit c4476c
    is_p = 0 - (is_p >> 63);
Packit c4476c
Packit c4476c
    is_zero |= is_p;
Packit c4476c
    return is_zero;
Packit c4476c
}
Packit c4476c
Packit c4476c
static int felem_is_zero_int(const void *in)
Packit c4476c
{
Packit c4476c
    return (int)(felem_is_zero(in) & ((limb) 1));
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * felem_contract converts |in| to its unique, minimal representation.
Packit c4476c
 * On entry:
Packit c4476c
 *   in[i] < 2^59 + 2^14
Packit c4476c
 */
Packit c4476c
static void felem_contract(felem out, const felem in)
Packit c4476c
{
Packit c4476c
    limb is_p, is_greater, sign;
Packit c4476c
    static const limb two58 = ((limb) 1) << 58;
Packit c4476c
Packit c4476c
    felem_assign(out, in);
Packit c4476c
Packit c4476c
    out[0] += out[8] >> 57;
Packit c4476c
    out[8] &= bottom57bits;
Packit c4476c
    /* out[8] < 2^57 */
Packit c4476c
    out[1] += out[0] >> 58;
Packit c4476c
    out[0] &= bottom58bits;
Packit c4476c
    out[2] += out[1] >> 58;
Packit c4476c
    out[1] &= bottom58bits;
Packit c4476c
    out[3] += out[2] >> 58;
Packit c4476c
    out[2] &= bottom58bits;
Packit c4476c
    out[4] += out[3] >> 58;
Packit c4476c
    out[3] &= bottom58bits;
Packit c4476c
    out[5] += out[4] >> 58;
Packit c4476c
    out[4] &= bottom58bits;
Packit c4476c
    out[6] += out[5] >> 58;
Packit c4476c
    out[5] &= bottom58bits;
Packit c4476c
    out[7] += out[6] >> 58;
Packit c4476c
    out[6] &= bottom58bits;
Packit c4476c
    out[8] += out[7] >> 58;
Packit c4476c
    out[7] &= bottom58bits;
Packit c4476c
    /* out[8] < 2^57 + 4 */
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
Packit c4476c
     * out. See the comments in felem_is_zero regarding why we don't test for
Packit c4476c
     * other multiples of the prime.
Packit c4476c
     */
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
Packit c4476c
     */
Packit c4476c
Packit c4476c
    is_p = out[0] ^ kPrime[0];
Packit c4476c
    is_p |= out[1] ^ kPrime[1];
Packit c4476c
    is_p |= out[2] ^ kPrime[2];
Packit c4476c
    is_p |= out[3] ^ kPrime[3];
Packit c4476c
    is_p |= out[4] ^ kPrime[4];
Packit c4476c
    is_p |= out[5] ^ kPrime[5];
Packit c4476c
    is_p |= out[6] ^ kPrime[6];
Packit c4476c
    is_p |= out[7] ^ kPrime[7];
Packit c4476c
    is_p |= out[8] ^ kPrime[8];
Packit c4476c
Packit c4476c
    is_p--;
Packit c4476c
    is_p &= is_p << 32;
Packit c4476c
    is_p &= is_p << 16;
Packit c4476c
    is_p &= is_p << 8;
Packit c4476c
    is_p &= is_p << 4;
Packit c4476c
    is_p &= is_p << 2;
Packit c4476c
    is_p &= is_p << 1;
Packit c4476c
    is_p = 0 - (is_p >> 63);
Packit c4476c
    is_p = ~is_p;
Packit c4476c
Packit c4476c
    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
Packit c4476c
Packit c4476c
    out[0] &= is_p;
Packit c4476c
    out[1] &= is_p;
Packit c4476c
    out[2] &= is_p;
Packit c4476c
    out[3] &= is_p;
Packit c4476c
    out[4] &= is_p;
Packit c4476c
    out[5] &= is_p;
Packit c4476c
    out[6] &= is_p;
Packit c4476c
    out[7] &= is_p;
Packit c4476c
    out[8] &= is_p;
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
Packit c4476c
     * 57 is greater than zero as (2^521-1) + x >= 2^522
Packit c4476c
     */
Packit c4476c
    is_greater = out[8] >> 57;
Packit c4476c
    is_greater |= is_greater << 32;
Packit c4476c
    is_greater |= is_greater << 16;
Packit c4476c
    is_greater |= is_greater << 8;
Packit c4476c
    is_greater |= is_greater << 4;
Packit c4476c
    is_greater |= is_greater << 2;
Packit c4476c
    is_greater |= is_greater << 1;
Packit c4476c
    is_greater = 0 - (is_greater >> 63);
Packit c4476c
Packit c4476c
    out[0] -= kPrime[0] & is_greater;
Packit c4476c
    out[1] -= kPrime[1] & is_greater;
Packit c4476c
    out[2] -= kPrime[2] & is_greater;
Packit c4476c
    out[3] -= kPrime[3] & is_greater;
Packit c4476c
    out[4] -= kPrime[4] & is_greater;
Packit c4476c
    out[5] -= kPrime[5] & is_greater;
Packit c4476c
    out[6] -= kPrime[6] & is_greater;
Packit c4476c
    out[7] -= kPrime[7] & is_greater;
Packit c4476c
    out[8] -= kPrime[8] & is_greater;
Packit c4476c
Packit c4476c
    /* Eliminate negative coefficients */
Packit c4476c
    sign = -(out[0] >> 63);
Packit c4476c
    out[0] += (two58 & sign);
Packit c4476c
    out[1] -= (1 & sign);
Packit c4476c
    sign = -(out[1] >> 63);
Packit c4476c
    out[1] += (two58 & sign);
Packit c4476c
    out[2] -= (1 & sign);
Packit c4476c
    sign = -(out[2] >> 63);
Packit c4476c
    out[2] += (two58 & sign);
Packit c4476c
    out[3] -= (1 & sign);
Packit c4476c
    sign = -(out[3] >> 63);
Packit c4476c
    out[3] += (two58 & sign);
Packit c4476c
    out[4] -= (1 & sign);
Packit c4476c
    sign = -(out[4] >> 63);
Packit c4476c
    out[4] += (two58 & sign);
Packit c4476c
    out[5] -= (1 & sign);
Packit c4476c
    sign = -(out[0] >> 63);
Packit c4476c
    out[5] += (two58 & sign);
Packit c4476c
    out[6] -= (1 & sign);
Packit c4476c
    sign = -(out[6] >> 63);
Packit c4476c
    out[6] += (two58 & sign);
Packit c4476c
    out[7] -= (1 & sign);
Packit c4476c
    sign = -(out[7] >> 63);
Packit c4476c
    out[7] += (two58 & sign);
Packit c4476c
    out[8] -= (1 & sign);
Packit c4476c
    sign = -(out[5] >> 63);
Packit c4476c
    out[5] += (two58 & sign);
Packit c4476c
    out[6] -= (1 & sign);
Packit c4476c
    sign = -(out[6] >> 63);
Packit c4476c
    out[6] += (two58 & sign);
Packit c4476c
    out[7] -= (1 & sign);
Packit c4476c
    sign = -(out[7] >> 63);
Packit c4476c
    out[7] += (two58 & sign);
Packit c4476c
    out[8] -= (1 & sign);
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * Group operations
Packit c4476c
 * ----------------
Packit c4476c
 *
Packit c4476c
 * Building on top of the field operations we have the operations on the
Packit c4476c
 * elliptic curve group itself. Points on the curve are represented in Jacobian
Packit c4476c
 * coordinates */
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * point_double calculates 2*(x_in, y_in, z_in)
Packit c4476c
 *
Packit c4476c
 * The method is taken from:
Packit c4476c
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
Packit c4476c
 *
Packit c4476c
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
Packit c4476c
 * while x_out == y_in is not (maybe this works, but it's not tested). */
Packit c4476c
static void
Packit c4476c
point_double(felem x_out, felem y_out, felem z_out,
Packit c4476c
             const felem x_in, const felem y_in, const felem z_in)
Packit c4476c
{
Packit c4476c
    largefelem tmp, tmp2;
Packit c4476c
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
Packit c4476c
Packit c4476c
    felem_assign(ftmp, x_in);
Packit c4476c
    felem_assign(ftmp2, x_in);
Packit c4476c
Packit c4476c
    /* delta = z^2 */
Packit c4476c
    felem_square(tmp, z_in);
Packit c4476c
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
Packit c4476c
Packit c4476c
    /* gamma = y^2 */
Packit c4476c
    felem_square(tmp, y_in);
Packit c4476c
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
Packit c4476c
Packit c4476c
    /* beta = x*gamma */
Packit c4476c
    felem_mul(tmp, x_in, gamma);
Packit c4476c
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
Packit c4476c
Packit c4476c
    /* alpha = 3*(x-delta)*(x+delta) */
Packit c4476c
    felem_diff64(ftmp, delta);
Packit c4476c
    /* ftmp[i] < 2^61 */
Packit c4476c
    felem_sum64(ftmp2, delta);
Packit c4476c
    /* ftmp2[i] < 2^60 + 2^15 */
Packit c4476c
    felem_scalar64(ftmp2, 3);
Packit c4476c
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
Packit c4476c
    felem_mul(tmp, ftmp, ftmp2);
Packit c4476c
    /*-
Packit c4476c
     * tmp[i] < 17(3*2^121 + 3*2^76)
Packit c4476c
     *        = 61*2^121 + 61*2^76
Packit c4476c
     *        < 64*2^121 + 64*2^76
Packit c4476c
     *        = 2^127 + 2^82
Packit c4476c
     *        < 2^128
Packit c4476c
     */
Packit c4476c
    felem_reduce(alpha, tmp);
Packit c4476c
Packit c4476c
    /* x' = alpha^2 - 8*beta */
Packit c4476c
    felem_square(tmp, alpha);
Packit c4476c
    /*
Packit c4476c
     * tmp[i] < 17*2^120 < 2^125
Packit c4476c
     */
Packit c4476c
    felem_assign(ftmp, beta);
Packit c4476c
    felem_scalar64(ftmp, 8);
Packit c4476c
    /* ftmp[i] < 2^62 + 2^17 */
Packit c4476c
    felem_diff_128_64(tmp, ftmp);
Packit c4476c
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
Packit c4476c
    felem_reduce(x_out, tmp);
Packit c4476c
Packit c4476c
    /* z' = (y + z)^2 - gamma - delta */
Packit c4476c
    felem_sum64(delta, gamma);
Packit c4476c
    /* delta[i] < 2^60 + 2^15 */
Packit c4476c
    felem_assign(ftmp, y_in);
Packit c4476c
    felem_sum64(ftmp, z_in);
Packit c4476c
    /* ftmp[i] < 2^60 + 2^15 */
Packit c4476c
    felem_square(tmp, ftmp);
Packit c4476c
    /*
Packit c4476c
     * tmp[i] < 17(2^122) < 2^127
Packit c4476c
     */
Packit c4476c
    felem_diff_128_64(tmp, delta);
Packit c4476c
    /* tmp[i] < 2^127 + 2^63 */
Packit c4476c
    felem_reduce(z_out, tmp);
Packit c4476c
Packit c4476c
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
Packit c4476c
    felem_scalar64(beta, 4);
Packit c4476c
    /* beta[i] < 2^61 + 2^16 */
Packit c4476c
    felem_diff64(beta, x_out);
Packit c4476c
    /* beta[i] < 2^61 + 2^60 + 2^16 */
Packit c4476c
    felem_mul(tmp, alpha, beta);
Packit c4476c
    /*-
Packit c4476c
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
Packit c4476c
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
Packit c4476c
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
Packit c4476c
     *        < 2^128
Packit c4476c
     */
Packit c4476c
    felem_square(tmp2, gamma);
Packit c4476c
    /*-
Packit c4476c
     * tmp2[i] < 17*(2^59 + 2^14)^2
Packit c4476c
     *         = 17*(2^118 + 2^74 + 2^28)
Packit c4476c
     */
Packit c4476c
    felem_scalar128(tmp2, 8);
Packit c4476c
    /*-
Packit c4476c
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
Packit c4476c
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
Packit c4476c
     *         < 2^126
Packit c4476c
     */
Packit c4476c
    felem_diff128(tmp, tmp2);
Packit c4476c
    /*-
Packit c4476c
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
Packit c4476c
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
Packit c4476c
     *          2^74 + 2^69 + 2^34 + 2^30
Packit c4476c
     *        < 2^128
Packit c4476c
     */
Packit c4476c
    felem_reduce(y_out, tmp);
Packit c4476c
}
Packit c4476c
Packit c4476c
/* copy_conditional copies in to out iff mask is all ones. */
Packit c4476c
static void copy_conditional(felem out, const felem in, limb mask)
Packit c4476c
{
Packit c4476c
    unsigned i;
Packit c4476c
    for (i = 0; i < NLIMBS; ++i) {
Packit c4476c
        const limb tmp = mask & (in[i] ^ out[i]);
Packit c4476c
        out[i] ^= tmp;
Packit c4476c
    }
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
Packit c4476c
 *
Packit c4476c
 * The method is taken from
Packit c4476c
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
Packit c4476c
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
Packit c4476c
 *
Packit c4476c
 * This function includes a branch for checking whether the two input points
Packit c4476c
 * are equal (while not equal to the point at infinity). See comment below
Packit c4476c
 * on constant-time.
Packit c4476c
 */
Packit c4476c
static void point_add(felem x3, felem y3, felem z3,
Packit c4476c
                      const felem x1, const felem y1, const felem z1,
Packit c4476c
                      const int mixed, const felem x2, const felem y2,
Packit c4476c
                      const felem z2)
Packit c4476c
{
Packit c4476c
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
Packit c4476c
    largefelem tmp, tmp2;
Packit c4476c
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
Packit c4476c
    limb points_equal;
Packit c4476c
Packit c4476c
    z1_is_zero = felem_is_zero(z1);
Packit c4476c
    z2_is_zero = felem_is_zero(z2);
Packit c4476c
Packit c4476c
    /* ftmp = z1z1 = z1**2 */
Packit c4476c
    felem_square(tmp, z1);
Packit c4476c
    felem_reduce(ftmp, tmp);
Packit c4476c
Packit c4476c
    if (!mixed) {
Packit c4476c
        /* ftmp2 = z2z2 = z2**2 */
Packit c4476c
        felem_square(tmp, z2);
Packit c4476c
        felem_reduce(ftmp2, tmp);
Packit c4476c
Packit c4476c
        /* u1 = ftmp3 = x1*z2z2 */
Packit c4476c
        felem_mul(tmp, x1, ftmp2);
Packit c4476c
        felem_reduce(ftmp3, tmp);
Packit c4476c
Packit c4476c
        /* ftmp5 = z1 + z2 */
Packit c4476c
        felem_assign(ftmp5, z1);
Packit c4476c
        felem_sum64(ftmp5, z2);
Packit c4476c
        /* ftmp5[i] < 2^61 */
Packit c4476c
Packit c4476c
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
Packit c4476c
        felem_square(tmp, ftmp5);
Packit c4476c
        /* tmp[i] < 17*2^122 */
Packit c4476c
        felem_diff_128_64(tmp, ftmp);
Packit c4476c
        /* tmp[i] < 17*2^122 + 2^63 */
Packit c4476c
        felem_diff_128_64(tmp, ftmp2);
Packit c4476c
        /* tmp[i] < 17*2^122 + 2^64 */
Packit c4476c
        felem_reduce(ftmp5, tmp);
Packit c4476c
Packit c4476c
        /* ftmp2 = z2 * z2z2 */
Packit c4476c
        felem_mul(tmp, ftmp2, z2);
Packit c4476c
        felem_reduce(ftmp2, tmp);
Packit c4476c
Packit c4476c
        /* s1 = ftmp6 = y1 * z2**3 */
Packit c4476c
        felem_mul(tmp, y1, ftmp2);
Packit c4476c
        felem_reduce(ftmp6, tmp);
Packit c4476c
    } else {
Packit c4476c
        /*
Packit c4476c
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
Packit c4476c
         */
Packit c4476c
Packit c4476c
        /* u1 = ftmp3 = x1*z2z2 */
Packit c4476c
        felem_assign(ftmp3, x1);
Packit c4476c
Packit c4476c
        /* ftmp5 = 2*z1z2 */
Packit c4476c
        felem_scalar(ftmp5, z1, 2);
Packit c4476c
Packit c4476c
        /* s1 = ftmp6 = y1 * z2**3 */
Packit c4476c
        felem_assign(ftmp6, y1);
Packit c4476c
    }
Packit c4476c
Packit c4476c
    /* u2 = x2*z1z1 */
Packit c4476c
    felem_mul(tmp, x2, ftmp);
Packit c4476c
    /* tmp[i] < 17*2^120 */
Packit c4476c
Packit c4476c
    /* h = ftmp4 = u2 - u1 */
Packit c4476c
    felem_diff_128_64(tmp, ftmp3);
Packit c4476c
    /* tmp[i] < 17*2^120 + 2^63 */
Packit c4476c
    felem_reduce(ftmp4, tmp);
Packit c4476c
Packit c4476c
    x_equal = felem_is_zero(ftmp4);
Packit c4476c
Packit c4476c
    /* z_out = ftmp5 * h */
Packit c4476c
    felem_mul(tmp, ftmp5, ftmp4);
Packit c4476c
    felem_reduce(z_out, tmp);
Packit c4476c
Packit c4476c
    /* ftmp = z1 * z1z1 */
Packit c4476c
    felem_mul(tmp, ftmp, z1);
Packit c4476c
    felem_reduce(ftmp, tmp);
Packit c4476c
Packit c4476c
    /* s2 = tmp = y2 * z1**3 */
Packit c4476c
    felem_mul(tmp, y2, ftmp);
Packit c4476c
    /* tmp[i] < 17*2^120 */
Packit c4476c
Packit c4476c
    /* r = ftmp5 = (s2 - s1)*2 */
Packit c4476c
    felem_diff_128_64(tmp, ftmp6);
Packit c4476c
    /* tmp[i] < 17*2^120 + 2^63 */
Packit c4476c
    felem_reduce(ftmp5, tmp);
Packit c4476c
    y_equal = felem_is_zero(ftmp5);
Packit c4476c
    felem_scalar64(ftmp5, 2);
Packit c4476c
    /* ftmp5[i] < 2^61 */
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * The formulae are incorrect if the points are equal, in affine coordinates
Packit c4476c
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
Packit c4476c
     * happens.
Packit c4476c
     *
Packit c4476c
     * We use bitwise operations to avoid potential side-channels introduced by
Packit c4476c
     * the short-circuiting behaviour of boolean operators.
Packit c4476c
     *
Packit c4476c
     * The special case of either point being the point at infinity (z1 and/or
Packit c4476c
     * z2 are zero), is handled separately later on in this function, so we
Packit c4476c
     * avoid jumping to point_double here in those special cases.
Packit c4476c
     *
Packit c4476c
     * Notice the comment below on the implications of this branching for timing
Packit c4476c
     * leaks and why it is considered practically irrelevant.
Packit c4476c
     */
Packit c4476c
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
Packit c4476c
Packit c4476c
    if (points_equal) {
Packit c4476c
        /*
Packit c4476c
         * This is obviously not constant-time but it will almost-never happen
Packit c4476c
         * for ECDH / ECDSA. The case where it can happen is during scalar-mult
Packit c4476c
         * where the intermediate value gets very close to the group order.
Packit c4476c
         * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
Packit c4476c
         * the scalar, it's possible for the intermediate value to be a small
Packit c4476c
         * negative multiple of the base point, and for the final signed digit
Packit c4476c
         * to be the same value. We believe that this only occurs for the scalar
Packit c4476c
         * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
Packit c4476c
         * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
Packit c4476c
         * 71e913863f7, in that case the penultimate intermediate is -9G and
Packit c4476c
         * the final digit is also -9G. Since this only happens for a single
Packit c4476c
         * scalar, the timing leak is irrelevant. (Any attacker who wanted to
Packit c4476c
         * check whether a secret scalar was that exact value, can already do
Packit c4476c
         * so.)
Packit c4476c
         */
Packit c4476c
        point_double(x3, y3, z3, x1, y1, z1);
Packit c4476c
        return;
Packit c4476c
    }
Packit c4476c
Packit c4476c
    /* I = ftmp = (2h)**2 */
Packit c4476c
    felem_assign(ftmp, ftmp4);
Packit c4476c
    felem_scalar64(ftmp, 2);
Packit c4476c
    /* ftmp[i] < 2^61 */
Packit c4476c
    felem_square(tmp, ftmp);
Packit c4476c
    /* tmp[i] < 17*2^122 */
Packit c4476c
    felem_reduce(ftmp, tmp);
Packit c4476c
Packit c4476c
    /* J = ftmp2 = h * I */
Packit c4476c
    felem_mul(tmp, ftmp4, ftmp);
Packit c4476c
    felem_reduce(ftmp2, tmp);
Packit c4476c
Packit c4476c
    /* V = ftmp4 = U1 * I */
Packit c4476c
    felem_mul(tmp, ftmp3, ftmp);
Packit c4476c
    felem_reduce(ftmp4, tmp);
Packit c4476c
Packit c4476c
    /* x_out = r**2 - J - 2V */
Packit c4476c
    felem_square(tmp, ftmp5);
Packit c4476c
    /* tmp[i] < 17*2^122 */
Packit c4476c
    felem_diff_128_64(tmp, ftmp2);
Packit c4476c
    /* tmp[i] < 17*2^122 + 2^63 */
Packit c4476c
    felem_assign(ftmp3, ftmp4);
Packit c4476c
    felem_scalar64(ftmp4, 2);
Packit c4476c
    /* ftmp4[i] < 2^61 */
Packit c4476c
    felem_diff_128_64(tmp, ftmp4);
Packit c4476c
    /* tmp[i] < 17*2^122 + 2^64 */
Packit c4476c
    felem_reduce(x_out, tmp);
Packit c4476c
Packit c4476c
    /* y_out = r(V-x_out) - 2 * s1 * J */
Packit c4476c
    felem_diff64(ftmp3, x_out);
Packit c4476c
    /*
Packit c4476c
     * ftmp3[i] < 2^60 + 2^60 = 2^61
Packit c4476c
     */
Packit c4476c
    felem_mul(tmp, ftmp5, ftmp3);
Packit c4476c
    /* tmp[i] < 17*2^122 */
Packit c4476c
    felem_mul(tmp2, ftmp6, ftmp2);
Packit c4476c
    /* tmp2[i] < 17*2^120 */
Packit c4476c
    felem_scalar128(tmp2, 2);
Packit c4476c
    /* tmp2[i] < 17*2^121 */
Packit c4476c
    felem_diff128(tmp, tmp2);
Packit c4476c
        /*-
Packit c4476c
         * tmp[i] < 2^127 - 2^69 + 17*2^122
Packit c4476c
         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
Packit c4476c
         *        < 2^127
Packit c4476c
         */
Packit c4476c
    felem_reduce(y_out, tmp);
Packit c4476c
Packit c4476c
    copy_conditional(x_out, x2, z1_is_zero);
Packit c4476c
    copy_conditional(x_out, x1, z2_is_zero);
Packit c4476c
    copy_conditional(y_out, y2, z1_is_zero);
Packit c4476c
    copy_conditional(y_out, y1, z2_is_zero);
Packit c4476c
    copy_conditional(z_out, z2, z1_is_zero);
Packit c4476c
    copy_conditional(z_out, z1, z2_is_zero);
Packit c4476c
    felem_assign(x3, x_out);
Packit c4476c
    felem_assign(y3, y_out);
Packit c4476c
    felem_assign(z3, z_out);
Packit c4476c
}
Packit c4476c
Packit c4476c
/*-
Packit c4476c
 * Base point pre computation
Packit c4476c
 * --------------------------
Packit c4476c
 *
Packit c4476c
 * Two different sorts of precomputed tables are used in the following code.
Packit c4476c
 * Each contain various points on the curve, where each point is three field
Packit c4476c
 * elements (x, y, z).
Packit c4476c
 *
Packit c4476c
 * For the base point table, z is usually 1 (0 for the point at infinity).
Packit c4476c
 * This table has 16 elements:
Packit c4476c
 * index | bits    | point
Packit c4476c
 * ------+---------+------------------------------
Packit c4476c
 *     0 | 0 0 0 0 | 0G
Packit c4476c
 *     1 | 0 0 0 1 | 1G
Packit c4476c
 *     2 | 0 0 1 0 | 2^130G
Packit c4476c
 *     3 | 0 0 1 1 | (2^130 + 1)G
Packit c4476c
 *     4 | 0 1 0 0 | 2^260G
Packit c4476c
 *     5 | 0 1 0 1 | (2^260 + 1)G
Packit c4476c
 *     6 | 0 1 1 0 | (2^260 + 2^130)G
Packit c4476c
 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
Packit c4476c
 *     8 | 1 0 0 0 | 2^390G
Packit c4476c
 *     9 | 1 0 0 1 | (2^390 + 1)G
Packit c4476c
 *    10 | 1 0 1 0 | (2^390 + 2^130)G
Packit c4476c
 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
Packit c4476c
 *    12 | 1 1 0 0 | (2^390 + 2^260)G
Packit c4476c
 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
Packit c4476c
 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
Packit c4476c
 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
Packit c4476c
 *
Packit c4476c
 * The reason for this is so that we can clock bits into four different
Packit c4476c
 * locations when doing simple scalar multiplies against the base point.
Packit c4476c
 *
Packit c4476c
 * Tables for other points have table[i] = iG for i in 0 .. 16. */
Packit c4476c
Packit c4476c
/* gmul is the table of precomputed base points */
Packit c4476c
static const felem gmul[16][3] = {
Packit c4476c
{{0, 0, 0, 0, 0, 0, 0, 0, 0},
Packit c4476c
 {0, 0, 0, 0, 0, 0, 0, 0, 0},
Packit c4476c
 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
Packit c4476c
  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
Packit c4476c
  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
Packit c4476c
 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
Packit c4476c
  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
Packit c4476c
  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
Packit c4476c
  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
Packit c4476c
  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
Packit c4476c
 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
Packit c4476c
  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
Packit c4476c
  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
Packit c4476c
  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
Packit c4476c
  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
Packit c4476c
 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
Packit c4476c
  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
Packit c4476c
  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
Packit c4476c
  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
Packit c4476c
  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
Packit c4476c
 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
Packit c4476c
  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
Packit c4476c
  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
Packit c4476c
  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
Packit c4476c
  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
Packit c4476c
 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
Packit c4476c
  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
Packit c4476c
  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
Packit c4476c
  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
Packit c4476c
  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
Packit c4476c
 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
Packit c4476c
  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
Packit c4476c
  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
Packit c4476c
  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
Packit c4476c
  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
Packit c4476c
 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
Packit c4476c
  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
Packit c4476c
  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
Packit c4476c
  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
Packit c4476c
  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
Packit c4476c
 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
Packit c4476c
  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
Packit c4476c
  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
Packit c4476c
  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
Packit c4476c
  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
Packit c4476c
 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
Packit c4476c
  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
Packit c4476c
  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
Packit c4476c
  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
Packit c4476c
  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
Packit c4476c
 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
Packit c4476c
  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
Packit c4476c
  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
Packit c4476c
  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
Packit c4476c
  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
Packit c4476c
 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
Packit c4476c
  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
Packit c4476c
  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
Packit c4476c
  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
Packit c4476c
  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
Packit c4476c
 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
Packit c4476c
  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
Packit c4476c
  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
Packit c4476c
  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
Packit c4476c
  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
Packit c4476c
 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
Packit c4476c
  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
Packit c4476c
  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
Packit c4476c
  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
Packit c4476c
  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
Packit c4476c
 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
Packit c4476c
  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
Packit c4476c
  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
Packit c4476c
{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
Packit c4476c
  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
Packit c4476c
  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
Packit c4476c
 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
Packit c4476c
  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
Packit c4476c
  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
Packit c4476c
 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
Packit c4476c
};
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * select_point selects the |idx|th point from a precomputation table and
Packit c4476c
 * copies it to out.
Packit c4476c
 */
Packit c4476c
 /* pre_comp below is of the size provided in |size| */
Packit c4476c
static void select_point(const limb idx, unsigned int size,
Packit c4476c
                         const felem pre_comp[][3], felem out[3])
Packit c4476c
{
Packit c4476c
    unsigned i, j;
Packit c4476c
    limb *outlimbs = &out[0][0];
Packit c4476c
Packit c4476c
    memset(out, 0, sizeof(*out) * 3);
Packit c4476c
Packit c4476c
    for (i = 0; i < size; i++) {
Packit c4476c
        const limb *inlimbs = &pre_comp[i][0][0];
Packit c4476c
        limb mask = i ^ idx;
Packit c4476c
        mask |= mask >> 4;
Packit c4476c
        mask |= mask >> 2;
Packit c4476c
        mask |= mask >> 1;
Packit c4476c
        mask &= 1;
Packit c4476c
        mask--;
Packit c4476c
        for (j = 0; j < NLIMBS * 3; j++)
Packit c4476c
            outlimbs[j] |= inlimbs[j] & mask;
Packit c4476c
    }
Packit c4476c
}
Packit c4476c
Packit c4476c
/* get_bit returns the |i|th bit in |in| */
Packit c4476c
static char get_bit(const felem_bytearray in, int i)
Packit c4476c
{
Packit c4476c
    if (i < 0)
Packit c4476c
        return 0;
Packit c4476c
    return (in[i >> 3] >> (i & 7)) & 1;
Packit c4476c
}
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * Interleaved point multiplication using precomputed point multiples: The
Packit c4476c
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
Packit c4476c
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
Packit c4476c
 * generator, using certain (large) precomputed multiples in g_pre_comp.
Packit c4476c
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
Packit c4476c
 */
Packit c4476c
static void batch_mul(felem x_out, felem y_out, felem z_out,
Packit c4476c
                      const felem_bytearray scalars[],
Packit c4476c
                      const unsigned num_points, const u8 *g_scalar,
Packit c4476c
                      const int mixed, const felem pre_comp[][17][3],
Packit c4476c
                      const felem g_pre_comp[16][3])
Packit c4476c
{
Packit c4476c
    int i, skip;
Packit c4476c
    unsigned num, gen_mul = (g_scalar != NULL);
Packit c4476c
    felem nq[3], tmp[4];
Packit c4476c
    limb bits;
Packit c4476c
    u8 sign, digit;
Packit c4476c
Packit c4476c
    /* set nq to the point at infinity */
Packit c4476c
    memset(nq, 0, sizeof(nq));
Packit c4476c
Packit c4476c
    /*
Packit c4476c
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
Packit c4476c
     * of the generator (last quarter of rounds) and additions of other
Packit c4476c
     * points multiples (every 5th round).
Packit c4476c
     */
Packit c4476c
    skip = 1;                   /* save two point operations in the first
Packit c4476c
                                 * round */
Packit c4476c
    for (i = (num_points ? 520 : 130); i >= 0; --i) {
Packit c4476c
        /* double */
Packit c4476c
        if (!skip)
Packit c4476c
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
Packit c4476c
Packit c4476c
        /* add multiples of the generator */
Packit c4476c
        if (gen_mul && (i <= 130)) {
Packit c4476c
            bits = get_bit(g_scalar, i + 390) << 3;
Packit c4476c
            if (i < 130) {
Packit c4476c
                bits |= get_bit(g_scalar, i + 260) << 2;
Packit c4476c
                bits |= get_bit(g_scalar, i + 130) << 1;
Packit c4476c
                bits |= get_bit(g_scalar, i);
Packit c4476c
            }
Packit c4476c
            /* select the point to add, in constant time */
Packit c4476c
            select_point(bits, 16, g_pre_comp, tmp);
Packit c4476c
            if (!skip) {
Packit c4476c
                /* The 1 argument below is for "mixed" */
Packit c4476c
                point_add(nq[0], nq[1], nq[2],
Packit c4476c
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
Packit c4476c
            } else {
Packit c4476c
                memcpy(nq, tmp, 3 * sizeof(felem));
Packit c4476c
                skip = 0;
Packit c4476c
            }
Packit c4476c
        }
Packit c4476c
Packit c4476c
        /* do other additions every 5 doublings */
Packit c4476c
        if (num_points && (i % 5 == 0)) {
Packit c4476c
            /* loop over all scalars */
Packit c4476c
            for (num = 0; num < num_points; ++num) {
Packit c4476c
                bits = get_bit(scalars[num], i + 4) << 5;
Packit c4476c
                bits |= get_bit(scalars[num], i + 3) << 4;
Packit c4476c
                bits |= get_bit(scalars[num], i + 2) << 3;
Packit c4476c
                bits |= get_bit(scalars[num], i + 1) << 2;
Packit c4476c
                bits |= get_bit(scalars[num], i) << 1;
Packit c4476c
                bits |= get_bit(scalars[num], i - 1);
Packit c4476c
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
Packit c4476c
Packit c4476c
                /*
Packit c4476c
                 * select the point to add or subtract, in constant time
Packit c4476c
                 */
Packit c4476c
                select_point(digit, 17, pre_comp[num], tmp);
Packit c4476c
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
Packit c4476c
                                            * point */
Packit c4476c
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
Packit c4476c
Packit c4476c
                if (!skip) {
Packit c4476c
                    point_add(nq[0], nq[1], nq[2],
Packit c4476c
                              nq[0], nq[1], nq[2],
Packit c4476c
                              mixed, tmp[0], tmp[1], tmp[2]);
Packit c4476c
                } else {
Packit c4476c
                    memcpy(nq, tmp, 3 * sizeof(felem));
Packit c4476c
                    skip = 0;
Packit c4476c
                }
Packit c4476c
            }
Packit c4476c
        }
Packit c4476c
    }
Packit c4476c
    felem_assign(x_out, nq[0]);
Packit c4476c
    felem_assign(y_out, nq[1]);
Packit c4476c
    felem_assign(z_out, nq[2]);
Packit c4476c
}
Packit c4476c
Packit c4476c
/* Precomputation for the group generator. */
Packit c4476c
struct nistp521_pre_comp_st {
Packit c4476c
    felem g_pre_comp[16][3];
Packit c4476c
    CRYPTO_REF_COUNT references;
Packit c4476c
    CRYPTO_RWLOCK *lock;
Packit c4476c
};
Packit c4476c
Packit c4476c
const EC_METHOD *EC_GFp_nistp521_method(void)
Packit c4476c
{
Packit c4476c
    static const EC_METHOD ret = {
Packit c4476c
        EC_FLAGS_DEFAULT_OCT,
Packit c4476c
        NID_X9_62_prime_field,
Packit c4476c
        ec_GFp_nistp521_group_init,
Packit c4476c
        ec_GFp_simple_group_finish,
Packit c4476c
        ec_GFp_simple_group_clear_finish,
Packit c4476c
        ec_GFp_nist_group_copy,
Packit c4476c
        ec_GFp_nistp521_group_set_curve,
Packit c4476c
        ec_GFp_simple_group_get_curve,
Packit c4476c
        ec_GFp_simple_group_get_degree,
Packit c4476c
        ec_group_simple_order_bits,
Packit c4476c
        ec_GFp_simple_group_check_discriminant,
Packit c4476c
        ec_GFp_simple_point_init,
Packit c4476c
        ec_GFp_simple_point_finish,
Packit c4476c
        ec_GFp_simple_point_clear_finish,
Packit c4476c
        ec_GFp_simple_point_copy,
Packit c4476c
        ec_GFp_simple_point_set_to_infinity,
Packit c4476c
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
Packit c4476c
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
Packit c4476c
        ec_GFp_simple_point_set_affine_coordinates,
Packit c4476c
        ec_GFp_nistp521_point_get_affine_coordinates,
Packit c4476c
        0 /* point_set_compressed_coordinates */ ,
Packit c4476c
        0 /* point2oct */ ,
Packit c4476c
        0 /* oct2point */ ,
Packit c4476c
        ec_GFp_simple_add,
Packit c4476c
        ec_GFp_simple_dbl,
Packit c4476c
        ec_GFp_simple_invert,
Packit c4476c
        ec_GFp_simple_is_at_infinity,
Packit c4476c
        ec_GFp_simple_is_on_curve,
Packit c4476c
        ec_GFp_simple_cmp,
Packit c4476c
        ec_GFp_simple_make_affine,
Packit c4476c
        ec_GFp_simple_points_make_affine,
Packit c4476c
        ec_GFp_nistp521_points_mul,
Packit c4476c
        ec_GFp_nistp521_precompute_mult,
Packit c4476c
        ec_GFp_nistp521_have_precompute_mult,
Packit c4476c
        ec_GFp_nist_field_mul,
Packit c4476c
        ec_GFp_nist_field_sqr,
Packit c4476c
        0 /* field_div */ ,
Packit c4476c
        ec_GFp_simple_field_inv,
Packit c4476c
        0 /* field_encode */ ,
Packit c4476c
        0 /* field_decode */ ,
Packit c4476c
        0,                      /* field_set_to_one */
Packit c4476c
        ec_key_simple_priv2oct,
Packit c4476c
        ec_key_simple_oct2priv,
Packit c4476c
        0, /* set private */
Packit c4476c
        ec_key_simple_generate_key,
Packit c4476c
        ec_key_simple_check_key,
Packit c4476c
        ec_key_simple_generate_public_key,
Packit c4476c
        0, /* keycopy */
Packit c4476c
        0, /* keyfinish */
Packit c4476c
        ecdh_simple_compute_key,
Packit c4476c
        ecdsa_simple_sign_setup,
Packit c4476c
        ecdsa_simple_sign_sig,
Packit c4476c
        ecdsa_simple_verify_sig,
Packit c4476c
        0, /* field_inverse_mod_ord */
Packit c4476c
        0, /* blind_coordinates */
Packit c4476c
        0, /* ladder_pre */
Packit c4476c
        0, /* ladder_step */
Packit c4476c
        0  /* ladder_post */
Packit c4476c
    };
Packit c4476c
Packit c4476c
    return &ret;
Packit c4476c
}
Packit c4476c
Packit c4476c
/******************************************************************************/
Packit c4476c
/*
Packit c4476c
 * FUNCTIONS TO MANAGE PRECOMPUTATION
Packit c4476c
 */
Packit c4476c
Packit c4476c
static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
Packit c4476c
{
Packit c4476c
    NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
Packit c4476c
Packit c4476c
    if (ret == NULL) {
Packit c4476c
        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
Packit c4476c
        return ret;
Packit c4476c
    }
Packit c4476c
Packit c4476c
    ret->references = 1;
Packit c4476c
Packit c4476c
    ret->lock = CRYPTO_THREAD_lock_new();
Packit c4476c
    if (ret->lock == NULL) {
Packit c4476c
        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
Packit c4476c
        OPENSSL_free(ret);
Packit c4476c
        return NULL;
Packit c4476c
    }
Packit c4476c
    return ret;
Packit c4476c
}
Packit c4476c
Packit c4476c
NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
Packit c4476c
{
Packit c4476c
    int i;
Packit c4476c
    if (p != NULL)
Packit c4476c
        CRYPTO_UP_REF(&p->references, &i, p->lock);
Packit c4476c
    return p;
Packit c4476c
}
Packit c4476c
Packit c4476c
void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
Packit c4476c
{
Packit c4476c
    int i;
Packit c4476c
Packit c4476c
    if (p == NULL)
Packit c4476c
        return;
Packit c4476c
Packit c4476c
    CRYPTO_DOWN_REF(&p->references, &i, p->lock);
Packit c4476c
    REF_PRINT_COUNT("EC_nistp521", x);
Packit c4476c
    if (i > 0)
Packit c4476c
        return;
Packit c4476c
    REF_ASSERT_ISNT(i < 0);
Packit c4476c
Packit c4476c
    CRYPTO_THREAD_lock_free(p->lock);
Packit c4476c
    OPENSSL_free(p);
Packit c4476c
}
Packit c4476c
Packit c4476c
/******************************************************************************/
Packit c4476c
/*
Packit c4476c
 * OPENSSL EC_METHOD FUNCTIONS
Packit c4476c
 */
Packit c4476c
Packit c4476c
int ec_GFp_nistp521_group_init(EC_GROUP *group)
Packit c4476c
{
Packit c4476c
    int ret;
Packit c4476c
    ret = ec_GFp_simple_group_init(group);
Packit c4476c
    group->a_is_minus3 = 1;
Packit c4476c
    return ret;
Packit c4476c
}
Packit c4476c
Packit c4476c
int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
Packit c4476c
                                    const BIGNUM *a, const BIGNUM *b,
Packit c4476c
                                    BN_CTX *ctx)
Packit c4476c
{
Packit c4476c
    int ret = 0;
Packit c4476c
    BN_CTX *new_ctx = NULL;
Packit c4476c
    BIGNUM *curve_p, *curve_a, *curve_b;
Packit c4476c
Packit c4476c
    if (ctx == NULL)
Packit c4476c
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
Packit c4476c
            return 0;
Packit c4476c
    BN_CTX_start(ctx);
Packit c4476c
    curve_p = BN_CTX_get(ctx);
Packit c4476c
    curve_a = BN_CTX_get(ctx);
Packit c4476c
    curve_b = BN_CTX_get(ctx);
Packit c4476c
    if (curve_b == NULL)
Packit c4476c
        goto err;
Packit c4476c
    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
Packit c4476c
    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
Packit c4476c
    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
Packit c4476c
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
Packit c4476c
        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
Packit c4476c
              EC_R_WRONG_CURVE_PARAMETERS);
Packit c4476c
        goto err;
Packit c4476c
    }
Packit c4476c
    group->field_mod_func = BN_nist_mod_521;
Packit c4476c
    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
Packit c4476c
 err:
Packit c4476c
    BN_CTX_end(ctx);
Packit c4476c
    BN_CTX_free(new_ctx);
Packit c4476c
    return ret;
Packit c4476c
}
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
Packit c4476c
 * (X/Z^2, Y/Z^3)
Packit c4476c
 */
Packit c4476c
int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
Packit c4476c
                                                 const EC_POINT *point,
Packit c4476c
                                                 BIGNUM *x, BIGNUM *y,
Packit c4476c
                                                 BN_CTX *ctx)
Packit c4476c
{
Packit c4476c
    felem z1, z2, x_in, y_in, x_out, y_out;
Packit c4476c
    largefelem tmp;
Packit c4476c
Packit c4476c
    if (EC_POINT_is_at_infinity(group, point)) {
Packit c4476c
        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
Packit c4476c
              EC_R_POINT_AT_INFINITY);
Packit c4476c
        return 0;
Packit c4476c
    }
Packit c4476c
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
Packit c4476c
        (!BN_to_felem(z1, point->Z)))
Packit c4476c
        return 0;
Packit c4476c
    felem_inv(z2, z1);
Packit c4476c
    felem_square(tmp, z2);
Packit c4476c
    felem_reduce(z1, tmp);
Packit c4476c
    felem_mul(tmp, x_in, z1);
Packit c4476c
    felem_reduce(x_in, tmp);
Packit c4476c
    felem_contract(x_out, x_in);
Packit c4476c
    if (x != NULL) {
Packit c4476c
        if (!felem_to_BN(x, x_out)) {
Packit c4476c
            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
Packit c4476c
                  ERR_R_BN_LIB);
Packit c4476c
            return 0;
Packit c4476c
        }
Packit c4476c
    }
Packit c4476c
    felem_mul(tmp, z1, z2);
Packit c4476c
    felem_reduce(z1, tmp);
Packit c4476c
    felem_mul(tmp, y_in, z1);
Packit c4476c
    felem_reduce(y_in, tmp);
Packit c4476c
    felem_contract(y_out, y_in);
Packit c4476c
    if (y != NULL) {
Packit c4476c
        if (!felem_to_BN(y, y_out)) {
Packit c4476c
            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
Packit c4476c
                  ERR_R_BN_LIB);
Packit c4476c
            return 0;
Packit c4476c
        }
Packit c4476c
    }
Packit c4476c
    return 1;
Packit c4476c
}
Packit c4476c
Packit c4476c
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
Packit c4476c
static void make_points_affine(size_t num, felem points[][3],
Packit c4476c
                               felem tmp_felems[])
Packit c4476c
{
Packit c4476c
    /*
Packit c4476c
     * Runs in constant time, unless an input is the point at infinity (which
Packit c4476c
     * normally shouldn't happen).
Packit c4476c
     */
Packit c4476c
    ec_GFp_nistp_points_make_affine_internal(num,
Packit c4476c
                                             points,
Packit c4476c
                                             sizeof(felem),
Packit c4476c
                                             tmp_felems,
Packit c4476c
                                             (void (*)(void *))felem_one,
Packit c4476c
                                             felem_is_zero_int,
Packit c4476c
                                             (void (*)(void *, const void *))
Packit c4476c
                                             felem_assign,
Packit c4476c
                                             (void (*)(void *, const void *))
Packit c4476c
                                             felem_square_reduce, (void (*)
Packit c4476c
                                                                   (void *,
Packit c4476c
                                                                    const void
Packit c4476c
                                                                    *,
Packit c4476c
                                                                    const void
Packit c4476c
                                                                    *))
Packit c4476c
                                             felem_mul_reduce,
Packit c4476c
                                             (void (*)(void *, const void *))
Packit c4476c
                                             felem_inv,
Packit c4476c
                                             (void (*)(void *, const void *))
Packit c4476c
                                             felem_contract);
Packit c4476c
}
Packit c4476c
Packit c4476c
/*
Packit c4476c
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
Packit c4476c
 * values Result is stored in r (r can equal one of the inputs).
Packit c4476c
 */
Packit c4476c
int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
Packit c4476c
                               const BIGNUM *scalar, size_t num,
Packit c4476c
                               const EC_POINT *points[],
Packit c4476c
                               const BIGNUM *scalars[], BN_CTX *ctx)
Packit c4476c
{
Packit c4476c
    int ret = 0;
Packit c4476c
    int j;
Packit c4476c
    int mixed = 0;
Packit c4476c
    BIGNUM *x, *y, *z, *tmp_scalar;
Packit c4476c
    felem_bytearray g_secret;
Packit c4476c
    felem_bytearray *secrets = NULL;
Packit c4476c
    felem (*pre_comp)[17][3] = NULL;
Packit c4476c
    felem *tmp_felems = NULL;
Packit c4476c
    unsigned i;
Packit c4476c
    int num_bytes;
Packit c4476c
    int have_pre_comp = 0;
Packit c4476c
    size_t num_points = num;
Packit c4476c
    felem x_in, y_in, z_in, x_out, y_out, z_out;
Packit c4476c
    NISTP521_PRE_COMP *pre = NULL;
Packit c4476c
    felem(*g_pre_comp)[3] = NULL;
Packit c4476c
    EC_POINT *generator = NULL;
Packit c4476c
    const EC_POINT *p = NULL;
Packit c4476c
    const BIGNUM *p_scalar = NULL;
Packit c4476c
Packit c4476c
    BN_CTX_start(ctx);
Packit c4476c
    x = BN_CTX_get(ctx);
Packit c4476c
    y = BN_CTX_get(ctx);
Packit c4476c
    z = BN_CTX_get(ctx);
Packit c4476c
    tmp_scalar = BN_CTX_get(ctx);
Packit c4476c
    if (tmp_scalar == NULL)
Packit c4476c
        goto err;
Packit c4476c
Packit c4476c
    if (scalar != NULL) {
Packit c4476c
        pre = group->pre_comp.nistp521;
Packit c4476c
        if (pre)
Packit c4476c
            /* we have precomputation, try to use it */
Packit c4476c
            g_pre_comp = &pre->g_pre_comp[0];
Packit c4476c
        else
Packit c4476c
            /* try to use the standard precomputation */
Packit c4476c
            g_pre_comp = (felem(*)[3]) gmul;
Packit c4476c
        generator = EC_POINT_new(group);
Packit c4476c
        if (generator == NULL)
Packit c4476c
            goto err;
Packit c4476c
        /* get the generator from precomputation */
Packit c4476c
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
Packit c4476c
            !felem_to_BN(y, g_pre_comp[1][1]) ||
Packit c4476c
            !felem_to_BN(z, g_pre_comp[1][2])) {
Packit c4476c
            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
Packit c4476c
            goto err;
Packit c4476c
        }
Packit c4476c
        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
Packit c4476c
                                                      generator, x, y, z,
Packit c4476c
                                                      ctx))
Packit c4476c
            goto err;
Packit c4476c
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
Packit c4476c
            /* precomputation matches generator */
Packit c4476c
            have_pre_comp = 1;
Packit c4476c
        else
Packit c4476c
            /*
Packit c4476c
             * we don't have valid precomputation: treat the generator as a
Packit c4476c
             * random point
Packit c4476c
             */
Packit c4476c
            num_points++;
Packit c4476c
    }
Packit c4476c
Packit c4476c
    if (num_points > 0) {
Packit c4476c
        if (num_points >= 2) {
Packit c4476c
            /*
Packit c4476c
             * unless we precompute multiples for just one point, converting
Packit c4476c
             * those into affine form is time well spent
Packit c4476c
             */
Packit c4476c
            mixed = 1;
Packit c4476c
        }
Packit c4476c
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
Packit c4476c
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
Packit c4476c
        if (mixed)
Packit c4476c
            tmp_felems =
Packit c4476c
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
Packit c4476c
        if ((secrets == NULL) || (pre_comp == NULL)
Packit c4476c
            || (mixed && (tmp_felems == NULL))) {
Packit c4476c
            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
Packit c4476c
            goto err;
Packit c4476c
        }
Packit c4476c
Packit c4476c
        /*
Packit c4476c
         * we treat NULL scalars as 0, and NULL points as points at infinity,
Packit c4476c
         * i.e., they contribute nothing to the linear combination
Packit c4476c
         */
Packit c4476c
        for (i = 0; i < num_points; ++i) {
Packit c4476c
            if (i == num) {
Packit c4476c
                /*
Packit c4476c
                 * we didn't have a valid precomputation, so we pick the
Packit c4476c
                 * generator
Packit c4476c
                 */
Packit c4476c
                p = EC_GROUP_get0_generator(group);
Packit c4476c
                p_scalar = scalar;
Packit c4476c
            } else {
Packit c4476c
                /* the i^th point */
Packit c4476c
                p = points[i];
Packit c4476c
                p_scalar = scalars[i];
Packit c4476c
            }
Packit c4476c
            if ((p_scalar != NULL) && (p != NULL)) {
Packit c4476c
                /* reduce scalar to 0 <= scalar < 2^521 */
Packit c4476c
                if ((BN_num_bits(p_scalar) > 521)
Packit c4476c
                    || (BN_is_negative(p_scalar))) {
Packit c4476c
                    /*
Packit c4476c
                     * this is an unusual input, and we don't guarantee
Packit c4476c
                     * constant-timeness
Packit c4476c
                     */
Packit c4476c
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
Packit c4476c
                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
Packit c4476c
                        goto err;
Packit c4476c
                    }
Packit c4476c
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
Packit c4476c
                                               secrets[i], sizeof(secrets[i]));
Packit c4476c
                } else {
Packit c4476c
                    num_bytes = BN_bn2lebinpad(p_scalar,
Packit c4476c
                                               secrets[i], sizeof(secrets[i]));
Packit c4476c
                }
Packit c4476c
                if (num_bytes < 0) {
Packit c4476c
                    ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
Packit c4476c
                    goto err;
Packit c4476c
                }
Packit c4476c
                /* precompute multiples */
Packit c4476c
                if ((!BN_to_felem(x_out, p->X)) ||
Packit c4476c
                    (!BN_to_felem(y_out, p->Y)) ||
Packit c4476c
                    (!BN_to_felem(z_out, p->Z)))
Packit c4476c
                    goto err;
Packit c4476c
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
Packit c4476c
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
Packit c4476c
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
Packit c4476c
                for (j = 2; j <= 16; ++j) {
Packit c4476c
                    if (j & 1) {
Packit c4476c
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
Packit c4476c
                                  pre_comp[i][j][2], pre_comp[i][1][0],
Packit c4476c
                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
Packit c4476c
                                  pre_comp[i][j - 1][0],
Packit c4476c
                                  pre_comp[i][j - 1][1],
Packit c4476c
                                  pre_comp[i][j - 1][2]);
Packit c4476c
                    } else {
Packit c4476c
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
Packit c4476c
                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
Packit c4476c
                                     pre_comp[i][j / 2][1],
Packit c4476c
                                     pre_comp[i][j / 2][2]);
Packit c4476c
                    }
Packit c4476c
                }
Packit c4476c
            }
Packit c4476c
        }
Packit c4476c
        if (mixed)
Packit c4476c
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
Packit c4476c
    }
Packit c4476c
Packit c4476c
    /* the scalar for the generator */
Packit c4476c
    if ((scalar != NULL) && (have_pre_comp)) {
Packit c4476c
        memset(g_secret, 0, sizeof(g_secret));
Packit c4476c
        /* reduce scalar to 0 <= scalar < 2^521 */
Packit c4476c
        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
Packit c4476c
            /*
Packit c4476c
             * this is an unusual input, and we don't guarantee
Packit c4476c
             * constant-timeness
Packit c4476c
             */
Packit c4476c
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
Packit c4476c
                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
Packit c4476c
                goto err;
Packit c4476c
            }
Packit c4476c
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
Packit c4476c
        } else {
Packit c4476c
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
Packit c4476c
        }
Packit c4476c
        /* do the multiplication with generator precomputation */
Packit c4476c
        batch_mul(x_out, y_out, z_out,
Packit c4476c
                  (const felem_bytearray(*))secrets, num_points,
Packit c4476c
                  g_secret,
Packit c4476c
                  mixed, (const felem(*)[17][3])pre_comp,
Packit c4476c
                  (const felem(*)[3])g_pre_comp);
Packit c4476c
    } else {
Packit c4476c
        /* do the multiplication without generator precomputation */
Packit c4476c
        batch_mul(x_out, y_out, z_out,
Packit c4476c
                  (const felem_bytearray(*))secrets, num_points,
Packit c4476c
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
Packit c4476c
    }
Packit c4476c
    /* reduce the output to its unique minimal representation */
Packit c4476c
    felem_contract(x_in, x_out);
Packit c4476c
    felem_contract(y_in, y_out);
Packit c4476c
    felem_contract(z_in, z_out);
Packit c4476c
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
Packit c4476c
        (!felem_to_BN(z, z_in))) {
Packit c4476c
        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
Packit c4476c
        goto err;
Packit c4476c
    }
Packit c4476c
    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
Packit c4476c
Packit c4476c
 err:
Packit c4476c
    BN_CTX_end(ctx);
Packit c4476c
    EC_POINT_free(generator);
Packit c4476c
    OPENSSL_free(secrets);
Packit c4476c
    OPENSSL_free(pre_comp);
Packit c4476c
    OPENSSL_free(tmp_felems);
Packit c4476c
    return ret;
Packit c4476c
}
Packit c4476c
Packit c4476c
int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
Packit c4476c
{
Packit c4476c
    int ret = 0;
Packit c4476c
    NISTP521_PRE_COMP *pre = NULL;
Packit c4476c
    int i, j;
Packit c4476c
    BN_CTX *new_ctx = NULL;
Packit c4476c
    BIGNUM *x, *y;
Packit c4476c
    EC_POINT *generator = NULL;
Packit c4476c
    felem tmp_felems[16];
Packit c4476c
Packit c4476c
    /* throw away old precomputation */
Packit c4476c
    EC_pre_comp_free(group);
Packit c4476c
    if (ctx == NULL)
Packit c4476c
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
Packit c4476c
            return 0;
Packit c4476c
    BN_CTX_start(ctx);
Packit c4476c
    x = BN_CTX_get(ctx);
Packit c4476c
    y = BN_CTX_get(ctx);
Packit c4476c
    if (y == NULL)
Packit c4476c
        goto err;
Packit c4476c
    /* get the generator */
Packit c4476c
    if (group->generator == NULL)
Packit c4476c
        goto err;
Packit c4476c
    generator = EC_POINT_new(group);
Packit c4476c
    if (generator == NULL)
Packit c4476c
        goto err;
Packit c4476c
    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
Packit c4476c
    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
Packit c4476c
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
Packit c4476c
        goto err;
Packit c4476c
    if ((pre = nistp521_pre_comp_new()) == NULL)
Packit c4476c
        goto err;
Packit c4476c
    /*
Packit c4476c
     * if the generator is the standard one, use built-in precomputation
Packit c4476c
     */
Packit c4476c
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
Packit c4476c
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
Packit c4476c
        goto done;
Packit c4476c
    }
Packit c4476c
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
Packit c4476c
        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
Packit c4476c
        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
Packit c4476c
        goto err;
Packit c4476c
    /* compute 2^130*G, 2^260*G, 2^390*G */
Packit c4476c
    for (i = 1; i <= 4; i <<= 1) {
Packit c4476c
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
Packit c4476c
                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
Packit c4476c
                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
Packit c4476c
        for (j = 0; j < 129; ++j) {
Packit c4476c
            point_double(pre->g_pre_comp[2 * i][0],
Packit c4476c
                         pre->g_pre_comp[2 * i][1],
Packit c4476c
                         pre->g_pre_comp[2 * i][2],
Packit c4476c
                         pre->g_pre_comp[2 * i][0],
Packit c4476c
                         pre->g_pre_comp[2 * i][1],
Packit c4476c
                         pre->g_pre_comp[2 * i][2]);
Packit c4476c
        }
Packit c4476c
    }
Packit c4476c
    /* g_pre_comp[0] is the point at infinity */
Packit c4476c
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
Packit c4476c
    /* the remaining multiples */
Packit c4476c
    /* 2^130*G + 2^260*G */
Packit c4476c
    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
Packit c4476c
              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
Packit c4476c
              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
Packit c4476c
              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
Packit c4476c
              pre->g_pre_comp[2][2]);
Packit c4476c
    /* 2^130*G + 2^390*G */
Packit c4476c
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
Packit c4476c
              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
Packit c4476c
              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
Packit c4476c
              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
Packit c4476c
              pre->g_pre_comp[2][2]);
Packit c4476c
    /* 2^260*G + 2^390*G */
Packit c4476c
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
Packit c4476c
              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
Packit c4476c
              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
Packit c4476c
              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
Packit c4476c
              pre->g_pre_comp[4][2]);
Packit c4476c
    /* 2^130*G + 2^260*G + 2^390*G */
Packit c4476c
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
Packit c4476c
              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
Packit c4476c
              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
Packit c4476c
              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
Packit c4476c
              pre->g_pre_comp[2][2]);
Packit c4476c
    for (i = 1; i < 8; ++i) {
Packit c4476c
        /* odd multiples: add G */
Packit c4476c
        point_add(pre->g_pre_comp[2 * i + 1][0],
Packit c4476c
                  pre->g_pre_comp[2 * i + 1][1],
Packit c4476c
                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
Packit c4476c
                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
Packit c4476c
                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
Packit c4476c
                  pre->g_pre_comp[1][2]);
Packit c4476c
    }
Packit c4476c
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
Packit c4476c
Packit c4476c
 done:
Packit c4476c
    SETPRECOMP(group, nistp521, pre);
Packit c4476c
    ret = 1;
Packit c4476c
    pre = NULL;
Packit c4476c
 err:
Packit c4476c
    BN_CTX_end(ctx);
Packit c4476c
    EC_POINT_free(generator);
Packit c4476c
    BN_CTX_free(new_ctx);
Packit c4476c
    EC_nistp521_pre_comp_free(pre);
Packit c4476c
    return ret;
Packit c4476c
}
Packit c4476c
Packit c4476c
int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
Packit c4476c
{
Packit c4476c
    return HAVEPRECOMP(group, nistp521);
Packit c4476c
}
Packit c4476c
Packit c4476c
#endif