/*
 * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright (c) 2012, Intel Corporation. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
 * (1) Intel Corporation, Israel Development Center, Haifa, Israel
 * (2) University of Haifa, Israel
 */

#include <openssl/opensslconf.h>
#include "rsaz_exp.h"

/*
 * Everything after the #else is compiled only when RSAZ_ENABLED is defined
 * (presumably by "rsaz_exp.h" for builds that ship the RSAZ assembly
 * modules -- confirm there).  Otherwise this file contributes nothing but
 * NON_EMPTY_TRANSLATION_UNIT.
 */
#ifndef RSAZ_ENABLED
NON_EMPTY_TRANSLATION_UNIT
#else

/*
 * See crypto/bn/asm/rsaz-avx2.pl for further details.
 *
 * Prototypes of the 1024-bit AVX2 primitives used by
 * RSAZ_1024_mod_exp_avx2(); none of them is defined in this file.
 * As used by the caller below:
 *  - norm2red / red2norm convert a value between the caller-visible "norm"
 *    form and the internal "red" form (320-byte buffers in this file);
 *  - mul and sqr take the modulus |n| and the constant |k| and write the
 *    product to |ret|; |cnt| is the number of back-to-back squarings;
 *  - scatter5 / gather5 store / fetch entry |i| (0..31) of the window table.
 */
void rsaz_1024_norm2red_avx2(void *red, const void *norm);
void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b,
                        const void *n, BN_ULONG k);
void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
                        int cnt);
void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
void rsaz_1024_red2norm_avx2(void *norm, const void *red);

/*
 * ALIGN64 requests 64-byte alignment for the constant operands defined
 * right after this block.  GCC-compatible compilers and MSVC get an
 * attribute; Sun C has no such attribute, so ALIGN64 is empty there and the
 * two objects are aligned by name through a pragma (which therefore has to
 * stay in sync with the names `one' and `two80').
 */
#if defined(__GNUC__)
# define ALIGN64        __attribute__((aligned(64)))
#elif defined(_MSC_VER)
# define ALIGN64        __declspec(align(64))
#elif defined(__SUNPRO_C)
# define ALIGN64
# pragma align 64(one,two80)
#else
/* not fatal, might hurt performance a little */
# define ALIGN64
#endif
Packit c4476c
Packit c4476c
ALIGN64 static const BN_ULONG one[40] = {
Packit c4476c
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Packit c4476c
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
Packit c4476c
};
Packit c4476c
Packit c4476c
ALIGN64 static const BN_ULONG two80[40] = {
Packit c4476c
    0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Packit c4476c
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
Packit c4476c
};
Packit c4476c
Packit c4476c
void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
Packit c4476c
                            const BN_ULONG base_norm[16],
Packit c4476c
                            const BN_ULONG exponent[16],
Packit c4476c
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
Packit c4476c
                            BN_ULONG k0)
Packit c4476c
{
Packit c4476c
    unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */
Packit c4476c
    unsigned char *p_str = storage + (64 - ((size_t)storage % 64));
Packit c4476c
    unsigned char *a_inv, *m, *result;
Packit c4476c
    unsigned char *table_s = p_str + 320 * 3;
Packit c4476c
    unsigned char *R2 = table_s; /* borrow */
Packit c4476c
    int index;
Packit c4476c
    int wvalue;
Packit c4476c
Packit c4476c
    if ((((size_t)p_str & 4095) + 320) >> 12) {
Packit c4476c
        result = p_str;
Packit c4476c
        a_inv = p_str + 320;
Packit c4476c
        m = p_str + 320 * 2;    /* should not cross page */
Packit c4476c
    } else {
Packit c4476c
        m = p_str;              /* should not cross page */
Packit c4476c
        result = p_str + 320;
Packit c4476c
        a_inv = p_str + 320 * 2;
Packit c4476c
    }
Packit c4476c
Packit c4476c
    rsaz_1024_norm2red_avx2(m, m_norm);
Packit c4476c
    rsaz_1024_norm2red_avx2(a_inv, base_norm);
Packit c4476c
    rsaz_1024_norm2red_avx2(R2, RR);
Packit c4476c
Packit c4476c
    rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
Packit c4476c
    rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
Packit c4476c
Packit c4476c
    /* table[0] = 1 */
Packit c4476c
    rsaz_1024_mul_avx2(result, R2, one, m, k0);
Packit c4476c
    /* table[1] = a_inv^1 */
Packit c4476c
    rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
Packit c4476c
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, a_inv, 1);
Packit c4476c
Packit c4476c
    /* table[2] = a_inv^2 */
Packit c4476c
    rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 2);
Packit c4476c
#if 0
Packit c4476c
    /* this is almost 2x smaller and less than 1% slower */
Packit c4476c
    for (index = 3; index < 32; index++) {
Packit c4476c
        rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
        rsaz_1024_scatter5_avx2(table_s, result, index);
Packit c4476c
    }
Packit c4476c
#else
Packit c4476c
    /* table[4] = a_inv^4 */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 4);
Packit c4476c
    /* table[8] = a_inv^8 */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 8);
Packit c4476c
    /* table[16] = a_inv^16 */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 16);
Packit c4476c
    /* table[17] = a_inv^17 */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 17);
Packit c4476c
Packit c4476c
    /* table[3] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 2);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 3);
Packit c4476c
    /* table[6] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 6);
Packit c4476c
    /* table[12] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 12);
Packit c4476c
    /* table[24] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 24);
Packit c4476c
    /* table[25] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 25);
Packit c4476c
Packit c4476c
    /* table[5] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 4);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 5);
Packit c4476c
    /* table[10] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 10);
Packit c4476c
    /* table[20] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 20);
Packit c4476c
    /* table[21] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 21);
Packit c4476c
Packit c4476c
    /* table[7] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 6);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 7);
Packit c4476c
    /* table[14] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 14);
Packit c4476c
    /* table[28] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 28);
Packit c4476c
    /* table[29] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 29);
Packit c4476c
Packit c4476c
    /* table[9] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 8);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 9);
Packit c4476c
    /* table[18] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 18);
Packit c4476c
    /* table[19] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 19);
Packit c4476c
Packit c4476c
    /* table[11] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 10);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 11);
Packit c4476c
    /* table[22] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 22);
Packit c4476c
    /* table[23] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 23);
Packit c4476c
Packit c4476c
    /* table[13] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 12);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 13);
Packit c4476c
    /* table[26] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 26);
Packit c4476c
    /* table[27] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 27);
Packit c4476c
Packit c4476c
    /* table[15] */
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, 14);
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 15);
Packit c4476c
    /* table[30] */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 30);
Packit c4476c
    /* table[31] */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    rsaz_1024_scatter5_avx2(table_s, result, 31);
Packit c4476c
#endif
Packit c4476c
Packit c4476c
    /* load first window */
Packit c4476c
    p_str = (unsigned char *)exponent;
Packit c4476c
    wvalue = p_str[127] >> 3;
Packit c4476c
    rsaz_1024_gather5_avx2(result, table_s, wvalue);
Packit c4476c
Packit c4476c
    index = 1014;
Packit c4476c
Packit c4476c
    while (index > -1) {        /* loop for the remaining 127 windows */
Packit c4476c
Packit c4476c
        rsaz_1024_sqr_avx2(result, result, m, k0, 5);
Packit c4476c
Packit c4476c
        wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8];
Packit c4476c
        wvalue = (wvalue >> (index % 8)) & 31;
Packit c4476c
        index -= 5;
Packit c4476c
Packit c4476c
        rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
Packit c4476c
        rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
    }
Packit c4476c
Packit c4476c
    /* square four times */
Packit c4476c
    rsaz_1024_sqr_avx2(result, result, m, k0, 4);
Packit c4476c
Packit c4476c
    wvalue = p_str[0] & 15;
Packit c4476c
Packit c4476c
    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
Packit c4476c
Packit c4476c
    /* from Montgomery */
Packit c4476c
    rsaz_1024_mul_avx2(result, result, one, m, k0);
Packit c4476c
Packit c4476c
    rsaz_1024_red2norm_avx2(result_norm, result);
Packit c4476c
Packit c4476c
    OPENSSL_cleanse(storage, sizeof(storage));
Packit c4476c
}

/*
 * See crypto/bn/asm/rsaz-x86_64.pl for further details.
 *
 * Prototypes of the 512-bit primitives used by RSAZ_512_mod_exp(); none of
 * them is defined in this file.  As used by the caller below:
 *  - mul and sqr take the modulus |n| and the constant |k| and write to
 *    |ret|; |cnt| is the number of back-to-back squarings;
 *  - scatter4 / gather4 store / fetch entry |power| (0..15) of the table;
 *  - mul_scatter4 and mul_gather4 take the table and an entry index in
 *    addition to the multiplication operands (presumably fusing the
 *    multiplication with the store / fetch -- confirm in the assembly);
 *  - mul_by_one is the final step that leaves the Montgomery domain.
 */
void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
                  BN_ULONG k);
void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n,
                           BN_ULONG k, const void *tbl, unsigned int power);
void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
                          const void *n, BN_ULONG k, unsigned int power);
void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,
                  int cnt);
void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
Packit c4476c
Packit c4476c
void RSAZ_512_mod_exp(BN_ULONG result[8],
Packit c4476c
                      const BN_ULONG base[8], const BN_ULONG exponent[8],
Packit c4476c
                      const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
Packit c4476c
{
Packit c4476c
    unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */
Packit c4476c
    unsigned char *table = storage + (64 - ((size_t)storage % 64));
Packit c4476c
    BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8);
Packit c4476c
    BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8);
Packit c4476c
    unsigned char *p_str = (unsigned char *)exponent;
Packit c4476c
    int index;
Packit c4476c
    unsigned int wvalue;
Packit c4476c
Packit c4476c
    /* table[0] = 1_inv */
Packit c4476c
    temp[0] = 0 - m[0];
Packit c4476c
    temp[1] = ~m[1];
Packit c4476c
    temp[2] = ~m[2];
Packit c4476c
    temp[3] = ~m[3];
Packit c4476c
    temp[4] = ~m[4];
Packit c4476c
    temp[5] = ~m[5];
Packit c4476c
    temp[6] = ~m[6];
Packit c4476c
    temp[7] = ~m[7];
Packit c4476c
    rsaz_512_scatter4(table, temp, 0);
Packit c4476c
Packit c4476c
    /* table [1] = a_inv^1 */
Packit c4476c
    rsaz_512_mul(a_inv, base, RR, m, k0);
Packit c4476c
    rsaz_512_scatter4(table, a_inv, 1);
Packit c4476c
Packit c4476c
    /* table [2] = a_inv^2 */
Packit c4476c
    rsaz_512_sqr(temp, a_inv, m, k0, 1);
Packit c4476c
    rsaz_512_scatter4(table, temp, 2);
Packit c4476c
Packit c4476c
    for (index = 3; index < 16; index++)
Packit c4476c
        rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
Packit c4476c
Packit c4476c
    /* load first window */
Packit c4476c
    wvalue = p_str[63];
Packit c4476c
Packit c4476c
    rsaz_512_gather4(temp, table, wvalue >> 4);
Packit c4476c
    rsaz_512_sqr(temp, temp, m, k0, 4);
Packit c4476c
    rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf);
Packit c4476c
Packit c4476c
    for (index = 62; index >= 0; index--) {
Packit c4476c
        wvalue = p_str[index];
Packit c4476c
Packit c4476c
        rsaz_512_sqr(temp, temp, m, k0, 4);
Packit c4476c
        rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4);
Packit c4476c
Packit c4476c
        rsaz_512_sqr(temp, temp, m, k0, 4);
Packit c4476c
        rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f);
Packit c4476c
    }
Packit c4476c
Packit c4476c
    /* from Montgomery */
Packit c4476c
    rsaz_512_mul_by_one(result, temp, m, k0);
Packit c4476c
Packit c4476c
    OPENSSL_cleanse(storage, sizeof(storage));
Packit c4476c
}

#endif /* RSAZ_ENABLED */