; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;###############################################################################
;
; AES-NI implementations of AES-128/192/256 in ECB, CBC and CTR modes, plus
; the matching key-schedule setup routines.
;
; Syntax : MASM, 32-bit x86 (.MODEL FLAT, C).
; ABI    : all arguments are read from the stack.  eax, ecx, edx and
;          xmm0-xmm7 are used as scratch; edi, esi, ebx and ebp are
;          saved/restored by the routines that use them.
; Needs  : SSE2, SSSE3 (pshufb), SSE4.1 (pinsrd) and AES-NI.
;
; Only whole 16-byte blocks are processed; a trailing partial block in
; inputLen is left untouched by every cipher routine.

.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
; pshufb controls: replicate one byte-rotated dword of the source into all
; four dwords (dword 3 for Lmask/Lmask256, dword 1 for Lmask192).
Lmask       dd 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh
Lmask192    dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256    dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Round constants, replicated per dword.  Lcon1 is doubled with pslld after
; each use; Lcon2 (1bh) restarts the sequence for the last two AES-128 steps.
Lcon1       dd 1, 1, 1, 1
Lcon2       dd 1bh, 1bh, 1bh, 1bh

.CODE

; Byte offsets used to walk the context structures.
CTX_KEYS_OFS    equ 44      ; context pointer -> round key 0
IV_FROM_KEYS    equ -32     ; round key 0 -> CBC chaining block (context + 12)
CTR_CIPHER_CTX  equ 4       ; CTR context -> pointer to the cipher context
CTR_COUNTER     equ 8       ; CTR context -> 16-byte counter block

; Register roles for the cipher routines.
ctx         textequ <ecx>   ; round-key base (context + CTX_KEYS_OFS)
output      textequ <edx>   ; write cursor
input       textequ <eax>   ; read cursor
inputLen    textequ <edi>   ; bytes remaining (callee-saved, pushed on entry)

;-------------------------------------------------------------------------------
; Seven-lane round helpers: apply round key <i> to xmm0..xmm6.
; Clobber xmm7.
;-------------------------------------------------------------------------------
aes_rnd MACRO i
        movdqu      xmm7, [i*16 + ctx]
        aesenc      xmm0, xmm7
        aesenc      xmm1, xmm7
        aesenc      xmm2, xmm7
        aesenc      xmm3, xmm7
        aesenc      xmm4, xmm7
        aesenc      xmm5, xmm7
        aesenc      xmm6, xmm7
ENDM

aes_last_rnd MACRO i
        movdqu      xmm7, [i*16 + ctx]
        aesenclast  xmm0, xmm7
        aesenclast  xmm1, xmm7
        aesenclast  xmm2, xmm7
        aesenclast  xmm3, xmm7
        aesenclast  xmm4, xmm7
        aesenclast  xmm5, xmm7
        aesenclast  xmm6, xmm7
ENDM

aes_dec_rnd MACRO i
        movdqu      xmm7, [i*16 + ctx]
        aesdec      xmm0, xmm7
        aesdec      xmm1, xmm7
        aesdec      xmm2, xmm7
        aesdec      xmm3, xmm7
        aesdec      xmm4, xmm7
        aesdec      xmm5, xmm7
        aesdec      xmm6, xmm7
ENDM

aes_dec_last_rnd MACRO i
        movdqu      xmm7, [i*16 + ctx]
        aesdeclast  xmm0, xmm7
        aesdeclast  xmm1, xmm7
        aesdeclast  xmm2, xmm7
        aesdeclast  xmm3, xmm7
        aesdeclast  xmm4, xmm7
        aesdeclast  xmm5, xmm7
        aesdeclast  xmm6, xmm7
ENDM

;-------------------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
;   Body of an ECB routine.  enc = 1 selects encryption, anything else
;   decryption; rnds is the round count (index of the last round key).
; Stack args (slot n at [esp + 4 + 4*n] on entry):
;   0 = context, 1 = output, 4 = input, 5 = inputLen; slots 2/3 are not read.
; Out:   eax = 0.
; Clobb: ecx, edx, xmm0-xmm7, flags.
;-------------------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds
LOCAL   loop7
LOCAL   loop1
LOCAL   bail

IF enc eq 1
        rnd         textequ <aes_rnd>
        lastrnd     textequ <aes_last_rnd>
        aesinst     textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd         textequ <aes_dec_rnd>
        lastrnd     textequ <aes_dec_last_rnd>
        aesinst     textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        push        inputLen

        ; 2*4 skips the saved register and the return address.
        mov         ctx,      [esp + 2*4 + 0*4]
        mov         output,   [esp + 2*4 + 1*4]
        mov         input,    [esp + 2*4 + 4*4]
        mov         inputLen, [esp + 2*4 + 5*4]

        lea         ctx, [CTX_KEYS_OFS + ctx]

        ; Main loop: seven independent blocks per iteration.
loop7:
        cmp         inputLen, 7*16
        jb          loop1

        movdqu      xmm0, [0*16 + input]
        movdqu      xmm1, [1*16 + input]
        movdqu      xmm2, [2*16 + input]
        movdqu      xmm3, [3*16 + input]
        movdqu      xmm4, [4*16 + input]
        movdqu      xmm5, [5*16 + input]
        movdqu      xmm6, [6*16 + input]

        ; Round 0: xor with the first round key.
        movdqu      xmm7, [0*16 + ctx]
        pxor        xmm0, xmm7
        pxor        xmm1, xmm7
        pxor        xmm2, xmm7
        pxor        xmm3, xmm7
        pxor        xmm4, xmm7
        pxor        xmm5, xmm7
        pxor        xmm6, xmm7

        i = 1
        WHILE i LT rnds
            rnd     i
            i = i+1
        ENDM
        lastrnd     rnds

        movdqu      [0*16 + output], xmm0
        movdqu      [1*16 + output], xmm1
        movdqu      [2*16 + output], xmm2
        movdqu      [3*16 + output], xmm3
        movdqu      [4*16 + output], xmm4
        movdqu      [5*16 + output], xmm5
        movdqu      [6*16 + output], xmm6

        lea         input,  [7*16 + input]
        lea         output, [7*16 + output]
        sub         inputLen, 7*16
        jmp         loop7

        ; Tail loop: fewer than seven blocks left, one block per iteration.
loop1:
        cmp         inputLen, 1*16
        jb          bail

        movdqu      xmm0, [input]
        movdqu      xmm7, [0*16 + ctx]
        pxor        xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu      [output], xmm0

        lea         input,  [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         loop1

bail:
        xor         eax, eax
        pop         inputLen
        ret
ENDM

ALIGN 16
intel_aes_encrypt_ecb_128 PROC
        gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

ALIGN 16
intel_aes_encrypt_ecb_192 PROC
        gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

ALIGN 16
intel_aes_encrypt_ecb_256 PROC
        gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

ALIGN 16
intel_aes_decrypt_ecb_128 PROC
        gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

ALIGN 16
intel_aes_decrypt_ecb_192 PROC
        gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

ALIGN 16
intel_aes_decrypt_ecb_256 PROC
        gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP

; Register roles for the key-schedule routines (all caller-saved).
KEY     textequ <ecx>       ; user key (stack arg 0)
KS      textequ <edx>       ; key-schedule output cursor (stack arg 1)
ITR     textequ <eax>       ; scratch pointer, then loop counter

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_128
;   Expands the 16-byte key at arg 0 into 11 round keys (176 bytes) at arg 1.
; Clobb: eax, ecx, edx, xmm0-xmm4, flags.
;
; Each step: pshufb replicates the rotated last dword of the previous round
; key; aesenclast with the replicated round constant as its "key" then
; substitutes those bytes and xors the constant in.  The three
; pslldq/pxor pairs form the running xor of the previous key's dwords.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_128 PROC
        mov         KEY, [esp + 1*4 + 0*4]
        mov         KS,  [esp + 1*4 + 1*4]

        movdqu      xmm1, [KEY]
        movdqu      [KS], xmm1          ; round key 0 = user key
        movdqa      xmm2, xmm1

        lea         ITR, Lcon1
        movdqa      xmm0, [ITR]
        lea         ITR, Lmask
        movdqa      xmm4, [ITR]

        mov         ITR, 8              ; round keys 1..8 (constants 01h..80h)

Lenc_128_ks_loop:
        lea         KS, [16 + KS]

        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1             ; next round constant
        movdqa      xmm3, xmm1
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pxor        xmm1, xmm2
        movdqu      [KS], xmm1
        movdqa      xmm2, xmm1

        dec         ITR
        jnz         Lenc_128_ks_loop

        ; Round key 9 uses constant 1bh, round key 10 uses 36h (1bh << 1).
        lea         ITR, Lcon2
        movdqa      xmm0, [ITR]

        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1
        movdqa      xmm3, xmm1
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pxor        xmm1, xmm2
        movdqu      [16 + KS], xmm1
        movdqa      xmm2, xmm1

        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        movdqa      xmm3, xmm1
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pslldq      xmm3, 4
        pxor        xmm1, xmm3
        pxor        xmm1, xmm2
        movdqu      [32 + KS], xmm1

        ret
intel_aes_encrypt_init_128 ENDP

;-------------------------------------------------------------------------------
; gen_aes_decrypt_init rnds, enc_init
;   Body of a decryption key-schedule routine: calls <enc_init> with the same
;   two stack arguments, then reverses the order of the rnds+1 round keys in
;   place and applies aesimc to every key except the first and last.
; Clobb: eax, ecx, edx, xmm0-xmm7 (via enc_init), flags.
;-------------------------------------------------------------------------------
gen_aes_decrypt_init MACRO rnds, enc_init
        mov         KEY, [esp + 1*4 + 0*4]
        mov         KS,  [esp + 1*4 + 1*4]

        ; The pushes serve both as the callee's arguments and as a save of
        ; KEY/KS, which the callee overwrites.
        push        KS
        push        KEY
        call        enc_init
        pop         KEY
        pop         KS

        ; Outermost pair is swapped without aesimc.
        movdqu      xmm0, [0*16 + KS]
        movdqu      xmm1, [rnds*16 + KS]
        movdqu      [rnds*16 + KS], xmm0
        movdqu      [0*16 + KS], xmm1

        i = 1
        WHILE i LT (rnds/2)
            movdqu  xmm0, [i*16 + KS]
            movdqu  xmm1, [(rnds-i)*16 + KS]

            aesimc  xmm0, xmm0
            aesimc  xmm1, xmm1

            movdqu  [(rnds-i)*16 + KS], xmm0
            movdqu  [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; Middle key stays in place.
        movdqu      xmm0, [(rnds/2)*16 + KS]
        aesimc      xmm0, xmm0
        movdqu      [(rnds/2)*16 + KS], xmm0
        ret
ENDM

ALIGN 16
intel_aes_decrypt_init_128 PROC
        gen_aes_decrypt_init 10, intel_aes_encrypt_init_128
intel_aes_decrypt_init_128 ENDP

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_192
;   Expands the 24-byte key at arg 0 into 13 round keys at arg 1.
; Clobb: eax, ecx, edx, xmm0-xmm7, flags.
; NOTE: the final store writes a 14th 16-byte block (byte offset 208 of the
;       schedule), so the destination must have room for at least 224 bytes.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_192 PROC
        mov         KEY, [esp + 1*4 + 0*4]
        mov         KS,  [esp + 1*4 + 1*4]

        pxor        xmm3, xmm3
        movdqu      xmm1, [KEY]                         ; key dwords 0..3
        pinsrd      xmm3, DWORD PTR [16 + KEY], 0       ; key dword 4
        pinsrd      xmm3, DWORD PTR [20 + KEY], 1       ; key dword 5

        movdqu      [KS], xmm1
        movdqa      xmm5, xmm3

        lea         ITR, Lcon1
        movdqu      xmm0, [ITR]
        lea         ITR, Lmask192
        movdqu      xmm4, [ITR]

        mov         ITR, 4              ; two expansion steps = 48 bytes per pass

Lenc_192_ks_loop:
        ; First step of the pass.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1

        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm3
        pslldq      xmm6, 4
        pslldq      xmm7, 4
        pxor        xmm1, xmm6
        pxor        xmm3, xmm7
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pxor        xmm1, xmm2
        pshufd      xmm2, xmm1, 0ffh
        pxor        xmm3, xmm2

        ; Repack the 24-byte steps into 16-byte round keys.
        movdqa      xmm6, xmm1
        shufpd      xmm5, xmm1, 00h
        shufpd      xmm6, xmm3, 01h

        movdqu      [16 + KS], xmm5
        movdqu      [32 + KS], xmm6

        ; Second step of the pass.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1

        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm3
        pslldq      xmm6, 4
        pslldq      xmm7, 4
        pxor        xmm1, xmm6
        pxor        xmm3, xmm7
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pslldq      xmm6, 4
        pxor        xmm1, xmm6
        pxor        xmm1, xmm2
        pshufd      xmm2, xmm1, 0ffh
        pxor        xmm3, xmm2

        movdqu      [48 + KS], xmm1
        movdqa      xmm5, xmm3

        lea         KS, [48 + KS]

        dec         ITR
        jnz         Lenc_192_ks_loop

        movdqu      [16 + KS], xmm5     ; see NOTE in the header
        ret
intel_aes_encrypt_init_192 ENDP

ALIGN 16
intel_aes_decrypt_init_192 PROC
        gen_aes_decrypt_init 12, intel_aes_encrypt_init_192
intel_aes_decrypt_init_192 ENDP

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_256
;   Expands the 32-byte key at arg 0 into 15 round keys (240 bytes) at arg 1.
; Clobb: eax, ecx, edx, xmm0-xmm6, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_256 PROC
        mov         KEY, [esp + 1*4 + 0*4]
        mov         KS,  [esp + 1*4 + 1*4]

        movdqu      xmm1, [16*0 + KEY]
        movdqu      xmm3, [16*1 + KEY]

        movdqu      [16*0 + KS], xmm1
        movdqu      [16*1 + KS], xmm3

        lea         ITR, Lcon1
        movdqu      xmm0, [ITR]
        lea         ITR, Lmask256
        movdqu      xmm5, [ITR]

        pxor        xmm6, xmm6          ; all-zero "key" for the odd steps

        mov         ITR, 6              ; two round keys per pass

Lenc_256_ks_loop:
        ; Even round key: rotate + substitute + round constant.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld       xmm0, 1
        movdqa      xmm4, xmm1
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pxor        xmm1, xmm2
        movdqu      [16*2 + KS], xmm1

        ; Odd round key: substitute only (no rotation, zero constant).
        pshufd      xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa      xmm4, xmm3
        pslldq      xmm4, 4
        pxor        xmm3, xmm4
        pslldq      xmm4, 4
        pxor        xmm3, xmm4
        pslldq      xmm4, 4
        pxor        xmm3, xmm4
        pxor        xmm3, xmm2
        movdqu      [16*3 + KS], xmm3

        lea         KS, [32 + KS]
        dec         ITR
        jnz         Lenc_256_ks_loop

        ; Final (15th) round key.
        movdqa      xmm2, xmm3
        pshufb      xmm2, xmm5
        aesenclast  xmm2, xmm0
        movdqa      xmm4, xmm1
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pslldq      xmm4, 4
        pxor        xmm1, xmm4
        pxor        xmm1, xmm2
        movdqu      [16*2 + KS], xmm1

        ret
intel_aes_encrypt_init_256 ENDP

ALIGN 16
intel_aes_decrypt_init_256 PROC
        gen_aes_decrypt_init 14, intel_aes_encrypt_init_256
intel_aes_decrypt_init_256 ENDP

;-------------------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
;   Body of a CBC encryption routine.  Same stack arguments as
;   gen_aes_ecb_func.  The chaining block is read from, and on exit written
;   back to, [IV_FROM_KEYS + round-key base].
; Out:   eax = 0.
; Clobb: ecx, edx, xmm0-xmm7, flags.
;-------------------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds
LOCAL   loop1
LOCAL   bail

        push        inputLen

        mov         ctx,      [esp + 2*4 + 0*4]
        mov         output,   [esp + 2*4 + 1*4]
        mov         input,    [esp + 2*4 + 4*4]
        mov         inputLen, [esp + 2*4 + 5*4]

        lea         ctx, [CTX_KEYS_OFS + ctx]

        movdqu      xmm0, [IV_FROM_KEYS + ctx]  ; chaining value

        ; Round keys 0..4 stay in registers for the whole call.
        movdqu      xmm2, [0*16 + ctx]
        movdqu      xmm3, [1*16 + ctx]
        movdqu      xmm4, [2*16 + ctx]
        movdqu      xmm5, [3*16 + ctx]
        movdqu      xmm6, [4*16 + ctx]

loop1:
        cmp         inputLen, 1*16
        jb          bail

        movdqu      xmm1, [input]
        pxor        xmm1, xmm2          ; plaintext ^ round key 0
        pxor        xmm0, xmm1          ; ... ^ previous ciphertext

        aesenc      xmm0, xmm3
        aesenc      xmm0, xmm4
        aesenc      xmm0, xmm5
        aesenc      xmm0, xmm6

        i = 5
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aesenclast  xmm0, xmm7

        movdqu      [output], xmm0

        lea         input,  [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         loop1

bail:
        movdqu      [IV_FROM_KEYS + ctx], xmm0  ; save chaining value
        xor         eax, eax
        pop         inputLen
        ret
ENDM

;-------------------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
;   Body of a CBC decryption routine.  Same stack arguments and chaining-block
;   location as gen_aes_cbc_enc_func.  All ciphertext needed for chaining is
;   loaded before the corresponding output is stored.
; Out:   eax = 0.
; Clobb: ecx, edx, xmm0-xmm7, flags.
;-------------------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds
LOCAL   loop7
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        push        inputLen

        mov         ctx,      [esp + 2*4 + 0*4]
        mov         output,   [esp + 2*4 + 1*4]
        mov         input,    [esp + 2*4 + 4*4]
        mov         inputLen, [esp + 2*4 + 5*4]

        lea         ctx, [CTX_KEYS_OFS + ctx]

        ; Main loop: seven blocks per iteration.
loop7:
        cmp         inputLen, 7*16
        jb          dec1

        movdqu      xmm0, [0*16 + input]
        movdqu      xmm1, [1*16 + input]
        movdqu      xmm2, [2*16 + input]
        movdqu      xmm3, [3*16 + input]
        movdqu      xmm4, [4*16 + input]
        movdqu      xmm5, [5*16 + input]
        movdqu      xmm6, [6*16 + input]

        movdqu      xmm7, [0*16 + ctx]
        pxor        xmm0, xmm7
        pxor        xmm1, xmm7
        pxor        xmm2, xmm7
        pxor        xmm3, xmm7
        pxor        xmm4, xmm7
        pxor        xmm5, xmm7
        pxor        xmm6, xmm7

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
        ENDM
        aes_dec_last_rnd rnds

        ; Block 0 chains from the saved value, block n from ciphertext n-1.
        movdqu      xmm7, [IV_FROM_KEYS + ctx]
        pxor        xmm0, xmm7
        movdqu      xmm7, [0*16 + input]
        pxor        xmm1, xmm7
        movdqu      xmm7, [1*16 + input]
        pxor        xmm2, xmm7
        movdqu      xmm7, [2*16 + input]
        pxor        xmm3, xmm7
        movdqu      xmm7, [3*16 + input]
        pxor        xmm4, xmm7
        movdqu      xmm7, [4*16 + input]
        pxor        xmm5, xmm7
        movdqu      xmm7, [5*16 + input]
        pxor        xmm6, xmm7
        movdqu      xmm7, [6*16 + input]        ; next chaining value

        movdqu      [0*16 + output], xmm0
        movdqu      [1*16 + output], xmm1
        movdqu      [2*16 + output], xmm2
        movdqu      [3*16 + output], xmm3
        movdqu      [4*16 + output], xmm4
        movdqu      [5*16 + output], xmm5
        movdqu      [6*16 + output], xmm6
        movdqu      [IV_FROM_KEYS + ctx], xmm7

        lea         input,  [7*16 + input]
        lea         output, [7*16 + output]
        sub         inputLen, 7*16
        jmp         loop7

        ; Tail loop: xmm3 carries the chaining value, xmm4 the current
        ; ciphertext block.
dec1:
        movdqu      xmm3, [IV_FROM_KEYS + ctx]

loop1:
        cmp         inputLen, 1*16
        jb          bail

        movdqu      xmm0, [input]
        movdqa      xmm4, xmm0

        movdqu      xmm7, [0*16 + ctx]
        pxor        xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aesdeclast  xmm0, xmm7

        pxor        xmm3, xmm0
        movdqu      [output], xmm3
        movdqa      xmm3, xmm4

        lea         input,  [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         loop1

bail:
        movdqu      [IV_FROM_KEYS + ctx], xmm3
        xor         eax, eax
        pop         inputLen
        ret
ENDM

ALIGN 16
intel_aes_encrypt_cbc_128 PROC
        gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP

ALIGN 16
intel_aes_encrypt_cbc_192 PROC
        gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP

ALIGN 16
intel_aes_encrypt_cbc_256 PROC
        gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP

ALIGN 16
intel_aes_decrypt_cbc_128 PROC
        gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP

ALIGN 16
intel_aes_decrypt_cbc_192 PROC
        gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP

ALIGN 16
intel_aes_decrypt_cbc_256 PROC
        gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP

; Extra register roles for CTR mode (both callee-saved, pushed on entry).
ctrCtx  textequ <esi>       ; CTR context pointer, then the host-order counter
CTR     textequ <ebx>       ; scratch for the byte-swapped counter dword

;-------------------------------------------------------------------------------
; gen_aes_ctr_func rnds
;   Body of a CTR routine.
; Stack args (slot n at [esp + 4 + 4*n] on entry):
;   0 = CTR context, 1 = output, 4 = input, 5 = inputLen; 2/3 are not read.
;   The cipher context pointer is read from [CTR context + CTR_CIPHER_CTX]
;   and the counter block from [CTR context + CTR_COUNTER].
; Out:   eax = 0; the counter block in the CTR context is advanced by the
;        number of whole blocks processed.
; Clobb: ecx, edx, xmm0-xmm7, flags.
;
; Seven counter blocks, already xored with round key 0, are kept in a
; 16-byte-aligned stack buffer; ebp holds the unaligned esp for the epilogue.
; NOTE: only the last dword of the counter block is incremented (as a
;       big-endian 32-bit value); no carry reaches the other 12 bytes.
;-------------------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds
LOCAL   loop7
LOCAL   loop1
LOCAL   bail

        push        inputLen
        push        ctrCtx
        push        CTR
        push        ebp

        ; 4*5 skips the four saved registers and the return address.
        mov         ctrCtx,   [esp + 4*5 + 0*4]
        mov         output,   [esp + 4*5 + 1*4]
        mov         input,    [esp + 4*5 + 4*4]
        mov         inputLen, [esp + 4*5 + 5*4]

        mov         ctx, [CTR_CIPHER_CTX + ctrCtx]
        lea         ctx, [CTX_KEYS_OFS + ctx]

        mov         ebp, esp
        sub         esp, 7*16
        and         esp, -16            ; movdqa below needs alignment

        movdqu      xmm0, [CTR_COUNTER + ctrCtx]
        mov         ctrCtx, [ctrCtx + CTR_COUNTER + 3*4]
        bswap       ctrCtx              ; counter dword in host byte order
        movdqu      xmm1, [ctx + 0*16]

        pxor        xmm0, xmm1          ; pre-apply round key 0

        ; Fill all seven slots with counter+0 ...
        i = 0
        WHILE i LT 7
            movdqa  [esp + i*16], xmm0
            i = i+1
        ENDM

        ; ... then patch the last dword of slots 1..6 with counter+1..+6.
        i = 1
        WHILE i LT 7
            inc     ctrCtx
            mov     CTR, ctrCtx
            bswap   CTR
            xor     CTR, [ctx + 3*4]    ; keep the round-key-0 whitening
            mov     [esp + i*16 + 3*4], CTR
            i = i+1
        ENDM

        ; Main loop: seven blocks per iteration.
loop7:
        cmp         inputLen, 7*16
        jb          loop1

        movdqu      xmm0, [0*16 + esp]
        movdqu      xmm1, [1*16 + esp]
        movdqu      xmm2, [2*16 + esp]
        movdqu      xmm3, [3*16 + esp]
        movdqu      xmm4, [4*16 + esp]
        movdqu      xmm5, [5*16 + esp]
        movdqu      xmm6, [6*16 + esp]

        ; Rounds 1..7, interleaved with refilling slot i-1 with the counter
        ; for the next pass.
        i = 1
        WHILE i LE 7
            aes_rnd i

            inc     ctrCtx
            mov     CTR, ctrCtx
            bswap   CTR
            xor     CTR, [ctx + 3*4]
            mov     [esp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        WHILE i LT rnds
            aes_rnd i
            i = i+1
        ENDM
        aes_last_rnd rnds

        movdqu      xmm7, [0*16 + input]
        pxor        xmm0, xmm7
        movdqu      xmm7, [1*16 + input]
        pxor        xmm1, xmm7
        movdqu      xmm7, [2*16 + input]
        pxor        xmm2, xmm7
        movdqu      xmm7, [3*16 + input]
        pxor        xmm3, xmm7
        movdqu      xmm7, [4*16 + input]
        pxor        xmm4, xmm7
        movdqu      xmm7, [5*16 + input]
        pxor        xmm5, xmm7
        movdqu      xmm7, [6*16 + input]
        pxor        xmm6, xmm7

        movdqu      [0*16 + output], xmm0
        movdqu      [1*16 + output], xmm1
        movdqu      [2*16 + output], xmm2
        movdqu      [3*16 + output], xmm3
        movdqu      [4*16 + output], xmm4
        movdqu      [5*16 + output], xmm5
        movdqu      [6*16 + output], xmm6

        lea         input,  [7*16 + input]
        lea         output, [7*16 + output]
        sub         inputLen, 7*16
        jmp         loop7

        ; Tail loop: at most six iterations, each consuming the counter slot
        ; at [esp] and stepping esp to the next one.
loop1:
        cmp         inputLen, 1*16
        jb          bail

        movdqu      xmm0, [esp]
        add         esp, 16

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu      xmm7, [rnds*16 + ctx]
        aesenclast  xmm0, xmm7

        movdqu      xmm7, [input]
        pxor        xmm0, xmm7
        movdqu      [output], xmm0

        lea         input,  [1*16 + input]
        lea         output, [1*16 + output]
        sub         inputLen, 1*16
        jmp         loop1

bail:
        ; [esp] is the first unused counter slot: strip round key 0 and
        ; store it back as the new counter block.
        mov         ctrCtx, [ebp + 4*5 + 0*4]
        movdqu      xmm0, [esp]
        movdqu      xmm1, [ctx + 0*16]
        pxor        xmm0, xmm1
        movdqu      [CTR_COUNTER + ctrCtx], xmm0

        xor         eax, eax
        mov         esp, ebp
        pop         ebp
        pop         CTR
        pop         ctrCtx
        pop         inputLen
        ret
ENDM

ALIGN 16
intel_aes_encrypt_ctr_128 PROC
        gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP

ALIGN 16
intel_aes_encrypt_ctr_192 PROC
        gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP

ALIGN 16
intel_aes_encrypt_ctr_256 PROC
        gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP

END