; nss/lib/freebl/intel-aes-x86-masm.asm
;
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

; Assembler setup: 32-bit flat memory model with C naming and calling
; conventions, and XMM (SSE) instructions enabled.
.MODEL FLAT, C
.XMM

; Constant vectors used by the key-expansion routines. Each table is exactly
; 16 bytes and the first one is 16-byte aligned, so all of them can be read
; with aligned loads (movdqa).
.DATA
ALIGN 16
; pshufb control used by the AES-128 schedule: every dword selects source
; bytes 13,14,15,12, i.e. the last 32-bit word rotated by one byte and
; replicated into all four lanes.
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
; pshufb control used by the AES-192 schedule: every dword selects source
; bytes 5,6,7,4 (the second 32-bit word rotated by one byte).
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
; pshufb control used by the AES-256 schedule (same byte pattern as Lmask).
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Initial round constant 01h in every lane; the schedules double it with
; pslld after each use.
Lcon1 dd 1,1,1,1
; Round constant 1Bh in every lane; the AES-128 schedule switches to it for
; its last two rounds (1Bh, then 36h after one doubling).
Lcon2 dd 1bh,1bh,1bh,1bh

.CODE

; Register roles shared by the ECB, CBC and CTR code generators below.
; ctx      - pointer to the round keys (context pointer + 44 once adjusted)
ctx     textequ <ecx>
; output   - destination pointer, advanced as blocks are written
output  textequ <edx>
; input    - source pointer, advanced as blocks are read
input   textequ <eax>
; inputLen - number of input bytes still to process
inputLen textequ <edi>

; aes_rnd i
; Applies one AES encryption round (aesenc) to the seven blocks held in
; xmm0-xmm6, using the 16-byte round key at [ctx + 16*i].
;   i - assembly-time round-key index
; Clobbers xmm7, which receives the round key.
aes_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesenc  xmm0, xmm7
    aesenc  xmm1, xmm7
    aesenc  xmm2, xmm7
    aesenc  xmm3, xmm7
    aesenc  xmm4, xmm7
    aesenc  xmm5, xmm7
    aesenc  xmm6, xmm7
    ENDM

; aes_last_rnd i
; Applies the final AES encryption round (aesenclast) to the seven blocks
; held in xmm0-xmm6, using the 16-byte round key at [ctx + 16*i].
;   i - assembly-time round-key index
; Clobbers xmm7, which receives the round key.
aes_last_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesenclast  xmm0, xmm7
    aesenclast  xmm1, xmm7
    aesenclast  xmm2, xmm7
    aesenclast  xmm3, xmm7
    aesenclast  xmm4, xmm7
    aesenclast  xmm5, xmm7
    aesenclast  xmm6, xmm7
    ENDM

; aes_dec_rnd i
; Applies one AES decryption round (aesdec) to the seven blocks held in
; xmm0-xmm6, using the 16-byte round key at [ctx + 16*i].
;   i - assembly-time round-key index
; Clobbers xmm7, which receives the round key.
aes_dec_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesdec  xmm0, xmm7
    aesdec  xmm1, xmm7
    aesdec  xmm2, xmm7
    aesdec  xmm3, xmm7
    aesdec  xmm4, xmm7
    aesdec  xmm5, xmm7
    aesdec  xmm6, xmm7
    ENDM

; aes_dec_last_rnd i
; Applies the final AES decryption round (aesdeclast) to the seven blocks
; held in xmm0-xmm6, using the 16-byte round key at [ctx + 16*i].
;   i - assembly-time round-key index
; Clobbers xmm7, which receives the round key.
aes_dec_last_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesdeclast  xmm0, xmm7
    aesdeclast  xmm1, xmm7
    aesdeclast  xmm2, xmm7
    aesdeclast  xmm3, xmm7
    aesdeclast  xmm4, xmm7
    aesdeclast  xmm5, xmm7
    aesdeclast  xmm6, xmm7
    ENDM

; gen_aes_ecb_func enc, rnds
; Emits the complete body (prologue through ret) of an AES-ECB routine.
;   enc  - 1 selects the encryption instructions, any other value selects
;          the decryption instructions
;   rnds - number of AES rounds (10, 12 or 14 for the routines in this file)
;
; Stack arguments of the generated routine, as read by the code below:
;   arg0        pointer to the context; round keys start at offset 44
;   arg1        output buffer
;   arg2, arg3  not read by this code
;   arg4        input buffer
;   arg5        input length in bytes
; Whole 16-byte blocks are processed, seven at a time while at least 7*16
; bytes remain and then one at a time. A trailing partial block is ignored.
; The routine always returns 0 in eax.
; Registers: edi is saved and restored; eax, ecx, edx, flags and xmm0-xmm7
; are overwritten.
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   bail

        push    inputLen

        ; The 2*4 term skips the saved register and the return address.
        mov     ctx,    [esp + 2*4 + 0*4]
        mov     output,     [esp + 2*4 + 1*4]
        mov     input,      [esp + 2*4 + 4*4]
        mov     inputLen,   [esp + 2*4 + 5*4]

        ; From here on ctx addresses round key 0.
        lea     ctx, [44+ctx]

loop7:
        ; Seven-block path: taken while at least 7*16 bytes remain.
        cmp     inputLen, 7*16
        jb      loop1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]

        ; Initial AddRoundKey with round key 0.
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7
        pxor    xmm1, xmm7
        pxor    xmm2, xmm7
        pxor    xmm3, xmm7
        pxor    xmm4, xmm7
        pxor    xmm5, xmm7
        pxor    xmm6, xmm7

; Assembly-time selection of the round macros and instructions. These text
; equates are also used by the single-block path further down.
IF enc eq 1
        rnd textequ <aes_rnd>
        lastrnd textequ <aes_last_rnd>
        aesinst textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd textequ <aes_dec_rnd>
        lastrnd textequ <aes_dec_last_rnd>
        aesinst textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        ; Rounds 1 .. rnds-1, then the final round with key index rnds.
        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
            ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6

        lea input, [7*16 + input]
        lea output, [7*16 + output]
        sub inputLen, 7*16
        jmp loop7

loop1:
        ; Single-block path for the remaining whole blocks.
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        ; Return 0 and restore the saved register.
        xor eax, eax
        pop     inputLen
        ret

ENDM

; ECB encryption with 10 rounds (AES-128). The whole body, including the
; ret, is generated by gen_aes_ecb_func.
ALIGN 16
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

; ECB encryption with 12 rounds (AES-192). The whole body, including the
; ret, is generated by gen_aes_ecb_func.
ALIGN 16
intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

; ECB encryption with 14 rounds (AES-256). The whole body, including the
; ret, is generated by gen_aes_ecb_func.
ALIGN 16
intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

; ECB decryption with 10 rounds (AES-128). The whole body, including the
; ret, is generated by gen_aes_ecb_func.
ALIGN 16
intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

; ECB decryption with 12 rounds (AES-192). The whole body, including the
; ret, is generated by gen_aes_ecb_func.
ALIGN 16
intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

; ECB decryption with 14 rounds (AES-256). The whole body, including the
; ret, is generated by gen_aes_ecb_func.
ALIGN 16
intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


; Register roles used by the key-schedule (init) routines below.
; KEY - pointer to the caller-supplied cipher key
KEY textequ <ecx>
; KS  - pointer into the key schedule being written
KS  textequ <edx>
; ITR - loop counter; also used as scratch to hold constant-table addresses
ITR textequ <eax>

; intel_aes_encrypt_init_128
; Expands a 16-byte key into the AES-128 encryption key schedule.
; Stack arguments:
;   [esp+4]  KEY  pointer to the 16-byte cipher key
;   [esp+8]  KS   pointer to the schedule buffer; 11 round keys (176 bytes)
;                 are written
; eax is used as scratch and is not set to a status value.
; Clobbers eax, ecx, edx, flags and xmm0-xmm4.
;
; Register use: xmm1 = current round key, xmm2 = transformed last word,
; xmm0 = round constant, xmm4 = pshufb mask, xmm3 = shift scratch.
ALIGN 16
intel_aes_encrypt_init_128  PROC

    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]


    ; Round key 0 is the cipher key itself.
    movdqu  xmm1, [KEY]
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    lea ITR, Lcon1
    movdqa  xmm0, [ITR]
    lea ITR, Lmask
    movdqa  xmm4, [ITR]

    ; Eight rounds use the doubling constants 01h .. 80h.
    mov ITR, 8

Lenc_128_ks_loop:
        lea KS, [16 + KS]
        ; The flags set here are consumed by the jne at the bottom of the
        ; loop; none of the SSE/AES instructions in between modify EFLAGS.
        dec ITR

        ; Rotate the last key word and replicate it, then apply the S-box
        ; and add the round constant via aesenclast.
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        ; Next round constant.
        pslld   xmm0, 1
        ; Running XOR of the four key words (w0, w0^w1, w0^w1^w2, ...).
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        jne Lenc_128_ks_loop

    ; Last two rounds use the constants 1Bh and 36h.
    lea ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld   xmm0, 1
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    ; KS still points at round key 8, so this is round key 9.
    movdqu  [16 + KS], xmm1
    movdqa  xmm2, xmm1

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    ; Round key 10.
    movdqu  [32 + KS], xmm1
    movdqa  xmm2, xmm1

    ret
intel_aes_encrypt_init_128  ENDP


; intel_aes_decrypt_init_128
; Builds the AES-128 decryption key schedule: runs the encryption key
; expansion, then reverses the order of the 11 round keys and applies
; aesimc (InvMixColumns) to every key except the first and the last.
; Stack arguments:
;   [esp+4]  KEY  pointer to the 16-byte cipher key
;   [esp+8]  KS   pointer to the schedule buffer (11 round keys)
; Clobbers eax, ecx, edx, flags and the xmm registers used by
; intel_aes_encrypt_init_128.
ALIGN 16
intel_aes_decrypt_init_128  PROC

    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]

    ; Pass (KEY, KS) on the stack to the encryption expansion. The same
    ; slots are popped afterwards to recover both pointers, because the
    ; callee advances KS.
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_128

    pop     KEY
    pop     KS

    ; Swap the outermost keys unchanged.
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; Swap keys i and 10-i, transforming both with aesimc.
    i = 1
    WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; The middle key stays in place and is only transformed.
    movdqu  xmm0, [5*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128  ENDP


; intel_aes_encrypt_init_192
; Expands a 24-byte key into the AES-192 encryption key schedule.
; Stack arguments:
;   [esp+4]  KEY  pointer to the 24-byte cipher key
;   [esp+8]  KS   pointer to the schedule buffer
; Each of the four loop iterations performs two expansion steps and stores
; three 16-byte round keys, giving 13 round keys (bytes 0..207) in total.
; NOTE: the final store after the loop writes 16 more bytes at offset 208,
; so the buffer must provide at least 224 bytes.
; eax is used as scratch and is not set to a status value.
; Clobbers eax, ecx, edx, flags and xmm0-xmm7.
;
; Register use: xmm1 = key words 0-3 of the current step, xmm3 (low qword)
; = key words 4-5, xmm5 = words 4-5 carried over from the previous step,
; xmm0 = round constant, xmm4 = pshufb mask, xmm2/xmm6/xmm7 = scratch.
ALIGN 16
intel_aes_encrypt_init_192  PROC

    mov     KEY, [esp + 1*4 + 0*4]
    mov     KS,  [esp + 1*4 + 1*4]

    ; Load the 24-byte key: 16 bytes into xmm1, the last 8 into xmm3.
    pxor    xmm3, xmm3
    movdqu  xmm1, [KEY]
    pinsrd  xmm3, DWORD PTR [16 + KEY], 0
    pinsrd  xmm3, DWORD PTR [20 + KEY], 1

    movdqu  [KS], xmm1
    movdqa  xmm5, xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask192
    movdqu  xmm4, [ITR]

    mov ITR, 4

Lenc_192_ks_loop:
        ; --- first expansion step of this iteration ---
        ; Rotate key word 5, replicate it, apply the S-box and add the
        ; round constant.
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        ; Running XOR across words 0-3 (xmm1) and words 4-5 (xmm3).
        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        ; Chain the new word 3 into words 4-5.
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        ; Repack 64-bit halves into 16-byte round keys:
        ;   xmm5 = carried words 4-5 : new words 0-1
        ;   xmm6 = new words 2-3     : new words 4-5
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; --- second expansion step of this iteration ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        ; Words 0-3 form a complete round key; words 4-5 are carried over.
        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea KS, [48 + KS]

        dec ITR
        jnz Lenc_192_ks_loop

    ; Store the carried words. This lands at bytes 208..223 of the buffer.
    movdqu  [16 + KS], xmm5
ret
intel_aes_encrypt_init_192  ENDP

; intel_aes_decrypt_init_192
; Builds the AES-192 decryption key schedule: runs the encryption key
; expansion, then reverses the order of the 13 round keys and applies
; aesimc (InvMixColumns) to every key except the first and the last.
; Stack arguments:
;   [esp+4]  KEY  pointer to the 24-byte cipher key
;   [esp+8]  KS   pointer to the schedule buffer
; Clobbers eax, ecx, edx, flags and the xmm registers used by
; intel_aes_encrypt_init_192.
ALIGN 16
intel_aes_decrypt_init_192  PROC
    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]

    ; Pass (KEY, KS) on the stack to the encryption expansion. The same
    ; slots are popped afterwards to recover both pointers, because the
    ; callee advances KS.
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_192

    pop     KEY
    pop     KS

    ; Swap the outermost keys unchanged.
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; Swap keys i and 12-i, transforming both with aesimc.
    i = 1
    WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; The middle key stays in place and is only transformed.
    movdqu  xmm0, [6*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192  ENDP

; intel_aes_encrypt_init_256
; Expands a 32-byte key into the AES-256 encryption key schedule.
; Stack arguments:
;   [esp+4]  KEY  pointer to the 32-byte cipher key
;   [esp+8]  KS   pointer to the schedule buffer; 15 round keys (240 bytes)
;                 are written
; eax is used as scratch and is not set to a status value.
; Clobbers eax, ecx, edx, flags and xmm0-xmm6.
;
; Register use: xmm1 / xmm3 = the two most recent round keys, xmm0 = round
; constant, xmm5 = pshufb mask, xmm6 = zero, xmm2 / xmm4 = scratch.
ALIGN 16
intel_aes_encrypt_init_256  PROC

    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]
    ; Round keys 0 and 1 are the two halves of the cipher key.
    movdqu  xmm1, [16*0 + KEY]
    movdqu  xmm3, [16*1 + KEY]

    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask256
    movdqu  xmm5, [ITR]

    ; All-zero round key, used to get a plain S-box pass from aesenclast.
    pxor    xmm6, xmm6

    ; Six iterations, each producing two round keys.
    mov ITR, 6

Lenc_256_ks_loop:

        ; Even round key: rotate the last word of xmm3, replicate it, apply
        ; the S-box and add the round constant.
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        ; Running XOR of the four words of xmm1.
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ; Odd round key: replicate the last word of the key just produced
        ; and apply the S-box only (no rotation, zero constant).
        pshufd  xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea KS, [32 + KS]
        dec ITR
        jnz Lenc_256_ks_loop

    ; Final (15th) round key: one more even-key step.
    movdqa  xmm2, xmm3
    pshufb  xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa  xmm4, xmm1
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    ret
intel_aes_encrypt_init_256  ENDP

; intel_aes_decrypt_init_256
; Builds the AES-256 decryption key schedule: runs the encryption key
; expansion, then reverses the order of the 15 round keys and applies
; aesimc (InvMixColumns) to every key except the first and the last.
; Stack arguments:
;   [esp+4]  KEY  pointer to the 32-byte cipher key
;   [esp+8]  KS   pointer to the schedule buffer (15 round keys)
; Clobbers eax, ecx, edx, flags and the xmm registers used by
; intel_aes_encrypt_init_256.
ALIGN 16
intel_aes_decrypt_init_256  PROC
    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]

    ; Pass (KEY, KS) on the stack to the encryption expansion. The same
    ; slots are popped afterwards to recover both pointers, because the
    ; callee advances KS.
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_256

    pop     KEY
    pop     KS

    ; Swap the outermost keys unchanged.
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; Swap keys i and 14-i, transforming both with aesimc.
    i = 1
    WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; The middle key stays in place and is only transformed.
    movdqu  xmm0, [7*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256  ENDP



; gen_aes_cbc_enc_func rnds
; Emits the complete body (prologue through ret) of an AES-CBC encryption
; routine.
;   rnds - number of AES rounds (10, 12 or 14 for the routines in this file)
;
; Stack arguments of the generated routine, as read by the code below:
;   arg0        pointer to the context; the 16-byte chaining value (IV) is
;               at offset 12 and the round keys start at offset 44
;   arg1        output buffer
;   arg2, arg3  not read by this code
;   arg4        input buffer
;   arg5        input length in bytes
; Whole 16-byte blocks are encrypted one at a time, since each block
; depends on the previous ciphertext. A trailing partial block is ignored.
; The last ciphertext block is written back to the chaining-value slot and
; the routine returns 0 in eax.
; Registers: edi is saved and restored; eax, ecx, edx, flags and xmm0-xmm7
; are overwritten.
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        push    inputLen

        ; The 2*4 term skips the saved register and the return address.
        mov     ctx,    [esp + 2*4 + 0*4]
        mov     output,     [esp + 2*4 + 1*4]
        mov     input,      [esp + 2*4 + 4*4]
        mov     inputLen,   [esp + 2*4 + 5*4]

        ; From here on ctx addresses round key 0.
        lea     ctx, [44+ctx]

        ; xmm0 = chaining value (44 - 32 = context offset 12).
        movdqu  xmm0, [-32+ctx]

        ; Keep round keys 0-4 in registers for the whole loop.
        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        ; xmm0 = previous ciphertext XOR plaintext XOR round key 0.
        movdqu  xmm1, [input]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm1

        ; Rounds 1-4 use the cached keys.
        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6

        ; Remaining rounds load their keys from memory.
        i = 5
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        ; Save the chaining value for a following call.
        movdqu  [-32+ctx], xmm0

        xor eax, eax
        pop inputLen
        ret

ENDM

; gen_aes_cbc_dec_func rnds
; Emits the complete body (prologue through ret) of an AES-CBC decryption
; routine.
;   rnds - number of AES rounds (10, 12 or 14 for the routines in this file)
;
; Stack arguments of the generated routine, as read by the code below:
;   arg0        pointer to the context; the 16-byte chaining value (IV) is
;               at offset 12 and the round keys start at offset 44
;   arg1        output buffer
;   arg2, arg3  not read by this code
;   arg4        input buffer
;   arg5        input length in bytes
; Whole 16-byte blocks are processed, seven at a time while at least 7*16
; bytes remain and then one at a time. A trailing partial block is ignored.
; The last ciphertext block consumed is written back to the chaining-value
; slot and the routine returns 0 in eax.
; Registers: edi is saved and restored; eax, ecx, edx, flags and xmm0-xmm7
; are overwritten.
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        push    inputLen

        ; The 2*4 term skips the saved register and the return address.
        mov     ctx,    [esp + 2*4 + 0*4]
        mov     output,     [esp + 2*4 + 1*4]
        mov     input,      [esp + 2*4 + 4*4]
        mov     inputLen,   [esp + 2*4 + 5*4]

        ; From here on ctx addresses round key 0.
        lea     ctx, [44+ctx]

loop7:
        ; Seven-block path: taken while at least 7*16 bytes remain.
        cmp     inputLen, 7*16
        jb      dec1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]

        ; Initial AddRoundKey with round key 0.
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7
        pxor    xmm1, xmm7
        pxor    xmm2, xmm7
        pxor    xmm3, xmm7
        pxor    xmm4, xmm7
        pxor    xmm5, xmm7
        pxor    xmm6, xmm7

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
            ENDM
        aes_dec_last_rnd rnds

        ; CBC chaining: block 0 is XORed with the saved chaining value and
        ; block k with ciphertext block k-1. The ciphertext is re-read from
        ; the input before any output is stored.
        movdqu  xmm7, [-32 + ctx]
        pxor    xmm0, xmm7
        movdqu  xmm7, [0*16 + input]
        pxor    xmm1, xmm7
        movdqu  xmm7, [1*16 + input]
        pxor    xmm2, xmm7
        movdqu  xmm7, [2*16 + input]
        pxor    xmm3, xmm7
        movdqu  xmm7, [3*16 + input]
        pxor    xmm4, xmm7
        movdqu  xmm7, [4*16 + input]
        pxor    xmm5, xmm7
        movdqu  xmm7, [5*16 + input]
        pxor    xmm6, xmm7
        ; xmm7 = last ciphertext block of this batch, the next chaining value.
        movdqu  xmm7, [6*16 + input]

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [-32 + ctx], xmm7

        lea input, [7*16 + input]
        lea output, [7*16 + output]
        sub inputLen, 7*16
        jmp loop7
dec1:

        ; Single-block path: xmm3 carries the chaining value in a register.
        movdqu  xmm3, [-32 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        ; Keep a copy of the ciphertext; it becomes the next chaining value.
        movdqa  xmm4, xmm0
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        ; Plaintext = decrypted block XOR chaining value.
        pxor    xmm3, xmm0

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        ; Save the chaining value for a following call.
        movdqu  [-32 + ctx], xmm3
        xor eax, eax
        pop     inputLen
        ret
ENDM

; CBC encryption with 10 rounds (AES-128). The whole body, including the
; ret, is generated by gen_aes_cbc_enc_func.
ALIGN 16
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func  10
intel_aes_encrypt_cbc_128 ENDP

; CBC encryption with 12 rounds (AES-192). The whole body, including the
; ret, is generated by gen_aes_cbc_enc_func.
ALIGN 16
intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func  12
intel_aes_encrypt_cbc_192 ENDP

; CBC encryption with 14 rounds (AES-256). The whole body, including the
; ret, is generated by gen_aes_cbc_enc_func.
ALIGN 16
intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func  14
intel_aes_encrypt_cbc_256 ENDP

; CBC decryption with 10 rounds (AES-128). The whole body, including the
; ret, is generated by gen_aes_cbc_dec_func.
ALIGN 16
intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

; CBC decryption with 12 rounds (AES-192). The whole body, including the
; ret, is generated by gen_aes_cbc_dec_func.
ALIGN 16
intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

; CBC decryption with 14 rounds (AES-256). The whole body, including the
; ret, is generated by gen_aes_cbc_dec_func.
ALIGN 16
intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP



; Extra register roles used only by the CTR code generator.
; ctrCtx - first the pointer to the counter-mode context, later the running
;          counter value in host byte order
ctrCtx textequ <esi>
; CTR    - scratch for the byte-swapped counter word being stored
CTR textequ <ebx>

; gen_aes_ctr_func rnds
; Emits the complete body (prologue through ret) of an AES-CTR routine.
;   rnds - number of AES rounds (10, 12 or 14 for the routines in this
;          file). The seven-block loop issues rounds 1-7 unconditionally,
;          so rnds must be at least 8.
;
; Stack arguments of the generated routine, as read by the code below:
;   arg0        pointer to the counter-mode context: the dword at offset 4
;               points to the AES context (round keys at its offset 44) and
;               the 16 bytes at offset 8 hold the counter block
;   arg1        output buffer
;   arg2, arg3  not read by this code
;   arg4        input buffer
;   arg5        input length in bytes
; Whole 16-byte blocks are processed, seven at a time while at least 7*16
; bytes remain and then one at a time. A trailing partial block is ignored.
; The updated counter block is written back to the context and the routine
; returns 0 in eax.
; NOTE: only the last 32-bit word of the counter block is incremented,
; treated as big-endian, with no carry into the other 12 bytes.
;
; Working storage: seven 16-byte, 16-byte-aligned stack slots. Each holds
; an upcoming counter block already XORed with round key 0. ebp keeps the
; stack pointer from before the allocation.
; Registers: edi, esi, ebx and ebp are saved and restored; eax, ecx, edx,
; flags and xmm0-xmm7 are overwritten.
gen_aes_ctr_func MACRO rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   enc1
LOCAL   bail

        push    inputLen
        push    ctrCtx
        push    CTR
        push    ebp

        ; The 4*5 term skips the four saved registers and the return address.
        mov     ctrCtx, [esp + 4*5 + 0*4]
        mov     output, [esp + 4*5 + 1*4]
        mov     input,  [esp + 4*5 + 4*4]
        mov     inputLen, [esp + 4*5 + 5*4]

        ; ctx = address of round key 0 inside the AES context.
        mov     ctx, [4+ctrCtx]
        lea     ctx, [44+ctx]

        ; Reserve the seven aligned counter slots.
        mov     ebp, esp
        sub     esp, 7*16
        and     esp, -16

        ; xmm0 = counter block. ctrCtx becomes the counter word in host order.
        movdqu  xmm0, [8+ctrCtx]
        mov     ctrCtx, [ctrCtx + 8 + 3*4]
        bswap   ctrCtx
        movdqu  xmm1, [ctx + 0*16]

        ; Fold round key 0 into the counter block once, up front.
        pxor    xmm0, xmm1

        movdqa  [esp + 0*16], xmm0
        movdqa  [esp + 1*16], xmm0
        movdqa  [esp + 2*16], xmm0
        movdqa  [esp + 3*16], xmm0
        movdqa  [esp + 4*16], xmm0
        movdqa  [esp + 5*16], xmm0
        movdqa  [esp + 6*16], xmm0

        ; Slots 1-6 get counter+1 .. counter+6. Only the last dword differs,
        ; and it is stored already XORed with the matching dword of round
        ; key 0.
        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 1*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 2*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 3*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 4*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 5*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 6*16 + 3*4], CTR


loop7:
        ; Seven-block path: taken while at least 7*16 bytes remain.
        cmp     inputLen, 7*16
        jb      loop1

        ; The slots already include round key 0, so no initial pxor is needed.
        movdqu  xmm0, [0*16 + esp]
        movdqu  xmm1, [1*16 + esp]
        movdqu  xmm2, [2*16 + esp]
        movdqu  xmm3, [3*16 + esp]
        movdqu  xmm4, [4*16 + esp]
        movdqu  xmm5, [5*16 + esp]
        movdqu  xmm6, [6*16 + esp]

        ; Rounds 1-7, each interleaved with refilling one slot with the
        ; next counter value for the following batch.
        i = 1
        WHILE i LE 7
            aes_rnd i

            inc     ctrCtx
            mov     CTR, ctrCtx
            bswap   CTR
            xor     CTR, [ctx + 3*4]
            mov     [esp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        ; Remaining rounds, then the final round.
        WHILE i LT rnds
            aes_rnd i
            i = i+1
            ENDM
        aes_last_rnd rnds

        ; XOR the keystream with the input.
        movdqu  xmm7, [0*16 + input]
        pxor    xmm0, xmm7
        movdqu  xmm7, [1*16 + input]
        pxor    xmm1, xmm7
        movdqu  xmm7, [2*16 + input]
        pxor    xmm2, xmm7
        movdqu  xmm7, [3*16 + input]
        pxor    xmm3, xmm7
        movdqu  xmm7, [4*16 + input]
        pxor    xmm4, xmm7
        movdqu  xmm7, [5*16 + input]
        pxor    xmm5, xmm7
        movdqu  xmm7, [6*16 + input]
        pxor    xmm6, xmm7

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6

        lea input, [7*16 + input]
        lea output, [7*16 + output]
        sub inputLen, 7*16
        jmp loop7


loop1:
        ; Single-block path: at most six blocks remain here, so the seven
        ; prepared slots are enough. Each block consumes the slot at esp.
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [esp]
        add     esp, 16

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:

        ; Reload the counter-context pointer (arg0) through ebp. The slot
        ; at esp holds the next unused counter XOR round key 0; undo the
        ; XOR and write the counter block back.
        mov     ctrCtx, [ebp + 4*5 + 0*4]
        movdqu  xmm0, [esp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [8+ctrCtx], xmm0


        ; Return 0, release the slots and restore the saved registers.
        xor     eax, eax
        mov     esp, ebp
        pop     ebp
        pop     CTR
        pop     ctrCtx
        pop     inputLen
        ret
ENDM


; CTR mode with 10 rounds (AES-128). The whole body, including the ret, is
; generated by gen_aes_ctr_func.
ALIGN 16
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func  10
intel_aes_encrypt_ctr_128 ENDP

; CTR mode with 12 rounds (AES-192). The whole body, including the ret, is
; generated by gen_aes_ctr_func.
ALIGN 16
intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func  12
intel_aes_encrypt_ctr_192 ENDP

; CTR mode with 14 rounds (AES-256). The whole body, including the ret, is
; generated by gen_aes_ctr_func.
ALIGN 16
intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func  14
intel_aes_encrypt_ctr_256 ENDP


END