/*
 *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Overview
 *
 * Each multiply/square routine below carries two bodies in one naked
 * function: a plain x86 body (mul/adc with lodsd/stosd) and an SSE2 body
 * (pmuludq/paddq on the mm registers).  The first instructions of every
 * routine pick a body from the cached value of is_sse.
 *
 * Every routine steps through its arrays 4 bytes at a time, i.e. the code
 * treats one mp_digit as one 32-bit word.
 *
 * The functions are declared naked, so each body builds and tears down its
 * own frame (push ebp / mov ebp,esp ... leave / ret) and saves the
 * registers it uses (edi, esi and, where used, ebx) itself.
 */
#include "mpi-priv.h"

/*
 * Cached result of the CPU probe, as used by the dispatch code below:
 *   < 0   not probed yet (initial value is -1); s_mpi_is_sse2 is called
 *         and its return value is stored here
 *   == 0  take the plain x86 body
 *   > 0   take the SSE2 body
 * NOTE(review): is_sse is read and written without any synchronization;
 * presumably harmless because every probe stores the same value - confirm.
 */
static int is_sse = -1;
/* Defined outside this file; called with no arguments, result read from eax. */
extern unsigned long s_mpi_is_sse2();

/*
 * s_mpv_mul_d - multiply a digit vector by a single digit.
 *
 * For each i in [0, a_len): the low 32 bits of (a[i] * b + carry) are
 * stored to c[i] and the high 32 bits become the next carry.  The final
 * carry is stored to c[a_len], so a_len + 1 words of c are written.  The
 * previous contents of c are never read.  When a_len is 0 the only store is
 * a zero word to c[0].
 *
 * Frame of the x86 body, after its prologue:
 *  ebp - 40:	caller's ebx
 *  ebp - 36:	caller's esi
 *  ebp - 32:	caller's edi
 *  ebp - 28 .. ebp - 4:  reserved by "sub esp,28"; never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	b	argument
 *  ebp + 20:	c	argument
 * Frame of the SSE2 body: ebp - 4 is caller's edi, ebp - 8 is caller's esi;
 * the arguments sit at the same positive offsets.
 *
 * Registers, x86 body:
 *  eax:	a[i], then low half of the product / word stored by stosd
 *  ebx:	carry
 *  ecx:	a_len (counts down to 0)
 *  edx:	b, then high half of the product
 *  esi:	a ptr
 *  edi:	c ptr
 * Registers, SSE2 body:
 *  mm0:	a[i], then the 64-bit product
 *  mm1:	b
 *  mm2:	64-bit accumulator; its upper half is the carry
 *  ecx/esi/edi: as above
 */
__declspec(naked) void
s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
{
    __asm {
        /* Dispatch on the cached probe result; probe once if it is negative. */
        mov eax, is_sse
        cmp eax, 0
        je s_mpv_mul_d_x86
        jg s_mpv_mul_d_sse2
        call s_mpi_is_sse2
        mov is_sse, eax
        cmp eax, 0
        jg s_mpv_mul_d_sse2
        /* Probe returned 0 (or a non-positive value): fall into the x86 body. */
    s_mpv_mul_d_x86:
        push ebp
        mov ebp,esp
        sub esp,28
        push edi
        push esi
        push ebx
        mov ebx,0 ; carry = 0
        mov ecx,[ebp+12] ; ecx = a_len
        mov edi,[ebp+20]
        cmp ecx,0
        je L_2 ; jmp if a_len == 0
        mov esi,[ebp+8] ; esi = a
        /* Clear the direction flag so lodsd/stosd step upward. */
        cld
    L_1:
        lodsd ; eax = [ds:esi]; esi += 4
        mov edx,[ebp+16] ; edx = b
        mul edx ; edx:eax = Phi:Plo = a_i * b
        add eax,ebx ; add carry (ebx) to edx:eax
        adc edx,0
        mov ebx,edx ; high half of product becomes next carry
        /* stosd stores the full 32-bit eax and advances edi by 4. */
        stosd ; [es:edi] = ax; edi += 4;
        dec ecx ; --a_len
        jnz L_1 ; jmp if a_len != 0
    L_2:
        /* edi now points one word past the last product word written. */
        mov [edi],ebx ; *c = carry
        pop ebx
        pop esi
        pop edi
        leave
        ret
        nop
    s_mpv_mul_d_sse2:
        push ebp
        mov ebp, esp
        push edi
        push esi
        psubq mm2, mm2 ; carry = 0
        mov ecx, [ebp+12] ; ecx = a_len
        movd mm1, [ebp+16] ; mm1 = b
        mov edi, [ebp+20]
        cmp ecx, 0
        je L_6 ; jmp if a_len == 0
        mov esi, [ebp+8] ; esi = a
        /* No string instruction follows in this body; the cld has no visible use here. */
        cld
    L_5:
        movd mm0, [esi] ; mm0 = *a++
        add esi, 4
        pmuludq mm0, mm1 ; mm0 = b * *a++
        paddq mm2, mm0 ; add the carry
        movd [edi], mm2 ; store the 32bit result
        add edi, 4
        /* Keep only the upper 32 bits: they are the carry into the next word. */
        psrlq mm2, 32 ; save the carry
        dec ecx ; --a_len
        jnz L_5 ; jmp if a_len != 0
    L_6:
        movd [edi], mm2 ; *c = carry
        /* Leave MMX state before returning. */
        emms
        pop esi
        pop edi
        leave
        ret
        nop
    }
}

/*
 * s_mpv_mul_d_add - multiply a digit vector by a single digit and add the
 * product into c.
 *
 * For each i in [0, a_len): the low 32 bits of (a[i] * b + carry + c[i])
 * are stored back to c[i] and the high 32 bits become the next carry.  The
 * final carry is then STORED to c[a_len] (it overwrites that word; it is
 * not added to it).  When a_len is 0 the only store is a zero word to c[0].
 *
 * Frame of the x86 body, after its prologue:
 *  ebp - 40:	caller's ebx
 *  ebp - 36:	caller's esi
 *  ebp - 32:	caller's edi
 *  ebp - 28 .. ebp - 4:  reserved by "sub esp,28"; never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	b	argument
 *  ebp + 20:	c	argument
 * Frame of the SSE2 body: ebp - 4 is caller's edi, ebp - 8 is caller's esi;
 * the arguments sit at the same positive offsets.
 *
 * Registers, x86 body:
 *  eax:	a[i], then low half of the running sum / word stored by stosd
 *  ebx:	carry; also used as scratch for the current c[i]
 *  ecx:	a_len (counts down to 0)
 *  edx:	b, then high half of the running sum
 *  esi:	a ptr
 *  edi:	c ptr
 * Registers, SSE2 body:
 *  mm0:	a[i], then the product, then c[i]
 *  mm1:	b
 *  mm2:	64-bit accumulator; its upper half is the carry
 *  ecx/esi/edi: as above
 */
__declspec(naked) void
s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
{
    __asm {
        /* Dispatch on the cached probe result; probe once if it is negative. */
        mov eax, is_sse
        cmp eax, 0
        je s_mpv_mul_d_add_x86
        jg s_mpv_mul_d_add_sse2
        call s_mpi_is_sse2
        mov is_sse, eax
        cmp eax, 0
        jg s_mpv_mul_d_add_sse2
    s_mpv_mul_d_add_x86:
        push ebp
        mov ebp,esp
        sub esp,28
        push edi
        push esi
        push ebx
        mov ebx,0 ; carry = 0
        mov ecx,[ebp+12] ; ecx = a_len
        mov edi,[ebp+20]
        cmp ecx,0
        je L_11 ; jmp if a_len == 0
        mov esi,[ebp+8] ; esi = a
        /* Clear the direction flag so lodsd/stosd step upward. */
        cld
    L_10:
        lodsd ; eax = [ds:esi]; esi += 4
        mov edx,[ebp+16] ; edx = b
        mul edx ; edx:eax = Phi:Plo = a_i * b
        add eax,ebx ; add carry (ebx) to edx:eax
        adc edx,0
        /* The old carry has been consumed, so ebx is free to hold c[i]. */
        mov ebx,[edi] ; add in current word from *c
        add eax,ebx
        adc edx,0
        mov ebx,edx ; high half of product becomes next carry
        /* stosd stores the full 32-bit eax and advances edi by 4. */
        stosd ; [es:edi] = ax; edi += 4;
        dec ecx ; --a_len
        jnz L_10 ; jmp if a_len != 0
    L_11:
        /* Overwrites c[a_len] with the carry; the old word is not added in. */
        mov [edi],ebx ; *c = carry
        pop ebx
        pop esi
        pop edi
        leave
        ret
        nop
    s_mpv_mul_d_add_sse2:
        push ebp
        mov ebp, esp
        push edi
        push esi
        psubq mm2, mm2 ; carry = 0
        mov ecx, [ebp+12] ; ecx = a_len
        movd mm1, [ebp+16] ; mm1 = b
        mov edi, [ebp+20]
        cmp ecx, 0
        je L_16 ; jmp if a_len == 0
        mov esi, [ebp+8] ; esi = a
        /* No string instruction follows in this body; the cld has no visible use here. */
        cld
    L_15:
        movd mm0, [esi] ; mm0 = *a++
        add esi, 4
        pmuludq mm0, mm1 ; mm0 = b * *a++
        paddq mm2, mm0 ; add the carry
        /* mm0 is reused to hold c[i]; the second paddq adds that word in. */
        movd mm0, [edi]
        paddq mm2, mm0 ; add the carry
        movd [edi], mm2 ; store the 32bit result
        add edi, 4
        psrlq mm2, 32 ; save the carry
        dec ecx ; --a_len
        jnz L_15 ; jmp if a_len != 0
    L_16:
        movd [edi], mm2 ; *c = carry
        /* Leave MMX state before returning. */
        emms
        pop esi
        pop edi
        leave
        ret
        nop
    }
}

/*
 * s_mpv_mul_d_add_prop - multiply a digit vector by a single digit, add the
 * product into c, and propagate the final carry through c.
 *
 * The main loop is the same as in s_mpv_mul_d_add: for each i in
 * [0, a_len) the low 32 bits of (a[i] * b + carry + c[i]) are stored to
 * c[i].  Afterwards, if the carry is non-zero it is ADDED to c[a_len], and
 * as long as an addition carries out, 1 is added to the following word.
 * The propagation loop has no length limit of its own; it stops only when
 * an addition produces no carry.
 * NOTE(review): presumably callers guarantee c is long enough to absorb
 * the carry - confirm against callers.
 *
 * Frame of the x86 body, after its prologue:
 *  ebp - 40:	caller's ebx
 *  ebp - 36:	caller's esi
 *  ebp - 32:	caller's edi
 *  ebp - 28 .. ebp - 4:  reserved by "sub esp,28"; never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	b	argument
 *  ebp + 20:	c	argument
 * Frame of the SSE2 body: ebp - 4 is caller's edi, ebp - 8 is caller's esi,
 * ebp - 12 is caller's ebx; the arguments sit at the same positive offsets.
 *
 * Registers, x86 body:
 *  eax:	a[i], then low half of the running sum / word stored by stosd
 *  ebx:	carry; also used as scratch for the current c[i]
 *  ecx:	a_len (counts down to 0)
 *  edx:	b, then high half of the running sum
 *  esi:	a ptr
 *  edi:	c ptr
 * Registers, SSE2 body:
 *  mm0:	a[i], then the product
 *  mm1:	b
 *  mm2:	64-bit accumulator; its upper half is the carry
 *  mm3:	c[i]
 *  ebx:	final carry, moved out of mm2 for the propagation loop
 *  eax:	word being updated during propagation
 */
__declspec(naked) void
s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
{
    __asm {
        /* Dispatch on the cached probe result; probe once if it is negative. */
        mov eax, is_sse
        cmp eax, 0
        je s_mpv_mul_d_add_prop_x86
        jg s_mpv_mul_d_add_prop_sse2
        call s_mpi_is_sse2
        mov is_sse, eax
        cmp eax, 0
        jg s_mpv_mul_d_add_prop_sse2
    s_mpv_mul_d_add_prop_x86:
        push ebp
        mov ebp,esp
        sub esp,28
        push edi
        push esi
        push ebx
        mov ebx,0 ; carry = 0
        mov ecx,[ebp+12] ; ecx = a_len
        mov edi,[ebp+20]
        cmp ecx,0
        /* With a_len == 0 the carry is 0, so L_21 skips straight to the exit. */
        je L_21 ; jmp if a_len == 0
        cld
        mov esi,[ebp+8] ; esi = a
    L_20:
        lodsd ; eax = [ds:esi]; esi += 4
        mov edx,[ebp+16] ; edx = b
        mul edx ; edx:eax = Phi:Plo = a_i * b
        add eax,ebx ; add carry (ebx) to edx:eax
        adc edx,0
        mov ebx,[edi] ; add in current word from *c
        add eax,ebx
        adc edx,0
        mov ebx,edx ; high half of product becomes next carry
        /* stosd stores the full 32-bit eax and advances edi by 4. */
        stosd ; [es:edi] = ax; edi += 4;
        dec ecx ; --a_len
        jnz L_20 ; jmp if a_len != 0
    L_21:
        cmp ebx,0 ; is carry zero?
        jz L_23
        mov eax,[edi] ; add in current word from *c
        add eax,ebx
        /* stosd leaves the flags alone, so jnc still tests the add above. */
        stosd ; [es:edi] = ax; edi += 4;
        jnc L_23
    L_22:
        /* Entered with CF set: add the carry bit into each following word. */
        mov eax,[edi] ; add in current word from *c
        adc eax,0
        stosd ; [es:edi] = ax; edi += 4;
        jc L_22
    L_23:
        pop ebx
        pop esi
        pop edi
        leave
        ret
        nop
    s_mpv_mul_d_add_prop_sse2:
        push ebp
        mov ebp, esp
        push edi
        push esi
        push ebx
        psubq mm2, mm2 ; carry = 0
        mov ecx, [ebp+12] ; ecx = a_len
        movd mm1, [ebp+16] ; mm1 = b
        mov edi, [ebp+20]
        cmp ecx, 0
        je L_26 ; jmp if a_len == 0
        mov esi, [ebp+8] ; esi = a
        /* Needed here: the propagation code below uses stosd. */
        cld
    L_25:
        movd mm0, [esi] ; mm0 = *a++
        movd mm3, [edi] ; fetch the sum
        add esi, 4
        pmuludq mm0, mm1 ; mm0 = b * *a++
        paddq mm2, mm0 ; add the carry
        paddq mm2, mm3 ; add *c++
        movd [edi], mm2 ; store the 32bit result
        add edi, 4
        psrlq mm2, 32 ; save the carry
        dec ecx ; --a_len
        jnz L_25 ; jmp if a_len != 0
    L_26:
        /* Move the carry to an integer register; the rest mirrors the x86 body. */
        movd ebx, mm2
        cmp ebx, 0 ; is carry zero?
        jz L_28
        mov eax, [edi]
        add eax, ebx
        stosd
        jnc L_28
    L_27:
        mov eax, [edi] ; add in current word from *c
        adc eax, 0
        stosd ; [es:edi] = ax; edi += 4;
        jc L_27
    L_28:
        /* Leave MMX state before returning. */
        emms
        pop ebx
        pop esi
        pop edi
        leave
        ret
        nop
    }
}

/*
 * s_mpv_sqr_add_prop - add the square of every digit of a into sqrs and
 * propagate the final carry.
 *
 * For each i in [0, a_len): the 64-bit value a[i] * a[i], plus the carry
 * from the previous step, is added to the two words sqrs[2*i] (low) and
 * sqrs[2*i + 1] (high); the carry out of the high word goes into the next
 * step.  After the loop a non-zero carry is added to the next word of sqrs
 * and, while an addition carries out, 1 is added to the following word.
 * The propagation loop has no length limit of its own.
 * NOTE(review): presumably callers guarantee sqrs is long enough to absorb
 * the carry - confirm against callers.
 *
 * Frame of the x86 body, after its prologue:
 *  ebp - 24:	caller's ebx
 *  ebp - 20:	caller's esi
 *  ebp - 16:	caller's edi
 *  ebp - 12 .. ebp - 4:  reserved by "sub esp,12"; never referenced (the
 *			carry is kept in ebx and a_len in ecx)
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	sqrs	argument
 * Frame of the SSE2 body: ebp - 4 is caller's edi, ebp - 8 is caller's esi,
 * ebp - 12 is caller's ebx; the arguments sit at the same positive offsets.
 *
 * Registers, x86 body:
 *  eax:	a[i], then low half of the square / word stored by stosd
 *  ebx:	carry; also used as scratch for the current sqrs words
 *  ecx:	a_len (counts down to 0)
 *  edx:	high half of the square
 *  esi:	a ptr
 *  edi:	sqrs ptr
 * Registers, SSE2 body:
 *  mm0:	a[i], then its square
 *  mm2:	64-bit accumulator; its upper half is the carry
 *  mm3:	current low, then high, word of sqrs
 *  ebx:	final carry, moved out of mm2 for the propagation loop
 */
__declspec(naked) void
s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
{
    __asm {
        /* Dispatch on the cached probe result; probe once if it is negative. */
        mov eax, is_sse
        cmp eax, 0
        je s_mpv_sqr_add_prop_x86
        jg s_mpv_sqr_add_prop_sse2
        call s_mpi_is_sse2
        mov is_sse, eax
        cmp eax, 0
        jg s_mpv_sqr_add_prop_sse2
    s_mpv_sqr_add_prop_x86:
        push ebp
        mov ebp,esp
        sub esp,12
        push edi
        push esi
        push ebx
        mov ebx,0 ; carry = 0
        mov ecx,[ebp+12] ; a_len
        mov edi,[ebp+16] ; edi = ps
        cmp ecx,0
        je L_31 ; jump if a_len == 0
        cld
        mov esi,[ebp+8] ; esi = pa
    L_30:
        /* lodsd/stosd below operate on the full 32-bit esi/edi and eax. */
        lodsd ; eax = [ds:si]; si += 4;
        mul eax
        add eax,ebx ; add "carry"
        adc edx,0
        mov ebx,[edi]
        add eax,ebx ; add low word from result
        /*
         * The mov and stosd that follow do not change the flags, so the
         * adc after them still consumes the carry of the add above.  Do
         * not reorder these instructions.
         */
        mov ebx,[edi+4]
        stosd ; [es:di] = eax; di += 4;
        adc edx,ebx ; add high word from result
        /* mov keeps CF intact; the adc turns it into the next carry (0 or 1). */
        mov ebx,0
        mov eax,edx
        adc ebx,0
        stosd ; [es:di] = eax; di += 4;
        dec ecx ; --a_len
        jnz L_30 ; jmp if a_len != 0
    L_31:
        cmp ebx,0 ; is carry zero?
        jz L_34
        mov eax,[edi] ; add in current word from *c
        add eax,ebx
        stosd ; [es:edi] = ax; edi += 4;
        jnc L_34
    L_32:
        /* Entered with CF set: add the carry bit into each following word. */
        mov eax,[edi] ; add in current word from *c
        adc eax,0
        stosd ; [es:edi] = ax; edi += 4;
        jc L_32
    L_34:
        pop ebx
        pop esi
        pop edi
        leave
        ret
        nop
    s_mpv_sqr_add_prop_sse2:
        push ebp
        mov ebp, esp
        push edi
        push esi
        push ebx
        psubq mm2, mm2 ; carry = 0
        mov ecx, [ebp+12] ; ecx = a_len
        mov edi, [ebp+16]
        cmp ecx, 0
        je L_36 ; jmp if a_len == 0
        mov esi, [ebp+8] ; esi = a
        /* Needed here: the propagation code below uses stosd. */
        cld
    L_35:
        movd mm0, [esi] ; mm0 = *a
        movd mm3, [edi] ; fetch the sum
        add esi, 4
        pmuludq mm0, mm0 ; mm0 = sqr(a)
        paddq mm2, mm0 ; add the carry
        paddq mm2, mm3 ; add the low word
        /* The high word is fetched before the low word is written back. */
        movd mm3, [edi+4]
        movd [edi], mm2 ; store the 32bit result
        psrlq mm2, 32
        paddq mm2, mm3 ; add the high word
        movd [edi+4], mm2 ; store the 32bit result
        psrlq mm2, 32 ; save the carry.
        add edi, 8
        dec ecx ; --a_len
        jnz L_35 ; jmp if a_len != 0
    L_36:
        /* Move the carry to an integer register; the rest mirrors the x86 body. */
        movd ebx, mm2
        cmp ebx, 0 ; is carry zero?
        jz L_38
        mov eax, [edi]
        add eax, ebx
        stosd
        jnc L_38
    L_37:
        mov eax, [edi] ; add in current word from *c
        adc eax, 0
        stosd ; [es:edi] = ax; edi += 4;
        jc L_37
    L_38:
        /* Leave MMX state before returning. */
        emms
        pop ebx
        pop esi
        pop edi
        leave
        ret
        nop
    }
}

/*
 * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
 * so its high bit is 1.  This code is from NSPR.
 *
 * s_mpv_div_2dx1d stores the 32-bit quotient to *qp and the 32-bit
 * remainder to *rp, and always returns zero in eax.
 *
 * The x86 div instruction raises a divide error when the quotient does not
 * fit in 32 bits, i.e. when Nhi >= divisor (this includes divisor == 0).
 * Nothing in this routine checks for that.
 * NOTE(review): presumably every caller guarantees Nhi < divisor - confirm.
 *
 * No frame pointer is set up; after "push ebx" the stack looks like this:
 *  esp +  0:   Caller's ebx
 *  esp +  4:	return address
 *  esp +  8:	Nhi	argument
 *  esp + 12:	Nlo	argument
 *  esp + 16:	divisor	argument
 *  esp + 20:	qp	argument
 *  esp + 24:   rp	argument
 * registers:
 *  eax:	Nlo, then the quotient, finally the return value 0
 *  ebx:	divisor, then scratch for the qp and rp pointers
 *  edx:	Nhi, then the remainder
 *  ecx, esi, edi: not used
 */
__declspec(naked) mp_err
s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
                mp_digit *qp, mp_digit *rp)
{
    __asm {
        push ebx
        /* Load the dividend into edx:eax and the divisor into ebx. */
        mov edx,[esp+8]
        mov eax,[esp+12]
        mov ebx,[esp+16]
        div ebx
        /* eax = quotient, edx = remainder; ebx is reused for the out pointers. */
        mov ebx,[esp+20]
        mov [ebx],eax
        mov ebx,[esp+24]
        mov [ebx],edx
        xor eax,eax ; return zero
        pop ebx
        ret
        nop
    }
}