#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

.data
.align 4
 #
 # -1 means to call s_mpi_is_sse to determine if we support sse 
 #    instructions.
 #  0 means to use x86 instructions
 #  1 means to use sse2 instructions
.type	is_sse,@object
.size	is_sse,4
is_sse: .long	-1 

#
# sigh, handle the difference between -fPIC and not PIC
# default to pic, since this file seems to be exclusively
# linux right now (solaris uses mpi_i86pc.s and windows uses
# mpi_x86_asm.c)
#
.ifndef NO_PIC
.macro GET   var,reg
    movl   \var@GOTOFF(%ebx),\reg
.endm
.macro PUT   reg,var
    movl   \reg,\var@GOTOFF(%ebx)
.endm
.else
.macro GET   var,reg
    movl   \var,\reg
.endm
.macro PUT   reg,var
    movl   \reg,\var
.endm
.endif

.text


 #  ebp - 36:	caller's esi
 #  ebp - 32:	caller's edi
 #  ebp - 28:	
 #  ebp - 24:	
 #  ebp - 20:	
 #  ebp - 16:	
 #  ebp - 12:	
 #  ebp - 8:	
 #  ebp - 4:	
 #  ebp + 0:	caller's ebp
 #  ebp + 4:	return address
 #  ebp + 8:	a	argument
 #  ebp + 12:	a_len	argument
 #  ebp + 16:	b	argument
 #  ebp + 20:	c	argument
 #  registers:
 # 	eax:
 #	ebx:	carry
 #	ecx:	a_len
 #	edx:
 #	esi:	a ptr
 #	edi:	c ptr
.globl	s_mpv_mul_d
.type	s_mpv_mul_d,@function
s_mpv_mul_d:
    GET    is_sse,%eax
    cmp    $0,%eax
    je     s_mpv_mul_d_x86
    jg     s_mpv_mul_d_sse2
    call   s_mpi_is_sse2
    PUT    %eax,is_sse
    cmp    $0,%eax
    jg     s_mpv_mul_d_sse2
s_mpv_mul_d_x86:
    push   %ebp
    mov    %esp,%ebp
    sub    $28,%esp
    push   %edi
    push   %esi
    push   %ebx
    movl   $0,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    20(%ebp),%edi
    cmp    $0,%ecx
    je     2f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld
1:
    lodsl			# eax = [ds:esi]; esi += 4
    mov    16(%ebp),%edx	# edx = b
    mull   %edx			# edx:eax = Phi:Plo = a_i * b

    add    %ebx,%eax		# add carry (%ebx) to edx:eax
    adc    $0,%edx
    mov    %edx,%ebx		# high half of product becomes next carry

    stosl			# [es:edi] = ax; edi += 4;
    dec    %ecx			# --a_len
    jnz    1b			# jmp if a_len != 0
2:
    mov    %ebx,0(%edi)		# *c = carry
    pop    %ebx
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop
s_mpv_mul_d_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    movd   16(%ebp),%mm1	# mm1 = b
    mov    20(%ebp),%edi
    cmp    $0,%ecx
    je     6f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld
5:
    movd   0(%esi),%mm0         # mm0 = *a++
    add    $4,%esi
    pmuludq %mm1,%mm0           # mm0 = b * *a++
    paddq  %mm0,%mm2            # add the carry
    movd   %mm2,0(%edi)         # store the 32bit result
    add    $4,%edi
    psrlq  $32, %mm2		# save the carry
    dec    %ecx			# --a_len
    jnz    5b			# jmp if a_len != 0
6:
    movd   %mm2,0(%edi)		# *c = carry
    emms
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop

 #  ebp - 36:	caller's esi
 #  ebp - 32:	caller's edi
 #  ebp - 28:	
 #  ebp - 24:	
 #  ebp - 20:	
 #  ebp - 16:	
 #  ebp - 12:	
 #  ebp - 8:	
 #  ebp - 4:	
 #  ebp + 0:	caller's ebp
 #  ebp + 4:	return address
 #  ebp + 8:	a	argument
 #  ebp + 12:	a_len	argument
 #  ebp + 16:	b	argument
 #  ebp + 20:	c	argument
 #  registers:
 # 	eax:
 #	ebx:	carry
 #	ecx:	a_len
 #	edx:
 #	esi:	a ptr
 #	edi:	c ptr
.globl	s_mpv_mul_d_add
.type	s_mpv_mul_d_add,@function
s_mpv_mul_d_add:
    GET    is_sse,%eax
    cmp    $0,%eax
    je     s_mpv_mul_d_add_x86
    jg     s_mpv_mul_d_add_sse2
    call   s_mpi_is_sse2
    PUT    %eax,is_sse
    cmp    $0,%eax
    jg     s_mpv_mul_d_add_sse2
s_mpv_mul_d_add_x86:
    push   %ebp
    mov    %esp,%ebp
    sub    $28,%esp
    push   %edi
    push   %esi
    push   %ebx
    movl   $0,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    20(%ebp),%edi
    cmp    $0,%ecx
    je     11f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld
10:
    lodsl			# eax = [ds:esi]; esi += 4
    mov    16(%ebp),%edx	# edx = b
    mull   %edx			# edx:eax = Phi:Plo = a_i * b

    add    %ebx,%eax		# add carry (%ebx) to edx:eax
    adc    $0,%edx
    mov    0(%edi),%ebx		# add in current word from *c
    add    %ebx,%eax		
    adc    $0,%edx
    mov    %edx,%ebx		# high half of product becomes next carry

    stosl			# [es:edi] = ax; edi += 4;
    dec    %ecx			# --a_len
    jnz    10b			# jmp if a_len != 0
11:
    mov    %ebx,0(%edi)		# *c = carry
    pop    %ebx
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop
s_mpv_mul_d_add_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    movd   16(%ebp),%mm1	# mm1 = b
    mov    20(%ebp),%edi
    cmp    $0,%ecx
    je     16f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld
15:
    movd   0(%esi),%mm0         # mm0 = *a++
    add    $4,%esi
    pmuludq %mm1,%mm0           # mm0 = b * *a++
    paddq  %mm0,%mm2            # add the carry
    movd   0(%edi),%mm0
    paddq  %mm0,%mm2            # add the carry
    movd   %mm2,0(%edi)         # store the 32bit result
    add    $4,%edi
    psrlq  $32, %mm2		# save the carry
    dec    %ecx			# --a_len
    jnz    15b			# jmp if a_len != 0
16:
    movd   %mm2,0(%edi)		# *c = carry
    emms
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop

 #  ebp - 8:	caller's esi
 #  ebp - 4:	caller's edi
 #  ebp + 0:	caller's ebp
 #  ebp + 4:	return address
 #  ebp + 8:	a	argument
 #  ebp + 12:	a_len	argument
 #  ebp + 16:	b	argument
 #  ebp + 20:	c	argument
 #  registers:
 # 	eax:
 #	ebx:	carry
 #	ecx:	a_len
 #	edx:
 #	esi:	a ptr
 #	edi:	c ptr
.globl	s_mpv_mul_d_add_prop
.type	s_mpv_mul_d_add_prop,@function
s_mpv_mul_d_add_prop:
    GET    is_sse,%eax
    cmp    $0,%eax
    je     s_mpv_mul_d_add_prop_x86
    jg     s_mpv_mul_d_add_prop_sse2
    call   s_mpi_is_sse2
    PUT    %eax,is_sse
    cmp    $0,%eax
    jg     s_mpv_mul_d_add_prop_sse2
s_mpv_mul_d_add_prop_x86:
    push   %ebp
    mov    %esp,%ebp
    sub    $28,%esp
    push   %edi
    push   %esi
    push   %ebx
    movl   $0,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    20(%ebp),%edi
    cmp    $0,%ecx
    je     21f			# jmp if a_len == 0
    cld
    mov    8(%ebp),%esi		# esi = a
20:
    lodsl			# eax = [ds:esi]; esi += 4
    mov    16(%ebp),%edx	# edx = b
    mull   %edx			# edx:eax = Phi:Plo = a_i * b

    add    %ebx,%eax		# add carry (%ebx) to edx:eax
    adc    $0,%edx
    mov    0(%edi),%ebx		# add in current word from *c
    add    %ebx,%eax		
    adc    $0,%edx
    mov    %edx,%ebx		# high half of product becomes next carry

    stosl			# [es:edi] = ax; edi += 4;
    dec    %ecx			# --a_len
    jnz    20b			# jmp if a_len != 0
21:
    cmp    $0,%ebx		# is carry zero?
    jz     23f
    mov    0(%edi),%eax		# add in current word from *c
    add	   %ebx,%eax
    stosl			# [es:edi] = ax; edi += 4;
    jnc    23f
22:
    mov    0(%edi),%eax		# add in current word from *c
    adc	   $0,%eax
    stosl			# [es:edi] = ax; edi += 4;
    jc     22b
23:
    pop    %ebx
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop
s_mpv_mul_d_add_prop_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    movd   16(%ebp),%mm1	# mm1 = b
    mov    20(%ebp),%edi
    cmp    $0,%ecx
    je     26f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld
25:
    movd   0(%esi),%mm0         # mm0 = *a++
    movd   0(%edi),%mm3		# fetch the sum
    add    $4,%esi
    pmuludq %mm1,%mm0           # mm0 = b * *a++
    paddq  %mm0,%mm2            # add the carry
    paddq  %mm3,%mm2            # add *c++
    movd   %mm2,0(%edi)         # store the 32bit result
    add    $4,%edi
    psrlq  $32, %mm2		# save the carry
    dec    %ecx			# --a_len
    jnz    25b			# jmp if a_len != 0
26:
    movd   %mm2,%ebx
    cmp    $0,%ebx		# is carry zero?
    jz     28f
    mov    0(%edi),%eax
    add    %ebx, %eax
    stosl
    jnc    28f
27:
    mov    0(%edi),%eax		# add in current word from *c
    adc	   $0,%eax
    stosl			# [es:edi] = ax; edi += 4;
    jc     27b
28:
    emms
    pop    %ebx
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop


 #  ebp - 20:	caller's esi
 #  ebp - 16:	caller's edi
 #  ebp - 12:	
 #  ebp - 8:	carry
 #  ebp - 4:	a_len	local
 #  ebp + 0:	caller's ebp
 #  ebp + 4:	return address
 #  ebp + 8:	pa	argument
 #  ebp + 12:	a_len	argument
 #  ebp + 16:	ps	argument
 #  ebp + 20:	
 #  registers:
 # 	eax:
 #	ebx:	carry
 #	ecx:	a_len
 #	edx:
 #	esi:	a ptr
 #	edi:	c ptr

.globl	s_mpv_sqr_add_prop
.type	s_mpv_sqr_add_prop,@function
s_mpv_sqr_add_prop:
     GET   is_sse,%eax
     cmp    $0,%eax
     je     s_mpv_sqr_add_prop_x86
     jg     s_mpv_sqr_add_prop_sse2
     call   s_mpi_is_sse2
     PUT    %eax,is_sse
     cmp    $0,%eax
     jg     s_mpv_sqr_add_prop_sse2
s_mpv_sqr_add_prop_x86:
     push   %ebp
     mov    %esp,%ebp
     sub    $12,%esp
     push   %edi
     push   %esi
     push   %ebx
     movl   $0,%ebx		# carry = 0
     mov    12(%ebp),%ecx	# a_len
     mov    16(%ebp),%edi	# edi = ps
     cmp    $0,%ecx
     je     31f			# jump if a_len == 0
     cld
     mov    8(%ebp),%esi	# esi = pa
30:
     lodsl			# %eax = [ds:si]; si += 4;
     mull   %eax

     add    %ebx,%eax		# add "carry"
     adc    $0,%edx
     mov    0(%edi),%ebx
     add    %ebx,%eax		# add low word from result
     mov    4(%edi),%ebx
     stosl			# [es:di] = %eax; di += 4;
     adc    %ebx,%edx		# add high word from result
     movl   $0,%ebx
     mov    %edx,%eax
     adc    $0,%ebx
     stosl			# [es:di] = %eax; di += 4;
     dec    %ecx		# --a_len
     jnz    30b			# jmp if a_len != 0
31:
    cmp    $0,%ebx		# is carry zero?
    jz     34f
    mov    0(%edi),%eax		# add in current word from *c
    add	   %ebx,%eax
    stosl			# [es:edi] = ax; edi += 4;
    jnc    34f
32:
    mov    0(%edi),%eax		# add in current word from *c
    adc	   $0,%eax
    stosl			# [es:edi] = ax; edi += 4;
    jc     32b
34:
    pop    %ebx
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop
s_mpv_sqr_add_prop_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    16(%ebp),%edi
    cmp    $0,%ecx
    je     36f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld
35:
    movd   0(%esi),%mm0        # mm0 = *a
    movd   0(%edi),%mm3	       # fetch the sum
    add	   $4,%esi
    pmuludq %mm0,%mm0          # mm0 = sqr(a)
    paddq  %mm0,%mm2           # add the carry
    paddq  %mm3,%mm2           # add the low word
    movd   4(%edi),%mm3
    movd   %mm2,0(%edi)        # store the 32bit result
    psrlq  $32, %mm2	
    paddq  %mm3,%mm2           # add the high word
    movd   %mm2,4(%edi)        # store the 32bit result
    psrlq  $32, %mm2	       # save the carry.
    add    $8,%edi
    dec    %ecx			# --a_len
    jnz    35b			# jmp if a_len != 0
36:
    movd   %mm2,%ebx
    cmp    $0,%ebx		# is carry zero?
    jz     38f
    mov    0(%edi),%eax
    add    %ebx, %eax
    stosl
    jnc    38f
37:
    mov    0(%edi),%eax		# add in current word from *c
    adc	   $0,%eax
    stosl			# [es:edi] = ax; edi += 4;
    jc     37b
38:
    emms
    pop    %ebx
    pop    %esi
    pop    %edi
    leave  
    ret    
    nop

 #
 # Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
 # so its high bit is 1.   This code is from NSPR.
 #
 # mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
 # 		          mp_digit *qp, mp_digit *rp)

 #  esp +  0:   Caller's ebx
 #  esp +  4:	return address
 #  esp +  8:	Nhi	argument
 #  esp + 12:	Nlo	argument
 #  esp + 16:	divisor	argument
 #  esp + 20:	qp	argument
 #  esp + 24:   rp	argument
 #  registers:
 # 	eax:
 #	ebx:	carry
 #	ecx:	a_len
 #	edx:
 #	esi:	a ptr
 #	edi:	c ptr
 # 

.globl	s_mpv_div_2dx1d
.type	s_mpv_div_2dx1d,@function
s_mpv_div_2dx1d:
       push   %ebx
       mov    8(%esp),%edx
       mov    12(%esp),%eax
       mov    16(%esp),%ebx
       div    %ebx
       mov    20(%esp),%ebx
       mov    %eax,0(%ebx)
       mov    24(%esp),%ebx
       mov    %edx,0(%ebx)
       xor    %eax,%eax		# return zero
       pop    %ebx
       ret    
       nop
  
 # Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous