Blame mpn/x86/pentium4/sse2/dive_1.asm

Packit 5c3484
dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C P4: 19.0 cycles/limb


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
C being on the dependent chain and there being plenty of cycles available,
C using an unaligned movq on every second iteration measured about 23 c/l.
C
C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
C that might be hidden by out-of-order execution, whereas divl is around 60.
C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
C faster.
Packit 5c3484
Packit 5c3484
C Stack slots of the four arguments, in prototype order dst, src, size,
C divisor (4 bytes apart, dst lowest).
C NOTE(review): presumably these are offsets from the stack pointer at entry,
C adjusted by the current FRAME value -- confirm against the defframe macro.
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)
Packit 5c3484
Packit 5c3484
	TEXT

	ALIGN(16)
C -----------------------------------------------------------------------------
C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C Divide the size-limb number at src by the single limb divisor and store
C the size-limb quotient at dst.  The method only yields the true quotient
C when the division is exact.
C
C In:     PARAM_DST, PARAM_SRC, PARAM_SIZE, PARAM_DIVISOR on the stack
C Out:    quotient limbs written to dst, no return value is set up
C Writes: eax, ecx, edx, flags, and on the size>=2 path mm0 to mm7
C         (emms is executed before returning from that path)
C
C NOTE(review): nothing here checks size >= 1 or divisor != 0, presumably
C the caller guarantees both.  size==0 would run the loop with a wrapped
C counter, divisor==0 would fault in divl or leave bsfl undefined.
C
C Method for size >= 2:
C   1. Strip the trailing zero bits from the divisor, giving an odd d and a
C      shift count.
C   2. Build inverse with d*inverse == 1 mod 2^32, from an 8 bit table
C      lookup refined by two Newton steps (8 -> 16 -> 32 bits).
C   3. Working from the low limb upwards, form
C        s = (src >> shift) limb - carry bit - carry limb
C        q = s * inverse mod 2^32
C      with the next carry limb being the high half of q*d and the next
C      carry bit being the borrow out of the subtraction.
C -----------------------------------------------------------------------------
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	movl	PARAM_DIVISOR, %ecx
	subl	$1, %edx		C edx = size-1, zero means one limb
	jnz	L(two_or_more)

	C size==1: a plain hardware divide, no MMX state is touched so no
	C emms is needed on this path.

	movl	(%eax), %eax
	xorl	%edx, %edx		C high half of the dividend is zero

	divl	%ecx
	movl	PARAM_DST, %ecx

	movl	%eax, (%ecx)
	ret


L(two_or_more):
	C eax	src
	C ebx
	C ecx	divisor
	C edx	size-1

	C eax and ecx are reused as scratch from here on, src and dst are
	C reloaded from the stack once the inverse is under way.

	movl	%ecx, %eax
	bsfl	%ecx, %ecx		C trailing twos

	shrl	%cl, %eax		C d = divisor without twos
	movd	%eax, %mm6
	movd	%ecx, %mm7		C shift

	shrl	%eax			C d/2

	andl	$127, %eax		C d/2, 7 bits

	C d is odd, so bit 0 carries no information and bits 1 to 7 index
	C the table of 8 bit inverses.  In the PIC case ecx is free for the
	C table address because the shift is already saved in mm7.

ifdef(`PIC',`
	LEA(	binvert_limb_table, %ecx)
	movzbl	(%eax,%ecx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C First Newton step, 8 -> 16 bits.  The paddd/psubd work on the low
	C 32 bits, which is all that is needed mod 2^32.

	movd	%eax, %mm5		C inv

	movd	%eax, %mm0		C inv

	pmuludq	%mm5, %mm5		C inv*inv

	C

	pmuludq	%mm6, %mm5		C inv*inv*d
	paddd	%mm0, %mm0		C 2*inv

	C

	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d

	C Second Newton step, 16 -> 32 bits.  pxor+paddd copies mm0 to mm5,
	C and the mask and pointer setup is interleaved with the multiplies.

	pxor	%mm5, %mm5

	paddd	%mm0, %mm5
	pmuludq	%mm0, %mm0		C inv*inv

	pcmpeqd	%mm4, %mm4
	psrlq	$32, %mm4		C 0x00000000FFFFFFFF

	C

	pmuludq	%mm6, %mm0		C inv*inv*d
	paddd	%mm5, %mm5		C 2*inv

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %ecx
	pxor	%mm1, %mm1		C initial carry limb

	C

	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	movq	%mm6, %mm0
	pmuludq	%mm5, %mm0
	movd	%mm0, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	pxor	%mm0, %mm0		C initial carry bit


C The dependent chain here is as follows.
C
C					latency
C	psubq	 s = (src-cbit) - climb	   2
C	pmuludq	 q = s*inverse		   8
C	pmuludq	 prod = q*divisor	   8
C	psrlq	 climb = high(prod)	   2
C					  --
C					  20
C
C Yet the loop measures 19.0 c/l, so obviously there's something gained
C there over a straight reading of the chip documentation.

L(top):
	C eax	src, incrementing
	C ebx
	C ecx	dst, incrementing
	C edx	counter, size-1 iterations
	C
	C mm0	carry bit
	C mm1	carry limb
	C mm4	0x00000000FFFFFFFF
	C mm5	inverse
	C mm6	divisor
	C mm7	shift

	C Fetch this limb and the next as one 64 bit value so the right
	C shift pulls in the low bits of the next limb.  The loop runs
	C size-1 times, so the 4(%eax) load never goes past the last limb.

	movd	(%eax), %mm2
	movd	4(%eax), %mm3
	addl	$4, %eax
	punpckldq %mm3, %mm2

	psrlq	%mm7, %mm2
	pand	%mm4, %mm2		C src
	psubq	%mm0, %mm2		C src - cbit

	C The subtraction is 64 bits wide on a value below 2^32, so a
	C borrow shows up as bit 63 being set.

	psubq	%mm1, %mm2		C src - cbit - climb
	movq	%mm2, %mm0
	psrlq	$63, %mm0		C new cbit

	C pmuludq multiplies only the low 32 bits of each operand, so the
	C high half of s, and afterwards of q, is ignored.

	pmuludq	%mm5, %mm2		C s*inverse
	movd	%mm2, (%ecx)		C q
	addl	$4, %ecx

	movq	%mm6, %mm1
	pmuludq	%mm2, %mm1		C q*divisor
	psrlq	$32, %mm1		C new climb

	subl	$1, %edx
	jnz	L(top)


L(done):
	C Most significant limb.  There is no next limb to shift in, movd
	C zero extends so no mask is needed, and no further carries are
	C produced.

	movd	(%eax), %mm2
	psrlq	%mm7, %mm2		C src
	psubq	%mm0, %mm2		C src - cbit

	psubq	%mm1, %mm2		C src - cbit - climb

	pmuludq	%mm5, %mm2		C s*inverse
	movd	%mm2, (%ecx)		C q

	emms				C leave the MMX state clean for x87 code
	ret

EPILOGUE()
ASM_END()