Blame mpn/x86/k7/mod_34lsub1.asm

Packit 5c3484
dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C         cycles/limb
Packit 5c3484
C Athlon:     1
Packit 5c3484
C Hammer:     1
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
Packit 5c3484
C
Packit 5c3484
C The loop form below and the 64 byte code alignment seem necessary for the
Packit 5c3484
C claimed speed.  This is a bit strange, since normally k7 isn't very
Packit 5c3484
C sensitive to such things.  Perhaps there have to be 6 instructions in the
Packit 5c3484
C first 16 bytes for the BTB entry or something.
Packit 5c3484
Packit 5c3484
C Stack parameter slots for the two arguments: src at offset 4 and size at
C offset 8.  NOTE(review): the offsets are presumably relative to the stack
C pointer at function entry, with FRAME tracking later pushes -- confirm
C against the defframe definition in x86-defs.m4.
defframe(PARAM_SIZE, 8)
Packit 5c3484
defframe(PARAM_SRC,  4)
Packit 5c3484
Packit 5c3484
dnl  re-use parameter space
dnl  (the function loads size into ecx before it stores edi into this slot,
dnl  so the slot is free to hold the saved edi)
Packit 5c3484
define(SAVE_EDI, `PARAM_SIZE')
Packit 5c3484
Packit 5c3484
	TEXT
Packit 5c3484
C 64 byte alignment of the function start; the speed note in the function
C header comment says this alignment seems necessary for the claimed speed.
	ALIGN(64)
Packit 5c3484
C Returns in eax a value congruent to the size-limb number at src, modulo
C 2^24-1.  The value is not necessarily fully reduced: with size 1 the limb
C is returned unchanged.
C
C Since 2^24 == 1 mod 2^24-1, the limb weights 2^0, 2^32, 2^64 reduce to
C 2^0, 2^8, 2^16 and then repeat with period 3.  For size >= 3 the limbs are
C summed into three accumulators selected by limb index mod 3, and each
C accumulator is then split at the bit where its weight wraps past 2^24.
C
C size 0 is not distinguished from size 1 (both take the jb branch below),
C so src[0] is read regardless.  NOTE(review): callers presumably guarantee
C size >= 1 -- confirm.
C
C ebx, esi and edi are saved and restored; eax, ecx and edx are overwritten.
PROLOGUE(mpn_mod_34lsub1)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	movl	PARAM_SRC, %edx
Packit 5c3484
Packit 5c3484
	C The flags from this subl select all three cases: ja for size > 2,
	C then jb for size < 2 (the movl in between does not change flags),
	C and fall through for size == 2.
	subl	$2, %ecx
Packit 5c3484
	ja	L(three_or_more)
Packit 5c3484
Packit 5c3484
	movl	(%edx), %eax
Packit 5c3484
	jb	L(one)
Packit 5c3484
Packit 5c3484
	C Two limbs: src[0] has weight 1 and src[1] has weight 2^32 == 2^8.
	movl	4(%edx), %ecx
Packit 5c3484
	movl	%eax, %edx
Packit 5c3484
	shrl	$24, %eax		C src[0] high 8 bits
Packit 5c3484
Packit 5c3484
	andl	$0xFFFFFF, %edx		C src[0] low 24 bits
Packit 5c3484
	addl	%edx, %eax
Packit 5c3484
	movl	%ecx, %edx
Packit 5c3484
Packit 5c3484
	andl	$0xFFFF, %ecx
Packit 5c3484
	shrl	$16, %edx		C src[1] high
Packit 5c3484
	addl	%edx, %eax
Packit 5c3484
Packit 5c3484
	shll	$8, %ecx		C src[1] low
Packit 5c3484
	addl	%ecx, %eax
Packit 5c3484
Packit 5c3484
L(one):
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(three_or_more):
Packit 5c3484
	C eax
Packit 5c3484
	C ebx
Packit 5c3484
	C ecx	size-2
Packit 5c3484
	C edx	src
Packit 5c3484
	C esi
Packit 5c3484
	C edi
Packit 5c3484
Packit 5c3484
	C ebx and esi go on the stack, edi goes into the size parameter slot
	C (size is already in ecx).  The edi store and its reload near the
	C end both happen with exactly one push outstanding.
	C NOTE(review): FRAME_pushl() presumably adjusts FRAME so the
	C PARAM_/SAVE_ offsets stay correct after the push -- confirm.
	pushl	%ebx	FRAME_pushl()
Packit 5c3484
	xorl	%eax, %eax
Packit 5c3484
	xorl	%ebx, %ebx
Packit 5c3484
Packit 5c3484
	movl	%edi, SAVE_EDI
Packit 5c3484
	pushl	%esi	FRAME_pushl()
Packit 5c3484
	xorl	%esi, %esi		C and clear carry flag
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C code offset 0x40 at this point
Packit 5c3484
L(top):
Packit 5c3484
	C eax	acc 0mod3
Packit 5c3484
	C ebx	acc 1mod3
Packit 5c3484
	C ecx	counter, limbs
Packit 5c3484
	C edx	src
Packit 5c3484
	C esi	acc 2mod3
Packit 5c3484
	C edi
Packit 5c3484
Packit 5c3484
	C Each half of the loop adds three limbs and takes 3 off the counter
	C (leal -2 then decl).  leal does not alter flags and decl leaves the
	C carry flag alone, so the carry out of one adcl chain is still live
	C when the next chain starts, while decl supplies the sign/zero flags
	C for the loop branches.
	leal	24(%edx), %edx
Packit 5c3484
	leal	-2(%ecx), %ecx
Packit 5c3484
	adcl	-24(%edx), %eax
Packit 5c3484
	adcl	-20(%edx), %ebx
Packit 5c3484
	adcl	-16(%edx), %esi
Packit 5c3484
Packit 5c3484
	decl	%ecx
Packit 5c3484
	jng	L(done_loop)
Packit 5c3484
Packit 5c3484
	leal	-2(%ecx), %ecx
Packit 5c3484
	adcl	-12(%edx), %eax
Packit 5c3484
	adcl	-8(%edx), %ebx
Packit 5c3484
	adcl	-4(%edx), %esi
Packit 5c3484
Packit 5c3484
	decl	%ecx
Packit 5c3484
	jg	L(top)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C Leaving from the second half: step edx so that any remaining limbs
	C sit at -12(%edx) and -8(%edx), the same as when leaving from the
	C first half.
	leal	12(%edx), %edx
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(done_loop):
Packit 5c3484
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
Packit 5c3484
	C
	C edi is set to the mask that turns the final carry into its weight:
	C a carry out of the 2mod3 accumulator counts 1, out of 0mod3 counts
	C 2^8, and out of 1mod3 counts 2^16.  incl, decl and movl leave the
	C carry flag untouched, so it survives to the sbbl at L(combine).
Packit 5c3484
	incl	%ecx
Packit 5c3484
	movl	$0xFFFFFFFF, %edi
Packit 5c3484
	js	L(combine)
Packit 5c3484
Packit 5c3484
	adcl	-12(%edx), %eax
Packit 5c3484
	decl	%ecx
Packit 5c3484
	movl	$0xFFFFFF00, %edi
Packit 5c3484
	js	L(combine)
Packit 5c3484
Packit 5c3484
	adcl	-8(%edx), %ebx
Packit 5c3484
	movl	$0xFFFF0000, %edi
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(combine):
Packit 5c3484
	C eax	acc 0mod3
Packit 5c3484
	C ebx	acc 1mod3
Packit 5c3484
	C ecx
Packit 5c3484
	C edx
Packit 5c3484
	C esi	acc 2mod3
Packit 5c3484
	C edi	mask
Packit 5c3484
Packit 5c3484
	C sbbl makes ecx 0 or -1 from the carry flag; after masking with edi,
	C subtracting it from eax adds 0 or the carry weight chosen above.
	C The accumulators are split 8/24, 16/16 and 24/8 (high/low bits) and
	C all the pieces are summed into eax.
	sbbl	%ecx, %ecx		C carry
Packit 5c3484
	movl	%eax, %edx		C 0mod3
Packit 5c3484
	shrl	$24, %eax		C 0mod3 high
Packit 5c3484
Packit 5c3484
	andl	%edi, %ecx		C carry masked
Packit 5c3484
	andl	$0x00FFFFFF, %edx	C 0mod3 low
Packit 5c3484
	movl	%ebx, %edi		C 1mod3
Packit 5c3484
Packit 5c3484
	subl	%ecx, %eax		C apply carry
Packit 5c3484
	shrl	$16, %ebx		C 1mod3 high
Packit 5c3484
	andl	$0xFFFF, %edi
Packit 5c3484
Packit 5c3484
	addl	%edx, %eax		C apply 0mod3 low
Packit 5c3484
	movl	%esi, %edx		C 2mod3
Packit 5c3484
	shll	$8, %edi		C 1mod3 low
Packit 5c3484
Packit 5c3484
	addl	%ebx, %eax		C apply 1mod3 high
Packit 5c3484
	shrl	$8, %esi		C 2mod3 high
Packit 5c3484
	movzbl	%dl, %edx		C 2mod3 low
Packit 5c3484
Packit 5c3484
	addl	%edi, %eax		C apply 1mod3 low
Packit 5c3484
	shll	$16, %edx		C 2mod3 low
Packit 5c3484
Packit 5c3484
	addl	%esi, %eax		C apply 2mod3 high
Packit 5c3484
	popl	%esi	FRAME_popl()
Packit 5c3484
Packit 5c3484
	movl	SAVE_EDI, %edi
Packit 5c3484
	addl	%edx, %eax		C apply 2mod3 low
Packit 5c3484
	popl	%ebx	FRAME_popl()
Packit 5c3484
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
EPILOGUE()