Blame mpn/powerpc64/mode64/mod_1_4.asm

Packit 15dc08
dnl  PowerPC-64 mpn_mod_1s_4p
Packit 15dc08
Packit 15dc08
dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
Packit 15dc08
Packit 15dc08
dnl  This file is part of the GNU MP Library.
Packit 15dc08
dnl
Packit 15dc08
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 15dc08
dnl  it under the terms of either:
Packit 15dc08
dnl
Packit 15dc08
dnl    * the GNU Lesser General Public License as published by the Free
Packit 15dc08
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 15dc08
dnl      option) any later version.
Packit 15dc08
dnl
Packit 15dc08
dnl  or
Packit 15dc08
dnl
Packit 15dc08
dnl    * the GNU General Public License as published by the Free Software
Packit 15dc08
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 15dc08
dnl      later version.
Packit 15dc08
dnl
Packit 15dc08
dnl  or both in parallel, as here.
Packit 15dc08
dnl
Packit 15dc08
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 15dc08
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 15dc08
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 15dc08
dnl  for more details.
Packit 15dc08
dnl
Packit 15dc08
dnl  You should have received copies of the GNU General Public License and the
Packit 15dc08
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 15dc08
dnl  see https://www.gnu.org/licenses/.
Packit 15dc08
Packit 15dc08
include(`../config.m4')
Packit 15dc08
Packit 15dc08
C                   cycles/limb
Packit 15dc08
C POWER3/PPC630          ?
Packit 15dc08
C POWER4/PPC970          9
Packit 15dc08
C POWER5                 9
Packit 15dc08
C POWER6                13
Packit 15dc08
C POWER7                3.5
Packit 15dc08
Packit 15dc08
C TODO
Packit 15dc08
C  * Optimise, in particular the cps function.  This was compiler-generated and
Packit 15dc08
C    then hand optimised.
Packit 15dc08
Packit 15dc08
C INPUT PARAMETERS
C Symbolic names for the argument registers of mpn_mod_1s_4p.  The second
C routine in this file, mpn_mod_1s_4p_cps, uses raw register names instead.
Packit 15dc08
C ap: pointer to the operand limbs.  The code advances it by 8*n so that it
C     points at the end of the operand and then reads downwards.
define(`ap',  `r3')
Packit 15dc08
C n: number of 8-byte limbs at ap.  Only needed until the loop counter is
C    set up; r4 is reused as a scratch register inside the main loop.
define(`n',   `r4')
Packit 15dc08
C d: divisor value used by the final reduction step (multiply, compare,
C    conditional add/subtract).
define(`d',   `r5')
Packit 15dc08
C cps: pointer to a table of doublewords read at byte offsets 0 through 48.
C      mpn_mod_1s_4p_cps below writes a table with the same layout.
define(`cps', `r6')
Packit 15dc08
Packit 15dc08
ASM_START()
Packit 15dc08
Packit 15dc08
C mpn_invert_limb is defined elsewhere; it is called from mpn_mod_1s_4p_cps.
EXTERN_FUNC(mpn_invert_limb)
Packit 15dc08
Packit 15dc08
C mpn_mod_1s_4p -- reduce an n-limb operand to a single-limb remainder,
C consuming four limbs per loop iteration.
C
C Arguments (see the defines above):
C   ap  (r3)  pointer to the operand limbs
C   n   (r4)  number of limbs
C   d   (r5)  divisor as used by the final reduction.  NOTE(review): it is
C             compared against a value that has been shifted left by the
C             count from cps, so presumably the caller passes the divisor
C             already shifted by that count -- confirm against callers.
C   cps (r6)  table of precomputed doublewords
C Result: r3.
C
C Doublewords read from cps:
C    0(cps)       multiplier for the quotient estimate in the final step
C                 (presumably the inverse stored by mpn_mod_1s_4p_cps)
C    8(cps)       shift count; only its low 32-bit word is loaded
C   16(cps) r26   weight of the limb one above the lowest of a group
C   24(cps) r25   weight of the limb two above the lowest
C   32(cps) r24   weight of the limb three above the lowest
C   40(cps) r30   weight of rl, called B4modb in the loop comments
C   48(cps) r23   weight of rh, called B5modb in the loop comments
C By analogy r26, r25, r24 are presumably B1modb, B2modb, B3modb.
C
C Throughout, r9:r0 (rh:rl) is the running two-limb partial residue.
PROLOGUE(mpn_mod_1s_4p)
Packit 15dc08
C Save r23-r31 below the stack pointer; no frame is allocated and this
C routine makes no calls.  The table loads are interleaved with the saves.
	std	r23, -72(r1)
Packit 15dc08
	ld	r23, 48(cps)
Packit 15dc08
	std	r24, -64(r1)
Packit 15dc08
	std	r25, -56(r1)
Packit 15dc08
	ld	r24, 32(cps)
Packit 15dc08
	ld	r25, 24(cps)
Packit 15dc08
	std	r26, -48(r1)
Packit 15dc08
	std	r27, -40(r1)
Packit 15dc08
	ld	r26, 16(cps)
Packit 15dc08
	std	r28, -32(r1)
Packit 15dc08
	std	r29, -24(r1)
Packit 15dc08
	std	r30, -16(r1)
Packit 15dc08
	std	r31, -8(r1)
Packit 15dc08
	ld	r30, 40(cps)
Packit 15dc08
Packit 15dc08
C r0 = n mod 4 (low two bits of n); the record form also sets cr0, so
C cr0.eq means n mod 4 == 0.
	rldicl.	r0, n, 0,62
Packit 15dc08
C r31 = 8*n, the operand size in bytes.
	sldi	r31, n, 3
Packit 15dc08
	add	ap, ap, r31		C make ap point at end of operand
Packit 15dc08
Packit 15dc08
C Dispatch on n mod 4:  0 -> b00,  1 -> b01,  2 -> b10,  3 -> b11 (fall
C through).  Each entry path consumes the top n mod 4 limbs (four limbs
C when n mod 4 == 0), forms the initial r9:r0, and leaves ap positioned so
C that 8(ap) is the highest limb not yet consumed.
	cmpdi	cr7, r0, 2
Packit 15dc08
	beq	cr0, L(b00)
Packit 15dc08
	blt	cr7, L(b01)
Packit 15dc08
	beq	cr7, L(b10)
Packit 15dc08
Packit 15dc08
C n mod 4 == 3:  r9:r0 = ap[-24] + ap[-16]*r26 + ap[-8]*r25
L(b11):	ld	r11, -16(ap)
Packit 15dc08
	ld	r9, -8(ap)
Packit 15dc08
	ld	r0, -24(ap)
Packit 15dc08
	mulhdu	r27, r11, r26
Packit 15dc08
	mulld	r8, r11, r26
Packit 15dc08
	mulhdu	r11, r9, r25
Packit 15dc08
	mulld	r9, r9, r25
Packit 15dc08
	addc	r31, r8, r0
Packit 15dc08
	addze	r10, r27
Packit 15dc08
	addc	r0, r9, r31
Packit 15dc08
	adde	r9, r11, r10
Packit 15dc08
	addi	ap, ap, -40
Packit 15dc08
	b	L(6)
Packit 15dc08
Packit 15dc08
	ALIGN(16)
Packit 15dc08
C n mod 4 == 0:  r9:r0 = ap[-32] + ap[-24]*r26 + ap[-16]*r25 + ap[-8]*r24
L(b00):	ld	r11, -24(ap)
Packit 15dc08
	ld	r10, -16(ap)
Packit 15dc08
	ld	r9, -8(ap)
Packit 15dc08
	ld	r0, -32(ap)
Packit 15dc08
	mulld	r8, r11, r26
Packit 15dc08
	mulhdu	r7, r10, r25
Packit 15dc08
	mulhdu	r27, r11, r26
Packit 15dc08
	mulhdu	r11, r9, r24
Packit 15dc08
	mulld	r10, r10, r25
Packit 15dc08
	mulld	r9, r9, r24
Packit 15dc08
	addc	r31, r8, r0
Packit 15dc08
	addze	r0, r27
Packit 15dc08
	addc	r8, r31, r10
Packit 15dc08
	adde	r10, r0, r7
Packit 15dc08
	addc	r0, r9, r8
Packit 15dc08
	adde	r9, r11, r10
Packit 15dc08
	addi	ap, ap, -48
Packit 15dc08
	b	L(6)
Packit 15dc08
Packit 15dc08
	ALIGN(16)
Packit 15dc08
C n mod 4 == 1:  r9:r0 = 0:top limb
L(b01):	li	r9, 0
Packit 15dc08
	ld	r0, -8(ap)
Packit 15dc08
	addi	ap, ap, -24
Packit 15dc08
	b	L(6)
Packit 15dc08
Packit 15dc08
	ALIGN(16)
Packit 15dc08
C n mod 4 == 2:  r9:r0 = the top two limbs; falls through to L(6)
L(b10):	ld	r9, -8(ap)
Packit 15dc08
	ld	r0, -16(ap)
Packit 15dc08
	addi	ap, ap, -32
Packit 15dc08
Packit 15dc08
	ALIGN(16)
Packit 15dc08
C CTR = (n+3)/4.  bdz decrements CTR first and skips the loop when it
C reaches zero, so the loop body runs (n+3)/4 - 1 times.
L(6):	addi	r10, n, 3
Packit 15dc08
	srdi	r7, r10, 2
Packit 15dc08
	mtctr	r7
Packit 15dc08
	bdz	L(end)
Packit 15dc08
Packit 15dc08
	ALIGN(16)
Packit 15dc08
C Main loop.  Each iteration replaces r9:r0 with
C   [-16(ap)] + [-8(ap)]*r26 + [0(ap)]*r25 + [8(ap)]*r24 + rl*r30 + rh*r23
C accumulated as a two-limb value through the addc/adde pairs, then moves
C ap down by four limbs.  r4 (n) is dead after mtctr and is reused here as
C a scratch register.
L(top):	ld	r31, -16(ap)
Packit 15dc08
	ld	r10, -8(ap)
Packit 15dc08
	ld	r11, 8(ap)
Packit 15dc08
	ld	r12, 0(ap)
Packit 15dc08
	mulld	r29, r0, r30		C rl * B4modb
Packit 15dc08
	mulhdu	r0,  r0, r30		C rl * B4modb
Packit 15dc08
	mulhdu	r27, r10, r26
Packit 15dc08
	mulld	r10, r10, r26
Packit 15dc08
	mulhdu	r7, r9, r23		C rh * B5modb
Packit 15dc08
	mulld	r9, r9, r23		C rh * B5modb
Packit 15dc08
	mulhdu	r28, r11, r24
Packit 15dc08
	mulld	r11, r11, r24
Packit 15dc08
	mulhdu	r4, r12, r25
Packit 15dc08
	mulld	r12, r12, r25
Packit 15dc08
	addc	r8, r10, r31
Packit 15dc08
	addze	r10, r27
Packit 15dc08
	addi	ap, ap, -32
Packit 15dc08
	addc	r27, r8, r12
Packit 15dc08
	adde	r12, r10, r4
Packit 15dc08
	addc	r11, r27, r11
Packit 15dc08
	adde	r31, r12, r28
Packit 15dc08
	addc	r12, r11, r29
Packit 15dc08
	adde	r4, r31, r0
Packit 15dc08
	addc	r0, r9, r12
Packit 15dc08
	adde	r9, r7, r4
Packit 15dc08
	bdnz	L(top)
Packit 15dc08
Packit 15dc08
L(end):
Packit 15dc08
C r3 = shift count: the low 32-bit word of the doubleword at 8(cps), which
C sits at byte offset 8 on little-endian and 12 on big-endian.
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
Packit 15dc08
`	lwz	r3, 8(cps)',
Packit 15dc08
`	lwz	r3, 12(cps)')
Packit 15dc08
C Fold the high limb once more:  r9:r11 = rl + rh*r26
	mulld	r10, r9, r26
Packit 15dc08
	mulhdu	r9, r9, r26
Packit 15dc08
	addc	r11, r0, r10
Packit 15dc08
	addze	r9, r9
Packit 15dc08
	ld	r10, 0(cps)
Packit 15dc08
C Shift r9:r11 left by r3 bits.  r8 = 64 - r3 is the complementary right
C shift; when r3 is 0 this is a shift by 64, which srd defines to give 0.
	subfic	r8, r3, 64
Packit 15dc08
	sld	r9, r9, r3
Packit 15dc08
	srd	r8, r11, r8
Packit 15dc08
	sld	r11, r11, r3
Packit 15dc08
	or	r9, r8, r9
Packit 15dc08
C Divide the shifted value r9:r11 by d using the multiplier from 0(cps):
C   r10:r0 = r9 * multiplier
C   r8     = low limb of (r10:r0 + r9:r11), kept for the adjustment test
C   r0     = quotient estimate = r10 + (r9 + 1) + carry
C   r0     = r11 - r0*d, the candidate remainder (mod 2^64)
	mulld	r0, r9, r10
Packit 15dc08
	mulhdu	r10, r9, r10
Packit 15dc08
	addi	r9, r9, 1
Packit 15dc08
	addc	r8, r0, r11
Packit 15dc08
	adde	r0, r10, r9
Packit 15dc08
	mulld	r0, r0, d
Packit 15dc08
	subf	r0, r0, r11
Packit 15dc08
C First adjustment: if r8 < r0 (unsigned) the estimate was one too large,
C so add d back.
	cmpld	cr7, r8, r0
Packit 15dc08
	bge	cr7, L(9)
Packit 15dc08
	add	r0, r0, d
Packit 15dc08
C Second adjustment, hinted as unlikely: if r0 >= d, subtract d at L(16).
L(9):	cmpld	cr7, r0, d
Packit 15dc08
	bge-	cr7, L(16)
Packit 15dc08
C Undo the left shift to obtain the result in r3, then restore r23-r31.
L(10):	srd	r3, r0, r3
Packit 15dc08
	ld	r23, -72(r1)
Packit 15dc08
	ld	r24, -64(r1)
Packit 15dc08
	ld	r25, -56(r1)
Packit 15dc08
	ld	r26, -48(r1)
Packit 15dc08
	ld	r27, -40(r1)
Packit 15dc08
	ld	r28, -32(r1)
Packit 15dc08
	ld	r29, -24(r1)
Packit 15dc08
	ld	r30, -16(r1)
Packit 15dc08
	ld	r31, -8(r1)
Packit 15dc08
	blr
Packit 15dc08
Packit 15dc08
C Out-of-line path for the second adjustment:  r0 -= d
L(16):	subf	r0, d, r0
Packit 15dc08
	b	L(10)
Packit 15dc08
EPILOGUE()
Packit 15dc08
Packit 15dc08
C mpn_mod_1s_4p_cps -- fill in the table of precomputed doublewords that
C mpn_mod_1s_4p reads through its cps argument.
C
C Arguments (raw registers; the ap/n/d/cps defines are not used here):
C   r3  pointer to the table to fill (kept in r29)
C   r4  value to precompute for (presumably the divisor)
C
C Table written, by byte offset from r29:
C    0  result of mpn_invert_limb on the normalized input
C    8  cnt, the number of leading zero bits of r4
C   16, 24, 32, 40, 48  the five values computed below in r10, r11, r9, r0
C       and r8, each shifted right by cnt.  Offsets 40 and 48 are what the
C       loop comments in mpn_mod_1s_4p call B4modb and B5modb.
PROLOGUE(mpn_mod_1s_4p_cps,toc)
Packit 15dc08
C Non-leaf prologue: save LR at 16(r1) and r29-r31 below r1, then allocate
C a 144-byte frame.  r29-r31 hold everything that must survive the call.
	mflr	r0
Packit 15dc08
	std	r29, -24(r1)
Packit 15dc08
	std	r30, -16(r1)
Packit 15dc08
	mr	r29, r3
Packit 15dc08
	std	r0, 16(r1)
Packit 15dc08
	std	r31, -8(r1)
Packit 15dc08
	stdu	r1, -144(r1)
Packit 15dc08
C r31 = cnt = leading zero count of r4;  r30 = r4 << cnt (top bit set for
C nonzero r4).  r3 = mpn_invert_limb(r30).
	cntlzd	r31, r4
Packit 15dc08
	sld	r30, r4, r31
Packit 15dc08
	mr	r3, r30
Packit 15dc08
	CALL(	mpn_invert_limb)
Packit 15dc08
C First value:  r10 = -r30 * ((1 << cnt) | (r3 >> (64 - cnt)))   (low limb)
C When cnt is 0 the right shift is by 64, which srd defines to give 0.
	subfic	r9, r31, 64
Packit 15dc08
	li	r10, 1
Packit 15dc08
	sld	r10, r10, r31
Packit 15dc08
	srd	r9, r3, r9
Packit 15dc08
	neg	r0, r30
Packit 15dc08
	or	r10, r10, r9
Packit 15dc08
	mulld	r10, r10, r0
Packit 15dc08
C Second value, derived from r10:
C   r11 = (not(high(r10*r3)) - r10) * r30;  if low(r10*r3) < r11, r11 += r30
	mulhdu	r11, r10, r3
Packit 15dc08
	nor	r11, r11, r11
Packit 15dc08
	subf	r11, r10, r11
Packit 15dc08
	mulld	r11, r11, r30
Packit 15dc08
	mulld	r0, r10, r3
Packit 15dc08
	cmpld	cr7, r0, r11
Packit 15dc08
	bge	cr7, L(18)
Packit 15dc08
	add	r11, r11, r30
Packit 15dc08
C Third value, derived from r11:
C   r9 = not(r11 + high(r11*r3)) * r30;  if low(r11*r3) < r9, r9 += r30
C The same step is repeated twice more below with the previous result as
C input.
L(18):	mulhdu	r9, r11, r3
Packit 15dc08
	add	r9, r11, r9
Packit 15dc08
	nor	r9, r9, r9
Packit 15dc08
	mulld	r9, r9, r30
Packit 15dc08
	mulld	r0, r11, r3
Packit 15dc08
	cmpld	cr7, r0, r9
Packit 15dc08
	bge	cr7, L(19)
Packit 15dc08
	add	r9, r9, r30
Packit 15dc08
C Fourth value: r0, derived from r9 (low product in r8 for the test).
L(19):	mulhdu	r0, r9, r3
Packit 15dc08
	add	r0, r9, r0
Packit 15dc08
	nor	r0, r0, r0
Packit 15dc08
	mulld	r0, r0, r30
Packit 15dc08
	mulld	r8, r9, r3
Packit 15dc08
	cmpld	cr7, r8, r0
Packit 15dc08
	bge	cr7, L(20)
Packit 15dc08
	add	r0, r0, r30
Packit 15dc08
C Fifth value: r8, derived from r0 (low product in r7 for the test).
L(20):	mulhdu	r8, r0, r3
Packit 15dc08
	add	r8, r0, r8
Packit 15dc08
	nor	r8, r8, r8
Packit 15dc08
	mulld	r8, r8, r30
Packit 15dc08
	mulld	r7, r0, r3
Packit 15dc08
	cmpld	cr7, r7, r8
Packit 15dc08
	bge	cr7, L(21)
Packit 15dc08
	add	r8, r8, r30
Packit 15dc08
C Shift the five values right by cnt and store the table, interleaved
C with the epilogue.  Ordering constraints: the frame is popped first so
C the saved registers are addressed relative to the original r1; r0 is
C stored to 40(r29) before being reloaded with the saved LR; r31 is
C restored only after the last shift and the store of cnt; r29 is
C restored last because it is the base for every store.
L(21):	srd	r0, r0, r31
Packit 15dc08
	addi	r1, r1, 144
Packit 15dc08
	srd	r8, r8, r31
Packit 15dc08
	srd	r10, r10, r31
Packit 15dc08
	srd	r11, r11, r31
Packit 15dc08
	std	r0, 40(r29)
Packit 15dc08
	std	r31, 8(r29)
Packit 15dc08
	srd	r9, r9, r31
Packit 15dc08
	ld	r0, 16(r1)
Packit 15dc08
	ld	r30, -16(r1)
Packit 15dc08
	std	r8, 48(r29)
Packit 15dc08
	std	r3, 0(r29)
Packit 15dc08
	mtlr	r0
Packit 15dc08
	ld	r31, -8(r1)
Packit 15dc08
	std	r10, 16(r29)
Packit 15dc08
	std	r11, 24(r29)
Packit 15dc08
	std	r9, 32(r29)
Packit 15dc08
	ld	r29, -24(r1)
Packit 15dc08
	blr
Packit 15dc08
EPILOGUE()