dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		c/l
C AMD K8,K9	13
C AMD K10	13
C AMD bull	16.5
C AMD pile	15
C AMD steam	 ?
C AMD bobcat	16
C AMD jaguar	 ?
C Intel P4	47	poor
C Intel core	19.25
C Intel NHM	18
C Intel SBR	15	poor
C Intel IBR	13
C Intel HWL	11.7
C Intel BWL	 ?
C Intel atom	52	very poor
C VIA nano	19

C INPUT Parameters
C Registers holding the six arguments once FUNC_ENTRY(6) and the IFDOS
C loads at the top of the function have run.
define(`QP', `%rdi')		C quotient limbs are stored through this pointer
define(`UP', `%rsi')		C dividend limbs are loaded through this pointer
define(`UN_INPUT', `%rdx')	C limb count; decremented on entry
define(`U1', `%rcx')	C Also in %rax
define(`D', `%r8')		C divisor limb
define(`DINV', `%r9')		C single-limb inverse used for the divisions

C Invariants
C Both are computed once at L(first) and only read afterwards.
define(`B2', `%rbp')		C -(D * DINV), low 64 bits
define(`B2md', `%rbx')		C B2 - D

C Variables
define(`UN', `%r8')	C Overlaps D input
define(`T', `%r10')		C scratch
define(`U0', `%r11')		C low remainder limb
define(`U2', `%r12')		C 0 or all ones, set with sbb from a carry
define(`Q0', `%r13')		C pending low quotient limb
define(`Q1', `%r14')		C quotient accumulator, middle limb
define(`Q2', `%r15')		C quotient accumulator, high limb

C ---------------------------------------------------------------------------
C mpn_div_qr_1n_pi1
C
C Divide a multi-limb number by a normalized single-limb divisor, using a
C precomputed single-limb inverse.
C
C Arguments (registers as seen after FUNC_ENTRY and the IFDOS loads):
C   QP        %rdi	quotient limbs are written here
C   UP        %rsi	dividend limbs are read from here
C   UN_INPUT  %rdx	number of limbs at UP.  NOTE(review): a count of at
C			least 1 is assumed; zero is not checked and would
C			wrap in the initial dec.
C   U1        %rcx	high limb of the dividend, passed in a register
C   D         %r8	divisor
C   DINV      %r9	inverse of D
C
C Result: the value left in %rax at either ret is the remainder after the
C last conditional subtraction of D.
C
C The one-limb case uses only %rax, %rdx, %r10 and %r11 and saves nothing.
C The general path pushes %r15, %r14, %r13, %r12, %rbx and %rbp at L(first)
C and pops them in reverse order at L(done).  D is additionally parked on
C the stack while its register is reused as the loop counter UN.
C
C NOTE(review): only STD64 is declared below although the body carries
C IFDOS argument loads.  Confirm whether DOS64 should be declared as well.
C ---------------------------------------------------------------------------
ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(6)
	C Under IFDOS, arguments 5 and 6 are fetched from the stack.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	dec	UN_INPUT		C UN_INPUT = count - 1
	jnz	L(first)		C more than one limb: general path

	C Just a single 2/1 division.
	C T, U0 are allocated in scratch registers
	lea	1(U1), T		C T = U1 + 1
	mov	U1, %rax
	mul	DINV			C rdx:rax = U1 * DINV
	mov	(UP), U0		C U0 = the only limb at UP
	add	U0, %rax
	adc	T, %rdx			C rdx:rax += {U1 + 1, U0}
	mov	%rdx, T			C T = candidate quotient
	imul	D, %rdx
	sub	%rdx, U0		C U0 = candidate remainder (mod 2^64)
	cmp	U0, %rax		C carry iff low product limb < remainder
	lea	(U0, D), %rax		C rax = remainder + D, flags untouched
	cmovnc	U0, %rax		C no carry: keep the unadjusted remainder
	sbb	$0, T			C carry: quotient was one too large
	cmp	D, %rax
	jc	L(single_div_done)	C remainder already below D
	sub	D, %rax			C otherwise one more correction step
	add	$1, T
L(single_div_done):
	mov	T, (QP)			C store the single quotient limb
	FUNC_EXIT
	ret
L(first):
	C FIXME: Could delay some of these until we enter the loop.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp

	C B2 = -(D * DINV) mod 2^64, B2md = B2 - D
	mov	D, B2
	imul	DINV, B2
	neg	B2
	mov	B2, B2md
	sub	D, B2md

	C D not needed until final reduction
	push	D
	mov	UN_INPUT, UN	C Clobbers D

	C Q0 = low limb of DINV * U1; T = high limb + U1
	mov	DINV, %rax
	mul	U1
	mov	%rax, Q0
	add	U1, %rdx
	mov	%rdx, T

	C Fold U1 * B2 into the next two limbs loaded from UP.
	mov	B2, %rax
	mul	U1
	mov	-8(UP, UN, 8), U0
	mov	(UP, UN, 8), U1
	mov	T, (QP, UN, 8)		C first quotient limb estimate
	add	%rax, U0
	adc	%rdx, U1
	sbb	U2, U2			C U2 = 0, or all ones if the adc carried
	dec	UN
	mov	U1, %rax		C mov leaves the flags from dec intact
	jz	L(final)

	ALIGN(16)

	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
	C At entry, %rax holds an extra copy of U1
L(loop):
	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
	C Remains to add in B (U1 + c)
	mov	DINV, Q1
	mov	U2, Q2
	and	U2, Q1			C Q1 = DINV if U2 is set, else 0
	neg	Q2			C Q2 = 1 if U2 is set, else 0
	mul	DINV			C rdx:rax = U1 * DINV (rax held U1)
	add	%rdx, Q1
	adc	$0, Q2
	add	Q0, Q1
	mov	%rax, Q0
	mov	B2, %rax
	lea	(B2md, U0), T		C T = U0 + B2 - D, flags untouched
	adc	$0, Q2			C picks up the carry from add Q0, Q1

	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
	mul	U1			C rdx:rax = B2 * U1
	and	B2, U2			C U2 = B2 if it was set, else 0
	add	U2, U0
	cmovnc	U0, T			C no carry: T = U0 + U2; else keep T

	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
	adc	U1, Q1
	mov	-8(UP, UN, 8), U0	C next dividend limb
	adc	Q2, 8(QP, UN, 8)
	jc	L(q_incr)		C rare: propagate carry further up
L(q_incr_done):
	add	%rax, U0
	mov	T, %rax
	adc	%rdx, %rax
	mov	Q1, (QP, UN, 8)
	sbb	U2, U2			C U2 = 0, or all ones if the adc carried
	dec	UN
	mov	%rax, U1		C mov leaves the flags from dec intact
	jnz	L(loop)

L(final):
	pop	D			C restore the divisor parked at L(first)

	C Reduce the high limb: subtract D if U2 is set, then subtract D once
	C more unless that borrows.  Q1 counts the subtractions performed.
	mov	U2, Q1
	and	D, U2
	sub	U2, %rax
	neg	Q1

	mov	%rax, U1
	sub	D, %rax
	cmovc	U1, %rax		C borrow: undo the subtraction
	sbb	$-1, Q1			C Q1 += 1 - borrow

	C Final 2/1 division of {rax, U0} by D, same steps as the one-limb
	C path above.
	lea	1(%rax), T
	mul	DINV
	add	U0, %rax
	adc	T, %rdx
	mov	%rdx, T
	imul	D, %rdx
	sub	%rdx, U0
	cmp	U0, %rax
	lea	(U0, D), %rax
	cmovnc	U0, %rax
	sbb	$0, T
	cmp	D, %rax
	jc	L(div_done)
	sub	D, %rax
	add	$1, T
L(div_done):
	add	T, Q0
	mov	Q0, (QP)		C lowest quotient limb
	adc	Q1, 8(QP)
	jnc	L(done)
L(final_q_incr):
	C Propagate the carry upwards; lea advances QP without touching CF.
	addq	$1, 16(QP)
	lea	8(QP), QP
	jc	L(final_q_incr)

L(done):
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT
	ret

L(q_incr):
	C U1 is not live, so use it for indexing
	lea	16(QP, UN, 8), U1	C first limb above the one that carried
L(q_incr_loop):
	addq	$1, (U1)
	jnc	L(q_incr_done)		C carry absorbed, resume the main loop
	lea	8(U1), U1
	jmp	L(q_incr_loop)
EPILOGUE()