Blame mpn/x86/pentium4/mmx/popham.asm

Packit 5c3484
dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
Packit 5c3484
dnl  hamming distance.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C			     popcount	     hamdist
Packit 5c3484
C P3 model 9  (Banias)		?		?
Packit 5c3484
C P3 model 13 (Dothan)		6		6
Packit 5c3484
C P4 model 0  (Willamette)
Packit 5c3484
C P4 model 1  (?)
Packit 5c3484
C P4 model 2  (Northwood)	8		9
Packit 5c3484
C P4 model 3  (Prescott)	8		9
Packit 5c3484
C P4 model 4  (Nocona)
Packit 5c3484
Packit 5c3484
C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
Packit 5c3484
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
Packit 5c3484
C
Packit 5c3484
C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
Packit 5c3484
C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
Packit 5c3484
C and using them saves fiddling about with alignment testing on entry.
Packit 5c3484
C
Packit 5c3484
C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
Packit 5c3484
C might be possible, but 8 c/l relying on out-of-order execution is already
Packit 5c3484
C quite reasonable.
Packit 5c3484
Packit 5c3484
C Build-time sanity check: when neither OPERATION_popcount nor
C OPERATION_hamdist is defined, m4_error is invoked with a diagnostic.
C When either one is defined this whole construct expands to nothing.
ifdef(`OPERATION_popcount',,
Packit 5c3484
`ifdef(`OPERATION_hamdist',,
Packit 5c3484
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
Packit 5c3484
')')')
Packit 5c3484
Packit 5c3484
C HAM(text) expands to text when OPERATION_hamdist is defined and to
C nothing otherwise.  It is used below to wrap the code that is needed
C only for the hamming distance variant.
C NOTE(review): m4_assert_numargs is defined elsewhere; presumably it
C rejects a call that does not pass exactly one argument -- confirm.
define(HAM,
Packit 5c3484
m4_assert_numargs(1)
Packit 5c3484
`ifdef(`OPERATION_hamdist',`$1')')
Packit 5c3484
Packit 5c3484
C POP(text) expands to text when OPERATION_popcount is defined and to
C nothing otherwise.  It is the population count counterpart of HAM.
C NOTE(review): m4_assert_numargs is defined elsewhere; presumably it
C rejects a call that does not pass exactly one argument -- confirm.
define(POP,
Packit 5c3484
m4_assert_numargs(1)
Packit 5c3484
`ifdef(`OPERATION_popcount',`$1')')
Packit 5c3484
Packit 5c3484
C [hamdist] Parameter names and the entry point name.  The offsets 4, 8
C and 12 follow the argument order (src, src2, size) of the prototype
C given above.  M4_function is set to mpn_hamdist and is later passed to
C PROLOGUE.
C NOTE(review): defframe is defined elsewhere; presumably the offsets
C are bytes from the stack pointer at entry, with the return address at
C offset 0 -- confirm against its definition.
HAM(`
Packit 5c3484
defframe(PARAM_SIZE, 12)
Packit 5c3484
defframe(PARAM_SRC2,  8)
Packit 5c3484
defframe(PARAM_SRC,   4)
Packit 5c3484
define(M4_function,mpn_hamdist)
Packit 5c3484
')
Packit 5c3484
C [popcount] Parameter names and the entry point name.  The offsets 4
C and 8 follow the argument order (src, size) of the prototype given
C above.  M4_function is set to mpn_popcount and is later passed to
C PROLOGUE.
C NOTE(review): defframe is defined elsewhere; presumably the offsets
C are bytes from the stack pointer at entry, with the return address at
C offset 0 -- confirm against its definition.
POP(`
Packit 5c3484
defframe(PARAM_SIZE,  8)
Packit 5c3484
defframe(PARAM_SRC,   4)
Packit 5c3484
define(M4_function,mpn_popcount)
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
C NOTE(review): MULFUNC_PROLOGUE is defined elsewhere.  It is given the
C names of both functions this source can be built as; presumably the
C build machinery uses it to learn which entry points the file provides
C -- confirm against its definition.
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
Packit 5c3484
Packit 5c3484
Packit 5c3484
C Bit masks for the non-PIC build only (the PIC case of this ifdef is
C empty).  Each mask is two identical .long values, 8 bytes in total,
C and the three are laid out back to back after ALIGN(8).  The non-PIC
C entry code loads them with movq into mm7, mm6 and mm5 respectively.
C The PIC build emits no data here and instead builds the same values
C in registers at function entry.
ifdef(`PIC',,`
Packit 5c3484
	dnl  non-PIC
Packit 5c3484
	RODATA
Packit 5c3484
	ALIGN(8)
Packit 5c3484
L(rodata_AAAAAAAAAAAAAAAA):
Packit 5c3484
	.long	0xAAAAAAAA
Packit 5c3484
	.long	0xAAAAAAAA
Packit 5c3484
L(rodata_3333333333333333):
Packit 5c3484
	.long	0x33333333
Packit 5c3484
	.long	0x33333333
Packit 5c3484
L(rodata_0F0F0F0F0F0F0F0F):
Packit 5c3484
	.long	0x0F0F0F0F
Packit 5c3484
	.long	0x0F0F0F0F
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
	TEXT
Packit 5c3484
	ALIGN(16)
Packit 5c3484
Packit 5c3484
C -----------------------------------------------------------------------
C M4_function: mpn_popcount or mpn_hamdist, whichever name was selected
C through OPERATION_popcount / OPERATION_hamdist.
C
C popcount returns the number of set bits in the size limbs at src.
C hamdist returns the number of set bits in src xor src2, limb by limb.
C
C In:    PARAM_SRC	pointer to 32-bit limbs
C        PARAM_SRC2	[hamdist] pointer to the second operand
C        PARAM_SIZE	number of limbs
C Out:   eax = low dword of the running total kept in mm0
C Uses:  eax, ecx, edx, mm0-mm7 and the flags.  edx holds src2 for
C        hamdist and is a scratch register while the PIC build creates
C        its masks.  emms is executed before returning.
C
C Method: limbs are taken two at a time as one 64-bit MMX value.  Bit
C counts are formed in parallel for 2-bit fields, then 4-bit fields,
C then bytes, and psadbw against zero adds the eight byte counts.  A
C final odd limb goes through the same code with the upper half of the
C MMX register zero.
C
C A size of 0 is not special-cased, see the note at the entry test.
C -----------------------------------------------------------------------
PROLOGUE(M4_function)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	movl	PARAM_SRC, %eax
Packit 5c3484
Packit 5c3484
C Set up the masks: mm7 = 0xAAAA..., mm6 = 0x3333..., mm5 = 0x0F0F...
C The PIC build avoids a data reference: each 32-bit pattern goes via
C edx into the low dword of an MMX register and punpckldq of the
C register with itself copies it into the high dword too.  Since edx is
C the scratch register there, the hamdist src2 pointer is loaded into
C edx only after the masks are complete.  The non-PIC build loads the
C three masks from the rodata block with movq.
ifdef(`PIC',`
Packit 5c3484
	movl	$0xAAAAAAAA, %edx
Packit 5c3484
	movd	%edx, %mm7
Packit 5c3484
	punpckldq %mm7, %mm7
Packit 5c3484
Packit 5c3484
	movl	$0x33333333, %edx
Packit 5c3484
	movd	%edx, %mm6
Packit 5c3484
	punpckldq %mm6, %mm6
Packit 5c3484
Packit 5c3484
	movl	$0x0F0F0F0F, %edx
Packit 5c3484
	movd	%edx, %mm5
Packit 5c3484
	punpckldq %mm5, %mm5
Packit 5c3484
Packit 5c3484
HAM(`	movl	PARAM_SRC2, %edx')
Packit 5c3484
Packit 5c3484
',`
Packit 5c3484
	dnl non-PIC
Packit 5c3484
HAM(`	movl	PARAM_SRC2, %edx')
Packit 5c3484
	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
Packit 5c3484
	movq	L(rodata_3333333333333333), %mm6
Packit 5c3484
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
	pxor	%mm4, %mm4		C zero
Packit 5c3484
	pxor	%mm0, %mm0		C total
Packit 5c3484
Packit 5c3484
	C ecx becomes size-1.  The unsigned test sends size >= 2 into the
	C two-limb loop, while size == 1 falls into L(last) with ecx == 0.
	C A size of 0 is not handled specially: the subtract borrows, ja
	C is not taken, and L(last) runs with ecx == -1, which reads the
	C limb just below src.
	C NOTE(review): presumably callers always pass size >= 1 -- confirm.
	subl	$1, %ecx
Packit 5c3484
	ja	L(top)
Packit 5c3484
Packit 5c3484
	C Single limb case, used for size == 1 and for the odd limb left
	C over after the loop.  For size >= 1 ecx is 0 here, so the load
	C address is simply eax.  movd leaves the upper half of mm1 zero,
	C so that half adds nothing to the count.
L(last):
Packit 5c3484
	movd	(%eax,%ecx,4), %mm1		C src high limb
Packit 5c3484
HAM(`	movd	(%edx,%ecx,4), %mm2
Packit 5c3484
	pxor	%mm2, %mm1
Packit 5c3484
')
Packit 5c3484
	jmp	L(loaded)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(top):
Packit 5c3484
	C eax	src
Packit 5c3484
	C ebx
Packit 5c3484
	C ecx	counter, size-1 to 2 or 1, inclusive
Packit 5c3484
	C edx	[hamdist] src2
Packit 5c3484
	C
Packit 5c3484
	C mm0	total (low dword)
Packit 5c3484
	C mm1	(scratch)
Packit 5c3484
	C mm2	(scratch)
Packit 5c3484
	C mm3
Packit 5c3484
	C mm4	0x0000000000000000
Packit 5c3484
	C mm5	0x0F0F0F0F0F0F0F0F
Packit 5c3484
	C mm6	0x3333333333333333
Packit 5c3484
	C mm7	0xAAAAAAAAAAAAAAAA
Packit 5c3484
Packit 5c3484
	C Fetch two limbs with separate 32-bit loads and merge them into
	C one 64-bit value in mm1, then step src past both limbs.
	movd	(%eax), %mm1
Packit 5c3484
	movd	4(%eax), %mm2
Packit 5c3484
	punpckldq %mm2, %mm1
Packit 5c3484
	addl	$8, %eax
Packit 5c3484
Packit 5c3484
	C [hamdist] Fetch the matching two limbs of src2 the same way and
	C xor them into mm1, so the bits counted below are the differing
	C ones.  mm3 is used as the extra scratch register here.
HAM(`	movd	(%edx), %mm2
Packit 5c3484
	movd	4(%edx), %mm3
Packit 5c3484
	punpckldq %mm3, %mm2
Packit 5c3484
	pxor	%mm2, %mm1
Packit 5c3484
	addl	$8, %edx
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
	C Shared counting code.  On entry mm1 holds the bits to count.
L(loaded):
Packit 5c3484
	C Stage 1: x = x - ((x & 0xAAAA...) >> 1).  Each 2-bit field then
	C holds the number of bits that were set in it.
	movq	%mm7, %mm2
Packit 5c3484
	pand	%mm1, %mm2
Packit 5c3484
	psrlq	$1, %mm2
Packit 5c3484
	psubd	%mm2, %mm1	C bit pairs
Packit 5c3484
Packit 5c3484
	C Stage 2: x = (x & 0x3333...) + ((x >> 2) & 0x3333...).  Adjacent
	C 2-bit counts are added, giving one count per 4-bit field.
	movq	%mm6, %mm2
Packit 5c3484
	pand	%mm1, %mm2
Packit 5c3484
	psrlq	$2, %mm1
Packit 5c3484
	pand	%mm6, %mm1
Packit 5c3484
	paddd	%mm2, %mm1	C nibbles
Packit 5c3484
Packit 5c3484
	C Stage 3: x = (x & 0x0F0F...) + ((x >> 4) & 0x0F0F...).  Adjacent
	C nibble counts are added, giving one count per byte.
	movq	%mm5, %mm2
Packit 5c3484
	pand	%mm1, %mm2
Packit 5c3484
	psrlq	$4, %mm1
Packit 5c3484
	pand	%mm5, %mm1
Packit 5c3484
	paddd	%mm2, %mm1	C bytes
Packit 5c3484
Packit 5c3484
	C Stage 4: sum of absolute differences against the zero in mm4
	C adds the eight byte counts into one value, which then goes onto
	C the running total in mm0.
	C NOTE(review): psadbw is written as an m4 macro call here; the
	C macro is defined elsewhere -- confirm what it emits.
	psadbw(	%mm4, %mm1)
Packit 5c3484
	paddd	%mm1, %mm0	C to total
Packit 5c3484
Packit 5c3484
	C Two limbs consumed.  Loop again while more than one remains.
	subl	$2, %ecx
Packit 5c3484
	jg	L(top)
Packit 5c3484
Packit 5c3484
	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
	C (jg leaves the flags from the subl intact for this jz).  When
	C the counting code was reached through L(last) ecx is more
	C negative still, so neither branch is taken and control falls
	C through to the exit.
Packit 5c3484
	jz	L(last)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C Return the low dword of the total and leave the MMX state empty.
	movd	%mm0, %eax
Packit 5c3484
	emms
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
EPILOGUE()