Blame sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S

Packit 6c4009
/* Optimized with sse2 version of cosf
Packit 6c4009
   Copyright (C) 2012-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#include <errno.h>
Packit 6c4009
Packit 6c4009
/* Short algorithm description:
Packit 6c4009
 *
Packit 6c4009
 *  1) if |x| == 0: return 1.0-|x|.
Packit 6c4009
 *  2) if |x| <  2^-27: return 1.0-|x|.
Packit 6c4009
 *  3) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
Packit 6c4009
 *  4) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
Packit 6c4009
 *  5) if |x| < 9*Pi/4:
Packit 6c4009
 *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
Packit 6c4009
 *           t=|x|-j*Pi/4.
Packit 6c4009
 *      5.2) Reconstruction:
Packit 6c4009
 *          s = (-1.0)^((n>>2)&1)
Packit 6c4009
 *          if(n&2 != 0) {
Packit 6c4009
 *              using cos(t) polynomial for |t|<Pi/4:
Packit 6c4009
 *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
Packit 6c4009
 *          } else {
Packit 6c4009
 *              using sin(t) polynomial for |t|<Pi/4:
Packit 6c4009
 *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
Packit 6c4009
 *          }
Packit 6c4009
 *  6) if |x| < 2^23, large args:
Packit 6c4009
 *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
Packit 6c4009
 *           t=|x|-j*Pi/4.
Packit 6c4009
 *      6.2) Reconstruction same as (5.2).
Packit 6c4009
 *  7) if |x| >= 2^23, very large args:
Packit 6c4009
 *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
Packit 6c4009
 *           t=|x|-j*Pi/4.
Packit 6c4009
 *      7.2) Reconstruction same as (5.2).
Packit 6c4009
 *  8) if x is Inf, return x-x, and set errno=EDOM.
Packit 6c4009
 *  9) if x is NaN, return x-x.
Packit 6c4009
 *
Packit 6c4009
 * Special cases:
Packit 6c4009
 *  cos(+-0) = 1 not raising inexact,
Packit 6c4009
 *  cos(subnormal) raises inexact,
Packit 6c4009
 *  cos(min_normalized) raises inexact,
Packit 6c4009
 *  cos(normalized) raises inexact,
Packit 6c4009
 *  cos(Inf) = NaN, raises invalid, sets errno to EDOM,
Packit 6c4009
 *  cos(NaN) = NaN.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
/* Addressing and prologue/epilogue helpers.
 *
 *  MO1(symbol)             memory operand for the local constant `symbol'.
 *  MO2(symbol,reg2,_scale) same, indexed by reg2*_scale (table lookup).
 *  ENTRANCE / RETURN       function prologue / epilogue.
 *  ARG_X                   stack slot holding the float argument x.
 *
 * In the PIC variant constants are addressed relative to %ebx through
 * @GOTOFF, so ENTRANCE has to save %ebx and load it (LOAD_PIC_REG comes
 * from sysdep.h) and RETURN has to restore it.  Because of that extra
 * 4-byte push, ARG_X is 8(%esp) there instead of 4(%esp).
 */
#ifdef	PIC
# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)
# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)
/* Unwind annotations matching a 4-byte push/pop of REG.  */
# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
# define POP(REG)			popl REG; CFI_POP(REG)
# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)
/* The trailing CFI_PUSH re-establishes the "%ebx is pushed" unwind state
   for the code that follows the ret (RETURN is used on several paths).  */
# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)
# define ARG_X				8(%esp)
#else
/* Non-PIC: constants are addressed absolutely, nothing to save.  */
# define MO1(symbol)			L(symbol)
# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
# define ENTRANCE
# define RETURN				ret
# define ARG_X				4(%esp)
#endif

	.text
ENTRY(__cosf_sse2)
	/* Input: single precision x on stack at address ARG_X.
	 * Output: cos(x) on top of the x87 stack; every exit path stores
	 * the SSE result to a borrowed stack slot and reloads it with
	 * flds/fldl before RETURN.
	 * Scratch: %eax, %ecx, %edx, %xmm0-%xmm6, flags.
	 */

	ENTRANCE
	movl	ARG_X, %eax		/* Bits of x */
	cvtss2sd ARG_X, %xmm0		/* DP x */
	andl	$0x7fffffff, %eax	/* |x| */

	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4?  */
	jb	L(arg_less_pio4)

	/* Here if |x|>=Pi/4 */
	movd	%eax, %xmm3		/* SP |x| */
	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */

	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4?  */
	jae	L(large_args)

	/* Here if Pi/4<=|x|<9*Pi/4 */
	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
	addl	$1, %eax		/* k+1 */
	movl	$0x0e, %edx
	andl	%eax, %edx		/* j = (k+1)&0x0e */
	addl	$2, %eax		/* n = k+3 */
	subsd	MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */

L(reconstruction):
	/* Input: %eax=n, %xmm0=t */
	testl	$2, %eax		/* n&2 != 0?  */
	jz	L(sin_poly)

/*L(cos_poly):*/
	/* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
	 * y = t*t; z = y*y;
	 * s = (-1.0)^((n>>2)&1)
	 * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
	 * The even- and odd-indexed coefficient chains are evaluated
	 * independently in z and combined at the end.
	 */
	shrl	$2, %eax		/* n>>2 */
	mulsd	%xmm0, %xmm0		/* y=t^2 */
	andl	$1, %eax		/* (n>>2)&1 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=t^4 */

	movsd	MO1(DP_C4), %xmm4	/* C4 */
	mulsd	%xmm0, %xmm4		/* z*C4 */
	movsd	MO1(DP_C3), %xmm3	/* C3 */
	mulsd	%xmm0, %xmm3		/* z*C3 */
	addsd	MO1(DP_C2), %xmm4	/* C2+z*C4 */
	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
	addsd	MO1(DP_C1), %xmm3	/* C1+z*C3 */
	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
	addsd	MO1(DP_C0), %xmm4	/* C0+z*(C2+z*C4) */
	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */

	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	addsd	MO1(DP_ONES), %xmm3

	/* Apply the sign s = DP_ONES[(n>>2)&1].  */
	mulsd	MO2(DP_ONES,%eax,8), %xmm3 /* DP result */
	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
	fldl	0(%esp)			/* ...to FPU.  */
	/* Return back 8 bytes of stack frame */
	lea	8(%esp), %esp
	RETURN

	.p2align	4
L(sin_poly):
	/* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
	 * y = t*t; z = y*y;
	 * s = (-1.0)^((n>>2)&1)
	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
	 */

	movaps	%xmm0, %xmm4		/* t */
	shrl	$2, %eax		/* n>>2 */
	mulsd	%xmm0, %xmm0		/* y=t^2 */
	andl	$1, %eax		/* (n>>2)&1 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=t^4 */

	movsd	MO1(DP_S4), %xmm2	/* S4 */
	mulsd	%xmm0, %xmm2		/* z*S4 */
	movsd	MO1(DP_S3), %xmm3	/* S3 */
	mulsd	%xmm0, %xmm3		/* z*S3 */
	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
	addsd	MO1(DP_S2), %xmm2	/* S2+z*S4 */
	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
	addsd	MO1(DP_S1), %xmm3	/* S1+z*S3 */
	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
	addsd	MO1(DP_S0), %xmm2	/* S0+z*(S2+z*S4) */
	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
	/* t*s, where s = (-1.0)^((n>>2)&1) */
	mulsd	MO2(DP_ONES,%eax,8), %xmm4
	addsd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	mulsd	%xmm4, %xmm3
	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
	addsd	%xmm4, %xmm3
	movsd	%xmm3, 0(%esp)		/* Move result from sse...   */
	fldl	0(%esp)			/* ...to FPU.  */
	/* Return back 8 bytes of stack frame */
	lea	8(%esp), %esp
	RETURN

	.p2align	4
L(large_args):
	/* Here if |x|>=9*Pi/4 */
	cmpl	$0x7f800000, %eax	/* x is Inf or NaN?  */
	/* The flags of this compare are reused at L(arg_inf_or_nan) to
	   tell Inf (equal) from NaN (above).  */
	jae	L(arg_inf_or_nan)

	/* Here if finite |x|>=9*Pi/4 */
	cmpl	$0x4b000000, %eax	/* |x|<2^23?  */
	jae	L(very_large_args)

	/* Here if 9*Pi/4<=|x|<2^23 */
	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
	addl	$1, %eax		/* k+1 */
	movl	%eax, %edx
	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
	cvtsi2sdl %edx, %xmm4		/* DP j */
	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
	addl	$2, %eax		/* n = k+3 */
	mulsd	%xmm3, %xmm4		/* -j*PIO4LO */
	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
	jmp	L(reconstruction)

	.p2align	4
L(very_large_args):
	/* Here if finite |x|>=2^23 */

	/* bitpos = (ix>>23) - BIAS_32 + 59; */
	shrl	$23, %eax		/* eb = biased exponent of x */
	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
	subl	$68, %eax
	movl	$28, %ecx		/* %cl=28 */
	movl	%eax, %edx		/* bitpos copy */

	/* j = bitpos/28; */
	div	%cl			/* j in register %al=%ax/%cl */
	movapd	%xmm0, %xmm3		/* |x| */
	/* clear unneeded remainder from %ah */
	andl	$0xff, %eax

	imull	$28, %eax, %ecx		/* j*28 */
	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
	movapd	%xmm0, %xmm5		/* |x| */
	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
	movapd	%xmm0, %xmm1		/* |x| */
	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
	addl	$19, %ecx		/* j*28+19 */
	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
	cmpl	%ecx, %edx		/* bitpos>=j*28+19?  */
	jl	L(very_large_skip1)

	/* Here if bitpos>=j*28+19 */
	andpd	%xmm3, %xmm4		/* HI(tmp3) */
	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
L(very_large_skip1):

	movsd	MO1(DP_2POW52), %xmm6	/* +2^52 */
	movapd	%xmm5, %xmm2		/* tmp2 copy */
	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
	movl	$1, %edx
	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
	movsd	8+MO1(DP_2POW52), %xmm4	/* -2^52 */
	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
	jbe	L(very_large_skip2)

	/* Here if tmp4 > tmp5 */
	subl	$1, %eax		/* k-- */
	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 */
L(very_large_skip2):

	andl	%eax, %edx		/* k&1 */
	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
	addsd	MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
	addsd	%xmm2, %xmm3		/* t += tmp2 */
	addsd	%xmm3, %xmm0		/* t += tmp0 */
	addl	$3, %eax		/* n=k+3 */
	addsd	%xmm1, %xmm0		/* t += tmp1 */
	mulsd	MO1(DP_PIO4), %xmm0	/* t *= Pi/4 */

	jmp	L(reconstruction)	/* end of very_large_args path */

	.p2align	4
L(arg_less_pio4):
	/* Here if |x|<Pi/4 */
	cmpl	$0x3d000000, %eax	/* |x|<2^-5?  */
	jl	L(arg_less_2pn5)

	/* Here if 2^-5<=|x|<Pi/4 */
	mulsd	%xmm0, %xmm0		/* y=x^2 */
	movaps	%xmm0, %xmm1		/* y */
	mulsd	%xmm0, %xmm0		/* z=x^4 */
	movsd	MO1(DP_C4), %xmm3	/* C4 */
	mulsd	%xmm0, %xmm3		/* z*C4 */
	movsd	MO1(DP_C3), %xmm5	/* C3 */
	mulsd	%xmm0, %xmm5		/* z*C3 */
	addsd	MO1(DP_C2), %xmm3	/* C2+z*C4 */
	mulsd	%xmm0, %xmm3		/* z*(C2+z*C4) */
	addsd	MO1(DP_C1), %xmm5	/* C1+z*C3 */
	mulsd	%xmm0, %xmm5		/* z*(C1+z*C3) */
	addsd	MO1(DP_C0), %xmm3	/* C0+z*(C2+z*C4) */
	mulsd	%xmm1, %xmm3		/* y*(C0+z*(C2+z*C4)) */
	addsd	%xmm5, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	/* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
	addsd	MO1(DP_ONES), %xmm3
	cvtsd2ss %xmm3, %xmm3		/* SP result */

L(epilogue):
	/* Input: %xmm3 = single precision result.  */
	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
	movss	%xmm3, 0(%esp)		/* Move result from sse...  */
	flds	0(%esp)			/* ...to FPU.  */
	/* Return back 4 bytes of stack frame */
	lea	4(%esp), %esp
	RETURN

	.p2align	4
L(arg_less_2pn5):
	/* Here if |x|<2^-5 */
	cmpl	$0x32000000, %eax	/* |x|<2^-27?  */
	jl	L(arg_less_2pn27)

	/* Here if 2^-27<=|x|<2^-5 */
	mulsd	%xmm0, %xmm0		/* DP x^2 */
	movsd	MO1(DP_COS2_1), %xmm3	/* DP DP_COS2_1 */
	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_1 */
	addsd	MO1(DP_COS2_0), %xmm3	/* DP DP_COS2_0+x^2*DP_COS2_1 */
	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
	/* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
	addsd	MO1(DP_ONES), %xmm3
	cvtsd2ss %xmm3, %xmm3		/* SP result */
	jmp	L(epilogue)

	.p2align	4
L(arg_less_2pn27):
	/* Here if |x|<2^-27 */
	movss	ARG_X, %xmm0		/* x */
	andps	MO1(SP_ABS_MASK),%xmm0	/* |x| */
	movss	MO1(SP_ONE), %xmm3	/* 1.0 */
	subss	%xmm0, %xmm3		/* result is 1.0-|x| */
	jmp	L(epilogue)

	.p2align	4
L(arg_inf_or_nan):
	/* Here if |x| is Inf or NAN.  Flags still come from the
	   cmpl $0x7f800000 in L(large_args): not-equal means NaN.  */
	jne	L(skip_errno_setting)	/* in case of x is NaN */

	/* Here if x is Inf. Set errno to EDOM.  */
	call	JUMPTARGET(__errno_location)
	movl	$EDOM, (%eax)

	.p2align	4
L(skip_errno_setting):
	/* Here if |x| is Inf or NAN. Continued.  */
	movss	ARG_X, %xmm3		/* load x */
	subss	%xmm3, %xmm3		/* Result is NaN */
	jmp	L(epilogue)
END(__cosf_sse2)

	.section .rodata, "a"
	/* Every double below is written as two .long words, low word
	   first.  */
	.p2align 3
L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
	.long	0x00000000,0x00000000	/*  0*Pi/4 */
	.long	0x54442d18,0x3fe921fb	/*  1*Pi/4 */
	.long	0x54442d18,0x3ff921fb	/*  2*Pi/4 */
	.long	0x7f3321d2,0x4002d97c	/*  3*Pi/4 */
	.long	0x54442d18,0x400921fb	/*  4*Pi/4 */
	.long	0x2955385e,0x400f6a7a	/*  5*Pi/4 */
	.long	0x7f3321d2,0x4012d97c	/*  6*Pi/4 */
	.long	0xe9bba775,0x4015fdbb	/*  7*Pi/4 */
	.long	0x54442d18,0x401921fb	/*  8*Pi/4 */
	.long	0xbeccb2bb,0x401c463a	/*  9*Pi/4 */
	.long	0x2955385e,0x401f6a7a	/* 10*Pi/4 */
	.type L(PIO4J), @object
	ASM_SIZE_DIRECTIVE(L(PIO4J))

	.p2align 3
L(_FPI): /* 4/Pi broken into sum of positive DP values */
	/* The very-large-argument path reads four consecutive entries,
	   FPI[j-2] .. FPI[j+1], with j = bitpos/28.  */
	.long	0x00000000,0x00000000
	.long	0x6c000000,0x3ff45f30
	.long	0x2a000000,0x3e3c9c88
	.long	0xa8000000,0x3c54fe13
	.long	0xd0000000,0x3aaf47d4
	.long	0x6c000000,0x38fbb81b
	.long	0xe0000000,0x3714acc9
	.long	0x7c000000,0x3560e410
	.long	0x56000000,0x33bca2c7
	.long	0xac000000,0x31fbd778
	.long	0xe0000000,0x300b7246
	.long	0xe8000000,0x2e5d2126
	.long	0x48000000,0x2c970032
	.long	0xe8000000,0x2ad77504
	.long	0xe0000000,0x290921cf
	.long	0xb0000000,0x274deb1c
	.long	0xe0000000,0x25829a73
	.long	0xbe000000,0x23fd1046
	.long	0x10000000,0x2224baed
	.long	0x8e000000,0x20709d33
	.long	0x80000000,0x1e535a2f
	.long	0x64000000,0x1cef904e
	.long	0x30000000,0x1b0d6398
	.long	0x24000000,0x1964ce7d
	.long	0x16000000,0x17b908bf
	.type L(_FPI), @object
	ASM_SIZE_DIRECTIVE(L(_FPI))

/* Coefficients of polynomial
 for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5.  */
	.p2align 3
L(DP_COS2_0):
	.long	0xff5cc6fd,0xbfdfffff	/* ~ -1/2 */
	.type L(DP_COS2_0), @object
	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))

	.p2align 3
L(DP_COS2_1):
	.long	0xb178dac5,0x3fa55514	/* ~ +1/24 */
	.type L(DP_COS2_1), @object
	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))

	/* Indexed by k&1 in the very-large-argument path.  */
	.p2align 3
L(DP_ZERONE):
	.long	0x00000000,0x00000000	/*  0.0 */
	.long	0x00000000,0xbff00000	/* -1.0 */
	.type L(DP_ZERONE),@object
	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))

	/* Sign factors: DP_ONES[0] also serves as the constant 1.0,
	   DP_ONES[(n>>2)&1] is the result sign in the reconstruction.  */
	.p2align 3
L(DP_ONES):
	.long	0x00000000,0x3ff00000	/* +1.0 */
	.long	0x00000000,0xbff00000	/* -1.0 */
	.type L(DP_ONES), @object
	ASM_SIZE_DIRECTIVE(L(DP_ONES))

/* Coefficients of polynomial
 for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.  */
	.p2align 3
L(DP_S3):
	.long	0x64e6b5b4,0x3ec71d72	/* ~ +1/9! */
	.type L(DP_S3), @object
	ASM_SIZE_DIRECTIVE(L(DP_S3))

	.p2align 3
L(DP_S1):
	.long	0x10c2688b,0x3f811111	/* ~ +1/5! */
	.type L(DP_S1), @object
	ASM_SIZE_DIRECTIVE(L(DP_S1))

	.p2align 3
L(DP_S4):
	.long	0x1674b58a,0xbe5a947e	/* ~ -1/11! */
	.type L(DP_S4), @object
	ASM_SIZE_DIRECTIVE(L(DP_S4))

	.p2align 3
L(DP_S2):
	.long	0x8b4bd1f9,0xbf2a019f	/* ~ -1/7! */
	.type L(DP_S2), @object
	ASM_SIZE_DIRECTIVE(L(DP_S2))

	.p2align 3
L(DP_S0):
	.long	0x55551cd9,0xbfc55555	/* ~ -1/3! */
	.type L(DP_S0), @object
	ASM_SIZE_DIRECTIVE(L(DP_S0))

/* Coefficients of polynomial
 for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.  */
	.p2align 3
L(DP_C3):
	.long	0x9ac43cc0,0x3efa00eb	/* ~ +1/8! */
	.type L(DP_C3), @object
	ASM_SIZE_DIRECTIVE(L(DP_C3))

	.p2align 3
L(DP_C1):
	.long	0x545c50c7,0x3fa55555	/* ~ +1/4! */
	.type L(DP_C1), @object
	ASM_SIZE_DIRECTIVE(L(DP_C1))

	.p2align 3
L(DP_C4):
	.long	0xdd8844d7,0xbe923c97	/* ~ -1/10! */
	.type L(DP_C4), @object
	ASM_SIZE_DIRECTIVE(L(DP_C4))

	.p2align 3
L(DP_C2):
	.long	0x348b6874,0xbf56c16b	/* ~ -1/6! */
	.type L(DP_C2), @object
	ASM_SIZE_DIRECTIVE(L(DP_C2))

	.p2align 3
L(DP_C0):
	.long	0xfffe98ae,0xbfdfffff	/* ~ -1/2! */
	.type L(DP_C0), @object
	ASM_SIZE_DIRECTIVE(L(DP_C0))

	/* Range-reduction constants.  */
	.p2align 3
L(DP_PIO4):
	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
	.type L(DP_PIO4), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4))

	/* Pair used to round to an integer: +2^52 is added, then the
	   second entry (-2^52) is added back.  */
	.p2align 3
L(DP_2POW52):
	.long	0x00000000,0x43300000	/* +2^52 */
	.long	0x00000000,0xc3300000	/* -2^52 */
	.type L(DP_2POW52), @object
	ASM_SIZE_DIRECTIVE(L(DP_2POW52))

	.p2align 3
L(DP_INVPIO4):
	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
	.type L(DP_INVPIO4), @object
	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))

	/* Both parts of Pi/4 are stored negated so that the reduction
	   t = |x| - j*Pi/4 can be done with multiplies and adds only.  */
	.p2align 3
L(DP_PIO4HI):
	.long	0x54000000,0xbfe921fb	/* -(High part of Pi/4) */
	.type L(DP_PIO4HI), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))

	.p2align 3
L(DP_PIO4LO):
	.long	0x11A62633,0xbe010b46	/* -(Low part of Pi/4) */
	.type L(DP_PIO4LO), @object
	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))

	.p2align 2
L(SP_INVPIO4):
	.long	0x3fa2f983		/* 4/Pi */
	.type L(SP_INVPIO4), @object
	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))

	/* 16-byte aligned: used directly as the memory operand of andpd.  */
	.p2align 4
L(DP_ABS_MASK): /* Mask for getting DP absolute value */
	.long	0xffffffff,0x7fffffff
	.long	0xffffffff,0x7fffffff
	.type L(DP_ABS_MASK), @object
	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))

	.p2align 3
L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value:
		  keeps the upper 32-bit word, clears the lower one */
	.long	0x00000000,0xffffffff
	.type L(DP_HI_MASK), @object
	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))

	/* 16-byte aligned: used directly as the memory operand of andps.  */
	.p2align 4
L(SP_ABS_MASK): /* Mask for getting SP absolute value */
	.long	0x7fffffff,0x7fffffff
	.long	0x7fffffff,0x7fffffff
	.type L(SP_ABS_MASK), @object
	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))

	.p2align 2
L(SP_ONE):
	.long	0x3f800000		/* 1.0 */
	.type L(SP_ONE), @object
	ASM_SIZE_DIRECTIVE(L(SP_ONE))

/* Publish cosf as a weak alias of __cosf.
   NOTE(review): this file only defines __cosf_sse2; __cosf itself is
   presumably provided elsewhere -- confirm.  */
weak_alias (__cosf, cosf)