/* Function sincosf vectorized with SSE4.
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#include "svml_s_trig_data.h"
Packit 6c4009
Packit 6c4009
	.text
Packit 6c4009
ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
/*
   Vector sincosf for four packed single-precision lanes.

   Input:   %xmm0 - four float arguments.
            %rdi  - destination of the four sine results.
            %rsi  - destination of the four cosine results.

   Both result vectors are written with unaligned stores, so neither
   pointer has to be 16-byte aligned.

   Lanes for which |x| compares "not less or equal" against
   __sRangeReductionVal (unordered lanes included) are recomputed one
   at a time with the scalar sinf and cosf.

   The registers %xmm8-%xmm15 that are used here, or that may be
   clobbered by the scalar calls, are spilled to the frame and reloaded
   before returning.

   Frame layout after the prologue (offsets from the 64-byte aligned
   %rsp, 320 bytes in total):
       0   %xmm15   (slow path only)
      16   %xmm14   (slow path only)
      32   %xmm11   (slow path only)
      48   %xmm8    (slow path only)
      64   %rsi     (slow path only)
      72   %rbx     (slow path only)
      80   %r15     (slow path only)
      88   %r14     (slow path only)
      96   %r13     (slow path only)
     104   %r12     (slow path only)
     112   %xmm13
     128   copy of the input vector   (slow path only)
     144   %xmm10
     160   %xmm9
     176   %xmm12
     192   sine results               (slow path only)
     256   cosine results             (slow path only)

   ALGORITHM DESCRIPTION:

     1) Range reduction to [-Pi/4; +Pi/4] interval
        a) Grab sign from source argument and save it.
        b) Remove sign using AND operation
        c) Getting octant Y by 2/Pi multiplication
        d) Add "Right Shifter" value
        e) Treat obtained value as integer S for destination sign setting.
           SS = ((S-S&1)&2)<<30; For sin part
           SC = ((S+S&1)&2)<<30; For cos part
        f) Change destination sign if source sign is negative
           using XOR operation.
        g) Subtract "Right Shifter" (0x4B000000) value
        h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
           X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
     2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
        a) Calculate X^2 = X * X
        b) Calculate 2 polynomials for sin and cos:
           RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
           RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
        c) Swap RS & RC if first bit of obtained value after
           Right Shifting is set to 1. Using And, Andnot & Or operations.
     3) Destination sign setting
        a) Set shifted destination sign using XOR operation:
           R1 = XOR( RS, SS );
           R2 = XOR( RC, SC ).

   NOTE(review): the instruction sequence below does not follow every
   step of this description literally.  It multiplies by __sInvPI,
   evaluates the same odd polynomial with the coefficients
   __sA3, __sA5, __sA7 and __sA9 for both results, obtains the cosine
   argument by adding a signed __sOneHalf to the rounded multiple before
   the subtraction, and contains no swap step.  The description is kept
   as found; confirm against the table in svml_s_trig_data.h before
   relying on it.  */

/* Prologue: frame pointer in %rbp, %rsp aligned down to 64 bytes with
   320 bytes of scratch below it.  */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $320, %rsp
/* %rax = address of the constant table; every __sXXX(%rax) operand
   below is a load from that table.  */
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        movups    %xmm12, 176(%rsp)
        movups    %xmm9, 160(%rsp)
        movups __sAbsMask(%rax), %xmm12

/* Absolute argument computation:
   %xmm5  = x & __sAbsMask    (|x|)
   %xmm12 = x & ~__sAbsMask   (the bits removed by the mask; applied to
                               the sine result with XOR at the end).  */
        movaps    %xmm12, %xmm5
        andnps    %xmm0, %xmm12
        movups __sInvPI(%rax), %xmm7
        andps     %xmm0, %xmm5

/* c) Getting octant Y by 2/Pi multiplication
   d) Add "Right Shifter" value.  */
        mulps     %xmm5, %xmm7
        movups    %xmm10, 144(%rsp)
        movups __sPI1(%rax), %xmm10

/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4.
   The subtraction is carried out twice in an interleaved fashion:
   in %xmm13 with Y = %xmm7 (sine) and in %xmm1 with Y = %xmm4
   (cosine).  */
        movaps    %xmm10, %xmm1
        addps __sRShifter(%rax), %xmm7

/* e) Treat obtained value as integer S for destination sign setting */
        movaps    %xmm7, %xmm9

/* g) Subtract "Right Shifter" (0x4B000000) value */
        subps __sRShifter(%rax), %xmm7
        mulps     %xmm7, %xmm1
/* Move bit 0 of the shifted value into bit 31 of each lane.  */
        pslld     $31, %xmm9
        movups __sPI2(%rax), %xmm6
        movups    %xmm13, 112(%rsp)
        movaps    %xmm5, %xmm13
        movaps    %xmm6, %xmm2
        subps     %xmm1, %xmm13
        mulps     %xmm7, %xmm2
        movups __sSignMask(%rax), %xmm3
        movaps    %xmm5, %xmm1
        movups __sOneHalf(%rax), %xmm4
        subps     %xmm2, %xmm13
/* %xmm5 becomes an all-ones mask in every lane where |x| is not
   less than or equal to __sRangeReductionVal.  */
        cmpnleps __sRangeReductionVal(%rax), %xmm5
/* %xmm2 = __sSignMask bits of the partially reduced sine argument;
   they are XORed into __sOneHalf to form the cosine correction.  */
        movaps    %xmm3, %xmm2
        andps     %xmm13, %xmm2
        xorps     %xmm2, %xmm4

/* Result sign calculations */
        xorps     %xmm2, %xmm3
        xorps     %xmm9, %xmm3

/* Add correction term 0.5 for cos() part */
        addps     %xmm7, %xmm4
/* %ecx = one bit per lane (bits 0-3) that needs the scalar fallback.  */
        movmskps  %xmm5, %ecx
        mulps     %xmm4, %xmm10
        mulps     %xmm4, %xmm6
        subps     %xmm10, %xmm1
        movups __sPI3(%rax), %xmm10
        subps     %xmm6, %xmm1
        movaps    %xmm10, %xmm6
        mulps     %xmm7, %xmm6
        mulps     %xmm4, %xmm10
        subps     %xmm6, %xmm13
        subps     %xmm10, %xmm1
        movups __sPI4(%rax), %xmm6
        mulps     %xmm6, %xmm7
        mulps     %xmm6, %xmm4
        subps     %xmm7, %xmm13
        subps     %xmm4, %xmm1
/* Fold the computed sign bits into the two reduced arguments.  */
        xorps     %xmm9, %xmm13
        xorps     %xmm3, %xmm1
/* %xmm4 = square of the sine argument, %xmm2 = square of the cosine
   argument.  */
        movaps    %xmm13, %xmm4
        movaps    %xmm1, %xmm2
        mulps     %xmm13, %xmm4
        mulps     %xmm1, %xmm2
        movups __sA9(%rax), %xmm7

/* 2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
      a) Calculate X^2 = X * X
      b) Calculate 2 polynomials for sin and cos:
         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))))
   As written, both chains evaluate
         R + R * R^2 * (A3 + R^2 * (A5 + R^2 * (A7 + R^2 * A9)))
   with %xmm3/%xmm4/%xmm13 for the sine and %xmm7/%xmm2/%xmm1 for the
   cosine.  */
        movaps    %xmm7, %xmm3
        mulps     %xmm4, %xmm3
        mulps     %xmm2, %xmm7
        addps __sA7(%rax), %xmm3
        addps __sA7(%rax), %xmm7
        mulps     %xmm4, %xmm3
        mulps     %xmm2, %xmm7
        addps __sA5(%rax), %xmm3
        addps __sA5(%rax), %xmm7
        mulps     %xmm4, %xmm3
        mulps     %xmm2, %xmm7
        addps __sA3(%rax), %xmm3
        addps __sA3(%rax), %xmm7
        mulps     %xmm3, %xmm4
        mulps     %xmm7, %xmm2
        mulps     %xmm13, %xmm4
        mulps     %xmm1, %xmm2
        addps     %xmm4, %xmm13
        addps     %xmm2, %xmm1
/* Apply the bits taken from the source argument to the sine result.  */
        xorps     %xmm12, %xmm13
/* Any lane flagged for the scalar fallback?  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: %xmm13 holds the sine vector, %xmm1 the cosine vector.
   The sine store must stay ahead of the reload of %xmm13.  */
.LBL_1_2:
        cfi_remember_state
        movups    160(%rsp), %xmm9
/* Unaligned store: the alignment of the caller's buffer is not known
   here, and the cosine store below is unaligned as well.  */
        movups    %xmm13, (%rdi)
        movups    144(%rsp), %xmm10
        movups    176(%rsp), %xmm12
        movups    112(%rsp), %xmm13
        movups    %xmm1, (%rsi)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path: spill the input and both vector results so that single
   lanes can be patched in memory.  */
.LBL_1_3:
        cfi_restore_state
        movups    %xmm0, 128(%rsp)
        movups    %xmm13, 192(%rsp)
        movups    %xmm1, 256(%rsp)
/* Never taken: this point is reached only through the jne above and
   the movups stores in between leave the flags untouched.  */
        je        .LBL_1_2

/* Save what the scalar calls may clobber and set up the loop state:
   %r12b = pair counter, %r13d = bit index (twice the pair counter),
   %r14d = lane mask, %rbx = sine destination pointer.  */
        xorb      %dl, %dl
        xorl      %eax, %eax
        movups    %xmm8, 48(%rsp)
        movups    %xmm11, 32(%rsp)
        movups    %xmm14, 16(%rsp)
        movups    %xmm15, (%rsp)
        movq      %rsi, 64(%rsp)
        movq      %r12, 104(%rsp)
        cfi_offset_rel_rsp (12, 104)
        movb      %dl, %r12b
        movq      %r13, 96(%rsp)
        cfi_offset_rel_rsp (13, 96)
        movl      %eax, %r13d
        movq      %r14, 88(%rsp)
        cfi_offset_rel_rsp (14, 88)
        movl      %ecx, %r14d
        movq      %r15, 80(%rsp)
        cfi_offset_rel_rsp (15, 80)
        movq      %rbx, 72(%rsp)
        movq      %rdi, %rbx
        cfi_remember_state

/* Each iteration inspects two mask bits: the even lane first, then the
   odd lane.  */
.LBL_1_6:
        btl       %r13d, %r14d
        jc        .LBL_1_13

.LBL_1_7:
        lea       1(%r13), %esi
        btl       %esi, %r14d
        jc        .LBL_1_10

/* The loop bound is 16 pairs although movmskps can set only bits 0-3
   of the mask, so only the first two iterations can find work.  */
.LBL_1_8:
        incb      %r12b
        addl      $2, %r13d
        cmpb      $16, %r12b
        jb        .LBL_1_6

/* Reload the saved registers and the patched result vectors, then
   leave through the common exit.  */
        movups    48(%rsp), %xmm8
        movq      %rbx, %rdi
        movups    32(%rsp), %xmm11
        movups    16(%rsp), %xmm14
        movups    (%rsp), %xmm15
        movq      64(%rsp), %rsi
        movq      104(%rsp), %r12
        cfi_restore (%r12)
        movq      96(%rsp), %r13
        cfi_restore (%r13)
        movq      88(%rsp), %r14
        cfi_restore (%r14)
        movq      80(%rsp), %r15
        cfi_restore (%r15)
        movq      72(%rsp), %rbx
        movups    192(%rsp), %xmm13
        movups    256(%rsp), %xmm1
        jmp       .LBL_1_2

/* Odd lane of the current pair: element 2 * %r12b + 1, i.e. byte
   offset 8 * %r12b + 4 inside each spilled vector.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        movss     132(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        movss     %xmm0, 196(%rsp,%r15,8)
        movss     132(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        movss     %xmm0, 260(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Even lane of the current pair: element 2 * %r12b, i.e. byte offset
   8 * %r12b inside each spilled vector.  */
.LBL_1_13:
        movzbl    %r12b, %r15d
        movss     128(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        movss     %xmm0, 192(%rsp,%r15,8)
        movss     128(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        movss     %xmm0, 256(%rsp,%r15,8)
        jmp       .LBL_1_7

END (_ZGVbN4vl4l4_sincosf_sse4)
libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
Packit 6c4009
Packit 6c4009
/* vvv version implemented with wrapper to vl4l4 variant.

   Input:   %xmm0 - four float arguments, passed through unchanged.
   LP64:    %xmm1, %xmm2 - four 64-bit sine destination pointers.
            %xmm3, %xmm4 - four 64-bit cosine destination pointers.
   ILP32:   %xmm1 - four 32-bit sine destination pointers.
            %xmm2 - four 32-bit cosine destination pointers.

   The pointer vectors are spilled to the stack, the vl4l4 variant is
   called with two stack buffers as destinations, and afterwards every
   result is copied through its own pointer.  */
ENTRY (_ZGVbN4vvv_sincosf_sse4)
#ifndef __ILP32__
/* Frame (104 bytes):
       0   four sine results      (written by the callee via %rdi)
      16   four cosine results    (written by the callee via %rsi)
      32   sine pointers 0-1      (%xmm1)
      48   sine pointers 2-3      (%xmm2)
      64   cosine pointers 0-1    (%xmm3)
      80   cosine pointers 2-3    (%xmm4)  */
        subq      $104, %rsp
        .cfi_def_cfa_offset 112
        movdqu    %xmm1, 32(%rsp)
        lea       (%rsp), %rdi
        movdqu    %xmm2, 48(%rdi)
        lea       16(%rsp), %rsi
        movdqu    %xmm3, 48(%rsi)
        movdqu    %xmm4, 64(%rsi)
        call      HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
/* Scatter the sine results: pointers in %rdx, %rsi, %r8, %r10 and
   values in %eax, %ecx, %edi, %r9d.  */
        movq      32(%rsp), %rdx
        movq      40(%rsp), %rsi
        movq      48(%rsp), %r8
        movq      56(%rsp), %r10
        movl      (%rsp), %eax
        movl      4(%rsp), %ecx
        movl      8(%rsp), %edi
        movl      12(%rsp), %r9d
        movl      %eax, (%rdx)
        movl      %ecx, (%rsi)
/* The loads of the cosine pointers are interleaved with the remaining
   sine stores; each register is reused only after its previous value
   has been consumed.  */
        movq      64(%rsp), %rax
        movq      72(%rsp), %rcx
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movq      80(%rsp), %rdi
        movq      88(%rsp), %r9
/* Scatter the cosine results: pointers in %rax, %rcx, %rdi, %r9 and
   values in %r11d, %edx, %esi, %r8d.  */
        movl      16(%rsp), %r11d
        movl      20(%rsp), %edx
        movl      24(%rsp), %esi
        movl      28(%rsp), %r8d
        movl      %r11d, (%rax)
        movl      %edx, (%rcx)
        movl      %esi, (%rdi)
        movl      %r8d, (%r9)
        addq      $104, %rsp
        .cfi_def_cfa_offset 8
        ret
#else
/* Frame (72 bytes):
       0   four cosine pointers   (%xmm2)
      16   four sine pointers     (%xmm1)
      32   four sine results      (written by the callee via %edi)
      48   four cosine results    (written by the callee via %esi)  */
        subl    $72, %esp
        .cfi_def_cfa_offset 80
        leal    48(%rsp), %esi
        movaps  %xmm1, 16(%esp)
        leal    32(%rsp), %edi
        movaps  %xmm2, (%esp)
        call    HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
/* Sine results 0-3: load the pointer, load the value, store it.  */
        movl    16(%esp), %eax
        movss   32(%esp), %xmm0
        movss   %xmm0, (%eax)
        movl    20(%esp), %eax
        movss   36(%esp), %xmm0
        movss   %xmm0, (%eax)
        movl    24(%esp), %eax
        movss   40(%esp), %xmm0
        movss   %xmm0, (%eax)
        movl    28(%esp), %eax
        movss   44(%esp), %xmm0
        movss   %xmm0, (%eax)
/* Cosine results 0-3, same pattern.  */
        movl    (%esp), %eax
        movss   48(%esp), %xmm0
        movss   %xmm0, (%eax)
        movl    4(%esp), %eax
        movss   52(%esp), %xmm0
        movss   %xmm0, (%eax)
        movl    8(%esp), %eax
        movss   56(%esp), %xmm0
        movss   %xmm0, (%eax)
        movl    12(%esp), %eax
        movss   60(%esp), %xmm0
        movss   %xmm0, (%eax)
        addl    $72, %esp
        .cfi_def_cfa_offset 8
        ret
#endif
END (_ZGVbN4vvv_sincosf_sse4)