Blame sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S

Packit 6c4009
/* Function sinf vectorized with SSE4.
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#include "svml_s_trig_data.h"
Packit 6c4009
Packit 6c4009
	.text
Packit 6c4009
ENTRY(_ZGVbN4v_sinf_sse4)
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" value
      h) Subtract Y*PI from X argument, where PI divided to 4 parts:
         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */
/*
   Syntax: AT&T (GAS), assembled through the C preprocessor.

   In:   %xmm0 = four packed single-precision arguments.
   Out:  %xmm0 = four packed single-precision results.

   Fast path register roles:
     %rax  = address of __svml_s_trig_data (loaded through the GOT)
     %xmm5 = untouched copy of the input, kept for the fallback path
     %xmm2 = input bits outside the absolute-value mask (source sign)
     %xmm4 = |x|, then the per-lane "needs fallback" compare mask
     %xmm3 = low integer bit of the shifted quotient moved to bit 31,
             later reused as the polynomial accumulator
     %xmm1 = Y (rounded |x|/Pi), later X^2
     %xmm0 = reduced argument X, then the result
     %ecx  = 4-bit lane mask taken from %xmm4

   Frame: %rsp is aligned down to 64 bytes and then lowered by 320
   (a multiple of 64), so it stays 64-byte aligned at the calls to
   sinf.  Offsets used from %rsp on the fallback path:
       0..127   saved %xmm15 .. %xmm8
     128..143   saved %rdi, %rsi
     144..175   saved %r15, %r14, %r13, %r12
     192..207   copy of the four inputs
     256..271   the four results
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $320, %rsp
        movaps    %xmm0, %xmm5
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        movups    __sAbsMask(%rax), %xmm2

/* b) Remove sign using AND operation */
        movaps    %xmm2, %xmm4

/*
  f) Change destination sign if source sign is negative
  using XOR operation.
  Here only the sign is extracted (x AND NOT AbsMask); it is applied
  to the result by the final xorps.
 */
        andnps    %xmm5, %xmm2
        movups    __sInvPI(%rax), %xmm1
        andps     %xmm5, %xmm4

/* c) Getting octant Y by 1/Pi multiplication
   d) Add "Right Shifter" value  */
        mulps     %xmm4, %xmm1

/* h) Subtract Y*PI from X argument, where PI divided to 4 parts:
   X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4  */
        movaps    %xmm4, %xmm0

/* Check for large and special values.
   The predicate is "not less-or-equal", so a lane is flagged when
   |x| exceeds the threshold and also when the compare is unordered
   (NaN).  */
        cmpnleps  __sRangeReductionVal(%rax), %xmm4
        movups    __sRShifter(%rax), %xmm6
        movups    __sPI1(%rax), %xmm7
        addps     %xmm6, %xmm1
        /* One bit per lane: bits 0..3 of %ecx, upper bits zero.  */
        movmskps  %xmm4, %ecx

/* e) Treat obtained value as integer for destination sign setting.
   Shift first bit of this value to the last (sign) position  */
        movaps    %xmm1, %xmm3

/* g) Subtract "Right Shifter" value */
        subps     %xmm6, %xmm1
        mulps     %xmm1, %xmm7
        pslld     $31, %xmm3
        movups    __sPI2(%rax), %xmm6
        subps     %xmm7, %xmm0
        mulps     %xmm1, %xmm6
        movups    __sPI3(%rax), %xmm7
        subps     %xmm6, %xmm0
        mulps     %xmm1, %xmm7
        movups    __sPI4(%rax), %xmm6
        subps     %xmm7, %xmm0
        mulps     %xmm6, %xmm1
        subps     %xmm1, %xmm0

/* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
   a) Calculate X^2 = X * X
   b) Calculate polynomial:
   R = X + X * X^2 * (A3 + x^2 * (A5 + ...... */
        movaps    %xmm0, %xmm1
        mulps     %xmm0, %xmm1
        /* Fold the quotient-parity sign into X before the polynomial.  */
        xorps     %xmm3, %xmm0
        /* Horner evaluation in X^2: ((A9*X^2 + A7)*X^2 + A5)*X^2 + A3.  */
        movups    __sA9(%rax), %xmm3
        mulps     %xmm1, %xmm3
        addps     __sA7(%rax), %xmm3
        mulps     %xmm1, %xmm3
        addps     __sA5(%rax), %xmm3
        mulps     %xmm1, %xmm3
        addps     __sA3(%rax), %xmm3
        mulps     %xmm3, %xmm1
        mulps     %xmm0, %xmm1
        addps     %xmm1, %xmm0

/* 3) Destination sign setting
   a) Set shifted destination sign using XOR operation:
   R = XOR( R, S ); */
        xorps     %xmm2, %xmm0
        /* Any flagged lane sends the whole vector to the fallback.  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: tear down the frame and return the vector in %xmm0.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Fallback: recompute every flagged lane with scalar sinf.
   This label is reached only through the jne above, so the lane mask
   in %ecx is known to be non-zero here.  */
.LBL_1_3:
        cfi_restore_state
        /* Spill the inputs and the fast-path results so that single
           lanes can be read and patched with scalar loads/stores.  */
        movups    %xmm5, 192(%rsp)
        movups    %xmm0, 256(%rsp)

        xorb      %dl, %dl
        xorl      %eax, %eax
        /* Preserve %xmm8-%xmm15, %rsi and %rdi around the sinf calls.  */
        movups    %xmm8, 112(%rsp)
        movups    %xmm9, 96(%rsp)
        movups    %xmm10, 80(%rsp)
        movups    %xmm11, 64(%rsp)
        movups    %xmm12, 48(%rsp)
        movups    %xmm13, 32(%rsp)
        movups    %xmm14, 16(%rsp)
        movups    %xmm15, (%rsp)
        movq      %rsi, 136(%rsp)
        movq      %rdi, 128(%rsp)
        /* Loop state lives in %r12-%r15 so it is still intact after
           each call:
             %r12b = lane-pair index, %r13d = lane mask,
             %r14d = mask bit of the even lane of the pair,
             %r15  = zero-extended pair index used for addressing.  */
        movq      %r12, 168(%rsp)
        cfi_offset_rel_rsp (12, 168)
        movb      %dl, %r12b
        movq      %r13, 160(%rsp)
        cfi_offset_rel_rsp (13, 160)
        movl      %ecx, %r13d
        movq      %r14, 152(%rsp)
        cfi_offset_rel_rsp (14, 152)
        movl      %eax, %r14d
        movq      %r15, 144(%rsp)
        cfi_offset_rel_rsp (15, 144)
        cfi_remember_state

/* Even lane of the current pair (mask bit %r14d).  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

/* Odd lane of the current pair (mask bit %r14d + 1).  */
.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

/* Advance to the next pair.  The mask holds four lane bits, so two
   pairs cover every lane.  */
.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $2, %r12b
        jb        .LBL_1_6

        /* All lanes done: restore the saved registers, reload the
           patched result vector and leave through the common exit.  */
        movups    112(%rsp), %xmm8
        movups    96(%rsp), %xmm9
        movups    80(%rsp), %xmm10
        movups    64(%rsp), %xmm11
        movups    48(%rsp), %xmm12
        movups    32(%rsp), %xmm13
        movups    16(%rsp), %xmm14
        movups    (%rsp), %xmm15
        movq      136(%rsp), %rsi
        movq      128(%rsp), %rdi
        movq      168(%rsp), %r12
        cfi_restore (%r12)
        movq      160(%rsp), %r13
        cfi_restore (%r13)
        movq      152(%rsp), %r14
        cfi_restore (%r14)
        movq      144(%rsp), %r15
        cfi_restore (%r15)
        movups    256(%rsp), %xmm0
        jmp       .LBL_1_2

/* Odd lane: input at 196 + 8*pair, result stored at 260 + 8*pair.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        movss     196(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        movss     %xmm0, 260(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Even lane: input at 192 + 8*pair, result stored at 256 + 8*pair.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        movss     192(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        movss     %xmm0, 256(%rsp,%r15,8)
        jmp       .LBL_1_7

END(_ZGVbN4v_sinf_sse4)