Blame sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S

Packit 6c4009
/* Function cosf vectorized with AVX2.
Packit 6c4009
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#include "svml_s_trig_data.h"
Packit 6c4009
Packit 6c4009
	.text
Packit 6c4009
ENTRY (_ZGVdN8v_cosf_avx2)
/*
   Computes cosf for the 8 packed single-precision values in %ymm0 and
   returns the 8 results in %ymm0.  Lanes flagged by the range check
   below are recomputed one at a time with the scalar cosf.

   Register roles on the vector path (as used by the code below):
     %rax  - address of __svml_s_trig_data (constant table)
     %ymm2 - copy of the input vector X
     %ymm5 - "Right Shifter" constant; later X^2 of the reduced argument
     %ymm7 - __sPI1_FMA
     %ymm4 - running value: X + Pi/2, then octant Y, then reduced argument
     %ymm6 - Y before the 0.5 correction; later the sign-adjusted
             reduced argument
     %ymm0 - sign bits (S << 31); later the polynomial and final result
     %ymm1 - per-lane mask of arguments that need the scalar fallback
     %ecx  - integer bit mask extracted from %ymm1

   Stack frame: %rsp is aligned down to 64 bytes and 448 bytes are
   reserved.  Layout used by the fallback path, relative to %rsp:
       0..255  saved %ymm15 .. %ymm8 (32 bytes each)
     256..271  saved %rdi, %rsi
     272..303  saved %r15, %r14, %r13, %r12
     320..351  input vector
     384..415  result vector
 */
Packit 6c4009
/*
Packit 6c4009
  ALGORITHM DESCRIPTION:
Packit 6c4009
Packit 6c4009
  1) Range reduction to [-Pi/2; +Pi/2] interval
Packit 6c4009
    a) We remove sign using AND operation
Packit 6c4009
    b) Add Pi/2 value to argument X for Cos to Sin transformation
Packit 6c4009
    c) Getting octant Y by 1/Pi multiplication
Packit 6c4009
    d) Add "Right Shifter" value
Packit 6c4009
    e) Treat obtained value as integer for destination sign setting.
Packit 6c4009
       Shift first bit of this value to the last (sign) position
Packit 6c4009
    f) Subtract "Right Shifter"  value
Packit 6c4009
    g) Subtract 0.5 from result for octant correction
Packit 6c4009
    h) Subtract Y*PI from X argument, where PI divided to 4 parts:
Packit 6c4009
         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
         (The code below uses three parts only: __sPI1_FMA,
          __sPI2_FMA and __sPI3_FMA.)
Packit 6c4009
  2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
Packit 6c4009
    a) Calculate X^2 = X * X
Packit 6c4009
    b) Calculate polynomial:
Packit 6c4009
         R = X + X * X^2 * (A3 + x^2 * (A5 + .....
Packit 6c4009
  3) Destination sign setting
Packit 6c4009
    a) Set shifted destination sign using XOR operation:
Packit 6c4009
         R = XOR( R, S );
Packit 6c4009
 */
Packit 6c4009
/*
  Prologue: establish %rbp as frame pointer (and as CFA register), then
  align %rsp to 64 bytes and reserve 448 bytes of scratch space.
  448 is a multiple of 64, so %rsp stays 64-byte aligned for the
  scalar calls made on the fallback path.
 */
        pushq     %rbp
Packit 6c4009
        cfi_adjust_cfa_offset (8)
Packit 6c4009
        cfi_rel_offset (%rbp, 0)
Packit 6c4009
        movq      %rsp, %rbp
Packit 6c4009
        cfi_def_cfa_register (%rbp)
Packit 6c4009
        andq      $-64, %rsp
Packit 6c4009
        subq      $448, %rsp
Packit 6c4009
/* %rax = address of the constant table, fetched through the GOT.  */
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
Packit 6c4009
/* Keep the original argument in %ymm2; %ymm0 is reused as scratch.  */
        vmovaps   %ymm0, %ymm2
Packit 6c4009
        vmovups __sRShifter(%rax), %ymm5
Packit 6c4009
        vmovups __sPI1_FMA(%rax), %ymm7
Packit 6c4009
Packit 6c4009
/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
Packit 6c4009
        vaddps __sHalfPI(%rax), %ymm2, %ymm4
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
  1) Range reduction to [-Pi/2; +Pi/2] interval
Packit 6c4009
  c) Getting octant Y by 1/Pi multiplication
Packit 6c4009
  d) Add "Right Shifter" (0x4B000000) value
Packit 6c4009
 */
Packit 6c4009
/* %ymm4 = %ymm4 * __sInvPI + %ymm5 (Right Shifter).  */
        vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4
Packit 6c4009
Packit 6c4009
/* f) Subtract "Right Shifter" (0x4B000000) value */
Packit 6c4009
/* The difference goes to %ymm6; %ymm4 still holds the shifted value.  */
        vsubps    %ymm5, %ymm4, %ymm6
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
  e) Treat obtained value as integer for destination sign setting.
Packit 6c4009
  Shift first bit of this value to the last (sign) position (S << 31)
Packit 6c4009
 */
Packit 6c4009
        vpslld    $31, %ymm4, %ymm0
Packit 6c4009
Packit 6c4009
/* g) Subtract 0.5 from result for octant correction */
Packit 6c4009
/* %ymm4 = %ymm6 - __sOneHalf; this is the Y used in step h).  */
        vsubps __sOneHalf(%rax), %ymm6, %ymm4
Packit 6c4009
Packit 6c4009
/* Check for large and special arguments */
Packit 6c4009
/*
  %ymm3 = X & __sAbsMask; %ymm1 gets an all-ones lane wherever that
  value is "not less-or-equal" to __sRangeReductionVal.  The predicate
  is the unordered (nle_uq) form, so NaN lanes are flagged as well.
 */
        vandps __sAbsMask(%rax), %ymm2, %ymm3
Packit 6c4009
        vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
  h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
Packit 6c4009
  X = X - Y*PI1 - Y*PI2 - Y*PI3
Packit 6c4009
 */
Packit 6c4009
/*
  %ymm3 = X - Y*PI1, then %ymm3 -= Y*PI2, and finally
  %ymm4 = %ymm3 - Y*PI3, so the reduced argument ends up in %ymm4
  (overwriting Y).
 */
        vmovaps   %ymm2, %ymm3
Packit 6c4009
        vfnmadd231ps %ymm4, %ymm7, %ymm3
Packit 6c4009
        vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
Packit 6c4009
        vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4
Packit 6c4009
Packit 6c4009
/* a) Calculate X^2 = X * X */
Packit 6c4009
        vmulps    %ymm4, %ymm4, %ymm5
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
  3) Destination sign setting
Packit 6c4009
  a) Set shifted destination sign using XOR operation:
Packit 6c4009
  R = XOR( R, S );
Packit 6c4009
 */
Packit 6c4009
/*
  The sign is applied to the reduced argument (%ymm6) before the
  polynomial is combined with it; %ymm0 is then free for coefficients.
 */
        vxorps    %ymm0, %ymm4, %ymm6
Packit 6c4009
        vmovups __sA9_FMA(%rax), %ymm0
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
  b) Calculate polynomial:
Packit 6c4009
  R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
Packit 6c4009
 */
Packit 6c4009
/*
  Horner evaluation in %ymm0: each vfmadd213ps computes
  %ymm0 = %ymm5 * %ymm0 + coefficient.  After the multiply by X^2 the
  last FMA forms %ymm0 = %ymm6 * %ymm0 + %ymm6.  The mask extraction
  into %ecx is interleaved with these instructions.
 */
        vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
Packit 6c4009
        vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
Packit 6c4009
        vfmadd213ps __sA3(%rax), %ymm5, %ymm0
Packit 6c4009
        vmulps    %ymm5, %ymm0, %ymm0
Packit 6c4009
        vmovmskps %ymm1, %ecx
Packit 6c4009
        vfmadd213ps %ymm6, %ymm6, %ymm0
Packit 6c4009
/* Any flagged lane sends us to the scalar fallback.  */
        testl     %ecx, %ecx
Packit 6c4009
        jne       .LBL_1_3
Packit 6c4009
Packit 6c4009
/*
  Common exit: result is in %ymm0.  The unwind state is remembered
  before the epilogue so that the fallback code placed after the ret
  can restore it.
 */
.LBL_1_2:
Packit 6c4009
        cfi_remember_state
Packit 6c4009
        movq      %rbp, %rsp
Packit 6c4009
        cfi_def_cfa_register (%rsp)
Packit 6c4009
        popq      %rbp
Packit 6c4009
        cfi_adjust_cfa_offset (-8)
Packit 6c4009
        cfi_restore (%rbp)
Packit 6c4009
        ret
Packit 6c4009
Packit 6c4009
/*
  Fallback path: spill the input vector to 320(%rsp) and the vector
  result to 384(%rsp), then patch flagged lanes with scalar cosf.
 */
.LBL_1_3:
Packit 6c4009
        cfi_restore_state
Packit 6c4009
        vmovups   %ymm2, 320(%rsp)
Packit 6c4009
        vmovups   %ymm0, 384(%rsp)
Packit 6c4009
/*
  NOTE(review): this label is entered only through the jne above and
  the two vmovups do not modify flags, so this je looks like it can
  never be taken - confirm before relying on or removing it.
 */
        je        .LBL_1_2
Packit 6c4009
Packit 6c4009
/*
  Save %ymm8-%ymm15, %rsi, %rdi and %r12-%r15 around the scalar calls
  (presumably because the callers of this vector routine expect them
  preserved - confirm against the vector ABI), and set up the loop
  state in registers that survive a call:
    %r12b = lane-pair counter (starts at 0)
    %r13d = fallback bit mask (copy of %ecx)
    %r14d = bit index of the even lane of the current pair (starts at 0)
    %r15  = scratch index, written before each scalar call
 */
        xorb      %dl, %dl
Packit 6c4009
        xorl      %eax, %eax
Packit 6c4009
        vmovups   %ymm8, 224(%rsp)
Packit 6c4009
        vmovups   %ymm9, 192(%rsp)
Packit 6c4009
        vmovups   %ymm10, 160(%rsp)
Packit 6c4009
        vmovups   %ymm11, 128(%rsp)
Packit 6c4009
        vmovups   %ymm12, 96(%rsp)
Packit 6c4009
        vmovups   %ymm13, 64(%rsp)
Packit 6c4009
        vmovups   %ymm14, 32(%rsp)
Packit 6c4009
        vmovups   %ymm15, (%rsp)
Packit 6c4009
        movq      %rsi, 264(%rsp)
Packit 6c4009
        movq      %rdi, 256(%rsp)
Packit 6c4009
        movq      %r12, 296(%rsp)
Packit 6c4009
        cfi_offset_rel_rsp (12, 296)
Packit 6c4009
        movb      %dl, %r12b
Packit 6c4009
        movq      %r13, 288(%rsp)
Packit 6c4009
        cfi_offset_rel_rsp (13, 288)
Packit 6c4009
        movl      %ecx, %r13d
Packit 6c4009
        movq      %r14, 280(%rsp)
Packit 6c4009
        cfi_offset_rel_rsp (14, 280)
Packit 6c4009
        movl      %eax, %r14d
Packit 6c4009
        movq      %r15, 272(%rsp)
Packit 6c4009
        cfi_offset_rel_rsp (15, 272)
Packit 6c4009
        cfi_remember_state
Packit 6c4009
Packit 6c4009
/* Loop head: test the even lane of the current pair (bit %r14d).  */
.LBL_1_6:
Packit 6c4009
        btl       %r14d, %r13d
Packit 6c4009
        jc        .LBL_1_12
Packit 6c4009
Packit 6c4009
/* Test the odd lane of the current pair (bit %r14d + 1).  */
.LBL_1_7:
Packit 6c4009
        lea       1(%r14), %esi
Packit 6c4009
        btl       %esi, %r13d
Packit 6c4009
        jc        .LBL_1_10
Packit 6c4009
Packit 6c4009
/*
  Advance to the next pair of lanes.
  NOTE(review): the bound of 16 pairs means mask bits 0..31 are
  tested, although the mask comes from vmovmskps on a ymm register
  (8 lanes); iterations past the fourth presumably never find a set
  bit - confirm.
 */
.LBL_1_8:
Packit 6c4009
        incb      %r12b
Packit 6c4009
        addl      $2, %r14d
Packit 6c4009
        cmpb      $16, %r12b
Packit 6c4009
        jb        .LBL_1_6
Packit 6c4009
Packit 6c4009
/*
  All lanes handled: restore the saved registers, reload the patched
  result vector from 384(%rsp) into %ymm0 and leave through the
  common epilogue.
 */
        vmovups   224(%rsp), %ymm8
Packit 6c4009
        vmovups   192(%rsp), %ymm9
Packit 6c4009
        vmovups   160(%rsp), %ymm10
Packit 6c4009
        vmovups   128(%rsp), %ymm11
Packit 6c4009
        vmovups   96(%rsp), %ymm12
Packit 6c4009
        vmovups   64(%rsp), %ymm13
Packit 6c4009
        vmovups   32(%rsp), %ymm14
Packit 6c4009
        vmovups   (%rsp), %ymm15
Packit 6c4009
        vmovups   384(%rsp), %ymm0
Packit 6c4009
        movq      264(%rsp), %rsi
Packit 6c4009
        movq      256(%rsp), %rdi
Packit 6c4009
        movq      296(%rsp), %r12
Packit 6c4009
        cfi_restore (%r12)
Packit 6c4009
        movq      288(%rsp), %r13
Packit 6c4009
        cfi_restore (%r13)
Packit 6c4009
        movq      280(%rsp), %r14
Packit 6c4009
        cfi_restore (%r14)
Packit 6c4009
        movq      272(%rsp), %r15
Packit 6c4009
        cfi_restore (%r15)
Packit 6c4009
        jmp       .LBL_1_2
Packit 6c4009
Packit 6c4009
/*
  Odd lane of pair %r12b: load the 4-byte element at offset
  4 + 8 * pair within the spilled input, call scalar cosf and store
  its result at the same offset within the spilled result vector.
  vzeroupper is issued before the call into scalar code.
 */
.LBL_1_10:
Packit 6c4009
        cfi_restore_state
Packit 6c4009
        movzbl    %r12b, %r15d
Packit 6c4009
        vmovss    324(%rsp,%r15,8), %xmm0
Packit 6c4009
        vzeroupper
Packit 6c4009
Packit 6c4009
        call      JUMPTARGET(cosf)
Packit 6c4009
Packit 6c4009
        vmovss    %xmm0, 388(%rsp,%r15,8)
Packit 6c4009
        jmp       .LBL_1_8
Packit 6c4009
Packit 6c4009
/*
  Even lane of pair %r12b: same as above, using offset 8 * pair, then
  continue with the odd-lane test.
 */
.LBL_1_12:
Packit 6c4009
        movzbl    %r12b, %r15d
Packit 6c4009
        vmovss    320(%rsp,%r15,8), %xmm0
Packit 6c4009
        vzeroupper
Packit 6c4009
Packit 6c4009
        call      JUMPTARGET(cosf)
Packit 6c4009
Packit 6c4009
        vmovss    %xmm0, 384(%rsp,%r15,8)
Packit 6c4009
        jmp       .LBL_1_7
Packit 6c4009
Packit 6c4009
END (_ZGVdN8v_cosf_avx2)